From 8a62ffe2753a845272f4f2100b5fca0b6053ff6f Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Fri, 21 Dec 2018 11:33:54 +0100
Subject: [PATCH 01/80] PM-runtime: Add new interface to get accounted time

Some drivers (like i915/drm) needs to get the accounted suspended time.
pm_runtime_suspended_time() will return the suspended accounted time
in ns unit.

Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/runtime.c | 15 +++++++++++++++
 include/linux/pm_runtime.h   |  2 ++
 2 files changed, 17 insertions(+)

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 457be03b744d..a453090c9449 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -88,6 +88,21 @@ static void __update_runtime_status(struct device *dev, enum rpm_status status)
 	dev->power.runtime_status = status;
 }
 
+u64 pm_runtime_suspended_time(struct device *dev)
+{
+	unsigned long flags, time;
+
+	spin_lock_irqsave(&dev->power.lock, flags);
+
+	update_pm_runtime_accounting(dev);
+	time = dev->power.suspended_jiffies;
+
+	spin_unlock_irqrestore(&dev->power.lock, flags);
+
+	return jiffies_to_nsecs(time);
+}
+EXPORT_SYMBOL_GPL(pm_runtime_suspended_time);
+
 /**
  * pm_runtime_deactivate_timer - Deactivate given device's suspend timer.
  * @dev: Device to handle.
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 54af4eef169f..a370006921c0 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -113,6 +113,8 @@ static inline bool pm_runtime_is_irq_safe(struct device *dev)
 	return dev->power.irq_safe;
 }
 
+extern u64 pm_runtime_suspended_time(struct device *dev);
+
 #else /* !CONFIG_PM */
 
 static inline bool queue_pm_work(struct work_struct *work) { return false; }

From 3b4ed2e2eb5583774a1ed4aa4596a5a3c55f2567 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Fri, 21 Dec 2018 11:33:55 +0100
Subject: [PATCH 02/80] drm/i915: Move on the new pm runtime interface

Use the new PM-runtime interface to get the accounted suspended time:
pm_runtime_suspended_time().

This new interface helps to simplify and cleanup the code that computes
__I915_SAMPLE_RC6_ESTIMATED and to remove direct access to internals of
PM-runtime.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/gpu/drm/i915/i915_pmu.c | 16 ++++++----------
 drivers/gpu/drm/i915/i915_pmu.h |  4 ++--
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index d6c8f8fdfda5..3f76f605ac31 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -5,6 +5,7 @@
  */
 
 #include <linux/irq.h>
+#include <linux/pm_runtime.h>
 #include "i915_pmu.h"
 #include "intel_ringbuffer.h"
 #include "i915_drv.h"
@@ -478,7 +479,6 @@ static u64 get_rc6(struct drm_i915_private *i915)
 		 * counter value.
 		 */
 		spin_lock_irqsave(&i915->pmu.lock, flags);
-		spin_lock(&kdev->power.lock);
 
 		/*
 		 * After the above branch intel_runtime_pm_get_if_in_use failed
@@ -491,16 +491,13 @@ static u64 get_rc6(struct drm_i915_private *i915)
 		 * suspended and if not we cannot do better than report the last
 		 * known RC6 value.
 		 */
-		if (kdev->power.runtime_status == RPM_SUSPENDED) {
+		if (pm_runtime_status_suspended(kdev)) {
+			val = pm_runtime_suspended_time(kdev);
+
 			if (!i915->pmu.sample[__I915_SAMPLE_RC6_ESTIMATED].cur)
-				i915->pmu.suspended_jiffies_last =
-						  kdev->power.suspended_jiffies;
+				i915->pmu.suspended_time_last = val;
 
-			val = kdev->power.suspended_jiffies -
-			      i915->pmu.suspended_jiffies_last;
-			val += jiffies - kdev->power.accounting_timestamp;
-
-			val = jiffies_to_nsecs(val);
+			val -= i915->pmu.suspended_time_last;
 			val += i915->pmu.sample[__I915_SAMPLE_RC6].cur;
 
 			i915->pmu.sample[__I915_SAMPLE_RC6_ESTIMATED].cur = val;
@@ -510,7 +507,6 @@ static u64 get_rc6(struct drm_i915_private *i915)
 			val = i915->pmu.sample[__I915_SAMPLE_RC6].cur;
 		}
 
-		spin_unlock(&kdev->power.lock);
 		spin_unlock_irqrestore(&i915->pmu.lock, flags);
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
index 7f164ca3db12..3dc2a30188e7 100644
--- a/drivers/gpu/drm/i915/i915_pmu.h
+++ b/drivers/gpu/drm/i915/i915_pmu.h
@@ -95,9 +95,9 @@ struct i915_pmu {
 	 */
 	struct i915_pmu_sample sample[__I915_NUM_PMU_SAMPLERS];
 	/**
-	 * @suspended_jiffies_last: Cached suspend time from PM core.
+	 * @suspended_time_last: Cached suspend time from PM core.
 	 */
-	unsigned long suspended_jiffies_last;
+	u64 suspended_time_last;
 	/**
 	 * @i915_attr: Memory block holding device attributes.
 	 */

From 21469df4676e89cb9fd4be489d9a2cc4af09db8c Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 7 Jan 2019 12:28:54 +0530
Subject: [PATCH 03/80] cpufreq: Don't update new_policy on failures

The local variable "new_policy" hasn't been used in the error path of
cpufreq_online() since commit f9f41e3ef99a (cpufreq: Remove policy
create/remove notifiers).  Don't update it in that error path.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
[ rjw: Changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index e35a886e00bc..a8fa684f5f90 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1305,8 +1305,6 @@ static int cpufreq_online(unsigned int cpu)
 	if (ret) {
 		pr_err("%s: Failed to initialize policy for cpu: %d (%d)\n",
 		       __func__, cpu, ret);
-		/* cpufreq_policy_free() will notify based on this */
-		new_policy = false;
 		goto out_destroy_policy;
 	}
 

From 4944514e6c7e613b578aaaafb2a2dd1a54bcf538 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Mon, 7 Jan 2019 11:33:43 -0600
Subject: [PATCH 04/80] cpufreq: e_powersaver: Use struct_size() in kzalloc()

One of the more common cases of allocation size calculations is
finding the size of a structure that has a zero-sized array at
the end, along with memory for some number of elements for that
array. For example:

struct foo {
    int stuff;
    void *entry[];
};

instance = kzalloc(sizeof(struct foo) + sizeof(void *) * count, GFP_KERNEL);

Instead of leaving these open-coded and prone to type mistakes, we
can now use the new struct_size() helper:

instance = kzalloc(struct_size(instance, entry, count), GFP_KERNEL);

This code was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/e_powersaver.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/cpufreq/e_powersaver.c b/drivers/cpufreq/e_powersaver.c
index 60bea302abbe..2d3ef208dd70 100644
--- a/drivers/cpufreq/e_powersaver.c
+++ b/drivers/cpufreq/e_powersaver.c
@@ -323,9 +323,8 @@ static int eps_cpu_init(struct cpufreq_policy *policy)
 		states = 2;
 
 	/* Allocate private data and frequency table for current cpu */
-	centaur = kzalloc(sizeof(*centaur)
-		    + (states + 1) * sizeof(struct cpufreq_frequency_table),
-		    GFP_KERNEL);
+	centaur = kzalloc(struct_size(centaur, freq_table, states + 1),
+			  GFP_KERNEL);
 	if (!centaur)
 		return -ENOMEM;
 	eps_cpu[0] = centaur;

From b26bf6ab716f27955e2a503ffca1691582127cbb Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 4 Jan 2019 12:30:47 +0100
Subject: [PATCH 05/80] cpuidle: New timer events oriented governor for
 tickless systems

The venerable menu governor does some things that are quite
questionable in my view.

First, it includes timer wakeups in the pattern detection data and
mixes them up with wakeups from other sources which in some cases
causes it to expect what essentially would be a timer wakeup in a
time frame in which no timer wakeups are possible (because it knows
the time until the next timer event and that is later than the
expected wakeup time).

Second, it uses the extra exit latency limit based on the predicted
idle duration and depending on the number of tasks waiting on I/O,
even though those tasks may run on a different CPU when they are
woken up.  Moreover, the time ranges used by it for the sleep length
correction factors depend on whether or not there are tasks waiting
on I/O, which again doesn't imply anything in particular, and they
are not correlated to the list of available idle states in any way
whatever.

Also, the pattern detection code in menu may end up considering
values that are too large to matter at all, in which cases running
it is a waste of time.

A major rework of the menu governor would be required to address
these issues and the performance of at least some workloads (tuned
specifically to the current behavior of the menu governor) is likely
to suffer from that.  It is thus better to introduce an entirely new
governor without them and let everybody use the governor that works
better with their actual workloads.

The new governor introduced here, the timer events oriented (TEO)
governor, uses the same basic strategy as menu: it always tries to
find the deepest idle state that can be used in the given conditions.
However, it applies a different approach to that problem.

First, it doesn't use "correction factors" for the time till the
closest timer, but instead it tries to correlate the measured idle
duration values with the available idle states and use that
information to pick up the idle state that is most likely to "match"
the upcoming CPU idle interval.

Second, it doesn't take the number of "I/O waiters" into account at
all and the pattern detection code in it avoids taking timer wakeups
into account.  It also only uses idle duration values less than the
current time till the closest timer (with the tick excluded) for that
purpose.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 Documentation/admin-guide/pm/cpuidle.rst | 104 +++++-
 drivers/cpuidle/Kconfig                  |  11 +-
 drivers/cpuidle/governors/Makefile       |   1 +
 drivers/cpuidle/governors/teo.c          | 444 +++++++++++++++++++++++
 4 files changed, 551 insertions(+), 9 deletions(-)
 create mode 100644 drivers/cpuidle/governors/teo.c

diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst
index 106379e2619f..9c58b35a81cb 100644
--- a/Documentation/admin-guide/pm/cpuidle.rst
+++ b/Documentation/admin-guide/pm/cpuidle.rst
@@ -155,14 +155,14 @@ governor uses that information depends on what algorithm is implemented by it
 and that is the primary reason for having more than one governor in the
 ``CPUIdle`` subsystem.
 
-There are two ``CPUIdle`` governors available, ``menu`` and ``ladder``.  Which
-of them is used depends on the configuration of the kernel and in particular on
-whether or not the scheduler tick can be `stopped by the idle
-loop <idle-cpus-and-tick_>`_.  It is possible to change the governor at run time
-if the ``cpuidle_sysfs_switch`` command line parameter has been passed to the
-kernel, but that is not safe in general, so it should not be done on production
-systems (that may change in the future, though).  The name of the ``CPUIdle``
-governor currently used by the kernel can be read from the
+There are three ``CPUIdle`` governors available, ``menu``, `TEO <teo-gov_>`_
+and ``ladder``.  Which of them is used by default depends on the configuration
+of the kernel and in particular on whether or not the scheduler tick can be
+`stopped by the idle loop <idle-cpus-and-tick_>`_.  It is possible to change the
+governor at run time if the ``cpuidle_sysfs_switch`` command line parameter has
+been passed to the kernel, but that is not safe in general, so it should not be
+done on production systems (that may change in the future, though).  The name of
+the ``CPUIdle`` governor currently used by the kernel can be read from the
 :file:`current_governor_ro` (or :file:`current_governor` if
 ``cpuidle_sysfs_switch`` is present in the kernel command line) file under
 :file:`/sys/devices/system/cpu/cpuidle/` in ``sysfs``.
@@ -256,6 +256,8 @@ the ``menu`` governor by default and if it is not tickless, the default
 ``CPUIdle`` governor on it will be ``ladder``.
 
 
+.. _menu-gov:
+
 The ``menu`` Governor
 =====================
 
@@ -333,6 +335,92 @@ that time, the governor may need to select a shallower state with a suitable
 target residency.
 
 
+.. _teo-gov:
+
+The Timer Events Oriented (TEO) Governor
+========================================
+
+The timer events oriented (TEO) governor is an alternative ``CPUIdle`` governor
+for tickless systems.  It follows the same basic strategy as the ``menu`` `one
+<menu-gov_>`_: it always tries to find the deepest idle state suitable for the
+given conditions.  However, it applies a different approach to that problem.
+
+First, it does not use sleep length correction factors, but instead it attempts
+to correlate the observed idle duration values with the available idle states
+and use that information to pick up the idle state that is most likely to
+"match" the upcoming CPU idle interval.   Second, it does not take the tasks
+that were running on the given CPU in the past and are waiting on some I/O
+operations to complete now at all (there is no guarantee that they will run on
+the same CPU when they become runnable again) and the pattern detection code in
+it avoids taking timer wakeups into account.  It also only uses idle duration
+values less than the current time till the closest timer (with the scheduler
+tick excluded) for that purpose.
+
+Like in the ``menu`` governor `case <menu-gov_>`_, the first step is to obtain
+the *sleep length*, which is the time until the closest timer event with the
+assumption that the scheduler tick will be stopped (that also is the upper bound
+on the time until the next CPU wakeup).  That value is then used to preselect an
+idle state on the basis of three metrics maintained for each idle state provided
+by the ``CPUIdle`` driver: ``hits``, ``misses`` and ``early_hits``.
+
+The ``hits`` and ``misses`` metrics measure the likelihood that a given idle
+state will "match" the observed (post-wakeup) idle duration if it "matches" the
+sleep length.  They both are subject to decay (after a CPU wakeup) every time
+the target residency of the idle state corresponding to them is less than or
+equal to the sleep length and the target residency of the next idle state is
+greater than the sleep length (that is, when the idle state corresponding to
+them "matches" the sleep length).  The ``hits`` metric is increased if the
+former condition is satisfied and the target residency of the given idle state
+is less than or equal to the observed idle duration and the target residency of
+the next idle state is greater than the observed idle duration at the same time
+(that is, it is increased when the given idle state "matches" both the sleep
+length and the observed idle duration).  In turn, the ``misses`` metric is
+increased when the given idle state "matches" the sleep length only and the
+observed idle duration is too short for its target residency.
+
+The ``early_hits`` metric measures the likelihood that a given idle state will
+"match" the observed (post-wakeup) idle duration if it does not "match" the
+sleep length.  It is subject to decay on every CPU wakeup and it is increased
+when the idle state corresponding to it "matches" the observed (post-wakeup)
+idle duration and the target residency of the next idle state is less than or
+equal to the sleep length (i.e. the idle state "matching" the sleep length is
+deeper than the given one).
+
+The governor walks the list of idle states provided by the ``CPUIdle`` driver
+and finds the last (deepest) one with the target residency less than or equal
+to the sleep length.  Then, the ``hits`` and ``misses`` metrics of that idle
+state are compared with each other and it is preselected if the ``hits`` one is
+greater (which means that that idle state is likely to "match" the observed idle
+duration after CPU wakeup).  If the ``misses`` one is greater, the governor
+preselects the shallower idle state with the maximum ``early_hits`` metric
+(or if there are multiple shallower idle states with equal ``early_hits``
+metric which also is the maximum, the shallowest of them will be preselected).
+[If there is a wakeup latency constraint coming from the `PM QoS framework
+<cpu-pm-qos_>`_ which is hit before reaching the deepest idle state with the
+target residency within the sleep length, the deepest idle state with the exit
+latency within the constraint is preselected without consulting the ``hits``,
+``misses`` and ``early_hits`` metrics.]
+
+Next, the governor takes several idle duration values observed most recently
+into consideration and if at least a half of them are greater than or equal to
+the target residency of the preselected idle state, that idle state becomes the
+final candidate to ask for.  Otherwise, the average of the most recent idle
+duration values below the target residency of the preselected idle state is
+computed and the governor walks the idle states shallower than the preselected
+one and finds the deepest of them with the target residency within that average.
+That idle state is then taken as the final candidate to ask for.
+
+Still, at this point the governor may need to refine the idle state selection if
+it has not decided to `stop the scheduler tick <idle-cpus-and-tick_>`_.  That
+generally happens if the target residency of the idle state selected so far is
+less than the tick period and the tick has not been stopped already (in a
+previous iteration of the idle loop).  Then, like in the ``menu`` governor
+`case <menu-gov_>`_, the sleep length used in the previous computations may not
+reflect the real time until the closest timer event and if it really is greater
+than that time, a shallower state with a suitable target residency may need to
+be selected.
+
+
 .. _idle-states-representation:
 
 Representation of Idle States
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index 7e48eb5bf0a7..8caccbbd7353 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -4,7 +4,7 @@ config CPU_IDLE
 	bool "CPU idle PM support"
 	default y if ACPI || PPC_PSERIES
 	select CPU_IDLE_GOV_LADDER if (!NO_HZ && !NO_HZ_IDLE)
-	select CPU_IDLE_GOV_MENU if (NO_HZ || NO_HZ_IDLE)
+	select CPU_IDLE_GOV_MENU if (NO_HZ || NO_HZ_IDLE) && !CPU_IDLE_GOV_TEO
 	help
 	  CPU idle is a generic framework for supporting software-controlled
 	  idle processor power management.  It includes modular cross-platform
@@ -23,6 +23,15 @@ config CPU_IDLE_GOV_LADDER
 config CPU_IDLE_GOV_MENU
 	bool "Menu governor (for tickless system)"
 
+config CPU_IDLE_GOV_TEO
+	bool "Timer events oriented (TEO) governor (for tickless systems)"
+	help
+	  This governor implements a simplified idle state selection method
+	  focused on timer events and does not do any interactivity boosting.
+
+	  Some workloads benefit from using it and it generally should be safe
+	  to use.  Say Y here if you are not happy with the alternatives.
+
 config DT_IDLE_STATES
 	bool
 
diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile
index 1b512722689f..4d8aff5248a8 100644
--- a/drivers/cpuidle/governors/Makefile
+++ b/drivers/cpuidle/governors/Makefile
@@ -4,3 +4,4 @@
 
 obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o
 obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o
+obj-$(CONFIG_CPU_IDLE_GOV_TEO) += teo.o
diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c
new file mode 100644
index 000000000000..7d05efdbd3c6
--- /dev/null
+++ b/drivers/cpuidle/governors/teo.c
@@ -0,0 +1,444 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Timer events oriented CPU idle governor
+ *
+ * Copyright (C) 2018 Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * The idea of this governor is based on the observation that on many systems
+ * timer events are two or more orders of magnitude more frequent than any
+ * other interrupts, so they are likely to be the most significant source of CPU
+ * wakeups from idle states.  Moreover, information about what happened in the
+ * (relatively recent) past can be used to estimate whether or not the deepest
+ * idle state with target residency within the time to the closest timer is
+ * likely to be suitable for the upcoming idle time of the CPU and, if not, then
+ * which of the shallower idle states to choose.
+ *
+ * Of course, non-timer wakeup sources are more important in some use cases and
+ * they can be covered by taking a few most recent idle time intervals of the
+ * CPU into account.  However, even in that case it is not necessary to consider
+ * idle duration values greater than the time till the closest timer, as the
+ * patterns that they may belong to produce average values close enough to
+ * the time till the closest timer (sleep length) anyway.
+ *
+ * Thus this governor estimates whether or not the upcoming idle time of the CPU
+ * is likely to be significantly shorter than the sleep length and selects an
+ * idle state for it in accordance with that, as follows:
+ *
+ * - Find an idle state on the basis of the sleep length and state statistics
+ *   collected over time:
+ *
+ *   o Find the deepest idle state whose target residency is less than or equal
+ *     to the sleep length.
+ *
+ *   o Select it if it matched both the sleep length and the observed idle
+ *     duration in the past more often than it matched the sleep length alone
+ *     (i.e. the observed idle duration was significantly shorter than the sleep
+ *     length matched by it).
+ *
+ *   o Otherwise, select the shallower state with the greatest matched "early"
+ *     wakeups metric.
+ *
+ * - If the majority of the most recent idle duration values are below the
+ *   target residency of the idle state selected so far, use those values to
+ *   compute the new expected idle duration and find an idle state matching it
+ *   (which has to be shallower than the one selected so far).
+ */
+
+#include <linux/cpuidle.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/sched/clock.h>
+#include <linux/tick.h>
+
+/*
+ * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
+ * is used for decreasing metrics on a regular basis.
+ */
+#define PULSE		1024
+#define DECAY_SHIFT	3
+
+/*
+ * Number of the most recent idle duration values to take into consideration for
+ * the detection of wakeup patterns.
+ */
+#define INTERVALS	8
+
+/**
+ * struct teo_idle_state - Idle state data used by the TEO cpuidle governor.
+ * @early_hits: "Early" CPU wakeups "matching" this state.
+ * @hits: "On time" CPU wakeups "matching" this state.
+ * @misses: CPU wakeups "missing" this state.
+ *
+ * A CPU wakeup is "matched" by a given idle state if the idle duration measured
+ * after the wakeup is between the target residency of that state and the target
+ * residency of the next one (or if this is the deepest available idle state, it
+ * "matches" a CPU wakeup when the measured idle duration is at least equal to
+ * its target residency).
+ *
+ * Also, from the TEO governor perspective, a CPU wakeup from idle is "early" if
+ * it occurs significantly earlier than the closest expected timer event (that
+ * is, early enough to match an idle state shallower than the one matching the
+ * time till the closest timer event).  Otherwise, the wakeup is "on time", or
+ * it is a "hit".
+ *
+ * A "miss" occurs when the given state doesn't match the wakeup, but it matches
+ * the time till the closest timer event used for idle state selection.
+ */
+struct teo_idle_state {
+	unsigned int early_hits;
+	unsigned int hits;
+	unsigned int misses;
+};
+
+/**
+ * struct teo_cpu - CPU data used by the TEO cpuidle governor.
+ * @time_span_ns: Time between idle state selection and post-wakeup update.
+ * @sleep_length_ns: Time till the closest timer event (at the selection time).
+ * @states: Idle states data corresponding to this CPU.
+ * @last_state: Idle state entered by the CPU last time.
+ * @interval_idx: Index of the most recent saved idle interval.
+ * @intervals: Saved idle duration values.
+ */
+struct teo_cpu {
+	u64 time_span_ns;
+	u64 sleep_length_ns;
+	struct teo_idle_state states[CPUIDLE_STATE_MAX];
+	int last_state;
+	int interval_idx;
+	unsigned int intervals[INTERVALS];
+};
+
+static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
+
+/**
+ * teo_update - Update CPU data after wakeup.
+ * @drv: cpuidle driver containing state data.
+ * @dev: Target CPU.
+ */
+static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+{
+	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
+	unsigned int sleep_length_us = ktime_to_us(cpu_data->sleep_length_ns);
+	int i, idx_hit = -1, idx_timer = -1;
+	unsigned int measured_us;
+
+	if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) {
+		/*
+		 * One of the safety nets has triggered or this was a timer
+		 * wakeup (or equivalent).
+		 */
+		measured_us = sleep_length_us;
+	} else {
+		unsigned int lat = drv->states[cpu_data->last_state].exit_latency;
+
+		measured_us = ktime_to_us(cpu_data->time_span_ns);
+		/*
+		 * The delay between the wakeup and the first instruction
+		 * executed by the CPU is not likely to be worst-case every
+		 * time, so take 1/2 of the exit latency as a very rough
+		 * approximation of the average of it.
+		 */
+		if (measured_us >= lat)
+			measured_us -= lat / 2;
+		else
+			measured_us /= 2;
+	}
+
+	/*
+	 * Decay the "early hits" metric for all of the states and find the
+	 * states matching the sleep length and the measured idle duration.
+	 */
+	for (i = 0; i < drv->state_count; i++) {
+		unsigned int early_hits = cpu_data->states[i].early_hits;
+
+		cpu_data->states[i].early_hits -= early_hits >> DECAY_SHIFT;
+
+		if (drv->states[i].target_residency <= sleep_length_us) {
+			idx_timer = i;
+			if (drv->states[i].target_residency <= measured_us)
+				idx_hit = i;
+		}
+	}
+
+	/*
+	 * Update the "hits" and "misses" data for the state matching the sleep
+	 * length.  If it matches the measured idle duration too, this is a hit,
+	 * so increase the "hits" metric for it then.  Otherwise, this is a
+	 * miss, so increase the "misses" metric for it.  In the latter case
+	 * also increase the "early hits" metric for the state that actually
+	 * matches the measured idle duration.
+	 */
+	if (idx_timer >= 0) {
+		unsigned int hits = cpu_data->states[idx_timer].hits;
+		unsigned int misses = cpu_data->states[idx_timer].misses;
+
+		hits -= hits >> DECAY_SHIFT;
+		misses -= misses >> DECAY_SHIFT;
+
+		if (idx_timer > idx_hit) {
+			misses += PULSE;
+			if (idx_hit >= 0)
+				cpu_data->states[idx_hit].early_hits += PULSE;
+		} else {
+			hits += PULSE;
+		}
+
+		cpu_data->states[idx_timer].misses = misses;
+		cpu_data->states[idx_timer].hits = hits;
+	}
+
+	/*
+	 * If the total time span between idle state selection and the "reflect"
+	 * callback is greater than or equal to the sleep length determined at
+	 * the idle state selection time, the wakeup is likely to be due to a
+	 * timer event.
+	 */
+	if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns)
+		measured_us = UINT_MAX;
+
+	/*
+	 * Save idle duration values corresponding to non-timer wakeups for
+	 * pattern detection.
+	 */
+	cpu_data->intervals[cpu_data->interval_idx++] = measured_us;
+	if (cpu_data->interval_idx > INTERVALS)
+		cpu_data->interval_idx = 0;
+}
+
+/**
+ * teo_find_shallower_state - Find shallower idle state matching given duration.
+ * @drv: cpuidle driver containing state data.
+ * @dev: Target CPU.
+ * @state_idx: Index of the capping idle state.
+ * @duration_us: Idle duration value to match.
+ */
+static int teo_find_shallower_state(struct cpuidle_driver *drv,
+				    struct cpuidle_device *dev, int state_idx,
+				    unsigned int duration_us)
+{
+	int i;
+
+	for (i = state_idx - 1; i >= 0; i--) {
+		if (drv->states[i].disabled || dev->states_usage[i].disable)
+			continue;
+
+		state_idx = i;
+		if (drv->states[i].target_residency <= duration_us)
+			break;
+	}
+	return state_idx;
+}
+
+/**
+ * teo_select - Selects the next idle state to enter.
+ * @drv: cpuidle driver containing state data.
+ * @dev: Target CPU.
+ * @stop_tick: Indication on whether or not to stop the scheduler tick.
+ */
+static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+		      bool *stop_tick)
+{
+	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
+	int latency_req = cpuidle_governor_latency_req(dev->cpu);
+	unsigned int duration_us, count;
+	int max_early_idx, idx, i;
+	ktime_t delta_tick;
+
+	if (cpu_data->last_state >= 0) {
+		teo_update(drv, dev);
+		cpu_data->last_state = -1;
+	}
+
+	cpu_data->time_span_ns = local_clock();
+
+	cpu_data->sleep_length_ns = tick_nohz_get_sleep_length(&delta_tick);
+	duration_us = ktime_to_us(cpu_data->sleep_length_ns);
+
+	count = 0;
+	max_early_idx = -1;
+	idx = -1;
+
+	for (i = 0; i < drv->state_count; i++) {
+		struct cpuidle_state *s = &drv->states[i];
+		struct cpuidle_state_usage *su = &dev->states_usage[i];
+
+		if (s->disabled || su->disable) {
+			/*
+			 * If the "early hits" metric of a disabled state is
+			 * greater than the current maximum, it should be taken
+			 * into account, because it would be a mistake to select
+			 * a deeper state with lower "early hits" metric.  The
+			 * index cannot be changed to point to it, however, so
+			 * just increase the max count alone and let the index
+			 * still point to a shallower idle state.
+			 */
+			if (max_early_idx >= 0 &&
+			    count < cpu_data->states[i].early_hits)
+				count = cpu_data->states[i].early_hits;
+
+			continue;
+		}
+
+		if (idx < 0)
+			idx = i; /* first enabled state */
+
+		if (s->target_residency > duration_us)
+			break;
+
+		if (s->exit_latency > latency_req) {
+			/*
+			 * If we break out of the loop for latency reasons, use
+			 * the target residency of the selected state as the
+			 * expected idle duration to avoid stopping the tick
+			 * as long as that target residency is low enough.
+			 */
+			duration_us = drv->states[idx].target_residency;
+			goto refine;
+		}
+
+		idx = i;
+
+		if (count < cpu_data->states[i].early_hits &&
+		    !(tick_nohz_tick_stopped() &&
+		      drv->states[i].target_residency < TICK_USEC)) {
+			count = cpu_data->states[i].early_hits;
+			max_early_idx = i;
+		}
+	}
+
+	/*
+	 * If the "hits" metric of the idle state matching the sleep length is
+	 * greater than its "misses" metric, that is the one to use.  Otherwise,
+	 * it is more likely that one of the shallower states will match the
+	 * idle duration observed after wakeup, so take the one with the maximum
+	 * "early hits" metric, but if that cannot be determined, just use the
+	 * state selected so far.
+	 */
+	if (cpu_data->states[idx].hits <= cpu_data->states[idx].misses &&
+	    max_early_idx >= 0) {
+		idx = max_early_idx;
+		duration_us = drv->states[idx].target_residency;
+	}
+
+refine:
+	if (idx < 0) {
+		idx = 0; /* No states enabled. Must use 0. */
+	} else if (idx > 0) {
+		u64 sum = 0;
+
+		count = 0;
+
+		/*
+		 * Count and sum the most recent idle duration values less than
+		 * the target residency of the state selected so far, find the
+		 * max.
+		 */
+		for (i = 0; i < INTERVALS; i++) {
+			unsigned int val = cpu_data->intervals[i];
+
+			if (val >= drv->states[idx].target_residency)
+				continue;
+
+			count++;
+			sum += val;
+		}
+
+		/*
+		 * Give up unless the majority of the most recent idle duration
+		 * values are in the interesting range.
+		 */
+		if (count > INTERVALS / 2) {
+			unsigned int avg_us = div64_u64(sum, count);
+
+			/*
+			 * Avoid spending too much time in an idle state that
+			 * would be too shallow.
+			 */
+			if (!(tick_nohz_tick_stopped() && avg_us < TICK_USEC)) {
+				idx = teo_find_shallower_state(drv, dev, idx, avg_us);
+				duration_us = avg_us;
+			}
+		}
+	}
+
+	/*
+	 * Don't stop the tick if the selected state is a polling one or if the
+	 * expected idle duration is shorter than the tick period length.
+	 */
+	if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
+	    duration_us < TICK_USEC) && !tick_nohz_tick_stopped()) {
+		unsigned int delta_tick_us = ktime_to_us(delta_tick);
+
+		*stop_tick = false;
+
+		/*
+		 * The tick is not going to be stopped, so if the target
+		 * residency of the state to be returned is not within the time
+		 * till the closest timer including the tick, try to correct
+		 * that.
+		 */
+		if (idx > 0 && drv->states[idx].target_residency > delta_tick_us)
+			idx = teo_find_shallower_state(drv, dev, idx, delta_tick_us);
+	}
+
+	return idx;
+}
+
+/**
+ * teo_reflect - Note that governor data for the CPU need to be updated.
+ * @dev: Target CPU.
+ * @state: Entered state.
+ */
+static void teo_reflect(struct cpuidle_device *dev, int state)
+{
+	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
+
+	cpu_data->last_state = state;
+	/*
+	 * If the wakeup was not "natural", but triggered by one of the safety
+	 * nets, assume that the CPU might have been idle for the entire sleep
+	 * length time.
+	 */
+	if (dev->poll_time_limit ||
+	    (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) {
+		dev->poll_time_limit = false;
+		cpu_data->time_span_ns = cpu_data->sleep_length_ns;
+	} else {
+		cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns;
+	}
+}
+
+/**
+ * teo_enable_device - Initialize the governor's data for the target CPU.
+ * @drv: cpuidle driver (not used).
+ * @dev: Target CPU.
+ */
+static int teo_enable_device(struct cpuidle_driver *drv,
+			     struct cpuidle_device *dev)
+{
+	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
+	int i;
+
+	memset(cpu_data, 0, sizeof(*cpu_data));
+
+	for (i = 0; i < INTERVALS; i++)
+		cpu_data->intervals[i] = UINT_MAX;
+
+	return 0;
+}
+
+static struct cpuidle_governor teo_governor = {
+	.name =		"teo",
+	.rating =	19,
+	.enable =	teo_enable_device,
+	.select =	teo_select,
+	.reflect =	teo_reflect,
+};
+
+static int __init teo_governor_init(void)
+{
+	return cpuidle_register_governor(&teo_governor);
+}
+
+postcore_initcall(teo_governor_init);

From 3b7357663a47092576c2802dd1dc74d35c8c8721 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 9 Jan 2019 11:53:50 +0100
Subject: [PATCH 06/80] Documentation: driver-api: PM: Add cpuidle document

Replace the remaining documents under Documentation/cpuidle/
with one more complete governor and driver API document for cpuidle
under Documentation/driver-api/pm/.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 Documentation/cpuidle/driver.txt        |  37 ----
 Documentation/cpuidle/governor.txt      |  28 ---
 Documentation/driver-api/pm/cpuidle.rst | 282 ++++++++++++++++++++++++
 Documentation/driver-api/pm/index.rst   |   7 +-
 MAINTAINERS                             |   1 +
 5 files changed, 287 insertions(+), 68 deletions(-)
 delete mode 100644 Documentation/cpuidle/driver.txt
 delete mode 100644 Documentation/cpuidle/governor.txt
 create mode 100644 Documentation/driver-api/pm/cpuidle.rst

diff --git a/Documentation/cpuidle/driver.txt b/Documentation/cpuidle/driver.txt
deleted file mode 100644
index 1b0d81d92583..000000000000
--- a/Documentation/cpuidle/driver.txt
+++ /dev/null
@@ -1,37 +0,0 @@
-
-
-		Supporting multiple CPU idle levels in kernel
-
-				cpuidle drivers
-
-
-
-
-cpuidle driver hooks into the cpuidle infrastructure and handles the
-architecture/platform dependent part of CPU idle states. Driver
-provides the platform idle state detection capability and also
-has mechanisms in place to support actual entry-exit into CPU idle states.
-
-cpuidle driver initializes the cpuidle_device structure for each CPU device
-and registers with cpuidle using cpuidle_register_device.
-
-If all the idle states are the same, the wrapper function cpuidle_register
-could be used instead.
-
-It can also support the dynamic changes (like battery <-> AC), by using
-cpuidle_pause_and_lock, cpuidle_disable_device and cpuidle_enable_device,
-cpuidle_resume_and_unlock.
-
-Interfaces:
-extern int cpuidle_register(struct cpuidle_driver *drv,
-                            const struct cpumask *const coupled_cpus);
-extern int cpuidle_unregister(struct cpuidle_driver *drv);
-extern int cpuidle_register_driver(struct cpuidle_driver *drv);
-extern void cpuidle_unregister_driver(struct cpuidle_driver *drv);
-extern int cpuidle_register_device(struct cpuidle_device *dev);
-extern void cpuidle_unregister_device(struct cpuidle_device *dev);
-
-extern void cpuidle_pause_and_lock(void);
-extern void cpuidle_resume_and_unlock(void);
-extern int cpuidle_enable_device(struct cpuidle_device *dev);
-extern void cpuidle_disable_device(struct cpuidle_device *dev);
diff --git a/Documentation/cpuidle/governor.txt b/Documentation/cpuidle/governor.txt
deleted file mode 100644
index d9020f5e847b..000000000000
--- a/Documentation/cpuidle/governor.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-
-
-
-		Supporting multiple CPU idle levels in kernel
-
-				cpuidle governors
-
-
-
-
-cpuidle governor is policy routine that decides what idle state to enter at
-any given time. cpuidle core uses different callbacks to the governor.
-
-* enable() to enable governor for a particular device
-* disable() to disable governor for a particular device
-* select() to select an idle state to enter
-* reflect() called after returning from the idle state, which can be used
-  by the governor for some record keeping.
-
-More than one governor can be registered at the same time and
-users can switch between drivers using /sysfs interface (when enabled).
-More than one governor part is supported for developers to easily experiment
-with different governors. By default, most optimal governor based on your
-kernel configuration and platform will be selected by cpuidle.
-
-Interfaces:
-extern int cpuidle_register_governor(struct cpuidle_governor *gov);
-struct cpuidle_governor
diff --git a/Documentation/driver-api/pm/cpuidle.rst b/Documentation/driver-api/pm/cpuidle.rst
new file mode 100644
index 000000000000..5842ab621a58
--- /dev/null
+++ b/Documentation/driver-api/pm/cpuidle.rst
@@ -0,0 +1,282 @@
+.. |struct cpuidle_governor| replace:: :c:type:`struct cpuidle_governor <cpuidle_governor>`
+.. |struct cpuidle_device| replace:: :c:type:`struct cpuidle_device <cpuidle_device>`
+.. |struct cpuidle_driver| replace:: :c:type:`struct cpuidle_driver <cpuidle_driver>`
+.. |struct cpuidle_state| replace:: :c:type:`struct cpuidle_state <cpuidle_state>`
+
+========================
+CPU Idle Time Management
+========================
+
+::
+
+ Copyright (c) 2019 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+
+CPU Idle Time Management Subsystem
+==================================
+
+Every time one of the logical CPUs in the system (the entities that appear to
+fetch and execute instructions: hardware threads, if present, or processor
+cores) is idle after an interrupt or equivalent wakeup event, which means that
+there are no tasks to run on it except for the special "idle" task associated
+with it, there is an opportunity to save energy for the processor that it
+belongs to.  That can be done by making the idle logical CPU stop fetching
+instructions from memory and putting some of the processor's functional units
+depended on by it into an idle state in which they will draw less power.
+
+However, there may be multiple different idle states that can be used in such a
+situation in principle, so it may be necessary to find the most suitable one
+(from the kernel perspective) and ask the processor to use (or "enter") that
+particular idle state.  That is the role of the CPU idle time management
+subsystem in the kernel, called ``CPUIdle``.
+
+The design of ``CPUIdle`` is modular and based on the code duplication avoidance
+principle, so the generic code that in principle need not depend on the hardware
+or platform design details in it is separate from the code that interacts with
+the hardware.  It generally is divided into three categories of functional
+units: *governors* responsible for selecting idle states to ask the processor
+to enter, *drivers* that pass the governors' decisions on to the hardware and
+the *core* providing a common framework for them.
+
+
+CPU Idle Time Governors
+=======================
+
+A CPU idle time (``CPUIdle``) governor is a bundle of policy code invoked when
+one of the logical CPUs in the system turns out to be idle.  Its role is to
+select an idle state to ask the processor to enter in order to save some energy.
+
+``CPUIdle`` governors are generic and each of them can be used on any hardware
+platform that the Linux kernel can run on.  For this reason, data structures
+operated on by them cannot depend on any hardware architecture or platform
+design details as well.
+
+The governor itself is represented by a |struct cpuidle_governor| object
+containing four callback pointers, :c:member:`enable`, :c:member:`disable`,
+:c:member:`select`, :c:member:`reflect`, a :c:member:`rating` field described
+below, and a name (string) used for identifying it.
+
+For the governor to be available at all, that object needs to be registered
+with the ``CPUIdle`` core by calling :c:func:`cpuidle_register_governor()` with
+a pointer to it passed as the argument.  If successful, that causes the core to
+add the governor to the global list of available governors and, if it is the
+only one in the list (that is, the list was empty before) or the value of its
+:c:member:`rating` field is greater than the value of that field for the
+governor currently in use, or the name of the new governor was passed to the
+kernel as the value of the ``cpuidle.governor=`` command line parameter, the new
+governor will be used from that point on (there can be only one ``CPUIdle``
+governor in use at a time).  Also, if ``cpuidle_sysfs_switch`` is passed to the
+kernel in the command line, user space can choose the ``CPUIdle`` governor to
+use at run time via ``sysfs``.
+
+Once registered, ``CPUIdle`` governors cannot be unregistered, so it is not
+practical to put them into loadable kernel modules.
+
+The interface between ``CPUIdle`` governors and the core consists of four
+callbacks:
+
+:c:member:`enable`
+	::
+
+	  int (*enable) (struct cpuidle_driver *drv, struct cpuidle_device *dev);
+
+	The role of this callback is to prepare the governor for handling the
+	(logical) CPU represented by the |struct cpuidle_device| object	pointed
+	to by the ``dev`` argument.  The |struct cpuidle_driver| object pointed
+	to by the ``drv`` argument represents the ``CPUIdle`` driver to be used
+	with that CPU (among other things, it should contain the list of
+	|struct cpuidle_state| objects representing idle states that the
+	processor holding the given CPU can be asked to enter).
+
+	It may fail, in which case it is expected to return a negative error
+	code, and that causes the kernel to run the architecture-specific
+	default code for idle CPUs on the CPU in question instead of ``CPUIdle``
+	until the ``->enable()`` governor callback is invoked for that CPU
+	again.
+
+:c:member:`disable`
+	::
+
+	  void (*disable) (struct cpuidle_driver *drv, struct cpuidle_device *dev);
+
+	Called to make the governor stop handling the (logical) CPU represented
+	by the |struct cpuidle_device| object pointed to by the ``dev``
+	argument.
+
+	It is expected to reverse any changes made by the ``->enable()``
+	callback when it was last invoked for the target CPU, free all memory
+	allocated by that callback and so on.
+
+:c:member:`select`
+	::
+
+	  int (*select) (struct cpuidle_driver *drv, struct cpuidle_device *dev,
+	                 bool *stop_tick);
+
+	Called to select an idle state for the processor holding the (logical)
+	CPU represented by the |struct cpuidle_device| object pointed to by the
+	``dev`` argument.
+
+	The list of idle states to take into consideration is represented by the
+	:c:member:`states` array of |struct cpuidle_state| objects held by the
+	|struct cpuidle_driver| object pointed to by the ``drv`` argument (which
+	represents the ``CPUIdle`` driver to be used with the CPU at hand).  The
+	value returned by this callback is interpreted as an index into that
+	array (unless it is a negative error code).
+
+	The ``stop_tick`` argument is used to indicate whether or not to stop
+	the scheduler tick before asking the processor to enter the selected
+	idle state.  When the ``bool`` variable pointed to by it (which is set
+	to ``true`` before invoking this callback) is cleared to ``false``, the
+	processor will be asked to enter the selected idle state without
+	stopping the scheduler tick on the given CPU (if the tick has been
+	stopped on that CPU already, however, it will not be restarted before
+	asking the processor to enter the idle state).
+
+	This callback is mandatory (i.e. the :c:member:`select` callback pointer
+	in |struct cpuidle_governor| must not be ``NULL`` for the registration
+	of the governor to succeed).
+
+:c:member:`reflect`
+	::
+
+	  void (*reflect) (struct cpuidle_device *dev, int index);
+
+	Called to allow the governor to evaluate the accuracy of the idle state
+	selection made by the ``->select()`` callback (when it was invoked last
+	time) and possibly use the result of that to improve the accuracy of
+	idle state selections in the future.
+
+In addition, ``CPUIdle`` governors are required to take power management
+quality of service (PM QoS) constraints on the processor wakeup latency into
+account when selecting idle states.  In order to obtain the current effective
+PM QoS wakeup latency constraint for a given CPU, a ``CPUIdle`` governor is
+expected to pass the number of the CPU to
+:c:func:`cpuidle_governor_latency_req()`.  Then, the governor's ``->select()``
+callback must not return the index of an indle state whose
+:c:member:`exit_latency` value is greater than the number returned by that
+function.
+
+
+CPU Idle Time Management Drivers
+================================
+
+CPU idle time management (``CPUIdle``) drivers provide an interface between the
+other parts of ``CPUIdle`` and the hardware.
+
+First of all, a ``CPUIdle`` driver has to populate the :c:member:`states` array
+of |struct cpuidle_state| objects included in the |struct cpuidle_driver| object
+representing it.  Going forward this array will represent the list of available
+idle states that the processor hardware can be asked to enter shared by all of
+the logical CPUs handled by the given driver.
+
+The entries in the :c:member:`states` array are expected to be sorted by the
+value of the :c:member:`target_residency` field in |struct cpuidle_state| in
+the ascending order (that is, index 0 should correspond to the idle state with
+the minimum value of :c:member:`target_residency`).  [Since the
+:c:member:`target_residency` value is expected to reflect the "depth" of the
+idle state represented by the |struct cpuidle_state| object holding it, this
+sorting order should be the same as the ascending sorting order by the idle
+state "depth".]
+
+Three fields in |struct cpuidle_state| are used by the existing ``CPUIdle``
+governors for computations related to idle state selection:
+
+:c:member:`target_residency`
+	Minimum time to spend in this idle state including the time needed to
+	enter it (which may be substantial) to save more energy than could
+	be saved by staying in a shallower idle state for the same amount of
+	time, in microseconds.
+
+:c:member:`exit_latency`
+	Maximum time it will take a CPU asking the processor to enter this idle
+	state to start executing the first instruction after a wakeup from it,
+	in microseconds.
+
+:c:member:`flags`
+	Flags representing idle state properties.  Currently, governors only use
+	the ``CPUIDLE_FLAG_POLLING`` flag which is set if the given object
+	does not represent a real idle state, but an interface to a software
+	"loop" that can be used in order to avoid asking the processor to enter
+	any idle state at all.  [There are other flags used by the ``CPUIdle``
+	core in special situations.]
+
+The :c:member:`enter` callback pointer in |struct cpuidle_state|, which must not
+be ``NULL``, points to the routine to execute in order to ask the processor to
+enter this particular idle state:
+
+::
+
+  void (*enter) (struct cpuidle_device *dev, struct cpuidle_driver *drv,
+                 int index);
+
+The first two arguments of it point to the |struct cpuidle_device| object
+representing the logical CPU running this callback and the
+|struct cpuidle_driver| object representing the driver itself, respectively,
+and the last one is an index of the |struct cpuidle_state| entry in the driver's
+:c:member:`states` array representing the idle state to ask the processor to
+enter.
+
+The analogous ``->enter_s2idle()`` callback in |struct cpuidle_state| is used
+only for implementing the suspend-to-idle system-wide power management feature.
+The difference between in and ``->enter()`` is that it must not re-enable
+interrupts at any point (even temporarily) or attempt to change the states of
+clock event devices, which the ``->enter()`` callback may do sometimes.
+
+Once the :c:member:`states` array has been populated, the number of valid
+entries in it has to be stored in the :c:member:`state_count` field of the
+|struct cpuidle_driver| object representing the driver.  Moreover, if any
+entries in the :c:member:`states` array represent "coupled" idle states (that
+is, idle states that can only be asked for if multiple related logical CPUs are
+idle), the :c:member:`safe_state_index` field in |struct cpuidle_driver| needs
+to be the index of an idle state that is not "coupled" (that is, one that can be
+asked for if only one logical CPU is idle).
+
+In addition to that, if the given ``CPUIdle`` driver is only going to handle a
+subset of logical CPUs in the system, the :c:member:`cpumask` field in its
+|struct cpuidle_driver| object must point to the set (mask) of CPUs that will be
+handled by it.
+
+A ``CPUIdle`` driver can only be used after it has been registered.  If there
+are no "coupled" idle state entries in the driver's :c:member:`states` array,
+that can be accomplished by passing the driver's |struct cpuidle_driver| object
+to :c:func:`cpuidle_register_driver()`.  Otherwise, :c:func:`cpuidle_register()`
+should be used for this purpose.
+
+However, it also is necessary to register |struct cpuidle_device| objects for
+all of the logical CPUs to be handled by the given ``CPUIdle`` driver with the
+help of :c:func:`cpuidle_register_device()` after the driver has been registered
+and :c:func:`cpuidle_register_driver()`, unlike :c:func:`cpuidle_register()`,
+does not do that automatically.  For this reason, the drivers that use
+:c:func:`cpuidle_register_driver()` to register themselves must also take care
+of registering the |struct cpuidle_device| objects as needed, so it is generally
+recommended to use :c:func:`cpuidle_register()` for ``CPUIdle`` driver
+registration in all cases.
+
+The registration of a |struct cpuidle_device| object causes the ``CPUIdle``
+``sysfs`` interface to be created and the governor's ``->enable()`` callback to
+be invoked for the logical CPU represented by it, so it must take place after
+registering the driver that will handle the CPU in question.
+
+``CPUIdle`` drivers and |struct cpuidle_device| objects can be unregistered
+when they are not necessary any more which allows some resources associated with
+them to be released.  Due to dependencies between them, all of the
+|struct cpuidle_device| objects representing CPUs handled by the given
+``CPUIdle`` driver must be unregistered, with the help of
+:c:func:`cpuidle_unregister_device()`, before calling
+:c:func:`cpuidle_unregister_driver()` to unregister the driver.  Alternatively,
+:c:func:`cpuidle_unregister()` can be called to unregister a ``CPUIdle`` driver
+along with all of the |struct cpuidle_device| objects representing CPUs handled
+by it.
+
+``CPUIdle`` drivers can respond to runtime system configuration changes that
+lead to modifications of the list of available processor idle states (which can
+happen, for example, when the system's power source is switched from AC to
+battery or the other way around).  Upon a notification of such a change,
+a ``CPUIdle`` driver is expected to call :c:func:`cpuidle_pause_and_lock()` to
+turn ``CPUIdle`` off temporarily and then :c:func:`cpuidle_disable_device()` for
+all of the |struct cpuidle_device| objects representing CPUs affected by that
+change.  Next, it can update its :c:member:`states` array in accordance with
+the new configuration of the system, call :c:func:`cpuidle_enable_device()` for
+all of the relevant |struct cpuidle_device| objects and invoke
+:c:func:`cpuidle_resume_and_unlock()` to allow ``CPUIdle`` to be used again.
diff --git a/Documentation/driver-api/pm/index.rst b/Documentation/driver-api/pm/index.rst
index 2f6d0e9cf6b7..56975c6bc789 100644
--- a/Documentation/driver-api/pm/index.rst
+++ b/Documentation/driver-api/pm/index.rst
@@ -1,9 +1,10 @@
-=======================
-Device Power Management
-=======================
+===============================
+CPU and Device Power Management
+===============================
 
 .. toctree::
 
+   cpuidle
    devices
    notifiers
    types
diff --git a/MAINTAINERS b/MAINTAINERS
index 4d04cebb4a71..750aa2df031f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4016,6 +4016,7 @@ S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git
 B:	https://bugzilla.kernel.org
 F:	Documentation/admin-guide/pm/cpuidle.rst
+F:	Documentation/driver-api/pm/cpuidle.rst
 F:	drivers/cpuidle/*
 F:	include/linux/cpuidle.h
 

From 44021606298870e4adc641ef3927e7bb47ca8236 Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Tue, 15 Jan 2019 12:22:10 -0500
Subject: [PATCH 07/80] cpuidle: use BIT() for idle state flags and remove
 CPUIDLE_DRIVER_FLAGS_MASK

Use BIT() macro to do a small tidy-up.

CPUIDLE_DRIVER_FLAGS_MASK is not used, so remove it.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/cpuidle.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 4dff74f48d4b..3b39472324a3 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -69,11 +69,9 @@ struct cpuidle_state {
 
 /* Idle State Flags */
 #define CPUIDLE_FLAG_NONE       (0x00)
-#define CPUIDLE_FLAG_POLLING	(0x01) /* polling state */
-#define CPUIDLE_FLAG_COUPLED	(0x02) /* state applies to multiple cpus */
-#define CPUIDLE_FLAG_TIMER_STOP (0x04)  /* timer is stopped on this state */
-
-#define CPUIDLE_DRIVER_FLAGS_MASK (0xFFFF0000)
+#define CPUIDLE_FLAG_POLLING	BIT(0) /* polling state */
+#define CPUIDLE_FLAG_COUPLED	BIT(1) /* state applies to multiple cpus */
+#define CPUIDLE_FLAG_TIMER_STOP BIT(2) /* timer is stopped on this state */
 
 struct cpuidle_device_kobj;
 struct cpuidle_state_kobj;

From 40619f7dd3ef05ae7861bc60d401585d316e1374 Mon Sep 17 00:00:00 2001
From: Aditya Pakki <pakki001@umn.edu>
Date: Sat, 5 Jan 2019 13:58:45 -0600
Subject: [PATCH 08/80] PM: clock_ops: fix missing clk_prepare() return value
 check

clk_prepare() can fail, so check its status and if it fails,
issue an error message and change the clock_entry_status to
PCE_STATUS_ERROR.

Signed-off-by: Aditya Pakki <pakki001@umn.edu>
[ rjw: Subject ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/clock_ops.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/base/power/clock_ops.c b/drivers/base/power/clock_ops.c
index 5a42ae4078c2..365ad751ce0f 100644
--- a/drivers/base/power/clock_ops.c
+++ b/drivers/base/power/clock_ops.c
@@ -65,10 +65,15 @@ static void pm_clk_acquire(struct device *dev, struct pm_clock_entry *ce)
 	if (IS_ERR(ce->clk)) {
 		ce->status = PCE_STATUS_ERROR;
 	} else {
-		clk_prepare(ce->clk);
-		ce->status = PCE_STATUS_ACQUIRED;
-		dev_dbg(dev, "Clock %pC con_id %s managed by runtime PM.\n",
-			ce->clk, ce->con_id);
+		if (clk_prepare(ce->clk)) {
+			ce->status = PCE_STATUS_ERROR;
+			dev_err(dev, "clk_prepare() failed\n");
+		} else {
+			ce->status = PCE_STATUS_ACQUIRED;
+			dev_dbg(dev,
+				"Clock %pC con_id %s managed by runtime PM.\n",
+				ce->clk, ce->con_id);
+		}
 	}
 }
 

From 8321be6a9df5c5cfbf3fb5f716caf8698a5a7016 Mon Sep 17 00:00:00 2001
From: Amit Kucheria <amit.kucheria@linaro.org>
Date: Mon, 21 Jan 2019 14:17:37 +0530
Subject: [PATCH 09/80] cpufreq: Replace open-coded << with BIT()

Minor clean-up to use BIT() and keep checkpatch happy. Clean up the
comment formatting while we're at it to make it easier to read.

Signed-off-by: Amit Kucheria <amit.kucheria@linaro.org>
Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/cpufreq.h | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index c86d6d8bdfed..bd7fbd6a4478 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -346,14 +346,15 @@ struct cpufreq_driver {
 };
 
 /* flags */
-#define CPUFREQ_STICKY		(1 << 0)	/* driver isn't removed even if
-						   all ->init() calls failed */
-#define CPUFREQ_CONST_LOOPS	(1 << 1)	/* loops_per_jiffy or other
-						   kernel "constants" aren't
-						   affected by frequency
-						   transitions */
-#define CPUFREQ_PM_NO_WARN	(1 << 2)	/* don't warn on suspend/resume
-						   speed mismatches */
+
+/* driver isn't removed even if all ->init() calls failed */
+#define CPUFREQ_STICKY				BIT(0)
+
+/* loops_per_jiffy or other kernel "constants" aren't affected by frequency transitions */
+#define CPUFREQ_CONST_LOOPS			BIT(1)
+
+/* don't warn on suspend/resume speed mismatches */
+#define CPUFREQ_PM_NO_WARN			BIT(2)
 
 /*
  * This should be set by platforms having multiple clock-domains, i.e.
@@ -361,14 +362,14 @@ struct cpufreq_driver {
  * be created in cpu/cpu<num>/cpufreq/ directory and so they can use the same
  * governor with different tunables for different clusters.
  */
-#define CPUFREQ_HAVE_GOVERNOR_PER_POLICY (1 << 3)
+#define CPUFREQ_HAVE_GOVERNOR_PER_POLICY	BIT(3)
 
 /*
  * Driver will do POSTCHANGE notifications from outside of their ->target()
  * routine and so must set cpufreq_driver->flags with this flag, so that core
  * can handle them specially.
  */
-#define CPUFREQ_ASYNC_NOTIFICATION  (1 << 4)
+#define CPUFREQ_ASYNC_NOTIFICATION		BIT(4)
 
 /*
  * Set by drivers which want cpufreq core to check if CPU is running at a
@@ -377,13 +378,13 @@ struct cpufreq_driver {
  * from the table. And if that fails, we will stop further boot process by
  * issuing a BUG_ON().
  */
-#define CPUFREQ_NEED_INITIAL_FREQ_CHECK	(1 << 5)
+#define CPUFREQ_NEED_INITIAL_FREQ_CHECK	BIT(5)
 
 /*
  * Set by drivers to disallow use of governors with "dynamic_switching" flag
  * set.
  */
-#define CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING (1 << 6)
+#define CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING	BIT(6)
 
 int cpufreq_register_driver(struct cpufreq_driver *driver_data);
 int cpufreq_unregister_driver(struct cpufreq_driver *driver_data);

From 659dc4562c1b15e3401ad15338d2109af7c6d718 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 22 Jan 2019 16:21:47 +0100
Subject: [PATCH 10/80] PM: QoS: no need to check return value of
 debugfs_create functions

When calling debugfs functions, there is no need to ever check the
return value.  The function can work or not, but the code logic should
never do something different based on this.

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/qos.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index b7a82502857a..9d22131afc1e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -582,10 +582,8 @@ static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d)
 	qos->pm_qos_power_miscdev.name = qos->name;
 	qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
 
-	if (d) {
-		(void)debugfs_create_file(qos->name, S_IRUGO, d,
-					  (void *)qos, &pm_qos_debug_fops);
-	}
+	debugfs_create_file(qos->name, S_IRUGO, d, (void *)qos,
+			    &pm_qos_debug_fops);
 
 	return misc_register(&qos->pm_qos_power_miscdev);
 }
@@ -685,8 +683,6 @@ static int __init pm_qos_power_init(void)
 	BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
 
 	d = debugfs_create_dir("pm_qos", NULL);
-	if (IS_ERR_OR_NULL(d))
-		d = NULL;
 
 	for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
 		ret = register_pm_qos_misc(pm_qos_array[i], d);

From e16a42c3faa8871353b3e7080cbc5a89bf285187 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 22 Jan 2019 16:21:05 +0100
Subject: [PATCH 11/80] PM: domains: no need to check return value of
 debugfs_create functions

When calling debugfs functions, there is no need to ever check the
return value.  The function can work or not, but the code logic should
never do something different based on this.

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 500de1dee967..45eafe8cf7dd 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -2948,18 +2948,11 @@ static int __init genpd_debug_init(void)
 
 	genpd_debugfs_dir = debugfs_create_dir("pm_genpd", NULL);
 
-	if (!genpd_debugfs_dir)
-		return -ENOMEM;
-
-	d = debugfs_create_file("pm_genpd_summary", S_IRUGO,
-			genpd_debugfs_dir, NULL, &summary_fops);
-	if (!d)
-		return -ENOMEM;
+	debugfs_create_file("pm_genpd_summary", S_IRUGO, genpd_debugfs_dir,
+			    NULL, &summary_fops);
 
 	list_for_each_entry(genpd, &gpd_list, gpd_list_node) {
 		d = debugfs_create_dir(genpd->name, genpd_debugfs_dir);
-		if (!d)
-			return -ENOMEM;
 
 		debugfs_create_file("current_state", 0444,
 				d, genpd, &status_fops);

From 9cac42d0645ceb2ca8b815cee04810ec9b0d13b3 Mon Sep 17 00:00:00 2001
From: Quentin Perret <quentin.perret@arm.com>
Date: Tue, 22 Jan 2019 16:42:47 +0000
Subject: [PATCH 12/80] PM / EM: Expose the Energy Model in debugfs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The recently introduced Energy Model (EM) framework manages power cost
tables of CPUs. These tables are currently only visible from kernel
space. However, in order to debug the behaviour of subsystems that use
the EM (EAS for example), it is often required to know what the power
costs are from userspace.

For this reason, introduce under /sys/kernel/debug/energy_model a set of
directories representing the performance domains of the system. Each
performance domain contains a set of sub-directories representing the
different capacity states (cs) and their attributes, as well as a file
exposing the related CPUs.

The resulting hierarchy is as follows on Arm juno r0 for example:

    /sys/kernel/debug/energy_model
    ├── pd0
    │   ├── cpus
    │   ├── cs:450000
    │   │   ├── cost
    │   │   ├── frequency
    │   │   └── power
    │   ├── cs:575000
    │   │   ├── cost
    │   │   ├── frequency
    │   │   └── power
    │   ├── cs:700000
    │   │   ├── cost
    │   │   ├── frequency
    │   │   └── power
    │   ├── cs:775000
    │   │   ├── cost
    │   │   ├── frequency
    │   │   └── power
    │   └── cs:850000
    │       ├── cost
    │       ├── frequency
    │       └── power
    └── pd1
        ├── cpus
        ├── cs:1100000
        │   ├── cost
        │   ├── frequency
        │   └── power
        ├── cs:450000
        │   ├── cost
        │   ├── frequency
        │   └── power
        ├── cs:625000
        │   ├── cost
        │   ├── frequency
        │   └── power
        ├── cs:800000
        │   ├── cost
        │   ├── frequency
        │   └── power
        └── cs:950000
            ├── cost
            ├── frequency
            └── power

Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/energy_model.c | 57 +++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index d9dc2c38764a..7d66ee68aaaf 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -10,6 +10,7 @@
 
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
+#include <linux/debugfs.h>
 #include <linux/energy_model.h>
 #include <linux/sched/topology.h>
 #include <linux/slab.h>
@@ -23,6 +24,60 @@ static DEFINE_PER_CPU(struct em_perf_domain *, em_data);
  */
 static DEFINE_MUTEX(em_pd_mutex);
 
+#ifdef CONFIG_DEBUG_FS
+static struct dentry *rootdir;
+
+static void em_debug_create_cs(struct em_cap_state *cs, struct dentry *pd)
+{
+	struct dentry *d;
+	char name[24];
+
+	snprintf(name, sizeof(name), "cs:%lu", cs->frequency);
+
+	/* Create per-cs directory */
+	d = debugfs_create_dir(name, pd);
+	debugfs_create_ulong("frequency", 0444, d, &cs->frequency);
+	debugfs_create_ulong("power", 0444, d, &cs->power);
+	debugfs_create_ulong("cost", 0444, d, &cs->cost);
+}
+
+static int em_debug_cpus_show(struct seq_file *s, void *unused)
+{
+	seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private)));
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
+
+static void em_debug_create_pd(struct em_perf_domain *pd, int cpu)
+{
+	struct dentry *d;
+	char name[8];
+	int i;
+
+	snprintf(name, sizeof(name), "pd%d", cpu);
+
+	/* Create the directory of the performance domain */
+	d = debugfs_create_dir(name, rootdir);
+
+	debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops);
+
+	/* Create a sub-directory for each capacity state */
+	for (i = 0; i < pd->nr_cap_states; i++)
+		em_debug_create_cs(&pd->table[i], d);
+}
+
+static int __init em_debug_init(void)
+{
+	/* Create /sys/kernel/debug/energy_model directory */
+	rootdir = debugfs_create_dir("energy_model", NULL);
+
+	return 0;
+}
+core_initcall(em_debug_init);
+#else /* CONFIG_DEBUG_FS */
+static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {}
+#endif
 static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
 						struct em_data_callback *cb)
 {
@@ -102,6 +157,8 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
 	pd->nr_cap_states = nr_states;
 	cpumask_copy(to_cpumask(pd->cpus), span);
 
+	em_debug_create_pd(pd, cpu);
+
 	return pd;
 
 free_cs_table:

From d1c6b41b0f65d4c3933505f598b9f69873744404 Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Wed, 23 Jan 2019 11:22:01 -0500
Subject: [PATCH 13/80] PM / wakeup: fix kerneldoc comment for
 pm_wakeup_dev_event()

This brings the kernel doc in line with the function signature.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
[ rjw: Subject ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/wakeup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 5fa1898755a3..f1fee72ed970 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -783,7 +783,7 @@ void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard)
 EXPORT_SYMBOL_GPL(pm_wakeup_ws_event);
 
 /**
- * pm_wakeup_event - Notify the PM core of a wakeup event.
+ * pm_wakeup_dev_event - Notify the PM core of a wakeup event.
  * @dev: Device the wakeup event is related to.
  * @msec: Anticipated event processing time (in milliseconds).
  * @hard: If set, abort suspends in progress and wake up from suspend-to-idle.

From 625c85a62cb7d3c79f6e16de3cfa972033658250 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 25 Jan 2019 12:53:07 +0530
Subject: [PATCH 14/80] cpufreq: Use struct kobj_attribute instead of struct
 global_attr

The cpufreq_global_kobject is created using kobject_create_and_add()
helper, which assigns the kobj_type as dynamic_kobj_ktype and show/store
routines are set to kobj_attr_show() and kobj_attr_store().

These routines pass struct kobj_attribute as an argument to the
show/store callbacks. But all the cpufreq files created using the
cpufreq_global_kobject expect the argument to be of type struct
attribute. Things work fine currently as no one accesses the "attr"
argument. We may not see issues even if the argument is used, as struct
kobj_attribute has struct attribute as its first element and so they
will both get same address.

But this is logically incorrect and we should rather use struct
kobj_attribute instead of struct global_attr in the cpufreq core and
drivers and the show/store callbacks should take struct kobj_attribute
as argument instead.

This bug is caught using CFI CLANG builds in android kernel which
catches mismatch in function prototypes for such callbacks.

Reported-by: Donghee Han <dh.han@samsung.com>
Reported-by: Sangkyu Kim <skwith.kim@samsung.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c      |  6 +++---
 drivers/cpufreq/intel_pstate.c | 23 ++++++++++++-----------
 include/linux/cpufreq.h        | 12 ++----------
 3 files changed, 17 insertions(+), 24 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index a8fa684f5f90..3eff158d9750 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -545,13 +545,13 @@ EXPORT_SYMBOL_GPL(cpufreq_policy_transition_delay_us);
  *                          SYSFS INTERFACE                          *
  *********************************************************************/
 static ssize_t show_boost(struct kobject *kobj,
-				 struct attribute *attr, char *buf)
+			  struct kobj_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%d\n", cpufreq_driver->boost_enabled);
 }
 
-static ssize_t store_boost(struct kobject *kobj, struct attribute *attr,
-				  const char *buf, size_t count)
+static ssize_t store_boost(struct kobject *kobj, struct kobj_attribute *attr,
+			   const char *buf, size_t count)
 {
 	int ret, enable;
 
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index dd66decf2087..5ab6a4fe93aa 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -895,7 +895,7 @@ static void intel_pstate_update_policies(void)
 /************************** sysfs begin ************************/
 #define show_one(file_name, object)					\
 	static ssize_t show_##file_name					\
-	(struct kobject *kobj, struct attribute *attr, char *buf)	\
+	(struct kobject *kobj, struct kobj_attribute *attr, char *buf)	\
 	{								\
 		return sprintf(buf, "%u\n", global.object);		\
 	}
@@ -904,7 +904,7 @@ static ssize_t intel_pstate_show_status(char *buf);
 static int intel_pstate_update_status(const char *buf, size_t size);
 
 static ssize_t show_status(struct kobject *kobj,
-			   struct attribute *attr, char *buf)
+			   struct kobj_attribute *attr, char *buf)
 {
 	ssize_t ret;
 
@@ -915,7 +915,7 @@ static ssize_t show_status(struct kobject *kobj,
 	return ret;
 }
 
-static ssize_t store_status(struct kobject *a, struct attribute *b,
+static ssize_t store_status(struct kobject *a, struct kobj_attribute *b,
 			    const char *buf, size_t count)
 {
 	char *p = memchr(buf, '\n', count);
@@ -929,7 +929,7 @@ static ssize_t store_status(struct kobject *a, struct attribute *b,
 }
 
 static ssize_t show_turbo_pct(struct kobject *kobj,
-				struct attribute *attr, char *buf)
+				struct kobj_attribute *attr, char *buf)
 {
 	struct cpudata *cpu;
 	int total, no_turbo, turbo_pct;
@@ -955,7 +955,7 @@ static ssize_t show_turbo_pct(struct kobject *kobj,
 }
 
 static ssize_t show_num_pstates(struct kobject *kobj,
-				struct attribute *attr, char *buf)
+				struct kobj_attribute *attr, char *buf)
 {
 	struct cpudata *cpu;
 	int total;
@@ -976,7 +976,7 @@ static ssize_t show_num_pstates(struct kobject *kobj,
 }
 
 static ssize_t show_no_turbo(struct kobject *kobj,
-			     struct attribute *attr, char *buf)
+			     struct kobj_attribute *attr, char *buf)
 {
 	ssize_t ret;
 
@@ -998,7 +998,7 @@ static ssize_t show_no_turbo(struct kobject *kobj,
 	return ret;
 }
 
-static ssize_t store_no_turbo(struct kobject *a, struct attribute *b,
+static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b,
 			      const char *buf, size_t count)
 {
 	unsigned int input;
@@ -1045,7 +1045,7 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b,
 	return count;
 }
 
-static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
+static ssize_t store_max_perf_pct(struct kobject *a, struct kobj_attribute *b,
 				  const char *buf, size_t count)
 {
 	unsigned int input;
@@ -1075,7 +1075,7 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
 	return count;
 }
 
-static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
+static ssize_t store_min_perf_pct(struct kobject *a, struct kobj_attribute *b,
 				  const char *buf, size_t count)
 {
 	unsigned int input;
@@ -1107,12 +1107,13 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
 }
 
 static ssize_t show_hwp_dynamic_boost(struct kobject *kobj,
-				struct attribute *attr, char *buf)
+				struct kobj_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%u\n", hwp_boost);
 }
 
-static ssize_t store_hwp_dynamic_boost(struct kobject *a, struct attribute *b,
+static ssize_t store_hwp_dynamic_boost(struct kobject *a,
+				       struct kobj_attribute *b,
 				       const char *buf, size_t count)
 {
 	unsigned int input;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index bd7fbd6a4478..c19142911554 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -254,20 +254,12 @@ __ATTR(_name, 0644, show_##_name, store_##_name)
 static struct freq_attr _name =			\
 __ATTR(_name, 0200, NULL, store_##_name)
 
-struct global_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct kobject *kobj,
-			struct attribute *attr, char *buf);
-	ssize_t (*store)(struct kobject *a, struct attribute *b,
-			 const char *c, size_t count);
-};
-
 #define define_one_global_ro(_name)		\
-static struct global_attr _name =		\
+static struct kobj_attribute _name =		\
 __ATTR(_name, 0444, show_##_name, NULL)
 
 #define define_one_global_rw(_name)		\
-static struct global_attr _name =		\
+static struct kobj_attribute _name =		\
 __ATTR(_name, 0644, show_##_name, store_##_name)
 
 

From afa1f2ab43d48d0e1fa1bda524a0cf53e4cd6c87 Mon Sep 17 00:00:00 2001
From: Amit Kucheria <amit.kucheria@linaro.org>
Date: Tue, 29 Jan 2019 10:25:07 +0530
Subject: [PATCH 15/80] thermal: cpu_cooling: Require thermal core to be
 compiled in

The CPU cooling driver (cpu_cooling.c) allows the platform's cpufreq
driver to register as a cooling device and cool down the platform by
throttling the CPU frequency. In order to be able to auto-register a
cpufreq driver as a cooling device from the cpufreq core, we need access
to code inside cpu_cooling.c which, in turn, accesses code inside
thermal core.

CPU_FREQ is a bool while THERMAL is tristate.  In some configurations
(e.g. allmodconfig), CONFIG_THERMAL ends up as a module while
CONFIG_CPU_FREQ is compiled in. This leads to following error:

drivers/cpufreq/cpufreq.o: In function `cpufreq_offline':
cpufreq.c:(.text+0x407c): undefined reference to `cpufreq_cooling_unregister'
drivers/cpufreq/cpufreq.o: In function `cpufreq_online':
cpufreq.c:(.text+0x70c0): undefined reference to `of_cpufreq_cooling_register'

Given that platforms using CPU_THERMAL usually want it compiled-in so it
is available early in boot, make CPU_THERMAL depend on THERMAL being
compiled-in instead of allowing it to be a module.

As a result of this change, get rid of the ugly (!CPU_THERMAL ||
THERMAL) dependency in all cpufreq drivers using CPU_THERMAL.

Suggested-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Amit Kucheria <amit.kucheria@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/Kconfig     | 3 ---
 drivers/cpufreq/Kconfig.arm | 5 -----
 drivers/thermal/Kconfig     | 1 +
 3 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 608af20a3494..b22e6bba71f1 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -207,8 +207,6 @@ comment "CPU frequency scaling drivers"
 config CPUFREQ_DT
 	tristate "Generic DT based cpufreq driver"
 	depends on HAVE_CLK && OF
-	# if CPU_THERMAL is on and THERMAL=m, CPUFREQ_DT cannot be =y:
-	depends on !CPU_THERMAL || THERMAL
 	select CPUFREQ_DT_PLATDEV
 	select PM_OPP
 	help
@@ -327,7 +325,6 @@ endif
 config QORIQ_CPUFREQ
 	tristate "CPU frequency scaling driver for Freescale QorIQ SoCs"
 	depends on OF && COMMON_CLK && (PPC_E500MC || ARM || ARM64)
-	depends on !CPU_THERMAL || THERMAL
 	select CLK_QORIQ
 	help
 	  This adds the CPUFreq driver support for Freescale QorIQ SoCs
diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index 688f10227793..ca8567c3152c 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -29,8 +29,6 @@ config ARM_ARMADA_37XX_CPUFREQ
 config ARM_BIG_LITTLE_CPUFREQ
 	tristate "Generic ARM big LITTLE CPUfreq driver"
 	depends on ARM_CPU_TOPOLOGY && HAVE_CLK
-	# if CPU_THERMAL is on and THERMAL=m, ARM_BIT_LITTLE_CPUFREQ cannot be =y
-	depends on !CPU_THERMAL || THERMAL
 	select PM_OPP
 	help
 	  This enables the Generic CPUfreq driver for ARM big.LITTLE platforms.
@@ -38,7 +36,6 @@ config ARM_BIG_LITTLE_CPUFREQ
 config ARM_SCPI_CPUFREQ
 	tristate "SCPI based CPUfreq driver"
 	depends on ARM_SCPI_PROTOCOL && COMMON_CLK_SCPI
-	depends on !CPU_THERMAL || THERMAL
 	help
 	  This adds the CPUfreq driver support for ARM platforms using SCPI
 	  protocol for CPU power management.
@@ -93,7 +90,6 @@ config ARM_KIRKWOOD_CPUFREQ
 config ARM_MEDIATEK_CPUFREQ
 	tristate "CPU Frequency scaling support for MediaTek SoCs"
 	depends on ARCH_MEDIATEK && REGULATOR
-	depends on !CPU_THERMAL || THERMAL
 	select PM_OPP
 	help
 	  This adds the CPUFreq driver support for MediaTek SoCs.
@@ -233,7 +229,6 @@ config ARM_SA1110_CPUFREQ
 config ARM_SCMI_CPUFREQ
 	tristate "SCMI based CPUfreq driver"
 	depends on ARM_SCMI_PROTOCOL || COMPILE_TEST
-	depends on !CPU_THERMAL || THERMAL
 	select PM_OPP
 	help
 	  This adds the CPUfreq driver support for ARM platforms using SCMI
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index 30323426902e..58bb7d72dc2b 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -152,6 +152,7 @@ config CPU_THERMAL
 	bool "generic cpu cooling support"
 	depends on CPU_FREQ
 	depends on THERMAL_OF
+	depends on THERMAL=y
 	help
 	  This implements the generic cpu cooling mechanism through frequency
 	  reduction. An ACPI version of this already exists

From 5c238a8b599f1ae25eaeb08ad0e9e13e2b9eb023 Mon Sep 17 00:00:00 2001
From: Amit Kucheria <amit.kucheria@linaro.org>
Date: Wed, 30 Jan 2019 10:52:01 +0530
Subject: [PATCH 16/80] cpufreq: Auto-register the driver as a thermal cooling
 device if asked

All cpufreq drivers do similar things to register as a cooling device.
Provide a cpufreq driver flag so drivers can just ask the cpufreq core
to register the cooling device on their behalf. This allows us to get
rid of duplicated code in the drivers.

In order to allow this, we add a struct thermal_cooling_device pointer
to struct cpufreq_policy so that drivers don't need to store it in a
private data structure.

Suggested-by: Stephen Boyd <swboyd@chromium.org>
Suggested-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Amit Kucheria <amit.kucheria@linaro.org>
Reviewed-by: Matthias Kaehlcke <mka@chromium.org>
Tested-by: Matthias Kaehlcke <mka@chromium.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 11 +++++++++++
 include/linux/cpufreq.h   |  9 +++++++++
 2 files changed, 20 insertions(+)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 3eff158d9750..96a69c67a545 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -19,6 +19,7 @@
 
 #include <linux/cpu.h>
 #include <linux/cpufreq.h>
+#include <linux/cpu_cooling.h>
 #include <linux/delay.h>
 #include <linux/device.h>
 #include <linux/init.h>
@@ -1316,6 +1317,10 @@ static int cpufreq_online(unsigned int cpu)
 	if (cpufreq_driver->ready)
 		cpufreq_driver->ready(policy);
 
+	if (IS_ENABLED(CONFIG_CPU_THERMAL) &&
+	    cpufreq_driver->flags & CPUFREQ_IS_COOLING_DEV)
+		policy->cdev = of_cpufreq_cooling_register(policy);
+
 	pr_debug("initialization complete\n");
 
 	return 0;
@@ -1403,6 +1408,12 @@ static int cpufreq_offline(unsigned int cpu)
 		goto unlock;
 	}
 
+	if (IS_ENABLED(CONFIG_CPU_THERMAL) &&
+	    cpufreq_driver->flags & CPUFREQ_IS_COOLING_DEV) {
+		cpufreq_cooling_unregister(policy->cdev);
+		policy->cdev = NULL;
+	}
+
 	if (cpufreq_driver->stop_cpu)
 		cpufreq_driver->stop_cpu(policy);
 
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index c19142911554..9db074ecbbd7 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -151,6 +151,9 @@ struct cpufreq_policy {
 
 	/* For cpufreq driver's internal use */
 	void			*driver_data;
+
+	/* Pointer to the cooling device if used for thermal mitigation */
+	struct thermal_cooling_device *cdev;
 };
 
 /* Only for ACPI */
@@ -378,6 +381,12 @@ struct cpufreq_driver {
  */
 #define CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING	BIT(6)
 
+/*
+ * Set by drivers that want the core to automatically register the cpufreq
+ * driver as a thermal cooling device.
+ */
+#define CPUFREQ_IS_COOLING_DEV			BIT(7)
+
 int cpufreq_register_driver(struct cpufreq_driver *driver_data);
 int cpufreq_unregister_driver(struct cpufreq_driver *driver_data);
 

From 4c5ff1c8320d209c038f3698d8fa91fe946a3818 Mon Sep 17 00:00:00 2001
From: Amit Kucheria <amit.kucheria@linaro.org>
Date: Tue, 29 Jan 2019 10:25:09 +0530
Subject: [PATCH 17/80] cpufreq: qcom-hw: Register as a cpufreq cooling device

Add the CPUFREQ_IS_COOLING_DEV flag to allow the cpufreq core to
auto-register the driver as a cooling device.

Signed-off-by: Amit Kucheria <amit.kucheria@linaro.org>
Reviewed-by: Matthias Kaehlcke <mka@chromium.org>
Tested-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/qcom-cpufreq-hw.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
index d83939a1b3d4..c88b51304d89 100644
--- a/drivers/cpufreq/qcom-cpufreq-hw.c
+++ b/drivers/cpufreq/qcom-cpufreq-hw.c
@@ -231,7 +231,8 @@ static struct freq_attr *qcom_cpufreq_hw_attr[] = {
 
 static struct cpufreq_driver cpufreq_qcom_hw_driver = {
 	.flags		= CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK |
-			  CPUFREQ_HAVE_GOVERNOR_PER_POLICY,
+			  CPUFREQ_HAVE_GOVERNOR_PER_POLICY |
+			  CPUFREQ_IS_COOLING_DEV,
 	.verify		= cpufreq_generic_frequency_table_verify,
 	.target_index	= qcom_cpufreq_hw_target_index,
 	.get		= qcom_cpufreq_hw_get,

From 4b498869268ed631c7a383c50a6a6048b280dcd7 Mon Sep 17 00:00:00 2001
From: Amit Kucheria <amit.kucheria@linaro.org>
Date: Tue, 29 Jan 2019 10:25:10 +0530
Subject: [PATCH 18/80] cpufreq: imx6q: Use auto-registration of thermal
 cooling device

Use the CPUFREQ_IS_COOLING_DEV flag to allow cpufreq core to
automatically register as a thermal cooling device.

This allows removal of boiler plate code from the driver.

Signed-off-by: Amit Kucheria <amit.kucheria@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/imx6q-cpufreq.c | 24 ++----------------------
 1 file changed, 2 insertions(+), 22 deletions(-)

diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c
index 9fedf627e000..ca955713e070 100644
--- a/drivers/cpufreq/imx6q-cpufreq.c
+++ b/drivers/cpufreq/imx6q-cpufreq.c
@@ -9,7 +9,6 @@
 #include <linux/clk.h>
 #include <linux/cpu.h>
 #include <linux/cpufreq.h>
-#include <linux/cpu_cooling.h>
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/nvmem-consumer.h>
@@ -52,7 +51,6 @@ static struct clk_bulk_data clks[] = {
 };
 
 static struct device *cpu_dev;
-static struct thermal_cooling_device *cdev;
 static bool free_opp;
 static struct cpufreq_frequency_table *freq_table;
 static unsigned int max_freq;
@@ -193,16 +191,6 @@ static int imx6q_set_target(struct cpufreq_policy *policy, unsigned int index)
 	return 0;
 }
 
-static void imx6q_cpufreq_ready(struct cpufreq_policy *policy)
-{
-	cdev = of_cpufreq_cooling_register(policy);
-
-	if (!cdev)
-		dev_err(cpu_dev,
-			"running cpufreq without cooling device: %ld\n",
-			PTR_ERR(cdev));
-}
-
 static int imx6q_cpufreq_init(struct cpufreq_policy *policy)
 {
 	int ret;
@@ -214,22 +202,14 @@ static int imx6q_cpufreq_init(struct cpufreq_policy *policy)
 	return ret;
 }
 
-static int imx6q_cpufreq_exit(struct cpufreq_policy *policy)
-{
-	cpufreq_cooling_unregister(cdev);
-
-	return 0;
-}
-
 static struct cpufreq_driver imx6q_cpufreq_driver = {
-	.flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK,
+	.flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK |
+		 CPUFREQ_IS_COOLING_DEV,
 	.verify = cpufreq_generic_frequency_table_verify,
 	.target_index = imx6q_set_target,
 	.get = cpufreq_generic_get,
 	.init = imx6q_cpufreq_init,
-	.exit = imx6q_cpufreq_exit,
 	.name = "imx6q-cpufreq",
-	.ready = imx6q_cpufreq_ready,
 	.attr = cpufreq_generic_attr,
 	.suspend = cpufreq_generic_suspend,
 };

From e248d8d35cff6a8351a10544588deeca345bbbc6 Mon Sep 17 00:00:00 2001
From: Amit Kucheria <amit.kucheria@linaro.org>
Date: Tue, 29 Jan 2019 10:25:11 +0530
Subject: [PATCH 19/80] cpufreq: cpufreq-dt: Use auto-registration of thermal
 cooling device

Use the CPUFREQ_IS_COOLING_DEV flag to allow cpufreq core to
automatically register as a thermal cooling device.

This allows removal of boiler plate code from the driver.

Signed-off-by: Amit Kucheria <amit.kucheria@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq-dt.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index e58bfcb1169e..7ba392911cd0 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -13,7 +13,6 @@
 
 #include <linux/clk.h>
 #include <linux/cpu.h>
-#include <linux/cpu_cooling.h>
 #include <linux/cpufreq.h>
 #include <linux/cpumask.h>
 #include <linux/err.h>
@@ -30,7 +29,6 @@
 struct private_data {
 	struct opp_table *opp_table;
 	struct device *cpu_dev;
-	struct thermal_cooling_device *cdev;
 	const char *reg_name;
 	bool have_static_opps;
 };
@@ -301,7 +299,6 @@ static int cpufreq_exit(struct cpufreq_policy *policy)
 {
 	struct private_data *priv = policy->driver_data;
 
-	cpufreq_cooling_unregister(priv->cdev);
 	dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table);
 	if (priv->have_static_opps)
 		dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
@@ -314,21 +311,14 @@ static int cpufreq_exit(struct cpufreq_policy *policy)
 	return 0;
 }
 
-static void cpufreq_ready(struct cpufreq_policy *policy)
-{
-	struct private_data *priv = policy->driver_data;
-
-	priv->cdev = of_cpufreq_cooling_register(policy);
-}
-
 static struct cpufreq_driver dt_cpufreq_driver = {
-	.flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK,
+	.flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK |
+		 CPUFREQ_IS_COOLING_DEV,
 	.verify = cpufreq_generic_frequency_table_verify,
 	.target_index = set_target,
 	.get = cpufreq_generic_get,
 	.init = cpufreq_init,
 	.exit = cpufreq_exit,
-	.ready = cpufreq_ready,
 	.name = "cpufreq-dt",
 	.attr = cpufreq_dt_attr,
 	.suspend = cpufreq_generic_suspend,

From 0db60d6b89b921c26d6dac4ec7b35e0102d8f9f8 Mon Sep 17 00:00:00 2001
From: Amit Kucheria <amit.kucheria@linaro.org>
Date: Tue, 29 Jan 2019 10:25:12 +0530
Subject: [PATCH 20/80] cpufreq: mediatek: Use auto-registration of thermal
 cooling device

Use the CPUFREQ_IS_COOLING_DEV flag to allow cpufreq core to
automatically register as a thermal cooling device.

This allows removal of boiler plate code from the driver.

Signed-off-by: Amit Kucheria <amit.kucheria@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/mediatek-cpufreq.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/drivers/cpufreq/mediatek-cpufreq.c b/drivers/cpufreq/mediatek-cpufreq.c
index eb8920d39818..4229fcc31310 100644
--- a/drivers/cpufreq/mediatek-cpufreq.c
+++ b/drivers/cpufreq/mediatek-cpufreq.c
@@ -14,7 +14,6 @@
 
 #include <linux/clk.h>
 #include <linux/cpu.h>
-#include <linux/cpu_cooling.h>
 #include <linux/cpufreq.h>
 #include <linux/cpumask.h>
 #include <linux/module.h>
@@ -48,7 +47,6 @@ struct mtk_cpu_dvfs_info {
 	struct regulator *sram_reg;
 	struct clk *cpu_clk;
 	struct clk *inter_clk;
-	struct thermal_cooling_device *cdev;
 	struct list_head list_head;
 	int intermediate_voltage;
 	bool need_voltage_tracking;
@@ -307,13 +305,6 @@ static int mtk_cpufreq_set_target(struct cpufreq_policy *policy,
 
 #define DYNAMIC_POWER "dynamic-power-coefficient"
 
-static void mtk_cpufreq_ready(struct cpufreq_policy *policy)
-{
-	struct mtk_cpu_dvfs_info *info = policy->driver_data;
-
-	info->cdev = of_cpufreq_cooling_register(policy);
-}
-
 static int mtk_cpu_dvfs_info_init(struct mtk_cpu_dvfs_info *info, int cpu)
 {
 	struct device *cpu_dev;
@@ -472,7 +463,6 @@ static int mtk_cpufreq_exit(struct cpufreq_policy *policy)
 {
 	struct mtk_cpu_dvfs_info *info = policy->driver_data;
 
-	cpufreq_cooling_unregister(info->cdev);
 	dev_pm_opp_free_cpufreq_table(info->cpu_dev, &policy->freq_table);
 
 	return 0;
@@ -480,13 +470,13 @@ static int mtk_cpufreq_exit(struct cpufreq_policy *policy)
 
 static struct cpufreq_driver mtk_cpufreq_driver = {
 	.flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK |
-		 CPUFREQ_HAVE_GOVERNOR_PER_POLICY,
+		 CPUFREQ_HAVE_GOVERNOR_PER_POLICY |
+		 CPUFREQ_IS_COOLING_DEV,
 	.verify = cpufreq_generic_frequency_table_verify,
 	.target_index = mtk_cpufreq_set_target,
 	.get = cpufreq_generic_get,
 	.init = mtk_cpufreq_init,
 	.exit = mtk_cpufreq_exit,
-	.ready = mtk_cpufreq_ready,
 	.name = "mtk-cpufreq",
 	.attr = cpufreq_generic_attr,
 };

From 17170ec17109bdcd26bf542392a5bafcb6a5732f Mon Sep 17 00:00:00 2001
From: Amit Kucheria <amit.kucheria@linaro.org>
Date: Tue, 29 Jan 2019 10:25:13 +0530
Subject: [PATCH 21/80] cpufreq: qoriq: Use auto-registration of thermal
 cooling device

Use the CPUFREQ_IS_COOLING_DEV flag to allow cpufreq core to
automatically register as a thermal cooling device.

This allows removal of boiler plate code from the driver.

Signed-off-by: Amit Kucheria <amit.kucheria@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/qoriq-cpufreq.c | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/drivers/cpufreq/qoriq-cpufreq.c b/drivers/cpufreq/qoriq-cpufreq.c
index 3d773f64b4df..4295e5476264 100644
--- a/drivers/cpufreq/qoriq-cpufreq.c
+++ b/drivers/cpufreq/qoriq-cpufreq.c
@@ -13,7 +13,6 @@
 #include <linux/clk.h>
 #include <linux/clk-provider.h>
 #include <linux/cpufreq.h>
-#include <linux/cpu_cooling.h>
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -31,7 +30,6 @@
 struct cpu_data {
 	struct clk **pclk;
 	struct cpufreq_frequency_table *table;
-	struct thermal_cooling_device *cdev;
 };
 
 /*
@@ -239,7 +237,6 @@ static int qoriq_cpufreq_cpu_exit(struct cpufreq_policy *policy)
 {
 	struct cpu_data *data = policy->driver_data;
 
-	cpufreq_cooling_unregister(data->cdev);
 	kfree(data->pclk);
 	kfree(data->table);
 	kfree(data);
@@ -258,23 +255,15 @@ static int qoriq_cpufreq_target(struct cpufreq_policy *policy,
 	return clk_set_parent(policy->clk, parent);
 }
 
-
-static void qoriq_cpufreq_ready(struct cpufreq_policy *policy)
-{
-	struct cpu_data *cpud = policy->driver_data;
-
-	cpud->cdev = of_cpufreq_cooling_register(policy);
-}
-
 static struct cpufreq_driver qoriq_cpufreq_driver = {
 	.name		= "qoriq_cpufreq",
-	.flags		= CPUFREQ_CONST_LOOPS,
+	.flags		= CPUFREQ_CONST_LOOPS |
+			  CPUFREQ_IS_COOLING_DEV,
 	.init		= qoriq_cpufreq_cpu_init,
 	.exit		= qoriq_cpufreq_cpu_exit,
 	.verify		= cpufreq_generic_frequency_table_verify,
 	.target_index	= qoriq_cpufreq_target,
 	.get		= cpufreq_generic_get,
-	.ready		= qoriq_cpufreq_ready,
 	.attr		= cpufreq_generic_attr,
 };
 

From 5da7af9a94a735c683de6271f5c1704e8e756695 Mon Sep 17 00:00:00 2001
From: Amit Kucheria <amit.kucheria@linaro.org>
Date: Tue, 29 Jan 2019 10:25:14 +0530
Subject: [PATCH 22/80] cpufreq: scmi: Use auto-registration of thermal cooling
 device

Use the CPUFREQ_IS_COOLING_DEV flag to allow cpufreq core to
automatically register as a thermal cooling device.

This allows removal of boiler plate code from the driver.

Signed-off-by: Amit Kucheria <amit.kucheria@linaro.org>
Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/scmi-cpufreq.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c
index 242c3370544e..66b633b48eb1 100644
--- a/drivers/cpufreq/scmi-cpufreq.c
+++ b/drivers/cpufreq/scmi-cpufreq.c
@@ -11,7 +11,6 @@
 #include <linux/cpu.h>
 #include <linux/cpufreq.h>
 #include <linux/cpumask.h>
-#include <linux/cpu_cooling.h>
 #include <linux/export.h>
 #include <linux/module.h>
 #include <linux/pm_opp.h>
@@ -22,7 +21,6 @@
 struct scmi_data {
 	int domain_id;
 	struct device *cpu_dev;
-	struct thermal_cooling_device *cdev;
 };
 
 static const struct scmi_handle *handle;
@@ -185,7 +183,6 @@ static int scmi_cpufreq_exit(struct cpufreq_policy *policy)
 {
 	struct scmi_data *priv = policy->driver_data;
 
-	cpufreq_cooling_unregister(priv->cdev);
 	dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table);
 	kfree(priv);
 	dev_pm_opp_remove_all_dynamic(priv->cpu_dev);
@@ -193,17 +190,11 @@ static int scmi_cpufreq_exit(struct cpufreq_policy *policy)
 	return 0;
 }
 
-static void scmi_cpufreq_ready(struct cpufreq_policy *policy)
-{
-	struct scmi_data *priv = policy->driver_data;
-
-	priv->cdev = of_cpufreq_cooling_register(policy);
-}
-
 static struct cpufreq_driver scmi_cpufreq_driver = {
 	.name	= "scmi",
 	.flags	= CPUFREQ_STICKY | CPUFREQ_HAVE_GOVERNOR_PER_POLICY |
-		  CPUFREQ_NEED_INITIAL_FREQ_CHECK,
+		  CPUFREQ_NEED_INITIAL_FREQ_CHECK |
+		  CPUFREQ_IS_COOLING_DEV,
 	.verify	= cpufreq_generic_frequency_table_verify,
 	.attr	= cpufreq_generic_attr,
 	.target_index	= scmi_cpufreq_set_target,
@@ -211,7 +202,6 @@ static struct cpufreq_driver scmi_cpufreq_driver = {
 	.get	= scmi_cpufreq_get_rate,
 	.init	= scmi_cpufreq_init,
 	.exit	= scmi_cpufreq_exit,
-	.ready	= scmi_cpufreq_ready,
 };
 
 static int scmi_cpufreq_probe(struct scmi_device *sdev)

From cb772b8ce4b9f5353c34f4d38ab0c697c0071625 Mon Sep 17 00:00:00 2001
From: Amit Kucheria <amit.kucheria@linaro.org>
Date: Tue, 29 Jan 2019 10:25:15 +0530
Subject: [PATCH 23/80] cpufreq: scpi: Use auto-registration of thermal cooling
 device

Use the CPUFREQ_IS_COOLING_DEV flag to allow cpufreq core to
automatically register as a thermal cooling device.

This allows removal of boiler plate code from the driver.

Signed-off-by: Amit Kucheria <amit.kucheria@linaro.org>
Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/scpi-cpufreq.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c
index 99449738faa4..1db2f6927e13 100644
--- a/drivers/cpufreq/scpi-cpufreq.c
+++ b/drivers/cpufreq/scpi-cpufreq.c
@@ -22,7 +22,6 @@
 #include <linux/cpu.h>
 #include <linux/cpufreq.h>
 #include <linux/cpumask.h>
-#include <linux/cpu_cooling.h>
 #include <linux/export.h>
 #include <linux/module.h>
 #include <linux/of_platform.h>
@@ -34,7 +33,6 @@
 struct scpi_data {
 	struct clk *clk;
 	struct device *cpu_dev;
-	struct thermal_cooling_device *cdev;
 };
 
 static struct scpi_ops *scpi_ops;
@@ -186,7 +184,6 @@ static int scpi_cpufreq_exit(struct cpufreq_policy *policy)
 {
 	struct scpi_data *priv = policy->driver_data;
 
-	cpufreq_cooling_unregister(priv->cdev);
 	clk_put(priv->clk);
 	dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table);
 	kfree(priv);
@@ -195,23 +192,16 @@ static int scpi_cpufreq_exit(struct cpufreq_policy *policy)
 	return 0;
 }
 
-static void scpi_cpufreq_ready(struct cpufreq_policy *policy)
-{
-	struct scpi_data *priv = policy->driver_data;
-
-	priv->cdev = of_cpufreq_cooling_register(policy);
-}
-
 static struct cpufreq_driver scpi_cpufreq_driver = {
 	.name	= "scpi-cpufreq",
 	.flags	= CPUFREQ_STICKY | CPUFREQ_HAVE_GOVERNOR_PER_POLICY |
-		  CPUFREQ_NEED_INITIAL_FREQ_CHECK,
+		  CPUFREQ_NEED_INITIAL_FREQ_CHECK |
+		  CPUFREQ_IS_COOLING_DEV,
 	.verify	= cpufreq_generic_frequency_table_verify,
 	.attr	= cpufreq_generic_attr,
 	.get	= scpi_cpufreq_get_rate,
 	.init	= scpi_cpufreq_init,
 	.exit	= scpi_cpufreq_exit,
-	.ready	= scpi_cpufreq_ready,
 	.target_index	= scpi_cpufreq_set_target,
 };
 

From 58456488e0e3793f2f95a3d2e2a78b6eba0ad0aa Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Wed, 23 Jan 2019 08:50:13 +0100
Subject: [PATCH 24/80] PM-runtime: update accounting_timestamp on enable

Initializing accounting_timestamp to something different from 0 during
pm_runtime_init() doesn't make sense and puts an artificial ordering
constraint between timekeeping_init() and pm_runtime_init().

PM-runtime should start time accounting only when it is enabled and
discard the period when disabled.

Set accounting_timestamp to now when enabling PM-runtime.

Suggested-by: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
[ rjw: Subject & changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/runtime.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index a453090c9449..f23b7ecfce3b 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -1309,10 +1309,15 @@ void pm_runtime_enable(struct device *dev)
 
 	spin_lock_irqsave(&dev->power.lock, flags);
 
-	if (dev->power.disable_depth > 0)
+	if (dev->power.disable_depth > 0) {
 		dev->power.disable_depth--;
-	else
+
+		/* About to enable runtime pm, set accounting_timestamp to now */
+		if (!dev->power.disable_depth)
+			dev->power.accounting_timestamp = jiffies;
+	} else {
 		dev_warn(dev, "Unbalanced %s!\n", __func__);
+	}
 
 	WARN(!dev->power.disable_depth &&
 	     dev->power.runtime_status == RPM_SUSPENDED &&
@@ -1509,7 +1514,7 @@ void pm_runtime_init(struct device *dev)
 	dev->power.request_pending = false;
 	dev->power.request = RPM_REQ_NONE;
 	dev->power.deferred_resume = false;
-	dev->power.accounting_timestamp = jiffies;
+	dev->power.accounting_timestamp = 0;
 	INIT_WORK(&dev->power.work, pm_runtime_work);
 
 	dev->power.timer_expires = 0;

From a08c2a5a31941131c41feaa0429e4c8854cf48f2 Mon Sep 17 00:00:00 2001
From: Thara Gopinath <thara.gopinath@linaro.org>
Date: Wed, 23 Jan 2019 08:50:14 +0100
Subject: [PATCH 25/80] PM-runtime: Replace jiffies-based accounting with
 ktime-based accounting

Replace jiffies-based accounting for runtime_active_time and
runtime_suspended_time with ktime-based accounting. This makes the
runtime debug counters inline with genpd and other PM subsytems which
use ktime-based accounting.

Timekeeping is initialized before driver_init(). It's only at that time
that PM-runtime can be enabled.

Signed-off-by: Thara Gopinath <thara.gopinath@linaro.org>
[switch from ktime to raw nsec]
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/runtime.c | 17 +++++++++--------
 drivers/base/power/sysfs.c   | 11 ++++++++---
 include/linux/pm.h           |  6 +++---
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index f23b7ecfce3b..eb1a3b878e1e 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -66,8 +66,8 @@ static int rpm_suspend(struct device *dev, int rpmflags);
  */
 void update_pm_runtime_accounting(struct device *dev)
 {
-	unsigned long now = jiffies;
-	unsigned long delta;
+	u64 now = ktime_to_ns(ktime_get());
+	u64 delta;
 
 	delta = now - dev->power.accounting_timestamp;
 
@@ -77,9 +77,9 @@ void update_pm_runtime_accounting(struct device *dev)
 		return;
 
 	if (dev->power.runtime_status == RPM_SUSPENDED)
-		dev->power.suspended_jiffies += delta;
+		dev->power.suspended_time += delta;
 	else
-		dev->power.active_jiffies += delta;
+		dev->power.active_time += delta;
 }
 
 static void __update_runtime_status(struct device *dev, enum rpm_status status)
@@ -90,16 +90,17 @@ static void __update_runtime_status(struct device *dev, enum rpm_status status)
 
 u64 pm_runtime_suspended_time(struct device *dev)
 {
-	unsigned long flags, time;
+	u64 time;
+	unsigned long flags;
 
 	spin_lock_irqsave(&dev->power.lock, flags);
 
 	update_pm_runtime_accounting(dev);
-	time = dev->power.suspended_jiffies;
+	time = dev->power.suspended_time;
 
 	spin_unlock_irqrestore(&dev->power.lock, flags);
 
-	return jiffies_to_nsecs(time);
+	return time;
 }
 EXPORT_SYMBOL_GPL(pm_runtime_suspended_time);
 
@@ -1314,7 +1315,7 @@ void pm_runtime_enable(struct device *dev)
 
 		/* About to enable runtime pm, set accounting_timestamp to now */
 		if (!dev->power.disable_depth)
-			dev->power.accounting_timestamp = jiffies;
+			dev->power.accounting_timestamp = ktime_to_ns(ktime_get());
 	} else {
 		dev_warn(dev, "Unbalanced %s!\n", __func__);
 	}
diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c
index d713738ce796..96c8a227610a 100644
--- a/drivers/base/power/sysfs.c
+++ b/drivers/base/power/sysfs.c
@@ -125,9 +125,12 @@ static ssize_t runtime_active_time_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
 	int ret;
+	u64 tmp;
 	spin_lock_irq(&dev->power.lock);
 	update_pm_runtime_accounting(dev);
-	ret = sprintf(buf, "%i\n", jiffies_to_msecs(dev->power.active_jiffies));
+	tmp = dev->power.active_time;
+	do_div(tmp, NSEC_PER_MSEC);
+	ret = sprintf(buf, "%llu\n", tmp);
 	spin_unlock_irq(&dev->power.lock);
 	return ret;
 }
@@ -138,10 +141,12 @@ static ssize_t runtime_suspended_time_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
 	int ret;
+	u64 tmp;
 	spin_lock_irq(&dev->power.lock);
 	update_pm_runtime_accounting(dev);
-	ret = sprintf(buf, "%i\n",
-		jiffies_to_msecs(dev->power.suspended_jiffies));
+	tmp = dev->power.suspended_time;
+	do_div(tmp, NSEC_PER_MSEC);
+	ret = sprintf(buf, "%llu\n", tmp);
 	spin_unlock_irq(&dev->power.lock);
 	return ret;
 }
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 0bd9de116826..3d2cbf947768 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -633,9 +633,9 @@ struct dev_pm_info {
 	int			runtime_error;
 	int			autosuspend_delay;
 	u64			last_busy;
-	unsigned long		active_jiffies;
-	unsigned long		suspended_jiffies;
-	unsigned long		accounting_timestamp;
+	u64			active_time;
+	u64			suspended_time;
+	u64			accounting_timestamp;
 #endif
 	struct pm_subsys_data	*subsys_data;  /* Owned by the subsystem. */
 	void (*set_latency_tolerance)(struct device *, s32);

From 10b818211d04805da731087e596fc7c6ce199dea Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 1 Feb 2019 11:45:44 +0530
Subject: [PATCH 26/80] cpufreq: stats: Declare freq-attr right after their
 callbacks

Freq attribute for "trans_table" is defined right after its callback
(without any blank line between them), but the others are defined
separately later on. Keep this consistent and define all attributes
right after their callbacks.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq_stats.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index 1572129844a5..941e63e3e652 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -51,6 +51,7 @@ static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf)
 {
 	return sprintf(buf, "%d\n", policy->stats->total_trans);
 }
+cpufreq_freq_attr_ro(total_trans);
 
 static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf)
 {
@@ -69,6 +70,7 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf)
 	}
 	return len;
 }
+cpufreq_freq_attr_ro(time_in_state);
 
 static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf,
 			   size_t count)
@@ -77,6 +79,7 @@ static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf,
 	cpufreq_stats_clear_table(policy->stats);
 	return count;
 }
+cpufreq_freq_attr_wo(reset);
 
 static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
 {
@@ -126,10 +129,6 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
 }
 cpufreq_freq_attr_ro(trans_table);
 
-cpufreq_freq_attr_ro(total_trans);
-cpufreq_freq_attr_ro(time_in_state);
-cpufreq_freq_attr_wo(reset);
-
 static struct attribute *default_attrs[] = {
 	&total_trans.attr,
 	&time_in_state.attr,

From 9795607dc41e7606e90e48bffe6927bee32cd336 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 1 Feb 2019 11:45:45 +0530
Subject: [PATCH 27/80] cpufreq: stats: Fix concurrency issues while resetting
 stats

It is possible for cpufreq_stats_clear_table() and
cpufreq_stats_record_transition() to get called concurrently and they
will try to update same variables simultaneously and may lead to
corruption of data.

Prevent that with the help of existing spinlock.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq_stats.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index 941e63e3e652..e2db5581489a 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -31,20 +31,20 @@ static void cpufreq_stats_update(struct cpufreq_stats *stats)
 {
 	unsigned long long cur_time = get_jiffies_64();
 
-	spin_lock(&cpufreq_stats_lock);
 	stats->time_in_state[stats->last_index] += cur_time - stats->last_time;
 	stats->last_time = cur_time;
-	spin_unlock(&cpufreq_stats_lock);
 }
 
 static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
 {
 	unsigned int count = stats->max_state;
 
+	spin_lock(&cpufreq_stats_lock);
 	memset(stats->time_in_state, 0, count * sizeof(u64));
 	memset(stats->trans_table, 0, count * count * sizeof(int));
 	stats->last_time = get_jiffies_64();
 	stats->total_trans = 0;
+	spin_unlock(&cpufreq_stats_lock);
 }
 
 static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf)
@@ -62,7 +62,10 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf)
 	if (policy->fast_switch_enabled)
 		return 0;
 
+	spin_lock(&cpufreq_stats_lock);
 	cpufreq_stats_update(stats);
+	spin_unlock(&cpufreq_stats_lock);
+
 	for (i = 0; i < stats->state_num; i++) {
 		len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i],
 			(unsigned long long)
@@ -239,9 +242,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy,
 	if (old_index == -1 || new_index == -1 || old_index == new_index)
 		return;
 
+	spin_lock(&cpufreq_stats_lock);
 	cpufreq_stats_update(stats);
 
 	stats->last_index = new_index;
 	stats->trans_table[old_index * stats->max_state + new_index]++;
 	stats->total_trans++;
+	spin_unlock(&cpufreq_stats_lock);
 }

From f800ea320c09fbc8bf15aecd5c05e8e6b27ae64e Mon Sep 17 00:00:00 2001
From: Ladislav Michl <ladis@linux-mips.org>
Date: Wed, 30 Jan 2019 22:40:17 +0100
Subject: [PATCH 28/80] PM-runtime: Optimize
 pm_runtime_autosuspend_expiration()

pm_runtime_autosuspend_expiration calls ktime_get_mono_fast_ns()
even when its returned value may be unused. Therefore get the
current time later and remove gotos while there.

Signed-off-by: Ladislav Michl <ladis@linux-mips.org>
Acked-by: Tony Lindgren <tony@atomide.com>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/runtime.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 4eaf166d7de1..1caa3944898f 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -145,24 +145,21 @@ static void pm_runtime_cancel_pending(struct device *dev)
 u64 pm_runtime_autosuspend_expiration(struct device *dev)
 {
 	int autosuspend_delay;
-	u64 last_busy, expires = 0;
-	u64 now = ktime_get_mono_fast_ns();
+	u64 expires;
 
 	if (!dev->power.use_autosuspend)
-		goto out;
+		return 0;
 
 	autosuspend_delay = READ_ONCE(dev->power.autosuspend_delay);
 	if (autosuspend_delay < 0)
-		goto out;
+		return 0;
 
-	last_busy = READ_ONCE(dev->power.last_busy);
+	expires  = READ_ONCE(dev->power.last_busy);
+	expires += (u64)autosuspend_delay * NSEC_PER_MSEC;
+	if (expires > ktime_get_mono_fast_ns())
+		return expires;	/* Expires in the future */
 
-	expires = last_busy + (u64)autosuspend_delay * NSEC_PER_MSEC;
-	if (expires <= now)
-		expires = 0;	/* Already expired. */
-
- out:
-	return expires;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration);
 

From db10945cf49e9c04053b436abe334412003f9af9 Mon Sep 17 00:00:00 2001
From: Joseph Lo <josephl@nvidia.com>
Date: Fri, 1 Feb 2019 10:16:39 +0800
Subject: [PATCH 29/80] cpuidle: dt: bail out if the idle-state DT node is not
 compatible

Currently, the DT of the idle states will be parsed first whether it's
compatible or not. This could cause a warning message that comes from if
the CPU doesn't support identical idle states. E.g. Tegra186 can run
with 2 Cortex-A57 and 2 Denver cores with different idle states on
different types of these cores.

So fix it by checking the match node earlier, then it can make sure it
only goes through the idle states that the CPU supported.

Signed-off-by: Joseph Lo <josephl@nvidia.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/dt_idle_states.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/cpuidle/dt_idle_states.c b/drivers/cpuidle/dt_idle_states.c
index 53342b7f1010..add9569636b5 100644
--- a/drivers/cpuidle/dt_idle_states.c
+++ b/drivers/cpuidle/dt_idle_states.c
@@ -22,16 +22,12 @@
 #include "dt_idle_states.h"
 
 static int init_state_node(struct cpuidle_state *idle_state,
-			   const struct of_device_id *matches,
+			   const struct of_device_id *match_id,
 			   struct device_node *state_node)
 {
 	int err;
-	const struct of_device_id *match_id;
 	const char *desc;
 
-	match_id = of_match_node(matches, state_node);
-	if (!match_id)
-		return -ENODEV;
 	/*
 	 * CPUidle drivers are expected to initialize the const void *data
 	 * pointer of the passed in struct of_device_id array to the idle
@@ -160,6 +156,7 @@ int dt_init_idle_driver(struct cpuidle_driver *drv,
 {
 	struct cpuidle_state *idle_state;
 	struct device_node *state_node, *cpu_node;
+	const struct of_device_id *match_id;
 	int i, err = 0;
 	const cpumask_t *cpumask;
 	unsigned int state_idx = start_idx;
@@ -180,6 +177,12 @@ int dt_init_idle_driver(struct cpuidle_driver *drv,
 		if (!state_node)
 			break;
 
+		match_id = of_match_node(matches, state_node);
+		if (!match_id) {
+			err = -ENODEV;
+			break;
+		}
+
 		if (!of_device_is_available(state_node)) {
 			of_node_put(state_node);
 			continue;
@@ -198,7 +201,7 @@ int dt_init_idle_driver(struct cpuidle_driver *drv,
 		}
 
 		idle_state = &drv->states[state_idx++];
-		err = init_state_node(idle_state, matches, state_node);
+		err = init_state_node(idle_state, match_id, state_node);
 		if (err) {
 			pr_err("Parsing idle state node %pOF failed with err %d\n",
 			       state_node, err);

From c155f6499f9797f200aa46eb3ccbf198f4206970 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Mon, 4 Feb 2019 17:25:52 +0100
Subject: [PATCH 30/80] PM-runtime: Switch accounting over to
 ktime_get_mono_fast_ns()

Similar to what happened whith autosuspend, a deadlock has been
reported with PM-runtime accounting in the call path:

change_clocksource
    ...
    write_seqcount_begin
    ...
    timekeeping_update
        ...
        sh_cmt_clocksource_enable
            ...
            rpm_resume
                update_pm_runtime_accounting
                    ktime_get
                        do
                            read_seqcount_begin
                        while read_seqcount_retry
    ....
    write_seqcount_end

Make PM-runtime accounting use ktime_get_mono_fast_ns() to avoid this
problem.

With ktime_get_mono_fast_ns(), the timestamp is not guaranteed to be
monotonic across an update of timekeeping and as a result time can go
backward. Add a test to skip accounting for such situation which should
stay exceptional.

Fixes: a08c2a5a3194 ("PM-runtime: Replace jiffies-based accounting with ktime-based accounting")
Reported-by: Biju Das <biju.das@bp.renesas.com>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
[ rjw: Subject, changelog, comment cleanup ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/runtime.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 1caa3944898f..6a58bb77a018 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -66,13 +66,22 @@ static int rpm_suspend(struct device *dev, int rpmflags);
  */
 void update_pm_runtime_accounting(struct device *dev)
 {
-	u64 now = ktime_to_ns(ktime_get());
+	u64 now = ktime_get_mono_fast_ns();
+	u64 last = dev->power.accounting_timestamp;
 	u64 delta;
 
-	delta = now - dev->power.accounting_timestamp;
-
 	dev->power.accounting_timestamp = now;
 
+	/*
+	 * Because ktime_get_mono_fast_ns() is not monotonic during
+	 * timekeeping updates, ensure that 'now' is after the last saved
+	 * timesptamp.
+	 */
+	if (now < last)
+		return;
+
+	delta = now - last;
+
 	if (dev->power.disable_depth > 0)
 		return;
 
@@ -1312,7 +1321,7 @@ void pm_runtime_enable(struct device *dev)
 
 		/* About to enable runtime pm, set accounting_timestamp to now */
 		if (!dev->power.disable_depth)
-			dev->power.accounting_timestamp = ktime_to_ns(ktime_get());
+			dev->power.accounting_timestamp = ktime_get_mono_fast_ns();
 	} else {
 		dev_warn(dev, "Unbalanced %s!\n", __func__);
 	}

From fed7e88c0702e30fcd55563ca6eecd548a5d13af Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Mon, 4 Feb 2019 17:25:53 +0100
Subject: [PATCH 31/80] PM-runtime: update time accounting only when enabled

Update the accounting_timestamp field only when PM runtime is enabled
and don't forget to account the last state before disabling it.

Suggested-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
[ rjw: Minor cleanups ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/runtime.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 6a58bb77a018..04407d9f76fd 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -66,10 +66,14 @@ static int rpm_suspend(struct device *dev, int rpmflags);
  */
 void update_pm_runtime_accounting(struct device *dev)
 {
-	u64 now = ktime_get_mono_fast_ns();
-	u64 last = dev->power.accounting_timestamp;
-	u64 delta;
+	u64 now, last, delta;
 
+	if (dev->power.disable_depth > 0)
+		return;
+
+	last = dev->power.accounting_timestamp;
+
+	now = ktime_get_mono_fast_ns();
 	dev->power.accounting_timestamp = now;
 
 	/*
@@ -82,9 +86,6 @@ void update_pm_runtime_accounting(struct device *dev)
 
 	delta = now - last;
 
-	if (dev->power.disable_depth > 0)
-		return;
-
 	if (dev->power.runtime_status == RPM_SUSPENDED)
 		dev->power.suspended_time += delta;
 	else
@@ -1298,6 +1299,9 @@ void __pm_runtime_disable(struct device *dev, bool check_resume)
 		pm_runtime_put_noidle(dev);
 	}
 
+	/* Update time accounting before disabling PM-runtime. */
+	update_pm_runtime_accounting(dev);
+
 	if (!dev->power.disable_depth++)
 		__pm_runtime_barrier(dev);
 
@@ -1521,7 +1525,6 @@ void pm_runtime_init(struct device *dev)
 	dev->power.request_pending = false;
 	dev->power.request = RPM_REQ_NONE;
 	dev->power.deferred_resume = false;
-	dev->power.accounting_timestamp = 0;
 	INIT_WORK(&dev->power.work, pm_runtime_work);
 
 	dev->power.timer_expires = 0;

From 76d004bf72c9c7bcbcb9ff092898530cedbca296 Mon Sep 17 00:00:00 2001
From: Quentin Perret <quentin.perret@arm.com>
Date: Mon, 4 Feb 2019 11:09:49 +0000
Subject: [PATCH 32/80] cpufreq: dt: Register an Energy Model

Now that PM_OPP provides a helper function to estimate the power
consumed by CPUs, make sure to try and register an Energy Model (EM)
from cpufreq-dt, hence ensuring interested subsystems (the task
scheduler, for example) can make use of that information when available.

Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq-dt.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index e58bfcb1169e..d74e0402512a 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -280,6 +280,8 @@ static int cpufreq_init(struct cpufreq_policy *policy)
 	policy->cpuinfo.transition_latency = transition_latency;
 	policy->dvfs_possible_from_any_cpu = true;
 
+	dev_pm_opp_of_register_em(policy->cpus);
+
 	return 0;
 
 out_free_cpufreq_table:

From 6915d7ad2105447776aa1a2ee83170f98de0a4e5 Mon Sep 17 00:00:00 2001
From: Quentin Perret <quentin.perret@arm.com>
Date: Mon, 4 Feb 2019 11:09:50 +0000
Subject: [PATCH 33/80] cpufreq: scpi: Register an Energy Model

Now that PM_OPP provides a helper function to estimate the power
consumed by CPUs, make sure to try and register an Energy Model (EM)
from scpi-cpufreq, hence ensuring interested subsystems (the task
scheduler, for example) can make use of that information when available.

Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/scpi-cpufreq.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c
index 87a98ec77773..c7e177011cff 100644
--- a/drivers/cpufreq/scpi-cpufreq.c
+++ b/drivers/cpufreq/scpi-cpufreq.c
@@ -170,6 +170,9 @@ static int scpi_cpufreq_init(struct cpufreq_policy *policy)
 	policy->cpuinfo.transition_latency = latency;
 
 	policy->fast_switch_possible = false;
+
+	dev_pm_opp_of_register_em(policy->cpus);
+
 	return 0;
 
 out_free_cpufreq_table:

From 285881b51eb58dacae78073763e782e56e2fb253 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 31 Jan 2019 13:53:25 +0530
Subject: [PATCH 34/80] PM / OPP: Remove unused parameter of
 _generic_set_opp_clk_only()

The previous frequency value isn't getting used in the routine
_generic_set_opp_clk_only(), drop it.

Reviewed-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index e5507add8f04..57d88a275de0 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -533,9 +533,8 @@ static int _set_opp_voltage(struct device *dev, struct regulator *reg,
 	return ret;
 }
 
-static inline int
-_generic_set_opp_clk_only(struct device *dev, struct clk *clk,
-			  unsigned long old_freq, unsigned long freq)
+static inline int _generic_set_opp_clk_only(struct device *dev, struct clk *clk,
+					    unsigned long freq)
 {
 	int ret;
 
@@ -572,7 +571,7 @@ static int _generic_set_opp_regulator(const struct opp_table *opp_table,
 	}
 
 	/* Change frequency */
-	ret = _generic_set_opp_clk_only(dev, opp_table->clk, old_freq, freq);
+	ret = _generic_set_opp_clk_only(dev, opp_table->clk, freq);
 	if (ret)
 		goto restore_voltage;
 
@@ -586,7 +585,7 @@ static int _generic_set_opp_regulator(const struct opp_table *opp_table,
 	return 0;
 
 restore_freq:
-	if (_generic_set_opp_clk_only(dev, opp_table->clk, freq, old_freq))
+	if (_generic_set_opp_clk_only(dev, opp_table->clk, old_freq))
 		dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n",
 			__func__, old_freq);
 restore_voltage:
@@ -759,7 +758,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 						 opp->supplies);
 	} else {
 		/* Only frequency scaling */
-		ret = _generic_set_opp_clk_only(dev, clk, old_freq, freq);
+		ret = _generic_set_opp_clk_only(dev, clk, freq);
 	}
 
 	/* Scaling down? Configure required OPPs after frequency */

From 2516d670052d5c3eb3253e5c815ff85bf0daf35a Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Mon, 4 Feb 2019 11:09:51 +0000
Subject: [PATCH 35/80] cpufreq: arm_big_little: Register an Energy Model

Now that PM_OPP provides a helper function to estimate the power
consumed by CPUs, make sure to try and register an Energy Model (EM)
from the arm_big_little CPUFreq driver, hence ensuring interested
subsystems (the task scheduler, for example) can make use of that
information when available.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/arm_big_little.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/cpufreq/arm_big_little.c b/drivers/cpufreq/arm_big_little.c
index cf62a1f64dd7..7fe52fcddcf1 100644
--- a/drivers/cpufreq/arm_big_little.c
+++ b/drivers/cpufreq/arm_big_little.c
@@ -487,6 +487,8 @@ static int bL_cpufreq_init(struct cpufreq_policy *policy)
 	policy->cpuinfo.transition_latency =
 				arm_bL_ops->get_transition_latency(cpu_dev);
 
+	dev_pm_opp_of_register_em(policy->cpus);
+
 	if (is_bL_switching_enabled())
 		per_cpu(cpu_last_req_freq, policy->cpu) = clk_get_cpu_rate(policy->cpu);
 

From a4f342b9607d8c2034d3135cbbb11b4028be3678 Mon Sep 17 00:00:00 2001
From: Quentin Perret <quentin.perret@arm.com>
Date: Mon, 4 Feb 2019 11:09:48 +0000
Subject: [PATCH 36/80] PM / OPP: Introduce a power estimation helper

The Energy Model (EM) framework provides an API to let drivers register
the active power of CPUs. The drivers are expected to provide a callback
method which estimates the power consumed by a CPU at each available
performance levels. How exactly this should be implemented, however,
depends on the platform.

On some systems, PM_OPP knows the voltage and frequency at which CPUs
can run. When coupled with the CPU 'capacitance' (as provided by the
'dynamic-power-coefficient' devicetree binding), it is possible to
estimate the dynamic power consumption of a CPU as P = C * V^2 * f, with
C its capacitance and V and f respectively the voltage and frequency of
the OPP. The Intelligent Power Allocator (IPA) thermal governor already
implements that estimation method, in the thermal framework.

However, this power estimation method can be applied to any platform
where all the parameters are known (C, V and f), and not only those
suffering thermal issues. As such, the code implementing this feature
can be re-used to also populate the EM framework now used by EAS.

As a first step, introduce in PM_OPP a helper function which CPUFreq
drivers can use to register into the EM framework. This duplicates the
power estimation done in IPA until it can be migrated to using the EM
framework. This will be done later, once the EM framework has support
for at least all platforms currently supported by IPA.

Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Tested-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Matthias Kaehlcke <mka@chromium.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/of.c       | 99 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/pm_opp.h |  6 +++
 2 files changed, 105 insertions(+)

diff --git a/drivers/opp/of.c b/drivers/opp/of.c
index 06f0f632ec47..cd58959e5158 100644
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -20,6 +20,7 @@
 #include <linux/pm_domain.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/energy_model.h>
 
 #include "opp.h"
 
@@ -1047,3 +1048,101 @@ struct device_node *dev_pm_opp_get_of_node(struct dev_pm_opp *opp)
 	return of_node_get(opp->np);
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_get_of_node);
+
+/*
+ * Callback function provided to the Energy Model framework upon registration.
+ * This computes the power estimated by @CPU at @kHz if it is the frequency
+ * of an existing OPP, or at the frequency of the first OPP above @kHz otherwise
+ * (see dev_pm_opp_find_freq_ceil()). This function updates @kHz to the ceiled
+ * frequency and @mW to the associated power. The power is estimated as
+ * P = C * V^2 * f with C being the CPU's capacitance and V and f respectively
+ * the voltage and frequency of the OPP.
+ *
+ * Returns -ENODEV if the CPU device cannot be found, -EINVAL if the power
+ * calculation failed because of missing parameters, 0 otherwise.
+ */
+static int __maybe_unused _get_cpu_power(unsigned long *mW, unsigned long *kHz,
+					 int cpu)
+{
+	struct device *cpu_dev;
+	struct dev_pm_opp *opp;
+	struct device_node *np;
+	unsigned long mV, Hz;
+	u32 cap;
+	u64 tmp;
+	int ret;
+
+	cpu_dev = get_cpu_device(cpu);
+	if (!cpu_dev)
+		return -ENODEV;
+
+	np = of_node_get(cpu_dev->of_node);
+	if (!np)
+		return -EINVAL;
+
+	ret = of_property_read_u32(np, "dynamic-power-coefficient", &cap);
+	of_node_put(np);
+	if (ret)
+		return -EINVAL;
+
+	Hz = *kHz * 1000;
+	opp = dev_pm_opp_find_freq_ceil(cpu_dev, &Hz);
+	if (IS_ERR(opp))
+		return -EINVAL;
+
+	mV = dev_pm_opp_get_voltage(opp) / 1000;
+	dev_pm_opp_put(opp);
+	if (!mV)
+		return -EINVAL;
+
+	tmp = (u64)cap * mV * mV * (Hz / 1000000);
+	do_div(tmp, 1000000000);
+
+	*mW = (unsigned long)tmp;
+	*kHz = Hz / 1000;
+
+	return 0;
+}
+
+/**
+ * dev_pm_opp_of_register_em() - Attempt to register an Energy Model
+ * @cpus	: CPUs for which an Energy Model has to be registered
+ *
+ * This checks whether the "dynamic-power-coefficient" devicetree property has
+ * been specified, and tries to register an Energy Model with it if it has.
+ */
+void dev_pm_opp_of_register_em(struct cpumask *cpus)
+{
+	struct em_data_callback em_cb = EM_DATA_CB(_get_cpu_power);
+	int ret, nr_opp, cpu = cpumask_first(cpus);
+	struct device *cpu_dev;
+	struct device_node *np;
+	u32 cap;
+
+	cpu_dev = get_cpu_device(cpu);
+	if (!cpu_dev)
+		return;
+
+	nr_opp = dev_pm_opp_get_opp_count(cpu_dev);
+	if (nr_opp <= 0)
+		return;
+
+	np = of_node_get(cpu_dev->of_node);
+	if (!np)
+		return;
+
+	/*
+	 * Register an EM only if the 'dynamic-power-coefficient' property is
+	 * set in devicetree. It is assumed the voltage values are known if that
+	 * property is set since it is useless otherwise. If voltages are not
+	 * known, just let the EM registration fail with an error to alert the
+	 * user about the inconsistent configuration.
+	 */
+	ret = of_property_read_u32(np, "dynamic-power-coefficient", &cap);
+	of_node_put(np);
+	if (ret || !cap)
+		return;
+
+	em_register_perf_domain(cpus, nr_opp, &em_cb);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_register_em);
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 0a2a88e5a383..1470c57933cf 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -322,6 +322,7 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpuma
 struct device_node *dev_pm_opp_of_get_opp_desc_node(struct device *dev);
 struct device_node *dev_pm_opp_get_of_node(struct dev_pm_opp *opp);
 int of_get_required_opp_performance_state(struct device_node *np, int index);
+void dev_pm_opp_of_register_em(struct cpumask *cpus);
 #else
 static inline int dev_pm_opp_of_add_table(struct device *dev)
 {
@@ -360,6 +361,11 @@ static inline struct device_node *dev_pm_opp_get_of_node(struct dev_pm_opp *opp)
 {
 	return NULL;
 }
+
+static inline void dev_pm_opp_of_register_em(struct cpumask *cpus)
+{
+}
+
 static inline int of_get_required_opp_performance_state(struct device_node *np, int index)
 {
 	return -ENOTSUPP;

From 3c429851f9986103ce99c5747a795c6a6c572310 Mon Sep 17 00:00:00 2001
From: Quentin Perret <quentin.perret@arm.com>
Date: Mon, 4 Feb 2019 11:09:52 +0000
Subject: [PATCH 37/80] cpufreq: scmi: Register an Energy Model

The Energy Model (EM) framework provides an API to register the active
power of CPUs. Call this API from the scmi-cpufreq driver by using the
power costs obtained from firmware. This is done to ensure interested
subsystems (the task scheduler, for example) can make use of the EM
when available.

Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/scmi-cpufreq.c | 39 +++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c
index 50b1551ba894..44449ff8ad47 100644
--- a/drivers/cpufreq/scmi-cpufreq.c
+++ b/drivers/cpufreq/scmi-cpufreq.c
@@ -12,6 +12,7 @@
 #include <linux/cpufreq.h>
 #include <linux/cpumask.h>
 #include <linux/cpu_cooling.h>
+#include <linux/energy_model.h>
 #include <linux/export.h>
 #include <linux/module.h>
 #include <linux/pm_opp.h>
@@ -103,13 +104,42 @@ scmi_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask)
 	return 0;
 }
 
+static int __maybe_unused
+scmi_get_cpu_power(unsigned long *power, unsigned long *KHz, int cpu)
+{
+	struct device *cpu_dev = get_cpu_device(cpu);
+	unsigned long Hz;
+	int ret, domain;
+
+	if (!cpu_dev) {
+		pr_err("failed to get cpu%d device\n", cpu);
+		return -ENODEV;
+	}
+
+	domain = handle->perf_ops->device_domain_id(cpu_dev);
+	if (domain < 0)
+		return domain;
+
+	/* Get the power cost of the performance domain. */
+	Hz = *KHz * 1000;
+	ret = handle->perf_ops->est_power_get(handle, domain, &Hz, power);
+	if (ret)
+		return ret;
+
+	/* The EM framework specifies the frequency in KHz. */
+	*KHz = Hz / 1000;
+
+	return 0;
+}
+
 static int scmi_cpufreq_init(struct cpufreq_policy *policy)
 {
-	int ret;
+	int ret, nr_opp;
 	unsigned int latency;
 	struct device *cpu_dev;
 	struct scmi_data *priv;
 	struct cpufreq_frequency_table *freq_table;
+	struct em_data_callback em_cb = EM_DATA_CB(scmi_get_cpu_power);
 
 	cpu_dev = get_cpu_device(policy->cpu);
 	if (!cpu_dev) {
@@ -136,8 +166,8 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy)
 		return ret;
 	}
 
-	ret = dev_pm_opp_get_opp_count(cpu_dev);
-	if (ret <= 0) {
+	nr_opp = dev_pm_opp_get_opp_count(cpu_dev);
+	if (nr_opp <= 0) {
 		dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n");
 		ret = -EPROBE_DEFER;
 		goto out_free_opp;
@@ -171,6 +201,9 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy)
 	policy->cpuinfo.transition_latency = latency;
 
 	policy->fast_switch_possible = true;
+
+	em_register_perf_domain(policy->cpus, nr_opp, &em_cb);
+
 	return 0;
 
 out_free_priv:

From 1058d1efbc84e3b48d2130f46a149cea178b28a1 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Wed, 6 Feb 2019 10:50:04 -0800
Subject: [PATCH 38/80] cpufreq: mediatek: Register an Energy Model

Try and register an Energy Model from mediatek-cpufreq to allow
interested subsystems like the task scheduler to use the provided
information.

Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/mediatek-cpufreq.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/cpufreq/mediatek-cpufreq.c b/drivers/cpufreq/mediatek-cpufreq.c
index eb8920d39818..5250df762814 100644
--- a/drivers/cpufreq/mediatek-cpufreq.c
+++ b/drivers/cpufreq/mediatek-cpufreq.c
@@ -465,6 +465,8 @@ static int mtk_cpufreq_init(struct cpufreq_policy *policy)
 	policy->driver_data = info;
 	policy->clk = info->cpu_clk;
 
+	dev_pm_opp_of_register_em(policy->cpus);
+
 	return 0;
 }
 

From a2dea4cb907022447298d46b15a5c41f3f9903e6 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 22 Jan 2019 16:21:17 +0100
Subject: [PATCH 39/80] opp: no need to check return value of debugfs_create
 functions

When calling debugfs functions, there is no need to ever check the
return value.  The function can work or not, but the code logic should
never do something different based on this.

Cc: Viresh Kumar <vireshk@kernel.org>
Cc: Nishanth Menon <nm@ti.com>
Cc: Stephen Boyd <sboyd@kernel.org>
Cc: linux-pm@vger.kernel.org
Reviewed-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c    |  11 +----
 drivers/opp/debugfs.c | 110 +++++++++++-------------------------------
 drivers/opp/opp.h     |  15 +++---
 3 files changed, 38 insertions(+), 98 deletions(-)

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 57d88a275de0..fa15aa8e00b0 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -792,7 +792,6 @@ static struct opp_device *_add_opp_dev_unlocked(const struct device *dev,
 						struct opp_table *opp_table)
 {
 	struct opp_device *opp_dev;
-	int ret;
 
 	opp_dev = kzalloc(sizeof(*opp_dev), GFP_KERNEL);
 	if (!opp_dev)
@@ -804,10 +803,7 @@ static struct opp_device *_add_opp_dev_unlocked(const struct device *dev,
 	list_add(&opp_dev->node, &opp_table->dev_list);
 
 	/* Create debugfs entries for the opp_table */
-	ret = opp_debug_register(opp_dev, opp_table);
-	if (ret)
-		dev_err(dev, "%s: Failed to register opp debugfs (%d)\n",
-			__func__, ret);
+	opp_debug_register(opp_dev, opp_table);
 
 	return opp_dev;
 }
@@ -1175,10 +1171,7 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
 	new_opp->opp_table = opp_table;
 	kref_init(&new_opp->kref);
 
-	ret = opp_debug_create_one(new_opp, opp_table);
-	if (ret)
-		dev_err(dev, "%s: Failed to register opp to debugfs (%d)\n",
-			__func__, ret);
+	opp_debug_create_one(new_opp, opp_table);
 
 	if (!_opp_supported_by_regulators(new_opp, opp_table)) {
 		new_opp->available = false;
diff --git a/drivers/opp/debugfs.c b/drivers/opp/debugfs.c
index e6828e5f81b0..a1c57fe14de4 100644
--- a/drivers/opp/debugfs.c
+++ b/drivers/opp/debugfs.c
@@ -35,7 +35,7 @@ void opp_debug_remove_one(struct dev_pm_opp *opp)
 	debugfs_remove_recursive(opp->dentry);
 }
 
-static bool opp_debug_create_supplies(struct dev_pm_opp *opp,
+static void opp_debug_create_supplies(struct dev_pm_opp *opp,
 				      struct opp_table *opp_table,
 				      struct dentry *pdentry)
 {
@@ -50,30 +50,21 @@ static bool opp_debug_create_supplies(struct dev_pm_opp *opp,
 		/* Create per-opp directory */
 		d = debugfs_create_dir(name, pdentry);
 
-		if (!d)
-			return false;
+		debugfs_create_ulong("u_volt_target", S_IRUGO, d,
+				     &opp->supplies[i].u_volt);
 
-		if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d,
-					  &opp->supplies[i].u_volt))
-			return false;
+		debugfs_create_ulong("u_volt_min", S_IRUGO, d,
+				     &opp->supplies[i].u_volt_min);
 
-		if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d,
-					  &opp->supplies[i].u_volt_min))
-			return false;
+		debugfs_create_ulong("u_volt_max", S_IRUGO, d,
+				     &opp->supplies[i].u_volt_max);
 
-		if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d,
-					  &opp->supplies[i].u_volt_max))
-			return false;
-
-		if (!debugfs_create_ulong("u_amp", S_IRUGO, d,
-					  &opp->supplies[i].u_amp))
-			return false;
+		debugfs_create_ulong("u_amp", S_IRUGO, d,
+				     &opp->supplies[i].u_amp);
 	}
-
-	return true;
 }
 
-int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
+void opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
 {
 	struct dentry *pdentry = opp_table->dentry;
 	struct dentry *d;
@@ -95,40 +86,23 @@ int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
 
 	/* Create per-opp directory */
 	d = debugfs_create_dir(name, pdentry);
-	if (!d)
-		return -ENOMEM;
 
-	if (!debugfs_create_bool("available", S_IRUGO, d, &opp->available))
-		return -ENOMEM;
+	debugfs_create_bool("available", S_IRUGO, d, &opp->available);
+	debugfs_create_bool("dynamic", S_IRUGO, d, &opp->dynamic);
+	debugfs_create_bool("turbo", S_IRUGO, d, &opp->turbo);
+	debugfs_create_bool("suspend", S_IRUGO, d, &opp->suspend);
+	debugfs_create_u32("performance_state", S_IRUGO, d, &opp->pstate);
+	debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate);
+	debugfs_create_ulong("clock_latency_ns", S_IRUGO, d,
+			     &opp->clock_latency_ns);
 
-	if (!debugfs_create_bool("dynamic", S_IRUGO, d, &opp->dynamic))
-		return -ENOMEM;
-
-	if (!debugfs_create_bool("turbo", S_IRUGO, d, &opp->turbo))
-		return -ENOMEM;
-
-	if (!debugfs_create_bool("suspend", S_IRUGO, d, &opp->suspend))
-		return -ENOMEM;
-
-	if (!debugfs_create_u32("performance_state", S_IRUGO, d, &opp->pstate))
-		return -ENOMEM;
-
-	if (!debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate))
-		return -ENOMEM;
-
-	if (!opp_debug_create_supplies(opp, opp_table, d))
-		return -ENOMEM;
-
-	if (!debugfs_create_ulong("clock_latency_ns", S_IRUGO, d,
-				  &opp->clock_latency_ns))
-		return -ENOMEM;
+	opp_debug_create_supplies(opp, opp_table, d);
 
 	opp->dentry = d;
-	return 0;
 }
 
-static int opp_list_debug_create_dir(struct opp_device *opp_dev,
-				     struct opp_table *opp_table)
+static void opp_list_debug_create_dir(struct opp_device *opp_dev,
+				      struct opp_table *opp_table)
 {
 	const struct device *dev = opp_dev->dev;
 	struct dentry *d;
@@ -137,36 +111,21 @@ static int opp_list_debug_create_dir(struct opp_device *opp_dev,
 
 	/* Create device specific directory */
 	d = debugfs_create_dir(opp_table->dentry_name, rootdir);
-	if (!d) {
-		dev_err(dev, "%s: Failed to create debugfs dir\n", __func__);
-		return -ENOMEM;
-	}
 
 	opp_dev->dentry = d;
 	opp_table->dentry = d;
-
-	return 0;
 }
 
-static int opp_list_debug_create_link(struct opp_device *opp_dev,
-				      struct opp_table *opp_table)
+static void opp_list_debug_create_link(struct opp_device *opp_dev,
+				       struct opp_table *opp_table)
 {
-	const struct device *dev = opp_dev->dev;
 	char name[NAME_MAX];
-	struct dentry *d;
 
 	opp_set_dev_name(opp_dev->dev, name);
 
 	/* Create device specific directory link */
-	d = debugfs_create_symlink(name, rootdir, opp_table->dentry_name);
-	if (!d) {
-		dev_err(dev, "%s: Failed to create link\n", __func__);
-		return -ENOMEM;
-	}
-
-	opp_dev->dentry = d;
-
-	return 0;
+	opp_dev->dentry = debugfs_create_symlink(name, rootdir,
+						 opp_table->dentry_name);
 }
 
 /**
@@ -177,20 +136,13 @@ static int opp_list_debug_create_link(struct opp_device *opp_dev,
  * Dynamically adds device specific directory in debugfs 'opp' directory. If the
  * device-opp is shared with other devices, then links will be created for all
  * devices except the first.
- *
- * Return: 0 on success, otherwise negative error.
  */
-int opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table)
+void opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table)
 {
-	if (!rootdir) {
-		pr_debug("%s: Uninitialized rootdir\n", __func__);
-		return -EINVAL;
-	}
-
 	if (opp_table->dentry)
-		return opp_list_debug_create_link(opp_dev, opp_table);
-
-	return opp_list_debug_create_dir(opp_dev, opp_table);
+		opp_list_debug_create_link(opp_dev, opp_table);
+	else
+		opp_list_debug_create_dir(opp_dev, opp_table);
 }
 
 static void opp_migrate_dentry(struct opp_device *opp_dev,
@@ -252,10 +204,6 @@ static int __init opp_debug_init(void)
 {
 	/* Create /sys/kernel/debug/opp directory */
 	rootdir = debugfs_create_dir("opp", NULL);
-	if (!rootdir) {
-		pr_err("%s: Failed to create root directory\n", __func__);
-		return -ENOMEM;
-	}
 
 	return 0;
 }
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index e24d81497375..810a85b9a66d 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -236,18 +236,17 @@ static inline void _of_opp_free_required_opps(struct opp_table *opp_table,
 
 #ifdef CONFIG_DEBUG_FS
 void opp_debug_remove_one(struct dev_pm_opp *opp);
-int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table);
-int opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table);
+void opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table);
+void opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table);
 void opp_debug_unregister(struct opp_device *opp_dev, struct opp_table *opp_table);
 #else
 static inline void opp_debug_remove_one(struct dev_pm_opp *opp) {}
 
-static inline int opp_debug_create_one(struct dev_pm_opp *opp,
-				       struct opp_table *opp_table)
-{ return 0; }
-static inline int opp_debug_register(struct opp_device *opp_dev,
-				     struct opp_table *opp_table)
-{ return 0; }
+static inline void opp_debug_create_one(struct dev_pm_opp *opp,
+					struct opp_table *opp_table) { }
+
+static inline void opp_debug_register(struct opp_device *opp_dev,
+				      struct opp_table *opp_table) { }
 
 static inline void opp_debug_unregister(struct opp_device *opp_dev,
 					struct opp_table *opp_table)

From 70e6e7d92bca041b66dac4512821a3aa72dda0cd Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Wed, 16 Jan 2019 07:37:02 +0100
Subject: [PATCH 40/80] MAINTAINERS: use common indentation

Commit 46e2856b8e18 ("cpufreq: Add Kryo CPU scaling driver") slips in
some formatting with spaces instead of tabs, which are used in the common
format for the MAINTAINERS file.

Also update to Ilia's new email address, as Ilia requested.

Fixes: 46e2856b8e18 ("cpufreq: Add Kryo CPU scaling driver")
Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 MAINTAINERS | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 32d444476a90..6957548f73a4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12593,11 +12593,11 @@ F:	Documentation/media/v4l-drivers/qcom_camss.rst
 F:	drivers/media/platform/qcom/camss/
 
 QUALCOMM CPUFREQ DRIVER MSM8996/APQ8096
-M:  Ilia Lin <ilia.lin@gmail.com>
-L:  linux-pm@vger.kernel.org
-S:  Maintained
-F:  Documentation/devicetree/bindings/opp/kryo-cpufreq.txt
-F:  drivers/cpufreq/qcom-cpufreq-kryo.c
+M:	Ilia Lin <ilia.lin@kernel.org>
+L:	linux-pm@vger.kernel.org
+S:	Maintained
+F:	Documentation/devicetree/bindings/opp/kryo-cpufreq.txt
+F:	drivers/cpufreq/qcom-cpufreq-kryo.c
 
 QUALCOMM EMAC GIGABIT ETHERNET DRIVER
 M:	Timur Tabi <timur@kernel.org>

From 0dc10eac6583602335c275c7dc5975946da30901 Mon Sep 17 00:00:00 2001
From: Baruch Siach <baruch@tkos.co.il>
Date: Thu, 24 Jan 2019 11:53:11 +0200
Subject: [PATCH 41/80] MAINTAINERS: Update the active pm tree for ARM

The Linaro hosted git tree is no longer active. Update the cpufreq
entry.

Signed-off-by: Baruch Siach <baruch@tkos.co.il>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index fabbd9a59ab5..f0400a466870 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3958,7 +3958,7 @@ M:	Viresh Kumar <viresh.kumar@linaro.org>
 L:	linux-pm@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git
-T:	git git://git.linaro.org/people/vireshk/linux.git (For ARM Updates)
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git (For ARM Updates)
 B:	https://bugzilla.kernel.org
 F:	Documentation/cpu-freq/
 F:	Documentation/devicetree/bindings/cpufreq/

From e0e5b2b4f427cf6f403c6edfb52ce2b0e3acae23 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Chmiel?= <pawel.mikolaj.chmiel@gmail.com>
Date: Sun, 13 Jan 2019 20:57:54 +0100
Subject: [PATCH 42/80] cpufreq: s5pv210: Defer probe if getting regulators
 fail
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is possibility, that when probing driver, regulators are not yet
initialized. In this case we should return EPROBE_DEFER and wait till
they're initialized, since they're required currently for cpufreq driver
to work. Also move regulator initialization code at beginning of probe,
so we can defer as fast as posibble.

Signed-off-by: Paweł Chmiel <pawel.mikolaj.chmiel@gmail.com>
Reviewed-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/s5pv210-cpufreq.c | 67 ++++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 19 deletions(-)

diff --git a/drivers/cpufreq/s5pv210-cpufreq.c b/drivers/cpufreq/s5pv210-cpufreq.c
index dbecd7667db2..5b4289460bc9 100644
--- a/drivers/cpufreq/s5pv210-cpufreq.c
+++ b/drivers/cpufreq/s5pv210-cpufreq.c
@@ -584,7 +584,7 @@ static struct notifier_block s5pv210_cpufreq_reboot_notifier = {
 static int s5pv210_cpufreq_probe(struct platform_device *pdev)
 {
 	struct device_node *np;
-	int id;
+	int id, result = 0;
 
 	/*
 	 * HACK: This is a temporary workaround to get access to clock
@@ -594,18 +594,39 @@ static int s5pv210_cpufreq_probe(struct platform_device *pdev)
 	 * this whole driver as soon as S5PV210 gets migrated to use
 	 * cpufreq-dt driver.
 	 */
+	arm_regulator = regulator_get(NULL, "vddarm");
+	if (IS_ERR(arm_regulator)) {
+		if (PTR_ERR(arm_regulator) == -EPROBE_DEFER)
+			pr_debug("vddarm regulator not ready, defer\n");
+		else
+			pr_err("failed to get regulator vddarm\n");
+		return PTR_ERR(arm_regulator);
+	}
+
+	int_regulator = regulator_get(NULL, "vddint");
+	if (IS_ERR(int_regulator)) {
+		if (PTR_ERR(int_regulator) == -EPROBE_DEFER)
+			pr_debug("vddint regulator not ready, defer\n");
+		else
+			pr_err("failed to get regulator vddint\n");
+		result = PTR_ERR(int_regulator);
+		goto err_int_regulator;
+	}
+
 	np = of_find_compatible_node(NULL, NULL, "samsung,s5pv210-clock");
 	if (!np) {
 		pr_err("%s: failed to find clock controller DT node\n",
 			__func__);
-		return -ENODEV;
+		result = -ENODEV;
+		goto err_clock;
 	}
 
 	clk_base = of_iomap(np, 0);
 	of_node_put(np);
 	if (!clk_base) {
 		pr_err("%s: failed to map clock registers\n", __func__);
-		return -EFAULT;
+		result = -EFAULT;
+		goto err_clock;
 	}
 
 	for_each_compatible_node(np, NULL, "samsung,s5pv210-dmc") {
@@ -614,7 +635,8 @@ static int s5pv210_cpufreq_probe(struct platform_device *pdev)
 			pr_err("%s: failed to get alias of dmc node '%pOFn'\n",
 				__func__, np);
 			of_node_put(np);
-			return id;
+			result = id;
+			goto err_clk_base;
 		}
 
 		dmc_base[id] = of_iomap(np, 0);
@@ -622,33 +644,40 @@ static int s5pv210_cpufreq_probe(struct platform_device *pdev)
 			pr_err("%s: failed to map dmc%d registers\n",
 				__func__, id);
 			of_node_put(np);
-			return -EFAULT;
+			result = -EFAULT;
+			goto err_dmc;
 		}
 	}
 
 	for (id = 0; id < ARRAY_SIZE(dmc_base); ++id) {
 		if (!dmc_base[id]) {
 			pr_err("%s: failed to find dmc%d node\n", __func__, id);
-			return -ENODEV;
+			result = -ENODEV;
+			goto err_dmc;
 		}
 	}
 
-	arm_regulator = regulator_get(NULL, "vddarm");
-	if (IS_ERR(arm_regulator)) {
-		pr_err("failed to get regulator vddarm\n");
-		return PTR_ERR(arm_regulator);
-	}
-
-	int_regulator = regulator_get(NULL, "vddint");
-	if (IS_ERR(int_regulator)) {
-		pr_err("failed to get regulator vddint\n");
-		regulator_put(arm_regulator);
-		return PTR_ERR(int_regulator);
-	}
-
 	register_reboot_notifier(&s5pv210_cpufreq_reboot_notifier);
 
 	return cpufreq_register_driver(&s5pv210_driver);
+
+err_dmc:
+	for (id = 0; id < ARRAY_SIZE(dmc_base); ++id)
+		if (dmc_base[id]) {
+			iounmap(dmc_base[id]);
+			dmc_base[id] = NULL;
+		}
+
+err_clk_base:
+	iounmap(clk_base);
+
+err_clock:
+	regulator_put(int_regulator);
+
+err_int_regulator:
+	regulator_put(arm_regulator);
+
+	return result;
 }
 
 static struct platform_driver s5pv210_cpufreq_platdrv = {

From 50c0b12f098fb3cfac7abcc0f2b5409f6bac5fa2 Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Mon, 4 Feb 2019 01:13:10 -0500
Subject: [PATCH 43/80] cpufreq: qcom-kryo: make some variables static

The variables are local to the source and do not
need to be in global scope, so make them static.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/qcom-cpufreq-kryo.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/qcom-cpufreq-kryo.c b/drivers/cpufreq/qcom-cpufreq-kryo.c
index 2a3675c24032..1c8583cc06a2 100644
--- a/drivers/cpufreq/qcom-cpufreq-kryo.c
+++ b/drivers/cpufreq/qcom-cpufreq-kryo.c
@@ -42,7 +42,7 @@ enum _msm8996_version {
 	NUM_OF_MSM8996_VERSIONS,
 };
 
-struct platform_device *cpufreq_dt_pdev, *kryo_cpufreq_pdev;
+static struct platform_device *cpufreq_dt_pdev, *kryo_cpufreq_pdev;
 
 static enum _msm8996_version qcom_cpufreq_kryo_get_msm_id(void)
 {

From 8e3151d16c7d8d3576fee23a6c898dd54ead4e0a Mon Sep 17 00:00:00 2001
From: Gregory CLEMENT <gregory.clement@bootlin.com>
Date: Fri, 18 Jan 2019 15:11:39 +0100
Subject: [PATCH 44/80] MAINTAINERS: add new entries for Armada 8K cpufreq
 driver

This new driver belongs to the mvebu family, update the MAINTAINER file
to document it.

Signed-off-by: Gregory CLEMENT <gregory.clement@bootlin.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 6957548f73a4..fabbd9a59ab5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1737,6 +1737,7 @@ F:	arch/arm/configs/mvebu_*_defconfig
 F:	arch/arm/mach-mvebu/
 F:	arch/arm64/boot/dts/marvell/armada*
 F:	drivers/cpufreq/armada-37xx-cpufreq.c
+F:	drivers/cpufreq/armada-8k-cpufreq.c
 F:	drivers/cpufreq/mvebu-cpufreq.c
 F:	drivers/irqchip/irq-armada-370-xp.c
 F:	drivers/irqchip/irq-mvebu-*

From 446fae2bb5395f3028d8e3aae1508737e5a72ea1 Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Mon, 4 Feb 2019 02:48:54 -0500
Subject: [PATCH 45/80] cpufreq: tegra124: add missing of_node_put()

of_cpu_device_node_get() will increase the refcount of device_node,
it is necessary to call of_node_put() at the end to release the
refcount.

Fixes: 9eb15dbbfa1a2 ("cpufreq: Add cpufreq driver for Tegra124")
Cc: <stable@vger.kernel.org> # 4.4+
Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Acked-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/tegra124-cpufreq.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/cpufreq/tegra124-cpufreq.c b/drivers/cpufreq/tegra124-cpufreq.c
index 43530254201a..4bb154f6c54c 100644
--- a/drivers/cpufreq/tegra124-cpufreq.c
+++ b/drivers/cpufreq/tegra124-cpufreq.c
@@ -134,6 +134,8 @@ static int tegra124_cpufreq_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, priv);
 
+	of_node_put(np);
+
 	return 0;
 
 out_switch_to_pllx:

From f525a670533d961fd72ab748e3aac002d7b3d1b9 Mon Sep 17 00:00:00 2001
From: Gregory CLEMENT <gregory.clement@bootlin.com>
Date: Fri, 18 Jan 2019 15:51:01 +0100
Subject: [PATCH 46/80] cpufreq: ap806: add cpufreq driver for Armada 8K

Add cpufreq driver for Marvell AP-806 found on Aramda 8K.
The AP-806 has DFS (Dynamic Frequency Scaling) with coupled
clock domain for two clusters, so this driver will directly
use generic cpufreq-dt driver as backend.

Based on the work of Omri Itach <omrii@marvell.com>.

Signed-off-by: Gregory CLEMENT <gregory.clement@bootlin.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/Kconfig.arm         |  11 ++
 drivers/cpufreq/Makefile            |   1 +
 drivers/cpufreq/armada-8k-cpufreq.c | 204 ++++++++++++++++++++++++++++
 3 files changed, 216 insertions(+)
 create mode 100644 drivers/cpufreq/armada-8k-cpufreq.c

diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index 688f10227793..10bc5c798d17 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -25,6 +25,17 @@ config ARM_ARMADA_37XX_CPUFREQ
 	  This adds the CPUFreq driver support for Marvell Armada 37xx SoCs.
 	  The Armada 37xx PMU supports 4 frequency and VDD levels.
 
+config ARM_ARMADA_8K_CPUFREQ
+	tristate "Armada 8K CPUFreq driver"
+	depends on ARCH_MVEBU && CPUFREQ_DT
+	help
+	  This enables the CPUFreq driver support for Marvell
+	  Armada8k SOCs.
+	  Armada8K device has the AP806 which supports scaling
+	  to any full integer divider.
+
+	  If in doubt, say N.
+
 # big LITTLE core layer and glue drivers
 config ARM_BIG_LITTLE_CPUFREQ
 	tristate "Generic ARM big LITTLE CPUfreq driver"
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index 08c071be2491..689b26c6f949 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -50,6 +50,7 @@ obj-$(CONFIG_X86_SFI_CPUFREQ)		+= sfi-cpufreq.o
 obj-$(CONFIG_ARM_BIG_LITTLE_CPUFREQ)	+= arm_big_little.o
 
 obj-$(CONFIG_ARM_ARMADA_37XX_CPUFREQ)	+= armada-37xx-cpufreq.o
+obj-$(CONFIG_ARM_ARMADA_8K_CPUFREQ)	+= armada-8k-cpufreq.o
 obj-$(CONFIG_ARM_BRCMSTB_AVS_CPUFREQ)	+= brcmstb-avs-cpufreq.o
 obj-$(CONFIG_ACPI_CPPC_CPUFREQ)		+= cppc_cpufreq.o
 obj-$(CONFIG_ARCH_DAVINCI)		+= davinci-cpufreq.o
diff --git a/drivers/cpufreq/armada-8k-cpufreq.c b/drivers/cpufreq/armada-8k-cpufreq.c
new file mode 100644
index 000000000000..8a5ddb93fc58
--- /dev/null
+++ b/drivers/cpufreq/armada-8k-cpufreq.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * CPUFreq support for Armada 8K
+ *
+ * Copyright (C) 2018 Marvell
+ *
+ * Omri Itach <omrii@marvell.com>
+ * Gregory Clement <gregory.clement@bootlin.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/clk.h>
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/pm_opp.h>
+#include <linux/slab.h>
+
+/*
+ * Setup the opps list with the divider for the max frequency, that
+ * will be filled at runtime.
+ */
+static const int opps_div[] __initconst = {1, 2, 3, 4};
+
+static struct platform_device *armada_8k_pdev;
+
+struct freq_table {
+	struct device *cpu_dev;
+	unsigned int freq[ARRAY_SIZE(opps_div)];
+};
+
+/* If the CPUs share the same clock, then they are in the same cluster. */
+static void __init armada_8k_get_sharing_cpus(struct clk *cur_clk,
+					      struct cpumask *cpumask)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct device *cpu_dev;
+		struct clk *clk;
+
+		cpu_dev = get_cpu_device(cpu);
+		if (!cpu_dev) {
+			pr_warn("Failed to get cpu%d device\n", cpu);
+			continue;
+		}
+
+		clk = clk_get(cpu_dev, 0);
+		if (IS_ERR(clk)) {
+			pr_warn("Cannot get clock for CPU %d\n", cpu);
+		} else {
+			if (clk_is_match(clk, cur_clk))
+				cpumask_set_cpu(cpu, cpumask);
+
+			clk_put(clk);
+		}
+	}
+}
+
+static int __init armada_8k_add_opp(struct clk *clk, struct device *cpu_dev,
+				    struct freq_table *freq_tables,
+				    int opps_index)
+{
+	unsigned int cur_frequency;
+	unsigned int freq;
+	int i, ret;
+
+	/* Get nominal (current) CPU frequency. */
+	cur_frequency = clk_get_rate(clk);
+	if (!cur_frequency) {
+		dev_err(cpu_dev, "Failed to get clock rate for this CPU\n");
+		return -EINVAL;
+	}
+
+	freq_tables[opps_index].cpu_dev = cpu_dev;
+
+	for (i = 0; i < ARRAY_SIZE(opps_div); i++) {
+		freq = cur_frequency / opps_div[i];
+
+		ret = dev_pm_opp_add(cpu_dev, freq, 0);
+		if (ret)
+			return ret;
+
+		freq_tables[opps_index].freq[i] = freq;
+	}
+
+	return 0;
+}
+
+static void armada_8k_cpufreq_free_table(struct freq_table *freq_tables)
+{
+	int opps_index, nb_cpus = num_possible_cpus();
+
+	for (opps_index = 0 ; opps_index <= nb_cpus; opps_index++) {
+		int i;
+
+		/* If cpu_dev is NULL then we reached the end of the array */
+		if (!freq_tables[opps_index].cpu_dev)
+			break;
+
+		for (i = 0; i < ARRAY_SIZE(opps_div); i++) {
+			/*
+			 * A 0Hz frequency is not valid, this meant
+			 * that it was not yet initialized so there is
+			 * no more opp to free
+			 */
+			if (freq_tables[opps_index].freq[i] == 0)
+				break;
+
+			dev_pm_opp_remove(freq_tables[opps_index].cpu_dev,
+					  freq_tables[opps_index].freq[i]);
+		}
+	}
+
+	kfree(freq_tables);
+}
+
+static int __init armada_8k_cpufreq_init(void)
+{
+	int ret = 0, opps_index = 0, cpu, nb_cpus;
+	struct freq_table *freq_tables;
+	struct device_node *node;
+	struct cpumask cpus;
+
+	node = of_find_compatible_node(NULL, NULL, "marvell,ap806-cpu-clock");
+	if (!node || !of_device_is_available(node))
+		return -ENODEV;
+
+	nb_cpus = num_possible_cpus();
+	freq_tables = kcalloc(nb_cpus, sizeof(*freq_tables), GFP_KERNEL);
+	cpumask_copy(&cpus, cpu_possible_mask);
+
+	/*
+	 * For each CPU, this loop registers the operating points
+	 * supported (which are the nominal CPU frequency and full integer
+	 * divisions of it).
+	 */
+	for_each_cpu(cpu, &cpus) {
+		struct cpumask shared_cpus;
+		struct device *cpu_dev;
+		struct clk *clk;
+
+		cpu_dev = get_cpu_device(cpu);
+
+		if (!cpu_dev) {
+			pr_err("Cannot get CPU %d\n", cpu);
+			continue;
+		}
+
+		clk = clk_get(cpu_dev, 0);
+
+		if (IS_ERR(clk)) {
+			pr_err("Cannot get clock for CPU %d\n", cpu);
+			ret = PTR_ERR(clk);
+			goto remove_opp;
+		}
+
+		ret = armada_8k_add_opp(clk, cpu_dev, freq_tables, opps_index);
+		if (ret) {
+			clk_put(clk);
+			goto remove_opp;
+		}
+
+		opps_index++;
+		cpumask_clear(&shared_cpus);
+		armada_8k_get_sharing_cpus(clk, &shared_cpus);
+		dev_pm_opp_set_sharing_cpus(cpu_dev, &shared_cpus);
+		cpumask_andnot(&cpus, &cpus, &shared_cpus);
+		clk_put(clk);
+	}
+
+	armada_8k_pdev = platform_device_register_simple("cpufreq-dt", -1,
+							 NULL, 0);
+	ret = PTR_ERR_OR_ZERO(armada_8k_pdev);
+	if (ret)
+		goto remove_opp;
+
+	platform_set_drvdata(armada_8k_pdev, freq_tables);
+
+	return 0;
+
+remove_opp:
+	armada_8k_cpufreq_free_table(freq_tables);
+	return ret;
+}
+module_init(armada_8k_cpufreq_init);
+
+static void __exit armada_8k_cpufreq_exit(void)
+{
+	struct freq_table *freq_tables = platform_get_drvdata(armada_8k_pdev);
+
+	platform_device_unregister(armada_8k_pdev);
+	armada_8k_cpufreq_free_table(freq_tables);
+}
+module_exit(armada_8k_cpufreq_exit);
+
+MODULE_AUTHOR("Gregory Clement <gregory.clement@bootlin.com>");
+MODULE_DESCRIPTION("Armada 8K cpufreq driver");
+MODULE_LICENSE("GPL");

From 3ad63a6b1b3e770f9af63d938efe0212f3d95d6d Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Wed, 6 Feb 2019 23:34:20 -0500
Subject: [PATCH 47/80] cpufreq: imx6q: Register an Energy Model

Try and register an Energy Model from imx6q-cpufreq to allow
interested subsystems like the task scheduler to use the provided
information.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/imx6q-cpufreq.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c
index 9fedf627e000..73bfd5bc3087 100644
--- a/drivers/cpufreq/imx6q-cpufreq.c
+++ b/drivers/cpufreq/imx6q-cpufreq.c
@@ -210,6 +210,7 @@ static int imx6q_cpufreq_init(struct cpufreq_policy *policy)
 	policy->clk = clks[ARM].clk;
 	ret = cpufreq_generic_init(policy, freq_table, transition_latency);
 	policy->suspend_freq = max_freq;
+	dev_pm_opp_of_register_em(policy->cpus);
 
 	return ret;
 }

From a9a744dd5b82843a9c99d9f97794fb51bc2ed8dd Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Wed, 6 Feb 2019 23:35:50 -0500
Subject: [PATCH 48/80] cpufreq: OMAP: Register an Energy Model

Try and register an Energy Model from omap-cpufreq.c to allow
interested subsystems like the task scheduler to use the provided
information.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/omap-cpufreq.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c
index 71e81bbf031b..68052b74d28f 100644
--- a/drivers/cpufreq/omap-cpufreq.c
+++ b/drivers/cpufreq/omap-cpufreq.c
@@ -133,8 +133,10 @@ static int omap_cpu_init(struct cpufreq_policy *policy)
 
 	/* FIXME: what's the actual transition time? */
 	result = cpufreq_generic_init(policy, freq_table, 300 * 1000);
-	if (!result)
+	if (!result) {
+		dev_pm_opp_of_register_em(policy->cpus);
 		return 0;
+	}
 
 	freq_table_free();
 fail:

From f896d06665ec8ae09f6412a47b97863d9ef8ff64 Mon Sep 17 00:00:00 2001
From: Amit Kucheria <amit.kucheria@linaro.org>
Date: Thu, 10 Jan 2019 05:30:53 +0530
Subject: [PATCH 49/80] cpufreq: qcom-hw: Move to device_initcall

subsys_initcall causes problems registering the driver as a thermal
cooling device.

If "faster boot" is the main reason for doing subsys_initcall, this
should be handled in the bootloader or another boot constraint
framework.

Signed-off-by: Amit Kucheria <amit.kucheria@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/qcom-cpufreq-hw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
index d83939a1b3d4..649dddd72749 100644
--- a/drivers/cpufreq/qcom-cpufreq-hw.c
+++ b/drivers/cpufreq/qcom-cpufreq-hw.c
@@ -296,7 +296,7 @@ static int __init qcom_cpufreq_hw_init(void)
 {
 	return platform_driver_register(&qcom_cpufreq_hw_driver);
 }
-subsys_initcall(qcom_cpufreq_hw_init);
+device_initcall(qcom_cpufreq_hw_init);
 
 static void __exit qcom_cpufreq_hw_exit(void)
 {

From 55538fbc79e926f159a5abcc571ba1c02529a3d1 Mon Sep 17 00:00:00 2001
From: Taniya Das <tdas@codeaurora.org>
Date: Thu, 31 Jan 2019 23:02:50 +0530
Subject: [PATCH 50/80] cpufreq: qcom: Read voltage LUT and populate OPP

Add support to read the voltage look up table and populate OPP for all
corresponding CPUS for consumers like the energy model could use the
frequency and voltage from the OPP tables. Also update the logic to not add
duplicate OPPs.

Tested-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Matthias Kaehlcke <mka@chromium.org>
Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Signed-off-by: Taniya Das <tdas@codeaurora.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/qcom-cpufreq-hw.c | 46 +++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 9 deletions(-)

diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
index 649dddd72749..72223536ca4e 100644
--- a/drivers/cpufreq/qcom-cpufreq-hw.c
+++ b/drivers/cpufreq/qcom-cpufreq-hw.c
@@ -10,18 +10,21 @@
 #include <linux/module.h>
 #include <linux/of_address.h>
 #include <linux/of_platform.h>
+#include <linux/pm_opp.h>
 #include <linux/slab.h>
 
 #define LUT_MAX_ENTRIES			40U
 #define LUT_SRC				GENMASK(31, 30)
 #define LUT_L_VAL			GENMASK(7, 0)
 #define LUT_CORE_COUNT			GENMASK(18, 16)
+#define LUT_VOLT			GENMASK(11, 0)
 #define LUT_ROW_SIZE			32
 #define CLK_HW_DIV			2
 
 /* Register offsets */
 #define REG_ENABLE			0x0
-#define REG_LUT_TABLE			0x110
+#define REG_FREQ_LUT			0x110
+#define REG_VOLT_LUT			0x114
 #define REG_PERF_STATE			0x920
 
 static unsigned long cpu_hw_rate, xo_rate;
@@ -70,11 +73,12 @@ static unsigned int qcom_cpufreq_hw_fast_switch(struct cpufreq_policy *policy,
 	return policy->freq_table[index].frequency;
 }
 
-static int qcom_cpufreq_hw_read_lut(struct device *dev,
+static int qcom_cpufreq_hw_read_lut(struct device *cpu_dev,
 				    struct cpufreq_policy *policy,
 				    void __iomem *base)
 {
 	u32 data, src, lval, i, core_count, prev_cc = 0, prev_freq = 0, freq;
+	u32 volt;
 	unsigned int max_cores = cpumask_weight(policy->cpus);
 	struct cpufreq_frequency_table	*table;
 
@@ -83,23 +87,28 @@ static int qcom_cpufreq_hw_read_lut(struct device *dev,
 		return -ENOMEM;
 
 	for (i = 0; i < LUT_MAX_ENTRIES; i++) {
-		data = readl_relaxed(base + REG_LUT_TABLE + i * LUT_ROW_SIZE);
+		data = readl_relaxed(base + REG_FREQ_LUT +
+				      i * LUT_ROW_SIZE);
 		src = FIELD_GET(LUT_SRC, data);
 		lval = FIELD_GET(LUT_L_VAL, data);
 		core_count = FIELD_GET(LUT_CORE_COUNT, data);
 
+		data = readl_relaxed(base + REG_VOLT_LUT +
+				      i * LUT_ROW_SIZE);
+		volt = FIELD_GET(LUT_VOLT, data) * 1000;
+
 		if (src)
 			freq = xo_rate * lval / 1000;
 		else
 			freq = cpu_hw_rate / 1000;
 
-		/* Ignore boosts in the middle of the table */
-		if (core_count != max_cores) {
-			table[i].frequency = CPUFREQ_ENTRY_INVALID;
-		} else {
+		if (freq != prev_freq && core_count == max_cores) {
 			table[i].frequency = freq;
-			dev_dbg(dev, "index=%d freq=%d, core_count %d\n", i,
+			dev_pm_opp_add(cpu_dev, freq * 1000, volt);
+			dev_dbg(cpu_dev, "index=%d freq=%d, core_count %d\n", i,
 				freq, core_count);
+		} else {
+			table[i].frequency = CPUFREQ_ENTRY_INVALID;
 		}
 
 		/*
@@ -116,6 +125,7 @@ static int qcom_cpufreq_hw_read_lut(struct device *dev,
 			if (prev_cc != max_cores) {
 				prev->frequency = prev_freq;
 				prev->flags = CPUFREQ_BOOST_FREQ;
+				dev_pm_opp_add(cpu_dev,	prev_freq * 1000, volt);
 			}
 
 			break;
@@ -127,6 +137,7 @@ static int qcom_cpufreq_hw_read_lut(struct device *dev,
 
 	table[i].frequency = CPUFREQ_TABLE_END;
 	policy->freq_table = table;
+	dev_pm_opp_set_sharing_cpus(cpu_dev, policy->cpus);
 
 	return 0;
 }
@@ -159,10 +170,18 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
 	struct device *dev = &global_pdev->dev;
 	struct of_phandle_args args;
 	struct device_node *cpu_np;
+	struct device *cpu_dev;
 	struct resource *res;
 	void __iomem *base;
 	int ret, index;
 
+	cpu_dev = get_cpu_device(policy->cpu);
+	if (!cpu_dev) {
+		pr_err("%s: failed to get cpu%d device\n", __func__,
+		       policy->cpu);
+		return -ENODEV;
+	}
+
 	cpu_np = of_cpu_device_node_get(policy->cpu);
 	if (!cpu_np)
 		return -EINVAL;
@@ -199,12 +218,19 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
 
 	policy->driver_data = base + REG_PERF_STATE;
 
-	ret = qcom_cpufreq_hw_read_lut(dev, policy, base);
+	ret = qcom_cpufreq_hw_read_lut(cpu_dev, policy, base);
 	if (ret) {
 		dev_err(dev, "Domain-%d failed to read LUT\n", index);
 		goto error;
 	}
 
+	ret = dev_pm_opp_get_opp_count(cpu_dev);
+	if (ret <= 0) {
+		dev_err(cpu_dev, "Failed to add OPPs\n");
+		ret = -ENODEV;
+		goto error;
+	}
+
 	policy->fast_switch_possible = true;
 
 	return 0;
@@ -215,8 +241,10 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
 
 static int qcom_cpufreq_hw_cpu_exit(struct cpufreq_policy *policy)
 {
+	struct device *cpu_dev = get_cpu_device(policy->cpu);
 	void __iomem *base = policy->driver_data - REG_PERF_STATE;
 
+	dev_pm_opp_remove_all_dynamic(cpu_dev);
 	kfree(policy->freq_table);
 	devm_iounmap(&global_pdev->dev, base);
 

From dab535052f67db0ff48b1b23e714b58650d1a787 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Tue, 5 Feb 2019 09:52:24 -0800
Subject: [PATCH 51/80] cpufreq: qcom-hw: Register an Energy Model

Try and register an Energy Model from qcom-cpufreq-hw to allow
interested sub-systems like the task scheduler to use the provided
information.

Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
[ Viresh: Rebased over cpufreq related changes ]
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/qcom-cpufreq-hw.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
index 72223536ca4e..c647cfb1f5e6 100644
--- a/drivers/cpufreq/qcom-cpufreq-hw.c
+++ b/drivers/cpufreq/qcom-cpufreq-hw.c
@@ -231,6 +231,8 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
 		goto error;
 	}
 
+	dev_pm_opp_of_register_em(policy->cpus);
+
 	policy->fast_switch_possible = true;
 
 	return 0;

From df7f8e00a50f4d35b660ea2a2bc734decc1343e6 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Tue, 12 Feb 2019 10:47:10 +0800
Subject: [PATCH 52/80] powercap: intel_rapl: add support for Jacobsville

Jacobsville RAPL interface is compatible with Core-based CPUs.

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index 6cdb2c14eee4..393463eb157f 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -1164,6 +1164,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = {
 	INTEL_CPU_FAM6(ATOM_GOLDMONT,		rapl_defaults_core),
 	INTEL_CPU_FAM6(ATOM_GOLDMONT_PLUS,	rapl_defaults_core),
 	INTEL_CPU_FAM6(ATOM_GOLDMONT_X,		rapl_defaults_core),
+	INTEL_CPU_FAM6(ATOM_TREMONT_X,		rapl_defaults_core),
 
 	INTEL_CPU_FAM6(XEON_PHI_KNL,		rapl_defaults_hsw_server),
 	INTEL_CPU_FAM6(XEON_PHI_KNM,		rapl_defaults_hsw_server),

From 91a12e91dc39137906d929a4ff6f9c32c59697fa Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 12 Feb 2019 16:36:04 +0530
Subject: [PATCH 53/80] cpufreq: Allow light-weight tear down and bring up of
 CPUs

The cpufreq core doesn't remove the cpufreq policy anymore on CPU
offline operation, rather that happens when the CPU device gets
unregistered from the kernel. This allows faster recovery when the CPU
comes back online. This is also very useful during system wide
suspend/resume where we offline all non-boot CPUs during suspend and
then bring them back on resume.

This commit takes the same idea a step ahead to allow drivers to do
light weight tear-down and bring-up during CPU offline and online
operations.

A new set of callbacks is introduced, online/offline(). online() gets
called when the first CPU of an inactive policy is brought up and
offline() gets called when all the CPUs of a policy are offlined.

The existing init/exit() callback get called on policy
creation/destruction. They also get called instead of online/offline()
callbacks if the online/offline() callbacks aren't provided.

This also moves around some code to get executed only for the new-policy
case going forward.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 58 +++++++++++++++++++++++++--------------
 include/linux/cpufreq.h   |  2 ++
 2 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 96a69c67a545..55e9795801a4 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1201,28 +1201,39 @@ static int cpufreq_online(unsigned int cpu)
 			return -ENOMEM;
 	}
 
-	cpumask_copy(policy->cpus, cpumask_of(cpu));
+	if (!new_policy && cpufreq_driver->online) {
+		ret = cpufreq_driver->online(policy);
+		if (ret) {
+			pr_debug("%s: %d: initialization failed\n", __func__,
+				 __LINE__);
+			goto out_exit_policy;
+		}
 
-	/* call driver. From then on the cpufreq must be able
-	 * to accept all calls to ->verify and ->setpolicy for this CPU
-	 */
-	ret = cpufreq_driver->init(policy);
-	if (ret) {
-		pr_debug("initialization failed\n");
-		goto out_free_policy;
-	}
+		/* Recover policy->cpus using related_cpus */
+		cpumask_copy(policy->cpus, policy->related_cpus);
+	} else {
+		cpumask_copy(policy->cpus, cpumask_of(cpu));
 
-	ret = cpufreq_table_validate_and_sort(policy);
-	if (ret)
-		goto out_exit_policy;
+		/*
+		 * Call driver. From then on the cpufreq must be able
+		 * to accept all calls to ->verify and ->setpolicy for this CPU.
+		 */
+		ret = cpufreq_driver->init(policy);
+		if (ret) {
+			pr_debug("%s: %d: initialization failed\n", __func__,
+				 __LINE__);
+			goto out_free_policy;
+		}
 
-	down_write(&policy->rwsem);
+		ret = cpufreq_table_validate_and_sort(policy);
+		if (ret)
+			goto out_exit_policy;
 
-	if (new_policy) {
 		/* related_cpus should at least include policy->cpus. */
 		cpumask_copy(policy->related_cpus, policy->cpus);
 	}
 
+	down_write(&policy->rwsem);
 	/*
 	 * affected cpus must always be the one, which are online. We aren't
 	 * managing offline cpus here.
@@ -1421,11 +1432,12 @@ static int cpufreq_offline(unsigned int cpu)
 		cpufreq_exit_governor(policy);
 
 	/*
-	 * Perform the ->exit() even during light-weight tear-down,
-	 * since this is a core component, and is essential for the
-	 * subsequent light-weight ->init() to succeed.
+	 * Perform the ->offline() during light-weight tear-down, as
+	 * that allows fast recovery when the CPU comes back.
 	 */
-	if (cpufreq_driver->exit) {
+	if (cpufreq_driver->offline) {
+		cpufreq_driver->offline(policy);
+	} else if (cpufreq_driver->exit) {
 		cpufreq_driver->exit(policy);
 		policy->freq_table = NULL;
 	}
@@ -1454,8 +1466,13 @@ static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
 	cpumask_clear_cpu(cpu, policy->real_cpus);
 	remove_cpu_dev_symlink(policy, dev);
 
-	if (cpumask_empty(policy->real_cpus))
+	if (cpumask_empty(policy->real_cpus)) {
+		/* We did light-weight exit earlier, do full tear down now */
+		if (cpufreq_driver->offline)
+			cpufreq_driver->exit(policy);
+
 		cpufreq_policy_free(policy);
+	}
 }
 
 /**
@@ -2488,7 +2505,8 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 		    driver_data->target) ||
 	     (driver_data->setpolicy && (driver_data->target_index ||
 		    driver_data->target)) ||
-	     (!!driver_data->get_intermediate != !!driver_data->target_intermediate))
+	     (!!driver_data->get_intermediate != !!driver_data->target_intermediate) ||
+	     (!driver_data->online != !driver_data->offline))
 		return -EINVAL;
 
 	pr_debug("trying to register driver %s\n", driver_data->name);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 9db074ecbbd7..b160e98076e3 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -325,6 +325,8 @@ struct cpufreq_driver {
 	/* optional */
 	int		(*bios_limit)(int cpu, unsigned int *limit);
 
+	int		(*online)(struct cpufreq_policy *policy);
+	int		(*offline)(struct cpufreq_policy *policy);
 	int		(*exit)(struct cpufreq_policy *policy);
 	void		(*stop_cpu)(struct cpufreq_policy *policy);
 	int		(*suspend)(struct cpufreq_policy *policy);

From 263abfe74b5f1058887b14e7668b0d737e040e42 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 12 Feb 2019 12:36:47 +0530
Subject: [PATCH 54/80] cpufreq: dt: Implement online/offline() callbacks

Implement the light-weight tear down and bring up helpers to reduce the
amount of work to do on CPU offline/online operation.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq-dt.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 7ba392911cd0..1aefaa1b0ca2 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -295,6 +295,21 @@ static int cpufreq_init(struct cpufreq_policy *policy)
 	return ret;
 }
 
+static int cpufreq_online(struct cpufreq_policy *policy)
+{
+	/* We did light-weight tear down earlier, nothing to do here */
+	return 0;
+}
+
+static int cpufreq_offline(struct cpufreq_policy *policy)
+{
+	/*
+	 * Preserve policy->driver_data and don't free resources on light-weight
+	 * tear down.
+	 */
+	return 0;
+}
+
 static int cpufreq_exit(struct cpufreq_policy *policy)
 {
 	struct private_data *priv = policy->driver_data;
@@ -319,6 +334,8 @@ static struct cpufreq_driver dt_cpufreq_driver = {
 	.get = cpufreq_generic_get,
 	.init = cpufreq_init,
 	.exit = cpufreq_exit,
+	.online = cpufreq_online,
+	.offline = cpufreq_offline,
 	.name = "cpufreq-dt",
 	.attr = cpufreq_dt_attr,
 	.suspend = cpufreq_generic_suspend,

From 076b862c7e4409d2dcacfda19f7eaf8d07ab9200 Mon Sep 17 00:00:00 2001
From: Erwan Velu <e.velu@criteo.com>
Date: Wed, 13 Feb 2019 09:58:35 +0100
Subject: [PATCH 55/80] cpufreq: intel_pstate: Add reasons for failure and
 debug messages

The init code path has several exceptions where the driver can
decide not to load.

As CONFIG_X86_INTEL_PSTATE is generally set to Y, the return code is
not reachable.  The initialization code is neither verbose of the
reason why it did choose to prematurely exit, so it is difficult for
a user to determine, on a given platform, why the driver didn't load
properly.

This patch is about reporting to the user the reason/context of why
the driver failed to load.  That is a precious hint when debugging
a platform.

Signed-off-by: Erwan Velu <e.velu@criteo.com>
[ rjw: Subject & changelog, minor fixups ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 5ab6a4fe93aa..75a5408e6981 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -2476,6 +2476,7 @@ static bool __init intel_pstate_no_acpi_pss(void)
 		kfree(pss);
 	}
 
+	pr_debug("ACPI _PSS not found\n");
 	return true;
 }
 
@@ -2486,9 +2487,14 @@ static bool __init intel_pstate_no_acpi_pcch(void)
 
 	status = acpi_get_handle(NULL, "\\_SB", &handle);
 	if (ACPI_FAILURE(status))
-		return true;
+		goto not_found;
 
-	return !acpi_has_method(handle, "PCCH");
+	if (acpi_has_method(handle, "PCCH"))
+		return false;
+
+not_found:
+	pr_debug("ACPI PCCH not found\n");
+	return true;
 }
 
 static bool __init intel_pstate_has_acpi_ppc(void)
@@ -2503,6 +2509,7 @@ static bool __init intel_pstate_has_acpi_ppc(void)
 		if (acpi_has_method(pr->handle, "_PPC"))
 			return true;
 	}
+	pr_debug("ACPI _PPC not found\n");
 	return false;
 }
 
@@ -2540,8 +2547,10 @@ static bool __init intel_pstate_platform_pwr_mgmt_exists(void)
 	id = x86_match_cpu(intel_pstate_cpu_oob_ids);
 	if (id) {
 		rdmsrl(MSR_MISC_PWR_MGMT, misc_pwr);
-		if ( misc_pwr & (1 << 8))
+		if (misc_pwr & (1 << 8)) {
+			pr_debug("Bit 8 in the MISC_PWR_MGMT MSR set\n");
 			return true;
+		}
 	}
 
 	idx = acpi_match_platform_list(plat_info);
@@ -2607,22 +2616,28 @@ static int __init intel_pstate_init(void)
 		}
 	} else {
 		id = x86_match_cpu(intel_pstate_cpu_ids);
-		if (!id)
+		if (!id) {
+			pr_info("CPU ID not supported\n");
 			return -ENODEV;
+		}
 
 		copy_cpu_funcs((struct pstate_funcs *)id->driver_data);
 	}
 
-	if (intel_pstate_msrs_not_valid())
+	if (intel_pstate_msrs_not_valid()) {
+		pr_info("Invalid MSRs\n");
 		return -ENODEV;
+	}
 
 hwp_cpu_matched:
 	/*
 	 * The Intel pstate driver will be ignored if the platform
 	 * firmware has its own power management modes.
 	 */
-	if (intel_pstate_platform_pwr_mgmt_exists())
+	if (intel_pstate_platform_pwr_mgmt_exists()) {
+		pr_info("P-states controlled by the platform\n");
 		return -ENODEV;
+	}
 
 	if (!hwp_active && hwp_only)
 		return -ENOTSUPP;

From 4a0fa9f9fdb5a7230da6fb250c270c01d5f21fb5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 11 Feb 2019 13:17:12 +0100
Subject: [PATCH 56/80] PM / suspend: Print debug messages for device using
 direct-complete

Devices using the direct-complete optimization are not present it
debug messages printed by the core device suspend and resume code,
which sometimes makes it difficult to diagnose problems related to
them, so add debug messages for those devices.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 0992e67e862b..337a56ff11b7 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -1741,8 +1741,10 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
 	if (dev->power.direct_complete) {
 		if (pm_runtime_status_suspended(dev)) {
 			pm_runtime_disable(dev);
-			if (pm_runtime_status_suspended(dev))
+			if (pm_runtime_status_suspended(dev)) {
+				pm_dev_dbg(dev, state, "direct-complete ");
 				goto Complete;
+			}
 
 			pm_runtime_enable(dev);
 		}

From a9a22b570bd512eb1ed946042bb88f11ecccca0c Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 14 Feb 2019 16:16:21 +0530
Subject: [PATCH 57/80] cpufreq: Replace double NOT (!!) with single NOT (!)

Double NOT (!!) operation is normally done to convert a non-zero value
to 1 and keep zero as is, but that isn't the requirement in this case.
All we wanted was to make sure that only one of the two routines isn't
set, i.e. either both function pointers are set or both are unset.

This can be done with a single NOT (!) operation as well.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 55e9795801a4..ad4e9991c3cc 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2505,7 +2505,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 		    driver_data->target) ||
 	     (driver_data->setpolicy && (driver_data->target_index ||
 		    driver_data->target)) ||
-	     (!!driver_data->get_intermediate != !!driver_data->target_intermediate) ||
+	     (!driver_data->get_intermediate != !driver_data->target_intermediate) ||
 	     (!driver_data->online != !driver_data->offline))
 		return -EINVAL;
 

From 04b1d5d098491244f506c4265cc95b87210eef2f Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Thu, 14 Feb 2019 21:35:06 +0800
Subject: [PATCH 58/80] intel_idle: add support for Jacobsville

Jacobsville uses the same C-states as Denverton.

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/idle/intel_idle.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 8b5d85c91e9d..b8647b5c3d4d 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1103,6 +1103,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
 	INTEL_CPU_FAM6(ATOM_GOLDMONT,		idle_cpu_bxt),
 	INTEL_CPU_FAM6(ATOM_GOLDMONT_PLUS,	idle_cpu_bxt),
 	INTEL_CPU_FAM6(ATOM_GOLDMONT_X,		idle_cpu_dnv),
+	INTEL_CPU_FAM6(ATOM_TREMONT_X,		idle_cpu_dnv),
 	{}
 };
 

From 7416f1f206877fa2f61ada3dadbefdb4817b541f Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Thu, 14 Feb 2019 10:12:48 -0800
Subject: [PATCH 59/80] PM / Domains: Mark "name" const in
 genpd_dev_pm_attach_by_name()

The genpd_dev_pm_attach_by_name() simply takes the name and passes it
to of_property_match_string() where the argument is "const char *".
Adding a const here allows a later patch to add a const to
dev_pm_domain_attach_by_name() which allows drivers to pass in a name
that was declared "const" in a driver.

Fixes: 5d6be70add65 ("PM / Domains: Introduce option to attach a device by name to genpd")
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain.c | 2 +-
 include/linux/pm_domain.h   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 45eafe8cf7dd..2c334c01fc43 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -2483,7 +2483,7 @@ EXPORT_SYMBOL_GPL(genpd_dev_pm_attach_by_id);
  * power-domain-names DT property. For further description see
  * genpd_dev_pm_attach_by_id().
  */
-struct device *genpd_dev_pm_attach_by_name(struct device *dev, char *name)
+struct device *genpd_dev_pm_attach_by_name(struct device *dev, const char *name)
 {
 	int index;
 
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index dd364abb649a..203be5082f33 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -271,7 +271,7 @@ int genpd_dev_pm_attach(struct device *dev);
 struct device *genpd_dev_pm_attach_by_id(struct device *dev,
 					 unsigned int index);
 struct device *genpd_dev_pm_attach_by_name(struct device *dev,
-					   char *name);
+					   const char *name);
 #else /* !CONFIG_PM_GENERIC_DOMAINS_OF */
 static inline int of_genpd_add_provider_simple(struct device_node *np,
 					struct generic_pm_domain *genpd)
@@ -324,7 +324,7 @@ static inline struct device *genpd_dev_pm_attach_by_id(struct device *dev,
 }
 
 static inline struct device *genpd_dev_pm_attach_by_name(struct device *dev,
-							 char *name)
+							 const char *name)
 {
 	return NULL;
 }

From eeb35df05244c268cd69b425edf6dc6a49ee7ab4 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Thu, 14 Feb 2019 10:12:49 -0800
Subject: [PATCH 60/80] PM / Domains: Mark "name" const in
 dev_pm_domain_attach_by_name()

As of the patch ("PM / Domains: Mark "name" const in
genpd_dev_pm_attach_by_name()") it's clear that the name in
dev_pm_domain_attach_by_name() can be const.  Mark it as so.  This
allows drivers to pass in a name that was declared "const" in a
driver.

Fixes: 27dceb81f445 ("PM / Domains: Introduce dev_pm_domain_attach_by_name()")
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/common.c | 2 +-
 include/linux/pm_domain.h   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/base/power/common.c b/drivers/base/power/common.c
index b413951c6abc..22aedb28aad7 100644
--- a/drivers/base/power/common.c
+++ b/drivers/base/power/common.c
@@ -160,7 +160,7 @@ EXPORT_SYMBOL_GPL(dev_pm_domain_attach_by_id);
  * For a detailed function description, see dev_pm_domain_attach_by_id().
  */
 struct device *dev_pm_domain_attach_by_name(struct device *dev,
-					    char *name)
+					    const char *name)
 {
 	if (dev->pm_domain)
 		return ERR_PTR(-EEXIST);
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 203be5082f33..1ed5874bcee0 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -341,7 +341,7 @@ int dev_pm_domain_attach(struct device *dev, bool power_on);
 struct device *dev_pm_domain_attach_by_id(struct device *dev,
 					  unsigned int index);
 struct device *dev_pm_domain_attach_by_name(struct device *dev,
-					    char *name);
+					    const char *name);
 void dev_pm_domain_detach(struct device *dev, bool power_off);
 void dev_pm_domain_set(struct device *dev, struct dev_pm_domain *pd);
 #else
@@ -355,7 +355,7 @@ static inline struct device *dev_pm_domain_attach_by_id(struct device *dev,
 	return NULL;
 }
 static inline struct device *dev_pm_domain_attach_by_name(struct device *dev,
-							  char *name)
+							  const char *name)
 {
 	return NULL;
 }

From cd284ae36b6a43d478261bcb7c001db07e3a0ed5 Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Sat, 16 Feb 2019 10:55:26 -0500
Subject: [PATCH 61/80] cpufreq: pcc-cpufreq: remove unneeded semicolon

The semicolon is unneeded, so remove it.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/pcc-cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/pcc-cpufreq.c b/drivers/cpufreq/pcc-cpufreq.c
index 099a849396f6..1e5e64643c3a 100644
--- a/drivers/cpufreq/pcc-cpufreq.c
+++ b/drivers/cpufreq/pcc-cpufreq.c
@@ -268,7 +268,7 @@ static int pcc_get_offset(int cpu)
 	if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
 		ret = -ENODEV;
 		goto out_free;
-	};
+	}
 
 	offset = &(pccp->package.elements[0]);
 	if (!offset || offset->type != ACPI_TYPE_INTEGER) {

From 2814335cb3c8b364dcc740b6317bd3297e81995c Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Sat, 16 Feb 2019 10:55:27 -0500
Subject: [PATCH 62/80] cpufreq: longhaul: remove unneeded semicolon

The semicolon is unneeded, so remove it.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/longhaul.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/longhaul.c b/drivers/cpufreq/longhaul.c
index 279bd9e9fa95..fb546e0d0356 100644
--- a/drivers/cpufreq/longhaul.c
+++ b/drivers/cpufreq/longhaul.c
@@ -851,7 +851,7 @@ static int longhaul_cpu_init(struct cpufreq_policy *policy)
 	case TYPE_POWERSAVER:
 		pr_cont("Powersaver supported\n");
 		break;
-	};
+	}
 
 	/* Doesn't hurt */
 	longhaul_setup_southbridge();

From 5ae06c237fd05a1e9acf6f35891cfd85de792521 Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Sat, 16 Feb 2019 12:06:23 -0500
Subject: [PATCH 63/80] cpufreq: powernv: fix missing check of return value in
 init_powernv_pstates()

kmalloc() could fail, so insert a check of its return value. And
if it fails, returns -ENOMEM.

And remove (struct pstate_idx_revmap_data *) to fix coccinelle WARNING
by the way.

WARNING: casting value returned by memory allocation function to (struct
pstate_idx_revmap_data *) is useless.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/powernv-cpufreq.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index 7e7ad3879c4e..d2230812fa4b 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -244,6 +244,7 @@ static int init_powernv_pstates(void)
 	u32 len_ids, len_freqs;
 	u32 pstate_min, pstate_max, pstate_nominal;
 	u32 pstate_turbo, pstate_ultra_turbo;
+	int rc = -ENODEV;
 
 	power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
 	if (!power_mgt) {
@@ -327,8 +328,11 @@ static int init_powernv_pstates(void)
 		powernv_freqs[i].frequency = freq * 1000; /* kHz */
 		powernv_freqs[i].driver_data = id & 0xFF;
 
-		revmap_data = (struct pstate_idx_revmap_data *)
-			      kmalloc(sizeof(*revmap_data), GFP_KERNEL);
+		revmap_data = kmalloc(sizeof(*revmap_data), GFP_KERNEL);
+		if (!revmap_data) {
+			rc = -ENOMEM;
+			goto out;
+		}
 
 		revmap_data->pstate_id = id & 0xFF;
 		revmap_data->cpufreq_table_idx = i;
@@ -357,7 +361,7 @@ static int init_powernv_pstates(void)
 	return 0;
 out:
 	of_node_put(power_mgt);
-	return -ENODEV;
+	return rc;
 }
 
 /* Returns the CPU frequency corresponding to the pstate_id. */

From d6c8e086e9d98a591a6b515078cb0e05fb538b5c Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Sat, 16 Feb 2019 11:15:01 -0500
Subject: [PATCH 64/80] cpufreq: speedstep: convert BUG() to BUG_ON()

To fix coccinelle WARNING.

WARNING: Use BUG_ON instead of if condition followed by BUG.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/speedstep-ich.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/cpufreq/speedstep-ich.c b/drivers/cpufreq/speedstep-ich.c
index fbbcb88db061..5d8a09b82efb 100644
--- a/drivers/cpufreq/speedstep-ich.c
+++ b/drivers/cpufreq/speedstep-ich.c
@@ -243,8 +243,7 @@ static unsigned int speedstep_get(unsigned int cpu)
 	unsigned int speed;
 
 	/* You're supposed to ensure CPU is online. */
-	if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
-		BUG();
+	BUG_ON(smp_call_function_single(cpu, get_freq_data, &speed, 1));
 
 	pr_debug("detected %u kHz as current frequency\n", speed);
 	return speed;

From 40b46b3b2f098e3740f65024099a7c55ff4b9866 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Date: Thu, 14 Feb 2019 18:05:05 +0100
Subject: [PATCH 65/80] cpufreq: davinci: move configuration to
 include/linux/platform_data

The header containing the configuration structure for davinci cpufreq
driver lives in mach-davinci/include/mach/. This is fine for now but
if we want to make davinci part of the multi_v5 build, no code external
to mach-davinci should include machine-specific headers.

Move the configuration structure to include/linux/platform_data.

While we're at it: convert the GPL-2.0 boilerplate to a proper SPDX
license identifier.

Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Acked-by: Sekhar Nori <nsekhar@ti.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 arch/arm/mach-davinci/da850.c                 |  2 +-
 arch/arm/mach-davinci/include/mach/cpufreq.h  | 26 -------------------
 drivers/cpufreq/davinci-cpufreq.c             |  5 +---
 include/linux/platform_data/davinci-cpufreq.h | 19 ++++++++++++++
 4 files changed, 21 insertions(+), 31 deletions(-)
 delete mode 100644 arch/arm/mach-davinci/include/mach/cpufreq.h
 create mode 100644 include/linux/platform_data/davinci-cpufreq.h

diff --git a/arch/arm/mach-davinci/da850.c b/arch/arm/mach-davinci/da850.c
index e7b78df2bfef..a02ff431ba47 100644
--- a/arch/arm/mach-davinci/da850.c
+++ b/arch/arm/mach-davinci/da850.c
@@ -21,6 +21,7 @@
 #include <linux/mfd/da8xx-cfgchip.h>
 #include <linux/platform_data/clk-da8xx-cfgchip.h>
 #include <linux/platform_data/clk-davinci-pll.h>
+#include <linux/platform_data/davinci-cpufreq.h>
 #include <linux/platform_data/gpio-davinci.h>
 #include <linux/platform_device.h>
 #include <linux/regmap.h>
@@ -29,7 +30,6 @@
 #include <asm/mach/map.h>
 
 #include <mach/common.h>
-#include <mach/cpufreq.h>
 #include <mach/cputype.h>
 #include <mach/da8xx.h>
 #include <mach/irqs.h>
diff --git a/arch/arm/mach-davinci/include/mach/cpufreq.h b/arch/arm/mach-davinci/include/mach/cpufreq.h
deleted file mode 100644
index 3c089cfb6cd6..000000000000
--- a/arch/arm/mach-davinci/include/mach/cpufreq.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * TI DaVinci CPUFreq platform support.
- *
- * Copyright (C) 2009 Texas Instruments, Inc. http://www.ti.com/
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation version 2.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-#ifndef _MACH_DAVINCI_CPUFREQ_H
-#define _MACH_DAVINCI_CPUFREQ_H
-
-#include <linux/cpufreq.h>
-
-struct davinci_cpufreq_config {
-	struct cpufreq_frequency_table *freq_table;
-	int (*set_voltage) (unsigned int index);
-	int (*init) (void);
-};
-
-#endif
diff --git a/drivers/cpufreq/davinci-cpufreq.c b/drivers/cpufreq/davinci-cpufreq.c
index d54a27c99121..940fe85db97a 100644
--- a/drivers/cpufreq/davinci-cpufreq.c
+++ b/drivers/cpufreq/davinci-cpufreq.c
@@ -23,13 +23,10 @@
 #include <linux/init.h>
 #include <linux/err.h>
 #include <linux/clk.h>
+#include <linux/platform_data/davinci-cpufreq.h>
 #include <linux/platform_device.h>
 #include <linux/export.h>
 
-#include <mach/hardware.h>
-#include <mach/cpufreq.h>
-#include <mach/common.h>
-
 struct davinci_cpufreq {
 	struct device *dev;
 	struct clk *armclk;
diff --git a/include/linux/platform_data/davinci-cpufreq.h b/include/linux/platform_data/davinci-cpufreq.h
new file mode 100644
index 000000000000..3fbf9f2793b5
--- /dev/null
+++ b/include/linux/platform_data/davinci-cpufreq.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * TI DaVinci CPUFreq platform support.
+ *
+ * Copyright (C) 2009 Texas Instruments, Inc. http://www.ti.com/
+ */
+
+#ifndef _MACH_DAVINCI_CPUFREQ_H
+#define _MACH_DAVINCI_CPUFREQ_H
+
+#include <linux/cpufreq.h>
+
+struct davinci_cpufreq_config {
+	struct cpufreq_frequency_table *freq_table;
+	int (*set_voltage)(unsigned int index);
+	int (*init)(void);
+};
+
+#endif /* _MACH_DAVINCI_CPUFREQ_H */

From 1757d05f3112acc5c0cdbcccad3afdee99655bf9 Mon Sep 17 00:00:00 2001
From: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Date: Sun, 17 Feb 2019 11:54:14 +0800
Subject: [PATCH 66/80] ACPI / CPPC: Add a helper to get desired performance

This patch add a helper to get the value of desired performance
register.

Signed-off-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
[ rjw: More white space ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/cppc_acpi.c | 42 ++++++++++++++++++++++++++++++++++++++++
 include/acpi/cppc_acpi.h |  1 +
 2 files changed, 43 insertions(+)

diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c
index 217a782c3e55..1b207fca1420 100644
--- a/drivers/acpi/cppc_acpi.c
+++ b/drivers/acpi/cppc_acpi.c
@@ -1050,6 +1050,48 @@ static int cpc_write(int cpu, struct cpc_register_resource *reg_res, u64 val)
 	return ret_val;
 }
 
+/**
+ * cppc_get_desired_perf - Get the value of desired performance register.
+ * @cpunum: CPU from which to get desired performance.
+ * @desired_perf: address of a variable to store the returned desired performance
+ *
+ * Return: 0 for success, -EIO otherwise.
+ */
+int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
+{
+	struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpunum);
+	int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpunum);
+	struct cpc_register_resource *desired_reg;
+	struct cppc_pcc_data *pcc_ss_data = NULL;
+
+	desired_reg = &cpc_desc->cpc_regs[DESIRED_PERF];
+
+	if (CPC_IN_PCC(desired_reg)) {
+		int ret = 0;
+
+		if (pcc_ss_id < 0)
+			return -EIO;
+
+		pcc_ss_data = pcc_data[pcc_ss_id];
+
+		down_write(&pcc_ss_data->pcc_lock);
+
+		if (send_pcc_cmd(pcc_ss_id, CMD_READ) >= 0)
+			cpc_read(cpunum, desired_reg, desired_perf);
+		else
+			ret = -EIO;
+
+		up_write(&pcc_ss_data->pcc_lock);
+
+		return ret;
+	}
+
+	cpc_read(cpunum, desired_reg, desired_perf);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cppc_get_desired_perf);
+
 /**
  * cppc_get_perf_caps - Get a CPUs performance capabilities.
  * @cpunum: CPU from which to get capabilities info.
diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h
index 4f34734e7f36..ba6fd7202775 100644
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -137,6 +137,7 @@ struct cppc_cpudata {
 	cpumask_var_t shared_cpu_map;
 };
 
+extern int cppc_get_desired_perf(int cpunum, u64 *desired_perf);
 extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs);
 extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls);
 extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps);

From 6c8d750f9784cef32a8cffdad74c8a351b4ca3a6 Mon Sep 17 00:00:00 2001
From: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Date: Sun, 17 Feb 2019 11:54:15 +0800
Subject: [PATCH 67/80] cpufreq / cppc: Work around for Hisilicon CPPC cpufreq

Hisilicon chips do not support delivered performance counter register
and reference performance counter register. But the platform can
calculate the real performance using its own method. We reuse the
desired performance register to store the real performance calculated by
the platform. After the platform finished the frequency adjust, it gets
the real performance and writes it into desired performance register. Os
can use it to calculate the real frequency.

Signed-off-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
[ rjw: Drop unnecessary braces ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cppc_cpufreq.c | 65 ++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index fd25c21cee72..2ae978d27e61 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -42,6 +42,66 @@
  */
 static struct cppc_cpudata **all_cpu_data;
 
+struct cppc_workaround_oem_info {
+	char oem_id[ACPI_OEM_ID_SIZE +1];
+	char oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1];
+	u32 oem_revision;
+};
+
+static bool apply_hisi_workaround;
+
+static struct cppc_workaround_oem_info wa_info[] = {
+	{
+		.oem_id		= "HISI  ",
+		.oem_table_id	= "HIP07   ",
+		.oem_revision	= 0,
+	}, {
+		.oem_id		= "HISI  ",
+		.oem_table_id	= "HIP08   ",
+		.oem_revision	= 0,
+	}
+};
+
+static unsigned int cppc_cpufreq_perf_to_khz(struct cppc_cpudata *cpu,
+					unsigned int perf);
+
+/*
+ * HISI platform does not support delivered performance counter and
+ * reference performance counter. It can calculate the performance using the
+ * platform specific mechanism. We reuse the desired performance register to
+ * store the real performance calculated by the platform.
+ */
+static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpunum)
+{
+	struct cppc_cpudata *cpudata = all_cpu_data[cpunum];
+	u64 desired_perf;
+	int ret;
+
+	ret = cppc_get_desired_perf(cpunum, &desired_perf);
+	if (ret < 0)
+		return -EIO;
+
+	return cppc_cpufreq_perf_to_khz(cpudata, desired_perf);
+}
+
+static void cppc_check_hisi_workaround(void)
+{
+	struct acpi_table_header *tbl;
+	acpi_status status = AE_OK;
+	int i;
+
+	status = acpi_get_table(ACPI_SIG_PCCT, 0, &tbl);
+	if (ACPI_FAILURE(status) || !tbl)
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(wa_info); i++) {
+		if (!memcmp(wa_info[i].oem_id, tbl->oem_id, ACPI_OEM_ID_SIZE) &&
+		    !memcmp(wa_info[i].oem_table_id, tbl->oem_table_id, ACPI_OEM_TABLE_ID_SIZE) &&
+		    wa_info[i].oem_revision == tbl->oem_revision)
+			apply_hisi_workaround = true;
+	}
+}
+
 /* Callback function used to retrieve the max frequency from DMI */
 static void cppc_find_dmi_mhz(const struct dmi_header *dm, void *private)
 {
@@ -334,6 +394,9 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpunum)
 	struct cppc_cpudata *cpu = all_cpu_data[cpunum];
 	int ret;
 
+	if (apply_hisi_workaround)
+		return hisi_cppc_cpufreq_get_rate(cpunum);
+
 	ret = cppc_get_perf_ctrs(cpunum, &fb_ctrs_t0);
 	if (ret)
 		return ret;
@@ -386,6 +449,8 @@ static int __init cppc_cpufreq_init(void)
 		goto out;
 	}
 
+	cppc_check_hisi_workaround();
+
 	ret = cpufreq_register_driver(&cppc_cpufreq_driver);
 	if (ret)
 		goto out;

From 34a62cd0df89dd7034165048c0921d1314191b66 Mon Sep 17 00:00:00 2001
From: Yazen Ghannam <yazen.ghannam@amd.com>
Date: Mon, 18 Feb 2019 01:33:49 +0000
Subject: [PATCH 68/80] ACPI / processor: Set P_LVL{2,3} idle state
 descriptions

The ACPI idle driver will fallback to using the legacy P_LVL* SystemIO
method of entering C-states if the _CST method is disabled and P_BLK is
defined. However, in this case the C2 and C3 states won't have a
description set, so the user will see "<null>" when reading the
description from sysfs.

Give each of these states a description.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/processor_idle.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index b2131c4ea124..98d4ec5bf450 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -282,6 +282,13 @@ static int acpi_processor_get_power_info_fadt(struct acpi_processor *pr)
 			  pr->power.states[ACPI_STATE_C2].address,
 			  pr->power.states[ACPI_STATE_C3].address));
 
+	snprintf(pr->power.states[ACPI_STATE_C2].desc,
+			 ACPI_CX_DESC_LEN, "ACPI P_LVL2 IOPORT 0x%x",
+			 pr->power.states[ACPI_STATE_C2].address);
+	snprintf(pr->power.states[ACPI_STATE_C3].desc,
+			 ACPI_CX_DESC_LEN, "ACPI P_LVL3 IOPORT 0x%x",
+			 pr->power.states[ACPI_STATE_C3].address);
+
 	return 0;
 }
 

From ba6f3ec198d555d0c9b7e038cb003922aea2797d Mon Sep 17 00:00:00 2001
From: Gayatri Kammela <gayatri.kammela@intel.com>
Date: Mon, 18 Feb 2019 15:01:02 +0800
Subject: [PATCH 69/80] powercap/intel_rapl: add Ice Lake mobile

Add Ice Lake mobile support to intel_rapl driver.

Signed-off-by: Gayatri Kammela <gayatri.kammela@intel.com>
Signed-off-by: Joe Konno <joe.konno@intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index 393463eb157f..4347f15165f8 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -1156,6 +1156,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = {
 	INTEL_CPU_FAM6(KABYLAKE_MOBILE,		rapl_defaults_core),
 	INTEL_CPU_FAM6(KABYLAKE_DESKTOP,	rapl_defaults_core),
 	INTEL_CPU_FAM6(CANNONLAKE_MOBILE,	rapl_defaults_core),
+	INTEL_CPU_FAM6(ICELAKE_MOBILE,		rapl_defaults_core),
 
 	INTEL_CPU_FAM6(ATOM_SILVERMONT,		rapl_defaults_byt),
 	INTEL_CPU_FAM6(ATOM_AIRMONT,		rapl_defaults_cht),

From fa93b51c55099660c2583ec918ef27df18fc511a Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 15 Feb 2019 13:15:32 +0100
Subject: [PATCH 70/80] cpufreq: intel_pstate: Avoid redundant initialization
 of local vars

After commit 1a4fe38add8b ("cpufreq: intel_pstate: Remove max/min
fractions to limit performance") the initial value of the pstate local
variable in intel_pstate_max_within_limits() and the initial value of
the max_pstate local variable in intel_pstate_prepare_request() are
both immediately discarded, so initialize both these variables to
their target values upfront.

No intentional changes of behavior.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 75a5408e6981..3a4d15a48c1f 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1471,11 +1471,9 @@ static void intel_pstate_set_min_pstate(struct cpudata *cpu)
 
 static void intel_pstate_max_within_limits(struct cpudata *cpu)
 {
-	int pstate;
+	int pstate = max(cpu->pstate.min_pstate, cpu->max_perf_ratio);
 
 	update_turbo_state();
-	pstate = intel_pstate_get_base_pstate(cpu);
-	pstate = max(cpu->pstate.min_pstate, cpu->max_perf_ratio);
 	intel_pstate_set_pstate(cpu, pstate);
 }
 
@@ -1716,11 +1714,9 @@ static inline int32_t get_target_pstate(struct cpudata *cpu)
 
 static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate)
 {
-	int max_pstate = intel_pstate_get_base_pstate(cpu);
-	int min_pstate;
+	int min_pstate = max(cpu->pstate.min_pstate, cpu->min_perf_ratio);
+	int max_pstate = max(min_pstate, cpu->max_perf_ratio);
 
-	min_pstate = max(cpu->pstate.min_pstate, cpu->min_perf_ratio);
-	max_pstate = max(min_pstate, cpu->max_perf_ratio);
 	return clamp_t(int, pstate, min_pstate, max_pstate);
 }
 

From a8e1942d97dcc44d1425807c71a4252f9e3b53b6 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 15 Feb 2019 13:16:40 +0100
Subject: [PATCH 71/80] cpufreq: intel_pstate: Eliminate
 intel_pstate_get_base_pstate()

There is only one caller of intel_pstate_get_base_pstate() and it is
more straightforward to carry out the computation directly in the
caller, so do that and drop intel_pstate_get_base_pstate().

No intentional changes of behavior.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 3a4d15a48c1f..9a6cb3a1be50 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1445,12 +1445,6 @@ static int knl_get_turbo_pstate(void)
 	return ret;
 }
 
-static int intel_pstate_get_base_pstate(struct cpudata *cpu)
-{
-	return global.no_turbo || global.turbo_disabled ?
-			cpu->pstate.max_pstate : cpu->pstate.turbo_pstate;
-}
-
 static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate)
 {
 	trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu);
@@ -1973,7 +1967,8 @@ static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy,
 	if (hwp_active) {
 		intel_pstate_get_hwp_max(cpu->cpu, &turbo_max, &max_state);
 	} else {
-		max_state = intel_pstate_get_base_pstate(cpu);
+		max_state = global.no_turbo || global.turbo_disabled ?
+			cpu->pstate.max_pstate : cpu->pstate.turbo_pstate;
 		turbo_max = cpu->pstate.turbo_pstate;
 	}
 

From b8bd1581aa6110eb234c0d424eccd3f32d7317e6 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 7 Feb 2019 12:51:04 +0100
Subject: [PATCH 72/80] cpufreq: intel_pstate: Rework iowait boosting to be
 less aggressive

The current iowait boosting mechanism in intel_pstate_update_util()
is quite aggressive, as it goes to the maximum P-state right away,
and may cause excessive amounts of energy to be used, which is not
desirable and arguably isn't necessary too.

Follow commit a5a0809bc58e ("cpufreq: schedutil: Make iowait boost
more energy efficient") that reworked the analogous iowait boost
mechanism in the schedutil governor and make the iowait boosting
in intel_pstate_update_util() work along the same lines.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 36 +++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 9a6cb3a1be50..002f5169d4eb 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -50,6 +50,8 @@
 #define int_tofp(X) ((int64_t)(X) << FRAC_BITS)
 #define fp_toint(X) ((X) >> FRAC_BITS)
 
+#define ONE_EIGHTH_FP ((int64_t)1 << (FRAC_BITS - 3))
+
 #define EXT_BITS 6
 #define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS)
 #define fp_ext_toint(X) ((X) >> EXT_FRAC_BITS)
@@ -1671,17 +1673,14 @@ static inline int32_t get_avg_pstate(struct cpudata *cpu)
 static inline int32_t get_target_pstate(struct cpudata *cpu)
 {
 	struct sample *sample = &cpu->sample;
-	int32_t busy_frac, boost;
+	int32_t busy_frac;
 	int target, avg_pstate;
 
 	busy_frac = div_fp(sample->mperf << cpu->aperf_mperf_shift,
 			   sample->tsc);
 
-	boost = cpu->iowait_boost;
-	cpu->iowait_boost >>= 1;
-
-	if (busy_frac < boost)
-		busy_frac = boost;
+	if (busy_frac < cpu->iowait_boost)
+		busy_frac = cpu->iowait_boost;
 
 	sample->busy_scaled = busy_frac * 100;
 
@@ -1758,29 +1757,30 @@ static void intel_pstate_update_util(struct update_util_data *data, u64 time,
 	if (smp_processor_id() != cpu->cpu)
 		return;
 
+	delta_ns = time - cpu->last_update;
 	if (flags & SCHED_CPUFREQ_IOWAIT) {
-		cpu->iowait_boost = int_tofp(1);
-		cpu->last_update = time;
-		/*
-		 * The last time the busy was 100% so P-state was max anyway
-		 * so avoid overhead of computation.
-		 */
-		if (fp_toint(cpu->sample.busy_scaled) == 100)
-			return;
-
-		goto set_pstate;
+		/* Start over if the CPU may have been idle. */
+		if (delta_ns > TICK_NSEC) {
+			cpu->iowait_boost = ONE_EIGHTH_FP;
+		} else if (cpu->iowait_boost) {
+			cpu->iowait_boost <<= 1;
+			if (cpu->iowait_boost > int_tofp(1))
+				cpu->iowait_boost = int_tofp(1);
+		} else {
+			cpu->iowait_boost = ONE_EIGHTH_FP;
+		}
 	} else if (cpu->iowait_boost) {
 		/* Clear iowait_boost if the CPU may have been idle. */
-		delta_ns = time - cpu->last_update;
 		if (delta_ns > TICK_NSEC)
 			cpu->iowait_boost = 0;
+		else
+			cpu->iowait_boost >>= 1;
 	}
 	cpu->last_update = time;
 	delta_ns = time - cpu->sample.time;
 	if ((s64)delta_ns < INTEL_PSTATE_SAMPLING_INTERVAL)
 		return;
 
-set_pstate:
 	if (intel_pstate_sample(cpu, time))
 		intel_pstate_adjust_pstate(cpu);
 }

From 85945c28b5a888043cb2b54f880d80d8915f21f5 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Thu, 14 Feb 2019 18:29:10 +0000
Subject: [PATCH 73/80] PM / core: Add support to skip power management in
 device/driver model

All device objects in the driver model contain fields that control the
handling of various power management activities. However, it's not
always useful. There are few instances where pseudo devices are added
to the model just to take advantage of many other features like
kobjects, udev events, and so on. One such example is cpu devices and
their caches.

The sysfs for the cpu caches are managed by adding devices with cpu
as the parent in cpu_device_create() when secondary cpu is brought
online. Generally when the secondary CPUs are hotplugged back in as part
of resume from suspend-to-ram, we call cpu_device_create() from the cpu
hotplug state machine while the cpu device associated with that CPU is
not yet ready to be resumed as the device_resume() call happens bit
later. It's not really needed to set the flag is_prepared for cpu
devices as they are mostly pseudo device and hotplug framework deals
with state machine and not managed through the cpu device.

This often results in annoying warning when resuming:
Enabling non-boot CPUs ...
CPU1: Booted secondary processor
 cache: parent cpu1 should not be sleeping
CPU1 is up
CPU2: Booted secondary processor
 cache: parent cpu2 should not be sleeping
CPU2 is up
.... and so on.

So in order to fix these kind of errors, we could just completely avoid
doing any power management related initialisations and operations if
they are not used by these devices.

Add no_pm flags to indicate that the device doesn't require any sort of
PM activities and all of them can be completely skipped. We can use the
same flag to also avoid adding not used *power* sysfs entries for these
devices. For now, lets use this for cpu cache devices.

Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Tested-by: Eugeniu Rosca <erosca@de.adit-jv.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/cpu.c         |  1 +
 drivers/base/power/main.c  |  7 +++++++
 drivers/base/power/sysfs.c |  6 ++++++
 include/linux/device.h     | 10 ++++++++++
 include/linux/pm.h         |  1 +
 5 files changed, 25 insertions(+)

diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index eb9443d5bae1..6ce93a52bf3f 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -427,6 +427,7 @@ __cpu_device_create(struct device *parent, void *drvdata,
 	dev->parent = parent;
 	dev->groups = groups;
 	dev->release = device_create_release;
+	device_set_pm_not_required(dev);
 	dev_set_drvdata(dev, drvdata);
 
 	retval = kobject_set_name_vargs(&dev->kobj, fmt, args);
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 337a56ff11b7..893ae464bfd6 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -124,6 +124,10 @@ void device_pm_unlock(void)
  */
 void device_pm_add(struct device *dev)
 {
+	/* Skip PM setup/initialization. */
+	if (device_pm_not_required(dev))
+		return;
+
 	pr_debug("PM: Adding info for %s:%s\n",
 		 dev->bus ? dev->bus->name : "No Bus", dev_name(dev));
 	device_pm_check_callbacks(dev);
@@ -142,6 +146,9 @@ void device_pm_add(struct device *dev)
  */
 void device_pm_remove(struct device *dev)
 {
+	if (device_pm_not_required(dev))
+		return;
+
 	pr_debug("PM: Removing info for %s:%s\n",
 		 dev->bus ? dev->bus->name : "No Bus", dev_name(dev));
 	complete_all(&dev->power.completion);
diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c
index 96c8a227610a..c6bf76124184 100644
--- a/drivers/base/power/sysfs.c
+++ b/drivers/base/power/sysfs.c
@@ -653,6 +653,10 @@ int dpm_sysfs_add(struct device *dev)
 {
 	int rc;
 
+	/* No need to create PM sysfs if explicitly disabled. */
+	if (device_pm_not_required(dev))
+		return 0;
+
 	rc = sysfs_create_group(&dev->kobj, &pm_attr_group);
 	if (rc)
 		return rc;
@@ -732,6 +736,8 @@ void rpm_sysfs_remove(struct device *dev)
 
 void dpm_sysfs_remove(struct device *dev)
 {
+	if (device_pm_not_required(dev))
+		return;
 	sysfs_unmerge_group(&dev->kobj, &pm_qos_latency_tolerance_attr_group);
 	dev_pm_qos_constraints_destroy(dev);
 	rpm_sysfs_remove(dev);
diff --git a/include/linux/device.h b/include/linux/device.h
index 6cb4640b6160..53028636fe39 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1165,6 +1165,16 @@ static inline bool device_async_suspend_enabled(struct device *dev)
 	return !!dev->power.async_suspend;
 }
 
+static inline bool device_pm_not_required(struct device *dev)
+{
+	return dev->power.no_pm;
+}
+
+static inline void device_set_pm_not_required(struct device *dev)
+{
+	dev->power.no_pm = true;
+}
+
 static inline void dev_pm_syscore_device(struct device *dev, bool val)
 {
 #ifdef CONFIG_PM_SLEEP
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 3d2cbf947768..06f7ed893928 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -592,6 +592,7 @@ struct dev_pm_info {
 	bool			is_suspended:1;	/* Ditto */
 	bool			is_noirq_suspended:1;
 	bool			is_late_suspended:1;
+	bool			no_pm:1;
 	bool			early_init:1;	/* Owned by the PM core */
 	bool			direct_complete:1;	/* Owned by the PM core */
 	u32			driver_flags;

From a0dbb819b84f87b3cb35083e264e9e584630e1f7 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 20 Feb 2019 00:22:51 +0100
Subject: [PATCH 74/80] cpufreq: Add kerneldoc comments for two core functions

Add kerneldoc comments describing cpufreq_set_policy() and
cpufreq_update_policy() as they have not been properly documented
so far and they really need to be documented.

While at it, fix white space around the cpufreq_set_policy() header.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq.c | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index ad4e9991c3cc..e4a7dbee84fc 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2218,12 +2218,25 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu)
 }
 EXPORT_SYMBOL(cpufreq_get_policy);
 
-/*
- * policy : current policy.
- * new_policy: policy to be set.
+/**
+ * cpufreq_set_policy - Modify cpufreq policy parameters.
+ * @policy: Policy object to modify.
+ * @new_policy: New policy data.
+ *
+ * Pass @new_policy to the cpufreq driver's ->verify() callback, run the
+ * installed policy notifiers for it with the CPUFREQ_ADJUST value, pass it to
+ * the driver's ->verify() callback again and run the notifiers for it again
+ * with the CPUFREQ_NOTIFY value.  Next, copy the min and max parameters
+ * of @new_policy to @policy and either invoke the driver's ->setpolicy()
+ * callback (if present) or carry out a governor update for @policy.  That is,
+ * run the current governor's ->limits() callback (if the governor field in
+ * @new_policy points to the same object as the one in @policy) or replace the
+ * governor for @policy with the new one stored in @new_policy.
+ *
+ * The cpuinfo part of @policy is not updated by this function.
  */
 static int cpufreq_set_policy(struct cpufreq_policy *policy,
-				struct cpufreq_policy *new_policy)
+			      struct cpufreq_policy *new_policy)
 {
 	struct cpufreq_governor *old_gov;
 	int ret;
@@ -2319,11 +2332,14 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
 }
 
 /**
- *	cpufreq_update_policy - re-evaluate an existing cpufreq policy
- *	@cpu: CPU which shall be re-evaluated
+ * cpufreq_update_policy - Re-evaluate an existing cpufreq policy.
+ * @cpu: CPU to re-evaluate the policy for.
  *
- *	Useful for policy notifiers which have different necessities
- *	at different times.
+ * Update the current frequency for the cpufreq policy of @cpu and use
+ * cpufreq_set_policy() to re-apply the min and max limits saved in the
+ * user_policy sub-structure of that policy, which triggers the evaluation
+ * of policy notifiers and the cpufreq driver's ->verify() callback for the
+ * policy in question, among other things.
  */
 void cpufreq_update_policy(unsigned int cpu)
 {

From 348a2ec5f5a5af15509252dfc6a30fa190ac0203 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 20 Feb 2019 00:24:25 +0100
Subject: [PATCH 75/80] cpufreq: Reorder and simplify cpufreq_update_policy()

In cpufreq_update_policy(), instead of updating new_policy.cur
separately, which is kind of confusing, because cpufreq_set_policy()
doesn't take that value into account directly anyway, make the copy
of the existing policy after calling cpufreq_update_current_freq().

No intentional changes of behavior.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index e4a7dbee84fc..fd5221007ade 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2354,23 +2354,18 @@ void cpufreq_update_policy(unsigned int cpu)
 	if (policy_is_inactive(policy))
 		goto unlock;
 
-	pr_debug("updating policy for CPU %u\n", cpu);
-	memcpy(&new_policy, policy, sizeof(*policy));
-	new_policy.min = policy->user_policy.min;
-	new_policy.max = policy->user_policy.max;
-
 	/*
 	 * BIOS might change freq behind our back
 	 * -> ask driver for current freq and notify governors about a change
 	 */
-	if (cpufreq_driver->get && !cpufreq_driver->setpolicy) {
-		if (cpufreq_suspended)
-			goto unlock;
+	if (cpufreq_driver->get && !cpufreq_driver->setpolicy &&
+	    (cpufreq_suspended || WARN_ON(!cpufreq_update_current_freq(policy))))
+		goto unlock;
 
-		new_policy.cur = cpufreq_update_current_freq(policy);
-		if (WARN_ON(!new_policy.cur))
-			goto unlock;
-	}
+	pr_debug("updating policy for CPU %u\n", cpu);
+	memcpy(&new_policy, policy, sizeof(*policy));
+	new_policy.min = policy->user_policy.min;
+	new_policy.max = policy->user_policy.max;
 
 	cpufreq_set_policy(policy, &new_policy);
 

From 2bb4059e075dcb8d5a2f8689bb661aa76c487ab0 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 20 Feb 2019 00:25:18 +0100
Subject: [PATCH 76/80] cpufreq: Fix two debug messages in cpufreq_set_policy()

Remove the redundant "cpufreq:" prefix from two debug messages in
cpufreq_set_policy().

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index fd5221007ade..06b1a954d6e4 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2290,7 +2290,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
 	}
 
 	if (new_policy->governor == policy->governor) {
-		pr_debug("cpufreq: governor limits update\n");
+		pr_debug("governor limits update\n");
 		cpufreq_governor_limits(policy);
 		return 0;
 	}
@@ -2311,7 +2311,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
 	if (!ret) {
 		ret = cpufreq_start_governor(policy);
 		if (!ret) {
-			pr_debug("cpufreq: governor change\n");
+			pr_debug("governor change\n");
 			sched_cpufreq_governor_change(policy, old_gov);
 			return 0;
 		}

From 167a38dcd5caaa85994ac8b7d2d1c20a71cd947b Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 20 Feb 2019 00:26:30 +0100
Subject: [PATCH 77/80] cpufreq: Pass updated policy to driver ->setpolicy()
 callback

The invocation of the ->setpolicy() cpufreq driver callback should
be equivalent to calling cpufreq_governor_limits(policy) for drivers
with internal governors, but in fact it isn't so, because the
temporary new_policy object is passed to it instead of the updated
policy.

That is a bit confusing, so make cpufreq_set_policy() pass the
updated policy to the driver ->setpolicy() callback.

No intentional changes of behavior.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 06b1a954d6e4..0e626b00053b 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2286,7 +2286,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
 	if (cpufreq_driver->setpolicy) {
 		policy->policy = new_policy->policy;
 		pr_debug("setting range\n");
-		return cpufreq_driver->setpolicy(new_policy);
+		return cpufreq_driver->setpolicy(policy);
 	}
 
 	if (new_policy->governor == policy->governor) {

From 1222d527f314c86a3b59a522115d62facc5a7965 Mon Sep 17 00:00:00 2001
From: Erwan Velu <erwanaliasr1@gmail.com>
Date: Wed, 20 Feb 2019 11:10:17 +0100
Subject: [PATCH 78/80] cpufreq: acpi-cpufreq: Report if CPU doesn't support
 boost technologies

There is some rare cases where CPB (and possibly IDA) are missing on
processors.

This is the case fixed by commit f7f3dc00f612 ("x86/cpu/AMD: Fix
erratum 1076 (CPB bit)") and following.

In such context, the boost status isn't reported by
/sys/devices/system/cpu/cpufreq/boost.

This commit is about printing a message to report that the CPU
doesn't expose the boost capabilities.

This message could help debugging platforms hit by this phenomena.

Signed-off-by: Erwan Velu <e.velu@criteo.com>
[ rjw: Change the message text somewhat ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/acpi-cpufreq.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index d62fd374d5c7..c72258a44ba4 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -916,8 +916,10 @@ static void __init acpi_cpufreq_boost_init(void)
 {
 	int ret;
 
-	if (!(boot_cpu_has(X86_FEATURE_CPB) || boot_cpu_has(X86_FEATURE_IDA)))
+	if (!(boot_cpu_has(X86_FEATURE_CPB) || boot_cpu_has(X86_FEATURE_IDA))) {
+		pr_debug("Boost capabilities not present in the processor\n");
 		return;
+	}
 
 	acpi_cpufreq_driver.set_boost = set_boost;
 	acpi_cpufreq_driver.boost_enabled = boost_state(0);

From d3c1e33f5c8670ca4512d17bf712ec946eab20ea Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@lip6.fr>
Date: Sat, 23 Feb 2019 14:20:41 +0100
Subject: [PATCH 79/80] cpufreq: ap806: add missing of_node_put after
 of_device_is_available

Add an of_node_put when a tested device node is not available.

The semantic patch that fixes this problem is as follows
(http://coccinelle.lip6.fr):

// <smpl>
@@
identifier f;
local idexpression e;
expression x;
@@

e = f(...);
... when != of_node_put(e)
    when != x = e
    when != e = x
    when any
if (<+...of_device_is_available(e)...+>) {
  ... when != of_node_put(e)
(
  return e;
|
+ of_node_put(e);
  return ...;
)
}
// </smpl>

Fixes: f525a670533d9 ("cpufreq: ap806: add cpufreq driver for Armada 8K")
Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/armada-8k-cpufreq.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/armada-8k-cpufreq.c b/drivers/cpufreq/armada-8k-cpufreq.c
index 8a5ddb93fc58..b3f4bd647e9b 100644
--- a/drivers/cpufreq/armada-8k-cpufreq.c
+++ b/drivers/cpufreq/armada-8k-cpufreq.c
@@ -128,8 +128,10 @@ static int __init armada_8k_cpufreq_init(void)
 	struct cpumask cpus;
 
 	node = of_find_compatible_node(NULL, NULL, "marvell,ap806-cpu-clock");
-	if (!node || !of_device_is_available(node))
+	if (!node || !of_device_is_available(node)) {
+		of_node_put(node);
 		return -ENODEV;
+	}
 
 	nb_cpus = num_possible_cpus();
 	freq_tables = kcalloc(nb_cpus, sizeof(*freq_tables), GFP_KERNEL);

From 0334906c06967142c8805fbe88acf787f65d3d26 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 20 Feb 2019 16:41:18 +0530
Subject: [PATCH 80/80] cpufreq: kryo: Release OPP tables on module removal

Commit 5ad7346b4ae2 ("cpufreq: kryo: Add module remove and exit") made
it possible to build the kryo cpufreq driver as a module, but it failed
to release all the resources, i.e. OPP tables, when the module is
unloaded.

This patch fixes it by releasing the OPP tables, by calling
dev_pm_opp_put_supported_hw() for them, from the
qcom_cpufreq_kryo_remove() routine. The array of pointers to the OPP
tables is also allocated dynamically now in qcom_cpufreq_kryo_probe(),
as the pointers will be required while releasing the resources.

Compile tested only.

Cc: 4.18+ <stable@vger.kernel.org> # v4.18+
Fixes: 5ad7346b4ae2 ("cpufreq: kryo: Add module remove and exit")
Reviewed-by: Georgi Djakov <georgi.djakov@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/qcom-cpufreq-kryo.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/qcom-cpufreq-kryo.c b/drivers/cpufreq/qcom-cpufreq-kryo.c
index 2a3675c24032..a472b814058f 100644
--- a/drivers/cpufreq/qcom-cpufreq-kryo.c
+++ b/drivers/cpufreq/qcom-cpufreq-kryo.c
@@ -75,7 +75,7 @@ static enum _msm8996_version qcom_cpufreq_kryo_get_msm_id(void)
 
 static int qcom_cpufreq_kryo_probe(struct platform_device *pdev)
 {
-	struct opp_table *opp_tables[NR_CPUS] = {0};
+	struct opp_table **opp_tables;
 	enum _msm8996_version msm8996_version;
 	struct nvmem_cell *speedbin_nvmem;
 	struct device_node *np;
@@ -133,6 +133,10 @@ static int qcom_cpufreq_kryo_probe(struct platform_device *pdev)
 	}
 	kfree(speedbin);
 
+	opp_tables = kcalloc(num_possible_cpus(), sizeof(*opp_tables), GFP_KERNEL);
+	if (!opp_tables)
+		return -ENOMEM;
+
 	for_each_possible_cpu(cpu) {
 		cpu_dev = get_cpu_device(cpu);
 		if (NULL == cpu_dev) {
@@ -151,8 +155,10 @@ static int qcom_cpufreq_kryo_probe(struct platform_device *pdev)
 
 	cpufreq_dt_pdev = platform_device_register_simple("cpufreq-dt", -1,
 							  NULL, 0);
-	if (!IS_ERR(cpufreq_dt_pdev))
+	if (!IS_ERR(cpufreq_dt_pdev)) {
+		platform_set_drvdata(pdev, opp_tables);
 		return 0;
+	}
 
 	ret = PTR_ERR(cpufreq_dt_pdev);
 	dev_err(cpu_dev, "Failed to register platform device\n");
@@ -163,13 +169,23 @@ static int qcom_cpufreq_kryo_probe(struct platform_device *pdev)
 			break;
 		dev_pm_opp_put_supported_hw(opp_tables[cpu]);
 	}
+	kfree(opp_tables);
 
 	return ret;
 }
 
 static int qcom_cpufreq_kryo_remove(struct platform_device *pdev)
 {
+	struct opp_table **opp_tables = platform_get_drvdata(pdev);
+	unsigned int cpu;
+
 	platform_device_unregister(cpufreq_dt_pdev);
+
+	for_each_possible_cpu(cpu)
+		dev_pm_opp_put_supported_hw(opp_tables[cpu]);
+
+	kfree(opp_tables);
+
 	return 0;
 }