From 425b753645767049f1a3eab02f39120c02156b72 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 13 Jan 2025 19:34:22 +0100
Subject: [PATCH 01/20] cpuidle: teo: Rearrange idle state lookup code

Rearrange code in the idle state lookup loop in teo_select() to make it
somewhat easier to follow and update comments around it.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Christian Loehle <christian.loehle@arm.com>
Tested-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
Tested-by: Christian Loehle <christian.loehle@arm.com>
Link: https://patch.msgid.link/4619938.LvFx2qVVIh@rjwysocki.net
---
 drivers/cpuidle/governors/teo.c | 34 +++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c
index 173ddcac540a..68af712f7064 100644
--- a/drivers/cpuidle/governors/teo.c
+++ b/drivers/cpuidle/governors/teo.c
@@ -367,7 +367,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	 * If the sum of the intercepts metric for all of the idle states
 	 * shallower than the current candidate one (idx) is greater than the
 	 * sum of the intercepts and hits metrics for the candidate state and
-	 * all of the deeper states a shallower idle state is likely to be a
+	 * all of the deeper states, a shallower idle state is likely to be a
 	 * better choice.
 	 */
 	prev_intercept_idx = idx;
@@ -396,30 +396,36 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 				 * first enabled state that is deep enough.
 				 */
 				if (teo_state_ok(i, drv) &&
-				    !dev->states_usage[i].disable)
+				    !dev->states_usage[i].disable) {
 					idx = i;
-				else
-					idx = first_suitable_idx;
-
+					break;
+				}
+				idx = first_suitable_idx;
 				break;
 			}
 
 			if (dev->states_usage[i].disable)
 				continue;
 
-			if (!teo_state_ok(i, drv)) {
+			if (teo_state_ok(i, drv)) {
 				/*
-				 * The current state is too shallow, but if an
-				 * alternative candidate state has been found,
-				 * it may still turn out to be a better choice.
+				 * The current state is deep enough, but still
+				 * there may be a better one.
 				 */
-				if (first_suitable_idx != idx)
-					continue;
-
-				break;
+				first_suitable_idx = i;
+				continue;
 			}
 
-			first_suitable_idx = i;
+			/*
+			 * The current state is too shallow, so if no suitable
+			 * states other than the initial candidate have been
+			 * found, give up (the remaining states to check are
+			 * shallower still), but otherwise the first suitable
+			 * state other than the initial candidate may turn out
+			 * to be preferable.
+			 */
+			if (first_suitable_idx == idx)
+				break;
 		}
 	}
 	if (!idx && prev_intercept_idx) {

From 92ce5c07b7a1913246fd5492aee52db8a0c66f55 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 13 Jan 2025 19:36:57 +0100
Subject: [PATCH 02/20] cpuidle: teo: Reorder candidate state index checks

Since constraint_idx may be 0, the candidate state index may change to 0
after assigning constraint_idx to it, so first check if it is greater
than constraint_idx (and update it if so) and then check it against 0.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Christian Loehle <christian.loehle@arm.com>
Tested-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
Tested-by: Christian Loehle <christian.loehle@arm.com>
Link: https://patch.msgid.link/1907276.tdWV9SEqCh@rjwysocki.net
---
 drivers/cpuidle/governors/teo.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c
index 68af712f7064..30e444c9c40b 100644
--- a/drivers/cpuidle/governors/teo.c
+++ b/drivers/cpuidle/governors/teo.c
@@ -428,6 +428,14 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 				break;
 		}
 	}
+
+	/*
+	 * If there is a latency constraint, it may be necessary to select an
+	 * idle state shallower than the current candidate one.
+	 */
+	if (idx > constraint_idx)
+		idx = constraint_idx;
+
 	if (!idx && prev_intercept_idx) {
 		/*
 		 * We have to query the sleep length here otherwise we don't
@@ -438,13 +446,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		goto out_tick;
 	}
 
-	/*
-	 * If there is a latency constraint, it may be necessary to select an
-	 * idle state shallower than the current candidate one.
-	 */
-	if (idx > constraint_idx)
-		idx = constraint_idx;
-
 	/*
 	 * Skip the timers check if state 0 is the current candidate one,
 	 * because an immediate non-timer wakeup is expected in that case.

From ea185406d1ed90493ef0868a03ddcb6b2701b11b Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 13 Jan 2025 19:39:00 +0100
Subject: [PATCH 03/20] cpuidle: teo: Combine candidate state index checks
 against 0

There are two candidate state index checks against 0 in teo_select()
that need not be separate, so combine them and update comments around
them.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Christian Loehle <christian.loehle@arm.com>
Tested-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
Tested-by: Christian Loehle <christian.loehle@arm.com>
Link: https://patch.msgid.link/13676346.uLZWGnKmhe@rjwysocki.net
---
 drivers/cpuidle/governors/teo.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c
index 30e444c9c40b..bd2fe41b4287 100644
--- a/drivers/cpuidle/governors/teo.c
+++ b/drivers/cpuidle/governors/teo.c
@@ -436,23 +436,18 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	if (idx > constraint_idx)
 		idx = constraint_idx;
 
-	if (!idx && prev_intercept_idx) {
-		/*
-		 * We have to query the sleep length here otherwise we don't
-		 * know after wakeup if our guess was correct.
-		 */
-		duration_ns = tick_nohz_get_sleep_length(&delta_tick);
-		cpu_data->sleep_length_ns = duration_ns;
+	if (!idx) {
+		if (prev_intercept_idx) {
+			/*
+			 * Query the sleep length to be able to count the wakeup
+			 * as a hit if it is caused by a timer.
+			 */
+			duration_ns = tick_nohz_get_sleep_length(&delta_tick);
+			cpu_data->sleep_length_ns = duration_ns;
+		}
 		goto out_tick;
 	}
 
-	/*
-	 * Skip the timers check if state 0 is the current candidate one,
-	 * because an immediate non-timer wakeup is expected in that case.
-	 */
-	if (!idx)
-		goto out_tick;
-
 	/*
 	 * If state 0 is a polling one, check if the target residency of
 	 * the current candidate state is low enough and skip the timers

From b9a6af26bd83fb23d15c53b1ce63df77dda15513 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 13 Jan 2025 19:40:49 +0100
Subject: [PATCH 04/20] cpuidle: teo: Drop local variable prev_intercept_idx

Local variable prev_intercept_idx in teo_select() is redundant because
it cannot be 0 when candidate state index is 0.

The prev_intercept_idx value is the index of the deepest enabled idle
state, so if it is 0, state 0 is the deepest enabled idle state, in
which case it must be the only enabled idle state, but then teo_select()
would have returned early before initializing prev_intercept_idx.

Thus prev_intercept_idx must be nonzero and the check of it against 0
always passes, so it can be dropped altogether.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Christian Loehle <christian.loehle@arm.com>
Tested-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
Tested-by: Christian Loehle <christian.loehle@arm.com>
Link: https://patch.msgid.link/3327997.aeNJFYEL58@rjwysocki.net
[ rjw: Fixed typo in the changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/governors/teo.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c
index bd2fe41b4287..95d76c8e0d12 100644
--- a/drivers/cpuidle/governors/teo.c
+++ b/drivers/cpuidle/governors/teo.c
@@ -292,7 +292,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	unsigned int hit_sum = 0;
 	int constraint_idx = 0;
 	int idx0 = 0, idx = -1;
-	int prev_intercept_idx;
 	s64 duration_ns;
 	int i;
 
@@ -370,7 +369,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	 * all of the deeper states, a shallower idle state is likely to be a
 	 * better choice.
 	 */
-	prev_intercept_idx = idx;
 	if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) {
 		int first_suitable_idx = idx;
 
@@ -437,14 +435,11 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		idx = constraint_idx;
 
 	if (!idx) {
-		if (prev_intercept_idx) {
-			/*
-			 * Query the sleep length to be able to count the wakeup
-			 * as a hit if it is caused by a timer.
-			 */
-			duration_ns = tick_nohz_get_sleep_length(&delta_tick);
-			cpu_data->sleep_length_ns = duration_ns;
-		}
+		/*
+		 * Query the sleep length to be able to count the wakeup as a
+		 * hit if it is caused by a timer.
+		 */
+		cpu_data->sleep_length_ns = tick_nohz_get_sleep_length(&delta_tick);
 		goto out_tick;
 	}
 

From e24f8a55de509ba26726f094e084d90428cbcf26 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 13 Jan 2025 19:41:55 +0100
Subject: [PATCH 05/20] cpuidle: teo: Clarify two code comments

Rewrite two code comments suposed to explain its behavior that are too
concise or not sufficiently clear.

No functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Christian Loehle <christian.loehle@arm.com>
Tested-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
Tested-by: Christian Loehle <christian.loehle@arm.com>
Link: https://patch.msgid.link/8472971.T7Z3S40VBb@rjwysocki.net
[ rjw: Fixed 2 typos in new comments ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/governors/teo.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c
index 95d76c8e0d12..411b315081f8 100644
--- a/drivers/cpuidle/governors/teo.c
+++ b/drivers/cpuidle/governors/teo.c
@@ -154,9 +154,10 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 
 	if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) {
 		/*
-		 * One of the safety nets has triggered or the wakeup was close
-		 * enough to the closest timer event expected at the idle state
-		 * selection time to be discarded.
+		 * This causes the wakeup to be counted as a hit regardless of
+		 * the real idle duration which doesn't need to be computed
+		 * because the wakeup has been close enough to an anticipated
+		 * timer.
 		 */
 		measured_ns = U64_MAX;
 	} else {
@@ -302,8 +303,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 
 	cpu_data->time_span_ns = local_clock();
 	/*
-	 * Set the expected sleep length to infinity in case of an early
-	 * return.
+	 * Set the sleep length to infinity in case the invocation of
+	 * tick_nohz_get_sleep_length() below is skipped, in which case it won't
+	 * be known whether or not the subsequent wakeup is caused by a timer.
+	 * It is generally fine to count the wakeup as an intercept then, except
+	 * for the cases when the CPU is mostly woken up by timers and there may
+	 * be opportunities to ask for a deeper idle state when no imminent
+	 * timers are scheduled which may be missed.
 	 */
 	cpu_data->sleep_length_ns = KTIME_MAX;
 

From d619b5cc678024fa5ed7eb3702c3991a2aa96823 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 13 Jan 2025 19:45:50 +0100
Subject: [PATCH 06/20] cpuidle: teo: Simplify counting events used for tick
 management

Replace the tick_hits metric with a new tick_intercepts one that can be
used directly when deciding whether or not to stop the scheduler tick
and update the governor functional description accordingly.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Christian Loehle <christian.loehle@arm.com>
Tested-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
Tested-by: Christian Loehle <christian.loehle@arm.com>
Link: https://patch.msgid.link/1987985.PYKUYFuaPT@rjwysocki.net
---
 drivers/cpuidle/governors/teo.c | 49 ++++++++++-----------------------
 1 file changed, 14 insertions(+), 35 deletions(-)

diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c
index 411b315081f8..62f323f2e245 100644
--- a/drivers/cpuidle/governors/teo.c
+++ b/drivers/cpuidle/governors/teo.c
@@ -41,11 +41,7 @@
  * idle state 2, the third bin spans from the target residency of idle state 2
  * up to, but not including, the target residency of idle state 3 and so on.
  * The last bin spans from the target residency of the deepest idle state
- * supplied by the driver to the scheduler tick period length or to infinity if
- * the tick period length is less than the target residency of that state.  In
- * the latter case, the governor also counts events with the measured idle
- * duration between the tick period length and the target residency of the
- * deepest idle state.
+ * supplied by the driver to infinity.
  *
  * Two metrics called "hits" and "intercepts" are associated with each bin.
  * They are updated every time before selecting an idle state for the given CPU
@@ -60,6 +56,10 @@
  * into by the sleep length (these events are also referred to as "intercepts"
  * below).
  *
+ * The governor also counts "intercepts" with the measured idle duration below
+ * the tick period length and uses this information when deciding whether or not
+ * to stop the scheduler tick.
+ *
  * In order to select an idle state for a CPU, the governor takes the following
  * steps (modulo the possible latency constraint that must be taken into account
  * too):
@@ -128,14 +128,14 @@ struct teo_bin {
  * @sleep_length_ns: Time till the closest timer event (at the selection time).
  * @state_bins: Idle state data bins for this CPU.
  * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
- * @tick_hits: Number of "hits" after TICK_NSEC.
+ * @tick_intercepts: "Intercepts" before TICK_NSEC.
  */
 struct teo_cpu {
 	s64 time_span_ns;
 	s64 sleep_length_ns;
 	struct teo_bin state_bins[CPUIDLE_STATE_MAX];
 	unsigned int total;
-	unsigned int tick_hits;
+	unsigned int tick_intercepts;
 };
 
 static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
@@ -207,38 +207,21 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 		}
 	}
 
-	/*
-	 * If the deepest state's target residency is below the tick length,
-	 * make a record of it to help teo_select() decide whether or not
-	 * to stop the tick.  This effectively adds an extra hits-only bin
-	 * beyond the last state-related one.
-	 */
-	if (target_residency_ns < TICK_NSEC) {
-		cpu_data->tick_hits -= cpu_data->tick_hits >> DECAY_SHIFT;
-
-		cpu_data->total += cpu_data->tick_hits;
-
-		if (TICK_NSEC <= cpu_data->sleep_length_ns) {
-			idx_timer = drv->state_count;
-			if (TICK_NSEC <= measured_ns) {
-				cpu_data->tick_hits += PULSE;
-				goto end;
-			}
-		}
-	}
-
+	cpu_data->tick_intercepts -= cpu_data->tick_intercepts >> DECAY_SHIFT;
 	/*
 	 * If the measured idle duration falls into the same bin as the sleep
 	 * length, this is a "hit", so update the "hits" metric for that bin.
 	 * Otherwise, update the "intercepts" metric for the bin fallen into by
 	 * the measured idle duration.
 	 */
-	if (idx_timer == idx_duration)
+	if (idx_timer == idx_duration) {
 		cpu_data->state_bins[idx_timer].hits += PULSE;
-	else
+	} else {
 		cpu_data->state_bins[idx_duration].intercepts += PULSE;
+		if (TICK_NSEC <= measured_ns)
+			cpu_data->tick_intercepts += PULSE;
+	}
 
-end:
 	cpu_data->total += PULSE;
 }
 
@@ -286,7 +269,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
 	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
 	ktime_t delta_tick = TICK_NSEC / 2;
-	unsigned int tick_intercept_sum = 0;
 	unsigned int idx_intercept_sum = 0;
 	unsigned int intercept_sum = 0;
 	unsigned int idx_hit_sum = 0;
@@ -365,9 +347,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		goto end;
 	}
 
-	tick_intercept_sum = intercept_sum +
-			cpu_data->state_bins[drv->state_count-1].intercepts;
-
 	/*
 	 * If the sum of the intercepts metric for all of the idle states
 	 * shallower than the current candidate one (idx) is greater than the
@@ -477,7 +456,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	 * total wakeup events, do not stop the tick.
 	 */
 	if (drv->states[idx].target_residency_ns < TICK_NSEC &&
-	    tick_intercept_sum > cpu_data->total / 2 + cpu_data->total / 8)
+	    cpu_data->tick_intercepts > cpu_data->total / 2 + cpu_data->total / 8)
 		duration_ns = TICK_NSEC / 2;
 
 end:

From 13ed5c4a6d9c91755227b7b0fab7b2543f6adfd2 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 13 Jan 2025 19:48:47 +0100
Subject: [PATCH 07/20] cpuidle: teo: Skip getting the sleep length if wakeups
 are very frequent

Commit 6da8f9ba5a87 ("cpuidle: teo: Skip tick_nohz_get_sleep_length()
call in some cases") attempted to reduce the governor overhead in some
cases by making it avoid obtaining the sleep length (the time till the
next timer event) which may be costly.

Among other things, after the above commit, tick_nohz_get_sleep_length()
was not called any more when idle state 0 was to be returned, which
turned out to be problematic and the previous behavior in that respect
was restored by commit 4b20b07ce72f ("cpuidle: teo: Don't count non-
existent intercepts").

However, commit 6da8f9ba5a87 also caused the governor to avoid calling
tick_nohz_get_sleep_length() on systems where idle state 0 is a "polling"
one (that is, it is not really an idle state, but a loop continuously
executed by the CPU) when the target residency of the idle state to be
returned was low enough, so there was no practical need to refine the
idle state selection in any way.  This change was not removed by the
other commit, so now on systems where idle state 0 is a "polling" one,
tick_nohz_get_sleep_length() is called when idle state 0 is to be
returned, but it is not called when a deeper idle state with
sufficiently low target residency is to be returned.  That is arguably
confusing and inconsistent.

Moreover, there is no specific reason why the behavior in question
should depend on whether or not idle state 0 is a "polling" one.

One way to address this would be to make the governor always call
tick_nohz_get_sleep_length() to obtain the sleep length, but that would
effectively mean reverting commit 6da8f9ba5a87 and restoring the latency
issue that was the reason for doing it.  This approach is thus not
particularly attractive.

To address it differently, notice that if a CPU is woken up very often,
this is not likely to be caused by timers in the first place (user space
has a default timer slack of 50 us and there are relatively few timers
with a deadline shorter than several microseconds in the kernel) and
even if it were the case, the potential benefit from using a deep idle
state would then be questionable for latency reasons.  Therefore, if the
majority of CPU wakeups occur within several microseconds, it can be
assumed that all wakeups in that range are non-timer and the sleep
length need not be determined.

Accordingly, introduce a new metric for counting wakeups with the
measured idle duration below RESIDENCY_THRESHOLD_NS and modify the idle
state selection to skip the tick_nohz_get_sleep_length() invocation if
idle state 0 has been selected or the target residency of the candidate
idle state is below RESIDENCY_THRESHOLD_NS and the value of the new
metric is at least 1/2 of the total event count.

Since the above requires the measured idle duration to be determined
every time, except for the cases when one of the safety nets has
triggered in which the wakeup is counted as a hit in the deepest
idle state idle residency range, update the handling of those cases
to avoid skipping the idle duration computation when the CPU wakeup
is "genuine".

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://patch.msgid.link/3851791.kQq0lBPeGt@rjwysocki.net
Tested-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
Tested-by: Christian Loehle <christian.loehle@arm.com>
Reviewed-by: Christian Loehle <christian.loehle@arm.com>
[ rjw: Renamed a struct field ]
[ rjw: Fixed typo in the subject and one in a comment ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/governors/teo.c | 58 ++++++++++++++++++++-------------
 1 file changed, 36 insertions(+), 22 deletions(-)

diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c
index 62f323f2e245..d772a3b7ccbd 100644
--- a/drivers/cpuidle/governors/teo.c
+++ b/drivers/cpuidle/governors/teo.c
@@ -129,6 +129,7 @@ struct teo_bin {
  * @state_bins: Idle state data bins for this CPU.
  * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
  * @tick_intercepts: "Intercepts" before TICK_NSEC.
+ * @short_idles: Wakeups after short idle periods.
  */
 struct teo_cpu {
 	s64 time_span_ns;
@@ -136,6 +137,7 @@ struct teo_cpu {
 	struct teo_bin state_bins[CPUIDLE_STATE_MAX];
 	unsigned int total;
 	unsigned int tick_intercepts;
+	unsigned int short_idles;
 };
 
 static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
@@ -152,12 +154,12 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 	s64 target_residency_ns;
 	u64 measured_ns;
 
-	if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) {
+	cpu_data->short_idles -= cpu_data->short_idles >> DECAY_SHIFT;
+
+	if (cpu_data->time_span_ns < 0) {
 		/*
-		 * This causes the wakeup to be counted as a hit regardless of
-		 * the real idle duration which doesn't need to be computed
-		 * because the wakeup has been close enough to an anticipated
-		 * timer.
+		 * If one of the safety nets has triggered, assume that this
+		 * might have been a long sleep.
 		 */
 		measured_ns = U64_MAX;
 	} else {
@@ -177,10 +179,14 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 		 * time, so take 1/2 of the exit latency as a very rough
 		 * approximation of the average of it.
 		 */
-		if (measured_ns >= lat_ns)
+		if (measured_ns >= lat_ns) {
 			measured_ns -= lat_ns / 2;
-		else
+			if (measured_ns < RESIDENCY_THRESHOLD_NS)
+				cpu_data->short_idles += PULSE;
+		} else {
 			measured_ns /= 2;
+			cpu_data->short_idles += PULSE;
+		}
 	}
 
 	cpu_data->total = 0;
@@ -419,27 +425,35 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	if (idx > constraint_idx)
 		idx = constraint_idx;
 
-	if (!idx) {
-		/*
-		 * Query the sleep length to be able to count the wakeup as a
-		 * hit if it is caused by a timer.
-		 */
-		cpu_data->sleep_length_ns = tick_nohz_get_sleep_length(&delta_tick);
-		goto out_tick;
-	}
-
 	/*
-	 * If state 0 is a polling one, check if the target residency of
-	 * the current candidate state is low enough and skip the timers
-	 * check in that case too.
+	 * If either the candidate state is state 0 or its target residency is
+	 * low enough, there is basically nothing more to do, but if the sleep
+	 * length is not updated, the subsequent wakeup will be counted as an
+	 * "intercept" which may be problematic in the cases when timer wakeups
+	 * are dominant.  Namely, it may effectively prevent deeper idle states
+	 * from being selected at one point even if no imminent timers are
+	 * scheduled.
+	 *
+	 * However, frequent timers in the RESIDENCY_THRESHOLD_NS range on one
+	 * CPU are unlikely (user space has a default 50 us slack value for
+	 * hrtimers and there are relatively few timers with a lower deadline
+	 * value in the kernel), and even if they did happen, the potential
+	 * benefit from using a deep idle state in that case would be
+	 * questionable anyway for latency reasons.  Thus if the measured idle
+	 * duration falls into that range in the majority of cases, assume
+	 * non-timer wakeups to be dominant and skip updating the sleep length
+	 * to reduce latency.
 	 */
-	if ((drv->states[0].flags & CPUIDLE_FLAG_POLLING) &&
-	    drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS)
+	if ((!idx || drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) &&
+	    2 * cpu_data->short_idles >= cpu_data->total)
 		goto out_tick;
 
 	duration_ns = tick_nohz_get_sleep_length(&delta_tick);
 	cpu_data->sleep_length_ns = duration_ns;
 
+	if (!idx)
+		goto out_tick;
+
 	/*
 	 * If the closest expected timer is before the target residency of the
 	 * candidate state, a shallower one needs to be found.
@@ -501,7 +515,7 @@ static void teo_reflect(struct cpuidle_device *dev, int state)
 	if (dev->poll_time_limit ||
 	    (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) {
 		dev->poll_time_limit = false;
-		cpu_data->time_span_ns = cpu_data->sleep_length_ns;
+		cpu_data->time_span_ns = KTIME_MIN;
 	} else {
 		cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns;
 	}

From ddcfa7964677b1298712edb931a98ac25ffd2fb6 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 13 Jan 2025 19:50:23 +0100
Subject: [PATCH 08/20] cpuidle: teo: Simplify handling of total events count

Instead of computing the total events count from scratch every time,
decay it and add a PULSE value to it in teo_update().

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Christian Loehle <christian.loehle@arm.com>
Tested-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
Tested-by: Christian Loehle <christian.loehle@arm.com>
Link: https://patch.msgid.link/9388883.CDJkKcVGEf@rjwysocki.net
---
 drivers/cpuidle/governors/teo.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c
index d772a3b7ccbd..600b54a9f1e1 100644
--- a/drivers/cpuidle/governors/teo.c
+++ b/drivers/cpuidle/governors/teo.c
@@ -189,8 +189,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 		}
 	}
 
-	cpu_data->total = 0;
-
 	/*
 	 * Decay the "hits" and "intercepts" metrics for all of the bins and
 	 * find the bins that the sleep length and the measured idle duration
@@ -202,8 +200,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 		bin->hits -= bin->hits >> DECAY_SHIFT;
 		bin->intercepts -= bin->intercepts >> DECAY_SHIFT;
 
-		cpu_data->total += bin->hits + bin->intercepts;
-
 		target_residency_ns = drv->states[i].target_residency_ns;
 
 		if (target_residency_ns <= cpu_data->sleep_length_ns) {
@@ -228,6 +224,7 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 			cpu_data->tick_intercepts += PULSE;
 	}
 
+	cpu_data->total -= cpu_data->total >> DECAY_SHIFT;
 	cpu_data->total += PULSE;
 }
 

From 65e18e6544751ac25dc284794566ee90d65a379e Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 13 Jan 2025 19:51:59 +0100
Subject: [PATCH 09/20] cpuidle: teo: Replace time_span_ns with a flag

After recent updates, the time_span_ns field in struct teo_cpu has
become an indicator on whether or not the most recent wakeup has been
"genuine" which may as well be indicated by a bool field without
calling local_clock(), so update the code accordingly.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Christian Loehle <christian.loehle@arm.com>
Tested-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
Tested-by: Christian Loehle <christian.loehle@arm.com>
Link: https://patch.msgid.link/6010475.MhkbZ0Pkbq@rjwysocki.net
---
 drivers/cpuidle/governors/teo.c | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c
index 600b54a9f1e1..c232c95ca7fa 100644
--- a/drivers/cpuidle/governors/teo.c
+++ b/drivers/cpuidle/governors/teo.c
@@ -124,20 +124,20 @@ struct teo_bin {
 
 /**
  * struct teo_cpu - CPU data used by the TEO cpuidle governor.
- * @time_span_ns: Time between idle state selection and post-wakeup update.
  * @sleep_length_ns: Time till the closest timer event (at the selection time).
  * @state_bins: Idle state data bins for this CPU.
  * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
  * @tick_intercepts: "Intercepts" before TICK_NSEC.
  * @short_idles: Wakeups after short idle periods.
+ * @artificial_wakeup: Set if the wakeup has been triggered by a safety net.
  */
 struct teo_cpu {
-	s64 time_span_ns;
 	s64 sleep_length_ns;
 	struct teo_bin state_bins[CPUIDLE_STATE_MAX];
 	unsigned int total;
 	unsigned int tick_intercepts;
 	unsigned int short_idles;
+	bool artificial_wakeup;
 };
 
 static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
@@ -156,7 +156,7 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 
 	cpu_data->short_idles -= cpu_data->short_idles >> DECAY_SHIFT;
 
-	if (cpu_data->time_span_ns < 0) {
+	if (cpu_data->artificial_wakeup) {
 		/*
 		 * If one of the safety nets has triggered, assume that this
 		 * might have been a long sleep.
@@ -165,13 +165,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 	} else {
 		u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;
 
-		/*
-		 * The computations below are to determine whether or not the
-		 * (saved) time till the next timer event and the measured idle
-		 * duration fall into the same "bin", so use last_residency_ns
-		 * for that instead of time_span_ns which includes the cpuidle
-		 * overhead.
-		 */
 		measured_ns = dev->last_residency_ns;
 		/*
 		 * The delay between the wakeup and the first instruction
@@ -286,7 +279,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		dev->last_state_idx = -1;
 	}
 
-	cpu_data->time_span_ns = local_clock();
 	/*
 	 * Set the sleep length to infinity in case the invocation of
 	 * tick_nohz_get_sleep_length() below is skipped, in which case it won't
@@ -504,17 +496,16 @@ static void teo_reflect(struct cpuidle_device *dev, int state)
 	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
 
 	dev->last_state_idx = state;
-	/*
-	 * If the wakeup was not "natural", but triggered by one of the safety
-	 * nets, assume that the CPU might have been idle for the entire sleep
-	 * length time.
-	 */
 	if (dev->poll_time_limit ||
 	    (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) {
+		/*
+		 * The wakeup was not "genuine", but triggered by one of the
+		 * safety nets.
+		 */
 		dev->poll_time_limit = false;
-		cpu_data->time_span_ns = KTIME_MIN;
+		cpu_data->artificial_wakeup = true;
 	} else {
-		cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns;
+		cpu_data->artificial_wakeup = false;
 	}
 }
 

From 16c8d7586c196cddcc8822a946ef03c9cfabae30 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 20 Jan 2025 17:08:50 +0100
Subject: [PATCH 10/20] cpuidle: teo: Skip sleep length computation for low
 latency constraints

If the idle state exit latency constraint is sufficiently low, it
is better to avoid the additional latency related to calling
tick_nohz_get_sleep_length().  It is also not necessary to compute
the sleep length in that case because shallow idle state selection
will be forced then regardless of the recent wakeup history.

Accordingly, skip the sleep length computation and subsequent
checks of the exit latency constraint is low enough.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Christian Loehle <christian.loehle@arm.com>
Link: https://patch.msgid.link/6122398.lOV4Wx5bFT@rjwysocki.net
---
 drivers/cpuidle/governors/teo.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c
index c232c95ca7fa..8fe5e1b47ef9 100644
--- a/drivers/cpuidle/governors/teo.c
+++ b/drivers/cpuidle/governors/teo.c
@@ -105,6 +105,12 @@
 
 #include "gov.h"
 
+/*
+ * Idle state exit latency threshold used for deciding whether or not to check
+ * the time till the closest expected timer event.
+ */
+#define LATENCY_THRESHOLD_NS	(RESIDENCY_THRESHOLD_NS / 2)
+
 /*
  * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
  * is used for decreasing metrics on a regular basis.
@@ -432,9 +438,14 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	 * duration falls into that range in the majority of cases, assume
 	 * non-timer wakeups to be dominant and skip updating the sleep length
 	 * to reduce latency.
+	 *
+	 * Also, if the latency constraint is sufficiently low, it will force
+	 * shallow idle states regardless of the wakeup type, so the sleep
+	 * length need not be known in that case.
 	 */
 	if ((!idx || drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) &&
-	    2 * cpu_data->short_idles >= cpu_data->total)
+	    (2 * cpu_data->short_idles >= cpu_data->total ||
+	     latency_req < LATENCY_THRESHOLD_NS))
 		goto out_tick;
 
 	duration_ns = tick_nohz_get_sleep_length(&delta_tick);

From 43855ac61483cb914f060851535ea753c094b3e0 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 22 Jan 2025 11:36:16 +0530
Subject: [PATCH 11/20] cpufreq: s3c64xx: Fix compilation warning

The driver generates following warning when regulator support isn't
enabled in the kernel. Fix it.

   drivers/cpufreq/s3c64xx-cpufreq.c: In function 's3c64xx_cpufreq_set_target':
>> drivers/cpufreq/s3c64xx-cpufreq.c:55:22: warning: variable 'old_freq' set but not used [-Wunused-but-set-variable]
      55 |         unsigned int old_freq, new_freq;
         |                      ^~~~~~~~
>> drivers/cpufreq/s3c64xx-cpufreq.c:54:30: warning: variable 'dvfs' set but not used [-Wunused-but-set-variable]
      54 |         struct s3c64xx_dvfs *dvfs;
         |                              ^~~~

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202501191803.CtfT7b2o-lkp@intel.com/
Cc: 5.4+ <stable@vger.kernel.org> # v5.4+
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://patch.msgid.link/236b227e929e5adc04d1e9e7af6845a46c8e9432.1737525916.git.viresh.kumar@linaro.org
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/s3c64xx-cpufreq.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/cpufreq/s3c64xx-cpufreq.c b/drivers/cpufreq/s3c64xx-cpufreq.c
index c6bdfc308e99..9cef71528076 100644
--- a/drivers/cpufreq/s3c64xx-cpufreq.c
+++ b/drivers/cpufreq/s3c64xx-cpufreq.c
@@ -24,6 +24,7 @@ struct s3c64xx_dvfs {
 	unsigned int vddarm_max;
 };
 
+#ifdef CONFIG_REGULATOR
 static struct s3c64xx_dvfs s3c64xx_dvfs_table[] = {
 	[0] = { 1000000, 1150000 },
 	[1] = { 1050000, 1150000 },
@@ -31,6 +32,7 @@ static struct s3c64xx_dvfs s3c64xx_dvfs_table[] = {
 	[3] = { 1200000, 1350000 },
 	[4] = { 1300000, 1350000 },
 };
+#endif
 
 static struct cpufreq_frequency_table s3c64xx_freq_table[] = {
 	{ 0, 0,  66000 },
@@ -51,15 +53,16 @@ static struct cpufreq_frequency_table s3c64xx_freq_table[] = {
 static int s3c64xx_cpufreq_set_target(struct cpufreq_policy *policy,
 				      unsigned int index)
 {
-	struct s3c64xx_dvfs *dvfs;
-	unsigned int old_freq, new_freq;
+	unsigned int new_freq = s3c64xx_freq_table[index].frequency;
 	int ret;
 
+#ifdef CONFIG_REGULATOR
+	struct s3c64xx_dvfs *dvfs;
+	unsigned int old_freq;
+
 	old_freq = clk_get_rate(policy->clk) / 1000;
-	new_freq = s3c64xx_freq_table[index].frequency;
 	dvfs = &s3c64xx_dvfs_table[s3c64xx_freq_table[index].driver_data];
 
-#ifdef CONFIG_REGULATOR
 	if (vddarm && new_freq > old_freq) {
 		ret = regulator_set_voltage(vddarm,
 					    dvfs->vddarm_min,

From 1608f0230510489d74a2e24e47054233b7e4678a Mon Sep 17 00:00:00 2001
From: Lifeng Zheng <zhenglifeng1@huawei.com>
Date: Fri, 17 Jan 2025 18:14:54 +0800
Subject: [PATCH 12/20] cpufreq: Fix re-boost issue after hotplugging a CPU

It turns out that CPUX will stay on the base frequency after performing
these operations:

 1. boost all CPUs: echo 1 > /sys/devices/system/cpu/cpufreq/boost

 2. offline one CPU: echo 0 > /sys/devices/system/cpu/cpuX/online

 3. deboost all CPUs: echo 0 > /sys/devices/system/cpu/cpufreq/boost

 4. online CPUX: echo 1 > /sys/devices/system/cpu/cpuX/online

 5. boost all CPUs again: echo 1 > /sys/devices/system/cpu/cpufreq/boost

This is because max_freq_req of the policy is not updated during the
online process, and the value of max_freq_req before the last offline is
retained.

When the CPU is boosted again, freq_qos_update_request() will do nothing
because the old value is the same as the new one. This causes the CPU to
stay at the base frequency. Updating max_freq_req  in cpufreq_online()
will solve this problem.

Signed-off-by: Lifeng Zheng <zhenglifeng1@huawei.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://patch.msgid.link/20250117101457.1530653-2-zhenglifeng1@huawei.com
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 1076e37a18ad..4d54e30f94ee 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1476,6 +1476,10 @@ static int cpufreq_online(unsigned int cpu)
 
 		blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
 				CPUFREQ_CREATE_POLICY, policy);
+	} else {
+		ret = freq_qos_update_request(policy->max_freq_req, policy->max);
+		if (ret < 0)
+			goto out_destroy_policy;
 	}
 
 	if (cpufreq_driver->get && has_target()) {

From dd016f379ebc2d43a9405742d1a6066577509bd7 Mon Sep 17 00:00:00 2001
From: Lifeng Zheng <zhenglifeng1@huawei.com>
Date: Fri, 17 Jan 2025 18:14:55 +0800
Subject: [PATCH 13/20] cpufreq: Introduce a more generic way to set default
 per-policy boost flag

In cpufreq_online() of cpufreq.c, the per-policy boost flag is already
set to mirror the cpufreq_driver boost during init but using freq_table
to judge if the policy has boost frequency. There are two drawbacks to
this approach:

 1. It doesn't work for the cpufreq drivers that do not use a frequency
    table. For now, acpi-cpufreq and amd-pstate have to enable boost in
    policy initialization. And cppc_cpufreq never set policy to boost
    when going online no matter what the cpufreq_driver boost flag is.

 2. If the CPU goes offline when cpufreq_driver boost is enabled and
    then goes online when cpufreq_driver boost is disabled, the
    per-policy boost flag will incorrectly remain true.

Running set_boost at the end of the online process is a more generic way
for all cpufreq drivers.

Signed-off-by: Lifeng Zheng <zhenglifeng1@huawei.com>
Link: https://patch.msgid.link/20250117101457.1530653-3-zhenglifeng1@huawei.com
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
[ rjw: Changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 4d54e30f94ee..e0048856ecee 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1410,10 +1410,6 @@ static int cpufreq_online(unsigned int cpu)
 			goto out_free_policy;
 		}
 
-		/* Let the per-policy boost flag mirror the cpufreq_driver boost during init */
-		if (cpufreq_boost_enabled() && policy_has_boost_freq(policy))
-			policy->boost_enabled = true;
-
 		/*
 		 * The initialization has succeeded and the policy is online.
 		 * If there is a problem with its frequency table, take it
@@ -1574,6 +1570,18 @@ static int cpufreq_online(unsigned int cpu)
 	if (new_policy && cpufreq_thermal_control_enabled(cpufreq_driver))
 		policy->cdev = of_cpufreq_cooling_register(policy);
 
+	/* Let the per-policy boost flag mirror the cpufreq_driver boost during init */
+	if (policy->boost_enabled != cpufreq_boost_enabled()) {
+		policy->boost_enabled = cpufreq_boost_enabled();
+		ret = cpufreq_driver->set_boost(policy, policy->boost_enabled);
+		if (ret) {
+			/* If the set_boost fails, the online operation is not affected */
+			pr_info("%s: CPU%d: Cannot %s BOOST\n", __func__, policy->cpu,
+				policy->boost_enabled ? "enable" : "disable");
+			policy->boost_enabled = !policy->boost_enabled;
+		}
+	}
+
 	pr_debug("initialization complete\n");
 
 	return 0;

From 03d8b4e76266e11662c5e544854b737843173e2d Mon Sep 17 00:00:00 2001
From: Lifeng Zheng <zhenglifeng1@huawei.com>
Date: Fri, 17 Jan 2025 18:14:56 +0800
Subject: [PATCH 14/20] cpufreq: CPPC: Fix wrong max_freq in policy
 initialization

In policy initialization, policy->max and policy->cpuinfo.max_freq are
always set to the value calculated from caps->nominal_perf.

This will cause the frequency stay on base frequency even if the policy
is already boosted when a CPU is going online.

Fix this by using policy->boost_enabled to determine which value should
be set.

Signed-off-by: Lifeng Zheng <zhenglifeng1@huawei.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://patch.msgid.link/20250117101457.1530653-4-zhenglifeng1@huawei.com
[ rjw: Changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cppc_cpufreq.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 2486a6c5256a..8f512448382f 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -611,7 +611,8 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	 * Section 8.4.7.1.1.5 of ACPI 6.1 spec)
 	 */
 	policy->min = cppc_perf_to_khz(caps, caps->lowest_nonlinear_perf);
-	policy->max = cppc_perf_to_khz(caps, caps->nominal_perf);
+	policy->max = cppc_perf_to_khz(caps, policy->boost_enabled ?
+						caps->highest_perf : caps->nominal_perf);
 
 	/*
 	 * Set cpuinfo.min_freq to Lowest to make the full range of performance
@@ -619,7 +620,7 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	 * nonlinear perf
 	 */
 	policy->cpuinfo.min_freq = cppc_perf_to_khz(caps, caps->lowest_perf);
-	policy->cpuinfo.max_freq = cppc_perf_to_khz(caps, caps->nominal_perf);
+	policy->cpuinfo.max_freq = policy->max;
 
 	policy->transition_delay_us = cppc_cpufreq_get_transition_delay_us(cpu);
 	policy->shared_type = cpu_data->shared_type;

From 2b16c631832df6cf8782fb1fdc7df8a4f03f4f16 Mon Sep 17 00:00:00 2001
From: Lifeng Zheng <zhenglifeng1@huawei.com>
Date: Fri, 17 Jan 2025 18:14:57 +0800
Subject: [PATCH 15/20] cpufreq: ACPI: Remove set_boost in
 acpi_cpufreq_cpu_init()

At the end of cpufreq_online() in cpufreq.c, set_boost is executed and
the per-policy boost flag is set to mirror the cpufreq_driver boost, so
it is not necessary to run set_boost in acpi_cpufreq_cpu_init().

Signed-off-by: Lifeng Zheng <zhenglifeng1@huawei.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://patch.msgid.link/20250117101457.1530653-5-zhenglifeng1@huawei.com
[ rjw: Changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/acpi-cpufreq.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index 302df42d6887..463b69a2dff5 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -909,11 +909,6 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	if (perf->states[0].core_frequency * 1000 != freq_table[0].frequency)
 		pr_warn(FW_WARN "P-state 0 is not max freq\n");
 
-	if (acpi_cpufreq_driver.set_boost) {
-		set_boost(policy, acpi_cpufreq_driver.boost_enabled);
-		policy->boost_enabled = acpi_cpufreq_driver.boost_enabled;
-	}
-
 	return result;
 
 err_unreg:

From 93940fbdc46843cea58708bbe8dd225bd0f32e67 Mon Sep 17 00:00:00 2001
From: Christian Loehle <christian.loehle@arm.com>
Date: Mon, 20 Jan 2025 10:09:46 +0000
Subject: [PATCH 16/20] cpufreq/schedutil: Only bind threads if needed

Remove the unconditional binding of sugov kthreads to the affected CPUs
if the cpufreq driver indicates that updates can happen from any CPU.
This allows userspace to set affinities to either save power (waking up
bigger CPUs on HMP can be expensive) or increasing performance (by
letting the utilized CPUs run without preemption of the sugov kthread).

Signed-off-by: Christian Loehle <christian.loehle@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://patch.msgid.link/5a8deed4-7764-4729-a9d4-9520c25fa7e8@arm.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 6 +++++-
 kernel/sched/syscalls.c          | 7 +++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index a2a29e3fffca..1a19d69b91ed 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -666,7 +666,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
 	}
 
 	sg_policy->thread = thread;
-	kthread_bind_mask(thread, policy->related_cpus);
+	if (policy->dvfs_possible_from_any_cpu)
+		set_cpus_allowed_ptr(thread, policy->related_cpus);
+	else
+		kthread_bind_mask(thread, policy->related_cpus);
+
 	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
 	mutex_init(&sg_policy->work_lock);
 
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 149e2c8036d3..456d339be98f 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -1129,6 +1129,13 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
 	if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
 		return 0;
 
+	/*
+	 * The special/sugov task isn't part of regular bandwidth/admission
+	 * control so let userspace change affinities.
+	 */
+	if (dl_entity_is_special(&p->dl))
+		return 0;
+
 	/*
 	 * Since bandwidth control happens on root_domain basis,
 	 * if admission test is enabled, we only admit -deadline

From e20a70c572539a486dbd91b225fa6a194a5e2122 Mon Sep 17 00:00:00 2001
From: Wentao Liang <vulab@iscas.ac.cn>
Date: Sun, 19 Jan 2025 22:32:05 +0800
Subject: [PATCH 17/20] PM: hibernate: Add error handling for syscore_suspend()

In hibernation_platform_enter(), the code did not check the
return value of syscore_suspend(), potentially leading to a
situation where syscore_resume() would be called even if
syscore_suspend() failed. This could cause unpredictable
behavior or system instability.

Modify the code sequence in question to properly handle errors returned
by syscore_suspend(). If an error occurs in the suspend path, the code
now jumps to label 'Enable_irqs' skipping the syscore_resume() call and
only enabling interrupts after setting the system state to SYSTEM_RUNNING.

Fixes: 40dc166cb5dd ("PM / Core: Introduce struct syscore_ops for core subsystems PM")
Signed-off-by: Wentao Liang <vulab@iscas.ac.cn>
Link: https://patch.msgid.link/20250119143205.2103-1-vulab@iscas.ac.cn
[ rjw: Changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/hibernate.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1f87aa01ba44..10a01af63a80 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -608,7 +608,11 @@ int hibernation_platform_enter(void)
 
 	local_irq_disable();
 	system_state = SYSTEM_SUSPEND;
-	syscore_suspend();
+
+	error = syscore_suspend();
+	if (error)
+		goto Enable_irqs;
+
 	if (pm_wakeup_pending()) {
 		error = -EAGAIN;
 		goto Power_up;
@@ -620,6 +624,7 @@ int hibernation_platform_enter(void)
 
  Power_up:
 	syscore_resume();
+ Enable_irqs:
 	system_state = SYSTEM_RUNNING;
 	local_irq_enable();
 

From 4891cd3eba62ac611a7929948cf5588a1abed909 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 16 Jan 2025 17:43:29 +0200
Subject: [PATCH 18/20] PM: Revert "Add EXPORT macros for exporting PM
 functions"

Revert commit 41a337b40e98 ("Add EXPORT macros for exporting PM
functions") because the macros added by it are still unused almost
two years after they had been introduced.

Reported-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://patch.msgid.link/20250116154354.149297-1-andriy.shevchenko@linux.intel.com
[ rjw: New changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/include/linux/pm.h b/include/linux/pm.h
index e7f0260f15ad..0627a795892b 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -384,12 +384,8 @@ const struct dev_pm_ops name = { \
 
 #ifdef CONFIG_PM
 #define _EXPORT_DEV_PM_OPS(name, license, ns)		_EXPORT_PM_OPS(name, license, ns)
-#define EXPORT_PM_FN_GPL(name)				EXPORT_SYMBOL_GPL(name)
-#define EXPORT_PM_FN_NS_GPL(name, ns)			EXPORT_SYMBOL_NS_GPL(name, "ns")
 #else
 #define _EXPORT_DEV_PM_OPS(name, license, ns)		_DISCARD_PM_OPS(name, license, ns)
-#define EXPORT_PM_FN_GPL(name)
-#define EXPORT_PM_FN_NS_GPL(name, ns)
 #endif
 
 #ifdef CONFIG_PM_SLEEP

From b865a8404642279e53644fc7288d172afd6a170e Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 28 Jan 2025 13:21:23 +0530
Subject: [PATCH 19/20] cpufreq: airoha: Depends on OF

The Airoha cpufreq depends on OF and must be marked as such. With the
kernel compiled without OF support, we get following warning:

drivers/cpufreq/airoha-cpufreq.c:109:34: warning: 'airoha_cpufreq_match_list' defined but not used [-Wunused-const-variable=]
    109 | static const struct of_device_id airoha_cpufreq_match_list[] __initconst = {
        |                                  ^~~~~~~~~~~~~~~~~~~~~~~~~

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202501251941.0fXlcd1D-lkp@intel.com/
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://patch.msgid.link/455e18c947bd9529701a2f1c796f0f934d1354d7.1738050679.git.viresh.kumar@linaro.org
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/Kconfig.arm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index 704e84d00639..0ee5c691fb36 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -17,7 +17,7 @@ config ARM_ALLWINNER_SUN50I_CPUFREQ_NVMEM
 
 config ARM_AIROHA_SOC_CPUFREQ
 	tristate "Airoha EN7581 SoC CPUFreq support"
-	depends on ARCH_AIROHA || COMPILE_TEST
+	depends on (ARCH_AIROHA && OF) || COMPILE_TEST
 	select PM_OPP
 	default ARCH_AIROHA
 	help

From 3775fc538f535a7c5adaf11990c7932a0bd1f9eb Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 28 Jan 2025 20:24:41 +0100
Subject: [PATCH 20/20] PM: sleep: core: Synchronize runtime PM status of
 parents and children

Commit 6e176bf8d461 ("PM: sleep: core: Do not skip callbacks in the
resume phase") overlooked the case in which the parent of a device with
DPM_FLAG_SMART_SUSPEND set did not use that flag and could be runtime-
suspended before a transition into a system-wide sleep state.  In that
case, if the child is resumed during the subsequent transition from
that state into the working state, its runtime PM status will be set to
RPM_ACTIVE, but the runtime PM status of the parent will not be updated
accordingly, even though the parent will be resumed too, because of the
dev_pm_skip_suspend() check in device_resume_noirq().

Address this problem by tracking the need to set the runtime PM status
to RPM_ACTIVE during system-wide resume transitions for devices with
DPM_FLAG_SMART_SUSPEND set and all of the devices depended on by them.

Fixes: 6e176bf8d461 ("PM: sleep: core: Do not skip callbacks in the resume phase")
Closes: https://lore.kernel.org/linux-pm/Z30p2Etwf3F2AUvD@hovoldconsulting.com/
Reported-by: Johan Hovold <johan@kernel.org>
Tested-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Johan Hovold <johan+linaro@kernel.org>
Tested-by: Johan Hovold <johan+linaro@kernel.org>
Link: https://patch.msgid.link/12619233.O9o76ZdvQC@rjwysocki.net
---
 drivers/base/power/main.c | 29 ++++++++++++++++++++---------
 include/linux/pm.h        |  1 +
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index cbc9a7a75def..d497d448e4b2 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -656,13 +656,15 @@ static void device_resume_noirq(struct device *dev, pm_message_t state, bool asy
 	 * so change its status accordingly.
 	 *
 	 * Otherwise, the device is going to be resumed, so set its PM-runtime
-	 * status to "active", but do that only if DPM_FLAG_SMART_SUSPEND is set
-	 * to avoid confusing drivers that don't use it.
+	 * status to "active" unless its power.set_active flag is clear, in
+	 * which case it is not necessary to update its PM-runtime status.
 	 */
-	if (skip_resume)
+	if (skip_resume) {
 		pm_runtime_set_suspended(dev);
-	else if (dev_pm_skip_suspend(dev))
+	} else if (dev->power.set_active) {
 		pm_runtime_set_active(dev);
+		dev->power.set_active = false;
+	}
 
 	if (dev->pm_domain) {
 		info = "noirq power domain ";
@@ -1189,18 +1191,24 @@ static pm_message_t resume_event(pm_message_t sleep_state)
 	return PMSG_ON;
 }
 
-static void dpm_superior_set_must_resume(struct device *dev)
+static void dpm_superior_set_must_resume(struct device *dev, bool set_active)
 {
 	struct device_link *link;
 	int idx;
 
-	if (dev->parent)
+	if (dev->parent) {
 		dev->parent->power.must_resume = true;
+		if (set_active)
+			dev->parent->power.set_active = true;
+	}
 
 	idx = device_links_read_lock();
 
-	list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node)
+	list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) {
 		link->supplier->power.must_resume = true;
+		if (set_active)
+			link->supplier->power.set_active = true;
+	}
 
 	device_links_read_unlock(idx);
 }
@@ -1278,8 +1286,11 @@ Skip:
 	      dev->power.may_skip_resume))
 		dev->power.must_resume = true;
 
-	if (dev->power.must_resume)
-		dpm_superior_set_must_resume(dev);
+	if (dev->power.must_resume) {
+		dev->power.set_active = dev->power.set_active ||
+			dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND);
+		dpm_superior_set_must_resume(dev, dev->power.set_active);
+	}
 
 Complete:
 	complete_all(&dev->power.completion);
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 0627a795892b..0d2597a76dfc 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -679,6 +679,7 @@ struct dev_pm_info {
 	bool			no_pm_callbacks:1;	/* Owned by the PM core */
 	bool			async_in_progress:1;	/* Owned by the PM core */
 	bool			must_resume:1;		/* Owned by the PM core */
+	bool			set_active:1;		/* Owned by the PM core */
 	bool			may_skip_resume:1;	/* Set by subsystems */
 #else
 	bool			should_wakeup:1;