/root/src/xen/xen/arch/x86/acpi/cpuidle_menu.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * cpuidle_menu - menu governor for cpu idle, main idea come from Linux. |
3 | | * drivers/cpuidle/governors/menu.c |
4 | | * |
5 | | * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com> |
6 | | * Copyright (C) 2007, 2008 Intel Corporation |
7 | | * |
8 | | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
9 | | * |
10 | | * This program is free software; you can redistribute it and/or modify |
11 | | * it under the terms of the GNU General Public License as published by |
12 | | * the Free Software Foundation; either version 2 of the License, or (at |
13 | | * your option) any later version. |
14 | | * |
15 | | * This program is distributed in the hope that it will be useful, but |
16 | | * WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
18 | | * General Public License for more details. |
19 | | * |
20 | | * You should have received a copy of the GNU General Public License along |
21 | | * with this program; If not, see <http://www.gnu.org/licenses/>. |
22 | | * |
23 | | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
24 | | */ |
25 | | #include <xen/errno.h> |
26 | | #include <xen/lib.h> |
27 | | #include <xen/types.h> |
28 | | #include <xen/acpi.h> |
29 | | #include <xen/timer.h> |
30 | | #include <xen/cpuidle.h> |
31 | | #include <asm/irq.h> |
32 | | |
33 | | #define BUCKETS 6 |
34 | 1.84M | #define RESOLUTION 1024 |
35 | 11.3M | #define DECAY 4 |
36 | 1.85M | #define MAX_INTERESTING 50000 |
37 | 172k | #define LATENCY_MULTIPLIER 10 |
38 | | |
39 | | /* |
40 | | * Concepts and ideas behind the menu governor |
41 | | * |
42 | | * For the menu governor, there are 3 decision factors for picking a C |
43 | | * state: |
44 | | * 1) Energy break even point |
45 | | * 2) Performance impact |
46 | | * 3) Latency tolerance (TBD: from guest virtual C state) |
 * These three factors are treated independently.
48 | | * |
49 | | * Energy break even point |
50 | | * ----------------------- |
51 | | * C state entry and exit have an energy cost, and a certain amount of time in |
52 | | * the C state is required to actually break even on this cost. CPUIDLE |
53 | | * provides us this duration in the "target_residency" field. So all that we |
54 | | * need is a good prediction of how long we'll be idle. Like the traditional |
55 | | * menu governor, we start with the actual known "next timer event" time. |
56 | | * |
57 | | * Since there are other source of wakeups (interrupts for example) than |
58 | | * the next timer event, this estimation is rather optimistic. To get a |
59 | | * more realistic estimate, a correction factor is applied to the estimate, |
60 | | * that is based on historic behavior. For example, if in the past the actual |
61 | | * duration always was 50% of the next timer tick, the correction factor will |
62 | | * be 0.5. |
63 | | * |
64 | | * menu uses a running average for this correction factor, however it uses a |
65 | | * set of factors, not just a single factor. This stems from the realization |
66 | | * that the ratio is dependent on the order of magnitude of the expected |
67 | | * duration; if we expect 500 milliseconds of idle time the likelihood of |
68 | | * getting an interrupt very early is much higher than if we expect 50 micro |
69 | | * seconds of idle time. |
70 | | * For this reason we keep an array of 6 independent factors, that gets |
71 | | * indexed based on the magnitude of the expected duration |
72 | | * |
73 | | * Limiting Performance Impact |
74 | | * --------------------------- |
75 | | * C states, especially those with large exit latencies, can have a real |
 * noticeable impact on workloads, which is not acceptable for most sysadmins,
77 | | * and in addition, less performance has a power price of its own. |
78 | | * |
79 | | * As a general rule of thumb, menu assumes that the following heuristic |
80 | | * holds: |
81 | | * The busier the system, the less impact of C states is acceptable |
82 | | * |
83 | | * This rule-of-thumb is implemented using average interrupt interval: |
84 | | * If the exit latency times multiplier is longer than the average |
85 | | * interrupt interval, the C state is not considered a candidate |
86 | | * for selection due to a too high performance impact. So the smaller |
87 | | * the average interrupt interval is, the smaller C state latency should be |
88 | | * and thus the less likely a busy CPU will hit such a deep C state. |
89 | | * |
90 | | * As an additional rule to reduce the performance impact, menu tries to |
91 | | * limit the exit latency duration to be no more than 10% of the decaying |
92 | | * measured idle time. |
93 | | */ |
94 | | |
95 | | struct perf_factor{ |
96 | | s_time_t time_stamp; |
97 | | s_time_t duration; |
98 | | unsigned int irq_count_stamp; |
99 | | unsigned int irq_sum; |
100 | | }; |
101 | | |
102 | | struct menu_device |
103 | | { |
104 | | int last_state_idx; |
105 | | unsigned int expected_us; |
106 | | u64 predicted_us; |
107 | | u64 latency_factor; |
108 | | unsigned int measured_us; |
109 | | unsigned int exit_us; |
110 | | unsigned int bucket; |
111 | | u64 correction_factor[BUCKETS]; |
112 | | struct perf_factor pf; |
113 | | }; |
114 | | |
115 | | static DEFINE_PER_CPU(struct menu_device, menu_devices); |
116 | | |
/*
 * Map an expected idle duration (in microseconds) onto one of the
 * BUCKETS decade ranges used to index correction_factor[]: <10us,
 * <100us, <1ms, <10ms, <100ms, and everything above.
 */
static inline int which_bucket(unsigned int duration)
{
    static const unsigned int threshold[] = { 10, 100, 1000, 10000, 100000 };
    unsigned int b;

    for (b = 0; b < sizeof(threshold) / sizeof(threshold[0]); b++)
        if (duration < threshold[b])
            return b;

    return 5;
}
133 | | |
134 | | /* |
135 | | * Return the average interrupt interval to take I/O performance |
136 | | * requirements into account. The smaller the average interrupt |
137 | | * interval to be, the more busy I/O activity, and thus the higher |
138 | | * the barrier to go to an expensive C state. |
139 | | */ |
140 | | |
/* 5 millisecond sampling period */
142 | 1.91M | #define SAMPLING_PERIOD 5000000 |
143 | | |
/* for I/O interrupts, we give 8x multiplier compared to C state latency */
145 | 178k | #define IO_MULTIPLIER 8 |
146 | | |
/*
 * Return the decayed average interrupt interval on this CPU, in
 * microseconds. State lives in this CPU's menu_devices.pf and is only
 * rolled forward once at least SAMPLING_PERIOD of decayed time has
 * accumulated, so short bursts of calls reuse the same sample window.
 */
static inline s_time_t avg_intr_interval_us(void)
{
    struct menu_device *data = &__get_cpu_var(menu_devices);
    s_time_t duration, now;
    s_time_t avg_interval;
    unsigned int irq_sum;

    now = NOW();
    /*
     * Decaying sums: the interval since the last sample point is weighted
     * by (DECAY-1)/DECAY against the previously stored totals, so old
     * activity fades out geometrically.
     */
    duration = (data->pf.duration + (now - data->pf.time_stamp)
            * (DECAY - 1)) / DECAY;

    irq_sum = (data->pf.irq_sum + (this_cpu(irq_count) - data->pf.irq_count_stamp)
            * (DECAY - 1)) / DECAY;

    if (irq_sum == 0)
        /* no irq recently, so return a big enough interval: 1 sec */
        avg_interval = 1000000;
    else
        avg_interval = duration / irq_sum / 1000; /* in us */

    /* Window full: commit the decayed totals and restart the sample. */
    if ( duration >= SAMPLING_PERIOD){
        data->pf.time_stamp = now;
        data->pf.duration = duration;
        data->pf.irq_count_stamp= this_cpu(irq_count);
        data->pf.irq_sum = irq_sum;
    }

    return avg_interval;
}
176 | | |
177 | | static unsigned int get_sleep_length_us(void) |
178 | 1.98M | { |
179 | 1.98M | s_time_t us = (this_cpu(timer_deadline) - NOW()) / 1000; |
180 | 1.98M | /* |
181 | 1.98M | * while us < 0 or us > (u32)-1, return a large u32, |
182 | 1.98M | * choose (unsigned int)-2000 to avoid wrapping while added with exit |
183 | 1.98M | * latency because the latency should not larger than 2ms |
184 | 1.98M | */ |
185 | 1.98M | return (us >> 32) ? (unsigned int)-2000 : (unsigned int)us; |
186 | 1.98M | } |
187 | | |
188 | | static int menu_select(struct acpi_processor_power *power) |
189 | 2.02M | { |
190 | 2.02M | struct menu_device *data = &__get_cpu_var(menu_devices); |
191 | 2.02M | int i; |
192 | 2.02M | s_time_t io_interval; |
193 | 2.02M | |
194 | 2.02M | /* TBD: Change to 0 if C0(polling mode) support is added later*/ |
195 | 2.02M | data->last_state_idx = CPUIDLE_DRIVER_STATE_START; |
196 | 2.02M | data->exit_us = 0; |
197 | 2.02M | |
198 | 2.02M | /* determine the expected residency time, round up */ |
199 | 2.02M | data->expected_us = get_sleep_length_us(); |
200 | 2.02M | |
201 | 2.02M | data->bucket = which_bucket(data->expected_us); |
202 | 2.02M | |
203 | 2.02M | io_interval = avg_intr_interval_us(); |
204 | 2.02M | |
205 | 2.02M | data->latency_factor = DIV_ROUND( |
206 | 2.02M | data->latency_factor * (DECAY - 1) + data->measured_us, |
207 | 2.02M | DECAY); |
208 | 2.02M | |
209 | 2.02M | /* |
210 | 2.02M | * if the correction factor is 0 (eg first time init or cpu hotplug |
211 | 2.02M | * etc), we actually want to start out with a unity factor. |
212 | 2.02M | */ |
213 | 2.02M | if (data->correction_factor[data->bucket] == 0) |
214 | 49 | data->correction_factor[data->bucket] = RESOLUTION * DECAY; |
215 | 2.02M | |
216 | 2.02M | /* Make sure to round up for half microseconds */ |
217 | 2.02M | data->predicted_us = DIV_ROUND( |
218 | 2.02M | data->expected_us * data->correction_factor[data->bucket], |
219 | 2.02M | RESOLUTION * DECAY); |
220 | 2.02M | |
221 | 2.02M | /* find the deepest idle state that satisfies our constraints */ |
222 | 2.17M | for ( i = CPUIDLE_DRIVER_STATE_START + 1; i < power->count; i++ ) |
223 | 1.93M | { |
224 | 1.93M | struct acpi_processor_cx *s = &power->states[i]; |
225 | 1.93M | |
226 | 1.93M | if (s->target_residency > data->predicted_us) |
227 | 1.75M | break; |
228 | 178k | if (s->latency * IO_MULTIPLIER > io_interval) |
229 | 5.75k | break; |
230 | 172k | if (s->latency * LATENCY_MULTIPLIER > data->latency_factor) |
231 | 18.9k | break; |
232 | 172k | /* TBD: we need to check the QoS requirment in future */ |
233 | 153k | data->exit_us = s->latency; |
234 | 153k | data->last_state_idx = i; |
235 | 153k | } |
236 | 2.02M | |
237 | 2.02M | return data->last_state_idx; |
238 | 2.02M | } |
239 | | |
240 | | static void menu_reflect(struct acpi_processor_power *power) |
241 | 1.83M | { |
242 | 1.83M | struct menu_device *data = &__get_cpu_var(menu_devices); |
243 | 1.83M | u64 new_factor; |
244 | 1.83M | |
245 | 1.83M | data->measured_us = power->last_residency; |
246 | 1.83M | |
247 | 1.83M | /* |
248 | 1.83M | * We correct for the exit latency; we are assuming here that the |
249 | 1.83M | * exit latency happens after the event that we're interested in. |
250 | 1.83M | */ |
251 | 1.83M | if (data->measured_us > data->exit_us) |
252 | 1.84M | data->measured_us -= data->exit_us; |
253 | 1.83M | |
254 | 1.83M | /* update our correction ratio */ |
255 | 1.83M | |
256 | 1.83M | new_factor = data->correction_factor[data->bucket] |
257 | 1.83M | * (DECAY - 1) / DECAY; |
258 | 1.83M | |
259 | 1.85M | if (data->expected_us > 0 && data->measured_us < MAX_INTERESTING) |
260 | 1.86M | new_factor += RESOLUTION * data->measured_us / data->expected_us; |
261 | 1.83M | else |
262 | 1.83M | /* |
263 | 1.83M | * we were idle so long that we count it as a perfect |
264 | 1.83M | * prediction |
265 | 1.83M | */ |
266 | 18.4E | new_factor += RESOLUTION; |
267 | 1.83M | |
268 | 1.83M | /* |
269 | 1.83M | * We don't want 0 as factor; we always want at least |
270 | 1.83M | * a tiny bit of estimated time. |
271 | 1.83M | */ |
272 | 1.83M | if (new_factor == 0) |
273 | 997k | new_factor = 1; |
274 | 1.83M | |
275 | 1.83M | data->correction_factor[data->bucket] = new_factor; |
276 | 1.83M | } |
277 | | |
278 | | static int menu_enable_device(struct acpi_processor_power *power) |
279 | 0 | { |
280 | 0 | if (!cpu_online(power->cpu)) |
281 | 0 | return -1; |
282 | 0 |
|
283 | 0 | memset(&per_cpu(menu_devices, power->cpu), 0, sizeof(struct menu_device)); |
284 | 0 |
|
285 | 0 | return 0; |
286 | 0 | } |
287 | | |
288 | | static struct cpuidle_governor menu_governor = |
289 | | { |
290 | | .name = "menu", |
291 | | .rating = 20, |
292 | | .enable = menu_enable_device, |
293 | | .select = menu_select, |
294 | | .reflect = menu_reflect, |
295 | | }; |
296 | | |
297 | | struct cpuidle_governor *cpuidle_current_governor = &menu_governor; |
/* Export this CPU's last expected/predicted idle times (us) for tracing. */
void menu_get_trace_data(u32 *expected, u32 *pred)
{
    struct menu_device *data = &__get_cpu_var(menu_devices);
    *expected = data->expected_us;
    /*
     * NOTE(review): predicted_us is u64 and is truncated to u32 here;
     * presumably acceptable for trace output since predictions above
     * ~71 minutes are not interesting — confirm.
     */
    *pred = data->predicted_us;
}