
xen/arch/x86/acpi/cpufreq/cpufreq.c @ 17943:baaea9f0db5e

x86: Add cpufreq logic to S3 suspend/resume

When suspending to S3, stop the cpufreq dbs governor. When resuming from
S3, first sync CPU state and frequency at the first dbs timer tick; from
the second dbs timer tick onward, the cpufreq dbs governor controls CPU
Px transitions according to its workload algorithm. Px statistics are
also handled.

Signed-off-by: Liu Jinsong <jinsong.liu@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 27 16:16:47 2008 +0100 (2008-06-27)
parents d0817f08599a
children 48be9885d341
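
The change described above stops the dbs governor around S3 and lets its
first post-resume timer tick resynchronize CPU state. As a rough
illustration, a suspend path could drive this file's cpufreq_dom_dbs()
hook as sketched below. enter_s3() and do_suspend_lowlevel() are
hypothetical placeholders, and CPUFREQ_GOV_STOP is assumed from the
ported Linux cpufreq interface; only cpufreq_dom_dbs() and
CPUFREQ_GOV_START are verbatim from this file.

    /*
     * Hedged sketch (not part of this file): wiring the dbs governor
     * into an S3 path. enter_s3() and do_suspend_lowlevel() are
     * hypothetical; CPUFREQ_GOV_STOP is assumed from the ported
     * Linux cpufreq interface.
     */
    static int enter_s3(void)
    {
        int ret;

        /* Quiesce the governor so no Px transition races suspend. */
        ret = cpufreq_dom_dbs(CPUFREQ_GOV_STOP);
        if (ret)
            return ret;

        do_suspend_lowlevel(); /* hypothetical platform S3 entry */

        /*
         * Restart the governor: per the commit message, its first
         * timer tick only resyncs CPU state and frequency; normal
         * workload-driven Px control resumes from the second tick.
         */
        return cpufreq_dom_dbs(CPUFREQ_GOV_START);
    }
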
/*
 * cpufreq.c - ACPI Processor P-States Driver ($Revision: 1.4 $)
 *
 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
 * Copyright (C) 2002 - 2004 Dominik Brodowski <linux@brodo.de>
 * Copyright (C) 2006 Denis Sadykov <denis.m.sadykov@intel.com>
 *
 * Feb 2008 - Liu Jinsong <jinsong.liu@intel.com>
 *     porting acpi-cpufreq.c from Linux 2.6.23 to Xen hypervisor
 *
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 *
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */

#include <xen/types.h>
#include <xen/errno.h>
#include <xen/delay.h>
#include <xen/cpumask.h>
#include <xen/timer.h>
#include <xen/xmalloc.h>
#include <asm/bug.h>
#include <asm/msr.h>
#include <asm/io.h>
#include <asm/config.h>
#include <asm/processor.h>
#include <asm/percpu.h>
#include <asm/cpufeature.h>
#include <acpi/acpi.h>
#include <acpi/cpufreq/cpufreq.h>

struct processor_pminfo processor_pminfo[NR_CPUS];
struct cpufreq_policy xen_px_policy[NR_CPUS];

static cpumask_t *cpufreq_dom_pt;
static cpumask_t cpufreq_dom_mask;
static unsigned int cpufreq_dom_max;

enum {
    UNDEFINED_CAPABLE = 0,
    SYSTEM_INTEL_MSR_CAPABLE,
    SYSTEM_IO_CAPABLE,
};

#define INTEL_MSR_RANGE (0xffff)
#define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1)

struct acpi_cpufreq_data {
    struct processor_performance *acpi_data;
    struct cpufreq_frequency_table *freq_table;
    unsigned int max_freq;
    unsigned int cpu_feature;
};

static struct acpi_cpufreq_data *drv_data[NR_CPUS];

static struct cpufreq_driver acpi_cpufreq_driver;

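/*
 * Enhanced Intel SpeedStep (EST) is the prerequisite for driving
 * P-states through the FIXED_HARDWARE (MSR) interface below.
 */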
static int check_est_cpu(unsigned int cpuid)
{
    struct cpuinfo_x86 *cpu = &cpu_data[cpuid];

    if (cpu->x86_vendor != X86_VENDOR_INTEL ||
        !cpu_has(cpu, X86_FEATURE_EST))
        return 0;

    return 1;
}

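/*
 * Map a raw value read from the P-state status I/O port to the
 * corresponding frequency in the driver's table; 0 if no state matches.
 */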
static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
{
    struct processor_performance *perf;
    int i;

    perf = data->acpi_data;

    for (i = 0; i < perf->state_count; i++) {
        if (value == perf->states[i].status)
            return data->freq_table[i].frequency;
    }
    return 0;
}

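/*
 * Map a PERF_STATUS MSR value (low 16 bits) to a table frequency,
 * falling back to the first table entry if nothing matches.
 */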
static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
{
    int i;
    struct processor_performance *perf;

    msr &= INTEL_MSR_RANGE;
    perf = data->acpi_data;

    for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
        if (msr == perf->states[data->freq_table[i].index].status)
            return data->freq_table[i].frequency;
    }
    return data->freq_table[0].frequency;
}

static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
{
    switch (data->cpu_feature) {
    case SYSTEM_INTEL_MSR_CAPABLE:
        return extract_msr(val, data);
    case SYSTEM_IO_CAPABLE:
        return extract_io(val, data);
    default:
        return 0;
    }
}

struct msr_addr {
    u32 reg;
};

struct io_addr {
    u16 port;
    u8 bit_width;
};

typedef union {
    struct msr_addr msr;
    struct io_addr io;
} drv_addr_union;

struct drv_cmd {
    unsigned int type;
    cpumask_t mask;
    drv_addr_union addr;
    u32 val;
};

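/* Read the current P-state status, via MSR or I/O port as appropriate. */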
static void do_drv_read(struct drv_cmd *cmd)
{
    u32 h;

    switch (cmd->type) {
    case SYSTEM_INTEL_MSR_CAPABLE:
        rdmsr(cmd->addr.msr.reg, cmd->val, h);
        break;
    case SYSTEM_IO_CAPABLE:
        acpi_os_read_port((acpi_io_address)cmd->addr.io.port,
                          &cmd->val, (u32)cmd->addr.io.bit_width);
        break;
    default:
        break;
    }
}

static void do_drv_write(void *drvcmd)
{
    struct drv_cmd *cmd;
    u32 lo, hi;

    cmd = (struct drv_cmd *)drvcmd;

    switch (cmd->type) {
    case SYSTEM_INTEL_MSR_CAPABLE:
        rdmsr(cmd->addr.msr.reg, lo, hi);
        lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE);
        wrmsr(cmd->addr.msr.reg, lo, hi);
        break;
    case SYSTEM_IO_CAPABLE:
        acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
                           cmd->val, (u32)cmd->addr.io.bit_width);
        break;
    default:
        break;
    }
}

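/*
 * drv_read() runs locally; drv_write() fans the write out to every CPU
 * in cmd->mask via on_selected_cpus(), since PERF_CTL must be written
 * on the CPU whose P-state is being changed.
 */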
static void drv_read(struct drv_cmd *cmd)
{
    cmd->val = 0;

    do_drv_read(cmd);
}

static void drv_write(struct drv_cmd *cmd)
{
    on_selected_cpus(cmd->mask, do_drv_write, (void *)cmd, 0, 0);
}

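/*
 * Fetch the raw P-state status value, using the register layout of the
 * first CPU in @mask to decide between MSR and I/O-port access.
 */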
static u32 get_cur_val(cpumask_t mask)
{
    struct processor_performance *perf;
    struct drv_cmd cmd;

    if (unlikely(cpus_empty(mask)))
        return 0;

    switch (drv_data[first_cpu(mask)]->cpu_feature) {
    case SYSTEM_INTEL_MSR_CAPABLE:
        cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
        cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
        break;
    case SYSTEM_IO_CAPABLE:
        cmd.type = SYSTEM_IO_CAPABLE;
        perf = drv_data[first_cpu(mask)]->acpi_data;
        cmd.addr.io.port = perf->control_register.address;
        cmd.addr.io.bit_width = perf->control_register.bit_width;
        break;
    default:
        return 0;
    }

    cmd.mask = mask;

    drv_read(&cmd);
    return cmd.val;
}

/*
 * Return the measured active (C0) frequency on this CPU since last call
 * to this function.
 * Input: cpu number
 * Return: Average CPU frequency in terms of max frequency (zero on error)
 *
 * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
 * over a period of time, while CPU is in C0 state.
 * IA32_MPERF counts at the rate of max advertised frequency
 * IA32_APERF counts at the rate of actual CPU frequency
 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
 * no meaning should be associated with absolute values of these MSRs.
 */
/* FIXME: handle query on non-current cpu later */
static unsigned int get_measured_perf(unsigned int cpu)
{
    union {
        struct {
            uint32_t lo;
            uint32_t hi;
        } split;
        uint64_t whole;
    } aperf_cur, mperf_cur;

    unsigned int perf_percent;
    unsigned int retval;

    rdmsr(MSR_IA32_APERF, aperf_cur.split.lo, aperf_cur.split.hi);
    rdmsr(MSR_IA32_MPERF, mperf_cur.split.lo, mperf_cur.split.hi);

    wrmsr(MSR_IA32_APERF, 0, 0);
    wrmsr(MSR_IA32_MPERF, 0, 0);

    if (unlikely(((unsigned long)(-1) / 100) < aperf_cur.whole)) {
        int shift_count = 7;
        aperf_cur.whole >>= shift_count;
        mperf_cur.whole >>= shift_count;
    }

    if (aperf_cur.whole && mperf_cur.whole)
        perf_percent = (aperf_cur.whole * 100) / mperf_cur.whole;
    else
        perf_percent = 0;

    retval = drv_data[cpu]->max_freq * perf_percent / 100;
    return retval;
}

static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
{
    struct acpi_cpufreq_data *data = drv_data[cpu];
    unsigned int freq;

    if (unlikely(data == NULL ||
                 data->acpi_data == NULL || data->freq_table == NULL)) {
        return 0;
    }

    freq = extract_freq(get_cur_val(cpumask_of_cpu(cpu)), data);
    return freq;
}

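/*
 * Poll the status register until it reports @freq, up to 100 tries
 * 10us apart (1ms total); returns 1 on success, 0 on timeout.
 */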
static unsigned int check_freqs(cpumask_t mask, unsigned int freq,
                                struct acpi_cpufreq_data *data)
{
    unsigned int cur_freq;
    unsigned int i;

    for (i = 0; i < 100; i++) {
        cur_freq = extract_freq(get_cur_val(mask), data);
        if (cur_freq == freq)
            return 1;
        udelay(10);
    }
    return 0;
}

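/*
 * ->target() hook: resolve @target_freq to a P-state via the frequency
 * table, program PERF_CTL (or the ACPI I/O port) on the CPUs sharing
 * the policy, verify the transition took effect, and update Px
 * statistics. policy->resume forces a write even when already at the
 * target state, so the first transition after S3 resynchronizes
 * hardware and bookkeeping.
 */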
static int acpi_cpufreq_target(struct cpufreq_policy *policy,
                               unsigned int target_freq, unsigned int relation)
{
    struct acpi_cpufreq_data *data = drv_data[policy->cpu];
    struct processor_performance *perf;
    struct cpufreq_freqs freqs;
    cpumask_t online_policy_cpus;
    struct drv_cmd cmd;
    unsigned int next_state = 0;      /* Index into freq_table */
    unsigned int next_perf_state = 0; /* Index into perf table */
    int result = 0;

    if (unlikely(data == NULL ||
                 data->acpi_data == NULL || data->freq_table == NULL)) {
        return -ENODEV;
    }

    perf = data->acpi_data;
    result = cpufreq_frequency_table_target(policy,
                                            data->freq_table,
                                            target_freq,
                                            relation, &next_state);
    if (unlikely(result))
        return -ENODEV;

    online_policy_cpus = policy->cpus;

    next_perf_state = data->freq_table[next_state].index;
    if (perf->state == next_perf_state) {
        if (unlikely(policy->resume)) {
            printk(KERN_INFO "Called after resume, resetting to P%d\n",
                   next_perf_state);
            policy->resume = 0;
        }
        else {
            printk(KERN_INFO "Already at target state (P%d)\n",
                   next_perf_state);
            return 0;
        }
    }

    switch (data->cpu_feature) {
    case SYSTEM_INTEL_MSR_CAPABLE:
        cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
        cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
        cmd.val = (u32) perf->states[next_perf_state].control;
        break;
    case SYSTEM_IO_CAPABLE:
        cmd.type = SYSTEM_IO_CAPABLE;
        cmd.addr.io.port = perf->control_register.address;
        cmd.addr.io.bit_width = perf->control_register.bit_width;
        cmd.val = (u32) perf->states[next_perf_state].control;
        break;
    default:
        return -ENODEV;
    }

    cpus_clear(cmd.mask);

    if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
        cmd.mask = online_policy_cpus;
    else
        cpu_set(policy->cpu, cmd.mask);

    freqs.old = perf->states[perf->state].core_frequency * 1000;
    freqs.new = data->freq_table[next_state].frequency;

    drv_write(&cmd);

    if (!check_freqs(cmd.mask, freqs.new, data))
        return -EAGAIN;

    px_statistic_update(cmd.mask, perf->state, next_perf_state);

    perf->state = next_perf_state;
    policy->cur = freqs.new;

    return result;
}

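/*
 * When the current speed cannot be read back (I/O-port systems), pick
 * the table state whose frequency is closest to the measured cpu_khz.
 */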
static unsigned long
acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
{
    struct processor_performance *perf = data->acpi_data;

    if (cpu_khz) {
        /* search the closest match to cpu_khz */
        unsigned int i;
        unsigned long freq;
        unsigned long freqn = perf->states[0].core_frequency * 1000;

        for (i = 0; i < (perf->state_count-1); i++) {
            freq = freqn;
            freqn = perf->states[i+1].core_frequency * 1000;
            if ((2 * cpu_khz) > (freqn + freq)) {
                perf->state = i;
                return freq;
            }
        }
        perf->state = perf->state_count-1;
        return freqn;
    } else {
        /* assume CPU is at P0... */
        perf->state = 0;
        return perf->states[0].core_frequency * 1000;
    }
}

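/*
 * Per-CPU driver init: bind the ACPI _PSS/_PSD data for policy->cpu,
 * build the descending frequency table, and choose MSR or I/O-port
 * access for P-state control.
 */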
static int
acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
{
    unsigned int i;
    unsigned int valid_states = 0;
    unsigned int cpu = policy->cpu;
    struct acpi_cpufreq_data *data;
    unsigned int result = 0;
    struct cpuinfo_x86 *c = &cpu_data[policy->cpu];
    struct processor_performance *perf;

    data = xmalloc(struct acpi_cpufreq_data);
    if (!data)
        return -ENOMEM;
    memset(data, 0, sizeof(struct acpi_cpufreq_data));

    drv_data[cpu] = data;

    data->acpi_data = &processor_pminfo[cpu].perf;

    perf = data->acpi_data;
    policy->shared_type = perf->shared_type;

    /*
     * Will let policy->cpus know about dependency only when software
     * coordination is required.
     */
    if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
        policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
        policy->cpus = perf->shared_cpu_map;
    } else {
        policy->cpus = cpumask_of_cpu(cpu);
    }

    /* capability check */
    if (perf->state_count <= 1) {
        printk("No P-States\n");
        result = -ENODEV;
        goto err_unreg;
    }

    if (perf->control_register.space_id != perf->status_register.space_id) {
        result = -ENODEV;
        goto err_unreg;
    }

    switch (perf->control_register.space_id) {
    case ACPI_ADR_SPACE_SYSTEM_IO:
        printk("xen_pminfo: @acpi_cpufreq_cpu_init,"
               "SYSTEM IO addr space\n");
        data->cpu_feature = SYSTEM_IO_CAPABLE;
        break;
    case ACPI_ADR_SPACE_FIXED_HARDWARE:
        printk("xen_pminfo: @acpi_cpufreq_cpu_init,"
               "HARDWARE addr space\n");
        if (!check_est_cpu(cpu)) {
            result = -ENODEV;
            goto err_unreg;
        }
        data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE;
        break;
    default:
        result = -ENODEV;
        goto err_unreg;
    }
    data->freq_table = xmalloc_array(struct cpufreq_frequency_table,
                                     (perf->state_count+1));
    if (!data->freq_table) {
        result = -ENOMEM;
        goto err_unreg;
    }

    /* detect transition latency */
    policy->cpuinfo.transition_latency = 0;
    for (i = 0; i < perf->state_count; i++) {
        if ((perf->states[i].transition_latency * 1000) >
            policy->cpuinfo.transition_latency)
            policy->cpuinfo.transition_latency =
                perf->states[i].transition_latency * 1000;
    }

    data->max_freq = perf->states[0].core_frequency * 1000;
    /* table init */
    for (i = 0; i < perf->state_count; i++) {
        if (i > 0 && perf->states[i].core_frequency >=
            data->freq_table[valid_states-1].frequency / 1000)
            continue;

        data->freq_table[valid_states].index = i;
        data->freq_table[valid_states].frequency =
            perf->states[i].core_frequency * 1000;
        valid_states++;
    }
    data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
    perf->state = 0;

    result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
    if (result)
        goto err_freqfree;

    switch (perf->control_register.space_id) {
    case ACPI_ADR_SPACE_SYSTEM_IO:
        /* Current speed is unknown and not detectable by IO port */
        policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu);
        break;
    case ACPI_ADR_SPACE_FIXED_HARDWARE:
        acpi_cpufreq_driver.get = get_cur_freq_on_cpu;
        policy->cur = get_cur_freq_on_cpu(cpu);
        break;
    default:
        break;
    }

    /* Check for APERF/MPERF support in hardware */
    if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) {
        unsigned int ecx;
        ecx = cpuid_ecx(6);
        if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY)
            acpi_cpufreq_driver.getavg = get_measured_perf;
    }

    /*
     * the first call to ->target() should result in us actually
     * writing something to the appropriate registers.
     */
    policy->resume = 1;

    return result;

err_freqfree:
    xfree(data->freq_table);
err_unreg:
    xfree(data);
    drv_data[cpu] = NULL;

    return result;
}

static struct cpufreq_driver acpi_cpufreq_driver = {
    .target = acpi_cpufreq_target,
    .init   = acpi_cpufreq_cpu_init,
};

void cpufreq_dom_exit(void)
{
    cpufreq_dom_max = 0;
    cpus_clear(cpufreq_dom_mask);
    if (cpufreq_dom_pt)
        xfree(cpufreq_dom_pt);
}

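/*
 * Build one cpumask per ACPI PSD coordination domain and point each
 * CPU's shared_cpu_map at the mask of its domain.
 */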
int cpufreq_dom_init(void)
{
    unsigned int i;

    cpufreq_dom_max = 0;
    cpus_clear(cpufreq_dom_mask);

    for_each_online_cpu(i) {
        cpu_set(processor_pminfo[i].perf.domain_info.domain, cpufreq_dom_mask);
        if (cpufreq_dom_max < processor_pminfo[i].perf.domain_info.domain)
            cpufreq_dom_max = processor_pminfo[i].perf.domain_info.domain;
    }
    cpufreq_dom_max++;

    cpufreq_dom_pt = xmalloc_array(cpumask_t, cpufreq_dom_max);
    if (!cpufreq_dom_pt)
        return -ENOMEM;
    memset(cpufreq_dom_pt, 0, cpufreq_dom_max * sizeof(cpumask_t));

    for_each_online_cpu(i)
        cpu_set(i, cpufreq_dom_pt[processor_pminfo[i].perf.domain_info.domain]);

    for_each_online_cpu(i)
        processor_pminfo[i].perf.shared_cpu_map =
            cpufreq_dom_pt[processor_pminfo[i].perf.domain_info.domain];

    return 0;
}

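/* Initialize Px statistics and the per-CPU policy for every online CPU. */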
static int cpufreq_cpu_init(void)
{
    int i, ret = 0;

    for_each_online_cpu(i) {
        xen_px_policy[i].cpu = i;

        ret = px_statistic_init(i);
        if (ret)
            return ret;

        ret = acpi_cpufreq_cpu_init(&xen_px_policy[i]);
        if (ret)
            return ret;
    }
    return ret;
}

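/*
 * Deliver a governor event (e.g. CPUFREQ_GOV_START) once per
 * coordination domain, using the domain's first CPU as the
 * representative policy.
 */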
int cpufreq_dom_dbs(unsigned int event)
{
    int cpu, dom, ret = 0;

    for (dom = 0; dom < cpufreq_dom_max; dom++) {
        if (!cpu_isset(dom, cpufreq_dom_mask))
            continue;
        cpu = first_cpu(cpufreq_dom_pt[dom]);
        ret = cpufreq_governor_dbs(&xen_px_policy[cpu], event);
        if (ret)
            return ret;
    }
    return ret;
}

int acpi_cpufreq_init(void)
{
    int ret = 0;

    /* setup cpumask of psd dom and shared cpu map of cpu */
    ret = cpufreq_dom_init();
    if (ret)
        goto err;

    /* setup cpufreq driver */
    cpufreq_driver = &acpi_cpufreq_driver;

    /* setup cpufreq infrastructure */
    ret = cpufreq_cpu_init();
    if (ret)
        goto err;

    /* setup cpufreq dbs according to dom coordination */
    ret = cpufreq_dom_dbs(CPUFREQ_GOV_START);
    if (ret)
        goto err;

    return ret;

err:
    cpufreq_dom_exit();
    return ret;
}