/root/src/xen/xen/arch/x86/time.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * arch/x86/time.c |
3 | | * |
4 | | * Per-CPU time calibration and management. |
5 | | * |
6 | | * Copyright (c) 2002-2005, K A Fraser |
7 | | * |
8 | | * Portions from Linux are: |
9 | | * Copyright (c) 1991, 1992, 1995 Linus Torvalds |
10 | | */ |
11 | | |
12 | | #include <xen/errno.h> |
13 | | #include <xen/event.h> |
14 | | #include <xen/sched.h> |
15 | | #include <xen/lib.h> |
16 | | #include <xen/init.h> |
17 | | #include <xen/time.h> |
18 | | #include <xen/timer.h> |
19 | | #include <xen/smp.h> |
20 | | #include <xen/irq.h> |
21 | | #include <xen/softirq.h> |
22 | | #include <xen/efi.h> |
23 | | #include <xen/cpuidle.h> |
24 | | #include <xen/symbols.h> |
25 | | #include <xen/keyhandler.h> |
26 | | #include <xen/guest_access.h> |
27 | | #include <asm/io.h> |
28 | | #include <asm/msr.h> |
29 | | #include <asm/mpspec.h> |
30 | | #include <asm/processor.h> |
31 | | #include <asm/fixmap.h> |
32 | | #include <asm/mc146818rtc.h> |
33 | | #include <asm/div64.h> |
34 | | #include <asm/acpi.h> |
35 | | #include <asm/hpet.h> |
36 | | #include <io_ports.h> |
37 | | #include <asm/setup.h> /* for early_time_init */ |
38 | | #include <public/arch-x86/cpuid.h> |
39 | | |
40 | | /* opt_clocksource: Force clocksource to one of: pit, hpet, acpi. */ |
41 | | static char __initdata opt_clocksource[10]; |
42 | | string_param("clocksource", opt_clocksource); |
43 | | |
44 | | unsigned long __read_mostly cpu_khz; /* CPU clock frequency in kHz. */ |
45 | | DEFINE_SPINLOCK(rtc_lock); |
46 | | unsigned long pit0_ticks; |
47 | | |
48 | | struct cpu_time_stamp { |
49 | | u64 local_tsc; |
50 | | s_time_t local_stime; |
51 | | s_time_t master_stime; |
52 | | }; |
53 | | |
54 | | struct cpu_time { |
55 | | struct cpu_time_stamp stamp; |
56 | | struct time_scale tsc_scale; |
57 | | }; |
58 | | |
59 | | struct platform_timesource { |
60 | | char *id; |
61 | | char *name; |
62 | | u64 frequency; |
63 | | u64 (*read_counter)(void); |
64 | | s64 (*init)(struct platform_timesource *); |
65 | | void (*resume)(struct platform_timesource *); |
66 | | int counter_bits; |
67 | | }; |
68 | | |
69 | | static DEFINE_PER_CPU(struct cpu_time, cpu_time); |
70 | | |
71 | | /* Calibrate all CPUs to platform timer every EPOCH. */ |
72 | 179 | #define EPOCH MILLISECS(1000) |
73 | | static struct timer calibration_timer; |
74 | | |
75 | | /* |
76 | | * We simulate a 32-bit platform timer from the 16-bit PIT ch2 counter. |
77 | | * Otherwise overflow happens too quickly (~50ms) for us to guarantee that |
78 | | * softirq handling will happen in time. |
79 | | * |
80 | | * The pit_lock protects the 16- and 32-bit stamp fields as well as the latched hardware counter reads themselves. |
81 | | */ |
82 | | static DEFINE_SPINLOCK(pit_lock); |
83 | | static u16 pit_stamp16; |
84 | | static u32 pit_stamp32; |
85 | | static bool __read_mostly using_pit; |
86 | | |
87 | | /* Boot timestamp, filled in head.S */ |
88 | | u64 __initdata boot_tsc_stamp; |
89 | | |
90 | | /* |
91 | | * 32-bit division of integer dividend and integer divisor yielding |
92 | | * 32-bit fractional quotient. |
93 | | */ |
94 | | static inline u32 div_frac(u32 dividend, u32 divisor) |
95 | 3 | { |
96 | 3 | u32 quotient, remainder; |
97 | 3 | ASSERT(dividend < divisor); |
98 | 3 | asm ( |
99 | 3 | "divl %4" |
100 | 3 | : "=a" (quotient), "=d" (remainder) |
101 | 3 | : "0" (0), "1" (dividend), "r" (divisor) ); |
102 | 3 | return quotient; |
103 | 3 | } |
104 | | |
105 | | /* |
106 | | * 32-bit multiplication of multiplicand and fractional multiplier |
107 | | * yielding 32-bit product (radix point at same position as in multiplicand). |
108 | | */ |
109 | | static inline u32 mul_frac(u32 multiplicand, u32 multiplier) |
110 | 0 | { |
111 | 0 | u32 product_int, product_frac; |
112 | 0 | asm ( |
113 | 0 | "mul %3" |
114 | 0 | : "=a" (product_frac), "=d" (product_int) |
115 | 0 | : "0" (multiplicand), "r" (multiplier) ); |
116 | 0 | return product_int; |
117 | 0 | } |
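
The two helpers above implement 0.32 fixed-point arithmetic: div_frac() computes floor(2^32 * dividend / divisor) and mul_frac() computes floor(multiplicand * multiplier / 2^32). A minimal portable sketch of the same semantics, using 64-bit intermediates in place of the x86 inline assembly (illustrative only, not part of this file):

    #include <stdint.h>
    #include <assert.h>

    /* floor(2^32 * dividend / divisor); requires dividend < divisor. */
    static uint32_t div_frac_portable(uint32_t dividend, uint32_t divisor)
    {
        assert(dividend < divisor);
        return (uint32_t)(((uint64_t)dividend << 32) / divisor);
    }

    /* floor(multiplicand * multiplier / 2^32). */
    static uint32_t mul_frac_portable(uint32_t multiplicand, uint32_t multiplier)
    {
        return (uint32_t)(((uint64_t)multiplicand * multiplier) >> 32);
    }
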
118 | | |
119 | | /* |
120 | | * Scale a 64-bit delta by shifting it and then multiplying by a 32-bit fraction, |
121 | | * yielding a 64-bit result. |
122 | | */ |
123 | | u64 scale_delta(u64 delta, const struct time_scale *scale) |
124 | 20.4M | { |
125 | 20.4M | u64 product; |
126 | 20.4M | |
127 | 20.4M | if ( scale->shift < 0 ) |
128 | 19.6M | delta >>= -scale->shift; |
129 | 20.4M | else |
130 | 784k | delta <<= scale->shift; |
131 | 20.4M | |
132 | 20.4M | asm ( |
133 | 20.4M | "mulq %2 ; shrd $32,%1,%0" |
134 | 20.4M | : "=a" (product), "=d" (delta) |
135 | 20.4M | : "rm" (delta), "0" ((u64)scale->mul_frac) ); |
136 | 20.4M | |
137 | 20.4M | return product; |
138 | 20.4M | } |
139 | | |
140 | 2.12M | #define _TS_MUL_FRAC_IDENTITY 0x80000000UL |
141 | | |
142 | | /* Compute the reciprocal of the given time_scale. */ |
143 | | static inline struct time_scale scale_reciprocal(struct time_scale scale) |
144 | 2.12M | { |
145 | 2.12M | struct time_scale reciprocal; |
146 | 2.12M | u32 dividend; |
147 | 2.12M | |
148 | 2.12M | ASSERT(scale.mul_frac != 0); |
149 | 2.12M | dividend = _TS_MUL_FRAC_IDENTITY; |
150 | 2.12M | reciprocal.shift = 1 - scale.shift; |
151 | 2.12M | while ( unlikely(dividend >= scale.mul_frac) ) |
152 | 0 | { |
153 | 0 | dividend >>= 1; |
154 | 0 | reciprocal.shift++; |
155 | 0 | } |
156 | 2.12M | |
157 | 2.12M | asm ( |
158 | 2.12M | "divl %4" |
159 | 2.12M | : "=a" (reciprocal.mul_frac), "=d" (dividend) |
160 | 2.12M | : "0" (0), "1" (dividend), "r" (scale.mul_frac) ); |
161 | 2.12M | |
162 | 2.12M | return reciprocal; |
163 | 2.12M | } |
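
Taken together, a struct time_scale encodes ticks-to-nanoseconds conversion as ns = ((ticks << shift) * mul_frac) >> 32 (with a right shift when shift is negative), and scale_reciprocal() builds the inverse scale for nanoseconds-to-ticks. A usage sketch combining these with set_time_scale() from further down in this file (the 2.4 GHz frequency is illustrative):

    struct time_scale tsc_to_ns, ns_to_tsc;
    u64 ns, ticks;

    set_time_scale(&tsc_to_ns, 2400000000ULL); /* assume a 2.4 GHz TSC */
    ns_to_tsc = scale_reciprocal(tsc_to_ns);

    ns    = scale_delta(2400000, &tsc_to_ns);  /* -> 1000000 ns */
    ticks = scale_delta(ns, &ns_to_tsc);       /* -> ~2400000 ticks again */
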
164 | | |
165 | | /* |
166 | | * cpu_mask that denotes the CPUs that need the timer interrupt coming in as |
167 | | * IPIs in place of local APIC timers |
168 | | */ |
169 | | static cpumask_t pit_broadcast_mask; |
170 | | |
171 | | static void smp_send_timer_broadcast_ipi(void) |
172 | 35 | { |
173 | 35 | int cpu = smp_processor_id(); |
174 | 35 | cpumask_t mask; |
175 | 35 | |
176 | 35 | cpumask_and(&mask, &cpu_online_map, &pit_broadcast_mask); |
177 | 35 | |
178 | 35 | if ( cpumask_test_cpu(cpu, &mask) ) |
179 | 0 | { |
180 | 0 | __cpumask_clear_cpu(cpu, &mask); |
181 | 0 | raise_softirq(TIMER_SOFTIRQ); |
182 | 0 | } |
183 | 35 | |
184 | 35 | if ( !cpumask_empty(&mask) ) |
185 | 0 | { |
186 | 0 | cpumask_raise_softirq(&mask, TIMER_SOFTIRQ); |
187 | 0 | } |
188 | 35 | } |
189 | | |
190 | | static void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs) |
191 | 35 | { |
192 | 35 | ASSERT(local_irq_is_enabled()); |
193 | 35 | |
194 | 35 | if ( hpet_legacy_irq_tick() ) |
195 | 0 | return; |
196 | 35 | |
197 | 35 | /* Only for start-of-day interrupt tests in io_apic.c. */ |
198 | 35 | pit0_ticks++; |
199 | 35 | |
200 | 35 | /* Rough hack to allow accurate timers to sort-of-work with no APIC. */ |
201 | 35 | if ( !cpu_has_apic ) |
202 | 0 | raise_softirq(TIMER_SOFTIRQ); |
203 | 35 | |
204 | 35 | if ( xen_cpuidle ) |
205 | 35 | smp_send_timer_broadcast_ipi(); |
206 | 35 | |
207 | 35 | /* Emulate a 32-bit PIT counter. */ |
208 | 35 | if ( using_pit ) |
209 | 0 | { |
210 | 0 | u16 count; |
211 | 0 |
212 | 0 | spin_lock_irq(&pit_lock); |
213 | 0 |
214 | 0 | outb(0x80, PIT_MODE); |
215 | 0 | count = inb(PIT_CH2); |
216 | 0 | count |= inb(PIT_CH2) << 8; |
217 | 0 |
218 | 0 | pit_stamp32 += (u16)(pit_stamp16 - count); |
219 | 0 | pit_stamp16 = count; |
220 | 0 |
221 | 0 | spin_unlock_irq(&pit_lock); |
222 | 0 | } |
223 | 35 | } |
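
The update pit_stamp32 += (u16)(pit_stamp16 - count) is the heart of the simulated 32-bit counter: channel 2 counts down, so pit_stamp16 - count, reduced modulo 2^16 by the cast, is exactly the number of ticks elapsed since the previous read, provided fewer than 65536 ticks (~55 ms at 1.19 MHz) pass between reads. A standalone illustration of the wraparound arithmetic (hypothetical values):

    uint16_t prev = 0x0010, now = 0xfff0;      /* counter wrapped past zero */
    uint32_t elapsed = (uint16_t)(prev - now); /* 0x0020 ticks, as expected */
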
224 | | |
225 | | static struct irqaction __read_mostly irq0 = { |
226 | | timer_interrupt, "timer", NULL |
227 | | }; |
228 | | |
229 | 2 | #define CLOCK_TICK_RATE 1193182 /* system crystal frequency (Hz) */ |
230 | 3 | #define CALIBRATE_FRAC 20 /* calibrate over 50ms */ |
231 | 1 | #define CALIBRATE_VALUE(freq) (((freq) + CALIBRATE_FRAC / 2) / CALIBRATE_FRAC) |
232 | | |
233 | | static void preinit_pit(void) |
234 | 1 | { |
235 | 1 | /* Set PIT channel 0 to HZ Hz. */ |
236 | 2 | #define LATCH (((CLOCK_TICK_RATE)+(HZ/2))/HZ) |
237 | 1 | outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ |
238 | 1 | outb_p(LATCH & 0xff, PIT_CH0); /* LSB */ |
239 | 1 | outb(LATCH >> 8, PIT_CH0); /* MSB */ |
240 | 1 | #undef LATCH |
241 | 1 | } |
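
Worked numbers for the constants above, assuming HZ is 100 in this build: LATCH = (1193182 + 50) / 100 = 11932, so channel 0 interrupts at 1193182 / 11932, about 99.9985 Hz; and CALIBRATE_VALUE(CLOCK_TICK_RATE) = (1193182 + 10) / 20 = 59659 ticks, i.e. the 50 ms window over which init_pit() below counts TSC cycles.
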
242 | | |
243 | | void set_time_scale(struct time_scale *ts, u64 ticks_per_sec) |
244 | 3 | { |
245 | 3 | u64 tps64 = ticks_per_sec; |
246 | 3 | u32 tps32; |
247 | 3 | int shift = 0; |
248 | 3 | |
249 | 3 | ASSERT(tps64 != 0); |
250 | 3 | |
251 | 4 | while ( tps64 > (MILLISECS(1000)*2) ) |
252 | 1 | { |
253 | 1 | tps64 >>= 1; |
254 | 1 | shift--; |
255 | 1 | } |
256 | 3 | |
257 | 3 | tps32 = (u32)tps64; |
258 | 19 | while ( tps32 <= (u32)MILLISECS(1000) ) |
259 | 16 | { |
260 | 16 | tps32 <<= 1; |
261 | 16 | shift++; |
262 | 16 | } |
263 | 3 | |
264 | 3 | ts->mul_frac = div_frac(MILLISECS(1000), tps32); |
265 | 3 | ts->shift = shift; |
266 | 3 | } |
267 | | |
268 | | static char *freq_string(u64 freq) |
269 | 1 | { |
270 | 1 | static char s[20]; |
271 | 1 | unsigned int x, y; |
272 | 1 | y = (unsigned int)do_div(freq, 1000000) / 1000; |
273 | 1 | x = (unsigned int)freq; |
274 | 1 | snprintf(s, sizeof(s), "%u.%03uMHz", x, y); |
275 | 1 | return s; |
276 | 1 | } |
277 | | |
278 | | /************************************************************ |
279 | | * PLATFORM TIMER 1: PROGRAMMABLE INTERVAL TIMER (LEGACY PIT) |
280 | | */ |
281 | | |
282 | | static u64 read_pit_count(void) |
283 | 0 | { |
284 | 0 | u16 count16; |
285 | 0 | u32 count32; |
286 | 0 | unsigned long flags; |
287 | 0 |
288 | 0 | spin_lock_irqsave(&pit_lock, flags); |
289 | 0 |
290 | 0 | outb(0x80, PIT_MODE); |
291 | 0 | count16 = inb(PIT_CH2); |
292 | 0 | count16 |= inb(PIT_CH2) << 8; |
293 | 0 |
294 | 0 | count32 = pit_stamp32 + (u16)(pit_stamp16 - count16); |
295 | 0 |
296 | 0 | spin_unlock_irqrestore(&pit_lock, flags); |
297 | 0 |
298 | 0 | return count32; |
299 | 0 | } |
300 | | |
301 | | static s64 __init init_pit(struct platform_timesource *pts) |
302 | 0 | { |
303 | 0 | u8 portb = inb(0x61); |
304 | 0 | u64 start, end; |
305 | 0 | unsigned long count; |
306 | 0 |
307 | 0 | using_pit = true; |
308 | 0 |
309 | 0 | /* Set the Gate high, disable speaker. */ |
310 | 0 | outb((portb & ~0x02) | 0x01, 0x61); |
311 | 0 |
312 | 0 | /* |
313 | 0 | * Now let's take care of CTC channel 2: mode 0, (interrupt on |
314 | 0 | * terminal count mode), binary count, load CALIBRATE_LATCH count, |
315 | 0 | * (LSB and MSB) to begin countdown. |
316 | 0 | */ |
317 | 0 | #define CALIBRATE_LATCH CALIBRATE_VALUE(CLOCK_TICK_RATE) |
318 | 0 | outb(0xb0, PIT_MODE); /* binary, mode 0, LSB/MSB, Ch 2 */ |
319 | 0 | outb(CALIBRATE_LATCH & 0xff, PIT_CH2); /* LSB of count */ |
320 | 0 | outb(CALIBRATE_LATCH >> 8, PIT_CH2); /* MSB of count */ |
321 | 0 | #undef CALIBRATE_LATCH |
322 | 0 |
323 | 0 | start = rdtsc_ordered(); |
324 | 0 | for ( count = 0; !(inb(0x61) & 0x20); ++count ) |
325 | 0 | continue; |
326 | 0 | end = rdtsc_ordered(); |
327 | 0 |
328 | 0 | /* Set the Gate low, disable speaker. */ |
329 | 0 | outb(portb & ~0x03, 0x61); |
330 | 0 |
331 | 0 | /* Error if the CTC doesn't behave itself. */ |
332 | 0 | if ( count == 0 ) |
333 | 0 | return 0; |
334 | 0 |
335 | 0 | return (end - start) * CALIBRATE_FRAC; |
336 | 0 | } |
337 | | |
338 | | static void resume_pit(struct platform_timesource *pts) |
339 | 0 | { |
340 | 0 | /* Set CTC channel 2 to mode 0 again; initial value does not matter. */ |
341 | 0 | outb(0xb0, PIT_MODE); /* binary, mode 0, LSB/MSB, Ch 2 */ |
342 | 0 | outb(0, PIT_CH2); /* LSB of count */ |
343 | 0 | outb(0, PIT_CH2); /* MSB of count */ |
344 | 0 | } |
345 | | |
346 | | static struct platform_timesource __initdata plt_pit = |
347 | | { |
348 | | .id = "pit", |
349 | | .name = "PIT", |
350 | | .frequency = CLOCK_TICK_RATE, |
351 | | .read_counter = read_pit_count, |
352 | | .counter_bits = 32, |
353 | | .init = init_pit, |
354 | | .resume = resume_pit, |
355 | | }; |
356 | | |
357 | | /************************************************************ |
358 | | * PLATFORM TIMER 2: HIGH PRECISION EVENT TIMER (HPET) |
359 | | */ |
360 | | |
361 | | static u64 read_hpet_count(void) |
362 | 200 | { |
363 | 200 | return hpet_read32(HPET_COUNTER); |
364 | 200 | } |
365 | | |
366 | | static s64 __init init_hpet(struct platform_timesource *pts) |
367 | 1 | { |
368 | 1 | u64 hpet_rate = hpet_setup(), start; |
369 | 1 | u32 count, target; |
370 | 1 | |
371 | 1 | if ( hpet_rate == 0 ) |
372 | 0 | return 0; |
373 | 1 | |
374 | 1 | pts->frequency = hpet_rate; |
375 | 1 | |
376 | 1 | count = hpet_read32(HPET_COUNTER); |
377 | 1 | start = rdtsc_ordered(); |
378 | 1 | target = count + CALIBRATE_VALUE(hpet_rate); |
379 | 1 | if ( target < count ) |
380 | 0 | while ( hpet_read32(HPET_COUNTER) >= count ) |
381 | 0 | continue; |
382 | 86.0k | while ( hpet_read32(HPET_COUNTER) < target ) |
383 | 86.0k | continue; |
384 | 1 | |
385 | 1 | return (rdtsc_ordered() - start) * CALIBRATE_FRAC; |
386 | 1 | } |
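
init_pit() above and init_hpet() (and init_pmtimer() below) share one calibration pattern: read the TSC, busy-wait until the platform counter has advanced by CALIBRATE_VALUE(freq) ticks (one twentieth of a second), read the TSC again, and multiply the TSC delta by CALIBRATE_FRAC to estimate the full-second rate. A generic sketch of the pattern (read_counter and freq stand in for any of the timers here; the target-wraparound handling of the real code is omitted):

    static uint64_t calibrate_tsc_against(uint64_t (*read_counter)(void),
                                          uint64_t freq)
    {
        uint64_t target, t0, t1;

        target = read_counter() + CALIBRATE_VALUE(freq); /* 50 ms of ticks */
        t0     = rdtsc_ordered();

        while ( read_counter() < target )
            continue;                                    /* burn 50 ms */

        t1 = rdtsc_ordered();
        return (t1 - t0) * CALIBRATE_FRAC;               /* TSC cycles/sec */
    }
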
387 | | |
388 | | static void resume_hpet(struct platform_timesource *pts) |
389 | 0 | { |
390 | 0 | hpet_resume(NULL); |
391 | 0 | } |
392 | | |
393 | | static struct platform_timesource __initdata plt_hpet = |
394 | | { |
395 | | .id = "hpet", |
396 | | .name = "HPET", |
397 | | .read_counter = read_hpet_count, |
398 | | .counter_bits = 32, |
399 | | .init = init_hpet, |
400 | | .resume = resume_hpet |
401 | | }; |
402 | | |
403 | | /************************************************************ |
404 | | * PLATFORM TIMER 3: ACPI PM TIMER |
405 | | */ |
406 | | |
407 | | u32 __read_mostly pmtmr_ioport; |
408 | | unsigned int __initdata pmtmr_width; |
409 | | |
410 | | /* ACPI PM timer ticks at 3.579545 MHz. */ |
411 | 1 | #define ACPI_PM_FREQUENCY 3579545 |
412 | | |
413 | | static u64 read_pmtimer_count(void) |
414 | 0 | { |
415 | 0 | return inl(pmtmr_ioport); |
416 | 0 | } |
417 | | |
418 | | static s64 __init init_pmtimer(struct platform_timesource *pts) |
419 | 0 | { |
420 | 0 | u64 start; |
421 | 0 | u32 count, target, mask = 0xffffff; |
422 | 0 |
423 | 0 | if ( !pmtmr_ioport || !pmtmr_width ) |
424 | 0 | return 0; |
425 | 0 |
426 | 0 | if ( pmtmr_width == 32 ) |
427 | 0 | { |
428 | 0 | pts->counter_bits = 32; |
429 | 0 | mask = 0xffffffff; |
430 | 0 | } |
431 | 0 |
432 | 0 | count = inl(pmtmr_ioport) & mask; |
433 | 0 | start = rdtsc_ordered(); |
434 | 0 | target = count + CALIBRATE_VALUE(ACPI_PM_FREQUENCY); |
435 | 0 | if ( target < count ) |
436 | 0 | while ( (inl(pmtmr_ioport) & mask) >= count ) |
437 | 0 | continue; |
438 | 0 | while ( (inl(pmtmr_ioport) & mask) < target ) |
439 | 0 | continue; |
440 | 0 |
441 | 0 | return (rdtsc_ordered() - start) * CALIBRATE_FRAC; |
442 | 0 | } |
443 | | |
444 | | static struct platform_timesource __initdata plt_pmtimer = |
445 | | { |
446 | | .id = "acpi", |
447 | | .name = "ACPI PM Timer", |
448 | | .frequency = ACPI_PM_FREQUENCY, |
449 | | .read_counter = read_pmtimer_count, |
450 | | .counter_bits = 24, |
451 | | .init = init_pmtimer |
452 | | }; |
453 | | |
454 | | static struct time_scale __read_mostly pmt_scale; |
455 | | static struct time_scale __read_mostly pmt_scale_r; |
456 | | |
457 | | static __init int init_pmtmr_scale(void) |
458 | 1 | { |
459 | 1 | set_time_scale(&pmt_scale, ACPI_PM_FREQUENCY); |
460 | 1 | pmt_scale_r = scale_reciprocal(pmt_scale); |
461 | 1 | return 0; |
462 | 1 | } |
463 | | __initcall(init_pmtmr_scale); |
464 | | |
465 | | uint64_t acpi_pm_tick_to_ns(uint64_t ticks) |
466 | 0 | { |
467 | 0 | return scale_delta(ticks, &pmt_scale); |
468 | 0 | } |
469 | | |
470 | | uint64_t ns_to_acpi_pm_tick(uint64_t ns) |
471 | 0 | { |
472 | 0 | return scale_delta(ns, &pmt_scale_r); |
473 | 0 | } |
474 | | |
475 | | /************************************************************ |
476 | | * PLATFORM TIMER 4: TSC |
477 | | */ |
478 | | static unsigned int __initdata tsc_flags; |
479 | | |
480 | | /* TSC is reliable across sockets */ |
481 | 0 | #define TSC_RELIABLE_SOCKET (1 << 0) |
482 | | |
483 | | /* |
484 | | * Called in verify_tsc_reliability() under reliable TSC conditions |
485 | | * thus reusing all the checks already performed there. |
486 | | */ |
487 | | static s64 __init init_tsc(struct platform_timesource *pts) |
488 | 0 | { |
489 | 0 | u64 ret = pts->frequency; |
490 | 0 |
491 | 0 | if ( nr_cpu_ids != num_present_cpus() ) |
492 | 0 | { |
493 | 0 | printk(XENLOG_WARNING "TSC: CPU Hotplug intended\n"); |
494 | 0 | ret = 0; |
495 | 0 | } |
496 | 0 |
497 | 0 | if ( nr_sockets > 1 && !(tsc_flags & TSC_RELIABLE_SOCKET) ) |
498 | 0 | { |
499 | 0 | printk(XENLOG_WARNING "TSC: Not invariant across sockets\n"); |
500 | 0 | ret = 0; |
501 | 0 | } |
502 | 0 |
503 | 0 | if ( !ret ) |
504 | 0 | printk(XENLOG_DEBUG "TSC: Not setting it as clocksource\n"); |
505 | 0 |
506 | 0 | return ret; |
507 | 0 | } |
508 | | |
509 | | static u64 read_tsc(void) |
510 | 0 | { |
511 | 0 | return rdtsc_ordered(); |
512 | 0 | } |
513 | | |
514 | | static struct platform_timesource __initdata plt_tsc = |
515 | | { |
516 | | .id = "tsc", |
517 | | .name = "TSC", |
518 | | .read_counter = read_tsc, |
519 | | /* |
520 | | * The platform timer overflow calculations assume the counter width is |
521 | | * below 64 bits. Hence counter_bits is set to less than 64, so that |
522 | | * TSC wraparound is still correctly checked and handled. |
523 | | */ |
524 | | .counter_bits = 63, |
525 | | .init = init_tsc, |
526 | | }; |
527 | | |
528 | | /************************************************************ |
529 | | * GENERIC PLATFORM TIMER INFRASTRUCTURE |
530 | | */ |
531 | | |
532 | | /* details of chosen timesource */ |
533 | | static struct platform_timesource __read_mostly plt_src; |
534 | | /* hardware-width mask */ |
535 | | static u64 __read_mostly plt_mask; |
536 | | /* ns between calls to plt_overflow() */ |
537 | | static u64 __read_mostly plt_overflow_period; |
538 | | /* scale: platform counter -> nanosecs */ |
539 | | static struct time_scale __read_mostly plt_scale; |
540 | | |
541 | | /* Protected by platform_timer_lock. */ |
542 | | static DEFINE_SPINLOCK(platform_timer_lock); |
543 | | static s_time_t stime_platform_stamp; /* System time at below platform time */ |
544 | | static u64 platform_timer_stamp; /* Platform time at above system time */ |
545 | | static u64 plt_stamp64; /* 64-bit platform counter stamp */ |
546 | | static u64 plt_stamp; /* hardware-width platform counter stamp */ |
547 | | static struct timer plt_overflow_timer; |
548 | | |
549 | | static s_time_t __read_platform_stime(u64 platform_time) |
550 | 201 | { |
551 | 201 | u64 diff = platform_time - platform_timer_stamp; |
552 | 201 | ASSERT(spin_is_locked(&platform_timer_lock)); |
553 | 201 | return (stime_platform_stamp + scale_delta(diff, &plt_scale)); |
554 | 201 | } |
555 | | |
556 | | static void plt_overflow(void *unused) |
557 | 1 | { |
558 | 1 | int i; |
559 | 1 | u64 count; |
560 | 1 | s_time_t now, plt_now, plt_wrap; |
561 | 1 | |
562 | 1 | spin_lock_irq(&platform_timer_lock); |
563 | 1 | |
564 | 1 | count = plt_src.read_counter(); |
565 | 1 | plt_stamp64 += (count - plt_stamp) & plt_mask; |
566 | 1 | plt_stamp = count; |
567 | 1 | |
568 | 1 | now = NOW(); |
569 | 1 | plt_wrap = __read_platform_stime(plt_stamp64); |
570 | 1 | for ( i = 0; i < 10; i++ ) |
571 | 1 | { |
572 | 1 | plt_now = plt_wrap; |
573 | 1 | plt_wrap = __read_platform_stime(plt_stamp64 + plt_mask + 1); |
574 | 1 | if ( ABS(plt_wrap - now) > ABS(plt_now - now) ) |
575 | 1 | break; |
576 | 0 | plt_stamp64 += plt_mask + 1; |
577 | 0 | } |
578 | 1 | if ( i != 0 ) |
579 | 0 | { |
580 | 0 | static bool warned_once; |
581 | 0 |
582 | 0 | if ( !test_and_set_bool(warned_once) ) |
583 | 0 | printk("Platform timer appears to have unexpectedly wrapped " |
584 | 0 | "%u%s times.\n", i, (i == 10) ? " or more" : ""); |
585 | 0 | } |
586 | 1 | |
587 | 1 | spin_unlock_irq(&platform_timer_lock); |
588 | 1 | |
589 | 1 | set_timer(&plt_overflow_timer, NOW() + plt_overflow_period); |
590 | 1 | } |
591 | | |
592 | | static s_time_t read_platform_stime(u64 *stamp) |
593 | 111 | { |
594 | 111 | u64 plt_counter, count; |
595 | 111 | s_time_t stime; |
596 | 111 | |
597 | 111 | ASSERT(!local_irq_is_enabled()); |
598 | 111 | |
599 | 111 | spin_lock(&platform_timer_lock); |
600 | 111 | plt_counter = plt_src.read_counter(); |
601 | 111 | count = plt_stamp64 + ((plt_counter - plt_stamp) & plt_mask); |
602 | 111 | stime = __read_platform_stime(count); |
603 | 111 | spin_unlock(&platform_timer_lock); |
604 | 111 | |
605 | 111 | if ( unlikely(stamp) ) |
606 | 0 | *stamp = plt_counter; |
607 | 111 | |
608 | 111 | return stime; |
609 | 111 | } |
610 | | |
611 | | static void platform_time_calibration(void) |
612 | 88 | { |
613 | 88 | u64 count; |
614 | 88 | s_time_t stamp; |
615 | 88 | unsigned long flags; |
616 | 88 | |
617 | 88 | spin_lock_irqsave(&platform_timer_lock, flags); |
618 | 88 | count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask); |
619 | 88 | stamp = __read_platform_stime(count); |
620 | 88 | stime_platform_stamp = stamp; |
621 | 88 | platform_timer_stamp = count; |
622 | 88 | spin_unlock_irqrestore(&platform_timer_lock, flags); |
623 | 88 | } |
624 | | |
625 | | static void resume_platform_timer(void) |
626 | 0 | { |
627 | 0 | /* Timer source can be reset when coming back from S3 to S0 */ |
628 | 0 | if ( plt_src.resume ) |
629 | 0 | plt_src.resume(&plt_src); |
630 | 0 |
631 | 0 | plt_stamp64 = platform_timer_stamp; |
632 | 0 | plt_stamp = plt_src.read_counter(); |
633 | 0 | } |
634 | | |
635 | | static void __init reset_platform_timer(void) |
636 | 0 | { |
637 | 0 | /* Deactivate any timers running */ |
638 | 0 | kill_timer(&plt_overflow_timer); |
639 | 0 | kill_timer(&calibration_timer); |
640 | 0 |
641 | 0 | /* Reset counters and stamps */ |
642 | 0 | spin_lock_irq(&platform_timer_lock); |
643 | 0 | plt_stamp = 0; |
644 | 0 | plt_stamp64 = 0; |
645 | 0 | platform_timer_stamp = 0; |
646 | 0 | stime_platform_stamp = 0; |
647 | 0 | spin_unlock_irq(&platform_timer_lock); |
648 | 0 | } |
649 | | |
650 | | static s64 __init try_platform_timer(struct platform_timesource *pts) |
651 | 1 | { |
652 | 1 | s64 rc = pts->init(pts); |
653 | 1 | |
654 | 1 | if ( rc <= 0 ) |
655 | 0 | return rc; |
656 | 1 | |
657 | 1 | /* We have a platform timesource already so reset it */ |
658 | 1 | if ( plt_src.counter_bits != 0 ) |
659 | 0 | reset_platform_timer(); |
660 | 1 | |
661 | 1 | plt_mask = (u64)~0ull >> (64 - pts->counter_bits); |
662 | 1 | |
663 | 1 | set_time_scale(&plt_scale, pts->frequency); |
664 | 1 | |
665 | 1 | plt_overflow_period = scale_delta( |
666 | 1 | 1ull << (pts->counter_bits - 1), &plt_scale); |
667 | 1 | plt_src = *pts; |
668 | 1 | |
669 | 1 | return rc; |
670 | 1 | } |
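
plt_overflow_period is the time the hardware counter takes to cover half its range, which bounds how often plt_overflow() must run for the (count - plt_stamp) & plt_mask accumulation to stay unambiguous. Illustrative figures: a 32-bit HPET at 14.318 MHz gives 2^31 / 14318180, about 150 s; the 32-bit-extended PIT at 1.193 MHz gives roughly 30 minutes; the bare 24-bit ACPI PM timer gives 2^23 / 3579545, about 2.3 s.
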
671 | | |
672 | | static u64 __init init_platform_timer(void) |
673 | 1 | { |
674 | 1 | static struct platform_timesource * __initdata plt_timers[] = { |
675 | 1 | &plt_hpet, &plt_pmtimer, &plt_pit |
676 | 1 | }; |
677 | 1 | |
678 | 1 | struct platform_timesource *pts = NULL; |
679 | 1 | unsigned int i; |
680 | 1 | s64 rc = -1; |
681 | 1 | |
682 | 1 | /* clocksource=tsc is initialized via __initcalls (when CPUs are up). */ |
683 | 1 | if ( (opt_clocksource[0] != '\0') && strcmp(opt_clocksource, "tsc") ) |
684 | 0 | { |
685 | 0 | for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ ) |
686 | 0 | { |
687 | 0 | pts = plt_timers[i]; |
688 | 0 | if ( !strcmp(opt_clocksource, pts->id) ) |
689 | 0 | { |
690 | 0 | rc = try_platform_timer(pts); |
691 | 0 | break; |
692 | 0 | } |
693 | 0 | } |
694 | 0 |
695 | 0 | if ( rc <= 0 ) |
696 | 0 | printk("WARNING: %s clocksource '%s'.\n", |
697 | 0 | (rc == 0) ? "Could not initialise" : "Unrecognised", |
698 | 0 | opt_clocksource); |
699 | 0 | } |
700 | 1 | |
701 | 1 | if ( rc <= 0 ) |
702 | 1 | { |
703 | 1 | for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ ) |
704 | 1 | { |
705 | 1 | pts = plt_timers[i]; |
706 | 1 | if ( (rc = try_platform_timer(pts)) > 0 ) |
707 | 1 | break; |
708 | 1 | } |
709 | 1 | } |
710 | 1 | |
711 | 1 | BUG_ON(rc <= 0); |
712 | 1 | |
713 | 1 | printk("Platform timer is %s %s\n", |
714 | 1 | freq_string(pts->frequency), pts->name); |
715 | 1 | |
716 | 1 | return rc; |
717 | 1 | } |
718 | | |
719 | | u64 stime2tsc(s_time_t stime) |
720 | 2.09M | { |
721 | 2.09M | struct cpu_time *t; |
722 | 2.09M | struct time_scale sys_to_tsc; |
723 | 2.09M | s_time_t stime_delta; |
724 | 2.09M | |
725 | 2.09M | t = &this_cpu(cpu_time); |
726 | 2.09M | sys_to_tsc = scale_reciprocal(t->tsc_scale); |
727 | 2.09M | |
728 | 2.09M | stime_delta = stime - t->stamp.local_stime; |
729 | 2.09M | if ( stime_delta < 0 ) |
730 | 0 | stime_delta = 0; |
731 | 2.09M | |
732 | 2.09M | return t->stamp.local_tsc + scale_delta(stime_delta, &sys_to_tsc); |
733 | 2.09M | } |
734 | | |
735 | | void cstate_restore_tsc(void) |
736 | 1.40M | { |
737 | 1.40M | if ( boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ) |
738 | 1.43M | return; |
739 | 1.40M | |
740 | 18.4E | write_tsc(stime2tsc(read_platform_stime(NULL))); |
741 | 18.4E | } |
742 | | |
743 | | /*************************************************************************** |
744 | | * CMOS Timer functions |
745 | | ***************************************************************************/ |
746 | | |
747 | | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. |
748 | | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 |
749 | | * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. |
750 | | * |
751 | | * [For the Julian calendar (which was used in Russia before 1917, |
752 | | * Britain & colonies before 1752, anywhere else before 1582, |
753 | | * and is still in use by some communities) leave out the |
754 | | * -year/100+year/400 terms, and add 10.] |
755 | | * |
756 | | * This algorithm was first published by Gauss (I think). |
757 | | * |
758 | | * WARNING: this function will overflow on 2106-02-07 06:28:16 on |
759 | | * machines where long is 32-bit! (However, as time_t is signed, we |
760 | | * will already get problems at other places on 2038-01-19 03:14:08) |
761 | | */ |
762 | | unsigned long |
763 | | mktime (unsigned int year, unsigned int mon, |
764 | | unsigned int day, unsigned int hour, |
765 | | unsigned int min, unsigned int sec) |
766 | 1 | { |
767 | 1 | /* 1..12 -> 11,12,1..10: put Feb last since it has a leap day. */ |
768 | 1 | if ( 0 >= (int) (mon -= 2) ) |
769 | 0 | { |
770 | 0 | mon += 12; |
771 | 0 | year -= 1; |
772 | 0 | } |
773 | 1 | |
774 | 1 | return ((((unsigned long)(year/4 - year/100 + year/400 + 367*mon/12 + day)+ |
775 | 1 | year*365 - 719499 |
776 | 1 | )*24 + hour /* now have hours */ |
777 | 1 | )*60 + min /* now have minutes */ |
778 | 1 | )*60 + sec; /* finally seconds */ |
779 | 1 | } |
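
A quick sanity check of the formula: mktime(1970, 1, 1, 0, 0, 0) remaps January to month 11 of 1969, giving 1969/4 - 1969/100 + 1969/400 + 367*11/12 + 1 + 1969*365 - 719499 = 492 - 19 + 4 + 336 + 1 + 718685 - 719499 = 0 days, hence 0 seconds: the Unix epoch, as expected.
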
780 | | |
781 | | struct rtc_time { |
782 | | unsigned int year, mon, day, hour, min, sec; |
783 | | }; |
784 | | |
785 | | static void __get_cmos_time(struct rtc_time *rtc) |
786 | 1 | { |
787 | 1 | rtc->sec = CMOS_READ(RTC_SECONDS); |
788 | 1 | rtc->min = CMOS_READ(RTC_MINUTES); |
789 | 1 | rtc->hour = CMOS_READ(RTC_HOURS); |
790 | 1 | rtc->day = CMOS_READ(RTC_DAY_OF_MONTH); |
791 | 1 | rtc->mon = CMOS_READ(RTC_MONTH); |
792 | 1 | rtc->year = CMOS_READ(RTC_YEAR); |
793 | 1 | |
794 | 1 | if ( RTC_ALWAYS_BCD || !(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) ) |
795 | 1 | { |
796 | 1 | BCD_TO_BIN(rtc->sec); |
797 | 1 | BCD_TO_BIN(rtc->min); |
798 | 1 | BCD_TO_BIN(rtc->hour); |
799 | 1 | BCD_TO_BIN(rtc->day); |
800 | 1 | BCD_TO_BIN(rtc->mon); |
801 | 1 | BCD_TO_BIN(rtc->year); |
802 | 1 | } |
803 | 1 | |
804 | 1 | if ( (rtc->year += 1900) < 1970 ) |
805 | 1 | rtc->year += 100; |
806 | 1 | } |
807 | | |
808 | | static unsigned long get_cmos_time(void) |
809 | 1 | { |
810 | 1 | unsigned long res, flags; |
811 | 1 | struct rtc_time rtc; |
812 | 1 | unsigned int seconds = 60; |
813 | 1 | static bool __read_mostly cmos_rtc_probe; |
814 | 1 | boolean_param("cmos-rtc-probe", cmos_rtc_probe); |
815 | 1 | |
816 | 1 | if ( efi_enabled(EFI_RS) ) |
817 | 0 | { |
818 | 0 | res = efi_get_time(); |
819 | 0 | if ( res ) |
820 | 0 | return res; |
821 | 0 | } |
822 | 1 | |
823 | 1 | if ( likely(!(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC)) ) |
824 | 1 | cmos_rtc_probe = false; |
825 | 0 | else if ( system_state < SYS_STATE_smp_boot && !cmos_rtc_probe ) |
826 | 0 | panic("System with no CMOS RTC advertised must be booted from EFI" |
827 | 0 | " (or with command line option \"cmos-rtc-probe\")"); |
828 | 1 | |
829 | 1 | for ( ; ; ) |
830 | 1 | { |
831 | 1 | s_time_t start, t1, t2; |
832 | 1 | |
833 | 1 | spin_lock_irqsave(&rtc_lock, flags); |
834 | 1 | |
835 | 1 | /* read RTC exactly on falling edge of update flag */ |
836 | 1 | start = NOW(); |
837 | 152k | do { /* may take up to 1 second... */ |
838 | 152k | t1 = NOW() - start; |
839 | 152k | } while ( !(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) && |
840 | 152k | t1 <= SECONDS(1) ); |
841 | 1 | |
842 | 1 | start = NOW(); |
843 | 577 | do { /* must try at least 2.228 ms */ |
844 | 577 | t2 = NOW() - start; |
845 | 577 | } while ( (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) && |
846 | 576 | t2 < MILLISECS(3) ); |
847 | 1 | |
848 | 1 | __get_cmos_time(&rtc); |
849 | 1 | |
850 | 1 | spin_unlock_irqrestore(&rtc_lock, flags); |
851 | 1 | |
852 | 1 | if ( likely(!cmos_rtc_probe) || |
853 | 0 | t1 > SECONDS(1) || t2 >= MILLISECS(3) || |
854 | 0 | rtc.sec >= 60 || rtc.min >= 60 || rtc.hour >= 24 || |
855 | 0 | !rtc.day || rtc.day > 31 || |
856 | 0 | !rtc.mon || rtc.mon > 12 ) |
857 | 1 | break; |
858 | 1 | |
859 | 0 | if ( seconds < 60 ) |
860 | 0 | { |
861 | 0 | if ( rtc.sec != seconds ) |
862 | 0 | cmos_rtc_probe = false; |
863 | 0 | break; |
864 | 0 | } |
865 | 0 |
866 | 0 | process_pending_softirqs(); |
867 | 0 |
868 | 0 | seconds = rtc.sec; |
869 | 0 | } |
870 | 1 | |
871 | 1 | if ( unlikely(cmos_rtc_probe) ) |
872 | 0 | panic("No CMOS RTC found - system must be booted from EFI"); |
873 | 1 | |
874 | 1 | return mktime(rtc.year, rtc.mon, rtc.day, rtc.hour, rtc.min, rtc.sec); |
875 | 1 | } |
876 | | |
877 | | /*************************************************************************** |
878 | | * System Time |
879 | | ***************************************************************************/ |
880 | | |
881 | | s_time_t get_s_time_fixed(u64 at_tsc) |
882 | 20.0M | { |
883 | 20.0M | const struct cpu_time *t = &this_cpu(cpu_time); |
884 | 20.0M | u64 tsc, delta; |
885 | 20.0M | s_time_t now; |
886 | 20.0M | |
887 | 20.0M | if ( at_tsc ) |
888 | 416 | tsc = at_tsc; |
889 | 20.0M | else |
890 | 20.0M | tsc = rdtsc_ordered(); |
891 | 20.0M | delta = tsc - t->stamp.local_tsc; |
892 | 20.0M | now = t->stamp.local_stime + scale_delta(delta, &t->tsc_scale); |
893 | 20.0M | |
894 | 20.0M | return now; |
895 | 20.0M | } |
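
System time is thus a per-CPU linear extrapolation from the last calibration stamp: now = stamp.local_stime + scale_delta(tsc - stamp.local_tsc, &tsc_scale). Calibration only ever moves the (stamp, tsc_scale) pair forward; between calibration epochs every reading is derived purely from the local TSC, with no shared state touched.
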
896 | | |
897 | | s_time_t get_s_time(void) |
898 | 19.9M | { |
899 | 19.9M | return get_s_time_fixed(0); |
900 | 19.9M | } |
901 | | |
902 | | uint64_t tsc_ticks2ns(uint64_t ticks) |
903 | 0 | { |
904 | 0 | struct cpu_time *t = &this_cpu(cpu_time); |
905 | 0 |
906 | 0 | return scale_delta(ticks, &t->tsc_scale); |
907 | 0 | } |
908 | | |
909 | | static void __update_vcpu_system_time(struct vcpu *v, int force) |
910 | 165k | { |
911 | 165k | const struct cpu_time *t; |
912 | 165k | struct vcpu_time_info *u, _u = {}; |
913 | 165k | struct domain *d = v->domain; |
914 | 165k | s_time_t tsc_stamp; |
915 | 165k | |
916 | 165k | if ( v->vcpu_info == NULL ) |
917 | 66.5k | return; |
918 | 165k | |
919 | 98.9k | t = &this_cpu(cpu_time); |
920 | 98.9k | u = &vcpu_info(v, time); |
921 | 98.9k | |
922 | 98.9k | if ( d->arch.vtsc ) |
923 | 0 | { |
924 | 0 | s_time_t stime = t->stamp.local_stime; |
925 | 0 |
926 | 0 | if ( is_hvm_domain(d) ) |
927 | 0 | { |
928 | 0 | struct pl_time *pl = v->domain->arch.hvm_domain.pl_time; |
929 | 0 |
930 | 0 | stime += pl->stime_offset + v->arch.hvm_vcpu.stime_offset; |
931 | 0 | if ( stime >= 0 ) |
932 | 0 | tsc_stamp = gtime_to_gtsc(d, stime); |
933 | 0 | else |
934 | 0 | tsc_stamp = -gtime_to_gtsc(d, -stime); |
935 | 0 | } |
936 | 0 | else |
937 | 0 | tsc_stamp = gtime_to_gtsc(d, stime); |
938 | 0 |
939 | 0 | _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac; |
940 | 0 | _u.tsc_shift = d->arch.vtsc_to_ns.shift; |
941 | 0 | } |
942 | 98.9k | else |
943 | 98.9k | { |
944 | 99.4k | if ( is_hvm_domain(d) && hvm_tsc_scaling_supported ) |
945 | 0 | { |
946 | 0 | tsc_stamp = hvm_scale_tsc(d, t->stamp.local_tsc); |
947 | 0 | _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac; |
948 | 0 | _u.tsc_shift = d->arch.vtsc_to_ns.shift; |
949 | 0 | } |
950 | 98.9k | else |
951 | 98.9k | { |
952 | 98.9k | tsc_stamp = t->stamp.local_tsc; |
953 | 98.9k | _u.tsc_to_system_mul = t->tsc_scale.mul_frac; |
954 | 98.9k | _u.tsc_shift = t->tsc_scale.shift; |
955 | 98.9k | } |
956 | 98.9k | } |
957 | 98.9k | |
958 | 98.9k | _u.tsc_timestamp = tsc_stamp; |
959 | 98.9k | _u.system_time = t->stamp.local_stime; |
960 | 98.9k | |
961 | 98.9k | /* |
962 | 98.9k | * It's expected that domains cope with this bit changing on every |
963 | 98.9k | * pvclock read to check whether they can rely solely on this tuple |
964 | 98.9k | * or if it further requires monotonicity checks with other vcpus. |
965 | 98.9k | */ |
966 | 98.9k | if ( clocksource_is_tsc() ) |
967 | 0 | _u.flags |= XEN_PVCLOCK_TSC_STABLE_BIT; |
968 | 98.9k | |
969 | 98.9k | if ( is_hvm_domain(d) ) |
970 | 99.4k | _u.tsc_timestamp += v->arch.hvm_vcpu.cache_tsc_offset; |
971 | 98.9k | |
972 | 98.9k | /* Don't bother unless timestamp record has changed or we are forced. */ |
973 | 98.9k | _u.version = u->version; /* make versions match for memcmp test */ |
974 | 100k | if ( !force && !memcmp(u, &_u, sizeof(_u)) ) |
975 | 99.3k | return; |
976 | 98.9k | |
977 | 98.9k | /* 1. Update guest kernel version. */ |
978 | 18.4E | _u.version = u->version = version_update_begin(u->version); |
979 | 18.4E | wmb(); |
980 | 18.4E | /* 2. Update all other guest kernel fields. */ |
981 | 18.4E | *u = _u; |
982 | 18.4E | wmb(); |
983 | 18.4E | /* 3. Update guest kernel version. */ |
984 | 18.4E | u->version = version_update_end(u->version); |
985 | 18.4E | |
986 | 18.4E | if ( !update_secondary_system_time(v, &_u) && is_pv_domain(d) && |
987 | 0 | !is_pv_32bit_domain(d) && !(v->arch.flags & TF_kernel_mode) ) |
988 | 0 | v->arch.pv_vcpu.pending_system_time = _u; |
989 | 18.4E | } |
990 | | |
991 | | bool update_secondary_system_time(struct vcpu *v, |
992 | | struct vcpu_time_info *u) |
993 | 1.41k | { |
994 | 1.41k | XEN_GUEST_HANDLE(vcpu_time_info_t) user_u = v->arch.time_info_guest; |
995 | 1.41k | struct guest_memory_policy policy = |
996 | 1.41k | { .smap_policy = SMAP_CHECK_ENABLED, .nested_guest_mode = false }; |
997 | 1.41k | |
998 | 1.41k | if ( guest_handle_is_null(user_u) ) |
999 | 1.41k | return true; |
1000 | 1.41k | |
1001 | 18.4E | update_guest_memory_policy(v, &policy); |
1002 | 18.4E | |
1003 | 18.4E | /* 1. Update userspace version. */ |
1004 | 18.4E | if ( __copy_field_to_guest(user_u, u, version) == sizeof(u->version) ) |
1005 | 0 | { |
1006 | 0 | update_guest_memory_policy(v, &policy); |
1007 | 0 | return false; |
1008 | 0 | } |
1009 | 18.4E | wmb(); |
1010 | 18.4E | /* 2. Update all other userspace fields. */ |
1011 | 18.4E | __copy_to_guest(user_u, u, 1); |
1012 | 18.4E | wmb(); |
1013 | 18.4E | /* 3. Update userspace version. */ |
1014 | 18.4E | u->version = version_update_end(u->version); |
1015 | 18.4E | __copy_field_to_guest(user_u, u, version); |
1016 | 18.4E | |
1017 | 18.4E | update_guest_memory_policy(v, &policy); |
1018 | 18.4E | |
1019 | 18.4E | return true; |
1020 | 18.4E | } |
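
The three-step version update in both functions above is the writer half of a seqlock-style protocol: version_update_begin() makes the version odd while the record is inconsistent, and version_update_end() makes it even again. A sketch of the matching reader, as a guest would implement it against a struct vcpu_time_info *u (the standard pvclock read pattern, not code from this file):

    uint32_t ver;
    struct vcpu_time_info snapshot;

    do {
        ver = u->version;          /* 1. sample the version */
        smp_rmb();
        snapshot = *u;             /* 2. copy out the record */
        smp_rmb();
    } while ( (ver & 1) ||         /* writer was mid-update, or ...   */
              ver != u->version ); /* ... the record changed under us */
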
1021 | | |
1022 | | void update_vcpu_system_time(struct vcpu *v) |
1023 | 165k | { |
1024 | 165k | __update_vcpu_system_time(v, 0); |
1025 | 165k | } |
1026 | | |
1027 | | void force_update_vcpu_system_time(struct vcpu *v) |
1028 | 0 | { |
1029 | 0 | __update_vcpu_system_time(v, 1); |
1030 | 0 | } |
1031 | | |
1032 | | static void update_domain_rtc(void) |
1033 | 0 | { |
1034 | 0 | struct domain *d; |
1035 | 0 |
1036 | 0 | rcu_read_lock(&domlist_read_lock); |
1037 | 0 |
1038 | 0 | for_each_domain ( d ) |
1039 | 0 | if ( is_hvm_domain(d) ) |
1040 | 0 | rtc_update_clock(d); |
1041 | 0 |
1042 | 0 | rcu_read_unlock(&domlist_read_lock); |
1043 | 0 | } |
1044 | | |
1045 | | void domain_set_time_offset(struct domain *d, int64_t time_offset_seconds) |
1046 | 0 | { |
1047 | 0 | d->time_offset_seconds = time_offset_seconds; |
1048 | 0 | if ( is_hvm_domain(d) ) |
1049 | 0 | rtc_update_clock(d); |
1050 | 0 | update_domain_wallclock_time(d); |
1051 | 0 | } |
1052 | | |
1053 | | int cpu_frequency_change(u64 freq) |
1054 | 0 | { |
1055 | 0 | struct cpu_time *t = &this_cpu(cpu_time); |
1056 | 0 | u64 curr_tsc; |
1057 | 0 |
|
1058 | 0 | /* Sanity check: CPU frequency allegedly dropping below 1MHz? */ |
1059 | 0 | if ( freq < 1000000u ) |
1060 | 0 | { |
1061 | 0 | printk(XENLOG_WARNING "Rejecting CPU frequency change " |
1062 | 0 | "to %"PRIu64" Hz\n", freq); |
1063 | 0 | return -EINVAL; |
1064 | 0 | } |
1065 | 0 |
1066 | 0 | local_irq_disable(); |
1067 | 0 | /* Platform time /first/, as we may be delayed by platform_timer_lock. */ |
1068 | 0 | t->stamp.master_stime = read_platform_stime(NULL); |
1069 | 0 | curr_tsc = rdtsc_ordered(); |
1070 | 0 | /* TSC-extrapolated time may be bogus after frequency change. */ |
1071 | 0 | /*t->stamp.local_stime = get_s_time_fixed(curr_tsc);*/ |
1072 | 0 | t->stamp.local_stime = t->stamp.master_stime; |
1073 | 0 | t->stamp.local_tsc = curr_tsc; |
1074 | 0 | set_time_scale(&t->tsc_scale, freq); |
1075 | 0 | local_irq_enable(); |
1076 | 0 |
1077 | 0 | update_vcpu_system_time(current); |
1078 | 0 |
1079 | 0 | /* A full epoch should pass before we check for deviation. */ |
1080 | 0 | if ( smp_processor_id() == 0 ) |
1081 | 0 | { |
1082 | 0 | set_timer(&calibration_timer, NOW() + EPOCH); |
1083 | 0 | platform_time_calibration(); |
1084 | 0 | } |
1085 | 0 |
1086 | 0 | return 0; |
1087 | 0 | } |
1088 | | |
1089 | | /* Per-CPU communication between rendezvous IRQ and softirq handler. */ |
1090 | | static DEFINE_PER_CPU(struct cpu_time_stamp, cpu_calibration); |
1091 | | |
1092 | | /* Softirq handler for per-CPU time calibration. */ |
1093 | | static void local_time_calibration(void) |
1094 | 980 | { |
1095 | 980 | struct cpu_time *t = &this_cpu(cpu_time); |
1096 | 980 | const struct cpu_time_stamp *c = &this_cpu(cpu_calibration); |
1097 | 980 | |
1098 | 980 | /* |
1099 | 980 | * System (extrapolated from local and master oscillators) and TSC |
1100 | 980 | * timestamps, taken during this calibration and the previous one. |
1101 | 980 | */ |
1102 | 980 | struct cpu_time_stamp prev, curr; |
1103 | 980 | |
1104 | 980 | /* |
1105 | 980 | * System time and TSC ticks elapsed during the previous calibration |
1106 | 980 | * 'epoch'. These values are down-shifted to fit in 32 bits. |
1107 | 980 | */ |
1108 | 980 | u64 stime_elapsed64, tsc_elapsed64; |
1109 | 980 | u32 stime_elapsed32, tsc_elapsed32; |
1110 | 980 | |
1111 | 980 | /* Error correction to slow down a fast local clock. */ |
1112 | 980 | u32 error_factor = 0; |
1113 | 980 | |
1114 | 980 | /* Calculated TSC shift to ensure 32-bit scale multiplier. */ |
1115 | 980 | int tsc_shift = 0; |
1116 | 980 | |
1117 | 980 | /* The overall calibration scale multiplier. */ |
1118 | 980 | u32 calibration_mul_frac; |
1119 | 980 | |
1120 | 980 | if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ) |
1121 | 890 | { |
1122 | 890 | /* Atomically read cpu_calibration struct and write cpu_time struct. */ |
1123 | 890 | local_irq_disable(); |
1124 | 890 | t->stamp = *c; |
1125 | 890 | local_irq_enable(); |
1126 | 890 | update_vcpu_system_time(current); |
1127 | 890 | goto out; |
1128 | 890 | } |
1129 | 980 | |
1130 | 90 | prev = t->stamp; |
1131 | 90 | |
1132 | 90 | /* Disabling IRQs ensures we atomically read cpu_calibration struct. */ |
1133 | 90 | local_irq_disable(); |
1134 | 90 | curr = *c; |
1135 | 90 | local_irq_enable(); |
1136 | 90 | |
1137 | 90 | #if 0 |
1138 | | printk("PRE%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64"\n", |
1139 | | smp_processor_id(), prev.local_tsc, prev.local_stime, prev.master_stime); |
1140 | | printk("CUR%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64 |
1141 | | " -> %"PRId64"\n", |
1142 | | smp_processor_id(), curr.local_tsc, curr.local_stime, curr.master_stime, |
1143 | | curr.master_stime - curr.local_stime); |
1144 | | #endif |
1145 | 90 | |
1146 | 90 | /* Local time warps forward if it lags behind master time. */ |
1147 | 90 | if ( curr.local_stime < curr.master_stime ) |
1148 | 0 | curr.local_stime = curr.master_stime; |
1149 | 90 | |
1150 | 90 | stime_elapsed64 = curr.master_stime - prev.master_stime; |
1151 | 90 | tsc_elapsed64 = curr.local_tsc - prev.local_tsc; |
1152 | 90 | |
1153 | 90 | /* |
1154 | 90 | * Weirdness can happen if we lose sync with the platform timer. |
1155 | 90 | * We could be smarter here: resync platform timer with local timer? |
1156 | 90 | */ |
1157 | 90 | if ( ((s64)stime_elapsed64 < (EPOCH / 2)) ) |
1158 | 0 | goto out; |
1159 | 90 | |
1160 | 90 | /* |
1161 | 90 | * Calculate error-correction factor. This only slows down a fast local |
1162 | 90 | * clock (slow clocks are warped forwards). The scale factor is clamped |
1163 | 90 | * to >= 0.5. |
1164 | 90 | */ |
1165 | 90 | if ( curr.local_stime != curr.master_stime ) |
1166 | 0 | { |
1167 | 0 | u64 local_stime_err = curr.local_stime - curr.master_stime; |
1168 | 0 |
1169 | 0 | if ( local_stime_err > EPOCH ) |
1170 | 0 | local_stime_err = EPOCH; |
1171 | 0 | error_factor = div_frac(EPOCH, EPOCH + (u32)local_stime_err); |
1172 | 0 | } |
1173 | 90 | |
1174 | 90 | /* |
1175 | 90 | * We require 0 < stime_elapsed < 2^31. |
1176 | 90 | * This allows us to binary shift a 32-bit tsc_elapsed such that: |
1177 | 90 | * stime_elapsed < tsc_elapsed <= 2*stime_elapsed |
1178 | 90 | */ |
1179 | 90 | while ( ((u32)stime_elapsed64 != stime_elapsed64) || |
1180 | 0 | ((s32)stime_elapsed64 < 0) ) |
1181 | 0 | { |
1182 | 0 | stime_elapsed64 >>= 1; |
1183 | 0 | tsc_elapsed64 >>= 1; |
1184 | 0 | } |
1185 | 90 | |
1186 | 90 | /* stime_master_diff now fits in a 32-bit word. */ |
1187 | 90 | stime_elapsed32 = (u32)stime_elapsed64; |
1188 | 90 | |
1189 | 90 | /* tsc_elapsed <= 2*stime_elapsed */ |
1190 | 90 | while ( tsc_elapsed64 > (stime_elapsed32 * 2) ) |
1191 | 0 | { |
1192 | 0 | tsc_elapsed64 >>= 1; |
1193 | 0 | tsc_shift--; |
1194 | 0 | } |
1195 | 90 | |
1196 | 90 | /* Local difference must now fit in 32 bits. */ |
1197 | 90 | ASSERT((u32)tsc_elapsed64 == tsc_elapsed64); |
1198 | 90 | tsc_elapsed32 = (u32)tsc_elapsed64; |
1199 | 90 | |
1200 | 90 | /* tsc_elapsed > stime_elapsed */ |
1201 | 90 | ASSERT(tsc_elapsed32 != 0); |
1202 | 90 | while ( tsc_elapsed32 <= stime_elapsed32 ) |
1203 | 0 | { |
1204 | 0 | tsc_elapsed32 <<= 1; |
1205 | 0 | tsc_shift++; |
1206 | 0 | } |
1207 | 90 | |
1208 | 90 | calibration_mul_frac = div_frac(stime_elapsed32, tsc_elapsed32); |
1209 | 90 | if ( error_factor != 0 ) |
1210 | 0 | calibration_mul_frac = mul_frac(calibration_mul_frac, error_factor); |
1211 | 90 | |
1212 | 90 | #if 0 |
1213 | | printk("---%d: %08x %08x %d\n", smp_processor_id(), |
1214 | | error_factor, calibration_mul_frac, tsc_shift); |
1215 | | #endif |
1216 | 90 | |
1217 | 90 | /* Record new timestamp information, atomically w.r.t. interrupts. */ |
1218 | 90 | local_irq_disable(); |
1219 | 90 | t->tsc_scale.mul_frac = calibration_mul_frac; |
1220 | 90 | t->tsc_scale.shift = tsc_shift; |
1221 | 90 | t->stamp = curr; |
1222 | 90 | local_irq_enable(); |
1223 | 90 | |
1224 | 90 | update_vcpu_system_time(current); |
1225 | 90 | |
1226 | 932 | out: |
1227 | 932 | if ( smp_processor_id() == 0 ) |
1228 | 88 | { |
1229 | 88 | set_timer(&calibration_timer, NOW() + EPOCH); |
1230 | 88 | platform_time_calibration(); |
1231 | 88 | } |
1232 | 932 | } |
1233 | | |
1234 | | /* |
1235 | | * TSC Reliability check |
1236 | | */ |
1237 | | |
1238 | | /* |
1239 | | * The Linux original version of this function is |
1240 | | * Copyright (c) 2006, Red Hat, Inc., Ingo Molnar |
1241 | | */ |
1242 | | static void check_tsc_warp(unsigned long tsc_khz, unsigned long *max_warp) |
1243 | 4 | { |
1244 | 4 | static DEFINE_SPINLOCK(sync_lock); |
1245 | 4 | static cycles_t last_tsc; |
1246 | 4 | |
1247 | 4 | cycles_t start, now, prev, end; |
1248 | 4 | int i; |
1249 | 4 | |
1250 | 4 | start = rdtsc_ordered(); |
1251 | 4 | |
1252 | 4 | /* The measurement runs for 20 msecs: */ |
1253 | 4 | end = start + tsc_khz * 20ULL; |
1254 | 4 | now = start; |
1255 | 4 | |
1256 | 10.3k | for ( i = 0; ; i++ ) |
1257 | 10.3k | { |
1258 | 10.3k | /* |
1259 | 10.3k | * We take the global lock, measure TSC, save the |
1260 | 10.3k | * previous TSC that was measured (possibly on |
1261 | 10.3k | * another CPU) and update the previous TSC timestamp. |
1262 | 10.3k | */ |
1263 | 10.3k | spin_lock(&sync_lock); |
1264 | 10.3k | prev = last_tsc; |
1265 | 10.3k | now = rdtsc_ordered(); |
1266 | 10.3k | last_tsc = now; |
1267 | 10.3k | spin_unlock(&sync_lock); |
1268 | 10.3k | |
1269 | 10.3k | /* |
1270 | 10.3k | * Be nice every now and then (and also check whether measurement is |
1271 | 10.3k | * done [we also insert a 10 million loops safety exit, so we don't |
1272 | 10.3k | * lock up in case the TSC readout is totally broken]): |
1273 | 10.3k | */ |
1274 | 10.3k | if ( unlikely(!(i & 7)) ) |
1275 | 1.31k | { |
1276 | 1.31k | if ( (now > end) || (i > 10000000) ) |
1277 | 12 | break; |
1278 | 1.29k | cpu_relax(); |
1279 | 1.29k | /*touch_nmi_watchdog();*/ |
1280 | 1.29k | } |
1281 | 10.3k | |
1282 | 10.3k | /* |
1283 | 10.3k | * Outside the critical section we can now see whether we saw a |
1284 | 10.3k | * time-warp of the TSC going backwards: |
1285 | 10.3k | */ |
1286 | 10.3k | if ( unlikely(prev > now) ) |
1287 | 5.54k | { |
1288 | 5.54k | spin_lock(&sync_lock); |
1289 | 5.54k | if ( *max_warp < prev - now ) |
1290 | 13 | *max_warp = prev - now; |
1291 | 5.54k | spin_unlock(&sync_lock); |
1292 | 5.54k | } |
1293 | 10.3k | } |
1294 | 4 | } |
1295 | | |
1296 | | static unsigned long tsc_max_warp, tsc_check_count; |
1297 | | static cpumask_t tsc_check_cpumask; |
1298 | | |
1299 | | static void tsc_check_slave(void *unused) |
1300 | 8 | { |
1301 | 8 | unsigned int cpu = smp_processor_id(); |
1302 | 8 | local_irq_disable(); |
1303 | 1.46k | while ( !cpumask_test_cpu(cpu, &tsc_check_cpumask) ) |
1304 | 1.45k | cpu_relax(); |
1305 | 8 | check_tsc_warp(cpu_khz, &tsc_max_warp); |
1306 | 8 | cpumask_clear_cpu(cpu, &tsc_check_cpumask); |
1307 | 8 | local_irq_enable(); |
1308 | 8 | } |
1309 | | |
1310 | | static void tsc_check_reliability(void) |
1311 | 1 | { |
1312 | 1 | unsigned int cpu = smp_processor_id(); |
1313 | 1 | static DEFINE_SPINLOCK(lock); |
1314 | 1 | |
1315 | 1 | spin_lock(&lock); |
1316 | 1 | |
1317 | 1 | tsc_check_count++; |
1318 | 1 | smp_call_function(tsc_check_slave, NULL, 0); |
1319 | 1 | cpumask_andnot(&tsc_check_cpumask, &cpu_online_map, cpumask_of(cpu)); |
1320 | 1 | local_irq_disable(); |
1321 | 1 | check_tsc_warp(cpu_khz, &tsc_max_warp); |
1322 | 1 | local_irq_enable(); |
1323 | 59 | while ( !cpumask_empty(&tsc_check_cpumask) ) |
1324 | 58 | cpu_relax(); |
1325 | 1 | |
1326 | 1 | spin_unlock(&lock); |
1327 | 1 | } |
1328 | | |
1329 | | /* |
1330 | | * Rendezvous for all CPUs in IRQ context. |
1331 | | * Master CPU snapshots the platform timer. |
1332 | | * All CPUs snapshot their local TSC and extrapolated system time. |
1333 | | */ |
1334 | | struct calibration_rendezvous { |
1335 | | cpumask_t cpu_calibration_map; |
1336 | | atomic_t semaphore; |
1337 | | s_time_t master_stime; |
1338 | | u64 master_tsc_stamp; |
1339 | | }; |
1340 | | |
1341 | | static void |
1342 | | time_calibration_rendezvous_tail(const struct calibration_rendezvous *r) |
1343 | 355 | { |
1344 | 355 | struct cpu_time_stamp *c = &this_cpu(cpu_calibration); |
1345 | 355 | |
1346 | 355 | c->local_tsc = rdtsc_ordered(); |
1347 | 355 | c->local_stime = get_s_time_fixed(c->local_tsc); |
1348 | 355 | c->master_stime = r->master_stime; |
1349 | 355 | |
1350 | 355 | raise_softirq(TIME_CALIBRATE_SOFTIRQ); |
1351 | 355 | } |
1352 | | |
1353 | | /* |
1354 | | * Keep TSCs in sync when they run at the same rate, but may stop in |
1355 | | * deep-sleep C states. |
1356 | | */ |
1357 | | static void time_calibration_tsc_rendezvous(void *_r) |
1358 | 685 | { |
1359 | 685 | int i; |
1360 | 685 | struct calibration_rendezvous *r = _r; |
1361 | 685 | unsigned int total_cpus = cpumask_weight(&r->cpu_calibration_map); |
1362 | 685 | |
1363 | 685 | /* Loop to get rid of cache effects on TSC skew. */ |
1364 | 2.59k | for ( i = 4; i >= 0; i-- ) |
1365 | 1.90k | { |
1366 | 1.90k | if ( smp_processor_id() == 0 ) |
1367 | 440 | { |
1368 | 64.2k | while ( atomic_read(&r->semaphore) != (total_cpus - 1) ) |
1369 | 63.8k | cpu_relax(); |
1370 | 440 | |
1371 | 440 | if ( r->master_stime == 0 ) |
1372 | 88 | { |
1373 | 88 | r->master_stime = read_platform_stime(NULL); |
1374 | 88 | r->master_tsc_stamp = rdtsc_ordered(); |
1375 | 88 | } |
1376 | 440 | atomic_inc(&r->semaphore); |
1377 | 440 | |
1378 | 440 | if ( i == 0 ) |
1379 | 88 | write_tsc(r->master_tsc_stamp); |
1380 | 440 | |
1381 | 2.12k | while ( atomic_read(&r->semaphore) != (2*total_cpus - 1) ) |
1382 | 1.68k | cpu_relax(); |
1383 | 440 | atomic_set(&r->semaphore, 0); |
1384 | 440 | } |
1385 | 1.90k | else |
1386 | 1.46k | { |
1387 | 1.46k | atomic_inc(&r->semaphore); |
1388 | 58.7k | while ( atomic_read(&r->semaphore) < total_cpus ) |
1389 | 57.3k | cpu_relax(); |
1390 | 1.46k | |
1391 | 1.46k | if ( i == 0 ) |
1392 | 220 | write_tsc(r->master_tsc_stamp); |
1393 | 1.46k | |
1394 | 1.46k | atomic_inc(&r->semaphore); |
1395 | 4.84k | while ( atomic_read(&r->semaphore) > total_cpus ) |
1396 | 3.37k | cpu_relax(); |
1397 | 1.46k | } |
1398 | 1.90k | } |
1399 | 685 | |
1400 | 685 | time_calibration_rendezvous_tail(r); |
1401 | 685 | } |
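
The semaphore implements a two-phase barrier, run five times to get rid of cache effects: in phase one all CPUs arrive so that CPU 0 can snapshot the platform time and TSC while the others are known to be spinning; phase two holds everyone together again so that the write_tsc() on the final iteration lands near-simultaneously on all CPUs. Stripped of the time-keeping work, the barrier skeleton looks like this (a sketch; sem and total_cpus correspond to r->semaphore and the count above):

    if ( smp_processor_id() == 0 )
    {
        while ( atomic_read(&sem) != total_cpus - 1 )
            cpu_relax();            /* wait for all slaves to arrive */
        /* ... master-only work ... */
        atomic_inc(&sem);           /* open phase two */
        while ( atomic_read(&sem) != 2 * total_cpus - 1 )
            cpu_relax();            /* wait for slaves to pass it */
        atomic_set(&sem, 0);        /* reset for the next iteration */
    }
    else
    {
        atomic_inc(&sem);           /* arrive */
        while ( atomic_read(&sem) < total_cpus )
            cpu_relax();            /* wait for the master's signal */
        atomic_inc(&sem);
        while ( atomic_read(&sem) > total_cpus )
            cpu_relax();            /* wait for the master's reset */
    }
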
1402 | | |
1403 | | /* Ordinary rendezvous function which does not modify TSC values. */ |
1404 | | static void time_calibration_std_rendezvous(void *_r) |
1405 | 0 | { |
1406 | 0 | struct calibration_rendezvous *r = _r; |
1407 | 0 | unsigned int total_cpus = cpumask_weight(&r->cpu_calibration_map); |
1408 | 0 |
1409 | 0 | if ( smp_processor_id() == 0 ) |
1410 | 0 | { |
1411 | 0 | while ( atomic_read(&r->semaphore) != (total_cpus - 1) ) |
1412 | 0 | cpu_relax(); |
1413 | 0 | r->master_stime = read_platform_stime(NULL); |
1414 | 0 | smp_wmb(); /* write r->master_stime /then/ signal */ |
1415 | 0 | atomic_inc(&r->semaphore); |
1416 | 0 | } |
1417 | 0 | else |
1418 | 0 | { |
1419 | 0 | atomic_inc(&r->semaphore); |
1420 | 0 | while ( atomic_read(&r->semaphore) != total_cpus ) |
1421 | 0 | cpu_relax(); |
1422 | 0 | smp_rmb(); /* receive signal /then/ read r->master_stime */ |
1423 | 0 | } |
1424 | 0 |
1425 | 0 | time_calibration_rendezvous_tail(r); |
1426 | 0 | } |
1427 | | |
1428 | | /* |
1429 | | * Rendezvous function used when clocksource is TSC and |
1430 | | * no CPU hotplug will be performed. |
1431 | | */ |
1432 | | static void time_calibration_nop_rendezvous(void *rv) |
1433 | 0 | { |
1434 | 0 | const struct calibration_rendezvous *r = rv; |
1435 | 0 | struct cpu_time_stamp *c = &this_cpu(cpu_calibration); |
1436 | 0 |
1437 | 0 | c->local_tsc = r->master_tsc_stamp; |
1438 | 0 | c->local_stime = r->master_stime; |
1439 | 0 | c->master_stime = r->master_stime; |
1440 | 0 |
1441 | 0 | raise_softirq(TIME_CALIBRATE_SOFTIRQ); |
1442 | 0 | } |
1443 | | |
1444 | | static void (*time_calibration_rendezvous_fn)(void *) = |
1445 | | time_calibration_std_rendezvous; |
1446 | | |
1447 | | static void time_calibration(void *unused) |
1448 | 88 | { |
1449 | 88 | struct calibration_rendezvous r = { |
1450 | 88 | .semaphore = ATOMIC_INIT(0) |
1451 | 88 | }; |
1452 | 88 | |
1453 | 88 | if ( clocksource_is_tsc() ) |
1454 | 0 | { |
1455 | 0 | local_irq_disable(); |
1456 | 0 | r.master_stime = read_platform_stime(&r.master_tsc_stamp); |
1457 | 0 | local_irq_enable(); |
1458 | 0 | } |
1459 | 88 | |
1460 | 88 | cpumask_copy(&r.cpu_calibration_map, &cpu_online_map); |
1461 | 88 | |
1462 | 88 | /* @wait=1 because we must wait for all cpus before freeing @r. */ |
1463 | 88 | on_selected_cpus(&r.cpu_calibration_map, |
1464 | 88 | time_calibration_rendezvous_fn, |
1465 | 88 | &r, 1); |
1466 | 88 | } |
1467 | | |
1468 | | static struct cpu_time_stamp ap_bringup_ref; |
1469 | | |
1470 | | void time_latch_stamps(void) |
1471 | 11 | { |
1472 | 11 | unsigned long flags; |
1473 | 11 | |
1474 | 11 | local_irq_save(flags); |
1475 | 11 | ap_bringup_ref.master_stime = read_platform_stime(NULL); |
1476 | 11 | ap_bringup_ref.local_tsc = rdtsc_ordered(); |
1477 | 11 | local_irq_restore(flags); |
1478 | 11 | |
1479 | 11 | ap_bringup_ref.local_stime = get_s_time_fixed(ap_bringup_ref.local_tsc); |
1480 | 11 | } |
1481 | | |
1482 | | void init_percpu_time(void) |
1483 | 12 | { |
1484 | 12 | struct cpu_time *t = &this_cpu(cpu_time); |
1485 | 12 | unsigned long flags; |
1486 | 12 | u64 tsc; |
1487 | 12 | s_time_t now; |
1488 | 12 | |
1489 | 12 | /* Initial estimate for TSC rate. */ |
1490 | 12 | t->tsc_scale = per_cpu(cpu_time, 0).tsc_scale; |
1491 | 12 | |
1492 | 12 | local_irq_save(flags); |
1493 | 12 | now = read_platform_stime(NULL); |
1494 | 12 | tsc = rdtsc_ordered(); |
1495 | 12 | local_irq_restore(flags); |
1496 | 12 | |
1497 | 12 | t->stamp.master_stime = now; |
1498 | 12 | /* |
1499 | 12 | * To avoid a discontinuity (TSC and platform clock can't be expected |
1500 | 12 | * to be in perfect sync), initialization here needs to match up with |
1501 | 12 | * local_time_calibration()'s decision whether to use its fast path. |
1502 | 12 | */ |
1503 | 12 | if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ) |
1504 | 12 | { |
1505 | 12 | if ( system_state < SYS_STATE_smp_boot ) |
1506 | 1 | now = get_s_time_fixed(tsc); |
1507 | 12 | else |
1508 | 11 | now += ap_bringup_ref.local_stime - ap_bringup_ref.master_stime; |
1509 | 12 | } |
1510 | 12 | t->stamp.local_tsc = tsc; |
1511 | 12 | t->stamp.local_stime = now; |
1512 | 12 | } |
1513 | | |
1514 | | /* |
1515 | | * On certain older Intel CPUs writing the TSC MSR clears the upper 32 bits. |
1516 | | * Obviously we must not use write_tsc() on such CPUs. |
1517 | | * |
1518 | | * Additionally, AMD specifies that being able to write the TSC MSR is not an |
1519 | | * architectural feature (and, contrary to what their manual says, also cannot be |
1520 | | * determined from CPUID bits). |
1521 | | */ |
1522 | | static void __init tsc_check_writability(void) |
1523 | 2 | { |
1524 | 2 | const char *what = NULL; |
1525 | 2 | uint64_t tsc; |
1526 | 2 | |
1527 | 2 | /* |
1528 | 2 | * If all CPUs are reported as synchronised and in sync, we never write |
1529 | 2 | * the TSCs (except unavoidably, when a CPU is physically hot-plugged). |
1530 | 2 | * Hence testing for writability is pointless and even harmful. |
1531 | 2 | */ |
1532 | 2 | if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) ) |
1533 | 1 | return; |
1534 | 2 | |
1535 | 1 | tsc = rdtsc(); |
1536 | 1 | if ( wrmsr_safe(MSR_IA32_TSC, 0) == 0 ) |
1537 | 1 | { |
1538 | 1 | uint64_t tmp, tmp2 = rdtsc(); |
1539 | 1 | |
1540 | 1 | write_tsc(tsc | (1ULL << 32)); |
1541 | 1 | tmp = rdtsc(); |
1542 | 1 | if ( ABS((s64)tmp - (s64)tmp2) < (1LL << 31) ) |
1543 | 0 | what = "only partially"; |
1544 | 1 | } |
1545 | 1 | else |
1546 | 0 | { |
1547 | 0 | what = "not"; |
1548 | 0 | } |
1549 | 1 | |
1550 | 1 | /* Nothing to do if the TSC is fully writable. */ |
1551 | 1 | if ( !what ) |
1552 | 1 | { |
1553 | 1 | /* |
1554 | 1 | * Paranoia - write back original TSC value. However, APs get synced |
1555 | 1 | * with BSP as they are brought up, so this doesn't much matter. |
1556 | 1 | */ |
1557 | 1 | write_tsc(tsc); |
1558 | 1 | return; |
1559 | 1 | } |
1560 | 1 | |
1561 | 0 | printk(XENLOG_WARNING "TSC %s writable\n", what); |
1562 | 0 |
|
1563 | 0 | /* time_calibration_tsc_rendezvous() must not be used */ |
1564 | 0 | setup_clear_cpu_cap(X86_FEATURE_CONSTANT_TSC); |
1565 | 0 |
1566 | 0 | /* cstate_restore_tsc() must not be used (or do nothing) */ |
1567 | 0 | if ( !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ) |
1568 | 0 | cpuidle_disable_deep_cstate(); |
1569 | 0 |
1570 | 0 | /* synchronize_tsc_slave() must do nothing */ |
1571 | 0 | disable_tsc_sync = true; |
1572 | 0 | } |
1573 | | |
1574 | | static void __init reset_percpu_time(void *unused) |
1575 | 0 | { |
1576 | 0 | struct cpu_time *t = &this_cpu(cpu_time); |
1577 | 0 |
1578 | 0 | t->stamp.local_tsc = boot_tsc_stamp; |
1579 | 0 | t->stamp.local_stime = 0; |
1580 | 0 | t->stamp.local_stime = get_s_time_fixed(boot_tsc_stamp); |
1581 | 0 | t->stamp.master_stime = t->stamp.local_stime; |
1582 | 0 | } |
1583 | | |
1584 | | static void __init try_platform_timer_tail(bool late) |
1585 | 1 | { |
1586 | 1 | init_timer(&plt_overflow_timer, plt_overflow, NULL, 0); |
1587 | 1 | plt_overflow(NULL); |
1588 | 1 | |
1589 | 1 | platform_timer_stamp = plt_stamp64; |
1590 | 1 | stime_platform_stamp = NOW(); |
1591 | 1 | |
1592 | 1 | if ( !late ) |
1593 | 1 | init_percpu_time(); |
1594 | 1 | |
1595 | 1 | init_timer(&calibration_timer, time_calibration, NULL, 0); |
1596 | 1 | set_timer(&calibration_timer, NOW() + EPOCH); |
1597 | 1 | } |
1598 | | |
1599 | | /* Late init function, after all CPUs have booted. */
1600 | | static int __init verify_tsc_reliability(void) |
1601 | 1 | { |
1602 | 1 | if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) ) |
1603 | 1 | { |
1604 | 1 | /* |
1605 | 1 | * Sadly, despite processor vendors' best design guidance efforts, on |
1606 | 1 | * some systems, CPUs may come out of reset improperly synchronized.
1607 | 1 | * So we must verify that there is no warp, and we cannot do that
1608 | 1 | * until all CPUs are booted.
1609 | 1 | */ |
1610 | 1 | tsc_check_reliability(); |
1611 | 1 | if ( tsc_max_warp ) |
1612 | 1 | { |
1613 | 1 | printk("TSC warp detected, disabling TSC_RELIABLE\n"); |
1614 | 1 | setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE); |
1615 | 1 | } |
1616 | 0 | else if ( !strcmp(opt_clocksource, "tsc") && |
1617 | 0 | (try_platform_timer(&plt_tsc) > 0) ) |
1618 | 0 | { |
1619 | 0 | /* |
1620 | 0 | * The platform timer has changed and CPU time will only be updated
1621 | 0 | * after we set the calibration timer again, which means we need to
1622 | 0 | * re-seed each local CPU's time. At this stage the TSC is known to
1623 | 0 | * be reliable, i.e. monotonically increasing across all CPUs, so
1624 | 0 | * this lets us remove the skew between platform timer and TSC,
1625 | 0 | * since these are now effectively the same.
1626 | 0 | */ |
1627 | 0 | on_selected_cpus(&cpu_online_map, reset_percpu_time, NULL, 1); |
1628 | 0 |
1629 | 0 | /* |
1630 | 0 | * We won't do CPU Hotplug and TSC clocksource is being used which |
1631 | 0 | * means we have a reliable TSC, plus we don't sync with any other |
1632 | 0 | * clocksource so no need for rendezvous. |
1633 | 0 | */ |
1634 | 0 | time_calibration_rendezvous_fn = time_calibration_nop_rendezvous; |
1635 | 0 |
1636 | 0 | /* Finish platform timer switch. */ |
1637 | 0 | try_platform_timer_tail(true); |
1638 | 0 |
1639 | 0 | printk("Switched to Platform timer %s TSC\n", |
1640 | 0 | freq_string(plt_src.frequency)); |
1641 | 0 | return 0; |
1642 | 0 | } |
1643 | 1 | } |
1644 | 1 | |
1645 | 1 | /* |
1646 | 1 | * Re-run the TSC writability check if it didn't run to completion, as |
1647 | 1 | * X86_FEATURE_TSC_RELIABLE may have been cleared by now. This is needed |
1648 | 1 | * for determining which rendezvous function to use (below). |
1649 | 1 | */ |
1650 | 1 | if ( !disable_tsc_sync ) |
1651 | 1 | tsc_check_writability(); |
1652 | 1 | |
1653 | 1 | /* |
1654 | 1 | * While the scale factor can be shared among constant-rate TSCs, TSCs
1655 | 1 | * not marked as 'reliable' must be re-synced during rendezvous.
1656 | 1 | */ |
1657 | 1 | if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && |
1658 | 1 | !boot_cpu_has(X86_FEATURE_TSC_RELIABLE) ) |
1659 | 1 | time_calibration_rendezvous_fn = time_calibration_tsc_rendezvous; |
1660 | 1 | |
1661 | 1 | return 0; |
1662 | 1 | } |
1663 | | __initcall(verify_tsc_reliability); |
1664 | | |
1665 | | /* Late init function (after interrupts are enabled). */ |
1666 | | int __init init_xen_time(void) |
1667 | 1 | { |
1668 | 1 | tsc_check_writability(); |
1669 | 1 | |
1670 | 1 | open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration); |
1671 | 1 | |
1672 | 1 | /* NB. get_cmos_time() can take over one second to execute. */ |
1673 | 1 | do_settime(get_cmos_time(), 0, NOW()); |
1674 | 1 | |
1675 | 1 | /* Finish platform timer initialization. */ |
1676 | 1 | try_platform_timer_tail(false); |
1677 | 1 | |
1678 | 1 | return 0; |
1679 | 1 | } |
1680 | | |
1681 | | |
1682 | | /* Early init function. */ |
1683 | | void __init early_time_init(void) |
1684 | 1 | { |
1685 | 1 | struct cpu_time *t = &this_cpu(cpu_time); |
1686 | 1 | u64 tmp; |
1687 | 1 | |
1688 | 1 | preinit_pit(); |
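 | | /*
 | |  * init_platform_timer() returns the TSC frequency (in Hz) calibrated
 | |  * against the chosen platform timer; it seeds both this CPU's scale
 | |  * factor and the optional "tsc" clocksource below.
 | |  */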
1689 | 1 | tmp = init_platform_timer(); |
1690 | 1 | plt_tsc.frequency = tmp; |
1691 | 1 | |
1692 | 1 | set_time_scale(&t->tsc_scale, tmp); |
1693 | 1 | t->stamp.local_tsc = boot_tsc_stamp; |
1694 | 1 | |
1695 | 1 | do_div(tmp, 1000); |
1696 | 1 | cpu_khz = (unsigned long)tmp; |
1697 | 1 | printk("Detected %lu.%03lu MHz processor.\n", |
1698 | 1 | cpu_khz / 1000, cpu_khz % 1000); |
1699 | 1 | |
1700 | 1 | setup_irq(0, 0, &irq0); |
1701 | 1 | } |
1702 | | |
1703 | | /* Keep the PIT enabled so pit_broadcast keeps working while cpuidle is enabled. */
1704 | | static int _disable_pit_irq(void(*hpet_broadcast_setup)(void)) |
1705 | 1 | { |
1706 | 1 | int ret = 1; |
1707 | 1 | |
1708 | 1 | if ( using_pit || !cpu_has_apic ) |
1709 | 0 | return -1; |
1710 | 1 | |
1711 | 1 | /* |
1712 | 1 | * If we do not rely on PIT CH0 then we can use HPET for one-shot timer |
1713 | 1 | * emulation when entering deep C states. |
1714 | 1 | * XXX dom0 may rely on RTC interrupt delivery, so only enable
1715 | 1 | * hpet_broadcast if FSB mode is available or if force_hpet_broadcast is set.
1716 | 1 | */ |
1717 | 1 | if ( cpuidle_using_deep_cstate() && !boot_cpu_has(X86_FEATURE_ARAT) ) |
1718 | 0 | { |
1719 | 0 | hpet_broadcast_setup(); |
1720 | 0 | if ( !hpet_broadcast_is_available() ) |
1721 | 0 | { |
1722 | 0 | if ( xen_cpuidle > 0 ) |
1723 | 0 | { |
1724 | 0 | printk("%ps() failed, turning to PIT broadcast\n", |
1725 | 0 | hpet_broadcast_setup); |
1726 | 0 | return -1; |
1727 | 0 | } |
1728 | 0 | ret = 0; |
1729 | 0 | } |
1730 | 0 | } |
1731 | 1 | |
1732 | 1 | /* Disable PIT CH0 timer interrupt. */ |
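 | | /*
 | |  * 0x30 selects counter 0, lobyte/hibyte access, binary mode 0
 | |  * (interrupt on terminal count); the two writes load an initial count
 | |  * of 0 (i.e. 65536). Mode 0 is one-shot, so CH0 stops generating
 | |  * periodic interrupts.
 | |  */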
1733 | 1 | outb_p(0x30, PIT_MODE); |
1734 | 1 | outb_p(0, PIT_CH0); |
1735 | 1 | outb_p(0, PIT_CH0); |
1736 | 1 | |
1737 | 1 | return ret; |
1738 | 1 | } |
1739 | | |
1740 | | static int __init disable_pit_irq(void) |
1741 | 1 | { |
1742 | 1 | if ( !_disable_pit_irq(hpet_broadcast_init) ) |
1743 | 0 | { |
1744 | 0 | xen_cpuidle = 0; |
1745 | 0 | printk("CPUIDLE: disabled due to no HPET. " |
1746 | 0 | "Force enable with 'cpuidle'.\n"); |
1747 | 0 | } |
1748 | 1 | |
1749 | 1 | return 0; |
1750 | 1 | } |
1751 | | __initcall(disable_pit_irq); |
1752 | | |
1753 | | void pit_broadcast_enter(void) |
1754 | 0 | { |
1755 | 0 | cpumask_set_cpu(smp_processor_id(), &pit_broadcast_mask); |
1756 | 0 | } |
1757 | | |
1758 | | void pit_broadcast_exit(void) |
1759 | 0 | { |
1760 | 0 | int cpu = smp_processor_id(); |
1761 | 0 |
1762 | 0 | if ( cpumask_test_and_clear_cpu(cpu, &pit_broadcast_mask) ) |
1763 | 0 | reprogram_timer(this_cpu(timer_deadline)); |
1764 | 0 | } |
1765 | | |
1766 | | int pit_broadcast_is_available(void) |
1767 | 0 | { |
1768 | 0 | return cpuidle_using_deep_cstate(); |
1769 | 0 | } |
1770 | | |
1771 | | void send_timer_event(struct vcpu *v) |
1772 | 5.74k | { |
1773 | 5.74k | send_guest_vcpu_virq(v, VIRQ_TIMER); |
1774 | 5.74k | } |
1775 | | |
1776 | | /* "cmos_utc_offset" is the difference between UTC time and CMOS time. */ |
1777 | | static long cmos_utc_offset; /* in seconds */ |
1778 | | |
1779 | | int time_suspend(void) |
1780 | 0 | { |
1781 | 0 | if ( smp_processor_id() == 0 ) |
1782 | 0 | { |
1783 | 0 | cmos_utc_offset = -get_cmos_time(); |
1784 | 0 | cmos_utc_offset += get_sec(); |
1785 | 0 | kill_timer(&calibration_timer); |
1786 | 0 |
1787 | 0 | /* Sync platform timer stamps. */ |
1788 | 0 | platform_time_calibration(); |
1789 | 0 | } |
1790 | 0 |
1791 | 0 | /* Better to cancel any pending calibration softirq, for accuracy. */
1792 | 0 | clear_bit(TIME_CALIBRATE_SOFTIRQ, &softirq_pending(smp_processor_id())); |
1793 | 0 |
1794 | 0 | return 0; |
1795 | 0 | } |
1796 | | |
1797 | | int time_resume(void) |
1798 | 0 | { |
1799 | 0 | preinit_pit(); |
1800 | 0 |
1801 | 0 | resume_platform_timer(); |
1802 | 0 |
1803 | 0 | if ( !_disable_pit_irq(hpet_broadcast_resume) ) |
1804 | 0 | BUG(); |
1805 | 0 |
1806 | 0 | init_percpu_time(); |
1807 | 0 |
1808 | 0 | set_timer(&calibration_timer, NOW() + EPOCH); |
1809 | 0 |
1810 | 0 | do_settime(get_cmos_time() + cmos_utc_offset, 0, NOW()); |
1811 | 0 |
1812 | 0 | update_vcpu_system_time(current); |
1813 | 0 |
1814 | 0 | update_domain_rtc(); |
1815 | 0 |
1816 | 0 | return 0; |
1817 | 0 | } |
1818 | | |
1819 | | int hwdom_pit_access(struct ioreq *ioreq) |
1820 | 0 | { |
1821 | 0 | /* Is Xen using Channel 2? Then disallow direct dom0 access. */ |
1822 | 0 | if ( using_pit ) |
1823 | 0 | return 0; |
1824 | 0 |
1825 | 0 | switch ( ioreq->addr ) |
1826 | 0 | { |
1827 | 0 | case PIT_CH2: |
1828 | 0 | if ( ioreq->dir == IOREQ_READ ) |
1829 | 0 | ioreq->data = inb(PIT_CH2); |
1830 | 0 | else |
1831 | 0 | outb(ioreq->data, PIT_CH2); |
1832 | 0 | return 1; |
1833 | 0 |
1834 | 0 | case PIT_MODE: |
1835 | 0 | if ( ioreq->dir == IOREQ_READ ) |
1836 | 0 | return 0; /* urk! */ |
1837 | 0 | switch ( ioreq->data & 0xc0 ) |
1838 | 0 | { |
1839 | 0 | case 0xc0: /* Read Back */ |
1840 | 0 | if ( ioreq->data & 0x08 ) /* Select Channel 2? */ |
1841 | 0 | outb(ioreq->data & 0xf8, PIT_MODE); |
1842 | 0 | if ( !(ioreq->data & 0x06) ) /* Select Channel 0/1? */ |
1843 | 0 | return 1; /* no - we're done */ |
1844 | 0 | /* Filter Channel 2 and reserved bit 0. */ |
1845 | 0 | ioreq->data &= ~0x09; |
1846 | 0 | return 0; /* emulate ch0/1 readback */ |
1847 | 0 | case 0x80: /* Select Counter 2 */ |
1848 | 0 | outb(ioreq->data, PIT_MODE); |
1849 | 0 | return 1; |
1850 | 0 | } |
1851 | 0 | break; |
1852 | 0 |
1853 | 0 | case 0x61: |
1854 | 0 | if ( ioreq->dir == IOREQ_READ ) |
1855 | 0 | ioreq->data = inb(0x61); |
1856 | 0 | else |
1857 | 0 | outb((inb(0x61) & ~3) | (ioreq->data & 3), 0x61); |
1858 | 0 | return 1; |
1859 | 0 | } |
1860 | 0 |
1861 | 0 | return 0; |
1862 | 0 | } |
1863 | | |
1864 | | /* |
1865 | | * PV SoftTSC Emulation. |
1866 | | */ |
1867 | | |
1868 | | /* |
1869 | | * tsc=unstable: Override all tests; assume TSC is unreliable. |
1870 | | * tsc=skewed: Assume TSCs are individually reliable, but skewed across CPUs. |
1871 | | * tsc=stable:socket: Assume TSCs are reliable across sockets. |
1872 | | */ |
1873 | | static int __init tsc_parse(const char *s) |
1874 | 0 | { |
1875 | 0 | if ( !strcmp(s, "unstable") ) |
1876 | 0 | { |
1877 | 0 | setup_clear_cpu_cap(X86_FEATURE_CONSTANT_TSC); |
1878 | 0 | setup_clear_cpu_cap(X86_FEATURE_NONSTOP_TSC); |
1879 | 0 | setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE); |
1880 | 0 | } |
1881 | 0 | else if ( !strcmp(s, "skewed") ) |
1882 | 0 | setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE); |
1883 | 0 | else if ( !strcmp(s, "stable:socket") ) |
1884 | 0 | tsc_flags |= TSC_RELIABLE_SOCKET; |
1885 | 0 | else |
1886 | 0 | return -EINVAL; |
1887 | 0 |
1888 | 0 | return 0; |
1889 | 0 | } |
1890 | | custom_param("tsc", tsc_parse); |
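 | | /* E.g. booting with "tsc=unstable" on the Xen command line clears the
 | |  * CONSTANT_TSC, NONSTOP_TSC and TSC_RELIABLE feature flags, overriding
 | |  * all runtime tests. */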
1891 | | |
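 | | /*
 | |  * Convert guest system time (ns) to guest TSC ticks via the domain's
 | |  * ns_to_vtsc scale. For PV the domain's vtsc_offset is subtracted
 | |  * first; a time before that offset yields a negated tick delta.
 | |  */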
1892 | | u64 gtime_to_gtsc(struct domain *d, u64 time) |
1893 | 0 | { |
1894 | 0 | if ( !is_hvm_domain(d) ) |
1895 | 0 | { |
1896 | 0 | if ( time < d->arch.vtsc_offset ) |
1897 | 0 | return -scale_delta(d->arch.vtsc_offset - time, |
1898 | 0 | &d->arch.ns_to_vtsc); |
1899 | 0 | time -= d->arch.vtsc_offset; |
1900 | 0 | } |
1901 | 0 | return scale_delta(time, &d->arch.ns_to_vtsc); |
1902 | 0 | } |
1903 | | |
1904 | | u64 gtsc_to_gtime(struct domain *d, u64 tsc) |
1905 | 0 | { |
1906 | 0 | u64 time = scale_delta(tsc, &d->arch.vtsc_to_ns); |
1907 | 0 |
1908 | 0 | if ( !is_hvm_domain(d) ) |
1909 | 0 | time += d->arch.vtsc_offset; |
1910 | 0 | return time; |
1911 | 0 | } |
1912 | | |
1913 | | void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs, int rdtscp) |
1914 | 0 | { |
1915 | 0 | s_time_t now = get_s_time(); |
1916 | 0 | struct domain *d = v->domain; |
1917 | 0 |
1918 | 0 | spin_lock(&d->arch.vtsc_lock); |
1919 | 0 |
1920 | 0 | #if !defined(NDEBUG) || defined(CONFIG_PERF_COUNTERS) |
1921 | 0 | if ( guest_kernel_mode(v, regs) ) |
1922 | 0 | d->arch.vtsc_kerncount++; |
1923 | 0 | else |
1924 | 0 | d->arch.vtsc_usercount++; |
1925 | 0 | #endif |
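 | | /*
 | |  * Keep the emulated TSC strictly monotonic across vCPUs: vtsc_last only
 | |  * moves forward, and if system time has not advanced past the last
 | |  * value handed out, bump vtsc_last and return that instead.
 | |  */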
1926 | 0 |
1927 | 0 | if ( (int64_t)(now - d->arch.vtsc_last) > 0 ) |
1928 | 0 | d->arch.vtsc_last = now; |
1929 | 0 | else |
1930 | 0 | now = ++d->arch.vtsc_last; |
1931 | 0 |
1932 | 0 | spin_unlock(&d->arch.vtsc_lock); |
1933 | 0 |
1934 | 0 | msr_split(regs, gtime_to_gtsc(d, now)); |
1935 | 0 |
1936 | 0 | if ( rdtscp ) |
1937 | 0 | regs->rcx = |
1938 | 0 | (d->arch.tsc_mode == TSC_MODE_PVRDTSCP) ? d->arch.incarnation : 0; |
1939 | 0 | } |
1940 | | |
1941 | | bool clocksource_is_tsc(void) |
1942 | 99.4k | { |
1943 | 99.4k | return plt_src.read_counter == read_tsc; |
1944 | 99.4k | } |
1945 | | |
1946 | | int host_tsc_is_safe(void) |
1947 | 0 | { |
1948 | 0 | return boot_cpu_has(X86_FEATURE_TSC_RELIABLE); |
1949 | 0 | } |
1950 | | |
1951 | | /* |
1952 | | * Called to collect TSC-related data only for a save file or live
1953 | | * migration; called after the last rdtsc is done on this incarnation.
1954 | | */ |
1955 | | void tsc_get_info(struct domain *d, uint32_t *tsc_mode, |
1956 | | uint64_t *elapsed_nsec, uint32_t *gtsc_khz, |
1957 | | uint32_t *incarnation) |
1958 | 0 | { |
1959 | 0 | bool enable_tsc_scaling = is_hvm_domain(d) && |
1960 | 0 | hvm_tsc_scaling_supported && !d->arch.vtsc; |
1961 | 0 |
1962 | 0 | *incarnation = d->arch.incarnation; |
1963 | 0 | *tsc_mode = d->arch.tsc_mode; |
1964 | 0 |
1965 | 0 | switch ( *tsc_mode ) |
1966 | 0 | { |
1967 | 0 | uint64_t tsc; |
1968 | 0 |
1969 | 0 | case TSC_MODE_NEVER_EMULATE: |
1970 | 0 | *elapsed_nsec = *gtsc_khz = 0; |
1971 | 0 | break; |
1972 | 0 | case TSC_MODE_DEFAULT: |
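 | | /* NB: ALWAYS_EMULATE jumps to the case label inside the if() body
 | |  * below, sharing the emulated path with a vtsc DEFAULT-mode domain. */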
1973 | 0 | if ( d->arch.vtsc ) |
1974 | 0 | { |
1975 | 0 | case TSC_MODE_ALWAYS_EMULATE: |
1976 | 0 | *elapsed_nsec = get_s_time() - d->arch.vtsc_offset; |
1977 | 0 | *gtsc_khz = d->arch.tsc_khz; |
1978 | 0 | break; |
1979 | 0 | } |
1980 | 0 | tsc = rdtsc(); |
1981 | 0 | *elapsed_nsec = scale_delta(tsc, &d->arch.vtsc_to_ns); |
1982 | 0 | *gtsc_khz = enable_tsc_scaling ? d->arch.tsc_khz : cpu_khz; |
1983 | 0 | break; |
1984 | 0 | case TSC_MODE_PVRDTSCP: |
1985 | 0 | if ( d->arch.vtsc ) |
1986 | 0 | { |
1987 | 0 | *elapsed_nsec = get_s_time() - d->arch.vtsc_offset; |
1988 | 0 | *gtsc_khz = cpu_khz; |
1989 | 0 | } |
1990 | 0 | else |
1991 | 0 | { |
1992 | 0 | tsc = rdtsc(); |
1993 | 0 | *elapsed_nsec = scale_delta(tsc, &this_cpu(cpu_time).tsc_scale) - |
1994 | 0 | d->arch.vtsc_offset; |
1995 | 0 | *gtsc_khz = enable_tsc_scaling ? d->arch.tsc_khz |
1996 | 0 | : 0 /* ignored by tsc_set_info */; |
1997 | 0 | } |
1998 | 0 | break; |
1999 | 0 | } |
2000 | 0 |
2001 | 0 | if ( (int64_t)*elapsed_nsec < 0 ) |
2002 | 0 | *elapsed_nsec = 0; |
2003 | 0 | } |
2004 | | |
2005 | | /* |
2006 | | * This may be called as many as three times for a domain, once when the |
2007 | | * hypervisor creates the domain, once when the toolstack creates the |
2008 | | * domain, and, if restoring/migrating, once when saved/migrated values
2009 | | * are restored. Care must be taken that, if multiple calls occur, only
2010 | | * the last one "sticks", and that all are completed before the guest
2011 | | * executes an rdtsc instruction.
2012 | | */ |
2013 | | void tsc_set_info(struct domain *d, |
2014 | | uint32_t tsc_mode, uint64_t elapsed_nsec, |
2015 | | uint32_t gtsc_khz, uint32_t incarnation) |
2016 | 2 | { |
2017 | 2 | if ( is_idle_domain(d) || is_hardware_domain(d) ) |
2018 | 2 | { |
2019 | 2 | d->arch.vtsc = 0; |
2020 | 2 | return; |
2021 | 2 | } |
2022 | 2 | |
2023 | 0 | switch ( d->arch.tsc_mode = tsc_mode ) |
2024 | 0 | { |
2025 | 0 | bool enable_tsc_scaling; |
2026 | 0 |
2027 | 0 | case TSC_MODE_DEFAULT: |
2028 | 0 | case TSC_MODE_ALWAYS_EMULATE: |
2029 | 0 | d->arch.vtsc_offset = get_s_time() - elapsed_nsec; |
2030 | 0 | d->arch.tsc_khz = gtsc_khz ?: cpu_khz; |
2031 | 0 | set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000); |
2032 | 0 |
2033 | 0 | /* |
2034 | 0 | * In default mode use native TSC if the host has safe TSC and |
2035 | 0 | * host and guest frequencies are the same (either "naturally" or |
2036 | 0 | * - for HVM/PVH - via TSC scaling). |
2037 | 0 | * When a guest is created, gtsc_khz is passed in as zero, making |
2038 | 0 | * d->arch.tsc_khz == cpu_khz. Thus no need to check incarnation. |
2039 | 0 | */ |
2040 | 0 | if ( tsc_mode == TSC_MODE_DEFAULT && host_tsc_is_safe() && |
2041 | 0 | (d->arch.tsc_khz == cpu_khz || |
2042 | 0 | (is_hvm_domain(d) && |
2043 | 0 | hvm_get_tsc_scaling_ratio(d->arch.tsc_khz))) ) |
2044 | 0 | { |
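 | | /* NEVER_EMULATE enters here directly: keep the native TSC. */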
2045 | 0 | case TSC_MODE_NEVER_EMULATE: |
2046 | 0 | d->arch.vtsc = 0; |
2047 | 0 | break; |
2048 | 0 | } |
2049 | 0 | d->arch.vtsc = 1; |
2050 | 0 | d->arch.ns_to_vtsc = scale_reciprocal(d->arch.vtsc_to_ns); |
2051 | 0 | break; |
2052 | 0 | case TSC_MODE_PVRDTSCP: |
2053 | 0 | d->arch.vtsc = !boot_cpu_has(X86_FEATURE_RDTSCP) || |
2054 | 0 | !host_tsc_is_safe(); |
2055 | 0 | enable_tsc_scaling = is_hvm_domain(d) && !d->arch.vtsc && |
2056 | 0 | hvm_get_tsc_scaling_ratio(gtsc_khz ?: cpu_khz); |
2057 | 0 | d->arch.tsc_khz = (enable_tsc_scaling && gtsc_khz) ? gtsc_khz : cpu_khz; |
2058 | 0 | set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000);
2059 | 0 | d->arch.ns_to_vtsc = scale_reciprocal(d->arch.vtsc_to_ns); |
2060 | 0 | if ( d->arch.vtsc ) |
2061 | 0 | d->arch.vtsc_offset = get_s_time() - elapsed_nsec; |
2062 | 0 | else { |
2063 | 0 | /* When using native TSC, the offset is nsec relative to power-on
2064 | 0 |  * of the physical machine. */
2065 | 0 | d->arch.vtsc_offset = scale_delta(rdtsc(), |
2066 | 0 | &this_cpu(cpu_time).tsc_scale) - |
2067 | 0 | elapsed_nsec; |
2068 | 0 | } |
2069 | 0 | break; |
2070 | 0 | } |
2071 | 0 | d->arch.incarnation = incarnation + 1; |
2072 | 0 | if ( is_hvm_domain(d) ) |
2073 | 0 | { |
2074 | 0 | if ( hvm_tsc_scaling_supported && !d->arch.vtsc ) |
2075 | 0 | d->arch.hvm_domain.tsc_scaling_ratio = |
2076 | 0 | hvm_get_tsc_scaling_ratio(d->arch.tsc_khz); |
2077 | 0 |
2078 | 0 | hvm_set_rdtsc_exiting(d, d->arch.vtsc); |
2079 | 0 | if ( d->vcpu && d->vcpu[0] && incarnation == 0 ) |
2080 | 0 | { |
2081 | 0 | /* |
2082 | 0 | * set_tsc_offset() is called from hvm_vcpu_initialise() before |
2083 | 0 | * tsc_set_info(). New vtsc mode may require recomputing TSC |
2084 | 0 | * offset. |
2085 | 0 | * We only need to do this for BSP during initial boot. APs will |
2086 | 0 | * call set_tsc_offset() later from hvm_vcpu_reset_state() and they |
2087 | 0 | * will sync their TSC to BSP's sync_tsc. |
2088 | 0 | */ |
2089 | 0 | d->arch.hvm_domain.sync_tsc = rdtsc(); |
2090 | 0 | hvm_funcs.set_tsc_offset(d->vcpu[0], |
2091 | 0 | d->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset, |
2092 | 0 | d->arch.hvm_domain.sync_tsc); |
2093 | 0 | } |
2094 | 0 | } |
2095 | 0 |
2096 | 0 | recalculate_cpuid_policy(d); |
2097 | 0 | } |
2098 | | |
2099 | | /* vtsc may incur measurable performance degradation; diagnose with this key handler. */
2100 | | static void dump_softtsc(unsigned char key) |
2101 | 0 | { |
2102 | 0 | struct domain *d; |
2103 | 0 | int domcnt = 0; |
2104 | 0 |
2105 | 0 | tsc_check_reliability(); |
2106 | 0 | if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) ) |
2107 | 0 | printk("TSC marked as reliable, " |
2108 | 0 | "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count); |
2109 | 0 | else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
2110 | 0 | { |
2111 | 0 | printk("TSC has constant rate, "); |
2112 | 0 | if ( max_cstate <= 2 && tsc_max_warp == 0 )
2113 | 0 | printk("no deep Cstates, passed warp test, deemed reliable, "); |
2114 | 0 | else |
2115 | 0 | printk("deep Cstates possible, so not reliable, "); |
2116 | 0 | printk("warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count); |
2117 | 0 | } else |
2118 | 0 | printk("TSC not marked as either constant or reliable, " |
2119 | 0 | "warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count); |
2120 | 0 | for_each_domain ( d ) |
2121 | 0 | { |
2122 | 0 | if ( is_hardware_domain(d) && d->arch.tsc_mode == TSC_MODE_DEFAULT ) |
2123 | 0 | continue; |
2124 | 0 | printk("dom%u%s: mode=%d", d->domain_id,
2125 | 0 | is_hvm_domain(d) ? "(hvm)" : "", d->arch.tsc_mode); |
2126 | 0 | if ( d->arch.vtsc_offset ) |
2127 | 0 | printk(",ofs=%#"PRIx64, d->arch.vtsc_offset); |
2128 | 0 | if ( d->arch.tsc_khz ) |
2129 | 0 | printk(",khz=%"PRIu32, d->arch.tsc_khz); |
2130 | 0 | if ( d->arch.incarnation ) |
2131 | 0 | printk(",inc=%"PRIu32, d->arch.incarnation); |
2132 | 0 | #if !defined(NDEBUG) || defined(CONFIG_PERF_COUNTERS) |
2133 | 0 | if ( d->arch.vtsc_kerncount | d->arch.vtsc_usercount ) |
2134 | 0 | printk(",vtsc count: %"PRIu64" kernel,%"PRIu64" user", |
2135 | 0 | d->arch.vtsc_kerncount, d->arch.vtsc_usercount); |
2136 | 0 | #endif |
2137 | 0 | printk("\n"); |
2138 | 0 | domcnt++; |
2139 | 0 | } |
2140 | 0 |
2141 | 0 | if ( !domcnt ) |
2142 | 0 | printk("No domains have emulated TSC\n"); |
2143 | 0 | } |
2144 | | |
2145 | | static int __init setup_dump_softtsc(void) |
2146 | 1 | { |
2147 | 1 | register_keyhandler('s', dump_softtsc, "dump softtsc stats", 1); |
2148 | 1 | return 0; |
2149 | 1 | } |
2150 | | __initcall(setup_dump_softtsc); |
2151 | | |
2152 | | /* |
2153 | | * Local variables: |
2154 | | * mode: C |
2155 | | * c-file-style: "BSD" |
2156 | | * c-basic-offset: 4 |
2157 | | * tab-width: 4 |
2158 | | * indent-tabs-mode: nil |
2159 | | * End: |
2160 | | */ |