debuggers.hg

view xen/arch/x86/time.c @ 20638:cf4f3e2f425c

Make tsc_mode=3 (pvrdtscp) work correctly.

Initial tsc_mode patch contained a rough cut at pvrdtscp mode. This
patch gets it working correctly. For the record, pvrdtscp mode allows
an application to obtain information from Xen to descale/de-offset
a physical tsc value to obtain "nsec since VM start". Though the
raw tsc value may change across migration due to different Hz rates
and different start times of different physical machines, applying
the pvrdtscp algorithm to a raw tsc value guarantees that the result
will always advance at a fixed known rate (nanoseconds) and be monotonically
increasing. BUT, pvrdtscp will only be fast on physical machines that
support the rdtscp instruction AND on which tsc is "safe"; on other
machines both the rdtsc and rdtscp instructions will be emulated.
Also note that when tsc_mode=3 is enabled, tsc-sensitive applications
that do NOT implement the pvrdtscp algorithm will behave incorrectly.
So, tsc_mode=3 should only be used when all apps are either tsc-resilient
or pvrdtscp-modified, and it only has a performance advantage on very
recent generation processors.
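
For illustration only, a minimal sketch of the descale/de-offset step such an
application would perform, assuming it has already fetched the offset,
mul_frac and shift values published by cpuid_time_leaf() below (and re-fetches
them whenever the incarnation value returned by rdtscp in TSC_AUX changes).
The helper name and the use of __int128 are choices made for this sketch, not
part of the patch:

    static inline unsigned long long pvrdtscp_to_nsec(
        unsigned long long raw_tsc,   /* value returned by rdtsc(p) */
        unsigned long long offset,    /* time leaf, sub-idx 1: ebx:eax */
        unsigned int mul_frac,        /* time leaf, sub-idx 1: ecx */
        signed char shift)            /* time leaf, sub-idx 1: edx */
    {
        /* Same fixed-point scaling as scale_delta() below. */
        if ( shift < 0 )
            raw_tsc >>= -shift;
        else
            raw_tsc <<= shift;
        raw_tsc = (unsigned long long)
            (((unsigned __int128)raw_tsc * mul_frac) >> 32);
        /* De-offset: nanoseconds since VM start, monotonically increasing. */
        return raw_tsc - offset;
    }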

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Dec 08 07:48:45 2009 +0000 (2009-12-08)
parents f330d9edf67c
children 1396dfb8d6ba
line source
1 /******************************************************************************
2 * arch/x86/time.c
3 *
4 * Per-CPU time calibration and management.
5 *
6 * Copyright (c) 2002-2005, K A Fraser
7 *
8 * Portions from Linux are:
9 * Copyright (c) 1991, 1992, 1995 Linus Torvalds
10 */
12 #include <xen/config.h>
13 #include <xen/errno.h>
14 #include <xen/event.h>
15 #include <xen/sched.h>
16 #include <xen/lib.h>
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/time.h>
20 #include <xen/timer.h>
21 #include <xen/smp.h>
22 #include <xen/irq.h>
23 #include <xen/softirq.h>
24 #include <xen/keyhandler.h>
25 #include <xen/guest_access.h>
26 #include <asm/io.h>
27 #include <asm/msr.h>
28 #include <asm/mpspec.h>
29 #include <asm/processor.h>
30 #include <asm/fixmap.h>
31 #include <asm/mc146818rtc.h>
32 #include <asm/div64.h>
33 #include <asm/acpi.h>
34 #include <asm/hpet.h>
35 #include <io_ports.h>
36 #include <asm/setup.h> /* for early_time_init */
37 #include <public/arch-x86/cpuid.h>
39 /* opt_clocksource: Force clocksource to one of: pit, hpet, cyclone, acpi. */
40 static char __initdata opt_clocksource[10];
41 string_param("clocksource", opt_clocksource);
43 unsigned long cpu_khz; /* CPU clock frequency in kHz. */
44 DEFINE_SPINLOCK(rtc_lock);
45 unsigned long pit0_ticks;
46 static u32 wc_sec, wc_nsec; /* UTC time at last 'time update'. */
47 static DEFINE_SPINLOCK(wc_lock);
49 struct cpu_time {
50 u64 local_tsc_stamp;
51 s_time_t stime_local_stamp;
52 s_time_t stime_master_stamp;
53 struct time_scale tsc_scale;
54 };
56 struct platform_timesource {
57 char *id;
58 char *name;
59 u64 frequency;
60 u64 (*read_counter)(void);
61 int (*init)(struct platform_timesource *);
62 void (*resume)(struct platform_timesource *);
63 int counter_bits;
64 };
66 static DEFINE_PER_CPU(struct cpu_time, cpu_time);
68 /* Calibrate all CPUs to platform timer every EPOCH. */
69 #define EPOCH MILLISECS(1000)
70 static struct timer calibration_timer;
72 /*
73 * We simulate a 32-bit platform timer from the 16-bit PIT ch2 counter.
74 * Otherwise overflow happens too quickly (~50ms) for us to guarantee that
75 * softirq handling will happen in time.
76 *
77 * The pit_lock protects the 16- and 32-bit stamp fields as well as the PIT channel 2 counter reads that update them.
78 */
79 static DEFINE_SPINLOCK(pit_lock);
80 static u16 pit_stamp16;
81 static u32 pit_stamp32;
82 static int using_pit;
84 /*
85 * 32-bit division of integer dividend and integer divisor yielding
86 * 32-bit fractional quotient.
87 */
88 static inline u32 div_frac(u32 dividend, u32 divisor)
89 {
90 u32 quotient, remainder;
91 ASSERT(dividend < divisor);
92 asm (
93 "divl %4"
94 : "=a" (quotient), "=d" (remainder)
95 : "0" (0), "1" (dividend), "r" (divisor) );
96 return quotient;
97 }
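/*
 * Worked example (illustrative, not in the original source): div_frac(1, 4)
 * computes ((u64)1 << 32) / 4 = 0x40000000, i.e. 0.25 expressed as a 0.32
 * fixed-point fraction.
 */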
99 /*
100 * 32-bit multiplication of multiplicand and fractional multiplier
101 * yielding 32-bit product (radix point at same position as in multiplicand).
102 */
103 static inline u32 mul_frac(u32 multiplicand, u32 multiplier)
104 {
105 u32 product_int, product_frac;
106 asm (
107 "mul %3"
108 : "=a" (product_frac), "=d" (product_int)
109 : "0" (multiplicand), "r" (multiplier) );
110 return product_int;
111 }
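/*
 * Worked example (illustrative, not in the original source):
 * mul_frac(1000000000, 0x80000000) returns (10^9 * 0x80000000) >> 32
 * = 500000000, i.e. multiplication by the fraction 0.5.
 */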
113 /*
114 * Scale a 64-bit delta by shifting and then multiplying by a 32-bit fraction,
115 * yielding a 64-bit result.
116 */
117 static inline u64 scale_delta(u64 delta, struct time_scale *scale)
118 {
119 u64 product;
120 #ifdef CONFIG_X86_32
121 u32 tmp1, tmp2;
122 #endif
124 if ( scale->shift < 0 )
125 delta >>= -scale->shift;
126 else
127 delta <<= scale->shift;
129 #ifdef CONFIG_X86_32
130 asm (
131 "mul %5 ; "
132 "mov %4,%%eax ; "
133 "mov %%edx,%4 ; "
134 "mul %5 ; "
135 "xor %5,%5 ; "
136 "add %4,%%eax ; "
137 "adc %5,%%edx ; "
138 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
139 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (scale->mul_frac) );
140 #else
141 asm (
142 "mul %%rdx ; shrd $32,%%rdx,%%rax"
143 : "=a" (product) : "0" (delta), "d" ((u64)scale->mul_frac) );
144 #endif
146 return product;
147 }
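/*
 * Illustrative C equivalent of the asm above (not in the original source):
 *
 *     product = (u64)(((unsigned __int128)delta * scale->mul_frac) >> 32);
 *
 * i.e. a 64x32->96-bit multiply of the (already shifted) delta, keeping
 * bits 32..95 of the product.
 */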
149 #define _TS_MUL_FRAC_IDENTITY 0x80000000UL
151 /* Compute the reciprocal of the given time_scale. */
152 static inline struct time_scale scale_reciprocal(struct time_scale scale)
153 {
154 struct time_scale reciprocal;
155 u32 dividend;
157 ASSERT(scale.mul_frac != 0);
158 dividend = _TS_MUL_FRAC_IDENTITY;
159 reciprocal.shift = 1 - scale.shift;
160 while ( unlikely(dividend >= scale.mul_frac) )
161 {
162 dividend >>= 1;
163 reciprocal.shift++;
164 }
166 asm (
167 "divl %4"
168 : "=a" (reciprocal.mul_frac), "=d" (dividend)
169 : "0" (0), "1" (dividend), "r" (scale.mul_frac) );
171 return reciprocal;
172 }
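/*
 * Worked example (illustrative, not in the original source): the reciprocal
 * of { mul_frac = 0x80000000, shift = 0 } (multiply by 0.5) comes out as
 * { mul_frac = 0x80000000, shift = 2 }, which scale_delta() applies as
 * (delta << 2) * 0.5 = delta * 2.
 */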
174 /*
175 * cpumask denoting the CPUs that need the timer interrupt delivered as an
176 * IPI in place of their local APIC timer.
177 */
178 extern int xen_cpuidle;
179 static cpumask_t pit_broadcast_mask;
181 static void smp_send_timer_broadcast_ipi(void)
182 {
183 int cpu = smp_processor_id();
184 cpumask_t mask;
186 cpus_and(mask, cpu_online_map, pit_broadcast_mask);
188 if ( cpu_isset(cpu, mask) )
189 {
190 cpu_clear(cpu, mask);
191 raise_softirq(TIMER_SOFTIRQ);
192 }
194 if ( !cpus_empty(mask) )
195 {
196 cpumask_raise_softirq(mask, TIMER_SOFTIRQ);
197 }
198 }
200 static void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs)
201 {
202 ASSERT(local_irq_is_enabled());
204 if ( hpet_legacy_irq_tick() )
205 return;
207 /* Only for start-of-day interrupt tests in io_apic.c. */
208 (*(volatile unsigned long *)&pit0_ticks)++;
210 /* Rough hack to allow accurate timers to sort-of-work with no APIC. */
211 if ( !cpu_has_apic )
212 raise_softirq(TIMER_SOFTIRQ);
214 if ( xen_cpuidle )
215 smp_send_timer_broadcast_ipi();
217 /* Emulate a 32-bit PIT counter. */
218 if ( using_pit )
219 {
220 u16 count;
222 spin_lock_irq(&pit_lock);
224 outb(0x80, PIT_MODE);
225 count = inb(PIT_CH2);
226 count |= inb(PIT_CH2) << 8;
228 pit_stamp32 += (u16)(pit_stamp16 - count);
229 pit_stamp16 = count;
231 spin_unlock_irq(&pit_lock);
232 }
233 }
235 static struct irqaction __read_mostly irq0 = { timer_interrupt, "timer", NULL };
237 /* ------ Calibrate the TSC -------
238 * Return processor ticks per second.
239 */
241 #define CLOCK_TICK_RATE 1193182 /* system crystal frequency (Hz) */
242 #define CALIBRATE_FRAC 20 /* calibrate over 50ms */
243 #define CALIBRATE_LATCH ((CLOCK_TICK_RATE+(CALIBRATE_FRAC/2))/CALIBRATE_FRAC)
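/* Illustrative arithmetic: CALIBRATE_LATCH = (1193182 + 10) / 20 = 59659 PIT ticks, i.e. ~50ms. */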
245 static u64 init_pit_and_calibrate_tsc(void)
246 {
247 u64 start, end;
248 unsigned long count;
250 /* Set PIT channel 0 to HZ Hz. */
251 #define LATCH (((CLOCK_TICK_RATE)+(HZ/2))/HZ)
252 outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
253 outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
254 outb(LATCH >> 8, PIT_CH0); /* MSB */
256 /* Set the Gate high, disable speaker */
257 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
259 /*
260 * Now let's take care of CTC channel 2
261 *
262 * Set the Gate high, program CTC channel 2 for mode 0, (interrupt on
263 * terminal count mode), binary count, load CALIBRATE_LATCH count (LSB and MSB)
264 * to begin countdown.
265 */
266 outb(0xb0, PIT_MODE); /* binary, mode 0, LSB/MSB, Ch 2 */
267 outb(CALIBRATE_LATCH & 0xff, PIT_CH2); /* LSB of count */
268 outb(CALIBRATE_LATCH >> 8, PIT_CH2); /* MSB of count */
270 rdtscll(start);
271 for ( count = 0; (inb(0x61) & 0x20) == 0; count++ )
272 continue;
273 rdtscll(end);
275 /* Error if the CTC doesn't behave itself. */
276 if ( count == 0 )
277 return 0;
279 return ((end - start) * (u64)CALIBRATE_FRAC);
280 }
282 static void set_time_scale(struct time_scale *ts, u64 ticks_per_sec)
283 {
284 u64 tps64 = ticks_per_sec;
285 u32 tps32;
286 int shift = 0;
288 ASSERT(tps64 != 0);
290 while ( tps64 > (MILLISECS(1000)*2) )
291 {
292 tps64 >>= 1;
293 shift--;
294 }
296 tps32 = (u32)tps64;
297 while ( tps32 <= (u32)MILLISECS(1000) )
298 {
299 tps32 <<= 1;
300 shift++;
301 }
303 ts->mul_frac = div_frac(MILLISECS(1000), tps32);
304 ts->shift = shift;
305 }
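/*
 * Worked example (illustrative, not in the original source): for a 2GHz TSC,
 * set_time_scale() leaves shift = 0 and computes
 * mul_frac = div_frac(10^9, 2*10^9) = 0x80000000, so scale_delta() then maps
 * N TSC ticks to N/2 nanoseconds.
 */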
307 static atomic_t tsc_calibrate_gang = ATOMIC_INIT(0);
308 static unsigned int tsc_calibrate_status = 0;
310 void calibrate_tsc_bp(void)
311 {
312 while ( atomic_read(&tsc_calibrate_gang) != (num_booting_cpus() - 1) )
313 mb();
315 outb(CALIBRATE_LATCH & 0xff, PIT_CH2);
316 outb(CALIBRATE_LATCH >> 8, PIT_CH2);
318 tsc_calibrate_status = 1;
319 wmb();
321 while ( (inb(0x61) & 0x20) == 0 )
322 continue;
324 tsc_calibrate_status = 2;
325 wmb();
327 while ( atomic_read(&tsc_calibrate_gang) != 0 )
328 mb();
329 }
331 void calibrate_tsc_ap(void)
332 {
333 u64 t1, t2, ticks_per_sec;
335 atomic_inc(&tsc_calibrate_gang);
337 while ( tsc_calibrate_status < 1 )
338 mb();
340 rdtscll(t1);
342 while ( tsc_calibrate_status < 2 )
343 mb();
345 rdtscll(t2);
347 ticks_per_sec = (t2 - t1) * (u64)CALIBRATE_FRAC;
348 set_time_scale(&this_cpu(cpu_time).tsc_scale, ticks_per_sec);
350 atomic_dec(&tsc_calibrate_gang);
351 }
353 static char *freq_string(u64 freq)
354 {
355 static char s[20];
356 unsigned int x, y;
357 y = (unsigned int)do_div(freq, 1000000) / 1000;
358 x = (unsigned int)freq;
359 snprintf(s, sizeof(s), "%u.%03uMHz", x, y);
360 return s;
361 }
363 /************************************************************
364 * PLATFORM TIMER 1: PROGRAMMABLE INTERVAL TIMER (LEGACY PIT)
365 */
367 static u64 read_pit_count(void)
368 {
369 u16 count16;
370 u32 count32;
371 unsigned long flags;
373 spin_lock_irqsave(&pit_lock, flags);
375 outb(0x80, PIT_MODE);
376 count16 = inb(PIT_CH2);
377 count16 |= inb(PIT_CH2) << 8;
379 count32 = pit_stamp32 + (u16)(pit_stamp16 - count16);
381 spin_unlock_irqrestore(&pit_lock, flags);
383 return count32;
384 }
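/*
 * Note (illustrative, not in the original source): PIT channel 2 counts
 * down, so (u16)(pit_stamp16 - count16) is the number of ticks elapsed since
 * the last stamp, modulo 2^16; accumulating it into pit_stamp32 yields the
 * simulated 32-bit counter described above.
 */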
386 static int __init init_pit(struct platform_timesource *pts)
387 {
388 using_pit = 1;
389 return 1;
390 }
392 static struct platform_timesource __initdata plt_pit =
393 {
394 .id = "pit",
395 .name = "PIT",
396 .frequency = CLOCK_TICK_RATE,
397 .read_counter = read_pit_count,
398 .counter_bits = 32,
399 .init = init_pit
400 };
402 /************************************************************
403 * PLATFORM TIMER 2: HIGH PRECISION EVENT TIMER (HPET)
404 */
406 static u64 read_hpet_count(void)
407 {
408 return hpet_read32(HPET_COUNTER);
409 }
411 static int __init init_hpet(struct platform_timesource *pts)
412 {
413 u64 hpet_rate = hpet_setup();
415 if ( hpet_rate == 0 )
416 return 0;
418 pts->frequency = hpet_rate;
419 return 1;
420 }
422 static void resume_hpet(struct platform_timesource *pts)
423 {
424 u64 hpet_rate = hpet_setup();
426 BUG_ON(hpet_rate == 0);
427 pts->frequency = hpet_rate;
428 }
430 static struct platform_timesource __initdata plt_hpet =
431 {
432 .id = "hpet",
433 .name = "HPET",
434 .read_counter = read_hpet_count,
435 .counter_bits = 32,
436 .init = init_hpet,
437 .resume = resume_hpet
438 };
440 /************************************************************
441 * PLATFORM TIMER 3: IBM 'CYCLONE' TIMER
442 */
444 int use_cyclone;
446 /*
447 * Although the counter is read via a 64-bit register, I believe it is actually
448 * a 40-bit counter. Since this will wrap, I read only the low 32 bits and
449 * periodically fold into a 64-bit software counter, just as for PIT and HPET.
450 */
451 #define CYCLONE_CBAR_ADDR 0xFEB00CD0
452 #define CYCLONE_PMCC_OFFSET 0x51A0
453 #define CYCLONE_MPMC_OFFSET 0x51D0
454 #define CYCLONE_MPCS_OFFSET 0x51A8
455 #define CYCLONE_TIMER_FREQ 100000000
457 /* Cyclone MPMC0 register. */
458 static volatile u32 *cyclone_timer;
460 static u64 read_cyclone_count(void)
461 {
462 return *cyclone_timer;
463 }
465 static volatile u32 *__init map_cyclone_reg(unsigned long regaddr)
466 {
467 unsigned long pageaddr = regaddr & PAGE_MASK;
468 unsigned long offset = regaddr & ~PAGE_MASK;
469 set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
470 return (volatile u32 *)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
471 }
473 static int __init init_cyclone(struct platform_timesource *pts)
474 {
475 u32 base;
477 if ( !use_cyclone )
478 return 0;
480 /* Find base address. */
481 base = *(map_cyclone_reg(CYCLONE_CBAR_ADDR));
482 if ( base == 0 )
483 {
484 printk(KERN_ERR "Cyclone: Could not find valid CBAR value.\n");
485 return 0;
486 }
488 /* Enable timer and map the counter register. */
489 *(map_cyclone_reg(base + CYCLONE_PMCC_OFFSET)) = 1;
490 *(map_cyclone_reg(base + CYCLONE_MPCS_OFFSET)) = 1;
491 cyclone_timer = map_cyclone_reg(base + CYCLONE_MPMC_OFFSET);
492 return 1;
493 }
495 static struct platform_timesource __initdata plt_cyclone =
496 {
497 .id = "cyclone",
498 .name = "IBM Cyclone",
499 .frequency = CYCLONE_TIMER_FREQ,
500 .read_counter = read_cyclone_count,
501 .counter_bits = 32,
502 .init = init_cyclone
503 };
505 /************************************************************
506 * PLATFORM TIMER 4: ACPI PM TIMER
507 */
509 u32 pmtmr_ioport;
511 /* ACPI PM timer ticks at 3.579545 MHz. */
512 #define ACPI_PM_FREQUENCY 3579545
514 static u64 read_pmtimer_count(void)
515 {
516 return inl(pmtmr_ioport);
517 }
519 static int __init init_pmtimer(struct platform_timesource *pts)
520 {
521 if ( pmtmr_ioport == 0 )
522 return 0;
524 return 1;
525 }
527 static struct platform_timesource __initdata plt_pmtimer =
528 {
529 .id = "acpi",
530 .name = "ACPI PM Timer",
531 .frequency = ACPI_PM_FREQUENCY,
532 .read_counter = read_pmtimer_count,
533 .counter_bits = 24,
534 .init = init_pmtimer
535 };
537 static struct time_scale pmt_scale;
538 static struct time_scale pmt_scale_r;
539 static __init int init_pmtmr_scale(void)
540 {
541 set_time_scale(&pmt_scale, ACPI_PM_FREQUENCY);
542 pmt_scale_r = scale_reciprocal(pmt_scale);
543 return 0;
544 }
545 __initcall(init_pmtmr_scale);
547 uint64_t acpi_pm_tick_to_ns(uint64_t ticks)
548 {
549 return scale_delta(ticks, &pmt_scale);
550 }
552 uint64_t ns_to_acpi_pm_tick(uint64_t ns)
553 {
554 return scale_delta(ns, &pmt_scale_r);
555 }
557 /************************************************************
558 * GENERIC PLATFORM TIMER INFRASTRUCTURE
559 */
561 static struct platform_timesource plt_src; /* details of chosen timesource */
562 static u64 plt_mask; /* hardware-width mask */
563 static u64 plt_overflow_period; /* ns between calls to plt_overflow() */
564 static struct time_scale plt_scale; /* scale: platform counter -> nanosecs */
566 /* Protected by platform_timer_lock. */
567 static DEFINE_SPINLOCK(platform_timer_lock);
568 static s_time_t stime_platform_stamp; /* System time at below platform time */
569 static u64 platform_timer_stamp; /* Platform time at above system time */
570 static u64 plt_stamp64; /* 64-bit platform counter stamp */
571 static u64 plt_stamp; /* hardware-width platform counter stamp */
572 static struct timer plt_overflow_timer;
574 static void plt_overflow(void *unused)
575 {
576 u64 count;
578 spin_lock_irq(&platform_timer_lock);
579 count = plt_src.read_counter();
580 plt_stamp64 += (count - plt_stamp) & plt_mask;
581 plt_stamp = count;
582 spin_unlock_irq(&platform_timer_lock);
584 set_timer(&plt_overflow_timer, NOW() + plt_overflow_period);
585 }
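/*
 * Note (illustrative, not in the original source): plt_overflow() must run
 * at least once per half counter wrap so that (count - plt_stamp) & plt_mask
 * remains unambiguous; e.g. for the 24-bit ACPI PM timer at 3.579545MHz that
 * is 2^23 ticks ~= 2.3s, which is how plt_overflow_period is derived in
 * init_platform_timer() below.
 */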
587 static s_time_t __read_platform_stime(u64 platform_time)
588 {
589 u64 diff = platform_time - platform_timer_stamp;
590 ASSERT(spin_is_locked(&platform_timer_lock));
591 return (stime_platform_stamp + scale_delta(diff, &plt_scale));
592 }
594 static s_time_t read_platform_stime(void)
595 {
596 u64 count;
597 s_time_t stime;
599 ASSERT(!local_irq_is_enabled());
601 spin_lock(&platform_timer_lock);
602 count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask);
603 stime = __read_platform_stime(count);
604 spin_unlock(&platform_timer_lock);
606 return stime;
607 }
609 static void platform_time_calibration(void)
610 {
611 u64 count;
612 s_time_t stamp;
613 unsigned long flags;
615 spin_lock_irqsave(&platform_timer_lock, flags);
616 count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask);
617 stamp = __read_platform_stime(count);
618 stime_platform_stamp = stamp;
619 platform_timer_stamp = count;
620 spin_unlock_irqrestore(&platform_timer_lock, flags);
621 }
623 static void resume_platform_timer(void)
624 {
625 /* Timer source can be reset when coming back from S3 to S0. */
626 if ( plt_src.resume )
627 plt_src.resume(&plt_src);
629 plt_stamp64 = platform_timer_stamp;
630 plt_stamp = plt_src.read_counter();
631 }
633 static void __init init_platform_timer(void)
634 {
635 static struct platform_timesource * __initdata plt_timers[] = {
636 &plt_cyclone, &plt_hpet, &plt_pmtimer, &plt_pit
637 };
639 struct platform_timesource *pts = NULL;
640 int i, rc = -1;
642 if ( opt_clocksource[0] != '\0' )
643 {
644 for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ )
645 {
646 pts = plt_timers[i];
647 if ( !strcmp(opt_clocksource, pts->id) )
648 {
649 rc = pts->init(pts);
650 break;
651 }
652 }
654 if ( rc <= 0 )
655 printk("WARNING: %s clocksource '%s'.\n",
656 (rc == 0) ? "Could not initialise" : "Unrecognised",
657 opt_clocksource);
658 }
660 if ( rc <= 0 )
661 {
662 for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ )
663 {
664 pts = plt_timers[i];
665 if ( (rc = pts->init(pts)) > 0 )
666 break;
667 }
668 }
670 BUG_ON(rc <= 0);
672 plt_mask = (u64)~0ull >> (64 - pts->counter_bits);
674 set_time_scale(&plt_scale, pts->frequency);
676 plt_overflow_period = scale_delta(
677 1ull << (pts->counter_bits-1), &plt_scale);
678 init_timer(&plt_overflow_timer, plt_overflow, NULL, 0);
679 plt_src = *pts;
680 plt_overflow(NULL);
682 platform_timer_stamp = plt_stamp64;
683 stime_platform_stamp = NOW();
685 printk("Platform timer is %s %s\n",
686 freq_string(pts->frequency), pts->name);
687 }
689 void cstate_restore_tsc(void)
690 {
691 struct cpu_time *t = &this_cpu(cpu_time);
692 struct time_scale sys_to_tsc = scale_reciprocal(t->tsc_scale);
693 s_time_t stime_delta;
694 u64 new_tsc;
696 if ( boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
697 return;
699 stime_delta = read_platform_stime() - t->stime_master_stamp;
700 if ( stime_delta < 0 )
701 stime_delta = 0;
703 new_tsc = t->local_tsc_stamp + scale_delta(stime_delta, &sys_to_tsc);
705 write_tsc(new_tsc);
706 }
708 /***************************************************************************
709 * CMOS Timer functions
710 ***************************************************************************/
712 /* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
713 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
714 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
715 *
716 * [For the Julian calendar (which was used in Russia before 1917,
717 * Britain & colonies before 1752, anywhere else before 1582,
718 * and is still in use by some communities) leave out the
719 * -year/100+year/400 terms, and add 10.]
720 *
721 * This algorithm was first published by Gauss (I think).
722 *
723 * WARNING: this function will overflow on 2106-02-07 06:28:16 on
724 * machines where long is 32-bit! (However, as time_t is signed, we
725 * will already get problems at other places on 2038-01-19 03:14:08)
726 */
727 unsigned long
728 mktime (unsigned int year, unsigned int mon,
729 unsigned int day, unsigned int hour,
730 unsigned int min, unsigned int sec)
731 {
732 /* 1..12 -> 11,12,1..10: put Feb last since it has a leap day. */
733 if ( 0 >= (int) (mon -= 2) )
734 {
735 mon += 12;
736 year -= 1;
737 }
739 return ((((unsigned long)(year/4 - year/100 + year/400 + 367*mon/12 + day)+
740 year*365 - 719499
741 )*24 + hour /* now have hours */
742 )*60 + min /* now have minutes */
743 )*60 + sec; /* finally seconds */
744 }
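/*
 * Worked example (illustrative, not in the original source):
 * mktime(1970, 1, 1, 0, 0, 0) folds January 1970 to mon=11, year=1969,
 * giving (492 - 19 + 4 + 336 + 1) + 1969*365 - 719499 = 0 days, hence
 * 0 seconds -- the Unix epoch.
 */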
746 static unsigned long __get_cmos_time(void)
747 {
748 unsigned int year, mon, day, hour, min, sec;
750 sec = CMOS_READ(RTC_SECONDS);
751 min = CMOS_READ(RTC_MINUTES);
752 hour = CMOS_READ(RTC_HOURS);
753 day = CMOS_READ(RTC_DAY_OF_MONTH);
754 mon = CMOS_READ(RTC_MONTH);
755 year = CMOS_READ(RTC_YEAR);
757 if ( !(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD )
758 {
759 BCD_TO_BIN(sec);
760 BCD_TO_BIN(min);
761 BCD_TO_BIN(hour);
762 BCD_TO_BIN(day);
763 BCD_TO_BIN(mon);
764 BCD_TO_BIN(year);
765 }
767 if ( (year += 1900) < 1970 )
768 year += 100;
770 return mktime(year, mon, day, hour, min, sec);
771 }
773 static unsigned long get_cmos_time(void)
774 {
775 unsigned long res, flags;
776 int i;
778 spin_lock_irqsave(&rtc_lock, flags);
780 /* read RTC exactly on falling edge of update flag */
781 for ( i = 0 ; i < 1000000 ; i++ ) /* may take up to 1 second... */
782 if ( (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) )
783 break;
784 for ( i = 0 ; i < 1000000 ; i++ ) /* must try at least 2.228 ms */
785 if ( !(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) )
786 break;
788 res = __get_cmos_time();
790 spin_unlock_irqrestore(&rtc_lock, flags);
791 return res;
792 }
794 /***************************************************************************
795 * System Time
796 ***************************************************************************/
798 s_time_t get_s_time(void)
799 {
800 struct cpu_time *t = &this_cpu(cpu_time);
801 u64 tsc, delta;
802 s_time_t now;
804 rdtscll(tsc);
805 delta = tsc - t->local_tsc_stamp;
806 now = t->stime_local_stamp + scale_delta(delta, &t->tsc_scale);
808 return now;
809 }
811 /* Explicitly OR with 1 just in case version number gets out of sync. */
812 #define version_update_begin(v) (((v)+1)|1)
813 #define version_update_end(v) ((v)+1)
815 static void __update_vcpu_system_time(struct vcpu *v, int force)
816 {
817 struct cpu_time *t;
818 struct vcpu_time_info *u, _u;
819 XEN_GUEST_HANDLE(vcpu_time_info_t) user_u;
820 struct domain *d = v->domain;
821 s_time_t tsc_stamp = 0;
823 if ( v->vcpu_info == NULL )
824 return;
826 t = &this_cpu(cpu_time);
827 u = &vcpu_info(v, time);
829 if ( d->arch.vtsc )
830 tsc_stamp = scale_delta(t->stime_local_stamp - d->arch.vtsc_offset,
831 &d->arch.ns_to_vtsc);
832 else
833 tsc_stamp = t->local_tsc_stamp;
835 if ( (d->arch.tsc_mode == TSC_MODE_PVRDTSCP) &&
836 boot_cpu_has(X86_FEATURE_RDTSCP) )
837 write_rdtscp_aux(d->arch.incarnation);
839 /* Don't bother unless timestamps have changed or we are forced. */
840 if ( !force && (u->tsc_timestamp == tsc_stamp) )
841 return;
843 memset(&_u, 0, sizeof(_u));
845 if ( d->arch.vtsc )
846 {
847 _u.tsc_timestamp = tsc_stamp;
848 _u.system_time = t->stime_local_stamp;
849 _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
850 _u.tsc_shift = d->arch.vtsc_to_ns.shift;
851 }
852 else
853 {
854 _u.tsc_timestamp = t->local_tsc_stamp;
855 _u.system_time = t->stime_local_stamp;
856 _u.tsc_to_system_mul = t->tsc_scale.mul_frac;
857 _u.tsc_shift = (s8)t->tsc_scale.shift;
858 }
860 /* 1. Update guest kernel version. */
861 _u.version = u->version = version_update_begin(u->version);
862 wmb();
863 /* 2. Update all other guest kernel fields. */
864 *u = _u;
865 wmb();
866 /* 3. Update guest kernel version. */
867 u->version = version_update_end(u->version);
869 user_u = v->arch.time_info_guest;
870 if ( !guest_handle_is_null(user_u) )
871 {
872 /* 1. Update userspace version. */
873 __copy_field_to_guest(user_u, &_u, version);
874 wmb();
875 /* 2. Update all other userspace fields. */
876 __copy_to_guest(user_u, &_u, 1);
877 wmb();
878 /* 3. Update userspace version. */
879 _u.version = version_update_end(_u.version);
880 __copy_field_to_guest(user_u, &_u, version);
881 }
882 }
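/*
 * Illustrative guest-side counterpart (a sketch, not part of this file):
 * the odd-while-updating version field written above acts like a seqlock,
 * so a guest takes a consistent snapshot by spinning until the version is
 * even and unchanged across the copy. The helper name is invented here and
 * rmb() stands for whatever read barrier the guest environment provides.
 */
static inline void read_vcpu_time_info(const volatile vcpu_time_info_t *u,
                                       vcpu_time_info_t *snap)
{
    uint32_t ver;
    do {
        ver = u->version;
        rmb();                      /* read version /then/ payload */
        *snap = *(const vcpu_time_info_t *)u;
        rmb();                      /* read payload /then/ version */
    } while ( (ver & 1) || (ver != u->version) );
}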
884 void update_vcpu_system_time(struct vcpu *v)
885 {
886 __update_vcpu_system_time(v, 0);
887 }
889 void force_update_vcpu_system_time(struct vcpu *v)
890 {
891 __update_vcpu_system_time(v, 1);
892 }
894 void update_domain_wallclock_time(struct domain *d)
895 {
896 uint32_t *wc_version;
898 spin_lock(&wc_lock);
900 wc_version = &shared_info(d, wc_version);
901 *wc_version = version_update_begin(*wc_version);
902 wmb();
904 shared_info(d, wc_sec) = wc_sec + d->time_offset_seconds;
905 shared_info(d, wc_nsec) = wc_nsec;
907 wmb();
908 *wc_version = version_update_end(*wc_version);
910 spin_unlock(&wc_lock);
911 }
913 void domain_set_time_offset(struct domain *d, int32_t time_offset_seconds)
914 {
915 d->time_offset_seconds = time_offset_seconds;
916 if ( is_hvm_domain(d) )
917 rtc_update_clock(d);
918 }
920 int cpu_frequency_change(u64 freq)
921 {
922 struct cpu_time *t = &this_cpu(cpu_time);
923 u64 curr_tsc;
925 /* Sanity check: CPU frequency allegedly dropping below 1MHz? */
926 if ( freq < 1000000u )
927 {
928 gdprintk(XENLOG_WARNING, "Rejecting CPU frequency change "
929 "to %"PRIu64" Hz.\n", freq);
930 return -EINVAL;
931 }
933 local_irq_disable();
934 /* Platform time /first/, as we may be delayed by platform_timer_lock. */
935 t->stime_master_stamp = read_platform_stime();
936 /* TSC-extrapolated time may be bogus after frequency change. */
937 /*t->stime_local_stamp = get_s_time();*/
938 t->stime_local_stamp = t->stime_master_stamp;
939 rdtscll(curr_tsc);
940 t->local_tsc_stamp = curr_tsc;
941 set_time_scale(&t->tsc_scale, freq);
942 local_irq_enable();
944 update_vcpu_system_time(current);
946 /* A full epoch should pass before we check for deviation. */
947 if ( smp_processor_id() == 0 )
948 {
949 set_timer(&calibration_timer, NOW() + EPOCH);
950 platform_time_calibration();
951 }
953 return 0;
954 }
956 /* Set clock to <secs,usecs> after 00:00:00 UTC, 1 January, 1970. */
957 void do_settime(unsigned long secs, unsigned long nsecs, u64 system_time_base)
958 {
959 u64 x;
960 u32 y, _wc_sec, _wc_nsec;
961 struct domain *d;
963 x = (secs * 1000000000ULL) + (u64)nsecs - system_time_base;
964 y = do_div(x, 1000000000);
966 spin_lock(&wc_lock);
967 wc_sec = _wc_sec = (u32)x;
968 wc_nsec = _wc_nsec = (u32)y;
969 spin_unlock(&wc_lock);
971 rcu_read_lock(&domlist_read_lock);
972 for_each_domain ( d )
973 update_domain_wallclock_time(d);
974 rcu_read_unlock(&domlist_read_lock);
975 }
977 /* Per-CPU communication between rendezvous IRQ and softirq handler. */
978 struct cpu_calibration {
979 u64 local_tsc_stamp;
980 s_time_t stime_local_stamp;
981 s_time_t stime_master_stamp;
982 };
983 static DEFINE_PER_CPU(struct cpu_calibration, cpu_calibration);
985 /* Softirq handler for per-CPU time calibration. */
986 static void local_time_calibration(void)
987 {
988 struct cpu_time *t = &this_cpu(cpu_time);
989 struct cpu_calibration *c = &this_cpu(cpu_calibration);
991 /*
992 * System timestamps, extrapolated from local and master oscillators,
993 * taken during this calibration and the previous calibration.
994 */
995 s_time_t prev_local_stime, curr_local_stime;
996 s_time_t prev_master_stime, curr_master_stime;
998 /* TSC timestamps taken during this calibration and prev calibration. */
999 u64 prev_tsc, curr_tsc;
1001 /*
1002 * System time and TSC ticks elapsed during the previous calibration
1003 * 'epoch'. These values are down-shifted to fit in 32 bits.
1004 */
1005 u64 stime_elapsed64, tsc_elapsed64;
1006 u32 stime_elapsed32, tsc_elapsed32;
1008 /* The accumulated error in the local estimate. */
1009 u64 local_stime_err;
1011 /* Error correction to slow down a fast local clock. */
1012 u32 error_factor = 0;
1014 /* Calculated TSC shift to ensure 32-bit scale multiplier. */
1015 int tsc_shift = 0;
1017 /* The overall calibration scale multiplier. */
1018 u32 calibration_mul_frac;
1020 if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
1022 /* Atomically read cpu_calibration struct and write cpu_time struct. */
1023 local_irq_disable();
1024 t->local_tsc_stamp = c->local_tsc_stamp;
1025 t->stime_local_stamp = c->stime_master_stamp;
1026 t->stime_master_stamp = c->stime_master_stamp;
1027 local_irq_enable();
1028 update_vcpu_system_time(current);
1029 goto out;
1032 prev_tsc = t->local_tsc_stamp;
1033 prev_local_stime = t->stime_local_stamp;
1034 prev_master_stime = t->stime_master_stamp;
1036 /* Disabling IRQs ensures we atomically read cpu_calibration struct. */
1037 local_irq_disable();
1038 curr_tsc = c->local_tsc_stamp;
1039 curr_local_stime = c->stime_local_stamp;
1040 curr_master_stime = c->stime_master_stamp;
1041 local_irq_enable();
1043 #if 0
1044 printk("PRE%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64"\n",
1045 smp_processor_id(), prev_tsc, prev_local_stime, prev_master_stime);
1046 printk("CUR%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64
1047 " -> %"PRId64"\n",
1048 smp_processor_id(), curr_tsc, curr_local_stime, curr_master_stime,
1049 curr_master_stime - curr_local_stime);
1050 #endif
1052 /* Local time warps forward if it lags behind master time. */
1053 if ( curr_local_stime < curr_master_stime )
1054 curr_local_stime = curr_master_stime;
1056 stime_elapsed64 = curr_master_stime - prev_master_stime;
1057 tsc_elapsed64 = curr_tsc - prev_tsc;
1059 /*
1060 * Weirdness can happen if we lose sync with the platform timer.
1061 * We could be smarter here: resync platform timer with local timer?
1062 */
1063 if ( ((s64)stime_elapsed64 < (EPOCH / 2)) )
1064 goto out;
1066 /*
1067 * Calculate error-correction factor. This only slows down a fast local
1068 * clock (slow clocks are warped forwards). The scale factor is clamped
1069 * to >= 0.5.
1070 */
1071 if ( curr_local_stime != curr_master_stime )
1073 local_stime_err = curr_local_stime - curr_master_stime;
1074 if ( local_stime_err > EPOCH )
1075 local_stime_err = EPOCH;
1076 error_factor = div_frac(EPOCH, EPOCH + (u32)local_stime_err);
1079 /*
1080 * We require 0 < stime_elapsed < 2^31.
1081 * This allows us to binary shift a 32-bit tsc_elapsed such that:
1082 * stime_elapsed < tsc_elapsed <= 2*stime_elapsed
1083 */
1084 while ( ((u32)stime_elapsed64 != stime_elapsed64) ||
1085 ((s32)stime_elapsed64 < 0) )
1087 stime_elapsed64 >>= 1;
1088 tsc_elapsed64 >>= 1;
1091 /* stime_master_diff now fits in a 32-bit word. */
1092 stime_elapsed32 = (u32)stime_elapsed64;
1094 /* tsc_elapsed <= 2*stime_elapsed */
1095 while ( tsc_elapsed64 > (stime_elapsed32 * 2) )
1097 tsc_elapsed64 >>= 1;
1098 tsc_shift--;
1101 /* Local difference must now fit in 32 bits. */
1102 ASSERT((u32)tsc_elapsed64 == tsc_elapsed64);
1103 tsc_elapsed32 = (u32)tsc_elapsed64;
1105 /* tsc_elapsed > stime_elapsed */
1106 ASSERT(tsc_elapsed32 != 0);
1107 while ( tsc_elapsed32 <= stime_elapsed32 )
1109 tsc_elapsed32 <<= 1;
1110 tsc_shift++;
1113 calibration_mul_frac = div_frac(stime_elapsed32, tsc_elapsed32);
1114 if ( error_factor != 0 )
1115 calibration_mul_frac = mul_frac(calibration_mul_frac, error_factor);
1117 #if 0
1118 printk("---%d: %08x %08x %d\n", smp_processor_id(),
1119 error_factor, calibration_mul_frac, tsc_shift);
1120 #endif
1122 /* Record new timestamp information, atomically w.r.t. interrupts. */
1123 local_irq_disable();
1124 t->tsc_scale.mul_frac = calibration_mul_frac;
1125 t->tsc_scale.shift = tsc_shift;
1126 t->local_tsc_stamp = curr_tsc;
1127 t->stime_local_stamp = curr_local_stime;
1128 t->stime_master_stamp = curr_master_stime;
1129 local_irq_enable();
1131 update_vcpu_system_time(current);
1133 out:
1134 if ( smp_processor_id() == 0 )
1136 set_timer(&calibration_timer, NOW() + EPOCH);
1137 platform_time_calibration();
1141 /*
1142 * Rendezvous for all CPUs in IRQ context.
1143 * Master CPU snapshots the platform timer.
1144 * All CPUS snapshot their local TSC and extrapolation of system time.
1145 */
1146 struct calibration_rendezvous {
1147 cpumask_t cpu_calibration_map;
1148 atomic_t semaphore;
1149 s_time_t master_stime;
1150 u64 master_tsc_stamp;
1151 };
1153 /*
1154 * Keep TSCs in sync when they run at the same rate, but may stop in
1155 * deep-sleep C states.
1156 */
1157 static void time_calibration_tsc_rendezvous(void *_r)
1159 int i;
1160 struct cpu_calibration *c = &this_cpu(cpu_calibration);
1161 struct calibration_rendezvous *r = _r;
1162 unsigned int total_cpus = cpus_weight(r->cpu_calibration_map);
1164 /* Loop to get rid of cache effects on TSC skew. */
1165 for ( i = 4; i >= 0; i-- )
1167 if ( smp_processor_id() == 0 )
1169 while ( atomic_read(&r->semaphore) != (total_cpus - 1) )
1170 mb();
1172 if ( r->master_stime == 0 )
1174 r->master_stime = read_platform_stime();
1175 rdtscll(r->master_tsc_stamp);
1177 atomic_inc(&r->semaphore);
1179 if ( i == 0 )
1180 write_tsc(r->master_tsc_stamp);
1182 while ( atomic_read(&r->semaphore) != (2*total_cpus - 1) )
1183 mb();
1184 atomic_set(&r->semaphore, 0);
1186 else
1188 atomic_inc(&r->semaphore);
1189 while ( atomic_read(&r->semaphore) < total_cpus )
1190 mb();
1192 if ( i == 0 )
1193 write_tsc(r->master_tsc_stamp);
1195 atomic_inc(&r->semaphore);
1196 while ( atomic_read(&r->semaphore) > total_cpus )
1197 mb();
1201 rdtscll(c->local_tsc_stamp);
1202 c->stime_local_stamp = get_s_time();
1203 c->stime_master_stamp = r->master_stime;
1205 raise_softirq(TIME_CALIBRATE_SOFTIRQ);
1208 /* Ordinary rendezvous function which does not modify TSC values. */
1209 static void time_calibration_std_rendezvous(void *_r)
1211 struct cpu_calibration *c = &this_cpu(cpu_calibration);
1212 struct calibration_rendezvous *r = _r;
1213 unsigned int total_cpus = cpus_weight(r->cpu_calibration_map);
1215 if ( smp_processor_id() == 0 )
1217 while ( atomic_read(&r->semaphore) != (total_cpus - 1) )
1218 cpu_relax();
1219 r->master_stime = read_platform_stime();
1220 mb(); /* write r->master_stime /then/ signal */
1221 atomic_inc(&r->semaphore);
1223 else
1225 atomic_inc(&r->semaphore);
1226 while ( atomic_read(&r->semaphore) != total_cpus )
1227 cpu_relax();
1228 mb(); /* receive signal /then/ read r->master_stime */
1231 rdtscll(c->local_tsc_stamp);
1232 c->stime_local_stamp = get_s_time();
1233 c->stime_master_stamp = r->master_stime;
1235 raise_softirq(TIME_CALIBRATE_SOFTIRQ);
1238 static void (*time_calibration_rendezvous_fn)(void *) =
1239 time_calibration_std_rendezvous;
1241 static void time_calibration(void *unused)
1243 struct calibration_rendezvous r = {
1244 .cpu_calibration_map = cpu_online_map,
1245 .semaphore = ATOMIC_INIT(0)
1246 };
1248 /* @wait=1 because we must wait for all cpus before freeing @r. */
1249 on_selected_cpus(&r.cpu_calibration_map,
1250 time_calibration_rendezvous_fn,
1251 &r, 1);
1254 void init_percpu_time(void)
1256 struct cpu_time *t = &this_cpu(cpu_time);
1257 unsigned long flags;
1258 s_time_t now;
1260 local_irq_save(flags);
1261 rdtscll(t->local_tsc_stamp);
1262 now = read_platform_stime();
1263 local_irq_restore(flags);
1265 t->stime_master_stamp = now;
1266 t->stime_local_stamp = now;
1268 if ( smp_processor_id() == 0 )
1270 init_timer(&calibration_timer, time_calibration, NULL, 0);
1271 set_timer(&calibration_timer, NOW() + EPOCH);
1275 /* Late init function (after all CPUs are booted). */
1276 int __init init_xen_time(void)
1278 /* If we have constant-rate TSCs then scale factor can be shared. */
1279 if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
1281 int cpu;
1282 for_each_possible_cpu ( cpu )
1283 per_cpu(cpu_time, cpu).tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
1284 /* If TSCs are not marked as 'reliable', re-sync during rendezvous. */
1285 if ( !boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1286 time_calibration_rendezvous_fn = time_calibration_tsc_rendezvous;
1289 open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration);
1291 /* System time (get_s_time()) starts ticking from now. */
1292 rdtscll(this_cpu(cpu_time).local_tsc_stamp);
1294 /* NB. get_cmos_time() can take over one second to execute. */
1295 do_settime(get_cmos_time(), 0, NOW());
1297 init_platform_timer();
1299 init_percpu_time();
1301 return 0;
1305 /* Early init function. */
1306 void __init early_time_init(void)
1308 u64 tmp = init_pit_and_calibrate_tsc();
1310 set_time_scale(&this_cpu(cpu_time).tsc_scale, tmp);
1312 do_div(tmp, 1000);
1313 cpu_khz = (unsigned long)tmp;
1314 printk("Detected %lu.%03lu MHz processor.\n",
1315 cpu_khz / 1000, cpu_khz % 1000);
1317 setup_irq(0, &irq0);
1320 /* Keep the PIT enabled so PIT broadcast keeps working while cpuidle is enabled. */
1321 static int disable_pit_irq(void)
1323 if ( using_pit || !cpu_has_apic )
1324 return 0;
1326 /*
1327 * If we do not rely on PIT CH0 then we can use HPET for one-shot timer
1328 * emulation when entering deep C states.
1329 * XXX dom0 may rely on RTC interrupt delivery, so only enable
1330 * hpet_broadcast if FSB mode available or if force_hpet_broadcast.
1331 */
1332 if ( xen_cpuidle && !boot_cpu_has(X86_FEATURE_ARAT) )
1334 hpet_broadcast_init();
1335 if ( !hpet_broadcast_is_available() )
1337 if ( xen_cpuidle == -1 )
1339 xen_cpuidle = 0;
1340 printk("CPUIDLE: disabled due to no HPET. "
1341 "Force enable with 'cpuidle'.\n");
1343 else
1345 printk("HPET broadcast init failed; falling back to PIT broadcast.\n");
1346 return 0;
1351 /* Disable PIT CH0 timer interrupt. */
1352 outb_p(0x30, PIT_MODE);
1353 outb_p(0, PIT_CH0);
1354 outb_p(0, PIT_CH0);
1356 return 0;
1358 __initcall(disable_pit_irq);
1360 void pit_broadcast_enter(void)
1362 cpu_set(smp_processor_id(), pit_broadcast_mask);
1365 void pit_broadcast_exit(void)
1367 int cpu = smp_processor_id();
1369 if ( cpu_test_and_clear(cpu, pit_broadcast_mask) )
1370 reprogram_timer(per_cpu(timer_deadline, cpu));
1373 int pit_broadcast_is_available(void)
1375 return xen_cpuidle;
1378 void send_timer_event(struct vcpu *v)
1380 send_guest_vcpu_virq(v, VIRQ_TIMER);
1383 /* Return secs after 00:00:00 localtime, 1 January, 1970. */
1384 unsigned long get_localtime(struct domain *d)
1386 return wc_sec + (wc_nsec + NOW()) / 1000000000ULL
1387 + d->time_offset_seconds;
1390 /* "cmos_utc_offset" is the difference between UTC time and CMOS time. */
1391 static long cmos_utc_offset; /* in seconds */
1393 int time_suspend(void)
1395 if ( smp_processor_id() == 0 )
1397 cmos_utc_offset = -get_cmos_time();
1398 cmos_utc_offset += (wc_sec + (wc_nsec + NOW()) / 1000000000ULL);
1399 kill_timer(&calibration_timer);
1401 /* Sync platform timer stamps. */
1402 platform_time_calibration();
1405 /* Better to cancel calibration timer for accuracy. */
1406 clear_bit(TIME_CALIBRATE_SOFTIRQ, &softirq_pending(smp_processor_id()));
1408 return 0;
1411 int time_resume(void)
1413 /*u64 tmp = */init_pit_and_calibrate_tsc();
1415 /* Disable this while calibrate_tsc_ap() also is skipped. */
1416 /*set_time_scale(&this_cpu(cpu_time).tsc_scale, tmp);*/
1418 resume_platform_timer();
1420 disable_pit_irq();
1422 init_percpu_time();
1424 do_settime(get_cmos_time() + cmos_utc_offset, 0, NOW());
1426 update_vcpu_system_time(current);
1428 return 0;
1431 int dom0_pit_access(struct ioreq *ioreq)
1433 /* Is Xen using Channel 2? Then disallow direct dom0 access. */
1434 if ( using_pit )
1435 return 0;
1437 switch ( ioreq->addr )
1439 case PIT_CH2:
1440 if ( ioreq->dir == IOREQ_READ )
1441 ioreq->data = inb(PIT_CH2);
1442 else
1443 outb(ioreq->data, PIT_CH2);
1444 return 1;
1446 case PIT_MODE:
1447 if ( ioreq->dir == IOREQ_READ )
1448 return 0; /* urk! */
1449 switch ( ioreq->data & 0xc0 )
1451 case 0xc0: /* Read Back */
1452 if ( ioreq->data & 0x08 ) /* Select Channel 2? */
1453 outb(ioreq->data & 0xf8, PIT_MODE);
1454 if ( !(ioreq->data & 0x06) ) /* Select Channel 0/1? */
1455 return 1; /* no - we're done */
1456 /* Filter Channel 2 and reserved bit 0. */
1457 ioreq->data &= ~0x09;
1458 return 0; /* emulate ch0/1 readback */
1459 case 0x80: /* Select Counter 2 */
1460 outb(ioreq->data, PIT_MODE);
1461 return 1;
1464 case 0x61:
1465 if ( ioreq->dir == IOREQ_READ )
1466 ioreq->data = inb(0x61);
1467 else
1468 outb((inb(0x61) & ~3) | (ioreq->data & 3), 0x61);
1469 return 1;
1472 return 0;
1475 struct tm wallclock_time(void)
1477 uint64_t seconds;
1479 if ( !wc_sec )
1480 return (struct tm) { 0 };
1482 seconds = NOW() + (wc_sec * 1000000000ull) + wc_nsec;
1483 do_div(seconds, 1000000000);
1484 return gmtime(seconds);
1487 /*
1488 * TSC Reliability check
1489 */
1491 /*
1492 * The Linux original version of this function is
1493 * Copyright (c) 2006, Red Hat, Inc., Ingo Molnar
1494 */
1495 void check_tsc_warp(unsigned long tsc_khz, unsigned long *max_warp)
1497 #define rdtsc_barrier() mb()
1498 static DEFINE_SPINLOCK(sync_lock);
1499 static cycles_t last_tsc;
1501 cycles_t start, now, prev, end;
1502 int i;
1504 rdtsc_barrier();
1505 start = get_cycles();
1506 rdtsc_barrier();
1508 /* The measurement runs for 20 msecs: */
1509 end = start + tsc_khz * 20ULL;
1510 now = start;
1512 for ( i = 0; ; i++ )
1514 /*
1515 * We take the global lock, measure TSC, save the
1516 * previous TSC that was measured (possibly on
1517 * another CPU) and update the previous TSC timestamp.
1518 */
1519 spin_lock(&sync_lock);
1520 prev = last_tsc;
1521 rdtsc_barrier();
1522 now = get_cycles();
1523 rdtsc_barrier();
1524 last_tsc = now;
1525 spin_unlock(&sync_lock);
1527 /*
1528 * Be nice every now and then (and also check whether measurement is
1529 * done [we also insert a 10 million loops safety exit, so we don't
1530 * lock up in case the TSC readout is totally broken]):
1531 */
1532 if ( unlikely(!(i & 7)) )
1534 if ( (now > end) || (i > 10000000) )
1535 break;
1536 cpu_relax();
1537 /*touch_nmi_watchdog();*/
1540 /*
1541 * Outside the critical section we can now see whether we saw a
1542 * time-warp of the TSC going backwards:
1543 */
1544 if ( unlikely(prev > now) )
1546 spin_lock(&sync_lock);
1547 if ( *max_warp < prev - now )
1548 *max_warp = prev - now;
1549 spin_unlock(&sync_lock);
1554 static unsigned long tsc_max_warp, tsc_check_count;
1555 static cpumask_t tsc_check_cpumask = CPU_MASK_NONE;
1557 static void tsc_check_slave(void *unused)
1559 unsigned int cpu = smp_processor_id();
1560 local_irq_disable();
1561 while ( !cpu_isset(cpu, tsc_check_cpumask) )
1562 mb();
1563 check_tsc_warp(cpu_khz, &tsc_max_warp);
1564 cpu_clear(cpu, tsc_check_cpumask);
1565 local_irq_enable();
1568 void tsc_check_reliability(void)
1570 unsigned int cpu = smp_processor_id();
1571 static DEFINE_SPINLOCK(lock);
1573 spin_lock(&lock);
1575 tsc_check_count++;
1576 smp_call_function(tsc_check_slave, NULL, 0);
1577 tsc_check_cpumask = cpu_online_map;
1578 local_irq_disable();
1579 check_tsc_warp(cpu_khz, &tsc_max_warp);
1580 cpu_clear(cpu, tsc_check_cpumask);
1581 local_irq_enable();
1582 while ( !cpus_empty(tsc_check_cpumask) )
1583 cpu_relax();
1585 spin_unlock(&lock);
1588 /*
1589 * PV SoftTSC Emulation.
1590 */
1592 void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs, int rdtscp)
1594 s_time_t now = get_s_time();
1595 struct domain *d = v->domain;
1597 spin_lock(&d->arch.vtsc_lock);
1599 if ( guest_kernel_mode(v, regs) )
1600 d->arch.vtsc_kerncount++;
1601 else
1602 d->arch.vtsc_usercount++;
1604 if ( (int64_t)(now - d->arch.vtsc_last) > 0 )
1605 d->arch.vtsc_last = now;
1606 else
1607 now = ++d->arch.vtsc_last;
1609 spin_unlock(&d->arch.vtsc_lock);
1611 now = scale_delta(now - d->arch.vtsc_offset, &d->arch.ns_to_vtsc);
1613 regs->eax = (uint32_t)now;
1614 regs->edx = (uint32_t)(now >> 32);
1616 if ( rdtscp )
1617 regs->ecx =
1618 (d->arch.tsc_mode == TSC_MODE_PVRDTSCP) ? d->arch.incarnation : 0;
1621 static int host_tsc_is_safe(void)
1623 extern unsigned int max_cstate;
1625 if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1626 return 1;
1627 if ( num_online_cpus() == 1 )
1628 return 1;
1629 if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && max_cstate <= 2 )
1631 if ( !tsc_check_count )
1632 tsc_check_reliability();
1633 if ( tsc_max_warp == 0 )
1634 return 1;
1636 return 0;
1639 void cpuid_time_leaf(uint32_t sub_idx, uint32_t *eax, uint32_t *ebx,
1640 uint32_t *ecx, uint32_t *edx)
1642 struct domain *d = current->domain;
1643 uint64_t offset;
1645 switch ( sub_idx )
1647 case 0: /* features */
1648 *eax = ( ( (!!d->arch.vtsc) << 0 ) |
1649 ( (!!host_tsc_is_safe()) << 1 ) |
1650 ( (!!boot_cpu_has(X86_FEATURE_RDTSCP)) << 2 ) |
1651 0 );
1652 *ebx = d->arch.tsc_mode;
1653 *ecx = d->arch.tsc_khz;
1654 *edx = d->arch.incarnation;
1655 break;
1656 case 1: /* scale and offset */
1657 if ( !d->arch.vtsc )
1658 offset = d->arch.vtsc_offset;
1659 else
1660 /* offset already applied to value returned by virtual rdtscp */
1661 offset = 0;
1662 *eax = (uint32_t)offset;
1663 *ebx = (uint32_t)(offset >> 32);
1664 *ecx = d->arch.vtsc_to_ns.mul_frac;
1665 *edx = (s8)d->arch.vtsc_to_ns.shift;
1666 break;
1667 case 2: /* physical cpu_khz */
1668 *eax = cpu_khz;
1669 *ebx = *ecx = *edx = 0;
1670 break;
1671 default:
1672 *eax = *ebx = *ecx = *edx = 0;
1676 /*
1677 * called to collect tsc-related data only for save file or live
1678 * migrate; called after last rdtsc is done on this incarnation
1679 */
1680 void tsc_get_info(struct domain *d, uint32_t *tsc_mode,
1681 uint64_t *elapsed_nsec, uint32_t *gtsc_khz,
1682 uint32_t *incarnation)
1684 *incarnation = d->arch.incarnation;
1685 switch ( *tsc_mode = d->arch.tsc_mode )
1687 case TSC_MODE_NEVER_EMULATE:
1688 *elapsed_nsec = *gtsc_khz = 0;
1689 break;
1690 case TSC_MODE_ALWAYS_EMULATE:
1691 *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
1692 *gtsc_khz = d->arch.tsc_khz;
1693 break;
1694 case TSC_MODE_DEFAULT:
1695 if ( d->arch.vtsc )
1697 *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
1698 *gtsc_khz = d->arch.tsc_khz;
1699 } else {
1700 uint64_t tsc = 0;
1701 rdtscll(tsc);
1702 *elapsed_nsec = scale_delta(tsc,&d->arch.vtsc_to_ns);
1703 *gtsc_khz = cpu_khz;
1705 break;
1706 case TSC_MODE_PVRDTSCP:
1707 if ( d->arch.vtsc )
1709 *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
1710 *gtsc_khz = cpu_khz;
1711 } else {
1712 uint64_t tsc = 0;
1713 rdtscll(tsc);
1714 *elapsed_nsec = scale_delta(tsc,&d->arch.vtsc_to_ns) -
1715 d->arch.vtsc_offset;
1716 *gtsc_khz = 0; /* ignored by tsc_set_info */
1718 break;
1722 /*
1723 * This may be called as many as three times for a domain, once when the
1724 * hypervisor creates the domain, once when the toolstack creates the
1725 * domain and, if restoring/migrating, once when saved/migrated values
1726 * are restored. Care must be taken that, if multiple calls occur,
1727 * only the last "sticks" and all are completed before the guest executes
1728 * an rdtsc instruction.
1729 */
1730 void tsc_set_info(struct domain *d,
1731 uint32_t tsc_mode, uint64_t elapsed_nsec,
1732 uint32_t gtsc_khz, uint32_t incarnation)
1734 if ( d->domain_id == 0 || d->domain_id == DOMID_INVALID )
1736 d->arch.vtsc = 0;
1737 return;
1739 switch ( d->arch.tsc_mode = tsc_mode )
1741 case TSC_MODE_NEVER_EMULATE:
1742 d->arch.vtsc = 0;
1743 break;
1744 case TSC_MODE_ALWAYS_EMULATE:
1745 d->arch.vtsc = 1;
1746 d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
1747 d->arch.tsc_khz = gtsc_khz ? gtsc_khz : cpu_khz;
1748 set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000 );
1749 d->arch.ns_to_vtsc = scale_reciprocal(d->arch.vtsc_to_ns);
1750 break;
1751 case TSC_MODE_DEFAULT:
1752 d->arch.vtsc = 1;
1753 d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
1754 d->arch.tsc_khz = gtsc_khz ? gtsc_khz : cpu_khz;
1755 set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000 );
1756 /* use native TSC if initial host has safe TSC and not migrated yet */
1757 if ( host_tsc_is_safe() && incarnation == 0 )
1758 d->arch.vtsc = 0;
1759 else
1760 d->arch.ns_to_vtsc = scale_reciprocal(d->arch.vtsc_to_ns);
1761 break;
1762 case TSC_MODE_PVRDTSCP:
1763 d->arch.vtsc = boot_cpu_has(X86_FEATURE_RDTSCP) &&
1764 host_tsc_is_safe() ? 0 : 1;
1765 d->arch.tsc_khz = cpu_khz;
1766 set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000 );
1767 d->arch.ns_to_vtsc = scale_reciprocal(d->arch.vtsc_to_ns);
1768 if ( d->arch.vtsc )
1769 d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
1770 else {
1771 /* when using native TSC, offset is nsec relative to power-on
1772 * of physical machine */
1773 uint64_t tsc = 0;
1774 rdtscll(tsc);
1775 d->arch.vtsc_offset = scale_delta(tsc,&d->arch.vtsc_to_ns) -
1776 elapsed_nsec;
1778 break;
1780 d->arch.incarnation = incarnation + 1;
1781 if ( is_hvm_domain(d) )
1782 hvm_set_rdtsc_exiting(d, d->arch.vtsc || hvm_gtsc_need_scale(d));
1785 /* vtsc may incur measurable performance degradation, diagnose with this */
1786 static void dump_softtsc(unsigned char key)
1788 struct domain *d;
1789 int domcnt = 0;
1790 extern unsigned int max_cstate;
1792 tsc_check_reliability();
1793 if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1794 printk("TSC marked as reliable, "
1795 "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
1796 else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC ) )
1798 printk("TSC has constant rate, ");
1799 if (max_cstate <= 2 && tsc_max_warp == 0)
1800 printk("no deep Cstates, passed warp test, deemed reliable, ");
1801 else
1802 printk("deep Cstates possible, so not reliable, ");
1803 printk("warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
1804 } else
1805 printk("TSC not marked as either constant or reliable, "
1806 "warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
1807 for_each_domain ( d )
1809 if ( d->domain_id == 0 && d->arch.tsc_mode == TSC_MODE_DEFAULT )
1810 continue;
1811 printk("dom%u%s: mode=%d",d->domain_id,
1812 is_hvm_domain(d) ? "(hvm)" : "", d->arch.tsc_mode);
1813 if ( d->arch.vtsc_offset )
1814 printk(",ofs=0x%"PRIx64"",d->arch.vtsc_offset);
1815 if ( d->arch.tsc_khz )
1816 printk(",khz=%"PRIu32"",d->arch.tsc_khz);
1817 if ( d->arch.incarnation )
1818 printk(",inc=%"PRIu32"",d->arch.incarnation);
1819 if ( !(d->arch.vtsc_kerncount | d->arch.vtsc_usercount) )
1821 printk("\n");
1822 continue;
1824 if ( is_hvm_domain(d) )
1825 printk(",vtsc count: %"PRIu64" total\n",
1826 d->arch.vtsc_kerncount);
1827 else
1828 printk(",vtsc count: %"PRIu64" kernel, %"PRIu64" user\n",
1829 d->arch.vtsc_kerncount, d->arch.vtsc_usercount);
1830 domcnt++;
1833 if ( !domcnt )
1834 printk("No domains have emulated TSC\n");
1837 static struct keyhandler dump_softtsc_keyhandler = {
1838 .diagnostic = 1,
1839 .u.fn = dump_softtsc,
1840 .desc = "dump softtsc stats"
1841 };
1843 static int __init setup_dump_softtsc(void)
1845 register_keyhandler('s', &dump_softtsc_keyhandler);
1846 return 0;
1848 __initcall(setup_dump_softtsc);
1850 /*
1851 * Local variables:
1852 * mode: C
1853 * c-set-style: "BSD"
1854 * c-basic-offset: 4
1855 * tab-width: 4
1856 * indent-tabs-mode: nil
1857 * End:
1858 */