debuggers.hg

view xen/arch/x86/time.c @ 20957:f330b15f885d

Some time-handling fixes.

Fixes my domU boot hangs (when using vtsc) due to vtsc_offset being less
than the local CPU's stime_local_stamp, leading to a bogus
vcpu_time_info.tsc_timestamp.

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Feb 08 10:14:48 2010 +0000 (2010-02-08)
parents 5668c36282ea
children 804304d4e05d
line source
1 /******************************************************************************
2 * arch/x86/time.c
3 *
4 * Per-CPU time calibration and management.
5 *
6 * Copyright (c) 2002-2005, K A Fraser
7 *
8 * Portions from Linux are:
9 * Copyright (c) 1991, 1992, 1995 Linus Torvalds
10 */
12 #include <xen/config.h>
13 #include <xen/errno.h>
14 #include <xen/event.h>
15 #include <xen/sched.h>
16 #include <xen/lib.h>
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/time.h>
20 #include <xen/timer.h>
21 #include <xen/smp.h>
22 #include <xen/irq.h>
23 #include <xen/softirq.h>
24 #include <xen/keyhandler.h>
25 #include <xen/guest_access.h>
26 #include <asm/io.h>
27 #include <asm/msr.h>
28 #include <asm/mpspec.h>
29 #include <asm/processor.h>
30 #include <asm/fixmap.h>
31 #include <asm/mc146818rtc.h>
32 #include <asm/div64.h>
33 #include <asm/acpi.h>
34 #include <asm/hpet.h>
35 #include <io_ports.h>
36 #include <asm/setup.h> /* for early_time_init */
37 #include <public/arch-x86/cpuid.h>
39 /* opt_clocksource: Force clocksource to one of: pit, hpet, cyclone, acpi. */
40 static char __initdata opt_clocksource[10];
41 string_param("clocksource", opt_clocksource);
43 unsigned long cpu_khz; /* CPU clock frequency in kHz. */
44 DEFINE_SPINLOCK(rtc_lock);
45 unsigned long pit0_ticks;
46 static u32 wc_sec, wc_nsec; /* UTC time at last 'time update'. */
47 static DEFINE_SPINLOCK(wc_lock);
49 struct cpu_time {
50 u64 local_tsc_stamp;
51 s_time_t stime_local_stamp;
52 s_time_t stime_master_stamp;
53 struct time_scale tsc_scale;
54 };
56 struct platform_timesource {
57 char *id;
58 char *name;
59 u64 frequency;
60 u64 (*read_counter)(void);
61 int (*init)(struct platform_timesource *);
62 void (*resume)(struct platform_timesource *);
63 int counter_bits;
64 };
66 static DEFINE_PER_CPU(struct cpu_time, cpu_time);
68 /* Calibrate all CPUs to platform timer every EPOCH. */
69 #define EPOCH MILLISECS(1000)
70 static struct timer calibration_timer;
72 /*
73 * We simulate a 32-bit platform timer from the 16-bit PIT ch2 counter.
74 * Otherwise overflow happens too quickly (~50ms) for us to guarantee that
75 * softirq handling will happen in time.
76 *
77 * The pit_lock protects the 16- and 32-bit stamp fields as well as the hardware accesses to PIT channel 2.
78 */
79 static DEFINE_SPINLOCK(pit_lock);
80 static u16 pit_stamp16;
81 static u32 pit_stamp32;
82 static int using_pit;
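The wrap handling relies on modular 16-bit arithmetic: PIT channel 2 counts down, so (pit_stamp16 - count) evaluated as a u16 is exactly the number of ticks elapsed since the previous sample, even when the hardware counter has wrapped in between. A minimal user-space sketch of the same trick (the fold_pit_sample() helper is hypothetical, not part of this file):

    #include <stdint.h>
    #include <stdio.h>

    static uint16_t pit_stamp16;
    static uint32_t pit_stamp32;

    /* Fold one sample of a down-counting 16-bit counter into a 32-bit count. */
    static void fold_pit_sample(uint16_t count)
    {
        pit_stamp32 += (uint16_t)(pit_stamp16 - count);
        pit_stamp16 = count;
    }

    int main(void)
    {
        /* Successive reads of a down-counter; the fourth read has wrapped. */
        uint16_t samples[] = { 60000, 40000, 10000, 60000, 30000 };
        unsigned int i;

        pit_stamp16 = 65535;
        for ( i = 0; i < sizeof(samples) / sizeof(samples[0]); i++ )
        {
            fold_pit_sample(samples[i]);
            printf("sample %u: pit_stamp32 = %u\n", i, pit_stamp32);
        }
        return 0;
    }

The 10000 -> 60000 transition correctly accumulates 15536 ticks: 10000 down to the wrap, plus 65536 - 60000 beyond it.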
84 /*
85 * 32-bit division of integer dividend and integer divisor yielding
86 * 32-bit fractional quotient.
87 */
88 static inline u32 div_frac(u32 dividend, u32 divisor)
89 {
90 u32 quotient, remainder;
91 ASSERT(dividend < divisor);
92 asm (
93 "divl %4"
94 : "=a" (quotient), "=d" (remainder)
95 : "0" (0), "1" (dividend), "r" (divisor) );
96 return quotient;
97 }
99 /*
100 * 32-bit multiplication of multiplicand and fractional multiplier
101 * yielding 32-bit product (radix point at same position as in multiplicand).
102 */
103 static inline u32 mul_frac(u32 multiplicand, u32 multiplier)
104 {
105 u32 product_int, product_frac;
106 asm (
107 "mul %3"
108 : "=a" (product_frac), "=d" (product_int)
109 : "0" (multiplicand), "r" (multiplier) );
110 return product_int;
111 }
113 /*
114 * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction,
115 * yielding a 64-bit result.
116 */
117 static inline u64 scale_delta(u64 delta, struct time_scale *scale)
118 {
119 u64 product;
120 #ifdef CONFIG_X86_32
121 u32 tmp1, tmp2;
122 #endif
124 if ( scale->shift < 0 )
125 delta >>= -scale->shift;
126 else
127 delta <<= scale->shift;
129 #ifdef CONFIG_X86_32
130 asm (
131 "mul %5 ; "
132 "mov %4,%%eax ; "
133 "mov %%edx,%4 ; "
134 "mul %5 ; "
135 "xor %5,%5 ; "
136 "add %4,%%eax ; "
137 "adc %5,%%edx ; "
138 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
139 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (scale->mul_frac) );
140 #else
141 asm (
142 "mul %%rdx ; shrd $32,%%rdx,%%rax"
143 : "=a" (product) : "0" (delta), "d" ((u64)scale->mul_frac) );
144 #endif
146 return product;
147 }
149 #define _TS_MUL_FRAC_IDENTITY 0x80000000UL
151 /* Compute the reciprocal of the given time_scale. */
152 static inline struct time_scale scale_reciprocal(struct time_scale scale)
153 {
154 struct time_scale reciprocal;
155 u32 dividend;
157 ASSERT(scale.mul_frac != 0);
158 dividend = _TS_MUL_FRAC_IDENTITY;
159 reciprocal.shift = 1 - scale.shift;
160 while ( unlikely(dividend >= scale.mul_frac) )
161 {
162 dividend >>= 1;
163 reciprocal.shift++;
164 }
166 asm (
167 "divl %4"
168 : "=a" (reciprocal.mul_frac), "=d" (dividend)
169 : "0" (0), "1" (dividend), "r" (scale.mul_frac) );
171 return reciprocal;
172 }
174 /*
175 * cpumask denoting the CPUs that need the timer interrupt delivered as
176 * IPIs in place of their local APIC timers
177 */
178 extern int xen_cpuidle;
179 static cpumask_t pit_broadcast_mask;
181 static void smp_send_timer_broadcast_ipi(void)
182 {
183 int cpu = smp_processor_id();
184 cpumask_t mask;
186 cpus_and(mask, cpu_online_map, pit_broadcast_mask);
188 if ( cpu_isset(cpu, mask) )
189 {
190 cpu_clear(cpu, mask);
191 raise_softirq(TIMER_SOFTIRQ);
192 }
194 if ( !cpus_empty(mask) )
195 {
196 cpumask_raise_softirq(mask, TIMER_SOFTIRQ);
197 }
198 }
200 static void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs)
201 {
202 ASSERT(local_irq_is_enabled());
204 if ( hpet_legacy_irq_tick() )
205 return;
207 /* Only for start-of-day interrupt tests in io_apic.c. */
208 (*(volatile unsigned long *)&pit0_ticks)++;
210 /* Rough hack to allow accurate timers to sort-of-work with no APIC. */
211 if ( !cpu_has_apic )
212 raise_softirq(TIMER_SOFTIRQ);
214 if ( xen_cpuidle )
215 smp_send_timer_broadcast_ipi();
217 /* Emulate a 32-bit PIT counter. */
218 if ( using_pit )
219 {
220 u16 count;
222 spin_lock_irq(&pit_lock);
224 outb(0x80, PIT_MODE);
225 count = inb(PIT_CH2);
226 count |= inb(PIT_CH2) << 8;
228 pit_stamp32 += (u16)(pit_stamp16 - count);
229 pit_stamp16 = count;
231 spin_unlock_irq(&pit_lock);
232 }
233 }
235 static struct irqaction __read_mostly irq0 = { timer_interrupt, "timer", NULL };
237 /* ------ Calibrate the TSC -------
238 * Return processor ticks per second / CALIBRATE_FRAC.
239 */
241 #define CLOCK_TICK_RATE 1193182 /* system crystal frequency (Hz) */
242 #define CALIBRATE_FRAC 20 /* calibrate over 50ms */
243 #define CALIBRATE_LATCH ((CLOCK_TICK_RATE+(CALIBRATE_FRAC/2))/CALIBRATE_FRAC)
245 static u64 init_pit_and_calibrate_tsc(void)
246 {
247 u64 start, end;
248 unsigned long count;
250 /* Set PIT channel 0 to HZ Hz. */
251 #define LATCH (((CLOCK_TICK_RATE)+(HZ/2))/HZ)
252 outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
253 outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
254 outb(LATCH >> 8, PIT_CH0); /* MSB */
256 /* Set the Gate high, disable speaker */
257 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
259 /*
260 * Now let's take care of CTC channel 2
261 *
262 * Set the Gate high, program CTC channel 2 for mode 0, (interrupt on
263 * terminal count mode), binary count, load CALIBRATE_LATCH count (LSB and MSB)
264 * to begin countdown.
265 */
266 outb(0xb0, PIT_MODE); /* binary, mode 0, LSB/MSB, Ch 2 */
267 outb(CALIBRATE_LATCH & 0xff, PIT_CH2); /* LSB of count */
268 outb(CALIBRATE_LATCH >> 8, PIT_CH2); /* MSB of count */
270 rdtscll(start);
271 for ( count = 0; (inb(0x61) & 0x20) == 0; count++ )
272 continue;
273 rdtscll(end);
275 /* Error if the CTC doesn't behave itself. */
276 if ( count == 0 )
277 return 0;
279 return ((end - start) * (u64)CALIBRATE_FRAC);
280 }
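As a worked example: CALIBRATE_LATCH = (1193182 + 20/2) / 20 = 59659 PIT ticks, which at 1193182 Hz is almost exactly 1/20 s = 50 ms. If, say, 120,000,000 TSC ticks elapse while channel 2 counts that latch down, the function returns 120,000,000 * CALIBRATE_FRAC = 2,400,000,000, i.e. it has measured a 2.4 GHz TSC.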
282 static void set_time_scale(struct time_scale *ts, u64 ticks_per_sec)
283 {
284 u64 tps64 = ticks_per_sec;
285 u32 tps32;
286 int shift = 0;
288 ASSERT(tps64 != 0);
290 while ( tps64 > (MILLISECS(1000)*2) )
291 {
292 tps64 >>= 1;
293 shift--;
294 }
296 tps32 = (u32)tps64;
297 while ( tps32 <= (u32)MILLISECS(1000) )
298 {
299 tps32 <<= 1;
300 shift++;
301 }
303 ts->mul_frac = div_frac(MILLISECS(1000), tps32);
304 ts->shift = shift;
305 }
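The struct time_scale pair implements the conversion ns = ((delta << shift) * mul_frac) >> 32, with shift chosen by set_time_scale() so that the shifted tick rate lies in (10^9, 2*10^9] and mul_frac = 10^9 * 2^32 / shifted_rate. A stand-alone sketch of the same arithmetic in portable C (the demo_* helpers are illustrative rewrites, not the hypervisor's implementations):

    #include <stdint.h>
    #include <stdio.h>

    struct time_scale { int shift; uint32_t mul_frac; };

    /* Same selection logic as set_time_scale(), written without inline asm. */
    static void demo_set_time_scale(struct time_scale *ts, uint64_t ticks_per_sec)
    {
        uint64_t tps = ticks_per_sec;
        int shift = 0;

        while ( tps > 2000000000ULL ) { tps >>= 1; shift--; }
        while ( tps <= 1000000000ULL ) { tps <<= 1; shift++; }

        ts->mul_frac = (uint32_t)(((uint64_t)1000000000ULL << 32) / tps);
        ts->shift = shift;
    }

    /* Portable equivalent of scale_delta(): 64x32 multiply, keep the top 64 bits. */
    static uint64_t demo_scale_delta(uint64_t delta, const struct time_scale *ts)
    {
        if ( ts->shift < 0 )
            delta >>= -ts->shift;
        else
            delta <<= ts->shift;

        /* Split the 64x32->96 bit multiply to avoid 128-bit arithmetic. */
        return ((delta >> 32) * ts->mul_frac) +
               (((delta & 0xffffffffULL) * ts->mul_frac) >> 32);
    }

    int main(void)
    {
        struct time_scale ts;

        demo_set_time_scale(&ts, 2400000000ULL);       /* a 2.4 GHz TSC */
        printf("shift=%d mul_frac=%u\n", ts.shift, ts.mul_frac);
        /* One second's worth of TSC ticks should map to ~10^9 ns. */
        printf("1s of ticks -> %llu ns\n",
               (unsigned long long)demo_scale_delta(2400000000ULL, &ts));
        return 0;
    }

For a 2.4 GHz TSC this yields shift = -1 and mul_frac = 3579139413, and one second's worth of ticks converts to 999999999 ns, one short of 10^9 due to fixed-point truncation.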
307 static atomic_t tsc_calibrate_gang = ATOMIC_INIT(0);
308 static unsigned int tsc_calibrate_status = 0;
310 void calibrate_tsc_bp(void)
311 {
312 while ( atomic_read(&tsc_calibrate_gang) != (num_booting_cpus() - 1) )
313 mb();
315 outb(CALIBRATE_LATCH & 0xff, PIT_CH2);
316 outb(CALIBRATE_LATCH >> 8, PIT_CH2);
318 tsc_calibrate_status = 1;
319 wmb();
321 while ( (inb(0x61) & 0x20) == 0 )
322 continue;
324 tsc_calibrate_status = 2;
325 wmb();
327 while ( atomic_read(&tsc_calibrate_gang) != 0 )
328 mb();
329 }
331 void calibrate_tsc_ap(void)
332 {
333 u64 t1, t2, ticks_per_sec;
335 atomic_inc(&tsc_calibrate_gang);
337 while ( tsc_calibrate_status < 1 )
338 mb();
340 rdtscll(t1);
342 while ( tsc_calibrate_status < 2 )
343 mb();
345 rdtscll(t2);
347 ticks_per_sec = (t2 - t1) * (u64)CALIBRATE_FRAC;
348 set_time_scale(&this_cpu(cpu_time).tsc_scale, ticks_per_sec);
350 atomic_dec(&tsc_calibrate_gang);
351 }
353 static char *freq_string(u64 freq)
354 {
355 static char s[20];
356 unsigned int x, y;
357 y = (unsigned int)do_div(freq, 1000000) / 1000;
358 x = (unsigned int)freq;
359 snprintf(s, sizeof(s), "%u.%03uMHz", x, y);
360 return s;
361 }
363 /************************************************************
364 * PLATFORM TIMER 1: PROGRAMMABLE INTERVAL TIMER (LEGACY PIT)
365 */
367 static u64 read_pit_count(void)
368 {
369 u16 count16;
370 u32 count32;
371 unsigned long flags;
373 spin_lock_irqsave(&pit_lock, flags);
375 outb(0x80, PIT_MODE);
376 count16 = inb(PIT_CH2);
377 count16 |= inb(PIT_CH2) << 8;
379 count32 = pit_stamp32 + (u16)(pit_stamp16 - count16);
381 spin_unlock_irqrestore(&pit_lock, flags);
383 return count32;
384 }
386 static int __init init_pit(struct platform_timesource *pts)
387 {
388 using_pit = 1;
389 return 1;
390 }
392 static struct platform_timesource __initdata plt_pit =
393 {
394 .id = "pit",
395 .name = "PIT",
396 .frequency = CLOCK_TICK_RATE,
397 .read_counter = read_pit_count,
398 .counter_bits = 32,
399 .init = init_pit
400 };
402 /************************************************************
403 * PLATFORM TIMER 2: HIGH PRECISION EVENT TIMER (HPET)
404 */
406 static u64 read_hpet_count(void)
407 {
408 return hpet_read32(HPET_COUNTER);
409 }
411 static int __init init_hpet(struct platform_timesource *pts)
412 {
413 u64 hpet_rate = hpet_setup();
415 if ( hpet_rate == 0 )
416 return 0;
418 pts->frequency = hpet_rate;
419 return 1;
420 }
422 static void resume_hpet(struct platform_timesource *pts)
423 {
424 u64 hpet_rate = hpet_setup();
426 BUG_ON(hpet_rate == 0);
427 pts->frequency = hpet_rate;
428 }
430 static struct platform_timesource __initdata plt_hpet =
431 {
432 .id = "hpet",
433 .name = "HPET",
434 .read_counter = read_hpet_count,
435 .counter_bits = 32,
436 .init = init_hpet,
437 .resume = resume_hpet
438 };
440 /************************************************************
441 * PLATFORM TIMER 3: IBM 'CYCLONE' TIMER
442 */
444 int use_cyclone;
446 /*
447 * Although the counter is read via a 64-bit register, I believe it is actually
448 * a 40-bit counter. Since this will wrap, I read only the low 32 bits and
449 * periodically fold into a 64-bit software counter, just as for PIT and HPET.
450 */
451 #define CYCLONE_CBAR_ADDR 0xFEB00CD0
452 #define CYCLONE_PMCC_OFFSET 0x51A0
453 #define CYCLONE_MPMC_OFFSET 0x51D0
454 #define CYCLONE_MPCS_OFFSET 0x51A8
455 #define CYCLONE_TIMER_FREQ 100000000
457 /* Cyclone MPMC0 register. */
458 static volatile u32 *cyclone_timer;
460 static u64 read_cyclone_count(void)
461 {
462 return *cyclone_timer;
463 }
465 static volatile u32 *__init map_cyclone_reg(unsigned long regaddr)
466 {
467 unsigned long pageaddr = regaddr & PAGE_MASK;
468 unsigned long offset = regaddr & ~PAGE_MASK;
469 set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
470 return (volatile u32 *)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
471 }
473 static int __init init_cyclone(struct platform_timesource *pts)
474 {
475 u32 base;
477 if ( !use_cyclone )
478 return 0;
480 /* Find base address. */
481 base = *(map_cyclone_reg(CYCLONE_CBAR_ADDR));
482 if ( base == 0 )
483 {
484 printk(KERN_ERR "Cyclone: Could not find valid CBAR value.\n");
485 return 0;
486 }
488 /* Enable timer and map the counter register. */
489 *(map_cyclone_reg(base + CYCLONE_PMCC_OFFSET)) = 1;
490 *(map_cyclone_reg(base + CYCLONE_MPCS_OFFSET)) = 1;
491 cyclone_timer = map_cyclone_reg(base + CYCLONE_MPMC_OFFSET);
492 return 1;
493 }
495 static struct platform_timesource __initdata plt_cyclone =
496 {
497 .id = "cyclone",
498 .name = "IBM Cyclone",
499 .frequency = CYCLONE_TIMER_FREQ,
500 .read_counter = read_cyclone_count,
501 .counter_bits = 32,
502 .init = init_cyclone
503 };
505 /************************************************************
506 * PLATFORM TIMER 4: ACPI PM TIMER
507 */
509 u32 pmtmr_ioport;
511 /* ACPI PM timer ticks at 3.579545 MHz. */
512 #define ACPI_PM_FREQUENCY 3579545
514 static u64 read_pmtimer_count(void)
515 {
516 return inl(pmtmr_ioport);
517 }
519 static int __init init_pmtimer(struct platform_timesource *pts)
520 {
521 if ( pmtmr_ioport == 0 )
522 return 0;
524 return 1;
525 }
527 static struct platform_timesource __initdata plt_pmtimer =
528 {
529 .id = "acpi",
530 .name = "ACPI PM Timer",
531 .frequency = ACPI_PM_FREQUENCY,
532 .read_counter = read_pmtimer_count,
533 .counter_bits = 24,
534 .init = init_pmtimer
535 };
537 static struct time_scale pmt_scale;
538 static struct time_scale pmt_scale_r;
539 static __init int init_pmtmr_scale(void)
540 {
541 set_time_scale(&pmt_scale, ACPI_PM_FREQUENCY);
542 pmt_scale_r = scale_reciprocal(pmt_scale);
543 return 0;
544 }
545 __initcall(init_pmtmr_scale);
547 uint64_t acpi_pm_tick_to_ns(uint64_t ticks)
548 {
549 return scale_delta(ticks, &pmt_scale);
550 }
552 uint64_t ns_to_acpi_pm_tick(uint64_t ns)
553 {
554 return scale_delta(ns, &pmt_scale_r);
555 }
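As a quick sanity check on the two conversions: one second's worth of PM-timer ticks, acpi_pm_tick_to_ns(3579545), comes out at roughly 10^9 ns, and ns_to_acpi_pm_tick(1000000000) at roughly 3579545 ticks; both are exact only up to the 32-bit fixed-point truncation in scale_delta() and scale_reciprocal().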
557 /************************************************************
558 * GENERIC PLATFORM TIMER INFRASTRUCTURE
559 */
561 static struct platform_timesource plt_src; /* details of chosen timesource */
562 static u64 plt_mask; /* hardware-width mask */
563 static u64 plt_overflow_period; /* ns between calls to plt_overflow() */
564 static struct time_scale plt_scale; /* scale: platform counter -> nanosecs */
566 /* Protected by platform_timer_lock. */
567 static DEFINE_SPINLOCK(platform_timer_lock);
568 static s_time_t stime_platform_stamp; /* System time at below platform time */
569 static u64 platform_timer_stamp; /* Platform time at above system time */
570 static u64 plt_stamp64; /* 64-bit platform counter stamp */
571 static u64 plt_stamp; /* hardware-width platform counter stamp */
572 static struct timer plt_overflow_timer;
574 static void plt_overflow(void *unused)
575 {
576 u64 count;
578 spin_lock_irq(&platform_timer_lock);
579 count = plt_src.read_counter();
580 plt_stamp64 += (count - plt_stamp) & plt_mask;
581 plt_stamp = count;
582 spin_unlock_irq(&platform_timer_lock);
584 set_timer(&plt_overflow_timer, NOW() + plt_overflow_period);
585 }
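plt_overflow() keeps a 64-bit software extension of a hardware counter of arbitrary width: provided the counter is sampled at least once per half wrap period, (count - plt_stamp) & plt_mask recovers the ticks elapsed since the previous sample even across a wrap. For the 24-bit ACPI PM timer at 3.579545 MHz, plt_overflow_period works out to 2^23 / 3579545 Hz, roughly 2.34 seconds. A self-contained sketch of the folding step (the fold() helper and the 24-bit mask are illustrative choices, not taken from this file):

    #include <stdint.h>
    #include <stdio.h>

    #define COUNTER_BITS 24
    static const uint64_t mask = ((uint64_t)1 << COUNTER_BITS) - 1;

    static uint64_t stamp64;   /* software 64-bit extension */
    static uint64_t stamp;     /* last raw hardware sample  */

    static void fold(uint64_t hw_count)
    {
        stamp64 += (hw_count - stamp) & mask;
        stamp = hw_count;
    }

    int main(void)
    {
        /* Raw 24-bit samples; the third one has wrapped past 2^24. */
        uint64_t samples[] = { 0x400000, 0xf00000, 0x100000, 0x900000 };
        unsigned int i;

        for ( i = 0; i < sizeof(samples) / sizeof(samples[0]); i++ )
        {
            fold(samples[i]);
            printf("sample %#llx -> stamp64 = %llu\n",
                   (unsigned long long)samples[i], (unsigned long long)stamp64);
        }
        return 0;
    }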
587 static s_time_t __read_platform_stime(u64 platform_time)
588 {
589 u64 diff = platform_time - platform_timer_stamp;
590 ASSERT(spin_is_locked(&platform_timer_lock));
591 return (stime_platform_stamp + scale_delta(diff, &plt_scale));
592 }
594 static s_time_t read_platform_stime(void)
595 {
596 u64 count;
597 s_time_t stime;
599 ASSERT(!local_irq_is_enabled());
601 spin_lock(&platform_timer_lock);
602 count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask);
603 stime = __read_platform_stime(count);
604 spin_unlock(&platform_timer_lock);
606 return stime;
607 }
609 static void platform_time_calibration(void)
610 {
611 u64 count;
612 s_time_t stamp;
613 unsigned long flags;
615 spin_lock_irqsave(&platform_timer_lock, flags);
616 count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask);
617 stamp = __read_platform_stime(count);
618 stime_platform_stamp = stamp;
619 platform_timer_stamp = count;
620 spin_unlock_irqrestore(&platform_timer_lock, flags);
621 }
623 static void resume_platform_timer(void)
624 {
625 /* The timer source can be reset when coming back from S3 to S0. */
626 if ( plt_src.resume )
627 plt_src.resume(&plt_src);
629 plt_stamp64 = platform_timer_stamp;
630 plt_stamp = plt_src.read_counter();
631 }
633 static void __init init_platform_timer(void)
634 {
635 static struct platform_timesource * __initdata plt_timers[] = {
636 &plt_cyclone, &plt_hpet, &plt_pmtimer, &plt_pit
637 };
639 struct platform_timesource *pts = NULL;
640 int i, rc = -1;
642 if ( opt_clocksource[0] != '\0' )
643 {
644 for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ )
645 {
646 pts = plt_timers[i];
647 if ( !strcmp(opt_clocksource, pts->id) )
648 {
649 rc = pts->init(pts);
650 break;
651 }
652 }
654 if ( rc <= 0 )
655 printk("WARNING: %s clocksource '%s'.\n",
656 (rc == 0) ? "Could not initialise" : "Unrecognised",
657 opt_clocksource);
658 }
660 if ( rc <= 0 )
661 {
662 for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ )
663 {
664 pts = plt_timers[i];
665 if ( (rc = pts->init(pts)) > 0 )
666 break;
667 }
668 }
670 BUG_ON(rc <= 0);
672 plt_mask = (u64)~0ull >> (64 - pts->counter_bits);
674 set_time_scale(&plt_scale, pts->frequency);
676 plt_overflow_period = scale_delta(
677 1ull << (pts->counter_bits-1), &plt_scale);
678 init_timer(&plt_overflow_timer, plt_overflow, NULL, 0);
679 plt_src = *pts;
680 plt_overflow(NULL);
682 platform_timer_stamp = plt_stamp64;
683 stime_platform_stamp = NOW();
685 printk("Platform timer is %s %s\n",
686 freq_string(pts->frequency), pts->name);
687 }
689 void cstate_restore_tsc(void)
690 {
691 struct cpu_time *t = &this_cpu(cpu_time);
692 struct time_scale sys_to_tsc = scale_reciprocal(t->tsc_scale);
693 s_time_t stime_delta;
694 u64 new_tsc;
696 if ( boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
697 return;
699 stime_delta = read_platform_stime() - t->stime_master_stamp;
700 if ( stime_delta < 0 )
701 stime_delta = 0;
703 new_tsc = t->local_tsc_stamp + scale_delta(stime_delta, &sys_to_tsc);
705 write_tsc(new_tsc);
706 }
708 /***************************************************************************
709 * CMOS Timer functions
710 ***************************************************************************/
712 /* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
713 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
714 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
715 *
716 * [For the Julian calendar (which was used in Russia before 1917,
717 * Britain & colonies before 1752, anywhere else before 1582,
718 * and is still in use by some communities) leave out the
719 * -year/100+year/400 terms, and add 10.]
720 *
721 * This algorithm was first published by Gauss (I think).
722 *
723 * WARNING: this function will overflow on 2106-02-07 06:28:16 on
724 * machines where long is 32-bit! (However, as time_t is signed, we
725 * will already get problems at other places on 2038-01-19 03:14:08)
726 */
727 unsigned long
728 mktime (unsigned int year, unsigned int mon,
729 unsigned int day, unsigned int hour,
730 unsigned int min, unsigned int sec)
731 {
732 /* 1..12 -> 11,12,1..10: put Feb last since it has a leap day. */
733 if ( 0 >= (int) (mon -= 2) )
734 {
735 mon += 12;
736 year -= 1;
737 }
739 return ((((unsigned long)(year/4 - year/100 + year/400 + 367*mon/12 + day)+
740 year*365 - 719499
741 )*24 + hour /* now have hours */
742 )*60 + min /* now have minutes */
743 )*60 + sec; /* finally seconds */
744 }
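The day-count expression can be spot-checked against a known date; for instance the date of this changeset, 2010-02-08 10:14:48 UTC, corresponds to Unix time 1265624088. A stand-alone check (demo_mktime() simply repeats the formula above outside the hypervisor):

    #include <stdio.h>

    static unsigned long demo_mktime(unsigned int year, unsigned int mon,
                                     unsigned int day, unsigned int hour,
                                     unsigned int min, unsigned int sec)
    {
        /* Same Gauss-style day count as the hypervisor's mktime() above. */
        if ( 0 >= (int)(mon -= 2) )
        {
            mon += 12;
            year -= 1;
        }

        return ((((unsigned long)(year/4 - year/100 + year/400 + 367*mon/12 + day) +
                  year*365 - 719499
                 ) * 24 + hour
                ) * 60 + min
               ) * 60 + sec;
    }

    int main(void)
    {
        printf("%lu\n", demo_mktime(2010, 2, 8, 10, 14, 48)); /* expect 1265624088 */
        return 0;
    }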
746 static unsigned long __get_cmos_time(void)
747 {
748 unsigned int year, mon, day, hour, min, sec;
750 sec = CMOS_READ(RTC_SECONDS);
751 min = CMOS_READ(RTC_MINUTES);
752 hour = CMOS_READ(RTC_HOURS);
753 day = CMOS_READ(RTC_DAY_OF_MONTH);
754 mon = CMOS_READ(RTC_MONTH);
755 year = CMOS_READ(RTC_YEAR);
757 if ( !(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD )
758 {
759 BCD_TO_BIN(sec);
760 BCD_TO_BIN(min);
761 BCD_TO_BIN(hour);
762 BCD_TO_BIN(day);
763 BCD_TO_BIN(mon);
764 BCD_TO_BIN(year);
765 }
767 if ( (year += 1900) < 1970 )
768 year += 100;
770 return mktime(year, mon, day, hour, min, sec);
771 }
773 static unsigned long get_cmos_time(void)
774 {
775 unsigned long res, flags;
776 int i;
778 spin_lock_irqsave(&rtc_lock, flags);
780 /* read RTC exactly on falling edge of update flag */
781 for ( i = 0 ; i < 1000000 ; i++ ) /* may take up to 1 second... */
782 if ( (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) )
783 break;
784 for ( i = 0 ; i < 1000000 ; i++ ) /* must try at least 2.228 ms */
785 if ( !(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) )
786 break;
788 res = __get_cmos_time();
790 spin_unlock_irqrestore(&rtc_lock, flags);
791 return res;
792 }
794 /***************************************************************************
795 * System Time
796 ***************************************************************************/
798 s_time_t get_s_time(void)
799 {
800 struct cpu_time *t = &this_cpu(cpu_time);
801 u64 tsc, delta;
802 s_time_t now;
804 rdtscll(tsc);
805 delta = tsc - t->local_tsc_stamp;
806 now = t->stime_local_stamp + scale_delta(delta, &t->tsc_scale);
808 return now;
809 }
811 /* Explicitly OR with 1 just in case version number gets out of sync. */
812 #define version_update_begin(v) (((v)+1)|1)
813 #define version_update_end(v) ((v)+1)
815 static void __update_vcpu_system_time(struct vcpu *v, int force)
816 {
817 struct cpu_time *t;
818 struct vcpu_time_info *u, _u;
819 XEN_GUEST_HANDLE(vcpu_time_info_t) user_u;
820 struct domain *d = v->domain;
821 s_time_t tsc_stamp = 0;
823 if ( v->vcpu_info == NULL )
824 return;
826 t = &this_cpu(cpu_time);
827 u = &vcpu_info(v, time);
829 if ( d->arch.vtsc )
830 {
831 u64 delta = max_t(s64, t->stime_local_stamp - d->arch.vtsc_offset, 0);
832 tsc_stamp = scale_delta(delta, &d->arch.ns_to_vtsc);
833 }
834 else
835 {
836 tsc_stamp = t->local_tsc_stamp;
837 }
839 memset(&_u, 0, sizeof(_u));
841 if ( d->arch.vtsc )
842 {
843 _u.tsc_timestamp = tsc_stamp;
844 _u.system_time = t->stime_local_stamp;
845 _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
846 _u.tsc_shift = d->arch.vtsc_to_ns.shift;
847 }
848 else
849 {
850 _u.tsc_timestamp = t->local_tsc_stamp;
851 _u.system_time = t->stime_local_stamp;
852 _u.tsc_to_system_mul = t->tsc_scale.mul_frac;
853 _u.tsc_shift = (s8)t->tsc_scale.shift;
854 }
856 /* Don't bother unless timestamp record has changed or we are forced. */
857 _u.version = u->version; /* make versions match for memcmp test */
858 if ( !force && !memcmp(u, &_u, sizeof(_u)) )
859 return;
861 /* 1. Update guest kernel version. */
862 _u.version = u->version = version_update_begin(u->version);
863 wmb();
864 /* 2. Update all other guest kernel fields. */
865 *u = _u;
866 wmb();
867 /* 3. Update guest kernel version. */
868 u->version = version_update_end(u->version);
870 user_u = v->arch.time_info_guest;
871 if ( !guest_handle_is_null(user_u) )
872 {
873 /* 1. Update userspace version. */
874 __copy_field_to_guest(user_u, &_u, version);
875 wmb();
876 /* 2. Update all other userspace fields. */
877 __copy_to_guest(user_u, &_u, 1);
878 wmb();
879 /* 3. Update userspace version. */
880 _u.version = version_update_end(_u.version);
881 __copy_field_to_guest(user_u, &_u, version);
882 }
883 }
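The version field implements a seqlock-style protocol: the writer above makes the version odd before touching the record and even again afterwards, so a reader must retry until it observes the same even version before and after copying the fields. A sketch of the consumer side, which in reality lives in the guest rather than in this file (the demo_time_info layout mirrors the public vcpu_time_info structure; the compiler-barrier rmb() suffices only because x86 keeps reads ordered):

    #include <stdint.h>
    #include <string.h>

    struct demo_time_info {
        uint32_t version;
        uint32_t pad0;
        uint64_t tsc_timestamp;
        uint64_t system_time;
        uint32_t tsc_to_system_mul;
        int8_t   tsc_shift;
        int8_t   pad1[3];
    };

    #define rmb() __asm__ __volatile__ ("" ::: "memory") /* compiler barrier only */

    /* Take a consistent snapshot of a record that may be updated concurrently. */
    static void read_time_info(const volatile struct demo_time_info *src,
                               struct demo_time_info *dst)
    {
        uint32_t ver;

        do {
            ver = src->version;
            rmb();
            memcpy(dst, (const void *)src, sizeof(*dst));
            rmb();
        } while ( (ver & 1) || (src->version != ver) );
    }

    int main(void)
    {
        static volatile struct demo_time_info shared = { .version = 2,
                                                         .system_time = 123456789 };
        struct demo_time_info snap;

        read_time_info(&shared, &snap);
        return snap.system_time == 123456789 ? 0 : 1;
    }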
885 void update_vcpu_system_time(struct vcpu *v)
886 {
887 __update_vcpu_system_time(v, 0);
888 }
890 void force_update_vcpu_system_time(struct vcpu *v)
891 {
892 __update_vcpu_system_time(v, 1);
893 }
895 void update_domain_wallclock_time(struct domain *d)
896 {
897 uint32_t *wc_version;
899 spin_lock(&wc_lock);
901 wc_version = &shared_info(d, wc_version);
902 *wc_version = version_update_begin(*wc_version);
903 wmb();
905 shared_info(d, wc_sec) = wc_sec + d->time_offset_seconds;
906 shared_info(d, wc_nsec) = wc_nsec;
908 wmb();
909 *wc_version = version_update_end(*wc_version);
911 spin_unlock(&wc_lock);
912 }
914 static void update_domain_rtc(void)
915 {
916 struct domain *d;
918 rcu_read_lock(&domlist_read_lock);
920 for_each_domain ( d )
921 if ( is_hvm_domain(d) )
922 rtc_update_clock(d);
924 rcu_read_unlock(&domlist_read_lock);
925 }
927 void domain_set_time_offset(struct domain *d, int32_t time_offset_seconds)
928 {
929 d->time_offset_seconds = time_offset_seconds;
930 if ( is_hvm_domain(d) )
931 rtc_update_clock(d);
932 }
934 int cpu_frequency_change(u64 freq)
935 {
936 struct cpu_time *t = &this_cpu(cpu_time);
937 u64 curr_tsc;
939 /* Sanity check: CPU frequency allegedly dropping below 1MHz? */
940 if ( freq < 1000000u )
941 {
942 gdprintk(XENLOG_WARNING, "Rejecting CPU frequency change "
943 "to %"PRIu64" Hz.\n", freq);
944 return -EINVAL;
945 }
947 local_irq_disable();
948 /* Platform time /first/, as we may be delayed by platform_timer_lock. */
949 t->stime_master_stamp = read_platform_stime();
950 /* TSC-extrapolated time may be bogus after frequency change. */
951 /*t->stime_local_stamp = get_s_time();*/
952 t->stime_local_stamp = t->stime_master_stamp;
953 rdtscll(curr_tsc);
954 t->local_tsc_stamp = curr_tsc;
955 set_time_scale(&t->tsc_scale, freq);
956 local_irq_enable();
958 update_vcpu_system_time(current);
960 /* A full epoch should pass before we check for deviation. */
961 if ( smp_processor_id() == 0 )
962 {
963 set_timer(&calibration_timer, NOW() + EPOCH);
964 platform_time_calibration();
965 }
967 return 0;
968 }
970 /* Set clock to <secs,nsecs> after 00:00:00 UTC, 1 January, 1970. */
971 void do_settime(unsigned long secs, unsigned long nsecs, u64 system_time_base)
972 {
973 u64 x;
974 u32 y, _wc_sec, _wc_nsec;
975 struct domain *d;
977 x = (secs * 1000000000ULL) + (u64)nsecs - system_time_base;
978 y = do_div(x, 1000000000);
980 spin_lock(&wc_lock);
981 wc_sec = _wc_sec = (u32)x;
982 wc_nsec = _wc_nsec = (u32)y;
983 spin_unlock(&wc_lock);
985 rcu_read_lock(&domlist_read_lock);
986 for_each_domain ( d )
987 update_domain_wallclock_time(d);
988 rcu_read_unlock(&domlist_read_lock);
989 }
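A worked example of the bookkeeping: if dom0 calls do_settime(1265624088, 0, NOW()) at a moment when NOW() = 5*10^9 ns, then x = 1265624088*10^9 - 5*10^9, giving wc_sec = 1265624083 and wc_nsec = 0; in other words the pair records the wall-clock time at system time zero. A reader later recovers current wall time as wc_sec + (wc_nsec + NOW())/10^9 (see get_localtime() below), which at NOW() = 5*10^9 again yields 1265624088.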
991 /* Per-CPU communication between rendezvous IRQ and softirq handler. */
992 struct cpu_calibration {
993 u64 local_tsc_stamp;
994 s_time_t stime_local_stamp;
995 s_time_t stime_master_stamp;
996 };
997 static DEFINE_PER_CPU(struct cpu_calibration, cpu_calibration);
999 /* Softirq handler for per-CPU time calibration. */
1000 static void local_time_calibration(void)
1002 struct cpu_time *t = &this_cpu(cpu_time);
1003 struct cpu_calibration *c = &this_cpu(cpu_calibration);
1005 /*
1006 * System timestamps, extrapolated from local and master oscillators,
1007 * taken during this calibration and the previous calibration.
1008 */
1009 s_time_t prev_local_stime, curr_local_stime;
1010 s_time_t prev_master_stime, curr_master_stime;
1012 /* TSC timestamps taken during this calibration and prev calibration. */
1013 u64 prev_tsc, curr_tsc;
1015 /*
1016 * System time and TSC ticks elapsed during the previous calibration
1017 * 'epoch'. These values are down-shifted to fit in 32 bits.
1018 */
1019 u64 stime_elapsed64, tsc_elapsed64;
1020 u32 stime_elapsed32, tsc_elapsed32;
1022 /* The accumulated error in the local estimate. */
1023 u64 local_stime_err;
1025 /* Error correction to slow down a fast local clock. */
1026 u32 error_factor = 0;
1028 /* Calculated TSC shift to ensure 32-bit scale multiplier. */
1029 int tsc_shift = 0;
1031 /* The overall calibration scale multiplier. */
1032 u32 calibration_mul_frac;
1034 if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
1036 /* Atomically read cpu_calibration struct and write cpu_time struct. */
1037 local_irq_disable();
1038 t->local_tsc_stamp = c->local_tsc_stamp;
1039 t->stime_local_stamp = c->stime_master_stamp;
1040 t->stime_master_stamp = c->stime_master_stamp;
1041 local_irq_enable();
1042 update_vcpu_system_time(current);
1043 goto out;
1046 prev_tsc = t->local_tsc_stamp;
1047 prev_local_stime = t->stime_local_stamp;
1048 prev_master_stime = t->stime_master_stamp;
1050 /* Disabling IRQs ensures we atomically read cpu_calibration struct. */
1051 local_irq_disable();
1052 curr_tsc = c->local_tsc_stamp;
1053 curr_local_stime = c->stime_local_stamp;
1054 curr_master_stime = c->stime_master_stamp;
1055 local_irq_enable();
1057 #if 0
1058 printk("PRE%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64"\n",
1059 smp_processor_id(), prev_tsc, prev_local_stime, prev_master_stime);
1060 printk("CUR%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64
1061 " -> %"PRId64"\n",
1062 smp_processor_id(), curr_tsc, curr_local_stime, curr_master_stime,
1063 curr_master_stime - curr_local_stime);
1064 #endif
1066 /* Local time warps forward if it lags behind master time. */
1067 if ( curr_local_stime < curr_master_stime )
1068 curr_local_stime = curr_master_stime;
1070 stime_elapsed64 = curr_master_stime - prev_master_stime;
1071 tsc_elapsed64 = curr_tsc - prev_tsc;
1073 /*
1074 * Weirdness can happen if we lose sync with the platform timer.
1075 * We could be smarter here: resync platform timer with local timer?
1076 */
1077 if ( ((s64)stime_elapsed64 < (EPOCH / 2)) )
1078 goto out;
1080 /*
1081 * Calculate error-correction factor. This only slows down a fast local
1082 * clock (slow clocks are warped forwards). The scale factor is clamped
1083 * to >= 0.5.
1084 */
1085 if ( curr_local_stime != curr_master_stime )
1087 local_stime_err = curr_local_stime - curr_master_stime;
1088 if ( local_stime_err > EPOCH )
1089 local_stime_err = EPOCH;
1090 error_factor = div_frac(EPOCH, EPOCH + (u32)local_stime_err);
1093 /*
1094 * We require 0 < stime_elapsed < 2^31.
1095 * This allows us to binary shift a 32-bit tsc_elapsed such that:
1096 * stime_elapsed < tsc_elapsed <= 2*stime_elapsed
1097 */
1098 while ( ((u32)stime_elapsed64 != stime_elapsed64) ||
1099 ((s32)stime_elapsed64 < 0) )
1101 stime_elapsed64 >>= 1;
1102 tsc_elapsed64 >>= 1;
1105 /* stime_elapsed now fits in a 32-bit word. */
1106 stime_elapsed32 = (u32)stime_elapsed64;
1108 /* tsc_elapsed <= 2*stime_elapsed */
1109 while ( tsc_elapsed64 > (stime_elapsed32 * 2) )
1111 tsc_elapsed64 >>= 1;
1112 tsc_shift--;
1115 /* Local difference must now fit in 32 bits. */
1116 ASSERT((u32)tsc_elapsed64 == tsc_elapsed64);
1117 tsc_elapsed32 = (u32)tsc_elapsed64;
1119 /* tsc_elapsed > stime_elapsed */
1120 ASSERT(tsc_elapsed32 != 0);
1121 while ( tsc_elapsed32 <= stime_elapsed32 )
1123 tsc_elapsed32 <<= 1;
1124 tsc_shift++;
1127 calibration_mul_frac = div_frac(stime_elapsed32, tsc_elapsed32);
1128 if ( error_factor != 0 )
1129 calibration_mul_frac = mul_frac(calibration_mul_frac, error_factor);
1131 #if 0
1132 printk("---%d: %08x %08x %d\n", smp_processor_id(),
1133 error_factor, calibration_mul_frac, tsc_shift);
1134 #endif
1136 /* Record new timestamp information, atomically w.r.t. interrupts. */
1137 local_irq_disable();
1138 t->tsc_scale.mul_frac = calibration_mul_frac;
1139 t->tsc_scale.shift = tsc_shift;
1140 t->local_tsc_stamp = curr_tsc;
1141 t->stime_local_stamp = curr_local_stime;
1142 t->stime_master_stamp = curr_master_stime;
1143 local_irq_enable();
1145 update_vcpu_system_time(current);
1147 out:
1148 if ( smp_processor_id() == 0 )
1150 set_timer(&calibration_timer, NOW() + EPOCH);
1151 platform_time_calibration();
1155 /*
1156 * TSC Reliability check
1157 */
1159 /*
1160 * The original Linux version of this function is
1161 * Copyright (c) 2006, Red Hat, Inc., Ingo Molnar
1162 */
1163 void check_tsc_warp(unsigned long tsc_khz, unsigned long *max_warp)
1165 #define rdtsc_barrier() mb()
1166 static DEFINE_SPINLOCK(sync_lock);
1167 static cycles_t last_tsc;
1169 cycles_t start, now, prev, end;
1170 int i;
1172 rdtsc_barrier();
1173 start = get_cycles();
1174 rdtsc_barrier();
1176 /* The measurement runs for 20 msecs: */
1177 end = start + tsc_khz * 20ULL;
1178 now = start;
1180 for ( i = 0; ; i++ )
1182 /*
1183 * We take the global lock, measure TSC, save the
1184 * previous TSC that was measured (possibly on
1185 * another CPU) and update the previous TSC timestamp.
1186 */
1187 spin_lock(&sync_lock);
1188 prev = last_tsc;
1189 rdtsc_barrier();
1190 now = get_cycles();
1191 rdtsc_barrier();
1192 last_tsc = now;
1193 spin_unlock(&sync_lock);
1195 /*
1196 * Be nice every now and then (and also check whether measurement is
1197 * done [we also insert a 10-million-loop safety exit, so we don't
1198 * lock up in case the TSC readout is totally broken]):
1199 */
1200 if ( unlikely(!(i & 7)) )
1202 if ( (now > end) || (i > 10000000) )
1203 break;
1204 cpu_relax();
1205 /*touch_nmi_watchdog();*/
1208 /*
1209 * Outside the critical section we can now see whether we saw a
1210 * time-warp of the TSC going backwards:
1211 */
1212 if ( unlikely(prev > now) )
1214 spin_lock(&sync_lock);
1215 if ( *max_warp < prev - now )
1216 *max_warp = prev - now;
1217 spin_unlock(&sync_lock);
1222 static unsigned long tsc_max_warp, tsc_check_count;
1223 static cpumask_t tsc_check_cpumask = CPU_MASK_NONE;
1225 static void tsc_check_slave(void *unused)
1227 unsigned int cpu = smp_processor_id();
1228 local_irq_disable();
1229 while ( !cpu_isset(cpu, tsc_check_cpumask) )
1230 mb();
1231 check_tsc_warp(cpu_khz, &tsc_max_warp);
1232 cpu_clear(cpu, tsc_check_cpumask);
1233 local_irq_enable();
1236 void tsc_check_reliability(void)
1238 unsigned int cpu = smp_processor_id();
1239 static DEFINE_SPINLOCK(lock);
1241 spin_lock(&lock);
1243 tsc_check_count++;
1244 smp_call_function(tsc_check_slave, NULL, 0);
1245 tsc_check_cpumask = cpu_online_map;
1246 local_irq_disable();
1247 check_tsc_warp(cpu_khz, &tsc_max_warp);
1248 cpu_clear(cpu, tsc_check_cpumask);
1249 local_irq_enable();
1250 while ( !cpus_empty(tsc_check_cpumask) )
1251 cpu_relax();
1253 spin_unlock(&lock);
1256 /*
1257 * Rendezvous for all CPUs in IRQ context.
1258 * Master CPU snapshots the platform timer.
1260 * All CPUs snapshot their local TSC and their extrapolated system time.
1260 */
1261 struct calibration_rendezvous {
1262 cpumask_t cpu_calibration_map;
1263 atomic_t semaphore;
1264 s_time_t master_stime;
1265 u64 master_tsc_stamp;
1266 };
1268 /*
1269 * Keep TSCs in sync when they run at the same rate, but may stop in
1270 * deep-sleep C states.
1271 */
1272 static void time_calibration_tsc_rendezvous(void *_r)
1274 int i;
1275 struct cpu_calibration *c = &this_cpu(cpu_calibration);
1276 struct calibration_rendezvous *r = _r;
1277 unsigned int total_cpus = cpus_weight(r->cpu_calibration_map);
1279 /* Loop to get rid of cache effects on TSC skew. */
1280 for ( i = 4; i >= 0; i-- )
1282 if ( smp_processor_id() == 0 )
1284 while ( atomic_read(&r->semaphore) != (total_cpus - 1) )
1285 mb();
1287 if ( r->master_stime == 0 )
1289 r->master_stime = read_platform_stime();
1290 rdtscll(r->master_tsc_stamp);
1292 atomic_inc(&r->semaphore);
1294 if ( i == 0 )
1295 write_tsc(r->master_tsc_stamp);
1297 while ( atomic_read(&r->semaphore) != (2*total_cpus - 1) )
1298 mb();
1299 atomic_set(&r->semaphore, 0);
1301 else
1303 atomic_inc(&r->semaphore);
1304 while ( atomic_read(&r->semaphore) < total_cpus )
1305 mb();
1307 if ( i == 0 )
1308 write_tsc(r->master_tsc_stamp);
1310 atomic_inc(&r->semaphore);
1311 while ( atomic_read(&r->semaphore) > total_cpus )
1312 mb();
1316 rdtscll(c->local_tsc_stamp);
1317 c->stime_local_stamp = get_s_time();
1318 c->stime_master_stamp = r->master_stime;
1320 raise_softirq(TIME_CALIBRATE_SOFTIRQ);
1323 /* Ordinary rendezvous function which does not modify TSC values. */
1324 static void time_calibration_std_rendezvous(void *_r)
1326 struct cpu_calibration *c = &this_cpu(cpu_calibration);
1327 struct calibration_rendezvous *r = _r;
1328 unsigned int total_cpus = cpus_weight(r->cpu_calibration_map);
1330 if ( smp_processor_id() == 0 )
1332 while ( atomic_read(&r->semaphore) != (total_cpus - 1) )
1333 cpu_relax();
1334 r->master_stime = read_platform_stime();
1335 mb(); /* write r->master_stime /then/ signal */
1336 atomic_inc(&r->semaphore);
1338 else
1340 atomic_inc(&r->semaphore);
1341 while ( atomic_read(&r->semaphore) != total_cpus )
1342 cpu_relax();
1343 mb(); /* receive signal /then/ read r->master_stime */
1346 rdtscll(c->local_tsc_stamp);
1347 c->stime_local_stamp = get_s_time();
1348 c->stime_master_stamp = r->master_stime;
1350 raise_softirq(TIME_CALIBRATE_SOFTIRQ);
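Both rendezvous variants are built on the same pattern: a shared atomic counter acts as a barrier, the master does the one-off work (sampling the platform timer, and in the TSC variant writing the TSCs) between the phases, and everyone spins rather than sleeps because this runs in IRQ context. A user-space sketch of the simpler std variant using pthreads and C11 atomics (thread 0 plays the master; compile with -pthread):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    #define NCPUS 4

    static atomic_int semaphore;
    static _Atomic long long master_stime;

    static long long read_clock_ns(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ts.tv_sec * 1000000000LL + ts.tv_nsec;
    }

    static void *rendezvous(void *arg)
    {
        intptr_t cpu = (intptr_t)arg;

        if ( cpu == 0 )                              /* master */
        {
            while ( atomic_load(&semaphore) != NCPUS - 1 )
                ;                                    /* wait for all slaves */
            atomic_store(&master_stime, read_clock_ns());
            atomic_fetch_add(&semaphore, 1);         /* write stamp /then/ signal */
        }
        else                                         /* slave */
        {
            atomic_fetch_add(&semaphore, 1);
            while ( atomic_load(&semaphore) != NCPUS )
                ;                                    /* wait for master's stamp */
        }

        printf("cpu %ld saw master_stime=%lld\n", (long)cpu,
               (long long)atomic_load(&master_stime));
        return NULL;
    }

    int main(void)
    {
        pthread_t t[NCPUS];
        intptr_t i;

        for ( i = 0; i < NCPUS; i++ )
            pthread_create(&t[i], NULL, rendezvous, (void *)i);
        for ( i = 0; i < NCPUS; i++ )
            pthread_join(t[i], NULL);
        return 0;
    }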
1353 static void (*time_calibration_rendezvous_fn)(void *) =
1354 time_calibration_std_rendezvous;
1356 static void time_calibration(void *unused)
1358 struct calibration_rendezvous r = {
1359 .cpu_calibration_map = cpu_online_map,
1360 .semaphore = ATOMIC_INIT(0)
1361 };
1363 /* @wait=1 because we must wait for all cpus before freeing @r. */
1364 on_selected_cpus(&r.cpu_calibration_map,
1365 time_calibration_rendezvous_fn,
1366 &r, 1);
1369 void init_percpu_time(void)
1371 struct cpu_time *t = &this_cpu(cpu_time);
1372 unsigned long flags;
1373 s_time_t now;
1375 local_irq_save(flags);
1376 rdtscll(t->local_tsc_stamp);
1377 now = read_platform_stime();
1378 local_irq_restore(flags);
1380 t->stime_master_stamp = now;
1381 t->stime_local_stamp = now;
1383 if ( smp_processor_id() == 0 )
1385 init_timer(&calibration_timer, time_calibration, NULL, 0);
1386 set_timer(&calibration_timer, NOW() + EPOCH);
1390 /* Late init function (after all CPUs are booted). */
1391 int __init init_xen_time(void)
1393 if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1395 /*
1396 * Sadly, despite processor vendors' best design guidance efforts, on
1397 * some systems, cpus may come out of reset improperly synchronized.
1398 * So we must verify there is no warp and we can't do that until all
1399 * CPUs are booted.
1400 */
1401 tsc_check_reliability();
1402 if ( tsc_max_warp )
1403 setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
1406 /* If we have constant-rate TSCs then scale factor can be shared. */
1407 if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
1409 int cpu;
1410 for_each_possible_cpu ( cpu )
1411 per_cpu(cpu_time, cpu).tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
1412 /* If TSCs are not marked as 'reliable', re-sync during rendezvous. */
1413 if ( !boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1414 time_calibration_rendezvous_fn = time_calibration_tsc_rendezvous;
1417 open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration);
1419 /* System time (get_s_time()) starts ticking from now. */
1420 rdtscll(this_cpu(cpu_time).local_tsc_stamp);
1422 /* NB. get_cmos_time() can take over one second to execute. */
1423 do_settime(get_cmos_time(), 0, NOW());
1425 init_platform_timer();
1427 init_percpu_time();
1429 return 0;
1433 /* Early init function. */
1434 void __init early_time_init(void)
1436 u64 tmp = init_pit_and_calibrate_tsc();
1438 set_time_scale(&this_cpu(cpu_time).tsc_scale, tmp);
1440 do_div(tmp, 1000);
1441 cpu_khz = (unsigned long)tmp;
1442 printk("Detected %lu.%03lu MHz processor.\n",
1443 cpu_khz / 1000, cpu_khz % 1000);
1445 setup_irq(0, &irq0);
1448 /* Keep the PIT enabled so that PIT broadcast keeps working while cpuidle is enabled. */
1449 static int disable_pit_irq(void)
1451 if ( using_pit || !cpu_has_apic )
1452 return 0;
1454 /*
1455 * If we do not rely on PIT CH0 then we can use HPET for one-shot timer
1456 * emulation when entering deep C states.
1457 * XXX dom0 may rely on RTC interrupt delivery, so only enable
1458 * hpet_broadcast if FSB mode available or if force_hpet_broadcast.
1459 */
1460 if ( xen_cpuidle && !boot_cpu_has(X86_FEATURE_ARAT) )
1462 hpet_broadcast_init();
1463 if ( !hpet_broadcast_is_available() )
1465 if ( xen_cpuidle == -1 )
1467 xen_cpuidle = 0;
1468 printk("CPUIDLE: disabled due to no HPET. "
1469 "Force enable with 'cpuidle'.\n");
1471 else
1473 printk("HPET broadcast init failed, turn to PIT broadcast.\n");
1474 return 0;
1479 /* Disable PIT CH0 timer interrupt. */
1480 outb_p(0x30, PIT_MODE);
1481 outb_p(0, PIT_CH0);
1482 outb_p(0, PIT_CH0);
1484 return 0;
1486 __initcall(disable_pit_irq);
1488 void pit_broadcast_enter(void)
1490 cpu_set(smp_processor_id(), pit_broadcast_mask);
1493 void pit_broadcast_exit(void)
1495 int cpu = smp_processor_id();
1497 if ( cpu_test_and_clear(cpu, pit_broadcast_mask) )
1498 reprogram_timer(per_cpu(timer_deadline_start, cpu));
1501 int pit_broadcast_is_available(void)
1503 return xen_cpuidle;
1506 void send_timer_event(struct vcpu *v)
1508 send_guest_vcpu_virq(v, VIRQ_TIMER);
1511 /* Return secs after 00:00:00 localtime, 1 January, 1970. */
1512 unsigned long get_localtime(struct domain *d)
1514 return wc_sec + (wc_nsec + NOW()) / 1000000000ULL
1515 + d->time_offset_seconds;
1518 /* "cmos_utc_offset" is the difference between UTC time and CMOS time. */
1519 static long cmos_utc_offset; /* in seconds */
1521 int time_suspend(void)
1523 if ( smp_processor_id() == 0 )
1525 cmos_utc_offset = -get_cmos_time();
1526 cmos_utc_offset += (wc_sec + (wc_nsec + NOW()) / 1000000000ULL);
1527 kill_timer(&calibration_timer);
1529 /* Sync platform timer stamps. */
1530 platform_time_calibration();
1533 /* Better to cancel calibration timer for accuracy. */
1534 clear_bit(TIME_CALIBRATE_SOFTIRQ, &softirq_pending(smp_processor_id()));
1536 return 0;
1539 int time_resume(void)
1541 /*u64 tmp = */init_pit_and_calibrate_tsc();
1543 /* Disable this while calibrate_tsc_ap() is also skipped. */
1544 /*set_time_scale(&this_cpu(cpu_time).tsc_scale, tmp);*/
1546 resume_platform_timer();
1548 disable_pit_irq();
1550 init_percpu_time();
1552 do_settime(get_cmos_time() + cmos_utc_offset, 0, NOW());
1554 update_vcpu_system_time(current);
1556 update_domain_rtc();
1558 return 0;
1561 int dom0_pit_access(struct ioreq *ioreq)
1563 /* Is Xen using Channel 2? Then disallow direct dom0 access. */
1564 if ( using_pit )
1565 return 0;
1567 switch ( ioreq->addr )
1569 case PIT_CH2:
1570 if ( ioreq->dir == IOREQ_READ )
1571 ioreq->data = inb(PIT_CH2);
1572 else
1573 outb(ioreq->data, PIT_CH2);
1574 return 1;
1576 case PIT_MODE:
1577 if ( ioreq->dir == IOREQ_READ )
1578 return 0; /* urk! */
1579 switch ( ioreq->data & 0xc0 )
1581 case 0xc0: /* Read Back */
1582 if ( ioreq->data & 0x08 ) /* Select Channel 2? */
1583 outb(ioreq->data & 0xf8, PIT_MODE);
1584 if ( !(ioreq->data & 0x06) ) /* Select Channel 0/1? */
1585 return 1; /* no - we're done */
1586 /* Filter Channel 2 and reserved bit 0. */
1587 ioreq->data &= ~0x09;
1588 return 0; /* emulate ch0/1 readback */
1589 case 0x80: /* Select Counter 2 */
1590 outb(ioreq->data, PIT_MODE);
1591 return 1;
1594 case 0x61:
1595 if ( ioreq->dir == IOREQ_READ )
1596 ioreq->data = inb(0x61);
1597 else
1598 outb((inb(0x61) & ~3) | (ioreq->data & 3), 0x61);
1599 return 1;
1602 return 0;
1605 struct tm wallclock_time(void)
1607 uint64_t seconds;
1609 if ( !wc_sec )
1610 return (struct tm) { 0 };
1612 seconds = NOW() + (wc_sec * 1000000000ull) + wc_nsec;
1613 do_div(seconds, 1000000000);
1614 return gmtime(seconds);
1617 /*
1618 * PV SoftTSC Emulation.
1619 */
1621 void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs, int rdtscp)
1623 s_time_t now = get_s_time();
1624 struct domain *d = v->domain;
1625 u64 delta;
1627 spin_lock(&d->arch.vtsc_lock);
1629 if ( guest_kernel_mode(v, regs) )
1630 d->arch.vtsc_kerncount++;
1631 else
1632 d->arch.vtsc_usercount++;
1634 if ( (int64_t)(now - d->arch.vtsc_last) > 0 )
1635 d->arch.vtsc_last = now;
1636 else
1637 now = ++d->arch.vtsc_last;
1639 spin_unlock(&d->arch.vtsc_lock);
1641 delta = max_t(s64, now - d->arch.vtsc_offset, 0);
1642 now = scale_delta(delta, &d->arch.ns_to_vtsc);
1644 regs->eax = (uint32_t)now;
1645 regs->edx = (uint32_t)(now >> 32);
1647 if ( rdtscp )
1648 regs->ecx =
1649 (d->arch.tsc_mode == TSC_MODE_PVRDTSCP) ? d->arch.incarnation : 0;
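To make the arithmetic above concrete: for a guest with d->arch.tsc_khz = 2400000, d->arch.ns_to_vtsc is the reciprocal of the 2.4 GHz tick-to-ns scale, so a vCPU executing rdtsc exactly one second of system time after d->arch.vtsc_offset sees delta = 10^9 ns and now = scale_delta(10^9, &d->arch.ns_to_vtsc), roughly 2,400,000,000 ticks; since that still fits in 32 bits, EDX is 0 and EAX holds the whole value.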
1652 int host_tsc_is_safe(void)
1654 return boot_cpu_has(X86_FEATURE_TSC_RELIABLE) || (num_online_cpus() == 1);
1657 void cpuid_time_leaf(uint32_t sub_idx, uint32_t *eax, uint32_t *ebx,
1658 uint32_t *ecx, uint32_t *edx)
1660 struct domain *d = current->domain;
1661 uint64_t offset;
1663 switch ( sub_idx )
1665 case 0: /* features */
1666 *eax = ( ( (!!d->arch.vtsc) << 0 ) |
1667 ( (!!host_tsc_is_safe()) << 1 ) |
1668 ( (!!boot_cpu_has(X86_FEATURE_RDTSCP)) << 2 ) |
1669 0 );
1670 *ebx = d->arch.tsc_mode;
1671 *ecx = d->arch.tsc_khz;
1672 *edx = d->arch.incarnation;
1673 break;
1674 case 1: /* scale and offset */
1675 if ( !d->arch.vtsc )
1676 offset = d->arch.vtsc_offset;
1677 else
1678 /* offset already applied to value returned by virtual rdtscp */
1679 offset = 0;
1680 *eax = (uint32_t)offset;
1681 *ebx = (uint32_t)(offset >> 32);
1682 *ecx = d->arch.vtsc_to_ns.mul_frac;
1683 *edx = (s8)d->arch.vtsc_to_ns.shift;
1684 break;
1685 case 2: /* physical cpu_khz */
1686 *eax = cpu_khz;
1687 *ebx = *ecx = *edx = 0;
1688 break;
1689 default:
1690 *eax = *ebx = *ecx = *edx = 0;
1694 /*
1695 * Called to collect TSC-related data, only for a save file or live
1696 * migration; called after the last rdtsc is done in this incarnation.
1697 */
1698 void tsc_get_info(struct domain *d, uint32_t *tsc_mode,
1699 uint64_t *elapsed_nsec, uint32_t *gtsc_khz,
1700 uint32_t *incarnation)
1702 *incarnation = d->arch.incarnation;
1703 *tsc_mode = d->arch.tsc_mode;
1705 switch ( *tsc_mode )
1707 case TSC_MODE_NEVER_EMULATE:
1708 *elapsed_nsec = *gtsc_khz = 0;
1709 break;
1710 case TSC_MODE_ALWAYS_EMULATE:
1711 *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
1712 *gtsc_khz = d->arch.tsc_khz;
1713 break;
1714 case TSC_MODE_DEFAULT:
1715 if ( d->arch.vtsc )
1717 *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
1718 *gtsc_khz = d->arch.tsc_khz;
1720 else
1722 uint64_t tsc = 0;
1723 rdtscll(tsc);
1724 *elapsed_nsec = scale_delta(tsc,&d->arch.vtsc_to_ns);
1725 *gtsc_khz = cpu_khz;
1727 break;
1728 case TSC_MODE_PVRDTSCP:
1729 if ( d->arch.vtsc )
1731 *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
1732 *gtsc_khz = cpu_khz;
1734 else
1736 uint64_t tsc = 0;
1737 rdtscll(tsc);
1738 *elapsed_nsec = (scale_delta(tsc,&d->arch.vtsc_to_ns) -
1739 d->arch.vtsc_offset);
1740 *gtsc_khz = 0; /* ignored by tsc_set_info */
1742 break;
1745 if ( (int64_t)*elapsed_nsec < 0 )
1746 *elapsed_nsec = 0;
1749 /*
1750 * This may be called as many as three times for a domain, once when the
1751 * hypervisor creates the domain, once when the toolstack creates the
1752 * domain and, if restoring/migrating, once when saved/migrated values
1753 * are restored. Care must be taken that, if multiple calls occur,
1754 * only the last "sticks" and all are completed before the guest executes
1755 * an rdtsc instruction.
1756 */
1757 void tsc_set_info(struct domain *d,
1758 uint32_t tsc_mode, uint64_t elapsed_nsec,
1759 uint32_t gtsc_khz, uint32_t incarnation)
1761 if ( d->domain_id == 0 || d->domain_id == DOMID_INVALID )
1763 d->arch.vtsc = 0;
1764 return;
1766 switch ( d->arch.tsc_mode = tsc_mode )
1768 case TSC_MODE_NEVER_EMULATE:
1769 d->arch.vtsc = 0;
1770 break;
1771 case TSC_MODE_ALWAYS_EMULATE:
1772 d->arch.vtsc = 1;
1773 d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
1774 d->arch.tsc_khz = gtsc_khz ? gtsc_khz : cpu_khz;
1775 set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000 );
1776 d->arch.ns_to_vtsc = scale_reciprocal(d->arch.vtsc_to_ns);
1777 break;
1778 case TSC_MODE_DEFAULT:
1779 d->arch.vtsc = 1;
1780 d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
1781 d->arch.tsc_khz = gtsc_khz ? gtsc_khz : cpu_khz;
1782 set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000 );
1783 /* use native TSC if initial host has safe TSC and not migrated yet */
1784 if ( host_tsc_is_safe() && incarnation == 0 )
1785 d->arch.vtsc = 0;
1786 else
1787 d->arch.ns_to_vtsc = scale_reciprocal(d->arch.vtsc_to_ns);
1788 break;
1789 case TSC_MODE_PVRDTSCP:
1790 d->arch.vtsc = boot_cpu_has(X86_FEATURE_RDTSCP) &&
1791 host_tsc_is_safe() ? 0 : 1;
1792 d->arch.tsc_khz = cpu_khz;
1793 set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000 );
1794 d->arch.ns_to_vtsc = scale_reciprocal(d->arch.vtsc_to_ns);
1795 if ( d->arch.vtsc )
1796 d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
1797 else {
1798 /* when using native TSC, offset is nsec relative to power-on
1799 * of physical machine */
1800 uint64_t tsc = 0;
1801 rdtscll(tsc);
1802 d->arch.vtsc_offset = scale_delta(tsc,&d->arch.vtsc_to_ns) -
1803 elapsed_nsec;
1805 break;
1807 d->arch.incarnation = incarnation + 1;
1808 if ( is_hvm_domain(d) )
1809 hvm_set_rdtsc_exiting(d, d->arch.vtsc || hvm_gtsc_need_scale(d));
1812 /* vtsc may incur measurable performance degradation; diagnose it with this key handler. */
1813 static void dump_softtsc(unsigned char key)
1815 struct domain *d;
1816 int domcnt = 0;
1817 extern unsigned int max_cstate;
1819 tsc_check_reliability();
1820 if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1821 printk("TSC marked as reliable, "
1822 "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
1823 else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC ) )
1825 printk("TSC has constant rate, ");
1826 if (max_cstate <= 2 && tsc_max_warp == 0)
1827 printk("no deep Cstates, passed warp test, deemed reliable, ");
1828 else
1829 printk("deep Cstates possible, so not reliable, ");
1830 printk("warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
1831 } else
1832 printk("TSC not marked as either constant or reliable, "
1833 "warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
1834 for_each_domain ( d )
1836 if ( d->domain_id == 0 && d->arch.tsc_mode == TSC_MODE_DEFAULT )
1837 continue;
1838 printk("dom%u%s: mode=%d",d->domain_id,
1839 is_hvm_domain(d) ? "(hvm)" : "", d->arch.tsc_mode);
1840 if ( d->arch.vtsc_offset )
1841 printk(",ofs=0x%"PRIx64"",d->arch.vtsc_offset);
1842 if ( d->arch.tsc_khz )
1843 printk(",khz=%"PRIu32"",d->arch.tsc_khz);
1844 if ( d->arch.incarnation )
1845 printk(",inc=%"PRIu32"",d->arch.incarnation);
1846 if ( !(d->arch.vtsc_kerncount | d->arch.vtsc_usercount) )
1848 printk("\n");
1849 continue;
1851 if ( is_hvm_domain(d) )
1852 printk(",vtsc count: %"PRIu64" total\n",
1853 d->arch.vtsc_kerncount);
1854 else
1855 printk(",vtsc count: %"PRIu64" kernel, %"PRIu64" user\n",
1856 d->arch.vtsc_kerncount, d->arch.vtsc_usercount);
1857 domcnt++;
1860 if ( !domcnt )
1861 printk("No domains have emulated TSC\n");
1864 static struct keyhandler dump_softtsc_keyhandler = {
1865 .diagnostic = 1,
1866 .u.fn = dump_softtsc,
1867 .desc = "dump softtsc stats"
1868 };
1870 static int __init setup_dump_softtsc(void)
1872 register_keyhandler('s', &dump_softtsc_keyhandler);
1873 return 0;
1875 __initcall(setup_dump_softtsc);
1877 /*
1878 * Local variables:
1879 * mode: C
1880 * c-set-style: "BSD"
1881 * c-basic-offset: 4
1882 * tab-width: 4
1883 * indent-tabs-mode: nil
1884 * End:
1885 */