Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/arch/x86/time.c
Line
Count
Source
1
/******************************************************************************
2
 * arch/x86/time.c
3
 * 
4
 * Per-CPU time calibration and management.
5
 * 
6
 * Copyright (c) 2002-2005, K A Fraser
7
 * 
8
 * Portions from Linux are:
9
 * Copyright (c) 1991, 1992, 1995  Linus Torvalds
10
 */
11
12
#include <xen/errno.h>
13
#include <xen/event.h>
14
#include <xen/sched.h>
15
#include <xen/lib.h>
16
#include <xen/init.h>
17
#include <xen/time.h>
18
#include <xen/timer.h>
19
#include <xen/smp.h>
20
#include <xen/irq.h>
21
#include <xen/softirq.h>
22
#include <xen/efi.h>
23
#include <xen/cpuidle.h>
24
#include <xen/symbols.h>
25
#include <xen/keyhandler.h>
26
#include <xen/guest_access.h>
27
#include <asm/io.h>
28
#include <asm/msr.h>
29
#include <asm/mpspec.h>
30
#include <asm/processor.h>
31
#include <asm/fixmap.h>
32
#include <asm/mc146818rtc.h>
33
#include <asm/div64.h>
34
#include <asm/acpi.h>
35
#include <asm/hpet.h>
36
#include <io_ports.h>
37
#include <asm/setup.h> /* for early_time_init */
38
#include <public/arch-x86/cpuid.h>
39
40
/* opt_clocksource: Force clocksource to one of: pit, hpet, acpi, tsc. */
41
static char __initdata opt_clocksource[10];
42
string_param("clocksource", opt_clocksource);
43
44
unsigned long __read_mostly cpu_khz;  /* CPU clock frequency in kHz. */
45
DEFINE_SPINLOCK(rtc_lock);
46
unsigned long pit0_ticks;
47
48
struct cpu_time_stamp {
49
    u64 local_tsc;
50
    s_time_t local_stime;
51
    s_time_t master_stime;
52
};
53
54
struct cpu_time {
55
    struct cpu_time_stamp stamp;
56
    struct time_scale tsc_scale;
57
};
58
59
struct platform_timesource {
60
    char *id;
61
    char *name;
62
    u64 frequency;
63
    u64 (*read_counter)(void);
64
    s64 (*init)(struct platform_timesource *);
65
    void (*resume)(struct platform_timesource *);
66
    int counter_bits;
67
};
68
69
static DEFINE_PER_CPU(struct cpu_time, cpu_time);
70
71
/* Calibrate all CPUs to platform timer every EPOCH. */
72
179
#define EPOCH MILLISECS(1000)
73
static struct timer calibration_timer;
74
75
/*
76
 * We simulate a 32-bit platform timer from the 16-bit PIT ch2 counter.
77
 * Otherwise overflow happens too quickly (~50ms) for us to guarantee that
78
 * softirq handling will happen in time.
79
 * 
80
 * The pit_lock protects the 16- and 32-bit stamp fields as well as the hardware counter reads that feed them.
81
 */
82
static DEFINE_SPINLOCK(pit_lock);
83
static u16 pit_stamp16;
84
static u32 pit_stamp32;
85
static bool __read_mostly using_pit;
86
87
/* Boot timestamp, filled in head.S */
88
u64 __initdata boot_tsc_stamp;
89
90
/*
91
 * 32-bit division of integer dividend and integer divisor yielding
92
 * 32-bit fractional quotient.
93
 */
94
static inline u32 div_frac(u32 dividend, u32 divisor)
95
3
{
96
3
    u32 quotient, remainder;
97
3
    ASSERT(dividend < divisor);
98
3
    asm ( 
99
3
        "divl %4"
100
3
        : "=a" (quotient), "=d" (remainder)
101
3
        : "0" (0), "1" (dividend), "r" (divisor) );
102
3
    return quotient;
103
3
}
104
105
/*
106
 * 32-bit multiplication of multiplicand and fractional multiplier
107
 * yielding 32-bit product (radix point at same position as in multiplicand).
108
 */
109
static inline u32 mul_frac(u32 multiplicand, u32 multiplier)
110
0
{
111
0
    u32 product_int, product_frac;
112
0
    asm (
113
0
        "mul %3"
114
0
        : "=a" (product_frac), "=d" (product_int)
115
0
        : "0" (multiplicand), "r" (multiplier) );
116
0
    return product_int;
117
0
}
118
119
/*
120
 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
121
 * yielding a 64-bit result.
122
 */
123
u64 scale_delta(u64 delta, const struct time_scale *scale)
124
20.4M
{
125
20.4M
    u64 product;
126
20.4M
127
20.4M
    if ( scale->shift < 0 )
128
19.6M
        delta >>= -scale->shift;
129
20.4M
    else
130
784k
        delta <<= scale->shift;
131
20.4M
132
20.4M
    asm (
133
20.4M
        "mulq %2 ; shrd $32,%1,%0"
134
20.4M
        : "=a" (product), "=d" (delta)
135
20.4M
        : "rm" (delta), "0" ((u64)scale->mul_frac) );
136
20.4M
137
20.4M
    return product;
138
20.4M
}
139
140
2.12M
#define _TS_MUL_FRAC_IDENTITY 0x80000000UL
141
142
/* Compute the reciprocal of the given time_scale. */
143
static inline struct time_scale scale_reciprocal(struct time_scale scale)
144
2.12M
{
145
2.12M
    struct time_scale reciprocal;
146
2.12M
    u32 dividend;
147
2.12M
148
2.12M
    ASSERT(scale.mul_frac != 0);
149
2.12M
    dividend = _TS_MUL_FRAC_IDENTITY;
150
2.12M
    reciprocal.shift = 1 - scale.shift;
151
2.12M
    while ( unlikely(dividend >= scale.mul_frac) )
152
0
    {
153
0
        dividend >>= 1;
154
0
        reciprocal.shift++;
155
0
    }
156
2.12M
157
2.12M
    asm (
158
2.12M
        "divl %4"
159
2.12M
        : "=a" (reciprocal.mul_frac), "=d" (dividend)
160
2.12M
        : "0" (0), "1" (dividend), "r" (scale.mul_frac) );
161
2.12M
162
2.12M
    return reciprocal;
163
2.12M
}
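
scale_reciprocal() inverts a ticks-to-ns scale so the same scale_delta() can also convert ns back to ticks (used by stime2tsc() and ns_to_acpi_pm_tick() below). For the illustrative 2 GHz scale {shift 0, mul_frac 2^31}, the reciprocal works out to {shift 2, mul_frac 2^31}; the standalone sketch below (assumed values, not from this file) checks the round trip:

    #include <stdint.h>
    #include <stdio.h>

    /* Apply a {shift, mul_frac} scale exactly as scale_delta() does. */
    static uint64_t apply_scale(uint64_t delta, int shift, uint32_t mul_frac)
    {
        delta = (shift < 0) ? delta >> -shift : delta << shift;
        return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
    }

    int main(void)
    {
        /* Illustrative 2 GHz TSC scale and its reciprocal (shift = 1 - 0,
         * plus one halving step in scale_reciprocal(), gives shift 2). */
        uint64_t ns    = apply_scale(2000000000ull, 0, 0x80000000u); /* ticks->ns */
        uint64_t ticks = apply_scale(ns,            2, 0x80000000u); /* ns->ticks */

        printf("ns=%llu ticks=%llu\n",
               (unsigned long long)ns, (unsigned long long)ticks);  /* 1e9, 2e9 */
        return 0;
    }
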
164
165
/*
166
 * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
167
 * IPIs in place of local APIC timers
168
 */
169
static cpumask_t pit_broadcast_mask;
170
171
static void smp_send_timer_broadcast_ipi(void)
172
35
{
173
35
    int cpu = smp_processor_id();
174
35
    cpumask_t mask;
175
35
176
35
    cpumask_and(&mask, &cpu_online_map, &pit_broadcast_mask);
177
35
178
35
    if ( cpumask_test_cpu(cpu, &mask) )
179
0
    {
180
0
        __cpumask_clear_cpu(cpu, &mask);
181
0
        raise_softirq(TIMER_SOFTIRQ);
182
0
    }
183
35
184
35
    if ( !cpumask_empty(&mask) )
185
0
    {
186
0
        cpumask_raise_softirq(&mask, TIMER_SOFTIRQ);
187
0
    }
188
35
}
189
190
static void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs)
191
35
{
192
35
    ASSERT(local_irq_is_enabled());
193
35
194
35
    if ( hpet_legacy_irq_tick() )
195
0
        return;
196
35
197
35
    /* Only for start-of-day interrupt tests in io_apic.c. */
198
35
    pit0_ticks++;
199
35
200
35
    /* Rough hack to allow accurate timers to sort-of-work with no APIC. */
201
35
    if ( !cpu_has_apic )
202
0
        raise_softirq(TIMER_SOFTIRQ);
203
35
204
35
    if ( xen_cpuidle )
205
35
        smp_send_timer_broadcast_ipi();
206
35
207
35
    /* Emulate a 32-bit PIT counter. */
208
35
    if ( using_pit )
209
0
    {
210
0
        u16 count;
211
0
212
0
        spin_lock_irq(&pit_lock);
213
0
214
0
        outb(0x80, PIT_MODE);
215
0
        count  = inb(PIT_CH2);
216
0
        count |= inb(PIT_CH2) << 8;
217
0
218
0
        pit_stamp32 += (u16)(pit_stamp16 - count);
219
0
        pit_stamp16 = count;
220
0
221
0
        spin_unlock_irq(&pit_lock);
222
0
    }
223
35
}
224
225
static struct irqaction __read_mostly irq0 = {
226
    timer_interrupt, "timer", NULL
227
};
228
229
2
#define CLOCK_TICK_RATE 1193182 /* system crystal frequency (Hz) */
230
3
#define CALIBRATE_FRAC  20      /* calibrate over 50ms */
231
1
#define CALIBRATE_VALUE(freq) (((freq) + CALIBRATE_FRAC / 2) / CALIBRATE_FRAC)
232
233
static void preinit_pit(void)
234
1
{
235
1
    /* Set PIT channel 0 to HZ Hz. */
236
2
#define LATCH (((CLOCK_TICK_RATE)+(HZ/2))/HZ)
237
1
    outb_p(0x34, PIT_MODE);        /* binary, mode 2, LSB/MSB, ch 0 */
238
1
    outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
239
1
    outb(LATCH >> 8, PIT_CH0);     /* MSB */
240
1
#undef LATCH
241
1
}
242
243
void set_time_scale(struct time_scale *ts, u64 ticks_per_sec)
244
3
{
245
3
    u64 tps64 = ticks_per_sec;
246
3
    u32 tps32;
247
3
    int shift = 0;
248
3
249
3
    ASSERT(tps64 != 0);
250
3
251
4
    while ( tps64 > (MILLISECS(1000)*2) )
252
1
    {
253
1
        tps64 >>= 1;
254
1
        shift--;
255
1
    }
256
3
257
3
    tps32 = (u32)tps64;
258
19
    while ( tps32 <= (u32)MILLISECS(1000) )
259
16
    {
260
16
        tps32 <<= 1;
261
16
        shift++;
262
16
    }
263
3
264
3
    ts->mul_frac = div_frac(MILLISECS(1000), tps32);
265
3
    ts->shift    = shift;
266
3
}
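
set_time_scale() normalises ticks_per_sec into the range (1e9, 2e9], tracking the shift, and then stores mul_frac = 1e9 * 2^32 / tps32, so scale_delta() maps ticks to nanoseconds. A standalone sketch with plain 64-bit division in place of divl (the 2.4 GHz value is only an example):

    #include <stdint.h>
    #include <stdio.h>

    /* Same normalisation as set_time_scale() above. */
    static void set_time_scale_ref(int *shift, uint32_t *mul_frac,
                                   uint64_t tps64)
    {
        int s = 0;

        while ( tps64 > 2000000000ull )
        {
            tps64 >>= 1;
            s--;
        }

        uint32_t tps32 = (uint32_t)tps64;

        while ( tps32 <= 1000000000u )
        {
            tps32 <<= 1;
            s++;
        }

        *shift = s;
        *mul_frac = (uint32_t)(((uint64_t)1000000000u << 32) / tps32);
    }

    int main(void)
    {
        int shift;
        uint32_t mul_frac;

        set_time_scale_ref(&shift, &mul_frac, 2400000000ull);   /* 2.4 GHz TSC */
        printf("shift=%d mul_frac=%#x\n", shift, mul_frac);     /* -1, 0xd5555555 */
        return 0;
    }
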
267
268
static char *freq_string(u64 freq)
269
1
{
270
1
    static char s[20];
271
1
    unsigned int x, y;
272
1
    y = (unsigned int)do_div(freq, 1000000) / 1000;
273
1
    x = (unsigned int)freq;
274
1
    snprintf(s, sizeof(s), "%u.%03uMHz", x, y);
275
1
    return s;
276
1
}
277
278
/************************************************************
279
 * PLATFORM TIMER 1: PROGRAMMABLE INTERVAL TIMER (LEGACY PIT)
280
 */
281
282
static u64 read_pit_count(void)
283
0
{
284
0
    u16 count16;
285
0
    u32 count32;
286
0
    unsigned long flags;
287
0
288
0
    spin_lock_irqsave(&pit_lock, flags);
289
0
290
0
    outb(0x80, PIT_MODE);
291
0
    count16  = inb(PIT_CH2);
292
0
    count16 |= inb(PIT_CH2) << 8;
293
0
294
0
    count32 = pit_stamp32 + (u16)(pit_stamp16 - count16);
295
0
296
0
    spin_unlock_irqrestore(&pit_lock, flags);
297
0
298
0
    return count32;
299
0
}
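
read_pit_count() extends the 16-bit channel-2 down-counter to 32 bits: because the subtraction is done modulo 2^16, (u16)(pit_stamp16 - count16) is the number of ticks consumed since the last read even if the hardware counter wrapped in between. A small standalone illustration with made-up readings:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Made-up readings of a 16-bit down-counter that wrapped in between:
         * previous read 0x0005, current read 0xFFF0. */
        uint16_t prev = 0x0005, now = 0xFFF0;
        uint32_t stamp32 = 1000;

        /* Elapsed ticks modulo 2^16: 5 -> 0 -> 0xFFFF -> 0xFFF0 is 21 ticks. */
        uint32_t elapsed = (uint16_t)(prev - now);
        stamp32 += elapsed;

        printf("elapsed=%u stamp32=%u\n", elapsed, stamp32);   /* 21, 1021 */
        return 0;
    }
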
300
301
static s64 __init init_pit(struct platform_timesource *pts)
302
0
{
303
0
    u8 portb = inb(0x61);
304
0
    u64 start, end;
305
0
    unsigned long count;
306
0
307
0
    using_pit = true;
308
0
309
0
    /* Set the Gate high, disable speaker. */
310
0
    outb((portb & ~0x02) | 0x01, 0x61);
311
0
312
0
    /*
313
0
     * Now let's take care of CTC channel 2: mode 0, (interrupt on
314
0
     * terminal count mode), binary count, load CALIBRATE_LATCH count,
315
0
     * (LSB and MSB) to begin countdown.
316
0
     */
317
0
#define CALIBRATE_LATCH CALIBRATE_VALUE(CLOCK_TICK_RATE)
318
0
    outb(0xb0, PIT_MODE);                  /* binary, mode 0, LSB/MSB, Ch 2 */
319
0
    outb(CALIBRATE_LATCH & 0xff, PIT_CH2); /* LSB of count */
320
0
    outb(CALIBRATE_LATCH >> 8, PIT_CH2);   /* MSB of count */
321
0
#undef CALIBRATE_LATCH
322
0
323
0
    start = rdtsc_ordered();
324
0
    for ( count = 0; !(inb(0x61) & 0x20); ++count )
325
0
        continue;
326
0
    end = rdtsc_ordered();
327
0
328
0
    /* Set the Gate low, disable speaker. */
329
0
    outb(portb & ~0x03, 0x61);
330
0
331
0
    /* Error if the CTC doesn't behave itself. */
332
0
    if ( count == 0 )
333
0
        return 0;
334
0
335
0
    return (end - start) * CALIBRATE_FRAC;
336
0
}
337
338
static void resume_pit(struct platform_timesource *pts)
339
0
{
340
0
    /* Set CTC channel 2 to mode 0 again; initial value does not matter. */
341
0
    outb(0xb0, PIT_MODE); /* binary, mode 0, LSB/MSB, Ch 2 */
342
0
    outb(0, PIT_CH2);     /* LSB of count */
343
0
    outb(0, PIT_CH2);     /* MSB of count */
344
0
}
345
346
static struct platform_timesource __initdata plt_pit =
347
{
348
    .id = "pit",
349
    .name = "PIT",
350
    .frequency = CLOCK_TICK_RATE,
351
    .read_counter = read_pit_count,
352
    .counter_bits = 32,
353
    .init = init_pit,
354
    .resume = resume_pit,
355
};
356
357
/************************************************************
358
 * PLATFORM TIMER 2: HIGH PRECISION EVENT TIMER (HPET)
359
 */
360
361
static u64 read_hpet_count(void)
362
200
{
363
200
    return hpet_read32(HPET_COUNTER);
364
200
}
365
366
static s64 __init init_hpet(struct platform_timesource *pts)
367
1
{
368
1
    u64 hpet_rate = hpet_setup(), start;
369
1
    u32 count, target;
370
1
371
1
    if ( hpet_rate == 0 )
372
0
        return 0;
373
1
374
1
    pts->frequency = hpet_rate;
375
1
376
1
    count = hpet_read32(HPET_COUNTER);
377
1
    start = rdtsc_ordered();
378
1
    target = count + CALIBRATE_VALUE(hpet_rate);
379
1
    if ( target < count )
380
0
        while ( hpet_read32(HPET_COUNTER) >= count )
381
0
            continue;
382
86.0k
    while ( hpet_read32(HPET_COUNTER) < target )
383
86.0k
        continue;
384
1
385
1
    return (rdtsc_ordered() - start) * CALIBRATE_FRAC;
386
1
}
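
init_hpet() measures the TSC rate by busy-waiting for CALIBRATE_VALUE(hpet_rate) HPET ticks, i.e. roughly 1/CALIBRATE_FRAC of a second, and multiplying the observed TSC delta back up by CALIBRATE_FRAC. A worked example with invented numbers (a 14.318 MHz HPET and a TSC that advances 120,000,000 cycles during the window):

    #include <stdint.h>
    #include <stdio.h>

    #define CALIBRATE_FRAC 20   /* calibrate over 1/20 s = 50 ms, as above */

    int main(void)
    {
        uint64_t hpet_rate = 14318180;   /* hypothetical HPET frequency, Hz */
        uint64_t tsc_delta = 120000000;  /* hypothetical TSC cycles observed */

        /* CALIBRATE_VALUE(): the number of HPET ticks in ~50 ms. */
        uint64_t ticks_waited = (hpet_rate + CALIBRATE_FRAC / 2) / CALIBRATE_FRAC;

        /* The wait spanned ~1/20 s, so TSC Hz ~= tsc_delta * CALIBRATE_FRAC. */
        printf("waited %llu HPET ticks (~50 ms), TSC ~= %llu Hz\n",
               (unsigned long long)ticks_waited,
               (unsigned long long)(tsc_delta * CALIBRATE_FRAC));
        return 0;
    }
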
387
388
static void resume_hpet(struct platform_timesource *pts)
389
0
{
390
0
    hpet_resume(NULL);
391
0
}
392
393
static struct platform_timesource __initdata plt_hpet =
394
{
395
    .id = "hpet",
396
    .name = "HPET",
397
    .read_counter = read_hpet_count,
398
    .counter_bits = 32,
399
    .init = init_hpet,
400
    .resume = resume_hpet
401
};
402
403
/************************************************************
404
 * PLATFORM TIMER 3: ACPI PM TIMER
405
 */
406
407
u32 __read_mostly pmtmr_ioport;
408
unsigned int __initdata pmtmr_width;
409
410
/* ACPI PM timer ticks at 3.579545 MHz. */
411
1
#define ACPI_PM_FREQUENCY 3579545
412
413
static u64 read_pmtimer_count(void)
414
0
{
415
0
    return inl(pmtmr_ioport);
416
0
}
417
418
static s64 __init init_pmtimer(struct platform_timesource *pts)
419
0
{
420
0
    u64 start;
421
0
    u32 count, target, mask = 0xffffff;
422
0
423
0
    if ( !pmtmr_ioport || !pmtmr_width )
424
0
        return 0;
425
0
426
0
    if ( pmtmr_width == 32 )
427
0
    {
428
0
        pts->counter_bits = 32;
429
0
        mask = 0xffffffff;
430
0
    }
431
0
432
0
    count = inl(pmtmr_ioport) & mask;
433
0
    start = rdtsc_ordered();
434
0
    target = count + CALIBRATE_VALUE(ACPI_PM_FREQUENCY);
435
0
    if ( target < count )
436
0
        while ( (inl(pmtmr_ioport) & mask) >= count )
437
0
            continue;
438
0
    while ( (inl(pmtmr_ioport) & mask) < target )
439
0
        continue;
440
0
441
0
    return (rdtsc_ordered() - start) * CALIBRATE_FRAC;
442
0
}
443
444
static struct platform_timesource __initdata plt_pmtimer =
445
{
446
    .id = "acpi",
447
    .name = "ACPI PM Timer",
448
    .frequency = ACPI_PM_FREQUENCY,
449
    .read_counter = read_pmtimer_count,
450
    .counter_bits = 24,
451
    .init = init_pmtimer
452
};
453
454
static struct time_scale __read_mostly pmt_scale;
455
static struct time_scale __read_mostly pmt_scale_r;
456
457
static __init int init_pmtmr_scale(void)
458
1
{
459
1
    set_time_scale(&pmt_scale, ACPI_PM_FREQUENCY);
460
1
    pmt_scale_r = scale_reciprocal(pmt_scale);
461
1
    return 0;
462
1
}
463
__initcall(init_pmtmr_scale);
464
465
uint64_t acpi_pm_tick_to_ns(uint64_t ticks)
466
0
{
467
0
    return scale_delta(ticks, &pmt_scale);
468
0
}
469
470
uint64_t ns_to_acpi_pm_tick(uint64_t ns)
471
0
{
472
0
    return scale_delta(ns, &pmt_scale_r);
473
0
}
474
475
/************************************************************
476
 * PLATFORM TIMER 4: TSC
477
 */
478
static unsigned int __initdata tsc_flags;
479
480
/* TSC is reliable across sockets */
481
0
#define TSC_RELIABLE_SOCKET (1 << 0)
482
483
/*
484
 * Called in verify_tsc_reliability() under reliable TSC conditions
485
 * thus reusing all the checks already performed there.
486
 */
487
static s64 __init init_tsc(struct platform_timesource *pts)
488
0
{
489
0
    u64 ret = pts->frequency;
490
0
491
0
    if ( nr_cpu_ids != num_present_cpus() )
492
0
    {
493
0
        printk(XENLOG_WARNING "TSC: CPU Hotplug intended\n");
494
0
        ret = 0;
495
0
    }
496
0
497
0
    if ( nr_sockets > 1 && !(tsc_flags & TSC_RELIABLE_SOCKET) )
498
0
    {
499
0
        printk(XENLOG_WARNING "TSC: Not invariant across sockets\n");
500
0
        ret = 0;
501
0
    }
502
0
503
0
    if ( !ret )
504
0
        printk(XENLOG_DEBUG "TSC: Not setting it as clocksource\n");
505
0
506
0
    return ret;
507
0
}
508
509
static u64 read_tsc(void)
510
0
{
511
0
    return rdtsc_ordered();
512
0
}
513
514
static struct platform_timesource __initdata plt_tsc =
515
{
516
    .id = "tsc",
517
    .name = "TSC",
518
    .read_counter = read_tsc,
519
    /*
520
     * Calculations for platform timer overflow assume u64 boundary.
521
     * Hence we set to less than 64, such that the TSC wraparound is
522
     * correctly checked and handled.
523
     */
524
    .counter_bits = 63,
525
    .init = init_tsc,
526
};
527
528
/************************************************************
529
 * GENERIC PLATFORM TIMER INFRASTRUCTURE
530
 */
531
532
/* details of chosen timesource */
533
static struct platform_timesource __read_mostly plt_src;
534
/* hardware-width mask */
535
static u64 __read_mostly plt_mask;
536
 /* ns between calls to plt_overflow() */
537
static u64 __read_mostly plt_overflow_period;
538
/* scale: platform counter -> nanosecs */
539
static struct time_scale __read_mostly plt_scale;
540
541
/* Protected by platform_timer_lock. */
542
static DEFINE_SPINLOCK(platform_timer_lock);
543
static s_time_t stime_platform_stamp; /* System time at below platform time */
544
static u64 platform_timer_stamp;      /* Platform time at above system time */
545
static u64 plt_stamp64;          /* 64-bit platform counter stamp           */
546
static u64 plt_stamp;            /* hardware-width platform counter stamp   */
547
static struct timer plt_overflow_timer;
548
549
static s_time_t __read_platform_stime(u64 platform_time)
550
201
{
551
201
    u64 diff = platform_time - platform_timer_stamp;
552
201
    ASSERT(spin_is_locked(&platform_timer_lock));
553
201
    return (stime_platform_stamp + scale_delta(diff, &plt_scale));
554
201
}
555
556
static void plt_overflow(void *unused)
557
1
{
558
1
    int i;
559
1
    u64 count;
560
1
    s_time_t now, plt_now, plt_wrap;
561
1
562
1
    spin_lock_irq(&platform_timer_lock);
563
1
564
1
    count = plt_src.read_counter();
565
1
    plt_stamp64 += (count - plt_stamp) & plt_mask;
566
1
    plt_stamp = count;
567
1
568
1
    now = NOW();
569
1
    plt_wrap = __read_platform_stime(plt_stamp64);
570
1
    for ( i = 0; i < 10; i++ )
571
1
    {
572
1
        plt_now = plt_wrap;
573
1
        plt_wrap = __read_platform_stime(plt_stamp64 + plt_mask + 1);
574
1
        if ( ABS(plt_wrap - now) > ABS(plt_now - now) )
575
1
            break;
576
0
        plt_stamp64 += plt_mask + 1;
577
0
    }
578
1
    if ( i != 0 )
579
0
    {
580
0
        static bool warned_once;
581
0
582
0
        if ( !test_and_set_bool(warned_once) )
583
0
            printk("Platform timer appears to have unexpectedly wrapped "
584
0
                   "%u%s times.\n", i, (i == 10) ? " or more" : "");
585
0
    }
586
1
587
1
    spin_unlock_irq(&platform_timer_lock);
588
1
589
1
    set_timer(&plt_overflow_timer, NOW() + plt_overflow_period);
590
1
}
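
plt_overflow() keeps the software 64-bit stamp plt_stamp64 ahead of hardware wraps, and try_platform_timer() re-arms it every plt_overflow_period, which is half the counter range converted to nanoseconds. For a 32-bit counter at an HPET-like 14.318 MHz that comes to roughly 150 seconds, as this hypothetical calculation shows:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical 32-bit counter at 14.318 MHz: plt_overflow() must run
         * well before half the counter range elapses. */
        uint64_t freq = 14318180, bits = 32;
        uint64_t half_range_ticks = 1ull << (bits - 1);
        uint64_t period_ns = half_range_ticks * 1000000000ull / freq;

        /* ~150 s; integer division prints 149. */
        printf("overflow period ~= %llu s\n",
               (unsigned long long)(period_ns / 1000000000ull));
        return 0;
    }
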
591
592
static s_time_t read_platform_stime(u64 *stamp)
593
111
{
594
111
    u64 plt_counter, count;
595
111
    s_time_t stime;
596
111
597
111
    ASSERT(!local_irq_is_enabled());
598
111
599
111
    spin_lock(&platform_timer_lock);
600
111
    plt_counter = plt_src.read_counter();
601
111
    count = plt_stamp64 + ((plt_counter - plt_stamp) & plt_mask);
602
111
    stime = __read_platform_stime(count);
603
111
    spin_unlock(&platform_timer_lock);
604
111
605
111
    if ( unlikely(stamp) )
606
0
        *stamp = plt_counter;
607
111
608
111
    return stime;
609
111
}
610
611
static void platform_time_calibration(void)
612
88
{
613
88
    u64 count;
614
88
    s_time_t stamp;
615
88
    unsigned long flags;
616
88
617
88
    spin_lock_irqsave(&platform_timer_lock, flags);
618
88
    count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask);
619
88
    stamp = __read_platform_stime(count);
620
88
    stime_platform_stamp = stamp;
621
88
    platform_timer_stamp = count;
622
88
    spin_unlock_irqrestore(&platform_timer_lock, flags);
623
88
}
624
625
static void resume_platform_timer(void)
626
0
{
627
0
    /* Timer source can be reset when coming back from S3 to S0 */
628
0
    if ( plt_src.resume )
629
0
        plt_src.resume(&plt_src);
630
0
631
0
    plt_stamp64 = platform_timer_stamp;
632
0
    plt_stamp = plt_src.read_counter();
633
0
}
634
635
static void __init reset_platform_timer(void)
636
0
{
637
0
    /* Deactivate any timers running */
638
0
    kill_timer(&plt_overflow_timer);
639
0
    kill_timer(&calibration_timer);
640
0
641
0
    /* Reset counters and stamps */
642
0
    spin_lock_irq(&platform_timer_lock);
643
0
    plt_stamp = 0;
644
0
    plt_stamp64 = 0;
645
0
    platform_timer_stamp = 0;
646
0
    stime_platform_stamp = 0;
647
0
    spin_unlock_irq(&platform_timer_lock);
648
0
}
649
650
static s64 __init try_platform_timer(struct platform_timesource *pts)
651
1
{
652
1
    s64 rc = pts->init(pts);
653
1
654
1
    if ( rc <= 0 )
655
0
        return rc;
656
1
657
1
    /* We have a platform timesource already so reset it */
658
1
    if ( plt_src.counter_bits != 0 )
659
0
        reset_platform_timer();
660
1
661
1
    plt_mask = (u64)~0ull >> (64 - pts->counter_bits);
662
1
663
1
    set_time_scale(&plt_scale, pts->frequency);
664
1
665
1
    plt_overflow_period = scale_delta(
666
1
        1ull << (pts->counter_bits - 1), &plt_scale);
667
1
    plt_src = *pts;
668
1
669
1
    return rc;
670
1
}
671
672
static u64 __init init_platform_timer(void)
673
1
{
674
1
    static struct platform_timesource * __initdata plt_timers[] = {
675
1
        &plt_hpet, &plt_pmtimer, &plt_pit
676
1
    };
677
1
678
1
    struct platform_timesource *pts = NULL;
679
1
    unsigned int i;
680
1
    s64 rc = -1;
681
1
682
1
    /* clocksource=tsc is initialized via __initcalls (when CPUs are up). */
683
1
    if ( (opt_clocksource[0] != '\0') && strcmp(opt_clocksource, "tsc") )
684
0
    {
685
0
        for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ )
686
0
        {
687
0
            pts = plt_timers[i];
688
0
            if ( !strcmp(opt_clocksource, pts->id) )
689
0
            {
690
0
                rc = try_platform_timer(pts);
691
0
                break;
692
0
            }
693
0
        }
694
0
695
0
        if ( rc <= 0 )
696
0
            printk("WARNING: %s clocksource '%s'.\n",
697
0
                   (rc == 0) ? "Could not initialise" : "Unrecognised",
698
0
                   opt_clocksource);
699
0
    }
700
1
701
1
    if ( rc <= 0 )
702
1
    {
703
1
        for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ )
704
1
        {
705
1
            pts = plt_timers[i];
706
1
            if ( (rc = try_platform_timer(pts)) > 0 )
707
1
                break;
708
1
        }
709
1
    }
710
1
711
1
    BUG_ON(rc <= 0);
712
1
713
1
    printk("Platform timer is %s %s\n",
714
1
           freq_string(pts->frequency), pts->name);
715
1
716
1
    return rc;
717
1
}
718
719
u64 stime2tsc(s_time_t stime)
720
2.09M
{
721
2.09M
    struct cpu_time *t;
722
2.09M
    struct time_scale sys_to_tsc;
723
2.09M
    s_time_t stime_delta;
724
2.09M
725
2.09M
    t = &this_cpu(cpu_time);
726
2.09M
    sys_to_tsc = scale_reciprocal(t->tsc_scale);
727
2.09M
728
2.09M
    stime_delta = stime - t->stamp.local_stime;
729
2.09M
    if ( stime_delta < 0 )
730
0
        stime_delta = 0;
731
2.09M
732
2.09M
    return t->stamp.local_tsc + scale_delta(stime_delta, &sys_to_tsc);
733
2.09M
}
734
735
void cstate_restore_tsc(void)
736
1.40M
{
737
1.40M
    if ( boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
738
1.43M
        return;
739
1.40M
740
18.4E
    write_tsc(stime2tsc(read_platform_stime(NULL)));
741
18.4E
}
742
743
/***************************************************************************
744
 * CMOS Timer functions
745
 ***************************************************************************/
746
747
/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
748
 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
749
 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
750
 *
751
 * [For the Julian calendar (which was used in Russia before 1917,
752
 * Britain & colonies before 1752, anywhere else before 1582,
753
 * and is still in use by some communities) leave out the
754
 * -year/100+year/400 terms, and add 10.]
755
 *
756
 * This algorithm was first published by Gauss (I think).
757
 *
758
 * WARNING: this function will overflow on 2106-02-07 06:28:16 on
759
 * machines where long is 32-bit! (However, as time_t is signed, we
760
 * will already get problems at other places on 2038-01-19 03:14:08)
761
 */
762
unsigned long
763
mktime (unsigned int year, unsigned int mon,
764
        unsigned int day, unsigned int hour,
765
        unsigned int min, unsigned int sec)
766
1
{
767
1
    /* 1..12 -> 11,12,1..10: put Feb last since it has a leap day. */
768
1
    if ( 0 >= (int) (mon -= 2) )
769
0
    {
770
0
        mon += 12;
771
0
        year -= 1;
772
0
    }
773
1
774
1
    return ((((unsigned long)(year/4 - year/100 + year/400 + 367*mon/12 + day)+
775
1
              year*365 - 719499
776
1
        )*24 + hour /* now have hours */
777
1
        )*60 + min  /* now have minutes */
778
1
        )*60 + sec; /* finally seconds */
779
1
}
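
The Gauss-style formula above counts days from a March-based year (February moved to the end, so the leap day falls last) and subtracts the fixed offset 719499 to land on the Unix epoch. A standalone copy of the same expression, checked against two dates whose epoch values are well known:

    #include <stdio.h>

    /* Same arithmetic as mktime() above, kept here as a standalone check. */
    static unsigned long mktime_ref(unsigned int year, unsigned int mon,
                                    unsigned int day, unsigned int hour,
                                    unsigned int min, unsigned int sec)
    {
        if ( 0 >= (int)(mon -= 2) )
        {
            mon += 12;
            year -= 1;
        }

        return ((((unsigned long)(year/4 - year/100 + year/400 + 367*mon/12 + day)
                  + year*365 - 719499) * 24 + hour) * 60 + min) * 60 + sec;
    }

    int main(void)
    {
        /* 1970-01-02 00:00:00 is one day (86400 s) after the epoch. */
        printf("%lu\n", mktime_ref(1970, 1, 2, 0, 0, 0));   /* 86400 */
        /* 2000-01-01 00:00:00 UTC is the well-known 946684800. */
        printf("%lu\n", mktime_ref(2000, 1, 1, 0, 0, 0));   /* 946684800 */
        return 0;
    }
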
780
781
struct rtc_time {
782
    unsigned int year, mon, day, hour, min, sec;
783
};
784
785
static void __get_cmos_time(struct rtc_time *rtc)
786
1
{
787
1
    rtc->sec  = CMOS_READ(RTC_SECONDS);
788
1
    rtc->min  = CMOS_READ(RTC_MINUTES);
789
1
    rtc->hour = CMOS_READ(RTC_HOURS);
790
1
    rtc->day  = CMOS_READ(RTC_DAY_OF_MONTH);
791
1
    rtc->mon  = CMOS_READ(RTC_MONTH);
792
1
    rtc->year = CMOS_READ(RTC_YEAR);
793
1
    
794
1
    if ( RTC_ALWAYS_BCD || !(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) )
795
1
    {
796
1
        BCD_TO_BIN(rtc->sec);
797
1
        BCD_TO_BIN(rtc->min);
798
1
        BCD_TO_BIN(rtc->hour);
799
1
        BCD_TO_BIN(rtc->day);
800
1
        BCD_TO_BIN(rtc->mon);
801
1
        BCD_TO_BIN(rtc->year);
802
1
    }
803
1
804
1
    if ( (rtc->year += 1900) < 1970 )
805
1
        rtc->year += 100;
806
1
}
807
808
static unsigned long get_cmos_time(void)
809
1
{
810
1
    unsigned long res, flags;
811
1
    struct rtc_time rtc;
812
1
    unsigned int seconds = 60;
813
1
    static bool __read_mostly cmos_rtc_probe;
814
1
    boolean_param("cmos-rtc-probe", cmos_rtc_probe);
815
1
816
1
    if ( efi_enabled(EFI_RS) )
817
0
    {
818
0
        res = efi_get_time();
819
0
        if ( res )
820
0
            return res;
821
0
    }
822
1
823
1
    if ( likely(!(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC)) )
824
1
        cmos_rtc_probe = false;
825
0
    else if ( system_state < SYS_STATE_smp_boot && !cmos_rtc_probe )
826
0
        panic("System with no CMOS RTC advertised must be booted from EFI"
827
0
              " (or with command line option \"cmos-rtc-probe\")");
828
1
829
1
    for ( ; ; )
830
1
    {
831
1
        s_time_t start, t1, t2;
832
1
833
1
        spin_lock_irqsave(&rtc_lock, flags);
834
1
835
1
        /* read RTC exactly on falling edge of update flag */
836
1
        start = NOW();
837
152k
        do { /* may take up to 1 second... */
838
152k
            t1 = NOW() - start;
839
152k
        } while ( !(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) &&
840
152k
                  t1 <= SECONDS(1) );
841
1
842
1
        start = NOW();
843
577
        do { /* must try at least 2.228 ms */
844
577
            t2 = NOW() - start;
845
577
        } while ( (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) &&
846
576
                  t2 < MILLISECS(3) );
847
1
848
1
        __get_cmos_time(&rtc);
849
1
850
1
        spin_unlock_irqrestore(&rtc_lock, flags);
851
1
852
1
        if ( likely(!cmos_rtc_probe) ||
853
0
             t1 > SECONDS(1) || t2 >= MILLISECS(3) ||
854
0
             rtc.sec >= 60 || rtc.min >= 60 || rtc.hour >= 24 ||
855
0
             !rtc.day || rtc.day > 31 ||
856
0
             !rtc.mon || rtc.mon > 12 )
857
1
            break;
858
1
859
0
        if ( seconds < 60 )
860
0
        {
861
0
            if ( rtc.sec != seconds )
862
0
                cmos_rtc_probe = false;
863
0
            break;
864
0
        }
865
0
866
0
        process_pending_softirqs();
867
0
868
0
        seconds = rtc.sec;
869
0
    }
870
1
871
1
    if ( unlikely(cmos_rtc_probe) )
872
0
        panic("No CMOS RTC found - system must be booted from EFI");
873
1
874
1
    return mktime(rtc.year, rtc.mon, rtc.day, rtc.hour, rtc.min, rtc.sec);
875
1
}
876
877
/***************************************************************************
878
 * System Time
879
 ***************************************************************************/
880
881
s_time_t get_s_time_fixed(u64 at_tsc)
882
20.0M
{
883
20.0M
    const struct cpu_time *t = &this_cpu(cpu_time);
884
20.0M
    u64 tsc, delta;
885
20.0M
    s_time_t now;
886
20.0M
887
20.0M
    if ( at_tsc )
888
416
        tsc = at_tsc;
889
20.0M
    else
890
20.0M
        tsc = rdtsc_ordered();
891
20.0M
    delta = tsc - t->stamp.local_tsc;
892
20.0M
    now = t->stamp.local_stime + scale_delta(delta, &t->tsc_scale);
893
20.0M
894
20.0M
    return now;
895
20.0M
}
896
897
s_time_t get_s_time()
898
19.9M
{
899
19.9M
    return get_s_time_fixed(0);
900
19.9M
}
901
902
uint64_t tsc_ticks2ns(uint64_t ticks)
903
0
{
904
0
    struct cpu_time *t = &this_cpu(cpu_time);
905
0
906
0
    return scale_delta(ticks, &t->tsc_scale);
907
0
}
908
909
static void __update_vcpu_system_time(struct vcpu *v, int force)
910
165k
{
911
165k
    const struct cpu_time *t;
912
165k
    struct vcpu_time_info *u, _u = {};
913
165k
    struct domain *d = v->domain;
914
165k
    s_time_t tsc_stamp;
915
165k
916
165k
    if ( v->vcpu_info == NULL )
917
66.5k
        return;
918
165k
919
98.9k
    t = &this_cpu(cpu_time);
920
98.9k
    u = &vcpu_info(v, time);
921
98.9k
922
98.9k
    if ( d->arch.vtsc )
923
0
    {
924
0
        s_time_t stime = t->stamp.local_stime;
925
0
926
0
        if ( is_hvm_domain(d) )
927
0
        {
928
0
            struct pl_time *pl = v->domain->arch.hvm_domain.pl_time;
929
0
930
0
            stime += pl->stime_offset + v->arch.hvm_vcpu.stime_offset;
931
0
            if ( stime >= 0 )
932
0
                tsc_stamp = gtime_to_gtsc(d, stime);
933
0
            else
934
0
                tsc_stamp = -gtime_to_gtsc(d, -stime);
935
0
        }
936
0
        else
937
0
            tsc_stamp = gtime_to_gtsc(d, stime);
938
0
939
0
        _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
940
0
        _u.tsc_shift         = d->arch.vtsc_to_ns.shift;
941
0
    }
942
98.9k
    else
943
98.9k
    {
944
99.4k
        if ( is_hvm_domain(d) && hvm_tsc_scaling_supported )
945
0
        {
946
0
            tsc_stamp            = hvm_scale_tsc(d, t->stamp.local_tsc);
947
0
            _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
948
0
            _u.tsc_shift         = d->arch.vtsc_to_ns.shift;
949
0
        }
950
98.9k
        else
951
98.9k
        {
952
98.9k
            tsc_stamp            = t->stamp.local_tsc;
953
98.9k
            _u.tsc_to_system_mul = t->tsc_scale.mul_frac;
954
98.9k
            _u.tsc_shift         = t->tsc_scale.shift;
955
98.9k
        }
956
98.9k
    }
957
98.9k
958
98.9k
    _u.tsc_timestamp = tsc_stamp;
959
98.9k
    _u.system_time   = t->stamp.local_stime;
960
98.9k
961
98.9k
    /*
962
98.9k
     * It's expected that domains cope with this bit changing on every
963
98.9k
     * pvclock read to check whether they can rely solely on this tuple
964
98.9k
     * or if it further requires monotonicity checks with other vcpus.
965
98.9k
     */
966
98.9k
    if ( clocksource_is_tsc() )
967
0
        _u.flags |= XEN_PVCLOCK_TSC_STABLE_BIT;
968
98.9k
969
98.9k
    if ( is_hvm_domain(d) )
970
99.4k
        _u.tsc_timestamp += v->arch.hvm_vcpu.cache_tsc_offset;
971
98.9k
972
98.9k
    /* Don't bother unless timestamp record has changed or we are forced. */
973
98.9k
    _u.version = u->version; /* make versions match for memcmp test */
974
100k
    if ( !force && !memcmp(u, &_u, sizeof(_u)) )
975
99.3k
        return;
976
98.9k
977
98.9k
    /* 1. Update guest kernel version. */
978
18.4E
    _u.version = u->version = version_update_begin(u->version);
979
18.4E
    wmb();
980
18.4E
    /* 2. Update all other guest kernel fields. */
981
18.4E
    *u = _u;
982
18.4E
    wmb();
983
18.4E
    /* 3. Update guest kernel version. */
984
18.4E
    u->version = version_update_end(u->version);
985
18.4E
986
18.4E
    if ( !update_secondary_system_time(v, &_u) && is_pv_domain(d) &&
987
0
         !is_pv_32bit_domain(d) && !(v->arch.flags & TF_kernel_mode) )
988
0
        v->arch.pv_vcpu.pending_system_time = _u;
989
18.4E
}
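
The version field plus the two wmb() calls form a seqcount-style protocol: the record is odd while it is being rewritten and even once stable. The guest-side reader is not part of this file; the sketch below shows the conventional retry loop a consumer would use, with a reduced, hypothetical struct layout:

    #include <stdint.h>
    #include <string.h>

    struct vcpu_time_info_sketch {            /* reduced, hypothetical layout */
        uint32_t version;
        uint64_t tsc_timestamp;
        uint64_t system_time;
        uint32_t tsc_to_system_mul;
        int8_t   tsc_shift;
    };

    /* Guest-side read of the shared record: retry while an update is in
     * progress (odd version) or the version changed while we were copying. */
    static struct vcpu_time_info_sketch
    read_time_info(const struct vcpu_time_info_sketch *shared)
    {
        struct vcpu_time_info_sketch snap;
        uint32_t v;

        do {
            v = __atomic_load_n(&shared->version, __ATOMIC_ACQUIRE);
            memcpy(&snap, shared, sizeof(snap));
            __atomic_thread_fence(__ATOMIC_ACQUIRE);
        } while ( (v & 1) ||
                  v != __atomic_load_n(&shared->version, __ATOMIC_RELAXED) );

        return snap;
    }

    int main(void)
    {
        struct vcpu_time_info_sketch shared = { .version = 2 };
        struct vcpu_time_info_sketch snap = read_time_info(&shared);

        return (int)snap.version - 2;   /* 0 when a stable snapshot was taken */
    }
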
990
991
bool update_secondary_system_time(struct vcpu *v,
992
                                  struct vcpu_time_info *u)
993
1.41k
{
994
1.41k
    XEN_GUEST_HANDLE(vcpu_time_info_t) user_u = v->arch.time_info_guest;
995
1.41k
    struct guest_memory_policy policy =
996
1.41k
        { .smap_policy = SMAP_CHECK_ENABLED, .nested_guest_mode = false };
997
1.41k
998
1.41k
    if ( guest_handle_is_null(user_u) )
999
1.41k
        return true;
1000
1.41k
1001
18.4E
    update_guest_memory_policy(v, &policy);
1002
18.4E
1003
18.4E
    /* 1. Update userspace version. */
1004
18.4E
    if ( __copy_field_to_guest(user_u, u, version) == sizeof(u->version) )
1005
0
    {
1006
0
        update_guest_memory_policy(v, &policy);
1007
0
        return false;
1008
0
    }
1009
18.4E
    wmb();
1010
18.4E
    /* 2. Update all other userspace fields. */
1011
18.4E
    __copy_to_guest(user_u, u, 1);
1012
18.4E
    wmb();
1013
18.4E
    /* 3. Update userspace version. */
1014
18.4E
    u->version = version_update_end(u->version);
1015
18.4E
    __copy_field_to_guest(user_u, u, version);
1016
18.4E
1017
18.4E
    update_guest_memory_policy(v, &policy);
1018
18.4E
1019
18.4E
    return true;
1020
18.4E
}
1021
1022
void update_vcpu_system_time(struct vcpu *v)
1023
165k
{
1024
165k
    __update_vcpu_system_time(v, 0);
1025
165k
}
1026
1027
void force_update_vcpu_system_time(struct vcpu *v)
1028
0
{
1029
0
    __update_vcpu_system_time(v, 1);
1030
0
}
1031
1032
static void update_domain_rtc(void)
1033
0
{
1034
0
    struct domain *d;
1035
0
1036
0
    rcu_read_lock(&domlist_read_lock);
1037
0
1038
0
    for_each_domain ( d )
1039
0
        if ( is_hvm_domain(d) )
1040
0
            rtc_update_clock(d);
1041
0
1042
0
    rcu_read_unlock(&domlist_read_lock);
1043
0
}
1044
1045
void domain_set_time_offset(struct domain *d, int64_t time_offset_seconds)
1046
0
{
1047
0
    d->time_offset_seconds = time_offset_seconds;
1048
0
    if ( is_hvm_domain(d) )
1049
0
        rtc_update_clock(d);
1050
0
    update_domain_wallclock_time(d);
1051
0
}
1052
1053
int cpu_frequency_change(u64 freq)
1054
0
{
1055
0
    struct cpu_time *t = &this_cpu(cpu_time);
1056
0
    u64 curr_tsc;
1057
0
1058
0
    /* Sanity check: CPU frequency allegedly dropping below 1MHz? */
1059
0
    if ( freq < 1000000u )
1060
0
    {
1061
0
        printk(XENLOG_WARNING "Rejecting CPU frequency change "
1062
0
               "to %"PRIu64" Hz\n", freq);
1063
0
        return -EINVAL;
1064
0
    }
1065
0
1066
0
    local_irq_disable();
1067
0
    /* Platform time /first/, as we may be delayed by platform_timer_lock. */
1068
0
    t->stamp.master_stime = read_platform_stime(NULL);
1069
0
    curr_tsc = rdtsc_ordered();
1070
0
    /* TSC-extrapolated time may be bogus after frequency change. */
1071
0
    /*t->stamp.local_stime = get_s_time_fixed(curr_tsc);*/
1072
0
    t->stamp.local_stime = t->stamp.master_stime;
1073
0
    t->stamp.local_tsc = curr_tsc;
1074
0
    set_time_scale(&t->tsc_scale, freq);
1075
0
    local_irq_enable();
1076
0
1077
0
    update_vcpu_system_time(current);
1078
0
1079
0
    /* A full epoch should pass before we check for deviation. */
1080
0
    if ( smp_processor_id() == 0 )
1081
0
    {
1082
0
        set_timer(&calibration_timer, NOW() + EPOCH);
1083
0
        platform_time_calibration();
1084
0
    }
1085
0
1086
0
    return 0;
1087
0
}
1088
1089
/* Per-CPU communication between rendezvous IRQ and softirq handler. */
1090
static DEFINE_PER_CPU(struct cpu_time_stamp, cpu_calibration);
1091
1092
/* Softirq handler for per-CPU time calibration. */
1093
static void local_time_calibration(void)
1094
980
{
1095
980
    struct cpu_time *t = &this_cpu(cpu_time);
1096
980
    const struct cpu_time_stamp *c = &this_cpu(cpu_calibration);
1097
980
1098
980
    /*
1099
980
     * System (extrapolated from local and master oscillators) and TSC
1100
980
     * timestamps, taken during this calibration and the previous one.
1101
980
     */
1102
980
    struct cpu_time_stamp prev, curr;
1103
980
1104
980
    /*
1105
980
     * System time and TSC ticks elapsed during the previous calibration
1106
980
     * 'epoch'. These values are down-shifted to fit in 32 bits.
1107
980
     */
1108
980
    u64 stime_elapsed64, tsc_elapsed64;
1109
980
    u32 stime_elapsed32, tsc_elapsed32;
1110
980
1111
980
    /* Error correction to slow down a fast local clock. */
1112
980
    u32 error_factor = 0;
1113
980
1114
980
    /* Calculated TSC shift to ensure 32-bit scale multiplier. */
1115
980
    int tsc_shift = 0;
1116
980
1117
980
    /* The overall calibration scale multiplier. */
1118
980
    u32 calibration_mul_frac;
1119
980
1120
980
    if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
1121
890
    {
1122
890
        /* Atomically read cpu_calibration struct and write cpu_time struct. */
1123
890
        local_irq_disable();
1124
890
        t->stamp = *c;
1125
890
        local_irq_enable();
1126
890
        update_vcpu_system_time(current);
1127
890
        goto out;
1128
890
    }
1129
980
1130
90
    prev = t->stamp;
1131
90
1132
90
    /* Disabling IRQs ensures we atomically read cpu_calibration struct. */
1133
90
    local_irq_disable();
1134
90
    curr = *c;
1135
90
    local_irq_enable();
1136
90
1137
90
#if 0
1138
    printk("PRE%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64"\n",
1139
           smp_processor_id(), prev.local_tsc, prev.local_stime, prev.master_stime);
1140
    printk("CUR%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64
1141
           " -> %"PRId64"\n",
1142
           smp_processor_id(), curr.local_tsc, curr.local_stime, curr.master_stime,
1143
           curr.master_stime - curr.local_stime);
1144
#endif
1145
90
1146
90
    /* Local time warps forward if it lags behind master time. */
1147
90
    if ( curr.local_stime < curr.master_stime )
1148
0
        curr.local_stime = curr.master_stime;
1149
90
1150
90
    stime_elapsed64 = curr.master_stime - prev.master_stime;
1151
90
    tsc_elapsed64   = curr.local_tsc - prev.local_tsc;
1152
90
1153
90
    /*
1154
90
     * Weirdness can happen if we lose sync with the platform timer.
1155
90
     * We could be smarter here: resync platform timer with local timer?
1156
90
     */
1157
90
    if ( ((s64)stime_elapsed64 < (EPOCH / 2)) )
1158
0
        goto out;
1159
90
1160
90
    /*
1161
90
     * Calculate error-correction factor. This only slows down a fast local
1162
90
     * clock (slow clocks are warped forwards). The scale factor is clamped
1163
90
     * to >= 0.5.
1164
90
     */
1165
90
    if ( curr.local_stime != curr.master_stime )
1166
0
    {
1167
0
        u64 local_stime_err = curr.local_stime - curr.master_stime;
1168
0
1169
0
        if ( local_stime_err > EPOCH )
1170
0
            local_stime_err = EPOCH;
1171
0
        error_factor = div_frac(EPOCH, EPOCH + (u32)local_stime_err);
1172
0
    }
1173
90
1174
90
    /*
1175
90
     * We require 0 < stime_elapsed < 2^31.
1176
90
     * This allows us to binary shift a 32-bit tsc_elapsed such that:
1177
90
     * stime_elapsed < tsc_elapsed <= 2*stime_elapsed
1178
90
     */
1179
90
    while ( ((u32)stime_elapsed64 != stime_elapsed64) ||
1180
0
            ((s32)stime_elapsed64 < 0) )
1181
0
    {
1182
0
        stime_elapsed64 >>= 1;
1183
0
        tsc_elapsed64   >>= 1;
1184
0
    }
1185
90
1186
90
    /* stime_master_diff now fits in a 32-bit word. */
1187
90
    stime_elapsed32 = (u32)stime_elapsed64;
1188
90
1189
90
    /* tsc_elapsed <= 2*stime_elapsed */
1190
90
    while ( tsc_elapsed64 > (stime_elapsed32 * 2) )
1191
0
    {
1192
0
        tsc_elapsed64 >>= 1;
1193
0
        tsc_shift--;
1194
0
    }
1195
90
1196
90
    /* Local difference must now fit in 32 bits. */
1197
90
    ASSERT((u32)tsc_elapsed64 == tsc_elapsed64);
1198
90
    tsc_elapsed32 = (u32)tsc_elapsed64;
1199
90
1200
90
    /* tsc_elapsed > stime_elapsed */
1201
90
    ASSERT(tsc_elapsed32 != 0);
1202
90
    while ( tsc_elapsed32 <= stime_elapsed32 )
1203
0
    {
1204
0
        tsc_elapsed32 <<= 1;
1205
0
        tsc_shift++;
1206
0
    }
1207
90
1208
90
    calibration_mul_frac = div_frac(stime_elapsed32, tsc_elapsed32);
1209
90
    if ( error_factor != 0 )
1210
0
        calibration_mul_frac = mul_frac(calibration_mul_frac, error_factor);
1211
90
1212
90
#if 0
1213
    printk("---%d: %08x %08x %d\n", smp_processor_id(),
1214
           error_factor, calibration_mul_frac, tsc_shift);
1215
#endif
1216
90
1217
90
    /* Record new timestamp information, atomically w.r.t. interrupts. */
1218
90
    local_irq_disable();
1219
90
    t->tsc_scale.mul_frac = calibration_mul_frac;
1220
90
    t->tsc_scale.shift    = tsc_shift;
1221
90
    t->stamp              = curr;
1222
90
    local_irq_enable();
1223
90
1224
90
    update_vcpu_system_time(current);
1225
90
1226
932
 out:
1227
932
    if ( smp_processor_id() == 0 )
1228
88
    {
1229
88
        set_timer(&calibration_timer, NOW() + EPOCH);
1230
88
        platform_time_calibration();
1231
88
    }
1232
932
}
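
The error-correction step above slows a fast local clock by scaling the new multiplier with error_factor = EPOCH / (EPOCH + local_stime_err), a 0.32 fixed-point value from div_frac(). A worked example with invented numbers (local clock 0.5 ms ahead over the 1 s epoch):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical: over a 1 s EPOCH the local clock gained 0.5 ms. */
        uint64_t epoch_ns = 1000000000ull, err_ns = 500000ull;

        /* 0.32 fixed-point EPOCH / (EPOCH + err), as div_frac() computes. */
        uint32_t error_factor = (uint32_t)((epoch_ns << 32) / (epoch_ns + err_ns));

        /* Applying it to a scale multiplier slows the clock by ~0.05%. */
        uint32_t mul_frac = 0x80000000u;   /* e.g. the 2 GHz scale from earlier */
        uint32_t adjusted = (uint32_t)(((uint64_t)mul_frac * error_factor) >> 32);

        printf("error_factor=%#x adjusted mul_frac=%#x\n", error_factor, adjusted);
        return 0;
    }
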
1233
1234
/*
1235
 * TSC Reliability check
1236
 */
1237
1238
/*
1239
 * The Linux original version of this function is
1240
 * Copyright (c) 2006, Red Hat, Inc., Ingo Molnar
1241
 */
1242
static void check_tsc_warp(unsigned long tsc_khz, unsigned long *max_warp)
1243
4
{
1244
4
    static DEFINE_SPINLOCK(sync_lock);
1245
4
    static cycles_t last_tsc;
1246
4
1247
4
    cycles_t start, now, prev, end;
1248
4
    int i;
1249
4
1250
4
    start = rdtsc_ordered();
1251
4
1252
4
    /* The measurement runs for 20 msecs: */
1253
4
    end = start + tsc_khz * 20ULL;
1254
4
    now = start;
1255
4
1256
10.3k
    for ( i = 0; ; i++ )
1257
10.3k
    {
1258
10.3k
        /*
1259
10.3k
         * We take the global lock, measure TSC, save the
1260
10.3k
         * previous TSC that was measured (possibly on
1261
10.3k
         * another CPU) and update the previous TSC timestamp.
1262
10.3k
         */
1263
10.3k
        spin_lock(&sync_lock);
1264
10.3k
        prev = last_tsc;
1265
10.3k
        now = rdtsc_ordered();
1266
10.3k
        last_tsc = now;
1267
10.3k
        spin_unlock(&sync_lock);
1268
10.3k
1269
10.3k
        /*
1270
10.3k
         * Be nice every now and then (and also check whether measurement is 
1271
10.3k
         * done [we also insert a 10 million loops safety exit, so we don't
1272
10.3k
         * lock up in case the TSC readout is totally broken]):
1273
10.3k
         */
1274
10.3k
        if ( unlikely(!(i & 7)) )
1275
1.31k
        {
1276
1.31k
            if ( (now > end) || (i > 10000000) )
1277
12
                break;
1278
1.29k
            cpu_relax();
1279
1.29k
            /*touch_nmi_watchdog();*/
1280
1.29k
        }
1281
10.3k
1282
10.3k
        /*
1283
10.3k
         * Outside the critical section we can now see whether we saw a 
1284
10.3k
         * time-warp of the TSC going backwards:
1285
10.3k
         */
1286
10.3k
        if ( unlikely(prev > now) )
1287
5.54k
        {
1288
5.54k
            spin_lock(&sync_lock);
1289
5.54k
            if ( *max_warp < prev - now )
1290
13
                *max_warp = prev - now;
1291
5.54k
            spin_unlock(&sync_lock);
1292
5.54k
        }
1293
10.3k
    }
1294
4
}
1295
1296
static unsigned long tsc_max_warp, tsc_check_count;
1297
static cpumask_t tsc_check_cpumask;
1298
1299
static void tsc_check_slave(void *unused)
1300
8
{
1301
8
    unsigned int cpu = smp_processor_id();
1302
8
    local_irq_disable();
1303
1.46k
    while ( !cpumask_test_cpu(cpu, &tsc_check_cpumask) )
1304
1.45k
        cpu_relax();
1305
8
    check_tsc_warp(cpu_khz, &tsc_max_warp);
1306
8
    cpumask_clear_cpu(cpu, &tsc_check_cpumask);
1307
8
    local_irq_enable();
1308
8
}
1309
1310
static void tsc_check_reliability(void)
1311
1
{
1312
1
    unsigned int cpu = smp_processor_id();
1313
1
    static DEFINE_SPINLOCK(lock);
1314
1
1315
1
    spin_lock(&lock);
1316
1
1317
1
    tsc_check_count++;
1318
1
    smp_call_function(tsc_check_slave, NULL, 0);
1319
1
    cpumask_andnot(&tsc_check_cpumask, &cpu_online_map, cpumask_of(cpu));
1320
1
    local_irq_disable();
1321
1
    check_tsc_warp(cpu_khz, &tsc_max_warp);
1322
1
    local_irq_enable();
1323
59
    while ( !cpumask_empty(&tsc_check_cpumask) )
1324
58
        cpu_relax();
1325
1
1326
1
    spin_unlock(&lock);
1327
1
}
1328
1329
/*
1330
 * Rendezvous for all CPUs in IRQ context.
1331
 * Master CPU snapshots the platform timer.
1332
 * All CPUS snapshot their local TSC and extrapolation of system time.
1333
 */
1334
struct calibration_rendezvous {
1335
    cpumask_t cpu_calibration_map;
1336
    atomic_t semaphore;
1337
    s_time_t master_stime;
1338
    u64 master_tsc_stamp;
1339
};
1340
1341
static void
1342
time_calibration_rendezvous_tail(const struct calibration_rendezvous *r)
1343
355
{
1344
355
    struct cpu_time_stamp *c = &this_cpu(cpu_calibration);
1345
355
1346
355
    c->local_tsc    = rdtsc_ordered();
1347
355
    c->local_stime  = get_s_time_fixed(c->local_tsc);
1348
355
    c->master_stime = r->master_stime;
1349
355
1350
355
    raise_softirq(TIME_CALIBRATE_SOFTIRQ);
1351
355
}
1352
1353
/*
1354
 * Keep TSCs in sync when they run at the same rate, but may stop in
1355
 * deep-sleep C states.
1356
 */
1357
static void time_calibration_tsc_rendezvous(void *_r)
1358
685
{
1359
685
    int i;
1360
685
    struct calibration_rendezvous *r = _r;
1361
685
    unsigned int total_cpus = cpumask_weight(&r->cpu_calibration_map);
1362
685
1363
685
    /* Loop to get rid of cache effects on TSC skew. */
1364
2.59k
    for ( i = 4; i >= 0; i-- )
1365
1.90k
    {
1366
1.90k
        if ( smp_processor_id() == 0 )
1367
440
        {
1368
64.2k
            while ( atomic_read(&r->semaphore) != (total_cpus - 1) )
1369
63.8k
                cpu_relax();
1370
440
1371
440
            if ( r->master_stime == 0 )
1372
88
            {
1373
88
                r->master_stime = read_platform_stime(NULL);
1374
88
                r->master_tsc_stamp = rdtsc_ordered();
1375
88
            }
1376
440
            atomic_inc(&r->semaphore);
1377
440
1378
440
            if ( i == 0 )
1379
88
                write_tsc(r->master_tsc_stamp);
1380
440
1381
2.12k
            while ( atomic_read(&r->semaphore) != (2*total_cpus - 1) )
1382
1.68k
                cpu_relax();
1383
440
            atomic_set(&r->semaphore, 0);
1384
440
        }
1385
1.90k
        else
1386
1.46k
        {
1387
1.46k
            atomic_inc(&r->semaphore);
1388
58.7k
            while ( atomic_read(&r->semaphore) < total_cpus )
1389
57.3k
                cpu_relax();
1390
1.46k
1391
1.46k
            if ( i == 0 )
1392
220
                write_tsc(r->master_tsc_stamp);
1393
1.46k
1394
1.46k
            atomic_inc(&r->semaphore);
1395
4.84k
            while ( atomic_read(&r->semaphore) > total_cpus )
1396
3.37k
                cpu_relax();
1397
1.46k
        }
1398
1.90k
    }
1399
685
1400
685
    time_calibration_rendezvous_tail(r);
1401
685
}
1402
1403
/* Ordinary rendezvous function which does not modify TSC values. */
1404
static void time_calibration_std_rendezvous(void *_r)
1405
0
{
1406
0
    struct calibration_rendezvous *r = _r;
1407
0
    unsigned int total_cpus = cpumask_weight(&r->cpu_calibration_map);
1408
0
1409
0
    if ( smp_processor_id() == 0 )
1410
0
    {
1411
0
        while ( atomic_read(&r->semaphore) != (total_cpus - 1) )
1412
0
            cpu_relax();
1413
0
        r->master_stime = read_platform_stime(NULL);
1414
0
        smp_wmb(); /* write r->master_stime /then/ signal */
1415
0
        atomic_inc(&r->semaphore);
1416
0
    }
1417
0
    else
1418
0
    {
1419
0
        atomic_inc(&r->semaphore);
1420
0
        while ( atomic_read(&r->semaphore) != total_cpus )
1421
0
            cpu_relax();
1422
0
        smp_rmb(); /* receive signal /then/ read r->master_stime */
1423
0
    }
1424
0
1425
0
    time_calibration_rendezvous_tail(r);
1426
0
}
1427
1428
/*
1429
 * Rendezvous function used when clocksource is TSC and
1430
 * no CPU hotplug will be performed.
1431
 */
1432
static void time_calibration_nop_rendezvous(void *rv)
1433
0
{
1434
0
    const struct calibration_rendezvous *r = rv;
1435
0
    struct cpu_time_stamp *c = &this_cpu(cpu_calibration);
1436
0
1437
0
    c->local_tsc    = r->master_tsc_stamp;
1438
0
    c->local_stime  = r->master_stime;
1439
0
    c->master_stime = r->master_stime;
1440
0
1441
0
    raise_softirq(TIME_CALIBRATE_SOFTIRQ);
1442
0
}
1443
1444
static void (*time_calibration_rendezvous_fn)(void *) =
1445
    time_calibration_std_rendezvous;
1446
1447
static void time_calibration(void *unused)
1448
88
{
1449
88
    struct calibration_rendezvous r = {
1450
88
        .semaphore = ATOMIC_INIT(0)
1451
88
    };
1452
88
1453
88
    if ( clocksource_is_tsc() )
1454
0
    {
1455
0
        local_irq_disable();
1456
0
        r.master_stime = read_platform_stime(&r.master_tsc_stamp);
1457
0
        local_irq_enable();
1458
0
    }
1459
88
1460
88
    cpumask_copy(&r.cpu_calibration_map, &cpu_online_map);
1461
88
1462
88
    /* @wait=1 because we must wait for all cpus before freeing @r. */
1463
88
    on_selected_cpus(&r.cpu_calibration_map,
1464
88
                     time_calibration_rendezvous_fn,
1465
88
                     &r, 1);
1466
88
}
1467
1468
static struct cpu_time_stamp ap_bringup_ref;
1469
1470
void time_latch_stamps(void)
1471
11
{
1472
11
    unsigned long flags;
1473
11
1474
11
    local_irq_save(flags);
1475
11
    ap_bringup_ref.master_stime = read_platform_stime(NULL);
1476
11
    ap_bringup_ref.local_tsc = rdtsc_ordered();
1477
11
    local_irq_restore(flags);
1478
11
1479
11
    ap_bringup_ref.local_stime = get_s_time_fixed(ap_bringup_ref.local_tsc);
1480
11
}
1481
1482
void init_percpu_time(void)
1483
12
{
1484
12
    struct cpu_time *t = &this_cpu(cpu_time);
1485
12
    unsigned long flags;
1486
12
    u64 tsc;
1487
12
    s_time_t now;
1488
12
1489
12
    /* Initial estimate for TSC rate. */
1490
12
    t->tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
1491
12
1492
12
    local_irq_save(flags);
1493
12
    now = read_platform_stime(NULL);
1494
12
    tsc = rdtsc_ordered();
1495
12
    local_irq_restore(flags);
1496
12
1497
12
    t->stamp.master_stime = now;
1498
12
    /*
1499
12
     * To avoid a discontinuity (TSC and platform clock can't be expected
1500
12
     * to be in perfect sync), initialization here needs to match up with
1501
12
     * local_time_calibration()'s decision whether to use its fast path.
1502
12
     */
1503
12
    if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
1504
12
    {
1505
12
        if ( system_state < SYS_STATE_smp_boot )
1506
1
            now = get_s_time_fixed(tsc);
1507
12
        else
1508
11
            now += ap_bringup_ref.local_stime - ap_bringup_ref.master_stime;
1509
12
    }
1510
12
    t->stamp.local_tsc   = tsc;
1511
12
    t->stamp.local_stime = now;
1512
12
}
1513
1514
/*
1515
 * On certain older Intel CPUs writing the TSC MSR clears the upper 32 bits. 
1516
 * Obviously we must not use write_tsc() on such CPUs.
1517
 *
1518
 * Additionally, AMD specifies that being able to write the TSC MSR is not an 
1519
 * architectural feature (but, other than their manual says, also cannot be 
1520
 * determined from CPUID bits).
1521
 */
1522
static void __init tsc_check_writability(void)
1523
2
{
1524
2
    const char *what = NULL;
1525
2
    uint64_t tsc;
1526
2
1527
2
    /*
1528
2
     * If all CPUs are reported as synchronised and in sync, we never write
1529
2
     * the TSCs (except unavoidably, when a CPU is physically hot-plugged).
1530
2
     * Hence testing for writability is pointless and even harmful.
1531
2
     */
1532
2
    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1533
1
        return;
1534
2
1535
1
    tsc = rdtsc();
1536
1
    if ( wrmsr_safe(MSR_IA32_TSC, 0) == 0 )
1537
1
    {
1538
1
        uint64_t tmp, tmp2 = rdtsc();
1539
1
1540
1
        write_tsc(tsc | (1ULL << 32));
1541
1
        tmp = rdtsc();
1542
1
        if ( ABS((s64)tmp - (s64)tmp2) < (1LL << 31) )
1543
0
            what = "only partially";
1544
1
    }
1545
1
    else
1546
0
    {
1547
0
        what = "not";
1548
0
    }
1549
1
1550
1
    /* Nothing to do if the TSC is fully writable. */
1551
1
    if ( !what )
1552
1
    {
1553
1
        /*
1554
1
         * Paranoia - write back original TSC value. However, APs get synced
1555
1
         * with BSP as they are brought up, so this doesn't much matter.
1556
1
         */
1557
1
        write_tsc(tsc);
1558
1
        return;
1559
1
    }
1560
1
1561
0
    printk(XENLOG_WARNING "TSC %s writable\n", what);
1562
0
1563
0
    /* time_calibration_tsc_rendezvous() must not be used */
1564
0
    setup_clear_cpu_cap(X86_FEATURE_CONSTANT_TSC);
1565
0
1566
0
    /* cstate_restore_tsc() must not be used (or do nothing) */
1567
0
    if ( !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
1568
0
        cpuidle_disable_deep_cstate();
1569
0
1570
0
    /* synchronize_tsc_slave() must do nothing */
1571
0
    disable_tsc_sync = true;
1572
0
}
1573
1574
static void __init reset_percpu_time(void *unused)
1575
0
{
1576
0
    struct cpu_time *t = &this_cpu(cpu_time);
1577
0
1578
0
    t->stamp.local_tsc = boot_tsc_stamp;
1579
0
    t->stamp.local_stime = 0;
1580
0
    t->stamp.local_stime = get_s_time_fixed(boot_tsc_stamp);
1581
0
    t->stamp.master_stime = t->stamp.local_stime;
1582
0
}
1583
1584
static void __init try_platform_timer_tail(bool late)
1585
1
{
1586
1
    init_timer(&plt_overflow_timer, plt_overflow, NULL, 0);
1587
1
    plt_overflow(NULL);
1588
1
1589
1
    platform_timer_stamp = plt_stamp64;
1590
1
    stime_platform_stamp = NOW();
1591
1
1592
1
    if ( !late )
1593
1
        init_percpu_time();
1594
1
1595
1
    init_timer(&calibration_timer, time_calibration, NULL, 0);
1596
1
    set_timer(&calibration_timer, NOW() + EPOCH);
1597
1
}
1598
1599
/* Late init function, after all cpus have booted */
1600
static int __init verify_tsc_reliability(void)
1601
1
{
1602
1
    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1603
1
    {
1604
1
        /*
1605
1
         * Sadly, despite processor vendors' best design guidance efforts, on
1606
1
         * some systems, cpus may come out of reset improperly synchronized.
1607
1
         * So we must verify there is no warp and we can't do that until all
1608
1
         * CPUs are booted.
1609
1
         */
1610
1
        tsc_check_reliability();
1611
1
        if ( tsc_max_warp )
1612
1
        {
1613
1
            printk("TSC warp detected, disabling TSC_RELIABLE\n");
1614
1
            setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
1615
1
        }
1616
0
        else if ( !strcmp(opt_clocksource, "tsc") &&
1617
0
                  (try_platform_timer(&plt_tsc) > 0) )
1618
0
        {
1619
0
            /*
1620
0
             * Platform timer has changed and CPU time will only be updated
1621
0
             * after we set again the calibration timer, which means we need to
1622
0
             * seed again each local CPU time. At this stage TSC is known to be
1623
0
             * reliable i.e. monotonically increasing across all CPUs so this
1624
0
             * lets us remove the skew between platform timer and TSC, since
1625
0
             * these are now effectively the same.
1626
0
             */
1627
0
            on_selected_cpus(&cpu_online_map, reset_percpu_time, NULL, 1);
1628
0
1629
0
            /*
1630
0
             * We won't do CPU Hotplug and TSC clocksource is being used which
1631
0
             * means we have a reliable TSC, plus we don't sync with any other
1632
0
             * clocksource so no need for rendezvous.
1633
0
             */
1634
0
            time_calibration_rendezvous_fn = time_calibration_nop_rendezvous;
1635
0
1636
0
            /* Finish platform timer switch. */
1637
0
            try_platform_timer_tail(true);
1638
0
1639
0
            printk("Switched to Platform timer %s TSC\n",
1640
0
                   freq_string(plt_src.frequency));
1641
0
            return 0;
1642
0
        }
1643
1
    }
1644
1
1645
1
    /*
1646
1
     * Re-run the TSC writability check if it didn't run to completion, as
1647
1
     * X86_FEATURE_TSC_RELIABLE may have been cleared by now. This is needed
1648
1
     * for determining which rendezvous function to use (below).
1649
1
     */
1650
1
    if ( !disable_tsc_sync )
1651
1
        tsc_check_writability();
1652
1
1653
1
    /*
1654
1
     * While with constant-rate TSCs the scale factor can be shared, when TSCs
1655
1
     * are not marked as 'reliable', re-sync during rendezvous.
1656
1
     */
1657
1
    if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
1658
1
         !boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1659
1
        time_calibration_rendezvous_fn = time_calibration_tsc_rendezvous;
1660
1
1661
1
    return 0;
1662
1
}
1663
__initcall(verify_tsc_reliability);
1664
1665
/* Late init function (after interrupts are enabled). */
1666
int __init init_xen_time(void)
1667
1
{
1668
1
    tsc_check_writability();
1669
1
1670
1
    open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration);
1671
1
1672
1
    /* NB. get_cmos_time() can take over one second to execute. */
1673
1
    do_settime(get_cmos_time(), 0, NOW());
1674
1
1675
1
    /* Finish platform timer initialization. */
1676
1
    try_platform_timer_tail(false);
1677
1
1678
1
    return 0;
1679
1
}
1680
1681
1682
/* Early init function. */
1683
void __init early_time_init(void)
1684
1
{
1685
1
    struct cpu_time *t = &this_cpu(cpu_time);
1686
1
    u64 tmp;
1687
1
1688
1
    preinit_pit();
1689
1
    tmp = init_platform_timer();
1690
1
    plt_tsc.frequency = tmp;
1691
1
1692
1
    set_time_scale(&t->tsc_scale, tmp);
1693
1
    t->stamp.local_tsc = boot_tsc_stamp;
1694
1
1695
1
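    /* At this point tmp is the TSC frequency in Hz; derive cpu_khz from it. */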
    do_div(tmp, 1000);
1696
1
    cpu_khz = (unsigned long)tmp;
1697
1
    printk("Detected %lu.%03lu MHz processor.\n", 
1698
1
           cpu_khz / 1000, cpu_khz % 1000);
1699
1
1700
1
    setup_irq(0, 0, &irq0);
1701
1
}
1702
1703
/* Keep the PIT enabled so that pit_broadcast keeps working while cpuidle is enabled. */
1704
static int _disable_pit_irq(void(*hpet_broadcast_setup)(void))
1705
1
{
1706
1
    int ret = 1;
1707
1
1708
1
    if ( using_pit || !cpu_has_apic )
1709
0
        return -1;
1710
1
1711
1
    /*
1712
1
     * If we do not rely on PIT CH0 then we can use the HPET for one-shot timer
1713
1
     * emulation when entering deep C states.
1714
1
     * XXX dom0 may rely on RTC interrupt delivery, so only enable
1715
1
     * hpet_broadcast if FSB mode is available or if force_hpet_broadcast is set.
1716
1
     */
1717
1
    if ( cpuidle_using_deep_cstate() && !boot_cpu_has(X86_FEATURE_ARAT) )
1718
0
    {
1719
0
        hpet_broadcast_setup();
1720
0
        if ( !hpet_broadcast_is_available() )
1721
0
        {
1722
0
            if ( xen_cpuidle > 0 )
1723
0
            {
1724
0
                printk("%ps() failed, turning to PIT broadcast\n",
1725
0
                       hpet_broadcast_setup);
1726
0
                return -1;
1727
0
            }
1728
0
            ret = 0;
1729
0
        }
1730
0
    }
1731
1
1732
1
    /* Disable PIT CH0 timer interrupt. */
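    /*
     * Mode/command byte 0x30 selects channel 0, lobyte/hibyte access and
     * mode 0 (interrupt on terminal count).  Mode 0 is one-shot, so once
     * the zero count below has been loaded, channel 0 stops generating
     * periodic interrupts.
     */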
1733
1
    outb_p(0x30, PIT_MODE);
1734
1
    outb_p(0, PIT_CH0);
1735
1
    outb_p(0, PIT_CH0);
1736
1
1737
1
    return ret;
1738
1
}
1739
1740
static int __init disable_pit_irq(void)
1741
1
{
1742
1
    if ( !_disable_pit_irq(hpet_broadcast_init) )
1743
0
    {
1744
0
        xen_cpuidle = 0;
1745
0
        printk("CPUIDLE: disabled due to no HPET. "
1746
0
               "Force enable with 'cpuidle'.\n");
1747
0
    }
1748
1
1749
1
    return 0;
1750
1
}
1751
__initcall(disable_pit_irq);
1752
1753
void pit_broadcast_enter(void)
1754
0
{
1755
0
    cpumask_set_cpu(smp_processor_id(), &pit_broadcast_mask);
1756
0
}
1757
1758
void pit_broadcast_exit(void)
1759
0
{
1760
0
    int cpu = smp_processor_id();
1761
0
1762
0
    if ( cpumask_test_and_clear_cpu(cpu, &pit_broadcast_mask) )
1763
0
        reprogram_timer(this_cpu(timer_deadline));
1764
0
}
1765
1766
int pit_broadcast_is_available(void)
1767
0
{
1768
0
    return cpuidle_using_deep_cstate();
1769
0
}
1770
1771
void send_timer_event(struct vcpu *v)
1772
5.74k
{
1773
5.74k
    send_guest_vcpu_virq(v, VIRQ_TIMER);
1774
5.74k
}
1775
1776
/* "cmos_utc_offset" is the difference between UTC time and CMOS time. */
1777
static long cmos_utc_offset; /* in seconds */
1778
1779
int time_suspend(void)
1780
0
{
1781
0
    if ( smp_processor_id() == 0 )
1782
0
    {
1783
0
        cmos_utc_offset = -get_cmos_time();
1784
0
        cmos_utc_offset += get_sec();
1785
0
        kill_timer(&calibration_timer);
1786
0
1787
0
        /* Sync platform timer stamps. */
1788
0
        platform_time_calibration();
1789
0
    }
1790
0
1791
0
    /* Better to cancel any pending calibration softirq for accuracy. */
1792
0
    clear_bit(TIME_CALIBRATE_SOFTIRQ, &softirq_pending(smp_processor_id()));
1793
0
1794
0
    return 0;
1795
0
}
1796
1797
int time_resume(void)
1798
0
{
1799
0
    preinit_pit();
1800
0
1801
0
    resume_platform_timer();
1802
0
1803
0
    if ( !_disable_pit_irq(hpet_broadcast_resume) )
1804
0
        BUG();
1805
0
1806
0
    init_percpu_time();
1807
0
1808
0
    set_timer(&calibration_timer, NOW() + EPOCH);
1809
0
1810
0
    do_settime(get_cmos_time() + cmos_utc_offset, 0, NOW());
1811
0
1812
0
    update_vcpu_system_time(current);
1813
0
1814
0
    update_domain_rtc();
1815
0
1816
0
    return 0;
1817
0
}
1818
1819
int hwdom_pit_access(struct ioreq *ioreq)
1820
0
{
1821
0
    /* Is Xen using Channel 2? Then disallow direct dom0 access. */
1822
0
    if ( using_pit )
1823
0
        return 0;
1824
0
1825
0
    switch ( ioreq->addr )
1826
0
    {
1827
0
    case PIT_CH2:
1828
0
        if ( ioreq->dir == IOREQ_READ )
1829
0
            ioreq->data = inb(PIT_CH2);
1830
0
        else
1831
0
            outb(ioreq->data, PIT_CH2);
1832
0
        return 1;
1833
0
1834
0
    case PIT_MODE:
1835
0
        if ( ioreq->dir == IOREQ_READ )
1836
0
            return 0; /* urk! */
1837
0
        switch ( ioreq->data & 0xc0 )
1838
0
        {
1839
0
        case 0xc0: /* Read Back */
1840
0
            if ( ioreq->data & 0x08 )    /* Select Channel 2? */
1841
0
                outb(ioreq->data & 0xf8, PIT_MODE);
1842
0
            if ( !(ioreq->data & 0x06) ) /* Select Channel 0/1? */
1843
0
                return 1; /* no - we're done */
1844
0
            /* Filter Channel 2 and reserved bit 0. */
1845
0
            ioreq->data &= ~0x09;
1846
0
            return 0; /* emulate ch0/1 readback */
1847
0
        case 0x80: /* Select Counter 2 */
1848
0
            outb(ioreq->data, PIT_MODE);
1849
0
            return 1;
1850
0
        }
1851
0
        break;
1852
0
1853
0
    case 0x61:
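        /*
         * Port 0x61: only bit 0 (PIT channel 2 gate) and bit 1 (speaker
         * data enable) are writable by the hardware domain below; all
         * other bits keep their current hardware values.
         */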
1854
0
        if ( ioreq->dir == IOREQ_READ )
1855
0
            ioreq->data = inb(0x61);
1856
0
        else
1857
0
            outb((inb(0x61) & ~3) | (ioreq->data & 3), 0x61);
1858
0
        return 1;
1859
0
    }
1860
0
1861
0
    return 0;
1862
0
}
1863
1864
/*
1865
 * PV SoftTSC Emulation.
1866
 */
1867
1868
/*
1869
 * tsc=unstable: Override all tests; assume TSC is unreliable.
1870
 * tsc=skewed: Assume TSCs are individually reliable, but skewed across CPUs.
1871
 * tsc=stable:socket: Assume TSCs are reliable across sockets.
1872
 */
1873
static int __init tsc_parse(const char *s)
1874
0
{
1875
0
    if ( !strcmp(s, "unstable") )
1876
0
    {
1877
0
        setup_clear_cpu_cap(X86_FEATURE_CONSTANT_TSC);
1878
0
        setup_clear_cpu_cap(X86_FEATURE_NONSTOP_TSC);
1879
0
        setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
1880
0
    }
1881
0
    else if ( !strcmp(s, "skewed") )
1882
0
        setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
1883
0
    else if ( !strcmp(s, "stable:socket") )
1884
0
        tsc_flags |= TSC_RELIABLE_SOCKET;
1885
0
    else
1886
0
        return -EINVAL;
1887
0
1888
0
    return 0;
1889
0
}
1890
custom_param("tsc", tsc_parse);
1891
1892
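/*
 * Convert guest system time (in ns) to guest TSC ticks and back.  For PV
 * domains the emulated TSC counts from zero starting at vtsc_offset, so
 * that offset is subtracted (or added back) around the ns <-> tick scaling.
 */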
u64 gtime_to_gtsc(struct domain *d, u64 time)
1893
0
{
1894
0
    if ( !is_hvm_domain(d) )
1895
0
    {
1896
0
        if ( time < d->arch.vtsc_offset )
1897
0
            return -scale_delta(d->arch.vtsc_offset - time,
1898
0
                                &d->arch.ns_to_vtsc);
1899
0
        time -= d->arch.vtsc_offset;
1900
0
    }
1901
0
    return scale_delta(time, &d->arch.ns_to_vtsc);
1902
0
}
1903
1904
u64 gtsc_to_gtime(struct domain *d, u64 tsc)
1905
0
{
1906
0
    u64 time = scale_delta(tsc, &d->arch.vtsc_to_ns);
1907
0
1908
0
    if ( !is_hvm_domain(d) )
1909
0
        time += d->arch.vtsc_offset;
1910
0
    return time;
1911
0
}
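/*
 * For reference, a minimal sketch (not part of this file) of what the two
 * scale_delta()-based conversions above amount to, assuming a guest TSC
 * frequency of d->arch.tsc_khz kHz and ignoring the fixed-point
 * representation used by struct time_scale:
 *
 *   static uint64_t ns_to_ticks(uint64_t ns, uint32_t tsc_khz)
 *   {
 *       return (uint64_t)(((__uint128_t)ns * tsc_khz) / 1000000);
 *   }
 *
 *   static uint64_t ticks_to_ns(uint64_t ticks, uint32_t tsc_khz)
 *   {
 *       return (uint64_t)(((__uint128_t)ticks * 1000000) / tsc_khz);
 *   }
 *
 * E.g. with tsc_khz == 2000000 (a 2 GHz guest TSC), 1000000 ns of guest
 * time corresponds to 2000000 ticks.
 */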
1912
1913
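/*
 * Emulate a guest RDTSC/RDTSCP: take the current system time, force it to
 * be strictly monotonic for the domain (vtsc_last), convert it to guest
 * TSC ticks and return it in EDX:EAX; for RDTSCP, ECX additionally holds
 * the incarnation in PVRDTSCP mode and 0 otherwise.
 */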
void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs, int rdtscp)
1914
0
{
1915
0
    s_time_t now = get_s_time();
1916
0
    struct domain *d = v->domain;
1917
0
1918
0
    spin_lock(&d->arch.vtsc_lock);
1919
0
1920
0
#if !defined(NDEBUG) || defined(CONFIG_PERF_COUNTERS)
1921
0
    if ( guest_kernel_mode(v, regs) )
1922
0
        d->arch.vtsc_kerncount++;
1923
0
    else
1924
0
        d->arch.vtsc_usercount++;
1925
0
#endif
1926
0
1927
0
    if ( (int64_t)(now - d->arch.vtsc_last) > 0 )
1928
0
        d->arch.vtsc_last = now;
1929
0
    else
1930
0
        now = ++d->arch.vtsc_last;
1931
0
1932
0
    spin_unlock(&d->arch.vtsc_lock);
1933
0
1934
0
    msr_split(regs, gtime_to_gtsc(d, now));
1935
0
1936
0
    if ( rdtscp )
1937
0
         regs->rcx =
1938
0
             (d->arch.tsc_mode == TSC_MODE_PVRDTSCP) ? d->arch.incarnation : 0;
1939
0
}
1940
1941
bool clocksource_is_tsc(void)
1942
99.4k
{
1943
99.4k
    return plt_src.read_counter == read_tsc;
1944
99.4k
}
1945
1946
int host_tsc_is_safe(void)
1947
0
{
1948
0
    return boot_cpu_has(X86_FEATURE_TSC_RELIABLE);
1949
0
}
1950
1951
/*
1952
 * Called to collect TSC-related data only for a save file or live
1953
 * migration; called after the last rdtsc is done on this incarnation.
1954
 */
1955
void tsc_get_info(struct domain *d, uint32_t *tsc_mode,
1956
                  uint64_t *elapsed_nsec, uint32_t *gtsc_khz,
1957
                  uint32_t *incarnation)
1958
0
{
1959
0
    bool enable_tsc_scaling = is_hvm_domain(d) &&
1960
0
                              hvm_tsc_scaling_supported && !d->arch.vtsc;
1961
0
1962
0
    *incarnation = d->arch.incarnation;
1963
0
    *tsc_mode = d->arch.tsc_mode;
1964
0
1965
0
    switch ( *tsc_mode )
1966
0
    {
1967
0
        uint64_t tsc;
1968
0
1969
0
    case TSC_MODE_NEVER_EMULATE:
1970
0
        *elapsed_nsec = *gtsc_khz = 0;
1971
0
        break;
1972
0
    case TSC_MODE_DEFAULT:
1973
0
        if ( d->arch.vtsc )
1974
0
        {
1975
0
    case TSC_MODE_ALWAYS_EMULATE:
1976
0
            *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
1977
0
            *gtsc_khz = d->arch.tsc_khz;
1978
0
            break;
1979
0
        }
1980
0
        tsc = rdtsc();
1981
0
        *elapsed_nsec = scale_delta(tsc, &d->arch.vtsc_to_ns);
1982
0
        *gtsc_khz = enable_tsc_scaling ? d->arch.tsc_khz : cpu_khz;
1983
0
        break;
1984
0
    case TSC_MODE_PVRDTSCP:
1985
0
        if ( d->arch.vtsc )
1986
0
        {
1987
0
            *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
1988
0
            *gtsc_khz = cpu_khz;
1989
0
        }
1990
0
        else
1991
0
        {
1992
0
            tsc = rdtsc();
1993
0
            *elapsed_nsec = scale_delta(tsc, &this_cpu(cpu_time).tsc_scale) -
1994
0
                            d->arch.vtsc_offset;
1995
0
            *gtsc_khz = enable_tsc_scaling ? d->arch.tsc_khz
1996
0
                                           : 0 /* ignored by tsc_set_info */;
1997
0
        }
1998
0
        break;
1999
0
    }
2000
0
2001
0
    if ( (int64_t)*elapsed_nsec < 0 )
2002
0
        *elapsed_nsec = 0;
2003
0
}
2004
2005
/*
2006
 * This may be called as many as three times for a domain, once when the
2007
 * hypervisor creates the domain, once when the toolstack creates the
2008
 * domain and, if restoring/migrating, once when saved/migrated values
2009
 * are restored.  Care must be taken that, if multiple calls occur,
2010
 * only the last "sticks" and all are completed before the guest executes
2011
 * an rdtsc instruction.
2012
 */
2013
void tsc_set_info(struct domain *d,
2014
                  uint32_t tsc_mode, uint64_t elapsed_nsec,
2015
                  uint32_t gtsc_khz, uint32_t incarnation)
2016
2
{
2017
2
    if ( is_idle_domain(d) || is_hardware_domain(d) )
2018
2
    {
2019
2
        d->arch.vtsc = 0;
2020
2
        return;
2021
2
    }
2022
2
2023
0
    switch ( d->arch.tsc_mode = tsc_mode )
2024
0
    {
2025
0
        bool enable_tsc_scaling;
2026
0
2027
0
    case TSC_MODE_DEFAULT:
2028
0
    case TSC_MODE_ALWAYS_EMULATE:
2029
0
        d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
2030
0
        d->arch.tsc_khz = gtsc_khz ?: cpu_khz;
2031
0
        set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000);
2032
0
2033
0
        /*
2034
0
         * In default mode use native TSC if the host has safe TSC and
2035
0
         * host and guest frequencies are the same (either "naturally" or
2036
0
         * - for HVM/PVH - via TSC scaling).
2037
0
         * When a guest is created, gtsc_khz is passed in as zero, making
2038
0
         * d->arch.tsc_khz == cpu_khz. Thus no need to check incarnation.
2039
0
         */
2040
0
        if ( tsc_mode == TSC_MODE_DEFAULT && host_tsc_is_safe() &&
2041
0
             (d->arch.tsc_khz == cpu_khz ||
2042
0
              (is_hvm_domain(d) &&
2043
0
               hvm_get_tsc_scaling_ratio(d->arch.tsc_khz))) )
2044
0
        {
2045
0
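    /*
     * Note: this case label sits inside the if() body above, so
     * TSC_MODE_NEVER_EMULATE shares the "use the native TSC" path
     * (d->arch.vtsc = 0) without any of the emulation setup below.
     */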
    case TSC_MODE_NEVER_EMULATE:
2046
0
            d->arch.vtsc = 0;
2047
0
            break;
2048
0
        }
2049
0
        d->arch.vtsc = 1;
2050
0
        d->arch.ns_to_vtsc = scale_reciprocal(d->arch.vtsc_to_ns);
2051
0
        break;
2052
0
    case TSC_MODE_PVRDTSCP:
2053
0
        d->arch.vtsc = !boot_cpu_has(X86_FEATURE_RDTSCP) ||
2054
0
                       !host_tsc_is_safe();
2055
0
        enable_tsc_scaling = is_hvm_domain(d) && !d->arch.vtsc &&
2056
0
                             hvm_get_tsc_scaling_ratio(gtsc_khz ?: cpu_khz);
2057
0
        d->arch.tsc_khz = (enable_tsc_scaling && gtsc_khz) ? gtsc_khz : cpu_khz;
2058
0
        set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000 );
2059
0
        d->arch.ns_to_vtsc = scale_reciprocal(d->arch.vtsc_to_ns);
2060
0
        if ( d->arch.vtsc )
2061
0
            d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
2062
0
        else {
2063
0
            /* When using the native TSC, the offset is in ns relative to power-on
2064
0
             * of the physical machine. */
2065
0
            d->arch.vtsc_offset = scale_delta(rdtsc(),
2066
0
                                              &this_cpu(cpu_time).tsc_scale) -
2067
0
                                  elapsed_nsec;
2068
0
        }
2069
0
        break;
2070
0
    }
2071
0
    d->arch.incarnation = incarnation + 1;
2072
0
    if ( is_hvm_domain(d) )
2073
0
    {
2074
0
        if ( hvm_tsc_scaling_supported && !d->arch.vtsc )
2075
0
            d->arch.hvm_domain.tsc_scaling_ratio =
2076
0
                hvm_get_tsc_scaling_ratio(d->arch.tsc_khz);
2077
0
2078
0
        hvm_set_rdtsc_exiting(d, d->arch.vtsc);
2079
0
        if ( d->vcpu && d->vcpu[0] && incarnation == 0 )
2080
0
        {
2081
0
            /*
2082
0
             * set_tsc_offset() is called from hvm_vcpu_initialise() before
2083
0
             * tsc_set_info(). New vtsc mode may require recomputing TSC
2084
0
             * offset.
2085
0
             * We only need to do this for BSP during initial boot. APs will
2086
0
             * call set_tsc_offset() later from hvm_vcpu_reset_state() and they
2087
0
             * will sync their TSC to BSP's sync_tsc.
2088
0
             */
2089
0
            d->arch.hvm_domain.sync_tsc = rdtsc();
2090
0
            hvm_funcs.set_tsc_offset(d->vcpu[0],
2091
0
                                     d->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset,
2092
0
                                     d->arch.hvm_domain.sync_tsc);
2093
0
        }
2094
0
    }
2095
0
2096
0
    recalculate_cpuid_policy(d);
2097
0
}
2098
2099
/* vTSC may incur measurable performance degradation; diagnose with this key handler. */
2100
static void dump_softtsc(unsigned char key)
2101
0
{
2102
0
    struct domain *d;
2103
0
    int domcnt = 0;
2104
0
2105
0
    tsc_check_reliability();
2106
0
    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
2107
0
        printk("TSC marked as reliable, "
2108
0
               "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
2109
0
    else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
2110
0
    {
2111
0
        printk("TSC has constant rate, ");
2112
0
        if ( max_cstate <= 2 && tsc_max_warp == 0 )
2113
0
            printk("no deep Cstates, passed warp test, deemed reliable, ");
2114
0
        else
2115
0
            printk("deep Cstates possible, so not reliable, ");
2116
0
        printk("warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
2117
0
    } else
2118
0
        printk("TSC not marked as either constant or reliable, "
2119
0
               "warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
2120
0
    for_each_domain ( d )
2121
0
    {
2122
0
        if ( is_hardware_domain(d) && d->arch.tsc_mode == TSC_MODE_DEFAULT )
2123
0
            continue;
2124
0
        printk("dom%u%s: mode=%d",d->domain_id,
2125
0
                is_hvm_domain(d) ? "(hvm)" : "", d->arch.tsc_mode);
2126
0
        if ( d->arch.vtsc_offset )
2127
0
            printk(",ofs=%#"PRIx64, d->arch.vtsc_offset);
2128
0
        if ( d->arch.tsc_khz )
2129
0
            printk(",khz=%"PRIu32, d->arch.tsc_khz);
2130
0
        if ( d->arch.incarnation )
2131
0
            printk(",inc=%"PRIu32, d->arch.incarnation);
2132
0
#if !defined(NDEBUG) || defined(CONFIG_PERF_COUNTERS)
2133
0
        if ( d->arch.vtsc_kerncount | d->arch.vtsc_usercount )
2134
0
            printk(",vtsc count: %"PRIu64" kernel,%"PRIu64" user",
2135
0
                   d->arch.vtsc_kerncount, d->arch.vtsc_usercount);
2136
0
#endif
2137
0
        printk("\n");
2138
0
        domcnt++;
2139
0
    }
2140
0
2141
0
    if ( !domcnt )
2142
0
            printk("No domains have emulated TSC\n");
2143
0
}
2144
2145
static int __init setup_dump_softtsc(void)
2146
1
{
2147
1
    register_keyhandler('s', dump_softtsc, "dump softtsc stats", 1);
2148
1
    return 0;
2149
1
}
2150
__initcall(setup_dump_softtsc);
2151
2152
/*
2153
 * Local variables:
2154
 * mode: C
2155
 * c-file-style: "BSD"
2156
 * c-basic-offset: 4
2157
 * tab-width: 4
2158
 * indent-tabs-mode: nil
2159
 * End:
2160
 */