Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/arch/x86/cpuid.c
Line
Count
Source
1
#include <xen/init.h>
2
#include <xen/lib.h>
3
#include <xen/sched.h>
4
#include <asm/cpuid.h>
5
#include <asm/hvm/hvm.h>
6
#include <asm/hvm/nestedhvm.h>
7
#include <asm/hvm/svm/svm.h>
8
#include <asm/hvm/vmx/vmcs.h>
9
#include <asm/paging.h>
10
#include <asm/processor.h>
11
#include <asm/xstate.h>
12
13
const uint32_t known_features[] = INIT_KNOWN_FEATURES;
14
const uint32_t special_features[] = INIT_SPECIAL_FEATURES;
15
16
static const uint32_t pv_featuremask[] = INIT_PV_FEATURES;
17
static const uint32_t hvm_shadow_featuremask[] = INIT_HVM_SHADOW_FEATURES;
18
static const uint32_t hvm_hap_featuremask[] = INIT_HVM_HAP_FEATURES;
19
static const uint32_t deep_features[] = INIT_DEEP_FEATURES;
20
21
2.68k
#define EMPTY_LEAF ((struct cpuid_leaf){})
22
static void zero_leaves(struct cpuid_leaf *l,
23
                        unsigned int first, unsigned int last)
24
1
{
25
1
    memset(&l[first], 0, sizeof(*l) * (last - first + 1));
26
1
}
27
28
struct cpuid_policy __read_mostly raw_cpuid_policy,
29
    __read_mostly host_cpuid_policy,
30
    __read_mostly pv_max_cpuid_policy,
31
    __read_mostly hvm_max_cpuid_policy;
32
33
static void cpuid_leaf(uint32_t leaf, struct cpuid_leaf *data)
34
20
{
35
20
    cpuid(leaf, &data->a, &data->b, &data->c, &data->d);
36
20
}
37
38
static void sanitise_featureset(uint32_t *fs)
39
3
{
40
3
    /* for_each_set_bit() uses unsigned longs.  Extend with zeroes. */
41
3
    uint32_t disabled_features[
42
3
        ROUNDUP(FSCAPINTS, sizeof(unsigned long)/sizeof(uint32_t))] = {};
43
3
    unsigned int i;
44
3
45
33
    for ( i = 0; i < FSCAPINTS; ++i )
46
30
    {
47
30
        /* Clamp to known mask. */
48
30
        fs[i] &= known_features[i];
49
30
50
30
        /*
51
30
         * Identify which features with deep dependencies have been
52
30
         * disabled.
53
30
         */
54
30
        disabled_features[i] = ~fs[i] & deep_features[i];
55
30
    }
56
3
57
3
    for_each_set_bit(i, (void *)disabled_features,
58
3
                     sizeof(disabled_features) * 8)
59
8
    {
60
8
        const uint32_t *dfs = lookup_deep_deps(i);
61
8
        unsigned int j;
62
8
63
8
        ASSERT(dfs); /* deep_features[] should guarantee this. */
64
8
65
88
        for ( j = 0; j < FSCAPINTS; ++j )
66
80
        {
67
80
            fs[j] &= ~dfs[j];
68
80
            disabled_features[j] &= ~dfs[j];
69
80
        }
70
8
    }
71
3
}
72
73
static void recalculate_xstate(struct cpuid_policy *p)
74
4
{
75
4
    uint64_t xstates = XSTATE_FP_SSE;
76
4
    uint32_t xstate_size = XSTATE_AREA_MIN_SIZE;
77
4
    unsigned int i, Da1 = p->xstate.Da1;
78
4
79
4
    /*
80
4
     * The Da1 leaf is the only piece of information preserved in the common
81
4
     * case.  Everything else is derived from other feature state.
82
4
     */
83
4
    memset(&p->xstate, 0, sizeof(p->xstate));
84
4
85
4
    if ( !p->basic.xsave )
86
0
        return;
87
4
88
4
    if ( p->basic.avx )
89
4
    {
90
4
        xstates |= XSTATE_YMM;
91
4
        xstate_size = max(xstate_size,
92
4
                          xstate_offsets[_XSTATE_YMM] +
93
4
                          xstate_sizes[_XSTATE_YMM]);
94
4
    }
95
4
96
4
    if ( p->feat.mpx )
97
0
    {
98
0
        xstates |= XSTATE_BNDREGS | XSTATE_BNDCSR;
99
0
        xstate_size = max(xstate_size,
100
0
                          xstate_offsets[_XSTATE_BNDCSR] +
101
0
                          xstate_sizes[_XSTATE_BNDCSR]);
102
0
    }
103
4
104
4
    if ( p->feat.avx512f )
105
0
    {
106
0
        xstates |= XSTATE_OPMASK | XSTATE_ZMM | XSTATE_HI_ZMM;
107
0
        xstate_size = max(xstate_size,
108
0
                          xstate_offsets[_XSTATE_HI_ZMM] +
109
0
                          xstate_sizes[_XSTATE_HI_ZMM]);
110
0
    }
111
4
112
4
    if ( p->feat.pku )
113
0
    {
114
0
        xstates |= XSTATE_PKRU;
115
0
        xstate_size = max(xstate_size,
116
0
                          xstate_offsets[_XSTATE_PKRU] +
117
0
                          xstate_sizes[_XSTATE_PKRU]);
118
0
    }
119
4
120
4
    if ( p->extd.lwp )
121
0
    {
122
0
        xstates |= XSTATE_LWP;
123
0
        xstate_size = max(xstate_size,
124
0
                          xstate_offsets[_XSTATE_LWP] +
125
0
                          xstate_sizes[_XSTATE_LWP]);
126
0
    }
127
4
128
4
    p->xstate.max_size  =  xstate_size;
129
4
    p->xstate.xcr0_low  =  xstates & ~XSTATE_XSAVES_ONLY;
130
4
    p->xstate.xcr0_high = (xstates & ~XSTATE_XSAVES_ONLY) >> 32;
131
4
132
4
    p->xstate.Da1 = Da1;
133
4
    if ( p->xstate.xsaves )
134
0
    {
135
0
        p->xstate.xss_low   =  xstates & XSTATE_XSAVES_ONLY;
136
0
        p->xstate.xss_high  = (xstates & XSTATE_XSAVES_ONLY) >> 32;
137
0
    }
138
4
    else
139
4
        xstates &= ~XSTATE_XSAVES_ONLY;
140
4
141
248
    for ( i = 2; i < min(63ul, ARRAY_SIZE(p->xstate.comp)); ++i )
142
244
    {
143
244
        uint64_t curr_xstate = 1ul << i;
144
244
145
244
        if ( !(xstates & curr_xstate) )
146
240
            continue;
147
244
148
4
        p->xstate.comp[i].size   = xstate_sizes[i];
149
4
        p->xstate.comp[i].offset = xstate_offsets[i];
150
4
        p->xstate.comp[i].xss    = curr_xstate & XSTATE_XSAVES_ONLY;
151
4
        p->xstate.comp[i].align  = curr_xstate & xstate_align;
152
4
    }
153
4
}
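
A rough standalone illustration of the size accumulation in recalculate_xstate() above: each enabled component extends the reported save-area size to max(offset + size). The component offsets and sizes below are hypothetical stand-ins for the real xstate_offsets[]/xstate_sizes[] tables, purely to show the arithmetic.

#include <stdint.h>
#include <stdio.h>

struct comp { uint32_t offset, size; };

int main(void)
{
    /* Hypothetical layout of three enabled components (illustrative only). */
    const struct comp comps[] = { { 0, 512 }, { 512, 64 }, { 576, 256 } };
    uint32_t xstate_size = 0;
    unsigned int i;

    for ( i = 0; i < sizeof(comps) / sizeof(comps[0]); ++i )
    {
        uint32_t end = comps[i].offset + comps[i].size;

        if ( end > xstate_size )    /* Same effect as max(xstate_size, end). */
            xstate_size = end;
    }

    printf("maximum save area: %u bytes\n", (unsigned)xstate_size);  /* 832 */
    return 0;
}
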
154
155
/*
156
 * Misc adjustments to the policy.  Mostly clobbering reserved fields and
157
 * duplicating shared fields.  Intentionally hidden fields are annotated.
158
 */
159
static void recalculate_misc(struct cpuid_policy *p)
160
2
{
161
2
    p->basic.raw_fms &= 0x0fff0fff; /* Clobber Processor Type on Intel. */
162
2
    p->basic.apic_id = 0; /* Dynamic. */
163
2
164
2
    p->basic.raw[0x5] = EMPTY_LEAF; /* MONITOR not exposed to guests. */
165
2
    p->basic.raw[0x6] = EMPTY_LEAF; /* Therm/Power not exposed to guests. */
166
2
167
2
    p->basic.raw[0x8] = EMPTY_LEAF;
168
2
    p->basic.raw[0xb] = EMPTY_LEAF; /* TODO: Rework topology logic. */
169
2
    p->basic.raw[0xc] = EMPTY_LEAF;
170
2
171
2
    p->extd.e1d &= ~CPUID_COMMON_1D_FEATURES;
172
2
173
2
    /* Most of Power/RAS hidden from guests. */
174
2
    p->extd.raw[0x7].a = p->extd.raw[0x7].b = p->extd.raw[0x7].c = 0;
175
2
176
2
    p->extd.raw[0x8].d = 0;
177
2
178
2
    switch ( p->x86_vendor )
179
2
    {
180
2
    case X86_VENDOR_INTEL:
181
2
        p->basic.l2_nr_queries = 1; /* Fixed to 1 query. */
182
2
        p->basic.raw[0x3] = EMPTY_LEAF; /* PSN - always hidden. */
183
2
        p->basic.raw[0x9] = EMPTY_LEAF; /* DCA - always hidden. */
184
2
185
2
        p->extd.vendor_ebx = 0;
186
2
        p->extd.vendor_ecx = 0;
187
2
        p->extd.vendor_edx = 0;
188
2
189
2
        p->extd.raw[0x1].a = p->extd.raw[0x1].b = 0;
190
2
191
2
        p->extd.raw[0x5] = EMPTY_LEAF;
192
2
        p->extd.raw[0x6].a = p->extd.raw[0x6].b = p->extd.raw[0x6].d = 0;
193
2
194
2
        p->extd.raw[0x8].a &= 0x0000ffff;
195
2
        p->extd.raw[0x8].c = 0;
196
2
        break;
197
2
198
0
    case X86_VENDOR_AMD:
199
0
        zero_leaves(p->basic.raw, 0x2, 0x3);
200
0
        memset(p->cache.raw, 0, sizeof(p->cache.raw));
201
0
        zero_leaves(p->basic.raw, 0x9, 0xa);
202
0
203
0
        p->extd.vendor_ebx = p->basic.vendor_ebx;
204
0
        p->extd.vendor_ecx = p->basic.vendor_ecx;
205
0
        p->extd.vendor_edx = p->basic.vendor_edx;
206
0
207
0
        p->extd.raw_fms = p->basic.raw_fms;
208
0
        p->extd.raw[0x1].b &= 0xff00ffff;
209
0
        p->extd.e1d |= p->basic._1d & CPUID_COMMON_1D_FEATURES;
210
0
211
0
        p->extd.raw[0x8].a &= 0x0000ffff; /* GuestMaxPhysAddr hidden. */
212
0
        p->extd.raw[0x8].c &= 0x0003f0ff;
213
0
214
0
        p->extd.raw[0x9] = EMPTY_LEAF;
215
0
216
0
        zero_leaves(p->extd.raw, 0xb, 0x18);
217
0
218
0
        p->extd.raw[0x1b] = EMPTY_LEAF; /* IBS - not supported. */
219
0
220
0
        p->extd.raw[0x1c].a = 0; /* LWP.a entirely dynamic. */
221
0
        break;
222
2
    }
223
2
}
224
225
static void __init calculate_raw_policy(void)
226
1
{
227
1
    struct cpuid_policy *p = &raw_cpuid_policy;
228
1
    unsigned int i;
229
1
230
1
    cpuid_leaf(0, &p->basic.raw[0]);
231
14
    for ( i = 1; i < min(ARRAY_SIZE(p->basic.raw),
232
13
                         p->basic.max_leaf + 1ul); ++i )
233
13
    {
234
13
        switch ( i )
235
13
        {
236
3
        case 0x4: case 0x7: case 0xd:
237
3
            /* Multi-invocation leaves.  Deferred. */
238
3
            continue;
239
13
        }
240
13
241
10
        cpuid_leaf(i, &p->basic.raw[i]);
242
10
    }
243
1
244
1
    if ( p->basic.max_leaf >= 4 )
245
1
    {
246
5
        for ( i = 0; i < ARRAY_SIZE(p->cache.raw); ++i )
247
5
        {
248
5
            union {
249
5
                struct cpuid_leaf l;
250
5
                struct cpuid_cache_leaf c;
251
5
            } u;
252
5
253
5
            cpuid_count_leaf(4, i, &u.l);
254
5
255
5
            if ( u.c.type == 0 )
256
1
                break;
257
5
258
4
            p->cache.subleaf[i] = u.c;
259
4
        }
260
1
261
1
        /*
262
1
         * The choice of CPUID_GUEST_NR_CACHE is arbitrary.  It is expected
263
1
         * that it will eventually need increasing for future hardware.
264
1
         */
265
1
        if ( i == ARRAY_SIZE(p->cache.raw) )
266
0
            printk(XENLOG_WARNING
267
0
                   "CPUID: Insufficient Leaf 4 space for this hardware\n");
268
1
    }
269
1
270
1
    if ( p->basic.max_leaf >= 7 )
271
1
    {
272
1
        cpuid_count_leaf(7, 0, &p->feat.raw[0]);
273
1
274
1
        for ( i = 1; i < min(ARRAY_SIZE(p->feat.raw),
275
0
                             p->feat.max_subleaf + 1ul); ++i )
276
0
            cpuid_count_leaf(7, i, &p->feat.raw[i]);
277
1
    }
278
1
279
1
    if ( p->basic.max_leaf >= XSTATE_CPUID )
280
1
    {
281
1
        uint64_t xstates;
282
1
283
1
        cpuid_count_leaf(XSTATE_CPUID, 0, &p->xstate.raw[0]);
284
1
        cpuid_count_leaf(XSTATE_CPUID, 1, &p->xstate.raw[1]);
285
1
286
1
        xstates = ((uint64_t)(p->xstate.xcr0_high | p->xstate.xss_high) << 32) |
287
1
            (p->xstate.xcr0_low | p->xstate.xss_low);
288
1
289
62
        for ( i = 2; i < min(63ul, ARRAY_SIZE(p->xstate.raw)); ++i )
290
61
        {
291
61
            if ( xstates & (1ul << i) )
292
1
                cpuid_count_leaf(XSTATE_CPUID, i, &p->xstate.raw[i]);
293
61
        }
294
1
    }
295
1
296
1
    /* Extended leaves. */
297
1
    cpuid_leaf(0x80000000, &p->extd.raw[0]);
298
9
    for ( i = 1; i < min(ARRAY_SIZE(p->extd.raw),
299
8
                         p->extd.max_leaf + 1 - 0x80000000ul); ++i )
300
8
        cpuid_leaf(0x80000000 + i, &p->extd.raw[i]);
301
1
302
1
    p->x86_vendor = boot_cpu_data.x86_vendor;
303
1
}
304
305
static void __init calculate_host_policy(void)
306
1
{
307
1
    struct cpuid_policy *p = &host_cpuid_policy;
308
1
309
1
    *p = raw_cpuid_policy;
310
1
311
1
    p->basic.max_leaf =
312
1
        min_t(uint32_t, p->basic.max_leaf,   ARRAY_SIZE(p->basic.raw) - 1);
313
1
    p->feat.max_subleaf =
314
1
        min_t(uint32_t, p->feat.max_subleaf, ARRAY_SIZE(p->feat.raw) - 1);
315
1
    p->extd.max_leaf = 0x80000000 | min_t(uint32_t, p->extd.max_leaf & 0xffff,
316
1
                                          ARRAY_SIZE(p->extd.raw) - 1);
317
1
318
1
    cpuid_featureset_to_policy(boot_cpu_data.x86_capability, p);
319
1
    recalculate_xstate(p);
320
1
    recalculate_misc(p);
321
1
322
1
    if ( p->extd.svm )
323
0
    {
324
0
        /* Clamp to implemented features which require hardware support. */
325
0
        p->extd.raw[0xa].d &= ((1u << SVM_FEATURE_NPT) |
326
0
                               (1u << SVM_FEATURE_LBRV) |
327
0
                               (1u << SVM_FEATURE_NRIPS) |
328
0
                               (1u << SVM_FEATURE_PAUSEFILTER) |
329
0
                               (1u << SVM_FEATURE_DECODEASSISTS));
330
0
        /* Enable features which are always emulated. */
331
0
        p->extd.raw[0xa].d |= ((1u << SVM_FEATURE_VMCBCLEAN) |
332
0
                               (1u << SVM_FEATURE_TSCRATEMSR));
333
0
    }
334
1
}
335
336
static void __init calculate_pv_max_policy(void)
337
1
{
338
1
    struct cpuid_policy *p = &pv_max_cpuid_policy;
339
1
    uint32_t pv_featureset[FSCAPINTS];
340
1
    unsigned int i;
341
1
342
1
    *p = host_cpuid_policy;
343
1
    cpuid_policy_to_featureset(p, pv_featureset);
344
1
345
11
    for ( i = 0; i < ARRAY_SIZE(pv_featureset); ++i )
346
10
        pv_featureset[i] &= pv_featuremask[i];
347
1
348
1
    /* Unconditionally claim to be able to set the hypervisor bit. */
349
1
    __set_bit(X86_FEATURE_HYPERVISOR, pv_featureset);
350
1
351
1
    sanitise_featureset(pv_featureset);
352
1
    cpuid_featureset_to_policy(pv_featureset, p);
353
1
    recalculate_xstate(p);
354
1
355
1
    p->extd.raw[0xa] = EMPTY_LEAF; /* No SVM for PV guests. */
356
1
}
357
358
static void __init calculate_hvm_max_policy(void)
359
1
{
360
1
    struct cpuid_policy *p = &hvm_max_cpuid_policy;
361
1
    uint32_t hvm_featureset[FSCAPINTS];
362
1
    unsigned int i;
363
1
    const uint32_t *hvm_featuremask;
364
1
365
1
    if ( !hvm_enabled )
366
0
        return;
367
1
368
1
    *p = host_cpuid_policy;
369
1
    cpuid_policy_to_featureset(p, hvm_featureset);
370
1
371
1
    hvm_featuremask = hvm_funcs.hap_supported ?
372
1
        hvm_hap_featuremask : hvm_shadow_featuremask;
373
1
374
11
    for ( i = 0; i < ARRAY_SIZE(hvm_featureset); ++i )
375
10
        hvm_featureset[i] &= hvm_featuremask[i];
376
1
377
1
    /* Unconditionally claim to be able to set the hypervisor bit. */
378
1
    __set_bit(X86_FEATURE_HYPERVISOR, hvm_featureset);
379
1
380
1
    /*
381
1
     * Xen can provide an APIC emulation to HVM guests even if the host's APIC
382
1
     * isn't enabled.
383
1
     */
384
1
    __set_bit(X86_FEATURE_APIC, hvm_featureset);
385
1
386
1
    /*
387
1
     * On AMD, PV guests are entirely unable to use SYSENTER as Xen runs in
388
1
     * long mode (and init_amd() has cleared it out of host capabilities), but
389
1
     * HVM guests are able to use it when running in protected mode.
390
1
     */
391
1
    if ( (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
392
0
         raw_cpuid_policy.basic.sep )
393
0
        __set_bit(X86_FEATURE_SEP, hvm_featureset);
394
1
395
1
    /*
396
1
     * With VT-x, some features are only supported by Xen if dedicated
397
1
     * hardware support is also available.
398
1
     */
399
1
    if ( cpu_has_vmx )
400
1
    {
401
1
        if ( !cpu_has_vmx_mpx )
402
1
            __clear_bit(X86_FEATURE_MPX, hvm_featureset);
403
1
404
1
        if ( !cpu_has_vmx_xsaves )
405
1
            __clear_bit(X86_FEATURE_XSAVES, hvm_featureset);
406
1
    }
407
1
408
1
    sanitise_featureset(hvm_featureset);
409
1
    cpuid_featureset_to_policy(hvm_featureset, p);
410
1
    recalculate_xstate(p);
411
1
}
412
413
void __init init_guest_cpuid(void)
414
1
{
415
1
    calculate_raw_policy();
416
1
    calculate_host_policy();
417
1
    calculate_pv_max_policy();
418
1
    calculate_hvm_max_policy();
419
1
}
420
421
const uint32_t *lookup_deep_deps(uint32_t feature)
422
9
{
423
9
    static const struct {
424
9
        uint32_t feature;
425
9
        uint32_t fs[FSCAPINTS];
426
9
    } deep_deps[] = INIT_DEEP_DEPS;
427
9
    unsigned int start = 0, end = ARRAY_SIZE(deep_deps);
428
9
429
9
    BUILD_BUG_ON(ARRAY_SIZE(deep_deps) != NR_DEEP_DEPS);
430
9
431
9
    /* Fast early exit. */
432
9
    if ( !test_bit(feature, deep_features) )
433
1
        return NULL;
434
9
435
9
    /* deep_deps[] is sorted.  Perform a binary search. */
436
29
    while ( start < end )
437
29
    {
438
29
        unsigned int mid = start + ((end - start) / 2);
439
29
440
29
        if ( deep_deps[mid].feature > feature )
441
8
            end = mid;
442
21
        else if ( deep_deps[mid].feature < feature )
443
13
            start = mid + 1;
444
21
        else
445
8
            return deep_deps[mid].fs;
446
29
    }
447
8
448
0
    return NULL;
449
8
}
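
As a hedged sketch of how the mask returned by lookup_deep_deps() is typically consumed (mirroring the loop in sanitise_featureset() above; the helper name below is made up for illustration and is not part of this file):

/* Illustrative only: clear 'feature' plus everything that depends on it. */
static void clear_feature_and_deps(uint32_t *fs, uint32_t feature)
{
    const uint32_t *dfs = lookup_deep_deps(feature);
    unsigned int i;

    __clear_bit(feature, fs);

    if ( !dfs )             /* Nothing depends on this feature. */
        return;

    for ( i = 0; i < FSCAPINTS; ++i )
        fs[i] &= ~dfs[i];   /* Drop all features depending on 'feature'. */
}
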
450
451
void recalculate_cpuid_policy(struct domain *d)
452
1
{
453
1
    struct cpuid_policy *p = d->arch.cpuid;
454
1
    const struct cpuid_policy *max =
455
1
        is_pv_domain(d) ? &pv_max_cpuid_policy : &hvm_max_cpuid_policy;
456
1
    uint32_t fs[FSCAPINTS], max_fs[FSCAPINTS];
457
1
    unsigned int i;
458
1
459
1
    p->x86_vendor = get_cpu_vendor(p->basic.vendor_ebx, p->basic.vendor_ecx,
460
1
                                   p->basic.vendor_edx, gcv_guest);
461
1
462
1
    p->basic.max_leaf   = min(p->basic.max_leaf,   max->basic.max_leaf);
463
1
    p->feat.max_subleaf = min(p->feat.max_subleaf, max->feat.max_subleaf);
464
1
    p->extd.max_leaf    = 0x80000000 | min(p->extd.max_leaf & 0xffff,
465
1
                                           (p->x86_vendor == X86_VENDOR_AMD
466
1
                                            ? CPUID_GUEST_NR_EXTD_AMD
467
1
                                            : CPUID_GUEST_NR_EXTD_INTEL) - 1);
468
1
469
1
    cpuid_policy_to_featureset(p, fs);
470
1
    cpuid_policy_to_featureset(max, max_fs);
471
1
472
1
    if ( is_hvm_domain(d) )
473
1
    {
474
1
        /*
475
1
         * HVM domains using Shadow paging have further restrictions on their
476
1
         * available paging features.
477
1
         */
478
1
        if ( !hap_enabled(d) )
479
0
        {
480
0
            for ( i = 0; i < ARRAY_SIZE(max_fs); i++ )
481
0
                max_fs[i] &= hvm_shadow_featuremask[i];
482
0
        }
483
1
484
1
        /* Hide nested-virt if it hasn't been explicitly configured. */
485
1
        if ( !nestedhvm_enabled(d) )
486
1
        {
487
1
            __clear_bit(X86_FEATURE_VMX, max_fs);
488
1
            __clear_bit(X86_FEATURE_SVM, max_fs);
489
1
        }
490
1
    }
491
1
492
1
    /*
493
1
     * Allow the toolstack to set HTT, X2APIC and CMP_LEGACY.  These bits
494
1
     * affect how to interpret topology information in other cpuid leaves.
495
1
     */
496
1
    __set_bit(X86_FEATURE_HTT, max_fs);
497
1
    __set_bit(X86_FEATURE_X2APIC, max_fs);
498
1
    __set_bit(X86_FEATURE_CMP_LEGACY, max_fs);
499
1
500
1
    /*
501
1
     * 32bit PV domains can't use any Long Mode features, and cannot use
502
1
     * SYSCALL on non-AMD hardware.
503
1
     */
504
1
    if ( is_pv_32bit_domain(d) )
505
0
    {
506
0
        __clear_bit(X86_FEATURE_LM, max_fs);
507
0
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
508
0
            __clear_bit(X86_FEATURE_SYSCALL, max_fs);
509
0
    }
510
1
511
1
    /*
512
1
     * ITSC is masked by default (so domains are safe to migrate), but a
513
1
     * toolstack which has configured disable_migrate or vTSC for a domain may
514
1
     * safely select it, and needs a way of doing so.
515
1
     */
516
1
    if ( cpu_has_itsc && (d->disable_migrate || d->arch.vtsc) )
517
1
        __set_bit(X86_FEATURE_ITSC, max_fs);
518
1
519
1
    /* Clamp the toolstack's choices to reality. */
520
11
    for ( i = 0; i < ARRAY_SIZE(fs); i++ )
521
10
        fs[i] &= max_fs[i];
522
1
523
1
    if ( p->basic.max_leaf < XSTATE_CPUID )
524
0
        __clear_bit(X86_FEATURE_XSAVE, fs);
525
1
526
1
    sanitise_featureset(fs);
527
1
528
1
    /* Fold host's FDP_EXCP_ONLY and NO_FPU_SEL into guest's view. */
529
1
    fs[FEATURESET_7b0] &= ~special_features[FEATURESET_7b0];
530
1
    fs[FEATURESET_7b0] |= (host_cpuid_policy.feat._7b0 &
531
1
                           special_features[FEATURESET_7b0]);
532
1
533
1
    cpuid_featureset_to_policy(fs, p);
534
1
535
1
    /* Pass host cacheline size through to guests. */
536
1
    p->basic.clflush_size = max->basic.clflush_size;
537
1
538
1
    p->extd.maxphysaddr = min(p->extd.maxphysaddr, max->extd.maxphysaddr);
539
1
    p->extd.maxphysaddr = min_t(uint8_t, p->extd.maxphysaddr,
540
1
                                paging_max_paddr_bits(d));
541
1
    p->extd.maxphysaddr = max_t(uint8_t, p->extd.maxphysaddr,
542
1
                                (p->basic.pae || p->basic.pse36) ? 36 : 32);
543
1
544
1
    p->extd.maxlinaddr = p->extd.lm ? 48 : 32;
545
1
546
1
    recalculate_xstate(p);
547
1
    recalculate_misc(p);
548
1
549
5
    for ( i = 0; i < ARRAY_SIZE(p->cache.raw); ++i )
550
5
    {
551
5
        if ( p->cache.subleaf[i].type >= 1 &&
552
4
             p->cache.subleaf[i].type <= 3 )
553
4
        {
554
4
            /* Subleaf has a valid cache type. Zero reserved fields. */
555
4
            p->cache.raw[i].a &= 0xffffc3ffu;
556
4
            p->cache.raw[i].d &= 0x00000007u;
557
4
        }
558
5
        else
559
1
        {
560
1
            /* Subleaf is not valid.  Zero the rest of the union. */
561
1
            zero_leaves(p->cache.raw, i, ARRAY_SIZE(p->cache.raw) - 1);
562
1
            break;
563
1
        }
564
5
    }
565
1
566
1
    if ( !p->extd.svm )
567
1
        p->extd.raw[0xa] = EMPTY_LEAF;
568
1
569
1
    if ( !p->extd.page1gb )
570
0
        p->extd.raw[0x19] = EMPTY_LEAF;
571
1
572
1
    if ( p->extd.lwp )
573
0
        p->extd.raw[0x1c].d &= max->extd.raw[0x1c].d;
574
1
    else
575
1
        p->extd.raw[0x1c] = EMPTY_LEAF;
576
1
}
577
578
int init_domain_cpuid_policy(struct domain *d)
579
1
{
580
1
    d->arch.cpuid = xmalloc(struct cpuid_policy);
581
1
582
1
    if ( !d->arch.cpuid )
583
0
        return -ENOMEM;
584
1
585
1
    *d->arch.cpuid = is_pv_domain(d)
586
1
        ? pv_max_cpuid_policy : hvm_max_cpuid_policy;
587
1
588
1
    if ( d->disable_migrate )
589
1
        d->arch.cpuid->extd.itsc = cpu_has_itsc;
590
1
591
1
    recalculate_cpuid_policy(d);
592
1
593
1
    return 0;
594
1
}
595
596
void guest_cpuid(const struct vcpu *v, uint32_t leaf,
597
                 uint32_t subleaf, struct cpuid_leaf *res)
598
2.66k
{
599
2.66k
    const struct domain *d = v->domain;
600
2.66k
    const struct cpuid_policy *p = d->arch.cpuid;
601
2.66k
602
2.66k
    *res = EMPTY_LEAF;
603
2.66k
604
2.66k
    /*
605
2.66k
     * First pass:
606
2.66k
     * - Perform max_leaf/subleaf calculations.  Out-of-range leaves return
607
2.66k
     *   all zeros, following the AMD model.
608
2.66k
     * - Fill in *res for leaves no longer handled on the legacy path.
609
2.66k
     * - Dispatch the virtualised leaves to their respective handlers.
610
2.66k
     */
611
2.66k
    switch ( leaf )
612
2.66k
    {
613
2.65k
    case 0 ... CPUID_GUEST_NR_BASIC - 1:
614
2.65k
        ASSERT(p->basic.max_leaf < ARRAY_SIZE(p->basic.raw));
615
2.65k
        if ( leaf > min_t(uint32_t, p->basic.max_leaf,
616
2.65k
                          ARRAY_SIZE(p->basic.raw) - 1) )
617
0
            return;
618
2.65k
619
2.65k
        switch ( leaf )
620
2.65k
        {
621
9
        case 0x4:
622
9
            if ( subleaf >= ARRAY_SIZE(p->cache.raw) )
623
0
                return;
624
9
625
9
            *res = p->cache.raw[subleaf];
626
9
            break;
627
9
628
1.04k
        case 0x7:
629
1.04k
            ASSERT(p->feat.max_subleaf < ARRAY_SIZE(p->feat.raw));
630
1.04k
            if ( subleaf > min_t(uint32_t, p->feat.max_subleaf,
631
1.04k
                                 ARRAY_SIZE(p->feat.raw) - 1) )
632
0
                return;
633
1.04k
634
1.04k
            *res = p->feat.raw[subleaf];
635
1.04k
            break;
636
1.04k
637
20
        case XSTATE_CPUID:
638
20
            if ( !p->basic.xsave || subleaf >= ARRAY_SIZE(p->xstate.raw) )
639
0
                return;
640
20
641
20
            *res = p->xstate.raw[subleaf];
642
20
            break;
643
20
644
1.57k
        default:
645
1.57k
            *res = p->basic.raw[leaf];
646
1.57k
            break;
647
2.65k
        }
648
2.65k
        break;
649
2.65k
650
9
    case 0x40000000 ... 0x400000ff:
651
9
        if ( is_viridian_domain(d) )
652
0
            return cpuid_viridian_leaves(v, leaf, subleaf, res);
653
9
654
9
        /*
655
9
         * Fallthrough.
656
9
         *
657
9
         * Intel reserve up until 0x4fffffff for hypervisor use.  AMD reserve
658
9
         * only until 0x400000ff, but we already use double that.
659
9
         */
660
9
    case 0x40000100 ... 0x400001ff:
661
9
        return cpuid_hypervisor_leaves(v, leaf, subleaf, res);
662
9
663
8
    case 0x80000000 ... 0x80000000 + CPUID_GUEST_NR_EXTD - 1:
664
8
        ASSERT((p->extd.max_leaf & 0xffff) < ARRAY_SIZE(p->extd.raw));
665
8
        if ( (leaf & 0xffff) > min_t(uint32_t, p->extd.max_leaf & 0xffff,
666
8
                                     ARRAY_SIZE(p->extd.raw) - 1) )
667
0
            return;
668
8
669
8
        *res = p->extd.raw[leaf & 0xffff];
670
8
        break;
671
8
672
0
    default:
673
0
        return;
674
2.66k
    }
675
2.66k
676
2.66k
    /*
677
2.66k
     * Skip dynamic adjustments if we are in the wrong context.
678
2.66k
     *
679
2.66k
     * All dynamic adjustments depend on current register state, which will
680
2.66k
     * be stale if the vcpu is running elsewhere.  It is simpler, quicker, and
681
2.66k
     * more reliable for the caller to do nothing (consistently) than to hand
682
2.66k
     * back stale data which it can't use safely.
683
2.66k
     */
684
2.65k
    if ( v != current )
685
0
        return;
686
2.65k
687
2.65k
    /*
688
2.65k
     * Second pass:
689
2.65k
     * - Dynamic adjustments
690
2.65k
     */
691
2.65k
    switch ( leaf )
692
2.65k
    {
693
0
        const struct cpu_user_regs *regs;
694
0
695
531
    case 0x1:
696
531
        /* TODO: Rework topology logic. */
697
531
        res->b &= 0x00ffffffu;
698
531
        if ( is_hvm_domain(d) )
699
531
            res->b |= (v->vcpu_id * 2) << 24;
700
531
701
531
        /* TODO: Rework vPMU control in terms of toolstack choices. */
702
531
        if ( vpmu_available(v) &&
703
0
             vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_DS) )
704
0
        {
705
0
            res->d |= cpufeat_mask(X86_FEATURE_DS);
706
0
            if ( cpu_has(&current_cpu_data, X86_FEATURE_DTES64) )
707
0
                res->c |= cpufeat_mask(X86_FEATURE_DTES64);
708
0
            if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
709
0
                res->c |= cpufeat_mask(X86_FEATURE_DSCPL);
710
0
        }
711
531
712
531
        if ( is_hvm_domain(d) )
713
531
        {
714
531
            /* OSXSAVE clear in policy.  Fast-forward CR4 back in. */
715
531
            if ( v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE )
716
530
                res->c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
717
531
        }
718
531
        else /* PV domain */
719
0
        {
720
0
            regs = guest_cpu_user_regs();
721
0
722
0
            /*
723
0
             * !!! OSXSAVE handling for PV guests is non-architectural !!!
724
0
             *
725
0
             * Architecturally, the correct code here is simply:
726
0
             *
727
0
             *   if ( v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE )
728
0
             *       c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
729
0
             *
730
0
             * However because of bugs in Xen (before c/s bd19080b, Nov 2010,
731
0
             * the XSAVE cpuid flag leaked into guests despite the feature not
732
0
             * being available for use), buggy workarounds were introduced to
733
0
             * Linux (c/s 947ccf9c, also Nov 2010) which relied on the fact
734
0
             * that Xen also incorrectly leaked OSXSAVE into the guest.
735
0
             *
736
0
             * Furthermore, providing architectural OSXSAVE behaviour to
737
0
             * many Linux PV guests triggered a further kernel bug when the
738
0
             * fpu code observes that XSAVEOPT is available, assumes that
739
0
             * xsave state had been set up for the task, and follows a wild
740
0
             * pointer.
741
0
             *
742
0
             * Older Linux PVOPS kernels however do require architectural
743
0
             * behaviour.  They observe Xen's leaked OSXSAVE and assume they
744
0
             * can already use XSETBV, dying with a #UD because the shadowed
745
0
             * CR4.OSXSAVE is clear.  This behaviour has been adjusted in all
746
0
             * observed cases via stable backports of the above changeset.
747
0
             *
748
0
             * Therefore, the leaking of Xen's OSXSAVE setting has become a
749
0
             * de facto part of the PV ABI and can't reasonably be corrected.
750
0
             * It can however be restricted to only the enlightened CPUID
751
0
             * view, as seen by the guest kernel.
752
0
             *
753
0
             * The following situations and logic now applies:
754
0
             *
755
0
             * - Hardware without CPUID faulting support and native CPUID:
756
0
             *    There is nothing Xen can do here.  The host's XSAVE flag will
757
0
             *    leak through and Xen's OSXSAVE choice will leak through.
758
0
             *
759
0
             *    In the case that the guest kernel has not set up OSXSAVE, only
760
0
             *    SSE will be set in xcr0, and guest userspace can't do too much
761
0
             *    damage itself.
762
0
             *
763
0
             * - Enlightened CPUID or CPUID faulting available:
764
0
             *    Xen can fully control what is seen here.  Guest kernels need
765
0
             *    to see the leaked OSXSAVE via the enlightened path, but
766
0
             *    guest userspace and native CPUID are given architectural
767
0
             *    behaviour.
768
0
             *
769
0
             *    Emulated vs Faulted CPUID is distinguished based on whether a
770
0
             *    #UD or #GP is currently being serviced.
771
0
             */
772
0
            /* OSXSAVE clear in policy.  Fast-forward CR4 back in. */
773
0
            if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) ||
774
0
                 (regs->entry_vector == TRAP_invalid_op &&
775
0
                  guest_kernel_mode(v, regs) &&
776
0
                  (read_cr4() & X86_CR4_OSXSAVE)) )
777
0
                res->c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
778
0
779
0
            /*
780
0
             * At the time of writing, a PV domain is the only viable option
781
0
             * for Dom0.  Several interactions between dom0 and Xen for real
782
0
             * hardware setup have unfortunately been implemented based on
783
0
             * state which incorrectly leaked into dom0.
784
0
             *
785
0
             * These leaks are retained for backwards compatibility, but
786
0
             * restricted to the hardware domain's kernel only.
787
0
             */
788
0
            if ( is_hardware_domain(d) && guest_kernel_mode(v, regs) )
789
0
            {
790
0
                /*
791
0
                 * MONITOR never leaked into PV guests, as PV guests cannot
792
0
                 * use the MONITOR/MWAIT instructions.  As such, they require
793
0
                 * the feature not to be present in emulated CPUID.
794
0
                 *
795
0
                 * Modern PVOPS Linux try to be cunning and use native CPUID
796
0
                 * to see if the hardware actually supports MONITOR, and by
797
0
                 * extension, deep C states.
798
0
                 *
799
0
                 * If the feature is seen, deep-C state information is
800
0
                 * obtained from the DSDT and handed back to Xen via the
801
0
                 * XENPF_set_processor_pminfo hypercall.
802
0
                 *
803
0
                 * This mechanism is incompatible with an HVM-based hardware
804
0
                 * domain, and also with CPUID Faulting.
805
0
                 *
806
0
                 * Luckily, Xen can be just as 'cunning', and distinguish an
807
0
                 * emulated CPUID from a faulted CPUID by whether a #UD or #GP
808
0
                 * fault is currently being serviced.  Yuck...
809
0
                 */
810
0
                if ( cpu_has_monitor && regs->entry_vector == TRAP_gp_fault )
811
0
                    res->c |= cpufeat_mask(X86_FEATURE_MONITOR);
812
0
813
0
                /*
814
0
                 * While MONITOR never leaked into PV guests, EIST always used
815
0
                 * to.
816
0
                 *
817
0
                 * Modern PVOPS Linux will only parse P state information from
818
0
                 * the DSDT and return it to Xen if EIST is seen in the
819
0
                 * emulated CPUID information.
820
0
                 */
821
0
                if ( cpu_has_eist )
822
0
                    res->c |= cpufeat_mask(X86_FEATURE_EIST);
823
0
            }
824
0
        }
825
531
        goto common_leaf1_adjustments;
826
0
827
0
    case 0x5:
828
0
        /*
829
0
         * Leak the hardware MONITOR leaf under the same conditions that the
830
0
         * MONITOR feature flag is leaked.  See above for details.
831
0
         */
832
0
        regs = guest_cpu_user_regs();
833
0
        if ( is_pv_domain(d) && is_hardware_domain(d) &&
834
0
             guest_kernel_mode(v, regs) && cpu_has_monitor &&
835
0
             regs->entry_vector == TRAP_gp_fault )
836
0
            *res = raw_cpuid_policy.basic.raw[leaf];
837
0
        break;
838
0
839
1.04k
    case 0x7:
840
1.04k
        switch ( subleaf )
841
1.04k
        {
842
1.04k
        case 0:
843
1.04k
            /* OSPKE clear in policy.  Fast-forward CR4 back in. */
844
1.04k
            if ( (is_pv_domain(d)
845
0
                  ? v->arch.pv_vcpu.ctrlreg[4]
846
1.04k
                  : v->arch.hvm_vcpu.guest_cr[4]) & X86_CR4_PKE )
847
0
                res->c |= cpufeat_mask(X86_FEATURE_OSPKE);
848
1.04k
            break;
849
1.04k
        }
850
1.04k
        break;
851
1.04k
852
0
    case 0xa:
853
0
        /* TODO: Rework vPMU control in terms of toolstack choices. */
854
0
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
855
0
             !vpmu_available(v) )
856
0
            *res = EMPTY_LEAF;
857
0
        else
858
0
        {
859
0
            /* Report at most v3 since that's all we currently emulate. */
860
0
            if ( (res->a & 0xff) > 3 )
861
0
                res->a = (res->a & ~0xff) | 3;
862
0
        }
863
0
        break;
864
1.04k
865
1
    case 0xb:
866
1
        /*
867
1
         * In principle, this leaf is Intel-only.  In practice, it is tightly
868
1
         * coupled with x2apic, and we offer an x2apic-capable APIC emulation
869
1
         * to guests on AMD hardware as well.
870
1
         *
871
1
         * TODO: Rework topology logic.
872
1
         */
873
1
        if ( p->basic.x2apic )
874
1
        {
875
1
            *(uint8_t *)&res->c = subleaf;
876
1
877
1
            /* Fix the x2APIC identifier. */
878
1
            res->d = v->vcpu_id * 2;
879
1
        }
880
1
        break;
881
1.04k
882
20
    case XSTATE_CPUID:
883
20
        switch ( subleaf )
884
20
        {
885
2
        case 1:
886
2
            if ( p->xstate.xsaves )
887
0
            {
888
0
                /*
889
0
                 * TODO: Figure out what to do for XSS state.  VT-x manages
890
0
                 * host vs guest MSR_XSS automatically, so as soon as we start
891
0
                 * supporting any XSS states, the wrong XSS will be in
892
0
                 * context.
893
0
                 */
894
0
                BUILD_BUG_ON(XSTATE_XSAVES_ONLY != 0);
895
0
896
0
                /*
897
0
                 * Read CPUID[0xD,0/1].EBX from hardware.  They vary with
898
0
                 * enabled XSTATE, and appropriate XCR0|XSS are in context.
899
0
                 */
900
17
        case 0:
901
17
                res->b = cpuid_count_ebx(leaf, subleaf);
902
17
            }
903
19
            break;
904
20
        }
905
20
        break;
906
20
907
1
    case 0x80000001:
908
1
        /* SYSCALL is hidden outside of long mode on Intel. */
909
1
        if ( p->x86_vendor == X86_VENDOR_INTEL &&
910
1
             is_hvm_domain(d) && !hvm_long_mode_active(v) )
911
0
            res->d &= ~cpufeat_mask(X86_FEATURE_SYSCALL);
912
1
913
532
    common_leaf1_adjustments:
914
532
        if ( is_hvm_domain(d) )
915
532
        {
916
532
            /* Fast-forward MSR_APIC_BASE.EN. */
917
532
            if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
918
0
                res->d &= ~cpufeat_bit(X86_FEATURE_APIC);
919
532
920
532
            /*
921
532
             * PSE36 is not supported in shadow mode.  This bit should be
922
532
             * clear in hvm_shadow_featuremask[].
923
532
             *
924
532
             * However, an unspecified version of Hyper-V from 2011 refuses to
925
532
             * start as the "cpu does not provide required hw features" if it
926
532
             * can't see PSE36.
927
532
             *
928
532
             * As a workaround, leak the toolstack-provided PSE36 value into a
929
532
             * shadow guest if the guest is already using PAE paging (and
930
532
             * won't care about reverting back to PSE paging).  Otherwise,
931
532
             * nobble it, so a 32bit guest doesn't get the impression that it
932
532
             * could try to use PSE36 paging.
933
532
             */
934
532
            if ( !hap_enabled(d) && !hvm_pae_enabled(v) )
935
0
                res->d &= ~cpufeat_mask(X86_FEATURE_PSE36);
936
532
        }
937
532
        else /* PV domain */
938
0
        {
939
0
            /*
940
0
             * MTRR used to unconditionally leak into PV guests.  They cannot
941
0
             * use the MTRR infrastructure at all, and shouldn't be able to see the
942
0
             * feature.
943
0
             *
944
0
             * Modern PVOPS Linux self-clobbers the MTRR feature, to avoid
945
0
             * trying to use the associated MSRs.  Xenolinux-based PV dom0's
946
0
             * however use the MTRR feature as an indication of the presence
947
0
             * of the XENPF_{add,del,read}_memtype hypercalls.
948
0
             */
949
0
            if ( is_hardware_domain(d) && cpu_has_mtrr &&
950
0
                 guest_kernel_mode(v, guest_cpu_user_regs()) )
951
0
                res->d |= cpufeat_mask(X86_FEATURE_MTRR);
952
0
        }
953
532
        break;
954
1
955
0
    case 0x8000001c:
956
0
        if ( (v->arch.xcr0 & XSTATE_LWP) && cpu_has_svm )
957
0
            /* Turn on available bit and other features specified in lwp_cfg. */
958
0
            res->a = (res->d & v->arch.hvm_svm.guest_lwp_cfg) | 1;
959
0
        break;
960
2.65k
    }
961
2.65k
}
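
For orientation, a minimal hypothetical caller of guest_cpuid(); the signature and the a/b/c/d fields of struct cpuid_leaf come from this file, while the helper name and the choice of leaf 0 are illustrative assumptions rather than code that exists in Xen:

/* Hypothetical helper: log a guest's view of leaf 0 (max leaf + vendor). */
static void dump_guest_leaf0(const struct vcpu *v)
{
    struct cpuid_leaf res;

    guest_cpuid(v, 0, 0, &res);

    printk("vcpu%u: max basic leaf %#x, vendor %08x:%08x:%08x\n",
           v->vcpu_id, res.a, res.b, res.d, res.c);
}
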
962
963
static void __init __maybe_unused build_assertions(void)
964
0
{
965
0
    BUILD_BUG_ON(ARRAY_SIZE(known_features) != FSCAPINTS);
966
0
    BUILD_BUG_ON(ARRAY_SIZE(special_features) != FSCAPINTS);
967
0
    BUILD_BUG_ON(ARRAY_SIZE(pv_featuremask) != FSCAPINTS);
968
0
    BUILD_BUG_ON(ARRAY_SIZE(hvm_shadow_featuremask) != FSCAPINTS);
969
0
    BUILD_BUG_ON(ARRAY_SIZE(hvm_hap_featuremask) != FSCAPINTS);
970
0
    BUILD_BUG_ON(ARRAY_SIZE(deep_features) != FSCAPINTS);
971
0
972
0
    /* Find some more clever allocation scheme if this trips. */
973
0
    BUILD_BUG_ON(sizeof(struct cpuid_policy) > PAGE_SIZE);
974
0
975
0
    BUILD_BUG_ON(sizeof(raw_cpuid_policy.basic) !=
976
0
                 sizeof(raw_cpuid_policy.basic.raw));
977
0
    BUILD_BUG_ON(sizeof(raw_cpuid_policy.feat) !=
978
0
                 sizeof(raw_cpuid_policy.feat.raw));
979
0
    BUILD_BUG_ON(sizeof(raw_cpuid_policy.xstate) !=
980
0
                 sizeof(raw_cpuid_policy.xstate.raw));
981
0
    BUILD_BUG_ON(sizeof(raw_cpuid_policy.extd) !=
982
0
                 sizeof(raw_cpuid_policy.extd.raw));
983
0
}
984
985
/*
986
 * Local variables:
987
 * mode: C
988
 * c-file-style: "BSD"
989
 * c-basic-offset: 4
990
 * tab-width: 4
991
 * indent-tabs-mode: nil
992
 * End:
993
 */