Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/arch/x86/cpu/mcheck/vmce.c
Line  Count : Source
   1        : /*
   2        :  * vmce.c - provide software emulated vMCE support to guest
   3        :  *
   4        :  * Copyright (C) 2010, 2011 Jiang, Yunhong <yunhong.jiang@intel.com>
   5        :  * Copyright (C) 2012, 2013 Liu, Jinsong <jinsong.liu@intel.com>
   6        :  *
   7        :  * This program is free software; you can redistribute it and/or modify
   8        :  * it under the terms of the GNU General Public License as published by
   9        :  * the Free Software Foundation; either version 2 of the License, or
  10        :  * (at your option) any later version.
  11        :  *
  12        :  * This program is distributed in the hope that it will be useful,
  13        :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14        :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15        :  * GNU General Public License for more details.
  16        :  *
  17        :  * You should have received a copy of the GNU General Public License
  18        :  * along with this program; If not, see <http://www.gnu.org/licenses/>.
  19        :  */
  20        :
  21        : #include <xen/init.h>
  22        : #include <xen/types.h>
  23        : #include <xen/irq.h>
  24        : #include <xen/event.h>
  25        : #include <xen/kernel.h>
  26        : #include <xen/delay.h>
  27        : #include <xen/smp.h>
  28        : #include <xen/mm.h>
  29        : #include <asm/hvm/save.h>
  30        : #include <asm/processor.h>
  31        : #include <public/sysctl.h>
  32        : #include <asm/system.h>
  33        : #include <asm/msr.h>
  34        : #include <asm/p2m.h>
  35        : #include <asm/pv/traps.h>
  36        :
  37        : #include "mce.h"
  38        : #include "x86_mca.h"
  39        : #include "vmce.h"
  40        :
  41        : /*
  42        :  * MCG_SER_P:  software error recovery supported
  43        :  * MCG_TES_P:  avoids MCi_STATUS bits 56:53 being model specific
  44        :  * MCG_CMCI_P: expose the CMCI capability, but never actually inject it into the
  45        :  *             guest, for the sake of performance (the guest does not poll periodically)
  46        :  */
  47     12 : #define INTEL_GUEST_MCG_CAP (MCG_SER_P |  \
  48     12 :                              MCG_TES_P |  \
  49     12 :                              MCG_CMCI_P |  \
  50     12 :                              GUEST_MC_BANK_NUM)
  51        :
  52      0 : #define AMD_GUEST_MCG_CAP GUEST_MC_BANK_NUM
  53        :
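
Editor's note: the MCG_CAP value built above packs the guest bank count (GUEST_MC_BANK_NUM) into the low bits alongside the feature flags. A minimal sketch of how such a value decomposes, assuming the MCG_* masks provided by the mcheck headers included above; the helper name is invented for illustration and is not part of vmce.c:

/* Illustrative only: split a guest MCG_CAP value into bank count and flags. */
static inline void example_decode_mcg_cap(uint64_t mcg_cap)
{
    unsigned int banks = mcg_cap & MCG_CAP_COUNT;   /* low bits: bank count */

    if ( mcg_cap & MCG_SER_P )
        printk("software error recovery supported, %u banks\n", banks);
    if ( mcg_cap & MCG_CMCI_P )
        printk("CMCI advertised (never actually delivered to the guest)\n");
}
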
  54        : void vmce_init_vcpu(struct vcpu *v)
  55     12 : {
  56     12 :     int i;
  57     12 :
  58     12 :     /* global MCA MSRs init */
  59     12 :     if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
  60     12 :         v->arch.vmce.mcg_cap = INTEL_GUEST_MCG_CAP;
  61     12 :     else
  62      0 :         v->arch.vmce.mcg_cap = AMD_GUEST_MCG_CAP;
  63     12 :
  64     12 :     v->arch.vmce.mcg_status = 0;
  65     12 :
  66     12 :     /* per-bank MCA MSRs init */
  67     36 :     for ( i = 0; i < GUEST_MC_BANK_NUM; i++ )
  68     24 :         memset(&v->arch.vmce.bank[i], 0, sizeof(struct vmce_bank));
  69     12 :
  70     12 :     spin_lock_init(&v->arch.vmce.lock);
  71     12 : }
  72        :
  73        : int vmce_restore_vcpu(struct vcpu *v, const struct hvm_vmce_vcpu *ctxt)
  74      0 : {
  75      0 :     unsigned long guest_mcg_cap;
  76      0 :
  77      0 :     if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
  78      0 :         guest_mcg_cap = INTEL_GUEST_MCG_CAP | MCG_LMCE_P;
  79      0 :     else
  80      0 :         guest_mcg_cap = AMD_GUEST_MCG_CAP;
  81      0 :
  82      0 :     if ( ctxt->caps & ~guest_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P )
  83      0 :     {
  84      0 :         dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA capabilities"
  85      0 :                 " %#" PRIx64 " for %pv (supported: %#Lx)\n",
  86      0 :                 is_hvm_vcpu(v) ? "HVM" : "PV", ctxt->caps,
  87      0 :                 v, guest_mcg_cap & ~MCG_CAP_COUNT);
  88      0 :         return -EPERM;
  89      0 :     }
  90      0 :
  91      0 :     v->arch.vmce.mcg_cap = ctxt->caps;
  92      0 :     v->arch.vmce.bank[0].mci_ctl2 = ctxt->mci_ctl2_bank0;
  93      0 :     v->arch.vmce.bank[1].mci_ctl2 = ctxt->mci_ctl2_bank1;
  94      0 :     v->arch.vmce.mcg_ext_ctl = ctxt->mcg_ext_ctl;
  95      0 :
  96      0 :     return 0;
  97      0 : }
  98        :
  99        : /*
 100        :  * For historical reasons, the bank number may be greater than
 101        :  * GUEST_MC_BANK_NUM when migrating from an old vMCE version to the new one.
 102        :  */
 103        : static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
 104     72 : {
 105     72 :     int ret = 1;
 106     72 :     unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;
 107     72 :
 108     72 :     *val = 0;
 109     72 :
 110     72 :     switch ( msr & (-MSR_IA32_MC0_CTL | 3) )
 111     72 :     {
 112      0 :     case MSR_IA32_MC0_CTL:
 113      0 :         /* stick all 1's to MCi_CTL */
 114      0 :         *val = ~0UL;
 115      0 :         mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_CTL %#"PRIx64"\n",
 116      0 :                    v, bank, *val);
 117      0 :         break;
 118      0 :
 119      0 :     case MSR_IA32_MC0_STATUS:
 120      0 :         if ( bank < GUEST_MC_BANK_NUM )
 121      0 :         {
 122      0 :             *val = v->arch.vmce.bank[bank].mci_status;
 123      0 :             if ( *val )
 124      0 :                 mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_STATUS %#"PRIx64"\n",
 125      0 :                            v, bank, *val);
 126      0 :         }
 127      0 :         break;
 128      0 :
 129      0 :     case MSR_IA32_MC0_ADDR:
 130      0 :         if ( bank < GUEST_MC_BANK_NUM )
 131      0 :         {
 132      0 :             *val = v->arch.vmce.bank[bank].mci_addr;
 133      0 :             if ( *val )
 134      0 :                 mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_ADDR %#"PRIx64"\n",
 135      0 :                            v, bank, *val);
 136      0 :         }
 137      0 :         break;
 138      0 :
 139      0 :     case MSR_IA32_MC0_MISC:
 140      0 :         if ( bank < GUEST_MC_BANK_NUM )
 141      0 :         {
 142      0 :             *val = v->arch.vmce.bank[bank].mci_misc;
 143      0 :             if ( *val )
 144      0 :                 mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_MISC %#"PRIx64"\n",
 145      0 :                            v, bank, *val);
 146      0 :         }
 147      0 :         break;
 148      0 :
 149     72 :     default:
 150     72 :         switch ( boot_cpu_data.x86_vendor )
 151     72 :         {
 152     71 :         case X86_VENDOR_INTEL:
 153     71 :             ret = vmce_intel_rdmsr(v, msr, val);
 154     71 :             break;
 155     71 :
 156      0 :         case X86_VENDOR_AMD:
 157      0 :             ret = vmce_amd_rdmsr(v, msr, val);
 158      0 :             break;
 159     71 :
 160      0 :         default:
 161      0 :             ret = 0;
 162      0 :             break;
 163     72 :         }
 164     71 :         break;
 165     72 :     }
 166     72 :
 167     71 :     return ret;
 168     72 : }
 169        :
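
Editor's note: the bank decode above relies on the architectural MCi MSR layout (four MSRs per bank, starting at IA32_MC0_CTL). A standalone sketch of the same arithmetic, assuming the architectural value 0x400 for MSR_IA32_MC0_CTL; the chosen MSR index is just an example:

#include <stdint.h>
#include <stdio.h>

#define MSR_IA32_MC0_CTL 0x400u   /* architectural base of the MCi MSR block */

int main(void)
{
    uint32_t msr = 0x411;                              /* example: MC4_STATUS */
    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;  /* 4 MSRs per bank -> 4 */
    uint32_t key = msr & (-MSR_IA32_MC0_CTL | 3);      /* folds back to MC0_STATUS (0x401) */

    /* bank selects v->arch.vmce.bank[bank]; key drives the switch above. */
    printf("bank=%u, switch key=%#x\n", bank, (unsigned int)key);
    return 0;
}
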
 170        : /*
 171        :  * < 0: Unsupported and will #GP fault to guest
 172        :  * = 0: Not handled, should be handled by other components
 173        :  * > 0: Success
 174        :  */
 175        : int vmce_rdmsr(uint32_t msr, uint64_t *val)
 176     96 : {
 177     96 :     struct vcpu *cur = current;
 178     96 :     int ret = 1;
 179     96 :
 180     96 :     *val = 0;
 181     96 :
 182     96 :     spin_lock(&cur->arch.vmce.lock);
 183     96 :
 184     96 :     switch ( msr )
 185     96 :     {
 186      0 :     case MSR_IA32_MCG_STATUS:
 187      0 :         *val = cur->arch.vmce.mcg_status;
 188      0 :         if ( *val )
 189      0 :             mce_printk(MCE_VERBOSE,
 190      0 :                        "MCE: %pv: rd MCG_STATUS %#"PRIx64"\n", cur, *val);
 191      0 :         break;
 192      0 :
 193     12 :     case MSR_IA32_MCG_CAP:
 194     12 :         *val = cur->arch.vmce.mcg_cap;
 195     12 :         mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_CAP %#"PRIx64"\n", cur, *val);
 196     12 :         break;
 197      0 :
 198      0 :     case MSR_IA32_MCG_CTL:
 199      0 :         if ( cur->arch.vmce.mcg_cap & MCG_CTL_P )
 200      0 :             *val = ~0ULL;
 201      0 :         mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_CTL %#"PRIx64"\n", cur, *val);
 202      0 :         break;
 203      0 :
 204      0 :     case MSR_IA32_MCG_EXT_CTL:
 205      0 :         /*
 206      0 :          * If MCG_LMCE_P is present in guest MSR_IA32_MCG_CAP, the LMCE and LOCK
 207      0 :          * bits are always set in guest MSR_IA32_FEATURE_CONTROL by Xen, so it
 208      0 :          * does not need to check them here.
 209      0 :          */
 210      0 :         if ( cur->arch.vmce.mcg_cap & MCG_LMCE_P )
 211      0 :         {
 212      0 :             *val = cur->arch.vmce.mcg_ext_ctl;
 213      0 :             mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_EXT_CTL %#"PRIx64"\n",
 214      0 :                        cur, *val);
 215      0 :         }
 216      0 :         else
 217      0 :         {
 218      0 :             ret = -1;
 219      0 :             mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_EXT_CTL, not supported\n",
 220      0 :                        cur);
 221      0 :         }
 222      0 :         break;
 223      0 :
 224     84 :     default:
 225     72 :         ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0;
 226     84 :         break;
 227     96 :     }
 228     96 :
 229     96 :     spin_unlock(&cur->arch.vmce.lock);
 230     96 :
 231     96 :     return ret;
 232     96 : }
 233        :
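
Editor's note: the tri-state return convention documented above vmce_rdmsr() (and repeated for vmce_wrmsr() below) is easiest to see from the caller's side. A hedged sketch of such a caller; the helpers emulate_gp_fault() and try_other_msr_handlers() are hypothetical names used only to illustrate the contract, not the actual Xen call sites:

/* Illustrative only: dispatch on the < 0 / == 0 / > 0 convention. */
static int example_guest_rdmsr(struct vcpu *v, uint32_t msr, uint64_t *val)
{
    int rc = vmce_rdmsr(msr, val);

    if ( rc < 0 )
        return emulate_gp_fault(v);                  /* unsupported: #GP to guest */
    if ( rc == 0 )
        return try_other_msr_handlers(v, msr, val);  /* not an MCA MSR */
    return 0;                                        /* handled: *val is valid */
}
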
 234        : /*
 235        :  * For historical reasons, the bank number may be greater than
 236        :  * GUEST_MC_BANK_NUM when migrating from an old vMCE version to the new one.
 237        :  */
 238        : static int bank_mce_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
 239    117 : {
 240    117 :     int ret = 1;
 241    117 :     unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;
 242    117 :
 243    117 :     switch ( msr & (-MSR_IA32_MC0_CTL | 3) )
 244    117 :     {
 245     22 :     case MSR_IA32_MC0_CTL:
 246     22 :         /*
 247     22 :          * If the guest tries to clear any bit of MCi_CTL, treat it as
 248     22 :          * not implemented and ignore the write.
 249     22 :          */
 250     22 :         break;
 251     22 :
 252     24 :     case MSR_IA32_MC0_STATUS:
 253     24 :         mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_STATUS %#"PRIx64"\n",
 254     24 :                    v, bank, val);
 255     24 :         if ( val )
 256      0 :             ret = -1;
 257     24 :         else if ( bank < GUEST_MC_BANK_NUM )
 258     24 :             v->arch.vmce.bank[bank].mci_status = val;
 259     24 :         break;
 260     22 :
 261      0 :     case MSR_IA32_MC0_ADDR:
 262      0 :         mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_ADDR %#"PRIx64"\n",
 263      0 :                    v, bank, val);
 264      0 :         if ( val )
 265      0 :             ret = -1;
 266      0 :         else if ( bank < GUEST_MC_BANK_NUM )
 267      0 :             v->arch.vmce.bank[bank].mci_addr = val;
 268      0 :         break;
 269     22 :
 270      0 :     case MSR_IA32_MC0_MISC:
 271      0 :         mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_MISC %#"PRIx64"\n",
 272      0 :                    v, bank, val);
 273      0 :         if ( val )
 274      0 :             ret = -1;
 275      0 :         else if ( bank < GUEST_MC_BANK_NUM )
 276      0 :             v->arch.vmce.bank[bank].mci_misc = val;
 277      0 :         break;
 278     22 :
 279     72 :     default:
 280     72 :         switch ( boot_cpu_data.x86_vendor )
 281     72 :         {
 282     72 :         case X86_VENDOR_INTEL:
 283     72 :             ret = vmce_intel_wrmsr(v, msr, val);
 284     72 :             break;
 285     72 :
 286      0 :         case X86_VENDOR_AMD:
 287      0 :             ret = vmce_amd_wrmsr(v, msr, val);
 288      0 :             break;
 289     72 :
 290      0 :         default:
 291      0 :             ret = 0;
 292      0 :             break;
 293     72 :         }
 294     72 :         break;
 295    117 :     }
 296    117 :
 297    118 :     return ret;
 298    117 : }
 299        :
 300        : /*
 301        :  * < 0: Unsupported and will #GP fault to guest
 302        :  * = 0: Not handled, should be handled by other components
 303        :  * > 0: Success
 304        :  */
 305        : int vmce_wrmsr(uint32_t msr, uint64_t val)
 306    169 : {
 307    169 :     struct vcpu *cur = current;
 308    169 :     int ret = 1;
 309    169 :
 310    169 :     spin_lock(&cur->arch.vmce.lock);
 311    169 :
 312    169 :     switch ( msr )
 313    169 :     {
 314      0 :     case MSR_IA32_MCG_CTL:
 315      0 :         /* If MCG_CTL exists then stick to all 1's, else ignore. */
 316      0 :         break;
 317      0 :
 318      0 :     case MSR_IA32_MCG_STATUS:
 319      0 :         cur->arch.vmce.mcg_status = val;
 320      0 :         mce_printk(MCE_VERBOSE, "MCE: %pv: wr MCG_STATUS %"PRIx64"\n",
 321      0 :                    cur, val);
 322      0 :         break;
 323      0 :
 324      0 :     case MSR_IA32_MCG_CAP:
 325      0 :         /*
 326      0 :          * According to the Intel SDM, IA32_MCG_CAP is a read-only register and
 327      0 :          * the effect of writing to it is undefined. Here we treat the write as
 328      0 :          * 'write but no change', which should not surprise the guest.
 329      0 :          */
 330      0 :         mce_printk(MCE_VERBOSE, "MCE: %pv: MCG_CAP is r/o\n", cur);
 331      0 :         break;
 332      0 :
 333      0 :     case MSR_IA32_MCG_EXT_CTL:
 334      0 :         if ( (cur->arch.vmce.mcg_cap & MCG_LMCE_P) &&
 335      0 :              !(val & ~MCG_EXT_CTL_LMCE_EN) )
 336      0 :             cur->arch.vmce.mcg_ext_ctl = val;
 337      0 :         else
 338      0 :             ret = -1;
 339      0 :         mce_printk(MCE_VERBOSE, "MCE: %pv: wr MCG_EXT_CTL %"PRIx64"%s\n",
 340      0 :                    cur, val, (ret == -1) ? ", not supported" : "");
 341      0 :         break;
 342      0 :
 343    167 :     default:
 344    117 :         ret = mce_bank_msr(cur, msr) ? bank_mce_wrmsr(cur, msr, val) : 0;
 345    167 :         break;
 346    169 :     }
 347    169 :
 348    168 :     spin_unlock(&cur->arch.vmce.lock);
 349    168 :     return ret;
 350    169 : }
 351        :
352
static int vmce_save_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
353
0
{
354
0
    struct vcpu *v;
355
0
    int err = 0;
356
0
357
0
    for_each_vcpu ( d, v )
358
0
    {
359
0
        struct hvm_vmce_vcpu ctxt = {
360
0
            .caps = v->arch.vmce.mcg_cap,
361
0
            .mci_ctl2_bank0 = v->arch.vmce.bank[0].mci_ctl2,
362
0
            .mci_ctl2_bank1 = v->arch.vmce.bank[1].mci_ctl2,
363
0
            .mcg_ext_ctl = v->arch.vmce.mcg_ext_ctl,
364
0
        };
365
0
366
0
        err = hvm_save_entry(VMCE_VCPU, v->vcpu_id, h, &ctxt);
367
0
        if ( err )
368
0
            break;
369
0
    }
370
0
371
0
    return err;
372
0
}
373
374
static int vmce_load_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
375
0
{
376
0
    unsigned int vcpuid = hvm_load_instance(h);
377
0
    struct vcpu *v;
378
0
    struct hvm_vmce_vcpu ctxt;
379
0
    int err;
380
0
381
0
    if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
382
0
    {
383
0
        dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n",
384
0
                d->domain_id, vcpuid);
385
0
        err = -EINVAL;
386
0
    }
387
0
    else
388
0
        err = hvm_load_entry_zeroextend(VMCE_VCPU, h, &ctxt);
389
0
390
0
    return err ?: vmce_restore_vcpu(v, &ctxt);
391
0
}
392
393
HVM_REGISTER_SAVE_RESTORE(VMCE_VCPU, vmce_save_vcpu_ctxt,
394
                          vmce_load_vcpu_ctxt, 1, HVMSR_PER_VCPU);
395
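
Editor's note: the per-vCPU save record handled above carries exactly the four fields referenced in vmce_save_vcpu_ctxt() and vmce_restore_vcpu(). The mirror struct below is for orientation only; the authoritative layout is the struct hvm_vmce_vcpu definition in the public HVM save-record headers:

#include <stdint.h>

/* Illustrative mirror of the fields used above, not the canonical definition. */
struct example_vmce_record {
    uint64_t caps;             /* guest MCG_CAP, validated on restore */
    uint64_t mci_ctl2_bank0;   /* MC0_CTL2 */
    uint64_t mci_ctl2_bank1;   /* MC1_CTL2 */
    uint64_t mcg_ext_ctl;      /* MCG_EXT_CTL (LMCE enable) */
};
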
 396        : /*
 397        :  * for Intel MCE, broadcast vMCE to all vcpus
 398        :  * for AMD MCE, only inject vMCE to vcpu0
 399        :  *
 400        :  * @ d, domain to which the vMCE is injected
 401        :  * @ vcpu,
 402        :  *   -1 (VMCE_INJECT_BROADCAST), broadcast vMCE to all vcpus
 403        :  *   >= 0, the vcpu the vMCE is injected to
 404        :  */
 405        : int inject_vmce(struct domain *d, int vcpu)
 406      0 : {
 407      0 :     struct vcpu *v;
 408      0 :     int ret = -ESRCH;
 409      0 :
 410      0 :     for_each_vcpu ( d, v )
 411      0 :     {
 412      0 :         if ( vcpu != VMCE_INJECT_BROADCAST && vcpu != v->vcpu_id )
 413      0 :             continue;
 414      0 :
 415      0 :         /* Don't inject to uninitialized VCPU. */
 416      0 :         if ( !v->is_initialised )
 417      0 :             continue;
 418      0 :
 419      0 :         if ( (is_hvm_domain(d) ||
 420      0 :               pv_trap_callback_registered(v, TRAP_machine_check)) &&
 421      0 :              !test_and_set_bool(v->mce_pending) )
 422      0 :         {
 423      0 :             mce_printk(MCE_VERBOSE, "MCE: inject vMCE to %pv\n", v);
 424      0 :             vcpu_kick(v);
 425      0 :             ret = 0;
 426      0 :         }
 427      0 :         else
 428      0 :         {
 429      0 :             mce_printk(MCE_QUIET, "Failed to inject vMCE to %pv\n", v);
 430      0 :             ret = -EBUSY;
 431      0 :             break;
 432      0 :         }
 433      0 :
 434      0 :         if ( vcpu != VMCE_INJECT_BROADCAST )
 435      0 :             break;
 436      0 :     }
 437      0 :
 438      0 :     return ret;
 439      0 : }
 440        :
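
Editor's note: the comment above inject_vmce() describes two calling conventions for the vcpu argument. The sketch below only illustrates them; the real call sites live elsewhere in the mcheck code:

/* Illustrative only: broadcast versus targeted injection. */
static void example_inject(struct domain *d)
{
    /* Broadcast a vMCE to every initialised vCPU of d. */
    if ( inject_vmce(d, VMCE_INJECT_BROADCAST) )
        mce_printk(MCE_QUIET, "vMCE broadcast failed\n");

    /* Or target a single vCPU, here vCPU 0. */
    if ( inject_vmce(d, 0) )
        mce_printk(MCE_QUIET, "vMCE injection to vCPU 0 failed\n");
}
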
 441        : static int vcpu_fill_mc_msrs(struct vcpu *v, uint64_t mcg_status,
 442        :                              uint64_t mci_status, uint64_t mci_addr,
 443        :                              uint64_t mci_misc)
 444      0 : {
 445      0 :     if ( v->arch.vmce.mcg_status & MCG_STATUS_MCIP )
 446      0 :     {
 447      0 :         mce_printk(MCE_QUIET, "MCE: %pv: guest has not handled previous"
 448      0 :                    " vMCE yet!\n", v);
 449      0 :         return -EBUSY;
 450      0 :     }
 451      0 :
 452      0 :     spin_lock(&v->arch.vmce.lock);
 453      0 :
 454      0 :     v->arch.vmce.mcg_status = mcg_status;
 455      0 :     /*
 456      0 :      * 1. Skip bank 0 to avoid the 'bank 0 quirk' of old processors
 457      0 :      * 2. Filter the MCi_STATUS MSCOD model-specific error code passed to the guest
 458      0 :      */
 459      0 :     v->arch.vmce.bank[1].mci_status = mci_status & MCi_STATUS_MSCOD_MASK;
 460      0 :     v->arch.vmce.bank[1].mci_addr = mci_addr;
 461      0 :     v->arch.vmce.bank[1].mci_misc = mci_misc;
 462      0 :
 463      0 :     spin_unlock(&v->arch.vmce.lock);
 464      0 :
 465      0 :     return 0;
 466      0 : }
 467        :
 468        : int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d,
 469        :                    uint64_t gstatus, int vmce_vcpuid)
 470      0 : {
 471      0 :     struct vcpu *v = d->vcpu[0];
 472      0 :     bool broadcast = (vmce_vcpuid == VMCE_INJECT_BROADCAST);
 473      0 :     int ret, err;
 474      0 :
 475      0 :     if ( mc_bank->mc_domid == DOMID_INVALID )
 476      0 :         return -EINVAL;
 477      0 :
 478      0 :     if ( broadcast )
 479      0 :         gstatus &= ~MCG_STATUS_LMCE;
 480      0 :     else if ( gstatus & MCG_STATUS_LMCE )
 481      0 :     {
 482      0 :         ASSERT(vmce_vcpuid >= 0 && vmce_vcpuid < d->max_vcpus);
 483      0 :         v = d->vcpu[vmce_vcpuid];
 484      0 :     }
 485      0 :
 486      0 :     /*
 487      0 :      * vMCE with the actual error information is injected to vCPU0,
 488      0 :      * and, if broadcast is required, we choose to inject less severe
 489      0 :      * vMCEs to other vCPUs. Thus guest can always get the severest
 490      0 :      * error (i.e. the actual one) on vCPU0. If guest can recover from
 491      0 :      * the severest error on vCPU0, the less severe errors on other
 492      0 :      * vCPUs will not prevent guest from recovering on those vCPUs.
 493      0 :      */
 494      0 :     ret = vcpu_fill_mc_msrs(v, gstatus, mc_bank->mc_status,
 495      0 :                             mc_bank->mc_addr, mc_bank->mc_misc);
 496      0 :     if ( broadcast )
 497      0 :         for_each_vcpu ( d, v )
 498      0 :         {
 499      0 :             if ( !v->vcpu_id )
 500      0 :                 continue;
 501      0 :             err = vcpu_fill_mc_msrs(v, MCG_STATUS_MCIP | MCG_STATUS_RIPV,
 502      0 :                                     0, 0, 0);
 503      0 :             if ( err )
 504      0 :                 ret = err;
 505      0 :         }
 506      0 :
 507      0 :     return ret;
 508      0 : }
 509        :
 510        : /* Some RAM is reportedly set up as mmio_direct to get the UC cache attribute */
 511      0 : #define P2M_UNMAP_TYPES (p2m_to_mask(p2m_ram_rw) \
 512      0 :                                 | p2m_to_mask(p2m_ram_logdirty) \
 513      0 :                                 | p2m_to_mask(p2m_ram_ro)       \
 514      0 :                                 | p2m_to_mask(p2m_mmio_direct))
 515        :
 516        : /*
 517        :  * Currently all CPUs rendezvous at the MCE softirq handler, so there is no
 518        :  * need to consider the paging p2m type.
 519        :  * Currently only HVM guests with EPT paging mode are supported.
 520        :  * XXX the following situations are missed:
 521        :  * PoD, Foreign mapped, Granted, Shared
 522        :  */
 523        : int unmmap_broken_page(struct domain *d, mfn_t mfn, unsigned long gfn)
 524      0 : {
 525      0 :     mfn_t r_mfn;
 526      0 :     p2m_type_t pt;
 527      0 :     int rc;
 528      0 :
 529      0 :     /* Always trust that dom0's MCE handler will prevent future access */
 530      0 :     if ( is_hardware_domain(d) )
 531      0 :         return 0;
 532      0 :
 533      0 :     if ( !mfn_valid(mfn) )
 534      0 :         return -EINVAL;
 535      0 :
 536      0 :     if ( !is_hvm_domain(d) || !paging_mode_hap(d) )
 537      0 :         return -EOPNOTSUPP;
 538      0 :
 539      0 :     rc = -1;
 540      0 :     r_mfn = get_gfn_query(d, gfn, &pt);
 541      0 :     if ( p2m_to_mask(pt) & P2M_UNMAP_TYPES )
 542      0 :     {
 543      0 :         ASSERT(mfn_x(r_mfn) == mfn_x(mfn));
 544      0 :         rc = p2m_change_type_one(d, gfn, pt, p2m_ram_broken);
 545      0 :     }
 546      0 :     put_gfn(d, gfn);
 547      0 :
 548      0 :     return rc;
 549      0 : }
 550        :
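
Editor's note: P2M_UNMAP_TYPES above builds a set of p2m types as a bitmask so the check on the faulting page is a single AND. A self-contained sketch of the same pattern with hypothetical names; it does not reproduce Xen's actual p2m_to_mask() definition:

#include <stdio.h>

/* Hypothetical enum and mask helper mirroring the P2M_UNMAP_TYPES idea. */
enum example_type { TYPE_RAM_RW, TYPE_RAM_RO, TYPE_MMIO, TYPE_BROKEN };
#define type_to_mask(t)  (1UL << (t))
#define UNMAP_TYPES      (type_to_mask(TYPE_RAM_RW) | type_to_mask(TYPE_RAM_RO))

int main(void)
{
    enum example_type pt = TYPE_RAM_RO;

    /* Membership test is a single AND, as in unmmap_broken_page(). */
    printf("unmap? %s\n", (type_to_mask(pt) & UNMAP_TYPES) ? "yes" : "no");
    return 0;
}
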
 551        : int vmce_enable_mca_cap(struct domain *d, uint64_t cap)
 552      0 : {
 553      0 :     struct vcpu *v;
 554      0 :
 555      0 :     if ( cap & ~XEN_HVM_MCA_CAP_MASK )
 556      0 :         return -EINVAL;
 557      0 :
 558      0 :     if ( cap & XEN_HVM_MCA_CAP_LMCE )
 559      0 :     {
 560      0 :         if ( !lmce_support )
 561      0 :             return -EINVAL;
 562      0 :         for_each_vcpu(d, v)
 563      0 :             v->arch.vmce.mcg_cap |= MCG_LMCE_P;
 564      0 :     }
 565      0 :
 566      0 :     return 0;
 567      0 : }