Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/arch/x86/hvm/svm/svm.c
Line
Count
Source
1
/*
2
 * svm.c: handling SVM architecture-related VM exits
3
 * Copyright (c) 2004, Intel Corporation.
4
 * Copyright (c) 2005-2007, Advanced Micro Devices, Inc.
5
 *
6
 * This program is free software; you can redistribute it and/or modify it
7
 * under the terms and conditions of the GNU General Public License,
8
 * version 2, as published by the Free Software Foundation.
9
 *
10
 * This program is distributed in the hope it will be useful, but WITHOUT
11
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13
 * more details.
14
 *
15
 * You should have received a copy of the GNU General Public License along with
16
 * this program; If not, see <http://www.gnu.org/licenses/>.
17
 */
18
19
#include <xen/init.h>
20
#include <xen/lib.h>
21
#include <xen/trace.h>
22
#include <xen/sched.h>
23
#include <xen/irq.h>
24
#include <xen/softirq.h>
25
#include <xen/hypercall.h>
26
#include <xen/domain_page.h>
27
#include <xen/xenoprof.h>
28
#include <asm/current.h>
29
#include <asm/io.h>
30
#include <asm/paging.h>
31
#include <asm/p2m.h>
32
#include <asm/mem_sharing.h>
33
#include <asm/regs.h>
34
#include <asm/cpufeature.h>
35
#include <asm/processor.h>
36
#include <asm/amd.h>
37
#include <asm/guest_access.h>
38
#include <asm/debugreg.h>
39
#include <asm/msr.h>
40
#include <asm/i387.h>
41
#include <asm/iocap.h>
42
#include <asm/hvm/emulate.h>
43
#include <asm/hvm/hvm.h>
44
#include <asm/hvm/support.h>
45
#include <asm/hvm/io.h>
46
#include <asm/hvm/emulate.h>
47
#include <asm/hvm/svm/asid.h>
48
#include <asm/hvm/svm/svm.h>
49
#include <asm/hvm/svm/vmcb.h>
50
#include <asm/hvm/svm/emulate.h>
51
#include <asm/hvm/svm/intr.h>
52
#include <asm/hvm/svm/svmdebug.h>
53
#include <asm/hvm/svm/nestedsvm.h>
54
#include <asm/hvm/nestedhvm.h>
55
#include <asm/x86_emulate.h>
56
#include <public/sched.h>
57
#include <asm/hvm/vpt.h>
58
#include <asm/hvm/trace.h>
59
#include <asm/hap.h>
60
#include <asm/apic.h>
61
#include <asm/debugger.h>
62
#include <asm/xstate.h>
63
64
void svm_asm_do_resume(void);
65
66
u32 svm_feature_flags;
67
68
/* Indicates whether guests may use EFER.LMSLE. */
69
bool_t cpu_has_lmsl;
70
71
static void svm_update_guest_efer(struct vcpu *);
72
73
static struct hvm_function_table svm_function_table;
74
75
/*
76
 * Physical addresses of the Host State Area (for hardware) and vmcb (for Xen)
77
 * which contains Xen's fs/gs/tr/ldtr and GSBASE/STAR/SYSENTER state when in
78
 * guest vcpu context.
79
 */
80
static DEFINE_PER_CPU_READ_MOSTLY(paddr_t, hsa);
81
static DEFINE_PER_CPU_READ_MOSTLY(paddr_t, host_vmcb);
82
83
static bool_t amd_erratum383_found __read_mostly;
84
85
/* OSVW bits */
86
static uint64_t osvw_length, osvw_status;
87
static DEFINE_SPINLOCK(osvw_lock);
88
89
/* Only crash the guest if the problem originates in kernel mode. */
90
static void svm_crash_or_fault(struct vcpu *v)
91
0
{
92
0
    if ( vmcb_get_cpl(v->arch.hvm_svm.vmcb) )
93
0
        hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
94
0
    else
95
0
        domain_crash(v->domain);
96
0
}
97
98
void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len)
99
0
{
100
0
    struct vcpu *curr = current;
101
0
102
0
    if ( unlikely(inst_len == 0) )
103
0
        return;
104
0
105
0
    if ( unlikely(inst_len > MAX_INST_LEN) )
106
0
    {
107
0
        gdprintk(XENLOG_ERR, "Bad instruction length %u\n", inst_len);
108
0
        svm_crash_or_fault(curr);
109
0
        return;
110
0
    }
111
0
112
0
    ASSERT(regs == guest_cpu_user_regs());
113
0
114
0
    regs->rip += inst_len;
115
0
    regs->eflags &= ~X86_EFLAGS_RF;
116
0
117
0
    curr->arch.hvm_svm.vmcb->interrupt_shadow = 0;
118
0
119
0
    if ( regs->eflags & X86_EFLAGS_TF )
120
0
        hvm_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
121
0
}
122
123
static void svm_cpu_down(void)
124
0
{
125
0
    write_efer(read_efer() & ~EFER_SVME);
126
0
}
127
128
unsigned long *
129
svm_msrbit(unsigned long *msr_bitmap, uint32_t msr)
130
0
{
131
0
    unsigned long *msr_bit = NULL;
132
0
133
0
    /*
134
0
     * See AMD64 Programmers Manual, Vol 2, Section 15.10 (MSR-Bitmap Address).
135
0
     */
136
0
    if ( msr <= 0x1fff )
137
0
        msr_bit = msr_bitmap + 0x0000 / BYTES_PER_LONG;
138
0
    else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
139
0
        msr_bit = msr_bitmap + 0x0800 / BYTES_PER_LONG;
140
0
    else if ( (msr >= 0xc0010000) && (msr <= 0xc0011fff) )
141
0
        msr_bit = msr_bitmap + 0x1000 / BYTES_PER_LONG;
142
0
143
0
    return msr_bit;
144
0
}
145
146
void svm_intercept_msr(struct vcpu *v, uint32_t msr, int flags)
147
0
{
148
0
    unsigned long *msr_bit;
149
0
150
0
    msr_bit = svm_msrbit(v->arch.hvm_svm.msrpm, msr);
151
0
    BUG_ON(msr_bit == NULL);
152
0
    msr &= 0x1fff;
153
0
154
0
    if ( flags & MSR_INTERCEPT_READ )
155
0
         __set_bit(msr * 2, msr_bit);
156
0
    else
157
0
         __clear_bit(msr * 2, msr_bit);
158
0
159
0
    if ( flags & MSR_INTERCEPT_WRITE )
160
0
        __set_bit(msr * 2 + 1, msr_bit);
161
0
    else
162
0
        __clear_bit(msr * 2 + 1, msr_bit);
163
0
}
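As a side note, the range checks in svm_msrbit() together with the two-bits-per-MSR layout used by svm_intercept_msr() determine a single bit index inside the 8 KiB MSR permission map. The sketch below is illustrative only (a hypothetical helper, not part of svm.c) and merely restates that arithmetic:

#include <stdint.h>

/* Bit index of the read-intercept bit for an MSR; the write bit is +1. */
static int example_msrpm_read_bit(uint32_t msr)
{
    unsigned int base_bit;

    /* Same three architectural ranges as svm_msrbit() above. */
    if ( msr <= 0x1fff )
        base_bit = 0x0000 * 8;
    else if ( msr >= 0xc0000000 && msr <= 0xc0001fff )
        base_bit = 0x0800 * 8;
    else if ( msr >= 0xc0010000 && msr <= 0xc0011fff )
        base_bit = 0x1000 * 8;
    else
        return -1;                     /* MSR not covered by the bitmap */

    /* Two bits per MSR: the even bit intercepts reads, the odd bit writes. */
    return base_bit + (msr & 0x1fff) * 2;
}

For example, MSR 0xc0000082 (LSTAR) lands at bit 0x800 * 8 + 0x82 * 2 = 16644 for reads and 16645 for writes.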
164
165
static void svm_save_dr(struct vcpu *v)
166
0
{
167
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
168
0
    unsigned int flag_dr_dirty = v->arch.hvm_vcpu.flag_dr_dirty;
169
0
170
0
    if ( !flag_dr_dirty )
171
0
        return;
172
0
173
0
    /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
174
0
    v->arch.hvm_vcpu.flag_dr_dirty = 0;
175
0
    vmcb_set_dr_intercepts(vmcb, ~0u);
176
0
177
0
    if ( v->domain->arch.cpuid->extd.dbext )
178
0
    {
179
0
        svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_RW);
180
0
        svm_intercept_msr(v, MSR_AMD64_DR1_ADDRESS_MASK, MSR_INTERCEPT_RW);
181
0
        svm_intercept_msr(v, MSR_AMD64_DR2_ADDRESS_MASK, MSR_INTERCEPT_RW);
182
0
        svm_intercept_msr(v, MSR_AMD64_DR3_ADDRESS_MASK, MSR_INTERCEPT_RW);
183
0
184
0
        rdmsrl(MSR_AMD64_DR0_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[0]);
185
0
        rdmsrl(MSR_AMD64_DR1_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[1]);
186
0
        rdmsrl(MSR_AMD64_DR2_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[2]);
187
0
        rdmsrl(MSR_AMD64_DR3_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[3]);
188
0
    }
189
0
190
0
    v->arch.debugreg[0] = read_debugreg(0);
191
0
    v->arch.debugreg[1] = read_debugreg(1);
192
0
    v->arch.debugreg[2] = read_debugreg(2);
193
0
    v->arch.debugreg[3] = read_debugreg(3);
194
0
    v->arch.debugreg[6] = vmcb_get_dr6(vmcb);
195
0
    v->arch.debugreg[7] = vmcb_get_dr7(vmcb);
196
0
}
197
198
static void __restore_debug_registers(struct vmcb_struct *vmcb, struct vcpu *v)
199
0
{
200
0
    if ( v->arch.hvm_vcpu.flag_dr_dirty )
201
0
        return;
202
0
203
0
    v->arch.hvm_vcpu.flag_dr_dirty = 1;
204
0
    vmcb_set_dr_intercepts(vmcb, 0);
205
0
206
0
    ASSERT(v == current);
207
0
208
0
    if ( v->domain->arch.cpuid->extd.dbext )
209
0
    {
210
0
        svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_NONE);
211
0
        svm_intercept_msr(v, MSR_AMD64_DR1_ADDRESS_MASK, MSR_INTERCEPT_NONE);
212
0
        svm_intercept_msr(v, MSR_AMD64_DR2_ADDRESS_MASK, MSR_INTERCEPT_NONE);
213
0
        svm_intercept_msr(v, MSR_AMD64_DR3_ADDRESS_MASK, MSR_INTERCEPT_NONE);
214
0
215
0
        wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[0]);
216
0
        wrmsrl(MSR_AMD64_DR1_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[1]);
217
0
        wrmsrl(MSR_AMD64_DR2_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[2]);
218
0
        wrmsrl(MSR_AMD64_DR3_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[3]);
219
0
    }
220
0
221
0
    write_debugreg(0, v->arch.debugreg[0]);
222
0
    write_debugreg(1, v->arch.debugreg[1]);
223
0
    write_debugreg(2, v->arch.debugreg[2]);
224
0
    write_debugreg(3, v->arch.debugreg[3]);
225
0
    vmcb_set_dr6(vmcb, v->arch.debugreg[6]);
226
0
    vmcb_set_dr7(vmcb, v->arch.debugreg[7]);
227
0
}
228
229
/*
230
 * DR7 is saved and restored on every vmexit.  Other debug registers only
231
 * need to be restored if their value is going to affect execution -- i.e.,
232
 * if one of the breakpoints is enabled.  So mask out all bits that don't
233
 * enable some breakpoint functionality.
234
 */
235
static void svm_restore_dr(struct vcpu *v)
236
0
{
237
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
238
0
    if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
239
0
        __restore_debug_registers(vmcb, v);
240
0
}
241
242
static int svm_vmcb_save(struct vcpu *v, struct hvm_hw_cpu *c)
243
0
{
244
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
245
0
246
0
    c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
247
0
    c->cr2 = v->arch.hvm_vcpu.guest_cr[2];
248
0
    c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
249
0
    c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
250
0
251
0
    c->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs;
252
0
    c->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp;
253
0
    c->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip;
254
0
255
0
    c->pending_event = 0;
256
0
    c->error_code = 0;
257
0
    if ( vmcb->eventinj.fields.v &&
258
0
         hvm_event_needs_reinjection(vmcb->eventinj.fields.type,
259
0
                                     vmcb->eventinj.fields.vector) )
260
0
    {
261
0
        c->pending_event = (uint32_t)vmcb->eventinj.bytes;
262
0
        c->error_code = vmcb->eventinj.fields.errorcode;
263
0
    }
264
0
265
0
    return 1;
266
0
}
267
268
static int svm_vmcb_restore(struct vcpu *v, struct hvm_hw_cpu *c)
269
0
{
270
0
    struct page_info *page = NULL;
271
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
272
0
    struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
273
0
274
0
    if ( c->pending_valid )
275
0
    {
276
0
        if ( (c->pending_type == 1) || (c->pending_type > 4) ||
277
0
             (c->pending_reserved != 0) )
278
0
        {
279
0
            dprintk(XENLOG_ERR, "%pv: Invalid pending event %#"PRIx32"\n",
280
0
                    v, c->pending_event);
281
0
            return -EINVAL;
282
0
        }
283
0
284
0
        if ( c->pending_error_valid &&
285
0
             c->error_code != (uint16_t)c->error_code )
286
0
        {
287
0
            dprintk(XENLOG_ERR, "%pv: Invalid error code %#"PRIx32"\n",
288
0
                    v, c->error_code);
289
0
            return -EINVAL;
290
0
        }
291
0
    }
292
0
293
0
    if ( !paging_mode_hap(v->domain) )
294
0
    {
295
0
        if ( c->cr0 & X86_CR0_PG )
296
0
        {
297
0
            page = get_page_from_gfn(v->domain, c->cr3 >> PAGE_SHIFT,
298
0
                                     NULL, P2M_ALLOC);
299
0
            if ( !page )
300
0
            {
301
0
                gdprintk(XENLOG_ERR, "Invalid CR3 value=%#"PRIx64"\n",
302
0
                         c->cr3);
303
0
                return -EINVAL;
304
0
            }
305
0
        }
306
0
307
0
        if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
308
0
            put_page(pagetable_get_page(v->arch.guest_table));
309
0
310
0
        v->arch.guest_table =
311
0
            page ? pagetable_from_page(page) : pagetable_null();
312
0
    }
313
0
314
0
    v->arch.hvm_vcpu.guest_cr[0] = c->cr0 | X86_CR0_ET;
315
0
    v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
316
0
    v->arch.hvm_vcpu.guest_cr[3] = c->cr3;
317
0
    v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
318
0
    svm_update_guest_cr(v, 0);
319
0
    svm_update_guest_cr(v, 2);
320
0
    svm_update_guest_cr(v, 4);
321
0
322
0
    /* Load sysenter MSRs into both VMCB save area and VCPU fields. */
323
0
    vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = c->sysenter_cs;
324
0
    vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = c->sysenter_esp;
325
0
    vmcb->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip = c->sysenter_eip;
326
0
    
327
0
    if ( paging_mode_hap(v->domain) )
328
0
    {
329
0
        vmcb_set_np_enable(vmcb, 1);
330
0
        vmcb_set_g_pat(vmcb, MSR_IA32_CR_PAT_RESET /* guest PAT */);
331
0
        vmcb_set_h_cr3(vmcb, pagetable_get_paddr(p2m_get_pagetable(p2m)));
332
0
    }
333
0
334
0
    if ( c->pending_valid &&
335
0
         hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
336
0
    {
337
0
        gdprintk(XENLOG_INFO, "Re-injecting %#"PRIx32", %#"PRIx32"\n",
338
0
                 c->pending_event, c->error_code);
339
0
        vmcb->eventinj.bytes = c->pending_event;
340
0
        vmcb->eventinj.fields.errorcode = c->error_code;
341
0
    }
342
0
    else
343
0
        vmcb->eventinj.bytes = 0;
344
0
345
0
    vmcb->cleanbits.bytes = 0;
346
0
    paging_update_paging_modes(v);
347
0
348
0
    return 0;
349
0
}
350
351
352
static void svm_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
353
0
{
354
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
355
0
356
0
    data->shadow_gs        = vmcb->kerngsbase;
357
0
    data->msr_lstar        = vmcb->lstar;
358
0
    data->msr_star         = vmcb->star;
359
0
    data->msr_cstar        = vmcb->cstar;
360
0
    data->msr_syscall_mask = vmcb->sfmask;
361
0
    data->msr_efer         = v->arch.hvm_vcpu.guest_efer;
362
0
    data->msr_flags        = 0;
363
0
}
364
365
366
static void svm_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
367
0
{
368
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
369
0
370
0
    vmcb->kerngsbase = data->shadow_gs;
371
0
    vmcb->lstar      = data->msr_lstar;
372
0
    vmcb->star       = data->msr_star;
373
0
    vmcb->cstar      = data->msr_cstar;
374
0
    vmcb->sfmask     = data->msr_syscall_mask;
375
0
    v->arch.hvm_vcpu.guest_efer = data->msr_efer;
376
0
    svm_update_guest_efer(v);
377
0
}
378
379
static void svm_save_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
380
0
{
381
0
    svm_save_cpu_state(v, ctxt);
382
0
    svm_vmcb_save(v, ctxt);
383
0
}
384
385
static int svm_load_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
386
0
{
387
0
    svm_load_cpu_state(v, ctxt);
388
0
    if (svm_vmcb_restore(v, ctxt)) {
389
0
        gdprintk(XENLOG_ERR, "svm_vmcb restore failed!\n");
390
0
        domain_crash(v->domain);
391
0
        return -EINVAL;
392
0
    }
393
0
394
0
    return 0;
395
0
}
396
397
static unsigned int __init svm_init_msr(void)
398
0
{
399
0
    return boot_cpu_has(X86_FEATURE_DBEXT) ? 4 : 0;
400
0
}
401
402
static void svm_save_msr(struct vcpu *v, struct hvm_msr *ctxt)
403
0
{
404
0
    if ( boot_cpu_has(X86_FEATURE_DBEXT) )
405
0
    {
406
0
        ctxt->msr[ctxt->count].val = v->arch.hvm_svm.dr_mask[0];
407
0
        if ( ctxt->msr[ctxt->count].val )
408
0
            ctxt->msr[ctxt->count++].index = MSR_AMD64_DR0_ADDRESS_MASK;
409
0
410
0
        ctxt->msr[ctxt->count].val = v->arch.hvm_svm.dr_mask[1];
411
0
        if ( ctxt->msr[ctxt->count].val )
412
0
            ctxt->msr[ctxt->count++].index = MSR_AMD64_DR1_ADDRESS_MASK;
413
0
414
0
        ctxt->msr[ctxt->count].val = v->arch.hvm_svm.dr_mask[2];
415
0
        if ( ctxt->msr[ctxt->count].val )
416
0
            ctxt->msr[ctxt->count++].index = MSR_AMD64_DR2_ADDRESS_MASK;
417
0
418
0
        ctxt->msr[ctxt->count].val = v->arch.hvm_svm.dr_mask[3];
419
0
        if ( ctxt->msr[ctxt->count].val )
420
0
            ctxt->msr[ctxt->count++].index = MSR_AMD64_DR3_ADDRESS_MASK;
421
0
    }
422
0
}
423
424
static int svm_load_msr(struct vcpu *v, struct hvm_msr *ctxt)
425
0
{
426
0
    unsigned int i, idx;
427
0
    int err = 0;
428
0
429
0
    for ( i = 0; i < ctxt->count; ++i )
430
0
    {
431
0
        switch ( idx = ctxt->msr[i].index )
432
0
        {
433
0
        case MSR_AMD64_DR0_ADDRESS_MASK:
434
0
            if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
435
0
                err = -ENXIO;
436
0
            else if ( ctxt->msr[i].val >> 32 )
437
0
                err = -EDOM;
438
0
            else
439
0
                v->arch.hvm_svm.dr_mask[0] = ctxt->msr[i].val;
440
0
            break;
441
0
442
0
        case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
443
0
            if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
444
0
                err = -ENXIO;
445
0
            else if ( ctxt->msr[i].val >> 32 )
446
0
                err = -EDOM;
447
0
            else
448
0
                v->arch.hvm_svm.dr_mask[idx - MSR_AMD64_DR1_ADDRESS_MASK + 1] =
449
0
                    ctxt->msr[i].val;
450
0
            break;
451
0
452
0
        default:
453
0
            continue;
454
0
        }
455
0
        if ( err )
456
0
            break;
457
0
        ctxt->msr[i]._rsvd = 1;
458
0
    }
459
0
460
0
    return err;
461
0
}
462
463
static void svm_fpu_enter(struct vcpu *v)
464
0
{
465
0
    struct vmcb_struct *n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx;
466
0
467
0
    vcpu_restore_fpu_lazy(v);
468
0
    vmcb_set_exception_intercepts(
469
0
        n1vmcb,
470
0
        vmcb_get_exception_intercepts(n1vmcb) & ~(1U << TRAP_no_device));
471
0
}
472
473
static void svm_fpu_leave(struct vcpu *v)
474
0
{
475
0
    struct vmcb_struct *n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx;
476
0
477
0
    ASSERT(!v->fpu_dirtied);
478
0
    ASSERT(read_cr0() & X86_CR0_TS);
479
0
480
0
    /*
481
0
     * If the guest does not have TS enabled then we must cause and handle an 
482
0
     * exception on first use of the FPU. If the guest *does* have TS enabled 
483
0
     * then this is not necessary: no FPU activity can occur until the guest 
484
0
     * clears CR0.TS, and we will initialise the FPU when that happens.
485
0
     */
486
0
    if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
487
0
    {
488
0
        vmcb_set_exception_intercepts(
489
0
            n1vmcb,
490
0
            vmcb_get_exception_intercepts(n1vmcb) | (1U << TRAP_no_device));
491
0
        vmcb_set_cr0(n1vmcb, vmcb_get_cr0(n1vmcb) | X86_CR0_TS);
492
0
    }
493
0
}
494
495
static unsigned int svm_get_interrupt_shadow(struct vcpu *v)
496
0
{
497
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
498
0
    unsigned int intr_shadow = 0;
499
0
500
0
    if ( vmcb->interrupt_shadow )
501
0
        intr_shadow |= HVM_INTR_SHADOW_MOV_SS | HVM_INTR_SHADOW_STI;
502
0
503
0
    if ( vmcb_get_general1_intercepts(vmcb) & GENERAL1_INTERCEPT_IRET )
504
0
        intr_shadow |= HVM_INTR_SHADOW_NMI;
505
0
506
0
    return intr_shadow;
507
0
}
508
509
static void svm_set_interrupt_shadow(struct vcpu *v, unsigned int intr_shadow)
510
0
{
511
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
512
0
    u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb);
513
0
514
0
    vmcb->interrupt_shadow =
515
0
        !!(intr_shadow & (HVM_INTR_SHADOW_MOV_SS|HVM_INTR_SHADOW_STI));
516
0
517
0
    general1_intercepts &= ~GENERAL1_INTERCEPT_IRET;
518
0
    if ( intr_shadow & HVM_INTR_SHADOW_NMI )
519
0
        general1_intercepts |= GENERAL1_INTERCEPT_IRET;
520
0
    vmcb_set_general1_intercepts(vmcb, general1_intercepts);
521
0
}
522
523
static int svm_guest_x86_mode(struct vcpu *v)
524
0
{
525
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
526
0
527
0
    if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
528
0
        return 0;
529
0
    if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
530
0
        return 1;
531
0
    if ( hvm_long_mode_active(v) && likely(vmcb->cs.l) )
532
0
        return 8;
533
0
    return likely(vmcb->cs.db) ? 4 : 2;
534
0
}
535
536
void svm_update_guest_cr(struct vcpu *v, unsigned int cr)
537
0
{
538
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
539
0
    uint64_t value;
540
0
541
0
    switch ( cr )
542
0
    {
543
0
    case 0: {
544
0
        unsigned long hw_cr0_mask = 0;
545
0
546
0
        if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
547
0
        {
548
0
            if ( v != current )
549
0
                hw_cr0_mask |= X86_CR0_TS;
550
0
            else if ( vmcb_get_cr0(vmcb) & X86_CR0_TS )
551
0
                svm_fpu_enter(v);
552
0
        }
553
0
554
0
        value = v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask;
555
0
        if ( !paging_mode_hap(v->domain) )
556
0
            value |= X86_CR0_PG | X86_CR0_WP;
557
0
        vmcb_set_cr0(vmcb, value);
558
0
        break;
559
0
    }
560
0
    case 2:
561
0
        vmcb_set_cr2(vmcb, v->arch.hvm_vcpu.guest_cr[2]);
562
0
        break;
563
0
    case 3:
564
0
        vmcb_set_cr3(vmcb, v->arch.hvm_vcpu.hw_cr[3]);
565
0
        if ( !nestedhvm_enabled(v->domain) )
566
0
            hvm_asid_flush_vcpu(v);
567
0
        else if ( nestedhvm_vmswitch_in_progress(v) )
568
0
            ; /* CR3 switches during VMRUN/VMEXIT do not flush the TLB. */
569
0
        else
570
0
            hvm_asid_flush_vcpu_asid(
571
0
                nestedhvm_vcpu_in_guestmode(v)
572
0
                ? &vcpu_nestedhvm(v).nv_n2asid : &v->arch.hvm_vcpu.n1asid);
573
0
        break;
574
0
    case 4:
575
0
        value = HVM_CR4_HOST_MASK;
576
0
        if ( paging_mode_hap(v->domain) )
577
0
            value &= ~X86_CR4_PAE;
578
0
        value |= v->arch.hvm_vcpu.guest_cr[4];
579
0
580
0
        if ( !hvm_paging_enabled(v) )
581
0
        {
582
0
            /*
583
0
             * When the guest thinks paging is disabled, Xen may need to hide
584
0
             * the effects of shadow paging, as hardware runs with the host
585
0
             * paging settings, rather than the guest's settings.
586
0
             *
587
0
             * Without CR0.PG, all memory accesses are user mode, so
588
0
             * _PAGE_USER must be set in the shadow pagetables for guest
589
0
             * userspace to function.  This in turn trips up guest supervisor
590
0
             * mode if SMEP/SMAP are left active in context.  They wouldn't
591
0
             * have any effect if paging was actually disabled, so hide them
592
0
             * behind the back of the guest.
593
0
             */
594
0
            value &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
595
0
        }
596
0
597
0
        vmcb_set_cr4(vmcb, value);
598
0
        break;
599
0
    default:
600
0
        BUG();
601
0
    }
602
0
}
603
604
static void svm_update_guest_efer(struct vcpu *v)
605
0
{
606
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
607
0
    bool_t lma = !!(v->arch.hvm_vcpu.guest_efer & EFER_LMA);
608
0
    uint64_t new_efer;
609
0
610
0
    new_efer = (v->arch.hvm_vcpu.guest_efer | EFER_SVME) & ~EFER_LME;
611
0
    if ( lma )
612
0
        new_efer |= EFER_LME;
613
0
    vmcb_set_efer(vmcb, new_efer);
614
0
}
615
616
static void svm_update_guest_vendor(struct vcpu *v)
617
0
{
618
0
    struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;
619
0
    struct vmcb_struct *vmcb = arch_svm->vmcb;
620
0
    u32 bitmap = vmcb_get_exception_intercepts(vmcb);
621
0
622
0
    if ( opt_hvm_fep ||
623
0
         (v->domain->arch.cpuid->x86_vendor != boot_cpu_data.x86_vendor) )
624
0
        bitmap |= (1U << TRAP_invalid_op);
625
0
    else
626
0
        bitmap &= ~(1U << TRAP_invalid_op);
627
0
628
0
    vmcb_set_exception_intercepts(vmcb, bitmap);
629
0
}
630
631
static void svm_sync_vmcb(struct vcpu *v)
632
0
{
633
0
    struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;
634
0
635
0
    if ( arch_svm->vmcb_in_sync )
636
0
        return;
637
0
638
0
    arch_svm->vmcb_in_sync = 1;
639
0
640
0
    svm_vmsave(arch_svm->vmcb);
641
0
}
642
643
static unsigned int svm_get_cpl(struct vcpu *v)
644
0
{
645
0
    return vmcb_get_cpl(v->arch.hvm_svm.vmcb);
646
0
}
647
648
static void svm_get_segment_register(struct vcpu *v, enum x86_segment seg,
649
                                     struct segment_register *reg)
650
0
{
651
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
652
0
653
0
    ASSERT((v == current) || !vcpu_runnable(v));
654
0
655
0
    switch ( seg )
656
0
    {
657
0
    case x86_seg_fs ... x86_seg_gs:
658
0
        svm_sync_vmcb(v);
659
0
660
0
        /* Fallthrough. */
661
0
    case x86_seg_es ... x86_seg_ds:
662
0
        *reg = vmcb->sreg[seg];
663
0
664
0
        if ( seg == x86_seg_ss )
665
0
            reg->dpl = vmcb_get_cpl(vmcb);
666
0
        break;
667
0
668
0
    case x86_seg_tr:
669
0
        svm_sync_vmcb(v);
670
0
        *reg = vmcb->tr;
671
0
        break;
672
0
673
0
    case x86_seg_gdtr:
674
0
        *reg = vmcb->gdtr;
675
0
        break;
676
0
677
0
    case x86_seg_idtr:
678
0
        *reg = vmcb->idtr;
679
0
        break;
680
0
681
0
    case x86_seg_ldtr:
682
0
        svm_sync_vmcb(v);
683
0
        *reg = vmcb->ldtr;
684
0
        break;
685
0
686
0
    default:
687
0
        ASSERT_UNREACHABLE();
688
0
        domain_crash(v->domain);
689
0
        *reg = (struct segment_register){};
690
0
    }
691
0
}
692
693
static void svm_set_segment_register(struct vcpu *v, enum x86_segment seg,
694
                                     struct segment_register *reg)
695
0
{
696
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
697
0
    bool sync = false;
698
0
699
0
    ASSERT((v == current) || !vcpu_runnable(v));
700
0
701
0
    switch ( seg )
702
0
    {
703
0
    case x86_seg_cs:
704
0
    case x86_seg_ds:
705
0
    case x86_seg_es:
706
0
    case x86_seg_ss: /* cpl */
707
0
        vmcb->cleanbits.fields.seg = 0;
708
0
        break;
709
0
710
0
    case x86_seg_gdtr:
711
0
    case x86_seg_idtr:
712
0
        vmcb->cleanbits.fields.dt = 0;
713
0
        break;
714
0
715
0
    case x86_seg_fs:
716
0
    case x86_seg_gs:
717
0
    case x86_seg_tr:
718
0
    case x86_seg_ldtr:
719
0
        sync = (v == current);
720
0
        break;
721
0
722
0
    default:
723
0
        ASSERT_UNREACHABLE();
724
0
        domain_crash(v->domain);
725
0
        return;
726
0
    }
727
0
728
0
    if ( sync )
729
0
        svm_sync_vmcb(v);
730
0
731
0
    switch ( seg )
732
0
    {
733
0
    case x86_seg_ss:
734
0
        vmcb_set_cpl(vmcb, reg->dpl);
735
0
736
0
        /* Fallthrough */
737
0
    case x86_seg_es ... x86_seg_cs:
738
0
    case x86_seg_ds ... x86_seg_gs:
739
0
        vmcb->sreg[seg] = *reg;
740
0
        break;
741
0
742
0
    case x86_seg_tr:
743
0
        vmcb->tr = *reg;
744
0
        break;
745
0
746
0
    case x86_seg_gdtr:
747
0
        vmcb->gdtr.base = reg->base;
748
0
        vmcb->gdtr.limit = reg->limit;
749
0
        break;
750
0
751
0
    case x86_seg_idtr:
752
0
        vmcb->idtr.base = reg->base;
753
0
        vmcb->idtr.limit = reg->limit;
754
0
        break;
755
0
756
0
    case x86_seg_ldtr:
757
0
        vmcb->ldtr = *reg;
758
0
        break;
759
0
760
0
    case x86_seg_none:
761
0
        ASSERT_UNREACHABLE();
762
0
        break;
763
0
    }
764
0
765
0
    if ( sync )
766
0
        svm_vmload(vmcb);
767
0
}
768
769
static unsigned long svm_get_shadow_gs_base(struct vcpu *v)
770
0
{
771
0
    return v->arch.hvm_svm.vmcb->kerngsbase;
772
0
}
773
774
static int svm_set_guest_pat(struct vcpu *v, u64 gpat)
775
0
{
776
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
777
0
778
0
    if ( !paging_mode_hap(v->domain) )
779
0
        return 0;
780
0
781
0
    vmcb_set_g_pat(vmcb, gpat);
782
0
    return 1;
783
0
}
784
785
static int svm_get_guest_pat(struct vcpu *v, u64 *gpat)
786
0
{
787
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
788
0
789
0
    if ( !paging_mode_hap(v->domain) )
790
0
        return 0;
791
0
792
0
    *gpat = vmcb_get_g_pat(vmcb);
793
0
    return 1;
794
0
}
795
796
static uint64_t scale_tsc(uint64_t host_tsc, uint64_t ratio)
797
0
{
798
0
    uint64_t mult, frac, scaled_host_tsc;
799
0
800
0
    if ( ratio == DEFAULT_TSC_RATIO )
801
0
        return host_tsc;
802
0
803
0
    /*
804
0
     * Suppose the most significant 32 bits of host_tsc and ratio are
805
0
     * tsc_h and mult, and the least 32 bits of them are tsc_l and frac,
806
0
     * then
807
0
     *     host_tsc * ratio * 2^-32
808
0
     *     = host_tsc * (mult * 2^32 + frac) * 2^-32
809
0
     *     = host_tsc * mult + (tsc_h * 2^32 + tsc_l) * frac * 2^-32
810
0
     *     = host_tsc * mult + tsc_h * frac + ((tsc_l * frac) >> 32)
811
0
     *
812
0
     * Multiplications in the last two terms are between 32-bit integers,
813
0
     * so both of them can fit in 64-bit integers.
814
0
     *
815
0
     * Because mult is usually less than 10 in practice, it's very rare
816
0
     * that host_tsc * mult can overflow a 64-bit integer.
817
0
     */
818
0
    mult = ratio >> 32;
819
0
    frac = ratio & ((1ULL << 32) - 1);
820
0
    scaled_host_tsc  = host_tsc * mult;
821
0
    scaled_host_tsc += (host_tsc >> 32) * frac;
822
0
    scaled_host_tsc += ((host_tsc & ((1ULL << 32) - 1)) * frac) >> 32;
823
0
824
0
    return scaled_host_tsc;
825
0
}
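In other words, scale_tsc() computes host_tsc * ratio / 2^32 without a 128-bit multiply. A minimal self-check of that equivalence (illustrative only, not part of svm.c; assumes a compiler providing unsigned __int128, e.g. GCC or Clang on x86-64):

#include <assert.h>
#include <stdint.h>

static void example_scale_tsc_check(uint64_t host_tsc, uint64_t ratio)
{
    uint64_t mult = ratio >> 32;
    uint64_t frac = ratio & ((1ULL << 32) - 1);

    /* The three-term split used by scale_tsc() above. */
    uint64_t split = host_tsc * mult
                     + (host_tsc >> 32) * frac
                     + (((host_tsc & ((1ULL << 32) - 1)) * frac) >> 32);

    /* Reference: full 128-bit product, dropping the 32 fractional bits. */
    uint64_t ref = (uint64_t)(((unsigned __int128)host_tsc * ratio) >> 32);

    assert(split == ref);
}

A ratio of 0x180000000 (1.5 in the 32.32 fixed-point format) therefore scales the observed TSC by one and a half.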
826
827
static uint64_t svm_get_tsc_offset(uint64_t host_tsc, uint64_t guest_tsc,
828
    uint64_t ratio)
829
0
{
830
0
    return guest_tsc - scale_tsc(host_tsc, ratio);
831
0
}
832
833
static void svm_set_tsc_offset(struct vcpu *v, u64 offset, u64 at_tsc)
834
0
{
835
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
836
0
    struct vmcb_struct *n1vmcb, *n2vmcb;
837
0
    uint64_t n2_tsc_offset = 0;
838
0
    struct domain *d = v->domain;
839
0
840
0
    if ( !nestedhvm_enabled(d) ) {
841
0
        vmcb_set_tsc_offset(vmcb, offset);
842
0
        return;
843
0
    }
844
0
845
0
    n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx;
846
0
    n2vmcb = vcpu_nestedhvm(v).nv_n2vmcx;
847
0
848
0
    if ( nestedhvm_vcpu_in_guestmode(v) ) {
849
0
        struct nestedsvm *svm = &vcpu_nestedsvm(v);
850
0
851
0
        n2_tsc_offset = vmcb_get_tsc_offset(n2vmcb) -
852
0
                        vmcb_get_tsc_offset(n1vmcb);
853
0
        if ( svm->ns_tscratio != DEFAULT_TSC_RATIO ) {
854
0
            uint64_t guest_tsc = hvm_get_guest_tsc_fixed(v, at_tsc);
855
0
856
0
            n2_tsc_offset = svm_get_tsc_offset(guest_tsc,
857
0
                                               guest_tsc + n2_tsc_offset,
858
0
                                               svm->ns_tscratio);
859
0
        }
860
0
        vmcb_set_tsc_offset(n1vmcb, offset);
861
0
    }
862
0
863
0
    vmcb_set_tsc_offset(vmcb, offset + n2_tsc_offset);
864
0
}
865
866
static void svm_set_rdtsc_exiting(struct vcpu *v, bool_t enable)
867
0
{
868
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
869
0
    u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb);
870
0
    u32 general2_intercepts = vmcb_get_general2_intercepts(vmcb);
871
0
872
0
    general1_intercepts &= ~GENERAL1_INTERCEPT_RDTSC;
873
0
    general2_intercepts &= ~GENERAL2_INTERCEPT_RDTSCP;
874
0
875
0
    if ( enable )
876
0
    {
877
0
        general1_intercepts |= GENERAL1_INTERCEPT_RDTSC;
878
0
        general2_intercepts |= GENERAL2_INTERCEPT_RDTSCP;
879
0
    }
880
0
881
0
    vmcb_set_general1_intercepts(vmcb, general1_intercepts);
882
0
    vmcb_set_general2_intercepts(vmcb, general2_intercepts);
883
0
}
884
885
static void svm_set_descriptor_access_exiting(struct vcpu *v, bool enable)
886
0
{
887
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
888
0
    u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb);
889
0
    u32 mask = GENERAL1_INTERCEPT_IDTR_READ | GENERAL1_INTERCEPT_GDTR_READ
890
0
            | GENERAL1_INTERCEPT_LDTR_READ | GENERAL1_INTERCEPT_TR_READ
891
0
            | GENERAL1_INTERCEPT_IDTR_WRITE | GENERAL1_INTERCEPT_GDTR_WRITE
892
0
            | GENERAL1_INTERCEPT_LDTR_WRITE | GENERAL1_INTERCEPT_TR_WRITE;
893
0
894
0
    if ( enable )
895
0
        general1_intercepts |= mask;
896
0
    else
897
0
        general1_intercepts &= ~mask;
898
0
899
0
    vmcb_set_general1_intercepts(vmcb, general1_intercepts);
900
0
}
901
902
static unsigned int svm_get_insn_bytes(struct vcpu *v, uint8_t *buf)
903
0
{
904
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
905
0
    unsigned int len = v->arch.hvm_svm.cached_insn_len;
906
0
907
0
    if ( len != 0 )
908
0
    {
909
0
        /* Latch and clear the cached instruction. */
910
0
        memcpy(buf, vmcb->guest_ins, MAX_INST_LEN);
911
0
        v->arch.hvm_svm.cached_insn_len = 0;
912
0
    }
913
0
914
0
    return len;
915
0
}
916
917
static void svm_init_hypercall_page(struct domain *d, void *hypercall_page)
918
0
{
919
0
    char *p;
920
0
    int i;
921
0
922
0
    for ( i = 0; i < (PAGE_SIZE / 32); i++ )
923
0
    {
924
0
        if ( i == __HYPERVISOR_iret )
925
0
            continue;
926
0
927
0
        p = (char *)(hypercall_page + (i * 32));
928
0
        *(u8  *)(p + 0) = 0xb8; /* mov imm32, %eax */
929
0
        *(u32 *)(p + 1) = i;
930
0
        *(u8  *)(p + 5) = 0x0f; /* vmmcall */
931
0
        *(u8  *)(p + 6) = 0x01;
932
0
        *(u8  *)(p + 7) = 0xd9;
933
0
        *(u8  *)(p + 8) = 0xc3; /* ret */
934
0
    }
935
0
936
0
    /* Don't support HYPERVISOR_iret at the moment */
937
0
    *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
938
0
}
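Decoded, the nine meaningful bytes written into each 32-byte slot form a "mov $i, %eax; vmmcall; ret" stub. The array below is illustrative only (not part of svm.c) and shows the encoding with the immediate left as zero:

#include <stdint.h>

static const uint8_t example_vmmcall_stub[] = {
    0xb8, 0x00, 0x00, 0x00, 0x00,   /* mov $imm32, %eax (hypercall number i) */
    0x0f, 0x01, 0xd9,               /* vmmcall                               */
    0xc3,                           /* ret                                   */
};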
939
940
static void svm_lwp_interrupt(struct cpu_user_regs *regs)
941
0
{
942
0
    struct vcpu *curr = current;
943
0
944
0
    ack_APIC_irq();
945
0
    vlapic_set_irq(
946
0
        vcpu_vlapic(curr),
947
0
        (curr->arch.hvm_svm.guest_lwp_cfg >> 40) & 0xff,
948
0
        0);
949
0
}
950
951
static inline void svm_lwp_save(struct vcpu *v)
952
0
{
953
0
    /* Don't disturb other guests' state.  Disable LWP before the next VCPU runs. */
954
0
    if ( v->arch.hvm_svm.guest_lwp_cfg )
955
0
    {
956
0
        wrmsrl(MSR_AMD64_LWP_CFG, 0x0);
957
0
        wrmsrl(MSR_AMD64_LWP_CBADDR, 0x0);
958
0
    }
959
0
}
960
961
static inline void svm_lwp_load(struct vcpu *v)
962
0
{
963
0
    /* Only LWP_CFG is reloaded. LWP_CBADDR will be reloaded via xrstor. */
964
0
   if ( v->arch.hvm_svm.guest_lwp_cfg ) 
965
0
       wrmsrl(MSR_AMD64_LWP_CFG, v->arch.hvm_svm.cpu_lwp_cfg);
966
0
}
967
968
/* Update LWP_CFG MSR (0xc0000105).  Returns -1 on error, 0 otherwise. */
969
static int svm_update_lwp_cfg(struct vcpu *v, uint64_t msr_content)
970
0
{
971
0
    uint32_t msr_low;
972
0
    static uint8_t lwp_intr_vector;
973
0
974
0
    if ( xsave_enabled(v) && cpu_has_lwp )
975
0
    {
976
0
        msr_low = (uint32_t)msr_content;
977
0
        
978
0
        /* generate #GP if guest tries to turn on unsupported features. */
979
0
        if ( msr_low & ~v->domain->arch.cpuid->extd.raw[0x1c].d )
980
0
            return -1;
981
0
982
0
        v->arch.hvm_svm.guest_lwp_cfg = msr_content;
983
0
984
0
        /* setup interrupt handler if needed */
985
0
        if ( (msr_content & 0x80000000) && ((msr_content >> 40) & 0xff) )
986
0
        {
987
0
            alloc_direct_apic_vector(&lwp_intr_vector, svm_lwp_interrupt);
988
0
            v->arch.hvm_svm.cpu_lwp_cfg = (msr_content & 0xffff00ffffffffffULL)
989
0
                | ((uint64_t)lwp_intr_vector << 40);
990
0
        }
991
0
        else
992
0
        {
993
0
            /* otherwise disable it */
994
0
            v->arch.hvm_svm.cpu_lwp_cfg = msr_content & 0xffff00ff7fffffffULL;
995
0
        }
996
0
        
997
0
        wrmsrl(MSR_AMD64_LWP_CFG, v->arch.hvm_svm.cpu_lwp_cfg);
998
0
999
0
        /* Track non-lazy xstate use if LWP_CFG is non-zero. */
1000
0
        v->arch.nonlazy_xstate_used = !!(msr_content);
1001
0
    }
1002
0
1003
0
    return 0;
1004
0
}
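Note how svm_update_lwp_cfg() keeps the guest's requested configuration in guest_lwp_cfg but substitutes Xen's own vector into bits 47:40 of the value actually written to MSR_AMD64_LWP_CFG, so the threshold interrupt arrives at Xen and svm_lwp_interrupt() then forwards the guest's chosen vector via the vlapic. A minimal sketch of that substitution (hypothetical helper, not part of svm.c):

#include <stdint.h>

static uint64_t example_lwp_cfg_for_hw(uint64_t guest_cfg, uint8_t xen_vector)
{
    /* Clear bits 47:40 (the LWP interrupt vector) and insert Xen's vector. */
    return (guest_cfg & 0xffff00ffffffffffULL) | ((uint64_t)xen_vector << 40);
}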
1005
1006
static inline void svm_tsc_ratio_save(struct vcpu *v)
1007
0
{
1008
0
    /* Other vcpus might not have vtsc enabled. So disable TSC_RATIO here. */
1009
0
    if ( cpu_has_tsc_ratio && !v->domain->arch.vtsc )
1010
0
        wrmsrl(MSR_AMD64_TSC_RATIO, DEFAULT_TSC_RATIO);
1011
0
}
1012
1013
static inline void svm_tsc_ratio_load(struct vcpu *v)
1014
0
{
1015
0
    if ( cpu_has_tsc_ratio && !v->domain->arch.vtsc ) 
1016
0
        wrmsrl(MSR_AMD64_TSC_RATIO, hvm_tsc_scaling_ratio(v->domain));
1017
0
}
1018
1019
static void svm_ctxt_switch_from(struct vcpu *v)
1020
0
{
1021
0
    int cpu = smp_processor_id();
1022
0
1023
0
    /*
1024
0
     * Return early if trying to do a context switch without SVM enabled;
1025
0
     * this can happen when the hypervisor shuts down with HVM guests
1026
0
     * still running.
1027
0
     */
1028
0
    if ( unlikely((read_efer() & EFER_SVME) == 0) )
1029
0
        return;
1030
0
1031
0
    svm_fpu_leave(v);
1032
0
1033
0
    svm_save_dr(v);
1034
0
    svm_lwp_save(v);
1035
0
    svm_tsc_ratio_save(v);
1036
0
1037
0
    svm_sync_vmcb(v);
1038
0
    svm_vmload_pa(per_cpu(host_vmcb, cpu));
1039
0
1040
0
    /* Resume use of ISTs now that the host TR is reinstated. */
1041
0
    set_ist(&idt_tables[cpu][TRAP_double_fault],  IST_DF);
1042
0
    set_ist(&idt_tables[cpu][TRAP_nmi],           IST_NMI);
1043
0
    set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE);
1044
0
}
1045
1046
static void svm_ctxt_switch_to(struct vcpu *v)
1047
0
{
1048
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1049
0
    int cpu = smp_processor_id();
1050
0
1051
0
    /*
1052
0
     * This is required because VMRUN performs consistency checks, and some of
1053
0
     * the DOM0 selectors point to invalid GDT locations, which would cause AMD
1054
0
     * processors to shut down.
1055
0
     */
1056
0
    asm volatile ("mov %0, %%ds; mov %0, %%es; mov %0, %%ss;" :: "r" (0));
1057
0
1058
0
    /*
1059
0
     * Cannot use ISTs for NMI/#MC/#DF while we are running with the guest TR.
1060
0
     * But this doesn't matter: the IST is only req'd to handle SYSCALL/SYSRET.
1061
0
     */
1062
0
    set_ist(&idt_tables[cpu][TRAP_double_fault],  IST_NONE);
1063
0
    set_ist(&idt_tables[cpu][TRAP_nmi],           IST_NONE);
1064
0
    set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
1065
0
1066
0
    svm_restore_dr(v);
1067
0
1068
0
    svm_vmsave_pa(per_cpu(host_vmcb, cpu));
1069
0
    svm_vmload(vmcb);
1070
0
    vmcb->cleanbits.bytes = 0;
1071
0
    svm_lwp_load(v);
1072
0
    svm_tsc_ratio_load(v);
1073
0
1074
0
    if ( cpu_has_rdtscp )
1075
0
        wrmsrl(MSR_TSC_AUX, hvm_msr_tsc_aux(v));
1076
0
}
1077
1078
static void noreturn svm_do_resume(struct vcpu *v)
1079
0
{
1080
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1081
0
    bool_t debug_state = v->domain->debugger_attached;
1082
0
    bool_t vcpu_guestmode = 0;
1083
0
    struct vlapic *vlapic = vcpu_vlapic(v);
1084
0
1085
0
    if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) )
1086
0
        vcpu_guestmode = 1;
1087
0
1088
0
    if ( !vcpu_guestmode &&
1089
0
        unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
1090
0
    {
1091
0
        uint32_t intercepts = vmcb_get_exception_intercepts(vmcb);
1092
0
1093
0
        v->arch.hvm_vcpu.debug_state_latch = debug_state;
1094
0
        vmcb_set_exception_intercepts(
1095
0
            vmcb, debug_state ? (intercepts | (1U << TRAP_int3))
1096
0
                              : (intercepts & ~(1U << TRAP_int3)));
1097
0
    }
1098
0
1099
0
    if ( v->arch.hvm_svm.launch_core != smp_processor_id() )
1100
0
    {
1101
0
        v->arch.hvm_svm.launch_core = smp_processor_id();
1102
0
        hvm_migrate_timers(v);
1103
0
        hvm_migrate_pirqs(v);
1104
0
        /* Migrating to another ASID domain.  Request a new ASID. */
1105
0
        hvm_asid_flush_vcpu(v);
1106
0
    }
1107
0
1108
0
    if ( !vcpu_guestmode && !vlapic_hw_disabled(vlapic) )
1109
0
    {
1110
0
        vintr_t intr;
1111
0
1112
0
        /* Reflect the vlapic's TPR in the hardware vtpr */
1113
0
        intr = vmcb_get_vintr(vmcb);
1114
0
        intr.fields.tpr =
1115
0
            (vlapic_get_reg(vlapic, APIC_TASKPRI) & 0xFF) >> 4;
1116
0
        vmcb_set_vintr(vmcb, intr);
1117
0
    }
1118
0
1119
0
    hvm_do_resume(v);
1120
0
1121
0
    reset_stack_and_jump(svm_asm_do_resume);
1122
0
}
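The TPR reflection above narrows the vlapic's 8-bit TASKPRI value to the 4-bit priority class expected by the VMCB's virtual-interrupt control; as an illustrative one-liner (not part of svm.c):

#include <stdint.h>

static uint8_t example_vtpr_from_taskpri(uint32_t apic_taskpri)
{
    return (apic_taskpri & 0xff) >> 4;   /* keep only the priority class */
}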
1123
1124
static void svm_guest_osvw_init(struct vcpu *vcpu)
1125
0
{
1126
0
    if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
1127
0
        return;
1128
0
1129
0
    /*
1130
0
     * Guests should see errata 400 and 415 as fixed (assuming that
1131
0
     * HLT and IO instructions are intercepted).
1132
0
     */
1133
0
    vcpu->arch.hvm_svm.osvw.length = (osvw_length >= 3) ? osvw_length : 3;
1134
0
    vcpu->arch.hvm_svm.osvw.status = osvw_status & ~(6ULL);
1135
0
1136
0
    /*
1137
0
     * By increasing VCPU's osvw.length to 3 we are telling the guest that
1138
0
     * all osvw.status bits inside that length, including bit 0 (which is
1139
0
     * reserved for erratum 298), are valid. However, if host processor's
1140
0
     * osvw_length is 0 then osvw_status[0] carries no information. We need to
1141
0
     * be conservative here and therefore we tell the guest that erratum 298
1142
0
     * is present (because we really don't know).
1143
0
     */
1144
0
    if ( osvw_length == 0 && boot_cpu_data.x86 == 0x10 )
1145
0
        vcpu->arch.hvm_svm.osvw.status |= 1;
1146
0
}
1147
1148
void svm_host_osvw_reset()
1149
0
{
1150
0
    spin_lock(&osvw_lock);
1151
0
1152
0
    osvw_length = 64; /* One register (MSRC001_0141) worth of errata */
1153
0
    osvw_status = 0;
1154
0
1155
0
    spin_unlock(&osvw_lock);
1156
0
}
1157
1158
void svm_host_osvw_init()
1159
0
{
1160
0
    spin_lock(&osvw_lock);
1161
0
1162
0
    /*
1163
0
     * Get OSVW bits. If bits are not the same on different processors then
1164
0
     * choose the worst case (i.e. if erratum is present on one processor and
1165
0
     * not on another assume that the erratum is present everywhere).
1166
0
     */
1167
0
    if ( test_bit(X86_FEATURE_OSVW, &boot_cpu_data.x86_capability) )
1168
0
    {
1169
0
        uint64_t len, status;
1170
0
1171
0
        if ( rdmsr_safe(MSR_AMD_OSVW_ID_LENGTH, len) ||
1172
0
             rdmsr_safe(MSR_AMD_OSVW_STATUS, status) )
1173
0
            len = status = 0;
1174
0
1175
0
        if (len < osvw_length)
1176
0
            osvw_length = len;
1177
0
1178
0
        osvw_status |= status;
1179
0
        osvw_status &= (1ULL << osvw_length) - 1;
1180
0
    }
1181
0
    else
1182
0
        osvw_length = osvw_status = 0;
1183
0
1184
0
    spin_unlock(&osvw_lock);
1185
0
}
1186
1187
static int svm_domain_initialise(struct domain *d)
1188
0
{
1189
0
    static const struct arch_csw csw = {
1190
0
        .from = svm_ctxt_switch_from,
1191
0
        .to   = svm_ctxt_switch_to,
1192
0
        .tail = svm_do_resume,
1193
0
    };
1194
0
1195
0
    d->arch.ctxt_switch = &csw;
1196
0
1197
0
    return 0;
1198
0
}
1199
1200
static void svm_domain_destroy(struct domain *d)
1201
0
{
1202
0
}
1203
1204
static int svm_vcpu_initialise(struct vcpu *v)
1205
0
{
1206
0
    int rc;
1207
0
1208
0
    v->arch.hvm_svm.launch_core = -1;
1209
0
1210
0
    if ( (rc = svm_create_vmcb(v)) != 0 )
1211
0
    {
1212
0
        dprintk(XENLOG_WARNING,
1213
0
                "Failed to create VMCB for vcpu %d: err=%d.\n",
1214
0
                v->vcpu_id, rc);
1215
0
        return rc;
1216
0
    }
1217
0
1218
0
    svm_guest_osvw_init(v);
1219
0
1220
0
    return 0;
1221
0
}
1222
1223
static void svm_vcpu_destroy(struct vcpu *v)
1224
0
{
1225
0
    svm_destroy_vmcb(v);
1226
0
    passive_domain_destroy(v);
1227
0
}
1228
1229
/*
1230
 * Emulate enough of interrupt injection to cover the DPL check (omitted by
1231
 * hardware), and to work out whether it is safe to move %rip forwards for
1232
 * architectural trap vs fault semantics in the exception frame (which
1233
 * hardware won't cope with).
1234
 *
1235
 * The event parameter will be modified to a fault if necessary.
1236
 */
1237
static void svm_emul_swint_injection(struct x86_event *event)
1238
0
{
1239
0
    struct vcpu *curr = current;
1240
0
    const struct vmcb_struct *vmcb = curr->arch.hvm_svm.vmcb;
1241
0
    const struct cpu_user_regs *regs = guest_cpu_user_regs();
1242
0
    unsigned int trap = event->vector, type = event->type;
1243
0
    unsigned int fault = TRAP_gp_fault, ec = 0;
1244
0
    pagefault_info_t pfinfo;
1245
0
    struct segment_register cs, idtr;
1246
0
    unsigned int idte_size, idte_offset;
1247
0
    unsigned long idte_linear_addr;
1248
0
    struct { uint32_t a, b, c, d; } idte = {};
1249
0
    bool lm = vmcb_get_efer(vmcb) & EFER_LMA;
1250
0
    int rc;
1251
0
1252
0
    if ( !(vmcb_get_cr0(vmcb) & X86_CR0_PE) )
1253
0
        goto raise_exception; /* TODO: support real-mode injection? */
1254
0
1255
0
    idte_size   = lm ? 16 : 8;
1256
0
    idte_offset = trap * idte_size;
1257
0
1258
0
    /* ICEBP sets the External Event bit despite being an instruction. */
1259
0
    ec = (trap << 3) | X86_XEC_IDT |
1260
0
        (type == X86_EVENTTYPE_PRI_SW_EXCEPTION ? X86_XEC_EXT : 0);
1261
0
1262
0
    /*
1263
0
     * TODO: This does not cover the v8086 mode with CR4.VME case
1264
0
     * correctly, but falls on the safe side from the point of view of a
1265
0
     * 32bit OS.  Someone with many TUITs can see about reading the TSS
1266
0
     * Software Interrupt Redirection bitmap.
1267
0
     */
1268
0
    if ( (regs->eflags & X86_EFLAGS_VM) &&
1269
0
         MASK_EXTR(regs->eflags, X86_EFLAGS_IOPL) != 3 )
1270
0
        goto raise_exception;
1271
0
1272
0
    /*
1273
0
     * Read all 8/16 bytes so the idtr limit check is applied properly to
1274
0
     * this entry, even though we don't look at all the words read.
1275
0
     */
1276
0
    hvm_get_segment_register(curr, x86_seg_cs, &cs);
1277
0
    hvm_get_segment_register(curr, x86_seg_idtr, &idtr);
1278
0
    if ( !hvm_virtual_to_linear_addr(x86_seg_idtr, &idtr, idte_offset,
1279
0
                                     idte_size, hvm_access_read,
1280
0
                                     &cs, &idte_linear_addr) )
1281
0
        goto raise_exception;
1282
0
1283
0
    rc = hvm_copy_from_guest_linear(&idte, idte_linear_addr, idte_size,
1284
0
                                    PFEC_implicit, &pfinfo);
1285
0
    if ( rc )
1286
0
    {
1287
0
        if ( rc == HVMTRANS_bad_linear_to_gfn )
1288
0
        {
1289
0
            fault = TRAP_page_fault;
1290
0
            ec = pfinfo.ec;
1291
0
            event->cr2 = pfinfo.linear;
1292
0
        }
1293
0
1294
0
        goto raise_exception;
1295
0
    }
1296
0
1297
0
    /* This must be an interrupt, trap, or task gate. */
1298
0
    switch ( (idte.b >> 8) & 0x1f )
1299
0
    {
1300
0
    case SYS_DESC_irq_gate:
1301
0
    case SYS_DESC_trap_gate:
1302
0
        break;
1303
0
    case SYS_DESC_irq_gate16:
1304
0
    case SYS_DESC_trap_gate16:
1305
0
    case SYS_DESC_task_gate:
1306
0
        if ( !lm )
1307
0
            break;
1308
0
        /* fall through */
1309
0
    default:
1310
0
        goto raise_exception;
1311
0
    }
1312
0
1313
0
    /* The 64-bit high half's type must be zero. */
1314
0
    if ( idte.d & 0x1f00 )
1315
0
        goto raise_exception;
1316
0
1317
0
    /* ICEBP counts as a hardware event, and bypasses the dpl check. */
1318
0
    if ( type != X86_EVENTTYPE_PRI_SW_EXCEPTION &&
1319
0
         vmcb_get_cpl(vmcb) > ((idte.b >> 13) & 3) )
1320
0
        goto raise_exception;
1321
0
1322
0
    /* Is this entry present? */
1323
0
    if ( !(idte.b & (1u << 15)) )
1324
0
    {
1325
0
        fault = TRAP_no_segment;
1326
0
        goto raise_exception;
1327
0
    }
1328
0
1329
0
    /*
1330
0
     * Any further fault during injection will cause a double fault.  It
1331
0
     * is fine to leave this up to hardware, and software won't be in a
1332
0
     * position to care about the architectural correctness of %rip in the
1333
0
     * exception frame.
1334
0
     */
1335
0
    return;
1336
0
1337
0
 raise_exception:
1338
0
    event->vector = fault;
1339
0
    event->type = X86_EVENTTYPE_HW_EXCEPTION;
1340
0
    event->insn_len = 0;
1341
0
    event->error_code = ec;
1342
0
}
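The gate checks in svm_emul_swint_injection() operate on the raw second word of the IDT entry; the helper below (hypothetical, not part of svm.c) simply names the fields being inspected: gate type, descriptor DPL and the present bit:

#include <stdbool.h>
#include <stdint.h>

struct example_idte_b_fields {
    unsigned int type;   /* bits 12:8  - gate type (interrupt/trap/task)  */
    unsigned int dpl;    /* bits 14:13 - descriptor privilege level       */
    bool present;        /* bit  15    - segment present flag             */
};

static struct example_idte_b_fields example_decode_idte_b(uint32_t b)
{
    return (struct example_idte_b_fields){
        .type    = (b >> 8) & 0x1f,
        .dpl     = (b >> 13) & 3,
        .present = (b >> 15) & 1,
    };
}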
1343
1344
static void svm_inject_event(const struct x86_event *event)
1345
0
{
1346
0
    struct vcpu *curr = current;
1347
0
    struct vmcb_struct *vmcb = curr->arch.hvm_svm.vmcb;
1348
0
    eventinj_t eventinj = vmcb->eventinj;
1349
0
    struct x86_event _event = *event;
1350
0
    struct cpu_user_regs *regs = guest_cpu_user_regs();
1351
0
1352
0
    /*
1353
0
     * For hardware lacking NRips support, and always for ICEBP instructions,
1354
0
     * the processor requires extra help to deliver software events.
1355
0
     *
1356
0
     * Xen must emulate enough of the event injection to be sure that a
1357
0
     * further fault shouldn't occur during delivery.  This covers the fact
1358
0
     * that hardware doesn't perform DPL checking on injection.
1359
0
     *
1360
0
     * Also, it accounts for proper positioning of %rip for an event with trap
1361
0
     * semantics (where %rip should point after the instruction) which suffers
1362
0
     * a fault during injection (at which point %rip should point at the
1363
0
     * instruction).
1364
0
     */
1365
0
    if ( event->type == X86_EVENTTYPE_PRI_SW_EXCEPTION ||
1366
0
         (!cpu_has_svm_nrips && (event->type == X86_EVENTTYPE_SW_INTERRUPT ||
1367
0
                                 event->type == X86_EVENTTYPE_SW_EXCEPTION)) )
1368
0
        svm_emul_swint_injection(&_event);
1369
0
1370
0
    switch ( _event.vector )
1371
0
    {
1372
0
    case TRAP_debug:
1373
0
        if ( regs->eflags & X86_EFLAGS_TF )
1374
0
        {
1375
0
            __restore_debug_registers(vmcb, curr);
1376
0
            vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | 0x4000);
1377
0
        }
1378
0
        /* fall through */
1379
0
    case TRAP_int3:
1380
0
        if ( curr->domain->debugger_attached )
1381
0
        {
1382
0
            /* Debug/Int3: Trap to debugger. */
1383
0
            domain_pause_for_debugger();
1384
0
            return;
1385
0
        }
1386
0
    }
1387
0
1388
0
    if ( unlikely(eventinj.fields.v) &&
1389
0
         (eventinj.fields.type == X86_EVENTTYPE_HW_EXCEPTION) )
1390
0
    {
1391
0
        _event.vector = hvm_combine_hw_exceptions(
1392
0
            eventinj.fields.vector, _event.vector);
1393
0
        if ( _event.vector == TRAP_double_fault )
1394
0
            _event.error_code = 0;
1395
0
    }
1396
0
1397
0
    eventinj.bytes = 0;
1398
0
    eventinj.fields.v = 1;
1399
0
    eventinj.fields.vector = _event.vector;
1400
0
1401
0
    /*
1402
0
     * Refer to AMD Vol 2: System Programming, 15.20 Event Injection.
1403
0
     *
1404
0
     * On hardware lacking NextRIP support, and all hardware in the case of
1405
0
     * icebp, software events with trap semantics need emulating, so %rip in
1406
0
     * the trap frame points after the instruction.
1407
0
     *
1408
0
     * The x86 emulator (if requested by the x86_swint_emulate_* choice) will
1409
0
     * have performed checks such as presence/dpl/etc and believes that the
1410
0
     * event injection will succeed without faulting.
1411
0
     *
1412
0
     * The x86 emulator will always provide fault semantics for software
1413
0
     * events, with _event.insn_len set appropriately.  If the injection
1414
0
     * requires emulation, move %rip forwards at this point.
1415
0
     */
1416
0
    switch ( _event.type )
1417
0
    {
1418
0
    case X86_EVENTTYPE_SW_INTERRUPT: /* int $n */
1419
0
        if ( cpu_has_svm_nrips )
1420
0
            vmcb->nextrip = regs->rip + _event.insn_len;
1421
0
        else
1422
0
            regs->rip += _event.insn_len;
1423
0
        eventinj.fields.type = X86_EVENTTYPE_SW_INTERRUPT;
1424
0
        break;
1425
0
1426
0
    case X86_EVENTTYPE_PRI_SW_EXCEPTION: /* icebp */
1427
0
        /*
1428
0
         * icebp's injection must always be emulated, as hardware does not
1429
0
         * special case HW_EXCEPTION with vector 1 (#DB) as having trap
1430
0
         * semantics.
1431
0
         */
1432
0
        regs->rip += _event.insn_len;
1433
0
        if ( cpu_has_svm_nrips )
1434
0
            vmcb->nextrip = regs->rip;
1435
0
        eventinj.fields.type = X86_EVENTTYPE_HW_EXCEPTION;
1436
0
        break;
1437
0
1438
0
    case X86_EVENTTYPE_SW_EXCEPTION: /* int3, into */
1439
0
        /*
1440
0
         * Hardware special cases HW_EXCEPTION with vectors 3 and 4 as having
1441
0
         * trap semantics, and will perform DPL checks.
1442
0
         */
1443
0
        if ( cpu_has_svm_nrips )
1444
0
            vmcb->nextrip = regs->rip + _event.insn_len;
1445
0
        else
1446
0
            regs->rip += _event.insn_len;
1447
0
        eventinj.fields.type = X86_EVENTTYPE_HW_EXCEPTION;
1448
0
        break;
1449
0
1450
0
    default:
1451
0
        eventinj.fields.type = X86_EVENTTYPE_HW_EXCEPTION;
1452
0
        eventinj.fields.ev = (_event.error_code != X86_EVENT_NO_EC);
1453
0
        eventinj.fields.errorcode = _event.error_code;
1454
0
        break;
1455
0
    }
1456
0
1457
0
    /*
1458
0
     * If injecting an event outside of 64bit mode, zero the upper bits of the
1459
0
     * %eip and nextrip after the adjustments above.
1460
0
     */
1461
0
    if ( !((vmcb_get_efer(vmcb) & EFER_LMA) && vmcb->cs.l) )
1462
0
    {
1463
0
        regs->rip = regs->eip;
1464
0
        vmcb->nextrip = (uint32_t)vmcb->nextrip;
1465
0
    }
1466
0
1467
0
    ASSERT(!eventinj.fields.ev ||
1468
0
           eventinj.fields.errorcode == (uint16_t)eventinj.fields.errorcode);
1469
0
    vmcb->eventinj = eventinj;
1470
0
1471
0
    if ( _event.vector == TRAP_page_fault )
1472
0
    {
1473
0
        curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2;
1474
0
        vmcb_set_cr2(vmcb, _event.cr2);
1475
0
        HVMTRACE_LONG_2D(PF_INJECT, _event.error_code, TRC_PAR_LONG(_event.cr2));
1476
0
    }
1477
0
    else
1478
0
    {
1479
0
        HVMTRACE_2D(INJ_EXC, _event.vector, _event.error_code);
1480
0
    }
1481
0
}
1482
1483
static int svm_event_pending(struct vcpu *v)
1484
0
{
1485
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1486
0
    return vmcb->eventinj.fields.v;
1487
0
}
1488
1489
static void svm_cpu_dead(unsigned int cpu)
1490
0
{
1491
0
    paddr_t *this_hsa = &per_cpu(hsa, cpu);
1492
0
    paddr_t *this_vmcb = &per_cpu(host_vmcb, cpu);
1493
0
1494
0
    if ( *this_hsa )
1495
0
    {
1496
0
        free_domheap_page(maddr_to_page(*this_hsa));
1497
0
        *this_hsa = 0;
1498
0
    }
1499
0
1500
0
    if ( *this_vmcb )
1501
0
    {
1502
0
        free_domheap_page(maddr_to_page(*this_vmcb));
1503
0
        *this_vmcb = 0;
1504
0
    }
1505
0
}
1506
1507
static int svm_cpu_up_prepare(unsigned int cpu)
1508
0
{
1509
0
    paddr_t *this_hsa = &per_cpu(hsa, cpu);
1510
0
    paddr_t *this_vmcb = &per_cpu(host_vmcb, cpu);
1511
0
    nodeid_t node = cpu_to_node(cpu);
1512
0
    unsigned int memflags = 0;
1513
0
    struct page_info *pg;
1514
0
1515
0
    if ( node != NUMA_NO_NODE )
1516
0
        memflags = MEMF_node(node);
1517
0
1518
0
    if ( !*this_hsa )
1519
0
    {
1520
0
        pg = alloc_domheap_page(NULL, memflags);
1521
0
        if ( !pg )
1522
0
            goto err;
1523
0
1524
0
        clear_domain_page(_mfn(page_to_mfn(pg)));
1525
0
        *this_hsa = page_to_maddr(pg);
1526
0
    }
1527
0
1528
0
    if ( !*this_vmcb )
1529
0
    {
1530
0
        pg = alloc_domheap_page(NULL, memflags);
1531
0
        if ( !pg )
1532
0
            goto err;
1533
0
1534
0
        clear_domain_page(_mfn(page_to_mfn(pg)));
1535
0
        *this_vmcb = page_to_maddr(pg);
1536
0
    }
1537
0
1538
0
    return 0;
1539
0
1540
0
 err:
1541
0
    svm_cpu_dead(cpu);
1542
0
    return -ENOMEM;
1543
0
}
1544
1545
static void svm_init_erratum_383(const struct cpuinfo_x86 *c)
1546
0
{
1547
0
    uint64_t msr_content;
1548
0
1549
0
    /* check whether CPU is affected */
1550
0
    if ( !cpu_has_amd_erratum(c, AMD_ERRATUM_383) )
1551
0
        return;
1552
0
1553
0
    /* use safe methods to be compatible with nested virtualization */
1554
0
    if (rdmsr_safe(MSR_AMD64_DC_CFG, msr_content) == 0 &&
1555
0
        wrmsr_safe(MSR_AMD64_DC_CFG, msr_content | (1ULL << 47)) == 0)
1556
0
    {
1557
0
        amd_erratum383_found = 1;
1558
0
    } else {
1559
0
        printk("Failed to enable erratum 383\n");
1560
0
    }
1561
0
}
1562
1563
static int svm_handle_osvw(struct vcpu *v, uint32_t msr, uint64_t *val, bool_t read)
1564
0
{
1565
0
    if ( !v->domain->arch.cpuid->extd.osvw )
1566
0
        return -1;
1567
0
1568
0
    if ( read )
1569
0
    {
1570
0
        if (msr == MSR_AMD_OSVW_ID_LENGTH)
1571
0
            *val = v->arch.hvm_svm.osvw.length;
1572
0
        else
1573
0
            *val = v->arch.hvm_svm.osvw.status;
1574
0
    }
1575
0
    /* Writes are ignored */
1576
0
1577
0
    return 0;
1578
0
}
1579
1580
static int _svm_cpu_up(bool bsp)
1581
0
{
1582
0
    uint64_t msr_content;
1583
0
    int rc;
1584
0
    unsigned int cpu = smp_processor_id();
1585
0
    const struct cpuinfo_x86 *c = &cpu_data[cpu];
1586
0
 
1587
0
    /* Check whether SVM feature is disabled in BIOS */
1588
0
    rdmsrl(MSR_K8_VM_CR, msr_content);
1589
0
    if ( msr_content & K8_VMCR_SVME_DISABLE )
1590
0
    {
1591
0
        printk("CPU%d: AMD SVM Extension is disabled in BIOS.\n", cpu);
1592
0
        return -EINVAL;
1593
0
    }
1594
0
1595
0
    if ( bsp && (rc = svm_cpu_up_prepare(cpu)) != 0 )
1596
0
        return rc;
1597
0
1598
0
    write_efer(read_efer() | EFER_SVME);
1599
0
1600
0
    /* Initialize the HSA for this core. */
1601
0
    wrmsrl(MSR_K8_VM_HSAVE_PA, per_cpu(hsa, cpu));
1602
0
1603
0
    /* check for erratum 383 */
1604
0
    svm_init_erratum_383(c);
1605
0
1606
0
    /* Initialize core's ASID handling. */
1607
0
    svm_asid_init(c);
1608
0
1609
0
    /*
1610
0
     * Check whether EFER.LMSLE can be written.
1611
0
     * Unfortunately there's no feature bit defined for this.
1612
0
     */
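    /* (EFER.LMSLE = Long Mode Segment Limit Enable, an AMD-only feature with
     *  no CPUID bit, hence the probe by writing it and reading it back.) */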
1613
0
    msr_content = read_efer();
1614
0
    if ( wrmsr_safe(MSR_EFER, msr_content | EFER_LMSLE) == 0 )
1615
0
        rdmsrl(MSR_EFER, msr_content);
1616
0
    if ( msr_content & EFER_LMSLE )
1617
0
    {
1618
0
        if ( 0 && /* FIXME: Migration! */ bsp )
1619
0
            cpu_has_lmsl = 1;
1620
0
        wrmsrl(MSR_EFER, msr_content ^ EFER_LMSLE);
1621
0
    }
1622
0
    else
1623
0
    {
1624
0
        if ( cpu_has_lmsl )
1625
0
            printk(XENLOG_WARNING "Inconsistent LMSLE support across CPUs!\n");
1626
0
        cpu_has_lmsl = 0;
1627
0
    }
1628
0
1629
0
    /* Initialize OSVW bits to be used by guests */
1630
0
    svm_host_osvw_init();
1631
0
1632
0
    return 0;
1633
0
}
1634
1635
static int svm_cpu_up(void)
1636
0
{
1637
0
    return _svm_cpu_up(false);
1638
0
}
1639
1640
const struct hvm_function_table * __init start_svm(void)
1641
0
{
1642
0
    bool_t printed = 0;
1643
0
1644
0
    svm_host_osvw_reset();
1645
0
1646
0
    if ( _svm_cpu_up(true) )
1647
0
    {
1648
0
        printk("SVM: failed to initialise.\n");
1649
0
        return NULL;
1650
0
    }
1651
0
1652
0
    setup_vmcb_dump();
1653
0
1654
0
    svm_feature_flags = (current_cpu_data.extended_cpuid_level >= 0x8000000A ?
1655
0
                         cpuid_edx(0x8000000A) : 0);
1656
0
1657
0
    printk("SVM: Supported advanced features:\n");
1658
0
1659
0
    /* DecodeAssists fast paths assume nextrip is valid for fast rIP update. */
1660
0
    if ( !cpu_has_svm_nrips )
1661
0
        clear_bit(SVM_FEATURE_DECODEASSISTS, &svm_feature_flags);
1662
0
1663
0
    if ( cpu_has_tsc_ratio )
1664
0
        svm_function_table.tsc_scaling.ratio_frac_bits = 32;
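    /* (The TSC Ratio MSR holds a fixed-point multiplier with 32 fractional
     *  bits and an 8-bit integer part, hence ratio_frac_bits = 32 above.) */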
1665
0
1666
0
#define P(p,s) if ( p ) { printk(" - %s\n", s); printed = 1; }
1667
0
    P(cpu_has_svm_npt, "Nested Page Tables (NPT)");
1668
0
    P(cpu_has_svm_lbrv, "Last Branch Record (LBR) Virtualisation");
1669
0
    P(cpu_has_svm_nrips, "Next-RIP Saved on #VMEXIT");
1670
0
    P(cpu_has_svm_cleanbits, "VMCB Clean Bits");
1671
0
    P(cpu_has_svm_decode, "DecodeAssists");
1672
0
    P(cpu_has_pause_filter, "Pause-Intercept Filter");
1673
0
    P(cpu_has_tsc_ratio, "TSC Rate MSR");
1674
0
#undef P
1675
0
1676
0
    if ( !printed )
1677
0
        printk(" - none\n");
1678
0
1679
0
    svm_function_table.hap_supported = !!cpu_has_svm_npt;
1680
0
    svm_function_table.hap_capabilities = HVM_HAP_SUPERPAGE_2MB |
1681
0
        (cpu_has_page1gb ? HVM_HAP_SUPERPAGE_1GB : 0);
1682
0
1683
0
    return &svm_function_table;
1684
0
}
1685
1686
static void svm_do_nested_pgfault(struct vcpu *v,
1687
    struct cpu_user_regs *regs, uint64_t pfec, paddr_t gpa)
1688
0
{
1689
0
    int ret;
1690
0
    unsigned long gfn = gpa >> PAGE_SHIFT;
1691
0
    mfn_t mfn;
1692
0
    p2m_type_t p2mt;
1693
0
    p2m_access_t p2ma;
1694
0
    struct p2m_domain *p2m = NULL;
1695
0
1696
0
    /*
1697
0
     * Since HW doesn't explicitly provide a read access bit and we need to
1698
0
     * somehow describe read-modify-write instructions we will conservatively
1699
0
     * set read_access for all memory accesses that are not instruction fetches.
1700
0
     */
1701
0
    struct npfec npfec = {
1702
0
        .read_access = !(pfec & PFEC_insn_fetch),
1703
0
        .write_access = !!(pfec & PFEC_write_access),
1704
0
        .insn_fetch = !!(pfec & PFEC_insn_fetch),
1705
0
        .present = !!(pfec & PFEC_page_present),
1706
0
    };
1707
0
1708
0
    /* These bits are mutually exclusive */
1709
0
    if ( pfec & NPT_PFEC_with_gla )
1710
0
        npfec.kind = npfec_kind_with_gla;
1711
0
    else if ( pfec & NPT_PFEC_in_gpt )
1712
0
        npfec.kind = npfec_kind_in_gpt;
1713
0
1714
0
    ret = hvm_hap_nested_page_fault(gpa, ~0ul, npfec);
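    /*
     * Return convention, as consumed by the switch below: 1 means the fault
     * was handled and the guest can be resumed, -1 means it must be reflected
     * to the L1 hypervisor as a nested #VMEXIT(NPF), and 0 means it was not
     * handled, so execution falls through to the error path at the bottom.
     */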
1715
0
1716
0
    if ( tb_init_done )
1717
0
    {
1718
0
        struct {
1719
0
            uint64_t gpa;
1720
0
            uint64_t mfn;
1721
0
            uint32_t qualification;
1722
0
            uint32_t p2mt;
1723
0
        } _d;
1724
0
1725
0
        p2m = p2m_get_p2m(v);
1726
0
        _d.gpa = gpa;
1727
0
        _d.qualification = 0;
1728
0
        mfn = __get_gfn_type_access(p2m, gfn, &_d.p2mt, &p2ma, 0, NULL, 0);
1729
0
        _d.mfn = mfn_x(mfn);
1730
0
        
1731
0
        __trace_var(TRC_HVM_NPF, 0, sizeof(_d), &_d);
1732
0
    }
1733
0
1734
0
    switch (ret) {
1735
0
    case 0:
1736
0
        break;
1737
0
    case 1:
1738
0
        return;
1739
0
    case -1:
1740
0
        ASSERT(nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v));
1741
0
        /* inject #VMEXIT(NPF) into guest. */
1742
0
        nestedsvm_vmexit_defer(v, VMEXIT_NPF, pfec, gpa);
1743
0
        return;
1744
0
    }
1745
0
1746
0
    if ( p2m == NULL )
1747
0
        p2m = p2m_get_p2m(v);
1748
0
    /* Everything else is an error. */
1749
0
    mfn = __get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, NULL, 0);
1750
0
    gdprintk(XENLOG_ERR,
1751
0
         "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n",
1752
0
         gpa, mfn_x(mfn), p2mt);
1753
0
    domain_crash(v->domain);
1754
0
}
1755
1756
static void svm_fpu_dirty_intercept(void)
1757
0
{
1758
0
    struct vcpu *v = current;
1759
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1760
0
    struct vmcb_struct *n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx;
1761
0
1762
0
    svm_fpu_enter(v);
1763
0
1764
0
    if ( vmcb != n1vmcb )
1765
0
    {
1766
0
       /* Check if l1 guest must make FPU ready for the l2 guest */
1767
0
       if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS )
1768
0
           hvm_inject_hw_exception(TRAP_no_device, X86_EVENT_NO_EC);
1769
0
       else
1770
0
           vmcb_set_cr0(n1vmcb, vmcb_get_cr0(n1vmcb) & ~X86_CR0_TS);
1771
0
       return;
1772
0
    }
1773
0
1774
0
    if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1775
0
        vmcb_set_cr0(vmcb, vmcb_get_cr0(vmcb) & ~X86_CR0_TS);
1776
0
}
1777
1778
static void svm_vmexit_do_cpuid(struct cpu_user_regs *regs)
1779
0
{
1780
0
    struct vcpu *curr = current;
1781
0
    unsigned int inst_len;
1782
0
    struct cpuid_leaf res;
1783
0
1784
0
    if ( (inst_len = __get_instruction_length(curr, INSTR_CPUID)) == 0 )
1785
0
        return;
1786
0
1787
0
    guest_cpuid(curr, regs->eax, regs->ecx, &res);
1788
0
    HVMTRACE_5D(CPUID, regs->eax, res.a, res.b, res.c, res.d);
1789
0
1790
0
    regs->rax = res.a;
1791
0
    regs->rbx = res.b;
1792
0
    regs->rcx = res.c;
1793
0
    regs->rdx = res.d;
1794
0
1795
0
    __update_guest_eip(regs, inst_len);
1796
0
}
1797
1798
static void svm_vmexit_do_cr_access(
1799
    struct vmcb_struct *vmcb, struct cpu_user_regs *regs)
1800
0
{
1801
0
    int gp, cr, dir, rc;
1802
0
1803
0
    cr = vmcb->exitcode - VMEXIT_CR0_READ;
1804
0
    dir = (cr > 15);
1805
0
    cr &= 0xf;
1806
0
    gp = vmcb->exitinfo1 & 0xf;
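    /*
     * Worked example (assuming the usual exitcode layout, with CR writes at
     * VMEXIT_CR0_READ + 16): a MOV to %cr4 from %rbx exits with
     * VMEXIT_CR4_WRITE, so cr = 20 -> dir = 1 (write), cr &= 0xf gives 4, and
     * the low nibble of exitinfo1 names the GPR, 3 for %rbx.
     */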
1807
0
1808
0
    rc = dir ? hvm_mov_to_cr(cr, gp) : hvm_mov_from_cr(cr, gp);
1809
0
1810
0
    if ( rc == X86EMUL_OKAY )
1811
0
        __update_guest_eip(regs, vmcb->nextrip - vmcb->rip);
1812
0
}
1813
1814
static void svm_dr_access(struct vcpu *v, struct cpu_user_regs *regs)
1815
0
{
1816
0
    struct vmcb_struct *vmcb = vcpu_nestedhvm(v).nv_n1vmcx;
1817
0
1818
0
    HVMTRACE_0D(DR_WRITE);
1819
0
    __restore_debug_registers(vmcb, v);
1820
0
}
1821
1822
static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
1823
0
{
1824
0
    int ret;
1825
0
    struct vcpu *v = current;
1826
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1827
0
1828
0
    switch ( msr )
1829
0
    {
1830
0
    case MSR_IA32_SYSENTER_CS:
1831
0
        *msr_content = v->arch.hvm_svm.guest_sysenter_cs;
1832
0
        break;
1833
0
    case MSR_IA32_SYSENTER_ESP:
1834
0
        *msr_content = v->arch.hvm_svm.guest_sysenter_esp;
1835
0
        break;
1836
0
    case MSR_IA32_SYSENTER_EIP:
1837
0
        *msr_content = v->arch.hvm_svm.guest_sysenter_eip;
1838
0
        break;
1839
0
1840
0
    case MSR_IA32_MCx_MISC(4): /* Threshold register */
1841
0
    case MSR_F10_MC4_MISC1 ... MSR_F10_MC4_MISC3:
1842
0
        /*
1843
0
         * MCA/MCE: We report that the threshold register is unavailable
1844
0
         * for OS use (locked by the BIOS).
1845
0
         */
1846
0
        *msr_content = 1ULL << 61; /* MC4_MISC.Locked */
1847
0
        break;
1848
0
1849
0
    case MSR_IA32_EBC_FREQUENCY_ID:
1850
0
        /*
1851
0
         * This Intel-only register may be accessed if this HVM guest
1852
0
         * has been migrated from an Intel host. The value zero is not
1853
0
         * particularly meaningful, but at least avoids the guest crashing!
1854
0
         */
1855
0
        *msr_content = 0;
1856
0
        break;
1857
0
1858
0
    case MSR_IA32_DEBUGCTLMSR:
1859
0
        *msr_content = vmcb_get_debugctlmsr(vmcb);
1860
0
        break;
1861
0
1862
0
    case MSR_IA32_LASTBRANCHFROMIP:
1863
0
        *msr_content = vmcb_get_lastbranchfromip(vmcb);
1864
0
        break;
1865
0
1866
0
    case MSR_IA32_LASTBRANCHTOIP:
1867
0
        *msr_content = vmcb_get_lastbranchtoip(vmcb);
1868
0
        break;
1869
0
1870
0
    case MSR_IA32_LASTINTFROMIP:
1871
0
        *msr_content = vmcb_get_lastintfromip(vmcb);
1872
0
        break;
1873
0
1874
0
    case MSR_IA32_LASTINTTOIP:
1875
0
        *msr_content = vmcb_get_lastinttoip(vmcb);
1876
0
        break;
1877
0
1878
0
    case MSR_AMD64_LWP_CFG:
1879
0
        *msr_content = v->arch.hvm_svm.guest_lwp_cfg;
1880
0
        break;
1881
0
1882
0
    case MSR_K7_PERFCTR0:
1883
0
    case MSR_K7_PERFCTR1:
1884
0
    case MSR_K7_PERFCTR2:
1885
0
    case MSR_K7_PERFCTR3:
1886
0
    case MSR_K7_EVNTSEL0:
1887
0
    case MSR_K7_EVNTSEL1:
1888
0
    case MSR_K7_EVNTSEL2:
1889
0
    case MSR_K7_EVNTSEL3:
1890
0
    case MSR_AMD_FAM15H_PERFCTR0:
1891
0
    case MSR_AMD_FAM15H_PERFCTR1:
1892
0
    case MSR_AMD_FAM15H_PERFCTR2:
1893
0
    case MSR_AMD_FAM15H_PERFCTR3:
1894
0
    case MSR_AMD_FAM15H_PERFCTR4:
1895
0
    case MSR_AMD_FAM15H_PERFCTR5:
1896
0
    case MSR_AMD_FAM15H_EVNTSEL0:
1897
0
    case MSR_AMD_FAM15H_EVNTSEL1:
1898
0
    case MSR_AMD_FAM15H_EVNTSEL2:
1899
0
    case MSR_AMD_FAM15H_EVNTSEL3:
1900
0
    case MSR_AMD_FAM15H_EVNTSEL4:
1901
0
    case MSR_AMD_FAM15H_EVNTSEL5:
1902
0
        if ( vpmu_do_rdmsr(msr, msr_content) )
1903
0
            goto gpf;
1904
0
        break;
1905
0
1906
0
    case MSR_AMD64_DR0_ADDRESS_MASK:
1907
0
        if ( !v->domain->arch.cpuid->extd.dbext )
1908
0
            goto gpf;
1909
0
        *msr_content = v->arch.hvm_svm.dr_mask[0];
1910
0
        break;
1911
0
1912
0
    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
1913
0
        if ( !v->domain->arch.cpuid->extd.dbext )
1914
0
            goto gpf;
1915
0
        *msr_content =
1916
0
            v->arch.hvm_svm.dr_mask[msr - MSR_AMD64_DR1_ADDRESS_MASK + 1];
1917
0
        break;
1918
0
1919
0
    case MSR_AMD_OSVW_ID_LENGTH:
1920
0
    case MSR_AMD_OSVW_STATUS:
1921
0
        ret = svm_handle_osvw(v, msr, msr_content, 1);
1922
0
        if ( ret < 0 )
1923
0
            goto gpf;
1924
0
        break;
1925
0
1926
0
    default:
1927
0
        ret = nsvm_rdmsr(v, msr, msr_content);
1928
0
        if ( ret < 0 )
1929
0
            goto gpf;
1930
0
        else if ( ret )
1931
0
            break;
1932
0
1933
0
        if ( rdmsr_viridian_regs(msr, msr_content) ||
1934
0
             rdmsr_hypervisor_regs(msr, msr_content) )
1935
0
            break;
1936
0
1937
0
        if ( rdmsr_safe(msr, *msr_content) == 0 )
1938
0
            break;
1939
0
1940
0
        if ( boot_cpu_data.x86 == 0xf && msr == MSR_F10_BU_CFG )
1941
0
        {
1942
0
            /* Win2k8 x64 reads this MSR on revF chips, where it
1943
0
             * wasn't publicly available; it uses a magic constant
1944
0
             * in %rdi as a password, which we don't have in
1945
0
             * rdmsr_safe().  Since we'll ignore the later writes,
1946
0
             * just use a plausible value here (the reset value from
1947
0
             * rev10h chips) if the real CPU didn't provide one. */
1948
0
            *msr_content = 0x0000000010200020ull;
1949
0
            break;
1950
0
        }
1951
0
1952
0
        goto gpf;
1953
0
    }
1954
0
1955
0
    HVM_DBG_LOG(DBG_LEVEL_MSR, "returns: ecx=%x, msr_value=%"PRIx64,
1956
0
                msr, *msr_content);
1957
0
    return X86EMUL_OKAY;
1958
0
1959
0
 gpf:
1960
0
    return X86EMUL_EXCEPTION;
1961
0
}
1962
1963
static int svm_msr_write_intercept(unsigned int msr, uint64_t msr_content)
1964
0
{
1965
0
    int ret, result = X86EMUL_OKAY;
1966
0
    struct vcpu *v = current;
1967
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1968
0
    int sync = 0;
1969
0
1970
0
    switch ( msr )
1971
0
    {
1972
0
    case MSR_IA32_SYSENTER_CS:
1973
0
    case MSR_IA32_SYSENTER_ESP:
1974
0
    case MSR_IA32_SYSENTER_EIP:
1975
0
        sync = 1;
1976
0
        break;
1977
0
    default:
1978
0
        break;
1979
0
    }
1980
0
1981
0
    if ( sync )
1982
0
        svm_sync_vmcb(v);
1983
0
1984
0
    switch ( msr )
1985
0
    {
1986
0
    case MSR_IA32_SYSENTER_CS:
1987
0
        vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = msr_content;
1988
0
        break;
1989
0
    case MSR_IA32_SYSENTER_ESP:
1990
0
        vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = msr_content;
1991
0
        break;
1992
0
    case MSR_IA32_SYSENTER_EIP:
1993
0
        vmcb->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip = msr_content;
1994
0
        break;
1995
0
1996
0
    case MSR_IA32_DEBUGCTLMSR:
1997
0
        vmcb_set_debugctlmsr(vmcb, msr_content);
1998
0
        if ( !msr_content || !cpu_has_svm_lbrv )
1999
0
            break;
2000
0
        vmcb->lbr_control.fields.enable = 1;
2001
0
        svm_disable_intercept_for_msr(v, MSR_IA32_DEBUGCTLMSR);
2002
0
        svm_disable_intercept_for_msr(v, MSR_IA32_LASTBRANCHFROMIP);
2003
0
        svm_disable_intercept_for_msr(v, MSR_IA32_LASTBRANCHTOIP);
2004
0
        svm_disable_intercept_for_msr(v, MSR_IA32_LASTINTFROMIP);
2005
0
        svm_disable_intercept_for_msr(v, MSR_IA32_LASTINTTOIP);
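        /*
         * With lbr_control.enable set the LBR MSRs are kept in the VMCB by
         * hardware, so the intercepts are dropped above and the guest can
         * access them directly from now on.
         */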
2006
0
        break;
2007
0
2008
0
    case MSR_IA32_LASTBRANCHFROMIP:
2009
0
        vmcb_set_lastbranchfromip(vmcb, msr_content);
2010
0
        break;
2011
0
2012
0
    case MSR_IA32_LASTBRANCHTOIP:
2013
0
        vmcb_set_lastbranchtoip(vmcb, msr_content);
2014
0
        break;
2015
0
2016
0
    case MSR_IA32_LASTINTFROMIP:
2017
0
        vmcb_set_lastintfromip(vmcb, msr_content);
2018
0
        break;
2019
0
2020
0
    case MSR_IA32_LASTINTTOIP:
2021
0
        vmcb_set_lastinttoip(vmcb, msr_content);
2022
0
        break;
2023
0
2024
0
    case MSR_AMD64_LWP_CFG:
2025
0
        if ( svm_update_lwp_cfg(v, msr_content) < 0 )
2026
0
            goto gpf;
2027
0
        break;
2028
0
2029
0
    case MSR_K7_PERFCTR0:
2030
0
    case MSR_K7_PERFCTR1:
2031
0
    case MSR_K7_PERFCTR2:
2032
0
    case MSR_K7_PERFCTR3:
2033
0
    case MSR_K7_EVNTSEL0:
2034
0
    case MSR_K7_EVNTSEL1:
2035
0
    case MSR_K7_EVNTSEL2:
2036
0
    case MSR_K7_EVNTSEL3:
2037
0
    case MSR_AMD_FAM15H_PERFCTR0:
2038
0
    case MSR_AMD_FAM15H_PERFCTR1:
2039
0
    case MSR_AMD_FAM15H_PERFCTR2:
2040
0
    case MSR_AMD_FAM15H_PERFCTR3:
2041
0
    case MSR_AMD_FAM15H_PERFCTR4:
2042
0
    case MSR_AMD_FAM15H_PERFCTR5:
2043
0
    case MSR_AMD_FAM15H_EVNTSEL0:
2044
0
    case MSR_AMD_FAM15H_EVNTSEL1:
2045
0
    case MSR_AMD_FAM15H_EVNTSEL2:
2046
0
    case MSR_AMD_FAM15H_EVNTSEL3:
2047
0
    case MSR_AMD_FAM15H_EVNTSEL4:
2048
0
    case MSR_AMD_FAM15H_EVNTSEL5:
2049
0
        if ( vpmu_do_wrmsr(msr, msr_content, 0) )
2050
0
            goto gpf;
2051
0
        break;
2052
0
2053
0
    case MSR_IA32_MCx_MISC(4): /* Threshold register */
2054
0
    case MSR_F10_MC4_MISC1 ... MSR_F10_MC4_MISC3:
2055
0
        /*
2056
0
         * MCA/MCE: Threshold register is reported to be locked, so we ignore
2057
0
         * all write accesses. This behaviour matches real HW, so guests should
2058
0
         * have no problem with this.
2059
0
         */
2060
0
        break;
2061
0
2062
0
    case MSR_AMD64_DR0_ADDRESS_MASK:
2063
0
        if ( !v->domain->arch.cpuid->extd.dbext || (msr_content >> 32) )
2064
0
            goto gpf;
2065
0
        v->arch.hvm_svm.dr_mask[0] = msr_content;
2066
0
        break;
2067
0
2068
0
    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
2069
0
        if ( !v->domain->arch.cpuid->extd.dbext || (msr_content >> 32) )
2070
0
            goto gpf;
2071
0
        v->arch.hvm_svm.dr_mask[msr - MSR_AMD64_DR1_ADDRESS_MASK + 1] =
2072
0
            msr_content;
2073
0
        break;
2074
0
2075
0
    case MSR_AMD_OSVW_ID_LENGTH:
2076
0
    case MSR_AMD_OSVW_STATUS:
2077
0
        ret = svm_handle_osvw(v, msr, &msr_content, 0);
2078
0
        if ( ret < 0 )
2079
0
            goto gpf;
2080
0
        break;
2081
0
2082
0
    default:
2083
0
        ret = nsvm_wrmsr(v, msr, msr_content);
2084
0
        if ( ret < 0 )
2085
0
            goto gpf;
2086
0
        else if ( ret )
2087
0
            break;
2088
0
2089
0
        if ( wrmsr_viridian_regs(msr, msr_content) )
2090
0
            break;
2091
0
2092
0
        switch ( wrmsr_hypervisor_regs(msr, msr_content) )
2093
0
        {
2094
0
        case -ERESTART:
2095
0
            result = X86EMUL_RETRY;
2096
0
            break;
2097
0
        case 0:
2098
0
        case 1:
2099
0
            break;
2100
0
        default:
2101
0
            goto gpf;
2102
0
        }
2103
0
        break;
2104
0
    }
2105
0
2106
0
    if ( sync )
2107
0
        svm_vmload(vmcb);
2108
0
2109
0
    return result;
2110
0
2111
0
 gpf:
2112
0
    return X86EMUL_EXCEPTION;
2113
0
}
2114
2115
static void svm_do_msr_access(struct cpu_user_regs *regs)
2116
0
{
2117
0
    struct vcpu *curr = current;
2118
0
    bool rdmsr = curr->arch.hvm_svm.vmcb->exitinfo1 == 0;
2119
0
    int rc, inst_len = __get_instruction_length(
2120
0
        curr, rdmsr ? INSTR_RDMSR : INSTR_WRMSR);
2121
0
2122
0
    if ( inst_len == 0 )
2123
0
        return;
2124
0
2125
0
    if ( rdmsr )
2126
0
    {
2127
0
        uint64_t msr_content = 0;
2128
0
2129
0
        rc = hvm_msr_read_intercept(regs->ecx, &msr_content);
2130
0
        if ( rc == X86EMUL_OKAY )
2131
0
            msr_split(regs, msr_content);
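        /* RDMSR returns its result in %edx:%eax; msr_split() writes the two
         * halves back into regs, mirroring msr_fold() below which reassembles
         * %edx:%eax into the 64-bit value passed to the WRMSR path. */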
2132
0
    }
2133
0
    else
2134
0
        rc = hvm_msr_write_intercept(regs->ecx, msr_fold(regs), 1);
2135
0
2136
0
    if ( rc == X86EMUL_OKAY )
2137
0
        __update_guest_eip(regs, inst_len);
2138
0
    else if ( rc == X86EMUL_EXCEPTION )
2139
0
        hvm_inject_hw_exception(TRAP_gp_fault, 0);
2140
0
}
2141
2142
static void svm_vmexit_do_hlt(struct vmcb_struct *vmcb,
2143
                              struct cpu_user_regs *regs)
2144
0
{
2145
0
    unsigned int inst_len;
2146
0
2147
0
    if ( (inst_len = __get_instruction_length(current, INSTR_HLT)) == 0 )
2148
0
        return;
2149
0
    __update_guest_eip(regs, inst_len);
2150
0
2151
0
    hvm_hlt(regs->eflags);
2152
0
}
2153
2154
static void svm_vmexit_do_rdtsc(struct cpu_user_regs *regs)
2155
0
{
2156
0
    unsigned int inst_len;
2157
0
2158
0
    if ( (inst_len = __get_instruction_length(current, INSTR_RDTSC)) == 0 )
2159
0
        return;
2160
0
    __update_guest_eip(regs, inst_len);
2161
0
2162
0
    hvm_rdtsc_intercept(regs);
2163
0
}
2164
2165
static void svm_vmexit_do_pause(struct cpu_user_regs *regs)
2166
0
{
2167
0
    unsigned int inst_len;
2168
0
2169
0
    if ( (inst_len = __get_instruction_length(current, INSTR_PAUSE)) == 0 )
2170
0
        return;
2171
0
    __update_guest_eip(regs, inst_len);
2172
0
2173
0
    /*
2174
0
     * The guest is spinning on a contended lock and we've detected it.
2175
0
     * Do something useful, like rescheduling the guest.
2176
0
     */
2177
0
    perfc_incr(pauseloop_exits);
2178
0
    do_sched_op(SCHEDOP_yield, guest_handle_from_ptr(NULL, void));
2179
0
}
2180
2181
static void
2182
svm_vmexit_do_vmrun(struct cpu_user_regs *regs,
2183
                    struct vcpu *v, uint64_t vmcbaddr)
2184
0
{
2185
0
    if ( !nsvm_efer_svm_enabled(v) )
2186
0
    {
2187
0
        gdprintk(XENLOG_ERR, "VMRUN: nestedhvm disabled, injecting #UD\n");
2188
0
        hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
2189
0
        return;
2190
0
    }
2191
0
2192
0
    if ( !nestedsvm_vmcb_map(v, vmcbaddr) )
2193
0
    {
2194
0
        gdprintk(XENLOG_ERR, "VMRUN: mapping vmcb failed, injecting #GP\n");
2195
0
        hvm_inject_hw_exception(TRAP_gp_fault, 0);
2196
0
        return;
2197
0
    }
2198
0
2199
0
    vcpu_nestedhvm(v).nv_vmentry_pending = 1;
2200
0
    return;
2201
0
}
2202
2203
static struct page_info *
2204
nsvm_get_nvmcb_page(struct vcpu *v, uint64_t vmcbaddr)
2205
0
{
2206
0
    p2m_type_t p2mt;
2207
0
    struct page_info *page;
2208
0
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
2209
0
2210
0
    if ( !nestedsvm_vmcb_map(v, vmcbaddr) )
2211
0
        return NULL;
2212
0
2213
0
    /* Need to translate L1-GPA to MPA */
2214
0
    page = get_page_from_gfn(v->domain, 
2215
0
                            nv->nv_vvmcxaddr >> PAGE_SHIFT, 
2216
0
                            &p2mt, P2M_ALLOC | P2M_UNSHARE);
2217
0
    if ( !page )
2218
0
        return NULL;
2219
0
2220
0
    if ( !p2m_is_ram(p2mt) || p2m_is_readonly(p2mt) )
2221
0
    {
2222
0
        put_page(page);
2223
0
        return NULL; 
2224
0
    }
2225
0
2226
0
    return page;
2227
0
}
2228
2229
static void
2230
svm_vmexit_do_vmload(struct vmcb_struct *vmcb,
2231
                     struct cpu_user_regs *regs,
2232
                     struct vcpu *v, uint64_t vmcbaddr)
2233
0
{
2234
0
    unsigned int inst_len;
2235
0
    struct page_info *page;
2236
0
2237
0
    if ( (inst_len = __get_instruction_length(v, INSTR_VMLOAD)) == 0 )
2238
0
        return;
2239
0
2240
0
    if ( !nsvm_efer_svm_enabled(v) ) 
2241
0
    {
2242
0
        gdprintk(XENLOG_ERR, "VMLOAD: nestedhvm disabled, injecting #UD\n");
2243
0
        hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
2244
0
        return;
2245
0
    }
2246
0
2247
0
    page = nsvm_get_nvmcb_page(v, vmcbaddr);
2248
0
    if ( !page )
2249
0
    {
2250
0
        gdprintk(XENLOG_ERR,
2251
0
            "VMLOAD: mapping failed, injecting #GP\n");
2252
0
        hvm_inject_hw_exception(TRAP_gp_fault, 0);
2253
0
        return;
2254
0
    }
2255
0
2256
0
    svm_vmload_pa(page_to_maddr(page));
2257
0
    put_page(page);
2258
0
2259
0
    /* State in L1 VMCB is stale now */
2260
0
    v->arch.hvm_svm.vmcb_in_sync = 0;
2261
0
2262
0
    __update_guest_eip(regs, inst_len);
2263
0
}
2264
2265
static void
2266
svm_vmexit_do_vmsave(struct vmcb_struct *vmcb,
2267
                     struct cpu_user_regs *regs,
2268
                     struct vcpu *v, uint64_t vmcbaddr)
2269
0
{
2270
0
    unsigned int inst_len;
2271
0
    struct page_info *page;
2272
0
2273
0
    if ( (inst_len = __get_instruction_length(v, INSTR_VMSAVE)) == 0 )
2274
0
        return;
2275
0
2276
0
    if ( !nsvm_efer_svm_enabled(v) ) 
2277
0
    {
2278
0
        gdprintk(XENLOG_ERR, "VMSAVE: nestedhvm disabled, injecting #UD\n");
2279
0
        hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
2280
0
        return;
2281
0
    }
2282
0
2283
0
    page = nsvm_get_nvmcb_page(v, vmcbaddr);
2284
0
    if ( !page )
2285
0
    {
2286
0
        gdprintk(XENLOG_ERR,
2287
0
            "VMSAVE: mapping vmcb failed, injecting #GP\n");
2288
0
        hvm_inject_hw_exception(TRAP_gp_fault, 0);
2289
0
        return;
2290
0
    }
2291
0
2292
0
    svm_vmsave_pa(page_to_maddr(page));
2293
0
    put_page(page);
2294
0
    __update_guest_eip(regs, inst_len);
2295
0
}
2296
2297
static int svm_is_erratum_383(struct cpu_user_regs *regs)
2298
0
{
2299
0
    uint64_t msr_content;
2300
0
    uint32_t i;
2301
0
    struct vcpu *v = current;
2302
0
2303
0
    if ( !amd_erratum383_found )
2304
0
        return 0;
2305
0
2306
0
    rdmsrl(MSR_IA32_MC0_STATUS, msr_content);
2307
0
    /* Bit 62 may or may not be set for this mce */
2308
0
    msr_content &= ~(1ULL << 62);
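    /*
     * 0xb600000000010015 is the MC0_STATUS signature this erratum is known to
     * leave behind (with the overflow bit masked off just above); anything
     * else is a genuine machine check and must not be swallowed here.
     */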
2309
0
2310
0
    if ( msr_content != 0xb600000000010015ULL )
2311
0
        return 0;
2312
0
    
2313
0
    /* Clear MCi_STATUS registers */
2314
0
    for ( i = 0; i < nr_mce_banks; i++ )
2315
0
        wrmsrl(MSR_IA32_MCx_STATUS(i), 0ULL);
2316
0
    
2317
0
    rdmsrl(MSR_IA32_MCG_STATUS, msr_content);
2318
0
    wrmsrl(MSR_IA32_MCG_STATUS, msr_content & ~(1ULL << 2));
2319
0
2320
0
    /* flush TLB */
2321
0
    flush_tlb_mask(v->domain->domain_dirty_cpumask);
2322
0
2323
0
    return 1;
2324
0
}
2325
2326
static void svm_vmexit_mce_intercept(
2327
    struct vcpu *v, struct cpu_user_regs *regs)
2328
0
{
2329
0
    if ( svm_is_erratum_383(regs) )
2330
0
    {
2331
0
        gdprintk(XENLOG_ERR, "SVM hits AMD erratum 383\n");
2332
0
        domain_crash(v->domain);
2333
0
    }
2334
0
}
2335
2336
static void svm_wbinvd_intercept(void)
2337
0
{
2338
0
    if ( cache_flush_permitted(current->domain) )
2339
0
        flush_all(FLUSH_CACHE);
2340
0
}
2341
2342
static void svm_vmexit_do_invalidate_cache(struct cpu_user_regs *regs)
2343
0
{
2344
0
    static const enum instruction_index list[] = { INSTR_INVD, INSTR_WBINVD };
2345
0
    int inst_len;
2346
0
2347
0
    inst_len = __get_instruction_length_from_list(
2348
0
        current, list, ARRAY_SIZE(list));
2349
0
    if ( inst_len == 0 )
2350
0
        return;
2351
0
2352
0
    svm_wbinvd_intercept();
2353
0
2354
0
    __update_guest_eip(regs, inst_len);
2355
0
}
2356
2357
static void svm_invlpga_intercept(
2358
    struct vcpu *v, unsigned long vaddr, uint32_t asid)
2359
0
{
2360
0
    svm_invlpga(vaddr,
2361
0
                (asid == 0)
2362
0
                ? v->arch.hvm_vcpu.n1asid.asid
2363
0
                : vcpu_nestedhvm(v).nv_n2asid.asid);
2364
0
}
2365
2366
static void svm_invlpg_intercept(unsigned long vaddr)
2367
0
{
2368
0
    HVMTRACE_LONG_2D(INVLPG, 0, TRC_PAR_LONG(vaddr));
2369
0
    paging_invlpg(current, vaddr);
2370
0
}
2371
2372
static bool is_invlpg(const struct x86_emulate_state *state,
2373
                      const struct x86_emulate_ctxt *ctxt)
2374
0
{
2375
0
    unsigned int ext;
2376
0
2377
0
    return ctxt->opcode == X86EMUL_OPC(0x0f, 0x01) &&
2378
0
           x86_insn_modrm(state, NULL, &ext) != 3 &&
2379
0
           (ext & 7) == 7;
2380
0
}
2381
2382
static void svm_invlpg(struct vcpu *v, unsigned long vaddr)
2383
0
{
2384
0
    svm_asid_g_invlpg(v, vaddr);
2385
0
}
2386
2387
static bool svm_get_pending_event(struct vcpu *v, struct x86_event *info)
2388
0
{
2389
0
    const struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2390
0
2391
0
    if ( !vmcb->eventinj.fields.v )
2392
0
        return false;
2393
0
2394
0
    info->vector = vmcb->eventinj.fields.vector;
2395
0
    info->type = vmcb->eventinj.fields.type;
2396
0
    info->error_code = vmcb->eventinj.fields.errorcode;
2397
0
2398
0
    return true;
2399
0
}
2400
2401
static struct hvm_function_table __initdata svm_function_table = {
2402
    .name                 = "SVM",
2403
    .cpu_up_prepare       = svm_cpu_up_prepare,
2404
    .cpu_dead             = svm_cpu_dead,
2405
    .cpu_up               = svm_cpu_up,
2406
    .cpu_down             = svm_cpu_down,
2407
    .domain_initialise    = svm_domain_initialise,
2408
    .domain_destroy       = svm_domain_destroy,
2409
    .vcpu_initialise      = svm_vcpu_initialise,
2410
    .vcpu_destroy         = svm_vcpu_destroy,
2411
    .save_cpu_ctxt        = svm_save_vmcb_ctxt,
2412
    .load_cpu_ctxt        = svm_load_vmcb_ctxt,
2413
    .init_msr             = svm_init_msr,
2414
    .save_msr             = svm_save_msr,
2415
    .load_msr             = svm_load_msr,
2416
    .get_interrupt_shadow = svm_get_interrupt_shadow,
2417
    .set_interrupt_shadow = svm_set_interrupt_shadow,
2418
    .guest_x86_mode       = svm_guest_x86_mode,
2419
    .get_cpl              = svm_get_cpl,
2420
    .get_segment_register = svm_get_segment_register,
2421
    .set_segment_register = svm_set_segment_register,
2422
    .get_shadow_gs_base   = svm_get_shadow_gs_base,
2423
    .update_guest_cr      = svm_update_guest_cr,
2424
    .update_guest_efer    = svm_update_guest_efer,
2425
    .update_guest_vendor  = svm_update_guest_vendor,
2426
    .fpu_leave            = svm_fpu_leave,
2427
    .set_guest_pat        = svm_set_guest_pat,
2428
    .get_guest_pat        = svm_get_guest_pat,
2429
    .set_tsc_offset       = svm_set_tsc_offset,
2430
    .inject_event         = svm_inject_event,
2431
    .init_hypercall_page  = svm_init_hypercall_page,
2432
    .event_pending        = svm_event_pending,
2433
    .get_pending_event    = svm_get_pending_event,
2434
    .invlpg               = svm_invlpg,
2435
    .wbinvd_intercept     = svm_wbinvd_intercept,
2436
    .fpu_dirty_intercept  = svm_fpu_dirty_intercept,
2437
    .msr_read_intercept   = svm_msr_read_intercept,
2438
    .msr_write_intercept  = svm_msr_write_intercept,
2439
    .set_rdtsc_exiting    = svm_set_rdtsc_exiting,
2440
    .set_descriptor_access_exiting = svm_set_descriptor_access_exiting,
2441
    .get_insn_bytes       = svm_get_insn_bytes,
2442
2443
    .nhvm_vcpu_initialise = nsvm_vcpu_initialise,
2444
    .nhvm_vcpu_destroy = nsvm_vcpu_destroy,
2445
    .nhvm_vcpu_reset = nsvm_vcpu_reset,
2446
    .nhvm_vcpu_vmexit_event = nsvm_vcpu_vmexit_event,
2447
    .nhvm_vcpu_p2m_base = nsvm_vcpu_hostcr3,
2448
    .nhvm_vmcx_guest_intercepts_event = nsvm_vmcb_guest_intercepts_event,
2449
    .nhvm_vmcx_hap_enabled = nsvm_vmcb_hap_enabled,
2450
    .nhvm_intr_blocked = nsvm_intr_blocked,
2451
    .nhvm_hap_walk_L1_p2m = nsvm_hap_walk_L1_p2m,
2452
2453
    .tsc_scaling = {
2454
        .max_ratio = ~TSC_RATIO_RSVD_BITS,
2455
    },
2456
};
2457
2458
void svm_vmexit_handler(struct cpu_user_regs *regs)
2459
0
{
2460
0
    uint64_t exit_reason;
2461
0
    struct vcpu *v = current;
2462
0
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2463
0
    eventinj_t eventinj;
2464
0
    int inst_len, rc;
2465
0
    vintr_t intr;
2466
0
    bool_t vcpu_guestmode = 0;
2467
0
    struct vlapic *vlapic = vcpu_vlapic(v);
2468
0
2469
0
    hvm_invalidate_regs_fields(regs);
2470
0
2471
0
    if ( paging_mode_hap(v->domain) )
2472
0
        v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3] =
2473
0
            vmcb_get_cr3(vmcb);
2474
0
2475
0
    if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) )
2476
0
        vcpu_guestmode = 1;
2477
0
2478
0
    /*
2479
0
     * Before doing anything else, we need to sync up the VLAPIC's TPR with
2480
0
     * SVM's vTPR. It's OK if the guest doesn't touch CR8 (e.g. 32-bit Windows)
2481
0
     * because we update the vTPR on MMIO writes to the TPR.
2482
0
     * NB. We need to preserve the low bits of the TPR to make checked builds
2483
0
     * of Windows work, even though they don't actually do anything.
2484
0
     */
2485
0
    if ( !vcpu_guestmode && !vlapic_hw_disabled(vlapic) )
2486
0
    {
2487
0
        intr = vmcb_get_vintr(vmcb);
2488
0
        vlapic_set_reg(vlapic, APIC_TASKPRI,
2489
0
                   ((intr.fields.tpr & 0x0F) << 4) |
2490
0
                   (vlapic_get_reg(vlapic, APIC_TASKPRI) & 0x0F));
2491
0
    }
2492
0
2493
0
    exit_reason = vmcb->exitcode;
2494
0
2495
0
    if ( hvm_long_mode_active(v) )
2496
0
        HVMTRACE_ND(VMEXIT64, vcpu_guestmode ? TRC_HVM_NESTEDFLAG : 0,
2497
0
                    1/*cycles*/, 3, exit_reason,
2498
0
                    regs->eip, regs->rip >> 32, 0, 0, 0);
2499
0
    else
2500
0
        HVMTRACE_ND(VMEXIT, vcpu_guestmode ? TRC_HVM_NESTEDFLAG : 0,
2501
0
                    1/*cycles*/, 2, exit_reason,
2502
0
                    regs->eip, 0, 0, 0, 0);
2503
0
2504
0
    if ( vcpu_guestmode ) {
2505
0
        enum nestedhvm_vmexits nsret;
2506
0
        struct nestedvcpu *nv = &vcpu_nestedhvm(v);
2507
0
        struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
2508
0
        uint64_t exitinfo1, exitinfo2;
2509
0
2510
0
        paging_update_nestedmode(v);
2511
0
2512
0
        /* Write real exitinfo1 back into virtual vmcb.
2513
0
         * nestedsvm_check_intercepts() expects to have the correct
2514
0
         * exitinfo1 value there.
2515
0
         */
2516
0
        exitinfo1 = ns_vmcb->exitinfo1;
2517
0
        ns_vmcb->exitinfo1 = vmcb->exitinfo1;
2518
0
        nsret = nestedsvm_check_intercepts(v, regs, exit_reason);
2519
0
        switch (nsret) {
2520
0
        case NESTEDHVM_VMEXIT_CONTINUE:
2521
0
            BUG();
2522
0
            break;
2523
0
        case NESTEDHVM_VMEXIT_HOST:
2524
0
            break;
2525
0
        case NESTEDHVM_VMEXIT_INJECT:
2526
0
            /* Switch vcpu from l2 to l1 guest. We must perform
2527
0
             * the switch here to have svm_do_resume() working
2528
0
             * as intended.
2529
0
             */
2530
0
            exitinfo1 = vmcb->exitinfo1;
2531
0
            exitinfo2 = vmcb->exitinfo2;
2532
0
            nv->nv_vmswitch_in_progress = 1;
2533
0
            nsret = nestedsvm_vmexit_n2n1(v, regs);
2534
0
            nv->nv_vmswitch_in_progress = 0;
2535
0
            switch (nsret) {
2536
0
            case NESTEDHVM_VMEXIT_DONE:
2537
0
                /* defer VMEXIT injection */
2538
0
                nestedsvm_vmexit_defer(v, exit_reason, exitinfo1, exitinfo2);
2539
0
                goto out;
2540
0
            case NESTEDHVM_VMEXIT_FATALERROR:
2541
0
                gdprintk(XENLOG_ERR, "unexpected nestedsvm_vmexit() error\n");
2542
0
                domain_crash(v->domain);
2543
0
                goto out;
2544
0
            default:
2545
0
                BUG();
2546
0
            case NESTEDHVM_VMEXIT_ERROR:
2547
0
                break;
2548
0
            }
2549
0
            /* fallthrough */
2550
0
        case NESTEDHVM_VMEXIT_ERROR:
2551
0
            gdprintk(XENLOG_ERR,
2552
0
                "nestedsvm_check_intercepts() returned NESTEDHVM_VMEXIT_ERROR\n");
2553
0
            goto out;
2554
0
        case NESTEDHVM_VMEXIT_FATALERROR:
2555
0
            gdprintk(XENLOG_ERR,
2556
0
                "unexpected nestedsvm_check_intercepts() error\n");
2557
0
            domain_crash(v->domain);
2558
0
            goto out;
2559
0
        default:
2560
0
            gdprintk(XENLOG_INFO, "nestedsvm_check_intercepts() returned %i\n",
2561
0
                nsret);
2562
0
            domain_crash(v->domain);
2563
0
            goto out;
2564
0
        }
2565
0
    }
2566
0
2567
0
    if ( unlikely(exit_reason == VMEXIT_INVALID) )
2568
0
    {
2569
0
        gdprintk(XENLOG_ERR, "invalid VMCB state:\n");
2570
0
        svm_vmcb_dump(__func__, vmcb);
2571
0
        domain_crash(v->domain);
2572
0
        goto out;
2573
0
    }
2574
0
2575
0
    perfc_incra(svmexits, exit_reason);
2576
0
2577
0
    hvm_maybe_deassert_evtchn_irq();
2578
0
2579
0
    vmcb->cleanbits.bytes = cpu_has_svm_cleanbits ? ~0u : 0u;
2580
0
2581
0
    /* Event delivery caused this intercept? Queue for redelivery. */
2582
0
    eventinj = vmcb->exitintinfo;
2583
0
    if ( unlikely(eventinj.fields.v) &&
2584
0
         hvm_event_needs_reinjection(eventinj.fields.type,
2585
0
                                     eventinj.fields.vector) )
2586
0
        vmcb->eventinj = eventinj;
2587
0
2588
0
    switch ( exit_reason )
2589
0
    {
2590
0
    case VMEXIT_INTR:
2591
0
        /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2592
0
        HVMTRACE_0D(INTR);
2593
0
        break;
2594
0
2595
0
    case VMEXIT_NMI:
2596
0
        /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2597
0
        HVMTRACE_0D(NMI);
2598
0
        break;
2599
0
2600
0
    case VMEXIT_SMI:
2601
0
        /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2602
0
        HVMTRACE_0D(SMI);
2603
0
        break;
2604
0
2605
0
    case VMEXIT_EXCEPTION_DB:
2606
0
        if ( !v->domain->debugger_attached )
2607
0
            hvm_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
2608
0
        else
2609
0
            domain_pause_for_debugger();
2610
0
        break;
2611
0
2612
0
    case VMEXIT_EXCEPTION_BP:
2613
0
        if ( !v->domain->debugger_attached )
2614
0
            goto unexpected_exit_type;
2615
0
        /* AMD Vol2, 15.11: INT3, INTO, BOUND intercepts do not update RIP. */
2616
0
        if ( (inst_len = __get_instruction_length(v, INSTR_INT3)) == 0 )
2617
0
            break;
2618
0
        __update_guest_eip(regs, inst_len);
2619
0
        current->arch.gdbsx_vcpu_event = TRAP_int3;
2620
0
        domain_pause_for_debugger();
2621
0
        break;
2622
0
2623
0
    case VMEXIT_EXCEPTION_NM:
2624
0
        svm_fpu_dirty_intercept();
2625
0
        break;  
2626
0
2627
0
    case VMEXIT_EXCEPTION_PF: {
2628
0
        unsigned long va;
2629
0
        va = vmcb->exitinfo2;
2630
0
        regs->error_code = vmcb->exitinfo1;
2631
0
        HVM_DBG_LOG(DBG_LEVEL_VMMU,
2632
0
                    "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2633
0
                    regs->rax, regs->rbx, regs->rcx,
2634
0
                    regs->rdx, regs->rsi, regs->rdi);
2635
0
2636
0
        if ( cpu_has_svm_decode )
2637
0
            v->arch.hvm_svm.cached_insn_len = vmcb->guest_ins_len & 0xf;
2638
0
        rc = paging_fault(va, regs);
2639
0
        v->arch.hvm_svm.cached_insn_len = 0;
2640
0
2641
0
        if ( rc )
2642
0
        {
2643
0
            if ( trace_will_trace_event(TRC_SHADOW) )
2644
0
                break;
2645
0
            if ( hvm_long_mode_active(v) )
2646
0
                HVMTRACE_LONG_2D(PF_XEN, regs->error_code, TRC_PAR_LONG(va));
2647
0
            else
2648
0
                HVMTRACE_2D(PF_XEN, regs->error_code, va);
2649
0
            break;
2650
0
        }
2651
0
2652
0
        hvm_inject_page_fault(regs->error_code, va);
2653
0
        break;
2654
0
    }
2655
0
2656
0
    case VMEXIT_EXCEPTION_AC:
2657
0
        HVMTRACE_1D(TRAP, TRAP_alignment_check);
2658
0
        hvm_inject_hw_exception(TRAP_alignment_check, vmcb->exitinfo1);
2659
0
        break;
2660
0
2661
0
    case VMEXIT_EXCEPTION_UD:
2662
0
        hvm_ud_intercept(regs);
2663
0
        break;
2664
0
2665
0
    /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2666
0
    case VMEXIT_EXCEPTION_MC:
2667
0
        HVMTRACE_0D(MCE);
2668
0
        svm_vmexit_mce_intercept(v, regs);
2669
0
        break;
2670
0
2671
0
    case VMEXIT_VINTR: {
2672
0
        u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb);
2673
0
        intr = vmcb_get_vintr(vmcb);
2674
0
2675
0
        intr.fields.irq = 0;
2676
0
        general1_intercepts &= ~GENERAL1_INTERCEPT_VINTR;
2677
0
2678
0
        vmcb_set_vintr(vmcb, intr);
2679
0
        vmcb_set_general1_intercepts(vmcb, general1_intercepts);
2680
0
        break;
2681
0
    }
2682
0
2683
0
    case VMEXIT_INVD:
2684
0
    case VMEXIT_WBINVD:
2685
0
        svm_vmexit_do_invalidate_cache(regs);
2686
0
        break;
2687
0
2688
0
    case VMEXIT_TASK_SWITCH: {
2689
0
        enum hvm_task_switch_reason reason;
2690
0
        int32_t errcode = -1;
2691
0
        if ( (vmcb->exitinfo2 >> 36) & 1 )
2692
0
            reason = TSW_iret;
2693
0
        else if ( (vmcb->exitinfo2 >> 38) & 1 )
2694
0
            reason = TSW_jmp;
2695
0
        else
2696
0
            reason = TSW_call_or_int;
2697
0
        if ( (vmcb->exitinfo2 >> 44) & 1 )
2698
0
            errcode = (uint32_t)vmcb->exitinfo2;
2699
0
2700
0
        /*
2701
0
         * Some processors set the EXITINTINFO field when the task switch
2702
0
         * is caused by a task gate in the IDT. In this case we will be
2703
0
         * emulating the event injection, so we do not want the processor
2704
0
         * to re-inject the original event!
2705
0
         */
2706
0
        vmcb->eventinj.bytes = 0;
2707
0
2708
0
        hvm_task_switch((uint16_t)vmcb->exitinfo1, reason, errcode);
2709
0
        break;
2710
0
    }
2711
0
2712
0
    case VMEXIT_CPUID:
2713
0
        svm_vmexit_do_cpuid(regs);
2714
0
        break;
2715
0
2716
0
    case VMEXIT_HLT:
2717
0
        svm_vmexit_do_hlt(vmcb, regs);
2718
0
        break;
2719
0
2720
0
    case VMEXIT_IOIO:
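        /*
         * Bit 2 of EXITINFO1 is the string-instruction (INS/OUTS) flag: plain
         * IN/OUT takes the handle_pio() fast path below, while string and
         * otherwise unhandled I/O goes through the instruction emulator.
         */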
2721
0
        if ( (vmcb->exitinfo1 & (1u<<2)) == 0 )
2722
0
        {
2723
0
            uint16_t port = (vmcb->exitinfo1 >> 16) & 0xFFFF;
2724
0
            int bytes = ((vmcb->exitinfo1 >> 4) & 0x07);
2725
0
            int dir = (vmcb->exitinfo1 & 1) ? IOREQ_READ : IOREQ_WRITE;
2726
0
            if ( handle_pio(port, bytes, dir) )
2727
0
                __update_guest_eip(regs, vmcb->exitinfo2 - vmcb->rip);
2728
0
        }
2729
0
        else if ( !hvm_emulate_one_insn(x86_insn_is_portio, "port I/O") )
2730
0
            hvm_inject_hw_exception(TRAP_gp_fault, 0);
2731
0
        break;
2732
0
2733
0
    case VMEXIT_CR0_READ ... VMEXIT_CR15_READ:
2734
0
    case VMEXIT_CR0_WRITE ... VMEXIT_CR15_WRITE:
2735
0
        if ( cpu_has_svm_decode && (vmcb->exitinfo1 & (1ULL << 63)) )
2736
0
            svm_vmexit_do_cr_access(vmcb, regs);
2737
0
        else if ( !hvm_emulate_one_insn(x86_insn_is_cr_access, "CR access") )
2738
0
            hvm_inject_hw_exception(TRAP_gp_fault, 0);
2739
0
        break;
2740
0
2741
0
    case VMEXIT_INVLPG:
2742
0
        if ( cpu_has_svm_decode )
2743
0
        {
2744
0
            svm_invlpg_intercept(vmcb->exitinfo1);
2745
0
            __update_guest_eip(regs, vmcb->nextrip - vmcb->rip);
2746
0
        }
2747
0
        else if ( !hvm_emulate_one_insn(is_invlpg, "invlpg") )
2748
0
            hvm_inject_hw_exception(TRAP_gp_fault, 0);
2749
0
        break;
2750
0
2751
0
    case VMEXIT_INVLPGA:
2752
0
        if ( (inst_len = __get_instruction_length(v, INSTR_INVLPGA)) == 0 )
2753
0
            break;
2754
0
        svm_invlpga_intercept(v, regs->rax, regs->ecx);
2755
0
        __update_guest_eip(regs, inst_len);
2756
0
        break;
2757
0
2758
0
    case VMEXIT_VMMCALL:
2759
0
        if ( (inst_len = __get_instruction_length(v, INSTR_VMCALL)) == 0 )
2760
0
            break;
2761
0
        BUG_ON(vcpu_guestmode);
2762
0
        HVMTRACE_1D(VMMCALL, regs->eax);
2763
0
2764
0
        if ( hvm_hypercall(regs) == HVM_HCALL_completed )
2765
0
            __update_guest_eip(regs, inst_len);
2766
0
        break;
2767
0
2768
0
    case VMEXIT_DR0_READ ... VMEXIT_DR7_READ:
2769
0
    case VMEXIT_DR0_WRITE ... VMEXIT_DR7_WRITE:
2770
0
        svm_dr_access(v, regs);
2771
0
        break;
2772
0
2773
0
    case VMEXIT_MSR:
2774
0
        svm_do_msr_access(regs);
2775
0
        break;
2776
0
2777
0
    case VMEXIT_SHUTDOWN:
2778
0
        hvm_triple_fault();
2779
0
        break;
2780
0
2781
0
    case VMEXIT_RDTSCP:
2782
0
        regs->rcx = hvm_msr_tsc_aux(v);
2783
0
        /* fall through */
2784
0
    case VMEXIT_RDTSC:
2785
0
        svm_vmexit_do_rdtsc(regs);
2786
0
        break;
2787
0
2788
0
    case VMEXIT_MONITOR:
2789
0
    case VMEXIT_MWAIT:
2790
0
        hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
2791
0
        break;
2792
0
2793
0
    case VMEXIT_VMRUN:
2794
0
        svm_vmexit_do_vmrun(regs, v, regs->rax);
2795
0
        break;
2796
0
    case VMEXIT_VMLOAD:
2797
0
        svm_vmexit_do_vmload(vmcb, regs, v, regs->rax);
2798
0
        break;
2799
0
    case VMEXIT_VMSAVE:
2800
0
        svm_vmexit_do_vmsave(vmcb, regs, v, regs->rax);
2801
0
        break;
2802
0
    case VMEXIT_STGI:
2803
0
        svm_vmexit_do_stgi(regs, v);
2804
0
        break;
2805
0
    case VMEXIT_CLGI:
2806
0
        svm_vmexit_do_clgi(regs, v);
2807
0
        break;
2808
0
    case VMEXIT_SKINIT:
2809
0
        hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
2810
0
        break;
2811
0
2812
0
    case VMEXIT_XSETBV:
2813
0
        if ( vmcb_get_cpl(vmcb) )
2814
0
            hvm_inject_hw_exception(TRAP_gp_fault, 0);
2815
0
        else if ( (inst_len = __get_instruction_length(v, INSTR_XSETBV)) &&
2816
0
                  hvm_handle_xsetbv(regs->ecx, msr_fold(regs)) == 0 )
2817
0
            __update_guest_eip(regs, inst_len);
2818
0
        break;
2819
0
2820
0
    case VMEXIT_NPF:
2821
0
        perfc_incra(svmexits, VMEXIT_NPF_PERFC);
2822
0
        if ( cpu_has_svm_decode )
2823
0
            v->arch.hvm_svm.cached_insn_len = vmcb->guest_ins_len & 0xf;
2824
0
        rc = vmcb->exitinfo1 & PFEC_page_present
2825
0
             ? p2m_pt_handle_deferred_changes(vmcb->exitinfo2) : 0;
2826
0
        if ( rc >= 0 )
2827
0
            svm_do_nested_pgfault(v, regs, vmcb->exitinfo1, vmcb->exitinfo2);
2828
0
        else
2829
0
        {
2830
0
            printk(XENLOG_G_ERR
2831
0
                   "%pv: Error %d handling NPF (gpa=%08lx ec=%04lx)\n",
2832
0
                   v, rc, vmcb->exitinfo2, vmcb->exitinfo1);
2833
0
            domain_crash(v->domain);
2834
0
        }
2835
0
        v->arch.hvm_svm.cached_insn_len = 0;
2836
0
        break;
2837
0
2838
0
    case VMEXIT_IRET: {
2839
0
        u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb);
2840
0
2841
0
        /*
2842
0
         * IRET clears the NMI mask. However because we clear the mask
2843
0
         * /before/ executing IRET, we set the interrupt shadow to prevent
2844
0
         * a pending NMI from being injected immediately. This will work
2845
0
         * perfectly unless the IRET instruction faults: in that case we
2846
0
         * may inject an NMI before the NMI handler's IRET instruction is
2847
0
         * retired.
2848
0
         */
2849
0
        general1_intercepts &= ~GENERAL1_INTERCEPT_IRET;
2850
0
        vmcb->interrupt_shadow = 1;
2851
0
2852
0
        vmcb_set_general1_intercepts(vmcb, general1_intercepts);
2853
0
        break;
2854
0
    }
2855
0
2856
0
    case VMEXIT_PAUSE:
2857
0
        svm_vmexit_do_pause(regs);
2858
0
        break;
2859
0
2860
0
    case VMEXIT_IDTR_READ:
2861
0
    case VMEXIT_IDTR_WRITE:
2862
0
        hvm_descriptor_access_intercept(vmcb->exitintinfo.bytes, 0,
2863
0
            VM_EVENT_DESC_IDTR, exit_reason == VMEXIT_IDTR_WRITE);
2864
0
        break;
2865
0
2866
0
    case VMEXIT_GDTR_READ:
2867
0
    case VMEXIT_GDTR_WRITE:
2868
0
        hvm_descriptor_access_intercept(vmcb->exitintinfo.bytes, 0,
2869
0
            VM_EVENT_DESC_GDTR, exit_reason == VMEXIT_GDTR_WRITE);
2870
0
        break;
2871
0
2872
0
    case VMEXIT_LDTR_READ:
2873
0
    case VMEXIT_LDTR_WRITE:
2874
0
        hvm_descriptor_access_intercept(vmcb->exitintinfo.bytes, 0,
2875
0
            VM_EVENT_DESC_LDTR, exit_reason == VMEXIT_LDTR_WRITE);
2876
0
        break;
2877
0
2878
0
    case VMEXIT_TR_READ:
2879
0
    case VMEXIT_TR_WRITE:
2880
0
        hvm_descriptor_access_intercept(vmcb->exitintinfo.bytes, 0,
2881
0
            VM_EVENT_DESC_TR, exit_reason == VMEXIT_TR_WRITE);
2882
0
        break;
2883
0
2884
0
    default:
2885
0
    unexpected_exit_type:
2886
0
        gprintk(XENLOG_ERR, "Unexpected vmexit: reason %#"PRIx64", "
2887
0
                "exitinfo1 %#"PRIx64", exitinfo2 %#"PRIx64"\n",
2888
0
                exit_reason, vmcb->exitinfo1, vmcb->exitinfo2);
2889
0
        svm_crash_or_fault(v);
2890
0
        break;
2891
0
    }
2892
0
2893
0
  out:
2894
0
    if ( vcpu_guestmode || vlapic_hw_disabled(vlapic) )
2895
0
        return;
2896
0
2897
0
    /* The exit may have updated the TPR: reflect this in the hardware vtpr */
2898
0
    intr = vmcb_get_vintr(vmcb);
2899
0
    intr.fields.tpr =
2900
0
        (vlapic_get_reg(vlapic, APIC_TASKPRI) & 0xFF) >> 4;
2901
0
    vmcb_set_vintr(vmcb, intr);
2902
0
}
2903
2904
void svm_trace_vmentry(void)
2905
0
{
2906
0
    struct vcpu *curr = current;
2907
0
    HVMTRACE_ND(VMENTRY,
2908
0
                nestedhvm_vcpu_in_guestmode(curr) ? TRC_HVM_NESTEDFLAG : 0,
2909
0
                1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
2910
0
}
2911
  
2912
/*
2913
 * Local variables:
2914
 * mode: C
2915
 * c-file-style: "BSD"
2916
 * c-basic-offset: 4
2917
 * tab-width: 4
2918
 * indent-tabs-mode: nil
2919
 * End:
2920
 */