Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/arch/x86/hvm/vmx/vmx.c
Line
Count
Source
1
/*
2
 * vmx.c: handling VMX architecture-related VM exits
3
 * Copyright (c) 2004, Intel Corporation.
4
 *
5
 * This program is free software; you can redistribute it and/or modify it
6
 * under the terms and conditions of the GNU General Public License,
7
 * version 2, as published by the Free Software Foundation.
8
 *
9
 * This program is distributed in the hope it will be useful, but WITHOUT
10
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12
 * more details.
13
 *
14
 * You should have received a copy of the GNU General Public License along with
15
 * this program; If not, see <http://www.gnu.org/licenses/>.
16
 */
17
18
#include <xen/init.h>
19
#include <xen/lib.h>
20
#include <xen/trace.h>
21
#include <xen/sched.h>
22
#include <xen/irq.h>
23
#include <xen/softirq.h>
24
#include <xen/domain_page.h>
25
#include <xen/hypercall.h>
26
#include <xen/perfc.h>
27
#include <asm/current.h>
28
#include <asm/io.h>
29
#include <asm/iocap.h>
30
#include <asm/regs.h>
31
#include <asm/cpufeature.h>
32
#include <asm/processor.h>
33
#include <asm/guest_access.h>
34
#include <asm/debugreg.h>
35
#include <asm/msr.h>
36
#include <asm/p2m.h>
37
#include <asm/mem_sharing.h>
38
#include <asm/hvm/emulate.h>
39
#include <asm/hvm/hvm.h>
40
#include <asm/hvm/support.h>
41
#include <asm/hvm/vmx/vmx.h>
42
#include <asm/hvm/vmx/vmcs.h>
43
#include <public/sched.h>
44
#include <public/hvm/ioreq.h>
45
#include <asm/hvm/vpic.h>
46
#include <asm/hvm/vlapic.h>
47
#include <asm/x86_emulate.h>
48
#include <asm/hvm/vpt.h>
49
#include <public/hvm/save.h>
50
#include <asm/hvm/trace.h>
51
#include <asm/hvm/monitor.h>
52
#include <asm/xenoprof.h>
53
#include <asm/debugger.h>
54
#include <asm/apic.h>
55
#include <asm/hvm/nestedhvm.h>
56
#include <asm/altp2m.h>
57
#include <asm/event.h>
58
#include <asm/mce.h>
59
#include <asm/monitor.h>
60
#include <public/arch-x86/cpuid.h>
61
62
static bool_t __initdata opt_force_ept;
63
boolean_param("force-ept", opt_force_ept);
64
65
enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
66
67
static void vmx_ctxt_switch_from(struct vcpu *v);
68
static void vmx_ctxt_switch_to(struct vcpu *v);
69
70
static int  vmx_alloc_vlapic_mapping(struct domain *d);
71
static void vmx_free_vlapic_mapping(struct domain *d);
72
static void vmx_install_vlapic_mapping(struct vcpu *v);
73
static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr);
74
static void vmx_update_guest_efer(struct vcpu *v);
75
static void vmx_update_guest_vendor(struct vcpu *v);
76
static void vmx_wbinvd_intercept(void);
77
static void vmx_fpu_dirty_intercept(void);
78
static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content);
79
static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content);
80
static void vmx_invlpg(struct vcpu *v, unsigned long vaddr);
81
static int vmx_vmfunc_intercept(struct cpu_user_regs *regs);
82
83
struct vmx_pi_blocking_vcpu {
84
    struct list_head     list;
85
    spinlock_t           lock;
86
};
87
88
/*
89
 * We maintain a per-CPU linked list of vCPUs, so that in the PI wakeup
90
 * handler we can find which vCPU should be woken up.
91
 */
92
static DEFINE_PER_CPU(struct vmx_pi_blocking_vcpu, vmx_pi_blocking);
93
94
uint8_t __read_mostly posted_intr_vector;
95
static uint8_t __read_mostly pi_wakeup_vector;
96
97
void vmx_pi_per_cpu_init(unsigned int cpu)
98
12
{
99
12
    INIT_LIST_HEAD(&per_cpu(vmx_pi_blocking, cpu).list);
100
12
    spin_lock_init(&per_cpu(vmx_pi_blocking, cpu).lock);
101
12
}
102
103
static void vmx_vcpu_block(struct vcpu *v)
104
0
{
105
0
    unsigned long flags;
106
0
    unsigned int dest;
107
0
    spinlock_t *old_lock;
108
0
    spinlock_t *pi_blocking_list_lock =
109
0
    &per_cpu(vmx_pi_blocking, v->processor).lock;
110
0
    struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
111
0
112
0
    spin_lock_irqsave(pi_blocking_list_lock, flags);
113
0
    old_lock = cmpxchg(&v->arch.hvm_vmx.pi_blocking.lock, NULL,
114
0
                       pi_blocking_list_lock);
115
0
116
0
    /*
117
0
     * 'v->arch.hvm_vmx.pi_blocking.lock' should be NULL before
118
0
     * being assigned to a new value, since the vCPU is currently
119
0
     * running and it cannot be on any blocking list.
120
0
     */
121
0
    ASSERT(old_lock == NULL);
122
0
123
0
    list_add_tail(&v->arch.hvm_vmx.pi_blocking.list,
124
0
                  &per_cpu(vmx_pi_blocking, v->processor).list);
125
0
    spin_unlock_irqrestore(pi_blocking_list_lock, flags);
126
0
127
0
    ASSERT(!pi_test_sn(pi_desc));
128
0
129
0
    dest = cpu_physical_id(v->processor);
130
0
131
0
    ASSERT(pi_desc->ndst ==
132
0
           (x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK)));
133
0
134
0
    write_atomic(&pi_desc->nv, pi_wakeup_vector);
135
0
}
136
137
static void vmx_pi_switch_from(struct vcpu *v)
138
0
{
139
0
    struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
140
0
141
0
    if ( test_bit(_VPF_blocked, &v->pause_flags) )
142
0
        return;
143
0
144
0
    pi_set_sn(pi_desc);
145
0
}
146
147
static void vmx_pi_switch_to(struct vcpu *v)
148
0
{
149
0
    struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
150
0
    unsigned int dest = cpu_physical_id(v->processor);
151
0
152
0
    write_atomic(&pi_desc->ndst,
153
0
                 x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
154
0
155
0
    pi_clear_sn(pi_desc);
156
0
}
157
158
static void vmx_pi_unblock_vcpu(struct vcpu *v)
159
0
{
160
0
    unsigned long flags;
161
0
    spinlock_t *pi_blocking_list_lock;
162
0
    struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
163
0
164
0
    /*
165
0
     * Set 'NV' field back to posted_intr_vector, so the
166
0
     * Posted-Interrupts can be delivered to the vCPU when
167
0
     * it is running in non-root mode.
168
0
     */
169
0
    write_atomic(&pi_desc->nv, posted_intr_vector);
170
0
171
0
    pi_blocking_list_lock = v->arch.hvm_vmx.pi_blocking.lock;
172
0
173
0
    /* Prevent the compiler from eliminating the local variable. */
174
0
    smp_rmb();
175
0
176
0
    /* The vCPU is not on any blocking list. */
177
0
    if ( pi_blocking_list_lock == NULL )
178
0
        return;
179
0
180
0
    spin_lock_irqsave(pi_blocking_list_lock, flags);
181
0
182
0
    /*
183
0
     * v->arch.hvm_vmx.pi_blocking.lock == NULL here means the vCPU
184
0
     * was removed from the blocking list while we were acquiring the lock.
185
0
     */
186
0
    if ( v->arch.hvm_vmx.pi_blocking.lock != NULL )
187
0
    {
188
0
        ASSERT(v->arch.hvm_vmx.pi_blocking.lock == pi_blocking_list_lock);
189
0
        list_del(&v->arch.hvm_vmx.pi_blocking.list);
190
0
        v->arch.hvm_vmx.pi_blocking.lock = NULL;
191
0
    }
192
0
193
0
    spin_unlock_irqrestore(pi_blocking_list_lock, flags);
194
0
}
195
196
static void vmx_pi_do_resume(struct vcpu *v)
197
0
{
198
0
    ASSERT(!test_bit(_VPF_blocked, &v->pause_flags));
199
0
200
0
    vmx_pi_unblock_vcpu(v);
201
0
}
202
203
void vmx_pi_desc_fixup(unsigned int cpu)
204
0
{
205
0
    unsigned int new_cpu, dest;
206
0
    unsigned long flags;
207
0
    struct arch_vmx_struct *vmx, *tmp;
208
0
    spinlock_t *new_lock, *old_lock = &per_cpu(vmx_pi_blocking, cpu).lock;
209
0
    struct list_head *blocked_vcpus = &per_cpu(vmx_pi_blocking, cpu).list;
210
0
211
0
    if ( !iommu_intpost )
212
0
        return;
213
0
214
0
    /*
215
0
     * We are in the context of CPU_DEAD or CPU_UP_CANCELED notification,
216
0
     * and it is impossible for a second CPU to go down in parallel. So we
217
0
     * can safely acquire the old cpu's lock and then acquire the new_cpu's
218
0
     * lock after that.
219
0
     */
220
0
    spin_lock_irqsave(old_lock, flags);
221
0
222
0
    list_for_each_entry_safe(vmx, tmp, blocked_vcpus, pi_blocking.list)
223
0
    {
224
0
        /*
225
0
         * Suppress notification or we may miss an interrupt when the
226
0
         * target cpu is dying.
227
0
         */
228
0
        pi_set_sn(&vmx->pi_desc);
229
0
230
0
        /*
231
0
         * Check whether a notification is pending before doing the
232
0
         * movement; if that is the case we need to wake it up directly
233
0
         * rather than moving it to the new cpu's list.
234
0
         */
235
0
        if ( pi_test_on(&vmx->pi_desc) )
236
0
        {
237
0
            list_del(&vmx->pi_blocking.list);
238
0
            vmx->pi_blocking.lock = NULL;
239
0
            vcpu_unblock(container_of(vmx, struct vcpu, arch.hvm_vmx));
240
0
        }
241
0
        else
242
0
        {
243
0
            /*
244
0
             * We need to find an online cpu as the NDST of the PI descriptor; it
245
0
             * doesn't matter whether it is within the cpupool of the domain or
246
0
             * not. As long as it is online, the vCPU will be woken up once the
247
0
             * notification event arrives.
248
0
             */
249
0
            new_cpu = cpumask_any(&cpu_online_map);
250
0
            new_lock = &per_cpu(vmx_pi_blocking, new_cpu).lock;
251
0
252
0
            spin_lock(new_lock);
253
0
254
0
            ASSERT(vmx->pi_blocking.lock == old_lock);
255
0
256
0
            dest = cpu_physical_id(new_cpu);
257
0
            write_atomic(&vmx->pi_desc.ndst,
258
0
                         x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
259
0
260
0
            list_move(&vmx->pi_blocking.list,
261
0
                      &per_cpu(vmx_pi_blocking, new_cpu).list);
262
0
            vmx->pi_blocking.lock = new_lock;
263
0
264
0
            spin_unlock(new_lock);
265
0
        }
266
0
267
0
        pi_clear_sn(&vmx->pi_desc);
268
0
    }
269
0
270
0
    spin_unlock_irqrestore(old_lock, flags);
271
0
}
272
273
/*
274
 * To handle posted interrupts correctly, we need to set the following
275
 * state:
276
 *
277
 * * The PI notification vector (NV)
278
 * * The PI notification destination processor (NDST)
279
 * * The PI "suppress notification" bit (SN)
280
 * * The vcpu pi "blocked" list
281
 *
282
 * VMX implements the runstate transitions as the following:
283
 *
284
 * A: ... -> running
285
 *  - SN = 0
286
 *  - NDST = v->processor
287
 *  If a VM is currently running, we want the PI delivered to the guest vcpu
288
 *  on the proper pcpu.
289
 *
290
 * B: running -> ...
291
 *  - SN = 1
292
 *
293
 * C: ... -> blocked
294
 *  - SN = 0
295
 *  - NV = pi_wakeup_vector
296
 *  - Add vcpu to blocked list
297
 *  If the vm is blocked, we want the PI delivered to Xen so that it can
298
 *  wake it up.
299
 *
300
 * D: ... -> vmentry
301
 *  - SN = 0
302
 *  - NV = posted_intr_vector
303
 *  - Take vcpu off blocked list
304
 *
305
 *  If the VM is currently either preempted or offline (i.e., not running
306
 *  because of some reason other than blocking waiting for an interrupt),
307
 *  there's nothing Xen can do -- we want the interrupt pending bit set in
308
 *  the guest, but we don't want to bother Xen with an interrupt (SN clear).
309
 *
310
 * There's a brief window of time between vmx_intr_assist() and checking
311
 * softirqs where if an interrupt comes in it may be lost; so we need Xen
312
 * to get an interrupt and raise a softirq so that it will go through the
313
 * vmx_intr_assist() path again (SN clear, NV = posted_interrupt).
314
 */
315
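The wakeup side referred to above (the pi_wakeup_vector installed in transition C) is not shown in this report. A minimal sketch of what such a handler does conceptually, reusing helpers that appear elsewhere in this file (pi_test_on, vcpu_unblock, list_for_each_entry_safe) -- this is an illustration, not the actual Xen handler:

    /* Illustrative sketch only, not the real wakeup-vector handler. */
    static void sketch_pi_wakeup(unsigned int cpu)
    {
        struct vmx_pi_blocking_vcpu *pcpu = &per_cpu(vmx_pi_blocking, cpu);
        struct arch_vmx_struct *vmx, *tmp;

        spin_lock(&pcpu->lock);
        list_for_each_entry_safe(vmx, tmp, &pcpu->list, pi_blocking.list)
        {
            if ( pi_test_on(&vmx->pi_desc) )    /* notification posted? */
            {
                list_del(&vmx->pi_blocking.list);
                vmx->pi_blocking.lock = NULL;
                vcpu_unblock(container_of(vmx, struct vcpu, arch.hvm_vmx));
            }
        }
        spin_unlock(&pcpu->lock);
    }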
316
/* This function is called when pcidevs_lock is held */
317
void vmx_pi_hooks_assign(struct domain *d)
318
0
{
319
0
    struct vcpu *v;
320
0
321
0
    if ( !iommu_intpost || !is_hvm_domain(d) )
322
0
        return;
323
0
324
0
    ASSERT(!d->arch.hvm_domain.pi_ops.vcpu_block);
325
0
326
0
    /*
327
0
     * We carefully handle the timing here:
328
0
     * - Install the context switch first
329
0
     * - Then set the NDST field
330
0
     * - Install the block and resume hooks in the end
331
0
     *
332
0
     * This makes sure the PI (especially the NDST field) is
333
0
     * in a proper state when we call vmx_vcpu_block().
334
0
     */
335
0
    d->arch.hvm_domain.pi_ops.switch_from = vmx_pi_switch_from;
336
0
    d->arch.hvm_domain.pi_ops.switch_to = vmx_pi_switch_to;
337
0
338
0
    for_each_vcpu ( d, v )
339
0
    {
340
0
        unsigned int dest = cpu_physical_id(v->processor);
341
0
        struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
342
0
343
0
        /*
344
0
         * We don't need to update NDST if vmx_pi_switch_to()
345
0
         * has already been called.
346
0
         */
347
0
        (void)cmpxchg(&pi_desc->ndst, APIC_INVALID_DEST,
348
0
                x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
349
0
    }
350
0
351
0
    d->arch.hvm_domain.pi_ops.vcpu_block = vmx_vcpu_block;
352
0
    d->arch.hvm_domain.pi_ops.do_resume = vmx_pi_do_resume;
353
0
}
354
355
/* This function is called when pcidevs_lock is held */
356
void vmx_pi_hooks_deassign(struct domain *d)
357
0
{
358
0
    struct vcpu *v;
359
0
360
0
    if ( !iommu_intpost || !is_hvm_domain(d) )
361
0
        return;
362
0
363
0
    ASSERT(d->arch.hvm_domain.pi_ops.vcpu_block);
364
0
365
0
    /*
366
0
     * Pausing the domain ensures that the vCPUs are not
367
0
     * running and hence not calling the hooks simultaneously
368
0
     * when deassigning the PI hooks and removing the vCPU
369
0
     * from the blocking list.
370
0
     */
371
0
    ASSERT(current->domain != d);
372
0
    domain_pause(d);
373
0
374
0
    /*
375
0
     * Note that we don't set 'd->arch.hvm_domain.pi_ops.switch_to' to NULL
376
0
     * here. If we deassign the hooks while the vCPU is runnable in the
377
0
     * runqueue with 'SN' set, all future notification events will be
378
0
     * suppressed, since vmx_deliver_posted_intr() also uses the 'SN' bit
379
0
     * as the suppression flag. Preserving the 'switch_to' hook function can
380
0
     * clear the 'SN' bit when the vCPU becomes running next time. After
381
0
     * that, no matter which state (runnable, running or blocked) the vCPU is in,
382
0
     * the 'SN' bit will stay clear, because the 'switch_from' hook function that sets
383
0
     * the 'SN' bit has been removed. At that time, the 'switch_to' hook function
384
0
     * is also useless. Since the function doesn't do harm to the whole
385
0
     * system, leave it here until we find a clean solution to deassign the
386
0
     * 'switch_to' hook function.
387
0
     */
388
0
    d->arch.hvm_domain.pi_ops.vcpu_block = NULL;
389
0
    d->arch.hvm_domain.pi_ops.switch_from = NULL;
390
0
    d->arch.hvm_domain.pi_ops.do_resume = NULL;
391
0
392
0
    for_each_vcpu ( d, v )
393
0
        vmx_pi_unblock_vcpu(v);
394
0
395
0
    domain_unpause(d);
396
0
}
397
398
static int vmx_domain_initialise(struct domain *d)
399
1
{
400
1
    static const struct arch_csw csw = {
401
1
        .from = vmx_ctxt_switch_from,
402
1
        .to   = vmx_ctxt_switch_to,
403
1
        .tail = vmx_do_resume,
404
1
    };
405
1
    int rc;
406
1
407
1
    d->arch.ctxt_switch = &csw;
408
1
409
1
    if ( !has_vlapic(d) )
410
0
        return 0;
411
1
412
1
    if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 )
413
0
        return rc;
414
1
415
1
    return 0;
416
1
}
417
418
static void vmx_domain_destroy(struct domain *d)
419
0
{
420
0
    if ( !has_vlapic(d) )
421
0
        return;
422
0
423
0
    vmx_free_vlapic_mapping(d);
424
0
}
425
426
static int vmx_vcpu_initialise(struct vcpu *v)
427
12
{
428
12
    int rc;
429
12
430
12
    spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
431
12
432
12
    INIT_LIST_HEAD(&v->arch.hvm_vmx.pi_blocking.list);
433
12
434
12
    if ( (rc = vmx_create_vmcs(v)) != 0 )
435
0
    {
436
0
        dprintk(XENLOG_WARNING,
437
0
                "Failed to create VMCS for vcpu %d: err=%d.\n",
438
0
                v->vcpu_id, rc);
439
0
        return rc;
440
0
    }
441
12
442
12
    /*
443
12
     * It's rare but still possible that the domain is already in log-dirty
444
12
     * mode when the vcpu is being created (as noted by Tim), in which case we
445
12
     * should enable PML for this vcpu if PML has been enabled for the domain,
446
12
     * and failure to enable it results in failure to create this vcpu.
447
12
     *
448
12
     * Note that even if no vcpu has been created for the domain, vmx_domain_enable_pml
449
12
     * will return success, in which case vmx_domain_pml_enabled will also
450
12
     * return true. And even if this is the first vcpu to be created with
451
12
     * vmx_domain_pml_enabled being true, failure to enable PML still results
452
12
     * in failure to create the vcpu, to avoid complicated logic to revert a PML
453
12
     * style EPT table to a non-PML style EPT table.
454
12
     */
455
12
    if ( vmx_domain_pml_enabled(v->domain) )
456
0
    {
457
0
        if ( (rc = vmx_vcpu_enable_pml(v)) != 0 )
458
0
        {
459
0
            dprintk(XENLOG_ERR, "%pv: Failed to enable PML.\n", v);
460
0
            vmx_destroy_vmcs(v);
461
0
            return rc;
462
0
        }
463
0
    }
464
12
465
12
    vmx_install_vlapic_mapping(v);
466
12
467
12
    /* %eax == 1 signals full real-mode support to the guest loader. */
468
12
    if ( v->vcpu_id == 0 )
469
1
        v->arch.user_regs.rax = 1;
470
12
471
12
    return 0;
472
12
}
473
474
static void vmx_vcpu_destroy(struct vcpu *v)
475
0
{
476
0
    /*
477
0
     * There are cases where the domain still remains in log-dirty mode when it is
478
0
     * about to be destroyed (e.g. the user types 'xl destroy <dom>'), in which case
479
0
     * we should disable PML manually here. Note that vmx_vcpu_destroy is called
480
0
     * prior to vmx_domain_destroy so we need to disable PML for each vcpu
481
0
     * separately here.
482
0
     */
483
0
    vmx_vcpu_disable_pml(v);
484
0
    vmx_destroy_vmcs(v);
485
0
    passive_domain_destroy(v);
486
0
}
487
488
static enum handler_return
489
long_mode_do_msr_read(unsigned int msr, uint64_t *msr_content)
490
0
{
491
0
    struct vcpu *v = current;
492
0
493
0
    switch ( msr )
494
0
    {
495
0
    case MSR_FS_BASE:
496
0
        __vmread(GUEST_FS_BASE, msr_content);
497
0
        break;
498
0
499
0
    case MSR_GS_BASE:
500
0
        __vmread(GUEST_GS_BASE, msr_content);
501
0
        break;
502
0
503
0
    case MSR_SHADOW_GS_BASE:
504
0
        rdmsrl(MSR_SHADOW_GS_BASE, *msr_content);
505
0
        break;
506
0
507
0
    case MSR_STAR:
508
0
        *msr_content = v->arch.hvm_vmx.star;
509
0
        break;
510
0
511
0
    case MSR_LSTAR:
512
0
        *msr_content = v->arch.hvm_vmx.lstar;
513
0
        break;
514
0
515
0
    case MSR_CSTAR:
516
0
        *msr_content = v->arch.hvm_vmx.cstar;
517
0
        break;
518
0
519
0
    case MSR_SYSCALL_MASK:
520
0
        *msr_content = v->arch.hvm_vmx.sfmask;
521
0
        break;
522
0
523
0
    default:
524
0
        return HNDL_unhandled;
525
0
    }
526
0
527
0
    HVM_DBG_LOG(DBG_LEVEL_MSR, "msr %#x content %#"PRIx64, msr, *msr_content);
528
0
529
0
    return HNDL_done;
530
0
}
531
532
static enum handler_return
533
long_mode_do_msr_write(unsigned int msr, uint64_t msr_content)
534
50
{
535
50
    struct vcpu *v = current;
536
50
537
50
    HVM_DBG_LOG(DBG_LEVEL_MSR, "msr %#x content %#"PRIx64, msr, msr_content);
538
50
539
50
    switch ( msr )
540
50
    {
541
0
    case MSR_FS_BASE:
542
0
    case MSR_GS_BASE:
543
0
    case MSR_SHADOW_GS_BASE:
544
0
        if ( !is_canonical_address(msr_content) )
545
0
            goto uncanonical_address;
546
0
547
0
        if ( msr == MSR_FS_BASE )
548
0
            __vmwrite(GUEST_FS_BASE, msr_content);
549
0
        else if ( msr == MSR_GS_BASE )
550
0
            __vmwrite(GUEST_GS_BASE, msr_content);
551
0
        else
552
0
            wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
553
0
554
0
        break;
555
0
556
12
    case MSR_STAR:
557
12
        v->arch.hvm_vmx.star = msr_content;
558
12
        wrmsrl(MSR_STAR, msr_content);
559
12
        break;
560
0
561
12
    case MSR_LSTAR:
562
12
        if ( !is_canonical_address(msr_content) )
563
0
            goto uncanonical_address;
564
12
        v->arch.hvm_vmx.lstar = msr_content;
565
12
        wrmsrl(MSR_LSTAR, msr_content);
566
12
        break;
567
12
568
12
    case MSR_CSTAR:
569
12
        if ( !is_canonical_address(msr_content) )
570
0
            goto uncanonical_address;
571
12
        v->arch.hvm_vmx.cstar = msr_content;
572
12
        break;
573
12
574
12
    case MSR_SYSCALL_MASK:
575
12
        v->arch.hvm_vmx.sfmask = msr_content;
576
12
        wrmsrl(MSR_SYSCALL_MASK, msr_content);
577
12
        break;
578
12
579
2
    default:
580
2
        return HNDL_unhandled;
581
50
    }
582
50
583
48
    return HNDL_done;
584
50
585
0
 uncanonical_address:
586
0
    HVM_DBG_LOG(DBG_LEVEL_MSR, "Not cano address of msr write %x", msr);
587
0
    hvm_inject_hw_exception(TRAP_gp_fault, 0);
588
0
    return HNDL_exception_raised;
589
50
}
590
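A quick worked illustration of the is_canonical_address() checks above (assumed example values; with 48-bit virtual addresses a value is canonical when bits 63:47 are all equal):

    0xffff880000000000   /* canonical: bits 63:47 all set */
    0x00007fffffffffff   /* canonical: bits 63:47 all clear */
    0x0000800000000000   /* non-canonical: such an MSR write takes the #GP path above */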
591
/*
592
 * To avoid MSR save/restore at every VM exit/entry time, we restore
593
 * the x86_64 specific MSRs at domain switch time. Since these MSRs
594
 * are not modified once set for para domains, we don't save them,
595
 * but simply reset them to values set in percpu_traps_init().
596
 */
597
static void vmx_restore_host_msrs(void)
598
36.9k
{
599
36.9k
    /* Relies on the SYSCALL trampoline being at the start of the stubs. */
600
36.9k
    wrmsrl(MSR_STAR,         XEN_MSR_STAR);
601
36.9k
    wrmsrl(MSR_LSTAR,        this_cpu(stubs.addr));
602
36.9k
    wrmsrl(MSR_SYSCALL_MASK, XEN_SYSCALL_MASK);
603
36.9k
}
604
605
static void vmx_save_guest_msrs(struct vcpu *v)
606
36.9k
{
607
36.9k
    /*
608
36.9k
     * We cannot cache SHADOW_GS_BASE while the VCPU runs, as it can
609
36.9k
     * be updated at any time via SWAPGS, which we cannot trap.
610
36.9k
     */
611
36.9k
    rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
612
36.9k
}
613
614
static void vmx_restore_guest_msrs(struct vcpu *v)
615
37.0k
{
616
37.0k
    wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
617
37.0k
    wrmsrl(MSR_STAR,           v->arch.hvm_vmx.star);
618
37.0k
    wrmsrl(MSR_LSTAR,          v->arch.hvm_vmx.lstar);
619
37.0k
    wrmsrl(MSR_SYSCALL_MASK,   v->arch.hvm_vmx.sfmask);
620
37.0k
621
37.0k
    if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & EFER_SCE )
622
12
    {
623
12
        HVM_DBG_LOG(DBG_LEVEL_2,
624
12
                    "restore guest's EFER with value %lx",
625
12
                    v->arch.hvm_vcpu.guest_efer);
626
12
        write_efer((read_efer() & ~EFER_SCE) |
627
12
                   (v->arch.hvm_vcpu.guest_efer & EFER_SCE));
628
12
    }
629
37.0k
630
37.0k
    if ( cpu_has_rdtscp )
631
37.0k
        wrmsrl(MSR_TSC_AUX, hvm_msr_tsc_aux(v));
632
37.0k
}
633
634
void vmx_update_cpu_exec_control(struct vcpu *v)
635
192k
{
636
192k
    if ( nestedhvm_vcpu_in_guestmode(v) )
637
0
        nvmx_update_exec_control(v, v->arch.hvm_vmx.exec_control);
638
192k
    else
639
192k
        __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
640
192k
}
641
642
void vmx_update_secondary_exec_control(struct vcpu *v)
643
36
{
644
36
    if ( nestedhvm_vcpu_in_guestmode(v) )
645
0
        nvmx_update_secondary_exec_control(v,
646
0
            v->arch.hvm_vmx.secondary_exec_control);
647
36
    else
648
36
        __vmwrite(SECONDARY_VM_EXEC_CONTROL,
649
36
                  v->arch.hvm_vmx.secondary_exec_control);
650
36
}
651
652
void vmx_update_exception_bitmap(struct vcpu *v)
653
37.2k
{
654
37.2k
    u32 bitmap = unlikely(v->arch.hvm_vmx.vmx_realmode)
655
37.2k
        ? 0xffffffffu : v->arch.hvm_vmx.exception_bitmap;
656
37.2k
657
37.2k
    if ( nestedhvm_vcpu_in_guestmode(v) )
658
0
        nvmx_update_exception_bitmap(v, bitmap);
659
37.2k
    else
660
37.2k
        __vmwrite(EXCEPTION_BITMAP, bitmap);
661
37.2k
}
662
663
static void vmx_update_guest_vendor(struct vcpu *v)
664
12
{
665
12
    if ( opt_hvm_fep ||
666
12
         (v->domain->arch.cpuid->x86_vendor != boot_cpu_data.x86_vendor) )
667
0
        v->arch.hvm_vmx.exception_bitmap |= (1U << TRAP_invalid_op);
668
12
    else
669
12
        v->arch.hvm_vmx.exception_bitmap &= ~(1U << TRAP_invalid_op);
670
12
671
12
    vmx_vmcs_enter(v);
672
12
    vmx_update_exception_bitmap(v);
673
12
    vmx_vmcs_exit(v);
674
12
}
675
676
int vmx_guest_x86_mode(struct vcpu *v)
677
5.69M
{
678
5.69M
    unsigned long cs_ar_bytes;
679
5.69M
680
5.69M
    if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
681
0
        return 0;
682
5.69M
    if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
683
0
        return 1;
684
5.69M
    __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes);
685
5.69M
    if ( hvm_long_mode_active(v) &&
686
5.69M
         likely(cs_ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
687
5.72M
        return 8;
688
18.4E
    return (likely(cs_ar_bytes & X86_SEG_AR_DEF_OP_SIZE) ? 4 : 2);
689
5.69M
}
690
691
static void vmx_save_dr(struct vcpu *v)
692
37.0k
{
693
37.0k
    if ( !v->arch.hvm_vcpu.flag_dr_dirty )
694
37.0k
        return;
695
37.0k
696
37.0k
    /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
697
18.4E
    v->arch.hvm_vcpu.flag_dr_dirty = 0;
698
18.4E
    v->arch.hvm_vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
699
18.4E
    vmx_update_cpu_exec_control(v);
700
18.4E
701
18.4E
    v->arch.debugreg[0] = read_debugreg(0);
702
18.4E
    v->arch.debugreg[1] = read_debugreg(1);
703
18.4E
    v->arch.debugreg[2] = read_debugreg(2);
704
18.4E
    v->arch.debugreg[3] = read_debugreg(3);
705
18.4E
    v->arch.debugreg[6] = read_debugreg(6);
706
18.4E
    /* DR7 must be saved as it is used by vmx_restore_dr(). */
707
18.4E
    __vmread(GUEST_DR7, &v->arch.debugreg[7]);
708
18.4E
}
709
710
static void __restore_debug_registers(struct vcpu *v)
711
0
{
712
0
    if ( v->arch.hvm_vcpu.flag_dr_dirty )
713
0
        return;
714
0
715
0
    v->arch.hvm_vcpu.flag_dr_dirty = 1;
716
0
717
0
    write_debugreg(0, v->arch.debugreg[0]);
718
0
    write_debugreg(1, v->arch.debugreg[1]);
719
0
    write_debugreg(2, v->arch.debugreg[2]);
720
0
    write_debugreg(3, v->arch.debugreg[3]);
721
0
    write_debugreg(6, v->arch.debugreg[6]);
722
0
    /* DR7 is loaded from the VMCS. */
723
0
}
724
725
/*
726
 * DR7 is saved and restored on every vmexit.  Other debug registers only
727
 * need to be restored if their value is going to affect execution -- i.e.,
728
 * if one of the breakpoints is enabled.  So mask out all bits that don't
729
 * enable some breakpoint functionality.
730
 */
731
static void vmx_restore_dr(struct vcpu *v)
732
37.0k
{
733
37.0k
    /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
734
37.0k
    if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
735
0
        __restore_debug_registers(v);
736
37.0k
}
737
738
static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
739
0
{
740
0
    unsigned long ev;
741
0
742
0
    vmx_vmcs_enter(v);
743
0
744
0
    c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
745
0
    c->cr2 = v->arch.hvm_vcpu.guest_cr[2];
746
0
    c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
747
0
    c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
748
0
749
0
    c->msr_efer = v->arch.hvm_vcpu.guest_efer;
750
0
751
0
    __vmread(GUEST_SYSENTER_CS, &c->sysenter_cs);
752
0
    __vmread(GUEST_SYSENTER_ESP, &c->sysenter_esp);
753
0
    __vmread(GUEST_SYSENTER_EIP, &c->sysenter_eip);
754
0
755
0
    c->pending_event = 0;
756
0
    c->error_code = 0;
757
0
    __vmread(VM_ENTRY_INTR_INFO, &ev);
758
0
    if ( (ev & INTR_INFO_VALID_MASK) &&
759
0
         hvm_event_needs_reinjection(MASK_EXTR(ev, INTR_INFO_INTR_TYPE_MASK),
760
0
                                     ev & INTR_INFO_VECTOR_MASK) )
761
0
    {
762
0
        c->pending_event = ev;
763
0
        __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE, &ev);
764
0
        c->error_code = ev;
765
0
    }
766
0
767
0
    vmx_vmcs_exit(v);
768
0
}
769
770
static int vmx_restore_cr0_cr3(
771
    struct vcpu *v, unsigned long cr0, unsigned long cr3)
772
0
{
773
0
    struct page_info *page = NULL;
774
0
775
0
    if ( paging_mode_shadow(v->domain) )
776
0
    {
777
0
        if ( cr0 & X86_CR0_PG )
778
0
        {
779
0
            page = get_page_from_gfn(v->domain, cr3 >> PAGE_SHIFT,
780
0
                                     NULL, P2M_ALLOC);
781
0
            if ( !page )
782
0
            {
783
0
                gdprintk(XENLOG_ERR, "Invalid CR3 value=%#lx\n", cr3);
784
0
                return -EINVAL;
785
0
            }
786
0
        }
787
0
788
0
        if ( hvm_paging_enabled(v) )
789
0
            put_page(pagetable_get_page(v->arch.guest_table));
790
0
791
0
        v->arch.guest_table =
792
0
            page ? pagetable_from_page(page) : pagetable_null();
793
0
    }
794
0
795
0
    v->arch.hvm_vcpu.guest_cr[0] = cr0 | X86_CR0_ET;
796
0
    v->arch.hvm_vcpu.guest_cr[3] = cr3;
797
0
798
0
    return 0;
799
0
}
800
801
static int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
802
0
{
803
0
    int rc;
804
0
805
0
    if ( c->pending_valid )
806
0
    {
807
0
        if ( (c->pending_type == 1) || (c->pending_type > 6) ||
808
0
             (c->pending_reserved != 0) )
809
0
        {
810
0
            dprintk(XENLOG_ERR, "%pv: Invalid pending event %#"PRIx32"\n",
811
0
                    v, c->pending_event);
812
0
            return -EINVAL;
813
0
        }
814
0
815
0
        if ( c->pending_error_valid &&
816
0
             c->error_code != (uint16_t)c->error_code )
817
0
        {
818
0
            dprintk(XENLOG_ERR, "%pv: Invalid error code %#"PRIx32"\n",
819
0
                    v, c->error_code);
820
0
            return -EINVAL;
821
0
        }
822
0
    }
823
0
824
0
    rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3);
825
0
    if ( rc )
826
0
        return rc;
827
0
828
0
    vmx_vmcs_enter(v);
829
0
830
0
    v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
831
0
    v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
832
0
    vmx_update_guest_cr(v, 0);
833
0
    vmx_update_guest_cr(v, 2);
834
0
    vmx_update_guest_cr(v, 4);
835
0
836
0
    v->arch.hvm_vcpu.guest_efer = c->msr_efer;
837
0
    vmx_update_guest_efer(v);
838
0
839
0
    __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
840
0
    __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
841
0
    __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
842
0
843
0
    __vmwrite(GUEST_DR7, c->dr7);
844
0
845
0
    if ( c->pending_valid &&
846
0
         hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
847
0
    {
848
0
        gdprintk(XENLOG_INFO, "Re-injecting %#"PRIx32", %#"PRIx32"\n",
849
0
                 c->pending_event, c->error_code);
850
0
        __vmwrite(VM_ENTRY_INTR_INFO, c->pending_event);
851
0
        __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
852
0
    }
853
0
    else
854
0
    {
855
0
        __vmwrite(VM_ENTRY_INTR_INFO, 0);
856
0
        __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
857
0
    }
858
0
    vmx_vmcs_exit(v);
859
0
860
0
    paging_update_paging_modes(v);
861
0
862
0
    return 0;
863
0
}
864
865
static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
866
0
{
867
0
    data->shadow_gs        = v->arch.hvm_vmx.shadow_gs;
868
0
    data->msr_flags        = 0;
869
0
    data->msr_lstar        = v->arch.hvm_vmx.lstar;
870
0
    data->msr_star         = v->arch.hvm_vmx.star;
871
0
    data->msr_cstar        = v->arch.hvm_vmx.cstar;
872
0
    data->msr_syscall_mask = v->arch.hvm_vmx.sfmask;
873
0
}
874
875
static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
876
0
{
877
0
    v->arch.hvm_vmx.shadow_gs = data->shadow_gs;
878
0
    v->arch.hvm_vmx.star      = data->msr_star;
879
0
    v->arch.hvm_vmx.lstar     = data->msr_lstar;
880
0
    v->arch.hvm_vmx.cstar     = data->msr_cstar;
881
0
    v->arch.hvm_vmx.sfmask    = data->msr_syscall_mask;
882
0
}
883
884
885
static void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
886
0
{
887
0
    vmx_save_cpu_state(v, ctxt);
888
0
    vmx_vmcs_save(v, ctxt);
889
0
}
890
891
static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
892
0
{
893
0
    vmx_load_cpu_state(v, ctxt);
894
0
895
0
    if ( vmx_vmcs_restore(v, ctxt) )
896
0
    {
897
0
        gdprintk(XENLOG_ERR, "vmx_vmcs restore failed!\n");
898
0
        domain_crash(v->domain);
899
0
        return -EINVAL;
900
0
    }
901
0
902
0
    return 0;
903
0
}
904
905
static unsigned int __init vmx_init_msr(void)
906
1
{
907
1
    return (cpu_has_mpx && cpu_has_vmx_mpx) +
908
1
           (cpu_has_xsaves && cpu_has_vmx_xsaves);
909
1
}
910
911
static void vmx_save_msr(struct vcpu *v, struct hvm_msr *ctxt)
912
0
{
913
0
    vmx_vmcs_enter(v);
914
0
915
0
    if ( cpu_has_mpx && cpu_has_vmx_mpx )
916
0
    {
917
0
        __vmread(GUEST_BNDCFGS, &ctxt->msr[ctxt->count].val);
918
0
        if ( ctxt->msr[ctxt->count].val )
919
0
            ctxt->msr[ctxt->count++].index = MSR_IA32_BNDCFGS;
920
0
    }
921
0
922
0
    vmx_vmcs_exit(v);
923
0
924
0
    if ( cpu_has_xsaves && cpu_has_vmx_xsaves )
925
0
    {
926
0
        ctxt->msr[ctxt->count].val = v->arch.hvm_vcpu.msr_xss;
927
0
        if ( ctxt->msr[ctxt->count].val )
928
0
            ctxt->msr[ctxt->count++].index = MSR_IA32_XSS;
929
0
    }
930
0
}
931
932
static int vmx_load_msr(struct vcpu *v, struct hvm_msr *ctxt)
933
0
{
934
0
    unsigned int i;
935
0
    int err = 0;
936
0
937
0
    vmx_vmcs_enter(v);
938
0
939
0
    for ( i = 0; i < ctxt->count; ++i )
940
0
    {
941
0
        switch ( ctxt->msr[i].index )
942
0
        {
943
0
        case MSR_IA32_BNDCFGS:
944
0
            if ( cpu_has_mpx && cpu_has_vmx_mpx &&
945
0
                 is_canonical_address(ctxt->msr[i].val) &&
946
0
                 !(ctxt->msr[i].val & IA32_BNDCFGS_RESERVED) )
947
0
                __vmwrite(GUEST_BNDCFGS, ctxt->msr[i].val);
948
0
            else if ( ctxt->msr[i].val )
949
0
                err = -ENXIO;
950
0
            break;
951
0
        case MSR_IA32_XSS:
952
0
            if ( cpu_has_xsaves && cpu_has_vmx_xsaves )
953
0
                v->arch.hvm_vcpu.msr_xss = ctxt->msr[i].val;
954
0
            else
955
0
                err = -ENXIO;
956
0
            break;
957
0
        default:
958
0
            continue;
959
0
        }
960
0
        if ( err )
961
0
            break;
962
0
        ctxt->msr[i]._rsvd = 1;
963
0
    }
964
0
965
0
    vmx_vmcs_exit(v);
966
0
967
0
    return err;
968
0
}
969
970
static void vmx_fpu_enter(struct vcpu *v)
971
3.82k
{
972
3.82k
    vcpu_restore_fpu_lazy(v);
973
3.82k
    v->arch.hvm_vmx.exception_bitmap &= ~(1u << TRAP_no_device);
974
3.82k
    vmx_update_exception_bitmap(v);
975
3.82k
    v->arch.hvm_vmx.host_cr0 &= ~X86_CR0_TS;
976
3.82k
    __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
977
3.82k
}
978
979
static void vmx_fpu_leave(struct vcpu *v)
980
36.9k
{
981
36.9k
    ASSERT(!v->fpu_dirtied);
982
36.9k
    ASSERT(read_cr0() & X86_CR0_TS);
983
36.9k
984
36.9k
    if ( !(v->arch.hvm_vmx.host_cr0 & X86_CR0_TS) )
985
275
    {
986
275
        v->arch.hvm_vmx.host_cr0 |= X86_CR0_TS;
987
275
        __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
988
275
    }
989
36.9k
990
36.9k
    /*
991
36.9k
     * If the guest does not have TS enabled then we must cause and handle an
992
36.9k
     * exception on first use of the FPU. If the guest *does* have TS enabled
993
36.9k
     * then this is not necessary: no FPU activity can occur until the guest
994
36.9k
     * clears CR0.TS, and we will initialise the FPU when that happens.
995
36.9k
     */
996
36.9k
    if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
997
33.3k
    {
998
33.3k
        v->arch.hvm_vcpu.hw_cr[0] |= X86_CR0_TS;
999
33.3k
        __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1000
33.3k
        v->arch.hvm_vmx.exception_bitmap |= (1u << TRAP_no_device);
1001
33.3k
        vmx_update_exception_bitmap(v);
1002
33.3k
    }
1003
36.9k
}
1004
1005
static void vmx_ctxt_switch_from(struct vcpu *v)
1006
36.9k
{
1007
36.9k
    /*
1008
36.9k
     * Return early if trying to do a context switch without VMX enabled,
1009
36.9k
     * this can happen when the hypervisor shuts down with HVM guests
1010
36.9k
     * still running.
1011
36.9k
     */
1012
36.9k
    if ( unlikely(!this_cpu(vmxon)) )
1013
0
        return;
1014
36.9k
1015
36.9k
    if ( !v->is_running )
1016
2.89k
    {
1017
2.89k
        /*
1018
2.89k
         * When this vCPU isn't marked as running anymore, a remote pCPU's
1019
2.89k
         * attempt to pause us (from vmx_vmcs_enter()) won't have a reason
1020
2.89k
         * to spin in vcpu_sleep_sync(), and hence that pCPU might have taken
1021
2.89k
         * away the VMCS from us. As we're running with interrupts disabled,
1022
2.89k
         * we also can't call vmx_vmcs_enter().
1023
2.89k
         */
1024
2.89k
        vmx_vmcs_reload(v);
1025
2.89k
    }
1026
36.9k
1027
36.9k
    vmx_fpu_leave(v);
1028
36.9k
    vmx_save_guest_msrs(v);
1029
36.9k
    vmx_restore_host_msrs();
1030
36.9k
    vmx_save_dr(v);
1031
36.9k
1032
36.9k
    if ( v->domain->arch.hvm_domain.pi_ops.switch_from )
1033
0
        v->domain->arch.hvm_domain.pi_ops.switch_from(v);
1034
36.9k
}
1035
1036
static void vmx_ctxt_switch_to(struct vcpu *v)
1037
37.0k
{
1038
37.0k
    unsigned long old_cr4 = read_cr4(), new_cr4 = mmu_cr4_features;
1039
37.0k
1040
37.0k
    /* HOST_CR4 in VMCS is always mmu_cr4_features. Sync CR4 now. */
1041
37.0k
    if ( old_cr4 != new_cr4 )
1042
0
        write_cr4(new_cr4);
1043
37.0k
1044
37.0k
    vmx_restore_guest_msrs(v);
1045
37.0k
    vmx_restore_dr(v);
1046
37.0k
1047
37.0k
    if ( v->domain->arch.hvm_domain.pi_ops.switch_to )
1048
0
        v->domain->arch.hvm_domain.pi_ops.switch_to(v);
1049
37.0k
}
1050
1051
1052
unsigned int vmx_get_cpl(void)
1053
304k
{
1054
304k
    unsigned long attr;
1055
304k
1056
304k
    __vmread(GUEST_SS_AR_BYTES, &attr);
1057
304k
1058
304k
    return (attr >> 5) & 3;
1059
304k
}
1060
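For illustration of the extraction above (assumed, typical AR-byte values rather than values taken from this report), the DPL field occupies bits 6:5:

    (0xc0f3 >> 5) & 3 == 3   /* flat ring-3 stack segment -> CPL 3 */
    (0xc093 >> 5) & 3 == 0   /* flat ring-0 stack segment -> CPL 0 */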
1061
static unsigned int _vmx_get_cpl(struct vcpu *v)
1062
304k
{
1063
304k
    unsigned int cpl;
1064
304k
1065
304k
    vmx_vmcs_enter(v);
1066
304k
    cpl = vmx_get_cpl();
1067
304k
    vmx_vmcs_exit(v);
1068
304k
1069
304k
    return cpl;
1070
304k
}
1071
1072
/*
1073
 * SDM Vol 3: VM Entries > Checks on Guest Segment Registers:
1074
 *
1075
 * We can only enter virtual 8086 mode if all of CS, SS, DS, ES, FS and GS are
1076
 * 16bit ring-3 data segments.  On hardware lacking the unrestricted_guest
1077
 * feature, Xen fakes up real mode using vm86 mode.  The guest thinks it's got
1078
 * ring-0 segments, so we need to fudge things.  We store the ring-3 version
1079
 * in the VMCS to avoid lots of shuffling on vmenter and vmexit, and translate
1080
 * in these accessors.
1081
 */
1082
0
#define rm_cs_attr   0x9b
1083
0
#define rm_ds_attr   0x93
1084
0
#define vm86_ds_attr 0xf3
1085
0
#define vm86_tr_attr 0x8b
1086
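Decoded for reference (an annotation based on the standard x86 descriptor attribute encoding, not text from the original report):

    /*
     * 0x9b - present, DPL 0, accessed execute/read code segment
     * 0x93 - present, DPL 0, accessed read/write data segment
     * 0xf3 - present, DPL 3, accessed read/write data segment
     * 0x8b - present, DPL 0, busy 32-bit TSS
     */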
1087
static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
1088
                                     struct segment_register *reg)
1089
180k
{
1090
180k
    unsigned long attr = 0, sel = 0, limit;
1091
180k
1092
180k
    /*
1093
180k
     * We may get here in the context of dump_execstate(), which may have
1094
180k
     * interrupted context switching between setting "current" and
1095
180k
     * vmx_do_resume() reaching the end of vmx_load_vmcs(). That would make
1096
180k
     * all the VMREADs below fail if we don't bail right away.
1097
180k
     */
1098
180k
    if ( unlikely(!vmx_vmcs_try_enter(v)) )
1099
0
    {
1100
0
        static bool_t warned;
1101
0
1102
0
        if ( !warned )
1103
0
        {
1104
0
            warned = 1;
1105
0
            printk(XENLOG_WARNING "Segment register inaccessible for %pv\n"
1106
0
                   "(If you see this outside of debugging activity,"
1107
0
                   " please report to xen-devel@lists.xenproject.org)\n",
1108
0
                   v);
1109
0
        }
1110
0
        memset(reg, 0, sizeof(*reg));
1111
0
        return;
1112
0
    }
1113
180k
1114
180k
    switch ( seg )
1115
180k
    {
1116
180k
    case x86_seg_es ... x86_seg_gs:
1117
180k
        __vmread(GUEST_SEG_SELECTOR(seg), &sel);
1118
180k
        __vmread(GUEST_SEG_LIMIT(seg),    &limit);
1119
180k
        __vmread(GUEST_SEG_BASE(seg),     &reg->base);
1120
180k
        __vmread(GUEST_SEG_AR_BYTES(seg), &attr);
1121
180k
        break;
1122
0
    case x86_seg_tr:
1123
0
        __vmread(GUEST_TR_SELECTOR, &sel);
1124
0
        __vmread(GUEST_TR_LIMIT,    &limit);
1125
0
        __vmread(GUEST_TR_BASE,     &reg->base);
1126
0
        __vmread(GUEST_TR_AR_BYTES, &attr);
1127
0
        break;
1128
0
    case x86_seg_gdtr:
1129
0
        __vmread(GUEST_GDTR_LIMIT, &limit);
1130
0
        __vmread(GUEST_GDTR_BASE,  &reg->base);
1131
0
        break;
1132
0
    case x86_seg_idtr:
1133
0
        __vmread(GUEST_IDTR_LIMIT, &limit);
1134
0
        __vmread(GUEST_IDTR_BASE,  &reg->base);
1135
0
        break;
1136
0
    case x86_seg_ldtr:
1137
0
        __vmread(GUEST_LDTR_SELECTOR, &sel);
1138
0
        __vmread(GUEST_LDTR_LIMIT,    &limit);
1139
0
        __vmread(GUEST_LDTR_BASE,     &reg->base);
1140
0
        __vmread(GUEST_LDTR_AR_BYTES, &attr);
1141
0
        break;
1142
0
    default:
1143
0
        BUG();
1144
0
        return;
1145
180k
    }
1146
180k
1147
180k
    vmx_vmcs_exit(v);
1148
180k
1149
180k
    reg->sel = sel;
1150
180k
    reg->limit = limit;
1151
180k
1152
180k
    /*
1153
180k
     * Fold VT-x representation into Xen's representation.  The Present bit is
1154
180k
     * unconditionally set to the inverse of unusable.
1155
180k
     */
1156
180k
    reg->attr =
1157
180k
        (!(attr & (1u << 16)) << 7) | (attr & 0x7f) | ((attr >> 4) & 0xf00);
1158
180k
1159
180k
    /* Adjust for virtual 8086 mode */
1160
180k
    if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr 
1161
0
         && !(v->arch.hvm_vmx.vm86_segment_mask & (1u << seg)) )
1162
0
    {
1163
0
        struct segment_register *sreg = &v->arch.hvm_vmx.vm86_saved_seg[seg];
1164
0
        if ( seg == x86_seg_tr ) 
1165
0
            *reg = *sreg;
1166
0
        else if ( reg->base != sreg->base || seg == x86_seg_ss )
1167
0
        {
1168
0
            /* If the guest's reloaded the segment, remember the new version.
1169
0
             * We can't tell if the guest reloaded the segment with another 
1170
0
             * one that has the same base.  By default we assume it hasn't,
1171
0
             * since we don't want to lose big-real-mode segment attributes,
1172
0
             * but for SS we assume it has: the Ubuntu graphical bootloader
1173
0
             * does this and gets badly confused if we leave the old SS in 
1174
0
             * place. */
1175
0
            reg->attr = (seg == x86_seg_cs ? rm_cs_attr : rm_ds_attr);
1176
0
            *sreg = *reg;
1177
0
        }
1178
0
        else 
1179
0
        {
1180
0
            /* Always give realmode guests a selector that matches the base
1181
0
             * but keep the attr and limit from before */
1182
0
            *reg = *sreg;
1183
0
            reg->sel = reg->base >> 4;
1184
0
        }
1185
0
    }
1186
180k
}
1187
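As a worked example of the attribute fold above (0xa09b is an assumed, typical VT-x AR value for a 64-bit code segment, not a value taken from this report):

    unsigned long attr = 0xa09b;                       /* assumed VT-x AR-bytes value */
    unsigned int folded = (!(attr & (1u << 16)) << 7)  /* P = !unusable -> 0x080 */
                        | (attr & 0x7f)                /* type/S/DPL    -> 0x01b */
                        | ((attr >> 4) & 0xf00);       /* AVL/L/D/G     -> 0xa00 */
    /* folded == 0xa9b, Xen's packed segment attribute layout. */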
1188
static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
1189
                                     struct segment_register *reg)
1190
115
{
1191
115
    uint32_t attr, sel, limit;
1192
115
    uint64_t base;
1193
115
1194
115
    sel = reg->sel;
1195
115
    attr = reg->attr;
1196
115
    limit = reg->limit;
1197
115
    base = reg->base;
1198
115
1199
115
    /* Adjust CS/SS/DS/ES/FS/GS/TR for virtual 8086 mode */
1200
115
    if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr )
1201
0
    {
1202
0
        /* Remember the proper contents */
1203
0
        v->arch.hvm_vmx.vm86_saved_seg[seg] = *reg;
1204
0
        
1205
0
        if ( seg == x86_seg_tr ) 
1206
0
        {
1207
0
            const struct domain *d = v->domain;
1208
0
            uint64_t val = d->arch.hvm_domain.params[HVM_PARAM_VM86_TSS_SIZED];
1209
0
1210
0
            if ( val )
1211
0
            {
1212
0
                sel = 0;
1213
0
                attr = vm86_tr_attr;
1214
0
                limit = ((val & ~VM86_TSS_UPDATED) >> 32) - 1;
1215
0
                base = (uint32_t)val;
1216
0
                if ( val & VM86_TSS_UPDATED )
1217
0
                {
1218
0
                    hvm_prepare_vm86_tss(v, base, limit);
1219
0
                    cmpxchg(&d->arch.hvm_domain.params[HVM_PARAM_VM86_TSS_SIZED],
1220
0
                            val, val & ~VM86_TSS_UPDATED);
1221
0
                }
1222
0
                v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
1223
0
            }
1224
0
            else
1225
0
                v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
1226
0
        }
1227
0
        else
1228
0
        {
1229
0
            /* Try to fake it out as a 16bit data segment.  This could
1230
0
             * cause confusion for the guest if it reads the selector,
1231
0
             * but otherwise we have to emulate if *any* segment hasn't
1232
0
             * been reloaded. */
1233
0
            if ( base < 0x100000 && !(base & 0xf) && limit >= 0xffff && reg->p )
1234
0
            {
1235
0
                sel = base >> 4;
1236
0
                attr = vm86_ds_attr;
1237
0
                limit = 0xffff;
1238
0
                v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
1239
0
            }
1240
0
            else 
1241
0
                v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
1242
0
        }
1243
0
    }
1244
115
1245
115
    /*
1246
115
     * Unfold Xen representation into VT-x representation.  The unusable bit
1247
115
     * is unconditionally set to the inverse of present.
1248
115
     */
1249
115
    attr = (!(attr & (1u << 7)) << 16) | ((attr & 0xf00) << 4) | (attr & 0xff);
1250
115
1251
115
    vmx_vmcs_enter(v);
1252
115
1253
115
    switch ( seg )
1254
115
    {
1255
70
    case x86_seg_es ... x86_seg_gs:
1256
70
        __vmwrite(GUEST_SEG_SELECTOR(seg), sel);
1257
70
        __vmwrite(GUEST_SEG_LIMIT(seg),    limit);
1258
70
        __vmwrite(GUEST_SEG_BASE(seg),     base);
1259
70
        __vmwrite(GUEST_SEG_AR_BYTES(seg), attr);
1260
70
        break;
1261
12
    case x86_seg_tr:
1262
12
        __vmwrite(GUEST_TR_SELECTOR, sel);
1263
12
        __vmwrite(GUEST_TR_LIMIT, limit);
1264
12
        __vmwrite(GUEST_TR_BASE, base);
1265
12
        __vmwrite(GUEST_TR_AR_BYTES, attr);
1266
12
        break;
1267
11
    case x86_seg_gdtr:
1268
11
        __vmwrite(GUEST_GDTR_LIMIT, limit);
1269
11
        __vmwrite(GUEST_GDTR_BASE, base);
1270
11
        break;
1271
11
    case x86_seg_idtr:
1272
11
        __vmwrite(GUEST_IDTR_LIMIT, limit);
1273
11
        __vmwrite(GUEST_IDTR_BASE, base);
1274
11
        break;
1275
11
    case x86_seg_ldtr:
1276
11
        __vmwrite(GUEST_LDTR_SELECTOR, sel);
1277
11
        __vmwrite(GUEST_LDTR_LIMIT, limit);
1278
11
        __vmwrite(GUEST_LDTR_BASE, base);
1279
11
        __vmwrite(GUEST_LDTR_AR_BYTES, attr);
1280
11
        break;
1281
0
    default:
1282
0
        BUG();
1283
115
    }
1284
115
1285
115
    vmx_vmcs_exit(v);
1286
115
}
1287
1288
static unsigned long vmx_get_shadow_gs_base(struct vcpu *v)
1289
0
{
1290
0
    return v->arch.hvm_vmx.shadow_gs;
1291
0
}
1292
1293
static int vmx_set_guest_pat(struct vcpu *v, u64 gpat)
1294
56
{
1295
56
    if ( !paging_mode_hap(v->domain) ||
1296
56
         unlikely(v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) )
1297
12
        return 0;
1298
56
1299
44
    vmx_vmcs_enter(v);
1300
44
    __vmwrite(GUEST_PAT, gpat);
1301
44
    vmx_vmcs_exit(v);
1302
44
    return 1;
1303
56
}
1304
1305
static int vmx_get_guest_pat(struct vcpu *v, u64 *gpat)
1306
22
{
1307
22
    if ( !paging_mode_hap(v->domain) ||
1308
22
         unlikely(v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) )
1309
0
        return 0;
1310
22
1311
22
    vmx_vmcs_enter(v);
1312
22
    __vmread(GUEST_PAT, gpat);
1313
22
    vmx_vmcs_exit(v);
1314
22
    return 1;
1315
22
}
1316
1317
static bool vmx_set_guest_bndcfgs(struct vcpu *v, u64 val)
1318
0
{
1319
0
    ASSERT(cpu_has_mpx && cpu_has_vmx_mpx);
1320
0
1321
0
    vmx_vmcs_enter(v);
1322
0
    __vmwrite(GUEST_BNDCFGS, val);
1323
0
    vmx_vmcs_exit(v);
1324
0
1325
0
    return true;
1326
0
}
1327
1328
static bool vmx_get_guest_bndcfgs(struct vcpu *v, u64 *val)
1329
0
{
1330
0
    ASSERT(cpu_has_mpx && cpu_has_vmx_mpx);
1331
0
1332
0
    vmx_vmcs_enter(v);
1333
0
    __vmread(GUEST_BNDCFGS, val);
1334
0
    vmx_vmcs_exit(v);
1335
0
1336
0
    return true;
1337
0
}
1338
1339
static void vmx_handle_cd(struct vcpu *v, unsigned long value)
1340
45
{
1341
45
    if ( !paging_mode_hap(v->domain) )
1342
0
    {
1343
0
        /*
1344
0
         * For shadow, 'load IA32_PAT' VM-entry control is 0, so it cannot
1345
0
         * set guest memory type as UC via IA32_PAT. Xen drop all shadows
1346
0
         * so that any new ones would be created on demand.
1347
0
         */
1348
0
        hvm_shadow_handle_cd(v, value);
1349
0
    }
1350
45
    else
1351
45
    {
1352
45
        u64 *pat = &v->arch.hvm_vcpu.pat_cr;
1353
45
1354
45
        if ( value & X86_CR0_CD )
1355
22
        {
1356
22
            /*
1357
22
             * For EPT, set guest IA32_PAT fields as UC so that guest
1358
22
             * memory types are all UC.
1359
22
             */
1360
22
            u64 uc_pat =
1361
22
                ((uint64_t)PAT_TYPE_UNCACHABLE)       |       /* PAT0 */
1362
22
                ((uint64_t)PAT_TYPE_UNCACHABLE << 8)  |       /* PAT1 */
1363
22
                ((uint64_t)PAT_TYPE_UNCACHABLE << 16) |       /* PAT2 */
1364
22
                ((uint64_t)PAT_TYPE_UNCACHABLE << 24) |       /* PAT3 */
1365
22
                ((uint64_t)PAT_TYPE_UNCACHABLE << 32) |       /* PAT4 */
1366
22
                ((uint64_t)PAT_TYPE_UNCACHABLE << 40) |       /* PAT5 */
1367
22
                ((uint64_t)PAT_TYPE_UNCACHABLE << 48) |       /* PAT6 */
1368
22
                ((uint64_t)PAT_TYPE_UNCACHABLE << 56);        /* PAT7 */
1369
22
1370
22
            vmx_get_guest_pat(v, pat);
1371
22
            vmx_set_guest_pat(v, uc_pat);
1372
22
            vmx_set_msr_intercept(v, MSR_IA32_CR_PAT, VMX_MSR_RW);
1373
22
1374
22
            wbinvd();               /* flush possibly polluted cache */
1375
22
            hvm_asid_flush_vcpu(v); /* invalidate memory type cached in TLB */
1376
22
            v->arch.hvm_vcpu.cache_mode = NO_FILL_CACHE_MODE;
1377
22
        }
1378
45
        else
1379
23
        {
1380
23
            v->arch.hvm_vcpu.cache_mode = NORMAL_CACHE_MODE;
1381
23
            vmx_set_guest_pat(v, *pat);
1382
23
            if ( !iommu_enabled || iommu_snoop )
1383
22
                vmx_clear_msr_intercept(v, MSR_IA32_CR_PAT, VMX_MSR_RW);
1384
23
            hvm_asid_flush_vcpu(v); /* no need to flush cache */
1385
23
        }
1386
45
    }
1387
45
}
1388
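One observation on the uc_pat value built above (assuming the architectural PAT encoding, where UC is 0):

    uc_pat == 0   /* PAT_TYPE_UNCACHABLE == 0 in every byte, so every PAT entry is UC */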
1389
static void vmx_setup_tsc_scaling(struct vcpu *v)
1390
11
{
1391
11
    if ( !hvm_tsc_scaling_supported || v->domain->arch.vtsc )
1392
11
        return;
1393
11
1394
0
    vmx_vmcs_enter(v);
1395
0
    __vmwrite(TSC_MULTIPLIER, hvm_tsc_scaling_ratio(v->domain));
1396
0
    vmx_vmcs_exit(v);
1397
0
}
1398
1399
static void vmx_set_tsc_offset(struct vcpu *v, u64 offset, u64 at_tsc)
1400
13
{
1401
13
    vmx_vmcs_enter(v);
1402
13
1403
13
    if ( nestedhvm_vcpu_in_guestmode(v) )
1404
0
        offset += nvmx_get_tsc_offset(v);
1405
13
1406
13
    __vmwrite(TSC_OFFSET, offset);
1407
13
    vmx_vmcs_exit(v);
1408
13
}
1409
1410
static void vmx_set_rdtsc_exiting(struct vcpu *v, bool_t enable)
1411
0
{
1412
0
    vmx_vmcs_enter(v);
1413
0
    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_RDTSC_EXITING;
1414
0
    if ( enable )
1415
0
        v->arch.hvm_vmx.exec_control |= CPU_BASED_RDTSC_EXITING;
1416
0
    vmx_update_cpu_exec_control(v);
1417
0
    vmx_vmcs_exit(v);
1418
0
}
1419
1420
static void vmx_set_descriptor_access_exiting(struct vcpu *v, bool enable)
1421
0
{
1422
0
    if ( enable )
1423
0
        v->arch.hvm_vmx.secondary_exec_control |=
1424
0
            SECONDARY_EXEC_DESCRIPTOR_TABLE_EXITING;
1425
0
    else
1426
0
        v->arch.hvm_vmx.secondary_exec_control &=
1427
0
            ~SECONDARY_EXEC_DESCRIPTOR_TABLE_EXITING;
1428
0
1429
0
    vmx_vmcs_enter(v);
1430
0
    vmx_update_secondary_exec_control(v);
1431
0
    vmx_vmcs_exit(v);
1432
0
}
1433
1434
static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
1435
2
{
1436
2
    char *p;
1437
2
    int i;
1438
2
1439
258
    for ( i = 0; i < (PAGE_SIZE / 32); i++ )
1440
256
    {
1441
256
        if ( i == __HYPERVISOR_iret )
1442
2
            continue;
1443
256
1444
254
        p = (char *)(hypercall_page + (i * 32));
1445
254
        *(u8  *)(p + 0) = 0xb8; /* mov imm32, %eax */
1446
254
        *(u32 *)(p + 1) = i;
1447
254
        *(u8  *)(p + 5) = 0x0f; /* vmcall */
1448
254
        *(u8  *)(p + 6) = 0x01;
1449
254
        *(u8  *)(p + 7) = 0xc1;
1450
254
        *(u8  *)(p + 8) = 0xc3; /* ret */
1451
254
    }
1452
2
1453
2
    /* Don't support HYPERVISOR_iret at the moment */
1454
2
    *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
1455
2
}
1456
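To illustrate how a guest consumes the page built above (a hedged sketch; the helper below is hypothetical and assumes the guest has the page mapped at hypercall_page): slot i contains the bytes b8 <i> 0f 01 c1 c3, i.e. "mov $i, %eax; vmcall; ret", so calling into offset i * 32 traps to Xen with the hypercall number in %eax.

    /* Hypothetical guest-side helper: invoke hypercall i with no arguments. */
    static long sketch_hypercall0(void *hypercall_page, unsigned int i)
    {
        long (*fn)(void) = (long (*)(void))((char *)hypercall_page + i * 32);

        return fn();
    }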
1457
static unsigned int vmx_get_interrupt_shadow(struct vcpu *v)
1458
205k
{
1459
205k
    unsigned long intr_shadow;
1460
205k
1461
205k
    __vmread(GUEST_INTERRUPTIBILITY_INFO, &intr_shadow);
1462
205k
1463
205k
    return intr_shadow;
1464
205k
}
1465
1466
static void vmx_set_interrupt_shadow(struct vcpu *v, unsigned int intr_shadow)
1467
0
{
1468
0
    __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow);
1469
0
}
1470
1471
static void vmx_load_pdptrs(struct vcpu *v)
1472
110
{
1473
110
    unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3];
1474
110
    uint64_t *guest_pdptes;
1475
110
    struct page_info *page;
1476
110
    p2m_type_t p2mt;
1477
110
    char *p;
1478
110
1479
110
    /* EPT needs to load PDPTRS into VMCS for PAE. */
1480
110
    if ( !hvm_pae_enabled(v) || (v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
1481
110
        return;
1482
110
1483
0
    if ( (cr3 & 0x1fUL) && !hvm_pcid_enabled(v) )
1484
0
        goto crash;
1485
0
1486
0
    page = get_page_from_gfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt, P2M_UNSHARE);
1487
0
    if ( !page )
1488
0
    {
1489
0
        /* Ideally you don't want to crash but rather go into a wait 
1490
0
         * queue, but this is the wrong place. We're holding at least
1491
0
         * the paging lock */
1492
0
        gdprintk(XENLOG_ERR,
1493
0
                 "Bad cr3 on load pdptrs gfn %lx type %d\n",
1494
0
                 cr3 >> PAGE_SHIFT, (int) p2mt);
1495
0
        goto crash;
1496
0
    }
1497
0
1498
0
    p = __map_domain_page(page);
1499
0
1500
0
    guest_pdptes = (uint64_t *)(p + (cr3 & ~PAGE_MASK));
1501
0
1502
0
    /*
1503
0
     * We do not check the PDPTRs for validity. The CPU will do this during
1504
0
     * vm entry, and we can handle the failure there and crash the guest.
1505
0
     * The only thing we could do better here is #GP instead.
1506
0
     */
1507
0
1508
0
    vmx_vmcs_enter(v);
1509
0
1510
0
    __vmwrite(GUEST_PDPTE(0), guest_pdptes[0]);
1511
0
    __vmwrite(GUEST_PDPTE(1), guest_pdptes[1]);
1512
0
    __vmwrite(GUEST_PDPTE(2), guest_pdptes[2]);
1513
0
    __vmwrite(GUEST_PDPTE(3), guest_pdptes[3]);
1514
0
1515
0
    vmx_vmcs_exit(v);
1516
0
1517
0
    unmap_domain_page(p);
1518
0
    put_page(page);
1519
0
    return;
1520
0
1521
0
 crash:
1522
0
    domain_crash(v->domain);
1523
0
}
1524
1525
static void vmx_update_host_cr3(struct vcpu *v)
1526
12
{
1527
12
    vmx_vmcs_enter(v);
1528
12
    __vmwrite(HOST_CR3, v->arch.cr3);
1529
12
    vmx_vmcs_exit(v);
1530
12
}
1531
1532
void vmx_update_debug_state(struct vcpu *v)
1533
0
{
1534
0
    if ( v->arch.hvm_vcpu.debug_state_latch )
1535
0
        v->arch.hvm_vmx.exception_bitmap |= 1U << TRAP_int3;
1536
0
    else
1537
0
        v->arch.hvm_vmx.exception_bitmap &= ~(1U << TRAP_int3);
1538
0
1539
0
    vmx_vmcs_enter(v);
1540
0
    vmx_update_exception_bitmap(v);
1541
0
    vmx_vmcs_exit(v);
1542
0
}
1543
1544
static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
1545
7.93k
{
1546
7.93k
    vmx_vmcs_enter(v);
1547
7.93k
1548
7.93k
    switch ( cr )
1549
7.93k
    {
1550
7.69k
    case 0:
1551
7.69k
    {
1552
7.69k
        bool realmode;
1553
7.69k
        unsigned long hw_cr0_mask = X86_CR0_NE;
1554
7.69k
1555
7.69k
        if ( !vmx_unrestricted_guest(v) )
1556
0
            hw_cr0_mask |= X86_CR0_PG | X86_CR0_PE;
1557
7.69k
1558
7.69k
        if ( paging_mode_shadow(v->domain) )
1559
0
            hw_cr0_mask |= X86_CR0_WP;
1560
7.69k
1561
7.69k
        if ( paging_mode_hap(v->domain) )
1562
7.69k
        {
1563
7.69k
            /* Manage GUEST_CR3 when CR0.PE=0. */
1564
7.69k
            uint32_t old_ctls = v->arch.hvm_vmx.exec_control;
1565
7.69k
            uint32_t cr3_ctls = (CPU_BASED_CR3_LOAD_EXITING |
1566
7.69k
                                 CPU_BASED_CR3_STORE_EXITING);
1567
7.69k
1568
7.69k
            v->arch.hvm_vmx.exec_control &= ~cr3_ctls;
1569
7.69k
            if ( !hvm_paging_enabled(v) && !vmx_unrestricted_guest(v) )
1570
0
                v->arch.hvm_vmx.exec_control |= cr3_ctls;
1571
7.69k
1572
7.69k
            /* Trap CR3 updates if CR3 memory events are enabled. */
1573
7.69k
            if ( v->domain->arch.monitor.write_ctrlreg_enabled &
1574
7.69k
                 monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3) )
1575
0
                v->arch.hvm_vmx.exec_control |= CPU_BASED_CR3_LOAD_EXITING;
1576
7.69k
1577
7.69k
            if ( old_ctls != v->arch.hvm_vmx.exec_control )
1578
0
                vmx_update_cpu_exec_control(v);
1579
7.69k
        }
1580
7.69k
1581
7.69k
        if ( !nestedhvm_vcpu_in_guestmode(v) )
1582
7.69k
            __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
1583
7.69k
        else
1584
3
            nvmx_set_cr_read_shadow(v, 0);
1585
7.69k
1586
7.69k
        if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1587
3.86k
        {
1588
3.86k
            if ( v != current )
1589
24
                hw_cr0_mask |= X86_CR0_TS;
1590
3.84k
            else if ( v->arch.hvm_vcpu.hw_cr[0] & X86_CR0_TS )
1591
3.81k
                vmx_fpu_enter(v);
1592
3.86k
        }
1593
7.69k
1594
7.69k
        realmode = !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE);
1595
7.69k
1596
7.69k
        if ( !vmx_unrestricted_guest(v) &&
1597
0
             (realmode != v->arch.hvm_vmx.vmx_realmode) )
1598
0
        {
1599
0
            enum x86_segment s;
1600
0
            struct segment_register reg[x86_seg_tr + 1];
1601
0
1602
0
            BUILD_BUG_ON(x86_seg_tr != x86_seg_gs + 1);
1603
0
1604
0
            /* Entering or leaving real mode: adjust the segment registers.
1605
0
             * Need to read them all either way, as realmode reads can update
1606
0
             * the saved values we'll use when returning to prot mode. */
1607
0
            for ( s = 0; s < ARRAY_SIZE(reg); s++ )
1608
0
                hvm_get_segment_register(v, s, &reg[s]);
1609
0
            v->arch.hvm_vmx.vmx_realmode = realmode;
1610
0
1611
0
            if ( realmode )
1612
0
            {
1613
0
                for ( s = 0; s < ARRAY_SIZE(reg); s++ )
1614
0
                    hvm_set_segment_register(v, s, &reg[s]);
1615
0
            }
1616
0
            else
1617
0
            {
1618
0
                for ( s = 0; s < ARRAY_SIZE(reg); s++ )
1619
0
                    if ( !(v->arch.hvm_vmx.vm86_segment_mask & (1<<s)) )
1620
0
                        hvm_set_segment_register(
1621
0
                            v, s, &v->arch.hvm_vmx.vm86_saved_seg[s]);
1622
0
            }
1623
0
1624
0
            vmx_update_exception_bitmap(v);
1625
0
        }
1626
7.69k
1627
7.69k
        v->arch.hvm_vcpu.hw_cr[0] =
1628
7.69k
            v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask;
1629
7.69k
        __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1630
7.69k
    }
1631
7.69k
        /* Fallthrough: Changing CR0 can change some bits in real CR4. */
1632
7.81k
    case 4:
1633
7.81k
        v->arch.hvm_vcpu.hw_cr[4] = HVM_CR4_HOST_MASK;
1634
7.81k
        if ( paging_mode_hap(v->domain) )
1635
7.80k
            v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
1636
7.81k
1637
7.81k
        if ( !nestedhvm_vcpu_in_guestmode(v) )
1638
7.80k
            __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]);
1639
7.81k
        else
1640
8
            nvmx_set_cr_read_shadow(v, 4);
1641
7.81k
1642
7.81k
        v->arch.hvm_vcpu.hw_cr[4] |= v->arch.hvm_vcpu.guest_cr[4];
1643
7.81k
        if ( v->arch.hvm_vmx.vmx_realmode )
1644
0
            v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
1645
7.81k
1646
7.81k
        if ( !hvm_paging_enabled(v) )
1647
71
        {
1648
71
            /*
1649
71
             * When the guest thinks paging is disabled, Xen may need to hide
1650
71
             * the effects of running with CR0.PG actually enabled.  There are
1651
71
             * two subtly complicated cases.
1652
71
             */
1653
71
1654
71
            if ( paging_mode_hap(v->domain) )
1655
71
            {
1656
71
                /*
1657
71
                 * On hardware lacking the Unrestricted Guest feature (or with
1658
71
                 * it disabled in the VMCS), we may not enter the guest with
1659
71
                 * CR0.PG actually disabled.  When EPT is enabled, we run with
1660
71
                 * guest paging settings, but with CR3 pointing at
1661
71
                 * HVM_PARAM_IDENT_PT which is a 32bit pagetable using 4M
1662
71
                 * superpages.  Override the guest's paging settings to match.
1663
71
                 */
1664
71
                v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE;
1665
71
                v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
1666
71
            }
1667
71
1668
71
            /*
1669
71
             * Without CR0.PG, all memory accesses are user mode, so
1670
71
             * _PAGE_USER must be set in the pagetables for guest userspace to
1671
71
             * function.  This in turn trips up guest supervisor mode if
1672
71
             * SMEP/SMAP are left active in context.  They wouldn't have any
1673
71
             * effect if paging was actually disabled, so hide them behind the
1674
71
             * back of the guest.
1675
71
             */
1676
71
            v->arch.hvm_vcpu.hw_cr[4] &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
1677
71
        }
1678
7.81k
1679
7.81k
        __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1680
7.81k
        break;
1681
7.69k
1682
11
    case 2:
1683
11
        /* CR2 is updated in exit stub. */
1684
11
        break;
1685
7.69k
1686
110
    case 3:
1687
110
        if ( paging_mode_hap(v->domain) )
1688
110
        {
1689
110
            if ( !hvm_paging_enabled(v) && !vmx_unrestricted_guest(v) )
1690
0
                v->arch.hvm_vcpu.hw_cr[3] =
1691
0
                    v->domain->arch.hvm_domain.params[HVM_PARAM_IDENT_PT];
1692
110
            vmx_load_pdptrs(v);
1693
110
        }
1694
110
1695
110
        __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]);
1696
110
        hvm_asid_flush_vcpu(v);
1697
110
        break;
1698
7.69k
1699
0
    default:
1700
0
        BUG();
1701
7.93k
    }
1702
7.93k
1703
7.93k
    vmx_vmcs_exit(v);
1704
7.93k
}
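A note on the case 0 arm above: the CR0 value actually written to GUEST_CR0 is the guest's CR0 plus a mask of bits Xen keeps forced on. The sketch below is not Xen code; it compresses those conditions into plain booleans and uses invented EX_ constants purely to show how the mask is composed.

#include <stdbool.h>
#include <stdio.h>

#define EX_CR0_PE (1ul << 0)
#define EX_CR0_TS (1ul << 3)
#define EX_CR0_NE (1ul << 5)
#define EX_CR0_WP (1ul << 16)
#define EX_CR0_PG (1ul << 31)

static unsigned long hw_cr0(unsigned long guest_cr0, bool unrestricted,
                            bool shadow_paging, bool fpu_not_loaded)
{
    unsigned long mask = EX_CR0_NE;        /* always forced on, as above */

    if ( !unrestricted )
        mask |= EX_CR0_PG | EX_CR0_PE;     /* can't run unpaged/real mode */
    if ( shadow_paging )
        mask |= EX_CR0_WP;                 /* protect the shadow pagetables */
    if ( fpu_not_loaded )
        mask |= EX_CR0_TS;                 /* trap first FPU use via #NM */

    return guest_cr0 | mask;
}

int main(void)
{
    unsigned long guest = EX_CR0_PE | EX_CR0_PG;

    printf("hw cr0 = %#lx\n", hw_cr0(guest, true, false, true));
    return 0;
}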
1705
1706
static void vmx_update_guest_efer(struct vcpu *v)
1707
60
{
1708
60
    unsigned long vm_entry_value;
1709
60
1710
60
    vmx_vmcs_enter(v);
1711
60
1712
60
    __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1713
60
    if ( v->arch.hvm_vcpu.guest_efer & EFER_LMA )
1714
36
        vm_entry_value |= VM_ENTRY_IA32E_MODE;
1715
60
    else
1716
24
        vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1717
60
    __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1718
60
1719
60
    vmx_vmcs_exit(v);
1720
60
1721
60
    if ( v == current )
1722
48
        write_efer((read_efer() & ~EFER_SCE) |
1723
48
                   (v->arch.hvm_vcpu.guest_efer & EFER_SCE));
1724
60
}
1725
1726
void nvmx_enqueue_n2_exceptions(struct vcpu *v, 
1727
            unsigned long intr_fields, int error_code, uint8_t source)
1728
0
{
1729
0
    struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1730
0
1731
0
    if ( !(nvmx->intr.intr_info & INTR_INFO_VALID_MASK) ) {
1732
0
        /* enqueue the exception till the VMCS switch back to L1 */
1733
0
        nvmx->intr.intr_info = intr_fields;
1734
0
        nvmx->intr.error_code = error_code;
1735
0
        nvmx->intr.source = source;
1736
0
        vcpu_nestedhvm(v).nv_vmexit_pending = 1;
1737
0
        return;
1738
0
    }
1739
0
    else
1740
0
        gdprintk(XENLOG_ERR, "Double Fault on Nested Guest: exception %lx %x"
1741
0
                 " on %lx %x\n", intr_fields, error_code,
1742
0
                 nvmx->intr.intr_info, nvmx->intr.error_code);
1743
0
}
1744
1745
static int nvmx_vmexit_event(struct vcpu *v, const struct x86_event *event)
1746
0
{
1747
0
    nvmx_enqueue_n2_exceptions(v, event->vector, event->error_code,
1748
0
                               hvm_intsrc_none);
1749
0
    return NESTEDHVM_VMEXIT_DONE;
1750
0
}
1751
1752
static void __vmx_inject_exception(int trap, int type, int error_code)
1753
97.7k
{
1754
97.7k
    unsigned long intr_fields;
1755
97.7k
    struct vcpu *curr = current;
1756
97.7k
1757
97.7k
    /*
1758
97.7k
     * NB. Callers do not need to worry about clearing STI/MOV-SS blocking:
1759
97.7k
     *  "If the VM entry is injecting, there is no blocking by STI or by
1760
97.7k
     *   MOV SS following the VM entry, regardless of the contents of the
1761
97.7k
     *   interruptibility-state field [in the guest-state area before the
1762
97.7k
     *   VM entry]", PRM Vol. 3, 22.6.1 (Interruptibility State).
1763
97.7k
     */
1764
97.7k
1765
97.7k
    intr_fields = INTR_INFO_VALID_MASK |
1766
97.7k
                  MASK_INSR(type, INTR_INFO_INTR_TYPE_MASK) |
1767
97.7k
                  MASK_INSR(trap, INTR_INFO_VECTOR_MASK);
1768
97.7k
    if ( error_code != X86_EVENT_NO_EC )
1769
0
    {
1770
0
        ASSERT(error_code == (uint16_t)error_code);
1771
0
        __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1772
0
        intr_fields |= INTR_INFO_DELIVER_CODE_MASK;
1773
0
    }
1774
97.7k
1775
97.7k
    __vmwrite(VM_ENTRY_INTR_INFO, intr_fields);
1776
97.7k
1777
97.7k
    /* Can't inject exceptions in virtual 8086 mode because they would 
1778
97.7k
     * use the protected-mode IDT.  Emulate at the next vmenter instead. */
1779
97.7k
    if ( curr->arch.hvm_vmx.vmx_realmode ) 
1780
0
        curr->arch.hvm_vmx.vmx_emulate = 1;
1781
97.7k
}
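The intr_fields value assembled in __vmx_inject_exception() above follows the VM-entry interruption-information layout from the Intel SDM: bits 7:0 vector, bits 10:8 event type, bit 11 "deliver error code", bit 31 valid. The standalone sketch below packs the same fields with explicit shifts; it is not Xen code, and the EX_ constants and pack_intr_info() are invented for the example.

#include <stdint.h>
#include <stdio.h>

#define EX_INTR_VECTOR_MASK  0x000000ffu
#define EX_INTR_TYPE_SHIFT   8
#define EX_INTR_DELIVER_EC   (1u << 11)
#define EX_INTR_VALID        (1u << 31)

static uint32_t pack_intr_info(uint8_t vector, uint8_t type, int has_ec)
{
    uint32_t info = EX_INTR_VALID |
                    ((uint32_t)type << EX_INTR_TYPE_SHIFT) |
                    (vector & EX_INTR_VECTOR_MASK);

    if ( has_ec )
        info |= EX_INTR_DELIVER_EC;

    return info;
}

int main(void)
{
    /* #PF (vector 14) injected as a hardware exception (type 3) with an error code. */
    printf("intr_info = %#x\n", pack_intr_info(14, 3, 1));   /* 0x80000b0e */
    return 0;
}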
1782
1783
void vmx_inject_extint(int trap, uint8_t source)
1784
97.8k
{
1785
97.8k
    struct vcpu *v = current;
1786
97.8k
    u32    pin_based_cntrl;
1787
97.8k
1788
97.8k
    if ( nestedhvm_vcpu_in_guestmode(v) ) {
1789
0
        pin_based_cntrl = get_vvmcs(v, PIN_BASED_VM_EXEC_CONTROL);
1790
0
        if ( pin_based_cntrl & PIN_BASED_EXT_INTR_MASK ) {
1791
0
            nvmx_enqueue_n2_exceptions (v, 
1792
0
               INTR_INFO_VALID_MASK |
1793
0
               MASK_INSR(X86_EVENTTYPE_EXT_INTR, INTR_INFO_INTR_TYPE_MASK) |
1794
0
               MASK_INSR(trap, INTR_INFO_VECTOR_MASK),
1795
0
               X86_EVENT_NO_EC, source);
1796
0
            return;
1797
0
        }
1798
0
    }
1799
97.8k
    __vmx_inject_exception(trap, X86_EVENTTYPE_EXT_INTR,
1800
97.8k
                           X86_EVENT_NO_EC);
1801
97.8k
}
1802
1803
void vmx_inject_nmi(void)
1804
0
{
1805
0
    struct vcpu *v = current;
1806
0
    u32    pin_based_cntrl;
1807
0
1808
0
    if ( nestedhvm_vcpu_in_guestmode(v) ) {
1809
0
        pin_based_cntrl = get_vvmcs(v, PIN_BASED_VM_EXEC_CONTROL);
1810
0
        if ( pin_based_cntrl & PIN_BASED_NMI_EXITING ) {
1811
0
            nvmx_enqueue_n2_exceptions (v, 
1812
0
               INTR_INFO_VALID_MASK |
1813
0
               MASK_INSR(X86_EVENTTYPE_NMI, INTR_INFO_INTR_TYPE_MASK) |
1814
0
               MASK_INSR(TRAP_nmi, INTR_INFO_VECTOR_MASK),
1815
0
               X86_EVENT_NO_EC, hvm_intsrc_nmi);
1816
0
            return;
1817
0
        }
1818
0
    }
1819
0
    __vmx_inject_exception(2, X86_EVENTTYPE_NMI,
1820
0
                           X86_EVENT_NO_EC);
1821
0
}
1822
1823
/*
1824
 * Generate a virtual event in the guest.
1825
 * NOTES:
1826
 *  - INT 3 (CC) and INTO (CE) are X86_EVENTTYPE_SW_EXCEPTION;
1827
 *  - INT nn (CD nn) is X86_EVENTTYPE_SW_INTERRUPT;
1828
 *  - #DB is X86_EVENTTYPE_HW_EXCEPTION, except when generated by
1829
 *    opcode 0xf1 (which is X86_EVENTTYPE_PRI_SW_EXCEPTION)
1830
 */
1831
static void vmx_inject_event(const struct x86_event *event)
1832
0
{
1833
0
    unsigned long intr_info;
1834
0
    struct vcpu *curr = current;
1835
0
    struct x86_event _event = *event;
1836
0
1837
0
    switch ( _event.vector | -(_event.type == X86_EVENTTYPE_SW_INTERRUPT) )
1838
0
    {
1839
0
    case TRAP_debug:
1840
0
        if ( guest_cpu_user_regs()->eflags & X86_EFLAGS_TF )
1841
0
        {
1842
0
            __restore_debug_registers(curr);
1843
0
            write_debugreg(6, read_debugreg(6) | DR_STEP);
1844
0
        }
1845
0
        if ( !nestedhvm_vcpu_in_guestmode(curr) ||
1846
0
             !nvmx_intercepts_exception(curr, TRAP_debug, _event.error_code) )
1847
0
        {
1848
0
            unsigned long val;
1849
0
1850
0
            __vmread(GUEST_DR7, &val);
1851
0
            __vmwrite(GUEST_DR7, val & ~DR_GENERAL_DETECT);
1852
0
            __vmread(GUEST_IA32_DEBUGCTL, &val);
1853
0
            __vmwrite(GUEST_IA32_DEBUGCTL, val & ~IA32_DEBUGCTLMSR_LBR);
1854
0
        }
1855
0
        if ( cpu_has_monitor_trap_flag )
1856
0
            break;
1857
0
        /* fall through */
1858
0
    case TRAP_int3:
1859
0
        if ( curr->domain->debugger_attached )
1860
0
        {
1861
0
            /* Debug/Int3: Trap to debugger. */
1862
0
            domain_pause_for_debugger();
1863
0
            return;
1864
0
        }
1865
0
        break;
1866
0
1867
0
    case TRAP_page_fault:
1868
0
        ASSERT(_event.type == X86_EVENTTYPE_HW_EXCEPTION);
1869
0
        curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2;
1870
0
        break;
1871
0
    }
1872
0
1873
0
    if ( nestedhvm_vcpu_in_guestmode(curr) )
1874
0
        intr_info = vcpu_2_nvmx(curr).intr.intr_info;
1875
0
    else
1876
0
        __vmread(VM_ENTRY_INTR_INFO, &intr_info);
1877
0
1878
0
    if ( unlikely(intr_info & INTR_INFO_VALID_MASK) &&
1879
0
         (MASK_EXTR(intr_info, INTR_INFO_INTR_TYPE_MASK) ==
1880
0
          X86_EVENTTYPE_HW_EXCEPTION) )
1881
0
    {
1882
0
        _event.vector = hvm_combine_hw_exceptions(
1883
0
            (uint8_t)intr_info, _event.vector);
1884
0
        if ( _event.vector == TRAP_double_fault )
1885
0
            _event.error_code = 0;
1886
0
    }
1887
0
1888
0
    if ( _event.type >= X86_EVENTTYPE_SW_INTERRUPT )
1889
0
        __vmwrite(VM_ENTRY_INSTRUCTION_LEN, _event.insn_len);
1890
0
1891
0
    if ( nestedhvm_vcpu_in_guestmode(curr) &&
1892
0
         nvmx_intercepts_exception(curr, _event.vector, _event.error_code) )
1893
0
    {
1894
0
        nvmx_enqueue_n2_exceptions (curr, 
1895
0
            INTR_INFO_VALID_MASK |
1896
0
            MASK_INSR(_event.type, INTR_INFO_INTR_TYPE_MASK) |
1897
0
            MASK_INSR(_event.vector, INTR_INFO_VECTOR_MASK),
1898
0
            _event.error_code, hvm_intsrc_none);
1899
0
        return;
1900
0
    }
1901
0
    else
1902
0
        __vmx_inject_exception(_event.vector, _event.type, _event.error_code);
1903
0
1904
0
    if ( (_event.vector == TRAP_page_fault) &&
1905
0
         (_event.type == X86_EVENTTYPE_HW_EXCEPTION) )
1906
0
        HVMTRACE_LONG_2D(PF_INJECT, _event.error_code,
1907
0
                         TRC_PAR_LONG(curr->arch.hvm_vcpu.guest_cr[2]));
1908
0
    else
1909
0
        HVMTRACE_2D(INJ_EXC, _event.vector, _event.error_code);
1910
0
}
1911
1912
static int vmx_event_pending(struct vcpu *v)
1913
65.3k
{
1914
65.3k
    unsigned long intr_info;
1915
65.3k
1916
65.3k
    ASSERT(v == current);
1917
65.3k
    __vmread(VM_ENTRY_INTR_INFO, &intr_info);
1918
65.3k
1919
65.3k
    return intr_info & INTR_INFO_VALID_MASK;
1920
65.3k
}
1921
1922
static void vmx_set_info_guest(struct vcpu *v)
1923
0
{
1924
0
    unsigned long intr_shadow;
1925
0
1926
0
    vmx_vmcs_enter(v);
1927
0
1928
0
    __vmwrite(GUEST_DR7, v->arch.debugreg[7]);
1929
0
1930
0
    /* 
1931
0
     * If the interruptibility-state field indicates blocking by STI,
1932
0
     * setting the TF flag in the EFLAGS may cause VM entry to fail
1933
0
     * and crash the guest. See SDM 3B 22.3.1.5.
1934
0
     * Resetting the VMX_INTR_SHADOW_STI flag looks hackish but
1935
0
     * setting GUEST_PENDING_DBG_EXCEPTIONS.BS here would incur an
1936
0
     * immediate vmexit and hence make no progress.
1937
0
     */
1938
0
    __vmread(GUEST_INTERRUPTIBILITY_INFO, &intr_shadow);
1939
0
    if ( v->domain->debugger_attached &&
1940
0
         (v->arch.user_regs.eflags & X86_EFLAGS_TF) &&
1941
0
         (intr_shadow & VMX_INTR_SHADOW_STI) )
1942
0
    {
1943
0
        intr_shadow &= ~VMX_INTR_SHADOW_STI;
1944
0
        __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow);
1945
0
    }
1946
0
1947
0
    vmx_vmcs_exit(v);
1948
0
}
1949
1950
static void vmx_update_eoi_exit_bitmap(struct vcpu *v, u8 vector, u8 trig)
1951
4.14k
{
1952
4.14k
    if ( trig )
1953
296
        vmx_set_eoi_exit_bitmap(v, vector);
1954
4.14k
    else
1955
3.84k
        vmx_clear_eoi_exit_bitmap(v, vector);
1956
4.14k
}
1957
1958
static int vmx_virtual_intr_delivery_enabled(void)
1959
12.6k
{
1960
12.6k
    return cpu_has_vmx_virtual_intr_delivery;
1961
12.6k
}
1962
1963
static void vmx_process_isr(int isr, struct vcpu *v)
1964
0
{
1965
0
    unsigned long status;
1966
0
    u8 old;
1967
0
    unsigned int i;
1968
0
    const struct vlapic *vlapic = vcpu_vlapic(v);
1969
0
1970
0
    if ( isr < 0 )
1971
0
        isr = 0;
1972
0
1973
0
    vmx_vmcs_enter(v);
1974
0
    __vmread(GUEST_INTR_STATUS, &status);
1975
0
    old = status >> VMX_GUEST_INTR_STATUS_SVI_OFFSET;
1976
0
    if ( isr != old )
1977
0
    {
1978
0
        status &= VMX_GUEST_INTR_STATUS_SUBFIELD_BITMASK;
1979
0
        status |= isr << VMX_GUEST_INTR_STATUS_SVI_OFFSET;
1980
0
        __vmwrite(GUEST_INTR_STATUS, status);
1981
0
    }
1982
0
1983
0
    /*
1984
0
     * Theoretically, only level triggered interrupts can have their
1985
0
     * corresponding bits set in the eoi exit bitmap. That is, the bits
1986
0
     * set in the eoi exit bitmap should also be set in TMR. But a periodic
1987
0
     * timer interrupt does not follow the rule: it is edge triggered, but
1988
0
     * requires its corresponding bit be set in the eoi exit bitmap. So we
1989
0
     * should not construct the eoi exit bitmap based on TMR.
1990
0
     * Here we will construct the eoi exit bitmap via (IRR | ISR). This
1991
0
     * means that EOIs to the interrupts that are set in the IRR or ISR will
1992
0
     * cause VM exits after restoring, regardless of the trigger modes. It
1993
0
     * is acceptable because the subsequent interrupts will set up the eoi
1994
0
     * bitmap correctly.
1995
0
     */
1996
0
    for ( i = 0x10; i < NR_VECTORS; ++i )
1997
0
        if ( vlapic_test_vector(i, &vlapic->regs->data[APIC_IRR]) ||
1998
0
             vlapic_test_vector(i, &vlapic->regs->data[APIC_ISR]) )
1999
0
            set_bit(i, v->arch.hvm_vmx.eoi_exit_bitmap);
2000
0
2001
0
    for ( i = 0; i < ARRAY_SIZE(v->arch.hvm_vmx.eoi_exit_bitmap); ++i )
2002
0
        __vmwrite(EOI_EXIT_BITMAP(i), v->arch.hvm_vmx.eoi_exit_bitmap[i]);
2003
0
2004
0
    vmx_vmcs_exit(v);
2005
0
}
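The SVI update in vmx_process_isr() above treats the 16-bit guest interrupt status as RVI in the low byte and SVI in the high byte. Below is a small self-contained sketch of that byte replacement; it is not Xen code, and the EX_ constants, set_svi() and the sample values are invented.

#include <stdint.h>
#include <stdio.h>

#define EX_SVI_SHIFT 8
#define EX_RVI_MASK  0x00ffu

static uint16_t set_svi(uint16_t status, uint8_t isr_vector)
{
    /* Keep RVI (low byte), replace SVI (high byte) with the new in-service vector. */
    return (status & EX_RVI_MASK) | ((uint16_t)isr_vector << EX_SVI_SHIFT);
}

int main(void)
{
    uint16_t status = 0x3020;        /* SVI = 0x30, RVI = 0x20 */

    status = set_svi(status, 0x41);
    printf("status = %#x (SVI = %#x, RVI = %#x)\n",
           status, status >> EX_SVI_SHIFT, status & EX_RVI_MASK);
    return 0;
}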
2006
2007
static void __vmx_deliver_posted_interrupt(struct vcpu *v)
2008
4.14k
{
2009
4.14k
    bool_t running = v->is_running;
2010
4.14k
2011
4.14k
    vcpu_unblock(v);
2012
4.14k
    /*
2013
4.14k
     * Just like vcpu_kick(), nothing is needed for the following two cases:
2014
4.14k
     * 1. The target vCPU is not running, meaning it is blocked or runnable.
2015
4.14k
     * 2. The target vCPU is the current vCPU and we're in non-interrupt
2016
4.14k
     * context.
2017
4.14k
     */
2018
4.14k
    if ( running && (in_irq() || (v != current)) )
2019
156
    {
2020
156
        /*
2021
156
         * Note: Only two cases will reach here:
2022
156
         * 1. The target vCPU is running on another pCPU.
2023
156
         * 2. The target vCPU is the current vCPU.
2024
156
         *
2025
156
         * Note2: Don't worry that v->processor may change. The vCPU being
2026
156
         * moved to another processor is guaranteed to sync PIR to vIRR,
2027
156
         * due to the involved scheduling cycle.
2028
156
         */
2029
156
        unsigned int cpu = v->processor;
2030
156
2031
156
        /*
2032
156
         * For case 1, we send an IPI to the pCPU. When an IPI arrives, the
2033
156
         * target vCPU may be running in non-root mode, running in root
2034
156
         * mode, runnable or blocked. If the target vCPU is running in
2035
156
         * non-root mode, the hardware will sync PIR to vIRR, since
2036
156
         * 'posted_intr_vector' is special to the pCPU. If the target vCPU is
2037
156
         * running in root-mode, the interrupt handler starts to run.
2038
156
         * Considering an IPI may arrive in the window between the call to
2039
156
         * vmx_intr_assist() and interrupts getting disabled, the interrupt
2040
156
         * handler should raise a softirq to ensure events will be delivered
2041
156
         * in time. If the target vCPU is runnable, it will sync PIR to
2042
156
         * vIRR the next time it is chosen to run. In this case, an IPI and a
2043
156
         * softirq are sent to the wrong vCPU, which will not have any adverse
2044
156
         * effect. If the target vCPU is blocked, since vcpu_block() checks
2045
156
         * whether there is an event to be delivered through
2046
156
         * local_events_need_delivery() just after blocking, the vCPU must
2047
156
         * have synced PIR to vIRR. Similarly, an IPI and a softirq are
2048
156
         * sent to the wrong vCPU.
2049
156
         */
2050
156
        if ( cpu != smp_processor_id() )
2051
156
            send_IPI_mask(cpumask_of(cpu), posted_intr_vector);
2052
156
        /*
2053
156
         * For case 2, raising a softirq ensures PIR will be synced to vIRR.
2054
156
         * As any softirq will do, as an optimization we only raise one if
2055
156
         * none is pending already.
2056
156
         */
2057
0
        else if ( !softirq_pending(cpu) )
2058
0
            raise_softirq(VCPU_KICK_SOFTIRQ);
2059
156
    }
2060
4.14k
}
2061
2062
static void vmx_deliver_posted_intr(struct vcpu *v, u8 vector)
2063
4.14k
{
2064
4.14k
    if ( pi_test_and_set_pir(vector, &v->arch.hvm_vmx.pi_desc) )
2065
0
        return;
2066
4.14k
2067
4.14k
    if ( unlikely(v->arch.hvm_vmx.eoi_exitmap_changed) )
2068
3
    {
2069
3
        /*
2070
3
         * If the EOI exit bitmap needs to be changed or the notification vector
2071
3
         * can't be allocated, the interrupt will not be injected until
2072
3
         * VM entry, as it used to be.
2073
3
         */
2074
3
        pi_set_on(&v->arch.hvm_vmx.pi_desc);
2075
3
    }
2076
4.14k
    else
2077
4.14k
    {
2078
4.14k
        struct pi_desc old, new, prev;
2079
4.14k
2080
4.14k
        prev.control = v->arch.hvm_vmx.pi_desc.control;
2081
4.14k
2082
4.14k
        do {
2083
4.14k
            /*
2084
4.14k
             * Currently, we don't support urgent interrupts; all
2085
4.14k
             * interrupts are recognized as non-urgent interrupts.
2086
4.14k
             * Besides that, if 'ON' is already set, there is no need to
2087
4.14k
             * send a posted-interrupt notification event either,
2088
4.14k
             * according to hardware behavior.
2089
4.14k
             */
2090
4.14k
            if ( pi_test_sn(&prev) || pi_test_on(&prev) )
2091
0
            {
2092
0
                vcpu_kick(v);
2093
0
                return;
2094
0
            }
2095
4.14k
2096
4.14k
            old.control = v->arch.hvm_vmx.pi_desc.control &
2097
4.14k
                          ~((1 << POSTED_INTR_ON) | (1 << POSTED_INTR_SN));
2098
4.14k
            new.control = v->arch.hvm_vmx.pi_desc.control |
2099
4.14k
                          (1 << POSTED_INTR_ON);
2100
4.14k
2101
4.14k
            prev.control = cmpxchg(&v->arch.hvm_vmx.pi_desc.control,
2102
4.14k
                                   old.control, new.control);
2103
4.14k
        } while ( prev.control != old.control );
2104
4.14k
2105
4.14k
        __vmx_deliver_posted_interrupt(v);
2106
4.14k
        return;
2107
4.14k
    }
2108
4.14k
2109
3
    vcpu_kick(v);
2110
3
}
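The cmpxchg loop in vmx_deliver_posted_intr() above sets the descriptor's ON bit atomically so that only one path ends up sending the notification. The sketch below is a simplified, self-contained version of that pattern using C11 atomics; the bit positions, pi_control and try_set_on() are invented and do not reflect the real descriptor layout, and unlike the real code it does not kick the vCPU when ON or SN is already set.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define EX_PI_ON (1u << 0)   /* example bit positions, not the real layout */
#define EX_PI_SN (1u << 1)

static _Atomic unsigned int pi_control;

/* Returns true if this caller set ON and therefore should send the notification. */
static bool try_set_on(void)
{
    unsigned int old = atomic_load(&pi_control);

    do {
        if ( old & (EX_PI_ON | EX_PI_SN) )
            return false;    /* already notified or suppressed */
    } while ( !atomic_compare_exchange_weak(&pi_control, &old, old | EX_PI_ON) );

    return true;
}

int main(void)
{
    printf("first attempt sends notification: %d\n", try_set_on());
    printf("second attempt sends notification: %d\n", try_set_on());
    return 0;
}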
2111
2112
static void vmx_sync_pir_to_irr(struct vcpu *v)
2113
7.09M
{
2114
7.09M
    struct vlapic *vlapic = vcpu_vlapic(v);
2115
7.09M
    unsigned int group, i;
2116
7.09M
    DECLARE_BITMAP(pending_intr, NR_VECTORS);
2117
7.09M
2118
7.09M
    if ( !pi_test_and_clear_on(&v->arch.hvm_vmx.pi_desc) )
2119
7.43M
        return;
2120
7.09M
2121
18.4E
    for ( group = 0; group < ARRAY_SIZE(pending_intr); group++ )
2122
16.2k
        pending_intr[group] = pi_get_pir(&v->arch.hvm_vmx.pi_desc, group);
2123
18.4E
2124
18.4E
    for_each_set_bit(i, pending_intr, NR_VECTORS)
2125
18.4E
        vlapic_set_vector(i, &vlapic->regs->data[APIC_IRR]);
2126
18.4E
}
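vmx_sync_pir_to_irr() above snapshots each word of the posted-interrupt request bitmap and then marks every set vector pending in the vLAPIC IRR. The standalone sketch below mirrors that two-step pattern; it is not Xen code, and the names, array sizes and the use of the __atomic_exchange_n() builtin are assumptions made for the example.

#include <stdint.h>
#include <stdio.h>

#define EX_NR_VECTORS 256
#define EX_WORDS      (EX_NR_VECTORS / 64)

static uint64_t pir[EX_WORDS];   /* stands in for the PI descriptor's PIR */
static uint64_t irr[EX_WORDS];   /* stands in for the vLAPIC IRR */

static void sync_pir_to_irr(void)
{
    uint64_t pending[EX_WORDS];

    /* Atomically snapshot and clear each word of the request bitmap. */
    for ( unsigned int g = 0; g < EX_WORDS; g++ )
        pending[g] = __atomic_exchange_n(&pir[g], 0, __ATOMIC_ACQ_REL);

    /* Mark every posted vector as pending in the IRR. */
    for ( unsigned int v = 0; v < EX_NR_VECTORS; v++ )
        if ( pending[v / 64] & (1ULL << (v % 64)) )
            irr[v / 64] |= 1ULL << (v % 64);
}

int main(void)
{
    pir[0] = 1ULL << 0x20;       /* pretend vector 0x20 was posted */
    sync_pir_to_irr();
    printf("vector 0x20 pending in IRR: %d\n", !!(irr[0] & (1ULL << 0x20)));
    return 0;
}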
2127
2128
static void vmx_handle_eoi(u8 vector)
2129
0
{
2130
0
    unsigned long status;
2131
0
2132
0
    /* We need to clear the SVI field. */
2133
0
    __vmread(GUEST_INTR_STATUS, &status);
2134
0
    status &= VMX_GUEST_INTR_STATUS_SUBFIELD_BITMASK;
2135
0
    __vmwrite(GUEST_INTR_STATUS, status);
2136
0
}
2137
2138
static void vmx_enable_msr_interception(struct domain *d, uint32_t msr)
2139
0
{
2140
0
    struct vcpu *v;
2141
0
2142
0
    for_each_vcpu ( d, v )
2143
0
        vmx_set_msr_intercept(v, msr, VMX_MSR_W);
2144
0
}
2145
2146
static bool_t vmx_is_singlestep_supported(void)
2147
0
{
2148
0
    return !!cpu_has_monitor_trap_flag;
2149
0
}
2150
2151
static void vmx_vcpu_update_eptp(struct vcpu *v)
2152
0
{
2153
0
    struct domain *d = v->domain;
2154
0
    struct p2m_domain *p2m = NULL;
2155
0
    struct ept_data *ept;
2156
0
2157
0
    if ( altp2m_active(d) )
2158
0
        p2m = p2m_get_altp2m(v);
2159
0
    if ( !p2m )
2160
0
        p2m = p2m_get_hostp2m(d);
2161
0
2162
0
    ept = &p2m->ept;
2163
0
    ept->mfn = pagetable_get_pfn(p2m_get_pagetable(p2m));
2164
0
2165
0
    vmx_vmcs_enter(v);
2166
0
2167
0
    __vmwrite(EPT_POINTER, ept->eptp);
2168
0
2169
0
    if ( v->arch.hvm_vmx.secondary_exec_control &
2170
0
         SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS )
2171
0
        __vmwrite(EPTP_INDEX, vcpu_altp2m(v).p2midx);
2172
0
2173
0
    vmx_vmcs_exit(v);
2174
0
}
2175
2176
static void vmx_vcpu_update_vmfunc_ve(struct vcpu *v)
2177
0
{
2178
0
    struct domain *d = v->domain;
2179
0
    u32 mask = SECONDARY_EXEC_ENABLE_VM_FUNCTIONS;
2180
0
2181
0
    if ( !cpu_has_vmx_vmfunc )
2182
0
        return;
2183
0
2184
0
    if ( cpu_has_vmx_virt_exceptions )
2185
0
        mask |= SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS;
2186
0
2187
0
    vmx_vmcs_enter(v);
2188
0
2189
0
    if ( !d->is_dying && altp2m_active(d) )
2190
0
    {
2191
0
        v->arch.hvm_vmx.secondary_exec_control |= mask;
2192
0
        __vmwrite(VM_FUNCTION_CONTROL, VMX_VMFUNC_EPTP_SWITCHING);
2193
0
        __vmwrite(EPTP_LIST_ADDR, virt_to_maddr(d->arch.altp2m_eptp));
2194
0
2195
0
        if ( cpu_has_vmx_virt_exceptions )
2196
0
        {
2197
0
            p2m_type_t t;
2198
0
            mfn_t mfn;
2199
0
2200
0
            mfn = get_gfn_query_unlocked(d, gfn_x(vcpu_altp2m(v).veinfo_gfn), &t);
2201
0
2202
0
            if ( !mfn_eq(mfn, INVALID_MFN) )
2203
0
                __vmwrite(VIRT_EXCEPTION_INFO, mfn_x(mfn) << PAGE_SHIFT);
2204
0
            else
2205
0
                v->arch.hvm_vmx.secondary_exec_control &=
2206
0
                    ~SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS;
2207
0
        }
2208
0
    }
2209
0
    else
2210
0
        v->arch.hvm_vmx.secondary_exec_control &= ~mask;
2211
0
2212
0
    vmx_update_secondary_exec_control(v);
2213
0
    vmx_vmcs_exit(v);
2214
0
}
2215
2216
static int vmx_vcpu_emulate_vmfunc(const struct cpu_user_regs *regs)
2217
0
{
2218
0
    int rc = X86EMUL_EXCEPTION;
2219
0
    struct vcpu *curr = current;
2220
0
2221
0
    if ( !cpu_has_vmx_vmfunc && altp2m_active(curr->domain) &&
2222
0
         regs->eax == 0 &&
2223
0
         p2m_switch_vcpu_altp2m_by_id(curr, regs->ecx) )
2224
0
        rc = X86EMUL_OKAY;
2225
0
2226
0
    return rc;
2227
0
}
2228
2229
static bool_t vmx_vcpu_emulate_ve(struct vcpu *v)
2230
0
{
2231
0
    bool_t rc = 0, writable;
2232
0
    gfn_t gfn = vcpu_altp2m(v).veinfo_gfn;
2233
0
    ve_info_t *veinfo;
2234
0
2235
0
    if ( gfn_eq(gfn, INVALID_GFN) )
2236
0
        return 0;
2237
0
2238
0
    veinfo = hvm_map_guest_frame_rw(gfn_x(gfn), 0, &writable);
2239
0
    if ( !veinfo )
2240
0
        return 0;
2241
0
    if ( !writable || veinfo->semaphore != 0 )
2242
0
        goto out;
2243
0
2244
0
    rc = 1;
2245
0
2246
0
    veinfo->exit_reason = EXIT_REASON_EPT_VIOLATION;
2247
0
    veinfo->semaphore = ~0;
2248
0
    veinfo->eptp_index = vcpu_altp2m(v).p2midx;
2249
0
2250
0
    vmx_vmcs_enter(v);
2251
0
    __vmread(EXIT_QUALIFICATION, &veinfo->exit_qualification);
2252
0
    __vmread(GUEST_LINEAR_ADDRESS, &veinfo->gla);
2253
0
    __vmread(GUEST_PHYSICAL_ADDRESS, &veinfo->gpa);
2254
0
    vmx_vmcs_exit(v);
2255
0
2256
0
    hvm_inject_hw_exception(TRAP_virtualisation,
2257
0
                            X86_EVENT_NO_EC);
2258
0
2259
0
 out:
2260
0
    hvm_unmap_guest_frame(veinfo, 0);
2261
0
    return rc;
2262
0
}
2263
2264
static int vmx_set_mode(struct vcpu *v, int mode)
2265
0
{
2266
0
    unsigned long attr;
2267
0
2268
0
    ASSERT((mode == 4) || (mode == 8));
2269
0
2270
0
    attr = (mode == 4) ? 0xc09b : 0xa09b;
2271
0
2272
0
    vmx_vmcs_enter(v);
2273
0
    __vmwrite(GUEST_CS_AR_BYTES, attr);
2274
0
    vmx_vmcs_exit(v);
2275
0
2276
0
    return 0;
2277
0
}
2278
2279
static bool vmx_get_pending_event(struct vcpu *v, struct x86_event *info)
2280
0
{
2281
0
    unsigned long intr_info, error_code;
2282
0
2283
0
    vmx_vmcs_enter(v);
2284
0
    __vmread(VM_ENTRY_INTR_INFO, &intr_info);
2285
0
    __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE, &error_code);
2286
0
    vmx_vmcs_exit(v);
2287
0
2288
0
    if ( !(intr_info & INTR_INFO_VALID_MASK) )
2289
0
        return false;
2290
0
2291
0
    info->vector = MASK_EXTR(intr_info, INTR_INFO_VECTOR_MASK);
2292
0
    info->type = MASK_EXTR(intr_info, INTR_INFO_INTR_TYPE_MASK);
2293
0
    info->error_code = error_code;
2294
0
2295
0
    return true;
2296
0
}
2297
2298
static struct hvm_function_table __initdata vmx_function_table = {
2299
    .name                 = "VMX",
2300
    .cpu_up_prepare       = vmx_cpu_up_prepare,
2301
    .cpu_dead             = vmx_cpu_dead,
2302
    .domain_initialise    = vmx_domain_initialise,
2303
    .domain_destroy       = vmx_domain_destroy,
2304
    .vcpu_initialise      = vmx_vcpu_initialise,
2305
    .vcpu_destroy         = vmx_vcpu_destroy,
2306
    .save_cpu_ctxt        = vmx_save_vmcs_ctxt,
2307
    .load_cpu_ctxt        = vmx_load_vmcs_ctxt,
2308
    .init_msr             = vmx_init_msr,
2309
    .save_msr             = vmx_save_msr,
2310
    .load_msr             = vmx_load_msr,
2311
    .get_interrupt_shadow = vmx_get_interrupt_shadow,
2312
    .set_interrupt_shadow = vmx_set_interrupt_shadow,
2313
    .guest_x86_mode       = vmx_guest_x86_mode,
2314
    .get_cpl              = _vmx_get_cpl,
2315
    .get_segment_register = vmx_get_segment_register,
2316
    .set_segment_register = vmx_set_segment_register,
2317
    .get_shadow_gs_base   = vmx_get_shadow_gs_base,
2318
    .update_host_cr3      = vmx_update_host_cr3,
2319
    .update_guest_cr      = vmx_update_guest_cr,
2320
    .update_guest_efer    = vmx_update_guest_efer,
2321
    .update_guest_vendor  = vmx_update_guest_vendor,
2322
    .fpu_leave            = vmx_fpu_leave,
2323
    .set_guest_pat        = vmx_set_guest_pat,
2324
    .get_guest_pat        = vmx_get_guest_pat,
2325
    .set_tsc_offset       = vmx_set_tsc_offset,
2326
    .inject_event         = vmx_inject_event,
2327
    .init_hypercall_page  = vmx_init_hypercall_page,
2328
    .event_pending        = vmx_event_pending,
2329
    .get_pending_event    = vmx_get_pending_event,
2330
    .invlpg               = vmx_invlpg,
2331
    .cpu_up               = vmx_cpu_up,
2332
    .cpu_down             = vmx_cpu_down,
2333
    .wbinvd_intercept     = vmx_wbinvd_intercept,
2334
    .fpu_dirty_intercept  = vmx_fpu_dirty_intercept,
2335
    .msr_read_intercept   = vmx_msr_read_intercept,
2336
    .msr_write_intercept  = vmx_msr_write_intercept,
2337
    .vmfunc_intercept     = vmx_vmfunc_intercept,
2338
    .handle_cd            = vmx_handle_cd,
2339
    .set_info_guest       = vmx_set_info_guest,
2340
    .set_rdtsc_exiting    = vmx_set_rdtsc_exiting,
2341
    .nhvm_vcpu_initialise = nvmx_vcpu_initialise,
2342
    .nhvm_vcpu_destroy    = nvmx_vcpu_destroy,
2343
    .nhvm_vcpu_reset      = nvmx_vcpu_reset,
2344
    .nhvm_vcpu_p2m_base   = nvmx_vcpu_eptp_base,
2345
    .nhvm_vmcx_hap_enabled = nvmx_ept_enabled,
2346
    .nhvm_vmcx_guest_intercepts_event = nvmx_intercepts_exception,
2347
    .nhvm_vcpu_vmexit_event = nvmx_vmexit_event,
2348
    .nhvm_intr_blocked    = nvmx_intr_blocked,
2349
    .nhvm_domain_relinquish_resources = nvmx_domain_relinquish_resources,
2350
    .update_eoi_exit_bitmap = vmx_update_eoi_exit_bitmap,
2351
    .virtual_intr_delivery_enabled = vmx_virtual_intr_delivery_enabled,
2352
    .process_isr          = vmx_process_isr,
2353
    .deliver_posted_intr  = vmx_deliver_posted_intr,
2354
    .sync_pir_to_irr      = vmx_sync_pir_to_irr,
2355
    .handle_eoi           = vmx_handle_eoi,
2356
    .nhvm_hap_walk_L1_p2m = nvmx_hap_walk_L1_p2m,
2357
    .enable_msr_interception = vmx_enable_msr_interception,
2358
    .is_singlestep_supported = vmx_is_singlestep_supported,
2359
    .set_mode = vmx_set_mode,
2360
    .altp2m_vcpu_update_p2m = vmx_vcpu_update_eptp,
2361
    .altp2m_vcpu_update_vmfunc_ve = vmx_vcpu_update_vmfunc_ve,
2362
    .altp2m_vcpu_emulate_ve = vmx_vcpu_emulate_ve,
2363
    .altp2m_vcpu_emulate_vmfunc = vmx_vcpu_emulate_vmfunc,
2364
    .tsc_scaling = {
2365
        .max_ratio = VMX_TSC_MULTIPLIER_MAX,
2366
        .setup     = vmx_setup_tsc_scaling,
2367
    },
2368
};
2369
2370
/* Handle VT-d posted-interrupt when VCPU is blocked. */
2371
static void pi_wakeup_interrupt(struct cpu_user_regs *regs)
2372
0
{
2373
0
    struct arch_vmx_struct *vmx, *tmp;
2374
0
    spinlock_t *lock = &per_cpu(vmx_pi_blocking, smp_processor_id()).lock;
2375
0
    struct list_head *blocked_vcpus =
2376
0
    &per_cpu(vmx_pi_blocking, smp_processor_id()).list;
2377
0
2378
0
    ack_APIC_irq();
2379
0
    this_cpu(irq_count)++;
2380
0
2381
0
    spin_lock(lock);
2382
0
2383
0
    /*
2384
0
     * XXX: The length of the list depends on how many vCPUs are currently
2385
0
     * blocked on this specific pCPU. This may hurt the interrupt latency
2386
0
     * if the list grows to too many entries.
2387
0
     */
2388
0
    list_for_each_entry_safe(vmx, tmp, blocked_vcpus, pi_blocking.list)
2389
0
    {
2390
0
        if ( pi_test_on(&vmx->pi_desc) )
2391
0
        {
2392
0
            list_del(&vmx->pi_blocking.list);
2393
0
            ASSERT(vmx->pi_blocking.lock == lock);
2394
0
            vmx->pi_blocking.lock = NULL;
2395
0
            vcpu_unblock(container_of(vmx, struct vcpu, arch.hvm_vmx));
2396
0
        }
2397
0
    }
2398
0
2399
0
    spin_unlock(lock);
2400
0
}
2401
2402
/* Handle VT-d posted-interrupt when VCPU is running. */
2403
static void pi_notification_interrupt(struct cpu_user_regs *regs)
2404
70
{
2405
70
    ack_APIC_irq();
2406
70
    this_cpu(irq_count)++;
2407
70
2408
70
    /*
2409
70
     * We get here when a vCPU is running in root-mode (such as via hypercall,
2410
70
     * or any other reason which can result in VM-Exit), and before the vCPU is
2411
70
     * back to non-root mode, external interrupts from an assigned device happen
2412
70
     * and a notification event is delivered to this logical CPU.
2413
70
     *
2414
70
     * We need to set VCPU_KICK_SOFTIRQ for the current CPU, just like
2415
70
     * __vmx_deliver_posted_interrupt(). So the pending interrupt in PIRR will
2416
70
     * be synced to vIRR before VM-Exit in time.
2417
70
     *
2418
70
     * Please refer to the following code fragments from
2419
70
     * xen/arch/x86/hvm/vmx/entry.S:
2420
70
     *
2421
70
     *     .Lvmx_do_vmentry
2422
70
     *
2423
70
     *      ......
2424
70
     *
2425
70
     *      point 1
2426
70
     *
2427
70
     *      cli
2428
70
     *      cmp  %ecx,(%rdx,%rax,1)
2429
70
     *      jnz  .Lvmx_process_softirqs
2430
70
     *
2431
70
     *      ......
2432
70
     *
2433
70
     *      je   .Lvmx_launch
2434
70
     *
2435
70
     *      ......
2436
70
     *
2437
70
     *     .Lvmx_process_softirqs:
2438
70
     *      sti
2439
70
     *      call do_softirq
2440
70
     *      jmp  .Lvmx_do_vmentry
2441
70
     *
2442
70
     * If the VT-d engine issues a notification event at point 1 above, it cannot
2443
70
     * be delivered to the guest during this VM-entry without raising the
2444
70
     * softirq in this notification handler.
2445
70
     */
2446
70
    raise_softirq(VCPU_KICK_SOFTIRQ);
2447
70
}
2448
2449
static void __init lbr_tsx_fixup_check(void);
2450
static void __init bdw_erratum_bdf14_fixup_check(void);
2451
2452
const struct hvm_function_table * __init start_vmx(void)
2453
1
{
2454
1
    set_in_cr4(X86_CR4_VMXE);
2455
1
2456
1
    if ( _vmx_cpu_up(true) )
2457
0
    {
2458
0
        printk("VMX: failed to initialise.\n");
2459
0
        return NULL;
2460
0
    }
2461
1
2462
1
    if ( cpu_has_vmx_dt_exiting )
2463
1
        vmx_function_table.set_descriptor_access_exiting =
2464
1
            vmx_set_descriptor_access_exiting;
2465
1
2466
1
    /*
2467
1
     * Do not enable EPT when (!cpu_has_vmx_pat), to prevent a security hole
2468
1
     * (refer to http://xenbits.xen.org/xsa/advisory-60.html).
2469
1
     */
2470
1
    if ( cpu_has_vmx_ept && (cpu_has_vmx_pat || opt_force_ept) )
2471
1
    {
2472
1
        vmx_function_table.hap_supported = 1;
2473
1
        vmx_function_table.altp2m_supported = 1;
2474
1
2475
1
        vmx_function_table.hap_capabilities = 0;
2476
1
2477
1
        if ( cpu_has_vmx_ept_2mb )
2478
1
            vmx_function_table.hap_capabilities |= HVM_HAP_SUPERPAGE_2MB;
2479
1
        if ( cpu_has_vmx_ept_1gb )
2480
1
            vmx_function_table.hap_capabilities |= HVM_HAP_SUPERPAGE_1GB;
2481
1
2482
1
        setup_ept_dump();
2483
1
    }
2484
1
2485
1
    if ( !cpu_has_vmx_virtual_intr_delivery )
2486
0
    {
2487
0
        vmx_function_table.update_eoi_exit_bitmap = NULL;
2488
0
        vmx_function_table.process_isr = NULL;
2489
0
        vmx_function_table.handle_eoi = NULL;
2490
0
    }
2491
1
2492
1
    if ( cpu_has_vmx_posted_intr_processing )
2493
1
    {
2494
1
        alloc_direct_apic_vector(&posted_intr_vector, pi_notification_interrupt);
2495
1
        if ( iommu_intpost )
2496
0
            alloc_direct_apic_vector(&pi_wakeup_vector, pi_wakeup_interrupt);
2497
1
    }
2498
1
    else
2499
0
    {
2500
0
        vmx_function_table.deliver_posted_intr = NULL;
2501
0
        vmx_function_table.sync_pir_to_irr = NULL;
2502
0
    }
2503
1
2504
1
    if ( cpu_has_vmx_tsc_scaling )
2505
0
        vmx_function_table.tsc_scaling.ratio_frac_bits = 48;
2506
1
2507
1
    if ( cpu_has_mpx && cpu_has_vmx_mpx )
2508
0
    {
2509
0
        vmx_function_table.set_guest_bndcfgs = vmx_set_guest_bndcfgs;
2510
0
        vmx_function_table.get_guest_bndcfgs = vmx_get_guest_bndcfgs;
2511
0
    }
2512
1
2513
1
    setup_vmcs_dump();
2514
1
2515
1
    lbr_tsx_fixup_check();
2516
1
    bdw_erratum_bdf14_fixup_check();
2517
1
2518
1
    return &vmx_function_table;
2519
1
}
2520
2521
/*
2522
 * Not all cases receive a valid value in the VM-exit instruction length field.
2523
 * Callers must know what they're doing!
2524
 */
2525
static int get_instruction_length(void)
2526
403k
{
2527
403k
    unsigned long len;
2528
403k
2529
403k
    __vmread(VM_EXIT_INSTRUCTION_LEN, &len); /* Safe: callers audited */
2530
403k
    BUG_ON((len < 1) || (len > MAX_INST_LEN));
2531
403k
    return len;
2532
403k
}
2533
2534
void update_guest_eip(void)
2535
400k
{
2536
400k
    struct cpu_user_regs *regs = guest_cpu_user_regs();
2537
400k
    unsigned long x;
2538
400k
2539
400k
    regs->rip += get_instruction_length(); /* Safe: callers audited */
2540
400k
    regs->eflags &= ~X86_EFLAGS_RF;
2541
400k
2542
400k
    __vmread(GUEST_INTERRUPTIBILITY_INFO, &x);
2543
400k
    if ( x & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) )
2544
65.6k
    {
2545
65.6k
        x &= ~(VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS);
2546
65.6k
        __vmwrite(GUEST_INTERRUPTIBILITY_INFO, x);
2547
65.6k
    }
2548
400k
2549
400k
    if ( regs->eflags & X86_EFLAGS_TF )
2550
0
        hvm_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
2551
400k
}
2552
2553
static void vmx_fpu_dirty_intercept(void)
2554
6
{
2555
6
    struct vcpu *curr = current;
2556
6
2557
6
    vmx_fpu_enter(curr);
2558
6
2559
6
    /* Disable TS in guest CR0 unless the guest wants the exception too. */
2560
6
    if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
2561
0
    {
2562
0
        curr->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS;
2563
0
        __vmwrite(GUEST_CR0, curr->arch.hvm_vcpu.hw_cr[0]);
2564
0
    }
2565
6
}
2566
2567
static int vmx_do_cpuid(struct cpu_user_regs *regs)
2568
2.66k
{
2569
2.66k
    struct vcpu *curr = current;
2570
2.66k
    uint32_t leaf = regs->eax, subleaf = regs->ecx;
2571
2.66k
    struct cpuid_leaf res;
2572
2.66k
2573
2.66k
    if ( hvm_check_cpuid_faulting(current) )
2574
0
    {
2575
0
        hvm_inject_hw_exception(TRAP_gp_fault, 0);
2576
0
        return 1;  /* Don't advance the guest IP! */
2577
0
    }
2578
2.66k
2579
2.66k
    guest_cpuid(curr, leaf, subleaf, &res);
2580
2.66k
    HVMTRACE_5D(CPUID, leaf, res.a, res.b, res.c, res.d);
2581
2.66k
2582
2.66k
    regs->rax = res.a;
2583
2.66k
    regs->rbx = res.b;
2584
2.66k
    regs->rcx = res.c;
2585
2.66k
    regs->rdx = res.d;
2586
2.66k
2587
2.66k
    return hvm_monitor_cpuid(get_instruction_length(), leaf, subleaf);
2588
2.66k
}
2589
2590
static void vmx_dr_access(unsigned long exit_qualification,
2591
                          struct cpu_user_regs *regs)
2592
0
{
2593
0
    struct vcpu *v = current;
2594
0
2595
0
    HVMTRACE_0D(DR_WRITE);
2596
0
2597
0
    if ( !v->arch.hvm_vcpu.flag_dr_dirty )
2598
0
        __restore_debug_registers(v);
2599
0
2600
0
    /* Allow guest direct access to DR registers */
2601
0
    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
2602
0
    vmx_update_cpu_exec_control(v);
2603
0
}
2604
2605
static void vmx_invlpg_intercept(unsigned long vaddr)
2606
0
{
2607
0
    HVMTRACE_LONG_2D(INVLPG, /*invlpga=*/ 0, TRC_PAR_LONG(vaddr));
2608
0
    paging_invlpg(current, vaddr);
2609
0
}
2610
2611
static void vmx_invlpg(struct vcpu *v, unsigned long vaddr)
2612
0
{
2613
0
    if ( cpu_has_vmx_vpid )
2614
0
        vpid_sync_vcpu_gva(v, vaddr);
2615
0
}
2616
2617
static int vmx_vmfunc_intercept(struct cpu_user_regs *regs)
2618
0
{
2619
0
    /*
2620
0
     * This handler is a placeholder for the future, where Xen may
2621
0
     * want to handle VMFUNC exits and resume a domain normally without
2622
0
     * injecting a #UD to the guest - for example, in a VT-nested
2623
0
     * scenario where Xen may want to lazily shadow the alternate
2624
0
     * EPTP list.
2625
0
     */
2626
0
    gdprintk(XENLOG_ERR, "Failed guest VMFUNC execution\n");
2627
0
    return X86EMUL_EXCEPTION;
2628
0
}
2629
2630
static int vmx_cr_access(unsigned long exit_qualification)
2631
7.76k
{
2632
7.76k
    struct vcpu *curr = current;
2633
7.76k
2634
7.76k
    switch ( VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification) )
2635
7.76k
    {
2636
2.93k
    case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR: {
2637
2.93k
        unsigned long gp = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification);
2638
2.93k
        unsigned long cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification);
2639
2.93k
        return hvm_mov_to_cr(cr, gp);
2640
2.93k
    }
2641
0
    case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR: {
2642
0
        unsigned long gp = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification);
2643
0
        unsigned long cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification);
2644
0
        return hvm_mov_from_cr(cr, gp);
2645
2.93k
    }
2646
3.79k
    case VMX_CONTROL_REG_ACCESS_TYPE_CLTS: {
2647
3.79k
        unsigned long old = curr->arch.hvm_vcpu.guest_cr[0];
2648
3.79k
        unsigned long value = old & ~X86_CR0_TS;
2649
3.79k
2650
3.79k
        /*
2651
3.79k
         * Special case unlikely to be interesting to a
2652
3.79k
         * VM_EVENT_FLAG_DENY-capable application, so the hvm_monitor_crX()
2653
3.79k
         * return value is ignored for now.
2654
3.79k
         */
2655
3.79k
        hvm_monitor_crX(CR0, value, old);
2656
3.79k
        curr->arch.hvm_vcpu.guest_cr[0] = value;
2657
3.79k
        vmx_update_guest_cr(curr, 0);
2658
3.79k
        HVMTRACE_0D(CLTS);
2659
3.79k
        break;
2660
2.93k
    }
2661
1.03k
    case VMX_CONTROL_REG_ACCESS_TYPE_LMSW: {
2662
1.03k
        unsigned long value = curr->arch.hvm_vcpu.guest_cr[0];
2663
1.03k
        int rc;
2664
1.03k
2665
1.03k
        /* LMSW can (1) set PE; (2) set or clear MP, EM, and TS. */
2666
1.03k
        value = (value & ~(X86_CR0_MP|X86_CR0_EM|X86_CR0_TS)) |
2667
1.03k
                (VMX_CONTROL_REG_ACCESS_DATA(exit_qualification) &
2668
1.03k
                 (X86_CR0_PE|X86_CR0_MP|X86_CR0_EM|X86_CR0_TS));
2669
1.03k
        HVMTRACE_LONG_1D(LMSW, value);
2670
1.03k
2671
1.03k
        if ( (rc = hvm_set_cr0(value, 1)) == X86EMUL_EXCEPTION )
2672
0
            hvm_inject_hw_exception(TRAP_gp_fault, 0);
2673
1.03k
2674
1.03k
        return rc;
2675
2.93k
    }
2676
0
    default:
2677
0
        BUG();
2678
7.76k
    }
2679
7.76k
2680
3.79k
    return X86EMUL_OKAY;
2681
7.76k
}
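In the LMSW case above the 16-bit operand is merged into CR0 so that PE can only be set while MP, EM and TS are replaced outright. A worked standalone example of that merge follows; it is not Xen code, lmsw_merge() is invented, and the EX_ constants use the architectural CR0 bit positions.

#include <stdint.h>
#include <stdio.h>

#define EX_CR0_PE (1ul << 0)
#define EX_CR0_MP (1ul << 1)
#define EX_CR0_EM (1ul << 2)
#define EX_CR0_TS (1ul << 3)

static unsigned long lmsw_merge(unsigned long cr0, uint16_t operand)
{
    /* Keep PE as-is unless the operand sets it; replace MP/EM/TS outright. */
    return (cr0 & ~(EX_CR0_MP | EX_CR0_EM | EX_CR0_TS)) |
           (operand & (EX_CR0_PE | EX_CR0_MP | EX_CR0_EM | EX_CR0_TS));
}

int main(void)
{
    unsigned long cr0 = EX_CR0_PE | EX_CR0_TS;   /* protected mode, TS set */

    /* An operand of 0 clears TS but cannot clear PE. */
    printf("new cr0 = %#lx\n", lmsw_merge(cr0, 0x0000));
    return 0;
}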
2682
2683
/* This defines the layout of struct lbr_info[] */
2684
0
#define LBR_LASTINT_FROM_IDX    0
2685
#define LBR_LASTINT_TO_IDX      1
2686
#define LBR_LASTBRANCH_TOS_IDX  2
2687
0
#define LBR_LASTBRANCH_FROM_IDX 3
2688
#define LBR_LASTBRANCH_TO_IDX   4
2689
#define LBR_LASTBRANCH_INFO     5
2690
2691
static const struct lbr_info {
2692
    u32 base, count;
2693
} p4_lbr[] = {
2694
    { MSR_P4_LER_FROM_LIP,          1 },
2695
    { MSR_P4_LER_TO_LIP,            1 },
2696
    { MSR_P4_LASTBRANCH_TOS,        1 },
2697
    { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
2698
    { MSR_P4_LASTBRANCH_0_TO_LIP,   NUM_MSR_P4_LASTBRANCH_FROM_TO },
2699
    { 0, 0 }
2700
}, c2_lbr[] = {
2701
    { MSR_IA32_LASTINTFROMIP,       1 },
2702
    { MSR_IA32_LASTINTTOIP,         1 },
2703
    { MSR_C2_LASTBRANCH_TOS,        1 },
2704
    { MSR_C2_LASTBRANCH_0_FROM_IP,  NUM_MSR_C2_LASTBRANCH_FROM_TO },
2705
    { MSR_C2_LASTBRANCH_0_TO_IP,    NUM_MSR_C2_LASTBRANCH_FROM_TO },
2706
    { 0, 0 }
2707
}, nh_lbr[] = {
2708
    { MSR_IA32_LASTINTFROMIP,       1 },
2709
    { MSR_IA32_LASTINTTOIP,         1 },
2710
    { MSR_C2_LASTBRANCH_TOS,        1 },
2711
    { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
2712
    { MSR_P4_LASTBRANCH_0_TO_LIP,   NUM_MSR_P4_LASTBRANCH_FROM_TO },
2713
    { 0, 0 }
2714
}, sk_lbr[] = {
2715
    { MSR_IA32_LASTINTFROMIP,       1 },
2716
    { MSR_IA32_LASTINTTOIP,         1 },
2717
    { MSR_SKL_LASTBRANCH_TOS,       1 },
2718
    { MSR_SKL_LASTBRANCH_0_FROM_IP, NUM_MSR_SKL_LASTBRANCH },
2719
    { MSR_SKL_LASTBRANCH_0_TO_IP,   NUM_MSR_SKL_LASTBRANCH },
2720
    { MSR_SKL_LASTBRANCH_0_INFO,    NUM_MSR_SKL_LASTBRANCH },
2721
    { 0, 0 }
2722
}, at_lbr[] = {
2723
    { MSR_IA32_LASTINTFROMIP,       1 },
2724
    { MSR_IA32_LASTINTTOIP,         1 },
2725
    { MSR_C2_LASTBRANCH_TOS,        1 },
2726
    { MSR_C2_LASTBRANCH_0_FROM_IP,  NUM_MSR_ATOM_LASTBRANCH_FROM_TO },
2727
    { MSR_C2_LASTBRANCH_0_TO_IP,    NUM_MSR_ATOM_LASTBRANCH_FROM_TO },
2728
    { 0, 0 }
2729
}, gm_lbr[] = {
2730
    { MSR_IA32_LASTINTFROMIP,       1 },
2731
    { MSR_IA32_LASTINTTOIP,         1 },
2732
    { MSR_GM_LASTBRANCH_TOS,        1 },
2733
    { MSR_GM_LASTBRANCH_0_FROM_IP,  NUM_MSR_GM_LASTBRANCH_FROM_TO },
2734
    { MSR_GM_LASTBRANCH_0_TO_IP,    NUM_MSR_GM_LASTBRANCH_FROM_TO },
2735
    { 0, 0 }
2736
};
2737
2738
static const struct lbr_info *last_branch_msr_get(void)
2739
2
{
2740
2
    switch ( boot_cpu_data.x86 )
2741
2
    {
2742
2
    case 6:
2743
2
        switch ( boot_cpu_data.x86_model )
2744
2
        {
2745
2
        /* Core2 Duo */
2746
0
        case 0x0f:
2747
0
        /* Enhanced Core */
2748
0
        case 0x17:
2749
0
        /* Xeon 7400 */
2750
0
        case 0x1d:
2751
0
            return c2_lbr;
2752
0
        /* Nehalem */
2753
2
        case 0x1a: case 0x1e: case 0x1f: case 0x2e:
2754
2
        /* Westmere */
2755
2
        case 0x25: case 0x2c: case 0x2f:
2756
2
        /* Sandy Bridge */
2757
2
        case 0x2a: case 0x2d:
2758
2
        /* Ivy Bridge */
2759
2
        case 0x3a: case 0x3e:
2760
2
        /* Haswell */
2761
2
        case 0x3c: case 0x3f: case 0x45: case 0x46:
2762
2
        /* Broadwell */
2763
2
        case 0x3d: case 0x47: case 0x4f: case 0x56:
2764
2
            return nh_lbr;
2765
2
        /* Skylake */
2766
0
        case 0x4e: case 0x5e:
2767
0
        /* Xeon Scalable */
2768
0
        case 0x55:
2769
0
        /* Cannon Lake */
2770
0
        case 0x66:
2771
0
        /* Goldmont Plus */
2772
0
        case 0x7a:
2773
0
        /* Kaby Lake */
2774
0
        case 0x8e: case 0x9e:
2775
0
            return sk_lbr;
2776
0
        /* Atom */
2777
0
        case 0x1c: case 0x26: case 0x27: case 0x35: case 0x36:
2778
0
        /* Silvermont */
2779
0
        case 0x37: case 0x4a: case 0x4d: case 0x5a: case 0x5d:
2780
0
        /* Xeon Phi Knights Landing */
2781
0
        case 0x57:
2782
0
        /* Xeon Phi Knights Mill */
2783
0
        case 0x85:
2784
0
        /* Airmont */
2785
0
        case 0x4c:
2786
0
            return at_lbr;
2787
0
        /* Goldmont */
2788
0
        case 0x5c: case 0x5f:
2789
0
            return gm_lbr;
2790
2
        }
2791
0
        break;
2792
2
2793
0
    case 15:
2794
0
        switch ( boot_cpu_data.x86_model )
2795
0
        {
2796
0
        /* Pentium4/Xeon with em64t */
2797
0
        case 3: case 4: case 6:
2798
0
            return p4_lbr;
2799
0
        }
2800
0
        break;
2801
2
    }
2802
2
2803
0
    return NULL;
2804
2
}
2805
2806
enum
2807
{
2808
    LBR_FORMAT_32                 = 0x0, /* 32-bit record format */
2809
    LBR_FORMAT_LIP                = 0x1, /* 64-bit LIP record format */
2810
    LBR_FORMAT_EIP                = 0x2, /* 64-bit EIP record format */
2811
    LBR_FORMAT_EIP_FLAGS          = 0x3, /* 64-bit EIP, Flags */
2812
    LBR_FORMAT_EIP_FLAGS_TSX      = 0x4, /* 64-bit EIP, Flags, TSX */
2813
    LBR_FORMAT_EIP_FLAGS_TSX_INFO = 0x5, /* 64-bit EIP, Flags, TSX, LBR_INFO */
2814
    LBR_FORMAT_EIP_FLAGS_CYCLES   = 0x6, /* 64-bit EIP, Flags, Cycles */
2815
    LBR_FORMAT_LIP_FLAGS_TSX_INFO = 0x7, /* 64-bit LIP, Flags, TSX, LBR_INFO */
2816
};
2817
2818
0
#define LBR_FROM_SIGNEXT_2MSB  ((1ULL << 59) | (1ULL << 60))
2819
2820
0
#define FIXUP_LBR_TSX            (1u << 0)
2821
0
#define FIXUP_BDW_ERRATUM_BDF14  (1u << 1)
2822
2823
static bool __read_mostly lbr_tsx_fixup_needed;
2824
static bool __read_mostly bdw_erratum_bdf14_fixup_needed;
2825
static uint32_t __read_mostly lbr_from_start;
2826
static uint32_t __read_mostly lbr_from_end;
2827
static uint32_t __read_mostly lbr_lastint_from;
2828
2829
static void __init lbr_tsx_fixup_check(void)
2830
1
{
2831
1
    bool tsx_support = cpu_has_hle || cpu_has_rtm;
2832
1
    uint64_t caps;
2833
1
    uint32_t lbr_format;
2834
1
2835
1
    /* Fixup is needed only when TSX support is disabled ... */
2836
1
    if ( tsx_support )
2837
0
        return;
2838
1
2839
1
    if ( !cpu_has_pdcm )
2840
0
        return;
2841
1
2842
1
    rdmsrl(MSR_IA32_PERF_CAPABILITIES, caps);
2843
1
    lbr_format = caps & MSR_IA32_PERF_CAP_LBR_FORMAT;
2844
1
2845
1
    /* ... and the address format of LBR includes TSX bits 61:62 */
2846
1
    if ( lbr_format == LBR_FORMAT_EIP_FLAGS_TSX )
2847
0
    {
2848
0
        const struct lbr_info *lbr = last_branch_msr_get();
2849
0
2850
0
        if ( lbr == NULL )
2851
0
            return;
2852
0
2853
0
        lbr_lastint_from = lbr[LBR_LASTINT_FROM_IDX].base;
2854
0
        lbr_from_start = lbr[LBR_LASTBRANCH_FROM_IDX].base;
2855
0
        lbr_from_end = lbr_from_start + lbr[LBR_LASTBRANCH_FROM_IDX].count;
2856
0
2857
0
        lbr_tsx_fixup_needed = true;
2858
0
    }
2859
1
}
2860
2861
static void __init bdw_erratum_bdf14_fixup_check(void)
2862
1
{
2863
1
    /* Broadwell E5-2600 v4 processors need to work around erratum BDF14. */
2864
1
    if ( boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 79 )
2865
0
        bdw_erratum_bdf14_fixup_needed = true;
2866
1
}
2867
2868
static int is_last_branch_msr(u32 ecx)
2869
2
{
2870
2
    const struct lbr_info *lbr = last_branch_msr_get();
2871
2
2872
2
    if ( lbr == NULL )
2873
0
        return 0;
2874
2
2875
12
    for ( ; lbr->count; lbr++ )
2876
10
        if ( (ecx >= lbr->base) && (ecx < (lbr->base + lbr->count)) )
2877
0
            return 1;
2878
2
2879
2
    return 0;
2880
2
}
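is_last_branch_msr() above walks a zero-terminated table of {base, count} ranges to decide whether an MSR belongs to the LBR set. The self-contained sketch below shows the same membership test on a made-up table; none of the names or values are the real Xen ones.

#include <stdint.h>
#include <stdio.h>

struct ex_msr_range {
    uint32_t base, count;
};

static const struct ex_msr_range ex_lbr[] = {
    { 0x1db,  1 },   /* made-up single MSR */
    { 0x680, 16 },   /* made-up 16-entry block */
    { 0, 0 }         /* terminator, as in the table above */
};

static int in_lbr_set(uint32_t msr)
{
    for ( const struct ex_msr_range *r = ex_lbr; r->count; r++ )
        if ( msr >= r->base && msr < r->base + r->count )
            return 1;
    return 0;
}

int main(void)
{
    printf("0x68f in set: %d\n", in_lbr_set(0x68f));   /* last entry of the block */
    printf("0x690 in set: %d\n", in_lbr_set(0x690));   /* just past the end */
    return 0;
}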
2881
2882
static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
2883
12
{
2884
12
    const struct vcpu *curr = current;
2885
12
2886
12
    HVM_DBG_LOG(DBG_LEVEL_MSR, "ecx=%#x", msr);
2887
12
2888
12
    switch ( msr )
2889
12
    {
2890
0
    case MSR_IA32_SYSENTER_CS:
2891
0
        __vmread(GUEST_SYSENTER_CS, msr_content);
2892
0
        break;
2893
0
    case MSR_IA32_SYSENTER_ESP:
2894
0
        __vmread(GUEST_SYSENTER_ESP, msr_content);
2895
0
        break;
2896
0
    case MSR_IA32_SYSENTER_EIP:
2897
0
        __vmread(GUEST_SYSENTER_EIP, msr_content);
2898
0
        break;
2899
0
    case MSR_IA32_DEBUGCTLMSR:
2900
0
        __vmread(GUEST_IA32_DEBUGCTL, msr_content);
2901
0
        break;
2902
0
    case MSR_IA32_FEATURE_CONTROL:
2903
0
        *msr_content = IA32_FEATURE_CONTROL_LOCK;
2904
0
        if ( vmce_has_lmce(curr) )
2905
0
            *msr_content |= IA32_FEATURE_CONTROL_LMCE_ON;
2906
0
        if ( nestedhvm_enabled(curr->domain) )
2907
0
            *msr_content |= IA32_FEATURE_CONTROL_ENABLE_VMXON_OUTSIDE_SMX;
2908
0
        break;
2909
0
    case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_VMFUNC:
2910
0
        if ( !nvmx_msr_read_intercept(msr, msr_content) )
2911
0
            goto gp_fault;
2912
0
        break;
2913
12
    case MSR_IA32_MISC_ENABLE:
2914
12
        rdmsrl(MSR_IA32_MISC_ENABLE, *msr_content);
2915
12
        /* Debug Trace Store is not supported. */
2916
12
        *msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
2917
12
                       MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
2918
12
        /* Perhaps vpmu will change some bits. */
2919
12
        /* FALLTHROUGH */
2920
12
    case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
2921
12
    case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
2922
12
    case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
2923
12
    case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
2924
12
    case MSR_IA32_PEBS_ENABLE:
2925
12
    case MSR_IA32_DS_AREA:
2926
12
        if ( vpmu_do_rdmsr(msr, msr_content) )
2927
0
            goto gp_fault;
2928
12
        break;
2929
12
2930
0
    default:
2931
0
        if ( passive_domain_do_rdmsr(msr, msr_content) )
2932
0
            goto done;
2933
0
        switch ( long_mode_do_msr_read(msr, msr_content) )
2934
0
        {
2935
0
            case HNDL_unhandled:
2936
0
                break;
2937
0
            case HNDL_exception_raised:
2938
0
                return X86EMUL_EXCEPTION;
2939
0
            case HNDL_done:
2940
0
                goto done;
2941
0
        }
2942
0
2943
0
        if ( vmx_read_guest_msr(msr, msr_content) == 0 )
2944
0
            break;
2945
0
2946
0
        if ( is_last_branch_msr(msr) )
2947
0
        {
2948
0
            *msr_content = 0;
2949
0
            break;
2950
0
        }
2951
0
2952
0
        if ( rdmsr_viridian_regs(msr, msr_content) ||
2953
0
             rdmsr_hypervisor_regs(msr, msr_content) )
2954
0
            break;
2955
0
2956
0
        if ( rdmsr_safe(msr, *msr_content) == 0 )
2957
0
            break;
2958
0
2959
0
        goto gp_fault;
2960
12
    }
2961
12
2962
12
done:
2963
12
    HVM_DBG_LOG(DBG_LEVEL_MSR, "returns: ecx=%#x, msr_value=%#"PRIx64,
2964
12
                msr, *msr_content);
2965
12
    return X86EMUL_OKAY;
2966
12
2967
0
gp_fault:
2968
0
    return X86EMUL_EXCEPTION;
2969
12
}
2970
2971
static int vmx_alloc_vlapic_mapping(struct domain *d)
2972
1
{
2973
1
    struct page_info *pg;
2974
1
    unsigned long mfn;
2975
1
2976
1
    if ( !cpu_has_vmx_virtualize_apic_accesses )
2977
0
        return 0;
2978
1
2979
1
    pg = alloc_domheap_page(d, MEMF_no_owner);
2980
1
    if ( !pg )
2981
0
        return -ENOMEM;
2982
1
    mfn = page_to_mfn(pg);
2983
1
    clear_domain_page(_mfn(mfn));
2984
1
    share_xen_page_with_guest(pg, d, XENSHARE_writable);
2985
1
    d->arch.hvm_domain.vmx.apic_access_mfn = mfn;
2986
1
    set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(mfn),
2987
1
                       PAGE_ORDER_4K, p2m_get_hostp2m(d)->default_access);
2988
1
2989
1
    return 0;
2990
1
}
2991
2992
static void vmx_free_vlapic_mapping(struct domain *d)
2993
0
{
2994
0
    unsigned long mfn = d->arch.hvm_domain.vmx.apic_access_mfn;
2995
0
2996
0
    if ( mfn != 0 )
2997
0
        free_shared_domheap_page(mfn_to_page(mfn));
2998
0
}
2999
3000
static void vmx_install_vlapic_mapping(struct vcpu *v)
3001
12
{
3002
12
    paddr_t virt_page_ma, apic_page_ma;
3003
12
3004
12
    if ( v->domain->arch.hvm_domain.vmx.apic_access_mfn == 0 )
3005
0
        return;
3006
12
3007
12
    ASSERT(cpu_has_vmx_virtualize_apic_accesses);
3008
12
3009
12
    virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
3010
12
    apic_page_ma = v->domain->arch.hvm_domain.vmx.apic_access_mfn;
3011
12
    apic_page_ma <<= PAGE_SHIFT;
3012
12
3013
12
    vmx_vmcs_enter(v);
3014
12
    __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
3015
12
    __vmwrite(APIC_ACCESS_ADDR, apic_page_ma);
3016
12
    vmx_vmcs_exit(v);
3017
12
}
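vmx_install_vlapic_mapping() above only needs two addresses: the machine address of the vCPU's virtual-APIC register page and the machine address of the shared APIC access page, the latter obtained by shifting the stored frame number. A trivial standalone sketch of those conversions, assuming 4 KiB pages and the usual 0xfee00000 default APIC base:

#include <stdint.h>

#define PAGE_SHIFT        12             /* assumed: 4 KiB pages */
#define APIC_DEFAULT_BASE 0xfee00000UL   /* architectural default xAPIC MMIO base */

typedef uint64_t paddr_t;

/* Guest frame number of the APIC window the access page is mapped at. */
static unsigned long apic_window_gfn(void)
{
    return APIC_DEFAULT_BASE >> PAGE_SHIFT;
}

/* Machine byte address programmed into APIC_ACCESS_ADDR, from a machine frame number. */
static paddr_t apic_access_maddr(unsigned long mfn)
{
    return (paddr_t)mfn << PAGE_SHIFT;
}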
3018
3019
void vmx_vlapic_msr_changed(struct vcpu *v)
3020
21
{
3021
21
    int virtualize_x2apic_mode;
3022
21
    struct vlapic *vlapic = vcpu_vlapic(v);
3023
21
    unsigned int msr;
3024
21
3025
21
    virtualize_x2apic_mode = ( (cpu_has_vmx_apic_reg_virt ||
3026
0
                                cpu_has_vmx_virtual_intr_delivery) &&
3027
21
                               cpu_has_vmx_virtualize_x2apic_mode );
3028
21
3029
21
    if ( !cpu_has_vmx_virtualize_apic_accesses &&
3030
0
         !virtualize_x2apic_mode )
3031
0
        return;
3032
21
3033
21
    vmx_vmcs_enter(v);
3034
21
    v->arch.hvm_vmx.secondary_exec_control &=
3035
21
        ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
3036
21
          SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
3037
21
    if ( !vlapic_hw_disabled(vlapic) &&
3038
19
         (vlapic_base_address(vlapic) == APIC_DEFAULT_PHYS_BASE) )
3039
20
    {
3040
22
        if ( virtualize_x2apic_mode && vlapic_x2apic_mode(vlapic) )
3041
10
        {
3042
10
            v->arch.hvm_vmx.secondary_exec_control |=
3043
10
                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
3044
10
            if ( cpu_has_vmx_apic_reg_virt )
3045
11
            {
3046
11
                for ( msr = MSR_IA32_APICBASE_MSR;
3047
2.04k
                      msr <= MSR_IA32_APICBASE_MSR + 0xff; msr++ )
3048
2.03k
                    vmx_clear_msr_intercept(v, msr, VMX_MSR_R);
3049
11
3050
11
                vmx_set_msr_intercept(v, MSR_IA32_APICPPR_MSR, VMX_MSR_R);
3051
11
                vmx_set_msr_intercept(v, MSR_IA32_APICTMICT_MSR, VMX_MSR_R);
3052
11
                vmx_set_msr_intercept(v, MSR_IA32_APICTMCCT_MSR, VMX_MSR_R);
3053
11
            }
3054
10
            if ( cpu_has_vmx_virtual_intr_delivery )
3055
12
            {
3056
12
                vmx_clear_msr_intercept(v, MSR_IA32_APICTPR_MSR, VMX_MSR_W);
3057
12
                vmx_clear_msr_intercept(v, MSR_IA32_APICEOI_MSR, VMX_MSR_W);
3058
12
                vmx_clear_msr_intercept(v, MSR_IA32_APICSELF_MSR, VMX_MSR_W);
3059
12
            }
3060
10
        }
3061
20
        else
3062
10
            v->arch.hvm_vmx.secondary_exec_control |=
3063
10
                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
3064
20
    }
3065
21
    if ( !(v->arch.hvm_vmx.secondary_exec_control &
3066
21
           SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE) )
3067
12
        for ( msr = MSR_IA32_APICBASE_MSR;
3068
3.08k
              msr <= MSR_IA32_APICBASE_MSR + 0xff; msr++ )
3069
3.07k
            vmx_set_msr_intercept(v, msr, VMX_MSR_RW);
3070
21
3071
21
    vmx_update_secondary_exec_control(v);
3072
21
    vmx_vmcs_exit(v);
3073
21
}
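Both loops in vmx_vlapic_msr_changed() above walk the full x2APIC MSR window, MSR_IA32_APICBASE_MSR through MSR_IA32_APICBASE_MSR + 0xff. A standalone sketch of that walk, assuming the architectural x2APIC MSR base of 0x800 and a hypothetical callback in place of vmx_set_msr_intercept()/vmx_clear_msr_intercept():

#include <stdbool.h>
#include <stdint.h>

#define X2APIC_MSR_BASE 0x800u                    /* architectural x2APIC MSR window start */
#define X2APIC_MSR_LAST (X2APIC_MSR_BASE + 0xff)  /* ...and its last MSR */

/* Hypothetical per-MSR intercept hook. */
typedef void (*intercept_cb)(uint32_t msr, bool read, bool write);

/* Apply one intercept policy to every MSR in the x2APIC window. */
static void walk_x2apic_msrs(intercept_cb cb, bool read, bool write)
{
    uint32_t msr;

    for ( msr = X2APIC_MSR_BASE; msr <= X2APIC_MSR_LAST; msr++ )
        cb(msr, read, write);
}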
3074
3075
static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
3076
50
{
3077
50
    struct vcpu *v = current;
3078
50
3079
50
    HVM_DBG_LOG(DBG_LEVEL_MSR, "ecx=%#x, msr_value=%#"PRIx64, msr, msr_content);
3080
50
3081
50
    switch ( msr )
3082
50
    {
3083
0
    case MSR_IA32_SYSENTER_CS:
3084
0
        __vmwrite(GUEST_SYSENTER_CS, msr_content);
3085
0
        break;
3086
0
    case MSR_IA32_SYSENTER_ESP:
3087
0
        if ( !is_canonical_address(msr_content) )
3088
0
            goto gp_fault;
3089
0
        __vmwrite(GUEST_SYSENTER_ESP, msr_content);
3090
0
        break;
3091
0
    case MSR_IA32_SYSENTER_EIP:
3092
0
        if ( !is_canonical_address(msr_content) )
3093
0
            goto gp_fault;
3094
0
        __vmwrite(GUEST_SYSENTER_EIP, msr_content);
3095
0
        break;
3096
0
    case MSR_IA32_DEBUGCTLMSR: {
3097
0
        int i, rc = 0;
3098
0
        uint64_t supported = IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF;
3099
0
3100
0
        if ( boot_cpu_has(X86_FEATURE_RTM) )
3101
0
            supported |= IA32_DEBUGCTLMSR_RTM;
3102
0
        if ( msr_content & ~supported )
3103
0
        {
3104
0
            /* Perhaps some other bits are supported in vpmu. */
3105
0
            if ( vpmu_do_wrmsr(msr, msr_content, supported) )
3106
0
                break;
3107
0
        }
3108
0
        if ( msr_content & IA32_DEBUGCTLMSR_LBR )
3109
0
        {
3110
0
            const struct lbr_info *lbr = last_branch_msr_get();
3111
0
            if ( lbr == NULL )
3112
0
                break;
3113
0
3114
0
            for ( ; (rc == 0) && lbr->count; lbr++ )
3115
0
                for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
3116
0
                    if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 )
3117
0
                    {
3118
0
                        vmx_clear_msr_intercept(v, lbr->base + i, VMX_MSR_RW);
3119
0
                        if ( lbr_tsx_fixup_needed )
3120
0
                            v->arch.hvm_vmx.lbr_fixup_enabled |= FIXUP_LBR_TSX;
3121
0
                        if ( bdw_erratum_bdf14_fixup_needed )
3122
0
                            v->arch.hvm_vmx.lbr_fixup_enabled |=
3123
0
                                FIXUP_BDW_ERRATUM_BDF14;
3124
0
                    }
3125
0
        }
3126
0
3127
0
        if ( (rc < 0) ||
3128
0
             (msr_content && (vmx_add_host_load_msr(msr) < 0)) )
3129
0
            hvm_inject_hw_exception(TRAP_machine_check, X86_EVENT_NO_EC);
3130
0
        else
3131
0
            __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
3132
0
3133
0
        break;
3134
0
    }
3135
0
    case MSR_IA32_FEATURE_CONTROL:
3136
0
    case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
3137
0
        /* None of these MSRs are writeable. */
3138
0
        goto gp_fault;
3139
0
3140
0
    case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
3141
0
    case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(7):
3142
0
    case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
3143
0
    case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
3144
0
    case MSR_IA32_PEBS_ENABLE:
3145
0
    case MSR_IA32_DS_AREA:
3146
0
         if ( vpmu_do_wrmsr(msr, msr_content, 0) )
3147
0
            goto gp_fault;
3148
0
        break;
3149
0
3150
50
    default:
3151
50
        if ( passive_domain_do_wrmsr(msr, msr_content) )
3152
0
            return X86EMUL_OKAY;
3153
50
3154
50
        if ( wrmsr_viridian_regs(msr, msr_content) ) 
3155
0
            break;
3156
50
3157
50
        switch ( long_mode_do_msr_write(msr, msr_content) )
3158
50
        {
3159
2
            case HNDL_unhandled:
3160
2
                if ( (vmx_write_guest_msr(msr, msr_content) != 0) &&
3161
2
                     !is_last_branch_msr(msr) )
3162
2
                    switch ( wrmsr_hypervisor_regs(msr, msr_content) )
3163
2
                    {
3164
0
                    case -ERESTART:
3165
0
                        return X86EMUL_RETRY;
3166
2
                    case 0:
3167
2
                    case 1:
3168
2
                        break;
3169
0
                    default:
3170
0
                        goto gp_fault;
3171
2
                    }
3172
2
                break;
3173
0
            case HNDL_exception_raised:
3174
0
                return X86EMUL_EXCEPTION;
3175
48
            case HNDL_done:
3176
48
                break;
3177
50
        }
3178
50
        break;
3179
50
    }
3180
50
3181
50
    return X86EMUL_OKAY;
3182
50
3183
0
gp_fault:
3184
0
    return X86EMUL_EXCEPTION;
3185
50
}
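The MSR_IA32_DEBUGCTLMSR case above builds a mask of bits it is willing to accept (LBR and BTF, plus RTM when the CPU advertises it) and treats anything outside that mask as a reason to defer to the vPMU or fault. A minimal sketch of that "unsupported bits set?" test, with the mask passed in rather than hard-coding bit positions:

#include <stdbool.h>
#include <stdint.h>

/*
 * True when val only uses bits contained in 'supported'; the caller decides
 * whether anything else means #GP or a hand-off to another handler.
 */
static bool msr_write_allowed(uint64_t val, uint64_t supported)
{
    return (val & ~supported) == 0;
}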
3186
3187
static void vmx_do_extint(struct cpu_user_regs *regs)
3188
17.8k
{
3189
17.8k
    unsigned long vector;
3190
17.8k
3191
17.8k
    __vmread(VM_EXIT_INTR_INFO, &vector);
3192
17.8k
    BUG_ON(!(vector & INTR_INFO_VALID_MASK));
3193
17.8k
3194
17.8k
    vector &= INTR_INFO_VECTOR_MASK;
3195
17.8k
    HVMTRACE_1D(INTR, vector);
3196
17.8k
3197
17.8k
    regs->entry_vector = vector;
3198
17.8k
    do_IRQ(regs);
3199
17.8k
}
3200
3201
static void vmx_wbinvd_intercept(void)
3202
42
{
3203
42
    if ( !cache_flush_permitted(current->domain) || iommu_snoop )
3204
42
        return;
3205
42
3206
0
    if ( cpu_has_wbinvd_exiting )
3207
0
        flush_all(FLUSH_CACHE);
3208
0
    else
3209
0
        wbinvd();
3210
0
}
3211
3212
static void ept_handle_violation(ept_qual_t q, paddr_t gpa)
3213
60.1k
{
3214
60.1k
    unsigned long gla, gfn = gpa >> PAGE_SHIFT;
3215
60.1k
    mfn_t mfn;
3216
60.1k
    p2m_type_t p2mt;
3217
60.1k
    int ret;
3218
60.1k
    struct domain *d = current->domain;
3219
60.1k
3220
60.1k
    /*
3221
60.1k
     * We treat all write violations also as read violations.
3222
60.1k
     * The reason why this is required is the following warning:
3223
60.1k
     * "An EPT violation that occurs during as a result of execution of a
3224
60.1k
     * read-modify-write operation sets bit 1 (data write). Whether it also
3225
60.1k
     * sets bit 0 (data read) is implementation-specific and, for a given
3226
60.1k
     * implementation, may differ for different kinds of read-modify-write
3227
60.1k
     * operations."
3228
60.1k
     * - Intel(R) 64 and IA-32 Architectures Software Developer's Manual
3229
60.1k
     *   Volume 3C: System Programming Guide, Part 3
3230
60.1k
     */
3231
60.1k
    struct npfec npfec = {
3232
4.11k
        .read_access = q.read || q.write,
3233
60.1k
        .write_access = q.write,
3234
60.1k
        .insn_fetch = q.fetch,
3235
60.1k
        .present = q.eff_read || q.eff_write || q.eff_exec,
3236
60.1k
    };
3237
60.1k
3238
60.1k
    if ( tb_init_done )
3239
0
    {
3240
0
        struct {
3241
0
            uint64_t gpa;
3242
0
            uint64_t mfn;
3243
0
            u32 qualification;
3244
0
            u32 p2mt;
3245
0
        } _d;
3246
0
3247
0
        _d.gpa = gpa;
3248
0
        _d.qualification = q.raw;
3249
0
        _d.mfn = mfn_x(get_gfn_query_unlocked(d, gfn, &_d.p2mt));
3250
0
3251
0
        __trace_var(TRC_HVM_NPF, 0, sizeof(_d), &_d);
3252
0
    }
3253
60.1k
3254
60.1k
    if ( q.gla_valid )
3255
60.1k
    {
3256
60.1k
        __vmread(GUEST_LINEAR_ADDRESS, &gla);
3257
60.1k
        npfec.gla_valid = 1;
3258
60.1k
        if( q.gla_fault )
3259
60.1k
            npfec.kind = npfec_kind_with_gla;
3260
60.1k
        else
3261
0
            npfec.kind = npfec_kind_in_gpt;
3262
60.1k
    }
3263
60.1k
    else
3264
0
        gla = ~0ull;
3265
60.1k
3266
60.1k
    ret = hvm_hap_nested_page_fault(gpa, gla, npfec);
3267
60.1k
    switch ( ret )
3268
60.1k
    {
3269
0
    case 0:         // Unhandled L1 EPT violation
3270
0
        break;
3271
60.1k
    case 1:         // This violation is handled completly
3272
60.1k
        return;
3273
0
    case -1:        // This vioaltion should be injected to L1 VMM
3274
0
        vcpu_nestedhvm(current).nv_vmexit_pending = 1;
3275
0
        return;
3276
60.1k
    }
3277
60.1k
3278
60.1k
    /* Everything else is an error. */
3279
0
    mfn = get_gfn_query_unlocked(d, gfn, &p2mt);
3280
0
    gprintk(XENLOG_ERR,
3281
0
            "EPT violation %#lx (%c%c%c/%c%c%c) gpa %#"PRIpaddr" mfn %#lx type %i\n",
3282
0
            q.raw,
3283
0
            q.read  ? 'r' : '-',
3284
0
            q.write ? 'w' : '-',
3285
0
            q.fetch ? 'x' : '-',
3286
0
            q.eff_read  ? 'r' : '-',
3287
0
            q.eff_write ? 'w' : '-',
3288
0
            q.eff_exec  ? 'x' : '-',
3289
0
            gpa, mfn_x(mfn), p2mt);
3290
0
3291
0
    ept_walk_table(d, gfn);
3292
0
3293
0
    if ( q.gla_valid )
3294
0
        gprintk(XENLOG_ERR, " --- GLA %#lx\n", gla);
3295
0
3296
0
    domain_crash(d);
3297
0
}
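The qualification bits consumed by ept_handle_violation() above follow the architectural layout: bits 0-2 describe the attempted access (read/write/fetch), bits 3-5 the permissions the EPT entry actually granted, bit 7 whether a guest-linear address was recorded, and bit 8 whether the fault hit the final translation rather than the guest page-table walk. A hedged standalone decoder over that layout:

#include <stdbool.h>
#include <stdint.h>

struct ept_viol {
    bool read, write, fetch;   /* bits 0-2: what the access attempted */
    bool eff_r, eff_w, eff_x;  /* bits 3-5: what the EPT entry permitted */
    bool gla_valid;            /* bit 7: guest-linear address recorded */
    bool gla_fault;            /* bit 8: fault on the final translation */
};

static struct ept_viol decode_ept_qual(uint64_t q)
{
    struct ept_viol v = {
        .read      = q & (1u << 0),
        .write     = q & (1u << 1),
        .fetch     = q & (1u << 2),
        .eff_r     = q & (1u << 3),
        .eff_w     = q & (1u << 4),
        .eff_x     = q & (1u << 5),
        .gla_valid = q & (1u << 7),
        .gla_fault = q & (1u << 8),
    };

    return v;
}

As in the code above, "present" in the nested-page-fault descriptor is simply eff_r || eff_w || eff_x, and a write is also reported as a read to cover read-modify-write accesses.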
3298
3299
static void vmx_failed_vmentry(unsigned int exit_reason,
3300
                               struct cpu_user_regs *regs)
3301
0
{
3302
0
    unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
3303
0
    unsigned long exit_qualification;
3304
0
    struct vcpu *curr = current;
3305
0
3306
0
    printk("%pv vmentry failure (reason %#x): ", curr, exit_reason);
3307
0
    __vmread(EXIT_QUALIFICATION, &exit_qualification);
3308
0
    switch ( failed_vmentry_reason )
3309
0
    {
3310
0
    case EXIT_REASON_INVALID_GUEST_STATE:
3311
0
        printk("Invalid guest state (%lu)\n", exit_qualification);
3312
0
        break;
3313
0
3314
0
    case EXIT_REASON_MSR_LOADING:
3315
0
    {
3316
0
        unsigned long idx = exit_qualification - 1;
3317
0
        const struct vmx_msr_entry *msr;
3318
0
3319
0
        printk("MSR loading (entry %lu)\n", idx);
3320
0
3321
0
        if ( idx >= (PAGE_SIZE / sizeof(*msr)) )
3322
0
            printk("  Entry out of range\n");
3323
0
        else
3324
0
        {
3325
0
            msr = &curr->arch.hvm_vmx.msr_area[idx];
3326
0
3327
0
            printk("  msr %08x val %016"PRIx64" (mbz %#x)\n",
3328
0
                   msr->index, msr->data, msr->mbz);
3329
0
        }
3330
0
        break;
3331
0
    }
3332
0
3333
0
    case EXIT_REASON_MCE_DURING_VMENTRY:
3334
0
        printk("MCE\n");
3335
0
        HVMTRACE_0D(MCE);
3336
0
        /* Already handled. */
3337
0
        break;
3338
0
3339
0
    default:
3340
0
        printk("Unknown\n");
3341
0
        break;
3342
0
    }
3343
0
3344
0
    printk("************* VMCS Area **************\n");
3345
0
    vmcs_dump_vcpu(curr);
3346
0
    printk("**************************************\n");
3347
0
3348
0
    domain_crash(curr->domain);
3349
0
}
3350
3351
void vmx_enter_realmode(struct cpu_user_regs *regs)
3352
0
{
3353
0
    struct vcpu *v = current;
3354
0
3355
0
    /* Adjust RFLAGS to enter virtual 8086 mode with IOPL == 3.  Since
3356
0
     * we have CR4.VME == 1 and our own TSS with an empty interrupt
3357
0
     * redirection bitmap, all software INTs will be handled by vm86 */
3358
0
    v->arch.hvm_vmx.vm86_saved_eflags = regs->eflags;
3359
0
    regs->eflags |= (X86_EFLAGS_VM | X86_EFLAGS_IOPL);
3360
0
}
3361
3362
static int vmx_handle_eoi_write(void)
3363
0
{
3364
0
    unsigned long exit_qualification;
3365
0
3366
0
    /*
3367
0
     * 1. Must be a linear access data write.
3368
0
     * 2. Data write must be to the EOI register.
3369
0
     */
3370
0
    __vmread(EXIT_QUALIFICATION, &exit_qualification);
3371
0
    if ( (((exit_qualification >> 12) & 0xf) == 1) &&
3372
0
         ((exit_qualification & 0xfff) == APIC_EOI) )
3373
0
    {
3374
0
        update_guest_eip(); /* Safe: APIC data write */
3375
0
        vlapic_EOI_set(vcpu_vlapic(current));
3376
0
        HVMTRACE_0D(VLAPIC);
3377
0
        return 1;
3378
0
    }
3379
0
3380
0
    return 0;
3381
0
}
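For an APIC-access exit the qualification encodes the page offset in bits 11:0 and the access type in bits 15:12, with type 1 meaning a linear data write; that is how vmx_handle_eoi_write() above recognises an EOI write. A sketch of the same check, assuming the architectural EOI register offset 0xb0:

#include <stdbool.h>
#include <stdint.h>

#define APIC_ACCESS_TYPE(q)   (((q) >> 12) & 0xf)  /* bits 15:12 */
#define APIC_ACCESS_OFFSET(q) ((q) & 0xfff)        /* bits 11:0  */
#define APIC_EOI_OFFSET       0xb0                 /* architectural EOI register offset */

/* True when the exit was a plain linear data write to the EOI register. */
static bool is_eoi_write(uint64_t exit_qualification)
{
    return APIC_ACCESS_TYPE(exit_qualification) == 1 &&
           APIC_ACCESS_OFFSET(exit_qualification) == APIC_EOI_OFFSET;
}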
3382
3383
/*
3384
 * Propagate VM_EXIT_INTR_INFO to VM_ENTRY_INTR_INFO.  Used to mirror an
3385
 * intercepted exception back to the guest as if Xen hadn't intercepted it.
3386
 *
3387
 * It is the caller's responsibility to ensure that this function is only used
3388
 * in the context of an appropriate vmexit.
3389
 */
3390
static void vmx_propagate_intr(unsigned long intr)
3391
0
{
3392
0
    struct x86_event event = {
3393
0
        .vector = MASK_EXTR(intr, INTR_INFO_VECTOR_MASK),
3394
0
        .type = MASK_EXTR(intr, INTR_INFO_INTR_TYPE_MASK),
3395
0
    };
3396
0
    unsigned long tmp;
3397
0
3398
0
    if ( intr & INTR_INFO_DELIVER_CODE_MASK )
3399
0
    {
3400
0
        __vmread(VM_EXIT_INTR_ERROR_CODE, &tmp);
3401
0
        event.error_code = tmp;
3402
0
    }
3403
0
    else
3404
0
        event.error_code = X86_EVENT_NO_EC;
3405
0
3406
0
    if ( event.type >= X86_EVENTTYPE_SW_INTERRUPT )
3407
0
    {
3408
0
        __vmread(VM_EXIT_INSTRUCTION_LEN, &tmp);
3409
0
        event.insn_len = tmp;
3410
0
    }
3411
0
    else
3412
0
        event.insn_len = 0;
3413
0
3414
0
    hvm_inject_event(&event);
3415
0
}
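The interruption-information word picked apart in vmx_propagate_intr() above has a fixed architectural layout: vector in bits 7:0, event type in bits 10:8, an error-code-valid flag in bit 11 and a valid flag in bit 31. A standalone decoder over that layout (the same format is used for IDT_VECTORING_INFO below):

#include <stdbool.h>
#include <stdint.h>

struct intr_info_fields {
    uint8_t vector;   /* bits 7:0  */
    uint8_t type;     /* bits 10:8 */
    bool    has_ec;   /* bit 11: error code delivered */
    bool    valid;    /* bit 31 */
};

static struct intr_info_fields decode_intr_info(uint32_t raw)
{
    struct intr_info_fields i = {
        .vector = raw & 0xff,
        .type   = (raw >> 8) & 0x7,
        .has_ec = raw & (1u << 11),
        .valid  = raw & (1u << 31),
    };

    return i;
}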
3416
3417
static void vmx_idtv_reinject(unsigned long idtv_info)
3418
5.08M
{
3419
5.08M
3420
5.08M
    /* Event delivery caused this intercept? Queue for redelivery. */
3421
5.08M
    if ( unlikely(idtv_info & INTR_INFO_VALID_MASK) )
3422
0
    {
3423
0
        if ( hvm_event_needs_reinjection(MASK_EXTR(idtv_info,
3424
0
                                                   INTR_INFO_INTR_TYPE_MASK),
3425
0
                                         idtv_info & INTR_INFO_VECTOR_MASK) )
3426
0
        {
3427
0
            /* See SDM 3B 25.7.1.1 and .2 for info about masking resvd bits. */
3428
0
            __vmwrite(VM_ENTRY_INTR_INFO,
3429
0
                      idtv_info & ~INTR_INFO_RESVD_BITS_MASK);
3430
0
            if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK )
3431
0
            {
3432
0
                unsigned long ec;
3433
0
3434
0
                __vmread(IDT_VECTORING_ERROR_CODE, &ec);
3435
0
                __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, ec);
3436
0
            }
3437
0
        }
3438
0
3439
0
        /*
3440
0
         * Clear NMI-blocking interruptibility info if an NMI delivery faulted.
3441
0
         * Re-delivery will re-set it (see SDM 3B 25.7.1.2).
3442
0
         */
3443
0
        if ( cpu_has_vmx_vnmi &&
3444
0
             ((idtv_info & INTR_INFO_INTR_TYPE_MASK) ==
3445
0
              MASK_INSR(X86_EVENTTYPE_NMI, INTR_INFO_INTR_TYPE_MASK)) )
3446
0
        {
3447
0
            unsigned long intr_info;
3448
0
3449
0
            __vmread(GUEST_INTERRUPTIBILITY_INFO, &intr_info);
3450
0
            __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
3451
0
                      intr_info & ~VMX_INTR_SHADOW_NMI);
3452
0
        }
3453
0
    }
3454
5.08M
}
3455
3456
static void vmx_handle_xsaves(void)
3457
0
{
3458
0
    gdprintk(XENLOG_ERR, "xsaves should not cause vmexit\n");
3459
0
    domain_crash(current->domain);
3460
0
}
3461
3462
static void vmx_handle_xrstors(void)
3463
0
{
3464
0
    gdprintk(XENLOG_ERR, "xrstors should not cause vmexit\n");
3465
0
    domain_crash(current->domain);
3466
0
}
3467
3468
static void vmx_handle_descriptor_access(uint32_t exit_reason)
3469
0
{
3470
0
    uint64_t instr_info;
3471
0
    uint64_t exit_qualification;
3472
0
    unsigned int desc;
3473
0
3474
0
    __vmread(EXIT_QUALIFICATION, &exit_qualification);
3475
0
    __vmread(VMX_INSTRUCTION_INFO, &instr_info);
3476
0
3477
0
    if ( exit_reason == EXIT_REASON_ACCESS_GDTR_OR_IDTR )
3478
0
    {
3479
0
        idt_or_gdt_instr_info_t info;
3480
0
        info.raw = instr_info;
3481
0
        desc = info.instr_identity ? VM_EVENT_DESC_IDTR : VM_EVENT_DESC_GDTR;
3482
0
        hvm_descriptor_access_intercept(info.raw, exit_qualification, desc,
3483
0
                                        info.instr_write);
3484
0
    }
3485
0
    else
3486
0
    {
3487
0
        ldt_or_tr_instr_info_t info;
3488
0
        info.raw = instr_info;
3489
0
        desc = info.instr_identity ? VM_EVENT_DESC_TR : VM_EVENT_DESC_LDTR;
3490
0
        hvm_descriptor_access_intercept(info.raw, exit_qualification, desc,
3491
0
                                        info.instr_write);
3492
0
    }
3493
0
}
3494
3495
static int vmx_handle_apic_write(void)
3496
0
{
3497
0
    unsigned long exit_qualification;
3498
0
3499
0
    ASSERT(cpu_has_vmx_apic_reg_virt);
3500
0
    __vmread(EXIT_QUALIFICATION, &exit_qualification);
3501
0
3502
0
    return vlapic_apicv_write(current, exit_qualification & 0xfff);
3503
0
}
3504
3505
void vmx_vmexit_handler(struct cpu_user_regs *regs)
3506
5.20M
{
3507
5.20M
    unsigned long exit_qualification, exit_reason, idtv_info, intr_info = 0;
3508
5.20M
    unsigned int vector = 0, mode;
3509
5.20M
    struct vcpu *v = current;
3510
5.20M
3511
5.20M
    __vmread(GUEST_RIP,    &regs->rip);
3512
5.20M
    __vmread(GUEST_RSP,    &regs->rsp);
3513
5.20M
    __vmread(GUEST_RFLAGS, &regs->rflags);
3514
5.20M
3515
5.20M
    hvm_invalidate_regs_fields(regs);
3516
5.20M
3517
5.20M
    if ( paging_mode_hap(v->domain) )
3518
5.18M
    {
3519
5.18M
        __vmread(GUEST_CR3, &v->arch.hvm_vcpu.hw_cr[3]);
3520
5.18M
        if ( vmx_unrestricted_guest(v) || hvm_paging_enabled(v) )
3521
5.17M
            v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3];
3522
5.18M
    }
3523
5.20M
3524
5.20M
    __vmread(VM_EXIT_REASON, &exit_reason);
3525
5.20M
3526
5.20M
    if ( hvm_long_mode_active(v) )
3527
5.17M
        HVMTRACE_ND(VMEXIT64, 0, 1/*cycles*/, 3, exit_reason,
3528
5.20M
                    regs->eip, regs->rip >> 32, 0, 0, 0);
3529
5.20M
    else
3530
30.4k
        HVMTRACE_ND(VMEXIT, 0, 1/*cycles*/, 2, exit_reason,
3531
5.20M
                    regs->eip, 0, 0, 0, 0);
3532
5.20M
3533
5.20M
    perfc_incra(vmexits, exit_reason);
3534
5.20M
3535
5.20M
    /* Handle the interrupt we missed before allowing any more in. */
3536
5.20M
    switch ( (uint16_t)exit_reason )
3537
5.20M
    {
3538
17.8k
    case EXIT_REASON_EXTERNAL_INTERRUPT:
3539
17.8k
        vmx_do_extint(regs);
3540
17.8k
        break;
3541
6
    case EXIT_REASON_EXCEPTION_NMI:
3542
6
        __vmread(VM_EXIT_INTR_INFO, &intr_info);
3543
6
        BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
3544
6
        vector = intr_info & INTR_INFO_VECTOR_MASK;
3545
6
        if ( vector == TRAP_machine_check )
3546
0
            do_machine_check(regs);
3547
6
        if ( (vector == TRAP_nmi) &&
3548
0
             ((intr_info & INTR_INFO_INTR_TYPE_MASK) ==
3549
0
              MASK_INSR(X86_EVENTTYPE_NMI, INTR_INFO_INTR_TYPE_MASK)) )
3550
0
        {
3551
0
            exception_table[TRAP_nmi](regs);
3552
0
            enable_nmis();
3553
0
        }
3554
6
        break;
3555
0
    case EXIT_REASON_MCE_DURING_VMENTRY:
3556
0
        do_machine_check(regs);
3557
0
        break;
3558
5.20M
    }
3559
5.20M
3560
5.20M
    /* Now enable interrupts so it's safe to take locks. */
3561
5.18M
    local_irq_enable();
3562
5.18M
3563
5.18M
    /*
3564
5.18M
     * If the guest has the ability to switch EPTP without an exit,
3565
5.18M
     * figure out whether it has done so and update the altp2m data.
3566
5.18M
     */
3567
5.18M
    if ( altp2m_active(v->domain) &&
3568
0
        (v->arch.hvm_vmx.secondary_exec_control &
3569
0
        SECONDARY_EXEC_ENABLE_VM_FUNCTIONS) )
3570
0
    {
3571
0
        unsigned long idx;
3572
0
3573
0
        if ( v->arch.hvm_vmx.secondary_exec_control &
3574
0
            SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS )
3575
0
            __vmread(EPTP_INDEX, &idx);
3576
0
        else
3577
0
        {
3578
0
            unsigned long eptp;
3579
0
3580
0
            __vmread(EPT_POINTER, &eptp);
3581
0
3582
0
            if ( (idx = p2m_find_altp2m_by_eptp(v->domain, eptp)) ==
3583
0
                 INVALID_ALTP2M )
3584
0
            {
3585
0
                gdprintk(XENLOG_ERR, "EPTP not found in alternate p2m list\n");
3586
0
                domain_crash(v->domain);
3587
0
            }
3588
0
        }
3589
0
3590
0
        if ( idx != vcpu_altp2m(v).p2midx )
3591
0
        {
3592
0
            BUG_ON(idx >= MAX_ALTP2M);
3593
0
            atomic_dec(&p2m_get_altp2m(v)->active_vcpus);
3594
0
            vcpu_altp2m(v).p2midx = idx;
3595
0
            atomic_inc(&p2m_get_altp2m(v)->active_vcpus);
3596
0
        }
3597
0
    }
3598
5.18M
3599
5.18M
    /* XXX: This looks ugly, but we need a mechanism to ensure
3600
5.18M
     * any pending vmresume has really happened
3601
5.18M
     */
3602
5.18M
    vcpu_nestedhvm(v).nv_vmswitch_in_progress = 0;
3603
5.18M
    if ( nestedhvm_vcpu_in_guestmode(v) )
3604
0
    {
3605
0
        paging_update_nestedmode(v);
3606
0
        if ( nvmx_n2_vmexit_handler(regs, exit_reason) )
3607
0
            goto out;
3608
0
    }
3609
5.18M
3610
5.18M
    if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
3611
0
        return vmx_failed_vmentry(exit_reason, regs);
3612
5.18M
3613
5.18M
    if ( v->arch.hvm_vmx.vmx_realmode )
3614
0
    {
3615
0
        /* Put RFLAGS back the way the guest wants it */
3616
0
        regs->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IOPL);
3617
0
        regs->eflags |= (v->arch.hvm_vmx.vm86_saved_eflags & X86_EFLAGS_IOPL);
3618
0
3619
0
        /* Unless this exit was for an interrupt, we've hit something
3620
0
         * vm86 can't handle.  Try again, using the emulator. */
3621
0
        switch ( exit_reason )
3622
0
        {
3623
0
        case EXIT_REASON_EXCEPTION_NMI:
3624
0
            if ( vector != TRAP_page_fault
3625
0
                 && vector != TRAP_nmi 
3626
0
                 && vector != TRAP_machine_check ) 
3627
0
            {
3628
0
        default:
3629
0
                perfc_incr(realmode_exits);
3630
0
                v->arch.hvm_vmx.vmx_emulate = 1;
3631
0
                HVMTRACE_0D(REALMODE_EMULATE);
3632
0
                return;
3633
0
            }
3634
0
        case EXIT_REASON_EXTERNAL_INTERRUPT:
3635
0
        case EXIT_REASON_INIT:
3636
0
        case EXIT_REASON_SIPI:
3637
0
        case EXIT_REASON_PENDING_VIRT_INTR:
3638
0
        case EXIT_REASON_PENDING_VIRT_NMI:
3639
0
        case EXIT_REASON_MCE_DURING_VMENTRY:
3640
0
        case EXIT_REASON_GETSEC:
3641
0
        case EXIT_REASON_ACCESS_GDTR_OR_IDTR:
3642
0
        case EXIT_REASON_ACCESS_LDTR_OR_TR:
3643
0
        case EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED:
3644
0
        case EXIT_REASON_INVEPT:
3645
0
        case EXIT_REASON_INVVPID:
3646
0
            break;
3647
0
        }
3648
0
    }
3649
5.18M
3650
5.18M
    hvm_maybe_deassert_evtchn_irq();
3651
5.18M
3652
5.18M
    __vmread(IDT_VECTORING_INFO, &idtv_info);
3653
5.18M
    if ( exit_reason != EXIT_REASON_TASK_SWITCH )
3654
5.08M
        vmx_idtv_reinject(idtv_info);
3655
5.18M
3656
5.18M
    switch ( exit_reason )
3657
5.18M
    {
3658
0
        unsigned long ecode;
3659
0
3660
6
    case EXIT_REASON_EXCEPTION_NMI:
3661
6
    {
3662
6
        /*
3663
6
         * We don't set the software-interrupt exiting (INT n).
3664
6
         * (1) We can get an exception (e.g. #PF) in the guest, or
3665
6
         * (2) NMI
3666
6
         */
3667
6
3668
6
        /*
3669
6
         * Re-set the NMI shadow if vmexit caused by a guest IRET fault (see 3B
3670
6
         * 25.7.1.2, "Resuming Guest Software after Handling an Exception").
3671
6
         * (NB. If we emulate this IRET for any reason, we should re-clear!)
3672
6
         */
3673
6
        if ( unlikely(intr_info & INTR_INFO_NMI_UNBLOCKED_BY_IRET) &&
3674
0
             !(idtv_info & INTR_INFO_VALID_MASK) &&
3675
0
             (vector != TRAP_double_fault) )
3676
0
        {
3677
0
            unsigned long guest_info;
3678
0
3679
0
            __vmread(GUEST_INTERRUPTIBILITY_INFO, &guest_info);
3680
0
            __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
3681
0
                      guest_info | VMX_INTR_SHADOW_NMI);
3682
0
        }
3683
6
3684
6
        perfc_incra(cause_vector, vector);
3685
6
3686
6
        switch ( vector )
3687
6
        {
3688
0
        case TRAP_debug:
3689
0
            /*
3690
0
             * Updates DR6 where debugger can peek (See 3B 23.2.1,
3691
0
             * Table 23-1, "Exit Qualification for Debug Exceptions").
3692
0
             */
3693
0
            __vmread(EXIT_QUALIFICATION, &exit_qualification);
3694
0
            HVMTRACE_1D(TRAP_DEBUG, exit_qualification);
3695
0
            write_debugreg(6, exit_qualification | DR_STATUS_RESERVED_ONE);
3696
0
            if ( !v->domain->debugger_attached )
3697
0
            {
3698
0
                unsigned long insn_len = 0;
3699
0
                int rc;
3700
0
                unsigned long trap_type = MASK_EXTR(intr_info,
3701
0
                                                    INTR_INFO_INTR_TYPE_MASK);
3702
0
3703
0
                if ( trap_type >= X86_EVENTTYPE_SW_INTERRUPT )
3704
0
                    __vmread(VM_EXIT_INSTRUCTION_LEN, &insn_len);
3705
0
3706
0
                rc = hvm_monitor_debug(regs->rip,
3707
0
                                       HVM_MONITOR_DEBUG_EXCEPTION,
3708
0
                                       trap_type, insn_len);
3709
0
3710
0
                /*
3711
0
                 * rc < 0 error in monitor/vm_event, crash
3712
0
                 * !rc    continue normally
3713
0
                 * rc > 0 paused waiting for response, work here is done
3714
0
                 */
3715
0
                if ( rc < 0 )
3716
0
                    goto exit_and_crash;
3717
0
                if ( !rc )
3718
0
                    vmx_propagate_intr(intr_info);
3719
0
            }
3720
0
            else
3721
0
                domain_pause_for_debugger();
3722
0
            break;
3723
0
        case TRAP_int3:
3724
0
            HVMTRACE_1D(TRAP, vector);
3725
0
            if ( !v->domain->debugger_attached )
3726
0
            {
3727
0
                unsigned long insn_len;
3728
0
                int rc;
3729
0
3730
0
                __vmread(VM_EXIT_INSTRUCTION_LEN, &insn_len);
3731
0
                rc = hvm_monitor_debug(regs->rip,
3732
0
                                       HVM_MONITOR_SOFTWARE_BREAKPOINT,
3733
0
                                       X86_EVENTTYPE_SW_EXCEPTION,
3734
0
                                       insn_len);
3735
0
3736
0
                if ( rc < 0 )
3737
0
                    goto exit_and_crash;
3738
0
                if ( !rc )
3739
0
                    vmx_propagate_intr(intr_info);
3740
0
            }
3741
0
            else
3742
0
            {
3743
0
                update_guest_eip(); /* Safe: INT3 */
3744
0
                v->arch.gdbsx_vcpu_event = TRAP_int3;
3745
0
                domain_pause_for_debugger();
3746
0
            }
3747
0
            break;
3748
6
        case TRAP_no_device:
3749
6
            HVMTRACE_1D(TRAP, vector);
3750
6
            vmx_fpu_dirty_intercept();
3751
6
            break;
3752
0
        case TRAP_page_fault:
3753
0
            __vmread(EXIT_QUALIFICATION, &exit_qualification);
3754
0
            __vmread(VM_EXIT_INTR_ERROR_CODE, &ecode);
3755
0
            regs->error_code = ecode;
3756
0
3757
0
            HVM_DBG_LOG(DBG_LEVEL_VMMU,
3758
0
                        "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
3759
0
                        regs->rax, regs->rbx, regs->rcx,
3760
0
                        regs->rdx, regs->rsi, regs->rdi);
3761
0
3762
0
            if ( paging_fault(exit_qualification, regs) )
3763
0
            {
3764
0
                if ( trace_will_trace_event(TRC_SHADOW) )
3765
0
                    break;
3766
0
                if ( hvm_long_mode_active(v) )
3767
0
                    HVMTRACE_LONG_2D(PF_XEN, regs->error_code,
3768
0
                                     TRC_PAR_LONG(exit_qualification) );
3769
0
                else
3770
0
                    HVMTRACE_2D(PF_XEN,
3771
0
                                regs->error_code, exit_qualification );
3772
0
                break;
3773
0
            }
3774
0
3775
0
            hvm_inject_page_fault(regs->error_code, exit_qualification);
3776
0
            break;
3777
0
        case TRAP_alignment_check:
3778
0
            HVMTRACE_1D(TRAP, vector);
3779
0
            vmx_propagate_intr(intr_info);
3780
0
            break;
3781
0
        case TRAP_nmi:
3782
0
            if ( MASK_EXTR(intr_info, INTR_INFO_INTR_TYPE_MASK) !=
3783
0
                 X86_EVENTTYPE_NMI )
3784
0
                goto exit_and_crash;
3785
0
            HVMTRACE_0D(NMI);
3786
0
            /* Already handled above. */
3787
0
            break;
3788
0
        case TRAP_machine_check:
3789
0
            HVMTRACE_0D(MCE);
3790
0
            /* Already handled above. */
3791
0
            break;
3792
0
        case TRAP_invalid_op:
3793
0
            HVMTRACE_1D(TRAP, vector);
3794
0
            hvm_ud_intercept(regs);
3795
0
            break;
3796
0
        default:
3797
0
            HVMTRACE_1D(TRAP, vector);
3798
0
            goto exit_and_crash;
3799
6
        }
3800
6
        break;
3801
6
    }
3802
17.7k
    case EXIT_REASON_EXTERNAL_INTERRUPT:
3803
17.7k
        /* Already handled above. */
3804
17.7k
        break;
3805
0
    case EXIT_REASON_TRIPLE_FAULT:
3806
0
        hvm_triple_fault();
3807
0
        break;
3808
93.3k
    case EXIT_REASON_PENDING_VIRT_INTR:
3809
93.3k
        /* Disable the interrupt window. */
3810
93.3k
        v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
3811
93.3k
        vmx_update_cpu_exec_control(v);
3812
93.3k
        break;
3813
0
    case EXIT_REASON_PENDING_VIRT_NMI:
3814
0
        /* Disable the NMI window. */
3815
0
        v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
3816
0
        vmx_update_cpu_exec_control(v);
3817
0
        break;
3818
0
    case EXIT_REASON_TASK_SWITCH: {
3819
0
        static const enum hvm_task_switch_reason reasons[] = {
3820
0
            TSW_call_or_int, TSW_iret, TSW_jmp, TSW_call_or_int
3821
0
        };
3822
0
        unsigned int inst_len, source;
3823
0
3824
0
        __vmread(EXIT_QUALIFICATION, &exit_qualification);
3825
0
        source = (exit_qualification >> 30) & 3;
3826
0
        /* Vectored event should fill in interrupt information. */
3827
0
        WARN_ON((source == 3) && !(idtv_info & INTR_INFO_VALID_MASK));
3828
0
        /*
3829
0
         * In the following cases there is an instruction to skip over:
3830
0
         *  - TSW is due to a CALL, IRET or JMP instruction.
3831
0
         *  - TSW is a vectored event due to a SW exception or SW interrupt.
3832
0
         */
3833
0
        inst_len = ((source != 3) ||        /* CALL, IRET, or JMP? */
3834
0
                    (MASK_EXTR(idtv_info, INTR_INFO_INTR_TYPE_MASK)
3835
0
                     > 3)) /* IntrType > 3? */
3836
0
            ? get_instruction_length() /* Safe: SDM 3B 23.2.4 */ : 0;
3837
0
        if ( (source == 3) && (idtv_info & INTR_INFO_DELIVER_CODE_MASK) )
3838
0
            __vmread(IDT_VECTORING_ERROR_CODE, &ecode);
3839
0
        else
3840
0
             ecode = -1;
3841
0
        regs->rip += inst_len;
3842
0
        hvm_task_switch((uint16_t)exit_qualification, reasons[source], ecode);
3843
0
        break;
3844
6
    }
3845
2.66k
    case EXIT_REASON_CPUID:
3846
2.66k
    {
3847
2.66k
        int rc = vmx_do_cpuid(regs);
3848
2.66k
3849
2.66k
        /*
3850
2.66k
         * rc < 0 error in monitor/vm_event, crash
3851
2.66k
         * !rc    continue normally
3852
2.66k
         * rc > 0 paused waiting for response, work here is done
3853
2.66k
         */
3854
2.66k
        if ( rc < 0 )
3855
0
            goto exit_and_crash;
3856
2.66k
        if ( !rc )
3857
2.66k
            update_guest_eip(); /* Safe: CPUID */
3858
2.66k
        break;
3859
2.66k
    }
3860
65.4k
    case EXIT_REASON_HLT:
3861
65.4k
        update_guest_eip(); /* Safe: HLT */
3862
65.4k
        hvm_hlt(regs->eflags);
3863
65.4k
        break;
3864
0
    case EXIT_REASON_INVLPG:
3865
0
        update_guest_eip(); /* Safe: INVLPG */
3866
0
        __vmread(EXIT_QUALIFICATION, &exit_qualification);
3867
0
        vmx_invlpg_intercept(exit_qualification);
3868
0
        break;
3869
0
    case EXIT_REASON_RDTSCP:
3870
0
        regs->rcx = hvm_msr_tsc_aux(v);
3871
0
        /* fall through */
3872
0
    case EXIT_REASON_RDTSC:
3873
0
        update_guest_eip(); /* Safe: RDTSC, RDTSCP */
3874
0
        hvm_rdtsc_intercept(regs);
3875
0
        break;
3876
0
3877
304k
    case EXIT_REASON_VMCALL:
3878
304k
        HVMTRACE_1D(VMMCALL, regs->eax);
3879
304k
3880
304k
        if ( hvm_hypercall(regs) == HVM_HCALL_completed )
3881
304k
            update_guest_eip(); /* Safe: VMCALL */
3882
304k
        break;
3883
0
3884
7.76k
    case EXIT_REASON_CR_ACCESS:
3885
7.76k
    {
3886
7.76k
        __vmread(EXIT_QUALIFICATION, &exit_qualification);
3887
7.76k
        if ( vmx_cr_access(exit_qualification) == X86EMUL_OKAY )
3888
7.76k
            update_guest_eip(); /* Safe: MOV Cn, LMSW, CLTS */
3889
7.76k
        break;
3890
0
    }
3891
0
    case EXIT_REASON_DR_ACCESS:
3892
0
        __vmread(EXIT_QUALIFICATION, &exit_qualification);
3893
0
        vmx_dr_access(exit_qualification, regs);
3894
0
        break;
3895
271
    case EXIT_REASON_MSR_READ:
3896
271
    {
3897
271
        uint64_t msr_content = 0;
3898
271
3899
271
        switch ( hvm_msr_read_intercept(regs->ecx, &msr_content) )
3900
271
        {
3901
270
        case X86EMUL_OKAY:
3902
270
            msr_split(regs, msr_content);
3903
270
            update_guest_eip(); /* Safe: RDMSR */
3904
270
            break;
3905
270
3906
0
        case X86EMUL_EXCEPTION:
3907
0
            hvm_inject_hw_exception(TRAP_gp_fault, 0);
3908
0
            break;
3909
271
        }
3910
271
        break;
3911
271
    }
3912
271
3913
563
    case EXIT_REASON_MSR_WRITE:
3914
563
        switch ( hvm_msr_write_intercept(regs->ecx, msr_fold(regs), 1) )
3915
563
        {
3916
564
        case X86EMUL_OKAY:
3917
564
            update_guest_eip(); /* Safe: WRMSR */
3918
564
            break;
3919
564
3920
0
        case X86EMUL_EXCEPTION:
3921
0
            hvm_inject_hw_exception(TRAP_gp_fault, 0);
3922
0
            break;
3923
563
        }
3924
563
        break;
3925
563
3926
0
    case EXIT_REASON_VMXOFF:
3927
0
        if ( nvmx_handle_vmxoff(regs) == X86EMUL_OKAY )
3928
0
            update_guest_eip();
3929
0
        break;
3930
563
3931
0
    case EXIT_REASON_VMXON:
3932
0
        if ( nvmx_handle_vmxon(regs) == X86EMUL_OKAY )
3933
0
            update_guest_eip();
3934
0
        break;
3935
563
3936
0
    case EXIT_REASON_VMCLEAR:
3937
0
        if ( nvmx_handle_vmclear(regs) == X86EMUL_OKAY )
3938
0
            update_guest_eip();
3939
0
        break;
3940
563
 
3941
0
    case EXIT_REASON_VMPTRLD:
3942
0
        if ( nvmx_handle_vmptrld(regs) == X86EMUL_OKAY )
3943
0
            update_guest_eip();
3944
0
        break;
3945
563
3946
0
    case EXIT_REASON_VMPTRST:
3947
0
        if ( nvmx_handle_vmptrst(regs) == X86EMUL_OKAY )
3948
0
            update_guest_eip();
3949
0
        break;
3950
563
3951
0
    case EXIT_REASON_VMREAD:
3952
0
        if ( nvmx_handle_vmread(regs) == X86EMUL_OKAY )
3953
0
            update_guest_eip();
3954
0
        break;
3955
563
 
3956
0
    case EXIT_REASON_VMWRITE:
3957
0
        if ( nvmx_handle_vmwrite(regs) == X86EMUL_OKAY )
3958
0
            update_guest_eip();
3959
0
        break;
3960
563
3961
0
    case EXIT_REASON_VMLAUNCH:
3962
0
        if ( nvmx_handle_vmlaunch(regs) == X86EMUL_OKAY )
3963
0
            update_guest_eip();
3964
0
        break;
3965
563
3966
0
    case EXIT_REASON_VMRESUME:
3967
0
        if ( nvmx_handle_vmresume(regs) == X86EMUL_OKAY )
3968
0
            update_guest_eip();
3969
0
        break;
3970
563
3971
0
    case EXIT_REASON_INVEPT:
3972
0
        if ( nvmx_handle_invept(regs) == X86EMUL_OKAY )
3973
0
            update_guest_eip();
3974
0
        break;
3975
563
3976
0
    case EXIT_REASON_INVVPID:
3977
0
        if ( nvmx_handle_invvpid(regs) == X86EMUL_OKAY )
3978
0
            update_guest_eip();
3979
0
        break;
3980
563
3981
0
    case EXIT_REASON_VMFUNC:
3982
0
        if ( vmx_vmfunc_intercept(regs) != X86EMUL_OKAY )
3983
0
            hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
3984
0
        else
3985
0
            update_guest_eip();
3986
0
        break;
3987
563
3988
0
    case EXIT_REASON_MWAIT_INSTRUCTION:
3989
0
    case EXIT_REASON_MONITOR_INSTRUCTION:
3990
0
    case EXIT_REASON_GETSEC:
3991
0
        /*
3992
0
         * We should never exit on GETSEC because CR4.SMXE is always 0 when
3993
0
         * running in guest context, and the CPU checks that before getting
3994
0
         * as far as vmexit.
3995
0
         */
3996
0
        WARN_ON(exit_reason == EXIT_REASON_GETSEC);
3997
0
        hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
3998
0
        break;
3999
0
4000
0
    case EXIT_REASON_TPR_BELOW_THRESHOLD:
4001
0
        break;
4002
0
4003
0
    case EXIT_REASON_APIC_ACCESS:
4004
0
        if ( !vmx_handle_eoi_write() && !handle_mmio() )
4005
0
            hvm_inject_hw_exception(TRAP_gp_fault, 0);
4006
0
        break;
4007
0
4008
296
    case EXIT_REASON_EOI_INDUCED:
4009
296
        __vmread(EXIT_QUALIFICATION, &exit_qualification);
4010
296
4011
296
        ASSERT(cpu_has_vmx_virtual_intr_delivery);
4012
296
4013
296
        vlapic_handle_EOI(vcpu_vlapic(v), exit_qualification);
4014
296
        break;
4015
0
4016
20.1k
    case EXIT_REASON_IO_INSTRUCTION:
4017
20.1k
        __vmread(EXIT_QUALIFICATION, &exit_qualification);
4018
20.1k
        if ( exit_qualification & 0x10 )
4019
0
        {
4020
0
            /* INS, OUTS */
4021
0
            if ( !hvm_emulate_one_insn(x86_insn_is_portio, "port I/O") )
4022
0
                hvm_inject_hw_exception(TRAP_gp_fault, 0);
4023
0
        }
4024
20.1k
        else
4025
20.1k
        {
4026
20.1k
            /* IN, OUT */
4027
20.1k
            uint16_t port = (exit_qualification >> 16) & 0xFFFF;
4028
20.1k
            int bytes = (exit_qualification & 0x07) + 1;
4029
20.0k
            int dir = (exit_qualification & 0x08) ? IOREQ_READ : IOREQ_WRITE;
4030
20.1k
            if ( handle_pio(port, bytes, dir) )
4031
20.1k
                update_guest_eip(); /* Safe: IN, OUT */
4032
20.1k
        }
4033
20.1k
        break;
4034
0
4035
44
    case EXIT_REASON_INVD:
4036
44
    case EXIT_REASON_WBINVD:
4037
44
    {
4038
44
        update_guest_eip(); /* Safe: INVD, WBINVD */
4039
44
        vmx_wbinvd_intercept();
4040
44
        break;
4041
44
    }
4042
44
4043
60.1k
    case EXIT_REASON_EPT_VIOLATION:
4044
60.1k
    {
4045
60.1k
        paddr_t gpa;
4046
60.1k
4047
60.1k
        __vmread(GUEST_PHYSICAL_ADDRESS, &gpa);
4048
60.1k
        __vmread(EXIT_QUALIFICATION, &exit_qualification);
4049
60.1k
        ept_handle_violation(exit_qualification, gpa);
4050
60.1k
        break;
4051
44
    }
4052
44
4053
3.15k
    case EXIT_REASON_EPT_MISCONFIG:
4054
3.15k
    {
4055
3.15k
        paddr_t gpa;
4056
3.15k
4057
3.15k
        __vmread(GUEST_PHYSICAL_ADDRESS, &gpa);
4058
3.15k
        if ( !ept_handle_misconfig(gpa) )
4059
0
            goto exit_and_crash;
4060
3.15k
        break;
4061
3.15k
    }
4062
3.15k
4063
0
    case EXIT_REASON_MONITOR_TRAP_FLAG:
4064
0
        v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
4065
0
        vmx_update_cpu_exec_control(v);
4066
0
        if ( v->arch.hvm_vcpu.single_step )
4067
0
        {
4068
0
            hvm_monitor_debug(regs->rip,
4069
0
                              HVM_MONITOR_SINGLESTEP_BREAKPOINT,
4070
0
                              0, 0);
4071
0
4072
0
            if ( v->domain->debugger_attached )
4073
0
                domain_pause_for_debugger();
4074
0
        }
4075
0
4076
0
        break;
4077
3.15k
4078
4.52M
    case EXIT_REASON_PAUSE_INSTRUCTION:
4079
4.52M
        perfc_incr(pauseloop_exits);
4080
4.52M
        do_sched_op(SCHEDOP_yield, guest_handle_from_ptr(NULL, void));
4081
4.52M
        break;
4082
3.15k
4083
11
    case EXIT_REASON_XSETBV:
4084
11
        if ( hvm_handle_xsetbv(regs->ecx, msr_fold(regs)) == 0 )
4085
12
            update_guest_eip(); /* Safe: XSETBV */
4086
11
        break;
4087
3.15k
4088
0
    case EXIT_REASON_APIC_WRITE:
4089
0
        vmx_handle_apic_write();
4090
0
        break;
4091
3.15k
4092
0
    case EXIT_REASON_PML_FULL:
4093
0
        vmx_vcpu_flush_pml_buffer(v);
4094
0
        break;
4095
3.15k
4096
0
    case EXIT_REASON_XSAVES:
4097
0
        vmx_handle_xsaves();
4098
0
        break;
4099
3.15k
4100
0
    case EXIT_REASON_XRSTORS:
4101
0
        vmx_handle_xrstors();
4102
0
        break;
4103
3.15k
4104
0
    case EXIT_REASON_ACCESS_GDTR_OR_IDTR:
4105
0
    case EXIT_REASON_ACCESS_LDTR_OR_TR:
4106
0
        vmx_handle_descriptor_access(exit_reason);
4107
0
        break;
4108
0
4109
0
    case EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED:
4110
0
    case EXIT_REASON_INVPCID:
4111
0
    /* fall through */
4112
0
    default:
4113
0
    exit_and_crash:
4114
0
        gprintk(XENLOG_ERR, "Unexpected vmexit: reason %lu\n", exit_reason);
4115
0
4116
0
        if ( vmx_get_cpl() )
4117
0
            hvm_inject_hw_exception(TRAP_invalid_op,
4118
0
                                    X86_EVENT_NO_EC);
4119
0
        else
4120
0
            domain_crash(v->domain);
4121
0
        break;
4122
5.18M
    }
4123
5.18M
4124
5.51M
out:
4125
5.51M
    if ( nestedhvm_vcpu_in_guestmode(v) )
4126
0
        nvmx_idtv_handling();
4127
5.51M
4128
5.51M
    /*
4129
5.51M
     * VM entry will fail (causing the guest to get crashed) if rIP (and
4130
5.51M
     * rFLAGS, but we don't have an issue there) doesn't meet certain
4131
5.51M
     * criteria. As we must not allow less than fully privileged mode to have
4132
5.51M
     * such an effect on the domain, we correct rIP in that case (accepting
4133
5.51M
     * this not being architecturally correct behavior, as the injected #GP
4134
5.51M
     * fault will then not see the correct [invalid] return address).
4135
5.51M
     * And since we know the guest will crash, we crash it right away if it
4136
5.51M
     * already is in most privileged mode.
4137
5.51M
     */
4138
5.51M
    mode = vmx_guest_x86_mode(v);
4139
5.51M
    if ( mode == 8 ? !is_canonical_address(regs->rip)
4140
88.0k
                   : regs->rip != regs->eip )
4141
0
    {
4142
0
        gprintk(XENLOG_WARNING, "Bad rIP %lx for mode %u\n", regs->rip, mode);
4143
0
4144
0
        if ( vmx_get_cpl() )
4145
0
        {
4146
0
            __vmread(VM_ENTRY_INTR_INFO, &intr_info);
4147
0
            if ( !(intr_info & INTR_INFO_VALID_MASK) )
4148
0
                hvm_inject_hw_exception(TRAP_gp_fault, 0);
4149
0
            /* Need to fix rIP nevertheless. */
4150
0
            if ( mode == 8 )
4151
0
                regs->rip = (long)(regs->rip << (64 - VADDR_BITS)) >>
4152
0
                            (64 - VADDR_BITS);
4153
0
            else
4154
0
                regs->rip = regs->eip;
4155
0
        }
4156
0
        else
4157
0
            domain_crash(v->domain);
4158
0
    }
4159
5.51M
}
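Several cases in vmx_vmexit_handler() above decode EXIT_QUALIFICATION inline; for the IN/OUT path the architectural encoding is: bits 2:0 hold the operand size minus one, bit 3 the direction (1 == IN), bit 4 flags a string instruction, and bits 31:16 carry the port number. A standalone sketch of the non-string decode:

#include <stdbool.h>
#include <stdint.h>

struct portio {
    uint16_t     port;   /* bits 31:16 */
    unsigned int bytes;  /* bits 2:0 hold size - 1 */
    bool         in;     /* bit 3: 1 = IN (read), 0 = OUT (write) */
};

/* Decode an I/O-instruction exit qualification for plain IN/OUT (bit 4 clear). */
static struct portio decode_portio_qual(uint64_t q)
{
    struct portio p = {
        .port  = (q >> 16) & 0xffff,
        .bytes = (q & 0x7) + 1,
        .in    = q & 0x8,
    };

    return p;
}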
4160
4161
static void lbr_tsx_fixup(void)
4162
0
{
4163
0
    struct vcpu *curr = current;
4164
0
    unsigned int msr_count = curr->arch.hvm_vmx.msr_count;
4165
0
    struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
4166
0
    struct vmx_msr_entry *msr;
4167
0
4168
0
    if ( (msr = vmx_find_msr(lbr_from_start, VMX_GUEST_MSR)) != NULL )
4169
0
    {
4170
0
        /*
4171
0
         * Sign extend into bits 61:62 while preserving bit 63
4172
0
         * The loop relies on the fact that MSR array is sorted.
4173
0
         */
4174
0
        for ( ; msr < msr_area + msr_count && msr->index < lbr_from_end; msr++ )
4175
0
            msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2);
4176
0
    }
4177
0
4178
0
    if ( (msr = vmx_find_msr(lbr_lastint_from, VMX_GUEST_MSR)) != NULL )
4179
0
        msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2);
4180
0
}
4181
4182
static void sign_extend_msr(u32 msr, int type)
4183
0
{
4184
0
    struct vmx_msr_entry *entry;
4185
0
4186
0
    if ( (entry = vmx_find_msr(msr, type)) != NULL )
4187
0
    {
4188
0
        if ( entry->data & VADDR_TOP_BIT )
4189
0
            entry->data |= CANONICAL_MASK;
4190
0
        else
4191
0
            entry->data &= ~CANONICAL_MASK;
4192
0
    }
4193
0
}
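sign_extend_msr() above forces a saved MSR value back into canonical form by replicating bit 47 into the upper bits; the rIP fixup at the end of the exit handler uses the equivalent shift-pair trick. A standalone sketch, assuming 48 implemented virtual-address bits:

#include <stdint.h>

#define VADDR_BITS 48   /* assumed: 48-bit virtual addresses */

/* Replicate bit (VADDR_BITS - 1) into the top bits, i.e. make the value canonical. */
static uint64_t make_canonical(uint64_t val)
{
    return (uint64_t)((int64_t)(val << (64 - VADDR_BITS)) >> (64 - VADDR_BITS));
}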
4194
4195
static void bdw_erratum_bdf14_fixup(void)
4196
0
{
4197
0
    /*
4198
0
     * Occasionally, on certain Broadwell CPUs MSR_IA32_LASTINTTOIP has
4199
0
     * been observed to have the top three bits corrupted as though the
4200
0
     * MSR is using the LBR_FORMAT_EIP_FLAGS_TSX format. This is
4201
0
     * incorrect and causes a vmentry failure -- the MSR should contain
4202
0
     * an offset into the current code segment. This is assumed to be
4203
0
     * erratum BDF14. Fix up MSR_IA32_LASTINT{FROM,TO}IP by
4204
0
     * sign-extending into bits 48:63.
4205
0
     */
4206
0
    sign_extend_msr(MSR_IA32_LASTINTFROMIP, VMX_GUEST_MSR);
4207
0
    sign_extend_msr(MSR_IA32_LASTINTTOIP, VMX_GUEST_MSR);
4208
0
}
4209
4210
static void lbr_fixup(void)
4211
0
{
4212
0
    struct vcpu *curr = current;
4213
0
4214
0
    if ( curr->arch.hvm_vmx.lbr_fixup_enabled & FIXUP_LBR_TSX )
4215
0
        lbr_tsx_fixup();
4216
0
    if ( curr->arch.hvm_vmx.lbr_fixup_enabled & FIXUP_BDW_ERRATUM_BDF14 )
4217
0
        bdw_erratum_bdf14_fixup();
4218
0
}
4219
4220
/* Returns false if the vmentry has to be restarted */
4221
bool vmx_vmenter_helper(const struct cpu_user_regs *regs)
4222
5.11M
{
4223
5.11M
    struct vcpu *curr = current;
4224
5.11M
    u32 new_asid, old_asid;
4225
5.11M
    struct hvm_vcpu_asid *p_asid;
4226
5.11M
    bool_t need_flush;
4227
5.11M
4228
5.11M
    /* Shadow EPTP can't be updated here because irqs are disabled */
4229
5.11M
     if ( nestedhvm_vcpu_in_guestmode(curr) && vcpu_nestedhvm(curr).stale_np2m )
4230
0
         return false;
4231
5.11M
4232
5.11M
    if ( curr->domain->arch.hvm_domain.pi_ops.do_resume )
4233
0
        curr->domain->arch.hvm_domain.pi_ops.do_resume(curr);
4234
5.11M
4235
5.11M
    if ( !cpu_has_vmx_vpid )
4236
0
        goto out;
4237
5.11M
    if ( nestedhvm_vcpu_in_guestmode(curr) )
4238
0
        p_asid = &vcpu_nestedhvm(curr).nv_n2asid;
4239
5.11M
    else
4240
5.11M
        p_asid = &curr->arch.hvm_vcpu.n1asid;
4241
5.11M
4242
5.11M
    old_asid = p_asid->asid;
4243
5.11M
    need_flush = hvm_asid_handle_vmenter(p_asid);
4244
5.11M
    new_asid = p_asid->asid;
4245
5.11M
4246
5.11M
    if ( unlikely(new_asid != old_asid) )
4247
37.3k
    {
4248
37.3k
        __vmwrite(VIRTUAL_PROCESSOR_ID, new_asid);
4249
37.3k
        if ( !old_asid && new_asid )
4250
12
        {
4251
12
            /* VPID was disabled: now enabled. */
4252
12
            curr->arch.hvm_vmx.secondary_exec_control |=
4253
12
                SECONDARY_EXEC_ENABLE_VPID;
4254
12
            vmx_update_secondary_exec_control(curr);
4255
12
        }
4256
37.4k
        else if ( old_asid && !new_asid )
4257
0
        {
4258
0
            /* VPID was enabled: now disabled. */
4259
0
            curr->arch.hvm_vmx.secondary_exec_control &=
4260
0
                ~SECONDARY_EXEC_ENABLE_VPID;
4261
0
            vmx_update_secondary_exec_control(curr);
4262
0
        }
4263
37.3k
    }
4264
5.11M
4265
5.11M
    if ( unlikely(need_flush) )
4266
12
        vpid_sync_all();
4267
5.11M
4268
5.11M
    if ( paging_mode_hap(curr->domain) )
4269
5.08M
    {
4270
5.08M
        struct ept_data *ept = &p2m_get_hostp2m(curr->domain)->ept;
4271
5.08M
        unsigned int cpu = smp_processor_id();
4272
5.08M
4273
5.08M
        if ( cpumask_test_cpu(cpu, ept->invalidate) )
4274
372
        {
4275
372
            cpumask_clear_cpu(cpu, ept->invalidate);
4276
372
            if ( nestedhvm_enabled(curr->domain) )
4277
0
                __invept(INVEPT_ALL_CONTEXT, 0, 0);
4278
372
            else
4279
372
                __invept(INVEPT_SINGLE_CONTEXT, ept->eptp, 0);
4280
372
        }
4281
5.08M
    }
4282
5.11M
4283
5.08M
 out:
4284
5.08M
    if ( unlikely(curr->arch.hvm_vmx.lbr_fixup_enabled) )
4285
0
        lbr_fixup();
4286
5.08M
4287
5.08M
    HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
4288
5.08M
4289
5.08M
    __vmwrite(GUEST_RIP,    regs->rip);
4290
5.08M
    __vmwrite(GUEST_RSP,    regs->rsp);
4291
5.08M
    __vmwrite(GUEST_RFLAGS, regs->rflags | X86_EFLAGS_MBS);
4292
5.08M
4293
5.08M
    return true;
4294
5.11M
}
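The VPID/ASID handling in vmx_vmenter_helper() above amounts to three independent decisions: write the new VPID if it changed, toggle the VPID execution control when the ASID moves between zero and non-zero, and flush when the ASID allocator asked for it. A hedged sketch of just that decision logic, with hypothetical hooks standing in for the VMCS writes and the flush:

#include <stdbool.h>
#include <stdint.h>

struct vpid_ops {                         /* hypothetical hooks */
    void (*write_vpid)(uint32_t vpid);    /* stand-in for __vmwrite(VIRTUAL_PROCESSOR_ID, ...) */
    void (*set_vpid_enabled)(bool on);    /* stand-in for toggling SECONDARY_EXEC_ENABLE_VPID */
    void (*flush_all)(void);              /* stand-in for vpid_sync_all() */
};

static void handle_asid_change(const struct vpid_ops *ops,
                               uint32_t old_asid, uint32_t new_asid,
                               bool need_flush)
{
    if ( new_asid != old_asid )
    {
        ops->write_vpid(new_asid);

        if ( !old_asid && new_asid )        /* VPID was disabled: now enabled */
            ops->set_vpid_enabled(true);
        else if ( old_asid && !new_asid )   /* VPID was enabled: now disabled */
            ops->set_vpid_enabled(false);
    }

    if ( need_flush )
        ops->flush_all();
}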
4295
4296
/*
4297
 * Local variables:
4298
 * mode: C
4299
 * c-file-style: "BSD"
4300
 * c-basic-offset: 4
4301
 * tab-width: 4
4302
 * indent-tabs-mode: nil
4303
 * End:
4304
 */