Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/arch/x86/hvm/hvm.c
Line
Count
Source
1
/*
2
 * hvm.c: Common hardware virtual machine abstractions.
3
 *
4
 * Copyright (c) 2004, Intel Corporation.
5
 * Copyright (c) 2005, International Business Machines Corporation.
6
 * Copyright (c) 2008, Citrix Systems, Inc.
7
 * 
8
 * This program is free software; you can redistribute it and/or modify it
9
 * under the terms and conditions of the GNU General Public License,
10
 * version 2, as published by the Free Software Foundation.
11
 *
12
 * This program is distributed in the hope it will be useful, but WITHOUT
13
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
15
 * more details.
16
 *
17
 * You should have received a copy of the GNU General Public License along with
18
 * this program; If not, see <http://www.gnu.org/licenses/>.
19
 */
20
21
#include <xen/ctype.h>
22
#include <xen/init.h>
23
#include <xen/lib.h>
24
#include <xen/trace.h>
25
#include <xen/sched.h>
26
#include <xen/irq.h>
27
#include <xen/softirq.h>
28
#include <xen/domain.h>
29
#include <xen/domain_page.h>
30
#include <xen/hypercall.h>
31
#include <xen/guest_access.h>
32
#include <xen/event.h>
33
#include <xen/cpu.h>
34
#include <xen/wait.h>
35
#include <xen/mem_access.h>
36
#include <xen/rangeset.h>
37
#include <xen/monitor.h>
38
#include <xen/warning.h>
39
#include <xen/vpci.h>
40
#include <asm/shadow.h>
41
#include <asm/hap.h>
42
#include <asm/current.h>
43
#include <asm/e820.h>
44
#include <asm/io.h>
45
#include <asm/regs.h>
46
#include <asm/cpufeature.h>
47
#include <asm/processor.h>
48
#include <asm/types.h>
49
#include <asm/msr.h>
50
#include <asm/i387.h>
51
#include <asm/xstate.h>
52
#include <asm/traps.h>
53
#include <asm/mc146818rtc.h>
54
#include <asm/mce.h>
55
#include <asm/monitor.h>
56
#include <asm/hvm/hvm.h>
57
#include <asm/hvm/vpt.h>
58
#include <asm/hvm/support.h>
59
#include <asm/hvm/cacheattr.h>
60
#include <asm/hvm/trace.h>
61
#include <asm/hvm/nestedhvm.h>
62
#include <asm/hvm/monitor.h>
63
#include <asm/hvm/ioreq.h>
64
#include <asm/hvm/vm_event.h>
65
#include <asm/altp2m.h>
66
#include <asm/mtrr.h>
67
#include <asm/apic.h>
68
#include <asm/vm_event.h>
69
#include <public/sched.h>
70
#include <public/hvm/ioreq.h>
71
#include <public/version.h>
72
#include <public/memory.h>
73
#include <public/vm_event.h>
74
#include <public/arch-x86/cpuid.h>
75
#include <asm/cpuid.h>
76
77
bool_t __read_mostly hvm_enabled;
78
79
#ifdef DBG_LEVEL_0
80
unsigned int opt_hvm_debug_level __read_mostly;
81
integer_param("hvm_debug", opt_hvm_debug_level);
82
#endif
83
84
struct hvm_function_table hvm_funcs __read_mostly;
85
86
/*
87
 * The I/O permission bitmap is globally shared by all HVM guests except
88
 * the hardware domain, which needs a more permissive one.
89
 */
90
1
#define HVM_IOBITMAP_SIZE (3 * PAGE_SIZE)
91
unsigned long __section(".bss.page_aligned") __aligned(PAGE_SIZE)
92
    hvm_io_bitmap[HVM_IOBITMAP_SIZE / BYTES_PER_LONG];
93
94
/* Xen command-line option to enable HAP */
95
static bool_t __initdata opt_hap_enabled = 1;
96
boolean_param("hap", opt_hap_enabled);
97
98
#ifndef opt_hvm_fep
99
/* Permit use of the Forced Emulation Prefix in HVM guests */
100
bool_t __read_mostly opt_hvm_fep;
101
boolean_param("hvm_fep", opt_hvm_fep);
102
#endif
103
static const char __initconst warning_hvm_fep[] =
104
    "WARNING: HVM FORCED EMULATION PREFIX IS AVAILABLE\n"
105
    "This option is *ONLY* intended to aid testing of Xen.\n"
106
    "It has implications on the security of the system.\n"
107
    "Please *DO NOT* use this in production.\n";
108
109
/* Xen command-line option to enable altp2m */
110
static bool_t __initdata opt_altp2m_enabled = 0;
111
boolean_param("altp2m", opt_altp2m_enabled);
112
113
static int cpu_callback(
114
    struct notifier_block *nfb, unsigned long action, void *hcpu)
115
33
{
116
33
    unsigned int cpu = (unsigned long)hcpu;
117
33
    int rc = 0;
118
33
119
33
    switch ( action )
120
33
    {
121
11
    case CPU_UP_PREPARE:
122
11
        rc = hvm_funcs.cpu_up_prepare(cpu);
123
11
        break;
124
0
    case CPU_DYING:
125
0
        hvm_cpu_down();
126
0
        break;
127
0
    case CPU_UP_CANCELED:
128
0
    case CPU_DEAD:
129
0
        hvm_funcs.cpu_dead(cpu);
130
0
        break;
131
22
    default:
132
22
        break;
133
33
    }
134
33
135
33
    return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
136
33
}
137
138
static struct notifier_block cpu_nfb = {
139
    .notifier_call = cpu_callback
140
};
141
142
static int __init hvm_enable(void)
143
1
{
144
1
    const struct hvm_function_table *fns = NULL;
145
1
146
1
    if ( cpu_has_vmx )
147
1
        fns = start_vmx();
148
0
    else if ( cpu_has_svm )
149
0
        fns = start_svm();
150
1
151
1
    if ( fns == NULL )
152
0
        return 0;
153
1
154
1
    hvm_funcs = *fns;
155
1
    hvm_enabled = 1;
156
1
157
1
    printk("HVM: %s enabled\n", fns->name);
158
1
    if ( !fns->hap_supported )
159
0
        printk("HVM: Hardware Assisted Paging (HAP) not detected\n");
160
1
    else if ( !opt_hap_enabled )
161
0
    {
162
0
        hvm_funcs.hap_supported = 0;
163
0
        printk("HVM: Hardware Assisted Paging (HAP) detected but disabled\n");
164
0
    }
165
1
    else
166
1
    {
167
1
        printk("HVM: Hardware Assisted Paging (HAP) detected\n");
168
1
        printk("HVM: HAP page sizes: 4kB");
169
1
        if ( fns->hap_capabilities & HVM_HAP_SUPERPAGE_2MB )
170
1
        {
171
1
            printk(", 2MB%s", opt_hap_2mb ? "" : " [disabled]");
172
1
            if ( !opt_hap_2mb )
173
0
                hvm_funcs.hap_capabilities &= ~HVM_HAP_SUPERPAGE_2MB;
174
1
        }
175
1
        if ( fns->hap_capabilities & HVM_HAP_SUPERPAGE_1GB )
176
1
        {
177
1
            printk(", 1GB%s", opt_hap_1gb ? "" : " [disabled]");
178
1
            if ( !opt_hap_1gb )
179
0
                hvm_funcs.hap_capabilities &= ~HVM_HAP_SUPERPAGE_1GB;
180
1
        }
181
1
        printk("\n");
182
1
    }
183
1
184
1
    if ( !opt_altp2m_enabled )
185
1
        hvm_funcs.altp2m_supported = 0;
186
1
187
1
    if ( opt_hvm_fep )
188
0
        warning_add(warning_hvm_fep);
189
1
190
1
    /*
191
1
     * Allow direct access to the PC debug ports 0x80 and 0xed (they are
192
1
     * often used for I/O delays, but the vmexits simply slow things down).
193
1
     */
194
1
    memset(hvm_io_bitmap, ~0, sizeof(hvm_io_bitmap));
195
1
    if ( hvm_port80_allowed )
196
1
        __clear_bit(0x80, hvm_io_bitmap);
197
1
    __clear_bit(0xed, hvm_io_bitmap);
198
1
199
1
    register_cpu_notifier(&cpu_nfb);
200
1
201
1
    return 0;
202
1
}
203
presmp_initcall(hvm_enable);
204
205
/*
206
 * Need to re-inject a given event? We avoid re-injecting software exceptions
207
 * and interrupts because the faulting/trapping instruction can simply be
208
 * re-executed (neither VMX nor SVM update RIP when they VMEXIT during
209
 * INT3/INTO/INTn).
210
 */
211
int hvm_event_needs_reinjection(uint8_t type, uint8_t vector)
212
0
{
213
0
    switch ( type )
214
0
    {
215
0
    case X86_EVENTTYPE_EXT_INTR:
216
0
    case X86_EVENTTYPE_NMI:
217
0
        return 1;
218
0
    case X86_EVENTTYPE_HW_EXCEPTION:
219
0
        /*
220
0
         * SVM uses type 3 ("HW Exception") for #OF and #BP. We explicitly
221
0
         * check for these vectors, as they are really SW Exceptions. SVM has
222
0
         * not updated RIP to point after the trapping instruction (INT3/INTO).
223
0
         */
224
0
        return (vector != 3) && (vector != 4);
225
0
    default:
226
0
        /* Software exceptions/interrupts can be re-executed (e.g., INT n). */
227
0
        break;
228
0
    }
229
0
    return 0;
230
0
}
231
232
/*
233
 * Combine two hardware exceptions: @vec2 was raised during delivery of @vec1.
234
 * This means we can assume that @vec2 is contributory or a page fault.
235
 */
236
uint8_t hvm_combine_hw_exceptions(uint8_t vec1, uint8_t vec2)
237
0
{
238
0
    const unsigned int contributory_exceptions =
239
0
        (1 << TRAP_divide_error) |
240
0
        (1 << TRAP_invalid_tss) |
241
0
        (1 << TRAP_no_segment) |
242
0
        (1 << TRAP_stack_error) |
243
0
        (1 << TRAP_gp_fault);
244
0
    const unsigned int page_faults =
245
0
        (1 << TRAP_page_fault) |
246
0
        (1 << TRAP_virtualisation);
247
0
248
0
    /* Exception during double-fault delivery always causes a triple fault. */
249
0
    if ( vec1 == TRAP_double_fault )
250
0
    {
251
0
        hvm_triple_fault();
252
0
        return TRAP_double_fault; /* dummy return */
253
0
    }
254
0
255
0
    /* Exception during page-fault delivery always causes a double fault. */
256
0
    if ( (1u << vec1) & page_faults )
257
0
        return TRAP_double_fault;
258
0
259
0
    /* Discard the first exception if it's benign or if we now have a #PF. */
260
0
    if ( !((1u << vec1) & contributory_exceptions) ||
261
0
         ((1u << vec2) & page_faults) )
262
0
        return vec2;
263
0
264
0
    /* Cannot combine the exceptions: double fault. */
265
0
    return TRAP_double_fault;
266
0
}
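
The rules above (a fault while delivering a page fault escalates to #DF, a benign first exception or a second #PF keeps the second vector, and two contributory exceptions escalate to #DF) can be exercised in isolation. The sketch below re-derives that decision table as a standalone C program; the combine() helper and the VEC_* macros are illustrative, the reduced vector sets omit #VE, and the #DF-to-triple-fault case is left out.

#include <assert.h>
#include <stdint.h>

#define VEC_DE  0   /* divide error (contributory) */
#define VEC_DF  8   /* double fault */
#define VEC_GP 13   /* general protection fault (contributory) */
#define VEC_PF 14   /* page fault */

static uint8_t combine(uint8_t vec1, uint8_t vec2)
{
    const unsigned int contributory =
        (1u << 0) | (1u << 10) | (1u << 11) | (1u << 12) | (1u << 13);
    const unsigned int page_fault = 1u << 14;

    if ( (1u << vec1) & page_fault )        /* fault during #PF delivery */
        return VEC_DF;
    if ( !((1u << vec1) & contributory) ||  /* benign first exception ... */
         ((1u << vec2) & page_fault) )      /* ... or the new one is a #PF */
        return vec2;
    return VEC_DF;                          /* contributory + contributory */
}

int main(void)
{
    assert(combine(VEC_PF, VEC_GP) == VEC_DF); /* #GP during #PF -> #DF */
    assert(combine(VEC_DE, VEC_PF) == VEC_PF); /* #PF during #DE -> deliver the #PF */
    assert(combine(VEC_GP, VEC_DE) == VEC_DF); /* #DE during #GP -> #DF */
    return 0;
}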
267
268
void hvm_set_rdtsc_exiting(struct domain *d, bool_t enable)
269
0
{
270
0
    struct vcpu *v;
271
0
272
0
    for_each_vcpu ( d, v )
273
0
        hvm_funcs.set_rdtsc_exiting(v, enable);
274
0
}
275
276
void hvm_get_guest_pat(struct vcpu *v, u64 *guest_pat)
277
0
{
278
0
    if ( !hvm_funcs.get_guest_pat(v, guest_pat) )
279
0
        *guest_pat = v->arch.hvm_vcpu.pat_cr;
280
0
}
281
282
int hvm_set_guest_pat(struct vcpu *v, u64 guest_pat)
283
10
{
284
10
    int i;
285
10
    uint8_t *value = (uint8_t *)&guest_pat;
286
10
287
96
    for ( i = 0; i < 8; i++ )
288
72
        switch ( value[i] )
289
72
        {
290
85
        case PAT_TYPE_UC_MINUS:
291
85
        case PAT_TYPE_UNCACHABLE:
292
85
        case PAT_TYPE_WRBACK:
293
85
        case PAT_TYPE_WRCOMB:
294
85
        case PAT_TYPE_WRPROT:
295
85
        case PAT_TYPE_WRTHROUGH:
296
85
            break;
297
0
        default:
298
0
            HVM_DBG_LOG(DBG_LEVEL_MSR, "invalid guest PAT: %"PRIx64"\n",
299
0
                        guest_pat); 
300
0
            return 0;
301
72
        }
302
10
303
24
    if ( !hvm_funcs.set_guest_pat(v, guest_pat) )
304
10
        v->arch.hvm_vcpu.pat_cr = guest_pat;
305
24
306
24
    return 1;
307
10
}
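
The per-byte loop above accepts only the architecturally defined PAT memory types (0 UC, 1 WC, 4 WT, 5 WP, 6 WB, 7 UC-). A minimal standalone sketch of that validation, using a hypothetical pat_value_valid() helper rather than the hvm_funcs path:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Return true if every one of the eight PAT entries holds a defined type. */
static bool pat_value_valid(uint64_t pat)
{
    for ( unsigned int i = 0; i < 8; i++ )
    {
        switch ( (pat >> (i * 8)) & 0xff )
        {
        case 0: case 1: case 4: case 5: case 6: case 7:
            break;          /* UC, WC, WT, WP, WB, UC- */
        default:
            return false;   /* 2, 3 and anything above 7 are reserved */
        }
    }
    return true;
}

int main(void)
{
    assert(pat_value_valid(0x0007040600070406ULL));  /* the power-on PAT value */
    assert(!pat_value_valid(0x0000000000000002ULL)); /* type 2 is reserved */
    return 0;
}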
308
309
bool hvm_set_guest_bndcfgs(struct vcpu *v, u64 val)
310
0
{
311
0
    if ( !hvm_funcs.set_guest_bndcfgs ||
312
0
         !is_canonical_address(val) ||
313
0
         (val & IA32_BNDCFGS_RESERVED) )
314
0
        return false;
315
0
316
0
    /*
317
0
     * While MPX instructions are supposed to be gated on XCR0.BND*, let's
318
0
     * nevertheless force the relevant XCR0 bits on when the feature is being
319
0
     * enabled in BNDCFGS.
320
0
     */
321
0
    if ( (val & IA32_BNDCFGS_ENABLE) &&
322
0
         !(v->arch.xcr0_accum & (XSTATE_BNDREGS | XSTATE_BNDCSR)) )
323
0
    {
324
0
        uint64_t xcr0 = get_xcr0();
325
0
        int rc;
326
0
327
0
        if ( v != current )
328
0
            return false;
329
0
330
0
        rc = handle_xsetbv(XCR_XFEATURE_ENABLED_MASK,
331
0
                           xcr0 | XSTATE_BNDREGS | XSTATE_BNDCSR);
332
0
333
0
        if ( rc )
334
0
        {
335
0
            HVM_DBG_LOG(DBG_LEVEL_1, "Failed to force XCR0.BND*: %d", rc);
336
0
            return false;
337
0
        }
338
0
339
0
        if ( handle_xsetbv(XCR_XFEATURE_ENABLED_MASK, xcr0) )
340
0
            /* nothing, best effort only */;
341
0
    }
342
0
343
0
    return hvm_funcs.set_guest_bndcfgs(v, val);
344
0
}
345
346
/*
347
 * Get the ratio to scale the host TSC frequency to gtsc_khz.  Zero is
348
 * returned if TSC scaling is unavailable or the ratio cannot be handled
349
 * by the host CPU; otherwise a non-zero ratio is returned.
350
 */
351
u64 hvm_get_tsc_scaling_ratio(u32 gtsc_khz)
352
0
{
353
0
    u8 ratio_frac_bits = hvm_funcs.tsc_scaling.ratio_frac_bits;
354
0
    u64 max_ratio = hvm_funcs.tsc_scaling.max_ratio;
355
0
    u64 ratio, dummy;
356
0
357
0
    if ( !hvm_tsc_scaling_supported )
358
0
        return 0;
359
0
360
0
    /*
361
0
     * Return early if the quotient is too large to fit in the integral
362
0
     * part of TSC scaling ratio. This also avoids #DE from the following
363
0
     * divq when the quotient can not fit in a 64-bit integer.
364
0
     */
365
0
    if ( gtsc_khz / cpu_khz > (max_ratio >> ratio_frac_bits) )
366
0
        return 0;
367
0
368
0
    /* ratio = (gtsc_khz << hvm_funcs.tsc_scaling.ratio_frac_bits) / cpu_khz */
369
0
    asm ( "shldq %[frac],%[gkhz],%[zero] ; "
370
0
          "shlq  %[frac],%[gkhz]         ; "
371
0
          "divq  %[hkhz]                   "
372
0
          : "=d" (dummy), "=a" (ratio)
373
0
          : [frac] "c" (ratio_frac_bits),
374
0
            [gkhz] "a" ((u64) gtsc_khz),
375
0
            [zero] "d" (0ULL),
376
0
            [hkhz] "rm" ((u64) cpu_khz) );
377
0
378
0
    return ratio > max_ratio ? 0 : ratio;
379
0
}
380
381
u64 hvm_scale_tsc(const struct domain *d, u64 tsc)
382
0
{
383
0
    u64 ratio = d->arch.hvm_domain.tsc_scaling_ratio;
384
0
    u64 dummy;
385
0
386
0
    if ( ratio == hvm_default_tsc_scaling_ratio )
387
0
        return tsc;
388
0
389
0
    /* tsc = (tsc * ratio) >> hvm_funcs.tsc_scaling.ratio_frac_bits */
390
0
    asm ( "mulq %[ratio]; shrdq %[frac],%%rdx,%[tsc]"
391
0
          : [tsc] "+a" (tsc), "=&d" (dummy)
392
0
          : [frac] "c" (hvm_funcs.tsc_scaling.ratio_frac_bits),
393
0
            [ratio] "rm" (ratio) );
394
0
395
0
    return tsc;
396
0
}
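
Both helpers implement the fixed-point scheme spelled out in the comments above: the ratio is (gtsc_khz << frac_bits) / cpu_khz, and scaling multiplies by the ratio and shifts the fractional bits back out, using 128-bit intermediates in place of the shld/div and mul/shrd asm pairs. A minimal portable sketch, assuming a 32-bit fractional part and illustrative function names; it only guards against 64-bit overflow rather than the vendor-specific max_ratio:

#include <stdint.h>
#include <stdio.h>

#define FRAC_BITS 32u   /* assumed width of the fractional part */

/* ratio = (gtsc_khz << FRAC_BITS) / cpu_khz, or 0 if it overflows 64 bits. */
static uint64_t example_tsc_ratio(uint64_t gtsc_khz, uint64_t cpu_khz)
{
    unsigned __int128 ratio =
        ((unsigned __int128)gtsc_khz << FRAC_BITS) / cpu_khz;

    return ratio > UINT64_MAX ? 0 : (uint64_t)ratio;
}

/* scaled = (tsc * ratio) >> FRAC_BITS, keeping the 128-bit intermediate. */
static uint64_t example_scale_tsc(uint64_t tsc, uint64_t ratio)
{
    return (uint64_t)(((unsigned __int128)tsc * ratio) >> FRAC_BITS);
}

int main(void)
{
    /* A 1.8 GHz guest on a 2.4 GHz host: the ratio is 0.75 in fixed point. */
    uint64_t ratio = example_tsc_ratio(1800000, 2400000);

    printf("ratio=%#llx, 1000000 host cycles -> %llu guest cycles\n",
           (unsigned long long)ratio,
           (unsigned long long)example_scale_tsc(1000000, ratio));
    return 0;
}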
397
398
static void hvm_set_guest_tsc_fixed(struct vcpu *v, u64 guest_tsc, u64 at_tsc)
399
1
{
400
1
    uint64_t tsc;
401
1
    uint64_t delta_tsc;
402
1
403
1
    if ( v->domain->arch.vtsc )
404
0
    {
405
0
        tsc = hvm_get_guest_time_fixed(v, at_tsc);
406
0
        tsc = gtime_to_gtsc(v->domain, tsc);
407
0
    }
408
1
    else
409
1
    {
410
1
        tsc = at_tsc ?: rdtsc();
411
1
        if ( hvm_tsc_scaling_supported )
412
0
            tsc = hvm_scale_tsc(v->domain, tsc);
413
1
    }
414
1
415
1
    delta_tsc = guest_tsc - tsc;
416
1
    v->arch.hvm_vcpu.cache_tsc_offset = delta_tsc;
417
1
418
1
    hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset, at_tsc);
419
1
}
420
421
1
#define hvm_set_guest_tsc(v, t) hvm_set_guest_tsc_fixed(v, t, 0)
422
423
static void hvm_set_guest_tsc_msr(struct vcpu *v, u64 guest_tsc)
424
0
{
425
0
    uint64_t tsc_offset = v->arch.hvm_vcpu.cache_tsc_offset;
426
0
427
0
    hvm_set_guest_tsc(v, guest_tsc);
428
0
    v->arch.hvm_vcpu.msr_tsc_adjust += v->arch.hvm_vcpu.cache_tsc_offset
429
0
                          - tsc_offset;
430
0
}
431
432
static void hvm_set_guest_tsc_adjust(struct vcpu *v, u64 tsc_adjust)
433
0
{
434
0
    v->arch.hvm_vcpu.cache_tsc_offset += tsc_adjust
435
0
                            - v->arch.hvm_vcpu.msr_tsc_adjust;
436
0
    hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset, 0);
437
0
    v->arch.hvm_vcpu.msr_tsc_adjust = tsc_adjust;
438
0
}
439
440
u64 hvm_get_guest_tsc_fixed(struct vcpu *v, uint64_t at_tsc)
441
0
{
442
0
    uint64_t tsc;
443
0
444
0
    if ( v->domain->arch.vtsc )
445
0
    {
446
0
        tsc = hvm_get_guest_time_fixed(v, at_tsc);
447
0
        tsc = gtime_to_gtsc(v->domain, tsc);
448
0
    }
449
0
    else
450
0
    {
451
0
        tsc = at_tsc ?: rdtsc();
452
0
        if ( hvm_tsc_scaling_supported )
453
0
            tsc = hvm_scale_tsc(v->domain, tsc);
454
0
    }
455
0
456
0
    return tsc + v->arch.hvm_vcpu.cache_tsc_offset;
457
0
}
458
459
void hvm_migrate_timers(struct vcpu *v)
460
548
{
461
548
    rtc_migrate_timers(v);
462
548
    pt_migrate(v);
463
548
}
464
465
static int hvm_migrate_pirq(struct domain *d, struct hvm_pirq_dpci *pirq_dpci,
466
                            void *arg)
467
0
{
468
0
    struct vcpu *v = arg;
469
0
470
0
    if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MACH_MSI) &&
471
0
         /* No need to migrate the pirq if it is delivered to the guest directly. */
472
0
         !pirq_dpci->gmsi.posted &&
473
0
         (pirq_dpci->gmsi.dest_vcpu_id == v->vcpu_id) )
474
0
    {
475
0
        struct irq_desc *desc =
476
0
            pirq_spin_lock_irq_desc(dpci_pirq(pirq_dpci), NULL);
477
0
478
0
        if ( !desc )
479
0
            return 0;
480
0
        ASSERT(MSI_IRQ(desc - irq_desc));
481
0
        irq_set_affinity(desc, cpumask_of(v->processor));
482
0
        spin_unlock_irq(&desc->lock);
483
0
    }
484
0
485
0
    return 0;
486
0
}
487
488
void hvm_migrate_pirqs(struct vcpu *v)
489
590
{
490
590
    struct domain *d = v->domain;
491
590
492
590
    if ( !iommu_enabled || !hvm_domain_irq(d)->dpci )
493
590
       return;
494
590
495
0
    spin_lock(&d->event_lock);
496
0
    pt_pirq_iterate(d, hvm_migrate_pirq, v);
497
0
    spin_unlock(&d->event_lock);
498
0
}
499
500
static bool hvm_get_pending_event(struct vcpu *v, struct x86_event *info)
501
0
{
502
0
    info->cr2 = v->arch.hvm_vcpu.guest_cr[2];
503
0
    return hvm_funcs.get_pending_event(v, info);
504
0
}
505
506
void hvm_do_resume(struct vcpu *v)
507
4.57M
{
508
4.57M
    check_wakeup_from_wait();
509
4.57M
510
4.57M
    pt_restore_timer(v);
511
4.57M
512
4.57M
    if ( !handle_hvm_io_completion(v) )
513
0
        return;
514
4.57M
515
4.57M
    if ( unlikely(v->arch.vm_event) )
516
0
        hvm_vm_event_do_resume(v);
517
4.57M
518
4.57M
    /* Inject pending hw/sw event */
519
4.57M
    if ( v->arch.hvm_vcpu.inject_event.vector >= 0 )
520
0
    {
521
0
        smp_rmb();
522
0
523
0
        if ( !hvm_event_pending(v) )
524
0
            hvm_inject_event(&v->arch.hvm_vcpu.inject_event);
525
0
526
0
        v->arch.hvm_vcpu.inject_event.vector = HVM_EVENT_VECTOR_UNSET;
527
0
    }
528
4.57M
529
4.57M
    if ( unlikely(v->arch.vm_event) && v->arch.monitor.next_interrupt_enabled )
530
0
    {
531
0
        struct x86_event info;
532
0
533
0
        if ( hvm_get_pending_event(v, &info) )
534
0
        {
535
0
            hvm_monitor_interrupt(info.vector, info.type, info.error_code,
536
0
                                  info.cr2);
537
0
            v->arch.monitor.next_interrupt_enabled = false;
538
0
        }
539
0
    }
540
4.57M
}
541
542
static int hvm_print_line(
543
    int dir, unsigned int port, unsigned int bytes, uint32_t *val)
544
0
{
545
0
    struct domain *cd = current->domain;
546
0
    char c = *val;
547
0
548
0
    BUG_ON(bytes != 1);
549
0
550
0
    /* Accept only printable characters, newline, and horizontal tab. */
551
0
    if ( !isprint(c) && (c != '\n') && (c != '\t') )
552
0
        return X86EMUL_OKAY;
553
0
554
0
    spin_lock(&cd->pbuf_lock);
555
0
    if ( c != '\n' )
556
0
        cd->pbuf[cd->pbuf_idx++] = c;
557
0
    if ( (cd->pbuf_idx == (DOMAIN_PBUF_SIZE - 1)) || (c == '\n') )
558
0
    {
559
0
        cd->pbuf[cd->pbuf_idx] = '\0';
560
0
        guest_printk(cd, XENLOG_G_DEBUG "%s\n", cd->pbuf);
561
0
        cd->pbuf_idx = 0;
562
0
    }
563
0
    spin_unlock(&cd->pbuf_lock);
564
0
565
0
    return X86EMUL_OKAY;
566
0
}
567
568
int hvm_domain_initialise(struct domain *d, unsigned long domcr_flags,
569
                          struct xen_arch_domainconfig *config)
570
1
{
571
1
    unsigned int nr_gsis;
572
1
    int rc;
573
1
574
1
    if ( !hvm_enabled )
575
0
    {
576
0
        gdprintk(XENLOG_WARNING, "Attempt to create a HVM guest "
577
0
                 "on a non-VT/AMDV platform.\n");
578
0
        return -EINVAL;
579
0
    }
580
1
581
1
    spin_lock_init(&d->arch.hvm_domain.irq_lock);
582
1
    spin_lock_init(&d->arch.hvm_domain.uc_lock);
583
1
    spin_lock_init(&d->arch.hvm_domain.write_map.lock);
584
1
    rwlock_init(&d->arch.hvm_domain.mmcfg_lock);
585
1
    INIT_LIST_HEAD(&d->arch.hvm_domain.write_map.list);
586
1
    INIT_LIST_HEAD(&d->arch.hvm_domain.g2m_ioport_list);
587
1
    INIT_LIST_HEAD(&d->arch.hvm_domain.mmcfg_regions);
588
1
    INIT_LIST_HEAD(&d->arch.hvm_domain.msix_tables);
589
1
590
1
    rc = create_perdomain_mapping(d, PERDOMAIN_VIRT_START, 0, NULL, NULL);
591
1
    if ( rc )
592
0
        goto fail;
593
1
594
1
    hvm_init_cacheattr_region_list(d);
595
1
596
1
    rc = paging_enable(d, PG_refcounts|PG_translate|PG_external);
597
1
    if ( rc != 0 )
598
0
        goto fail0;
599
1
600
1
    nr_gsis = is_hardware_domain(d) ? nr_irqs_gsi : NR_HVM_DOMU_IRQS;
601
1
    d->arch.hvm_domain.pl_time = xzalloc(struct pl_time);
602
1
    d->arch.hvm_domain.params = xzalloc_array(uint64_t, HVM_NR_PARAMS);
603
1
    d->arch.hvm_domain.io_handler = xzalloc_array(struct hvm_io_handler,
604
1
                                                  NR_IO_HANDLERS);
605
1
    d->arch.hvm_domain.irq = xzalloc_bytes(hvm_irq_size(nr_gsis));
606
1
607
1
    rc = -ENOMEM;
608
1
    if ( !d->arch.hvm_domain.pl_time || !d->arch.hvm_domain.irq ||
609
1
         !d->arch.hvm_domain.params  || !d->arch.hvm_domain.io_handler )
610
0
        goto fail1;
611
1
612
1
    /* Set the number of GSIs */
613
1
    hvm_domain_irq(d)->nr_gsis = nr_gsis;
614
1
615
1
    BUILD_BUG_ON(NR_HVM_DOMU_IRQS < NR_ISAIRQS);
616
1
    ASSERT(hvm_domain_irq(d)->nr_gsis >= NR_ISAIRQS);
617
1
618
1
    /* need link to containing domain */
619
1
    d->arch.hvm_domain.pl_time->domain = d;
620
1
621
1
    /* Set the default IO Bitmap. */
622
1
    if ( is_hardware_domain(d) )
623
1
    {
624
1
        d->arch.hvm_domain.io_bitmap = _xmalloc(HVM_IOBITMAP_SIZE, PAGE_SIZE);
625
1
        if ( d->arch.hvm_domain.io_bitmap == NULL )
626
0
        {
627
0
            rc = -ENOMEM;
628
0
            goto fail1;
629
0
        }
630
1
        memset(d->arch.hvm_domain.io_bitmap, ~0, HVM_IOBITMAP_SIZE);
631
1
    }
632
1
    else
633
0
        d->arch.hvm_domain.io_bitmap = hvm_io_bitmap;
634
1
635
1
    register_g2m_portio_handler(d);
636
1
    register_vpci_portio_handler(d);
637
1
638
1
    hvm_ioreq_init(d);
639
1
640
1
    hvm_init_guest_time(d);
641
1
642
1
    d->arch.hvm_domain.params[HVM_PARAM_TRIPLE_FAULT_REASON] = SHUTDOWN_reboot;
643
1
644
1
    vpic_init(d);
645
1
646
1
    rc = vioapic_init(d);
647
1
    if ( rc != 0 )
648
0
        goto fail1;
649
1
650
1
    stdvga_init(d);
651
1
652
1
    rtc_init(d);
653
1
654
1
    register_portio_handler(d, 0xe9, 1, hvm_print_line);
655
1
656
1
    if ( hvm_tsc_scaling_supported )
657
0
        d->arch.hvm_domain.tsc_scaling_ratio = hvm_default_tsc_scaling_ratio;
658
1
659
1
    rc = hvm_funcs.domain_initialise(d);
660
1
    if ( rc != 0 )
661
0
        goto fail2;
662
1
663
1
    return 0;
664
1
665
0
 fail2:
666
0
    rtc_deinit(d);
667
0
    stdvga_deinit(d);
668
0
    vioapic_deinit(d);
669
0
 fail1:
670
0
    if ( is_hardware_domain(d) )
671
0
        xfree(d->arch.hvm_domain.io_bitmap);
672
0
    xfree(d->arch.hvm_domain.io_handler);
673
0
    xfree(d->arch.hvm_domain.params);
674
0
    xfree(d->arch.hvm_domain.pl_time);
675
0
    xfree(d->arch.hvm_domain.irq);
676
0
 fail0:
677
0
    hvm_destroy_cacheattr_region_list(d);
678
0
    destroy_perdomain_mapping(d, PERDOMAIN_VIRT_START, 0);
679
0
 fail:
680
0
    return rc;
681
0
}
682
683
void hvm_domain_relinquish_resources(struct domain *d)
684
0
{
685
0
    if ( hvm_funcs.nhvm_domain_relinquish_resources )
686
0
        hvm_funcs.nhvm_domain_relinquish_resources(d);
687
0
688
0
    viridian_domain_deinit(d);
689
0
690
0
    hvm_destroy_all_ioreq_servers(d);
691
0
692
0
    msixtbl_pt_cleanup(d);
693
0
694
0
    /* Stop all asynchronous timer actions. */
695
0
    rtc_deinit(d);
696
0
    if ( d->vcpu != NULL && d->vcpu[0] != NULL )
697
0
    {
698
0
        pmtimer_deinit(d);
699
0
        hpet_deinit(d);
700
0
    }
701
0
}
702
703
void hvm_domain_destroy(struct domain *d)
704
0
{
705
0
    struct list_head *ioport_list, *tmp;
706
0
    struct g2m_ioport *ioport;
707
0
708
0
    xfree(d->arch.hvm_domain.io_handler);
709
0
    d->arch.hvm_domain.io_handler = NULL;
710
0
711
0
    xfree(d->arch.hvm_domain.params);
712
0
    d->arch.hvm_domain.params = NULL;
713
0
714
0
    hvm_destroy_cacheattr_region_list(d);
715
0
716
0
    hvm_funcs.domain_destroy(d);
717
0
    rtc_deinit(d);
718
0
    stdvga_deinit(d);
719
0
    vioapic_deinit(d);
720
0
721
0
    xfree(d->arch.hvm_domain.pl_time);
722
0
    d->arch.hvm_domain.pl_time = NULL;
723
0
724
0
    xfree(d->arch.hvm_domain.irq);
725
0
    d->arch.hvm_domain.irq = NULL;
726
0
727
0
    list_for_each_safe ( ioport_list, tmp,
728
0
                         &d->arch.hvm_domain.g2m_ioport_list )
729
0
    {
730
0
        ioport = list_entry(ioport_list, struct g2m_ioport, list);
731
0
        list_del(&ioport->list);
732
0
        xfree(ioport);
733
0
    }
734
0
735
0
    destroy_vpci_mmcfg(&d->arch.hvm_domain.mmcfg_regions);
736
0
}
737
738
static int hvm_save_tsc_adjust(struct domain *d, hvm_domain_context_t *h)
739
0
{
740
0
    struct vcpu *v;
741
0
    struct hvm_tsc_adjust ctxt;
742
0
    int err = 0;
743
0
744
0
    for_each_vcpu ( d, v )
745
0
    {
746
0
        ctxt.tsc_adjust = v->arch.hvm_vcpu.msr_tsc_adjust;
747
0
        err = hvm_save_entry(TSC_ADJUST, v->vcpu_id, h, &ctxt);
748
0
        if ( err )
749
0
            break;
750
0
    }
751
0
752
0
    return err;
753
0
}
754
755
static int hvm_load_tsc_adjust(struct domain *d, hvm_domain_context_t *h)
756
0
{
757
0
    unsigned int vcpuid = hvm_load_instance(h);
758
0
    struct vcpu *v;
759
0
    struct hvm_tsc_adjust ctxt;
760
0
761
0
    if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
762
0
    {
763
0
        dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n",
764
0
                d->domain_id, vcpuid);
765
0
        return -EINVAL;
766
0
    }
767
0
768
0
    if ( hvm_load_entry(TSC_ADJUST, h, &ctxt) != 0 )
769
0
        return -EINVAL;
770
0
771
0
    v->arch.hvm_vcpu.msr_tsc_adjust = ctxt.tsc_adjust;
772
0
    return 0;
773
0
}
774
775
HVM_REGISTER_SAVE_RESTORE(TSC_ADJUST, hvm_save_tsc_adjust,
776
                          hvm_load_tsc_adjust, 1, HVMSR_PER_VCPU);
777
778
static int hvm_save_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
779
0
{
780
0
    struct vcpu *v;
781
0
    struct hvm_hw_cpu ctxt;
782
0
    struct segment_register seg;
783
0
784
0
    for_each_vcpu ( d, v )
785
0
    {
786
0
        /* We don't need to save state for a vcpu that is down; the restore 
787
0
         * code will leave it down if there is nothing saved. */
788
0
        if ( v->pause_flags & VPF_down )
789
0
            continue;
790
0
791
0
        memset(&ctxt, 0, sizeof(ctxt));
792
0
793
0
        /* Architecture-specific vmcs/vmcb bits */
794
0
        hvm_funcs.save_cpu_ctxt(v, &ctxt);
795
0
796
0
        ctxt.tsc = hvm_get_guest_tsc_fixed(v, d->arch.hvm_domain.sync_tsc);
797
0
798
0
        ctxt.msr_tsc_aux = hvm_msr_tsc_aux(v);
799
0
800
0
        hvm_get_segment_register(v, x86_seg_idtr, &seg);
801
0
        ctxt.idtr_limit = seg.limit;
802
0
        ctxt.idtr_base = seg.base;
803
0
804
0
        hvm_get_segment_register(v, x86_seg_gdtr, &seg);
805
0
        ctxt.gdtr_limit = seg.limit;
806
0
        ctxt.gdtr_base = seg.base;
807
0
808
0
        hvm_get_segment_register(v, x86_seg_cs, &seg);
809
0
        ctxt.cs_sel = seg.sel;
810
0
        ctxt.cs_limit = seg.limit;
811
0
        ctxt.cs_base = seg.base;
812
0
        ctxt.cs_arbytes = seg.attr;
813
0
814
0
        hvm_get_segment_register(v, x86_seg_ds, &seg);
815
0
        ctxt.ds_sel = seg.sel;
816
0
        ctxt.ds_limit = seg.limit;
817
0
        ctxt.ds_base = seg.base;
818
0
        ctxt.ds_arbytes = seg.attr;
819
0
820
0
        hvm_get_segment_register(v, x86_seg_es, &seg);
821
0
        ctxt.es_sel = seg.sel;
822
0
        ctxt.es_limit = seg.limit;
823
0
        ctxt.es_base = seg.base;
824
0
        ctxt.es_arbytes = seg.attr;
825
0
826
0
        hvm_get_segment_register(v, x86_seg_ss, &seg);
827
0
        ctxt.ss_sel = seg.sel;
828
0
        ctxt.ss_limit = seg.limit;
829
0
        ctxt.ss_base = seg.base;
830
0
        ctxt.ss_arbytes = seg.attr;
831
0
832
0
        hvm_get_segment_register(v, x86_seg_fs, &seg);
833
0
        ctxt.fs_sel = seg.sel;
834
0
        ctxt.fs_limit = seg.limit;
835
0
        ctxt.fs_base = seg.base;
836
0
        ctxt.fs_arbytes = seg.attr;
837
0
838
0
        hvm_get_segment_register(v, x86_seg_gs, &seg);
839
0
        ctxt.gs_sel = seg.sel;
840
0
        ctxt.gs_limit = seg.limit;
841
0
        ctxt.gs_base = seg.base;
842
0
        ctxt.gs_arbytes = seg.attr;
843
0
844
0
        hvm_get_segment_register(v, x86_seg_tr, &seg);
845
0
        ctxt.tr_sel = seg.sel;
846
0
        ctxt.tr_limit = seg.limit;
847
0
        ctxt.tr_base = seg.base;
848
0
        ctxt.tr_arbytes = seg.attr;
849
0
850
0
        hvm_get_segment_register(v, x86_seg_ldtr, &seg);
851
0
        ctxt.ldtr_sel = seg.sel;
852
0
        ctxt.ldtr_limit = seg.limit;
853
0
        ctxt.ldtr_base = seg.base;
854
0
        ctxt.ldtr_arbytes = seg.attr;
855
0
856
0
        if ( v->fpu_initialised )
857
0
        {
858
0
            memcpy(ctxt.fpu_regs, v->arch.fpu_ctxt, sizeof(ctxt.fpu_regs));
859
0
            ctxt.flags = XEN_X86_FPU_INITIALISED;
860
0
        }
861
0
862
0
        ctxt.rax = v->arch.user_regs.rax;
863
0
        ctxt.rbx = v->arch.user_regs.rbx;
864
0
        ctxt.rcx = v->arch.user_regs.rcx;
865
0
        ctxt.rdx = v->arch.user_regs.rdx;
866
0
        ctxt.rbp = v->arch.user_regs.rbp;
867
0
        ctxt.rsi = v->arch.user_regs.rsi;
868
0
        ctxt.rdi = v->arch.user_regs.rdi;
869
0
        ctxt.rsp = v->arch.user_regs.rsp;
870
0
        ctxt.rip = v->arch.user_regs.rip;
871
0
        ctxt.rflags = v->arch.user_regs.rflags;
872
0
        ctxt.r8  = v->arch.user_regs.r8;
873
0
        ctxt.r9  = v->arch.user_regs.r9;
874
0
        ctxt.r10 = v->arch.user_regs.r10;
875
0
        ctxt.r11 = v->arch.user_regs.r11;
876
0
        ctxt.r12 = v->arch.user_regs.r12;
877
0
        ctxt.r13 = v->arch.user_regs.r13;
878
0
        ctxt.r14 = v->arch.user_regs.r14;
879
0
        ctxt.r15 = v->arch.user_regs.r15;
880
0
        ctxt.dr0 = v->arch.debugreg[0];
881
0
        ctxt.dr1 = v->arch.debugreg[1];
882
0
        ctxt.dr2 = v->arch.debugreg[2];
883
0
        ctxt.dr3 = v->arch.debugreg[3];
884
0
        ctxt.dr6 = v->arch.debugreg[6];
885
0
        ctxt.dr7 = v->arch.debugreg[7];
886
0
887
0
        if ( hvm_save_entry(CPU, v->vcpu_id, h, &ctxt) != 0 )
888
0
            return 1; 
889
0
    }
890
0
    return 0;
891
0
}
892
893
/* Return a string indicating the error, or NULL for valid. */
894
const char *hvm_efer_valid(const struct vcpu *v, uint64_t value,
895
                           signed int cr0_pg)
896
37
{
897
37
    const struct domain *d = v->domain;
898
37
    const struct cpuid_policy *p;
899
37
900
37
    if ( cr0_pg < 0 && !is_hardware_domain(d) )
901
0
        p = d->arch.cpuid;
902
37
    else
903
37
        p = &host_cpuid_policy;
904
37
905
37
    if ( (value & EFER_SCE) && !p->extd.syscall )
906
0
        return "SCE without feature";
907
37
908
37
    if ( (value & (EFER_LME | EFER_LMA)) && !p->extd.lm )
909
0
        return "LME/LMA without feature";
910
37
911
37
    if ( (value & EFER_LMA) && (!(value & EFER_LME) || !cr0_pg) )
912
0
        return "LMA/LME/CR0.PG inconsistency";
913
37
914
37
    if ( (value & EFER_NX) && !p->extd.nx )
915
0
        return "NX without feature";
916
37
917
37
    if ( (value & EFER_SVME) && (!p->extd.svm || !nestedhvm_enabled(d)) )
918
0
        return "SVME without nested virt";
919
37
920
37
    if ( (value & EFER_LMSLE) && !cpu_has_lmsl )
921
0
        return "LMSLE without support";
922
37
923
37
    if ( (value & EFER_FFXSE) && !p->extd.ffxsr )
924
0
        return "FFXSE without feature";
925
37
926
37
    return NULL;
927
37
}
928
929
/* These reserved bits in lower 32 remain 0 after any load of CR0 */
930
#define HVM_CR0_GUEST_RESERVED_BITS             \
931
3.87k
    (~((unsigned long)                          \
932
3.87k
       (X86_CR0_PE | X86_CR0_MP | X86_CR0_EM |  \
933
3.87k
        X86_CR0_TS | X86_CR0_ET | X86_CR0_NE |  \
934
3.87k
        X86_CR0_WP | X86_CR0_AM | X86_CR0_NW |  \
935
3.87k
        X86_CR0_CD | X86_CR0_PG)))
936
937
/* These bits in CR4 can be set by the guest. */
938
unsigned long hvm_cr4_guest_valid_bits(const struct vcpu *v, bool restore)
939
98
{
940
98
    const struct domain *d = v->domain;
941
98
    const struct cpuid_policy *p;
942
98
    bool mce, vmxe;
943
98
944
99
    if ( !restore && !is_hardware_domain(d) )
945
0
        p = d->arch.cpuid;
946
98
    else
947
98
        p = &host_cpuid_policy;
948
98
949
98
    /* Logic broken out simply to aid readability below. */
950
0
    mce  = p->basic.mce || p->basic.mca;
951
99
    vmxe = p->basic.vmx && (restore || nestedhvm_enabled(d));
952
98
953
18.4E
    return ((p->basic.vme     ? X86_CR4_VME | X86_CR4_PVI : 0) |
954
18.4E
            (p->basic.tsc     ? X86_CR4_TSD               : 0) |
955
18.4E
            (p->basic.de      ? X86_CR4_DE                : 0) |
956
18.4E
            (p->basic.pse     ? X86_CR4_PSE               : 0) |
957
18.4E
            (p->basic.pae     ? X86_CR4_PAE               : 0) |
958
18.4E
            (mce              ? X86_CR4_MCE               : 0) |
959
18.4E
            (p->basic.pge     ? X86_CR4_PGE               : 0) |
960
98
                                X86_CR4_PCE                    |
961
18.4E
            (p->basic.fxsr    ? X86_CR4_OSFXSR            : 0) |
962
18.4E
            (p->basic.sse     ? X86_CR4_OSXMMEXCPT        : 0) |
963
98
            (vmxe             ? X86_CR4_VMXE              : 0) |
964
18.4E
            (p->feat.fsgsbase ? X86_CR4_FSGSBASE          : 0) |
965
18.4E
            (p->basic.pcid    ? X86_CR4_PCIDE             : 0) |
966
18.4E
            (p->basic.xsave   ? X86_CR4_OSXSAVE           : 0) |
967
18.4E
            (p->feat.smep     ? X86_CR4_SMEP              : 0) |
968
98
            (p->feat.smap     ? X86_CR4_SMAP              : 0) |
969
98
            (p->feat.umip     ? X86_CR4_UMIP              : 0) |
970
98
            (p->feat.pku      ? X86_CR4_PKE               : 0));
971
98
}
972
973
static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
974
0
{
975
0
    int vcpuid;
976
0
    struct vcpu *v;
977
0
    struct hvm_hw_cpu ctxt;
978
0
    struct segment_register seg;
979
0
    const char *errstr;
980
0
    struct xsave_struct *xsave_area;
981
0
982
0
    /* Which vcpu is this? */
983
0
    vcpuid = hvm_load_instance(h);
984
0
    if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
985
0
    {
986
0
        dprintk(XENLOG_G_ERR, "HVM restore: dom%u has no vcpu%u\n",
987
0
                d->domain_id, vcpuid);
988
0
        return -EINVAL;
989
0
    }
990
0
991
0
    if ( hvm_load_entry_zeroextend(CPU, h, &ctxt) != 0 )
992
0
        return -EINVAL;
993
0
994
0
    if ( ctxt.pad0 != 0 )
995
0
        return -EINVAL;
996
0
997
0
    /* Sanity check some control registers. */
998
0
    if ( (ctxt.cr0 & HVM_CR0_GUEST_RESERVED_BITS) ||
999
0
         !(ctxt.cr0 & X86_CR0_ET) ||
1000
0
         ((ctxt.cr0 & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG) )
1001
0
    {
1002
0
        printk(XENLOG_G_ERR "HVM%d restore: bad CR0 %#" PRIx64 "\n",
1003
0
               d->domain_id, ctxt.cr0);
1004
0
        return -EINVAL;
1005
0
    }
1006
0
1007
0
    if ( ctxt.cr4 & ~hvm_cr4_guest_valid_bits(v, 1) )
1008
0
    {
1009
0
        printk(XENLOG_G_ERR "HVM%d restore: bad CR4 %#" PRIx64 "\n",
1010
0
               d->domain_id, ctxt.cr4);
1011
0
        return -EINVAL;
1012
0
    }
1013
0
1014
0
    errstr = hvm_efer_valid(v, ctxt.msr_efer, MASK_EXTR(ctxt.cr0, X86_CR0_PG));
1015
0
    if ( errstr )
1016
0
    {
1017
0
        printk(XENLOG_G_ERR "%pv: HVM restore: bad EFER %#" PRIx64 " - %s\n",
1018
0
               v, ctxt.msr_efer, errstr);
1019
0
        return -EINVAL;
1020
0
    }
1021
0
1022
0
    if ( (ctxt.flags & ~XEN_X86_FPU_INITIALISED) != 0 )
1023
0
    {
1024
0
        gprintk(XENLOG_ERR, "bad flags value in CPU context: %#x\n",
1025
0
                ctxt.flags);
1026
0
        return -EINVAL;
1027
0
    }
1028
0
1029
0
    /* Older Xen versions used to save the segment arbytes directly 
1030
0
     * from the VMCS on Intel hosts.  Detect this and rearrange them
1031
0
     * into the struct segment_register format. */
1032
0
#define UNFOLD_ARBYTES(_r)                          \
1033
0
    if ( (_r & 0xf000) && !(_r & 0x0f00) )          \
1034
0
        _r = ((_r & 0xff) | ((_r >> 4) & 0xf00))
1035
0
    UNFOLD_ARBYTES(ctxt.cs_arbytes);
1036
0
    UNFOLD_ARBYTES(ctxt.ds_arbytes);
1037
0
    UNFOLD_ARBYTES(ctxt.es_arbytes);
1038
0
    UNFOLD_ARBYTES(ctxt.fs_arbytes);
1039
0
    UNFOLD_ARBYTES(ctxt.gs_arbytes);
1040
0
    UNFOLD_ARBYTES(ctxt.ss_arbytes);
1041
0
    UNFOLD_ARBYTES(ctxt.tr_arbytes);
1042
0
    UNFOLD_ARBYTES(ctxt.ldtr_arbytes);
1043
0
#undef UNFOLD_ARBYTES
1044
0
1045
0
    /* Architecture-specific vmcs/vmcb bits */
1046
0
    if ( hvm_funcs.load_cpu_ctxt(v, &ctxt) < 0 )
1047
0
        return -EINVAL;
1048
0
1049
0
    if ( hvm_funcs.tsc_scaling.setup )
1050
0
        hvm_funcs.tsc_scaling.setup(v);
1051
0
1052
0
    v->arch.hvm_vcpu.msr_tsc_aux = ctxt.msr_tsc_aux;
1053
0
1054
0
    hvm_set_guest_tsc_fixed(v, ctxt.tsc, d->arch.hvm_domain.sync_tsc);
1055
0
1056
0
    seg.limit = ctxt.idtr_limit;
1057
0
    seg.base = ctxt.idtr_base;
1058
0
    hvm_set_segment_register(v, x86_seg_idtr, &seg);
1059
0
1060
0
    seg.limit = ctxt.gdtr_limit;
1061
0
    seg.base = ctxt.gdtr_base;
1062
0
    hvm_set_segment_register(v, x86_seg_gdtr, &seg);
1063
0
1064
0
    seg.sel = ctxt.cs_sel;
1065
0
    seg.limit = ctxt.cs_limit;
1066
0
    seg.base = ctxt.cs_base;
1067
0
    seg.attr = ctxt.cs_arbytes;
1068
0
    hvm_set_segment_register(v, x86_seg_cs, &seg);
1069
0
1070
0
    seg.sel = ctxt.ds_sel;
1071
0
    seg.limit = ctxt.ds_limit;
1072
0
    seg.base = ctxt.ds_base;
1073
0
    seg.attr = ctxt.ds_arbytes;
1074
0
    hvm_set_segment_register(v, x86_seg_ds, &seg);
1075
0
1076
0
    seg.sel = ctxt.es_sel;
1077
0
    seg.limit = ctxt.es_limit;
1078
0
    seg.base = ctxt.es_base;
1079
0
    seg.attr = ctxt.es_arbytes;
1080
0
    hvm_set_segment_register(v, x86_seg_es, &seg);
1081
0
1082
0
    seg.sel = ctxt.ss_sel;
1083
0
    seg.limit = ctxt.ss_limit;
1084
0
    seg.base = ctxt.ss_base;
1085
0
    seg.attr = ctxt.ss_arbytes;
1086
0
    hvm_set_segment_register(v, x86_seg_ss, &seg);
1087
0
1088
0
    seg.sel = ctxt.fs_sel;
1089
0
    seg.limit = ctxt.fs_limit;
1090
0
    seg.base = ctxt.fs_base;
1091
0
    seg.attr = ctxt.fs_arbytes;
1092
0
    hvm_set_segment_register(v, x86_seg_fs, &seg);
1093
0
1094
0
    seg.sel = ctxt.gs_sel;
1095
0
    seg.limit = ctxt.gs_limit;
1096
0
    seg.base = ctxt.gs_base;
1097
0
    seg.attr = ctxt.gs_arbytes;
1098
0
    hvm_set_segment_register(v, x86_seg_gs, &seg);
1099
0
1100
0
    seg.sel = ctxt.tr_sel;
1101
0
    seg.limit = ctxt.tr_limit;
1102
0
    seg.base = ctxt.tr_base;
1103
0
    seg.attr = ctxt.tr_arbytes;
1104
0
    hvm_set_segment_register(v, x86_seg_tr, &seg);
1105
0
1106
0
    seg.sel = ctxt.ldtr_sel;
1107
0
    seg.limit = ctxt.ldtr_limit;
1108
0
    seg.base = ctxt.ldtr_base;
1109
0
    seg.attr = ctxt.ldtr_arbytes;
1110
0
    hvm_set_segment_register(v, x86_seg_ldtr, &seg);
1111
0
1112
0
    /* Cover xsave-absent save file restoration on xsave-capable host. */
1113
0
    xsave_area = xsave_enabled(v) ? NULL : v->arch.xsave_area;
1114
0
1115
0
    v->fpu_initialised = !!(ctxt.flags & XEN_X86_FPU_INITIALISED);
1116
0
    if ( v->fpu_initialised )
1117
0
    {
1118
0
        memcpy(v->arch.fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs));
1119
0
        if ( xsave_area )
1120
0
            xsave_area->xsave_hdr.xstate_bv = XSTATE_FP_SSE;
1121
0
    }
1122
0
    else if ( xsave_area )
1123
0
    {
1124
0
        xsave_area->xsave_hdr.xstate_bv = 0;
1125
0
        xsave_area->fpu_sse.mxcsr = MXCSR_DEFAULT;
1126
0
    }
1127
0
    if ( xsave_area )
1128
0
        xsave_area->xsave_hdr.xcomp_bv = 0;
1129
0
1130
0
    v->arch.user_regs.rax = ctxt.rax;
1131
0
    v->arch.user_regs.rbx = ctxt.rbx;
1132
0
    v->arch.user_regs.rcx = ctxt.rcx;
1133
0
    v->arch.user_regs.rdx = ctxt.rdx;
1134
0
    v->arch.user_regs.rbp = ctxt.rbp;
1135
0
    v->arch.user_regs.rsi = ctxt.rsi;
1136
0
    v->arch.user_regs.rdi = ctxt.rdi;
1137
0
    v->arch.user_regs.rsp = ctxt.rsp;
1138
0
    v->arch.user_regs.rip = ctxt.rip;
1139
0
    v->arch.user_regs.rflags = ctxt.rflags | X86_EFLAGS_MBS;
1140
0
    v->arch.user_regs.r8  = ctxt.r8;
1141
0
    v->arch.user_regs.r9  = ctxt.r9;
1142
0
    v->arch.user_regs.r10 = ctxt.r10;
1143
0
    v->arch.user_regs.r11 = ctxt.r11;
1144
0
    v->arch.user_regs.r12 = ctxt.r12;
1145
0
    v->arch.user_regs.r13 = ctxt.r13;
1146
0
    v->arch.user_regs.r14 = ctxt.r14;
1147
0
    v->arch.user_regs.r15 = ctxt.r15;
1148
0
    v->arch.debugreg[0] = ctxt.dr0;
1149
0
    v->arch.debugreg[1] = ctxt.dr1;
1150
0
    v->arch.debugreg[2] = ctxt.dr2;
1151
0
    v->arch.debugreg[3] = ctxt.dr3;
1152
0
    v->arch.debugreg[6] = ctxt.dr6;
1153
0
    v->arch.debugreg[7] = ctxt.dr7;
1154
0
1155
0
    v->arch.vgc_flags = VGCF_online;
1156
0
1157
0
    /* Auxiliary processors should be woken immediately. */
1158
0
    v->is_initialised = 1;
1159
0
    clear_bit(_VPF_down, &v->pause_flags);
1160
0
    vcpu_wake(v);
1161
0
1162
0
    return 0;
1163
0
}
1164
1165
HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt,
1166
                          1, HVMSR_PER_VCPU);
1167
1168
1
#define HVM_CPU_XSAVE_SIZE(xcr0) (offsetof(struct hvm_hw_cpu_xsave, \
1169
1
                                           save_area) + \
1170
1
                                  xstate_ctxt_size(xcr0))
1171
1172
static int hvm_save_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h)
1173
0
{
1174
0
    struct vcpu *v;
1175
0
    struct hvm_hw_cpu_xsave *ctxt;
1176
0
1177
0
    if ( !cpu_has_xsave )
1178
0
        return 0;   /* do nothing */
1179
0
1180
0
    for_each_vcpu ( d, v )
1181
0
    {
1182
0
        unsigned int size = HVM_CPU_XSAVE_SIZE(v->arch.xcr0_accum);
1183
0
1184
0
        if ( !xsave_enabled(v) )
1185
0
            continue;
1186
0
        if ( _hvm_init_entry(h, CPU_XSAVE_CODE, v->vcpu_id, size) )
1187
0
            return 1;
1188
0
        ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur];
1189
0
        h->cur += size;
1190
0
1191
0
        ctxt->xfeature_mask = xfeature_mask;
1192
0
        ctxt->xcr0 = v->arch.xcr0;
1193
0
        ctxt->xcr0_accum = v->arch.xcr0_accum;
1194
0
        expand_xsave_states(v, &ctxt->save_area,
1195
0
                            size - offsetof(typeof(*ctxt), save_area));
1196
0
    }
1197
0
1198
0
    return 0;
1199
0
}
1200
1201
/*
1202
 * Structure layout conformity checks, documenting correctness of the cast in
1203
 * the invocation of validate_xstate() below.
1204
 * Leverage CONFIG_COMPAT machinery to perform this.
1205
 */
1206
#define xen_xsave_hdr xsave_hdr
1207
#define compat_xsave_hdr hvm_hw_cpu_xsave_hdr
1208
CHECK_FIELD_(struct, xsave_hdr, xstate_bv);
1209
CHECK_FIELD_(struct, xsave_hdr, xcomp_bv);
1210
CHECK_FIELD_(struct, xsave_hdr, reserved);
1211
#undef compat_xsave_hdr
1212
#undef xen_xsave_hdr
1213
1214
static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h)
1215
0
{
1216
0
    unsigned int vcpuid, size;
1217
0
    int err;
1218
0
    struct vcpu *v;
1219
0
    struct hvm_hw_cpu_xsave *ctxt;
1220
0
    const struct hvm_save_descriptor *desc;
1221
0
    unsigned int i, desc_start, desc_length;
1222
0
1223
0
    /* Which vcpu is this? */
1224
0
    vcpuid = hvm_load_instance(h);
1225
0
    if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
1226
0
    {
1227
0
        dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n",
1228
0
                d->domain_id, vcpuid);
1229
0
        return -EINVAL;
1230
0
    }
1231
0
1232
0
    /* Fail: an image saved on an xsave-capable host can't be restored here. */
1233
0
    if ( !cpu_has_xsave )
1234
0
        return -EOPNOTSUPP;
1235
0
1236
0
    /* Customized checking for entry since our entry is of variable length */
1237
0
    desc = (struct hvm_save_descriptor *)&h->data[h->cur];
1238
0
    if ( sizeof (*desc) > h->size - h->cur)
1239
0
    {
1240
0
        printk(XENLOG_G_WARNING
1241
0
               "HVM%d.%d restore: not enough data left to read xsave descriptor\n",
1242
0
               d->domain_id, vcpuid);
1243
0
        return -ENODATA;
1244
0
    }
1245
0
    if ( desc->length + sizeof (*desc) > h->size - h->cur)
1246
0
    {
1247
0
        printk(XENLOG_G_WARNING
1248
0
               "HVM%d.%d restore: not enough data left to read %u xsave bytes\n",
1249
0
               d->domain_id, vcpuid, desc->length);
1250
0
        return -ENODATA;
1251
0
    }
1252
0
    if ( desc->length < offsetof(struct hvm_hw_cpu_xsave, save_area) +
1253
0
                        XSTATE_AREA_MIN_SIZE )
1254
0
    {
1255
0
        printk(XENLOG_G_WARNING
1256
0
               "HVM%d.%d restore mismatch: xsave length %u < %zu\n",
1257
0
               d->domain_id, vcpuid, desc->length,
1258
0
               offsetof(struct hvm_hw_cpu_xsave,
1259
0
                        save_area) + XSTATE_AREA_MIN_SIZE);
1260
0
        return -EINVAL;
1261
0
    }
1262
0
    h->cur += sizeof (*desc);
1263
0
    desc_start = h->cur;
1264
0
1265
0
    ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur];
1266
0
    h->cur += desc->length;
1267
0
1268
0
    err = validate_xstate(ctxt->xcr0, ctxt->xcr0_accum,
1269
0
                          (const void *)&ctxt->save_area.xsave_hdr);
1270
0
    if ( err )
1271
0
    {
1272
0
        printk(XENLOG_G_WARNING
1273
0
               "HVM%d.%d restore: inconsistent xsave state (feat=%#"PRIx64
1274
0
               " accum=%#"PRIx64" xcr0=%#"PRIx64" bv=%#"PRIx64" err=%d)\n",
1275
0
               d->domain_id, vcpuid, ctxt->xfeature_mask, ctxt->xcr0_accum,
1276
0
               ctxt->xcr0, ctxt->save_area.xsave_hdr.xstate_bv, err);
1277
0
        return err;
1278
0
    }
1279
0
    size = HVM_CPU_XSAVE_SIZE(ctxt->xcr0_accum);
1280
0
    desc_length = desc->length;
1281
0
    if ( desc_length > size )
1282
0
    {
1283
0
        /*
1284
0
         * Xen 4.3.0, 4.2.3 and older used to send longer-than-needed
1285
0
         * xsave regions.  Permit loading the record if the extra data
1286
0
         * is all zero.
1287
0
         */
1288
0
        for ( i = size; i < desc->length; i++ )
1289
0
        {
1290
0
            if ( h->data[desc_start + i] )
1291
0
            {
1292
0
                printk(XENLOG_G_WARNING
1293
0
                       "HVM%d.%u restore mismatch: xsave length %#x > %#x (non-zero data at %#x)\n",
1294
0
                       d->domain_id, vcpuid, desc->length, size, i);
1295
0
                return -EOPNOTSUPP;
1296
0
            }
1297
0
        }
1298
0
        printk(XENLOG_G_WARNING
1299
0
               "HVM%d.%u restore mismatch: xsave length %#x > %#x\n",
1300
0
               d->domain_id, vcpuid, desc->length, size);
1301
0
        /* Rewind desc_length to ignore the extraneous zeros. */
1302
0
        desc_length = size;
1303
0
    }
1304
0
1305
0
    if ( xsave_area_compressed((const void *)&ctxt->save_area) )
1306
0
    {
1307
0
        printk(XENLOG_G_WARNING
1308
0
               "HVM%d.%u restore: compressed xsave state not supported\n",
1309
0
               d->domain_id, vcpuid);
1310
0
        return -EOPNOTSUPP;
1311
0
    }
1312
0
    else if ( desc_length != size )
1313
0
    {
1314
0
        printk(XENLOG_G_WARNING
1315
0
               "HVM%d.%u restore mismatch: xsave length %#x != %#x\n",
1316
0
               d->domain_id, vcpuid, desc_length, size);
1317
0
        return -EINVAL;
1318
0
    }
1319
0
    /* Checking finished */
1320
0
1321
0
    v->arch.xcr0 = ctxt->xcr0;
1322
0
    v->arch.xcr0_accum = ctxt->xcr0_accum;
1323
0
    if ( ctxt->xcr0_accum & XSTATE_NONLAZY )
1324
0
        v->arch.nonlazy_xstate_used = 1;
1325
0
    compress_xsave_states(v, &ctxt->save_area,
1326
0
                          size - offsetof(struct hvm_hw_cpu_xsave, save_area));
1327
0
1328
0
    return 0;
1329
0
}
1330
1331
0
#define HVM_CPU_MSR_SIZE(cnt) offsetof(struct hvm_msr, msr[cnt])
1332
static unsigned int __read_mostly msr_count_max;
1333
1334
static int hvm_save_cpu_msrs(struct domain *d, hvm_domain_context_t *h)
1335
0
{
1336
0
    struct vcpu *v;
1337
0
1338
0
    for_each_vcpu ( d, v )
1339
0
    {
1340
0
        struct hvm_msr *ctxt;
1341
0
        unsigned int i;
1342
0
1343
0
        if ( _hvm_init_entry(h, CPU_MSR_CODE, v->vcpu_id,
1344
0
                             HVM_CPU_MSR_SIZE(msr_count_max)) )
1345
0
            return 1;
1346
0
        ctxt = (struct hvm_msr *)&h->data[h->cur];
1347
0
        ctxt->count = 0;
1348
0
1349
0
        if ( hvm_funcs.save_msr )
1350
0
            hvm_funcs.save_msr(v, ctxt);
1351
0
1352
0
        ASSERT(ctxt->count <= msr_count_max);
1353
0
1354
0
        for ( i = 0; i < ctxt->count; ++i )
1355
0
            ctxt->msr[i]._rsvd = 0;
1356
0
1357
0
        if ( ctxt->count )
1358
0
            h->cur += HVM_CPU_MSR_SIZE(ctxt->count);
1359
0
        else
1360
0
            h->cur -= sizeof(struct hvm_save_descriptor);
1361
0
    }
1362
0
1363
0
    return 0;
1364
0
}
1365
1366
static int hvm_load_cpu_msrs(struct domain *d, hvm_domain_context_t *h)
1367
0
{
1368
0
    unsigned int i, vcpuid = hvm_load_instance(h);
1369
0
    struct vcpu *v;
1370
0
    const struct hvm_save_descriptor *desc;
1371
0
    struct hvm_msr *ctxt;
1372
0
    int err = 0;
1373
0
1374
0
    if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
1375
0
    {
1376
0
        dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n",
1377
0
                d->domain_id, vcpuid);
1378
0
        return -EINVAL;
1379
0
    }
1380
0
1381
0
    /* Customized checking for entry since our entry is of variable length */
1382
0
    desc = (struct hvm_save_descriptor *)&h->data[h->cur];
1383
0
    if ( sizeof (*desc) > h->size - h->cur)
1384
0
    {
1385
0
        printk(XENLOG_G_WARNING
1386
0
               "HVM%d.%d restore: not enough data left to read MSR descriptor\n",
1387
0
               d->domain_id, vcpuid);
1388
0
        return -ENODATA;
1389
0
    }
1390
0
    if ( desc->length + sizeof (*desc) > h->size - h->cur)
1391
0
    {
1392
0
        printk(XENLOG_G_WARNING
1393
0
               "HVM%d.%d restore: not enough data left to read %u MSR bytes\n",
1394
0
               d->domain_id, vcpuid, desc->length);
1395
0
        return -ENODATA;
1396
0
    }
1397
0
    if ( desc->length < HVM_CPU_MSR_SIZE(1) )
1398
0
    {
1399
0
        printk(XENLOG_G_WARNING
1400
0
               "HVM%d.%d restore mismatch: MSR length %u < %zu\n",
1401
0
               d->domain_id, vcpuid, desc->length, HVM_CPU_MSR_SIZE(1));
1402
0
        return -EINVAL;
1403
0
    }
1404
0
1405
0
    h->cur += sizeof(*desc);
1406
0
    ctxt = (struct hvm_msr *)&h->data[h->cur];
1407
0
    h->cur += desc->length;
1408
0
1409
0
    if ( desc->length != HVM_CPU_MSR_SIZE(ctxt->count) )
1410
0
    {
1411
0
        printk(XENLOG_G_WARNING
1412
0
               "HVM%d.%d restore mismatch: MSR length %u != %zu\n",
1413
0
               d->domain_id, vcpuid, desc->length,
1414
0
               HVM_CPU_MSR_SIZE(ctxt->count));
1415
0
        return -EOPNOTSUPP;
1416
0
    }
1417
0
1418
0
    for ( i = 0; i < ctxt->count; ++i )
1419
0
        if ( ctxt->msr[i]._rsvd )
1420
0
            return -EOPNOTSUPP;
1421
0
    /* Checking finished */
1422
0
1423
0
    if ( hvm_funcs.load_msr )
1424
0
        err = hvm_funcs.load_msr(v, ctxt);
1425
0
1426
0
    for ( i = 0; !err && i < ctxt->count; ++i )
1427
0
    {
1428
0
        switch ( ctxt->msr[i].index )
1429
0
        {
1430
0
        default:
1431
0
            if ( !ctxt->msr[i]._rsvd )
1432
0
                err = -ENXIO;
1433
0
            break;
1434
0
        }
1435
0
    }
1436
0
1437
0
    return err;
1438
0
}
1439
1440
/* We need variable length data chunks for XSAVE area and MSRs, hence
1441
 * a custom declaration rather than HVM_REGISTER_SAVE_RESTORE.
1442
 */
1443
static int __init hvm_register_CPU_save_and_restore(void)
1444
1
{
1445
1
    hvm_register_savevm(CPU_XSAVE_CODE,
1446
1
                        "CPU_XSAVE",
1447
1
                        hvm_save_cpu_xsave_states,
1448
1
                        hvm_load_cpu_xsave_states,
1449
1
                        HVM_CPU_XSAVE_SIZE(xfeature_mask) +
1450
1
                            sizeof(struct hvm_save_descriptor),
1451
1
                        HVMSR_PER_VCPU);
1452
1
1453
1
    if ( hvm_funcs.init_msr )
1454
1
        msr_count_max += hvm_funcs.init_msr();
1455
1
1456
1
    if ( msr_count_max )
1457
0
        hvm_register_savevm(CPU_MSR_CODE,
1458
0
                            "CPU_MSR",
1459
0
                            hvm_save_cpu_msrs,
1460
0
                            hvm_load_cpu_msrs,
1461
0
                            HVM_CPU_MSR_SIZE(msr_count_max) +
1462
0
                                sizeof(struct hvm_save_descriptor),
1463
0
                            HVMSR_PER_VCPU);
1464
1
1465
1
    return 0;
1466
1
}
1467
__initcall(hvm_register_CPU_save_and_restore);
1468
1469
int hvm_vcpu_initialise(struct vcpu *v)
1470
12
{
1471
12
    int rc;
1472
12
    struct domain *d = v->domain;
1473
12
1474
12
    hvm_asid_flush_vcpu(v);
1475
12
1476
12
    spin_lock_init(&v->arch.hvm_vcpu.tm_lock);
1477
12
    INIT_LIST_HEAD(&v->arch.hvm_vcpu.tm_list);
1478
12
1479
12
    rc = hvm_vcpu_cacheattr_init(v); /* teardown: vcpu_cacheattr_destroy */
1480
12
    if ( rc != 0 )
1481
0
        goto fail1;
1482
12
1483
12
    /* NB: vlapic_init must be called before hvm_funcs.vcpu_initialise */
1484
12
    rc = vlapic_init(v);
1485
12
    if ( rc != 0 ) /* teardown: vlapic_destroy */
1486
0
        goto fail2;
1487
12
1488
12
    if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 ) /* teardown: hvm_funcs.vcpu_destroy */
1489
0
        goto fail3;
1490
12
1491
12
    softirq_tasklet_init(
1492
12
        &v->arch.hvm_vcpu.assert_evtchn_irq_tasklet,
1493
12
        (void(*)(unsigned long))hvm_assert_evtchn_irq,
1494
12
        (unsigned long)v);
1495
12
1496
12
    v->arch.hvm_vcpu.inject_event.vector = HVM_EVENT_VECTOR_UNSET;
1497
12
1498
12
    rc = setup_compat_arg_xlat(v); /* teardown: free_compat_arg_xlat() */
1499
12
    if ( rc != 0 )
1500
0
        goto fail4;
1501
12
1502
12
    if ( nestedhvm_enabled(d)
1503
0
         && (rc = nestedhvm_vcpu_initialise(v)) < 0 ) /* teardown: nestedhvm_vcpu_destroy */
1504
0
        goto fail5;
1505
12
1506
12
    rc = hvm_all_ioreq_servers_add_vcpu(d, v);
1507
12
    if ( rc != 0 )
1508
0
        goto fail6;
1509
12
1510
12
    if ( v->vcpu_id == 0 )
1511
1
    {
1512
1
        /* NB. All these really belong in hvm_domain_initialise(). */
1513
1
        pmtimer_init(v);
1514
1
        hpet_init(d);
1515
1
 
1516
1
        /* Init guest TSC to start from zero. */
1517
1
        hvm_set_guest_tsc(v, 0);
1518
1
    }
1519
12
1520
12
    hvm_update_guest_vendor(v);
1521
12
1522
12
    return 0;
1523
12
1524
0
 fail6:
1525
0
    nestedhvm_vcpu_destroy(v);
1526
0
 fail5:
1527
0
    free_compat_arg_xlat(v);
1528
0
 fail4:
1529
0
    hvm_funcs.vcpu_destroy(v);
1530
0
 fail3:
1531
0
    vlapic_destroy(v);
1532
0
 fail2:
1533
0
    hvm_vcpu_cacheattr_destroy(v);
1534
0
 fail1:
1535
0
    return rc;
1536
0
}
1537
1538
void hvm_vcpu_destroy(struct vcpu *v)
1539
0
{
1540
0
    viridian_vcpu_deinit(v);
1541
0
1542
0
    hvm_all_ioreq_servers_remove_vcpu(v->domain, v);
1543
0
1544
0
    if ( hvm_altp2m_supported() )
1545
0
        altp2m_vcpu_destroy(v);
1546
0
1547
0
    nestedhvm_vcpu_destroy(v);
1548
0
1549
0
    free_compat_arg_xlat(v);
1550
0
1551
0
    tasklet_kill(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet);
1552
0
    hvm_funcs.vcpu_destroy(v);
1553
0
1554
0
    vlapic_destroy(v);
1555
0
1556
0
    hvm_vcpu_cacheattr_destroy(v);
1557
0
}
1558
1559
void hvm_vcpu_down(struct vcpu *v)
1560
0
{
1561
0
    struct domain *d = v->domain;
1562
0
    int online_count = 0;
1563
0
1564
0
    /* Doesn't halt us immediately, but we'll never return to guest context. */
1565
0
    set_bit(_VPF_down, &v->pause_flags);
1566
0
    vcpu_sleep_nosync(v);
1567
0
1568
0
    /* Any other VCPUs online? ... */
1569
0
    domain_lock(d);
1570
0
    for_each_vcpu ( d, v )
1571
0
        if ( !(v->pause_flags & VPF_down) )
1572
0
            online_count++;
1573
0
    domain_unlock(d);
1574
0
1575
0
    /* ... Shut down the domain if not. */
1576
0
    if ( online_count == 0 )
1577
0
    {
1578
0
        gdprintk(XENLOG_INFO, "All CPUs offline -- powering off.\n");
1579
0
        domain_shutdown(d, SHUTDOWN_poweroff);
1580
0
    }
1581
0
}
1582
1583
void hvm_hlt(unsigned int eflags)
1584
65.4k
{
1585
65.4k
    struct vcpu *curr = current;
1586
65.4k
1587
65.4k
    if ( hvm_event_pending(curr) )
1588
0
        return;
1589
65.4k
1590
65.4k
    /*
1591
65.4k
     * If we halt with interrupts disabled, that's a pretty sure sign that we
1592
65.4k
     * want to shut down. In a real processor, NMIs are the only way to break
1593
65.4k
     * out of this.
1594
65.4k
     */
1595
65.4k
    if ( unlikely(!(eflags & X86_EFLAGS_IF)) )
1596
0
        return hvm_vcpu_down(curr);
1597
65.4k
1598
65.4k
    do_sched_op(SCHEDOP_block, guest_handle_from_ptr(NULL, void));
1599
65.4k
1600
65.4k
    HVMTRACE_1D(HLT, /* pending = */ vcpu_runnable(curr));
1601
65.4k
}
1602
1603
void hvm_triple_fault(void)
1604
0
{
1605
0
    struct vcpu *v = current;
1606
0
    struct domain *d = v->domain;
1607
0
    u8 reason = d->arch.hvm_domain.params[HVM_PARAM_TRIPLE_FAULT_REASON];
1608
0
1609
0
    gprintk(XENLOG_INFO,
1610
0
            "Triple fault - invoking HVM shutdown action %d\n",
1611
0
            reason);
1612
0
    vcpu_show_execution_state(v);
1613
0
    domain_shutdown(d, reason);
1614
0
}
1615
1616
void hvm_inject_event(const struct x86_event *event)
1617
0
{
1618
0
    struct vcpu *curr = current;
1619
0
    const uint8_t vector = event->vector;
1620
0
    const bool has_ec = ((event->type == X86_EVENTTYPE_HW_EXCEPTION) &&
1621
0
                         (vector < 32) && ((TRAP_HAVE_EC & (1u << vector))));
1622
0
1623
0
    ASSERT(vector == event->vector); /* Confirm no truncation. */
1624
0
    if ( has_ec )
1625
0
        ASSERT(event->error_code != X86_EVENT_NO_EC);
1626
0
    else
1627
0
        ASSERT(event->error_code == X86_EVENT_NO_EC);
1628
0
1629
0
    if ( nestedhvm_enabled(curr->domain) &&
1630
0
         !nestedhvm_vmswitch_in_progress(curr) &&
1631
0
         nestedhvm_vcpu_in_guestmode(curr) &&
1632
0
         nhvm_vmcx_guest_intercepts_event(
1633
0
             curr, event->vector, event->error_code) )
1634
0
    {
1635
0
        enum nestedhvm_vmexits nsret;
1636
0
1637
0
        nsret = nhvm_vcpu_vmexit_event(curr, event);
1638
0
1639
0
        switch ( nsret )
1640
0
        {
1641
0
        case NESTEDHVM_VMEXIT_DONE:
1642
0
        case NESTEDHVM_VMEXIT_ERROR: /* L1 guest will crash L2 guest */
1643
0
            return;
1644
0
        case NESTEDHVM_VMEXIT_HOST:
1645
0
        case NESTEDHVM_VMEXIT_CONTINUE:
1646
0
        case NESTEDHVM_VMEXIT_FATALERROR:
1647
0
        default:
1648
0
            gdprintk(XENLOG_ERR, "unexpected nestedhvm error %i\n", nsret);
1649
0
            return;
1650
0
        }
1651
0
    }
1652
0
1653
0
    hvm_funcs.inject_event(event);
1654
0
}
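/*
 * Standalone sketch (not Xen's actual TRAP_HAVE_EC definition) of the
 * "does this vector push an error code?" test used above, built from the
 * architectural list of error-code exceptions: #DF(8), #TS(10), #NP(11),
 * #SS(12), #GP(13), #PF(14) and #AC(17).
 */
#include <stdbool.h>
#include <stdio.h>

#define EC_MASK ((1u << 8) | (1u << 10) | (1u << 11) | (1u << 12) | \
                 (1u << 13) | (1u << 14) | (1u << 17))

static bool vector_has_error_code(unsigned int vector)
{
    return vector < 32 && (EC_MASK & (1u << vector));
}

int main(void)
{
    printf("#PF(14): %d\n", vector_has_error_code(14)); /* 1 */
    printf("#UD(6):  %d\n", vector_has_error_code(6));  /* 0 */
    return 0;
}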
1655
1656
int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
1657
                              struct npfec npfec)
1658
60.1k
{
1659
60.1k
    unsigned long gfn = gpa >> PAGE_SHIFT;
1660
60.1k
    p2m_type_t p2mt;
1661
60.1k
    p2m_access_t p2ma;
1662
60.1k
    mfn_t mfn;
1663
60.1k
    struct vcpu *curr = current;
1664
60.1k
    struct domain *currd = curr->domain;
1665
60.1k
    struct p2m_domain *p2m, *hostp2m;
1666
60.1k
    int rc, fall_through = 0, paged = 0;
1667
60.1k
    int sharing_enomem = 0;
1668
60.1k
    vm_event_request_t *req_ptr = NULL;
1669
60.1k
    bool_t ap2m_active, sync = 0;
1670
60.1k
1671
60.1k
    /* On Nested Virtualization, walk the guest page table.
1672
60.1k
     * If this succeeds, all is fine.
1673
60.1k
     * If this fails, inject a nested page fault into the guest.
1674
60.1k
     */
1675
60.1k
    if ( nestedhvm_enabled(currd)
1676
0
        && nestedhvm_vcpu_in_guestmode(curr)
1677
0
        && nestedhvm_paging_mode_hap(curr) )
1678
0
    {
1679
0
        int rv;
1680
0
1681
0
        /* The vcpu is in guest mode and the l1 guest
1682
0
         * uses hap. That means 'gpa' is in l2 guest
1683
0
         * physical address space.
1684
0
         * Fix the nested p2m or inject nested page fault
1685
0
         * into l1 guest if not fixable. The algorithm is
1686
0
         * the same as for shadow paging.
1687
0
         */
1688
0
1689
0
         rv = nestedhvm_hap_nested_page_fault(curr, &gpa,
1690
0
                                              npfec.read_access,
1691
0
                                              npfec.write_access,
1692
0
                                              npfec.insn_fetch);
1693
0
        switch (rv) {
1694
0
        case NESTEDHVM_PAGEFAULT_DONE:
1695
0
        case NESTEDHVM_PAGEFAULT_RETRY:
1696
0
            return 1;
1697
0
        case NESTEDHVM_PAGEFAULT_L1_ERROR:
1698
0
            /* An error occurred while translating gpa from
1699
0
             * l2 guest address to l1 guest address. */
1700
0
            return 0;
1701
0
        case NESTEDHVM_PAGEFAULT_INJECT:
1702
0
            return -1;
1703
0
        case NESTEDHVM_PAGEFAULT_MMIO:
1704
0
            if ( !handle_mmio() )
1705
0
                hvm_inject_hw_exception(TRAP_gp_fault, 0);
1706
0
            return 1;
1707
0
        case NESTEDHVM_PAGEFAULT_L0_ERROR:
1708
0
            /* gpa is now translated to l1 guest address, update gfn. */
1709
0
            gfn = gpa >> PAGE_SHIFT;
1710
0
            break;
1711
0
        }
1712
0
    }
1713
60.1k
1714
60.1k
    /*
1715
60.1k
     * No need to do the P2M lookup for internally handled MMIO, benefiting
1716
60.1k
     * - 32-bit WinXP (& older Windows) on AMD CPUs for LAPIC accesses,
1717
60.1k
     * - newer Windows (like Server 2012) for HPET accesses.
1718
60.1k
     */
1719
60.1k
    if ( !nestedhvm_vcpu_in_guestmode(curr) && hvm_mmio_internal(gpa) )
1720
60.1k
    {
1721
60.1k
        if ( !handle_mmio_with_translation(gla, gpa >> PAGE_SHIFT, npfec) )
1722
0
            hvm_inject_hw_exception(TRAP_gp_fault, 0);
1723
60.1k
        rc = 1;
1724
60.1k
        goto out;
1725
60.1k
    }
1726
60.1k
1727
11
    ap2m_active = altp2m_active(currd);
1728
11
1729
11
    /*
1730
11
     * Take a lock on the host p2m speculatively, to avoid potential
1731
11
     * locking order problems later and to handle unshare etc.
1732
11
     */
1733
11
    hostp2m = p2m_get_hostp2m(currd);
1734
11
    mfn = get_gfn_type_access(hostp2m, gfn, &p2mt, &p2ma,
1735
11
                              P2M_ALLOC | (npfec.write_access ? P2M_UNSHARE : 0),
1736
11
                              NULL);
1737
11
1738
11
    if ( ap2m_active )
1739
0
    {
1740
0
        if ( p2m_altp2m_lazy_copy(curr, gpa, gla, npfec, &p2m) )
1741
0
        {
1742
0
            /* entry was lazily copied from host -- retry */
1743
0
            __put_gfn(hostp2m, gfn);
1744
0
            rc = 1;
1745
0
            goto out;
1746
0
        }
1747
0
1748
0
        mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, NULL);
1749
0
    }
1750
11
    else
1751
11
        p2m = hostp2m;
1752
11
1753
11
    /* Check access permissions first, then handle faults */
1754
11
    if ( !mfn_eq(mfn, INVALID_MFN) )
1755
0
    {
1756
0
        bool_t violation;
1757
0
1758
0
        /* If the access violates the permissions, then send it to vm_event */
1759
0
        switch (p2ma)
1760
0
        {
1761
0
        case p2m_access_n:
1762
0
        case p2m_access_n2rwx:
1763
0
        default:
1764
0
            violation = npfec.read_access || npfec.write_access || npfec.insn_fetch;
1765
0
            break;
1766
0
        case p2m_access_r:
1767
0
            violation = npfec.write_access || npfec.insn_fetch;
1768
0
            break;
1769
0
        case p2m_access_w:
1770
0
            violation = npfec.read_access || npfec.insn_fetch;
1771
0
            break;
1772
0
        case p2m_access_x:
1773
0
            violation = npfec.read_access || npfec.write_access;
1774
0
            break;
1775
0
        case p2m_access_rx:
1776
0
        case p2m_access_rx2rw:
1777
0
            violation = npfec.write_access;
1778
0
            break;
1779
0
        case p2m_access_wx:
1780
0
            violation = npfec.read_access;
1781
0
            break;
1782
0
        case p2m_access_rw:
1783
0
            violation = npfec.insn_fetch;
1784
0
            break;
1785
0
        case p2m_access_rwx:
1786
0
            violation = 0;
1787
0
            break;
1788
0
        }
1789
0
1790
0
        if ( violation )
1791
0
        {
1792
0
            /* Should #VE be emulated for this fault? */
1793
0
            if ( p2m_is_altp2m(p2m) && !cpu_has_vmx_virt_exceptions )
1794
0
            {
1795
0
                bool_t sve;
1796
0
1797
0
                p2m->get_entry(p2m, _gfn(gfn), &p2mt, &p2ma, 0, NULL, &sve);
1798
0
1799
0
                if ( !sve && altp2m_vcpu_emulate_ve(curr) )
1800
0
                {
1801
0
                    rc = 1;
1802
0
                    goto out_put_gfn;
1803
0
                }
1804
0
            }
1805
0
1806
0
            sync = p2m_mem_access_check(gpa, gla, npfec, &req_ptr);
1807
0
1808
0
            if ( !sync )
1809
0
                fall_through = 1;
1810
0
            else
1811
0
            {
1812
0
                /* Rights not promoted (aka. sync event), work here is done */
1813
0
                rc = 1;
1814
0
                goto out_put_gfn;
1815
0
            }
1816
0
        }
1817
0
    }
1818
11
1819
11
    /*
1820
11
     * If this GFN is emulated MMIO or marked as read-only, pass the fault
1821
11
     * to the mmio handler.
1822
11
     */
1823
11
    if ( (p2mt == p2m_mmio_dm) || 
1824
0
         (npfec.write_access &&
1825
0
          (p2m_is_discard_write(p2mt) || (p2mt == p2m_ioreq_server))) )
1826
11
    {
1827
11
        if ( !handle_mmio_with_translation(gla, gpa >> PAGE_SHIFT, npfec) )
1828
0
            hvm_inject_hw_exception(TRAP_gp_fault, 0);
1829
11
        rc = 1;
1830
11
        goto out_put_gfn;
1831
11
    }
1832
11
1833
11
    /* Check if the page has been paged out */
1834
0
    if ( p2m_is_paged(p2mt) || (p2mt == p2m_ram_paging_out) )
1835
0
        paged = 1;
1836
0
1837
0
    /* Mem sharing: unshare the page and try again */
1838
0
    if ( npfec.write_access && (p2mt == p2m_ram_shared) )
1839
0
    {
1840
0
        ASSERT(p2m_is_hostp2m(p2m));
1841
0
        sharing_enomem = 
1842
0
            (mem_sharing_unshare_page(currd, gfn, 0) < 0);
1843
0
        rc = 1;
1844
0
        goto out_put_gfn;
1845
0
    }
1846
0
 
1847
0
    /* Spurious fault? PoD and log-dirty also take this path. */
1848
0
    if ( p2m_is_ram(p2mt) )
1849
0
    {
1850
0
        rc = 1;
1851
0
        /*
1852
0
         * Page log dirty is always done with order 0. If this mfn resides in
1853
0
         * a large page, we do not change the type of other pages within that large
1854
0
         * page.
1855
0
         */
1856
0
        if ( npfec.write_access )
1857
0
        {
1858
0
            paging_mark_dirty(currd, mfn);
1859
0
            /*
1860
0
             * If p2m is really an altp2m, unlock here to avoid lock ordering
1861
0
             * violation when the change below is propagated from host p2m.
1862
0
             */
1863
0
            if ( ap2m_active )
1864
0
                __put_gfn(p2m, gfn);
1865
0
            p2m_change_type_one(currd, gfn, p2m_ram_logdirty, p2m_ram_rw);
1866
0
            __put_gfn(ap2m_active ? hostp2m : p2m, gfn);
1867
0
1868
0
            goto out;
1869
0
        }
1870
0
        goto out_put_gfn;
1871
0
    }
1872
0
1873
0
    if ( (p2mt == p2m_mmio_direct) && is_hardware_domain(currd) &&
1874
0
         npfec.write_access && npfec.present &&
1875
0
         (hvm_emulate_one_mmio(mfn_x(mfn), gla) == X86EMUL_OKAY) )
1876
0
    {
1877
0
        rc = 1;
1878
0
        goto out_put_gfn;
1879
0
    }
1880
0
1881
0
    /* If we fell through, the vcpu will retry now that access restrictions have
1882
0
     * been removed. It may fault again if the p2m entry type still requires it.
1883
0
     * Otherwise, this is an error condition. */
1884
0
    rc = fall_through;
1885
0
1886
11
 out_put_gfn:
1887
11
    __put_gfn(p2m, gfn);
1888
11
    if ( ap2m_active )
1889
0
        __put_gfn(hostp2m, gfn);
1890
60.1k
 out:
1891
60.1k
    /* All of these are delayed until we exit, since we might 
1892
60.1k
     * sleep on event ring wait queues, and we must not hold
1893
60.1k
     * locks in such circumstances. */
1894
60.1k
    if ( paged )
1895
0
        p2m_mem_paging_populate(currd, gfn);
1896
60.1k
    if ( sharing_enomem )
1897
0
    {
1898
0
        int rv;
1899
0
        if ( (rv = mem_sharing_notify_enomem(currd, gfn, 1)) < 0 )
1900
0
        {
1901
0
            gdprintk(XENLOG_ERR, "Domain %hu attempt to unshare "
1902
0
                     "gfn %lx, ENOMEM and no helper (rc %d)\n",
1903
0
                     currd->domain_id, gfn, rv);
1904
0
            /* Crash the domain */
1905
0
            rc = 0;
1906
0
        }
1907
0
    }
1908
60.1k
    if ( req_ptr )
1909
0
    {
1910
0
        if ( monitor_traps(curr, sync, req_ptr) < 0 )
1911
0
            rc = 0;
1912
0
1913
0
        xfree(req_ptr);
1914
0
    }
1915
60.1k
    return rc;
1916
11
}
1917
1918
int hvm_handle_xsetbv(u32 index, u64 new_bv)
1919
11
{
1920
11
    int rc;
1921
11
1922
11
    hvm_monitor_crX(XCR0, new_bv, current->arch.xcr0);
1923
11
1924
11
    rc = handle_xsetbv(index, new_bv);
1925
11
    if ( rc )
1926
0
        hvm_inject_hw_exception(TRAP_gp_fault, 0);
1927
11
1928
11
    return rc;
1929
11
}
1930
1931
int hvm_set_efer(uint64_t value)
1932
36
{
1933
36
    struct vcpu *v = current;
1934
36
    const char *errstr;
1935
36
1936
36
    value &= ~EFER_LMA;
1937
36
1938
36
    errstr = hvm_efer_valid(v, value, -1);
1939
36
    if ( errstr )
1940
0
    {
1941
0
        printk(XENLOG_G_WARNING
1942
0
               "%pv: Invalid EFER update: %#"PRIx64" -> %#"PRIx64" - %s\n",
1943
0
               v, v->arch.hvm_vcpu.guest_efer, value, errstr);
1944
0
        return X86EMUL_EXCEPTION;
1945
0
    }
1946
36
1947
36
    if ( ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_LME) &&
1948
12
         hvm_paging_enabled(v) )
1949
0
    {
1950
0
        gdprintk(XENLOG_WARNING,
1951
0
                 "Trying to change EFER.LME with paging enabled\n");
1952
0
        return X86EMUL_EXCEPTION;
1953
0
    }
1954
36
1955
36
    if ( (value & EFER_LME) && !(v->arch.hvm_vcpu.guest_efer & EFER_LME) )
1956
12
    {
1957
12
        struct segment_register cs;
1958
12
1959
12
        hvm_get_segment_register(v, x86_seg_cs, &cs);
1960
12
1961
12
        /*
1962
12
         * %cs may be loaded with both .D and .L set in legacy mode, and both
1963
12
         * are captured in the VMCS/VMCB.
1964
12
         *
1965
12
         * If a guest does this and then tries to transition into long mode,
1966
12
         * the vmentry from setting LME fails due to invalid guest state,
1967
12
         * because %cr0.PG is still clear.
1968
12
         *
1969
12
         * When LME becomes set, clobber %cs.L to keep the guest firmly in
1970
12
         * compatibility mode until it reloads %cs itself.
1971
12
         */
1972
12
        if ( cs.l )
1973
0
        {
1974
0
            cs.l = 0;
1975
0
            hvm_set_segment_register(v, x86_seg_cs, &cs);
1976
0
        }
1977
12
    }
1978
36
1979
36
    if ( nestedhvm_enabled(v->domain) && cpu_has_svm &&
1980
0
       ((value & EFER_SVME) == 0 ) &&
1981
0
       ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_SVME) )
1982
0
    {
1983
0
        /* Cleared EFER.SVME: Flush all nestedp2m tables */
1984
0
        p2m_flush_nestedp2m(v->domain);
1985
0
        nestedhvm_vcpu_reset(v);
1986
0
    }
1987
36
1988
36
    value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA;
1989
36
    v->arch.hvm_vcpu.guest_efer = value;
1990
36
    hvm_update_guest_efer(v);
1991
36
1992
36
    return X86EMUL_OKAY;
1993
36
}
1994
1995
/* Exit UC mode only if all VCPUs agree on MTRR/PAT and are not in no_fill. */
1996
static bool_t domain_exit_uc_mode(struct vcpu *v)
1997
0
{
1998
0
    struct domain *d = v->domain;
1999
0
    struct vcpu *vs;
2000
0
2001
0
    for_each_vcpu ( d, vs )
2002
0
    {
2003
0
        if ( (vs == v) || !vs->is_initialised )
2004
0
            continue;
2005
0
        if ( (vs->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) ||
2006
0
             mtrr_pat_not_equal(vs, v) )
2007
0
            return 0;
2008
0
    }
2009
0
2010
0
    return 1;
2011
0
}
2012
2013
static void hvm_set_uc_mode(struct vcpu *v, bool_t is_in_uc_mode)
2014
0
{
2015
0
    v->domain->arch.hvm_domain.is_in_uc_mode = is_in_uc_mode;
2016
0
    shadow_blow_tables_per_domain(v->domain);
2017
0
}
2018
2019
int hvm_mov_to_cr(unsigned int cr, unsigned int gpr)
2020
2.93k
{
2021
2.93k
    struct vcpu *curr = current;
2022
2.93k
    unsigned long val, *reg;
2023
2.93k
    int rc;
2024
2.93k
2025
2.93k
    if ( (reg = decode_register(gpr, guest_cpu_user_regs(), 0)) == NULL )
2026
0
    {
2027
0
        gdprintk(XENLOG_ERR, "invalid gpr: %u\n", gpr);
2028
0
        goto exit_and_crash;
2029
0
    }
2030
2.93k
2031
2.93k
    val = *reg;
2032
2.93k
    HVMTRACE_LONG_2D(CR_WRITE, cr, TRC_PAR_LONG(val));
2033
2.93k
    HVM_DBG_LOG(DBG_LEVEL_1, "CR%u, value = %lx", cr, val);
2034
2.93k
2035
2.93k
    switch ( cr )
2036
2.93k
    {
2037
2.83k
    case 0:
2038
2.83k
        rc = hvm_set_cr0(val, 1);
2039
2.83k
        break;
2040
2.83k
2041
0
    case 3:
2042
0
        rc = hvm_set_cr3(val, 1);
2043
0
        break;
2044
2.83k
2045
98
    case 4:
2046
98
        rc = hvm_set_cr4(val, 1);
2047
98
        break;
2048
2.83k
2049
0
    case 8:
2050
0
        vlapic_set_reg(vcpu_vlapic(curr), APIC_TASKPRI, ((val & 0x0f) << 4));
2051
0
        rc = X86EMUL_OKAY;
2052
0
        break;
2053
2.83k
2054
0
    default:
2055
0
        gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2056
0
        goto exit_and_crash;
2057
2.93k
    }
2058
2.93k
2059
2.93k
    if ( rc == X86EMUL_EXCEPTION )
2060
0
        hvm_inject_hw_exception(TRAP_gp_fault, 0);
2061
2.93k
2062
2.93k
    return rc;
2063
2.93k
2064
0
 exit_and_crash:
2065
0
    domain_crash(curr->domain);
2066
0
    return X86EMUL_UNHANDLEABLE;
2067
2.93k
}
2068
2069
int hvm_mov_from_cr(unsigned int cr, unsigned int gpr)
2070
0
{
2071
0
    struct vcpu *curr = current;
2072
0
    unsigned long val = 0, *reg;
2073
0
2074
0
    if ( (reg = decode_register(gpr, guest_cpu_user_regs(), 0)) == NULL )
2075
0
    {
2076
0
        gdprintk(XENLOG_ERR, "invalid gpr: %u\n", gpr);
2077
0
        goto exit_and_crash;
2078
0
    }
2079
0
2080
0
    switch ( cr )
2081
0
    {
2082
0
    case 0:
2083
0
    case 2:
2084
0
    case 3:
2085
0
    case 4:
2086
0
        val = curr->arch.hvm_vcpu.guest_cr[cr];
2087
0
        break;
2088
0
    case 8:
2089
0
        val = (vlapic_get_reg(vcpu_vlapic(curr), APIC_TASKPRI) & 0xf0) >> 4;
2090
0
        break;
2091
0
    default:
2092
0
        gdprintk(XENLOG_ERR, "invalid cr: %u\n", cr);
2093
0
        goto exit_and_crash;
2094
0
    }
2095
0
2096
0
    *reg = val;
2097
0
    HVMTRACE_LONG_2D(CR_READ, cr, TRC_PAR_LONG(val));
2098
0
    HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%u, value = %lx", cr, val);
2099
0
2100
0
    return X86EMUL_OKAY;
2101
0
2102
0
 exit_and_crash:
2103
0
    domain_crash(curr->domain);
2104
0
    return X86EMUL_UNHANDLEABLE;
2105
0
}
2106
2107
void hvm_shadow_handle_cd(struct vcpu *v, unsigned long value)
2108
0
{
2109
0
    if ( value & X86_CR0_CD )
2110
0
    {
2111
0
        /* Entering no fill cache mode. */
2112
0
        spin_lock(&v->domain->arch.hvm_domain.uc_lock);
2113
0
        v->arch.hvm_vcpu.cache_mode = NO_FILL_CACHE_MODE;
2114
0
2115
0
        if ( !v->domain->arch.hvm_domain.is_in_uc_mode )
2116
0
        {
2117
0
            domain_pause_nosync(v->domain);
2118
0
2119
0
            /* Flush physical caches. */
2120
0
            flush_all(FLUSH_CACHE);
2121
0
            hvm_set_uc_mode(v, 1);
2122
0
2123
0
            domain_unpause(v->domain);
2124
0
        }
2125
0
        spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
2126
0
    }
2127
0
    else if ( !(value & X86_CR0_CD) &&
2128
0
              (v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) )
2129
0
    {
2130
0
        /* Exit from no fill cache mode. */
2131
0
        spin_lock(&v->domain->arch.hvm_domain.uc_lock);
2132
0
        v->arch.hvm_vcpu.cache_mode = NORMAL_CACHE_MODE;
2133
0
2134
0
        if ( domain_exit_uc_mode(v) )
2135
0
            hvm_set_uc_mode(v, 0);
2136
0
2137
0
        spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
2138
0
    }
2139
0
}
2140
2141
static void hvm_update_cr(struct vcpu *v, unsigned int cr, unsigned long value)
2142
3.96k
{
2143
3.96k
    v->arch.hvm_vcpu.guest_cr[cr] = value;
2144
3.96k
    nestedhvm_set_cr(v, cr, value);
2145
3.96k
    hvm_update_guest_cr(v, cr);
2146
3.96k
}
2147
2148
int hvm_set_cr0(unsigned long value, bool_t may_defer)
2149
3.87k
{
2150
3.87k
    struct vcpu *v = current;
2151
3.87k
    struct domain *d = v->domain;
2152
3.87k
    unsigned long gfn, old_value = v->arch.hvm_vcpu.guest_cr[0];
2153
3.87k
    struct page_info *page;
2154
3.87k
2155
3.87k
    HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);
2156
3.87k
2157
3.87k
    if ( (u32)value != value )
2158
0
    {
2159
0
        HVM_DBG_LOG(DBG_LEVEL_1,
2160
0
                    "Guest attempts to set upper 32 bits in CR0: %lx",
2161
0
                    value);
2162
0
        return X86EMUL_EXCEPTION;
2163
0
    }
2164
3.87k
2165
3.87k
    value &= ~HVM_CR0_GUEST_RESERVED_BITS;
2166
3.87k
2167
3.87k
    /* ET is reserved and should always be 1. */
2168
3.87k
    value |= X86_CR0_ET;
2169
3.87k
2170
3.87k
    if ( !nestedhvm_vmswitch_in_progress(v) &&
2171
3.87k
         (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PG )
2172
0
        return X86EMUL_EXCEPTION;
2173
3.87k
2174
3.87k
    if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
2175
3.87k
                               monitor_ctrlreg_bitmask(VM_EVENT_X86_CR0)) )
2176
0
    {
2177
0
        ASSERT(v->arch.vm_event);
2178
0
2179
0
        if ( hvm_monitor_crX(CR0, value, old_value) )
2180
0
        {
2181
0
            /* The actual write will occur in hvm_do_resume(), if permitted. */
2182
0
            v->arch.vm_event->write_data.do_write.cr0 = 1;
2183
0
            v->arch.vm_event->write_data.cr0 = value;
2184
0
2185
0
            return X86EMUL_OKAY;
2186
0
        }
2187
0
    }
2188
3.87k
2189
3.87k
    if ( (value & X86_CR0_PG) && !(old_value & X86_CR0_PG) )
2190
12
    {
2191
12
        if ( v->arch.hvm_vcpu.guest_efer & EFER_LME )
2192
12
        {
2193
12
            if ( !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PAE) &&
2194
0
                 !nestedhvm_vmswitch_in_progress(v) )
2195
0
            {
2196
0
                HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable");
2197
0
                return X86EMUL_EXCEPTION;
2198
0
            }
2199
12
            HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode");
2200
12
            v->arch.hvm_vcpu.guest_efer |= EFER_LMA;
2201
12
            hvm_update_guest_efer(v);
2202
12
        }
2203
12
2204
12
        if ( !paging_mode_hap(d) )
2205
0
        {
2206
0
            /* The guest CR3 must be pointing to the guest physical address. */
2207
0
            gfn = v->arch.hvm_vcpu.guest_cr[3]>>PAGE_SHIFT;
2208
0
            page = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC);
2209
0
            if ( !page )
2210
0
            {
2211
0
                gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx\n",
2212
0
                         v->arch.hvm_vcpu.guest_cr[3]);
2213
0
                domain_crash(d);
2214
0
                return X86EMUL_UNHANDLEABLE;
2215
0
            }
2216
0
2217
0
            /* Now arch.guest_table points to machine physical. */
2218
0
            v->arch.guest_table = pagetable_from_page(page);
2219
0
2220
0
            HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
2221
0
                        v->arch.hvm_vcpu.guest_cr[3], page_to_mfn(page));
2222
0
        }
2223
12
    }
2224
3.86k
    else if ( !(value & X86_CR0_PG) && (old_value & X86_CR0_PG) )
2225
0
    {
2226
0
        if ( hvm_pcid_enabled(v) )
2227
0
        {
2228
0
            HVM_DBG_LOG(DBG_LEVEL_1, "Guest attempts to clear CR0.PG "
2229
0
                        "while CR4.PCIDE=1");
2230
0
            return X86EMUL_EXCEPTION;
2231
0
        }
2232
0
2233
0
        /* When CR0.PG is cleared, LMA is cleared immediately. */
2234
0
        if ( hvm_long_mode_active(v) )
2235
0
        {
2236
0
            v->arch.hvm_vcpu.guest_efer &= ~EFER_LMA;
2237
0
            hvm_update_guest_efer(v);
2238
0
        }
2239
0
2240
0
        if ( !paging_mode_hap(d) )
2241
0
        {
2242
0
            put_page(pagetable_get_page(v->arch.guest_table));
2243
0
            v->arch.guest_table = pagetable_null();
2244
0
        }
2245
0
    }
2246
3.87k
2247
3.87k
    if ( ((value ^ old_value) & X86_CR0_CD) &&
2248
45
         iommu_enabled && hvm_funcs.handle_cd &&
2249
44
         (!rangeset_is_empty(d->iomem_caps) ||
2250
0
          !rangeset_is_empty(d->arch.ioport_caps) ||
2251
0
          has_arch_pdevs(d)) )
2252
45
        hvm_funcs.handle_cd(v, value);
2253
3.87k
2254
3.87k
    hvm_update_cr(v, 0, value);
2255
3.87k
2256
3.87k
    if ( (value ^ old_value) & X86_CR0_PG ) {
2257
12
        if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) )
2258
0
            paging_update_nestedmode(v);
2259
12
        else
2260
12
            paging_update_paging_modes(v);
2261
12
    }
2262
3.87k
2263
3.87k
    return X86EMUL_OKAY;
2264
3.87k
}
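/*
 * Standalone model (not part of hvm.c) of the long-mode rule enforced by
 * hvm_set_cr0() above: setting CR0.PG while EFER.LME is set requires
 * CR4.PAE and activates long mode (EFER.LMA); clearing CR0.PG drops LMA
 * again.  The struct and helper below are hypothetical.
 */
#include <assert.h>
#include <stdbool.h>

struct cpu_model { bool pg, pae, lme, lma; };

/* Returns false for the case hvm_set_cr0() rejects with an exception. */
static bool model_write_cr0_pg(struct cpu_model *c, bool new_pg)
{
    if ( new_pg && !c->pg && c->lme )
    {
        if ( !c->pae )
            return false;       /* "Enable paging before PAE enable" */
        c->lma = true;          /* "Enabling long mode" */
    }
    else if ( !new_pg && c->pg )
        c->lma = false;         /* LMA is cleared immediately with PG */

    c->pg = new_pg;
    return true;
}

int main(void)
{
    struct cpu_model c = { .pae = true, .lme = true };

    assert(model_write_cr0_pg(&c, true) && c.lma);   /* long mode on */
    assert(model_write_cr0_pg(&c, false) && !c.lma); /* and off again */
    return 0;
}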
2265
2266
int hvm_set_cr3(unsigned long value, bool_t may_defer)
2267
0
{
2268
0
    struct vcpu *v = current;
2269
0
    struct page_info *page;
2270
0
    unsigned long old = v->arch.hvm_vcpu.guest_cr[3];
2271
0
2272
0
    if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
2273
0
                               monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
2274
0
    {
2275
0
        ASSERT(v->arch.vm_event);
2276
0
2277
0
        if ( hvm_monitor_crX(CR3, value, old) )
2278
0
        {
2279
0
            /* The actual write will occur in hvm_do_resume(), if permitted. */
2280
0
            v->arch.vm_event->write_data.do_write.cr3 = 1;
2281
0
            v->arch.vm_event->write_data.cr3 = value;
2282
0
2283
0
            return X86EMUL_OKAY;
2284
0
        }
2285
0
    }
2286
0
2287
0
    if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
2288
0
         (value != v->arch.hvm_vcpu.guest_cr[3]) )
2289
0
    {
2290
0
        /* Shadow-mode CR3 change. Check PDBR and update refcounts. */
2291
0
        HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
2292
0
        page = get_page_from_gfn(v->domain, value >> PAGE_SHIFT,
2293
0
                                 NULL, P2M_ALLOC);
2294
0
        if ( !page )
2295
0
            goto bad_cr3;
2296
0
2297
0
        put_page(pagetable_get_page(v->arch.guest_table));
2298
0
        v->arch.guest_table = pagetable_from_page(page);
2299
0
2300
0
        HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
2301
0
    }
2302
0
2303
0
    v->arch.hvm_vcpu.guest_cr[3] = value;
2304
0
    paging_update_cr3(v);
2305
0
    return X86EMUL_OKAY;
2306
0
2307
0
 bad_cr3:
2308
0
    gdprintk(XENLOG_ERR, "Invalid CR3\n");
2309
0
    domain_crash(v->domain);
2310
0
    return X86EMUL_UNHANDLEABLE;
2311
0
}
2312
2313
int hvm_set_cr4(unsigned long value, bool_t may_defer)
2314
98
{
2315
98
    struct vcpu *v = current;
2316
98
    unsigned long old_cr;
2317
98
2318
98
    if ( value & ~hvm_cr4_guest_valid_bits(v, 0) )
2319
0
    {
2320
0
        HVM_DBG_LOG(DBG_LEVEL_1,
2321
0
                    "Guest attempts to set reserved bit in CR4: %lx",
2322
0
                    value);
2323
0
        return X86EMUL_EXCEPTION;
2324
0
    }
2325
98
2326
98
    if ( !(value & X86_CR4_PAE) )
2327
0
    {
2328
0
        if ( hvm_long_mode_active(v) )
2329
0
        {
2330
0
            HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
2331
0
                        "EFER.LMA is set");
2332
0
            return X86EMUL_EXCEPTION;
2333
0
        }
2334
0
    }
2335
98
2336
98
    old_cr = v->arch.hvm_vcpu.guest_cr[4];
2337
98
2338
98
    if ( (value & X86_CR4_PCIDE) && !(old_cr & X86_CR4_PCIDE) &&
2339
12
         (!hvm_long_mode_active(v) ||
2340
12
          (v->arch.hvm_vcpu.guest_cr[3] & 0xfff)) )
2341
0
    {
2342
0
        HVM_DBG_LOG(DBG_LEVEL_1, "Guest attempts to change CR4.PCIDE from "
2343
0
                    "0 to 1 while either EFER.LMA=0 or CR3[11:0]!=000H");
2344
0
        return X86EMUL_EXCEPTION;
2345
0
    }
2346
98
2347
98
    if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
2348
98
                               monitor_ctrlreg_bitmask(VM_EVENT_X86_CR4)) )
2349
0
    {
2350
0
        ASSERT(v->arch.vm_event);
2351
0
2352
0
        if ( hvm_monitor_crX(CR4, value, old_cr) )
2353
0
        {
2354
0
            /* The actual write will occur in hvm_do_resume(), if permitted. */
2355
0
            v->arch.vm_event->write_data.do_write.cr4 = 1;
2356
0
            v->arch.vm_event->write_data.cr4 = value;
2357
0
2358
0
            return X86EMUL_OKAY;
2359
0
        }
2360
0
    }
2361
98
2362
98
    hvm_update_cr(v, 4, value);
2363
98
2364
98
    /*
2365
98
     * Modifying CR4.{PSE,PAE,PGE,SMEP}, or clearing CR4.PCIDE
2366
98
     * invalidates all TLB entries.
2367
98
     */
2368
98
    if ( ((old_cr ^ value) &
2369
98
          (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE | X86_CR4_SMEP)) ||
2370
37
         (!(value & X86_CR4_PCIDE) && (old_cr & X86_CR4_PCIDE)) )
2371
61
    {
2372
61
        if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) )
2373
0
            paging_update_nestedmode(v);
2374
61
        else
2375
61
            paging_update_paging_modes(v);
2376
61
    }
2377
98
2378
98
    /*
2379
98
     * {RD,WR}PKRU are not gated on XCR0.PKRU and hence an oddly behaving
2380
98
     * guest may enable the feature in CR4 without enabling it in XCR0. We
2381
98
     * need to context switch / migrate PKRU nevertheless.
2382
98
     */
2383
98
    if ( (value & X86_CR4_PKE) && !(v->arch.xcr0_accum & XSTATE_PKRU) )
2384
0
    {
2385
0
        int rc = handle_xsetbv(XCR_XFEATURE_ENABLED_MASK,
2386
0
                               get_xcr0() | XSTATE_PKRU);
2387
0
2388
0
        if ( rc )
2389
0
        {
2390
0
            HVM_DBG_LOG(DBG_LEVEL_1, "Failed to force XCR0.PKRU: %d", rc);
2391
0
            return X86EMUL_EXCEPTION;
2392
0
        }
2393
0
2394
0
        if ( handle_xsetbv(XCR_XFEATURE_ENABLED_MASK,
2395
0
                           get_xcr0() & ~XSTATE_PKRU) )
2396
0
            /* nothing, best effort only */;
2397
0
    }
2398
98
2399
98
    return X86EMUL_OKAY;
2400
98
}
2401
2402
bool_t hvm_virtual_to_linear_addr(
2403
    enum x86_segment seg,
2404
    const struct segment_register *reg,
2405
    unsigned long offset,
2406
    unsigned int bytes,
2407
    enum hvm_access_type access_type,
2408
    const struct segment_register *active_cs,
2409
    unsigned long *linear_addr)
2410
120k
{
2411
120k
    const struct vcpu *curr = current;
2412
120k
    unsigned long addr = offset, last_byte;
2413
120k
    bool_t okay = 0;
2414
120k
2415
120k
    /*
2416
120k
     * These checks are for a memory access through an active segment.
2417
120k
     *
2418
120k
     * It is expected that the access rights of reg are suitable for seg (and
2419
120k
     * that this is enforced at the point that seg is loaded).
2420
120k
     */
2421
120k
    ASSERT(seg < x86_seg_none);
2422
120k
2423
120k
    if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) ||
2424
120k
         (guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
2425
0
    {
2426
0
        /*
2427
0
         * REAL/VM86 MODE: Don't bother with segment access checks.
2428
0
         * Certain of them are not done in native real mode anyway.
2429
0
         */
2430
0
        addr = (uint32_t)(addr + reg->base);
2431
0
        last_byte = (uint32_t)addr + bytes - !!bytes;
2432
0
        if ( last_byte < addr )
2433
0
            goto out;
2434
0
    }
2435
120k
    else if ( hvm_long_mode_active(curr) &&
2436
120k
              (is_x86_system_segment(seg) || active_cs->l) )
2437
120k
    {
2438
120k
        /*
2439
120k
         * User segments are always treated as present.  System segments may
2440
120k
         * not be, and also incur limit checks.
2441
120k
         */
2442
120k
        if ( is_x86_system_segment(seg) &&
2443
0
             (!reg->p || (offset + bytes - !!bytes) > reg->limit) )
2444
0
            goto out;
2445
120k
2446
120k
        /*
2447
120k
         * LONG MODE: FS, GS and system segments: add segment base. All
2448
120k
         * addresses must be canonical.
2449
120k
         */
2450
120k
        if ( seg >= x86_seg_fs )
2451
0
            addr += reg->base;
2452
120k
2453
120k
        last_byte = addr + bytes - !!bytes;
2454
120k
        if ( !is_canonical_address(addr) || last_byte < addr ||
2455
120k
             !is_canonical_address(last_byte) )
2456
0
            goto out;
2457
120k
    }
2458
120k
    else
2459
0
    {
2460
0
        /*
2461
0
         * PROTECTED/COMPATIBILITY MODE: Apply segment checks and add base.
2462
0
         */
2463
0
2464
0
        /*
2465
0
         * Hardware truncates to 32 bits in compatibility mode.
2466
0
         * It does not truncate to 16 bits in 16-bit address-size mode.
2467
0
         */
2468
0
        addr = (uint32_t)(addr + reg->base);
2469
0
2470
0
        /* Segment not valid for use (cooked meaning of .p)? */
2471
0
        if ( !reg->p )
2472
0
            goto out;
2473
0
2474
0
        /* Read/write restrictions only exist for user segments. */
2475
0
        if ( reg->s )
2476
0
        {
2477
0
            switch ( access_type )
2478
0
            {
2479
0
            case hvm_access_read:
2480
0
                if ( (reg->type & 0xa) == 0x8 )
2481
0
                    goto out; /* execute-only code segment */
2482
0
                break;
2483
0
            case hvm_access_write:
2484
0
                if ( (reg->type & 0xa) != 0x2 )
2485
0
                    goto out; /* not a writable data segment */
2486
0
                break;
2487
0
            default:
2488
0
                break;
2489
0
            }
2490
0
        }
2491
0
2492
0
        last_byte = (uint32_t)offset + bytes - !!bytes;
2493
0
2494
0
        /* Is this a grows-down data segment? Special limit check if so. */
2495
0
        if ( reg->s && (reg->type & 0xc) == 0x4 )
2496
0
        {
2497
0
            /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
2498
0
            if ( !reg->db )
2499
0
                last_byte = (uint16_t)last_byte;
2500
0
2501
0
            /* Check first byte and last byte against respective bounds. */
2502
0
            if ( (offset <= reg->limit) || (last_byte < offset) )
2503
0
                goto out;
2504
0
        }
2505
0
        else if ( (last_byte > reg->limit) || (last_byte < offset) )
2506
0
            goto out; /* last byte is beyond limit or wraps 0xFFFFFFFF */
2507
0
    }
2508
120k
2509
120k
    /* All checks ok. */
2510
120k
    okay = 1;
2511
120k
2512
120k
 out:
2513
120k
    /*
2514
120k
     * Always return the correct linear address, even if a permission check
2515
120k
     * failed.  The permissions failure is not relevant to some callers.
2516
120k
     */
2517
120k
    *linear_addr = addr;
2518
120k
    return okay;
2519
120k
}
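/*
 * Standalone sketch (not part of hvm.c) of the protected-mode limit check
 * above.  "bytes - !!bytes" makes last_byte equal offset for zero-length
 * accesses, and expand-down data segments invert the limit test: valid
 * offsets lie strictly above the limit, up to 0xFFFF or 0xFFFFFFFF.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct seg_model { uint32_t limit; bool expand_down, db; };

static bool limit_check_ok(const struct seg_model *s,
                           uint32_t offset, uint32_t bytes)
{
    uint32_t last_byte = offset + bytes - !!bytes;

    if ( s->expand_down )
    {
        if ( !s->db )                    /* 16-bit upper bound */
            last_byte = (uint16_t)last_byte;
        return offset > s->limit && last_byte >= offset;
    }

    /* Ordinary segment: fault beyond the limit or on wrap-around. */
    return last_byte <= s->limit && last_byte >= offset;
}

int main(void)
{
    struct seg_model flat  = { .limit = 0xffffffff };
    struct seg_model stack = { .limit = 0xfff, .expand_down = true, .db = true };

    printf("%d\n", limit_check_ok(&flat, 0x1000, 8));  /* 1: within limit */
    printf("%d\n", limit_check_ok(&stack, 0x800, 4));  /* 0: at/below an expand-down limit */
    printf("%d\n", limit_check_ok(&stack, 0x2000, 4)); /* 1: above it */
    return 0;
}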
2520
2521
struct hvm_write_map {
2522
    struct list_head list;
2523
    struct page_info *page;
2524
};
2525
2526
/* On non-NULL return, we leave this function holding an additional 
2527
 * ref on the underlying mfn, if any */
2528
static void *_hvm_map_guest_frame(unsigned long gfn, bool_t permanent,
2529
                                  bool_t *writable)
2530
0
{
2531
0
    void *map;
2532
0
    p2m_type_t p2mt;
2533
0
    struct page_info *page;
2534
0
    struct domain *d = current->domain;
2535
0
2536
0
    page = get_page_from_gfn(d, gfn, &p2mt,
2537
0
                             writable ? P2M_UNSHARE : P2M_ALLOC);
2538
0
    if ( (p2m_is_shared(p2mt) && writable) || !page )
2539
0
    {
2540
0
        if ( page )
2541
0
            put_page(page);
2542
0
        return NULL;
2543
0
    }
2544
0
    if ( p2m_is_paging(p2mt) )
2545
0
    {
2546
0
        put_page(page);
2547
0
        p2m_mem_paging_populate(d, gfn);
2548
0
        return NULL;
2549
0
    }
2550
0
2551
0
    if ( writable )
2552
0
    {
2553
0
        if ( unlikely(p2m_is_discard_write(p2mt)) )
2554
0
            *writable = 0;
2555
0
        else if ( !permanent )
2556
0
            paging_mark_dirty(d, _mfn(page_to_mfn(page)));
2557
0
    }
2558
0
2559
0
    if ( !permanent )
2560
0
        return __map_domain_page(page);
2561
0
2562
0
    if ( writable && *writable )
2563
0
    {
2564
0
        struct hvm_write_map *track = xmalloc(struct hvm_write_map);
2565
0
2566
0
        if ( !track )
2567
0
        {
2568
0
            put_page(page);
2569
0
            return NULL;
2570
0
        }
2571
0
        track->page = page;
2572
0
        spin_lock(&d->arch.hvm_domain.write_map.lock);
2573
0
        list_add_tail(&track->list, &d->arch.hvm_domain.write_map.list);
2574
0
        spin_unlock(&d->arch.hvm_domain.write_map.lock);
2575
0
    }
2576
0
2577
0
    map = __map_domain_page_global(page);
2578
0
    if ( !map )
2579
0
        put_page(page);
2580
0
2581
0
    return map;
2582
0
}
2583
2584
void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent,
2585
                             bool_t *writable)
2586
0
{
2587
0
    *writable = 1;
2588
0
    return _hvm_map_guest_frame(gfn, permanent, writable);
2589
0
}
2590
2591
void *hvm_map_guest_frame_ro(unsigned long gfn, bool_t permanent)
2592
0
{
2593
0
    return _hvm_map_guest_frame(gfn, permanent, NULL);
2594
0
}
2595
2596
void hvm_unmap_guest_frame(void *p, bool_t permanent)
2597
0
{
2598
0
    unsigned long mfn;
2599
0
    struct page_info *page;
2600
0
2601
0
    if ( !p )
2602
0
        return;
2603
0
2604
0
    mfn = domain_page_map_to_mfn(p);
2605
0
    page = mfn_to_page(mfn);
2606
0
2607
0
    if ( !permanent )
2608
0
        unmap_domain_page(p);
2609
0
    else
2610
0
    {
2611
0
        struct domain *d = page_get_owner(page);
2612
0
        struct hvm_write_map *track;
2613
0
2614
0
        unmap_domain_page_global(p);
2615
0
        spin_lock(&d->arch.hvm_domain.write_map.lock);
2616
0
        list_for_each_entry(track, &d->arch.hvm_domain.write_map.list, list)
2617
0
            if ( track->page == page )
2618
0
            {
2619
0
                paging_mark_dirty(d, _mfn(mfn));
2620
0
                list_del(&track->list);
2621
0
                xfree(track);
2622
0
                break;
2623
0
            }
2624
0
        spin_unlock(&d->arch.hvm_domain.write_map.lock);
2625
0
    }
2626
0
2627
0
    put_page(page);
2628
0
}
2629
2630
void hvm_mapped_guest_frames_mark_dirty(struct domain *d)
2631
0
{
2632
0
    struct hvm_write_map *track;
2633
0
2634
0
    spin_lock(&d->arch.hvm_domain.write_map.lock);
2635
0
    list_for_each_entry(track, &d->arch.hvm_domain.write_map.list, list)
2636
0
        paging_mark_dirty(d, _mfn(page_to_mfn(track->page)));
2637
0
    spin_unlock(&d->arch.hvm_domain.write_map.lock);
2638
0
}
2639
2640
static void *hvm_map_entry(unsigned long va, bool_t *writable)
2641
0
{
2642
0
    unsigned long gfn;
2643
0
    uint32_t pfec;
2644
0
    char *v;
2645
0
2646
0
    if ( ((va & ~PAGE_MASK) + 8) > PAGE_SIZE )
2647
0
    {
2648
0
        gdprintk(XENLOG_ERR, "Descriptor table entry "
2649
0
                 "straddles page boundary\n");
2650
0
        goto fail;
2651
0
    }
2652
0
2653
0
    /*
2654
0
     * We're mapping on behalf of the segment-load logic, which might write
2655
0
     * the accessed flags in the descriptors (in 32-bit mode), but we still
2656
0
     * treat it as a kernel-mode read (i.e. no access checks).
2657
0
     */
2658
0
    pfec = PFEC_page_present;
2659
0
    gfn = paging_gva_to_gfn(current, va, &pfec);
2660
0
    if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
2661
0
        goto fail;
2662
0
2663
0
    v = hvm_map_guest_frame_rw(gfn, 0, writable);
2664
0
    if ( v == NULL )
2665
0
        goto fail;
2666
0
2667
0
    return v + (va & ~PAGE_MASK);
2668
0
2669
0
 fail:
2670
0
    domain_crash(current->domain);
2671
0
    return NULL;
2672
0
}
2673
2674
static void hvm_unmap_entry(void *p)
2675
0
{
2676
0
    hvm_unmap_guest_frame(p, 0);
2677
0
}
2678
2679
static int hvm_load_segment_selector(
2680
    enum x86_segment seg, uint16_t sel, unsigned int cpl, unsigned int eflags)
2681
0
{
2682
0
    struct segment_register desctab, segr;
2683
0
    struct desc_struct *pdesc, desc;
2684
0
    u8 dpl, rpl;
2685
0
    bool_t writable;
2686
0
    int fault_type = TRAP_invalid_tss;
2687
0
    struct vcpu *v = current;
2688
0
2689
0
    if ( eflags & X86_EFLAGS_VM )
2690
0
    {
2691
0
        segr.sel = sel;
2692
0
        segr.base = (uint32_t)sel << 4;
2693
0
        segr.limit = 0xffffu;
2694
0
        segr.attr = 0xf3;
2695
0
        hvm_set_segment_register(v, seg, &segr);
2696
0
        return 0;
2697
0
    }
2698
0
2699
0
    /* NULL selector? */
2700
0
    if ( (sel & 0xfffc) == 0 )
2701
0
    {
2702
0
        if ( (seg == x86_seg_cs) || (seg == x86_seg_ss) )
2703
0
            goto fail;
2704
0
        memset(&segr, 0, sizeof(segr));
2705
0
        segr.sel = sel;
2706
0
        hvm_set_segment_register(v, seg, &segr);
2707
0
        return 0;
2708
0
    }
2709
0
2710
0
    /* LDT descriptor must be in the GDT. */
2711
0
    if ( (seg == x86_seg_ldtr) && (sel & 4) )
2712
0
        goto fail;
2713
0
2714
0
    hvm_get_segment_register(
2715
0
        v, (sel & 4) ? x86_seg_ldtr : x86_seg_gdtr, &desctab);
2716
0
2717
0
    /* Segment not valid for use (cooked meaning of .p)? */
2718
0
    if ( !desctab.p )
2719
0
        goto fail;
2720
0
2721
0
    /* Check against descriptor table limit. */
2722
0
    if ( ((sel & 0xfff8) + 7) > desctab.limit )
2723
0
        goto fail;
2724
0
2725
0
    pdesc = hvm_map_entry(desctab.base + (sel & 0xfff8), &writable);
2726
0
    if ( pdesc == NULL )
2727
0
        goto hvm_map_fail;
2728
0
2729
0
    do {
2730
0
        desc = *pdesc;
2731
0
2732
0
        /* LDT descriptor is a system segment. All others are code/data. */
2733
0
        if ( (desc.b & (1u<<12)) == ((seg == x86_seg_ldtr) << 12) )
2734
0
            goto unmap_and_fail;
2735
0
2736
0
        dpl = (desc.b >> 13) & 3;
2737
0
        rpl = sel & 3;
2738
0
2739
0
        switch ( seg )
2740
0
        {
2741
0
        case x86_seg_cs:
2742
0
            /* Code segment? */
2743
0
            if ( !(desc.b & _SEGMENT_CODE) )
2744
0
                goto unmap_and_fail;
2745
0
            /* Non-conforming segment: check DPL against RPL. */
2746
0
            if ( !(desc.b & _SEGMENT_EC) && (dpl != rpl) )
2747
0
                goto unmap_and_fail;
2748
0
            break;
2749
0
        case x86_seg_ss:
2750
0
            /* Writable data segment? */
2751
0
            if ( (desc.b & (_SEGMENT_CODE|_SEGMENT_WR)) != _SEGMENT_WR )
2752
0
                goto unmap_and_fail;
2753
0
            if ( (dpl != cpl) || (dpl != rpl) )
2754
0
                goto unmap_and_fail;
2755
0
            break;
2756
0
        case x86_seg_ldtr:
2757
0
            /* LDT system segment? */
2758
0
            if ( (desc.b & _SEGMENT_TYPE) != (2u<<8) )
2759
0
                goto unmap_and_fail;
2760
0
            goto skip_accessed_flag;
2761
0
        default:
2762
0
            /* Readable code or data segment? */
2763
0
            if ( (desc.b & (_SEGMENT_CODE|_SEGMENT_WR)) == _SEGMENT_CODE )
2764
0
                goto unmap_and_fail;
2765
0
            /*
2766
0
             * Data or non-conforming code segment:
2767
0
             * check DPL against RPL and CPL.
2768
0
             */
2769
0
            if ( ((desc.b & (_SEGMENT_EC|_SEGMENT_CODE)) !=
2770
0
                  (_SEGMENT_EC|_SEGMENT_CODE))
2771
0
                 && ((dpl < cpl) || (dpl < rpl)) )
2772
0
                goto unmap_and_fail;
2773
0
            break;
2774
0
        }
2775
0
2776
0
        /* Segment present in memory? */
2777
0
        if ( !(desc.b & _SEGMENT_P) )
2778
0
        {
2779
0
            fault_type = (seg != x86_seg_ss) ? TRAP_no_segment
2780
0
                                             : TRAP_stack_error;
2781
0
            goto unmap_and_fail;
2782
0
        }
2783
0
    } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
2784
0
              writable && /* except if we are to discard writes */
2785
0
              (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );
2786
0
2787
0
    /* Force the Accessed flag in our local copy. */
2788
0
    desc.b |= 0x100;
2789
0
2790
0
 skip_accessed_flag:
2791
0
    hvm_unmap_entry(pdesc);
2792
0
2793
0
    segr.base = (((desc.b <<  0) & 0xff000000u) |
2794
0
                 ((desc.b << 16) & 0x00ff0000u) |
2795
0
                 ((desc.a >> 16) & 0x0000ffffu));
2796
0
    segr.attr = (((desc.b >>  8) & 0x00ffu) |
2797
0
                 ((desc.b >> 12) & 0x0f00u));
2798
0
    segr.limit = (desc.b & 0x000f0000u) | (desc.a & 0x0000ffffu);
2799
0
    if ( segr.g )
2800
0
        segr.limit = (segr.limit << 12) | 0xfffu;
2801
0
    segr.sel = sel;
2802
0
    hvm_set_segment_register(v, seg, &segr);
2803
0
2804
0
    return 0;
2805
0
2806
0
 unmap_and_fail:
2807
0
    hvm_unmap_entry(pdesc);
2808
0
 fail:
2809
0
    hvm_inject_hw_exception(fault_type, sel & 0xfffc);
2810
0
 hvm_map_fail:
2811
0
    return 1;
2812
0
}
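/*
 * Standalone sketch (not part of hvm.c) of the descriptor unpacking done
 * above: 'a' is the low 32 bits of a GDT/LDT entry, 'b' the high 32 bits.
 * The example descriptor 0x00cf9a00:0000ffff is the usual flat 32-bit
 * code segment (base 0, 4GiB limit, attr 0xc9a in the packed format used
 * here).
 */
#include <stdint.h>
#include <stdio.h>

struct seg_fields { uint32_t base, limit; uint16_t attr; };

static struct seg_fields decode_desc(uint32_t a, uint32_t b)
{
    struct seg_fields s;

    s.base  = (b & 0xff000000u) |
              ((b << 16) & 0x00ff0000u) |
              ((a >> 16) & 0x0000ffffu);
    s.attr  = ((b >> 8) & 0x00ffu) | ((b >> 12) & 0x0f00u);
    s.limit = (b & 0x000f0000u) | (a & 0x0000ffffu);
    if ( s.attr & 0x800 )                /* G: limit is in 4k units */
        s.limit = (s.limit << 12) | 0xfffu;

    return s;
}

int main(void)
{
    struct seg_fields s = decode_desc(0x0000ffffu, 0x00cf9a00u);

    /* Prints: base=0 limit=0xffffffff attr=0xc9a */
    printf("base=%#x limit=%#x attr=%#x\n",
           (unsigned)s.base, (unsigned)s.limit, (unsigned)s.attr);
    return 0;
}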
2813
2814
struct tss32 {
2815
    uint16_t back_link, :16;
2816
    uint32_t esp0;
2817
    uint16_t ss0, :16;
2818
    uint32_t esp1;
2819
    uint16_t ss1, :16;
2820
    uint32_t esp2;
2821
    uint16_t ss2, :16;
2822
    uint32_t cr3, eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi;
2823
    uint16_t es, :16, cs, :16, ss, :16, ds, :16, fs, :16, gs, :16, ldt, :16;
2824
    uint16_t trace /* :1 */, iomap;
2825
};
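/*
 * Layout cross-check for struct tss32 above (illustrative, not in hvm.c;
 * uses C11 _Static_assert and assumes the usual x86 ABI, where each
 * unnamed :16 bit-field pads its 16-bit neighbour out to 32 bits).  These
 * are the offsets hvm_prepare_vm86_tss() and hvm_task_switch() below rely
 * on.
 */
#include <stddef.h>

_Static_assert(sizeof(struct tss32) == 104, "TSS is 104 bytes");
_Static_assert(offsetof(struct tss32, eip) == 32, "register block at 0x20");
_Static_assert(offsetof(struct tss32, trace) == 100, "trace word at 0x64");
_Static_assert(offsetof(struct tss32, iomap) == 102, "I/O map base at 0x66");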
2826
2827
void hvm_prepare_vm86_tss(struct vcpu *v, uint32_t base, uint32_t limit)
2828
0
{
2829
0
    /*
2830
0
     * If the provided area is large enough to cover at least the ISA port
2831
0
     * range, keep the bitmaps outside the base structure. For rather small
2832
0
     * areas (namely relevant for guests having been migrated from older
2833
0
     * Xen versions), maximize interrupt vector and port coverage by pointing
2834
0
     * the I/O bitmap at 0x20 (which puts the interrupt redirection bitmap
2835
0
     * right at zero), accepting accesses to port 0x235 (represented by bit 5
2836
0
     * of byte 0x46) to trigger #GP (which will simply result in the access
2837
0
     * being handled by the emulator via a slightly different path than it
2838
0
     * would be anyway). Be sure to include one extra byte at the end of the
2839
0
     * I/O bitmap (hence the missing "- 1" in the comparison is not an
2840
0
     * off-by-one mistake), which we deliberately don't fill with all ones.
2841
0
     */
2842
0
    uint16_t iomap = (limit >= sizeof(struct tss32) + (0x100 / 8) + (0x400 / 8)
2843
0
                      ? sizeof(struct tss32) : 0) + (0x100 / 8);
2844
0
2845
0
    ASSERT(limit >= sizeof(struct tss32) - 1);
2846
0
    /*
2847
0
     * Strictly speaking we'd have to use hvm_copy_to_guest_linear() below,
2848
0
     * but since the guest is (supposed to be, unless it corrupts that setup
2849
0
     * itself, which would harm only itself) running on an identmap, we can
2850
0
     * use the less overhead variant below, which also allows passing a vCPU
2851
0
     * argument.
2852
0
     */
2853
0
    hvm_copy_to_guest_phys(base, NULL, limit + 1, v);
2854
0
    hvm_copy_to_guest_phys(base + offsetof(struct tss32, iomap),
2855
0
                           &iomap, sizeof(iomap), v);
2856
0
}
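/*
 * Worked example (standalone, not part of hvm.c) of the iomap placement
 * computed above, assuming sizeof(struct tss32) == 104 as laid out
 * earlier: the threshold is 104 + 0x100/8 + 0x400/8 == 264 bytes.
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t vm86_tss_iomap(uint32_t limit, uint32_t tss_size)
{
    return (limit >= tss_size + (0x100 / 8) + (0x400 / 8)
            ? tss_size : 0) + (0x100 / 8);
}

int main(void)
{
    /* Large area: iomap = 136, so the 32-byte interrupt redirection
     * bitmap (136 - 32 = 104) sits right after the TSS proper. */
    printf("%u\n", (unsigned)vm86_tss_iomap(0x400, 104));

    /* Small legacy area: iomap = 32 (0x20), putting the redirection
     * bitmap at offset zero, exactly as the comment above describes. */
    printf("%u\n", (unsigned)vm86_tss_iomap(0x7f, 104));
    return 0;
}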
2857
2858
void hvm_task_switch(
2859
    uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason,
2860
    int32_t errcode)
2861
0
{
2862
0
    struct vcpu *v = current;
2863
0
    struct cpu_user_regs *regs = guest_cpu_user_regs();
2864
0
    struct segment_register gdt, tr, prev_tr, segr;
2865
0
    struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc;
2866
0
    bool_t otd_writable, ntd_writable;
2867
0
    unsigned int eflags, new_cpl;
2868
0
    pagefault_info_t pfinfo;
2869
0
    int exn_raised, rc;
2870
0
    struct tss32 tss;
2871
0
2872
0
    hvm_get_segment_register(v, x86_seg_gdtr, &gdt);
2873
0
    hvm_get_segment_register(v, x86_seg_tr, &prev_tr);
2874
0
2875
0
    if ( ((tss_sel & 0xfff8) + 7) > gdt.limit )
2876
0
    {
2877
0
        hvm_inject_hw_exception((taskswitch_reason == TSW_iret) ?
2878
0
                             TRAP_invalid_tss : TRAP_gp_fault,
2879
0
                             tss_sel & 0xfff8);
2880
0
        goto out;
2881
0
    }
2882
0
2883
0
    optss_desc = hvm_map_entry(gdt.base + (prev_tr.sel & 0xfff8),
2884
0
                               &otd_writable);
2885
0
    if ( optss_desc == NULL )
2886
0
        goto out;
2887
0
2888
0
    nptss_desc = hvm_map_entry(gdt.base + (tss_sel & 0xfff8), &ntd_writable);
2889
0
    if ( nptss_desc == NULL )
2890
0
        goto out;
2891
0
2892
0
    tss_desc = *nptss_desc;
2893
0
    tr.sel = tss_sel;
2894
0
    tr.base = (((tss_desc.b <<  0) & 0xff000000u) |
2895
0
               ((tss_desc.b << 16) & 0x00ff0000u) |
2896
0
               ((tss_desc.a >> 16) & 0x0000ffffu));
2897
0
    tr.attr = (((tss_desc.b >>  8) & 0x00ffu) |
2898
0
               ((tss_desc.b >> 12) & 0x0f00u));
2899
0
    tr.limit = (tss_desc.b & 0x000f0000u) | (tss_desc.a & 0x0000ffffu);
2900
0
    if ( tr.g )
2901
0
        tr.limit = (tr.limit << 12) | 0xfffu;
2902
0
2903
0
    if ( tr.type != ((taskswitch_reason == TSW_iret) ? 0xb : 0x9) )
2904
0
    {
2905
0
        hvm_inject_hw_exception(
2906
0
            (taskswitch_reason == TSW_iret) ? TRAP_invalid_tss : TRAP_gp_fault,
2907
0
            tss_sel & 0xfff8);
2908
0
        goto out;
2909
0
    }
2910
0
2911
0
    if ( !tr.p )
2912
0
    {
2913
0
        hvm_inject_hw_exception(TRAP_no_segment, tss_sel & 0xfff8);
2914
0
        goto out;
2915
0
    }
2916
0
2917
0
    if ( tr.limit < (sizeof(tss)-1) )
2918
0
    {
2919
0
        hvm_inject_hw_exception(TRAP_invalid_tss, tss_sel & 0xfff8);
2920
0
        goto out;
2921
0
    }
2922
0
2923
0
    rc = hvm_copy_from_guest_linear(
2924
0
        &tss, prev_tr.base, sizeof(tss), PFEC_page_present, &pfinfo);
2925
0
    if ( rc == HVMTRANS_bad_linear_to_gfn )
2926
0
        hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
2927
0
    if ( rc != HVMTRANS_okay )
2928
0
        goto out;
2929
0
2930
0
    eflags = regs->eflags;
2931
0
    if ( taskswitch_reason == TSW_iret )
2932
0
        eflags &= ~X86_EFLAGS_NT;
2933
0
2934
0
    tss.eip    = regs->eip;
2935
0
    tss.eflags = eflags;
2936
0
    tss.eax    = regs->eax;
2937
0
    tss.ecx    = regs->ecx;
2938
0
    tss.edx    = regs->edx;
2939
0
    tss.ebx    = regs->ebx;
2940
0
    tss.esp    = regs->esp;
2941
0
    tss.ebp    = regs->ebp;
2942
0
    tss.esi    = regs->esi;
2943
0
    tss.edi    = regs->edi;
2944
0
2945
0
    hvm_get_segment_register(v, x86_seg_es, &segr);
2946
0
    tss.es = segr.sel;
2947
0
    hvm_get_segment_register(v, x86_seg_cs, &segr);
2948
0
    tss.cs = segr.sel;
2949
0
    hvm_get_segment_register(v, x86_seg_ss, &segr);
2950
0
    tss.ss = segr.sel;
2951
0
    hvm_get_segment_register(v, x86_seg_ds, &segr);
2952
0
    tss.ds = segr.sel;
2953
0
    hvm_get_segment_register(v, x86_seg_fs, &segr);
2954
0
    tss.fs = segr.sel;
2955
0
    hvm_get_segment_register(v, x86_seg_gs, &segr);
2956
0
    tss.gs = segr.sel;
2957
0
    hvm_get_segment_register(v, x86_seg_ldtr, &segr);
2958
0
    tss.ldt = segr.sel;
2959
0
2960
0
    rc = hvm_copy_to_guest_linear(prev_tr.base + offsetof(typeof(tss), eip),
2961
0
                                  &tss.eip,
2962
0
                                  offsetof(typeof(tss), trace) -
2963
0
                                  offsetof(typeof(tss), eip),
2964
0
                                  PFEC_page_present, &pfinfo);
2965
0
    if ( rc == HVMTRANS_bad_linear_to_gfn )
2966
0
        hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
2967
0
    if ( rc != HVMTRANS_okay )
2968
0
        goto out;
2969
0
2970
0
    rc = hvm_copy_from_guest_linear(
2971
0
        &tss, tr.base, sizeof(tss), PFEC_page_present, &pfinfo);
2972
0
    if ( rc == HVMTRANS_bad_linear_to_gfn )
2973
0
        hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
2974
0
    /*
2975
0
     * Note: The HVMTRANS_gfn_shared case could be optimised, if the callee
2976
0
     * functions knew we want RO access.
2977
0
     */
2978
0
    if ( rc != HVMTRANS_okay )
2979
0
        goto out;
2980
0
2981
0
    new_cpl = tss.eflags & X86_EFLAGS_VM ? 3 : tss.cs & 3;
2982
0
2983
0
    if ( hvm_load_segment_selector(x86_seg_ldtr, tss.ldt, new_cpl, 0) )
2984
0
        goto out;
2985
0
2986
0
    rc = hvm_set_cr3(tss.cr3, 1);
2987
0
    if ( rc == X86EMUL_EXCEPTION )
2988
0
        hvm_inject_hw_exception(TRAP_gp_fault, 0);
2989
0
    if ( rc != X86EMUL_OKAY )
2990
0
        goto out;
2991
0
2992
0
    regs->rip    = tss.eip;
2993
0
    regs->rflags = tss.eflags | X86_EFLAGS_MBS;
2994
0
    regs->rax    = tss.eax;
2995
0
    regs->rcx    = tss.ecx;
2996
0
    regs->rdx    = tss.edx;
2997
0
    regs->rbx    = tss.ebx;
2998
0
    regs->rsp    = tss.esp;
2999
0
    regs->rbp    = tss.ebp;
3000
0
    regs->rsi    = tss.esi;
3001
0
    regs->rdi    = tss.edi;
3002
0
3003
0
    exn_raised = 0;
3004
0
    if ( hvm_load_segment_selector(x86_seg_es, tss.es, new_cpl, tss.eflags) ||
3005
0
         hvm_load_segment_selector(x86_seg_cs, tss.cs, new_cpl, tss.eflags) ||
3006
0
         hvm_load_segment_selector(x86_seg_ss, tss.ss, new_cpl, tss.eflags) ||
3007
0
         hvm_load_segment_selector(x86_seg_ds, tss.ds, new_cpl, tss.eflags) ||
3008
0
         hvm_load_segment_selector(x86_seg_fs, tss.fs, new_cpl, tss.eflags) ||
3009
0
         hvm_load_segment_selector(x86_seg_gs, tss.gs, new_cpl, tss.eflags) )
3010
0
        exn_raised = 1;
3011
0
3012
0
    if ( taskswitch_reason == TSW_call_or_int )
3013
0
    {
3014
0
        regs->eflags |= X86_EFLAGS_NT;
3015
0
        tss.back_link = prev_tr.sel;
3016
0
3017
0
        rc = hvm_copy_to_guest_linear(tr.base + offsetof(typeof(tss), back_link),
3018
0
                                      &tss.back_link, sizeof(tss.back_link), 0,
3019
0
                                      &pfinfo);
3020
0
        if ( rc == HVMTRANS_bad_linear_to_gfn )
3021
0
        {
3022
0
            hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
3023
0
            exn_raised = 1;
3024
0
        }
3025
0
        else if ( rc != HVMTRANS_okay )
3026
0
            goto out;
3027
0
    }
3028
0
3029
0
    tr.type = 0xb; /* busy 32-bit tss */
3030
0
    hvm_set_segment_register(v, x86_seg_tr, &tr);
3031
0
3032
0
    v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_TS;
3033
0
    hvm_update_guest_cr(v, 0);
3034
0
3035
0
    if ( (taskswitch_reason == TSW_iret ||
3036
0
          taskswitch_reason == TSW_jmp) && otd_writable )
3037
0
        clear_bit(41, optss_desc); /* clear B flag of old task */
3038
0
3039
0
    if ( taskswitch_reason != TSW_iret && ntd_writable )
3040
0
        set_bit(41, nptss_desc); /* set B flag of new task */
3041
0
3042
0
    if ( errcode >= 0 )
3043
0
    {
3044
0
        struct segment_register cs;
3045
0
        unsigned long linear_addr;
3046
0
        unsigned int opsz, sp;
3047
0
3048
0
        hvm_get_segment_register(v, x86_seg_cs, &cs);
3049
0
        opsz = cs.db ? 4 : 2;
3050
0
        hvm_get_segment_register(v, x86_seg_ss, &segr);
3051
0
        if ( segr.db )
3052
0
            sp = regs->esp -= opsz;
3053
0
        else
3054
0
            sp = regs->sp -= opsz;
3055
0
        if ( hvm_virtual_to_linear_addr(x86_seg_ss, &segr, sp, opsz,
3056
0
                                        hvm_access_write,
3057
0
                                        &cs, &linear_addr) )
3058
0
        {
3059
0
            rc = hvm_copy_to_guest_linear(linear_addr, &errcode, opsz, 0,
3060
0
                                          &pfinfo);
3061
0
            if ( rc == HVMTRANS_bad_linear_to_gfn )
3062
0
            {
3063
0
                hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
3064
0
                exn_raised = 1;
3065
0
            }
3066
0
            else if ( rc != HVMTRANS_okay )
3067
0
                goto out;
3068
0
        }
3069
0
    }
3070
0
3071
0
    if ( (tss.trace & 1) && !exn_raised )
3072
0
        hvm_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
3073
0
3074
0
 out:
3075
0
    hvm_unmap_entry(optss_desc);
3076
0
    hvm_unmap_entry(nptss_desc);
3077
0
}
3078
3079
enum hvm_translation_result hvm_translate_get_page(
3080
    struct vcpu *v, unsigned long addr, bool linear, uint32_t pfec,
3081
    pagefault_info_t *pfinfo, struct page_info **page_p,
3082
    gfn_t *gfn_p, p2m_type_t *p2mt_p)
3083
373k
{
3084
373k
    struct page_info *page;
3085
373k
    p2m_type_t p2mt;
3086
373k
    gfn_t gfn;
3087
373k
3088
373k
    if ( linear )
3089
364k
    {
3090
364k
        gfn = _gfn(paging_gva_to_gfn(v, addr, &pfec));
3091
364k
3092
364k
        if ( gfn_eq(gfn, INVALID_GFN) )
3093
0
        {
3094
0
            if ( pfec & PFEC_page_paged )
3095
0
                return HVMTRANS_gfn_paged_out;
3096
0
3097
0
            if ( pfec & PFEC_page_shared )
3098
0
                return HVMTRANS_gfn_shared;
3099
0
3100
0
            if ( pfinfo )
3101
0
            {
3102
0
                pfinfo->linear = addr;
3103
0
                pfinfo->ec = pfec & ~PFEC_implicit;
3104
0
            }
3105
0
3106
0
            return HVMTRANS_bad_linear_to_gfn;
3107
0
        }
3108
364k
    }
3109
373k
    else
3110
8.52k
    {
3111
8.52k
        gfn = gaddr_to_gfn(addr);
3112
8.52k
        ASSERT(!pfinfo);
3113
8.52k
    }
3114
373k
3115
373k
    /*
3116
373k
     * No need to do the P2M lookup for internally handled MMIO, benefiting
3117
373k
     * - 32-bit WinXP (& older Windows) on AMD CPUs for LAPIC accesses,
3118
373k
     * - newer Windows (like Server 2012) for HPET accesses.
3119
373k
     */
3120
373k
    if ( v == current
3121
365k
         && !nestedhvm_vcpu_in_guestmode(v)
3122
365k
         && hvm_mmio_internal(gfn_to_gaddr(gfn)) )
3123
0
        return HVMTRANS_bad_gfn_to_mfn;
3124
373k
3125
373k
    page = get_page_from_gfn(v->domain, gfn_x(gfn), &p2mt, P2M_UNSHARE);
3126
373k
3127
373k
    if ( !page )
3128
0
        return HVMTRANS_bad_gfn_to_mfn;
3129
373k
3130
373k
    if ( p2m_is_paging(p2mt) )
3131
0
    {
3132
0
        put_page(page);
3133
0
        p2m_mem_paging_populate(v->domain, gfn_x(gfn));
3134
0
        return HVMTRANS_gfn_paged_out;
3135
0
    }
3136
373k
    if ( p2m_is_shared(p2mt) )
3137
0
    {
3138
0
        put_page(page);
3139
0
        return HVMTRANS_gfn_shared;
3140
0
    }
3141
373k
    if ( p2m_is_grant(p2mt) )
3142
0
    {
3143
0
        put_page(page);
3144
0
        return HVMTRANS_unhandleable;
3145
0
    }
3146
373k
3147
373k
    *page_p = page;
3148
373k
    if ( gfn_p )
3149
375k
        *gfn_p = gfn;
3150
373k
    if ( p2mt_p )
3151
375k
        *p2mt_p = p2mt;
3152
373k
3153
373k
    return HVMTRANS_okay;
3154
373k
}
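
A hypothetical caller sketch (read_guest_byte is an illustrative name, not part of hvm.c) showing the intended use of hvm_translate_get_page() for a linear access, modelled on the map/copy/put pattern __hvm_copy() follows below:

static enum hvm_translation_result read_guest_byte(
    struct vcpu *v, unsigned long lin, uint8_t *out,
    pagefault_info_t *pfinfo)
{
    struct page_info *page;
    p2m_type_t p2mt;
    gfn_t gfn;
    enum hvm_translation_result res;
    char *p;

    res = hvm_translate_get_page(v, lin, true, PFEC_page_present,
                                 pfinfo, &page, &gfn, &p2mt);
    if ( res != HVMTRANS_okay )
        return res;

    /* Map the backing page, copy from the in-page offset, drop the ref. */
    p = __map_domain_page(page);
    *out = p[lin & ~PAGE_MASK];
    unmap_domain_page(p);
    put_page(page);

    return HVMTRANS_okay;
}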
3155
3156
363k
#define HVMCOPY_from_guest (0u<<0)
3157
373k
#define HVMCOPY_to_guest   (1u<<0)
3158
23
#define HVMCOPY_phys       (0u<<2)
3159
737k
#define HVMCOPY_linear     (1u<<2)
3160
static enum hvm_translation_result __hvm_copy(
3161
    void *buf, paddr_t addr, int size, struct vcpu *v, unsigned int flags,
3162
    uint32_t pfec, pagefault_info_t *pfinfo)
3163
364k
{
3164
364k
    gfn_t gfn;
3165
364k
    struct page_info *page;
3166
364k
    p2m_type_t p2mt;
3167
364k
    char *p;
3168
364k
    int count, todo = size;
3169
364k
3170
364k
    ASSERT(is_hvm_vcpu(v));
3171
364k
3172
364k
    /*
3173
364k
     * XXX Disable for 4.1.0: PV-on-HVM drivers will do grant-table ops
3174
364k
     * such as query_size. Grant-table code currently does copy_to/from_guest
3175
364k
     * accesses under the big per-domain lock, which this test would disallow.
3176
364k
     * The test is not needed until we implement sleeping-on-waitqueue when
3177
364k
     * we access a paged-out frame, and that's post 4.1.0 now.
3178
364k
     */
3179
364k
#if 0
3180
    /*
3181
     * If the required guest memory is paged out, this function may sleep.
3182
     * Hence we bail immediately if called from atomic context.
3183
     */
3184
    if ( in_atomic() )
3185
        return HVMTRANS_unhandleable;
3186
#endif
3187
364k
3188
737k
    while ( todo > 0 )
3189
373k
    {
3190
373k
        enum hvm_translation_result res;
3191
373k
        paddr_t gpa = addr & ~PAGE_MASK;
3192
373k
3193
373k
        count = min_t(int, PAGE_SIZE - gpa, todo);
3194
373k
3195
373k
        res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear,
3196
373k
                                     pfec, pfinfo, &page, &gfn, &p2mt);
3197
373k
        if ( res != HVMTRANS_okay )
3198
0
            return res;
3199
373k
3200
373k
        p = (char *)__map_domain_page(page) + (addr & ~PAGE_MASK);
3201
373k
3202
373k
        if ( flags & HVMCOPY_to_guest )
3203
9.12k
        {
3204
9.12k
            if ( p2m_is_discard_write(p2mt) )
3205
0
            {
3206
0
                static unsigned long lastpage;
3207
0
3208
0
                if ( xchg(&lastpage, gfn_x(gfn)) != gfn_x(gfn) )
3209
0
                    dprintk(XENLOG_G_DEBUG,
3210
0
                            "%pv attempted write to read-only gfn %#lx (mfn=%#lx)\n",
3211
0
                            v, gfn_x(gfn), page_to_mfn(page));
3212
0
            }
3213
9.12k
            else
3214
9.12k
            {
3215
9.12k
                if ( buf )
3216
7.80k
                    memcpy(p, buf, count);
3217
9.12k
                else
3218
1.31k
                    memset(p, 0, count);
3219
9.12k
                paging_mark_dirty(v->domain, _mfn(page_to_mfn(page)));
3220
9.12k
            }
3221
9.12k
        }
3222
373k
        else
3223
364k
        {
3224
364k
            memcpy(buf, p, count);
3225
364k
        }
3226
373k
3227
373k
        unmap_domain_page(p);
3228
373k
3229
373k
        addr += count;
3230
373k
        if ( buf )
3231
373k
            buf += count;
3232
373k
        todo -= count;
3233
373k
        put_page(page);
3234
373k
    }
3235
364k
3236
364k
    return HVMTRANS_okay;
3237
364k
}
3238
3239
enum hvm_translation_result hvm_copy_to_guest_phys(
3240
    paddr_t paddr, void *buf, int size, struct vcpu *v)
3241
23
{
3242
23
    return __hvm_copy(buf, paddr, size, v,
3243
23
                      HVMCOPY_to_guest | HVMCOPY_phys, 0, NULL);
3244
23
}
3245
3246
enum hvm_translation_result hvm_copy_from_guest_phys(
3247
    void *buf, paddr_t paddr, int size)
3248
0
{
3249
0
    return __hvm_copy(buf, paddr, size, current,
3250
0
                      HVMCOPY_from_guest | HVMCOPY_phys, 0, NULL);
3251
0
}
3252
3253
enum hvm_translation_result hvm_copy_to_guest_linear(
3254
    unsigned long addr, void *buf, int size, uint32_t pfec,
3255
    pagefault_info_t *pfinfo)
3256
157
{
3257
157
    return __hvm_copy(buf, addr, size, current,
3258
157
                      HVMCOPY_to_guest | HVMCOPY_linear,
3259
157
                      PFEC_page_present | PFEC_write_access | pfec, pfinfo);
3260
157
}
3261
3262
enum hvm_translation_result hvm_copy_from_guest_linear(
3263
    void *buf, unsigned long addr, int size, uint32_t pfec,
3264
    pagefault_info_t *pfinfo)
3265
303k
{
3266
303k
    return __hvm_copy(buf, addr, size, current,
3267
303k
                      HVMCOPY_from_guest | HVMCOPY_linear,
3268
303k
                      PFEC_page_present | pfec, pfinfo);
3269
303k
}
3270
3271
enum hvm_translation_result hvm_fetch_from_guest_linear(
3272
    void *buf, unsigned long addr, int size, uint32_t pfec,
3273
    pagefault_info_t *pfinfo)
3274
60.1k
{
3275
60.1k
    return __hvm_copy(buf, addr, size, current,
3276
60.1k
                      HVMCOPY_from_guest | HVMCOPY_linear,
3277
60.1k
                      PFEC_page_present | PFEC_insn_fetch | pfec, pfinfo);
3278
60.1k
}
3279
3280
unsigned long copy_to_user_hvm(void *to, const void *from, unsigned int len)
3281
157
{
3282
157
    int rc;
3283
157
3284
157
    if ( current->hcall_compat && is_compat_arg_xlat_range(to, len) )
3285
0
    {
3286
0
        memcpy(to, from, len);
3287
0
        return 0;
3288
0
    }
3289
157
3290
157
    rc = hvm_copy_to_guest_linear((unsigned long)to, (void *)from, len, 0, NULL);
3291
157
    return rc ? len : 0; /* fake a copy_to_user() return code */
3292
157
}
3293
3294
unsigned long clear_user_hvm(void *to, unsigned int len)
3295
0
{
3296
0
    int rc;
3297
0
3298
0
    if ( current->hcall_compat && is_compat_arg_xlat_range(to, len) )
3299
0
    {
3300
0
        memset(to, 0x00, len);
3301
0
        return 0;
3302
0
    }
3303
0
3304
0
    rc = hvm_copy_to_guest_linear((unsigned long)to, NULL, len, 0, NULL);
3305
0
    return rc ? len : 0; /* fake a copy_to_user() return code */
3306
0
}
3307
3308
unsigned long copy_from_user_hvm(void *to, const void *from, unsigned len)
3309
304k
{
3310
304k
    int rc;
3311
304k
3312
304k
    if ( current->hcall_compat && is_compat_arg_xlat_range(from, len) )
3313
0
    {
3314
0
        memcpy(to, from, len);
3315
0
        return 0;
3316
0
    }
3317
304k
3318
304k
    rc = hvm_copy_from_guest_linear(to, (unsigned long)from, len, 0, NULL);
3319
304k
    return rc ? len : 0; /* fake a copy_from_user() return code */
3320
304k
}
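
As the comments note, these wrappers only fake a copy_*_user() return code: they return 0 on success and the full length on any failure, never a partial byte count. A hypothetical caller sketch (fetch_guest_u32 is an illustrative name):

static int fetch_guest_u32(const void *guest_ptr, uint32_t *val)
{
    /* Any non-zero return means the access is treated as a full fault. */
    return copy_from_user_hvm(val, guest_ptr, sizeof(*val)) ? -EFAULT : 0;
}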
3321
3322
bool hvm_check_cpuid_faulting(struct vcpu *v)
3323
2.66k
{
3324
2.66k
    const struct msr_vcpu_policy *vp = v->arch.msr;
3325
2.66k
3326
2.66k
    if ( !vp->misc_features_enables.cpuid_faulting )
3327
2.66k
        return false;
3328
2.66k
3329
0
    return hvm_get_cpl(v) > 0;
3330
2.66k
}
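
A sketch of how a CPUID intercept path would be expected to consume this predicate (maybe_fault_cpuid is a hypothetical helper, not the vendor exit handlers themselves): with CPUID faulting enabled and the guest above ring 0, CPUID has to raise #GP(0) instead of being emulated.

static bool maybe_fault_cpuid(struct vcpu *v)
{
    if ( !hvm_check_cpuid_faulting(v) )
        return false;

    /* CPL > 0 with CPUID faulting enabled by the guest: fault, don't emulate. */
    hvm_inject_hw_exception(TRAP_gp_fault, 0);
    return true;
}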
3331
3332
static uint64_t _hvm_rdtsc_intercept(void)
3333
0
{
3334
0
    struct vcpu *curr = current;
3335
0
#if !defined(NDEBUG) || defined(CONFIG_PERF_COUNTERS)
3336
0
    struct domain *currd = curr->domain;
3337
0
3338
0
    if ( currd->arch.vtsc )
3339
0
        switch ( hvm_guest_x86_mode(curr) )
3340
0
        {
3341
0
        case 8:
3342
0
        case 4:
3343
0
        case 2:
3344
0
            if ( unlikely(hvm_get_cpl(curr)) )
3345
0
            {
3346
0
        case 1:
3347
0
                currd->arch.vtsc_usercount++;
3348
0
                break;
3349
0
            }
3350
0
            /* fall through */
3351
0
        case 0:
3352
0
            currd->arch.vtsc_kerncount++;
3353
0
            break;
3354
0
        }
3355
0
#endif
3356
0
3357
0
    return hvm_get_guest_tsc(curr);
3358
0
}
3359
3360
void hvm_rdtsc_intercept(struct cpu_user_regs *regs)
3361
0
{
3362
0
    msr_split(regs, _hvm_rdtsc_intercept());
3363
0
3364
0
    HVMTRACE_2D(RDTSC, regs->eax, regs->edx);
3365
0
}
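
msr_split() stores the 64-bit result using the RDTSC/RDMSR convention of low half in eax, high half in edx; conceptually it amounts to the following (a sketch of the convention, not the actual helper from Xen's headers):

static void split_into_edx_eax(struct cpu_user_regs *regs, uint64_t val)
{
    regs->rax = (uint32_t)val;          /* low 32 bits -> eax */
    regs->rdx = (uint32_t)(val >> 32);  /* high 32 bits -> edx */
}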
3366
3367
int hvm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
3368
271
{
3369
271
    struct vcpu *v = current;
3370
271
    struct domain *d = v->domain;
3371
271
    uint64_t *var_range_base, *fixed_range_base;
3372
271
    int ret;
3373
271
3374
271
    var_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.var_ranges;
3375
271
    fixed_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.fixed_ranges;
3376
271
3377
271
    if ( (ret = guest_rdmsr(v, msr, msr_content)) != X86EMUL_UNHANDLEABLE )
3378
0
        return ret;
3379
271
3380
271
    ret = X86EMUL_OKAY;
3381
271
3382
271
    switch ( msr )
3383
271
    {
3384
0
        unsigned int index;
3385
0
3386
36
    case MSR_EFER:
3387
36
        *msr_content = v->arch.hvm_vcpu.guest_efer;
3388
36
        break;
3389
0
3390
0
    case MSR_IA32_TSC:
3391
0
        *msr_content = _hvm_rdtsc_intercept();
3392
0
        break;
3393
0
3394
0
    case MSR_IA32_TSC_ADJUST:
3395
0
        *msr_content = v->arch.hvm_vcpu.msr_tsc_adjust;
3396
0
        break;
3397
0
3398
0
    case MSR_TSC_AUX:
3399
0
        *msr_content = hvm_msr_tsc_aux(v);
3400
0
        break;
3401
0
3402
10
    case MSR_IA32_APICBASE:
3403
10
        *msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
3404
10
        break;
3405
0
3406
0
    case MSR_IA32_APICBASE_MSR ... MSR_IA32_APICBASE_MSR + 0x3ff:
3407
0
        if ( hvm_x2apic_msr_read(v, msr, msr_content) )
3408
0
            goto gp_fault;
3409
0
        break;
3410
0
3411
0
    case MSR_IA32_TSC_DEADLINE:
3412
0
        *msr_content = vlapic_tdt_msr_get(vcpu_vlapic(v));
3413
0
        break;
3414
0
3415
0
    case MSR_IA32_CR_PAT:
3416
0
        hvm_get_guest_pat(v, msr_content);
3417
0
        break;
3418
0
3419
1
    case MSR_MTRRcap:
3420
1
        if ( !d->arch.cpuid->basic.mtrr )
3421
0
            goto gp_fault;
3422
1
        *msr_content = v->arch.hvm_vcpu.mtrr.mtrr_cap;
3423
1
        break;
3424
24
    case MSR_MTRRdefType:
3425
24
        if ( !d->arch.cpuid->basic.mtrr )
3426
0
            goto gp_fault;
3427
24
        *msr_content = v->arch.hvm_vcpu.mtrr.def_type
3428
24
                        | (v->arch.hvm_vcpu.mtrr.enabled << 10);
3429
24
        break;
3430
0
    case MSR_MTRRfix64K_00000:
3431
0
        if ( !d->arch.cpuid->basic.mtrr )
3432
0
            goto gp_fault;
3433
0
        *msr_content = fixed_range_base[0];
3434
0
        break;
3435
0
    case MSR_MTRRfix16K_80000:
3436
0
    case MSR_MTRRfix16K_A0000:
3437
0
        if ( !d->arch.cpuid->basic.mtrr )
3438
0
            goto gp_fault;
3439
0
        index = msr - MSR_MTRRfix16K_80000;
3440
0
        *msr_content = fixed_range_base[index + 1];
3441
0
        break;
3442
0
    case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
3443
0
        if ( !d->arch.cpuid->basic.mtrr )
3444
0
            goto gp_fault;
3445
0
        index = msr - MSR_MTRRfix4K_C0000;
3446
0
        *msr_content = fixed_range_base[index + 3];
3447
0
        break;
3448
104
    case MSR_IA32_MTRR_PHYSBASE(0)...MSR_IA32_MTRR_PHYSMASK(MTRR_VCNT-1):
3449
104
        if ( !d->arch.cpuid->basic.mtrr )
3450
0
            goto gp_fault;
3451
104
        index = msr - MSR_IA32_MTRR_PHYSBASE(0);
3452
104
        *msr_content = var_range_base[index];
3453
104
        break;
3454
104
3455
0
    case MSR_IA32_XSS:
3456
0
        if ( !d->arch.cpuid->xstate.xsaves )
3457
0
            goto gp_fault;
3458
0
        *msr_content = v->arch.hvm_vcpu.msr_xss;
3459
0
        break;
3460
0
3461
0
    case MSR_IA32_BNDCFGS:
3462
0
        if ( !d->arch.cpuid->feat.mpx ||
3463
0
             !hvm_get_guest_bndcfgs(v, msr_content) )
3464
0
            goto gp_fault;
3465
0
        break;
3466
0
3467
0
    case MSR_K8_ENABLE_C1E:
3468
0
    case MSR_AMD64_NB_CFG:
3469
0
         /*
3470
0
          * These AMD-only registers may be accessed if this HVM guest
3471
0
          * has been migrated to an Intel host. This fixes a guest crash
3472
0
          * in this case.
3473
0
          */
3474
0
         *msr_content = 0;
3475
0
         break;
3476
0
3477
96
    default:
3478
96
        if ( (ret = vmce_rdmsr(msr, msr_content)) < 0 )
3479
0
            goto gp_fault;
3480
96
        /* If ret == 0 then this is not an MCE MSR, see other MSRs. */
3481
96
        ret = ((ret == 0)
3482
12
               ? hvm_funcs.msr_read_intercept(msr, msr_content)
3483
84
               : X86EMUL_OKAY);
3484
96
        break;
3485
271
    }
3486
271
3487
270
 out:
3488
270
    HVMTRACE_3D(MSR_READ, msr,
3489
270
                (uint32_t)*msr_content, (uint32_t)(*msr_content >> 32));
3490
270
    return ret;
3491
271
3492
0
 gp_fault:
3493
0
    ret = X86EMUL_EXCEPTION;
3494
0
    *msr_content = -1ull;
3495
0
    goto out;
3496
271
}
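
The fixed-range MTRR cases above view mtrr.fixed_ranges as eleven 64-bit MSR images (via the fixed_range_base cast at the top of the function); the index arithmetic in the case bodies resolves as:

    MSR_MTRRfix64K_00000                -> fixed_range_base[0]
    MSR_MTRRfix16K_80000 ... _A0000     -> fixed_range_base[1..2]   (index + 1)
    MSR_MTRRfix4K_C0000  ... _F8000     -> fixed_range_base[3..10]  (index + 3)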
3497
3498
int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content,
3499
                            bool_t may_defer)
3500
563
{
3501
563
    struct vcpu *v = current;
3502
563
    struct domain *d = v->domain;
3503
563
    int ret;
3504
563
3505
563
    HVMTRACE_3D(MSR_WRITE, msr,
3506
563
               (uint32_t)msr_content, (uint32_t)(msr_content >> 32));
3507
563
3508
565
    if ( may_defer && unlikely(monitored_msr(v->domain, msr)) )
3509
0
    {
3510
0
        ASSERT(v->arch.vm_event);
3511
0
3512
0
        /* The actual write will occur in hvm_do_resume() (if permitted). */
3513
0
        v->arch.vm_event->write_data.do_write.msr = 1;
3514
0
        v->arch.vm_event->write_data.msr = msr;
3515
0
        v->arch.vm_event->write_data.value = msr_content;
3516
0
3517
0
        hvm_monitor_msr(msr, msr_content);
3518
0
        return X86EMUL_OKAY;
3519
0
    }
3520
563
3521
563
    if ( (ret = guest_wrmsr(v, msr, msr_content)) != X86EMUL_UNHANDLEABLE )
3522
0
        return ret;
3523
563
3524
563
    ret = X86EMUL_OKAY;
3525
563
3526
563
    switch ( msr )
3527
563
    {
3528
0
        unsigned int index;
3529
0
3530
36
    case MSR_EFER:
3531
36
        if ( hvm_set_efer(msr_content) )
3532
0
           return X86EMUL_EXCEPTION;
3533
36
        break;
3534
36
3535
0
    case MSR_IA32_TSC:
3536
0
        hvm_set_guest_tsc_msr(v, msr_content);
3537
0
        break;
3538
36
3539
0
    case MSR_IA32_TSC_ADJUST:
3540
0
        hvm_set_guest_tsc_adjust(v, msr_content);
3541
0
        break;
3542
36
3543
0
    case MSR_TSC_AUX:
3544
0
        v->arch.hvm_vcpu.msr_tsc_aux = (uint32_t)msr_content;
3545
0
        if ( cpu_has_rdtscp
3546
0
             && (v->domain->arch.tsc_mode != TSC_MODE_PVRDTSCP) )
3547
0
            wrmsrl(MSR_TSC_AUX, (uint32_t)msr_content);
3548
0
        break;
3549
36
3550
9
    case MSR_IA32_APICBASE:
3551
9
        if ( !vlapic_msr_set(vcpu_vlapic(v), msr_content) )
3552
0
            goto gp_fault;
3553
9
        break;
3554
9
3555
0
    case MSR_IA32_TSC_DEADLINE:
3556
0
        vlapic_tdt_msr_set(vcpu_vlapic(v), msr_content);
3557
0
        break;
3558
9
3559
129
    case MSR_IA32_APICBASE_MSR ... MSR_IA32_APICBASE_MSR + 0x3ff:
3560
129
        if ( hvm_x2apic_msr_write(v, msr, msr_content) )
3561
0
            goto gp_fault;
3562
129
        break;
3563
129
3564
10
    case MSR_IA32_CR_PAT:
3565
10
        if ( !hvm_set_guest_pat(v, msr_content) )
3566
0
           goto gp_fault;
3567
10
        break;
3568
10
3569
0
    case MSR_MTRRcap:
3570
0
        goto gp_fault;
3571
10
3572
34
    case MSR_MTRRdefType:
3573
34
        if ( !d->arch.cpuid->basic.mtrr )
3574
0
            goto gp_fault;
3575
34
        if ( !mtrr_def_type_msr_set(v->domain, &v->arch.hvm_vcpu.mtrr,
3576
34
                                    msr_content) )
3577
0
           goto gp_fault;
3578
34
        break;
3579
0
    case MSR_MTRRfix64K_00000:
3580
0
        if ( !d->arch.cpuid->basic.mtrr )
3581
0
            goto gp_fault;
3582
0
        if ( !mtrr_fix_range_msr_set(v->domain, &v->arch.hvm_vcpu.mtrr, 0,
3583
0
                                     msr_content) )
3584
0
            goto gp_fault;
3585
0
        break;
3586
0
    case MSR_MTRRfix16K_80000:
3587
0
    case MSR_MTRRfix16K_A0000:
3588
0
        if ( !d->arch.cpuid->basic.mtrr )
3589
0
            goto gp_fault;
3590
0
        index = msr - MSR_MTRRfix16K_80000 + 1;
3591
0
        if ( !mtrr_fix_range_msr_set(v->domain, &v->arch.hvm_vcpu.mtrr,
3592
0
                                     index, msr_content) )
3593
0
            goto gp_fault;
3594
0
        break;
3595
0
    case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
3596
0
        if ( !d->arch.cpuid->basic.mtrr )
3597
0
            goto gp_fault;
3598
0
        index = msr - MSR_MTRRfix4K_C0000 + 3;
3599
0
        if ( !mtrr_fix_range_msr_set(v->domain, &v->arch.hvm_vcpu.mtrr,
3600
0
                                     index, msr_content) )
3601
0
            goto gp_fault;
3602
0
        break;
3603
176
    case MSR_IA32_MTRR_PHYSBASE(0)...MSR_IA32_MTRR_PHYSMASK(MTRR_VCNT-1):
3604
176
        if ( !d->arch.cpuid->basic.mtrr )
3605
0
            goto gp_fault;
3606
176
        if ( !mtrr_var_range_msr_set(v->domain, &v->arch.hvm_vcpu.mtrr,
3607
176
                                     msr, msr_content) )
3608
0
            goto gp_fault;
3609
176
        break;
3610
176
3611
0
    case MSR_IA32_XSS:
3612
0
        /* No XSS features currently supported for guests. */
3613
0
        if ( !d->arch.cpuid->xstate.xsaves || msr_content != 0 )
3614
0
            goto gp_fault;
3615
0
        v->arch.hvm_vcpu.msr_xss = msr_content;
3616
0
        break;
3617
0
3618
0
    case MSR_IA32_BNDCFGS:
3619
0
        if ( !d->arch.cpuid->feat.mpx ||
3620
0
             !hvm_set_guest_bndcfgs(v, msr_content) )
3621
0
            goto gp_fault;
3622
0
        break;
3623
0
3624
0
    case MSR_AMD64_NB_CFG:
3625
0
        /* ignore the write */
3626
0
        break;
3627
0
3628
169
    default:
3629
169
        if ( (ret = vmce_wrmsr(msr, msr_content)) < 0 )
3630
0
            goto gp_fault;
3631
169
        /* If ret == 0 then this is not an MCE MSR, see other MSRs. */
3632
169
        ret = ((ret == 0)
3633
50
               ? hvm_funcs.msr_write_intercept(msr, msr_content)
3634
119
               : X86EMUL_OKAY);
3635
169
        break;
3636
563
    }
3637
563
3638
564
    return ret;
3639
563
3640
0
 gp_fault:
3641
0
    return X86EMUL_EXCEPTION;
3642
563
}
3643
3644
static bool is_sysdesc_access(const struct x86_emulate_state *state,
3645
                              const struct x86_emulate_ctxt *ctxt)
3646
0
{
3647
0
    unsigned int ext;
3648
0
    int mode = x86_insn_modrm(state, NULL, &ext);
3649
0
3650
0
    switch ( ctxt->opcode )
3651
0
    {
3652
0
    case X86EMUL_OPC(0x0f, 0x00):
3653
0
        if ( !(ext & 4) ) /* SLDT / STR / LLDT / LTR */
3654
0
            return true;
3655
0
        break;
3656
0
3657
0
    case X86EMUL_OPC(0x0f, 0x01):
3658
0
        if ( mode != 3 && !(ext & 4) ) /* SGDT / SIDT / LGDT / LIDT */
3659
0
            return true;
3660
0
        break;
3661
0
    }
3662
0
3663
0
    return false;
3664
0
}
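
The !(ext & 4) test relies on how the two opcode groups are laid out: ModRM reg/ext values 0-3 are exactly the descriptor-table instructions, values 4 and up are not. For reference (standard x86 opcode map):

    0F 00 /0../3     SLDT, STR, LLDT, LTR           -> treated as sysdesc access
    0F 00 /4../5     VERR, VERW                     -> not a sysdesc access
    0F 01 /0../3     SGDT, SIDT, LGDT, LIDT (mem)   -> treated as sysdesc access
    0F 01 /4, /6, /7 and the mod==3 forms           -> not a sysdesc access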
3665
3666
int hvm_descriptor_access_intercept(uint64_t exit_info,
3667
                                    uint64_t vmx_exit_qualification,
3668
                                    unsigned int descriptor, bool is_write)
3669
0
{
3670
0
    struct vcpu *curr = current;
3671
0
    struct domain *currd = curr->domain;
3672
0
3673
0
    if ( currd->arch.monitor.descriptor_access_enabled )
3674
0
    {
3675
0
        ASSERT(curr->arch.vm_event);
3676
0
        hvm_monitor_descriptor_access(exit_info, vmx_exit_qualification,
3677
0
                                      descriptor, is_write);
3678
0
    }
3679
0
    else if ( !hvm_emulate_one_insn(is_sysdesc_access, "sysdesc access") )
3680
0
        domain_crash(currd);
3681
0
3682
0
    return X86EMUL_OKAY;
3683
0
}
3684
3685
static bool is_cross_vendor(const struct x86_emulate_state *state,
3686
                            const struct x86_emulate_ctxt *ctxt)
3687
0
{
3688
0
    switch ( ctxt->opcode )
3689
0
    {
3690
0
    case X86EMUL_OPC(0x0f, 0x05): /* syscall */
3691
0
    case X86EMUL_OPC(0x0f, 0x34): /* sysenter */
3692
0
    case X86EMUL_OPC(0x0f, 0x35): /* sysexit */
3693
0
        return true;
3694
0
    }
3695
0
3696
0
    return false;
3697
0
}
3698
3699
void hvm_ud_intercept(struct cpu_user_regs *regs)
3700
0
{
3701
0
    struct vcpu *cur = current;
3702
0
    bool should_emulate =
3703
0
        cur->domain->arch.cpuid->x86_vendor != boot_cpu_data.x86_vendor;
3704
0
    struct hvm_emulate_ctxt ctxt;
3705
0
3706
0
    hvm_emulate_init_once(&ctxt, opt_hvm_fep ? NULL : is_cross_vendor, regs);
3707
0
3708
0
    if ( opt_hvm_fep )
3709
0
    {
3710
0
        const struct segment_register *cs = &ctxt.seg_reg[x86_seg_cs];
3711
0
        uint32_t walk = (ctxt.seg_reg[x86_seg_ss].dpl == 3)
3712
0
            ? PFEC_user_mode : 0;
3713
0
        unsigned long addr;
3714
0
        char sig[5]; /* ud2; .ascii "xen" */
3715
0
3716
0
        if ( hvm_virtual_to_linear_addr(x86_seg_cs, cs, regs->rip,
3717
0
                                        sizeof(sig), hvm_access_insn_fetch,
3718
0
                                        cs, &addr) &&
3719
0
             (hvm_fetch_from_guest_linear(sig, addr, sizeof(sig),
3720
0
                                          walk, NULL) == HVMTRANS_okay) &&
3721
0
             (memcmp(sig, "\xf\xbxen", sizeof(sig)) == 0) )
3722
0
        {
3723
0
            regs->rip += sizeof(sig);
3724
0
            regs->eflags &= ~X86_EFLAGS_RF;
3725
0
3726
0
            /* Zero the upper 32 bits of %rip if not in 64bit mode. */
3727
0
            if ( !(hvm_long_mode_active(cur) && cs->l) )
3728
0
                regs->rip = regs->eip;
3729
0
3730
0
            add_taint(TAINT_HVM_FEP);
3731
0
3732
0
            should_emulate = true;
3733
0
        }
3734
0
    }
3735
0
3736
0
    if ( !should_emulate )
3737
0
    {
3738
0
        hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
3739
0
        return;
3740
0
    }
3741
0
3742
0
    switch ( hvm_emulate_one(&ctxt) )
3743
0
    {
3744
0
    case X86EMUL_UNHANDLEABLE:
3745
0
    case X86EMUL_UNIMPLEMENTED:
3746
0
        hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
3747
0
        break;
3748
0
    case X86EMUL_EXCEPTION:
3749
0
        hvm_inject_event(&ctxt.ctxt.event);
3750
0
        /* fall through */
3751
0
    default:
3752
0
        hvm_emulate_writeback(&ctxt);
3753
0
        break;
3754
0
    }
3755
0
}
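
The 5-byte signature compared against "\xf\xbxen" above is ud2 (0x0f 0x0b) followed by the ASCII bytes 'x', 'e', 'n'. A test guest could emit the prefix like this (hypothetical guest-side snippet; it only takes effect when Xen was booted with hvm_fep):

static unsigned int forced_emulation_cpuid_eax(unsigned int leaf)
{
    unsigned int eax = leaf, ebx, ecx = 0, edx;

    /* 0x0f 0x0b = ud2, then the "xen" marker, then the instruction that
     * Xen should emulate instead of letting it execute natively. */
    asm volatile ( ".byte 0x0f, 0x0b; .ascii \"xen\"; cpuid"
                   : "+a" (eax), "=b" (ebx), "+c" (ecx), "=d" (edx) );

    return eax;
}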
3756
3757
enum hvm_intblk hvm_interrupt_blocked(struct vcpu *v, struct hvm_intack intack)
3758
2.04M
{
3759
2.04M
    unsigned long intr_shadow;
3760
2.04M
3761
2.04M
    ASSERT(v == current);
3762
2.04M
3763
2.04M
    if ( nestedhvm_enabled(v->domain) ) {
3764
0
        enum hvm_intblk intr;
3765
0
3766
0
        intr = nhvm_interrupt_blocked(v);
3767
0
        if ( intr != hvm_intblk_none )
3768
0
            return intr;
3769
0
    }
3770
2.04M
3771
2.04M
    if ( (intack.source != hvm_intsrc_nmi) &&
3772
2.03M
         !(guest_cpu_user_regs()->eflags & X86_EFLAGS_IF) )
3773
1.89M
        return hvm_intblk_rflags_ie;
3774
2.04M
3775
152k
    intr_shadow = hvm_funcs.get_interrupt_shadow(v);
3776
152k
3777
152k
    if ( intr_shadow & (HVM_INTR_SHADOW_STI|HVM_INTR_SHADOW_MOV_SS) )
3778
0
        return hvm_intblk_shadow;
3779
152k
3780
152k
    if ( intack.source == hvm_intsrc_nmi )
3781
0
        return ((intr_shadow & HVM_INTR_SHADOW_NMI) ?
3782
0
                hvm_intblk_nmi_iret : hvm_intblk_none);
3783
152k
3784
152k
    if ( intack.source == hvm_intsrc_lapic )
3785
4.07k
    {
3786
4.07k
        uint32_t tpr = vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xF0;
3787
4.07k
        if ( (tpr >> 4) >= (intack.vector >> 4) )
3788
0
            return hvm_intblk_tpr;
3789
4.07k
    }
3790
152k
3791
152k
    return hvm_intblk_none;
3792
152k
}
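
For the LAPIC case above the comparison works on 16-vector priority classes: with APIC_TASKPRI holding 0x50 the TPR class is 5, so a pending vector 0x3d (class 3) is reported as hvm_intblk_tpr, while vector 0x61 (class 6) is deliverable, since 5 >= 3 blocks but 5 >= 6 does not.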
3793
3794
static void hvm_latch_shinfo_size(struct domain *d)
3795
3
{
3796
3
    /*
3797
3
     * Called from operations which are among the very first executed by
3798
3
     * PV drivers on initialisation or after save/restore. These are sensible
3799
3
     * points at which to sample the execution mode of the guest and latch
3800
3
     * 32- or 64-bit format for shared state.
3801
3
     */
3802
3
    if ( current->domain == d )
3803
3
    {
3804
3
        d->arch.has_32bit_shinfo = (hvm_guest_x86_mode(current) != 8);
3805
3
        /*
3806
3
         * Make sure that the timebase in the shared info structure is correct.
3807
3
         *
3808
3
         * If the bit-ness changed we should arguably try to convert the other
3809
3
         * fields as well, but that's much more problematic (e.g. what do you
3810
3
         * do if you're going from 64 bit to 32 bit and there's an event
3811
3
         * channel pending which doesn't exist in the 32 bit version?).  Just
3812
3
         * setting the wallclock time seems to be sufficient for everything
3813
3
         * we do, even if it is a bit of a hack.
3814
3
         */
3815
3
        update_domain_wallclock_time(d);
3816
3
    }
3817
3
}
3818
3819
/* Initialise a hypercall transfer page for an HVM domain using
3820
   paravirtualised drivers. */
3821
void hvm_hypercall_page_initialise(struct domain *d,
3822
                                   void *hypercall_page)
3823
2
{
3824
2
    hvm_latch_shinfo_size(d);
3825
2
    hvm_funcs.init_hypercall_page(d, hypercall_page);
3826
2
}
3827
3828
void hvm_vcpu_reset_state(struct vcpu *v, uint16_t cs, uint16_t ip)
3829
22
{
3830
22
    struct domain *d = v->domain;
3831
22
    struct segment_register reg;
3832
22
    typeof(v->arch.xsave_area->fpu_sse) *fpu_ctxt = v->arch.fpu_ctxt;
3833
22
3834
22
    domain_lock(d);
3835
22
3836
22
    if ( v->is_initialised )
3837
11
        goto out;
3838
22
3839
11
    if ( !paging_mode_hap(d) )
3840
0
    {
3841
0
        if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
3842
0
            put_page(pagetable_get_page(v->arch.guest_table));
3843
0
        v->arch.guest_table = pagetable_null();
3844
0
    }
3845
11
3846
11
    memset(fpu_ctxt, 0, sizeof(*fpu_ctxt));
3847
11
    fpu_ctxt->fcw = FCW_RESET;
3848
11
    fpu_ctxt->mxcsr = MXCSR_DEFAULT;
3849
11
    if ( v->arch.xsave_area )
3850
11
    {
3851
11
        v->arch.xsave_area->xsave_hdr.xstate_bv = XSTATE_FP;
3852
11
        v->arch.xsave_area->xsave_hdr.xcomp_bv = 0;
3853
11
    }
3854
11
3855
11
    v->arch.vgc_flags = VGCF_online;
3856
11
    memset(&v->arch.user_regs, 0, sizeof(v->arch.user_regs));
3857
11
    v->arch.user_regs.rflags = X86_EFLAGS_MBS;
3858
11
    v->arch.user_regs.rdx = 0x00000f00;
3859
11
    v->arch.user_regs.rip = ip;
3860
11
    memset(&v->arch.debugreg, 0, sizeof(v->arch.debugreg));
3861
11
3862
11
    v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_ET;
3863
11
    hvm_update_guest_cr(v, 0);
3864
11
3865
11
    v->arch.hvm_vcpu.guest_cr[2] = 0;
3866
11
    hvm_update_guest_cr(v, 2);
3867
11
3868
11
    v->arch.hvm_vcpu.guest_cr[3] = 0;
3869
11
    hvm_update_guest_cr(v, 3);
3870
11
3871
11
    v->arch.hvm_vcpu.guest_cr[4] = 0;
3872
11
    hvm_update_guest_cr(v, 4);
3873
11
3874
11
    v->arch.hvm_vcpu.guest_efer = 0;
3875
11
    hvm_update_guest_efer(v);
3876
11
3877
11
    reg.sel = cs;
3878
11
    reg.base = (uint32_t)reg.sel << 4;
3879
11
    reg.limit = 0xffff;
3880
11
    reg.attr = 0x9b;
3881
11
    hvm_set_segment_register(v, x86_seg_cs, &reg);
3882
11
3883
11
    reg.sel = reg.base = 0;
3884
11
    reg.limit = 0xffff;
3885
11
    reg.attr = 0x93;
3886
11
    hvm_set_segment_register(v, x86_seg_ds, &reg);
3887
11
    hvm_set_segment_register(v, x86_seg_es, &reg);
3888
11
    hvm_set_segment_register(v, x86_seg_fs, &reg);
3889
11
    hvm_set_segment_register(v, x86_seg_gs, &reg);
3890
11
    hvm_set_segment_register(v, x86_seg_ss, &reg);
3891
11
3892
11
    reg.attr = 0x82; /* LDT */
3893
11
    hvm_set_segment_register(v, x86_seg_ldtr, &reg);
3894
11
3895
11
    reg.attr = 0x8b; /* 32-bit TSS (busy) */
3896
11
    hvm_set_segment_register(v, x86_seg_tr, &reg);
3897
11
3898
11
    reg.attr = 0;
3899
11
    hvm_set_segment_register(v, x86_seg_gdtr, &reg);
3900
11
    hvm_set_segment_register(v, x86_seg_idtr, &reg);
3901
11
3902
11
    if ( hvm_funcs.tsc_scaling.setup )
3903
11
        hvm_funcs.tsc_scaling.setup(v);
3904
11
3905
11
    /* Sync AP's TSC with BSP's. */
3906
11
    v->arch.hvm_vcpu.cache_tsc_offset =
3907
11
        v->domain->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset;
3908
11
    hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset,
3909
11
                             d->arch.hvm_domain.sync_tsc);
3910
11
3911
11
    v->arch.hvm_vcpu.msr_tsc_adjust = 0;
3912
11
3913
11
    paging_update_paging_modes(v);
3914
11
3915
11
    v->arch.flags |= TF_kernel_mode;
3916
11
    v->is_initialised = 1;
3917
11
    clear_bit(_VPF_down, &v->pause_flags);
3918
11
3919
22
 out:
3920
22
    domain_unlock(d);
3921
22
}
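
The segment setup above is plain real mode, with base = selector << 4. For the cs/ip pair 0xf000/0xfff0 that hvm_s3_suspend() passes below, that gives a code segment base of 0xf0000 and a first instruction fetch at linear 0xf0000 + 0xfff0 = 0xffff0, the legacy reset vector.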
3922
3923
static void hvm_s3_suspend(struct domain *d)
3924
0
{
3925
0
    struct vcpu *v;
3926
0
3927
0
    domain_pause(d);
3928
0
    domain_lock(d);
3929
0
3930
0
    if ( d->is_dying || (d->vcpu == NULL) || (d->vcpu[0] == NULL) ||
3931
0
         test_and_set_bool(d->arch.hvm_domain.is_s3_suspended) )
3932
0
    {
3933
0
        domain_unlock(d);
3934
0
        domain_unpause(d);
3935
0
        return;
3936
0
    }
3937
0
3938
0
    for_each_vcpu ( d, v )
3939
0
    {
3940
0
        int rc;
3941
0
3942
0
        vlapic_reset(vcpu_vlapic(v));
3943
0
        rc = vcpu_reset(v);
3944
0
        ASSERT(!rc);
3945
0
    }
3946
0
3947
0
    vpic_reset(d);
3948
0
    vioapic_reset(d);
3949
0
    pit_reset(d);
3950
0
    rtc_reset(d);
3951
0
    pmtimer_reset(d);
3952
0
    hpet_reset(d);
3953
0
3954
0
    hvm_vcpu_reset_state(d->vcpu[0], 0xf000, 0xfff0);
3955
0
3956
0
    domain_unlock(d);
3957
0
}
3958
3959
static void hvm_s3_resume(struct domain *d)
3960
0
{
3961
0
    if ( test_and_clear_bool(d->arch.hvm_domain.is_s3_suspended) )
3962
0
    {
3963
0
        struct vcpu *v;
3964
0
3965
0
        for_each_vcpu( d, v )
3966
0
            hvm_set_guest_tsc(v, 0);
3967
0
        domain_unpause(d);
3968
0
    }
3969
0
}
3970
3971
static int hvmop_flush_tlb_all(void)
3972
0
{
3973
0
    struct domain *d = current->domain;
3974
0
    struct vcpu *v;
3975
0
3976
0
    if ( !is_hvm_domain(d) )
3977
0
        return -EINVAL;
3978
0
3979
0
    /* Avoid deadlock if more than one vcpu tries this at the same time. */
3980
0
    if ( !spin_trylock(&d->hypercall_deadlock_mutex) )
3981
0
        return -ERESTART;
3982
0
3983
0
    /* Pause all other vcpus. */
3984
0
    for_each_vcpu ( d, v )
3985
0
        if ( v != current )
3986
0
            vcpu_pause_nosync(v);
3987
0
3988
0
    /* Now that all VCPUs are signalled to deschedule, we wait... */
3989
0
    for_each_vcpu ( d, v )
3990
0
        if ( v != current )
3991
0
            while ( !vcpu_runnable(v) && v->is_running )
3992
0
                cpu_relax();
3993
0
3994
0
    /* All other vcpus are paused, safe to unlock now. */
3995
0
    spin_unlock(&d->hypercall_deadlock_mutex);
3996
0
3997
0
    /* Flush paging-mode soft state (e.g., va->gfn cache; PAE PDPE cache). */
3998
0
    for_each_vcpu ( d, v )
3999
0
        paging_update_cr3(v);
4000
0
4001
0
    /* Flush all dirty TLBs. */
4002
0
    flush_tlb_mask(d->domain_dirty_cpumask);
4003
0
4004
0
    /* Done. */
4005
0
    for_each_vcpu ( d, v )
4006
0
        if ( v != current )
4007
0
            vcpu_unpause(v);
4008
0
4009
0
    return 0;
4010
0
}
4011
4012
static int hvmop_set_evtchn_upcall_vector(
4013
    XEN_GUEST_HANDLE_PARAM(xen_hvm_evtchn_upcall_vector_t) uop)
4014
0
{
4015
0
    xen_hvm_evtchn_upcall_vector_t op;
4016
0
    struct domain *d = current->domain;
4017
0
    struct vcpu *v;
4018
0
4019
0
    if ( !is_hvm_domain(d) )
4020
0
        return -EINVAL;
4021
0
4022
0
    if ( copy_from_guest(&op, uop, 1) )
4023
0
        return -EFAULT;
4024
0
4025
0
    if ( op.vector < 0x10 )
4026
0
        return -EINVAL;
4027
0
4028
0
    if ( op.vcpu >= d->max_vcpus || (v = d->vcpu[op.vcpu]) == NULL )
4029
0
        return -ENOENT;
4030
0
4031
0
    printk(XENLOG_G_INFO "%pv: upcall vector %02x\n", v, op.vector);
4032
0
4033
0
    v->arch.hvm_vcpu.evtchn_upcall_vector = op.vector;
4034
0
    return 0;
4035
0
}
4036
4037
static int hvm_allow_set_param(struct domain *d,
4038
                               const struct xen_hvm_param *a)
4039
1
{
4040
1
    uint64_t value = d->arch.hvm_domain.params[a->index];
4041
1
    int rc;
4042
1
4043
1
    rc = xsm_hvm_param(XSM_TARGET, d, HVMOP_set_param);
4044
1
    if ( rc )
4045
0
        return rc;
4046
1
4047
1
    switch ( a->index )
4048
1
    {
4049
1
    /* The following parameters can be set by the guest. */
4050
1
    case HVM_PARAM_CALLBACK_IRQ:
4051
1
    case HVM_PARAM_VM86_TSS:
4052
1
    case HVM_PARAM_VM86_TSS_SIZED:
4053
1
    case HVM_PARAM_ACPI_IOPORTS_LOCATION:
4054
1
    case HVM_PARAM_VM_GENERATION_ID_ADDR:
4055
1
    case HVM_PARAM_STORE_EVTCHN:
4056
1
    case HVM_PARAM_CONSOLE_EVTCHN:
4057
1
    case HVM_PARAM_X87_FIP_WIDTH:
4058
1
        break;
4059
1
    /*
4060
1
     * The following parameters must not be set by the guest
4061
1
     * since the domain may need to be paused.
4062
1
     */
4063
0
    case HVM_PARAM_IDENT_PT:
4064
0
    case HVM_PARAM_DM_DOMAIN:
4065
0
    case HVM_PARAM_ACPI_S_STATE:
4066
0
    /* The remaining parameters should not be set by the guest. */
4067
0
    default:
4068
0
        if ( d == current->domain )
4069
0
            rc = -EPERM;
4070
0
        break;
4071
1
    }
4072
1
4073
1
    if ( rc )
4074
0
        return rc;
4075
1
4076
1
    switch ( a->index )
4077
1
    {
4078
1
    /* The following parameters should only be changed once. */
4079
0
    case HVM_PARAM_VIRIDIAN:
4080
0
    case HVM_PARAM_IOREQ_SERVER_PFN:
4081
0
    case HVM_PARAM_NR_IOREQ_SERVER_PAGES:
4082
0
    case HVM_PARAM_ALTP2M:
4083
0
    case HVM_PARAM_MCA_CAP:
4084
0
        if ( value != 0 && a->value != value )
4085
0
            rc = -EEXIST;
4086
0
        break;
4087
1
    default:
4088
1
        break;
4089
1
    }
4090
1
4091
1
    return rc;
4092
1
}
4093
4094
static int hvmop_set_param(
4095
    XEN_GUEST_HANDLE_PARAM(xen_hvm_param_t) arg)
4096
1
{
4097
1
    struct domain *curr_d = current->domain;
4098
1
    struct xen_hvm_param a;
4099
1
    struct domain *d;
4100
1
    struct vcpu *v;
4101
1
    int rc;
4102
1
4103
1
    if ( copy_from_guest(&a, arg, 1) )
4104
0
        return -EFAULT;
4105
1
4106
1
    if ( a.index >= HVM_NR_PARAMS )
4107
0
        return -EINVAL;
4108
1
4109
1
    d = rcu_lock_domain_by_any_id(a.domid);
4110
1
    if ( d == NULL )
4111
0
        return -ESRCH;
4112
1
4113
1
    rc = -EINVAL;
4114
1
    if ( !is_hvm_domain(d) )
4115
0
        goto out;
4116
1
4117
1
    rc = hvm_allow_set_param(d, &a);
4118
1
    if ( rc )
4119
0
        goto out;
4120
1
4121
1
    switch ( a.index )
4122
1
    {
4123
1
    case HVM_PARAM_CALLBACK_IRQ:
4124
1
        hvm_set_callback_via(d, a.value);
4125
1
        hvm_latch_shinfo_size(d);
4126
1
        break;
4127
0
    case HVM_PARAM_TIMER_MODE:
4128
0
        if ( a.value > HVMPTM_one_missed_tick_pending )
4129
0
            rc = -EINVAL;
4130
0
        break;
4131
0
    case HVM_PARAM_VIRIDIAN:
4132
0
        if ( (a.value & ~HVMPV_feature_mask) ||
4133
0
             !(a.value & HVMPV_base_freq) )
4134
0
            rc = -EINVAL;
4135
0
        break;
4136
0
    case HVM_PARAM_IDENT_PT:
4137
0
        /*
4138
0
         * Only actually required for VT-x lacking unrestricted_guest
4139
0
         * capabilities.  Short circuit the pause if possible.
4140
0
         */
4141
0
        if ( !paging_mode_hap(d) || !cpu_has_vmx )
4142
0
        {
4143
0
            d->arch.hvm_domain.params[a.index] = a.value;
4144
0
            break;
4145
0
        }
4146
0
4147
0
        /*
4148
0
         * Update GUEST_CR3 in each VMCS to point at identity map.
4149
0
         * All foreign updates to guest state must synchronise on
4150
0
         * the domctl_lock.
4151
0
         */
4152
0
        rc = -ERESTART;
4153
0
        if ( !domctl_lock_acquire() )
4154
0
            break;
4155
0
4156
0
        rc = 0;
4157
0
        domain_pause(d);
4158
0
        d->arch.hvm_domain.params[a.index] = a.value;
4159
0
        for_each_vcpu ( d, v )
4160
0
            paging_update_cr3(v);
4161
0
        domain_unpause(d);
4162
0
4163
0
        domctl_lock_release();
4164
0
        break;
4165
0
    case HVM_PARAM_DM_DOMAIN:
4166
0
        if ( a.value == DOMID_SELF )
4167
0
            a.value = curr_d->domain_id;
4168
0
4169
0
        rc = hvm_set_dm_domain(d, a.value);
4170
0
        break;
4171
0
    case HVM_PARAM_ACPI_S_STATE:
4172
0
        rc = 0;
4173
0
        if ( a.value == 3 )
4174
0
            hvm_s3_suspend(d);
4175
0
        else if ( a.value == 0 )
4176
0
            hvm_s3_resume(d);
4177
0
        else
4178
0
            rc = -EINVAL;
4179
0
4180
0
        break;
4181
0
    case HVM_PARAM_ACPI_IOPORTS_LOCATION:
4182
0
        rc = pmtimer_change_ioport(d, a.value);
4183
0
        break;
4184
0
    case HVM_PARAM_MEMORY_EVENT_CR0:
4185
0
    case HVM_PARAM_MEMORY_EVENT_CR3:
4186
0
    case HVM_PARAM_MEMORY_EVENT_CR4:
4187
0
    case HVM_PARAM_MEMORY_EVENT_INT3:
4188
0
    case HVM_PARAM_MEMORY_EVENT_SINGLE_STEP:
4189
0
    case HVM_PARAM_MEMORY_EVENT_MSR:
4190
0
        /* Deprecated */
4191
0
        rc = -EOPNOTSUPP;
4192
0
        break;
4193
0
    case HVM_PARAM_NESTEDHVM:
4194
0
        rc = xsm_hvm_param_nested(XSM_PRIV, d);
4195
0
        if ( rc )
4196
0
            break;
4197
0
        if ( a.value > 1 )
4198
0
            rc = -EINVAL;
4199
0
        /*
4200
0
         * Remove the check below once we have
4201
0
         * shadow-on-shadow.
4202
0
         */
4203
0
        if ( !paging_mode_hap(d) && a.value )
4204
0
            rc = -EINVAL;
4205
0
        if ( a.value &&
4206
0
             d->arch.hvm_domain.params[HVM_PARAM_ALTP2M] )
4207
0
            rc = -EINVAL;
4208
0
        /* Set up NHVM state for any vcpus that are already up. */
4209
0
        if ( a.value &&
4210
0
             !d->arch.hvm_domain.params[HVM_PARAM_NESTEDHVM] )
4211
0
            for_each_vcpu(d, v)
4212
0
                if ( rc == 0 )
4213
0
                    rc = nestedhvm_vcpu_initialise(v);
4214
0
        if ( !a.value || rc )
4215
0
            for_each_vcpu(d, v)
4216
0
                nestedhvm_vcpu_destroy(v);
4217
0
        break;
4218
0
    case HVM_PARAM_ALTP2M:
4219
0
        rc = xsm_hvm_param_altp2mhvm(XSM_PRIV, d);
4220
0
        if ( rc )
4221
0
            break;
4222
0
        if ( a.value > XEN_ALTP2M_limited )
4223
0
            rc = -EINVAL;
4224
0
        if ( a.value &&
4225
0
             d->arch.hvm_domain.params[HVM_PARAM_NESTEDHVM] )
4226
0
            rc = -EINVAL;
4227
0
        break;
4228
0
    case HVM_PARAM_BUFIOREQ_EVTCHN:
4229
0
        rc = -EINVAL;
4230
0
        break;
4231
0
    case HVM_PARAM_TRIPLE_FAULT_REASON:
4232
0
        if ( a.value > SHUTDOWN_MAX )
4233
0
            rc = -EINVAL;
4234
0
        break;
4235
0
    case HVM_PARAM_IOREQ_SERVER_PFN:
4236
0
        d->arch.hvm_domain.ioreq_gfn.base = a.value;
4237
0
        break;
4238
0
    case HVM_PARAM_NR_IOREQ_SERVER_PAGES:
4239
0
    {
4240
0
        unsigned int i;
4241
0
4242
0
        if ( a.value == 0 ||
4243
0
             a.value > sizeof(d->arch.hvm_domain.ioreq_gfn.mask) * 8 )
4244
0
        {
4245
0
            rc = -EINVAL;
4246
0
            break;
4247
0
        }
4248
0
        for ( i = 0; i < a.value; i++ )
4249
0
            set_bit(i, &d->arch.hvm_domain.ioreq_gfn.mask);
4250
0
4251
0
        break;
4252
0
    }
4253
0
    case HVM_PARAM_X87_FIP_WIDTH:
4254
0
        if ( a.value != 0 && a.value != 4 && a.value != 8 )
4255
0
        {
4256
0
            rc = -EINVAL;
4257
0
            break;
4258
0
        }
4259
0
        d->arch.x87_fip_width = a.value;
4260
0
        break;
4261
0
4262
0
    case HVM_PARAM_VM86_TSS:
4263
0
        /* Hardware would silently truncate high bits. */
4264
0
        if ( a.value != (uint32_t)a.value )
4265
0
        {
4266
0
            if ( d == curr_d )
4267
0
                domain_crash(d);
4268
0
            rc = -EINVAL;
4269
0
        }
4270
0
        /* Old hvmloader binaries hardcode the size to 128 bytes. */
4271
0
        if ( a.value )
4272
0
            a.value |= (128ULL << 32) | VM86_TSS_UPDATED;
4273
0
        a.index = HVM_PARAM_VM86_TSS_SIZED;
4274
0
        break;
4275
0
4276
0
    case HVM_PARAM_VM86_TSS_SIZED:
4277
0
        if ( (a.value >> 32) < sizeof(struct tss32) )
4278
0
        {
4279
0
            if ( d == curr_d )
4280
0
                domain_crash(d);
4281
0
            rc = -EINVAL;
4282
0
        }
4283
0
        /*
4284
0
         * Cap at the theoretically useful maximum (base structure plus
4285
0
         * 256 bits interrupt redirection bitmap + 64k bits I/O bitmap
4286
0
         * plus one padding byte).
4287
0
         */
4288
0
        if ( (a.value >> 32) > sizeof(struct tss32) +
4289
0
                               (0x100 / 8) + (0x10000 / 8) + 1 )
4290
0
            a.value = (uint32_t)a.value |
4291
0
                      ((sizeof(struct tss32) + (0x100 / 8) +
4292
0
                                               (0x10000 / 8) + 1) << 32);
4293
0
        a.value |= VM86_TSS_UPDATED;
4294
0
        break;
4295
0
4296
0
    case HVM_PARAM_MCA_CAP:
4297
0
        rc = vmce_enable_mca_cap(d, a.value);
4298
0
        break;
4299
1
    }
4300
1
4301
1
    if ( rc != 0 )
4302
0
        goto out;
4303
1
4304
1
    d->arch.hvm_domain.params[a.index] = a.value;
4305
1
4306
1
    HVM_DBG_LOG(DBG_LEVEL_HCALL, "set param %u = %"PRIx64,
4307
1
                a.index, a.value);
4308
1
4309
1
 out:
4310
1
    rcu_unlock_domain(d);
4311
1
    return rc;
4312
1
}
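
Assuming struct tss32 matches the architectural 104-byte 32-bit TSS layout, the HVM_PARAM_VM86_TSS_SIZED cap above works out to 104 + 0x100/8 + 0x10000/8 + 1 = 104 + 32 + 8192 + 1 = 8329 bytes: the base structure, a 256-bit interrupt redirection bitmap, a 64k-bit I/O bitmap, and the trailing padding byte.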
4313
4314
static int hvm_allow_get_param(struct domain *d,
4315
                               const struct xen_hvm_param *a)
4316
3
{
4317
3
    int rc;
4318
3
4319
3
    rc = xsm_hvm_param(XSM_TARGET, d, HVMOP_get_param);
4320
3
    if ( rc )
4321
0
        return rc;
4322
3
4323
3
    switch ( a->index )
4324
3
    {
4325
3
    /* The following parameters can be read by the guest. */
4326
3
    case HVM_PARAM_CALLBACK_IRQ:
4327
3
    case HVM_PARAM_VM86_TSS:
4328
3
    case HVM_PARAM_VM86_TSS_SIZED:
4329
3
    case HVM_PARAM_ACPI_IOPORTS_LOCATION:
4330
3
    case HVM_PARAM_VM_GENERATION_ID_ADDR:
4331
3
    case HVM_PARAM_STORE_PFN:
4332
3
    case HVM_PARAM_STORE_EVTCHN:
4333
3
    case HVM_PARAM_CONSOLE_PFN:
4334
3
    case HVM_PARAM_CONSOLE_EVTCHN:
4335
3
    case HVM_PARAM_ALTP2M:
4336
3
    case HVM_PARAM_X87_FIP_WIDTH:
4337
3
        break;
4338
3
    /*
4339
3
     * The following parameters must not be read by the guest
4340
3
     * since the domain may need to be paused.
4341
3
     */
4342
0
    case HVM_PARAM_IOREQ_PFN:
4343
0
    case HVM_PARAM_BUFIOREQ_PFN:
4344
0
    case HVM_PARAM_BUFIOREQ_EVTCHN:
4345
0
    /* The remaining parameters should not be read by the guest. */
4346
0
    default:
4347
0
        if ( d == current->domain )
4348
0
            rc = -EPERM;
4349
0
        break;
4350
3
    }
4351
3
4352
3
    return rc;
4353
3
}
4354
4355
static int hvmop_get_param(
4356
    XEN_GUEST_HANDLE_PARAM(xen_hvm_param_t) arg)
4357
3
{
4358
3
    struct xen_hvm_param a;
4359
3
    struct domain *d;
4360
3
    int rc;
4361
3
4362
3
    if ( copy_from_guest(&a, arg, 1) )
4363
0
        return -EFAULT;
4364
3
4365
3
    if ( a.index >= HVM_NR_PARAMS )
4366
0
        return -EINVAL;
4367
3
4368
3
    d = rcu_lock_domain_by_any_id(a.domid);
4369
3
    if ( d == NULL )
4370
0
        return -ESRCH;
4371
3
4372
3
    rc = -EINVAL;
4373
3
    if ( !is_hvm_domain(d) )
4374
0
        goto out;
4375
3
4376
3
    rc = hvm_allow_get_param(d, &a);
4377
3
    if ( rc )
4378
0
        goto out;
4379
3
4380
3
    switch ( a.index )
4381
3
    {
4382
0
    case HVM_PARAM_ACPI_S_STATE:
4383
0
        a.value = d->arch.hvm_domain.is_s3_suspended ? 3 : 0;
4384
0
        break;
4385
0
4386
0
    case HVM_PARAM_VM86_TSS:
4387
0
        a.value = (uint32_t)d->arch.hvm_domain.params[HVM_PARAM_VM86_TSS_SIZED];
4388
0
        break;
4389
0
4390
0
    case HVM_PARAM_VM86_TSS_SIZED:
4391
0
        a.value = d->arch.hvm_domain.params[HVM_PARAM_VM86_TSS_SIZED] &
4392
0
                  ~VM86_TSS_UPDATED;
4393
0
        break;
4394
0
4395
0
    case HVM_PARAM_X87_FIP_WIDTH:
4396
0
        a.value = d->arch.x87_fip_width;
4397
0
        break;
4398
0
    case HVM_PARAM_IOREQ_PFN:
4399
0
    case HVM_PARAM_BUFIOREQ_PFN:
4400
0
    case HVM_PARAM_BUFIOREQ_EVTCHN:
4401
0
        /*
4402
0
         * It may be necessary to create a default ioreq server here,
4403
0
         * because legacy versions of QEMU are not aware of the new API for
4404
0
         * explicit ioreq server creation. However, if the domain is not
4405
0
         * under construction then it will not be QEMU querying the
4406
0
         * parameters and thus the query should not have that side-effect.
4407
0
         */
4408
0
        if ( !d->creation_finished )
4409
0
        {
4410
0
            domid_t domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN];
4411
0
4412
0
            rc = hvm_create_ioreq_server(d, domid, true,
4413
0
                                         HVM_IOREQSRV_BUFIOREQ_LEGACY, NULL);
4414
0
            if ( rc != 0 && rc != -EEXIST )
4415
0
                goto out;
4416
0
        }
4417
0
4418
0
    /*FALLTHRU*/
4419
3
    default:
4420
3
        a.value = d->arch.hvm_domain.params[a.index];
4421
3
        break;
4422
3
    }
4423
3
4424
3
    rc = __copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
4425
3
4426
3
    HVM_DBG_LOG(DBG_LEVEL_HCALL, "get param %u = %"PRIx64,
4427
3
                a.index, a.value);
4428
3
4429
3
 out:
4430
3
    rcu_unlock_domain(d);
4431
3
    return rc;
4432
3
}
4433
4434
static int do_altp2m_op(
4435
    XEN_GUEST_HANDLE_PARAM(void) arg)
4436
0
{
4437
0
    struct xen_hvm_altp2m_op a;
4438
0
    struct domain *d = NULL;
4439
0
    int rc = 0;
4440
0
    uint64_t mode;
4441
0
4442
0
    if ( !hvm_altp2m_supported() )
4443
0
        return -EOPNOTSUPP;
4444
0
4445
0
    if ( copy_from_guest(&a, arg, 1) )
4446
0
        return -EFAULT;
4447
0
4448
0
    if ( a.pad1 || a.pad2 ||
4449
0
         (a.version != HVMOP_ALTP2M_INTERFACE_VERSION) )
4450
0
        return -EINVAL;
4451
0
4452
0
    switch ( a.cmd )
4453
0
    {
4454
0
    case HVMOP_altp2m_get_domain_state:
4455
0
    case HVMOP_altp2m_set_domain_state:
4456
0
    case HVMOP_altp2m_vcpu_enable_notify:
4457
0
    case HVMOP_altp2m_create_p2m:
4458
0
    case HVMOP_altp2m_destroy_p2m:
4459
0
    case HVMOP_altp2m_switch_p2m:
4460
0
    case HVMOP_altp2m_set_mem_access:
4461
0
    case HVMOP_altp2m_change_gfn:
4462
0
        break;
4463
0
    default:
4464
0
        return -EOPNOTSUPP;
4465
0
    }
4466
0
4467
0
    d = ( a.cmd != HVMOP_altp2m_vcpu_enable_notify ) ?
4468
0
        rcu_lock_domain_by_any_id(a.domain) : rcu_lock_current_domain();
4469
0
4470
0
    if ( d == NULL )
4471
0
        return -ESRCH;
4472
0
4473
0
    if ( !is_hvm_domain(d) )
4474
0
    {
4475
0
        rc = -EOPNOTSUPP;
4476
0
        goto out;
4477
0
    }
4478
0
4479
0
    if ( (a.cmd != HVMOP_altp2m_get_domain_state) &&
4480
0
         (a.cmd != HVMOP_altp2m_set_domain_state) &&
4481
0
         !d->arch.altp2m_active )
4482
0
    {
4483
0
        rc = -EOPNOTSUPP;
4484
0
        goto out;
4485
0
    }
4486
0
4487
0
    mode = d->arch.hvm_domain.params[HVM_PARAM_ALTP2M];
4488
0
4489
0
    if ( XEN_ALTP2M_disabled == mode )
4490
0
    {
4491
0
        rc = -EINVAL;
4492
0
        goto out;
4493
0
    }
4494
0
4495
0
    if ( (rc = xsm_hvm_altp2mhvm_op(XSM_OTHER, d, mode, a.cmd)) )
4496
0
        goto out;
4497
0
4498
0
    switch ( a.cmd )
4499
0
    {
4500
0
    case HVMOP_altp2m_get_domain_state:
4501
0
        a.u.domain_state.state = altp2m_active(d);
4502
0
        rc = __copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
4503
0
        break;
4504
0
4505
0
    case HVMOP_altp2m_set_domain_state:
4506
0
    {
4507
0
        struct vcpu *v;
4508
0
        bool_t ostate;
4509
0
4510
0
        if ( nestedhvm_enabled(d) )
4511
0
        {
4512
0
            rc = -EINVAL;
4513
0
            break;
4514
0
        }
4515
0
4516
0
        ostate = d->arch.altp2m_active;
4517
0
        d->arch.altp2m_active = !!a.u.domain_state.state;
4518
0
4519
0
        /* If the alternate p2m state has changed, handle appropriately */
4520
0
        if ( d->arch.altp2m_active != ostate &&
4521
0
             (ostate || !(rc = p2m_init_altp2m_by_id(d, 0))) )
4522
0
        {
4523
0
            for_each_vcpu( d, v )
4524
0
            {
4525
0
                if ( !ostate )
4526
0
                    altp2m_vcpu_initialise(v);
4527
0
                else
4528
0
                    altp2m_vcpu_destroy(v);
4529
0
            }
4530
0
4531
0
            if ( ostate )
4532
0
                p2m_flush_altp2m(d);
4533
0
        }
4534
0
        break;
4535
0
    }
4536
0
4537
0
    case HVMOP_altp2m_vcpu_enable_notify:
4538
0
    {
4539
0
        struct vcpu *curr = current;
4540
0
        p2m_type_t p2mt;
4541
0
4542
0
        if ( a.u.enable_notify.pad || a.domain != DOMID_SELF ||
4543
0
             a.u.enable_notify.vcpu_id != curr->vcpu_id )
4544
0
            rc = -EINVAL;
4545
0
4546
0
        if ( !gfn_eq(vcpu_altp2m(curr).veinfo_gfn, INVALID_GFN) ||
4547
0
             mfn_eq(get_gfn_query_unlocked(curr->domain,
4548
0
                    a.u.enable_notify.gfn, &p2mt), INVALID_MFN) )
4549
0
            return -EINVAL;
4550
0
4551
0
        vcpu_altp2m(curr).veinfo_gfn = _gfn(a.u.enable_notify.gfn);
4552
0
        altp2m_vcpu_update_vmfunc_ve(curr);
4553
0
        break;
4554
0
    }
4555
0
4556
0
    case HVMOP_altp2m_create_p2m:
4557
0
        if ( !(rc = p2m_init_next_altp2m(d, &a.u.view.view)) )
4558
0
            rc = __copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
4559
0
        break;
4560
0
4561
0
    case HVMOP_altp2m_destroy_p2m:
4562
0
        rc = p2m_destroy_altp2m_by_id(d, a.u.view.view);
4563
0
        break;
4564
0
4565
0
    case HVMOP_altp2m_switch_p2m:
4566
0
        rc = p2m_switch_domain_altp2m_by_id(d, a.u.view.view);
4567
0
        break;
4568
0
4569
0
    case HVMOP_altp2m_set_mem_access:
4570
0
        if ( a.u.set_mem_access.pad )
4571
0
            rc = -EINVAL;
4572
0
        else
4573
0
            rc = p2m_set_mem_access(d, _gfn(a.u.set_mem_access.gfn), 1, 0, 0,
4574
0
                                    a.u.set_mem_access.hvmmem_access,
4575
0
                                    a.u.set_mem_access.view);
4576
0
        break;
4577
0
4578
0
    case HVMOP_altp2m_change_gfn:
4579
0
        if ( a.u.change_gfn.pad1 || a.u.change_gfn.pad2 )
4580
0
            rc = -EINVAL;
4581
0
        else
4582
0
            rc = p2m_change_altp2m_gfn(d, a.u.change_gfn.view,
4583
0
                    _gfn(a.u.change_gfn.old_gfn),
4584
0
                    _gfn(a.u.change_gfn.new_gfn));
4585
0
        break;
4586
0
    default:
4587
0
        ASSERT_UNREACHABLE();
4588
0
    }
4589
0
4590
0
 out:
4591
0
    rcu_unlock_domain(d);
4592
0
4593
0
    return rc;
4594
0
}
4595
4596
static int hvmop_get_mem_type(
4597
    XEN_GUEST_HANDLE_PARAM(xen_hvm_get_mem_type_t) arg)
4598
0
{
4599
0
    struct xen_hvm_get_mem_type a;
4600
0
    struct domain *d;
4601
0
    p2m_type_t t;
4602
0
    int rc;
4603
0
4604
0
    if ( copy_from_guest(&a, arg, 1) )
4605
0
        return -EFAULT;
4606
0
4607
0
    d = rcu_lock_domain_by_any_id(a.domid);
4608
0
    if ( d == NULL )
4609
0
        return -ESRCH;
4610
0
4611
0
    rc = xsm_hvm_param(XSM_TARGET, d, HVMOP_get_mem_type);
4612
0
    if ( rc )
4613
0
        goto out;
4614
0
4615
0
    rc = -EINVAL;
4616
0
    if ( !is_hvm_domain(d) )
4617
0
        goto out;
4618
0
4619
0
    /*
4620
0
     * Use get_gfn query as we are interested in the current
4621
0
     * type, not in allocating or unsharing. That'll happen
4622
0
     * on access.
4623
0
     */
4624
0
    get_gfn_query_unlocked(d, a.pfn, &t);
4625
0
    if ( p2m_is_mmio(t) )
4626
0
        a.mem_type =  HVMMEM_mmio_dm;
4627
0
    else if ( t == p2m_ioreq_server )
4628
0
        a.mem_type = HVMMEM_ioreq_server;
4629
0
    else if ( p2m_is_readonly(t) )
4630
0
        a.mem_type =  HVMMEM_ram_ro;
4631
0
    else if ( p2m_is_ram(t) )
4632
0
        a.mem_type =  HVMMEM_ram_rw;
4633
0
    else if ( p2m_is_pod(t) )
4634
0
        a.mem_type =  HVMMEM_ram_rw;
4635
0
    else if ( p2m_is_grant(t) )
4636
0
        a.mem_type =  HVMMEM_ram_rw;
4637
0
    else
4638
0
        a.mem_type =  HVMMEM_mmio_dm;
4639
0
4640
0
    rc = -EFAULT;
4641
0
    if ( __copy_to_guest(arg, &a, 1) )
4642
0
        goto out;
4643
0
    rc = 0;
4644
0
4645
0
 out:
4646
0
    rcu_unlock_domain(d);
4647
0
4648
0
    return rc;
4649
0
}
4650
4651
long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg)
4652
4
{
4653
4
    long rc = 0;
4654
4
4655
4
    /*
4656
4
     * NB: hvm_op can be part of a restarted hypercall; but at the
4657
4
     * moment the only hypercalls which do continuations don't need to
4658
4
     * store any iteration information (since they're just re-trying
4659
4
     * the acquisition of a lock).
4660
4
     */
4661
4
4662
4
    switch ( op )
4663
4
    {
4664
0
    case HVMOP_set_evtchn_upcall_vector:
4665
0
        rc = hvmop_set_evtchn_upcall_vector(
4666
0
            guest_handle_cast(arg, xen_hvm_evtchn_upcall_vector_t));
4667
0
        break;
4668
0
    
4669
1
    case HVMOP_set_param:
4670
1
        rc = hvmop_set_param(
4671
1
            guest_handle_cast(arg, xen_hvm_param_t));
4672
1
        break;
4673
0
4674
3
    case HVMOP_get_param:
4675
3
        rc = hvmop_get_param(
4676
3
            guest_handle_cast(arg, xen_hvm_param_t));
4677
3
        break;
4678
0
4679
0
    case HVMOP_flush_tlbs:
4680
0
        rc = guest_handle_is_null(arg) ? hvmop_flush_tlb_all() : -EINVAL;
4681
0
        break;
4682
0
4683
0
    case HVMOP_get_mem_type:
4684
0
        rc = hvmop_get_mem_type(
4685
0
            guest_handle_cast(arg, xen_hvm_get_mem_type_t));
4686
0
        break;
4687
0
4688
0
    case HVMOP_pagetable_dying:
4689
0
    {
4690
0
        struct xen_hvm_pagetable_dying a;
4691
0
        struct domain *d;
4692
0
4693
0
        if ( copy_from_guest(&a, arg, 1) )
4694
0
            return -EFAULT;
4695
0
4696
0
        d = rcu_lock_domain_by_any_id(a.domid);
4697
0
        if ( d == NULL )
4698
0
            return -ESRCH;
4699
0
4700
0
        rc = -EINVAL;
4701
0
        if ( is_hvm_domain(d) && paging_mode_shadow(d) )
4702
0
            rc = xsm_hvm_param(XSM_TARGET, d, op);
4703
0
        if ( !rc )
4704
0
            pagetable_dying(d, a.gpa);
4705
0
4706
0
        rcu_unlock_domain(d);
4707
0
        break;
4708
0
    }
4709
0
4710
0
    case HVMOP_get_time: {
4711
0
        xen_hvm_get_time_t gxt;
4712
0
4713
0
        gxt.now = NOW();
4714
0
        if ( copy_to_guest(arg, &gxt, 1) )
4715
0
            rc = -EFAULT;
4716
0
        break;
4717
0
    }
4718
0
4719
0
    case HVMOP_xentrace: {
4720
0
        xen_hvm_xentrace_t tr;
4721
0
4722
0
        if ( copy_from_guest(&tr, arg, 1 ) )
4723
0
            return -EFAULT;
4724
0
4725
0
        if ( tr.extra_bytes > sizeof(tr.extra)
4726
0
             || (tr.event & ~((1u<<TRC_SUBCLS_SHIFT)-1)) )
4727
0
            return -EINVAL;
4728
0
4729
0
        /* Cycles will be taken at the vmexit and vmenter */
4730
0
        trace_var(tr.event | TRC_GUEST, 0 /*!cycles*/,
4731
0
                  tr.extra_bytes, tr.extra);
4732
0
        break;
4733
0
    }
4734
0
4735
0
    case HVMOP_guest_request_vm_event:
4736
0
        if ( guest_handle_is_null(arg) )
4737
0
            monitor_guest_request();
4738
0
        else
4739
0
            rc = -EINVAL;
4740
0
        break;
4741
0
4742
0
    case HVMOP_altp2m:
4743
0
        rc = do_altp2m_op(arg);
4744
0
        break;
4745
0
4746
0
    default:
4747
0
    {
4748
0
        gdprintk(XENLOG_DEBUG, "Bad HVM op %ld.\n", op);
4749
0
        rc = -ENOSYS;
4750
0
        break;
4751
0
    }
4752
4
    }
4753
4
4754
4
    if ( rc == -ERESTART )
4755
0
        rc = hypercall_create_continuation(__HYPERVISOR_hvm_op, "lh",
4756
0
                                           op, arg);
4757
4
4758
4
    return rc;
4759
4
}
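/*
 * Sketch of the continuation pattern the dispatcher above relies on.
 * hvmop_try_lock()/hvmop_unlock() are invented names for illustration
 * only: the point is that a sub-op which bails out with -ERESTART is
 * re-issued verbatim via hypercall_create_continuation(), so it must
 * not need to stash any per-iteration state.
 */
static long hvmop_example(XEN_GUEST_HANDLE_PARAM(void) arg)
{
    if ( !hvmop_try_lock() )    /* contended: ask the guest to retry */
        return -ERESTART;

    /* ... perform the real work under the lock ... */

    hvmop_unlock();
    return 0;
}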
4760
4761
int hvm_debug_op(struct vcpu *v, int32_t op)
4762
0
{
4763
0
    int rc;
4764
0
4765
0
    switch ( op )
4766
0
    {
4767
0
        case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON:
4768
0
        case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF:
4769
0
            rc = -EOPNOTSUPP;
4770
0
            if ( !cpu_has_monitor_trap_flag )
4771
0
                break;
4772
0
            rc = 0;
4773
0
            vcpu_pause(v);
4774
0
            v->arch.hvm_vcpu.single_step =
4775
0
                (op == XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON);
4776
0
            vcpu_unpause(v); /* guest will latch new state */
4777
0
            break;
4778
0
        default:
4779
0
            rc = -ENOSYS;
4780
0
            break;
4781
0
    }
4782
0
4783
0
    return rc;
4784
0
}
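/*
 * Toolstack-side sketch: single-stepping is driven through
 * XEN_DOMCTL_debug_op, which lands in hvm_debug_op() above.  This
 * assumes libxc's xc_domain_debug_control() wrapper (as used by gdbsx);
 * verify the name/signature against your xenctrl.h.  Expect -EOPNOTSUPP
 * on hardware without the Monitor Trap Flag.
 */
#include <xenctrl.h>

static int single_step_on(xc_interface *xch, uint32_t domid, uint32_t vcpu)
{
    return xc_domain_debug_control(xch, domid,
                                   XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON,
                                   vcpu);
}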
4785
4786
void hvm_toggle_singlestep(struct vcpu *v)
4787
0
{
4788
0
    ASSERT(atomic_read(&v->pause_count));
4789
0
4790
0
    if ( !hvm_is_singlestep_supported() )
4791
0
        return;
4792
0
4793
0
    v->arch.hvm_vcpu.single_step = !v->arch.hvm_vcpu.single_step;
4794
0
}
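/*
 * Expected calling pattern (sketch only): the caller must already hold
 * a pause reference on the vCPU, which is what the ASSERT above checks.
 */
static void toggle_singlestep_example(struct vcpu *v)
{
    vcpu_pause(v);              /* take a pause reference               */
    hvm_toggle_singlestep(v);   /* flip single_step while descheduled   */
    vcpu_unpause(v);            /* new state takes effect on next entry */
}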
4795
4796
int hvm_set_mode(struct vcpu *v, int mode)
4797
0
{
4798
0
4799
0
    switch ( mode )
4800
0
    {
4801
0
    case 4:
4802
0
        v->arch.hvm_vcpu.guest_efer &= ~(EFER_LMA | EFER_LME);
4803
0
        break;
4804
0
    case 8:
4805
0
        v->arch.hvm_vcpu.guest_efer |= (EFER_LMA | EFER_LME);
4806
0
        break;
4807
0
    default:
4808
0
        return -EOPNOTSUPP;
4809
0
    }
4810
0
4811
0
    hvm_update_guest_efer(v);
4812
0
4813
0
    if ( hvm_funcs.set_mode )
4814
0
        return hvm_funcs.set_mode(v, mode);
4815
0
4816
0
    return 0;
4817
0
}
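/*
 * Sketch of a caller choosing the mode argument: 8 selects long mode
 * (EFER.LME|LMA set), 4 selects 32-bit mode (both cleared), and any
 * other value is rejected with -EOPNOTSUPP.  The helper below is
 * illustrative only.
 */
static int set_guest_width(struct vcpu *v, bool sixty_four_bit)
{
    return hvm_set_mode(v, sixty_four_bit ? 8 : 4);
}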
4818
4819
void hvm_domain_soft_reset(struct domain *d)
4820
0
{
4821
0
    hvm_destroy_all_ioreq_servers(d);
4822
0
}
4823
4824
/*
4825
 * Segment caches in VMCB/VMCS are inconsistent about which bits are checked,
4826
 * important, and preserved across vmentry/exit.  Cook the values to make them
4827
 * closer to what is architecturally expected from entries in the segment
4828
 * cache.
4829
 */
4830
void hvm_get_segment_register(struct vcpu *v, enum x86_segment seg,
4831
                              struct segment_register *reg)
4832
180k
{
4833
180k
    hvm_funcs.get_segment_register(v, seg, reg);
4834
180k
4835
180k
    switch ( seg )
4836
180k
    {
4837
60.1k
    case x86_seg_ss:
4838
60.1k
        /* SVM may retain %ss.DB when %ss is loaded with a NULL selector. */
4839
60.1k
        if ( !reg->p )
4840
6
            reg->db = 0;
4841
60.1k
        break;
4842
60.1k
4843
0
    case x86_seg_tr:
4844
0
        /*
4845
0
         * SVM doesn't track %tr.B. Architecturally, a loaded TSS segment will
4846
0
         * always be busy.
4847
0
         */
4848
0
        reg->type |= 0x2;
4849
0
4850
0
        /*
4851
0
         * %cs and %tr are unconditionally present.  SVM ignores these present
4852
0
         * bits and will happily run without them set.
4853
0
         */
4854
60.1k
    case x86_seg_cs:
4855
60.1k
        reg->p = 1;
4856
60.1k
        break;
4857
0
4858
0
    case x86_seg_gdtr:
4859
0
    case x86_seg_idtr:
4860
0
        /*
4861
0
         * Treat GDTR/IDTR as being present system segments.  This avoids them
4862
0
         * needing special casing for segmentation checks.
4863
0
         */
4864
0
        reg->attr = 0x80;
4865
0
        break;
4866
0
4867
60.1k
    default: /* Avoid triggering -Werror=switch */
4868
60.1k
        break;
4869
180k
    }
4870
180k
4871
180k
    if ( reg->p )
4872
180k
    {
4873
180k
        /*
4874
180k
         * For segments which are present/usable, cook the system flag.  SVM
4875
180k
         * ignores the S bit on all segments and will happily run with them in
4876
180k
         * any state.
4877
180k
         */
4878
180k
        reg->s = is_x86_user_segment(seg);
4879
180k
4880
180k
        /*
4881
180k
         * SVM discards %cs.G on #VMEXIT.  Other user segments do have .G
4882
180k
         * tracked, but Linux commit 80112c89ed87 "KVM: Synthesize G bit for
4883
180k
         * all segments." indicates that this isn't necessarily the case when
4884
180k
         * nested under ESXi.
4885
180k
         *
4886
180k
         * Unconditionally recalculate G.
4887
180k
         */
4888
180k
        reg->g = !!(reg->limit >> 20);
4889
180k
4890
180k
        /*
4891
180k
         * SVM doesn't track the Accessed flag.  It will always be set for
4892
180k
         * usable user segments loaded into the descriptor cache.
4893
180k
         */
4894
180k
        if ( is_x86_user_segment(seg) )
4895
180k
            reg->type |= 0x1;
4896
180k
    }
4897
180k
}
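/*
 * Worked illustration of the G recalculation above (not part of the
 * original file): the cached limit is already the expanded byte-granular
 * limit, so any value needing more than 20 bits can only have come from
 * a page-granular descriptor.
 */
static bool recalc_g(uint32_t byte_limit)
{
    /* 0xffffffff (flat segment) -> 1; 0xffff (16-bit style) -> 0 */
    return byte_limit >> 20;
}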
4898
4899
void hvm_set_segment_register(struct vcpu *v, enum x86_segment seg,
4900
                              struct segment_register *reg)
4901
115
{
4902
115
    /* Set G to match the limit field.  VT-x cares, while SVM doesn't. */
4903
115
    if ( reg->p )
4904
92
        reg->g = !!(reg->limit >> 20);
4905
115
4906
115
    switch ( seg )
4907
115
    {
4908
12
    case x86_seg_cs:
4909
12
        ASSERT(reg->p);                              /* Usable. */
4910
12
        ASSERT(reg->s);                              /* User segment. */
4911
12
        ASSERT(reg->type & 0x1);                     /* Accessed. */
4912
12
        ASSERT((reg->base >> 32) == 0);              /* Upper bits clear. */
4913
12
        break;
4914
12
4915
12
    case x86_seg_ss:
4916
12
        if ( reg->p )
4917
12
        {
4918
12
            ASSERT(reg->s);                          /* User segment. */
4919
12
            ASSERT(!(reg->type & 0x8));              /* Data segment. */
4920
12
            ASSERT(reg->type & 0x2);                 /* Writeable. */
4921
12
            ASSERT(reg->type & 0x1);                 /* Accessed. */
4922
12
            ASSERT((reg->base >> 32) == 0);          /* Upper bits clear. */
4923
12
        }
4924
12
        break;
4925
12
4926
46
    case x86_seg_ds:
4927
46
    case x86_seg_es:
4928
46
    case x86_seg_fs:
4929
46
    case x86_seg_gs:
4930
46
        if ( reg->p )
4931
45
        {
4932
45
            ASSERT(reg->s);                          /* User segment. */
4933
45
4934
45
            if ( reg->type & 0x8 )
4935
0
                ASSERT(reg->type & 0x2);             /* Readable. */
4936
45
4937
45
            ASSERT(reg->type & 0x1);                 /* Accessed. */
4938
45
4939
45
            if ( seg == x86_seg_fs || seg == x86_seg_gs )
4940
22
                ASSERT(is_canonical_address(reg->base));
4941
45
            else
4942
23
                ASSERT((reg->base >> 32) == 0);      /* Upper bits clear. */
4943
45
        }
4944
46
        break;
4945
46
4946
12
    case x86_seg_tr:
4947
12
        ASSERT(reg->p);                              /* Usable. */
4948
12
        ASSERT(!reg->s);                             /* System segment. */
4949
12
        ASSERT(!(reg->sel & 0x4));                   /* !TI. */
4950
12
        if ( reg->type == SYS_DESC_tss_busy )
4951
12
            ASSERT(is_canonical_address(reg->base));
4952
0
        else if ( reg->type == SYS_DESC_tss16_busy )
4953
0
            ASSERT((reg->base >> 32) == 0);
4954
0
        else
4955
0
            ASSERT(!"%tr typecheck failure");
4956
12
        break;
4957
46
4958
11
    case x86_seg_ldtr:
4959
11
        if ( reg->p )
4960
11
        {
4961
11
            ASSERT(!reg->s);                         /* System segment. */
4962
11
            ASSERT(!(reg->sel & 0x4));               /* !TI. */
4963
11
            ASSERT(reg->type == SYS_DESC_ldt);
4964
11
            ASSERT(is_canonical_address(reg->base));
4965
11
        }
4966
11
        break;
4967
46
4968
22
    case x86_seg_gdtr:
4969
22
    case x86_seg_idtr:
4970
22
        ASSERT(is_canonical_address(reg->base));
4971
22
        ASSERT((reg->limit >> 16) == 0);             /* Upper bits clear. */
4972
22
        break;
4973
22
4974
0
    default:
4975
0
        ASSERT_UNREACHABLE();
4976
0
        return;
4977
115
    }
4978
115
4979
115
    hvm_funcs.set_segment_register(v, seg, reg);
4980
115
}
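/*
 * Sketch of a caller building a descriptor-cache entry that satisfies
 * the checks above: a flat, present, accessed, writable data segment
 * suitable for %ss.  The selector value is an arbitrary example; 0xc93
 * is the conventional attribute encoding for such a segment (type 3,
 * S, DPL 0, P, D/B, G), with G recomputed from the limit anyway.
 */
static void load_flat_ss_example(struct vcpu *v)
{
    struct segment_register ss;

    ss.sel   = 0x18;
    ss.base  = 0;
    ss.limit = 0xffffffff;
    ss.attr  = 0xc93;

    hvm_set_segment_register(v, x86_seg_ss, &ss);
}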
4981
4982
/*
4983
 * Local variables:
4984
 * mode: C
4985
 * c-file-style: "BSD"
4986
 * c-basic-offset: 4
4987
 * tab-width: 4
4988
 * indent-tabs-mode: nil
4989
 * End:
4990
 */
4991