Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/arch/x86/domain.c
Line  Count  Source
1
/******************************************************************************
2
 * arch/x86/domain.c
3
 *
4
 * x86-specific domain handling (e.g., register setup and context switching).
5
 */
6
7
/*
8
 *  Copyright (C) 1995  Linus Torvalds
9
 *
10
 *  Pentium III FXSR, SSE support
11
 *  Gareth Hughes <gareth@valinux.com>, May 2000
12
 */
13
14
#include <xen/init.h>
15
#include <xen/lib.h>
16
#include <xen/errno.h>
17
#include <xen/sched.h>
18
#include <xen/domain.h>
19
#include <xen/smp.h>
20
#include <xen/delay.h>
21
#include <xen/softirq.h>
22
#include <xen/grant_table.h>
23
#include <xen/iocap.h>
24
#include <xen/kernel.h>
25
#include <xen/hypercall.h>
26
#include <xen/multicall.h>
27
#include <xen/irq.h>
28
#include <xen/event.h>
29
#include <xen/console.h>
30
#include <xen/percpu.h>
31
#include <xen/compat.h>
32
#include <xen/acpi.h>
33
#include <xen/pci.h>
34
#include <xen/paging.h>
35
#include <xen/cpu.h>
36
#include <xen/wait.h>
37
#include <xen/guest_access.h>
38
#include <xen/livepatch.h>
39
#include <public/sysctl.h>
40
#include <public/hvm/hvm_vcpu.h>
41
#include <asm/regs.h>
42
#include <asm/mc146818rtc.h>
43
#include <asm/system.h>
44
#include <asm/io.h>
45
#include <asm/processor.h>
46
#include <asm/desc.h>
47
#include <asm/i387.h>
48
#include <asm/xstate.h>
49
#include <asm/cpuidle.h>
50
#include <asm/mpspec.h>
51
#include <asm/ldt.h>
52
#include <asm/hvm/hvm.h>
53
#include <asm/hvm/nestedhvm.h>
54
#include <asm/hvm/support.h>
55
#include <asm/hvm/viridian.h>
56
#include <asm/debugreg.h>
57
#include <asm/msr.h>
58
#include <asm/traps.h>
59
#include <asm/nmi.h>
60
#include <asm/mce.h>
61
#include <asm/amd.h>
62
#include <xen/numa.h>
63
#include <xen/iommu.h>
64
#include <compat/vcpu.h>
65
#include <asm/psr.h>
66
#include <asm/pv/domain.h>
67
#include <asm/pv/mm.h>
68
69
DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
70
71
static void default_idle(void);
72
void (*pm_idle) (void) __read_mostly = default_idle;
73
void (*dead_idle) (void) __read_mostly = default_dead_idle;
74
75
static void default_idle(void)
76
0
{
77
0
    local_irq_disable();
78
0
    if ( cpu_is_haltable(smp_processor_id()) )
79
0
        safe_halt();
80
0
    else
81
0
        local_irq_enable();
82
0
}
83
84
void default_dead_idle(void)
85
0
{
86
0
    /*
87
0
     * When going into S3, without flushing caches, modified data may be
88
0
     * held by the CPUs spinning here indefinitely, and get discarded by
89
0
     * a subsequent INIT.
90
0
     */
91
0
    wbinvd();
92
0
    for ( ; ; )
93
0
        halt();
94
0
}
95
96
static void play_dead(void)
97
0
{
98
0
    local_irq_disable();
99
0
100
0
    /*
101
0
     * NOTE: After cpu_exit_clear, per-cpu variables are no longer accessible,
102
0
     * as they may be freed at any time. In this case, heap corruption or
103
0
     * #PF can occur (when heap debugging is enabled). For example, even
104
0
     * printk() can involve tasklet scheduling, which touches per-cpu vars.
105
0
     * 
106
0
     * Consider very carefully when adding code to *dead_idle. Most hypervisor
107
0
     * subsystems are unsafe to call.
108
0
     */
109
0
    cpu_exit_clear(smp_processor_id());
110
0
111
0
    (*dead_idle)();
112
0
}
113
114
static void idle_loop(void)
115
64.8k
{
116
64.8k
    unsigned int cpu = smp_processor_id();
117
64.8k
118
64.8k
    for ( ; ; )
119
1.87M
    {
120
1.87M
        if ( cpu_is_offline(cpu) )
121
0
            play_dead();
122
1.87M
123
1.87M
        /* Are we here for running vcpu context tasklets, or for idling? */
124
1.87M
        if ( unlikely(tasklet_work_to_do(cpu)) )
125
44
            do_tasklet();
126
1.87M
        /*
127
1.87M
         * Test softirqs twice --- first to see if should even try scrubbing
128
1.87M
         * and then, after it is done, whether softirqs became pending
129
1.87M
         * while we were scrubbing.
130
1.87M
         */
131
1.95M
        else if ( !softirq_pending(cpu) && !scrub_free_pages()  &&
132
2.11M
                    !softirq_pending(cpu) )
133
2.11M
            pm_idle();
134
1.87M
        do_softirq();
135
1.87M
        /*
136
1.87M
         * We MUST be last (or before pm_idle). Otherwise after we get the
137
1.87M
         * softirq we would execute pm_idle (and sleep) and not patch.
138
1.87M
         */
139
1.87M
        check_for_livepatch_work();
140
1.87M
    }
141
64.8k
}
142
143
void startup_cpu_idle_loop(void)
144
12
{
145
12
    struct vcpu *v = current;
146
12
147
12
    ASSERT(is_idle_vcpu(v));
148
12
    cpumask_set_cpu(v->processor, v->domain->domain_dirty_cpumask);
149
12
    cpumask_set_cpu(v->processor, v->vcpu_dirty_cpumask);
150
12
151
12
    reset_stack_and_jump(idle_loop);
152
12
}
153
154
static void noreturn continue_idle_domain(struct vcpu *v)
155
64.8k
{
156
64.8k
    reset_stack_and_jump(idle_loop);
157
64.8k
}
158
159
void dump_pageframe_info(struct domain *d)
160
0
{
161
0
    struct page_info *page;
162
0
163
0
    printk("Memory pages belonging to domain %u:\n", d->domain_id);
164
0
165
0
    if ( d->tot_pages >= 10 && d->is_dying < DOMDYING_dead )
166
0
    {
167
0
        printk("    DomPage list too long to display\n");
168
0
    }
169
0
    else
170
0
    {
171
0
        unsigned long total[MASK_EXTR(PGT_type_mask, PGT_type_mask) + 1] = {};
172
0
173
0
        spin_lock(&d->page_alloc_lock);
174
0
        page_list_for_each ( page, &d->page_list )
175
0
        {
176
0
            unsigned int index = MASK_EXTR(page->u.inuse.type_info,
177
0
                                           PGT_type_mask);
178
0
179
0
            if ( ++total[index] > 16 )
180
0
            {
181
0
                switch ( page->u.inuse.type_info & PGT_type_mask )
182
0
                {
183
0
                case PGT_none:
184
0
                case PGT_writable_page:
185
0
                    continue;
186
0
                }
187
0
            }
188
0
            printk("    DomPage %p: caf=%08lx, taf=%" PRtype_info "\n",
189
0
                   _p(page_to_mfn(page)),
190
0
                   page->count_info, page->u.inuse.type_info);
191
0
        }
192
0
        spin_unlock(&d->page_alloc_lock);
193
0
    }
194
0
195
0
    if ( is_hvm_domain(d) )
196
0
        p2m_pod_dump_data(d);
197
0
198
0
    spin_lock(&d->page_alloc_lock);
199
0
    page_list_for_each ( page, &d->xenpage_list )
200
0
    {
201
0
        printk("    XenPage %p: caf=%08lx, taf=%" PRtype_info "\n",
202
0
               _p(page_to_mfn(page)),
203
0
               page->count_info, page->u.inuse.type_info);
204
0
    }
205
0
    spin_unlock(&d->page_alloc_lock);
206
0
}
207
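
The total[] accounting in dump_pageframe_info() above relies on the MASK_EXTR() idiom: isolate the bit field selected by a mask, then shift it down to bit 0 by dividing by the mask's lowest set bit. A minimal standalone sketch of that idiom follows; the field position used for PGT_TYPE_MASK is a made-up placeholder, not Xen's real page-type layout.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Extract the field selected by mask m and shift it down to bit 0. */
#define MASK_EXTR(v, m) (((v) & (m)) / ((m) & -(m)))

/* Hypothetical field position, for illustration only. */
#define PGT_TYPE_MASK (UINT64_C(0xf) << 60)

int main(void)
{
    uint64_t type_info = (UINT64_C(0x7) << 60) | 0x123;

    /* The extracted value (7 here) is what would index the total[] array. */
    printf("type index: %" PRIu64 "\n", MASK_EXTR(type_info, PGT_TYPE_MASK));
    return 0;
}
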
208
void update_guest_memory_policy(struct vcpu *v,
209
                                struct guest_memory_policy *policy)
210
0
{
211
0
    smap_check_policy_t old_smap_policy = v->arch.smap_check_policy;
212
0
    bool old_guest_mode = nestedhvm_is_n2(v);
213
0
    bool new_guest_mode = policy->nested_guest_mode;
214
0
215
0
    v->arch.smap_check_policy = policy->smap_policy;
216
0
    policy->smap_policy = old_smap_policy;
217
0
218
0
    /*
219
0
     * When 'v' is in the nested guest mode, all guest copy
220
0
     * functions/macros which finally call paging_gva_to_gfn()
221
0
     * transfer data to/from L2 guest. If the copy is intended for L1
222
0
     * guest, we must first clear the nested guest flag (by setting
223
0
     * policy->nested_guest_mode to false) before the copy and then
224
0
     * restore the nested guest flag (by setting
225
0
     * policy->nested_guest_mode to true) after the copy.
226
0
     */
227
0
    if ( unlikely(old_guest_mode != new_guest_mode) )
228
0
    {
229
0
        if ( new_guest_mode )
230
0
            nestedhvm_vcpu_enter_guestmode(v);
231
0
        else
232
0
            nestedhvm_vcpu_exit_guestmode(v);
233
0
        policy->nested_guest_mode = old_guest_mode;
234
0
    }
235
0
}
236
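
update_guest_memory_policy() above is written as a swap: it installs the caller's policy and leaves the previous settings behind in the same structure, so a second call with that structure restores the original state (update_runstate_area() further down uses it exactly this way, once before and once after the guest copy). A condensed, hypothetical user-space sketch of that pattern, with invented field names:

#include <stdbool.h>
#include <stdio.h>

struct policy { int smap_policy; bool nested_guest_mode; };
struct vcpu_state { int smap_policy; bool nested_guest_mode; };

static void update_policy(struct vcpu_state *v, struct policy *p)
{
    int old_smap = v->smap_policy;
    bool old_nested = v->nested_guest_mode;

    /* Install the requested policy... */
    v->smap_policy = p->smap_policy;
    v->nested_guest_mode = p->nested_guest_mode;

    /* ...and leave the previous one behind so a second call restores it. */
    p->smap_policy = old_smap;
    p->nested_guest_mode = old_nested;
}

int main(void)
{
    struct vcpu_state v = { .smap_policy = 1, .nested_guest_mode = true };
    struct policy p = { .smap_policy = 0, .nested_guest_mode = false };

    update_policy(&v, &p);   /* apply the temporary policy */
    update_policy(&v, &p);   /* restore the original settings */
    printf("restored: smap=%d nested=%d\n",
           v.smap_policy, (int)v.nested_guest_mode);
    return 0;
}
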
237
#ifndef CONFIG_BIGMEM
238
/*
239
 * The hole may be at or above the 44-bit boundary, so we need to determine
240
 * the total bit count until reaching 32 significant (not squashed out) bits
241
 * in PFN representations.
242
 * Note that the way "bits" gets initialized/updated/bounds-checked guarantees
243
 * that the function will never return zero, and hence will never be called
244
 * more than once (which is important due to it being deliberately placed in
245
 * .init.text).
246
 */
247
static unsigned int __init noinline _domain_struct_bits(void)
248
1
{
249
1
    unsigned int bits = 32 + PAGE_SHIFT;
250
1
    unsigned int sig = hweight32(~pfn_hole_mask);
251
1
    unsigned int mask = pfn_hole_mask >> 32;
252
1
253
1
    for ( ; bits < BITS_PER_LONG && sig < 32; ++bits, mask >>= 1 )
254
0
        if ( !(mask & 1) )
255
0
            ++sig;
256
1
257
1
    return bits;
258
1
}
259
#endif
260
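
A standalone analogue of _domain_struct_bits() above, for stepping through the bit counting: start at 32 + PAGE_SHIFT address bits and keep widening until 32 significant (not squashed-out) PFN bits are available. The hole value in main() is invented for illustration and does not correspond to a real machine.

#include <stdint.h>
#include <stdio.h>

static unsigned int popcount32(uint32_t x)
{
    unsigned int n = 0;

    for ( ; x; x &= x - 1 )
        ++n;
    return n;
}

static unsigned int domain_struct_bits(uint64_t pfn_hole_mask,
                                       unsigned int page_shift,
                                       unsigned int bits_per_long)
{
    unsigned int bits = 32 + page_shift;
    unsigned int sig = popcount32(~(uint32_t)pfn_hole_mask);
    uint64_t mask = pfn_hole_mask >> 32;

    for ( ; bits < bits_per_long && sig < 32; ++bits, mask >>= 1 )
        if ( !(mask & 1) )
            ++sig;

    return bits;
}

int main(void)
{
    /* Hypothetical hole squashing out PFN bits 28-31. */
    uint64_t hole = UINT64_C(0xf) << 28;

    /* 4 of the low 32 PFN bits are in the hole, so 4 extra bits are needed:
     * prints 48 for a 12-bit page shift on a 64-bit build. */
    printf("usable width: %u bits\n", domain_struct_bits(hole, 12, 64));
    return 0;
}
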
261
struct domain *alloc_domain_struct(void)
262
5
{
263
5
    struct domain *d;
264
5
    unsigned int order = get_order_from_bytes(sizeof(*d));
265
5
#ifdef CONFIG_BIGMEM
266
    const unsigned int bits = 0;
267
#else
268
5
    /*
269
5
     * We pack the PDX of the domain structure into a 32-bit field within
270
5
     * the page_info structure. Hence the MEMF_bits() restriction.
271
5
     */
272
5
    static unsigned int __read_mostly bits;
273
5
274
5
    if ( unlikely(!bits) )
275
1
         bits = _domain_struct_bits();
276
5
#endif
277
5
278
5
279
5
#ifndef CONFIG_LOCK_PROFILE
280
5
    BUILD_BUG_ON(sizeof(*d) > PAGE_SIZE);
281
5
#endif
282
5
    d = alloc_xenheap_pages(order, MEMF_bits(bits));
283
5
    if ( d != NULL )
284
5
    {
285
5
        unsigned int sz;
286
5
287
10
        for ( sz = 0; sz < (PAGE_SIZE << order); sz += PAGE_SIZE )
288
5
            clear_page((void *)d + sz);
289
5
    }
290
5
    return d;
291
5
}
292
293
void free_domain_struct(struct domain *d)
294
0
{
295
0
    lock_profile_deregister_struct(LOCKPROF_TYPE_PERDOM, d);
296
0
    free_xenheap_page(d);
297
0
}
298
299
struct vcpu *alloc_vcpu_struct(void)
300
24
{
301
24
    struct vcpu *v;
302
24
    /*
303
24
     * This structure contains embedded PAE PDPTEs, used when an HVM guest
304
24
     * runs on shadow pagetables outside of 64-bit mode. In this case the CPU
305
24
     * may require that the shadow CR3 points below 4GB, and hence the whole
306
24
     * structure must satisfy this restriction. Thus we specify MEMF_bits(32).
307
24
     */
308
24
    BUILD_BUG_ON(sizeof(*v) > PAGE_SIZE);
309
24
    v = alloc_xenheap_pages(0, MEMF_bits(32));
310
24
    if ( v != NULL )
311
24
        clear_page(v);
312
24
    return v;
313
24
}
314
315
void free_vcpu_struct(struct vcpu *v)
316
0
{
317
0
    free_xenheap_page(v);
318
0
}
319
320
int vcpu_initialise(struct vcpu *v)
321
24
{
322
24
    struct domain *d = v->domain;
323
24
    int rc;
324
24
325
24
    v->arch.flags = TF_kernel_mode;
326
24
327
24
    rc = mapcache_vcpu_init(v);
328
24
    if ( rc )
329
0
        return rc;
330
24
331
24
    if ( !is_idle_domain(d) )
332
12
    {
333
12
        paging_vcpu_init(v);
334
12
335
12
        if ( (rc = vcpu_init_fpu(v)) != 0 )
336
0
            return rc;
337
12
338
12
        vmce_init_vcpu(v);
339
12
    }
340
12
    else if ( (rc = xstate_alloc_save_area(v)) != 0 )
341
0
        return rc;
342
24
343
24
    spin_lock_init(&v->arch.vpmu.vpmu_lock);
344
24
345
24
    if ( is_hvm_domain(d) )
346
12
        rc = hvm_vcpu_initialise(v);
347
12
    else if ( !is_idle_domain(d) )
348
0
        rc = pv_vcpu_initialise(v);
349
12
    else
350
12
    {
351
12
        /* Idle domain */
352
12
        v->arch.cr3 = __pa(idle_pg_table);
353
12
        rc = 0;
354
12
        v->arch.msr = ZERO_BLOCK_PTR; /* Catch stray misuses */
355
12
    }
356
24
357
24
    if ( rc )
358
0
        goto fail;
359
24
360
24
    if ( !is_idle_domain(v->domain) )
361
12
    {
362
12
        vpmu_initialise(v);
363
12
364
12
        if ( (rc = init_vcpu_msr_policy(v)) )
365
0
            goto fail;
366
12
    }
367
24
368
24
    return rc;
369
24
370
0
 fail:
371
0
    vcpu_destroy_fpu(v);
372
0
    xfree(v->arch.msr);
373
0
    v->arch.msr = NULL;
374
0
375
0
    return rc;
376
24
}
377
378
void vcpu_destroy(struct vcpu *v)
379
0
{
380
0
    xfree(v->arch.vm_event);
381
0
    v->arch.vm_event = NULL;
382
0
383
0
    vcpu_destroy_fpu(v);
384
0
385
0
    if ( !is_idle_domain(v->domain) )
386
0
        vpmu_destroy(v);
387
0
388
0
    if ( is_hvm_vcpu(v) )
389
0
        hvm_vcpu_destroy(v);
390
0
    else
391
0
        pv_vcpu_destroy(v);
392
0
}
393
394
static bool emulation_flags_ok(const struct domain *d, uint32_t emflags)
395
1
{
396
1
397
1
    if ( is_hvm_domain(d) )
398
1
    {
399
1
        if ( is_hardware_domain(d) &&
400
1
             emflags != (XEN_X86_EMU_LAPIC|XEN_X86_EMU_IOAPIC|
401
1
                         XEN_X86_EMU_VPCI) )
402
0
            return false;
403
1
        if ( !is_hardware_domain(d) )
404
0
        {
405
0
            switch ( emflags )
406
0
            {
407
0
            case XEN_X86_EMU_ALL & ~XEN_X86_EMU_VPCI:
408
0
            case XEN_X86_EMU_LAPIC:
409
0
            case 0:
410
0
                break;
411
0
            default:
412
0
                return false;
413
0
            }
414
0
        }
415
1
    }
416
0
    else if ( emflags != 0 && emflags != XEN_X86_EMU_PIT )
417
0
    {
418
0
        /* PV or classic PVH. */
419
0
        return false;
420
0
    }
421
1
422
1
    return true;
423
1
}
424
425
int arch_domain_create(struct domain *d, unsigned int domcr_flags,
426
                       struct xen_arch_domainconfig *config)
427
2
{
428
2
    bool paging_initialised = false;
429
2
    int rc;
430
2
431
2
    if ( config == NULL && !is_idle_domain(d) )
432
0
        return -EINVAL;
433
2
434
2
    d->arch.s3_integrity = !!(domcr_flags & DOMCRF_s3_integrity);
435
2
436
2
    INIT_LIST_HEAD(&d->arch.pdev_list);
437
2
438
2
    d->arch.relmem = RELMEM_not_started;
439
2
    INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
440
2
441
2
    if ( d->domain_id && !is_idle_domain(d) &&
442
0
         cpu_has_amd_erratum(&boot_cpu_data, AMD_ERRATUM_121) )
443
0
    {
444
0
        if ( !opt_allow_unsafe )
445
0
        {
446
0
            printk(XENLOG_G_ERR "Xen does not allow DomU creation on this CPU"
447
0
                   " for security reasons.\n");
448
0
            return -EPERM;
449
0
        }
450
0
        printk(XENLOG_G_WARNING
451
0
               "Dom%d may compromise security on this CPU.\n",
452
0
               d->domain_id);
453
0
    }
454
2
455
2
    if ( is_idle_domain(d) )
456
1
    {
457
1
        d->arch.emulation_flags = 0;
458
1
        d->arch.cpuid = ZERO_BLOCK_PTR; /* Catch stray misuses. */
459
1
        d->arch.msr = ZERO_BLOCK_PTR;
460
1
    }
461
2
    else
462
1
    {
463
1
        uint32_t emflags;
464
1
465
1
        if ( is_hardware_domain(d) && is_pv_domain(d) )
466
0
            config->emulation_flags |= XEN_X86_EMU_PIT;
467
1
468
1
        emflags = config->emulation_flags;
469
1
        if ( emflags & ~XEN_X86_EMU_ALL )
470
0
        {
471
0
            printk(XENLOG_G_ERR "d%d: Invalid emulation bitmap: %#x\n",
472
0
                   d->domain_id, emflags);
473
0
            return -EINVAL;
474
0
        }
475
1
476
1
        if ( !emulation_flags_ok(d, emflags) )
477
0
        {
478
0
            printk(XENLOG_G_ERR "d%d: Xen does not allow %s domain creation "
479
0
                   "with the current selection of emulators: %#x\n",
480
0
                   d->domain_id, is_hvm_domain(d) ? "HVM" : "PV", emflags);
481
0
            return -EOPNOTSUPP;
482
0
        }
483
1
        d->arch.emulation_flags = emflags;
484
1
    }
485
2
486
2
    mapcache_domain_init(d);
487
2
488
2
    HYPERVISOR_COMPAT_VIRT_START(d) =
489
2
        is_pv_domain(d) ? __HYPERVISOR_COMPAT_VIRT_START : ~0u;
490
2
491
2
    if ( !is_idle_domain(d) )
492
1
    {
493
1
        /* Need to determine if HAP is enabled before initialising paging */
494
1
        if ( is_hvm_domain(d) )
495
1
            d->arch.hvm_domain.hap_enabled =
496
1
                hvm_funcs.hap_supported && (domcr_flags & DOMCRF_hap);
497
1
498
1
        if ( (rc = paging_domain_init(d, domcr_flags)) != 0 )
499
0
            goto fail;
500
1
        paging_initialised = 1;
501
1
502
1
        if ( (rc = init_domain_cpuid_policy(d)) )
503
0
            goto fail;
504
1
505
1
        if ( (rc = init_domain_msr_policy(d)) )
506
0
            goto fail;
507
1
508
1
        d->arch.ioport_caps = 
509
1
            rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
510
1
        rc = -ENOMEM;
511
1
        if ( d->arch.ioport_caps == NULL )
512
0
            goto fail;
513
1
514
1
        /*
515
1
         * The shared_info machine address must fit in a 32-bit field within a
516
1
         * 32-bit guest's start_info structure. Hence we specify MEMF_bits(32).
517
1
         */
518
1
        if ( (d->shared_info = alloc_xenheap_pages(0, MEMF_bits(32))) == NULL )
519
0
            goto fail;
520
1
521
1
        clear_page(d->shared_info);
522
1
        share_xen_page_with_guest(
523
1
            virt_to_page(d->shared_info), d, XENSHARE_writable);
524
1
525
1
        if ( (rc = init_domain_irq_mapping(d)) != 0 )
526
0
            goto fail;
527
1
528
1
        if ( (rc = iommu_domain_init(d)) != 0 )
529
0
            goto fail;
530
1
    }
531
2
    spin_lock_init(&d->arch.e820_lock);
532
2
533
2
    psr_domain_init(d);
534
2
535
2
    if ( is_hvm_domain(d) )
536
1
    {
537
1
        if ( (rc = hvm_domain_initialise(d, domcr_flags, config)) != 0 )
538
0
            goto fail;
539
1
    }
540
1
    else if ( is_idle_domain(d) )
541
1
    {
542
1
        static const struct arch_csw idle_csw = {
543
1
            .from = paravirt_ctxt_switch_from,
544
1
            .to   = paravirt_ctxt_switch_to,
545
1
            .tail = continue_idle_domain,
546
1
        };
547
1
548
1
        d->arch.ctxt_switch = &idle_csw;
549
1
    }
550
1
    else
551
0
    {
552
0
        if ( (rc = pv_domain_initialise(d, domcr_flags, config)) != 0 )
553
0
            goto fail;
554
0
    }
555
2
556
2
    /* initialize default tsc behavior in case tools don't */
557
2
    tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0);
558
2
    spin_lock_init(&d->arch.vtsc_lock);
559
2
560
2
    /* PV/PVH guests get an emulated PIT too for video BIOSes to use. */
561
2
    pit_init(d, cpu_khz);
562
2
563
2
    /*
564
2
     * If the FPU does not save FCS/FDS then we can always
565
2
     * save/restore the 64-bit FIP/FDP and ignore the selectors.
566
2
     */
567
2
    d->arch.x87_fip_width = cpu_has_fpu_sel ? 0 : 8;
568
2
569
2
    return 0;
570
2
571
0
 fail:
572
0
    d->is_dying = DOMDYING_dead;
573
0
    psr_domain_free(d);
574
0
    iommu_domain_destroy(d);
575
0
    cleanup_domain_irq_mapping(d);
576
0
    free_xenheap_page(d->shared_info);
577
0
    xfree(d->arch.cpuid);
578
0
    xfree(d->arch.msr);
579
0
    if ( paging_initialised )
580
0
        paging_final_teardown(d);
581
0
    free_perdomain_mappings(d);
582
0
583
0
    return rc;
584
2
}
585
586
void arch_domain_destroy(struct domain *d)
587
0
{
588
0
    if ( is_hvm_domain(d) )
589
0
        hvm_domain_destroy(d);
590
0
591
0
    xfree(d->arch.e820);
592
0
    xfree(d->arch.cpuid);
593
0
    xfree(d->arch.msr);
594
0
595
0
    free_domain_pirqs(d);
596
0
    if ( !is_idle_domain(d) )
597
0
        iommu_domain_destroy(d);
598
0
599
0
    paging_final_teardown(d);
600
0
601
0
    if ( is_pv_domain(d) )
602
0
        pv_domain_destroy(d);
603
0
    free_perdomain_mappings(d);
604
0
605
0
    free_xenheap_page(d->shared_info);
606
0
    cleanup_domain_irq_mapping(d);
607
0
608
0
    psr_domain_free(d);
609
0
}
610
611
void arch_domain_shutdown(struct domain *d)
612
0
{
613
0
    if ( has_viridian_time_ref_count(d) )
614
0
        viridian_time_ref_count_freeze(d);
615
0
}
616
617
void arch_domain_pause(struct domain *d)
618
1
{
619
1
    if ( has_viridian_time_ref_count(d) )
620
0
        viridian_time_ref_count_freeze(d);
621
1
}
622
623
void arch_domain_unpause(struct domain *d)
624
2
{
625
2
    if ( has_viridian_time_ref_count(d) )
626
0
        viridian_time_ref_count_thaw(d);
627
2
}
628
629
int arch_domain_soft_reset(struct domain *d)
630
0
{
631
0
    struct page_info *page = virt_to_page(d->shared_info), *new_page;
632
0
    int ret = 0;
633
0
    struct domain *owner;
634
0
    unsigned long mfn, gfn;
635
0
    p2m_type_t p2mt;
636
0
    unsigned int i;
637
0
638
0
    /* Soft reset is supported for HVM domains only. */
639
0
    if ( !is_hvm_domain(d) )
640
0
        return -EINVAL;
641
0
642
0
    hvm_domain_soft_reset(d);
643
0
644
0
    spin_lock(&d->event_lock);
645
0
    for ( i = 0; i < d->nr_pirqs ; i++ )
646
0
    {
647
0
        if ( domain_pirq_to_emuirq(d, i) != IRQ_UNBOUND )
648
0
        {
649
0
            ret = unmap_domain_pirq_emuirq(d, i);
650
0
            if ( ret )
651
0
                break;
652
0
        }
653
0
    }
654
0
    spin_unlock(&d->event_lock);
655
0
656
0
    if ( ret )
657
0
        return ret;
658
0
659
0
    /*
660
0
     * The shared_info page needs to be replaced with a new page, otherwise we
661
0
     * will get a hole if the domain does XENMAPSPACE_shared_info.
662
0
     */
663
0
664
0
    owner = page_get_owner_and_reference(page);
665
0
    ASSERT( owner == d );
666
0
667
0
    mfn = page_to_mfn(page);
668
0
    gfn = mfn_to_gmfn(d, mfn);
669
0
670
0
    /*
671
0
     * gfn == INVALID_GFN indicates that the shared_info page was never mapped
672
0
     * to the domain's address space and there is nothing to replace.
673
0
     */
674
0
    if ( gfn == gfn_x(INVALID_GFN) )
675
0
        goto exit_put_page;
676
0
677
0
    if ( mfn_x(get_gfn_query(d, gfn, &p2mt)) != mfn )
678
0
    {
679
0
        printk(XENLOG_G_ERR "Failed to get Dom%d's shared_info GFN (%lx)\n",
680
0
               d->domain_id, gfn);
681
0
        ret = -EINVAL;
682
0
        goto exit_put_page;
683
0
    }
684
0
685
0
    new_page = alloc_domheap_page(d, 0);
686
0
    if ( !new_page )
687
0
    {
688
0
        printk(XENLOG_G_ERR "Failed to alloc a page to replace"
689
0
               " Dom%d's shared_info frame %lx\n", d->domain_id, gfn);
690
0
        ret = -ENOMEM;
691
0
        goto exit_put_gfn;
692
0
    }
693
0
694
0
    ret = guest_physmap_remove_page(d, _gfn(gfn), _mfn(mfn), PAGE_ORDER_4K);
695
0
    if ( ret )
696
0
    {
697
0
        printk(XENLOG_G_ERR "Failed to remove Dom%d's shared_info frame %lx\n",
698
0
               d->domain_id, gfn);
699
0
        free_domheap_page(new_page);
700
0
        goto exit_put_gfn;
701
0
    }
702
0
703
0
    ret = guest_physmap_add_page(d, _gfn(gfn), _mfn(page_to_mfn(new_page)),
704
0
                                 PAGE_ORDER_4K);
705
0
    if ( ret )
706
0
    {
707
0
        printk(XENLOG_G_ERR "Failed to add a page to replace"
708
0
               " Dom%d's shared_info frame %lx\n", d->domain_id, gfn);
709
0
        free_domheap_page(new_page);
710
0
    }
711
0
 exit_put_gfn:
712
0
    put_gfn(d, gfn);
713
0
 exit_put_page:
714
0
    put_page(page);
715
0
716
0
    return ret;
717
0
}
718
719
/*
720
 * These are the masks of CR4 bits (subject to hardware availability) which a
721
 * PV guest may not legitimately attempt to modify.
722
 */
723
static unsigned long __read_mostly pv_cr4_mask, compat_pv_cr4_mask;
724
725
static int __init init_pv_cr4_masks(void)
726
1
{
727
1
    unsigned long common_mask = ~X86_CR4_TSD;
728
1
729
1
    /*
730
1
     * All PV guests may attempt to modify TSD, DE and OSXSAVE.
731
1
     */
732
1
    if ( cpu_has_de )
733
1
        common_mask &= ~X86_CR4_DE;
734
1
    if ( cpu_has_xsave )
735
1
        common_mask &= ~X86_CR4_OSXSAVE;
736
1
737
1
    pv_cr4_mask = compat_pv_cr4_mask = common_mask;
738
1
739
1
    /*
740
1
     * 64bit PV guests may attempt to modify FSGSBASE.
741
1
     */
742
1
    if ( cpu_has_fsgsbase )
743
1
        pv_cr4_mask &= ~X86_CR4_FSGSBASE;
744
1
745
1
    return 0;
746
1
}
747
__initcall(init_pv_cr4_masks);
748
749
unsigned long pv_guest_cr4_fixup(const struct vcpu *v, unsigned long guest_cr4)
750
0
{
751
0
    unsigned long hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4());
752
0
    unsigned long mask = is_pv_32bit_vcpu(v) ? compat_pv_cr4_mask : pv_cr4_mask;
753
0
754
0
    if ( (guest_cr4 & mask) != (hv_cr4 & mask) )
755
0
        printk(XENLOG_G_WARNING
756
0
               "d%d attempted to change %pv's CR4 flags %08lx -> %08lx\n",
757
0
               current->domain->domain_id, v, hv_cr4, guest_cr4);
758
0
759
0
    return (hv_cr4 & mask) | (guest_cr4 & ~mask);
760
0
}
761
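
The return expression in pv_guest_cr4_fixup() above merges two CR4 images: bits covered by the mask are forced to the hypervisor's current values, while bits outside the mask are taken from the guest's request. A tiny sketch of that merge, using invented values rather than the real X86_CR4_* constants:

#include <stdint.h>
#include <stdio.h>

static uint64_t merge_cr4(uint64_t hv_cr4, uint64_t guest_cr4, uint64_t mask)
{
    /* Masked bits come from the hypervisor, the rest from the guest. */
    return (hv_cr4 & mask) | (guest_cr4 & ~mask);
}

int main(void)
{
    uint64_t hv = 0x26f0;             /* hypothetical current CR4 */
    uint64_t guest = 0x2ef4;          /* guest asks for extra bits */
    uint64_t mask = ~UINT64_C(0xc);   /* say only bits 2-3 are guest-modifiable */

    /* Prints 0x26f4: the guest keeps its bit 2, everything else is the
     * hypervisor's view. */
    printf("effective CR4: %#llx\n",
           (unsigned long long)merge_cr4(hv, guest, mask));
    return 0;
}
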
762
#define xen_vcpu_guest_context vcpu_guest_context
763
#define fpu_ctxt fpu_ctxt.x
764
CHECK_FIELD_(struct, vcpu_guest_context, fpu_ctxt);
765
#undef fpu_ctxt
766
#undef xen_vcpu_guest_context
767
768
/* Called by XEN_DOMCTL_setvcpucontext and VCPUOP_initialise. */
769
int arch_set_info_guest(
770
    struct vcpu *v, vcpu_guest_context_u c)
771
0
{
772
0
    struct domain *d = v->domain;
773
0
    unsigned long cr3_gfn;
774
0
    struct page_info *cr3_page;
775
0
    unsigned long flags, cr4;
776
0
    unsigned int i;
777
0
    int rc = 0, compat;
778
0
779
0
    /* The context is a compat-mode one if the target domain is compat-mode;
780
0
     * we expect the tools to DTRT even in compat-mode callers. */
781
0
    compat = is_pv_32bit_domain(d);
782
0
783
0
#define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
784
0
    flags = c(flags);
785
0
786
0
    if ( is_pv_domain(d) )
787
0
    {
788
0
        if ( !compat )
789
0
        {
790
0
            if ( !is_canonical_address(c.nat->user_regs.rip) ||
791
0
                 !is_canonical_address(c.nat->user_regs.rsp) ||
792
0
                 !is_canonical_address(c.nat->kernel_sp) ||
793
0
                 (c.nat->ldt_ents && !is_canonical_address(c.nat->ldt_base)) ||
794
0
                 !is_canonical_address(c.nat->fs_base) ||
795
0
                 !is_canonical_address(c.nat->gs_base_kernel) ||
796
0
                 !is_canonical_address(c.nat->gs_base_user) ||
797
0
                 !is_canonical_address(c.nat->event_callback_eip) ||
798
0
                 !is_canonical_address(c.nat->syscall_callback_eip) ||
799
0
                 !is_canonical_address(c.nat->failsafe_callback_eip) )
800
0
                return -EINVAL;
801
0
802
0
            fixup_guest_stack_selector(d, c.nat->user_regs.ss);
803
0
            fixup_guest_stack_selector(d, c.nat->kernel_ss);
804
0
            fixup_guest_code_selector(d, c.nat->user_regs.cs);
805
0
806
0
            for ( i = 0; i < ARRAY_SIZE(c.nat->trap_ctxt); i++ )
807
0
            {
808
0
                if ( !is_canonical_address(c.nat->trap_ctxt[i].address) )
809
0
                    return -EINVAL;
810
0
                fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
811
0
            }
812
0
813
0
            if ( !__addr_ok(c.nat->ldt_base) )
814
0
                return -EINVAL;
815
0
        }
816
0
        else
817
0
        {
818
0
            fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
819
0
            fixup_guest_stack_selector(d, c.cmp->kernel_ss);
820
0
            fixup_guest_code_selector(d, c.cmp->user_regs.cs);
821
0
            fixup_guest_code_selector(d, c.cmp->event_callback_cs);
822
0
            fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
823
0
824
0
            for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); i++ )
825
0
                fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
826
0
        }
827
0
828
0
        /* LDT safety checks. */
829
0
        if ( ((c(ldt_base) & (PAGE_SIZE - 1)) != 0) ||
830
0
             (c(ldt_ents) > 8192) )
831
0
            return -EINVAL;
832
0
    }
833
0
834
0
    v->fpu_initialised = !!(flags & VGCF_I387_VALID);
835
0
836
0
    v->arch.flags &= ~TF_kernel_mode;
837
0
    if ( (flags & VGCF_in_kernel) || is_hvm_domain(d)/*???*/ )
838
0
        v->arch.flags |= TF_kernel_mode;
839
0
840
0
    v->arch.vgc_flags = flags;
841
0
842
0
    if ( flags & VGCF_I387_VALID )
843
0
    {
844
0
        memcpy(v->arch.fpu_ctxt, &c.nat->fpu_ctxt, sizeof(c.nat->fpu_ctxt));
845
0
        if ( v->arch.xsave_area )
846
0
            v->arch.xsave_area->xsave_hdr.xstate_bv = XSTATE_FP_SSE;
847
0
    }
848
0
    else if ( v->arch.xsave_area )
849
0
    {
850
0
        v->arch.xsave_area->xsave_hdr.xstate_bv = 0;
851
0
        v->arch.xsave_area->fpu_sse.mxcsr = MXCSR_DEFAULT;
852
0
    }
853
0
    else
854
0
    {
855
0
        typeof(v->arch.xsave_area->fpu_sse) *fpu_sse = v->arch.fpu_ctxt;
856
0
857
0
        memset(fpu_sse, 0, sizeof(*fpu_sse));
858
0
        fpu_sse->fcw = FCW_DEFAULT;
859
0
        fpu_sse->mxcsr = MXCSR_DEFAULT;
860
0
    }
861
0
    if ( v->arch.xsave_area )
862
0
        v->arch.xsave_area->xsave_hdr.xcomp_bv = 0;
863
0
864
0
    if ( !compat )
865
0
    {
866
0
        memcpy(&v->arch.user_regs, &c.nat->user_regs, sizeof(c.nat->user_regs));
867
0
        if ( is_pv_domain(d) )
868
0
            memcpy(v->arch.pv_vcpu.trap_ctxt, c.nat->trap_ctxt,
869
0
                   sizeof(c.nat->trap_ctxt));
870
0
    }
871
0
    else
872
0
    {
873
0
        XLAT_cpu_user_regs(&v->arch.user_regs, &c.cmp->user_regs);
874
0
        if ( is_pv_domain(d) )
875
0
        {
876
0
            for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); ++i )
877
0
                XLAT_trap_info(v->arch.pv_vcpu.trap_ctxt + i,
878
0
                               c.cmp->trap_ctxt + i);
879
0
        }
880
0
    }
881
0
882
0
    if ( is_hvm_domain(d) )
883
0
    {
884
0
        for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i )
885
0
            v->arch.debugreg[i] = c(debugreg[i]);
886
0
887
0
        hvm_set_info_guest(v);
888
0
        goto out;
889
0
    }
890
0
891
0
    init_int80_direct_trap(v);
892
0
893
0
    /* IOPL privileges are virtualised. */
894
0
    v->arch.pv_vcpu.iopl = v->arch.user_regs.eflags & X86_EFLAGS_IOPL;
895
0
    v->arch.user_regs.eflags &= ~X86_EFLAGS_IOPL;
896
0
897
0
    /* Ensure real hardware interrupts are enabled. */
898
0
    v->arch.user_regs.eflags |= X86_EFLAGS_IF;
899
0
900
0
    if ( !v->is_initialised )
901
0
    {
902
0
        if ( !compat && !(flags & VGCF_in_kernel) && !c.nat->ctrlreg[1] )
903
0
            return -EINVAL;
904
0
905
0
        v->arch.pv_vcpu.ldt_base = c(ldt_base);
906
0
        v->arch.pv_vcpu.ldt_ents = c(ldt_ents);
907
0
    }
908
0
    else
909
0
    {
910
0
        unsigned long pfn = pagetable_get_pfn(v->arch.guest_table);
911
0
        bool fail;
912
0
913
0
        if ( !compat )
914
0
        {
915
0
            fail = xen_pfn_to_cr3(pfn) != c.nat->ctrlreg[3];
916
0
            if ( pagetable_is_null(v->arch.guest_table_user) )
917
0
                fail |= c.nat->ctrlreg[1] || !(flags & VGCF_in_kernel);
918
0
            else
919
0
            {
920
0
                pfn = pagetable_get_pfn(v->arch.guest_table_user);
921
0
                fail |= xen_pfn_to_cr3(pfn) != c.nat->ctrlreg[1];
922
0
            }
923
0
        } else {
924
0
            l4_pgentry_t *l4tab = map_domain_page(_mfn(pfn));
925
0
926
0
            pfn = l4e_get_pfn(*l4tab);
927
0
            unmap_domain_page(l4tab);
928
0
            fail = compat_pfn_to_cr3(pfn) != c.cmp->ctrlreg[3];
929
0
        }
930
0
931
0
        for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames); ++i )
932
0
            fail |= v->arch.pv_vcpu.gdt_frames[i] != c(gdt_frames[i]);
933
0
        fail |= v->arch.pv_vcpu.gdt_ents != c(gdt_ents);
934
0
935
0
        fail |= v->arch.pv_vcpu.ldt_base != c(ldt_base);
936
0
        fail |= v->arch.pv_vcpu.ldt_ents != c(ldt_ents);
937
0
938
0
        if ( fail )
939
0
           return -EOPNOTSUPP;
940
0
    }
941
0
942
0
    v->arch.pv_vcpu.kernel_ss = c(kernel_ss);
943
0
    v->arch.pv_vcpu.kernel_sp = c(kernel_sp);
944
0
    for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.ctrlreg); ++i )
945
0
        v->arch.pv_vcpu.ctrlreg[i] = c(ctrlreg[i]);
946
0
947
0
    v->arch.pv_vcpu.event_callback_eip = c(event_callback_eip);
948
0
    v->arch.pv_vcpu.failsafe_callback_eip = c(failsafe_callback_eip);
949
0
    if ( !compat )
950
0
    {
951
0
        v->arch.pv_vcpu.syscall_callback_eip = c.nat->syscall_callback_eip;
952
0
        v->arch.pv_vcpu.fs_base = c.nat->fs_base;
953
0
        v->arch.pv_vcpu.gs_base_kernel = c.nat->gs_base_kernel;
954
0
        v->arch.pv_vcpu.gs_base_user = c.nat->gs_base_user;
955
0
    }
956
0
    else
957
0
    {
958
0
        v->arch.pv_vcpu.event_callback_cs = c(event_callback_cs);
959
0
        v->arch.pv_vcpu.failsafe_callback_cs = c(failsafe_callback_cs);
960
0
    }
961
0
962
0
    /* Only CR0.TS is modifiable by guest or admin. */
963
0
    v->arch.pv_vcpu.ctrlreg[0] &= X86_CR0_TS;
964
0
    v->arch.pv_vcpu.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
965
0
966
0
    cr4 = v->arch.pv_vcpu.ctrlreg[4];
967
0
    v->arch.pv_vcpu.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(v, cr4) :
968
0
        real_cr4_to_pv_guest_cr4(mmu_cr4_features);
969
0
970
0
    memset(v->arch.debugreg, 0, sizeof(v->arch.debugreg));
971
0
    for ( i = 0; i < 8; i++ )
972
0
        (void)set_debugreg(v, i, c(debugreg[i]));
973
0
974
0
    if ( v->is_initialised )
975
0
        goto out;
976
0
977
0
    if ( v->vcpu_id == 0 )
978
0
    {
979
0
        /*
980
0
         * In the restore case we need to deal with L4 pages which got
981
0
         * initialized with m2p_strict still clear (and which hence lack the
982
0
         * correct initial RO_MPT_VIRT_{START,END} L4 entry).
983
0
         */
984
0
        if ( d != current->domain && !VM_ASSIST(d, m2p_strict) &&
985
0
             is_pv_domain(d) && !is_pv_32bit_domain(d) &&
986
0
             test_bit(VMASST_TYPE_m2p_strict, &c.nat->vm_assist) &&
987
0
             atomic_read(&d->arch.pv_domain.nr_l4_pages) )
988
0
        {
989
0
            bool done = false;
990
0
991
0
            spin_lock_recursive(&d->page_alloc_lock);
992
0
993
0
            for ( i = 0; ; )
994
0
            {
995
0
                struct page_info *page = page_list_remove_head(&d->page_list);
996
0
997
0
                if ( page_lock(page) )
998
0
                {
999
0
                    if ( (page->u.inuse.type_info & PGT_type_mask) ==
1000
0
                         PGT_l4_page_table )
1001
0
                        done = !fill_ro_mpt(_mfn(page_to_mfn(page)));
1002
0
1003
0
                    page_unlock(page);
1004
0
                }
1005
0
1006
0
                page_list_add_tail(page, &d->page_list);
1007
0
1008
0
                if ( done || (!(++i & 0xff) && hypercall_preempt_check()) )
1009
0
                    break;
1010
0
            }
1011
0
1012
0
            spin_unlock_recursive(&d->page_alloc_lock);
1013
0
1014
0
            if ( !done )
1015
0
                return -ERESTART;
1016
0
        }
1017
0
1018
0
        d->vm_assist = c(vm_assist);
1019
0
    }
1020
0
1021
0
    rc = put_old_guest_table(current);
1022
0
    if ( rc )
1023
0
        return rc;
1024
0
1025
0
    if ( !compat )
1026
0
        rc = (int)pv_set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
1027
0
    else
1028
0
    {
1029
0
        unsigned long gdt_frames[ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames)];
1030
0
        unsigned int n = (c.cmp->gdt_ents + 511) / 512;
1031
0
1032
0
        if ( n > ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames) )
1033
0
            return -EINVAL;
1034
0
        for ( i = 0; i < n; ++i )
1035
0
            gdt_frames[i] = c.cmp->gdt_frames[i];
1036
0
        rc = (int)pv_set_gdt(v, gdt_frames, c.cmp->gdt_ents);
1037
0
    }
1038
0
    if ( rc != 0 )
1039
0
        return rc;
1040
0
1041
0
    set_bit(_VPF_in_reset, &v->pause_flags);
1042
0
1043
0
    if ( !compat )
1044
0
        cr3_gfn = xen_cr3_to_pfn(c.nat->ctrlreg[3]);
1045
0
    else
1046
0
        cr3_gfn = compat_cr3_to_pfn(c.cmp->ctrlreg[3]);
1047
0
    cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC);
1048
0
1049
0
    if ( !cr3_page )
1050
0
        rc = -EINVAL;
1051
0
    else if ( paging_mode_refcounts(d) )
1052
0
        /* nothing */;
1053
0
    else if ( cr3_page == v->arch.old_guest_table )
1054
0
    {
1055
0
        v->arch.old_guest_table = NULL;
1056
0
        put_page(cr3_page);
1057
0
    }
1058
0
    else
1059
0
    {
1060
0
        if ( !compat )
1061
0
            rc = put_old_guest_table(v);
1062
0
        if ( !rc )
1063
0
            rc = get_page_type_preemptible(cr3_page,
1064
0
                                           !compat ? PGT_root_page_table
1065
0
                                                   : PGT_l3_page_table);
1066
0
        switch ( rc )
1067
0
        {
1068
0
        case -EINTR:
1069
0
            rc = -ERESTART;
1070
0
        case -ERESTART:
1071
0
            break;
1072
0
        case 0:
1073
0
            if ( !compat && !VM_ASSIST(d, m2p_strict) &&
1074
0
                 !paging_mode_refcounts(d) )
1075
0
                fill_ro_mpt(_mfn(cr3_gfn));
1076
0
            break;
1077
0
        default:
1078
0
            if ( cr3_page == current->arch.old_guest_table )
1079
0
                cr3_page = NULL;
1080
0
            break;
1081
0
        }
1082
0
    }
1083
0
    if ( rc )
1084
0
        /* handled below */;
1085
0
    else if ( !compat )
1086
0
    {
1087
0
        v->arch.guest_table = pagetable_from_page(cr3_page);
1088
0
        if ( c.nat->ctrlreg[1] )
1089
0
        {
1090
0
            cr3_gfn = xen_cr3_to_pfn(c.nat->ctrlreg[1]);
1091
0
            cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC);
1092
0
1093
0
            if ( !cr3_page )
1094
0
                rc = -EINVAL;
1095
0
            else if ( !paging_mode_refcounts(d) )
1096
0
            {
1097
0
                rc = get_page_type_preemptible(cr3_page, PGT_root_page_table);
1098
0
                switch ( rc )
1099
0
                {
1100
0
                case -EINTR:
1101
0
                    rc = -ERESTART;
1102
0
                    /* Fallthrough */
1103
0
                case -ERESTART:
1104
0
                    v->arch.old_guest_ptpg = NULL;
1105
0
                    v->arch.old_guest_table =
1106
0
                        pagetable_get_page(v->arch.guest_table);
1107
0
                    v->arch.guest_table = pagetable_null();
1108
0
                    break;
1109
0
                default:
1110
0
                    if ( cr3_page == current->arch.old_guest_table )
1111
0
                        cr3_page = NULL;
1112
0
                    break;
1113
0
                case 0:
1114
0
                    if ( VM_ASSIST(d, m2p_strict) )
1115
0
                        zap_ro_mpt(_mfn(cr3_gfn));
1116
0
                    break;
1117
0
                }
1118
0
            }
1119
0
            if ( !rc )
1120
0
               v->arch.guest_table_user = pagetable_from_page(cr3_page);
1121
0
        }
1122
0
    }
1123
0
    else
1124
0
    {
1125
0
        l4_pgentry_t *l4tab;
1126
0
1127
0
        l4tab = map_domain_page(pagetable_get_mfn(v->arch.guest_table));
1128
0
        *l4tab = l4e_from_pfn(page_to_mfn(cr3_page),
1129
0
            _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
1130
0
        unmap_domain_page(l4tab);
1131
0
    }
1132
0
    if ( rc )
1133
0
    {
1134
0
        if ( cr3_page )
1135
0
            put_page(cr3_page);
1136
0
        pv_destroy_gdt(v);
1137
0
        return rc;
1138
0
    }
1139
0
1140
0
    clear_bit(_VPF_in_reset, &v->pause_flags);
1141
0
1142
0
    if ( v->vcpu_id == 0 )
1143
0
        update_domain_wallclock_time(d);
1144
0
1145
0
    /* Don't redo final setup */
1146
0
    v->is_initialised = 1;
1147
0
1148
0
    if ( paging_mode_enabled(d) )
1149
0
        paging_update_paging_modes(v);
1150
0
1151
0
    update_cr3(v);
1152
0
1153
0
 out:
1154
0
    if ( flags & VGCF_online )
1155
0
        clear_bit(_VPF_down, &v->pause_flags);
1156
0
    else
1157
0
        set_bit(_VPF_down, &v->pause_flags);
1158
0
    return 0;
1159
0
#undef c
1160
0
}
1161
1162
int arch_initialise_vcpu(struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg)
1163
0
{
1164
0
    int rc;
1165
0
1166
0
    if ( is_hvm_vcpu(v) )
1167
0
    {
1168
0
        struct domain *d = v->domain;
1169
0
        struct vcpu_hvm_context ctxt;
1170
0
1171
0
        if ( copy_from_guest(&ctxt, arg, 1) )
1172
0
            return -EFAULT;
1173
0
1174
0
        domain_lock(d);
1175
0
        rc = v->is_initialised ? -EEXIST : arch_set_info_hvm_guest(v, &ctxt);
1176
0
        domain_unlock(d);
1177
0
    }
1178
0
    else
1179
0
        rc = default_initialise_vcpu(v, arg);
1180
0
1181
0
    return rc;
1182
0
}
1183
1184
int arch_vcpu_reset(struct vcpu *v)
1185
0
{
1186
0
    if ( is_pv_vcpu(v) )
1187
0
    {
1188
0
        pv_destroy_gdt(v);
1189
0
        return vcpu_destroy_pagetables(v);
1190
0
    }
1191
0
1192
0
    vcpu_end_shutdown_deferral(v);
1193
0
    return 0;
1194
0
}
1195
1196
long
1197
arch_do_vcpu_op(
1198
    int cmd, struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg)
1199
0
{
1200
0
    long rc = 0;
1201
0
1202
0
    switch ( cmd )
1203
0
    {
1204
0
    case VCPUOP_register_vcpu_time_memory_area:
1205
0
    {
1206
0
        struct vcpu_register_time_memory_area area;
1207
0
1208
0
        rc = -EFAULT;
1209
0
        if ( copy_from_guest(&area, arg, 1) )
1210
0
            break;
1211
0
1212
0
        if ( !guest_handle_okay(area.addr.h, 1) )
1213
0
            break;
1214
0
1215
0
        rc = 0;
1216
0
        v->arch.time_info_guest = area.addr.h;
1217
0
1218
0
        force_update_vcpu_system_time(v);
1219
0
1220
0
        break;
1221
0
    }
1222
0
1223
0
    case VCPUOP_get_physid:
1224
0
    {
1225
0
        struct vcpu_get_physid cpu_id;
1226
0
1227
0
        rc = -EINVAL;
1228
0
        if ( !is_pinned_vcpu(v) )
1229
0
            break;
1230
0
1231
0
        cpu_id.phys_id =
1232
0
            (uint64_t)x86_cpu_to_apicid[v->vcpu_id] |
1233
0
            ((uint64_t)acpi_get_processor_id(v->vcpu_id) << 32);
1234
0
1235
0
        rc = -EFAULT;
1236
0
        if ( copy_to_guest(arg, &cpu_id, 1) )
1237
0
            break;
1238
0
1239
0
        rc = 0;
1240
0
        break;
1241
0
    }
1242
0
1243
0
    default:
1244
0
        rc = -ENOSYS;
1245
0
        break;
1246
0
    }
1247
0
1248
0
    return rc;
1249
0
}
1250
1251
/*
1252
 * Loading a nul selector does not clear bases and limits on AMD CPUs. Be on
1253
 * the safe side and re-initialize both to flat segment values before loading
1254
 * a nul selector.
1255
 */
1256
0
#define preload_segment(seg, value) do {              \
1257
0
    if ( !((value) & ~3) &&                           \
1258
0
         boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) \
1259
0
        asm volatile ( "movl %k0, %%" #seg            \
1260
0
                       :: "r" (FLAT_USER_DS32) );     \
1261
0
} while ( false )
1262
1263
0
#define loadsegment(seg,value) ({               \
1264
0
    int __r = 1;                                \
1265
0
    asm volatile (                              \
1266
0
        "1: movl %k1,%%" #seg "\n2:\n"          \
1267
0
        ".section .fixup,\"ax\"\n"              \
1268
0
        "3: xorl %k0,%k0\n"                     \
1269
0
        "   movl %k0,%%" #seg "\n"              \
1270
0
        "   jmp 2b\n"                           \
1271
0
        ".previous\n"                           \
1272
0
        _ASM_EXTABLE(1b, 3b)                    \
1273
0
        : "=r" (__r) : "r" (value), "0" (__r) );\
1274
0
    __r; })
1275
1276
/*
1277
 * save_segments() writes a mask of segments which are dirty (non-zero),
1278
 * allowing load_segments() to avoid some expensive segment loads and
1279
 * MSR writes.
1280
 */
1281
static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
1282
0
#define DIRTY_DS           0x01
1283
0
#define DIRTY_ES           0x02
1284
0
#define DIRTY_FS           0x04
1285
0
#define DIRTY_GS           0x08
1286
0
#define DIRTY_FS_BASE      0x10
1287
0
#define DIRTY_GS_BASE_USER 0x20
1288
1289
static void load_segments(struct vcpu *n)
1290
0
{
1291
0
    struct cpu_user_regs *uregs = &n->arch.user_regs;
1292
0
    int all_segs_okay = 1;
1293
0
    unsigned int dirty_segment_mask, cpu = smp_processor_id();
1294
0
1295
0
    /* Load and clear the dirty segment mask. */
1296
0
    dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
1297
0
    per_cpu(dirty_segment_mask, cpu) = 0;
1298
0
1299
0
    /* Either selector != 0 ==> reload. */
1300
0
    if ( unlikely((dirty_segment_mask & DIRTY_DS) | uregs->ds) )
1301
0
    {
1302
0
        preload_segment(ds, uregs->ds);
1303
0
        all_segs_okay &= loadsegment(ds, uregs->ds);
1304
0
    }
1305
0
1306
0
    /* Either selector != 0 ==> reload. */
1307
0
    if ( unlikely((dirty_segment_mask & DIRTY_ES) | uregs->es) )
1308
0
    {
1309
0
        preload_segment(es, uregs->es);
1310
0
        all_segs_okay &= loadsegment(es, uregs->es);
1311
0
    }
1312
0
1313
0
    /* Either selector != 0 ==> reload. */
1314
0
    if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) )
1315
0
    {
1316
0
        all_segs_okay &= loadsegment(fs, uregs->fs);
1317
0
        /* non-nul selector updates fs_base */
1318
0
        if ( uregs->fs & ~3 )
1319
0
            dirty_segment_mask &= ~DIRTY_FS_BASE;
1320
0
    }
1321
0
1322
0
    /* Either selector != 0 ==> reload. */
1323
0
    if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) )
1324
0
    {
1325
0
        all_segs_okay &= loadsegment(gs, uregs->gs);
1326
0
        /* non-nul selector updates gs_base_user */
1327
0
        if ( uregs->gs & ~3 )
1328
0
            dirty_segment_mask &= ~DIRTY_GS_BASE_USER;
1329
0
    }
1330
0
1331
0
    if ( !is_pv_32bit_vcpu(n) )
1332
0
    {
1333
0
        /* This can only be non-zero if selector is NULL. */
1334
0
        if ( n->arch.pv_vcpu.fs_base | (dirty_segment_mask & DIRTY_FS_BASE) )
1335
0
            wrfsbase(n->arch.pv_vcpu.fs_base);
1336
0
1337
0
        /* Most kernels have non-zero GS base, so don't bother testing. */
1338
0
        /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
1339
0
        wrmsrl(MSR_SHADOW_GS_BASE, n->arch.pv_vcpu.gs_base_kernel);
1340
0
1341
0
        /* This can only be non-zero if selector is NULL. */
1342
0
        if ( n->arch.pv_vcpu.gs_base_user |
1343
0
             (dirty_segment_mask & DIRTY_GS_BASE_USER) )
1344
0
            wrgsbase(n->arch.pv_vcpu.gs_base_user);
1345
0
1346
0
        /* If in kernel mode then switch the GS bases around. */
1347
0
        if ( (n->arch.flags & TF_kernel_mode) )
1348
0
            asm volatile ( "swapgs" );
1349
0
    }
1350
0
1351
0
    if ( unlikely(!all_segs_okay) )
1352
0
    {
1353
0
        struct pv_vcpu *pv = &n->arch.pv_vcpu;
1354
0
        struct cpu_user_regs *regs = guest_cpu_user_regs();
1355
0
        unsigned long *rsp =
1356
0
            (unsigned long *)(((n->arch.flags & TF_kernel_mode)
1357
0
                               ? regs->rsp : pv->kernel_sp) & ~0xf);
1358
0
        unsigned long cs_and_mask, rflags;
1359
0
1360
0
        /* Fold upcall mask and architectural IOPL into RFLAGS.IF. */
1361
0
        rflags  = regs->rflags & ~(X86_EFLAGS_IF|X86_EFLAGS_IOPL);
1362
0
        rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1363
0
        if ( VM_ASSIST(n->domain, architectural_iopl) )
1364
0
            rflags |= n->arch.pv_vcpu.iopl;
1365
0
1366
0
        if ( is_pv_32bit_vcpu(n) )
1367
0
        {
1368
0
            unsigned int *esp = ring_1(regs) ?
1369
0
                                (unsigned int *)regs->rsp :
1370
0
                                (unsigned int *)pv->kernel_sp;
1371
0
            int ret = 0;
1372
0
1373
0
            /* CS longword also contains full evtchn_upcall_mask. */
1374
0
            cs_and_mask = (unsigned short)regs->cs |
1375
0
                ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
1376
0
1377
0
            if ( !ring_1(regs) )
1378
0
            {
1379
0
                ret  = put_user(regs->ss,       esp-1);
1380
0
                ret |= put_user(regs->esp,      esp-2);
1381
0
                esp -= 2;
1382
0
            }
1383
0
1384
0
            if ( ret |
1385
0
                 put_user(rflags,              esp-1) |
1386
0
                 put_user(cs_and_mask,         esp-2) |
1387
0
                 put_user(regs->eip,           esp-3) |
1388
0
                 put_user(uregs->gs,           esp-4) |
1389
0
                 put_user(uregs->fs,           esp-5) |
1390
0
                 put_user(uregs->es,           esp-6) |
1391
0
                 put_user(uregs->ds,           esp-7) )
1392
0
            {
1393
0
                gprintk(XENLOG_ERR,
1394
0
                        "error while creating compat failsafe callback frame\n");
1395
0
                domain_crash(n->domain);
1396
0
            }
1397
0
1398
0
            if ( n->arch.vgc_flags & VGCF_failsafe_disables_events )
1399
0
                vcpu_info(n, evtchn_upcall_mask) = 1;
1400
0
1401
0
            regs->entry_vector |= TRAP_syscall;
1402
0
            regs->eflags       &= ~(X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT|
1403
0
                                    X86_EFLAGS_IOPL|X86_EFLAGS_TF);
1404
0
            regs->ss            = FLAT_COMPAT_KERNEL_SS;
1405
0
            regs->esp           = (unsigned long)(esp-7);
1406
0
            regs->cs            = FLAT_COMPAT_KERNEL_CS;
1407
0
            regs->eip           = pv->failsafe_callback_eip;
1408
0
            return;
1409
0
        }
1410
0
1411
0
        if ( !(n->arch.flags & TF_kernel_mode) )
1412
0
            toggle_guest_mode(n);
1413
0
        else
1414
0
            regs->cs &= ~3;
1415
0
1416
0
        /* CS longword also contains full evtchn_upcall_mask. */
1417
0
        cs_and_mask = (unsigned long)regs->cs |
1418
0
            ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1419
0
1420
0
        if ( put_user(regs->ss,            rsp- 1) |
1421
0
             put_user(regs->rsp,           rsp- 2) |
1422
0
             put_user(rflags,              rsp- 3) |
1423
0
             put_user(cs_and_mask,         rsp- 4) |
1424
0
             put_user(regs->rip,           rsp- 5) |
1425
0
             put_user(uregs->gs,           rsp- 6) |
1426
0
             put_user(uregs->fs,           rsp- 7) |
1427
0
             put_user(uregs->es,           rsp- 8) |
1428
0
             put_user(uregs->ds,           rsp- 9) |
1429
0
             put_user(regs->r11,           rsp-10) |
1430
0
             put_user(regs->rcx,           rsp-11) )
1431
0
        {
1432
0
            gprintk(XENLOG_ERR,
1433
0
                    "error while creating failsafe callback frame\n");
1434
0
            domain_crash(n->domain);
1435
0
        }
1436
0
1437
0
        if ( n->arch.vgc_flags & VGCF_failsafe_disables_events )
1438
0
            vcpu_info(n, evtchn_upcall_mask) = 1;
1439
0
1440
0
        regs->entry_vector |= TRAP_syscall;
1441
0
        regs->rflags       &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1442
0
                                X86_EFLAGS_NT|X86_EFLAGS_IOPL|X86_EFLAGS_TF);
1443
0
        regs->ss            = FLAT_KERNEL_SS;
1444
0
        regs->rsp           = (unsigned long)(rsp-11);
1445
0
        regs->cs            = FLAT_KERNEL_CS;
1446
0
        regs->rip           = pv->failsafe_callback_eip;
1447
0
    }
1448
0
}
1449
1450
static void save_segments(struct vcpu *v)
1451
0
{
1452
0
    struct cpu_user_regs *regs = &v->arch.user_regs;
1453
0
    unsigned int dirty_segment_mask = 0;
1454
0
1455
0
    regs->ds = read_sreg(ds);
1456
0
    regs->es = read_sreg(es);
1457
0
    regs->fs = read_sreg(fs);
1458
0
    regs->gs = read_sreg(gs);
1459
0
1460
0
    if ( cpu_has_fsgsbase && !is_pv_32bit_vcpu(v) )
1461
0
    {
1462
0
        v->arch.pv_vcpu.fs_base = __rdfsbase();
1463
0
        if ( v->arch.flags & TF_kernel_mode )
1464
0
            v->arch.pv_vcpu.gs_base_kernel = __rdgsbase();
1465
0
        else
1466
0
            v->arch.pv_vcpu.gs_base_user = __rdgsbase();
1467
0
    }
1468
0
1469
0
    if ( regs->ds )
1470
0
        dirty_segment_mask |= DIRTY_DS;
1471
0
1472
0
    if ( regs->es )
1473
0
        dirty_segment_mask |= DIRTY_ES;
1474
0
1475
0
    if ( regs->fs || is_pv_32bit_vcpu(v) )
1476
0
    {
1477
0
        dirty_segment_mask |= DIRTY_FS;
1478
0
        /* non-nul selector kills fs_base */
1479
0
        if ( regs->fs & ~3 )
1480
0
            v->arch.pv_vcpu.fs_base = 0;
1481
0
    }
1482
0
    if ( v->arch.pv_vcpu.fs_base )
1483
0
        dirty_segment_mask |= DIRTY_FS_BASE;
1484
0
1485
0
    if ( regs->gs || is_pv_32bit_vcpu(v) )
1486
0
    {
1487
0
        dirty_segment_mask |= DIRTY_GS;
1488
0
        /* non-nul selector kills gs_base_user */
1489
0
        if ( regs->gs & ~3 )
1490
0
            v->arch.pv_vcpu.gs_base_user = 0;
1491
0
    }
1492
0
    if ( v->arch.pv_vcpu.gs_base_user )
1493
0
        dirty_segment_mask |= DIRTY_GS_BASE_USER;
1494
0
1495
0
    this_cpu(dirty_segment_mask) = dirty_segment_mask;
1496
0
}
1497
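
save_segments() and load_segments() above cooperate through a per-CPU dirty mask: the outgoing vCPU records which selectors and bases were live, and the incoming path reloads a register only when either that bit or the new selector is non-zero, avoiding needless segment loads and MSR writes. A condensed user-space sketch of the bookkeeping, with illustrative names and selector values:

#include <stdio.h>

#define DIRTY_DS 0x01
#define DIRTY_ES 0x02

struct segs { unsigned int ds, es; };

static unsigned int dirty_segment_mask;   /* stands in for the per-CPU copy */

static void save_segs(const struct segs *cur)
{
    dirty_segment_mask = 0;
    if ( cur->ds )
        dirty_segment_mask |= DIRTY_DS;
    if ( cur->es )
        dirty_segment_mask |= DIRTY_ES;
}

static unsigned int load_segs(const struct segs *next)
{
    unsigned int mask = dirty_segment_mask, reloads = 0;

    dirty_segment_mask = 0;

    /* Either "previous selector was live" or "next wants non-zero" reloads. */
    if ( (mask & DIRTY_DS) | next->ds )
        ++reloads;
    if ( (mask & DIRTY_ES) | next->es )
        ++reloads;

    return reloads;
}

int main(void)
{
    struct segs prev = { .ds = 0x2b, .es = 0 };
    struct segs next = { .ds = 0, .es = 0 };

    save_segs(&prev);
    /* Only DS needs touching: prints 1. */
    printf("segment reloads needed: %u\n", load_segs(&next));
    return 0;
}
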
1498
void paravirt_ctxt_switch_from(struct vcpu *v)
1499
0
{
1500
0
    save_segments(v);
1501
0
1502
0
    /*
1503
0
     * Disable debug breakpoints. We do this aggressively because if we switch
1504
0
     * to an HVM guest we may load DR0-DR3 with values that can cause #DB
1505
0
     * inside Xen, before we get a chance to reload DR7, and this cannot always
1506
0
     * safely be handled.
1507
0
     */
1508
0
    if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
1509
0
        write_debugreg(7, 0);
1510
0
}
1511
1512
void paravirt_ctxt_switch_to(struct vcpu *v)
1513
0
{
1514
0
    unsigned long cr4;
1515
0
1516
0
    cr4 = pv_guest_cr4_to_real_cr4(v);
1517
0
    if ( unlikely(cr4 != read_cr4()) )
1518
0
        write_cr4(cr4);
1519
0
1520
0
    if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
1521
0
        activate_debugregs(v);
1522
0
1523
0
    if ( (v->domain->arch.tsc_mode ==  TSC_MODE_PVRDTSCP) &&
1524
0
         boot_cpu_has(X86_FEATURE_RDTSCP) )
1525
0
        write_rdtscp_aux(v->domain->arch.incarnation);
1526
0
}
1527
1528
/* Update per-VCPU guest runstate shared memory area (if registered). */
1529
bool update_runstate_area(struct vcpu *v)
1530
328k
{
1531
328k
    bool rc;
1532
328k
    struct guest_memory_policy policy =
1533
328k
        { .smap_policy = SMAP_CHECK_ENABLED, .nested_guest_mode = false };
1534
328k
    void __user *guest_handle = NULL;
1535
328k
1536
328k
    if ( guest_handle_is_null(runstate_guest(v)) )
1537
328k
        return true;
1538
328k
1539
127
    update_guest_memory_policy(v, &policy);
1540
127
1541
127
    if ( VM_ASSIST(v->domain, runstate_update_flag) )
1542
0
    {
1543
0
        guest_handle = has_32bit_shinfo(v->domain)
1544
0
            ? &v->runstate_guest.compat.p->state_entry_time + 1
1545
0
            : &v->runstate_guest.native.p->state_entry_time + 1;
1546
0
        guest_handle--;
1547
0
        v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
1548
0
        __raw_copy_to_guest(guest_handle,
1549
0
                            (void *)(&v->runstate.state_entry_time + 1) - 1, 1);
1550
0
        smp_wmb();
1551
0
    }
1552
127
1553
127
    if ( has_32bit_shinfo(v->domain) )
1554
0
    {
1555
0
        struct compat_vcpu_runstate_info info;
1556
0
1557
0
        XLAT_vcpu_runstate_info(&info, &v->runstate);
1558
0
        __copy_to_guest(v->runstate_guest.compat, &info, 1);
1559
0
        rc = true;
1560
0
    }
1561
127
    else
1562
127
        rc = __copy_to_guest(runstate_guest(v), &v->runstate, 1) !=
1563
127
             sizeof(v->runstate);
1564
127
1565
127
    if ( guest_handle )
1566
0
    {
1567
0
        v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
1568
0
        smp_wmb();
1569
0
        __raw_copy_to_guest(guest_handle,
1570
0
                            (void *)(&v->runstate.state_entry_time + 1) - 1, 1);
1571
0
    }
1572
127
1573
127
    update_guest_memory_policy(v, &policy);
1574
127
1575
127
    return rc;
1576
328k
}
1577
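
When VM_ASSIST(..., runstate_update_flag) is active, update_runstate_area() above brackets the copy with the XEN_RUNSTATE_UPDATE bit: set the bit, barrier, write the payload, barrier, clear the bit, so a guest reader can detect a torn snapshot and retry. A user-space sketch of that publication pattern; the structure layout, flag value, and C11 fences below stand in for Xen's real definitions.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define RUNSTATE_UPDATE (UINT64_C(1) << 63)   /* illustrative flag bit */

struct runstate {
    _Atomic uint64_t state_entry_time;
    uint64_t time[4];
};

static void publish_runstate(struct runstate *shared, const uint64_t t[4])
{
    uint64_t entry = atomic_load(&shared->state_entry_time);

    /* Mark an update as in progress before touching the payload. */
    atomic_store_explicit(&shared->state_entry_time, entry | RUNSTATE_UPDATE,
                          memory_order_release);
    atomic_thread_fence(memory_order_seq_cst);   /* plays the smp_wmb() role */

    for ( unsigned int i = 0; i < 4; i++ )
        shared->time[i] = t[i];

    atomic_thread_fence(memory_order_seq_cst);
    /* Clear the flag only after the payload is fully visible. */
    atomic_store_explicit(&shared->state_entry_time, entry & ~RUNSTATE_UPDATE,
                          memory_order_release);
}

int main(void)
{
    struct runstate rs = { 0 };
    uint64_t sample[4] = { 1, 2, 3, 4 };

    publish_runstate(&rs, sample);
    printf("flag clear after publish: %d\n",
           !(atomic_load(&rs.state_entry_time) & RUNSTATE_UPDATE));
    return 0;
}
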
1578
static void _update_runstate_area(struct vcpu *v)
1579
328k
{
1580
328k
    if ( !update_runstate_area(v) && is_pv_vcpu(v) &&
1581
0
         !(v->arch.flags & TF_kernel_mode) )
1582
0
        v->arch.pv_vcpu.need_update_runstate_area = 1;
1583
328k
}
1584
1585
static inline bool need_full_gdt(const struct domain *d)
1586
118k
{
1587
118k
    return is_pv_domain(d) && !is_idle_domain(d);
1588
118k
}
1589
1590
static void __context_switch(void)
1591
39.5k
{
1592
39.5k
    struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1593
39.5k
    unsigned int          cpu = smp_processor_id();
1594
39.5k
    struct vcpu          *p = per_cpu(curr_vcpu, cpu);
1595
39.5k
    struct vcpu          *n = current;
1596
39.5k
    struct domain        *pd = p->domain, *nd = n->domain;
1597
39.5k
    struct desc_struct   *gdt;
1598
39.5k
    struct desc_ptr       gdt_desc;
1599
39.5k
1600
39.5k
    ASSERT(p != n);
1601
39.5k
    ASSERT(cpumask_empty(n->vcpu_dirty_cpumask));
1602
39.5k
1603
39.5k
    if ( !is_idle_domain(pd) )
1604
36.9k
    {
1605
36.9k
        memcpy(&p->arch.user_regs, stack_regs, CTXT_SWITCH_STACK_BYTES);
1606
36.9k
        vcpu_save_fpu(p);
1607
36.9k
        pd->arch.ctxt_switch->from(p);
1608
36.9k
    }
1609
39.5k
1610
39.5k
    /*
1611
39.5k
     * Mark this CPU in next domain's dirty cpumasks before calling
1612
39.5k
     * ctxt_switch_to(). This avoids a race on things like EPT flushing,
1613
39.5k
     * which is synchronised on that function.
1614
39.5k
     */
1615
39.5k
    if ( pd != nd )
1616
5.17k
        cpumask_set_cpu(cpu, nd->domain_dirty_cpumask);
1617
39.5k
    cpumask_set_cpu(cpu, n->vcpu_dirty_cpumask);
1618
39.5k
1619
39.5k
    if ( !is_idle_domain(nd) )
1620
37.0k
    {
1621
37.0k
        memcpy(stack_regs, &n->arch.user_regs, CTXT_SWITCH_STACK_BYTES);
1622
37.0k
        if ( cpu_has_xsave )
1623
37.0k
        {
1624
33.3k
            u64 xcr0 = n->arch.xcr0 ?: XSTATE_FP_SSE;
1625
37.0k
1626
37.0k
            if ( xcr0 != get_xcr0() && !set_xcr0(xcr0) )
1627
0
                BUG();
1628
37.0k
1629
37.0k
            if ( cpu_has_xsaves && is_hvm_vcpu(n) )
1630
0
                set_msr_xss(n->arch.hvm_vcpu.msr_xss);
1631
37.0k
        }
1632
37.0k
        vcpu_restore_fpu_eager(n);
1633
37.0k
        nd->arch.ctxt_switch->to(n);
1634
37.0k
    }
1635
39.5k
1636
39.5k
    psr_ctxt_switch_to(nd);
1637
39.5k
1638
39.5k
    gdt = !is_pv_32bit_domain(nd) ? per_cpu(gdt_table, cpu) :
1639
18.4E
                                    per_cpu(compat_gdt_table, cpu);
1640
39.5k
    if ( need_full_gdt(nd) )
1641
0
    {
1642
0
        unsigned long mfn = virt_to_mfn(gdt);
1643
0
        l1_pgentry_t *pl1e = pv_gdt_ptes(n);
1644
0
        unsigned int i;
1645
0
1646
0
        for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ )
1647
0
            l1e_write(pl1e + FIRST_RESERVED_GDT_PAGE + i,
1648
0
                      l1e_from_pfn(mfn + i, __PAGE_HYPERVISOR_RW));
1649
0
    }
1650
39.5k
1651
39.5k
    if ( need_full_gdt(pd) &&
1652
0
         ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(nd)) )
1653
0
    {
1654
0
        gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
1655
0
        gdt_desc.base  = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
1656
0
        asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1657
0
    }
1658
39.5k
1659
39.5k
    write_ptbase(n);
1660
39.5k
1661
39.5k
    if ( need_full_gdt(nd) &&
1662
0
         ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(pd)) )
1663
0
    {
1664
0
        gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
1665
0
        gdt_desc.base = GDT_VIRT_START(n);
1666
0
        asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1667
0
    }
1668
39.5k
1669
39.5k
    if ( pd != nd )
1670
5.17k
        cpumask_clear_cpu(cpu, pd->domain_dirty_cpumask);
1671
39.5k
    cpumask_clear_cpu(cpu, p->vcpu_dirty_cpumask);
1672
39.5k
1673
39.5k
    per_cpu(curr_vcpu, cpu) = n;
1674
39.5k
}
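The comment in __context_switch() above stresses that this CPU must appear in the incoming domain's dirty cpumask before ctxt_switch_to() touches per-domain hardware state, so remote operations that are synchronised on that mask (the EPT-flush example) cannot miss it. The sketch below models that publish-before-use, withdraw-after-use ordering with a single C11 atomic bitmask; domain_dirty, ctxt_switch_in() and cpus_to_flush() are invented names for the illustration.

/* Illustrative sketch of the dirty-cpumask publish/withdraw ordering. */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long domain_dirty;      /* one bit per CPU (model) */

static void ctxt_switch_in(unsigned int cpu)
{
    /* Publish first: the seq_cst RMW makes the bit visible before any of
     * the subsequent accesses to the domain's hardware state. */
    atomic_fetch_or(&domain_dirty, 1UL << cpu);

    /* ... load per-domain state here (ctxt_switch_to() equivalent) ... */
}

static void ctxt_switch_out(unsigned int cpu)
{
    /* ... state fully saved; only then withdraw from the mask ... */
    atomic_fetch_and(&domain_dirty, ~(1UL << cpu));
}

static unsigned long cpus_to_flush(void)
{
    /* A remote flusher targets exactly the CPUs currently in the mask;
     * because publication precedes use, it cannot miss a user. */
    return atomic_load(&domain_dirty);
}

int main(void)
{
    ctxt_switch_in(3);
    printf("flush mask: %#lx\n", cpus_to_flush());
    ctxt_switch_out(3);
    return 0;
}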
1675
1676
1677
void context_switch(struct vcpu *prev, struct vcpu *next)
1678
164k
{
1679
164k
    unsigned int cpu = smp_processor_id();
1680
164k
    const struct domain *prevd = prev->domain, *nextd = next->domain;
1681
164k
    cpumask_t dirty_mask;
1682
164k
1683
164k
    ASSERT(local_irq_is_enabled());
1684
164k
1685
164k
    cpumask_copy(&dirty_mask, next->vcpu_dirty_cpumask);
1686
164k
    /* Allow at most one CPU at a time to be dirty. */
1687
164k
    ASSERT(cpumask_weight(&dirty_mask) <= 1);
1688
164k
    if ( unlikely(!cpumask_test_cpu(cpu, &dirty_mask) &&
1689
164k
                  !cpumask_empty(&dirty_mask)) )
1690
30
    {
1691
30
        /* Other cpus call __sync_local_execstate from flush ipi handler. */
1692
30
        flush_tlb_mask(&dirty_mask);
1693
30
    }
1694
164k
1695
164k
    if ( prev != next )
1696
164k
    {
1697
164k
        _update_runstate_area(prev);
1698
164k
        vpmu_switch_from(prev);
1699
164k
        np2m_schedule(NP2M_SCHEDLE_OUT);
1700
164k
    }
1701
164k
1702
164k
    if ( is_hvm_domain(prevd) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1703
0
        pt_save_timer(prev);
1704
164k
1705
164k
    local_irq_disable();
1706
164k
1707
164k
    set_current(next);
1708
164k
1709
164k
    if ( (per_cpu(curr_vcpu, cpu) == next) ||
1710
102k
         (is_idle_domain(nextd) && cpu_online(cpu)) )
1711
127k
    {
1712
127k
        local_irq_enable();
1713
127k
    }
1714
164k
    else
1715
37.0k
    {
1716
37.0k
        __context_switch();
1717
37.0k
1718
37.0k
        if ( is_pv_domain(nextd) &&
1719
0
             (is_idle_domain(prevd) ||
1720
0
              is_hvm_domain(prevd) ||
1721
0
              is_pv_32bit_domain(prevd) != is_pv_32bit_domain(nextd)) )
1722
0
        {
1723
0
            uint64_t efer = read_efer();
1724
0
            if ( !(efer & EFER_SCE) )
1725
0
                write_efer(efer | EFER_SCE);
1726
0
        }
1727
37.0k
1728
37.0k
        /* Re-enable interrupts before restoring state which may fault. */
1729
37.0k
        local_irq_enable();
1730
37.0k
1731
37.0k
        if ( is_pv_domain(nextd) )
1732
0
        {
1733
0
            load_LDT(next);
1734
0
            load_segments(next);
1735
0
        }
1736
37.0k
1737
37.0k
        ctxt_switch_levelling(next);
1738
37.0k
    }
1739
164k
1740
164k
    context_saved(prev);
1741
164k
1742
164k
    if ( prev != next )
1743
165k
    {
1744
165k
        _update_runstate_area(next);
1745
165k
1746
165k
        /* Must be done with interrupts enabled */
1747
165k
        vpmu_switch_to(next);
1748
165k
        np2m_schedule(NP2M_SCHEDLE_IN);
1749
165k
    }
1750
164k
1751
164k
    /* Ensure that the vcpu has an up-to-date time base. */
1752
164k
    update_vcpu_system_time(next);
1753
164k
1754
164k
    /*
1755
164k
     * Schedule tail *should* be a terminal function pointer, but leave a
1756
164k
     * bug frame around just in case it returns, to save going back into the
1757
164k
     * context switching code and leaving a far more subtle crash to diagnose.
1758
164k
     */
1759
164k
    nextd->arch.ctxt_switch->tail(next);
1760
164k
    BUG();
1761
164k
}
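context_switch() ends by calling through nextd->arch.ctxt_switch->tail(), which the comment above describes as a function pointer that should never return, with BUG() kept behind it purely as a backstop. A minimal sketch of that pattern under invented names, where exit() stands in for actually entering guest context:

/* Illustrative sketch of a "terminal tail + backstop" call, invented names. */
#include <stdio.h>
#include <stdlib.h>

struct ctxt_switch_ops {
    void (*tail)(void);              /* expected not to return             */
};

static void enter_guest(void)
{
    puts("resuming guest; control does not come back this way");
    exit(0);                         /* stands in for iret/vmenter         */
}

static const struct ctxt_switch_ops ops = { .tail = enter_guest };

int main(void)
{
    ops.tail();
    /* BUG() equivalent: reached only if tail() wrongly returned, turning a
     * subtle corruption into an immediate, obvious crash. */
    abort();
}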
1762
1763
void continue_running(struct vcpu *same)
1764
4.42M
{
1765
4.42M
    /* See the comment above. */
1766
4.42M
    same->domain->arch.ctxt_switch->tail(same);
1767
4.42M
    BUG();
1768
4.42M
}
1769
1770
int __sync_local_execstate(void)
1771
27.1M
{
1772
27.1M
    unsigned long flags;
1773
27.1M
    int switch_required;
1774
27.1M
1775
27.1M
    local_irq_save(flags);
1776
27.1M
1777
27.1M
    switch_required = (this_cpu(curr_vcpu) != current);
1778
27.1M
1779
27.1M
    if ( switch_required )
1780
2.56k
    {
1781
2.56k
        ASSERT(current == idle_vcpu[smp_processor_id()]);
1782
2.56k
        __context_switch();
1783
2.56k
    }
1784
27.1M
1785
27.1M
    local_irq_restore(flags);
1786
27.1M
1787
27.1M
    return switch_required;
1788
27.1M
}
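Taken together, context_switch() and __sync_local_execstate() implement lazy state switching: when the scheduler hands the CPU to the idle vCPU, 'current' changes but per_cpu(curr_vcpu) still names the vCPU whose registers remain loaded, and the expensive __context_switch() happens only once another vCPU needs the hardware or a sync is forced. The model below compresses that bookkeeping; 'scheduled', 'loaded' and the counter are invented stand-ins for current, curr_vcpu and the real save/restore cost.

/* Illustrative model of lazy context switching (invented names). */
#include <stdbool.h>
#include <stdio.h>

#define IDLE 0                     /* vCPU id 0 models the idle vCPU       */

static int scheduled = IDLE;       /* "current": what the scheduler wants  */
static int loaded    = IDLE;       /* "curr_vcpu": whose state is loaded   */
static unsigned int hw_switches;   /* how often the full cost was paid     */

static void hw_context_switch(int next)    /* __context_switch() stand-in  */
{
    hw_switches++;
    loaded = next;
}

static void schedule_to(int next)          /* context_switch() stand-in    */
{
    scheduled = next;
    if ( loaded == next || next == IDLE )
        return;                    /* lazy: keep the old state loaded      */
    hw_context_switch(next);
}

static bool sync_local_execstate(void)     /* forced sync stand-in         */
{
    if ( loaded == scheduled )
        return false;
    hw_context_switch(scheduled);
    return true;
}

int main(void)
{
    schedule_to(1);                /* guest 1 runs: full switch            */
    schedule_to(IDLE);             /* idle: guest 1's state stays loaded   */
    schedule_to(1);                /* guest 1 again: no hardware work      */
    printf("hardware switches so far: %u\n", hw_switches);        /* 1 */
    schedule_to(IDLE);
    printf("sync forced a switch: %d\n", sync_local_execstate()); /* 1 */
    return 0;
}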
1789
1790
void sync_local_execstate(void)
1791
27.1M
{
1792
27.1M
    (void)__sync_local_execstate();
1793
27.1M
}
1794
1795
void sync_vcpu_execstate(struct vcpu *v)
1796
315
{
1797
315
    if ( cpumask_test_cpu(smp_processor_id(), v->vcpu_dirty_cpumask) )
1798
0
        sync_local_execstate();
1799
315
1800
315
    /* Other cpus call __sync_local_execstate from flush ipi handler. */
1801
315
    flush_tlb_mask(v->vcpu_dirty_cpumask);
1802
315
}
1803
1804
static int relinquish_memory(
1805
    struct domain *d, struct page_list_head *list, unsigned long type)
1806
0
{
1807
0
    struct page_info  *page;
1808
0
    unsigned long     x, y;
1809
0
    int               ret = 0;
1810
0
1811
0
    /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1812
0
    spin_lock_recursive(&d->page_alloc_lock);
1813
0
1814
0
    while ( (page = page_list_remove_head(list)) )
1815
0
    {
1816
0
        /* Grab a reference to the page so it won't disappear from under us. */
1817
0
        if ( unlikely(!get_page(page, d)) )
1818
0
        {
1819
0
            /* Couldn't get a reference -- someone is freeing this page. */
1820
0
            page_list_add_tail(page, &d->arch.relmem_list);
1821
0
            continue;
1822
0
        }
1823
0
1824
0
        if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1825
0
            ret = put_page_and_type_preemptible(page);
1826
0
        switch ( ret )
1827
0
        {
1828
0
        case 0:
1829
0
            break;
1830
0
        case -ERESTART:
1831
0
        case -EINTR:
1832
0
            ret = -ERESTART;
1833
0
            page_list_add(page, list);
1834
0
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
1835
0
            put_page(page);
1836
0
            goto out;
1837
0
        default:
1838
0
            BUG();
1839
0
        }
1840
0
1841
0
        if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1842
0
            put_page(page);
1843
0
1844
0
        /*
1845
0
         * Forcibly invalidate top-most, still valid page tables at this point
1846
0
         * to break circular 'linear page table' references as well as clean up
1847
0
         * partially validated pages. This is okay because MMU structures are
1848
0
         * not shared across domains and this domain is now dead. Thus top-most
1849
0
         * valid tables are not in use so a non-zero count means circular
1850
0
         * reference or partially validated.
1851
0
         */
1852
0
        y = page->u.inuse.type_info;
1853
0
        for ( ; ; )
1854
0
        {
1855
0
            x = y;
1856
0
            if ( likely((x & PGT_type_mask) != type) ||
1857
0
                 likely(!(x & (PGT_validated|PGT_partial))) )
1858
0
                break;
1859
0
1860
0
            y = cmpxchg(&page->u.inuse.type_info, x,
1861
0
                        x & ~(PGT_validated|PGT_partial));
1862
0
            if ( likely(y == x) )
1863
0
            {
1864
0
                /* No need for atomic update of type_info here: no one else updates it. */
1865
0
                switch ( ret = free_page_type(page, x, 1) )
1866
0
                {
1867
0
                case 0:
1868
0
                    break;
1869
0
                case -EINTR:
1870
0
                    page_list_add(page, list);
1871
0
                    page->u.inuse.type_info |= PGT_validated;
1872
0
                    if ( x & PGT_partial )
1873
0
                        put_page(page);
1874
0
                    put_page(page);
1875
0
                    ret = -ERESTART;
1876
0
                    goto out;
1877
0
                case -ERESTART:
1878
0
                    page_list_add(page, list);
1879
0
                    page->u.inuse.type_info |= PGT_partial;
1880
0
                    if ( x & PGT_partial )
1881
0
                        put_page(page);
1882
0
                    goto out;
1883
0
                default:
1884
0
                    BUG();
1885
0
                }
1886
0
                if ( x & PGT_partial )
1887
0
                {
1888
0
                    page->u.inuse.type_info--;
1889
0
                    put_page(page);
1890
0
                }
1891
0
                break;
1892
0
            }
1893
0
        }
1894
0
1895
0
        /* Put the page on the list and /then/ potentially free it. */
1896
0
        page_list_add_tail(page, &d->arch.relmem_list);
1897
0
        put_page(page);
1898
0
1899
0
        if ( hypercall_preempt_check() )
1900
0
        {
1901
0
            ret = -ERESTART;
1902
0
            goto out;
1903
0
        }
1904
0
    }
1905
0
1906
0
    /* list is empty at this point. */
1907
0
    page_list_move(list, &d->arch.relmem_list);
1908
0
1909
0
 out:
1910
0
    spin_unlock_recursive(&d->page_alloc_lock);
1911
0
    return ret;
1912
0
}
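The type_info loop in relinquish_memory() above is a compare-and-swap retry: re-read the word, decide whether PGT_validated/PGT_partial still need clearing, attempt the update with cmpxchg(), and redo the decision if another CPU changed the word in between. The sketch below reproduces that shape with C11 atomics and made-up flag names; the real loop additionally feeds the observed old value into free_page_type() and the reference-count handling.

/* Illustrative C11 sketch of the cmpxchg retry loop shape. */
#include <stdatomic.h>
#include <stdio.h>

#define FLAG_VALIDATED (1u << 0)     /* stand-ins for PGT_validated/partial */
#define FLAG_PARTIAL   (1u << 1)
#define TYPE_MASK      (0xfu << 4)
#define TYPE_L4        (0x1u << 4)

static unsigned int clear_validation(_Atomic unsigned int *type_info,
                                     unsigned int type)
{
    unsigned int x = atomic_load(type_info);

    for ( ;; )
    {
        /* Nothing to do unless it is the wanted type and still marked. */
        if ( (x & TYPE_MASK) != type ||
             !(x & (FLAG_VALIDATED | FLAG_PARTIAL)) )
            return x;

        /* Try to clear both flags.  On failure 'x' is refreshed with the
         * current value and the decision above is remade, exactly like the
         * y = cmpxchg(...) loop in relinquish_memory(). */
        if ( atomic_compare_exchange_weak(type_info, &x,
                                          x & ~(FLAG_VALIDATED | FLAG_PARTIAL)) )
            return x;                /* the old value, as cmpxchg() returns */
    }
}

int main(void)
{
    _Atomic unsigned int ti = TYPE_L4 | FLAG_VALIDATED;
    unsigned int old = clear_validation(&ti, TYPE_L4);

    printf("old type_info %#x, now %#x\n", old, atomic_load(&ti));
    return 0;
}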
1913
1914
int domain_relinquish_resources(struct domain *d)
1915
0
{
1916
0
    int ret;
1917
0
    struct vcpu *v;
1918
0
1919
0
    BUG_ON(!cpumask_empty(d->domain_dirty_cpumask));
1920
0
1921
0
    switch ( d->arch.relmem )
1922
0
    {
1923
0
    case RELMEM_not_started:
1924
0
        ret = pci_release_devices(d);
1925
0
        if ( ret )
1926
0
            return ret;
1927
0
1928
0
        /* Tear down paging-assistance stuff. */
1929
0
        ret = paging_teardown(d);
1930
0
        if ( ret )
1931
0
            return ret;
1932
0
1933
0
        /* Drop the in-use references to page-table bases. */
1934
0
        for_each_vcpu ( d, v )
1935
0
        {
1936
0
            ret = vcpu_destroy_pagetables(v);
1937
0
            if ( ret )
1938
0
                return ret;
1939
0
        }
1940
0
1941
0
        if ( is_pv_domain(d) )
1942
0
        {
1943
0
            for_each_vcpu ( d, v )
1944
0
            {
1945
0
                /*
1946
0
                 * Relinquish GDT mappings. No need for explicit unmapping of
1947
0
                 * the LDT as it automatically gets squashed with the guest
1948
0
                 * mappings.
1949
0
                 */
1950
0
                pv_destroy_gdt(v);
1951
0
            }
1952
0
        }
1953
0
1954
0
        if ( d->arch.pirq_eoi_map != NULL )
1955
0
        {
1956
0
            unmap_domain_page_global(d->arch.pirq_eoi_map);
1957
0
            put_page_and_type(mfn_to_page(d->arch.pirq_eoi_map_mfn));
1958
0
            d->arch.pirq_eoi_map = NULL;
1959
0
            d->arch.auto_unmask = 0;
1960
0
        }
1961
0
1962
0
        d->arch.relmem = RELMEM_shared;
1963
0
        /* fallthrough */
1964
0
1965
0
    case RELMEM_shared:
1966
0
1967
0
        if ( is_hvm_domain(d) )
1968
0
        {
1969
0
            /* If the domain has shared pages, relinquish them allowing
1970
0
             * for preemption. */
1971
0
            ret = relinquish_shared_pages(d);
1972
0
            if ( ret )
1973
0
                return ret;
1974
0
        }
1975
0
1976
0
        d->arch.relmem = RELMEM_xen;
1977
0
1978
0
        spin_lock(&d->page_alloc_lock);
1979
0
        page_list_splice(&d->arch.relmem_list, &d->page_list);
1980
0
        INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
1981
0
        spin_unlock(&d->page_alloc_lock);
1982
0
1983
0
        /* Fallthrough. Relinquish every page of memory. */
1984
0
    case RELMEM_xen:
1985
0
        ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
1986
0
        if ( ret )
1987
0
            return ret;
1988
0
        d->arch.relmem = RELMEM_l4;
1989
0
        /* fallthrough */
1990
0
1991
0
    case RELMEM_l4:
1992
0
        ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1993
0
        if ( ret )
1994
0
            return ret;
1995
0
        d->arch.relmem = RELMEM_l3;
1996
0
        /* fallthrough */
1997
0
1998
0
    case RELMEM_l3:
1999
0
        ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
2000
0
        if ( ret )
2001
0
            return ret;
2002
0
        d->arch.relmem = RELMEM_l2;
2003
0
        /* fallthrough */
2004
0
2005
0
    case RELMEM_l2:
2006
0
        ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
2007
0
        if ( ret )
2008
0
            return ret;
2009
0
        d->arch.relmem = RELMEM_done;
2010
0
        /* fallthrough */
2011
0
2012
0
    case RELMEM_done:
2013
0
        break;
2014
0
2015
0
    default:
2016
0
        BUG();
2017
0
    }
2018
0
2019
0
    pit_deinit(d);
2020
0
2021
0
    if ( is_hvm_domain(d) )
2022
0
        hvm_domain_relinquish_resources(d);
2023
0
2024
0
    return 0;
2025
0
}
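domain_relinquish_resources() is written as a resumable teardown: d->arch.relmem records how far the switch has progressed, each case falls through once its work is complete, and any -ERESTART return simply causes the operation to be retried from the recorded phase. The sketch below shows that continuation pattern in miniature, with invented phase names and a fake preemption check standing in for hypercall_preempt_check().

/* Illustrative sketch of a preemptible, resumable teardown state machine. */
#include <stdbool.h>
#include <stdio.h>

#ifndef ERESTART
#define ERESTART 85                /* local stand-in for Xen's -ERESTART    */
#endif

enum relmem { RM_not_started, RM_pages, RM_done };

struct dom {
    enum relmem relmem;            /* progress marker, like d->arch.relmem  */
    unsigned int pages_left;
};

static bool preempt_check(void)    /* hypercall_preempt_check() stand-in    */
{
    static unsigned int calls;
    return (++calls % 3) == 0;     /* pretend we must yield now and then    */
}

static int relinquish_resources(struct dom *d)
{
    switch ( d->relmem )
    {
    case RM_not_started:
        d->relmem = RM_pages;
        /* fallthrough */

    case RM_pages:
        while ( d->pages_left )
        {
            d->pages_left--;       /* release one unit of work              */
            if ( preempt_check() )
                return -ERESTART;  /* caller retries; we resume right here  */
        }
        d->relmem = RM_done;
        /* fallthrough */

    case RM_done:
        break;
    }
    return 0;
}

int main(void)
{
    struct dom d = { .relmem = RM_not_started, .pages_left = 7 };
    int rc, restarts = 0;

    while ( (rc = relinquish_resources(&d)) == -ERESTART )
        restarts++;                /* models the continuation/retry path    */

    printf("rc=%d after %d restart(s)\n", rc, restarts);
    return 0;
}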
2026
2027
void arch_dump_domain_info(struct domain *d)
2028
0
{
2029
0
    paging_dump_domain_info(d);
2030
0
}
2031
2032
void arch_dump_vcpu_info(struct vcpu *v)
2033
0
{
2034
0
    paging_dump_vcpu_info(v);
2035
0
2036
0
    vpmu_dump(v);
2037
0
}
2038
2039
void vcpu_kick(struct vcpu *v)
2040
99.6k
{
2041
99.6k
    /*
2042
99.6k
     * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
2043
99.6k
     * pending flag. These values may fluctuate (after all, we hold no
2044
99.6k
     * locks) but the key insight is that each change will cause
2045
99.6k
     * evtchn_upcall_pending to be polled.
2046
99.6k
     *
2047
99.6k
     * NB2. We save the running flag across the unblock to avoid a needless
2048
99.6k
     * IPI for domains that we IPI'd to unblock.
2049
99.6k
     */
2050
99.6k
    bool running = v->is_running;
2051
99.6k
2052
99.6k
    vcpu_unblock(v);
2053
99.6k
    if ( running && (in_irq() || (v != current)) )
2054
34.4k
        cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ);
2055
99.6k
}
2056
2057
void vcpu_mark_events_pending(struct vcpu *v)
2058
99.7k
{
2059
99.7k
    int already_pending = test_and_set_bit(
2060
99.7k
        0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
2061
99.7k
2062
99.7k
    if ( already_pending )
2063
65
        return;
2064
99.7k
2065
99.6k
    if ( is_hvm_vcpu(v) )
2066
99.7k
        hvm_assert_evtchn_irq(v);
2067
99.6k
    else
2068
18.4E
        vcpu_kick(v);
2069
99.6k
}
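vcpu_mark_events_pending() relies on test_and_set_bit() returning the previous bit value, so only the first caller to set evtchn_upcall_pending pays for the kick and concurrent callers return early. The sketch below shows the same idempotent-notification shape in portable C11; deliver() is an invented stand-in for hvm_assert_evtchn_irq()/vcpu_kick().

/* Illustrative sketch: the first setter of the pending bit delivers the kick. */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned char upcall_pending;   /* evtchn_upcall_pending model */

static void deliver(void)                      /* invented stand-in for the   */
{                                              /* HVM/PV kick paths above     */
    puts("kick delivered");
}

static void mark_events_pending(void)
{
    /* fetch_or returns the old value, like test_and_set_bit(). */
    if ( atomic_fetch_or(&upcall_pending, 1) & 1 )
        return;                                /* already pending: no kick    */

    deliver();
}

int main(void)
{
    mark_events_pending();                     /* delivers                    */
    mark_events_pending();                     /* suppressed: still pending   */
    return 0;
}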
2070
2071
static void vcpu_kick_softirq(void)
2072
34.4k
{
2073
34.4k
    /*
2074
34.4k
     * Nothing to do here: we merely prevent notifiers from racing with checks
2075
34.4k
     * executed on return to guest context with interrupts enabled. See, for
2076
34.4k
     * example, xxx_intr_assist() executed on return to HVM guest context.
2077
34.4k
     */
2078
34.4k
}
2079
2080
static int __init init_vcpu_kick_softirq(void)
2081
1
{
2082
1
    open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
2083
1
    return 0;
2084
1
}
2085
__initcall(init_vcpu_kick_softirq);
2086
2087
2088
/*
2089
 * Local variables:
2090
 * mode: C
2091
 * c-file-style: "BSD"
2092
 * c-basic-offset: 4
2093
 * tab-width: 4
2094
 * indent-tabs-mode: nil
2095
 * End:
2096
 */