debuggers.hg
view of xen/arch/x86/domain.c @ changeset 22906:700ac6445812
description: Now add KDB to the non-kdb tree
author:      Mukesh Rathor
date:        Thu Feb 03 15:42:41 2011 -0800
parent:      97ab84aca65c
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/domain.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/softirq.h>
23 #include <xen/grant_table.h>
24 #include <xen/iocap.h>
25 #include <xen/kernel.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <xen/compat.h>
32 #include <xen/acpi.h>
33 #include <xen/pci.h>
34 #include <xen/paging.h>
35 #include <xen/cpu.h>
36 #include <xen/wait.h>
37 #include <public/sysctl.h>
38 #include <asm/regs.h>
39 #include <asm/mc146818rtc.h>
40 #include <asm/system.h>
41 #include <asm/io.h>
42 #include <asm/processor.h>
43 #include <asm/desc.h>
44 #include <asm/i387.h>
45 #include <asm/mpspec.h>
46 #include <asm/ldt.h>
47 #include <asm/hypercall.h>
48 #include <asm/hvm/hvm.h>
49 #include <asm/hvm/support.h>
50 #include <asm/debugreg.h>
51 #include <asm/msr.h>
52 #include <asm/traps.h>
53 #include <asm/nmi.h>
54 #include <asm/mce.h>
55 #include <xen/numa.h>
56 #include <xen/iommu.h>
57 #ifdef CONFIG_COMPAT
58 #include <compat/vcpu.h>
59 #endif
61 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
62 DEFINE_PER_CPU(unsigned long, cr4);
64 static void default_idle(void);
65 static void default_dead_idle(void);
66 void (*pm_idle) (void) __read_mostly = default_idle;
67 void (*dead_idle) (void) __read_mostly = default_dead_idle;
69 static void paravirt_ctxt_switch_from(struct vcpu *v);
70 static void paravirt_ctxt_switch_to(struct vcpu *v);
72 static void vcpu_destroy_pagetables(struct vcpu *v);
74 static void continue_idle_domain(struct vcpu *v)
75 {
76 reset_stack_and_jump(idle_loop);
77 }
79 static void continue_nonidle_domain(struct vcpu *v)
80 {
81 check_wakeup_from_wait();
82 reset_stack_and_jump(ret_from_intr);
83 }
85 static void default_idle(void)
86 {
87 local_irq_disable();
88 if ( cpu_is_haltable(smp_processor_id()) )
89 safe_halt();
90 else
91 local_irq_enable();
92 }
94 static void default_dead_idle(void)
95 {
96 for ( ; ; )
97 halt();
98 }
100 static void play_dead(void)
101 {
102 cpu_exit_clear(smp_processor_id());
103 mb();
104 local_irq_disable();
105 wbinvd();
106 (*dead_idle)();
107 }
109 void idle_loop(void)
110 {
111 for ( ; ; )
112 {
113 if ( cpu_is_offline(smp_processor_id()) )
114 play_dead();
115 (*pm_idle)();
116 do_tasklet();
117 do_softirq();
118 }
119 }
121 void startup_cpu_idle_loop(void)
122 {
123 struct vcpu *v = current;
125 ASSERT(is_idle_vcpu(v));
126 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
127 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
129 reset_stack_and_jump(idle_loop);
130 }
132 void dump_pageframe_info(struct domain *d)
133 {
134 struct page_info *page;
136 printk("Memory pages belonging to domain %u:\n", d->domain_id);
138 if ( d->tot_pages >= 10 )
139 {
140 printk(" DomPage list too long to display\n");
141 }
142 else
143 {
144 spin_lock(&d->page_alloc_lock);
145 page_list_for_each ( page, &d->page_list )
146 {
147 printk(" DomPage %p: caf=%08lx, taf=%" PRtype_info "\n",
148 _p(page_to_mfn(page)),
149 page->count_info, page->u.inuse.type_info);
150 }
151 spin_unlock(&d->page_alloc_lock);
152 }
154 if ( is_hvm_domain(d) )
155 {
156 p2m_pod_dump_data(p2m_get_hostp2m(d));
157 }
159 spin_lock(&d->page_alloc_lock);
160 page_list_for_each ( page, &d->xenpage_list )
161 {
162 printk(" XenPage %p: caf=%08lx, taf=%" PRtype_info "\n",
163 _p(page_to_mfn(page)),
164 page->count_info, page->u.inuse.type_info);
165 }
166 spin_unlock(&d->page_alloc_lock);
167 }
169 struct domain *alloc_domain_struct(void)
170 {
171 struct domain *d;
172 /*
173 * We pack the PDX of the domain structure into a 32-bit field within
174 * the page_info structure. Hence the MEMF_bits() restriction.
175 */
176 unsigned int bits = 32 + PAGE_SHIFT;
178 #ifdef __x86_64__
179 bits += pfn_pdx_hole_shift;
180 #endif
181 d = alloc_xenheap_pages(get_order_from_bytes(sizeof(*d)), MEMF_bits(bits));
182 if ( d != NULL )
183 memset(d, 0, sizeof(*d));
184 return d;
185 }
187 void free_domain_struct(struct domain *d)
188 {
189 lock_profile_deregister_struct(LOCKPROF_TYPE_PERDOM, d);
190 free_xenheap_pages(d, get_order_from_bytes(sizeof(*d)));
191 }
193 struct vcpu *alloc_vcpu_struct(void)
194 {
195 struct vcpu *v;
196 /*
197 * This structure contains embedded PAE PDPTEs, used when an HVM guest
198 * runs on shadow pagetables outside of 64-bit mode. In this case the CPU
199 * may require that the shadow CR3 points below 4GB, and hence the whole
200 * structure must satisfy this restriction. Thus we specify MEMF_bits(32).
201 */
202 v = alloc_xenheap_pages(get_order_from_bytes(sizeof(*v)), MEMF_bits(32));
203 if ( v != NULL )
204 memset(v, 0, sizeof(*v));
205 return v;
206 }
208 void free_vcpu_struct(struct vcpu *v)
209 {
210 free_xenheap_pages(v, get_order_from_bytes(sizeof(*v)));
211 }
213 #ifdef __x86_64__
215 static int setup_compat_l4(struct vcpu *v)
216 {
217 struct page_info *pg;
218 l4_pgentry_t *l4tab;
219 int rc;
221 pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v)));
222 if ( pg == NULL )
223 return -ENOMEM;
225 rc = setup_compat_arg_xlat(v);
226 if ( rc )
227 {
228 free_domheap_page(pg);
229 return rc;
230 }
232 /* This page needs to look like a pagetable so that it can be shadowed */
233 pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
235 l4tab = page_to_virt(pg);
236 copy_page(l4tab, idle_pg_table);
237 l4tab[0] = l4e_empty();
238 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
239 l4e_from_page(pg, __PAGE_HYPERVISOR);
240 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
241 l4e_from_paddr(__pa(v->domain->arch.mm_perdomain_l3),
242 __PAGE_HYPERVISOR);
244 v->arch.guest_table = pagetable_from_page(pg);
245 v->arch.guest_table_user = v->arch.guest_table;
247 return 0;
248 }
250 static void release_compat_l4(struct vcpu *v)
251 {
252 free_compat_arg_xlat(v);
253 free_domheap_page(pagetable_get_page(v->arch.guest_table));
254 v->arch.guest_table = pagetable_null();
255 v->arch.guest_table_user = pagetable_null();
256 }
258 static inline int may_switch_mode(struct domain *d)
259 {
260 return (!is_hvm_domain(d) && (d->tot_pages == 0));
261 }
263 int switch_native(struct domain *d)
264 {
265 unsigned int vcpuid;
267 if ( d == NULL )
268 return -EINVAL;
269 if ( !may_switch_mode(d) )
270 return -EACCES;
271 if ( !is_pv_32on64_domain(d) )
272 return 0;
274 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
276 for ( vcpuid = 0; vcpuid < d->max_vcpus; vcpuid++ )
277 {
278 if (d->vcpu[vcpuid])
279 release_compat_l4(d->vcpu[vcpuid]);
280 }
282 return 0;
283 }
285 int switch_compat(struct domain *d)
286 {
287 unsigned int vcpuid;
289 if ( d == NULL )
290 return -EINVAL;
291 if ( !may_switch_mode(d) )
292 return -EACCES;
293 if ( is_pv_32on64_domain(d) )
294 return 0;
296 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
298 for ( vcpuid = 0; vcpuid < d->max_vcpus; vcpuid++ )
299 {
300 if ( (d->vcpu[vcpuid] != NULL) &&
301 (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
302 goto undo_and_fail;
303 }
305 domain_set_alloc_bitsize(d);
307 return 0;
309 undo_and_fail:
310 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
311 while ( vcpuid-- != 0 )
312 {
313 if ( d->vcpu[vcpuid] != NULL )
314 release_compat_l4(d->vcpu[vcpuid]);
315 }
316 return -ENOMEM;
317 }
319 #else
320 #define setup_compat_l4(v) 0
321 #define release_compat_l4(v) ((void)0)
322 #endif
324 int vcpu_initialise(struct vcpu *v)
325 {
326 struct domain *d = v->domain;
327 int rc;
329 v->arch.vcpu_info_mfn = INVALID_MFN;
331 v->arch.flags = TF_kernel_mode;
333 #if defined(__i386__)
334 mapcache_vcpu_init(v);
335 #else
336 {
337 unsigned int idx = perdomain_pt_pgidx(v);
338 struct page_info *pg;
340 if ( !perdomain_pt_page(d, idx) )
341 {
342 pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v)));
343 if ( !pg )
344 return -ENOMEM;
345 clear_page(page_to_virt(pg));
346 perdomain_pt_page(d, idx) = pg;
347 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+idx]
348 = l2e_from_page(pg, __PAGE_HYPERVISOR);
349 }
350 }
351 #endif
353 pae_l3_cache_init(&v->arch.pae_l3_cache);
355 paging_vcpu_init(v);
357 v->arch.perdomain_ptes = perdomain_ptes(d, v);
359 spin_lock_init(&v->arch.shadow_ldt_lock);
361 if ( (rc = xsave_alloc_save_area(v)) != 0 )
362 return rc;
364 if ( is_hvm_domain(d) )
365 {
366 if ( (rc = hvm_vcpu_initialise(v)) != 0 )
367 xsave_free_save_area(v);
368 return rc;
369 }
371 /* PV guests by default have a 100Hz ticker. */
372 if ( !is_idle_domain(d) )
373 v->periodic_period = MILLISECS(10);
375 /* PV guests get an emulated PIT too for video BIOSes to use. */
376 if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
377 pit_init(v, cpu_khz);
379 v->arch.schedule_tail = continue_nonidle_domain;
380 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
381 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
383 if ( is_idle_domain(d) )
384 {
385 v->arch.schedule_tail = continue_idle_domain;
386 v->arch.cr3 = __pa(idle_pg_table);
387 }
389 v->arch.guest_context.ctrlreg[4] =
390 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
392 rc = is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0;
393 if ( rc )
394 xsave_free_save_area(v);
396 return rc;
397 }
399 void vcpu_destroy(struct vcpu *v)
400 {
401 if ( is_pv_32on64_vcpu(v) )
402 release_compat_l4(v);
404 xsave_free_save_area(v);
406 if ( is_hvm_vcpu(v) )
407 hvm_vcpu_destroy(v);
408 }
410 int arch_domain_create(struct domain *d, unsigned int domcr_flags)
411 {
412 #ifdef __x86_64__
413 struct page_info *pg;
414 #else
415 int pdpt_order;
416 #endif
417 int i, paging_initialised = 0;
418 int rc = -ENOMEM;
420 d->arch.hvm_domain.hap_enabled =
421 is_hvm_domain(d) &&
422 hvm_funcs.hap_supported &&
423 (domcr_flags & DOMCRF_hap);
424 d->arch.hvm_domain.mem_sharing_enabled = 0;
426 d->arch.s3_integrity = !!(domcr_flags & DOMCRF_s3_integrity);
428 INIT_LIST_HEAD(&d->arch.pdev_list);
430 d->arch.relmem = RELMEM_not_started;
431 INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
433 #if defined(__i386__)
435 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
436 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order, 0);
437 if ( d->arch.mm_perdomain_pt == NULL )
438 goto fail;
439 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
441 mapcache_domain_init(d);
443 #else /* __x86_64__ */
445 BUILD_BUG_ON(PDPT_L2_ENTRIES * sizeof(*d->arch.mm_perdomain_pt_pages)
446 != PAGE_SIZE);
447 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
448 if ( !pg )
449 goto fail;
450 d->arch.mm_perdomain_pt_pages = page_to_virt(pg);
451 clear_page(d->arch.mm_perdomain_pt_pages);
453 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
454 if ( pg == NULL )
455 goto fail;
456 d->arch.mm_perdomain_l2 = page_to_virt(pg);
457 clear_page(d->arch.mm_perdomain_l2);
459 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
460 if ( pg == NULL )
461 goto fail;
462 d->arch.mm_perdomain_l3 = page_to_virt(pg);
463 clear_page(d->arch.mm_perdomain_l3);
464 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
465 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
466 __PAGE_HYPERVISOR);
468 HYPERVISOR_COMPAT_VIRT_START(d) =
469 is_hvm_domain(d) ? ~0u : __HYPERVISOR_COMPAT_VIRT_START;
471 #endif /* __x86_64__ */
473 if ( (rc = paging_domain_init(d, domcr_flags)) != 0 )
474 goto fail;
475 paging_initialised = 1;
477 if ( !is_idle_domain(d) )
478 {
479 d->arch.ioport_caps =
480 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
481 rc = -ENOMEM;
482 if ( d->arch.ioport_caps == NULL )
483 goto fail;
485 /*
486 * The shared_info machine address must fit in a 32-bit field within a
487 * 32-bit guest's start_info structure. Hence we specify MEMF_bits(32).
488 */
489 if ( (d->shared_info = alloc_xenheap_pages(0, MEMF_bits(32))) == NULL )
490 goto fail;
492 clear_page(d->shared_info);
493 share_xen_page_with_guest(
494 virt_to_page(d->shared_info), d, XENSHARE_writable);
496 d->arch.pirq_irq = xmalloc_array(int, d->nr_pirqs);
497 if ( !d->arch.pirq_irq )
498 goto fail;
499 memset(d->arch.pirq_irq, 0,
500 d->nr_pirqs * sizeof(*d->arch.pirq_irq));
502 d->arch.irq_pirq = xmalloc_array(int, nr_irqs);
503 if ( !d->arch.irq_pirq )
504 goto fail;
505 memset(d->arch.irq_pirq, 0,
506 nr_irqs * sizeof(*d->arch.irq_pirq));
508 for ( i = 1; platform_legacy_irq(i); ++i )
509 if ( !IO_APIC_IRQ(i) )
510 d->arch.irq_pirq[i] = d->arch.pirq_irq[i] = i;
512 if ( is_hvm_domain(d) )
513 {
514 d->arch.pirq_emuirq = xmalloc_array(int, d->nr_pirqs);
515 d->arch.emuirq_pirq = xmalloc_array(int, nr_irqs);
516 if ( !d->arch.pirq_emuirq || !d->arch.emuirq_pirq )
517 goto fail;
518 for (i = 0; i < d->nr_pirqs; i++)
519 d->arch.pirq_emuirq[i] = IRQ_UNBOUND;
520 for (i = 0; i < nr_irqs; i++)
521 d->arch.emuirq_pirq[i] = IRQ_UNBOUND;
522 }
525 if ( (rc = iommu_domain_init(d)) != 0 )
526 goto fail;
528 /* For Guest vMCE MSRs virtualization */
529 vmce_init_msr(d);
530 }
532 if ( is_hvm_domain(d) )
533 {
534 if ( (rc = hvm_domain_initialise(d)) != 0 )
535 {
536 iommu_domain_destroy(d);
537 goto fail;
538 }
539 }
540 else
541 {
542 /* 32-bit PV guest by default only if Xen is not 64-bit. */
543 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
544 (CONFIG_PAGING_LEVELS != 4);
545 }
547 memset(d->arch.cpuids, 0, sizeof(d->arch.cpuids));
548 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
549 {
550 d->arch.cpuids[i].input[0] = XEN_CPUID_INPUT_UNUSED;
551 d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
552 }
554 /* initialize default tsc behavior in case tools don't */
555 tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0);
556 spin_lock_init(&d->arch.vtsc_lock);
558 return 0;
560 fail:
561 d->is_dying = DOMDYING_dead;
562 vmce_destroy_msr(d);
563 xfree(d->arch.pirq_irq);
564 xfree(d->arch.irq_pirq);
565 xfree(d->arch.pirq_emuirq);
566 xfree(d->arch.emuirq_pirq);
567 free_xenheap_page(d->shared_info);
568 if ( paging_initialised )
569 paging_final_teardown(d);
570 #ifdef __x86_64__
571 if ( d->arch.mm_perdomain_l2 )
572 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
573 if ( d->arch.mm_perdomain_l3 )
574 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
575 if ( d->arch.mm_perdomain_pt_pages )
576 free_domheap_page(virt_to_page(d->arch.mm_perdomain_pt_pages));
577 #else
578 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
579 #endif
580 return rc;
581 }
583 void arch_domain_destroy(struct domain *d)
584 {
585 #ifdef __x86_64__
586 unsigned int i;
587 #endif
589 if ( is_hvm_domain(d) )
590 hvm_domain_destroy(d);
592 vmce_destroy_msr(d);
593 pci_release_devices(d);
594 free_domain_pirqs(d);
595 if ( !is_idle_domain(d) )
596 iommu_domain_destroy(d);
598 paging_final_teardown(d);
600 #ifdef __i386__
601 free_xenheap_pages(
602 d->arch.mm_perdomain_pt,
603 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
604 #else
605 for ( i = 0; i < PDPT_L2_ENTRIES; ++i )
606 {
607 if ( perdomain_pt_page(d, i) )
608 free_domheap_page(perdomain_pt_page(d, i));
609 }
610 free_domheap_page(virt_to_page(d->arch.mm_perdomain_pt_pages));
611 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
612 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
613 #endif
615 free_xenheap_page(d->shared_info);
616 xfree(d->arch.pirq_irq);
617 xfree(d->arch.irq_pirq);
618 xfree(d->arch.pirq_emuirq);
619 xfree(d->arch.emuirq_pirq);
620 }
622 unsigned long pv_guest_cr4_fixup(const struct vcpu *v, unsigned long guest_cr4)
623 {
624 unsigned long hv_cr4_mask, hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4());
626 hv_cr4_mask = ~X86_CR4_TSD;
627 if ( cpu_has_de )
628 hv_cr4_mask &= ~X86_CR4_DE;
629 if ( cpu_has_fsgsbase && !is_pv_32bit_domain(v->domain) )
630 hv_cr4_mask &= ~X86_CR4_FSGSBASE;
631 if ( cpu_has_xsave )
632 hv_cr4_mask &= ~X86_CR4_OSXSAVE;
634 if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) )
635 gdprintk(XENLOG_WARNING,
636 "Attempt to change CR4 flags %08lx -> %08lx\n",
637 hv_cr4, guest_cr4);
639 return (hv_cr4 & hv_cr4_mask) | (guest_cr4 & ~hv_cr4_mask);
640 }
642 /* This is called by arch_final_setup_guest and do_boot_vcpu */
643 int arch_set_info_guest(
644 struct vcpu *v, vcpu_guest_context_u c)
645 {
646 struct domain *d = v->domain;
647 unsigned long cr3_pfn = INVALID_MFN;
648 unsigned long flags, cr4;
649 int i, rc = 0, compat;
651 /* The context is a compat-mode one if the target domain is compat-mode;
652 * we expect the tools to DTRT even in compat-mode callers. */
653 compat = is_pv_32on64_domain(d);
655 #ifdef CONFIG_COMPAT
656 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
657 #else
658 #define c(fld) (c.nat->fld)
659 #endif
660 flags = c(flags);
662 if ( !is_hvm_vcpu(v) )
663 {
664 if ( !compat )
665 {
666 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
667 fixup_guest_stack_selector(d, c.nat->kernel_ss);
668 fixup_guest_code_selector(d, c.nat->user_regs.cs);
669 #ifdef __i386__
670 fixup_guest_code_selector(d, c.nat->event_callback_cs);
671 fixup_guest_code_selector(d, c.nat->failsafe_callback_cs);
672 #endif
674 for ( i = 0; i < 256; i++ )
675 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
677 /* LDT safety checks. */
678 if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) ||
679 (c.nat->ldt_ents > 8192) ||
680 !array_access_ok(c.nat->ldt_base,
681 c.nat->ldt_ents,
682 LDT_ENTRY_SIZE) )
683 return -EINVAL;
684 }
685 #ifdef CONFIG_COMPAT
686 else
687 {
688 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
689 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
690 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
691 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
692 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
694 for ( i = 0; i < 256; i++ )
695 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
697 /* LDT safety checks. */
698 if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) ||
699 (c.cmp->ldt_ents > 8192) ||
700 !compat_array_access_ok(c.cmp->ldt_base,
701 c.cmp->ldt_ents,
702 LDT_ENTRY_SIZE) )
703 return -EINVAL;
704 }
705 #endif
706 }
708 v->fpu_initialised = !!(flags & VGCF_I387_VALID);
710 v->arch.flags &= ~TF_kernel_mode;
711 if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
712 v->arch.flags |= TF_kernel_mode;
714 if ( !compat )
715 memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat));
716 #ifdef CONFIG_COMPAT
717 else
718 XLAT_vcpu_guest_context(&v->arch.guest_context, c.cmp);
719 #endif
721 v->arch.guest_context.user_regs.eflags |= 2;
723 if ( is_hvm_vcpu(v) )
724 {
725 hvm_set_info_guest(v);
726 goto out;
727 }
729 /* Only CR0.TS is modifiable by guest or admin. */
730 v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
731 v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
733 init_int80_direct_trap(v);
735 /* IOPL privileges are virtualised. */
736 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
737 v->arch.guest_context.user_regs.eflags &= ~X86_EFLAGS_IOPL;
739 /* Ensure real hardware interrupts are enabled. */
740 v->arch.guest_context.user_regs.eflags |= X86_EFLAGS_IF;
742 cr4 = v->arch.guest_context.ctrlreg[4];
743 v->arch.guest_context.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(v, cr4) :
744 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
746 memset(v->arch.guest_context.debugreg, 0,
747 sizeof(v->arch.guest_context.debugreg));
748 for ( i = 0; i < 8; i++ )
749 (void)set_debugreg(v, i, c(debugreg[i]));
751 if ( v->is_initialised )
752 goto out;
754 if ( v->vcpu_id == 0 )
755 d->vm_assist = c(vm_assist);
757 if ( !compat )
758 rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
759 #ifdef CONFIG_COMPAT
760 else
761 {
762 unsigned long gdt_frames[ARRAY_SIZE(c.cmp->gdt_frames)];
763 unsigned int i, n = (c.cmp->gdt_ents + 511) / 512;
765 if ( n > ARRAY_SIZE(c.cmp->gdt_frames) )
766 return -EINVAL;
767 for ( i = 0; i < n; ++i )
768 gdt_frames[i] = c.cmp->gdt_frames[i];
769 rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents);
770 }
771 #endif
772 if ( rc != 0 )
773 return rc;
775 if ( !compat )
776 {
777 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[3]));
779 if ( !mfn_valid(cr3_pfn) ||
780 (paging_mode_refcounts(d)
781 ? !get_page(mfn_to_page(cr3_pfn), d)
782 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
783 PGT_base_page_table)) )
784 {
785 destroy_gdt(v);
786 return -EINVAL;
787 }
789 v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
791 #ifdef __x86_64__
792 if ( c.nat->ctrlreg[1] )
793 {
794 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[1]));
796 if ( !mfn_valid(cr3_pfn) ||
797 (paging_mode_refcounts(d)
798 ? !get_page(mfn_to_page(cr3_pfn), d)
799 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
800 PGT_base_page_table)) )
801 {
802 cr3_pfn = pagetable_get_pfn(v->arch.guest_table);
803 v->arch.guest_table = pagetable_null();
804 if ( paging_mode_refcounts(d) )
805 put_page(mfn_to_page(cr3_pfn));
806 else
807 put_page_and_type(mfn_to_page(cr3_pfn));
808 destroy_gdt(v);
809 return -EINVAL;
810 }
812 v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn);
813 }
814 }
815 else
816 {
817 l4_pgentry_t *l4tab;
819 cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
821 if ( !mfn_valid(cr3_pfn) ||
822 (paging_mode_refcounts(d)
823 ? !get_page(mfn_to_page(cr3_pfn), d)
824 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
825 PGT_l3_page_table)) )
826 {
827 destroy_gdt(v);
828 return -EINVAL;
829 }
831 l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
832 *l4tab = l4e_from_pfn(
833 cr3_pfn, _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
834 #endif
835 }
837 if ( v->vcpu_id == 0 )
838 update_domain_wallclock_time(d);
840 /* Don't redo final setup */
841 v->is_initialised = 1;
843 if ( paging_mode_enabled(d) )
844 paging_update_paging_modes(v);
846 update_cr3(v);
848 out:
849 if ( flags & VGCF_online )
850 clear_bit(_VPF_down, &v->pause_flags);
851 else
852 set_bit(_VPF_down, &v->pause_flags);
853 return 0;
854 #undef c
855 }
857 void arch_vcpu_reset(struct vcpu *v)
858 {
859 if ( !is_hvm_vcpu(v) )
860 {
861 destroy_gdt(v);
862 vcpu_destroy_pagetables(v);
863 }
864 else
865 {
866 vcpu_end_shutdown_deferral(v);
867 }
868 }
870 /*
871 * Unmap the vcpu info page if the guest decided to place it somewhere
872 * else. This is only used from arch_domain_destroy, so there's no
873 * need to do anything clever.
874 */
875 static void
876 unmap_vcpu_info(struct vcpu *v)
877 {
878 unsigned long mfn;
880 if ( v->arch.vcpu_info_mfn == INVALID_MFN )
881 return;
883 mfn = v->arch.vcpu_info_mfn;
884 unmap_domain_page_global(v->vcpu_info);
886 v->vcpu_info = &dummy_vcpu_info;
887 v->arch.vcpu_info_mfn = INVALID_MFN;
889 put_page_and_type(mfn_to_page(mfn));
890 }
892 /*
893 * Map a guest page in and point the vcpu_info pointer at it. This
894 * makes sure that the vcpu_info is always pointing at a valid piece
895 * of memory, and it sets a pending event to make sure that a pending
896 * event doesn't get missed.
897 */
898 static int
899 map_vcpu_info(struct vcpu *v, unsigned long mfn, unsigned offset)
900 {
901 struct domain *d = v->domain;
902 void *mapping;
903 vcpu_info_t *new_info;
904 int i;
906 if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) )
907 return -EINVAL;
909 if ( v->arch.vcpu_info_mfn != INVALID_MFN )
910 return -EINVAL;
912 /* Run this command on yourself or on other offline VCPUS. */
913 if ( (v != current) && !test_bit(_VPF_down, &v->pause_flags) )
914 return -EINVAL;
916 mfn = gmfn_to_mfn(d, mfn);
917 if ( !mfn_valid(mfn) ||
918 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
919 return -EINVAL;
921 mapping = map_domain_page_global(mfn);
922 if ( mapping == NULL )
923 {
924 put_page_and_type(mfn_to_page(mfn));
925 return -ENOMEM;
926 }
928 new_info = (vcpu_info_t *)(mapping + offset);
930 if ( v->vcpu_info == &dummy_vcpu_info )
931 {
932 memset(new_info, 0, sizeof(*new_info));
933 __vcpu_info(v, new_info, evtchn_upcall_mask) = 1;
934 }
935 else
936 {
937 memcpy(new_info, v->vcpu_info, sizeof(*new_info));
938 }
940 v->vcpu_info = new_info;
941 v->arch.vcpu_info_mfn = mfn;
943 /* Set new vcpu_info pointer /before/ setting pending flags. */
944 wmb();
946 /*
947 * Mark everything as being pending just to make sure nothing gets
948 * lost. The domain will get a spurious event, but it can cope.
949 */
950 vcpu_info(v, evtchn_upcall_pending) = 1;
951 for ( i = 0; i < BITS_PER_EVTCHN_WORD(d); i++ )
952 set_bit(i, &vcpu_info(v, evtchn_pending_sel));
954 return 0;
955 }
957 long
958 arch_do_vcpu_op(
959 int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
960 {
961 long rc = 0;
963 switch ( cmd )
964 {
965 case VCPUOP_register_runstate_memory_area:
966 {
967 struct vcpu_register_runstate_memory_area area;
968 struct vcpu_runstate_info runstate;
970 rc = -EFAULT;
971 if ( copy_from_guest(&area, arg, 1) )
972 break;
974 if ( !guest_handle_okay(area.addr.h, 1) )
975 break;
977 rc = 0;
978 runstate_guest(v) = area.addr.h;
980 if ( v == current )
981 {
982 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
983 }
984 else
985 {
986 vcpu_runstate_get(v, &runstate);
987 __copy_to_guest(runstate_guest(v), &runstate, 1);
988 }
990 break;
991 }
993 case VCPUOP_register_vcpu_info:
994 {
995 struct domain *d = v->domain;
996 struct vcpu_register_vcpu_info info;
998 rc = -EFAULT;
999 if ( copy_from_guest(&info, arg, 1) )
1000 break;
1002 domain_lock(d);
1003 rc = map_vcpu_info(v, info.mfn, info.offset);
1004 domain_unlock(d);
1006 break;
1007 }
1009 /*
1010 * XXX Disable for 4.0.0: __update_vcpu_system_time() writes to the given
1011 * virtual address even when running in another domain's address space.
1012 */
1013 #if 0
1014 case VCPUOP_register_vcpu_time_memory_area:
1015 {
1016 struct vcpu_register_time_memory_area area;
1018 rc = -EFAULT;
1019 if ( copy_from_guest(&area, arg, 1) )
1020 break;
1022 if ( !guest_handle_okay(area.addr.h, 1) )
1023 break;
1025 rc = 0;
1026 v->arch.time_info_guest = area.addr.h;
1028 force_update_vcpu_system_time(v);
1030 break;
1031 }
1032 #endif
1034 case VCPUOP_get_physid:
1035 {
1036 struct vcpu_get_physid cpu_id;
1038 rc = -EINVAL;
1039 if ( !is_pinned_vcpu(v) )
1040 break;
1042 cpu_id.phys_id =
1043 (uint64_t)x86_cpu_to_apicid[v->vcpu_id] |
1044 ((uint64_t)acpi_get_processor_id(v->vcpu_id) << 32);
1046 rc = -EFAULT;
1047 if ( copy_to_guest(arg, &cpu_id, 1) )
1048 break;
1050 rc = 0;
1051 break;
1052 }
1054 default:
1055 rc = -ENOSYS;
1056 break;
1057 }
1059 return rc;
1060 }
1062 #ifdef __x86_64__
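/*
 * Editor's note on the macro below: loadsegment(seg, value) loads @value into
 * segment register @seg. If the load faults, the exception fixup path loads
 * the null selector instead and the macro evaluates to 0; otherwise it
 * evaluates to 1.
 */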
1064 #define loadsegment(seg,value) ({ \
1065 int __r = 1; \
1066 asm volatile ( \
1067 "1: movl %k1,%%" #seg "\n2:\n" \
1068 ".section .fixup,\"ax\"\n" \
1069 "3: xorl %k0,%k0\n" \
1070 " movl %k0,%%" #seg "\n" \
1071 " jmp 2b\n" \
1072 ".previous\n" \
1073 _ASM_EXTABLE(1b, 3b) \
1074 : "=r" (__r) : "r" (value), "0" (__r) );\
1075 __r; })
1077 /*
1078 * save_segments() writes a mask of segments which are dirty (non-zero),
1079 * allowing load_segments() to avoid some expensive segment loads and
1080 * MSR writes.
1081 */
1082 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
1083 #define DIRTY_DS 0x01
1084 #define DIRTY_ES 0x02
1085 #define DIRTY_FS 0x04
1086 #define DIRTY_GS 0x08
1087 #define DIRTY_FS_BASE 0x10
1088 #define DIRTY_GS_BASE_USER 0x20
1090 static void load_segments(struct vcpu *n)
1092 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
1093 int all_segs_okay = 1;
1094 unsigned int dirty_segment_mask, cpu = smp_processor_id();
1096 /* Load and clear the dirty segment mask. */
1097 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
1098 per_cpu(dirty_segment_mask, cpu) = 0;
1100 /* Either selector != 0 ==> reload. */
1101 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
1102 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
1104 /* Either selector != 0 ==> reload. */
1105 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
1106 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
1108 /*
1109 * Either selector != 0 ==> reload.
1110 * Also reload to reset FS_BASE if it was non-zero.
1111 */
1112 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
1113 nctxt->user_regs.fs) )
1114 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
1116 /*
1117 * Either selector != 0 ==> reload.
1118 * Also reload to reset GS_BASE if it was non-zero.
1119 */
1120 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
1121 nctxt->user_regs.gs) )
1123 /* Reset GS_BASE with user %gs? */
1124 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
1125 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
1128 if ( !is_pv_32on64_domain(n->domain) )
1130 /* This can only be non-zero if selector is NULL. */
1131 if ( nctxt->fs_base )
1132 wrmsrl(MSR_FS_BASE, nctxt->fs_base);
1134 /* Most kernels have non-zero GS base, so don't bother testing. */
1135 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
1136 wrmsrl(MSR_SHADOW_GS_BASE, nctxt->gs_base_kernel);
1138 /* This can only be non-zero if selector is NULL. */
1139 if ( nctxt->gs_base_user )
1140 wrmsrl(MSR_GS_BASE, nctxt->gs_base_user);
1142 /* If in kernel mode then switch the GS bases around. */
1143 if ( (n->arch.flags & TF_kernel_mode) )
1144 asm volatile ( "swapgs" );
1147 if ( unlikely(!all_segs_okay) )
1149 struct cpu_user_regs *regs = guest_cpu_user_regs();
1150 unsigned long *rsp =
1151 (n->arch.flags & TF_kernel_mode) ?
1152 (unsigned long *)regs->rsp :
1153 (unsigned long *)nctxt->kernel_sp;
1154 unsigned long cs_and_mask, rflags;
1156 if ( is_pv_32on64_domain(n->domain) )
1158 unsigned int *esp = ring_1(regs) ?
1159 (unsigned int *)regs->rsp :
1160 (unsigned int *)nctxt->kernel_sp;
1161 unsigned int cs_and_mask, eflags;
1162 int ret = 0;
1164 /* CS longword also contains full evtchn_upcall_mask. */
1165 cs_and_mask = (unsigned short)regs->cs |
1166 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
1167 /* Fold upcall mask into RFLAGS.IF. */
1168 eflags = regs->_eflags & ~X86_EFLAGS_IF;
1169 eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1171 if ( !ring_1(regs) )
1173 ret = put_user(regs->ss, esp-1);
1174 ret |= put_user(regs->_esp, esp-2);
1175 esp -= 2;
1178 if ( ret |
1179 put_user(eflags, esp-1) |
1180 put_user(cs_and_mask, esp-2) |
1181 put_user(regs->_eip, esp-3) |
1182 put_user(nctxt->user_regs.gs, esp-4) |
1183 put_user(nctxt->user_regs.fs, esp-5) |
1184 put_user(nctxt->user_regs.es, esp-6) |
1185 put_user(nctxt->user_regs.ds, esp-7) )
1187 gdprintk(XENLOG_ERR, "Error while creating compat "
1188 "failsafe callback frame.\n");
1189 domain_crash(n->domain);
1192 if ( test_bit(_VGCF_failsafe_disables_events,
1193 &n->arch.guest_context.flags) )
1194 vcpu_info(n, evtchn_upcall_mask) = 1;
1196 regs->entry_vector = TRAP_syscall;
1197 regs->_eflags &= 0xFFFCBEFFUL;
1198 regs->ss = FLAT_COMPAT_KERNEL_SS;
1199 regs->_esp = (unsigned long)(esp-7);
1200 regs->cs = FLAT_COMPAT_KERNEL_CS;
1201 regs->_eip = nctxt->failsafe_callback_eip;
1202 return;
1205 if ( !(n->arch.flags & TF_kernel_mode) )
1206 toggle_guest_mode(n);
1207 else
1208 regs->cs &= ~3;
1210 /* CS longword also contains full evtchn_upcall_mask. */
1211 cs_and_mask = (unsigned long)regs->cs |
1212 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1214 /* Fold upcall mask into RFLAGS.IF. */
1215 rflags = regs->rflags & ~X86_EFLAGS_IF;
1216 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1218 if ( put_user(regs->ss, rsp- 1) |
1219 put_user(regs->rsp, rsp- 2) |
1220 put_user(rflags, rsp- 3) |
1221 put_user(cs_and_mask, rsp- 4) |
1222 put_user(regs->rip, rsp- 5) |
1223 put_user(nctxt->user_regs.gs, rsp- 6) |
1224 put_user(nctxt->user_regs.fs, rsp- 7) |
1225 put_user(nctxt->user_regs.es, rsp- 8) |
1226 put_user(nctxt->user_regs.ds, rsp- 9) |
1227 put_user(regs->r11, rsp-10) |
1228 put_user(regs->rcx, rsp-11) )
1230 gdprintk(XENLOG_ERR, "Error while creating failsafe "
1231 "callback frame.\n");
1232 domain_crash(n->domain);
1235 if ( test_bit(_VGCF_failsafe_disables_events,
1236 &n->arch.guest_context.flags) )
1237 vcpu_info(n, evtchn_upcall_mask) = 1;
1239 regs->entry_vector = TRAP_syscall;
1240 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1241 X86_EFLAGS_NT|X86_EFLAGS_TF);
1242 regs->ss = FLAT_KERNEL_SS;
1243 regs->rsp = (unsigned long)(rsp-11);
1244 regs->cs = FLAT_KERNEL_CS;
1245 regs->rip = nctxt->failsafe_callback_eip;
1249 static void save_segments(struct vcpu *v)
1250 {
1251 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
1252 struct cpu_user_regs *regs = &ctxt->user_regs;
1253 unsigned int dirty_segment_mask = 0;
1255 regs->ds = read_segment_register(ds);
1256 regs->es = read_segment_register(es);
1257 regs->fs = read_segment_register(fs);
1258 regs->gs = read_segment_register(gs);
1260 if ( regs->ds )
1261 dirty_segment_mask |= DIRTY_DS;
1263 if ( regs->es )
1264 dirty_segment_mask |= DIRTY_ES;
1266 if ( regs->fs || is_pv_32on64_domain(v->domain) )
1267 {
1268 dirty_segment_mask |= DIRTY_FS;
1269 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
1270 }
1271 else if ( ctxt->fs_base )
1272 {
1273 dirty_segment_mask |= DIRTY_FS_BASE;
1274 }
1276 if ( regs->gs || is_pv_32on64_domain(v->domain) )
1277 {
1278 dirty_segment_mask |= DIRTY_GS;
1279 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
1280 }
1281 else if ( ctxt->gs_base_user )
1282 {
1283 dirty_segment_mask |= DIRTY_GS_BASE_USER;
1284 }
1286 this_cpu(dirty_segment_mask) = dirty_segment_mask;
1287 }
1289 #define switch_kernel_stack(v) ((void)0)
1291 #elif defined(__i386__)
1293 #define load_segments(n) ((void)0)
1294 #define save_segments(p) ((void)0)
1296 static inline void switch_kernel_stack(struct vcpu *v)
1297 {
1298 struct tss_struct *tss = &this_cpu(init_tss);
1299 tss->esp1 = v->arch.guest_context.kernel_sp;
1300 tss->ss1 = v->arch.guest_context.kernel_ss;
1301 }
1303 #endif /* __i386__ */
1305 static void paravirt_ctxt_switch_from(struct vcpu *v)
1306 {
1307 save_segments(v);
1309 /*
1310 * Disable debug breakpoints. We do this aggressively because if we switch
1311 * to an HVM guest we may load DR0-DR3 with values that can cause #DE
1312 * inside Xen, before we get a chance to reload DR7, and this cannot always
1313 * safely be handled.
1314 */
1315 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1316 write_debugreg(7, 0);
1317 }
1319 static void paravirt_ctxt_switch_to(struct vcpu *v)
1320 {
1321 unsigned long cr4;
1323 set_int80_direct_trap(v);
1324 switch_kernel_stack(v);
1326 cr4 = pv_guest_cr4_to_real_cr4(v);
1327 if ( unlikely(cr4 != read_cr4()) )
1328 write_cr4(cr4);
1330 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1331 {
1332 write_debugreg(0, v->arch.guest_context.debugreg[0]);
1333 write_debugreg(1, v->arch.guest_context.debugreg[1]);
1334 write_debugreg(2, v->arch.guest_context.debugreg[2]);
1335 write_debugreg(3, v->arch.guest_context.debugreg[3]);
1336 write_debugreg(6, v->arch.guest_context.debugreg[6]);
1337 write_debugreg(7, v->arch.guest_context.debugreg[7]);
1338 }
1340 if ( (v->domain->arch.tsc_mode == TSC_MODE_PVRDTSCP) &&
1341 boot_cpu_has(X86_FEATURE_RDTSCP) )
1342 write_rdtscp_aux(v->domain->arch.incarnation);
1343 }
1345 /* Update per-VCPU guest runstate shared memory area (if registered). */
1346 static void update_runstate_area(struct vcpu *v)
1347 {
1348 if ( guest_handle_is_null(runstate_guest(v)) )
1349 return;
1351 #ifdef CONFIG_COMPAT
1352 if ( has_32bit_shinfo(v->domain) )
1353 {
1354 struct compat_vcpu_runstate_info info;
1356 XLAT_vcpu_runstate_info(&info, &v->runstate);
1357 __copy_to_guest(v->runstate_guest.compat, &info, 1);
1358 return;
1359 }
1360 #endif
1362 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
1363 }
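/* Editor's note: only non-idle PV vcpus require the full per-domain GDT. */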
1365 static inline int need_full_gdt(struct vcpu *v)
1366 {
1367 return (!is_hvm_vcpu(v) && !is_idle_vcpu(v));
1368 }
1370 static void __context_switch(void)
1371 {
1372 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1373 unsigned int cpu = smp_processor_id();
1374 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1375 struct vcpu *n = current;
1376 struct desc_struct *gdt;
1377 struct desc_ptr gdt_desc;
1379 ASSERT(p != n);
1380 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
1382 if ( !is_idle_vcpu(p) )
1383 {
1384 memcpy(&p->arch.guest_context.user_regs,
1385 stack_regs,
1386 CTXT_SWITCH_STACK_BYTES);
1387 save_init_fpu(p);
1388 p->arch.ctxt_switch_from(p);
1389 }
1391 /*
1392 * Mark this CPU in next domain's dirty cpumasks before calling
1393 * ctxt_switch_to(). This avoids a race on things like EPT flushing,
1394 * which is synchronised on that function.
1395 */
1396 if ( p->domain != n->domain )
1397 cpu_set(cpu, n->domain->domain_dirty_cpumask);
1398 cpu_set(cpu, n->vcpu_dirty_cpumask);
1400 if ( !is_idle_vcpu(n) )
1401 {
1402 memcpy(stack_regs,
1403 &n->arch.guest_context.user_regs,
1404 CTXT_SWITCH_STACK_BYTES);
1405 if ( cpu_has_xsave && n->arch.xcr0 != get_xcr0() )
1406 set_xcr0(n->arch.xcr0);
1407 n->arch.ctxt_switch_to(n);
1408 }
1410 gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) :
1411 per_cpu(compat_gdt_table, cpu);
1412 if ( need_full_gdt(n) )
1413 {
1414 struct page_info *page = virt_to_page(gdt);
1415 unsigned int i;
1416 for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ )
1417 l1e_write(n->arch.perdomain_ptes +
1418 FIRST_RESERVED_GDT_PAGE + i,
1419 l1e_from_page(page + i, __PAGE_HYPERVISOR));
1420 }
1422 if ( need_full_gdt(p) &&
1423 ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(n)) )
1424 {
1425 gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
1426 gdt_desc.base = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
1427 asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1428 }
1430 write_ptbase(n);
1432 if ( need_full_gdt(n) &&
1433 ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(p)) )
1434 {
1435 gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
1436 gdt_desc.base = GDT_VIRT_START(n);
1437 asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1438 }
1440 if ( p->domain != n->domain )
1441 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
1442 cpu_clear(cpu, p->vcpu_dirty_cpumask);
1444 per_cpu(curr_vcpu, cpu) = n;
1445 }
1448 void context_switch(struct vcpu *prev, struct vcpu *next)
1449 {
1450 unsigned int cpu = smp_processor_id();
1451 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
1453 ASSERT(local_irq_is_enabled());
1455 /* Allow at most one CPU at a time to be dirty. */
1456 ASSERT(cpus_weight(dirty_mask) <= 1);
1457 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
1458 {
1459 /* Other cpus call __sync_local_execstate from flush ipi handler. */
1460 flush_tlb_mask(&dirty_mask);
1461 }
1463 if (prev != next)
1464 update_runstate_area(prev);
1466 if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1467 pt_save_timer(prev);
1469 local_irq_disable();
1471 set_current(next);
1473 if ( (per_cpu(curr_vcpu, cpu) == next) ||
1474 (is_idle_vcpu(next) && cpu_online(cpu)) )
1475 {
1476 local_irq_enable();
1477 }
1478 else
1479 {
1480 __context_switch();
1482 #ifdef CONFIG_COMPAT
1483 if ( !is_hvm_vcpu(next) &&
1484 (is_idle_vcpu(prev) ||
1485 is_hvm_vcpu(prev) ||
1486 is_pv_32on64_vcpu(prev) != is_pv_32on64_vcpu(next)) )
1487 {
1488 uint64_t efer = read_efer();
1489 if ( !(efer & EFER_SCE) )
1490 write_efer(efer | EFER_SCE);
1491 }
1492 #endif
1494 /* Re-enable interrupts before restoring state which may fault. */
1495 local_irq_enable();
1497 if ( !is_hvm_vcpu(next) )
1498 {
1499 load_LDT(next);
1500 load_segments(next);
1501 }
1502 }
1504 context_saved(prev);
1506 if (prev != next)
1507 update_runstate_area(next);
1509 schedule_tail(next);
1510 BUG();
1511 }
1513 void continue_running(struct vcpu *same)
1514 {
1515 schedule_tail(same);
1516 BUG();
1517 }
1519 int __sync_local_execstate(void)
1520 {
1521 unsigned long flags;
1522 int switch_required;
1524 local_irq_save(flags);
1526 switch_required = (this_cpu(curr_vcpu) != current);
1528 if ( switch_required )
1529 {
1530 ASSERT(current == idle_vcpu[smp_processor_id()]);
1531 __context_switch();
1532 }
1534 local_irq_restore(flags);
1536 return switch_required;
1537 }
1539 void sync_local_execstate(void)
1540 {
1541 (void)__sync_local_execstate();
1542 }
1544 void sync_vcpu_execstate(struct vcpu *v)
1545 {
1546 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
1547 sync_local_execstate();
1549 /* Other cpus call __sync_local_execstate from flush ipi handler. */
1550 flush_tlb_mask(&v->vcpu_dirty_cpumask);
1551 }
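/*
 * Editor's note on the macro below: next_arg(fmt, args) pulls the next
 * continuation argument from the va_list according to the format character:
 * 'i' = unsigned int, 'l' = unsigned long, 'h' = guest handle (passed as a
 * pointer).
 */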
1553 #define next_arg(fmt, args) ({ \
1554 unsigned long __arg; \
1555 switch ( *(fmt)++ ) \
1556 { \
1557 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
1558 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
1559 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
1560 default: __arg = 0; BUG(); \
1561 } \
1562 __arg; \
1563 })
1565 unsigned long hypercall_create_continuation(
1566 unsigned int op, const char *format, ...)
1568 struct mc_state *mcs = &current->mc_state;
1569 struct cpu_user_regs *regs;
1570 const char *p = format;
1571 unsigned long arg;
1572 unsigned int i;
1573 va_list args;
1575 va_start(args, format);
1577 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1579 __set_bit(_MCSF_call_preempted, &mcs->flags);
1581 for ( i = 0; *p != '\0'; i++ )
1582 mcs->call.args[i] = next_arg(p, args);
1583 if ( is_pv_32on64_domain(current->domain) )
1585 for ( ; i < 6; i++ )
1586 mcs->call.args[i] = 0;
1589 else
1591 regs = guest_cpu_user_regs();
1592 regs->eax = op;
1594 /* Ensure the hypercall trap instruction is re-executed. */
1595 if ( !is_hvm_vcpu(current) )
1596 regs->eip -= 2; /* re-execute 'syscall' / 'int 0x82' */
1597 else
1598 current->arch.hvm_vcpu.hcall_preempted = 1;
1600 #ifdef __x86_64__
1601 if ( !is_hvm_vcpu(current) ?
1602 !is_pv_32on64_vcpu(current) :
1603 (hvm_guest_x86_mode(current) == 8) )
1605 for ( i = 0; *p != '\0'; i++ )
1607 arg = next_arg(p, args);
1608 switch ( i )
1610 case 0: regs->rdi = arg; break;
1611 case 1: regs->rsi = arg; break;
1612 case 2: regs->rdx = arg; break;
1613 case 3: regs->r10 = arg; break;
1614 case 4: regs->r8 = arg; break;
1615 case 5: regs->r9 = arg; break;
1619 else
1620 #endif
1622 if ( supervisor_mode_kernel )
1623 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
1625 for ( i = 0; *p != '\0'; i++ )
1627 arg = next_arg(p, args);
1628 switch ( i )
1630 case 0: regs->ebx = arg; break;
1631 case 1: regs->ecx = arg; break;
1632 case 2: regs->edx = arg; break;
1633 case 3: regs->esi = arg; break;
1634 case 4: regs->edi = arg; break;
1635 case 5: regs->ebp = arg; break;
1641 va_end(args);
1643 return op;
1646 #ifdef CONFIG_COMPAT
1647 int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
1649 int rc = 0;
1650 struct mc_state *mcs = &current->mc_state;
1651 struct cpu_user_regs *regs;
1652 unsigned int i, cval = 0;
1653 unsigned long nval = 0;
1654 va_list args;
1656 BUG_ON(id && *id > 5);
1657 BUG_ON(id && (mask & (1U << *id)));
1659 va_start(args, mask);
1661 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1663 if ( !test_bit(_MCSF_call_preempted, &mcs->flags) )
1664 return 0;
1665 for ( i = 0; i < 6; ++i, mask >>= 1 )
1667 if ( mask & 1 )
1669 nval = va_arg(args, unsigned long);
1670 cval = va_arg(args, unsigned int);
1671 if ( cval == nval )
1672 mask &= ~1U;
1673 else
1674 BUG_ON(nval == (unsigned int)nval);
1676 else if ( id && *id == i )
1678 *id = mcs->call.args[i];
1679 id = NULL;
1681 if ( (mask & 1) && mcs->call.args[i] == nval )
1683 mcs->call.args[i] = cval;
1684 ++rc;
1686 else
1687 BUG_ON(mcs->call.args[i] != (unsigned int)mcs->call.args[i]);
1690 else
1692 regs = guest_cpu_user_regs();
1693 for ( i = 0; i < 6; ++i, mask >>= 1 )
1695 unsigned long *reg;
1697 switch ( i )
1699 case 0: reg = &regs->ebx; break;
1700 case 1: reg = &regs->ecx; break;
1701 case 2: reg = &regs->edx; break;
1702 case 3: reg = &regs->esi; break;
1703 case 4: reg = &regs->edi; break;
1704 case 5: reg = &regs->ebp; break;
1705 default: BUG(); reg = NULL; break;
1707 if ( (mask & 1) )
1709 nval = va_arg(args, unsigned long);
1710 cval = va_arg(args, unsigned int);
1711 if ( cval == nval )
1712 mask &= ~1U;
1713 else
1714 BUG_ON(nval == (unsigned int)nval);
1716 else if ( id && *id == i )
1718 *id = *reg;
1719 id = NULL;
1721 if ( (mask & 1) && *reg == nval )
1723 *reg = cval;
1724 ++rc;
1726 else
1727 BUG_ON(*reg != (unsigned int)*reg);
1731 va_end(args);
1733 return rc;
1735 #endif
1737 static int relinquish_memory(
1738 struct domain *d, struct page_list_head *list, unsigned long type)
1740 struct page_info *page;
1741 unsigned long x, y;
1742 int ret = 0;
1744 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1745 spin_lock_recursive(&d->page_alloc_lock);
1747 while ( (page = page_list_remove_head(list)) )
1749 /* Grab a reference to the page so it won't disappear from under us. */
1750 if ( unlikely(!get_page(page, d)) )
1752 /* Couldn't get a reference -- someone is freeing this page. */
1753 page_list_add_tail(page, &d->arch.relmem_list);
1754 continue;
1757 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1758 ret = put_page_and_type_preemptible(page, 1);
1759 switch ( ret )
1761 case 0:
1762 break;
1763 case -EAGAIN:
1764 case -EINTR:
1765 ret = -EAGAIN;
1766 page_list_add(page, list);
1767 set_bit(_PGT_pinned, &page->u.inuse.type_info);
1768 put_page(page);
1769 goto out;
1770 default:
1771 BUG();
1774 clear_superpage_mark(page);
1776 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1777 put_page(page);
1779 /*
1780 * Forcibly invalidate top-most, still valid page tables at this point
1781 * to break circular 'linear page table' references as well as clean up
1782 * partially validated pages. This is okay because MMU structures are
1783 * not shared across domains and this domain is now dead. Thus top-most
1784 * valid tables are not in use so a non-zero count means circular
1785 * reference or partially validated.
1786 */
1787 y = page->u.inuse.type_info;
1788 for ( ; ; )
1790 x = y;
1791 if ( likely((x & PGT_type_mask) != type) ||
1792 likely(!(x & (PGT_validated|PGT_partial))) )
1793 break;
1795 y = cmpxchg(&page->u.inuse.type_info, x,
1796 x & ~(PGT_validated|PGT_partial));
1797 if ( likely(y == x) )
1799 /* No need for atomic update of type_info here: noone else updates it. */
1800 switch ( ret = free_page_type(page, x, 1) )
1802 case 0:
1803 break;
1804 case -EINTR:
1805 page_list_add(page, list);
1806 page->u.inuse.type_info |= PGT_validated;
1807 if ( x & PGT_partial )
1808 put_page(page);
1809 put_page(page);
1810 ret = -EAGAIN;
1811 goto out;
1812 case -EAGAIN:
1813 page_list_add(page, list);
1814 page->u.inuse.type_info |= PGT_partial;
1815 if ( x & PGT_partial )
1816 put_page(page);
1817 goto out;
1818 default:
1819 BUG();
1821 if ( x & PGT_partial )
1823 page->u.inuse.type_info--;
1824 put_page(page);
1826 break;
1830 /* Put the page on the list and /then/ potentially free it. */
1831 page_list_add_tail(page, &d->arch.relmem_list);
1832 put_page(page);
1834 if ( hypercall_preempt_check() )
1836 ret = -EAGAIN;
1837 goto out;
1841 /* list is empty at this point. */
1842 page_list_move(list, &d->arch.relmem_list);
1844 out:
1845 spin_unlock_recursive(&d->page_alloc_lock);
1846 return ret;
1849 static void vcpu_destroy_pagetables(struct vcpu *v)
1851 struct domain *d = v->domain;
1852 unsigned long pfn;
1854 #ifdef __x86_64__
1855 if ( is_pv_32on64_vcpu(v) )
1857 pfn = l4e_get_pfn(*(l4_pgentry_t *)
1858 __va(pagetable_get_paddr(v->arch.guest_table)));
1860 if ( pfn != 0 )
1862 if ( paging_mode_refcounts(d) )
1863 put_page(mfn_to_page(pfn));
1864 else
1865 put_page_and_type(mfn_to_page(pfn));
1868 l4e_write(
1869 (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
1870 l4e_empty());
1872 v->arch.cr3 = 0;
1873 return;
1875 #endif
1877 pfn = pagetable_get_pfn(v->arch.guest_table);
1878 if ( pfn != 0 )
1880 if ( paging_mode_refcounts(d) )
1881 put_page(mfn_to_page(pfn));
1882 else
1883 put_page_and_type(mfn_to_page(pfn));
1884 v->arch.guest_table = pagetable_null();
1887 #ifdef __x86_64__
1888 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
1889 pfn = pagetable_get_pfn(v->arch.guest_table_user);
1890 if ( pfn != 0 )
1892 if ( !is_pv_32bit_vcpu(v) )
1894 if ( paging_mode_refcounts(d) )
1895 put_page(mfn_to_page(pfn));
1896 else
1897 put_page_and_type(mfn_to_page(pfn));
1899 v->arch.guest_table_user = pagetable_null();
1901 #endif
1903 v->arch.cr3 = 0;
1906 int domain_relinquish_resources(struct domain *d)
1908 int ret;
1909 struct vcpu *v;
1911 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
1913 switch ( d->arch.relmem )
1915 case RELMEM_not_started:
1916 /* Tear down paging-assistance stuff. */
1917 paging_teardown(d);
1919 for_each_vcpu ( d, v )
1921 /* Drop the in-use references to page-table bases. */
1922 vcpu_destroy_pagetables(v);
1924 /*
1925 * Relinquish GDT mappings. No need for explicit unmapping of the
1926 * LDT as it automatically gets squashed with the guest mappings.
1927 */
1928 destroy_gdt(v);
1930 unmap_vcpu_info(v);
1933 if ( d->arch.pirq_eoi_map != NULL )
1935 unmap_domain_page_global(d->arch.pirq_eoi_map);
1936 put_page_and_type(mfn_to_page(d->arch.pirq_eoi_map_mfn));
1937 d->arch.pirq_eoi_map = NULL;
1940 d->arch.relmem = RELMEM_xen;
1941 /* fallthrough */
1943 /* Relinquish every page of memory. */
1944 case RELMEM_xen:
1945 ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
1946 if ( ret )
1947 return ret;
1948 #if CONFIG_PAGING_LEVELS >= 4
1949 d->arch.relmem = RELMEM_l4;
1950 /* fallthrough */
1952 case RELMEM_l4:
1953 ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1954 if ( ret )
1955 return ret;
1956 #endif
1957 #if CONFIG_PAGING_LEVELS >= 3
1958 d->arch.relmem = RELMEM_l3;
1959 /* fallthrough */
1961 case RELMEM_l3:
1962 ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
1963 if ( ret )
1964 return ret;
1965 #endif
1966 d->arch.relmem = RELMEM_l2;
1967 /* fallthrough */
1969 case RELMEM_l2:
1970 ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
1971 if ( ret )
1972 return ret;
1973 d->arch.relmem = RELMEM_done;
1974 /* fallthrough */
1976 case RELMEM_done:
1977 break;
1979 default:
1980 BUG();
1983 if ( is_hvm_domain(d) )
1984 hvm_domain_relinquish_resources(d);
1986 return 0;
1989 void arch_dump_domain_info(struct domain *d)
1990 {
1991 paging_dump_domain_info(d);
1992 }
1994 void arch_dump_vcpu_info(struct vcpu *v)
1995 {
1996 paging_dump_vcpu_info(v);
1997 }
1999 void domain_cpuid(
2000 struct domain *d,
2001 unsigned int input,
2002 unsigned int sub_input,
2003 unsigned int *eax,
2004 unsigned int *ebx,
2005 unsigned int *ecx,
2006 unsigned int *edx)
2007 {
2008 cpuid_input_t *cpuid;
2009 int i;
2011 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
2012 {
2013 cpuid = &d->arch.cpuids[i];
2015 if ( (cpuid->input[0] == input) &&
2016 ((cpuid->input[1] == XEN_CPUID_INPUT_UNUSED) ||
2017 (cpuid->input[1] == sub_input)) )
2018 {
2019 *eax = cpuid->eax;
2020 *ebx = cpuid->ebx;
2021 *ecx = cpuid->ecx;
2022 *edx = cpuid->edx;
2024 /*
2025 * Do not advertise host's invariant TSC unless the TSC is
2026 * emulated, or the domain cannot migrate to other hosts.
2027 */
2028 if ( (input == 0x80000007) && /* Advanced Power Management */
2029 !d->disable_migrate && !d->arch.vtsc )
2030 *edx &= ~(1u<<8); /* TSC Invariant */
2032 return;
2033 }
2034 }
2036 *eax = *ebx = *ecx = *edx = 0;
2037 }
2039 void vcpu_kick(struct vcpu *v)
2040 {
2041 /*
2042 * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
2043 * pending flag. These values may fluctuate (after all, we hold no
2044 * locks) but the key insight is that each change will cause
2045 * evtchn_upcall_pending to be polled.
2046 *
2047 * NB2. We save the running flag across the unblock to avoid a needless
2048 * IPI for domains that we IPI'd to unblock.
2049 */
2050 bool_t running = v->is_running;
2051 vcpu_unblock(v);
2052 if ( running && (in_irq() || (v != current)) )
2053 cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ);
2054 }
2056 void vcpu_mark_events_pending(struct vcpu *v)
2057 {
2058 int already_pending = test_and_set_bit(
2059 0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
2061 if ( already_pending )
2062 return;
2064 if ( is_hvm_vcpu(v) )
2065 hvm_assert_evtchn_irq(v);
2066 else
2067 vcpu_kick(v);
2068 }
2070 static void vcpu_kick_softirq(void)
2071 {
2072 /*
2073 * Nothing to do here: we merely prevent notifiers from racing with checks
2074 * executed on return to guest context with interrupts enabled. See, for
2075 * example, xxx_intr_assist() executed on return to HVM guest context.
2076 */
2077 }
2079 static int __init init_vcpu_kick_softirq(void)
2080 {
2081 open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
2082 return 0;
2083 }
2084 __initcall(init_vcpu_kick_softirq);
2087 /*
2088 * Local variables:
2089 * mode: C
2090 * c-set-style: "BSD"
2091 * c-basic-offset: 4
2092 * tab-width: 4
2093 * indent-tabs-mode: nil
2094 * End:
2095 */