debuggers.hg

view xen/arch/x86/domain.c @ 20984:3bb163b74673

x86_64: widen bit width usable for struct domain allocation

With it being a PDX (instead of a PFN) that gets stored when a 32-bit
quantity is needed, we should also account for the bits removed during
PFN-to-PDX conversion when doing the allocation.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Feb 12 09:24:18 2010 +0000 (2010-02-12)
parents 68e964ec2c7b
children 94bfa0959297
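In outline, the commit raises the MEMF_bits() limit used by alloc_domain_struct() (visible in the listing below) from 32 + PAGE_SHIFT to 32 + PAGE_SHIFT + pfn_pdx_hole_shift on x86_64: the 32-bit page_info field now holds a PDX rather than a PFN, so the bits compressed out during PFN-to-PDX conversion can be added back to the usable address width. A minimal standalone C sketch of that arithmetic follows; the hole-shift value is purely illustrative (the real pfn_pdx_hole_shift is computed at boot from the machine's memory map and may be zero):

    #include <stdio.h>

    #define PAGE_SHIFT 12                  /* x86: 4KiB pages */

    int main(void)
    {
        /* Illustrative value only; machine-dependent in reality. */
        unsigned int pfn_pdx_hole_shift = 6;

        /* Old limit: the 32-bit field treated as holding a PFN. */
        unsigned int old_bits = 32 + PAGE_SHIFT;

        /*
         * New limit: the field holds a PDX, so the bits removed by
         * PFN-to-PDX compression widen the usable address range.
         */
        unsigned int new_bits = 32 + PAGE_SHIFT + pfn_pdx_hole_shift;

        printf("old allocation limit: below 2^%u bytes\n", old_bits);
        printf("new allocation limit: below 2^%u bytes\n", new_bits);
        return 0;
    }
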
line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/domain.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/softirq.h>
23 #include <xen/grant_table.h>
24 #include <xen/iocap.h>
25 #include <xen/kernel.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <xen/compat.h>
32 #include <xen/acpi.h>
33 #include <xen/pci.h>
34 #include <xen/paging.h>
35 #include <public/sysctl.h>
36 #include <asm/regs.h>
37 #include <asm/mc146818rtc.h>
38 #include <asm/system.h>
39 #include <asm/io.h>
40 #include <asm/processor.h>
41 #include <asm/desc.h>
42 #include <asm/i387.h>
43 #include <asm/mpspec.h>
44 #include <asm/ldt.h>
45 #include <asm/hypercall.h>
46 #include <asm/hvm/hvm.h>
47 #include <asm/hvm/support.h>
48 #include <asm/debugreg.h>
49 #include <asm/msr.h>
50 #include <asm/traps.h>
51 #include <asm/nmi.h>
52 #include <xen/numa.h>
53 #include <xen/iommu.h>
54 #ifdef CONFIG_COMPAT
55 #include <compat/vcpu.h>
56 #endif
58 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
59 DEFINE_PER_CPU(u64, efer);
60 DEFINE_PER_CPU(unsigned long, cr4);
62 static void default_idle(void);
63 static void default_dead_idle(void);
64 void (*pm_idle) (void) __read_mostly = default_idle;
65 void (*dead_idle) (void) __read_mostly = default_dead_idle;
67 static void paravirt_ctxt_switch_from(struct vcpu *v);
68 static void paravirt_ctxt_switch_to(struct vcpu *v);
70 static void vcpu_destroy_pagetables(struct vcpu *v);
72 static void continue_idle_domain(struct vcpu *v)
73 {
74 reset_stack_and_jump(idle_loop);
75 }
77 static void continue_nonidle_domain(struct vcpu *v)
78 {
79 reset_stack_and_jump(ret_from_intr);
80 }
82 static void default_idle(void)
83 {
84 local_irq_disable();
85 if ( !softirq_pending(smp_processor_id()) )
86 safe_halt();
87 else
88 local_irq_enable();
89 }
91 static void default_dead_idle(void)
92 {
93 for ( ; ; )
94 halt();
95 }
97 static void play_dead(void)
98 {
99 /*
100 * Flush pending softirqs if any. They can be queued up before this CPU
101 * was taken out of cpu_online_map in __cpu_disable().
102 */
103 do_softirq();
105 /* This must be done before dead CPU ack */
106 cpu_exit_clear();
107 hvm_cpu_down();
108 wbinvd();
109 mb();
110 /* Ack it */
111 __get_cpu_var(cpu_state) = CPU_DEAD;
113 /* With physical CPU hotplug, we should halt the cpu. */
114 local_irq_disable();
115 (*dead_idle)();
116 }
118 void idle_loop(void)
119 {
120 for ( ; ; )
121 {
122 if ( cpu_is_offline(smp_processor_id()) )
123 play_dead();
124 (*pm_idle)();
125 do_softirq();
126 }
127 }
129 void startup_cpu_idle_loop(void)
130 {
131 struct vcpu *v = current;
133 ASSERT(is_idle_vcpu(v));
134 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
135 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
137 reset_stack_and_jump(idle_loop);
138 }
140 void dump_pageframe_info(struct domain *d)
141 {
142 struct page_info *page;
144 printk("Memory pages belonging to domain %u:\n", d->domain_id);
146 if ( d->tot_pages >= 10 )
147 {
148 printk(" DomPage list too long to display\n");
149 }
150 else
151 {
152 page_list_for_each ( page, &d->page_list )
153 {
154 printk(" DomPage %p: caf=%08lx, taf=%" PRtype_info "\n",
155 _p(page_to_mfn(page)),
156 page->count_info, page->u.inuse.type_info);
157 }
158 }
160 if ( is_hvm_domain(d) )
161 {
162 p2m_pod_dump_data(d);
163 }
165 page_list_for_each ( page, &d->xenpage_list )
166 {
167 printk(" XenPage %p: caf=%08lx, taf=%" PRtype_info "\n",
168 _p(page_to_mfn(page)),
169 page->count_info, page->u.inuse.type_info);
170 }
171 }
173 struct domain *alloc_domain_struct(void)
174 {
175 struct domain *d;
176 /*
177 * We pack the PDX of the domain structure into a 32-bit field within
178 * the page_info structure. Hence the MEMF_bits() restriction.
179 */
180 unsigned int bits = 32 + PAGE_SHIFT;
182 #ifdef __x86_64__
183 bits += pfn_pdx_hole_shift;
184 #endif
185 d = alloc_xenheap_pages(get_order_from_bytes(sizeof(*d)), MEMF_bits(bits));
186 if ( d != NULL )
187 memset(d, 0, sizeof(*d));
188 return d;
189 }
191 void free_domain_struct(struct domain *d)
192 {
193 lock_profile_deregister_struct(LOCKPROF_TYPE_PERDOM, d);
194 free_xenheap_pages(d, get_order_from_bytes(sizeof(*d)));
195 }
197 struct vcpu *alloc_vcpu_struct(void)
198 {
199 struct vcpu *v;
200 /*
201 * This structure contains embedded PAE PDPTEs, used when an HVM guest
202 * runs on shadow pagetables outside of 64-bit mode. In this case the CPU
203 * may require that the shadow CR3 points below 4GB, and hence the whole
204 * structure must satisfy this restriction. Thus we specify MEMF_bits(32).
205 */
206 v = alloc_xenheap_pages(get_order_from_bytes(sizeof(*v)), MEMF_bits(32));
207 if ( v != NULL )
208 memset(v, 0, sizeof(*v));
209 return v;
210 }
212 void free_vcpu_struct(struct vcpu *v)
213 {
214 free_xenheap_pages(v, get_order_from_bytes(sizeof(*v)));
215 }
217 #ifdef __x86_64__
219 static int setup_compat_l4(struct vcpu *v)
220 {
221 struct page_info *pg;
222 l4_pgentry_t *l4tab;
224 pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v)));
225 if ( pg == NULL )
226 return -ENOMEM;
228 /* This page needs to look like a pagetable so that it can be shadowed */
229 pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
231 l4tab = page_to_virt(pg);
232 copy_page(l4tab, idle_pg_table);
233 l4tab[0] = l4e_empty();
234 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
235 l4e_from_page(pg, __PAGE_HYPERVISOR);
236 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
237 l4e_from_paddr(__pa(v->domain->arch.mm_perdomain_l3),
238 __PAGE_HYPERVISOR);
240 v->arch.guest_table = pagetable_from_page(pg);
241 v->arch.guest_table_user = v->arch.guest_table;
243 return 0;
244 }
246 static void release_compat_l4(struct vcpu *v)
247 {
248 free_domheap_page(pagetable_get_page(v->arch.guest_table));
249 v->arch.guest_table = pagetable_null();
250 v->arch.guest_table_user = pagetable_null();
251 }
253 static inline int may_switch_mode(struct domain *d)
254 {
255 return (!is_hvm_domain(d) && (d->tot_pages == 0));
256 }
258 int switch_native(struct domain *d)
259 {
260 unsigned int vcpuid;
262 if ( d == NULL )
263 return -EINVAL;
264 if ( !may_switch_mode(d) )
265 return -EACCES;
266 if ( !is_pv_32on64_domain(d) )
267 return 0;
269 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
271 for ( vcpuid = 0; vcpuid < d->max_vcpus; vcpuid++ )
272 {
273 if (d->vcpu[vcpuid])
274 release_compat_l4(d->vcpu[vcpuid]);
275 }
277 return 0;
278 }
280 int switch_compat(struct domain *d)
281 {
282 unsigned int vcpuid;
284 if ( d == NULL )
285 return -EINVAL;
286 if ( !may_switch_mode(d) )
287 return -EACCES;
288 if ( is_pv_32on64_domain(d) )
289 return 0;
291 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
293 for ( vcpuid = 0; vcpuid < d->max_vcpus; vcpuid++ )
294 {
295 if ( (d->vcpu[vcpuid] != NULL) &&
296 (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
297 goto undo_and_fail;
298 }
300 domain_set_alloc_bitsize(d);
302 return 0;
304 undo_and_fail:
305 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
306 while ( vcpuid-- != 0 )
307 {
308 if ( d->vcpu[vcpuid] != NULL )
309 release_compat_l4(d->vcpu[vcpuid]);
310 }
311 return -ENOMEM;
312 }
314 #else
315 #define setup_compat_l4(v) 0
316 #define release_compat_l4(v) ((void)0)
317 #endif
319 int vcpu_initialise(struct vcpu *v)
320 {
321 struct domain *d = v->domain;
322 int rc;
324 v->arch.vcpu_info_mfn = INVALID_MFN;
326 v->arch.flags = TF_kernel_mode;
328 #if defined(__i386__)
329 mapcache_vcpu_init(v);
330 #else
331 {
332 unsigned int idx = perdomain_pt_pgidx(v);
333 struct page_info *pg;
335 if ( !perdomain_pt_page(d, idx) )
336 {
337 pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v)));
338 if ( !pg )
339 return -ENOMEM;
340 clear_page(page_to_virt(pg));
341 perdomain_pt_page(d, idx) = pg;
342 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+idx]
343 = l2e_from_page(pg, __PAGE_HYPERVISOR);
344 }
345 }
346 #endif
348 pae_l3_cache_init(&v->arch.pae_l3_cache);
350 paging_vcpu_init(v);
352 if ( is_hvm_domain(d) )
353 {
354 if ( (rc = hvm_vcpu_initialise(v)) != 0 )
355 return rc;
356 }
357 else
358 {
359 /* PV guests by default have a 100Hz ticker. */
360 if ( !is_idle_domain(d) )
361 v->periodic_period = MILLISECS(10);
363 /* PV guests get an emulated PIT too for video BIOSes to use. */
364 if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
365 pit_init(v, cpu_khz);
367 v->arch.schedule_tail = continue_nonidle_domain;
368 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
369 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
371 if ( is_idle_domain(d) )
372 {
373 v->arch.schedule_tail = continue_idle_domain;
374 v->arch.cr3 = __pa(idle_pg_table);
375 }
377 v->arch.guest_context.ctrlreg[4] =
378 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
379 }
381 v->arch.perdomain_ptes = perdomain_ptes(d, v);
383 spin_lock_init(&v->arch.shadow_ldt_lock);
385 return (is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0);
386 }
388 void vcpu_destroy(struct vcpu *v)
389 {
390 if ( is_pv_32on64_vcpu(v) )
391 release_compat_l4(v);
393 if ( is_hvm_vcpu(v) )
394 hvm_vcpu_destroy(v);
395 }
397 int arch_domain_create(struct domain *d, unsigned int domcr_flags)
398 {
399 #ifdef __x86_64__
400 struct page_info *pg;
401 #else
402 int pdpt_order;
403 #endif
404 int i, paging_initialised = 0;
405 int rc = -ENOMEM;
407 d->arch.hvm_domain.hap_enabled =
408 is_hvm_domain(d) &&
409 hvm_funcs.hap_supported &&
410 (domcr_flags & DOMCRF_hap);
411 d->arch.hvm_domain.mem_sharing_enabled = 0;
413 d->arch.s3_integrity = !!(domcr_flags & DOMCRF_s3_integrity);
415 INIT_LIST_HEAD(&d->arch.pdev_list);
417 d->arch.relmem = RELMEM_not_started;
418 INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
420 #if defined(__i386__)
422 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
423 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order, 0);
424 if ( d->arch.mm_perdomain_pt == NULL )
425 goto fail;
426 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
428 mapcache_domain_init(d);
430 #else /* __x86_64__ */
432 BUILD_BUG_ON(PDPT_L2_ENTRIES * sizeof(*d->arch.mm_perdomain_pt_pages)
433 != PAGE_SIZE);
434 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
435 if ( !pg )
436 goto fail;
437 d->arch.mm_perdomain_pt_pages = page_to_virt(pg);
438 clear_page(d->arch.mm_perdomain_pt_pages);
440 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
441 if ( pg == NULL )
442 goto fail;
443 d->arch.mm_perdomain_l2 = page_to_virt(pg);
444 clear_page(d->arch.mm_perdomain_l2);
446 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
447 if ( pg == NULL )
448 goto fail;
449 d->arch.mm_perdomain_l3 = page_to_virt(pg);
450 clear_page(d->arch.mm_perdomain_l3);
451 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
452 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
453 __PAGE_HYPERVISOR);
455 HYPERVISOR_COMPAT_VIRT_START(d) =
456 is_hvm_domain(d) ? ~0u : __HYPERVISOR_COMPAT_VIRT_START;
458 #endif /* __x86_64__ */
460 if ( (rc = paging_domain_init(d, domcr_flags)) != 0 )
461 goto fail;
462 paging_initialised = 1;
464 if ( !is_idle_domain(d) )
465 {
466 d->arch.ioport_caps =
467 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
468 rc = -ENOMEM;
469 if ( d->arch.ioport_caps == NULL )
470 goto fail;
472 /*
473 * The shared_info machine address must fit in a 32-bit field within a
474 * 32-bit guest's start_info structure. Hence we specify MEMF_bits(32).
475 */
476 if ( (d->shared_info = alloc_xenheap_pages(0, MEMF_bits(32))) == NULL )
477 goto fail;
479 clear_page(d->shared_info);
480 share_xen_page_with_guest(
481 virt_to_page(d->shared_info), d, XENSHARE_writable);
483 d->arch.pirq_irq = xmalloc_array(int, d->nr_pirqs);
484 if ( !d->arch.pirq_irq )
485 goto fail;
486 memset(d->arch.pirq_irq, 0,
487 d->nr_pirqs * sizeof(*d->arch.pirq_irq));
489 d->arch.irq_pirq = xmalloc_array(int, nr_irqs);
490 if ( !d->arch.irq_pirq )
491 goto fail;
492 memset(d->arch.irq_pirq, 0,
493 nr_irqs * sizeof(*d->arch.irq_pirq));
495 for ( i = 1; platform_legacy_irq(i); ++i )
496 if ( !IO_APIC_IRQ(i) )
497 d->arch.irq_pirq[i] = d->arch.pirq_irq[i] = i;
499 if ( (rc = iommu_domain_init(d)) != 0 )
500 goto fail;
502 /* For Guest vMCE MSRs virtualization */
503 mce_init_msr(d);
504 }
506 if ( is_hvm_domain(d) )
507 {
508 if ( (rc = hvm_domain_initialise(d)) != 0 )
509 {
510 iommu_domain_destroy(d);
511 goto fail;
512 }
513 }
514 else
515 {
516 /* 32-bit PV guest by default only if Xen is not 64-bit. */
517 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
518 (CONFIG_PAGING_LEVELS != 4);
519 }
521 memset(d->arch.cpuids, 0, sizeof(d->arch.cpuids));
522 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
523 {
524 d->arch.cpuids[i].input[0] = XEN_CPUID_INPUT_UNUSED;
525 d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
526 }
528 /* initialize default tsc behavior in case tools don't */
529 tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0);
530 spin_lock_init(&d->arch.vtsc_lock);
532 return 0;
534 fail:
535 d->is_dying = DOMDYING_dead;
536 xfree(d->arch.pirq_irq);
537 xfree(d->arch.irq_pirq);
538 free_xenheap_page(d->shared_info);
539 if ( paging_initialised )
540 paging_final_teardown(d);
541 #ifdef __x86_64__
542 if ( d->arch.mm_perdomain_l2 )
543 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
544 if ( d->arch.mm_perdomain_l3 )
545 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
546 if ( d->arch.mm_perdomain_pt_pages )
547 free_domheap_page(virt_to_page(d->arch.mm_perdomain_pt_pages));
548 #else
549 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
550 #endif
551 return rc;
552 }
554 void arch_domain_destroy(struct domain *d)
555 {
556 #ifdef __x86_64__
557 unsigned int i;
558 #endif
560 if ( is_hvm_domain(d) )
561 hvm_domain_destroy(d);
563 pci_release_devices(d);
564 free_domain_pirqs(d);
565 if ( !is_idle_domain(d) )
566 iommu_domain_destroy(d);
568 paging_final_teardown(d);
570 #ifdef __i386__
571 free_xenheap_pages(
572 d->arch.mm_perdomain_pt,
573 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
574 #else
575 for ( i = 0; i < PDPT_L2_ENTRIES; ++i )
576 {
577 if ( perdomain_pt_page(d, i) )
578 free_domheap_page(perdomain_pt_page(d, i));
579 }
580 free_domheap_page(virt_to_page(d->arch.mm_perdomain_pt_pages));
581 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
582 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
583 #endif
585 free_xenheap_page(d->shared_info);
586 xfree(d->arch.pirq_irq);
587 xfree(d->arch.irq_pirq);
588 }
590 unsigned long pv_guest_cr4_fixup(unsigned long guest_cr4)
591 {
592 unsigned long hv_cr4_mask, hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4());
594 hv_cr4_mask = ~X86_CR4_TSD;
595 if ( cpu_has_de )
596 hv_cr4_mask &= ~X86_CR4_DE;
598 if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) )
599 gdprintk(XENLOG_WARNING,
600 "Attempt to change CR4 flags %08lx -> %08lx\n",
601 hv_cr4, guest_cr4);
603 return (hv_cr4 & hv_cr4_mask) | (guest_cr4 & ~hv_cr4_mask);
604 }
606 /* This is called by arch_final_setup_guest and do_boot_vcpu */
607 int arch_set_info_guest(
608 struct vcpu *v, vcpu_guest_context_u c)
609 {
610 struct domain *d = v->domain;
611 unsigned long cr3_pfn = INVALID_MFN;
612 unsigned long flags, cr4;
613 int i, rc = 0, compat;
615 /* The context is a compat-mode one if the target domain is compat-mode;
616 * we expect the tools to DTRT even in compat-mode callers. */
617 compat = is_pv_32on64_domain(d);
619 #ifdef CONFIG_COMPAT
620 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
621 #else
622 #define c(fld) (c.nat->fld)
623 #endif
624 flags = c(flags);
626 if ( !is_hvm_vcpu(v) )
627 {
628 if ( !compat )
629 {
630 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
631 fixup_guest_stack_selector(d, c.nat->kernel_ss);
632 fixup_guest_code_selector(d, c.nat->user_regs.cs);
633 #ifdef __i386__
634 fixup_guest_code_selector(d, c.nat->event_callback_cs);
635 fixup_guest_code_selector(d, c.nat->failsafe_callback_cs);
636 #endif
638 for ( i = 0; i < 256; i++ )
639 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
641 /* LDT safety checks. */
642 if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) ||
643 (c.nat->ldt_ents > 8192) ||
644 !array_access_ok(c.nat->ldt_base,
645 c.nat->ldt_ents,
646 LDT_ENTRY_SIZE) )
647 return -EINVAL;
648 }
649 #ifdef CONFIG_COMPAT
650 else
651 {
652 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
653 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
654 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
655 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
656 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
658 for ( i = 0; i < 256; i++ )
659 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
661 /* LDT safety checks. */
662 if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) ||
663 (c.cmp->ldt_ents > 8192) ||
664 !compat_array_access_ok(c.cmp->ldt_base,
665 c.cmp->ldt_ents,
666 LDT_ENTRY_SIZE) )
667 return -EINVAL;
668 }
669 #endif
670 }
672 v->fpu_initialised = !!(flags & VGCF_I387_VALID);
674 v->arch.flags &= ~TF_kernel_mode;
675 if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
676 v->arch.flags |= TF_kernel_mode;
678 if ( !compat )
679 memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat));
680 #ifdef CONFIG_COMPAT
681 else
682 XLAT_vcpu_guest_context(&v->arch.guest_context, c.cmp);
683 #endif
685 v->arch.guest_context.user_regs.eflags |= 2;
687 if ( is_hvm_vcpu(v) )
688 {
689 hvm_set_info_guest(v);
690 goto out;
691 }
693 /* Only CR0.TS is modifiable by guest or admin. */
694 v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
695 v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
697 init_int80_direct_trap(v);
699 /* IOPL privileges are virtualised. */
700 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
701 v->arch.guest_context.user_regs.eflags &= ~X86_EFLAGS_IOPL;
703 /* Ensure real hardware interrupts are enabled. */
704 v->arch.guest_context.user_regs.eflags |= X86_EFLAGS_IF;
706 cr4 = v->arch.guest_context.ctrlreg[4];
707 v->arch.guest_context.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(cr4) :
708 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
710 memset(v->arch.guest_context.debugreg, 0,
711 sizeof(v->arch.guest_context.debugreg));
712 for ( i = 0; i < 8; i++ )
713 (void)set_debugreg(v, i, c(debugreg[i]));
715 if ( v->is_initialised )
716 goto out;
718 if ( v->vcpu_id == 0 )
719 d->vm_assist = c(vm_assist);
721 if ( !compat )
722 rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
723 #ifdef CONFIG_COMPAT
724 else
725 {
726 unsigned long gdt_frames[ARRAY_SIZE(c.cmp->gdt_frames)];
727 unsigned int i, n = (c.cmp->gdt_ents + 511) / 512;
729 if ( n > ARRAY_SIZE(c.cmp->gdt_frames) )
730 return -EINVAL;
731 for ( i = 0; i < n; ++i )
732 gdt_frames[i] = c.cmp->gdt_frames[i];
733 rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents);
734 }
735 #endif
736 if ( rc != 0 )
737 return rc;
739 if ( !compat )
740 {
741 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[3]));
743 if ( !mfn_valid(cr3_pfn) ||
744 (paging_mode_refcounts(d)
745 ? !get_page(mfn_to_page(cr3_pfn), d)
746 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
747 PGT_base_page_table)) )
748 {
749 destroy_gdt(v);
750 return -EINVAL;
751 }
753 v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
755 #ifdef __x86_64__
756 if ( c.nat->ctrlreg[1] )
757 {
758 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[1]));
760 if ( !mfn_valid(cr3_pfn) ||
761 (paging_mode_refcounts(d)
762 ? !get_page(mfn_to_page(cr3_pfn), d)
763 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
764 PGT_base_page_table)) )
765 {
766 cr3_pfn = pagetable_get_pfn(v->arch.guest_table);
767 v->arch.guest_table = pagetable_null();
768 if ( paging_mode_refcounts(d) )
769 put_page(mfn_to_page(cr3_pfn));
770 else
771 put_page_and_type(mfn_to_page(cr3_pfn));
772 destroy_gdt(v);
773 return -EINVAL;
774 }
776 v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn);
777 }
778 }
779 else
780 {
781 l4_pgentry_t *l4tab;
783 cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
785 if ( !mfn_valid(cr3_pfn) ||
786 (paging_mode_refcounts(d)
787 ? !get_page(mfn_to_page(cr3_pfn), d)
788 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
789 PGT_l3_page_table)) )
790 {
791 destroy_gdt(v);
792 return -EINVAL;
793 }
795 l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
796 *l4tab = l4e_from_pfn(
797 cr3_pfn, _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
798 #endif
799 }
801 if ( v->vcpu_id == 0 )
802 update_domain_wallclock_time(d);
804 /* Don't redo final setup */
805 v->is_initialised = 1;
807 if ( paging_mode_enabled(d) )
808 paging_update_paging_modes(v);
810 update_cr3(v);
812 out:
813 if ( flags & VGCF_online )
814 clear_bit(_VPF_down, &v->pause_flags);
815 else
816 set_bit(_VPF_down, &v->pause_flags);
817 return 0;
818 #undef c
819 }
821 void arch_vcpu_reset(struct vcpu *v)
822 {
823 if ( !is_hvm_vcpu(v) )
824 {
825 destroy_gdt(v);
826 vcpu_destroy_pagetables(v);
827 }
828 else
829 {
830 vcpu_end_shutdown_deferral(v);
831 }
832 }
834 /*
835 * Unmap the vcpu info page if the guest decided to place it somewhere
836 * else. This is only used from arch_domain_destroy, so there's no
837 * need to do anything clever.
838 */
839 static void
840 unmap_vcpu_info(struct vcpu *v)
841 {
842 unsigned long mfn;
844 if ( v->arch.vcpu_info_mfn == INVALID_MFN )
845 return;
847 mfn = v->arch.vcpu_info_mfn;
848 unmap_domain_page_global(v->vcpu_info);
850 v->vcpu_info = &dummy_vcpu_info;
851 v->arch.vcpu_info_mfn = INVALID_MFN;
853 put_page_and_type(mfn_to_page(mfn));
854 }
856 /*
857 * Map a guest page in and point the vcpu_info pointer at it. This
858 * makes sure that the vcpu_info is always pointing at a valid piece
859 * of memory, and it sets a pending event to make sure that a pending
860 * event doesn't get missed.
861 */
862 static int
863 map_vcpu_info(struct vcpu *v, unsigned long mfn, unsigned offset)
864 {
865 struct domain *d = v->domain;
866 void *mapping;
867 vcpu_info_t *new_info;
868 int i;
870 if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) )
871 return -EINVAL;
873 if ( v->arch.vcpu_info_mfn != INVALID_MFN )
874 return -EINVAL;
876 /* Run this command on yourself or on other offline VCPUS. */
877 if ( (v != current) && !test_bit(_VPF_down, &v->pause_flags) )
878 return -EINVAL;
880 mfn = gmfn_to_mfn(d, mfn);
881 if ( !mfn_valid(mfn) ||
882 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
883 return -EINVAL;
885 mapping = map_domain_page_global(mfn);
886 if ( mapping == NULL )
887 {
888 put_page_and_type(mfn_to_page(mfn));
889 return -ENOMEM;
890 }
892 new_info = (vcpu_info_t *)(mapping + offset);
894 if ( v->vcpu_info == &dummy_vcpu_info )
895 {
896 memset(new_info, 0, sizeof(*new_info));
897 __vcpu_info(v, new_info, evtchn_upcall_mask) = 1;
898 }
899 else
900 {
901 memcpy(new_info, v->vcpu_info, sizeof(*new_info));
902 }
904 v->vcpu_info = new_info;
905 v->arch.vcpu_info_mfn = mfn;
907 /* Set new vcpu_info pointer /before/ setting pending flags. */
908 wmb();
910 /*
911 * Mark everything as being pending just to make sure nothing gets
912 * lost. The domain will get a spurious event, but it can cope.
913 */
914 vcpu_info(v, evtchn_upcall_pending) = 1;
915 for ( i = 0; i < BITS_PER_EVTCHN_WORD(d); i++ )
916 set_bit(i, &vcpu_info(v, evtchn_pending_sel));
918 return 0;
919 }
921 long
922 arch_do_vcpu_op(
923 int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
924 {
925 long rc = 0;
927 switch ( cmd )
928 {
929 case VCPUOP_register_runstate_memory_area:
930 {
931 struct vcpu_register_runstate_memory_area area;
932 struct vcpu_runstate_info runstate;
934 rc = -EFAULT;
935 if ( copy_from_guest(&area, arg, 1) )
936 break;
938 if ( !guest_handle_okay(area.addr.h, 1) )
939 break;
941 rc = 0;
942 runstate_guest(v) = area.addr.h;
944 if ( v == current )
945 {
946 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
947 }
948 else
949 {
950 vcpu_runstate_get(v, &runstate);
951 __copy_to_guest(runstate_guest(v), &runstate, 1);
952 }
954 break;
955 }
957 case VCPUOP_register_vcpu_info:
958 {
959 struct domain *d = v->domain;
960 struct vcpu_register_vcpu_info info;
962 rc = -EFAULT;
963 if ( copy_from_guest(&info, arg, 1) )
964 break;
966 domain_lock(d);
967 rc = map_vcpu_info(v, info.mfn, info.offset);
968 domain_unlock(d);
970 break;
971 }
973 case VCPUOP_register_vcpu_time_memory_area:
974 {
975 struct vcpu_register_time_memory_area area;
977 rc = -EFAULT;
978 if ( copy_from_guest(&area, arg, 1) )
979 break;
981 if ( !guest_handle_okay(area.addr.h, 1) )
982 break;
984 rc = 0;
985 v->arch.time_info_guest = area.addr.h;
987 force_update_vcpu_system_time(v);
989 break;
990 }
992 case VCPUOP_get_physid:
993 {
994 struct vcpu_get_physid cpu_id;
996 rc = -EINVAL;
997 if ( !v->domain->is_pinned )
998 break;
1000 cpu_id.phys_id =
1001 (uint64_t)x86_cpu_to_apicid[v->vcpu_id] |
1002 ((uint64_t)acpi_get_processor_id(v->vcpu_id) << 32);
1004 rc = -EFAULT;
1005 if ( copy_to_guest(arg, &cpu_id, 1) )
1006 break;
1008 rc = 0;
1009 break;
1010 }
1012 default:
1013 rc = -ENOSYS;
1014 break;
1015 }
1017 return rc;
1018 }
1020 #ifdef __x86_64__
1022 #define loadsegment(seg,value) ({ \
1023 int __r = 1; \
1024 asm volatile ( \
1025 "1: movl %k1,%%" #seg "\n2:\n" \
1026 ".section .fixup,\"ax\"\n" \
1027 "3: xorl %k0,%k0\n" \
1028 " movl %k0,%%" #seg "\n" \
1029 " jmp 2b\n" \
1030 ".previous\n" \
1031 ".section __ex_table,\"a\"\n" \
1032 " .align 8\n" \
1033 " .quad 1b,3b\n" \
1034 ".previous" \
1035 : "=r" (__r) : "r" (value), "0" (__r) );\
1036 __r; })
1038 /*
1039 * save_segments() writes a mask of segments which are dirty (non-zero),
1040 * allowing load_segments() to avoid some expensive segment loads and
1041 * MSR writes.
1042 */
1043 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
1044 #define DIRTY_DS 0x01
1045 #define DIRTY_ES 0x02
1046 #define DIRTY_FS 0x04
1047 #define DIRTY_GS 0x08
1048 #define DIRTY_FS_BASE 0x10
1049 #define DIRTY_GS_BASE_USER 0x20
1051 static void load_segments(struct vcpu *n)
1053 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
1054 int all_segs_okay = 1;
1055 unsigned int dirty_segment_mask, cpu = smp_processor_id();
1057 /* Load and clear the dirty segment mask. */
1058 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
1059 per_cpu(dirty_segment_mask, cpu) = 0;
1061 /* Either selector != 0 ==> reload. */
1062 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
1063 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
1065 /* Either selector != 0 ==> reload. */
1066 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
1067 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
1069 /*
1070 * Either selector != 0 ==> reload.
1071 * Also reload to reset FS_BASE if it was non-zero.
1072 */
1073 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
1074 nctxt->user_regs.fs) )
1075 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
1077 /*
1078 * Either selector != 0 ==> reload.
1079 * Also reload to reset GS_BASE if it was non-zero.
1080 */
1081 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
1082 nctxt->user_regs.gs) )
1084 /* Reset GS_BASE with user %gs? */
1085 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
1086 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
1089 if ( !is_pv_32on64_domain(n->domain) )
1091 /* This can only be non-zero if selector is NULL. */
1092 if ( nctxt->fs_base )
1093 wrmsr(MSR_FS_BASE,
1094 nctxt->fs_base,
1095 nctxt->fs_base>>32);
1097 /* Most kernels have non-zero GS base, so don't bother testing. */
1098 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
1099 wrmsr(MSR_SHADOW_GS_BASE,
1100 nctxt->gs_base_kernel,
1101 nctxt->gs_base_kernel>>32);
1103 /* This can only be non-zero if selector is NULL. */
1104 if ( nctxt->gs_base_user )
1105 wrmsr(MSR_GS_BASE,
1106 nctxt->gs_base_user,
1107 nctxt->gs_base_user>>32);
1109 /* If in kernel mode then switch the GS bases around. */
1110 if ( (n->arch.flags & TF_kernel_mode) )
1111 asm volatile ( "swapgs" );
1114 if ( unlikely(!all_segs_okay) )
1116 struct cpu_user_regs *regs = guest_cpu_user_regs();
1117 unsigned long *rsp =
1118 (n->arch.flags & TF_kernel_mode) ?
1119 (unsigned long *)regs->rsp :
1120 (unsigned long *)nctxt->kernel_sp;
1121 unsigned long cs_and_mask, rflags;
1123 if ( is_pv_32on64_domain(n->domain) )
1125 unsigned int *esp = ring_1(regs) ?
1126 (unsigned int *)regs->rsp :
1127 (unsigned int *)nctxt->kernel_sp;
1128 unsigned int cs_and_mask, eflags;
1129 int ret = 0;
1131 /* CS longword also contains full evtchn_upcall_mask. */
1132 cs_and_mask = (unsigned short)regs->cs |
1133 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
1134 /* Fold upcall mask into RFLAGS.IF. */
1135 eflags = regs->_eflags & ~X86_EFLAGS_IF;
1136 eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1138 if ( !ring_1(regs) )
1140 ret = put_user(regs->ss, esp-1);
1141 ret |= put_user(regs->_esp, esp-2);
1142 esp -= 2;
1145 if ( ret |
1146 put_user(eflags, esp-1) |
1147 put_user(cs_and_mask, esp-2) |
1148 put_user(regs->_eip, esp-3) |
1149 put_user(nctxt->user_regs.gs, esp-4) |
1150 put_user(nctxt->user_regs.fs, esp-5) |
1151 put_user(nctxt->user_regs.es, esp-6) |
1152 put_user(nctxt->user_regs.ds, esp-7) )
1154 gdprintk(XENLOG_ERR, "Error while creating compat "
1155 "failsafe callback frame.\n");
1156 domain_crash(n->domain);
1159 if ( test_bit(_VGCF_failsafe_disables_events,
1160 &n->arch.guest_context.flags) )
1161 vcpu_info(n, evtchn_upcall_mask) = 1;
1163 regs->entry_vector = TRAP_syscall;
1164 regs->_eflags &= 0xFFFCBEFFUL;
1165 regs->ss = FLAT_COMPAT_KERNEL_SS;
1166 regs->_esp = (unsigned long)(esp-7);
1167 regs->cs = FLAT_COMPAT_KERNEL_CS;
1168 regs->_eip = nctxt->failsafe_callback_eip;
1169 return;
1172 if ( !(n->arch.flags & TF_kernel_mode) )
1173 toggle_guest_mode(n);
1174 else
1175 regs->cs &= ~3;
1177 /* CS longword also contains full evtchn_upcall_mask. */
1178 cs_and_mask = (unsigned long)regs->cs |
1179 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1181 /* Fold upcall mask into RFLAGS.IF. */
1182 rflags = regs->rflags & ~X86_EFLAGS_IF;
1183 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1185 if ( put_user(regs->ss, rsp- 1) |
1186 put_user(regs->rsp, rsp- 2) |
1187 put_user(rflags, rsp- 3) |
1188 put_user(cs_and_mask, rsp- 4) |
1189 put_user(regs->rip, rsp- 5) |
1190 put_user(nctxt->user_regs.gs, rsp- 6) |
1191 put_user(nctxt->user_regs.fs, rsp- 7) |
1192 put_user(nctxt->user_regs.es, rsp- 8) |
1193 put_user(nctxt->user_regs.ds, rsp- 9) |
1194 put_user(regs->r11, rsp-10) |
1195 put_user(regs->rcx, rsp-11) )
1197 gdprintk(XENLOG_ERR, "Error while creating failsafe "
1198 "callback frame.\n");
1199 domain_crash(n->domain);
1202 if ( test_bit(_VGCF_failsafe_disables_events,
1203 &n->arch.guest_context.flags) )
1204 vcpu_info(n, evtchn_upcall_mask) = 1;
1206 regs->entry_vector = TRAP_syscall;
1207 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1208 X86_EFLAGS_NT|X86_EFLAGS_TF);
1209 regs->ss = FLAT_KERNEL_SS;
1210 regs->rsp = (unsigned long)(rsp-11);
1211 regs->cs = FLAT_KERNEL_CS;
1212 regs->rip = nctxt->failsafe_callback_eip;
1216 static void save_segments(struct vcpu *v)
1218 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
1219 struct cpu_user_regs *regs = &ctxt->user_regs;
1220 unsigned int dirty_segment_mask = 0;
1222 regs->ds = read_segment_register(ds);
1223 regs->es = read_segment_register(es);
1224 regs->fs = read_segment_register(fs);
1225 regs->gs = read_segment_register(gs);
1227 if ( regs->ds )
1228 dirty_segment_mask |= DIRTY_DS;
1230 if ( regs->es )
1231 dirty_segment_mask |= DIRTY_ES;
1233 if ( regs->fs || is_pv_32on64_domain(v->domain) )
1235 dirty_segment_mask |= DIRTY_FS;
1236 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
1238 else if ( ctxt->fs_base )
1240 dirty_segment_mask |= DIRTY_FS_BASE;
1243 if ( regs->gs || is_pv_32on64_domain(v->domain) )
1245 dirty_segment_mask |= DIRTY_GS;
1246 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
1248 else if ( ctxt->gs_base_user )
1250 dirty_segment_mask |= DIRTY_GS_BASE_USER;
1253 this_cpu(dirty_segment_mask) = dirty_segment_mask;
1256 #define switch_kernel_stack(v) ((void)0)
1258 #elif defined(__i386__)
1260 #define load_segments(n) ((void)0)
1261 #define save_segments(p) ((void)0)
1263 static inline void switch_kernel_stack(struct vcpu *v)
1265 struct tss_struct *tss = &this_cpu(init_tss);
1266 tss->esp1 = v->arch.guest_context.kernel_sp;
1267 tss->ss1 = v->arch.guest_context.kernel_ss;
1270 #endif /* __i386__ */
1272 static void paravirt_ctxt_switch_from(struct vcpu *v)
1274 save_segments(v);
1276 /*
1277 * Disable debug breakpoints. We do this aggressively because if we switch
1278 * to an HVM guest we may load DR0-DR3 with values that can cause #DE
1279 * inside Xen, before we get a chance to reload DR7, and this cannot always
1280 * safely be handled.
1281 */
1282 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1283 write_debugreg(7, 0);
1286 static void paravirt_ctxt_switch_to(struct vcpu *v)
1288 unsigned long cr4;
1290 set_int80_direct_trap(v);
1291 switch_kernel_stack(v);
1293 cr4 = pv_guest_cr4_to_real_cr4(v);
1294 if ( unlikely(cr4 != read_cr4()) )
1295 write_cr4(cr4);
1297 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1299 write_debugreg(0, v->arch.guest_context.debugreg[0]);
1300 write_debugreg(1, v->arch.guest_context.debugreg[1]);
1301 write_debugreg(2, v->arch.guest_context.debugreg[2]);
1302 write_debugreg(3, v->arch.guest_context.debugreg[3]);
1303 write_debugreg(6, v->arch.guest_context.debugreg[6]);
1304 write_debugreg(7, v->arch.guest_context.debugreg[7]);
1307 if ( (v->domain->arch.tsc_mode == TSC_MODE_PVRDTSCP) &&
1308 boot_cpu_has(X86_FEATURE_RDTSCP) )
1309 write_rdtscp_aux(v->domain->arch.incarnation);
1312 /* Update per-VCPU guest runstate shared memory area (if registered). */
1313 static void update_runstate_area(struct vcpu *v)
1315 if ( guest_handle_is_null(runstate_guest(v)) )
1316 return;
1318 #ifdef CONFIG_COMPAT
1319 if ( is_pv_32on64_domain(v->domain) )
1321 struct compat_vcpu_runstate_info info;
1323 XLAT_vcpu_runstate_info(&info, &v->runstate);
1324 __copy_to_guest(v->runstate_guest.compat, &info, 1);
1325 return;
1327 #endif
1329 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
1332 static inline int need_full_gdt(struct vcpu *v)
1334 return (!is_hvm_vcpu(v) && !is_idle_vcpu(v));
1337 static void __context_switch(void)
1339 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1340 unsigned int cpu = smp_processor_id();
1341 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1342 struct vcpu *n = current;
1343 struct desc_struct *gdt;
1344 struct desc_ptr gdt_desc;
1346 ASSERT(p != n);
1347 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
1349 if ( !is_idle_vcpu(p) )
1351 memcpy(&p->arch.guest_context.user_regs,
1352 stack_regs,
1353 CTXT_SWITCH_STACK_BYTES);
1354 unlazy_fpu(p);
1355 p->arch.ctxt_switch_from(p);
1358 /*
1359 * Mark this CPU in next domain's dirty cpumasks before calling
1360 * ctxt_switch_to(). This avoids a race on things like EPT flushing,
1361 * which is synchronised on that function.
1362 */
1363 if ( p->domain != n->domain )
1364 cpu_set(cpu, n->domain->domain_dirty_cpumask);
1365 cpu_set(cpu, n->vcpu_dirty_cpumask);
1367 if ( !is_idle_vcpu(n) )
1369 memcpy(stack_regs,
1370 &n->arch.guest_context.user_regs,
1371 CTXT_SWITCH_STACK_BYTES);
1372 n->arch.ctxt_switch_to(n);
1375 gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) :
1376 per_cpu(compat_gdt_table, cpu);
1377 if ( need_full_gdt(n) )
1379 struct page_info *page = virt_to_page(gdt);
1380 unsigned int i;
1381 for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ )
1382 l1e_write(n->arch.perdomain_ptes +
1383 FIRST_RESERVED_GDT_PAGE + i,
1384 l1e_from_page(page + i, __PAGE_HYPERVISOR));
1387 if ( need_full_gdt(p) &&
1388 ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(n)) )
1390 gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
1391 gdt_desc.base = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
1392 asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1395 write_ptbase(n);
1397 if ( need_full_gdt(n) &&
1398 ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(p)) )
1400 gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
1401 gdt_desc.base = GDT_VIRT_START(n);
1402 asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1405 if ( p->domain != n->domain )
1406 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
1407 cpu_clear(cpu, p->vcpu_dirty_cpumask);
1409 per_cpu(curr_vcpu, cpu) = n;
1413 void context_switch(struct vcpu *prev, struct vcpu *next)
1415 unsigned int cpu = smp_processor_id();
1416 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
1418 ASSERT(local_irq_is_enabled());
1420 /* Allow at most one CPU at a time to be dirty. */
1421 ASSERT(cpus_weight(dirty_mask) <= 1);
1422 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
1424 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1425 flush_tlb_mask(&dirty_mask);
1428 if (prev != next)
1429 update_runstate_area(prev);
1431 if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1432 pt_save_timer(prev);
1434 local_irq_disable();
1436 set_current(next);
1438 if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
1440 local_irq_enable();
1442 else
1444 __context_switch();
1446 #ifdef CONFIG_COMPAT
1447 if ( !is_hvm_vcpu(next) &&
1448 (is_idle_vcpu(prev) ||
1449 is_hvm_vcpu(prev) ||
1450 is_pv_32on64_vcpu(prev) != is_pv_32on64_vcpu(next)) )
1452 uint64_t efer = read_efer();
1453 if ( !(efer & EFER_SCE) )
1454 write_efer(efer | EFER_SCE);
1456 #endif
1458 /* Re-enable interrupts before restoring state which may fault. */
1459 local_irq_enable();
1461 if ( !is_hvm_vcpu(next) )
1463 load_LDT(next);
1464 load_segments(next);
1468 context_saved(prev);
1470 if (prev != next)
1471 update_runstate_area(next);
1473 schedule_tail(next);
1474 BUG();
1477 void continue_running(struct vcpu *same)
1479 schedule_tail(same);
1480 BUG();
1483 int __sync_lazy_execstate(void)
1485 unsigned long flags;
1486 int switch_required;
1488 local_irq_save(flags);
1490 switch_required = (this_cpu(curr_vcpu) != current);
1492 if ( switch_required )
1494 ASSERT(current == idle_vcpu[smp_processor_id()]);
1495 __context_switch();
1498 local_irq_restore(flags);
1500 return switch_required;
1503 void sync_vcpu_execstate(struct vcpu *v)
1505 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
1506 (void)__sync_lazy_execstate();
1508 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1509 flush_tlb_mask(&v->vcpu_dirty_cpumask);
1512 struct migrate_info {
1513 long (*func)(void *data);
1514 void *data;
1515 void (*saved_schedule_tail)(struct vcpu *);
1516 cpumask_t saved_affinity;
1517 unsigned int nest;
1518 };
1520 static void continue_hypercall_on_cpu_helper(struct vcpu *v)
1522 struct cpu_user_regs *regs = guest_cpu_user_regs();
1523 struct migrate_info *info = v->arch.continue_info;
1524 cpumask_t mask = info->saved_affinity;
1525 void (*saved_schedule_tail)(struct vcpu *) = info->saved_schedule_tail;
1527 regs->eax = info->func(info->data);
1529 if ( info->nest-- == 0 )
1531 xfree(info);
1532 v->arch.schedule_tail = saved_schedule_tail;
1533 v->arch.continue_info = NULL;
1534 vcpu_unlock_affinity(v, &mask);
1537 (*saved_schedule_tail)(v);
1540 int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
1542 struct vcpu *v = current;
1543 struct migrate_info *info;
1544 cpumask_t mask = cpumask_of_cpu(cpu);
1545 int rc;
1547 if ( cpu == smp_processor_id() )
1548 return func(data);
1550 info = v->arch.continue_info;
1551 if ( info == NULL )
1553 info = xmalloc(struct migrate_info);
1554 if ( info == NULL )
1555 return -ENOMEM;
1557 rc = vcpu_lock_affinity(v, &mask);
1558 if ( rc )
1560 xfree(info);
1561 return rc;
1564 info->saved_schedule_tail = v->arch.schedule_tail;
1565 info->saved_affinity = mask;
1566 info->nest = 0;
1568 v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
1569 v->arch.continue_info = info;
1571 else
1573 BUG_ON(info->nest != 0);
1574 rc = vcpu_locked_change_affinity(v, &mask);
1575 if ( rc )
1576 return rc;
1577 info->nest++;
1580 info->func = func;
1581 info->data = data;
1583 /* Dummy return value will be overwritten by new schedule_tail. */
1584 BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
1585 return 0;
1588 #define next_arg(fmt, args) ({ \
1589 unsigned long __arg; \
1590 switch ( *(fmt)++ ) \
1591 { \
1592 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
1593 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
1594 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
1595 default: __arg = 0; BUG(); \
1596 } \
1597 __arg; \
1598 })
1600 DEFINE_PER_CPU(char, hc_preempted);
1602 unsigned long hypercall_create_continuation(
1603 unsigned int op, const char *format, ...)
1605 struct mc_state *mcs = &this_cpu(mc_state);
1606 struct cpu_user_regs *regs;
1607 const char *p = format;
1608 unsigned long arg;
1609 unsigned int i;
1610 va_list args;
1612 va_start(args, format);
1614 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1616 __set_bit(_MCSF_call_preempted, &mcs->flags);
1618 for ( i = 0; *p != '\0'; i++ )
1619 mcs->call.args[i] = next_arg(p, args);
1620 if ( is_pv_32on64_domain(current->domain) )
1622 for ( ; i < 6; i++ )
1623 mcs->call.args[i] = 0;
1626 else
1628 regs = guest_cpu_user_regs();
1629 regs->eax = op;
1630 /*
1631 * For PV guest, we update EIP to re-execute 'syscall' / 'int 0x82';
1632 * HVM does not need this since 'vmcall' / 'vmmcall' is fault-like.
1633 */
1634 if ( !is_hvm_vcpu(current) )
1635 regs->eip -= 2; /* re-execute 'syscall' / 'int 0x82' */
1637 #ifdef __x86_64__
1638 if ( !is_hvm_vcpu(current) ?
1639 !is_pv_32on64_vcpu(current) :
1640 (hvm_guest_x86_mode(current) == 8) )
1642 for ( i = 0; *p != '\0'; i++ )
1644 arg = next_arg(p, args);
1645 switch ( i )
1647 case 0: regs->rdi = arg; break;
1648 case 1: regs->rsi = arg; break;
1649 case 2: regs->rdx = arg; break;
1650 case 3: regs->r10 = arg; break;
1651 case 4: regs->r8 = arg; break;
1652 case 5: regs->r9 = arg; break;
1656 else
1657 #endif
1659 if ( supervisor_mode_kernel )
1660 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
1662 for ( i = 0; *p != '\0'; i++ )
1664 arg = next_arg(p, args);
1665 switch ( i )
1667 case 0: regs->ebx = arg; break;
1668 case 1: regs->ecx = arg; break;
1669 case 2: regs->edx = arg; break;
1670 case 3: regs->esi = arg; break;
1671 case 4: regs->edi = arg; break;
1672 case 5: regs->ebp = arg; break;
1677 this_cpu(hc_preempted) = 1;
1680 va_end(args);
1682 return op;
1685 #ifdef CONFIG_COMPAT
1686 int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
1688 int rc = 0;
1689 struct mc_state *mcs = &this_cpu(mc_state);
1690 struct cpu_user_regs *regs;
1691 unsigned int i, cval = 0;
1692 unsigned long nval = 0;
1693 va_list args;
1695 BUG_ON(*id > 5);
1696 BUG_ON(mask & (1U << *id));
1698 va_start(args, mask);
1700 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1702 if ( !test_bit(_MCSF_call_preempted, &mcs->flags) )
1703 return 0;
1704 for ( i = 0; i < 6; ++i, mask >>= 1 )
1706 if ( mask & 1 )
1708 nval = va_arg(args, unsigned long);
1709 cval = va_arg(args, unsigned int);
1710 if ( cval == nval )
1711 mask &= ~1U;
1712 else
1713 BUG_ON(nval == (unsigned int)nval);
1715 else if ( id && *id == i )
1717 *id = mcs->call.args[i];
1718 id = NULL;
1720 if ( (mask & 1) && mcs->call.args[i] == nval )
1722 mcs->call.args[i] = cval;
1723 ++rc;
1725 else
1726 BUG_ON(mcs->call.args[i] != (unsigned int)mcs->call.args[i]);
1729 else
1731 regs = guest_cpu_user_regs();
1732 for ( i = 0; i < 6; ++i, mask >>= 1 )
1734 unsigned long *reg;
1736 switch ( i )
1738 case 0: reg = &regs->ebx; break;
1739 case 1: reg = &regs->ecx; break;
1740 case 2: reg = &regs->edx; break;
1741 case 3: reg = &regs->esi; break;
1742 case 4: reg = &regs->edi; break;
1743 case 5: reg = &regs->ebp; break;
1744 default: BUG(); reg = NULL; break;
1746 if ( (mask & 1) )
1748 nval = va_arg(args, unsigned long);
1749 cval = va_arg(args, unsigned int);
1750 if ( cval == nval )
1751 mask &= ~1U;
1752 else
1753 BUG_ON(nval == (unsigned int)nval);
1755 else if ( id && *id == i )
1757 *id = *reg;
1758 id = NULL;
1760 if ( (mask & 1) && *reg == nval )
1762 *reg = cval;
1763 ++rc;
1765 else
1766 BUG_ON(*reg != (unsigned int)*reg);
1770 va_end(args);
1772 return rc;
1774 #endif
1776 static int relinquish_memory(
1777 struct domain *d, struct page_list_head *list, unsigned long type)
1779 struct page_info *page;
1780 unsigned long x, y;
1781 int ret = 0;
1783 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1784 spin_lock_recursive(&d->page_alloc_lock);
1786 while ( (page = page_list_remove_head(list)) )
1788 /* Grab a reference to the page so it won't disappear from under us. */
1789 if ( unlikely(!get_page(page, d)) )
1791 /* Couldn't get a reference -- someone is freeing this page. */
1792 page_list_add_tail(page, &d->arch.relmem_list);
1793 continue;
1796 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1797 ret = put_page_and_type_preemptible(page, 1);
1798 switch ( ret )
1800 case 0:
1801 break;
1802 case -EAGAIN:
1803 case -EINTR:
1804 page_list_add(page, list);
1805 set_bit(_PGT_pinned, &page->u.inuse.type_info);
1806 put_page(page);
1807 goto out;
1808 default:
1809 BUG();
1812 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1813 put_page(page);
1815 /*
1816 * Forcibly invalidate top-most, still valid page tables at this point
1817 * to break circular 'linear page table' references as well as clean up
1818 * partially validated pages. This is okay because MMU structures are
1819 * not shared across domains and this domain is now dead. Thus top-most
1820 * valid tables are not in use so a non-zero count means circular
1821 * reference or partially validated.
1822 */
1823 y = page->u.inuse.type_info;
1824 for ( ; ; )
1826 x = y;
1827 if ( likely((x & PGT_type_mask) != type) ||
1828 likely(!(x & (PGT_validated|PGT_partial))) )
1829 break;
1831 y = cmpxchg(&page->u.inuse.type_info, x,
1832 x & ~(PGT_validated|PGT_partial));
1833 if ( likely(y == x) )
1835 /* No need for atomic update of type_info here: noone else updates it. */
1836 switch ( ret = free_page_type(page, x, 1) )
1838 case 0:
1839 break;
1840 case -EINTR:
1841 page_list_add(page, list);
1842 page->u.inuse.type_info |= PGT_validated;
1843 if ( x & PGT_partial )
1844 put_page(page);
1845 put_page(page);
1846 ret = -EAGAIN;
1847 goto out;
1848 case -EAGAIN:
1849 page_list_add(page, list);
1850 page->u.inuse.type_info |= PGT_partial;
1851 if ( x & PGT_partial )
1852 put_page(page);
1853 goto out;
1854 default:
1855 BUG();
1857 if ( x & PGT_partial )
1859 page->u.inuse.type_info--;
1860 put_page(page);
1862 break;
1866 /* Put the page on the list and /then/ potentially free it. */
1867 page_list_add_tail(page, &d->arch.relmem_list);
1868 put_page(page);
1870 if ( hypercall_preempt_check() )
1872 ret = -EAGAIN;
1873 goto out;
1877 /* list is empty at this point. */
1878 page_list_move(list, &d->arch.relmem_list);
1880 out:
1881 spin_unlock_recursive(&d->page_alloc_lock);
1882 return ret;
1885 static void vcpu_destroy_pagetables(struct vcpu *v)
1887 struct domain *d = v->domain;
1888 unsigned long pfn;
1890 #ifdef __x86_64__
1891 if ( is_pv_32on64_vcpu(v) )
1893 pfn = l4e_get_pfn(*(l4_pgentry_t *)
1894 __va(pagetable_get_paddr(v->arch.guest_table)));
1896 if ( pfn != 0 )
1898 if ( paging_mode_refcounts(d) )
1899 put_page(mfn_to_page(pfn));
1900 else
1901 put_page_and_type(mfn_to_page(pfn));
1904 l4e_write(
1905 (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
1906 l4e_empty());
1908 v->arch.cr3 = 0;
1909 return;
1911 #endif
1913 pfn = pagetable_get_pfn(v->arch.guest_table);
1914 if ( pfn != 0 )
1916 if ( paging_mode_refcounts(d) )
1917 put_page(mfn_to_page(pfn));
1918 else
1919 put_page_and_type(mfn_to_page(pfn));
1920 v->arch.guest_table = pagetable_null();
1923 #ifdef __x86_64__
1924 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
1925 pfn = pagetable_get_pfn(v->arch.guest_table_user);
1926 if ( pfn != 0 )
1928 if ( !is_pv_32bit_vcpu(v) )
1930 if ( paging_mode_refcounts(d) )
1931 put_page(mfn_to_page(pfn));
1932 else
1933 put_page_and_type(mfn_to_page(pfn));
1935 v->arch.guest_table_user = pagetable_null();
1937 #endif
1939 v->arch.cr3 = 0;
1942 int domain_relinquish_resources(struct domain *d)
1944 int ret;
1945 struct vcpu *v;
1947 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
1949 switch ( d->arch.relmem )
1951 case RELMEM_not_started:
1952 /* Tear down paging-assistance stuff. */
1953 paging_teardown(d);
1955 for_each_vcpu ( d, v )
1957 /* Drop the in-use references to page-table bases. */
1958 vcpu_destroy_pagetables(v);
1960 /*
1961 * Relinquish GDT mappings. No need for explicit unmapping of the
1962 * LDT as it automatically gets squashed with the guest mappings.
1963 */
1964 destroy_gdt(v);
1966 unmap_vcpu_info(v);
1969 if ( d->arch.pirq_eoi_map != NULL )
1971 unmap_domain_page_global(d->arch.pirq_eoi_map);
1972 put_page_and_type(mfn_to_page(d->arch.pirq_eoi_map_mfn));
1973 d->arch.pirq_eoi_map = NULL;
1976 d->arch.relmem = RELMEM_xen;
1977 /* fallthrough */
1979 /* Relinquish every page of memory. */
1980 case RELMEM_xen:
1981 ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
1982 if ( ret )
1983 return ret;
1984 #if CONFIG_PAGING_LEVELS >= 4
1985 d->arch.relmem = RELMEM_l4;
1986 /* fallthrough */
1988 case RELMEM_l4:
1989 ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1990 if ( ret )
1991 return ret;
1992 #endif
1993 #if CONFIG_PAGING_LEVELS >= 3
1994 d->arch.relmem = RELMEM_l3;
1995 /* fallthrough */
1997 case RELMEM_l3:
1998 ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
1999 if ( ret )
2000 return ret;
2001 #endif
2002 d->arch.relmem = RELMEM_l2;
2003 /* fallthrough */
2005 case RELMEM_l2:
2006 ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
2007 if ( ret )
2008 return ret;
2009 d->arch.relmem = RELMEM_done;
2010 /* fallthrough */
2012 case RELMEM_done:
2013 break;
2015 default:
2016 BUG();
2019 if ( is_hvm_domain(d) )
2020 hvm_domain_relinquish_resources(d);
2022 return 0;
2025 void arch_dump_domain_info(struct domain *d)
2027 paging_dump_domain_info(d);
2030 void arch_dump_vcpu_info(struct vcpu *v)
2032 paging_dump_vcpu_info(v);
2035 void domain_cpuid(
2036 struct domain *d,
2037 unsigned int input,
2038 unsigned int sub_input,
2039 unsigned int *eax,
2040 unsigned int *ebx,
2041 unsigned int *ecx,
2042 unsigned int *edx)
2044 cpuid_input_t *cpuid;
2045 int i;
2047 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
2049 cpuid = &d->arch.cpuids[i];
2051 if ( (cpuid->input[0] == input) &&
2052 ((cpuid->input[1] == XEN_CPUID_INPUT_UNUSED) ||
2053 (cpuid->input[1] == sub_input)) )
2055 *eax = cpuid->eax;
2056 *ebx = cpuid->ebx;
2057 *ecx = cpuid->ecx;
2058 *edx = cpuid->edx;
2060 /*
2061 * Do not advertise host's invariant TSC unless the TSC is
2062 * emulated, or the domain cannot migrate to other hosts.
2063 */
2064 if ( (input == 0x80000007) && /* Advanced Power Management */
2065 !d->disable_migrate && !d->arch.vtsc )
2066 *edx &= ~(1u<<8); /* TSC Invariant */
2068 return;
2072 *eax = *ebx = *ecx = *edx = 0;
2075 void vcpu_kick(struct vcpu *v)
2077 /*
2078 * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
2079 * pending flag. These values may fluctuate (after all, we hold no
2080 * locks) but the key insight is that each change will cause
2081 * evtchn_upcall_pending to be polled.
2083 * NB2. We save the running flag across the unblock to avoid a needless
2084 * IPI for domains that we IPI'd to unblock.
2085 */
2086 bool_t running = v->is_running;
2087 vcpu_unblock(v);
2088 if ( running && (in_irq() || (v != current)) )
2089 cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ);
2092 void vcpu_mark_events_pending(struct vcpu *v)
2094 int already_pending = test_and_set_bit(
2095 0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
2097 if ( already_pending )
2098 return;
2100 if ( is_hvm_vcpu(v) )
2101 hvm_assert_evtchn_irq(v);
2102 else
2103 vcpu_kick(v);
2106 static void vcpu_kick_softirq(void)
2108 /*
2109 * Nothing to do here: we merely prevent notifiers from racing with checks
2110 * executed on return to guest context with interrupts enabled. See, for
2111 * example, xxx_intr_assist() executed on return to HVM guest context.
2112 */
2115 static int __init init_vcpu_kick_softirq(void)
2117 open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
2118 return 0;
2120 __initcall(init_vcpu_kick_softirq);
2123 /*
2124 * Local variables:
2125 * mode: C
2126 * c-set-style: "BSD"
2127 * c-basic-offset: 4
2128 * tab-width: 4
2129 * indent-tabs-mode: nil
2130 * End:
2131 */