
view xen/arch/x86/domain.c @ 20672:2d072636c4f8

Pvrdtscp: move write_rdtscp_aux() to paravirt_ctxt_switch_to()

Currently write_rdtscp_aux() is placed in update_vcpu_system_time(),
which is called by schedule() before context_switch(). This breaks the
HVM guest TSC_AUX state because at that point the MSR has not yet been
saved for HVM guests. So put the call at the point where a PV vcpu is
actually scheduled in.

Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Dec 14 07:45:04 2009 +0000 (2009-12-14)
parents d53db6af369f
children 68e964ec2c7b
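
For reference, the placement this changeset describes is visible in paravirt_ctxt_switch_to() in the line source below (around lines 1302-1304 of this revision). The excerpt here is taken from that listing; the rest of the function body is elided with "..." for brevity:

static void paravirt_ctxt_switch_to(struct vcpu *v)
{
    ...
    /* PV vcpu being switched in: refresh TSC_AUX for pvrdtscp guests. */
    if ( (v->domain->arch.tsc_mode == TSC_MODE_PVRDTSCP) &&
         boot_cpu_has(X86_FEATURE_RDTSCP) )
        write_rdtscp_aux(v->domain->arch.incarnation);
}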
line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/domain.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/softirq.h>
23 #include <xen/grant_table.h>
24 #include <xen/iocap.h>
25 #include <xen/kernel.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <xen/compat.h>
32 #include <xen/acpi.h>
33 #include <xen/pci.h>
34 #include <xen/paging.h>
35 #include <public/sysctl.h>
36 #include <asm/regs.h>
37 #include <asm/mc146818rtc.h>
38 #include <asm/system.h>
39 #include <asm/io.h>
40 #include <asm/processor.h>
41 #include <asm/desc.h>
42 #include <asm/i387.h>
43 #include <asm/mpspec.h>
44 #include <asm/ldt.h>
45 #include <asm/hypercall.h>
46 #include <asm/hvm/hvm.h>
47 #include <asm/hvm/support.h>
48 #include <asm/debugreg.h>
49 #include <asm/msr.h>
50 #include <asm/traps.h>
51 #include <asm/nmi.h>
52 #include <xen/numa.h>
53 #include <xen/iommu.h>
54 #ifdef CONFIG_COMPAT
55 #include <compat/vcpu.h>
56 #endif
58 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
59 DEFINE_PER_CPU(u64, efer);
60 DEFINE_PER_CPU(unsigned long, cr4);
62 static void default_idle(void);
63 static void default_dead_idle(void);
64 void (*pm_idle) (void) __read_mostly = default_idle;
65 void (*dead_idle) (void) __read_mostly = default_dead_idle;
67 static void paravirt_ctxt_switch_from(struct vcpu *v);
68 static void paravirt_ctxt_switch_to(struct vcpu *v);
70 static void vcpu_destroy_pagetables(struct vcpu *v);
72 static void continue_idle_domain(struct vcpu *v)
73 {
74 reset_stack_and_jump(idle_loop);
75 }
77 static void continue_nonidle_domain(struct vcpu *v)
78 {
79 reset_stack_and_jump(ret_from_intr);
80 }
82 static void default_idle(void)
83 {
84 local_irq_disable();
85 if ( !softirq_pending(smp_processor_id()) )
86 safe_halt();
87 else
88 local_irq_enable();
89 }
91 static void default_dead_idle(void)
92 {
93 for ( ; ; )
94 halt();
95 }
97 static void play_dead(void)
98 {
99 /*
100 * Flush pending softirqs if any. They can be queued up before this CPU
101 * was taken out of cpu_online_map in __cpu_disable().
102 */
103 do_softirq();
105 /* This must be done before dead CPU ack */
106 cpu_exit_clear();
107 hvm_cpu_down();
108 wbinvd();
109 mb();
110 /* Ack it */
111 __get_cpu_var(cpu_state) = CPU_DEAD;
113 /* With physical CPU hotplug, we should halt the cpu. */
114 local_irq_disable();
115 (*dead_idle)();
116 }
118 void idle_loop(void)
119 {
120 for ( ; ; )
121 {
122 if ( cpu_is_offline(smp_processor_id()) )
123 play_dead();
124 (*pm_idle)();
125 do_softirq();
126 }
127 }
129 void startup_cpu_idle_loop(void)
130 {
131 struct vcpu *v = current;
133 ASSERT(is_idle_vcpu(v));
134 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
135 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
137 reset_stack_and_jump(idle_loop);
138 }
140 void dump_pageframe_info(struct domain *d)
141 {
142 struct page_info *page;
144 printk("Memory pages belonging to domain %u:\n", d->domain_id);
146 if ( d->tot_pages >= 10 )
147 {
148 printk(" DomPage list too long to display\n");
149 }
150 else
151 {
152 page_list_for_each ( page, &d->page_list )
153 {
154 printk(" DomPage %p: caf=%08lx, taf=%" PRtype_info "\n",
155 _p(page_to_mfn(page)),
156 page->count_info, page->u.inuse.type_info);
157 }
158 }
160 if ( is_hvm_domain(d) )
161 {
162 p2m_pod_dump_data(d);
163 }
165 page_list_for_each ( page, &d->xenpage_list )
166 {
167 printk(" XenPage %p: caf=%08lx, taf=%" PRtype_info "\n",
168 _p(page_to_mfn(page)),
169 page->count_info, page->u.inuse.type_info);
170 }
171 }
173 struct domain *alloc_domain_struct(void)
174 {
175 struct domain *d;
176 /*
177 * We pack the MFN of the domain structure into a 32-bit field within
178 * the page_info structure. Hence the MEMF_bits() restriction.
179 */
180 d = alloc_xenheap_pages(
181 get_order_from_bytes(sizeof(*d)), MEMF_bits(32 + PAGE_SHIFT));
182 if ( d != NULL )
183 memset(d, 0, sizeof(*d));
184 return d;
185 }
187 void free_domain_struct(struct domain *d)
188 {
189 lock_profile_deregister_struct(LOCKPROF_TYPE_PERDOM, d);
190 free_xenheap_pages(d, get_order_from_bytes(sizeof(*d)));
191 }
193 struct vcpu *alloc_vcpu_struct(void)
194 {
195 struct vcpu *v;
196 /*
197 * This structure contains embedded PAE PDPTEs, used when an HVM guest
198 * runs on shadow pagetables outside of 64-bit mode. In this case the CPU
199 * may require that the shadow CR3 points below 4GB, and hence the whole
200 * structure must satisfy this restriction. Thus we specify MEMF_bits(32).
201 */
202 v = alloc_xenheap_pages(get_order_from_bytes(sizeof(*v)), MEMF_bits(32));
203 if ( v != NULL )
204 memset(v, 0, sizeof(*v));
205 return v;
206 }
208 void free_vcpu_struct(struct vcpu *v)
209 {
210 free_xenheap_pages(v, get_order_from_bytes(sizeof(*v)));
211 }
213 #ifdef __x86_64__
215 static int setup_compat_l4(struct vcpu *v)
216 {
217 struct page_info *pg;
218 l4_pgentry_t *l4tab;
220 pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v)));
221 if ( pg == NULL )
222 return -ENOMEM;
224 /* This page needs to look like a pagetable so that it can be shadowed */
225 pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
227 l4tab = page_to_virt(pg);
228 copy_page(l4tab, idle_pg_table);
229 l4tab[0] = l4e_empty();
230 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
231 l4e_from_page(pg, __PAGE_HYPERVISOR);
232 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
233 l4e_from_paddr(__pa(v->domain->arch.mm_perdomain_l3),
234 __PAGE_HYPERVISOR);
236 v->arch.guest_table = pagetable_from_page(pg);
237 v->arch.guest_table_user = v->arch.guest_table;
239 return 0;
240 }
242 static void release_compat_l4(struct vcpu *v)
243 {
244 free_domheap_page(pagetable_get_page(v->arch.guest_table));
245 v->arch.guest_table = pagetable_null();
246 v->arch.guest_table_user = pagetable_null();
247 }
249 static inline int may_switch_mode(struct domain *d)
250 {
251 return (!is_hvm_domain(d) && (d->tot_pages == 0));
252 }
254 int switch_native(struct domain *d)
255 {
256 unsigned int vcpuid;
258 if ( d == NULL )
259 return -EINVAL;
260 if ( !may_switch_mode(d) )
261 return -EACCES;
262 if ( !is_pv_32on64_domain(d) )
263 return 0;
265 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
267 for ( vcpuid = 0; vcpuid < d->max_vcpus; vcpuid++ )
268 {
269 if (d->vcpu[vcpuid])
270 release_compat_l4(d->vcpu[vcpuid]);
271 }
273 return 0;
274 }
276 int switch_compat(struct domain *d)
277 {
278 unsigned int vcpuid;
280 if ( d == NULL )
281 return -EINVAL;
282 if ( !may_switch_mode(d) )
283 return -EACCES;
284 if ( is_pv_32on64_domain(d) )
285 return 0;
287 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
289 for ( vcpuid = 0; vcpuid < d->max_vcpus; vcpuid++ )
290 {
291 if ( (d->vcpu[vcpuid] != NULL) &&
292 (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
293 goto undo_and_fail;
294 }
296 domain_set_alloc_bitsize(d);
298 return 0;
300 undo_and_fail:
301 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
302 while ( vcpuid-- != 0 )
303 {
304 if ( d->vcpu[vcpuid] != NULL )
305 release_compat_l4(d->vcpu[vcpuid]);
306 }
307 return -ENOMEM;
308 }
310 #else
311 #define setup_compat_l4(v) 0
312 #define release_compat_l4(v) ((void)0)
313 #endif
315 int vcpu_initialise(struct vcpu *v)
316 {
317 struct domain *d = v->domain;
318 int rc;
320 v->arch.vcpu_info_mfn = INVALID_MFN;
322 v->arch.flags = TF_kernel_mode;
324 #if defined(__i386__)
325 mapcache_vcpu_init(v);
326 #else
327 {
328 unsigned int idx = perdomain_pt_pgidx(v);
329 struct page_info *pg;
331 if ( !perdomain_pt_page(d, idx) )
332 {
333 pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v)));
334 if ( !pg )
335 return -ENOMEM;
336 clear_page(page_to_virt(pg));
337 perdomain_pt_page(d, idx) = pg;
338 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+idx]
339 = l2e_from_page(pg, __PAGE_HYPERVISOR);
340 }
341 }
342 #endif
344 pae_l3_cache_init(&v->arch.pae_l3_cache);
346 paging_vcpu_init(v);
348 if ( is_hvm_domain(d) )
349 {
350 if ( (rc = hvm_vcpu_initialise(v)) != 0 )
351 return rc;
352 }
353 else
354 {
355 /* PV guests by default have a 100Hz ticker. */
356 if ( !is_idle_domain(d) )
357 v->periodic_period = MILLISECS(10);
359 /* PV guests get an emulated PIT too for video BIOSes to use. */
360 if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
361 pit_init(v, cpu_khz);
363 v->arch.schedule_tail = continue_nonidle_domain;
364 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
365 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
367 if ( is_idle_domain(d) )
368 {
369 v->arch.schedule_tail = continue_idle_domain;
370 v->arch.cr3 = __pa(idle_pg_table);
371 }
373 v->arch.guest_context.ctrlreg[4] =
374 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
375 }
377 v->arch.perdomain_ptes = perdomain_ptes(d, v);
379 spin_lock_init(&v->arch.shadow_ldt_lock);
381 return (is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0);
382 }
384 void vcpu_destroy(struct vcpu *v)
385 {
386 if ( is_pv_32on64_vcpu(v) )
387 release_compat_l4(v);
389 if ( is_hvm_vcpu(v) )
390 hvm_vcpu_destroy(v);
391 }
393 int arch_domain_create(struct domain *d, unsigned int domcr_flags)
394 {
395 #ifdef __x86_64__
396 struct page_info *pg;
397 #else
398 int pdpt_order;
399 #endif
400 int i, paging_initialised = 0;
401 int rc = -ENOMEM;
403 d->arch.hvm_domain.hap_enabled =
404 is_hvm_domain(d) &&
405 hvm_funcs.hap_supported &&
406 (domcr_flags & DOMCRF_hap);
408 d->arch.s3_integrity = !!(domcr_flags & DOMCRF_s3_integrity);
410 INIT_LIST_HEAD(&d->arch.pdev_list);
412 d->arch.relmem = RELMEM_not_started;
413 INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
415 #if defined(__i386__)
417 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
418 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order, 0);
419 if ( d->arch.mm_perdomain_pt == NULL )
420 goto fail;
421 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
423 mapcache_domain_init(d);
425 #else /* __x86_64__ */
427 BUILD_BUG_ON(PDPT_L2_ENTRIES * sizeof(*d->arch.mm_perdomain_pt_pages)
428 != PAGE_SIZE);
429 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
430 if ( !pg )
431 goto fail;
432 d->arch.mm_perdomain_pt_pages = page_to_virt(pg);
433 clear_page(d->arch.mm_perdomain_pt_pages);
435 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
436 if ( pg == NULL )
437 goto fail;
438 d->arch.mm_perdomain_l2 = page_to_virt(pg);
439 clear_page(d->arch.mm_perdomain_l2);
441 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
442 if ( pg == NULL )
443 goto fail;
444 d->arch.mm_perdomain_l3 = page_to_virt(pg);
445 clear_page(d->arch.mm_perdomain_l3);
446 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
447 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
448 __PAGE_HYPERVISOR);
450 HYPERVISOR_COMPAT_VIRT_START(d) =
451 is_hvm_domain(d) ? ~0u : __HYPERVISOR_COMPAT_VIRT_START;
453 #endif /* __x86_64__ */
455 if ( (rc = paging_domain_init(d, domcr_flags)) != 0 )
456 goto fail;
457 paging_initialised = 1;
459 if ( !is_idle_domain(d) )
460 {
461 d->arch.ioport_caps =
462 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
463 rc = -ENOMEM;
464 if ( d->arch.ioport_caps == NULL )
465 goto fail;
467 /*
468 * The shared_info machine address must fit in a 32-bit field within a
469 * 32-bit guest's start_info structure. Hence we specify MEMF_bits(32).
470 */
471 if ( (d->shared_info = alloc_xenheap_pages(0, MEMF_bits(32))) == NULL )
472 goto fail;
474 clear_page(d->shared_info);
475 share_xen_page_with_guest(
476 virt_to_page(d->shared_info), d, XENSHARE_writable);
478 d->arch.pirq_irq = xmalloc_array(int, d->nr_pirqs);
479 if ( !d->arch.pirq_irq )
480 goto fail;
481 memset(d->arch.pirq_irq, 0,
482 d->nr_pirqs * sizeof(*d->arch.pirq_irq));
484 d->arch.irq_pirq = xmalloc_array(int, nr_irqs);
485 if ( !d->arch.irq_pirq )
486 goto fail;
487 memset(d->arch.irq_pirq, 0,
488 nr_irqs * sizeof(*d->arch.irq_pirq));
490 for ( i = 1; platform_legacy_irq(i); ++i )
491 if ( !IO_APIC_IRQ(i) )
492 d->arch.irq_pirq[i] = d->arch.pirq_irq[i] = i;
494 if ( (rc = iommu_domain_init(d)) != 0 )
495 goto fail;
497 /* For Guest vMCE MSRs virtualization */
498 mce_init_msr(d);
499 }
501 if ( is_hvm_domain(d) )
502 {
503 if ( (rc = hvm_domain_initialise(d)) != 0 )
504 {
505 iommu_domain_destroy(d);
506 goto fail;
507 }
508 }
509 else
510 {
511 /* 32-bit PV guest by default only if Xen is not 64-bit. */
512 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
513 (CONFIG_PAGING_LEVELS != 4);
514 }
516 memset(d->arch.cpuids, 0, sizeof(d->arch.cpuids));
517 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
518 {
519 d->arch.cpuids[i].input[0] = XEN_CPUID_INPUT_UNUSED;
520 d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
521 }
523 /* initialize default tsc behavior in case tools don't */
524 tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0);
525 spin_lock_init(&d->arch.vtsc_lock);
527 return 0;
529 fail:
530 d->is_dying = DOMDYING_dead;
531 xfree(d->arch.pirq_irq);
532 xfree(d->arch.irq_pirq);
533 free_xenheap_page(d->shared_info);
534 if ( paging_initialised )
535 paging_final_teardown(d);
536 #ifdef __x86_64__
537 if ( d->arch.mm_perdomain_l2 )
538 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
539 if ( d->arch.mm_perdomain_l3 )
540 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
541 if ( d->arch.mm_perdomain_pt_pages )
542 free_domheap_page(virt_to_page(d->arch.mm_perdomain_pt_pages));
543 #else
544 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
545 #endif
546 return rc;
547 }
549 void arch_domain_destroy(struct domain *d)
550 {
551 #ifdef __x86_64__
552 unsigned int i;
553 #endif
555 if ( is_hvm_domain(d) )
556 hvm_domain_destroy(d);
558 pci_release_devices(d);
559 free_domain_pirqs(d);
560 if ( !is_idle_domain(d) )
561 iommu_domain_destroy(d);
563 paging_final_teardown(d);
565 #ifdef __i386__
566 free_xenheap_pages(
567 d->arch.mm_perdomain_pt,
568 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
569 #else
570 for ( i = 0; i < PDPT_L2_ENTRIES; ++i )
571 {
572 if ( perdomain_pt_page(d, i) )
573 free_domheap_page(perdomain_pt_page(d, i));
574 }
575 free_domheap_page(virt_to_page(d->arch.mm_perdomain_pt_pages));
576 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
577 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
578 #endif
580 free_xenheap_page(d->shared_info);
581 xfree(d->arch.pirq_irq);
582 xfree(d->arch.irq_pirq);
583 }
585 unsigned long pv_guest_cr4_fixup(unsigned long guest_cr4)
586 {
587 unsigned long hv_cr4_mask, hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4());
589 hv_cr4_mask = ~X86_CR4_TSD;
590 if ( cpu_has_de )
591 hv_cr4_mask &= ~X86_CR4_DE;
593 if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) )
594 gdprintk(XENLOG_WARNING,
595 "Attempt to change CR4 flags %08lx -> %08lx\n",
596 hv_cr4, guest_cr4);
598 return (hv_cr4 & hv_cr4_mask) | (guest_cr4 & ~hv_cr4_mask);
599 }
601 /* This is called by arch_final_setup_guest and do_boot_vcpu */
602 int arch_set_info_guest(
603 struct vcpu *v, vcpu_guest_context_u c)
604 {
605 struct domain *d = v->domain;
606 unsigned long cr3_pfn = INVALID_MFN;
607 unsigned long flags, cr4;
608 int i, rc = 0, compat;
610 /* The context is a compat-mode one if the target domain is compat-mode;
611 * we expect the tools to DTRT even in compat-mode callers. */
612 compat = is_pv_32on64_domain(d);
614 #ifdef CONFIG_COMPAT
615 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
616 #else
617 #define c(fld) (c.nat->fld)
618 #endif
619 flags = c(flags);
621 if ( !is_hvm_vcpu(v) )
622 {
623 if ( !compat )
624 {
625 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
626 fixup_guest_stack_selector(d, c.nat->kernel_ss);
627 fixup_guest_code_selector(d, c.nat->user_regs.cs);
628 #ifdef __i386__
629 fixup_guest_code_selector(d, c.nat->event_callback_cs);
630 fixup_guest_code_selector(d, c.nat->failsafe_callback_cs);
631 #endif
633 for ( i = 0; i < 256; i++ )
634 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
636 /* LDT safety checks. */
637 if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) ||
638 (c.nat->ldt_ents > 8192) ||
639 !array_access_ok(c.nat->ldt_base,
640 c.nat->ldt_ents,
641 LDT_ENTRY_SIZE) )
642 return -EINVAL;
643 }
644 #ifdef CONFIG_COMPAT
645 else
646 {
647 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
648 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
649 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
650 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
651 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
653 for ( i = 0; i < 256; i++ )
654 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
656 /* LDT safety checks. */
657 if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) ||
658 (c.cmp->ldt_ents > 8192) ||
659 !compat_array_access_ok(c.cmp->ldt_base,
660 c.cmp->ldt_ents,
661 LDT_ENTRY_SIZE) )
662 return -EINVAL;
663 }
664 #endif
665 }
667 v->fpu_initialised = !!(flags & VGCF_I387_VALID);
669 v->arch.flags &= ~TF_kernel_mode;
670 if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
671 v->arch.flags |= TF_kernel_mode;
673 if ( !compat )
674 memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat));
675 #ifdef CONFIG_COMPAT
676 else
677 XLAT_vcpu_guest_context(&v->arch.guest_context, c.cmp);
678 #endif
680 v->arch.guest_context.user_regs.eflags |= 2;
682 if ( is_hvm_vcpu(v) )
683 {
684 hvm_set_info_guest(v);
685 goto out;
686 }
688 /* Only CR0.TS is modifiable by guest or admin. */
689 v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
690 v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
692 init_int80_direct_trap(v);
694 /* IOPL privileges are virtualised. */
695 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
696 v->arch.guest_context.user_regs.eflags &= ~X86_EFLAGS_IOPL;
698 /* Ensure real hardware interrupts are enabled. */
699 v->arch.guest_context.user_regs.eflags |= X86_EFLAGS_IF;
701 cr4 = v->arch.guest_context.ctrlreg[4];
702 v->arch.guest_context.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(cr4) :
703 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
705 memset(v->arch.guest_context.debugreg, 0,
706 sizeof(v->arch.guest_context.debugreg));
707 for ( i = 0; i < 8; i++ )
708 (void)set_debugreg(v, i, c(debugreg[i]));
710 if ( v->is_initialised )
711 goto out;
713 if ( v->vcpu_id == 0 )
714 d->vm_assist = c(vm_assist);
716 if ( !compat )
717 rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
718 #ifdef CONFIG_COMPAT
719 else
720 {
721 unsigned long gdt_frames[ARRAY_SIZE(c.cmp->gdt_frames)];
722 unsigned int i, n = (c.cmp->gdt_ents + 511) / 512;
724 if ( n > ARRAY_SIZE(c.cmp->gdt_frames) )
725 return -EINVAL;
726 for ( i = 0; i < n; ++i )
727 gdt_frames[i] = c.cmp->gdt_frames[i];
728 rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents);
729 }
730 #endif
731 if ( rc != 0 )
732 return rc;
734 if ( !compat )
735 {
736 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[3]));
738 if ( !mfn_valid(cr3_pfn) ||
739 (paging_mode_refcounts(d)
740 ? !get_page(mfn_to_page(cr3_pfn), d)
741 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
742 PGT_base_page_table)) )
743 {
744 destroy_gdt(v);
745 return -EINVAL;
746 }
748 v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
750 #ifdef __x86_64__
751 if ( c.nat->ctrlreg[1] )
752 {
753 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[1]));
755 if ( !mfn_valid(cr3_pfn) ||
756 (paging_mode_refcounts(d)
757 ? !get_page(mfn_to_page(cr3_pfn), d)
758 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
759 PGT_base_page_table)) )
760 {
761 cr3_pfn = pagetable_get_pfn(v->arch.guest_table);
762 v->arch.guest_table = pagetable_null();
763 if ( paging_mode_refcounts(d) )
764 put_page(mfn_to_page(cr3_pfn));
765 else
766 put_page_and_type(mfn_to_page(cr3_pfn));
767 destroy_gdt(v);
768 return -EINVAL;
769 }
771 v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn);
772 }
773 }
774 else
775 {
776 l4_pgentry_t *l4tab;
778 cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
780 if ( !mfn_valid(cr3_pfn) ||
781 (paging_mode_refcounts(d)
782 ? !get_page(mfn_to_page(cr3_pfn), d)
783 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
784 PGT_l3_page_table)) )
785 {
786 destroy_gdt(v);
787 return -EINVAL;
788 }
790 l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
791 *l4tab = l4e_from_pfn(
792 cr3_pfn, _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
793 #endif
794 }
796 if ( v->vcpu_id == 0 )
797 update_domain_wallclock_time(d);
799 /* Don't redo final setup */
800 v->is_initialised = 1;
802 if ( paging_mode_enabled(d) )
803 paging_update_paging_modes(v);
805 update_cr3(v);
807 out:
808 if ( flags & VGCF_online )
809 clear_bit(_VPF_down, &v->pause_flags);
810 else
811 set_bit(_VPF_down, &v->pause_flags);
812 return 0;
813 #undef c
814 }
816 void arch_vcpu_reset(struct vcpu *v)
817 {
818 if ( !is_hvm_vcpu(v) )
819 {
820 destroy_gdt(v);
821 vcpu_destroy_pagetables(v);
822 }
823 else
824 {
825 vcpu_end_shutdown_deferral(v);
826 }
827 }
829 /*
830 * Unmap the vcpu info page if the guest decided to place it somewhere
831 * else. This is only used from arch_domain_destroy, so there's no
832 * need to do anything clever.
833 */
834 static void
835 unmap_vcpu_info(struct vcpu *v)
836 {
837 unsigned long mfn;
839 if ( v->arch.vcpu_info_mfn == INVALID_MFN )
840 return;
842 mfn = v->arch.vcpu_info_mfn;
843 unmap_domain_page_global(v->vcpu_info);
845 v->vcpu_info = &dummy_vcpu_info;
846 v->arch.vcpu_info_mfn = INVALID_MFN;
848 put_page_and_type(mfn_to_page(mfn));
849 }
851 /*
852 * Map a guest page in and point the vcpu_info pointer at it. This
853 * makes sure that the vcpu_info is always pointing at a valid piece
854 * of memory, and it sets a pending event to make sure that a pending
855 * event doesn't get missed.
856 */
857 static int
858 map_vcpu_info(struct vcpu *v, unsigned long mfn, unsigned offset)
859 {
860 struct domain *d = v->domain;
861 void *mapping;
862 vcpu_info_t *new_info;
863 int i;
865 if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) )
866 return -EINVAL;
868 if ( v->arch.vcpu_info_mfn != INVALID_MFN )
869 return -EINVAL;
871 /* Run this command on yourself or on other offline VCPUS. */
872 if ( (v != current) && !test_bit(_VPF_down, &v->pause_flags) )
873 return -EINVAL;
875 mfn = gmfn_to_mfn(d, mfn);
876 if ( !mfn_valid(mfn) ||
877 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
878 return -EINVAL;
880 mapping = map_domain_page_global(mfn);
881 if ( mapping == NULL )
882 {
883 put_page_and_type(mfn_to_page(mfn));
884 return -ENOMEM;
885 }
887 new_info = (vcpu_info_t *)(mapping + offset);
889 if ( v->vcpu_info == &dummy_vcpu_info )
890 {
891 memset(new_info, 0, sizeof(*new_info));
892 __vcpu_info(v, new_info, evtchn_upcall_mask) = 1;
893 }
894 else
895 {
896 memcpy(new_info, v->vcpu_info, sizeof(*new_info));
897 }
899 v->vcpu_info = new_info;
900 v->arch.vcpu_info_mfn = mfn;
902 /* Set new vcpu_info pointer /before/ setting pending flags. */
903 wmb();
905 /*
906 * Mark everything as being pending just to make sure nothing gets
907 * lost. The domain will get a spurious event, but it can cope.
908 */
909 vcpu_info(v, evtchn_upcall_pending) = 1;
910 for ( i = 0; i < BITS_PER_EVTCHN_WORD(d); i++ )
911 set_bit(i, &vcpu_info(v, evtchn_pending_sel));
913 return 0;
914 }
916 long
917 arch_do_vcpu_op(
918 int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
919 {
920 long rc = 0;
922 switch ( cmd )
923 {
924 case VCPUOP_register_runstate_memory_area:
925 {
926 struct vcpu_register_runstate_memory_area area;
927 struct vcpu_runstate_info runstate;
929 rc = -EFAULT;
930 if ( copy_from_guest(&area, arg, 1) )
931 break;
933 if ( !guest_handle_okay(area.addr.h, 1) )
934 break;
936 rc = 0;
937 runstate_guest(v) = area.addr.h;
939 if ( v == current )
940 {
941 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
942 }
943 else
944 {
945 vcpu_runstate_get(v, &runstate);
946 __copy_to_guest(runstate_guest(v), &runstate, 1);
947 }
949 break;
950 }
952 case VCPUOP_register_vcpu_info:
953 {
954 struct domain *d = v->domain;
955 struct vcpu_register_vcpu_info info;
957 rc = -EFAULT;
958 if ( copy_from_guest(&info, arg, 1) )
959 break;
961 domain_lock(d);
962 rc = map_vcpu_info(v, info.mfn, info.offset);
963 domain_unlock(d);
965 break;
966 }
968 case VCPUOP_register_vcpu_time_memory_area:
969 {
970 struct vcpu_register_time_memory_area area;
972 rc = -EFAULT;
973 if ( copy_from_guest(&area, arg, 1) )
974 break;
976 if ( !guest_handle_okay(area.addr.h, 1) )
977 break;
979 rc = 0;
980 v->arch.time_info_guest = area.addr.h;
982 force_update_vcpu_system_time(v);
984 break;
985 }
987 case VCPUOP_get_physid:
988 {
989 struct vcpu_get_physid cpu_id;
991 rc = -EINVAL;
992 if ( !v->domain->is_pinned )
993 break;
995 cpu_id.phys_id =
996 (uint64_t)x86_cpu_to_apicid[v->vcpu_id] |
997 ((uint64_t)acpi_get_processor_id(v->vcpu_id) << 32);
999 rc = -EFAULT;
1000 if ( copy_to_guest(arg, &cpu_id, 1) )
1001 break;
1003 rc = 0;
1004 break;
1007 default:
1008 rc = -ENOSYS;
1009 break;
1012 return rc;
1015 #ifdef __x86_64__
1017 #define loadsegment(seg,value) ({ \
1018 int __r = 1; \
1019 asm volatile ( \
1020 "1: movl %k1,%%" #seg "\n2:\n" \
1021 ".section .fixup,\"ax\"\n" \
1022 "3: xorl %k0,%k0\n" \
1023 " movl %k0,%%" #seg "\n" \
1024 " jmp 2b\n" \
1025 ".previous\n" \
1026 ".section __ex_table,\"a\"\n" \
1027 " .align 8\n" \
1028 " .quad 1b,3b\n" \
1029 ".previous" \
1030 : "=r" (__r) : "r" (value), "0" (__r) );\
1031 __r; })
1033 /*
1034 * save_segments() writes a mask of segments which are dirty (non-zero),
1035 * allowing load_segments() to avoid some expensive segment loads and
1036 * MSR writes.
1037 */
1038 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
1039 #define DIRTY_DS 0x01
1040 #define DIRTY_ES 0x02
1041 #define DIRTY_FS 0x04
1042 #define DIRTY_GS 0x08
1043 #define DIRTY_FS_BASE 0x10
1044 #define DIRTY_GS_BASE_USER 0x20
1046 static void load_segments(struct vcpu *n)
1048 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
1049 int all_segs_okay = 1;
1050 unsigned int dirty_segment_mask, cpu = smp_processor_id();
1052 /* Load and clear the dirty segment mask. */
1053 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
1054 per_cpu(dirty_segment_mask, cpu) = 0;
1056 /* Either selector != 0 ==> reload. */
1057 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
1058 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
1060 /* Either selector != 0 ==> reload. */
1061 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
1062 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
1064 /*
1065 * Either selector != 0 ==> reload.
1066 * Also reload to reset FS_BASE if it was non-zero.
1067 */
1068 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
1069 nctxt->user_regs.fs) )
1070 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
1072 /*
1073 * Either selector != 0 ==> reload.
1074 * Also reload to reset GS_BASE if it was non-zero.
1075 */
1076 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
1077 nctxt->user_regs.gs) )
1079 /* Reset GS_BASE with user %gs? */
1080 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
1081 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
1084 if ( !is_pv_32on64_domain(n->domain) )
1086 /* This can only be non-zero if selector is NULL. */
1087 if ( nctxt->fs_base )
1088 wrmsr(MSR_FS_BASE,
1089 nctxt->fs_base,
1090 nctxt->fs_base>>32);
1092 /* Most kernels have non-zero GS base, so don't bother testing. */
1093 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
1094 wrmsr(MSR_SHADOW_GS_BASE,
1095 nctxt->gs_base_kernel,
1096 nctxt->gs_base_kernel>>32);
1098 /* This can only be non-zero if selector is NULL. */
1099 if ( nctxt->gs_base_user )
1100 wrmsr(MSR_GS_BASE,
1101 nctxt->gs_base_user,
1102 nctxt->gs_base_user>>32);
1104 /* If in kernel mode then switch the GS bases around. */
1105 if ( (n->arch.flags & TF_kernel_mode) )
1106 asm volatile ( "swapgs" );
1109 if ( unlikely(!all_segs_okay) )
1111 struct cpu_user_regs *regs = guest_cpu_user_regs();
1112 unsigned long *rsp =
1113 (n->arch.flags & TF_kernel_mode) ?
1114 (unsigned long *)regs->rsp :
1115 (unsigned long *)nctxt->kernel_sp;
1116 unsigned long cs_and_mask, rflags;
1118 if ( is_pv_32on64_domain(n->domain) )
1120 unsigned int *esp = ring_1(regs) ?
1121 (unsigned int *)regs->rsp :
1122 (unsigned int *)nctxt->kernel_sp;
1123 unsigned int cs_and_mask, eflags;
1124 int ret = 0;
1126 /* CS longword also contains full evtchn_upcall_mask. */
1127 cs_and_mask = (unsigned short)regs->cs |
1128 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
1129 /* Fold upcall mask into RFLAGS.IF. */
1130 eflags = regs->_eflags & ~X86_EFLAGS_IF;
1131 eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1133 if ( !ring_1(regs) )
1135 ret = put_user(regs->ss, esp-1);
1136 ret |= put_user(regs->_esp, esp-2);
1137 esp -= 2;
1140 if ( ret |
1141 put_user(eflags, esp-1) |
1142 put_user(cs_and_mask, esp-2) |
1143 put_user(regs->_eip, esp-3) |
1144 put_user(nctxt->user_regs.gs, esp-4) |
1145 put_user(nctxt->user_regs.fs, esp-5) |
1146 put_user(nctxt->user_regs.es, esp-6) |
1147 put_user(nctxt->user_regs.ds, esp-7) )
1149 gdprintk(XENLOG_ERR, "Error while creating compat "
1150 "failsafe callback frame.\n");
1151 domain_crash(n->domain);
1154 if ( test_bit(_VGCF_failsafe_disables_events,
1155 &n->arch.guest_context.flags) )
1156 vcpu_info(n, evtchn_upcall_mask) = 1;
1158 regs->entry_vector = TRAP_syscall;
1159 regs->_eflags &= 0xFFFCBEFFUL;
1160 regs->ss = FLAT_COMPAT_KERNEL_SS;
1161 regs->_esp = (unsigned long)(esp-7);
1162 regs->cs = FLAT_COMPAT_KERNEL_CS;
1163 regs->_eip = nctxt->failsafe_callback_eip;
1164 return;
1167 if ( !(n->arch.flags & TF_kernel_mode) )
1168 toggle_guest_mode(n);
1169 else
1170 regs->cs &= ~3;
1172 /* CS longword also contains full evtchn_upcall_mask. */
1173 cs_and_mask = (unsigned long)regs->cs |
1174 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1176 /* Fold upcall mask into RFLAGS.IF. */
1177 rflags = regs->rflags & ~X86_EFLAGS_IF;
1178 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1180 if ( put_user(regs->ss, rsp- 1) |
1181 put_user(regs->rsp, rsp- 2) |
1182 put_user(rflags, rsp- 3) |
1183 put_user(cs_and_mask, rsp- 4) |
1184 put_user(regs->rip, rsp- 5) |
1185 put_user(nctxt->user_regs.gs, rsp- 6) |
1186 put_user(nctxt->user_regs.fs, rsp- 7) |
1187 put_user(nctxt->user_regs.es, rsp- 8) |
1188 put_user(nctxt->user_regs.ds, rsp- 9) |
1189 put_user(regs->r11, rsp-10) |
1190 put_user(regs->rcx, rsp-11) )
1192 gdprintk(XENLOG_ERR, "Error while creating failsafe "
1193 "callback frame.\n");
1194 domain_crash(n->domain);
1197 if ( test_bit(_VGCF_failsafe_disables_events,
1198 &n->arch.guest_context.flags) )
1199 vcpu_info(n, evtchn_upcall_mask) = 1;
1201 regs->entry_vector = TRAP_syscall;
1202 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1203 X86_EFLAGS_NT|X86_EFLAGS_TF);
1204 regs->ss = FLAT_KERNEL_SS;
1205 regs->rsp = (unsigned long)(rsp-11);
1206 regs->cs = FLAT_KERNEL_CS;
1207 regs->rip = nctxt->failsafe_callback_eip;
1211 static void save_segments(struct vcpu *v)
1213 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
1214 struct cpu_user_regs *regs = &ctxt->user_regs;
1215 unsigned int dirty_segment_mask = 0;
1217 regs->ds = read_segment_register(ds);
1218 regs->es = read_segment_register(es);
1219 regs->fs = read_segment_register(fs);
1220 regs->gs = read_segment_register(gs);
1222 if ( regs->ds )
1223 dirty_segment_mask |= DIRTY_DS;
1225 if ( regs->es )
1226 dirty_segment_mask |= DIRTY_ES;
1228 if ( regs->fs || is_pv_32on64_domain(v->domain) )
1230 dirty_segment_mask |= DIRTY_FS;
1231 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
1233 else if ( ctxt->fs_base )
1235 dirty_segment_mask |= DIRTY_FS_BASE;
1238 if ( regs->gs || is_pv_32on64_domain(v->domain) )
1240 dirty_segment_mask |= DIRTY_GS;
1241 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
1243 else if ( ctxt->gs_base_user )
1245 dirty_segment_mask |= DIRTY_GS_BASE_USER;
1248 this_cpu(dirty_segment_mask) = dirty_segment_mask;
1251 #define switch_kernel_stack(v) ((void)0)
1253 #elif defined(__i386__)
1255 #define load_segments(n) ((void)0)
1256 #define save_segments(p) ((void)0)
1258 static inline void switch_kernel_stack(struct vcpu *v)
1260 struct tss_struct *tss = &this_cpu(init_tss);
1261 tss->esp1 = v->arch.guest_context.kernel_sp;
1262 tss->ss1 = v->arch.guest_context.kernel_ss;
1265 #endif /* __i386__ */
1267 static void paravirt_ctxt_switch_from(struct vcpu *v)
1268 {
1269 save_segments(v);
1271 /*
1272 * Disable debug breakpoints. We do this aggressively because if we switch
1273 * to an HVM guest we may load DR0-DR3 with values that can cause #DE
1274 * inside Xen, before we get a chance to reload DR7, and this cannot always
1275 * safely be handled.
1276 */
1277 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1278 write_debugreg(7, 0);
1279 }
1281 static void paravirt_ctxt_switch_to(struct vcpu *v)
1282 {
1283 unsigned long cr4;
1285 set_int80_direct_trap(v);
1286 switch_kernel_stack(v);
1288 cr4 = pv_guest_cr4_to_real_cr4(v);
1289 if ( unlikely(cr4 != read_cr4()) )
1290 write_cr4(cr4);
1292 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1293 {
1294 write_debugreg(0, v->arch.guest_context.debugreg[0]);
1295 write_debugreg(1, v->arch.guest_context.debugreg[1]);
1296 write_debugreg(2, v->arch.guest_context.debugreg[2]);
1297 write_debugreg(3, v->arch.guest_context.debugreg[3]);
1298 write_debugreg(6, v->arch.guest_context.debugreg[6]);
1299 write_debugreg(7, v->arch.guest_context.debugreg[7]);
1300 }
1302 if ( (v->domain->arch.tsc_mode == TSC_MODE_PVRDTSCP) &&
1303 boot_cpu_has(X86_FEATURE_RDTSCP) )
1304 write_rdtscp_aux(v->domain->arch.incarnation);
1305 }
1307 /* Update per-VCPU guest runstate shared memory area (if registered). */
1308 static void update_runstate_area(struct vcpu *v)
1309 {
1310 if ( guest_handle_is_null(runstate_guest(v)) )
1311 return;
1313 #ifdef CONFIG_COMPAT
1314 if ( is_pv_32on64_domain(v->domain) )
1315 {
1316 struct compat_vcpu_runstate_info info;
1318 XLAT_vcpu_runstate_info(&info, &v->runstate);
1319 __copy_to_guest(v->runstate_guest.compat, &info, 1);
1320 return;
1321 }
1322 #endif
1324 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
1325 }
1327 static inline int need_full_gdt(struct vcpu *v)
1328 {
1329 return (!is_hvm_vcpu(v) && !is_idle_vcpu(v));
1330 }
1332 static void __context_switch(void)
1334 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1335 unsigned int cpu = smp_processor_id();
1336 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1337 struct vcpu *n = current;
1338 struct desc_struct *gdt;
1339 struct desc_ptr gdt_desc;
1341 ASSERT(p != n);
1342 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
1344 if ( !is_idle_vcpu(p) )
1346 memcpy(&p->arch.guest_context.user_regs,
1347 stack_regs,
1348 CTXT_SWITCH_STACK_BYTES);
1349 unlazy_fpu(p);
1350 p->arch.ctxt_switch_from(p);
1353 /*
1354 * Mark this CPU in next domain's dirty cpumasks before calling
1355 * ctxt_switch_to(). This avoids a race on things like EPT flushing,
1356 * which is synchronised on that function.
1357 */
1358 if ( p->domain != n->domain )
1359 cpu_set(cpu, n->domain->domain_dirty_cpumask);
1360 cpu_set(cpu, n->vcpu_dirty_cpumask);
1362 if ( !is_idle_vcpu(n) )
1364 memcpy(stack_regs,
1365 &n->arch.guest_context.user_regs,
1366 CTXT_SWITCH_STACK_BYTES);
1367 n->arch.ctxt_switch_to(n);
1370 gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) :
1371 per_cpu(compat_gdt_table, cpu);
1372 if ( need_full_gdt(n) )
1374 struct page_info *page = virt_to_page(gdt);
1375 unsigned int i;
1376 for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ )
1377 l1e_write(n->arch.perdomain_ptes +
1378 FIRST_RESERVED_GDT_PAGE + i,
1379 l1e_from_page(page + i, __PAGE_HYPERVISOR));
1382 if ( need_full_gdt(p) &&
1383 ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(n)) )
1385 gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
1386 gdt_desc.base = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
1387 asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1390 write_ptbase(n);
1392 if ( need_full_gdt(n) &&
1393 ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(p)) )
1395 gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
1396 gdt_desc.base = GDT_VIRT_START(n);
1397 asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1400 if ( p->domain != n->domain )
1401 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
1402 cpu_clear(cpu, p->vcpu_dirty_cpumask);
1404 per_cpu(curr_vcpu, cpu) = n;
1408 void context_switch(struct vcpu *prev, struct vcpu *next)
1410 unsigned int cpu = smp_processor_id();
1411 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
1413 ASSERT(local_irq_is_enabled());
1415 /* Allow at most one CPU at a time to be dirty. */
1416 ASSERT(cpus_weight(dirty_mask) <= 1);
1417 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
1419 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1420 flush_tlb_mask(&dirty_mask);
1423 if (prev != next)
1424 update_runstate_area(prev);
1426 if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1427 pt_save_timer(prev);
1429 local_irq_disable();
1431 set_current(next);
1433 if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
1435 local_irq_enable();
1437 else
1439 __context_switch();
1441 #ifdef CONFIG_COMPAT
1442 if ( !is_hvm_vcpu(next) &&
1443 (is_idle_vcpu(prev) ||
1444 is_hvm_vcpu(prev) ||
1445 is_pv_32on64_vcpu(prev) != is_pv_32on64_vcpu(next)) )
1447 uint64_t efer = read_efer();
1448 if ( !(efer & EFER_SCE) )
1449 write_efer(efer | EFER_SCE);
1451 #endif
1453 /* Re-enable interrupts before restoring state which may fault. */
1454 local_irq_enable();
1456 if ( !is_hvm_vcpu(next) )
1458 load_LDT(next);
1459 load_segments(next);
1463 context_saved(prev);
1465 if (prev != next)
1466 update_runstate_area(next);
1468 schedule_tail(next);
1469 BUG();
1472 void continue_running(struct vcpu *same)
1474 schedule_tail(same);
1475 BUG();
1478 int __sync_lazy_execstate(void)
1480 unsigned long flags;
1481 int switch_required;
1483 local_irq_save(flags);
1485 switch_required = (this_cpu(curr_vcpu) != current);
1487 if ( switch_required )
1489 ASSERT(current == idle_vcpu[smp_processor_id()]);
1490 __context_switch();
1493 local_irq_restore(flags);
1495 return switch_required;
1498 void sync_vcpu_execstate(struct vcpu *v)
1500 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
1501 (void)__sync_lazy_execstate();
1503 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1504 flush_tlb_mask(&v->vcpu_dirty_cpumask);
1507 struct migrate_info {
1508 long (*func)(void *data);
1509 void *data;
1510 void (*saved_schedule_tail)(struct vcpu *);
1511 cpumask_t saved_affinity;
1512 unsigned int nest;
1513 };
1515 static void continue_hypercall_on_cpu_helper(struct vcpu *v)
1517 struct cpu_user_regs *regs = guest_cpu_user_regs();
1518 struct migrate_info *info = v->arch.continue_info;
1519 cpumask_t mask = info->saved_affinity;
1520 void (*saved_schedule_tail)(struct vcpu *) = info->saved_schedule_tail;
1522 regs->eax = info->func(info->data);
1524 if ( info->nest-- == 0 )
1526 xfree(info);
1527 v->arch.schedule_tail = saved_schedule_tail;
1528 v->arch.continue_info = NULL;
1529 vcpu_unlock_affinity(v, &mask);
1532 (*saved_schedule_tail)(v);
1535 int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
1537 struct vcpu *v = current;
1538 struct migrate_info *info;
1539 cpumask_t mask = cpumask_of_cpu(cpu);
1540 int rc;
1542 if ( cpu == smp_processor_id() )
1543 return func(data);
1545 info = v->arch.continue_info;
1546 if ( info == NULL )
1548 info = xmalloc(struct migrate_info);
1549 if ( info == NULL )
1550 return -ENOMEM;
1552 rc = vcpu_lock_affinity(v, &mask);
1553 if ( rc )
1555 xfree(info);
1556 return rc;
1559 info->saved_schedule_tail = v->arch.schedule_tail;
1560 info->saved_affinity = mask;
1561 info->nest = 0;
1563 v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
1564 v->arch.continue_info = info;
1566 else
1568 BUG_ON(info->nest != 0);
1569 rc = vcpu_locked_change_affinity(v, &mask);
1570 if ( rc )
1571 return rc;
1572 info->nest++;
1575 info->func = func;
1576 info->data = data;
1578 /* Dummy return value will be overwritten by new schedule_tail. */
1579 BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
1580 return 0;
1583 #define next_arg(fmt, args) ({ \
1584 unsigned long __arg; \
1585 switch ( *(fmt)++ ) \
1586 { \
1587 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
1588 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
1589 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
1590 default: __arg = 0; BUG(); \
1591 } \
1592 __arg; \
1593 })
1595 DEFINE_PER_CPU(char, hc_preempted);
1597 unsigned long hypercall_create_continuation(
1598 unsigned int op, const char *format, ...)
1600 struct mc_state *mcs = &this_cpu(mc_state);
1601 struct cpu_user_regs *regs;
1602 const char *p = format;
1603 unsigned long arg;
1604 unsigned int i;
1605 va_list args;
1607 va_start(args, format);
1609 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1611 __set_bit(_MCSF_call_preempted, &mcs->flags);
1613 for ( i = 0; *p != '\0'; i++ )
1614 mcs->call.args[i] = next_arg(p, args);
1615 if ( is_pv_32on64_domain(current->domain) )
1617 for ( ; i < 6; i++ )
1618 mcs->call.args[i] = 0;
1621 else
1623 regs = guest_cpu_user_regs();
1624 regs->eax = op;
1625 /*
1626 * For PV guest, we update EIP to re-execute 'syscall' / 'int 0x82';
1627 * HVM does not need this since 'vmcall' / 'vmmcall' is fault-like.
1628 */
1629 if ( !is_hvm_vcpu(current) )
1630 regs->eip -= 2; /* re-execute 'syscall' / 'int 0x82' */
1632 #ifdef __x86_64__
1633 if ( !is_hvm_vcpu(current) ?
1634 !is_pv_32on64_vcpu(current) :
1635 (hvm_guest_x86_mode(current) == 8) )
1637 for ( i = 0; *p != '\0'; i++ )
1639 arg = next_arg(p, args);
1640 switch ( i )
1642 case 0: regs->rdi = arg; break;
1643 case 1: regs->rsi = arg; break;
1644 case 2: regs->rdx = arg; break;
1645 case 3: regs->r10 = arg; break;
1646 case 4: regs->r8 = arg; break;
1647 case 5: regs->r9 = arg; break;
1651 else
1652 #endif
1654 if ( supervisor_mode_kernel )
1655 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
1657 for ( i = 0; *p != '\0'; i++ )
1659 arg = next_arg(p, args);
1660 switch ( i )
1662 case 0: regs->ebx = arg; break;
1663 case 1: regs->ecx = arg; break;
1664 case 2: regs->edx = arg; break;
1665 case 3: regs->esi = arg; break;
1666 case 4: regs->edi = arg; break;
1667 case 5: regs->ebp = arg; break;
1672 this_cpu(hc_preempted) = 1;
1675 va_end(args);
1677 return op;
1680 #ifdef CONFIG_COMPAT
1681 int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
1683 int rc = 0;
1684 struct mc_state *mcs = &this_cpu(mc_state);
1685 struct cpu_user_regs *regs;
1686 unsigned int i, cval = 0;
1687 unsigned long nval = 0;
1688 va_list args;
1690 BUG_ON(*id > 5);
1691 BUG_ON(mask & (1U << *id));
1693 va_start(args, mask);
1695 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1697 if ( !test_bit(_MCSF_call_preempted, &mcs->flags) )
1698 return 0;
1699 for ( i = 0; i < 6; ++i, mask >>= 1 )
1701 if ( mask & 1 )
1703 nval = va_arg(args, unsigned long);
1704 cval = va_arg(args, unsigned int);
1705 if ( cval == nval )
1706 mask &= ~1U;
1707 else
1708 BUG_ON(nval == (unsigned int)nval);
1710 else if ( id && *id == i )
1712 *id = mcs->call.args[i];
1713 id = NULL;
1715 if ( (mask & 1) && mcs->call.args[i] == nval )
1717 mcs->call.args[i] = cval;
1718 ++rc;
1720 else
1721 BUG_ON(mcs->call.args[i] != (unsigned int)mcs->call.args[i]);
1724 else
1726 regs = guest_cpu_user_regs();
1727 for ( i = 0; i < 6; ++i, mask >>= 1 )
1729 unsigned long *reg;
1731 switch ( i )
1733 case 0: reg = &regs->ebx; break;
1734 case 1: reg = &regs->ecx; break;
1735 case 2: reg = &regs->edx; break;
1736 case 3: reg = &regs->esi; break;
1737 case 4: reg = &regs->edi; break;
1738 case 5: reg = &regs->ebp; break;
1739 default: BUG(); reg = NULL; break;
1741 if ( (mask & 1) )
1743 nval = va_arg(args, unsigned long);
1744 cval = va_arg(args, unsigned int);
1745 if ( cval == nval )
1746 mask &= ~1U;
1747 else
1748 BUG_ON(nval == (unsigned int)nval);
1750 else if ( id && *id == i )
1752 *id = *reg;
1753 id = NULL;
1755 if ( (mask & 1) && *reg == nval )
1757 *reg = cval;
1758 ++rc;
1760 else
1761 BUG_ON(*reg != (unsigned int)*reg);
1765 va_end(args);
1767 return rc;
1769 #endif
1771 static int relinquish_memory(
1772 struct domain *d, struct page_list_head *list, unsigned long type)
1774 struct page_info *page;
1775 unsigned long x, y;
1776 int ret = 0;
1778 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1779 spin_lock_recursive(&d->page_alloc_lock);
1781 while ( (page = page_list_remove_head(list)) )
1783 /* Grab a reference to the page so it won't disappear from under us. */
1784 if ( unlikely(!get_page(page, d)) )
1786 /* Couldn't get a reference -- someone is freeing this page. */
1787 page_list_add_tail(page, &d->arch.relmem_list);
1788 continue;
1791 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1792 ret = put_page_and_type_preemptible(page, 1);
1793 switch ( ret )
1795 case 0:
1796 break;
1797 case -EAGAIN:
1798 case -EINTR:
1799 page_list_add(page, list);
1800 set_bit(_PGT_pinned, &page->u.inuse.type_info);
1801 put_page(page);
1802 goto out;
1803 default:
1804 BUG();
1807 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1808 put_page(page);
1810 /*
1811 * Forcibly invalidate top-most, still valid page tables at this point
1812 * to break circular 'linear page table' references as well as clean up
1813 * partially validated pages. This is okay because MMU structures are
1814 * not shared across domains and this domain is now dead. Thus top-most
1815 * valid tables are not in use so a non-zero count means circular
1816 * reference or partially validated.
1817 */
1818 y = page->u.inuse.type_info;
1819 for ( ; ; )
1821 x = y;
1822 if ( likely((x & PGT_type_mask) != type) ||
1823 likely(!(x & (PGT_validated|PGT_partial))) )
1824 break;
1826 y = cmpxchg(&page->u.inuse.type_info, x,
1827 x & ~(PGT_validated|PGT_partial));
1828 if ( likely(y == x) )
1830 /* No need for atomic update of type_info here: noone else updates it. */
1831 switch ( ret = free_page_type(page, x, 1) )
1833 case 0:
1834 break;
1835 case -EINTR:
1836 page_list_add(page, list);
1837 page->u.inuse.type_info |= PGT_validated;
1838 if ( x & PGT_partial )
1839 put_page(page);
1840 put_page(page);
1841 ret = -EAGAIN;
1842 goto out;
1843 case -EAGAIN:
1844 page_list_add(page, list);
1845 page->u.inuse.type_info |= PGT_partial;
1846 if ( x & PGT_partial )
1847 put_page(page);
1848 goto out;
1849 default:
1850 BUG();
1852 if ( x & PGT_partial )
1854 page->u.inuse.type_info--;
1855 put_page(page);
1857 break;
1861 /* Put the page on the list and /then/ potentially free it. */
1862 page_list_add_tail(page, &d->arch.relmem_list);
1863 put_page(page);
1865 if ( hypercall_preempt_check() )
1867 ret = -EAGAIN;
1868 goto out;
1872 /* list is empty at this point. */
1873 page_list_move(list, &d->arch.relmem_list);
1875 out:
1876 spin_unlock_recursive(&d->page_alloc_lock);
1877 return ret;
1880 static void vcpu_destroy_pagetables(struct vcpu *v)
1882 struct domain *d = v->domain;
1883 unsigned long pfn;
1885 #ifdef __x86_64__
1886 if ( is_pv_32on64_vcpu(v) )
1888 pfn = l4e_get_pfn(*(l4_pgentry_t *)
1889 __va(pagetable_get_paddr(v->arch.guest_table)));
1891 if ( pfn != 0 )
1893 if ( paging_mode_refcounts(d) )
1894 put_page(mfn_to_page(pfn));
1895 else
1896 put_page_and_type(mfn_to_page(pfn));
1899 l4e_write(
1900 (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
1901 l4e_empty());
1903 v->arch.cr3 = 0;
1904 return;
1906 #endif
1908 pfn = pagetable_get_pfn(v->arch.guest_table);
1909 if ( pfn != 0 )
1911 if ( paging_mode_refcounts(d) )
1912 put_page(mfn_to_page(pfn));
1913 else
1914 put_page_and_type(mfn_to_page(pfn));
1915 v->arch.guest_table = pagetable_null();
1918 #ifdef __x86_64__
1919 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
1920 pfn = pagetable_get_pfn(v->arch.guest_table_user);
1921 if ( pfn != 0 )
1923 if ( !is_pv_32bit_vcpu(v) )
1925 if ( paging_mode_refcounts(d) )
1926 put_page(mfn_to_page(pfn));
1927 else
1928 put_page_and_type(mfn_to_page(pfn));
1930 v->arch.guest_table_user = pagetable_null();
1932 #endif
1934 v->arch.cr3 = 0;
1937 int domain_relinquish_resources(struct domain *d)
1939 int ret;
1940 struct vcpu *v;
1942 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
1944 switch ( d->arch.relmem )
1946 case RELMEM_not_started:
1947 /* Tear down paging-assistance stuff. */
1948 paging_teardown(d);
1950 for_each_vcpu ( d, v )
1952 /* Drop the in-use references to page-table bases. */
1953 vcpu_destroy_pagetables(v);
1955 /*
1956 * Relinquish GDT mappings. No need for explicit unmapping of the
1957 * LDT as it automatically gets squashed with the guest mappings.
1958 */
1959 destroy_gdt(v);
1961 unmap_vcpu_info(v);
1964 if ( d->arch.pirq_eoi_map != NULL )
1966 unmap_domain_page_global(d->arch.pirq_eoi_map);
1967 put_page_and_type(mfn_to_page(d->arch.pirq_eoi_map_mfn));
1968 d->arch.pirq_eoi_map = NULL;
1971 d->arch.relmem = RELMEM_xen;
1972 /* fallthrough */
1974 /* Relinquish every page of memory. */
1975 case RELMEM_xen:
1976 ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
1977 if ( ret )
1978 return ret;
1979 #if CONFIG_PAGING_LEVELS >= 4
1980 d->arch.relmem = RELMEM_l4;
1981 /* fallthrough */
1983 case RELMEM_l4:
1984 ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1985 if ( ret )
1986 return ret;
1987 #endif
1988 #if CONFIG_PAGING_LEVELS >= 3
1989 d->arch.relmem = RELMEM_l3;
1990 /* fallthrough */
1992 case RELMEM_l3:
1993 ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
1994 if ( ret )
1995 return ret;
1996 #endif
1997 d->arch.relmem = RELMEM_l2;
1998 /* fallthrough */
2000 case RELMEM_l2:
2001 ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
2002 if ( ret )
2003 return ret;
2004 d->arch.relmem = RELMEM_done;
2005 /* fallthrough */
2007 case RELMEM_done:
2008 break;
2010 default:
2011 BUG();
2014 if ( is_hvm_domain(d) )
2015 hvm_domain_relinquish_resources(d);
2017 return 0;
2020 void arch_dump_domain_info(struct domain *d)
2022 paging_dump_domain_info(d);
2025 void arch_dump_vcpu_info(struct vcpu *v)
2027 paging_dump_vcpu_info(v);
2030 void domain_cpuid(
2031 struct domain *d,
2032 unsigned int input,
2033 unsigned int sub_input,
2034 unsigned int *eax,
2035 unsigned int *ebx,
2036 unsigned int *ecx,
2037 unsigned int *edx)
2039 cpuid_input_t *cpuid;
2040 int i;
2042 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
2044 cpuid = &d->arch.cpuids[i];
2046 if ( (cpuid->input[0] == input) &&
2047 ((cpuid->input[1] == XEN_CPUID_INPUT_UNUSED) ||
2048 (cpuid->input[1] == sub_input)) )
2050 *eax = cpuid->eax;
2051 *ebx = cpuid->ebx;
2052 *ecx = cpuid->ecx;
2053 *edx = cpuid->edx;
2055 /*
2056 * Do not advertise host's invariant TSC unless the TSC is
2057 * emulated, or the domain cannot migrate to other hosts.
2058 */
2059 if ( (input == 0x80000007) && /* Advanced Power Management */
2060 !d->disable_migrate && !d->arch.vtsc )
2061 *edx &= ~(1u<<8); /* TSC Invariant */
2063 return;
2067 *eax = *ebx = *ecx = *edx = 0;
2070 void vcpu_kick(struct vcpu *v)
2071 {
2072 /*
2073 * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
2074 * pending flag. These values may fluctuate (after all, we hold no
2075 * locks) but the key insight is that each change will cause
2076 * evtchn_upcall_pending to be polled.
2077 *
2078 * NB2. We save the running flag across the unblock to avoid a needless
2079 * IPI for domains that we IPI'd to unblock.
2080 */
2081 bool_t running = v->is_running;
2082 vcpu_unblock(v);
2083 if ( running && (in_irq() || (v != current)) )
2084 cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ);
2085 }
2087 void vcpu_mark_events_pending(struct vcpu *v)
2088 {
2089 int already_pending = test_and_set_bit(
2090 0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
2092 if ( already_pending )
2093 return;
2095 if ( is_hvm_vcpu(v) )
2096 hvm_assert_evtchn_irq(v);
2097 else
2098 vcpu_kick(v);
2099 }
2101 static void vcpu_kick_softirq(void)
2102 {
2103 /*
2104 * Nothing to do here: we merely prevent notifiers from racing with checks
2105 * executed on return to guest context with interrupts enabled. See, for
2106 * example, xxx_intr_assist() executed on return to HVM guest context.
2107 */
2108 }
2110 static int __init init_vcpu_kick_softirq(void)
2111 {
2112 open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
2113 return 0;
2114 }
2115 __initcall(init_vcpu_kick_softirq);
2118 /*
2119 * Local variables:
2120 * mode: C
2121 * c-set-style: "BSD"
2122 * c-basic-offset: 4
2123 * tab-width: 4
2124 * indent-tabs-mode: nil
2125 * End:
2126 */