
view xen/arch/x86/domain.c @ 19826:2f9e1348aa98

x86_64: allow more vCPU-s per guest

Since the shared info layout is fixed, guests are required to use
VCPUOP_register_vcpu_info prior to booting any vCPU beyond the
traditional limit of 32.

MAX_VIRT_CPUS, being an implementation detail of the hypervisor, is no
longer exposed in the public headers.
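
As background (not part of this changeset), here is a minimal guest-side
sketch of the registration that the VCPUOP_register_vcpu_info handler and
map_vcpu_info() further down in this file service. The hypercall wrapper
HYPERVISOR_vcpu_op(), virt_to_mfn(), PAGE_MASK and the per-vCPU allocation
are assumed to come from the guest's own Xen support code; the mfn/offset
fields match the public vcpu_register_vcpu_info interface used below.

    /*
     * Hypothetical guest-side helper: register a vcpu_info area for a
     * vCPU before bringing it online.  Mandatory for vCPUs >= 32, since
     * the fixed shared_info layout only carries the legacy slots.
     */
    static int register_vcpu_info_area(unsigned int vcpu, void *area)
    {
        struct vcpu_register_vcpu_info info = {
            .mfn    = virt_to_mfn(area),                /* frame holding the area  */
            .offset = (unsigned long)area & ~PAGE_MASK, /* byte offset in the page */
        };

        /* Xen rejects an area that would straddle a page boundary. */
        return HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, vcpu, &info);
    }

On the hypervisor side, map_vcpu_info() (below) validates the frame, maps
it globally, copies or initialises the vcpu_info contents, and marks all
event channels pending so no event is lost across the switch.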

The tools changes are clearly incomplete (and done only so things
would build again), and the current state of the tools (using scalar
variables all over the place to represent vCPU bitmaps) very likely
doesn't permit booting DomU-s with more than the traditional number of
vCPU-s. Testing of the extended functionality was done with Dom0 (96
vCPU-s, as well as 128 vCPU-s, out of which the kernel elected - by
way of a simple kernel-side patch - to use only some, resulting in a
sparse bitmap).

The ia64 changes are only there to make things build, and are
build-tested only (the tools part, too, only as far as the build would
go without encountering unrelated problems in the blktap code).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Jun 18 10:14:16 2009 +0100 (2009-06-18)
parents cecc76506afc
children 1c01814f9a25
line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/domain.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/softirq.h>
23 #include <xen/grant_table.h>
24 #include <xen/iocap.h>
25 #include <xen/kernel.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <xen/compat.h>
32 #include <xen/acpi.h>
33 #include <xen/pci.h>
34 #include <xen/paging.h>
35 #include <asm/regs.h>
36 #include <asm/mc146818rtc.h>
37 #include <asm/system.h>
38 #include <asm/io.h>
39 #include <asm/processor.h>
40 #include <asm/desc.h>
41 #include <asm/i387.h>
42 #include <asm/mpspec.h>
43 #include <asm/ldt.h>
44 #include <asm/hypercall.h>
45 #include <asm/hvm/hvm.h>
46 #include <asm/hvm/support.h>
47 #include <asm/debugreg.h>
48 #include <asm/msr.h>
49 #include <asm/traps.h>
50 #include <asm/nmi.h>
51 #include <xen/numa.h>
52 #include <xen/iommu.h>
53 #ifdef CONFIG_COMPAT
54 #include <compat/vcpu.h>
55 #endif
57 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
58 DEFINE_PER_CPU(u64, efer);
59 DEFINE_PER_CPU(unsigned long, cr4);
61 static void default_idle(void);
62 static void default_dead_idle(void);
63 void (*pm_idle) (void) = default_idle;
64 void (*dead_idle) (void) = default_dead_idle;
66 static void paravirt_ctxt_switch_from(struct vcpu *v);
67 static void paravirt_ctxt_switch_to(struct vcpu *v);
69 static void vcpu_destroy_pagetables(struct vcpu *v);
71 static void continue_idle_domain(struct vcpu *v)
72 {
73 reset_stack_and_jump(idle_loop);
74 }
76 static void continue_nonidle_domain(struct vcpu *v)
77 {
78 reset_stack_and_jump(ret_from_intr);
79 }
81 static void default_idle(void)
82 {
83 local_irq_disable();
84 if ( !softirq_pending(smp_processor_id()) )
85 safe_halt();
86 else
87 local_irq_enable();
88 }
90 static void default_dead_idle(void)
91 {
92 for ( ; ; )
93 halt();
94 }
96 static void play_dead(void)
97 {
98 /*
99 * Flush pending softirqs if any. They can be queued up before this CPU
100 * was taken out of cpu_online_map in __cpu_disable().
101 */
102 do_softirq();
104 /* This must be done before dead CPU ack */
105 cpu_exit_clear();
106 hvm_cpu_down();
107 wbinvd();
108 mb();
109 /* Ack it */
110 __get_cpu_var(cpu_state) = CPU_DEAD;
112 /* With physical CPU hotplug, we should halt the cpu. */
113 local_irq_disable();
114 (*dead_idle)();
115 }
117 void idle_loop(void)
118 {
119 for ( ; ; )
120 {
121 if ( cpu_is_offline(smp_processor_id()) )
122 play_dead();
123 page_scrub_schedule_work();
124 (*pm_idle)();
125 do_softirq();
126 }
127 }
129 void startup_cpu_idle_loop(void)
130 {
131 struct vcpu *v = current;
133 ASSERT(is_idle_vcpu(v));
134 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
135 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
137 reset_stack_and_jump(idle_loop);
138 }
140 void dump_pageframe_info(struct domain *d)
141 {
142 struct page_info *page;
144 printk("Memory pages belonging to domain %u:\n", d->domain_id);
146 if ( d->tot_pages >= 10 )
147 {
148 printk(" DomPage list too long to display\n");
149 }
150 else
151 {
152 page_list_for_each ( page, &d->page_list )
153 {
154 printk(" DomPage %p: caf=%08lx, taf=%" PRtype_info "\n",
155 _p(page_to_mfn(page)),
156 page->count_info, page->u.inuse.type_info);
157 }
158 }
160 if ( is_hvm_domain(d) )
161 {
162 p2m_pod_dump_data(d);
163 }
165 page_list_for_each ( page, &d->xenpage_list )
166 {
167 printk(" XenPage %p: caf=%08lx, taf=%" PRtype_info "\n",
168 _p(page_to_mfn(page)),
169 page->count_info, page->u.inuse.type_info);
170 }
171 }
173 struct domain *alloc_domain_struct(void)
174 {
175 struct domain *d;
176 /*
177 * We pack the MFN of the domain structure into a 32-bit field within
178 * the page_info structure. Hence the MEMF_bits() restriction.
179 */
180 d = alloc_xenheap_pages(
181 get_order_from_bytes(sizeof(*d)), MEMF_bits(32 + PAGE_SHIFT));
182 if ( d != NULL )
183 memset(d, 0, sizeof(*d));
184 return d;
185 }
187 void free_domain_struct(struct domain *d)
188 {
189 free_xenheap_pages(d, get_order_from_bytes(sizeof(*d)));
190 }
192 struct vcpu *alloc_vcpu_struct(void)
193 {
194 struct vcpu *v;
195 /*
196 * This structure contains embedded PAE PDPTEs, used when an HVM guest
197 * runs on shadow pagetables outside of 64-bit mode. In this case the CPU
198 * may require that the shadow CR3 points below 4GB, and hence the whole
199 * structure must satisfy this restriction. Thus we specify MEMF_bits(32).
200 */
201 v = alloc_xenheap_pages(get_order_from_bytes(sizeof(*v)), MEMF_bits(32));
202 if ( v != NULL )
203 memset(v, 0, sizeof(*v));
204 return v;
205 }
207 void free_vcpu_struct(struct vcpu *v)
208 {
209 free_xenheap_pages(v, get_order_from_bytes(sizeof(*v)));
210 }
212 #ifdef CONFIG_COMPAT
214 static int setup_compat_l4(struct vcpu *v)
215 {
216 struct page_info *pg;
217 l4_pgentry_t *l4tab;
219 pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v)));
220 if ( pg == NULL )
221 return -ENOMEM;
223 /* This page needs to look like a pagetable so that it can be shadowed */
224 pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
226 l4tab = page_to_virt(pg);
227 copy_page(l4tab, idle_pg_table);
228 l4tab[0] = l4e_empty();
229 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
230 l4e_from_page(pg, __PAGE_HYPERVISOR);
231 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
232 l4e_from_paddr(__pa(v->domain->arch.mm_perdomain_l3),
233 __PAGE_HYPERVISOR);
235 v->arch.guest_table = pagetable_from_page(pg);
236 v->arch.guest_table_user = v->arch.guest_table;
238 return 0;
239 }
241 static void release_compat_l4(struct vcpu *v)
242 {
243 free_domheap_page(pagetable_get_page(v->arch.guest_table));
244 v->arch.guest_table = pagetable_null();
245 v->arch.guest_table_user = pagetable_null();
246 }
248 static inline int may_switch_mode(struct domain *d)
249 {
250 return (!is_hvm_domain(d) && (d->tot_pages == 0));
251 }
253 int switch_native(struct domain *d)
254 {
255 unsigned int vcpuid;
257 if ( d == NULL )
258 return -EINVAL;
259 if ( !may_switch_mode(d) )
260 return -EACCES;
261 if ( !is_pv_32on64_domain(d) )
262 return 0;
264 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
266 for ( vcpuid = 0; vcpuid < d->max_vcpus; vcpuid++ )
267 {
268 if (d->vcpu[vcpuid])
269 release_compat_l4(d->vcpu[vcpuid]);
270 }
272 return 0;
273 }
275 int switch_compat(struct domain *d)
276 {
277 unsigned int vcpuid;
279 if ( d == NULL )
280 return -EINVAL;
281 if ( !may_switch_mode(d) )
282 return -EACCES;
283 if ( is_pv_32on64_domain(d) )
284 return 0;
286 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
288 for ( vcpuid = 0; vcpuid < d->max_vcpus; vcpuid++ )
289 {
290 if ( (d->vcpu[vcpuid] != NULL) &&
291 (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
292 goto undo_and_fail;
293 }
295 domain_set_alloc_bitsize(d);
297 return 0;
299 undo_and_fail:
300 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
301 while ( vcpuid-- != 0 )
302 {
303 if ( d->vcpu[vcpuid] != NULL )
304 release_compat_l4(d->vcpu[vcpuid]);
305 }
306 return -ENOMEM;
307 }
309 #else
310 #define setup_compat_l4(v) 0
311 #define release_compat_l4(v) ((void)0)
312 #endif
314 int vcpu_initialise(struct vcpu *v)
315 {
316 struct domain *d = v->domain;
317 int rc;
319 v->arch.vcpu_info_mfn = INVALID_MFN;
321 v->arch.flags = TF_kernel_mode;
323 #if defined(__i386__)
324 mapcache_vcpu_init(v);
325 #else
326 {
327 unsigned int idx = perdomain_pt_pgidx(v);
328 struct page_info *pg;
330 if ( !perdomain_pt_page(d, idx) )
331 {
332 pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v)));
333 if ( !pg )
334 return -ENOMEM;
335 clear_page(page_to_virt(pg));
336 perdomain_pt_page(d, idx) = pg;
337 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+idx]
338 = l2e_from_page(pg, __PAGE_HYPERVISOR);
339 }
340 }
341 #endif
343 pae_l3_cache_init(&v->arch.pae_l3_cache);
345 paging_vcpu_init(v);
347 if ( is_hvm_domain(d) )
348 {
349 if ( (rc = hvm_vcpu_initialise(v)) != 0 )
350 return rc;
351 }
352 else
353 {
354 /* PV guests by default have a 100Hz ticker. */
355 if ( !is_idle_domain(d) )
356 v->periodic_period = MILLISECS(10);
358 /* PV guests get an emulated PIT too for video BIOSes to use. */
359 if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
360 pit_init(v, cpu_khz);
362 v->arch.schedule_tail = continue_nonidle_domain;
363 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
364 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
366 if ( is_idle_domain(d) )
367 {
368 v->arch.schedule_tail = continue_idle_domain;
369 v->arch.cr3 = __pa(idle_pg_table);
370 }
372 v->arch.guest_context.ctrlreg[4] =
373 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
374 }
376 v->arch.perdomain_ptes = perdomain_ptes(d, v);
378 spin_lock_init(&v->arch.shadow_ldt_lock);
380 return (is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0);
381 }
383 void vcpu_destroy(struct vcpu *v)
384 {
385 if ( is_pv_32on64_vcpu(v) )
386 release_compat_l4(v);
388 if ( is_hvm_vcpu(v) )
389 hvm_vcpu_destroy(v);
390 }
392 int arch_domain_create(struct domain *d, unsigned int domcr_flags)
393 {
394 #ifdef __x86_64__
395 struct page_info *pg;
396 #else
397 int pdpt_order;
398 #endif
399 int i, paging_initialised = 0;
400 int rc = -ENOMEM;
402 d->arch.hvm_domain.hap_enabled =
403 is_hvm_domain(d) &&
404 hvm_funcs.hap_supported &&
405 (domcr_flags & DOMCRF_hap);
407 d->arch.s3_integrity = !!(domcr_flags & DOMCRF_s3_integrity);
409 INIT_LIST_HEAD(&d->arch.pdev_list);
411 d->arch.relmem = RELMEM_not_started;
412 INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
414 #if defined(__i386__)
416 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
417 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order, 0);
418 if ( d->arch.mm_perdomain_pt == NULL )
419 goto fail;
420 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
422 mapcache_domain_init(d);
424 #else /* __x86_64__ */
426 BUILD_BUG_ON(PDPT_L2_ENTRIES * sizeof(*d->arch.mm_perdomain_pt_pages)
427 != PAGE_SIZE);
428 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
429 if ( !pg )
430 goto fail;
431 d->arch.mm_perdomain_pt_pages = page_to_virt(pg);
432 clear_page(d->arch.mm_perdomain_pt_pages);
434 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
435 if ( pg == NULL )
436 goto fail;
437 d->arch.mm_perdomain_l2 = page_to_virt(pg);
438 clear_page(d->arch.mm_perdomain_l2);
440 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
441 if ( pg == NULL )
442 goto fail;
443 d->arch.mm_perdomain_l3 = page_to_virt(pg);
444 clear_page(d->arch.mm_perdomain_l3);
445 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
446 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
447 __PAGE_HYPERVISOR);
449 #endif /* __x86_64__ */
451 #ifdef CONFIG_COMPAT
452 HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START;
453 #endif
455 if ( (rc = paging_domain_init(d)) != 0 )
456 goto fail;
457 paging_initialised = 1;
459 if ( !is_idle_domain(d) )
460 {
461 d->arch.ioport_caps =
462 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
463 rc = -ENOMEM;
464 if ( d->arch.ioport_caps == NULL )
465 goto fail;
467 /*
468 * The shared_info machine address must fit in a 32-bit field within a
469 * 32-bit guest's start_info structure. Hence we specify MEMF_bits(32).
470 */
471 if ( (d->shared_info = alloc_xenheap_pages(0, MEMF_bits(32))) == NULL )
472 goto fail;
474 clear_page(d->shared_info);
475 share_xen_page_with_guest(
476 virt_to_page(d->shared_info), d, XENSHARE_writable);
478 d->arch.pirq_vector = xmalloc_array(s16, d->nr_pirqs);
479 if ( !d->arch.pirq_vector )
480 goto fail;
481 memset(d->arch.pirq_vector, 0,
482 d->nr_pirqs * sizeof(*d->arch.pirq_vector));
484 if ( (rc = iommu_domain_init(d)) != 0 )
485 goto fail;
487 /* For Guest vMCE MSRs virtualization */
488 if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
489 intel_mce_init_msr(d);
490 }
492 if ( is_hvm_domain(d) )
493 {
494 if ( (rc = hvm_domain_initialise(d)) != 0 )
495 {
496 iommu_domain_destroy(d);
497 goto fail;
498 }
499 }
500 else
501 {
502 /* 32-bit PV guest by default only if Xen is not 64-bit. */
503 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
504 (CONFIG_PAGING_LEVELS != 4);
505 }
507 memset(d->arch.cpuids, 0, sizeof(d->arch.cpuids));
508 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
509 {
510 d->arch.cpuids[i].input[0] = XEN_CPUID_INPUT_UNUSED;
511 d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
512 }
514 return 0;
516 fail:
517 d->is_dying = DOMDYING_dead;
518 xfree(d->arch.pirq_vector);
519 free_xenheap_page(d->shared_info);
520 if ( paging_initialised )
521 paging_final_teardown(d);
522 #ifdef __x86_64__
523 if ( d->arch.mm_perdomain_l2 )
524 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
525 if ( d->arch.mm_perdomain_l3 )
526 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
527 if ( d->arch.mm_perdomain_pt_pages )
528 free_domheap_page(virt_to_page(d->arch.mm_perdomain_pt_pages));
529 #else
530 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
531 #endif
532 return rc;
533 }
535 void arch_domain_destroy(struct domain *d)
536 {
537 #ifdef __x86_64__
538 unsigned int i;
539 #endif
541 if ( is_hvm_domain(d) )
542 hvm_domain_destroy(d);
544 pci_release_devices(d);
545 free_domain_pirqs(d);
546 if ( !is_idle_domain(d) )
547 iommu_domain_destroy(d);
549 paging_final_teardown(d);
551 #ifdef __i386__
552 free_xenheap_pages(
553 d->arch.mm_perdomain_pt,
554 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
555 #else
556 for ( i = 0; i < PDPT_L2_ENTRIES; ++i )
557 {
558 if ( perdomain_pt_page(d, i) )
559 free_domheap_page(perdomain_pt_page(d, i));
560 }
561 free_domheap_page(virt_to_page(d->arch.mm_perdomain_pt_pages));
562 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
563 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
564 #endif
566 free_xenheap_page(d->shared_info);
567 xfree(d->arch.pirq_vector);
568 }
570 unsigned long pv_guest_cr4_fixup(unsigned long guest_cr4)
571 {
572 unsigned long hv_cr4_mask, hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4());
574 hv_cr4_mask = ~X86_CR4_TSD;
575 if ( cpu_has_de )
576 hv_cr4_mask &= ~X86_CR4_DE;
578 if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) )
579 gdprintk(XENLOG_WARNING,
580 "Attempt to change CR4 flags %08lx -> %08lx\n",
581 hv_cr4, guest_cr4);
583 return (hv_cr4 & hv_cr4_mask) | (guest_cr4 & ~hv_cr4_mask);
584 }
586 /* This is called by arch_final_setup_guest and do_boot_vcpu */
587 int arch_set_info_guest(
588 struct vcpu *v, vcpu_guest_context_u c)
589 {
590 struct domain *d = v->domain;
591 unsigned long cr3_pfn = INVALID_MFN;
592 unsigned long flags, cr4;
593 int i, rc = 0, compat;
595 /* The context is a compat-mode one if the target domain is compat-mode;
596 * we expect the tools to DTRT even in compat-mode callers. */
597 compat = is_pv_32on64_domain(d);
599 #ifdef CONFIG_COMPAT
600 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
601 #else
602 #define c(fld) (c.nat->fld)
603 #endif
604 flags = c(flags);
606 if ( !is_hvm_vcpu(v) )
607 {
608 if ( !compat )
609 {
610 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
611 fixup_guest_stack_selector(d, c.nat->kernel_ss);
612 fixup_guest_code_selector(d, c.nat->user_regs.cs);
613 #ifdef __i386__
614 fixup_guest_code_selector(d, c.nat->event_callback_cs);
615 fixup_guest_code_selector(d, c.nat->failsafe_callback_cs);
616 #endif
618 for ( i = 0; i < 256; i++ )
619 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
621 /* LDT safety checks. */
622 if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) ||
623 (c.nat->ldt_ents > 8192) ||
624 !array_access_ok(c.nat->ldt_base,
625 c.nat->ldt_ents,
626 LDT_ENTRY_SIZE) )
627 return -EINVAL;
628 }
629 #ifdef CONFIG_COMPAT
630 else
631 {
632 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
633 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
634 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
635 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
636 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
638 for ( i = 0; i < 256; i++ )
639 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
641 /* LDT safety checks. */
642 if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) ||
643 (c.cmp->ldt_ents > 8192) ||
644 !compat_array_access_ok(c.cmp->ldt_base,
645 c.cmp->ldt_ents,
646 LDT_ENTRY_SIZE) )
647 return -EINVAL;
648 }
649 #endif
650 }
652 v->fpu_initialised = !!(flags & VGCF_I387_VALID);
654 v->arch.flags &= ~TF_kernel_mode;
655 if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
656 v->arch.flags |= TF_kernel_mode;
658 if ( !compat )
659 memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat));
660 #ifdef CONFIG_COMPAT
661 else
662 XLAT_vcpu_guest_context(&v->arch.guest_context, c.cmp);
663 #endif
665 v->arch.guest_context.user_regs.eflags |= 2;
667 if ( is_hvm_vcpu(v) )
668 {
669 hvm_set_info_guest(v);
670 goto out;
671 }
673 /* Only CR0.TS is modifiable by guest or admin. */
674 v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
675 v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
677 init_int80_direct_trap(v);
679 /* IOPL privileges are virtualised. */
680 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
681 v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
683 /* Ensure real hardware interrupts are enabled. */
684 v->arch.guest_context.user_regs.eflags |= EF_IE;
686 cr4 = v->arch.guest_context.ctrlreg[4];
687 v->arch.guest_context.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(cr4) :
688 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
690 memset(v->arch.guest_context.debugreg, 0,
691 sizeof(v->arch.guest_context.debugreg));
692 for ( i = 0; i < 8; i++ )
693 (void)set_debugreg(v, i, c(debugreg[i]));
695 if ( v->is_initialised )
696 goto out;
698 if ( v->vcpu_id == 0 )
699 d->vm_assist = c(vm_assist);
701 if ( !compat )
702 rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
703 #ifdef CONFIG_COMPAT
704 else
705 {
706 unsigned long gdt_frames[ARRAY_SIZE(c.cmp->gdt_frames)];
707 unsigned int i, n = (c.cmp->gdt_ents + 511) / 512;
709 if ( n > ARRAY_SIZE(c.cmp->gdt_frames) )
710 return -EINVAL;
711 for ( i = 0; i < n; ++i )
712 gdt_frames[i] = c.cmp->gdt_frames[i];
713 rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents);
714 }
715 #endif
716 if ( rc != 0 )
717 return rc;
719 if ( !compat )
720 {
721 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[3]));
723 if ( !mfn_valid(cr3_pfn) ||
724 (paging_mode_refcounts(d)
725 ? !get_page(mfn_to_page(cr3_pfn), d)
726 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
727 PGT_base_page_table)) )
728 {
729 destroy_gdt(v);
730 return -EINVAL;
731 }
733 v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
735 #ifdef __x86_64__
736 if ( c.nat->ctrlreg[1] )
737 {
738 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[1]));
740 if ( !mfn_valid(cr3_pfn) ||
741 (paging_mode_refcounts(d)
742 ? !get_page(mfn_to_page(cr3_pfn), d)
743 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
744 PGT_base_page_table)) )
745 {
746 cr3_pfn = pagetable_get_pfn(v->arch.guest_table);
747 v->arch.guest_table = pagetable_null();
748 if ( paging_mode_refcounts(d) )
749 put_page(mfn_to_page(cr3_pfn));
750 else
751 put_page_and_type(mfn_to_page(cr3_pfn));
752 destroy_gdt(v);
753 return -EINVAL;
754 }
756 v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn);
757 }
758 #endif
759 }
760 #ifdef CONFIG_COMPAT
761 else
762 {
763 l4_pgentry_t *l4tab;
765 cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
767 if ( !mfn_valid(cr3_pfn) ||
768 (paging_mode_refcounts(d)
769 ? !get_page(mfn_to_page(cr3_pfn), d)
770 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
771 PGT_l3_page_table)) )
772 {
773 destroy_gdt(v);
774 return -EINVAL;
775 }
777 l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
778 *l4tab = l4e_from_pfn(
779 cr3_pfn, _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
780 }
781 #endif
783 if ( v->vcpu_id == 0 )
784 update_domain_wallclock_time(d);
786 /* Don't redo final setup */
787 v->is_initialised = 1;
789 if ( paging_mode_enabled(d) )
790 paging_update_paging_modes(v);
792 update_cr3(v);
794 out:
795 if ( flags & VGCF_online )
796 clear_bit(_VPF_down, &v->pause_flags);
797 else
798 set_bit(_VPF_down, &v->pause_flags);
799 return 0;
800 #undef c
801 }
803 void arch_vcpu_reset(struct vcpu *v)
804 {
805 if ( !is_hvm_vcpu(v) )
806 {
807 destroy_gdt(v);
808 vcpu_destroy_pagetables(v);
809 }
810 else
811 {
812 vcpu_end_shutdown_deferral(v);
813 }
814 }
816 /*
817 * Unmap the vcpu info page if the guest decided to place it somewhere
818 * else. This is only used from arch_domain_destroy, so there's no
819 * need to do anything clever.
820 */
821 static void
822 unmap_vcpu_info(struct vcpu *v)
823 {
824 struct domain *d = v->domain;
825 unsigned long mfn;
827 if ( v->arch.vcpu_info_mfn == INVALID_MFN )
828 return;
830 mfn = v->arch.vcpu_info_mfn;
831 unmap_domain_page_global(v->vcpu_info);
833 v->vcpu_info = (void *)&shared_info(d, vcpu_info[v->vcpu_id]);
834 v->arch.vcpu_info_mfn = INVALID_MFN;
836 put_page_and_type(mfn_to_page(mfn));
837 }
839 /*
840 * Map a guest page in and point the vcpu_info pointer at it. This
841 * makes sure that the vcpu_info is always pointing at a valid piece
842 * of memory, and it sets a pending event to make sure that a pending
843 * event doesn't get missed.
844 */
845 static int
846 map_vcpu_info(struct vcpu *v, unsigned long mfn, unsigned offset)
847 {
848 struct domain *d = v->domain;
849 void *mapping;
850 vcpu_info_t *new_info;
851 int i;
853 if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) )
854 return -EINVAL;
856 if ( v->arch.vcpu_info_mfn != INVALID_MFN )
857 return -EINVAL;
859 /* Run this command on yourself or on other offline VCPUS. */
860 if ( (v != current) && !test_bit(_VPF_down, &v->pause_flags) )
861 return -EINVAL;
863 mfn = gmfn_to_mfn(d, mfn);
864 if ( !mfn_valid(mfn) ||
865 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
866 return -EINVAL;
868 mapping = map_domain_page_global(mfn);
869 if ( mapping == NULL )
870 {
871 put_page_and_type(mfn_to_page(mfn));
872 return -ENOMEM;
873 }
875 new_info = (vcpu_info_t *)(mapping + offset);
877 if ( v->vcpu_info )
878 memcpy(new_info, v->vcpu_info, sizeof(*new_info));
879 else
880 {
881 memset(new_info, 0, sizeof(*new_info));
882 __vcpu_info(v, new_info, evtchn_upcall_mask) = 1;
883 }
885 v->vcpu_info = new_info;
886 v->arch.vcpu_info_mfn = mfn;
888 /* Set new vcpu_info pointer /before/ setting pending flags. */
889 wmb();
891 /*
892 * Mark everything as being pending just to make sure nothing gets
893 * lost. The domain will get a spurious event, but it can cope.
894 */
895 vcpu_info(v, evtchn_upcall_pending) = 1;
896 for ( i = 0; i < BITS_PER_EVTCHN_WORD(d); i++ )
897 set_bit(i, &vcpu_info(v, evtchn_pending_sel));
899 return 0;
900 }
902 long
903 arch_do_vcpu_op(
904 int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
905 {
906 long rc = 0;
908 switch ( cmd )
909 {
910 case VCPUOP_register_runstate_memory_area:
911 {
912 struct vcpu_register_runstate_memory_area area;
913 struct vcpu_runstate_info runstate;
915 rc = -EFAULT;
916 if ( copy_from_guest(&area, arg, 1) )
917 break;
919 if ( !guest_handle_okay(area.addr.h, 1) )
920 break;
922 rc = 0;
923 runstate_guest(v) = area.addr.h;
925 if ( v == current )
926 {
927 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
928 }
929 else
930 {
931 vcpu_runstate_get(v, &runstate);
932 __copy_to_guest(runstate_guest(v), &runstate, 1);
933 }
935 break;
936 }
938 case VCPUOP_register_vcpu_info:
939 {
940 struct domain *d = v->domain;
941 struct vcpu_register_vcpu_info info;
943 rc = -EFAULT;
944 if ( copy_from_guest(&info, arg, 1) )
945 break;
947 domain_lock(d);
948 rc = map_vcpu_info(v, info.mfn, info.offset);
949 domain_unlock(d);
951 break;
952 }
954 case VCPUOP_get_physid:
955 {
956 struct vcpu_get_physid cpu_id;
958 rc = -EINVAL;
959 if ( !v->domain->is_pinned )
960 break;
962 cpu_id.phys_id =
963 (uint64_t)x86_cpu_to_apicid[v->vcpu_id] |
964 ((uint64_t)acpi_get_processor_id(v->vcpu_id) << 32);
966 rc = -EFAULT;
967 if ( copy_to_guest(arg, &cpu_id, 1) )
968 break;
970 rc = 0;
971 break;
972 }
974 default:
975 rc = -ENOSYS;
976 break;
977 }
979 return rc;
980 }
982 #ifdef __x86_64__
984 #define loadsegment(seg,value) ({ \
985 int __r = 1; \
986 asm volatile ( \
987 "1: movl %k1,%%" #seg "\n2:\n" \
988 ".section .fixup,\"ax\"\n" \
989 "3: xorl %k0,%k0\n" \
990 " movl %k0,%%" #seg "\n" \
991 " jmp 2b\n" \
992 ".previous\n" \
993 ".section __ex_table,\"a\"\n" \
994 " .align 8\n" \
995 " .quad 1b,3b\n" \
996 ".previous" \
997 : "=r" (__r) : "r" (value), "0" (__r) );\
998 __r; })
1000 /*
1001 * save_segments() writes a mask of segments which are dirty (non-zero),
1002 * allowing load_segments() to avoid some expensive segment loads and
1003 * MSR writes.
1004 */
1005 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
1006 #define DIRTY_DS 0x01
1007 #define DIRTY_ES 0x02
1008 #define DIRTY_FS 0x04
1009 #define DIRTY_GS 0x08
1010 #define DIRTY_FS_BASE 0x10
1011 #define DIRTY_GS_BASE_USER 0x20
1013 static void load_segments(struct vcpu *n)
1015 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
1016 int all_segs_okay = 1;
1017 unsigned int dirty_segment_mask, cpu = smp_processor_id();
1019 /* Load and clear the dirty segment mask. */
1020 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
1021 per_cpu(dirty_segment_mask, cpu) = 0;
1023 /* Either selector != 0 ==> reload. */
1024 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
1025 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
1027 /* Either selector != 0 ==> reload. */
1028 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
1029 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
1031 /*
1032 * Either selector != 0 ==> reload.
1033 * Also reload to reset FS_BASE if it was non-zero.
1034 */
1035 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
1036 nctxt->user_regs.fs) )
1037 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
1039 /*
1040 * Either selector != 0 ==> reload.
1041 * Also reload to reset GS_BASE if it was non-zero.
1042 */
1043 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
1044 nctxt->user_regs.gs) )
1046 /* Reset GS_BASE with user %gs? */
1047 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
1048 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
1051 if ( !is_pv_32on64_domain(n->domain) )
1053 /* This can only be non-zero if selector is NULL. */
1054 if ( nctxt->fs_base )
1055 wrmsr(MSR_FS_BASE,
1056 nctxt->fs_base,
1057 nctxt->fs_base>>32);
1059 /* Most kernels have non-zero GS base, so don't bother testing. */
1060 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
1061 wrmsr(MSR_SHADOW_GS_BASE,
1062 nctxt->gs_base_kernel,
1063 nctxt->gs_base_kernel>>32);
1065 /* This can only be non-zero if selector is NULL. */
1066 if ( nctxt->gs_base_user )
1067 wrmsr(MSR_GS_BASE,
1068 nctxt->gs_base_user,
1069 nctxt->gs_base_user>>32);
1071 /* If in kernel mode then switch the GS bases around. */
1072 if ( (n->arch.flags & TF_kernel_mode) )
1073 asm volatile ( "swapgs" );
1076 if ( unlikely(!all_segs_okay) )
1078 struct cpu_user_regs *regs = guest_cpu_user_regs();
1079 unsigned long *rsp =
1080 (n->arch.flags & TF_kernel_mode) ?
1081 (unsigned long *)regs->rsp :
1082 (unsigned long *)nctxt->kernel_sp;
1083 unsigned long cs_and_mask, rflags;
1085 if ( is_pv_32on64_domain(n->domain) )
1087 unsigned int *esp = ring_1(regs) ?
1088 (unsigned int *)regs->rsp :
1089 (unsigned int *)nctxt->kernel_sp;
1090 unsigned int cs_and_mask, eflags;
1091 int ret = 0;
1093 /* CS longword also contains full evtchn_upcall_mask. */
1094 cs_and_mask = (unsigned short)regs->cs |
1095 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
1096 /* Fold upcall mask into RFLAGS.IF. */
1097 eflags = regs->_eflags & ~X86_EFLAGS_IF;
1098 eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1100 if ( !ring_1(regs) )
1102 ret = put_user(regs->ss, esp-1);
1103 ret |= put_user(regs->_esp, esp-2);
1104 esp -= 2;
1107 if ( ret |
1108 put_user(eflags, esp-1) |
1109 put_user(cs_and_mask, esp-2) |
1110 put_user(regs->_eip, esp-3) |
1111 put_user(nctxt->user_regs.gs, esp-4) |
1112 put_user(nctxt->user_regs.fs, esp-5) |
1113 put_user(nctxt->user_regs.es, esp-6) |
1114 put_user(nctxt->user_regs.ds, esp-7) )
1116 gdprintk(XENLOG_ERR, "Error while creating compat "
1117 "failsafe callback frame.\n");
1118 domain_crash(n->domain);
1121 if ( test_bit(_VGCF_failsafe_disables_events,
1122 &n->arch.guest_context.flags) )
1123 vcpu_info(n, evtchn_upcall_mask) = 1;
1125 regs->entry_vector = TRAP_syscall;
1126 regs->_eflags &= 0xFFFCBEFFUL;
1127 regs->ss = FLAT_COMPAT_KERNEL_SS;
1128 regs->_esp = (unsigned long)(esp-7);
1129 regs->cs = FLAT_COMPAT_KERNEL_CS;
1130 regs->_eip = nctxt->failsafe_callback_eip;
1131 return;
1134 if ( !(n->arch.flags & TF_kernel_mode) )
1135 toggle_guest_mode(n);
1136 else
1137 regs->cs &= ~3;
1139 /* CS longword also contains full evtchn_upcall_mask. */
1140 cs_and_mask = (unsigned long)regs->cs |
1141 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1143 /* Fold upcall mask into RFLAGS.IF. */
1144 rflags = regs->rflags & ~X86_EFLAGS_IF;
1145 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1147 if ( put_user(regs->ss, rsp- 1) |
1148 put_user(regs->rsp, rsp- 2) |
1149 put_user(rflags, rsp- 3) |
1150 put_user(cs_and_mask, rsp- 4) |
1151 put_user(regs->rip, rsp- 5) |
1152 put_user(nctxt->user_regs.gs, rsp- 6) |
1153 put_user(nctxt->user_regs.fs, rsp- 7) |
1154 put_user(nctxt->user_regs.es, rsp- 8) |
1155 put_user(nctxt->user_regs.ds, rsp- 9) |
1156 put_user(regs->r11, rsp-10) |
1157 put_user(regs->rcx, rsp-11) )
1159 gdprintk(XENLOG_ERR, "Error while creating failsafe "
1160 "callback frame.\n");
1161 domain_crash(n->domain);
1164 if ( test_bit(_VGCF_failsafe_disables_events,
1165 &n->arch.guest_context.flags) )
1166 vcpu_info(n, evtchn_upcall_mask) = 1;
1168 regs->entry_vector = TRAP_syscall;
1169 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1170 X86_EFLAGS_NT|X86_EFLAGS_TF);
1171 regs->ss = FLAT_KERNEL_SS;
1172 regs->rsp = (unsigned long)(rsp-11);
1173 regs->cs = FLAT_KERNEL_CS;
1174 regs->rip = nctxt->failsafe_callback_eip;
1178 static void save_segments(struct vcpu *v)
1180 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
1181 struct cpu_user_regs *regs = &ctxt->user_regs;
1182 unsigned int dirty_segment_mask = 0;
1184 regs->ds = read_segment_register(ds);
1185 regs->es = read_segment_register(es);
1186 regs->fs = read_segment_register(fs);
1187 regs->gs = read_segment_register(gs);
1189 if ( regs->ds )
1190 dirty_segment_mask |= DIRTY_DS;
1192 if ( regs->es )
1193 dirty_segment_mask |= DIRTY_ES;
1195 if ( regs->fs || is_pv_32on64_domain(v->domain) )
1197 dirty_segment_mask |= DIRTY_FS;
1198 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
1200 else if ( ctxt->fs_base )
1202 dirty_segment_mask |= DIRTY_FS_BASE;
1205 if ( regs->gs || is_pv_32on64_domain(v->domain) )
1207 dirty_segment_mask |= DIRTY_GS;
1208 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
1210 else if ( ctxt->gs_base_user )
1212 dirty_segment_mask |= DIRTY_GS_BASE_USER;
1215 this_cpu(dirty_segment_mask) = dirty_segment_mask;
1218 #define switch_kernel_stack(v) ((void)0)
1220 #elif defined(__i386__)
1222 #define load_segments(n) ((void)0)
1223 #define save_segments(p) ((void)0)
1225 static inline void switch_kernel_stack(struct vcpu *v)
1227 struct tss_struct *tss = &init_tss[smp_processor_id()];
1228 tss->esp1 = v->arch.guest_context.kernel_sp;
1229 tss->ss1 = v->arch.guest_context.kernel_ss;
1232 #endif /* __i386__ */
1234 static void paravirt_ctxt_switch_from(struct vcpu *v)
1236 save_segments(v);
1238 /*
1239 * Disable debug breakpoints. We do this aggressively because if we switch
1240 * to an HVM guest we may load DR0-DR3 with values that can cause #DE
1241 * inside Xen, before we get a chance to reload DR7, and this cannot always
1242 * safely be handled.
1243 */
1244 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1245 write_debugreg(7, 0);
1248 static void paravirt_ctxt_switch_to(struct vcpu *v)
1250 unsigned long cr4;
1252 set_int80_direct_trap(v);
1253 switch_kernel_stack(v);
1255 cr4 = pv_guest_cr4_to_real_cr4(v->arch.guest_context.ctrlreg[4]);
1256 if ( unlikely(cr4 != read_cr4()) )
1257 write_cr4(cr4);
1259 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1261 write_debugreg(0, v->arch.guest_context.debugreg[0]);
1262 write_debugreg(1, v->arch.guest_context.debugreg[1]);
1263 write_debugreg(2, v->arch.guest_context.debugreg[2]);
1264 write_debugreg(3, v->arch.guest_context.debugreg[3]);
1265 write_debugreg(6, v->arch.guest_context.debugreg[6]);
1266 write_debugreg(7, v->arch.guest_context.debugreg[7]);
1270 static inline int need_full_gdt(struct vcpu *v)
1272 return (!is_hvm_vcpu(v) && !is_idle_vcpu(v));
1275 static void __context_switch(void)
1277 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1278 unsigned int cpu = smp_processor_id();
1279 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1280 struct vcpu *n = current;
1281 struct desc_struct *gdt;
1282 struct desc_ptr gdt_desc;
1284 ASSERT(p != n);
1285 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
1287 if ( !is_idle_vcpu(p) )
1289 memcpy(&p->arch.guest_context.user_regs,
1290 stack_regs,
1291 CTXT_SWITCH_STACK_BYTES);
1292 unlazy_fpu(p);
1293 p->arch.ctxt_switch_from(p);
1296 if ( !is_idle_vcpu(n) )
1298 memcpy(stack_regs,
1299 &n->arch.guest_context.user_regs,
1300 CTXT_SWITCH_STACK_BYTES);
1301 n->arch.ctxt_switch_to(n);
1304 if ( p->domain != n->domain )
1305 cpu_set(cpu, n->domain->domain_dirty_cpumask);
1306 cpu_set(cpu, n->vcpu_dirty_cpumask);
1308 gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) :
1309 per_cpu(compat_gdt_table, cpu);
1310 if ( need_full_gdt(n) )
1312 struct page_info *page = virt_to_page(gdt);
1313 unsigned int i;
1314 for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ )
1315 l1e_write(n->arch.perdomain_ptes +
1316 FIRST_RESERVED_GDT_PAGE + i,
1317 l1e_from_page(page + i, __PAGE_HYPERVISOR));
1320 if ( need_full_gdt(p) &&
1321 ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(n)) )
1323 gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
1324 gdt_desc.base = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
1325 asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1328 write_ptbase(n);
1330 if ( need_full_gdt(n) &&
1331 ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(p)) )
1333 gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
1334 gdt_desc.base = GDT_VIRT_START(n);
1335 asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1338 if ( p->domain != n->domain )
1339 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
1340 cpu_clear(cpu, p->vcpu_dirty_cpumask);
1342 per_cpu(curr_vcpu, cpu) = n;
1346 void context_switch(struct vcpu *prev, struct vcpu *next)
1348 unsigned int cpu = smp_processor_id();
1349 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
1351 ASSERT(local_irq_is_enabled());
1353 /* Allow at most one CPU at a time to be dirty. */
1354 ASSERT(cpus_weight(dirty_mask) <= 1);
1355 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
1357 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1358 flush_tlb_mask(&dirty_mask);
1361 if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1362 pt_save_timer(prev);
1364 local_irq_disable();
1366 set_current(next);
1368 if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
1370 local_irq_enable();
1372 else
1374 __context_switch();
1376 #ifdef CONFIG_COMPAT
1377 if ( !is_hvm_vcpu(next) &&
1378 (is_idle_vcpu(prev) ||
1379 is_hvm_vcpu(prev) ||
1380 is_pv_32on64_vcpu(prev) != is_pv_32on64_vcpu(next)) )
1382 uint64_t efer = read_efer();
1383 if ( !(efer & EFER_SCE) )
1384 write_efer(efer | EFER_SCE);
1386 #endif
1388 /* Re-enable interrupts before restoring state which may fault. */
1389 local_irq_enable();
1391 if ( !is_hvm_vcpu(next) )
1393 load_LDT(next);
1394 load_segments(next);
1398 context_saved(prev);
1400 /* Update per-VCPU guest runstate shared memory area (if registered). */
1401 if ( !guest_handle_is_null(runstate_guest(next)) )
1403 if ( !is_pv_32on64_domain(next->domain) )
1404 __copy_to_guest(runstate_guest(next), &next->runstate, 1);
1405 #ifdef CONFIG_COMPAT
1406 else
1408 struct compat_vcpu_runstate_info info;
1410 XLAT_vcpu_runstate_info(&info, &next->runstate);
1411 __copy_to_guest(next->runstate_guest.compat, &info, 1);
1413 #endif
1416 schedule_tail(next);
1417 BUG();
1420 void continue_running(struct vcpu *same)
1422 schedule_tail(same);
1423 BUG();
1426 int __sync_lazy_execstate(void)
1428 unsigned long flags;
1429 int switch_required;
1431 local_irq_save(flags);
1433 switch_required = (this_cpu(curr_vcpu) != current);
1435 if ( switch_required )
1437 ASSERT(current == idle_vcpu[smp_processor_id()]);
1438 __context_switch();
1441 local_irq_restore(flags);
1443 return switch_required;
1446 void sync_vcpu_execstate(struct vcpu *v)
1448 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
1449 (void)__sync_lazy_execstate();
1451 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1452 flush_tlb_mask(&v->vcpu_dirty_cpumask);
1455 struct migrate_info {
1456 long (*func)(void *data);
1457 void *data;
1458 void (*saved_schedule_tail)(struct vcpu *);
1459 cpumask_t saved_affinity;
1460 unsigned int nest;
1461 };
1463 static void continue_hypercall_on_cpu_helper(struct vcpu *v)
1465 struct cpu_user_regs *regs = guest_cpu_user_regs();
1466 struct migrate_info *info = v->arch.continue_info;
1467 cpumask_t mask = info->saved_affinity;
1468 void (*saved_schedule_tail)(struct vcpu *) = info->saved_schedule_tail;
1470 regs->eax = info->func(info->data);
1472 if ( info->nest-- == 0 )
1474 xfree(info);
1475 v->arch.schedule_tail = saved_schedule_tail;
1476 v->arch.continue_info = NULL;
1477 vcpu_unlock_affinity(v, &mask);
1480 (*saved_schedule_tail)(v);
1483 int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
1485 struct vcpu *v = current;
1486 struct migrate_info *info;
1487 cpumask_t mask = cpumask_of_cpu(cpu);
1488 int rc;
1490 if ( cpu == smp_processor_id() )
1491 return func(data);
1493 info = v->arch.continue_info;
1494 if ( info == NULL )
1496 info = xmalloc(struct migrate_info);
1497 if ( info == NULL )
1498 return -ENOMEM;
1500 rc = vcpu_lock_affinity(v, &mask);
1501 if ( rc )
1503 xfree(info);
1504 return rc;
1507 info->saved_schedule_tail = v->arch.schedule_tail;
1508 info->saved_affinity = mask;
1509 info->nest = 0;
1511 v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
1512 v->arch.continue_info = info;
1514 else
1516 BUG_ON(info->nest != 0);
1517 rc = vcpu_locked_change_affinity(v, &mask);
1518 if ( rc )
1519 return rc;
1520 info->nest++;
1523 info->func = func;
1524 info->data = data;
1526 /* Dummy return value will be overwritten by new schedule_tail. */
1527 BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
1528 return 0;
1531 #define next_arg(fmt, args) ({ \
1532 unsigned long __arg; \
1533 switch ( *(fmt)++ ) \
1534 { \
1535 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
1536 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
1537 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
1538 default: __arg = 0; BUG(); \
1539 } \
1540 __arg; \
1541 })
1543 DEFINE_PER_CPU(char, hc_preempted);
1545 unsigned long hypercall_create_continuation(
1546 unsigned int op, const char *format, ...)
1548 struct mc_state *mcs = &this_cpu(mc_state);
1549 struct cpu_user_regs *regs;
1550 const char *p = format;
1551 unsigned long arg;
1552 unsigned int i;
1553 va_list args;
1555 va_start(args, format);
1557 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1559 __set_bit(_MCSF_call_preempted, &mcs->flags);
1561 for ( i = 0; *p != '\0'; i++ )
1562 mcs->call.args[i] = next_arg(p, args);
1563 if ( is_pv_32on64_domain(current->domain) )
1565 for ( ; i < 6; i++ )
1566 mcs->call.args[i] = 0;
1569 else
1571 regs = guest_cpu_user_regs();
1572 regs->eax = op;
1573 /*
1574 * For PV guest, we update EIP to re-execute 'syscall' / 'int 0x82';
1575 * HVM does not need this since 'vmcall' / 'vmmcall' is fault-like.
1576 */
1577 if ( !is_hvm_vcpu(current) )
1578 regs->eip -= 2; /* re-execute 'syscall' / 'int 0x82' */
1580 #ifdef __x86_64__
1581 if ( !is_hvm_vcpu(current) ?
1582 !is_pv_32on64_vcpu(current) :
1583 (hvm_guest_x86_mode(current) == 8) )
1585 for ( i = 0; *p != '\0'; i++ )
1587 arg = next_arg(p, args);
1588 switch ( i )
1590 case 0: regs->rdi = arg; break;
1591 case 1: regs->rsi = arg; break;
1592 case 2: regs->rdx = arg; break;
1593 case 3: regs->r10 = arg; break;
1594 case 4: regs->r8 = arg; break;
1595 case 5: regs->r9 = arg; break;
1599 else
1600 #endif
1602 if ( supervisor_mode_kernel )
1603 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
1605 for ( i = 0; *p != '\0'; i++ )
1607 arg = next_arg(p, args);
1608 switch ( i )
1610 case 0: regs->ebx = arg; break;
1611 case 1: regs->ecx = arg; break;
1612 case 2: regs->edx = arg; break;
1613 case 3: regs->esi = arg; break;
1614 case 4: regs->edi = arg; break;
1615 case 5: regs->ebp = arg; break;
1620 this_cpu(hc_preempted) = 1;
1623 va_end(args);
1625 return op;
1628 #ifdef CONFIG_COMPAT
1629 int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
1631 int rc = 0;
1632 struct mc_state *mcs = &this_cpu(mc_state);
1633 struct cpu_user_regs *regs;
1634 unsigned int i, cval = 0;
1635 unsigned long nval = 0;
1636 va_list args;
1638 BUG_ON(*id > 5);
1639 BUG_ON(mask & (1U << *id));
1641 va_start(args, mask);
1643 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1645 if ( !test_bit(_MCSF_call_preempted, &mcs->flags) )
1646 return 0;
1647 for ( i = 0; i < 6; ++i, mask >>= 1 )
1649 if ( mask & 1 )
1651 nval = va_arg(args, unsigned long);
1652 cval = va_arg(args, unsigned int);
1653 if ( cval == nval )
1654 mask &= ~1U;
1655 else
1656 BUG_ON(nval == (unsigned int)nval);
1658 else if ( id && *id == i )
1660 *id = mcs->call.args[i];
1661 id = NULL;
1663 if ( (mask & 1) && mcs->call.args[i] == nval )
1665 mcs->call.args[i] = cval;
1666 ++rc;
1668 else
1669 BUG_ON(mcs->call.args[i] != (unsigned int)mcs->call.args[i]);
1672 else
1674 regs = guest_cpu_user_regs();
1675 for ( i = 0; i < 6; ++i, mask >>= 1 )
1677 unsigned long *reg;
1679 switch ( i )
1681 case 0: reg = &regs->ebx; break;
1682 case 1: reg = &regs->ecx; break;
1683 case 2: reg = &regs->edx; break;
1684 case 3: reg = &regs->esi; break;
1685 case 4: reg = &regs->edi; break;
1686 case 5: reg = &regs->ebp; break;
1687 default: BUG(); reg = NULL; break;
1689 if ( (mask & 1) )
1691 nval = va_arg(args, unsigned long);
1692 cval = va_arg(args, unsigned int);
1693 if ( cval == nval )
1694 mask &= ~1U;
1695 else
1696 BUG_ON(nval == (unsigned int)nval);
1698 else if ( id && *id == i )
1700 *id = *reg;
1701 id = NULL;
1703 if ( (mask & 1) && *reg == nval )
1705 *reg = cval;
1706 ++rc;
1708 else
1709 BUG_ON(*reg != (unsigned int)*reg);
1713 va_end(args);
1715 return rc;
1717 #endif
1719 static int relinquish_memory(
1720 struct domain *d, struct page_list_head *list, unsigned long type)
1722 struct page_info *page;
1723 unsigned long x, y;
1724 int ret = 0;
1726 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1727 spin_lock_recursive(&d->page_alloc_lock);
1729 while ( (page = page_list_remove_head(list)) )
1731 /* Grab a reference to the page so it won't disappear from under us. */
1732 if ( unlikely(!get_page(page, d)) )
1734 /* Couldn't get a reference -- someone is freeing this page. */
1735 page_list_add_tail(page, &d->arch.relmem_list);
1736 continue;
1739 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1740 ret = put_page_and_type_preemptible(page, 1);
1741 switch ( ret )
1743 case 0:
1744 break;
1745 case -EAGAIN:
1746 case -EINTR:
1747 page_list_add(page, list);
1748 set_bit(_PGT_pinned, &page->u.inuse.type_info);
1749 put_page(page);
1750 goto out;
1751 default:
1752 BUG();
1755 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1756 put_page(page);
1758 /*
1759 * Forcibly invalidate top-most, still valid page tables at this point
1760 * to break circular 'linear page table' references as well as clean up
1761 * partially validated pages. This is okay because MMU structures are
1762 * not shared across domains and this domain is now dead. Thus top-most
1763 * valid tables are not in use so a non-zero count means circular
1764 * reference or partially validated.
1765 */
1766 y = page->u.inuse.type_info;
1767 for ( ; ; )
1769 x = y;
1770 if ( likely((x & PGT_type_mask) != type) ||
1771 likely(!(x & (PGT_validated|PGT_partial))) )
1772 break;
1774 y = cmpxchg(&page->u.inuse.type_info, x,
1775 x & ~(PGT_validated|PGT_partial));
1776 if ( likely(y == x) )
1778 /* No need for atomic update of type_info here: noone else updates it. */
1779 switch ( ret = free_page_type(page, x, 1) )
1781 case 0:
1782 break;
1783 case -EINTR:
1784 page_list_add(page, list);
1785 page->u.inuse.type_info |= PGT_validated;
1786 if ( x & PGT_partial )
1787 put_page(page);
1788 put_page(page);
1789 ret = -EAGAIN;
1790 goto out;
1791 case -EAGAIN:
1792 page_list_add(page, list);
1793 page->u.inuse.type_info |= PGT_partial;
1794 if ( x & PGT_partial )
1795 put_page(page);
1796 goto out;
1797 default:
1798 BUG();
1800 if ( x & PGT_partial )
1802 page->u.inuse.type_info--;
1803 put_page(page);
1805 break;
1809 /* Put the page on the list and /then/ potentially free it. */
1810 page_list_add_tail(page, &d->arch.relmem_list);
1811 put_page(page);
1813 if ( hypercall_preempt_check() )
1815 ret = -EAGAIN;
1816 goto out;
1820 /* list is empty at this point. */
1821 if ( !page_list_empty(&d->arch.relmem_list) )
1823 *list = d->arch.relmem_list;
1824 INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
1827 out:
1828 spin_unlock_recursive(&d->page_alloc_lock);
1829 return ret;
1832 static void vcpu_destroy_pagetables(struct vcpu *v)
1834 struct domain *d = v->domain;
1835 unsigned long pfn;
1837 #ifdef __x86_64__
1838 if ( is_pv_32on64_vcpu(v) )
1840 pfn = l4e_get_pfn(*(l4_pgentry_t *)
1841 __va(pagetable_get_paddr(v->arch.guest_table)));
1843 if ( pfn != 0 )
1845 if ( paging_mode_refcounts(d) )
1846 put_page(mfn_to_page(pfn));
1847 else
1848 put_page_and_type(mfn_to_page(pfn));
1851 l4e_write(
1852 (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
1853 l4e_empty());
1855 v->arch.cr3 = 0;
1856 return;
1858 #endif
1860 pfn = pagetable_get_pfn(v->arch.guest_table);
1861 if ( pfn != 0 )
1863 if ( paging_mode_refcounts(d) )
1864 put_page(mfn_to_page(pfn));
1865 else
1866 put_page_and_type(mfn_to_page(pfn));
1867 v->arch.guest_table = pagetable_null();
1870 #ifdef __x86_64__
1871 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
1872 pfn = pagetable_get_pfn(v->arch.guest_table_user);
1873 if ( pfn != 0 )
1875 if ( !is_pv_32bit_vcpu(v) )
1877 if ( paging_mode_refcounts(d) )
1878 put_page(mfn_to_page(pfn));
1879 else
1880 put_page_and_type(mfn_to_page(pfn));
1882 v->arch.guest_table_user = pagetable_null();
1884 #endif
1886 v->arch.cr3 = 0;
1889 int domain_relinquish_resources(struct domain *d)
1891 int ret;
1892 struct vcpu *v;
1894 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
1896 switch ( d->arch.relmem )
1898 case RELMEM_not_started:
1899 /* Tear down paging-assistance stuff. */
1900 paging_teardown(d);
1902 for_each_vcpu ( d, v )
1904 /* Drop the in-use references to page-table bases. */
1905 vcpu_destroy_pagetables(v);
1907 /*
1908 * Relinquish GDT mappings. No need for explicit unmapping of the
1909 * LDT as it automatically gets squashed with the guest mappings.
1910 */
1911 destroy_gdt(v);
1913 unmap_vcpu_info(v);
1916 if ( d->arch.pirq_eoi_map != NULL )
1918 unmap_domain_page_global(d->arch.pirq_eoi_map);
1919 put_page_and_type(mfn_to_page(d->arch.pirq_eoi_map_mfn));
1920 d->arch.pirq_eoi_map = NULL;
1923 d->arch.relmem = RELMEM_xen;
1924 /* fallthrough */
1926 /* Relinquish every page of memory. */
1927 case RELMEM_xen:
1928 ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
1929 if ( ret )
1930 return ret;
1931 #if CONFIG_PAGING_LEVELS >= 4
1932 d->arch.relmem = RELMEM_l4;
1933 /* fallthrough */
1935 case RELMEM_l4:
1936 ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1937 if ( ret )
1938 return ret;
1939 #endif
1940 #if CONFIG_PAGING_LEVELS >= 3
1941 d->arch.relmem = RELMEM_l3;
1942 /* fallthrough */
1944 case RELMEM_l3:
1945 ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
1946 if ( ret )
1947 return ret;
1948 #endif
1949 d->arch.relmem = RELMEM_l2;
1950 /* fallthrough */
1952 case RELMEM_l2:
1953 ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
1954 if ( ret )
1955 return ret;
1956 d->arch.relmem = RELMEM_done;
1957 /* fallthrough */
1959 case RELMEM_done:
1960 break;
1962 default:
1963 BUG();
1966 if ( is_hvm_domain(d) )
1967 hvm_domain_relinquish_resources(d);
1969 return 0;
1972 void arch_dump_domain_info(struct domain *d)
1974 paging_dump_domain_info(d);
1977 void arch_dump_vcpu_info(struct vcpu *v)
1979 paging_dump_vcpu_info(v);
1982 void domain_cpuid(
1983 struct domain *d,
1984 unsigned int input,
1985 unsigned int sub_input,
1986 unsigned int *eax,
1987 unsigned int *ebx,
1988 unsigned int *ecx,
1989 unsigned int *edx)
1991 cpuid_input_t *cpuid;
1992 int i;
1994 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
1996 cpuid = &d->arch.cpuids[i];
1998 if ( (cpuid->input[0] == input) &&
1999 ((cpuid->input[1] == XEN_CPUID_INPUT_UNUSED) ||
2000 (cpuid->input[1] == sub_input)) )
2002 *eax = cpuid->eax;
2003 *ebx = cpuid->ebx;
2004 *ecx = cpuid->ecx;
2005 *edx = cpuid->edx;
2006 return;
2010 *eax = *ebx = *ecx = *edx = 0;
2013 void vcpu_kick(struct vcpu *v)
2015 /*
2016 * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
2017 * pending flag. These values may fluctuate (after all, we hold no
2018 * locks) but the key insight is that each change will cause
2019 * evtchn_upcall_pending to be polled.
2021 * NB2. We save the running flag across the unblock to avoid a needless
2022 * IPI for domains that we IPI'd to unblock.
2023 */
2024 bool_t running = v->is_running;
2025 vcpu_unblock(v);
2026 if ( running && (in_irq() || (v != current)) )
2027 cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ);
2030 void vcpu_mark_events_pending(struct vcpu *v)
2032 int already_pending = test_and_set_bit(
2033 0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
2035 if ( already_pending )
2036 return;
2038 if ( is_hvm_vcpu(v) )
2039 hvm_assert_evtchn_irq(v);
2040 else
2041 vcpu_kick(v);
2044 static void vcpu_kick_softirq(void)
2046 /*
2047 * Nothing to do here: we merely prevent notifiers from racing with checks
2048 * executed on return to guest context with interrupts enabled. See, for
2049 * example, xxx_intr_assist() executed on return to HVM guest context.
2050 */
2053 static int __init init_vcpu_kick_softirq(void)
2055 open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
2056 return 0;
2058 __initcall(init_vcpu_kick_softirq);
2061 /*
2062 * Local variables:
2063 * mode: C
2064 * c-set-style: "BSD"
2065 * c-basic-offset: 4
2066 * tab-width: 4
2067 * indent-tabs-mode: nil
2068 * End:
2069 */