debuggers.hg: view xen/arch/x86/domain.c @ 16586:cd5e1e76d0bc

32-on-64: Fix domain address-size clamping, implement copy-on-grant-transfer,
and eliminate 166GB memory limit for x86/64 Xen.

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>

author    Keir Fraser <keir.fraser@citrix.com>
date      Thu Dec 06 13:39:19 2007 +0000
parents   69b56d3289f5
children  4fcc8b64c2b5

line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/domain.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/softirq.h>
23 #include <xen/grant_table.h>
24 #include <xen/iocap.h>
25 #include <xen/kernel.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <xen/compat.h>
32 #include <asm/regs.h>
33 #include <asm/mc146818rtc.h>
34 #include <asm/system.h>
35 #include <asm/io.h>
36 #include <asm/processor.h>
37 #include <asm/desc.h>
38 #include <asm/i387.h>
39 #include <asm/mpspec.h>
40 #include <asm/ldt.h>
41 #include <asm/paging.h>
42 #include <asm/hypercall.h>
43 #include <asm/hvm/hvm.h>
44 #include <asm/hvm/support.h>
45 #include <asm/debugreg.h>
46 #include <asm/msr.h>
47 #include <asm/nmi.h>
48 #include <asm/iommu.h>
49 #ifdef CONFIG_COMPAT
50 #include <compat/vcpu.h>
51 #endif
53 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
54 DEFINE_PER_CPU(u64, efer);
55 DEFINE_PER_CPU(unsigned long, cr4);
57 static void unmap_vcpu_info(struct vcpu *v);
59 static void paravirt_ctxt_switch_from(struct vcpu *v);
60 static void paravirt_ctxt_switch_to(struct vcpu *v);
62 static void vcpu_destroy_pagetables(struct vcpu *v);
64 static void continue_idle_domain(struct vcpu *v)
65 {
66 reset_stack_and_jump(idle_loop);
67 }
69 static void continue_nonidle_domain(struct vcpu *v)
70 {
71 reset_stack_and_jump(ret_from_intr);
72 }
74 static void default_idle(void)
75 {
76 local_irq_disable();
77 if ( !softirq_pending(smp_processor_id()) )
78 safe_halt();
79 else
80 local_irq_enable();
81 }
83 static void play_dead(void)
84 {
85 __cpu_disable();
86 /* This must be done before dead CPU ack */
87 cpu_exit_clear();
88 hvm_cpu_down();
89 wbinvd();
90 mb();
91 /* Ack it */
92 __get_cpu_var(cpu_state) = CPU_DEAD;
94 /* With physical CPU hotplug, we should halt the cpu. */
95 local_irq_disable();
96 for ( ; ; )
97 halt();
98 }
100 void idle_loop(void)
101 {
102 for ( ; ; )
103 {
104 if (cpu_is_offline(smp_processor_id()))
105 play_dead();
106 page_scrub_schedule_work();
107 default_idle();
108 do_softirq();
109 }
110 }
112 void startup_cpu_idle_loop(void)
113 {
114 struct vcpu *v = current;
116 ASSERT(is_idle_vcpu(v));
117 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
118 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
120 reset_stack_and_jump(idle_loop);
121 }
123 void dump_pageframe_info(struct domain *d)
124 {
125 struct page_info *page;
127 printk("Memory pages belonging to domain %u:\n", d->domain_id);
129 if ( d->tot_pages >= 10 )
130 {
131 printk(" DomPage list too long to display\n");
132 }
133 else
134 {
135 list_for_each_entry ( page, &d->page_list, list )
136 {
137 printk(" DomPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
138 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
139 page->count_info, page->u.inuse.type_info);
140 }
141 }
143 list_for_each_entry ( page, &d->xenpage_list, list )
144 {
145 printk(" XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
146 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
147 page->count_info, page->u.inuse.type_info);
148 }
149 }
151 struct vcpu *alloc_vcpu_struct(void)
152 {
153 struct vcpu *v;
154 if ( (v = xmalloc(struct vcpu)) != NULL )
155 memset(v, 0, sizeof(*v));
156 return v;
157 }
159 void free_vcpu_struct(struct vcpu *v)
160 {
161 xfree(v);
162 }
164 #ifdef CONFIG_COMPAT
166 int setup_arg_xlat_area(struct vcpu *v, l4_pgentry_t *l4tab)
167 {
168 struct domain *d = v->domain;
169 unsigned i;
170 struct page_info *pg;
172 if ( !d->arch.mm_arg_xlat_l3 )
173 {
174 pg = alloc_domheap_page(NULL);
175 if ( !pg )
176 return -ENOMEM;
177 d->arch.mm_arg_xlat_l3 = page_to_virt(pg);
178 clear_page(d->arch.mm_arg_xlat_l3);
179 }
181 l4tab[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
182 l4e_from_paddr(__pa(d->arch.mm_arg_xlat_l3), __PAGE_HYPERVISOR);
184 for ( i = 0; i < COMPAT_ARG_XLAT_PAGES; ++i )
185 {
186 unsigned long va = COMPAT_ARG_XLAT_VIRT_START(v->vcpu_id) + i * PAGE_SIZE;
187 l2_pgentry_t *l2tab;
188 l1_pgentry_t *l1tab;
190 if ( !l3e_get_intpte(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]) )
191 {
192 pg = alloc_domheap_page(NULL);
193 if ( !pg )
194 return -ENOMEM;
195 clear_page(page_to_virt(pg));
196 d->arch.mm_arg_xlat_l3[l3_table_offset(va)] = l3e_from_page(pg, __PAGE_HYPERVISOR);
197 }
198 l2tab = l3e_to_l2e(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]);
199 if ( !l2e_get_intpte(l2tab[l2_table_offset(va)]) )
200 {
201 pg = alloc_domheap_page(NULL);
202 if ( !pg )
203 return -ENOMEM;
204 clear_page(page_to_virt(pg));
205 l2tab[l2_table_offset(va)] = l2e_from_page(pg, __PAGE_HYPERVISOR);
206 }
207 l1tab = l2e_to_l1e(l2tab[l2_table_offset(va)]);
208 BUG_ON(l1e_get_intpte(l1tab[l1_table_offset(va)]));
209 pg = alloc_domheap_page(NULL);
210 if ( !pg )
211 return -ENOMEM;
212 l1tab[l1_table_offset(va)] = l1e_from_page(pg, PAGE_HYPERVISOR);
213 }
215 return 0;
216 }
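The function above installs the L4 slot for COMPAT_ARG_XLAT_VIRT_BASE and then, for each translation page, walks L3 -> L2 -> L1, allocating any missing intermediate level before installing the leaf. A minimal user-space sketch of the same allocate-on-demand walk, with plain pointer arrays standing in for page-table entries (all names below are illustrative, not Xen's):

    #include <stdio.h>
    #include <stdlib.h>

    #define ENTRIES 512   /* entries per level, as in a 4KB x86 page table */

    /* Three-level table of pointers; a leaf slot holds a data "page". */
    typedef struct { void *slot[ENTRIES]; } table_t;

    /* Allocate-on-demand walk: make sure every intermediate level exists,
     * then install a fresh page at the leaf, as setup_arg_xlat_area() does
     * with alloc_domheap_page()/l*e_from_page(). */
    static int map_leaf(table_t *top, unsigned l3, unsigned l2, unsigned l1)
    {
        table_t *mid, *leaf;

        if ( top->slot[l3] == NULL &&
             (top->slot[l3] = calloc(1, sizeof(table_t))) == NULL )
            return -1;                      /* -ENOMEM in the hypervisor */
        mid = top->slot[l3];

        if ( mid->slot[l2] == NULL &&
             (mid->slot[l2] = calloc(1, sizeof(table_t))) == NULL )
            return -1;
        leaf = mid->slot[l2];

        if ( leaf->slot[l1] != NULL )       /* BUG_ON() in the original */
            return -1;
        leaf->slot[l1] = calloc(1, 4096);   /* the translation page itself */
        return leaf->slot[l1] ? 0 : -1;
    }

    int main(void)
    {
        table_t top = { { NULL } };
        printf("map_leaf: %d\n", map_leaf(&top, 1, 2, 3));
        printf("remap (should fail): %d\n", map_leaf(&top, 1, 2, 3));
        return 0;
    }

release_arg_xlat_area() just below is the inverse walk: it frees the leaves first, then the intermediate levels.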
218 static void release_arg_xlat_area(struct domain *d)
219 {
220 if ( d->arch.mm_arg_xlat_l3 )
221 {
222 unsigned l3;
224 for ( l3 = 0; l3 < L3_PAGETABLE_ENTRIES; ++l3 )
225 {
226 if ( l3e_get_intpte(d->arch.mm_arg_xlat_l3[l3]) )
227 {
228 l2_pgentry_t *l2tab = l3e_to_l2e(d->arch.mm_arg_xlat_l3[l3]);
229 unsigned l2;
231 for ( l2 = 0; l2 < L2_PAGETABLE_ENTRIES; ++l2 )
232 {
233 if ( l2e_get_intpte(l2tab[l2]) )
234 {
235 l1_pgentry_t *l1tab = l2e_to_l1e(l2tab[l2]);
236 unsigned l1;
238 for ( l1 = 0; l1 < L1_PAGETABLE_ENTRIES; ++l1 )
239 {
240 if ( l1e_get_intpte(l1tab[l1]) )
241 free_domheap_page(l1e_get_page(l1tab[l1]));
242 }
243 free_domheap_page(l2e_get_page(l2tab[l2]));
244 }
245 }
246 free_domheap_page(l3e_get_page(d->arch.mm_arg_xlat_l3[l3]));
247 }
248 }
249 free_domheap_page(virt_to_page(d->arch.mm_arg_xlat_l3));
250 }
251 }
253 static int setup_compat_l4(struct vcpu *v)
254 {
255 struct page_info *pg = alloc_domheap_page(NULL);
256 l4_pgentry_t *l4tab;
257 int rc;
259 if ( pg == NULL )
260 return -ENOMEM;
262 /* This page needs to look like a pagetable so that it can be shadowed */
263 pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated;
265 l4tab = copy_page(page_to_virt(pg), idle_pg_table);
266 l4tab[0] = l4e_empty();
267 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
268 l4e_from_page(pg, __PAGE_HYPERVISOR);
269 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
270 l4e_from_paddr(__pa(v->domain->arch.mm_perdomain_l3),
271 __PAGE_HYPERVISOR);
273 if ( (rc = setup_arg_xlat_area(v, l4tab)) < 0 )
274 {
275 free_domheap_page(pg);
276 return rc;
277 }
279 v->arch.guest_table = pagetable_from_page(pg);
280 v->arch.guest_table_user = v->arch.guest_table;
282 return 0;
283 }
285 static void release_compat_l4(struct vcpu *v)
286 {
287 free_domheap_page(pagetable_get_page(v->arch.guest_table));
288 v->arch.guest_table = pagetable_null();
289 v->arch.guest_table_user = pagetable_null();
290 }
292 static inline int may_switch_mode(struct domain *d)
293 {
294 return (!is_hvm_domain(d) && (d->tot_pages == 0));
295 }
297 int switch_native(struct domain *d)
298 {
299 l1_pgentry_t gdt_l1e;
300 unsigned int vcpuid;
302 if ( d == NULL )
303 return -EINVAL;
304 if ( !may_switch_mode(d) )
305 return -EACCES;
306 if ( !is_pv_32on64_domain(d) )
307 return 0;
309 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
310 release_arg_xlat_area(d);
312 /* switch gdt */
313 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
314 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
315 {
316 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
317 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
318 if (d->vcpu[vcpuid])
319 release_compat_l4(d->vcpu[vcpuid]);
320 }
322 return 0;
323 }
325 int switch_compat(struct domain *d)
326 {
327 l1_pgentry_t gdt_l1e;
328 unsigned int vcpuid;
330 if ( d == NULL )
331 return -EINVAL;
332 if ( !may_switch_mode(d) )
333 return -EACCES;
334 if ( is_pv_32on64_domain(d) )
335 return 0;
337 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
339 /* switch gdt */
340 gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), PAGE_HYPERVISOR);
341 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
342 {
343 if ( (d->vcpu[vcpuid] != NULL) &&
344 (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
345 goto undo_and_fail;
346 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
347 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
348 }
350 d->arch.physaddr_bitsize =
351 fls((1UL << 32) - HYPERVISOR_COMPAT_VIRT_START(d)) - 1
352 + (PAGE_SHIFT - 2);
354 return 0;
356 undo_and_fail:
357 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
358 release_arg_xlat_area(d);
359 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
360 while ( vcpuid-- != 0 )
361 {
362 if ( d->vcpu[vcpuid] != NULL )
363 release_compat_l4(d->vcpu[vcpuid]);
364 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
365 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
366 }
367 return -ENOMEM;
368 }
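switch_compat() sizes d->arch.physaddr_bitsize from the hole between HYPERVISOR_COMPAT_VIRT_START(d) and 4GB: with 4-byte machine-to-phys entries, one per guest page, the addressable width works out to fls(hole) - 1 + (PAGE_SHIFT - 2). A stand-alone sketch of that arithmetic; the compat base used here is an assumed sample value, and fls() is a portable stand-in for Xen's:

    #include <stdio.h>

    #define PAGE_SHIFT 12

    /* fls() as Xen uses it: 1-based index of the highest set bit, 0 for 0. */
    static int fls(unsigned long long x)
    {
        int r = 0;
        while ( x )
        {
            x >>= 1;
            r++;
        }
        return r;
    }

    int main(void)
    {
        /* Assumed example value for HYPERVISOR_COMPAT_VIRT_START(d). */
        unsigned long long compat_virt_start = 0xF5800000ULL;
        unsigned long long hole = (1ULL << 32) - compat_virt_start;

        /* Mirrors: fls(hole) - 1 + (PAGE_SHIFT - 2) from switch_compat(). */
        int physaddr_bitsize = fls(hole) - 1 + (PAGE_SHIFT - 2);

        printf("M2P hole: %llu bytes -> physaddr_bitsize = %d bits\n",
               hole, physaddr_bitsize);
        return 0;
    }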
370 #else
371 #define release_arg_xlat_area(d) ((void)0)
372 #define setup_compat_l4(v) 0
373 #define release_compat_l4(v) ((void)0)
374 #endif
376 int vcpu_initialise(struct vcpu *v)
377 {
378 struct domain *d = v->domain;
379 int rc;
381 v->arch.vcpu_info_mfn = INVALID_MFN;
383 v->arch.flags = TF_kernel_mode;
385 #if defined(__i386__)
386 mapcache_vcpu_init(v);
387 #endif
389 pae_l3_cache_init(&v->arch.pae_l3_cache);
391 paging_vcpu_init(v);
393 if ( is_hvm_domain(d) )
394 {
395 if ( (rc = hvm_vcpu_initialise(v)) != 0 )
396 return rc;
397 }
398 else
399 {
400 /* PV guests by default have a 100Hz ticker. */
401 v->periodic_period = MILLISECS(10);
403 /* PV guests get an emulated PIT too for video BIOSes to use. */
404 if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
405 pit_init(v, cpu_khz);
407 v->arch.schedule_tail = continue_nonidle_domain;
408 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
409 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
411 if ( is_idle_domain(d) )
412 {
413 v->arch.schedule_tail = continue_idle_domain;
414 v->arch.cr3 = __pa(idle_pg_table);
415 }
417 v->arch.guest_context.ctrlreg[4] =
418 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
419 }
421 v->arch.perdomain_ptes =
422 d->arch.mm_perdomain_pt + (v->vcpu_id << GDT_LDT_VCPU_SHIFT);
424 return (is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0);
425 }
427 void vcpu_destroy(struct vcpu *v)
428 {
429 if ( is_pv_32on64_vcpu(v) )
430 release_compat_l4(v);
432 unmap_vcpu_info(v);
434 if ( is_hvm_vcpu(v) )
435 hvm_vcpu_destroy(v);
436 }
438 int arch_domain_create(struct domain *d)
439 {
440 #ifdef __x86_64__
441 struct page_info *pg;
442 int i;
443 #endif
444 l1_pgentry_t gdt_l1e;
445 int vcpuid, pdpt_order, paging_initialised = 0;
446 int rc = -ENOMEM;
448 d->arch.relmem = RELMEM_not_started;
449 INIT_LIST_HEAD(&d->arch.relmem_list);
451 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
452 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
453 if ( d->arch.mm_perdomain_pt == NULL )
454 goto fail;
455 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
457 /*
458 * Map Xen segments into every VCPU's GDT, irrespective of whether every
459 * VCPU will actually be used. This avoids an NMI race during context
460 * switch: if we take an interrupt after switching CR3 but before switching
461 * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
462 * try to load CS from an invalid table.
463 */
464 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
465 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
466 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
467 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
469 #if defined(__i386__)
471 mapcache_domain_init(d);
473 #else /* __x86_64__ */
475 if ( (pg = alloc_domheap_page(NULL)) == NULL )
476 goto fail;
477 d->arch.mm_perdomain_l2 = page_to_virt(pg);
478 clear_page(d->arch.mm_perdomain_l2);
479 for ( i = 0; i < (1 << pdpt_order); i++ )
480 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
481 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
482 __PAGE_HYPERVISOR);
484 if ( (pg = alloc_domheap_page(NULL)) == NULL )
485 goto fail;
486 d->arch.mm_perdomain_l3 = page_to_virt(pg);
487 clear_page(d->arch.mm_perdomain_l3);
488 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
489 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
490 __PAGE_HYPERVISOR);
492 #endif /* __x86_64__ */
494 #ifdef CONFIG_COMPAT
495 HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START;
496 #endif
498 paging_domain_init(d);
499 paging_initialised = 1;
501 if ( !is_idle_domain(d) )
502 {
503 d->arch.ioport_caps =
504 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
505 if ( d->arch.ioport_caps == NULL )
506 goto fail;
508 if ( (d->shared_info = alloc_xenheap_page()) == NULL )
509 goto fail;
511 clear_page(d->shared_info);
512 share_xen_page_with_guest(
513 virt_to_page(d->shared_info), d, XENSHARE_writable);
514 }
516 if ( (rc = iommu_domain_init(d)) != 0 )
517 goto fail;
519 if ( is_hvm_domain(d) )
520 {
521 if ( (rc = hvm_domain_initialise(d)) != 0 )
522 {
523 iommu_domain_destroy(d);
524 goto fail;
525 }
526 }
527 else
528 {
529 /* 32-bit PV guest by default only if Xen is not 64-bit. */
530 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
531 (CONFIG_PAGING_LEVELS != 4);
532 }
534 return 0;
536 fail:
537 free_xenheap_page(d->shared_info);
538 if ( paging_initialised )
539 paging_final_teardown(d);
540 #ifdef __x86_64__
541 if ( d->arch.mm_perdomain_l2 )
542 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
543 if ( d->arch.mm_perdomain_l3 )
544 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
545 #endif
546 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
547 return rc;
548 }
550 void arch_domain_destroy(struct domain *d)
551 {
552 if ( is_hvm_domain(d) )
553 hvm_domain_destroy(d);
555 iommu_domain_destroy(d);
557 paging_final_teardown(d);
559 free_xenheap_pages(
560 d->arch.mm_perdomain_pt,
561 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
563 #ifdef __x86_64__
564 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
565 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
566 #endif
568 if ( is_pv_32on64_domain(d) )
569 release_arg_xlat_area(d);
571 free_xenheap_page(d->shared_info);
572 }
574 unsigned long pv_guest_cr4_fixup(unsigned long guest_cr4)
575 {
576 unsigned long hv_cr4_mask, hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4());
578 hv_cr4_mask = ~X86_CR4_TSD;
579 if ( cpu_has_de )
580 hv_cr4_mask &= ~X86_CR4_DE;
582 if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) )
583 gdprintk(XENLOG_WARNING,
584 "Attempt to change CR4 flags %08lx -> %08lx\n",
585 hv_cr4, guest_cr4);
587 return (hv_cr4 & hv_cr4_mask) | (guest_cr4 & ~hv_cr4_mask);
588 }
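pv_guest_cr4_fixup() keeps the hypervisor-owned CR4 bits and lets the guest control only TSD (plus DE where the CPU supports it); the return statement is the usual (a & mask) | (b & ~mask) merge. A small self-contained illustration with real CR4 bit values but made-up register contents:

    #include <stdio.h>

    #define X86_CR4_TSD 0x0004UL   /* guest-controllable in this example */
    #define X86_CR4_DE  0x0008UL
    #define X86_CR4_PGE 0x0080UL   /* hypervisor-owned in this example */

    int main(void)
    {
        unsigned long hv_cr4    = X86_CR4_PGE;              /* what Xen runs with     */
        unsigned long guest_cr4 = X86_CR4_TSD | X86_CR4_DE; /* what the guest asked for */

        /* Everything except TSD and DE stays under hypervisor control. */
        unsigned long hv_cr4_mask = ~(X86_CR4_TSD | X86_CR4_DE);

        /* Same merge as the return statement in pv_guest_cr4_fixup(). */
        unsigned long result = (hv_cr4 & hv_cr4_mask) | (guest_cr4 & ~hv_cr4_mask);

        printf("merged cr4 = %#lx\n", result);   /* PGE | DE | TSD */
        return 0;
    }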
590 /* This is called by arch_final_setup_guest and do_boot_vcpu */
591 int arch_set_info_guest(
592 struct vcpu *v, vcpu_guest_context_u c)
593 {
594 struct domain *d = v->domain;
595 unsigned long cr3_pfn = INVALID_MFN;
596 unsigned long flags, cr4;
597 int i, rc = 0, compat;
599 /* The context is a compat-mode one if the target domain is compat-mode;
600 * we expect the tools to DTRT even in compat-mode callers. */
601 compat = is_pv_32on64_domain(d);
603 #ifdef CONFIG_COMPAT
604 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
605 #else
606 #define c(fld) (c.nat->fld)
607 #endif
608 flags = c(flags);
610 if ( !is_hvm_vcpu(v) )
611 {
612 if ( !compat )
613 {
614 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
615 fixup_guest_stack_selector(d, c.nat->kernel_ss);
616 fixup_guest_code_selector(d, c.nat->user_regs.cs);
617 #ifdef __i386__
618 fixup_guest_code_selector(d, c.nat->event_callback_cs);
619 fixup_guest_code_selector(d, c.nat->failsafe_callback_cs);
620 #endif
622 for ( i = 0; i < 256; i++ )
623 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
625 /* LDT safety checks. */
626 if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) ||
627 (c.nat->ldt_ents > 8192) ||
628 !array_access_ok(c.nat->ldt_base,
629 c.nat->ldt_ents,
630 LDT_ENTRY_SIZE) )
631 return -EINVAL;
632 }
633 #ifdef CONFIG_COMPAT
634 else
635 {
636 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
637 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
638 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
639 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
640 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
642 for ( i = 0; i < 256; i++ )
643 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
645 /* LDT safety checks. */
646 if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) ||
647 (c.cmp->ldt_ents > 8192) ||
648 !compat_array_access_ok(c.cmp->ldt_base,
649 c.cmp->ldt_ents,
650 LDT_ENTRY_SIZE) )
651 return -EINVAL;
652 }
653 #endif
654 }
656 v->fpu_initialised = !!(flags & VGCF_I387_VALID);
658 v->arch.flags &= ~TF_kernel_mode;
659 if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
660 v->arch.flags |= TF_kernel_mode;
662 if ( !compat )
663 memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat));
664 #ifdef CONFIG_COMPAT
665 else
666 XLAT_vcpu_guest_context(&v->arch.guest_context, c.cmp);
667 #endif
669 v->arch.guest_context.user_regs.eflags |= 2;
671 if ( is_hvm_vcpu(v) )
672 goto out;
674 /* Only CR0.TS is modifiable by guest or admin. */
675 v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
676 v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
678 init_int80_direct_trap(v);
680 /* IOPL privileges are virtualised. */
681 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
682 v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
684 /* Ensure real hardware interrupts are enabled. */
685 v->arch.guest_context.user_regs.eflags |= EF_IE;
687 cr4 = v->arch.guest_context.ctrlreg[4];
688 v->arch.guest_context.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(cr4) :
689 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
691 memset(v->arch.guest_context.debugreg, 0,
692 sizeof(v->arch.guest_context.debugreg));
693 for ( i = 0; i < 8; i++ )
694 (void)set_debugreg(v, i, c(debugreg[i]));
696 if ( v->is_initialised )
697 goto out;
699 if ( v->vcpu_id == 0 )
700 d->vm_assist = c(vm_assist);
702 if ( !compat )
703 rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
704 #ifdef CONFIG_COMPAT
705 else
706 {
707 unsigned long gdt_frames[ARRAY_SIZE(c.cmp->gdt_frames)];
708 unsigned int i, n = (c.cmp->gdt_ents + 511) / 512;
710 if ( n > ARRAY_SIZE(c.cmp->gdt_frames) )
711 return -EINVAL;
712 for ( i = 0; i < n; ++i )
713 gdt_frames[i] = c.cmp->gdt_frames[i];
714 rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents);
715 }
716 #endif
717 if ( rc != 0 )
718 return rc;
720 if ( !compat )
721 {
722 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[3]));
724 if ( !mfn_valid(cr3_pfn) ||
725 (paging_mode_refcounts(d)
726 ? !get_page(mfn_to_page(cr3_pfn), d)
727 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
728 PGT_base_page_table)) )
729 {
730 destroy_gdt(v);
731 return -EINVAL;
732 }
734 v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
736 #ifdef __x86_64__
737 if ( c.nat->ctrlreg[1] )
738 {
739 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[1]));
741 if ( !mfn_valid(cr3_pfn) ||
742 (paging_mode_refcounts(d)
743 ? !get_page(mfn_to_page(cr3_pfn), d)
744 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
745 PGT_base_page_table)) )
746 {
747 cr3_pfn = pagetable_get_pfn(v->arch.guest_table);
748 v->arch.guest_table = pagetable_null();
749 if ( paging_mode_refcounts(d) )
750 put_page(mfn_to_page(cr3_pfn));
751 else
752 put_page_and_type(mfn_to_page(cr3_pfn));
753 destroy_gdt(v);
754 return -EINVAL;
755 }
757 v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn);
758 }
759 #endif
760 }
761 #ifdef CONFIG_COMPAT
762 else
763 {
764 l4_pgentry_t *l4tab;
766 cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
768 if ( !mfn_valid(cr3_pfn) ||
769 (paging_mode_refcounts(d)
770 ? !get_page(mfn_to_page(cr3_pfn), d)
771 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
772 PGT_l3_page_table)) )
773 {
774 destroy_gdt(v);
775 return -EINVAL;
776 }
778 l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
779 *l4tab = l4e_from_pfn(
780 cr3_pfn, _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
781 }
782 #endif
784 if ( v->vcpu_id == 0 )
785 update_domain_wallclock_time(d);
787 /* Don't redo final setup */
788 v->is_initialised = 1;
790 if ( paging_mode_enabled(d) )
791 paging_update_paging_modes(v);
793 update_cr3(v);
795 out:
796 if ( flags & VGCF_online )
797 clear_bit(_VPF_down, &v->pause_flags);
798 else
799 set_bit(_VPF_down, &v->pause_flags);
800 return 0;
801 #undef c
802 }
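arch_set_info_guest() reads every context field through the c(fld) macro so a single code path copes with both the native and the compat layout of vcpu_guest_context. A stripped-down sketch of that union-plus-flag accessor pattern (the struct fields are invented for illustration):

    #include <stdio.h>
    #include <stdint.h>

    /* Native (64-bit) and compat (32-bit) views of the same guest fields. */
    struct ctxt_nat { uint64_t flags; uint64_t ldt_base; };
    struct ctxt_cmp { uint32_t flags; uint32_t ldt_base; };

    typedef union {
        struct ctxt_nat *nat;
        struct ctxt_cmp *cmp;
    } ctxt_u;

    /* Same shape as the c(fld) macro in arch_set_info_guest(). */
    #define c(compat, u, fld) ((compat) ? (u).cmp->fld : (u).nat->fld)

    int main(void)
    {
        struct ctxt_nat nat = { .flags = 1, .ldt_base = 0x10000 };
        struct ctxt_cmp cmp = { .flags = 2, .ldt_base = 0x2000 };
        ctxt_u u;

        u.nat = &nat;
        printf("native ldt_base = %#llx\n",
               (unsigned long long)c(0, u, ldt_base));

        u.cmp = &cmp;
        printf("compat ldt_base = %#llx\n",
               (unsigned long long)c(1, u, ldt_base));
        return 0;
    }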
804 int arch_vcpu_reset(struct vcpu *v)
805 {
806 destroy_gdt(v);
807 vcpu_destroy_pagetables(v);
808 return 0;
809 }
811 /*
812 * Unmap the vcpu info page if the guest decided to place it somewhere
813 * else. This is only used from arch_domain_destroy, so there's no
814 * need to do anything clever.
815 */
816 static void
817 unmap_vcpu_info(struct vcpu *v)
818 {
819 struct domain *d = v->domain;
820 unsigned long mfn;
822 if ( v->arch.vcpu_info_mfn == INVALID_MFN )
823 return;
825 mfn = v->arch.vcpu_info_mfn;
826 unmap_domain_page_global(v->vcpu_info);
828 v->vcpu_info = shared_info_addr(d, vcpu_info[v->vcpu_id]);
829 v->arch.vcpu_info_mfn = INVALID_MFN;
831 put_page_and_type(mfn_to_page(mfn));
832 }
834 /*
835 * Map a guest page in and point the vcpu_info pointer at it. This
836 * makes sure that the vcpu_info is always pointing at a valid piece
837 * of memory, and it sets a pending event to make sure that a pending
838 * event doesn't get missed.
839 */
840 static int
841 map_vcpu_info(struct vcpu *v, unsigned long mfn, unsigned offset)
842 {
843 struct domain *d = v->domain;
844 void *mapping;
845 vcpu_info_t *new_info;
846 int i;
848 if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) )
849 return -EINVAL;
851 if ( v->arch.vcpu_info_mfn != INVALID_MFN )
852 return -EINVAL;
854 /* Run this command on yourself or on other offline VCPUS. */
855 if ( (v != current) && !test_bit(_VPF_down, &v->pause_flags) )
856 return -EINVAL;
858 mfn = gmfn_to_mfn(d, mfn);
859 if ( !mfn_valid(mfn) ||
860 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
861 return -EINVAL;
863 mapping = map_domain_page_global(mfn);
864 if ( mapping == NULL )
865 {
866 put_page_and_type(mfn_to_page(mfn));
867 return -ENOMEM;
868 }
870 new_info = (vcpu_info_t *)(mapping + offset);
872 memcpy(new_info, v->vcpu_info, sizeof(*new_info));
874 v->vcpu_info = new_info;
875 v->arch.vcpu_info_mfn = mfn;
877 /* Set new vcpu_info pointer /before/ setting pending flags. */
878 wmb();
880 /*
881 * Mark everything as being pending just to make sure nothing gets
882 * lost. The domain will get a spurious event, but it can cope.
883 */
884 vcpu_info(v, evtchn_upcall_pending) = 1;
885 for ( i = 0; i < BITS_PER_GUEST_LONG(d); i++ )
886 set_bit(i, vcpu_info_addr(v, evtchn_pending_sel));
888 /*
889 * Only bother to update time for the current vcpu. If we're
890 * operating on another vcpu, then it had better not be running at
891 * the time.
892 */
893 if ( v == current )
894 update_vcpu_system_time(v);
896 return 0;
897 }
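map_vcpu_info() copies the current vcpu_info into the newly mapped page, publishes the new pointer, and only after a write barrier marks every event channel pending, so the worst case is a spurious upcall rather than a lost event. A user-space sketch of that publish-then-flag ordering, with __sync_synchronize() standing in for wmb() (types and names here are illustrative):

    #include <stdio.h>
    #include <string.h>

    struct fake_vcpu_info {
        int evtchn_upcall_pending;
        unsigned long evtchn_pending_sel;
    };

    static struct fake_vcpu_info shared_slot;          /* the built-in location  */
    static struct fake_vcpu_info *info = &shared_slot; /* pointer readers follow */

    static void remap_info(struct fake_vcpu_info *new_info)
    {
        /* 1. Carry the current state over to the new location. */
        memcpy(new_info, info, sizeof(*new_info));

        /* 2. Publish the new pointer... */
        info = new_info;

        /* ...and make it visible before the flags below (wmb() in
         * map_vcpu_info()). */
        __sync_synchronize();

        /* 3. Pessimistically mark everything pending; a spurious event is
         * harmless, a lost one is not. */
        info->evtchn_upcall_pending = 1;
        info->evtchn_pending_sel = ~0UL;
    }

    int main(void)
    {
        struct fake_vcpu_info guest_page = { 0, 0 };
        remap_info(&guest_page);
        printf("pending=%d sel=%#lx\n",
               info->evtchn_upcall_pending, info->evtchn_pending_sel);
        return 0;
    }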
899 long
900 arch_do_vcpu_op(
901 int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
902 {
903 long rc = 0;
905 switch ( cmd )
906 {
907 case VCPUOP_register_runstate_memory_area:
908 {
909 struct vcpu_register_runstate_memory_area area;
910 struct vcpu_runstate_info runstate;
912 rc = -EFAULT;
913 if ( copy_from_guest(&area, arg, 1) )
914 break;
916 if ( !guest_handle_okay(area.addr.h, 1) )
917 break;
919 rc = 0;
920 runstate_guest(v) = area.addr.h;
922 if ( v == current )
923 {
924 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
925 }
926 else
927 {
928 vcpu_runstate_get(v, &runstate);
929 __copy_to_guest(runstate_guest(v), &runstate, 1);
930 }
932 break;
933 }
935 case VCPUOP_register_vcpu_info:
936 {
937 struct domain *d = v->domain;
938 struct vcpu_register_vcpu_info info;
940 rc = -EFAULT;
941 if ( copy_from_guest(&info, arg, 1) )
942 break;
944 LOCK_BIGLOCK(d);
945 rc = map_vcpu_info(v, info.mfn, info.offset);
946 UNLOCK_BIGLOCK(d);
948 break;
949 }
951 default:
952 rc = -ENOSYS;
953 break;
954 }
956 return rc;
957 }
959 #ifdef __x86_64__
961 #define loadsegment(seg,value) ({ \
962 int __r = 1; \
963 asm volatile ( \
964 "1: movl %k1,%%" #seg "\n2:\n" \
965 ".section .fixup,\"ax\"\n" \
966 "3: xorl %k0,%k0\n" \
967 " movl %k0,%%" #seg "\n" \
968 " jmp 2b\n" \
969 ".previous\n" \
970 ".section __ex_table,\"a\"\n" \
971 " .align 8\n" \
972 " .quad 1b,3b\n" \
973 ".previous" \
974 : "=r" (__r) : "r" (value), "0" (__r) );\
975 __r; })
977 /*
978 * save_segments() writes a mask of segments which are dirty (non-zero),
979 * allowing load_segments() to avoid some expensive segment loads and
980 * MSR writes.
981 */
982 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
983 #define DIRTY_DS 0x01
984 #define DIRTY_ES 0x02
985 #define DIRTY_FS 0x04
986 #define DIRTY_GS 0x08
987 #define DIRTY_FS_BASE 0x10
988 #define DIRTY_GS_BASE_USER 0x20
990 static void load_segments(struct vcpu *n)
991 {
992 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
993 int all_segs_okay = 1;
994 unsigned int dirty_segment_mask, cpu = smp_processor_id();
996 /* Load and clear the dirty segment mask. */
997 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
998 per_cpu(dirty_segment_mask, cpu) = 0;
1000 /* Either selector != 0 ==> reload. */
1001 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
1002 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
1004 /* Either selector != 0 ==> reload. */
1005 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
1006 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
1008 /*
1009 * Either selector != 0 ==> reload.
1010 * Also reload to reset FS_BASE if it was non-zero.
1011 */
1012 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
1013 nctxt->user_regs.fs) )
1014 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
1016 /*
1017 * Either selector != 0 ==> reload.
1018 * Also reload to reset GS_BASE if it was non-zero.
1019 */
1020 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
1021 nctxt->user_regs.gs) )
1023 /* Reset GS_BASE with user %gs? */
1024 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
1025 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
1028 if ( !is_pv_32on64_domain(n->domain) )
1030 /* This can only be non-zero if selector is NULL. */
1031 if ( nctxt->fs_base )
1032 wrmsr(MSR_FS_BASE,
1033 nctxt->fs_base,
1034 nctxt->fs_base>>32);
1036 /* Most kernels have non-zero GS base, so don't bother testing. */
1037 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
1038 wrmsr(MSR_SHADOW_GS_BASE,
1039 nctxt->gs_base_kernel,
1040 nctxt->gs_base_kernel>>32);
1042 /* This can only be non-zero if selector is NULL. */
1043 if ( nctxt->gs_base_user )
1044 wrmsr(MSR_GS_BASE,
1045 nctxt->gs_base_user,
1046 nctxt->gs_base_user>>32);
1048 /* If in kernel mode then switch the GS bases around. */
1049 if ( (n->arch.flags & TF_kernel_mode) )
1050 asm volatile ( "swapgs" );
1053 if ( unlikely(!all_segs_okay) )
1055 struct cpu_user_regs *regs = guest_cpu_user_regs();
1056 unsigned long *rsp =
1057 (n->arch.flags & TF_kernel_mode) ?
1058 (unsigned long *)regs->rsp :
1059 (unsigned long *)nctxt->kernel_sp;
1060 unsigned long cs_and_mask, rflags;
1062 if ( is_pv_32on64_domain(n->domain) )
1064 unsigned int *esp = ring_1(regs) ?
1065 (unsigned int *)regs->rsp :
1066 (unsigned int *)nctxt->kernel_sp;
1067 unsigned int cs_and_mask, eflags;
1068 int ret = 0;
1070 /* CS longword also contains full evtchn_upcall_mask. */
1071 cs_and_mask = (unsigned short)regs->cs |
1072 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
1073 /* Fold upcall mask into RFLAGS.IF. */
1074 eflags = regs->_eflags & ~X86_EFLAGS_IF;
1075 eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1077 if ( !ring_1(regs) )
1079 ret = put_user(regs->ss, esp-1);
1080 ret |= put_user(regs->_esp, esp-2);
1081 esp -= 2;
1084 if ( ret |
1085 put_user(eflags, esp-1) |
1086 put_user(cs_and_mask, esp-2) |
1087 put_user(regs->_eip, esp-3) |
1088 put_user(nctxt->user_regs.gs, esp-4) |
1089 put_user(nctxt->user_regs.fs, esp-5) |
1090 put_user(nctxt->user_regs.es, esp-6) |
1091 put_user(nctxt->user_regs.ds, esp-7) )
1093 gdprintk(XENLOG_ERR, "Error while creating compat "
1094 "failsafe callback frame.\n");
1095 domain_crash(n->domain);
1098 if ( test_bit(_VGCF_failsafe_disables_events,
1099 &n->arch.guest_context.flags) )
1100 vcpu_info(n, evtchn_upcall_mask) = 1;
1102 regs->entry_vector = TRAP_syscall;
1103 regs->_eflags &= 0xFFFCBEFFUL;
1104 regs->ss = FLAT_COMPAT_KERNEL_SS;
1105 regs->_esp = (unsigned long)(esp-7);
1106 regs->cs = FLAT_COMPAT_KERNEL_CS;
1107 regs->_eip = nctxt->failsafe_callback_eip;
1108 return;
1111 if ( !(n->arch.flags & TF_kernel_mode) )
1112 toggle_guest_mode(n);
1113 else
1114 regs->cs &= ~3;
1116 /* CS longword also contains full evtchn_upcall_mask. */
1117 cs_and_mask = (unsigned long)regs->cs |
1118 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1120 /* Fold upcall mask into RFLAGS.IF. */
1121 rflags = regs->rflags & ~X86_EFLAGS_IF;
1122 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1124 if ( put_user(regs->ss, rsp- 1) |
1125 put_user(regs->rsp, rsp- 2) |
1126 put_user(rflags, rsp- 3) |
1127 put_user(cs_and_mask, rsp- 4) |
1128 put_user(regs->rip, rsp- 5) |
1129 put_user(nctxt->user_regs.gs, rsp- 6) |
1130 put_user(nctxt->user_regs.fs, rsp- 7) |
1131 put_user(nctxt->user_regs.es, rsp- 8) |
1132 put_user(nctxt->user_regs.ds, rsp- 9) |
1133 put_user(regs->r11, rsp-10) |
1134 put_user(regs->rcx, rsp-11) )
1136 gdprintk(XENLOG_ERR, "Error while creating failsafe "
1137 "callback frame.\n");
1138 domain_crash(n->domain);
1141 if ( test_bit(_VGCF_failsafe_disables_events,
1142 &n->arch.guest_context.flags) )
1143 vcpu_info(n, evtchn_upcall_mask) = 1;
1145 regs->entry_vector = TRAP_syscall;
1146 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1147 X86_EFLAGS_NT|X86_EFLAGS_TF);
1148 regs->ss = FLAT_KERNEL_SS;
1149 regs->rsp = (unsigned long)(rsp-11);
1150 regs->cs = FLAT_KERNEL_CS;
1151 regs->rip = nctxt->failsafe_callback_eip;
1155 static void save_segments(struct vcpu *v)
1157 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
1158 struct cpu_user_regs *regs = &ctxt->user_regs;
1159 unsigned int dirty_segment_mask = 0;
1161 regs->ds = read_segment_register(ds);
1162 regs->es = read_segment_register(es);
1163 regs->fs = read_segment_register(fs);
1164 regs->gs = read_segment_register(gs);
1166 if ( regs->ds )
1167 dirty_segment_mask |= DIRTY_DS;
1169 if ( regs->es )
1170 dirty_segment_mask |= DIRTY_ES;
1172 if ( regs->fs || is_pv_32on64_domain(v->domain) )
1174 dirty_segment_mask |= DIRTY_FS;
1175 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
1177 else if ( ctxt->fs_base )
1179 dirty_segment_mask |= DIRTY_FS_BASE;
1182 if ( regs->gs || is_pv_32on64_domain(v->domain) )
1184 dirty_segment_mask |= DIRTY_GS;
1185 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
1187 else if ( ctxt->gs_base_user )
1189 dirty_segment_mask |= DIRTY_GS_BASE_USER;
1192 this_cpu(dirty_segment_mask) = dirty_segment_mask;
1195 #define switch_kernel_stack(v) ((void)0)
1197 #elif defined(__i386__)
1199 #define load_segments(n) ((void)0)
1200 #define save_segments(p) ((void)0)
1202 static inline void switch_kernel_stack(struct vcpu *v)
1204 struct tss_struct *tss = &init_tss[smp_processor_id()];
1205 tss->esp1 = v->arch.guest_context.kernel_sp;
1206 tss->ss1 = v->arch.guest_context.kernel_ss;
1209 #endif /* __i386__ */
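save_segments() snapshots the live %ds/%es/%fs/%gs selectors so the next load_segments() can skip reloads that would be no-ops. Reading a selector into a general-purpose register is unprivileged, so the same read works from user space; a small sketch assuming GCC-style inline asm on x86-64:

    #include <stdio.h>

    /* Rough user-space analogue of read_segment_register(). */
    #define read_seg(name) ({                                    \
        unsigned int sel__;                                      \
        asm volatile ( "mov %%" #name ", %0" : "=r" (sel__) );   \
        sel__; })

    int main(void)
    {
        printf("ds=%#x es=%#x fs=%#x gs=%#x\n",
               read_seg(ds), read_seg(es), read_seg(fs), read_seg(gs));
        return 0;
    }

On a 64-bit kernel the data selectors are usually null and the FS/GS bases live in MSRs, which is exactly why load_segments() handles MSR_FS_BASE, MSR_GS_BASE and MSR_SHADOW_GS_BASE separately from the selector reloads.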
1211 static void paravirt_ctxt_switch_from(struct vcpu *v)
1213 save_segments(v);
1215 /*
1216 * Disable debug breakpoints. We do this aggressively because if we switch
1217 * to an HVM guest we may load DR0-DR3 with values that can cause #DB
1218 * inside Xen, before we get a chance to reload DR7, and this cannot always
1219 * safely be handled.
1220 */
1221 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1222 write_debugreg(7, 0);
1225 static void paravirt_ctxt_switch_to(struct vcpu *v)
1227 unsigned long cr4;
1229 set_int80_direct_trap(v);
1230 switch_kernel_stack(v);
1232 cr4 = pv_guest_cr4_to_real_cr4(v->arch.guest_context.ctrlreg[4]);
1233 if ( unlikely(cr4 != read_cr4()) )
1234 write_cr4(cr4);
1236 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1238 write_debugreg(0, v->arch.guest_context.debugreg[0]);
1239 write_debugreg(1, v->arch.guest_context.debugreg[1]);
1240 write_debugreg(2, v->arch.guest_context.debugreg[2]);
1241 write_debugreg(3, v->arch.guest_context.debugreg[3]);
1242 write_debugreg(6, v->arch.guest_context.debugreg[6]);
1243 write_debugreg(7, v->arch.guest_context.debugreg[7]);
1247 static void __context_switch(void)
1249 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1250 unsigned int cpu = smp_processor_id();
1251 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1252 struct vcpu *n = current;
1254 ASSERT(p != n);
1255 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
1257 if ( !is_idle_vcpu(p) )
1259 memcpy(&p->arch.guest_context.user_regs,
1260 stack_regs,
1261 CTXT_SWITCH_STACK_BYTES);
1262 unlazy_fpu(p);
1263 p->arch.ctxt_switch_from(p);
1266 if ( !is_idle_vcpu(n) )
1268 memcpy(stack_regs,
1269 &n->arch.guest_context.user_regs,
1270 CTXT_SWITCH_STACK_BYTES);
1271 n->arch.ctxt_switch_to(n);
1274 if ( p->domain != n->domain )
1275 cpu_set(cpu, n->domain->domain_dirty_cpumask);
1276 cpu_set(cpu, n->vcpu_dirty_cpumask);
1278 write_ptbase(n);
1280 if ( p->vcpu_id != n->vcpu_id )
1282 char gdt_load[10];
1283 *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
1284 *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
1285 asm volatile ( "lgdt %0" : "=m" (gdt_load) );
1288 if ( p->domain != n->domain )
1289 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
1290 cpu_clear(cpu, p->vcpu_dirty_cpumask);
1292 per_cpu(curr_vcpu, cpu) = n;
1296 void context_switch(struct vcpu *prev, struct vcpu *next)
1298 unsigned int cpu = smp_processor_id();
1299 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
1301 ASSERT(local_irq_is_enabled());
1303 /* Allow at most one CPU at a time to be dirty. */
1304 ASSERT(cpus_weight(dirty_mask) <= 1);
1305 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
1307 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1308 if ( !cpus_empty(next->vcpu_dirty_cpumask) )
1309 flush_tlb_mask(next->vcpu_dirty_cpumask);
1312 local_irq_disable();
1314 if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1315 pt_save_timer(prev);
1317 set_current(next);
1319 if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
1321 local_irq_enable();
1323 else
1325 __context_switch();
1327 #ifdef CONFIG_COMPAT
1328 if ( !is_hvm_vcpu(next) &&
1329 (is_idle_vcpu(prev) ||
1330 is_hvm_vcpu(prev) ||
1331 is_pv_32on64_vcpu(prev) != is_pv_32on64_vcpu(next)) )
1333 uint64_t efer = read_efer();
1334 if ( !(efer & EFER_SCE) )
1335 write_efer(efer | EFER_SCE);
1336 flush_tlb_one_local(GDT_VIRT_START(next) +
1337 FIRST_RESERVED_GDT_BYTE);
1339 #endif
1341 /* Re-enable interrupts before restoring state which may fault. */
1342 local_irq_enable();
1344 if ( !is_hvm_vcpu(next) )
1346 load_LDT(next);
1347 load_segments(next);
1351 context_saved(prev);
1353 /* Update per-VCPU guest runstate shared memory area (if registered). */
1354 if ( !guest_handle_is_null(runstate_guest(next)) )
1356 if ( !is_pv_32on64_domain(next->domain) )
1357 __copy_to_guest(runstate_guest(next), &next->runstate, 1);
1358 #ifdef CONFIG_COMPAT
1359 else
1361 struct compat_vcpu_runstate_info info;
1363 XLAT_vcpu_runstate_info(&info, &next->runstate);
1364 __copy_to_guest(next->runstate_guest.compat, &info, 1);
1366 #endif
1369 schedule_tail(next);
1370 BUG();
1373 void continue_running(struct vcpu *same)
1375 schedule_tail(same);
1376 BUG();
1379 int __sync_lazy_execstate(void)
1381 unsigned long flags;
1382 int switch_required;
1384 local_irq_save(flags);
1386 switch_required = (this_cpu(curr_vcpu) != current);
1388 if ( switch_required )
1390 ASSERT(current == idle_vcpu[smp_processor_id()]);
1391 __context_switch();
1394 local_irq_restore(flags);
1396 return switch_required;
1399 void sync_vcpu_execstate(struct vcpu *v)
1401 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
1402 (void)__sync_lazy_execstate();
1404 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1405 flush_tlb_mask(v->vcpu_dirty_cpumask);
1408 struct migrate_info {
1409 long (*func)(void *data);
1410 void *data;
1411 void (*saved_schedule_tail)(struct vcpu *);
1412 cpumask_t saved_affinity;
1413 };
1415 static void continue_hypercall_on_cpu_helper(struct vcpu *v)
1417 struct cpu_user_regs *regs = guest_cpu_user_regs();
1418 struct migrate_info *info = v->arch.continue_info;
1420 regs->eax = info->func(info->data);
1422 v->arch.schedule_tail = info->saved_schedule_tail;
1423 v->arch.continue_info = NULL;
1425 xfree(info);
1427 vcpu_set_affinity(v, &v->cpu_affinity);
1428 schedule_tail(v);
1431 int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
1433 struct vcpu *v = current;
1434 struct migrate_info *info;
1435 cpumask_t mask = cpumask_of_cpu(cpu);
1436 int rc;
1438 if ( cpu == smp_processor_id() )
1439 return func(data);
1441 info = xmalloc(struct migrate_info);
1442 if ( info == NULL )
1443 return -ENOMEM;
1445 info->func = func;
1446 info->data = data;
1447 info->saved_schedule_tail = v->arch.schedule_tail;
1448 info->saved_affinity = v->cpu_affinity;
1450 v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
1451 v->arch.continue_info = info;
1453 rc = vcpu_set_affinity(v, &mask);
1454 if ( rc )
1456 v->arch.schedule_tail = info->saved_schedule_tail;
1457 v->arch.continue_info = NULL;
1458 xfree(info);
1459 return rc;
1462 /* Dummy return value will be overwritten by new schedule_tail. */
1463 BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
1464 return 0;
1467 #define next_arg(fmt, args) ({ \
1468 unsigned long __arg; \
1469 switch ( *(fmt)++ ) \
1470 { \
1471 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
1472 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
1473 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
1474 default: __arg = 0; BUG(); \
1475 } \
1476 __arg; \
1477 })
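The next_arg() macro consumes one variadic argument according to a one-character code ('i' int, 'l' long, 'h' guest handle), widening everything to unsigned long; hypercall_create_continuation() below uses it to re-pack the arguments into guest registers or the multicall slot. A stand-alone sketch of the same format-driven extraction:

    #include <stdarg.h>
    #include <stdio.h>

    /* Same shape as Xen's next_arg(): consume one vararg, widened to
     * unsigned long, according to a one-character format code. */
    #define next_arg(fmt, args) ({                                           \
        unsigned long arg__;                                                 \
        switch ( *(fmt)++ )                                                  \
        {                                                                    \
        case 'i': arg__ = (unsigned long)va_arg(args, unsigned int); break;  \
        case 'l': arg__ = (unsigned long)va_arg(args, unsigned long); break; \
        case 'h': arg__ = (unsigned long)va_arg(args, void *); break;        \
        default:  arg__ = 0; break;    /* BUG() in the hypervisor */         \
        }                                                                    \
        arg__; })

    static void show_args(const char *format, ...)
    {
        const char *p = format;
        va_list args;

        va_start(args, format);
        while ( *p != '\0' )
            printf("arg = %#lx\n", next_arg(p, args));
        va_end(args);
    }

    int main(void)
    {
        int dummy = 0;
        show_args("ilh", 7, 0x1234UL, (void *)&dummy);
        return 0;
    }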
1479 DEFINE_PER_CPU(char, hc_preempted);
1481 unsigned long hypercall_create_continuation(
1482 unsigned int op, const char *format, ...)
1484 struct mc_state *mcs = &this_cpu(mc_state);
1485 struct cpu_user_regs *regs;
1486 const char *p = format;
1487 unsigned long arg;
1488 unsigned int i;
1489 va_list args;
1491 va_start(args, format);
1493 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1495 __set_bit(_MCSF_call_preempted, &mcs->flags);
1497 for ( i = 0; *p != '\0'; i++ )
1498 mcs->call.args[i] = next_arg(p, args);
1499 if ( is_pv_32on64_domain(current->domain) )
1501 for ( ; i < 6; i++ )
1502 mcs->call.args[i] = 0;
1505 else
1507 regs = guest_cpu_user_regs();
1508 regs->eax = op;
1509 regs->eip -= 2; /* re-execute 'syscall' / 'int 0x82' */
1511 #ifdef __x86_64__
1512 if ( !is_hvm_vcpu(current) ?
1513 !is_pv_32on64_vcpu(current) :
1514 (hvm_guest_x86_mode(current) == 8) )
1516 for ( i = 0; *p != '\0'; i++ )
1518 arg = next_arg(p, args);
1519 switch ( i )
1521 case 0: regs->rdi = arg; break;
1522 case 1: regs->rsi = arg; break;
1523 case 2: regs->rdx = arg; break;
1524 case 3: regs->r10 = arg; break;
1525 case 4: regs->r8 = arg; break;
1526 case 5: regs->r9 = arg; break;
1530 else
1531 #endif
1533 if ( supervisor_mode_kernel )
1534 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
1536 for ( i = 0; *p != '\0'; i++ )
1538 arg = next_arg(p, args);
1539 switch ( i )
1541 case 0: regs->ebx = arg; break;
1542 case 1: regs->ecx = arg; break;
1543 case 2: regs->edx = arg; break;
1544 case 3: regs->esi = arg; break;
1545 case 4: regs->edi = arg; break;
1546 case 5: regs->ebp = arg; break;
1551 this_cpu(hc_preempted) = 1;
1554 va_end(args);
1556 return op;
1559 #ifdef CONFIG_COMPAT
1560 int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
1562 int rc = 0;
1563 struct mc_state *mcs = &this_cpu(mc_state);
1564 struct cpu_user_regs *regs;
1565 unsigned int i, cval = 0;
1566 unsigned long nval = 0;
1567 va_list args;
1569 BUG_ON(*id > 5);
1570 BUG_ON(mask & (1U << *id));
1572 va_start(args, mask);
1574 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1576 if ( !test_bit(_MCSF_call_preempted, &mcs->flags) )
1577 return 0;
1578 for ( i = 0; i < 6; ++i, mask >>= 1 )
1580 if ( mask & 1 )
1582 nval = va_arg(args, unsigned long);
1583 cval = va_arg(args, unsigned int);
1584 if ( cval == nval )
1585 mask &= ~1U;
1586 else
1587 BUG_ON(nval == (unsigned int)nval);
1589 else if ( id && *id == i )
1591 *id = mcs->call.args[i];
1592 id = NULL;
1594 if ( (mask & 1) && mcs->call.args[i] == nval )
1596 mcs->call.args[i] = cval;
1597 ++rc;
1599 else
1600 BUG_ON(mcs->call.args[i] != (unsigned int)mcs->call.args[i]);
1603 else
1605 regs = guest_cpu_user_regs();
1606 for ( i = 0; i < 6; ++i, mask >>= 1 )
1608 unsigned long *reg;
1610 switch ( i )
1612 case 0: reg = &regs->ebx; break;
1613 case 1: reg = &regs->ecx; break;
1614 case 2: reg = &regs->edx; break;
1615 case 3: reg = &regs->esi; break;
1616 case 4: reg = &regs->edi; break;
1617 case 5: reg = &regs->ebp; break;
1618 default: BUG(); reg = NULL; break;
1620 if ( (mask & 1) )
1622 nval = va_arg(args, unsigned long);
1623 cval = va_arg(args, unsigned int);
1624 if ( cval == nval )
1625 mask &= ~1U;
1626 else
1627 BUG_ON(nval == (unsigned int)nval);
1629 else if ( id && *id == i )
1631 *id = *reg;
1632 id = NULL;
1634 if ( (mask & 1) && *reg == nval )
1636 *reg = cval;
1637 ++rc;
1639 else
1640 BUG_ON(*reg != (unsigned int)*reg);
1644 va_end(args);
1646 return rc;
1648 #endif
1650 static int relinquish_memory(
1651 struct domain *d, struct list_head *list, unsigned long type)
1653 struct list_head *ent;
1654 struct page_info *page;
1655 unsigned long x, y;
1656 int ret = 0;
1658 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1659 spin_lock_recursive(&d->page_alloc_lock);
1661 ent = list->next;
1662 while ( ent != list )
1664 page = list_entry(ent, struct page_info, list);
1666 /* Grab a reference to the page so it won't disappear from under us. */
1667 if ( unlikely(!get_page(page, d)) )
1669 /* Couldn't get a reference -- someone is freeing this page. */
1670 ent = ent->next;
1671 list_move_tail(&page->list, &d->arch.relmem_list);
1672 continue;
1675 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1676 put_page_and_type(page);
1678 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1679 put_page(page);
1681 /*
1682 * Forcibly invalidate top-most, still valid page tables at this point
1683 * to break circular 'linear page table' references. This is okay
1684 * because MMU structures are not shared across domains and this domain
1685 * is now dead. Thus top-most valid tables are not in use so a non-zero
1686 * count means circular reference.
1687 */
1688 y = page->u.inuse.type_info;
1689 for ( ; ; )
1691 x = y;
1692 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
1693 (type|PGT_validated)) )
1694 break;
1696 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
1697 if ( likely(y == x) )
1699 free_page_type(page, type);
1700 break;
1704 /* Follow the list chain and /then/ potentially free the page. */
1705 ent = ent->next;
1706 list_move_tail(&page->list, &d->arch.relmem_list);
1707 put_page(page);
1709 if ( hypercall_preempt_check() )
1711 ret = -EAGAIN;
1712 goto out;
1716 list_splice_init(&d->arch.relmem_list, list);
1718 out:
1719 spin_unlock_recursive(&d->page_alloc_lock);
1720 return ret;
1721 }
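The type_info loop in relinquish_memory() is a compare-and-swap retry: read the current value, decide, attempt the swap, and only act once the swap confirms the decision was made against the value still in place. A user-space sketch of that loop shape, using GCC's __sync_val_compare_and_swap in place of Xen's cmpxchg() and an illustrative flag value:

    #include <stdio.h>

    #define PGT_validated 0x80000000u   /* illustrative bit, not Xen's value */

    static unsigned int type_info = 0x2 | PGT_validated;  /* "validated" page */

    /* Clear PGT_validated exactly once, as the relinquish_memory() loop does. */
    static int invalidate(unsigned int *ti)
    {
        unsigned int x, y = *ti;

        for ( ; ; )
        {
            x = y;
            if ( !(x & PGT_validated) )
                return 0;                   /* someone else got there first */

            /* Attempt the swap; returns the value actually found. */
            y = __sync_val_compare_and_swap(ti, x, x & ~PGT_validated);
            if ( y == x )
                return 1;                   /* we cleared it: free_page_type() here */
            /* else: *ti changed under us; retry against the fresh value. */
        }
    }

    int main(void)
    {
        printf("first call cleared it:  %d\n", invalidate(&type_info));
        printf("second call cleared it: %d\n", invalidate(&type_info));
        return 0;
    }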
1723 static void vcpu_destroy_pagetables(struct vcpu *v)
1725 struct domain *d = v->domain;
1726 unsigned long pfn;
1728 #ifdef __x86_64__
1729 if ( is_pv_32on64_vcpu(v) )
1731 pfn = l4e_get_pfn(*(l4_pgentry_t *)
1732 __va(pagetable_get_paddr(v->arch.guest_table)));
1734 if ( pfn != 0 )
1736 if ( paging_mode_refcounts(d) )
1737 put_page(mfn_to_page(pfn));
1738 else
1739 put_page_and_type(mfn_to_page(pfn));
1742 l4e_write(
1743 (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
1744 l4e_empty());
1746 v->arch.cr3 = 0;
1747 return;
1749 #endif
1751 pfn = pagetable_get_pfn(v->arch.guest_table);
1752 if ( pfn != 0 )
1754 if ( paging_mode_refcounts(d) )
1755 put_page(mfn_to_page(pfn));
1756 else
1757 put_page_and_type(mfn_to_page(pfn));
1758 v->arch.guest_table = pagetable_null();
1761 #ifdef __x86_64__
1762 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
1763 pfn = pagetable_get_pfn(v->arch.guest_table_user);
1764 if ( pfn != 0 )
1766 if ( !is_pv_32bit_vcpu(v) )
1768 if ( paging_mode_refcounts(d) )
1769 put_page(mfn_to_page(pfn));
1770 else
1771 put_page_and_type(mfn_to_page(pfn));
1773 v->arch.guest_table_user = pagetable_null();
1775 #endif
1777 v->arch.cr3 = 0;
1780 int domain_relinquish_resources(struct domain *d)
1782 int ret;
1783 struct vcpu *v;
1785 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
1787 switch ( d->arch.relmem )
1789 case RELMEM_not_started:
1790 /* Tear down paging-assistance stuff. */
1791 paging_teardown(d);
1793 /* Drop the in-use references to page-table bases. */
1794 for_each_vcpu ( d, v )
1795 vcpu_destroy_pagetables(v);
1797 /*
1798 * Relinquish GDT mappings. No need for explicit unmapping of the LDT
1799 * as it automatically gets squashed when the guest's mappings go away.
1800 */
1801 for_each_vcpu(d, v)
1802 destroy_gdt(v);
1804 d->arch.relmem = RELMEM_xen_l4;
1805 /* fallthrough */
1807 /* Relinquish every page of memory. */
1808 case RELMEM_xen_l4:
1809 #if CONFIG_PAGING_LEVELS >= 4
1810 ret = relinquish_memory(d, &d->xenpage_list, PGT_l4_page_table);
1811 if ( ret )
1812 return ret;
1813 d->arch.relmem = RELMEM_dom_l4;
1814 /* fallthrough */
1815 case RELMEM_dom_l4:
1816 ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1817 if ( ret )
1818 return ret;
1819 d->arch.relmem = RELMEM_xen_l3;
1820 /* fallthrough */
1821 #endif
1823 case RELMEM_xen_l3:
1824 #if CONFIG_PAGING_LEVELS >= 3
1825 ret = relinquish_memory(d, &d->xenpage_list, PGT_l3_page_table);
1826 if ( ret )
1827 return ret;
1828 d->arch.relmem = RELMEM_dom_l3;
1829 /* fallthrough */
1830 case RELMEM_dom_l3:
1831 ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
1832 if ( ret )
1833 return ret;
1834 d->arch.relmem = RELMEM_xen_l2;
1835 /* fallthrough */
1836 #endif
1838 case RELMEM_xen_l2:
1839 ret = relinquish_memory(d, &d->xenpage_list, PGT_l2_page_table);
1840 if ( ret )
1841 return ret;
1842 d->arch.relmem = RELMEM_dom_l2;
1843 /* fallthrough */
1844 case RELMEM_dom_l2:
1845 ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
1846 if ( ret )
1847 return ret;
1848 d->arch.relmem = RELMEM_done;
1849 /* fallthrough */
1851 case RELMEM_done:
1852 break;
1854 default:
1855 BUG();
1858 /* Free page used by xen oprofile buffer. */
1859 free_xenoprof_pages(d);
1861 if ( is_hvm_domain(d) )
1862 hvm_domain_relinquish_resources(d);
1864 return 0;
1867 void arch_dump_domain_info(struct domain *d)
1869 paging_dump_domain_info(d);
1872 void arch_dump_vcpu_info(struct vcpu *v)
1874 paging_dump_vcpu_info(v);
1877 /*
1878 * Local variables:
1879 * mode: C
1880 * c-set-style: "BSD"
1881 * c-basic-offset: 4
1882 * tab-width: 4
1883 * indent-tabs-mode: nil
1884 * End:
1885 */