xen/arch/x86/domain.c @ 17986:f2148e532c81 (debuggers.hg)

x86 hvm: Fix RTC handling.
 1. Clean up initialisation/destruction.
 2. Better handle per-domain time-offset changes.
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>

author   Keir Fraser <keir.fraser@citrix.com>
date     Wed Jul 02 17:25:05 2008 +0100
parents  09dd5999401b
children 1e9df5cb885f

line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/domain.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/softirq.h>
23 #include <xen/grant_table.h>
24 #include <xen/iocap.h>
25 #include <xen/kernel.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <xen/compat.h>
32 #include <xen/acpi.h>
33 #include <asm/regs.h>
34 #include <asm/mc146818rtc.h>
35 #include <asm/system.h>
36 #include <asm/io.h>
37 #include <asm/processor.h>
38 #include <asm/desc.h>
39 #include <asm/i387.h>
40 #include <asm/mpspec.h>
41 #include <asm/ldt.h>
42 #include <asm/paging.h>
43 #include <asm/hypercall.h>
44 #include <asm/hvm/hvm.h>
45 #include <asm/hvm/support.h>
46 #include <asm/debugreg.h>
47 #include <asm/msr.h>
48 #include <asm/nmi.h>
49 #include <xen/numa.h>
50 #include <xen/iommu.h>
51 #ifdef CONFIG_COMPAT
52 #include <compat/vcpu.h>
53 #endif
55 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
56 DEFINE_PER_CPU(u64, efer);
57 DEFINE_PER_CPU(unsigned long, cr4);
59 static void default_idle(void);
60 void (*pm_idle) (void) = default_idle;
62 static void paravirt_ctxt_switch_from(struct vcpu *v);
63 static void paravirt_ctxt_switch_to(struct vcpu *v);
65 static void vcpu_destroy_pagetables(struct vcpu *v);
67 static void continue_idle_domain(struct vcpu *v)
68 {
69 reset_stack_and_jump(idle_loop);
70 }
72 static void continue_nonidle_domain(struct vcpu *v)
73 {
74 reset_stack_and_jump(ret_from_intr);
75 }
77 static void default_idle(void)
78 {
79 local_irq_disable();
80 if ( !softirq_pending(smp_processor_id()) )
81 safe_halt();
82 else
83 local_irq_enable();
84 }
86 static void play_dead(void)
87 {
88 /* This must be done before dead CPU ack */
89 cpu_exit_clear();
90 hvm_cpu_down();
91 wbinvd();
92 mb();
93 /* Ack it */
94 __get_cpu_var(cpu_state) = CPU_DEAD;
96 /* With physical CPU hotplug, we should halt the cpu. */
97 local_irq_disable();
98 for ( ; ; )
99 halt();
100 }
102 void idle_loop(void)
103 {
104 for ( ; ; )
105 {
106 if ( cpu_is_offline(smp_processor_id()) )
107 play_dead();
108 page_scrub_schedule_work();
109 (*pm_idle)();
110 do_softirq();
111 }
112 }
114 void startup_cpu_idle_loop(void)
115 {
116 struct vcpu *v = current;
118 ASSERT(is_idle_vcpu(v));
119 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
120 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
122 reset_stack_and_jump(idle_loop);
123 }
125 void dump_pageframe_info(struct domain *d)
126 {
127 struct page_info *page;
129 printk("Memory pages belonging to domain %u:\n", d->domain_id);
131 if ( d->tot_pages >= 10 )
132 {
133 printk(" DomPage list too long to display\n");
134 }
135 else
136 {
137 list_for_each_entry ( page, &d->page_list, list )
138 {
139 printk(" DomPage %p: caf=%08x, taf=%" PRtype_info "\n",
140 _p(page_to_mfn(page)),
141 page->count_info, page->u.inuse.type_info);
142 }
143 }
145 list_for_each_entry ( page, &d->xenpage_list, list )
146 {
147 printk(" XenPage %p: caf=%08x, taf=%" PRtype_info "\n",
148 _p(page_to_mfn(page)),
149 page->count_info, page->u.inuse.type_info);
150 }
151 }
153 struct vcpu *alloc_vcpu_struct(void)
154 {
155 struct vcpu *v;
156 if ( (v = xmalloc(struct vcpu)) != NULL )
157 memset(v, 0, sizeof(*v));
158 return v;
159 }
161 void free_vcpu_struct(struct vcpu *v)
162 {
163 xfree(v);
164 }
166 #ifdef CONFIG_COMPAT
168 static int setup_compat_l4(struct vcpu *v)
169 {
170 struct page_info *pg = alloc_domheap_page(NULL, 0);
171 l4_pgentry_t *l4tab;
173 if ( pg == NULL )
174 return -ENOMEM;
176 /* This page needs to look like a pagetable so that it can be shadowed */
177 pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
179 l4tab = copy_page(page_to_virt(pg), idle_pg_table);
180 l4tab[0] = l4e_empty();
181 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
182 l4e_from_page(pg, __PAGE_HYPERVISOR);
183 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
184 l4e_from_paddr(__pa(v->domain->arch.mm_perdomain_l3),
185 __PAGE_HYPERVISOR);
187 v->arch.guest_table = pagetable_from_page(pg);
188 v->arch.guest_table_user = v->arch.guest_table;
190 return 0;
191 }
193 static void release_compat_l4(struct vcpu *v)
194 {
195 free_domheap_page(pagetable_get_page(v->arch.guest_table));
196 v->arch.guest_table = pagetable_null();
197 v->arch.guest_table_user = pagetable_null();
198 }
200 static inline int may_switch_mode(struct domain *d)
201 {
202 return (!is_hvm_domain(d) && (d->tot_pages == 0));
203 }
205 int switch_native(struct domain *d)
206 {
207 l1_pgentry_t gdt_l1e;
208 unsigned int vcpuid;
210 if ( d == NULL )
211 return -EINVAL;
212 if ( !may_switch_mode(d) )
213 return -EACCES;
214 if ( !is_pv_32on64_domain(d) )
215 return 0;
217 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
219 /* switch gdt */
220 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
221 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
222 {
223 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
224 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
225 if (d->vcpu[vcpuid])
226 release_compat_l4(d->vcpu[vcpuid]);
227 }
229 return 0;
230 }
232 int switch_compat(struct domain *d)
233 {
234 l1_pgentry_t gdt_l1e;
235 unsigned int vcpuid;
237 if ( d == NULL )
238 return -EINVAL;
239 if ( !may_switch_mode(d) )
240 return -EACCES;
241 if ( is_pv_32on64_domain(d) )
242 return 0;
244 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
246 /* switch gdt */
247 gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), PAGE_HYPERVISOR);
248 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
249 {
250 if ( (d->vcpu[vcpuid] != NULL) &&
251 (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
252 goto undo_and_fail;
253 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
254 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
255 }
257 domain_set_alloc_bitsize(d);
259 return 0;
261 undo_and_fail:
262 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
263 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
264 while ( vcpuid-- != 0 )
265 {
266 if ( d->vcpu[vcpuid] != NULL )
267 release_compat_l4(d->vcpu[vcpuid]);
268 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
269 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
270 }
271 return -ENOMEM;
272 }
274 #else
275 #define setup_compat_l4(v) 0
276 #define release_compat_l4(v) ((void)0)
277 #endif
279 int vcpu_initialise(struct vcpu *v)
280 {
281 struct domain *d = v->domain;
282 int rc;
284 v->arch.vcpu_info_mfn = INVALID_MFN;
286 v->arch.flags = TF_kernel_mode;
288 #if defined(__i386__)
289 mapcache_vcpu_init(v);
290 #endif
292 pae_l3_cache_init(&v->arch.pae_l3_cache);
294 paging_vcpu_init(v);
296 if ( is_hvm_domain(d) )
297 {
298 if ( (rc = hvm_vcpu_initialise(v)) != 0 )
299 return rc;
300 }
301 else
302 {
303 /* PV guests by default have a 100Hz ticker. */
304 v->periodic_period = MILLISECS(10);
306 /* PV guests get an emulated PIT too for video BIOSes to use. */
307 if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
308 pit_init(v, cpu_khz);
310 v->arch.schedule_tail = continue_nonidle_domain;
311 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
312 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
314 if ( is_idle_domain(d) )
315 {
316 v->arch.schedule_tail = continue_idle_domain;
317 v->arch.cr3 = __pa(idle_pg_table);
318 }
320 v->arch.guest_context.ctrlreg[4] =
321 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
322 }
324 v->arch.perdomain_ptes =
325 d->arch.mm_perdomain_pt + (v->vcpu_id << GDT_LDT_VCPU_SHIFT);
327 return (is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0);
328 }
330 void vcpu_destroy(struct vcpu *v)
331 {
332 if ( is_pv_32on64_vcpu(v) )
333 release_compat_l4(v);
335 if ( is_hvm_vcpu(v) )
336 hvm_vcpu_destroy(v);
337 }
339 int arch_domain_create(struct domain *d, unsigned int domcr_flags)
340 {
341 #ifdef __x86_64__
342 struct page_info *pg;
343 #endif
344 l1_pgentry_t gdt_l1e;
345 int i, vcpuid, pdpt_order, paging_initialised = 0;
346 int rc = -ENOMEM;
348 d->arch.hvm_domain.hap_enabled =
349 is_hvm_domain(d) &&
350 hvm_funcs.hap_supported &&
351 (domcr_flags & DOMCRF_hap);
353 d->arch.relmem = RELMEM_not_started;
354 INIT_LIST_HEAD(&d->arch.relmem_list);
356 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
357 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
358 if ( d->arch.mm_perdomain_pt == NULL )
359 goto fail;
360 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
362 /*
363 * Map Xen segments into every VCPU's GDT, irrespective of whether every
364 * VCPU will actually be used. This avoids an NMI race during context
365 * switch: if we take an interrupt after switching CR3 but before switching
366 * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
367 * try to load CS from an invalid table.
368 */
369 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
370 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
371 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
372 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
374 #if defined(__i386__)
376 mapcache_domain_init(d);
378 #else /* __x86_64__ */
380 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
381 if ( pg == NULL )
382 goto fail;
383 d->arch.mm_perdomain_l2 = page_to_virt(pg);
384 clear_page(d->arch.mm_perdomain_l2);
385 for ( i = 0; i < (1 << pdpt_order); i++ )
386 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
387 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
388 __PAGE_HYPERVISOR);
390 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
391 if ( pg == NULL )
392 goto fail;
393 d->arch.mm_perdomain_l3 = page_to_virt(pg);
394 clear_page(d->arch.mm_perdomain_l3);
395 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
396 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
397 __PAGE_HYPERVISOR);
399 #endif /* __x86_64__ */
401 #ifdef CONFIG_COMPAT
402 HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START;
403 #endif
405 if ( (rc = paging_domain_init(d)) != 0 )
406 goto fail;
407 paging_initialised = 1;
409 if ( !is_idle_domain(d) )
410 {
411 d->arch.ioport_caps =
412 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
413 rc = -ENOMEM;
414 if ( d->arch.ioport_caps == NULL )
415 goto fail;
417 if ( (d->shared_info = alloc_xenheap_page()) == NULL )
418 goto fail;
420 clear_page(d->shared_info);
421 share_xen_page_with_guest(
422 virt_to_page(d->shared_info), d, XENSHARE_writable);
424 if ( (rc = iommu_domain_init(d)) != 0 )
425 goto fail;
426 }
428 spin_lock_init(&d->arch.irq_lock);
430 if ( is_hvm_domain(d) )
431 {
432 if ( (rc = hvm_domain_initialise(d)) != 0 )
433 {
434 iommu_domain_destroy(d);
435 goto fail;
436 }
437 }
438 else
439 {
440 /* 32-bit PV guest by default only if Xen is not 64-bit. */
441 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
442 (CONFIG_PAGING_LEVELS != 4);
443 }
445 memset(d->arch.cpuids, 0, sizeof(d->arch.cpuids));
446 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
447 {
448 d->arch.cpuids[i].input[0] = XEN_CPUID_INPUT_UNUSED;
449 d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
450 }
452 return 0;
454 fail:
455 d->is_dying = DOMDYING_dead;
456 free_xenheap_page(d->shared_info);
457 if ( paging_initialised )
458 paging_final_teardown(d);
459 #ifdef __x86_64__
460 if ( d->arch.mm_perdomain_l2 )
461 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
462 if ( d->arch.mm_perdomain_l3 )
463 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
464 #endif
465 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
466 return rc;
467 }
469 void arch_domain_destroy(struct domain *d)
470 {
471 if ( is_hvm_domain(d) )
472 hvm_domain_destroy(d);
474 if ( !is_idle_domain(d) )
475 iommu_domain_destroy(d);
477 paging_final_teardown(d);
479 free_xenheap_pages(
480 d->arch.mm_perdomain_pt,
481 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
483 #ifdef __x86_64__
484 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
485 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
486 #endif
488 free_xenheap_page(d->shared_info);
489 }
491 unsigned long pv_guest_cr4_fixup(unsigned long guest_cr4)
492 {
493 unsigned long hv_cr4_mask, hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4());
495 hv_cr4_mask = ~X86_CR4_TSD;
496 if ( cpu_has_de )
497 hv_cr4_mask &= ~X86_CR4_DE;
499 if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) )
500 gdprintk(XENLOG_WARNING,
501 "Attempt to change CR4 flags %08lx -> %08lx\n",
502 hv_cr4, guest_cr4);
504 return (hv_cr4 & hv_cr4_mask) | (guest_cr4 & ~hv_cr4_mask);
505 }
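/*
 * Illustrative reading of the above: with the mask computed there, only
 * CR4.TSD (plus CR4.DE when the CPU has it) remains under guest control.
 * Any other bit the guest tries to change (e.g. clearing PAE) triggers
 * the warning and is silently replaced by the hypervisor's own value.
 */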
507 /* This is called by arch_final_setup_guest and do_boot_vcpu */
508 int arch_set_info_guest(
509 struct vcpu *v, vcpu_guest_context_u c)
510 {
511 struct domain *d = v->domain;
512 unsigned long cr3_pfn = INVALID_MFN;
513 unsigned long flags, cr4;
514 int i, rc = 0, compat;
516 /* The context is a compat-mode one if the target domain is compat-mode;
517 * we expect the tools to DTRT even in compat-mode callers. */
518 compat = is_pv_32on64_domain(d);
520 #ifdef CONFIG_COMPAT
521 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
522 #else
523 #define c(fld) (c.nat->fld)
524 #endif
525 flags = c(flags);
527 if ( !is_hvm_vcpu(v) )
528 {
529 if ( !compat )
530 {
531 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
532 fixup_guest_stack_selector(d, c.nat->kernel_ss);
533 fixup_guest_code_selector(d, c.nat->user_regs.cs);
534 #ifdef __i386__
535 fixup_guest_code_selector(d, c.nat->event_callback_cs);
536 fixup_guest_code_selector(d, c.nat->failsafe_callback_cs);
537 #endif
539 for ( i = 0; i < 256; i++ )
540 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
542 /* LDT safety checks. */
543 if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) ||
544 (c.nat->ldt_ents > 8192) ||
545 !array_access_ok(c.nat->ldt_base,
546 c.nat->ldt_ents,
547 LDT_ENTRY_SIZE) )
548 return -EINVAL;
549 }
550 #ifdef CONFIG_COMPAT
551 else
552 {
553 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
554 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
555 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
556 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
557 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
559 for ( i = 0; i < 256; i++ )
560 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
562 /* LDT safety checks. */
563 if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) ||
564 (c.cmp->ldt_ents > 8192) ||
565 !compat_array_access_ok(c.cmp->ldt_base,
566 c.cmp->ldt_ents,
567 LDT_ENTRY_SIZE) )
568 return -EINVAL;
569 }
570 #endif
571 }
573 v->fpu_initialised = !!(flags & VGCF_I387_VALID);
575 v->arch.flags &= ~TF_kernel_mode;
576 if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
577 v->arch.flags |= TF_kernel_mode;
579 if ( !compat )
580 memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat));
581 #ifdef CONFIG_COMPAT
582 else
583 XLAT_vcpu_guest_context(&v->arch.guest_context, c.cmp);
584 #endif
586 v->arch.guest_context.user_regs.eflags |= 2;
588 if ( is_hvm_vcpu(v) )
589 goto out;
591 /* Only CR0.TS is modifiable by guest or admin. */
592 v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
593 v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
595 init_int80_direct_trap(v);
597 /* IOPL privileges are virtualised. */
598 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
599 v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
601 /* Ensure real hardware interrupts are enabled. */
602 v->arch.guest_context.user_regs.eflags |= EF_IE;
604 cr4 = v->arch.guest_context.ctrlreg[4];
605 v->arch.guest_context.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(cr4) :
606 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
608 memset(v->arch.guest_context.debugreg, 0,
609 sizeof(v->arch.guest_context.debugreg));
610 for ( i = 0; i < 8; i++ )
611 (void)set_debugreg(v, i, c(debugreg[i]));
613 if ( v->is_initialised )
614 goto out;
616 if ( v->vcpu_id == 0 )
617 d->vm_assist = c(vm_assist);
619 if ( !compat )
620 rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
621 #ifdef CONFIG_COMPAT
622 else
623 {
624 unsigned long gdt_frames[ARRAY_SIZE(c.cmp->gdt_frames)];
625 unsigned int i, n = (c.cmp->gdt_ents + 511) / 512;
627 if ( n > ARRAY_SIZE(c.cmp->gdt_frames) )
628 return -EINVAL;
629 for ( i = 0; i < n; ++i )
630 gdt_frames[i] = c.cmp->gdt_frames[i];
631 rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents);
632 }
633 #endif
634 if ( rc != 0 )
635 return rc;
637 if ( !compat )
638 {
639 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[3]));
641 if ( !mfn_valid(cr3_pfn) ||
642 (paging_mode_refcounts(d)
643 ? !get_page(mfn_to_page(cr3_pfn), d)
644 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
645 PGT_base_page_table)) )
646 {
647 destroy_gdt(v);
648 return -EINVAL;
649 }
651 v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
653 #ifdef __x86_64__
654 if ( c.nat->ctrlreg[1] )
655 {
656 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[1]));
658 if ( !mfn_valid(cr3_pfn) ||
659 (paging_mode_refcounts(d)
660 ? !get_page(mfn_to_page(cr3_pfn), d)
661 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
662 PGT_base_page_table)) )
663 {
664 cr3_pfn = pagetable_get_pfn(v->arch.guest_table);
665 v->arch.guest_table = pagetable_null();
666 if ( paging_mode_refcounts(d) )
667 put_page(mfn_to_page(cr3_pfn));
668 else
669 put_page_and_type(mfn_to_page(cr3_pfn));
670 destroy_gdt(v);
671 return -EINVAL;
672 }
674 v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn);
675 }
676 #endif
677 }
678 #ifdef CONFIG_COMPAT
679 else
680 {
681 l4_pgentry_t *l4tab;
683 cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
685 if ( !mfn_valid(cr3_pfn) ||
686 (paging_mode_refcounts(d)
687 ? !get_page(mfn_to_page(cr3_pfn), d)
688 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
689 PGT_l3_page_table)) )
690 {
691 destroy_gdt(v);
692 return -EINVAL;
693 }
695 l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
696 *l4tab = l4e_from_pfn(
697 cr3_pfn, _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
698 }
699 #endif
701 if ( v->vcpu_id == 0 )
702 update_domain_wallclock_time(d);
704 /* Don't redo final setup */
705 v->is_initialised = 1;
707 if ( paging_mode_enabled(d) )
708 paging_update_paging_modes(v);
710 update_cr3(v);
712 out:
713 if ( flags & VGCF_online )
714 clear_bit(_VPF_down, &v->pause_flags);
715 else
716 set_bit(_VPF_down, &v->pause_flags);
717 return 0;
718 #undef c
719 }
721 void arch_vcpu_reset(struct vcpu *v)
722 {
723 if ( !is_hvm_vcpu(v) )
724 {
725 destroy_gdt(v);
726 vcpu_destroy_pagetables(v);
727 }
728 else
729 {
730 vcpu_end_shutdown_deferral(v);
731 }
732 }
734 /*
735 * Unmap the vcpu info page if the guest decided to place it somewhere
736 * else. This is only used from arch_domain_destroy, so there's no
737 * need to do anything clever.
738 */
739 static void
740 unmap_vcpu_info(struct vcpu *v)
741 {
742 struct domain *d = v->domain;
743 unsigned long mfn;
745 if ( v->arch.vcpu_info_mfn == INVALID_MFN )
746 return;
748 mfn = v->arch.vcpu_info_mfn;
749 unmap_domain_page_global(v->vcpu_info);
751 v->vcpu_info = (void *)&shared_info(d, vcpu_info[v->vcpu_id]);
752 v->arch.vcpu_info_mfn = INVALID_MFN;
754 put_page_and_type(mfn_to_page(mfn));
755 }
757 /*
758 * Map a guest page in and point the vcpu_info pointer at it. This
759 * makes sure that the vcpu_info is always pointing at a valid piece
760 * of memory, and it sets a pending event to make sure that a pending
761 * event doesn't get missed.
762 */
763 static int
764 map_vcpu_info(struct vcpu *v, unsigned long mfn, unsigned offset)
765 {
766 struct domain *d = v->domain;
767 void *mapping;
768 vcpu_info_t *new_info;
769 int i;
771 if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) )
772 return -EINVAL;
774 if ( v->arch.vcpu_info_mfn != INVALID_MFN )
775 return -EINVAL;
777 /* Run this command on yourself or on other offline VCPUS. */
778 if ( (v != current) && !test_bit(_VPF_down, &v->pause_flags) )
779 return -EINVAL;
781 mfn = gmfn_to_mfn(d, mfn);
782 if ( !mfn_valid(mfn) ||
783 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
784 return -EINVAL;
786 mapping = map_domain_page_global(mfn);
787 if ( mapping == NULL )
788 {
789 put_page_and_type(mfn_to_page(mfn));
790 return -ENOMEM;
791 }
793 new_info = (vcpu_info_t *)(mapping + offset);
795 memcpy(new_info, v->vcpu_info, sizeof(*new_info));
797 v->vcpu_info = new_info;
798 v->arch.vcpu_info_mfn = mfn;
800 /* Set new vcpu_info pointer /before/ setting pending flags. */
801 wmb();
803 /*
804 * Mark everything as being pending just to make sure nothing gets
805 * lost. The domain will get a spurious event, but it can cope.
806 */
807 vcpu_info(v, evtchn_upcall_pending) = 1;
808 for ( i = 0; i < BITS_PER_GUEST_LONG(d); i++ )
809 set_bit(i, &vcpu_info(v, evtchn_pending_sel));
811 /*
812 * Only bother to update time for the current vcpu. If we're
813 * operating on another vcpu, then it had better not be running at
814 * the time.
815 */
816 if ( v == current )
817 update_vcpu_system_time(v);
819 return 0;
820 }
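/*
 * map_vcpu_info() is invoked from VCPUOP_register_vcpu_info in
 * arch_do_vcpu_op() below, with the domain lock held across the call.
 */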
822 long
823 arch_do_vcpu_op(
824 int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
825 {
826 long rc = 0;
828 switch ( cmd )
829 {
830 case VCPUOP_register_runstate_memory_area:
831 {
832 struct vcpu_register_runstate_memory_area area;
833 struct vcpu_runstate_info runstate;
835 rc = -EFAULT;
836 if ( copy_from_guest(&area, arg, 1) )
837 break;
839 if ( !guest_handle_okay(area.addr.h, 1) )
840 break;
842 rc = 0;
843 runstate_guest(v) = area.addr.h;
845 if ( v == current )
846 {
847 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
848 }
849 else
850 {
851 vcpu_runstate_get(v, &runstate);
852 __copy_to_guest(runstate_guest(v), &runstate, 1);
853 }
855 break;
856 }
858 case VCPUOP_register_vcpu_info:
859 {
860 struct domain *d = v->domain;
861 struct vcpu_register_vcpu_info info;
863 rc = -EFAULT;
864 if ( copy_from_guest(&info, arg, 1) )
865 break;
867 domain_lock(d);
868 rc = map_vcpu_info(v, info.mfn, info.offset);
869 domain_unlock(d);
871 break;
872 }
874 case VCPUOP_get_physid:
875 {
876 struct vcpu_get_physid cpu_id;
878 rc = -EINVAL;
879 if ( !v->domain->is_pinned )
880 break;
882 cpu_id.phys_id =
883 (uint64_t)x86_cpu_to_apicid[v->vcpu_id] |
884 ((uint64_t)acpi_get_processor_id(v->vcpu_id) << 32);
886 rc = -EFAULT;
887 if ( copy_to_guest(arg, &cpu_id, 1) )
888 break;
890 rc = 0;
891 break;
892 }
894 default:
895 rc = -ENOSYS;
896 break;
897 }
899 return rc;
900 }
902 #ifdef __x86_64__
904 #define loadsegment(seg,value) ({ \
905 int __r = 1; \
906 asm volatile ( \
907 "1: movl %k1,%%" #seg "\n2:\n" \
908 ".section .fixup,\"ax\"\n" \
909 "3: xorl %k0,%k0\n" \
910 " movl %k0,%%" #seg "\n" \
911 " jmp 2b\n" \
912 ".previous\n" \
913 ".section __ex_table,\"a\"\n" \
914 " .align 8\n" \
915 " .quad 1b,3b\n" \
916 ".previous" \
917 : "=r" (__r) : "r" (value), "0" (__r) );\
918 __r; })
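/*
 * loadsegment() evaluates to 1 if the selector was loaded successfully.
 * If the load faults, the .fixup handler above loads a null selector into
 * the register instead and the macro evaluates to 0, which callers below
 * fold into 'all_segs_okay'.
 */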
920 /*
921 * save_segments() writes a mask of segments which are dirty (non-zero),
922 * allowing load_segments() to avoid some expensive segment loads and
923 * MSR writes.
924 */
925 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
926 #define DIRTY_DS 0x01
927 #define DIRTY_ES 0x02
928 #define DIRTY_FS 0x04
929 #define DIRTY_GS 0x08
930 #define DIRTY_FS_BASE 0x10
931 #define DIRTY_GS_BASE_USER 0x20
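/*
 * Illustrative case: if the outgoing vcpu ran with a null %ds and the
 * incoming context also has a null %ds, neither DIRTY_DS nor a non-zero
 * selector forces a reload, so load_segments() skips that segment load
 * entirely.  The *_BASE flags similarly force a reload when the outgoing
 * context had a non-zero FS/GS base that must be cleared.
 */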
933 static void load_segments(struct vcpu *n)
934 {
935 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
936 int all_segs_okay = 1;
937 unsigned int dirty_segment_mask, cpu = smp_processor_id();
939 /* Load and clear the dirty segment mask. */
940 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
941 per_cpu(dirty_segment_mask, cpu) = 0;
943 /* Either selector != 0 ==> reload. */
944 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
945 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
947 /* Either selector != 0 ==> reload. */
948 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
949 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
951 /*
952 * Either selector != 0 ==> reload.
953 * Also reload to reset FS_BASE if it was non-zero.
954 */
955 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
956 nctxt->user_regs.fs) )
957 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
959 /*
960 * Either selector != 0 ==> reload.
961 * Also reload to reset GS_BASE if it was non-zero.
962 */
963 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
964 nctxt->user_regs.gs) )
965 {
966 /* Reset GS_BASE with user %gs? */
967 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
968 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
969 }
971 if ( !is_pv_32on64_domain(n->domain) )
972 {
973 /* This can only be non-zero if selector is NULL. */
974 if ( nctxt->fs_base )
975 wrmsr(MSR_FS_BASE,
976 nctxt->fs_base,
977 nctxt->fs_base>>32);
979 /* Most kernels have non-zero GS base, so don't bother testing. */
980 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
981 wrmsr(MSR_SHADOW_GS_BASE,
982 nctxt->gs_base_kernel,
983 nctxt->gs_base_kernel>>32);
985 /* This can only be non-zero if selector is NULL. */
986 if ( nctxt->gs_base_user )
987 wrmsr(MSR_GS_BASE,
988 nctxt->gs_base_user,
989 nctxt->gs_base_user>>32);
991 /* If in kernel mode then switch the GS bases around. */
992 if ( (n->arch.flags & TF_kernel_mode) )
993 asm volatile ( "swapgs" );
994 }
996 if ( unlikely(!all_segs_okay) )
997 {
998 struct cpu_user_regs *regs = guest_cpu_user_regs();
999 unsigned long *rsp =
1000 (n->arch.flags & TF_kernel_mode) ?
1001 (unsigned long *)regs->rsp :
1002 (unsigned long *)nctxt->kernel_sp;
1003 unsigned long cs_and_mask, rflags;
1005 if ( is_pv_32on64_domain(n->domain) )
1006 {
1007 unsigned int *esp = ring_1(regs) ?
1008 (unsigned int *)regs->rsp :
1009 (unsigned int *)nctxt->kernel_sp;
1010 unsigned int cs_and_mask, eflags;
1011 int ret = 0;
1013 /* CS longword also contains full evtchn_upcall_mask. */
1014 cs_and_mask = (unsigned short)regs->cs |
1015 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
1016 /* Fold upcall mask into RFLAGS.IF. */
1017 eflags = regs->_eflags & ~X86_EFLAGS_IF;
1018 eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1020 if ( !ring_1(regs) )
1021 {
1022 ret = put_user(regs->ss, esp-1);
1023 ret |= put_user(regs->_esp, esp-2);
1024 esp -= 2;
1025 }
1027 if ( ret |
1028 put_user(eflags, esp-1) |
1029 put_user(cs_and_mask, esp-2) |
1030 put_user(regs->_eip, esp-3) |
1031 put_user(nctxt->user_regs.gs, esp-4) |
1032 put_user(nctxt->user_regs.fs, esp-5) |
1033 put_user(nctxt->user_regs.es, esp-6) |
1034 put_user(nctxt->user_regs.ds, esp-7) )
1035 {
1036 gdprintk(XENLOG_ERR, "Error while creating compat "
1037 "failsafe callback frame.\n");
1038 domain_crash(n->domain);
1039 }
1041 if ( test_bit(_VGCF_failsafe_disables_events,
1042 &n->arch.guest_context.flags) )
1043 vcpu_info(n, evtchn_upcall_mask) = 1;
1045 regs->entry_vector = TRAP_syscall;
1046 regs->_eflags &= 0xFFFCBEFFUL;
1047 regs->ss = FLAT_COMPAT_KERNEL_SS;
1048 regs->_esp = (unsigned long)(esp-7);
1049 regs->cs = FLAT_COMPAT_KERNEL_CS;
1050 regs->_eip = nctxt->failsafe_callback_eip;
1051 return;
1052 }
1054 if ( !(n->arch.flags & TF_kernel_mode) )
1055 toggle_guest_mode(n);
1056 else
1057 regs->cs &= ~3;
1059 /* CS longword also contains full evtchn_upcall_mask. */
1060 cs_and_mask = (unsigned long)regs->cs |
1061 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1063 /* Fold upcall mask into RFLAGS.IF. */
1064 rflags = regs->rflags & ~X86_EFLAGS_IF;
1065 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1067 if ( put_user(regs->ss, rsp- 1) |
1068 put_user(regs->rsp, rsp- 2) |
1069 put_user(rflags, rsp- 3) |
1070 put_user(cs_and_mask, rsp- 4) |
1071 put_user(regs->rip, rsp- 5) |
1072 put_user(nctxt->user_regs.gs, rsp- 6) |
1073 put_user(nctxt->user_regs.fs, rsp- 7) |
1074 put_user(nctxt->user_regs.es, rsp- 8) |
1075 put_user(nctxt->user_regs.ds, rsp- 9) |
1076 put_user(regs->r11, rsp-10) |
1077 put_user(regs->rcx, rsp-11) )
1078 {
1079 gdprintk(XENLOG_ERR, "Error while creating failsafe "
1080 "callback frame.\n");
1081 domain_crash(n->domain);
1082 }
1084 if ( test_bit(_VGCF_failsafe_disables_events,
1085 &n->arch.guest_context.flags) )
1086 vcpu_info(n, evtchn_upcall_mask) = 1;
1088 regs->entry_vector = TRAP_syscall;
1089 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1090 X86_EFLAGS_NT|X86_EFLAGS_TF);
1091 regs->ss = FLAT_KERNEL_SS;
1092 regs->rsp = (unsigned long)(rsp-11);
1093 regs->cs = FLAT_KERNEL_CS;
1094 regs->rip = nctxt->failsafe_callback_eip;
1095 }
1096 }
1098 static void save_segments(struct vcpu *v)
1099 {
1100 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
1101 struct cpu_user_regs *regs = &ctxt->user_regs;
1102 unsigned int dirty_segment_mask = 0;
1104 regs->ds = read_segment_register(ds);
1105 regs->es = read_segment_register(es);
1106 regs->fs = read_segment_register(fs);
1107 regs->gs = read_segment_register(gs);
1109 if ( regs->ds )
1110 dirty_segment_mask |= DIRTY_DS;
1112 if ( regs->es )
1113 dirty_segment_mask |= DIRTY_ES;
1115 if ( regs->fs || is_pv_32on64_domain(v->domain) )
1116 {
1117 dirty_segment_mask |= DIRTY_FS;
1118 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
1119 }
1120 else if ( ctxt->fs_base )
1121 {
1122 dirty_segment_mask |= DIRTY_FS_BASE;
1123 }
1125 if ( regs->gs || is_pv_32on64_domain(v->domain) )
1126 {
1127 dirty_segment_mask |= DIRTY_GS;
1128 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
1129 }
1130 else if ( ctxt->gs_base_user )
1131 {
1132 dirty_segment_mask |= DIRTY_GS_BASE_USER;
1133 }
1135 this_cpu(dirty_segment_mask) = dirty_segment_mask;
1136 }
1138 #define switch_kernel_stack(v) ((void)0)
1140 #elif defined(__i386__)
1142 #define load_segments(n) ((void)0)
1143 #define save_segments(p) ((void)0)
1145 static inline void switch_kernel_stack(struct vcpu *v)
1146 {
1147 struct tss_struct *tss = &init_tss[smp_processor_id()];
1148 tss->esp1 = v->arch.guest_context.kernel_sp;
1149 tss->ss1 = v->arch.guest_context.kernel_ss;
1150 }
1152 #endif /* __i386__ */
1154 static void paravirt_ctxt_switch_from(struct vcpu *v)
1155 {
1156 save_segments(v);
1158 /*
1159 * Disable debug breakpoints. We do this aggressively because if we switch
1160 * to an HVM guest we may load DR0-DR3 with values that can cause #DE
1161 * inside Xen, before we get a chance to reload DR7, and this cannot always
1162 * safely be handled.
1163 */
1164 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1165 write_debugreg(7, 0);
1166 }
1168 static void paravirt_ctxt_switch_to(struct vcpu *v)
1169 {
1170 unsigned long cr4;
1172 set_int80_direct_trap(v);
1173 switch_kernel_stack(v);
1175 cr4 = pv_guest_cr4_to_real_cr4(v->arch.guest_context.ctrlreg[4]);
1176 if ( unlikely(cr4 != read_cr4()) )
1177 write_cr4(cr4);
1179 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1180 {
1181 write_debugreg(0, v->arch.guest_context.debugreg[0]);
1182 write_debugreg(1, v->arch.guest_context.debugreg[1]);
1183 write_debugreg(2, v->arch.guest_context.debugreg[2]);
1184 write_debugreg(3, v->arch.guest_context.debugreg[3]);
1185 write_debugreg(6, v->arch.guest_context.debugreg[6]);
1186 write_debugreg(7, v->arch.guest_context.debugreg[7]);
1187 }
1188 }
1190 static void __context_switch(void)
1191 {
1192 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1193 unsigned int cpu = smp_processor_id();
1194 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1195 struct vcpu *n = current;
1197 ASSERT(p != n);
1198 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
1200 if ( !is_idle_vcpu(p) )
1201 {
1202 memcpy(&p->arch.guest_context.user_regs,
1203 stack_regs,
1204 CTXT_SWITCH_STACK_BYTES);
1205 unlazy_fpu(p);
1206 p->arch.ctxt_switch_from(p);
1207 }
1209 if ( !is_idle_vcpu(n) )
1210 {
1211 memcpy(stack_regs,
1212 &n->arch.guest_context.user_regs,
1213 CTXT_SWITCH_STACK_BYTES);
1214 n->arch.ctxt_switch_to(n);
1215 }
1217 if ( p->domain != n->domain )
1218 cpu_set(cpu, n->domain->domain_dirty_cpumask);
1219 cpu_set(cpu, n->vcpu_dirty_cpumask);
1221 write_ptbase(n);
1223 if ( p->vcpu_id != n->vcpu_id )
1224 {
1225 char gdt_load[10];
1226 *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
1227 *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
1228 asm volatile ( "lgdt %0" : "=m" (gdt_load) );
1229 }
1231 if ( p->domain != n->domain )
1232 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
1233 cpu_clear(cpu, p->vcpu_dirty_cpumask);
1235 per_cpu(curr_vcpu, cpu) = n;
1236 }
1239 void context_switch(struct vcpu *prev, struct vcpu *next)
1240 {
1241 unsigned int cpu = smp_processor_id();
1242 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
1244 ASSERT(local_irq_is_enabled());
1246 /* Allow at most one CPU at a time to be dirty. */
1247 ASSERT(cpus_weight(dirty_mask) <= 1);
1248 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
1249 {
1250 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1251 if ( !cpus_empty(next->vcpu_dirty_cpumask) )
1252 flush_tlb_mask(next->vcpu_dirty_cpumask);
1253 }
1255 local_irq_disable();
1257 if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1258 pt_save_timer(prev);
1260 set_current(next);
1262 if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
1263 {
1264 local_irq_enable();
1265 }
1266 else
1267 {
1268 __context_switch();
1270 #ifdef CONFIG_COMPAT
1271 if ( !is_hvm_vcpu(next) &&
1272 (is_idle_vcpu(prev) ||
1273 is_hvm_vcpu(prev) ||
1274 is_pv_32on64_vcpu(prev) != is_pv_32on64_vcpu(next)) )
1275 {
1276 uint64_t efer = read_efer();
1277 if ( !(efer & EFER_SCE) )
1278 write_efer(efer | EFER_SCE);
1279 flush_tlb_one_local(GDT_VIRT_START(next) +
1280 FIRST_RESERVED_GDT_BYTE);
1281 }
1282 #endif
1284 /* Re-enable interrupts before restoring state which may fault. */
1285 local_irq_enable();
1287 if ( !is_hvm_vcpu(next) )
1288 {
1289 load_LDT(next);
1290 load_segments(next);
1291 }
1292 }
1294 context_saved(prev);
1296 /* Update per-VCPU guest runstate shared memory area (if registered). */
1297 if ( !guest_handle_is_null(runstate_guest(next)) )
1298 {
1299 if ( !is_pv_32on64_domain(next->domain) )
1300 __copy_to_guest(runstate_guest(next), &next->runstate, 1);
1301 #ifdef CONFIG_COMPAT
1302 else
1303 {
1304 struct compat_vcpu_runstate_info info;
1306 XLAT_vcpu_runstate_info(&info, &next->runstate);
1307 __copy_to_guest(next->runstate_guest.compat, &info, 1);
1308 }
1309 #endif
1310 }
1312 schedule_tail(next);
1313 BUG();
1314 }
1316 void continue_running(struct vcpu *same)
1317 {
1318 schedule_tail(same);
1319 BUG();
1320 }
1322 int __sync_lazy_execstate(void)
1323 {
1324 unsigned long flags;
1325 int switch_required;
1327 local_irq_save(flags);
1329 switch_required = (this_cpu(curr_vcpu) != current);
1331 if ( switch_required )
1332 {
1333 ASSERT(current == idle_vcpu[smp_processor_id()]);
1334 __context_switch();
1335 }
1337 local_irq_restore(flags);
1339 return switch_required;
1340 }
1342 void sync_vcpu_execstate(struct vcpu *v)
1343 {
1344 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
1345 (void)__sync_lazy_execstate();
1347 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1348 flush_tlb_mask(v->vcpu_dirty_cpumask);
1349 }
1351 struct migrate_info {
1352 long (*func)(void *data);
1353 void *data;
1354 void (*saved_schedule_tail)(struct vcpu *);
1355 cpumask_t saved_affinity;
1356 };
1358 static void continue_hypercall_on_cpu_helper(struct vcpu *v)
1359 {
1360 struct cpu_user_regs *regs = guest_cpu_user_regs();
1361 struct migrate_info *info = v->arch.continue_info;
1362 cpumask_t mask = info->saved_affinity;
1364 regs->eax = info->func(info->data);
1366 v->arch.schedule_tail = info->saved_schedule_tail;
1367 v->arch.continue_info = NULL;
1369 xfree(info);
1371 vcpu_unlock_affinity(v, &mask);
1372 schedule_tail(v);
1373 }
1375 int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
1376 {
1377 struct vcpu *v = current;
1378 struct migrate_info *info;
1379 int rc;
1381 if ( cpu == smp_processor_id() )
1382 return func(data);
1384 info = xmalloc(struct migrate_info);
1385 if ( info == NULL )
1386 return -ENOMEM;
1388 info->func = func;
1389 info->data = data;
1390 info->saved_schedule_tail = v->arch.schedule_tail;
1391 info->saved_affinity = cpumask_of_cpu(cpu);
1393 v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
1394 v->arch.continue_info = info;
1396 rc = vcpu_lock_affinity(v, &info->saved_affinity);
1397 if ( rc )
1398 {
1399 v->arch.schedule_tail = info->saved_schedule_tail;
1400 v->arch.continue_info = NULL;
1401 xfree(info);
1402 return rc;
1403 }
1405 /* Dummy return value will be overwritten by new schedule_tail. */
1406 BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
1407 return 0;
1408 }
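/*
 * Flow, as implemented above: vcpu_lock_affinity() moves the current vcpu
 * onto the target cpu; the next schedule_tail there runs
 * continue_hypercall_on_cpu_helper(), which calls func(data), places the
 * result in the guest's eax, restores the saved schedule_tail, unlocks the
 * affinity and resumes the guest via schedule_tail().
 */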
1410 #define next_arg(fmt, args) ({ \
1411 unsigned long __arg; \
1412 switch ( *(fmt)++ ) \
1413 { \
1414 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
1415 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
1416 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
1417 default: __arg = 0; BUG(); \
1418 } \
1419 __arg; \
1420 })
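/*
 * next_arg() format characters: 'i' = unsigned int, 'l' = unsigned long,
 * 'h' = guest handle (passed as a pointer).  A hypothetical caller needing
 * to restart with a long and a handle would therefore use something like:
 *
 *     return hypercall_create_continuation(op, "lh", val, handle);
 *
 * (illustrative only; 'op', 'val' and 'handle' are placeholders.)
 */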
1422 DEFINE_PER_CPU(char, hc_preempted);
1424 unsigned long hypercall_create_continuation(
1425 unsigned int op, const char *format, ...)
1426 {
1427 struct mc_state *mcs = &this_cpu(mc_state);
1428 struct cpu_user_regs *regs;
1429 const char *p = format;
1430 unsigned long arg;
1431 unsigned int i;
1432 va_list args;
1434 va_start(args, format);
1436 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1437 {
1438 __set_bit(_MCSF_call_preempted, &mcs->flags);
1440 for ( i = 0; *p != '\0'; i++ )
1441 mcs->call.args[i] = next_arg(p, args);
1442 if ( is_pv_32on64_domain(current->domain) )
1443 {
1444 for ( ; i < 6; i++ )
1445 mcs->call.args[i] = 0;
1446 }
1447 }
1448 else
1449 {
1450 regs = guest_cpu_user_regs();
1451 regs->eax = op;
1452 /*
1453 * For PV guest, we update EIP to re-execute 'syscall' / 'int 0x82';
1454 * HVM does not need this since 'vmcall' / 'vmmcall' is fault-like.
1455 */
1456 if ( !is_hvm_vcpu(current) )
1457 regs->eip -= 2; /* re-execute 'syscall' / 'int 0x82' */
1459 #ifdef __x86_64__
1460 if ( !is_hvm_vcpu(current) ?
1461 !is_pv_32on64_vcpu(current) :
1462 (hvm_guest_x86_mode(current) == 8) )
1463 {
1464 for ( i = 0; *p != '\0'; i++ )
1465 {
1466 arg = next_arg(p, args);
1467 switch ( i )
1468 {
1469 case 0: regs->rdi = arg; break;
1470 case 1: regs->rsi = arg; break;
1471 case 2: regs->rdx = arg; break;
1472 case 3: regs->r10 = arg; break;
1473 case 4: regs->r8 = arg; break;
1474 case 5: regs->r9 = arg; break;
1475 }
1476 }
1477 }
1478 else
1479 #endif
1480 {
1481 if ( supervisor_mode_kernel )
1482 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
1484 for ( i = 0; *p != '\0'; i++ )
1485 {
1486 arg = next_arg(p, args);
1487 switch ( i )
1488 {
1489 case 0: regs->ebx = arg; break;
1490 case 1: regs->ecx = arg; break;
1491 case 2: regs->edx = arg; break;
1492 case 3: regs->esi = arg; break;
1493 case 4: regs->edi = arg; break;
1494 case 5: regs->ebp = arg; break;
1495 }
1496 }
1497 }
1499 this_cpu(hc_preempted) = 1;
1500 }
1502 va_end(args);
1504 return op;
1505 }
1507 #ifdef CONFIG_COMPAT
1508 int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
1509 {
1510 int rc = 0;
1511 struct mc_state *mcs = &this_cpu(mc_state);
1512 struct cpu_user_regs *regs;
1513 unsigned int i, cval = 0;
1514 unsigned long nval = 0;
1515 va_list args;
1517 BUG_ON(*id > 5);
1518 BUG_ON(mask & (1U << *id));
1520 va_start(args, mask);
1522 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1523 {
1524 if ( !test_bit(_MCSF_call_preempted, &mcs->flags) )
1525 return 0;
1526 for ( i = 0; i < 6; ++i, mask >>= 1 )
1527 {
1528 if ( mask & 1 )
1529 {
1530 nval = va_arg(args, unsigned long);
1531 cval = va_arg(args, unsigned int);
1532 if ( cval == nval )
1533 mask &= ~1U;
1534 else
1535 BUG_ON(nval == (unsigned int)nval);
1536 }
1537 else if ( id && *id == i )
1538 {
1539 *id = mcs->call.args[i];
1540 id = NULL;
1541 }
1542 if ( (mask & 1) && mcs->call.args[i] == nval )
1543 {
1544 mcs->call.args[i] = cval;
1545 ++rc;
1546 }
1547 else
1548 BUG_ON(mcs->call.args[i] != (unsigned int)mcs->call.args[i]);
1549 }
1550 }
1551 else
1552 {
1553 regs = guest_cpu_user_regs();
1554 for ( i = 0; i < 6; ++i, mask >>= 1 )
1555 {
1556 unsigned long *reg;
1558 switch ( i )
1559 {
1560 case 0: reg = &regs->ebx; break;
1561 case 1: reg = &regs->ecx; break;
1562 case 2: reg = &regs->edx; break;
1563 case 3: reg = &regs->esi; break;
1564 case 4: reg = &regs->edi; break;
1565 case 5: reg = &regs->ebp; break;
1566 default: BUG(); reg = NULL; break;
1567 }
1568 if ( (mask & 1) )
1569 {
1570 nval = va_arg(args, unsigned long);
1571 cval = va_arg(args, unsigned int);
1572 if ( cval == nval )
1573 mask &= ~1U;
1574 else
1575 BUG_ON(nval == (unsigned int)nval);
1576 }
1577 else if ( id && *id == i )
1578 {
1579 *id = *reg;
1580 id = NULL;
1581 }
1582 if ( (mask & 1) && *reg == nval )
1583 {
1584 *reg = cval;
1585 ++rc;
1586 }
1587 else
1588 BUG_ON(*reg != (unsigned int)*reg);
1589 }
1590 }
1592 va_end(args);
1594 return rc;
1595 }
1596 #endif
1598 static int relinquish_memory(
1599 struct domain *d, struct list_head *list, unsigned long type)
1600 {
1601 struct list_head *ent;
1602 struct page_info *page;
1603 unsigned long x, y;
1604 int ret = 0;
1606 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1607 spin_lock_recursive(&d->page_alloc_lock);
1609 ent = list->next;
1610 while ( ent != list )
1611 {
1612 page = list_entry(ent, struct page_info, list);
1614 /* Grab a reference to the page so it won't disappear from under us. */
1615 if ( unlikely(!get_page(page, d)) )
1616 {
1617 /* Couldn't get a reference -- someone is freeing this page. */
1618 ent = ent->next;
1619 list_move_tail(&page->list, &d->arch.relmem_list);
1620 continue;
1621 }
1623 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1624 put_page_and_type(page);
1626 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1627 put_page(page);
1629 #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
1630 /*
1631 * Forcibly drop reference counts of page tables above top most (which
1632 * were skipped to prevent long latencies due to deep recursion - see
1633 * the special treatment in free_lX_table()).
1634 */
1635 y = page->u.inuse.type_info;
1636 if ( (type < PGT_root_page_table) &&
1637 unlikely(((y + PGT_type_mask) &
1638 (PGT_type_mask|PGT_validated)) == type) )
1639 {
1640 BUG_ON((y & PGT_count_mask) >=
1641 (page->count_info & PGC_count_mask));
1642 while ( y & PGT_count_mask )
1643 {
1644 put_page_and_type(page);
1645 y = page->u.inuse.type_info;
1646 }
1647 }
1648 #endif
1650 /*
1651 * Forcibly invalidate top-most, still valid page tables at this point
1652 * to break circular 'linear page table' references. This is okay
1653 * because MMU structures are not shared across domains and this domain
1654 * is now dead. Thus top-most valid tables are not in use so a non-zero
1655 * count means circular reference.
1656 */
1657 y = page->u.inuse.type_info;
1658 for ( ; ; )
1659 {
1660 x = y;
1661 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
1662 (type|PGT_validated)) )
1663 break;
1665 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
1666 if ( likely(y == x) )
1667 {
1668 free_page_type(page, type);
1669 break;
1670 }
1671 }
1673 /* Follow the list chain and /then/ potentially free the page. */
1674 ent = ent->next;
1675 list_move_tail(&page->list, &d->arch.relmem_list);
1676 put_page(page);
1678 if ( hypercall_preempt_check() )
1679 {
1680 ret = -EAGAIN;
1681 goto out;
1682 }
1683 }
1685 list_splice_init(&d->arch.relmem_list, list);
1687 out:
1688 spin_unlock_recursive(&d->page_alloc_lock);
1689 return ret;
1690 }
1692 static void vcpu_destroy_pagetables(struct vcpu *v)
1693 {
1694 struct domain *d = v->domain;
1695 unsigned long pfn;
1697 #ifdef __x86_64__
1698 if ( is_pv_32on64_vcpu(v) )
1699 {
1700 pfn = l4e_get_pfn(*(l4_pgentry_t *)
1701 __va(pagetable_get_paddr(v->arch.guest_table)));
1703 if ( pfn != 0 )
1704 {
1705 if ( paging_mode_refcounts(d) )
1706 put_page(mfn_to_page(pfn));
1707 else
1708 put_page_and_type(mfn_to_page(pfn));
1709 }
1711 l4e_write(
1712 (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
1713 l4e_empty());
1715 v->arch.cr3 = 0;
1716 return;
1717 }
1718 #endif
1720 pfn = pagetable_get_pfn(v->arch.guest_table);
1721 if ( pfn != 0 )
1722 {
1723 if ( paging_mode_refcounts(d) )
1724 put_page(mfn_to_page(pfn));
1725 else
1726 put_page_and_type(mfn_to_page(pfn));
1727 v->arch.guest_table = pagetable_null();
1728 }
1730 #ifdef __x86_64__
1731 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
1732 pfn = pagetable_get_pfn(v->arch.guest_table_user);
1733 if ( pfn != 0 )
1734 {
1735 if ( !is_pv_32bit_vcpu(v) )
1736 {
1737 if ( paging_mode_refcounts(d) )
1738 put_page(mfn_to_page(pfn));
1739 else
1740 put_page_and_type(mfn_to_page(pfn));
1741 }
1742 v->arch.guest_table_user = pagetable_null();
1743 }
1744 #endif
1746 v->arch.cr3 = 0;
1747 }
1749 int domain_relinquish_resources(struct domain *d)
1750 {
1751 int ret;
1752 struct vcpu *v;
1754 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
1756 switch ( d->arch.relmem )
1757 {
1758 case RELMEM_not_started:
1759 /* Tear down paging-assistance stuff. */
1760 paging_teardown(d);
1762 for_each_vcpu ( d, v )
1763 {
1764 /* Drop the in-use references to page-table bases. */
1765 vcpu_destroy_pagetables(v);
1767 /*
1768 * Relinquish GDT mappings. No need for explicit unmapping of the
1769 * LDT as it automatically gets squashed with the guest mappings.
1770 */
1771 destroy_gdt(v);
1773 unmap_vcpu_info(v);
1774 }
1776 d->arch.relmem = RELMEM_xen;
1777 /* fallthrough */
1779 /* Relinquish every page of memory. */
1780 case RELMEM_xen:
1781 ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
1782 if ( ret )
1783 return ret;
1784 #if CONFIG_PAGING_LEVELS >= 4
1785 d->arch.relmem = RELMEM_l4;
1786 /* fallthrough */
1788 case RELMEM_l4:
1789 ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1790 if ( ret )
1791 return ret;
1792 #endif
1793 #if CONFIG_PAGING_LEVELS >= 3
1794 d->arch.relmem = RELMEM_l3;
1795 /* fallthrough */
1797 case RELMEM_l3:
1798 ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
1799 if ( ret )
1800 return ret;
1801 #endif
1802 d->arch.relmem = RELMEM_l2;
1803 /* fallthrough */
1805 case RELMEM_l2:
1806 ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
1807 if ( ret )
1808 return ret;
1809 d->arch.relmem = RELMEM_done;
1810 /* fallthrough */
1812 case RELMEM_done:
1813 #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
1814 ret = relinquish_memory(d, &d->page_list, PGT_l1_page_table);
1815 if ( ret )
1816 return ret;
1817 #endif
1818 WARN_ON(d->xenheap_pages);
1819 break;
1821 default:
1822 BUG();
1823 }
1825 /* Free page used by xen oprofile buffer. */
1826 free_xenoprof_pages(d);
1828 if ( is_hvm_domain(d) )
1829 hvm_domain_relinquish_resources(d);
1831 return 0;
1832 }
1834 void arch_dump_domain_info(struct domain *d)
1835 {
1836 paging_dump_domain_info(d);
1837 }
1839 void arch_dump_vcpu_info(struct vcpu *v)
1840 {
1841 paging_dump_vcpu_info(v);
1842 }
1844 void domain_cpuid(
1845 struct domain *d,
1846 unsigned int input,
1847 unsigned int sub_input,
1848 unsigned int *eax,
1849 unsigned int *ebx,
1850 unsigned int *ecx,
1851 unsigned int *edx)
1852 {
1853 cpuid_input_t *cpuid;
1854 int i;
1856 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
1857 {
1858 cpuid = &d->arch.cpuids[i];
1860 if ( (cpuid->input[0] == input) &&
1861 ((cpuid->input[1] == XEN_CPUID_INPUT_UNUSED) ||
1862 (cpuid->input[1] == sub_input)) )
1863 {
1864 *eax = cpuid->eax;
1865 *ebx = cpuid->ebx;
1866 *ecx = cpuid->ecx;
1867 *edx = cpuid->edx;
1868 return;
1869 }
1870 }
1872 *eax = *ebx = *ecx = *edx = 0;
1873 }
1875 /*
1876 * Local variables:
1877 * mode: C
1878 * c-set-style: "BSD"
1879 * c-basic-offset: 4
1880 * tab-width: 4
1881 * indent-tabs-mode: nil
1882 * End:
1883 */