
view xen/arch/x86/domain.c @ 4629:6375127fdf23

bitkeeper revision 1.1311.1.1 (426641eeBv97w6sl983zxeR4Dc3Utg)

Cleanup page table handling. Add macros to access page table
entries, fixup plenty of places in the code to use the page
table types instead of "unsigned long".

Signed-off-by: Gerd Knorr <kraxel@bytesex.org>
Signed-off-by: michael.fetterman@cl.cam.ac.uk
author mafetter@fleming.research
date Wed Apr 20 11:50:06 2005 +0000 (2005-04-20)
parents f6326ed2e7fe
children 1803018b3b05
line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/smp.h>
20 #include <xen/delay.h>
21 #include <xen/softirq.h>
22 #include <xen/grant_table.h>
23 #include <asm/regs.h>
24 #include <asm/mc146818rtc.h>
25 #include <asm/system.h>
26 #include <asm/io.h>
27 #include <asm/processor.h>
28 #include <asm/desc.h>
29 #include <asm/i387.h>
30 #include <asm/mpspec.h>
31 #include <asm/ldt.h>
32 #include <xen/irq.h>
33 #include <xen/event.h>
34 #include <asm/shadow.h>
35 #include <xen/console.h>
36 #include <xen/elf.h>
37 #include <asm/vmx.h>
38 #include <asm/vmx_vmcs.h>
39 #include <asm/msr.h>
40 #include <xen/kernel.h>
41 #include <public/io/ioreq.h>
42 #include <xen/multicall.h>
44 /* opt_noreboot: If true, machine will need manual reset on error. */
45 static int opt_noreboot = 0;
46 boolean_param("noreboot", opt_noreboot);
48 struct percpu_ctxt {
49 struct exec_domain *curr_ed;
50 } __cacheline_aligned;
51 static struct percpu_ctxt percpu_ctxt[NR_CPUS];
53 static void default_idle(void)
54 {
55 local_irq_disable();
56 if ( !softirq_pending(smp_processor_id()) )
57 safe_halt();
58 else
59 local_irq_enable();
60 }
62 static __attribute_used__ void idle_loop(void)
63 {
64 int cpu = smp_processor_id();
65 for ( ; ; )
66 {
67 irq_stat[cpu].idle_timestamp = jiffies;
68 while ( !softirq_pending(cpu) )
69 {
70 page_scrub_schedule_work();
71 default_idle();
72 }
73 do_softirq();
74 }
75 }
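/*
 * A minimal sketch of the check-then-halt idiom that default_idle() above
 * relies on. The work test must run with interrupts disabled: on x86 the
 * "sti; hlt" pair takes effect atomically (STI defers interrupt delivery
 * until after the following instruction), so a wakeup interrupt arriving
 * after the test still terminates the HLT rather than being lost.
 * cpu_has_work() is a hypothetical stand-in for the softirq_pending() test.
 */
extern int cpu_has_work(void);                  /* hypothetical */

static void check_then_halt_sketch(void)
{
    __asm__ __volatile__ ( "cli" );             /* local_irq_disable() */
    if ( !cpu_has_work() )
        __asm__ __volatile__ ( "sti; hlt" );    /* safe_halt() */
    else
        __asm__ __volatile__ ( "sti" );         /* local_irq_enable() */
}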
77 void startup_cpu_idle_loop(void)
78 {
79 /* Just some sanity to ensure that the scheduler is set up okay. */
80 ASSERT(current->domain->id == IDLE_DOMAIN_ID);
81 percpu_ctxt[smp_processor_id()].curr_ed = current;
82 set_bit(smp_processor_id(), &current->domain->cpuset);
83 domain_unpause_by_systemcontroller(current->domain);
84 raise_softirq(SCHEDULE_SOFTIRQ);
85 do_softirq();
87 /*
88 * Declare CPU setup done to the boot processor; the memory barrier
89 * ensures our earlier state updates are visible first.
90 */
91 smp_mb();
92 init_idle();
94 idle_loop();
95 }
97 static long no_idt[2];
98 static int reboot_mode;
100 static inline void kb_wait(void)
101 {
102 int i;
104 for ( i = 0; i < 0x10000; i++ )
105 if ( (inb_p(0x64) & 0x02) == 0 )
106 break;
107 }
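/*
 * Note on the I/O above: port 0x64 is the 8042 keyboard controller's status
 * register; bit 1 (0x02) set means its input buffer is still full, so
 * kb_wait() spins (with a bounded count) until the controller can accept
 * another command byte. machine_restart() below then writes command 0xfe to
 * port 0x64, which pulses the CPU reset line.
 */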
109 void machine_restart(char * __unused)
110 {
111 int i;
113 if ( opt_noreboot )
114 {
115 printk("Reboot disabled on cmdline: require manual reset\n");
116 for ( ; ; )
117 safe_halt();
118 }
120 local_irq_enable();
122 /* Ensure we are the boot CPU. */
123 if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
124 {
125 smp_call_function((void *)machine_restart, NULL, 1, 0);
126 for ( ; ; )
127 safe_halt();
128 }
130 /*
131 * Stop all CPUs and turn off local APICs and the IO-APIC, so
132 * other OSs see a clean IRQ state.
133 */
134 smp_send_stop();
135 disable_IO_APIC();
137 #ifdef CONFIG_VMX
138 stop_vmx();
139 #endif
141 /* Rebooting needs to touch the BIOS reboot-mode flag at physical address 0x472. */
142 *((unsigned short *)__va(0x472)) = reboot_mode;
144 for ( ; ; )
145 {
146 /* Pulse the keyboard reset line. */
147 for ( i = 0; i < 100; i++ )
148 {
149 kb_wait();
150 udelay(50);
151 outb(0xfe,0x64); /* pulse reset low */
152 udelay(50);
153 }
155 /* That didn't work - force a triple fault. */
156 __asm__ __volatile__("lidt %0": "=m" (no_idt));
157 __asm__ __volatile__("int3");
158 }
159 }
162 void __attribute__((noreturn)) __machine_halt(void *unused)
163 {
164 for ( ; ; )
165 safe_halt();
166 }
168 void machine_halt(void)
169 {
170 watchdog_on = 0;
171 smp_call_function(__machine_halt, NULL, 1, 0);
172 __machine_halt(NULL);
173 }
175 void dump_pageframe_info(struct domain *d)
176 {
177 struct pfn_info *page;
179 if ( d->tot_pages < 10 )
180 {
181 list_for_each_entry ( page, &d->page_list, list )
182 {
183 printk("Page %08x: caf=%08x, taf=%08x\n",
184 page_to_phys(page), page->count_info,
185 page->u.inuse.type_info);
186 }
187 }
189 page = virt_to_page(d->shared_info);
190 printk("Shared_info@%08x: caf=%08x, taf=%08x\n",
191 page_to_phys(page), page->count_info,
192 page->u.inuse.type_info);
193 }
195 struct exec_domain *arch_alloc_exec_domain_struct(void)
196 {
197 return xmalloc(struct exec_domain);
198 }
200 void arch_free_exec_domain_struct(struct exec_domain *ed)
201 {
202 xfree(ed);
203 }
205 void free_perdomain_pt(struct domain *d)
206 {
207 free_xenheap_page((unsigned long)d->arch.mm_perdomain_pt);
208 #ifdef __x86_64__
209 free_xenheap_page((unsigned long)d->arch.mm_perdomain_l2);
210 free_xenheap_page((unsigned long)d->arch.mm_perdomain_l3);
211 #endif
212 }
214 static void continue_idle_task(struct exec_domain *ed)
215 {
216 reset_stack_and_jump(idle_loop);
217 }
219 static void continue_nonidle_task(struct exec_domain *ed)
220 {
221 reset_stack_and_jump(ret_from_intr);
222 }
224 void arch_do_createdomain(struct exec_domain *ed)
225 {
226 struct domain *d = ed->domain;
228 SET_DEFAULT_FAST_TRAP(&ed->arch);
230 ed->arch.flags = TF_kernel_mode;
232 if ( d->id == IDLE_DOMAIN_ID )
233 {
234 ed->arch.schedule_tail = continue_idle_task;
235 }
236 else
237 {
238 ed->arch.schedule_tail = continue_nonidle_task;
240 d->shared_info = (void *)alloc_xenheap_page();
241 memset(d->shared_info, 0, PAGE_SIZE);
242 ed->vcpu_info = &d->shared_info->vcpu_data[ed->eid];
243 SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
244 machine_to_phys_mapping[virt_to_phys(d->shared_info) >>
245 PAGE_SHIFT] = INVALID_M2P_ENTRY;
247 d->arch.mm_perdomain_pt = (l1_pgentry_t *)alloc_xenheap_page();
248 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE);
249 machine_to_phys_mapping[virt_to_phys(d->arch.mm_perdomain_pt) >>
250 PAGE_SHIFT] = INVALID_M2P_ENTRY;
251 ed->arch.perdomain_ptes = d->arch.mm_perdomain_pt;
253 ed->arch.guest_vtable = __linear_l2_table;
254 ed->arch.shadow_vtable = __shadow_linear_l2_table;
256 #ifdef __x86_64__
257 ed->arch.guest_vl3table = __linear_l3_table;
258 ed->arch.guest_vl4table = __linear_l4_table;
260 d->arch.mm_perdomain_l2 = (l2_pgentry_t *)alloc_xenheap_page();
261 memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
262 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)] =
263 l2e_create_phys(__pa(d->arch.mm_perdomain_pt),
264 __PAGE_HYPERVISOR);
265 d->arch.mm_perdomain_l3 = (l3_pgentry_t *)alloc_xenheap_page();
266 memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
267 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
268 l3e_create_phys(__pa(d->arch.mm_perdomain_l2),
269 __PAGE_HYPERVISOR);
270 #endif
272 (void)ptwr_init(d);
274 shadow_lock_init(d);
275 INIT_LIST_HEAD(&d->arch.free_shadow_frames);
276 }
277 }
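/*
 * A stand-alone sketch of the typed page-table-entry idiom that this
 * changeset introduces and that arch_do_createdomain() uses via
 * l2e_create_phys()/l3e_create_phys(): entries are wrapped in small structs
 * and built through constructor macros instead of raw "unsigned long"
 * arithmetic, so the compiler catches level mix-ups. The names below
 * (l2_pgentry_sketch_t, sketch_*) are illustrative, not Xen's.
 */
typedef struct { unsigned long l2; } l2_pgentry_sketch_t;

#define SKETCH_PAGE_MASK          (~0xfffUL)
#define sketch_l2e_create_phys(pa, flags) \
    ((l2_pgentry_sketch_t){ ((pa) & SKETCH_PAGE_MASK) | (flags) })
#define sketch_l2e_get_value(e)   ((e).l2)

/* e.g. sketch_l2e_create_phys(0x123456, 0x63).l2 == 0x123063 */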
279 void arch_do_boot_vcpu(struct exec_domain *ed)
280 {
281 struct domain *d = ed->domain;
282 ed->arch.schedule_tail = d->exec_domain[0]->arch.schedule_tail;
283 ed->arch.perdomain_ptes =
284 d->arch.mm_perdomain_pt + (ed->eid << PDPT_VCPU_SHIFT);
285 ed->arch.flags = TF_kernel_mode;
286 }
288 #ifdef CONFIG_VMX
289 void arch_vmx_do_resume(struct exec_domain *ed)
290 {
291 u64 vmcs_phys_ptr = (u64) virt_to_phys(ed->arch.arch_vmx.vmcs);
293 load_vmcs(&ed->arch.arch_vmx, vmcs_phys_ptr);
294 vmx_do_resume(ed);
295 reset_stack_and_jump(vmx_asm_do_resume);
296 }
298 void arch_vmx_do_launch(struct exec_domain *ed)
299 {
300 u64 vmcs_phys_ptr = (u64) virt_to_phys(ed->arch.arch_vmx.vmcs);
302 load_vmcs(&ed->arch.arch_vmx, vmcs_phys_ptr);
303 vmx_do_launch(ed);
304 reset_stack_and_jump(vmx_asm_do_launch);
305 }
307 static int vmx_final_setup_guest(struct exec_domain *ed,
308 full_execution_context_t *full_context)
309 {
310 int error;
311 execution_context_t *context;
312 struct vmcs_struct *vmcs;
314 context = &full_context->cpu_ctxt;
316 /*
317 * Create a new VMCS
318 */
319 if (!(vmcs = alloc_vmcs())) {
320 printk("Failed to create a new VMCS\n");
321 return -ENOMEM;
322 }
324 memset(&ed->arch.arch_vmx, 0, sizeof (struct arch_vmx_struct));
326 ed->arch.arch_vmx.vmcs = vmcs;
327 error = construct_vmcs(
328 &ed->arch.arch_vmx, context, full_context, VMCS_USE_HOST_ENV);
329 if ( error < 0 )
330 {
331 printk("Failed to construct a new VMCS\n");
332 goto out;
333 }
335 ed->arch.schedule_tail = arch_vmx_do_launch;
336 clear_bit(VMX_CPU_STATE_PG_ENABLED, &ed->arch.arch_vmx.cpu_state);
338 #if defined (__i386)
339 ed->arch.arch_vmx.vmx_platform.real_mode_data =
340 (unsigned long *) context->esi;
341 #endif
343 if (ed == ed->domain->exec_domain[0]) {
344 /*
345 * Required to do this once per domain
346 * XXX todo: add a separate function to do these.
347 */
348 memset(&ed->domain->shared_info->evtchn_mask[0], 0xff,
349 sizeof(ed->domain->shared_info->evtchn_mask));
350 clear_bit(IOPACKET_PORT, &ed->domain->shared_info->evtchn_mask[0]);
352 /* Put the domain in shadow mode even though we're going to be using
353 * the shared 1:1 page table initially. It shouldn't hurt */
354 shadow_mode_enable(ed->domain, SHM_enable|SHM_translate|SHM_external);
355 }
357 return 0;
359 out:
360 free_vmcs(vmcs);
361 ed->arch.arch_vmx.vmcs = 0;
362 return error;
363 }
364 #endif
367 /* This is called by arch_final_setup_guest and do_boot_vcpu */
368 int arch_set_info_guest(
369 struct exec_domain *ed, full_execution_context_t *c)
370 {
371 struct domain *d = ed->domain;
372 unsigned long phys_basetab;
373 int i, rc;
375 /*
376 * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
377 * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
378 * If SS RPL or DPL differs from CS RPL then we'll #GP.
379 */
380 if (!(c->flags & ECF_VMX_GUEST))
381 if ( ((c->cpu_ctxt.cs & 3) == 0) ||
382 ((c->cpu_ctxt.ss & 3) == 0) )
383 return -EINVAL;
385 clear_bit(EDF_DONEFPUINIT, &ed->ed_flags);
386 if ( c->flags & ECF_I387_VALID )
387 set_bit(EDF_DONEFPUINIT, &ed->ed_flags);
389 ed->arch.flags &= ~TF_kernel_mode;
390 if ( c->flags & ECF_IN_KERNEL )
391 ed->arch.flags |= TF_kernel_mode;
393 memcpy(&ed->arch.user_ctxt,
394 &c->cpu_ctxt,
395 sizeof(ed->arch.user_ctxt));
397 memcpy(&ed->arch.i387,
398 &c->fpu_ctxt,
399 sizeof(ed->arch.i387));
401 /* IOPL privileges are virtualised. */
402 ed->arch.iopl = (ed->arch.user_ctxt.eflags >> 12) & 3;
403 ed->arch.user_ctxt.eflags &= ~EF_IOPL;
405 /* Clear IOPL for unprivileged domains. */
406 if (!IS_PRIV(d))
407 ed->arch.user_ctxt.eflags &= 0xffffcfff;
409 if (test_bit(EDF_DONEINIT, &ed->ed_flags))
410 return 0;
412 memcpy(ed->arch.traps,
413 &c->trap_ctxt,
414 sizeof(ed->arch.traps));
416 if ( (rc = (int)set_fast_trap(ed, c->fast_trap_idx)) != 0 )
417 return rc;
419 ed->arch.ldt_base = c->ldt_base;
420 ed->arch.ldt_ents = c->ldt_ents;
422 ed->arch.kernel_ss = c->kernel_ss;
423 ed->arch.kernel_sp = c->kernel_esp;
425 for ( i = 0; i < 8; i++ )
426 (void)set_debugreg(ed, i, c->debugreg[i]);
428 #if defined(__i386__)
429 ed->arch.event_selector = c->event_callback_cs;
430 ed->arch.event_address = c->event_callback_eip;
431 ed->arch.failsafe_selector = c->failsafe_callback_cs;
432 ed->arch.failsafe_address = c->failsafe_callback_eip;
433 #elif defined(__x86_64__)
434 ed->arch.event_address = c->event_callback_eip;
435 ed->arch.failsafe_address = c->failsafe_callback_eip;
436 ed->arch.syscall_address = c->syscall_callback_eip;
437 #endif
439 if ( ed->eid == 0 )
440 d->vm_assist = c->vm_assist;
442 phys_basetab = c->pt_base;
443 ed->arch.guest_table = mk_pagetable(phys_basetab);
445 if ( shadow_mode_enabled(d) )
446 {
447 if ( !get_page(&frame_table[phys_basetab>>PAGE_SHIFT], d) )
448 return -EINVAL;
449 }
450 else
451 {
452 if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d,
453 PGT_base_page_table) )
454 return -EINVAL;
455 }
457 /* Failure to set GDT is harmless. */
458 SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES);
459 SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS);
460 if ( c->gdt_ents != 0 )
461 {
462 if ( (rc = (int)set_gdt(ed, c->gdt_frames, c->gdt_ents)) != 0 )
463 {
464 put_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT]);
465 return rc;
466 }
467 }
469 #ifdef CONFIG_VMX
470 if ( c->flags & ECF_VMX_GUEST )
471 {
472 int error;
474 // VMX uses the initially provided page tables as the P2M map.
475 //
476 // XXX: This creates a security issue -- Xen can't necessarily
477 // trust the VMX domain builder. Xen should validate this
478 // page table, and/or build the table itself, or ???
479 //
480 if ( !pagetable_val(d->arch.phys_table) )
481 d->arch.phys_table = ed->arch.guest_table;
483 if ( (error = vmx_final_setup_guest(ed, c)) )
484 return error;
485 }
486 #endif
488 update_pagetables(ed);
490 /* Don't redo final setup */
491 set_bit(EDF_DONEINIT, &ed->ed_flags);
493 return 0;
494 }
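/*
 * A small sketch of the IOPL handling in arch_set_info_guest() above: the
 * I/O privilege level occupies EFLAGS bits 12-13 (mask 0x3000, i.e. EF_IOPL),
 * so it is recorded per-vcpu with (eflags >> 12) & 3 and then cleared from
 * the EFLAGS image the guest actually runs with.
 */
#define SKETCH_EFLAGS_IOPL 0x3000UL

static unsigned long strip_iopl_sketch(unsigned long eflags, unsigned int *iopl)
{
    *iopl = (eflags >> 12) & 3;              /* virtualised IOPL */
    return eflags & ~SKETCH_EFLAGS_IOPL;     /* guest never sees a raised IOPL */
}

/* e.g. strip_iopl_sketch(0x3202, &iopl) returns 0x202 with iopl == 3 */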
497 void new_thread(struct exec_domain *d,
498 unsigned long start_pc,
499 unsigned long start_stack,
500 unsigned long start_info)
501 {
502 execution_context_t *ec = &d->arch.user_ctxt;
504 /*
505 * Initial register values:
506 * DS,ES,FS,GS = FLAT_KERNEL_DS
507 * CS:EIP = FLAT_KERNEL_CS:start_pc
508 * SS:ESP = FLAT_KERNEL_SS:start_stack
509 * ESI = start_info
510 * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
511 */
512 ec->ds = ec->es = ec->fs = ec->gs = FLAT_KERNEL_DS;
513 ec->ss = FLAT_KERNEL_SS;
514 ec->cs = FLAT_KERNEL_CS;
515 ec->eip = start_pc;
516 ec->esp = start_stack;
517 ec->esi = start_info;
519 __save_flags(ec->eflags);
520 ec->eflags |= X86_EFLAGS_IF;
521 }
524 #ifdef __x86_64__
526 void toggle_guest_mode(struct exec_domain *ed)
527 {
528 ed->arch.flags ^= TF_kernel_mode;
529 __asm__ __volatile__ ( "swapgs" );
530 update_pagetables(ed);
531 write_ptbase(ed);
532 }
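/*
 * The loadsegment() macro below uses the exception-fixup mechanism: if the
 * "movl %k1,%%seg" at label 1 faults on a bad selector, the fault handler
 * finds the (1b,3b) pair in the __ex_table section and resumes at label 3,
 * which loads a null selector instead and zeroes the result, so the macro
 * evaluates to 0 on failure and 1 on success. load_segments() accumulates
 * these results into all_segs_okay and falls back to the guest's failsafe
 * callback when any reload failed.
 */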
534 #define loadsegment(seg,value) ({ \
535 int __r = 1; \
536 __asm__ __volatile__ ( \
537 "1: movl %k1,%%" #seg "\n2:\n" \
538 ".section .fixup,\"ax\"\n" \
539 "3: xorl %k0,%k0\n" \
540 " movl %k0,%%" #seg "\n" \
541 " jmp 2b\n" \
542 ".previous\n" \
543 ".section __ex_table,\"a\"\n" \
544 " .align 8\n" \
545 " .quad 1b,3b\n" \
546 ".previous" \
547 : "=r" (__r) : "r" (value), "0" (__r) );\
548 __r; })
550 static void load_segments(struct exec_domain *p, struct exec_domain *n)
551 {
552 int all_segs_okay = 1;
554 /* Either selector != 0 ==> reload. */
555 if ( unlikely(p->arch.user_ctxt.ds |
556 n->arch.user_ctxt.ds) )
557 all_segs_okay &= loadsegment(ds, n->arch.user_ctxt.ds);
559 /* Either selector != 0 ==> reload. */
560 if ( unlikely(p->arch.user_ctxt.es |
561 n->arch.user_ctxt.es) )
562 all_segs_okay &= loadsegment(es, n->arch.user_ctxt.es);
564 /*
565 * Either selector != 0 ==> reload.
566 * Also reload to reset FS_BASE if it was non-zero.
567 */
568 if ( unlikely(p->arch.user_ctxt.fs |
569 p->arch.user_ctxt.fs_base |
570 n->arch.user_ctxt.fs) )
571 {
572 all_segs_okay &= loadsegment(fs, n->arch.user_ctxt.fs);
573 if ( p->arch.user_ctxt.fs ) /* != 0 selector kills fs_base */
574 p->arch.user_ctxt.fs_base = 0;
575 }
577 /*
578 * Either selector != 0 ==> reload.
579 * Also reload to reset GS_BASE if it was non-zero.
580 */
581 if ( unlikely(p->arch.user_ctxt.gs |
582 p->arch.user_ctxt.gs_base_user |
583 n->arch.user_ctxt.gs) )
584 {
585 /* Reset GS_BASE with user %gs? */
586 if ( p->arch.user_ctxt.gs || !n->arch.user_ctxt.gs_base_user )
587 all_segs_okay &= loadsegment(gs, n->arch.user_ctxt.gs);
588 if ( p->arch.user_ctxt.gs ) /* != 0 selector kills gs_base_user */
589 p->arch.user_ctxt.gs_base_user = 0;
590 }
592 /* This can only be non-zero if selector is NULL. */
593 if ( n->arch.user_ctxt.fs_base )
594 wrmsr(MSR_FS_BASE,
595 n->arch.user_ctxt.fs_base,
596 n->arch.user_ctxt.fs_base>>32);
598 /* Most kernels have non-zero GS base, so don't bother testing. */
599 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
600 wrmsr(MSR_SHADOW_GS_BASE,
601 n->arch.user_ctxt.gs_base_kernel,
602 n->arch.user_ctxt.gs_base_kernel>>32);
604 /* This can only be non-zero if selector is NULL. */
605 if ( n->arch.user_ctxt.gs_base_user )
606 wrmsr(MSR_GS_BASE,
607 n->arch.user_ctxt.gs_base_user,
608 n->arch.user_ctxt.gs_base_user>>32);
610 /* If in kernel mode then switch the GS bases around. */
611 if ( n->arch.flags & TF_kernel_mode )
612 __asm__ __volatile__ ( "swapgs" );
614 if ( unlikely(!all_segs_okay) )
615 {
616 struct xen_regs *regs = get_execution_context();
617 unsigned long *rsp =
618 (n->arch.flags & TF_kernel_mode) ?
619 (unsigned long *)regs->rsp :
620 (unsigned long *)n->arch.kernel_sp;
622 if ( !(n->arch.flags & TF_kernel_mode) )
623 toggle_guest_mode(n);
624 else
625 regs->cs &= ~3;
627 if ( put_user(regs->ss, rsp- 1) |
628 put_user(regs->rsp, rsp- 2) |
629 put_user(regs->rflags, rsp- 3) |
630 put_user(regs->cs, rsp- 4) |
631 put_user(regs->rip, rsp- 5) |
632 put_user(regs->gs, rsp- 6) |
633 put_user(regs->fs, rsp- 7) |
634 put_user(regs->es, rsp- 8) |
635 put_user(regs->ds, rsp- 9) |
636 put_user(regs->r11, rsp-10) |
637 put_user(regs->rcx, rsp-11) )
638 {
639 DPRINTK("Error while creating failsafe callback frame.\n");
640 domain_crash();
641 }
643 regs->entry_vector = TRAP_syscall;
644 regs->rflags &= 0xFFFCBEFFUL;
645 regs->ss = __GUEST_SS;
646 regs->rsp = (unsigned long)(rsp-11);
647 regs->cs = __GUEST_CS;
648 regs->rip = n->arch.failsafe_address;
649 }
650 }
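/*
 * A sketch of the failsafe-callback frame that load_segments() builds on the
 * guest stack when a segment reload fails; eleven words are written, lowest
 * address first, so regs->rsp ends up at (rsp - 11). The struct name is
 * illustrative only.
 */
struct failsafe_frame_sketch {               /* located at (rsp - 11) */
    unsigned long rcx, r11;                  /* syscall-clobbered registers */
    unsigned long ds, es, fs, gs;            /* the selectors being reloaded */
    unsigned long rip, cs, rflags, rsp, ss;  /* iret-style return frame */
};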
652 static void save_segments(struct exec_domain *p)
653 {
654 __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (p->arch.user_ctxt.ds) );
655 __asm__ __volatile__ ( "movl %%es,%0" : "=m" (p->arch.user_ctxt.es) );
656 __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (p->arch.user_ctxt.fs) );
657 __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (p->arch.user_ctxt.gs) );
658 }
660 static void clear_segments(void)
661 {
662 __asm__ __volatile__ (
663 " movl %0,%%ds; "
664 " movl %0,%%es; "
665 " movl %0,%%fs; "
666 " movl %0,%%gs; "
667 ""safe_swapgs" "
668 " movl %0,%%gs"
669 : : "r" (0) );
670 }
672 long do_switch_to_user(void)
673 {
674 struct xen_regs *regs = get_execution_context();
675 struct switch_to_user stu;
676 struct exec_domain *ed = current;
678 if ( unlikely(copy_from_user(&stu, (void *)regs->rsp, sizeof(stu))) ||
679 unlikely(pagetable_val(ed->arch.guest_table_user) == 0) )
680 return -EFAULT;
682 toggle_guest_mode(ed);
684 regs->rip = stu.rip;
685 regs->cs = stu.cs | 3; /* force guest privilege */
686 regs->rflags = stu.rflags;
687 regs->rsp = stu.rsp;
688 regs->ss = stu.ss | 3; /* force guest privilege */
690 if ( !(stu.flags & ECF_IN_SYSCALL) )
691 {
692 regs->entry_vector = 0;
693 regs->r11 = stu.r11;
694 regs->rcx = stu.rcx;
695 }
697 /* Saved %rax gets written back to regs->rax in entry.S. */
698 return stu.rax;
699 }
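/*
 * Note on do_switch_to_user() above: this is how a 64-bit guest kernel
 * returns to its userspace. The switch_to_user record is read from the guest
 * kernel stack, toggle_guest_mode() switches to the user page table and swaps
 * the GS bases, CS/SS have RPL 3 forced onto them, and rcx/r11 are restored
 * only when the frame did not come from the syscall path (ECF_IN_SYSCALL);
 * the returned stu.rax becomes the guest's %rax via the exit path in entry.S.
 */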
701 #define switch_kernel_stack(_n,_c) ((void)0)
703 #elif defined(__i386__)
705 #define load_segments(_p, _n) ((void)0)
706 #define save_segments(_p) ((void)0)
707 #define clear_segments() ((void)0)
709 static inline void switch_kernel_stack(struct exec_domain *n, unsigned int cpu)
710 {
711 struct tss_struct *tss = &init_tss[cpu];
712 tss->esp1 = n->arch.kernel_sp;
713 tss->ss1 = n->arch.kernel_ss;
714 }
716 #endif
718 #define loaddebug(_ed,_reg) \
719 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_ed)->debugreg[_reg]))
721 static void __context_switch(void)
722 {
723 execution_context_t *stack_ec = get_execution_context();
724 unsigned int cpu = smp_processor_id();
725 struct exec_domain *p = percpu_ctxt[cpu].curr_ed;
726 struct exec_domain *n = current;
728 if ( !is_idle_task(p->domain) )
729 {
730 memcpy(&p->arch.user_ctxt,
731 stack_ec,
732 sizeof(*stack_ec));
733 unlazy_fpu(p);
734 CLEAR_FAST_TRAP(&p->arch);
735 save_segments(p);
736 }
738 if ( !is_idle_task(n->domain) )
739 {
740 memcpy(stack_ec,
741 &n->arch.user_ctxt,
742 sizeof(*stack_ec));
744 /* Maybe switch the debug registers. */
745 if ( unlikely(n->arch.debugreg[7]) )
746 {
747 loaddebug(&n->arch, 0);
748 loaddebug(&n->arch, 1);
749 loaddebug(&n->arch, 2);
750 loaddebug(&n->arch, 3);
751 /* no 4 and 5 */
752 loaddebug(&n->arch, 6);
753 loaddebug(&n->arch, 7);
754 }
756 if ( !VMX_DOMAIN(n) )
757 {
758 SET_FAST_TRAP(&n->arch);
759 switch_kernel_stack(n, cpu);
760 }
761 }
763 if ( p->domain != n->domain )
764 set_bit(cpu, &n->domain->cpuset);
766 write_ptbase(n);
767 __asm__ __volatile__ ( "lgdt %0" : "=m" (*n->arch.gdt) );
769 if ( p->domain != n->domain )
770 clear_bit(cpu, &p->domain->cpuset);
772 percpu_ctxt[cpu].curr_ed = n;
773 }
776 void context_switch(struct exec_domain *prev, struct exec_domain *next)
777 {
778 struct exec_domain *realprev;
780 local_irq_disable();
782 set_current(next);
784 if ( ((realprev = percpu_ctxt[smp_processor_id()].curr_ed) == next) ||
785 is_idle_task(next->domain) )
786 {
787 local_irq_enable();
788 }
789 else
790 {
791 __context_switch();
793 local_irq_enable();
795 if ( !VMX_DOMAIN(next) )
796 {
797 load_LDT(next);
798 load_segments(realprev, next);
799 }
800 }
802 /*
803 * We do this late on because it doesn't need to be protected by the
804 * schedule_lock, and because we want this to be the very last use of
805 * 'prev' (after this point, a dying domain's info structure may be freed
806 * without warning).
807 */
808 clear_bit(EDF_RUNNING, &prev->ed_flags);
810 schedule_tail(next);
812 BUG();
813 }
815 int __sync_lazy_execstate(void)
816 {
817 if ( percpu_ctxt[smp_processor_id()].curr_ed == current )
818 return 0;
819 __context_switch();
820 load_LDT(current);
821 clear_segments();
822 return 1;
823 }
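/*
 * A sketch of the invariant behind the lazy switching above: 'current' is
 * whatever the scheduler last chose, while percpu_ctxt[cpu].curr_ed is whose
 * register and page-table state is really loaded on this CPU. Switching to
 * the idle domain leaves the old state in place; __sync_lazy_execstate() (or
 * the next real __context_switch()) reconciles the two when it matters.
 */
static inline int lazy_state_pending_sketch(void)
{
    return percpu_ctxt[smp_processor_id()].curr_ed != current;
}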
825 void sync_lazy_execstate_cpuset(unsigned long cpuset)
826 {
827 flush_tlb_mask(cpuset);
828 }
830 void sync_lazy_execstate_all(void)
831 {
832 flush_tlb_all();
833 }
835 unsigned long __hypercall_create_continuation(
836 unsigned int op, unsigned int nr_args, ...)
837 {
838 struct mc_state *mcs = &mc_state[smp_processor_id()];
839 execution_context_t *ec;
840 unsigned int i;
841 va_list args;
843 va_start(args, nr_args);
845 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
846 {
847 __set_bit(_MCSF_call_preempted, &mcs->flags);
849 for ( i = 0; i < nr_args; i++ )
850 mcs->call.args[i] = va_arg(args, unsigned long);
851 }
852 else
853 {
854 ec = get_execution_context();
855 #if defined(__i386__)
856 ec->eax = op;
857 ec->eip -= 2; /* re-execute 'int 0x82' */
859 for ( i = 0; i < nr_args; i++ )
860 {
861 switch ( i )
862 {
863 case 0: ec->ebx = va_arg(args, unsigned long); break;
864 case 1: ec->ecx = va_arg(args, unsigned long); break;
865 case 2: ec->edx = va_arg(args, unsigned long); break;
866 case 3: ec->esi = va_arg(args, unsigned long); break;
867 case 4: ec->edi = va_arg(args, unsigned long); break;
868 case 5: ec->ebp = va_arg(args, unsigned long); break;
869 }
870 }
871 #elif defined(__x86_64__)
872 ec->rax = op;
873 ec->rip -= 2; /* re-execute 'syscall' */
875 for ( i = 0; i < nr_args; i++ )
876 {
877 switch ( i )
878 {
879 case 0: ec->rdi = va_arg(args, unsigned long); break;
880 case 1: ec->rsi = va_arg(args, unsigned long); break;
881 case 2: ec->rdx = va_arg(args, unsigned long); break;
882 case 3: ec->r10 = va_arg(args, unsigned long); break;
883 case 4: ec->r8 = va_arg(args, unsigned long); break;
884 case 5: ec->r9 = va_arg(args, unsigned long); break;
885 }
886 }
887 #endif
888 }
890 va_end(args);
892 return op;
893 }
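/*
 * A hypothetical example of how a preemptible hypercall would use the
 * continuation machinery above: when it must yield, it re-packs its remaining
 * work as the hypercall's arguments and returns; the code above rewinds the
 * guest's EIP/RIP past the hypercall instruction ("eip -= 2"), so the guest
 * transparently re-issues the call and resumes where it left off.
 * __HYPERVISOR_example_op and example_op_sketch() are made up; the
 * hypercall_preempt_check()/hypercall_create_continuation() wrappers are
 * assumed to be the usual ones around __hypercall_create_continuation().
 */
static long example_op_sketch(unsigned long start, unsigned long count)
{
    unsigned long done;

    for ( done = 0; done < count; done++ )
    {
        /* ... process item (start + done) ... */
        if ( hypercall_preempt_check() )
            return hypercall_create_continuation(
                __HYPERVISOR_example_op, 2, start + done, count - done);
    }

    return 0;
}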
895 #ifdef CONFIG_VMX
896 static void vmx_relinquish_resources(struct exec_domain *ed)
897 {
898 if ( !VMX_DOMAIN(ed) )
899 return;
901 BUG_ON(ed->arch.arch_vmx.vmcs == NULL);
902 free_vmcs(ed->arch.arch_vmx.vmcs);
903 ed->arch.arch_vmx.vmcs = 0;
905 free_monitor_pagetable(ed);
906 rem_ac_timer(&ed->arch.arch_vmx.vmx_platform.vmx_pit.pit_timer);
907 }
908 #else
909 #define vmx_relinquish_resources(_ed) ((void)0)
910 #endif
912 static void relinquish_memory(struct domain *d, struct list_head *list)
913 {
914 struct list_head *ent;
915 struct pfn_info *page;
916 unsigned long x, y;
918 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
919 spin_lock_recursive(&d->page_alloc_lock);
921 ent = list->next;
922 while ( ent != list )
923 {
924 page = list_entry(ent, struct pfn_info, list);
926 /* Grab a reference to the page so it won't disappear from under us. */
927 if ( unlikely(!get_page(page, d)) )
928 {
929 /* Couldn't get a reference -- someone is freeing this page. */
930 ent = ent->next;
931 continue;
932 }
934 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
935 put_page_and_type(page);
937 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
938 put_page(page);
940 /*
941 * Forcibly invalidate base page tables at this point to break circular
942 * 'linear page table' references. This is okay because MMU structures
943 * are not shared across domains and this domain is now dead. Thus base
944 * tables are not in use so a non-zero count means circular reference.
945 */
946 y = page->u.inuse.type_info;
947 for ( ; ; )
948 {
949 x = y;
950 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
951 (PGT_base_page_table|PGT_validated)) )
952 break;
954 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
955 if ( likely(y == x) )
956 {
957 free_page_type(page, PGT_base_page_table);
958 break;
959 }
960 }
962 /* Follow the list chain and /then/ potentially free the page. */
963 ent = ent->next;
964 put_page(page);
965 }
967 spin_unlock_recursive(&d->page_alloc_lock);
968 }
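/*
 * A stand-alone sketch of the cmpxchg retry idiom used above to clear
 * PGT_validated: re-read the word, compute the desired value, and retry until
 * the compare-and-swap observes no concurrent modification. Expressed here
 * with C11 atomics for clarity; the hypervisor uses its own cmpxchg().
 */
#include <stdatomic.h>

static void clear_flag_sketch(_Atomic unsigned long *word, unsigned long flag)
{
    unsigned long old = atomic_load(word);

    /* On failure 'old' is refreshed with the current value; loop until the
     * update with the flag cleared actually lands. */
    while ( !atomic_compare_exchange_weak(word, &old, old & ~flag) )
        ;
}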
970 void domain_relinquish_resources(struct domain *d)
971 {
972 struct exec_domain *ed;
974 BUG_ON(d->cpuset != 0);
976 ptwr_destroy(d);
978 /* Release device mappings of other domains */
979 gnttab_release_dev_mappings(d->grant_table);
981 /* Exit shadow mode before deconstructing final guest page table. */
982 shadow_mode_disable(d);
984 /* Drop the in-use references to page-table bases. */
985 for_each_exec_domain ( d, ed )
986 {
987 if ( pagetable_val(ed->arch.guest_table) != 0 )
988 {
989 put_page_and_type(&frame_table[
990 pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT]);
991 ed->arch.guest_table = mk_pagetable(0);
992 }
994 if ( pagetable_val(ed->arch.guest_table_user) != 0 )
995 {
996 put_page_and_type(&frame_table[
997 pagetable_val(ed->arch.guest_table_user) >> PAGE_SHIFT]);
998 ed->arch.guest_table_user = mk_pagetable(0);
999 }
1001 vmx_relinquish_resources(ed);
1002 }
1004 /*
1005 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
1006 * it automatically gets squashed when the guest's mappings go away.
1007 */
1008 for_each_exec_domain(d, ed)
1009 destroy_gdt(ed);
1011 /* Relinquish every page of memory. */
1012 relinquish_memory(d, &d->xenpage_list);
1013 relinquish_memory(d, &d->page_list);
1014 }
1017 /*
1018 * Local variables:
1019 * mode: C
1020 * c-set-style: "BSD"
1021 * c-basic-offset: 4
1022 * tab-width: 4
1023 * indent-tabs-mode: nil
1024 * End:
1025 */