debuggers.hg

view xen/arch/x86/domain.c @ 4662:9a768d11cc7b

bitkeeper revision 1.1358 (4267e561Ml7gO0DQYGp9EYRUYPBDHA)

Merge burn.cl.cam.ac.uk:/auto/groups/xeno-xenod/BK/xen-unstable.bk
into burn.cl.cam.ac.uk:/local/scratch-1/maf46/xen-unstable.bk

Signed-off-by: michael.fetterman@cl.cam.ac.uk
author maf46@burn.cl.cam.ac.uk
date Thu Apr 21 17:39:45 2005 +0000 (2005-04-21)
parents 319e2634476d 8e987582b901
children 18a8f5216548
line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/smp.h>
20 #include <xen/delay.h>
21 #include <xen/softirq.h>
22 #include <xen/grant_table.h>
23 #include <asm/regs.h>
24 #include <asm/mc146818rtc.h>
25 #include <asm/system.h>
26 #include <asm/io.h>
27 #include <asm/processor.h>
28 #include <asm/desc.h>
29 #include <asm/i387.h>
30 #include <asm/mpspec.h>
31 #include <asm/ldt.h>
32 #include <xen/irq.h>
33 #include <xen/event.h>
34 #include <asm/shadow.h>
35 #include <xen/console.h>
36 #include <xen/elf.h>
37 #include <asm/vmx.h>
38 #include <asm/vmx_vmcs.h>
39 #include <asm/msr.h>
40 #include <xen/kernel.h>
41 #include <public/io/ioreq.h>
42 #include <xen/multicall.h>
44 /* opt_noreboot: If true, machine will need manual reset on error. */
45 static int opt_noreboot = 0;
46 boolean_param("noreboot", opt_noreboot);
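/*
 * Per-CPU record of which exec_domain's register state is currently
 * loaded on that CPU.  This can lag behind 'current', because switches
 * to the idle domain are performed lazily; __sync_lazy_execstate()
 * (below) completes any deferred switch when the state is really needed.
 */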
48 struct percpu_ctxt {
49 struct exec_domain *curr_ed;
50 } __cacheline_aligned;
51 static struct percpu_ctxt percpu_ctxt[NR_CPUS];
53 static void default_idle(void)
54 {
55 local_irq_disable();
56 if ( !softirq_pending(smp_processor_id()) )
57 safe_halt();
58 else
59 local_irq_enable();
60 }
62 static __attribute_used__ void idle_loop(void)
63 {
64 int cpu = smp_processor_id();
65 for ( ; ; )
66 {
67 irq_stat[cpu].idle_timestamp = jiffies;
68 while ( !softirq_pending(cpu) )
69 {
70 page_scrub_schedule_work();
71 default_idle();
72 }
73 do_softirq();
74 }
75 }
77 void startup_cpu_idle_loop(void)
78 {
79 /* Just some sanity to ensure that the scheduler is set up okay. */
80 ASSERT(current->domain->id == IDLE_DOMAIN_ID);
81 percpu_ctxt[smp_processor_id()].curr_ed = current;
82 set_bit(smp_processor_id(), &current->domain->cpuset);
83 domain_unpause_by_systemcontroller(current->domain);
84 raise_softirq(SCHEDULE_SOFTIRQ);
85 do_softirq();
87 /*
88 * Declare CPU setup done to the boot processor.
89 * A memory barrier ensures the new state is visible to it.
90 */
91 smp_mb();
92 init_idle();
94 idle_loop();
95 }
97 static long no_idt[2];
98 static int reboot_mode;
100 static inline void kb_wait(void)
101 {
102 int i;
104 for ( i = 0; i < 0x10000; i++ )
105 if ( (inb_p(0x64) & 0x02) == 0 )
106 break;
107 }
109 void machine_restart(char * __unused)
110 {
111 int i;
113 if ( opt_noreboot )
114 {
115 printk("Reboot disabled on cmdline: require manual reset\n");
116 for ( ; ; )
117 safe_halt();
118 }
120 local_irq_enable();
122 /* Ensure we are the boot CPU. */
123 if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
124 {
125 smp_call_function((void *)machine_restart, NULL, 1, 0);
126 for ( ; ; )
127 safe_halt();
128 }
130 /*
131 * Stop all CPUs and turn off local APICs and the IO-APIC, so
132 * other OSs see a clean IRQ state.
133 */
134 smp_send_stop();
135 disable_IO_APIC();
137 #ifdef CONFIG_VMX
138 stop_vmx();
139 #endif
141 /* Rebooting needs to touch the page at absolute address 0. */
142 *((unsigned short *)__va(0x472)) = reboot_mode;
144 for ( ; ; )
145 {
146 /* Pulse the keyboard reset line. */
147 for ( i = 0; i < 100; i++ )
148 {
149 kb_wait();
150 udelay(50);
151 outb(0xfe,0x64); /* pulse reset low */
152 udelay(50);
153 }
155 /* That didn't work - force a triple fault.. */
156 __asm__ __volatile__("lidt %0": "=m" (no_idt));
157 __asm__ __volatile__("int3");
158 }
159 }
162 void __attribute__((noreturn)) __machine_halt(void *unused)
163 {
164 for ( ; ; )
165 safe_halt();
166 }
168 void machine_halt(void)
169 {
170 watchdog_on = 0;
171 smp_call_function(__machine_halt, NULL, 1, 0);
172 __machine_halt(NULL);
173 }
175 void dump_pageframe_info(struct domain *d)
176 {
177 struct pfn_info *page;
179 if ( d->tot_pages < 10 )
180 {
181 list_for_each_entry ( page, &d->page_list, list )
182 {
183 printk("Page %08x: caf=%08x, taf=%08x\n",
184 page_to_phys(page), page->count_info,
185 page->u.inuse.type_info);
186 }
187 }
189 list_for_each_entry ( page, &d->xenpage_list, list )
190 {
191 printk("XenPage %08x: caf=%08x, taf=%08x\n",
192 page_to_phys(page), page->count_info,
193 page->u.inuse.type_info);
194 }
197 page = virt_to_page(d->shared_info);
198 printk("Shared_info@%08x: caf=%08x, taf=%08x\n",
199 page_to_phys(page), page->count_info,
200 page->u.inuse.type_info);
201 }
203 struct exec_domain *arch_alloc_exec_domain_struct(void)
204 {
205 return xmalloc(struct exec_domain);
206 }
208 void arch_free_exec_domain_struct(struct exec_domain *ed)
209 {
210 xfree(ed);
211 }
213 void free_perdomain_pt(struct domain *d)
214 {
215 free_xenheap_page((unsigned long)d->arch.mm_perdomain_pt);
216 #ifdef __x86_64__
217 free_xenheap_page((unsigned long)d->arch.mm_perdomain_l2);
218 free_xenheap_page((unsigned long)d->arch.mm_perdomain_l3);
219 #endif
220 }
222 static void continue_idle_task(struct exec_domain *ed)
223 {
224 reset_stack_and_jump(idle_loop);
225 }
227 static void continue_nonidle_task(struct exec_domain *ed)
228 {
229 reset_stack_and_jump(ret_from_intr);
230 }
232 void arch_do_createdomain(struct exec_domain *ed)
233 {
234 struct domain *d = ed->domain;
236 SET_DEFAULT_FAST_TRAP(&ed->arch);
238 ed->arch.flags = TF_kernel_mode;
240 if ( d->id == IDLE_DOMAIN_ID )
241 {
242 ed->arch.schedule_tail = continue_idle_task;
243 }
244 else
245 {
246 ed->arch.schedule_tail = continue_nonidle_task;
248 d->shared_info = (void *)alloc_xenheap_page();
249 memset(d->shared_info, 0, PAGE_SIZE);
250 ed->vcpu_info = &d->shared_info->vcpu_data[ed->eid];
251 SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
252 machine_to_phys_mapping[virt_to_phys(d->shared_info) >>
253 PAGE_SHIFT] = INVALID_M2P_ENTRY;
255 d->arch.mm_perdomain_pt = (l1_pgentry_t *)alloc_xenheap_page();
256 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE);
257 machine_to_phys_mapping[virt_to_phys(d->arch.mm_perdomain_pt) >>
258 PAGE_SHIFT] = INVALID_M2P_ENTRY;
259 ed->arch.perdomain_ptes = d->arch.mm_perdomain_pt;
261 ed->arch.guest_vtable = __linear_l2_table;
262 ed->arch.shadow_vtable = __shadow_linear_l2_table;
264 #ifdef __x86_64__
265 ed->arch.guest_vl3table = __linear_l3_table;
266 ed->arch.guest_vl4table = __linear_l4_table;
268 d->arch.mm_perdomain_l2 = (l2_pgentry_t *)alloc_xenheap_page();
269 memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
270 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)] =
271 l2e_create_phys(__pa(d->arch.mm_perdomain_pt),
272 __PAGE_HYPERVISOR);
273 d->arch.mm_perdomain_l3 = (l3_pgentry_t *)alloc_xenheap_page();
274 memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
275 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
276 l3e_create_phys(__pa(d->arch.mm_perdomain_l2),
277 __PAGE_HYPERVISOR);
278 #endif
280 (void)ptwr_init(d);
282 shadow_lock_init(d);
283 INIT_LIST_HEAD(&d->arch.free_shadow_frames);
284 }
285 }
287 void arch_do_boot_vcpu(struct exec_domain *ed)
288 {
289 struct domain *d = ed->domain;
290 ed->arch.schedule_tail = d->exec_domain[0]->arch.schedule_tail;
291 ed->arch.perdomain_ptes =
292 d->arch.mm_perdomain_pt + (ed->eid << PDPT_VCPU_SHIFT);
293 ed->arch.flags = TF_kernel_mode;
294 }
296 #ifdef CONFIG_VMX
297 void arch_vmx_do_resume(struct exec_domain *ed)
298 {
299 u64 vmcs_phys_ptr = (u64) virt_to_phys(ed->arch.arch_vmx.vmcs);
301 load_vmcs(&ed->arch.arch_vmx, vmcs_phys_ptr);
302 vmx_do_resume(ed);
303 reset_stack_and_jump(vmx_asm_do_resume);
304 }
306 void arch_vmx_do_launch(struct exec_domain *ed)
307 {
308 u64 vmcs_phys_ptr = (u64) virt_to_phys(ed->arch.arch_vmx.vmcs);
310 load_vmcs(&ed->arch.arch_vmx, vmcs_phys_ptr);
311 vmx_do_launch(ed);
312 reset_stack_and_jump(vmx_asm_do_launch);
313 }
315 static int vmx_final_setup_guest(struct exec_domain *ed,
316 full_execution_context_t *full_context)
317 {
318 int error;
319 execution_context_t *context;
320 struct vmcs_struct *vmcs;
322 context = &full_context->cpu_ctxt;
324 /*
325 * Create a new VMCS
326 */
327 if (!(vmcs = alloc_vmcs())) {
328 printk("Failed to create a new VMCS\n");
329 return -ENOMEM;
330 }
332 memset(&ed->arch.arch_vmx, 0, sizeof (struct arch_vmx_struct));
334 ed->arch.arch_vmx.vmcs = vmcs;
335 error = construct_vmcs(
336 &ed->arch.arch_vmx, context, full_context, VMCS_USE_HOST_ENV);
337 if ( error < 0 )
338 {
339 printk("Failed to construct a new VMCS\n");
340 goto out;
341 }
343 ed->arch.schedule_tail = arch_vmx_do_launch;
344 clear_bit(VMX_CPU_STATE_PG_ENABLED, &ed->arch.arch_vmx.cpu_state);
346 #if defined (__i386)
347 ed->arch.arch_vmx.vmx_platform.real_mode_data =
348 (unsigned long *) context->esi;
349 #endif
351 if (ed == ed->domain->exec_domain[0]) {
352 /*
353 * This is required only once per domain.
354 * XXX todo: add a separate function to do these.
355 */
356 memset(&ed->domain->shared_info->evtchn_mask[0], 0xff,
357 sizeof(ed->domain->shared_info->evtchn_mask));
358 clear_bit(IOPACKET_PORT, &ed->domain->shared_info->evtchn_mask[0]);
360 /* Put the domain in shadow mode even though we're going to be using
361 * the shared 1:1 page table initially. It shouldn't hurt. */
362 shadow_mode_enable(ed->domain, SHM_enable|SHM_translate|SHM_external);
363 }
365 return 0;
367 out:
368 free_vmcs(vmcs);
369 ed->arch.arch_vmx.vmcs = 0;
370 return error;
371 }
372 #endif
375 /* This is called by arch_final_setup_guest and do_boot_vcpu */
376 int arch_set_info_guest(
377 struct exec_domain *ed, full_execution_context_t *c)
378 {
379 struct domain *d = ed->domain;
380 unsigned long phys_basetab;
381 int i, rc;
383 /*
384 * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
385 * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
386 * If SS RPL or DPL differs from CS RPL then we'll #GP.
387 */
388 if (!(c->flags & ECF_VMX_GUEST))
389 if ( ((c->cpu_ctxt.cs & 3) == 0) ||
390 ((c->cpu_ctxt.ss & 3) == 0) )
391 return -EINVAL;
393 clear_bit(EDF_DONEFPUINIT, &ed->ed_flags);
394 if ( c->flags & ECF_I387_VALID )
395 set_bit(EDF_DONEFPUINIT, &ed->ed_flags);
397 ed->arch.flags &= ~TF_kernel_mode;
398 if ( c->flags & ECF_IN_KERNEL )
399 ed->arch.flags |= TF_kernel_mode;
401 memcpy(&ed->arch.user_ctxt,
402 &c->cpu_ctxt,
403 sizeof(ed->arch.user_ctxt));
405 memcpy(&ed->arch.i387,
406 &c->fpu_ctxt,
407 sizeof(ed->arch.i387));
409 /* IOPL privileges are virtualised. */
410 ed->arch.iopl = (ed->arch.user_ctxt.eflags >> 12) & 3;
411 ed->arch.user_ctxt.eflags &= ~EF_IOPL;
413 /* Clear IOPL for unprivileged domains. */
414 if (!IS_PRIV(d))
415 ed->arch.user_ctxt.eflags &= 0xffffcfff;
417 if (test_bit(EDF_DONEINIT, &ed->ed_flags))
418 return 0;
420 memcpy(ed->arch.traps,
421 &c->trap_ctxt,
422 sizeof(ed->arch.traps));
424 if ( (rc = (int)set_fast_trap(ed, c->fast_trap_idx)) != 0 )
425 return rc;
427 ed->arch.ldt_base = c->ldt_base;
428 ed->arch.ldt_ents = c->ldt_ents;
430 ed->arch.kernel_ss = c->kernel_ss;
431 ed->arch.kernel_sp = c->kernel_esp;
433 for ( i = 0; i < 8; i++ )
434 (void)set_debugreg(ed, i, c->debugreg[i]);
436 #if defined(__i386__)
437 ed->arch.event_selector = c->event_callback_cs;
438 ed->arch.event_address = c->event_callback_eip;
439 ed->arch.failsafe_selector = c->failsafe_callback_cs;
440 ed->arch.failsafe_address = c->failsafe_callback_eip;
441 #elif defined(__x86_64__)
442 ed->arch.event_address = c->event_callback_eip;
443 ed->arch.failsafe_address = c->failsafe_callback_eip;
444 ed->arch.syscall_address = c->syscall_callback_eip;
445 #endif
447 if ( ed->eid == 0 )
448 d->vm_assist = c->vm_assist;
450 phys_basetab = c->pt_base;
451 ed->arch.guest_table = mk_pagetable(phys_basetab);
453 if ( shadow_mode_enabled(d) )
454 {
455 if ( !get_page(&frame_table[phys_basetab>>PAGE_SHIFT], d) )
456 return -EINVAL;
457 }
458 else
459 {
460 if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d,
461 PGT_base_page_table) )
462 return -EINVAL;
463 }
465 /* Failure to set GDT is harmless. */
466 SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES);
467 SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS);
468 if ( c->gdt_ents != 0 )
469 {
470 if ( (rc = (int)set_gdt(ed, c->gdt_frames, c->gdt_ents)) != 0 )
471 {
472 put_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT]);
473 return rc;
474 }
475 }
477 #ifdef CONFIG_VMX
478 if ( c->flags & ECF_VMX_GUEST )
479 {
480 int error;
482 // VMX uses the initially provided page tables as the P2M map.
483 //
484 // XXX: This creates a security issue -- Xen can't necessarily
485 // trust the VMX domain builder. Xen should validate this
486 // page table, and/or build the table itself, or ???
487 //
488 if ( !pagetable_val(d->arch.phys_table) )
489 d->arch.phys_table = ed->arch.guest_table;
491 if ( (error = vmx_final_setup_guest(ed, c)) )
492 return error;
493 }
494 #endif
496 update_pagetables(ed);
498 /* Don't redo final setup */
499 set_bit(EDF_DONEINIT, &ed->ed_flags);
501 return 0;
502 }
505 void new_thread(struct exec_domain *d,
506 unsigned long start_pc,
507 unsigned long start_stack,
508 unsigned long start_info)
509 {
510 execution_context_t *ec = &d->arch.user_ctxt;
512 /*
513 * Initial register values:
514 * DS,ES,FS,GS = FLAT_KERNEL_DS
515 * CS:EIP = FLAT_KERNEL_CS:start_pc
516 * SS:ESP = FLAT_KERNEL_SS:start_stack
517 * ESI = start_info
518 * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
519 */
520 ec->ds = ec->es = ec->fs = ec->gs = FLAT_KERNEL_DS;
521 ec->ss = FLAT_KERNEL_SS;
522 ec->cs = FLAT_KERNEL_CS;
523 ec->eip = start_pc;
524 ec->esp = start_stack;
525 ec->esi = start_info;
527 __save_flags(ec->eflags);
528 ec->eflags |= X86_EFLAGS_IF;
529 }
532 #ifdef __x86_64__
534 void toggle_guest_mode(struct exec_domain *ed)
535 {
536 ed->arch.flags ^= TF_kernel_mode;
537 __asm__ __volatile__ ( "swapgs" );
538 update_pagetables(ed);
539 write_ptbase(ed);
540 }
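/*
 * Load a selector into the named segment register, with a fault fixup:
 * if the load #GPs (e.g. on a bogus guest-supplied selector), the
 * exception table redirects to code that loads a NULL selector instead,
 * and the macro evaluates to 0 rather than 1.
 */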
542 #define loadsegment(seg,value) ({ \
543 int __r = 1; \
544 __asm__ __volatile__ ( \
545 "1: movl %k1,%%" #seg "\n2:\n" \
546 ".section .fixup,\"ax\"\n" \
547 "3: xorl %k0,%k0\n" \
548 " movl %k0,%%" #seg "\n" \
549 " jmp 2b\n" \
550 ".previous\n" \
551 ".section __ex_table,\"a\"\n" \
552 " .align 8\n" \
553 " .quad 1b,3b\n" \
554 ".previous" \
555 : "=r" (__r) : "r" (value), "0" (__r) );\
556 __r; })
558 static void load_segments(struct exec_domain *p, struct exec_domain *n)
559 {
560 int all_segs_okay = 1;
562 /* Either selector != 0 ==> reload. */
563 if ( unlikely(p->arch.user_ctxt.ds |
564 n->arch.user_ctxt.ds) )
565 all_segs_okay &= loadsegment(ds, n->arch.user_ctxt.ds);
567 /* Either selector != 0 ==> reload. */
568 if ( unlikely(p->arch.user_ctxt.es |
569 n->arch.user_ctxt.es) )
570 all_segs_okay &= loadsegment(es, n->arch.user_ctxt.es);
572 /*
573 * Either selector != 0 ==> reload.
574 * Also reload to reset FS_BASE if it was non-zero.
575 */
576 if ( unlikely(p->arch.user_ctxt.fs |
577 p->arch.user_ctxt.fs_base |
578 n->arch.user_ctxt.fs) )
579 {
580 all_segs_okay &= loadsegment(fs, n->arch.user_ctxt.fs);
581 if ( p->arch.user_ctxt.fs ) /* != 0 selector kills fs_base */
582 p->arch.user_ctxt.fs_base = 0;
583 }
585 /*
586 * Either selector != 0 ==> reload.
587 * Also reload to reset GS_BASE if it was non-zero.
588 */
589 if ( unlikely(p->arch.user_ctxt.gs |
590 p->arch.user_ctxt.gs_base_user |
591 n->arch.user_ctxt.gs) )
592 {
593 /* Reset GS_BASE with user %gs? */
594 if ( p->arch.user_ctxt.gs || !n->arch.user_ctxt.gs_base_user )
595 all_segs_okay &= loadsegment(gs, n->arch.user_ctxt.gs);
596 if ( p->arch.user_ctxt.gs ) /* != 0 selector kills gs_base_user */
597 p->arch.user_ctxt.gs_base_user = 0;
598 }
600 /* This can only be non-zero if selector is NULL. */
601 if ( n->arch.user_ctxt.fs_base )
602 wrmsr(MSR_FS_BASE,
603 n->arch.user_ctxt.fs_base,
604 n->arch.user_ctxt.fs_base>>32);
606 /* Most kernels have non-zero GS base, so don't bother testing. */
607 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
608 wrmsr(MSR_SHADOW_GS_BASE,
609 n->arch.user_ctxt.gs_base_kernel,
610 n->arch.user_ctxt.gs_base_kernel>>32);
612 /* This can only be non-zero if selector is NULL. */
613 if ( n->arch.user_ctxt.gs_base_user )
614 wrmsr(MSR_GS_BASE,
615 n->arch.user_ctxt.gs_base_user,
616 n->arch.user_ctxt.gs_base_user>>32);
618 /* If in kernel mode then switch the GS bases around. */
619 if ( n->arch.flags & TF_kernel_mode )
620 __asm__ __volatile__ ( "swapgs" );
622 if ( unlikely(!all_segs_okay) )
623 {
624 struct xen_regs *regs = get_execution_context();
625 unsigned long *rsp =
626 (n->arch.flags & TF_kernel_mode) ?
627 (unsigned long *)regs->rsp :
628 (unsigned long *)n->arch.kernel_sp;
630 if ( !(n->arch.flags & TF_kernel_mode) )
631 toggle_guest_mode(n);
632 else
633 regs->cs &= ~3;
635 if ( put_user(regs->ss, rsp- 1) |
636 put_user(regs->rsp, rsp- 2) |
637 put_user(regs->rflags, rsp- 3) |
638 put_user(regs->cs, rsp- 4) |
639 put_user(regs->rip, rsp- 5) |
640 put_user(n->arch.user_ctxt.gs, rsp- 6) |
641 put_user(n->arch.user_ctxt.fs, rsp- 7) |
642 put_user(n->arch.user_ctxt.es, rsp- 8) |
643 put_user(n->arch.user_ctxt.ds, rsp- 9) |
644 put_user(regs->r11, rsp-10) |
645 put_user(regs->rcx, rsp-11) )
646 {
647 DPRINTK("Error while creating failsafe callback frame.\n");
648 domain_crash();
649 }
651 regs->entry_vector = TRAP_syscall;
652 regs->rflags &= 0xFFFCBEFFUL;
653 regs->ss = __GUEST_SS;
654 regs->rsp = (unsigned long)(rsp-11);
655 regs->cs = __GUEST_CS;
656 regs->rip = n->arch.failsafe_address;
657 }
658 }
660 static void save_segments(struct exec_domain *p)
661 {
662 __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (p->arch.user_ctxt.ds) );
663 __asm__ __volatile__ ( "movl %%es,%0" : "=m" (p->arch.user_ctxt.es) );
664 __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (p->arch.user_ctxt.fs) );
665 __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (p->arch.user_ctxt.gs) );
666 }
668 static void clear_segments(void)
669 {
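/*
 * Note: safe_swapgs (defined elsewhere in the asm headers) is a
 * string-literal macro, so it concatenates with the neighbouring
 * strings in this asm statement.
 */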
670 __asm__ __volatile__ (
671 " movl %0,%%ds; "
672 " movl %0,%%es; "
673 " movl %0,%%fs; "
674 " movl %0,%%gs; "
675 ""safe_swapgs" "
676 " movl %0,%%gs"
677 : : "r" (0) );
678 }
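/*
 * Return an x86/64 guest VCPU to user mode: read a 'struct switch_to_user'
 * frame from the guest stack, drop out of guest-kernel mode via
 * toggle_guest_mode(), and reload RIP/RSP/RFLAGS from the saved values.
 * CS/SS RPL is forced to 3 so the guest cannot raise its privilege.
 */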
680 long do_switch_to_user(void)
681 {
682 struct xen_regs *regs = get_execution_context();
683 struct switch_to_user stu;
684 struct exec_domain *ed = current;
686 if ( unlikely(copy_from_user(&stu, (void *)regs->rsp, sizeof(stu))) ||
687 unlikely(pagetable_val(ed->arch.guest_table_user) == 0) )
688 return -EFAULT;
690 toggle_guest_mode(ed);
692 regs->rip = stu.rip;
693 regs->cs = stu.cs | 3; /* force guest privilege */
694 regs->rflags = stu.rflags;
695 regs->rsp = stu.rsp;
696 regs->ss = stu.ss | 3; /* force guest privilege */
698 if ( !(stu.flags & ECF_IN_SYSCALL) )
699 {
700 regs->entry_vector = 0;
701 regs->r11 = stu.r11;
702 regs->rcx = stu.rcx;
703 }
705 /* Saved %rax gets written back to regs->rax in entry.S. */
706 return stu.rax;
707 }
709 #define switch_kernel_stack(_n,_c) ((void)0)
711 #elif defined(__i386__)
713 #define load_segments(_p, _n) ((void)0)
714 #define save_segments(_p) ((void)0)
715 #define clear_segments() ((void)0)
717 static inline void switch_kernel_stack(struct exec_domain *n, unsigned int cpu)
718 {
719 struct tss_struct *tss = &init_tss[cpu];
720 tss->esp1 = n->arch.kernel_sp;
721 tss->ss1 = n->arch.kernel_ss;
722 }
724 #endif
726 #define loaddebug(_ed,_reg) \
727 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_ed)->debugreg[_reg]))
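/*
 * The real register-state switch.  The guest register frame lives at the
 * top of the per-CPU hypervisor stack (get_execution_context()), so
 * switching means copying that frame out to the previous exec_domain and
 * copying the next exec_domain's saved frame back in, then reloading
 * debug registers, the page-table base and the GDT as required.
 */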
729 static void __context_switch(void)
730 {
731 execution_context_t *stack_ec = get_execution_context();
732 unsigned int cpu = smp_processor_id();
733 struct exec_domain *p = percpu_ctxt[cpu].curr_ed;
734 struct exec_domain *n = current;
736 if ( !is_idle_task(p->domain) )
737 {
738 memcpy(&p->arch.user_ctxt,
739 stack_ec,
740 CTXT_SWITCH_STACK_BYTES);
741 unlazy_fpu(p);
742 CLEAR_FAST_TRAP(&p->arch);
743 save_segments(p);
744 }
746 if ( !is_idle_task(n->domain) )
747 {
748 memcpy(stack_ec,
749 &n->arch.user_ctxt,
750 CTXT_SWITCH_STACK_BYTES);
752 /* Maybe switch the debug registers. */
753 if ( unlikely(n->arch.debugreg[7]) )
754 {
755 loaddebug(&n->arch, 0);
756 loaddebug(&n->arch, 1);
757 loaddebug(&n->arch, 2);
758 loaddebug(&n->arch, 3);
759 /* no 4 and 5 */
760 loaddebug(&n->arch, 6);
761 loaddebug(&n->arch, 7);
762 }
764 if ( !VMX_DOMAIN(n) )
765 {
766 SET_FAST_TRAP(&n->arch);
767 switch_kernel_stack(n, cpu);
768 }
769 }
771 if ( p->domain != n->domain )
772 set_bit(cpu, &n->domain->cpuset);
774 write_ptbase(n);
775 __asm__ __volatile__ ( "lgdt %0" : "=m" (*n->arch.gdt) );
777 if ( p->domain != n->domain )
778 clear_bit(cpu, &p->domain->cpuset);
780 percpu_ctxt[cpu].curr_ed = n;
781 }
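/*
 * Lazy context switch: if the next task is the idle domain, or is the
 * exec_domain whose state is already loaded on this CPU, the expensive
 * __context_switch() is skipped and deferred until the loaded state is
 * actually needed (see __sync_lazy_execstate()).
 */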
784 void context_switch(struct exec_domain *prev, struct exec_domain *next)
785 {
786 struct exec_domain *realprev;
788 local_irq_disable();
790 set_current(next);
792 if ( ((realprev = percpu_ctxt[smp_processor_id()].curr_ed) == next) ||
793 is_idle_task(next->domain) )
794 {
795 local_irq_enable();
796 }
797 else
798 {
799 __context_switch();
801 local_irq_enable();
803 if ( !VMX_DOMAIN(next) )
804 {
805 load_LDT(next);
806 load_segments(realprev, next);
807 }
808 }
810 /*
811 * We do this late on because it doesn't need to be protected by the
812 * schedule_lock, and because we want this to be the very last use of
813 * 'prev' (after this point, a dying domain's info structure may be freed
814 * without warning).
815 */
816 clear_bit(EDF_RUNNING, &prev->ed_flags);
818 schedule_tail(next);
820 BUG();
821 }
823 int __sync_lazy_execstate(void)
824 {
825 if ( percpu_ctxt[smp_processor_id()].curr_ed == current )
826 return 0;
827 __context_switch();
828 load_LDT(current);
829 clear_segments();
830 return 1;
831 }
833 void sync_lazy_execstate_cpuset(unsigned long cpuset)
834 {
835 flush_tlb_mask(cpuset);
836 }
838 void sync_lazy_execstate_all(void)
839 {
840 flush_tlb_all();
841 }
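/*
 * Arrange for a preempted hypercall to be restarted.  Outside a multicall
 * this rewinds the guest's EIP/RIP by two bytes so that the 'int 0x82'
 * (x86/32) or 'syscall' (x86/64) instruction is re-executed, with the
 * (possibly updated) arguments written back into the registers used by
 * the hypercall ABI.  Within a multicall, the arguments are instead
 * stashed in the per-CPU mc_state and the call is flagged as preempted.
 */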
843 unsigned long __hypercall_create_continuation(
844 unsigned int op, unsigned int nr_args, ...)
845 {
846 struct mc_state *mcs = &mc_state[smp_processor_id()];
847 execution_context_t *ec;
848 unsigned int i;
849 va_list args;
851 va_start(args, nr_args);
853 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
854 {
855 __set_bit(_MCSF_call_preempted, &mcs->flags);
857 for ( i = 0; i < nr_args; i++ )
858 mcs->call.args[i] = va_arg(args, unsigned long);
859 }
860 else
861 {
862 ec = get_execution_context();
863 #if defined(__i386__)
864 ec->eax = op;
865 ec->eip -= 2; /* re-execute 'int 0x82' */
867 for ( i = 0; i < nr_args; i++ )
868 {
869 switch ( i )
870 {
871 case 0: ec->ebx = va_arg(args, unsigned long); break;
872 case 1: ec->ecx = va_arg(args, unsigned long); break;
873 case 2: ec->edx = va_arg(args, unsigned long); break;
874 case 3: ec->esi = va_arg(args, unsigned long); break;
875 case 4: ec->edi = va_arg(args, unsigned long); break;
876 case 5: ec->ebp = va_arg(args, unsigned long); break;
877 }
878 }
879 #elif defined(__x86_64__)
880 ec->rax = op;
881 ec->rip -= 2; /* re-execute 'syscall' */
883 for ( i = 0; i < nr_args; i++ )
884 {
885 switch ( i )
886 {
887 case 0: ec->rdi = va_arg(args, unsigned long); break;
888 case 1: ec->rsi = va_arg(args, unsigned long); break;
889 case 2: ec->rdx = va_arg(args, unsigned long); break;
890 case 3: ec->r10 = va_arg(args, unsigned long); break;
891 case 4: ec->r8 = va_arg(args, unsigned long); break;
892 case 5: ec->r9 = va_arg(args, unsigned long); break;
893 }
894 }
895 #endif
896 }
898 va_end(args);
900 return op;
901 }
903 #ifdef CONFIG_VMX
904 static void vmx_relinquish_resources(struct exec_domain *ed)
905 {
906 if ( !VMX_DOMAIN(ed) )
907 return;
909 BUG_ON(ed->arch.arch_vmx.vmcs == NULL);
910 free_vmcs(ed->arch.arch_vmx.vmcs);
911 ed->arch.arch_vmx.vmcs = 0;
913 free_monitor_pagetable(ed);
914 rem_ac_timer(&ed->arch.arch_vmx.vmx_platform.vmx_pit.pit_timer);
915 }
916 #else
917 #define vmx_relinquish_resources(_ed) ((void)0)
918 #endif
920 static void relinquish_memory(struct domain *d, struct list_head *list)
921 {
922 struct list_head *ent;
923 struct pfn_info *page;
924 unsigned long x, y;
926 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
927 spin_lock_recursive(&d->page_alloc_lock);
929 ent = list->next;
930 while ( ent != list )
931 {
932 page = list_entry(ent, struct pfn_info, list);
934 /* Grab a reference to the page so it won't disappear from under us. */
935 if ( unlikely(!get_page(page, d)) )
936 {
937 /* Couldn't get a reference -- someone is freeing this page. */
938 ent = ent->next;
939 continue;
940 }
942 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
943 put_page_and_type(page);
945 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
946 put_page(page);
948 /*
949 * Forcibly invalidate base page tables at this point to break circular
950 * 'linear page table' references. This is okay because MMU structures
951 * are not shared across domains and this domain is now dead. Thus base
952 * tables are not in use so a non-zero count means circular reference.
953 */
954 y = page->u.inuse.type_info;
955 for ( ; ; )
956 {
957 x = y;
958 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
959 (PGT_base_page_table|PGT_validated)) )
960 break;
962 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
963 if ( likely(y == x) )
964 {
965 free_page_type(page, PGT_base_page_table);
966 break;
967 }
968 }
970 /* Follow the list chain and /then/ potentially free the page. */
971 ent = ent->next;
972 put_page(page);
973 }
975 spin_unlock_recursive(&d->page_alloc_lock);
976 }
978 void domain_relinquish_resources(struct domain *d)
979 {
980 struct exec_domain *ed;
982 BUG_ON(d->cpuset != 0);
984 ptwr_destroy(d);
986 /* Release device mappings of other domains */
987 gnttab_release_dev_mappings(d->grant_table);
989 /* Drop the in-use references to page-table bases. */
990 for_each_exec_domain ( d, ed )
991 {
992 if ( pagetable_val(ed->arch.guest_table) != 0 )
993 {
994 struct pfn_info *page =
995 &frame_table[pagetable_val(ed->arch.guest_table)>>PAGE_SHIFT];
997 if ( shadow_mode_enabled(d) )
998 put_page(page);
999 else
1000 put_page_and_type(page);
1002 ed->arch.guest_table = mk_pagetable(0);
1003 }
1005 if ( pagetable_val(ed->arch.guest_table_user) != 0 )
1006 {
1007 struct pfn_info *page =
1008 &frame_table[pagetable_val(ed->arch.guest_table_user)
1009 >> PAGE_SHIFT];
1011 if ( shadow_mode_enabled(d) )
1012 put_page(page);
1013 else
1014 put_page_and_type(page);
1016 ed->arch.guest_table_user = mk_pagetable(0);
1017 }
1019 vmx_relinquish_resources(ed);
1020 }
1022 /* Exit shadow mode before deconstructing final guest page table. */
1023 shadow_mode_destroy(d);
1025 /*
1026 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
1027 * it automatically gets squashed when the guest's mappings go away.
1028 */
1029 for_each_exec_domain(d, ed)
1030 destroy_gdt(ed);
1032 /* Relinquish every page of memory. */
1033 relinquish_memory(d, &d->xenpage_list);
1034 relinquish_memory(d, &d->page_list);
1035 }
1038 /*
1039 * Local variables:
1040 * mode: C
1041 * c-set-style: "BSD"
1042 * c-basic-offset: 4
1043 * tab-width: 4
1044 * indent-tabs-mode: nil
1045 * End:
1046 */