
view xen/arch/x86/domain.c @ 3661:060c1ea52343

bitkeeper revision 1.1159.212.73 (420154ceFUvIANCrxSTgPyOjFi1Pag)

More x86_64 work. Interrupts and exceptions are now working. Next step is
DOM0 construction. First part of that is to map rest of physical memory,
allocate and map the mach_to_phys table, and fix arch_init_memory().
Signed-off-by: keir.fraser@cl.cam.ac.uk
author kaf24@viper.(none)
date Wed Feb 02 22:31:42 2005 +0000 (2005-02-02)
parents 0ef6e8e6e85d
children d55d523078f7
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/smp.h>
20 #include <xen/delay.h>
21 #include <xen/softirq.h>
22 #include <asm/regs.h>
23 #include <asm/mc146818rtc.h>
24 #include <asm/system.h>
25 #include <asm/io.h>
26 #include <asm/processor.h>
27 #include <asm/desc.h>
28 #include <asm/i387.h>
29 #include <asm/mpspec.h>
30 #include <asm/ldt.h>
31 #include <xen/irq.h>
32 #include <xen/event.h>
33 #include <asm/shadow.h>
34 #include <xen/console.h>
35 #include <xen/elf.h>
36 #include <asm/vmx.h>
37 #include <asm/vmx_vmcs.h>
38 #include <xen/kernel.h>
39 #include <public/io/ioreq.h>
40 #include <xen/multicall.h>
42 /* opt_noreboot: If true, machine will need manual reset on error. */
43 static int opt_noreboot = 0;
44 boolean_param("noreboot", opt_noreboot);
46 #if !defined(CONFIG_X86_64BITMODE)
47 /* No ring-3 access in initial page tables. */
48 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
49 #else
50 /* Allow ring-3 access in long mode as guest cannot use ring 1. */
51 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
52 #endif
53 #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
54 #define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
55 #define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
57 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
58 #define round_pgdown(_p) ((_p)&PAGE_MASK)
60 static void default_idle(void)
61 {
62 __cli();
63 if ( !softirq_pending(smp_processor_id()) )
64 safe_halt();
65 else
66 __sti();
67 }
69 static __attribute_used__ void idle_loop(void)
70 {
71 int cpu = smp_processor_id();
72 for ( ; ; )
73 {
74 irq_stat[cpu].idle_timestamp = jiffies;
75 while ( !softirq_pending(cpu) )
76 default_idle();
77 do_softirq();
78 }
79 }
81 void startup_cpu_idle_loop(void)
82 {
83 /* Just some sanity to ensure that the scheduler is set up okay. */
84 ASSERT(current->domain->id == IDLE_DOMAIN_ID);
85 domain_unpause_by_systemcontroller(current->domain);
86 __enter_scheduler();
88 /*
89 * Declare CPU setup done to the boot processor.
90 * A memory barrier ensures that state is visible before we proceed.
91 */
92 smp_mb();
93 init_idle();
95 idle_loop();
96 }
98 static long no_idt[2];
99 static int reboot_mode;
100 int reboot_thru_bios = 0;
102 #ifdef CONFIG_SMP
103 int reboot_smp = 0;
104 static int reboot_cpu = -1;
105 /* shamelessly grabbed from lib/vsprintf.c for readability */
106 #define is_digit(c) ((c) >= '0' && (c) <= '9')
107 #endif
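/* Poll the keyboard controller until its input buffer drains (or we give up). */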
110 static inline void kb_wait(void)
111 {
112 int i;
114 for (i=0; i<0x10000; i++)
115 if ((inb_p(0x64) & 0x02) == 0)
116 break;
117 }
120 void machine_restart(char * __unused)
121 {
122 #ifdef CONFIG_SMP
123 int cpuid;
124 #endif
126 if ( opt_noreboot )
127 {
128 printk("Reboot disabled on cmdline: require manual reset\n");
129 for ( ; ; ) __asm__ __volatile__ ("hlt");
130 }
132 #ifdef CONFIG_SMP
133 cpuid = GET_APIC_ID(apic_read(APIC_ID));
135 /* KAF: Need interrupts enabled for safe IPI. */
136 __sti();
138 if (reboot_smp) {
140 /* check to see if reboot_cpu is valid
141 if it's not, default to the BSP */
142 if ((reboot_cpu == -1) ||
143 (reboot_cpu > (NR_CPUS -1)) ||
144 !(phys_cpu_present_map & (1<<cpuid)))
145 reboot_cpu = boot_cpu_physical_apicid;
147 reboot_smp = 0; /* use this as a flag to only go through this once */
148 /* re-run this function on the other CPUs
149 it will fall through this section since we have
150 cleared reboot_smp, and do the reboot if it is the
151 correct CPU, otherwise it halts. */
152 if (reboot_cpu != cpuid)
153 smp_call_function((void *)machine_restart , NULL, 1, 0);
154 }
156 /* if reboot_cpu is still -1, then we want a traditional reboot,
157 and if we are not running on the reboot_cpu, halt */
158 if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
159 for (;;)
160 __asm__ __volatile__ ("hlt");
161 }
162 /*
163 * Stop all CPUs and turn off local APICs and the IO-APIC, so
164 * other OSs see a clean IRQ state.
165 */
166 smp_send_stop();
167 disable_IO_APIC();
168 #endif
169 #ifdef CONFIG_VMX
170 stop_vmx();
171 #endif
173 if(!reboot_thru_bios) {
174 /* rebooting needs to touch the page at absolute addr 0 */
175 *((unsigned short *)__va(0x472)) = reboot_mode;
176 for (;;) {
177 int i;
178 for (i=0; i<100; i++) {
179 kb_wait();
180 udelay(50);
181 outb(0xfe,0x64); /* pulse reset low */
182 udelay(50);
183 }
184 /* That didn't work - force a triple fault.. */
185 __asm__ __volatile__("lidt %0": "=m" (no_idt));
186 __asm__ __volatile__("int3");
187 }
188 }
190 panic("Need to reinclude BIOS reboot code\n");
191 }
194 void __attribute__((noreturn)) __machine_halt(void *unused)
195 {
196 for ( ; ; )
197 __asm__ __volatile__ ( "cli; hlt" );
198 }
200 void machine_halt(void)
201 {
202 smp_call_function(__machine_halt, NULL, 1, 1);
203 __machine_halt(NULL);
204 }
206 void dump_pageframe_info(struct domain *d)
207 {
208 struct pfn_info *page;
210 if ( d->tot_pages < 10 )
211 {
212 list_for_each_entry ( page, &d->page_list, list )
213 {
214 printk("Page %08x: caf=%08x, taf=%08x\n",
215 page_to_phys(page), page->count_info,
216 page->u.inuse.type_info);
217 }
218 }
220 page = virt_to_page(d->shared_info);
221 printk("Shared_info@%08x: caf=%08x, taf=%08x\n",
222 page_to_phys(page), page->count_info,
223 page->u.inuse.type_info);
224 }
226 struct domain *arch_alloc_domain_struct(void)
227 {
228 return xmalloc(struct domain);
229 }
231 void arch_free_domain_struct(struct domain *d)
232 {
233 xfree(d);
234 }
236 struct exec_domain *arch_alloc_exec_domain_struct(void)
237 {
238 return xmalloc(struct exec_domain);
239 }
241 void arch_free_exec_domain_struct(struct exec_domain *ed)
242 {
243 xfree(ed);
244 }
246 void free_perdomain_pt(struct domain *d)
247 {
248 free_xenheap_page((unsigned long)d->mm_perdomain_pt);
249 }
251 static void continue_idle_task(struct exec_domain *ed)
252 {
253 reset_stack_and_jump(idle_loop);
254 }
256 static void continue_nonidle_task(struct exec_domain *ed)
257 {
258 reset_stack_and_jump(ret_from_intr);
259 }
261 void arch_do_createdomain(struct exec_domain *ed)
262 {
263 struct domain *d = ed->domain;
264 #ifdef ARCH_HAS_FAST_TRAP
265 SET_DEFAULT_FAST_TRAP(&ed->thread);
266 #endif
268 if ( d->id == IDLE_DOMAIN_ID )
269 {
270 ed->thread.schedule_tail = continue_idle_task;
271 }
272 else
273 {
274 ed->thread.schedule_tail = continue_nonidle_task;
276 d->shared_info = (void *)alloc_xenheap_page();
277 memset(d->shared_info, 0, PAGE_SIZE);
278 ed->vcpu_info = &d->shared_info->vcpu_data[ed->eid];
279 d->shared_info->arch.mfn_to_pfn_start = m2p_start_mfn;
280 SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
281 machine_to_phys_mapping[virt_to_phys(d->shared_info) >>
282 PAGE_SHIFT] = INVALID_P2M_ENTRY;
284 d->mm_perdomain_pt = (l1_pgentry_t *)alloc_xenheap_page();
285 memset(d->mm_perdomain_pt, 0, PAGE_SIZE);
286 machine_to_phys_mapping[virt_to_phys(d->mm_perdomain_pt) >>
287 PAGE_SHIFT] = INVALID_P2M_ENTRY;
288 ed->mm.perdomain_ptes = d->mm_perdomain_pt;
289 }
290 }
292 #ifdef CONFIG_VMX
293 void arch_vmx_do_resume(struct exec_domain *ed)
294 {
295 u64 vmcs_phys_ptr = (u64) virt_to_phys(ed->thread.arch_vmx.vmcs);
297 load_vmcs(&ed->thread.arch_vmx, vmcs_phys_ptr);
298 vmx_do_resume(ed);
299 reset_stack_and_jump(vmx_asm_do_resume);
300 }
302 void arch_vmx_do_launch(struct exec_domain *ed)
303 {
304 u64 vmcs_phys_ptr = (u64) virt_to_phys(ed->thread.arch_vmx.vmcs);
306 load_vmcs(&ed->thread.arch_vmx, vmcs_phys_ptr);
307 vmx_do_launch(ed);
308 reset_stack_and_jump(vmx_asm_do_launch);
309 }
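/*
 * Build the 'monitor' page table used while this VMX guest executes: a fresh
 * L2 table whose hypervisor entries are copied from idle_pg_table and which
 * maps the domain's perdomain_pt, with the mm switched to SHM_full_32
 * shadow mode.
 */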
311 static void monitor_mk_pagetable(struct exec_domain *ed)
312 {
313 unsigned long mpfn;
314 l2_pgentry_t *mpl2e;
315 struct pfn_info *mpfn_info;
316 struct mm_struct *m = &ed->mm;
317 struct domain *d = ed->domain;
319 mpfn_info = alloc_domheap_page(NULL);
320 ASSERT( mpfn_info );
322 mpfn = (unsigned long) (mpfn_info - frame_table);
323 mpl2e = (l2_pgentry_t *) map_domain_mem(mpfn << L1_PAGETABLE_SHIFT);
324 memset(mpl2e, 0, PAGE_SIZE);
326 memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
327 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
328 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
330 m->monitor_table = mk_pagetable(mpfn << L1_PAGETABLE_SHIFT);
331 m->shadow_mode = SHM_full_32;
333 mpl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
334 mk_l2_pgentry((__pa(d->mm_perdomain_pt) & PAGE_MASK)
335 | __PAGE_HYPERVISOR);
337 unmap_domain_mem(mpl2e);
338 }
340 /*
341 * Free the pages for monitor_table and guest_pl2e_cache
342 */
343 static void monitor_rm_pagetable(struct exec_domain *ed)
344 {
345 struct mm_struct *m = &ed->mm;
346 l2_pgentry_t *mpl2e;
347 unsigned long mpfn;
349 mpl2e = (l2_pgentry_t *) map_domain_mem(pagetable_val(m->monitor_table));
350 /*
351 * First get the pfn for guest_pl2e_cache by looking at monitor_table
352 */
353 mpfn = l2_pgentry_val(mpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])
354 >> PAGE_SHIFT;
356 free_domheap_page(&frame_table[mpfn]);
357 unmap_domain_mem(mpl2e);
359 /*
360 * Then free monitor_table.
361 */
362 mpfn = (pagetable_val(m->monitor_table)) >> PAGE_SHIFT;
363 free_domheap_page(&frame_table[mpfn]);
365 m->monitor_table = mk_pagetable(0);
366 }
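/*
 * Create and populate a VMCS for this exec_domain, build its monitor page
 * table, and arrange for it to enter the guest via the VMX launch path.
 */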
368 static int vmx_final_setup_guestos(struct exec_domain *ed,
369 full_execution_context_t *full_context)
370 {
371 int error;
372 execution_context_t *context;
373 struct vmcs_struct *vmcs;
375 context = &full_context->cpu_ctxt;
377 /*
378 * Create a new VMCS
379 */
380 if (!(vmcs = alloc_vmcs())) {
381 printk("Failed to create a new VMCS\n");
382 return -ENOMEM;
383 }
385 memset(&ed->thread.arch_vmx, 0, sizeof (struct arch_vmx_struct));
387 ed->thread.arch_vmx.vmcs = vmcs;
388 error = construct_vmcs(&ed->thread.arch_vmx, context, full_context, VMCS_USE_HOST_ENV);
389 if (error < 0) {
390 printk("Failed to construct a new VMCS\n");
391 goto out;
392 }
394 monitor_mk_pagetable(ed);
395 ed->thread.schedule_tail = arch_vmx_do_launch;
396 clear_bit(VMX_CPU_STATE_PG_ENABLED, &ed->thread.arch_vmx.cpu_state);
398 #if defined (__i386)
399 ed->thread.arch_vmx.vmx_platform.real_mode_data =
400 (unsigned long *) context->esi;
401 #endif
403 if (ed == ed->domain->exec_domain[0]) {
404 /*
405 * Required to do this once per domain
406 */
407 memset(&ed->domain->shared_info->evtchn_mask[0], 0xff,
408 sizeof(ed->domain->shared_info->evtchn_mask));
409 clear_bit(IOPACKET_PORT, &ed->domain->shared_info->evtchn_mask[0]);
410 }
412 return 0;
414 out:
415 free_vmcs(vmcs);
416 ed->thread.arch_vmx.vmcs = 0;
417 return error;
418 }
419 #endif
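/*
 * Complete guest-OS setup from the builder-supplied context: register and
 * FPU state, trap table, fast trap, LDT/GDT, debug registers, event and
 * failsafe callbacks, and the page-table base (VMX setup if requested).
 */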
421 int arch_final_setup_guestos(struct exec_domain *d, full_execution_context_t *c)
422 {
423 unsigned long phys_basetab;
424 int i, rc;
426 clear_bit(EDF_DONEFPUINIT, &d->ed_flags);
427 if ( c->flags & ECF_I387_VALID )
428 set_bit(EDF_DONEFPUINIT, &d->ed_flags);
430 memcpy(&d->thread.user_ctxt,
431 &c->cpu_ctxt,
432 sizeof(d->thread.user_ctxt));
434 /* Clear IOPL for unprivileged domains. */
435 if (!IS_PRIV(d->domain))
436 d->thread.user_ctxt.eflags &= 0xffffcfff;
438 /*
439 * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
440 * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
441 * If SS RPL or DPL differs from CS RPL then we'll #GP.
442 */
443 if (!(c->flags & ECF_VMX_GUEST))
444 if ( ((d->thread.user_ctxt.cs & 3) == 0) ||
445 ((d->thread.user_ctxt.ss & 3) == 0) )
446 return -EINVAL;
448 memcpy(&d->thread.i387,
449 &c->fpu_ctxt,
450 sizeof(d->thread.i387));
452 memcpy(d->thread.traps,
453 &c->trap_ctxt,
454 sizeof(d->thread.traps));
456 #ifdef ARCH_HAS_FAST_TRAP
457 if ( (rc = (int)set_fast_trap(d, c->fast_trap_idx)) != 0 )
458 return rc;
459 #endif
461 d->mm.ldt_base = c->ldt_base;
462 d->mm.ldt_ents = c->ldt_ents;
464 d->thread.guestos_ss = c->guestos_ss;
465 d->thread.guestos_sp = c->guestos_esp;
467 for ( i = 0; i < 8; i++ )
468 (void)set_debugreg(d, i, c->debugreg[i]);
470 d->thread.event_selector = c->event_callback_cs;
471 d->thread.event_address = c->event_callback_eip;
472 d->thread.failsafe_selector = c->failsafe_callback_cs;
473 d->thread.failsafe_address = c->failsafe_callback_eip;
475 phys_basetab = c->pt_base;
476 d->mm.pagetable = mk_pagetable(phys_basetab);
477 if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d->domain,
478 PGT_base_page_table) )
479 return -EINVAL;
481 /* Failure to set GDT is harmless. */
482 SET_GDT_ENTRIES(d, DEFAULT_GDT_ENTRIES);
483 SET_GDT_ADDRESS(d, DEFAULT_GDT_ADDRESS);
484 if ( c->gdt_ents != 0 )
485 {
486 if ( (rc = (int)set_gdt(d, c->gdt_frames, c->gdt_ents)) != 0 )
487 {
488 put_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT]);
489 return rc;
490 }
491 }
493 #ifdef CONFIG_VMX
494 if (c->flags & ECF_VMX_GUEST)
495 return vmx_final_setup_guestos(d, c);
496 #endif
498 return 0;
499 }
501 #if defined(__i386__) /* XXX */
503 void new_thread(struct exec_domain *d,
504 unsigned long start_pc,
505 unsigned long start_stack,
506 unsigned long start_info)
507 {
508 execution_context_t *ec = &d->thread.user_ctxt;
510 /*
511 * Initial register values:
512 * DS,ES,FS,GS = FLAT_RING1_DS
513 * CS:EIP = FLAT_RING1_CS:start_pc
514 * SS:ESP = FLAT_RING1_DS:start_stack
515 * ESI = start_info
516 * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
517 */
518 ec->ds = ec->es = ec->fs = ec->gs = ec->ss = FLAT_RING1_DS;
519 ec->cs = FLAT_RING1_CS;
520 ec->eip = start_pc;
521 ec->esp = start_stack;
522 ec->esi = start_info;
524 __save_flags(ec->eflags);
525 ec->eflags |= X86_EFLAGS_IF;
526 }
529 /*
530 * This special macro can be used to load a debugging register
531 */
532 #define loaddebug(thread,register) \
533 __asm__("movl %0,%%db" #register \
534 : /* no output */ \
535 :"r" (thread->debugreg[register]))
538 void switch_to(struct exec_domain *prev_p, struct exec_domain *next_p)
539 {
540 struct thread_struct *next = &next_p->thread;
541 struct tss_struct *tss = init_tss + smp_processor_id();
542 execution_context_t *stack_ec = get_execution_context();
543 int i;
544 unsigned long vmx_domain = next_p->thread.arch_vmx.flags;
546 __cli();
548 /* Switch guest general-register state. */
549 if ( !is_idle_task(prev_p->domain) )
550 {
551 memcpy(&prev_p->thread.user_ctxt,
552 stack_ec,
553 sizeof(*stack_ec));
554 unlazy_fpu(prev_p);
555 CLEAR_FAST_TRAP(&prev_p->thread);
556 }
558 if ( !is_idle_task(next_p->domain) )
559 {
560 memcpy(stack_ec,
561 &next_p->thread.user_ctxt,
562 sizeof(*stack_ec));
564 /* Maybe switch the debug registers. */
565 if ( unlikely(next->debugreg[7]) )
566 {
567 loaddebug(next, 0);
568 loaddebug(next, 1);
569 loaddebug(next, 2);
570 loaddebug(next, 3);
571 /* no 4 and 5 */
572 loaddebug(next, 6);
573 loaddebug(next, 7);
574 }
576 if (vmx_domain) {
577 /* Switch page tables. */
578 write_ptbase(&next_p->mm);
580 set_current(next_p);
581 /* Switch GDT and LDT. */
582 __asm__ __volatile__ ("lgdt %0" : "=m" (*next_p->mm.gdt));
584 __sti();
585 return;
586 }
588 SET_FAST_TRAP(&next_p->thread);
590 /* Switch the guest OS ring-1 stack. */
591 tss->esp1 = next->guestos_sp;
592 tss->ss1 = next->guestos_ss;
594 /* Switch page tables. */
595 write_ptbase(&next_p->mm);
596 }
598 if ( unlikely(prev_p->thread.io_bitmap != NULL) )
599 {
600 for ( i = 0; i < sizeof(prev_p->thread.io_bitmap_sel) * 8; i++ )
601 if ( !test_bit(i, &prev_p->thread.io_bitmap_sel) )
602 memset(&tss->io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
603 ~0U, IOBMP_BYTES_PER_SELBIT);
604 tss->bitmap = IOBMP_INVALID_OFFSET;
605 }
607 if ( unlikely(next_p->thread.io_bitmap != NULL) )
608 {
609 for ( i = 0; i < sizeof(next_p->thread.io_bitmap_sel) * 8; i++ )
610 if ( !test_bit(i, &next_p->thread.io_bitmap_sel) )
611 memcpy(&tss->io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
612 &next_p->thread.io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
613 IOBMP_BYTES_PER_SELBIT);
614 tss->bitmap = IOBMP_OFFSET;
615 }
617 set_current(next_p);
619 /* Switch GDT and LDT. */
620 __asm__ __volatile__ ("lgdt %0" : "=m" (*next_p->mm.gdt));
621 load_LDT(next_p);
623 __sti();
624 }
627 /* XXX Currently the 'domain' field is ignored! XXX */
628 long do_iopl(domid_t domain, unsigned int new_io_pl)
629 {
630 execution_context_t *ec = get_execution_context();
631 ec->eflags = (ec->eflags & 0xffffcfff) | ((new_io_pl&3) << 12);
632 return 0;
633 }
635 #endif
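/*
 * Arrange for the current hypercall to be restarted after a preemption:
 * inside a multicall we mark the call as preempted and stash its arguments;
 * otherwise we back EIP up over 'int 0x82' and reload the argument
 * registers so the hypercall re-executes from guest context.
 */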
637 unsigned long hypercall_create_continuation(
638 unsigned int op, unsigned int nr_args, ...)
639 {
640 struct mc_state *mcs = &mc_state[smp_processor_id()];
641 execution_context_t *ec;
642 unsigned long *preg;
643 unsigned int i;
644 va_list args;
646 va_start(args, nr_args);
648 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
649 {
650 __set_bit(_MCSF_call_preempted, &mcs->flags);
652 for ( i = 0; i < nr_args; i++ )
653 mcs->call.args[i] = va_arg(args, unsigned long);
654 }
655 else
656 {
657 ec = get_execution_context();
658 #if defined(__i386__)
659 ec->eax = op;
660 ec->eip -= 2; /* re-execute 'int 0x82' */
662 for ( i = 0, preg = &ec->ebx; i < nr_args; i++, preg++ )
663 *preg = va_arg(args, unsigned long);
664 #else
665 preg = NULL; /* XXX x86/64 */
666 #endif
667 }
669 va_end(args);
671 return op;
672 }
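/*
 * Drop the references held on every page in 'list' (the allocation and any
 * pinned type reference, plus a forced invalidation of base page tables to
 * break circular 'linear page table' references) so the dying domain's
 * pages can be freed.
 */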
674 static void relinquish_list(struct domain *d, struct list_head *list)
675 {
676 struct list_head *ent;
677 struct pfn_info *page;
678 unsigned long x, y;
680 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
681 spin_lock_recursive(&d->page_alloc_lock);
683 ent = list->next;
684 while ( ent != list )
685 {
686 page = list_entry(ent, struct pfn_info, list);
688 /* Grab a reference to the page so it won't disappear from under us. */
689 if ( unlikely(!get_page(page, d)) )
690 {
691 /* Couldn't get a reference -- someone is freeing this page. */
692 ent = ent->next;
693 continue;
694 }
696 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
697 put_page_and_type(page);
699 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
700 put_page(page);
702 /*
703 * Forcibly invalidate base page tables at this point to break circular
704 * 'linear page table' references. This is okay because MMU structures
705 * are not shared across domains and this domain is now dead. Thus base
706 * tables are not in use so a non-zero count means circular reference.
707 */
708 y = page->u.inuse.type_info;
709 for ( ; ; )
710 {
711 x = y;
712 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
713 (PGT_base_page_table|PGT_validated)) )
714 break;
716 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
717 if ( likely(y == x) )
718 {
719 free_page_type(page, PGT_base_page_table);
720 break;
721 }
722 }
724 /* Follow the list chain and /then/ potentially free the page. */
725 ent = ent->next;
726 put_page(page);
727 }
729 spin_unlock_recursive(&d->page_alloc_lock);
730 }
732 #ifdef CONFIG_VMX
733 static void vmx_domain_relinquish_memory(struct exec_domain *ed)
734 {
735 struct domain *d = ed->domain;
737 /*
738 * Free VMCS
739 */
740 ASSERT(ed->thread.arch_vmx.vmcs);
741 free_vmcs(ed->thread.arch_vmx.vmcs);
742 ed->thread.arch_vmx.vmcs = 0;
744 monitor_rm_pagetable(ed);
746 if (ed == d->exec_domain[0]) {
747 int i;
748 unsigned long pfn;
750 for (i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++) {
751 unsigned long l1e;
753 l1e = l1_pgentry_val(d->mm_perdomain_pt[i]);
754 if (l1e & _PAGE_PRESENT) {
755 pfn = l1e >> PAGE_SHIFT;
756 free_domheap_page(&frame_table[pfn]);
757 }
758 }
759 }
761 }
762 #endif
764 void domain_relinquish_memory(struct domain *d)
765 {
766 struct exec_domain *ed;
768 /* Ensure that no one is running over the dead domain's page tables. */
769 synchronise_pagetables(~0UL);
771 /* Exit shadow mode before deconstructing final guest page table. */
772 shadow_mode_disable(d);
774 /* Drop the in-use reference to the page-table base. */
775 for_each_exec_domain ( d, ed )
776 {
777 if ( pagetable_val(ed->mm.pagetable) != 0 )
778 put_page_and_type(&frame_table[pagetable_val(ed->mm.pagetable) >>
779 PAGE_SHIFT]);
780 }
782 #ifdef CONFIG_VMX
783 if ( VMX_DOMAIN(d->exec_domain[0]) )
784 for_each_exec_domain ( d, ed )
785 vmx_domain_relinquish_memory(ed);
786 #endif
788 /*
789 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
790 * it automatically gets squashed when the guest's mappings go away.
791 */
792 for_each_exec_domain(d, ed)
793 destroy_gdt(ed);
795 /* Relinquish every page of memory. */
796 relinquish_list(d, &d->xenpage_list);
797 relinquish_list(d, &d->page_list);
798 }
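/*
 * Build domain 0: allocate its memory, construct the initial page tables,
 * copy in the kernel and initrd images, write the phys<->machine tables and
 * start_info, and prime the first exec_domain to start at the kernel entry.
 */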
801 int construct_dom0(struct domain *p,
802 unsigned long alloc_start,
803 unsigned long alloc_end,
804 char *image_start, unsigned long image_len,
805 char *initrd_start, unsigned long initrd_len,
806 char *cmdline)
807 {
808 char *dst;
809 int i, rc;
810 unsigned long pfn, mfn;
811 unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT;
812 unsigned long nr_pt_pages;
813 unsigned long count;
814 l2_pgentry_t *l2tab, *l2start;
815 l1_pgentry_t *l1tab = NULL, *l1start = NULL;
816 struct pfn_info *page = NULL;
817 start_info_t *si;
818 struct exec_domain *ed = p->exec_domain[0];
820 /*
821 * This fully describes the memory layout of the initial domain. All
822 * *_start addresses are page-aligned, except v_start (and v_end) which are
823 * superpage-aligned.
824 */
825 struct domain_setup_info dsi;
826 unsigned long vinitrd_start;
827 unsigned long vinitrd_end;
828 unsigned long vphysmap_start;
829 unsigned long vphysmap_end;
830 unsigned long vstartinfo_start;
831 unsigned long vstartinfo_end;
832 unsigned long vstack_start;
833 unsigned long vstack_end;
834 unsigned long vpt_start;
835 unsigned long vpt_end;
836 unsigned long v_end;
838 /* Machine address of next candidate page-table page. */
839 unsigned long mpt_alloc;
841 extern void physdev_init_dom0(struct domain *);
843 /* Sanity! */
844 if ( p->id != 0 )
845 BUG();
846 if ( test_bit(DF_CONSTRUCTED, &p->d_flags) )
847 BUG();
849 memset(&dsi, 0, sizeof(struct domain_setup_info));
851 printk("*** LOADING DOMAIN 0 ***\n");
853 /*
854 * This is all a bit grim. We've moved the modules to the "safe" physical
855 * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this
856 * routine we're going to copy them down into the region that's actually
857 * been allocated to domain 0. This is highly likely to be overlapping, so
858 * we use a forward copy.
859 *
860 * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with
861 * 4GB and lots of network/disk cards that allocate loads of buffers.
862 * We'll have to revisit this if we ever support PAE (64GB).
863 */
865 rc = parseelfimage(image_start, image_len, &dsi);
866 if ( rc != 0 )
867 return rc;
869 /* Set up domain options */
870 if ( dsi.use_writable_pagetables )
871 vm_assist(p, VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
873 if ( (dsi.v_start & (PAGE_SIZE-1)) != 0 )
874 {
875 printk("Initial guest OS must load to a page boundary.\n");
876 return -EINVAL;
877 }
879 /*
880 * Why do we need this? The number of page-table frames depends on the
881 * size of the bootstrap address space. But the size of the address space
882 * depends on the number of page-table frames (since each one is mapped
883 * read-only). We have a pair of simultaneous equations in two unknowns,
884 * which we solve by exhaustive search.
885 */
886 vinitrd_start = round_pgup(dsi.v_kernend);
887 vinitrd_end = vinitrd_start + initrd_len;
888 vphysmap_start = round_pgup(vinitrd_end);
889 vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long));
890 vpt_start = round_pgup(vphysmap_end);
891 for ( nr_pt_pages = 2; ; nr_pt_pages++ )
892 {
893 vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
894 vstartinfo_start = vpt_end;
895 vstartinfo_end = vstartinfo_start + PAGE_SIZE;
896 vstack_start = vstartinfo_end;
897 vstack_end = vstack_start + PAGE_SIZE;
898 v_end = (vstack_end + (1<<22)-1) & ~((1<<22)-1);
899 if ( (v_end - vstack_end) < (512 << 10) )
900 v_end += 1 << 22; /* Add extra 4MB to get >= 512kB padding. */
901 if ( (((v_end - dsi.v_start + ((1<<L2_PAGETABLE_SHIFT)-1)) >>
902 L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
903 break;
904 }
906 printk("PHYSICAL MEMORY ARRANGEMENT:\n"
907 " Kernel image: %p->%p\n"
908 " Initrd image: %p->%p\n"
909 " Dom0 alloc.: %08lx->%08lx\n",
910 image_start, image_start + image_len,
911 initrd_start, initrd_start + initrd_len,
912 alloc_start, alloc_end);
913 printk("VIRTUAL MEMORY ARRANGEMENT:\n"
914 " Loaded kernel: %08lx->%08lx\n"
915 " Init. ramdisk: %08lx->%08lx\n"
916 " Phys-Mach map: %08lx->%08lx\n"
917 " Page tables: %08lx->%08lx\n"
918 " Start info: %08lx->%08lx\n"
919 " Boot stack: %08lx->%08lx\n"
920 " TOTAL: %08lx->%08lx\n",
921 dsi.v_kernstart, dsi.v_kernend,
922 vinitrd_start, vinitrd_end,
923 vphysmap_start, vphysmap_end,
924 vpt_start, vpt_end,
925 vstartinfo_start, vstartinfo_end,
926 vstack_start, vstack_end,
927 dsi.v_start, v_end);
928 printk(" ENTRY ADDRESS: %08lx\n", dsi.v_kernentry);
930 if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
931 {
932 printk("Initial guest OS requires too much space\n"
933 "(%luMB is greater than %luMB limit)\n",
934 (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
935 return -ENOMEM;
936 }
938 /*
939 * Protect the lowest 1GB of memory. We use a temporary mapping there
940 * from which we copy the kernel and ramdisk images.
941 */
942 if ( dsi.v_start < (1<<30) )
943 {
944 printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
945 return -EINVAL;
946 }
948 /* Paranoia: scrub DOM0's memory allocation. */
949 printk("Scrubbing DOM0 RAM: ");
950 dst = (char *)alloc_start;
951 while ( dst < (char *)alloc_end )
952 {
953 #define SCRUB_BYTES (100 * 1024 * 1024) /* 100MB */
954 printk(".");
955 touch_nmi_watchdog();
956 if ( ((char *)alloc_end - dst) > SCRUB_BYTES )
957 {
958 memset(dst, 0, SCRUB_BYTES);
959 dst += SCRUB_BYTES;
960 }
961 else
962 {
963 memset(dst, 0, (char *)alloc_end - dst);
964 break;
965 }
966 }
967 printk("done.\n");
969 /* Construct a frame-allocation list for the initial domain. */
970 for ( mfn = (alloc_start>>PAGE_SHIFT);
971 mfn < (alloc_end>>PAGE_SHIFT);
972 mfn++ )
973 {
974 page = &frame_table[mfn];
975 page->u.inuse.domain = p;
976 page->u.inuse.type_info = 0;
977 page->count_info = PGC_allocated | 1;
978 list_add_tail(&page->list, &p->page_list);
979 p->tot_pages++; p->max_pages++;
980 }
982 mpt_alloc = (vpt_start - dsi.v_start) + alloc_start;
984 SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES);
985 SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS);
987 /*
988 * We're basically forcing default RPLs to 1, so that our "what privilege
989 * level are we returning to?" logic works.
990 */
991 ed->thread.failsafe_selector = FLAT_GUESTOS_CS;
992 ed->thread.event_selector = FLAT_GUESTOS_CS;
993 ed->thread.guestos_ss = FLAT_GUESTOS_DS;
994 for ( i = 0; i < 256; i++ )
995 ed->thread.traps[i].cs = FLAT_GUESTOS_CS;
997 /* WARNING: The new domain must have its 'processor' field filled in! */
998 l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
999 memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
1000 l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
1001 mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR);
1002 l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
1003 mk_l2_pgentry(__pa(p->mm_perdomain_pt) | __PAGE_HYPERVISOR);
1004 ed->mm.pagetable = mk_pagetable((unsigned long)l2start);
1006 l2tab += l2_table_offset(dsi.v_start);
1007 mfn = alloc_start >> PAGE_SHIFT;
1008 for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
1009 {
1010 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
1011 {
1012 l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
1013 mpt_alloc += PAGE_SIZE;
1014 *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT);
1015 clear_page(l1tab);
1016 if ( count == 0 )
1017 l1tab += l1_table_offset(dsi.v_start);
1018 }
1019 *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);
1021 page = &frame_table[mfn];
1022 if ( !get_page_and_type(page, p, PGT_writable_page) )
1023 BUG();
1025 mfn++;
1026 }
1028 /* Pages that are part of page tables must be read only. */
1029 l2tab = l2start + l2_table_offset(vpt_start);
1030 l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
1031 l1tab += l1_table_offset(vpt_start);
1032 l2tab++;
1033 for ( count = 0; count < nr_pt_pages; count++ )
1034 {
1035 *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
1036 page = &frame_table[l1_pgentry_to_pagenr(*l1tab)];
1037 if ( count == 0 )
1038 {
1039 page->u.inuse.type_info &= ~PGT_type_mask;
1040 page->u.inuse.type_info |= PGT_l2_page_table;
1042 /*
1043 * No longer writable: decrement the type_count.
1044 * Installed as CR3: increment both the ref_count and type_count.
1045 * Net: just increment the ref_count.
1046 */
1047 get_page(page, p); /* an extra ref because of readable mapping */
1049 /* Get another ref to L2 page so that it can be pinned. */
1050 if ( !get_page_and_type(page, p, PGT_l2_page_table) )
1051 BUG();
1052 set_bit(_PGT_pinned, &page->u.inuse.type_info);
1053 }
1054 else
1055 {
1056 page->u.inuse.type_info &= ~PGT_type_mask;
1057 page->u.inuse.type_info |= PGT_l1_page_table;
1058 page->u.inuse.type_info |=
1059 ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;
1061 /*
1062 * No longer writable: decrement the type_count.
1063 * This is an L1 page, installed in a validated L2 page:
1064 * increment both the ref_count and type_count.
1065 * Net: just increment the ref_count.
1066 */
1067 get_page(page, p); /* an extra ref because of readable mapping */
1068 }
1069 l1tab++;
1070 if( !((unsigned long)l1tab & (PAGE_SIZE - 1)) )
1071 l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
1072 }
1074 /* Set up shared-info area. */
1075 update_dom_time(p);
1076 p->shared_info->domain_time = 0;
1077 /* Mask all upcalls... */
1078 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
1079 p->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
1080 p->shared_info->n_vcpu = smp_num_cpus;
1082 /* Install the new page tables. */
1083 __cli();
1084 write_ptbase(&ed->mm);
1086 /* Copy the OS image. */
1087 (void)loadelfimage(image_start);
1089 /* Copy the initial ramdisk. */
1090 if ( initrd_len != 0 )
1091 memcpy((void *)vinitrd_start, initrd_start, initrd_len);
1093 /* Set up start info area. */
1094 si = (start_info_t *)vstartinfo_start;
1095 memset(si, 0, PAGE_SIZE);
1096 si->nr_pages = p->tot_pages;
1097 si->shared_info = virt_to_phys(p->shared_info);
1098 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
1099 si->pt_base = vpt_start;
1100 si->nr_pt_frames = nr_pt_pages;
1101 si->mfn_list = vphysmap_start;
1103 /* Write the phys->machine and machine->phys table entries. */
1104 for ( pfn = 0; pfn < p->tot_pages; pfn++ )
1105 {
1106 mfn = pfn + (alloc_start>>PAGE_SHIFT);
1107 #ifndef NDEBUG
1108 #define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
1109 if ( pfn > REVERSE_START )
1110 mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START);
1111 #endif
1112 ((unsigned long *)vphysmap_start)[pfn] = mfn;
1113 machine_to_phys_mapping[mfn] = pfn;
1114 }
1116 if ( initrd_len != 0 )
1117 {
1118 si->mod_start = vinitrd_start;
1119 si->mod_len = initrd_len;
1120 printk("Initrd len 0x%lx, start at 0x%08lx\n",
1121 si->mod_len, si->mod_start);
1122 }
1124 dst = si->cmd_line;
1125 if ( cmdline != NULL )
1126 {
1127 for ( i = 0; i < 255; i++ )
1128 {
1129 if ( cmdline[i] == '\0' )
1130 break;
1131 *dst++ = cmdline[i];
1132 }
1133 }
1134 *dst = '\0';
1136 /* Reinstate the caller's page tables. */
1137 write_ptbase(&current->mm);
1138 __sti();
1140 /* Destroy low mappings - they were only for our convenience. */
1141 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
1142 if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE )
1143 l2start[i] = mk_l2_pgentry(0);
1144 zap_low_mappings(); /* Do the same for the idle page tables. */
1146 /* DOM0 gets access to everything. */
1147 physdev_init_dom0(p);
1149 set_bit(DF_CONSTRUCTED, &p->d_flags);
1151 new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start);
1153 #if 0 /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */
1154 shadow_lock(&p->mm);
1155 shadow_mode_enable(p, SHM_test);
1156 shadow_unlock(&p->mm);
1157 #endif
1159 return 0;