debuggers.hg

view xen/arch/x86/domain.c @ 3352:32952707e391

bitkeeper revision 1.1159.1.498 (41c95548t401hgzsaGaOskmS9IeVmA)

Merge scramble.cl.cam.ac.uk:/local/scratch/kaf24/xen-2.0-testing.bk
into scramble.cl.cam.ac.uk:/local/scratch/kaf24/xen-unstable.bk
author kaf24@scramble.cl.cam.ac.uk
date Wed Dec 22 11:06:48 2004 +0000 (2004-12-22)
parents c65b544a8c4e 391b2a76a749
children a2352469313f
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/lib.h>
16 #include <xen/errno.h>
17 #include <xen/sched.h>
18 #include <xen/smp.h>
19 #include <xen/delay.h>
20 #include <xen/softirq.h>
21 #include <asm/regs.h>
22 #include <asm/mc146818rtc.h>
23 #include <asm/system.h>
24 #include <asm/io.h>
25 #include <asm/processor.h>
26 #include <asm/desc.h>
27 #include <asm/i387.h>
28 #include <asm/mpspec.h>
29 #include <asm/ldt.h>
30 #include <xen/irq.h>
31 #include <xen/event.h>
32 #include <asm/shadow.h>
33 #include <xen/console.h>
34 #include <xen/elf.h>
35 #include <asm/vmx.h>
36 #include <asm/vmx_vmcs.h>
37 #include <xen/kernel.h>
38 #include <public/io/ioreq.h>
39 #include <xen/multicall.h>
41 #if !defined(CONFIG_X86_64BITMODE)
42 /* No ring-3 access in initial page tables. */
43 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
44 #else
45 /* Allow ring-3 access in long mode as guest cannot use ring 1. */
46 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
47 #endif
48 #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
49 #define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
50 #define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
52 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
53 #define round_pgdown(_p) ((_p)&PAGE_MASK)
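/*
 * Editor's note: a minimal illustration of the rounding macros above,
 * assuming the usual 4kB x86 pages (PAGE_SIZE == 0x1000, PAGE_MASK == ~0xfffUL).
 * The helper below is hypothetical and not part of the original file.
 */
#if 0 /* illustrative example */
static void check_page_rounding(void)
{
    ASSERT(round_pgup(0x1234UL)   == 0x2000UL); /* rounds up to the next page    */
    ASSERT(round_pgdown(0x1234UL) == 0x1000UL); /* rounds down to the page base  */
    ASSERT(round_pgup(0x2000UL)   == 0x2000UL); /* an aligned value is unchanged */
}
#endif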
55 static void default_idle(void)
56 {
57 __cli();
58 if ( !softirq_pending(smp_processor_id()) )
59 safe_halt();
60 else
61 __sti();
62 }
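/*
 * Editor's note: default_idle() above follows the usual idle-race pattern.
 * Interrupts are disabled before the softirq check so that no wakeup can slip
 * in between the check and the halt; safe_halt() is expected to re-enable
 * interrupts and halt in one step (typically "sti; hlt", where the instruction
 * after "sti" still executes before any pending interrupt is taken).  If work
 * is already pending we simply re-enable interrupts and return.
 */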
64 void idle_loop(void)
65 {
66 int cpu = smp_processor_id();
67 for ( ; ; )
68 {
69 irq_stat[cpu].idle_timestamp = jiffies;
70 while ( !softirq_pending(cpu) )
71 default_idle();
72 do_softirq();
73 }
74 }
76 void startup_cpu_idle_loop(void)
77 {
78 /* Just some sanity to ensure that the scheduler is set up okay. */
79 ASSERT(current->domain->id == IDLE_DOMAIN_ID);
80 domain_unpause_by_systemcontroller(current->domain);
81 __enter_scheduler();
83 /*
84 * Declare CPU setup done to the boot processor.
85 * The memory barrier ensures this state is visible before we continue.
86 */
87 smp_mb();
88 init_idle();
90 idle_loop();
91 }
93 static long no_idt[2];
94 static int reboot_mode;
95 int reboot_thru_bios = 0;
97 #ifdef CONFIG_SMP
98 int reboot_smp = 0;
99 static int reboot_cpu = -1;
100 /* shamelessly grabbed from lib/vsprintf.c for readability */
101 #define is_digit(c) ((c) >= '0' && (c) <= '9')
102 #endif
105 static inline void kb_wait(void)
106 {
107 int i;
109 for (i=0; i<0x10000; i++)
110 if ((inb_p(0x64) & 0x02) == 0)
111 break;
112 }
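/*
 * Editor's note: kb_wait() polls the i8042 keyboard-controller status port
 * (0x64) until bit 1 -- the input-buffer-full flag -- clears, i.e. until the
 * controller can accept another command.  machine_restart() below relies on
 * this before writing command 0xFE, which pulses the CPU reset line.  The
 * names below are illustrative only and not part of the original file.
 */
#if 0 /* illustrative example */
#define I8042_STATUS_PORT  0x64   /* read: status register         */
#define I8042_COMMAND_PORT 0x64   /* write: command register       */
#define I8042_STAT_IBF     0x02   /* input buffer full (busy)      */
#define I8042_CMD_RESET    0xfe   /* pulse the CPU reset line low  */
#endif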
115 void machine_restart(char * __unused)
116 {
117 extern int opt_noreboot;
118 #ifdef CONFIG_SMP
119 int cpuid;
120 #endif
122 if ( opt_noreboot )
123 {
124 printk("Reboot disabled on cmdline: require manual reset\n");
125 for ( ; ; ) __asm__ __volatile__ ("hlt");
126 }
128 #ifdef CONFIG_SMP
129 cpuid = GET_APIC_ID(apic_read(APIC_ID));
131 /* KAF: Need interrupts enabled for safe IPI. */
132 __sti();
134 if (reboot_smp) {
136 /* Check that reboot_cpu is valid;
137 if it is not, default to the BSP. */
138 if ((reboot_cpu == -1) ||
139 (reboot_cpu > (NR_CPUS -1)) ||
140 !(phys_cpu_present_map & (1<<cpuid)))
141 reboot_cpu = boot_cpu_physical_apicid;
143 reboot_smp = 0; /* use this as a flag so we only go through this once */
144 /* Re-run this function on the other CPUs. They will fall
145 through this section, since we have cleared reboot_smp,
146 and do the reboot if they are the correct CPU;
147 otherwise they halt. */
148 if (reboot_cpu != cpuid)
149 smp_call_function((void *)machine_restart , NULL, 1, 0);
150 }
152 /* If reboot_cpu is still -1, then we want a traditional reboot,
153 and if we are not running on the reboot_cpu, halt. */
154 if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
155 for (;;)
156 __asm__ __volatile__ ("hlt");
157 }
158 /*
159 * Stop all CPUs and turn off local APICs and the IO-APIC, so
160 * other OSs see a clean IRQ state.
161 */
162 smp_send_stop();
163 disable_IO_APIC();
164 #endif
165 #ifdef CONFIG_VMX
166 stop_vmx();
167 #endif
169 if(!reboot_thru_bios) {
170 /* rebooting needs to touch the page at absolute addr 0 */
171 *((unsigned short *)__va(0x472)) = reboot_mode;
172 for (;;) {
173 int i;
174 for (i=0; i<100; i++) {
175 kb_wait();
176 udelay(50);
177 outb(0xfe,0x64); /* pulse reset low */
178 udelay(50);
179 }
180 /* That didn't work - force a triple fault. */
181 __asm__ __volatile__("lidt %0": "=m" (no_idt));
182 __asm__ __volatile__("int3");
183 }
184 }
186 panic("Need to reinclude BIOS reboot code\n");
187 }
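/*
 * Editor's note: the fallback in machine_restart() loads an IDT with base 0
 * and limit 0 (the zeroed "no_idt" above) and then executes "int3".  The CPU
 * cannot fetch a gate from the empty IDT, the resulting fault cannot be
 * delivered either, and the escalation to a triple fault resets the machine.
 * A stand-alone sketch of the same idea (hypothetical 32-bit helper, not in
 * the original file):
 */
#if 0 /* illustrative example */
static void triple_fault_reset(void)
{
    static const struct {
        unsigned short limit;
        unsigned long  base;
    } __attribute__((packed)) empty_idt = { 0, 0 };

    __asm__ __volatile__ ( "lidt %0 ; int3" : : "m" (empty_idt) );
}
#endif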
190 void __attribute__((noreturn)) __machine_halt(void *unused)
191 {
192 for ( ; ; )
193 __asm__ __volatile__ ( "cli; hlt" );
194 }
196 void machine_halt(void)
197 {
198 smp_call_function(__machine_halt, NULL, 1, 1);
199 __machine_halt(NULL);
200 }
202 void free_perdomain_pt(struct domain *d)
203 {
204 free_xenheap_page((unsigned long)d->mm_perdomain_pt);
205 }
207 static void continue_idle_task(struct exec_domain *ed)
208 {
209 reset_stack_and_jump(idle_loop);
210 }
212 static void continue_nonidle_task(struct exec_domain *ed)
213 {
214 reset_stack_and_jump(ret_from_intr);
215 }
217 void arch_do_createdomain(struct exec_domain *ed)
218 {
219 struct domain *d = ed->domain;
220 #ifdef ARCH_HAS_FAST_TRAP
221 SET_DEFAULT_FAST_TRAP(&ed->thread);
222 #endif
224 if ( d->id == IDLE_DOMAIN_ID )
225 {
226 ed->thread.schedule_tail = continue_idle_task;
227 }
228 else
229 {
230 ed->thread.schedule_tail = continue_nonidle_task;
232 d->shared_info = (void *)alloc_xenheap_page();
233 memset(d->shared_info, 0, PAGE_SIZE);
234 ed->vcpu_info = &d->shared_info->vcpu_data[ed->eid];
235 d->shared_info->arch.mfn_to_pfn_start =
236 virt_to_phys(&machine_to_phys_mapping[0])>>PAGE_SHIFT;
237 SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
238 machine_to_phys_mapping[virt_to_phys(d->shared_info) >>
239 PAGE_SHIFT] = INVALID_P2M_ENTRY;
241 d->mm_perdomain_pt = (l1_pgentry_t *)alloc_xenheap_page();
242 memset(d->mm_perdomain_pt, 0, PAGE_SIZE);
243 machine_to_phys_mapping[virt_to_phys(d->mm_perdomain_pt) >>
244 PAGE_SHIFT] = INVALID_P2M_ENTRY;
245 ed->mm.perdomain_ptes = d->mm_perdomain_pt;
246 }
247 }
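/*
 * Editor's note: the shared_info and per-domain page-table pages allocated
 * above come from the Xen heap, so they have no guest pseudo-physical frame;
 * marking their machine frames INVALID_P2M_ENTRY in machine_to_phys_mapping
 * records exactly that.  SHARE_PFN_WITH_DOMAIN() assigns the shared_info
 * frame to the domain so that the guest can map it.
 */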
249 #ifdef CONFIG_VMX
250 void arch_vmx_do_resume(struct exec_domain *d)
251 {
252 vmx_do_resume(d);
253 reset_stack_and_jump(vmx_asm_do_resume);
254 }
256 void arch_vmx_do_launch(struct exec_domain *d)
257 {
258 vmx_do_launch(d);
259 reset_stack_and_jump(vmx_asm_do_launch);
260 }
262 static void monitor_mk_pagetable(struct exec_domain *ed)
263 {
264 unsigned long mpfn;
265 l2_pgentry_t *mpl2e;
266 struct pfn_info *mpfn_info;
267 struct mm_struct *m = &ed->mm;
268 struct domain *d = ed->domain;
270 mpfn_info = alloc_domheap_page(NULL);
271 ASSERT( mpfn_info );
273 mpfn = (unsigned long) (mpfn_info - frame_table);
274 mpl2e = (l2_pgentry_t *) map_domain_mem(mpfn << PAGE_SHIFT);
275 memset(mpl2e, 0, PAGE_SIZE);
277 memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
278 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
279 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
281 m->monitor_table = mk_pagetable(mpfn << PAGE_SHIFT);
282 m->shadow_mode = SHM_full_32;
284 mpl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
285 mk_l2_pgentry((__pa(d->mm_perdomain_pt) & PAGE_MASK)
286 | __PAGE_HYPERVISOR);
288 unmap_domain_mem(mpl2e);
289 }
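/*
 * Editor's note: the monitor table built above is the page table Xen itself
 * runs on while this VMX domain executes under full shadow mode (SHM_full_32).
 * Copying the hypervisor half of idle_pg_table keeps Xen's own mappings
 * present, and the PERDOMAIN_VIRT_START slot is pointed at this domain's
 * per-domain page table.
 */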
291 static int vmx_final_setup_guestos(struct exec_domain *d,
292 full_execution_context_t *full_context)
293 {
294 int error;
295 execution_context_t *context;
296 struct vmcs_struct *vmcs;
297 unsigned long guest_pa;
299 context = &full_context->cpu_ctxt;
301 /*
302 * Create a new VMCS
303 */
304 if (!(vmcs = alloc_vmcs())) {
305 printk("Failed to create a new VMCS\n");
306 return -ENOMEM;
307 }
309 memset(&d->thread.arch_vmx, 0, sizeof (struct arch_vmx_struct));
311 d->thread.arch_vmx.vmcs = vmcs;
312 error = construct_vmcs(&d->thread.arch_vmx, context, full_context, VMCS_USE_HOST_ENV);
313 if (error < 0) {
314 printk("Failed to construct a new VMCS\n");
315 goto out;
316 }
318 monitor_mk_pagetable(d);
319 guest_pa = pagetable_val(d->mm.pagetable);
320 clear_bit(VMX_CPU_STATE_PG_ENABLED, &d->thread.arch_vmx.cpu_state);
322 d->thread.arch_vmx.vmx_platform.real_mode_data =
323 (unsigned long *) context->esi;
325 memset(&d->domain->shared_info->evtchn_mask[0], 0xff,
326 sizeof(d->domain->shared_info->evtchn_mask));
327 clear_bit(IOPACKET_PORT, &d->domain->shared_info->evtchn_mask[0]);
329 d->thread.schedule_tail = arch_vmx_do_launch;
331 return 0;
333 out:
334 free_vmcs(vmcs);
335 d->thread.arch_vmx.vmcs = 0;
336 return error;
337 }
338 #endif
340 int arch_final_setup_guestos(struct exec_domain *d, full_execution_context_t *c)
341 {
342 unsigned long phys_basetab;
343 int i, rc;
345 clear_bit(EDF_DONEFPUINIT, &d->ed_flags);
346 if ( c->flags & ECF_I387_VALID )
347 set_bit(EDF_DONEFPUINIT, &d->ed_flags);
349 memcpy(&d->thread.user_ctxt,
350 &c->cpu_ctxt,
351 sizeof(d->thread.user_ctxt));
353 /* Clear IOPL for unprivileged domains. */
354 if (!IS_PRIV(d->domain))
355 d->thread.user_ctxt.eflags &= 0xffffcfff;
357 /*
358 * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
359 * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
360 * If SS RPL or DPL differs from CS RPL then we'll #GP.
361 */
362 if ( ((d->thread.user_ctxt.cs & 3) == 0) ||
363 ((d->thread.user_ctxt.ss & 3) == 0) )
364 return -EINVAL;
366 memcpy(&d->thread.i387,
367 &c->fpu_ctxt,
368 sizeof(d->thread.i387));
370 memcpy(d->thread.traps,
371 &c->trap_ctxt,
372 sizeof(d->thread.traps));
374 #ifdef ARCH_HAS_FAST_TRAP
375 if ( (rc = (int)set_fast_trap(d, c->fast_trap_idx)) != 0 )
376 return rc;
377 #endif
379 d->mm.ldt_base = c->ldt_base;
380 d->mm.ldt_ents = c->ldt_ents;
382 d->thread.guestos_ss = c->guestos_ss;
383 d->thread.guestos_sp = c->guestos_esp;
385 for ( i = 0; i < 8; i++ )
386 (void)set_debugreg(d, i, c->debugreg[i]);
388 d->thread.event_selector = c->event_callback_cs;
389 d->thread.event_address = c->event_callback_eip;
390 d->thread.failsafe_selector = c->failsafe_callback_cs;
391 d->thread.failsafe_address = c->failsafe_callback_eip;
393 phys_basetab = c->pt_base;
394 d->mm.pagetable = mk_pagetable(phys_basetab);
395 if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d->domain,
396 PGT_base_page_table) )
397 return -EINVAL;
399 /* Failure to set GDT is harmless. */
400 SET_GDT_ENTRIES(d, DEFAULT_GDT_ENTRIES);
401 SET_GDT_ADDRESS(d, DEFAULT_GDT_ADDRESS);
402 if ( c->gdt_ents != 0 )
403 {
404 if ( (rc = (int)set_gdt(d, c->gdt_frames, c->gdt_ents)) != 0 )
405 {
406 put_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT]);
407 return rc;
408 }
409 }
411 #ifdef CONFIG_VMX
412 if (c->flags & ECF_VMX_GUEST)
413 return vmx_final_setup_guestos(d, c);
414 #endif
416 return 0;
417 }
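/*
 * Editor's note: the CS/SS check in arch_final_setup_guestos() uses the fact
 * that the low two bits of an x86 segment selector are its Requested
 * Privilege Level, so (sel & 3) == 0 means a ring-0 selector, which a guest
 * must never be given.  Hypothetical illustration, not in the original file:
 */
#if 0 /* illustrative example */
#define SELECTOR_RPL(sel) ((sel) & 3)
/* e.g. SELECTOR_RPL(FLAT_RING1_CS) == 1, whereas a hypervisor selector has RPL 0. */
#endif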
419 #if defined(__i386__)
421 void new_thread(struct exec_domain *d,
422 unsigned long start_pc,
423 unsigned long start_stack,
424 unsigned long start_info)
425 {
426 execution_context_t *ec = &d->thread.user_ctxt;
428 /*
429 * Initial register values:
430 * DS,ES,FS,GS = FLAT_RING1_DS
431 * CS:EIP = FLAT_RING1_CS:start_pc
432 * SS:ESP = FLAT_RING1_DS:start_stack
433 * ESI = start_info
434 * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
435 */
436 ec->ds = ec->es = ec->fs = ec->gs = ec->ss = FLAT_RING1_DS;
437 ec->cs = FLAT_RING1_CS;
438 ec->eip = start_pc;
439 ec->esp = start_stack;
440 ec->esi = start_info;
442 __save_flags(ec->eflags);
443 ec->eflags |= X86_EFLAGS_IF;
444 }
447 /*
448 * This special macro can be used to load a debugging register
449 */
450 #define loaddebug(thread,register) \
451 __asm__("movl %0,%%db" #register \
452 : /* no output */ \
453 :"r" (thread->debugreg[register]))
456 void switch_to(struct exec_domain *prev_p, struct exec_domain *next_p)
457 {
458 struct thread_struct *next = &next_p->thread;
459 struct tss_struct *tss = init_tss + smp_processor_id();
460 execution_context_t *stack_ec = get_execution_context();
461 int i;
462 unsigned long vmx_domain = next_p->thread.arch_vmx.flags;
464 __cli();
466 /* Switch guest general-register state. */
467 if ( !is_idle_task(prev_p->domain) )
468 {
469 memcpy(&prev_p->thread.user_ctxt,
470 stack_ec,
471 sizeof(*stack_ec));
472 unlazy_fpu(prev_p);
473 CLEAR_FAST_TRAP(&prev_p->thread);
474 }
476 if ( !is_idle_task(next_p->domain) )
477 {
478 memcpy(stack_ec,
479 &next_p->thread.user_ctxt,
480 sizeof(*stack_ec));
482 /* Maybe switch the debug registers. */
483 if ( unlikely(next->debugreg[7]) )
484 {
485 loaddebug(next, 0);
486 loaddebug(next, 1);
487 loaddebug(next, 2);
488 loaddebug(next, 3);
489 /* no 4 and 5 */
490 loaddebug(next, 6);
491 loaddebug(next, 7);
492 }
494 if (vmx_domain) {
495 /* Switch page tables. */
496 write_ptbase(&next_p->mm);
498 set_current(next_p);
499 /* Switch GDT and LDT. */
500 __asm__ __volatile__ ("lgdt %0" : "=m" (*next_p->mm.gdt));
502 __sti();
503 return;
504 }
506 SET_FAST_TRAP(&next_p->thread);
508 /* Switch the guest OS ring-1 stack. */
509 tss->esp1 = next->guestos_sp;
510 tss->ss1 = next->guestos_ss;
512 /* Switch page tables. */
513 write_ptbase(&next_p->mm);
514 }
516 if ( unlikely(prev_p->thread.io_bitmap != NULL) )
517 {
518 for ( i = 0; i < sizeof(prev_p->thread.io_bitmap_sel) * 8; i++ )
519 if ( !test_bit(i, &prev_p->thread.io_bitmap_sel) )
520 memset(&tss->io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
521 ~0U, IOBMP_BYTES_PER_SELBIT);
522 tss->bitmap = IOBMP_INVALID_OFFSET;
523 }
525 if ( unlikely(next_p->thread.io_bitmap != NULL) )
526 {
527 for ( i = 0; i < sizeof(next_p->thread.io_bitmap_sel) * 8; i++ )
528 if ( !test_bit(i, &next_p->thread.io_bitmap_sel) )
529 memcpy(&tss->io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
530 &next_p->thread.io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
531 IOBMP_BYTES_PER_SELBIT);
532 tss->bitmap = IOBMP_OFFSET;
533 }
535 set_current(next_p);
537 /* Switch GDT and LDT. */
538 __asm__ __volatile__ ("lgdt %0" : "=m" (*next_p->mm.gdt));
539 load_LDT(next_p);
541 __sti();
542 }
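/*
 * Editor's note: the I/O-bitmap handling in switch_to() above is lazy.  The
 * per-thread io_bitmap_sel word tracks which IOBMP_BYTES_PER_SELBIT-sized
 * chunks of the TSS I/O bitmap differ from the all-ones (access denied)
 * default: for chunks whose selector bit is clear, the outgoing domain's
 * entries are re-filled with ~0 and the incoming domain's entries are copied
 * in, so the full bitmap never has to be rewritten on a context switch.
 */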
545 /* XXX Currently the 'domain' field is ignored! XXX */
546 long do_iopl(domid_t domain, unsigned int new_io_pl)
547 {
548 execution_context_t *ec = get_execution_context();
549 ec->eflags = (ec->eflags & 0xffffcfff) | ((new_io_pl&3) << 12);
550 return 0;
551 }
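/*
 * Editor's note: EFLAGS bits 12-13 hold the I/O privilege level, so the mask
 * 0xffffcfff above (also used when loading a new context) clears IOPL and
 * ((new_io_pl & 3) << 12) installs the requested value; e.g. new_io_pl == 3
 * sets EFLAGS.IOPL to 3 (bits 0x3000).  Hypothetical helpers, not in the
 * original file:
 */
#if 0 /* illustrative example */
#define EFLAGS_IOPL_MASK        0x00003000
#define SET_IOPL(eflags, iopl)  (((eflags) & ~EFLAGS_IOPL_MASK) | (((iopl) & 3) << 12))
#endif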
553 #endif
555 unsigned long hypercall_create_continuation(
556 unsigned int op, unsigned int nr_args, ...)
557 {
558 struct mc_state *mcs = &mc_state[smp_processor_id()];
559 execution_context_t *ec;
560 unsigned long *preg;
561 unsigned int i;
562 va_list args;
564 va_start(args, nr_args);
566 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
567 {
568 __set_bit(_MCSF_call_preempted, &mcs->flags);
570 for ( i = 0; i < nr_args; i++ )
571 mcs->call.args[i] = va_arg(args, unsigned long);
572 }
573 else
574 {
575 ec = get_execution_context();
576 #if defined(__i386__)
577 ec->eax = op;
578 ec->eip -= 2; /* re-execute 'int 0x82' */
580 for ( i = 0, preg = &ec->ebx; i < nr_args; i++, preg++ )
581 *preg = va_arg(args, unsigned long);
582 #else
583 preg = NULL; /* XXX x86/64 */
584 #endif
585 }
587 va_end(args);
589 return op;
590 }
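/*
 * Editor's note: a long-running hypercall handler can use the routine above
 * to make itself restartable.  Outside a multicall it rewinds EIP by two
 * bytes so the guest re-executes "int 0x82" with the (updated) register
 * arguments; inside a multicall it records the arguments and marks the call
 * preempted.  A hedged sketch of the calling pattern -- the handler, its
 * hypercall number and process_one_unit() are hypothetical:
 */
#if 0 /* illustrative example */
long do_example_op(unsigned long start, unsigned long end)
{
    while ( start < end )
    {
        process_one_unit(start++);
        if ( softirq_pending(smp_processor_id()) )
            return hypercall_create_continuation(
                __HYPERVISOR_example_op, 2, start, end);
    }
    return 0;
}
#endif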
592 static void relinquish_list(struct domain *d, struct list_head *list)
593 {
594 struct list_head *ent;
595 struct pfn_info *page;
596 unsigned long x, y;
598 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
599 spin_lock_recursive(&d->page_alloc_lock);
601 ent = list->next;
602 while ( ent != list )
603 {
604 page = list_entry(ent, struct pfn_info, list);
606 /* Grab a reference to the page so it won't disappear from under us. */
607 if ( unlikely(!get_page(page, d)) )
608 {
609 /* Couldn't get a reference -- someone is freeing this page. */
610 ent = ent->next;
611 continue;
612 }
614 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
615 put_page_and_type(page);
617 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
618 put_page(page);
620 /*
621 * Forcibly invalidate base page tables at this point to break circular
622 * 'linear page table' references. This is okay because MMU structures
623 * are not shared across domains and this domain is now dead. Thus base
624 * tables are not in use so a non-zero count means circular reference.
625 */
626 y = page->u.inuse.type_info;
627 for ( ; ; )
628 {
629 x = y;
630 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
631 (PGT_base_page_table|PGT_validated)) )
632 break;
634 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
635 if ( likely(y == x) )
636 {
637 free_page_type(page, PGT_base_page_table);
638 break;
639 }
640 }
642 /* Follow the list chain and /then/ potentially free the page. */
643 ent = ent->next;
644 put_page(page);
645 }
647 spin_unlock_recursive(&d->page_alloc_lock);
648 }
651 void domain_relinquish_memory(struct domain *d)
652 {
653 struct exec_domain *ed;
655 /* Ensure that no one is running over the dead domain's page tables. */
656 synchronise_pagetables(~0UL);
658 /* Exit shadow mode before deconstructing final guest page table. */
659 shadow_mode_disable(d);
661 /* Drop the in-use reference to the page-table base. */
662 for_each_exec_domain(d, ed) {
663 if ( pagetable_val(ed->mm.pagetable) != 0 )
664 put_page_and_type(&frame_table[pagetable_val(ed->mm.pagetable) >>
665 PAGE_SHIFT]);
666 }
668 /*
669 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
670 * it automatically gets squashed when the guest's mappings go away.
671 */
672 for_each_exec_domain(d, ed)
673 destroy_gdt(ed);
675 /* Relinquish every page of memory. */
676 relinquish_list(d, &d->xenpage_list);
677 relinquish_list(d, &d->page_list);
678 }
681 int construct_dom0(struct domain *p,
682 unsigned long alloc_start,
683 unsigned long alloc_end,
684 char *image_start, unsigned long image_len,
685 char *initrd_start, unsigned long initrd_len,
686 char *cmdline)
687 {
688 char *dst;
689 int i, rc;
690 unsigned long pfn, mfn;
691 unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT;
692 unsigned long nr_pt_pages;
693 unsigned long count;
694 l2_pgentry_t *l2tab, *l2start;
695 l1_pgentry_t *l1tab = NULL, *l1start = NULL;
696 struct pfn_info *page = NULL;
697 start_info_t *si;
698 struct exec_domain *ed = p->exec_domain[0];
700 /*
701 * This fully describes the memory layout of the initial domain. All
702 * *_start address are page-aligned, except v_start (and v_end) which are
703 * superpage-aligned.
704 */
705 struct domain_setup_info dsi;
706 unsigned long vinitrd_start;
707 unsigned long vinitrd_end;
708 unsigned long vphysmap_start;
709 unsigned long vphysmap_end;
710 unsigned long vstartinfo_start;
711 unsigned long vstartinfo_end;
712 unsigned long vstack_start;
713 unsigned long vstack_end;
714 unsigned long vpt_start;
715 unsigned long vpt_end;
716 unsigned long v_end;
718 /* Machine address of next candidate page-table page. */
719 unsigned long mpt_alloc;
721 extern void physdev_init_dom0(struct domain *);
723 /* Sanity! */
724 if ( p->id != 0 )
725 BUG();
726 if ( test_bit(DF_CONSTRUCTED, &p->d_flags) )
727 BUG();
729 memset(&dsi, 0, sizeof(struct domain_setup_info));
731 printk("*** LOADING DOMAIN 0 ***\n");
733 /*
734 * This is all a bit grim. We've moved the modules to the "safe" physical
735 * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this
736 * routine we're going to copy them down into the region that's actually
737 * been allocated to domain 0. The two ranges are highly likely to overlap, so
738 * we use a forward copy.
739 *
740 * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with
741 * 4GB and lots of network/disk cards that allocate loads of buffers.
742 * We'll have to revisit this if we ever support PAE (64GB).
743 */
745 rc = parseelfimage(image_start, image_len, &dsi);
746 if ( rc != 0 )
747 return rc;
749 /* Set up domain options */
750 if ( dsi.use_writable_pagetables )
751 vm_assist(p, VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
753 if ( (dsi.v_start & (PAGE_SIZE-1)) != 0 )
754 {
755 printk("Initial guest OS must load to a page boundary.\n");
756 return -EINVAL;
757 }
759 /*
760 * Why do we need this? The number of page-table frames depends on the
761 * size of the bootstrap address space. But the size of the address space
762 * depends on the number of page-table frames (since each one is mapped
763 * read-only). We have a pair of simultaneous equations in two unknowns,
764 * which we solve by exhaustive search.
765 */
766 vinitrd_start = round_pgup(dsi.v_kernend);
767 vinitrd_end = vinitrd_start + initrd_len;
768 vphysmap_start = round_pgup(vinitrd_end);
769 vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long));
770 vpt_start = round_pgup(vphysmap_end);
771 for ( nr_pt_pages = 2; ; nr_pt_pages++ )
772 {
773 vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
774 vstartinfo_start = vpt_end;
775 vstartinfo_end = vstartinfo_start + PAGE_SIZE;
776 vstack_start = vstartinfo_end;
777 vstack_end = vstack_start + PAGE_SIZE;
778 v_end = (vstack_end + (1<<22)-1) & ~((1<<22)-1);
779 if ( (v_end - vstack_end) < (512 << 10) )
780 v_end += 1 << 22; /* Add extra 4MB to get >= 512kB padding. */
781 if ( (((v_end - dsi.v_start + ((1<<L2_PAGETABLE_SHIFT)-1)) >>
782 L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
783 break;
784 }
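/*
 * Editor's note (worked example): with 4MB L2 slots (L2_PAGETABLE_SHIFT == 22),
 * a bootstrap region of just under 20MB needs ceil(20MB / 4MB) = 5 L1 pages
 * plus 1 for the L2 itself, so the loop settles on nr_pt_pages == 6 -- provided
 * those extra page-table frames do not themselves push v_end across the next
 * 4MB boundary, in which case the search simply continues with a larger guess.
 */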
786 printk("PHYSICAL MEMORY ARRANGEMENT:\n"
787 " Kernel image: %p->%p\n"
788 " Initrd image: %p->%p\n"
789 " Dom0 alloc.: %08lx->%08lx\n",
790 image_start, image_start + image_len,
791 initrd_start, initrd_start + initrd_len,
792 alloc_start, alloc_end);
793 printk("VIRTUAL MEMORY ARRANGEMENT:\n"
794 " Loaded kernel: %08lx->%08lx\n"
795 " Init. ramdisk: %08lx->%08lx\n"
796 " Phys-Mach map: %08lx->%08lx\n"
797 " Page tables: %08lx->%08lx\n"
798 " Start info: %08lx->%08lx\n"
799 " Boot stack: %08lx->%08lx\n"
800 " TOTAL: %08lx->%08lx\n",
801 dsi.v_kernstart, dsi.v_kernend,
802 vinitrd_start, vinitrd_end,
803 vphysmap_start, vphysmap_end,
804 vpt_start, vpt_end,
805 vstartinfo_start, vstartinfo_end,
806 vstack_start, vstack_end,
807 dsi.v_start, v_end);
808 printk(" ENTRY ADDRESS: %08lx\n", dsi.v_kernentry);
810 if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
811 {
812 printk("Initial guest OS requires too much space\n"
813 "(%luMB is greater than %luMB limit)\n",
814 (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
815 return -ENOMEM;
816 }
818 /*
819 * Protect the lowest 1GB of memory. We use a temporary mapping there
820 * from which we copy the kernel and ramdisk images.
821 */
822 if ( dsi.v_start < (1<<30) )
823 {
824 printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
825 return -EINVAL;
826 }
828 /* Paranoia: scrub DOM0's memory allocation. */
829 printk("Scrubbing DOM0 RAM: ");
830 dst = (char *)alloc_start;
831 while ( dst < (char *)alloc_end )
832 {
833 #define SCRUB_BYTES (100 * 1024 * 1024) /* 100MB */
834 printk(".");
835 touch_nmi_watchdog();
836 if ( ((char *)alloc_end - dst) > SCRUB_BYTES )
837 {
838 memset(dst, 0, SCRUB_BYTES);
839 dst += SCRUB_BYTES;
840 }
841 else
842 {
843 memset(dst, 0, (char *)alloc_end - dst);
844 break;
845 }
846 }
847 printk("done.\n");
849 /* Construct a frame-allocation list for the initial domain. */
850 for ( mfn = (alloc_start>>PAGE_SHIFT);
851 mfn < (alloc_end>>PAGE_SHIFT);
852 mfn++ )
853 {
854 page = &frame_table[mfn];
855 page->u.inuse.domain = p;
856 page->u.inuse.type_info = 0;
857 page->count_info = PGC_allocated | 1;
858 list_add_tail(&page->list, &p->page_list);
859 p->tot_pages++; p->max_pages++;
860 }
862 mpt_alloc = (vpt_start - dsi.v_start) + alloc_start;
864 SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES);
865 SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS);
867 /*
868 * We're basically forcing default RPLs to 1, so that our "what privilege
869 * level are we returning to?" logic works.
870 */
871 ed->thread.failsafe_selector = FLAT_GUESTOS_CS;
872 ed->thread.event_selector = FLAT_GUESTOS_CS;
873 ed->thread.guestos_ss = FLAT_GUESTOS_DS;
874 for ( i = 0; i < 256; i++ )
875 ed->thread.traps[i].cs = FLAT_GUESTOS_CS;
877 /* WARNING: The new domain must have its 'processor' field filled in! */
878 l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
879 memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
880 l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
881 mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR);
882 l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
883 mk_l2_pgentry(__pa(p->mm_perdomain_pt) | __PAGE_HYPERVISOR);
884 ed->mm.pagetable = mk_pagetable((unsigned long)l2start);
886 l2tab += l2_table_offset(dsi.v_start);
887 mfn = alloc_start >> PAGE_SHIFT;
888 for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
889 {
890 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
891 {
892 l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
893 mpt_alloc += PAGE_SIZE;
894 *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT);
895 clear_page(l1tab);
896 if ( count == 0 )
897 l1tab += l1_table_offset(dsi.v_start);
898 }
899 *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);
901 page = &frame_table[mfn];
902 if ( !get_page_and_type(page, p, PGT_writable_page) )
903 BUG();
905 mfn++;
906 }
908 /* Pages that are part of page tables must be read only. */
909 l2tab = l2start + l2_table_offset(vpt_start);
910 l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
911 l1tab += l1_table_offset(vpt_start);
912 l2tab++;
913 for ( count = 0; count < nr_pt_pages; count++ )
914 {
915 *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
916 page = &frame_table[l1_pgentry_to_pagenr(*l1tab)];
917 if ( count == 0 )
918 {
919 page->u.inuse.type_info &= ~PGT_type_mask;
920 page->u.inuse.type_info |= PGT_l2_page_table;
922 /*
923 * No longer writable: decrement the type_count.
924 * Installed as CR3: increment both the ref_count and type_count.
925 * Net: just increment the ref_count.
926 */
927 get_page(page, p); /* an extra ref because of readable mapping */
929 /* Get another ref to L2 page so that it can be pinned. */
930 if ( !get_page_and_type(page, p, PGT_l2_page_table) )
931 BUG();
932 set_bit(_PGT_pinned, &page->u.inuse.type_info);
933 }
934 else
935 {
936 page->u.inuse.type_info &= ~PGT_type_mask;
937 page->u.inuse.type_info |= PGT_l1_page_table;
938 page->u.inuse.type_info |=
939 ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;
941 /*
942 * No longer writable: decrement the type_count.
943 * This is an L1 page, installed in a validated L2 page:
944 * increment both the ref_count and type_count.
945 * Net: just increment the ref_count.
946 */
947 get_page(page, p); /* an extra ref because of readable mapping */
948 }
949 l1tab++;
950 if( !((unsigned long)l1tab & (PAGE_SIZE - 1)) )
951 l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
952 }
954 /* Set up shared-info area. */
955 update_dom_time(p);
956 p->shared_info->domain_time = 0;
957 /* Mask all upcalls... */
958 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
959 p->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
960 p->shared_info->n_vcpu = smp_num_cpus;
962 /* Install the new page tables. */
963 __cli();
964 write_ptbase(&ed->mm);
966 /* Copy the OS image. */
967 (void)loadelfimage(image_start);
969 /* Copy the initial ramdisk. */
970 if ( initrd_len != 0 )
971 memcpy((void *)vinitrd_start, initrd_start, initrd_len);
973 /* Set up start info area. */
974 si = (start_info_t *)vstartinfo_start;
975 memset(si, 0, PAGE_SIZE);
976 si->nr_pages = p->tot_pages;
977 si->shared_info = virt_to_phys(p->shared_info);
978 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
979 si->pt_base = vpt_start;
980 si->nr_pt_frames = nr_pt_pages;
981 si->mfn_list = vphysmap_start;
983 /* Write the phys->machine and machine->phys table entries. */
984 for ( pfn = 0; pfn < p->tot_pages; pfn++ )
985 {
986 mfn = pfn + (alloc_start>>PAGE_SHIFT);
987 #ifndef NDEBUG
988 #define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
989 if ( pfn > REVERSE_START )
990 mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START);
991 #endif
992 ((unsigned long *)vphysmap_start)[pfn] = mfn;
993 machine_to_phys_mapping[mfn] = pfn;
994 }
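/*
 * Editor's note: the REVERSE_START trick above (debug builds only) maps the
 * pseudo-physical frames beyond the bootstrap image onto machine frames taken
 * from the top of the allocation downwards, presumably so that guest code
 * which wrongly assumes pfn == mfn contiguity trips over it early.
 */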
996 if ( initrd_len != 0 )
997 {
998 si->mod_start = vinitrd_start;
999 si->mod_len = initrd_len;
1000 printk("Initrd len 0x%lx, start at 0x%08lx\n",
1001 si->mod_len, si->mod_start);
1004 dst = si->cmd_line;
1005 if ( cmdline != NULL )
1007 for ( i = 0; i < 255; i++ )
1009 if ( cmdline[i] == '\0' )
1010 break;
1011 *dst++ = cmdline[i];
1014 *dst = '\0';
1016 /* Reinstate the caller's page tables. */
1017 write_ptbase(&current->mm);
1018 __sti();
1020 /* Destroy low mappings - they were only for our convenience. */
1021 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
1022 if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE )
1023 l2start[i] = mk_l2_pgentry(0);
1024 zap_low_mappings(); /* Do the same for the idle page tables. */
1026 /* DOM0 gets access to everything. */
1027 physdev_init_dom0(p);
1029 set_bit(DF_CONSTRUCTED, &p->d_flags);
1031 new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start);
1033 #if 0 /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */
1034 shadow_lock(&p->mm);
1035 shadow_mode_enable(p, SHM_test);
1036 shadow_unlock(&p->mm);
1037 #endif
1039 return 0;