debuggers.hg

view xen/arch/x86/domain.c @ 3324:a5b20d4d45b1

bitkeeper revision 1.1159.1.479 (41c07654rvhrbSCSdKV1f18P_UminA)

sync w/ head.
author cl349@arcadians.cl.cam.ac.uk
date Wed Dec 15 17:37:24 2004 +0000 (2004-12-15)
parents eb8866af6c4b
children b9ab4345fd1b
line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/lib.h>
16 #include <xen/errno.h>
17 #include <xen/sched.h>
18 #include <xen/smp.h>
19 #include <xen/delay.h>
20 #include <xen/softirq.h>
21 #include <asm/regs.h>
22 #include <asm/mc146818rtc.h>
23 #include <asm/system.h>
24 #include <asm/io.h>
25 #include <asm/processor.h>
26 #include <asm/desc.h>
27 #include <asm/i387.h>
28 #include <asm/mpspec.h>
29 #include <asm/ldt.h>
30 #include <xen/irq.h>
31 #include <xen/event.h>
32 #include <asm/shadow.h>
33 #include <xen/console.h>
34 #include <xen/elf.h>
35 #include <xen/multicall.h>
37 #if !defined(CONFIG_X86_64BITMODE)
38 /* No ring-3 access in initial page tables. */
39 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
40 #else
41 /* Allow ring-3 access in long mode as guest cannot use ring 1. */
42 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
43 #endif
44 #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
45 #define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
46 #define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
48 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
49 #define round_pgdown(_p) ((_p)&PAGE_MASK)
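/*
 * default_idle() below relies on the classic "sti; hlt" idiom inside
 * safe_halt(): interrupts are disabled before re-checking for pending
 * softirqs, and STI's one-instruction interrupt shadow means no interrupt
 * can be taken between the check and the HLT. Anything that arrives once
 * HLT has begun simply wakes the CPU straight back up, so no pending work
 * is slept through.
 */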
51 static void default_idle(void)
52 {
53 __cli();
54 if ( !softirq_pending(smp_processor_id()) )
55 safe_halt();
56 else
57 __sti();
58 }
60 static void idle_loop(void)
61 {
62 int cpu = smp_processor_id();
63 for ( ; ; )
64 {
65 irq_stat[cpu].idle_timestamp = jiffies;
66 while ( !softirq_pending(cpu) )
67 default_idle();
68 do_softirq();
69 }
70 }
72 void startup_cpu_idle_loop(void)
73 {
74 /* Just some sanity to ensure that the scheduler is set up okay. */
75 ASSERT(current->domain->id == IDLE_DOMAIN_ID);
76 domain_unpause_by_systemcontroller(current->domain);
77 __enter_scheduler();
79 /*
80 * Declare CPU setup done to the boot processor; a memory
81 * barrier ensures the state change is visible to it.
82 */
83 smp_mb();
84 init_idle();
86 idle_loop();
87 }
89 static long no_idt[2];
90 static int reboot_mode;
91 int reboot_thru_bios = 0;
93 #ifdef CONFIG_SMP
94 int reboot_smp = 0;
95 static int reboot_cpu = -1;
96 /* shamelessly grabbed from lib/vsprintf.c for readability */
97 #define is_digit(c) ((c) >= '0' && (c) <= '9')
98 #endif
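/*
 * kb_wait() polls the i8042 keyboard controller's status port (0x64)
 * until the input-buffer-full bit (bit 1) clears, i.e. until the
 * controller can accept another command byte. The loop is bounded so a
 * missing controller cannot hang the reboot path.
 */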
101 static inline void kb_wait(void)
102 {
103 int i;
105 for (i=0; i<0x10000; i++)
106 if ((inb_p(0x64) & 0x02) == 0)
107 break;
108 }
111 void machine_restart(char * __unused)
112 {
113 extern int opt_noreboot;
114 #ifdef CONFIG_SMP
115 int cpuid;
116 #endif
118 if ( opt_noreboot )
119 {
120 printk("Reboot disabled on cmdline: require manual reset\n");
121 for ( ; ; ) __asm__ __volatile__ ("hlt");
122 }
124 #ifdef CONFIG_SMP
125 cpuid = GET_APIC_ID(apic_read(APIC_ID));
127 /* KAF: Need interrupts enabled for safe IPI. */
128 __sti();
130 if (reboot_smp) {
132 /* Check to see if reboot_cpu is valid;
133 if it's not, default to the BSP. */
134 if ((reboot_cpu == -1) ||
135 (reboot_cpu > (NR_CPUS -1)) ||
136 !(phys_cpu_present_map & (1<<cpuid)))
137 reboot_cpu = boot_cpu_physical_apicid;
139 reboot_smp = 0; /* use this as a flag to only go through this once */
140 /* Re-run this function on the other CPUs;
141 they will fall through this section since we have
142 cleared reboot_smp, and do the reboot if running on the
143 correct CPU, otherwise they halt. */
144 if (reboot_cpu != cpuid)
145 smp_call_function((void *)machine_restart , NULL, 1, 0);
146 }
148 /* if reboot_cpu is still -1, then we want a traditional reboot,
149 and if we are not running on the reboot_cpu, halt */
150 if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
151 for (;;)
152 __asm__ __volatile__ ("hlt");
153 }
154 /*
155 * Stop all CPUs and turn off local APICs and the IO-APIC, so
156 * other OSs see a clean IRQ state.
157 */
158 smp_send_stop();
159 disable_IO_APIC();
160 #endif
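/*
 * Non-BIOS reboot: store reboot_mode in the BIOS data area reset-flag
 * word at physical address 0x472 (0x1234 there requests a warm boot),
 * then ask the keyboard controller to pulse the reset line (command 0xfe
 * written to port 0x64). If the machine still hasn't reset, load an
 * empty IDT and take an exception: with no valid IDT the fault cannot be
 * delivered and escalates to a triple fault, which resets the CPU.
 */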
162 if(!reboot_thru_bios) {
163 /* rebooting needs to touch the page at absolute addr 0 */
164 *((unsigned short *)__va(0x472)) = reboot_mode;
165 for (;;) {
166 int i;
167 for (i=0; i<100; i++) {
168 kb_wait();
169 udelay(50);
170 outb(0xfe,0x64); /* pulse reset low */
171 udelay(50);
172 }
173 /* That didn't work - force a triple fault. */
174 __asm__ __volatile__("lidt %0": "=m" (no_idt));
175 __asm__ __volatile__("int3");
176 }
177 }
179 panic("Need to reinclude BIOS reboot code\n");
180 }
183 void __attribute__((noreturn)) __machine_halt(void *unused)
184 {
185 for ( ; ; )
186 __asm__ __volatile__ ( "cli; hlt" );
187 }
189 void machine_halt(void)
190 {
191 smp_call_function(__machine_halt, NULL, 1, 1);
192 __machine_halt(NULL);
193 }
195 void free_perdomain_pt(struct domain *d)
196 {
197 free_xenheap_page((unsigned long)d->mm_perdomain_pt);
198 }
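/*
 * schedule_tail hooks: these are invoked when the scheduler completes a
 * switch to the domain. reset_stack_and_jump() discards the current
 * hypervisor stack frame and continues at the given entry point -- the
 * idle loop for the idle domain, or ret_from_intr to resume guest
 * context for everything else.
 */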
200 static void continue_idle_task(struct exec_domain *ed)
201 {
202 reset_stack_and_jump(idle_loop);
203 }
205 static void continue_nonidle_task(struct exec_domain *ed)
206 {
207 reset_stack_and_jump(ret_from_intr);
208 }
210 void arch_do_createdomain(struct exec_domain *ed)
211 {
212 struct domain *d = ed->domain;
213 #ifdef ARCH_HAS_FAST_TRAP
214 SET_DEFAULT_FAST_TRAP(&ed->thread);
215 #endif
217 if ( d->id == IDLE_DOMAIN_ID )
218 {
219 ed->thread.schedule_tail = continue_idle_task;
220 }
221 else
222 {
223 ed->thread.schedule_tail = continue_nonidle_task;
225 d->shared_info = (void *)alloc_xenheap_page();
226 memset(d->shared_info, 0, PAGE_SIZE);
227 ed->vcpu_info = &d->shared_info->vcpu_data[ed->eid];
228 d->shared_info->arch.mfn_to_pfn_start =
229 virt_to_phys(&machine_to_phys_mapping[0])>>PAGE_SHIFT;
230 SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
231 machine_to_phys_mapping[virt_to_phys(d->shared_info) >>
232 PAGE_SHIFT] = INVALID_P2M_ENTRY;
234 d->mm_perdomain_pt = (l1_pgentry_t *)alloc_xenheap_page();
235 memset(d->mm_perdomain_pt, 0, PAGE_SIZE);
236 machine_to_phys_mapping[virt_to_phys(d->mm_perdomain_pt) >>
237 PAGE_SHIFT] = INVALID_P2M_ENTRY;
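/*
 * Both the shared-info page and the per-domain page table come from the
 * Xen heap, so they have no pseudo-physical address in the guest's
 * memory map; setting their machine_to_phys_mapping slots to
 * INVALID_P2M_ENTRY records exactly that. SHARE_PFN_WITH_DOMAIN() above
 * is what still allows the domain to map the shared-info frame.
 */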
238 ed->mm.perdomain_ptes = d->mm_perdomain_pt;
239 }
240 }
242 int arch_final_setup_guestos(struct exec_domain *d, full_execution_context_t *c)
243 {
244 unsigned long phys_basetab;
245 int i, rc;
247 clear_bit(EDF_DONEFPUINIT, &d->ed_flags);
248 if ( c->flags & ECF_I387_VALID )
249 set_bit(EDF_DONEFPUINIT, &d->ed_flags);
251 memcpy(&d->thread.user_ctxt,
252 &c->cpu_ctxt,
253 sizeof(d->thread.user_ctxt));
255 /* Clear IOPL for unprivileged domains. */
256 if (!IS_PRIV(d->domain))
257 d->thread.user_ctxt.eflags &= 0xffffcfff;
259 /*
260 * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
261 * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
262 * If SS RPL or DPL differs from CS RPL then we'll #GP.
263 */
264 if ( ((d->thread.user_ctxt.cs & 3) == 0) ||
265 ((d->thread.user_ctxt.ss & 3) == 0) )
266 return -EINVAL;
268 memcpy(&d->thread.i387,
269 &c->fpu_ctxt,
270 sizeof(d->thread.i387));
272 memcpy(d->thread.traps,
273 &c->trap_ctxt,
274 sizeof(d->thread.traps));
276 #ifdef ARCH_HAS_FAST_TRAP
277 if ( (rc = (int)set_fast_trap(d, c->fast_trap_idx)) != 0 )
278 return rc;
279 #endif
281 d->mm.ldt_base = c->ldt_base;
282 d->mm.ldt_ents = c->ldt_ents;
284 d->thread.guestos_ss = c->guestos_ss;
285 d->thread.guestos_sp = c->guestos_esp;
287 for ( i = 0; i < 8; i++ )
288 (void)set_debugreg(d, i, c->debugreg[i]);
290 d->thread.event_selector = c->event_callback_cs;
291 d->thread.event_address = c->event_callback_eip;
292 d->thread.failsafe_selector = c->failsafe_callback_cs;
293 d->thread.failsafe_address = c->failsafe_callback_eip;
295 phys_basetab = c->pt_base;
296 d->mm.pagetable = mk_pagetable(phys_basetab);
297 if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d->domain,
298 PGT_base_page_table) )
299 return -EINVAL;
301 /* Failure to set GDT is harmless. */
302 SET_GDT_ENTRIES(d, DEFAULT_GDT_ENTRIES);
303 SET_GDT_ADDRESS(d, DEFAULT_GDT_ADDRESS);
304 if ( c->gdt_ents != 0 )
305 {
306 if ( (rc = (int)set_gdt(d, c->gdt_frames, c->gdt_ents)) != 0 )
307 {
308 put_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT]);
309 return rc;
310 }
311 }
313 return 0;
314 }
316 #if defined(__i386__)
318 void new_thread(struct exec_domain *d,
319 unsigned long start_pc,
320 unsigned long start_stack,
321 unsigned long start_info)
322 {
323 execution_context_t *ec = &d->thread.user_ctxt;
325 /*
326 * Initial register values:
327 * DS,ES,FS,GS = FLAT_RING1_DS
328 * CS:EIP = FLAT_RING1_CS:start_pc
329 * SS:ESP = FLAT_RING1_DS:start_stack
330 * ESI = start_info
331 * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
332 */
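/*
 * FLAT_RING1_CS/DS are the flat ring-1 selectors Xen provides for 32-bit
 * guests: the guest kernel starts out in ring 1 rather than ring 0.
 * EFLAGS.IF is forced on below so the new context begins with interrupts
 * deliverable.
 */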
333 ec->ds = ec->es = ec->fs = ec->gs = ec->ss = FLAT_RING1_DS;
334 ec->cs = FLAT_RING1_CS;
335 ec->eip = start_pc;
336 ec->esp = start_stack;
337 ec->esi = start_info;
339 __save_flags(ec->eflags);
340 ec->eflags |= X86_EFLAGS_IF;
341 }
344 /*
345 * This special macro can be used to load a debugging register
346 */
347 #define loaddebug(thread,register) \
348 __asm__("movl %0,%%db" #register \
349 : /* no output */ \
350 :"r" (thread->debugreg[register]))
353 void switch_to(struct exec_domain *prev_p, struct exec_domain *next_p)
354 {
355 struct thread_struct *next = &next_p->thread;
356 struct tss_struct *tss = init_tss + smp_processor_id();
357 execution_context_t *stack_ec = get_execution_context();
358 int i;
360 __cli();
362 /* Switch guest general-register state. */
363 if ( !is_idle_task(prev_p->domain) )
364 {
365 memcpy(&prev_p->thread.user_ctxt,
366 stack_ec,
367 sizeof(*stack_ec));
368 unlazy_fpu(prev_p);
369 CLEAR_FAST_TRAP(&prev_p->thread);
370 }
372 if ( !is_idle_task(next_p->domain) )
373 {
374 memcpy(stack_ec,
375 &next_p->thread.user_ctxt,
376 sizeof(*stack_ec));
378 SET_FAST_TRAP(&next_p->thread);
380 /* Switch the guest OS ring-1 stack. */
381 tss->esp1 = next->guestos_sp;
382 tss->ss1 = next->guestos_ss;
384 /* Maybe switch the debug registers. */
385 if ( unlikely(next->debugreg[7]) )
386 {
387 loaddebug(next, 0);
388 loaddebug(next, 1);
389 loaddebug(next, 2);
390 loaddebug(next, 3);
391 /* no 4 and 5: DR4/DR5 are reserved */
392 loaddebug(next, 6);
393 loaddebug(next, 7);
394 }
396 /* Switch page tables. */
397 write_ptbase(&next_p->mm);
398 }
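/*
 * TSS I/O bitmap handling. io_bitmap_sel tracks, one bit per
 * IOBMP_BYTES_PER_SELBIT-byte chunk, which parts of the domain's bitmap
 * differ from the all-ones (access denied) default: a clear bit marks a
 * chunk that actually grants some port access. On switch-out those
 * chunks are reset to ~0 in the shared TSS copy and the bitmap offset is
 * invalidated; on switch-in only those chunks are copied in and the
 * offset is made valid again.
 */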
400 if ( unlikely(prev_p->thread.io_bitmap != NULL) )
401 {
402 for ( i = 0; i < sizeof(prev_p->thread.io_bitmap_sel) * 8; i++ )
403 if ( !test_bit(i, &prev_p->thread.io_bitmap_sel) )
404 memset(&tss->io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
405 ~0U, IOBMP_BYTES_PER_SELBIT);
406 tss->bitmap = IOBMP_INVALID_OFFSET;
407 }
409 if ( unlikely(next_p->thread.io_bitmap != NULL) )
410 {
411 for ( i = 0; i < sizeof(next_p->thread.io_bitmap_sel) * 8; i++ )
412 if ( !test_bit(i, &next_p->thread.io_bitmap_sel) )
413 memcpy(&tss->io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
414 &next_p->thread.io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
415 IOBMP_BYTES_PER_SELBIT);
416 tss->bitmap = IOBMP_OFFSET;
417 }
419 set_current(next_p);
421 /* Switch GDT and LDT. */
422 __asm__ __volatile__ ("lgdt %0" : "=m" (*next_p->mm.gdt));
423 load_LDT(next_p);
425 __sti();
426 }
429 /* XXX Currently the 'domain' field is ignored! XXX */
430 long do_iopl(domid_t domain, unsigned int new_io_pl)
431 {
432 execution_context_t *ec = get_execution_context();
433 ec->eflags = (ec->eflags & 0xffffcfff) | ((new_io_pl&3) << 12);
434 return 0;
435 }
437 #endif
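/*
 * hypercall_create_continuation() arranges for a preempted hypercall to
 * be restarted. Inside a multicall the pending entry is simply flagged
 * as preempted and its argument slots rewritten. Otherwise the guest's
 * saved register frame is rewritten -- EAX gets the hypercall number,
 * EBX onwards the new arguments -- and EIP is wound back by two bytes,
 * the length of the "int $0x82" instruction, so the guest re-issues the
 * hypercall the next time it runs.
 */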
439 unsigned long hypercall_create_continuation(
440 unsigned int op, unsigned int nr_args, ...)
441 {
442 struct mc_state *mcs = &mc_state[smp_processor_id()];
443 execution_context_t *ec;
444 unsigned long *preg;
445 unsigned int i;
446 va_list args;
448 va_start(args, nr_args);
450 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
451 {
452 __set_bit(_MCSF_call_preempted, &mcs->flags);
454 for ( i = 0; i < nr_args; i++ )
455 mcs->call.args[i] = va_arg(args, unsigned long);
456 }
457 else
458 {
459 ec = get_execution_context();
460 #if defined(__i386__)
461 ec->eax = op;
462 ec->eip -= 2; /* re-execute 'int 0x82' */
464 for ( i = 0, preg = &ec->ebx; i < nr_args; i++, preg++ )
465 *preg = va_arg(args, unsigned long);
466 #else
467 preg = NULL; /* XXX x86/64 */
468 #endif
469 }
471 va_end(args);
473 return op;
474 }
476 static void relinquish_list(struct domain *d, struct list_head *list)
477 {
478 struct list_head *ent;
479 struct pfn_info *page;
480 unsigned long x, y;
482 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
483 spin_lock_recursive(&d->page_alloc_lock);
485 ent = list->next;
486 while ( ent != list )
487 {
488 page = list_entry(ent, struct pfn_info, list);
490 /* Grab a reference to the page so it won't disappear from under us. */
491 if ( unlikely(!get_page(page, d)) )
492 {
493 /* Couldn't get a reference -- someone is freeing this page. */
494 ent = ent->next;
495 continue;
496 }
498 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
499 put_page_and_type(page);
501 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
502 put_page(page);
504 /*
505 * Forcibly invalidate base page tables at this point to break circular
506 * 'linear page table' references. This is okay because MMU structures
507 * are not shared across domains and this domain is now dead. Thus base
508 * tables are not in use so a non-zero count means circular reference.
509 */
510 y = page->u.inuse.type_info;
511 for ( ; ; )
512 {
513 x = y;
514 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
515 (PGT_base_page_table|PGT_validated)) )
516 break;
518 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
519 if ( likely(y == x) )
520 {
521 free_page_type(page, PGT_base_page_table);
522 break;
523 }
524 }
526 /* Follow the list chain and /then/ potentially free the page. */
527 ent = ent->next;
528 put_page(page);
529 }
531 spin_unlock_recursive(&d->page_alloc_lock);
532 }
535 void domain_relinquish_memory(struct domain *d)
536 {
537 struct exec_domain *ed;
539 /* Ensure that no one is running over the dead domain's page tables. */
540 synchronise_pagetables(~0UL);
542 /* Exit shadow mode before deconstructing final guest page table. */
543 shadow_mode_disable(d);
545 /* Drop the in-use reference to the page-table base. */
546 for_each_exec_domain(d, ed) {
547 if ( pagetable_val(ed->mm.pagetable) != 0 )
548 put_page_and_type(&frame_table[pagetable_val(ed->mm.pagetable) >>
549 PAGE_SHIFT]);
550 }
552 /*
553 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
554 * it automatically gets squashed when the guest's mappings go away.
555 */
556 for_each_exec_domain(d, ed)
557 destroy_gdt(ed);
559 /* Relinquish every page of memory. */
560 relinquish_list(d, &d->xenpage_list);
561 relinquish_list(d, &d->page_list);
562 }
565 int construct_dom0(struct domain *p,
566 unsigned long alloc_start,
567 unsigned long alloc_end,
568 char *image_start, unsigned long image_len,
569 char *initrd_start, unsigned long initrd_len,
570 char *cmdline)
571 {
572 char *dst;
573 int i, rc;
574 unsigned long pfn, mfn;
575 unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT;
576 unsigned long nr_pt_pages;
577 unsigned long count;
578 l2_pgentry_t *l2tab, *l2start;
579 l1_pgentry_t *l1tab = NULL, *l1start = NULL;
580 struct pfn_info *page = NULL;
581 start_info_t *si;
582 struct exec_domain *ed = p->exec_domain[0];
584 /*
585 * This fully describes the memory layout of the initial domain. All
586 * *_start addresses are page-aligned, except v_start (and v_end) which are
587 * superpage-aligned.
588 */
589 struct domain_setup_info dsi;
590 unsigned long vinitrd_start;
591 unsigned long vinitrd_end;
592 unsigned long vphysmap_start;
593 unsigned long vphysmap_end;
594 unsigned long vstartinfo_start;
595 unsigned long vstartinfo_end;
596 unsigned long vstack_start;
597 unsigned long vstack_end;
598 unsigned long vpt_start;
599 unsigned long vpt_end;
600 unsigned long v_end;
602 /* Machine address of next candidate page-table page. */
603 unsigned long mpt_alloc;
605 extern void physdev_init_dom0(struct domain *);
607 /* Sanity! */
608 if ( p->id != 0 )
609 BUG();
610 if ( test_bit(DF_CONSTRUCTED, &p->d_flags) )
611 BUG();
613 memset(&dsi, 0, sizeof(struct domain_setup_info));
615 printk("*** LOADING DOMAIN 0 ***\n");
617 /*
618 * This is all a bit grim. We've moved the modules to the "safe" physical
619 * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this
620 * routine we're going to copy them down into the region that's actually
621 * been allocated to domain 0. This is highly likely to be overlapping, so
622 * we use a forward copy.
623 *
624 * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with
625 * 4GB and lots of network/disk cards that allocate loads of buffers.
626 * We'll have to revisit this if we ever support PAE (64GB).
627 */
629 rc = parseelfimage(image_start, image_len, &dsi);
630 if ( rc != 0 )
631 return rc;
633 /* Set up domain options */
634 if ( dsi.use_writable_pagetables )
635 vm_assist(p, VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
637 if ( (dsi.v_start & (PAGE_SIZE-1)) != 0 )
638 {
639 printk("Initial guest OS must load to a page boundary.\n");
640 return -EINVAL;
641 }
643 /*
644 * Why do we need this? The number of page-table frames depends on the
645 * size of the bootstrap address space. But the size of the address space
646 * depends on the number of page-table frames (since each one is mapped
647 * read-only). We have a pair of simultaneous equations in two unknowns,
648 * which we solve by exhaustive search.
649 */
650 vinitrd_start = round_pgup(dsi.v_kernend);
651 vinitrd_end = vinitrd_start + initrd_len;
652 vphysmap_start = round_pgup(vinitrd_end);
653 vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long));
654 vpt_start = round_pgup(vphysmap_end);
655 for ( nr_pt_pages = 2; ; nr_pt_pages++ )
656 {
657 vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
658 vstartinfo_start = vpt_end;
659 vstartinfo_end = vstartinfo_start + PAGE_SIZE;
660 vstack_start = vstartinfo_end;
661 vstack_end = vstack_start + PAGE_SIZE;
662 v_end = (vstack_end + (1<<22)-1) & ~((1<<22)-1);
663 if ( (v_end - vstack_end) < (512 << 10) )
664 v_end += 1 << 22; /* Add extra 4MB to get >= 512kB padding. */
665 if ( (((v_end - dsi.v_start + ((1<<L2_PAGETABLE_SHIFT)-1)) >>
666 L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
667 break;
668 }
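/*
 * The loop above terminates once nr_pt_pages is self-consistent: one L1
 * table per 4MB spanned by [v_start, v_end), plus the single L2 table,
 * must fit within the nr_pt_pages frames reserved at vpt_start -- and
 * growing that reservation is what pushes v_end up in the first place.
 */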
670 printk("PHYSICAL MEMORY ARRANGEMENT:\n"
671 " Kernel image: %p->%p\n"
672 " Initrd image: %p->%p\n"
673 " Dom0 alloc.: %08lx->%08lx\n",
674 image_start, image_start + image_len,
675 initrd_start, initrd_start + initrd_len,
676 alloc_start, alloc_end);
677 printk("VIRTUAL MEMORY ARRANGEMENT:\n"
678 " Loaded kernel: %08lx->%08lx\n"
679 " Init. ramdisk: %08lx->%08lx\n"
680 " Phys-Mach map: %08lx->%08lx\n"
681 " Page tables: %08lx->%08lx\n"
682 " Start info: %08lx->%08lx\n"
683 " Boot stack: %08lx->%08lx\n"
684 " TOTAL: %08lx->%08lx\n",
685 dsi.v_kernstart, dsi.v_kernend,
686 vinitrd_start, vinitrd_end,
687 vphysmap_start, vphysmap_end,
688 vpt_start, vpt_end,
689 vstartinfo_start, vstartinfo_end,
690 vstack_start, vstack_end,
691 dsi.v_start, v_end);
692 printk(" ENTRY ADDRESS: %08lx\n", dsi.v_kernentry);
694 if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
695 {
696 printk("Initial guest OS requires too much space\n"
697 "(%luMB is greater than %luMB limit)\n",
698 (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
699 return -ENOMEM;
700 }
702 /*
703 * Protect the lowest 1GB of memory. We use a temporary mapping there
704 * from which we copy the kernel and ramdisk images.
705 */
706 if ( dsi.v_start < (1<<30) )
707 {
708 printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
709 return -EINVAL;
710 }
712 /* Paranoia: scrub DOM0's memory allocation. */
713 printk("Scrubbing DOM0 RAM: ");
714 dst = (char *)alloc_start;
715 while ( dst < (char *)alloc_end )
716 {
717 #define SCRUB_BYTES (100 * 1024 * 1024) /* 100MB */
718 printk(".");
719 touch_nmi_watchdog();
720 if ( ((char *)alloc_end - dst) > SCRUB_BYTES )
721 {
722 memset(dst, 0, SCRUB_BYTES);
723 dst += SCRUB_BYTES;
724 }
725 else
726 {
727 memset(dst, 0, (char *)alloc_end - dst);
728 break;
729 }
730 }
731 printk("done.\n");
733 /* Construct a frame-allocation list for the initial domain. */
734 for ( mfn = (alloc_start>>PAGE_SHIFT);
735 mfn < (alloc_end>>PAGE_SHIFT);
736 mfn++ )
737 {
738 page = &frame_table[mfn];
739 page->u.inuse.domain = p;
740 page->u.inuse.type_info = 0;
741 page->count_info = PGC_allocated | 1;
742 list_add_tail(&page->list, &p->page_list);
743 p->tot_pages++; p->max_pages++;
744 }
746 mpt_alloc = (vpt_start - dsi.v_start) + alloc_start;
748 SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES);
749 SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS);
751 /*
752 * We're basically forcing default RPLs to 1, so that our "what privilege
753 * level are we returning to?" logic works.
754 */
755 ed->thread.failsafe_selector = FLAT_GUESTOS_CS;
756 ed->thread.event_selector = FLAT_GUESTOS_CS;
757 ed->thread.guestos_ss = FLAT_GUESTOS_DS;
758 for ( i = 0; i < 256; i++ )
759 ed->thread.traps[i].cs = FLAT_GUESTOS_CS;
761 /* WARNING: The new domain must have its 'processor' field filled in! */
762 l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
763 memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
764 l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
765 mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR);
766 l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
767 mk_l2_pgentry(__pa(p->mm_perdomain_pt) | __PAGE_HYPERVISOR);
768 ed->mm.pagetable = mk_pagetable((unsigned long)l2start);
770 l2tab += l2_table_offset(dsi.v_start);
771 mfn = alloc_start >> PAGE_SHIFT;
772 for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
773 {
774 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
775 {
776 l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
777 mpt_alloc += PAGE_SIZE;
778 *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT);
779 clear_page(l1tab);
780 if ( count == 0 )
781 l1tab += l1_table_offset(dsi.v_start);
782 }
783 *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);
785 page = &frame_table[mfn];
786 if ( !get_page_and_type(page, p, PGT_writable_page) )
787 BUG();
789 mfn++;
790 }
792 /* Pages that are part of page tables must be read only. */
793 l2tab = l2start + l2_table_offset(vpt_start);
794 l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
795 l1tab += l1_table_offset(vpt_start);
796 l2tab++;
797 for ( count = 0; count < nr_pt_pages; count++ )
798 {
799 *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
800 page = &frame_table[l1_pgentry_to_pagenr(*l1tab)];
801 if ( count == 0 )
802 {
803 page->u.inuse.type_info &= ~PGT_type_mask;
804 page->u.inuse.type_info |= PGT_l2_page_table;
806 /*
807 * No longer writable: decrement the type_count.
808 * Installed as CR3: increment both the ref_count and type_count.
809 * Net: just increment the ref_count.
810 */
811 get_page(page, p); /* an extra ref because of readable mapping */
813 /* Get another ref to L2 page so that it can be pinned. */
814 if ( !get_page_and_type(page, p, PGT_l2_page_table) )
815 BUG();
816 set_bit(_PGT_pinned, &page->u.inuse.type_info);
817 }
818 else
819 {
820 page->u.inuse.type_info &= ~PGT_type_mask;
821 page->u.inuse.type_info |= PGT_l1_page_table;
822 page->u.inuse.type_info |=
823 ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;
825 /*
826 * No longer writable: decrement the type_count.
827 * This is an L1 page, installed in a validated L2 page:
828 * increment both the ref_count and type_count.
829 * Net: just increment the ref_count.
830 */
831 get_page(page, p); /* an extra ref because of readable mapping */
832 }
833 l1tab++;
834 if( !((unsigned long)l1tab & (PAGE_SIZE - 1)) )
835 l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
836 }
838 /* Set up shared-info area. */
839 update_dom_time(p);
840 p->shared_info->domain_time = 0;
841 /* Mask all upcalls... */
842 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
843 p->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
844 p->shared_info->n_vcpu = smp_num_cpus;
846 /* Install the new page tables. */
847 __cli();
848 write_ptbase(&ed->mm);
850 /* Copy the OS image. */
851 (void)loadelfimage(image_start);
853 /* Copy the initial ramdisk. */
854 if ( initrd_len != 0 )
855 memcpy((void *)vinitrd_start, initrd_start, initrd_len);
857 /* Set up start info area. */
858 si = (start_info_t *)vstartinfo_start;
859 memset(si, 0, PAGE_SIZE);
860 si->nr_pages = p->tot_pages;
861 si->shared_info = virt_to_phys(p->shared_info);
862 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
863 si->pt_base = vpt_start;
864 si->nr_pt_frames = nr_pt_pages;
865 si->mfn_list = vphysmap_start;
867 /* Write the phys->machine and machine->phys table entries. */
868 for ( pfn = 0; pfn < p->tot_pages; pfn++ )
869 {
870 mfn = pfn + (alloc_start>>PAGE_SHIFT);
871 #ifndef NDEBUG
872 #define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
873 if ( pfn > REVERSE_START )
874 mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START);
875 #endif
876 ((unsigned long *)vphysmap_start)[pfn] = mfn;
877 machine_to_phys_mapping[mfn] = pfn;
878 }
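/*
 * Note the NDEBUG trick above: in debug builds the pages beyond the
 * loaded image are handed out in reverse order, presumably so that any
 * dom0 code which wrongly assumes pfn == mfn trips over it early.
 */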
880 if ( initrd_len != 0 )
881 {
882 si->mod_start = vinitrd_start;
883 si->mod_len = initrd_len;
884 printk("Initrd len 0x%lx, start at 0x%08lx\n",
885 si->mod_len, si->mod_start);
886 }
888 dst = si->cmd_line;
889 if ( cmdline != NULL )
890 {
891 for ( i = 0; i < 255; i++ )
892 {
893 if ( cmdline[i] == '\0' )
894 break;
895 *dst++ = cmdline[i];
896 }
897 }
898 *dst = '\0';
900 /* Reinstate the caller's page tables. */
901 write_ptbase(&current->mm);
902 __sti();
904 /* Destroy low mappings - they were only for our convenience. */
905 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
906 if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE )
907 l2start[i] = mk_l2_pgentry(0);
908 zap_low_mappings(); /* Do the same for the idle page tables. */
910 /* DOM0 gets access to everything. */
911 physdev_init_dom0(p);
913 set_bit(DF_CONSTRUCTED, &p->d_flags);
915 new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start);
917 #if 0 /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */
918 shadow_lock(&p->mm);
919 shadow_mode_enable(p, SHM_test);
920 shadow_unlock(&p->mm);
921 #endif
923 return 0;
924 }