
view xen/arch/x86/domain.c @ 3136:f1c44a4d4998

bitkeeper revision 1.1159.1.446 (41a48ee42Omqs3zoJHTZPhLlPx5LUw)

Merge arcadians.cl.cam.ac.uk:/auto/groups/xeno/BK/xen-unstable.bk
into arcadians.cl.cam.ac.uk:/auto/groups/xeno/users/cl349/BK/xen.bk-smp
author cl349@arcadians.cl.cam.ac.uk
date Wed Nov 24 13:38:44 2004 +0000 (2004-11-24)
parents 2befcd58b59b 2754a2ed61c3
children 75f82adfcc90
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/lib.h>
16 #include <xen/errno.h>
17 #include <xen/sched.h>
18 #include <xen/smp.h>
19 #include <xen/delay.h>
20 #include <xen/softirq.h>
21 #include <asm/regs.h>
22 #include <asm/mc146818rtc.h>
23 #include <asm/system.h>
24 #include <asm/io.h>
25 #include <asm/processor.h>
26 #include <asm/desc.h>
27 #include <asm/i387.h>
28 #include <asm/mpspec.h>
29 #include <asm/ldt.h>
30 #include <xen/irq.h>
31 #include <xen/event.h>
32 #include <asm/shadow.h>
33 #include <xen/console.h>
34 #include <xen/elf.h>
36 #if !defined(CONFIG_X86_64BITMODE)
37 /* No ring-3 access in initial page tables. */
38 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
39 #else
40 /* Allow ring-3 access in long mode as guest cannot use ring 1. */
41 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
42 #endif
43 #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
44 #define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
45 #define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
47 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
48 #define round_pgdown(_p) ((_p)&PAGE_MASK)
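/*
 * Worked example of the rounding arithmetic, assuming the usual 4kB x86
 * page size (PAGE_SIZE == 0x1000, PAGE_MASK == ~0xfffUL):
 *   round_pgup(0x12345)   == 0x13000
 *   round_pgdown(0x12345) == 0x12000
 * Addresses that are already page-aligned are unchanged by both macros.
 */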
50 int hlt_counter;
52 void disable_hlt(void)
53 {
54 hlt_counter++;
55 }
57 void enable_hlt(void)
58 {
59 hlt_counter--;
60 }
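/*
 * disable_hlt()/enable_hlt() are meant to be used as a matched pair around
 * code that must not let the idle loop execute HLT (default_idle() below
 * only halts while hlt_counter is zero). A minimal usage sketch:
 *
 *     disable_hlt();
 *     ... work during which the CPU must not be halted ...
 *     enable_hlt();
 */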
62 /*
63 * We use this if we don't have any better
64 * idle routine.
65 */
66 static void default_idle(void)
67 {
68 if ( hlt_counter == 0 )
69 {
70 __cli();
71 if ( !softirq_pending(smp_processor_id()) )
72 safe_halt();
73 else
74 __sti();
75 }
76 }
78 void continue_cpu_idle_loop(void)
79 {
80 int cpu = smp_processor_id();
81 for ( ; ; )
82 {
83 irq_stat[cpu].idle_timestamp = jiffies;
84 while ( !softirq_pending(cpu) )
85 default_idle();
86 do_softirq();
87 }
88 }
90 void startup_cpu_idle_loop(void)
91 {
92 /* Just some sanity to ensure that the scheduler is set up okay. */
93 ASSERT(current->domain->id == IDLE_DOMAIN_ID);
94 domain_unpause_by_systemcontroller(current->domain);
95 __enter_scheduler();
97 /*
98 * Declare CPU setup done to the boot processor; the memory barrier
99 * ensures that our state is visible to it.
100 */
101 smp_mb();
102 init_idle();
104 continue_cpu_idle_loop();
105 }
107 static long no_idt[2];
108 static int reboot_mode;
109 int reboot_thru_bios = 0;
111 #ifdef CONFIG_SMP
112 int reboot_smp = 0;
113 static int reboot_cpu = -1;
114 /* shamelessly grabbed from lib/vsprintf.c for readability */
115 #define is_digit(c) ((c) >= '0' && (c) <= '9')
116 #endif
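/*
 * kb_wait() below polls the keyboard controller status port (0x64) until
 * its input buffer is reported empty (bit 1 clear), or gives up after
 * 0x10000 reads, so that the reset command machine_restart() later writes
 * to port 0x64 is not lost.
 */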
119 static inline void kb_wait(void)
120 {
121 int i;
123 for (i=0; i<0x10000; i++)
124 if ((inb_p(0x64) & 0x02) == 0)
125 break;
126 }
129 void machine_restart(char * __unused)
130 {
131 extern int opt_noreboot;
132 #ifdef CONFIG_SMP
133 int cpuid;
134 #endif
136 if ( opt_noreboot )
137 {
138 printk("Reboot disabled on cmdline: require manual reset\n");
139 for ( ; ; ) __asm__ __volatile__ ("hlt");
140 }
142 #ifdef CONFIG_SMP
143 cpuid = GET_APIC_ID(apic_read(APIC_ID));
145 /* KAF: Need interrupts enabled for safe IPI. */
146 __sti();
148 if (reboot_smp) {
150 /* Check to see if reboot_cpu is valid;
151 if it's not, default to the BSP. */
152 if ((reboot_cpu == -1) ||
153 (reboot_cpu > (NR_CPUS -1)) ||
154 !(phys_cpu_present_map & (1<<cpuid)))
155 reboot_cpu = boot_cpu_physical_apicid;
157 reboot_smp = 0; /* use this as a flag to only go through this once */
158 /* Re-run this function on the other CPUs:
159 it will fall through this section since we have
160 cleared reboot_smp, and do the reboot if it is the
161 correct CPU; otherwise it halts. */
162 if (reboot_cpu != cpuid)
163 smp_call_function((void *)machine_restart, NULL, 1, 0);
164 }
166 /* If reboot_cpu is still -1, then we want a traditional reboot,
167 and if we are not running on the reboot_cpu, halt. */
168 if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
169 for (;;)
170 __asm__ __volatile__ ("hlt");
171 }
172 /*
173 * Stop all CPUs and turn off local APICs and the IO-APIC, so
174 * other OSs see a clean IRQ state.
175 */
176 smp_send_stop();
177 disable_IO_APIC();
178 #endif
180 if(!reboot_thru_bios) {
181 /* rebooting needs to touch the page at absolute addr 0 */
182 *((unsigned short *)__va(0x472)) = reboot_mode;
183 for (;;) {
184 int i;
185 for (i=0; i<100; i++) {
186 kb_wait();
187 udelay(50);
188 outb(0xfe,0x64); /* pulse reset low */
189 udelay(50);
190 }
191 /* That didn't work - force a triple fault. */
192 __asm__ __volatile__("lidt %0": "=m" (no_idt));
193 __asm__ __volatile__("int3");
194 }
195 }
197 panic("Need to reinclude BIOS reboot code\n");
198 }
201 void __attribute__((noreturn)) __machine_halt(void *unused)
202 {
203 for ( ; ; )
204 __asm__ __volatile__ ( "cli; hlt" );
205 }
207 void machine_halt(void)
208 {
209 smp_call_function(__machine_halt, NULL, 1, 1);
210 __machine_halt(NULL);
211 }
213 void free_perdomain_pt(struct domain *d)
214 {
215 free_xenheap_page((unsigned long)d->mm_perdomain_pt);
216 }
218 void arch_do_createdomain(struct exec_domain *ed)
219 {
220 struct domain *d = ed->domain;
221 d->shared_info = (void *)alloc_xenheap_page();
222 memset(d->shared_info, 0, PAGE_SIZE);
223 ed->vcpu_info = &d->shared_info->vcpu_data[ed->eid];
224 d->shared_info->arch.mfn_to_pfn_start =
225 virt_to_phys(&machine_to_phys_mapping[0])>>PAGE_SHIFT;
226 SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
227 machine_to_phys_mapping[virt_to_phys(d->shared_info) >>
228 PAGE_SHIFT] = INVALID_P2M_ENTRY;
230 d->mm_perdomain_pt = (l1_pgentry_t *)alloc_xenheap_page();
231 memset(d->mm_perdomain_pt, 0, PAGE_SIZE);
232 machine_to_phys_mapping[virt_to_phys(d->mm_perdomain_pt) >>
233 PAGE_SHIFT] = INVALID_P2M_ENTRY;
234 ed->mm.perdomain_ptes = d->mm_perdomain_pt;
235 }
237 int arch_final_setup_guestos(struct exec_domain *d, full_execution_context_t *c)
238 {
239 unsigned long phys_basetab;
240 int i, rc;
242 clear_bit(EDF_DONEFPUINIT, &d->ed_flags);
243 if ( c->flags & ECF_I387_VALID )
244 set_bit(EDF_DONEFPUINIT, &d->ed_flags);
246 memcpy(&d->thread.user_ctxt,
247 &c->cpu_ctxt,
248 sizeof(d->thread.user_ctxt));
250 /*
251 * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
252 * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
253 * If SS RPL or DPL differs from CS RPL then we'll #GP.
254 */
255 if ( ((d->thread.user_ctxt.cs & 3) == 0) ||
256 ((d->thread.user_ctxt.ss & 3) == 0) )
257 return -EINVAL;
259 memcpy(&d->thread.i387,
260 &c->fpu_ctxt,
261 sizeof(d->thread.i387));
263 memcpy(d->thread.traps,
264 &c->trap_ctxt,
265 sizeof(d->thread.traps));
267 #ifdef ARCH_HAS_FAST_TRAP
268 SET_DEFAULT_FAST_TRAP(&d->thread);
269 if ( (rc = (int)set_fast_trap(d, c->fast_trap_idx)) != 0 )
270 return rc;
271 #endif
273 d->mm.ldt_base = c->ldt_base;
274 d->mm.ldt_ents = c->ldt_ents;
276 d->thread.guestos_ss = c->guestos_ss;
277 d->thread.guestos_sp = c->guestos_esp;
279 for ( i = 0; i < 8; i++ )
280 (void)set_debugreg(d, i, c->debugreg[i]);
282 d->thread.event_selector = c->event_callback_cs;
283 d->thread.event_address = c->event_callback_eip;
284 d->thread.failsafe_selector = c->failsafe_callback_cs;
285 d->thread.failsafe_address = c->failsafe_callback_eip;
287 phys_basetab = c->pt_base;
288 d->mm.pagetable = mk_pagetable(phys_basetab);
289 if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d->domain,
290 PGT_base_page_table) )
291 return -EINVAL;
293 /* Failure to set GDT is harmless. */
294 SET_GDT_ENTRIES(d, DEFAULT_GDT_ENTRIES);
295 SET_GDT_ADDRESS(d, DEFAULT_GDT_ADDRESS);
296 if ( c->gdt_ents != 0 )
297 {
298 if ( (rc = (int)set_gdt(d, c->gdt_frames, c->gdt_ents)) != 0 )
299 {
300 put_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT]);
301 return rc;
302 }
303 }
305 return 0;
306 }
308 #if defined(__i386__)
310 void new_thread(struct exec_domain *d,
311 unsigned long start_pc,
312 unsigned long start_stack,
313 unsigned long start_info)
314 {
315 execution_context_t *ec = &d->thread.user_ctxt;
317 /*
318 * Initial register values:
319 * DS,ES,FS,GS = FLAT_RING1_DS
320 * CS:EIP = FLAT_RING1_CS:start_pc
321 * SS:ESP = FLAT_RING1_DS:start_stack
322 * ESI = start_info
323 * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
324 */
325 ec->ds = ec->es = ec->fs = ec->gs = ec->ss = FLAT_RING1_DS;
326 ec->cs = FLAT_RING1_CS;
327 ec->eip = start_pc;
328 ec->esp = start_stack;
329 ec->esi = start_info;
331 __save_flags(ec->eflags);
332 ec->eflags |= X86_EFLAGS_IF;
334 /* No fast trap at start of day. */
335 SET_DEFAULT_FAST_TRAP(&d->thread);
336 }
339 /*
340 * This special macro can be used to load a debugging register
341 */
342 #define loaddebug(thread,register) \
343 __asm__("movl %0,%%db" #register \
344 : /* no output */ \
345 :"r" (thread->debugreg[register]))
348 void switch_to(struct exec_domain *prev_p, struct exec_domain *next_p)
349 {
350 struct thread_struct *next = &next_p->thread;
351 struct tss_struct *tss = init_tss + smp_processor_id();
352 execution_context_t *stack_ec = get_execution_context();
353 int i;
355 __cli();
357 /* Switch guest general-register state. */
358 if ( !is_idle_task(prev_p->domain) )
359 {
360 memcpy(&prev_p->thread.user_ctxt,
361 stack_ec,
362 sizeof(*stack_ec));
363 unlazy_fpu(prev_p);
364 CLEAR_FAST_TRAP(&prev_p->thread);
365 }
367 if ( !is_idle_task(next_p->domain) )
368 {
369 memcpy(stack_ec,
370 &next_p->thread.user_ctxt,
371 sizeof(*stack_ec));
373 SET_FAST_TRAP(&next_p->thread);
375 /* Switch the guest OS ring-1 stack. */
376 tss->esp1 = next->guestos_sp;
377 tss->ss1 = next->guestos_ss;
379 /* Maybe switch the debug registers. */
380 if ( unlikely(next->debugreg[7]) )
381 {
382 loaddebug(next, 0);
383 loaddebug(next, 1);
384 loaddebug(next, 2);
385 loaddebug(next, 3);
386 /* no 4 and 5 */
387 loaddebug(next, 6);
388 loaddebug(next, 7);
389 }
391 /* Switch page tables. */
392 write_ptbase(&next_p->mm);
393 }
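/*
 * Lazily switch the I/O permission bitmap held in the TSS. A clear bit in
 * io_bitmap_sel selects a chunk of IOBMP_BYTES_PER_SELBIT bytes carrying
 * non-default permissions: such chunks are reset to ~0 (all ports denied)
 * when the outgoing domain leaves and copied from the incoming domain's
 * private bitmap when it arrives; chunks whose selector bit is set are
 * left untouched.
 */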
395 if ( unlikely(prev_p->thread.io_bitmap != NULL) )
396 {
397 for ( i = 0; i < sizeof(prev_p->thread.io_bitmap_sel) * 8; i++ )
398 if ( !test_bit(i, &prev_p->thread.io_bitmap_sel) )
399 memset(&tss->io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
400 ~0U, IOBMP_BYTES_PER_SELBIT);
401 tss->bitmap = IOBMP_INVALID_OFFSET;
402 }
404 if ( unlikely(next_p->thread.io_bitmap != NULL) )
405 {
406 for ( i = 0; i < sizeof(next_p->thread.io_bitmap_sel) * 8; i++ )
407 if ( !test_bit(i, &next_p->thread.io_bitmap_sel) )
408 memcpy(&tss->io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
409 &next_p->thread.io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
410 IOBMP_BYTES_PER_SELBIT);
411 tss->bitmap = IOBMP_OFFSET;
412 }
414 set_current(next_p);
416 /* Switch GDT and LDT. */
417 __asm__ __volatile__ ("lgdt %0" : "=m" (*next_p->mm.gdt));
418 load_LDT(next_p);
420 __sti();
421 }
424 /* XXX Currently the 'domain' field is ignored! XXX */
425 long do_iopl(domid_t domain, unsigned int new_io_pl)
426 {
427 execution_context_t *ec = get_execution_context();
428 ec->eflags = (ec->eflags & 0xffffcfff) | ((new_io_pl&3) << 12);
429 return 0;
430 }
432 void hypercall_create_continuation(unsigned int op, unsigned int nr_args, ...)
433 {
434 execution_context_t *ec = get_execution_context();
435 unsigned long *preg = &ec->ebx;
436 unsigned int i;
437 va_list args;
439 ec->eax = op;
440 ec->eip -= 2; /* re-execute 'int 0x82' */
442 va_start(args, nr_args);
443 for ( i = 0; i < nr_args; i++ )
444 *preg++ = va_arg(args, unsigned long);
445 va_end(args);
446 }
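/*
 * A long-running hypercall handler can call this to make the guest re-issue
 * the hypercall with updated arguments instead of blocking inside Xen. A
 * hypothetical sketch (the preemption test and argument names are invented
 * for illustration):
 *
 *     if ( need_to_preempt )
 *     {
 *         hypercall_create_continuation(__HYPERVISOR_mmu_update,
 *                                       3, ureqs, remaining, pdone);
 *         return rc;
 *     }
 *
 * On return to the guest the 'int 0x82' is re-executed, re-entering the
 * hypervisor with EAX (the op) and the argument registers already set up
 * by this function.
 */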
448 #endif
451 static void relinquish_list(struct domain *d, struct list_head *list)
452 {
453 struct list_head *ent;
454 struct pfn_info *page;
455 unsigned long x, y;
457 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
458 spin_lock_recursive(&d->page_alloc_lock);
460 ent = list->next;
461 while ( ent != list )
462 {
463 page = list_entry(ent, struct pfn_info, list);
465 /* Grab a reference to the page so it won't disappear from under us. */
466 if ( unlikely(!get_page(page, d)) )
467 {
468 /* Couldn't get a reference -- someone is freeing this page. */
469 ent = ent->next;
470 continue;
471 }
473 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
474 put_page_and_type(page);
476 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
477 put_page(page);
479 /*
480 * Forcibly invalidate base page tables at this point to break circular
481 * 'linear page table' references. This is okay because MMU structures
482 * are not shared across domains and this domain is now dead. Thus base
483 * tables are not in use so a non-zero count means circular reference.
484 */
485 y = page->u.inuse.type_info;
486 for ( ; ; )
487 {
488 x = y;
489 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
490 (PGT_base_page_table|PGT_validated)) )
491 break;
493 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
494 if ( likely(y == x) )
495 {
496 free_page_type(page, PGT_base_page_table);
497 break;
498 }
499 }
501 /* Follow the list chain and /then/ potentially free the page. */
502 ent = ent->next;
503 put_page(page);
504 }
506 spin_unlock_recursive(&d->page_alloc_lock);
507 }
510 void domain_relinquish_memory(struct domain *d)
511 {
512 struct exec_domain *ed;
514 audit_domain(d);
516 /* Ensure that no one is running over the dead domain's page tables. */
517 synchronise_pagetables(~0UL);
519 /* Exit shadow mode before deconstructing final guest page table. */
520 shadow_mode_disable(d);
522 /* Drop the in-use reference to the page-table base. */
523 for_each_exec_domain(d, ed) {
524 if ( pagetable_val(ed->mm.pagetable) != 0 )
525 put_page_and_type(&frame_table[pagetable_val(ed->mm.pagetable) >>
526 PAGE_SHIFT]);
527 }
529 /*
530 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
531 * it automatically gets squashed when the guest's mappings go away.
532 */
533 for_each_exec_domain(d, ed)
534 destroy_gdt(ed);
536 /* Relinquish every page of memory. */
537 relinquish_list(d, &d->xenpage_list);
538 relinquish_list(d, &d->page_list);
539 }
542 int construct_dom0(struct domain *p,
543 unsigned long alloc_start,
544 unsigned long alloc_end,
545 char *image_start, unsigned long image_len,
546 char *initrd_start, unsigned long initrd_len,
547 char *cmdline)
548 {
549 char *dst;
550 int i, rc;
551 unsigned long pfn, mfn;
552 unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT;
553 unsigned long nr_pt_pages;
554 unsigned long count;
555 l2_pgentry_t *l2tab, *l2start;
556 l1_pgentry_t *l1tab = NULL, *l1start = NULL;
557 struct pfn_info *page = NULL;
558 start_info_t *si;
559 struct exec_domain *ed = p->exec_domain[0];
561 /*
562 * This fully describes the memory layout of the initial domain. All
563 * *_start addresses are page-aligned, except v_start (and v_end) which are
564 * superpage-aligned.
565 */
566 struct domain_setup_info dsi;
567 unsigned long vinitrd_start;
568 unsigned long vinitrd_end;
569 unsigned long vphysmap_start;
570 unsigned long vphysmap_end;
571 unsigned long vstartinfo_start;
572 unsigned long vstartinfo_end;
573 unsigned long vstack_start;
574 unsigned long vstack_end;
575 unsigned long vpt_start;
576 unsigned long vpt_end;
577 unsigned long v_end;
579 /* Machine address of next candidate page-table page. */
580 unsigned long mpt_alloc;
582 extern void physdev_init_dom0(struct domain *);
584 /* Sanity! */
585 if ( p->id != 0 )
586 BUG();
587 if ( test_bit(DF_CONSTRUCTED, &p->d_flags) )
588 BUG();
590 memset(&dsi, 0, sizeof(struct domain_setup_info));
592 printk("*** LOADING DOMAIN 0 ***\n");
594 /*
595 * This is all a bit grim. We've moved the modules to the "safe" physical
596 * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this
597 * routine we're going to copy them down into the region that's actually
598 * been allocated to domain 0. This is highly likely to be overlapping, so
599 * we use a forward copy.
600 *
601 * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with
602 * 4GB and lots of network/disk cards that allocate loads of buffers.
603 * We'll have to revisit this if we ever support PAE (64GB).
604 */
606 rc = parseelfimage(image_start, image_len, &dsi);
607 if ( rc != 0 )
608 return rc;
610 /* Set up domain options */
611 if ( dsi.use_writable_pagetables )
612 vm_assist(p, VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
614 if ( (dsi.v_start & (PAGE_SIZE-1)) != 0 )
615 {
616 printk("Initial guest OS must load to a page boundary.\n");
617 return -EINVAL;
618 }
620 /*
621 * Why do we need this? The number of page-table frames depends on the
622 * size of the bootstrap address space. But the size of the address space
623 * depends on the number of page-table frames (since each one is mapped
624 * read-only). We have a pair of simultaneous equations in two unknowns,
625 * which we solve by exhaustive search.
626 */
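/*
 * Worked example (figures invented for illustration): if the kernel,
 * ramdisk, phys-to-machine map, start-info page and stack together come
 * to just under 12MB, the bootstrap region spans three 4MB superpages, so
 * it needs three L1 pages plus the single L2 page: four page-table frames
 * in all. The loop below therefore stops at the first nr_pt_pages
 * satisfying
 *   ((v_end - v_start + 4MB - 1) >> L2_PAGETABLE_SHIFT) + 1 <= nr_pt_pages,
 * i.e. 4 in this case, provided that adding those frames does not push
 * v_end over the next 4MB boundary (which is exactly why the search has
 * to be exhaustive).
 */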
627 vinitrd_start = round_pgup(dsi.v_kernend);
628 vinitrd_end = vinitrd_start + initrd_len;
629 vphysmap_start = round_pgup(vinitrd_end);
630 vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long));
631 vpt_start = round_pgup(vphysmap_end);
632 for ( nr_pt_pages = 2; ; nr_pt_pages++ )
633 {
634 vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
635 vstartinfo_start = vpt_end;
636 vstartinfo_end = vstartinfo_start + PAGE_SIZE;
637 vstack_start = vstartinfo_end;
638 vstack_end = vstack_start + PAGE_SIZE;
639 v_end = (vstack_end + (1<<22)-1) & ~((1<<22)-1);
640 if ( (v_end - vstack_end) < (512 << 10) )
641 v_end += 1 << 22; /* Add extra 4MB to get >= 512kB padding. */
642 if ( (((v_end - dsi.v_start + ((1<<L2_PAGETABLE_SHIFT)-1)) >>
643 L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
644 break;
645 }
647 printk("PHYSICAL MEMORY ARRANGEMENT:\n"
648 " Kernel image: %p->%p\n"
649 " Initrd image: %p->%p\n"
650 " Dom0 alloc.: %08lx->%08lx\n",
651 image_start, image_start + image_len,
652 initrd_start, initrd_start + initrd_len,
653 alloc_start, alloc_end);
654 printk("VIRTUAL MEMORY ARRANGEMENT:\n"
655 " Loaded kernel: %08lx->%08lx\n"
656 " Init. ramdisk: %08lx->%08lx\n"
657 " Phys-Mach map: %08lx->%08lx\n"
658 " Page tables: %08lx->%08lx\n"
659 " Start info: %08lx->%08lx\n"
660 " Boot stack: %08lx->%08lx\n"
661 " TOTAL: %08lx->%08lx\n",
662 dsi.v_kernstart, dsi.v_kernend,
663 vinitrd_start, vinitrd_end,
664 vphysmap_start, vphysmap_end,
665 vpt_start, vpt_end,
666 vstartinfo_start, vstartinfo_end,
667 vstack_start, vstack_end,
668 dsi.v_start, v_end);
669 printk(" ENTRY ADDRESS: %08lx\n", dsi.v_kernentry);
671 if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
672 {
673 printk("Initial guest OS requires too much space\n"
674 "(%luMB is greater than %luMB limit)\n",
675 (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
676 return -ENOMEM;
677 }
679 /*
680 * Protect the lowest 1GB of memory. We use a temporary mapping there
681 * from which we copy the kernel and ramdisk images.
682 */
683 if ( dsi.v_start < (1<<30) )
684 {
685 printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
686 return -EINVAL;
687 }
689 /* Paranoia: scrub DOM0's memory allocation. */
690 printk("Scrubbing DOM0 RAM: ");
691 dst = (char *)alloc_start;
692 while ( dst < (char *)alloc_end )
693 {
694 #define SCRUB_BYTES (100 * 1024 * 1024) /* 100MB */
695 printk(".");
696 touch_nmi_watchdog();
697 if ( ((char *)alloc_end - dst) > SCRUB_BYTES )
698 {
699 memset(dst, 0, SCRUB_BYTES);
700 dst += SCRUB_BYTES;
701 }
702 else
703 {
704 memset(dst, 0, (char *)alloc_end - dst);
705 break;
706 }
707 }
708 printk("done.\n");
710 /* Construct a frame-allocation list for the initial domain. */
711 for ( mfn = (alloc_start>>PAGE_SHIFT);
712 mfn < (alloc_end>>PAGE_SHIFT);
713 mfn++ )
714 {
715 page = &frame_table[mfn];
716 page->u.inuse.domain = p;
717 page->u.inuse.type_info = 0;
718 page->count_info = PGC_allocated | 1;
719 list_add_tail(&page->list, &p->page_list);
720 p->tot_pages++; p->max_pages++;
721 }
723 mpt_alloc = (vpt_start - dsi.v_start) + alloc_start;
725 SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES);
726 SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS);
728 /*
729 * We're basically forcing default RPLs to 1, so that our "what privilege
730 * level are we returning to?" logic works.
731 */
732 ed->thread.failsafe_selector = FLAT_GUESTOS_CS;
733 ed->thread.event_selector = FLAT_GUESTOS_CS;
734 ed->thread.guestos_ss = FLAT_GUESTOS_DS;
735 for ( i = 0; i < 256; i++ )
736 ed->thread.traps[i].cs = FLAT_GUESTOS_CS;
738 /* WARNING: The new domain must have its 'processor' field filled in! */
739 l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
740 memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
741 l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
742 mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR);
743 l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
744 mk_l2_pgentry(__pa(p->mm_perdomain_pt) | __PAGE_HYPERVISOR);
745 ed->mm.pagetable = mk_pagetable((unsigned long)l2start);
747 l2tab += l2_table_offset(dsi.v_start);
748 mfn = alloc_start >> PAGE_SHIFT;
749 for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
750 {
751 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
752 {
753 l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
754 mpt_alloc += PAGE_SIZE;
755 *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT);
756 clear_page(l1tab);
757 if ( count == 0 )
758 l1tab += l1_table_offset(dsi.v_start);
759 }
760 *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);
762 page = &frame_table[mfn];
763 if ( !get_page_and_type(page, p, PGT_writable_page) )
764 BUG();
766 mfn++;
767 }
769 /* Pages that are part of page tables must be read only. */
770 l2tab = l2start + l2_table_offset(vpt_start);
771 l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
772 l1tab += l1_table_offset(vpt_start);
773 l2tab++;
774 for ( count = 0; count < nr_pt_pages; count++ )
775 {
776 *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
777 page = &frame_table[l1_pgentry_to_pagenr(*l1tab)];
778 if ( count == 0 )
779 {
780 page->u.inuse.type_info &= ~PGT_type_mask;
781 page->u.inuse.type_info |= PGT_l2_page_table;
783 /*
784 * No longer writable: decrement the type_count.
785 * Installed as CR3: increment both the ref_count and type_count.
786 * Net: just increment the ref_count.
787 */
788 get_page(page, p); /* an extra ref because of readable mapping */
790 /* Get another ref to L2 page so that it can be pinned. */
791 if ( !get_page_and_type(page, p, PGT_l2_page_table) )
792 BUG();
793 set_bit(_PGT_pinned, &page->u.inuse.type_info);
794 }
795 else
796 {
797 page->u.inuse.type_info &= ~PGT_type_mask;
798 page->u.inuse.type_info |= PGT_l1_page_table;
799 page->u.inuse.type_info |=
800 ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;
802 /*
803 * No longer writable: decrement the type_count.
804 * This is an L1 page, installed in a validated L2 page:
805 * increment both the ref_count and type_count.
806 * Net: just increment the ref_count.
807 */
808 get_page(page, p); /* an extra ref because of readable mapping */
809 }
810 l1tab++;
811 if( !((unsigned long)l1tab & (PAGE_SIZE - 1)) )
812 l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
813 }
815 /* Set up shared-info area. */
816 update_dom_time(p);
817 p->shared_info->domain_time = 0;
818 /* Mask all upcalls... */
819 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
820 p->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
821 p->shared_info->n_vcpu = 1;
823 /* Install the new page tables. */
824 __cli();
825 write_ptbase(&ed->mm);
827 /* Copy the OS image. */
828 (void)loadelfimage(image_start);
830 /* Copy the initial ramdisk. */
831 if ( initrd_len != 0 )
832 memcpy((void *)vinitrd_start, initrd_start, initrd_len);
834 /* Set up start info area. */
835 si = (start_info_t *)vstartinfo_start;
836 memset(si, 0, PAGE_SIZE);
837 si->nr_pages = p->tot_pages;
838 si->shared_info = virt_to_phys(p->shared_info);
839 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
840 si->pt_base = vpt_start;
841 si->nr_pt_frames = nr_pt_pages;
842 si->mfn_list = vphysmap_start;
844 /* Write the phys->machine and machine->phys table entries. */
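/*
 * Note: in debug builds (#ifndef NDEBUG below) the pseudo-physical pages
 * beyond the loaded image are mapped to machine frames in reverse order,
 * presumably so that a guest which wrongly assumes pfn == mfn trips over
 * the mismatch early.
 */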
845 for ( pfn = 0; pfn < p->tot_pages; pfn++ )
846 {
847 mfn = pfn + (alloc_start>>PAGE_SHIFT);
848 #ifndef NDEBUG
849 #define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
850 if ( pfn > REVERSE_START )
851 mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START);
852 #endif
853 ((unsigned long *)vphysmap_start)[pfn] = mfn;
854 machine_to_phys_mapping[mfn] = pfn;
855 }
857 if ( initrd_len != 0 )
858 {
859 si->mod_start = vinitrd_start;
860 si->mod_len = initrd_len;
861 printk("Initrd len 0x%lx, start at 0x%08lx\n",
862 si->mod_len, si->mod_start);
863 }
865 dst = si->cmd_line;
866 if ( cmdline != NULL )
867 {
868 for ( i = 0; i < 255; i++ )
869 {
870 if ( cmdline[i] == '\0' )
871 break;
872 *dst++ = cmdline[i];
873 }
874 }
875 *dst = '\0';
877 /* Reinstate the caller's page tables. */
878 write_ptbase(&current->mm);
879 __sti();
881 /* Destroy low mappings - they were only for our convenience. */
882 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
883 if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE )
884 l2start[i] = mk_l2_pgentry(0);
885 zap_low_mappings(); /* Do the same for the idle page tables. */
887 /* DOM0 gets access to everything. */
888 physdev_init_dom0(p);
890 set_bit(DF_CONSTRUCTED, &p->d_flags);
892 new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start);
894 #if 0 /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */
895 shadow_lock(&p->mm);
896 shadow_mode_enable(p, SHM_test);
897 shadow_unlock(&p->mm);
898 #endif
900 return 0;
901 }