debuggers.hg

view xen/arch/x86/domain.c @ 3515:d331c6994d28

bitkeeper revision 1.1159.223.12 (41f14d3cE4GADmEAEr6XE9nXX4dyGw)

Common-code cleanups. Moved arch-specific code out into arch/x86
and asm-x86.
author kaf24@scramble.cl.cam.ac.uk
date Fri Jan 21 18:43:08 2005 +0000 (2005-01-21)
parents cfb5f80fb23e
children 46c14b1a4351 47059455441d dee91b44a753
line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/smp.h>
20 #include <xen/delay.h>
21 #include <xen/softirq.h>
22 #include <asm/regs.h>
23 #include <asm/mc146818rtc.h>
24 #include <asm/system.h>
25 #include <asm/io.h>
26 #include <asm/processor.h>
27 #include <asm/desc.h>
28 #include <asm/i387.h>
29 #include <asm/mpspec.h>
30 #include <asm/ldt.h>
31 #include <xen/irq.h>
32 #include <xen/event.h>
33 #include <asm/shadow.h>
34 #include <xen/console.h>
35 #include <xen/elf.h>
36 #include <xen/multicall.h>
38 /* opt_noreboot: If true, machine will need manual reset on error. */
39 static int opt_noreboot = 0;
40 boolean_param("noreboot", opt_noreboot);
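/* The option is set by passing "noreboot" on the Xen command line. */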
42 #if !defined(CONFIG_X86_64BITMODE)
43 /* No ring-3 access in initial page tables. */
44 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
45 #else
46 /* Allow ring-3 access in long mode as guest cannot use ring 1. */
47 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
48 #endif
49 #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
50 #define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
51 #define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
53 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
54 #define round_pgdown(_p) ((_p)&PAGE_MASK)
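/*
 * Page-granularity rounding helpers: with 4kB pages, round_pgup(0x1234)
 * yields 0x2000 and round_pgdown(0x1234) yields 0x1000.
 */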
56 static void default_idle(void)
57 {
58 __cli();
59 if ( !softirq_pending(smp_processor_id()) )
60 safe_halt();
61 else
62 __sti();
63 }
65 static __attribute_used__ void idle_loop(void)
66 {
67 int cpu = smp_processor_id();
68 for ( ; ; )
69 {
70 irq_stat[cpu].idle_timestamp = jiffies;
71 while ( !softirq_pending(cpu) )
72 default_idle();
73 do_softirq();
74 }
75 }
77 void startup_cpu_idle_loop(void)
78 {
79 /* Just some sanity to ensure that the scheduler is set up okay. */
80 ASSERT(current->id == IDLE_DOMAIN_ID);
81 domain_unpause_by_systemcontroller(current);
82 __enter_scheduler();
84 /*
85 * Declare CPU setup done to the boot processor; use a memory barrier
86 * to ensure that our state is visible before doing so.
87 */
88 smp_mb();
89 init_idle();
91 idle_loop();
92 }
94 static long no_idt[2];
95 static int reboot_mode;
96 int reboot_thru_bios = 0;
98 #ifdef CONFIG_SMP
99 int reboot_smp = 0;
100 static int reboot_cpu = -1;
101 /* shamelessly grabbed from lib/vsprintf.c for readability */
102 #define is_digit(c) ((c) >= '0' && (c) <= '9')
103 #endif
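/*
 * Poll the i8042 keyboard controller's status port (0x64) until its input
 * buffer is empty (bit 1 clear), or give up after 0x10000 attempts.
 */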
106 static inline void kb_wait(void)
107 {
108 int i;
110 for (i=0; i<0x10000; i++)
111 if ((inb_p(0x64) & 0x02) == 0)
112 break;
113 }
116 void machine_restart(char * __unused)
117 {
118 #ifdef CONFIG_SMP
119 int cpuid;
120 #endif
122 if ( opt_noreboot )
123 {
124 printk("Reboot disabled on cmdline: require manual reset\n");
125 for ( ; ; ) __asm__ __volatile__ ("hlt");
126 }
128 #ifdef CONFIG_SMP
129 cpuid = GET_APIC_ID(apic_read(APIC_ID));
131 /* KAF: Need interrupts enabled for safe IPI. */
132 __sti();
134 if (reboot_smp) {
136 /* Check whether reboot_cpu is valid;
137 if it's not, default to the BSP. */
138 if ((reboot_cpu == -1) ||
139 (reboot_cpu > (NR_CPUS -1)) ||
140 !(phys_cpu_present_map & (1<<cpuid)))
141 reboot_cpu = boot_cpu_physical_apicid;
143 reboot_smp = 0; /* use this as a flag so we only go through this once */
144 /* Re-run this function on the other CPUs. Each will fall through
145 this section, since we have cleared reboot_smp, and will do the
146 reboot if it is the correct CPU; otherwise it simply halts
147 in the loop below. */
148 if (reboot_cpu != cpuid)
149 smp_call_function((void *)machine_restart , NULL, 1, 0);
150 }
152 /* If reboot_cpu is still -1, then we want a traditional reboot,
153 and if we are not running on the reboot_cpu, halt. */
154 if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
155 for (;;)
156 __asm__ __volatile__ ("hlt");
157 }
158 /*
159 * Stop all CPUs and turn off local APICs and the IO-APIC, so
160 * other OSs see a clean IRQ state.
161 */
162 smp_send_stop();
163 disable_IO_APIC();
164 #endif
166 if(!reboot_thru_bios) {
167 /* Rebooting needs to write the warm-reset flag at absolute address 0x472 (BIOS data area). */
168 *((unsigned short *)__va(0x472)) = reboot_mode;
169 for (;;) {
170 int i;
171 for (i=0; i<100; i++) {
172 kb_wait();
173 udelay(50);
174 outb(0xfe,0x64); /* pulse reset low */
175 udelay(50);
176 }
177 /* That didn't work: load an empty IDT and execute int3, which escalates to a triple fault and resets the machine. */
178 __asm__ __volatile__("lidt %0": "=m" (no_idt));
179 __asm__ __volatile__("int3");
180 }
181 }
183 panic("Need to reinclude BIOS reboot code\n");
184 }
187 void __attribute__((noreturn)) __machine_halt(void *unused)
188 {
189 for ( ; ; )
190 __asm__ __volatile__ ( "cli; hlt" );
191 }
193 void machine_halt(void)
194 {
195 smp_call_function(__machine_halt, NULL, 1, 1);
196 __machine_halt(NULL);
197 }
199 void dump_pageframe_info(struct domain *d)
200 {
201 struct pfn_info *page;
202 struct list_head *ent;
204 if ( d->tot_pages < 10 )
205 {
206 list_for_each ( ent, &d->page_list )
207 {
208 page = list_entry(ent, struct pfn_info, list);
209 printk("Page %08x: caf=%08x, taf=%08x\n",
210 page_to_phys(page), page->count_info,
211 page->u.inuse.type_info);
212 }
213 }
215 page = virt_to_page(d->shared_info);
216 printk("Shared_info@%08x: caf=%08x, taf=%08x\n",
217 page_to_phys(page), page->count_info,
218 page->u.inuse.type_info);
219 }
221 xmem_cache_t *domain_struct_cachep;
222 void __init domain_startofday(void)
223 {
224 domain_struct_cachep = xmem_cache_create(
225 "domain_cache", sizeof(struct domain),
226 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
227 if ( domain_struct_cachep == NULL )
228 panic("No slab cache for domain structs.");
229 }
231 struct domain *arch_alloc_domain_struct(void)
232 {
233 return xmem_cache_alloc(domain_struct_cachep);
234 }
236 void arch_free_domain_struct(struct domain *d)
237 {
238 xmem_cache_free(domain_struct_cachep, d);
239 }
241 void free_perdomain_pt(struct domain *d)
242 {
243 free_xenheap_page((unsigned long)d->mm.perdomain_pt);
244 }
246 static void continue_idle_task(struct domain *d)
247 {
248 reset_stack_and_jump(idle_loop);
249 }
251 static void continue_nonidle_task(struct domain *d)
252 {
253 reset_stack_and_jump(ret_from_intr);
254 }
256 void arch_do_createdomain(struct domain *d)
257 {
258 #ifdef ARCH_HAS_FAST_TRAP
259 SET_DEFAULT_FAST_TRAP(&d->thread);
260 #endif
262 if ( d->id == IDLE_DOMAIN_ID )
263 {
264 d->thread.schedule_tail = continue_idle_task;
265 }
266 else
267 {
268 d->thread.schedule_tail = continue_nonidle_task;
270 d->shared_info = (void *)alloc_xenheap_page();
271 memset(d->shared_info, 0, PAGE_SIZE);
272 d->shared_info->arch.mfn_to_pfn_start = m2p_start_mfn;
273 SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
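/*
 * The shared-info frame (and the per-domain PT frame allocated below) live
 * on the Xen heap and have no pseudo-physical address of their own, so
 * their machine-to-phys entries are marked invalid.
 */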
274 machine_to_phys_mapping[virt_to_phys(d->shared_info) >>
275 PAGE_SHIFT] = INVALID_P2M_ENTRY;
277 d->mm.perdomain_pt = (l1_pgentry_t *)alloc_xenheap_page();
278 memset(d->mm.perdomain_pt, 0, PAGE_SIZE);
279 machine_to_phys_mapping[virt_to_phys(d->mm.perdomain_pt) >>
280 PAGE_SHIFT] = INVALID_P2M_ENTRY;
281 }
282 }
284 int arch_final_setup_guestos(struct domain *d, full_execution_context_t *c)
285 {
286 unsigned long phys_basetab;
287 int i, rc;
289 clear_bit(DF_DONEFPUINIT, &d->flags);
290 if ( c->flags & ECF_I387_VALID )
291 set_bit(DF_DONEFPUINIT, &d->flags);
293 memcpy(&d->thread.user_ctxt,
294 &c->cpu_ctxt,
295 sizeof(d->thread.user_ctxt));
297 /*
298 * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
299 * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
300 * If SS RPL or DPL differs from CS RPL then we'll #GP.
301 */
302 if ( ((d->thread.user_ctxt.cs & 3) == 0) ||
303 ((d->thread.user_ctxt.ss & 3) == 0) )
304 return -EINVAL;
306 memcpy(&d->thread.i387,
307 &c->fpu_ctxt,
308 sizeof(d->thread.i387));
310 memcpy(d->thread.traps,
311 &c->trap_ctxt,
312 sizeof(d->thread.traps));
314 #ifdef ARCH_HAS_FAST_TRAP
315 if ( (rc = (int)set_fast_trap(d, c->fast_trap_idx)) != 0 )
316 return rc;
317 #endif
319 d->mm.ldt_base = c->ldt_base;
320 d->mm.ldt_ents = c->ldt_ents;
322 d->thread.guestos_ss = c->guestos_ss;
323 d->thread.guestos_sp = c->guestos_esp;
325 for ( i = 0; i < 8; i++ )
326 (void)set_debugreg(d, i, c->debugreg[i]);
328 d->thread.event_selector = c->event_callback_cs;
329 d->thread.event_address = c->event_callback_eip;
330 d->thread.failsafe_selector = c->failsafe_callback_cs;
331 d->thread.failsafe_address = c->failsafe_callback_eip;
333 phys_basetab = c->pt_base;
334 d->mm.pagetable = mk_pagetable(phys_basetab);
335 if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d,
336 PGT_base_page_table) )
337 return -EINVAL;
339 /* Failure to set GDT is harmless. */
340 SET_GDT_ENTRIES(d, DEFAULT_GDT_ENTRIES);
341 SET_GDT_ADDRESS(d, DEFAULT_GDT_ADDRESS);
342 if ( c->gdt_ents != 0 )
343 {
344 if ( (rc = (int)set_gdt(d, c->gdt_frames, c->gdt_ents)) != 0 )
345 {
346 put_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT]);
347 return rc;
348 }
349 }
351 return 0;
352 }
354 #if defined(__i386__)
356 void new_thread(struct domain *d,
357 unsigned long start_pc,
358 unsigned long start_stack,
359 unsigned long start_info)
360 {
361 execution_context_t *ec = &d->thread.user_ctxt;
363 /*
364 * Initial register values:
365 * DS,ES,FS,GS = FLAT_RING1_DS
366 * CS:EIP = FLAT_RING1_CS:start_pc
367 * SS:ESP = FLAT_RING1_DS:start_stack
368 * ESI = start_info
369 * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
370 */
371 ec->ds = ec->es = ec->fs = ec->gs = ec->ss = FLAT_RING1_DS;
372 ec->cs = FLAT_RING1_CS;
373 ec->eip = start_pc;
374 ec->esp = start_stack;
375 ec->esi = start_info;
377 __save_flags(ec->eflags);
378 ec->eflags |= X86_EFLAGS_IF;
379 }
382 /*
383 * This special macro can be used to load a debugging register
384 */
385 #define loaddebug(thread,register) \
386 __asm__("movl %0,%%db" #register \
387 : /* no output */ \
388 :"r" (thread->debugreg[register]))
391 void switch_to(struct domain *prev_p, struct domain *next_p)
392 {
393 struct thread_struct *next = &next_p->thread;
394 struct tss_struct *tss = init_tss + smp_processor_id();
395 execution_context_t *stack_ec = get_execution_context();
396 int i;
398 __cli();
400 /* Switch guest general-register state. */
401 if ( !is_idle_task(prev_p) )
402 {
403 memcpy(&prev_p->thread.user_ctxt,
404 stack_ec,
405 sizeof(*stack_ec));
406 unlazy_fpu(prev_p);
407 CLEAR_FAST_TRAP(&prev_p->thread);
408 }
410 if ( !is_idle_task(next_p) )
411 {
412 memcpy(stack_ec,
413 &next_p->thread.user_ctxt,
414 sizeof(*stack_ec));
416 SET_FAST_TRAP(&next_p->thread);
418 /* Switch the guest OS ring-1 stack. */
419 tss->esp1 = next->guestos_sp;
420 tss->ss1 = next->guestos_ss;
422 /* Maybe switch the debug registers. */
423 if ( unlikely(next->debugreg[7]) )
424 {
425 loaddebug(next, 0);
426 loaddebug(next, 1);
427 loaddebug(next, 2);
428 loaddebug(next, 3);
429 /* no 4 and 5 */
430 loaddebug(next, 6);
431 loaddebug(next, 7);
432 }
434 /* Switch page tables. */
435 write_ptbase(&next_p->mm);
436 }
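/*
 * Each bit of io_bitmap_sel covers IOBMP_BYTES_PER_SELBIT bytes of the TSS
 * I/O bitmap; a clear bit marks a chunk that grants access to some ports.
 * Such chunks are re-filled with ones when switching away from a domain and
 * copied from the domain's own bitmap when switching to it.
 */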
438 if ( unlikely(prev_p->thread.io_bitmap != NULL) )
439 {
440 for ( i = 0; i < sizeof(prev_p->thread.io_bitmap_sel) * 8; i++ )
441 if ( !test_bit(i, &prev_p->thread.io_bitmap_sel) )
442 memset(&tss->io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
443 ~0U, IOBMP_BYTES_PER_SELBIT);
444 tss->bitmap = IOBMP_INVALID_OFFSET;
445 }
447 if ( unlikely(next_p->thread.io_bitmap != NULL) )
448 {
449 for ( i = 0; i < sizeof(next_p->thread.io_bitmap_sel) * 8; i++ )
450 if ( !test_bit(i, &next_p->thread.io_bitmap_sel) )
451 memcpy(&tss->io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
452 &next_p->thread.io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
453 IOBMP_BYTES_PER_SELBIT);
454 tss->bitmap = IOBMP_OFFSET;
455 }
457 set_current(next_p);
459 /* Switch GDT and LDT. */
460 __asm__ __volatile__ ("lgdt %0" : "=m" (*next_p->mm.gdt));
461 load_LDT(next_p);
463 __sti();
464 }
467 /* XXX Currently the 'domain' field is ignored! XXX */
468 long do_iopl(domid_t domain, unsigned int new_io_pl)
469 {
470 execution_context_t *ec = get_execution_context();
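/* IOPL occupies bits 12-13 of EFLAGS; mask out the old IOPL, then insert the requested one. */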
471 ec->eflags = (ec->eflags & 0xffffcfff) | ((new_io_pl&3) << 12);
472 return 0;
473 }
475 #endif
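/*
 * Arrange for a preempted hypercall to be restarted: if we are inside a
 * multicall, flag the current entry as preempted and stash its argument
 * list; otherwise rewind the guest's EIP to the 'int 0x82' instruction and
 * reload EAX and the argument registers with the op and arguments, so the
 * call re-enters Xen the next time the guest runs.
 */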
477 unsigned long hypercall_create_continuation(
478 unsigned int op, unsigned int nr_args, ...)
479 {
480 struct mc_state *mcs = &mc_state[smp_processor_id()];
481 execution_context_t *ec;
482 unsigned long *preg;
483 unsigned int i;
484 va_list args;
486 va_start(args, nr_args);
488 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
489 {
490 __set_bit(_MCSF_call_preempted, &mcs->flags);
492 for ( i = 0; i < nr_args; i++ )
493 mcs->call.args[i] = va_arg(args, unsigned long);
494 }
495 else
496 {
497 ec = get_execution_context();
498 #if defined(__i386__)
499 ec->eax = op;
500 ec->eip -= 2; /* re-execute 'int 0x82' */
502 for ( i = 0, preg = &ec->ebx; i < nr_args; i++, preg++ )
503 *preg = va_arg(args, unsigned long);
504 #else
505 preg = NULL; /* XXX x86/64 */
506 #endif
507 }
509 va_end(args);
511 return op;
512 }
514 static void relinquish_list(struct domain *d, struct list_head *list)
515 {
516 struct list_head *ent;
517 struct pfn_info *page;
518 unsigned long x, y;
520 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
521 spin_lock_recursive(&d->page_alloc_lock);
523 ent = list->next;
524 while ( ent != list )
525 {
526 page = list_entry(ent, struct pfn_info, list);
528 /* Grab a reference to the page so it won't disappear from under us. */
529 if ( unlikely(!get_page(page, d)) )
530 {
531 /* Couldn't get a reference -- someone is freeing this page. */
532 ent = ent->next;
533 continue;
534 }
536 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
537 put_page_and_type(page);
539 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
540 put_page(page);
542 /*
543 * Forcibly invalidate base page tables at this point to break circular
544 * 'linear page table' references. This is okay because MMU structures
545 * are not shared across domains and this domain is now dead. Thus the base
546 * tables are not in use, so a non-zero count means a circular reference.
547 */
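/*
 * Atomically clear PGT_validated on a still-validated base table; the
 * winner of the cmpxchg race calls free_page_type() on it.
 */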
548 y = page->u.inuse.type_info;
549 for ( ; ; )
550 {
551 x = y;
552 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
553 (PGT_base_page_table|PGT_validated)) )
554 break;
556 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
557 if ( likely(y == x) )
558 {
559 free_page_type(page, PGT_base_page_table);
560 break;
561 }
562 }
564 /* Follow the list chain and /then/ potentially free the page. */
565 ent = ent->next;
566 put_page(page);
567 }
569 spin_unlock_recursive(&d->page_alloc_lock);
570 }
573 void domain_relinquish_memory(struct domain *d)
574 {
575 /* Ensure that no CPU is still running on the dead domain's page tables. */
576 synchronise_pagetables(~0UL);
578 /* Exit shadow mode before deconstructing final guest page table. */
579 shadow_mode_disable(d);
581 /* Drop the in-use reference to the page-table base. */
582 if ( pagetable_val(d->mm.pagetable) != 0 )
583 put_page_and_type(&frame_table[pagetable_val(d->mm.pagetable) >>
584 PAGE_SHIFT]);
586 /*
587 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
588 * it automatically gets squashed when the guest's mappings go away.
589 */
590 destroy_gdt(d);
592 /* Relinquish every page of memory. */
593 relinquish_list(d, &d->xenpage_list);
594 relinquish_list(d, &d->page_list);
595 }
598 int construct_dom0(struct domain *p,
599 unsigned long alloc_start,
600 unsigned long alloc_end,
601 char *image_start, unsigned long image_len,
602 char *initrd_start, unsigned long initrd_len,
603 char *cmdline)
604 {
605 char *dst;
606 int i, rc;
607 unsigned long pfn, mfn;
608 unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT;
609 unsigned long nr_pt_pages;
610 unsigned long count;
611 l2_pgentry_t *l2tab, *l2start;
612 l1_pgentry_t *l1tab = NULL, *l1start = NULL;
613 struct pfn_info *page = NULL;
614 start_info_t *si;
616 /*
617 * This fully describes the memory layout of the initial domain. All
618 * *_start addresses are page-aligned, except v_start (and v_end), which are
619 * superpage-aligned.
620 */
621 struct domain_setup_info dsi;
622 unsigned long vinitrd_start;
623 unsigned long vinitrd_end;
624 unsigned long vphysmap_start;
625 unsigned long vphysmap_end;
626 unsigned long vstartinfo_start;
627 unsigned long vstartinfo_end;
628 unsigned long vstack_start;
629 unsigned long vstack_end;
630 unsigned long vpt_start;
631 unsigned long vpt_end;
632 unsigned long v_end;
634 /* Machine address of next candidate page-table page. */
635 unsigned long mpt_alloc;
637 extern void physdev_init_dom0(struct domain *);
639 /* Sanity! */
640 if ( p->id != 0 )
641 BUG();
642 if ( test_bit(DF_CONSTRUCTED, &p->flags) )
643 BUG();
645 memset(&dsi, 0, sizeof(struct domain_setup_info));
647 printk("*** LOADING DOMAIN 0 ***\n");
649 /*
650 * This is all a bit grim. We've moved the modules to the "safe" physical
651 * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this
652 * routine we're going to copy them down into the region that's actually
653 * been allocated to domain 0. The regions are highly likely to overlap, so
654 * we use a forward copy.
655 *
656 * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with
657 * 4GB and lots of network/disk cards that allocate loads of buffers.
658 * We'll have to revisit this if we ever support PAE (64GB).
659 */
661 rc = parseelfimage(image_start, image_len, &dsi);
662 if ( rc != 0 )
663 return rc;
665 /* Set up domain options */
666 if ( dsi.use_writable_pagetables )
667 vm_assist(p, VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
669 if ( (dsi.v_start & (PAGE_SIZE-1)) != 0 )
670 {
671 printk("Initial guest OS must load to a page boundary.\n");
672 return -EINVAL;
673 }
675 /*
676 * Why do we need this? The number of page-table frames depends on the
677 * size of the bootstrap address space. But the size of the address space
678 * depends on the number of page-table frames (since each one is mapped
679 * read-only). We have a pair of simultaneous equations in two unknowns,
680 * which we solve by exhaustive search.
681 */
682 vinitrd_start = round_pgup(dsi.v_kernend);
683 vinitrd_end = vinitrd_start + initrd_len;
684 vphysmap_start = round_pgup(vinitrd_end);
685 vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long));
686 vpt_start = round_pgup(vphysmap_end);
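/*
 * Search for the smallest nr_pt_pages such that one L2 page plus one L1
 * page per 4MB of the address range [v_start, v_end), whose size itself
 * grows with nr_pt_pages, still fits in nr_pt_pages frames.
 */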
687 for ( nr_pt_pages = 2; ; nr_pt_pages++ )
688 {
689 vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
690 vstartinfo_start = vpt_end;
691 vstartinfo_end = vstartinfo_start + PAGE_SIZE;
692 vstack_start = vstartinfo_end;
693 vstack_end = vstack_start + PAGE_SIZE;
694 v_end = (vstack_end + (1<<22)-1) & ~((1<<22)-1);
695 if ( (v_end - vstack_end) < (512 << 10) )
696 v_end += 1 << 22; /* Add extra 4MB to get >= 512kB padding. */
697 if ( (((v_end - dsi.v_start + ((1<<L2_PAGETABLE_SHIFT)-1)) >>
698 L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
699 break;
700 }
702 printk("PHYSICAL MEMORY ARRANGEMENT:\n"
703 " Kernel image: %p->%p\n"
704 " Initrd image: %p->%p\n"
705 " Dom0 alloc.: %08lx->%08lx\n",
706 image_start, image_start + image_len,
707 initrd_start, initrd_start + initrd_len,
708 alloc_start, alloc_end);
709 printk("VIRTUAL MEMORY ARRANGEMENT:\n"
710 " Loaded kernel: %08lx->%08lx\n"
711 " Init. ramdisk: %08lx->%08lx\n"
712 " Phys-Mach map: %08lx->%08lx\n"
713 " Page tables: %08lx->%08lx\n"
714 " Start info: %08lx->%08lx\n"
715 " Boot stack: %08lx->%08lx\n"
716 " TOTAL: %08lx->%08lx\n",
717 dsi.v_kernstart, dsi.v_kernend,
718 vinitrd_start, vinitrd_end,
719 vphysmap_start, vphysmap_end,
720 vpt_start, vpt_end,
721 vstartinfo_start, vstartinfo_end,
722 vstack_start, vstack_end,
723 dsi.v_start, v_end);
724 printk(" ENTRY ADDRESS: %08lx\n", dsi.v_kernentry);
726 if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
727 {
728 printk("Initial guest OS requires too much space\n"
729 "(%luMB is greater than %luMB limit)\n",
730 (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
731 return -ENOMEM;
732 }
734 /*
735 * Protect the lowest 1GB of memory. We use a temporary mapping there
736 * from which we copy the kernel and ramdisk images.
737 */
738 if ( dsi.v_start < (1<<30) )
739 {
740 printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
741 return -EINVAL;
742 }
744 /* Paranoia: scrub DOM0's memory allocation. */
745 printk("Scrubbing DOM0 RAM: ");
746 dst = (char *)alloc_start;
747 while ( dst < (char *)alloc_end )
748 {
749 #define SCRUB_BYTES (100 * 1024 * 1024) /* 100MB */
750 printk(".");
751 touch_nmi_watchdog();
752 if ( ((char *)alloc_end - dst) > SCRUB_BYTES )
753 {
754 memset(dst, 0, SCRUB_BYTES);
755 dst += SCRUB_BYTES;
756 }
757 else
758 {
759 memset(dst, 0, (char *)alloc_end - dst);
760 break;
761 }
762 }
763 printk("done.\n");
765 /* Construct a frame-allocation list for the initial domain. */
766 for ( mfn = (alloc_start>>PAGE_SHIFT);
767 mfn < (alloc_end>>PAGE_SHIFT);
768 mfn++ )
769 {
770 page = &frame_table[mfn];
771 page->u.inuse.domain = p;
772 page->u.inuse.type_info = 0;
773 page->count_info = PGC_allocated | 1;
774 list_add_tail(&page->list, &p->page_list);
775 p->tot_pages++; p->max_pages++;
776 }
778 mpt_alloc = (vpt_start - dsi.v_start) + alloc_start;
780 SET_GDT_ENTRIES(p, DEFAULT_GDT_ENTRIES);
781 SET_GDT_ADDRESS(p, DEFAULT_GDT_ADDRESS);
783 /*
784 * We're basically forcing default RPLs to 1, so that our "what privilege
785 * level are we returning to?" logic works.
786 */
787 p->thread.failsafe_selector = FLAT_GUESTOS_CS;
788 p->thread.event_selector = FLAT_GUESTOS_CS;
789 p->thread.guestos_ss = FLAT_GUESTOS_DS;
790 for ( i = 0; i < 256; i++ )
791 p->thread.traps[i].cs = FLAT_GUESTOS_CS;
793 /* WARNING: The new domain must have its 'processor' field filled in! */
794 l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
795 memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
796 l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
797 mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR);
798 l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
799 mk_l2_pgentry(__pa(p->mm.perdomain_pt) | __PAGE_HYPERVISOR);
800 p->mm.pagetable = mk_pagetable((unsigned long)l2start);
802 l2tab += l2_table_offset(dsi.v_start);
803 mfn = alloc_start >> PAGE_SHIFT;
804 for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
805 {
806 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
807 {
808 l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
809 mpt_alloc += PAGE_SIZE;
810 *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT);
811 clear_page(l1tab);
812 if ( count == 0 )
813 l1tab += l1_table_offset(dsi.v_start);
814 }
815 *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);
817 page = &frame_table[mfn];
818 if ( !get_page_and_type(page, p, PGT_writable_page) )
819 BUG();
821 mfn++;
822 }
824 /* Pages that are part of page tables must be read-only. */
825 l2tab = l2start + l2_table_offset(vpt_start);
826 l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
827 l1tab += l1_table_offset(vpt_start);
828 l2tab++;
829 for ( count = 0; count < nr_pt_pages; count++ )
830 {
831 *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
832 page = &frame_table[l1_pgentry_to_pagenr(*l1tab)];
833 if ( count == 0 )
834 {
835 page->u.inuse.type_info &= ~PGT_type_mask;
836 page->u.inuse.type_info |= PGT_l2_page_table;
838 /*
839 * No longer writable: decrement the type_count.
840 * Installed as CR3: increment both the ref_count and type_count.
841 * Net: just increment the ref_count.
842 */
843 get_page(page, p); /* an extra ref because of readable mapping */
845 /* Get another ref to L2 page so that it can be pinned. */
846 if ( !get_page_and_type(page, p, PGT_l2_page_table) )
847 BUG();
848 set_bit(_PGT_pinned, &page->u.inuse.type_info);
849 }
850 else
851 {
852 page->u.inuse.type_info &= ~PGT_type_mask;
853 page->u.inuse.type_info |= PGT_l1_page_table;
854 page->u.inuse.type_info |=
855 ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;
857 /*
858 * No longer writable: decrement the type_count.
859 * This is an L1 page, installed in a validated L2 page:
860 * increment both the ref_count and type_count.
861 * Net: just increment the ref_count.
862 */
863 get_page(page, p); /* an extra ref because of readable mapping */
864 }
865 l1tab++;
866 if ( !((unsigned long)l1tab & (PAGE_SIZE - 1)) )
867 l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
868 }
870 /* Set up shared-info area. */
871 update_dom_time(p->shared_info);
872 p->shared_info->domain_time = 0;
873 /* Mask all upcalls... */
874 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
875 p->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
877 /* Install the new page tables. */
878 __cli();
879 write_ptbase(&p->mm);
881 /* Copy the OS image. */
882 (void)loadelfimage(image_start);
884 /* Copy the initial ramdisk. */
885 if ( initrd_len != 0 )
886 memcpy((void *)vinitrd_start, initrd_start, initrd_len);
888 /* Set up start info area. */
889 si = (start_info_t *)vstartinfo_start;
890 memset(si, 0, PAGE_SIZE);
891 si->nr_pages = p->tot_pages;
892 si->shared_info = virt_to_phys(p->shared_info);
893 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
894 si->pt_base = vpt_start;
895 si->nr_pt_frames = nr_pt_pages;
896 si->mfn_list = vphysmap_start;
898 /* Write the phys->machine and machine->phys table entries. */
899 for ( pfn = 0; pfn < p->tot_pages; pfn++ )
900 {
901 mfn = pfn + (alloc_start>>PAGE_SHIFT);
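/*
 * In debug builds, pfns beyond the loaded image are given machine frames
 * from the top of the allocation, counting downwards (presumably to catch
 * guests that wrongly assume pfn == mfn).
 */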
902 #ifndef NDEBUG
903 #define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
904 if ( pfn > REVERSE_START )
905 mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START);
906 #endif
907 ((unsigned long *)vphysmap_start)[pfn] = mfn;
908 machine_to_phys_mapping[mfn] = pfn;
909 }
911 if ( initrd_len != 0 )
912 {
913 si->mod_start = vinitrd_start;
914 si->mod_len = initrd_len;
915 printk("Initrd len 0x%lx, start at 0x%08lx\n",
916 si->mod_len, si->mod_start);
917 }
919 dst = si->cmd_line;
920 if ( cmdline != NULL )
921 {
922 for ( i = 0; i < 255; i++ )
923 {
924 if ( cmdline[i] == '\0' )
925 break;
926 *dst++ = cmdline[i];
927 }
928 }
929 *dst = '\0';
931 /* Reinstate the caller's page tables. */
932 write_ptbase(&current->mm);
933 __sti();
935 /* Destroy low mappings - they were only for our convenience. */
936 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
937 if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE )
938 l2start[i] = mk_l2_pgentry(0);
939 zap_low_mappings(); /* Do the same for the idle page tables. */
941 /* DOM0 gets access to everything. */
942 physdev_init_dom0(p);
944 set_bit(DF_CONSTRUCTED, &p->flags);
946 new_thread(p, dsi.v_kernentry, vstack_end, vstartinfo_start);
948 #if 0 /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */
949 shadow_lock(&p->mm);
950 shadow_mode_enable(p, SHM_test);
951 shadow_unlock(&p->mm);
952 #endif
954 return 0;
955 }