debuggers.hg

annotate xen/arch/x86/domain.c @ 3668:d55d523078f7

bitkeeper revision 1.1159.212.77 (4202221693AFbvFZWeMHHIjQfbzTIQ)

More x86_64 progress. Many more gaps filled in. Next step is DOM0
construction.
Signed-off-by: keir.fraser@cl.cam.ac.uk
author kaf24@scramble.cl.cam.ac.uk
date Thu Feb 03 13:07:34 2005 +0000 (2005-02-03)
parents 060c1ea52343
children 677cb76cff18
rev   line source
kaf24@1787 1 /******************************************************************************
kaf24@1787 2 * arch/x86/domain.c
kaf24@1787 3 *
kaf24@1787 4 * x86-specific domain handling (e.g., register setup and context switching).
kaf24@1787 5 */
kaf24@1787 6
kaf24@1710 7 /*
kaf24@1710 8 * Copyright (C) 1995 Linus Torvalds
kaf24@1710 9 *
kaf24@1710 10 * Pentium III FXSR, SSE support
kaf24@1710 11 * Gareth Hughes <gareth@valinux.com>, May 2000
kaf24@1710 12 */
kaf24@1710 13
kaf24@1710 14 #include <xen/config.h>
kaf24@3372 15 #include <xen/init.h>
kaf24@1710 16 #include <xen/lib.h>
kaf24@1710 17 #include <xen/errno.h>
kaf24@1710 18 #include <xen/sched.h>
kaf24@1710 19 #include <xen/smp.h>
kaf24@1710 20 #include <xen/delay.h>
kaf24@1710 21 #include <xen/softirq.h>
ach61@2843 22 #include <asm/regs.h>
kaf24@1710 23 #include <asm/mc146818rtc.h>
kaf24@1710 24 #include <asm/system.h>
kaf24@1710 25 #include <asm/io.h>
kaf24@1710 26 #include <asm/processor.h>
kaf24@1710 27 #include <asm/desc.h>
kaf24@1710 28 #include <asm/i387.h>
kaf24@1710 29 #include <asm/mpspec.h>
kaf24@1710 30 #include <asm/ldt.h>
kaf24@1710 31 #include <xen/irq.h>
kaf24@1710 32 #include <xen/event.h>
kaf24@1787 33 #include <asm/shadow.h>
djm@1752 34 #include <xen/console.h>
djm@1752 35 #include <xen/elf.h>
iap10@3328 36 #include <asm/vmx.h>
iap10@3328 37 #include <asm/vmx_vmcs.h>
iap10@3328 38 #include <xen/kernel.h>
iap10@3328 39 #include <public/io/ioreq.h>
kaf24@3177 40 #include <xen/multicall.h>
djm@1752 41
kaf24@3372 42 /* opt_noreboot: If true, machine will need manual reset on error. */
kaf24@3372 43 static int opt_noreboot = 0;
kaf24@3372 44 boolean_param("noreboot", opt_noreboot);
kaf24@3372 45
djm@1752 46 #if !defined(CONFIG_X86_64BITMODE)
djm@1752 47 /* No ring-3 access in initial page tables. */
djm@1752 48 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
djm@1752 49 #else
djm@1752 50 /* Allow ring-3 access in long mode as guest cannot use ring 1. */
djm@1752 51 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
djm@1752 52 #endif
djm@1752 53 #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
djm@1752 54 #define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
djm@1752 55 #define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
djm@1752 56
djm@1752 57 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
djm@1752 58 #define round_pgdown(_p) ((_p)&PAGE_MASK)
djm@1752 59
kaf24@3310 60 static void default_idle(void)
kaf24@1710 61 {
kaf24@3310 62 __cli();
kaf24@3310 63 if ( !softirq_pending(smp_processor_id()) )
kaf24@3310 64 safe_halt();
kaf24@3310 65 else
kaf24@3310 66 __sti();
kaf24@1710 67 }
kaf24@1710 68
kaf24@3354 69 static __attribute_used__ void idle_loop(void)
kaf24@1710 70 {
kaf24@1710 71 int cpu = smp_processor_id();
kaf24@1710 72 for ( ; ; )
kaf24@1710 73 {
kaf24@1710 74 irq_stat[cpu].idle_timestamp = jiffies;
kaf24@1710 75 while ( !softirq_pending(cpu) )
kaf24@1710 76 default_idle();
kaf24@1710 77 do_softirq();
kaf24@1710 78 }
kaf24@1710 79 }
kaf24@1710 80
kaf24@1710 81 void startup_cpu_idle_loop(void)
kaf24@1710 82 {
kaf24@1710 83 /* Just some sanity to ensure that the scheduler is set up okay. */
cl349@3036 84 ASSERT(current->domain->id == IDLE_DOMAIN_ID);
cl349@2957 85 domain_unpause_by_systemcontroller(current->domain);
kaf24@1710 86 __enter_scheduler();
kaf24@1710 87
kaf24@1710 88 /*
kaf24@1710 89 * Declare CPU setup done to the boot processor.
kaf24@1710 90 * A memory barrier ensures the new state is visible before we continue.
kaf24@1710 91 */
kaf24@1710 92 smp_mb();
kaf24@1710 93 init_idle();
kaf24@1710 94
kaf24@3310 95 idle_loop();
kaf24@1710 96 }
kaf24@1710 97
kaf24@1710 98 static long no_idt[2];
kaf24@1710 99 static int reboot_mode;
kaf24@1710 100 int reboot_thru_bios = 0;
kaf24@1710 101
kaf24@1710 102 #ifdef CONFIG_SMP
kaf24@1710 103 int reboot_smp = 0;
kaf24@1710 104 static int reboot_cpu = -1;
kaf24@1710 105 /* shamelessly grabbed from lib/vsprintf.c for readability */
kaf24@1710 106 #define is_digit(c) ((c) >= '0' && (c) <= '9')
kaf24@1710 107 #endif
kaf24@1710 108
kaf24@1710 109
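/* Wait for the keyboard controller's input buffer to drain: poll the 8042
status port (0x64) until bit 1 is clear, so that the reset command written
in machine_restart() will be accepted. */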
kaf24@1710 110 static inline void kb_wait(void)
kaf24@1710 111 {
kaf24@1710 112 int i;
kaf24@1710 113
kaf24@1710 114 for (i=0; i<0x10000; i++)
kaf24@1710 115 if ((inb_p(0x64) & 0x02) == 0)
kaf24@1710 116 break;
kaf24@1710 117 }
kaf24@1710 118
kaf24@1710 119
kaf24@1710 120 void machine_restart(char * __unused)
kaf24@1710 121 {
kaf24@1710 122 #ifdef CONFIG_SMP
kaf24@1710 123 int cpuid;
kaf24@1710 124 #endif
kaf24@1710 125
kaf24@1710 126 if ( opt_noreboot )
kaf24@1710 127 {
kaf24@1710 128 printk("Reboot disabled on cmdline: manual reset required\n");
kaf24@1710 129 for ( ; ; ) __asm__ __volatile__ ("hlt");
kaf24@1710 130 }
kaf24@1710 131
kaf24@1710 132 #ifdef CONFIG_SMP
kaf24@1710 133 cpuid = GET_APIC_ID(apic_read(APIC_ID));
kaf24@1710 134
kaf24@1710 135 /* KAF: Need interrupts enabled for safe IPI. */
kaf24@1710 136 __sti();
kaf24@1710 137
kaf24@1710 138 if (reboot_smp) {
kaf24@1710 139
kaf24@1710 140 /* check to see if reboot_cpu is valid;
kaf24@1710 141 if it's not, default to the BSP */
kaf24@1710 142 if ((reboot_cpu == -1) ||
kaf24@1710 143 (reboot_cpu > (NR_CPUS -1)) ||
kaf24@1710 144 !(phys_cpu_present_map & (1<<cpuid)))
kaf24@1710 145 reboot_cpu = boot_cpu_physical_apicid;
kaf24@1710 146
kaf24@1710 147 reboot_smp = 0; /* use this as a flag to only go through this once */
kaf24@1710 148 /* re-run this function on the other CPUs:
kaf24@1710 149 they will fall through this section since we have
kaf24@1710 150 cleared reboot_smp, and do the reboot if it is the
kaf24@1710 151 correct CPU, otherwise they halt. */
kaf24@1710 152 if (reboot_cpu != cpuid)
kaf24@1710 153 smp_call_function((void *)machine_restart , NULL, 1, 0);
kaf24@1710 154 }
kaf24@1710 155
kaf24@1710 156 /* if reboot_cpu is still -1, then we want a traditional reboot,
kaf24@1710 157 and if we are not running on the reboot_cpu, halt */
kaf24@1710 158 if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
kaf24@1710 159 for (;;)
kaf24@1710 160 __asm__ __volatile__ ("hlt");
kaf24@1710 161 }
kaf24@1710 162 /*
kaf24@1710 163 * Stop all CPUs and turn off local APICs and the IO-APIC, so
kaf24@1710 164 * other OSs see a clean IRQ state.
kaf24@1710 165 */
kaf24@1710 166 smp_send_stop();
kaf24@1710 167 disable_IO_APIC();
kaf24@1710 168 #endif
iap10@3328 169 #ifdef CONFIG_VMX
iap10@3328 170 stop_vmx();
iap10@3328 171 #endif
kaf24@1710 172
kaf24@1710 173 if(!reboot_thru_bios) {
kaf24@1710 174 /* rebooting needs to touch the BIOS reset-flag word in the page at absolute addr 0 */
kaf24@1710 175 *((unsigned short *)__va(0x472)) = reboot_mode;
kaf24@1710 176 for (;;) {
kaf24@1710 177 int i;
kaf24@1710 178 for (i=0; i<100; i++) {
kaf24@1710 179 kb_wait();
kaf24@1710 180 udelay(50);
kaf24@1710 181 outb(0xfe,0x64); /* pulse reset low */
kaf24@1710 182 udelay(50);
kaf24@1710 183 }
kaf24@1710 184 /* That didn't work - force a triple fault.. */
kaf24@1710 185 __asm__ __volatile__("lidt %0": "=m" (no_idt));
kaf24@1710 186 __asm__ __volatile__("int3");
kaf24@1710 187 }
kaf24@1710 188 }
kaf24@1710 189
kaf24@1710 190 panic("Need to reinclude BIOS reboot code\n");
kaf24@1710 191 }
kaf24@1710 192
kaf24@1849 193
kaf24@1849 194 void __attribute__((noreturn)) __machine_halt(void *unused)
kaf24@1849 195 {
kaf24@1849 196 for ( ; ; )
kaf24@1849 197 __asm__ __volatile__ ( "cli; hlt" );
kaf24@1849 198 }
kaf24@1849 199
kaf24@1710 200 void machine_halt(void)
kaf24@1710 201 {
kaf24@1849 202 smp_call_function(__machine_halt, NULL, 1, 1);
kaf24@1849 203 __machine_halt(NULL);
kaf24@1710 204 }
kaf24@1710 205
kaf24@3515 206 void dump_pageframe_info(struct domain *d)
kaf24@3515 207 {
kaf24@3515 208 struct pfn_info *page;
kaf24@3515 209
kaf24@3515 210 if ( d->tot_pages < 10 )
kaf24@3515 211 {
kaf24@3568 212 list_for_each_entry ( page, &d->page_list, list )
kaf24@3515 213 {
kaf24@3515 214 printk("Page %08x: caf=%08x, taf=%08x\n",
kaf24@3515 215 page_to_phys(page), page->count_info,
kaf24@3515 216 page->u.inuse.type_info);
kaf24@3515 217 }
kaf24@3515 218 }
kaf24@3515 219
kaf24@3515 220 page = virt_to_page(d->shared_info);
kaf24@3515 221 printk("Shared_info@%08x: caf=%08x, taf=%08x\n",
kaf24@3515 222 page_to_phys(page), page->count_info,
kaf24@3515 223 page->u.inuse.type_info);
kaf24@3515 224 }
kaf24@3515 225
kaf24@3515 226 struct domain *arch_alloc_domain_struct(void)
kaf24@3515 227 {
iap10@3651 228 return xmalloc(struct domain);
kaf24@3515 229 }
kaf24@3515 230
kaf24@3515 231 void arch_free_domain_struct(struct domain *d)
kaf24@3515 232 {
iap10@3651 233 xfree(d);
kaf24@3515 234 }
kaf24@3515 235
kaf24@3517 236 struct exec_domain *arch_alloc_exec_domain_struct(void)
kaf24@3517 237 {
iap10@3651 238 return xmalloc(struct exec_domain);
kaf24@3517 239 }
kaf24@3517 240
kaf24@3517 241 void arch_free_exec_domain_struct(struct exec_domain *ed)
kaf24@3517 242 {
iap10@3651 243 xfree(ed);
kaf24@3517 244 }
kaf24@3517 245
kaf24@1974 246 void free_perdomain_pt(struct domain *d)
kaf24@1974 247 {
cl349@3036 248 free_xenheap_page((unsigned long)d->mm_perdomain_pt);
kaf24@1974 249 }
kaf24@1974 250
cl349@3319 251 static void continue_idle_task(struct exec_domain *ed)
kaf24@3310 252 {
kaf24@3310 253 reset_stack_and_jump(idle_loop);
kaf24@3310 254 }
kaf24@3310 255
cl349@3319 256 static void continue_nonidle_task(struct exec_domain *ed)
kaf24@3310 257 {
kaf24@3310 258 reset_stack_and_jump(ret_from_intr);
kaf24@3310 259 }
kaf24@3310 260
cl349@3319 261 void arch_do_createdomain(struct exec_domain *ed)
djm@1736 262 {
cl349@2959 263 struct domain *d = ed->domain;
kaf24@3668 264
cl349@3319 265 SET_DEFAULT_FAST_TRAP(&ed->thread);
kaf24@3310 266
kaf24@3310 267 if ( d->id == IDLE_DOMAIN_ID )
kaf24@3310 268 {
cl349@3319 269 ed->thread.schedule_tail = continue_idle_task;
kaf24@3310 270 }
kaf24@3310 271 else
kaf24@3310 272 {
cl349@3319 273 ed->thread.schedule_tail = continue_nonidle_task;
djm@1736 274
kaf24@3310 275 d->shared_info = (void *)alloc_xenheap_page();
kaf24@3310 276 memset(d->shared_info, 0, PAGE_SIZE);
cl349@3318 277 ed->vcpu_info = &d->shared_info->vcpu_data[ed->eid];
kaf24@3310 278 SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
kaf24@3310 279 machine_to_phys_mapping[virt_to_phys(d->shared_info) >>
kaf24@3310 280 PAGE_SHIFT] = INVALID_P2M_ENTRY;
kaf24@3310 281
cl349@3318 282 d->mm_perdomain_pt = (l1_pgentry_t *)alloc_xenheap_page();
cl349@3318 283 memset(d->mm_perdomain_pt, 0, PAGE_SIZE);
cl349@3318 284 machine_to_phys_mapping[virt_to_phys(d->mm_perdomain_pt) >>
kaf24@3310 285 PAGE_SHIFT] = INVALID_P2M_ENTRY;
cl349@3324 286 ed->mm.perdomain_ptes = d->mm_perdomain_pt;
kaf24@3310 287 }
djm@1736 288 }
djm@1736 289
iap10@3328 290 #ifdef CONFIG_VMX
iap10@3605 291 void arch_vmx_do_resume(struct exec_domain *ed)
iap10@3328 292 {
iap10@3605 293 u64 vmcs_phys_ptr = (u64) virt_to_phys(ed->thread.arch_vmx.vmcs);
iap10@3605 294
iap10@3605 295 load_vmcs(&ed->thread.arch_vmx, vmcs_phys_ptr);
iap10@3605 296 vmx_do_resume(ed);
iap10@3328 297 reset_stack_and_jump(vmx_asm_do_resume);
iap10@3328 298 }
iap10@3328 299
iap10@3605 300 void arch_vmx_do_launch(struct exec_domain *ed)
iap10@3328 301 {
iap10@3605 302 u64 vmcs_phys_ptr = (u64) virt_to_phys(ed->thread.arch_vmx.vmcs);
iap10@3605 303
iap10@3605 304 load_vmcs(&ed->thread.arch_vmx, vmcs_phys_ptr);
iap10@3605 305 vmx_do_launch(ed);
iap10@3328 306 reset_stack_and_jump(vmx_asm_do_launch);
iap10@3328 307 }
iap10@3328 308
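/* Build the monitor page table used while running a VMX guest: allocate a
fresh L2 page, copy in the hypervisor mappings from idle_pg_table, map the
per-domain page table, and put the domain into full 32-bit shadow mode. */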
iap10@3328 309 static void monitor_mk_pagetable(struct exec_domain *ed)
iap10@3328 310 {
iap10@3328 311 unsigned long mpfn;
iap10@3328 312 l2_pgentry_t *mpl2e;
iap10@3328 313 struct pfn_info *mpfn_info;
iap10@3328 314 struct mm_struct *m = &ed->mm;
iap10@3328 315 struct domain *d = ed->domain;
iap10@3328 316
iap10@3328 317 mpfn_info = alloc_domheap_page(NULL);
iap10@3328 318 ASSERT( mpfn_info );
iap10@3328 319
iap10@3328 320 mpfn = (unsigned long) (mpfn_info - frame_table);
iap10@3605 321 mpl2e = (l2_pgentry_t *) map_domain_mem(mpfn << L1_PAGETABLE_SHIFT);
iap10@3328 322 memset(mpl2e, 0, PAGE_SIZE);
iap10@3328 323
iap10@3328 324 memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
iap10@3328 325 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
iap10@3328 326 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
iap10@3328 327
iap10@3605 328 m->monitor_table = mk_pagetable(mpfn << L1_PAGETABLE_SHIFT);
iap10@3328 329 m->shadow_mode = SHM_full_32;
iap10@3328 330
iap10@3328 331 mpl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
iap10@3328 332 mk_l2_pgentry((__pa(d->mm_perdomain_pt) & PAGE_MASK)
iap10@3328 333 | __PAGE_HYPERVISOR);
iap10@3328 334
iap10@3328 335 unmap_domain_mem(mpl2e);
iap10@3328 336 }
iap10@3328 337
iap10@3605 338 /*
iap10@3605 339 * Free the pages for monitor_table and guest_pl2e_cache
iap10@3605 340 */
iap10@3605 341 static void monitor_rm_pagetable(struct exec_domain *ed)
iap10@3605 342 {
iap10@3605 343 struct mm_struct *m = &ed->mm;
iap10@3605 344 l2_pgentry_t *mpl2e;
iap10@3605 345 unsigned long mpfn;
iap10@3605 346
iap10@3605 347 mpl2e = (l2_pgentry_t *) map_domain_mem(pagetable_val(m->monitor_table));
iap10@3605 348 /*
iap10@3605 349 * First get the pfn for guest_pl2e_cache by looking at monitor_table
iap10@3605 350 */
iap10@3605 351 mpfn = l2_pgentry_val(mpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])
iap10@3605 352 >> PAGE_SHIFT;
iap10@3605 353
iap10@3605 354 free_domheap_page(&frame_table[mpfn]);
iap10@3605 355 unmap_domain_mem(mpl2e);
iap10@3605 356
iap10@3605 357 /*
iap10@3605 358 * Then free monitor_table.
iap10@3605 359 */
iap10@3605 360 mpfn = (pagetable_val(m->monitor_table)) >> PAGE_SHIFT;
iap10@3605 361 free_domheap_page(&frame_table[mpfn]);
iap10@3605 362
iap10@3605 363 m->monitor_table = mk_pagetable(0);
iap10@3605 364 }
iap10@3605 365
iap10@3605 366 static int vmx_final_setup_guestos(struct exec_domain *ed,
iap10@3328 367 full_execution_context_t *full_context)
iap10@3328 368 {
iap10@3328 369 int error;
iap10@3328 370 execution_context_t *context;
iap10@3328 371 struct vmcs_struct *vmcs;
iap10@3328 372
iap10@3328 373 context = &full_context->cpu_ctxt;
iap10@3328 374
iap10@3328 375 /*
iap10@3328 376 * Create a new VMCS
iap10@3328 377 */
iap10@3328 378 if (!(vmcs = alloc_vmcs())) {
iap10@3328 379 printk("Failed to create a new VMCS\n");
iap10@3328 380 return -ENOMEM;
iap10@3328 381 }
iap10@3328 382
iap10@3605 383 memset(&ed->thread.arch_vmx, 0, sizeof (struct arch_vmx_struct));
iap10@3328 384
iap10@3605 385 ed->thread.arch_vmx.vmcs = vmcs;
iap10@3605 386 error = construct_vmcs(&ed->thread.arch_vmx, context, full_context, VMCS_USE_HOST_ENV);
iap10@3328 387 if (error < 0) {
iap10@3328 388 printk("Failed to construct a new VMCS\n");
iap10@3328 389 goto out;
iap10@3328 390 }
iap10@3328 391
iap10@3605 392 monitor_mk_pagetable(ed);
iap10@3605 393 ed->thread.schedule_tail = arch_vmx_do_launch;
iap10@3605 394 clear_bit(VMX_CPU_STATE_PG_ENABLED, &ed->thread.arch_vmx.cpu_state);
iap10@3328 395
iap10@3605 396 #if defined (__i386)
iap10@3605 397 ed->thread.arch_vmx.vmx_platform.real_mode_data =
iap10@3328 398 (unsigned long *) context->esi;
iap10@3605 399 #endif
iap10@3328 400
iap10@3605 401 if (ed == ed->domain->exec_domain[0]) {
iap10@3605 402 /*
iap10@3605 403 * Required to do this once per domain
iap10@3605 404 */
iap10@3605 405 memset(&ed->domain->shared_info->evtchn_mask[0], 0xff,
iap10@3605 406 sizeof(ed->domain->shared_info->evtchn_mask));
iap10@3605 407 clear_bit(IOPACKET_PORT, &ed->domain->shared_info->evtchn_mask[0]);
iap10@3605 408 }
iap10@3328 409
iap10@3328 410 return 0;
iap10@3328 411
iap10@3328 412 out:
iap10@3328 413 free_vmcs(vmcs);
iap10@3605 414 ed->thread.arch_vmx.vmcs = 0;
iap10@3328 415 return error;
iap10@3328 416 }
iap10@3328 417 #endif
iap10@3328 418
cl349@2957 419 int arch_final_setup_guestos(struct exec_domain *d, full_execution_context_t *c)
djm@1736 420 {
djm@1736 421 unsigned long phys_basetab;
kaf24@2503 422 int i, rc;
djm@1736 423
cl349@2957 424 clear_bit(EDF_DONEFPUINIT, &d->ed_flags);
djm@1736 425 if ( c->flags & ECF_I387_VALID )
cl349@2957 426 set_bit(EDF_DONEFPUINIT, &d->ed_flags);
kaf24@2503 427
kaf24@2722 428 memcpy(&d->thread.user_ctxt,
djm@1736 429 &c->cpu_ctxt,
kaf24@2722 430 sizeof(d->thread.user_ctxt));
kaf24@2722 431
cl349@3221 432 /* Clear IOPL for unprivileged domains. */
cl349@3221 433 if (!IS_PRIV(d->domain))
cl349@3221 434 d->thread.user_ctxt.eflags &= 0xffffcfff;
cl349@3221 435
kaf24@2722 436 /*
kaf24@2722 437 * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
kaf24@2722 438 * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
kaf24@2722 439 * If SS RPL or DPL differs from CS RPL then we'll #GP.
kaf24@2722 440 */
iap10@3427 441 if (!(c->flags & ECF_VMX_GUEST))
iap10@3427 442 if ( ((d->thread.user_ctxt.cs & 3) == 0) ||
iap10@3427 443 ((d->thread.user_ctxt.ss & 3) == 0) )
iap10@3427 444 return -EINVAL;
kaf24@2503 445
kaf24@2503 446 memcpy(&d->thread.i387,
djm@1736 447 &c->fpu_ctxt,
kaf24@2503 448 sizeof(d->thread.i387));
kaf24@2503 449
kaf24@2503 450 memcpy(d->thread.traps,
djm@1736 451 &c->trap_ctxt,
kaf24@2503 452 sizeof(d->thread.traps));
kaf24@2503 453
kaf24@2503 454 if ( (rc = (int)set_fast_trap(d, c->fast_trap_idx)) != 0 )
kaf24@2503 455 return rc;
kaf24@2503 456
kaf24@2503 457 d->mm.ldt_base = c->ldt_base;
kaf24@2503 458 d->mm.ldt_ents = c->ldt_ents;
kaf24@2503 459
kaf24@2503 460 d->thread.guestos_ss = c->guestos_ss;
kaf24@2503 461 d->thread.guestos_sp = c->guestos_esp;
kaf24@2503 462
djm@1736 463 for ( i = 0; i < 8; i++ )
kaf24@2503 464 (void)set_debugreg(d, i, c->debugreg[i]);
kaf24@2503 465
kaf24@3081 466 d->thread.event_selector = c->event_callback_cs;
kaf24@3081 467 d->thread.event_address = c->event_callback_eip;
kaf24@3081 468 d->thread.failsafe_selector = c->failsafe_callback_cs;
kaf24@3081 469 d->thread.failsafe_address = c->failsafe_callback_eip;
djm@1736 470
djm@1736 471 phys_basetab = c->pt_base;
kaf24@2503 472 d->mm.pagetable = mk_pagetable(phys_basetab);
cl349@2957 473 if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d->domain,
kaf24@2503 474 PGT_base_page_table) )
kaf24@2503 475 return -EINVAL;
kaf24@2503 476
kaf24@2503 477 /* Failure to set GDT is harmless. */
kaf24@2503 478 SET_GDT_ENTRIES(d, DEFAULT_GDT_ENTRIES);
kaf24@2503 479 SET_GDT_ADDRESS(d, DEFAULT_GDT_ADDRESS);
kaf24@2503 480 if ( c->gdt_ents != 0 )
kaf24@2503 481 {
kaf24@2503 482 if ( (rc = (int)set_gdt(d, c->gdt_frames, c->gdt_ents)) != 0 )
kaf24@2503 483 {
kaf24@2503 484 put_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT]);
kaf24@2503 485 return rc;
kaf24@2503 486 }
kaf24@2503 487 }
kaf24@2503 488
kaf24@3613 489 #ifdef CONFIG_VMX
iap10@3328 490 if (c->flags & ECF_VMX_GUEST)
iap10@3328 491 return vmx_final_setup_guestos(d, c);
kaf24@3613 492 #endif
iap10@3328 493
kaf24@2503 494 return 0;
djm@1736 495 }
djm@1736 496
cl349@2957 497 void new_thread(struct exec_domain *d,
kaf24@1710 498 unsigned long start_pc,
kaf24@1710 499 unsigned long start_stack,
kaf24@1710 500 unsigned long start_info)
kaf24@1710 501 {
kaf24@2722 502 execution_context_t *ec = &d->thread.user_ctxt;
kaf24@1710 503
kaf24@1710 504 /*
kaf24@1710 505 * Initial register values:
kaf24@1710 506 * DS,ES,FS,GS = FLAT_GUESTOS_DS
kaf24@1710 507 * CS:EIP = FLAT_GUESTOS_CS:start_pc
kaf24@1710 508 * SS:ESP = FLAT_GUESTOS_DS:start_stack
kaf24@1710 509 * ESI = start_info
kaf24@1710 510 * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
kaf24@1710 511 */
kaf24@3668 512 ec->ds = ec->es = ec->fs = ec->gs = ec->ss = FLAT_GUESTOS_DS;
kaf24@3668 513 ec->cs = FLAT_GUESTOS_CS;
kaf24@1710 514 ec->eip = start_pc;
kaf24@1710 515 ec->esp = start_stack;
kaf24@1710 516 ec->esi = start_info;
kaf24@1710 517
kaf24@1710 518 __save_flags(ec->eflags);
kaf24@1710 519 ec->eflags |= X86_EFLAGS_IF;
kaf24@1710 520 }
kaf24@1710 521
kaf24@1710 522
kaf24@1710 523 /*
kaf24@1710 524 * This special macro can be used to load a debugging register
kaf24@1710 525 */
kaf24@1710 526 #define loaddebug(thread,register) \
kaf24@3668 527 __asm__("mov %0,%%db" #register \
kaf24@1710 528 : /* no output */ \
kaf24@1710 529 :"r" (thread->debugreg[register]))
kaf24@1710 530
cl349@2957 531 void switch_to(struct exec_domain *prev_p, struct exec_domain *next_p)
kaf24@1710 532 {
kaf24@1710 533 struct thread_struct *next = &next_p->thread;
kaf24@1710 534 struct tss_struct *tss = init_tss + smp_processor_id();
kaf24@1710 535 execution_context_t *stack_ec = get_execution_context();
kaf24@1710 536 int i;
kaf24@3668 537 #ifdef CONFIG_VMX
iap10@3328 538 unsigned long vmx_domain = next_p->thread.arch_vmx.flags;
kaf24@3668 539 #endif
iap10@3328 540
kaf24@1710 541 __cli();
kaf24@1710 542
kaf24@1710 543 /* Switch guest general-register state. */
cl349@2957 544 if ( !is_idle_task(prev_p->domain) )
kaf24@1710 545 {
kaf24@2722 546 memcpy(&prev_p->thread.user_ctxt,
kaf24@1710 547 stack_ec,
kaf24@1710 548 sizeof(*stack_ec));
kaf24@1710 549 unlazy_fpu(prev_p);
kaf24@1710 550 CLEAR_FAST_TRAP(&prev_p->thread);
kaf24@1710 551 }
kaf24@1710 552
cl349@2957 553 if ( !is_idle_task(next_p->domain) )
kaf24@1710 554 {
kaf24@1710 555 memcpy(stack_ec,
kaf24@2722 556 &next_p->thread.user_ctxt,
kaf24@1710 557 sizeof(*stack_ec));
kaf24@1710 558
kaf24@1710 559 /* Maybe switch the debug registers. */
kaf24@1710 560 if ( unlikely(next->debugreg[7]) )
kaf24@1710 561 {
kaf24@1710 562 loaddebug(next, 0);
kaf24@1710 563 loaddebug(next, 1);
kaf24@1710 564 loaddebug(next, 2);
kaf24@1710 565 loaddebug(next, 3);
kaf24@1710 566 /* no 4 and 5 */
kaf24@1710 567 loaddebug(next, 6);
kaf24@1710 568 loaddebug(next, 7);
kaf24@1710 569 }
kaf24@1710 570
kaf24@3668 571 #ifdef CONFIG_VMX
kaf24@3668 572 if ( vmx_domain )
kaf24@3668 573 {
iap10@3328 574 /* Switch page tables. */
iap10@3328 575 write_ptbase(&next_p->mm);
iap10@3328 576
iap10@3328 577 set_current(next_p);
iap10@3328 578 /* Switch GDT and LDT. */
iap10@3328 579 __asm__ __volatile__ ("lgdt %0" : "=m" (*next_p->mm.gdt));
iap10@3328 580
iap10@3328 581 __sti();
iap10@3328 582 return;
kaf24@3668 583 }
kaf24@3668 584 #endif
iap10@3328 585
iap10@3328 586 SET_FAST_TRAP(&next_p->thread);
iap10@3328 587
kaf24@3668 588 #ifdef __i386__
iap10@3328 589 /* Switch the guest OS ring-1 stack. */
iap10@3328 590 tss->esp1 = next->guestos_sp;
iap10@3328 591 tss->ss1 = next->guestos_ss;
kaf24@3668 592 #endif
iap10@3328 593
kaf24@1710 594 /* Switch page tables. */
kaf24@1710 595 write_ptbase(&next_p->mm);
kaf24@1710 596 }
kaf24@1710 597
kaf24@3088 598 if ( unlikely(prev_p->thread.io_bitmap != NULL) )
kaf24@1710 599 {
kaf24@3088 600 for ( i = 0; i < sizeof(prev_p->thread.io_bitmap_sel) * 8; i++ )
kaf24@3088 601 if ( !test_bit(i, &prev_p->thread.io_bitmap_sel) )
kaf24@3088 602 memset(&tss->io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
kaf24@3088 603 ~0U, IOBMP_BYTES_PER_SELBIT);
kaf24@3088 604 tss->bitmap = IOBMP_INVALID_OFFSET;
kaf24@3088 605 }
kaf24@1710 606
kaf24@3088 607 if ( unlikely(next_p->thread.io_bitmap != NULL) )
kaf24@3088 608 {
kaf24@3088 609 for ( i = 0; i < sizeof(next_p->thread.io_bitmap_sel) * 8; i++ )
kaf24@3088 610 if ( !test_bit(i, &next_p->thread.io_bitmap_sel) )
kaf24@3088 611 memcpy(&tss->io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
kaf24@3088 612 &next_p->thread.io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
kaf24@3088 613 IOBMP_BYTES_PER_SELBIT);
kaf24@3088 614 tss->bitmap = IOBMP_OFFSET;
kaf24@1710 615 }
kaf24@1710 616
kaf24@1710 617 set_current(next_p);
kaf24@1710 618
kaf24@1710 619 /* Switch GDT and LDT. */
kaf24@1710 620 __asm__ __volatile__ ("lgdt %0" : "=m" (*next_p->mm.gdt));
kaf24@1710 621 load_LDT(next_p);
kaf24@1710 622
kaf24@1710 623 __sti();
kaf24@1710 624 }
kaf24@1710 625
kaf24@1710 626
kaf24@1710 627 /* XXX Currently the 'domain' field is ignored! XXX */
kaf24@1710 628 long do_iopl(domid_t domain, unsigned int new_io_pl)
kaf24@1710 629 {
kaf24@1710 630 execution_context_t *ec = get_execution_context();
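/* EFLAGS bits 12-13 hold the IOPL field; the 0xffffcfff mask clears them. */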
kaf24@1710 631 ec->eflags = (ec->eflags & 0xffffcfff) | ((new_io_pl&3) << 12);
kaf24@1710 632 return 0;
kaf24@1710 633 }
kaf24@1710 634
kaf24@3187 635 unsigned long hypercall_create_continuation(
kaf24@3187 636 unsigned int op, unsigned int nr_args, ...)
kaf24@3129 637 {
kaf24@3177 638 struct mc_state *mcs = &mc_state[smp_processor_id()];
kaf24@3177 639 execution_context_t *ec;
kaf24@3177 640 unsigned long *preg;
kaf24@3129 641 unsigned int i;
kaf24@3129 642 va_list args;
kaf24@3129 643
kaf24@3177 644 va_start(args, nr_args);
kaf24@3177 645
kaf24@3177 646 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
kaf24@3177 647 {
kaf24@3177 648 __set_bit(_MCSF_call_preempted, &mcs->flags);
kaf24@3129 649
kaf24@3177 650 for ( i = 0; i < nr_args; i++ )
kaf24@3177 651 mcs->call.args[i] = va_arg(args, unsigned long);
kaf24@3177 652 }
kaf24@3177 653 else
kaf24@3177 654 {
kaf24@3177 655 ec = get_execution_context();
kaf24@3314 656 #if defined(__i386__)
kaf24@3177 657 ec->eax = op;
kaf24@3177 658 ec->eip -= 2; /* re-execute 'int 0x82' */
kaf24@3177 659
kaf24@3177 660 for ( i = 0, preg = &ec->ebx; i < nr_args; i++, preg++ )
kaf24@3177 661 *preg = va_arg(args, unsigned long);
kaf24@3314 662 #else
kaf24@3314 663 preg = NULL; /* XXX x86/64 */
kaf24@3314 664 #endif
kaf24@3177 665 }
kaf24@3177 666
kaf24@3129 667 va_end(args);
kaf24@3187 668
kaf24@3187 669 return op;
kaf24@3129 670 }
kaf24@3129 671
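/*
 * Illustrative usage sketch, not part of this file: a long-running hypercall
 * can return hypercall_create_continuation() at a preemption point, so the
 * guest re-issues the call with updated arguments once pending work is done.
 * The op constant __HYPERVISOR_example_batch and the helper
 * process_one_request() are hypothetical; softirq_pending() is the same
 * check used by idle_loop() above.
 *
 *   long do_example_batch(unsigned long *reqs, unsigned int count)
 *   {
 *       unsigned int i;
 *       for ( i = 0; i < count; i++ )
 *       {
 *           if ( softirq_pending(smp_processor_id()) )
 *               return hypercall_create_continuation(
 *                   __HYPERVISOR_example_batch, 2,
 *                   (unsigned long)&reqs[i], (unsigned long)(count - i));
 *           process_one_request(&reqs[i]);
 *       }
 *       return 0;
 *   }
 */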
kaf24@2466 672 static void relinquish_list(struct domain *d, struct list_head *list)
djm@1752 673 {
kaf24@2466 674 struct list_head *ent;
djm@1752 675 struct pfn_info *page;
djm@1752 676 unsigned long x, y;
djm@1752 677
kaf24@2466 678 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
kaf24@2466 679 spin_lock_recursive(&d->page_alloc_lock);
kaf24@2466 680
kaf24@2466 681 ent = list->next;
kaf24@2466 682 while ( ent != list )
kaf24@2466 683 {
kaf24@2466 684 page = list_entry(ent, struct pfn_info, list);
kaf24@2466 685
kaf24@2467 686 /* Grab a reference to the page so it won't disappear from under us. */
kaf24@2467 687 if ( unlikely(!get_page(page, d)) )
kaf24@2466 688 {
kaf24@2467 689 /* Couldn't get a reference -- someone is freeing this page. */
kaf24@2467 690 ent = ent->next;
kaf24@2466 691 continue;
kaf24@2466 692 }
kaf24@2466 693
kaf24@2467 694 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
kaf24@2467 695 put_page_and_type(page);
kaf24@2467 696
kaf24@2466 697 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
kaf24@2466 698 put_page(page);
kaf24@2466 699
kaf24@2466 700 /*
kaf24@2466 701 * Forcibly invalidate base page tables at this point to break circular
kaf24@2466 702 * 'linear page table' references. This is okay because MMU structures
kaf24@2466 703 * are not shared across domains and this domain is now dead. Thus base
kaf24@2466 704 * tables are not in use so a non-zero count means circular reference.
kaf24@2466 705 */
kaf24@2466 706 y = page->u.inuse.type_info;
kaf24@2467 707 for ( ; ; )
kaf24@2467 708 {
kaf24@2466 709 x = y;
kaf24@2466 710 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
kaf24@2466 711 (PGT_base_page_table|PGT_validated)) )
kaf24@2466 712 break;
kaf24@2467 713
kaf24@2466 714 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
kaf24@2466 715 if ( likely(y == x) )
kaf24@2466 716 {
kaf24@2466 717 free_page_type(page, PGT_base_page_table);
kaf24@2467 718 break;
kaf24@2466 719 }
kaf24@2466 720 }
kaf24@2467 721
kaf24@2467 722 /* Follow the list chain and /then/ potentially free the page. */
kaf24@2467 723 ent = ent->next;
kaf24@2467 724 put_page(page);
kaf24@2466 725 }
kaf24@2466 726
kaf24@2466 727 spin_unlock_recursive(&d->page_alloc_lock);
kaf24@2466 728 }
kaf24@2466 729
kaf24@3613 730 #ifdef CONFIG_VMX
iap10@3605 731 static void vmx_domain_relinquish_memory(struct exec_domain *ed)
iap10@3605 732 {
iap10@3605 733 struct domain *d = ed->domain;
iap10@3605 734
iap10@3605 735 /*
iap10@3605 736 * Free VMCS
iap10@3605 737 */
iap10@3605 738 ASSERT(ed->thread.arch_vmx.vmcs);
iap10@3605 739 free_vmcs(ed->thread.arch_vmx.vmcs);
iap10@3605 740 ed->thread.arch_vmx.vmcs = 0;
iap10@3605 741
iap10@3605 742 monitor_rm_pagetable(ed);
iap10@3605 743
iap10@3605 744 if (ed == d->exec_domain[0]) {
iap10@3605 745 int i;
iap10@3605 746 unsigned long pfn;
iap10@3605 747
iap10@3605 748 for (i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++) {
iap10@3605 749 unsigned long l1e;
iap10@3605 750
iap10@3605 751 l1e = l1_pgentry_val(d->mm_perdomain_pt[i]);
iap10@3605 752 if (l1e & _PAGE_PRESENT) {
iap10@3605 753 pfn = l1e >> PAGE_SHIFT;
iap10@3605 754 free_domheap_page(&frame_table[pfn]);
iap10@3605 755 }
iap10@3605 756 }
iap10@3605 757 }
iap10@3605 758
iap10@3605 759 }
kaf24@3613 760 #endif
kaf24@2466 761
kaf24@2466 762 void domain_relinquish_memory(struct domain *d)
kaf24@2466 763 {
cl349@2961 764 struct exec_domain *ed;
cl349@2961 765
kaf24@1806 766 /* Ensure that no one is running over the dead domain's page tables. */
kaf24@1806 767 synchronise_pagetables(~0UL);
djm@1752 768
djm@1752 769 /* Exit shadow mode before deconstructing final guest page table. */
kaf24@1787 770 shadow_mode_disable(d);
djm@1752 771
djm@1752 772 /* Drop the in-use reference to the page-table base. */
kaf24@3613 773 for_each_exec_domain ( d, ed )
kaf24@3613 774 {
cl349@2961 775 if ( pagetable_val(ed->mm.pagetable) != 0 )
cl349@2961 776 put_page_and_type(&frame_table[pagetable_val(ed->mm.pagetable) >>
cl349@2961 777 PAGE_SHIFT]);
cl349@2961 778 }
djm@1752 779
kaf24@3613 780 #ifdef CONFIG_VMX
kaf24@3613 781 if ( VMX_DOMAIN(d->exec_domain[0]) )
kaf24@3613 782 for_each_exec_domain ( d, ed )
iap10@3605 783 vmx_domain_relinquish_memory(ed);
kaf24@3613 784 #endif
iap10@3605 785
kaf24@1787 786 /*
kaf24@1787 787 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
kaf24@1787 788 * it automatically gets squashed when the guest's mappings go away.
kaf24@1787 789 */
cl349@2964 790 for_each_exec_domain(d, ed)
cl349@2964 791 destroy_gdt(ed);
kaf24@1787 792
kaf24@2466 793 /* Relinquish every page of memory. */
kaf24@2466 794 relinquish_list(d, &d->xenpage_list);
kaf24@2466 795 relinquish_list(d, &d->page_list);
djm@1752 796 }
djm@1752 797
djm@1752 798
djm@1752 799 int construct_dom0(struct domain *p,
djm@1752 800 unsigned long alloc_start,
djm@1752 801 unsigned long alloc_end,
djm@1752 802 char *image_start, unsigned long image_len,
djm@1752 803 char *initrd_start, unsigned long initrd_len,
djm@1752 804 char *cmdline)
djm@1752 805 {
djm@1752 806 char *dst;
djm@1752 807 int i, rc;
djm@1752 808 unsigned long pfn, mfn;
djm@1752 809 unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT;
djm@1752 810 unsigned long nr_pt_pages;
djm@1752 811 unsigned long count;
djm@1752 812 l2_pgentry_t *l2tab, *l2start;
djm@1752 813 l1_pgentry_t *l1tab = NULL, *l1start = NULL;
djm@1752 814 struct pfn_info *page = NULL;
djm@1752 815 start_info_t *si;
cl349@2957 816 struct exec_domain *ed = p->exec_domain[0];
djm@1752 817
djm@1752 818 /*
djm@1752 819 * This fully describes the memory layout of the initial domain. All
djm@1752 820 * *_start addresses are page-aligned, except v_start (and v_end) which are
djm@1752 821 * superpage-aligned.
djm@1752 822 */
cl349@2486 823 struct domain_setup_info dsi;
djm@1752 824 unsigned long vinitrd_start;
djm@1752 825 unsigned long vinitrd_end;
djm@1752 826 unsigned long vphysmap_start;
djm@1752 827 unsigned long vphysmap_end;
djm@1752 828 unsigned long vstartinfo_start;
djm@1752 829 unsigned long vstartinfo_end;
djm@1752 830 unsigned long vstack_start;
djm@1752 831 unsigned long vstack_end;
djm@1752 832 unsigned long vpt_start;
djm@1752 833 unsigned long vpt_end;
djm@1752 834 unsigned long v_end;
djm@1752 835
djm@1752 836 /* Machine address of next candidate page-table page. */
djm@1752 837 unsigned long mpt_alloc;
djm@1752 838
djm@1752 839 extern void physdev_init_dom0(struct domain *);
djm@1752 840
djm@1752 841 /* Sanity! */
kaf24@2748 842 if ( p->id != 0 )
djm@1752 843 BUG();
cl349@2957 844 if ( test_bit(DF_CONSTRUCTED, &p->d_flags) )
djm@1752 845 BUG();
djm@1752 846
cl349@2486 847 memset(&dsi, 0, sizeof(struct domain_setup_info));
cl349@2486 848
djm@1752 849 printk("*** LOADING DOMAIN 0 ***\n");
djm@1752 850
djm@1752 851 /*
djm@1752 852 * This is all a bit grim. We've moved the modules to the "safe" physical
djm@1752 853 * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this
djm@1752 854 * routine we're going to copy it down into the region that's actually
djm@1752 855 * been allocated to domain 0. This is highly likely to be overlapping, so
djm@1752 856 * we use a forward copy.
djm@1752 857 *
djm@1752 858 * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with
djm@1752 859 * 4GB and lots of network/disk cards that allocate loads of buffers.
djm@1752 860 * We'll have to revisit this if we ever support PAE (64GB).
djm@1752 861 */
djm@1752 862
cl349@2486 863 rc = parseelfimage(image_start, image_len, &dsi);
djm@1752 864 if ( rc != 0 )
djm@1752 865 return rc;
djm@1752 866
cl349@2487 867 /* Set up domain options */
cl349@2487 868 if ( dsi.use_writable_pagetables )
cl349@2487 869 vm_assist(p, VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
cl349@2487 870
cl349@2486 871 if ( (dsi.v_start & (PAGE_SIZE-1)) != 0 )
kaf24@1818 872 {
kaf24@1818 873 printk("Initial guest OS must load to a page boundary.\n");
kaf24@1818 874 return -EINVAL;
kaf24@1818 875 }
kaf24@1818 876
djm@1752 877 /*
djm@1752 878 * Why do we need this? The number of page-table frames depends on the
djm@1752 879 * size of the bootstrap address space. But the size of the address space
djm@1752 880 * depends on the number of page-table frames (since each one is mapped
djm@1752 881 * read-only). We have a pair of simultaneous equations in two unknowns,
djm@1752 882 * which we solve by exhaustive search.
djm@1752 883 */
cl349@2513 884 vinitrd_start = round_pgup(dsi.v_kernend);
cl349@2513 885 vinitrd_end = vinitrd_start + initrd_len;
cl349@2513 886 vphysmap_start = round_pgup(vinitrd_end);
cl349@2513 887 vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long));
cl349@2513 888 vpt_start = round_pgup(vphysmap_end);
djm@1752 889 for ( nr_pt_pages = 2; ; nr_pt_pages++ )
djm@1752 890 {
djm@1752 891 vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
djm@1752 892 vstartinfo_start = vpt_end;
djm@1752 893 vstartinfo_end = vstartinfo_start + PAGE_SIZE;
djm@1752 894 vstack_start = vstartinfo_end;
djm@1752 895 vstack_end = vstack_start + PAGE_SIZE;
djm@1752 896 v_end = (vstack_end + (1<<22)-1) & ~((1<<22)-1);
djm@1752 897 if ( (v_end - vstack_end) < (512 << 10) )
djm@1752 898 v_end += 1 << 22; /* Add extra 4MB to get >= 512kB padding. */
cl349@2486 899 if ( (((v_end - dsi.v_start + ((1<<L2_PAGETABLE_SHIFT)-1)) >>
kaf24@1818 900 L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
djm@1752 901 break;
djm@1752 902 }
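/*
 * Worked example with illustrative numbers: suppose dsi.v_start is 4MB-aligned
 * and the kernel, initrd and phys-map together end 8MB above it, so that
 * vpt_start = v_start + 8MB.  Trying nr_pt_pages == 2 gives
 * vstack_end = v_start + 8MB + 16kB and hence v_end = v_start + 12MB; but
 * mapping 12MB needs one L2 page plus three L1 pages, i.e.
 * ((12MB + 4MB - 1) >> L2_PAGETABLE_SHIFT) + 1 = 4 frames, so the search
 * continues.  nr_pt_pages == 4 is the first value that satisfies the test.
 */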
djm@1752 903
djm@1752 904 printk("PHYSICAL MEMORY ARRANGEMENT:\n"
djm@1752 905 " Kernel image: %p->%p\n"
djm@1752 906 " Initrd image: %p->%p\n"
djm@1752 907 " Dom0 alloc.: %08lx->%08lx\n",
djm@1752 908 image_start, image_start + image_len,
djm@1752 909 initrd_start, initrd_start + initrd_len,
djm@1752 910 alloc_start, alloc_end);
djm@1752 911 printk("VIRTUAL MEMORY ARRANGEMENT:\n"
djm@1752 912 " Loaded kernel: %08lx->%08lx\n"
djm@1752 913 " Init. ramdisk: %08lx->%08lx\n"
djm@1752 914 " Phys-Mach map: %08lx->%08lx\n"
djm@1752 915 " Page tables: %08lx->%08lx\n"
djm@1752 916 " Start info: %08lx->%08lx\n"
djm@1752 917 " Boot stack: %08lx->%08lx\n"
djm@1752 918 " TOTAL: %08lx->%08lx\n",
cl349@2486 919 dsi.v_kernstart, dsi.v_kernend,
djm@1752 920 vinitrd_start, vinitrd_end,
djm@1752 921 vphysmap_start, vphysmap_end,
djm@1752 922 vpt_start, vpt_end,
djm@1752 923 vstartinfo_start, vstartinfo_end,
djm@1752 924 vstack_start, vstack_end,
cl349@2486 925 dsi.v_start, v_end);
cl349@2486 926 printk(" ENTRY ADDRESS: %08lx\n", dsi.v_kernentry);
djm@1752 927
cl349@2486 928 if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
kaf24@1818 929 {
kaf24@1818 930 printk("Initial guest OS requires too much space\n"
kaf24@1818 931 "(%luMB is greater than %luMB limit)\n",
cl349@2486 932 (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
kaf24@1818 933 return -ENOMEM;
kaf24@1818 934 }
kaf24@1818 935
djm@1752 936 /*
djm@1752 937 * Protect the lowest 1GB of memory. We use a temporary mapping there
djm@1752 938 * from which we copy the kernel and ramdisk images.
djm@1752 939 */
cl349@2486 940 if ( dsi.v_start < (1<<30) )
djm@1752 941 {
djm@1752 942 printk("Initial loading isn't allowed in the lowest 1GB of memory.\n");
djm@1752 943 return -EINVAL;
djm@1752 944 }
djm@1752 945
kaf24@2810 946 /* Paranoia: scrub DOM0's memory allocation. */
kaf24@2888 947 printk("Scrubbing DOM0 RAM: ");
kaf24@2888 948 dst = (char *)alloc_start;
kaf24@2888 949 while ( dst < (char *)alloc_end )
kaf24@2888 950 {
kaf24@2888 951 #define SCRUB_BYTES (100 * 1024 * 1024) /* 100MB */
kaf24@2888 952 printk(".");
kaf24@2888 953 touch_nmi_watchdog();
kaf24@2888 954 if ( ((char *)alloc_end - dst) > SCRUB_BYTES )
kaf24@2888 955 {
kaf24@2888 956 memset(dst, 0, SCRUB_BYTES);
kaf24@2888 957 dst += SCRUB_BYTES;
kaf24@2888 958 }
kaf24@2888 959 else
kaf24@2888 960 {
kaf24@2888 961 memset(dst, 0, (char *)alloc_end - dst);
kaf24@2888 962 break;
kaf24@2888 963 }
kaf24@2888 964 }
kaf24@2888 965 printk("done.\n");
kaf24@2810 966
djm@1752 967 /* Construct a frame-allocation list for the initial domain. */
djm@1752 968 for ( mfn = (alloc_start>>PAGE_SHIFT);
djm@1752 969 mfn < (alloc_end>>PAGE_SHIFT);
djm@1752 970 mfn++ )
djm@1752 971 {
djm@1752 972 page = &frame_table[mfn];
kaf24@2384 973 page->u.inuse.domain = p;
kaf24@2384 974 page->u.inuse.type_info = 0;
kaf24@2655 975 page->count_info = PGC_allocated | 1;
djm@1752 976 list_add_tail(&page->list, &p->page_list);
djm@1752 977 p->tot_pages++; p->max_pages++;
djm@1752 978 }
djm@1752 979
cl349@2486 980 mpt_alloc = (vpt_start - dsi.v_start) + alloc_start;
djm@1752 981
cl349@2957 982 SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES);
cl349@2957 983 SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS);
djm@1752 984
djm@1752 985 /*
djm@1752 986 * We're basically forcing default RPLs to 1, so that our "what privilege
djm@1752 987 * level are we returning to?" logic works.
djm@1752 988 */
cl349@3085 989 ed->thread.failsafe_selector = FLAT_GUESTOS_CS;
cl349@3085 990 ed->thread.event_selector = FLAT_GUESTOS_CS;
cl349@2957 991 ed->thread.guestos_ss = FLAT_GUESTOS_DS;
djm@1752 992 for ( i = 0; i < 256; i++ )
cl349@2957 993 ed->thread.traps[i].cs = FLAT_GUESTOS_CS;
djm@1752 994
djm@1752 995 /* WARNING: The new domain must have its 'processor' field filled in! */
djm@1752 996 l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
djm@1752 997 memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
djm@1752 998 l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
djm@1752 999 mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR);
djm@1752 1000 l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
cl349@3036 1001 mk_l2_pgentry(__pa(p->mm_perdomain_pt) | __PAGE_HYPERVISOR);
cl349@2957 1002 ed->mm.pagetable = mk_pagetable((unsigned long)l2start);
djm@1752 1003
cl349@2486 1004 l2tab += l2_table_offset(dsi.v_start);
djm@1752 1005 mfn = alloc_start >> PAGE_SHIFT;
cl349@2486 1006 for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
djm@1752 1007 {
djm@1752 1008 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
djm@1752 1009 {
djm@1752 1010 l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
djm@1752 1011 mpt_alloc += PAGE_SIZE;
djm@1752 1012 *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT);
djm@1752 1013 clear_page(l1tab);
cl349@1944 1014 if ( count == 0 )
cl349@2486 1015 l1tab += l1_table_offset(dsi.v_start);
djm@1752 1016 }
djm@1752 1017 *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);
djm@1752 1018
djm@1752 1019 page = &frame_table[mfn];
kaf24@2375 1020 if ( !get_page_and_type(page, p, PGT_writable_page) )
djm@1752 1021 BUG();
djm@1752 1022
djm@1752 1023 mfn++;
djm@1752 1024 }
djm@1752 1025
djm@1752 1026 /* Pages that are part of page tables must be read only. */
djm@1752 1027 l2tab = l2start + l2_table_offset(vpt_start);
djm@1752 1028 l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
djm@1752 1029 l1tab += l1_table_offset(vpt_start);
djm@1752 1030 l2tab++;
djm@1752 1031 for ( count = 0; count < nr_pt_pages; count++ )
djm@1752 1032 {
djm@1752 1033 *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
djm@1752 1034 page = &frame_table[l1_pgentry_to_pagenr(*l1tab)];
djm@1752 1035 if ( count == 0 )
djm@1752 1036 {
kaf24@1970 1037 page->u.inuse.type_info &= ~PGT_type_mask;
kaf24@1970 1038 page->u.inuse.type_info |= PGT_l2_page_table;
mafetter@2808 1039
mafetter@2808 1040 /*
mafetter@2808 1041 * No longer writable: decrement the type_count.
mafetter@2808 1042 * Installed as CR3: increment both the ref_count and type_count.
mafetter@2808 1043 * Net: just increment the ref_count.
mafetter@2808 1044 */
djm@1752 1045 get_page(page, p); /* an extra ref because of readable mapping */
mafetter@2808 1046
djm@1752 1047 /* Get another ref to L2 page so that it can be pinned. */
djm@1752 1048 if ( !get_page_and_type(page, p, PGT_l2_page_table) )
djm@1752 1049 BUG();
kaf24@2466 1050 set_bit(_PGT_pinned, &page->u.inuse.type_info);
djm@1752 1051 }
djm@1752 1052 else
djm@1752 1053 {
kaf24@1970 1054 page->u.inuse.type_info &= ~PGT_type_mask;
kaf24@1970 1055 page->u.inuse.type_info |= PGT_l1_page_table;
iap10@2458 1056 page->u.inuse.type_info |=
cl349@2486 1057 ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;
iap10@2458 1058
mafetter@2808 1059 /*
mafetter@2808 1060 * No longer writable: decrement the type_count.
mafetter@2808 1061 * This is an L1 page, installed in a validated L2 page:
mafetter@2808 1062 * increment both the ref_count and type_count.
mafetter@2808 1063 * Net: just increment the ref_count.
mafetter@2808 1064 */
djm@1752 1065 get_page(page, p); /* an extra ref because of readable mapping */
djm@1752 1066 }
djm@1752 1067 l1tab++;
djm@1752 1068 if( !((unsigned long)l1tab & (PAGE_SIZE - 1)) )
djm@1752 1069 l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
djm@1752 1070 }
djm@1752 1071
djm@1752 1072 /* Set up shared-info area. */
cl349@2960 1073 update_dom_time(p);
djm@1752 1074 p->shared_info->domain_time = 0;
djm@1752 1075 /* Mask all upcalls... */
djm@1752 1076 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
djm@1752 1077 p->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
cl349@3201 1078 p->shared_info->n_vcpu = smp_num_cpus;
djm@1752 1079
djm@1752 1080 /* Install the new page tables. */
djm@1752 1081 __cli();
cl349@2957 1082 write_ptbase(&ed->mm);
djm@1752 1083
djm@1752 1084 /* Copy the OS image. */
djm@1752 1085 (void)loadelfimage(image_start);
djm@1752 1086
djm@1752 1087 /* Copy the initial ramdisk. */
djm@1752 1088 if ( initrd_len != 0 )
djm@1752 1089 memcpy((void *)vinitrd_start, initrd_start, initrd_len);
djm@1752 1090
djm@1752 1091 /* Set up start info area. */
djm@1752 1092 si = (start_info_t *)vstartinfo_start;
djm@1752 1093 memset(si, 0, PAGE_SIZE);
djm@1752 1094 si->nr_pages = p->tot_pages;
djm@1752 1095 si->shared_info = virt_to_phys(p->shared_info);
djm@1752 1096 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
djm@1752 1097 si->pt_base = vpt_start;
djm@1752 1098 si->nr_pt_frames = nr_pt_pages;
djm@1752 1099 si->mfn_list = vphysmap_start;
djm@1752 1100
djm@1752 1101 /* Write the phys->machine and machine->phys table entries. */
kaf24@2084 1102 for ( pfn = 0; pfn < p->tot_pages; pfn++ )
djm@1752 1103 {
kaf24@2084 1104 mfn = pfn + (alloc_start>>PAGE_SHIFT);
kaf24@2084 1105 #ifndef NDEBUG
cl349@2486 1106 #define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
kaf24@2084 1107 if ( pfn > REVERSE_START )
kaf24@2084 1108 mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START);
kaf24@2084 1109 #endif
djm@1752 1110 ((unsigned long *)vphysmap_start)[pfn] = mfn;
djm@1752 1111 machine_to_phys_mapping[mfn] = pfn;
djm@1752 1112 }
djm@1752 1113
djm@1752 1114 if ( initrd_len != 0 )
djm@1752 1115 {
djm@1752 1116 si->mod_start = vinitrd_start;
djm@1752 1117 si->mod_len = initrd_len;
djm@1752 1118 printk("Initrd len 0x%lx, start at 0x%08lx\n",
djm@1752 1119 si->mod_len, si->mod_start);
djm@1752 1120 }
djm@1752 1121
djm@1752 1122 dst = si->cmd_line;
djm@1752 1123 if ( cmdline != NULL )
djm@1752 1124 {
djm@1752 1125 for ( i = 0; i < 255; i++ )
djm@1752 1126 {
djm@1752 1127 if ( cmdline[i] == '\0' )
djm@1752 1128 break;
djm@1752 1129 *dst++ = cmdline[i];
djm@1752 1130 }
djm@1752 1131 }
djm@1752 1132 *dst = '\0';
djm@1752 1133
djm@1752 1134 /* Reinstate the caller's page tables. */
djm@1752 1135 write_ptbase(&current->mm);
djm@1752 1136 __sti();
djm@1752 1137
djm@1752 1138 /* Destroy low mappings - they were only for our convenience. */
djm@1752 1139 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
djm@1752 1140 if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE )
djm@1752 1141 l2start[i] = mk_l2_pgentry(0);
djm@1752 1142 zap_low_mappings(); /* Do the same for the idle page tables. */
djm@1752 1143
djm@1752 1144 /* DOM0 gets access to everything. */
djm@1752 1145 physdev_init_dom0(p);
djm@1752 1146
cl349@2957 1147 set_bit(DF_CONSTRUCTED, &p->d_flags);
djm@1752 1148
cl349@2957 1149 new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start);
iap10@2509 1150
djm@1752 1151 #if 0 /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */
iap10@2509 1152 shadow_lock(&p->mm);
iap10@2509 1153 shadow_mode_enable(p, SHM_test);
iap10@2509 1154 shadow_unlock(&p->mm);
djm@1752 1155 #endif
djm@1752 1156
djm@1752 1157 return 0;
djm@1752 1158 }