debuggers.hg: annotate xen/arch/x86/domain.c @ 3668:d55d523078f7
bitkeeper revision 1.1159.212.77 (4202221693AFbvFZWeMHHIjQfbzTIQ)

More x86_64 progress. Many more gaps filled in. Next step is DOM0
construction.

Signed-off-by: keir.fraser@cl.cam.ac.uk

author   | kaf24@scramble.cl.cam.ac.uk
date     | Thu Feb 03 13:07:34 2005 +0000 (2005-02-03)
parents  | 060c1ea52343
children | 677cb76cff18

rev | line source
kaf24@1787 | 1 /****************************************************************************** |
kaf24@1787 | 2 * arch/x86/domain.c |
kaf24@1787 | 3 * |
kaf24@1787 | 4 * x86-specific domain handling (e.g., register setup and context switching). |
kaf24@1787 | 5 */ |
kaf24@1787 | 6 |
kaf24@1710 | 7 /* |
kaf24@1710 | 8 * Copyright (C) 1995 Linus Torvalds |
kaf24@1710 | 9 * |
kaf24@1710 | 10 * Pentium III FXSR, SSE support |
kaf24@1710 | 11 * Gareth Hughes <gareth@valinux.com>, May 2000 |
kaf24@1710 | 12 */ |
kaf24@1710 | 13 |
kaf24@1710 | 14 #include <xen/config.h> |
kaf24@3372 | 15 #include <xen/init.h> |
kaf24@1710 | 16 #include <xen/lib.h> |
kaf24@1710 | 17 #include <xen/errno.h> |
kaf24@1710 | 18 #include <xen/sched.h> |
kaf24@1710 | 19 #include <xen/smp.h> |
kaf24@1710 | 20 #include <xen/delay.h> |
kaf24@1710 | 21 #include <xen/softirq.h> |
ach61@2843 | 22 #include <asm/regs.h> |
kaf24@1710 | 23 #include <asm/mc146818rtc.h> |
kaf24@1710 | 24 #include <asm/system.h> |
kaf24@1710 | 25 #include <asm/io.h> |
kaf24@1710 | 26 #include <asm/processor.h> |
kaf24@1710 | 27 #include <asm/desc.h> |
kaf24@1710 | 28 #include <asm/i387.h> |
kaf24@1710 | 29 #include <asm/mpspec.h> |
kaf24@1710 | 30 #include <asm/ldt.h> |
kaf24@1710 | 31 #include <xen/irq.h> |
kaf24@1710 | 32 #include <xen/event.h> |
kaf24@1787 | 33 #include <asm/shadow.h> |
djm@1752 | 34 #include <xen/console.h> |
djm@1752 | 35 #include <xen/elf.h> |
iap10@3328 | 36 #include <asm/vmx.h> |
iap10@3328 | 37 #include <asm/vmx_vmcs.h> |
iap10@3328 | 38 #include <xen/kernel.h> |
iap10@3328 | 39 #include <public/io/ioreq.h> |
kaf24@3177 | 40 #include <xen/multicall.h> |
djm@1752 | 41 |
kaf24@3372 | 42 /* opt_noreboot: If true, machine will need manual reset on error. */ |
kaf24@3372 | 43 static int opt_noreboot = 0; |
kaf24@3372 | 44 boolean_param("noreboot", opt_noreboot); |
kaf24@3372 | 45 |
djm@1752 | 46 #if !defined(CONFIG_X86_64BITMODE) |
djm@1752 | 47 /* No ring-3 access in initial page tables. */ |
djm@1752 | 48 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED) |
djm@1752 | 49 #else |
djm@1752 | 50 /* Allow ring-3 access in long mode as guest cannot use ring 1. */ |
djm@1752 | 51 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER) |
djm@1752 | 52 #endif |
djm@1752 | 53 #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER) |
djm@1752 | 54 #define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER) |
djm@1752 | 55 #define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER) |
djm@1752 | 56 |
djm@1752 | 57 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK) |
djm@1752 | 58 #define round_pgdown(_p) ((_p)&PAGE_MASK) |
djm@1752 | 59 |
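 | /* |
 |  * A brief gloss on the idle idiom below (editorial sketch; assumes |
 |  * safe_halt() expands to "sti; hlt"): interrupts are disabled before |
 |  * testing softirq_pending() so that no wakeup can slip in between the |
 |  * test and the halt; the one-instruction STI interrupt shadow then |
 |  * makes the "sti; hlt" pair effectively atomic. |
 |  */ |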
kaf24@3310 | 60 static void default_idle(void) |
kaf24@1710 | 61 { |
kaf24@3310 | 62 __cli(); |
kaf24@3310 | 63 if ( !softirq_pending(smp_processor_id()) ) |
kaf24@3310 | 64 safe_halt(); |
kaf24@3310 | 65 else |
kaf24@3310 | 66 __sti(); |
kaf24@1710 | 67 } |
kaf24@1710 | 68 |
kaf24@3354 | 69 static __attribute_used__ void idle_loop(void) |
kaf24@1710 | 70 { |
kaf24@1710 | 71 int cpu = smp_processor_id(); |
kaf24@1710 | 72 for ( ; ; ) |
kaf24@1710 | 73 { |
kaf24@1710 | 74 irq_stat[cpu].idle_timestamp = jiffies; |
kaf24@1710 | 75 while ( !softirq_pending(cpu) ) |
kaf24@1710 | 76 default_idle(); |
kaf24@1710 | 77 do_softirq(); |
kaf24@1710 | 78 } |
kaf24@1710 | 79 } |
kaf24@1710 | 80 |
kaf24@1710 | 81 void startup_cpu_idle_loop(void) |
kaf24@1710 | 82 { |
kaf24@1710 | 83 /* Just some sanity to ensure that the scheduler is set up okay. */ |
cl349@3036 | 84 ASSERT(current->domain->id == IDLE_DOMAIN_ID); |
cl349@2957 | 85 domain_unpause_by_systemcontroller(current->domain); |
kaf24@1710 | 86 __enter_scheduler(); |
kaf24@1710 | 87 |
kaf24@1710 | 88 /* |
kaf24@1710 | 89 * Declare CPU setup done to the boot processor. The memory |
kaf24@1710 | 90 * barrier ensures that the state change is visible before we proceed. |
kaf24@1710 | 91 */ |
kaf24@1710 | 92 smp_mb(); |
kaf24@1710 | 93 init_idle(); |
kaf24@1710 | 94 |
kaf24@3310 | 95 idle_loop(); |
kaf24@1710 | 96 } |
kaf24@1710 | 97 |
kaf24@1710 | 98 static long no_idt[2]; |
kaf24@1710 | 99 static int reboot_mode; |
kaf24@1710 | 100 int reboot_thru_bios = 0; |
kaf24@1710 | 101 |
kaf24@1710 | 102 #ifdef CONFIG_SMP |
kaf24@1710 | 103 int reboot_smp = 0; |
kaf24@1710 | 104 static int reboot_cpu = -1; |
kaf24@1710 | 105 /* shamelessly grabbed from lib/vsprintf.c for readability */ |
kaf24@1710 | 106 #define is_digit(c) ((c) >= '0' && (c) <= '9') |
kaf24@1710 | 107 #endif |
kaf24@1710 | 108 |
kaf24@1710 | 109 |
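 | /* |
 |  * Editorial gloss (background fact, not changeset text): port 0x64 is |
 |  * the i8042 keyboard-controller status port, and bit 1 means its input |
 |  * buffer is still full. kb_wait() therefore polls (bounded at 0x10000 |
 |  * iterations) until the controller can accept the pulse-reset command |
 |  * that machine_restart() later writes to the same port. |
 |  */ |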
kaf24@1710 | 110 static inline void kb_wait(void) |
kaf24@1710 | 111 { |
kaf24@1710 | 112 int i; |
kaf24@1710 | 113 |
kaf24@1710 | 114 for (i=0; i<0x10000; i++) |
kaf24@1710 | 115 if ((inb_p(0x64) & 0x02) == 0) |
kaf24@1710 | 116 break; |
kaf24@1710 | 117 } |
kaf24@1710 | 118 |
kaf24@1710 | 119 |
kaf24@1710 | 120 void machine_restart(char * __unused) |
kaf24@1710 | 121 { |
kaf24@1710 | 122 #ifdef CONFIG_SMP |
kaf24@1710 | 123 int cpuid; |
kaf24@1710 | 124 #endif |
kaf24@1710 | 125 |
kaf24@1710 | 126 if ( opt_noreboot ) |
kaf24@1710 | 127 { |
kaf24@1710 | 128 printk("Reboot disabled on cmdline: require manual reset\n"); |
kaf24@1710 | 129 for ( ; ; ) __asm__ __volatile__ ("hlt"); |
kaf24@1710 | 130 } |
kaf24@1710 | 131 |
kaf24@1710 | 132 #ifdef CONFIG_SMP |
kaf24@1710 | 133 cpuid = GET_APIC_ID(apic_read(APIC_ID)); |
kaf24@1710 | 134 |
kaf24@1710 | 135 /* KAF: Need interrupts enabled for safe IPI. */ |
kaf24@1710 | 136 __sti(); |
kaf24@1710 | 137 |
kaf24@1710 | 138 if (reboot_smp) { |
kaf24@1710 | 139 |
kaf24@1710 | 140 /* check to see if reboot_cpu is valid; |
kaf24@1710 | 141 if it's not, default to the BSP */ |
kaf24@1710 | 142 if ((reboot_cpu == -1) || |
kaf24@1710 | 143 (reboot_cpu > (NR_CPUS -1)) || |
kaf24@1710 | 144 !(phys_cpu_present_map & (1<<cpuid))) |
kaf24@1710 | 145 reboot_cpu = boot_cpu_physical_apicid; |
kaf24@1710 | 146 |
kaf24@1710 | 147 reboot_smp = 0; /* use this as a flag to only go through this once */ |
kaf24@1710 | 148 /* re-run this function on the other CPUs: |
kaf24@1710 | 149 each will fall through this section since we have |
kaf24@1710 | 150 cleared reboot_smp, and do the reboot if it is the |
kaf24@1710 | 151 correct CPU, otherwise it halts. */ |
kaf24@1710 | 152 if (reboot_cpu != cpuid) |
kaf24@1710 | 153 smp_call_function((void *)machine_restart , NULL, 1, 0); |
kaf24@1710 | 154 } |
kaf24@1710 | 155 |
kaf24@1710 | 156 /* if reboot_cpu is still -1, then we want a traditional reboot, |
kaf24@1710 | 157 and if we are not running on the reboot_cpu, halt */ |
kaf24@1710 | 158 if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) { |
kaf24@1710 | 159 for (;;) |
kaf24@1710 | 160 __asm__ __volatile__ ("hlt"); |
kaf24@1710 | 161 } |
kaf24@1710 | 162 /* |
kaf24@1710 | 163 * Stop all CPUs and turn off local APICs and the IO-APIC, so |
kaf24@1710 | 164 * other OSs see a clean IRQ state. |
kaf24@1710 | 165 */ |
kaf24@1710 | 166 smp_send_stop(); |
kaf24@1710 | 167 disable_IO_APIC(); |
kaf24@1710 | 168 #endif |
iap10@3328 | 169 #ifdef CONFIG_VMX |
iap10@3328 | 170 stop_vmx(); |
iap10@3328 | 171 #endif |
kaf24@1710 | 172 |
kaf24@1710 | 173 if(!reboot_thru_bios) { |
kaf24@1710 | 174 /* rebooting needs to touch the BIOS warm-boot flag at absolute addr 0x472 */ |
kaf24@1710 | 175 *((unsigned short *)__va(0x472)) = reboot_mode; |
kaf24@1710 | 176 for (;;) { |
kaf24@1710 | 177 int i; |
kaf24@1710 | 178 for (i=0; i<100; i++) { |
kaf24@1710 | 179 kb_wait(); |
kaf24@1710 | 180 udelay(50); |
kaf24@1710 | 181 outb(0xfe,0x64); /* pulse reset low */ |
kaf24@1710 | 182 udelay(50); |
kaf24@1710 | 183 } |
kaf24@1710 | 184 /* That didn't work - force a triple fault.. */ |
kaf24@1710 | 185 __asm__ __volatile__("lidt %0": "=m" (no_idt)); |
kaf24@1710 | 186 __asm__ __volatile__("int3"); |
kaf24@1710 | 187 } |
kaf24@1710 | 188 } |
kaf24@1710 | 189 |
kaf24@1710 | 190 panic("Need to reinclude BIOS reboot code\n"); |
kaf24@1710 | 191 } |
kaf24@1710 | 192 |
kaf24@1849 | 193 |
kaf24@1849 | 194 void __attribute__((noreturn)) __machine_halt(void *unused) |
kaf24@1849 | 195 { |
kaf24@1849 | 196 for ( ; ; ) |
kaf24@1849 | 197 __asm__ __volatile__ ( "cli; hlt" ); |
kaf24@1849 | 198 } |
kaf24@1849 | 199 |
kaf24@1710 | 200 void machine_halt(void) |
kaf24@1710 | 201 { |
kaf24@1849 | 202 smp_call_function(__machine_halt, NULL, 1, 1); |
kaf24@1849 | 203 __machine_halt(NULL); |
kaf24@1710 | 204 } |
kaf24@1710 | 205 |
kaf24@3515 | 206 void dump_pageframe_info(struct domain *d) |
kaf24@3515 | 207 { |
kaf24@3515 | 208 struct pfn_info *page; |
kaf24@3515 | 209 |
kaf24@3515 | 210 if ( d->tot_pages < 10 ) |
kaf24@3515 | 211 { |
kaf24@3568 | 212 list_for_each_entry ( page, &d->page_list, list ) |
kaf24@3515 | 213 { |
kaf24@3515 | 214 printk("Page %08x: caf=%08x, taf=%08x\n", |
kaf24@3515 | 215 page_to_phys(page), page->count_info, |
kaf24@3515 | 216 page->u.inuse.type_info); |
kaf24@3515 | 217 } |
kaf24@3515 | 218 } |
kaf24@3515 | 219 |
kaf24@3515 | 220 page = virt_to_page(d->shared_info); |
kaf24@3515 | 221 printk("Shared_info@%08x: caf=%08x, taf=%08x\n", |
kaf24@3515 | 222 page_to_phys(page), page->count_info, |
kaf24@3515 | 223 page->u.inuse.type_info); |
kaf24@3515 | 224 } |
kaf24@3515 | 225 |
kaf24@3515 | 226 struct domain *arch_alloc_domain_struct(void) |
kaf24@3515 | 227 { |
iap10@3651 | 228 return xmalloc(struct domain); |
kaf24@3515 | 229 } |
kaf24@3515 | 230 |
kaf24@3515 | 231 void arch_free_domain_struct(struct domain *d) |
kaf24@3515 | 232 { |
iap10@3651 | 233 xfree(d); |
kaf24@3515 | 234 } |
kaf24@3515 | 235 |
kaf24@3517 | 236 struct exec_domain *arch_alloc_exec_domain_struct(void) |
kaf24@3517 | 237 { |
iap10@3651 | 238 return xmalloc(struct exec_domain); |
kaf24@3517 | 239 } |
kaf24@3517 | 240 |
kaf24@3517 | 241 void arch_free_exec_domain_struct(struct exec_domain *ed) |
kaf24@3517 | 242 { |
iap10@3651 | 243 xfree(ed); |
kaf24@3517 | 244 } |
kaf24@3517 | 245 |
kaf24@1974 | 246 void free_perdomain_pt(struct domain *d) |
kaf24@1974 | 247 { |
cl349@3036 | 248 free_xenheap_page((unsigned long)d->mm_perdomain_pt); |
kaf24@1974 | 249 } |
kaf24@1974 | 250 |
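 | /* |
 |  * Editorial gloss on the two hooks below: thread.schedule_tail is |
 |  * invoked on the new stack after a context switch. Idle vcpus jump |
 |  * straight back into idle_loop(); all others resume the interrupted |
 |  * guest context via ret_from_intr. Neither function returns, hence |
 |  * reset_stack_and_jump(). |
 |  */ |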
cl349@3319 | 251 static void continue_idle_task(struct exec_domain *ed) |
kaf24@3310 | 252 { |
kaf24@3310 | 253 reset_stack_and_jump(idle_loop); |
kaf24@3310 | 254 } |
kaf24@3310 | 255 |
cl349@3319 | 256 static void continue_nonidle_task(struct exec_domain *ed) |
kaf24@3310 | 257 { |
kaf24@3310 | 258 reset_stack_and_jump(ret_from_intr); |
kaf24@3310 | 259 } |
kaf24@3310 | 260 |
cl349@3319 | 261 void arch_do_createdomain(struct exec_domain *ed) |
djm@1736 | 262 { |
cl349@2959 | 263 struct domain *d = ed->domain; |
kaf24@3668 | 264 |
cl349@3319 | 265 SET_DEFAULT_FAST_TRAP(&ed->thread); |
kaf24@3310 | 266 |
kaf24@3310 | 267 if ( d->id == IDLE_DOMAIN_ID ) |
kaf24@3310 | 268 { |
cl349@3319 | 269 ed->thread.schedule_tail = continue_idle_task; |
kaf24@3310 | 270 } |
kaf24@3310 | 271 else |
kaf24@3310 | 272 { |
cl349@3319 | 273 ed->thread.schedule_tail = continue_nonidle_task; |
djm@1736 | 274 |
kaf24@3310 | 275 d->shared_info = (void *)alloc_xenheap_page(); |
kaf24@3310 | 276 memset(d->shared_info, 0, PAGE_SIZE); |
cl349@3318 | 277 ed->vcpu_info = &d->shared_info->vcpu_data[ed->eid]; |
kaf24@3310 | 278 SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d); |
kaf24@3310 | 279 machine_to_phys_mapping[virt_to_phys(d->shared_info) >> |
kaf24@3310 | 280 PAGE_SHIFT] = INVALID_P2M_ENTRY; |
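 | /* Editorial gloss: shared_info is a xenheap page with no guest |
 | pseudo-physical frame of its own, so its machine-to-phys entry is |
 | marked INVALID_P2M_ENTRY; the same applies to the per-domain |
 | page-table page set up below. */ |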
kaf24@3310 | 281 |
cl349@3318 | 282 d->mm_perdomain_pt = (l1_pgentry_t *)alloc_xenheap_page(); |
cl349@3318 | 283 memset(d->mm_perdomain_pt, 0, PAGE_SIZE); |
cl349@3318 | 284 machine_to_phys_mapping[virt_to_phys(d->mm_perdomain_pt) >> |
kaf24@3310 | 285 PAGE_SHIFT] = INVALID_P2M_ENTRY; |
cl349@3324 | 286 ed->mm.perdomain_ptes = d->mm_perdomain_pt; |
kaf24@3310 | 287 } |
djm@1736 | 288 } |
djm@1736 | 289 |
iap10@3328 | 290 #ifdef CONFIG_VMX |
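 | /* |
 |  * Editorial gloss on the two entry points below: both load the vcpu's |
 |  * VMCS onto the current CPU and then jump, via reset_stack_and_jump(), |
 |  * into the assembly resume/launch paths, so neither ever returns to |
 |  * its caller. |
 |  */ |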
iap10@3605 | 291 void arch_vmx_do_resume(struct exec_domain *ed) |
iap10@3328 | 292 { |
iap10@3605 | 293 u64 vmcs_phys_ptr = (u64) virt_to_phys(ed->thread.arch_vmx.vmcs); |
iap10@3605 | 294 |
iap10@3605 | 295 load_vmcs(&ed->thread.arch_vmx, vmcs_phys_ptr); |
iap10@3605 | 296 vmx_do_resume(ed); |
iap10@3328 | 297 reset_stack_and_jump(vmx_asm_do_resume); |
iap10@3328 | 298 } |
iap10@3328 | 299 |
iap10@3605 | 300 void arch_vmx_do_launch(struct exec_domain *ed) |
iap10@3328 | 301 { |
iap10@3605 | 302 u64 vmcs_phys_ptr = (u64) virt_to_phys(ed->thread.arch_vmx.vmcs); |
iap10@3605 | 303 |
iap10@3605 | 304 load_vmcs(&ed->thread.arch_vmx, vmcs_phys_ptr); |
iap10@3605 | 305 vmx_do_launch(ed); |
iap10@3328 | 306 reset_stack_and_jump(vmx_asm_do_launch); |
iap10@3328 | 307 } |
iap10@3328 | 308 |
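 | /* |
 |  * Editorial sketch of what the function below builds: a fresh |
 |  * "monitor" L2 table for a VMX guest, which borrows the hypervisor |
 |  * mappings from idle_pg_table, installs the per-domain mapping, and |
 |  * switches the vcpu into full 32-bit shadow mode (SHM_full_32). |
 |  */ |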
iap10@3328 | 309 static void monitor_mk_pagetable(struct exec_domain *ed) |
iap10@3328 | 310 { |
iap10@3328 | 311 unsigned long mpfn; |
iap10@3328 | 312 l2_pgentry_t *mpl2e; |
iap10@3328 | 313 struct pfn_info *mpfn_info; |
iap10@3328 | 314 struct mm_struct *m = &ed->mm; |
iap10@3328 | 315 struct domain *d = ed->domain; |
iap10@3328 | 316 |
iap10@3328 | 317 mpfn_info = alloc_domheap_page(NULL); |
iap10@3328 | 318 ASSERT( mpfn_info ); |
iap10@3328 | 319 |
iap10@3328 | 320 mpfn = (unsigned long) (mpfn_info - frame_table); |
iap10@3605 | 321 mpl2e = (l2_pgentry_t *) map_domain_mem(mpfn << L1_PAGETABLE_SHIFT); |
iap10@3328 | 322 memset(mpl2e, 0, PAGE_SIZE); |
iap10@3328 | 323 |
iap10@3328 | 324 memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], |
iap10@3328 | 325 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], |
iap10@3328 | 326 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); |
iap10@3328 | 327 |
iap10@3605 | 328 m->monitor_table = mk_pagetable(mpfn << L1_PAGETABLE_SHIFT); |
iap10@3328 | 329 m->shadow_mode = SHM_full_32; |
iap10@3328 | 330 |
iap10@3328 | 331 mpl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = |
iap10@3328 | 332 mk_l2_pgentry((__pa(d->mm_perdomain_pt) & PAGE_MASK) |
iap10@3328 | 333 | __PAGE_HYPERVISOR); |
iap10@3328 | 334 |
iap10@3328 | 335 unmap_domain_mem(mpl2e); |
iap10@3328 | 336 } |
iap10@3328 | 337 |
iap10@3605 | 338 /* |
iap10@3605 | 339 * Free the pages for monitor_table and guest_pl2e_cache |
iap10@3605 | 340 */ |
iap10@3605 | 341 static void monitor_rm_pagetable(struct exec_domain *ed) |
iap10@3605 | 342 { |
iap10@3605 | 343 struct mm_struct *m = &ed->mm; |
iap10@3605 | 344 l2_pgentry_t *mpl2e; |
iap10@3605 | 345 unsigned long mpfn; |
iap10@3605 | 346 |
iap10@3605 | 347 mpl2e = (l2_pgentry_t *) map_domain_mem(pagetable_val(m->monitor_table)); |
iap10@3605 | 348 /* |
iap10@3605 | 349 * First get the pfn for guest_pl2e_cache by looking at monitor_table |
iap10@3605 | 350 */ |
iap10@3605 | 351 mpfn = l2_pgentry_val(mpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) |
iap10@3605 | 352 >> PAGE_SHIFT; |
iap10@3605 | 353 |
iap10@3605 | 354 free_domheap_page(&frame_table[mpfn]); |
iap10@3605 | 355 unmap_domain_mem(mpl2e); |
iap10@3605 | 356 |
iap10@3605 | 357 /* |
iap10@3605 | 358 * Then free monitor_table. |
iap10@3605 | 359 */ |
iap10@3605 | 360 mpfn = (pagetable_val(m->monitor_table)) >> PAGE_SHIFT; |
iap10@3605 | 361 free_domheap_page(&frame_table[mpfn]); |
iap10@3605 | 362 |
iap10@3605 | 363 m->monitor_table = mk_pagetable(0); |
iap10@3605 | 364 } |
iap10@3605 | 365 |
iap10@3605 | 366 static int vmx_final_setup_guestos(struct exec_domain *ed, |
iap10@3328 | 367 full_execution_context_t *full_context) |
iap10@3328 | 368 { |
iap10@3328 | 369 int error; |
iap10@3328 | 370 execution_context_t *context; |
iap10@3328 | 371 struct vmcs_struct *vmcs; |
iap10@3328 | 372 |
iap10@3328 | 373 context = &full_context->cpu_ctxt; |
iap10@3328 | 374 |
iap10@3328 | 375 /* |
iap10@3328 | 376 * Create a new VMCS |
iap10@3328 | 377 */ |
iap10@3328 | 378 if (!(vmcs = alloc_vmcs())) { |
iap10@3328 | 379 printk("Failed to create a new VMCS\n"); |
iap10@3328 | 380 return -ENOMEM; |
iap10@3328 | 381 } |
iap10@3328 | 382 |
iap10@3605 | 383 memset(&ed->thread.arch_vmx, 0, sizeof (struct arch_vmx_struct)); |
iap10@3328 | 384 |
iap10@3605 | 385 ed->thread.arch_vmx.vmcs = vmcs; |
iap10@3605 | 386 error = construct_vmcs(&ed->thread.arch_vmx, context, full_context, VMCS_USE_HOST_ENV); |
iap10@3328 | 387 if (error < 0) { |
iap10@3328 | 388 printk("Failed to construct a new VMCS\n"); |
iap10@3328 | 389 goto out; |
iap10@3328 | 390 } |
iap10@3328 | 391 |
iap10@3605 | 392 monitor_mk_pagetable(ed); |
iap10@3605 | 393 ed->thread.schedule_tail = arch_vmx_do_launch; |
iap10@3605 | 394 clear_bit(VMX_CPU_STATE_PG_ENABLED, &ed->thread.arch_vmx.cpu_state); |
iap10@3328 | 395 |
iap10@3605 | 396 #if defined (__i386__) |
iap10@3605 | 397 ed->thread.arch_vmx.vmx_platform.real_mode_data = |
iap10@3328 | 398 (unsigned long *) context->esi; |
iap10@3605 | 399 #endif |
iap10@3328 | 400 |
iap10@3605 | 401 if (ed == ed->domain->exec_domain[0]) { |
iap10@3605 | 402 /* |
iap10@3605 | 403 * This need only be done once per domain. |
iap10@3605 | 404 */ |
iap10@3605 | 405 memset(&ed->domain->shared_info->evtchn_mask[0], 0xff, |
iap10@3605 | 406 sizeof(ed->domain->shared_info->evtchn_mask)); |
iap10@3605 | 407 clear_bit(IOPACKET_PORT, &ed->domain->shared_info->evtchn_mask[0]); |
iap10@3605 | 408 } |
iap10@3328 | 409 |
iap10@3328 | 410 return 0; |
iap10@3328 | 411 |
iap10@3328 | 412 out: |
iap10@3328 | 413 free_vmcs(vmcs); |
iap10@3605 | 414 ed->thread.arch_vmx.vmcs = 0; |
iap10@3328 | 415 return error; |
iap10@3328 | 416 } |
iap10@3328 | 417 #endif |
iap10@3328 | 418 |
cl349@2957 | 419 int arch_final_setup_guestos(struct exec_domain *d, full_execution_context_t *c) |
djm@1736 | 420 { |
djm@1736 | 421 unsigned long phys_basetab; |
kaf24@2503 | 422 int i, rc; |
djm@1736 | 423 |
cl349@2957 | 424 clear_bit(EDF_DONEFPUINIT, &d->ed_flags); |
djm@1736 | 425 if ( c->flags & ECF_I387_VALID ) |
cl349@2957 | 426 set_bit(EDF_DONEFPUINIT, &d->ed_flags); |
kaf24@2503 | 427 |
kaf24@2722 | 428 memcpy(&d->thread.user_ctxt, |
djm@1736 | 429 &c->cpu_ctxt, |
kaf24@2722 | 430 sizeof(d->thread.user_ctxt)); |
kaf24@2722 | 431 |
cl349@3221 | 432 /* Clear IOPL for unprivileged domains. */ |
cl349@3221 | 433 if (!IS_PRIV(d->domain)) |
cl349@3221 | 434 d->thread.user_ctxt.eflags &= 0xffffcfff; |
cl349@3221 | 435 |
kaf24@2722 | 436 /* |
kaf24@2722 | 437 * Checking CS and SS RPLs below is sufficient: if the descriptor DPL |
kaf24@2722 | 438 * differs from CS RPL then we'll #GP. If DS, ES, FS, GS are DPL 0 then |
kaf24@2722 | 439 * they'll be cleared automatically. If SS RPL or DPL differs from CS |
kaf24@2722 | 440 * RPL then we'll #GP. |
iap10@3427 | 441 if (!(c->flags & ECF_VMX_GUEST)) |
iap10@3427 | 442 if ( ((d->thread.user_ctxt.cs & 3) == 0) || |
iap10@3427 | 443 ((d->thread.user_ctxt.ss & 3) == 0) ) |
iap10@3427 | 444 return -EINVAL; |
kaf24@2503 | 445 |
kaf24@2503 | 446 memcpy(&d->thread.i387, |
djm@1736 | 447 &c->fpu_ctxt, |
kaf24@2503 | 448 sizeof(d->thread.i387)); |
kaf24@2503 | 449 |
kaf24@2503 | 450 memcpy(d->thread.traps, |
djm@1736 | 451 &c->trap_ctxt, |
kaf24@2503 | 452 sizeof(d->thread.traps)); |
kaf24@2503 | 453 |
kaf24@2503 | 454 if ( (rc = (int)set_fast_trap(d, c->fast_trap_idx)) != 0 ) |
kaf24@2503 | 455 return rc; |
kaf24@2503 | 456 |
kaf24@2503 | 457 d->mm.ldt_base = c->ldt_base; |
kaf24@2503 | 458 d->mm.ldt_ents = c->ldt_ents; |
kaf24@2503 | 459 |
kaf24@2503 | 460 d->thread.guestos_ss = c->guestos_ss; |
kaf24@2503 | 461 d->thread.guestos_sp = c->guestos_esp; |
kaf24@2503 | 462 |
djm@1736 | 463 for ( i = 0; i < 8; i++ ) |
kaf24@2503 | 464 (void)set_debugreg(d, i, c->debugreg[i]); |
kaf24@2503 | 465 |
kaf24@3081 | 466 d->thread.event_selector = c->event_callback_cs; |
kaf24@3081 | 467 d->thread.event_address = c->event_callback_eip; |
kaf24@3081 | 468 d->thread.failsafe_selector = c->failsafe_callback_cs; |
kaf24@3081 | 469 d->thread.failsafe_address = c->failsafe_callback_eip; |
djm@1736 | 470 |
djm@1736 | 471 phys_basetab = c->pt_base; |
kaf24@2503 | 472 d->mm.pagetable = mk_pagetable(phys_basetab); |
cl349@2957 | 473 if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d->domain, |
kaf24@2503 | 474 PGT_base_page_table) ) |
kaf24@2503 | 475 return -EINVAL; |
kaf24@2503 | 476 |
kaf24@2503 | 477 /* Failure to set GDT is harmless. */ |
kaf24@2503 | 478 SET_GDT_ENTRIES(d, DEFAULT_GDT_ENTRIES); |
kaf24@2503 | 479 SET_GDT_ADDRESS(d, DEFAULT_GDT_ADDRESS); |
kaf24@2503 | 480 if ( c->gdt_ents != 0 ) |
kaf24@2503 | 481 { |
kaf24@2503 | 482 if ( (rc = (int)set_gdt(d, c->gdt_frames, c->gdt_ents)) != 0 ) |
kaf24@2503 | 483 { |
kaf24@2503 | 484 put_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT]); |
kaf24@2503 | 485 return rc; |
kaf24@2503 | 486 } |
kaf24@2503 | 487 } |
kaf24@2503 | 488 |
kaf24@3613 | 489 #ifdef CONFIG_VMX |
iap10@3328 | 490 if (c->flags & ECF_VMX_GUEST) |
iap10@3328 | 491 return vmx_final_setup_guestos(d, c); |
kaf24@3613 | 492 #endif |
iap10@3328 | 493 |
kaf24@2503 | 494 return 0; |
djm@1736 | 495 } |
djm@1736 | 496 |
cl349@2957 | 497 void new_thread(struct exec_domain *d, |
kaf24@1710 | 498 unsigned long start_pc, |
kaf24@1710 | 499 unsigned long start_stack, |
kaf24@1710 | 500 unsigned long start_info) |
kaf24@1710 | 501 { |
kaf24@2722 | 502 execution_context_t *ec = &d->thread.user_ctxt; |
kaf24@1710 | 503 |
kaf24@1710 | 504 /* |
kaf24@1710 | 505 * Initial register values: |
kaf24@1710 | 506 * DS,ES,FS,GS = FLAT_GUESTOS_DS |
kaf24@1710 | 507 * CS:EIP = FLAT_GUESTOS_CS:start_pc |
kaf24@1710 | 508 * SS:ESP = FLAT_GUESTOS_DS:start_stack |
kaf24@1710 | 509 * ESI = start_info |
kaf24@1710 | 510 * [EAX,EBX,ECX,EDX,EDI,EBP are zero] |
kaf24@1710 | 511 */ |
kaf24@3668 | 512 ec->ds = ec->es = ec->fs = ec->gs = ec->ss = FLAT_GUESTOS_DS; |
kaf24@3668 | 513 ec->cs = FLAT_GUESTOS_CS; |
kaf24@1710 | 514 ec->eip = start_pc; |
kaf24@1710 | 515 ec->esp = start_stack; |
kaf24@1710 | 516 ec->esi = start_info; |
kaf24@1710 | 517 |
kaf24@1710 | 518 __save_flags(ec->eflags); |
kaf24@1710 | 519 ec->eflags |= X86_EFLAGS_IF; |
kaf24@1710 | 520 } |
kaf24@1710 | 521 |
kaf24@1710 | 522 |
kaf24@1710 | 523 /* |
kaf24@1710 | 524 * This special macro can be used to load a debugging register |
kaf24@1710 | 525 */ |
kaf24@1710 | 526 #define loaddebug(thread,register) \ |
kaf24@3668 | 527 __asm__("mov %0,%%db" #register \ |
kaf24@1710 | 528 : /* no output */ \ |
kaf24@1710 | 529 :"r" (thread->debugreg[register])) |
kaf24@1710 | 530 |
cl349@2957 | 531 void switch_to(struct exec_domain *prev_p, struct exec_domain *next_p) |
kaf24@1710 | 532 { |
kaf24@1710 | 533 struct thread_struct *next = &next_p->thread; |
kaf24@1710 | 534 struct tss_struct *tss = init_tss + smp_processor_id(); |
kaf24@1710 | 535 execution_context_t *stack_ec = get_execution_context(); |
kaf24@1710 | 536 int i; |
kaf24@3668 | 537 #ifdef CONFIG_VMX |
iap10@3328 | 538 unsigned long vmx_domain = next_p->thread.arch_vmx.flags; |
kaf24@3668 | 539 #endif |
iap10@3328 | 540 |
kaf24@1710 | 541 __cli(); |
kaf24@1710 | 542 |
kaf24@1710 | 543 /* Switch guest general-register state. */ |
cl349@2957 | 544 if ( !is_idle_task(prev_p->domain) ) |
kaf24@1710 | 545 { |
kaf24@2722 | 546 memcpy(&prev_p->thread.user_ctxt, |
kaf24@1710 | 547 stack_ec, |
kaf24@1710 | 548 sizeof(*stack_ec)); |
kaf24@1710 | 549 unlazy_fpu(prev_p); |
kaf24@1710 | 550 CLEAR_FAST_TRAP(&prev_p->thread); |
kaf24@1710 | 551 } |
kaf24@1710 | 552 |
cl349@2957 | 553 if ( !is_idle_task(next_p->domain) ) |
kaf24@1710 | 554 { |
kaf24@1710 | 555 memcpy(stack_ec, |
kaf24@2722 | 556 &next_p->thread.user_ctxt, |
kaf24@1710 | 557 sizeof(*stack_ec)); |
kaf24@1710 | 558 |
kaf24@1710 | 559 /* Maybe switch the debug registers. */ |
kaf24@1710 | 560 if ( unlikely(next->debugreg[7]) ) |
kaf24@1710 | 561 { |
kaf24@1710 | 562 loaddebug(next, 0); |
kaf24@1710 | 563 loaddebug(next, 1); |
kaf24@1710 | 564 loaddebug(next, 2); |
kaf24@1710 | 565 loaddebug(next, 3); |
kaf24@1710 | 566 /* no 4 and 5 */ |
kaf24@1710 | 567 loaddebug(next, 6); |
kaf24@1710 | 568 loaddebug(next, 7); |
kaf24@1710 | 569 } |
kaf24@1710 | 570 |
kaf24@3668 | 571 #ifdef CONFIG_VMX |
kaf24@3668 | 572 if ( vmx_domain ) |
kaf24@3668 | 573 { |
iap10@3328 | 574 /* Switch page tables. */ |
iap10@3328 | 575 write_ptbase(&next_p->mm); |
iap10@3328 | 576 |
iap10@3328 | 577 set_current(next_p); |
iap10@3328 | 578 /* Switch GDT and LDT. */ |
iap10@3328 | 579 __asm__ __volatile__ ("lgdt %0" : : "m" (*next_p->mm.gdt)); |
iap10@3328 | 580 |
iap10@3328 | 581 __sti(); |
iap10@3328 | 582 return; |
kaf24@3668 | 583 } |
kaf24@3668 | 584 #endif |
iap10@3328 | 585 |
iap10@3328 | 586 SET_FAST_TRAP(&next_p->thread); |
iap10@3328 | 587 |
kaf24@3668 | 588 #ifdef __i386__ |
iap10@3328 | 589 /* Switch the guest OS ring-1 stack. */ |
iap10@3328 | 590 tss->esp1 = next->guestos_sp; |
iap10@3328 | 591 tss->ss1 = next->guestos_ss; |
kaf24@3668 | 592 #endif |
iap10@3328 | 593 |
kaf24@1710 | 594 /* Switch page tables. */ |
kaf24@1710 | 595 write_ptbase(&next_p->mm); |
kaf24@1710 | 596 } |
kaf24@1710 | 597 |
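 | /* |
 |  * Editorial gloss on the I/O-bitmap juggling below (inferred from the |
 |  * code): a cleared bit in io_bitmap_sel marks a chunk of |
 |  * IOBMP_BYTES_PER_SELBIT bytes in the TSS bitmap that differs from the |
 |  * all-ones default. Only such dirty chunks are reset when switching |
 |  * away and copied in when switching to a domain with an I/O bitmap, |
 |  * keeping the common context-switch path cheap. |
 |  */ |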
kaf24@3088 | 598 if ( unlikely(prev_p->thread.io_bitmap != NULL) ) |
kaf24@1710 | 599 { |
kaf24@3088 | 600 for ( i = 0; i < sizeof(prev_p->thread.io_bitmap_sel) * 8; i++ ) |
kaf24@3088 | 601 if ( !test_bit(i, &prev_p->thread.io_bitmap_sel) ) |
kaf24@3088 | 602 memset(&tss->io_bitmap[i * IOBMP_BYTES_PER_SELBIT], |
kaf24@3088 | 603 ~0U, IOBMP_BYTES_PER_SELBIT); |
kaf24@3088 | 604 tss->bitmap = IOBMP_INVALID_OFFSET; |
kaf24@3088 | 605 } |
kaf24@1710 | 606 |
kaf24@3088 | 607 if ( unlikely(next_p->thread.io_bitmap != NULL) ) |
kaf24@3088 | 608 { |
kaf24@3088 | 609 for ( i = 0; i < sizeof(next_p->thread.io_bitmap_sel) * 8; i++ ) |
kaf24@3088 | 610 if ( !test_bit(i, &next_p->thread.io_bitmap_sel) ) |
kaf24@3088 | 611 memcpy(&tss->io_bitmap[i * IOBMP_BYTES_PER_SELBIT], |
kaf24@3088 | 612 &next_p->thread.io_bitmap[i * IOBMP_BYTES_PER_SELBIT], |
kaf24@3088 | 613 IOBMP_BYTES_PER_SELBIT); |
kaf24@3088 | 614 tss->bitmap = IOBMP_OFFSET; |
kaf24@1710 | 615 } |
kaf24@1710 | 616 |
kaf24@1710 | 617 set_current(next_p); |
kaf24@1710 | 618 |
kaf24@1710 | 619 /* Switch GDT and LDT. */ |
kaf24@1710 | 620 __asm__ __volatile__ ("lgdt %0" : : "m" (*next_p->mm.gdt)); |
kaf24@1710 | 621 load_LDT(next_p); |
kaf24@1710 | 622 |
kaf24@1710 | 623 __sti(); |
kaf24@1710 | 624 } |
kaf24@1710 | 625 |
kaf24@1710 | 626 |
kaf24@1710 | 627 /* XXX Currently the 'domain' field is ignored! XXX */ |
kaf24@1710 | 628 long do_iopl(domid_t domain, unsigned int new_io_pl) |
kaf24@1710 | 629 { |
kaf24@1710 | 630 execution_context_t *ec = get_execution_context(); |
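 | /* IOPL lives in EFLAGS bits 12-13; the 0xffffcfff mask clears just |
 | those two bits before the new value is inserted. */ |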
kaf24@1710 | 631 ec->eflags = (ec->eflags & 0xffffcfff) | ((new_io_pl&3) << 12); |
kaf24@1710 | 632 return 0; |
kaf24@1710 | 633 } |
kaf24@1710 | 634 |
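 | /* |
 |  * Editorial gloss on the continuation mechanism below: inside a |
 |  * multicall, the preempted call's arguments are stashed in the per-cpu |
 |  * mc_state; otherwise the guest's saved context is rewritten so that |
 |  * EAX holds the hypercall number, EBX onwards hold the (possibly |
 |  * updated) arguments, and EIP is wound back two bytes to re-execute |
 |  * the "int 0x82" hypercall instruction on next entry to the guest. |
 |  */ |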
kaf24@3187 | 635 unsigned long hypercall_create_continuation( |
kaf24@3187 | 636 unsigned int op, unsigned int nr_args, ...) |
kaf24@3129 | 637 { |
kaf24@3177 | 638 struct mc_state *mcs = &mc_state[smp_processor_id()]; |
kaf24@3177 | 639 execution_context_t *ec; |
kaf24@3177 | 640 unsigned long *preg; |
kaf24@3129 | 641 unsigned int i; |
kaf24@3129 | 642 va_list args; |
kaf24@3129 | 643 |
kaf24@3177 | 644 va_start(args, nr_args); |
kaf24@3177 | 645 |
kaf24@3177 | 646 if ( test_bit(_MCSF_in_multicall, &mcs->flags) ) |
kaf24@3177 | 647 { |
kaf24@3177 | 648 __set_bit(_MCSF_call_preempted, &mcs->flags); |
kaf24@3129 | 649 |
kaf24@3177 | 650 for ( i = 0; i < nr_args; i++ ) |
kaf24@3177 | 651 mcs->call.args[i] = va_arg(args, unsigned long); |
kaf24@3177 | 652 } |
kaf24@3177 | 653 else |
kaf24@3177 | 654 { |
kaf24@3177 | 655 ec = get_execution_context(); |
kaf24@3314 | 656 #if defined(__i386__) |
kaf24@3177 | 657 ec->eax = op; |
kaf24@3177 | 658 ec->eip -= 2; /* re-execute 'int 0x82' */ |
kaf24@3177 | 659 |
kaf24@3177 | 660 for ( i = 0, preg = &ec->ebx; i < nr_args; i++, preg++ ) |
kaf24@3177 | 661 *preg = va_arg(args, unsigned long); |
kaf24@3314 | 662 #else |
kaf24@3314 | 663 preg = NULL; /* XXX x86/64 */ |
kaf24@3314 | 664 #endif |
kaf24@3177 | 665 } |
kaf24@3177 | 666 |
kaf24@3129 | 667 va_end(args); |
kaf24@3187 | 668 |
kaf24@3187 | 669 return op; |
kaf24@3129 | 670 } |
kaf24@3129 | 671 |
kaf24@2466 | 672 static void relinquish_list(struct domain *d, struct list_head *list) |
djm@1752 | 673 { |
kaf24@2466 | 674 struct list_head *ent; |
djm@1752 | 675 struct pfn_info *page; |
djm@1752 | 676 unsigned long x, y; |
djm@1752 | 677 |
kaf24@2466 | 678 /* Use a recursive lock, as we may enter 'free_domheap_page'. */ |
kaf24@2466 | 679 spin_lock_recursive(&d->page_alloc_lock); |
kaf24@2466 | 680 |
kaf24@2466 | 681 ent = list->next; |
kaf24@2466 | 682 while ( ent != list ) |
kaf24@2466 | 683 { |
kaf24@2466 | 684 page = list_entry(ent, struct pfn_info, list); |
kaf24@2466 | 685 |
kaf24@2467 | 686 /* Grab a reference to the page so it won't disappear from under us. */ |
kaf24@2467 | 687 if ( unlikely(!get_page(page, d)) ) |
kaf24@2466 | 688 { |
kaf24@2467 | 689 /* Couldn't get a reference -- someone is freeing this page. */ |
kaf24@2467 | 690 ent = ent->next; |
kaf24@2466 | 691 continue; |
kaf24@2466 | 692 } |
kaf24@2466 | 693 |
kaf24@2467 | 694 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) ) |
kaf24@2467 | 695 put_page_and_type(page); |
kaf24@2467 | 696 |
kaf24@2466 | 697 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) ) |
kaf24@2466 | 698 put_page(page); |
kaf24@2466 | 699 |
kaf24@2466 | 700 /* |
kaf24@2466 | 701 * Forcibly invalidate base page tables at this point to break circular |
kaf24@2466 | 702 * 'linear page table' references. This is okay because MMU structures |
kaf24@2466 | 703 * are not shared across domains and this domain is now dead. Thus base |
kaf24@2466 | 704 * tables are not in use so a non-zero count means circular reference. |
kaf24@2466 | 705 */ |
kaf24@2466 | 706 y = page->u.inuse.type_info; |
kaf24@2467 | 707 for ( ; ; ) |
kaf24@2467 | 708 { |
kaf24@2466 | 709 x = y; |
kaf24@2466 | 710 if ( likely((x & (PGT_type_mask|PGT_validated)) != |
kaf24@2466 | 711 (PGT_base_page_table|PGT_validated)) ) |
kaf24@2466 | 712 break; |
kaf24@2467 | 713 |
kaf24@2466 | 714 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated); |
kaf24@2466 | 715 if ( likely(y == x) ) |
kaf24@2466 | 716 { |
kaf24@2466 | 717 free_page_type(page, PGT_base_page_table); |
kaf24@2467 | 718 break; |
kaf24@2466 | 719 } |
kaf24@2466 | 720 } |
kaf24@2467 | 721 |
kaf24@2467 | 722 /* Follow the list chain and /then/ potentially free the page. */ |
kaf24@2467 | 723 ent = ent->next; |
kaf24@2467 | 724 put_page(page); |
kaf24@2466 | 725 } |
kaf24@2466 | 726 |
kaf24@2466 | 727 spin_unlock_recursive(&d->page_alloc_lock); |
kaf24@2466 | 728 } |
kaf24@2466 | 729 |
kaf24@3613 | 730 #ifdef CONFIG_VMX |
iap10@3605 | 731 static void vmx_domain_relinquish_memory(struct exec_domain *ed) |
iap10@3605 | 732 { |
iap10@3605 | 733 struct domain *d = ed->domain; |
iap10@3605 | 734 |
iap10@3605 | 735 /* |
iap10@3605 | 736 * Free VMCS |
iap10@3605 | 737 */ |
iap10@3605 | 738 ASSERT(ed->thread.arch_vmx.vmcs); |
iap10@3605 | 739 free_vmcs(ed->thread.arch_vmx.vmcs); |
iap10@3605 | 740 ed->thread.arch_vmx.vmcs = 0; |
iap10@3605 | 741 |
iap10@3605 | 742 monitor_rm_pagetable(ed); |
iap10@3605 | 743 |
iap10@3605 | 744 if (ed == d->exec_domain[0]) { |
iap10@3605 | 745 int i; |
iap10@3605 | 746 unsigned long pfn; |
iap10@3605 | 747 |
iap10@3605 | 748 for (i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++) { |
iap10@3605 | 749 unsigned long l1e; |
iap10@3605 | 750 |
iap10@3605 | 751 l1e = l1_pgentry_val(d->mm_perdomain_pt[i]); |
iap10@3605 | 752 if (l1e & _PAGE_PRESENT) { |
iap10@3605 | 753 pfn = l1e >> PAGE_SHIFT; |
iap10@3605 | 754 free_domheap_page(&frame_table[pfn]); |
iap10@3605 | 755 } |
iap10@3605 | 756 } |
iap10@3605 | 757 } |
iap10@3605 | 758 |
iap10@3605 | 759 } |
kaf24@3613 | 760 #endif |
kaf24@2466 | 761 |
kaf24@2466 | 762 void domain_relinquish_memory(struct domain *d) |
kaf24@2466 | 763 { |
cl349@2961 | 764 struct exec_domain *ed; |
cl349@2961 | 765 |
kaf24@1806 | 766 /* Ensure that no one is running over the dead domain's page tables. */ |
kaf24@1806 | 767 synchronise_pagetables(~0UL); |
djm@1752 | 768 |
djm@1752 | 769 /* Exit shadow mode before deconstructing final guest page table. */ |
kaf24@1787 | 770 shadow_mode_disable(d); |
djm@1752 | 771 |
djm@1752 | 772 /* Drop the in-use reference to the page-table base. */ |
kaf24@3613 | 773 for_each_exec_domain ( d, ed ) |
kaf24@3613 | 774 { |
cl349@2961 | 775 if ( pagetable_val(ed->mm.pagetable) != 0 ) |
cl349@2961 | 776 put_page_and_type(&frame_table[pagetable_val(ed->mm.pagetable) >> |
cl349@2961 | 777 PAGE_SHIFT]); |
cl349@2961 | 778 } |
djm@1752 | 779 |
kaf24@3613 | 780 #ifdef CONFIG_VMX |
kaf24@3613 | 781 if ( VMX_DOMAIN(d->exec_domain[0]) ) |
kaf24@3613 | 782 for_each_exec_domain ( d, ed ) |
iap10@3605 | 783 vmx_domain_relinquish_memory(ed); |
kaf24@3613 | 784 #endif |
iap10@3605 | 785 |
kaf24@1787 | 786 /* |
kaf24@1787 | 787 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as |
kaf24@1787 | 788 * it automatically gets squashed when the guest's mappings go away. |
kaf24@1787 | 789 */ |
cl349@2964 | 790 for_each_exec_domain(d, ed) |
cl349@2964 | 791 destroy_gdt(ed); |
kaf24@1787 | 792 |
kaf24@2466 | 793 /* Relinquish every page of memory. */ |
kaf24@2466 | 794 relinquish_list(d, &d->xenpage_list); |
kaf24@2466 | 795 relinquish_list(d, &d->page_list); |
djm@1752 | 796 } |
djm@1752 | 797 |
djm@1752 | 798 |
djm@1752 | 799 int construct_dom0(struct domain *p, |
djm@1752 | 800 unsigned long alloc_start, |
djm@1752 | 801 unsigned long alloc_end, |
djm@1752 | 802 char *image_start, unsigned long image_len, |
djm@1752 | 803 char *initrd_start, unsigned long initrd_len, |
djm@1752 | 804 char *cmdline) |
djm@1752 | 805 { |
djm@1752 | 806 char *dst; |
djm@1752 | 807 int i, rc; |
djm@1752 | 808 unsigned long pfn, mfn; |
djm@1752 | 809 unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT; |
djm@1752 | 810 unsigned long nr_pt_pages; |
djm@1752 | 811 unsigned long count; |
djm@1752 | 812 l2_pgentry_t *l2tab, *l2start; |
djm@1752 | 813 l1_pgentry_t *l1tab = NULL, *l1start = NULL; |
djm@1752 | 814 struct pfn_info *page = NULL; |
djm@1752 | 815 start_info_t *si; |
cl349@2957 | 816 struct exec_domain *ed = p->exec_domain[0]; |
djm@1752 | 817 |
djm@1752 | 818 /* |
djm@1752 | 819 * This fully describes the memory layout of the initial domain. All |
djm@1752 | 820 * *_start address are page-aligned, except v_start (and v_end) which are |
djm@1752 | 821 * superpage-aligned. |
djm@1752 | 822 */ |
cl349@2486 | 823 struct domain_setup_info dsi; |
djm@1752 | 824 unsigned long vinitrd_start; |
djm@1752 | 825 unsigned long vinitrd_end; |
djm@1752 | 826 unsigned long vphysmap_start; |
djm@1752 | 827 unsigned long vphysmap_end; |
djm@1752 | 828 unsigned long vstartinfo_start; |
djm@1752 | 829 unsigned long vstartinfo_end; |
djm@1752 | 830 unsigned long vstack_start; |
djm@1752 | 831 unsigned long vstack_end; |
djm@1752 | 832 unsigned long vpt_start; |
djm@1752 | 833 unsigned long vpt_end; |
djm@1752 | 834 unsigned long v_end; |
djm@1752 | 835 |
djm@1752 | 836 /* Machine address of next candidate page-table page. */ |
djm@1752 | 837 unsigned long mpt_alloc; |
djm@1752 | 838 |
djm@1752 | 839 extern void physdev_init_dom0(struct domain *); |
djm@1752 | 840 |
djm@1752 | 841 /* Sanity! */ |
kaf24@2748 | 842 if ( p->id != 0 ) |
djm@1752 | 843 BUG(); |
cl349@2957 | 844 if ( test_bit(DF_CONSTRUCTED, &p->d_flags) ) |
djm@1752 | 845 BUG(); |
djm@1752 | 846 |
cl349@2486 | 847 memset(&dsi, 0, sizeof(struct domain_setup_info)); |
cl349@2486 | 848 |
djm@1752 | 849 printk("*** LOADING DOMAIN 0 ***\n"); |
djm@1752 | 850 |
djm@1752 | 851 /* |
djm@1752 | 852 * This is all a bit grim. We've moved the modules to the "safe" physical |
djm@1752 | 853 * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this |
djm@1752 | 854 * routine we're going to copy it down into the region that's actually |
djm@1752 | 855 * been allocated to domain 0. This is highly likely to be overlapping, so |
djm@1752 | 856 * we use a forward copy. |
djm@1752 | 857 * |
djm@1752 | 858 * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with |
djm@1752 | 859 * 4GB and lots of network/disk cards that allocate loads of buffers. |
djm@1752 | 860 * We'll have to revisit this if we ever support PAE (64GB). |
djm@1752 | 861 */ |
djm@1752 | 862 |
cl349@2486 | 863 rc = parseelfimage(image_start, image_len, &dsi); |
djm@1752 | 864 if ( rc != 0 ) |
djm@1752 | 865 return rc; |
djm@1752 | 866 |
cl349@2487 | 867 /* Set up domain options */ |
cl349@2487 | 868 if ( dsi.use_writable_pagetables ) |
cl349@2487 | 869 vm_assist(p, VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); |
cl349@2487 | 870 |
cl349@2486 | 871 if ( (dsi.v_start & (PAGE_SIZE-1)) != 0 ) |
kaf24@1818 | 872 { |
kaf24@1818 | 873 printk("Initial guest OS must load to a page boundary.\n"); |
kaf24@1818 | 874 return -EINVAL; |
kaf24@1818 | 875 } |
kaf24@1818 | 876 |
djm@1752 | 877 /* |
djm@1752 | 878 * Why do we need this? The number of page-table frames depends on the |
djm@1752 | 879 * size of the bootstrap address space. But the size of the address space |
djm@1752 | 880 * depends on the number of page-table frames (since each one is mapped |
djm@1752 | 881 * read-only). We have a pair of simultaneous equations in two unknowns, |
djm@1752 | 882 * which we solve by exhaustive search. |
djm@1752 | 883 */ |
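 | /* |
 |  * Editorial worked example of the search below: with 4MB per L2 slot, |
 |  * the loop terminates once |
 |  * ceil((v_end - v_start) / 4MB) + 1 <= nr_pt_pages, |
 |  * the "+1" being the L2 page itself. E.g. a bootstrap region that |
 |  * rounds to 16MB settles at nr_pt_pages = 5 (four L1 pages plus the |
 |  * L2). |
 |  */ |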
cl349@2513 | 884 vinitrd_start = round_pgup(dsi.v_kernend); |
cl349@2513 | 885 vinitrd_end = vinitrd_start + initrd_len; |
cl349@2513 | 886 vphysmap_start = round_pgup(vinitrd_end); |
cl349@2513 | 887 vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long)); |
cl349@2513 | 888 vpt_start = round_pgup(vphysmap_end); |
djm@1752 | 889 for ( nr_pt_pages = 2; ; nr_pt_pages++ ) |
djm@1752 | 890 { |
djm@1752 | 891 vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE); |
djm@1752 | 892 vstartinfo_start = vpt_end; |
djm@1752 | 893 vstartinfo_end = vstartinfo_start + PAGE_SIZE; |
djm@1752 | 894 vstack_start = vstartinfo_end; |
djm@1752 | 895 vstack_end = vstack_start + PAGE_SIZE; |
djm@1752 | 896 v_end = (vstack_end + (1<<22)-1) & ~((1<<22)-1); |
djm@1752 | 897 if ( (v_end - vstack_end) < (512 << 10) ) |
djm@1752 | 898 v_end += 1 << 22; /* Add extra 4MB to get >= 512kB padding. */ |
cl349@2486 | 899 if ( (((v_end - dsi.v_start + ((1<<L2_PAGETABLE_SHIFT)-1)) >> |
kaf24@1818 | 900 L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages ) |
djm@1752 | 901 break; |
djm@1752 | 902 } |
djm@1752 | 903 |
djm@1752 | 904 printk("PHYSICAL MEMORY ARRANGEMENT:\n" |
djm@1752 | 905 " Kernel image: %p->%p\n" |
djm@1752 | 906 " Initrd image: %p->%p\n" |
djm@1752 | 907 " Dom0 alloc.: %08lx->%08lx\n", |
djm@1752 | 908 image_start, image_start + image_len, |
djm@1752 | 909 initrd_start, initrd_start + initrd_len, |
djm@1752 | 910 alloc_start, alloc_end); |
djm@1752 | 911 printk("VIRTUAL MEMORY ARRANGEMENT:\n" |
djm@1752 | 912 " Loaded kernel: %08lx->%08lx\n" |
djm@1752 | 913 " Init. ramdisk: %08lx->%08lx\n" |
djm@1752 | 914 " Phys-Mach map: %08lx->%08lx\n" |
djm@1752 | 915 " Page tables: %08lx->%08lx\n" |
djm@1752 | 916 " Start info: %08lx->%08lx\n" |
djm@1752 | 917 " Boot stack: %08lx->%08lx\n" |
djm@1752 | 918 " TOTAL: %08lx->%08lx\n", |
cl349@2486 | 919 dsi.v_kernstart, dsi.v_kernend, |
djm@1752 | 920 vinitrd_start, vinitrd_end, |
djm@1752 | 921 vphysmap_start, vphysmap_end, |
djm@1752 | 922 vpt_start, vpt_end, |
djm@1752 | 923 vstartinfo_start, vstartinfo_end, |
djm@1752 | 924 vstack_start, vstack_end, |
cl349@2486 | 925 dsi.v_start, v_end); |
cl349@2486 | 926 printk(" ENTRY ADDRESS: %08lx\n", dsi.v_kernentry); |
djm@1752 | 927 |
cl349@2486 | 928 if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) ) |
kaf24@1818 | 929 { |
kaf24@1818 | 930 printk("Initial guest OS requires too much space\n" |
kaf24@1818 | 931 "(%luMB is greater than %luMB limit)\n", |
cl349@2486 | 932 (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20); |
kaf24@1818 | 933 return -ENOMEM; |
kaf24@1818 | 934 } |
kaf24@1818 | 935 |
djm@1752 | 936 /* |
djm@1752 | 937 * Protect the lowest 1GB of memory. We use a temporary mapping there |
djm@1752 | 938 * from which we copy the kernel and ramdisk images. |
djm@1752 | 939 */ |
cl349@2486 | 940 if ( dsi.v_start < (1<<30) ) |
djm@1752 | 941 { |
djm@1752 | 942 printk("Initial loading isn't allowed to lowest 1GB of memory.\n"); |
djm@1752 | 943 return -EINVAL; |
djm@1752 | 944 } |
djm@1752 | 945 |
kaf24@2810 | 946 /* Paranoia: scrub DOM0's memory allocation. */ |
kaf24@2888 | 947 printk("Scrubbing DOM0 RAM: "); |
kaf24@2888 | 948 dst = (char *)alloc_start; |
kaf24@2888 | 949 while ( dst < (char *)alloc_end ) |
kaf24@2888 | 950 { |
kaf24@2888 | 951 #define SCRUB_BYTES (100 * 1024 * 1024) /* 100MB */ |
kaf24@2888 | 952 printk("."); |
kaf24@2888 | 953 touch_nmi_watchdog(); |
kaf24@2888 | 954 if ( ((char *)alloc_end - dst) > SCRUB_BYTES ) |
kaf24@2888 | 955 { |
kaf24@2888 | 956 memset(dst, 0, SCRUB_BYTES); |
kaf24@2888 | 957 dst += SCRUB_BYTES; |
kaf24@2888 | 958 } |
kaf24@2888 | 959 else |
kaf24@2888 | 960 { |
kaf24@2888 | 961 memset(dst, 0, (char *)alloc_end - dst); |
kaf24@2888 | 962 break; |
kaf24@2888 | 963 } |
kaf24@2888 | 964 } |
kaf24@2888 | 965 printk("done.\n"); |
kaf24@2810 | 966 |
djm@1752 | 967 /* Construct a frame-allocation list for the initial domain. */ |
djm@1752 | 968 for ( mfn = (alloc_start>>PAGE_SHIFT); |
djm@1752 | 969 mfn < (alloc_end>>PAGE_SHIFT); |
djm@1752 | 970 mfn++ ) |
djm@1752 | 971 { |
djm@1752 | 972 page = &frame_table[mfn]; |
kaf24@2384 | 973 page->u.inuse.domain = p; |
kaf24@2384 | 974 page->u.inuse.type_info = 0; |
kaf24@2655 | 975 page->count_info = PGC_allocated | 1; |
djm@1752 | 976 list_add_tail(&page->list, &p->page_list); |
djm@1752 | 977 p->tot_pages++; p->max_pages++; |
djm@1752 | 978 } |
djm@1752 | 979 |
cl349@2486 | 980 mpt_alloc = (vpt_start - dsi.v_start) + alloc_start; |
djm@1752 | 981 |
cl349@2957 | 982 SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES); |
cl349@2957 | 983 SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS); |
djm@1752 | 984 |
djm@1752 | 985 /* |
djm@1752 | 986 * We're basically forcing default RPLs to 1, so that our "what privilege |
djm@1752 | 987 * level are we returning to?" logic works. |
djm@1752 | 988 */ |
cl349@3085 | 989 ed->thread.failsafe_selector = FLAT_GUESTOS_CS; |
cl349@3085 | 990 ed->thread.event_selector = FLAT_GUESTOS_CS; |
cl349@2957 | 991 ed->thread.guestos_ss = FLAT_GUESTOS_DS; |
djm@1752 | 992 for ( i = 0; i < 256; i++ ) |
cl349@2957 | 993 ed->thread.traps[i].cs = FLAT_GUESTOS_CS; |
djm@1752 | 994 |
djm@1752 | 995 /* WARNING: The new domain must have its 'processor' field filled in! */ |
djm@1752 | 996 l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE; |
djm@1752 | 997 memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE); |
djm@1752 | 998 l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = |
djm@1752 | 999 mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR); |
djm@1752 | 1000 l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = |
cl349@3036 | 1001 mk_l2_pgentry(__pa(p->mm_perdomain_pt) | __PAGE_HYPERVISOR); |
cl349@2957 | 1002 ed->mm.pagetable = mk_pagetable((unsigned long)l2start); |
djm@1752 | 1003 |
cl349@2486 | 1004 l2tab += l2_table_offset(dsi.v_start); |
djm@1752 | 1005 mfn = alloc_start >> PAGE_SHIFT; |
cl349@2486 | 1006 for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ ) |
djm@1752 | 1007 { |
djm@1752 | 1008 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) ) |
djm@1752 | 1009 { |
djm@1752 | 1010 l1start = l1tab = (l1_pgentry_t *)mpt_alloc; |
djm@1752 | 1011 mpt_alloc += PAGE_SIZE; |
djm@1752 | 1012 *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT); |
djm@1752 | 1013 clear_page(l1tab); |
cl349@1944 | 1014 if ( count == 0 ) |
cl349@2486 | 1015 l1tab += l1_table_offset(dsi.v_start); |
djm@1752 | 1016 } |
djm@1752 | 1017 *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT); |
djm@1752 | 1018 |
djm@1752 | 1019 page = &frame_table[mfn]; |
kaf24@2375 | 1020 if ( !get_page_and_type(page, p, PGT_writable_page) ) |
djm@1752 | 1021 BUG(); |
djm@1752 | 1022 |
djm@1752 | 1023 mfn++; |
djm@1752 | 1024 } |
djm@1752 | 1025 |
djm@1752 | 1026 /* Pages that are part of page tables must be read only. */ |
djm@1752 | 1027 l2tab = l2start + l2_table_offset(vpt_start); |
djm@1752 | 1028 l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab); |
djm@1752 | 1029 l1tab += l1_table_offset(vpt_start); |
djm@1752 | 1030 l2tab++; |
djm@1752 | 1031 for ( count = 0; count < nr_pt_pages; count++ ) |
djm@1752 | 1032 { |
djm@1752 | 1033 *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW); |
djm@1752 | 1034 page = &frame_table[l1_pgentry_to_pagenr(*l1tab)]; |
djm@1752 | 1035 if ( count == 0 ) |
djm@1752 | 1036 { |
kaf24@1970 | 1037 page->u.inuse.type_info &= ~PGT_type_mask; |
kaf24@1970 | 1038 page->u.inuse.type_info |= PGT_l2_page_table; |
mafetter@2808 | 1039 |
mafetter@2808 | 1040 /* |
mafetter@2808 | 1041 * No longer writable: decrement the type_count. |
mafetter@2808 | 1042 * Installed as CR3: increment both the ref_count and type_count. |
mafetter@2808 | 1043 * Net: just increment the ref_count. |
mafetter@2808 | 1044 */ |
djm@1752 | 1045 get_page(page, p); /* an extra ref because of readable mapping */ |
mafetter@2808 | 1046 |
djm@1752 | 1047 /* Get another ref to L2 page so that it can be pinned. */ |
djm@1752 | 1048 if ( !get_page_and_type(page, p, PGT_l2_page_table) ) |
djm@1752 | 1049 BUG(); |
kaf24@2466 | 1050 set_bit(_PGT_pinned, &page->u.inuse.type_info); |
djm@1752 | 1051 } |
djm@1752 | 1052 else |
djm@1752 | 1053 { |
kaf24@1970 | 1054 page->u.inuse.type_info &= ~PGT_type_mask; |
kaf24@1970 | 1055 page->u.inuse.type_info |= PGT_l1_page_table; |
iap10@2458 | 1056 page->u.inuse.type_info |= |
cl349@2486 | 1057 ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift; |
iap10@2458 | 1058 |
mafetter@2808 | 1059 /* |
mafetter@2808 | 1060 * No longer writable: decrement the type_count. |
mafetter@2808 | 1061 * This is an L1 page, installed in a validated L2 page: |
mafetter@2808 | 1062 * increment both the ref_count and type_count. |
mafetter@2808 | 1063 * Net: just increment the ref_count. |
mafetter@2808 | 1064 */ |
djm@1752 | 1065 get_page(page, p); /* an extra ref because of readable mapping */ |
djm@1752 | 1066 } |
djm@1752 | 1067 l1tab++; |
djm@1752 | 1068 if( !((unsigned long)l1tab & (PAGE_SIZE - 1)) ) |
djm@1752 | 1069 l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab); |
djm@1752 | 1070 } |
djm@1752 | 1071 |
djm@1752 | 1072 /* Set up shared-info area. */ |
cl349@2960 | 1073 update_dom_time(p); |
djm@1752 | 1074 p->shared_info->domain_time = 0; |
djm@1752 | 1075 /* Mask all upcalls... */ |
djm@1752 | 1076 for ( i = 0; i < MAX_VIRT_CPUS; i++ ) |
djm@1752 | 1077 p->shared_info->vcpu_data[i].evtchn_upcall_mask = 1; |
cl349@3201 | 1078 p->shared_info->n_vcpu = smp_num_cpus; |
djm@1752 | 1079 |
djm@1752 | 1080 /* Install the new page tables. */ |
djm@1752 | 1081 __cli(); |
cl349@2957 | 1082 write_ptbase(&ed->mm); |
djm@1752 | 1083 |
djm@1752 | 1084 /* Copy the OS image. */ |
djm@1752 | 1085 (void)loadelfimage(image_start); |
djm@1752 | 1086 |
djm@1752 | 1087 /* Copy the initial ramdisk. */ |
djm@1752 | 1088 if ( initrd_len != 0 ) |
djm@1752 | 1089 memcpy((void *)vinitrd_start, initrd_start, initrd_len); |
djm@1752 | 1090 |
djm@1752 | 1091 /* Set up start info area. */ |
djm@1752 | 1092 si = (start_info_t *)vstartinfo_start; |
djm@1752 | 1093 memset(si, 0, PAGE_SIZE); |
djm@1752 | 1094 si->nr_pages = p->tot_pages; |
djm@1752 | 1095 si->shared_info = virt_to_phys(p->shared_info); |
djm@1752 | 1096 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN; |
djm@1752 | 1097 si->pt_base = vpt_start; |
djm@1752 | 1098 si->nr_pt_frames = nr_pt_pages; |
djm@1752 | 1099 si->mfn_list = vphysmap_start; |
djm@1752 | 1100 |
djm@1752 | 1101 /* Write the phys->machine and machine->phys table entries. */ |
kaf24@2084 | 1102 for ( pfn = 0; pfn < p->tot_pages; pfn++ ) |
djm@1752 | 1103 { |
kaf24@2084 | 1104 mfn = pfn + (alloc_start>>PAGE_SHIFT); |
kaf24@2084 | 1105 #ifndef NDEBUG |
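 | /* Editorial gloss: in debug builds the tail of the allocation is |
 | mapped in reverse order, so nothing can silently rely on the |
 | pseudo-physical and machine frame numbers being equal. */ |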
cl349@2486 | 1106 #define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT) |
kaf24@2084 | 1107 if ( pfn > REVERSE_START ) |
kaf24@2084 | 1108 mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START); |
kaf24@2084 | 1109 #endif |
djm@1752 | 1110 ((unsigned long *)vphysmap_start)[pfn] = mfn; |
djm@1752 | 1111 machine_to_phys_mapping[mfn] = pfn; |
djm@1752 | 1112 } |
djm@1752 | 1113 |
djm@1752 | 1114 if ( initrd_len != 0 ) |
djm@1752 | 1115 { |
djm@1752 | 1116 si->mod_start = vinitrd_start; |
djm@1752 | 1117 si->mod_len = initrd_len; |
djm@1752 | 1118 printk("Initrd len 0x%lx, start at 0x%08lx\n", |
djm@1752 | 1119 si->mod_len, si->mod_start); |
djm@1752 | 1120 } |
djm@1752 | 1121 |
djm@1752 | 1122 dst = si->cmd_line; |
djm@1752 | 1123 if ( cmdline != NULL ) |
djm@1752 | 1124 { |
djm@1752 | 1125 for ( i = 0; i < 255; i++ ) |
djm@1752 | 1126 { |
djm@1752 | 1127 if ( cmdline[i] == '\0' ) |
djm@1752 | 1128 break; |
djm@1752 | 1129 *dst++ = cmdline[i]; |
djm@1752 | 1130 } |
djm@1752 | 1131 } |
djm@1752 | 1132 *dst = '\0'; |
djm@1752 | 1133 |
djm@1752 | 1134 /* Reinstate the caller's page tables. */ |
djm@1752 | 1135 write_ptbase(¤t->mm); |
djm@1752 | 1136 __sti(); |
djm@1752 | 1137 |
djm@1752 | 1138 /* Destroy low mappings - they were only for our convenience. */ |
djm@1752 | 1139 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) |
djm@1752 | 1140 if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE ) |
djm@1752 | 1141 l2start[i] = mk_l2_pgentry(0); |
djm@1752 | 1142 zap_low_mappings(); /* Do the same for the idle page tables. */ |
djm@1752 | 1143 |
djm@1752 | 1144 /* DOM0 gets access to everything. */ |
djm@1752 | 1145 physdev_init_dom0(p); |
djm@1752 | 1146 |
cl349@2957 | 1147 set_bit(DF_CONSTRUCTED, &p->d_flags); |
djm@1752 | 1148 |
cl349@2957 | 1149 new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start); |
iap10@2509 | 1150 |
djm@1752 | 1151 #if 0 /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */ |
iap10@2509 | 1152 shadow_lock(&p->mm); |
iap10@2509 | 1153 shadow_mode_enable(p, SHM_test); |
iap10@2509 | 1154 shadow_unlock(&p->mm); |
djm@1752 | 1155 #endif |
djm@1752 | 1156 |
djm@1752 | 1157 return 0; |
djm@1752 | 1158 } |