debuggers.hg: xen/arch/x86/domain_build.c @ 6680:d0a4f770a5f4

phys_to_mach and mach_to_phys tables contain long entries, not
32-bit entries.

Signed-off-by: Keir Fraser <keir@xensource.com>
author:   kaf24@firebug.cl.cam.ac.uk
date:     Tue Sep 06 18:01:24 2005 +0000 (2005-09-06)
parents:  1f460d0fd6c6
children: 8db9c5873b9b

/******************************************************************************
 * domain_build.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/ctype.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/elf.h>
#include <xen/kernel.h>
#include <asm/regs.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/physdev.h>
#include <asm/shadow.h>

static long dom0_nrpages;

/*
 * dom0_mem:
 *  If +ve:
 *   * The specified amount of memory is allocated to domain 0.
 *  If -ve:
 *   * All of memory is allocated to domain 0, minus the specified amount.
 *  If not specified:
 *   * All of memory is allocated to domain 0, minus 1/16th which is reserved
 *     for uses such as DMA buffers (the reservation is clamped to 128MB).
 */
static void parse_dom0_mem(char *s)
{
    unsigned long long bytes;
    char *t = s;
    if ( *s == '-' )
        t++;
    bytes = parse_size_and_unit(t);
    dom0_nrpages = bytes >> PAGE_SHIFT;
    if ( *s == '-' )
        dom0_nrpages = -dom0_nrpages;
}
custom_param("dom0_mem", parse_dom0_mem);
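
/*
 * For illustration (values are hypothetical): booting with "dom0_mem=512M"
 * gives domain 0 exactly 512MB, while "dom0_mem=-256M" gives it all of
 * memory except 256MB. parse_size_and_unit() accepts the usual size
 * suffixes, and the result is converted to a page count here.
 */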

static unsigned int opt_dom0_shadow = 0;
boolean_param("dom0_shadow", opt_dom0_shadow);

static unsigned int opt_dom0_translate = 0;
boolean_param("dom0_translate", opt_dom0_translate);

#if defined(__i386__)
/* No ring-3 access in initial leaf page tables. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT)
#elif defined(__x86_64__)
/* Allow ring-3 access in long mode as guest cannot use ring 1. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#endif

#define round_pgup(_p)   (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_p) ((_p)&PAGE_MASK)

static struct pfn_info *alloc_chunk(struct domain *d, unsigned long max_pages)
{
    struct pfn_info *page;
    unsigned int order;
    /*
     * Allocate up to 2MB at a time:
     *  1. This prevents overflow of get_order() when allocating more than
     *     4GB to domain 0 on a PAE machine.
     *  2. It prevents allocating very large chunks from DMA pools before
     *     the >4GB pool is fully depleted.
     */
    if ( max_pages > (2UL << (20 - PAGE_SHIFT)) )
        max_pages = 2UL << (20 - PAGE_SHIFT);
    order = get_order(max_pages << PAGE_SHIFT);
    if ( (max_pages & (max_pages-1)) != 0 )
        order--;
    while ( (page = alloc_domheap_pages(d, order, 0)) == NULL )
        if ( order-- == 0 )
            break;
    return page;
}
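
/*
 * Worked example (hypothetical numbers): with 4kB pages, a request for 300
 * pages is below the 512-page (2MB) cap; get_order() rounds it up to order 9
 * (512 pages), and because 300 is not a power of two the order is decremented
 * to 8 (256 pages) so the chunk never exceeds the request. If that allocation
 * fails, progressively smaller orders are tried until one succeeds or order 0
 * also fails.
 */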

int construct_dom0(struct domain *d,
                   unsigned long _image_start, unsigned long image_len,
                   unsigned long _initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    int i, rc, dom0_pae, xen_pae, order;
    unsigned long pfn, mfn;
    unsigned long nr_pages;
    unsigned long nr_pt_pages;
    unsigned long alloc_spfn;
    unsigned long alloc_epfn;
    unsigned long count;
    struct pfn_info *page = NULL;
    start_info_t *si;
    struct vcpu *v = d->vcpu[0];
#if defined(__i386__)
    char *image_start  = (char *)_image_start;  /* use lowmem mappings */
    char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
#elif defined(__x86_64__)
    char *image_start  = __va(_image_start);
    char *initrd_start = __va(_initrd_start);
#endif
#if CONFIG_PAGING_LEVELS >= 4
    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
#endif
#if CONFIG_PAGING_LEVELS >= 3
    l3_pgentry_t *l3tab = NULL, *l3start = NULL;
#endif
    l2_pgentry_t *l2tab = NULL, *l2start = NULL;
    l1_pgentry_t *l1tab = NULL, *l1start = NULL;

    /*
     * This fully describes the memory layout of the initial domain. All
     * *_start addresses are page-aligned, except v_start (and v_end) which
     * are superpage-aligned.
     */
    struct domain_setup_info dsi;
    unsigned long vinitrd_start;
    unsigned long vinitrd_end;
    unsigned long vphysmap_start;
    unsigned long vphysmap_end;
    unsigned long vstartinfo_start;
    unsigned long vstartinfo_end;
    unsigned long vstack_start;
    unsigned long vstack_end;
    unsigned long vpt_start;
    unsigned long vpt_end;
    unsigned long v_end;

    /* Machine address of next candidate page-table page. */
    unsigned long mpt_alloc;

    extern void physdev_init_dom0(struct domain *);
    extern void translate_l2pgtable(
        struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn);

    /* Sanity! */
    if ( d->domain_id != 0 )
        BUG();
    if ( test_bit(_DOMF_constructed, &d->domain_flags) )
        BUG();

    memset(&dsi, 0, sizeof(struct domain_setup_info));
    dsi.image_addr = (unsigned long)image_start;
    dsi.image_len  = image_len;

    printk("*** LOADING DOMAIN 0 ***\n");

    d->max_pages = ~0U;

    /*
     * If domain 0 allocation isn't specified, reserve 1/16th of available
     * memory for things like DMA buffers. This reservation is clamped to
     * a maximum of 128MB.
     */
    if ( dom0_nrpages == 0 )
    {
        dom0_nrpages = avail_domheap_pages() +
            ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
            ((image_len  + PAGE_SIZE - 1) >> PAGE_SHIFT);
        dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
        dom0_nrpages = -dom0_nrpages;
    }

    /* Negative memory specification means "all memory - specified amount". */
    if ( dom0_nrpages < 0 )
        nr_pages = avail_domheap_pages() +
            ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
            ((image_len  + PAGE_SIZE - 1) >> PAGE_SHIFT) +
            dom0_nrpages;
    else
        nr_pages = dom0_nrpages;

    if ( (rc = parseelfimage(&dsi)) != 0 )
        return rc;

    if ( dsi.xen_section_string == NULL )
    {
        printk("Not a Xen-ELF image: '__xen_guest' section not found.\n");
        return -EINVAL;
    }

    dom0_pae = !!strstr(dsi.xen_section_string, "PAE=yes");
    xen_pae  = (CONFIG_PAGING_LEVELS == 3);
    if ( dom0_pae != xen_pae )
    {
        printk("PAE mode mismatch between Xen and DOM0 (xen=%s, dom0=%s)\n",
               xen_pae ? "yes" : "no", dom0_pae ? "yes" : "no");
        return -EINVAL;
    }
    if (strstr(dsi.xen_section_string, "SHADOW=translate"))
        opt_dom0_translate = 1;

    /* Align load address to 4MB boundary. */
    dsi.v_start &= ~((1UL<<22)-1);

    /*
     * Why do we need this? The number of page-table frames depends on the
     * size of the bootstrap address space. But the size of the address space
     * depends on the number of page-table frames (since each one is mapped
     * read-only). We have a pair of simultaneous equations in two unknowns,
     * which we solve by exhaustive search.
     */
    vinitrd_start    = round_pgup(dsi.v_end);
    vinitrd_end      = vinitrd_start + initrd_len;
    vphysmap_start   = round_pgup(vinitrd_end);
    vphysmap_end     = vphysmap_start + (nr_pages * sizeof(unsigned long));
    vstartinfo_start = round_pgup(vphysmap_end);
    vstartinfo_end   = vstartinfo_start + PAGE_SIZE;
    vpt_start        = vstartinfo_end;
    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
    {
        vpt_end      = vpt_start + (nr_pt_pages * PAGE_SIZE);
        vstack_start = vpt_end;
        vstack_end   = vstack_start + PAGE_SIZE;
        v_end        = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
        if ( (v_end - vstack_end) < (512UL << 10) )
            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
#if defined(__i386__) && !defined(CONFIG_X86_PAE)
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
            break;
#elif defined(__i386__) && defined(CONFIG_X86_PAE)
        /* 5 pages: 1x 3rd + 4x 2nd level */
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
            break;
#elif defined(__x86_64__)
#define NR(_l,_h,_s) \
    (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
       ((_l) & ~((1UL<<(_s))-1))) >> (_s))
        if ( (1 + /* # L4 */
              NR(dsi.v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
              NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT) + /* # L2 */
              NR(dsi.v_start, v_end, L2_PAGETABLE_SHIFT))  /* # L1 */
             <= nr_pt_pages )
            break;
#endif
    }
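
    /*
     * Worked example (hypothetical, 32-bit non-PAE case): a bootstrap region
     * of 16MB spans four 4MB superpages, so it needs four L1 pages plus one
     * L2 page, i.e. the search stops once nr_pt_pages reaches 5. Growing
     * nr_pt_pages can in turn grow v_end, which is why the answer is found
     * by iterating rather than computed directly.
     */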

    order = get_order(v_end - dsi.v_start);
    if ( (1UL << order) > nr_pages )
        panic("Domain 0 allocation is too small for kernel image.\n");

    /* Allocate from DMA pool: PAE L3 table must be below 4GB boundary. */
    if ( (page = alloc_domheap_pages(d, order, ALLOC_DOM_DMA)) == NULL )
        panic("Not enough RAM for domain 0 allocation.\n");
    alloc_spfn = page_to_pfn(page);
    alloc_epfn = alloc_spfn + d->tot_pages;

    printk("PHYSICAL MEMORY ARRANGEMENT:\n"
           " Dom0 alloc.:   %"PRIphysaddr"->%"PRIphysaddr,
           pfn_to_phys(alloc_spfn), pfn_to_phys(alloc_epfn));
    if ( d->tot_pages < nr_pages )
        printk(" (%lu pages to be allocated)",
               nr_pages - d->tot_pages);
    printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
           " Loaded kernel: %p->%p\n"
           " Init. ramdisk: %p->%p\n"
           " Phys-Mach map: %p->%p\n"
           " Start info:    %p->%p\n"
           " Page tables:   %p->%p\n"
           " Boot stack:    %p->%p\n"
           " TOTAL:         %p->%p\n",
           _p(dsi.v_kernstart), _p(dsi.v_kernend),
           _p(vinitrd_start), _p(vinitrd_end),
           _p(vphysmap_start), _p(vphysmap_end),
           _p(vstartinfo_start), _p(vstartinfo_end),
           _p(vpt_start), _p(vpt_end),
           _p(vstack_start), _p(vstack_end),
           _p(dsi.v_start), _p(v_end));
    printk(" ENTRY ADDRESS: %p\n", _p(dsi.v_kernentry));

    if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
    {
        printk("Initial guest OS requires too much space\n"
               "(%luMB is greater than %luMB limit)\n",
               (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
        return -ENOMEM;
    }

    mpt_alloc = (vpt_start - dsi.v_start) +
        (unsigned long)pfn_to_phys(alloc_spfn);

    /*
     * We're basically forcing default RPLs to 1, so that our "what privilege
     * level are we returning to?" logic works.
     */
    v->arch.guest_context.kernel_ss = FLAT_KERNEL_SS;
    for ( i = 0; i < 256; i++ )
        v->arch.guest_context.trap_ctxt[i].cs = FLAT_KERNEL_CS;

#if defined(__i386__)

    v->arch.guest_context.failsafe_callback_cs = FLAT_KERNEL_CS;
    v->arch.guest_context.event_callback_cs    = FLAT_KERNEL_CS;

    /*
     * Protect the lowest 1GB of memory. We use a temporary mapping there
     * from which we copy the kernel and ramdisk images.
     */
    if ( dsi.v_start < (1UL<<30) )
    {
        printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
#if CONFIG_PAGING_LEVELS == 3
    l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
    for (i = 0; i < 4; i++) {
        l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
        l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
            l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
    }
    {
        unsigned long va;
        for (va = PERDOMAIN_VIRT_START; va < PERDOMAIN_VIRT_END;
             va += (1 << L2_PAGETABLE_SHIFT)) {
            l2tab[va >> L2_PAGETABLE_SHIFT] =
                l2e_from_paddr(__pa(d->arch.mm_perdomain_pt) +
                               (va-PERDOMAIN_VIRT_START),
                               __PAGE_HYPERVISOR);
        }
    }
    v->arch.guest_table = mk_pagetable((unsigned long)l3start);
#else
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
    l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
        l2e_from_paddr((unsigned long)l2start, __PAGE_HYPERVISOR);
    l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
        l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
    v->arch.guest_table = mk_pagetable((unsigned long)l2start);
#endif
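
    /*
     * The entries installed at LINEAR_PT_VIRT_START above point the page
     * directory back at itself (all four L2 pages in the PAE case), so
     * page-table entries can later be reached through the linear page-table
     * mapping; the PERDOMAIN entries map this domain's per-domain mapping
     * area.
     */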

    l2tab += l2_linear_offset(dsi.v_start);
    mfn = alloc_spfn;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
            mpt_alloc += PAGE_SIZE;
            *l2tab = l2e_from_paddr((unsigned long)l1start, L2_PROT);
            l2tab++;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = &frame_table[mfn];
        if ( !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }
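
    /*
     * In the loop above, each iteration maps one 4kB page of the bootstrap
     * region. Whenever l1tab crosses a page boundary a fresh L1 table is
     * carved out of the page-table area at mpt_alloc and hooked into the
     * next L2 slot, and every mapped frame takes a writable type reference.
     */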

    /* Pages that are part of page tables must be read only. */
    l2tab = l2start + l2_linear_offset(vpt_start);
    l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        page = &frame_table[l1e_get_pfn(*l1tab)];
        if ( !opt_dom0_shadow )
            l1e_remove_flags(*l1tab, _PAGE_RW);
        else
            if ( !get_page_type(page, PGT_writable_page) )
                BUG();

#if CONFIG_PAGING_LEVELS == 3
        switch (count) {
        case 0:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l3_page_table;
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L3 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l3_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
            break;
        case 1 ... 4:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;
            page->u.inuse.type_info |=
                (count-1) << PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        default:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-5))<<PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        }
#else
        if ( count == 0 )
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;

            /*
             * No longer writable: decrement the type_count.
             * Installed as CR3: increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L2 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l2_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
        }
        else
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;

            /*
             * No longer writable: decrement the type_count.
             * This is an L1 page, installed in a validated L2 page:
             * increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */
        }
#endif
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
            l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*++l2tab);
    }

#elif defined(__x86_64__)

    /* Overlap with Xen protected area? */
    if ( (dsi.v_start < HYPERVISOR_VIRT_END) &&
         (v_end > HYPERVISOR_VIRT_START) )
    {
        printk("DOM0 image overlaps with Xen private area.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
    phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
    l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
    memcpy(l4tab, &idle_pg_table[0], PAGE_SIZE);
    l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
        l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
    l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
        l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
    v->arch.guest_table = mk_pagetable(__pa(l4start));

    l4tab += l4_table_offset(dsi.v_start);
    mfn = alloc_spfn;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
            l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
            if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
            {
                phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
                l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                clear_page(l2tab);
                if ( count == 0 )
                    l2tab += l2_table_offset(dsi.v_start);
                if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
                {
                    phys_to_page(mpt_alloc)->u.inuse.type_info =
                        PGT_l3_page_table;
                    l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                    clear_page(l3tab);
                    if ( count == 0 )
                        l3tab += l3_table_offset(dsi.v_start);
                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
                    l4tab++;
                }
                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
                l3tab++;
            }
            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
            l2tab++;
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = &frame_table[mfn];
        if ( (page->u.inuse.type_info == 0) &&
             !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }
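
    /*
     * As on x86-32, the loop above maps one page per iteration, but here the
     * L1, L2 and L3 tables are all allocated on demand: whenever a
     * lower-level table pointer crosses a page boundary, a new table page is
     * taken from mpt_alloc and linked into its parent level.
     */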

    /* Pages that are part of page tables must be read only. */
    l4tab = l4start + l4_table_offset(vpt_start);
    l3start = l3tab = l4e_to_l3e(*l4tab);
    l3tab += l3_table_offset(vpt_start);
    l2start = l2tab = l3e_to_l2e(*l3tab);
    l2tab += l2_table_offset(vpt_start);
    l1start = l1tab = l2e_to_l1e(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        l1e_remove_flags(*l1tab, _PAGE_RW);
        page = &frame_table[l1e_get_pfn(*l1tab)];

        /* Read-only mapping + PGC_allocated + page-table page. */
        page->count_info         = PGC_allocated | 3;
        page->u.inuse.type_info |= PGT_validated | 1;

        /* Top-level p.t. is pinned. */
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
        {
            page->count_info        += 1;
            page->u.inuse.type_info += 1 | PGT_pinned;
        }

        /* Iterate. */
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
        {
            if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
            {
                if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
                    l3start = l3tab = l4e_to_l3e(*++l4tab);
                l2start = l2tab = l3e_to_l2e(*l3tab);
            }
            l1start = l1tab = l2e_to_l1e(*l2tab);
        }
    }

#endif /* __x86_64__ */

    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
    d->shared_info->n_vcpu = num_online_cpus();

    /* Set up monitor table */
    update_pagetables(v);

    /* Install the new page tables. */
    local_irq_disable();
    write_ptbase(v);

    /* Copy the OS image and free temporary buffer. */
    (void)loadelfimage(&dsi);

    init_domheap_pages(
        _image_start, (_image_start+image_len+PAGE_SIZE-1) & PAGE_MASK);

    /* Copy the initial ramdisk and free temporary buffer. */
    if ( initrd_len != 0 )
    {
        memcpy((void *)vinitrd_start, initrd_start, initrd_len);
        init_domheap_pages(
            _initrd_start, (_initrd_start+initrd_len+PAGE_SIZE-1) & PAGE_MASK);
    }

    d->next_io_page = max_page;

    /* Set up start info area. */
    si = (start_info_t *)vstartinfo_start;
    memset(si, 0, PAGE_SIZE);
    si->nr_pages = nr_pages;

    if ( opt_dom0_translate )
    {
        si->shared_info = d->next_io_page << PAGE_SHIFT;
        set_pfn_from_mfn(virt_to_phys(d->shared_info) >> PAGE_SHIFT,
                         d->next_io_page);
        d->next_io_page++;
    }
    else
        si->shared_info = virt_to_phys(d->shared_info);

    si->flags        = SIF_PRIVILEGED | SIF_INITDOMAIN;
    si->pt_base      = vpt_start;
    si->nr_pt_frames = nr_pt_pages;
    si->mfn_list     = vphysmap_start;

    /* Write the phys->machine and machine->phys table entries. */
    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
    {
        mfn = pfn + alloc_spfn;
#ifndef NDEBUG
#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
        if ( !opt_dom0_translate && (pfn > REVERSE_START) )
            mfn = alloc_epfn - (pfn - REVERSE_START);
#endif
        ((unsigned long *)vphysmap_start)[pfn] = mfn;
        set_pfn_from_mfn(mfn, pfn);
    }
    while ( pfn < nr_pages )
    {
        if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
            panic("Not enough RAM for DOM0 reservation.\n");
        while ( pfn < d->tot_pages )
        {
            mfn = page_to_pfn(page);
#ifndef NDEBUG
#define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
#endif
            ((unsigned long *)vphysmap_start)[pfn] = mfn;
            set_pfn_from_mfn(mfn, pfn);
#undef pfn
            page++; pfn++;
        }
    }
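
    /*
     * The array at vphysmap_start is the guest's phys-to-machine table and,
     * as the changeset description notes, holds unsigned long entries rather
     * than 32-bit ones; set_pfn_from_mfn() records the inverse
     * machine-to-phys mapping. The NDEBUG-only blocks deliberately scramble
     * the mapping, presumably so that code which wrongly assumes pfn == mfn
     * shows up in debug builds.
     */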

    if ( initrd_len != 0 )
    {
        si->mod_start = vinitrd_start;
        si->mod_len   = initrd_len;
        printk("Initrd len 0x%lx, start at 0x%lx\n",
               si->mod_len, si->mod_start);
    }

    memset(si->cmd_line, 0, sizeof(si->cmd_line));
    if ( cmdline != NULL )
        strncpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line)-1);

    /* Reinstate the caller's page tables. */
    write_ptbase(current);
    local_irq_enable();

#if defined(__i386__)
    /* Destroy low mappings - they were only for our convenience. */
    zap_low_mappings(l2start);
    zap_low_mappings(idle_pg_table_l2);
#endif

    /* DOM0 gets access to everything. */
    physdev_init_dom0(d);

    init_domain_time(d);

    set_bit(_DOMF_constructed, &d->domain_flags);

    new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);

    if ( opt_dom0_shadow || opt_dom0_translate )
    {
        printk("dom0: shadow enable\n");
        shadow_mode_enable(d, (opt_dom0_translate
                               ? SHM_enable | SHM_refcounts | SHM_translate
                               : SHM_enable));
        if ( opt_dom0_translate )
        {
            printk("dom0: shadow translate\n");
#if defined(__i386__) && defined(CONFIG_X86_PAE)
            printk("FIXME: PAE code needed here: %s:%d (%s)\n",
                   __FILE__, __LINE__, __FUNCTION__);
            for ( ; ; )
                __asm__ __volatile__ ( "hlt" );
#else
            /* Hmm, what does this do?
               Looks like it isn't portable across 32/64-bit and PAE/non-PAE ...
               -- kraxel */

            /* mafetter: This code is mostly a hack in order to be able to
             * test with dom0's which are running with shadow translate.
             * I expect we'll rip this out once we have a stable set of
             * domU clients which use the various shadow modes, but it's
             * useful to leave this here for now...
             */

            // map this domain's p2m table into current page table,
            // so that we can easily access it.
            //
            ASSERT( root_get_intpte(idle_pg_table[1]) == 0 );
            ASSERT( pagetable_get_paddr(d->arch.phys_table) );
            idle_pg_table[1] = root_from_paddr(
                pagetable_get_paddr(d->arch.phys_table), __PAGE_HYPERVISOR);
            translate_l2pgtable(d, (l1_pgentry_t *)(1u << L2_PAGETABLE_SHIFT),
                                pagetable_get_pfn(v->arch.guest_table));
            idle_pg_table[1] = root_empty();
            local_flush_tlb();
#endif
        }

        update_pagetables(v); /* XXX SMP */
        printk("dom0: shadow setup done\n");
    }

    /*
     * Modify I/O port access permissions.
     */
    /* Master Interrupt Controller (PIC). */
    physdev_modify_ioport_access_range(dom0, 0, 0x20, 2);
    /* Slave Interrupt Controller (PIC). */
    physdev_modify_ioport_access_range(dom0, 0, 0xA0, 2);
    /* Interval Timer (PIT). */
    physdev_modify_ioport_access_range(dom0, 0, 0x40, 4);
    /* PIT Channel 2 / PC Speaker Control. */
    physdev_modify_ioport_access_range(dom0, 0, 0x61, 1);
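
    /*
     * Editor's assumption, not stated in this file: physdev_init_dom0()
     * above grants dom0 blanket I/O access, and the calls here, with 0 as
     * the second argument, appear to revoke direct access to the port
     * ranges that Xen itself programs: 0x20-0x21 and 0xA0-0xA1 (the PICs),
     * 0x40-0x43 (the PIT) and 0x61 (PIT channel 2 / speaker control).
     */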

    return 0;
}

int elf_sanity_check(Elf_Ehdr *ehdr)
{
    if ( !IS_ELF(*ehdr) ||
#if defined(__i386__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS32) ||
         (ehdr->e_machine != EM_386) ||
#elif defined(__x86_64__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS64) ||
         (ehdr->e_machine != EM_X86_64) ||
#endif
         (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) ||
         (ehdr->e_type != ET_EXEC) )
    {
        printk("DOM0 image is not a Xen-compatible Elf image.\n");
        return 0;
    }

    return 1;
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */