debuggers.hg

view xen/arch/x86/x86_64/domain_build.c @ 3686:5c112b235281

bitkeeper revision 1.1159.212.85 (42038b45EjUo-1JiSCHXW0Wav4TZGQ)

x86_64 progress: now entering ring 3. Need a hypercall (SYSCALL)
entry point, and some kind of DOM0 image to test against.
Signed-off-by: keir.fraser@cl.cam.ac.uk
author kaf24@scramble.cl.cam.ac.uk
date Fri Feb 04 14:48:37 2005 +0000 (2005-02-04)
parents dbc41aaba297
children 715c644ba9ef
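The commit message notes that a SYSCALL hypercall entry point is still missing at this revision. As a rough sketch of what that involves on x86-64 (not part of this changeset: the MSR numbers and flag bits below are architectural, but hypercall_entry, the selector bases and the helper names are invented placeholders), the per-CPU setup boils down to enabling EFER.SCE and pointing MSR_LSTAR at the entry stub:

/* Hypothetical sketch only -- not Xen code; names are placeholders. */
#define MSR_EFER         0xc0000080  /* extended feature enables        */
#define  EFER_SCE        (1UL << 0)  /* SYSCALL/SYSRET enable           */
#define MSR_STAR         0xc0000081  /* SYSCALL/SYSRET CS/SS bases      */
#define MSR_LSTAR        0xc0000082  /* 64-bit SYSCALL target RIP       */
#define MSR_SYSCALL_MASK 0xc0000084  /* RFLAGS bits cleared on SYSCALL  */

extern void hypercall_entry(void);   /* assembly stub, still to be written */

static void wrmsr_raw(unsigned int msr, unsigned long val)
{
    __asm__ __volatile__ ( "wrmsr" : : "c" (msr), "a" ((unsigned int)val),
                           "d" ((unsigned int)(val >> 32)) );
}

static unsigned long rdmsr_raw(unsigned int msr)
{
    unsigned int lo, hi;
    __asm__ __volatile__ ( "rdmsr" : "=a" (lo), "=d" (hi) : "c" (msr) );
    return ((unsigned long)hi << 32) | lo;
}

/* Per-CPU: afterwards, SYSCALL from ring 3 enters hypercall_entry with
 * RCX = guest RIP, R11 = guest RFLAGS, and IF/TF already cleared. */
static void init_syscall_entry(void)
{
    wrmsr_raw(MSR_EFER, rdmsr_raw(MSR_EFER) | EFER_SCE);
    wrmsr_raw(MSR_LSTAR, (unsigned long)hypercall_entry);
    wrmsr_raw(MSR_STAR,
              (0x23UL << 48) |  /* SYSRET CS/SS base (placeholder value)  */
              (0x08UL << 32));  /* SYSCALL CS/SS base (placeholder value) */
    wrmsr_raw(MSR_SYSCALL_MASK, 0x300UL);  /* clear IF and TF on entry */
}

The entry stub itself would then switch to the hypervisor stack (SYSCALL does not switch stacks), save the guest's RCX/R11, dispatch the hypercall, and return with SYSRET.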
line source
/******************************************************************************
 * domain_build.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <asm/regs.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <xen/event.h>
#include <xen/elf.h>
#include <xen/kernel.h>

/* Allow ring-3 access in long mode as guest cannot use ring 1. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)

#define round_pgup(_p)   (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_p) ((_p)&PAGE_MASK)

int construct_dom0(struct domain *d,
                   unsigned long alloc_start,
                   unsigned long alloc_end,
                   unsigned long _image_start, unsigned long image_len,
                   unsigned long _initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    char *dst;
    int i, rc;
    unsigned long pfn, mfn;
    unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT;
    unsigned long nr_pt_pages;
    unsigned long count;
    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
    l3_pgentry_t *l3tab = NULL, *l3start = NULL;
    l2_pgentry_t *l2tab = NULL, *l2start = NULL;
    l1_pgentry_t *l1tab = NULL, *l1start = NULL;
    struct pfn_info *page = NULL;
    start_info_t *si;
    struct exec_domain *ed = d->exec_domain[0];
    char *image_start  = __va(_image_start);
    char *initrd_start = __va(_initrd_start);

    /*
     * This fully describes the memory layout of the initial domain. All
     * *_start addresses are page-aligned, except v_start (and v_end) which
     * are superpage-aligned.
     */
    struct domain_setup_info dsi;
    unsigned long vinitrd_start;
    unsigned long vinitrd_end;
    unsigned long vphysmap_start;
    unsigned long vphysmap_end;
    unsigned long vstartinfo_start;
    unsigned long vstartinfo_end;
    unsigned long vstack_start;
    unsigned long vstack_end;
    unsigned long vpt_start;
    unsigned long vpt_end;
    unsigned long v_end;

    /* Machine address of next candidate page-table page. */
    unsigned long mpt_alloc;

    extern void physdev_init_dom0(struct domain *);

    /* Sanity! */
    if ( d->id != 0 )
        BUG();
    if ( test_bit(DF_CONSTRUCTED, &d->d_flags) )
        BUG();

    memset(&dsi, 0, sizeof(struct domain_setup_info));

    printk("*** LOADING DOMAIN 0 ***\n");

    /*
     * This is all a bit grim. We've moved the modules to the "safe" physical
     * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this
     * routine we're going to copy it down into the region that's actually
     * been allocated to domain 0. This is highly likely to be overlapping, so
     * we use a forward copy.
     *
     * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with
     * 4GB and lots of network/disk cards that allocate loads of buffers.
     * We'll have to revisit this if we ever support PAE (64GB).
     */

    rc = parseelfimage(image_start, image_len, &dsi);
    if ( rc != 0 )
        return rc;

    /* Set up domain options */
    if ( dsi.use_writable_pagetables )
        vm_assist(d, VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

    /* Align load address to 4MB boundary. */
    dsi.v_start &= ~((1UL<<22)-1);

    /*
     * Why do we need this? The number of page-table frames depends on the
     * size of the bootstrap address space. But the size of the address space
     * depends on the number of page-table frames (since each one is mapped
     * read-only). We have a pair of simultaneous equations in two unknowns,
     * which we solve by exhaustive search.
     */
    vinitrd_start  = round_pgup(dsi.v_kernend);
    vinitrd_end    = vinitrd_start + initrd_len;
    vphysmap_start = round_pgup(vinitrd_end);
    vphysmap_end   = vphysmap_start + (nr_pages * sizeof(unsigned long));
    vpt_start      = round_pgup(vphysmap_end);
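    /*
     * Each candidate nr_pt_pages fixes where the page tables, start-info page
     * and boot stack fall, and hence where the 4MB-aligned v_end lands; the
     * search stops as soon as that many frames (one L4 plus the L3/L2/L1
     * frames needed to cover [v_start, v_end)) is actually enough.
     */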
    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
    {
        vpt_end          = vpt_start + (nr_pt_pages * PAGE_SIZE);
        vstartinfo_start = vpt_end;
        vstartinfo_end   = vstartinfo_start + PAGE_SIZE;
        vstack_start     = vstartinfo_end;
        vstack_end       = vstack_start + PAGE_SIZE;
        v_end            = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
        if ( (v_end - vstack_end) < (512UL << 10) )
            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
#define RD(_p,_s) ((_p) >> (_s))                     /* round down */
#define RU(_p,_s) (((_p) + ((1UL<<(_s))-1)) >> (_s)) /* round up   */
        if ( (1 + /* # L4 */
              (RU(v_end, L4_PAGETABLE_SHIFT) -
               RD(dsi.v_start, L4_PAGETABLE_SHIFT)) + /* # L3 */
              (RU(v_end, L3_PAGETABLE_SHIFT) -
               RD(dsi.v_start, L3_PAGETABLE_SHIFT)) + /* # L2 */
              (RU(v_end, L2_PAGETABLE_SHIFT) -
               RD(dsi.v_start, L2_PAGETABLE_SHIFT))) /* # L1 */
             <= nr_pt_pages )
            break;
    }

146 printk("PHYSICAL MEMORY ARRANGEMENT:\n"
147 " Kernel image: %p->%p\n"
148 " Initrd image: %p->%p\n"
149 " Dom0 alloc.: %p->%p\n",
150 _image_start, _image_start + image_len,
151 _initrd_start, _initrd_start + initrd_len,
152 alloc_start, alloc_end);
153 printk("VIRTUAL MEMORY ARRANGEMENT:\n"
154 " Loaded kernel: %p->%p\n"
155 " Init. ramdisk: %p->%p\n"
156 " Phys-Mach map: %p->%p\n"
157 " Page tables: %p->%p\n"
158 " Start info: %p->%p\n"
159 " Boot stack: %p->%p\n"
160 " TOTAL: %p->%p\n",
161 dsi.v_kernstart, dsi.v_kernend,
162 vinitrd_start, vinitrd_end,
163 vphysmap_start, vphysmap_end,
164 vpt_start, vpt_end,
165 vstartinfo_start, vstartinfo_end,
166 vstack_start, vstack_end,
167 dsi.v_start, v_end);
168 printk(" ENTRY ADDRESS: %p\n", dsi.v_kernentry);
170 if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
171 {
172 printk("Initial guest OS requires too much space\n"
173 "(%luMB is greater than %luMB limit)\n",
174 (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
175 return -ENOMEM;
176 }
178 /* Overlap with Xen protected area? */
179 if ( (dsi.v_start < HYPERVISOR_VIRT_END) &&
180 (v_end > HYPERVISOR_VIRT_START) )
181 {
182 printk("DOM0 image overlaps with Xen private area.\n");
183 return -EINVAL;
184 }
186 /* Paranoia: scrub DOM0's memory allocation. */
187 printk("Scrubbing DOM0 RAM: ");
188 dst = __va(alloc_start);
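    /* Zero the allocation in 100MB chunks, poking the NMI watchdog between
       chunks so the long-running memsets cannot trigger it. */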
    while ( __pa(dst) < alloc_end )
    {
#define SCRUB_BYTES (100 * 1024 * 1024) /* 100MB */
        printk(".");
        touch_nmi_watchdog();
        if ( (alloc_end - __pa(dst)) > SCRUB_BYTES )
        {
            memset(dst, 0, SCRUB_BYTES);
            dst += SCRUB_BYTES;
        }
        else
        {
            memset(dst, 0, alloc_end - __pa(dst));
            break;
        }
    }
    printk("done.\n");

    /* Construct a frame-allocation list for the initial domain. */
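    /* Every frame in [alloc_start, alloc_end) is handed to DOM0: owner set,
       type cleared, count_info set to PGC_allocated with one reference. */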
    for ( mfn = (alloc_start>>PAGE_SHIFT);
          mfn < (alloc_end>>PAGE_SHIFT);
          mfn++ )
    {
        page = &frame_table[mfn];
        page_set_owner(page, d);
        page->u.inuse.type_info = 0;
        page->count_info        = PGC_allocated | 1;
        list_add_tail(&page->list, &d->page_list);
        d->tot_pages++; d->max_pages++;
    }

    mpt_alloc = (vpt_start - dsi.v_start) + alloc_start;

    SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES);
    SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS);

    /*
     * We're basically forcing default RPLs to 1, so that our "what privilege
     * level are we returning to?" logic works.
     */
    ed->thread.failsafe_selector = FLAT_GUESTOS_CS;
    ed->thread.event_selector    = FLAT_GUESTOS_CS;
    ed->thread.guestos_ss = FLAT_GUESTOS_SS;
    for ( i = 0; i < 256; i++ )
        ed->thread.traps[i].cs = FLAT_GUESTOS_CS;

    /* WARNING: The new domain must have its 'processor' field filled in! */
    phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
    l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
    memcpy(l4tab, &idle_pg_table[0], PAGE_SIZE);
    l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
        mk_l4_pgentry(__pa(l4start) | __PAGE_HYPERVISOR);
    l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
        mk_l4_pgentry(__pa(d->mm_perdomain_pt) | __PAGE_HYPERVISOR);
    ed->mm.pagetable = mk_pagetable(__pa(l4start));

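    /*
     * Build the bootstrap 4-level mapping of [v_start, v_end). Whenever the
     * current l1/l2/l3 pointer crosses a page boundary, a fresh table frame
     * is taken from mpt_alloc, zeroed, and linked into the level above.
     */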
    l4tab += l4_table_offset(dsi.v_start);
    mfn = alloc_start >> PAGE_SHIFT;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
            l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
            if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
            {
                phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
                l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                clear_page(l2tab);
                if ( count == 0 )
                    l2tab += l2_table_offset(dsi.v_start);
                if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
                {
                    phys_to_page(mpt_alloc)->u.inuse.type_info =
                        PGT_l3_page_table;
                    l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                    clear_page(l3tab);
                    if ( count == 0 )
                        l3tab += l3_table_offset(dsi.v_start);
                    *l4tab++ = mk_l4_pgentry(__pa(l3start) | L4_PROT);
                }
                *l3tab++ = mk_l3_pgentry(__pa(l2start) | L3_PROT);
            }
            *l2tab++ = mk_l2_pgentry(__pa(l1start) | L2_PROT);
        }
        *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);

        page = &frame_table[mfn];
        if ( (page->u.inuse.type_info == 0) &&
             !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l4tab = l4start + l4_table_offset(vpt_start);
    l3start = l3tab = l4_pgentry_to_l3(*l4tab);
    l3tab += l3_table_offset(vpt_start);
    l2start = l2tab = l3_pgentry_to_l2(*l3tab);
    l2tab += l2_table_offset(vpt_start);
    l1start = l1tab = l2_pgentry_to_l1(*l2tab);
    l1tab += l1_table_offset(vpt_start);
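    /* Walk the new tables to the L1 entries that map the page-table frames
       themselves, clear _PAGE_RW in each, and fix up count/type info; the
       top-level (L4) table additionally gets a pin reference. */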
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
        page = &frame_table[l1_pgentry_to_pagenr(*l1tab)];

        /* Read-only mapping + PGC_allocated + page-table page. */
        page->count_info         = PGC_allocated | 3;
        page->u.inuse.type_info |= PGT_validated | 1;

        /* Top-level p.t. is pinned. */
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
        {
            page->count_info        += 1;
            page->u.inuse.type_info += 1 | PGT_pinned;
        }

        /* Iterate. */
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
        {
            if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
            {
                if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
                    l3start = l3tab = l4_pgentry_to_l3(*++l4tab);
                l2start = l2tab = l3_pgentry_to_l2(*l3tab);
            }
            l1start = l1tab = l2_pgentry_to_l1(*l2tab);
        }
    }

    /* Set up shared-info area. */
    update_dom_time(d);
    d->shared_info->domain_time = 0;
    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
    d->shared_info->n_vcpu = smp_num_cpus;

    /* Install the new page tables. */
    __cli();
    write_ptbase(&ed->mm);

    /* Copy the OS image. */
    (void)loadelfimage(image_start);

    /* Copy the initial ramdisk. */
    if ( initrd_len != 0 )
        memcpy((void *)vinitrd_start, initrd_start, initrd_len);

    /* Set up start info area. */
    si = (start_info_t *)vstartinfo_start;
    memset(si, 0, PAGE_SIZE);
    si->nr_pages     = d->tot_pages;
    si->shared_info  = virt_to_phys(d->shared_info);
    si->flags        = SIF_PRIVILEGED | SIF_INITDOMAIN;
    si->pt_base      = vpt_start;
    si->nr_pt_frames = nr_pt_pages;
    si->mfn_list     = vphysmap_start;

    /* Write the phys->machine and machine->phys table entries. */
    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
    {
        mfn = pfn + (alloc_start>>PAGE_SHIFT);
#ifndef NDEBUG
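        /* In debug builds, map the pfns beyond the loaded image in reverse
           machine order; this makes any guest assumption of a contiguous
           pfn-to-mfn mapping beyond the image show up quickly. */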
#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
        if ( pfn > REVERSE_START )
            mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START);
#endif
        ((unsigned long *)vphysmap_start)[pfn] = mfn;
        machine_to_phys_mapping[mfn] = pfn;
    }

    if ( initrd_len != 0 )
    {
        si->mod_start = vinitrd_start;
        si->mod_len   = initrd_len;
        printk("Initrd len 0x%lx, start at 0x%p\n",
               si->mod_len, si->mod_start);
    }

    dst = si->cmd_line;
    if ( cmdline != NULL )
    {
        for ( i = 0; i < 255; i++ )
        {
            if ( cmdline[i] == '\0' )
                break;
            *dst++ = cmdline[i];
        }
    }
    *dst = '\0';

    /* Reinstate the caller's page tables. */
    write_ptbase(&current->mm);
    __sti();

    /* DOM0 gets access to everything. */
    physdev_init_dom0(d);

    set_bit(DF_CONSTRUCTED, &d->d_flags);

    new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start);

    return 0;
}

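/* Accept only a 64-bit, little-endian, executable x86-64 ELF image as DOM0. */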
int elf_sanity_check(Elf_Ehdr *ehdr)
{
    if ( !IS_ELF(*ehdr) ||
         (ehdr->e_ident[EI_CLASS] != ELFCLASS64) ||
         (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) ||
         (ehdr->e_type != ET_EXEC) ||
         (ehdr->e_machine != EM_X86_64) )
    {
        printk("DOM0 image is not an x86/64-compatible executable ELF image.\n");
        return 0;
    }

    return 1;
}