debuggers.hg

view xen/arch/x86/x86_64/domain_build.c @ 3715:d93748c50893

bitkeeper revision 1.1159.212.100 (42050e5fWLAKCQAvoZ3CPmyAaL-51g)

Reorganise 'struct domain' and 'struct exec_domain' to each have an
architecture-specific portion. Removed 'mm_struct'.
Signed-off-by: keir.fraser@cl.cam.ac.uk
author kaf24@viper.(none)
date Sat Feb 05 18:20:15 2005 +0000 (2005-02-05)
parents 715c644ba9ef
children 88957a238191 4dfebfdc7933 89e86842952a
line source
1 /* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
2 /******************************************************************************
3 * domain_build.c
4 *
5 * Copyright (c) 2002-2005, K A Fraser
6 */
8 #include <xen/config.h>
9 #include <xen/init.h>
10 #include <xen/lib.h>
11 #include <xen/sched.h>
12 #include <xen/smp.h>
13 #include <xen/delay.h>
14 #include <asm/regs.h>
15 #include <asm/system.h>
16 #include <asm/io.h>
17 #include <asm/processor.h>
18 #include <asm/desc.h>
19 #include <asm/i387.h>
20 #include <xen/event.h>
21 #include <xen/elf.h>
22 #include <xen/kernel.h>
/*
 * Flags for the page-table entries built for the initial domain, one set per
 * paging level.  _PAGE_USER is set at every level: in long mode the guest
 * cannot use ring 1, so it must be allowed ring-3 access to its mappings.
 * The L2, L3 and L4 entries all share the same flag set.
 */
#define L1_PROT (_PAGE_USER|_PAGE_ACCESSED|_PAGE_RW|_PAGE_PRESENT)
#define L2_PROT (_PAGE_USER|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_PRESENT)
#define L3_PROT L2_PROT
#define L4_PROT L2_PROT

/* Round an address up / down to a page boundary. */
#define round_pgup(_addr)   (((_addr)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_addr) ((_addr)&PAGE_MASK)
/*
 * construct_dom0: build the initial domain (domain 0) from a kernel image
 * and optional initial ramdisk that are already present in memory.
 *
 * Parameters:
 *   d             - the domain to construct. Its id must be 0 and it must
 *                   not already have DF_CONSTRUCTED set (BUG() otherwise).
 *   alloc_start,
 *   alloc_end     - physical address range given to the domain. Every frame
 *                   in the range is assigned to 'd' and the range is zeroed.
 *   _image_start,
 *   image_len     - physical address and length of the ELF kernel image.
 *   _initrd_start,
 *   initrd_len    - physical address and length of the initial ramdisk; a
 *                   length of 0 means there is none and the copy is skipped.
 *   cmdline       - kernel command line, may be NULL. At most 255 characters
 *                   are copied into the start-info area.
 *
 * Returns 0 on success, the non-zero result of parseelfimage() if the image
 * cannot be parsed, -ENOMEM if the bootstrap virtual layout is larger than
 * the allocation, or -EINVAL if that layout overlaps the range
 * HYPERVISOR_VIRT_START..HYPERVISOR_VIRT_END.
 *
 * On success DF_CONSTRUCTED is set in d->d_flags and new_thread() has been
 * called for exec_domain 0 with the kernel entry point, the top of the boot
 * stack and the address of the start-info page.
 */
33 int construct_dom0(struct domain *d,
34 unsigned long alloc_start,
35 unsigned long alloc_end,
36 unsigned long _image_start, unsigned long image_len,
37 unsigned long _initrd_start, unsigned long initrd_len,
38 char *cmdline)
39 {
40 char *dst;
41 int i, rc;
42 unsigned long pfn, mfn;
43 unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT;
44 unsigned long nr_pt_pages;
45 unsigned long count;
/*
 * Page-table cursors. For each level, the 'start' pointer is the base of the
 * table currently being filled and the 'tab' pointer is the next entry to
 * write. They start out NULL, which the construction loop below treats as
 * "page-aligned, so allocate a fresh table".
 */
46 l4_pgentry_t *l4tab = NULL, *l4start = NULL;
47 l3_pgentry_t *l3tab = NULL, *l3start = NULL;
48 l2_pgentry_t *l2tab = NULL, *l2start = NULL;
49 l1_pgentry_t *l1tab = NULL, *l1start = NULL;
50 struct pfn_info *page = NULL;
51 start_info_t *si;
52 struct exec_domain *ed = d->exec_domain[0];
/* The images are passed as physical addresses; access them through __va(). */
53 char *image_start = __va(_image_start);
54 char *initrd_start = __va(_initrd_start);
56 /*
57 * This fully describes the memory layout of the initial domain. All
58 * *_start address are page-aligned, except v_start (and v_end) which are
59 * superpage-aligned.
60 */
61 struct domain_setup_info dsi;
62 unsigned long vinitrd_start;
63 unsigned long vinitrd_end;
64 unsigned long vphysmap_start;
65 unsigned long vphysmap_end;
66 unsigned long vstartinfo_start;
67 unsigned long vstartinfo_end;
68 unsigned long vstack_start;
69 unsigned long vstack_end;
70 unsigned long vpt_start;
71 unsigned long vpt_end;
72 unsigned long v_end;
74 /* Machine address of next candidate page-table page. */
75 unsigned long mpt_alloc;
77 extern void physdev_init_dom0(struct domain *);
79 /* Sanity! */
80 if ( d->id != 0 )
81 BUG();
82 if ( test_bit(DF_CONSTRUCTED, &d->d_flags) )
83 BUG();
85 memset(&dsi, 0, sizeof(struct domain_setup_info));
87 printk("*** LOADING DOMAIN 0 ***\n");
89 /*
90 * This is all a bit grim. We've moved the modules to the "safe" physical
91 * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this
92 * routine we're going to copy it down into the region that's actually
93 * been allocated to domain 0. This is highly likely to be overlapping, so
94 * we use a forward copy.
95 *
96 * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with
97 * 4GB and lots of network/disk cards that allocate loads of buffers.
98 * We'll have to revisit this if we ever support PAE (64GB).
99 */
/* Fill 'dsi' from the ELF image; any non-zero result is passed to the caller. */
101 rc = parseelfimage(image_start, image_len, &dsi);
102 if ( rc != 0 )
103 return rc;
105 /* Set up domain options */
106 if ( dsi.use_writable_pagetables )
107 vm_assist(d, VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
109 /* Align load address to 4MB boundary. */
110 dsi.v_start &= ~((1UL<<22)-1);
112 /*
113 * Why do we need this? The number of page-table frames depends on the
114 * size of the bootstrap address space. But the size of the address space
115 * depends on the number of page-table frames (since each one is mapped
116 * read-only). We have a pair of simultaneous equations in two unknowns,
117 * which we solve by exhaustive search.
118 */
/*
 * Bootstrap virtual layout, in ascending order: kernel, initrd, phys->machine
 * map (one unsigned long per allocated page), page tables, one start-info
 * page, one boot-stack page, then padding up to v_end.
 */
119 vinitrd_start = round_pgup(dsi.v_kernend);
120 vinitrd_end = vinitrd_start + initrd_len;
121 vphysmap_start = round_pgup(vinitrd_end);
122 vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long));
123 vpt_start = round_pgup(vphysmap_end);
/*
 * Try successively larger page-table reservations, starting at two pages,
 * until the reservation is big enough to map the layout it produces.
 */
124 for ( nr_pt_pages = 2; ; nr_pt_pages++ )
125 {
126 vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
127 vstartinfo_start = vpt_end;
128 vstartinfo_end = vstartinfo_start + PAGE_SIZE;
129 vstack_start = vstartinfo_end;
130 vstack_end = vstack_start + PAGE_SIZE;
131 v_end = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
132 if ( (v_end - vstack_end) < (512UL << 10) )
133 v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
/*
 * NR(_l,_h,_s): number of 2^_s-byte aligned regions touched by [_l,_h),
 * i.e. how many tables are needed when each table covers 2^_s bytes.
 */
134 #define NR(_l,_h,_s) \
135 (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
136 ((_l) & ~((1UL<<(_s))-1))) >> (_s))
137 if ( (1 + /* # L4 */
138 NR(dsi.v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
139 NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT) + /* # L2 */
140 NR(dsi.v_start, v_end, L2_PAGETABLE_SHIFT)) /* # L1 */
141 <= nr_pt_pages )
142 break;
143 }
/*
 * NOTE(review): the arguments printed with %p below are unsigned long values,
 * not pointers; this relies on how printk() handles %p - confirm.
 */
145 printk("PHYSICAL MEMORY ARRANGEMENT:\n"
146 " Kernel image: %p->%p\n"
147 " Initrd image: %p->%p\n"
148 " Dom0 alloc.: %p->%p\n",
149 _image_start, _image_start + image_len,
150 _initrd_start, _initrd_start + initrd_len,
151 alloc_start, alloc_end);
152 printk("VIRTUAL MEMORY ARRANGEMENT:\n"
153 " Loaded kernel: %p->%p\n"
154 " Init. ramdisk: %p->%p\n"
155 " Phys-Mach map: %p->%p\n"
156 " Page tables: %p->%p\n"
157 " Start info: %p->%p\n"
158 " Boot stack: %p->%p\n"
159 " TOTAL: %p->%p\n",
160 dsi.v_kernstart, dsi.v_kernend,
161 vinitrd_start, vinitrd_end,
162 vphysmap_start, vphysmap_end,
163 vpt_start, vpt_end,
164 vstartinfo_start, vstartinfo_end,
165 vstack_start, vstack_end,
166 dsi.v_start, v_end);
167 printk(" ENTRY ADDRESS: %p\n", dsi.v_kernentry);
/* The whole bootstrap region must fit inside the memory given to the domain. */
169 if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
170 {
171 printk("Initial guest OS requires too much space\n"
172 "(%luMB is greater than %luMB limit)\n",
173 (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
174 return -ENOMEM;
175 }
177 /* Overlap with Xen protected area? */
178 if ( (dsi.v_start < HYPERVISOR_VIRT_END) &&
179 (v_end > HYPERVISOR_VIRT_START) )
180 {
181 printk("DOM0 image overlaps with Xen private area.\n");
182 return -EINVAL;
183 }
185 /* Paranoia: scrub DOM0's memory allocation. */
/*
 * The range is zeroed in chunks of at most SCRUB_BYTES; each chunk prints a
 * progress dot and calls touch_nmi_watchdog() before the memset.
 */
186 printk("Scrubbing DOM0 RAM: ");
187 dst = __va(alloc_start);
188 while ( __pa(dst) < alloc_end )
189 {
190 #define SCRUB_BYTES (100 * 1024 * 1024) /* 100MB */
191 printk(".");
192 touch_nmi_watchdog();
193 if ( (alloc_end - __pa(dst)) > SCRUB_BYTES )
194 {
195 memset(dst, 0, SCRUB_BYTES);
196 dst += SCRUB_BYTES;
197 }
198 else
199 {
200 memset(dst, 0, alloc_end - __pa(dst));
201 break;
202 }
203 }
204 printk("done.\n");
206 /* Construct a frame-allocation list for the initial domain. */
/*
 * Every frame in the allocation gets owner 'd', a zero type, and a count of
 * PGC_allocated plus one reference, and is appended to d->page_list.
 */
207 for ( mfn = (alloc_start>>PAGE_SHIFT);
208 mfn < (alloc_end>>PAGE_SHIFT);
209 mfn++ )
210 {
211 page = &frame_table[mfn];
212 page_set_owner(page, d);
213 page->u.inuse.type_info = 0;
214 page->count_info = PGC_allocated | 1;
215 list_add_tail(&page->list, &d->page_list);
216 d->tot_pages++; d->max_pages++;
217 }
/*
 * The bootstrap region is mapped linearly onto the allocation (see the
 * mapping loop below), so the memory backing vpt_start lies at the same
 * offset from alloc_start as vpt_start does from v_start.
 */
219 mpt_alloc = (vpt_start - dsi.v_start) + alloc_start;
221 SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES);
222 SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS);
224 /*
225 * We're basically forcing default RPLs to 1, so that our "what privilege
226 * level are we returning to?" logic works.
227 */
/*
 * NOTE(review): the comment above mentions RPL 1, while the comment on the
 * *_PROT macros at the top of this file says long-mode guests cannot use
 * ring 1 - confirm what FLAT_GUESTOS_CS/SS select on this architecture.
 */
228 ed->arch.failsafe_selector = FLAT_GUESTOS_CS;
229 ed->arch.event_selector = FLAT_GUESTOS_CS;
230 ed->arch.guestos_ss = FLAT_GUESTOS_SS;
231 for ( i = 0; i < 256; i++ )
232 ed->arch.traps[i].cs = FLAT_GUESTOS_CS;
234 /* WARNING: The new domain must have its 'processor' field filled in! */
/*
 * The L4 table is the first page-table page. It starts as a copy of
 * idle_pg_table, then gets an entry pointing back at itself in the
 * LINEAR_PT_VIRT_START slot and an entry for the domain's mm_perdomain_pt in
 * the PERDOMAIN_VIRT_START slot.
 */
235 phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
236 l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
237 memcpy(l4tab, &idle_pg_table[0], PAGE_SIZE);
238 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
239 mk_l4_pgentry(__pa(l4start) | __PAGE_HYPERVISOR);
240 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
241 mk_l4_pgentry(__pa(d->arch.mm_perdomain_pt) | __PAGE_HYPERVISOR);
242 ed->arch.pagetable = mk_pagetable(__pa(l4start));
/*
 * Map each page of [v_start, v_end) to consecutive frames starting at
 * alloc_start. A new L1 (and, nested, L2 and L3) table is taken from
 * mpt_alloc whenever the current entry pointer is page-aligned, i.e. on the
 * first iteration (pointer still NULL) or after the last slot of a table
 * has been written. Only on the first iteration is the pointer advanced to
 * the slot that corresponds to v_start.
 */
244 l4tab += l4_table_offset(dsi.v_start);
245 mfn = alloc_start >> PAGE_SHIFT;
246 for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
247 {
248 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
249 {
250 phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
251 l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
252 clear_page(l1tab);
253 if ( count == 0 )
254 l1tab += l1_table_offset(dsi.v_start);
255 if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
256 {
257 phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
258 l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
259 clear_page(l2tab);
260 if ( count == 0 )
261 l2tab += l2_table_offset(dsi.v_start);
262 if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
263 {
264 phys_to_page(mpt_alloc)->u.inuse.type_info =
265 PGT_l3_page_table;
266 l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
267 clear_page(l3tab);
268 if ( count == 0 )
269 l3tab += l3_table_offset(dsi.v_start);
270 *l4tab++ = mk_l4_pgentry(__pa(l3start) | L4_PROT);
271 }
272 *l3tab++ = mk_l3_pgentry(__pa(l2start) | L3_PROT);
273 }
274 *l2tab++ = mk_l2_pgentry(__pa(l1start) | L2_PROT);
275 }
276 *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);
/*
 * Frames whose type is still zero take a writable-page reference. Frames
 * already given a page-table type above have a non-zero type_info and are
 * skipped here.
 */
278 page = &frame_table[mfn];
279 if ( (page->u.inuse.type_info == 0) &&
280 !get_page_and_type(page, d, PGT_writable_page) )
281 BUG();
283 mfn++;
284 }
286 /* Pages that are part of page tables must be read only. */
/*
 * Locate the L1 entry that maps vpt_start, then visit the nr_pt_pages
 * consecutive L1 entries from there, clearing _PAGE_RW in each and setting
 * the reference counts of the frame it maps.
 */
287 l4tab = l4start + l4_table_offset(vpt_start);
288 l3start = l3tab = l4_pgentry_to_l3(*l4tab);
289 l3tab += l3_table_offset(vpt_start);
290 l2start = l2tab = l3_pgentry_to_l2(*l3tab);
291 l2tab += l2_table_offset(vpt_start);
292 l1start = l1tab = l2_pgentry_to_l1(*l2tab);
293 l1tab += l1_table_offset(vpt_start);
294 for ( count = 0; count < nr_pt_pages; count++ )
295 {
296 *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
297 page = &frame_table[l1_pgentry_to_pagenr(*l1tab)];
299 /* Read-only mapping + PGC_allocated + page-table page. */
300 page->count_info = PGC_allocated | 3;
301 page->u.inuse.type_info |= PGT_validated | 1;
303 /* Top-level p.t. is pinned. */
304 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
305 {
306 page->count_info += 1;
307 page->u.inuse.type_info += 1 | PGT_pinned;
308 }
310 /* Iterate. */
/*
 * Advance to the next L1 entry; when a table pointer becomes page-aligned
 * (ran off the end of its table), step the level above and reload from it.
 */
311 if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
312 {
313 if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
314 {
315 if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
316 l3start = l3tab = l4_pgentry_to_l3(*++l4tab);
317 l2start = l2tab = l3_pgentry_to_l2(*l3tab);
318 }
319 l1start = l1tab = l2_pgentry_to_l1(*l2tab);
320 }
321 }
323 /* Set up shared-info area. */
324 update_dom_time(d);
325 d->shared_info->domain_time = 0;
326 /* Mask all upcalls... */
327 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
328 d->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
329 d->shared_info->n_vcpu = smp_num_cpus;
331 /* Install the new page tables. */
/*
 * From here until the matching write_ptbase(current)/__sti() below, the
 * domain's bootstrap virtual addresses (vinitrd_start, vstartinfo_start,
 * vphysmap_start) are dereferenced directly.
 */
332 __cli();
333 write_ptbase(ed);
335 /* Copy the OS image. */
/* NOTE(review): the result of loadelfimage() is deliberately discarded. */
336 (void)loadelfimage(image_start);
338 /* Copy the initial ramdisk. */
339 if ( initrd_len != 0 )
340 memcpy((void *)vinitrd_start, initrd_start, initrd_len);
342 /* Set up start info area. */
343 si = (start_info_t *)vstartinfo_start;
344 memset(si, 0, PAGE_SIZE);
345 si->nr_pages = d->tot_pages;
346 si->shared_info = virt_to_phys(d->shared_info);
347 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
348 si->pt_base = vpt_start;
349 si->nr_pt_frames = nr_pt_pages;
350 si->mfn_list = vphysmap_start;
352 /* Write the phys->machine and machine->phys table entries. */
/*
 * By default pfn N maps to the Nth frame of the allocation. In builds
 * without NDEBUG, pfns strictly greater than REVERSE_START (the number of
 * pages in the bootstrap region) are instead assigned frames counting down
 * from the end of the allocation.
 */
353 for ( pfn = 0; pfn < d->tot_pages; pfn++ )
354 {
355 mfn = pfn + (alloc_start>>PAGE_SHIFT);
356 #ifndef NDEBUG
357 #define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
358 if ( pfn > REVERSE_START )
359 mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START);
360 #endif
361 ((unsigned long *)vphysmap_start)[pfn] = mfn;
362 machine_to_phys_mapping[mfn] = pfn;
363 }
365 if ( initrd_len != 0 )
366 {
367 si->mod_start = vinitrd_start;
368 si->mod_len = initrd_len;
369 printk("Initrd len 0x%lx, start at 0x%p\n",
370 si->mod_len, si->mod_start);
371 }
/*
 * Copy at most 255 characters of the command line and always write a
 * terminating NUL after them.
 * NOTE(review): assumes si->cmd_line holds at least 256 bytes - confirm
 * against the start_info_t definition.
 */
373 dst = si->cmd_line;
374 if ( cmdline != NULL )
375 {
376 for ( i = 0; i < 255; i++ )
377 {
378 if ( cmdline[i] == '\0' )
379 break;
380 *dst++ = cmdline[i];
381 }
382 }
383 *dst = '\0';
385 /* Reinstate the caller's page tables. */
386 write_ptbase(current);
387 __sti();
389 /* DOM0 gets access to everything. */
390 physdev_init_dom0(d);
392 set_bit(DF_CONSTRUCTED, &d->d_flags);
/* Arguments: kernel entry point, top of the boot stack, start-info address. */
394 new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start);
396 return 0;
397 }
399 int elf_sanity_check(Elf_Ehdr *ehdr)
400 {
401 if ( !IS_ELF(*ehdr) ||
402 (ehdr->e_ident[EI_CLASS] != ELFCLASS64) ||
403 (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) ||
404 (ehdr->e_type != ET_EXEC) ||
405 (ehdr->e_machine != EM_X86_64) )
406 {
407 printk("DOM0 image is not x86/64-compatible executable Elf image.\n");
408 return 0;
409 }
411 return 1;
412 }