debuggers.hg

view xen/arch/x86/x86_32/domain_build.c @ 3766:89e86842952a

bitkeeper revision 1.1159.212.132 (4208e2acn2x2RLZsxZIR12IGEO1b3A)

Merge scramble.cl.cam.ac.uk:/auto/groups/xeno/BK/xeno.bk
into scramble.cl.cam.ac.uk:/local/scratch/kaf24/xen-unstable.bk
author kaf24@scramble.cl.cam.ac.uk
date Tue Feb 08 16:02:52 2005 +0000 (2005-02-08)
parents ea98f0bb6510 4dfebfdc7933
children f5f2757b3aa2
/* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
/******************************************************************************
 * domain_build.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <asm/regs.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <xen/event.h>
#include <xen/elf.h>
#include <xen/kernel.h>
#include <asm/shadow.h>

/* No ring-3 access in initial page tables. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)

#define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_p)  ((_p)&PAGE_MASK)

int construct_dom0(struct domain *d,
                   unsigned long alloc_start,
                   unsigned long alloc_end,
                   unsigned long _image_start, unsigned long image_len,
                   unsigned long _initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    char *dst;
    int i, rc;
    unsigned long pfn, mfn;
    unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT;
    unsigned long nr_pt_pages;
    unsigned long count;
    l2_pgentry_t *l2tab, *l2start;
    l1_pgentry_t *l1tab = NULL, *l1start = NULL;
    struct pfn_info *page = NULL;
    start_info_t *si;
    struct exec_domain *ed = d->exec_domain[0];
    char *image_start  = (char *)_image_start;  /* use lowmem mappings */
    char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */

    /*
     * This fully describes the memory layout of the initial domain. All
     * *_start addresses are page-aligned, except v_start (and v_end) which
     * are superpage-aligned.
     */
    struct domain_setup_info dsi;
    unsigned long vinitrd_start;
    unsigned long vinitrd_end;
    unsigned long vphysmap_start;
    unsigned long vphysmap_end;
    unsigned long vstartinfo_start;
    unsigned long vstartinfo_end;
    unsigned long vstack_start;
    unsigned long vstack_end;
    unsigned long vpt_start;
    unsigned long vpt_end;
    unsigned long v_end;

    /* Machine address of next candidate page-table page. */
    unsigned long mpt_alloc;

    extern void physdev_init_dom0(struct domain *);

    /* Sanity! */
    if ( d->id != 0 )
        BUG();
    if ( test_bit(DF_CONSTRUCTED, &d->d_flags) )
        BUG();

    memset(&dsi, 0, sizeof(struct domain_setup_info));

    printk("*** LOADING DOMAIN 0 ***\n");

    /*
     * This is all a bit grim. We've moved the modules to the "safe" physical
     * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this
     * routine we're going to copy them down into the region that's actually
     * been allocated to domain 0. This is highly likely to be overlapping, so
     * we use a forward copy.
     *
     * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with
     * 4GB and lots of network/disk cards that allocate loads of buffers.
     * We'll have to revisit this if we ever support PAE (64GB).
     */

    rc = parseelfimage(image_start, image_len, &dsi);
    if ( rc != 0 )
        return rc;

    /* Set up domain options */
    if ( dsi.use_writable_pagetables )
        vm_assist(d, VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

    /* Align load address to 4MB boundary. */
    dsi.v_start &= ~((1UL<<22)-1);

    /*
     * Why do we need this? The number of page-table frames depends on the
     * size of the bootstrap address space. But the size of the address space
     * depends on the number of page-table frames (since each one is mapped
     * read-only). We have a pair of simultaneous equations in two unknowns,
     * which we solve by exhaustive search.
     */
    vinitrd_start    = round_pgup(dsi.v_kernend);
    vinitrd_end      = vinitrd_start + initrd_len;
    vphysmap_start   = round_pgup(vinitrd_end);
    vphysmap_end     = vphysmap_start + (nr_pages * sizeof(unsigned long));
    vpt_start        = round_pgup(vphysmap_end);
    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
    {
        vpt_end          = vpt_start + (nr_pt_pages * PAGE_SIZE);
        vstartinfo_start = vpt_end;
        vstartinfo_end   = vstartinfo_start + PAGE_SIZE;
        vstack_start     = vstartinfo_end;
        vstack_end       = vstack_start + PAGE_SIZE;
        v_end            = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
        if ( (v_end - vstack_end) < (512UL << 10) )
            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
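        /*
         * The bootstrap region needs one L1 table per 4MB superpage of
         * [v_start, v_end), plus the single L2 root: stop as soon as
         * nr_pt_pages covers that many frames.
         */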
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
            break;
    }

136 printk("PHYSICAL MEMORY ARRANGEMENT:\n"
137 " Kernel image: %p->%p\n"
138 " Initrd image: %p->%p\n"
139 " Dom0 alloc.: %p->%p\n",
140 _image_start, _image_start + image_len,
141 _initrd_start, _initrd_start + initrd_len,
142 alloc_start, alloc_end);
143 printk("VIRTUAL MEMORY ARRANGEMENT:\n"
144 " Loaded kernel: %p->%p\n"
145 " Init. ramdisk: %p->%p\n"
146 " Phys-Mach map: %p->%p\n"
147 " Page tables: %p->%p\n"
148 " Start info: %p->%p\n"
149 " Boot stack: %p->%p\n"
150 " TOTAL: %p->%p\n",
151 dsi.v_kernstart, dsi.v_kernend,
152 vinitrd_start, vinitrd_end,
153 vphysmap_start, vphysmap_end,
154 vpt_start, vpt_end,
155 vstartinfo_start, vstartinfo_end,
156 vstack_start, vstack_end,
157 dsi.v_start, v_end);
158 printk(" ENTRY ADDRESS: %p\n", dsi.v_kernentry);
160 if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
161 {
162 printk("Initial guest OS requires too much space\n"
163 "(%luMB is greater than %luMB limit)\n",
164 (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
165 return -ENOMEM;
166 }
    /*
     * Protect the lowest 1GB of memory. We use a temporary mapping there
     * from which we copy the kernel and ramdisk images.
     */
    if ( dsi.v_start < (1UL<<30) )
    {
        printk("Initial loading isn't allowed in the lowest 1GB of memory.\n");
        return -EINVAL;
    }

    /* Paranoia: scrub DOM0's memory allocation. */
    printk("Scrubbing DOM0 RAM: ");
    dst = (char *)alloc_start;
    while ( dst < (char *)alloc_end )
    {
#define SCRUB_BYTES (100 * 1024 * 1024) /* 100MB */
        printk(".");
        touch_nmi_watchdog();
        if ( ((char *)alloc_end - dst) > SCRUB_BYTES )
        {
            memset(dst, 0, SCRUB_BYTES);
            dst += SCRUB_BYTES;
        }
        else
        {
            memset(dst, 0, (char *)alloc_end - dst);
            break;
        }
    }
    printk("done.\n");

    /* Construct a frame-allocation list for the initial domain. */
    for ( mfn = (alloc_start>>PAGE_SHIFT);
          mfn < (alloc_end>>PAGE_SHIFT);
          mfn++ )
    {
        page = &frame_table[mfn];
        page_set_owner(page, d);
        page->u.inuse.type_info = 0;
        page->count_info = PGC_allocated | 1;
        list_add_tail(&page->list, &d->page_list);
        d->tot_pages++; d->max_pages++;
    }

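    /*
     * Page-table frames are carved out of dom0's own allocation: mpt_alloc
     * starts at the machine address corresponding to vpt_start in the
     * virtual layout computed above.
     */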
    mpt_alloc = (vpt_start - dsi.v_start) + alloc_start;

    SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES);
    SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS);

    /*
     * We're basically forcing default RPLs to 1, so that our "what privilege
     * level are we returning to?" logic works.
     */
    ed->arch.failsafe_selector = FLAT_GUESTOS_CS;
    ed->arch.event_selector    = FLAT_GUESTOS_CS;
    ed->arch.guestos_ss        = FLAT_GUESTOS_SS;
    for ( i = 0; i < 256; i++ )
        ed->arch.traps[i].cs = FLAT_GUESTOS_CS;

    /* WARNING: The new domain must have its 'processor' field filled in! */
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
    l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
        mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR);
    l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
        mk_l2_pgentry(__pa(d->arch.mm_perdomain_pt) | __PAGE_HYPERVISOR);
    ed->arch.pagetable = mk_pagetable((unsigned long)l2start);

    l2tab += l2_table_offset(dsi.v_start);
    mfn = alloc_start >> PAGE_SHIFT;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
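        /*
         * l1tab is page-aligned exactly when the previous L1 table is full
         * (or on the very first iteration, while it is still NULL), so
         * allocate a fresh L1 table from mpt_alloc and hook it into the L2
         * before mapping this frame.
         */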
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
            mpt_alloc += PAGE_SIZE;
            *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT);
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
        }
        *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);

        page = &frame_table[mfn];
        if ( !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l2tab = l2start + l2_table_offset(vpt_start);
    l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
        page = &frame_table[l1_pgentry_to_pfn(*l1tab)];
        if ( count == 0 )
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;

            /*
             * No longer writable: decrement the type_count.
             * Installed as CR3: increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L2 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l2_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
        }
        else
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;

            /*
             * No longer writable: decrement the type_count.
             * This is an L1 page, installed in a validated L2 page:
             * increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */
        }
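        /* Step to the next L1 table once the current one has been walked. */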
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
            l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*++l2tab);
    }

    /* Set up shared-info area. */
    update_dom_time(d);
    d->shared_info->domain_time = 0;
    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
    d->shared_info->n_vcpu = smp_num_cpus;

    /* Install the new page tables. */
    __cli();
    write_ptbase(ed);

    /* Copy the OS image. */
    (void)loadelfimage(image_start);

    /* Copy the initial ramdisk. */
    if ( initrd_len != 0 )
        memcpy((void *)vinitrd_start, initrd_start, initrd_len);

    /* Set up start info area. */
    si = (start_info_t *)vstartinfo_start;
    memset(si, 0, PAGE_SIZE);
    si->nr_pages     = d->tot_pages;
    si->shared_info  = virt_to_phys(d->shared_info);
    si->flags        = SIF_PRIVILEGED | SIF_INITDOMAIN;
    si->pt_base      = vpt_start;
    si->nr_pt_frames = nr_pt_pages;
    si->mfn_list     = vphysmap_start;

    /* Write the phys->machine and machine->phys table entries. */
    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
    {
        mfn = pfn + (alloc_start>>PAGE_SHIFT);
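        /*
         * In debug builds, frames beyond the loaded image are handed out in
         * reverse order, making the phys-to-machine mapping deliberately
         * non-contiguous.
         */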
#ifndef NDEBUG
#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
        if ( pfn > REVERSE_START )
            mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START);
#endif
        ((unsigned long *)vphysmap_start)[pfn] = mfn;
        machine_to_phys_mapping[mfn] = pfn;
    }

    if ( initrd_len != 0 )
    {
        si->mod_start = vinitrd_start;
        si->mod_len   = initrd_len;
        printk("Initrd len 0x%lx, start at 0x%p\n",
               si->mod_len, si->mod_start);
    }

    dst = si->cmd_line;
    if ( cmdline != NULL )
    {
        for ( i = 0; i < 255; i++ )
        {
            if ( cmdline[i] == '\0' )
                break;
            *dst++ = cmdline[i];
        }
    }
    *dst = '\0';

    /* Reinstate the caller's page tables. */
    write_ptbase(current);
    __sti();

    /* Destroy low mappings - they were only for our convenience. */
    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
        if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE )
            l2start[i] = mk_l2_pgentry(0);
    zap_low_mappings(); /* Do the same for the idle page tables. */

    /* DOM0 gets access to everything. */
    physdev_init_dom0(d);

    set_bit(DF_CONSTRUCTED, &d->d_flags);

    new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start);

#ifndef NDEBUG
    if (0) /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */
    {
        shadow_lock(d);
        shadow_mode_enable(d, SHM_test);
        shadow_unlock(d);
    }
#endif

    return 0;
}

int elf_sanity_check(Elf_Ehdr *ehdr)
{
    if ( !IS_ELF(*ehdr) ||
         (ehdr->e_ident[EI_CLASS] != ELFCLASS32) ||
         (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) ||
         (ehdr->e_type != ET_EXEC) ||
         (ehdr->e_machine != EM_386) )
    {
        printk("DOM0 image is not an i386-compatible executable ELF image.\n");
        return 0;
    }

    return 1;
}