debuggers.hg: xen/arch/x86/x86_32/domain_build.c @ 3715:d93748c50893

Reorganise 'struct domain' and 'struct exec_domain' to each have an
architecture-specific portion. Removed 'mm_struct'.
Signed-off-by: keir.fraser@cl.cam.ac.uk

author:    kaf24@viper.(none)
date:      Sat Feb 05 18:20:15 2005 +0000
parents:   5c112b235281
children:  88957a238191 ea98f0bb6510
bitkeeper revision: 1.1159.212.100 (42050e5fWLAKCQAvoZ3CPmyAaL-51g)

/* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
/******************************************************************************
 * domain_build.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <asm/regs.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <xen/event.h>
#include <xen/elf.h>
#include <xen/kernel.h>

/* No ring-3 access in initial page tables. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
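/* NB. _PAGE_USER is set only in the L2 entries: leaving it clear in the L1
 * entries is what denies ring-3 access, since the U/S bit must be set at
 * every level for a user-mode access to succeed. */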

#define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_p)  ((_p)&PAGE_MASK)
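/* e.g. with 4kB pages: round_pgup(0x1234) == 0x2000, round_pgdown(0x1234) == 0x1000. */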

int construct_dom0(struct domain *d,
                   unsigned long alloc_start,
                   unsigned long alloc_end,
                   unsigned long _image_start, unsigned long image_len,
                   unsigned long _initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    char *dst;
    int i, rc;
    unsigned long pfn, mfn;
    unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT;
    unsigned long nr_pt_pages;
    unsigned long count;
    l2_pgentry_t *l2tab, *l2start;
    l1_pgentry_t *l1tab = NULL, *l1start = NULL;
    struct pfn_info *page = NULL;
    start_info_t *si;
    struct exec_domain *ed = d->exec_domain[0];
    char *image_start  = (char *)_image_start;  /* use lowmem mappings */
    char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */

    /*
     * This fully describes the memory layout of the initial domain. All
     * *_start addresses are page-aligned, except v_start (and v_end) which
     * are superpage-aligned.
     */
    struct domain_setup_info dsi;
    unsigned long vinitrd_start;
    unsigned long vinitrd_end;
    unsigned long vphysmap_start;
    unsigned long vphysmap_end;
    unsigned long vstartinfo_start;
    unsigned long vstartinfo_end;
    unsigned long vstack_start;
    unsigned long vstack_end;
    unsigned long vpt_start;
    unsigned long vpt_end;
    unsigned long v_end;

    /* Machine address of next candidate page-table page. */
    unsigned long mpt_alloc;

    extern void physdev_init_dom0(struct domain *);

    /* Sanity! */
    if ( d->id != 0 )
        BUG();
    if ( test_bit(DF_CONSTRUCTED, &d->d_flags) )
        BUG();

    memset(&dsi, 0, sizeof(struct domain_setup_info));

    printk("*** LOADING DOMAIN 0 ***\n");

    /*
     * This is all a bit grim. We've moved the modules to the "safe" physical
     * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this
     * routine we're going to copy them down into the region that's actually
     * been allocated to domain 0. This is highly likely to be overlapping, so
     * we use a forward copy.
     *
     * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with
     * 4GB and lots of network/disk cards that allocate loads of buffers.
     * We'll have to revisit this if we ever support PAE (64GB).
     */

    rc = parseelfimage(image_start, image_len, &dsi);
    if ( rc != 0 )
        return rc;

    /* Set up domain options. */
    if ( dsi.use_writable_pagetables )
        vm_assist(d, VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

    /* Align load address to 4MB boundary. */
    dsi.v_start &= ~((1UL<<22)-1);

    /*
     * Why do we need this? The number of page-table frames depends on the
     * size of the bootstrap address space. But the size of the address space
     * depends on the number of page-table frames (since each one is mapped
     * read-only). We have a pair of simultaneous equations in two unknowns,
     * which we solve by exhaustive search.
     */
    vinitrd_start    = round_pgup(dsi.v_kernend);
    vinitrd_end      = vinitrd_start + initrd_len;
    vphysmap_start   = round_pgup(vinitrd_end);
    vphysmap_end     = vphysmap_start + (nr_pages * sizeof(unsigned long));
    vpt_start        = round_pgup(vphysmap_end);
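    /*
     * The search terminates once nr_pt_pages covers one L1 table for each
     * 4MB (one L2 slot) spanned by [v_start, v_end), plus one extra frame
     * for the L2 table itself.
     */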
    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
    {
        vpt_end          = vpt_start + (nr_pt_pages * PAGE_SIZE);
        vstartinfo_start = vpt_end;
        vstartinfo_end   = vstartinfo_start + PAGE_SIZE;
        vstack_start     = vstartinfo_end;
        vstack_end       = vstack_start + PAGE_SIZE;
        v_end            = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
        if ( (v_end - vstack_end) < (512UL << 10) )
            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
            break;
    }

    printk("PHYSICAL MEMORY ARRANGEMENT:\n"
           " Kernel image:  %p->%p\n"
           " Initrd image:  %p->%p\n"
           " Dom0 alloc.:   %p->%p\n",
           _image_start, _image_start + image_len,
           _initrd_start, _initrd_start + initrd_len,
           alloc_start, alloc_end);
    printk("VIRTUAL MEMORY ARRANGEMENT:\n"
           " Loaded kernel: %p->%p\n"
           " Init. ramdisk: %p->%p\n"
           " Phys-Mach map: %p->%p\n"
           " Page tables:   %p->%p\n"
           " Start info:    %p->%p\n"
           " Boot stack:    %p->%p\n"
           " TOTAL:         %p->%p\n",
           dsi.v_kernstart, dsi.v_kernend,
           vinitrd_start, vinitrd_end,
           vphysmap_start, vphysmap_end,
           vpt_start, vpt_end,
           vstartinfo_start, vstartinfo_end,
           vstack_start, vstack_end,
           dsi.v_start, v_end);
    printk(" ENTRY ADDRESS: %p\n", dsi.v_kernentry);

    if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
    {
        printk("Initial guest OS requires too much space\n"
               "(%luMB is greater than %luMB limit)\n",
               (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
        return -ENOMEM;
    }

    /*
     * Protect the lowest 1GB of memory. We use a temporary mapping there
     * from which we copy the kernel and ramdisk images.
     */
    if ( dsi.v_start < (1UL<<30) )
    {
        printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
        return -EINVAL;
    }

    /* Paranoia: scrub DOM0's memory allocation. */
    printk("Scrubbing DOM0 RAM: ");
    dst = (char *)alloc_start;
    while ( dst < (char *)alloc_end )
    {
#define SCRUB_BYTES (100 * 1024 * 1024) /* 100MB */
        printk(".");
        touch_nmi_watchdog();
        if ( ((char *)alloc_end - dst) > SCRUB_BYTES )
        {
            memset(dst, 0, SCRUB_BYTES);
            dst += SCRUB_BYTES;
        }
        else
        {
            memset(dst, 0, (char *)alloc_end - dst);
            break;
        }
    }
    printk("done.\n");

    /* Construct a frame-allocation list for the initial domain. */
    for ( mfn = (alloc_start>>PAGE_SHIFT);
          mfn < (alloc_end>>PAGE_SHIFT);
          mfn++ )
    {
        page = &frame_table[mfn];
        page_set_owner(page, d);
        page->u.inuse.type_info = 0;
        page->count_info        = PGC_allocated | 1;
        list_add_tail(&page->list, &d->page_list);
        d->tot_pages++; d->max_pages++;
    }
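
    /*
     * Page-table frames are taken from dom0's own allocation: mpt_alloc is
     * the machine address that corresponds to vpt_start in the virtual
     * layout above, and advances one page per table allocated.
     */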
    mpt_alloc = (vpt_start - dsi.v_start) + alloc_start;

    SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES);
    SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS);

    /*
     * We're basically forcing default RPLs to 1, so that our "what privilege
     * level are we returning to?" logic works.
     */
    ed->arch.failsafe_selector = FLAT_GUESTOS_CS;
    ed->arch.event_selector    = FLAT_GUESTOS_CS;
    ed->arch.guestos_ss        = FLAT_GUESTOS_SS;
    for ( i = 0; i < 256; i++ )
        ed->arch.traps[i].cs = FLAT_GUESTOS_CS;

    /* WARNING: The new domain must have its 'processor' field filled in! */
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
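    /* Hook up the linear-pagetable self-map and the per-domain mapping slot. */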
    l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
        mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR);
    l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
        mk_l2_pgentry(__pa(d->arch.mm_perdomain_pt) | __PAGE_HYPERVISOR);
    ed->arch.pagetable = mk_pagetable((unsigned long)l2start);
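
    /*
     * Map the whole bootstrap region [v_start, v_end) with 4kB pages,
     * pulling a fresh L1 table from mpt_alloc each time a 4MB boundary
     * is crossed.
     */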
    l2tab += l2_table_offset(dsi.v_start);
    mfn = alloc_start >> PAGE_SHIFT;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
            mpt_alloc += PAGE_SIZE;
            *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT);
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
        }
        *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);

        page = &frame_table[mfn];
        if ( !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l2tab = l2start + l2_table_offset(vpt_start);
    l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
        page = &frame_table[l1_pgentry_to_pagenr(*l1tab)];
        if ( count == 0 )
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;

            /*
             * No longer writable: decrement the type_count.
             * Installed as CR3: increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L2 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l2_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
        }
        else
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;

            /*
             * No longer writable: decrement the type_count.
             * This is an L1 page, installed in a validated L2 page:
             * increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */
        }
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
            l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*++l2tab);
    }

    /* Set up shared-info area. */
    update_dom_time(d);
    d->shared_info->domain_time = 0;
    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
    d->shared_info->n_vcpu = smp_num_cpus;

    /* Install the new page tables. */
    __cli();
    write_ptbase(ed);

    /* Copy the OS image. */
    (void)loadelfimage(image_start);

    /* Copy the initial ramdisk. */
    if ( initrd_len != 0 )
        memcpy((void *)vinitrd_start, initrd_start, initrd_len);

    /* Set up start info area. */
    si = (start_info_t *)vstartinfo_start;
    memset(si, 0, PAGE_SIZE);
    si->nr_pages     = d->tot_pages;
    si->shared_info  = virt_to_phys(d->shared_info);
    si->flags        = SIF_PRIVILEGED | SIF_INITDOMAIN;
    si->pt_base      = vpt_start;
    si->nr_pt_frames = nr_pt_pages;
    si->mfn_list     = vphysmap_start;

    /* Write the phys->machine and machine->phys table entries. */
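    /*
     * NB. In debug builds, frames beyond the bootstrap image are handed out
     * from the top of the allocation downwards (REVERSE_START below), so
     * the resulting phys->machine mapping is not simply contiguous.
     */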
    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
    {
        mfn = pfn + (alloc_start>>PAGE_SHIFT);
#ifndef NDEBUG
#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
        if ( pfn > REVERSE_START )
            mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START);
#endif
        ((unsigned long *)vphysmap_start)[pfn] = mfn;
        machine_to_phys_mapping[mfn] = pfn;
    }

    if ( initrd_len != 0 )
    {
        si->mod_start = vinitrd_start;
        si->mod_len   = initrd_len;
        printk("Initrd len 0x%lx, start at 0x%p\n",
               si->mod_len, si->mod_start);
    }

    dst = si->cmd_line;
    if ( cmdline != NULL )
    {
        for ( i = 0; i < 255; i++ )
        {
            if ( cmdline[i] == '\0' )
                break;
            *dst++ = cmdline[i];
        }
    }
    *dst = '\0';

    /* Reinstate the caller's page tables. */
    write_ptbase(current);
    __sti();

    /* Destroy low mappings - they were only for our convenience. */
    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
        if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE )
            l2start[i] = mk_l2_pgentry(0);
    zap_low_mappings(); /* Do the same for the idle page tables. */

    /* DOM0 gets access to everything. */
    physdev_init_dom0(d);

    set_bit(DF_CONSTRUCTED, &d->d_flags);

    new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start);

#if 0 /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */
    shadow_lock(&d->mm);
    shadow_mode_enable(d, SHM_test);
    shadow_unlock(&d->mm);
#endif

    return 0;
}

int elf_sanity_check(Elf_Ehdr *ehdr)
{
    if ( !IS_ELF(*ehdr) ||
         (ehdr->e_ident[EI_CLASS] != ELFCLASS32) ||
         (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) ||
         (ehdr->e_type != ET_EXEC) ||
         (ehdr->e_machine != EM_386) )
    {
        printk("DOM0 image is not i386-compatible executable Elf image.\n");
        return 0;
    }

    return 1;
}