debuggers.hg: view xen/arch/x86/x86_32/domain_build.c @ 3686:5c112b235281

bitkeeper revision 1.1159.212.85 (42038b45EjUo-1JiSCHXW0Wav4TZGQ)

x86_64 progress: now entering ring 3. Need a hypercall (SYSCALL)
entry point, and some kind of DOM0 image to test against.
Signed-off-by: keir.fraser@cl.cam.ac.uk

author    kaf24@scramble.cl.cam.ac.uk
date      Fri Feb 04 14:48:37 2005 +0000 (2005-02-04)
parents   dbc41aaba297
children  393483ae9f62 d93748c50893

line source

/******************************************************************************
 * domain_build.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <asm/regs.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <xen/event.h>
#include <xen/elf.h>
#include <xen/kernel.h>

/* No ring-3 access in initial page tables. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
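
/*
 * Note: _PAGE_USER appears only in L2_PROT. Leaving it clear in the L1
 * entries is what actually denies ring-3 access, since a user-mode access
 * needs the U/S bit set at both levels of the walk.
 */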

#define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_p) ((_p)&PAGE_MASK)

int construct_dom0(struct domain *d,
                   unsigned long alloc_start,
                   unsigned long alloc_end,
                   unsigned long _image_start, unsigned long image_len,
                   unsigned long _initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    char *dst;
    int i, rc;
    unsigned long pfn, mfn;
    unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT;
    unsigned long nr_pt_pages;
    unsigned long count;
    l2_pgentry_t *l2tab, *l2start;
    l1_pgentry_t *l1tab = NULL, *l1start = NULL;
    struct pfn_info *page = NULL;
    start_info_t *si;
    struct exec_domain *ed = d->exec_domain[0];
    char *image_start = (char *)_image_start;   /* use lowmem mappings */
    char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */

    /*
     * This fully describes the memory layout of the initial domain. All
     * *_start addresses are page-aligned, except v_start (and v_end) which
     * are superpage-aligned.
     */
    struct domain_setup_info dsi;
    unsigned long vinitrd_start;
    unsigned long vinitrd_end;
    unsigned long vphysmap_start;
    unsigned long vphysmap_end;
    unsigned long vstartinfo_start;
    unsigned long vstartinfo_end;
    unsigned long vstack_start;
    unsigned long vstack_end;
    unsigned long vpt_start;
    unsigned long vpt_end;
    unsigned long v_end;

    /* Machine address of next candidate page-table page. */
    unsigned long mpt_alloc;

    extern void physdev_init_dom0(struct domain *);

    /* Sanity! */
    if ( d->id != 0 )
        BUG();
    if ( test_bit(DF_CONSTRUCTED, &d->d_flags) )
        BUG();

    memset(&dsi, 0, sizeof(struct domain_setup_info));

    printk("*** LOADING DOMAIN 0 ***\n");

    /*
     * This is all a bit grim. We've moved the modules to the "safe" physical
     * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this
     * routine we're going to copy them down into the region that's actually
     * been allocated to domain 0. This is highly likely to be overlapping, so
     * we use a forward copy.
     *
     * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with
     * 4GB and lots of network/disk cards that allocate loads of buffers.
     * We'll have to revisit this if we ever support PAE (64GB).
     */

    rc = parseelfimage(image_start, image_len, &dsi);
    if ( rc != 0 )
        return rc;

    /* Set up domain options */
    if ( dsi.use_writable_pagetables )
        vm_assist(d, VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

    /* Align load address to 4MB boundary. */
    dsi.v_start &= ~((1UL<<22)-1);

    /*
     * Why do we need this? The number of page-table frames depends on the
     * size of the bootstrap address space. But the size of the address space
     * depends on the number of page-table frames (since each one is mapped
     * read-only). We have a pair of simultaneous equations in two unknowns,
     * which we solve by exhaustive search.
     */
    vinitrd_start = round_pgup(dsi.v_kernend);
    vinitrd_end = vinitrd_start + initrd_len;
    vphysmap_start = round_pgup(vinitrd_end);
    vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long));
    vpt_start = round_pgup(vphysmap_end);
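    /*
     * The loop below guesses nr_pt_pages, lays out the rest of the region,
     * and accepts the first guess that is big enough: one L2 page plus one
     * L1 page per 4MB spanned by [v_start, v_end).
     */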
    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
    {
        vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
        vstartinfo_start = vpt_end;
        vstartinfo_end = vstartinfo_start + PAGE_SIZE;
        vstack_start = vstartinfo_end;
        vstack_end = vstack_start + PAGE_SIZE;
        v_end = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
        if ( (v_end - vstack_end) < (512UL << 10) )
            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
            break;
    }

    printk("PHYSICAL MEMORY ARRANGEMENT:\n"
           " Kernel image: %p->%p\n"
           " Initrd image: %p->%p\n"
           " Dom0 alloc.: %p->%p\n",
           _image_start, _image_start + image_len,
           _initrd_start, _initrd_start + initrd_len,
           alloc_start, alloc_end);
    printk("VIRTUAL MEMORY ARRANGEMENT:\n"
           " Loaded kernel: %p->%p\n"
           " Init. ramdisk: %p->%p\n"
           " Phys-Mach map: %p->%p\n"
           " Page tables: %p->%p\n"
           " Start info: %p->%p\n"
           " Boot stack: %p->%p\n"
           " TOTAL: %p->%p\n",
           dsi.v_kernstart, dsi.v_kernend,
           vinitrd_start, vinitrd_end,
           vphysmap_start, vphysmap_end,
           vpt_start, vpt_end,
           vstartinfo_start, vstartinfo_end,
           vstack_start, vstack_end,
           dsi.v_start, v_end);
    printk(" ENTRY ADDRESS: %p\n", dsi.v_kernentry);

    if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
    {
        printk("Initial guest OS requires too much space\n"
               "(%luMB is greater than %luMB limit)\n",
               (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
        return -ENOMEM;
    }

    /*
     * Protect the lowest 1GB of memory. We use a temporary mapping there
     * from which we copy the kernel and ramdisk images.
     */
    if ( dsi.v_start < (1UL<<30) )
    {
        printk("Initial loading isn't allowed in the lowest 1GB of memory.\n");
        return -EINVAL;
    }

    /* Paranoia: scrub DOM0's memory allocation. */
    printk("Scrubbing DOM0 RAM: ");
    dst = (char *)alloc_start;
    while ( dst < (char *)alloc_end )
    {
#define SCRUB_BYTES (100 * 1024 * 1024) /* 100MB */
        printk(".");
        touch_nmi_watchdog();
        if ( ((char *)alloc_end - dst) > SCRUB_BYTES )
        {
            memset(dst, 0, SCRUB_BYTES);
            dst += SCRUB_BYTES;
        }
        else
        {
            memset(dst, 0, (char *)alloc_end - dst);
            break;
        }
    }
    printk("done.\n");

    /* Construct a frame-allocation list for the initial domain. */
    for ( mfn = (alloc_start>>PAGE_SHIFT);
          mfn < (alloc_end>>PAGE_SHIFT);
          mfn++ )
    {
        page = &frame_table[mfn];
        page_set_owner(page, d);
        page->u.inuse.type_info = 0;
        page->count_info = PGC_allocated | 1;
        list_add_tail(&page->list, &d->page_list);
        d->tot_pages++; d->max_pages++;
    }
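
    /*
     * Page-table frames are carved out of dom0's own allocation: the
     * bootstrap region is mapped 1:1 onto [alloc_start, alloc_end), so
     * vpt_start corresponds to the machine address computed below.
     */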
    mpt_alloc = (vpt_start - dsi.v_start) + alloc_start;

    SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES);
    SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS);

    /*
     * We're basically forcing default RPLs to 1, so that our "what privilege
     * level are we returning to?" logic works.
     */
    ed->thread.failsafe_selector = FLAT_GUESTOS_CS;
    ed->thread.event_selector = FLAT_GUESTOS_CS;
    ed->thread.guestos_ss = FLAT_GUESTOS_SS;
    for ( i = 0; i < 256; i++ )
        ed->thread.traps[i].cs = FLAT_GUESTOS_CS;

    /* WARNING: The new domain must have its 'processor' field filled in! */
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
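    /*
     * The Xen mappings are inherited from the idle page table. Below we also
     * point the linear-pagetable slot back at this L2 (so the page tables
     * remain reachable through virtual addresses) and install the per-domain
     * mapping area.
     */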
    l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
        mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR);
    l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
        mk_l2_pgentry(__pa(d->mm_perdomain_pt) | __PAGE_HYPERVISOR);
    ed->mm.pagetable = mk_pagetable((unsigned long)l2start);

    l2tab += l2_table_offset(dsi.v_start);
    mfn = alloc_start >> PAGE_SHIFT;
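    /*
     * Map the whole bootstrap region [v_start, v_end) onto dom0's allocation,
     * pulling a fresh L1 page from mpt_alloc whenever the current one fills
     * (and on the first pass, since l1tab starts out NULL).
     */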
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
            mpt_alloc += PAGE_SIZE;
            *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT);
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
        }
        *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);

        page = &frame_table[mfn];
        if ( !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l2tab = l2start + l2_table_offset(vpt_start);
    l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
        page = &frame_table[l1_pgentry_to_pagenr(*l1tab)];
        if ( count == 0 )
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;

            /*
             * No longer writable: decrement the type_count.
             * Installed as CR3: increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L2 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l2_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
        }
        else
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
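            /*
             * Record in the type word the L2 slot at which this L1 is
             * installed: slot (v_start >> L2_PAGETABLE_SHIFT) + (count-1),
             * since pt frame 0 is the L2 itself.
             */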
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;

            /*
             * No longer writable: decrement the type_count.
             * This is an L1 page, installed in a validated L2 page:
             * increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */
        }
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
            l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*++l2tab);
    }

    /* Set up shared-info area. */
    update_dom_time(d);
    d->shared_info->domain_time = 0;
    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
    d->shared_info->n_vcpu = smp_num_cpus;

    /* Install the new page tables. */
    __cli();
    write_ptbase(&ed->mm);

    /* Copy the OS image. */
    (void)loadelfimage(image_start);

    /* Copy the initial ramdisk. */
    if ( initrd_len != 0 )
        memcpy((void *)vinitrd_start, initrd_start, initrd_len);

    /* Set up start info area. */
    si = (start_info_t *)vstartinfo_start;
    memset(si, 0, PAGE_SIZE);
    si->nr_pages = d->tot_pages;
    si->shared_info = virt_to_phys(d->shared_info);
    si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
    si->pt_base = vpt_start;
    si->nr_pt_frames = nr_pt_pages;
    si->mfn_list = vphysmap_start;

    /* Write the phys->machine and machine->phys table entries. */
    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
    {
        mfn = pfn + (alloc_start>>PAGE_SHIFT);
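        /*
         * Debug builds hand out the frames beyond the bootstrap image from
         * the top of the allocation downwards, so a guest that wrongly
         * assumes a contiguous pfn-to-mfn layout should trip over it early.
         */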
#ifndef NDEBUG
#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
        if ( pfn > REVERSE_START )
            mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START);
#endif
        ((unsigned long *)vphysmap_start)[pfn] = mfn;
        machine_to_phys_mapping[mfn] = pfn;
    }

    if ( initrd_len != 0 )
    {
        si->mod_start = vinitrd_start;
        si->mod_len = initrd_len;
        printk("Initrd len 0x%lx, start at 0x%p\n",
               si->mod_len, si->mod_start);
    }

    dst = si->cmd_line;
    if ( cmdline != NULL )
    {
        for ( i = 0; i < 255; i++ )
        {
            if ( cmdline[i] == '\0' )
                break;
            *dst++ = cmdline[i];
        }
    }
    *dst = '\0';

    /* Reinstate the caller's page tables. */
    write_ptbase(&current->mm);
    __sti();

    /* Destroy low mappings - they were only for our convenience. */
    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
        if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE )
            l2start[i] = mk_l2_pgentry(0);
    zap_low_mappings(); /* Do the same for the idle page tables. */

    /* DOM0 gets access to everything. */
    physdev_init_dom0(d);

    set_bit(DF_CONSTRUCTED, &d->d_flags);

    new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start);

#if 0 /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */
    shadow_lock(&d->mm);
    shadow_mode_enable(d, SHM_test);
    shadow_unlock(&d->mm);
#endif

    return 0;
}

int elf_sanity_check(Elf_Ehdr *ehdr)
{
    if ( !IS_ELF(*ehdr) ||
         (ehdr->e_ident[EI_CLASS] != ELFCLASS32) ||
         (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) ||
         (ehdr->e_type != ET_EXEC) ||
         (ehdr->e_machine != EM_386) )
    {
        printk("DOM0 image is not an i386-compatible executable ELF image.\n");
        return 0;
    }

    return 1;
}