debuggers.hg

view xen/arch/x86/x86_32/domain_build.c @ 3755:ea98f0bb6510

bitkeeper revision 1.1159.212.127 (4208b02bTdSR4AVYRg8diDkKZmIVUg)

General shadow code cleanup.

Fixed compilation problems when SHADOW_DEBUG is enabled.
Fixed compilation problems when CONFIG_VMX is undefined.

Simplified l1pte_write_fault and l1pte_read_fault.
Name change: spfn => smfn (shadow machine frame numbers).

In general, the terms pfn and gpfn now refer to pages in the
guest's idea of physical frames (which differs for full shadow
guests). mfn always refers to a machine frame number.
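
As a minimal illustration of the convention (the helper names below are
hypothetical; the underlying tables are the guest's phys-to-machine array
and the global machine_to_phys_mapping table that construct_dom0 fills in
later in this file):

/* Sketch only: the two directions of the translation. */
static inline unsigned long gpfn_to_mfn(unsigned long *phys_to_machine,
                                        unsigned long gpfn)
{
    return phys_to_machine[gpfn];   /* guest-physical frame -> machine frame */
}

static inline unsigned long mfn_to_gpfn(unsigned long *machine_to_phys,
                                        unsigned long mfn)
{
    return machine_to_phys[mfn];    /* machine frame -> guest-physical frame */
}

For a fully shadowed guest the two numbers genuinely differ, which is why the
code now has to say explicitly which one it means.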

One bug fix for check_pagetable(): if we're using writable page tables
along with shadow mode, don't check the currently writable page table
page -- check its snapshot instead.
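
A rough sketch of the fixed check (every name below is a placeholder for
illustration; the real logic lives in check_pagetable() in the shadow code,
not in this file): when the writable-pagetables assist and shadow mode are
both active, a guest page-table page that is currently mapped writable may
legitimately be stale, so the audit has to compare against the saved snapshot
of that page rather than its live contents.

/* Illustrative sketch only, not the real shadow-audit interface. */
static const l1_pgentry_t *audit_source(const l1_pgentry_t *live_page,
                                        const l1_pgentry_t *snapshot,
                                        int currently_writable)
{
    /* Writable PT + shadow mode: the live page may be mid-update. */
    return currently_writable ? snapshot : live_page;
}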

Signed-off-by: michael.fetterman@cl.cam.ac.uk
author mafetter@fleming.research
date Tue Feb 08 12:27:23 2005 +0000 (2005-02-08)
parents d93748c50893
children 4dfebfdc7933 89e86842952a
line source
/* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
/******************************************************************************
 * domain_build.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <asm/regs.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <xen/event.h>
#include <xen/elf.h>
#include <xen/kernel.h>
#include <asm/shadow.h>

/* No ring-3 access in initial page tables. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)

#define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_p) ((_p)&PAGE_MASK)

int construct_dom0(struct domain *d,
                   unsigned long alloc_start,
                   unsigned long alloc_end,
                   unsigned long _image_start, unsigned long image_len,
                   unsigned long _initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    char *dst;
    int i, rc;
    unsigned long pfn, mfn;
    unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT;
    unsigned long nr_pt_pages;
    unsigned long count;
    l2_pgentry_t *l2tab, *l2start;
    l1_pgentry_t *l1tab = NULL, *l1start = NULL;
    struct pfn_info *page = NULL;
    start_info_t *si;
    struct exec_domain *ed = d->exec_domain[0];
    char *image_start = (char *)_image_start; /* use lowmem mappings */
    char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */

    /*
     * This fully describes the memory layout of the initial domain. All
     * *_start addresses are page-aligned, except v_start (and v_end) which
     * are superpage-aligned.
     */
    struct domain_setup_info dsi;
    unsigned long vinitrd_start;
    unsigned long vinitrd_end;
    unsigned long vphysmap_start;
    unsigned long vphysmap_end;
    unsigned long vstartinfo_start;
    unsigned long vstartinfo_end;
    unsigned long vstack_start;
    unsigned long vstack_end;
    unsigned long vpt_start;
    unsigned long vpt_end;
    unsigned long v_end;

    /* Machine address of next candidate page-table page. */
    unsigned long mpt_alloc;

    extern void physdev_init_dom0(struct domain *);

    /* Sanity! */
    if ( d->id != 0 )
        BUG();
    if ( test_bit(DF_CONSTRUCTED, &d->d_flags) )
        BUG();

    memset(&dsi, 0, sizeof(struct domain_setup_info));

    printk("*** LOADING DOMAIN 0 ***\n");

    /*
     * This is all a bit grim. We've moved the modules to the "safe" physical
     * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this
     * routine we're going to copy them down into the region that's actually
     * been allocated to domain 0. This is highly likely to be overlapping, so
     * we use a forward copy.
     *
     * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with
     * 4GB and lots of network/disk cards that allocate loads of buffers.
     * We'll have to revisit this if we ever support PAE (64GB).
     */
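
    /*
     * Illustrative sketch (not part of the original source): a forward,
     * low-to-high copy is the safe direction for an overlapping move
     * whenever the destination lies below the source -- which holds here,
     * since the images are copied *down* from above MAP_DIRECTMAP_ADDRESS:
     *
     *     for ( i = 0; i < len; i++ )
     *         dst[i] = src[i];     (only clobbers source bytes already read)
     */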

    rc = parseelfimage(image_start, image_len, &dsi);
    if ( rc != 0 )
        return rc;

    /* Set up domain options */
    if ( dsi.use_writable_pagetables )
        vm_assist(d, VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

    /* Align load address to 4MB boundary. */
    dsi.v_start &= ~((1UL<<22)-1);

    /*
     * Why do we need this? The number of page-table frames depends on the
     * size of the bootstrap address space. But the size of the address space
     * depends on the number of page-table frames (since each one is mapped
     * read-only). We have a pair of simultaneous equations in two unknowns,
     * which we solve by exhaustive search.
     */
    vinitrd_start = round_pgup(dsi.v_kernend);
    vinitrd_end = vinitrd_start + initrd_len;
    vphysmap_start = round_pgup(vinitrd_end);
    vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long));
    vpt_start = round_pgup(vphysmap_end);
    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
    {
        vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
        vstartinfo_start = vpt_end;
        vstartinfo_end = vstartinfo_start + PAGE_SIZE;
        vstack_start = vstartinfo_end;
        vstack_end = vstack_start + PAGE_SIZE;
        v_end = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
        if ( (v_end - vstack_end) < (512UL << 10) )
            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
            break;
    }
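
    /*
     * Worked example (illustrative numbers, not taken from this changeset):
     * suppose the kernel, initrd and phys-map end 9MB above dsi.v_start.
     * For every candidate nr_pt_pages the start-info page, stack page and
     * padding still round v_end up to dsi.v_start + 12MB, so the test
     * requires 12MB/4MB + 1 = 4 page-table pages (one L2 plus three L1s),
     * and the loop first succeeds at nr_pt_pages = 4.
     */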
136 printk("PHYSICAL MEMORY ARRANGEMENT:\n"
137 " Kernel image: %p->%p\n"
138 " Initrd image: %p->%p\n"
139 " Dom0 alloc.: %p->%p\n",
140 _image_start, _image_start + image_len,
141 _initrd_start, _initrd_start + initrd_len,
142 alloc_start, alloc_end);
143 printk("VIRTUAL MEMORY ARRANGEMENT:\n"
144 " Loaded kernel: %p->%p\n"
145 " Init. ramdisk: %p->%p\n"
146 " Phys-Mach map: %p->%p\n"
147 " Page tables: %p->%p\n"
148 " Start info: %p->%p\n"
149 " Boot stack: %p->%p\n"
150 " TOTAL: %p->%p\n",
151 dsi.v_kernstart, dsi.v_kernend,
152 vinitrd_start, vinitrd_end,
153 vphysmap_start, vphysmap_end,
154 vpt_start, vpt_end,
155 vstartinfo_start, vstartinfo_end,
156 vstack_start, vstack_end,
157 dsi.v_start, v_end);
158 printk(" ENTRY ADDRESS: %p\n", dsi.v_kernentry);
160 if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
161 {
162 printk("Initial guest OS requires too much space\n"
163 "(%luMB is greater than %luMB limit)\n",
164 (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
165 return -ENOMEM;
166 }
168 /*
169 * Protect the lowest 1GB of memory. We use a temporary mapping there
170 * from which we copy the kernel and ramdisk images.
171 */
172 if ( dsi.v_start < (1UL<<30) )
173 {
174 printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
175 return -EINVAL;
176 }
178 /* Paranoia: scrub DOM0's memory allocation. */
179 printk("Scrubbing DOM0 RAM: ");
180 dst = (char *)alloc_start;
181 while ( dst < (char *)alloc_end )
182 {
183 #define SCRUB_BYTES (100 * 1024 * 1024) /* 100MB */
184 printk(".");
185 touch_nmi_watchdog();
186 if ( ((char *)alloc_end - dst) > SCRUB_BYTES )
187 {
188 memset(dst, 0, SCRUB_BYTES);
189 dst += SCRUB_BYTES;
190 }
191 else
192 {
193 memset(dst, 0, (char *)alloc_end - dst);
194 break;
195 }
196 }
197 printk("done.\n");
199 /* Construct a frame-allocation list for the initial domain. */
200 for ( mfn = (alloc_start>>PAGE_SHIFT);
201 mfn < (alloc_end>>PAGE_SHIFT);
202 mfn++ )
203 {
204 page = &frame_table[mfn];
205 page_set_owner(page, d);
206 page->u.inuse.type_info = 0;
207 page->count_info = PGC_allocated | 1;
208 list_add_tail(&page->list, &d->page_list);
209 d->tot_pages++; d->max_pages++;
210 }
212 mpt_alloc = (vpt_start - dsi.v_start) + alloc_start;
214 SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES);
215 SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS);
217 /*
218 * We're basically forcing default RPLs to 1, so that our "what privilege
219 * level are we returning to?" logic works.
220 */
221 ed->arch.failsafe_selector = FLAT_GUESTOS_CS;
222 ed->arch.event_selector = FLAT_GUESTOS_CS;
223 ed->arch.guestos_ss = FLAT_GUESTOS_SS;
224 for ( i = 0; i < 256; i++ )
225 ed->arch.traps[i].cs = FLAT_GUESTOS_CS;
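
    /*
     * Illustration (not part of the original file): FLAT_GUESTOS_CS carries
     * RPL 1, so when an exception or interrupt saves the guest's CS, the low
     * two bits of the saved selector identify where the return will go:
     *
     *     (cs & 3) == 1  ->  back to the guest kernel (ring 1)
     *     (cs & 3) == 3  ->  back to guest user space (ring 3)
     *
     * which is exactly the "what privilege level are we returning to?" test
     * referred to above.
     */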

    /* WARNING: The new domain must have its 'processor' field filled in! */
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
    l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
        mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR);
    l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
        mk_l2_pgentry(__pa(d->arch.mm_perdomain_pt) | __PAGE_HYPERVISOR);
    ed->arch.pagetable = mk_pagetable((unsigned long)l2start);

    l2tab += l2_table_offset(dsi.v_start);
    mfn = alloc_start >> PAGE_SHIFT;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
            mpt_alloc += PAGE_SIZE;
            *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT);
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
        }
        *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);

        page = &frame_table[mfn];
        if ( !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l2tab = l2start + l2_table_offset(vpt_start);
    l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
        page = &frame_table[l1_pgentry_to_pagenr(*l1tab)];
        if ( count == 0 )
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;

            /*
             * No longer writable: decrement the type_count.
             * Installed as CR3: increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L2 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l2_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
        }
        else
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;

            /*
             * No longer writable: decrement the type_count.
             * This is an L1 page, installed in a validated L2 page:
             * increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */
        }
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
            l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*++l2tab);
    }

    /* Set up shared-info area. */
    update_dom_time(d);
    d->shared_info->domain_time = 0;
    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
    d->shared_info->n_vcpu = smp_num_cpus;

    /* Install the new page tables. */
    __cli();
    write_ptbase(ed);

    /* Copy the OS image. */
    (void)loadelfimage(image_start);

    /* Copy the initial ramdisk. */
    if ( initrd_len != 0 )
        memcpy((void *)vinitrd_start, initrd_start, initrd_len);

    /* Set up start info area. */
    si = (start_info_t *)vstartinfo_start;
    memset(si, 0, PAGE_SIZE);
    si->nr_pages = d->tot_pages;
    si->shared_info = virt_to_phys(d->shared_info);
    si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
    si->pt_base = vpt_start;
    si->nr_pt_frames = nr_pt_pages;
    si->mfn_list = vphysmap_start;

    /* Write the phys->machine and machine->phys table entries. */
    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
    {
        mfn = pfn + (alloc_start>>PAGE_SHIFT);
#ifndef NDEBUG
#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
        if ( pfn > REVERSE_START )
            mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START);
#endif
        ((unsigned long *)vphysmap_start)[pfn] = mfn;
        machine_to_phys_mapping[mfn] = pfn;
    }

    if ( initrd_len != 0 )
    {
        si->mod_start = vinitrd_start;
        si->mod_len = initrd_len;
        printk("Initrd len 0x%lx, start at 0x%p\n",
               si->mod_len, si->mod_start);
    }

    dst = si->cmd_line;
    if ( cmdline != NULL )
    {
        for ( i = 0; i < 255; i++ )
        {
            if ( cmdline[i] == '\0' )
                break;
            *dst++ = cmdline[i];
        }
    }
    *dst = '\0';

    /* Reinstate the caller's page tables. */
    write_ptbase(current);
    __sti();

    /* Destroy low mappings - they were only for our convenience. */
    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
        if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE )
            l2start[i] = mk_l2_pgentry(0);
    zap_low_mappings(); /* Do the same for the idle page tables. */

    /* DOM0 gets access to everything. */
    physdev_init_dom0(d);

    set_bit(DF_CONSTRUCTED, &d->d_flags);

    new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start);

#ifndef NDEBUG
    if (0) /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */
    {
        shadow_lock(d);
        shadow_mode_enable(d, SHM_test);
        shadow_unlock(d);
    }
#endif

    return 0;
}

int elf_sanity_check(Elf_Ehdr *ehdr)
{
    if ( !IS_ELF(*ehdr) ||
         (ehdr->e_ident[EI_CLASS] != ELFCLASS32) ||
         (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) ||
         (ehdr->e_type != ET_EXEC) ||
         (ehdr->e_machine != EM_386) )
    {
        printk("DOM0 image is not i386-compatible executable Elf image.\n");
        return 0;
    }

    return 1;
}