
view xen/arch/x86/domain_build.c @ 19826:2f9e1348aa98

x86_64: allow more vCPU-s per guest

Since the shared info layout is fixed, guests are required to use
VCPUOP_register_vcpu_info prior to booting any vCPU beyond the
traditional limit of 32.

MAX_VIRT_CPUS, being an implementation detail of the hypervisor, is no
longer exposed in the public headers.

The tools changes are clearly incomplete (and done only so things would
build again), and the current state of the tools (using scalar
variables all over the place to represent vCPU bitmaps) very likely
doesn't permit booting DomU-s with more than the traditional number of
vCPU-s. Testing of the extended functionality was done with Dom0 (96
vCPU-s, as well as 128 vCPU-s out of which the kernel elected - by way
of a simple kernel side patch - to use only some, resulting in a sparse
bitmap).

The ia64 changes are only there to make things build, and are
build-tested only (the tools part only as far as the build would go
without running into unrelated problems in the blktap code).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Jun 18 10:14:16 2009 +0100 (2009-06-18)
parents cecc76506afc
children 8440fc9f7a25
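
For context, a guest that wants to bring online any vCPU beyond the first 32
must now register a vcpu_info area for it first, since only 32 slots exist in
the fixed shared_info layout. A minimal guest-side sketch follows; the
virt_to_mfn()/offset_in_page() helpers and the Linux-style HYPERVISOR_vcpu_op()
wrapper are assumptions for illustration and not part of this changeset, but
VCPUOP_register_vcpu_info and struct vcpu_register_vcpu_info are the real
public interface (see public/vcpu.h).

    /* Sketch: have the hypervisor place vcpu_info for 'cpu' in guest memory. */
    static int register_vcpu_info(unsigned int cpu, struct vcpu_info *vi)
    {
        struct vcpu_register_vcpu_info info;

        info.mfn    = virt_to_mfn(vi);     /* machine frame holding 'vi' */
        info.offset = offset_in_page(vi);  /* 'vi' must not cross a page */
        info.rsvd   = 0;

        /* Must be issued before the vCPU is brought up when cpu >= 32. */
        return HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
    }

After a successful call the hypervisor delivers that vCPU's event-channel and
time data through the registered page instead of a legacy shared_info slot.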
line source
1 /******************************************************************************
2 * domain_build.c
3 *
4 * Copyright (c) 2002-2005, K A Fraser
5 */
7 #include <xen/config.h>
8 #include <xen/init.h>
9 #include <xen/lib.h>
10 #include <xen/ctype.h>
11 #include <xen/sched.h>
12 #include <xen/smp.h>
13 #include <xen/delay.h>
14 #include <xen/event.h>
15 #include <xen/console.h>
16 #include <xen/kernel.h>
17 #include <xen/domain.h>
18 #include <xen/version.h>
19 #include <xen/iocap.h>
20 #include <xen/bitops.h>
21 #include <xen/compat.h>
22 #include <xen/libelf.h>
23 #include <asm/regs.h>
24 #include <asm/system.h>
25 #include <asm/io.h>
26 #include <asm/processor.h>
27 #include <asm/desc.h>
28 #include <asm/i387.h>
29 #include <asm/paging.h>
30 #include <asm/p2m.h>
31 #include <asm/e820.h>
33 #include <public/version.h>
35 int __init bzimage_parse(
36 char *output, char **image_start, unsigned long *image_len);
38 extern unsigned long initial_images_nrpages(void);
39 extern void discard_initial_images(void);
41 static long __initdata dom0_nrpages;
42 static long __initdata dom0_min_nrpages;
43 static long __initdata dom0_max_nrpages = LONG_MAX;
45 /*
46 * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>]
47 *
48 * <min_amt>: The minimum amount of memory which should be allocated for dom0.
49 * <max_amt>: The maximum amount of memory which should be allocated for dom0.
50 * <amt>: The precise amount of memory to allocate for dom0.
51 *
52 * Notes:
53 * 1. <amt> is clamped from below by <min_amt> and from above by available
54 * memory and <max_amt>
55 * 2. <min_amt> is clamped from above by available memory and <max_amt>
56 * 3. <min_amt> is ignored if it is greater than <max_amt>
57 * 4. If <amt> is not specified, it is calculated as follows:
58 * "All of memory is allocated to domain 0, minus 1/16th which is reserved
59 * for uses such as DMA buffers (the reservation is clamped to 128MB)."
60 *
61 * Each value can be specified as positive or negative:
62 * If +ve: The specified amount is an absolute value.
63 * If -ve: The specified amount is subtracted from total available memory.
64 */
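/*
 * Worked example (added for illustration, not in the original source):
 * with "dom0_mem=min:512M,-1G" on a host with 4GB available, dom0_nrpages
 * starts out as -1G worth of pages, becomes 4G - 1G = 3G once available
 * memory is added in, and is then clamped against min (512M) and max
 * (unset), so dom0 is offered roughly 3GB.
 */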
65 static long __init parse_amt(const char *s, const char **ps)
66 {
67 long pages = parse_size_and_unit((*s == '-') ? s+1 : s, ps) >> PAGE_SHIFT;
68 return (*s == '-') ? -pages : pages;
69 }
70 static void __init parse_dom0_mem(const char *s)
71 {
72 do {
73 if ( !strncmp(s, "min:", 4) )
74 dom0_min_nrpages = parse_amt(s+4, &s);
75 else if ( !strncmp(s, "max:", 4) )
76 dom0_max_nrpages = parse_amt(s+4, &s);
77 else
78 dom0_nrpages = parse_amt(s, &s);
79 if ( *s != ',' )
80 break;
81 } while ( *s++ == ',' );
82 }
83 custom_param("dom0_mem", parse_dom0_mem);
85 static unsigned int __initdata opt_dom0_max_vcpus;
86 integer_param("dom0_max_vcpus", opt_dom0_max_vcpus);
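/*
 * Example (illustrative): booting Xen with "dom0_max_vcpus=8" caps dom0
 * at 8 vCPUs; with no option the count defaults to the number of online
 * physical CPUs and is clamped to MAX_VIRT_CPUS below.
 */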
88 struct vcpu *__init alloc_dom0_vcpu0(void)
89 {
90 if ( opt_dom0_max_vcpus == 0 )
91 opt_dom0_max_vcpus = num_online_cpus();
92 if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS )
93 opt_dom0_max_vcpus = MAX_VIRT_CPUS;
95 dom0->vcpu = xmalloc_array(struct vcpu *, opt_dom0_max_vcpus);
96 if ( !dom0->vcpu )
97 return NULL;
98 memset(dom0->vcpu, 0, opt_dom0_max_vcpus * sizeof(*dom0->vcpu));
99 dom0->max_vcpus = opt_dom0_max_vcpus;
101 return alloc_vcpu(dom0, 0, 0);
102 }
104 static unsigned int opt_dom0_shadow;
105 boolean_param("dom0_shadow", opt_dom0_shadow);
107 static char opt_dom0_ioports_disable[200] = "";
108 string_param("dom0_ioports_disable", opt_dom0_ioports_disable);
110 #if defined(__i386__)
111 /* No ring-3 access in initial leaf page tables. */
112 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
113 #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
114 #define L3_PROT (_PAGE_PRESENT)
115 #elif defined(__x86_64__)
116 /* Allow ring-3 access in long mode as guest cannot use ring 1 ... */
117 #define BASE_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
118 #define L1_PROT (BASE_PROT|_PAGE_GUEST_KERNEL)
119 /* ... except for compatibility mode guests. */
120 #define COMPAT_L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
121 #define L2_PROT (BASE_PROT|_PAGE_DIRTY)
122 #define L3_PROT (BASE_PROT|_PAGE_DIRTY)
123 #define L4_PROT (BASE_PROT|_PAGE_DIRTY)
124 #endif
126 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
127 #define round_pgdown(_p) ((_p)&PAGE_MASK)
129 static struct page_info * __init alloc_chunk(
130 struct domain *d, unsigned long max_pages)
131 {
132 struct page_info *page;
133 unsigned int order;
134 /*
135 * Allocate up to 2MB at a time: It prevents allocating very large chunks
136 * from DMA pools before the >4GB pool is fully depleted.
137 */
138 if ( max_pages > (2UL << (20 - PAGE_SHIFT)) )
139 max_pages = 2UL << (20 - PAGE_SHIFT);
140 order = get_order_from_pages(max_pages);
141 if ( (max_pages & (max_pages-1)) != 0 )
142 order--;
143 while ( (page = alloc_domheap_pages(d, order, 0)) == NULL )
144 if ( order-- == 0 )
145 break;
146 return page;
147 }
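/*
 * Illustration (added): a request for max_pages = 300 gives
 * get_order_from_pages(300) == 9 (512 pages); as 300 is not a power of
 * two the order is dropped to 8 (256 pages, i.e. a 1MB chunk), and the
 * loop then keeps halving the order until the heap can satisfy it.
 */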
149 static unsigned long __init compute_dom0_nr_pages(void)
150 {
151 unsigned long avail = avail_domheap_pages() + initial_images_nrpages();
153 /*
154 * If domain 0 allocation isn't specified, reserve 1/16th of available
155 * memory for things like DMA buffers. This reservation is clamped to
156 * a maximum of 128MB.
157 */
158 if ( dom0_nrpages == 0 )
159 {
160 dom0_nrpages = avail;
161 dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
162 dom0_nrpages = -dom0_nrpages;
163 }
165 /* Negative memory specification means "all memory - specified amount". */
166 if ( dom0_nrpages < 0 ) dom0_nrpages += avail;
167 if ( dom0_min_nrpages < 0 ) dom0_min_nrpages += avail;
168 if ( dom0_max_nrpages < 0 ) dom0_max_nrpages += avail;
170 /* Clamp dom0 memory according to min/max limits and available memory. */
171 dom0_nrpages = max(dom0_nrpages, dom0_min_nrpages);
172 dom0_nrpages = min(dom0_nrpages, dom0_max_nrpages);
173 dom0_nrpages = min(dom0_nrpages, (long)avail);
175 return dom0_nrpages;
176 }
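/*
 * Illustration (added): with 8GB available and no dom0_mem= option the
 * reservation is min(8GB/16, 128MB) = 128MB, so dom0 is offered roughly
 * 8GB - 128MB; an explicit dom0_mem= replaces this default before the
 * min/max/available clamps are applied.
 */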
178 static void __init process_dom0_ioports_disable(void)
179 {
180 unsigned long io_from, io_to;
181 char *t, *s = opt_dom0_ioports_disable;
182 const char *u;
184 if ( *s == '\0' )
185 return;
187 while ( (t = strsep(&s, ",")) != NULL )
188 {
189 io_from = simple_strtoul(t, &u, 16);
190 if ( u == t )
191 {
192 parse_error:
193 printk("Invalid ioport range <%s> "
194 "in dom0_ioports_disable, skipping\n", t);
195 continue;
196 }
198 if ( *u == '\0' )
199 io_to = io_from;
200 else if ( *u == '-' )
201 io_to = simple_strtoul(u + 1, &u, 16);
202 else
203 goto parse_error;
205 if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) )
206 goto parse_error;
208 printk("Disabling dom0 access to ioport range %04lx-%04lx\n",
209 io_from, io_to);
211 if ( ioports_deny_access(dom0, io_from, io_to) != 0 )
212 BUG();
213 }
214 }
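/*
 * Example (illustrative): "dom0_ioports_disable=02f8-02ff,03f8-03ff"
 * parses as two hexadecimal ranges and revokes dom0's access to both
 * legacy serial-port ranges; a single value such as "ec00" denies just
 * that one port.
 */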
216 int __init construct_dom0(
217 struct domain *d,
218 unsigned long _image_base,
219 unsigned long _image_start, unsigned long image_len,
220 unsigned long _initrd_start, unsigned long initrd_len,
221 char *cmdline)
222 {
223 int i, rc, compatible, compat32, order, machine;
224 struct cpu_user_regs *regs;
225 unsigned long pfn, mfn;
226 unsigned long nr_pages;
227 unsigned long nr_pt_pages;
228 unsigned long alloc_spfn;
229 unsigned long alloc_epfn;
230 unsigned long count;
231 struct page_info *page = NULL;
232 start_info_t *si;
233 struct vcpu *v = d->vcpu[0];
234 unsigned long long value;
235 #if defined(__i386__)
236 char *image_base = (char *)_image_base; /* use lowmem mappings */
237 char *image_start = (char *)_image_start; /* use lowmem mappings */
238 char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
239 #elif defined(__x86_64__)
240 char *image_base = __va(_image_base);
241 char *image_start = __va(_image_start);
242 char *initrd_start = __va(_initrd_start);
243 #endif
244 #if CONFIG_PAGING_LEVELS >= 4
245 l4_pgentry_t *l4tab = NULL, *l4start = NULL;
246 #endif
247 l3_pgentry_t *l3tab = NULL, *l3start = NULL;
248 l2_pgentry_t *l2tab = NULL, *l2start = NULL;
249 l1_pgentry_t *l1tab = NULL, *l1start = NULL;
251 /*
252 * This fully describes the memory layout of the initial domain. All
253 * *_start address are page-aligned, except v_start (and v_end) which are
254 * superpage-aligned.
255 */
256 struct elf_binary elf;
257 struct elf_dom_parms parms;
258 unsigned long vkern_start;
259 unsigned long vkern_end;
260 unsigned long vinitrd_start;
261 unsigned long vinitrd_end;
262 unsigned long vphysmap_start;
263 unsigned long vphysmap_end;
264 unsigned long vstartinfo_start;
265 unsigned long vstartinfo_end;
266 unsigned long vstack_start;
267 unsigned long vstack_end;
268 unsigned long vpt_start;
269 unsigned long vpt_end;
270 unsigned long v_start;
271 unsigned long v_end;
273 /* Machine address of next candidate page-table page. */
274 unsigned long mpt_alloc;
276 /* Sanity! */
277 BUG_ON(d->domain_id != 0);
278 BUG_ON(d->vcpu[0] == NULL);
279 BUG_ON(v->is_initialised);
281 printk("*** LOADING DOMAIN 0 ***\n");
283 d->max_pages = ~0U;
285 nr_pages = compute_dom0_nr_pages();
287 if ( (rc = bzimage_parse(image_base, &image_start, &image_len)) != 0 )
288 return rc;
290 if ( (rc = elf_init(&elf, image_start, image_len)) != 0 )
291 return rc;
292 #ifdef VERBOSE
293 elf_set_verbose(&elf);
294 #endif
295 elf_parse_binary(&elf);
296 if ( (rc = elf_xen_parse(&elf, &parms)) != 0 )
297 return rc;
299 /* compatibility check */
300 compatible = 0;
301 compat32 = 0;
302 machine = elf_uval(&elf, elf.ehdr, e_machine);
303 switch (CONFIG_PAGING_LEVELS) {
304 case 3: /* x86_32p */
305 if (parms.pae == PAEKERN_bimodal)
306 parms.pae = PAEKERN_extended_cr3;
307 printk(" Xen kernel: 32-bit, PAE, lsb\n");
308 if (elf_32bit(&elf) && parms.pae && machine == EM_386)
309 compatible = 1;
310 break;
311 case 4: /* x86_64 */
312 printk(" Xen kernel: 64-bit, lsb, compat32\n");
313 if (elf_32bit(&elf) && parms.pae == PAEKERN_bimodal)
314 parms.pae = PAEKERN_extended_cr3;
315 if (elf_32bit(&elf) && parms.pae && machine == EM_386)
316 {
317 compat32 = 1;
318 compatible = 1;
319 }
320 if (elf_64bit(&elf) && machine == EM_X86_64)
321 compatible = 1;
322 break;
323 }
324 printk(" Dom0 kernel: %s%s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
325 elf_64bit(&elf) ? "64-bit" : "32-bit",
326 parms.pae ? ", PAE" : "",
327 elf_msb(&elf) ? "msb" : "lsb",
328 elf.pstart, elf.pend);
329 if ( elf.bsd_symtab_pstart )
330 printk(" Dom0 symbol map 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
331 elf.bsd_symtab_pstart, elf.bsd_symtab_pend);
333 if ( !compatible )
334 {
335 printk("Mismatch between Xen and DOM0 kernel\n");
336 return -EINVAL;
337 }
339 #if defined(__x86_64__)
340 if ( compat32 )
341 {
342 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
343 v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0];
345 if ( nr_pages != (unsigned int)nr_pages )
346 nr_pages = UINT_MAX;
347 }
348 #endif
350 if ( parms.pae == PAEKERN_extended_cr3 )
351 set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist);
353 if ( (parms.virt_hv_start_low != UNSET_ADDR) && elf_32bit(&elf) )
354 {
355 unsigned long mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
356 value = (parms.virt_hv_start_low + mask) & ~mask;
357 BUG_ON(!is_pv_32bit_domain(d));
358 #if defined(__i386__)
359 if ( value > HYPERVISOR_VIRT_START )
360 panic("Domain 0 expects too high a hypervisor start address.\n");
361 #else
362 if ( value > __HYPERVISOR_COMPAT_VIRT_START )
363 panic("Domain 0 expects too high a hypervisor start address.\n");
364 HYPERVISOR_COMPAT_VIRT_START(d) =
365 max_t(unsigned int, m2p_compat_vstart, value);
366 #endif
367 }
369 if ( (parms.p2m_base != UNSET_ADDR) && elf_32bit(&elf) )
370 {
371 printk(XENLOG_WARNING "P2M table base ignored\n");
372 parms.p2m_base = UNSET_ADDR;
373 }
375 domain_set_alloc_bitsize(d);
377 /*
378 * Why do we need this? The number of page-table frames depends on the
379 * size of the bootstrap address space. But the size of the address space
380 * depends on the number of page-table frames (since each one is mapped
381 * read-only). We have a pair of simultaneous equations in two unknowns,
382 * which we solve by exhaustive search.
383 */
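/*
 * Worked example (added for illustration; x86_64, 64-bit dom0, suitably
 * aligned): a bootstrap area of about 40MB needs roughly 1 L4 page, 1 L3
 * page, 1 L2 page and 20 L1 pages (one per 2MB of virtual space), so the
 * search below terminates once nr_pt_pages reaches 23; each extra
 * candidate page grows v_end by only 4kB, so it converges quickly.
 */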
384 v_start = parms.virt_base;
385 vkern_start = parms.virt_kstart;
386 vkern_end = parms.virt_kend;
387 vinitrd_start = round_pgup(vkern_end);
388 vinitrd_end = vinitrd_start + initrd_len;
389 vphysmap_start = round_pgup(vinitrd_end);
390 vphysmap_end = vphysmap_start + (nr_pages * (!is_pv_32on64_domain(d) ?
391 sizeof(unsigned long) :
392 sizeof(unsigned int)));
393 if ( parms.p2m_base != UNSET_ADDR )
394 vphysmap_end = vphysmap_start;
395 vstartinfo_start = round_pgup(vphysmap_end);
396 vstartinfo_end = (vstartinfo_start +
397 sizeof(struct start_info) +
398 sizeof(struct dom0_vga_console_info));
399 vpt_start = round_pgup(vstartinfo_end);
400 for ( nr_pt_pages = 2; ; nr_pt_pages++ )
401 {
402 vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
403 vstack_start = vpt_end;
404 vstack_end = vstack_start + PAGE_SIZE;
405 v_end = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
406 if ( (v_end - vstack_end) < (512UL << 10) )
407 v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
408 #if defined(__i386__)
409 /* 5 pages: 1x 3rd + 4x 2nd level */
410 if ( (((v_end - v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
411 L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
412 break;
413 #elif defined(__x86_64__)
414 #define NR(_l,_h,_s) \
415 (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
416 ((_l) & ~((1UL<<(_s))-1))) >> (_s))
417 if ( (1 + /* # L4 */
418 NR(v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
419 (!is_pv_32on64_domain(d) ?
420 NR(v_start, v_end, L3_PAGETABLE_SHIFT) : /* # L2 */
421 4) + /* # compat L2 */
422 NR(v_start, v_end, L2_PAGETABLE_SHIFT)) /* # L1 */
423 <= nr_pt_pages )
424 break;
425 #endif
426 }
428 order = get_order_from_bytes(v_end - v_start);
429 if ( (1UL << order) > nr_pages )
430 panic("Domain 0 allocation is too small for kernel image.\n");
432 #ifdef __i386__
433 /* Ensure that our low-memory 1:1 mapping covers the allocation. */
434 page = alloc_domheap_pages(d, order, MEMF_bits(30));
435 #else
436 if ( parms.p2m_base != UNSET_ADDR )
437 {
438 vphysmap_start = parms.p2m_base;
439 vphysmap_end = vphysmap_start + nr_pages * sizeof(unsigned long);
440 }
441 page = alloc_domheap_pages(d, order, 0);
442 #endif
443 if ( page == NULL )
444 panic("Not enough RAM for domain 0 allocation.\n");
445 alloc_spfn = page_to_mfn(page);
446 alloc_epfn = alloc_spfn + d->tot_pages;
448 printk("PHYSICAL MEMORY ARRANGEMENT:\n"
449 " Dom0 alloc.: %"PRIpaddr"->%"PRIpaddr,
450 pfn_to_paddr(alloc_spfn), pfn_to_paddr(alloc_epfn));
451 if ( d->tot_pages < nr_pages )
452 printk(" (%lu pages to be allocated)",
453 nr_pages - d->tot_pages);
454 printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
455 " Loaded kernel: %p->%p\n"
456 " Init. ramdisk: %p->%p\n"
457 " Phys-Mach map: %p->%p\n"
458 " Start info: %p->%p\n"
459 " Page tables: %p->%p\n"
460 " Boot stack: %p->%p\n"
461 " TOTAL: %p->%p\n",
462 _p(vkern_start), _p(vkern_end),
463 _p(vinitrd_start), _p(vinitrd_end),
464 _p(vphysmap_start), _p(vphysmap_end),
465 _p(vstartinfo_start), _p(vstartinfo_end),
466 _p(vpt_start), _p(vpt_end),
467 _p(vstack_start), _p(vstack_end),
468 _p(v_start), _p(v_end));
469 printk(" ENTRY ADDRESS: %p\n", _p(parms.virt_entry));
471 mpt_alloc = (vpt_start - v_start) +
472 (unsigned long)pfn_to_paddr(alloc_spfn);
474 #if defined(__i386__)
475 /*
476 * Protect the lowest 1GB of memory. We use a temporary mapping there
477 * from which we copy the kernel and ramdisk images.
478 */
479 if ( v_start < (1UL<<30) )
480 {
481 printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
482 return -EINVAL;
483 }
485 /* WARNING: The new domain must have its 'processor' field filled in! */
486 l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
487 l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
488 for (i = 0; i < L3_PAGETABLE_ENTRIES; i++) {
489 copy_page(l2tab + i * L2_PAGETABLE_ENTRIES,
490 idle_pg_table_l2 + i * L2_PAGETABLE_ENTRIES);
491 l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
492 l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
493 l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
494 }
495 v->arch.guest_table = pagetable_from_paddr((unsigned long)l3start);
497 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
498 l2tab[l2_linear_offset(PERDOMAIN_VIRT_START) + i] =
499 l2e_from_page(perdomain_pt_page(d, i), __PAGE_HYPERVISOR);
501 l2tab += l2_linear_offset(v_start);
502 mfn = alloc_spfn;
503 for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
504 {
505 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
506 {
507 l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
508 mpt_alloc += PAGE_SIZE;
509 *l2tab = l2e_from_paddr((unsigned long)l1start, L2_PROT);
510 l2tab++;
511 clear_page(l1tab);
512 if ( count == 0 )
513 l1tab += l1_table_offset(v_start);
514 }
515 *l1tab = l1e_from_pfn(mfn, L1_PROT);
516 l1tab++;
518 page = mfn_to_page(mfn);
519 if ( !get_page_and_type(page, d, PGT_writable_page) )
520 BUG();
522 mfn++;
523 }
525 /* Pages that are part of page tables must be read only. */
526 l2tab = l2start + l2_linear_offset(vpt_start);
527 l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*l2tab);
528 l1tab += l1_table_offset(vpt_start);
529 for ( count = 0; count < nr_pt_pages; count++ )
530 {
531 page = mfn_to_page(l1e_get_pfn(*l1tab));
532 if ( !opt_dom0_shadow )
533 l1e_remove_flags(*l1tab, _PAGE_RW);
534 else
535 if ( !get_page_type(page, PGT_writable_page) )
536 BUG();
538 switch ( count )
539 {
540 case 0:
541 page->u.inuse.type_info &= ~PGT_type_mask;
542 page->u.inuse.type_info |= PGT_l3_page_table;
543 get_page(page, d); /* an extra ref because of readable mapping */
545 /* Get another ref to L3 page so that it can be pinned. */
546 page->u.inuse.type_info++;
547 page->count_info++;
548 set_bit(_PGT_pinned, &page->u.inuse.type_info);
549 break;
550 case 1 ... 4:
551 page->u.inuse.type_info &= ~PGT_type_mask;
552 page->u.inuse.type_info |= PGT_l2_page_table;
553 if ( count == 4 )
554 page->u.inuse.type_info |= PGT_pae_xen_l2;
555 get_page(page, d); /* an extra ref because of readable mapping */
556 break;
557 default:
558 page->u.inuse.type_info &= ~PGT_type_mask;
559 page->u.inuse.type_info |= PGT_l1_page_table;
560 get_page(page, d); /* an extra ref because of readable mapping */
561 break;
562 }
563 if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
564 l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*++l2tab);
565 }
567 #elif defined(__x86_64__)
569 /* Overlap with Xen protected area? */
570 if ( !is_pv_32on64_domain(d) ?
571 ((v_start < HYPERVISOR_VIRT_END) &&
572 (v_end > HYPERVISOR_VIRT_START)) :
573 (v_end > HYPERVISOR_COMPAT_VIRT_START(d)) )
574 {
575 printk("DOM0 image overlaps with Xen private area.\n");
576 return -EINVAL;
577 }
579 if ( is_pv_32on64_domain(d) )
580 {
581 v->arch.guest_context.failsafe_callback_cs = FLAT_COMPAT_KERNEL_CS;
582 v->arch.guest_context.event_callback_cs = FLAT_COMPAT_KERNEL_CS;
583 }
585 /* WARNING: The new domain must have its 'processor' field filled in! */
586 if ( !is_pv_32on64_domain(d) )
587 {
588 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
589 l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
590 }
591 else
592 {
593 page = alloc_domheap_page(NULL, 0);
594 if ( !page )
595 panic("Not enough RAM for domain 0 PML4.\n");
596 page->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
597 l4start = l4tab = page_to_virt(page);
598 }
599 copy_page(l4tab, idle_pg_table);
600 l4tab[0] = l4e_empty(); /* zap trampoline mapping */
601 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
602 l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
603 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
604 l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
605 v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
606 if ( is_pv_32on64_domain(d) )
607 v->arch.guest_table_user = v->arch.guest_table;
609 l4tab += l4_table_offset(v_start);
610 mfn = alloc_spfn;
611 for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
612 {
613 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
614 {
615 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
616 l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
617 clear_page(l1tab);
618 if ( count == 0 )
619 l1tab += l1_table_offset(v_start);
620 if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
621 {
622 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
623 l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
624 clear_page(l2tab);
625 if ( count == 0 )
626 l2tab += l2_table_offset(v_start);
627 if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
628 {
629 maddr_to_page(mpt_alloc)->u.inuse.type_info =
630 PGT_l3_page_table;
631 l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
632 clear_page(l3tab);
633 if ( count == 0 )
634 l3tab += l3_table_offset(v_start);
635 *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
636 l4tab++;
637 }
638 *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
639 l3tab++;
640 }
641 *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
642 l2tab++;
643 }
644 *l1tab = l1e_from_pfn(mfn, (!is_pv_32on64_domain(d) ?
645 L1_PROT : COMPAT_L1_PROT));
646 l1tab++;
648 page = mfn_to_page(mfn);
649 if ( (page->u.inuse.type_info == 0) &&
650 !get_page_and_type(page, d, PGT_writable_page) )
651 BUG();
653 mfn++;
654 }
656 if ( is_pv_32on64_domain(d) )
657 {
658 /* Ensure the first four L3 entries are all populated. */
659 for ( i = 0, l3tab = l3start; i < 4; ++i, ++l3tab )
660 {
661 if ( !l3e_get_intpte(*l3tab) )
662 {
663 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
664 l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
665 clear_page(l2tab);
666 *l3tab = l3e_from_paddr(__pa(l2tab), L3_PROT);
667 }
668 if ( i == 3 )
669 l3e_get_page(*l3tab)->u.inuse.type_info |= PGT_pae_xen_l2;
670 }
671 /* Install read-only guest visible MPT mapping. */
672 l2tab = l3e_to_l2e(l3start[3]);
673 memcpy(&l2tab[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
674 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
675 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab));
676 }
678 /* Pages that are part of page tables must be read only. */
679 l4tab = l4start + l4_table_offset(vpt_start);
680 l3start = l3tab = l4e_to_l3e(*l4tab);
681 l3tab += l3_table_offset(vpt_start);
682 l2start = l2tab = l3e_to_l2e(*l3tab);
683 l2tab += l2_table_offset(vpt_start);
684 l1start = l1tab = l2e_to_l1e(*l2tab);
685 l1tab += l1_table_offset(vpt_start);
686 for ( count = 0; count < nr_pt_pages; count++ )
687 {
688 l1e_remove_flags(*l1tab, _PAGE_RW);
689 page = mfn_to_page(l1e_get_pfn(*l1tab));
691 /* Read-only mapping + PGC_allocated + page-table page. */
692 page->count_info = PGC_allocated | 3;
693 page->u.inuse.type_info |= PGT_validated | 1;
695 /* Top-level p.t. is pinned. */
696 if ( (page->u.inuse.type_info & PGT_type_mask) ==
697 (!is_pv_32on64_domain(d) ?
698 PGT_l4_page_table : PGT_l3_page_table) )
699 {
700 page->count_info += 1;
701 page->u.inuse.type_info += 1 | PGT_pinned;
702 }
704 /* Iterate. */
705 if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
706 {
707 if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
708 {
709 if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
710 l3start = l3tab = l4e_to_l3e(*++l4tab);
711 l2start = l2tab = l3e_to_l2e(*l3tab);
712 }
713 l1start = l1tab = l2e_to_l1e(*l2tab);
714 }
715 }
717 #endif /* __x86_64__ */
719 /* Mask all upcalls... */
720 for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
721 shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1;
723 printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);
725 for ( i = 1; i < opt_dom0_max_vcpus; i++ )
726 (void)alloc_vcpu(d, i, i % num_online_cpus());
728 /* Set up CR3 value for write_ptbase */
729 if ( paging_mode_enabled(d) )
730 paging_update_paging_modes(v);
731 else
732 update_cr3(v);
734 /* We run on dom0's page tables for the final part of the build process. */
735 write_ptbase(v);
737 /* Copy the OS image and free temporary buffer. */
738 elf.dest = (void*)vkern_start;
739 elf_load_binary(&elf);
741 if ( UNSET_ADDR != parms.virt_hypercall )
742 {
743 if ( (parms.virt_hypercall < v_start) ||
744 (parms.virt_hypercall >= v_end) )
745 {
746 write_ptbase(current);
747 printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
748 return -1;
749 }
750 hypercall_page_initialise(
751 d, (void *)(unsigned long)parms.virt_hypercall);
752 }
754 /* Copy the initial ramdisk. */
755 if ( initrd_len != 0 )
756 memcpy((void *)vinitrd_start, initrd_start, initrd_len);
758 /* Free temporary buffers. */
759 discard_initial_images();
761 /* Set up start info area. */
762 si = (start_info_t *)vstartinfo_start;
763 clear_page(si);
764 si->nr_pages = nr_pages;
766 si->shared_info = virt_to_maddr(d->shared_info);
768 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
769 si->flags |= (xen_processor_pmbits << 8) & SIF_PM_MASK;
770 si->pt_base = vpt_start + 2 * PAGE_SIZE * !!is_pv_32on64_domain(d);
771 si->nr_pt_frames = nr_pt_pages;
772 si->mfn_list = vphysmap_start;
773 snprintf(si->magic, sizeof(si->magic), "xen-3.0-x86_%d%s",
774 elf_64bit(&elf) ? 64 : 32, parms.pae ? "p" : "");
776 count = d->tot_pages;
777 #ifdef __x86_64__
778 /* Set up the phys->machine table if not part of the initial mapping. */
779 if ( parms.p2m_base != UNSET_ADDR )
780 {
781 unsigned long va = vphysmap_start;
783 if ( v_start <= vphysmap_end && vphysmap_start <= v_end )
784 panic("DOM0 P->M table overlaps initial mapping");
786 while ( va < vphysmap_end )
787 {
788 if ( d->tot_pages + ((round_pgup(vphysmap_end) - va)
789 >> PAGE_SHIFT) + 3 > nr_pages )
790 panic("Dom0 allocation too small for initial P->M table.\n");
792 l4tab = l4start + l4_table_offset(va);
793 if ( !l4e_get_intpte(*l4tab) )
794 {
795 page = alloc_domheap_page(d, 0);
796 if ( !page )
797 break;
798 /* No mapping, PGC_allocated + page-table page. */
799 page->count_info = PGC_allocated | 2;
800 page->u.inuse.type_info =
801 PGT_l3_page_table | PGT_validated | 1;
802 clear_page(page_to_virt(page));
803 *l4tab = l4e_from_page(page, L4_PROT);
804 }
805 l3tab = page_to_virt(l4e_get_page(*l4tab));
806 l3tab += l3_table_offset(va);
807 if ( !l3e_get_intpte(*l3tab) )
808 {
809 if ( cpu_has_page1gb &&
810 !(va & ((1UL << L3_PAGETABLE_SHIFT) - 1)) &&
811 vphysmap_end >= va + (1UL << L3_PAGETABLE_SHIFT) &&
812 (page = alloc_domheap_pages(d,
813 L3_PAGETABLE_SHIFT -
814 PAGE_SHIFT,
815 0)) != NULL )
816 {
817 *l3tab = l3e_from_page(page,
818 L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
819 va += 1UL << L3_PAGETABLE_SHIFT;
820 continue;
821 }
822 if ( (page = alloc_domheap_page(d, 0)) == NULL )
823 break;
824 else
825 {
826 /* No mapping, PGC_allocated + page-table page. */
827 page->count_info = PGC_allocated | 2;
828 page->u.inuse.type_info =
829 PGT_l2_page_table | PGT_validated | 1;
830 clear_page(page_to_virt(page));
831 *l3tab = l3e_from_page(page, L3_PROT);
832 }
833 }
834 l2tab = page_to_virt(l3e_get_page(*l3tab));
835 l2tab += l2_table_offset(va);
836 if ( !l2e_get_intpte(*l2tab) )
837 {
838 if ( !(va & ((1UL << L2_PAGETABLE_SHIFT) - 1)) &&
839 vphysmap_end >= va + (1UL << L2_PAGETABLE_SHIFT) &&
840 (page = alloc_domheap_pages(d,
841 L2_PAGETABLE_SHIFT -
842 PAGE_SHIFT,
843 0)) != NULL )
844 {
845 *l2tab = l2e_from_page(page,
846 L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
847 va += 1UL << L2_PAGETABLE_SHIFT;
848 continue;
849 }
850 if ( (page = alloc_domheap_page(d, 0)) == NULL )
851 break;
852 else
853 {
854 /* No mapping, PGC_allocated + page-table page. */
855 page->count_info = PGC_allocated | 2;
856 page->u.inuse.type_info =
857 PGT_l1_page_table | PGT_validated | 1;
858 clear_page(page_to_virt(page));
859 *l2tab = l2e_from_page(page, L2_PROT);
860 }
861 }
862 l1tab = page_to_virt(l2e_get_page(*l2tab));
863 l1tab += l1_table_offset(va);
864 BUG_ON(l1e_get_intpte(*l1tab));
865 page = alloc_domheap_page(d, 0);
866 if ( !page )
867 break;
868 *l1tab = l1e_from_page(page, L1_PROT|_PAGE_DIRTY);
869 va += PAGE_SIZE;
870 va &= PAGE_MASK;
871 }
872 if ( !page )
873 panic("Not enough RAM for DOM0 P->M table.\n");
874 }
875 #endif
877 /* Write the phys->machine and machine->phys table entries. */
878 for ( pfn = 0; pfn < count; pfn++ )
879 {
880 mfn = pfn + alloc_spfn;
881 #ifndef NDEBUG
882 #define REVERSE_START ((v_end - v_start) >> PAGE_SHIFT)
883 if ( pfn > REVERSE_START )
884 mfn = alloc_epfn - (pfn - REVERSE_START);
885 #endif
886 if ( !is_pv_32on64_domain(d) )
887 ((unsigned long *)vphysmap_start)[pfn] = mfn;
888 else
889 ((unsigned int *)vphysmap_start)[pfn] = mfn;
890 set_gpfn_from_mfn(mfn, pfn);
891 }
892 si->first_p2m_pfn = pfn;
893 si->nr_p2m_frames = d->tot_pages - count;
894 page_list_for_each ( page, &d->page_list )
895 {
896 mfn = page_to_mfn(page);
897 if ( get_gpfn_from_mfn(mfn) >= count )
898 {
899 BUG_ON(is_pv_32bit_domain(d));
900 if ( !page->u.inuse.type_info &&
901 !get_page_and_type(page, d, PGT_writable_page) )
902 BUG();
903 ((unsigned long *)vphysmap_start)[pfn] = mfn;
904 set_gpfn_from_mfn(mfn, pfn);
905 ++pfn;
906 #ifndef NDEBUG
907 ++alloc_epfn;
908 #endif
909 }
910 }
911 BUG_ON(pfn != d->tot_pages);
912 while ( pfn < nr_pages )
913 {
914 if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
915 panic("Not enough RAM for DOM0 reservation.\n");
916 while ( pfn < d->tot_pages )
917 {
918 mfn = page_to_mfn(page);
919 #ifndef NDEBUG
920 #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
921 #endif
922 if ( !is_pv_32on64_domain(d) )
923 ((unsigned long *)vphysmap_start)[pfn] = mfn;
924 else
925 ((unsigned int *)vphysmap_start)[pfn] = mfn;
926 set_gpfn_from_mfn(mfn, pfn);
927 #undef pfn
928 page++; pfn++;
929 }
930 }
932 if ( initrd_len != 0 )
933 {
934 si->mod_start = vinitrd_start;
935 si->mod_len = initrd_len;
936 }
938 memset(si->cmd_line, 0, sizeof(si->cmd_line));
939 if ( cmdline != NULL )
940 strlcpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line));
942 if ( fill_console_start_info((void *)(si + 1)) )
943 {
944 si->console.dom0.info_off = sizeof(struct start_info);
945 si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
946 }
948 #if defined(__x86_64__)
949 if ( is_pv_32on64_domain(d) )
950 xlat_start_info(si, XLAT_start_info_console_dom0);
951 #endif
953 /* Return to idle domain's page tables. */
954 write_ptbase(current);
956 #if defined(__i386__)
957 /* Destroy low mappings - they were only for our convenience. */
958 zap_low_mappings(l2start);
959 #endif
961 update_domain_wallclock_time(d);
963 v->is_initialised = 1;
964 clear_bit(_VPF_down, &v->pause_flags);
966 /*
967 * Initial register values:
968 * DS,ES,FS,GS = FLAT_KERNEL_DS
969 * CS:EIP = FLAT_KERNEL_CS:start_pc
970 * SS:ESP = FLAT_KERNEL_SS:start_stack
971 * ESI = start_info
972 * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
973 */
974 regs = &v->arch.guest_context.user_regs;
975 regs->ds = regs->es = regs->fs = regs->gs =
976 !is_pv_32on64_domain(d) ? FLAT_KERNEL_DS : FLAT_COMPAT_KERNEL_DS;
977 regs->ss = (!is_pv_32on64_domain(d) ?
978 FLAT_KERNEL_SS : FLAT_COMPAT_KERNEL_SS);
979 regs->cs = (!is_pv_32on64_domain(d) ?
980 FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS);
981 regs->eip = parms.virt_entry;
982 regs->esp = vstack_end;
983 regs->esi = vstartinfo_start;
984 regs->eflags = X86_EFLAGS_IF;
986 if ( opt_dom0_shadow )
987 if ( paging_enable(d, PG_SH_enable) == 0 )
988 paging_update_paging_modes(v);
990 if ( supervisor_mode_kernel )
991 {
992 v->arch.guest_context.kernel_ss &= ~3;
993 v->arch.guest_context.user_regs.ss &= ~3;
994 v->arch.guest_context.user_regs.es &= ~3;
995 v->arch.guest_context.user_regs.ds &= ~3;
996 v->arch.guest_context.user_regs.fs &= ~3;
997 v->arch.guest_context.user_regs.gs &= ~3;
998 printk("Dom0 runs in ring 0 (supervisor mode)\n");
999 if ( !test_bit(XENFEAT_supervisor_mode_kernel,
1000 parms.f_supported) )
1001 panic("Dom0 does not support supervisor-mode execution\n");
1002 }
1003 else
1004 {
1005 if ( test_bit(XENFEAT_supervisor_mode_kernel, parms.f_required) )
1006 panic("Dom0 requires supervisor-mode execution\n");
1007 }
1009 rc = 0;
1011 /* DOM0 is permitted full I/O capabilities. */
1012 rc |= ioports_permit_access(dom0, 0, 0xFFFF);
1013 rc |= iomem_permit_access(dom0, 0UL, ~0UL);
1014 rc |= irqs_permit_access(dom0, 0, d->nr_pirqs - 1);
1016 /*
1017 * Modify I/O port access permissions.
1018 */
1019 /* Master Interrupt Controller (PIC). */
1020 rc |= ioports_deny_access(dom0, 0x20, 0x21);
1021 /* Slave Interrupt Controller (PIC). */
1022 rc |= ioports_deny_access(dom0, 0xA0, 0xA1);
1023 /* Interval Timer (PIT). */
1024 rc |= ioports_deny_access(dom0, 0x40, 0x43);
1025 /* PIT Channel 2 / PC Speaker Control. */
1026 rc |= ioports_deny_access(dom0, 0x61, 0x61);
1027 /* PCI configuration space (NB. 0xcf8 has special treatment). */
1028 rc |= ioports_deny_access(dom0, 0xcfc, 0xcff);
1029 /* Command-line I/O ranges. */
1030 process_dom0_ioports_disable();
1032 /*
1033 * Modify I/O memory access permissions.
1034 */
1035 /* Local APIC. */
1036 if ( mp_lapic_addr != 0 )
1037 {
1038 mfn = paddr_to_pfn(mp_lapic_addr);
1039 rc |= iomem_deny_access(dom0, mfn, mfn);
1040 }
1041 /* I/O APICs. */
1042 for ( i = 0; i < nr_ioapics; i++ )
1043 {
1044 mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr);
1045 if ( smp_found_config )
1046 rc |= iomem_deny_access(dom0, mfn, mfn);
1047 }
1049 /* Remove access to E820_UNUSABLE I/O regions above 1MB. */
1050 for ( i = 0; i < e820.nr_map; i++ )
1051 {
1052 unsigned long sfn, efn;
1053 sfn = max_t(unsigned long, paddr_to_pfn(e820.map[i].addr), 0x100ul);
1054 efn = paddr_to_pfn(e820.map[i].addr + e820.map[i].size - 1);
1055 if ( (e820.map[i].type == E820_UNUSABLE) &&
1056 (e820.map[i].size != 0) &&
1057 (sfn <= efn) )
1058 rc |= iomem_deny_access(dom0, sfn, efn);
1059 }
1061 BUG_ON(rc != 0);
1063 return 0;
1065 }
1066 /*
1067 * Local variables:
1068 * mode: C
1069 * c-set-style: "BSD"
1070 * c-basic-offset: 4
1071 * tab-width: 4
1072 * indent-tabs-mode: nil
1073 * End:
1074 */