debuggers.hg

view xen/arch/x86/setup.c @ 16959:ed8ab1a36b09

x86-64: use 1GB pages in 1:1 mapping if available

At once adjust the 2/4Mb page handling slightly in a few places (to
match the newly added code):
- when re-creating a large page mapping after finding that all small
page mappings in the respective area are using identical flags and
suitable MFNs, the virtual address was already incremented past the
area to be dealt with, which needs to be accounted for in the
invocation of flush_area() in that path
- don't or-in/and-out _PAGE_PSE on non-present pages
- when comparing flags, try to minimise the number of l1f_to_lNf()/
lNf_to_l1f() instances used
- instead of skipping a single page when encountering a big page
mapping equalling to what a small page mapping would establish, skip
to the next larger page boundary

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Jan 28 10:17:05 2008 +0000 (2008-01-28)
parents 76601c290fa9
children 96b099ad0497
line source
1 #include <xen/config.h>
2 #include <xen/init.h>
3 #include <xen/lib.h>
4 #include <xen/sched.h>
5 #include <xen/domain.h>
6 #include <xen/serial.h>
7 #include <xen/softirq.h>
8 #include <xen/acpi.h>
9 #include <xen/console.h>
10 #include <xen/serial.h>
11 #include <xen/trace.h>
12 #include <xen/multiboot.h>
13 #include <xen/domain_page.h>
14 #include <xen/version.h>
15 #include <xen/gdbstub.h>
16 #include <xen/percpu.h>
17 #include <xen/hypercall.h>
18 #include <xen/keyhandler.h>
19 #include <xen/numa.h>
20 #include <xen/rcupdate.h>
21 #include <xen/vga.h>
22 #include <xen/dmi.h>
23 #include <public/version.h>
24 #ifdef CONFIG_COMPAT
25 #include <compat/platform.h>
26 #include <compat/xen.h>
27 #endif
28 #include <asm/bitops.h>
29 #include <asm/smp.h>
30 #include <asm/processor.h>
31 #include <asm/mpspec.h>
32 #include <asm/apic.h>
33 #include <asm/desc.h>
34 #include <asm/paging.h>
35 #include <asm/e820.h>
36 #include <xsm/acm/acm_hooks.h>
37 #include <xen/kexec.h>
38 #include <asm/edd.h>
39 #include <xsm/xsm.h>
40 #include <asm/tboot.h>
42 #if defined(CONFIG_X86_64)
43 #define BOOTSTRAP_DIRECTMAP_END (1UL << 32) /* 4GB */
44 #define maddr_to_bootstrap_virt(m) maddr_to_virt(m)
45 #else
46 #define BOOTSTRAP_DIRECTMAP_END (1UL << 30) /* 1GB */
47 #define maddr_to_bootstrap_virt(m) ((void *)(long)(m))
48 #endif
50 extern void generic_apic_probe(void);
51 extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
53 extern u16 boot_edid_caps;
54 extern u8 boot_edid_info[128];
55 extern struct boot_video_info boot_vid_info;
57 /*
58 * opt_xenheap_megabytes: Size of Xen heap in megabytes, excluding the
59 * page_info table and allocation bitmap.
60 */
61 static unsigned int opt_xenheap_megabytes = XENHEAP_DEFAULT_MB;
62 #if defined(CONFIG_X86_64)
63 integer_param("xenheap_megabytes", opt_xenheap_megabytes);
64 #endif
66 /* opt_nosmp: If true, secondary processors are ignored. */
67 static int opt_nosmp = 0;
68 boolean_param("nosmp", opt_nosmp);
70 /* maxcpus: maximum number of CPUs to activate. */
71 static unsigned int max_cpus = NR_CPUS;
72 integer_param("maxcpus", max_cpus);
74 /* opt_watchdog: If true, run a watchdog NMI on each processor. */
75 static int opt_watchdog = 0;
76 boolean_param("watchdog", opt_watchdog);
78 /* **** Linux config option: propagated to domain0. */
79 /* "acpi=off": Disables both ACPI table parsing and interpreter. */
80 /* "acpi=force": Override the disable blacklist. */
81 /* "acpi=strict": Disables out-of-spec workarounds. */
82 /* "acpi=ht": Limit ACPI just to boot-time to enable HT. */
83 /* "acpi=noirq": Disables ACPI interrupt routing. */
84 static void parse_acpi_param(char *s);
85 custom_param("acpi", parse_acpi_param);
87 /* **** Linux config option: propagated to domain0. */
88 /* acpi_skip_timer_override: Skip IRQ0 overrides. */
89 extern int acpi_skip_timer_override;
90 boolean_param("acpi_skip_timer_override", acpi_skip_timer_override);
92 /* **** Linux config option: propagated to domain0. */
93 /* noapic: Disable IOAPIC setup. */
94 extern int skip_ioapic_setup;
95 boolean_param("noapic", skip_ioapic_setup);
/* Cleared in __start_xen() once the boot allocator has been dismantled. */
97 int early_boot = 1;
99 cpumask_t cpu_present_map;
101 unsigned long xen_phys_start;
103 /* Limits of Xen heap, used to initialise the allocator. */
104 unsigned long xenheap_phys_start, xenheap_phys_end;
106 extern void arch_init_memory(void);
107 extern void init_IRQ(void);
108 extern void early_time_init(void);
109 extern void early_cpu_init(void);
110 extern void vesa_init(void);
111 extern void vesa_mtrr_init(void);
113 struct tss_struct init_tss[NR_CPUS];
/* Statically-allocated boot-CPU stack, placed in its own aligned section. */
115 char __attribute__ ((__section__(".bss.stack_aligned"))) cpu0_stack[STACK_SIZE];
117 struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1 };
119 #if CONFIG_PAGING_LEVELS > 2
120 unsigned long mmu_cr4_features = X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE;
121 #else
122 unsigned long mmu_cr4_features = X86_CR4_PSE;
123 #endif
124 EXPORT_SYMBOL(mmu_cr4_features);
126 int acpi_disabled;
128 int acpi_force;
129 char acpi_param[10] = "";
130 static void __init parse_acpi_param(char *s)
131 {
132 /* Save the parameter so it can be propagated to domain0. */
133 safe_strcpy(acpi_param, s);
135 /* Interpret the parameter for use within Xen. */
136 if ( !strcmp(s, "off") )
137 {
138 disable_acpi();
139 }
140 else if ( !strcmp(s, "force") )
141 {
142 acpi_force = 1;
143 acpi_ht = 1;
144 acpi_disabled = 0;
145 }
146 else if ( !strcmp(s, "strict") )
147 {
148 acpi_strict = 1;
149 }
150 else if ( !strcmp(s, "ht") )
151 {
152 if ( !acpi_force )
153 disable_acpi();
154 acpi_ht = 1;
155 }
156 else if ( !strcmp(s, "noirq") )
157 {
158 acpi_noirq_set();
159 }
160 }
162 static void __init do_initcalls(void)
163 {
164 initcall_t *call;
165 for ( call = &__initcall_start; call < &__initcall_end; call++ )
166 (*call)();
167 }
/* Report a fatal early-boot error via printk() and halt the CPU forever. */
169 #define EARLY_FAIL(f, a...) do { \
170 printk( f , ## a ); \
171 for ( ; ; ) halt(); \
172 } while (0)
/* Physical bounds of the relocated multiboot modules (page-aligned; see
 * initial_images_nrpages()). */
174 static unsigned long __initdata initial_images_start, initial_images_end;
176 unsigned long __init initial_images_nrpages(void)
177 {
178 ASSERT(!(initial_images_start & ~PAGE_MASK));
179 ASSERT(!(initial_images_end & ~PAGE_MASK));
180 return ((initial_images_end >> PAGE_SHIFT) -
181 (initial_images_start >> PAGE_SHIFT));
182 }
184 void __init discard_initial_images(void)
185 {
186 init_domheap_pages(initial_images_start, initial_images_end);
187 }
189 extern char __per_cpu_start[], __per_cpu_data_end[], __per_cpu_end[];
/*
 * Copy CPU0's per-CPU data template into the slot of every other possible
 * CPU, then release and/or guard the slots of CPUs that can never come
 * online.  Requires cpu_possible_map to be hole-free (BUG_ON otherwise).
 */
191 static void __init percpu_init_areas(void)
192 {
193 unsigned int i, data_size = __per_cpu_data_end - __per_cpu_start;
194 unsigned int first_unused;
196 BUG_ON(data_size > PERCPU_SIZE);
198 /* Initialise per-cpu data area for all possible secondary CPUs. */
199 for ( i = 1; (i < NR_CPUS) && cpu_possible(i); i++ )
200 memcpy(__per_cpu_start + (i << PERCPU_SHIFT),
201 __per_cpu_start,
202 data_size);
203 first_unused = i;
205 /* Check that there are no holes in cpu_possible_map. */
206 for ( ; i < NR_CPUS; i++ )
207 BUG_ON(cpu_possible(i));
209 #ifndef MEMORY_GUARD
210 init_xenheap_pages(__pa(__per_cpu_start) + (first_unused << PERCPU_SHIFT),
211 __pa(__per_cpu_end));
212 #endif
213 memguard_guard_range(&__per_cpu_start[first_unused << PERCPU_SHIFT],
214 (NR_CPUS - first_unused) << PERCPU_SHIFT);
215 #if defined(CONFIG_X86_64)
216 /* Also zap the mapping in the 1:1 area. */
217 memguard_guard_range(__va(__pa(__per_cpu_start)) +
218 (first_unused << PERCPU_SHIFT),
219 (NR_CPUS - first_unused) << PERCPU_SHIFT);
220 #endif
221 }
223 static void __init init_idle_domain(void)
224 {
225 struct domain *idle_domain;
227 /* Domain creation requires that scheduler structures are initialised. */
228 scheduler_init();
230 idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0);
231 if ( (idle_domain == NULL) || (alloc_vcpu(idle_domain, 0, 0) == NULL) )
232 BUG();
234 set_current(idle_domain->vcpu[0]);
235 idle_vcpu[0] = this_cpu(curr_vcpu) = current;
237 setup_idle_pagetable();
238 }
240 static void __init srat_detect_node(int cpu)
241 {
242 unsigned node;
243 u8 apicid = x86_cpu_to_apicid[cpu];
245 node = apicid_to_node[apicid];
246 if ( node == NUMA_NO_NODE )
247 node = 0;
248 numa_set_node(cpu, node);
250 if ( acpi_numa > 0 )
251 printk(KERN_INFO "CPU %d APIC %d -> Node %d\n", cpu, apicid, node);
252 }
254 /*
255 * Ensure a given physical memory range is present in the bootstrap mappings.
256 * Use superpage mappings to ensure that pagetable memory needn't be allocated.
257 */
258 static void __init bootstrap_map(unsigned long start, unsigned long end)
259 {
260 unsigned long mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
261 start = max_t(unsigned long, start & ~mask, 16UL << 20);
262 end = (end + mask) & ~mask;
263 if ( start >= end )
264 return;
265 if ( end > BOOTSTRAP_DIRECTMAP_END )
266 panic("Cannot access memory beyond end of "
267 "bootstrap direct-map area\n");
268 map_pages_to_xen(
269 (unsigned long)maddr_to_bootstrap_virt(start),
270 start >> PAGE_SHIFT, (end-start) >> PAGE_SHIFT, PAGE_HYPERVISOR);
271 }
273 static void __init move_memory(
274 unsigned long dst, unsigned long src_start, unsigned long src_end)
275 {
276 bootstrap_map(src_start, src_end);
277 bootstrap_map(dst, dst + src_end - src_start);
278 memmove(maddr_to_bootstrap_virt(dst),
279 maddr_to_bootstrap_virt(src_start),
280 src_end - src_start);
281 }
283 /* A temporary copy of the e820 map that we can mess with during bootstrap. */
284 static struct e820map __initdata boot_e820;
/*
 * Layout of the video information handed over by the real-mode boot code;
 * the hex comments give each field's byte offset within the structure.
 */
286 struct boot_video_info {
287 u8 orig_x; /* 0x00 */
288 u8 orig_y; /* 0x01 */
289 u8 orig_video_mode; /* 0x02 */
290 u8 orig_video_cols; /* 0x03 */
291 u8 orig_video_lines; /* 0x04 */
292 u8 orig_video_isVGA; /* 0x05 */
293 u16 orig_video_points; /* 0x06 */
295 /* VESA graphic mode -- linear frame buffer */
296 u32 capabilities; /* 0x08 */
297 u16 lfb_linelength; /* 0x0c */
298 u16 lfb_width; /* 0x0e */
299 u16 lfb_height; /* 0x10 */
300 u16 lfb_depth; /* 0x12 */
301 u32 lfb_base; /* 0x14 */
302 u32 lfb_size; /* 0x18 */
303 u8 red_size; /* 0x1c */
304 u8 red_pos; /* 0x1d */
305 u8 green_size; /* 0x1e */
306 u8 green_pos; /* 0x1f */
307 u8 blue_size; /* 0x20 */
308 u8 blue_pos; /* 0x21 */
309 u8 rsvd_size; /* 0x22 */
310 u8 rsvd_pos; /* 0x23 */
311 u16 vesapm_seg; /* 0x24 */
312 u16 vesapm_off; /* 0x26 */
313 u16 vesa_attrib; /* 0x28 */
314 };
/*
 * Translate the boot-time video description (boot_vid_info, filled in by
 * the real-mode boot code) into vga_console_info: either VGA text mode 3
 * or a VESA linear-frame-buffer mode.  Other modes are left untouched.
 */
316 static void __init parse_video_info(void)
317 {
318 struct boot_video_info *bvi = &bootsym(boot_vid_info);
320 if ( (bvi->orig_video_isVGA == 1) && (bvi->orig_video_mode == 3) )
321 {
322 vga_console_info.video_type = XEN_VGATYPE_TEXT_MODE_3;
323 vga_console_info.u.text_mode_3.font_height = bvi->orig_video_points;
324 vga_console_info.u.text_mode_3.cursor_x = bvi->orig_x;
325 vga_console_info.u.text_mode_3.cursor_y = bvi->orig_y;
326 vga_console_info.u.text_mode_3.rows = bvi->orig_video_lines;
327 vga_console_info.u.text_mode_3.columns = bvi->orig_video_cols;
328 }
329 else if ( bvi->orig_video_isVGA == 0x23 )
330 {
331 vga_console_info.video_type = XEN_VGATYPE_VESA_LFB;
332 vga_console_info.u.vesa_lfb.width = bvi->lfb_width;
333 vga_console_info.u.vesa_lfb.height = bvi->lfb_height;
334 vga_console_info.u.vesa_lfb.bytes_per_line = bvi->lfb_linelength;
335 vga_console_info.u.vesa_lfb.bits_per_pixel = bvi->lfb_depth;
336 vga_console_info.u.vesa_lfb.lfb_base = bvi->lfb_base;
337 vga_console_info.u.vesa_lfb.lfb_size = bvi->lfb_size;
338 vga_console_info.u.vesa_lfb.red_pos = bvi->red_pos;
339 vga_console_info.u.vesa_lfb.red_size = bvi->red_size;
340 vga_console_info.u.vesa_lfb.green_pos = bvi->green_pos;
341 vga_console_info.u.vesa_lfb.green_size = bvi->green_size;
342 vga_console_info.u.vesa_lfb.blue_pos = bvi->blue_pos;
343 vga_console_info.u.vesa_lfb.blue_size = bvi->blue_size;
344 vga_console_info.u.vesa_lfb.rsvd_pos = bvi->rsvd_pos;
345 vga_console_info.u.vesa_lfb.rsvd_size = bvi->rsvd_size;
346 vga_console_info.u.vesa_lfb.gbl_caps = bvi->capabilities;
347 vga_console_info.u.vesa_lfb.mode_attrs = bvi->vesa_attrib;
348 }
349 }
/*
 * Reserve the kexec crash area in the given E820 map.  __start_xen() calls
 * this twice (early for a static start address, late for a dynamic one);
 * the static flag ensures the reservation is only attempted once.
 */
351 void __init kexec_reserve_area(struct e820map *e820)
352 {
353 unsigned long kdump_start = kexec_crash_area.start;
354 unsigned long kdump_size = kexec_crash_area.size;
355 static int is_reserved = 0;
/* Round the requested size up to a whole number of pages. */
357 kdump_size = (kdump_size + PAGE_SIZE - 1) & PAGE_MASK;
359 if ( (kdump_start == 0) || (kdump_size == 0) || is_reserved )
360 return;
362 is_reserved = 1;
/* On failure, disable kdump entirely by zeroing the crash area. */
364 if ( !reserve_e820_ram(e820, kdump_start, kdump_size) )
365 {
366 printk("Kdump: DISABLED (failed to reserve %luMB (%lukB) at 0x%lx)"
367 "\n", kdump_size >> 20, kdump_size >> 10, kdump_start);
368 kexec_crash_area.start = kexec_crash_area.size = 0;
369 }
370 else
371 {
372 printk("Kdump: %luMB (%lukB) at 0x%lx\n",
373 kdump_size >> 20, kdump_size >> 10, kdump_start);
374 }
375 }
/*
 * Completion of boot: free (or guard) the .init sections and enter the CPU
 * idle loop.  Entered via reset_stack_and_jump() at the end of __start_xen().
 */
377 void init_done(void)
378 {
379 extern char __init_begin[], __init_end[];
381 /* Free (or page-protect) the init areas. */
382 #ifndef MEMORY_GUARD
383 init_xenheap_pages(__pa(__init_begin), __pa(__init_end));
384 #endif
385 memguard_guard_range(__init_begin, __init_end - __init_begin);
386 #if defined(CONFIG_X86_64)
387 /* Also zap the mapping in the 1:1 area. */
388 memguard_guard_range(__va(__pa(__init_begin)), __init_end - __init_begin);
389 #endif
390 printk("Freed %ldkB init memory.\n", (long)(__init_end-__init_begin)>>10);
392 startup_cpu_idle_loop();
393 }
395 static char * __init cmdline_cook(char *p)
396 {
397 p = p ? : "";
398 while ( *p == ' ' )
399 p++;
400 while ( (*p != ' ') && (*p != '\0') )
401 p++;
402 while ( *p == ' ' )
403 p++;
404 return p;
405 }
/*
 * C entry point for the boot CPU, called from the start-of-day assembly
 * code with the physical address of the multiboot info structure.  Parses
 * the command line, brings up the consoles, sanitises the memory map,
 * relocates Xen (x86-64) and the boot modules, initialises all subsystems,
 * constructs domain 0, and finally jumps to init_done().
 */
407 void __init __start_xen(unsigned long mbi_p)
408 {
409 char *memmap_type = NULL;
410 char *cmdline, *kextra;
411 unsigned long _initrd_start = 0, _initrd_len = 0;
412 unsigned int initrdidx = 1;
413 multiboot_info_t *mbi = __va(mbi_p);
414 module_t *mod = (module_t *)__va(mbi->mods_addr);
415 unsigned long nr_pages, modules_length;
416 int i, e820_warn = 0, bytes = 0;
417 struct ns16550_defaults ns16550 = {
418 .data_bits = 8,
419 .parity = 'n',
420 .stop_bits = 1
421 };
423 extern void early_page_fault(void);
424 set_intr_gate(TRAP_page_fault, &early_page_fault);
426 /* Parse the command-line options. */
427 cmdline = cmdline_cook((mbi->flags & MBI_CMDLINE) ?
428 __va(mbi->cmdline) : NULL);
429 if ( (kextra = strstr(cmdline, " -- ")) != NULL )
430 {
431 /*
432 * Options after ' -- ' separator belong to dom0.
433 * 1. Orphan dom0's options from Xen's command line.
434 * 2. Skip all but final leading space from dom0's options.
435 */
436 *kextra = '\0';
437 kextra += 3;
438 while ( kextra[1] == ' ' ) kextra++;
439 }
440 cmdline_parse(cmdline);
442 parse_video_info();
444 set_current((struct vcpu *)0xfffff000); /* debug sanity */
445 idle_vcpu[0] = current;
446 set_processor_id(0); /* needed early, for smp_processor_id() */
447 if ( cpu_has_efer )
448 rdmsrl(MSR_EFER, this_cpu(efer));
449 asm volatile ( "mov %%cr4,%0" : "=r" (this_cpu(cr4)) );
451 smp_prepare_boot_cpu();
453 /* We initialise the serial devices very early so we can get debugging. */
454 ns16550.io_base = 0x3f8;
455 ns16550.irq = 4;
456 ns16550_init(0, &ns16550);
457 ns16550.io_base = 0x2f8;
458 ns16550.irq = 3;
459 ns16550_init(1, &ns16550);
460 serial_init_preirq();
462 init_console();
464 printk("Command line: %s\n", cmdline);
466 printk("Video information:\n");
468 /* Print VGA display mode information. */
469 switch ( vga_console_info.video_type )
470 {
471 case XEN_VGATYPE_TEXT_MODE_3:
472 printk(" VGA is text mode %dx%d, font 8x%d\n",
473 vga_console_info.u.text_mode_3.columns,
474 vga_console_info.u.text_mode_3.rows,
475 vga_console_info.u.text_mode_3.font_height);
476 break;
477 case XEN_VGATYPE_VESA_LFB:
478 printk(" VGA is graphics mode %dx%d, %d bpp\n",
479 vga_console_info.u.vesa_lfb.width,
480 vga_console_info.u.vesa_lfb.height,
481 vga_console_info.u.vesa_lfb.bits_per_pixel);
482 break;
483 default:
484 printk(" No VGA detected\n");
485 break;
486 }
488 /* Print VBE/DDC EDID information. */
489 if ( bootsym(boot_edid_caps) != 0x1313 )
490 {
491 u16 caps = bootsym(boot_edid_caps);
492 printk(" VBE/DDC methods:%s%s%s; ",
493 (caps & 1) ? " V1" : "",
494 (caps & 2) ? " V2" : "",
495 !(caps & 3) ? " none" : "");
496 printk("EDID transfer time: %d seconds\n", caps >> 8);
497 if ( *(u32 *)bootsym(boot_edid_info) == 0x13131313 )
498 {
499 printk(" EDID info not retrieved because ");
500 if ( !(caps & 3) )
501 printk("no DDC retrieval method detected\n");
502 else if ( (caps >> 8) > 5 )
503 printk("takes longer than 5 seconds\n");
504 else
505 printk("of reasons unknown\n");
506 }
507 }
509 printk("Disc information:\n");
510 printk(" Found %d MBR signatures\n",
511 bootsym(boot_mbr_signature_nr));
512 printk(" Found %d EDD information structures\n",
513 bootsym(boot_edd_info_nr));
515 /* Check that we have at least one Multiboot module. */
516 if ( !(mbi->flags & MBI_MODULES) || (mbi->mods_count == 0) )
517 EARLY_FAIL("dom0 kernel not specified. "
518 "Check bootloader configuration.\n");
520 if ( ((unsigned long)cpu0_stack & (STACK_SIZE-1)) != 0 )
521 EARLY_FAIL("Misaligned CPU0 stack.\n");
523 /*
524 * Since there are some stubs getting built on the stacks which use
525 * direct calls/jumps, the heap must be confined to the lower 2G so
526 * that those branches can reach their targets.
527 */
528 if ( opt_xenheap_megabytes > 2048 )
529 opt_xenheap_megabytes = 2048;
/* Choose the source of the raw physical memory map, preferring Xen's own E820. */
531 if ( e820_raw_nr != 0 )
532 {
533 memmap_type = "Xen-e820";
534 }
535 else if ( bootsym(lowmem_kb) )
536 {
537 memmap_type = "Xen-e801";
538 e820_raw[0].addr = 0;
539 e820_raw[0].size = bootsym(lowmem_kb) << 10;
540 e820_raw[0].type = E820_RAM;
541 e820_raw[1].addr = 0x100000;
542 e820_raw[1].size = bootsym(highmem_kb) << 10;
543 e820_raw[1].type = E820_RAM;
544 e820_raw_nr = 2;
545 }
546 else if ( mbi->flags & MBI_MEMMAP )
547 {
548 memmap_type = "Multiboot-e820";
549 while ( (bytes < mbi->mmap_length) && (e820_raw_nr < E820MAX) )
550 {
551 memory_map_t *map = __va(mbi->mmap_addr + bytes);
553 /*
554 * This is a gross workaround for a BIOS bug. Some bootloaders do
555 * not write e820 map entries into pre-zeroed memory. This is
556 * okay if the BIOS fills in all fields of the map entry, but
557 * some broken BIOSes do not bother to write the high word of
558 * the length field if the length is smaller than 4GB. We
559 * detect and fix this by flagging sections below 4GB that
560 * appear to be larger than 4GB in size.
561 */
562 if ( (map->base_addr_high == 0) && (map->length_high != 0) )
563 {
564 if ( !e820_warn )
565 {
566 printk("WARNING: Buggy e820 map detected and fixed "
567 "(truncated length fields).\n");
568 e820_warn = 1;
569 }
570 map->length_high = 0;
571 }
573 e820_raw[e820_raw_nr].addr =
574 ((u64)map->base_addr_high << 32) | (u64)map->base_addr_low;
575 e820_raw[e820_raw_nr].size =
576 ((u64)map->length_high << 32) | (u64)map->length_low;
577 e820_raw[e820_raw_nr].type = map->type;
578 e820_raw_nr++;
580 bytes += map->size + 4;
581 }
582 }
583 else if ( mbi->flags & MBI_MEMLIMITS )
584 {
585 memmap_type = "Multiboot-e801";
586 e820_raw[0].addr = 0;
587 e820_raw[0].size = mbi->mem_lower << 10;
588 e820_raw[0].type = E820_RAM;
589 e820_raw[1].addr = 0x100000;
590 e820_raw[1].size = mbi->mem_upper << 10;
591 e820_raw[1].type = E820_RAM;
592 e820_raw_nr = 2;
593 }
594 else
595 {
596 EARLY_FAIL("Bootloader provided no memory information.\n");
597 }
599 /* Sanitise the raw E820 map to produce a final clean version. */
600 max_page = init_e820(memmap_type, e820_raw, &e820_raw_nr);
602 /* Create a temporary copy of the E820 map. */
603 memcpy(&boot_e820, &e820, sizeof(e820));
605 /* Early kexec reservation (explicit static start address). */
606 kexec_reserve_area(&boot_e820);
608 /*
609 * Iterate backwards over all superpage-aligned RAM regions.
610 *
611 * We require superpage alignment because the boot allocator is not yet
612 * initialised. Hence we can only map superpages in the address range
613 * 0 to BOOTSTRAP_DIRECTMAP_END, as this is guaranteed not to require
614 * dynamic allocation of pagetables.
615 *
616 * As well as mapping superpages in that range, in preparation for
617 * initialising the boot allocator, we also look for a region to which
618 * we can relocate the dom0 kernel and other multiboot modules. Also, on
619 * x86/64, we relocate Xen to higher memory.
620 */
621 modules_length = mod[mbi->mods_count-1].mod_end - mod[0].mod_start;
622 for ( i = boot_e820.nr_map-1; i >= 0; i-- )
623 {
624 uint64_t s, e, mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
626 /* Superpage-aligned chunks from 16MB to BOOTSTRAP_DIRECTMAP_END. */
627 s = (boot_e820.map[i].addr + mask) & ~mask;
628 e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
629 s = max_t(uint64_t, s, 16 << 20);
630 e = min_t(uint64_t, e, BOOTSTRAP_DIRECTMAP_END);
631 if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
632 continue;
634 /* Map the chunk. No memory will need to be allocated to do this. */
635 map_pages_to_xen(
636 (unsigned long)maddr_to_bootstrap_virt(s),
637 s >> PAGE_SHIFT, (e-s) >> PAGE_SHIFT, PAGE_HYPERVISOR);
639 #if defined(CONFIG_X86_64)
640 /* Is the region suitable for relocating Xen? */
641 if ( !xen_phys_start && (((e-s) >> 20) >= opt_xenheap_megabytes) )
642 {
643 extern l2_pgentry_t l2_xenmap[];
644 l4_pgentry_t *pl4e;
645 l3_pgentry_t *pl3e;
646 l2_pgentry_t *pl2e;
647 int i, j, k;
649 /* Select relocation address. */
650 e = (e - (opt_xenheap_megabytes << 20)) & ~mask;
651 xen_phys_start = e;
652 bootsym(trampoline_xen_phys_start) = e;
654 /*
655 * Perform relocation to new physical address.
656 * Before doing so we must sync static/global data with main memory
657 * with a barrier(). After this we must *not* modify static/global
658 * data until after we have switched to the relocated pagetables!
659 */
660 barrier();
661 move_memory(e, 0, __pa(&_end) - xen_phys_start);
663 /* Poison low 1MB to detect stray pointers to physical 0-1MB. */
664 memset(maddr_to_bootstrap_virt(e), 0x55, 1U<<20);
666 /* Walk initial pagetables, relocating page directory entries. */
667 pl4e = __va(__pa(idle_pg_table));
668 for ( i = 0 ; i < L4_PAGETABLE_ENTRIES; i++, pl4e++ )
669 {
670 if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
671 continue;
672 *pl4e = l4e_from_intpte(l4e_get_intpte(*pl4e) +
673 xen_phys_start);
674 pl3e = l4e_to_l3e(*pl4e);
675 for ( j = 0; j < L3_PAGETABLE_ENTRIES; j++, pl3e++ )
676 {
677 /* Not present, 1GB mapping, or already relocated? */
678 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ||
679 (l3e_get_flags(*pl3e) & _PAGE_PSE) ||
680 (l3e_get_pfn(*pl3e) > 0x1000) )
681 continue;
682 *pl3e = l3e_from_intpte(l3e_get_intpte(*pl3e) +
683 xen_phys_start);
684 pl2e = l3e_to_l2e(*pl3e);
685 for ( k = 0; k < L2_PAGETABLE_ENTRIES; k++, pl2e++ )
686 {
687 /* Not present, PSE, or already relocated? */
688 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ||
689 (l2e_get_flags(*pl2e) & _PAGE_PSE) ||
690 (l2e_get_pfn(*pl2e) > 0x1000) )
691 continue;
692 *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
693 xen_phys_start);
694 }
695 }
696 }
698 /* The only data mappings to be relocated are in the Xen area. */
699 pl2e = __va(__pa(l2_xenmap));
700 *pl2e++ = l2e_from_pfn(xen_phys_start >> PAGE_SHIFT,
701 PAGE_HYPERVISOR | _PAGE_PSE);
702 for ( i = 1; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
703 {
704 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
705 continue;
706 *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
707 xen_phys_start);
708 }
710 /* Re-sync the stack and then switch to relocated pagetables. */
711 asm volatile (
712 "rep movsb ; " /* re-sync the stack */
713 "movq %%cr4,%%rsi ; "
714 "andb $0x7f,%%sil ; "
715 "movq %%rsi,%%cr4 ; " /* CR4.PGE == 0 */
716 "movq %0,%%cr3 ; " /* CR3 == new pagetables */
717 "orb $0x80,%%sil ; "
718 "movq %%rsi,%%cr4 " /* CR4.PGE == 1 */
719 : : "r" (__pa(idle_pg_table)), "S" (cpu0_stack),
720 "D" (__va(__pa(cpu0_stack))), "c" (STACK_SIZE) : "memory" );
721 }
722 #endif
724 /* Is the region suitable for relocating the multiboot modules? */
725 if ( !initial_images_start && (s < e) && ((e-s) >= modules_length) )
726 {
727 initial_images_end = e;
728 e = (e - modules_length) & PAGE_MASK;
729 initial_images_start = e;
730 move_memory(initial_images_start,
731 mod[0].mod_start, mod[mbi->mods_count-1].mod_end);
732 }
734 if ( !kexec_crash_area.start && (s < e) &&
735 ((e-s) >= kexec_crash_area.size) )
736 {
737 e = (e - kexec_crash_area.size) & PAGE_MASK;
738 kexec_crash_area.start = e;
739 }
740 }
742 if ( !initial_images_start )
743 EARLY_FAIL("Not enough memory to relocate the dom0 kernel image.\n");
744 reserve_e820_ram(&boot_e820, initial_images_start, initial_images_end);
746 /* Initialise Xen heap and boot heap. */
747 xenheap_phys_start = init_boot_allocator(__pa(&_end));
748 xenheap_phys_end = opt_xenheap_megabytes << 20;
749 #if defined(CONFIG_X86_64)
750 if ( !xen_phys_start )
751 EARLY_FAIL("Not enough memory to relocate Xen.\n");
752 xenheap_phys_end += xen_phys_start;
753 reserve_e820_ram(&boot_e820, xen_phys_start,
754 xen_phys_start + (opt_xenheap_megabytes<<20));
755 #endif
757 /* Late kexec reservation (dynamic start address). */
758 kexec_reserve_area(&boot_e820);
760 /*
761 * With the boot allocator now initialised, we can walk every RAM region
762 * and map it in its entirety (on x86/64, at least) and notify it to the
763 * boot allocator.
764 */
765 for ( i = 0; i < boot_e820.nr_map; i++ )
766 {
767 uint64_t s, e, map_s, map_e, mask = PAGE_SIZE - 1;
769 /* Only page alignment required now. */
770 s = (boot_e820.map[i].addr + mask) & ~mask;
771 e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
772 #if defined(CONFIG_X86_32)
773 s = max_t(uint64_t, s, xenheap_phys_end);
774 #else
775 s = max_t(uint64_t, s, 1<<20);
776 #endif
777 if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
778 continue;
780 /* Need to create mappings above 16MB. */
781 map_s = max_t(uint64_t, s, 16<<20);
782 map_e = e;
783 #if defined(CONFIG_X86_32) /* mappings are truncated on x86_32 */
784 map_e = min_t(uint64_t, map_e, BOOTSTRAP_DIRECTMAP_END);
785 #endif
787 /* Pass mapped memory to allocator /before/ creating new mappings. */
788 init_boot_pages(s, min_t(uint64_t, map_s, e));
790 /* Create new mappings /before/ passing memory to the allocator. */
791 if ( map_s < map_e )
792 map_pages_to_xen(
793 (unsigned long)maddr_to_bootstrap_virt(map_s),
794 map_s >> PAGE_SHIFT, (map_e-map_s) >> PAGE_SHIFT,
795 PAGE_HYPERVISOR);
797 /* Pass remainder of this memory chunk to the allocator. */
798 init_boot_pages(map_s, e);
799 }
801 memguard_init();
803 nr_pages = 0;
804 for ( i = 0; i < e820.nr_map; i++ )
805 if ( e820.map[i].type == E820_RAM )
806 nr_pages += e820.map[i].size >> PAGE_SHIFT;
807 printk("System RAM: %luMB (%lukB)\n",
808 nr_pages >> (20 - PAGE_SHIFT),
809 nr_pages << (PAGE_SHIFT - 10));
810 total_pages = nr_pages;
812 /* Sanity check for unwanted bloat of certain hypercall structures. */
813 BUILD_BUG_ON(sizeof(((struct xen_platform_op *)0)->u) !=
814 sizeof(((struct xen_platform_op *)0)->u.pad));
815 BUILD_BUG_ON(sizeof(((struct xen_domctl *)0)->u) !=
816 sizeof(((struct xen_domctl *)0)->u.pad));
817 BUILD_BUG_ON(sizeof(((struct xen_sysctl *)0)->u) !=
818 sizeof(((struct xen_sysctl *)0)->u.pad));
820 BUILD_BUG_ON(sizeof(start_info_t) > PAGE_SIZE);
821 BUILD_BUG_ON(sizeof(shared_info_t) > PAGE_SIZE);
822 BUILD_BUG_ON(sizeof(struct vcpu_info) != 64);
824 #ifdef CONFIG_COMPAT
825 BUILD_BUG_ON(sizeof(((struct compat_platform_op *)0)->u) !=
826 sizeof(((struct compat_platform_op *)0)->u.pad));
827 BUILD_BUG_ON(sizeof(start_info_compat_t) > PAGE_SIZE);
828 BUILD_BUG_ON(sizeof(struct compat_vcpu_info) != 64);
829 #endif
831 /* Check definitions in public headers match internal defs. */
832 BUILD_BUG_ON(__HYPERVISOR_VIRT_START != HYPERVISOR_VIRT_START);
833 #ifdef HYPERVISOR_VIRT_END
834 BUILD_BUG_ON(__HYPERVISOR_VIRT_END != HYPERVISOR_VIRT_END);
835 #endif
836 BUILD_BUG_ON(MACH2PHYS_VIRT_START != RO_MPT_VIRT_START);
837 BUILD_BUG_ON(MACH2PHYS_VIRT_END != RO_MPT_VIRT_END);
839 init_frametable();
841 acpi_boot_table_init();
843 acpi_numa_init();
845 numa_initmem_init(0, max_page);
847 /* Initialise the Xen heap, skipping RAM holes. */
848 init_xenheap_pages(xenheap_phys_start, xenheap_phys_end);
849 nr_pages = (xenheap_phys_end - xenheap_phys_start) >> PAGE_SHIFT;
850 #ifdef __x86_64__
851 init_xenheap_pages(xen_phys_start, __pa(&_start));
852 nr_pages += (__pa(&_start) - xen_phys_start) >> PAGE_SHIFT;
853 vesa_init();
854 #endif
855 xenheap_phys_start = xen_phys_start;
856 printk("Xen heap: %luMB (%lukB)\n",
857 nr_pages >> (20 - PAGE_SHIFT),
858 nr_pages << (PAGE_SHIFT - 10));
860 end_boot_allocator();
862 early_boot = 0;
864 early_cpu_init();
866 paging_init();
868 tboot_probe();
870 /* Unmap the first page of CPU0's stack. */
871 memguard_guard_stack(cpu0_stack);
873 open_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ, new_tlbflush_clock_period);
875 if ( opt_watchdog )
876 nmi_watchdog = NMI_LOCAL_APIC;
878 sort_exception_tables();
880 find_smp_config();
882 dmi_scan_machine();
884 generic_apic_probe();
886 acpi_boot_init();
888 init_cpu_to_node();
890 if ( smp_found_config )
891 get_smp_config();
893 #ifdef CONFIG_X86_64
894 /* Low mappings were only needed for some BIOS table parsing. */
895 zap_low_mappings();
896 #endif
898 init_apic_mappings();
900 init_IRQ();
902 percpu_init_areas();
904 xsm_init(&initrdidx, mbi, initial_images_start);
906 init_idle_domain();
908 trap_init();
910 rcu_init();
912 timer_init();
914 early_time_init();
916 arch_init_memory();
918 identify_cpu(&boot_cpu_data);
919 if ( cpu_has_fxsr )
920 set_in_cr4(X86_CR4_OSFXSR);
921 if ( cpu_has_xmm )
922 set_in_cr4(X86_CR4_OSXMMEXCPT);
923 #ifdef CONFIG_X86_64
924 vesa_mtrr_init();
925 #endif
927 if ( opt_nosmp )
928 max_cpus = 0;
930 smp_prepare_cpus(max_cpus);
932 /*
933 * Initialise higher-level timer functions. We do this fairly late
934 * (post-SMP) because the time bases and scale factors need to be updated
935 * regularly, and SMP initialisation can cause a long delay with
936 * interrupts not yet enabled.
937 */
938 init_xen_time();
940 initialize_keytable();
942 serial_init_postirq();
944 BUG_ON(!local_irq_is_enabled());
/* Bring secondary CPUs online, up to the max_cpus limit. */
946 for_each_present_cpu ( i )
947 {
948 if ( num_online_cpus() >= max_cpus )
949 break;
950 if ( !cpu_online(i) )
951 {
952 rcu_online_cpu(i);
953 __cpu_up(i);
954 }
956 /* Set up cpu_to_node[]. */
957 srat_detect_node(i);
958 /* Set up node_to_cpumask based on cpu_to_node[]. */
959 numa_add_cpu(i);
960 }
962 printk("Brought up %ld CPUs\n", (long)num_online_cpus());
963 smp_cpus_done(max_cpus);
965 initialise_gdb(); /* could be moved earlier */
967 do_initcalls();
969 if ( opt_watchdog )
970 watchdog_enable();
972 /* Create initial domain 0. */
973 dom0 = domain_create(0, 0, DOM0_SSIDREF);
974 if ( (dom0 == NULL) || (alloc_vcpu(dom0, 0, 0) == NULL) )
975 panic("Error creating domain 0\n");
977 dom0->is_privileged = 1;
978 dom0->target = NULL;
980 /* Grab the DOM0 command line. */
981 cmdline = (char *)(mod[0].string ? __va(mod[0].string) : NULL);
982 if ( (cmdline != NULL) || (kextra != NULL) )
983 {
984 static char dom0_cmdline[MAX_GUEST_CMDLINE];
986 cmdline = cmdline_cook(cmdline);
987 safe_strcpy(dom0_cmdline, cmdline);
989 if ( kextra != NULL )
990 /* kextra always includes exactly one leading space. */
991 safe_strcat(dom0_cmdline, kextra);
993 /* Append any extra parameters. */
994 if ( skip_ioapic_setup && !strstr(dom0_cmdline, "noapic") )
995 safe_strcat(dom0_cmdline, " noapic");
996 if ( acpi_skip_timer_override &&
997 !strstr(dom0_cmdline, "acpi_skip_timer_override") )
998 safe_strcat(dom0_cmdline, " acpi_skip_timer_override");
999 if ( (strlen(acpi_param) == 0) && acpi_disabled )
1001 printk("ACPI is disabled, notifying Domain 0 (acpi=off)\n");
1002 safe_strcpy(acpi_param, "off");
1004 if ( (strlen(acpi_param) != 0) && !strstr(dom0_cmdline, "acpi=") )
1006 safe_strcat(dom0_cmdline, " acpi=");
1007 safe_strcat(dom0_cmdline, acpi_param);
1010 cmdline = dom0_cmdline;
1013 if ( (initrdidx > 0) && (initrdidx < mbi->mods_count) )
1015 _initrd_start = initial_images_start +
1016 (mod[initrdidx].mod_start - mod[0].mod_start);
1017 _initrd_len = mod[initrdidx].mod_end - mod[initrdidx].mod_start;
1020 iommu_setup();
1022 amd_iommu_detect();
1024 /*
1025 * We're going to setup domain0 using the module(s) that we stashed safely
1026 * above our heap. The second module, if present, is an initrd ramdisk.
1027 */
1028 if ( construct_dom0(dom0,
1029 initial_images_start,
1030 mod[0].mod_end-mod[0].mod_start,
1031 _initrd_start,
1032 _initrd_len,
1033 cmdline) != 0)
1034 panic("Could not set up DOM0 guest OS\n");
1036 /* Scrub RAM that is still free and so may go to an unprivileged domain. */
1037 scrub_heap_pages();
1039 init_trace_bufs();
1041 console_endboot();
1043 /* Hide UART from DOM0 if we're using it */
1044 serial_endboot();
1046 domain_unpause_by_systemcontroller(dom0);
1048 reset_stack_and_jump(init_done);
/*
 * Fill *info with a space-separated list of the guest interface ABIs this
 * hypervisor supports ("xen-3.0-x86_64", "hvm-3.0-x86_32", ...), selected
 * by build configuration and whether HVM support is enabled.
 */
1051 void arch_get_xen_caps(xen_capabilities_info_t *info)
1053 /* Interface name is always xen-3.0-* for Xen-3.x. */
1054 int major = 3, minor = 0;
1055 char s[32];
1057 (*info)[0] = '\0';
1059 #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
1061 snprintf(s, sizeof(s), "xen-%d.%d-x86_32 ", major, minor);
1062 safe_strcat(*info, s);
1063 if ( hvm_enabled )
1065 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
1066 safe_strcat(*info, s);
1069 #elif defined(CONFIG_X86_32) && defined(CONFIG_X86_PAE)
1071 snprintf(s, sizeof(s), "xen-%d.%d-x86_32p ", major, minor);
1072 safe_strcat(*info, s);
1073 if ( hvm_enabled )
1075 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
1076 safe_strcat(*info, s);
1077 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32p ", major, minor);
1078 safe_strcat(*info, s);
1081 #elif defined(CONFIG_X86_64)
1083 snprintf(s, sizeof(s), "xen-%d.%d-x86_64 ", major, minor);
1084 safe_strcat(*info, s);
1085 #ifdef CONFIG_COMPAT
1086 snprintf(s, sizeof(s), "xen-%d.%d-x86_32p ", major, minor);
1087 safe_strcat(*info, s);
1088 #endif
1089 if ( hvm_enabled )
1091 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
1092 safe_strcat(*info, s);
1093 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32p ", major, minor);
1094 safe_strcat(*info, s);
1095 snprintf(s, sizeof(s), "hvm-%d.%d-x86_64 ", major, minor);
1096 safe_strcat(*info, s);
1099 #endif
1102 /*
1103 * Local variables:
1104 * mode: C
1105 * c-set-style: "BSD"
1106 * c-basic-offset: 4
1107 * tab-width: 4
1108 * indent-tabs-mode: nil
1109 * End:
1110 */