debuggers.hg

view xen/arch/x86/setup.c @ 19964:3952eaeb70b0

Introduce and use a per-CPU read-mostly sub-section

Since mixing data that only gets setup once and then (perhaps
frequently) gets read by remote CPUs with data that the local CPU may
modify (again, perhaps frequently) still causes undesirable cache
protocol related bus traffic, separate the former class of objects
from the latter.

These objects converted here are just picked based on their write-once
(or write-very-rarely) properties; perhaps some more adjustments may
be desirable subsequently. The primary users of the new sub-section
will result from the next patch.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Jul 13 11:32:41 2009 +0100 (2009-07-13)
parents 7406764457a0
children 3f12d48f2880
line source
1 #include <xen/config.h>
2 #include <xen/init.h>
3 #include <xen/lib.h>
4 #include <xen/sched.h>
5 #include <xen/domain.h>
6 #include <xen/serial.h>
7 #include <xen/softirq.h>
8 #include <xen/acpi.h>
9 #include <xen/console.h>
10 #include <xen/serial.h>
11 #include <xen/trace.h>
12 #include <xen/multiboot.h>
13 #include <xen/domain_page.h>
14 #include <xen/version.h>
15 #include <xen/gdbstub.h>
16 #include <xen/percpu.h>
17 #include <xen/hypercall.h>
18 #include <xen/keyhandler.h>
19 #include <xen/numa.h>
20 #include <xen/rcupdate.h>
21 #include <xen/vga.h>
22 #include <xen/dmi.h>
23 #include <public/version.h>
24 #ifdef CONFIG_COMPAT
25 #include <compat/platform.h>
26 #include <compat/xen.h>
27 #endif
28 #include <asm/bitops.h>
29 #include <asm/smp.h>
30 #include <asm/processor.h>
31 #include <asm/mpspec.h>
32 #include <asm/apic.h>
33 #include <asm/desc.h>
34 #include <asm/paging.h>
35 #include <asm/e820.h>
36 #include <xsm/acm/acm_hooks.h>
37 #include <xen/kexec.h>
38 #include <asm/edd.h>
39 #include <xsm/xsm.h>
40 #include <asm/tboot.h>
/* Headroom (in bytes) needed in front of a bzImage for in-place decompression. */
42 int __init bzimage_headroom(char *image_start, unsigned long image_length);
/* Upper bound of the boot-time 1:1 direct map; superpage mappings only. */
44 #if defined(CONFIG_X86_64)
45 #define BOOTSTRAP_DIRECTMAP_END (1UL << 32) /* 4GB */
46 #define maddr_to_bootstrap_virt(m) maddr_to_virt(m)
47 #else
48 #define BOOTSTRAP_DIRECTMAP_END (1UL << 30) /* 1GB */
49 #define maddr_to_bootstrap_virt(m) ((void *)(long)(m))
50 #endif
52 extern void generic_apic_probe(void);
53 extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
/* Video/EDID data filled in by the real-mode boot trampoline. */
55 extern u16 boot_edid_caps;
56 extern u8 boot_edid_info[128];
57 extern struct boot_video_info boot_vid_info;
59 /* opt_nosmp: If true, secondary processors are ignored. */
60 static int opt_nosmp = 0;
61 boolean_param("nosmp", opt_nosmp);
63 /* maxcpus: maximum number of CPUs to activate. */
64 static unsigned int max_cpus = NR_CPUS;
65 integer_param("maxcpus", max_cpus);
67 /* opt_watchdog: If true, run a watchdog NMI on each processor. */
68 static int opt_watchdog = 0;
69 boolean_param("watchdog", opt_watchdog);
71 /* **** Linux config option: propagated to domain0. */
72 /* "acpi=off": Disables both ACPI table parsing and interpreter. */
73 /* "acpi=force": Override the disable blacklist. */
74 /* "acpi=strict": Disables out-of-spec workarounds. */
75 /* "acpi=ht": Limit ACPI just to boot-time to enable HT. */
76 /* "acpi=noirq": Disables ACPI interrupt routing. */
77 static void parse_acpi_param(char *s);
78 custom_param("acpi", parse_acpi_param);
80 /* **** Linux config option: propagated to domain0. */
81 /* acpi_skip_timer_override: Skip IRQ0 overrides. */
82 extern int acpi_skip_timer_override;
83 boolean_param("acpi_skip_timer_override", acpi_skip_timer_override);
85 /* **** Linux config option: propagated to domain0. */
86 /* noapic: Disable IOAPIC setup. */
87 extern int skip_ioapic_setup;
88 boolean_param("noapic", skip_ioapic_setup);
90 /* **** Linux config option: propagated to domain0. */
91 /* xen_cpuidle: xen control cstate. */
92 /*static*/ int xen_cpuidle = -1;
93 boolean_param("cpuidle", xen_cpuidle);
/* Nonzero until the boot allocator has been shut down (see __start_xen()). */
95 int early_boot = 1;
97 cpumask_t cpu_present_map;
/* Physical address Xen was relocated to (x86/64); 0 until relocation. */
99 unsigned long xen_phys_start;
101 #ifdef CONFIG_X86_32
102 /* Limits of Xen heap, used to initialise the allocator. */
103 unsigned long xenheap_initial_phys_start, xenheap_phys_end;
104 #endif
106 extern void arch_init_memory(void);
107 extern void init_IRQ(void);
108 extern void early_time_init(void);
109 extern void early_cpu_init(void);
110 extern void vesa_init(void);
111 extern void vesa_mtrr_init(void);
112 extern void init_tmem(void);
/* Per-CPU GDT pointers: written once at CPU bring-up, then read-mostly. */
114 DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, gdt_table) = boot_cpu_gdt_table;
115 #ifdef CONFIG_COMPAT
116 DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, compat_gdt_table)
117 = boot_cpu_compat_gdt_table;
118 #endif
120 DEFINE_PER_CPU(struct tss_struct, init_tss);
/* CPU0's boot stack; placed in a dedicated section to guarantee alignment. */
122 char __attribute__ ((__section__(".bss.stack_aligned"))) cpu0_stack[STACK_SIZE];
124 struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1 };
126 unsigned long mmu_cr4_features = X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE;
127 EXPORT_SYMBOL(mmu_cr4_features);
129 int acpi_disabled;
131 int acpi_force;
/* Saved "acpi=" argument, propagated to dom0's command line later. */
132 char acpi_param[10] = "";
133 static void __init parse_acpi_param(char *s)
134 {
135 /* Save the parameter so it can be propagated to domain0. */
136 safe_strcpy(acpi_param, s);
138 /* Interpret the parameter for use within Xen. */
139 if ( !strcmp(s, "off") )
140 {
141 disable_acpi();
142 }
143 else if ( !strcmp(s, "force") )
144 {
145 acpi_force = 1;
146 acpi_ht = 1;
147 acpi_disabled = 0;
148 }
149 else if ( !strcmp(s, "strict") )
150 {
151 acpi_strict = 1;
152 }
153 else if ( !strcmp(s, "ht") )
154 {
155 if ( !acpi_force )
156 disable_acpi();
157 acpi_ht = 1;
158 }
159 else if ( !strcmp(s, "noirq") )
160 {
161 acpi_noirq_set();
162 }
163 }
165 static void __init do_initcalls(void)
166 {
167 initcall_t *call;
168 for ( call = &__initcall_start; call < &__initcall_end; call++ )
169 (*call)();
170 }
/* Print a message and halt forever: for fatal errors before panic() works. */
172 #define EARLY_FAIL(f, a...) do { \
173 printk( f , ## a ); \
174 for ( ; ; ) halt(); \
175 } while (0)
/*
 * Physical extent of the relocated multiboot modules:
 * base <= start < end; [base, start) is headroom for bzImage decompression.
 */
177 static unsigned long __initdata initial_images_base;
178 static unsigned long __initdata initial_images_start;
179 static unsigned long __initdata initial_images_end;
181 unsigned long __init initial_images_nrpages(void)
182 {
183 ASSERT(!(initial_images_base & ~PAGE_MASK));
184 ASSERT(!(initial_images_end & ~PAGE_MASK));
185 return ((initial_images_end >> PAGE_SHIFT) -
186 (initial_images_base >> PAGE_SHIFT));
187 }
/*
 * Hand the memory holding the (no longer needed) boot modules back to the
 * domain heap. Must not be called before dom0 construction is complete.
 */
189 void __init discard_initial_images(void)
190 {
191 init_domheap_pages(initial_images_base, initial_images_end);
192 }
194 extern char __per_cpu_start[], __per_cpu_data_end[], __per_cpu_end[];
/*
 * Copy the statically-initialised per-CPU data template to each possible
 * secondary CPU's slot, then reclaim (or guard) the slots of CPUs that can
 * never come online.
 */
196 static void __init percpu_init_areas(void)
197 {
198 unsigned int i, data_size = __per_cpu_data_end - __per_cpu_start;
199 unsigned int first_unused;
201 BUG_ON(data_size > PERCPU_SIZE);
203 /* Initialise per-cpu data area for all possible secondary CPUs. */
204 for ( i = 1; (i < NR_CPUS) && cpu_possible(i); i++ )
205 memcpy(__per_cpu_start + (i << PERCPU_SHIFT),
206 __per_cpu_start,
207 data_size);
208 first_unused = i;
210 /* Check that there are no holes in cpu_possible_map. */
211 for ( ; i < NR_CPUS; i++ )
212 BUG_ON(cpu_possible(i));
/* Without MEMORY_GUARD the unused slots can be freed to the Xen heap ... */
214 #ifndef MEMORY_GUARD
215 init_xenheap_pages(__pa(__per_cpu_start) + (first_unused << PERCPU_SHIFT),
216 __pa(__per_cpu_end));
217 #endif
/* ... otherwise they are unmapped to trap stray accesses. */
218 memguard_guard_range(&__per_cpu_start[first_unused << PERCPU_SHIFT],
219 (NR_CPUS - first_unused) << PERCPU_SHIFT);
220 #if defined(CONFIG_X86_64)
221 /* Also zap the mapping in the 1:1 area. */
222 memguard_guard_range(__va(__pa(__per_cpu_start)) +
223 (first_unused << PERCPU_SHIFT),
224 (NR_CPUS - first_unused) << PERCPU_SHIFT);
225 #endif
226 }
/*
 * Create the idle domain and its vcpu0, and make it "current" on the boot
 * CPU so that subsequent context-switch code has a valid vcpu to save into.
 */
228 static void __init init_idle_domain(void)
229 {
230 struct domain *idle_domain;
232 /* Domain creation requires that scheduler structures are initialised. */
233 scheduler_init();
235 idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0);
236 if ( idle_domain == NULL )
237 BUG();
238 idle_domain->vcpu = idle_vcpu;
239 idle_domain->max_vcpus = NR_CPUS;
240 if ( alloc_vcpu(idle_domain, 0, 0) == NULL )
241 BUG();
/* Replace the 0xfffff000 debug-sanity value installed in __start_xen(). */
243 set_current(idle_vcpu[0]);
244 this_cpu(curr_vcpu) = current;
246 setup_idle_pagetable();
247 }
249 static void __init srat_detect_node(int cpu)
250 {
251 unsigned node;
252 u32 apicid = x86_cpu_to_apicid[cpu];
254 node = apicid_to_node[apicid];
255 if ( node == NUMA_NO_NODE )
256 node = 0;
257 numa_set_node(cpu, node);
259 if ( acpi_numa > 0 )
260 printk(KERN_INFO "CPU %d APIC %d -> Node %d\n", cpu, apicid, node);
261 }
263 /*
264 * Ensure a given physical memory range is present in the bootstrap mappings.
265 * Use superpage mappings to ensure that pagetable memory needn't be allocated.
266 */
267 static void __init bootstrap_map(unsigned long start, unsigned long end)
268 {
269 unsigned long mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
270 start = max_t(unsigned long, start & ~mask, 16UL << 20);
271 end = (end + mask) & ~mask;
272 if ( start >= end )
273 return;
274 if ( end > BOOTSTRAP_DIRECTMAP_END )
275 panic("Cannot access memory beyond end of "
276 "bootstrap direct-map area\n");
277 map_pages_to_xen(
278 (unsigned long)maddr_to_bootstrap_virt(start),
279 start >> PAGE_SHIFT, (end-start) >> PAGE_SHIFT, PAGE_HYPERVISOR);
280 }
282 static void __init move_memory(
283 unsigned long dst, unsigned long src_start, unsigned long src_end)
284 {
285 bootstrap_map(src_start, src_end);
286 bootstrap_map(dst, dst + src_end - src_start);
287 memmove(maddr_to_bootstrap_virt(dst),
288 maddr_to_bootstrap_virt(src_start),
289 src_end - src_start);
290 }
292 /* A temporary copy of the e820 map that we can mess with during bootstrap. */
293 static struct e820map __initdata boot_e820;
/*
 * Mirror of the video data written by the real-mode boot code; field offsets
 * (in the trailing comments) must match the layout the trampoline assumes.
 */
295 struct boot_video_info {
296 u8 orig_x; /* 0x00 */
297 u8 orig_y; /* 0x01 */
298 u8 orig_video_mode; /* 0x02 */
299 u8 orig_video_cols; /* 0x03 */
300 u8 orig_video_lines; /* 0x04 */
301 u8 orig_video_isVGA; /* 0x05 */
302 u16 orig_video_points; /* 0x06 */
304 /* VESA graphic mode -- linear frame buffer */
305 u32 capabilities; /* 0x08 */
306 u16 lfb_linelength; /* 0x0c */
307 u16 lfb_width; /* 0x0e */
308 u16 lfb_height; /* 0x10 */
309 u16 lfb_depth; /* 0x12 */
310 u32 lfb_base; /* 0x14 */
311 u32 lfb_size; /* 0x18 */
312 u8 red_size; /* 0x1c */
313 u8 red_pos; /* 0x1d */
314 u8 green_size; /* 0x1e */
315 u8 green_pos; /* 0x1f */
316 u8 blue_size; /* 0x20 */
317 u8 blue_pos; /* 0x21 */
318 u8 rsvd_size; /* 0x22 */
319 u8 rsvd_pos; /* 0x23 */
320 u16 vesapm_seg; /* 0x24 */
321 u16 vesapm_off; /* 0x26 */
322 u16 vesa_attrib; /* 0x28 */
323 };
/*
 * Translate the boot trampoline's raw video data into vga_console_info,
 * distinguishing VGA text mode 3 from a VESA linear frame buffer.
 */
325 static void __init parse_video_info(void)
326 {
327 struct boot_video_info *bvi = &bootsym(boot_vid_info);
/* isVGA == 1 / mode == 3: standard 80-column colour text mode. */
329 if ( (bvi->orig_video_isVGA == 1) && (bvi->orig_video_mode == 3) )
330 {
331 vga_console_info.video_type = XEN_VGATYPE_TEXT_MODE_3;
332 vga_console_info.u.text_mode_3.font_height = bvi->orig_video_points;
333 vga_console_info.u.text_mode_3.cursor_x = bvi->orig_x;
334 vga_console_info.u.text_mode_3.cursor_y = bvi->orig_y;
335 vga_console_info.u.text_mode_3.rows = bvi->orig_video_lines;
336 vga_console_info.u.text_mode_3.columns = bvi->orig_video_cols;
337 }
/* isVGA == 0x23: VESA linear frame buffer (value set by the boot code). */
338 else if ( bvi->orig_video_isVGA == 0x23 )
339 {
340 vga_console_info.video_type = XEN_VGATYPE_VESA_LFB;
341 vga_console_info.u.vesa_lfb.width = bvi->lfb_width;
342 vga_console_info.u.vesa_lfb.height = bvi->lfb_height;
343 vga_console_info.u.vesa_lfb.bytes_per_line = bvi->lfb_linelength;
344 vga_console_info.u.vesa_lfb.bits_per_pixel = bvi->lfb_depth;
345 vga_console_info.u.vesa_lfb.lfb_base = bvi->lfb_base;
346 vga_console_info.u.vesa_lfb.lfb_size = bvi->lfb_size;
347 vga_console_info.u.vesa_lfb.red_pos = bvi->red_pos;
348 vga_console_info.u.vesa_lfb.red_size = bvi->red_size;
349 vga_console_info.u.vesa_lfb.green_pos = bvi->green_pos;
350 vga_console_info.u.vesa_lfb.green_size = bvi->green_size;
351 vga_console_info.u.vesa_lfb.blue_pos = bvi->blue_pos;
352 vga_console_info.u.vesa_lfb.blue_size = bvi->blue_size;
353 vga_console_info.u.vesa_lfb.rsvd_pos = bvi->rsvd_pos;
354 vga_console_info.u.vesa_lfb.rsvd_size = bvi->rsvd_size;
355 vga_console_info.u.vesa_lfb.gbl_caps = bvi->capabilities;
356 vga_console_info.u.vesa_lfb.mode_attrs = bvi->vesa_attrib;
357 }
358 }
/*
 * Reserve the kexec crash-kernel region in @e820, at most once. Called both
 * early (static start address) and late (dynamically chosen address); the
 * static 'is_reserved' latch makes the second call a no-op once reserved.
 */
360 void __init kexec_reserve_area(struct e820map *e820)
361 {
362 unsigned long kdump_start = kexec_crash_area.start;
363 unsigned long kdump_size = kexec_crash_area.size;
364 static int is_reserved = 0;
/* Round the region size up to a whole number of pages. */
366 kdump_size = (kdump_size + PAGE_SIZE - 1) & PAGE_MASK;
368 if ( (kdump_start == 0) || (kdump_size == 0) || is_reserved )
369 return;
371 is_reserved = 1;
/* On failure, disable kdump entirely rather than risk a bogus region. */
373 if ( !reserve_e820_ram(e820, kdump_start, kdump_start + kdump_size) )
374 {
375 printk("Kdump: DISABLED (failed to reserve %luMB (%lukB) at 0x%lx)"
376 "\n", kdump_size >> 20, kdump_size >> 10, kdump_start);
377 kexec_crash_area.start = kexec_crash_area.size = 0;
378 }
379 else
380 {
381 printk("Kdump: %luMB (%lukB) at 0x%lx\n",
382 kdump_size >> 20, kdump_size >> 10, kdump_start);
383 }
384 }
/*
 * Final step of boot: poison and release the .init sections, then enter the
 * idle loop. Deliberately NOT __init itself -- it outlives the init section.
 */
386 void init_done(void)
387 {
388 extern char __init_begin[], __init_end[];
390 /* Free (or page-protect) the init areas. */
391 memset(__init_begin, 0xcc, __init_end - __init_begin); /* int3 poison */
392 #ifndef MEMORY_GUARD
393 init_xenheap_pages(__pa(__init_begin), __pa(__init_end));
394 #endif
395 memguard_guard_range(__init_begin, __init_end - __init_begin);
396 #if defined(CONFIG_X86_64)
397 /* Also zap the mapping in the 1:1 area. */
398 memguard_guard_range(__va(__pa(__init_begin)), __init_end - __init_begin);
399 #endif
400 printk("Freed %ldkB init memory.\n", (long)(__init_end-__init_begin)>>10);
/* Never returns. */
402 startup_cpu_idle_loop();
403 }
405 static char * __init cmdline_cook(char *p)
406 {
407 p = p ? : "";
408 while ( *p == ' ' )
409 p++;
410 while ( (*p != ' ') && (*p != '\0') )
411 p++;
412 while ( *p == ' ' )
413 p++;
414 return p;
415 }
/*
 * C entry point of the hypervisor, reached from the assembly boot code with
 * the physical address of the multiboot info structure. Brings the whole
 * system up: console, memory map, relocation, heaps, APs, and finally dom0.
 * NOTE(review): from original line ~1010 onward the viewer this text was
 * extracted from appears to have dropped brace-only and blank lines; the
 * token stream below is preserved as-is from that view -- verify against the
 * repository before compiling.
 */
417 void __init __start_xen(unsigned long mbi_p)
418 {
419 char *memmap_type = NULL;
420 char *cmdline, *kextra;
421 unsigned long _initrd_start = 0, _initrd_len = 0;
422 unsigned int initrdidx = 1;
423 multiboot_info_t *mbi = __va(mbi_p);
424 module_t *mod = (module_t *)__va(mbi->mods_addr);
425 unsigned long nr_pages, modules_length, modules_headroom;
426 int i, j, e820_warn = 0, bytes = 0;
427 struct ns16550_defaults ns16550 = {
428 .data_bits = 8,
429 .parity = 'n',
430 .stop_bits = 1
431 };
/* Install a minimal #PF handler before anything can fault. */
433 extern void early_page_fault(void);
434 set_intr_gate(TRAP_page_fault, &early_page_fault);
436 /* Parse the command-line options. */
437 cmdline = cmdline_cook((mbi->flags & MBI_CMDLINE) ?
438 __va(mbi->cmdline) : NULL);
439 if ( (kextra = strstr(cmdline, " -- ")) != NULL )
440 {
441 /*
442 * Options after ' -- ' separator belong to dom0.
443 * 1. Orphan dom0's options from Xen's command line.
444 * 2. Skip all but final leading space from dom0's options.
445 */
446 *kextra = '\0';
447 kextra += 3;
448 while ( kextra[1] == ' ' ) kextra++;
449 }
450 cmdline_parse(cmdline);
452 parse_video_info();
454 set_current((struct vcpu *)0xfffff000); /* debug sanity */
455 idle_vcpu[0] = current;
456 set_processor_id(0); /* needed early, for smp_processor_id() */
457 if ( cpu_has_efer )
458 rdmsrl(MSR_EFER, this_cpu(efer));
459 asm volatile ( "mov %%cr4,%0" : "=r" (this_cpu(cr4)) );
461 smp_prepare_boot_cpu();
463 /* We initialise the serial devices very early so we can get debugging. */
464 ns16550.io_base = 0x3f8;
465 ns16550.irq = 4;
466 ns16550_init(0, &ns16550);
467 ns16550.io_base = 0x2f8;
468 ns16550.irq = 3;
469 ns16550_init(1, &ns16550);
470 console_init_preirq();
472 printk("Command line: %s\n", cmdline);
474 printk("Video information:\n");
476 /* Print VGA display mode information. */
477 switch ( vga_console_info.video_type )
478 {
479 case XEN_VGATYPE_TEXT_MODE_3:
480 printk(" VGA is text mode %dx%d, font 8x%d\n",
481 vga_console_info.u.text_mode_3.columns,
482 vga_console_info.u.text_mode_3.rows,
483 vga_console_info.u.text_mode_3.font_height);
484 break;
485 case XEN_VGATYPE_VESA_LFB:
486 printk(" VGA is graphics mode %dx%d, %d bpp\n",
487 vga_console_info.u.vesa_lfb.width,
488 vga_console_info.u.vesa_lfb.height,
489 vga_console_info.u.vesa_lfb.bits_per_pixel);
490 break;
491 default:
492 printk(" No VGA detected\n");
493 break;
494 }
496 /* Print VBE/DDC EDID information. */
497 if ( bootsym(boot_edid_caps) != 0x1313 )
498 {
499 u16 caps = bootsym(boot_edid_caps);
500 printk(" VBE/DDC methods:%s%s%s; ",
501 (caps & 1) ? " V1" : "",
502 (caps & 2) ? " V2" : "",
503 !(caps & 3) ? " none" : "");
504 printk("EDID transfer time: %d seconds\n", caps >> 8);
505 if ( *(u32 *)bootsym(boot_edid_info) == 0x13131313 )
506 {
507 printk(" EDID info not retrieved because ");
508 if ( !(caps & 3) )
509 printk("no DDC retrieval method detected\n");
510 else if ( (caps >> 8) > 5 )
511 printk("takes longer than 5 seconds\n");
512 else
513 printk("of reasons unknown\n");
514 }
515 }
517 printk("Disc information:\n");
518 printk(" Found %d MBR signatures\n",
519 bootsym(boot_mbr_signature_nr));
520 printk(" Found %d EDD information structures\n",
521 bootsym(boot_edd_info_nr));
523 /* Check that we have at least one Multiboot module. */
524 if ( !(mbi->flags & MBI_MODULES) || (mbi->mods_count == 0) )
525 EARLY_FAIL("dom0 kernel not specified. "
526 "Check bootloader configuration.\n");
528 if ( ((unsigned long)cpu0_stack & (STACK_SIZE-1)) != 0 )
529 EARLY_FAIL("Misaligned CPU0 stack.\n");
/* Build e820_raw[] from the best memory-map source available. */
531 if ( e820_raw_nr != 0 )
532 {
533 memmap_type = "Xen-e820";
534 }
535 else if ( bootsym(lowmem_kb) )
536 {
537 memmap_type = "Xen-e801";
538 e820_raw[0].addr = 0;
539 e820_raw[0].size = bootsym(lowmem_kb) << 10;
540 e820_raw[0].type = E820_RAM;
541 e820_raw[1].addr = 0x100000;
542 e820_raw[1].size = bootsym(highmem_kb) << 10;
543 e820_raw[1].type = E820_RAM;
544 e820_raw_nr = 2;
545 }
546 else if ( mbi->flags & MBI_MEMMAP )
547 {
548 memmap_type = "Multiboot-e820";
549 while ( (bytes < mbi->mmap_length) && (e820_raw_nr < E820MAX) )
550 {
551 memory_map_t *map = __va(mbi->mmap_addr + bytes);
553 /*
554 * This is a gross workaround for a BIOS bug. Some bootloaders do
555 * not write e820 map entries into pre-zeroed memory. This is
556 * okay if the BIOS fills in all fields of the map entry, but
557 * some broken BIOSes do not bother to write the high word of
558 * the length field if the length is smaller than 4GB. We
559 * detect and fix this by flagging sections below 4GB that
560 * appear to be larger than 4GB in size.
561 */
562 if ( (map->base_addr_high == 0) && (map->length_high != 0) )
563 {
564 if ( !e820_warn )
565 {
566 printk("WARNING: Buggy e820 map detected and fixed "
567 "(truncated length fields).\n");
568 e820_warn = 1;
569 }
570 map->length_high = 0;
571 }
573 e820_raw[e820_raw_nr].addr =
574 ((u64)map->base_addr_high << 32) | (u64)map->base_addr_low;
575 e820_raw[e820_raw_nr].size =
576 ((u64)map->length_high << 32) | (u64)map->length_low;
577 e820_raw[e820_raw_nr].type = map->type;
578 e820_raw_nr++;
/* Multiboot 'size' field excludes itself, hence the +4. */
580 bytes += map->size + 4;
581 }
582 }
583 else if ( mbi->flags & MBI_MEMLIMITS )
584 {
585 memmap_type = "Multiboot-e801";
586 e820_raw[0].addr = 0;
587 e820_raw[0].size = mbi->mem_lower << 10;
588 e820_raw[0].type = E820_RAM;
589 e820_raw[1].addr = 0x100000;
590 e820_raw[1].size = mbi->mem_upper << 10;
591 e820_raw[1].type = E820_RAM;
592 e820_raw_nr = 2;
593 }
594 else
595 {
596 EARLY_FAIL("Bootloader provided no memory information.\n");
597 }
599 /* Sanitise the raw E820 map to produce a final clean version. */
600 max_page = init_e820(memmap_type, e820_raw, &e820_raw_nr);
602 /* Create a temporary copy of the E820 map. */
603 memcpy(&boot_e820, &e820, sizeof(e820));
605 /* Early kexec reservation (explicit static start address). */
606 kexec_reserve_area(&boot_e820);
608 /*
609 * Iterate backwards over all superpage-aligned RAM regions.
610 *
611 * We require superpage alignment because the boot allocator is not yet
612 * initialised. Hence we can only map superpages in the address range
613 * 0 to BOOTSTRAP_DIRECTMAP_END, as this is guaranteed not to require
614 * dynamic allocation of pagetables.
615 *
616 * As well as mapping superpages in that range, in preparation for
617 * initialising the boot allocator, we also look for a region to which
618 * we can relocate the dom0 kernel and other multiboot modules. Also, on
619 * x86/64, we relocate Xen to higher memory.
620 */
621 modules_length = 0;
622 for ( i = 0; i < mbi->mods_count; i++ )
623 modules_length += mod[i].mod_end - mod[i].mod_start;
625 /* ensure mod[0] is mapped before parsing */
626 bootstrap_map(mod[0].mod_start, mod[0].mod_end);
627 modules_headroom = bzimage_headroom(
628 (char *)(unsigned long)mod[0].mod_start,
629 (unsigned long)(mod[0].mod_end - mod[0].mod_start));
631 for ( i = boot_e820.nr_map-1; i >= 0; i-- )
632 {
633 uint64_t s, e, mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
635 /* Superpage-aligned chunks from 16MB to BOOTSTRAP_DIRECTMAP_END. */
636 s = (boot_e820.map[i].addr + mask) & ~mask;
637 e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
638 s = max_t(uint64_t, s, 16 << 20);
639 e = min_t(uint64_t, e, BOOTSTRAP_DIRECTMAP_END);
640 if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
641 continue;
643 /* Map the chunk. No memory will need to be allocated to do this. */
644 map_pages_to_xen(
645 (unsigned long)maddr_to_bootstrap_virt(s),
646 s >> PAGE_SHIFT, (e-s) >> PAGE_SHIFT, PAGE_HYPERVISOR);
648 #if defined(CONFIG_X86_64)
649 #define reloc_size ((__pa(&_end) + mask) & ~mask)
650 /* Is the region suitable for relocating Xen? */
651 if ( !xen_phys_start && ((e-s) >= reloc_size) )
652 {
653 extern l2_pgentry_t l2_xenmap[];
654 l4_pgentry_t *pl4e;
655 l3_pgentry_t *pl3e;
656 l2_pgentry_t *pl2e;
657 int i, j, k;
659 /* Select relocation address. */
660 e -= reloc_size;
661 xen_phys_start = e;
662 bootsym(trampoline_xen_phys_start) = e;
664 /*
665 * Perform relocation to new physical address.
666 * Before doing so we must sync static/global data with main memory
667 * with a barrier(). After this we must *not* modify static/global
668 * data until after we have switched to the relocated pagetables!
669 */
670 barrier();
671 move_memory(e, 0, __pa(&_end) - xen_phys_start);
673 /* Poison low 1MB to detect stray pointers to physical 0-1MB. */
674 memset(maddr_to_bootstrap_virt(e), 0x55, 1U<<20);
676 /* Walk initial pagetables, relocating page directory entries. */
677 pl4e = __va(__pa(idle_pg_table));
678 for ( i = 0 ; i < L4_PAGETABLE_ENTRIES; i++, pl4e++ )
679 {
680 if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
681 continue;
682 *pl4e = l4e_from_intpte(l4e_get_intpte(*pl4e) +
683 xen_phys_start);
684 pl3e = l4e_to_l3e(*pl4e);
685 for ( j = 0; j < L3_PAGETABLE_ENTRIES; j++, pl3e++ )
686 {
687 /* Not present, 1GB mapping, or already relocated? */
688 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ||
689 (l3e_get_flags(*pl3e) & _PAGE_PSE) ||
690 (l3e_get_pfn(*pl3e) > 0x1000) )
691 continue;
692 *pl3e = l3e_from_intpte(l3e_get_intpte(*pl3e) +
693 xen_phys_start);
694 pl2e = l3e_to_l2e(*pl3e);
695 for ( k = 0; k < L2_PAGETABLE_ENTRIES; k++, pl2e++ )
696 {
697 /* Not present, PSE, or already relocated? */
698 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ||
699 (l2e_get_flags(*pl2e) & _PAGE_PSE) ||
700 (l2e_get_pfn(*pl2e) > 0x1000) )
701 continue;
702 *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
703 xen_phys_start);
704 }
705 }
706 }
708 /* The only data mappings to be relocated are in the Xen area. */
709 pl2e = __va(__pa(l2_xenmap));
710 *pl2e++ = l2e_from_pfn(xen_phys_start >> PAGE_SHIFT,
711 PAGE_HYPERVISOR | _PAGE_PSE);
712 for ( i = 1; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
713 {
714 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
715 continue;
716 *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
717 xen_phys_start);
718 }
720 /* Re-sync the stack and then switch to relocated pagetables. */
721 asm volatile (
722 "rep movsb ; " /* re-sync the stack */
723 "movq %%cr4,%%rsi ; "
724 "andb $0x7f,%%sil ; "
725 "movq %%rsi,%%cr4 ; " /* CR4.PGE == 0 */
726 "movq %0,%%cr3 ; " /* CR3 == new pagetables */
727 "orb $0x80,%%sil ; "
728 "movq %%rsi,%%cr4 " /* CR4.PGE == 1 */
729 : : "r" (__pa(idle_pg_table)), "S" (cpu0_stack),
730 "D" (__va(__pa(cpu0_stack))), "c" (STACK_SIZE) : "memory" );
731 }
732 #endif
734 /* Is the region suitable for relocating the multiboot modules? */
735 if ( !initial_images_start && (s < e) &&
736 ((e-s) >= (modules_length+modules_headroom)) )
737 {
738 initial_images_end = e;
739 e = (e - modules_length) & PAGE_MASK;
740 initial_images_start = e;
741 e -= modules_headroom;
742 initial_images_base = e;
743 e += modules_length + modules_headroom;
/* Copy modules highest-first so overlapping moves cannot clobber. */
744 for ( j = mbi->mods_count-1; j >= 0; j-- )
745 {
746 e -= mod[j].mod_end - mod[j].mod_start;
747 move_memory(e, mod[j].mod_start, mod[j].mod_end);
748 mod[j].mod_end += e - mod[j].mod_start;
749 mod[j].mod_start = e;
750 }
751 }
753 if ( !kexec_crash_area.start && (s < e) &&
754 ((e-s) >= kexec_crash_area.size) )
755 {
756 e = (e - kexec_crash_area.size) & PAGE_MASK;
757 kexec_crash_area.start = e;
758 }
759 }
761 if ( !initial_images_start )
762 EARLY_FAIL("Not enough memory to relocate the dom0 kernel image.\n")
763 reserve_e820_ram(&boot_e820, initial_images_base, initial_images_end);
765 #if defined(CONFIG_X86_32)
766 xenheap_initial_phys_start = (PFN_UP(__pa(&_end)) + 1) << PAGE_SHIFT;
767 /* Must pass a single mapped page for populating bootmem_region_list. */
768 init_boot_pages(__pa(&_end), xenheap_initial_phys_start);
769 xenheap_phys_end = DIRECTMAP_MBYTES << 20;
770 #else
771 if ( !xen_phys_start )
772 EARLY_FAIL("Not enough memory to relocate Xen.\n");
773 reserve_e820_ram(&boot_e820, __pa(&_start), __pa(&_end));
774 #endif
776 /* Late kexec reservation (dynamic start address). */
777 kexec_reserve_area(&boot_e820);
779 /*
780 * Walk every RAM region and map it in its entirety (on x86/64, at least)
781 * and notify it to the boot allocator.
782 */
783 for ( i = 0; i < boot_e820.nr_map; i++ )
784 {
785 uint64_t s, e, map_s, map_e, mask = PAGE_SIZE - 1;
787 /* Only page alignment required now. */
788 s = (boot_e820.map[i].addr + mask) & ~mask;
789 e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
790 #if defined(CONFIG_X86_32)
791 s = max_t(uint64_t, s, xenheap_phys_end);
792 #else
793 s = max_t(uint64_t, s, 1<<20);
794 #endif
795 if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
796 continue;
798 /* Need to create mappings above 16MB. */
799 map_s = max_t(uint64_t, s, 16<<20);
800 map_e = e;
801 #if defined(CONFIG_X86_32) /* mappings are truncated on x86_32 */
802 map_e = min_t(uint64_t, map_e, BOOTSTRAP_DIRECTMAP_END);
803 #endif
805 /* Pass mapped memory to allocator /before/ creating new mappings. */
806 init_boot_pages(s, min_t(uint64_t, map_s, e));
808 /* Create new mappings /before/ passing memory to the allocator. */
809 if ( map_s < map_e )
810 map_pages_to_xen(
811 (unsigned long)maddr_to_bootstrap_virt(map_s),
812 map_s >> PAGE_SHIFT, (map_e-map_s) >> PAGE_SHIFT,
813 PAGE_HYPERVISOR);
815 /* Pass remainder of this memory chunk to the allocator. */
816 init_boot_pages(map_s, e);
817 }
819 memguard_init();
821 nr_pages = 0;
822 for ( i = 0; i < e820.nr_map; i++ )
823 if ( e820.map[i].type == E820_RAM )
824 nr_pages += e820.map[i].size >> PAGE_SHIFT;
825 printk("System RAM: %luMB (%lukB)\n",
826 nr_pages >> (20 - PAGE_SHIFT),
827 nr_pages << (PAGE_SHIFT - 10));
828 total_pages = nr_pages;
830 /* Sanity check for unwanted bloat of certain hypercall structures. */
831 BUILD_BUG_ON(sizeof(((struct xen_platform_op *)0)->u) !=
832 sizeof(((struct xen_platform_op *)0)->u.pad));
833 BUILD_BUG_ON(sizeof(((struct xen_domctl *)0)->u) !=
834 sizeof(((struct xen_domctl *)0)->u.pad));
835 BUILD_BUG_ON(sizeof(((struct xen_sysctl *)0)->u) !=
836 sizeof(((struct xen_sysctl *)0)->u.pad));
838 BUILD_BUG_ON(sizeof(start_info_t) > PAGE_SIZE);
839 BUILD_BUG_ON(sizeof(shared_info_t) > PAGE_SIZE);
840 BUILD_BUG_ON(sizeof(struct vcpu_info) != 64);
842 #ifdef CONFIG_COMPAT
843 BUILD_BUG_ON(sizeof(((struct compat_platform_op *)0)->u) !=
844 sizeof(((struct compat_platform_op *)0)->u.pad));
845 BUILD_BUG_ON(sizeof(start_info_compat_t) > PAGE_SIZE);
846 BUILD_BUG_ON(sizeof(struct compat_vcpu_info) != 64);
847 #endif
849 /* Check definitions in public headers match internal defs. */
850 BUILD_BUG_ON(__HYPERVISOR_VIRT_START != HYPERVISOR_VIRT_START);
851 #ifdef HYPERVISOR_VIRT_END
852 BUILD_BUG_ON(__HYPERVISOR_VIRT_END != HYPERVISOR_VIRT_END);
853 #endif
854 BUILD_BUG_ON(MACH2PHYS_VIRT_START != RO_MPT_VIRT_START);
855 BUILD_BUG_ON(MACH2PHYS_VIRT_END != RO_MPT_VIRT_END);
857 init_frametable();
859 acpi_boot_table_init();
861 acpi_numa_init();
863 numa_initmem_init(0, max_page);
865 #if defined(CONFIG_X86_32)
866 /* Initialise the Xen heap. */
867 init_xenheap_pages(xenheap_initial_phys_start, xenheap_phys_end);
868 nr_pages = (xenheap_phys_end - xenheap_initial_phys_start) >> PAGE_SHIFT;
869 printk("Xen heap: %luMB (%lukB)\n",
870 nr_pages >> (20 - PAGE_SHIFT),
871 nr_pages << (PAGE_SHIFT - 10));
872 #endif
874 end_boot_allocator();
875 early_boot = 0;
877 #if defined(CONFIG_X86_64)
878 vesa_init();
879 #endif
881 softirq_init();
883 early_cpu_init();
885 paging_init();
887 tboot_probe();
889 /* Unmap the first page of CPU0's stack. */
890 memguard_guard_stack(cpu0_stack);
892 open_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ, new_tlbflush_clock_period);
894 if ( opt_watchdog )
895 nmi_watchdog = NMI_LOCAL_APIC;
897 sort_exception_tables();
899 find_smp_config();
901 dmi_scan_machine();
903 generic_apic_probe();
905 if ( x2apic_is_available() )
906 enable_x2apic();
908 acpi_boot_init();
910 init_cpu_to_node();
912 if ( smp_found_config )
913 get_smp_config();
915 #ifdef CONFIG_X86_64
916 /* Low mappings were only needed for some BIOS table parsing. */
917 zap_low_mappings();
918 #endif
920 init_apic_mappings();
922 init_IRQ();
924 percpu_init_areas();
926 xsm_init(&initrdidx, mbi, initial_images_start);
928 init_idle_domain();
930 trap_init();
932 rcu_init();
934 timer_init();
936 early_time_init();
938 arch_init_memory();
940 identify_cpu(&boot_cpu_data);
941 if ( cpu_has_fxsr )
942 set_in_cr4(X86_CR4_OSFXSR);
943 if ( cpu_has_xmm )
944 set_in_cr4(X86_CR4_OSXMMEXCPT);
946 local_irq_enable();
948 #ifdef CONFIG_X86_64
949 vesa_mtrr_init();
950 #endif
952 if ( opt_nosmp )
953 max_cpus = 0;
955 smp_prepare_cpus(max_cpus);
957 spin_debug_enable();
959 /*
960 * Initialise higher-level timer functions. We do this fairly late
961 * (post-SMP) because the time bases and scale factors need to be updated
962 * regularly, and SMP initialisation can cause a long delay with
963 * interrupts not yet enabled.
964 */
965 init_xen_time();
967 initialize_keytable();
969 console_init_postirq();
/* Bring up secondary CPUs and wire each into the NUMA topology. */
971 for_each_present_cpu ( i )
972 {
973 if ( num_online_cpus() >= max_cpus )
974 break;
975 if ( !cpu_online(i) )
976 {
977 rcu_online_cpu(i);
978 __cpu_up(i);
979 }
981 /* Set up cpu_to_node[]. */
982 srat_detect_node(i);
983 /* Set up node_to_cpumask based on cpu_to_node[]. */
984 numa_add_cpu(i);
985 }
987 printk("Brought up %ld CPUs\n", (long)num_online_cpus());
988 smp_cpus_done(max_cpus);
990 initialise_gdb(); /* could be moved earlier */
992 do_initcalls();
994 if ( opt_watchdog )
995 watchdog_enable();
997 if ( !tboot_protect_mem_regions() )
998 panic("Could not protect TXT memory regions\n");
1000 /* Create initial domain 0. */
1001 dom0 = domain_create(0, DOMCRF_s3_integrity, DOM0_SSIDREF);
1002 if ( (dom0 == NULL) || (alloc_dom0_vcpu0() == NULL) )
1003 panic("Error creating domain 0\n");
1005 dom0->is_privileged = 1;
1006 dom0->target = NULL;
1008 /* Grab the DOM0 command line. */
1009 cmdline = (char *)(mod[0].string ? __va(mod[0].string) : NULL);
/* NOTE(review): brace-only lines appear missing from here onward. */
1010 if ( (cmdline != NULL) || (kextra != NULL) )
1012 static char dom0_cmdline[MAX_GUEST_CMDLINE];
1014 cmdline = cmdline_cook(cmdline);
1015 safe_strcpy(dom0_cmdline, cmdline);
1017 if ( kextra != NULL )
1018 /* kextra always includes exactly one leading space. */
1019 safe_strcat(dom0_cmdline, kextra);
1021 /* Append any extra parameters. */
1022 if ( skip_ioapic_setup && !strstr(dom0_cmdline, "noapic") )
1023 safe_strcat(dom0_cmdline, " noapic");
1024 if ( acpi_skip_timer_override &&
1025 !strstr(dom0_cmdline, "acpi_skip_timer_override") )
1026 safe_strcat(dom0_cmdline, " acpi_skip_timer_override");
1027 if ( (strlen(acpi_param) == 0) && acpi_disabled )
1029 printk("ACPI is disabled, notifying Domain 0 (acpi=off)\n");
1030 safe_strcpy(acpi_param, "off");
1032 if ( (strlen(acpi_param) != 0) && !strstr(dom0_cmdline, "acpi=") )
1034 safe_strcat(dom0_cmdline, " acpi=");
1035 safe_strcat(dom0_cmdline, acpi_param);
1038 cmdline = dom0_cmdline;
1041 if ( (initrdidx > 0) && (initrdidx < mbi->mods_count) )
1043 _initrd_start = mod[initrdidx].mod_start;
1044 _initrd_len = mod[initrdidx].mod_end - mod[initrdidx].mod_start;
1047 if ( xen_cpuidle )
1048 xen_processor_pmbits |= XEN_PROCESSOR_PM_CX;
1050 /*
1051 * We're going to setup domain0 using the module(s) that we stashed safely
1052 * above our heap. The second module, if present, is an initrd ramdisk.
1053 */
1054 if ( construct_dom0(dom0,
1055 initial_images_base,
1056 initial_images_start,
1057 mod[0].mod_end-mod[0].mod_start,
1058 _initrd_start,
1059 _initrd_len,
1060 cmdline) != 0)
1061 panic("Could not set up DOM0 guest OS\n");
1063 /* Scrub RAM that is still free and so may go to an unprivileged domain. */
1064 scrub_heap_pages();
1066 init_trace_bufs();
1068 init_tmem();
1070 console_endboot();
1072 /* Hide UART from DOM0 if we're using it */
1073 serial_endboot();
1075 domain_unpause_by_systemcontroller(dom0);
/* Discard the boot stack and continue, post-init, in init_done(). */
1077 reset_stack_and_jump(init_done);
/*
 * Fill @info with the space-separated list of guest ABIs this hypervisor
 * can run ("xen-3.0-*" PV variants, plus "hvm-3.0-*" when HVM is enabled).
 * NOTE(review): brace-only lines appear to have been dropped from this view;
 * verify against the repository before compiling.
 */
1080 void arch_get_xen_caps(xen_capabilities_info_t *info)
1082 /* Interface name is always xen-3.0-* for Xen-3.x. */
1083 int major = 3, minor = 0;
1084 char s[32];
1086 (*info)[0] = '\0';
1088 #if defined(CONFIG_X86_32)
1090 snprintf(s, sizeof(s), "xen-%d.%d-x86_32p ", major, minor);
1091 safe_strcat(*info, s);
1092 if ( hvm_enabled )
1094 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
1095 safe_strcat(*info, s);
1096 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32p ", major, minor);
1097 safe_strcat(*info, s);
1100 #elif defined(CONFIG_X86_64)
1102 snprintf(s, sizeof(s), "xen-%d.%d-x86_64 ", major, minor);
1103 safe_strcat(*info, s);
1104 #ifdef CONFIG_COMPAT
1105 snprintf(s, sizeof(s), "xen-%d.%d-x86_32p ", major, minor);
1106 safe_strcat(*info, s);
1107 #endif
1108 if ( hvm_enabled )
1110 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
1111 safe_strcat(*info, s);
1112 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32p ", major, minor);
1113 safe_strcat(*info, s);
1114 snprintf(s, sizeof(s), "hvm-%d.%d-x86_64 ", major, minor);
1115 safe_strcat(*info, s);
1118 #endif
/*
 * Return 1 iff [start, end) overlaps any physical region occupied by the
 * hypervisor itself (trampoline, text+data, per-CPU areas, bss); 0 otherwise.
 * The region table is computed lazily on first call and cached.
 * NOTE(review): brace-only lines appear to have been dropped from this view;
 * verify against the repository before compiling.
 */
1121 int xen_in_range(paddr_t start, paddr_t end)
1123 int i;
1124 static struct {
1125 paddr_t s, e;
1126 } xen_regions[4];
1128 /* initialize first time */
1129 if ( !xen_regions[0].s )
1131 extern char __init_begin[], __bss_start[];
1133 /* S3 resume code (and other real mode trampoline code) */
1134 xen_regions[0].s = bootsym_phys(trampoline_start);
1135 xen_regions[0].e = bootsym_phys(trampoline_end);
1136 /* hypervisor code + data */
1137 xen_regions[1].s =__pa(&_stext);
1138 xen_regions[1].e = __pa(&__init_begin);
1139 /* per-cpu data */
1140 xen_regions[2].s = __pa(&__per_cpu_start);
1141 xen_regions[2].e = xen_regions[2].s +
1142 (((paddr_t)last_cpu(cpu_possible_map) + 1) << PERCPU_SHIFT);
1143 /* bss */
1144 xen_regions[3].s = __pa(&__bss_start);
1145 xen_regions[3].e = __pa(&_end);
/* Standard half-open interval overlap test against each region. */
1148 for ( i = 0; i < ARRAY_SIZE(xen_regions); i++ )
1150 if ( (start < xen_regions[i].e) && (end > xen_regions[i].s) )
1151 return 1;
1154 return 0;
1157 /*
1158 * Local variables:
1159 * mode: C
1160 * c-set-style: "BSD"
1161 * c-basic-offset: 4
1162 * tab-width: 4
1163 * indent-tabs-mode: nil
1164 * End:
1165 */