debuggers.hg

view xen/arch/x86/x86_32/traps.c @ 19964:3952eaeb70b0

Introduce and use a per-CPU read-mostly sub-section

Mixing data that is set up once and then (perhaps frequently) read by
remote CPUs with data that the local CPU may modify (again, perhaps
frequently) still causes undesirable cache-coherency-protocol bus
traffic, so separate the former class of objects from the latter.

The objects converted here were picked simply for their write-once
(or write-very-rarely) properties; further adjustments may be
desirable later. The primary users of the new sub-section will be
introduced by the next patch.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Jul 13 11:32:41 2009 +0100 (2009-07-13)
parents 7406764457a0
children 62b7fc245d1f
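
For context, the pattern this changeset introduces is sketched below. This is a
minimal, self-contained illustration, not the actual Xen headers: the section
names, the per_cpu__ name prefix, the shape of the DEFINE_PER_CPU macros, and
the irq_count variable are assumptions made for the sketch; only the macro name
DEFINE_PER_CPU_READ_MOSTLY and the doublefault_tss object appear in the file
itself. The point is that write-once/read-mostly per-CPU objects are emitted
into a dedicated sub-section, so they do not share cache lines with per-CPU
data that the owning CPU rewrites frequently.

/* Hypothetical stand-alone sketch of a read-mostly per-CPU sub-section. */
#include <stdio.h>

#define __section(s) __attribute__((__section__(s)))

/* Frequently-written per-CPU data: normal per-CPU section (illustrative name). */
#define DEFINE_PER_CPU(type, name) \
    __section(".data.percpu") __typeof__(type) per_cpu__##name

/* Write-once/read-mostly per-CPU data: its own sub-section, which the linker
 * script can keep grouped (and therefore cache-line separated) within the
 * per-CPU area. */
#define DEFINE_PER_CPU_READ_MOSTLY(type, name) \
    __section(".data.percpu.read_mostly") __typeof__(type) per_cpu__##name

struct tss_struct;                                    /* opaque for the sketch */

DEFINE_PER_CPU_READ_MOSTLY(struct tss_struct *, doublefault_tss) = NULL;
DEFINE_PER_CPU(unsigned long, irq_count) = 0;         /* hypothetical variable */

int main(void)
{
    /* Objects in the same sub-section are laid out contiguously, away from
     * the frequently-written per-CPU data. */
    printf("read-mostly object at %p, frequently-written object at %p\n",
           (void *)&per_cpu__doublefault_tss, (void *)&per_cpu__irq_count);
    return 0;
}

In the file below, the same macro is used for doublefault_tss, which each CPU
writes once in subarch_percpu_traps_init() and only reads afterwards.
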
line source

#include <xen/config.h>
#include <xen/version.h>
#include <xen/domain_page.h>
#include <xen/init.h>
#include <xen/sched.h>
#include <xen/lib.h>
#include <xen/console.h>
#include <xen/mm.h>
#include <xen/irq.h>
#include <xen/symbols.h>
#include <xen/shutdown.h>
#include <xen/nmi.h>
#include <asm/current.h>
#include <asm/flushtlb.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>

#include <public/callback.h>

static void print_xen_info(void)
{
    char taint_str[TAINT_STRING_MAX_LEN];
    char debug = 'n', *arch = "x86_32p";

#ifndef NDEBUG
    debug = 'y';
#endif

    printk("----[ Xen-%d.%d%s %s debug=%c %s ]----\n",
           xen_major_version(), xen_minor_version(), xen_extra_version(),
           arch, debug, print_tainted(taint_str));
}

enum context { CTXT_hypervisor, CTXT_pv_guest, CTXT_hvm_guest };

static void _show_registers(
    const struct cpu_user_regs *regs, unsigned long crs[8],
    enum context context, const struct vcpu *v)
{
    const static char *context_names[] = {
        [CTXT_hypervisor] = "hypervisor",
        [CTXT_pv_guest] = "pv guest",
        [CTXT_hvm_guest] = "hvm guest"
    };

    printk("EIP: %04x:[<%08x>]", regs->cs, regs->eip);
    if ( context == CTXT_hypervisor )
        print_symbol(" %s", regs->eip);
    printk("\nEFLAGS: %08x ", regs->eflags);
    if ( (context == CTXT_pv_guest) && v && v->vcpu_info )
        printk("EM: %d ", !!v->vcpu_info->evtchn_upcall_mask);
    printk("CONTEXT: %s\n", context_names[context]);

    printk("eax: %08x ebx: %08x ecx: %08x edx: %08x\n",
           regs->eax, regs->ebx, regs->ecx, regs->edx);
    printk("esi: %08x edi: %08x ebp: %08x esp: %08x\n",
           regs->esi, regs->edi, regs->ebp, regs->esp);
    printk("cr0: %08lx cr4: %08lx cr3: %08lx cr2: %08lx\n",
           crs[0], crs[4], crs[3], crs[2]);
    printk("ds: %04x es: %04x fs: %04x gs: %04x "
           "ss: %04x cs: %04x\n",
           regs->ds, regs->es, regs->fs,
           regs->gs, regs->ss, regs->cs);
}

void show_registers(struct cpu_user_regs *regs)
{
    struct cpu_user_regs fault_regs = *regs;
    unsigned long fault_crs[8];
    enum context context;
    struct vcpu *v = current;

    if ( is_hvm_vcpu(v) && guest_mode(regs) )
    {
        struct segment_register sreg;
        context = CTXT_hvm_guest;
        fault_crs[0] = v->arch.hvm_vcpu.guest_cr[0];
        fault_crs[2] = v->arch.hvm_vcpu.guest_cr[2];
        fault_crs[3] = v->arch.hvm_vcpu.guest_cr[3];
        fault_crs[4] = v->arch.hvm_vcpu.guest_cr[4];
        hvm_get_segment_register(v, x86_seg_cs, &sreg);
        fault_regs.cs = sreg.sel;
        hvm_get_segment_register(v, x86_seg_ds, &sreg);
        fault_regs.ds = sreg.sel;
        hvm_get_segment_register(v, x86_seg_es, &sreg);
        fault_regs.es = sreg.sel;
        hvm_get_segment_register(v, x86_seg_fs, &sreg);
        fault_regs.fs = sreg.sel;
        hvm_get_segment_register(v, x86_seg_gs, &sreg);
        fault_regs.gs = sreg.sel;
        hvm_get_segment_register(v, x86_seg_ss, &sreg);
        fault_regs.ss = sreg.sel;
    }
    else
    {
        if ( !guest_mode(regs) )
        {
            context = CTXT_hypervisor;
            fault_regs.esp = (unsigned long)&regs->esp;
            fault_regs.ss = read_segment_register(ss);
            fault_regs.ds = read_segment_register(ds);
            fault_regs.es = read_segment_register(es);
            fault_regs.fs = read_segment_register(fs);
            fault_regs.gs = read_segment_register(gs);
            fault_crs[2] = read_cr2();
        }
        else
        {
            context = CTXT_pv_guest;
            fault_crs[2] = v->vcpu_info->arch.cr2;
        }

        fault_crs[0] = read_cr0();
        fault_crs[3] = read_cr3();
        fault_crs[4] = read_cr4();
    }

    print_xen_info();
    printk("CPU: %d\n", smp_processor_id());
    _show_registers(&fault_regs, fault_crs, context, v);

    if ( this_cpu(ler_msr) && !guest_mode(regs) )
    {
        u32 from, to, hi;
        rdmsr(this_cpu(ler_msr), from, hi);
        rdmsr(this_cpu(ler_msr) + 1, to, hi);
        printk("ler: %08x -> %08x\n", from, to);
    }
}

void vcpu_show_registers(const struct vcpu *v)
{
    unsigned long crs[8];

    /* No need to handle HVM for now. */
    if ( is_hvm_vcpu(v) )
        return;

    crs[0] = v->arch.guest_context.ctrlreg[0];
    crs[2] = v->vcpu_info->arch.cr2;
    crs[3] = pagetable_get_paddr(v->arch.guest_table);
    crs[4] = v->arch.guest_context.ctrlreg[4];

    _show_registers(&v->arch.guest_context.user_regs, crs, CTXT_pv_guest, v);
}

void show_page_walk(unsigned long addr)
{
    unsigned long pfn, mfn, cr3 = read_cr3();
    l3_pgentry_t l3e, *l3t;
    l2_pgentry_t l2e, *l2t;
    l1_pgentry_t l1e, *l1t;

    printk("Pagetable walk from %08lx:\n", addr);

    mfn = cr3 >> PAGE_SHIFT;

    l3t = map_domain_page(mfn);
    l3t += (cr3 & 0xFE0UL) >> 3;
    l3e = l3t[l3_table_offset(addr)];
    mfn = l3e_get_pfn(l3e);
    pfn = mfn_valid(mfn) ? get_gpfn_from_mfn(mfn) : INVALID_M2P_ENTRY;
    printk(" L3[0x%03lx] = %"PRIpte" %08lx\n",
           l3_table_offset(addr), l3e_get_intpte(l3e), pfn);
    unmap_domain_page(l3t);
    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
        return;

    l2t = map_domain_page(mfn);
    l2e = l2t[l2_table_offset(addr)];
    mfn = l2e_get_pfn(l2e);
    pfn = mfn_valid(mfn) ? get_gpfn_from_mfn(mfn) : INVALID_M2P_ENTRY;
    printk(" L2[0x%03lx] = %"PRIpte" %08lx %s\n",
           l2_table_offset(addr), l2e_get_intpte(l2e), pfn,
           (l2e_get_flags(l2e) & _PAGE_PSE) ? "(PSE)" : "");
    unmap_domain_page(l2t);
    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
         (l2e_get_flags(l2e) & _PAGE_PSE) )
        return;

    l1t = map_domain_page(mfn);
    l1e = l1t[l1_table_offset(addr)];
    mfn = l1e_get_pfn(l1e);
    pfn = mfn_valid(mfn) ? get_gpfn_from_mfn(mfn) : INVALID_M2P_ENTRY;
    printk(" L1[0x%03lx] = %"PRIpte" %08lx\n",
           l1_table_offset(addr), l1e_get_intpte(l1e), pfn);
    unmap_domain_page(l1t);
}

DEFINE_PER_CPU_READ_MOSTLY(struct tss_struct *, doublefault_tss);
static unsigned char __attribute__ ((__section__ (".bss.page_aligned")))
    boot_cpu_doublefault_space[PAGE_SIZE];

asmlinkage void do_double_fault(void)
{
    struct tss_struct *tss;
    unsigned int cpu;

    watchdog_disable();

    console_force_unlock();

    /*
     * Get the CPU number from the segment limit of the per-CPU GDT entry
     * ("lsll"), avoiding any reliance on the possibly-corrupt stack.
     */
    asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );

    /* Find information saved during fault and dump it to the console. */
    tss = &per_cpu(init_tss, cpu);
    printk("*** DOUBLE FAULT ***\n");
    print_xen_info();
    printk("CPU: %d\nEIP: %04x:[<%08x>]",
           cpu, tss->cs, tss->eip);
    print_symbol(" %s\n", tss->eip);
    printk("EFLAGS: %08x\n", tss->eflags);
    printk("CR3: %08x\n", tss->__cr3);
    printk("eax: %08x ebx: %08x ecx: %08x edx: %08x\n",
           tss->eax, tss->ebx, tss->ecx, tss->edx);
    printk("esi: %08x edi: %08x ebp: %08x esp: %08x\n",
           tss->esi, tss->edi, tss->ebp, tss->esp);
    printk("ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n",
           tss->ds, tss->es, tss->fs, tss->gs, tss->ss);
    show_stack_overflow(cpu, tss->esp);

    panic("DOUBLE FAULT -- system shutdown\n");
}

unsigned long do_iret(void)
{
    struct cpu_user_regs *regs = guest_cpu_user_regs();
    struct vcpu *v = current;
    u32 eflags;

    /* Check worst-case stack frame for overlap with Xen protected area. */
    if ( unlikely(!access_ok(regs->esp, 40)) )
        goto exit_and_crash;

    /* Pop and restore EAX (clobbered by hypercall). */
    if ( unlikely(__copy_from_user(&regs->eax, (void *)regs->esp, 4)) )
        goto exit_and_crash;
    regs->esp += 4;

    /* Pop and restore CS and EIP. */
    if ( unlikely(__copy_from_user(&regs->eip, (void *)regs->esp, 8)) )
        goto exit_and_crash;
    regs->esp += 8;

    /*
     * Pop, fix up and restore EFLAGS. We fix up in a local staging area
     * to avoid firing the BUG_ON(IOPL) check in arch_get_info_guest.
     */
    if ( unlikely(__copy_from_user(&eflags, (void *)regs->esp, 4)) )
        goto exit_and_crash;
    regs->esp += 4;
    regs->eflags = (eflags & ~X86_EFLAGS_IOPL) | X86_EFLAGS_IF;

    if ( vm86_mode(regs) )
    {
        /* Return to VM86 mode: pop and restore ESP,SS,ES,DS,FS and GS. */
        if ( __copy_from_user(&regs->esp, (void *)regs->esp, 24) )
            goto exit_and_crash;
    }
    else if ( unlikely(ring_0(regs)) )
    {
        goto exit_and_crash;
    }
    else if ( !ring_1(regs) )
    {
        /* Return to ring 2/3: pop and restore ESP and SS. */
        if ( __copy_from_user(&regs->esp, (void *)regs->esp, 8) )
            goto exit_and_crash;
    }

    /* Restore affinity. */
    if ( (v->trap_priority >= VCPU_TRAP_NMI)
         && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity) )
        vcpu_set_affinity(v, &v->cpu_affinity_tmp);

    /* Restore previous trap priority */
    v->trap_priority = v->old_trap_priority;

    /* Restore upcall mask from supplied EFLAGS.IF. */
    vcpu_info(v, evtchn_upcall_mask) = !(eflags & X86_EFLAGS_IF);

    /*
     * The hypercall exit path will overwrite EAX with this return
     * value.
     */
    return regs->eax;

 exit_and_crash:
    gdprintk(XENLOG_ERR, "Fatal error\n");
    domain_crash(v->domain);
    return 0;
}

static void set_task_gate(unsigned int n, unsigned int sel)
{
    idt_table[n].b = 0;
    wmb(); /* disable gate /then/ rewrite */
    idt_table[n].a = sel << 16;
    wmb(); /* rewrite /then/ enable gate */
    idt_table[n].b = 0x8500;
}

void __devinit subarch_percpu_traps_init(void)
{
    struct tss_struct *tss = this_cpu(doublefault_tss);
    asmlinkage int hypercall(void);

    if ( !tss )
    {
        /* The hypercall entry vector is only accessible from ring 1. */
        _set_gate(idt_table+HYPERCALL_VECTOR, 14, 1, &hypercall);

        tss = (void *)boot_cpu_doublefault_space;
        this_cpu(doublefault_tss) = tss;
    }

    /*
     * Make a separate task for double faults. This will get us debug output if
     * we blow the kernel stack.
     */
    tss->ds = __HYPERVISOR_DS;
    tss->es = __HYPERVISOR_DS;
    tss->ss = __HYPERVISOR_DS;
    tss->esp = (unsigned long)tss + PAGE_SIZE;
    tss->__cr3 = __pa(idle_pg_table);
    tss->cs = __HYPERVISOR_CS;
    tss->eip = (unsigned long)do_double_fault;
    tss->eflags = 2;
    tss->bitmap = IOBMP_INVALID_OFFSET;
    _set_tssldt_desc(
        this_cpu(gdt_table) + DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
        (unsigned long)tss, 235, 9);

    set_task_gate(TRAP_double_fault, DOUBLEFAULT_TSS_ENTRY << 3);
}

void init_int80_direct_trap(struct vcpu *v)
{
    struct trap_info *ti = &v->arch.guest_context.trap_ctxt[0x80];

    /*
     * We can't virtualise interrupt gates, as there's no way to get
     * the CPU to automatically clear the events_mask variable. Also we
     * must ensure that the CS is safe to poke into an interrupt gate.
     *
     * When running with supervisor_mode_kernel enabled a direct trap
     * to the guest OS cannot be used because the INT instruction will
     * switch to the Xen stack and we need to swap back to the guest
     * kernel stack before passing control to the system call entry point.
     */
    if ( TI_GET_IF(ti) || !guest_gate_selector_okay(v->domain, ti->cs) ||
         supervisor_mode_kernel )
    {
        v->arch.int80_desc.a = v->arch.int80_desc.b = 0;
        return;
    }

    v->arch.int80_desc.a = (ti->cs << 16) | (ti->address & 0xffff);
    v->arch.int80_desc.b =
        (ti->address & 0xffff0000) | 0x8f00 | ((TI_GET_DPL(ti) & 3) << 13);

    if ( v == current )
        set_int80_direct_trap(v);
}

#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
static void do_update_sysenter(void *info)
{
    xen_callback_t *address = info;

    wrmsr(MSR_IA32_SYSENTER_CS, address->cs, 0);
    wrmsr(MSR_IA32_SYSENTER_EIP, address->eip, 0);
}
#endif

static long register_guest_callback(struct callback_register *reg)
{
    long ret = 0;
    struct vcpu *v = current;

    fixup_guest_code_selector(v->domain, reg->address.cs);

    switch ( reg->type )
    {
    case CALLBACKTYPE_event:
        v->arch.guest_context.event_callback_cs = reg->address.cs;
        v->arch.guest_context.event_callback_eip = reg->address.eip;
        break;

    case CALLBACKTYPE_failsafe:
        v->arch.guest_context.failsafe_callback_cs = reg->address.cs;
        v->arch.guest_context.failsafe_callback_eip = reg->address.eip;
        if ( reg->flags & CALLBACKF_mask_events )
            set_bit(_VGCF_failsafe_disables_events,
                    &v->arch.guest_context.flags);
        else
            clear_bit(_VGCF_failsafe_disables_events,
                      &v->arch.guest_context.flags);
        break;

#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
    case CALLBACKTYPE_sysenter_deprecated:
        if ( !cpu_has_sep )
            ret = -EINVAL;
        else if ( on_each_cpu(do_update_sysenter, &reg->address, 1) != 0 )
            ret = -EIO;
        break;

    case CALLBACKTYPE_sysenter:
        if ( !cpu_has_sep )
            ret = -EINVAL;
        else
            do_update_sysenter(&reg->address);
        break;
#endif

    case CALLBACKTYPE_nmi:
        ret = register_guest_nmi_callback(reg->address.eip);
        break;

    default:
        ret = -ENOSYS;
        break;
    }

    return ret;
}

static long unregister_guest_callback(struct callback_unregister *unreg)
{
    long ret;

    switch ( unreg->type )
    {
    case CALLBACKTYPE_event:
    case CALLBACKTYPE_failsafe:
#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
    case CALLBACKTYPE_sysenter_deprecated:
    case CALLBACKTYPE_sysenter:
#endif
        ret = -EINVAL;
        break;

    case CALLBACKTYPE_nmi:
        ret = unregister_guest_nmi_callback();
        break;

    default:
        ret = -ENOSYS;
        break;
    }

    return ret;
}

long do_callback_op(int cmd, XEN_GUEST_HANDLE(const_void) arg)
{
    long ret;

    switch ( cmd )
    {
    case CALLBACKOP_register:
    {
        struct callback_register reg;

        ret = -EFAULT;
        if ( copy_from_guest(&reg, arg, 1) )
            break;

        ret = register_guest_callback(&reg);
    }
    break;

    case CALLBACKOP_unregister:
    {
        struct callback_unregister unreg;

        ret = -EFAULT;
        if ( copy_from_guest(&unreg, arg, 1) )
            break;

        ret = unregister_guest_callback(&unreg);
    }
    break;

    default:
        ret = -ENOSYS;
        break;
    }

    return ret;
}

long do_set_callbacks(unsigned long event_selector,
                      unsigned long event_address,
                      unsigned long failsafe_selector,
                      unsigned long failsafe_address)
{
    struct callback_register event = {
        .type = CALLBACKTYPE_event,
        .address = { event_selector, event_address },
    };
    struct callback_register failsafe = {
        .type = CALLBACKTYPE_failsafe,
        .address = { failsafe_selector, failsafe_address },
    };

    register_guest_callback(&event);
    register_guest_callback(&failsafe);

    return 0;
}

static void hypercall_page_initialise_ring0_kernel(void *hypercall_page)
{
    extern asmlinkage int hypercall(void);
    char *p;
    int i;

    /* Fill in all the transfer points with template machine code. */

    for ( i = 0; i < (PAGE_SIZE / 32); i++ )
    {
        p = (char *)(hypercall_page + (i * 32));

        *(u8 *)(p+ 0) = 0x9c;      /* pushf */
        *(u8 *)(p+ 1) = 0xfa;      /* cli */
        *(u8 *)(p+ 2) = 0xb8;      /* mov $<i>,%eax */
        *(u32 *)(p+ 3) = i;
        *(u8 *)(p+ 7) = 0x9a;      /* lcall $__HYPERVISOR_CS,&hypercall */
        *(u32 *)(p+ 8) = (u32)&hypercall;
        *(u16 *)(p+12) = (u16)__HYPERVISOR_CS;
        *(u8 *)(p+14) = 0xc3;      /* ret */
    }

    /*
     * HYPERVISOR_iret is special because it doesn't return and expects a
     * special stack frame. Guests jump at this transfer point instead of
     * calling it.
     */
    p = (char *)(hypercall_page + (__HYPERVISOR_iret * 32));
    *(u8 *)(p+ 0) = 0x50;          /* push %eax */
    *(u8 *)(p+ 1) = 0x9c;          /* pushf */
    *(u8 *)(p+ 2) = 0xfa;          /* cli */
    *(u8 *)(p+ 3) = 0xb8;          /* mov $<i>,%eax */
    *(u32 *)(p+ 4) = __HYPERVISOR_iret;
    *(u8 *)(p+ 8) = 0x9a;          /* lcall $__HYPERVISOR_CS,&hypercall */
    *(u32 *)(p+ 9) = (u32)&hypercall;
    *(u16 *)(p+13) = (u16)__HYPERVISOR_CS;
}

static void hypercall_page_initialise_ring1_kernel(void *hypercall_page)
{
    char *p;
    int i;

    /* Fill in all the transfer points with template machine code. */

    for ( i = 0; i < (PAGE_SIZE / 32); i++ )
    {
        p = (char *)(hypercall_page + (i * 32));
        *(u8 *)(p+ 0) = 0xb8;      /* mov $<i>,%eax */
        *(u32 *)(p+ 1) = i;
        *(u16 *)(p+ 5) = 0x82cd;   /* int $0x82 */
        *(u8 *)(p+ 7) = 0xc3;      /* ret */
    }

    /*
     * HYPERVISOR_iret is special because it doesn't return and expects a
     * special stack frame. Guests jump at this transfer point instead of
     * calling it.
     */
    p = (char *)(hypercall_page + (__HYPERVISOR_iret * 32));
    *(u8 *)(p+ 0) = 0x50;          /* push %eax */
    *(u8 *)(p+ 1) = 0xb8;          /* mov $__HYPERVISOR_iret,%eax */
    *(u32 *)(p+ 2) = __HYPERVISOR_iret;
    *(u16 *)(p+ 6) = 0x82cd;       /* int $0x82 */
}

void hypercall_page_initialise(struct domain *d, void *hypercall_page)
{
    memset(hypercall_page, 0xCC, PAGE_SIZE);
    if ( is_hvm_domain(d) )
        hvm_hypercall_page_initialise(d, hypercall_page);
    else if ( supervisor_mode_kernel )
        hypercall_page_initialise_ring0_kernel(hypercall_page);
    else
        hypercall_page_initialise_ring1_kernel(hypercall_page);
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */