debuggers.hg

view xen/arch/x86/x86_64/traps.c @ 16959:ed8ab1a36b09

x86-64: use 1GB pages in 1:1 mapping if available

At the same time, adjust the 2/4Mb page handling slightly in a few
places (to match the newly added code):
- when re-creating a large page mapping after finding that all small
  page mappings in the respective area are using identical flags and
  suitable MFNs, the virtual address was already incremented past the
  area to be dealt with, which needs to be accounted for in the
  invocation of flush_area() in that path
- don't or-in/and-out _PAGE_PSE on non-present pages
- when comparing flags, try to minimise the number of l1f_to_lNf()/
  lNf_to_l1f() instances used
- instead of skipping a single page when encountering a big page
  mapping equal to what a small page mapping would establish, skip
  to the next larger page boundary (see the sketch below)
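
As an illustration of the last point, a minimal sketch of the boundary-skip
logic (not the actual mm.c hunk; the names ol2e, virt, mfn and nr_mfns and
the surrounding map_pages_to_xen()-style loop are assumed here for
illustration):

    /* An existing 2Mb superpage already provides the requested mapping:
     * advance to the end of the superpage rather than by one 4k page. */
    if ( (l2e_get_flags(ol2e) & (_PAGE_PRESENT | _PAGE_PSE)) ==
         (_PAGE_PRESENT | _PAGE_PSE) /* ...and flags/MFNs match (elided) */ )
    {
        unsigned long skip = L1_PAGETABLE_ENTRIES -
            ((virt >> PAGE_SHIFT) & (L1_PAGETABLE_ENTRIES - 1));

        if ( skip > nr_mfns )
            skip = nr_mfns;
        virt    += skip << PAGE_SHIFT;
        mfn     += skip;
        nr_mfns -= skip;
        continue;
    }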

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Jan 28 10:17:05 2008 +0000 (2008-01-28)
parents 1a2f557448cf
children 6b1795ee1b19
#include <xen/config.h>
#include <xen/version.h>
#include <xen/init.h>
#include <xen/sched.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/mm.h>
#include <xen/irq.h>
#include <xen/symbols.h>
#include <xen/console.h>
#include <xen/sched.h>
#include <xen/shutdown.h>
#include <xen/nmi.h>
#include <asm/current.h>
#include <asm/flushtlb.h>
#include <asm/msr.h>
#include <asm/page.h>
#include <asm/shared.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>
#include <public/callback.h>

asmlinkage void syscall_enter(void);
asmlinkage void sysenter_entry(void);
asmlinkage void compat_hypercall(void);
asmlinkage void int80_direct_trap(void);

static void print_xen_info(void)
{
    char taint_str[TAINT_STRING_MAX_LEN];
    char debug = 'n';

#ifndef NDEBUG
    debug = 'y';
#endif

    printk("----[ Xen-%d.%d%s x86_64 debug=%c %s ]----\n",
           xen_major_version(), xen_minor_version(), xen_extra_version(),
           debug, print_tainted(taint_str));
}

void show_registers(struct cpu_user_regs *regs)
{
    struct cpu_user_regs fault_regs = *regs;
    unsigned long fault_crs[8];
    const char *context;
    struct vcpu *v = current;

    if ( is_hvm_vcpu(v) && guest_mode(regs) )
    {
        struct segment_register sreg;
        context = "hvm";
        fault_crs[0] = v->arch.hvm_vcpu.guest_cr[0];
        fault_crs[2] = v->arch.hvm_vcpu.guest_cr[2];
        fault_crs[3] = v->arch.hvm_vcpu.guest_cr[3];
        fault_crs[4] = v->arch.hvm_vcpu.guest_cr[4];
        hvm_get_segment_register(v, x86_seg_cs, &sreg);
        fault_regs.cs = sreg.sel;
        hvm_get_segment_register(v, x86_seg_ds, &sreg);
        fault_regs.ds = sreg.sel;
        hvm_get_segment_register(v, x86_seg_es, &sreg);
        fault_regs.es = sreg.sel;
        hvm_get_segment_register(v, x86_seg_fs, &sreg);
        fault_regs.fs = sreg.sel;
        hvm_get_segment_register(v, x86_seg_gs, &sreg);
        fault_regs.gs = sreg.sel;
        hvm_get_segment_register(v, x86_seg_ss, &sreg);
        fault_regs.ss = sreg.sel;
    }
    else
    {
        if ( guest_mode(regs) )
        {
            context = "guest";
            fault_crs[2] = arch_get_cr2(v);
        }
        else
        {
            context = "hypervisor";
            fault_crs[2] = read_cr2();
        }

        fault_crs[0] = read_cr0();
        fault_crs[3] = read_cr3();
        fault_crs[4] = read_cr4();
        fault_regs.ds = read_segment_register(ds);
        fault_regs.es = read_segment_register(es);
        fault_regs.fs = read_segment_register(fs);
        fault_regs.gs = read_segment_register(gs);
    }

    print_xen_info();
    printk("CPU: %d\nRIP: %04x:[<%016lx>]",
           smp_processor_id(), fault_regs.cs, fault_regs.rip);
    if ( !guest_mode(regs) )
        print_symbol(" %s", fault_regs.rip);
    printk("\nRFLAGS: %016lx CONTEXT: %s\n", fault_regs.rflags, context);
    printk("rax: %016lx rbx: %016lx rcx: %016lx\n",
           fault_regs.rax, fault_regs.rbx, fault_regs.rcx);
    printk("rdx: %016lx rsi: %016lx rdi: %016lx\n",
           fault_regs.rdx, fault_regs.rsi, fault_regs.rdi);
    printk("rbp: %016lx rsp: %016lx r8: %016lx\n",
           fault_regs.rbp, fault_regs.rsp, fault_regs.r8);
    printk("r9: %016lx r10: %016lx r11: %016lx\n",
           fault_regs.r9, fault_regs.r10, fault_regs.r11);
    printk("r12: %016lx r13: %016lx r14: %016lx\n",
           fault_regs.r12, fault_regs.r13, fault_regs.r14);
    printk("r15: %016lx cr0: %016lx cr4: %016lx\n",
           fault_regs.r15, fault_crs[0], fault_crs[4]);
    printk("cr3: %016lx cr2: %016lx\n", fault_crs[3], fault_crs[2]);
    printk("ds: %04x es: %04x fs: %04x gs: %04x "
           "ss: %04x cs: %04x\n",
           fault_regs.ds, fault_regs.es, fault_regs.fs,
           fault_regs.gs, fault_regs.ss, fault_regs.cs);

    if ( this_cpu(ler_msr) && !guest_mode(regs) )
    {
        u64 from, to;
        rdmsrl(this_cpu(ler_msr), from);
        rdmsrl(this_cpu(ler_msr) + 1, to);
        printk("ler: %016lx -> %016lx\n", from, to);
    }
}

void show_page_walk(unsigned long addr)
{
    unsigned long pfn, mfn = read_cr3() >> PAGE_SHIFT;
    l4_pgentry_t l4e, *l4t;
    l3_pgentry_t l3e, *l3t;
    l2_pgentry_t l2e, *l2t;
    l1_pgentry_t l1e, *l1t;

    printk("Pagetable walk from %016lx:\n", addr);

    l4t = mfn_to_virt(mfn);
    l4e = l4t[l4_table_offset(addr)];
    mfn = l4e_get_pfn(l4e);
    pfn = mfn_valid(mfn) ? get_gpfn_from_mfn(mfn) : INVALID_M2P_ENTRY;
    printk(" L4[0x%03lx] = %"PRIpte" %016lx\n",
           l4_table_offset(addr), l4e_get_intpte(l4e), pfn);
    if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
        return;

    l3t = mfn_to_virt(mfn);
    l3e = l3t[l3_table_offset(addr)];
    mfn = l3e_get_pfn(l3e);
    pfn = mfn_valid(mfn) ? get_gpfn_from_mfn(mfn) : INVALID_M2P_ENTRY;
    printk(" L3[0x%03lx] = %"PRIpte" %016lx%s\n",
           l3_table_offset(addr), l3e_get_intpte(l3e), pfn,
           (l3e_get_flags(l3e) & _PAGE_PSE) ? " (PSE)" : "");
    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
         (l3e_get_flags(l3e) & _PAGE_PSE) )
        return;

    l2t = mfn_to_virt(mfn);
    l2e = l2t[l2_table_offset(addr)];
    mfn = l2e_get_pfn(l2e);
    pfn = mfn_valid(mfn) ? get_gpfn_from_mfn(mfn) : INVALID_M2P_ENTRY;
    printk(" L2[0x%03lx] = %"PRIpte" %016lx %s\n",
           l2_table_offset(addr), l2e_get_intpte(l2e), pfn,
           (l2e_get_flags(l2e) & _PAGE_PSE) ? "(PSE)" : "");
    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
         (l2e_get_flags(l2e) & _PAGE_PSE) )
        return;

    l1t = mfn_to_virt(mfn);
    l1e = l1t[l1_table_offset(addr)];
    mfn = l1e_get_pfn(l1e);
    pfn = mfn_valid(mfn) ? get_gpfn_from_mfn(mfn) : INVALID_M2P_ENTRY;
    printk(" L1[0x%03lx] = %"PRIpte" %016lx\n",
           l1_table_offset(addr), l1e_get_intpte(l1e), pfn);
}
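
/*
 * For reference, a hedged usage sketch (not part of this file) of the kind
 * of caller that dumps a walk: the common x86 fault path typically does
 * this before giving up on a fault taken in hypervisor context. The
 * surrounding code below is illustrative only:
 *
 *     if ( unlikely(!guest_mode(regs)) )
 *     {
 *         show_execution_state(regs);
 *         show_page_walk(addr);
 *         panic(...);
 *     }
 */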

asmlinkage void double_fault(void);
asmlinkage void do_double_fault(struct cpu_user_regs *regs)
{
    unsigned int cpu, tr;

    asm volatile ( "str %0" : "=r" (tr) );
    cpu = ((tr >> 3) - __FIRST_TSS_ENTRY) >> 2;

    watchdog_disable();

    console_force_unlock();

    /* Find information saved during fault and dump it to the console. */
    printk("*** DOUBLE FAULT ***\n");
    print_xen_info();
    printk("CPU: %d\nRIP: %04x:[<%016lx>]",
           cpu, regs->cs, regs->rip);
    print_symbol(" %s", regs->rip);
    printk("\nRFLAGS: %016lx\n", regs->rflags);
    printk("rax: %016lx rbx: %016lx rcx: %016lx\n",
           regs->rax, regs->rbx, regs->rcx);
    printk("rdx: %016lx rsi: %016lx rdi: %016lx\n",
           regs->rdx, regs->rsi, regs->rdi);
    printk("rbp: %016lx rsp: %016lx r8: %016lx\n",
           regs->rbp, regs->rsp, regs->r8);
    printk("r9: %016lx r10: %016lx r11: %016lx\n",
           regs->r9, regs->r10, regs->r11);
    printk("r12: %016lx r13: %016lx r14: %016lx\n",
           regs->r12, regs->r13, regs->r14);
    printk("r15: %016lx cs: %016lx ss: %016lx\n",
           regs->r15, (long)regs->cs, (long)regs->ss);
    show_stack_overflow(cpu, regs->rsp);

    panic("DOUBLE FAULT -- system shutdown\n");
}

void toggle_guest_mode(struct vcpu *v)
{
    if ( is_pv_32bit_vcpu(v) )
        return;
    v->arch.flags ^= TF_kernel_mode;
    asm volatile ( "swapgs" );
    update_cr3(v);
#ifdef USER_MAPPINGS_ARE_GLOBAL
    /* Don't flush user global mappings from the TLB. Don't tick TLB clock. */
    asm volatile ( "mov %0, %%cr3" : : "r" (v->arch.cr3) : "memory" );
#else
    write_ptbase(v);
#endif
}

unsigned long do_iret(void)
{
    struct cpu_user_regs *regs = guest_cpu_user_regs();
    struct iret_context iret_saved;
    struct vcpu *v = current;

    if ( unlikely(copy_from_user(&iret_saved, (void *)regs->rsp,
                                 sizeof(iret_saved))) )
    {
        gdprintk(XENLOG_ERR, "Fault while reading IRET context from "
                 "guest stack\n");
        goto exit_and_crash;
    }

    /* Returning to user mode? */
    if ( (iret_saved.cs & 3) == 3 )
    {
        if ( unlikely(pagetable_is_null(v->arch.guest_table_user)) )
        {
            gdprintk(XENLOG_ERR, "Guest switching to user mode with no "
                     "user page tables\n");
            goto exit_and_crash;
        }
        toggle_guest_mode(v);
    }

    regs->rip = iret_saved.rip;
    regs->cs = iret_saved.cs | 3; /* force guest privilege */
    regs->rflags = (iret_saved.rflags & ~(EF_IOPL|EF_VM)) | EF_IE;
    regs->rsp = iret_saved.rsp;
    regs->ss = iret_saved.ss | 3; /* force guest privilege */

    if ( !(iret_saved.flags & VGCF_in_syscall) )
    {
        regs->entry_vector = 0;
        regs->r11 = iret_saved.r11;
        regs->rcx = iret_saved.rcx;
    }

    /* No longer in NMI context. */
    v->nmi_masked = 0;

    /* Restore upcall mask from supplied EFLAGS.IF. */
    vcpu_info(v, evtchn_upcall_mask) = !(iret_saved.rflags & EF_IE);

    /* Saved %rax gets written back to regs->rax in entry.S. */
    return iret_saved.rax;

 exit_and_crash:
    gdprintk(XENLOG_ERR, "Fatal error\n");
    domain_crash(v->domain);
    return 0;
}

static int write_stack_trampoline(
    char *stack, char *stack_bottom, uint16_t cs_seg)
{
    /* movq %rsp, saversp(%rip) */
    stack[0] = 0x48;
    stack[1] = 0x89;
    stack[2] = 0x25;
    *(u32 *)&stack[3] = (stack_bottom - &stack[7]) - 16;

    /* leaq saversp(%rip), %rsp */
    stack[7] = 0x48;
    stack[8] = 0x8d;
    stack[9] = 0x25;
    *(u32 *)&stack[10] = (stack_bottom - &stack[14]) - 16;

    /* pushq %r11 */
    stack[14] = 0x41;
    stack[15] = 0x53;

    /* pushq $<cs_seg> */
    stack[16] = 0x68;
    *(u32 *)&stack[17] = cs_seg;

    /* movq $syscall_enter,%r11 */
    stack[21] = 0x49;
    stack[22] = 0xbb;
    *(void **)&stack[23] = (void *)syscall_enter;

    /* jmpq *%r11 */
    stack[31] = 0x41;
    stack[32] = 0xff;
    stack[33] = 0xe3;

    return 34;
}
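
/*
 * For reference, the 34-byte trampoline emitted above assembles to the
 * following sequence (a sketch; "saversp" stands for the slot at
 * stack_bottom - 16):
 *
 *     movq  %rsp, saversp(%rip)    # stash the guest stack pointer
 *     leaq  saversp(%rip), %rsp    # switch onto the hypervisor stack
 *     pushq %r11
 *     pushq $<cs_seg>
 *     movq  $syscall_enter, %r11
 *     jmpq  *%r11                  # tail-jump into the common entry path
 */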

void __devinit subarch_percpu_traps_init(void)
{
    char *stack_bottom, *stack;
    int cpu = smp_processor_id();

    if ( cpu == 0 )
    {
        /* Specify dedicated interrupt stacks for NMI, #DF, and #MC. */
        set_intr_gate(TRAP_double_fault, &double_fault);
        idt_table[TRAP_double_fault].a |= IST_DF << 32;
        idt_table[TRAP_nmi].a |= IST_NMI << 32;
        idt_table[TRAP_machine_check].a |= IST_MCE << 32;

        /*
         * The 32-on-64 hypercall entry vector is only accessible from ring 1.
         * Also note that this is a trap gate, not an interrupt gate.
         */
        _set_gate(idt_table+HYPERCALL_VECTOR, 15, 1, &compat_hypercall);

        /* Fast trap for int80 (faster than taking the #GP-fixup path). */
        _set_gate(idt_table+0x80, 15, 3, &int80_direct_trap);
    }

    stack_bottom = (char *)get_stack_bottom();
    stack = (char *)((unsigned long)stack_bottom & ~(STACK_SIZE - 1));

    /* IST_MAX IST pages + 1 syscall page + 1 guard page + primary stack. */
    BUILD_BUG_ON((IST_MAX + 2) * PAGE_SIZE + PRIMARY_STACK_SIZE > STACK_SIZE);

    /* Machine Check handler has its own per-CPU 4kB stack. */
    init_tss[cpu].ist[IST_MCE] = (unsigned long)&stack[IST_MCE * PAGE_SIZE];

    /* Double-fault handler has its own per-CPU 4kB stack. */
    init_tss[cpu].ist[IST_DF] = (unsigned long)&stack[IST_DF * PAGE_SIZE];

    /* NMI handler has its own per-CPU 4kB stack. */
    init_tss[cpu].ist[IST_NMI] = (unsigned long)&stack[IST_NMI * PAGE_SIZE];

    /* Trampoline for SYSCALL entry from long mode. */
    stack = &stack[IST_MAX * PAGE_SIZE]; /* Skip the IST stacks. */
    wrmsrl(MSR_LSTAR, (unsigned long)stack);
    stack += write_stack_trampoline(stack, stack_bottom, FLAT_KERNEL_CS64);

    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
    {
        /* SYSENTER entry. */
        wrmsrl(MSR_IA32_SYSENTER_ESP, (unsigned long)stack_bottom);
        wrmsrl(MSR_IA32_SYSENTER_EIP, (unsigned long)sysenter_entry);
        wrmsr(MSR_IA32_SYSENTER_CS, __HYPERVISOR_CS, 0);
    }

    /* Trampoline for SYSCALL entry from compatibility mode. */
    stack = (char *)L1_CACHE_ALIGN((unsigned long)stack);
    wrmsrl(MSR_CSTAR, (unsigned long)stack);
    stack += write_stack_trampoline(stack, stack_bottom, FLAT_USER_CS32);

    /* Common SYSCALL parameters. */
    wrmsr(MSR_STAR, 0, (FLAT_RING3_CS32<<16) | __HYPERVISOR_CS);
    wrmsr(MSR_SYSCALL_MASK, EF_VM|EF_RF|EF_NT|EF_DF|EF_IE|EF_TF, 0U);
}
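
/*
 * A sketch of the per-CPU stack block laid out above (low to high addresses
 * within the STACK_SIZE-aligned region; the concrete value of IST_MAX is an
 * assumption of the diagram, not stated in this file):
 *
 *     pages 0 .. IST_MAX-1 : dedicated IST stacks (#MC, #DF, NMI)
 *     page  IST_MAX        : SYSCALL/SYSENTER entry trampolines
 *     page  IST_MAX+1      : guard page between trampolines and stack
 *     remaining pages      : primary hypervisor stack, with
 *                            get_stack_bottom() at the top of the block
 */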

void init_int80_direct_trap(struct vcpu *v)
{
    struct trap_info *ti = &v->arch.guest_context.trap_ctxt[0x80];
    struct trap_bounce *tb = &v->arch.int80_bounce;

    tb->flags = TBF_EXCEPTION;
    tb->cs = ti->cs;
    tb->eip = ti->address;

    if ( null_trap_bounce(v, tb) )
        tb->flags = 0;
}

static long register_guest_callback(struct callback_register *reg)
{
    long ret = 0;
    struct vcpu *v = current;

    if ( !is_canonical_address(reg->address) )
        return -EINVAL;

    switch ( reg->type )
    {
    case CALLBACKTYPE_event:
        v->arch.guest_context.event_callback_eip = reg->address;
        break;

    case CALLBACKTYPE_failsafe:
        v->arch.guest_context.failsafe_callback_eip = reg->address;
        if ( reg->flags & CALLBACKF_mask_events )
            set_bit(_VGCF_failsafe_disables_events,
                    &v->arch.guest_context.flags);
        else
            clear_bit(_VGCF_failsafe_disables_events,
                      &v->arch.guest_context.flags);
        break;

    case CALLBACKTYPE_syscall:
        v->arch.guest_context.syscall_callback_eip = reg->address;
        if ( reg->flags & CALLBACKF_mask_events )
            set_bit(_VGCF_syscall_disables_events,
                    &v->arch.guest_context.flags);
        else
            clear_bit(_VGCF_syscall_disables_events,
                      &v->arch.guest_context.flags);
        break;

    case CALLBACKTYPE_syscall32:
        v->arch.syscall32_callback_eip = reg->address;
        v->arch.syscall32_disables_events =
            !!(reg->flags & CALLBACKF_mask_events);
        break;

    case CALLBACKTYPE_sysenter:
        v->arch.sysenter_callback_eip = reg->address;
        v->arch.sysenter_disables_events =
            !!(reg->flags & CALLBACKF_mask_events);
        break;

    case CALLBACKTYPE_nmi:
        ret = register_guest_nmi_callback(reg->address);
        break;

    default:
        ret = -ENOSYS;
        break;
    }

    return ret;
}
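
/*
 * A hedged, guest-side sketch (illustrative only, not part of this file) of
 * registering an event callback through this interface; the guest-side
 * wrapper name HYPERVISOR_callback_op and the handler symbol are assumed:
 *
 *     struct callback_register cb = {
 *         .type    = CALLBACKTYPE_event,
 *         .address = (unsigned long)hypervisor_event_callback,
 *     };
 *     HYPERVISOR_callback_op(CALLBACKOP_register, &cb);
 */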

static long unregister_guest_callback(struct callback_unregister *unreg)
{
    long ret;

    switch ( unreg->type )
    {
    case CALLBACKTYPE_event:
    case CALLBACKTYPE_failsafe:
    case CALLBACKTYPE_syscall:
    case CALLBACKTYPE_syscall32:
    case CALLBACKTYPE_sysenter:
        ret = -EINVAL;
        break;

    case CALLBACKTYPE_nmi:
        ret = unregister_guest_nmi_callback();
        break;

    default:
        ret = -ENOSYS;
        break;
    }

    return ret;
}

long do_callback_op(int cmd, XEN_GUEST_HANDLE(const_void) arg)
{
    long ret;

    switch ( cmd )
    {
    case CALLBACKOP_register:
    {
        struct callback_register reg;

        ret = -EFAULT;
        if ( copy_from_guest(&reg, arg, 1) )
            break;

        ret = register_guest_callback(&reg);
    }
    break;

    case CALLBACKOP_unregister:
    {
        struct callback_unregister unreg;

        ret = -EFAULT;
        if ( copy_from_guest(&unreg, arg, 1) )
            break;

        ret = unregister_guest_callback(&unreg);
    }
    break;

    default:
        ret = -ENOSYS;
        break;
    }

    return ret;
}

long do_set_callbacks(unsigned long event_address,
                      unsigned long failsafe_address,
                      unsigned long syscall_address)
{
    struct callback_register event = {
        .type = CALLBACKTYPE_event,
        .address = event_address,
    };
    struct callback_register failsafe = {
        .type = CALLBACKTYPE_failsafe,
        .address = failsafe_address,
    };
    struct callback_register syscall = {
        .type = CALLBACKTYPE_syscall,
        .address = syscall_address,
    };

    register_guest_callback(&event);
    register_guest_callback(&failsafe);
    register_guest_callback(&syscall);

    return 0;
}

static void hypercall_page_initialise_ring3_kernel(void *hypercall_page)
{
    char *p;
    int i;

    /* Fill in all the transfer points with template machine code. */
    for ( i = 0; i < (PAGE_SIZE / 32); i++ )
    {
        p = (char *)(hypercall_page + (i * 32));
        *(u8 *)(p+ 0) = 0x51;    /* push %rcx */
        *(u16 *)(p+ 1) = 0x5341; /* push %r11 */
        *(u8 *)(p+ 3) = 0xb8;    /* mov $<i>,%eax */
        *(u32 *)(p+ 4) = i;
        *(u16 *)(p+ 8) = 0x050f; /* syscall */
        *(u16 *)(p+10) = 0x5b41; /* pop %r11 */
        *(u8 *)(p+12) = 0x59;    /* pop %rcx */
        *(u8 *)(p+13) = 0xc3;    /* ret */
    }

    /*
     * HYPERVISOR_iret is special because it doesn't return and expects a
     * special stack frame. Guests jump at this transfer point instead of
     * calling it.
     */
    p = (char *)(hypercall_page + (__HYPERVISOR_iret * 32));
    *(u8 *)(p+ 0) = 0x51;    /* push %rcx */
    *(u16 *)(p+ 1) = 0x5341; /* push %r11 */
    *(u8 *)(p+ 3) = 0x50;    /* push %rax */
    *(u8 *)(p+ 4) = 0xb8;    /* mov $__HYPERVISOR_iret,%eax */
    *(u32 *)(p+ 5) = __HYPERVISOR_iret;
    *(u16 *)(p+ 9) = 0x050f; /* syscall */
}
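
/*
 * For reference, each 32-byte stub emitted above corresponds to the
 * following guest-visible code (hypercall number <i> loaded into %eax):
 *
 *     push  %rcx
 *     push  %r11
 *     mov   $<i>, %eax
 *     syscall
 *     pop   %r11
 *     pop   %rcx
 *     ret
 *
 * The HYPERVISOR_iret stub additionally pushes %rax and omits the pops and
 * the ret, since that hypercall does not return to the point of call.
 */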

#include "compat/traps.c"

void hypercall_page_initialise(struct domain *d, void *hypercall_page)
{
    memset(hypercall_page, 0xCC, PAGE_SIZE);
    if ( is_hvm_domain(d) )
        hvm_hypercall_page_initialise(d, hypercall_page);
    else if ( !is_pv_32bit_domain(d) )
        hypercall_page_initialise_ring3_kernel(hypercall_page);
    else
        hypercall_page_initialise_ring1_kernel(hypercall_page);
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */