debuggers.hg: view xen/arch/x86/traps.c @ 17013:99b8ffe25088

x86: adjust reserved bit page fault handling

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>

author   Keir Fraser <keir.fraser@citrix.com>
date     Mon Feb 04 13:57:01 2008 +0000 (2008-02-04)
parents  923f2f736507
children 2c2b442902e2

line source
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <asm/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/desc.h>
55 #include <asm/debugreg.h>
56 #include <asm/smp.h>
57 #include <asm/flushtlb.h>
58 #include <asm/uaccess.h>
59 #include <asm/i387.h>
60 #include <asm/debugger.h>
61 #include <asm/msr.h>
62 #include <asm/shared.h>
63 #include <asm/x86_emulate.h>
64 #include <asm/hvm/vpt.h>
65 #include <public/arch-x86/cpuid.h>
67 /*
68 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
69 * fatal: Xen prints diagnostic message and then hangs.
70 * dom0: The NMI is virtualised to DOM0.
71 * ignore: The NMI error is cleared and ignored.
72 */
73 #ifdef NDEBUG
74 char opt_nmi[10] = "dom0";
75 #else
76 char opt_nmi[10] = "fatal";
77 #endif
78 string_param("nmi", opt_nmi);
80 DEFINE_PER_CPU(u32, ler_msr);
82 /* Master table, used by CPU0. */
83 idt_entry_t idt_table[IDT_ENTRIES];
85 /* Pointer to the IDT of every CPU. */
86 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
88 #define DECLARE_TRAP_HANDLER(_name) \
89 asmlinkage void _name(void); \
90 asmlinkage void do_ ## _name(struct cpu_user_regs *regs)
92 DECLARE_TRAP_HANDLER(divide_error);
93 DECLARE_TRAP_HANDLER(debug);
94 DECLARE_TRAP_HANDLER(nmi);
95 DECLARE_TRAP_HANDLER(int3);
96 DECLARE_TRAP_HANDLER(overflow);
97 DECLARE_TRAP_HANDLER(bounds);
98 DECLARE_TRAP_HANDLER(invalid_op);
99 DECLARE_TRAP_HANDLER(device_not_available);
100 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
101 DECLARE_TRAP_HANDLER(invalid_TSS);
102 DECLARE_TRAP_HANDLER(segment_not_present);
103 DECLARE_TRAP_HANDLER(stack_segment);
104 DECLARE_TRAP_HANDLER(general_protection);
105 DECLARE_TRAP_HANDLER(page_fault);
106 DECLARE_TRAP_HANDLER(coprocessor_error);
107 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
108 DECLARE_TRAP_HANDLER(machine_check);
109 DECLARE_TRAP_HANDLER(alignment_check);
110 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
112 long do_set_debugreg(int reg, unsigned long value);
113 unsigned long do_get_debugreg(int reg);
114 void (*ioemul_handle_quirk)(
115 u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
117 static int debug_stack_lines = 20;
118 integer_param("debug_stack_lines", debug_stack_lines);
120 static int opt_ler;
121 boolean_param("ler", opt_ler);
123 #ifdef CONFIG_X86_32
124 #define stack_words_per_line 8
125 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
126 #else
127 #define stack_words_per_line 4
128 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
129 #endif
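/*
 * Illustrative note (not part of the original file): ESP_BEFORE_EXCEPTION
 * differs between builds because of how the CPU builds the exception frame.
 * On 64-bit, %rsp is always pushed, so the saved regs->rsp value is the
 * pre-exception stack pointer. On 32-bit, a same-privilege fault pushes no
 * %esp/%ss, so the pre-exception stack begins at the address of the
 * regs->esp slot itself. With the defaults above, a 64-bit dump prints
 * 4 words x 8 bytes = 32 bytes per line, for at most
 * debug_stack_lines(20) * 4 = 80 words before reaching the stack boundary.
 */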
131 static void show_guest_stack(struct cpu_user_regs *regs)
132 {
133 int i;
134 struct vcpu *curr = current;
135 unsigned long *stack, addr;
137 if ( is_hvm_vcpu(curr) )
138 return;
140 if ( is_pv_32on64_vcpu(curr) )
141 {
142 compat_show_guest_stack(regs, debug_stack_lines);
143 return;
144 }
146 if ( vm86_mode(regs) )
147 {
148 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
149 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
150 regs->ss, (uint16_t)(regs->esp & 0xffff));
151 }
152 else
153 {
154 stack = (unsigned long *)regs->esp;
155 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
156 }
158 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
159 {
160 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
161 break;
162 if ( get_user(addr, stack) )
163 {
164 if ( i != 0 )
165 printk("\n ");
166 printk("Fault while accessing guest memory.");
167 i = 1;
168 break;
169 }
170 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
171 printk("\n ");
172 printk(" %p", _p(addr));
173 stack++;
174 }
175 if ( i == 0 )
176 printk("Stack empty.");
177 printk("\n");
178 }
180 #if !defined(CONFIG_FRAME_POINTER)
182 static void show_trace(struct cpu_user_regs *regs)
183 {
184 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
186 printk("Xen call trace:\n ");
188 printk("[<%p>]", _p(regs->eip));
189 print_symbol(" %s\n ", regs->eip);
191 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
192 {
193 addr = *stack++;
194 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
195 {
196 printk("[<%p>]", _p(addr));
197 print_symbol(" %s\n ", addr);
198 }
199 }
201 printk("\n");
202 }
204 #else
206 static void show_trace(struct cpu_user_regs *regs)
207 {
208 unsigned long *frame, next, addr, low, high;
210 printk("Xen call trace:\n ");
212 printk("[<%p>]", _p(regs->eip));
213 print_symbol(" %s\n ", regs->eip);
215 /* Bounds for range of valid frame pointer. */
216 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
217 high = (low & ~(STACK_SIZE - 1)) +
218 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
220 /* The initial frame pointer. */
221 next = regs->ebp;
223 for ( ; ; )
224 {
225 /* Valid frame pointer? */
226 if ( (next < low) || (next >= high) )
227 {
228 /*
229 * Exception stack frames have a different layout, denoted by an
230 * inverted frame pointer.
231 */
232 next = ~next;
233 if ( (next < low) || (next >= high) )
234 break;
235 frame = (unsigned long *)next;
236 next = frame[0];
237 addr = frame[(offsetof(struct cpu_user_regs, eip) -
238 offsetof(struct cpu_user_regs, ebp))
239 / BYTES_PER_LONG];
240 }
241 else
242 {
243 /* Ordinary stack frame. */
244 frame = (unsigned long *)next;
245 next = frame[0];
246 addr = frame[1];
247 }
249 printk("[<%p>]", _p(addr));
250 print_symbol(" %s\n ", addr);
252 low = (unsigned long)&frame[2];
253 }
255 printk("\n");
256 }
258 #endif
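/*
 * Illustrative note (not from the original file): the "inverted frame
 * pointer" trick above works because the ones-complement of a valid stack
 * address is never itself a valid stack address. When the value read from
 * frame[0] falls outside [low, high), the walker complements it; if the
 * complemented value is in range, that frame was pushed by the exception
 * entry code, so the return address is fetched from the saved
 * cpu_user_regs (eip relative to ebp) rather than from frame[1].
 */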
260 void show_stack(struct cpu_user_regs *regs)
261 {
262 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
263 int i;
265 if ( guest_mode(regs) )
266 return show_guest_stack(regs);
268 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
270 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
271 {
272 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
273 break;
274 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
275 printk("\n ");
276 addr = *stack++;
277 printk(" %p", _p(addr));
278 }
279 if ( i == 0 )
280 printk("Stack empty.");
281 printk("\n");
283 show_trace(regs);
284 }
286 void show_stack_overflow(unsigned int cpu, unsigned long esp)
287 {
288 #ifdef MEMORY_GUARD
289 unsigned long esp_top, esp_bottom;
290 unsigned long *stack, addr;
292 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
293 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
295 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
296 (void *)esp_top, (void *)esp_bottom, (void *)esp,
297 (void *)init_tss[cpu].esp0);
299 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
300 if ( ((unsigned long)(esp - esp_top) > 512) &&
301 ((unsigned long)(esp_top - esp) > 512) )
302 {
303 printk("No stack overflow detected. Skipping stack trace.\n");
304 return;
305 }
307 if ( esp < esp_top )
308 esp = esp_top;
310 printk("Xen stack overflow (dumping trace %p-%p):\n ",
311 (void *)esp, (void *)esp_bottom);
313 stack = (unsigned long *)esp;
314 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
315 {
316 addr = *stack++;
317 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
318 {
319 printk("%p: [<%p>]", stack, _p(addr));
320 print_symbol(" %s\n ", addr);
321 }
322 }
324 printk("\n");
325 #endif
326 }
328 void show_execution_state(struct cpu_user_regs *regs)
329 {
330 show_registers(regs);
331 show_stack(regs);
332 }
334 char *trapstr(int trapnr)
335 {
336 static char *strings[] = {
337 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
338 "invalid opcode", "device not available", "double fault",
339 "coprocessor segment", "invalid tss", "segment not found",
340 "stack error", "general protection fault", "page fault",
341 "spurious interrupt", "coprocessor error", "alignment check",
342 "machine check", "simd error"
343 };
345 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
346 return "???";
348 return strings[trapnr];
349 }
351 /*
352 * This is called for faults at very unexpected times (e.g., when interrupts
353 * are disabled). In such situations we can't do much that is safe. We try to
354 * print out some tracing and then we just spin.
355 */
356 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
357 {
358 static DEFINE_PER_CPU(char, depth);
360 /*
361 * In some cases, we can end up in a vicious cycle of fatal_trap()s
362 * within fatal_trap()s. We give the problem a couple of iterations to
363 * bottom out, and then we just panic.
364 */
365 if ( ++this_cpu(depth) < 3 )
366 {
367 watchdog_disable();
368 console_start_sync();
370 show_execution_state(regs);
372 if ( trapnr == TRAP_page_fault )
373 {
374 unsigned long cr2 = read_cr2();
375 printk("Faulting linear address: %p\n", _p(cr2));
376 show_page_walk(cr2);
377 }
378 }
380 panic("FATAL TRAP: vector = %d (%s)\n"
381 "[error_code=%04x] %s\n",
382 trapnr, trapstr(trapnr), regs->error_code,
383 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
384 }
386 static void do_guest_trap(
387 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
388 {
389 struct vcpu *v = current;
390 struct trap_bounce *tb;
391 const struct trap_info *ti;
393 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
395 tb = &v->arch.trap_bounce;
396 ti = &v->arch.guest_context.trap_ctxt[trapnr];
398 tb->flags = TBF_EXCEPTION;
399 tb->cs = ti->cs;
400 tb->eip = ti->address;
402 if ( use_error_code )
403 {
404 tb->flags |= TBF_EXCEPTION_ERRCODE;
405 tb->error_code = regs->error_code;
406 }
408 if ( TI_GET_IF(ti) )
409 tb->flags |= TBF_INTERRUPT;
411 if ( unlikely(null_trap_bounce(v, tb)) )
412 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] "
413 "on VCPU %d [ec=%04x]\n",
414 trapstr(trapnr), trapnr, v->vcpu_id, regs->error_code);
415 }
417 static void instruction_done(
418 struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
419 {
420 regs->eip = eip;
421 regs->eflags &= ~X86_EFLAGS_RF;
422 if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
423 {
424 current->arch.guest_context.debugreg[6] |= bpmatch | 0xffff0ff0;
425 if ( regs->eflags & X86_EFLAGS_TF )
426 current->arch.guest_context.debugreg[6] |= 0x4000;
427 do_guest_trap(TRAP_debug, regs, 0);
428 }
429 }
431 static unsigned int check_guest_io_breakpoint(struct vcpu *v,
432 unsigned int port, unsigned int len)
433 {
434 unsigned int width, i, match = 0;
435 unsigned long start;
437 if ( !(v->arch.guest_context.debugreg[5]) ||
438 !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
439 return 0;
441 for ( i = 0; i < 4; i++ )
442 {
443 if ( !(v->arch.guest_context.debugreg[5] &
444 (3 << (i * DR_ENABLE_SIZE))) )
445 continue;
447 start = v->arch.guest_context.debugreg[i];
448 width = 0;
450 switch ( (v->arch.guest_context.debugreg[7] >>
451 (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
452 {
453 case DR_LEN_1: width = 1; break;
454 case DR_LEN_2: width = 2; break;
455 case DR_LEN_4: width = 4; break;
456 case DR_LEN_8: width = 8; break;
457 }
459 if ( (start < (port + len)) && ((start + width) > port) )
460 match |= 1 << i;
461 }
463 return match;
464 }
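/*
 * Worked example (illustrative, not from the original file): suppose the
 * guest arms breakpoint slot 1 as a 2-byte I/O breakpoint at port 0x60,
 * i.e. start = 0x60 and width = 2 (covering ports 0x60-0x61). A 1-byte
 * access to port 0x61 then satisfies
 *     start < port + len   (0x60 < 0x62)   and
 *     start + width > port (0x62 > 0x61),
 * so the function returns a mask with bit 1 set (1 << i), which the caller
 * later folds into the guest's virtual %dr6 via instruction_done().
 */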
466 /*
467 * Called from asm to set up the NMI trapbounce info.
468 * Returns 0 if no callback is set up, else 1.
469 */
470 asmlinkage int set_guest_nmi_trapbounce(void)
471 {
472 struct vcpu *v = current;
473 struct trap_bounce *tb = &v->arch.trap_bounce;
474 do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0);
475 tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
476 return !null_trap_bounce(v, tb);
477 }
479 static inline void do_trap(
480 int trapnr, struct cpu_user_regs *regs, int use_error_code)
481 {
482 unsigned long fixup;
484 DEBUGGER_trap_entry(trapnr, regs);
486 if ( guest_mode(regs) )
487 {
488 do_guest_trap(trapnr, regs, use_error_code);
489 return;
490 }
492 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
493 {
494 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
495 trapnr, _p(regs->eip), _p(fixup));
496 regs->eip = fixup;
497 return;
498 }
500 DEBUGGER_trap_fatal(trapnr, regs);
502 show_execution_state(regs);
503 panic("FATAL TRAP: vector = %d (%s)\n"
504 "[error_code=%04x]\n",
505 trapnr, trapstr(trapnr), regs->error_code);
506 }
508 #define DO_ERROR_NOCODE(trapnr, name) \
509 asmlinkage void do_##name(struct cpu_user_regs *regs) \
510 { \
511 do_trap(trapnr, regs, 0); \
512 }
514 #define DO_ERROR(trapnr, name) \
515 asmlinkage void do_##name(struct cpu_user_regs *regs) \
516 { \
517 do_trap(trapnr, regs, 1); \
518 }
520 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
521 DO_ERROR_NOCODE(TRAP_overflow, overflow)
522 DO_ERROR_NOCODE(TRAP_bounds, bounds)
523 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
524 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
525 DO_ERROR( TRAP_no_segment, segment_not_present)
526 DO_ERROR( TRAP_stack_error, stack_segment)
527 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
528 DO_ERROR( TRAP_alignment_check, alignment_check)
529 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
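/*
 * Expansion example (illustrative): DO_ERROR_NOCODE(TRAP_divide_error,
 * divide_error) above produces, after preprocessing, the ordinary handler
 *
 *     asmlinkage void do_divide_error(struct cpu_user_regs *regs)
 *     {
 *         do_trap(TRAP_divide_error, regs, 0);
 *     }
 *
 * while the DO_ERROR() variant passes 1, so do_trap() forwards the hardware
 * error code to the guest via TBF_EXCEPTION_ERRCODE.
 */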
531 int rdmsr_hypervisor_regs(
532 uint32_t idx, uint32_t *eax, uint32_t *edx)
533 {
534 idx -= 0x40000000;
535 if ( idx > 0 )
536 return 0;
538 switch ( idx )
539 {
540 case 0:
541 {
542 *eax = *edx = 0;
543 break;
544 }
545 default:
546 BUG();
547 }
549 return 1;
550 }
552 int wrmsr_hypervisor_regs(
553 uint32_t idx, uint32_t eax, uint32_t edx)
554 {
555 struct domain *d = current->domain;
557 idx -= 0x40000000;
558 if ( idx > 0 )
559 return 0;
561 switch ( idx )
562 {
563 case 0:
564 {
565 void *hypercall_page;
566 unsigned long mfn;
567 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
568 unsigned int idx = eax & 0xfff;
570 if ( idx > 0 )
571 {
572 gdprintk(XENLOG_WARNING,
573 "Out of range index %u to MSR %08x\n",
574 idx, 0x40000000);
575 return 0;
576 }
578 mfn = gmfn_to_mfn(d, gmfn);
580 if ( !mfn_valid(mfn) ||
581 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
582 {
583 gdprintk(XENLOG_WARNING,
584 "Bad GMFN %lx (MFN %lx) to MSR %08x\n",
585 gmfn, mfn, 0x40000000);
586 return 0;
587 }
589 hypercall_page = map_domain_page(mfn);
590 hypercall_page_initialise(d, hypercall_page);
591 unmap_domain_page(hypercall_page);
593 put_page_and_type(mfn_to_page(mfn));
594 break;
595 }
597 default:
598 BUG();
599 }
601 return 1;
602 }
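/*
 * Worked example (illustrative, not from the original file): the hypercall
 * page is registered by writing MSR 0x40000000 with the guest frame number
 * encoded as gmfn = (edx << 20) | (eax >> 12) and the page index in the low
 * 12 bits of eax. To place hypercall page 0 at guest physical address
 * 0x1234000 (gmfn 0x1234), a guest would execute, in effect:
 *
 *     wrmsr(0x40000000, 0x01234000, 0x00000000);   // msr, eax, edx
 *
 * Any non-zero low-12-bit index is rejected above because this hypervisor
 * advertises exactly one hypercall-transfer page (see CPUID leaf 0x40000002
 * below).
 */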
604 int cpuid_hypervisor_leaves(
605 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
606 {
607 idx -= 0x40000000;
608 if ( idx > 2 )
609 return 0;
611 switch ( idx )
612 {
613 case 0:
614 *eax = 0x40000002; /* Largest leaf */
615 *ebx = XEN_CPUID_SIGNATURE_EBX;
616 *ecx = XEN_CPUID_SIGNATURE_ECX;
617 *edx = XEN_CPUID_SIGNATURE_EDX;
618 break;
620 case 1:
621 *eax = (xen_major_version() << 16) | xen_minor_version();
622 *ebx = 0; /* Reserved */
623 *ecx = 0; /* Reserved */
624 *edx = 0; /* Reserved */
625 break;
627 case 2:
628 *eax = 1; /* Number of hypercall-transfer pages */
629 *ebx = 0x40000000; /* MSR base address */
630 *ecx = 0; /* Features 1 */
631 *edx = 0; /* Features 2 */
632 if ( !is_hvm_vcpu(current) )
633 *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
634 break;
636 default:
637 BUG();
638 }
640 return 1;
641 }
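/*
 * Illustrative guest-side sketch (assumptions: standard Xen CPUID layout,
 * cpuid() is a hypothetical helper, not code from this file): a guest
 * typically probes these leaves as follows.
 *
 *     uint32_t eax, ebx, ecx, edx, base = 0x40000000;
 *     cpuid(base, &eax, &ebx, &ecx, &edx);
 *     // ebx/ecx/edx spell "XenVMMXenVMM"; eax is the largest leaf (0x40000002)
 *     cpuid(base + 1, &eax, &ebx, &ecx, &edx);
 *     // eax = (major << 16) | minor, e.g. 0x00030002 for Xen 3.2
 *     cpuid(base + 2, &eax, &ebx, &ecx, &edx);
 *     // eax = number of hypercall pages, ebx = hypercall MSR (0x40000000)
 */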
643 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
644 {
645 char sig[5], instr[2];
646 uint32_t a, b, c, d;
647 unsigned long eip, rc;
649 a = regs->eax;
650 b = regs->ebx;
651 c = regs->ecx;
652 d = regs->edx;
653 eip = regs->eip;
655 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
656 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
657 {
658 propagate_page_fault(eip + sizeof(sig) - rc, 0);
659 return EXCRET_fault_fixed;
660 }
661 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
662 return 0;
663 eip += sizeof(sig);
665 /* We only emulate CPUID. */
666 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
667 {
668 propagate_page_fault(eip + sizeof(instr) - rc, 0);
669 return EXCRET_fault_fixed;
670 }
671 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
672 return 0;
673 eip += sizeof(instr);
675 asm (
676 "cpuid"
677 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
678 : "0" (a), "1" (b), "2" (c), "3" (d) );
680 if ( regs->eax == 1 )
681 {
682 /* Modify Feature Information. */
683 __clear_bit(X86_FEATURE_VME, &d);
684 __clear_bit(X86_FEATURE_PSE, &d);
685 __clear_bit(X86_FEATURE_PGE, &d);
686 if ( !cpu_has_sep )
687 __clear_bit(X86_FEATURE_SEP, &d);
688 #ifdef __i386__
689 if ( !supervisor_mode_kernel )
690 __clear_bit(X86_FEATURE_SEP, &d);
691 #endif
692 if ( !IS_PRIV(current->domain) )
693 __clear_bit(X86_FEATURE_MTRR, &d);
694 }
695 else if ( regs->eax == 0x80000001 )
696 {
697 /* Modify Feature Information. */
698 #ifdef __i386__
699 __clear_bit(X86_FEATURE_SYSCALL % 32, &d);
700 #endif
701 __clear_bit(X86_FEATURE_RDTSCP % 32, &d);
702 }
703 else
704 {
705 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
706 }
708 regs->eax = a;
709 regs->ebx = b;
710 regs->ecx = c;
711 regs->edx = d;
713 instruction_done(regs, eip, 0);
715 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
717 return EXCRET_fault_fixed;
718 }
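/*
 * Illustrative note (not from the original file): the "forced emulation"
 * sequence recognised above is what a PV guest emits when it wants CPUID to
 * be filtered by Xen rather than executed natively, e.g. in inline asm:
 *
 *     ud2a ; .ascii "xen" ; cpuid
 *
 * The #UD lands here, the 5-byte signature (0f 0b 'x' 'e' 'n') and the
 * following 0f a2 are verified, CPUID is executed on the guest's behalf
 * with the feature bits masked as shown, and eip is advanced past the
 * whole 7-byte sequence by instruction_done().
 */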
720 asmlinkage void do_invalid_op(struct cpu_user_regs *regs)
721 {
722 struct bug_frame bug;
723 struct bug_frame_str bug_str;
724 char *filename, *predicate, *eip = (char *)regs->eip;
725 unsigned long fixup;
726 int id, lineno;
728 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
730 if ( likely(guest_mode(regs)) )
731 {
732 if ( !emulate_forced_invalid_op(regs) )
733 do_guest_trap(TRAP_invalid_op, regs, 0);
734 return;
735 }
737 if ( !is_kernel(eip) ||
738 __copy_from_user(&bug, eip, sizeof(bug)) ||
739 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
740 (bug.ret != 0xc2) )
741 goto die;
742 eip += sizeof(bug);
744 id = bug.id & 3;
746 if ( id == BUGFRAME_dump )
747 {
748 show_execution_state(regs);
749 regs->eip = (unsigned long)eip;
750 return;
751 }
753 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
754 if ( !is_kernel(eip) ||
755 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
756 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
757 goto die;
758 eip += sizeof(bug_str);
760 filename = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
761 lineno = bug.id >> 2;
763 if ( id == BUGFRAME_warn )
764 {
765 printk("Xen WARN at %.50s:%d\n", filename, lineno);
766 show_execution_state(regs);
767 regs->eip = (unsigned long)eip;
768 return;
769 }
771 if ( id == BUGFRAME_bug )
772 {
773 printk("Xen BUG at %.50s:%d\n", filename, lineno);
774 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
775 show_execution_state(regs);
776 panic("Xen BUG at %.50s:%d\n", filename, lineno);
777 }
779 /* ASSERT: decode the predicate string pointer. */
780 ASSERT(id == BUGFRAME_assert);
781 if ( !is_kernel(eip) ||
782 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
783 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
784 goto die;
785 eip += sizeof(bug_str);
787 predicate = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
788 printk("Assertion '%s' failed at %.50s:%d\n",
789 predicate, filename, lineno);
790 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
791 show_execution_state(regs);
792 panic("Assertion '%s' failed at %.50s:%d\n",
793 predicate, filename, lineno);
795 die:
796 if ( (fixup = search_exception_table(regs->eip)) != 0 )
797 {
798 regs->eip = fixup;
799 return;
800 }
801 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
802 show_execution_state(regs);
803 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
804 }
806 asmlinkage void do_int3(struct cpu_user_regs *regs)
807 {
808 DEBUGGER_trap_entry(TRAP_int3, regs);
810 if ( !guest_mode(regs) )
811 {
812 debugger_trap_fatal(TRAP_int3, regs);
813 return;
814 }
816 do_guest_trap(TRAP_int3, regs, 0);
817 }
819 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
820 {
821 extern fastcall void (*machine_check_vector)(
822 struct cpu_user_regs *, long error_code);
823 machine_check_vector(regs, regs->error_code);
824 }
826 static void reserved_bit_page_fault(
827 unsigned long addr, struct cpu_user_regs *regs)
828 {
829 printk("d%d:v%d: reserved bit in page table (ec=%04X)\n",
830 current->domain->domain_id, current->vcpu_id, regs->error_code);
831 show_page_walk(addr);
832 show_execution_state(regs);
833 }
835 void propagate_page_fault(unsigned long addr, u16 error_code)
836 {
837 struct trap_info *ti;
838 struct vcpu *v = current;
839 struct trap_bounce *tb = &v->arch.trap_bounce;
841 v->arch.guest_context.ctrlreg[2] = addr;
842 arch_set_cr2(v, addr);
844 /* Re-set error_code.user flag appropriately for the guest. */
845 error_code &= ~PFEC_user_mode;
846 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
847 error_code |= PFEC_user_mode;
849 trace_pv_page_fault(addr, error_code);
851 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
852 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
853 tb->error_code = error_code;
854 tb->cs = ti->cs;
855 tb->eip = ti->address;
856 if ( TI_GET_IF(ti) )
857 tb->flags |= TBF_INTERRUPT;
858 if ( unlikely(null_trap_bounce(v, tb)) )
859 {
860 printk("d%d:v%d: unhandled page fault (ec=%04X)\n",
861 v->domain->domain_id, v->vcpu_id, error_code);
862 show_page_walk(addr);
863 }
865 if ( unlikely(error_code & PFEC_reserved_bit) )
866 reserved_bit_page_fault(addr, guest_cpu_user_regs());
867 }
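/*
 * Illustrative note (not from the original file): the PFEC_user_mode
 * adjustment above reports the guest's virtual privilege level, not the
 * hardware one. A 64-bit PV guest kernel runs in ring 3, so the CPU raises
 * its #PF with the user-mode bit set; since guest_kernel_mode() is true the
 * bit is cleared here, and the guest kernel sees the supervisor-mode error
 * code it expects.
 */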
869 static int handle_gdt_ldt_mapping_fault(
870 unsigned long offset, struct cpu_user_regs *regs)
871 {
872 struct vcpu *curr = current;
873 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
874 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
875 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
877 /* Should never fault in another vcpu's area. */
878 BUG_ON(vcpu_area != curr->vcpu_id);
880 /* Byte offset within the gdt/ldt sub-area. */
881 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
883 if ( likely(is_ldt_area) )
884 {
885 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
886 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
887 {
888 if ( guest_mode(regs) )
889 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
890 regs->eip, offset);
891 }
892 else
893 {
894 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
895 if ( !guest_mode(regs) )
896 return 0;
897 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
898 propagate_page_fault(
899 curr->arch.guest_context.ldt_base + offset,
900 regs->error_code);
901 }
902 }
903 else
904 {
905 /* GDT fault: handle the fault as #GP(selector). */
906 regs->error_code = (u16)offset & ~7;
907 (void)do_general_protection(regs);
908 }
910 return EXCRET_fault_fixed;
911 }
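/*
 * Worked example (illustrative, symbolic values): the per-vcpu GDT/LDT
 * mapping area gives each vcpu a 2^GDT_LDT_VCPU_VA_SHIFT byte window, with
 * the GDT mapped in the lower half and the LDT in the upper half. A fault at
 *
 *     GDT_LDT_VIRT_START + (3 << GDT_LDT_VCPU_VA_SHIFT)
 *                        + (1 << (GDT_LDT_VCPU_VA_SHIFT - 1)) + 0x40
 *
 * therefore decodes to vcpu_area == 3, is_ldt_area == 1, and a byte offset
 * of 0x40 into that vcpu's LDT mapping, which map_ldt_shadow_page() is then
 * asked to populate (page 0 here, since 0x40 >> PAGE_SHIFT == 0).
 */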
913 #ifdef HYPERVISOR_VIRT_END
914 #define IN_HYPERVISOR_RANGE(va) \
915 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
916 #else
917 #define IN_HYPERVISOR_RANGE(va) \
918 (((va) >= HYPERVISOR_VIRT_START))
919 #endif
921 static int __spurious_page_fault(
922 unsigned long addr, struct cpu_user_regs *regs)
923 {
924 unsigned long mfn, cr3 = read_cr3();
925 #if CONFIG_PAGING_LEVELS >= 4
926 l4_pgentry_t l4e, *l4t;
927 #endif
928 #if CONFIG_PAGING_LEVELS >= 3
929 l3_pgentry_t l3e, *l3t;
930 #endif
931 l2_pgentry_t l2e, *l2t;
932 l1_pgentry_t l1e, *l1t;
933 unsigned int required_flags, disallowed_flags;
935 /*
936 * We do not take spurious page faults in IRQ handlers as we do not
937 * modify page tables in IRQ context. We therefore bail here because
938 * map_domain_page() is not IRQ-safe.
939 */
940 if ( in_irq() )
941 return 0;
943 /* Reserved bit violations are never spurious faults. */
944 if ( regs->error_code & PFEC_reserved_bit )
945 return 0;
947 required_flags = _PAGE_PRESENT;
948 if ( regs->error_code & PFEC_write_access )
949 required_flags |= _PAGE_RW;
950 if ( regs->error_code & PFEC_user_mode )
951 required_flags |= _PAGE_USER;
953 disallowed_flags = 0;
954 if ( regs->error_code & PFEC_insn_fetch )
955 disallowed_flags |= _PAGE_NX;
957 mfn = cr3 >> PAGE_SHIFT;
959 #if CONFIG_PAGING_LEVELS >= 4
960 l4t = map_domain_page(mfn);
961 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
962 mfn = l4e_get_pfn(l4e);
963 unmap_domain_page(l4t);
964 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
965 (l4e_get_flags(l4e) & disallowed_flags) )
966 return 0;
967 #endif
969 #if CONFIG_PAGING_LEVELS >= 3
970 l3t = map_domain_page(mfn);
971 #ifdef CONFIG_X86_PAE
972 l3t += (cr3 & 0xFE0UL) >> 3;
973 #endif
974 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
975 mfn = l3e_get_pfn(l3e);
976 unmap_domain_page(l3t);
977 #ifdef CONFIG_X86_PAE
978 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
979 return 0;
980 #else
981 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
982 (l3e_get_flags(l3e) & disallowed_flags) )
983 return 0;
984 #endif
985 #endif
987 l2t = map_domain_page(mfn);
988 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
989 mfn = l2e_get_pfn(l2e);
990 unmap_domain_page(l2t);
991 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
992 (l2e_get_flags(l2e) & disallowed_flags) )
993 return 0;
994 if ( l2e_get_flags(l2e) & _PAGE_PSE )
995 {
996 l1e = l1e_empty(); /* define before use in debug tracing */
997 goto spurious;
998 }
1000 l1t = map_domain_page(mfn);
1001 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
1002 mfn = l1e_get_pfn(l1e);
1003 unmap_domain_page(l1t);
1004 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
1005 (l1e_get_flags(l1e) & disallowed_flags) )
1006 return 0;
1008 spurious:
1009 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
1010 "at addr %lx, e/c %04x\n",
1011 current->domain->domain_id, current->vcpu_id,
1012 addr, regs->error_code);
1013 #if CONFIG_PAGING_LEVELS >= 4
1014 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
1015 #endif
1016 #if CONFIG_PAGING_LEVELS >= 3
1017 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
1018 #endif
1019 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
1020 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
1021 #ifndef NDEBUG
1022 show_registers(regs);
1023 #endif
1024 return 1;
1025 }
1027 static int spurious_page_fault(
1028 unsigned long addr, struct cpu_user_regs *regs)
1029 {
1030 unsigned long flags;
1031 int is_spurious;
1033 /*
1034 * Disabling interrupts prevents TLB flushing, and hence prevents
1035 * page tables from becoming invalid under our feet during the walk.
1036 */
1037 local_irq_save(flags);
1038 is_spurious = __spurious_page_fault(addr, regs);
1039 local_irq_restore(flags);
1041 return is_spurious;
1042 }
1044 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1045 {
1046 struct vcpu *v = current;
1047 struct domain *d = v->domain;
1049 /* No fixups in interrupt context or when interrupts are disabled. */
1050 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1051 return 0;
1053 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1054 {
1055 if ( paging_mode_external(d) && guest_mode(regs) )
1056 {
1057 int ret = paging_fault(addr, regs);
1058 if ( ret == EXCRET_fault_fixed )
1059 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1060 return ret;
1061 }
1062 if ( !(regs->error_code & PFEC_reserved_bit) &&
1063 (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1064 return handle_gdt_ldt_mapping_fault(
1065 addr - GDT_LDT_VIRT_START, regs);
1066 return 0;
1067 }
1069 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
1070 guest_kernel_mode(v, regs) &&
1071 /* Do not check if access-protection fault since the page may
1072 legitimately be not present in shadow page tables */
1073 ((regs->error_code & (PFEC_write_access|PFEC_reserved_bit)) ==
1074 PFEC_write_access) &&
1075 ptwr_do_page_fault(v, addr, regs) )
1076 return EXCRET_fault_fixed;
1078 if ( paging_mode_enabled(d) )
1079 {
1080 int ret = paging_fault(addr, regs);
1081 if ( ret == EXCRET_fault_fixed )
1082 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1083 return ret;
1084 }
1086 return 0;
1087 }
1089 /*
1090 * #PF error code:
1091 * Bit 0: Protection violation (=1) ; Page not present (=0)
1092 * Bit 1: Write access
1093 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1094 * Bit 3: Reserved bit violation
1095 * Bit 4: Instruction fetch
1096 */
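/*
 * Worked example (illustrative): an error code of 0x000b decodes, per the
 * table above, as bit 0 (entry present), bit 1 (write access) and bit 3
 * (reserved bit set in a paging-structure entry). That is the
 * PFEC_reserved_bit case this changeset adjusts: it is never considered
 * spurious, and reserved_bit_page_fault() logs a page walk for it.
 */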
1097 asmlinkage void do_page_fault(struct cpu_user_regs *regs)
1098 {
1099 unsigned long addr, fixup;
1101 addr = read_cr2();
1103 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1105 perfc_incr(page_faults);
1107 if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1108 return;
1110 if ( unlikely(!guest_mode(regs)) )
1111 {
1112 if ( spurious_page_fault(addr, regs) )
1113 return;
1115 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1116 {
1117 perfc_incr(copy_user_faults);
1118 if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1119 reserved_bit_page_fault(addr, regs);
1120 regs->eip = fixup;
1121 return;
1122 }
1124 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1126 show_execution_state(regs);
1127 show_page_walk(addr);
1128 panic("FATAL PAGE FAULT\n"
1129 "[error_code=%04x]\n"
1130 "Faulting linear address: %p\n",
1131 regs->error_code, _p(addr));
1132 }
1134 propagate_page_fault(addr, regs->error_code);
1135 }
1137 /*
1138 * Early #PF handler to print CR2, error code, and stack.
1140 * We also deal with spurious faults here, even though they should never happen
1141 * during early boot (an issue was seen once, but was most likely a hardware
1142 * problem).
1143 */
1144 asmlinkage void do_early_page_fault(struct cpu_user_regs *regs)
1145 {
1146 static int stuck;
1147 static unsigned long prev_eip, prev_cr2;
1148 unsigned long cr2 = read_cr2();
1150 BUG_ON(smp_processor_id() != 0);
1152 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1153 {
1154 prev_eip = regs->eip;
1155 prev_cr2 = cr2;
1156 stuck = 0;
1157 return;
1158 }
1160 if ( stuck++ == 1000 )
1161 {
1162 unsigned long *stk = (unsigned long *)regs;
1163 printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1164 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1165 printk("Stack dump: ");
1166 while ( ((long)stk & ((PAGE_SIZE - 1) & ~(BYTES_PER_LONG - 1))) != 0 )
1167 printk("%p ", _p(*stk++));
1168 for ( ; ; ) ;
1169 }
1170 }
1172 long do_fpu_taskswitch(int set)
1173 {
1174 struct vcpu *v = current;
1176 if ( set )
1177 {
1178 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1179 stts();
1180 }
1181 else
1182 {
1183 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1184 if ( v->fpu_dirtied )
1185 clts();
1186 }
1188 return 0;
1189 }
1191 static int read_descriptor(unsigned int sel,
1192 const struct vcpu *v,
1193 const struct cpu_user_regs * regs,
1194 unsigned long *base,
1195 unsigned long *limit,
1196 unsigned int *ar,
1197 unsigned int vm86attr)
1199 struct desc_struct desc;
1201 if ( !vm86_mode(regs) )
1203 if ( sel < 4)
1204 desc.b = desc.a = 0;
1205 else if ( __get_user(desc,
1206 (const struct desc_struct *)(!(sel & 4)
1207 ? GDT_VIRT_START(v)
1208 : LDT_VIRT_START(v))
1209 + (sel >> 3)) )
1210 return 0;
1211 if ( !(vm86attr & _SEGMENT_CODE) )
1212 desc.b &= ~_SEGMENT_L;
1214 else
1216 desc.a = (sel << 20) | 0xffff;
1217 desc.b = vm86attr | (sel >> 12);
1220 *ar = desc.b & 0x00f0ff00;
1221 if ( !(desc.b & _SEGMENT_L) )
1223 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1224 (desc.b & 0xff000000));
1225 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1226 if ( desc.b & _SEGMENT_G )
1227 *limit = ((*limit + 1) << 12) - 1;
1228 #ifndef NDEBUG
1229 if ( !vm86_mode(regs) && (sel > 3) )
1231 unsigned int a, l;
1232 unsigned char valid;
1234 asm volatile (
1235 "larl %2,%0 ; setz %1"
1236 : "=r" (a), "=rm" (valid) : "rm" (sel));
1237 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1238 asm volatile (
1239 "lsll %2,%0 ; setz %1"
1240 : "=r" (l), "=rm" (valid) : "rm" (sel));
1241 BUG_ON(valid && (l != *limit));
1243 #endif
1245 else
1247 *base = 0UL;
1248 *limit = ~0UL;
1251 return 1;
1254 #ifdef __x86_64__
1255 static int read_gate_descriptor(unsigned int gate_sel,
1256 const struct vcpu *v,
1257 unsigned int *sel,
1258 unsigned long *off,
1259 unsigned int *ar)
1261 struct desc_struct desc;
1262 const struct desc_struct *pdesc;
1265 pdesc = (const struct desc_struct *)(!(gate_sel & 4) ?
1266 GDT_VIRT_START(v) :
1267 LDT_VIRT_START(v))
1268 + (gate_sel >> 3);
1269 if ( gate_sel < 4 ||
1270 (gate_sel >= FIRST_RESERVED_GDT_BYTE && !(gate_sel & 4)) ||
1271 __get_user(desc, pdesc) )
1272 return 0;
1274 *sel = (desc.a >> 16) & 0x0000fffc;
1275 *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
1276 *ar = desc.b & 0x0000ffff;
1277 /*
1278 * check_descriptor() clears the DPL field and stores the
1279 * guest requested DPL in the selector's RPL field.
1280 */
1281 ASSERT(!(*ar & _SEGMENT_DPL));
1282 *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
1284 if ( !is_pv_32bit_vcpu(v) )
1286 if ( (*ar & 0x1f00) != 0x0c00 ||
1287 (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
1288 __get_user(desc, pdesc + 1) ||
1289 (desc.b & 0x1f00) )
1290 return 0;
1292 *off |= (unsigned long)desc.a << 32;
1293 return 1;
1296 switch ( *ar & 0x1f00 )
1298 case 0x0400:
1299 *off &= 0xffff;
1300 break;
1301 case 0x0c00:
1302 break;
1303 default:
1304 return 0;
1307 return 1;
1309 #endif
1311 /* Has the guest requested sufficient permission for this I/O access? */
1312 static inline int guest_io_okay(
1313 unsigned int port, unsigned int bytes,
1314 struct vcpu *v, struct cpu_user_regs *regs)
1315 {
1316 #if defined(__x86_64__)
1317 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1318 int user_mode = !(v->arch.flags & TF_kernel_mode);
1319 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1320 #elif defined(__i386__)
1321 #define TOGGLE_MODE() ((void)0)
1322 #endif
1324 if ( !vm86_mode(regs) &&
1325 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1326 return 1;
1328 if ( v->arch.iobmp_limit > (port + bytes) )
1329 {
1330 union { uint8_t bytes[2]; uint16_t mask; } x;
1332 /*
1333 * Grab permission bytes from guest space. Inaccessible bytes are
1334 * read as 0xff (no access allowed).
1335 */
1336 TOGGLE_MODE();
1337 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1338 port>>3, 2) )
1339 {
1340 default: x.bytes[0] = ~0;
1341 case 1: x.bytes[1] = ~0;
1342 case 0: break;
1343 }
1344 TOGGLE_MODE();
1346 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1347 return 1;
1348 }
1350 return 0;
1351 }
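/*
 * Worked example (illustrative, not from the original file): the bitmap
 * test above follows the TSS I/O-bitmap convention, a clear bit meaning
 * "access permitted". For a 2-byte access to port 0x3f9:
 *
 *     byte offset = 0x3f9 >> 3 = 0x7f
 *     bit mask    = ((1 << 2) - 1) << (0x3f9 & 7) = 0x3 << 1 = 0x06
 *
 * Two bytes are always fetched so the mask may straddle a byte boundary;
 * bytes that cannot be copied from the guest are forced to 0xff, which
 * denies the access.
 */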
1353 /* Has the administrator granted sufficient permission for this I/O access? */
1354 static inline int admin_io_okay(
1355 unsigned int port, unsigned int bytes,
1356 struct vcpu *v, struct cpu_user_regs *regs)
1357 {
1358 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1359 }
1361 #define guest_inb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1362 #define guest_inw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1363 #define guest_inl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1364 #define guest_outb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1365 #define guest_outw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1366 #define guest_outl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1368 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1369 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1370 __attribute__((__regparm__(1)));
1371 unsigned long guest_to_host_gpr_switch(unsigned long)
1372 __attribute__((__regparm__(1)));
1374 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1376 /* Instruction fetch with error handling. */
1377 #define insn_fetch(type, base, eip, limit) \
1378 ({ unsigned long _rc, _ptr = (base) + (eip); \
1379 type _x; \
1380 if ( ad_default < 8 ) \
1381 _ptr = (unsigned int)_ptr; \
1382 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1383 goto fail; \
1384 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1385 { \
1386 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1387 goto skip; \
1388 } \
1389 (eip) += sizeof(_x); _x; })
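/*
 * Usage sketch (illustrative): callers use the macro as an expression, e.g.
 *
 *     opcode = insn_fetch(u8, code_base, eip, code_limit);
 *
 * On a segment-limit violation it branches to the caller's "fail" label; if
 * the bytes cannot be copied from guest memory it injects the page fault via
 * propagate_page_fault() and branches to "skip", so every caller must
 * provide both labels (emulate_privileged_op() and emulate_gate_op() do).
 */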
1391 #if defined(CONFIG_X86_32)
1392 # define read_sreg(regs, sr) ((regs)->sr)
1393 #elif defined(CONFIG_X86_64)
1394 # define read_sreg(regs, sr) read_segment_register(sr)
1395 #endif
1397 static int emulate_privileged_op(struct cpu_user_regs *regs)
1399 struct vcpu *v = current;
1400 unsigned long *reg, eip = regs->eip, res;
1401 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1402 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1403 unsigned int port, i, data_sel, ar, data, rc, bpmatch = 0;
1404 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1405 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1406 ? regs->reg \
1407 : ad_bytes == 4 \
1408 ? (u32)regs->reg \
1409 : (u16)regs->reg)
1410 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1411 ? regs->reg = (val) \
1412 : ad_bytes == 4 \
1413 ? (*(u32 *)&regs->reg = (val)) \
1414 : (*(u16 *)&regs->reg = (val)))
1415 unsigned long code_base, code_limit;
1416 char io_emul_stub[32];
1417 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1418 u32 l, h, eax, edx;
1420 if ( !read_descriptor(regs->cs, v, regs,
1421 &code_base, &code_limit, &ar,
1422 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1423 goto fail;
1424 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1425 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1426 if ( !(ar & _SEGMENT_S) ||
1427 !(ar & _SEGMENT_P) ||
1428 !(ar & _SEGMENT_CODE) )
1429 goto fail;
1431 /* emulating only opcodes not allowing SS to be default */
1432 data_sel = read_sreg(regs, ds);
1434 /* Legacy prefixes. */
1435 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1437 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1439 case 0x66: /* operand-size override */
1440 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1441 continue;
1442 case 0x67: /* address-size override */
1443 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1444 continue;
1445 case 0x2e: /* CS override */
1446 data_sel = regs->cs;
1447 continue;
1448 case 0x3e: /* DS override */
1449 data_sel = read_sreg(regs, ds);
1450 continue;
1451 case 0x26: /* ES override */
1452 data_sel = read_sreg(regs, es);
1453 continue;
1454 case 0x64: /* FS override */
1455 data_sel = read_sreg(regs, fs);
1456 lm_ovr = lm_seg_fs;
1457 continue;
1458 case 0x65: /* GS override */
1459 data_sel = read_sreg(regs, gs);
1460 lm_ovr = lm_seg_gs;
1461 continue;
1462 case 0x36: /* SS override */
1463 data_sel = regs->ss;
1464 continue;
1465 case 0xf0: /* LOCK */
1466 lock = 1;
1467 continue;
1468 case 0xf2: /* REPNE/REPNZ */
1469 case 0xf3: /* REP/REPE/REPZ */
1470 rep_prefix = 1;
1471 continue;
1472 default:
1473 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1475 rex = opcode;
1476 continue;
1478 break;
1480 break;
1483 /* REX prefix. */
1484 if ( rex & 8 ) /* REX.W */
1485 op_bytes = 4; /* emulating only opcodes not supporting 64-bit operands */
1486 modrm_reg = (rex & 4) << 1; /* REX.R */
1487 /* REX.X does not need to be decoded. */
1488 modrm_rm = (rex & 1) << 3; /* REX.B */
1490 if ( opcode == 0x0f )
1491 goto twobyte_opcode;
1493 if ( lock )
1494 goto fail;
1496 /* Input/Output String instructions. */
1497 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1499 unsigned long data_base, data_limit;
1501 if ( rep_prefix && (rd_ad(ecx) == 0) )
1502 goto done;
1504 if ( !(opcode & 2) )
1506 data_sel = read_sreg(regs, es);
1507 lm_ovr = lm_seg_none;
1510 if ( !(ar & _SEGMENT_L) )
1512 if ( !read_descriptor(data_sel, v, regs,
1513 &data_base, &data_limit, &ar,
1514 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1515 goto fail;
1516 if ( !(ar & _SEGMENT_S) ||
1517 !(ar & _SEGMENT_P) ||
1518 (opcode & 2 ?
1519 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1520 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1521 goto fail;
1523 #ifdef CONFIG_X86_64
1524 else
1526 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1528 switch ( lm_ovr )
1530 case lm_seg_none:
1531 data_base = 0UL;
1532 break;
1533 case lm_seg_fs:
1534 data_base = v->arch.guest_context.fs_base;
1535 break;
1536 case lm_seg_gs:
1537 if ( guest_kernel_mode(v, regs) )
1538 data_base = v->arch.guest_context.gs_base_kernel;
1539 else
1540 data_base = v->arch.guest_context.gs_base_user;
1541 break;
1544 else
1545 read_descriptor(data_sel, v, regs,
1546 &data_base, &data_limit, &ar,
1547 0);
1548 data_limit = ~0UL;
1549 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1551 #endif
1553 port = (u16)regs->edx;
1555 continue_io_string:
1556 switch ( opcode )
1558 case 0x6c: /* INSB */
1559 op_bytes = 1;
1560 case 0x6d: /* INSW/INSL */
1561 if ( data_limit < op_bytes - 1 ||
1562 rd_ad(edi) > data_limit - (op_bytes - 1) ||
1563 !guest_io_okay(port, op_bytes, v, regs) )
1564 goto fail;
1565 switch ( op_bytes )
1567 case 1:
1568 /* emulate PIT counter 2 */
1569 data = (u8)(guest_inb_okay(port, v, regs) ? inb(port) :
1570 ((port == 0x42 || port == 0x43 || port == 0x61) ?
1571 pv_pit_handler(port, 0, 0) : ~0));
1572 break;
1573 case 2:
1574 data = (u16)(guest_inw_okay(port, v, regs) ? inw(port) : ~0);
1575 break;
1576 case 4:
1577 data = (u32)(guest_inl_okay(port, v, regs) ? inl(port) : ~0);
1578 break;
1580 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 )
1582 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1583 PFEC_write_access);
1584 return EXCRET_fault_fixed;
1586 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1587 break;
1589 case 0x6e: /* OUTSB */
1590 op_bytes = 1;
1591 case 0x6f: /* OUTSW/OUTSL */
1592 if ( data_limit < op_bytes - 1 ||
1593 rd_ad(esi) > data_limit - (op_bytes - 1) ||
1594 !guest_io_okay(port, op_bytes, v, regs) )
1595 goto fail;
1596 rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes);
1597 if ( rc != 0 )
1599 propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0);
1600 return EXCRET_fault_fixed;
1602 switch ( op_bytes )
1604 case 1:
1605 if ( guest_outb_okay(port, v, regs) )
1607 outb((u8)data, port);
1608 if ( pv_post_outb_hook )
1609 pv_post_outb_hook(port, data);
1611 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1612 pv_pit_handler(port, data, 1);
1613 break;
1614 case 2:
1615 if ( guest_outw_okay(port, v, regs) )
1616 outw((u16)data, port);
1617 break;
1618 case 4:
1619 if ( guest_outl_okay(port, v, regs) )
1620 outl((u32)data, port);
1621 break;
1623 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1624 break;
1627 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1629 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1631 if ( !bpmatch && !hypercall_preempt_check() )
1632 goto continue_io_string;
1633 eip = regs->eip;
1636 goto done;
1639 /*
1640 * Very likely to be an I/O instruction (IN/OUT).
1641 * Build an on-stack stub to execute the instruction with full guest
1642 * GPR context. This is needed for some systems which (ab)use IN/OUT
1643 * to communicate with BIOS code in system-management mode.
1644 */
1645 #ifdef __x86_64__
1646 /* movq $host_to_guest_gpr_switch,%rcx */
1647 io_emul_stub[0] = 0x48;
1648 io_emul_stub[1] = 0xb9;
1649 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1650 /* callq *%rcx */
1651 io_emul_stub[10] = 0xff;
1652 io_emul_stub[11] = 0xd1;
1653 #else
1654 /* call host_to_guest_gpr_switch */
1655 io_emul_stub[0] = 0xe8;
1656 *(s32 *)&io_emul_stub[1] =
1657 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1658 /* 7 x nop */
1659 memset(&io_emul_stub[5], 0x90, 7);
1660 #endif
1661 /* data16 or nop */
1662 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1663 /* <io-access opcode> */
1664 io_emul_stub[13] = opcode;
1665 /* imm8 or nop */
1666 io_emul_stub[14] = 0x90;
1667 /* ret (jumps to guest_to_host_gpr_switch) */
1668 io_emul_stub[15] = 0xc3;
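/*
 * Worked example (illustrative): on x86-64 the stub assembled above is, in
 * bytes,
 *
 *     48 b9 <8-byte addr>   movabs $host_to_guest_gpr_switch, %rcx
 *     ff d1                 call   *%rcx
 *     66 or 90              data16 prefix, or nop when op_bytes != 2
 *     <opcode>              the IN/OUT opcode being emulated
 *     90                    imm8 port (patched in below) or nop
 *     c3                    ret, returning through guest_to_host_gpr_switch
 *
 * So emulating "in $0x71, %al" ends up executing 90 e4 71 c3 after the GPR
 * switch: a nop, the 1-byte IN with its port immediate, and the return.
 */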
1670 /* Handy function-typed pointer to the stub. */
1671 io_emul = (void *)io_emul_stub;
1673 if ( ioemul_handle_quirk )
1674 ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
1676 /* I/O Port and Interrupt Flag instructions. */
1677 switch ( opcode )
1679 case 0xe4: /* IN imm8,%al */
1680 op_bytes = 1;
1681 case 0xe5: /* IN imm8,%eax */
1682 port = insn_fetch(u8, code_base, eip, code_limit);
1683 io_emul_stub[14] = port; /* imm8 */
1684 exec_in:
1685 if ( !guest_io_okay(port, op_bytes, v, regs) )
1686 goto fail;
1687 switch ( op_bytes )
1689 case 1:
1690 if ( guest_inb_okay(port, v, regs) )
1691 io_emul(regs);
1692 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1694 regs->eax &= ~0xffUL;
1695 regs->eax |= pv_pit_handler(port, 0, 0);
1697 else
1698 regs->eax |= (u8)~0;
1699 break;
1700 case 2:
1701 if ( guest_inw_okay(port, v, regs) )
1702 io_emul(regs);
1703 else
1704 regs->eax |= (u16)~0;
1705 break;
1706 case 4:
1707 if ( guest_inl_okay(port, v, regs) )
1708 io_emul(regs);
1709 else
1710 regs->eax = (u32)~0;
1711 break;
1713 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1714 goto done;
1716 case 0xec: /* IN %dx,%al */
1717 op_bytes = 1;
1718 case 0xed: /* IN %dx,%eax */
1719 port = (u16)regs->edx;
1720 goto exec_in;
1722 case 0xe6: /* OUT %al,imm8 */
1723 op_bytes = 1;
1724 case 0xe7: /* OUT %eax,imm8 */
1725 port = insn_fetch(u8, code_base, eip, code_limit);
1726 io_emul_stub[14] = port; /* imm8 */
1727 exec_out:
1728 if ( !guest_io_okay(port, op_bytes, v, regs) )
1729 goto fail;
1730 switch ( op_bytes )
1732 case 1:
1733 if ( guest_outb_okay(port, v, regs) )
1735 io_emul(regs);
1736 if ( pv_post_outb_hook )
1737 pv_post_outb_hook(port, regs->eax);
1739 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1740 pv_pit_handler(port, regs->eax, 1);
1741 break;
1742 case 2:
1743 if ( guest_outw_okay(port, v, regs) )
1744 io_emul(regs);
1745 break;
1746 case 4:
1747 if ( guest_outl_okay(port, v, regs) )
1748 io_emul(regs);
1749 break;
1751 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1752 goto done;
1754 case 0xee: /* OUT %al,%dx */
1755 op_bytes = 1;
1756 case 0xef: /* OUT %eax,%dx */
1757 port = (u16)regs->edx;
1758 goto exec_out;
1760 case 0xfa: /* CLI */
1761 case 0xfb: /* STI */
1762 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1763 goto fail;
1764 /*
1765 * This is just too dangerous to allow, in my opinion. Consider if the
1766 * caller then tries to reenable interrupts using POPF: we can't trap
1767 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1768 * do for us. :-)
1769 */
1770 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1771 goto done;
1774 /* No decode of this single-byte opcode. */
1775 goto fail;
1777 twobyte_opcode:
1778 /* Two-byte opcodes only emulated from guest kernel. */
1779 if ( !guest_kernel_mode(v, regs) )
1780 goto fail;
1782 /* Privileged (ring 0) instructions. */
1783 opcode = insn_fetch(u8, code_base, eip, code_limit);
1784 if ( lock && (opcode & ~3) != 0x20 )
1785 goto fail;
1786 switch ( opcode )
1788 case 0x06: /* CLTS */
1789 (void)do_fpu_taskswitch(0);
1790 break;
1792 case 0x09: /* WBINVD */
1793 /* Ignore the instruction if unprivileged. */
1794 if ( !cache_flush_permitted(v->domain) )
1795 /* Non-physdev domain attempted WBINVD; ignore for now since
1796 newer linux uses this in some start-of-day timing loops */
1798 else
1799 wbinvd();
1800 break;
1802 case 0x20: /* MOV CR?,<reg> */
1803 opcode = insn_fetch(u8, code_base, eip, code_limit);
1804 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1805 modrm_rm |= (opcode >> 0) & 7;
1806 reg = decode_register(modrm_rm, regs, 0);
1807 switch ( modrm_reg )
1809 case 0: /* Read CR0 */
1810 *reg = (read_cr0() & ~X86_CR0_TS) |
1811 v->arch.guest_context.ctrlreg[0];
1812 break;
1814 case 2: /* Read CR2 */
1815 *reg = v->arch.guest_context.ctrlreg[2];
1816 break;
1818 case 3: /* Read CR3 */
1819 if ( !is_pv_32on64_vcpu(v) )
1820 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1821 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1822 #ifdef CONFIG_COMPAT
1823 else
1824 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
1825 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
1826 #endif
1827 break;
1829 case 4: /* Read CR4 */
1830 /*
1831 * Guests can read CR4 to see what features Xen has enabled. We
1832 * therefore lie about PGE & PSE as they are unavailable to guests.
1833 */
1834 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1835 break;
1837 default:
1838 goto fail;
1840 break;
1842 case 0x21: /* MOV DR?,<reg> */
1843 opcode = insn_fetch(u8, code_base, eip, code_limit);
1844 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1845 modrm_rm |= (opcode >> 0) & 7;
1846 reg = decode_register(modrm_rm, regs, 0);
1847 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1848 goto fail;
1849 *reg = res;
1850 break;
1852 case 0x22: /* MOV <reg>,CR? */
1853 opcode = insn_fetch(u8, code_base, eip, code_limit);
1854 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1855 modrm_rm |= (opcode >> 0) & 7;
1856 reg = decode_register(modrm_rm, regs, 0);
1857 switch ( modrm_reg )
1859 case 0: /* Write CR0 */
1860 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1862 gdprintk(XENLOG_WARNING,
1863 "Attempt to change unmodifiable CR0 flags.\n");
1864 goto fail;
1866 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1867 break;
1869 case 2: /* Write CR2 */
1870 v->arch.guest_context.ctrlreg[2] = *reg;
1871 arch_set_cr2(v, *reg);
1872 break;
1874 case 3: /* Write CR3 */
1875 LOCK_BIGLOCK(v->domain);
1876 if ( !is_pv_32on64_vcpu(v) )
1877 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1878 #ifdef CONFIG_COMPAT
1879 else
1880 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
1881 #endif
1882 UNLOCK_BIGLOCK(v->domain);
1883 if ( rc == 0 ) /* not okay */
1884 goto fail;
1885 break;
1887 case 4: /* Write CR4 */
1888 v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(*reg);
1889 write_cr4(pv_guest_cr4_to_real_cr4(
1890 v->arch.guest_context.ctrlreg[4]));
1891 break;
1893 default:
1894 goto fail;
1896 break;
1898 case 0x23: /* MOV <reg>,DR? */
1899 opcode = insn_fetch(u8, code_base, eip, code_limit);
1900 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1901 modrm_rm |= (opcode >> 0) & 7;
1902 reg = decode_register(modrm_rm, regs, 0);
1903 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1904 goto fail;
1905 break;
1907 case 0x30: /* WRMSR */
1908 eax = regs->eax;
1909 edx = regs->edx;
1910 res = ((u64)edx << 32) | eax;
1911 switch ( (u32)regs->ecx )
1913 #ifdef CONFIG_X86_64
1914 case MSR_FS_BASE:
1915 if ( is_pv_32on64_vcpu(v) )
1916 goto fail;
1917 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
1918 goto fail;
1919 v->arch.guest_context.fs_base = res;
1920 break;
1921 case MSR_GS_BASE:
1922 if ( is_pv_32on64_vcpu(v) )
1923 goto fail;
1924 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
1925 goto fail;
1926 v->arch.guest_context.gs_base_kernel = res;
1927 break;
1928 case MSR_SHADOW_GS_BASE:
1929 if ( is_pv_32on64_vcpu(v) )
1930 goto fail;
1931 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
1932 goto fail;
1933 v->arch.guest_context.gs_base_user = res;
1934 break;
1935 #endif
1936 case MSR_K7_FID_VID_STATUS:
1937 case MSR_K7_FID_VID_CTL:
1938 case MSR_K8_PSTATE_LIMIT:
1939 case MSR_K8_PSTATE_CTRL:
1940 case MSR_K8_PSTATE_STATUS:
1941 case MSR_K8_PSTATE0:
1942 case MSR_K8_PSTATE1:
1943 case MSR_K8_PSTATE2:
1944 case MSR_K8_PSTATE3:
1945 case MSR_K8_PSTATE4:
1946 case MSR_K8_PSTATE5:
1947 case MSR_K8_PSTATE6:
1948 case MSR_K8_PSTATE7:
1949 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
1950 goto fail;
1951 if ( cpufreq_controller != FREQCTL_dom0_kernel )
1952 break;
1953 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
1954 goto fail;
1955 break;
1956 case MSR_IA32_PERF_CTL:
1957 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
1958 goto fail;
1959 if ( cpufreq_controller != FREQCTL_dom0_kernel )
1960 break;
1961 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
1962 goto fail;
1963 break;
1964 default:
1965 if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
1966 break;
1967 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1968 (eax != l) || (edx != h) )
1969 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
1970 "%08x:%08x to %08x:%08x.\n",
1971 _p(regs->ecx), h, l, edx, eax);
1972 break;
1974 break;
1976 case 0x31: /* RDTSC */
1977 rdtsc(regs->eax, regs->edx);
1978 break;
1980 case 0x32: /* RDMSR */
1981 switch ( (u32)regs->ecx )
1983 #ifdef CONFIG_X86_64
1984 case MSR_FS_BASE:
1985 if ( is_pv_32on64_vcpu(v) )
1986 goto fail;
1987 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1988 regs->edx = v->arch.guest_context.fs_base >> 32;
1989 break;
1990 case MSR_GS_BASE:
1991 if ( is_pv_32on64_vcpu(v) )
1992 goto fail;
1993 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1994 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1995 break;
1996 case MSR_SHADOW_GS_BASE:
1997 if ( is_pv_32on64_vcpu(v) )
1998 goto fail;
1999 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
2000 regs->edx = v->arch.guest_context.gs_base_user >> 32;
2001 break;
2002 #endif
2003 case MSR_K7_FID_VID_CTL:
2004 case MSR_K7_FID_VID_STATUS:
2005 case MSR_K8_PSTATE_LIMIT:
2006 case MSR_K8_PSTATE_CTRL:
2007 case MSR_K8_PSTATE_STATUS:
2008 case MSR_K8_PSTATE0:
2009 case MSR_K8_PSTATE1:
2010 case MSR_K8_PSTATE2:
2011 case MSR_K8_PSTATE3:
2012 case MSR_K8_PSTATE4:
2013 case MSR_K8_PSTATE5:
2014 case MSR_K8_PSTATE6:
2015 case MSR_K8_PSTATE7:
2016 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2017 goto fail;
2018 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2020 regs->eax = regs->edx = 0;
2021 break;
2023 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) != 0 )
2024 goto fail;
2025 break;
2026 case MSR_EFER:
2027 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2028 goto fail;
2029 break;
2030 default:
2031 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
2033 regs->eax = l;
2034 regs->edx = h;
2035 break;
2037 /* Everyone can read the MSR space. */
2038 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
2039 _p(regs->ecx));*/
2040 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2041 goto fail;
2042 break;
2044 break;
2046 default:
2047 goto fail;
2050 #undef wr_ad
2051 #undef rd_ad
2053 done:
2054 instruction_done(regs, eip, bpmatch);
2055 skip:
2056 return EXCRET_fault_fixed;
2058 fail:
2059 return 0;
2062 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
2063 unsigned int esp, unsigned int decr)
2065 return (((esp - decr) < (esp - 1)) &&
2066 (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
2069 static void emulate_gate_op(struct cpu_user_regs *regs)
2071 #ifdef __x86_64__
2072 struct vcpu *v = current;
2073 unsigned int sel, ar, dpl, nparm, opnd_sel;
2074 unsigned int op_default, op_bytes, ad_default, ad_bytes;
2075 unsigned long off, eip, opnd_off, base, limit;
2076 int jump;
2078 /* Check whether this fault is due to the use of a call gate. */
2079 if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
2080 ((ar >> 13) & 3) < (regs->cs & 3) ||
2081 (ar & _SEGMENT_TYPE) != 0xc00 )
2083 do_guest_trap(TRAP_gp_fault, regs, 1);
2084 return;
2086 if ( !(ar & _SEGMENT_P) )
2088 do_guest_trap(TRAP_no_segment, regs, 1);
2089 return;
2091 dpl = (ar >> 13) & 3;
2092 nparm = ar & 0x1f;
2094 /*
2095 * Decode instruction (and perhaps operand) to determine RPL,
2096 * whether this is a jump or a call, and the call return offset.
2097 */
2098 if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
2099 !(ar & _SEGMENT_S) ||
2100 !(ar & _SEGMENT_P) ||
2101 !(ar & _SEGMENT_CODE) )
2103 do_guest_trap(TRAP_gp_fault, regs, 1);
2104 return;
2107 op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
2108 ad_default = ad_bytes = op_default;
2109 opnd_sel = opnd_off = 0;
2110 jump = -1;
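/*
 * 'jump' records what the decode loop below finds: it stays -1 if no far
 * transfer instruction is recognised (we then fail), becomes 0 for a far
 * CALL (opcode 0x9a, or 0xff /3) and 1 for a far JMP (0xea, or 0xff /5);
 * the JMP cases simply fall through to the CALL cases and increment twice.
 */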
2111 for ( eip = regs->eip; eip - regs->_eip < 10; )
2113 switch ( insn_fetch(u8, base, eip, limit) )
2115 case 0x66: /* operand-size override */
2116 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
2117 continue;
2118 case 0x67: /* address-size override */
2119 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
2120 continue;
2121 case 0x2e: /* CS override */
2122 opnd_sel = regs->cs;
2123 ASSERT(opnd_sel);
2124 continue;
2125 case 0x3e: /* DS override */
2126 opnd_sel = read_sreg(regs, ds);
2127 if ( !opnd_sel )
2128 opnd_sel = dpl;
2129 continue;
2130 case 0x26: /* ES override */
2131 opnd_sel = read_sreg(regs, es);
2132 if ( !opnd_sel )
2133 opnd_sel = dpl;
2134 continue;
2135 case 0x64: /* FS override */
2136 opnd_sel = read_sreg(regs, fs);
2137 if ( !opnd_sel )
2138 opnd_sel = dpl;
2139 continue;
2140 case 0x65: /* GS override */
2141 opnd_sel = read_sreg(regs, gs);
2142 if ( !opnd_sel )
2143 opnd_sel = dpl;
2144 continue;
2145 case 0x36: /* SS override */
2146 opnd_sel = regs->ss;
2147 if ( !opnd_sel )
2148 opnd_sel = dpl;
2149 continue;
2150 case 0xea:
2151 ++jump;
2152 /* FALLTHROUGH */
2153 case 0x9a:
2154 ++jump;
2155 opnd_sel = regs->cs;
2156 opnd_off = eip;
2157 ad_bytes = ad_default;
2158 eip += op_bytes + 2;
2159 break;
2160 case 0xff:
2162 unsigned int modrm;
2164 switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
2166 case 0x28: case 0x68: case 0xa8:
2167 ++jump;
2168 /* FALLTHROUGH */
2169 case 0x18: case 0x58: case 0x98:
2170 ++jump;
2171 if ( ad_bytes != 2 )
2173 if ( (modrm & 7) == 4 )
2175 unsigned int sib = insn_fetch(u8, base, eip, limit);
2177 modrm = (modrm & ~7) | (sib & 7);
2178 if ( (sib >>= 3) != 4 )
2179 opnd_off = *(unsigned long *)decode_register(sib & 7, regs, 0);
2180 opnd_off <<= sib >> 3;
2182 if ( (modrm & 7) != 5 || (modrm & 0xc0) )
2183 opnd_off += *(unsigned long *)decode_register(modrm & 7, regs, 0);
2184 else
2185 modrm |= 0x87;
2186 if ( !opnd_sel )
2188 switch ( modrm & 7 )
2190 default:
2191 opnd_sel = read_sreg(regs, ds);
2192 break;
2193 case 4: case 5:
2194 opnd_sel = regs->ss;
2195 break;
2199 else
2201 switch ( modrm & 7 )
2203 case 0: case 1: case 7:
2204 opnd_off = regs->ebx;
2205 break;
2206 case 6:
2207 if ( !(modrm & 0xc0) )
2208 modrm |= 0x80;
2209 else
2210 case 2: case 3:
2212 opnd_off = regs->ebp;
2213 if ( !opnd_sel )
2214 opnd_sel = regs->ss;
2216 break;
2218 if ( !opnd_sel )
2219 opnd_sel = read_sreg(regs, ds);
2220 switch ( modrm & 7 )
2222 case 0: case 2: case 4:
2223 opnd_off += regs->esi;
2224 break;
2225 case 1: case 3: case 5:
2226 opnd_off += regs->edi;
2227 break;
2230 switch ( modrm & 0xc0 )
2232 case 0x40:
2233 opnd_off += insn_fetch(s8, base, eip, limit);
2234 break;
2235 case 0x80:
2236 opnd_off += insn_fetch(s32, base, eip, limit);
2237 break;
2239 if ( ad_bytes == 4 )
2240 opnd_off = (unsigned int)opnd_off;
2241 else if ( ad_bytes == 2 )
2242 opnd_off = (unsigned short)opnd_off;
2243 break;
2246 break;
2248 break;
2251 if ( jump < 0 )
2253 fail:
2254 do_guest_trap(TRAP_gp_fault, regs, 1);
2255 skip:
2256 return;
2259 if ( (opnd_sel != regs->cs &&
2260 !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
2261 !(ar & _SEGMENT_S) ||
2262 !(ar & _SEGMENT_P) ||
2263 ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
2265 do_guest_trap(TRAP_gp_fault, regs, 1);
2266 return;
2269 opnd_off += op_bytes;
2270 #define ad_default ad_bytes
2271 opnd_sel = insn_fetch(u16, base, opnd_off, limit);
2272 #undef ad_default
2273 ASSERT((opnd_sel & ~3) == regs->error_code);
2274 if ( dpl < (opnd_sel & 3) )
2276 do_guest_trap(TRAP_gp_fault, regs, 1);
2277 return;
2280 if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
2281 !(ar & _SEGMENT_S) ||
2282 !(ar & _SEGMENT_CODE) ||
2283 (!jump || (ar & _SEGMENT_EC) ?
2284 ((ar >> 13) & 3) > (regs->cs & 3) :
2285 ((ar >> 13) & 3) != (regs->cs & 3)) )
2287 regs->error_code = sel;
2288 do_guest_trap(TRAP_gp_fault, regs, 1);
2289 return;
2291 if ( !(ar & _SEGMENT_P) )
2293 regs->error_code = sel;
2294 do_guest_trap(TRAP_no_segment, regs, 1);
2295 return;
2297 if ( off > limit )
2299 regs->error_code = 0;
2300 do_guest_trap(TRAP_gp_fault, regs, 1);
2301 return;
2304 if ( !jump )
2306 unsigned int ss, esp, *stkp;
2307 int rc;
2308 #define push(item) do \
2309 { \
2310 --stkp; \
2311 esp -= 4; \
2312 rc = __put_user(item, stkp); \
2313 if ( rc ) \
2314 { \
2315 propagate_page_fault((unsigned long)(stkp + 1) - rc, \
2316 PFEC_write_access); \
2317 return; \
2318 } \
2319 } while ( 0 )
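/*
 * push() mimics the stack writes the CPU would perform on a far transfer
 * through a call gate: it stores one 32-bit item on the stack being built
 * for the guest.  If the store faults, a write page fault at the offending
 * stack address is forwarded to the guest and the emulation is abandoned.
 */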
2321 if ( ((ar >> 13) & 3) < (regs->cs & 3) )
2323 sel |= (ar >> 13) & 3;
2324 /* Inner stack known only for kernel ring. */
2325 if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
2327 do_guest_trap(TRAP_gp_fault, regs, 1);
2328 return;
2330 esp = v->arch.guest_context.kernel_sp;
2331 ss = v->arch.guest_context.kernel_ss;
2332 if ( (ss & 3) != (sel & 3) ||
2333 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2334 ((ar >> 13) & 3) != (sel & 3) ||
2335 !(ar & _SEGMENT_S) ||
2336 (ar & _SEGMENT_CODE) ||
2337 !(ar & _SEGMENT_WR) )
2339 regs->error_code = ss & ~3;
2340 do_guest_trap(TRAP_invalid_tss, regs, 1);
2341 return;
2343 if ( !(ar & _SEGMENT_P) ||
2344 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
2346 regs->error_code = ss & ~3;
2347 do_guest_trap(TRAP_stack_error, regs, 1);
2348 return;
2350 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2351 if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
2353 do_guest_trap(TRAP_gp_fault, regs, 1);
2354 return;
2356 push(regs->ss);
2357 push(regs->esp);
2358 if ( nparm )
2360 const unsigned int *ustkp;
2362 if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
2363 ((ar >> 13) & 3) != (regs->cs & 3) ||
2364 !(ar & _SEGMENT_S) ||
2365 (ar & _SEGMENT_CODE) ||
2366 !(ar & _SEGMENT_WR) ||
2367 !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
2368 return do_guest_trap(TRAP_gp_fault, regs, 1);
2369 ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
2370 if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
2372 do_guest_trap(TRAP_gp_fault, regs, 1);
2373 return;
2375 do
2377 unsigned int parm;
2379 --ustkp;
2380 rc = __get_user(parm, ustkp);
2381 if ( rc )
2383 propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
2384 return;
2386 push(parm);
2387 } while ( --nparm );
2390 else
2392 sel |= (regs->cs & 3);
2393 esp = regs->esp;
2394 ss = regs->ss;
2395 if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2396 ((ar >> 13) & 3) != (sel & 3) )
2398 do_guest_trap(TRAP_gp_fault, regs, 1);
2399 return;
2401 if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
2403 regs->error_code = 0;
2404 do_guest_trap(TRAP_stack_error, regs, 1);
2405 return;
2407 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2408 if ( !compat_access_ok(stkp - 2, 2 * 4) )
2410 do_guest_trap(TRAP_gp_fault, regs, 1);
2411 return;
2414 push(regs->cs);
2415 push(eip);
2416 #undef push
2417 regs->esp = esp;
2418 regs->ss = ss;
2420 else
2421 sel |= (regs->cs & 3);
2423 regs->cs = sel;
2424 instruction_done(regs, off, 0);
2425 #endif
2428 asmlinkage void do_general_protection(struct cpu_user_regs *regs)
2430 struct vcpu *v = current;
2431 unsigned long fixup;
2433 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
2435 if ( regs->error_code & 1 )
2436 goto hardware_gp;
2438 if ( !guest_mode(regs) )
2439 goto gp_in_kernel;
2441 /*
2442 * Cunning trick to allow arbitrary "INT n" handling.
2444 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
2445 * instruction from trapping to the appropriate vector, when that might not
2446 * be expected by Xen or the guest OS. For example, that entry might be for
2447 * a fault handler (unlike traps, faults don't increment EIP), or might
2448 * expect an error code on the stack (which a software trap never
2449 * provides), or might be a hardware interrupt handler that doesn't like
2450 * being called spuriously.
2452 * Instead, a GPF occurs with the faulting IDT vector in the error code.
2453 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
2454 * clear to indicate that it's a software fault, not hardware.
2456 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
2457 * okay because they can only be triggered by an explicit DPL-checked
2458 * instruction. The DPL specified by the guest OS for these vectors is NOT
2459 * CHECKED!!
2460 */
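/*
 * Worked example: a guest kernel running outside ring 0 executes
 * "int $0x80" while the real IDT entry for vector 0x80 has DPL 0.  The CPU
 * raises #GP with error code (0x80 << 3) | 2 = 0x402: bit 1 set (IDT entry),
 * bit 0 clear (software fault).  The code below recovers the vector as
 * error_code >> 3, checks the DPL the guest registered for it, steps EIP
 * past the two-byte "int imm8" and delivers the virtual trap.
 */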
2461 if ( (regs->error_code & 3) == 2 )
2462 {
2463 /* This fault must be due to <INT n> instruction. */
2464 const struct trap_info *ti;
2465 unsigned char vector = regs->error_code >> 3;
2466 ti = &v->arch.guest_context.trap_ctxt[vector];
2467 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
2468 {
2469 regs->eip += 2;
2470 do_guest_trap(vector, regs, 0);
2471 return;
2472 }
2473 }
2474 else if ( is_pv_32on64_vcpu(v) && regs->error_code )
2475 {
2476 emulate_gate_op(regs);
2477 return;
2478 }
2480 /* Emulate some simple privileged and I/O instructions. */
2481 if ( (regs->error_code == 0) &&
2482 emulate_privileged_op(regs) )
2484 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
2485 return;
2488 #if defined(__i386__)
2489 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
2490 (regs->error_code == 0) &&
2491 gpf_emulate_4gb(regs) )
2493 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
2494 return;
2496 #endif
2498 /* Pass on GPF as is. */
2499 do_guest_trap(TRAP_gp_fault, regs, 1);
2500 return;
2502 gp_in_kernel:
2504 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
2505 {
2506 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
2507 regs->error_code, _p(regs->eip), _p(fixup));
2508 regs->eip = fixup;
2509 return;
2510 }
2512 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
2514 hardware_gp:
2515 show_execution_state(regs);
2516 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
2519 static void nmi_softirq(void)
2521 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
2522 vcpu_kick(dom0->vcpu[0]);
2525 static void nmi_dom0_report(unsigned int reason_idx)
2527 struct domain *d;
2528 struct vcpu *v;
2530 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
2531 return;
2533 set_bit(reason_idx, nmi_reason(d));
2535 if ( !test_and_set_bool(v->nmi_pending) )
2536 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
2539 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
2541 switch ( opt_nmi[0] )
2543 case 'd': /* 'dom0' */
2544 nmi_dom0_report(_XEN_NMIREASON_parity_error);
2545 case 'i': /* 'ignore' */
2546 break;
2547 default: /* 'fatal' */
2548 console_force_unlock();
2549 printk("\n\nNMI - MEMORY ERROR\n");
2550 fatal_trap(TRAP_nmi, regs);
2553 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
2554 mdelay(1);
2555 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
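/*
 * Port 0x61 (system control port B) is used here and in do_nmi(): on reads,
 * bit 7 reports a memory parity/SERR NMI and bit 6 an I/O channel check
 * (IOCHK) NMI; on writes, setting bit 2 clears and disables the parity
 * check and setting bit 3 does the same for IOCHK.  Hence the sequence
 * above: latch the disable bit, wait briefly, then write it back to zero
 * to re-enable checking.
 */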
2558 asmlinkage void io_check_error(struct cpu_user_regs *regs)
2560 switch ( opt_nmi[0] )
2562 case 'd': /* 'dom0' */
2563 nmi_dom0_report(_XEN_NMIREASON_io_error);
2564 case 'i': /* 'ignore' */
2565 break;
2566 default: /* 'fatal' */
2567 console_force_unlock();
2568 printk("\n\nNMI - I/O ERROR\n");
2569 fatal_trap(TRAP_nmi, regs);
2572 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
2573 mdelay(1);
2574 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
2577 static void unknown_nmi_error(unsigned char reason)
2579 switch ( opt_nmi[0] )
2581 case 'd': /* 'dom0' */
2582 nmi_dom0_report(_XEN_NMIREASON_unknown);
2583 case 'i': /* 'ignore' */
2584 break;
2585 default: /* 'fatal' */
2586 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
2587 printk("Dazed and confused, but trying to continue\n");
2588 printk("Do you have a strange power saving mode enabled?\n");
2589 kexec_crash();
2593 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
2595 return 0;
2598 static nmi_callback_t nmi_callback = dummy_nmi_callback;
2600 asmlinkage void do_nmi(struct cpu_user_regs *regs)
2602 unsigned int cpu = smp_processor_id();
2603 unsigned char reason;
2605 ++nmi_count(cpu);
2607 if ( nmi_callback(regs, cpu) )
2608 return;
2610 if ( nmi_watchdog )
2611 nmi_watchdog_tick(regs);
2613 /* Only the BSP gets external NMIs from the system. */
2614 if ( cpu == 0 )
2616 reason = inb(0x61);
2617 if ( reason & 0x80 )
2618 mem_parity_error(regs);
2619 else if ( reason & 0x40 )
2620 io_check_error(regs);
2621 else if ( !nmi_watchdog )
2622 unknown_nmi_error((unsigned char)(reason&0xff));
2626 void set_nmi_callback(nmi_callback_t callback)
2628 nmi_callback = callback;
2631 void unset_nmi_callback(void)
2633 nmi_callback = dummy_nmi_callback;
2636 asmlinkage void do_device_not_available(struct cpu_user_regs *regs)
2638 struct vcpu *curr = current;
2640 BUG_ON(!guest_mode(regs));
2642 setup_fpu(curr);
2644 if ( curr->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
2645 {
2646 do_guest_trap(TRAP_no_device, regs, 0);
2647 curr->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
2648 }
2649 else
2650 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
2652 return;
2655 asmlinkage void do_debug(struct cpu_user_regs *regs)
2657 struct vcpu *v = current;
2659 DEBUGGER_trap_entry(TRAP_debug, regs);
2661 if ( !guest_mode(regs) )
2663 if ( regs->eflags & EF_TF )
2665 #ifdef __x86_64__
2666 void sysenter_entry(void);
2667 void sysenter_eflags_saved(void);
2668 /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
2669 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
2670 (regs->rip < (unsigned long)sysenter_eflags_saved) )
2671 goto out;
2672 WARN_ON(regs->rip != (unsigned long)sysenter_eflags_saved);
2673 #else
2674 WARN_ON(1);
2675 #endif
2676 regs->eflags &= ~EF_TF;
2678 else
2680 /*
2681 * We ignore watchpoints when they trigger within Xen. This may
2682 * happen when a buffer is passed to us which previously had a
2683 * watchpoint set on it. No need to bump EIP; the only faulting
2684 * trap is an instruction breakpoint, which can't happen to us.
2685 */
2686 WARN_ON(!search_exception_table(regs->eip));
2688 goto out;
2691 /* Save debug status register where guest OS can peek at it */
2692 v->arch.guest_context.debugreg[6] = read_debugreg(6);
2694 ler_enable();
2695 do_guest_trap(TRAP_debug, regs, 0);
2696 return;
2698 out:
2699 ler_enable();
2700 return;
2703 asmlinkage void do_spurious_interrupt_bug(struct cpu_user_regs *regs)
2707 static void __set_intr_gate(unsigned int n, uint32_t dpl, void *addr)
2709 int i;
2710 /* Keep secondary tables in sync with IRQ updates. */
2711 for ( i = 1; i < NR_CPUS; i++ )
2712 if ( idt_tables[i] != NULL )
2713 _set_gate(&idt_tables[i][n], 14, dpl, addr);
2714 _set_gate(&idt_table[n], 14, dpl, addr);
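/*
 * The hard-coded type 14 makes every gate installed through these helpers
 * an interrupt gate (type 15 would be a trap gate), so handlers start with
 * interrupts disabled; see the comment in trap_init() below for why that
 * matters.
 */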
2717 static void set_swint_gate(unsigned int n, void *addr)
2719 __set_intr_gate(n, 3, addr);
2722 void set_intr_gate(unsigned int n, void *addr)
2724 __set_intr_gate(n, 0, addr);
2727 void set_tss_desc(unsigned int n, void *addr)
2729 _set_tssldt_desc(
2730 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2731 (unsigned long)addr,
2732 offsetof(struct tss_struct, __cacheline_filler) - 1,
2733 9);
2734 #ifdef CONFIG_COMPAT
2735 _set_tssldt_desc(
2736 compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2737 (unsigned long)addr,
2738 offsetof(struct tss_struct, __cacheline_filler) - 1,
2739 11);
2740 #endif
2743 void __devinit percpu_traps_init(void)
2745 subarch_percpu_traps_init();
2747 if ( !opt_ler )
2748 return;
2750 switch ( boot_cpu_data.x86_vendor )
2752 case X86_VENDOR_INTEL:
2753 switch ( boot_cpu_data.x86 )
2755 case 6:
2756 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2757 break;
2758 case 15:
2759 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
2760 break;
2762 break;
2763 case X86_VENDOR_AMD:
2764 switch ( boot_cpu_data.x86 )
2766 case 6:
2767 case 15:
2768 case 16:
2769 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2770 break;
2772 break;
2775 ler_enable();
2778 void __init trap_init(void)
2780 /*
2781 * Note that interrupt gates are always used, rather than trap gates. We
2782 * must have interrupts disabled until DS/ES/FS/GS are saved because the
2783 * first activation must have the "bad" value(s) for these registers and
2784 * we may lose them if another activation is installed before they are
2785 * saved. The page-fault handler also needs interrupts disabled until %cr2
2786 * has been read and saved on the stack.
2787 */
2788 set_intr_gate(TRAP_divide_error,&divide_error);
2789 set_intr_gate(TRAP_debug,&debug);
2790 set_intr_gate(TRAP_nmi,&nmi);
2791 set_swint_gate(TRAP_int3,&int3); /* usable from all privileges */
2792 set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */
2793 set_intr_gate(TRAP_bounds,&bounds);
2794 set_intr_gate(TRAP_invalid_op,&invalid_op);
2795 set_intr_gate(TRAP_no_device,&device_not_available);
2796 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
2797 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
2798 set_intr_gate(TRAP_no_segment,&segment_not_present);
2799 set_intr_gate(TRAP_stack_error,&stack_segment);
2800 set_intr_gate(TRAP_gp_fault,&general_protection);
2801 set_intr_gate(TRAP_page_fault,&page_fault);
2802 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
2803 set_intr_gate(TRAP_copro_error,&coprocessor_error);
2804 set_intr_gate(TRAP_alignment_check,&alignment_check);
2805 set_intr_gate(TRAP_machine_check,&machine_check);
2806 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
2808 /* CPU0 uses the master IDT. */
2809 idt_tables[0] = idt_table;
2811 percpu_traps_init();
2813 cpu_init();
2815 open_softirq(NMI_SOFTIRQ, nmi_softirq);
2818 long register_guest_nmi_callback(unsigned long address)
2820 struct vcpu *v = current;
2821 struct domain *d = v->domain;
2822 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
2824 t->vector = TRAP_nmi;
2825 t->flags = 0;
2826 t->cs = !IS_COMPAT(d) ? FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS;
2827 t->address = address;
2828 TI_SET_IF(t, 1);
2830 /*
2831 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
2832 * now.
2833 */
2834 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
2835 v->nmi_pending = 1;
2837 return 0;
2840 long unregister_guest_nmi_callback(void)
2842 struct vcpu *v = current;
2843 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
2845 memset(t, 0, sizeof(*t));
2847 return 0;
2850 long do_set_trap_table(XEN_GUEST_HANDLE(const_trap_info_t) traps)
2852 struct trap_info cur;
2853 struct vcpu *curr = current;
2854 struct trap_info *dst = curr->arch.guest_context.trap_ctxt;
2855 long rc = 0;
2857 /* If no table is presented then clear the entire virtual IDT. */
2858 if ( guest_handle_is_null(traps) )
2860 memset(dst, 0, 256 * sizeof(*dst));
2861 init_int80_direct_trap(curr);
2862 return 0;
2865 for ( ; ; )
2867 if ( hypercall_preempt_check() )
2869 rc = hypercall_create_continuation(
2870 __HYPERVISOR_set_trap_table, "h", traps);
2871 break;
2874 if ( copy_from_guest(&cur, traps, 1) )
2876 rc = -EFAULT;
2877 break;
2880 if ( cur.address == 0 )
2881 break;
2883 fixup_guest_code_selector(curr->domain, cur.cs);
2885 memcpy(&dst[cur.vector], &cur, sizeof(cur));
2887 if ( cur.vector == 0x80 )
2888 init_int80_direct_trap(curr);
2890 guest_handle_add_offset(traps, 1);
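/*
 * Note on the loop above: trap table entries are copied in one at a time
 * and the list is terminated by an entry with a zero address.  If a
 * preemption is pending, hypercall_create_continuation() arranges for
 * __HYPERVISOR_set_trap_table to be restarted with the handle already
 * advanced past the entries consumed so far, so a large table cannot
 * monopolise the CPU.
 */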
2893 return rc;
2896 long set_debugreg(struct vcpu *v, int reg, unsigned long value)
2898 int i;
2899 struct vcpu *curr = current;
2901 switch ( reg )
2903 case 0:
2904 if ( !access_ok(value, sizeof(long)) )
2905 return -EPERM;
2906 if ( v == curr )
2907 write_debugreg(0, value);
2908 break;
2909 case 1:
2910 if ( !access_ok(value, sizeof(long)) )
2911 return -EPERM;
2912 if ( v == curr )
2913 write_debugreg(1, value);
2914 break;
2915 case 2:
2916 if ( !access_ok(value, sizeof(long)) )
2917 return -EPERM;
2918 if ( v == curr )
2919 write_debugreg(2, value);
2920 break;
2921 case 3:
2922 if ( !access_ok(value, sizeof(long)) )
2923 return -EPERM;
2924 if ( v == curr )
2925 write_debugreg(3, value);
2926 break;
2927 case 6:
2928 /*
2929 * DR6: Bits 4-11,16-31 reserved (set to 1).
2930 * Bit 12 reserved (set to 0).
2931 */
2932 value &= 0xffffefff; /* reserved bits => 0 */
2933 value |= 0xffff0ff0; /* reserved bits => 1 */
2934 if ( v == curr )
2935 write_debugreg(6, value);
2936 break;
2937 case 7:
2938 /*
2939 * DR7: Bit 10 reserved (set to 1).
2940 * Bits 11-12,14-15 reserved (set to 0).
2941 */
2942 value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
2943 value |= DR_CONTROL_RESERVED_ONE; /* reserved bits => 1 */
2944 /*
2945 * Privileged bits:
2946 * GD (bit 13): must be 0.
2947 */
2948 if ( value & DR_GENERAL_DETECT )
2949 return -EPERM;
2950 /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
2951 if ( value & DR7_ACTIVE_MASK )
2953 unsigned int io_enable = 0;
2955 for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
2957 if ( ((value >> i) & 3) == DR_IO )
2959 if ( !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
2960 return -EPERM;
2961 io_enable |= value & (3 << ((i - 16) >> 1));
2963 #ifdef __i386__
2964 if ( ((boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
2965 !boot_cpu_has(X86_FEATURE_LM)) &&
2966 (((value >> i) & 0xc) == DR_LEN_8) )
2967 return -EPERM;
2968 #endif
2971 /* Guest DR5 is a handy stash for I/O intercept information. */
2972 v->arch.guest_context.debugreg[5] = io_enable;
2973 value &= ~io_enable;
2975 /*
2976 * If DR7 was previously clear then we need to load all other
2977 * debug registers at this point as they were not restored during
2978 * context switch.
2979 */
2980 if ( (v == curr) &&
2981 !(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
2983 write_debugreg(0, v->arch.guest_context.debugreg[0]);
2984 write_debugreg(1, v->arch.guest_context.debugreg[1]);
2985 write_debugreg(2, v->arch.guest_context.debugreg[2]);
2986 write_debugreg(3, v->arch.guest_context.debugreg[3]);
2987 write_debugreg(6, v->arch.guest_context.debugreg[6]);
2990 if ( v == curr )
2991 write_debugreg(7, value);
2992 break;
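/*
 * DR7 layout, for the checks in case 7 above: bits 0-7 are the
 * per-breakpoint local/global enables (DR7_ACTIVE_MASK); from
 * DR_CONTROL_SHIFT (bit 16) upwards each breakpoint owns a 4-bit field of
 * two R/W bits (DR_IO selects I/O breakpoints, which also require CR4.DE)
 * followed by two length bits (DR_LEN_8 is only valid on 64-bit capable
 * CPUs, hence the __i386__ check).  Enables belonging to I/O breakpoints
 * are stashed in the unused debugreg[5] slot and masked out of the value
 * loaded into the real DR7.
 */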
2993 default:
2994 return -EINVAL;
2997 v->arch.guest_context.debugreg[reg] = value;
2998 return 0;
3001 long do_set_debugreg(int reg, unsigned long value)
3003 return set_debugreg(current, reg, value);
3006 unsigned long do_get_debugreg(int reg)
3008 struct vcpu *curr = current;
3010 switch ( reg )
3012 case 0 ... 3:
3013 case 6:
3014 return curr->arch.guest_context.debugreg[reg];
3015 case 7:
3016 return (curr->arch.guest_context.debugreg[7] |
3017 curr->arch.guest_context.debugreg[5]);
3018 case 4 ... 5:
3019 return ((curr->arch.guest_context.ctrlreg[4] & X86_CR4_DE) ?
3020 curr->arch.guest_context.debugreg[reg + 2] : 0);
3023 return -EINVAL;
3026 /*
3027 * Local variables:
3028 * mode: C
3029 * c-set-style: "BSD"
3030 * c-basic-offset: 4
3031 * tab-width: 4
3032 * indent-tabs-mode: nil
3033 * End:
3034 */