debuggers.hg

xen/arch/x86/traps.c @ 16751:973221f4d9c7

x86: INT3 and INTO trap gates should have DPL==3.
This was broken by c/s 16667 (gdbstub changes).
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Jan 15 11:29:15 2008 +0000 (2008-01-15)
parents 9bf8b152df9f
children a66bdc82d8fa
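INT3 (vector 3, the one-byte 0xCC breakpoint) and INTO (vector 4) are executed directly by guest code, including guest userspace, so their IDT gates must carry DPL 3: a software INT n whose gate DPL is lower than the caller's CPL raises #GP instead of entering the handler. The fix itself lives in trap_init(), which is outside this excerpt; below is only a sketch of what DPL-3 gate setup looks like, assuming a set_system_gate() helper that installs a trap gate with DPL 3 (int3 and overflow are the assembly entry stubs declared further down in this file):

    /* Sketch only -- not the verbatim changeset. */
    set_system_gate(TRAP_int3,     &int3);      /* vector 3, DPL 3 */
    set_system_gate(TRAP_overflow, &overflow);  /* vector 4, DPL 3 */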
line source
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <asm/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/desc.h>
55 #include <asm/debugreg.h>
56 #include <asm/smp.h>
57 #include <asm/flushtlb.h>
58 #include <asm/uaccess.h>
59 #include <asm/i387.h>
60 #include <asm/debugger.h>
61 #include <asm/msr.h>
62 #include <asm/shared.h>
63 #include <asm/x86_emulate.h>
64 #include <asm/hvm/vpt.h>
66 /*
67 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
68 * fatal: Xen prints diagnostic message and then hangs.
69 * dom0: The NMI is virtualised to DOM0.
70 * ignore: The NMI error is cleared and ignored.
71 */
72 #ifdef NDEBUG
73 char opt_nmi[10] = "dom0";
74 #else
75 char opt_nmi[10] = "fatal";
76 #endif
77 string_param("nmi", opt_nmi);
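string_param() registers "nmi" as a hypervisor command-line option, so the policy above is chosen at boot time. For example (an illustrative GRUB legacy entry; the paths and the dom0 module line are placeholders):

    kernel /boot/xen.gz nmi=dom0
    module /boot/vmlinuz-dom0 root=/dev/sda1 ro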
79 DEFINE_PER_CPU(u32, ler_msr);
81 /* Master table, used by CPU0. */
82 idt_entry_t idt_table[IDT_ENTRIES];
84 /* Pointer to the IDT of every CPU. */
85 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
87 #define DECLARE_TRAP_HANDLER(_name) \
88 asmlinkage void _name(void); \
89 asmlinkage void do_ ## _name(struct cpu_user_regs *regs)
91 DECLARE_TRAP_HANDLER(divide_error);
92 DECLARE_TRAP_HANDLER(debug);
93 DECLARE_TRAP_HANDLER(nmi);
94 DECLARE_TRAP_HANDLER(int3);
95 DECLARE_TRAP_HANDLER(overflow);
96 DECLARE_TRAP_HANDLER(bounds);
97 DECLARE_TRAP_HANDLER(invalid_op);
98 DECLARE_TRAP_HANDLER(device_not_available);
99 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
100 DECLARE_TRAP_HANDLER(invalid_TSS);
101 DECLARE_TRAP_HANDLER(segment_not_present);
102 DECLARE_TRAP_HANDLER(stack_segment);
103 DECLARE_TRAP_HANDLER(general_protection);
104 DECLARE_TRAP_HANDLER(page_fault);
105 DECLARE_TRAP_HANDLER(coprocessor_error);
106 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
107 DECLARE_TRAP_HANDLER(machine_check);
108 DECLARE_TRAP_HANDLER(alignment_check);
109 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
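Each DECLARE_TRAP_HANDLER() use above declares both the assembly entry stub and its C-level handler via token pasting; for instance, DECLARE_TRAP_HANDLER(int3) expands to:

    asmlinkage void int3(void);                           /* assembly entry point */
    asmlinkage void do_int3(struct cpu_user_regs *regs);  /* C handler, defined below */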
111 long do_set_debugreg(int reg, unsigned long value);
112 unsigned long do_get_debugreg(int reg);
113 void (*ioemul_handle_quirk)(
114 u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
116 static int debug_stack_lines = 20;
117 integer_param("debug_stack_lines", debug_stack_lines);
119 static int opt_ler;
120 boolean_param("ler", opt_ler);
122 #ifdef CONFIG_X86_32
123 #define stack_words_per_line 8
124 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
125 #else
126 #define stack_words_per_line 4
127 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
128 #endif
130 static void show_guest_stack(struct cpu_user_regs *regs)
131 {
132 int i;
133 struct vcpu *curr = current;
134 unsigned long *stack, addr;
136 if ( is_hvm_vcpu(curr) )
137 return;
139 if ( is_pv_32on64_vcpu(curr) )
140 {
141 compat_show_guest_stack(regs, debug_stack_lines);
142 return;
143 }
145 if ( vm86_mode(regs) )
146 {
147 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
148 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
149 regs->ss, (uint16_t)(regs->esp & 0xffff));
150 }
151 else
152 {
153 stack = (unsigned long *)regs->esp;
154 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
155 }
157 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
158 {
159 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
160 break;
161 if ( get_user(addr, stack) )
162 {
163 if ( i != 0 )
164 printk("\n ");
165 printk("Fault while accessing guest memory.");
166 i = 1;
167 break;
168 }
169 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
170 printk("\n ");
171 printk(" %p", _p(addr));
172 stack++;
173 }
174 if ( i == 0 )
175 printk("Stack empty.");
176 printk("\n");
177 }
179 #if !defined(CONFIG_FRAME_POINTER)
181 static void show_trace(struct cpu_user_regs *regs)
182 {
183 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
185 printk("Xen call trace:\n ");
187 printk("[<%p>]", _p(regs->eip));
188 print_symbol(" %s\n ", regs->eip);
190 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
191 {
192 addr = *stack++;
193 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
194 {
195 printk("[<%p>]", _p(addr));
196 print_symbol(" %s\n ", addr);
197 }
198 }
200 printk("\n");
201 }
203 #else
205 static void show_trace(struct cpu_user_regs *regs)
206 {
207 unsigned long *frame, next, addr, low, high;
209 printk("Xen call trace:\n ");
211 printk("[<%p>]", _p(regs->eip));
212 print_symbol(" %s\n ", regs->eip);
214 /* Bounds for range of valid frame pointer. */
215 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
216 high = (low & ~(STACK_SIZE - 1)) +
217 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
219 /* The initial frame pointer. */
220 next = regs->ebp;
222 for ( ; ; )
223 {
224 /* Valid frame pointer? */
225 if ( (next < low) || (next >= high) )
226 {
227 /*
228 * Exception stack frames have a different layout, denoted by an
229 * inverted frame pointer.
230 */
231 next = ~next;
232 if ( (next < low) || (next >= high) )
233 break;
234 frame = (unsigned long *)next;
235 next = frame[0];
236 addr = frame[(offsetof(struct cpu_user_regs, eip) -
237 offsetof(struct cpu_user_regs, ebp))
238 / BYTES_PER_LONG];
239 }
240 else
241 {
242 /* Ordinary stack frame. */
243 frame = (unsigned long *)next;
244 next = frame[0];
245 addr = frame[1];
246 }
248 printk("[<%p>]", _p(addr));
249 print_symbol(" %s\n ", addr);
251 low = (unsigned long)&frame[2];
252 }
254 printk("\n");
255 }
257 #endif
259 void show_stack(struct cpu_user_regs *regs)
260 {
261 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
262 int i;
264 if ( guest_mode(regs) )
265 return show_guest_stack(regs);
267 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
269 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
270 {
271 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
272 break;
273 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
274 printk("\n ");
275 addr = *stack++;
276 printk(" %p", _p(addr));
277 }
278 if ( i == 0 )
279 printk("Stack empty.");
280 printk("\n");
282 show_trace(regs);
283 }
285 void show_stack_overflow(unsigned int cpu, unsigned long esp)
286 {
287 #ifdef MEMORY_GUARD
288 unsigned long esp_top, esp_bottom;
289 unsigned long *stack, addr;
291 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
292 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
294 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
295 (void *)esp_top, (void *)esp_bottom, (void *)esp,
296 (void *)init_tss[cpu].esp0);
298 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
299 if ( ((unsigned long)(esp - esp_top) > 512) &&
300 ((unsigned long)(esp_top - esp) > 512) )
301 {
302 printk("No stack overflow detected. Skipping stack trace.\n");
303 return;
304 }
306 if ( esp < esp_top )
307 esp = esp_top;
309 printk("Xen stack overflow (dumping trace %p-%p):\n ",
310 (void *)esp, (void *)esp_bottom);
312 stack = (unsigned long *)esp;
313 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
314 {
315 addr = *stack++;
316 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
317 {
318 printk("%p: [<%p>]", stack, _p(addr));
319 print_symbol(" %s\n ", addr);
320 }
321 }
323 printk("\n");
324 #endif
325 }
327 void show_execution_state(struct cpu_user_regs *regs)
328 {
329 show_registers(regs);
330 show_stack(regs);
331 }
333 char *trapstr(int trapnr)
334 {
335 static char *strings[] = {
336 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
337 "invalid opcode", "device not available", "double fault",
338 "coprocessor segment", "invalid tss", "segment not found",
339 "stack error", "general protection fault", "page fault",
340 "spurious interrupt", "coprocessor error", "alignment check",
341 "machine check", "simd error"
342 };
344 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
345 return "???";
347 return strings[trapnr];
348 }
350 /*
351 * This is called for faults at very unexpected times (e.g., when interrupts
352 * are disabled). In such situations we can't do much that is safe. We try to
353 * print out some tracing and then we just spin.
354 */
355 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
356 {
357 static DEFINE_PER_CPU(char, depth);
359 /*
360 * In some cases, we can end up in a vicious cycle of fatal_trap()s
361 * within fatal_trap()s. We give the problem a couple of iterations to
362 * bottom out, and then we just panic.
363 */
364 if ( ++this_cpu(depth) < 3 )
365 {
366 watchdog_disable();
367 console_start_sync();
369 show_execution_state(regs);
371 if ( trapnr == TRAP_page_fault )
372 {
373 unsigned long cr2 = read_cr2();
374 printk("Faulting linear address: %p\n", _p(cr2));
375 show_page_walk(cr2);
376 }
377 }
379 panic("FATAL TRAP: vector = %d (%s)\n"
380 "[error_code=%04x] %s\n",
381 trapnr, trapstr(trapnr), regs->error_code,
382 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
383 }
385 static void do_guest_trap(
386 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
387 {
388 struct vcpu *v = current;
389 struct trap_bounce *tb;
390 const struct trap_info *ti;
392 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
394 tb = &v->arch.trap_bounce;
395 ti = &v->arch.guest_context.trap_ctxt[trapnr];
397 tb->flags = TBF_EXCEPTION;
398 tb->cs = ti->cs;
399 tb->eip = ti->address;
401 if ( use_error_code )
402 {
403 tb->flags |= TBF_EXCEPTION_ERRCODE;
404 tb->error_code = regs->error_code;
405 }
407 if ( TI_GET_IF(ti) )
408 tb->flags |= TBF_INTERRUPT;
410 if ( unlikely(null_trap_bounce(v, tb)) )
411 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] "
412 "on VCPU %d [ec=%04x]\n",
413 trapstr(trapnr), trapnr, v->vcpu_id, regs->error_code);
414 }
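The trap_ctxt[] table consulted here is populated by the guest through the set_trap_table hypercall. A minimal guest-side sketch, assuming the trap_info layout from Xen's public headers and a Linux-style HYPERVISOR_set_trap_table() wrapper (my_int3_entry and GUEST_KERNEL_CS are placeholders for the guest's own entry stub and kernel code selector):

    #include <xen/interface/xen.h>   /* Linux-guest header path (illustrative); struct trap_info */

    extern void my_int3_entry(void); /* guest exception entry stub (placeholder) */

    static void register_int3_handler(void)
    {
        struct trap_info traps[2] = {
            { .vector  = 3,
              .flags   = 3,          /* DPL 3: reachable from guest userspace */
              .cs      = GUEST_KERNEL_CS,
              .address = (unsigned long)my_int3_entry },
            { 0 }                    /* all-zero entry terminates the table */
        };

        HYPERVISOR_set_trap_table(traps);
    }

When the corresponding exception is later bounced by do_guest_trap(), tb->cs and tb->eip come straight from this registration.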
416 static void instruction_done(
417 struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
418 {
419 regs->eip = eip;
420 regs->eflags &= ~X86_EFLAGS_RF;
421 if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
422 {
423 current->arch.guest_context.debugreg[6] |= bpmatch | 0xffff0ff0;
424 if ( regs->eflags & X86_EFLAGS_TF )
425 current->arch.guest_context.debugreg[6] |= 0x4000;
426 do_guest_trap(TRAP_debug, regs, 0);
427 }
428 }
430 static unsigned int check_guest_io_breakpoint(struct vcpu *v,
431 unsigned int port, unsigned int len)
432 {
433 unsigned int width, i, match = 0;
434 unsigned long start;
436 if ( !(v->arch.guest_context.debugreg[5]) ||
437 !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
438 return 0;
440 for ( i = 0; i < 4; i++ )
441 {
442 if ( !(v->arch.guest_context.debugreg[5] &
443 (3 << (i * DR_ENABLE_SIZE))) )
444 continue;
446 start = v->arch.guest_context.debugreg[i];
447 width = 0;
449 switch ( (v->arch.guest_context.debugreg[7] >>
450 (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
451 {
452 case DR_LEN_1: width = 1; break;
453 case DR_LEN_2: width = 2; break;
454 case DR_LEN_4: width = 4; break;
455 case DR_LEN_8: width = 8; break;
456 }
458 if ( (start < (port + len)) && ((start + width) > port) )
459 match |= 1 << i;
460 }
462 return match;
463 }
465 /*
466 * Called from asm to set up the NMI trapbounce info.
467 * Returns 0 if no callback is set up, else 1.
468 */
469 asmlinkage int set_guest_nmi_trapbounce(void)
470 {
471 struct vcpu *v = current;
472 struct trap_bounce *tb = &v->arch.trap_bounce;
473 do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0);
474 tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
475 return !null_trap_bounce(v, tb);
476 }
478 static inline void do_trap(
479 int trapnr, struct cpu_user_regs *regs, int use_error_code)
480 {
481 unsigned long fixup;
483 DEBUGGER_trap_entry(trapnr, regs);
485 if ( guest_mode(regs) )
486 {
487 do_guest_trap(trapnr, regs, use_error_code);
488 return;
489 }
491 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
492 {
493 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
494 trapnr, _p(regs->eip), _p(fixup));
495 regs->eip = fixup;
496 return;
497 }
499 DEBUGGER_trap_fatal(trapnr, regs);
501 show_execution_state(regs);
502 panic("FATAL TRAP: vector = %d (%s)\n"
503 "[error_code=%04x]\n",
504 trapnr, trapstr(trapnr), regs->error_code);
505 }
507 #define DO_ERROR_NOCODE(trapnr, name) \
508 asmlinkage void do_##name(struct cpu_user_regs *regs) \
509 { \
510 do_trap(trapnr, regs, 0); \
511 }
513 #define DO_ERROR(trapnr, name) \
514 asmlinkage void do_##name(struct cpu_user_regs *regs) \
515 { \
516 do_trap(trapnr, regs, 1); \
517 }
519 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
520 DO_ERROR_NOCODE(TRAP_overflow, overflow)
521 DO_ERROR_NOCODE(TRAP_bounds, bounds)
522 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
523 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
524 DO_ERROR( TRAP_no_segment, segment_not_present)
525 DO_ERROR( TRAP_stack_error, stack_segment)
526 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
527 DO_ERROR( TRAP_alignment_check, alignment_check)
528 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
530 int rdmsr_hypervisor_regs(
531 uint32_t idx, uint32_t *eax, uint32_t *edx)
532 {
533 idx -= 0x40000000;
534 if ( idx > 0 )
535 return 0;
537 switch ( idx )
538 {
539 case 0:
540 {
541 *eax = *edx = 0;
542 break;
543 }
544 default:
545 BUG();
546 }
548 return 1;
549 }
551 int wrmsr_hypervisor_regs(
552 uint32_t idx, uint32_t eax, uint32_t edx)
553 {
554 struct domain *d = current->domain;
556 idx -= 0x40000000;
557 if ( idx > 0 )
558 return 0;
560 switch ( idx )
561 {
562 case 0:
563 {
564 void *hypercall_page;
565 unsigned long mfn;
566 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
567 unsigned int idx = eax & 0xfff;
569 if ( idx > 0 )
570 {
571 gdprintk(XENLOG_WARNING,
572 "Out of range index %u to MSR %08x\n",
573 idx, 0x40000000);
574 return 0;
575 }
577 mfn = gmfn_to_mfn(d, gmfn);
579 if ( !mfn_valid(mfn) ||
580 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
581 {
582 gdprintk(XENLOG_WARNING,
583 "Bad GMFN %lx (MFN %lx) to MSR %08x\n",
584 gmfn, mfn, 0x40000000);
585 return 0;
586 }
588 hypercall_page = map_domain_page(mfn);
589 hypercall_page_initialise(d, hypercall_page);
590 unmap_domain_page(hypercall_page);
592 put_page_and_type(mfn_to_page(mfn));
593 break;
594 }
596 default:
597 BUG();
598 }
600 return 1;
601 }
603 int cpuid_hypervisor_leaves(
604 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
605 {
606 idx -= 0x40000000;
607 if ( idx > 2 )
608 return 0;
610 switch ( idx )
611 {
612 case 0:
613 *eax = 0x40000002; /* Largest leaf */
614 *ebx = 0x566e6558; /* Signature 1: "XenV" */
615 *ecx = 0x65584d4d; /* Signature 2: "MMXe" */
616 *edx = 0x4d4d566e; /* Signature 3: "nVMM" */
617 break;
619 case 1:
620 *eax = (xen_major_version() << 16) | xen_minor_version();
621 *ebx = 0; /* Reserved */
622 *ecx = 0; /* Reserved */
623 *edx = 0; /* Reserved */
624 break;
626 case 2:
627 *eax = 1; /* Number of hypercall-transfer pages */
628 *ebx = 0x40000000; /* MSR base address */
629 *ecx = 0; /* Features 1 */
630 *edx = 0; /* Features 2 */
631 break;
633 default:
634 BUG();
635 }
637 return 1;
638 }
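Taken together, cpuid_hypervisor_leaves() and wrmsr_hypervisor_regs() form the guest-visible discovery interface: CPUID leaf 0x40000000 returns the "XenVMMXenVMM" signature, leaf 0x40000002 reports the number of hypercall pages plus the MSR base, and writing (gfn << 12) | page_index to that MSR asks Xen to write hypercall trampolines into the named guest frame. A minimal guest-side sketch; the cpuid() and wrmsr() helpers are assumed to be provided by the guest, not by this file:

    #include <stdint.h>

    extern void cpuid(uint32_t leaf, uint32_t *a, uint32_t *b,
                      uint32_t *c, uint32_t *d);
    extern void wrmsr(uint32_t msr, uint64_t val);

    static int init_xen_hypercall_page(uint64_t page_gfn)
    {
        uint32_t a, b, c, d, msr_base;

        cpuid(0x40000000, &a, &b, &c, &d);
        if ( b != 0x566e6558 || c != 0x65584d4d || d != 0x4d4d566e )
            return -1;                             /* not running on Xen */

        cpuid(0x40000002, &a, &msr_base, &c, &d);  /* eax = #pages, ebx = MSR base */

        /* Install hypercall page 0 into the frame the guest reserved. */
        wrmsr(msr_base, (page_gfn << 12) | 0);
        return 0;
    }

Note that wrmsr_hypervisor_regs() reconstructs the frame number from edx:eax exactly as ((edx << 20) | (eax >> 12)), which is what the single 64-bit write above encodes.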
640 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
641 {
642 char sig[5], instr[2];
643 uint32_t a, b, c, d;
644 unsigned long eip, rc;
646 a = regs->eax;
647 b = regs->ebx;
648 c = regs->ecx;
649 d = regs->edx;
650 eip = regs->eip;
652 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
653 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
654 {
655 propagate_page_fault(eip + sizeof(sig) - rc, 0);
656 return EXCRET_fault_fixed;
657 }
658 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
659 return 0;
660 eip += sizeof(sig);
662 /* We only emulate CPUID. */
663 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
664 {
665 propagate_page_fault(eip + sizeof(instr) - rc, 0);
666 return EXCRET_fault_fixed;
667 }
668 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
669 return 0;
670 eip += sizeof(instr);
672 asm (
673 "cpuid"
674 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
675 : "0" (a), "1" (b), "2" (c), "3" (d) );
677 if ( regs->eax == 1 )
678 {
679 /* Modify Feature Information. */
680 __clear_bit(X86_FEATURE_VME, &d);
681 __clear_bit(X86_FEATURE_PSE, &d);
682 __clear_bit(X86_FEATURE_PGE, &d);
683 if ( !cpu_has_sep )
684 __clear_bit(X86_FEATURE_SEP, &d);
685 #ifdef __i386__
686 if ( !supervisor_mode_kernel )
687 __clear_bit(X86_FEATURE_SEP, &d);
688 #endif
689 if ( !IS_PRIV(current->domain) )
690 __clear_bit(X86_FEATURE_MTRR, &d);
691 }
692 else if ( regs->eax == 0x80000001 )
693 {
694 /* Modify Feature Information. */
695 #ifdef __i386__
696 __clear_bit(X86_FEATURE_SYSCALL % 32, &d);
697 #endif
698 __clear_bit(X86_FEATURE_RDTSCP % 32, &d);
699 }
700 else
701 {
702 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
703 }
705 regs->eax = a;
706 regs->ebx = b;
707 regs->ecx = c;
708 regs->edx = d;
710 instruction_done(regs, eip, 0);
712 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
714 return EXCRET_fault_fixed;
715 }
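The signature checked above ("\xf\xb" is ud2, followed by the ASCII bytes "xen") is what a PV guest emits when it wants CPUID filtered by Xen rather than executed natively: the prefixed sequence faults with #UD, lands here, and the following cpuid is emulated with the masking shown. A guest-side sketch of the sequence (the macro name is illustrative; it mirrors the forced-emulation prefix idea used by PV guests):

    /* Forced-emulation CPUID: ud2 + "xen" + cpuid. */
    #define XEN_CPUID(leaf, a, b, c, d)                             \
        asm volatile ( "ud2a ; .ascii \"xen\" ; cpuid"              \
                       : "=a" (a), "=b" (b), "=c" (c), "=d" (d)     \
                       : "0" (leaf) )

For example, XEN_CPUID(1, a, b, c, d) returns leaf 1 with VME/PSE/PGE (and, for unprivileged domains, MTRR) masked out, as implemented above.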
717 asmlinkage void do_invalid_op(struct cpu_user_regs *regs)
718 {
719 struct bug_frame bug;
720 struct bug_frame_str bug_str;
721 char *filename, *predicate, *eip = (char *)regs->eip;
722 unsigned long fixup;
723 int id, lineno;
725 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
727 if ( likely(guest_mode(regs)) )
728 {
729 if ( !emulate_forced_invalid_op(regs) )
730 do_guest_trap(TRAP_invalid_op, regs, 0);
731 return;
732 }
734 if ( !is_kernel(eip) ||
735 __copy_from_user(&bug, eip, sizeof(bug)) ||
736 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
737 (bug.ret != 0xc2) )
738 goto die;
739 eip += sizeof(bug);
741 id = bug.id & 3;
743 if ( id == BUGFRAME_dump )
744 {
745 show_execution_state(regs);
746 regs->eip = (unsigned long)eip;
747 return;
748 }
750 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
751 if ( !is_kernel(eip) ||
752 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
753 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
754 goto die;
755 eip += sizeof(bug_str);
757 filename = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
758 lineno = bug.id >> 2;
760 if ( id == BUGFRAME_warn )
761 {
762 printk("Xen WARN at %.50s:%d\n", filename, lineno);
763 show_execution_state(regs);
764 regs->eip = (unsigned long)eip;
765 return;
766 }
768 if ( id == BUGFRAME_bug )
769 {
770 printk("Xen BUG at %.50s:%d\n", filename, lineno);
771 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
772 show_execution_state(regs);
773 panic("Xen BUG at %.50s:%d\n", filename, lineno);
774 }
776 /* ASSERT: decode the predicate string pointer. */
777 ASSERT(id == BUGFRAME_assert);
778 if ( !is_kernel(eip) ||
779 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
780 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
781 goto die;
782 eip += sizeof(bug_str);
784 predicate = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
785 printk("Assertion '%s' failed at %.50s:%d\n",
786 predicate, filename, lineno);
787 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
788 show_execution_state(regs);
789 panic("Assertion '%s' failed at %.50s:%d\n",
790 predicate, filename, lineno);
792 die:
793 if ( (fixup = search_exception_table(regs->eip)) != 0 )
794 {
795 regs->eip = fixup;
796 return;
797 }
798 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
799 show_execution_state(regs);
800 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
801 }
803 asmlinkage void do_int3(struct cpu_user_regs *regs)
804 {
805 DEBUGGER_trap_entry(TRAP_int3, regs);
807 if ( !guest_mode(regs) )
808 {
809 debugger_trap_fatal(TRAP_int3, regs);
810 return;
811 }
813 do_guest_trap(TRAP_int3, regs, 0);
814 }
816 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
817 {
818 extern fastcall void (*machine_check_vector)(
819 struct cpu_user_regs *, long error_code);
820 machine_check_vector(regs, regs->error_code);
821 }
823 void propagate_page_fault(unsigned long addr, u16 error_code)
824 {
825 struct trap_info *ti;
826 struct vcpu *v = current;
827 struct trap_bounce *tb = &v->arch.trap_bounce;
829 v->arch.guest_context.ctrlreg[2] = addr;
830 arch_set_cr2(v, addr);
832 /* Re-set error_code.user flag appropriately for the guest. */
833 error_code &= ~PFEC_user_mode;
834 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
835 error_code |= PFEC_user_mode;
837 trace_pv_page_fault(addr, error_code);
839 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
840 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
841 tb->error_code = error_code;
842 tb->cs = ti->cs;
843 tb->eip = ti->address;
844 if ( TI_GET_IF(ti) )
845 tb->flags |= TBF_INTERRUPT;
846 if ( unlikely(null_trap_bounce(v, tb)) )
847 {
848 printk("Unhandled page fault in domain %d on VCPU %d (ec=%04X)\n",
849 v->domain->domain_id, v->vcpu_id, error_code);
850 show_page_walk(addr);
851 }
852 }
854 static int handle_gdt_ldt_mapping_fault(
855 unsigned long offset, struct cpu_user_regs *regs)
856 {
857 struct vcpu *curr = current;
858 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
859 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
860 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
862 /* Should never fault in another vcpu's area. */
863 BUG_ON(vcpu_area != curr->vcpu_id);
865 /* Byte offset within the gdt/ldt sub-area. */
866 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
868 if ( likely(is_ldt_area) )
869 {
870 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
871 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
872 {
873 if ( guest_mode(regs) )
874 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
875 regs->eip, offset);
876 }
877 else
878 {
879 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
880 if ( !guest_mode(regs) )
881 return 0;
882 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
883 propagate_page_fault(
884 curr->arch.guest_context.ldt_base + offset,
885 regs->error_code);
886 }
887 }
888 else
889 {
890 /* GDT fault: handle the fault as #GP(selector). */
891 regs->error_code = (u16)offset & ~7;
892 (void)do_general_protection(regs);
893 }
895 return EXCRET_fault_fixed;
896 }
898 #ifdef HYPERVISOR_VIRT_END
899 #define IN_HYPERVISOR_RANGE(va) \
900 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
901 #else
902 #define IN_HYPERVISOR_RANGE(va) \
903 (((va) >= HYPERVISOR_VIRT_START))
904 #endif
906 static int __spurious_page_fault(
907 unsigned long addr, struct cpu_user_regs *regs)
908 {
909 unsigned long mfn, cr3 = read_cr3();
910 #if CONFIG_PAGING_LEVELS >= 4
911 l4_pgentry_t l4e, *l4t;
912 #endif
913 #if CONFIG_PAGING_LEVELS >= 3
914 l3_pgentry_t l3e, *l3t;
915 #endif
916 l2_pgentry_t l2e, *l2t;
917 l1_pgentry_t l1e, *l1t;
918 unsigned int required_flags, disallowed_flags;
920 /*
921 * We do not take spurious page faults in IRQ handlers as we do not
922 * modify page tables in IRQ context. We therefore bail here because
923 * map_domain_page() is not IRQ-safe.
924 */
925 if ( in_irq() )
926 return 0;
928 /* Reserved bit violations are never spurious faults. */
929 if ( regs->error_code & PFEC_reserved_bit )
930 return 0;
932 required_flags = _PAGE_PRESENT;
933 if ( regs->error_code & PFEC_write_access )
934 required_flags |= _PAGE_RW;
935 if ( regs->error_code & PFEC_user_mode )
936 required_flags |= _PAGE_USER;
938 disallowed_flags = 0;
939 if ( regs->error_code & PFEC_insn_fetch )
940 disallowed_flags |= _PAGE_NX;
942 mfn = cr3 >> PAGE_SHIFT;
944 #if CONFIG_PAGING_LEVELS >= 4
945 l4t = map_domain_page(mfn);
946 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
947 mfn = l4e_get_pfn(l4e);
948 unmap_domain_page(l4t);
949 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
950 (l4e_get_flags(l4e) & disallowed_flags) )
951 return 0;
952 #endif
954 #if CONFIG_PAGING_LEVELS >= 3
955 l3t = map_domain_page(mfn);
956 #ifdef CONFIG_X86_PAE
957 l3t += (cr3 & 0xFE0UL) >> 3;
958 #endif
959 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
960 mfn = l3e_get_pfn(l3e);
961 unmap_domain_page(l3t);
962 #ifdef CONFIG_X86_PAE
963 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
964 return 0;
965 #else
966 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
967 (l3e_get_flags(l3e) & disallowed_flags) )
968 return 0;
969 #endif
970 #endif
972 l2t = map_domain_page(mfn);
973 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
974 mfn = l2e_get_pfn(l2e);
975 unmap_domain_page(l2t);
976 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
977 (l2e_get_flags(l2e) & disallowed_flags) )
978 return 0;
979 if ( l2e_get_flags(l2e) & _PAGE_PSE )
980 {
981 l1e = l1e_empty(); /* define before use in debug tracing */
982 goto spurious;
983 }
985 l1t = map_domain_page(mfn);
986 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
987 mfn = l1e_get_pfn(l1e);
988 unmap_domain_page(l1t);
989 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
990 (l1e_get_flags(l1e) & disallowed_flags) )
991 return 0;
993 spurious:
994 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
995 "at addr %lx, e/c %04x\n",
996 current->domain->domain_id, current->vcpu_id,
997 addr, regs->error_code);
998 #if CONFIG_PAGING_LEVELS >= 4
999 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
1000 #endif
1001 #if CONFIG_PAGING_LEVELS >= 3
1002 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
1003 #endif
1004 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
1005 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
1006 #ifndef NDEBUG
1007 show_registers(regs);
1008 #endif
1009 return 1;
1010 }
1012 static int spurious_page_fault(
1013 unsigned long addr, struct cpu_user_regs *regs)
1014 {
1015 unsigned long flags;
1016 int is_spurious;
1018 /*
1019 * Disabling interrupts prevents TLB flushing, and hence prevents
1020 * page tables from becoming invalid under our feet during the walk.
1021 */
1022 local_irq_save(flags);
1023 is_spurious = __spurious_page_fault(addr, regs);
1024 local_irq_restore(flags);
1026 return is_spurious;
1027 }
1029 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1030 {
1031 struct vcpu *v = current;
1032 struct domain *d = v->domain;
1034 /* No fixups in interrupt context or when interrupts are disabled. */
1035 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1036 return 0;
1038 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1039 {
1040 if ( paging_mode_external(d) && guest_mode(regs) )
1041 {
1042 int ret = paging_fault(addr, regs);
1043 if ( ret == EXCRET_fault_fixed )
1044 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1045 return ret;
1046 }
1047 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1048 return handle_gdt_ldt_mapping_fault(
1049 addr - GDT_LDT_VIRT_START, regs);
1050 return 0;
1051 }
1053 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
1054 guest_kernel_mode(v, regs) &&
1055 /* Do not check if access-protection fault since the page may
1056 legitimately be not present in shadow page tables */
1057 ((regs->error_code & PFEC_write_access) == PFEC_write_access) &&
1058 ptwr_do_page_fault(v, addr, regs) )
1059 return EXCRET_fault_fixed;
1061 if ( paging_mode_enabled(d) )
1062 {
1063 int ret = paging_fault(addr, regs);
1064 if ( ret == EXCRET_fault_fixed )
1065 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1066 return ret;
1067 }
1069 return 0;
1070 }
1072 /*
1073 * #PF error code:
1074 * Bit 0: Protection violation (=1) ; Page not present (=0)
1075 * Bit 1: Write access
1076 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1077 * Bit 3: Reserved bit violation
1078 * Bit 4: Instruction fetch
1079 */
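For example, an error code of 0x0006 (bits 1 and 2 set) is a user-mode write to a not-present page, while 0x000b (bits 0, 1 and 3) is a supervisor-mode write that hit a reserved-bit violation in a present mapping; these are the same PFEC_* bits tested by __spurious_page_fault() and fixup_page_fault() above.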
1080 asmlinkage void do_page_fault(struct cpu_user_regs *regs)
1081 {
1082 unsigned long addr, fixup;
1084 addr = read_cr2();
1086 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1088 perfc_incr(page_faults);
1090 if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1091 return;
1093 if ( unlikely(!guest_mode(regs)) )
1094 {
1095 if ( spurious_page_fault(addr, regs) )
1096 return;
1098 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1099 {
1100 perfc_incr(copy_user_faults);
1101 regs->eip = fixup;
1102 return;
1103 }
1105 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1107 show_execution_state(regs);
1108 show_page_walk(addr);
1109 panic("FATAL PAGE FAULT\n"
1110 "[error_code=%04x]\n"
1111 "Faulting linear address: %p\n",
1112 regs->error_code, _p(addr));
1113 }
1115 propagate_page_fault(addr, regs->error_code);
1116 }
1118 /*
1119 * Early handler to deal with spurious page faults. For example, consider a
1120 * routine that uses a mapping immediately after installing it (making it
1121 * present). The CPU may speculatively execute the memory access before
1122 * executing the PTE write. The instruction will then be marked to cause a
1123 * page fault when it is retired, despite the fact that the PTE is present and
1124 * correct at that point in time.
1125 */
1126 asmlinkage void do_early_page_fault(struct cpu_user_regs *regs)
1127 {
1128 static int stuck;
1129 static unsigned long prev_eip, prev_cr2;
1130 unsigned long cr2 = read_cr2();
1132 BUG_ON(smp_processor_id() != 0);
1134 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1135 {
1136 prev_eip = regs->eip;
1137 prev_cr2 = cr2;
1138 stuck = 0;
1139 return;
1140 }
1142 if ( stuck++ == 1000 )
1143 panic("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1144 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1145 }
1147 long do_fpu_taskswitch(int set)
1148 {
1149 struct vcpu *v = current;
1151 if ( set )
1152 {
1153 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1154 stts();
1155 }
1156 else
1157 {
1158 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1159 if ( v->fpu_dirtied )
1160 clts();
1161 }
1163 return 0;
1164 }
1166 static int read_descriptor(unsigned int sel,
1167 const struct vcpu *v,
1168 const struct cpu_user_regs * regs,
1169 unsigned long *base,
1170 unsigned long *limit,
1171 unsigned int *ar,
1172 unsigned int vm86attr)
1174 struct desc_struct desc;
1176 if ( !vm86_mode(regs) )
1178 if ( sel < 4)
1179 desc.b = desc.a = 0;
1180 else if ( __get_user(desc,
1181 (const struct desc_struct *)(!(sel & 4)
1182 ? GDT_VIRT_START(v)
1183 : LDT_VIRT_START(v))
1184 + (sel >> 3)) )
1185 return 0;
1186 if ( !(vm86attr & _SEGMENT_CODE) )
1187 desc.b &= ~_SEGMENT_L;
1189 else
1191 desc.a = (sel << 20) | 0xffff;
1192 desc.b = vm86attr | (sel >> 12);
1195 *ar = desc.b & 0x00f0ff00;
1196 if ( !(desc.b & _SEGMENT_L) )
1198 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1199 (desc.b & 0xff000000));
1200 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1201 if ( desc.b & _SEGMENT_G )
1202 *limit = ((*limit + 1) << 12) - 1;
1203 #ifndef NDEBUG
1204 if ( !vm86_mode(regs) && (sel > 3) )
1206 unsigned int a, l;
1207 unsigned char valid;
1209 asm volatile (
1210 "larl %2,%0 ; setz %1"
1211 : "=r" (a), "=rm" (valid) : "rm" (sel));
1212 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1213 asm volatile (
1214 "lsll %2,%0 ; setz %1"
1215 : "=r" (l), "=rm" (valid) : "rm" (sel));
1216 BUG_ON(valid && (l != *limit));
1218 #endif
1220 else
1222 *base = 0UL;
1223 *limit = ~0UL;
1226 return 1;
1229 #ifdef __x86_64__
1230 static int read_gate_descriptor(unsigned int gate_sel,
1231 const struct vcpu *v,
1232 unsigned int *sel,
1233 unsigned long *off,
1234 unsigned int *ar)
1236 struct desc_struct desc;
1237 const struct desc_struct *pdesc;
1240 pdesc = (const struct desc_struct *)(!(gate_sel & 4) ?
1241 GDT_VIRT_START(v) :
1242 LDT_VIRT_START(v))
1243 + (gate_sel >> 3);
1244 if ( gate_sel < 4 ||
1245 (gate_sel >= FIRST_RESERVED_GDT_BYTE && !(gate_sel & 4)) ||
1246 __get_user(desc, pdesc) )
1247 return 0;
1249 *sel = (desc.a >> 16) & 0x0000fffc;
1250 *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
1251 *ar = desc.b & 0x0000ffff;
1252 /*
1253 * check_descriptor() clears the DPL field and stores the
1254 * guest requested DPL in the selector's RPL field.
1255 */
1256 ASSERT(!(*ar & _SEGMENT_DPL));
1257 *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
1259 if ( !is_pv_32bit_vcpu(v) )
1261 if ( (*ar & 0x1f00) != 0x0c00 ||
1262 (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
1263 __get_user(desc, pdesc + 1) ||
1264 (desc.b & 0x1f00) )
1265 return 0;
1267 *off |= (unsigned long)desc.a << 32;
1268 return 1;
1271 switch ( *ar & 0x1f00 )
1273 case 0x0400:
1274 *off &= 0xffff;
1275 break;
1276 case 0x0c00:
1277 break;
1278 default:
1279 return 0;
1282 return 1;
1284 #endif
1286 /* Has the guest requested sufficient permission for this I/O access? */
1287 static inline int guest_io_okay(
1288 unsigned int port, unsigned int bytes,
1289 struct vcpu *v, struct cpu_user_regs *regs)
1291 #if defined(__x86_64__)
1292 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1293 int user_mode = !(v->arch.flags & TF_kernel_mode);
1294 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1295 #elif defined(__i386__)
1296 #define TOGGLE_MODE() ((void)0)
1297 #endif
1299 if ( !vm86_mode(regs) &&
1300 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1301 return 1;
1303 if ( v->arch.iobmp_limit > (port + bytes) )
1305 union { uint8_t bytes[2]; uint16_t mask; } x;
1307 /*
1308 * Grab permission bytes from guest space. Inaccessible bytes are
1309 * read as 0xff (no access allowed).
1310 */
1311 TOGGLE_MODE();
1312 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1313 port>>3, 2) )
1315 default: x.bytes[0] = ~0;
1316 case 1: x.bytes[1] = ~0;
1317 case 0: break;
1319 TOGGLE_MODE();
1321 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1322 return 1;
1325 return 0;
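As a worked example of the bitmap check in guest_io_okay(): a 1-byte access to port 0x61 copies the two permission bytes starting at offset 0x61 >> 3 = 12 of the guest's I/O bitmap and tests the mask ((1 << 1) - 1) << (0x61 & 7) = 0x02 against them; the access is allowed only if that bit is clear, and any byte that could not be copied reads as 0xff (all access denied).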
1328 /* Has the administrator granted sufficient permission for this I/O access? */
1329 static inline int admin_io_okay(
1330 unsigned int port, unsigned int bytes,
1331 struct vcpu *v, struct cpu_user_regs *regs)
1333 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1336 #define guest_inb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1337 #define guest_inw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1338 #define guest_inl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1339 #define guest_outb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1340 #define guest_outw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1341 #define guest_outl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1343 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1344 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1345 __attribute__((__regparm__(1)));
1346 unsigned long guest_to_host_gpr_switch(unsigned long)
1347 __attribute__((__regparm__(1)));
1349 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1351 /* Instruction fetch with error handling. */
1352 #define insn_fetch(type, base, eip, limit) \
1353 ({ unsigned long _rc, _ptr = (base) + (eip); \
1354 type _x; \
1355 if ( ad_default < 8 ) \
1356 _ptr = (unsigned int)_ptr; \
1357 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1358 goto fail; \
1359 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1360 { \
1361 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1362 goto skip; \
1363 } \
1364 (eip) += sizeof(_x); _x; })
1366 #if defined(CONFIG_X86_32)
1367 # define read_sreg(regs, sr) ((regs)->sr)
1368 #elif defined(CONFIG_X86_64)
1369 # define read_sreg(regs, sr) read_segment_register(sr)
1370 #endif
1372 static int emulate_privileged_op(struct cpu_user_regs *regs)
1374 struct vcpu *v = current;
1375 unsigned long *reg, eip = regs->eip, res;
1376 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1377 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1378 unsigned int port, i, data_sel, ar, data, rc, bpmatch = 0;
1379 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1380 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1381 ? regs->reg \
1382 : ad_bytes == 4 \
1383 ? (u32)regs->reg \
1384 : (u16)regs->reg)
1385 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1386 ? regs->reg = (val) \
1387 : ad_bytes == 4 \
1388 ? (*(u32 *)&regs->reg = (val)) \
1389 : (*(u16 *)&regs->reg = (val)))
1390 unsigned long code_base, code_limit;
1391 char io_emul_stub[32];
1392 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1393 u32 l, h, eax, edx;
1395 if ( !read_descriptor(regs->cs, v, regs,
1396 &code_base, &code_limit, &ar,
1397 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1398 goto fail;
1399 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1400 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1401 if ( !(ar & _SEGMENT_S) ||
1402 !(ar & _SEGMENT_P) ||
1403 !(ar & _SEGMENT_CODE) )
1404 goto fail;
1406 /* emulating only opcodes not allowing SS to be default */
1407 data_sel = read_sreg(regs, ds);
1409 /* Legacy prefixes. */
1410 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1412 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1414 case 0x66: /* operand-size override */
1415 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1416 continue;
1417 case 0x67: /* address-size override */
1418 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1419 continue;
1420 case 0x2e: /* CS override */
1421 data_sel = regs->cs;
1422 continue;
1423 case 0x3e: /* DS override */
1424 data_sel = read_sreg(regs, ds);
1425 continue;
1426 case 0x26: /* ES override */
1427 data_sel = read_sreg(regs, es);
1428 continue;
1429 case 0x64: /* FS override */
1430 data_sel = read_sreg(regs, fs);
1431 lm_ovr = lm_seg_fs;
1432 continue;
1433 case 0x65: /* GS override */
1434 data_sel = read_sreg(regs, gs);
1435 lm_ovr = lm_seg_gs;
1436 continue;
1437 case 0x36: /* SS override */
1438 data_sel = regs->ss;
1439 continue;
1440 case 0xf0: /* LOCK */
1441 lock = 1;
1442 continue;
1443 case 0xf2: /* REPNE/REPNZ */
1444 case 0xf3: /* REP/REPE/REPZ */
1445 rep_prefix = 1;
1446 continue;
1447 default:
1448 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1450 rex = opcode;
1451 continue;
1453 break;
1455 break;
1458 /* REX prefix. */
1459 if ( rex & 8 ) /* REX.W */
1460 op_bytes = 4; /* emulating only opcodes not supporting 64-bit operands */
1461 modrm_reg = (rex & 4) << 1; /* REX.R */
1462 /* REX.X does not need to be decoded. */
1463 modrm_rm = (rex & 1) << 3; /* REX.B */
1465 if ( opcode == 0x0f )
1466 goto twobyte_opcode;
1468 if ( lock )
1469 goto fail;
1471 /* Input/Output String instructions. */
1472 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1474 unsigned long data_base, data_limit;
1476 if ( rep_prefix && (rd_ad(ecx) == 0) )
1477 goto done;
1479 if ( !(opcode & 2) )
1481 data_sel = read_sreg(regs, es);
1482 lm_ovr = lm_seg_none;
1485 if ( !(ar & _SEGMENT_L) )
1487 if ( !read_descriptor(data_sel, v, regs,
1488 &data_base, &data_limit, &ar,
1489 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1490 goto fail;
1491 if ( !(ar & _SEGMENT_S) ||
1492 !(ar & _SEGMENT_P) ||
1493 (opcode & 2 ?
1494 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1495 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1496 goto fail;
1498 #ifdef CONFIG_X86_64
1499 else
1501 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1503 switch ( lm_ovr )
1505 case lm_seg_none:
1506 data_base = 0UL;
1507 break;
1508 case lm_seg_fs:
1509 data_base = v->arch.guest_context.fs_base;
1510 break;
1511 case lm_seg_gs:
1512 if ( guest_kernel_mode(v, regs) )
1513 data_base = v->arch.guest_context.gs_base_kernel;
1514 else
1515 data_base = v->arch.guest_context.gs_base_user;
1516 break;
1519 else
1520 read_descriptor(data_sel, v, regs,
1521 &data_base, &data_limit, &ar,
1522 0);
1523 data_limit = ~0UL;
1524 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1526 #endif
1528 port = (u16)regs->edx;
1530 continue_io_string:
1531 switch ( opcode )
1533 case 0x6c: /* INSB */
1534 op_bytes = 1;
1535 case 0x6d: /* INSW/INSL */
1536 if ( data_limit < op_bytes - 1 ||
1537 rd_ad(edi) > data_limit - (op_bytes - 1) ||
1538 !guest_io_okay(port, op_bytes, v, regs) )
1539 goto fail;
1540 switch ( op_bytes )
1542 case 1:
1543 /* emulate PIT counter 2 */
1544 data = (u8)(guest_inb_okay(port, v, regs) ? inb(port) :
1545 ((port == 0x42 || port == 0x43 || port == 0x61) ?
1546 pv_pit_handler(port, 0, 0) : ~0));
1547 break;
1548 case 2:
1549 data = (u16)(guest_inw_okay(port, v, regs) ? inw(port) : ~0);
1550 break;
1551 case 4:
1552 data = (u32)(guest_inl_okay(port, v, regs) ? inl(port) : ~0);
1553 break;
1555 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 )
1557 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1558 PFEC_write_access);
1559 return EXCRET_fault_fixed;
1561 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1562 break;
1564 case 0x6e: /* OUTSB */
1565 op_bytes = 1;
1566 case 0x6f: /* OUTSW/OUTSL */
1567 if ( data_limit < op_bytes - 1 ||
1568 rd_ad(esi) > data_limit - (op_bytes - 1) ||
1569 !guest_io_okay(port, op_bytes, v, regs) )
1570 goto fail;
1571 rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes);
1572 if ( rc != 0 )
1574 propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0);
1575 return EXCRET_fault_fixed;
1577 switch ( op_bytes )
1579 case 1:
1580 if ( guest_outb_okay(port, v, regs) )
1582 outb((u8)data, port);
1583 if ( pv_post_outb_hook )
1584 pv_post_outb_hook(port, data);
1586 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1587 pv_pit_handler(port, data, 1);
1588 break;
1589 case 2:
1590 if ( guest_outw_okay(port, v, regs) )
1591 outw((u16)data, port);
1592 break;
1593 case 4:
1594 if ( guest_outl_okay(port, v, regs) )
1595 outl((u32)data, port);
1596 break;
1598 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1599 break;
1602 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1604 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1606 if ( !bpmatch && !hypercall_preempt_check() )
1607 goto continue_io_string;
1608 eip = regs->eip;
1611 goto done;
1614 /*
1615 * Very likely to be an I/O instruction (IN/OUT).
1616 * Build an on-stack stub to execute the instruction with full guest
1617 * GPR context. This is needed for some systems which (ab)use IN/OUT
1618 * to communicate with BIOS code in system-management mode.
1619 */
1620 #ifdef __x86_64__
1621 /* movq $host_to_guest_gpr_switch,%rcx */
1622 io_emul_stub[0] = 0x48;
1623 io_emul_stub[1] = 0xb9;
1624 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1625 /* callq *%rcx */
1626 io_emul_stub[10] = 0xff;
1627 io_emul_stub[11] = 0xd1;
1628 #else
1629 /* call host_to_guest_gpr_switch */
1630 io_emul_stub[0] = 0xe8;
1631 *(s32 *)&io_emul_stub[1] =
1632 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1633 /* 7 x nop */
1634 memset(&io_emul_stub[5], 0x90, 7);
1635 #endif
1636 /* data16 or nop */
1637 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1638 /* <io-access opcode> */
1639 io_emul_stub[13] = opcode;
1640 /* imm8 or nop */
1641 io_emul_stub[14] = 0x90;
1642 /* ret (jumps to guest_to_host_gpr_switch) */
1643 io_emul_stub[15] = 0xc3;
1645 /* Handy function-typed pointer to the stub. */
1646 io_emul = (void *)io_emul_stub;
1648 if ( ioemul_handle_quirk )
1649 ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
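For a single-byte IN from an immediate port, say in $0x71,%al (opcode 0xe4 with op_bytes == 1), the tail of the stub therefore ends up as the bytes 90 e4 71 c3: a nop where the data16 prefix would otherwise sit, the IN opcode itself, the immediate port number patched in by the 0xe4 case below, and a ret that returns through guest_to_host_gpr_switch().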
1651 /* I/O Port and Interrupt Flag instructions. */
1652 switch ( opcode )
1654 case 0xe4: /* IN imm8,%al */
1655 op_bytes = 1;
1656 case 0xe5: /* IN imm8,%eax */
1657 port = insn_fetch(u8, code_base, eip, code_limit);
1658 io_emul_stub[14] = port; /* imm8 */
1659 exec_in:
1660 if ( !guest_io_okay(port, op_bytes, v, regs) )
1661 goto fail;
1662 switch ( op_bytes )
1664 case 1:
1665 if ( guest_inb_okay(port, v, regs) )
1666 io_emul(regs);
1667 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1669 regs->eax &= ~0xffUL;
1670 regs->eax |= pv_pit_handler(port, 0, 0);
1672 else
1673 regs->eax |= (u8)~0;
1674 break;
1675 case 2:
1676 if ( guest_inw_okay(port, v, regs) )
1677 io_emul(regs);
1678 else
1679 regs->eax |= (u16)~0;
1680 break;
1681 case 4:
1682 if ( guest_inl_okay(port, v, regs) )
1683 io_emul(regs);
1684 else
1685 regs->eax = (u32)~0;
1686 break;
1688 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1689 goto done;
1691 case 0xec: /* IN %dx,%al */
1692 op_bytes = 1;
1693 case 0xed: /* IN %dx,%eax */
1694 port = (u16)regs->edx;
1695 goto exec_in;
1697 case 0xe6: /* OUT %al,imm8 */
1698 op_bytes = 1;
1699 case 0xe7: /* OUT %eax,imm8 */
1700 port = insn_fetch(u8, code_base, eip, code_limit);
1701 io_emul_stub[14] = port; /* imm8 */
1702 exec_out:
1703 if ( !guest_io_okay(port, op_bytes, v, regs) )
1704 goto fail;
1705 switch ( op_bytes )
1707 case 1:
1708 if ( guest_outb_okay(port, v, regs) )
1710 io_emul(regs);
1711 if ( pv_post_outb_hook )
1712 pv_post_outb_hook(port, regs->eax);
1714 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1715 pv_pit_handler(port, regs->eax, 1);
1716 break;
1717 case 2:
1718 if ( guest_outw_okay(port, v, regs) )
1719 io_emul(regs);
1720 break;
1721 case 4:
1722 if ( guest_outl_okay(port, v, regs) )
1723 io_emul(regs);
1724 break;
1726 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1727 goto done;
1729 case 0xee: /* OUT %al,%dx */
1730 op_bytes = 1;
1731 case 0xef: /* OUT %eax,%dx */
1732 port = (u16)regs->edx;
1733 goto exec_out;
1735 case 0xfa: /* CLI */
1736 case 0xfb: /* STI */
1737 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1738 goto fail;
1739 /*
1740 * This is just too dangerous to allow, in my opinion. Consider if the
1741 * caller then tries to reenable interrupts using POPF: we can't trap
1742 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1743 * do for us. :-)
1744 */
1745 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1746 goto done;
1749 /* No decode of this single-byte opcode. */
1750 goto fail;
1752 twobyte_opcode:
1753 /* Two-byte opcodes only emulated from guest kernel. */
1754 if ( !guest_kernel_mode(v, regs) )
1755 goto fail;
1757 /* Privileged (ring 0) instructions. */
1758 opcode = insn_fetch(u8, code_base, eip, code_limit);
1759 if ( lock && (opcode & ~3) != 0x20 )
1760 goto fail;
1761 switch ( opcode )
1763 case 0x06: /* CLTS */
1764 (void)do_fpu_taskswitch(0);
1765 break;
1767 case 0x09: /* WBINVD */
1768 /* Ignore the instruction if unprivileged. */
1769 if ( !cache_flush_permitted(v->domain) )
1770 /* Non-physdev domain attempted WBINVD; ignore for now since
1771 newer linux uses this in some start-of-day timing loops */
1773 else
1774 wbinvd();
1775 break;
1777 case 0x20: /* MOV CR?,<reg> */
1778 opcode = insn_fetch(u8, code_base, eip, code_limit);
1779 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1780 modrm_rm |= (opcode >> 0) & 7;
1781 reg = decode_register(modrm_rm, regs, 0);
1782 switch ( modrm_reg )
1784 case 0: /* Read CR0 */
1785 *reg = (read_cr0() & ~X86_CR0_TS) |
1786 v->arch.guest_context.ctrlreg[0];
1787 break;
1789 case 2: /* Read CR2 */
1790 *reg = v->arch.guest_context.ctrlreg[2];
1791 break;
1793 case 3: /* Read CR3 */
1794 if ( !is_pv_32on64_vcpu(v) )
1795 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1796 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1797 #ifdef CONFIG_COMPAT
1798 else
1799 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
1800 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
1801 #endif
1802 break;
1804 case 4: /* Read CR4 */
1805 /*
1806 * Guests can read CR4 to see what features Xen has enabled. We
1807 * therefore lie about PGE & PSE as they are unavailable to guests.
1808 */
1809 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1810 break;
1812 default:
1813 goto fail;
1815 break;
1817 case 0x21: /* MOV DR?,<reg> */
1818 opcode = insn_fetch(u8, code_base, eip, code_limit);
1819 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1820 modrm_rm |= (opcode >> 0) & 7;
1821 reg = decode_register(modrm_rm, regs, 0);
1822 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1823 goto fail;
1824 *reg = res;
1825 break;
1827 case 0x22: /* MOV <reg>,CR? */
1828 opcode = insn_fetch(u8, code_base, eip, code_limit);
1829 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1830 modrm_rm |= (opcode >> 0) & 7;
1831 reg = decode_register(modrm_rm, regs, 0);
1832 switch ( modrm_reg )
1834 case 0: /* Write CR0 */
1835 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1837 gdprintk(XENLOG_WARNING,
1838 "Attempt to change unmodifiable CR0 flags.\n");
1839 goto fail;
1841 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1842 break;
1844 case 2: /* Write CR2 */
1845 v->arch.guest_context.ctrlreg[2] = *reg;
1846 arch_set_cr2(v, *reg);
1847 break;
1849 case 3: /* Write CR3 */
1850 LOCK_BIGLOCK(v->domain);
1851 if ( !is_pv_32on64_vcpu(v) )
1852 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1853 #ifdef CONFIG_COMPAT
1854 else
1855 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
1856 #endif
1857 UNLOCK_BIGLOCK(v->domain);
1858 if ( rc == 0 ) /* not okay */
1859 goto fail;
1860 break;
1862 case 4: /* Write CR4 */
1863 v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(*reg);
1864 write_cr4(pv_guest_cr4_to_real_cr4(
1865 v->arch.guest_context.ctrlreg[4]));
1866 break;
1868 default:
1869 goto fail;
1871 break;
1873 case 0x23: /* MOV <reg>,DR? */
1874 opcode = insn_fetch(u8, code_base, eip, code_limit);
1875 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1876 modrm_rm |= (opcode >> 0) & 7;
1877 reg = decode_register(modrm_rm, regs, 0);
1878 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1879 goto fail;
1880 break;
1882 case 0x30: /* WRMSR */
1883 eax = regs->eax;
1884 edx = regs->edx;
1885 res = ((u64)edx << 32) | eax;
1886 switch ( (u32)regs->ecx )
1888 #ifdef CONFIG_X86_64
1889 case MSR_FS_BASE:
1890 if ( is_pv_32on64_vcpu(v) )
1891 goto fail;
1892 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
1893 goto fail;
1894 v->arch.guest_context.fs_base = res;
1895 break;
1896 case MSR_GS_BASE:
1897 if ( is_pv_32on64_vcpu(v) )
1898 goto fail;
1899 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
1900 goto fail;
1901 v->arch.guest_context.gs_base_kernel = res;
1902 break;
1903 case MSR_SHADOW_GS_BASE:
1904 if ( is_pv_32on64_vcpu(v) )
1905 goto fail;
1906 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
1907 goto fail;
1908 v->arch.guest_context.gs_base_user = res;
1909 break;
1910 #endif
1911 case MSR_K7_FID_VID_STATUS:
1912 case MSR_K7_FID_VID_CTL:
1913 case MSR_K8_PSTATE_LIMIT:
1914 case MSR_K8_PSTATE_CTRL:
1915 case MSR_K8_PSTATE_STATUS:
1916 case MSR_K8_PSTATE0:
1917 case MSR_K8_PSTATE1:
1918 case MSR_K8_PSTATE2:
1919 case MSR_K8_PSTATE3:
1920 case MSR_K8_PSTATE4:
1921 case MSR_K8_PSTATE5:
1922 case MSR_K8_PSTATE6:
1923 case MSR_K8_PSTATE7:
1924 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
1925 goto fail;
1926 if ( cpufreq_controller != FREQCTL_dom0_kernel )
1927 break;
1928 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
1929 goto fail;
1930 break;
1931 case MSR_IA32_PERF_CTL:
1932 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
1933 goto fail;
1934 if ( cpufreq_controller != FREQCTL_dom0_kernel )
1935 break;
1936 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
1937 goto fail;
1938 break;
1939 default:
1940 if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
1941 break;
1942 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1943 (eax != l) || (edx != h) )
1944 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
1945 "%08x:%08x to %08x:%08x.\n",
1946 _p(regs->ecx), h, l, edx, eax);
1947 break;
1949 break;
1951 case 0x31: /* RDTSC */
1952 rdtsc(regs->eax, regs->edx);
1953 break;
1955 case 0x32: /* RDMSR */
1956 switch ( (u32)regs->ecx )
1958 #ifdef CONFIG_X86_64
1959 case MSR_FS_BASE:
1960 if ( is_pv_32on64_vcpu(v) )
1961 goto fail;
1962 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1963 regs->edx = v->arch.guest_context.fs_base >> 32;
1964 break;
1965 case MSR_GS_BASE:
1966 if ( is_pv_32on64_vcpu(v) )
1967 goto fail;
1968 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1969 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1970 break;
1971 case MSR_SHADOW_GS_BASE:
1972 if ( is_pv_32on64_vcpu(v) )
1973 goto fail;
1974 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1975 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1976 break;
1977 #endif
1978 case MSR_K7_FID_VID_CTL:
1979 case MSR_K7_FID_VID_STATUS:
1980 case MSR_K8_PSTATE_LIMIT:
1981 case MSR_K8_PSTATE_CTRL:
1982 case MSR_K8_PSTATE_STATUS:
1983 case MSR_K8_PSTATE0:
1984 case MSR_K8_PSTATE1:
1985 case MSR_K8_PSTATE2:
1986 case MSR_K8_PSTATE3:
1987 case MSR_K8_PSTATE4:
1988 case MSR_K8_PSTATE5:
1989 case MSR_K8_PSTATE6:
1990 case MSR_K8_PSTATE7:
1991 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
1992 goto fail;
1993 if ( cpufreq_controller != FREQCTL_dom0_kernel )
1995 regs->eax = regs->edx = 0;
1996 break;
1998 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) != 0 )
1999 goto fail;
2000 break;
2001 case MSR_EFER:
2002 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2003 goto fail;
2004 break;
2005 default:
2006 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
2008 regs->eax = l;
2009 regs->edx = h;
2010 break;
2012 /* Everyone can read the MSR space. */
2013 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
2014 _p(regs->ecx));*/
2015 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2016 goto fail;
2017 break;
2019 break;
2021 default:
2022 goto fail;
2025 #undef wr_ad
2026 #undef rd_ad
2028 done:
2029 instruction_done(regs, eip, bpmatch);
2030 skip:
2031 return EXCRET_fault_fixed;
2033 fail:
2034 return 0;
2037 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
2038 unsigned int esp, unsigned int decr)
2040 return (((esp - decr) < (esp - 1)) &&
2041 (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
2044 static void emulate_gate_op(struct cpu_user_regs *regs)
2046 #ifdef __x86_64__
2047 struct vcpu *v = current;
2048 unsigned int sel, ar, dpl, nparm, opnd_sel;
2049 unsigned int op_default, op_bytes, ad_default, ad_bytes;
2050 unsigned long off, eip, opnd_off, base, limit;
2051 int jump;
2053 /* Check whether this fault is due to the use of a call gate. */
2054 if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
2055 ((ar >> 13) & 3) < (regs->cs & 3) ||
2056 (ar & _SEGMENT_TYPE) != 0xc00 )
2058 do_guest_trap(TRAP_gp_fault, regs, 1);
2059 return;
2061 if ( !(ar & _SEGMENT_P) )
2063 do_guest_trap(TRAP_no_segment, regs, 1);
2064 return;
2066 dpl = (ar >> 13) & 3;
2067 nparm = ar & 0x1f;
2069 /*
2070 * Decode instruction (and perhaps operand) to determine RPL,
2071 * whether this is a jump or a call, and the call return offset.
2072 */
2073 if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
2074 !(ar & _SEGMENT_S) ||
2075 !(ar & _SEGMENT_P) ||
2076 !(ar & _SEGMENT_CODE) )
2078 do_guest_trap(TRAP_gp_fault, regs, 1);
2079 return;
2082 op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
2083 ad_default = ad_bytes = op_default;
2084 opnd_sel = opnd_off = 0;
2085 jump = -1;
2086 for ( eip = regs->eip; eip - regs->_eip < 10; )
2088 switch ( insn_fetch(u8, base, eip, limit) )
2090 case 0x66: /* operand-size override */
2091 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
2092 continue;
2093 case 0x67: /* address-size override */
2094 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
2095 continue;
2096 case 0x2e: /* CS override */
2097 opnd_sel = regs->cs;
2098 ASSERT(opnd_sel);
2099 continue;
2100 case 0x3e: /* DS override */
2101 opnd_sel = read_sreg(regs, ds);
2102 if ( !opnd_sel )
2103 opnd_sel = dpl;
2104 continue;
2105 case 0x26: /* ES override */
2106 opnd_sel = read_sreg(regs, es);
2107 if ( !opnd_sel )
2108 opnd_sel = dpl;
2109 continue;
2110 case 0x64: /* FS override */
2111 opnd_sel = read_sreg(regs, fs);
2112 if ( !opnd_sel )
2113 opnd_sel = dpl;
2114 continue;
2115 case 0x65: /* GS override */
2116 opnd_sel = read_sreg(regs, gs);
2117 if ( !opnd_sel )
2118 opnd_sel = dpl;
2119 continue;
2120 case 0x36: /* SS override */
2121 opnd_sel = regs->ss;
2122 if ( !opnd_sel )
2123 opnd_sel = dpl;
2124 continue;
2125 case 0xea:
2126 ++jump;
2127 /* FALLTHROUGH */
2128 case 0x9a:
2129 ++jump;
2130 opnd_sel = regs->cs;
2131 opnd_off = eip;
2132 ad_bytes = ad_default;
2133 eip += op_bytes + 2;
2134 break;
2135 case 0xff:
2136 {
2137 unsigned int modrm;
2139 switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
2140 {
2141 case 0x28: case 0x68: case 0xa8:
2142 ++jump;
2143 /* FALLTHROUGH */
2144 case 0x18: case 0x58: case 0x98:
2145 ++jump;
2146 if ( ad_bytes != 2 )
2147 {
2148 if ( (modrm & 7) == 4 )
2149 {
2150 unsigned int sib = insn_fetch(u8, base, eip, limit);
2152 modrm = (modrm & ~7) | (sib & 7);
2153 if ( (sib >>= 3) != 4 )
2154 opnd_off = *(unsigned long *)decode_register(sib & 7, regs, 0);
2155 opnd_off <<= sib >> 3;
2156 }
2157 if ( (modrm & 7) != 5 || (modrm & 0xc0) )
2158 opnd_off += *(unsigned long *)decode_register(modrm & 7, regs, 0);
2159 else
2160 modrm |= 0x87;
2161 if ( !opnd_sel )
2162 {
2163 switch ( modrm & 7 )
2164 {
2165 default:
2166 opnd_sel = read_sreg(regs, ds);
2167 break;
2168 case 4: case 5:
2169 opnd_sel = regs->ss;
2170 break;
2171 }
2172 }
2173 }
2174 else
2175 {
2176 switch ( modrm & 7 )
2177 {
2178 case 0: case 1: case 7:
2179 opnd_off = regs->ebx;
2180 break;
2181 case 6:
2182 if ( !(modrm & 0xc0) )
2183 modrm |= 0x80;
2184 else
2185 case 2: case 3:
2186 {
2187 opnd_off = regs->ebp;
2188 if ( !opnd_sel )
2189 opnd_sel = regs->ss;
2190 }
2191 break;
2192 }
2193 if ( !opnd_sel )
2194 opnd_sel = read_sreg(regs, ds);
2195 switch ( modrm & 7 )
2196 {
2197 case 0: case 2: case 4:
2198 opnd_off += regs->esi;
2199 break;
2200 case 1: case 3: case 5:
2201 opnd_off += regs->edi;
2202 break;
2203 }
2204 }
2205 switch ( modrm & 0xc0 )
2206 {
2207 case 0x40:
2208 opnd_off += insn_fetch(s8, base, eip, limit);
2209 break;
2210 case 0x80:
2211 opnd_off += insn_fetch(s32, base, eip, limit);
2212 break;
2213 }
2214 if ( ad_bytes == 4 )
2215 opnd_off = (unsigned int)opnd_off;
2216 else if ( ad_bytes == 2 )
2217 opnd_off = (unsigned short)opnd_off;
2218 break;
2219 }
2220 }
2221 break;
2222 }
2223 break;
2224 }
2226 if ( jump < 0 )
2227 {
2228 fail:
2229 do_guest_trap(TRAP_gp_fault, regs, 1);
2230 skip:
2231 return;
2232 }
2234 if ( (opnd_sel != regs->cs &&
2235 !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
2236 !(ar & _SEGMENT_S) ||
2237 !(ar & _SEGMENT_P) ||
2238 ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
2240 do_guest_trap(TRAP_gp_fault, regs, 1);
2241 return;
2244 opnd_off += op_bytes;
2245 #define ad_default ad_bytes
2246 opnd_sel = insn_fetch(u16, base, opnd_off, limit);
2247 #undef ad_default
2248 ASSERT((opnd_sel & ~3) == regs->error_code);
2249 if ( dpl < (opnd_sel & 3) )
2251 do_guest_trap(TRAP_gp_fault, regs, 1);
2252 return;
2255 if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
2256 !(ar & _SEGMENT_S) ||
2257 !(ar & _SEGMENT_CODE) ||
2258 (!jump || (ar & _SEGMENT_EC) ?
2259 ((ar >> 13) & 3) > (regs->cs & 3) :
2260 ((ar >> 13) & 3) != (regs->cs & 3)) )
2261 {
2262 regs->error_code = sel;
2263 do_guest_trap(TRAP_gp_fault, regs, 1);
2264 return;
2265 }
2266 if ( !(ar & _SEGMENT_P) )
2267 {
2268 regs->error_code = sel;
2269 do_guest_trap(TRAP_no_segment, regs, 1);
2270 return;
2271 }
2272 if ( off > limit )
2273 {
2274 regs->error_code = 0;
2275 do_guest_trap(TRAP_gp_fault, regs, 1);
2276 return;
2277 }
2279 if ( !jump )
2280 {
2281 unsigned int ss, esp, *stkp;
2282 int rc;
2283 #define push(item) do \
2284 { \
2285 --stkp; \
2286 esp -= 4; \
2287 rc = __put_user(item, stkp); \
2288 if ( rc ) \
2289 { \
2290 propagate_page_fault((unsigned long)(stkp + 1) - rc, \
2291 PFEC_write_access); \
2292 return; \
2293 } \
2294 } while ( 0 )
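/*
 * push() writes into the (possibly new inner) guest stack via __put_user();
 * if the write faults, the fault is forwarded to the guest as a write page
 * fault at the offending address and emulation is abandoned.
 */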
2296 if ( ((ar >> 13) & 3) < (regs->cs & 3) )
2297 {
2298 sel |= (ar >> 13) & 3;
2299 /* Inner stack known only for kernel ring. */
2300 if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
2301 {
2302 do_guest_trap(TRAP_gp_fault, regs, 1);
2303 return;
2304 }
2305 esp = v->arch.guest_context.kernel_sp;
2306 ss = v->arch.guest_context.kernel_ss;
2307 if ( (ss & 3) != (sel & 3) ||
2308 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2309 ((ar >> 13) & 3) != (sel & 3) ||
2310 !(ar & _SEGMENT_S) ||
2311 (ar & _SEGMENT_CODE) ||
2312 !(ar & _SEGMENT_WR) )
2313 {
2314 regs->error_code = ss & ~3;
2315 do_guest_trap(TRAP_invalid_tss, regs, 1);
2316 return;
2317 }
2318 if ( !(ar & _SEGMENT_P) ||
2319 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
2320 {
2321 regs->error_code = ss & ~3;
2322 do_guest_trap(TRAP_stack_error, regs, 1);
2323 return;
2324 }
2325 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2326 if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
2328 do_guest_trap(TRAP_gp_fault, regs, 1);
2329 return;
2331 push(regs->ss);
2332 push(regs->esp);
2333 if ( nparm )
2335 const unsigned int *ustkp;
2337 if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
2338 ((ar >> 13) & 3) != (regs->cs & 3) ||
2339 !(ar & _SEGMENT_S) ||
2340 (ar & _SEGMENT_CODE) ||
2341 !(ar & _SEGMENT_WR) ||
2342 !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
2343 return do_guest_trap(TRAP_gp_fault, regs, 1);
2344 ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
2345 if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
2347 do_guest_trap(TRAP_gp_fault, regs, 1);
2348 return;
2350 do
2352 unsigned int parm;
2354 --ustkp;
2355 rc = __get_user(parm, ustkp);
2356 if ( rc )
2358 propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
2359 return;
2361 push(parm);
2362 } while ( --nparm );
2365 else
2367 sel |= (regs->cs & 3);
2368 esp = regs->esp;
2369 ss = regs->ss;
2370 if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2371 ((ar >> 13) & 3) != (sel & 3) )
2373 do_guest_trap(TRAP_gp_fault, regs, 1);
2374 return;
2376 if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
2378 regs->error_code = 0;
2379 do_guest_trap(TRAP_stack_error, regs, 1);
2380 return;
2382 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2383 if ( !compat_access_ok(stkp - 2, 2 * 4) )
2385 do_guest_trap(TRAP_gp_fault, regs, 1);
2386 return;
2389 push(regs->cs);
2390 push(eip);
2391 #undef push
2392 regs->esp = esp;
2393 regs->ss = ss;
2395 else
2396 sel |= (regs->cs & 3);
2398 regs->cs = sel;
2399 instruction_done(regs, off, 0);
2400 #endif
2403 asmlinkage void do_general_protection(struct cpu_user_regs *regs)
2405 struct vcpu *v = current;
2406 unsigned long fixup;
2408 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
2410 if ( regs->error_code & 1 )
2411 goto hardware_gp;
2413 if ( !guest_mode(regs) )
2414 goto gp_in_kernel;
2416 /*
2417 * Cunning trick to allow arbitrary "INT n" handling.
2419 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
2420 * instruction from trapping to the appropriate vector, when that might not
2421 * be expected by Xen or the guest OS. For example, that entry might be for
2422 * a fault handler (unlike traps, faults don't increment EIP), or might
2423 * expect an error code on the stack (which a software trap never
2424 * provides), or might be a hardware interrupt handler that doesn't like
2425 * being called spuriously.
2427 * Instead, a GPF occurs with the faulting IDT vector in the error code.
2428 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
2429 * clear to indicate that it's a software fault, not hardware.
2431 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
2432 * okay because they can only be triggered by an explicit DPL-checked
2433 * instruction. The DPL specified by the guest OS for these vectors is NOT
2434 * CHECKED!!
2435 */
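/*
 * For example, a guest executing INT 0x80 against a DPL-0 IDT entry arrives
 * here with error_code == (0x80 << 3) | 2 == 0x402: bit 1 set (IDT entry),
 * bit 0 clear (software, not external).
 */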
2436 if ( (regs->error_code & 3) == 2 )
2437 {
2438 /* This fault must be due to <INT n> instruction. */
2439 const struct trap_info *ti;
2440 unsigned char vector = regs->error_code >> 3;
2441 ti = &v->arch.guest_context.trap_ctxt[vector];
2442 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
2443 {
2444 regs->eip += 2;
2445 do_guest_trap(vector, regs, 0);
2446 return;
2447 }
2448 }
2449 else if ( is_pv_32on64_vcpu(v) && regs->error_code )
2450 {
2451 emulate_gate_op(regs);
2452 return;
2453 }
2455 /* Emulate some simple privileged and I/O instructions. */
2456 if ( (regs->error_code == 0) &&
2457 emulate_privileged_op(regs) )
2458 {
2459 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
2460 return;
2461 }
2463 #if defined(__i386__)
2464 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
2465 (regs->error_code == 0) &&
2466 gpf_emulate_4gb(regs) )
2467 {
2468 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
2469 return;
2470 }
2471 #endif
2473 /* Pass on GPF as is. */
2474 do_guest_trap(TRAP_gp_fault, regs, 1);
2475 return;
2477 gp_in_kernel:
2479 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
2480 {
2481 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
2482 regs->error_code, _p(regs->eip), _p(fixup));
2483 regs->eip = fixup;
2484 return;
2485 }
2487 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
2489 hardware_gp:
2490 show_execution_state(regs);
2491 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
2494 static void nmi_softirq(void)
2496 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
2497 vcpu_kick(dom0->vcpu[0]);
2500 static void nmi_dom0_report(unsigned int reason_idx)
2502 struct domain *d;
2503 struct vcpu *v;
2505 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
2506 return;
2508 set_bit(reason_idx, nmi_reason(d));
2510 if ( !test_and_set_bool(v->nmi_pending) )
2511 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
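/*
 * The handlers below decode system NMIs via the ISA NMI status/control
 * port 0x61: bit 7 flags a memory parity error and bit 6 an I/O check
 * (IOCHK) error. Depending on opt_nmi the error is forwarded to dom0,
 * ignored, or treated as fatal, and the source is then cleared and
 * re-enabled through the same port.
 */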
2514 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
2516 switch ( opt_nmi[0] )
2518 case 'd': /* 'dom0' */
2519 nmi_dom0_report(_XEN_NMIREASON_parity_error);
2520 case 'i': /* 'ignore' */
2521 break;
2522 default: /* 'fatal' */
2523 console_force_unlock();
2524 printk("\n\nNMI - MEMORY ERROR\n");
2525 fatal_trap(TRAP_nmi, regs);
2528 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
2529 mdelay(1);
2530 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
2533 asmlinkage void io_check_error(struct cpu_user_regs *regs)
2535 switch ( opt_nmi[0] )
2537 case 'd': /* 'dom0' */
2538 nmi_dom0_report(_XEN_NMIREASON_io_error);
2539 case 'i': /* 'ignore' */
2540 break;
2541 default: /* 'fatal' */
2542 console_force_unlock();
2543 printk("\n\nNMI - I/O ERROR\n");
2544 fatal_trap(TRAP_nmi, regs);
2547 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
2548 mdelay(1);
2549 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
2552 static void unknown_nmi_error(unsigned char reason)
2554 switch ( opt_nmi[0] )
2556 case 'd': /* 'dom0' */
2557 nmi_dom0_report(_XEN_NMIREASON_unknown);
2558 case 'i': /* 'ignore' */
2559 break;
2560 default: /* 'fatal' */
2561 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
2562 printk("Dazed and confused, but trying to continue\n");
2563 printk("Do you have a strange power saving mode enabled?\n");
2564 kexec_crash();
2568 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
2570 return 0;
2573 static nmi_callback_t nmi_callback = dummy_nmi_callback;
2575 asmlinkage void do_nmi(struct cpu_user_regs *regs)
2577 unsigned int cpu = smp_processor_id();
2578 unsigned char reason;
2580 ++nmi_count(cpu);
2582 if ( nmi_callback(regs, cpu) )
2583 return;
2585 if ( nmi_watchdog )
2586 nmi_watchdog_tick(regs);
2588 /* Only the BSP gets external NMIs from the system. */
2589 if ( cpu == 0 )
2590 {
2591 reason = inb(0x61);
2592 if ( reason & 0x80 )
2593 mem_parity_error(regs);
2594 else if ( reason & 0x40 )
2595 io_check_error(regs);
2596 else if ( !nmi_watchdog )
2597 unknown_nmi_error((unsigned char)(reason&0xff));
2598 }
2599 }
2601 void set_nmi_callback(nmi_callback_t callback)
2603 nmi_callback = callback;
2606 void unset_nmi_callback(void)
2608 nmi_callback = dummy_nmi_callback;
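/*
 * #NM (device-not-available) from a PV guest: setup_fpu() restores the
 * FPU state, then, if the guest's virtual CR0.TS is set, the trap is
 * reflected so the guest's own lazy-FPU handling runs and the virtual TS
 * bit is cleared here; otherwise only the math-state restore is traced.
 */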
2611 asmlinkage void do_device_not_available(struct cpu_user_regs *regs)
2613 struct vcpu *curr = current;
2615 BUG_ON(!guest_mode(regs));
2617 setup_fpu(curr);
2619 if ( curr->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
2620 {
2621 do_guest_trap(TRAP_no_device, regs, 0);
2622 curr->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
2623 }
2624 else
2625 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
2627 return;
2630 asmlinkage void do_debug(struct cpu_user_regs *regs)
2632 struct vcpu *v = current;
2634 DEBUGGER_trap_entry(TRAP_debug, regs);
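/*
 * Debug traps taken inside Xen itself: a set TF is only expected on the
 * SYSENTER entry path before EFLAGS has been saved, and is cleared once it
 * is safe to do so; watchpoint hits within Xen are ignored (see the comment
 * below). Such traps are not forwarded to the guest.
 */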
2636 if ( !guest_mode(regs) )
2637 {
2638 if ( regs->eflags & EF_TF )
2639 {
2640 #ifdef __x86_64__
2641 void sysenter_entry(void);
2642 void sysenter_eflags_saved(void);
2643 /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
2644 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
2645 (regs->rip < (unsigned long)sysenter_eflags_saved) )
2646 goto out;
2647 WARN_ON(regs->rip != (unsigned long)sysenter_eflags_saved);
2648 #else
2649 WARN_ON(1);
2650 #endif
2651 regs->eflags &= ~EF_TF;
2652 }
2653 else
2654 {
2655 /*
2656 * We ignore watchpoints when they trigger within Xen. This may
2657 * happen when a buffer is passed to us which previously had a
2658 * watchpoint set on it. No need to bump EIP; the only faulting
2659 * trap is an instruction breakpoint, which can't happen to us.
2660 */
2661 WARN_ON(!search_exception_table(regs->eip));
2662 }
2663 goto out;
2664 }
2666 /* Save debug status register where guest OS can peek at it */
2667 v->arch.guest_context.debugreg[6] = read_debugreg(6);
2669 ler_enable();
2670 do_guest_trap(TRAP_debug, regs, 0);
2671 return;
2673 out:
2674 ler_enable();
2675 return;
2678 asmlinkage void do_spurious_interrupt_bug(struct cpu_user_regs *regs)
2679 {
2680 }
2682 static void __set_intr_gate(unsigned int n, uint32_t dpl, void *addr)
2684 int i;
2685 /* Keep secondary tables in sync with IRQ updates. */
2686 for ( i = 1; i < NR_CPUS; i++ )
2687 if ( idt_tables[i] != NULL )
2688 _set_gate(&idt_tables[i][n], 14, dpl, addr);
2689 _set_gate(&idt_table[n], 14, dpl, addr);
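/*
 * All entries are type-14 interrupt gates; only the DPL differs.
 * set_swint_gate() below installs DPL==3 so that explicit INT3/INTO from
 * any privilege level can reach their handlers, while set_intr_gate()
 * keeps DPL==0 for everything else.
 */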
2692 static void set_swint_gate(unsigned int n, void *addr)
2694 __set_intr_gate(n, 3, addr);
2697 void set_intr_gate(unsigned int n, void *addr)
2699 __set_intr_gate(n, 0, addr);
2702 void set_tss_desc(unsigned int n, void *addr)
2704 _set_tssldt_desc(
2705 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2706 (unsigned long)addr,
2707 offsetof(struct tss_struct, __cacheline_filler) - 1,
2708 9);
2709 #ifdef CONFIG_COMPAT
2710 _set_tssldt_desc(
2711 compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2712 (unsigned long)addr,
2713 offsetof(struct tss_struct, __cacheline_filler) - 1,
2714 11);
2715 #endif
2718 void __devinit percpu_traps_init(void)
2720 subarch_percpu_traps_init();
2722 if ( !opt_ler )
2723 return;
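/*
 * With the 'ler' boot option enabled, select the vendor/family-specific
 * last-exception-record ("last exception from IP") MSR below; nothing is
 * chosen on models not listed.
 */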
2725 switch ( boot_cpu_data.x86_vendor )
2727 case X86_VENDOR_INTEL:
2728 switch ( boot_cpu_data.x86 )
2730 case 6:
2731 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2732 break;
2733 case 15:
2734 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
2735 break;
2737 break;
2738 case X86_VENDOR_AMD:
2739 switch ( boot_cpu_data.x86 )
2741 case 6:
2742 case 15:
2743 case 16:
2744 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2745 break;
2747 break;
2750 ler_enable();
2753 void __init trap_init(void)
2755 /*
2756 * Note that interrupt gates are always used, rather than trap gates. We
2757 * must have interrupts disabled until DS/ES/FS/GS are saved because the
2758 * first activation must have the "bad" value(s) for these registers and
2759 * we may lose them if another activation is installed before they are
2760 * saved. The page-fault handler also needs interrupts disabled until %cr2
2761 * has been read and saved on the stack.
2762 */
2763 set_intr_gate(TRAP_divide_error,&divide_error);
2764 set_intr_gate(TRAP_debug,&debug);
2765 set_intr_gate(TRAP_nmi,&nmi);
2766 set_swint_gate(TRAP_int3,&int3); /* usable from all privileges */
2767 set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */
2768 set_intr_gate(TRAP_bounds,&bounds);
2769 set_intr_gate(TRAP_invalid_op,&invalid_op);
2770 set_intr_gate(TRAP_no_device,&device_not_available);
2771 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
2772 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
2773 set_intr_gate(TRAP_no_segment,&segment_not_present);
2774 set_intr_gate(TRAP_stack_error,&stack_segment);
2775 set_intr_gate(TRAP_gp_fault,&general_protection);
2776 set_intr_gate(TRAP_page_fault,&page_fault);
2777 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
2778 set_intr_gate(TRAP_copro_error,&coprocessor_error);
2779 set_intr_gate(TRAP_alignment_check,&alignment_check);
2780 set_intr_gate(TRAP_machine_check,&machine_check);
2781 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
2783 /* CPU0 uses the master IDT. */
2784 idt_tables[0] = idt_table;
2786 percpu_traps_init();
2788 cpu_init();
2790 open_softirq(NMI_SOFTIRQ, nmi_softirq);
2793 long register_guest_nmi_callback(unsigned long address)
2795 struct vcpu *v = current;
2796 struct domain *d = v->domain;
2797 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
2799 t->vector = TRAP_nmi;
2800 t->flags = 0;
2801 t->cs = !IS_COMPAT(d) ? FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS;
2802 t->address = address;
2803 TI_SET_IF(t, 1);
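/*
 * Setting the IF flag on this trap_info entry should mean the callback is
 * entered with event delivery masked, i.e. the NMI-like behaviour a guest
 * NMI handler expects.
 */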
2805 /*
2806 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
2807 * now.
2808 */
2809 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
2810 v->nmi_pending = 1;
2812 return 0;
2815 long unregister_guest_nmi_callback(void)
2817 struct vcpu *v = current;
2818 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
2820 memset(t, 0, sizeof(*t));
2822 return 0;
2825 long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
2827 struct trap_info cur;
2828 struct vcpu *curr = current;
2829 struct trap_info *dst = curr->arch.guest_context.trap_ctxt;
2830 long rc = 0;
2832 /* If no table is presented then clear the entire virtual IDT. */
2833 if ( guest_handle_is_null(traps) )
2834 {
2835 memset(dst, 0, 256 * sizeof(*dst));
2836 init_int80_direct_trap(curr);
2837 return 0;
2838 }
2840 for ( ; ; )
2841 {
2842 if ( hypercall_preempt_check() )
2843 {
2844 rc = hypercall_create_continuation(
2845 __HYPERVISOR_set_trap_table, "h", traps);
2846 break;
2847 }
2849 if ( copy_from_guest(&cur, traps, 1) )
2850 {
2851 rc = -EFAULT;
2852 break;
2853 }
2855 if ( cur.address == 0 )
2856 break;
2858 fixup_guest_code_selector(curr->domain, cur.cs);
2860 memcpy(&dst[cur.vector], &cur, sizeof(cur));
2862 if ( cur.vector == 0x80 )
2863 init_int80_direct_trap(curr);
2865 guest_handle_add_offset(traps, 1);
2868 return rc;
2871 long set_debugreg(struct vcpu *v, int reg, unsigned long value)
2873 int i;
2874 struct vcpu *curr = current;
2876 switch ( reg )
2878 case 0:
2879 if ( !access_ok(value, sizeof(long)) )
2880 return -EPERM;
2881 if ( v == curr )
2882 write_debugreg(0, value);
2883 break;
2884 case 1:
2885 if ( !access_ok(value, sizeof(long)) )
2886 return -EPERM;
2887 if ( v == curr )
2888 write_debugreg(1, value);
2889 break;
2890 case 2:
2891 if ( !access_ok(value, sizeof(long)) )
2892 return -EPERM;
2893 if ( v == curr )
2894 write_debugreg(2, value);
2895 break;
2896 case 3:
2897 if ( !access_ok(value, sizeof(long)) )
2898 return -EPERM;
2899 if ( v == curr )
2900 write_debugreg(3, value);
2901 break;
2902 case 6:
2903 /*
2904 * DR6: Bits 4-11,16-31 reserved (set to 1).
2905 * Bit 12 reserved (set to 0).
2906 */
2907 value &= 0xffffefff; /* reserved bits => 0 */
2908 value |= 0xffff0ff0; /* reserved bits => 1 */
2909 if ( v == curr )
2910 write_debugreg(6, value);
2911 break;
2912 case 7:
2913 /*
2914 * DR7: Bit 10 reserved (set to 1).
2915 * Bits 11-12,14-15 reserved (set to 0).
2916 */
2917 value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
2918 value |= DR_CONTROL_RESERVED_ONE; /* reserved bits => 1 */
2919 /*
2920 * Privileged bits:
2921 * GD (bit 13): must be 0.
2922 */
2923 if ( value & DR_GENERAL_DETECT )
2924 return -EPERM;
2925 /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
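/*
 * Layout assumed by the block below: DR7 bits 0-7 are the per-breakpoint
 * L/G enables (DR7_ACTIVE_MASK), and breakpoint n has a 4-bit type/length
 * field starting at bit 16+4n. A type of DR_IO is an I/O breakpoint, legal
 * only when the guest has CR4.DE set; the enable bits of I/O breakpoints
 * are collected into io_enable so they can be stripped from the real DR7
 * and stashed in debugreg[5].
 */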
2926 if ( value & DR7_ACTIVE_MASK )
2928 unsigned int io_enable = 0;
2930 for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
2931 {
2932 if ( ((value >> i) & 3) == DR_IO )
2933 {
2934 if ( !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
2935 return -EPERM;
2936 io_enable |= value & (3 << ((i - 16) >> 1));
2937 }
2938 #ifdef __i386__
2939 if ( ((boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
2940 !boot_cpu_has(X86_FEATURE_LM)) &&
2941 (((value >> i) & 0xc) == DR_LEN_8) )
2942 return -EPERM;
2943 #endif
2944 }
2946 /* Guest DR5 is a handy stash for I/O intercept information. */
2947 v->arch.guest_context.debugreg[5] = io_enable;
2948 value &= ~io_enable;
2950 /*
2951 * If DR7 was previously clear then we need to load all other
2952 * debug registers at this point as they were not restored during
2953 * context switch.
2954 */
2955 if ( (v == curr) &&
2956 !(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
2957 {
2958 write_debugreg(0, v->arch.guest_context.debugreg[0]);
2959 write_debugreg(1, v->arch.guest_context.debugreg[1]);
2960 write_debugreg(2, v->arch.guest_context.debugreg[2]);
2961 write_debugreg(3, v->arch.guest_context.debugreg[3]);
2962 write_debugreg(6, v->arch.guest_context.debugreg[6]);
2963 }
2964 }
2965 if ( v == curr )
2966 write_debugreg(7, value);
2967 break;
2968 default:
2969 return -EINVAL;
2972 v->arch.guest_context.debugreg[reg] = value;
2973 return 0;
2976 long do_set_debugreg(int reg, unsigned long value)
2978 return set_debugreg(current, reg, value);
2981 unsigned long do_get_debugreg(int reg)
2983 struct vcpu *curr = current;
2985 switch ( reg )
2987 case 0 ... 3:
2988 case 6:
2989 return curr->arch.guest_context.debugreg[reg];
2990 case 7:
2991 return (curr->arch.guest_context.debugreg[7] |
2992 curr->arch.guest_context.debugreg[5]);
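/*
 * The I/O-intercept enables stripped out in set_debugreg() and stashed in
 * debugreg[5] are merged back here, so the guest reads the DR7 value it
 * originally wrote.
 */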
2993 case 4 ... 5:
2994 return ((curr->arch.guest_context.ctrlreg[4] & X86_CR4_DE) ?
2995 curr->arch.guest_context.debugreg[reg + 2] : 0);
2998 return -EINVAL;
3001 /*
3002 * Local variables:
3003 * mode: C
3004 * c-set-style: "BSD"
3005 * c-basic-offset: 4
3006 * tab-width: 4
3007 * indent-tabs-mode: nil
3008 * End:
3009 */