debuggers.hg

view xen/arch/x86/traps.c @ 16371:00db9ec39831

x86: Fix PV guest CR4 handling. We should not leak hidden CR4 bits
into guest CR4 value.
Signed-off-by: Keir Fraser <keir@xensource.com>
author Keir Fraser <keir@xensource.com>
date Wed Nov 07 14:15:44 2007 +0000 (2007-11-07)
parents ddc9e6b2babb
children 8c305873f2b8
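
The fix summarised above shows up in this file where MOV to/from CR4 is emulated for PV guests (the 0x20 and 0x22 cases in emulate_privileged_op below): reads mask off PGE/PSE before handing the value to the guest, and writes go through pv_guest_cr4_fixup()/pv_guest_cr4_to_real_cr4() rather than storing the raw value. A minimal sketch of that split, assuming the helpers simply mask out and re-add the Xen-owned bits (the real helpers' exact masks are not part of this file):

    /* Illustrative only -- not the code introduced by this changeset. */
    #define XEN_OWNED_CR4_BITS (X86_CR4_PGE | X86_CR4_PSE)

    static unsigned long sketch_pv_guest_cr4_fixup(unsigned long guest_cr4)
    {
        /* Keep only what the guest may control; drop Xen-owned bits. */
        return guest_cr4 & ~XEN_OWNED_CR4_BITS;
    }

    static unsigned long sketch_pv_guest_cr4_to_real_cr4(unsigned long guest_cr4)
    {
        /* Re-apply the hidden bits Xen itself relies on before loading %cr4. */
        return guest_cr4 | (read_cr4() & XEN_OWNED_CR4_BITS);
    }
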
line source
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <asm/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/desc.h>
55 #include <asm/debugreg.h>
56 #include <asm/smp.h>
57 #include <asm/flushtlb.h>
58 #include <asm/uaccess.h>
59 #include <asm/i387.h>
60 #include <asm/debugger.h>
61 #include <asm/msr.h>
62 #include <asm/shared.h>
63 #include <asm/x86_emulate.h>
64 #include <asm/hvm/vpt.h>
66 /*
67 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
68 * fatal: Xen prints diagnostic message and then hangs.
69 * dom0: The NMI is virtualised to DOM0.
70 * ignore: The NMI error is cleared and ignored.
71 */
72 #ifdef NDEBUG
73 char opt_nmi[10] = "dom0";
74 #else
75 char opt_nmi[10] = "fatal";
76 #endif
77 string_param("nmi", opt_nmi);
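/* Editorial note: string_param() registers this as a Xen boot-command-line
 * option, so booting with e.g. "nmi=ignore" or "nmi=fatal" overrides the
 * default chosen above. */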
79 DEFINE_PER_CPU(u32, ler_msr);
81 /* Master table, used by CPU0. */
82 idt_entry_t idt_table[IDT_ENTRIES];
84 /* Pointer to the IDT of every CPU. */
85 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
87 #define DECLARE_TRAP_HANDLER(_name) \
88 asmlinkage void _name(void); \
89 asmlinkage int do_ ## _name(struct cpu_user_regs *regs)
91 asmlinkage void nmi(void);
92 asmlinkage void machine_check(void);
93 DECLARE_TRAP_HANDLER(divide_error);
94 DECLARE_TRAP_HANDLER(debug);
95 DECLARE_TRAP_HANDLER(int3);
96 DECLARE_TRAP_HANDLER(overflow);
97 DECLARE_TRAP_HANDLER(bounds);
98 DECLARE_TRAP_HANDLER(invalid_op);
99 DECLARE_TRAP_HANDLER(device_not_available);
100 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
101 DECLARE_TRAP_HANDLER(invalid_TSS);
102 DECLARE_TRAP_HANDLER(segment_not_present);
103 DECLARE_TRAP_HANDLER(stack_segment);
104 DECLARE_TRAP_HANDLER(general_protection);
105 DECLARE_TRAP_HANDLER(page_fault);
106 DECLARE_TRAP_HANDLER(coprocessor_error);
107 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
108 DECLARE_TRAP_HANDLER(alignment_check);
109 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
111 long do_set_debugreg(int reg, unsigned long value);
112 unsigned long do_get_debugreg(int reg);
114 static int debug_stack_lines = 20;
115 integer_param("debug_stack_lines", debug_stack_lines);
117 static int opt_ler;
118 boolean_param("ler", opt_ler);
120 #ifdef CONFIG_X86_32
121 #define stack_words_per_line 8
122 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
123 #else
124 #define stack_words_per_line 4
125 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
126 #endif
128 static void show_guest_stack(struct cpu_user_regs *regs)
129 {
130 int i;
131 struct vcpu *curr = current;
132 unsigned long *stack, addr;
134 if ( is_hvm_vcpu(curr) )
135 return;
137 if ( is_pv_32on64_vcpu(curr) )
138 {
139 compat_show_guest_stack(regs, debug_stack_lines);
140 return;
141 }
143 if ( vm86_mode(regs) )
144 {
145 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
146 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
147 regs->ss, (uint16_t)(regs->esp & 0xffff));
148 }
149 else
150 {
151 stack = (unsigned long *)regs->esp;
152 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
153 }
155 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
156 {
157 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
158 break;
159 if ( get_user(addr, stack) )
160 {
161 if ( i != 0 )
162 printk("\n ");
163 printk("Fault while accessing guest memory.");
164 i = 1;
165 break;
166 }
167 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
168 printk("\n ");
169 printk(" %p", _p(addr));
170 stack++;
171 }
172 if ( i == 0 )
173 printk("Stack empty.");
174 printk("\n");
175 }
177 #if !defined(CONFIG_FRAME_POINTER)
179 static void show_trace(struct cpu_user_regs *regs)
180 {
181 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
183 printk("Xen call trace:\n ");
185 printk("[<%p>]", _p(regs->eip));
186 print_symbol(" %s\n ", regs->eip);
188 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
189 {
190 addr = *stack++;
191 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
192 {
193 printk("[<%p>]", _p(addr));
194 print_symbol(" %s\n ", addr);
195 }
196 }
198 printk("\n");
199 }
201 #else
203 static void show_trace(struct cpu_user_regs *regs)
204 {
205 unsigned long *frame, next, addr, low, high;
207 printk("Xen call trace:\n ");
209 printk("[<%p>]", _p(regs->eip));
210 print_symbol(" %s\n ", regs->eip);
212 /* Bounds for range of valid frame pointer. */
213 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
214 high = (low & ~(STACK_SIZE - 1)) +
215 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
217 /* The initial frame pointer. */
218 next = regs->ebp;
220 for ( ; ; )
221 {
222 /* Valid frame pointer? */
223 if ( (next < low) || (next >= high) )
224 {
225 /*
226 * Exception stack frames have a different layout, denoted by an
227 * inverted frame pointer.
228 */
229 next = ~next;
230 if ( (next < low) || (next >= high) )
231 break;
232 frame = (unsigned long *)next;
233 next = frame[0];
234 addr = frame[(offsetof(struct cpu_user_regs, eip) -
235 offsetof(struct cpu_user_regs, ebp))
236 / BYTES_PER_LONG];
237 }
238 else
239 {
240 /* Ordinary stack frame. */
241 frame = (unsigned long *)next;
242 next = frame[0];
243 addr = frame[1];
244 }
246 printk("[<%p>]", _p(addr));
247 print_symbol(" %s\n ", addr);
249 low = (unsigned long)&frame[2];
250 }
252 printk("\n");
253 }
255 #endif
257 void show_stack(struct cpu_user_regs *regs)
258 {
259 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
260 int i;
262 if ( guest_mode(regs) )
263 return show_guest_stack(regs);
265 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
267 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
268 {
269 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
270 break;
271 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
272 printk("\n ");
273 addr = *stack++;
274 printk(" %p", _p(addr));
275 }
276 if ( i == 0 )
277 printk("Stack empty.");
278 printk("\n");
280 show_trace(regs);
281 }
283 void show_stack_overflow(unsigned int cpu, unsigned long esp)
284 {
285 #ifdef MEMORY_GUARD
286 unsigned long esp_top, esp_bottom;
287 unsigned long *stack, addr;
289 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
290 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
292 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
293 (void *)esp_top, (void *)esp_bottom, (void *)esp,
294 (void *)init_tss[cpu].esp0);
296 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
297 if ( ((unsigned long)(esp - esp_top) > 512) &&
298 ((unsigned long)(esp_top - esp) > 512) )
299 {
300 printk("No stack overflow detected. Skipping stack trace.\n");
301 return;
302 }
304 if ( esp < esp_top )
305 esp = esp_top;
307 printk("Xen stack overflow (dumping trace %p-%p):\n ",
308 (void *)esp, (void *)esp_bottom);
310 stack = (unsigned long *)esp;
311 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
312 {
313 addr = *stack++;
314 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
315 {
316 printk("%p: [<%p>]", stack, _p(addr));
317 print_symbol(" %s\n ", addr);
318 }
319 }
321 printk("\n");
322 #endif
323 }
325 void show_execution_state(struct cpu_user_regs *regs)
326 {
327 show_registers(regs);
328 show_stack(regs);
329 }
331 char *trapstr(int trapnr)
332 {
333 static char *strings[] = {
334 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
335 "invalid opcode", "device not available", "double fault",
336 "coprocessor segment", "invalid tss", "segment not found",
337 "stack error", "general protection fault", "page fault",
338 "spurious interrupt", "coprocessor error", "alignment check",
339 "machine check", "simd error"
340 };
342 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
343 return "???";
345 return strings[trapnr];
346 }
348 /*
349 * This is called for faults at very unexpected times (e.g., when interrupts
350 * are disabled). In such situations we can't do much that is safe. We try to
351 * print out some tracing and then we just spin.
352 */
353 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
354 {
355 static DEFINE_PER_CPU(char, depth);
357 /*
358 * In some cases, we can end up in a vicious cycle of fatal_trap()s
359 * within fatal_trap()s. We give the problem a couple of iterations to
360 * bottom out, and then we just panic.
361 */
362 if ( ++this_cpu(depth) < 3 )
363 {
364 watchdog_disable();
365 console_start_sync();
367 show_execution_state(regs);
369 if ( trapnr == TRAP_page_fault )
370 {
371 unsigned long cr2 = read_cr2();
372 printk("Faulting linear address: %p\n", _p(cr2));
373 show_page_walk(cr2);
374 }
375 }
377 panic("FATAL TRAP: vector = %d (%s)\n"
378 "[error_code=%04x] %s\n",
379 trapnr, trapstr(trapnr), regs->error_code,
380 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
381 }
383 static int do_guest_trap(
384 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
385 {
386 struct vcpu *v = current;
387 struct trap_bounce *tb;
388 const struct trap_info *ti;
390 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
392 tb = &v->arch.trap_bounce;
393 ti = &v->arch.guest_context.trap_ctxt[trapnr];
395 tb->flags = TBF_EXCEPTION;
396 tb->cs = ti->cs;
397 tb->eip = ti->address;
399 if ( use_error_code )
400 {
401 tb->flags |= TBF_EXCEPTION_ERRCODE;
402 tb->error_code = regs->error_code;
403 }
405 if ( TI_GET_IF(ti) )
406 tb->flags |= TBF_INTERRUPT;
408 if ( unlikely(null_trap_bounce(v, tb)) )
409 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] in "
410 "domain %d on VCPU %d [ec=%04x]\n",
411 trapstr(trapnr), trapnr, v->domain->domain_id, v->vcpu_id,
412 regs->error_code);
414 return 0;
415 }
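/*
 * Editorial note on the mechanism above: the trap_ctxt[] entries consulted by
 * do_guest_trap() are those a PV guest registered via the set_trap_table
 * hypercall (an array of trap_info structures giving vector, flags, cs and
 * handler address).  TI_GET_IF() reflects the per-vector flag a guest can set
 * to have event delivery masked on entry to the handler, analogous to a
 * hardware interrupt gate.
 */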
417 /*
418 * Called from asm to set up the NMI trapbounce info.
419 * Returns 0 if no callback is set up, else 1.
420 */
421 asmlinkage int set_guest_nmi_trapbounce(void)
422 {
423 struct vcpu *v = current;
424 struct trap_bounce *tb = &v->arch.trap_bounce;
425 do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0);
426 tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
427 return !null_trap_bounce(v, tb);
428 }
430 static inline int do_trap(
431 int trapnr, struct cpu_user_regs *regs, int use_error_code)
432 {
433 unsigned long fixup;
435 DEBUGGER_trap_entry(trapnr, regs);
437 if ( guest_mode(regs) )
438 return do_guest_trap(trapnr, regs, use_error_code);
440 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
441 {
442 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
443 trapnr, _p(regs->eip), _p(fixup));
444 regs->eip = fixup;
445 return 0;
446 }
448 DEBUGGER_trap_fatal(trapnr, regs);
450 show_execution_state(regs);
451 panic("FATAL TRAP: vector = %d (%s)\n"
452 "[error_code=%04x]\n",
453 trapnr, trapstr(trapnr), regs->error_code);
454 return 0;
455 }
457 #define DO_ERROR_NOCODE(trapnr, name) \
458 asmlinkage int do_##name(struct cpu_user_regs *regs) \
459 { \
460 return do_trap(trapnr, regs, 0); \
461 }
463 #define DO_ERROR(trapnr, name) \
464 asmlinkage int do_##name(struct cpu_user_regs *regs) \
465 { \
466 return do_trap(trapnr, regs, 1); \
467 }
469 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
470 DO_ERROR_NOCODE(TRAP_overflow, overflow)
471 DO_ERROR_NOCODE(TRAP_bounds, bounds)
472 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
473 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
474 DO_ERROR( TRAP_no_segment, segment_not_present)
475 DO_ERROR( TRAP_stack_error, stack_segment)
476 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
477 DO_ERROR( TRAP_alignment_check, alignment_check)
478 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
480 int rdmsr_hypervisor_regs(
481 uint32_t idx, uint32_t *eax, uint32_t *edx)
482 {
483 idx -= 0x40000000;
484 if ( idx > 0 )
485 return 0;
487 switch ( idx )
488 {
489 case 0:
490 {
491 *eax = *edx = 0;
492 break;
493 }
494 default:
495 BUG();
496 }
498 return 1;
499 }
501 int wrmsr_hypervisor_regs(
502 uint32_t idx, uint32_t eax, uint32_t edx)
503 {
504 struct domain *d = current->domain;
506 idx -= 0x40000000;
507 if ( idx > 0 )
508 return 0;
510 switch ( idx )
511 {
512 case 0:
513 {
514 void *hypercall_page;
515 unsigned long mfn;
516 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
517 unsigned int idx = eax & 0xfff;
519 if ( idx > 0 )
520 {
521 gdprintk(XENLOG_WARNING,
522 "Dom%d: Out of range index %u to MSR %08x\n",
523 d->domain_id, idx, 0x40000000);
524 return 0;
525 }
527 mfn = gmfn_to_mfn(d, gmfn);
529 if ( !mfn_valid(mfn) ||
530 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
531 {
532 gdprintk(XENLOG_WARNING,
533 "Dom%d: Bad GMFN %lx (MFN %lx) to MSR %08x\n",
534 d->domain_id, gmfn, mfn, 0x40000000);
535 return 0;
536 }
538 hypercall_page = map_domain_page(mfn);
539 hypercall_page_initialise(d, hypercall_page);
540 unmap_domain_page(hypercall_page);
542 put_page_and_type(mfn_to_page(mfn));
543 break;
544 }
546 default:
547 BUG();
548 }
550 return 1;
551 }
553 int cpuid_hypervisor_leaves(
554 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
555 {
556 idx -= 0x40000000;
557 if ( idx > 2 )
558 return 0;
560 switch ( idx )
561 {
562 case 0:
563 *eax = 0x40000002; /* Largest leaf */
564 *ebx = 0x566e6558; /* Signature 1: "XenV" */
565 *ecx = 0x65584d4d; /* Signature 2: "MMXe" */
566 *edx = 0x4d4d566e; /* Signature 3: "nVMM" */
567 break;
569 case 1:
570 *eax = (xen_major_version() << 16) | xen_minor_version();
571 *ebx = 0; /* Reserved */
572 *ecx = 0; /* Reserved */
573 *edx = 0; /* Reserved */
574 break;
576 case 2:
577 *eax = 1; /* Number of hypercall-transfer pages */
578 *ebx = 0x40000000; /* MSR base address */
579 *ecx = 0; /* Features 1 */
580 *edx = 0; /* Features 2 */
581 break;
583 default:
584 BUG();
585 }
587 return 1;
588 }
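/*
 * Editorial illustration (not part of this file): how a guest consumes the
 * two interfaces above.  CPUID leaf 0x40000000 yields the "XenVMMXenVMM"
 * signature and the largest leaf; leaf 0x40000002 yields the number of
 * hypercall pages and the MSR base; writing (gpfn << 12) | index to that MSR
 * makes wrmsr_hypervisor_regs() map and initialise the hypercall page.  The
 * cpuid()/wrmsrl() helpers and names below are assumptions for the sketch,
 * not code from this tree.
 */
static void sketch_guest_setup_hypercall_page(unsigned long hypercall_gpfn)
{
    uint32_t eax, ebx, ecx, edx, pages, msr_base;

    cpuid(0x40000000, &eax, &ebx, &ecx, &edx);        /* signature + max leaf */
    cpuid(0x40000002, &pages, &msr_base, &ecx, &edx); /* #pages + MSR base */
    wrmsrl(msr_base, ((uint64_t)hypercall_gpfn << 12) | 0); /* page index 0 */
}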
590 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
591 {
592 char sig[5], instr[2];
593 uint32_t a, b, c, d;
594 unsigned long eip, rc;
596 a = regs->eax;
597 b = regs->ebx;
598 c = regs->ecx;
599 d = regs->edx;
600 eip = regs->eip;
602 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
603 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
604 {
605 propagate_page_fault(eip + sizeof(sig) - rc, 0);
606 return EXCRET_fault_fixed;
607 }
608 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
609 return 0;
610 eip += sizeof(sig);
612 /* We only emulate CPUID. */
613 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
614 {
615 propagate_page_fault(eip + sizeof(instr) - rc, 0);
616 return EXCRET_fault_fixed;
617 }
618 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
619 return 0;
620 eip += sizeof(instr);
622 asm (
623 "cpuid"
624 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
625 : "0" (a), "1" (b), "2" (c), "3" (d) );
627 if ( regs->eax == 1 )
628 {
629 /* Modify Feature Information. */
630 clear_bit(X86_FEATURE_VME, &d);
631 clear_bit(X86_FEATURE_DE, &d);
632 clear_bit(X86_FEATURE_PSE, &d);
633 clear_bit(X86_FEATURE_PGE, &d);
634 if ( !cpu_has_sep )
635 clear_bit(X86_FEATURE_SEP, &d);
636 #ifdef __i386__
637 if ( !supervisor_mode_kernel )
638 clear_bit(X86_FEATURE_SEP, &d);
639 #endif
640 if ( !IS_PRIV(current->domain) )
641 clear_bit(X86_FEATURE_MTRR, &d);
642 }
643 else if ( regs->eax == 0x80000001 )
644 {
645 /* Modify Feature Information. */
646 #ifdef __i386__
647 clear_bit(X86_FEATURE_SYSCALL % 32, &d);
648 #endif
649 clear_bit(X86_FEATURE_RDTSCP % 32, &d);
650 }
651 else
652 {
653 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
654 }
656 regs->eax = a;
657 regs->ebx = b;
658 regs->ecx = c;
659 regs->edx = d;
660 regs->eip = eip;
661 regs->eflags &= ~X86_EFLAGS_RF;
663 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
665 return EXCRET_fault_fixed;
666 }
668 asmlinkage int do_invalid_op(struct cpu_user_regs *regs)
669 {
670 struct bug_frame bug;
671 struct bug_frame_str bug_str;
672 char *filename, *predicate, *eip = (char *)regs->eip;
673 int rc, id, lineno;
675 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
677 if ( likely(guest_mode(regs)) )
678 {
679 if ( (rc = emulate_forced_invalid_op(regs)) != 0 )
680 return rc;
681 return do_guest_trap(TRAP_invalid_op, regs, 0);
682 }
684 if ( !is_kernel(eip) ||
685 __copy_from_user(&bug, eip, sizeof(bug)) ||
686 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
687 (bug.ret != 0xc2) )
688 goto die;
689 eip += sizeof(bug);
691 id = bug.id & 3;
693 if ( id == BUGFRAME_dump )
694 {
695 show_execution_state(regs);
696 regs->eip = (unsigned long)eip;
697 return EXCRET_fault_fixed;
698 }
700 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
701 if ( !is_kernel(eip) ||
702 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
703 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
704 goto die;
705 eip += sizeof(bug_str);
707 filename = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
708 lineno = bug.id >> 2;
710 if ( id == BUGFRAME_warn )
711 {
712 printk("Xen WARN at %.50s:%d\n", filename, lineno);
713 show_execution_state(regs);
714 regs->eip = (unsigned long)eip;
715 return EXCRET_fault_fixed;
716 }
718 if ( id == BUGFRAME_bug )
719 {
720 printk("Xen BUG at %.50s:%d\n", filename, lineno);
721 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
722 show_execution_state(regs);
723 panic("Xen BUG at %.50s:%d\n", filename, lineno);
724 }
726 /* ASSERT: decode the predicate string pointer. */
727 ASSERT(id == BUGFRAME_assert);
728 if ( !is_kernel(eip) ||
729 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
730 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
731 goto die;
732 eip += sizeof(bug_str);
734 predicate = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
735 printk("Assertion '%s' failed at %.50s:%d\n",
736 predicate, filename, lineno);
737 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
738 show_execution_state(regs);
739 panic("Assertion '%s' failed at %.50s:%d\n",
740 predicate, filename, lineno);
742 die:
743 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
744 show_execution_state(regs);
745 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
746 return 0;
747 }
749 asmlinkage int do_int3(struct cpu_user_regs *regs)
750 {
751 DEBUGGER_trap_entry(TRAP_int3, regs);
753 if ( !guest_mode(regs) )
754 {
755 DEBUGGER_trap_fatal(TRAP_int3, regs);
756 show_execution_state(regs);
757 panic("FATAL TRAP: vector = 3 (Int3)\n");
758 }
760 return do_guest_trap(TRAP_int3, regs, 0);
761 }
763 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
764 {
765 extern fastcall void (*machine_check_vector)(
766 struct cpu_user_regs *, long error_code);
767 machine_check_vector(regs, regs->error_code);
768 }
770 void propagate_page_fault(unsigned long addr, u16 error_code)
771 {
772 struct trap_info *ti;
773 struct vcpu *v = current;
774 struct trap_bounce *tb = &v->arch.trap_bounce;
776 v->arch.guest_context.ctrlreg[2] = addr;
777 arch_set_cr2(v, addr);
779 /* Re-set error_code.user flag appropriately for the guest. */
780 error_code &= ~PFEC_user_mode;
781 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
782 error_code |= PFEC_user_mode;
784 trace_pv_page_fault(addr, error_code);
786 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
787 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
788 tb->error_code = error_code;
789 tb->cs = ti->cs;
790 tb->eip = ti->address;
791 if ( TI_GET_IF(ti) )
792 tb->flags |= TBF_INTERRUPT;
793 if ( unlikely(null_trap_bounce(v, tb)) )
794 {
795 printk("Unhandled page fault in domain %d on VCPU %d (ec=%04X)\n",
796 v->domain->domain_id, v->vcpu_id, error_code);
797 show_page_walk(addr);
798 }
799 }
801 static int handle_gdt_ldt_mapping_fault(
802 unsigned long offset, struct cpu_user_regs *regs)
803 {
804 struct vcpu *curr = current;
805 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
806 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
807 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
809 /* Should never fault in another vcpu's area. */
810 BUG_ON(vcpu_area != curr->vcpu_id);
812 /* Byte offset within the gdt/ldt sub-area. */
813 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
815 if ( likely(is_ldt_area) )
816 {
817 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
818 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
819 {
820 if ( guest_mode(regs) )
821 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
822 regs->eip, offset);
823 }
824 else
825 {
826 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
827 if ( !guest_mode(regs) )
828 return 0;
829 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
830 propagate_page_fault(
831 curr->arch.guest_context.ldt_base + offset,
832 regs->error_code);
833 }
834 }
835 else
836 {
837 /* GDT fault: handle the fault as #GP(selector). */
838 regs->error_code = (u16)offset & ~7;
839 (void)do_general_protection(regs);
840 }
842 return EXCRET_fault_fixed;
843 }
845 #ifdef HYPERVISOR_VIRT_END
846 #define IN_HYPERVISOR_RANGE(va) \
847 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
848 #else
849 #define IN_HYPERVISOR_RANGE(va) \
850 (((va) >= HYPERVISOR_VIRT_START))
851 #endif
853 static int __spurious_page_fault(
854 unsigned long addr, struct cpu_user_regs *regs)
855 {
856 unsigned long mfn, cr3 = read_cr3();
857 #if CONFIG_PAGING_LEVELS >= 4
858 l4_pgentry_t l4e, *l4t;
859 #endif
860 #if CONFIG_PAGING_LEVELS >= 3
861 l3_pgentry_t l3e, *l3t;
862 #endif
863 l2_pgentry_t l2e, *l2t;
864 l1_pgentry_t l1e, *l1t;
865 unsigned int required_flags, disallowed_flags;
867 /* Reserved bit violations are never spurious faults. */
868 if ( regs->error_code & PFEC_reserved_bit )
869 return 0;
871 required_flags = _PAGE_PRESENT;
872 if ( regs->error_code & PFEC_write_access )
873 required_flags |= _PAGE_RW;
874 if ( regs->error_code & PFEC_user_mode )
875 required_flags |= _PAGE_USER;
877 disallowed_flags = 0;
878 if ( regs->error_code & PFEC_insn_fetch )
879 disallowed_flags |= _PAGE_NX;
881 mfn = cr3 >> PAGE_SHIFT;
883 #if CONFIG_PAGING_LEVELS >= 4
884 l4t = map_domain_page(mfn);
885 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
886 mfn = l4e_get_pfn(l4e);
887 unmap_domain_page(l4t);
888 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
889 (l4e_get_flags(l4e) & disallowed_flags) )
890 return 0;
891 #endif
893 #if CONFIG_PAGING_LEVELS >= 3
894 l3t = map_domain_page(mfn);
895 #ifdef CONFIG_X86_PAE
896 l3t += (cr3 & 0xFE0UL) >> 3;
897 #endif
898 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
899 mfn = l3e_get_pfn(l3e);
900 unmap_domain_page(l3t);
901 #ifdef CONFIG_X86_PAE
902 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
903 return 0;
904 #else
905 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
906 (l3e_get_flags(l3e) & disallowed_flags) )
907 return 0;
908 #endif
909 #endif
911 l2t = map_domain_page(mfn);
912 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
913 mfn = l2e_get_pfn(l2e);
914 unmap_domain_page(l2t);
915 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
916 (l2e_get_flags(l2e) & disallowed_flags) )
917 return 0;
918 if ( l2e_get_flags(l2e) & _PAGE_PSE )
919 {
920 l1e = l1e_empty(); /* define before use in debug tracing */
921 goto spurious;
922 }
924 l1t = map_domain_page(mfn);
925 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
926 mfn = l1e_get_pfn(l1e);
927 unmap_domain_page(l1t);
928 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
929 (l1e_get_flags(l1e) & disallowed_flags) )
930 return 0;
932 spurious:
933 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
934 "at addr %lx, e/c %04x\n",
935 current->domain->domain_id, current->vcpu_id,
936 addr, regs->error_code);
937 #if CONFIG_PAGING_LEVELS >= 4
938 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
939 #endif
940 #if CONFIG_PAGING_LEVELS >= 3
941 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
942 #endif
943 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
944 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
945 #ifndef NDEBUG
946 show_registers(regs);
947 #endif
948 return 1;
949 }
951 static int spurious_page_fault(
952 unsigned long addr, struct cpu_user_regs *regs)
953 {
954 unsigned long flags;
955 int is_spurious;
957 /*
958 * Disabling interrupts prevents TLB flushing, and hence prevents
959 * page tables from becoming invalid under our feet during the walk.
960 */
961 local_irq_save(flags);
962 is_spurious = __spurious_page_fault(addr, regs);
963 local_irq_restore(flags);
965 return is_spurious;
966 }
968 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
969 {
970 struct vcpu *v = current;
971 struct domain *d = v->domain;
973 /* No fixups in interrupt context or when interrupts are disabled. */
974 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
975 return 0;
977 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
978 {
979 if ( paging_mode_external(d) && guest_mode(regs) )
980 {
981 int ret = paging_fault(addr, regs);
982 if ( ret == EXCRET_fault_fixed )
983 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
984 return ret;
985 }
986 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
987 return handle_gdt_ldt_mapping_fault(
988 addr - GDT_LDT_VIRT_START, regs);
989 return 0;
990 }
992 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
993 guest_kernel_mode(v, regs) &&
994 /* Do not check if access-protection fault since the page may
995 legitimately be not present in shadow page tables */
996 ((regs->error_code & PFEC_write_access) == PFEC_write_access) &&
997 ptwr_do_page_fault(v, addr, regs) )
998 return EXCRET_fault_fixed;
1000 if ( paging_mode_enabled(d) )
1001 {
1002 int ret = paging_fault(addr, regs);
1003 if ( ret == EXCRET_fault_fixed )
1004 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1005 return ret;
1006 }
1008 return 0;
1009 }
1011 /*
1012 * #PF error code:
1013 * Bit 0: Protection violation (=1) ; Page not present (=0)
1014 * Bit 1: Write access
1015 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1016 * Bit 3: Reserved bit violation
1017 * Bit 4: Instruction fetch
1018 */
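/*
 * Worked example of the layout above (editorial): an error code of 0x03
 * (bits 0 and 1 set) is a supervisor-mode write to a present page, i.e. a
 * protection violation; 0x14 (bits 2 and 4 set) is a user-mode instruction
 * fetch from a not-present page.  These are the same bits tested via the
 * PFEC_* flags in __spurious_page_fault() and fixup_page_fault() above.
 */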
1019 asmlinkage int do_page_fault(struct cpu_user_regs *regs)
1020 {
1021 unsigned long addr, fixup;
1022 int rc;
1024 addr = read_cr2();
1026 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1028 perfc_incr(page_faults);
1030 if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
1031 return rc;
1033 if ( unlikely(!guest_mode(regs)) )
1034 {
1035 if ( spurious_page_fault(addr, regs) )
1036 return EXCRET_not_a_fault;
1038 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1039 {
1040 perfc_incr(copy_user_faults);
1041 regs->eip = fixup;
1042 return 0;
1043 }
1045 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1047 show_execution_state(regs);
1048 show_page_walk(addr);
1049 panic("FATAL PAGE FAULT\n"
1050 "[error_code=%04x]\n"
1051 "Faulting linear address: %p\n",
1052 regs->error_code, _p(addr));
1053 }
1055 propagate_page_fault(addr, regs->error_code);
1056 return 0;
1057 }
1059 /*
1060 * Early handler to deal with spurious page faults. For example, consider a
1061 * routine that uses a mapping immediately after installing it (making it
1062 * present). The CPU may speculatively execute the memory access before
1063 * executing the PTE write. The instruction will then be marked to cause a
1064 * page fault when it is retired, despite the fact that the PTE is present and
1065 * correct at that point in time.
1066 */
1067 asmlinkage int do_early_page_fault(struct cpu_user_regs *regs)
1068 {
1069 static int stuck;
1070 static unsigned long prev_eip, prev_cr2;
1071 unsigned long cr2 = read_cr2();
1073 BUG_ON(smp_processor_id() != 0);
1075 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1076 {
1077 prev_eip = regs->eip;
1078 prev_cr2 = cr2;
1079 stuck = 0;
1080 return EXCRET_not_a_fault;
1081 }
1083 if ( stuck++ == 1000 )
1084 panic("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1085 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1087 return EXCRET_not_a_fault;
1088 }
1090 long do_fpu_taskswitch(int set)
1091 {
1092 struct vcpu *v = current;
1094 if ( set )
1095 {
1096 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1097 stts();
1098 }
1099 else
1100 {
1101 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1102 if ( v->fpu_dirtied )
1103 clts();
1104 }
1106 return 0;
1107 }
1109 static int read_descriptor(unsigned int sel,
1110 const struct vcpu *v,
1111 const struct cpu_user_regs * regs,
1112 unsigned long *base,
1113 unsigned long *limit,
1114 unsigned int *ar,
1115 unsigned int vm86attr)
1116 {
1117 struct desc_struct desc;
1119 if ( !vm86_mode(regs) )
1120 {
1121 if ( sel < 4)
1122 desc.b = desc.a = 0;
1123 else if ( __get_user(desc,
1124 (const struct desc_struct *)(!(sel & 4)
1125 ? GDT_VIRT_START(v)
1126 : LDT_VIRT_START(v))
1127 + (sel >> 3)) )
1128 return 0;
1129 if ( !(vm86attr & _SEGMENT_CODE) )
1130 desc.b &= ~_SEGMENT_L;
1131 }
1132 else
1133 {
1134 desc.a = (sel << 20) | 0xffff;
1135 desc.b = vm86attr | (sel >> 12);
1136 }
1138 *ar = desc.b & 0x00f0ff00;
1139 if ( !(desc.b & _SEGMENT_L) )
1140 {
1141 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1142 (desc.b & 0xff000000));
1143 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1144 if ( desc.b & _SEGMENT_G )
1145 *limit = ((*limit + 1) << 12) - 1;
1146 #ifndef NDEBUG
1147 if ( !vm86_mode(regs) && (sel > 3) )
1148 {
1149 unsigned int a, l;
1150 unsigned char valid;
1152 asm volatile (
1153 "larl %2,%0 ; setz %1"
1154 : "=r" (a), "=rm" (valid) : "rm" (sel));
1155 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1156 asm volatile (
1157 "lsll %2,%0 ; setz %1"
1158 : "=r" (l), "=rm" (valid) : "rm" (sel));
1159 BUG_ON(valid && (l != *limit));
1160 }
1161 #endif
1162 }
1163 else
1164 {
1165 *base = 0UL;
1166 *limit = ~0UL;
1167 }
1169 return 1;
1170 }
1172 #ifdef __x86_64__
1173 static int read_gate_descriptor(unsigned int gate_sel,
1174 const struct vcpu *v,
1175 unsigned int *sel,
1176 unsigned long *off,
1177 unsigned int *ar)
1178 {
1179 struct desc_struct desc;
1180 const struct desc_struct *pdesc;
1183 pdesc = (const struct desc_struct *)(!(gate_sel & 4) ?
1184 GDT_VIRT_START(v) :
1185 LDT_VIRT_START(v))
1186 + (gate_sel >> 3);
1187 if ( gate_sel < 4 ||
1188 (gate_sel >= FIRST_RESERVED_GDT_BYTE && !(gate_sel & 4)) ||
1189 __get_user(desc, pdesc) )
1190 return 0;
1192 *sel = (desc.a >> 16) & 0x0000fffc;
1193 *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
1194 *ar = desc.b & 0x0000ffff;
1195 /*
1196 * check_descriptor() clears the DPL field and stores the
1197 * guest requested DPL in the selector's RPL field.
1198 */
1199 ASSERT(!(*ar & _SEGMENT_DPL));
1200 *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
1202 if ( !is_pv_32bit_vcpu(v) )
1203 {
1204 if ( (*ar & 0x1f00) != 0x0c00 ||
1205 (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
1206 __get_user(desc, pdesc + 1) ||
1207 (desc.b & 0x1f00) )
1208 return 0;
1210 *off |= (unsigned long)desc.a << 32;
1211 return 1;
1212 }
1214 switch ( *ar & 0x1f00 )
1215 {
1216 case 0x0400:
1217 *off &= 0xffff;
1218 break;
1219 case 0x0c00:
1220 break;
1221 default:
1222 return 0;
1223 }
1225 return 1;
1226 }
1227 #endif
1229 /* Has the guest requested sufficient permission for this I/O access? */
1230 static inline int guest_io_okay(
1231 unsigned int port, unsigned int bytes,
1232 struct vcpu *v, struct cpu_user_regs *regs)
1233 {
1234 #if defined(__x86_64__)
1235 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1236 int user_mode = !(v->arch.flags & TF_kernel_mode);
1237 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1238 #elif defined(__i386__)
1239 #define TOGGLE_MODE() ((void)0)
1240 #endif
1242 if ( !vm86_mode(regs) &&
1243 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1244 return 1;
1246 if ( v->arch.iobmp_limit > (port + bytes) )
1247 {
1248 union { uint8_t bytes[2]; uint16_t mask; } x;
1250 /*
1251 * Grab permission bytes from guest space. Inaccessible bytes are
1252 * read as 0xff (no access allowed).
1253 */
1254 TOGGLE_MODE();
1255 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1256 port>>3, 2) )
1257 {
1258 default: x.bytes[0] = ~0;
1259 case 1: x.bytes[1] = ~0;
1260 case 0: break;
1261 }
1262 TOGGLE_MODE();
1264 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1265 return 1;
1266 }
1268 return 0;
1269 }
1271 /* Has the administrator granted sufficient permission for this I/O access? */
1272 static inline int admin_io_okay(
1273 unsigned int port, unsigned int bytes,
1274 struct vcpu *v, struct cpu_user_regs *regs)
1275 {
1276 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1277 }
1279 #define guest_inb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1280 #define guest_inw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1281 #define guest_inl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1282 #define guest_outb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1283 #define guest_outw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1284 #define guest_outl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1286 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1287 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1288 __attribute__((__regparm__(1)));
1289 unsigned long guest_to_host_gpr_switch(unsigned long)
1290 __attribute__((__regparm__(1)));
1292 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1294 /* Instruction fetch with error handling. */
1295 #define insn_fetch(type, base, eip, limit) \
1296 ({ unsigned long _rc, _ptr = (base) + (eip); \
1297 type _x; \
1298 if ( ad_default < 8 ) \
1299 _ptr = (unsigned int)_ptr; \
1300 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1301 goto fail; \
1302 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1303 { \
1304 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1305 return EXCRET_fault_fixed; \
1306 } \
1307 (eip) += sizeof(_x); _x; })
1309 #if defined(CONFIG_X86_32)
1310 # define read_sreg(regs, sr) ((regs)->sr)
1311 #elif defined(CONFIG_X86_64)
1312 # define read_sreg(regs, sr) read_segment_register(sr)
1313 #endif
1315 static int emulate_privileged_op(struct cpu_user_regs *regs)
1317 struct vcpu *v = current;
1318 unsigned long *reg, eip = regs->eip, res;
1319 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1320 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1321 unsigned int port, i, data_sel, ar, data, rc;
1322 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1323 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1324 ? regs->reg \
1325 : ad_bytes == 4 \
1326 ? (u32)regs->reg \
1327 : (u16)regs->reg)
1328 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1329 ? regs->reg = (val) \
1330 : ad_bytes == 4 \
1331 ? (*(u32 *)&regs->reg = (val)) \
1332 : (*(u16 *)&regs->reg = (val)))
1333 unsigned long code_base, code_limit;
1334 char io_emul_stub[16];
1335 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1336 u32 l, h, eax, edx;
1338 if ( !read_descriptor(regs->cs, v, regs,
1339 &code_base, &code_limit, &ar,
1340 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1341 goto fail;
1342 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1343 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1344 if ( !(ar & _SEGMENT_S) ||
1345 !(ar & _SEGMENT_P) ||
1346 !(ar & _SEGMENT_CODE) )
1347 goto fail;
1349 /* emulating only opcodes not allowing SS to be default */
1350 data_sel = read_sreg(regs, ds);
1352 /* Legacy prefixes. */
1353 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1355 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1357 case 0x66: /* operand-size override */
1358 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1359 continue;
1360 case 0x67: /* address-size override */
1361 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1362 continue;
1363 case 0x2e: /* CS override */
1364 data_sel = regs->cs;
1365 continue;
1366 case 0x3e: /* DS override */
1367 data_sel = read_sreg(regs, ds);
1368 continue;
1369 case 0x26: /* ES override */
1370 data_sel = read_sreg(regs, es);
1371 continue;
1372 case 0x64: /* FS override */
1373 data_sel = read_sreg(regs, fs);
1374 lm_ovr = lm_seg_fs;
1375 continue;
1376 case 0x65: /* GS override */
1377 data_sel = read_sreg(regs, gs);
1378 lm_ovr = lm_seg_gs;
1379 continue;
1380 case 0x36: /* SS override */
1381 data_sel = regs->ss;
1382 continue;
1383 case 0xf0: /* LOCK */
1384 lock = 1;
1385 continue;
1386 case 0xf2: /* REPNE/REPNZ */
1387 case 0xf3: /* REP/REPE/REPZ */
1388 rep_prefix = 1;
1389 continue;
1390 default:
1391 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1393 rex = opcode;
1394 continue;
1396 break;
1398 break;
1401 /* REX prefix. */
1402 if ( rex & 8 ) /* REX.W */
1403 op_bytes = 4; /* emulating only opcodes not supporting 64-bit operands */
1404 modrm_reg = (rex & 4) << 1; /* REX.R */
1405 /* REX.X does not need to be decoded. */
1406 modrm_rm = (rex & 1) << 3; /* REX.B */
1408 if ( opcode == 0x0f )
1409 goto twobyte_opcode;
1411 if ( lock )
1412 goto fail;
1414 /* Input/Output String instructions. */
1415 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1417 unsigned long data_base, data_limit;
1419 if ( rep_prefix && (rd_ad(ecx) == 0) )
1420 goto done;
1422 if ( !(opcode & 2) )
1424 data_sel = read_sreg(regs, es);
1425 lm_ovr = lm_seg_none;
1428 if ( !(ar & _SEGMENT_L) )
1430 if ( !read_descriptor(data_sel, v, regs,
1431 &data_base, &data_limit, &ar,
1432 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1433 goto fail;
1434 if ( !(ar & _SEGMENT_S) ||
1435 !(ar & _SEGMENT_P) ||
1436 (opcode & 2 ?
1437 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1438 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1439 goto fail;
1441 #ifdef CONFIG_X86_64
1442 else
1444 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1446 switch ( lm_ovr )
1448 case lm_seg_none:
1449 data_base = 0UL;
1450 break;
1451 case lm_seg_fs:
1452 data_base = v->arch.guest_context.fs_base;
1453 break;
1454 case lm_seg_gs:
1455 if ( guest_kernel_mode(v, regs) )
1456 data_base = v->arch.guest_context.gs_base_kernel;
1457 else
1458 data_base = v->arch.guest_context.gs_base_user;
1459 break;
1462 else
1463 read_descriptor(data_sel, v, regs,
1464 &data_base, &data_limit, &ar,
1465 0);
1466 data_limit = ~0UL;
1467 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1469 #endif
1471 continue_io_string:
1472 switch ( opcode )
1474 case 0x6c: /* INSB */
1475 op_bytes = 1;
1476 case 0x6d: /* INSW/INSL */
1477 if ( data_limit < op_bytes - 1 ||
1478 rd_ad(edi) > data_limit - (op_bytes - 1) ||
1479 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1480 goto fail;
1481 port = (u16)regs->edx;
1482 switch ( op_bytes )
1484 case 1:
1485 /* emulate PIT counter 2 */
1486 data = (u8)(guest_inb_okay(port, v, regs) ? inb(port) :
1487 ((port == 0x42 || port == 0x43 || port == 0x61) ?
1488 pv_pit_handler(port, 0, 0) : ~0));
1489 break;
1490 case 2:
1491 data = (u16)(guest_inw_okay(port, v, regs) ? inw(port) : ~0);
1492 break;
1493 case 4:
1494 data = (u32)(guest_inl_okay(port, v, regs) ? inl(port) : ~0);
1495 break;
1497 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 )
1499 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1500 PFEC_write_access);
1501 return EXCRET_fault_fixed;
1503 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1504 break;
1506 case 0x6e: /* OUTSB */
1507 op_bytes = 1;
1508 case 0x6f: /* OUTSW/OUTSL */
1509 if ( data_limit < op_bytes - 1 ||
1510 rd_ad(esi) > data_limit - (op_bytes - 1) ||
1511 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1512 goto fail;
1513 rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes);
1514 if ( rc != 0 )
1516 propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0);
1517 return EXCRET_fault_fixed;
1519 port = (u16)regs->edx;
1520 switch ( op_bytes )
1522 case 1:
1523 if ( guest_outb_okay(port, v, regs) )
1525 outb((u8)data, port);
1526 if ( pv_post_outb_hook )
1527 pv_post_outb_hook(port, data);
1529 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1530 pv_pit_handler(port, data, 1);
1531 break;
1532 case 2:
1533 if ( guest_outw_okay(port, v, regs) )
1534 outw((u16)data, port);
1535 break;
1536 case 4:
1537 if ( guest_outl_okay(port, v, regs) )
1538 outl((u32)data, port);
1539 break;
1541 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1542 break;
1545 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1547 if ( !hypercall_preempt_check() )
1548 goto continue_io_string;
1549 eip = regs->eip;
1552 goto done;
1555 /*
1556 * Very likely to be an I/O instruction (IN/OUT).
1557 * Build an on-stack stub to execute the instruction with full guest
1558 * GPR context. This is needed for some systems which (ab)use IN/OUT
1559 * to communicate with BIOS code in system-management mode.
1560 */
1561 #ifdef __x86_64__
1562 /* movq $host_to_guest_gpr_switch,%rcx */
1563 io_emul_stub[0] = 0x48;
1564 io_emul_stub[1] = 0xb9;
1565 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1566 /* callq *%rcx */
1567 io_emul_stub[10] = 0xff;
1568 io_emul_stub[11] = 0xd1;
1569 #else
1570 /* call host_to_guest_gpr_switch */
1571 io_emul_stub[0] = 0xe8;
1572 *(s32 *)&io_emul_stub[1] =
1573 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1574 /* 7 x nop */
1575 memset(&io_emul_stub[5], 0x90, 7);
1576 #endif
1577 /* data16 or nop */
1578 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1579 /* <io-access opcode> */
1580 io_emul_stub[13] = opcode;
1581 /* imm8 or nop */
1582 io_emul_stub[14] = 0x90;
1583 /* ret (jumps to guest_to_host_gpr_switch) */
1584 io_emul_stub[15] = 0xc3;
1586 /* Handy function-typed pointer to the stub. */
1587 io_emul = (void *)io_emul_stub;
1589 /* I/O Port and Interrupt Flag instructions. */
1590 switch ( opcode )
1592 case 0xe4: /* IN imm8,%al */
1593 op_bytes = 1;
1594 case 0xe5: /* IN imm8,%eax */
1595 port = insn_fetch(u8, code_base, eip, code_limit);
1596 io_emul_stub[14] = port; /* imm8 */
1597 exec_in:
1598 if ( !guest_io_okay(port, op_bytes, v, regs) )
1599 goto fail;
1600 switch ( op_bytes )
1602 case 1:
1603 if ( guest_inb_okay(port, v, regs) )
1604 io_emul(regs);
1605 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1607 regs->eax &= ~0xffUL;
1608 regs->eax |= pv_pit_handler(port, 0, 0);
1610 else
1611 regs->eax |= (u8)~0;
1612 break;
1613 case 2:
1614 if ( guest_inw_okay(port, v, regs) )
1615 io_emul(regs);
1616 else
1617 regs->eax |= (u16)~0;
1618 break;
1619 case 4:
1620 if ( guest_inl_okay(port, v, regs) )
1621 io_emul(regs);
1622 else
1623 regs->eax = (u32)~0;
1624 break;
1626 goto done;
1628 case 0xec: /* IN %dx,%al */
1629 op_bytes = 1;
1630 case 0xed: /* IN %dx,%eax */
1631 port = (u16)regs->edx;
1632 goto exec_in;
1634 case 0xe6: /* OUT %al,imm8 */
1635 op_bytes = 1;
1636 case 0xe7: /* OUT %eax,imm8 */
1637 port = insn_fetch(u8, code_base, eip, code_limit);
1638 io_emul_stub[14] = port; /* imm8 */
1639 exec_out:
1640 if ( !guest_io_okay(port, op_bytes, v, regs) )
1641 goto fail;
1642 switch ( op_bytes )
1644 case 1:
1645 if ( guest_outb_okay(port, v, regs) )
1647 io_emul(regs);
1648 if ( pv_post_outb_hook )
1649 pv_post_outb_hook(port, regs->eax);
1651 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1652 pv_pit_handler(port, regs->eax, 1);
1653 break;
1654 case 2:
1655 if ( guest_outw_okay(port, v, regs) )
1656 io_emul(regs);
1657 break;
1658 case 4:
1659 if ( guest_outl_okay(port, v, regs) )
1660 io_emul(regs);
1661 break;
1663 goto done;
1665 case 0xee: /* OUT %al,%dx */
1666 op_bytes = 1;
1667 case 0xef: /* OUT %eax,%dx */
1668 port = (u16)regs->edx;
1669 goto exec_out;
1671 case 0xfa: /* CLI */
1672 case 0xfb: /* STI */
1673 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1674 goto fail;
1675 /*
1676 * This is just too dangerous to allow, in my opinion. Consider if the
1677 * caller then tries to reenable interrupts using POPF: we can't trap
1678 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1679 * do for us. :-)
1680 */
1681 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1682 goto done;
1685 /* No decode of this single-byte opcode. */
1686 goto fail;
1688 twobyte_opcode:
1689 /* Two-byte opcodes only emulated from guest kernel. */
1690 if ( !guest_kernel_mode(v, regs) )
1691 goto fail;
1693 /* Privileged (ring 0) instructions. */
1694 opcode = insn_fetch(u8, code_base, eip, code_limit);
1695 if ( lock && (opcode & ~3) != 0x20 )
1696 goto fail;
1697 switch ( opcode )
1699 case 0x06: /* CLTS */
1700 (void)do_fpu_taskswitch(0);
1701 break;
1703 case 0x09: /* WBINVD */
1704 /* Ignore the instruction if unprivileged. */
1705 if ( !cache_flush_permitted(v->domain) )
1706 /* Non-physdev domain attempted WBINVD; ignore for now since
1707 newer linux uses this in some start-of-day timing loops */
1709 else
1710 wbinvd();
1711 break;
1713 case 0x20: /* MOV CR?,<reg> */
1714 opcode = insn_fetch(u8, code_base, eip, code_limit);
1715 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1716 modrm_rm |= (opcode >> 0) & 7;
1717 reg = decode_register(modrm_rm, regs, 0);
1718 switch ( modrm_reg )
1720 case 0: /* Read CR0 */
1721 *reg = (read_cr0() & ~X86_CR0_TS) |
1722 v->arch.guest_context.ctrlreg[0];
1723 break;
1725 case 2: /* Read CR2 */
1726 *reg = v->arch.guest_context.ctrlreg[2];
1727 break;
1729 case 3: /* Read CR3 */
1730 if ( !is_pv_32on64_vcpu(v) )
1731 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1732 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1733 #ifdef CONFIG_COMPAT
1734 else
1735 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
1736 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
1737 #endif
1738 break;
1740 case 4: /* Read CR4 */
1741 /*
1742 * Guests can read CR4 to see what features Xen has enabled. We
1743 * therefore lie about PGE & PSE as they are unavailable to guests.
1744 */
1745 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1746 break;
1748 default:
1749 goto fail;
1751 break;
1753 case 0x21: /* MOV DR?,<reg> */
1754 opcode = insn_fetch(u8, code_base, eip, code_limit);
1755 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1756 modrm_rm |= (opcode >> 0) & 7;
1757 reg = decode_register(modrm_rm, regs, 0);
1758 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1759 goto fail;
1760 *reg = res;
1761 break;
1763 case 0x22: /* MOV <reg>,CR? */
1764 opcode = insn_fetch(u8, code_base, eip, code_limit);
1765 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1766 modrm_rm |= (opcode >> 0) & 7;
1767 reg = decode_register(modrm_rm, regs, 0);
1768 switch ( modrm_reg )
1770 case 0: /* Write CR0 */
1771 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1773 gdprintk(XENLOG_WARNING,
1774 "Attempt to change unmodifiable CR0 flags.\n");
1775 goto fail;
1777 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1778 break;
1780 case 2: /* Write CR2 */
1781 v->arch.guest_context.ctrlreg[2] = *reg;
1782 arch_set_cr2(v, *reg);
1783 break;
1785 case 3: /* Write CR3 */
1786 LOCK_BIGLOCK(v->domain);
1787 if ( !is_pv_32on64_vcpu(v) )
1788 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1789 #ifdef CONFIG_COMPAT
1790 else
1791 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
1792 #endif
1793 UNLOCK_BIGLOCK(v->domain);
1794 if ( rc == 0 ) /* not okay */
1795 goto fail;
1796 break;
1798 case 4: /* Write CR4 */
1799 v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(*reg);
1800 write_cr4(pv_guest_cr4_to_real_cr4(
1801 v->arch.guest_context.ctrlreg[4]));
1802 break;
1804 default:
1805 goto fail;
1807 break;
1809 case 0x23: /* MOV <reg>,DR? */
1810 opcode = insn_fetch(u8, code_base, eip, code_limit);
1811 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1812 modrm_rm |= (opcode >> 0) & 7;
1813 reg = decode_register(modrm_rm, regs, 0);
1814 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1815 goto fail;
1816 break;
1818 case 0x30: /* WRMSR */
1819 eax = regs->eax;
1820 edx = regs->edx;
1821 res = ((u64)edx << 32) | eax;
1822 switch ( regs->ecx )
1824 #ifdef CONFIG_X86_64
1825 case MSR_FS_BASE:
1826 if ( is_pv_32on64_vcpu(v) )
1827 goto fail;
1828 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
1829 goto fail;
1830 v->arch.guest_context.fs_base = res;
1831 break;
1832 case MSR_GS_BASE:
1833 if ( is_pv_32on64_vcpu(v) )
1834 goto fail;
1835 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
1836 goto fail;
1837 v->arch.guest_context.gs_base_kernel = res;
1838 break;
1839 case MSR_SHADOW_GS_BASE:
1840 if ( is_pv_32on64_vcpu(v) )
1841 goto fail;
1842 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
1843 goto fail;
1844 v->arch.guest_context.gs_base_user = res;
1845 break;
1846 #endif
1847 case MSR_K7_FID_VID_STATUS:
1848 case MSR_K7_FID_VID_CTL:
1849 case MSR_K8_PSTATE_LIMIT:
1850 case MSR_K8_PSTATE_CTRL:
1851 case MSR_K8_PSTATE_STATUS:
1852 case MSR_K8_PSTATE0:
1853 case MSR_K8_PSTATE1:
1854 case MSR_K8_PSTATE2:
1855 case MSR_K8_PSTATE3:
1856 case MSR_K8_PSTATE4:
1857 case MSR_K8_PSTATE5:
1858 case MSR_K8_PSTATE6:
1859 case MSR_K8_PSTATE7:
1860 if ( (cpufreq_controller != FREQCTL_dom0_kernel) ||
1861 (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) ||
1862 wrmsr_safe(regs->ecx, eax, edx) )
1863 goto fail;
1864 break;
1865 case MSR_IA32_PERF_CTL:
1866 if ( (cpufreq_controller != FREQCTL_dom0_kernel) ||
1867 (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
1868 wrmsr_safe(regs->ecx, eax, edx) )
1869 goto fail;
1870 break;
1871 default:
1872 if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
1873 break;
1874 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1875 (eax != l) || (edx != h) )
1876 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
1877 "%08x:%08x to %08x:%08x.\n",
1878 _p(regs->ecx), h, l, edx, eax);
1879 break;
1881 break;
1883 case 0x31: /* RDTSC */
1884 rdtsc(regs->eax, regs->edx);
1885 break;
1887 case 0x32: /* RDMSR */
1888 switch ( regs->ecx )
1890 #ifdef CONFIG_X86_64
1891 case MSR_FS_BASE:
1892 if ( is_pv_32on64_vcpu(v) )
1893 goto fail;
1894 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1895 regs->edx = v->arch.guest_context.fs_base >> 32;
1896 break;
1897 case MSR_GS_BASE:
1898 if ( is_pv_32on64_vcpu(v) )
1899 goto fail;
1900 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1901 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1902 break;
1903 case MSR_SHADOW_GS_BASE:
1904 if ( is_pv_32on64_vcpu(v) )
1905 goto fail;
1906 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1907 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1908 break;
1909 #endif
1910 case MSR_K7_FID_VID_CTL:
1911 case MSR_K7_FID_VID_STATUS:
1912 case MSR_K8_PSTATE_LIMIT:
1913 case MSR_K8_PSTATE_CTRL:
1914 case MSR_K8_PSTATE_STATUS:
1915 case MSR_K8_PSTATE0:
1916 case MSR_K8_PSTATE1:
1917 case MSR_K8_PSTATE2:
1918 case MSR_K8_PSTATE3:
1919 case MSR_K8_PSTATE4:
1920 case MSR_K8_PSTATE5:
1921 case MSR_K8_PSTATE6:
1922 case MSR_K8_PSTATE7:
1923 if ( (cpufreq_controller != FREQCTL_dom0_kernel) ||
1924 (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) ||
1925 rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1926 goto fail;
1927 break;
1928 case MSR_EFER:
1929 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1930 goto fail;
1931 break;
1932 default:
1933 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
1935 regs->eax = l;
1936 regs->edx = h;
1937 break;
1939 /* Everyone can read the MSR space. */
1940 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
1941 _p(regs->ecx));*/
1942 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1943 goto fail;
1944 break;
1946 break;
1948 default:
1949 goto fail;
1952 #undef wr_ad
1953 #undef rd_ad
1955 done:
1956 regs->eip = eip;
1957 regs->eflags &= ~X86_EFLAGS_RF;
1958 return EXCRET_fault_fixed;
1960 fail:
1961 return 0;
1964 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
1965 unsigned int esp, unsigned int decr)
1966 {
1967 return (((esp - decr) < (esp - 1)) &&
1968 (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
1969 }
1971 static int emulate_gate_op(struct cpu_user_regs *regs)
1973 #ifdef __x86_64__
1974 struct vcpu *v = current;
1975 unsigned int sel, ar, dpl, nparm, opnd_sel;
1976 unsigned int op_default, op_bytes, ad_default, ad_bytes;
1977 unsigned long off, eip, opnd_off, base, limit;
1978 int jump;
1980 /* Check whether this fault is due to the use of a call gate. */
1981 if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
1982 ((ar >> 13) & 3) < (regs->cs & 3) ||
1983 (ar & _SEGMENT_TYPE) != 0xc00 )
1984 return do_guest_trap(TRAP_gp_fault, regs, 1);
1985 if ( !(ar & _SEGMENT_P) )
1986 return do_guest_trap(TRAP_no_segment, regs, 1);
1987 dpl = (ar >> 13) & 3;
1988 nparm = ar & 0x1f;
1990 /*
1991 * Decode instruction (and perhaps operand) to determine RPL,
1992 * whether this is a jump or a call, and the call return offset.
1993 */
1994 if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
1995 !(ar & _SEGMENT_S) ||
1996 !(ar & _SEGMENT_P) ||
1997 !(ar & _SEGMENT_CODE) )
1998 return do_guest_trap(TRAP_gp_fault, regs, 1);
2000 op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
2001 ad_default = ad_bytes = op_default;
2002 opnd_sel = opnd_off = 0;
2003 jump = -1;
2004 for ( eip = regs->eip; eip - regs->_eip < 10; )
2006 switch ( insn_fetch(u8, base, eip, limit) )
2008 case 0x66: /* operand-size override */
2009 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
2010 continue;
2011 case 0x67: /* address-size override */
2012 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
2013 continue;
2014 case 0x2e: /* CS override */
2015 opnd_sel = regs->cs;
2016 ASSERT(opnd_sel);
2017 continue;
2018 case 0x3e: /* DS override */
2019 opnd_sel = read_sreg(regs, ds);
2020 if ( !opnd_sel )
2021 opnd_sel = dpl;
2022 continue;
2023 case 0x26: /* ES override */
2024 opnd_sel = read_sreg(regs, es);
2025 if ( !opnd_sel )
2026 opnd_sel = dpl;
2027 continue;
2028 case 0x64: /* FS override */
2029 opnd_sel = read_sreg(regs, fs);
2030 if ( !opnd_sel )
2031 opnd_sel = dpl;
2032 continue;
2033 case 0x65: /* GS override */
2034 opnd_sel = read_sreg(regs, gs);
2035 if ( !opnd_sel )
2036 opnd_sel = dpl;
2037 continue;
2038 case 0x36: /* SS override */
2039 opnd_sel = regs->ss;
2040 if ( !opnd_sel )
2041 opnd_sel = dpl;
2042 continue;
2043 case 0xea:
2044 ++jump;
2045 /* FALLTHROUGH */
2046 case 0x9a:
2047 ++jump;
2048 opnd_sel = regs->cs;
2049 opnd_off = eip;
2050 ad_bytes = ad_default;
2051 eip += op_bytes + 2;
2052 break;
2053 case 0xff:
2055 unsigned int modrm;
2057 switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
2059 case 0x28: case 0x68: case 0xa8:
2060 ++jump;
2061 /* FALLTHROUGH */
2062 case 0x18: case 0x58: case 0x98:
2063 ++jump;
2064 if ( ad_bytes != 2 )
2066 if ( (modrm & 7) == 4 )
2068 unsigned int sib = insn_fetch(u8, base, eip, limit);
2070 modrm = (modrm & ~7) | (sib & 7);
2071 if ( (sib >>= 3) != 4 )
2072 opnd_off = *(unsigned long *)decode_register(sib & 7, regs, 0);
2073 opnd_off <<= sib >> 3;
2075 if ( (modrm & 7) != 5 || (modrm & 0xc0) )
2076 opnd_off += *(unsigned long *)decode_register(modrm & 7, regs, 0);
2077 else
2078 modrm |= 0x87;
2079 if ( !opnd_sel )
2081 switch ( modrm & 7 )
2083 default:
2084 opnd_sel = read_sreg(regs, ds);
2085 break;
2086 case 4: case 5:
2087 opnd_sel = regs->ss;
2088 break;
2092 else
2094 switch ( modrm & 7 )
2096 case 0: case 1: case 7:
2097 opnd_off = regs->ebx;
2098 break;
2099 case 6:
2100 if ( !(modrm & 0xc0) )
2101 modrm |= 0x80;
2102 else
2103 case 2: case 3:
2105 opnd_off = regs->ebp;
2106 if ( !opnd_sel )
2107 opnd_sel = regs->ss;
2109 break;
2111 if ( !opnd_sel )
2112 opnd_sel = read_sreg(regs, ds);
2113 switch ( modrm & 7 )
2115 case 0: case 2: case 4:
2116 opnd_off += regs->esi;
2117 break;
2118 case 1: case 3: case 5:
2119 opnd_off += regs->edi;
2120 break;
2123 switch ( modrm & 0xc0 )
2125 case 0x40:
2126 opnd_off += insn_fetch(s8, base, eip, limit);
2127 break;
2128 case 0x80:
2129 opnd_off += insn_fetch(s32, base, eip, limit);
2130 break;
2132 if ( ad_bytes == 4 )
2133 opnd_off = (unsigned int)opnd_off;
2134 else if ( ad_bytes == 2 )
2135 opnd_off = (unsigned short)opnd_off;
2136 break;
2139 break;
2141 break;
2144 if ( jump < 0 )
2146 fail:
2147 return do_guest_trap(TRAP_gp_fault, regs, 1);
2150 if ( (opnd_sel != regs->cs &&
2151 !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
2152 !(ar & _SEGMENT_S) ||
2153 !(ar & _SEGMENT_P) ||
2154 ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
2155 return do_guest_trap(TRAP_gp_fault, regs, 1);
2157 opnd_off += op_bytes;
2158 #define ad_default ad_bytes
2159 opnd_sel = insn_fetch(u16, base, opnd_off, limit);
2160 #undef ad_default
2161 ASSERT((opnd_sel & ~3) == regs->error_code);
2162 if ( dpl < (opnd_sel & 3) )
2163 return do_guest_trap(TRAP_gp_fault, regs, 1);
2165 if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
2166 !(ar & _SEGMENT_S) ||
2167 !(ar & _SEGMENT_CODE) ||
2168 (!jump || (ar & _SEGMENT_EC) ?
2169 ((ar >> 13) & 3) > (regs->cs & 3) :
2170 ((ar >> 13) & 3) != (regs->cs & 3)) )
2172 regs->error_code = sel;
2173 return do_guest_trap(TRAP_gp_fault, regs, 1);
2175 if ( !(ar & _SEGMENT_P) )
2177 regs->error_code = sel;
2178 return do_guest_trap(TRAP_no_segment, regs, 1);
2180 if ( off > limit )
2182 regs->error_code = 0;
2183 return do_guest_trap(TRAP_gp_fault, regs, 1);
2186 if ( !jump )
2188 unsigned int ss, esp, *stkp;
2189 int rc;
2190 #define push(item) do \
2191 { \
2192 --stkp; \
2193 esp -= 4; \
2194 rc = __put_user(item, stkp); \
2195 if ( rc ) \
2196 { \
2197 propagate_page_fault((unsigned long)(stkp + 1) - rc, \
2198 PFEC_write_access); \
2199 return 0; \
2200 } \
2201 } while ( 0 )
2203 if ( ((ar >> 13) & 3) < (regs->cs & 3) )
2205 sel |= (ar >> 13) & 3;
2206 /* Inner stack known only for kernel ring. */
2207 if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
2208 return do_guest_trap(TRAP_gp_fault, regs, 1);
2209 esp = v->arch.guest_context.kernel_sp;
2210 ss = v->arch.guest_context.kernel_ss;
2211 if ( (ss & 3) != (sel & 3) ||
2212 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2213 ((ar >> 13) & 3) != (sel & 3) ||
2214 !(ar & _SEGMENT_S) ||
2215 (ar & _SEGMENT_CODE) ||
2216 !(ar & _SEGMENT_WR) )
2218 regs->error_code = ss & ~3;
2219 return do_guest_trap(TRAP_invalid_tss, regs, 1);
2221 if ( !(ar & _SEGMENT_P) ||
2222 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
2224 regs->error_code = ss & ~3;
2225 return do_guest_trap(TRAP_stack_error, regs, 1);
2227 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2228 if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
2229 return do_guest_trap(TRAP_gp_fault, regs, 1);
2230 push(regs->ss);
2231 push(regs->esp);
2232 if ( nparm )
2234 const unsigned int *ustkp;
2236 if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
2237 ((ar >> 13) & 3) != (regs->cs & 3) ||
2238 !(ar & _SEGMENT_S) ||
2239 (ar & _SEGMENT_CODE) ||
2240 !(ar & _SEGMENT_WR) ||
2241 !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
2242 return do_guest_trap(TRAP_gp_fault, regs, 1);
2243 ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
2244 if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
2245 return do_guest_trap(TRAP_gp_fault, regs, 1);
2246 do
2248 unsigned int parm;
2250 --ustkp;
2251 rc = __get_user(parm, ustkp);
2252 if ( rc )
2254 propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
2255 return 0;
2257 push(parm);
2258 } while ( --nparm );
2261 else
2263 sel |= (regs->cs & 3);
2264 esp = regs->esp;
2265 ss = regs->ss;
2266 if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2267 ((ar >> 13) & 3) != (sel & 3) )
2268 return do_guest_trap(TRAP_gp_fault, regs, 1);
2269 if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
2271 regs->error_code = 0;
2272 return do_guest_trap(TRAP_stack_error, regs, 1);
2274 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2275 if ( !compat_access_ok(stkp - 2, 2 * 4) )
2276 return do_guest_trap(TRAP_gp_fault, regs, 1);
2278 push(regs->cs);
2279 push(eip);
2280 #undef push
2281 regs->esp = esp;
2282 regs->ss = ss;
2284 else
2285 sel |= (regs->cs & 3);
2287 regs->eip = off;
2288 regs->cs = sel;
2289 #endif
2291 return 0;
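/*
 * Illustrative sketch (not part of the original source): the descriptor
 * layout the emulation above assumes, i.e. a standard IA-32 call gate.
 * read_gate_descriptor() is expected to unpack it into the target selector,
 * the entry offset and an access-rights word, so that (ar >> 13) & 3 is the
 * gate DPL, ar & 0x1f the parameter count, and (ar & _SEGMENT_TYPE) == 0xc00
 * identifies a 32-bit call gate:
 *
 *     // hypothetical view of the 8-byte gate descriptor
 *     struct call_gate {
 *         uint16_t offset_lo;   // target offset 15:0
 *         uint16_t selector;    // target code segment selector
 *         uint8_t  nparm;       // low 5 bits: dwords copied to the new stack
 *         uint8_t  access;      // P, DPL, type (0xC = 32-bit call gate)
 *         uint16_t offset_hi;   // target offset 31:16
 *     };
 */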
2294 asmlinkage int do_general_protection(struct cpu_user_regs *regs)
2296 struct vcpu *v = current;
2297 unsigned long fixup;
2299 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
2301 if ( regs->error_code & 1 )
2302 goto hardware_gp;
2304 if ( !guest_mode(regs) )
2305 goto gp_in_kernel;
2307 /*
2308 * Cunning trick to allow arbitrary "INT n" handling.
2310 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
2311 * instruction from trapping to the appropriate vector, when that might not
2312 * be expected by Xen or the guest OS. For example, that entry might be for
2313 * a fault handler (unlike traps, faults don't increment EIP), or might
2314 * expect an error code on the stack (which a software trap never
2315 * provides), or might be a hardware interrupt handler that doesn't like
2316 * being called spuriously.
2318 * Instead, a GPF occurs with the faulting IDT vector in the error code.
2319 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
2320 * clear to indicate that it's a software fault, not hardware.
2322 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
2323 * okay because they can only be triggered by an explicit DPL-checked
2324 * instruction. The DPL specified by the guest OS for these vectors is NOT
2325 * CHECKED!!
2326 */
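/*
 * Illustrative worked example (not part of the original source): a guest
 * executes "int $0x80" while the real IDT entry for vector 0x80 has DPL 0.
 * The CPU raises #GP with error code (0x80 << 3) | 2 = 0x402: bit 1 is set
 * because an IDT entry was the source, and bit 0 is clear because the event
 * was a software INT rather than an external one. The test below recognises
 * that pattern, error_code >> 3 recovers vector 0x80, and the fault is then
 * bounced through the guest's virtual IDT if its registered DPL permits.
 */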
2327 if ( (regs->error_code & 3) == 2 )
2329 /* This fault must be due to <INT n> instruction. */
2330 const struct trap_info *ti;
2331 unsigned char vector = regs->error_code >> 3;
2332 ti = &v->arch.guest_context.trap_ctxt[vector];
2333 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
2335 regs->eip += 2;
2336 return do_guest_trap(vector, regs, 0);
2339 else if ( is_pv_32on64_vcpu(v) && regs->error_code )
2340 return emulate_gate_op(regs);
2342 /* Emulate some simple privileged and I/O instructions. */
2343 if ( (regs->error_code == 0) &&
2344 emulate_privileged_op(regs) )
2346 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
2347 return 0;
2350 #if defined(__i386__)
2351 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
2352 (regs->error_code == 0) &&
2353 gpf_emulate_4gb(regs) )
2355 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
2356 return 0;
2358 #endif
2360 /* Pass on GPF as is. */
2361 return do_guest_trap(TRAP_gp_fault, regs, 1);
2363 gp_in_kernel:
2365 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
2367 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
2368 regs->error_code, _p(regs->eip), _p(fixup));
2369 regs->eip = fixup;
2370 return 0;
2373 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
2375 hardware_gp:
2376 show_execution_state(regs);
2377 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
2378 return 0;
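/*
 * Illustrative sketch (not part of the original source): where the fixup
 * addresses consumed in gp_in_kernel above come from. Hypervisor code that
 * may legitimately fault (segment-register loads, uaccess copies, MSR
 * wrappers) pairs the faulting instruction with a recovery label in the
 * __ex_table section, roughly as below; operand names and the table word
 * size are assumptions for the sketch:
 *
 *     asm volatile ( "1: mov %1,%%gs           \n"
 *                    "2:                       \n"
 *                    ".section .fixup,\"ax\"   \n"
 *                    "3: mov %2,%0             \n"   // report failure
 *                    "   jmp 2b                \n"
 *                    ".previous                \n"
 *                    ".section __ex_table,\"a\"\n"
 *                    "   .quad 1b, 3b          \n"   // .long on 32-bit builds
 *                    ".previous"
 *                    : "+r" (err)
 *                    : "rm" (sel), "i" (-EFAULT) );
 *
 * search_exception_table(regs->eip) maps the faulting address 1b to the
 * fixup 3b, so execution resumes at the recovery code instead of panicking.
 */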
2381 static void nmi_softirq(void)
2382 {
2383 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
2384 vcpu_kick(dom0->vcpu[0]);
2385 }
2387 static void nmi_dom0_report(unsigned int reason_idx)
2389 struct domain *d;
2390 struct vcpu *v;
2392 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
2393 return;
2395 set_bit(reason_idx, nmi_reason(d));
2397 if ( !test_and_set_bool(v->nmi_pending) )
2398 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
2401 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
2403 switch ( opt_nmi[0] )
2405 case 'd': /* 'dom0' */
2406 nmi_dom0_report(_XEN_NMIREASON_parity_error);
2407 case 'i': /* 'ignore' */
2408 break;
2409 default: /* 'fatal' */
2410 console_force_unlock();
2411 printk("\n\nNMI - MEMORY ERROR\n");
2412 fatal_trap(TRAP_nmi, regs);
2415 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
2416 mdelay(1);
2417 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
2420 asmlinkage void io_check_error(struct cpu_user_regs *regs)
2422 switch ( opt_nmi[0] )
2424 case 'd': /* 'dom0' */
2425 nmi_dom0_report(_XEN_NMIREASON_io_error);
2426 case 'i': /* 'ignore' */
2427 break;
2428 default: /* 'fatal' */
2429 console_force_unlock();
2430 printk("\n\nNMI - I/O ERROR\n");
2431 fatal_trap(TRAP_nmi, regs);
2434 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
2435 mdelay(1);
2436 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
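/*
 * Illustrative note (not part of the original source) on the port 0x61
 * accesses in the two handlers above. Port 0x61 is the NMI status/control
 * port: on read, bit 7 reports a memory parity/SERR# NMI and bit 6 an IOCHK
 * NMI; on write, only bits 0-3 are writable, and setting bit 2 (parity) or
 * bit 3 (IOCHK) clears the latched error and masks further checks, while
 * clearing it re-enables them. Hence the paired outb()s, e.g. for parity:
 *
 *     outb((inb(0x61) & 0x0f) | 0x04, 0x61);   // latch cleared, check masked
 *     mdelay(1);
 *     outb((inb(0x61) & 0x0b) | 0x00, 0x61);   // parity check re-enabled
 */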
2439 static void unknown_nmi_error(unsigned char reason)
2441 switch ( opt_nmi[0] )
2443 case 'd': /* 'dom0' */
2444 nmi_dom0_report(_XEN_NMIREASON_unknown);
2445 case 'i': /* 'ignore' */
2446 break;
2447 default: /* 'fatal' */
2448 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
2449 printk("Dazed and confused, but trying to continue\n");
2450 printk("Do you have a strange power saving mode enabled?\n");
2451 kexec_crash();
2455 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
2456 {
2457 return 0;
2458 }
2460 static nmi_callback_t nmi_callback = dummy_nmi_callback;
2462 asmlinkage void do_nmi(struct cpu_user_regs *regs)
2464 unsigned int cpu = smp_processor_id();
2465 unsigned char reason;
2467 ++nmi_count(cpu);
2469 if ( nmi_callback(regs, cpu) )
2470 return;
2472 if ( nmi_watchdog )
2473 nmi_watchdog_tick(regs);
2475 /* Only the BSP gets external NMIs from the system. */
2476 if ( cpu == 0 )
2478 reason = inb(0x61);
2479 if ( reason & 0x80 )
2480 mem_parity_error(regs);
2481 else if ( reason & 0x40 )
2482 io_check_error(regs);
2483 else if ( !nmi_watchdog )
2484 unknown_nmi_error((unsigned char)(reason&0xff));
2488 void set_nmi_callback(nmi_callback_t callback)
2489 {
2490 nmi_callback = callback;
2491 }

2493 void unset_nmi_callback(void)
2494 {
2495 nmi_callback = dummy_nmi_callback;
2496 }
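/*
 * Illustrative sketch (not part of the original source): how the hook above
 * is meant to be used. A subsystem such as a profiling driver installs a
 * handler that returns non-zero once it has consumed the NMI, which makes
 * do_nmi() skip the watchdog tick and the port 0x61 decode; handler and
 * helper names below are assumptions:
 *
 *     static int my_nmi_handler(struct cpu_user_regs *regs, int cpu)
 *     {
 *         if ( !my_hardware_raised_nmi() )
 *             return 0;              // not ours: fall through to default path
 *         handle_my_nmi(regs, cpu);
 *         return 1;                  // consumed
 *     }
 *
 *     set_nmi_callback(my_nmi_handler);
 *     ...
 *     unset_nmi_callback();
 */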
2498 asmlinkage int do_device_not_available(struct cpu_user_regs *regs)
2500 struct vcpu *curr = current;
2502 BUG_ON(!guest_mode(regs));
2504 setup_fpu(curr);
2506 if ( curr->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
2508 do_guest_trap(TRAP_no_device, regs, 0);
2509 curr->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
2511 else
2512 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
2514 return EXCRET_fault_fixed;
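/*
 * Illustrative sketch (not part of the original source): the guest half of
 * the lazy-FPU handling above. A PV guest cannot set CR0.TS itself; it asks
 * Xen to do so when it deschedules an FPU-using task, and its own #NM
 * handler (reached via the do_guest_trap() above) restores that task's FPU
 * state. Guest-side wrapper and helper names are assumptions:
 *
 *     HYPERVISOR_fpu_taskswitch(1);          // guest: mark FPU as not owned
 *     ...
 *     // first FPU insn later faults; Xen reflects #NM to the guest, whose
 *     // handler then does:
 *     HYPERVISOR_fpu_taskswitch(0);
 *     restore_fpu_state(current_task);       // hypothetical guest helper
 */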
2517 asmlinkage int do_debug(struct cpu_user_regs *regs)
2519 struct vcpu *v = current;
2521 DEBUGGER_trap_entry(TRAP_debug, regs);
2523 if ( !guest_mode(regs) )
2525 if ( regs->eflags & EF_TF )
2527 #ifdef __x86_64__
2528 void sysenter_entry(void);
2529 void sysenter_eflags_saved(void);
2530 /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
2531 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
2532 (regs->rip < (unsigned long)sysenter_eflags_saved) )
2533 goto out;
2534 WARN_ON(regs->rip != (unsigned long)sysenter_eflags_saved);
2535 #else
2536 WARN_ON(1);
2537 #endif
2538 regs->eflags &= ~EF_TF;
2540 else
2542 /*
2543 * We ignore watchpoints when they trigger within Xen. This may
2544 * happen when a buffer is passed to us which previously had a
2545 * watchpoint set on it. No need to bump EIP; the only faulting
2546 * trap is an instruction breakpoint, which can't happen to us.
2547 */
2548 WARN_ON(!search_exception_table(regs->eip));
2550 goto out;
2553 /* Save debug status register where guest OS can peek at it */
2554 v->arch.guest_context.debugreg[6] = read_debugreg(6);
2556 ler_enable();
2557 return do_guest_trap(TRAP_debug, regs, 0);
2559 out:
2560 ler_enable();
2561 return EXCRET_not_a_fault;
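/*
 * Illustrative sketch (not part of the original source): the guest-visible
 * side of the DR6 save above. A PV guest cannot execute "mov %dr6,%reg", so
 * its #DB handler reads the value Xen stashed in guest_context.debugreg[6]
 * via the debugreg hypercall (guest-side wrapper and helper names are
 * assumptions):
 *
 *     unsigned long dr6 = HYPERVISOR_get_debugreg(6);
 *     if ( dr6 & (1UL << 14) )   // BS: single-step trap
 *         handle_single_step();  // hypothetical guest helper
 */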
2564 asmlinkage int do_spurious_interrupt_bug(struct cpu_user_regs *regs)
2565 {
2566 return EXCRET_not_a_fault;
2567 }

2569 void set_intr_gate(unsigned int n, void *addr)
2570 {
2571 int i;
2572 /* Keep secondary tables in sync with IRQ updates. */
2573 for ( i = 1; i < NR_CPUS; i++ )
2574 if ( idt_tables[i] != NULL )
2575 _set_gate(&idt_tables[i][n], 14, 0, addr);
2576 _set_gate(&idt_table[n], 14, 0, addr);
2577 }

2579 void set_system_gate(unsigned int n, void *addr)
2580 {
2581 _set_gate(idt_table+n,14,3,addr);
2582 }

2584 void set_task_gate(unsigned int n, unsigned int sel)
2585 {
2586 idt_table[n].a = sel << 16;
2587 idt_table[n].b = 0x8500;
2588 }
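/*
 * Illustrative note (not part of the original source) on the constants used
 * by the helpers above: the second argument to _set_gate() is the descriptor
 * type (14 = 80386 interrupt gate, so EFLAGS.IF is cleared on entry) and the
 * third is the gate DPL (0 = reachable only via hardware events or ring 0,
 * 3 = also usable by an explicit "int $n"). That is why int3 and overflow,
 * which guests trigger with INT3/INTO, are installed via set_system_gate()
 * in trap_init() while everything else uses set_intr_gate(). In
 * set_task_gate(), 0x8500 encodes P=1, DPL=0, type 5 (task gate), with the
 * TSS selector placed in bits 16-31 of the descriptor's low 32-bit word (.a).
 */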
2590 void set_tss_desc(unsigned int n, void *addr)
2592 _set_tssldt_desc(
2593 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2594 (unsigned long)addr,
2595 offsetof(struct tss_struct, __cacheline_filler) - 1,
2596 9);
2597 #ifdef CONFIG_COMPAT
2598 _set_tssldt_desc(
2599 compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2600 (unsigned long)addr,
2601 offsetof(struct tss_struct, __cacheline_filler) - 1,
2602 11);
2603 #endif
2606 void __devinit percpu_traps_init(void)
2608 subarch_percpu_traps_init();
2610 if ( !opt_ler )
2611 return;
2613 switch ( boot_cpu_data.x86_vendor )
2615 case X86_VENDOR_INTEL:
2616 switch ( boot_cpu_data.x86 )
2618 case 6:
2619 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2620 break;
2621 case 15:
2622 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
2623 break;
2625 break;
2626 case X86_VENDOR_AMD:
2627 switch ( boot_cpu_data.x86 )
2629 case 6:
2630 case 15:
2631 case 16:
2632 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2633 break;
2635 break;
2638 ler_enable();
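/*
 * Illustrative note (not part of the original source): the "ler" machinery
 * configured above. With the opt_ler command-line option, ler_msr selects
 * the vendor-specific "last exception record from" MSR that the debug paths
 * can report, and ler_enable() (defined earlier in this file) is expected to
 * set the LBR bit of MSR_IA32_DEBUGCTLMSR so the CPU keeps recording branch
 * and exception source addresses. A rough sketch of that enable step:
 *
 *     u64 debugctl;
 *     rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 *     wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | 1);   // bit 0 = LBR
 */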
2641 void __init trap_init(void)
2643 /*
2644 * Note that interrupt gates are always used, rather than trap gates. We
2645 * must have interrupts disabled until DS/ES/FS/GS are saved because the
2646 * first activation must have the "bad" value(s) for these registers and
2647 * we may lose them if another activation is installed before they are
2648 * saved. The page-fault handler also needs interrupts disabled until %cr2
2649 * has been read and saved on the stack.
2650 */
2651 set_intr_gate(TRAP_divide_error,&divide_error);
2652 set_intr_gate(TRAP_debug,&debug);
2653 set_intr_gate(TRAP_nmi,&nmi);
2654 set_system_gate(TRAP_int3,&int3); /* usable from all privileges */
2655 set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
2656 set_intr_gate(TRAP_bounds,&bounds);
2657 set_intr_gate(TRAP_invalid_op,&invalid_op);
2658 set_intr_gate(TRAP_no_device,&device_not_available);
2659 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
2660 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
2661 set_intr_gate(TRAP_no_segment,&segment_not_present);
2662 set_intr_gate(TRAP_stack_error,&stack_segment);
2663 set_intr_gate(TRAP_gp_fault,&general_protection);
2664 set_intr_gate(TRAP_page_fault,&page_fault);
2665 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
2666 set_intr_gate(TRAP_copro_error,&coprocessor_error);
2667 set_intr_gate(TRAP_alignment_check,&alignment_check);
2668 set_intr_gate(TRAP_machine_check,&machine_check);
2669 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
2671 /* CPU0 uses the master IDT. */
2672 idt_tables[0] = idt_table;
2674 percpu_traps_init();
2676 cpu_init();
2678 open_softirq(NMI_SOFTIRQ, nmi_softirq);
2681 long register_guest_nmi_callback(unsigned long address)
2683 struct vcpu *v = current;
2684 struct domain *d = v->domain;
2685 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
2687 t->vector = TRAP_nmi;
2688 t->flags = 0;
2689 t->cs = !IS_COMPAT(d) ? FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS;
2690 t->address = address;
2691 TI_SET_IF(t, 1);
2693 /*
2694 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
2695 * now.
2696 */
2697 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
2698 v->nmi_pending = 1;
2700 return 0;
2703 long unregister_guest_nmi_callback(void)
2705 struct vcpu *v = current;
2706 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
2708 memset(t, 0, sizeof(*t));
2710 return 0;
2713 long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
2715 struct trap_info cur;
2716 struct vcpu *curr = current;
2717 struct trap_info *dst = curr->arch.guest_context.trap_ctxt;
2718 long rc = 0;
2720 /* If no table is presented then clear the entire virtual IDT. */
2721 if ( guest_handle_is_null(traps) )
2723 memset(dst, 0, 256 * sizeof(*dst));
2724 init_int80_direct_trap(curr);
2725 return 0;
2728 for ( ; ; )
2730 if ( hypercall_preempt_check() )
2732 rc = hypercall_create_continuation(
2733 __HYPERVISOR_set_trap_table, "h", traps);
2734 break;
2737 if ( copy_from_guest(&cur, traps, 1) )
2739 rc = -EFAULT;
2740 break;
2743 if ( cur.address == 0 )
2744 break;
2746 fixup_guest_code_selector(curr->domain, cur.cs);
2748 memcpy(&dst[cur.vector], &cur, sizeof(cur));
2750 if ( cur.vector == 0x80 )
2751 init_int80_direct_trap(curr);
2753 guest_handle_add_offset(traps, 1);
2756 return rc;
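/*
 * Illustrative sketch (not part of the original source): how a PV guest is
 * expected to populate the virtual IDT consumed above. Entries follow
 * trap_info_t (vector, flags/DPL, cs, address) and the list is terminated by
 * an entry whose address is zero; the guest-side wrapper and entry-point
 * names are assumptions. A guest wanting "int $0x80" system calls from user
 * space registers vector 0x80 with DPL 3 so that permit_softint() in
 * do_general_protection() accepts the software interrupt:
 *
 *     static struct trap_info traps[] = {
 *         { 0x80, 3, FLAT_KERNEL_CS, (unsigned long)int80_entry },
 *         { 0, 0, 0, 0 }                       // terminator
 *     };
 *
 *     HYPERVISOR_set_trap_table(traps);
 */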
2759 long set_debugreg(struct vcpu *v, int reg, unsigned long value)
2761 int i;
2762 struct vcpu *curr = current;
2764 switch ( reg )
2766 case 0:
2767 if ( !access_ok(value, sizeof(long)) )
2768 return -EPERM;
2769 if ( v == curr )
2770 write_debugreg(0, value);
2771 break;
2772 case 1:
2773 if ( !access_ok(value, sizeof(long)) )
2774 return -EPERM;
2775 if ( v == curr )
2776 write_debugreg(1, value);
2777 break;
2778 case 2:
2779 if ( !access_ok(value, sizeof(long)) )
2780 return -EPERM;
2781 if ( v == curr )
2782 write_debugreg(2, value);
2783 break;
2784 case 3:
2785 if ( !access_ok(value, sizeof(long)) )
2786 return -EPERM;
2787 if ( v == curr )
2788 write_debugreg(3, value);
2789 break;
2790 case 6:
2791 /*
2792 * DR6: Bits 4-11,16-31 reserved (set to 1).
2793 * Bit 12 reserved (set to 0).
2794 */
2795 value &= 0xffffefff; /* reserved bits => 0 */
2796 value |= 0xffff0ff0; /* reserved bits => 1 */
2797 if ( v == curr )
2798 write_debugreg(6, value);
2799 break;
2800 case 7:
2801 /*
2802 * DR7: Bit 10 reserved (set to 1).
2803 * Bits 11-12,14-15 reserved (set to 0).
2804 * Privileged bits:
2805 * GD (bit 13): must be 0.
2806 * R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
2807 * LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
2808 */
2809 /* DR7 == 0 => debugging disabled for this domain. */
2810 if ( value != 0 )
2812 value &= 0xffff27ff; /* reserved bits => 0 */
2813 value |= 0x00000400; /* reserved bits => 1 */
2814 if ( (value & (1<<13)) != 0 ) return -EPERM;
2815 for ( i = 0; i < 16; i += 2 )
2816 if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM;
2817 /*
2818 * If DR7 was previously clear then we need to load all other
2819 * debug registers at this point as they were not restored during
2820 * context switch.
2821 */
2822 if ( (v == curr) && (v->arch.guest_context.debugreg[7] == 0) )
2824 write_debugreg(0, v->arch.guest_context.debugreg[0]);
2825 write_debugreg(1, v->arch.guest_context.debugreg[1]);
2826 write_debugreg(2, v->arch.guest_context.debugreg[2]);
2827 write_debugreg(3, v->arch.guest_context.debugreg[3]);
2828 write_debugreg(6, v->arch.guest_context.debugreg[6]);
2831 if ( v == curr )
2832 write_debugreg(7, value);
2833 break;
2834 default:
2835 return -EINVAL;
2838 v->arch.guest_context.debugreg[reg] = value;
2839 return 0;
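/*
 * Illustrative sketch (not part of the original source): a DR7 value that
 * satisfies the checks above. Programming a 4-byte write watchpoint in DR0
 * at address 'addr' from a PV guest might look like this (guest-side wrapper
 * name is an assumption); bit 10 is forced to 1 by the code above in any
 * case:
 *
 *     HYPERVISOR_set_debugreg(0, addr);
 *     // L0 (bit 0) enables DR0; R/W0 = 01 (break on data writes) in bits
 *     // 16-17; LEN0 = 11 (4 bytes) in bits 18-19.
 *     HYPERVISOR_set_debugreg(7,
 *         (1UL << 0) | (1UL << 10) | (0x1UL << 16) | (0x3UL << 18));
 */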
2842 long do_set_debugreg(int reg, unsigned long value)
2843 {
2844 return set_debugreg(current, reg, value);
2845 }

2847 unsigned long do_get_debugreg(int reg)
2848 {
2849 if ( (reg < 0) || (reg > 7) ) return -EINVAL;
2850 return current->arch.guest_context.debugreg[reg];
2851 }
2853 /*
2854 * Local variables:
2855 * mode: C
2856 * c-set-style: "BSD"
2857 * c-basic-offset: 4
2858 * tab-width: 4
2859 * indent-tabs-mode: nil
2860 * End:
2861 */