debuggers.hg: xen/arch/x86/traps.c @ changeset 21015:8cb6e7eff2ba

x86: Generalise BUGFRAME_dump mechanism to allow polled UART irq to
get proper regs argument.

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
Author: Keir Fraser <keir.fraser@citrix.com>
Date: Wed Feb 24 10:44:30 2010 +0000
Parent: 257bd5e90294
Child: e4851c5b7d00
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <xen/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/bitops.h>
55 #include <asm/desc.h>
56 #include <asm/debugreg.h>
57 #include <asm/smp.h>
58 #include <asm/flushtlb.h>
59 #include <asm/uaccess.h>
60 #include <asm/i387.h>
61 #include <asm/debugger.h>
62 #include <asm/msr.h>
63 #include <asm/shared.h>
64 #include <asm/x86_emulate.h>
65 #include <asm/traps.h>
66 #include <asm/hvm/vpt.h>
67 #include <asm/hypercall.h>
68 #include <public/arch-x86/cpuid.h>
70 /*
71 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
72 * fatal: Xen prints diagnostic message and then hangs.
73 * dom0: The NMI is virtualised to DOM0.
74 * ignore: The NMI error is cleared and ignored.
75 */
76 #ifdef NDEBUG
77 static char __read_mostly opt_nmi[10] = "dom0";
78 #else
79 static char __read_mostly opt_nmi[10] = "fatal";
80 #endif
81 string_param("nmi", opt_nmi);
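/*
 * Editorial note (not part of the original changeset): "nmi" is a boot-time
 * string parameter, so a minimal usage sketch is simply a Xen command-line
 * entry such as
 *
 *     xen.gz ... nmi=dom0
 *
 * which virtualises NMIs to domain 0 rather than taking the default, i.e.
 * "dom0" in release (NDEBUG) builds and "fatal" in debug builds per the
 * #ifdef above.
 */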
83 DEFINE_PER_CPU_READ_MOSTLY(u32, ler_msr);
85 /* Master table, used by CPU0. */
86 idt_entry_t idt_table[IDT_ENTRIES];
88 /* Pointer to the IDT of every CPU. */
89 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
91 #define DECLARE_TRAP_HANDLER(_name) \
92 asmlinkage void _name(void); \
93 asmlinkage void do_ ## _name(struct cpu_user_regs *regs)
95 DECLARE_TRAP_HANDLER(divide_error);
96 DECLARE_TRAP_HANDLER(debug);
97 DECLARE_TRAP_HANDLER(nmi);
98 DECLARE_TRAP_HANDLER(int3);
99 DECLARE_TRAP_HANDLER(overflow);
100 DECLARE_TRAP_HANDLER(bounds);
101 DECLARE_TRAP_HANDLER(invalid_op);
102 DECLARE_TRAP_HANDLER(device_not_available);
103 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
104 DECLARE_TRAP_HANDLER(invalid_TSS);
105 DECLARE_TRAP_HANDLER(segment_not_present);
106 DECLARE_TRAP_HANDLER(stack_segment);
107 DECLARE_TRAP_HANDLER(general_protection);
108 DECLARE_TRAP_HANDLER(page_fault);
109 DECLARE_TRAP_HANDLER(coprocessor_error);
110 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
111 DECLARE_TRAP_HANDLER(machine_check);
112 DECLARE_TRAP_HANDLER(alignment_check);
113 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
115 void (*ioemul_handle_quirk)(
116 u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
118 static int debug_stack_lines = 20;
119 integer_param("debug_stack_lines", debug_stack_lines);
121 static int opt_ler;
122 boolean_param("ler", opt_ler);
124 #ifdef CONFIG_X86_32
125 #define stack_words_per_line 8
126 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
127 #else
128 #define stack_words_per_line 4
129 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
130 #endif
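/*
 * Editorial note (explanatory, not in the original): on x86-32 an exception
 * taken inside Xen involves no privilege change, so the CPU pushes no
 * ESP/SS; the pre-exception stack pointer is therefore just the address
 * where the esp field would sit, hence &regs->esp. On x86-64 RSP is always
 * saved, so the stored regs->rsp value is used directly.
 */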
132 static void show_guest_stack(struct vcpu *v, struct cpu_user_regs *regs)
133 {
134 int i;
135 unsigned long *stack, addr;
136 unsigned long mask = STACK_SIZE;
138 if ( is_hvm_vcpu(v) )
139 return;
141 if ( is_pv_32on64_vcpu(v) )
142 {
143 compat_show_guest_stack(v, regs, debug_stack_lines);
144 return;
145 }
147 if ( vm86_mode(regs) )
148 {
149 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
150 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
151 regs->ss, (uint16_t)(regs->esp & 0xffff));
152 }
153 else
154 {
155 stack = (unsigned long *)regs->esp;
156 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
157 }
159 if ( !access_ok(stack, sizeof(*stack)) )
160 {
161 printk("Guest-inaccessible memory.\n");
162 return;
163 }
165 if ( v != current )
166 {
167 struct vcpu *vcpu;
169 ASSERT(guest_kernel_mode(v, regs));
170 #ifndef __x86_64__
171 addr = read_cr3();
172 for_each_vcpu( v->domain, vcpu )
173 if ( vcpu->arch.cr3 == addr )
174 break;
175 #else
176 vcpu = maddr_get_owner(read_cr3()) == v->domain ? v : NULL;
177 #endif
178 if ( !vcpu )
179 {
180 stack = do_page_walk(v, (unsigned long)stack);
181 if ( (unsigned long)stack < PAGE_SIZE )
182 {
183 printk("Inaccessible guest memory.\n");
184 return;
185 }
186 mask = PAGE_SIZE;
187 }
188 }
190 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
191 {
192 if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & mask )
193 break;
194 if ( __get_user(addr, stack) )
195 {
196 if ( i != 0 )
197 printk("\n ");
198 printk("Fault while accessing guest memory.");
199 i = 1;
200 break;
201 }
202 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
203 printk("\n ");
204 printk(" %p", _p(addr));
205 stack++;
206 }
207 if ( i == 0 )
208 printk("Stack empty.");
209 printk("\n");
210 }
212 #if !defined(CONFIG_FRAME_POINTER)
214 static void show_trace(struct cpu_user_regs *regs)
215 {
216 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
218 printk("Xen call trace:\n ");
220 printk("[<%p>]", _p(regs->eip));
221 print_symbol(" %s\n ", regs->eip);
223 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
224 {
225 addr = *stack++;
226 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
227 {
228 printk("[<%p>]", _p(addr));
229 print_symbol(" %s\n ", addr);
230 }
231 }
233 printk("\n");
234 }
236 #else
238 static void show_trace(struct cpu_user_regs *regs)
239 {
240 unsigned long *frame, next, addr, low, high;
242 printk("Xen call trace:\n ");
244 printk("[<%p>]", _p(regs->eip));
245 print_symbol(" %s\n ", regs->eip);
247 /* Bounds for range of valid frame pointer. */
248 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
249 high = (low & ~(STACK_SIZE - 1)) +
250 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
252 /* The initial frame pointer. */
253 next = regs->ebp;
255 for ( ; ; )
256 {
257 /* Valid frame pointer? */
258 if ( (next < low) || (next >= high) )
259 {
260 /*
261 * Exception stack frames have a different layout, denoted by an
262 * inverted frame pointer.
263 */
264 next = ~next;
265 if ( (next < low) || (next >= high) )
266 break;
267 frame = (unsigned long *)next;
268 next = frame[0];
269 addr = frame[(offsetof(struct cpu_user_regs, eip) -
270 offsetof(struct cpu_user_regs, ebp))
271 / BYTES_PER_LONG];
272 }
273 else
274 {
275 /* Ordinary stack frame. */
276 frame = (unsigned long *)next;
277 next = frame[0];
278 addr = frame[1];
279 }
281 printk("[<%p>]", _p(addr));
282 print_symbol(" %s\n ", addr);
284 low = (unsigned long)&frame[2];
285 }
287 printk("\n");
288 }
290 #endif
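/*
 * Editorial sketch (not in the original file): the frame-pointer walker
 * above assumes the usual prologue layout
 *
 *     frame[0] = saved caller %ebp/%rbp  (next frame pointer)
 *     frame[1] = return address
 *
 * Exception entry points instead store the one's complement of a pointer
 * into the saved struct cpu_user_regs (its ebp slot), so when the bounds
 * check fails the walker re-inverts the value and fetches the interrupted
 * EIP from (offsetof(eip) - offsetof(ebp)) bytes further into that frame.
 */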
292 void show_stack(struct cpu_user_regs *regs)
293 {
294 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
295 int i;
297 if ( guest_mode(regs) )
298 return show_guest_stack(current, regs);
300 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
302 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
303 {
304 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
305 break;
306 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
307 printk("\n ");
308 addr = *stack++;
309 printk(" %p", _p(addr));
310 }
311 if ( i == 0 )
312 printk("Stack empty.");
313 printk("\n");
315 show_trace(regs);
316 }
318 void show_stack_overflow(unsigned int cpu, unsigned long esp)
319 {
320 #ifdef MEMORY_GUARD
321 unsigned long esp_top, esp_bottom;
322 unsigned long *stack, addr;
324 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
325 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
327 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
328 (void *)esp_top, (void *)esp_bottom, (void *)esp,
329 (void *)per_cpu(init_tss, cpu).esp0);
331 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
332 if ( ((unsigned long)(esp - esp_top) > 512) &&
333 ((unsigned long)(esp_top - esp) > 512) )
334 {
335 printk("No stack overflow detected. Skipping stack trace.\n");
336 return;
337 }
339 if ( esp < esp_top )
340 esp = esp_top;
342 printk("Xen stack overflow (dumping trace %p-%p):\n ",
343 (void *)esp, (void *)esp_bottom);
345 stack = (unsigned long *)esp;
346 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
347 {
348 addr = *stack++;
349 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
350 {
351 printk("%p: [<%p>]", stack, _p(addr));
352 print_symbol(" %s\n ", addr);
353 }
354 }
356 printk("\n");
357 #endif
358 }
360 void show_execution_state(struct cpu_user_regs *regs)
361 {
362 show_registers(regs);
363 show_stack(regs);
364 }
366 void vcpu_show_execution_state(struct vcpu *v)
367 {
368 printk("*** Dumping Dom%d vcpu#%d state: ***\n",
369 v->domain->domain_id, v->vcpu_id);
371 if ( v == current )
372 {
373 show_execution_state(guest_cpu_user_regs());
374 return;
375 }
377 vcpu_pause(v); /* acceptably dangerous */
379 vcpu_show_registers(v);
380 if ( guest_kernel_mode(v, &v->arch.guest_context.user_regs) )
381 show_guest_stack(v, &v->arch.guest_context.user_regs);
383 vcpu_unpause(v);
384 }
386 static char *trapstr(int trapnr)
387 {
388 static char *strings[] = {
389 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
390 "invalid opcode", "device not available", "double fault",
391 "coprocessor segment", "invalid tss", "segment not found",
392 "stack error", "general protection fault", "page fault",
393 "spurious interrupt", "coprocessor error", "alignment check",
394 "machine check", "simd error"
395 };
397 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
398 return "???";
400 return strings[trapnr];
401 }
403 /*
404 * This is called for faults at very unexpected times (e.g., when interrupts
405 * are disabled). In such situations we can't do much that is safe. We try to
406 * print out some tracing and then we just spin.
407 */
408 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
409 {
410 static DEFINE_PER_CPU(char, depth);
412 /*
413 * In some cases, we can end up in a vicious cycle of fatal_trap()s
414 * within fatal_trap()s. We give the problem a couple of iterations to
415 * bottom out, and then we just panic.
416 */
417 if ( ++this_cpu(depth) < 3 )
418 {
419 watchdog_disable();
420 console_start_sync();
422 show_execution_state(regs);
424 if ( trapnr == TRAP_page_fault )
425 {
426 unsigned long cr2 = read_cr2();
427 printk("Faulting linear address: %p\n", _p(cr2));
428 show_page_walk(cr2);
429 }
430 }
432 panic("FATAL TRAP: vector = %d (%s)\n"
433 "[error_code=%04x] %s\n",
434 trapnr, trapstr(trapnr), regs->error_code,
435 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
436 }
438 static void do_guest_trap(
439 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
440 {
441 struct vcpu *v = current;
442 struct trap_bounce *tb;
443 const struct trap_info *ti;
445 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
447 tb = &v->arch.trap_bounce;
448 ti = &v->arch.guest_context.trap_ctxt[trapnr];
450 tb->flags = TBF_EXCEPTION;
451 tb->cs = ti->cs;
452 tb->eip = ti->address;
454 if ( use_error_code )
455 {
456 tb->flags |= TBF_EXCEPTION_ERRCODE;
457 tb->error_code = regs->error_code;
458 }
460 if ( TI_GET_IF(ti) )
461 tb->flags |= TBF_INTERRUPT;
463 if ( unlikely(null_trap_bounce(v, tb)) )
464 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] "
465 "on VCPU %d [ec=%04x]\n",
466 trapstr(trapnr), trapnr, v->vcpu_id, regs->error_code);
467 }
469 static void instruction_done(
470 struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
471 {
472 regs->eip = eip;
473 regs->eflags &= ~X86_EFLAGS_RF;
474 if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
475 {
476 current->arch.guest_context.debugreg[6] |= bpmatch | 0xffff0ff0;
477 if ( regs->eflags & X86_EFLAGS_TF )
478 current->arch.guest_context.debugreg[6] |= 0x4000;
479 do_guest_trap(TRAP_debug, regs, 0);
480 }
481 }
483 static unsigned int check_guest_io_breakpoint(struct vcpu *v,
484 unsigned int port, unsigned int len)
485 {
486 unsigned int width, i, match = 0;
487 unsigned long start;
489 if ( !(v->arch.guest_context.debugreg[5]) ||
490 !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
491 return 0;
493 for ( i = 0; i < 4; i++ )
494 {
495 if ( !(v->arch.guest_context.debugreg[5] &
496 (3 << (i * DR_ENABLE_SIZE))) )
497 continue;
499 start = v->arch.guest_context.debugreg[i];
500 width = 0;
502 switch ( (v->arch.guest_context.debugreg[7] >>
503 (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
504 {
505 case DR_LEN_1: width = 1; break;
506 case DR_LEN_2: width = 2; break;
507 case DR_LEN_4: width = 4; break;
508 case DR_LEN_8: width = 8; break;
509 }
511 if ( (start < (port + len)) && ((start + width) > port) )
512 match |= 1 << i;
513 }
515 return match;
516 }
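/*
 * Editorial worked example (not in the original source): with DR0 = 0x60,
 * its I/O-breakpoint enables mirrored in debugreg[5], a DR7 length field of
 * 2 bytes for that slot, and CR4.DE set, an OUT to port 0x61 yields
 * start = 0x60 and width = 2, so (0x60 < 0x61 + 1) && (0x60 + 2 > 0x61)
 * holds and bit 0 is set in the returned mask, which instruction_done()
 * later folds into the guest's DR6.
 */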
518 /*
519 * Called from asm to set up the MCE trapbounce info.
520 * Returns 0 if no callback is set up, else 1.
521 */
522 asmlinkage int set_guest_machinecheck_trapbounce(void)
523 {
524 struct vcpu *v = current;
525 struct trap_bounce *tb = &v->arch.trap_bounce;
527 do_guest_trap(TRAP_machine_check, guest_cpu_user_regs(), 0);
528 tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */
529 return !null_trap_bounce(v, tb);
530 }
532 /*
533 * Called from asm to set up the NMI trapbounce info.
534 * Returns 0 if no callback is set up, else 1.
535 */
536 asmlinkage int set_guest_nmi_trapbounce(void)
537 {
538 struct vcpu *v = current;
539 struct trap_bounce *tb = &v->arch.trap_bounce;
540 do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0);
541 tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
542 return !null_trap_bounce(v, tb);
543 }
545 static inline void do_trap(
546 int trapnr, struct cpu_user_regs *regs, int use_error_code)
547 {
548 struct vcpu *curr = current;
549 unsigned long fixup;
551 DEBUGGER_trap_entry(trapnr, regs);
553 if ( guest_mode(regs) )
554 {
555 do_guest_trap(trapnr, regs, use_error_code);
556 return;
557 }
559 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
560 {
561 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
562 trapnr, _p(regs->eip), _p(fixup));
563 regs->eip = fixup;
564 return;
565 }
567 if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) &&
568 is_hvm_vcpu(curr) && curr->arch.hvm_vcpu.fpu_exception_callback )
569 {
570 curr->arch.hvm_vcpu.fpu_exception_callback(
571 curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs);
572 return;
573 }
575 DEBUGGER_trap_fatal(trapnr, regs);
577 show_execution_state(regs);
578 panic("FATAL TRAP: vector = %d (%s)\n"
579 "[error_code=%04x]\n",
580 trapnr, trapstr(trapnr), regs->error_code);
581 }
583 #define DO_ERROR_NOCODE(trapnr, name) \
584 asmlinkage void do_##name(struct cpu_user_regs *regs) \
585 { \
586 do_trap(trapnr, regs, 0); \
587 }
589 #define DO_ERROR(trapnr, name) \
590 asmlinkage void do_##name(struct cpu_user_regs *regs) \
591 { \
592 do_trap(trapnr, regs, 1); \
593 }
595 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
596 DO_ERROR_NOCODE(TRAP_overflow, overflow)
597 DO_ERROR_NOCODE(TRAP_bounds, bounds)
598 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
599 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
600 DO_ERROR( TRAP_no_segment, segment_not_present)
601 DO_ERROR( TRAP_stack_error, stack_segment)
602 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
603 DO_ERROR( TRAP_alignment_check, alignment_check)
604 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
606 int rdmsr_hypervisor_regs(uint32_t idx, uint64_t *val)
607 {
608 struct domain *d = current->domain;
609 /* Optionally shift out of the way of Viridian architectural MSRs. */
610 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
612 idx -= base;
613 if ( idx > 0 )
614 return 0;
616 switch ( idx )
617 {
618 case 0:
619 {
620 *val = 0;
621 break;
622 }
623 default:
624 BUG();
625 }
627 return 1;
628 }
630 int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val)
631 {
632 struct domain *d = current->domain;
633 /* Optionally shift out of the way of Viridian architectural MSRs. */
634 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
636 idx -= base;
637 if ( idx > 0 )
638 return 0;
640 switch ( idx )
641 {
642 case 0:
643 {
644 void *hypercall_page;
645 unsigned long mfn;
646 unsigned long gmfn = val >> 12;
647 unsigned int idx = val & 0xfff;
649 if ( idx > 0 )
650 {
651 gdprintk(XENLOG_WARNING,
652 "Out of range index %u to MSR %08x\n",
653 idx, 0x40000000);
654 return 0;
655 }
657 mfn = gmfn_to_mfn(d, gmfn);
659 if ( !mfn_valid(mfn) ||
660 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
661 {
662 gdprintk(XENLOG_WARNING,
663 "Bad GMFN %lx (MFN %lx) to MSR %08x\n",
664 gmfn, mfn, base + idx);
665 return 0;
666 }
668 hypercall_page = map_domain_page(mfn);
669 hypercall_page_initialise(d, hypercall_page);
670 unmap_domain_page(hypercall_page);
672 put_page_and_type(mfn_to_page(mfn));
673 break;
674 }
676 default:
677 BUG();
678 }
680 return 1;
681 }
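/*
 * Editorial sketch of the guest side (hypothetical code, not part of this
 * file): once CPUID leaf <base>+2 has reported one hypercall page and the
 * MSR base, a guest typically installs its hypercall page with something
 * like
 *
 *     wrmsrl(0x40000000, (uint64_t)hypercall_gfn << 12);
 *
 * Bits 12 and up carry the guest frame to be rewritten with the hypercall
 * trampolines; bits 0-11 carry the page index, which must be 0 since this
 * handler only accepts a single page.
 */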
683 int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
684 uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
685 {
686 struct domain *d = current->domain;
687 /* Optionally shift out of the way of Viridian architectural leaves. */
688 uint32_t base = is_viridian_domain(d) ? 0x40000100 : 0x40000000;
690 idx -= base;
691 if ( idx > 3 )
692 return 0;
694 switch ( idx )
695 {
696 case 0:
697 *eax = base + 3; /* Largest leaf */
698 *ebx = XEN_CPUID_SIGNATURE_EBX;
699 *ecx = XEN_CPUID_SIGNATURE_ECX;
700 *edx = XEN_CPUID_SIGNATURE_EDX;
701 break;
703 case 1:
704 *eax = (xen_major_version() << 16) | xen_minor_version();
705 *ebx = 0; /* Reserved */
706 *ecx = 0; /* Reserved */
707 *edx = 0; /* Reserved */
708 break;
710 case 2:
711 *eax = 1; /* Number of hypercall-transfer pages */
712 *ebx = 0x40000000; /* MSR base address */
713 if ( is_viridian_domain(d) )
714 *ebx = 0x40000200;
715 *ecx = 0; /* Features 1 */
716 *edx = 0; /* Features 2 */
717 if ( !is_hvm_vcpu(current) )
718 *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
719 break;
721 case 3:
722 *eax = *ebx = *ecx = *edx = 0;
723 cpuid_time_leaf( sub_idx, eax, ebx, ecx, edx );
724 break;
726 default:
727 BUG();
728 }
730 return 1;
731 }
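/*
 * Editorial note (illustration only): leaf <base>+0 above returns the Xen
 * signature, so a guest probing for Xen issues CPUID with EAX = 0x40000000
 * and checks that EBX:ECX:EDX spell "XenVMMXenVMM" (the
 * XEN_CPUID_SIGNATURE_* constants), then reads leaf <base>+1 for the
 * hypervisor version packed as (major << 16) | minor.
 */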
733 static void pv_cpuid(struct cpu_user_regs *regs)
734 {
735 uint32_t a, b, c, d;
737 a = regs->eax;
738 b = regs->ebx;
739 c = regs->ecx;
740 d = regs->edx;
742 if ( current->domain->domain_id != 0 )
743 {
744 if ( !cpuid_hypervisor_leaves(a, c, &a, &b, &c, &d) )
745 domain_cpuid(current->domain, a, c, &a, &b, &c, &d);
746 goto out;
747 }
749 asm (
750 "cpuid"
751 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
752 : "0" (a), "1" (b), "2" (c), "3" (d) );
754 if ( (regs->eax & 0x7fffffff) == 1 )
755 {
756 /* Modify Feature Information. */
757 __clear_bit(X86_FEATURE_VME, &d);
758 if ( !cpu_has_apic )
759 __clear_bit(X86_FEATURE_APIC, &d);
760 __clear_bit(X86_FEATURE_PSE, &d);
761 __clear_bit(X86_FEATURE_PGE, &d);
762 __clear_bit(X86_FEATURE_PSE36, &d);
763 }
764 switch ( (uint32_t)regs->eax )
765 {
766 case 1:
767 /* Modify Feature Information. */
768 if ( !cpu_has_sep )
769 __clear_bit(X86_FEATURE_SEP, &d);
770 #ifdef __i386__
771 if ( !supervisor_mode_kernel )
772 __clear_bit(X86_FEATURE_SEP, &d);
773 #endif
774 __clear_bit(X86_FEATURE_DS, &d);
775 __clear_bit(X86_FEATURE_ACC, &d);
776 __clear_bit(X86_FEATURE_PBE, &d);
778 __clear_bit(X86_FEATURE_DTES64 % 32, &c);
779 __clear_bit(X86_FEATURE_MWAIT % 32, &c);
780 __clear_bit(X86_FEATURE_DSCPL % 32, &c);
781 __clear_bit(X86_FEATURE_VMXE % 32, &c);
782 __clear_bit(X86_FEATURE_SMXE % 32, &c);
783 __clear_bit(X86_FEATURE_TM2 % 32, &c);
784 if ( is_pv_32bit_vcpu(current) )
785 __clear_bit(X86_FEATURE_CX16 % 32, &c);
786 __clear_bit(X86_FEATURE_XTPR % 32, &c);
787 __clear_bit(X86_FEATURE_PDCM % 32, &c);
788 __clear_bit(X86_FEATURE_DCA % 32, &c);
789 __clear_bit(X86_FEATURE_XSAVE % 32, &c);
790 if ( !cpu_has_apic )
791 __clear_bit(X86_FEATURE_X2APIC % 32, &c);
792 __set_bit(X86_FEATURE_HYPERVISOR % 32, &c);
793 break;
794 case 0x80000001:
795 /* Modify Feature Information. */
796 if ( is_pv_32bit_vcpu(current) )
797 {
798 __clear_bit(X86_FEATURE_LM % 32, &d);
799 __clear_bit(X86_FEATURE_LAHF_LM % 32, &c);
800 }
801 #ifndef __i386__
802 if ( is_pv_32on64_vcpu(current) &&
803 boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
804 #endif
805 __clear_bit(X86_FEATURE_SYSCALL % 32, &d);
806 __clear_bit(X86_FEATURE_PAGE1GB % 32, &d);
807 __clear_bit(X86_FEATURE_RDTSCP % 32, &d);
809 __clear_bit(X86_FEATURE_SVME % 32, &c);
810 if ( !cpu_has_apic )
811 __clear_bit(X86_FEATURE_EXTAPICSPACE % 32, &c);
812 __clear_bit(X86_FEATURE_OSVW % 32, &c);
813 __clear_bit(X86_FEATURE_IBS % 32, &c);
814 __clear_bit(X86_FEATURE_SKINIT % 32, &c);
815 __clear_bit(X86_FEATURE_WDT % 32, &c);
816 break;
817 case 5: /* MONITOR/MWAIT */
818 case 0xa: /* Architectural Performance Monitor Features */
819 case 0x8000000a: /* SVM revision and features */
820 case 0x8000001b: /* Instruction Based Sampling */
821 a = b = c = d = 0;
822 break;
823 default:
824 (void)cpuid_hypervisor_leaves(regs->eax, 0, &a, &b, &c, &d);
825 break;
826 }
828 out:
829 regs->eax = a;
830 regs->ebx = b;
831 regs->ecx = c;
832 regs->edx = d;
833 }
835 static int emulate_invalid_rdtscp(struct cpu_user_regs *regs)
836 {
837 char opcode[3];
838 unsigned long eip, rc;
839 struct vcpu *v = current;
841 eip = regs->eip;
842 if ( (rc = copy_from_user(opcode, (char *)eip, sizeof(opcode))) != 0 )
843 {
844 propagate_page_fault(eip + sizeof(opcode) - rc, 0);
845 return EXCRET_fault_fixed;
846 }
847 if ( memcmp(opcode, "\xf\x1\xf9", sizeof(opcode)) )
848 return 0;
849 eip += sizeof(opcode);
850 pv_soft_rdtsc(v, regs, 1);
851 instruction_done(regs, eip, 0);
852 return EXCRET_fault_fixed;
853 }
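/*
 * Editorial note: "\xf\x1\xf9" above is the RDTSCP encoding (0f 01 f9).
 * This path is reached from the #UD handler, i.e. when the guest executed
 * RDTSCP but it is unavailable; Xen then completes the instruction via
 * pv_soft_rdtsc() instead of reflecting the fault.
 */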
855 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
856 {
857 char sig[5], instr[2];
858 unsigned long eip, rc;
860 eip = regs->eip;
862 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
863 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
864 {
865 propagate_page_fault(eip + sizeof(sig) - rc, 0);
866 return EXCRET_fault_fixed;
867 }
868 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
869 return 0;
870 eip += sizeof(sig);
872 /* We only emulate CPUID. */
873 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
874 {
875 propagate_page_fault(eip + sizeof(instr) - rc, 0);
876 return EXCRET_fault_fixed;
877 }
878 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
879 return 0;
880 eip += sizeof(instr);
882 pv_cpuid(regs);
884 instruction_done(regs, eip, 0);
886 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
888 return EXCRET_fault_fixed;
889 }
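/*
 * Editorial sketch (hypothetical guest-side code, not part of this file):
 * the signature checked above is the forced-emulation prefix a PV guest
 * puts in front of CPUID so that it always traps to Xen, e.g.
 *
 *     asm volatile ( "ud2 ; .ascii \"xen\" ; cpuid"
 *                    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
 *                    : "0" (leaf), "2" (0U) );
 *
 * The resulting #UD lands here, the 5-byte signature (0f 0b 'x' 'e' 'n')
 * and the 2-byte CPUID opcode (0f a2) are verified, pv_cpuid() supplies
 * the result, and EIP is advanced past the whole 7-byte sequence.
 */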
891 asmlinkage void do_invalid_op(struct cpu_user_regs *regs)
892 {
893 struct bug_frame bug;
894 struct bug_frame_str bug_str;
895 const void *p;
896 const char *filename, *predicate, *eip = (char *)regs->eip;
897 unsigned long fixup;
898 int id, lineno;
900 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
902 if ( likely(guest_mode(regs)) )
903 {
904 if ( !emulate_invalid_rdtscp(regs) &&
905 !emulate_forced_invalid_op(regs) )
906 do_guest_trap(TRAP_invalid_op, regs, 0);
907 return;
908 }
910 if ( !is_kernel(eip) ||
911 __copy_from_user(&bug, eip, sizeof(bug)) ||
912 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
913 (bug.ret != 0xc2) )
914 goto die;
915 eip += sizeof(bug);
917 /* Decode first pointer argument. */
918 if ( !is_kernel(eip) ||
919 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
920 (bug_str.mov != 0xbc) )
921 goto die;
922 p = bug_str(bug_str, eip);
923 if ( !is_kernel(p) )
924 goto die;
925 eip += sizeof(bug_str);
927 id = bug.id & 3;
929 if ( id == BUGFRAME_run_fn )
930 {
931 const void (*fn)(struct cpu_user_regs *) = p;
932 (*fn)(regs);
933 regs->eip = (unsigned long)eip;
934 return;
935 }
937 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
938 filename = p;
939 lineno = bug.id >> 2;
941 if ( id == BUGFRAME_warn )
942 {
943 printk("Xen WARN at %.50s:%d\n", filename, lineno);
944 show_execution_state(regs);
945 regs->eip = (unsigned long)eip;
946 return;
947 }
949 if ( id == BUGFRAME_bug )
950 {
951 printk("Xen BUG at %.50s:%d\n", filename, lineno);
952 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
953 show_execution_state(regs);
954 panic("Xen BUG at %.50s:%d\n", filename, lineno);
955 }
957 /* ASSERT: decode the predicate string pointer. */
958 ASSERT(id == BUGFRAME_assert);
959 if ( !is_kernel(eip) ||
960 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
961 (bug_str.mov != 0xbc) )
962 goto die;
963 predicate = bug_str(bug_str, eip);
964 eip += sizeof(bug_str);
966 if ( !is_kernel(predicate) )
967 predicate = "<unknown>";
968 printk("Assertion '%s' failed at %.50s:%d\n",
969 predicate, filename, lineno);
970 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
971 show_execution_state(regs);
972 panic("Assertion '%s' failed at %.50s:%d\n",
973 predicate, filename, lineno);
975 die:
976 if ( (fixup = search_exception_table(regs->eip)) != 0 )
977 {
978 regs->eip = fixup;
979 return;
980 }
981 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
982 show_execution_state(regs);
983 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
984 }
986 asmlinkage void do_int3(struct cpu_user_regs *regs)
987 {
988 DEBUGGER_trap_entry(TRAP_int3, regs);
990 if ( !guest_mode(regs) )
991 {
992 debugger_trap_fatal(TRAP_int3, regs);
993 return;
994 }
996 do_guest_trap(TRAP_int3, regs, 0);
997 }
999 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
1000 {
1001 machine_check_vector(regs, regs->error_code);
1002 }
1004 static void reserved_bit_page_fault(
1005 unsigned long addr, struct cpu_user_regs *regs)
1006 {
1007 printk("d%d:v%d: reserved bit in page table (ec=%04X)\n",
1008 current->domain->domain_id, current->vcpu_id, regs->error_code);
1009 show_page_walk(addr);
1010 show_execution_state(regs);
1011 }
1013 void propagate_page_fault(unsigned long addr, u16 error_code)
1015 struct trap_info *ti;
1016 struct vcpu *v = current;
1017 struct trap_bounce *tb = &v->arch.trap_bounce;
1019 v->arch.guest_context.ctrlreg[2] = addr;
1020 arch_set_cr2(v, addr);
1022 /* Re-set error_code.user flag appropriately for the guest. */
1023 error_code &= ~PFEC_user_mode;
1024 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
1025 error_code |= PFEC_user_mode;
1027 trace_pv_page_fault(addr, error_code);
1029 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
1030 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
1031 tb->error_code = error_code;
1032 tb->cs = ti->cs;
1033 tb->eip = ti->address;
1034 if ( TI_GET_IF(ti) )
1035 tb->flags |= TBF_INTERRUPT;
1036 if ( unlikely(null_trap_bounce(v, tb)) )
1038 printk("d%d:v%d: unhandled page fault (ec=%04X)\n",
1039 v->domain->domain_id, v->vcpu_id, error_code);
1040 show_page_walk(addr);
1043 if ( unlikely(error_code & PFEC_reserved_bit) )
1044 reserved_bit_page_fault(addr, guest_cpu_user_regs());
1047 static int handle_gdt_ldt_mapping_fault(
1048 unsigned long offset, struct cpu_user_regs *regs)
1050 struct vcpu *curr = current;
1051 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
1052 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
1053 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
1055 /* Should never fault in another vcpu's area. */
1056 BUG_ON(vcpu_area != curr->vcpu_id);
1058 /* Byte offset within the gdt/ldt sub-area. */
1059 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
1061 if ( likely(is_ldt_area) )
1063 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
1064 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
1066 if ( guest_mode(regs) )
1067 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
1068 regs->eip, offset);
1070 else
1072 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
1073 if ( !guest_mode(regs) )
1074 return 0;
1075 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
1076 propagate_page_fault(
1077 curr->arch.guest_context.ldt_base + offset,
1078 regs->error_code);
1081 else
1083 /* GDT fault: handle the fault as #GP(selector). */
1084 regs->error_code = (u16)offset & ~7;
1085 (void)do_general_protection(regs);
1088 return EXCRET_fault_fixed;
1091 #ifdef HYPERVISOR_VIRT_END
1092 #define IN_HYPERVISOR_RANGE(va) \
1093 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
1094 #else
1095 #define IN_HYPERVISOR_RANGE(va) \
1096 (((va) >= HYPERVISOR_VIRT_START))
1097 #endif
1099 static int __spurious_page_fault(
1100 unsigned long addr, unsigned int error_code)
1102 unsigned long mfn, cr3 = read_cr3();
1103 #if CONFIG_PAGING_LEVELS >= 4
1104 l4_pgentry_t l4e, *l4t;
1105 #endif
1106 #if CONFIG_PAGING_LEVELS >= 3
1107 l3_pgentry_t l3e, *l3t;
1108 #endif
1109 l2_pgentry_t l2e, *l2t;
1110 l1_pgentry_t l1e, *l1t;
1111 unsigned int required_flags, disallowed_flags;
1113 /*
1114 * We do not take spurious page faults in IRQ handlers as we do not
1115 * modify page tables in IRQ context. We therefore bail here because
1116 * map_domain_page() is not IRQ-safe.
1117 */
1118 if ( in_irq() )
1119 return 0;
1121 /* Reserved bit violations are never spurious faults. */
1122 if ( error_code & PFEC_reserved_bit )
1123 return 0;
1125 required_flags = _PAGE_PRESENT;
1126 if ( error_code & PFEC_write_access )
1127 required_flags |= _PAGE_RW;
1128 if ( error_code & PFEC_user_mode )
1129 required_flags |= _PAGE_USER;
1131 disallowed_flags = 0;
1132 if ( error_code & PFEC_insn_fetch )
1133 disallowed_flags |= _PAGE_NX;
1135 mfn = cr3 >> PAGE_SHIFT;
1137 #if CONFIG_PAGING_LEVELS >= 4
1138 l4t = map_domain_page(mfn);
1139 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
1140 mfn = l4e_get_pfn(l4e);
1141 unmap_domain_page(l4t);
1142 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
1143 (l4e_get_flags(l4e) & disallowed_flags) )
1144 return 0;
1145 #endif
1147 #if CONFIG_PAGING_LEVELS >= 3
1148 l3t = map_domain_page(mfn);
1149 #if CONFIG_PAGING_LEVELS == 3
1150 l3t += (cr3 & 0xFE0UL) >> 3;
1151 #endif
1152 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
1153 mfn = l3e_get_pfn(l3e);
1154 unmap_domain_page(l3t);
1155 #if CONFIG_PAGING_LEVELS == 3
1156 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1157 return 0;
1158 #else
1159 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
1160 (l3e_get_flags(l3e) & disallowed_flags) )
1161 return 0;
1162 #endif
1163 #endif
1165 l2t = map_domain_page(mfn);
1166 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
1167 mfn = l2e_get_pfn(l2e);
1168 unmap_domain_page(l2t);
1169 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
1170 (l2e_get_flags(l2e) & disallowed_flags) )
1171 return 0;
1172 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1174 l1e = l1e_empty(); /* define before use in debug tracing */
1175 goto spurious;
1178 l1t = map_domain_page(mfn);
1179 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
1180 mfn = l1e_get_pfn(l1e);
1181 unmap_domain_page(l1t);
1182 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
1183 (l1e_get_flags(l1e) & disallowed_flags) )
1184 return 0;
1186 spurious:
1187 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
1188 "at addr %lx, e/c %04x\n",
1189 current->domain->domain_id, current->vcpu_id,
1190 addr, error_code);
1191 #if CONFIG_PAGING_LEVELS >= 4
1192 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
1193 #endif
1194 #if CONFIG_PAGING_LEVELS >= 3
1195 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
1196 #endif
1197 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
1198 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
1199 return 1;
1202 static int spurious_page_fault(
1203 unsigned long addr, unsigned int error_code)
1204 {
1205 unsigned long flags;
1206 int is_spurious;
1208 /*
1209 * Disabling interrupts prevents TLB flushing, and hence prevents
1210 * page tables from becoming invalid under our feet during the walk.
1211 */
1212 local_irq_save(flags);
1213 is_spurious = __spurious_page_fault(addr, error_code);
1214 local_irq_restore(flags);
1216 return is_spurious;
1217 }
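/*
 * Editorial worked example (not in the original): for a user-mode write
 * fault, error_code carries PFEC_write_access|PFEC_user_mode, so the walk
 * in __spurious_page_fault() demands _PAGE_PRESENT|_PAGE_RW|_PAGE_USER at
 * every level, and an instruction fetch additionally forbids _PAGE_NX.
 * Only if the live page tables already satisfy all of that is the fault
 * declared spurious (a stale TLB entry); otherwise 0 is returned and the
 * fault is handled normally.
 */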
1219 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1221 struct vcpu *v = current;
1222 struct domain *d = v->domain;
1224 /* No fixups in interrupt context or when interrupts are disabled. */
1225 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1226 return 0;
1228 /* Faults from external-mode guests are handled by shadow/hap */
1229 if ( paging_mode_external(d) && guest_mode(regs) )
1231 int ret = paging_fault(addr, regs);
1232 if ( ret == EXCRET_fault_fixed )
1233 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1234 return ret;
1237 if ( !(regs->error_code & PFEC_page_present) &&
1238 (pagefault_by_memadd(addr, regs)) )
1239 return handle_memadd_fault(addr, regs);
1241 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1243 if ( !(regs->error_code & PFEC_reserved_bit) &&
1244 (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1245 return handle_gdt_ldt_mapping_fault(
1246 addr - GDT_LDT_VIRT_START, regs);
1247 return 0;
1250 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
1251 guest_kernel_mode(v, regs) &&
1252 /* Do not check if access-protection fault since the page may
1253 legitimately be not present in shadow page tables */
1254 ((regs->error_code & (PFEC_write_access|PFEC_reserved_bit)) ==
1255 PFEC_write_access) &&
1256 ptwr_do_page_fault(v, addr, regs) )
1257 return EXCRET_fault_fixed;
1259 /* For non-external shadowed guests, we fix up both their own
1260 * pagefaults and Xen's, since they share the pagetables. */
1261 if ( paging_mode_enabled(d) && !paging_mode_external(d) )
1263 int ret = paging_fault(addr, regs);
1264 if ( ret == EXCRET_fault_fixed )
1265 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1266 return ret;
1269 return 0;
1272 /*
1273 * #PF error code:
1274 * Bit 0: Protection violation (=1) ; Page not present (=0)
1275 * Bit 1: Write access
1276 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1277 * Bit 3: Reserved bit violation
1278 * Bit 4: Instruction fetch
1279 */
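/*
 * Editorial example (illustration only): error_code 0x0003 therefore means
 * a supervisor-mode write hitting a present page (protection violation),
 * while 0x0014 would be a user-mode instruction fetch from a not-present
 * page.
 */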
1280 asmlinkage void do_page_fault(struct cpu_user_regs *regs)
1282 unsigned long addr, fixup;
1283 unsigned int error_code;
1285 addr = read_cr2();
1287 /* fixup_page_fault() might change regs->error_code, so cache it here. */
1288 error_code = regs->error_code;
1290 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1292 perfc_incr(page_faults);
1294 if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1295 return;
1297 if ( unlikely(!guest_mode(regs)) )
1299 if ( spurious_page_fault(addr, error_code) )
1300 return;
1302 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1304 perfc_incr(copy_user_faults);
1305 if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1306 reserved_bit_page_fault(addr, regs);
1307 regs->eip = fixup;
1308 return;
1311 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1313 show_execution_state(regs);
1314 show_page_walk(addr);
1315 panic("FATAL PAGE FAULT\n"
1316 "[error_code=%04x]\n"
1317 "Faulting linear address: %p\n",
1318 error_code, _p(addr));
1321 if ( unlikely(current->domain->arch.suppress_spurious_page_faults
1322 && spurious_page_fault(addr, error_code)) )
1323 return;
1325 propagate_page_fault(addr, regs->error_code);
1328 /*
1329 * Early #PF handler to print CR2, error code, and stack.
1331 * We also deal with spurious faults here, even though they should never happen
1332 * during early boot (an issue was seen once, but was most likely a hardware
1333 * problem).
1334 */
1335 asmlinkage void do_early_page_fault(struct cpu_user_regs *regs)
1337 static int stuck;
1338 static unsigned long prev_eip, prev_cr2;
1339 unsigned long cr2 = read_cr2();
1341 BUG_ON(smp_processor_id() != 0);
1343 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1345 prev_eip = regs->eip;
1346 prev_cr2 = cr2;
1347 stuck = 0;
1348 return;
1351 if ( stuck++ == 1000 )
1353 unsigned long *stk = (unsigned long *)regs;
1354 printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1355 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1356 printk("Stack dump: ");
1357 while ( ((long)stk & ((PAGE_SIZE - 1) & ~(BYTES_PER_LONG - 1))) != 0 )
1358 printk("%p ", _p(*stk++));
1359 for ( ; ; ) ;
1363 long do_fpu_taskswitch(int set)
1364 {
1365 struct vcpu *v = current;
1367 if ( set )
1368 {
1369 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1370 stts();
1371 }
1372 else
1373 {
1374 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1375 if ( v->fpu_dirtied )
1376 clts();
1377 }
1379 return 0;
1380 }
1382 static int read_descriptor(unsigned int sel,
1383 const struct vcpu *v,
1384 const struct cpu_user_regs * regs,
1385 unsigned long *base,
1386 unsigned long *limit,
1387 unsigned int *ar,
1388 unsigned int vm86attr)
1390 struct desc_struct desc;
1392 if ( !vm86_mode(regs) )
1394 if ( sel < 4)
1395 desc.b = desc.a = 0;
1396 else if ( __get_user(desc,
1397 (const struct desc_struct *)(!(sel & 4)
1398 ? GDT_VIRT_START(v)
1399 : LDT_VIRT_START(v))
1400 + (sel >> 3)) )
1401 return 0;
1402 if ( !(vm86attr & _SEGMENT_CODE) )
1403 desc.b &= ~_SEGMENT_L;
1405 else
1407 desc.a = (sel << 20) | 0xffff;
1408 desc.b = vm86attr | (sel >> 12);
1411 *ar = desc.b & 0x00f0ff00;
1412 if ( !(desc.b & _SEGMENT_L) )
1414 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1415 (desc.b & 0xff000000));
1416 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1417 if ( desc.b & _SEGMENT_G )
1418 *limit = ((*limit + 1) << 12) - 1;
1419 #ifndef NDEBUG
1420 if ( !vm86_mode(regs) && (sel > 3) )
1422 unsigned int a, l;
1423 unsigned char valid;
1425 asm volatile (
1426 "larl %2,%0 ; setz %1"
1427 : "=r" (a), "=qm" (valid) : "rm" (sel));
1428 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1429 asm volatile (
1430 "lsll %2,%0 ; setz %1"
1431 : "=r" (l), "=qm" (valid) : "rm" (sel));
1432 BUG_ON(valid && (l != *limit));
1434 #endif
1436 else
1438 *base = 0UL;
1439 *limit = ~0UL;
1442 return 1;
1445 #ifdef __x86_64__
1446 static int read_gate_descriptor(unsigned int gate_sel,
1447 const struct vcpu *v,
1448 unsigned int *sel,
1449 unsigned long *off,
1450 unsigned int *ar)
1452 struct desc_struct desc;
1453 const struct desc_struct *pdesc;
1456 pdesc = (const struct desc_struct *)
1457 (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v))
1458 + (gate_sel >> 3);
1459 if ( (gate_sel < 4) ||
1460 ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
1461 __get_user(desc, pdesc) )
1462 return 0;
1464 *sel = (desc.a >> 16) & 0x0000fffc;
1465 *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
1466 *ar = desc.b & 0x0000ffff;
1468 /*
1469 * check_descriptor() clears the DPL field and stores the
1470 * guest requested DPL in the selector's RPL field.
1471 */
1472 if ( *ar & _SEGMENT_DPL )
1473 return 0;
1474 *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
1476 if ( !is_pv_32bit_vcpu(v) )
1478 if ( (*ar & 0x1f00) != 0x0c00 ||
1479 (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
1480 __get_user(desc, pdesc + 1) ||
1481 (desc.b & 0x1f00) )
1482 return 0;
1484 *off |= (unsigned long)desc.a << 32;
1485 return 1;
1488 switch ( *ar & 0x1f00 )
1490 case 0x0400:
1491 *off &= 0xffff;
1492 break;
1493 case 0x0c00:
1494 break;
1495 default:
1496 return 0;
1499 return 1;
1501 #endif
1503 /* Has the guest requested sufficient permission for this I/O access? */
1504 static int guest_io_okay(
1505 unsigned int port, unsigned int bytes,
1506 struct vcpu *v, struct cpu_user_regs *regs)
1508 #if defined(__x86_64__)
1509 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1510 int user_mode = !(v->arch.flags & TF_kernel_mode);
1511 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1512 #elif defined(__i386__)
1513 #define TOGGLE_MODE() ((void)0)
1514 #endif
1516 if ( !vm86_mode(regs) &&
1517 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1518 return 1;
1520 if ( v->arch.iobmp_limit > (port + bytes) )
1522 union { uint8_t bytes[2]; uint16_t mask; } x;
1524 /*
1525 * Grab permission bytes from guest space. Inaccessible bytes are
1526 * read as 0xff (no access allowed).
1527 */
1528 TOGGLE_MODE();
1529 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1530 port>>3, 2) )
1532 default: x.bytes[0] = ~0;
1533 case 1: x.bytes[1] = ~0;
1534 case 0: break;
1536 TOGGLE_MODE();
1538 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1539 return 1;
1542 return 0;
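/*
 * Editorial worked example (not in the original): for a 1-byte access to
 * port 0x70 by a guest without sufficient IOPL, two bitmap bytes are
 * copied from offset 0x70 >> 3 = 14 of the guest's I/O bitmap; the access
 * is allowed only if bit (0x70 & 7) = 0 of that 16-bit window is clear,
 * mirroring the hardware TSS I/O-permission-bitmap rules. Bytes that
 * cannot be copied read as 0xff, i.e. access denied.
 */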
1545 /* Has the administrator granted sufficient permission for this I/O access? */
1546 static int admin_io_okay(
1547 unsigned int port, unsigned int bytes,
1548 struct vcpu *v, struct cpu_user_regs *regs)
1550 /*
1551 * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
1552 * We never permit direct access to that register.
1553 */
1554 if ( (port == 0xcf8) && (bytes == 4) )
1555 return 0;
1557 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1560 static uint32_t guest_io_read(
1561 unsigned int port, unsigned int bytes,
1562 struct vcpu *v, struct cpu_user_regs *regs)
1564 extern uint32_t pci_conf_read(
1565 uint32_t cf8, uint8_t offset, uint8_t bytes);
1567 uint32_t data = 0;
1568 unsigned int shift = 0;
1570 if ( admin_io_okay(port, bytes, v, regs) )
1572 switch ( bytes )
1574 case 1: return inb(port);
1575 case 2: return inw(port);
1576 case 4: return inl(port);
1580 while ( bytes != 0 )
1582 unsigned int size = 1;
1583 uint32_t sub_data = 0xff;
1585 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1587 sub_data = pv_pit_handler(port, 0, 0);
1589 else if ( (port == 0xcf8) && (bytes == 4) )
1591 size = 4;
1592 sub_data = v->domain->arch.pci_cf8;
1594 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1596 size = min(bytes, 4 - (port & 3));
1597 if ( size == 3 )
1598 size = 2;
1599 sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size);
1602 if ( size == 4 )
1603 return sub_data;
1605 data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
1606 shift += size * 8;
1607 port += size;
1608 bytes -= size;
1611 return data;
1614 extern void (*pv_rtc_handler)(unsigned int port, uint8_t value);
1616 static void guest_io_write(
1617 unsigned int port, unsigned int bytes, uint32_t data,
1618 struct vcpu *v, struct cpu_user_regs *regs)
1620 extern void pci_conf_write(
1621 uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data);
1623 if ( admin_io_okay(port, bytes, v, regs) )
1625 switch ( bytes ) {
1626 case 1:
1627 if ( ((port == 0x70) || (port == 0x71)) && pv_rtc_handler )
1628 pv_rtc_handler(port, (uint8_t)data);
1629 outb((uint8_t)data, port);
1630 if ( pv_post_outb_hook )
1631 pv_post_outb_hook(port, (uint8_t)data);
1632 break;
1633 case 2:
1634 outw((uint16_t)data, port);
1635 break;
1636 case 4:
1637 outl(data, port);
1638 break;
1640 return;
1643 while ( bytes != 0 )
1645 unsigned int size = 1;
1647 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1649 pv_pit_handler(port, (uint8_t)data, 1);
1651 else if ( (port == 0xcf8) && (bytes == 4) )
1653 size = 4;
1654 v->domain->arch.pci_cf8 = data;
1656 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1658 size = min(bytes, 4 - (port & 3));
1659 if ( size == 3 )
1660 size = 2;
1661 pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data);
1664 if ( size == 4 )
1665 return;
1667 port += size;
1668 bytes -= size;
1669 data >>= size * 8;
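/*
 * Editorial note (illustration only): together the two routines above give
 * a privileged guest a virtualised PCI config-space path: a 4-byte write
 * of, say, 0x8000a008 to CONFIG_ADDRESS (0xcf8) is only latched into
 * arch.pci_cf8, and a later access to CONFIG_DATA (0xcfc-0xcff) is then
 * serviced by pci_conf_read()/pci_conf_write() using that latched address,
 * with sub-dword accesses narrowed via (port & 3).
 */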
1673 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1674 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1675 __attribute__((__regparm__(1)));
1676 unsigned long guest_to_host_gpr_switch(unsigned long)
1677 __attribute__((__regparm__(1)));
1679 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1681 /* Instruction fetch with error handling. */
1682 #define insn_fetch(type, base, eip, limit) \
1683 ({ unsigned long _rc, _ptr = (base) + (eip); \
1684 type _x; \
1685 if ( ad_default < 8 ) \
1686 _ptr = (unsigned int)_ptr; \
1687 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1688 goto fail; \
1689 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1690 { \
1691 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1692 goto skip; \
1693 } \
1694 (eip) += sizeof(_x); _x; })
1696 #if defined(CONFIG_X86_32)
1697 # define read_sreg(regs, sr) ((regs)->sr)
1698 #elif defined(CONFIG_X86_64)
1699 # define read_sreg(regs, sr) read_segment_register(sr)
1700 #endif
1702 static int is_cpufreq_controller(struct domain *d)
1703 {
1704 return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
1705 (d->domain_id == 0));
1706 }
1708 static int emulate_privileged_op(struct cpu_user_regs *regs)
1710 struct vcpu *v = current;
1711 unsigned long *reg, eip = regs->eip;
1712 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1713 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1714 int rc;
1715 unsigned int port, i, data_sel, ar, data, bpmatch = 0;
1716 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1717 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1718 ? regs->reg \
1719 : ad_bytes == 4 \
1720 ? (u32)regs->reg \
1721 : (u16)regs->reg)
1722 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1723 ? regs->reg = (val) \
1724 : ad_bytes == 4 \
1725 ? (*(u32 *)&regs->reg = (val)) \
1726 : (*(u16 *)&regs->reg = (val)))
1727 unsigned long code_base, code_limit;
1728 char io_emul_stub[32];
1729 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1730 uint32_t l, h;
1731 uint64_t val;
1733 if ( !read_descriptor(regs->cs, v, regs,
1734 &code_base, &code_limit, &ar,
1735 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1736 goto fail;
1737 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1738 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1739 if ( !(ar & _SEGMENT_S) ||
1740 !(ar & _SEGMENT_P) ||
1741 !(ar & _SEGMENT_CODE) )
1742 goto fail;
1744 /* emulating only opcodes not allowing SS to be default */
1745 data_sel = read_sreg(regs, ds);
1747 /* Legacy prefixes. */
1748 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1750 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1752 case 0x66: /* operand-size override */
1753 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1754 continue;
1755 case 0x67: /* address-size override */
1756 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1757 continue;
1758 case 0x2e: /* CS override */
1759 data_sel = regs->cs;
1760 continue;
1761 case 0x3e: /* DS override */
1762 data_sel = read_sreg(regs, ds);
1763 continue;
1764 case 0x26: /* ES override */
1765 data_sel = read_sreg(regs, es);
1766 continue;
1767 case 0x64: /* FS override */
1768 data_sel = read_sreg(regs, fs);
1769 lm_ovr = lm_seg_fs;
1770 continue;
1771 case 0x65: /* GS override */
1772 data_sel = read_sreg(regs, gs);
1773 lm_ovr = lm_seg_gs;
1774 continue;
1775 case 0x36: /* SS override */
1776 data_sel = regs->ss;
1777 continue;
1778 case 0xf0: /* LOCK */
1779 lock = 1;
1780 continue;
1781 case 0xf2: /* REPNE/REPNZ */
1782 case 0xf3: /* REP/REPE/REPZ */
1783 rep_prefix = 1;
1784 continue;
1785 default:
1786 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1788 rex = opcode;
1789 continue;
1791 break;
1793 break;
1796 /* REX prefix. */
1797 if ( rex & 8 ) /* REX.W */
1798 op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
1799 modrm_reg = (rex & 4) << 1; /* REX.R */
1800 /* REX.X does not need to be decoded. */
1801 modrm_rm = (rex & 1) << 3; /* REX.B */
1803 if ( opcode == 0x0f )
1804 goto twobyte_opcode;
1806 if ( lock )
1807 goto fail;
1809 /* Input/Output String instructions. */
1810 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1812 unsigned long data_base, data_limit;
1814 if ( rep_prefix && (rd_ad(ecx) == 0) )
1815 goto done;
1817 if ( !(opcode & 2) )
1819 data_sel = read_sreg(regs, es);
1820 lm_ovr = lm_seg_none;
1823 if ( !(ar & _SEGMENT_L) )
1825 if ( !read_descriptor(data_sel, v, regs,
1826 &data_base, &data_limit, &ar,
1827 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|
1828 _SEGMENT_P) )
1829 goto fail;
1830 if ( !(ar & _SEGMENT_S) ||
1831 !(ar & _SEGMENT_P) ||
1832 (opcode & 2 ?
1833 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1834 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1835 goto fail;
1837 #ifdef CONFIG_X86_64
1838 else
1840 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1842 switch ( lm_ovr )
1844 case lm_seg_none:
1845 data_base = 0UL;
1846 break;
1847 case lm_seg_fs:
1848 data_base = v->arch.guest_context.fs_base;
1849 break;
1850 case lm_seg_gs:
1851 if ( guest_kernel_mode(v, regs) )
1852 data_base = v->arch.guest_context.gs_base_kernel;
1853 else
1854 data_base = v->arch.guest_context.gs_base_user;
1855 break;
1858 else
1859 read_descriptor(data_sel, v, regs,
1860 &data_base, &data_limit, &ar,
1861 0);
1862 data_limit = ~0UL;
1863 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1865 #endif
1867 port = (u16)regs->edx;
1869 continue_io_string:
1870 switch ( opcode )
1872 case 0x6c: /* INSB */
1873 op_bytes = 1;
1874 case 0x6d: /* INSW/INSL */
1875 if ( (data_limit < (op_bytes - 1)) ||
1876 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
1877 !guest_io_okay(port, op_bytes, v, regs) )
1878 goto fail;
1879 data = guest_io_read(port, op_bytes, v, regs);
1880 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
1881 &data, op_bytes)) != 0 )
1883 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1884 PFEC_write_access);
1885 return EXCRET_fault_fixed;
1887 wr_ad(edi, regs->edi + (int)((regs->eflags & X86_EFLAGS_DF)
1888 ? -op_bytes : op_bytes));
1889 break;
1891 case 0x6e: /* OUTSB */
1892 op_bytes = 1;
1893 case 0x6f: /* OUTSW/OUTSL */
1894 if ( (data_limit < (op_bytes - 1)) ||
1895 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
1896 !guest_io_okay(port, op_bytes, v, regs) )
1897 goto fail;
1898 if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
1899 op_bytes)) != 0 )
1901 propagate_page_fault(data_base + rd_ad(esi)
1902 + op_bytes - rc, 0);
1903 return EXCRET_fault_fixed;
1905 guest_io_write(port, op_bytes, data, v, regs);
1906 wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF)
1907 ? -op_bytes : op_bytes));
1908 break;
1911 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1913 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1915 if ( !bpmatch && !hypercall_preempt_check() )
1916 goto continue_io_string;
1917 eip = regs->eip;
1920 goto done;
1923 /*
1924 * Very likely to be an I/O instruction (IN/OUT).
1925 * Build an on-stack stub to execute the instruction with full guest
1926 * GPR context. This is needed for some systems which (ab)use IN/OUT
1927 * to communicate with BIOS code in system-management mode.
1928 */
1929 #ifdef __x86_64__
1930 /* movq $host_to_guest_gpr_switch,%rcx */
1931 io_emul_stub[0] = 0x48;
1932 io_emul_stub[1] = 0xb9;
1933 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1934 /* callq *%rcx */
1935 io_emul_stub[10] = 0xff;
1936 io_emul_stub[11] = 0xd1;
1937 #else
1938 /* call host_to_guest_gpr_switch */
1939 io_emul_stub[0] = 0xe8;
1940 *(s32 *)&io_emul_stub[1] =
1941 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1942 /* 7 x nop */
1943 memset(&io_emul_stub[5], 0x90, 7);
1944 #endif
1945 /* data16 or nop */
1946 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1947 /* <io-access opcode> */
1948 io_emul_stub[13] = opcode;
1949 /* imm8 or nop */
1950 io_emul_stub[14] = 0x90;
1951 /* ret (jumps to guest_to_host_gpr_switch) */
1952 io_emul_stub[15] = 0xc3;
1954 /* Handy function-typed pointer to the stub. */
1955 io_emul = (void *)io_emul_stub;
1957 if ( ioemul_handle_quirk )
1958 ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
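/*
 * Editorial worked example (not part of the original changeset): once the
 * IN/OUT decode below has patched in the opcode and any imm8, a 1-byte
 * "in $0x71,%al" leaves the stub looking like
 *
 *     <load/call host_to_guest_gpr_switch>   ; bytes 0-11
 *     90                                     ; nop (no data16 needed)
 *     e4 71                                  ; in $0x71,%al
 *     c3                                     ; ret -> guest_to_host_gpr_switch
 *
 * so the port access runs with the guest's full GPR context loaded, as the
 * comment above explains for SMM BIOS interactions.
 */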
1960 /* I/O Port and Interrupt Flag instructions. */
1961 switch ( opcode )
1963 case 0xe4: /* IN imm8,%al */
1964 op_bytes = 1;
1965 case 0xe5: /* IN imm8,%eax */
1966 port = insn_fetch(u8, code_base, eip, code_limit);
1967 io_emul_stub[14] = port; /* imm8 */
1968 exec_in:
1969 if ( !guest_io_okay(port, op_bytes, v, regs) )
1970 goto fail;
1971 if ( admin_io_okay(port, op_bytes, v, regs) )
1973 io_emul(regs);
1975 else
1977 if ( op_bytes == 4 )
1978 regs->eax = 0;
1979 else
1980 regs->eax &= ~((1u << (op_bytes * 8)) - 1);
1981 regs->eax |= guest_io_read(port, op_bytes, v, regs);
1983 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1984 goto done;
1986 case 0xec: /* IN %dx,%al */
1987 op_bytes = 1;
1988 case 0xed: /* IN %dx,%eax */
1989 port = (u16)regs->edx;
1990 goto exec_in;
1992 case 0xe6: /* OUT %al,imm8 */
1993 op_bytes = 1;
1994 case 0xe7: /* OUT %eax,imm8 */
1995 port = insn_fetch(u8, code_base, eip, code_limit);
1996 io_emul_stub[14] = port; /* imm8 */
1997 exec_out:
1998 if ( !guest_io_okay(port, op_bytes, v, regs) )
1999 goto fail;
2000 if ( admin_io_okay(port, op_bytes, v, regs) )
2002 if ( (op_bytes == 1) &&
2003 ((port == 0x71) || (port == 0x70)) &&
2004 pv_rtc_handler )
2005 pv_rtc_handler(port, regs->eax);
2006 io_emul(regs);
2007 if ( (op_bytes == 1) && pv_post_outb_hook )
2008 pv_post_outb_hook(port, regs->eax);
2010 else
2012 guest_io_write(port, op_bytes, regs->eax, v, regs);
2014 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
2015 goto done;
2017 case 0xee: /* OUT %al,%dx */
2018 op_bytes = 1;
2019 case 0xef: /* OUT %eax,%dx */
2020 port = (u16)regs->edx;
2021 goto exec_out;
2023 case 0xfa: /* CLI */
2024 case 0xfb: /* STI */
2025 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
2026 goto fail;
2027 /*
2028 * This is just too dangerous to allow, in my opinion. Consider if the
2029 * caller then tries to reenable interrupts using POPF: we can't trap
2030 * that and we'll end up with hard-to-debug lockups. Fast & loose will
2031 * do for us. :-)
2032 */
2033 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
2034 goto done;
2037 /* No decode of this single-byte opcode. */
2038 goto fail;
2040 twobyte_opcode:
2041 /*
2042 * All 2 and 3 byte opcodes, except RDTSC (0x31) and RDTSCP (0x1,0xF9)
2043 * are executable only from guest kernel mode (virtual ring 0).
2044 */
2045 opcode = insn_fetch(u8, code_base, eip, code_limit);
2046 if ( !guest_kernel_mode(v, regs) && (opcode != 0x1) && (opcode != 0x31) )
2047 goto fail;
2049 if ( lock && (opcode & ~3) != 0x20 )
2050 goto fail;
2051 switch ( opcode )
2053 case 0x1: /* RDTSCP */
2054 if ( (v->arch.guest_context.ctrlreg[4] & X86_CR4_TSD) &&
2055 !guest_kernel_mode(v, regs) )
2056 goto fail;
2057 if ( insn_fetch(u8, code_base, eip, code_limit) != 0xf9 )
2058 goto fail;
2059 pv_soft_rdtsc(v, regs, 1);
2060 break;
2062 case 0x06: /* CLTS */
2063 (void)do_fpu_taskswitch(0);
2064 break;
2066 case 0x09: /* WBINVD */
2067 /* Ignore the instruction if unprivileged. */
2068 if ( !cache_flush_permitted(v->domain) )
2069 /* Non-physdev domain attempted WBINVD; ignore for now since
2070 newer linux uses this in some start-of-day timing loops */
2072 else
2073 wbinvd();
2074 break;
2076 case 0x20: /* MOV CR?,<reg> */
2077 opcode = insn_fetch(u8, code_base, eip, code_limit);
2078 if ( opcode < 0xc0 )
2079 goto fail;
2080 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2081 modrm_rm |= (opcode >> 0) & 7;
2082 reg = decode_register(modrm_rm, regs, 0);
2083 switch ( modrm_reg )
2085 case 0: /* Read CR0 */
2086 *reg = (read_cr0() & ~X86_CR0_TS) |
2087 v->arch.guest_context.ctrlreg[0];
2088 break;
2090 case 2: /* Read CR2 */
2091 *reg = v->arch.guest_context.ctrlreg[2];
2092 break;
2094 case 3: /* Read CR3 */
2096 unsigned long mfn;
2098 if ( !is_pv_32on64_vcpu(v) )
2100 mfn = pagetable_get_pfn(v->arch.guest_table);
2101 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
2102 v->domain, mfn));
2104 #ifdef CONFIG_COMPAT
2105 else
2107 mfn = l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)));
2108 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
2109 v->domain, mfn));
2111 #endif
2112 /* PTs should not be shared */
2113 BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
2115 break;
2117 case 4: /* Read CR4 */
2118 *reg = v->arch.guest_context.ctrlreg[4];
2119 break;
2121 default:
2122 goto fail;
2124 break;
2126 case 0x21: /* MOV DR?,<reg> */ {
2127 unsigned long res;
2128 opcode = insn_fetch(u8, code_base, eip, code_limit);
2129 if ( opcode < 0xc0 )
2130 goto fail;
2131 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2132 modrm_rm |= (opcode >> 0) & 7;
2133 reg = decode_register(modrm_rm, regs, 0);
2134 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
2135 goto fail;
2136 *reg = res;
2137 break;
2140 case 0x22: /* MOV <reg>,CR? */
2141 opcode = insn_fetch(u8, code_base, eip, code_limit);
2142 if ( opcode < 0xc0 )
2143 goto fail;
2144 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2145 modrm_rm |= (opcode >> 0) & 7;
2146 reg = decode_register(modrm_rm, regs, 0);
2147 switch ( modrm_reg )
2149 case 0: /* Write CR0 */
2150 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
2152 gdprintk(XENLOG_WARNING,
2153 "Attempt to change unmodifiable CR0 flags.\n");
2154 goto fail;
2156 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
2157 break;
2159 case 2: /* Write CR2 */
2160 v->arch.guest_context.ctrlreg[2] = *reg;
2161 arch_set_cr2(v, *reg);
2162 break;
2164 case 3: /* Write CR3 */
2165 domain_lock(v->domain);
2166 if ( !is_pv_32on64_vcpu(v) )
2167 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
2168 #ifdef CONFIG_COMPAT
2169 else
2170 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
2171 #endif
2172 domain_unlock(v->domain);
2173 if ( rc == 0 ) /* not okay */
2174 goto fail;
2175 break;
2177 case 4: /* Write CR4 */
2178 v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(*reg);
2179 write_cr4(pv_guest_cr4_to_real_cr4(v));
2180 break;
2182 default:
2183 goto fail;
2185 break;
2187 case 0x23: /* MOV <reg>,DR? */
2188 opcode = insn_fetch(u8, code_base, eip, code_limit);
2189 if ( opcode < 0xc0 )
2190 goto fail;
2191 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2192 modrm_rm |= (opcode >> 0) & 7;
2193 reg = decode_register(modrm_rm, regs, 0);
2194 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
2195 goto fail;
2196 break;
2198 case 0x30: /* WRMSR */ {
2199 u32 eax = regs->eax;
2200 u32 edx = regs->edx;
2201 u64 val = ((u64)edx << 32) | eax;
2202 switch ( (u32)regs->ecx )
2204 #ifdef CONFIG_X86_64
2205 case MSR_FS_BASE:
2206 if ( is_pv_32on64_vcpu(v) )
2207 goto fail;
2208 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
2209 goto fail;
2210 v->arch.guest_context.fs_base = val;
2211 break;
2212 case MSR_GS_BASE:
2213 if ( is_pv_32on64_vcpu(v) )
2214 goto fail;
2215 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
2216 goto fail;
2217 v->arch.guest_context.gs_base_kernel = val;
2218 break;
2219 case MSR_SHADOW_GS_BASE:
2220 if ( is_pv_32on64_vcpu(v) )
2221 goto fail;
2222 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
2223 goto fail;
2224 v->arch.guest_context.gs_base_user = val;
2225 break;
2226 #endif
2227 case MSR_K7_FID_VID_STATUS:
2228 case MSR_K7_FID_VID_CTL:
2229 case MSR_K8_PSTATE_LIMIT:
2230 case MSR_K8_PSTATE_CTRL:
2231 case MSR_K8_PSTATE_STATUS:
2232 case MSR_K8_PSTATE0:
2233 case MSR_K8_PSTATE1:
2234 case MSR_K8_PSTATE2:
2235 case MSR_K8_PSTATE3:
2236 case MSR_K8_PSTATE4:
2237 case MSR_K8_PSTATE5:
2238 case MSR_K8_PSTATE6:
2239 case MSR_K8_PSTATE7:
2240 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2241 goto fail;
2242 if ( !is_cpufreq_controller(v->domain) )
2243 break;
2244 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2245 goto fail;
2246 break;
2247 case MSR_AMD64_NB_CFG:
2248 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2249 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
2250 goto fail;
2251 if ( !IS_PRIV(v->domain) )
2252 break;
2253 if ( (rdmsr_safe(MSR_AMD64_NB_CFG, l, h) != 0) ||
2254 (eax != l) ||
2255 ((edx ^ h) & ~(1 << (AMD64_NB_CFG_CF8_EXT_ENABLE_BIT - 32))) )
2256 goto invalid;
2257 if ( wrmsr_safe(MSR_AMD64_NB_CFG, eax, edx) != 0 )
2258 goto fail;
2259 break;
2260 case MSR_FAM10H_MMIO_CONF_BASE:
2261 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2262 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
2263 goto fail;
2264 if ( !IS_PRIV(v->domain) )
2265 break;
2266 if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, l, h) != 0) ||
2267 (((((u64)h << 32) | l) ^ val) &
2268 ~( FAM10H_MMIO_CONF_ENABLE |
2269 (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
2270 FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
2271 ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
2272 FAM10H_MMIO_CONF_BASE_SHIFT))) )
2273 goto invalid;
2274 if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, eax, edx) != 0 )
2275 goto fail;
2276 break;
2277 case MSR_IA32_MPERF:
2278 case MSR_IA32_APERF:
2279 case MSR_IA32_PERF_CTL:
2280 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2281 goto fail;
2282 if ( !is_cpufreq_controller(v->domain) )
2283 break;
2284 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2285 goto fail;
2286 break;
2287 case MSR_IA32_THERM_CONTROL:
2288 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2289 goto fail;
2290 if ( (v->domain->domain_id != 0) || !v->domain->is_pinned )
2291 break;
2292 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2293 goto fail;
2294 break;
2295 default:
2296 if ( wrmsr_hypervisor_regs(regs->ecx, val) )
2297 break;
2299 rc = mce_wrmsr(regs->ecx, val);
2300 if ( rc < 0 )
2301 goto fail;
2302 if ( rc )
2303 break;
2305 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
2306 (eax != l) || (edx != h) )
2307 invalid:
2308 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
2309 "%08x:%08x to %08x:%08x.\n",
2310 _p(regs->ecx), h, l, edx, eax);
2311 break;
2313 break;
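/*
 * Summary of the default WRMSR policy above: unrecognised MSRs are first
 * offered to wrmsr_hypervisor_regs() and mce_wrmsr(); failing that, a
 * write is tolerated silently only if it would leave the MSR unchanged
 * (read back via rdmsr_safe() and compared), otherwise it is logged
 * through the "invalid" path and dropped rather than faulting the guest.
 */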
2316 case 0x31: /* RDTSC */
2317 if ( (v->arch.guest_context.ctrlreg[4] & X86_CR4_TSD) &&
2318 !guest_kernel_mode(v, regs) )
2319 goto fail;
2320 if ( v->domain->arch.vtsc )
2321 pv_soft_rdtsc(v, regs, 0);
2322 else
2323 rdtsc(regs->eax, regs->edx);
2324 break;
2326 case 0x32: /* RDMSR */
2327 switch ( (u32)regs->ecx )
2329 #ifdef CONFIG_X86_64
2330 case MSR_FS_BASE:
2331 if ( is_pv_32on64_vcpu(v) )
2332 goto fail;
2333 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
2334 regs->edx = v->arch.guest_context.fs_base >> 32;
2335 break;
2336 case MSR_GS_BASE:
2337 if ( is_pv_32on64_vcpu(v) )
2338 goto fail;
2339 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
2340 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
2341 break;
2342 case MSR_SHADOW_GS_BASE:
2343 if ( is_pv_32on64_vcpu(v) )
2344 goto fail;
2345 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
2346 regs->edx = v->arch.guest_context.gs_base_user >> 32;
2347 break;
2348 #endif
2349 case MSR_K7_FID_VID_CTL:
2350 case MSR_K7_FID_VID_STATUS:
2351 case MSR_K8_PSTATE_LIMIT:
2352 case MSR_K8_PSTATE_CTRL:
2353 case MSR_K8_PSTATE_STATUS:
2354 case MSR_K8_PSTATE0:
2355 case MSR_K8_PSTATE1:
2356 case MSR_K8_PSTATE2:
2357 case MSR_K8_PSTATE3:
2358 case MSR_K8_PSTATE4:
2359 case MSR_K8_PSTATE5:
2360 case MSR_K8_PSTATE6:
2361 case MSR_K8_PSTATE7:
2362 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2363 goto fail;
2364 if ( !is_cpufreq_controller(v->domain) )
2366 regs->eax = regs->edx = 0;
2367 break;
2369 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) != 0 )
2370 goto fail;
2371 break;
2372 case MSR_IA32_MISC_ENABLE:
2373 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2374 goto fail;
2375 regs->eax &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
2376 MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
2377 regs->eax |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
2378 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
2379 MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
2380 break;
2381 case MSR_EFER:
2382 case MSR_AMD_PATCHLEVEL:
2383 default:
2384 if ( rdmsr_hypervisor_regs(regs->ecx, &val) )
2386 rdmsr_writeback:
2387 regs->eax = (uint32_t)val;
2388 regs->edx = (uint32_t)(val >> 32);
2389 break;
2392 rc = mce_rdmsr(regs->ecx, &val);
2393 if ( rc < 0 )
2394 goto fail;
2395 if ( rc )
2396 goto rdmsr_writeback;
2398 /* Everyone can read the MSR space. */
2399 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
2400 _p(regs->ecx));*/
2401 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2402 goto fail;
2403 break;
2405 break;
2407 default:
2408 goto fail;
2411 #undef wr_ad
2412 #undef rd_ad
2414 done:
2415 instruction_done(regs, eip, bpmatch);
2416 skip:
2417 return EXCRET_fault_fixed;
2419 fail:
2420 return 0;
2423 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
2424 unsigned int esp, unsigned int decr)
2426 return (((esp - decr) < (esp - 1)) &&
2427 (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
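/*
 * check_stack_limit(): the first clause requires, roughly, that the bytes
 * about to be pushed, [esp - decr, esp - 1], do not wrap below zero; the
 * second applies the segment limit in the architecturally correct
 * direction -- for a normal (expand-up) data segment the highest byte
 * touched, esp - 1, must lie within the limit, while for an expand-down
 * segment (_SEGMENT_EC) the lowest byte touched, esp - decr, must lie
 * strictly above it.  E.g. an 8-byte push to esp = 0x1000 on an expand-up
 * segment with limit 0xffff touches 0xff8..0xfff and is allowed.
 */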
2430 static void emulate_gate_op(struct cpu_user_regs *regs)
2432 #ifdef __x86_64__
2433 struct vcpu *v = current;
2434 unsigned int sel, ar, dpl, nparm, opnd_sel;
2435 unsigned int op_default, op_bytes, ad_default, ad_bytes;
2436 unsigned long off, eip, opnd_off, base, limit;
2437 int jump;
2439 /* Check whether this fault is due to the use of a call gate. */
2440 if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
2441 (((ar >> 13) & 3) < (regs->cs & 3)) ||
2442 ((ar & _SEGMENT_TYPE) != 0xc00) )
2444 do_guest_trap(TRAP_gp_fault, regs, 1);
2445 return;
2447 if ( !(ar & _SEGMENT_P) )
2449 do_guest_trap(TRAP_no_segment, regs, 1);
2450 return;
2452 dpl = (ar >> 13) & 3;
2453 nparm = ar & 0x1f;
2455 /*
2456 * Decode instruction (and perhaps operand) to determine RPL,
2457 * whether this is a jump or a call, and the call return offset.
2458 */
2459 if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
2460 !(ar & _SEGMENT_S) ||
2461 !(ar & _SEGMENT_P) ||
2462 !(ar & _SEGMENT_CODE) )
2464 do_guest_trap(TRAP_gp_fault, regs, 1);
2465 return;
2468 op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
2469 ad_default = ad_bytes = op_default;
2470 opnd_sel = opnd_off = 0;
2471 jump = -1; /* <0: no far CALL/JMP decoded; 0: far CALL; >0: far JMP */
2472 for ( eip = regs->eip; eip - regs->_eip < 10; )
2474 switch ( insn_fetch(u8, base, eip, limit) )
2476 case 0x66: /* operand-size override */
2477 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
2478 continue;
2479 case 0x67: /* address-size override */
2480 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
2481 continue;
2482 case 0x2e: /* CS override */
2483 opnd_sel = regs->cs;
2484 ASSERT(opnd_sel);
2485 continue;
2486 case 0x3e: /* DS override */
2487 opnd_sel = read_sreg(regs, ds);
2488 if ( !opnd_sel )
2489 opnd_sel = dpl;
2490 continue;
2491 case 0x26: /* ES override */
2492 opnd_sel = read_sreg(regs, es);
2493 if ( !opnd_sel )
2494 opnd_sel = dpl;
2495 continue;
2496 case 0x64: /* FS override */
2497 opnd_sel = read_sreg(regs, fs);
2498 if ( !opnd_sel )
2499 opnd_sel = dpl;
2500 continue;
2501 case 0x65: /* GS override */
2502 opnd_sel = read_sreg(regs, gs);
2503 if ( !opnd_sel )
2504 opnd_sel = dpl;
2505 continue;
2506 case 0x36: /* SS override */
2507 opnd_sel = regs->ss;
2508 if ( !opnd_sel )
2509 opnd_sel = dpl;
2510 continue;
2511 case 0xea:
2512 ++jump;
2513 /* FALLTHROUGH */
2514 case 0x9a:
2515 ++jump;
2516 opnd_sel = regs->cs;
2517 opnd_off = eip;
2518 ad_bytes = ad_default;
2519 eip += op_bytes + 2;
2520 break;
2521 case 0xff:
2523 unsigned int modrm;
2525 switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
2527 case 0x28: case 0x68: case 0xa8:
2528 ++jump;
2529 /* FALLTHROUGH */
2530 case 0x18: case 0x58: case 0x98:
2531 ++jump;
2532 if ( ad_bytes != 2 )
2534 if ( (modrm & 7) == 4 )
2536 unsigned int sib;
2537 sib = insn_fetch(u8, base, eip, limit);
2539 modrm = (modrm & ~7) | (sib & 7);
2540 if ( (sib >>= 3) != 4 )
2541 opnd_off = *(unsigned long *)
2542 decode_register(sib & 7, regs, 0);
2543 opnd_off <<= sib >> 3;
2545 if ( (modrm & 7) != 5 || (modrm & 0xc0) )
2546 opnd_off += *(unsigned long *)
2547 decode_register(modrm & 7, regs, 0);
2548 else
2549 modrm |= 0x87;
2550 if ( !opnd_sel )
2552 switch ( modrm & 7 )
2554 default:
2555 opnd_sel = read_sreg(regs, ds);
2556 break;
2557 case 4: case 5:
2558 opnd_sel = regs->ss;
2559 break;
2563 else
2565 switch ( modrm & 7 )
2567 case 0: case 1: case 7:
2568 opnd_off = regs->ebx;
2569 break;
2570 case 6:
2571 if ( !(modrm & 0xc0) )
2572 modrm |= 0x80;
2573 else
2574 case 2: case 3:
2576 opnd_off = regs->ebp;
2577 if ( !opnd_sel )
2578 opnd_sel = regs->ss;
2580 break;
2582 if ( !opnd_sel )
2583 opnd_sel = read_sreg(regs, ds);
2584 switch ( modrm & 7 )
2586 case 0: case 2: case 4:
2587 opnd_off += regs->esi;
2588 break;
2589 case 1: case 3: case 5:
2590 opnd_off += regs->edi;
2591 break;
2594 switch ( modrm & 0xc0 )
2596 case 0x40:
2597 opnd_off += insn_fetch(s8, base, eip, limit);
2598 break;
2599 case 0x80:
2600 opnd_off += insn_fetch(s32, base, eip, limit);
2601 break;
2603 if ( ad_bytes == 4 )
2604 opnd_off = (unsigned int)opnd_off;
2605 else if ( ad_bytes == 2 )
2606 opnd_off = (unsigned short)opnd_off;
2607 break;
2610 break;
2612 break;
2615 if ( jump < 0 )
2617 fail:
2618 do_guest_trap(TRAP_gp_fault, regs, 1);
2619 skip:
2620 return;
2623 if ( (opnd_sel != regs->cs &&
2624 !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
2625 !(ar & _SEGMENT_S) ||
2626 !(ar & _SEGMENT_P) ||
2627 ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
2629 do_guest_trap(TRAP_gp_fault, regs, 1);
2630 return;
2633 opnd_off += op_bytes;
2634 #define ad_default ad_bytes
2635 opnd_sel = insn_fetch(u16, base, opnd_off, limit);
2636 #undef ad_default
2637 ASSERT((opnd_sel & ~3) == regs->error_code);
2638 if ( dpl < (opnd_sel & 3) )
2640 do_guest_trap(TRAP_gp_fault, regs, 1);
2641 return;
2644 if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
2645 !(ar & _SEGMENT_S) ||
2646 !(ar & _SEGMENT_CODE) ||
2647 (!jump || (ar & _SEGMENT_EC) ?
2648 ((ar >> 13) & 3) > (regs->cs & 3) :
2649 ((ar >> 13) & 3) != (regs->cs & 3)) )
2651 regs->error_code = sel;
2652 do_guest_trap(TRAP_gp_fault, regs, 1);
2653 return;
2655 if ( !(ar & _SEGMENT_P) )
2657 regs->error_code = sel;
2658 do_guest_trap(TRAP_no_segment, regs, 1);
2659 return;
2661 if ( off > limit )
2663 regs->error_code = 0;
2664 do_guest_trap(TRAP_gp_fault, regs, 1);
2665 return;
2668 if ( !jump )
2670 unsigned int ss, esp, *stkp;
2671 int rc;
2672 #define push(item) do \
2673 { \
2674 --stkp; \
2675 esp -= 4; \
2676 rc = __put_user(item, stkp); \
2677 if ( rc ) \
2678 { \
2679 propagate_page_fault((unsigned long)(stkp + 1) - rc, \
2680 PFEC_write_access); \
2681 return; \
2682 } \
2683 } while ( 0 )
2685 if ( ((ar >> 13) & 3) < (regs->cs & 3) )
2687 sel |= (ar >> 13) & 3;
2688 /* Inner stack known only for kernel ring. */
2689 if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
2691 do_guest_trap(TRAP_gp_fault, regs, 1);
2692 return;
2694 esp = v->arch.guest_context.kernel_sp;
2695 ss = v->arch.guest_context.kernel_ss;
2696 if ( (ss & 3) != (sel & 3) ||
2697 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2698 ((ar >> 13) & 3) != (sel & 3) ||
2699 !(ar & _SEGMENT_S) ||
2700 (ar & _SEGMENT_CODE) ||
2701 !(ar & _SEGMENT_WR) )
2703 regs->error_code = ss & ~3;
2704 do_guest_trap(TRAP_invalid_tss, regs, 1);
2705 return;
2707 if ( !(ar & _SEGMENT_P) ||
2708 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
2710 regs->error_code = ss & ~3;
2711 do_guest_trap(TRAP_stack_error, regs, 1);
2712 return;
2714 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2715 if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
2717 do_guest_trap(TRAP_gp_fault, regs, 1);
2718 return;
2720 push(regs->ss);
2721 push(regs->esp);
2722 if ( nparm )
2724 const unsigned int *ustkp;
2726 if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
2727 ((ar >> 13) & 3) != (regs->cs & 3) ||
2728 !(ar & _SEGMENT_S) ||
2729 (ar & _SEGMENT_CODE) ||
2730 !(ar & _SEGMENT_WR) ||
2731 !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
2732 return do_guest_trap(TRAP_gp_fault, regs, 1);
2733 ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
2734 if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
2736 do_guest_trap(TRAP_gp_fault, regs, 1);
2737 return;
2739 do
2741 unsigned int parm;
2743 --ustkp;
2744 rc = __get_user(parm, ustkp);
2745 if ( rc )
2747 propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
2748 return;
2750 push(parm);
2751 } while ( --nparm );
2754 else
2756 sel |= (regs->cs & 3);
2757 esp = regs->esp;
2758 ss = regs->ss;
2759 if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2760 ((ar >> 13) & 3) != (sel & 3) )
2762 do_guest_trap(TRAP_gp_fault, regs, 1);
2763 return;
2765 if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
2767 regs->error_code = 0;
2768 do_guest_trap(TRAP_stack_error, regs, 1);
2769 return;
2771 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2772 if ( !compat_access_ok(stkp - 2, 2 * 4) )
2774 do_guest_trap(TRAP_gp_fault, regs, 1);
2775 return;
2778 push(regs->cs);
2779 push(eip);
2780 #undef push
2781 regs->esp = esp;
2782 regs->ss = ss;
2784 else
2785 sel |= (regs->cs & 3);
2787 regs->cs = sel;
2788 instruction_done(regs, off, 0);
2789 #endif
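/*
 * emulate_gate_op() in one paragraph: it is compiled only for x86_64 and is
 * reached from do_general_protection() below for 32-on-64 PV guests whose
 * far CALL/JMP through a call gate faulted.  It re-fetches and decodes the
 * faulting instruction, validates the gate and the target code segment much
 * as hardware would, hand-builds the inter-privilege stack frame for the
 * CALL case, and finally redirects the guest to the gate target via
 * instruction_done().
 */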
2792 asmlinkage void do_general_protection(struct cpu_user_regs *regs)
2794 struct vcpu *v = current;
2795 unsigned long fixup;
2797 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
2799 if ( regs->error_code & 1 )
2800 goto hardware_gp;
2802 if ( !guest_mode(regs) )
2803 goto gp_in_kernel;
2805 /*
2806 * Cunning trick to allow arbitrary "INT n" handling.
2808 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
2809 * instruction from trapping to the appropriate vector, when that might not
2810 * be expected by Xen or the guest OS. For example, that entry might be for
2811 * a fault handler (unlike traps, faults don't increment EIP), or might
2812 * expect an error code on the stack (which a software trap never
2813 * provides), or might be a hardware interrupt handler that doesn't like
2814 * being called spuriously.
2816 * Instead, a GPF occurs with the faulting IDT vector in the error code.
2817 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
2818 * clear to indicate that it's a software fault, not hardware.
2820 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
2821 * okay because they can only be triggered by an explicit DPL-checked
2822 * instruction. The DPL specified by the guest OS for these vectors is NOT
2823 * CHECKED!!
2824 */
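/*
 * Worked example of the encoding described above: a guest executing
 * "int $0x30" from ring 1 or 3 fails the DPL check, so the CPU raises #GP
 * with error code (0x30 << 3) | 2 = 0x182 -- bit 1 set (IDT entry), bit 0
 * clear (software, not external).  The code below recovers the vector as
 * error_code >> 3 and, if the guest's own DPL for that vector permits it,
 * bounces the trap to the guest handler with EIP advanced past the
 * two-byte INT instruction.
 */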
2825 if ( (regs->error_code & 3) == 2 )
2827 /* This fault must be due to <INT n> instruction. */
2828 const struct trap_info *ti;
2829 unsigned char vector = regs->error_code >> 3;
2830 ti = &v->arch.guest_context.trap_ctxt[vector];
2831 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
2833 regs->eip += 2;
2834 do_guest_trap(vector, regs, 0);
2835 return;
2838 else if ( is_pv_32on64_vcpu(v) && regs->error_code )
2840 emulate_gate_op(regs);
2841 return;
2844 /* Emulate some simple privileged and I/O instructions. */
2845 if ( (regs->error_code == 0) &&
2846 emulate_privileged_op(regs) )
2848 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
2849 return;
2852 #if defined(__i386__)
2853 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
2854 (regs->error_code == 0) &&
2855 gpf_emulate_4gb(regs) )
2857 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
2858 return;
2860 #endif
2862 /* Pass on GPF as is. */
2863 do_guest_trap(TRAP_gp_fault, regs, 1);
2864 return;
2866 gp_in_kernel:
2868 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
2870 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
2871 regs->error_code, _p(regs->eip), _p(fixup));
2872 regs->eip = fixup;
2873 return;
2876 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
2878 hardware_gp:
2879 show_execution_state(regs);
2880 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
2883 static DEFINE_PER_CPU(struct softirq_trap, softirq_trap);
2885 static void nmi_mce_softirq(void)
2887 int cpu = smp_processor_id();
2888 struct softirq_trap *st = &per_cpu(softirq_trap, cpu);
2889 cpumask_t affinity;
2891 BUG_ON(st == NULL);
2892 BUG_ON(st->vcpu == NULL);
2894 /* Set the tmp value unconditionally, so that
2895 * the check in the iret hypercall works. */
2896 st->vcpu->cpu_affinity_tmp = st->vcpu->cpu_affinity;
2898 if ((cpu != st->processor)
2899 || (st->processor != st->vcpu->processor))
2901 /* We are on a different physical CPU.
2902 * Make sure to wake up the vcpu on the
2903 * specified processor.
2904 */
2905 cpus_clear(affinity);
2906 cpu_set(st->processor, affinity);
2907 vcpu_set_affinity(st->vcpu, &affinity);
2909 /* Affinity is restored in the iret hypercall. */
2912 /* Only used to defer wakeup of domain/vcpu to
2913 * a safe (non-NMI/MCE) context.
2914 */
2915 vcpu_kick(st->vcpu);
2916 st->vcpu = NULL;
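/*
 * nmi_mce_softirq() exists because NMI/MCE context is too restricted to
 * wake a vcpu directly: send_guest_trap() (further down) just records the
 * target in the per-cpu softirq_trap slot and raises NMI_MCE_SOFTIRQ, and
 * this handler, running in ordinary softirq context, pins the vcpu to the
 * processor recorded at send time if it has since moved and then issues
 * the vcpu_kick().
 */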
2919 void async_exception_cleanup(struct vcpu *curr)
2921 int trap;
2923 if ( !curr->async_exception_mask )
2924 return;
2926 /* Restore affinity. */
2927 if ( !cpus_empty(curr->cpu_affinity_tmp) &&
2928 !cpus_equal(curr->cpu_affinity_tmp, curr->cpu_affinity) )
2930 vcpu_set_affinity(curr, &curr->cpu_affinity_tmp);
2931 cpus_clear(curr->cpu_affinity_tmp);
2934 if ( !(curr->async_exception_mask & (curr->async_exception_mask - 1)) )
2935 trap = __scanbit(curr->async_exception_mask, VCPU_TRAP_NONE);
2936 else
2937 for ( trap = VCPU_TRAP_NONE + 1; trap <= VCPU_TRAP_LAST; ++trap )
2938 if ( (curr->async_exception_mask ^
2939 curr->async_exception_state(trap).old_mask) == (1 << trap) )
2940 break;
2941 ASSERT(trap <= VCPU_TRAP_LAST);
2943 /* Inject vMCE to PV guests, including dom0. */
2944 if ( trap == VCPU_TRAP_MCE )
2946 gdprintk(XENLOG_DEBUG, "MCE: Return from vMCE# trap!\n");
2947 if ( curr->vcpu_id == 0 )
2949 struct domain *d = curr->domain;
2951 if ( !d->arch.vmca_msrs.nr_injection )
2953 printk(XENLOG_WARNING "MCE: ret from vMCE#, "
2954 "no injection node\n");
2955 goto end;
2958 d->arch.vmca_msrs.nr_injection--;
2959 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
2961 struct bank_entry *entry;
2963 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
2964 struct bank_entry, list);
2965 gdprintk(XENLOG_DEBUG, "MCE: delete last injection node\n");
2966 list_del(&entry->list);
2968 else
2969 printk(XENLOG_ERR "MCE: didn't find last injection node\n");
2971 /* further injection */
2972 if ( d->arch.vmca_msrs.nr_injection > 0 &&
2973 guest_has_trap_callback(d, 0, TRAP_machine_check) &&
2974 !test_and_set_bool(curr->mce_pending) )
2976 int cpu = smp_processor_id();
2977 cpumask_t affinity;
2979 curr->cpu_affinity_tmp = curr->cpu_affinity;
2980 cpus_clear(affinity);
2981 cpu_set(cpu, affinity);
2982 printk(XENLOG_DEBUG "MCE: CPU%d set affinity, old %d\n",
2983 cpu, curr->processor);
2984 vcpu_set_affinity(curr, &affinity);
2989 end:
2990 /* Restore previous asynchronous exception mask. */
2991 curr->async_exception_mask = curr->async_exception_state(trap).old_mask;
2994 static void nmi_dom0_report(unsigned int reason_idx)
2996 struct domain *d = dom0;
2998 if ( (d == NULL) || (d->vcpu == NULL) || (d->vcpu[0] == NULL) )
2999 return;
3001 set_bit(reason_idx, nmi_reason(d));
3003 send_guest_trap(d, 0, TRAP_nmi);
3006 static void mem_parity_error(struct cpu_user_regs *regs)
3008 switch ( opt_nmi[0] )
3010 case 'd': /* 'dom0' */
3011 nmi_dom0_report(_XEN_NMIREASON_parity_error);
3012 case 'i': /* 'ignore' */
3013 break;
3014 default: /* 'fatal' */
3015 console_force_unlock();
3016 printk("\n\nNMI - MEMORY ERROR\n");
3017 fatal_trap(TRAP_nmi, regs);
3020 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
3021 mdelay(1);
3022 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
3025 static void io_check_error(struct cpu_user_regs *regs)
3027 switch ( opt_nmi[0] )
3029 case 'd': /* 'dom0' */
3030 nmi_dom0_report(_XEN_NMIREASON_io_error);
3031 case 'i': /* 'ignore' */
3032 break;
3033 default: /* 'fatal' */
3034 console_force_unlock();
3035 printk("\n\nNMI - I/O ERROR\n");
3036 fatal_trap(TRAP_nmi, regs);
3039 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
3040 mdelay(1);
3041 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
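/*
 * The port 0x61 accesses in mem_parity_error() and io_check_error() follow
 * the conventional PC "system control port B" layout: on reads, bit 7
 * reports a memory parity/SERR NMI and bit 6 an I/O channel check NMI
 * (tested in do_nmi() below); setting bit 2 or bit 3 temporarily disables
 * (and clears) the respective check, and clearing it again re-arms it once
 * the source has been dealt with.
 */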
3044 static void unknown_nmi_error(unsigned char reason)
3046 switch ( opt_nmi[0] )
3048 case 'd': /* 'dom0' */
3049 nmi_dom0_report(_XEN_NMIREASON_unknown);
3050 case 'i': /* 'ignore' */
3051 break;
3052 default: /* 'fatal' */
3053 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
3054 printk("Dazed and confused, but trying to continue\n");
3055 printk("Do you have a strange power saving mode enabled?\n");
3056 kexec_crash();
3060 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
3062 return 0;
3065 static nmi_callback_t nmi_callback = dummy_nmi_callback;
3067 asmlinkage void do_nmi(struct cpu_user_regs *regs)
3069 unsigned int cpu = smp_processor_id();
3070 unsigned char reason;
3072 ++nmi_count(cpu);
3074 if ( nmi_callback(regs, cpu) )
3075 return;
3077 if ( nmi_watchdog )
3078 nmi_watchdog_tick(regs);
3080 /* Only the BSP gets external NMIs from the system. */
3081 if ( cpu == 0 )
3083 reason = inb(0x61);
3084 if ( reason & 0x80 )
3085 mem_parity_error(regs);
3086 else if ( reason & 0x40 )
3087 io_check_error(regs);
3088 else if ( !nmi_watchdog )
3089 unknown_nmi_error((unsigned char)(reason&0xff));
3093 void set_nmi_callback(nmi_callback_t callback)
3095 nmi_callback = callback;
3098 void unset_nmi_callback(void)
3100 nmi_callback = dummy_nmi_callback;
3103 asmlinkage void do_device_not_available(struct cpu_user_regs *regs)
3105 struct vcpu *curr = current;
3107 BUG_ON(!guest_mode(regs));
3109 setup_fpu(curr);
3111 if ( curr->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
3113 do_guest_trap(TRAP_no_device, regs, 0);
3114 curr->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
3116 else
3117 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
3119 return;
3122 asmlinkage void do_debug(struct cpu_user_regs *regs)
3124 struct vcpu *v = current;
3126 DEBUGGER_trap_entry(TRAP_debug, regs);
3128 if ( !guest_mode(regs) )
3130 if ( regs->eflags & X86_EFLAGS_TF )
3132 #ifdef __x86_64__
3133 void sysenter_entry(void);
3134 void sysenter_eflags_saved(void);
3135 /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
3136 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
3137 (regs->rip <= (unsigned long)sysenter_eflags_saved) )
3139 if ( regs->rip == (unsigned long)sysenter_eflags_saved )
3140 regs->eflags &= ~X86_EFLAGS_TF;
3141 goto out;
3143 #endif
3144 if ( !debugger_trap_fatal(TRAP_debug, regs) )
3146 WARN_ON(1);
3147 regs->eflags &= ~X86_EFLAGS_TF;
3150 else
3152 /*
3153 * We ignore watchpoints when they trigger within Xen. This may
3154 * happen when a buffer is passed to us which previously had a
3155 * watchpoint set on it. No need to bump EIP; the only faulting
3156 * trap is an instruction breakpoint, which can't happen to us.
3157 */
3158 WARN_ON(!search_exception_table(regs->eip));
3160 goto out;
3163 /* Save debug status register where guest OS can peek at it */
3164 v->arch.guest_context.debugreg[6] = read_debugreg(6);
3166 ler_enable();
3167 do_guest_trap(TRAP_debug, regs, 0);
3168 return;
3170 out:
3171 ler_enable();
3172 return;
3175 asmlinkage void do_spurious_interrupt_bug(struct cpu_user_regs *regs)
3179 static void __set_intr_gate(unsigned int n, uint32_t dpl, void *addr)
3181 int i;
3182 /* Keep secondary tables in sync with IRQ updates. */
3183 for ( i = 1; i < NR_CPUS; i++ )
3184 if ( idt_tables[i] != NULL )
3185 _set_gate(&idt_tables[i][n], 14, dpl, addr);
3186 _set_gate(&idt_table[n], 14, dpl, addr);
3189 static void set_swint_gate(unsigned int n, void *addr)
3191 __set_intr_gate(n, 3, addr);
3194 void set_intr_gate(unsigned int n, void *addr)
3196 __set_intr_gate(n, 0, addr);
3199 void load_TR(void)
3201 struct tss_struct *tss = &this_cpu(init_tss);
3202 struct desc_ptr old_gdt, tss_gdt = {
3203 .base = (long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY),
3204 .limit = LAST_RESERVED_GDT_BYTE
3205 };
3207 _set_tssldt_desc(
3208 this_cpu(gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
3209 (unsigned long)tss,
3210 offsetof(struct tss_struct, __cacheline_filler) - 1,
3211 9);
3212 #ifdef CONFIG_COMPAT
3213 _set_tssldt_desc(
3214 this_cpu(compat_gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
3215 (unsigned long)tss,
3216 offsetof(struct tss_struct, __cacheline_filler) - 1,
3217 11);
3218 #endif
3220 /* Switch to non-compat GDT (which has B bit clear) to execute LTR. */
3221 asm volatile (
3222 "sgdt %0; lgdt %2; ltr %w1; lgdt %0"
3223 : "=m" (old_gdt) : "rm" (TSS_ENTRY << 3), "m" (tss_gdt) : "memory" );
3226 void __devinit percpu_traps_init(void)
3228 subarch_percpu_traps_init();
3230 if ( !opt_ler )
3231 return;
3233 switch ( boot_cpu_data.x86_vendor )
3235 case X86_VENDOR_INTEL:
3236 switch ( boot_cpu_data.x86 )
3238 case 6:
3239 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
3240 break;
3241 case 15:
3242 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
3243 break;
3245 break;
3246 case X86_VENDOR_AMD:
3247 switch ( boot_cpu_data.x86 )
3249 case 6:
3250 case 15:
3251 case 16:
3252 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
3253 break;
3255 break;
3258 ler_enable();
3261 void __init trap_init(void)
3263 /*
3264 * Note that interrupt gates are always used, rather than trap gates. We
3265 * must have interrupts disabled until DS/ES/FS/GS are saved because the
3266 * first activation must have the "bad" value(s) for these registers and
3267 * we may lose them if another activation is installed before they are
3268 * saved. The page-fault handler also needs interrupts disabled until %cr2
3269 * has been read and saved on the stack.
3270 */
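/*
 * (The distinction matters because an interrupt gate clears EFLAGS.IF on
 * entry whereas a trap gate leaves it set; using interrupt gates for
 * everything provides the interrupts-off window the comment above relies
 * on.)
 */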
3271 set_intr_gate(TRAP_divide_error,&divide_error);
3272 set_intr_gate(TRAP_debug,&debug);
3273 set_intr_gate(TRAP_nmi,&nmi);
3274 set_swint_gate(TRAP_int3,&int3); /* usable from all privileges */
3275 set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */
3276 set_intr_gate(TRAP_bounds,&bounds);
3277 set_intr_gate(TRAP_invalid_op,&invalid_op);
3278 set_intr_gate(TRAP_no_device,&device_not_available);
3279 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
3280 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
3281 set_intr_gate(TRAP_no_segment,&segment_not_present);
3282 set_intr_gate(TRAP_stack_error,&stack_segment);
3283 set_intr_gate(TRAP_gp_fault,&general_protection);
3284 set_intr_gate(TRAP_page_fault,&page_fault);
3285 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
3286 set_intr_gate(TRAP_copro_error,&coprocessor_error);
3287 set_intr_gate(TRAP_alignment_check,&alignment_check);
3288 set_intr_gate(TRAP_machine_check,&machine_check);
3289 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
3291 /* CPU0 uses the master IDT. */
3292 idt_tables[0] = idt_table;
3294 percpu_traps_init();
3296 cpu_init();
3298 open_softirq(NMI_MCE_SOFTIRQ, nmi_mce_softirq);
3301 long register_guest_nmi_callback(unsigned long address)
3303 struct vcpu *v = current;
3304 struct domain *d = v->domain;
3305 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3307 t->vector = TRAP_nmi;
3308 t->flags = 0;
3309 t->cs = (is_pv_32on64_domain(d) ?
3310 FLAT_COMPAT_KERNEL_CS : FLAT_KERNEL_CS);
3311 t->address = address;
3312 TI_SET_IF(t, 1);
3314 /*
3315 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
3316 * now.
3317 */
3318 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
3319 v->nmi_pending = 1;
3321 return 0;
3324 long unregister_guest_nmi_callback(void)
3326 struct vcpu *v = current;
3327 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3329 memset(t, 0, sizeof(*t));
3331 return 0;
3334 int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3336 struct vcpu *v;
3337 struct trap_info *t;
3339 BUG_ON(d == NULL);
3340 BUG_ON(vcpuid >= d->max_vcpus);
3342 /* Sanity check - XXX should be more fine-grained. */
3343 BUG_ON(trap_nr > TRAP_syscall);
3345 v = d->vcpu[vcpuid];
3346 t = &v->arch.guest_context.trap_ctxt[trap_nr];
3348 return (t->address != 0);
3352 int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3354 struct vcpu *v;
3355 struct softirq_trap *st = &per_cpu(softirq_trap, smp_processor_id());
3357 BUG_ON(d == NULL);
3358 BUG_ON(vcpuid >= d->max_vcpus);
3359 v = d->vcpu[vcpuid];
3361 switch (trap_nr) {
3362 case TRAP_nmi:
3363 if ( cmpxchgptr(&st->vcpu, NULL, v) )
3364 return -EBUSY;
3365 if ( !test_and_set_bool(v->nmi_pending) ) {
3366 st->domain = d;
3367 st->processor = v->processor;
3369 /* not safe to wake up a vcpu here */
3370 raise_softirq(NMI_MCE_SOFTIRQ);
3371 return 0;
3373 st->vcpu = NULL;
3374 break;
3376 case TRAP_machine_check:
3377 if ( cmpxchgptr(&st->vcpu, NULL, v) )
3378 return -EBUSY;
3380 /* We are called by the machine check (exception or polling) handlers
3381 * on the physical CPU that reported a machine check error. */
3383 if ( !test_and_set_bool(v->mce_pending) ) {
3384 st->domain = d;
3385 st->vcpu = v;
3386 st->processor = v->processor;
3388 /* not safe to wake up a vcpu here */
3389 raise_softirq(NMI_MCE_SOFTIRQ);
3390 return 0;
3392 st->vcpu = NULL;
3393 break;
3396 /* delivery failed */
3397 return -EIO;
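/*
 * send_guest_trap() claims the per-cpu softirq_trap slot with cmpxchgptr()
 * so that at most one NMI/MCE delivery per physical CPU is in flight; a
 * concurrent caller gets -EBUSY and must retry.  The slot is released here
 * if the vcpu already had the event pending, or by nmi_mce_softirq() once
 * the vcpu has been kicked.
 */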
3401 long do_set_trap_table(XEN_GUEST_HANDLE(const_trap_info_t) traps)
3403 struct trap_info cur;
3404 struct vcpu *curr = current;
3405 struct trap_info *dst = curr->arch.guest_context.trap_ctxt;
3406 long rc = 0;
3408 /* If no table is presented then clear the entire virtual IDT. */
3409 if ( guest_handle_is_null(traps) )
3411 memset(dst, 0, 256 * sizeof(*dst));
3412 init_int80_direct_trap(curr);
3413 return 0;
3416 for ( ; ; )
3418 if ( hypercall_preempt_check() )
3420 rc = hypercall_create_continuation(
3421 __HYPERVISOR_set_trap_table, "h", traps);
3422 break;
3425 if ( copy_from_guest(&cur, traps, 1) )
3427 rc = -EFAULT;
3428 break;
3431 if ( cur.address == 0 )
3432 break;
3434 fixup_guest_code_selector(curr->domain, cur.cs);
3436 memcpy(&dst[cur.vector], &cur, sizeof(cur));
3438 if ( cur.vector == 0x80 )
3439 init_int80_direct_trap(curr);
3441 guest_handle_add_offset(traps, 1);
3444 return rc;
3447 long set_debugreg(struct vcpu *v, int reg, unsigned long value)
3449 int i;
3450 struct vcpu *curr = current;
3452 switch ( reg )
3454 case 0:
3455 if ( !access_ok(value, sizeof(long)) )
3456 return -EPERM;
3457 if ( v == curr )
3458 write_debugreg(0, value);
3459 break;
3460 case 1:
3461 if ( !access_ok(value, sizeof(long)) )
3462 return -EPERM;
3463 if ( v == curr )
3464 write_debugreg(1, value);
3465 break;
3466 case 2:
3467 if ( !access_ok(value, sizeof(long)) )
3468 return -EPERM;
3469 if ( v == curr )
3470 write_debugreg(2, value);
3471 break;
3472 case 3:
3473 if ( !access_ok(value, sizeof(long)) )
3474 return -EPERM;
3475 if ( v == curr )
3476 write_debugreg(3, value);
3477 break;
3478 case 6:
3479 /*
3480 * DR6: Bits 4-11,16-31 reserved (set to 1).
3481 * Bit 12 reserved (set to 0).
3482 */
3483 value &= 0xffffefff; /* reserved bits => 0 */
3484 value |= 0xffff0ff0; /* reserved bits => 1 */
3485 if ( v == curr )
3486 write_debugreg(6, value);
3487 break;
3488 case 7:
3489 /*
3490 * DR7: Bit 10 reserved (set to 1).
3491 * Bits 11-12,14-15 reserved (set to 0).
3492 */
3493 value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
3494 value |= DR_CONTROL_RESERVED_ONE; /* reserved bits => 1 */
3495 /*
3496 * Privileged bits:
3497 * GD (bit 13): must be 0.
3498 */
3499 if ( value & DR_GENERAL_DETECT )
3500 return -EPERM;
3501 /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
3502 if ( value & DR7_ACTIVE_MASK )
3504 unsigned int io_enable = 0;
3506 for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
3508 if ( ((value >> i) & 3) == DR_IO )
3510 if ( !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
3511 return -EPERM;
3512 io_enable |= value & (3 << ((i - 16) >> 1));
3514 #ifdef __i386__
3515 if ( ((boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
3516 !boot_cpu_has(X86_FEATURE_LM)) &&
3517 (((value >> i) & 0xc) == DR_LEN_8) )
3518 return -EPERM;
3519 #endif
3522 /* Guest DR5 is a handy stash for I/O intercept information. */
3523 v->arch.guest_context.debugreg[5] = io_enable;
3524 value &= ~io_enable;
3526 /*
3527 * If DR7 was previously clear then we need to load all other
3528 * debug registers at this point as they were not restored during
3529 * context switch.
3530 */
3531 if ( (v == curr) &&
3532 !(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
3534 write_debugreg(0, v->arch.guest_context.debugreg[0]);
3535 write_debugreg(1, v->arch.guest_context.debugreg[1]);
3536 write_debugreg(2, v->arch.guest_context.debugreg[2]);
3537 write_debugreg(3, v->arch.guest_context.debugreg[3]);
3538 write_debugreg(6, v->arch.guest_context.debugreg[6]);
3541 if ( v == curr )
3542 write_debugreg(7, value);
3543 break;
3544 default:
3545 return -EINVAL;
3548 v->arch.guest_context.debugreg[reg] = value;
3549 return 0;
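/*
 * The DR7 case above is where Xen's I/O breakpoint handling is wired up:
 * when CR4.DE permits DR_IO breakpoint types, their enable bits are kept
 * out of the value loaded into the hardware DR7 and stashed in debugreg[5]
 * (the "handy stash" mentioned in the comment).  do_get_debugreg() below
 * folds that stash back into DR7 reads, and it is presumably what the
 * check_guest_io_breakpoint() calls in the port I/O emulation earlier in
 * this file consult.
 */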
3552 long do_set_debugreg(int reg, unsigned long value)
3554 return set_debugreg(current, reg, value);
3557 unsigned long do_get_debugreg(int reg)
3559 struct vcpu *curr = current;
3561 switch ( reg )
3563 case 0 ... 3:
3564 case 6:
3565 return curr->arch.guest_context.debugreg[reg];
3566 case 7:
3567 return (curr->arch.guest_context.debugreg[7] |
3568 curr->arch.guest_context.debugreg[5]);
3569 case 4 ... 5:
3570 return ((curr->arch.guest_context.ctrlreg[4] & X86_CR4_DE) ?
3571 curr->arch.guest_context.debugreg[reg + 2] : 0);
3574 return -EINVAL;
3577 /*
3578 * Local variables:
3579 * mode: C
3580 * c-set-style: "BSD"
3581 * c-basic-offset: 4
3582 * tab-width: 4
3583 * indent-tabs-mode: nil
3584 * End:
3585 */