debuggers.hg: view xen/arch/x86/traps.c @ 21023:e4851c5b7d00

x86: Fix build error after c/s 20969:8cb6e7eff2ba

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author:   Keir Fraser <keir.fraser@citrix.com>
date:     Wed Feb 24 20:26:08 2010 +0000 (2010-02-24)
parents:  8cb6e7eff2ba
children: 6233eb0f29ba
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <xen/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/bitops.h>
55 #include <asm/desc.h>
56 #include <asm/debugreg.h>
57 #include <asm/smp.h>
58 #include <asm/flushtlb.h>
59 #include <asm/uaccess.h>
60 #include <asm/i387.h>
61 #include <asm/debugger.h>
62 #include <asm/msr.h>
63 #include <asm/shared.h>
64 #include <asm/x86_emulate.h>
65 #include <asm/traps.h>
66 #include <asm/hvm/vpt.h>
67 #include <asm/hypercall.h>
68 #include <public/arch-x86/cpuid.h>
70 /*
71 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
72 * fatal: Xen prints diagnostic message and then hangs.
73 * dom0: The NMI is virtualised to DOM0.
74 * ignore: The NMI error is cleared and ignored.
75 */
76 #ifdef NDEBUG
77 static char __read_mostly opt_nmi[10] = "dom0";
78 #else
79 static char __read_mostly opt_nmi[10] = "fatal";
80 #endif
81 string_param("nmi", opt_nmi);
83 DEFINE_PER_CPU_READ_MOSTLY(u32, ler_msr);
85 /* Master table, used by CPU0. */
86 idt_entry_t idt_table[IDT_ENTRIES];
88 /* Pointer to the IDT of every CPU. */
89 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
91 #define DECLARE_TRAP_HANDLER(_name) \
92 asmlinkage void _name(void); \
93 asmlinkage void do_ ## _name(struct cpu_user_regs *regs)
95 DECLARE_TRAP_HANDLER(divide_error);
96 DECLARE_TRAP_HANDLER(debug);
97 DECLARE_TRAP_HANDLER(nmi);
98 DECLARE_TRAP_HANDLER(int3);
99 DECLARE_TRAP_HANDLER(overflow);
100 DECLARE_TRAP_HANDLER(bounds);
101 DECLARE_TRAP_HANDLER(invalid_op);
102 DECLARE_TRAP_HANDLER(device_not_available);
103 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
104 DECLARE_TRAP_HANDLER(invalid_TSS);
105 DECLARE_TRAP_HANDLER(segment_not_present);
106 DECLARE_TRAP_HANDLER(stack_segment);
107 DECLARE_TRAP_HANDLER(general_protection);
108 DECLARE_TRAP_HANDLER(page_fault);
109 DECLARE_TRAP_HANDLER(coprocessor_error);
110 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
111 DECLARE_TRAP_HANDLER(machine_check);
112 DECLARE_TRAP_HANDLER(alignment_check);
113 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
115 void (*ioemul_handle_quirk)(
116 u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
118 static int debug_stack_lines = 20;
119 integer_param("debug_stack_lines", debug_stack_lines);
121 static int opt_ler;
122 boolean_param("ler", opt_ler);
124 #ifdef CONFIG_X86_32
125 #define stack_words_per_line 8
126 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
127 #else
128 #define stack_words_per_line 4
129 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
130 #endif
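/*
 * Note on the asymmetry above: on x86-64 the CPU always pushes SS:RSP for
 * an exception, so regs->rsp holds the interrupted stack pointer.  On
 * x86-32 a same-privilege fault pushes no ESP/SS, so the pre-exception
 * stack pointer is simply the address just past the saved frame, i.e.
 * &regs->esp.
 */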
132 static void show_guest_stack(struct vcpu *v, struct cpu_user_regs *regs)
133 {
134 int i;
135 unsigned long *stack, addr;
136 unsigned long mask = STACK_SIZE;
138 if ( is_hvm_vcpu(v) )
139 return;
141 if ( is_pv_32on64_vcpu(v) )
142 {
143 compat_show_guest_stack(v, regs, debug_stack_lines);
144 return;
145 }
147 if ( vm86_mode(regs) )
148 {
149 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
150 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
151 regs->ss, (uint16_t)(regs->esp & 0xffff));
152 }
153 else
154 {
155 stack = (unsigned long *)regs->esp;
156 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
157 }
159 if ( !access_ok(stack, sizeof(*stack)) )
160 {
161 printk("Guest-inaccessible memory.\n");
162 return;
163 }
165 if ( v != current )
166 {
167 struct vcpu *vcpu;
169 ASSERT(guest_kernel_mode(v, regs));
170 #ifndef __x86_64__
171 addr = read_cr3();
172 for_each_vcpu( v->domain, vcpu )
173 if ( vcpu->arch.cr3 == addr )
174 break;
175 #else
176 vcpu = maddr_get_owner(read_cr3()) == v->domain ? v : NULL;
177 #endif
178 if ( !vcpu )
179 {
180 stack = do_page_walk(v, (unsigned long)stack);
181 if ( (unsigned long)stack < PAGE_SIZE )
182 {
183 printk("Inaccessible guest memory.\n");
184 return;
185 }
186 mask = PAGE_SIZE;
187 }
188 }
190 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
191 {
192 if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & mask )
193 break;
194 if ( __get_user(addr, stack) )
195 {
196 if ( i != 0 )
197 printk("\n ");
198 printk("Fault while accessing guest memory.");
199 i = 1;
200 break;
201 }
202 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
203 printk("\n ");
204 printk(" %p", _p(addr));
205 stack++;
206 }
207 if ( i == 0 )
208 printk("Stack empty.");
209 printk("\n");
210 }
212 #if !defined(CONFIG_FRAME_POINTER)
214 static void show_trace(struct cpu_user_regs *regs)
215 {
216 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
218 printk("Xen call trace:\n ");
220 printk("[<%p>]", _p(regs->eip));
221 print_symbol(" %s\n ", regs->eip);
223 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
224 {
225 addr = *stack++;
226 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
227 {
228 printk("[<%p>]", _p(addr));
229 print_symbol(" %s\n ", addr);
230 }
231 }
233 printk("\n");
234 }
236 #else
238 static void show_trace(struct cpu_user_regs *regs)
239 {
240 unsigned long *frame, next, addr, low, high;
242 printk("Xen call trace:\n ");
244 printk("[<%p>]", _p(regs->eip));
245 print_symbol(" %s\n ", regs->eip);
247 /* Bounds for range of valid frame pointer. */
248 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
249 high = (low & ~(STACK_SIZE - 1)) +
250 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
252 /* The initial frame pointer. */
253 next = regs->ebp;
255 for ( ; ; )
256 {
257 /* Valid frame pointer? */
258 if ( (next < low) || (next >= high) )
259 {
260 /*
261 * Exception stack frames have a different layout, denoted by an
262 * inverted frame pointer.
263 */
264 next = ~next;
265 if ( (next < low) || (next >= high) )
266 break;
267 frame = (unsigned long *)next;
268 next = frame[0];
269 addr = frame[(offsetof(struct cpu_user_regs, eip) -
270 offsetof(struct cpu_user_regs, ebp))
271 / BYTES_PER_LONG];
272 }
273 else
274 {
275 /* Ordinary stack frame. */
276 frame = (unsigned long *)next;
277 next = frame[0];
278 addr = frame[1];
279 }
281 printk("[<%p>]", _p(addr));
282 print_symbol(" %s\n ", addr);
284 low = (unsigned long)&frame[2];
285 }
287 printk("\n");
288 }
290 #endif
292 void show_stack(struct cpu_user_regs *regs)
293 {
294 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
295 int i;
297 if ( guest_mode(regs) )
298 return show_guest_stack(current, regs);
300 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
302 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
303 {
304 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
305 break;
306 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
307 printk("\n ");
308 addr = *stack++;
309 printk(" %p", _p(addr));
310 }
311 if ( i == 0 )
312 printk("Stack empty.");
313 printk("\n");
315 show_trace(regs);
316 }
318 void show_stack_overflow(unsigned int cpu, unsigned long esp)
319 {
320 #ifdef MEMORY_GUARD
321 unsigned long esp_top, esp_bottom;
322 unsigned long *stack, addr;
324 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
325 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
327 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
328 (void *)esp_top, (void *)esp_bottom, (void *)esp,
329 (void *)per_cpu(init_tss, cpu).esp0);
331 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
332 if ( ((unsigned long)(esp - esp_top) > 512) &&
333 ((unsigned long)(esp_top - esp) > 512) )
334 {
335 printk("No stack overflow detected. Skipping stack trace.\n");
336 return;
337 }
339 if ( esp < esp_top )
340 esp = esp_top;
342 printk("Xen stack overflow (dumping trace %p-%p):\n ",
343 (void *)esp, (void *)esp_bottom);
345 stack = (unsigned long *)esp;
346 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
347 {
348 addr = *stack++;
349 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
350 {
351 printk("%p: [<%p>]", stack, _p(addr));
352 print_symbol(" %s\n ", addr);
353 }
354 }
356 printk("\n");
357 #endif
358 }
360 void show_execution_state(struct cpu_user_regs *regs)
361 {
362 show_registers(regs);
363 show_stack(regs);
364 }
366 void vcpu_show_execution_state(struct vcpu *v)
367 {
368 printk("*** Dumping Dom%d vcpu#%d state: ***\n",
369 v->domain->domain_id, v->vcpu_id);
371 if ( v == current )
372 {
373 show_execution_state(guest_cpu_user_regs());
374 return;
375 }
377 vcpu_pause(v); /* acceptably dangerous */
379 vcpu_show_registers(v);
380 if ( guest_kernel_mode(v, &v->arch.guest_context.user_regs) )
381 show_guest_stack(v, &v->arch.guest_context.user_regs);
383 vcpu_unpause(v);
384 }
386 static char *trapstr(int trapnr)
387 {
388 static char *strings[] = {
389 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
390 "invalid opcode", "device not available", "double fault",
391 "coprocessor segment", "invalid tss", "segment not found",
392 "stack error", "general protection fault", "page fault",
393 "spurious interrupt", "coprocessor error", "alignment check",
394 "machine check", "simd error"
395 };
397 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
398 return "???";
400 return strings[trapnr];
401 }
403 /*
404 * This is called for faults at very unexpected times (e.g., when interrupts
405 * are disabled). In such situations we can't do much that is safe. We try to
406 * print out some tracing and then we just spin.
407 */
408 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
409 {
410 static DEFINE_PER_CPU(char, depth);
412 /*
413 * In some cases, we can end up in a vicious cycle of fatal_trap()s
414 * within fatal_trap()s. We give the problem a couple of iterations to
415 * bottom out, and then we just panic.
416 */
417 if ( ++this_cpu(depth) < 3 )
418 {
419 watchdog_disable();
420 console_start_sync();
422 show_execution_state(regs);
424 if ( trapnr == TRAP_page_fault )
425 {
426 unsigned long cr2 = read_cr2();
427 printk("Faulting linear address: %p\n", _p(cr2));
428 show_page_walk(cr2);
429 }
430 }
432 panic("FATAL TRAP: vector = %d (%s)\n"
433 "[error_code=%04x] %s\n",
434 trapnr, trapstr(trapnr), regs->error_code,
435 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
436 }
438 static void do_guest_trap(
439 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
440 {
441 struct vcpu *v = current;
442 struct trap_bounce *tb;
443 const struct trap_info *ti;
445 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
447 tb = &v->arch.trap_bounce;
448 ti = &v->arch.guest_context.trap_ctxt[trapnr];
450 tb->flags = TBF_EXCEPTION;
451 tb->cs = ti->cs;
452 tb->eip = ti->address;
454 if ( use_error_code )
455 {
456 tb->flags |= TBF_EXCEPTION_ERRCODE;
457 tb->error_code = regs->error_code;
458 }
460 if ( TI_GET_IF(ti) )
461 tb->flags |= TBF_INTERRUPT;
463 if ( unlikely(null_trap_bounce(v, tb)) )
464 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] "
465 "on VCPU %d [ec=%04x]\n",
466 trapstr(trapnr), trapnr, v->vcpu_id, regs->error_code);
467 }
469 static void instruction_done(
470 struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
471 {
472 regs->eip = eip;
473 regs->eflags &= ~X86_EFLAGS_RF;
474 if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
475 {
476 current->arch.guest_context.debugreg[6] |= bpmatch | 0xffff0ff0;
477 if ( regs->eflags & X86_EFLAGS_TF )
478 current->arch.guest_context.debugreg[6] |= 0x4000;
479 do_guest_trap(TRAP_debug, regs, 0);
480 }
481 }
483 static unsigned int check_guest_io_breakpoint(struct vcpu *v,
484 unsigned int port, unsigned int len)
485 {
486 unsigned int width, i, match = 0;
487 unsigned long start;
489 if ( !(v->arch.guest_context.debugreg[5]) ||
490 !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
491 return 0;
493 for ( i = 0; i < 4; i++ )
494 {
495 if ( !(v->arch.guest_context.debugreg[5] &
496 (3 << (i * DR_ENABLE_SIZE))) )
497 continue;
499 start = v->arch.guest_context.debugreg[i];
500 width = 0;
502 switch ( (v->arch.guest_context.debugreg[7] >>
503 (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
504 {
505 case DR_LEN_1: width = 1; break;
506 case DR_LEN_2: width = 2; break;
507 case DR_LEN_4: width = 4; break;
508 case DR_LEN_8: width = 8; break;
509 }
511 if ( (start < (port + len)) && ((start + width) > port) )
512 match |= 1 << i;
513 }
515 return match;
516 }
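/*
 * Worked example for the overlap check above: with DR0 = 0x60 programmed as
 * a 2-byte I/O breakpoint (covering ports 0x60-0x61) and breakpoint 0
 * enabled, a one-byte access to port 0x61 satisfies
 * (0x60 < 0x61 + 1) && (0x60 + 2 > 0x61), so bit 0 is set in the returned
 * mask; instruction_done() later folds that mask into the guest's DR6.
 */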
518 /*
519 * Called from asm to set up the MCE trapbounce info.
520 * Returns 0 if no callback is set up, else 1.
521 */
522 asmlinkage int set_guest_machinecheck_trapbounce(void)
523 {
524 struct vcpu *v = current;
525 struct trap_bounce *tb = &v->arch.trap_bounce;
527 do_guest_trap(TRAP_machine_check, guest_cpu_user_regs(), 0);
528 tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */
529 return !null_trap_bounce(v, tb);
530 }
532 /*
533 * Called from asm to set up the NMI trapbounce info.
534 * Returns 0 if no callback is set up, else 1.
535 */
536 asmlinkage int set_guest_nmi_trapbounce(void)
537 {
538 struct vcpu *v = current;
539 struct trap_bounce *tb = &v->arch.trap_bounce;
540 do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0);
541 tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
542 return !null_trap_bounce(v, tb);
543 }
545 static inline void do_trap(
546 int trapnr, struct cpu_user_regs *regs, int use_error_code)
547 {
548 struct vcpu *curr = current;
549 unsigned long fixup;
551 DEBUGGER_trap_entry(trapnr, regs);
553 if ( guest_mode(regs) )
554 {
555 do_guest_trap(trapnr, regs, use_error_code);
556 return;
557 }
559 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
560 {
561 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
562 trapnr, _p(regs->eip), _p(fixup));
563 regs->eip = fixup;
564 return;
565 }
567 if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) &&
568 is_hvm_vcpu(curr) && curr->arch.hvm_vcpu.fpu_exception_callback )
569 {
570 curr->arch.hvm_vcpu.fpu_exception_callback(
571 curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs);
572 return;
573 }
575 DEBUGGER_trap_fatal(trapnr, regs);
577 show_execution_state(regs);
578 panic("FATAL TRAP: vector = %d (%s)\n"
579 "[error_code=%04x]\n",
580 trapnr, trapstr(trapnr), regs->error_code);
581 }
583 #define DO_ERROR_NOCODE(trapnr, name) \
584 asmlinkage void do_##name(struct cpu_user_regs *regs) \
585 { \
586 do_trap(trapnr, regs, 0); \
587 }
589 #define DO_ERROR(trapnr, name) \
590 asmlinkage void do_##name(struct cpu_user_regs *regs) \
591 { \
592 do_trap(trapnr, regs, 1); \
593 }
595 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
596 DO_ERROR_NOCODE(TRAP_overflow, overflow)
597 DO_ERROR_NOCODE(TRAP_bounds, bounds)
598 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
599 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
600 DO_ERROR( TRAP_no_segment, segment_not_present)
601 DO_ERROR( TRAP_stack_error, stack_segment)
602 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
603 DO_ERROR( TRAP_alignment_check, alignment_check)
604 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
606 int rdmsr_hypervisor_regs(uint32_t idx, uint64_t *val)
607 {
608 struct domain *d = current->domain;
609 /* Optionally shift out of the way of Viridian architectural MSRs. */
610 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
612 idx -= base;
613 if ( idx > 0 )
614 return 0;
616 switch ( idx )
617 {
618 case 0:
619 {
620 *val = 0;
621 break;
622 }
623 default:
624 BUG();
625 }
627 return 1;
628 }
630 int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val)
631 {
632 struct domain *d = current->domain;
633 /* Optionally shift out of the way of Viridian architectural MSRs. */
634 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
636 idx -= base;
637 if ( idx > 0 )
638 return 0;
640 switch ( idx )
641 {
642 case 0:
643 {
644 void *hypercall_page;
645 unsigned long mfn;
646 unsigned long gmfn = val >> 12;
647 unsigned int idx = val & 0xfff;
649 if ( idx > 0 )
650 {
651 gdprintk(XENLOG_WARNING,
652 "Out of range index %u to MSR %08x\n",
653 idx, 0x40000000);
654 return 0;
655 }
657 mfn = gmfn_to_mfn(d, gmfn);
659 if ( !mfn_valid(mfn) ||
660 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
661 {
662 gdprintk(XENLOG_WARNING,
663 "Bad GMFN %lx (MFN %lx) to MSR %08x\n",
664 gmfn, mfn, base + idx);
665 return 0;
666 }
668 hypercall_page = map_domain_page(mfn);
669 hypercall_page_initialise(d, hypercall_page);
670 unmap_domain_page(hypercall_page);
672 put_page_and_type(mfn_to_page(mfn));
673 break;
674 }
676 default:
677 BUG();
678 }
680 return 1;
681 }
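/*
 * Guest-side sketch (hypothetical code, for illustration only): a guest
 * installs hypercall page N by writing (gmfn << 12) | N to the base MSR --
 * 0x40000000, or 0x40000200 for Viridian-enabled domains:
 *
 *     uint64_t val = ((uint64_t)hypercall_page_gmfn << 12) | 0;
 *     asm volatile ( "wrmsr" : : "c" (0x40000000),
 *                    "a" ((uint32_t)val), "d" ((uint32_t)(val >> 32)) );
 *
 * wrmsr_hypervisor_regs() above then validates the GMFN and fills that page
 * with hypercall stubs via hypercall_page_initialise().
 */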
683 int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
684 uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
685 {
686 struct domain *d = current->domain;
687 /* Optionally shift out of the way of Viridian architectural leaves. */
688 uint32_t base = is_viridian_domain(d) ? 0x40000100 : 0x40000000;
690 idx -= base;
691 if ( idx > 3 )
692 return 0;
694 switch ( idx )
695 {
696 case 0:
697 *eax = base + 3; /* Largest leaf */
698 *ebx = XEN_CPUID_SIGNATURE_EBX;
699 *ecx = XEN_CPUID_SIGNATURE_ECX;
700 *edx = XEN_CPUID_SIGNATURE_EDX;
701 break;
703 case 1:
704 *eax = (xen_major_version() << 16) | xen_minor_version();
705 *ebx = 0; /* Reserved */
706 *ecx = 0; /* Reserved */
707 *edx = 0; /* Reserved */
708 break;
710 case 2:
711 *eax = 1; /* Number of hypercall-transfer pages */
712 *ebx = 0x40000000; /* MSR base address */
713 if ( is_viridian_domain(d) )
714 *ebx = 0x40000200;
715 *ecx = 0; /* Features 1 */
716 *edx = 0; /* Features 2 */
717 if ( !is_hvm_vcpu(current) )
718 *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
719 break;
721 case 3:
722 *eax = *ebx = *ecx = *edx = 0;
723 cpuid_time_leaf( sub_idx, eax, ebx, ecx, edx );
724 break;
726 default:
727 BUG();
728 }
730 return 1;
731 }
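/*
 * Guest-side sketch (illustration, assuming a non-Viridian domain so the
 * base leaf is 0x40000000): leaf 0 yields the "XenVMMXenVMM" signature and
 * the largest supported leaf; leaf 1 packs the Xen version as
 * (major << 16) | minor; leaf 2 reports the number of hypercall pages and
 * the MSR base consumed by wrmsr_hypervisor_regs() above.  For example,
 * after executing CPUID with EAX = 0x40000001:
 *
 *     unsigned int major = eax >> 16, minor = eax & 0xffff;
 */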
733 static void pv_cpuid(struct cpu_user_regs *regs)
734 {
735 uint32_t a, b, c, d;
737 a = regs->eax;
738 b = regs->ebx;
739 c = regs->ecx;
740 d = regs->edx;
742 if ( current->domain->domain_id != 0 )
743 {
744 if ( !cpuid_hypervisor_leaves(a, c, &a, &b, &c, &d) )
745 domain_cpuid(current->domain, a, c, &a, &b, &c, &d);
746 goto out;
747 }
749 asm (
750 "cpuid"
751 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
752 : "0" (a), "1" (b), "2" (c), "3" (d) );
754 if ( (regs->eax & 0x7fffffff) == 1 )
755 {
756 /* Modify Feature Information. */
757 __clear_bit(X86_FEATURE_VME, &d);
758 if ( !cpu_has_apic )
759 __clear_bit(X86_FEATURE_APIC, &d);
760 __clear_bit(X86_FEATURE_PSE, &d);
761 __clear_bit(X86_FEATURE_PGE, &d);
762 __clear_bit(X86_FEATURE_PSE36, &d);
763 }
764 switch ( (uint32_t)regs->eax )
765 {
766 case 1:
767 /* Modify Feature Information. */
768 if ( !cpu_has_sep )
769 __clear_bit(X86_FEATURE_SEP, &d);
770 #ifdef __i386__
771 if ( !supervisor_mode_kernel )
772 __clear_bit(X86_FEATURE_SEP, &d);
773 #endif
774 __clear_bit(X86_FEATURE_DS, &d);
775 __clear_bit(X86_FEATURE_ACC, &d);
776 __clear_bit(X86_FEATURE_PBE, &d);
778 __clear_bit(X86_FEATURE_DTES64 % 32, &c);
779 __clear_bit(X86_FEATURE_MWAIT % 32, &c);
780 __clear_bit(X86_FEATURE_DSCPL % 32, &c);
781 __clear_bit(X86_FEATURE_VMXE % 32, &c);
782 __clear_bit(X86_FEATURE_SMXE % 32, &c);
783 __clear_bit(X86_FEATURE_TM2 % 32, &c);
784 if ( is_pv_32bit_vcpu(current) )
785 __clear_bit(X86_FEATURE_CX16 % 32, &c);
786 __clear_bit(X86_FEATURE_XTPR % 32, &c);
787 __clear_bit(X86_FEATURE_PDCM % 32, &c);
788 __clear_bit(X86_FEATURE_DCA % 32, &c);
789 __clear_bit(X86_FEATURE_XSAVE % 32, &c);
790 if ( !cpu_has_apic )
791 __clear_bit(X86_FEATURE_X2APIC % 32, &c);
792 __set_bit(X86_FEATURE_HYPERVISOR % 32, &c);
793 break;
794 case 0x80000001:
795 /* Modify Feature Information. */
796 if ( is_pv_32bit_vcpu(current) )
797 {
798 __clear_bit(X86_FEATURE_LM % 32, &d);
799 __clear_bit(X86_FEATURE_LAHF_LM % 32, &c);
800 }
801 #ifndef __i386__
802 if ( is_pv_32on64_vcpu(current) &&
803 boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
804 #endif
805 __clear_bit(X86_FEATURE_SYSCALL % 32, &d);
806 __clear_bit(X86_FEATURE_PAGE1GB % 32, &d);
807 __clear_bit(X86_FEATURE_RDTSCP % 32, &d);
809 __clear_bit(X86_FEATURE_SVME % 32, &c);
810 if ( !cpu_has_apic )
811 __clear_bit(X86_FEATURE_EXTAPICSPACE % 32, &c);
812 __clear_bit(X86_FEATURE_OSVW % 32, &c);
813 __clear_bit(X86_FEATURE_IBS % 32, &c);
814 __clear_bit(X86_FEATURE_SKINIT % 32, &c);
815 __clear_bit(X86_FEATURE_WDT % 32, &c);
816 break;
817 case 5: /* MONITOR/MWAIT */
818 case 0xa: /* Architectural Performance Monitor Features */
819 case 0x8000000a: /* SVM revision and features */
820 case 0x8000001b: /* Instruction Based Sampling */
821 a = b = c = d = 0;
822 break;
823 default:
824 (void)cpuid_hypervisor_leaves(regs->eax, 0, &a, &b, &c, &d);
825 break;
826 }
828 out:
829 regs->eax = a;
830 regs->ebx = b;
831 regs->ecx = c;
832 regs->edx = d;
833 }
835 static int emulate_invalid_rdtscp(struct cpu_user_regs *regs)
836 {
837 char opcode[3];
838 unsigned long eip, rc;
839 struct vcpu *v = current;
841 eip = regs->eip;
842 if ( (rc = copy_from_user(opcode, (char *)eip, sizeof(opcode))) != 0 )
843 {
844 propagate_page_fault(eip + sizeof(opcode) - rc, 0);
845 return EXCRET_fault_fixed;
846 }
847 if ( memcmp(opcode, "\xf\x1\xf9", sizeof(opcode)) )
848 return 0;
849 eip += sizeof(opcode);
850 pv_soft_rdtsc(v, regs, 1);
851 instruction_done(regs, eip, 0);
852 return EXCRET_fault_fixed;
853 }
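/*
 * Note: the "\xf\x1\xf9" byte pattern matched above is the RDTSCP encoding
 * (0F 01 F9).  A guest RDTSCP that raised #UD is completed here by
 * pv_soft_rdtsc(), with the guest's eip advanced past the instruction.
 */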
855 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
856 {
857 char sig[5], instr[2];
858 unsigned long eip, rc;
860 eip = regs->eip;
862 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
863 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
864 {
865 propagate_page_fault(eip + sizeof(sig) - rc, 0);
866 return EXCRET_fault_fixed;
867 }
868 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
869 return 0;
870 eip += sizeof(sig);
872 /* We only emulate CPUID. */
873 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
874 {
875 propagate_page_fault(eip + sizeof(instr) - rc, 0);
876 return EXCRET_fault_fixed;
877 }
878 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
879 return 0;
880 eip += sizeof(instr);
882 pv_cpuid(regs);
884 instruction_done(regs, eip, 0);
886 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
888 return EXCRET_fault_fixed;
889 }
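/*
 * Guest-side view (sketch): a PV guest requests forced CPUID emulation by
 * prefixing CPUID with the signature checked above:
 *
 *     ud2a            (bytes 0x0f, 0x0b)
 *     .ascii "xen"
 *     cpuid
 *
 * The ud2a raises #UD, do_invalid_op() below calls in here, and pv_cpuid()
 * supplies the (filtered) leaf data before instruction_done() advances eip
 * past the whole sequence.
 */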
891 asmlinkage void do_invalid_op(struct cpu_user_regs *regs)
892 {
893 struct bug_frame bug;
894 struct bug_frame_str bug_str;
895 const char *p, *filename, *predicate, *eip = (char *)regs->eip;
896 unsigned long fixup;
897 int id, lineno;
899 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
901 if ( likely(guest_mode(regs)) )
902 {
903 if ( !emulate_invalid_rdtscp(regs) &&
904 !emulate_forced_invalid_op(regs) )
905 do_guest_trap(TRAP_invalid_op, regs, 0);
906 return;
907 }
909 if ( !is_kernel(eip) ||
910 __copy_from_user(&bug, eip, sizeof(bug)) ||
911 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
912 (bug.ret != 0xc2) )
913 goto die;
914 eip += sizeof(bug);
916 /* Decode first pointer argument. */
917 if ( !is_kernel(eip) ||
918 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
919 (bug_str.mov != 0xbc) )
920 goto die;
921 p = bug_str(bug_str, eip);
922 if ( !is_kernel(p) )
923 goto die;
924 eip += sizeof(bug_str);
926 id = bug.id & 3;
928 if ( id == BUGFRAME_run_fn )
929 {
930 void (*fn)(struct cpu_user_regs *) = (void *)p;
931 (*fn)(regs);
932 regs->eip = (unsigned long)eip;
933 return;
934 }
936 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
937 filename = p;
938 lineno = bug.id >> 2;
940 if ( id == BUGFRAME_warn )
941 {
942 printk("Xen WARN at %.50s:%d\n", filename, lineno);
943 show_execution_state(regs);
944 regs->eip = (unsigned long)eip;
945 return;
946 }
948 if ( id == BUGFRAME_bug )
949 {
950 printk("Xen BUG at %.50s:%d\n", filename, lineno);
951 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
952 show_execution_state(regs);
953 panic("Xen BUG at %.50s:%d\n", filename, lineno);
954 }
956 /* ASSERT: decode the predicate string pointer. */
957 ASSERT(id == BUGFRAME_assert);
958 if ( !is_kernel(eip) ||
959 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
960 (bug_str.mov != 0xbc) )
961 goto die;
962 predicate = bug_str(bug_str, eip);
963 eip += sizeof(bug_str);
965 if ( !is_kernel(predicate) )
966 predicate = "<unknown>";
967 printk("Assertion '%s' failed at %.50s:%d\n",
968 predicate, filename, lineno);
969 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
970 show_execution_state(regs);
971 panic("Assertion '%s' failed at %.50s:%d\n",
972 predicate, filename, lineno);
974 die:
975 if ( (fixup = search_exception_table(regs->eip)) != 0 )
976 {
977 regs->eip = fixup;
978 return;
979 }
980 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
981 show_execution_state(regs);
982 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
983 }
985 asmlinkage void do_int3(struct cpu_user_regs *regs)
986 {
987 DEBUGGER_trap_entry(TRAP_int3, regs);
989 if ( !guest_mode(regs) )
990 {
991 debugger_trap_fatal(TRAP_int3, regs);
992 return;
993 }
995 do_guest_trap(TRAP_int3, regs, 0);
996 }
998 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
999 {
1000 machine_check_vector(regs, regs->error_code);
1001 }
1003 static void reserved_bit_page_fault(
1004 unsigned long addr, struct cpu_user_regs *regs)
1005 {
1006 printk("d%d:v%d: reserved bit in page table (ec=%04X)\n",
1007 current->domain->domain_id, current->vcpu_id, regs->error_code);
1008 show_page_walk(addr);
1009 show_execution_state(regs);
1010 }
1012 void propagate_page_fault(unsigned long addr, u16 error_code)
1013 {
1014 struct trap_info *ti;
1015 struct vcpu *v = current;
1016 struct trap_bounce *tb = &v->arch.trap_bounce;
1018 v->arch.guest_context.ctrlreg[2] = addr;
1019 arch_set_cr2(v, addr);
1021 /* Re-set error_code.user flag appropriately for the guest. */
1022 error_code &= ~PFEC_user_mode;
1023 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
1024 error_code |= PFEC_user_mode;
1026 trace_pv_page_fault(addr, error_code);
1028 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
1029 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
1030 tb->error_code = error_code;
1031 tb->cs = ti->cs;
1032 tb->eip = ti->address;
1033 if ( TI_GET_IF(ti) )
1034 tb->flags |= TBF_INTERRUPT;
1035 if ( unlikely(null_trap_bounce(v, tb)) )
1036 {
1037 printk("d%d:v%d: unhandled page fault (ec=%04X)\n",
1038 v->domain->domain_id, v->vcpu_id, error_code);
1039 show_page_walk(addr);
1040 }
1042 if ( unlikely(error_code & PFEC_reserved_bit) )
1043 reserved_bit_page_fault(addr, guest_cpu_user_regs());
1044 }
1046 static int handle_gdt_ldt_mapping_fault(
1047 unsigned long offset, struct cpu_user_regs *regs)
1048 {
1049 struct vcpu *curr = current;
1050 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
1051 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
1052 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
1054 /* Should never fault in another vcpu's area. */
1055 BUG_ON(vcpu_area != curr->vcpu_id);
1057 /* Byte offset within the gdt/ldt sub-area. */
1058 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
1060 if ( likely(is_ldt_area) )
1061 {
1062 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
1063 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
1064 {
1065 if ( guest_mode(regs) )
1066 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
1067 regs->eip, offset);
1068 }
1069 else
1070 {
1071 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
1072 if ( !guest_mode(regs) )
1073 return 0;
1074 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
1075 propagate_page_fault(
1076 curr->arch.guest_context.ldt_base + offset,
1077 regs->error_code);
1078 }
1079 }
1080 else
1081 {
1082 /* GDT fault: handle the fault as #GP(selector). */
1083 regs->error_code = (u16)offset & ~7;
1084 (void)do_general_protection(regs);
1085 }
1087 return EXCRET_fault_fixed;
1088 }
1090 #ifdef HYPERVISOR_VIRT_END
1091 #define IN_HYPERVISOR_RANGE(va) \
1092 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
1093 #else
1094 #define IN_HYPERVISOR_RANGE(va) \
1095 (((va) >= HYPERVISOR_VIRT_START))
1096 #endif
1098 static int __spurious_page_fault(
1099 unsigned long addr, unsigned int error_code)
1101 unsigned long mfn, cr3 = read_cr3();
1102 #if CONFIG_PAGING_LEVELS >= 4
1103 l4_pgentry_t l4e, *l4t;
1104 #endif
1105 #if CONFIG_PAGING_LEVELS >= 3
1106 l3_pgentry_t l3e, *l3t;
1107 #endif
1108 l2_pgentry_t l2e, *l2t;
1109 l1_pgentry_t l1e, *l1t;
1110 unsigned int required_flags, disallowed_flags;
1112 /*
1113 * We do not take spurious page faults in IRQ handlers as we do not
1114 * modify page tables in IRQ context. We therefore bail here because
1115 * map_domain_page() is not IRQ-safe.
1116 */
1117 if ( in_irq() )
1118 return 0;
1120 /* Reserved bit violations are never spurious faults. */
1121 if ( error_code & PFEC_reserved_bit )
1122 return 0;
1124 required_flags = _PAGE_PRESENT;
1125 if ( error_code & PFEC_write_access )
1126 required_flags |= _PAGE_RW;
1127 if ( error_code & PFEC_user_mode )
1128 required_flags |= _PAGE_USER;
1130 disallowed_flags = 0;
1131 if ( error_code & PFEC_insn_fetch )
1132 disallowed_flags |= _PAGE_NX;
1134 mfn = cr3 >> PAGE_SHIFT;
1136 #if CONFIG_PAGING_LEVELS >= 4
1137 l4t = map_domain_page(mfn);
1138 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
1139 mfn = l4e_get_pfn(l4e);
1140 unmap_domain_page(l4t);
1141 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
1142 (l4e_get_flags(l4e) & disallowed_flags) )
1143 return 0;
1144 #endif
1146 #if CONFIG_PAGING_LEVELS >= 3
1147 l3t = map_domain_page(mfn);
1148 #if CONFIG_PAGING_LEVELS == 3
1149 l3t += (cr3 & 0xFE0UL) >> 3;
1150 #endif
1151 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
1152 mfn = l3e_get_pfn(l3e);
1153 unmap_domain_page(l3t);
1154 #if CONFIG_PAGING_LEVELS == 3
1155 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1156 return 0;
1157 #else
1158 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
1159 (l3e_get_flags(l3e) & disallowed_flags) )
1160 return 0;
1161 #endif
1162 #endif
1164 l2t = map_domain_page(mfn);
1165 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
1166 mfn = l2e_get_pfn(l2e);
1167 unmap_domain_page(l2t);
1168 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
1169 (l2e_get_flags(l2e) & disallowed_flags) )
1170 return 0;
1171 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1173 l1e = l1e_empty(); /* define before use in debug tracing */
1174 goto spurious;
1177 l1t = map_domain_page(mfn);
1178 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
1179 mfn = l1e_get_pfn(l1e);
1180 unmap_domain_page(l1t);
1181 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
1182 (l1e_get_flags(l1e) & disallowed_flags) )
1183 return 0;
1185 spurious:
1186 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
1187 "at addr %lx, e/c %04x\n",
1188 current->domain->domain_id, current->vcpu_id,
1189 addr, error_code);
1190 #if CONFIG_PAGING_LEVELS >= 4
1191 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
1192 #endif
1193 #if CONFIG_PAGING_LEVELS >= 3
1194 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
1195 #endif
1196 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
1197 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
1198 return 1;
1201 static int spurious_page_fault(
1202 unsigned long addr, unsigned int error_code)
1204 unsigned long flags;
1205 int is_spurious;
1207 /*
1208 * Disabling interrupts prevents TLB flushing, and hence prevents
1209 * page tables from becoming invalid under our feet during the walk.
1210 */
1211 local_irq_save(flags);
1212 is_spurious = __spurious_page_fault(addr, error_code);
1213 local_irq_restore(flags);
1215 return is_spurious;
1218 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1220 struct vcpu *v = current;
1221 struct domain *d = v->domain;
1223 /* No fixups in interrupt context or when interrupts are disabled. */
1224 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1225 return 0;
1227 /* Faults from external-mode guests are handled by shadow/hap */
1228 if ( paging_mode_external(d) && guest_mode(regs) )
1230 int ret = paging_fault(addr, regs);
1231 if ( ret == EXCRET_fault_fixed )
1232 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1233 return ret;
1236 if ( !(regs->error_code & PFEC_page_present) &&
1237 (pagefault_by_memadd(addr, regs)) )
1238 return handle_memadd_fault(addr, regs);
1240 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1242 if ( !(regs->error_code & PFEC_reserved_bit) &&
1243 (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1244 return handle_gdt_ldt_mapping_fault(
1245 addr - GDT_LDT_VIRT_START, regs);
1246 return 0;
1249 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
1250 guest_kernel_mode(v, regs) &&
1251 /* Do not check if access-protection fault since the page may
1252 legitimately be not present in shadow page tables */
1253 ((regs->error_code & (PFEC_write_access|PFEC_reserved_bit)) ==
1254 PFEC_write_access) &&
1255 ptwr_do_page_fault(v, addr, regs) )
1256 return EXCRET_fault_fixed;
1258 /* For non-external shadowed guests, we fix up both their own
1259 * pagefaults and Xen's, since they share the pagetables. */
1260 if ( paging_mode_enabled(d) && !paging_mode_external(d) )
1262 int ret = paging_fault(addr, regs);
1263 if ( ret == EXCRET_fault_fixed )
1264 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1265 return ret;
1268 return 0;
1271 /*
1272 * #PF error code:
1273 * Bit 0: Protection violation (=1) ; Page not present (=0)
1274 * Bit 1: Write access
1275 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1276 * Bit 3: Reserved bit violation
1277 * Bit 4: Instruction fetch
1278 */
1279 asmlinkage void do_page_fault(struct cpu_user_regs *regs)
1281 unsigned long addr, fixup;
1282 unsigned int error_code;
1284 addr = read_cr2();
1286 /* fixup_page_fault() might change regs->error_code, so cache it here. */
1287 error_code = regs->error_code;
1289 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1291 perfc_incr(page_faults);
1293 if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1294 return;
1296 if ( unlikely(!guest_mode(regs)) )
1298 if ( spurious_page_fault(addr, error_code) )
1299 return;
1301 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1303 perfc_incr(copy_user_faults);
1304 if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1305 reserved_bit_page_fault(addr, regs);
1306 regs->eip = fixup;
1307 return;
1310 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1312 show_execution_state(regs);
1313 show_page_walk(addr);
1314 panic("FATAL PAGE FAULT\n"
1315 "[error_code=%04x]\n"
1316 "Faulting linear address: %p\n",
1317 error_code, _p(addr));
1320 if ( unlikely(current->domain->arch.suppress_spurious_page_faults
1321 && spurious_page_fault(addr, error_code)) )
1322 return;
1324 propagate_page_fault(addr, regs->error_code);
1327 /*
1328 * Early #PF handler to print CR2, error code, and stack.
1330 * We also deal with spurious faults here, even though they should never happen
1331 * during early boot (an issue was seen once, but was most likely a hardware
1332 * problem).
1333 */
1334 asmlinkage void do_early_page_fault(struct cpu_user_regs *regs)
1336 static int stuck;
1337 static unsigned long prev_eip, prev_cr2;
1338 unsigned long cr2 = read_cr2();
1340 BUG_ON(smp_processor_id() != 0);
1342 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1344 prev_eip = regs->eip;
1345 prev_cr2 = cr2;
1346 stuck = 0;
1347 return;
1350 if ( stuck++ == 1000 )
1352 unsigned long *stk = (unsigned long *)regs;
1353 printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1354 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1355 printk("Stack dump: ");
1356 while ( ((long)stk & ((PAGE_SIZE - 1) & ~(BYTES_PER_LONG - 1))) != 0 )
1357 printk("%p ", _p(*stk++));
1358 for ( ; ; ) ;
1362 long do_fpu_taskswitch(int set)
1364 struct vcpu *v = current;
1366 if ( set )
1368 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1369 stts();
1371 else
1373 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1374 if ( v->fpu_dirtied )
1375 clts();
1378 return 0;
1381 static int read_descriptor(unsigned int sel,
1382 const struct vcpu *v,
1383 const struct cpu_user_regs * regs,
1384 unsigned long *base,
1385 unsigned long *limit,
1386 unsigned int *ar,
1387 unsigned int vm86attr)
1389 struct desc_struct desc;
1391 if ( !vm86_mode(regs) )
1393 if ( sel < 4)
1394 desc.b = desc.a = 0;
1395 else if ( __get_user(desc,
1396 (const struct desc_struct *)(!(sel & 4)
1397 ? GDT_VIRT_START(v)
1398 : LDT_VIRT_START(v))
1399 + (sel >> 3)) )
1400 return 0;
1401 if ( !(vm86attr & _SEGMENT_CODE) )
1402 desc.b &= ~_SEGMENT_L;
1404 else
1406 desc.a = (sel << 20) | 0xffff;
1407 desc.b = vm86attr | (sel >> 12);
1410 *ar = desc.b & 0x00f0ff00;
1411 if ( !(desc.b & _SEGMENT_L) )
1413 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1414 (desc.b & 0xff000000));
1415 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1416 if ( desc.b & _SEGMENT_G )
1417 *limit = ((*limit + 1) << 12) - 1;
1418 #ifndef NDEBUG
1419 if ( !vm86_mode(regs) && (sel > 3) )
1421 unsigned int a, l;
1422 unsigned char valid;
1424 asm volatile (
1425 "larl %2,%0 ; setz %1"
1426 : "=r" (a), "=qm" (valid) : "rm" (sel));
1427 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1428 asm volatile (
1429 "lsll %2,%0 ; setz %1"
1430 : "=r" (l), "=qm" (valid) : "rm" (sel));
1431 BUG_ON(valid && (l != *limit));
1433 #endif
1435 else
1437 *base = 0UL;
1438 *limit = ~0UL;
1441 return 1;
1444 #ifdef __x86_64__
1445 static int read_gate_descriptor(unsigned int gate_sel,
1446 const struct vcpu *v,
1447 unsigned int *sel,
1448 unsigned long *off,
1449 unsigned int *ar)
1451 struct desc_struct desc;
1452 const struct desc_struct *pdesc;
1455 pdesc = (const struct desc_struct *)
1456 (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v))
1457 + (gate_sel >> 3);
1458 if ( (gate_sel < 4) ||
1459 ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
1460 __get_user(desc, pdesc) )
1461 return 0;
1463 *sel = (desc.a >> 16) & 0x0000fffc;
1464 *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
1465 *ar = desc.b & 0x0000ffff;
1467 /*
1468 * check_descriptor() clears the DPL field and stores the
1469 * guest requested DPL in the selector's RPL field.
1470 */
1471 if ( *ar & _SEGMENT_DPL )
1472 return 0;
1473 *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
1475 if ( !is_pv_32bit_vcpu(v) )
1477 if ( (*ar & 0x1f00) != 0x0c00 ||
1478 (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
1479 __get_user(desc, pdesc + 1) ||
1480 (desc.b & 0x1f00) )
1481 return 0;
1483 *off |= (unsigned long)desc.a << 32;
1484 return 1;
1487 switch ( *ar & 0x1f00 )
1489 case 0x0400:
1490 *off &= 0xffff;
1491 break;
1492 case 0x0c00:
1493 break;
1494 default:
1495 return 0;
1498 return 1;
1500 #endif
1502 /* Has the guest requested sufficient permission for this I/O access? */
1503 static int guest_io_okay(
1504 unsigned int port, unsigned int bytes,
1505 struct vcpu *v, struct cpu_user_regs *regs)
1507 #if defined(__x86_64__)
1508 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1509 int user_mode = !(v->arch.flags & TF_kernel_mode);
1510 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1511 #elif defined(__i386__)
1512 #define TOGGLE_MODE() ((void)0)
1513 #endif
1515 if ( !vm86_mode(regs) &&
1516 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1517 return 1;
1519 if ( v->arch.iobmp_limit > (port + bytes) )
1521 union { uint8_t bytes[2]; uint16_t mask; } x;
1523 /*
1524 * Grab permission bytes from guest space. Inaccessible bytes are
1525 * read as 0xff (no access allowed).
1526 */
1527 TOGGLE_MODE();
1528 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1529 port>>3, 2) )
1531 default: x.bytes[0] = ~0;
1532 case 1: x.bytes[1] = ~0;
1533 case 0: break;
1535 TOGGLE_MODE();
1537 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1538 return 1;
1541 return 0;
1544 /* Has the administrator granted sufficient permission for this I/O access? */
1545 static int admin_io_okay(
1546 unsigned int port, unsigned int bytes,
1547 struct vcpu *v, struct cpu_user_regs *regs)
1549 /*
1550 * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
1551 * We never permit direct access to that register.
1552 */
1553 if ( (port == 0xcf8) && (bytes == 4) )
1554 return 0;
1556 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1559 static uint32_t guest_io_read(
1560 unsigned int port, unsigned int bytes,
1561 struct vcpu *v, struct cpu_user_regs *regs)
1563 extern uint32_t pci_conf_read(
1564 uint32_t cf8, uint8_t offset, uint8_t bytes);
1566 uint32_t data = 0;
1567 unsigned int shift = 0;
1569 if ( admin_io_okay(port, bytes, v, regs) )
1571 switch ( bytes )
1573 case 1: return inb(port);
1574 case 2: return inw(port);
1575 case 4: return inl(port);
1579 while ( bytes != 0 )
1581 unsigned int size = 1;
1582 uint32_t sub_data = 0xff;
1584 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1586 sub_data = pv_pit_handler(port, 0, 0);
1588 else if ( (port == 0xcf8) && (bytes == 4) )
1590 size = 4;
1591 sub_data = v->domain->arch.pci_cf8;
1593 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1595 size = min(bytes, 4 - (port & 3));
1596 if ( size == 3 )
1597 size = 2;
1598 sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size);
1601 if ( size == 4 )
1602 return sub_data;
1604 data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
1605 shift += size * 8;
1606 port += size;
1607 bytes -= size;
1610 return data;
1613 extern void (*pv_rtc_handler)(unsigned int port, uint8_t value);
1615 static void guest_io_write(
1616 unsigned int port, unsigned int bytes, uint32_t data,
1617 struct vcpu *v, struct cpu_user_regs *regs)
1619 extern void pci_conf_write(
1620 uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data);
1622 if ( admin_io_okay(port, bytes, v, regs) )
1624 switch ( bytes ) {
1625 case 1:
1626 if ( ((port == 0x70) || (port == 0x71)) && pv_rtc_handler )
1627 pv_rtc_handler(port, (uint8_t)data);
1628 outb((uint8_t)data, port);
1629 if ( pv_post_outb_hook )
1630 pv_post_outb_hook(port, (uint8_t)data);
1631 break;
1632 case 2:
1633 outw((uint16_t)data, port);
1634 break;
1635 case 4:
1636 outl(data, port);
1637 break;
1639 return;
1642 while ( bytes != 0 )
1644 unsigned int size = 1;
1646 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1648 pv_pit_handler(port, (uint8_t)data, 1);
1650 else if ( (port == 0xcf8) && (bytes == 4) )
1652 size = 4;
1653 v->domain->arch.pci_cf8 = data;
1655 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1657 size = min(bytes, 4 - (port & 3));
1658 if ( size == 3 )
1659 size = 2;
1660 pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data);
1663 if ( size == 4 )
1664 return;
1666 port += size;
1667 bytes -= size;
1668 data >>= size * 8;
1672 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1673 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1674 __attribute__((__regparm__(1)));
1675 unsigned long guest_to_host_gpr_switch(unsigned long)
1676 __attribute__((__regparm__(1)));
1678 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1680 /* Instruction fetch with error handling. */
1681 #define insn_fetch(type, base, eip, limit) \
1682 ({ unsigned long _rc, _ptr = (base) + (eip); \
1683 type _x; \
1684 if ( ad_default < 8 ) \
1685 _ptr = (unsigned int)_ptr; \
1686 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1687 goto fail; \
1688 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1689 { \
1690 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1691 goto skip; \
1692 } \
1693 (eip) += sizeof(_x); _x; })
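/*
 * insn_fetch() is a statement expression: it yields the fetched value with
 * eip advanced on success, jumps to the enclosing emulator's "fail" label
 * when the fetch would run past the code segment limit, and, on a fault
 * while copying from guest memory, propagates a #PF to the guest and jumps
 * to "skip" so the caller can back out without completing the emulation.
 */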
1695 #if defined(CONFIG_X86_32)
1696 # define read_sreg(regs, sr) ((regs)->sr)
1697 #elif defined(CONFIG_X86_64)
1698 # define read_sreg(regs, sr) read_segment_register(sr)
1699 #endif
1701 static int is_cpufreq_controller(struct domain *d)
1703 return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
1704 (d->domain_id == 0));
1707 static int emulate_privileged_op(struct cpu_user_regs *regs)
1709 struct vcpu *v = current;
1710 unsigned long *reg, eip = regs->eip;
1711 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1712 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1713 int rc;
1714 unsigned int port, i, data_sel, ar, data, bpmatch = 0;
1715 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1716 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1717 ? regs->reg \
1718 : ad_bytes == 4 \
1719 ? (u32)regs->reg \
1720 : (u16)regs->reg)
1721 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1722 ? regs->reg = (val) \
1723 : ad_bytes == 4 \
1724 ? (*(u32 *)&regs->reg = (val)) \
1725 : (*(u16 *)&regs->reg = (val)))
1726 unsigned long code_base, code_limit;
1727 char io_emul_stub[32];
1728 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1729 uint32_t l, h;
1730 uint64_t val;
1732 if ( !read_descriptor(regs->cs, v, regs,
1733 &code_base, &code_limit, &ar,
1734 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1735 goto fail;
1736 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1737 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1738 if ( !(ar & _SEGMENT_S) ||
1739 !(ar & _SEGMENT_P) ||
1740 !(ar & _SEGMENT_CODE) )
1741 goto fail;
1743 /* emulating only opcodes not allowing SS to be default */
1744 data_sel = read_sreg(regs, ds);
1746 /* Legacy prefixes. */
1747 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1749 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1751 case 0x66: /* operand-size override */
1752 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1753 continue;
1754 case 0x67: /* address-size override */
1755 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1756 continue;
1757 case 0x2e: /* CS override */
1758 data_sel = regs->cs;
1759 continue;
1760 case 0x3e: /* DS override */
1761 data_sel = read_sreg(regs, ds);
1762 continue;
1763 case 0x26: /* ES override */
1764 data_sel = read_sreg(regs, es);
1765 continue;
1766 case 0x64: /* FS override */
1767 data_sel = read_sreg(regs, fs);
1768 lm_ovr = lm_seg_fs;
1769 continue;
1770 case 0x65: /* GS override */
1771 data_sel = read_sreg(regs, gs);
1772 lm_ovr = lm_seg_gs;
1773 continue;
1774 case 0x36: /* SS override */
1775 data_sel = regs->ss;
1776 continue;
1777 case 0xf0: /* LOCK */
1778 lock = 1;
1779 continue;
1780 case 0xf2: /* REPNE/REPNZ */
1781 case 0xf3: /* REP/REPE/REPZ */
1782 rep_prefix = 1;
1783 continue;
1784 default:
1785 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1787 rex = opcode;
1788 continue;
1790 break;
1792 break;
1795 /* REX prefix. */
1796 if ( rex & 8 ) /* REX.W */
1797 op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
1798 modrm_reg = (rex & 4) << 1; /* REX.R */
1799 /* REX.X does not need to be decoded. */
1800 modrm_rm = (rex & 1) << 3; /* REX.B */
1802 if ( opcode == 0x0f )
1803 goto twobyte_opcode;
1805 if ( lock )
1806 goto fail;
1808 /* Input/Output String instructions. */
1809 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1811 unsigned long data_base, data_limit;
1813 if ( rep_prefix && (rd_ad(ecx) == 0) )
1814 goto done;
1816 if ( !(opcode & 2) )
1818 data_sel = read_sreg(regs, es);
1819 lm_ovr = lm_seg_none;
1822 if ( !(ar & _SEGMENT_L) )
1824 if ( !read_descriptor(data_sel, v, regs,
1825 &data_base, &data_limit, &ar,
1826 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|
1827 _SEGMENT_P) )
1828 goto fail;
1829 if ( !(ar & _SEGMENT_S) ||
1830 !(ar & _SEGMENT_P) ||
1831 (opcode & 2 ?
1832 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1833 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1834 goto fail;
1836 #ifdef CONFIG_X86_64
1837 else
1839 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1841 switch ( lm_ovr )
1843 case lm_seg_none:
1844 data_base = 0UL;
1845 break;
1846 case lm_seg_fs:
1847 data_base = v->arch.guest_context.fs_base;
1848 break;
1849 case lm_seg_gs:
1850 if ( guest_kernel_mode(v, regs) )
1851 data_base = v->arch.guest_context.gs_base_kernel;
1852 else
1853 data_base = v->arch.guest_context.gs_base_user;
1854 break;
1857 else
1858 read_descriptor(data_sel, v, regs,
1859 &data_base, &data_limit, &ar,
1860 0);
1861 data_limit = ~0UL;
1862 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1864 #endif
1866 port = (u16)regs->edx;
1868 continue_io_string:
1869 switch ( opcode )
1871 case 0x6c: /* INSB */
1872 op_bytes = 1;
1873 case 0x6d: /* INSW/INSL */
1874 if ( (data_limit < (op_bytes - 1)) ||
1875 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
1876 !guest_io_okay(port, op_bytes, v, regs) )
1877 goto fail;
1878 data = guest_io_read(port, op_bytes, v, regs);
1879 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
1880 &data, op_bytes)) != 0 )
1882 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1883 PFEC_write_access);
1884 return EXCRET_fault_fixed;
1886 wr_ad(edi, regs->edi + (int)((regs->eflags & X86_EFLAGS_DF)
1887 ? -op_bytes : op_bytes));
1888 break;
1890 case 0x6e: /* OUTSB */
1891 op_bytes = 1;
1892 case 0x6f: /* OUTSW/OUTSL */
1893 if ( (data_limit < (op_bytes - 1)) ||
1894 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
1895 !guest_io_okay(port, op_bytes, v, regs) )
1896 goto fail;
1897 if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
1898 op_bytes)) != 0 )
1900 propagate_page_fault(data_base + rd_ad(esi)
1901 + op_bytes - rc, 0);
1902 return EXCRET_fault_fixed;
1904 guest_io_write(port, op_bytes, data, v, regs);
1905 wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF)
1906 ? -op_bytes : op_bytes));
1907 break;
1910 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1912 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1914 if ( !bpmatch && !hypercall_preempt_check() )
1915 goto continue_io_string;
1916 eip = regs->eip;
1919 goto done;
1922 /*
1923 * Very likely to be an I/O instruction (IN/OUT).
1924 * Build an on-stack stub to execute the instruction with full guest
1925 * GPR context. This is needed for some systems which (ab)use IN/OUT
1926 * to communicate with BIOS code in system-management mode.
1927 */
1928 #ifdef __x86_64__
1929 /* movq $host_to_guest_gpr_switch,%rcx */
1930 io_emul_stub[0] = 0x48;
1931 io_emul_stub[1] = 0xb9;
1932 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1933 /* callq *%rcx */
1934 io_emul_stub[10] = 0xff;
1935 io_emul_stub[11] = 0xd1;
1936 #else
1937 /* call host_to_guest_gpr_switch */
1938 io_emul_stub[0] = 0xe8;
1939 *(s32 *)&io_emul_stub[1] =
1940 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1941 /* 7 x nop */
1942 memset(&io_emul_stub[5], 0x90, 7);
1943 #endif
1944 /* data16 or nop */
1945 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1946 /* <io-access opcode> */
1947 io_emul_stub[13] = opcode;
1948 /* imm8 or nop */
1949 io_emul_stub[14] = 0x90;
1950 /* ret (jumps to guest_to_host_gpr_switch) */
1951 io_emul_stub[15] = 0xc3;
1953 /* Handy function-typed pointer to the stub. */
1954 io_emul = (void *)io_emul_stub;
1956 if ( ioemul_handle_quirk )
1957 ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
1959 /* I/O Port and Interrupt Flag instructions. */
1960 switch ( opcode )
1962 case 0xe4: /* IN imm8,%al */
1963 op_bytes = 1;
1964 case 0xe5: /* IN imm8,%eax */
1965 port = insn_fetch(u8, code_base, eip, code_limit);
1966 io_emul_stub[14] = port; /* imm8 */
1967 exec_in:
1968 if ( !guest_io_okay(port, op_bytes, v, regs) )
1969 goto fail;
1970 if ( admin_io_okay(port, op_bytes, v, regs) )
1972 io_emul(regs);
1974 else
1976 if ( op_bytes == 4 )
1977 regs->eax = 0;
1978 else
1979 regs->eax &= ~((1u << (op_bytes * 8)) - 1);
1980 regs->eax |= guest_io_read(port, op_bytes, v, regs);
1982 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1983 goto done;
1985 case 0xec: /* IN %dx,%al */
1986 op_bytes = 1;
1987 case 0xed: /* IN %dx,%eax */
1988 port = (u16)regs->edx;
1989 goto exec_in;
1991 case 0xe6: /* OUT %al,imm8 */
1992 op_bytes = 1;
1993 case 0xe7: /* OUT %eax,imm8 */
1994 port = insn_fetch(u8, code_base, eip, code_limit);
1995 io_emul_stub[14] = port; /* imm8 */
1996 exec_out:
1997 if ( !guest_io_okay(port, op_bytes, v, regs) )
1998 goto fail;
1999 if ( admin_io_okay(port, op_bytes, v, regs) )
2001 if ( (op_bytes == 1) &&
2002 ((port == 0x71) || (port == 0x70)) &&
2003 pv_rtc_handler )
2004 pv_rtc_handler(port, regs->eax);
2005 io_emul(regs);
2006 if ( (op_bytes == 1) && pv_post_outb_hook )
2007 pv_post_outb_hook(port, regs->eax);
2009 else
2011 guest_io_write(port, op_bytes, regs->eax, v, regs);
2013 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
2014 goto done;
2016 case 0xee: /* OUT %al,%dx */
2017 op_bytes = 1;
2018 case 0xef: /* OUT %eax,%dx */
2019 port = (u16)regs->edx;
2020 goto exec_out;
2022 case 0xfa: /* CLI */
2023 case 0xfb: /* STI */
2024 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
2025 goto fail;
2026 /*
2027 * This is just too dangerous to allow, in my opinion. Consider if the
2028 * caller then tries to reenable interrupts using POPF: we can't trap
2029 * that and we'll end up with hard-to-debug lockups. Fast & loose will
2030 * do for us. :-)
2031 */
2032 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
2033 goto done;
2036 /* No decode of this single-byte opcode. */
2037 goto fail;
2039 twobyte_opcode:
2040 /*
2041 * All 2 and 3 byte opcodes, except RDTSC (0x31) and RDTSCP (0x1,0xF9)
2042 * are executable only from guest kernel mode (virtual ring 0).
2043 */
2044 opcode = insn_fetch(u8, code_base, eip, code_limit);
2045 if ( !guest_kernel_mode(v, regs) && (opcode != 0x1) && (opcode != 0x31) )
2046 goto fail;
2048 if ( lock && (opcode & ~3) != 0x20 )
2049 goto fail;
2050 switch ( opcode )
2052 case 0x1: /* RDTSCP */
2053 if ( (v->arch.guest_context.ctrlreg[4] & X86_CR4_TSD) &&
2054 !guest_kernel_mode(v, regs) )
2055 goto fail;
2056 if ( insn_fetch(u8, code_base, eip, code_limit) != 0xf9 )
2057 goto fail;
2058 pv_soft_rdtsc(v, regs, 1);
2059 break;
2061 case 0x06: /* CLTS */
2062 (void)do_fpu_taskswitch(0);
2063 break;
2065 case 0x09: /* WBINVD */
2066 /* Ignore the instruction if unprivileged. */
2067 if ( !cache_flush_permitted(v->domain) )
2068 /* Non-physdev domain attempted WBINVD; ignore for now since
2069 newer linux uses this in some start-of-day timing loops */
2071 else
2072 wbinvd();
2073 break;
2075 case 0x20: /* MOV CR?,<reg> */
2076 opcode = insn_fetch(u8, code_base, eip, code_limit);
2077 if ( opcode < 0xc0 )
2078 goto fail;
2079 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2080 modrm_rm |= (opcode >> 0) & 7;
2081 reg = decode_register(modrm_rm, regs, 0);
2082 switch ( modrm_reg )
2084 case 0: /* Read CR0 */
2085 *reg = (read_cr0() & ~X86_CR0_TS) |
2086 v->arch.guest_context.ctrlreg[0];
2087 break;
2089 case 2: /* Read CR2 */
2090 *reg = v->arch.guest_context.ctrlreg[2];
2091 break;
2093 case 3: /* Read CR3 */
2095 unsigned long mfn;
2097 if ( !is_pv_32on64_vcpu(v) )
2099 mfn = pagetable_get_pfn(v->arch.guest_table);
2100 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
2101 v->domain, mfn));
2103 #ifdef CONFIG_COMPAT
2104 else
2106 mfn = l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)));
2107 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
2108 v->domain, mfn));
2110 #endif
2111 /* PTs should not be shared */
2112 BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
2114 break;
2116 case 4: /* Read CR4 */
2117 *reg = v->arch.guest_context.ctrlreg[4];
2118 break;
2120 default:
2121 goto fail;
2123 break;
2125 case 0x21: /* MOV DR?,<reg> */ {
2126 unsigned long res;
2127 opcode = insn_fetch(u8, code_base, eip, code_limit);
2128 if ( opcode < 0xc0 )
2129 goto fail;
2130 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2131 modrm_rm |= (opcode >> 0) & 7;
2132 reg = decode_register(modrm_rm, regs, 0);
2133 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
2134 goto fail;
2135 *reg = res;
2136 break;
2139 case 0x22: /* MOV <reg>,CR? */
2140 opcode = insn_fetch(u8, code_base, eip, code_limit);
2141 if ( opcode < 0xc0 )
2142 goto fail;
2143 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2144 modrm_rm |= (opcode >> 0) & 7;
2145 reg = decode_register(modrm_rm, regs, 0);
2146 switch ( modrm_reg )
2148 case 0: /* Write CR0 */
2149 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
2151 gdprintk(XENLOG_WARNING,
2152 "Attempt to change unmodifiable CR0 flags.\n");
2153 goto fail;
2155 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
2156 break;
2158 case 2: /* Write CR2 */
2159 v->arch.guest_context.ctrlreg[2] = *reg;
2160 arch_set_cr2(v, *reg);
2161 break;
2163 case 3: /* Write CR3 */
2164 domain_lock(v->domain);
2165 if ( !is_pv_32on64_vcpu(v) )
2166 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
2167 #ifdef CONFIG_COMPAT
2168 else
2169 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
2170 #endif
2171 domain_unlock(v->domain);
2172 if ( rc == 0 ) /* not okay */
2173 goto fail;
2174 break;
2176 case 4: /* Write CR4 */
2177 v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(*reg);
2178 write_cr4(pv_guest_cr4_to_real_cr4(v));
2179 break;
2181 default:
2182 goto fail;
2184 break;
2186 case 0x23: /* MOV <reg>,DR? */
2187 opcode = insn_fetch(u8, code_base, eip, code_limit);
2188 if ( opcode < 0xc0 )
2189 goto fail;
2190 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2191 modrm_rm |= (opcode >> 0) & 7;
2192 reg = decode_register(modrm_rm, regs, 0);
2193 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
2194 goto fail;
2195 break;
2197 case 0x30: /* WRMSR */ {
2198 u32 eax = regs->eax;
2199 u32 edx = regs->edx;
2200 u64 val = ((u64)edx << 32) | eax;
2201 switch ( (u32)regs->ecx )
2203 #ifdef CONFIG_X86_64
2204 case MSR_FS_BASE:
2205 if ( is_pv_32on64_vcpu(v) )
2206 goto fail;
2207 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
2208 goto fail;
2209 v->arch.guest_context.fs_base = val;
2210 break;
2211 case MSR_GS_BASE:
2212 if ( is_pv_32on64_vcpu(v) )
2213 goto fail;
2214 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
2215 goto fail;
2216 v->arch.guest_context.gs_base_kernel = val;
2217 break;
2218 case MSR_SHADOW_GS_BASE:
2219 if ( is_pv_32on64_vcpu(v) )
2220 goto fail;
2221 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
2222 goto fail;
2223 v->arch.guest_context.gs_base_user = val;
2224 break;
2225 #endif
2226 case MSR_K7_FID_VID_STATUS:
2227 case MSR_K7_FID_VID_CTL:
2228 case MSR_K8_PSTATE_LIMIT:
2229 case MSR_K8_PSTATE_CTRL:
2230 case MSR_K8_PSTATE_STATUS:
2231 case MSR_K8_PSTATE0:
2232 case MSR_K8_PSTATE1:
2233 case MSR_K8_PSTATE2:
2234 case MSR_K8_PSTATE3:
2235 case MSR_K8_PSTATE4:
2236 case MSR_K8_PSTATE5:
2237 case MSR_K8_PSTATE6:
2238 case MSR_K8_PSTATE7:
2239 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2240 goto fail;
2241 if ( !is_cpufreq_controller(v->domain) )
2242 break;
2243 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2244 goto fail;
2245 break;
2246 case MSR_AMD64_NB_CFG:
2247 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2248 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
2249 goto fail;
2250 if ( !IS_PRIV(v->domain) )
2251 break;
2252 if ( (rdmsr_safe(MSR_AMD64_NB_CFG, l, h) != 0) ||
2253 (eax != l) ||
2254 ((edx ^ h) & ~(1 << (AMD64_NB_CFG_CF8_EXT_ENABLE_BIT - 32))) )
2255 goto invalid;
2256 if ( wrmsr_safe(MSR_AMD64_NB_CFG, eax, edx) != 0 )
2257 goto fail;
2258 break;
2259 case MSR_FAM10H_MMIO_CONF_BASE:
2260 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2261 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
2262 goto fail;
2263 if ( !IS_PRIV(v->domain) )
2264 break;
2265 if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, l, h) != 0) ||
2266 (((((u64)h << 32) | l) ^ val) &
2267 ~( FAM10H_MMIO_CONF_ENABLE |
2268 (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
2269 FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
2270 ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
2271 FAM10H_MMIO_CONF_BASE_SHIFT))) )
2272 goto invalid;
2273 if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, eax, edx) != 0 )
2274 goto fail;
2275 break;
2276 case MSR_IA32_MPERF:
2277 case MSR_IA32_APERF:
2278 case MSR_IA32_PERF_CTL:
2279 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2280 goto fail;
2281 if ( !is_cpufreq_controller(v->domain) )
2282 break;
2283 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2284 goto fail;
2285 break;
2286 case MSR_IA32_THERM_CONTROL:
2287 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2288 goto fail;
2289 if ( (v->domain->domain_id != 0) || !v->domain->is_pinned )
2290 break;
2291 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2292 goto fail;
2293 break;
2294 default:
2295 if ( wrmsr_hypervisor_regs(regs->ecx, val) )
2296 break;
2298 rc = mce_wrmsr(regs->ecx, val);
2299 if ( rc < 0 )
2300 goto fail;
2301 if ( rc )
2302 break;
2304 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
2305 (eax != l) || (edx != h) )
2306 invalid:
2307 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
2308 "%08x:%08x to %08x:%08x.\n",
2309 _p(regs->ecx), h, l, edx, eax);
2310 break;
2312 break;
2315 case 0x31: /* RDTSC */
2316 if ( (v->arch.guest_context.ctrlreg[4] & X86_CR4_TSD) &&
2317 !guest_kernel_mode(v, regs) )
2318 goto fail;
2319 if ( v->domain->arch.vtsc )
2320 pv_soft_rdtsc(v, regs, 0);
2321 else
2322 rdtsc(regs->eax, regs->edx);
2323 break;
2325 case 0x32: /* RDMSR */
2326 switch ( (u32)regs->ecx )
2328 #ifdef CONFIG_X86_64
2329 case MSR_FS_BASE:
2330 if ( is_pv_32on64_vcpu(v) )
2331 goto fail;
2332 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
2333 regs->edx = v->arch.guest_context.fs_base >> 32;
2334 break;
2335 case MSR_GS_BASE:
2336 if ( is_pv_32on64_vcpu(v) )
2337 goto fail;
2338 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
2339 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
2340 break;
2341 case MSR_SHADOW_GS_BASE:
2342 if ( is_pv_32on64_vcpu(v) )
2343 goto fail;
2344 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
2345 regs->edx = v->arch.guest_context.gs_base_user >> 32;
2346 break;
2347 #endif
2348 case MSR_K7_FID_VID_CTL:
2349 case MSR_K7_FID_VID_STATUS:
2350 case MSR_K8_PSTATE_LIMIT:
2351 case MSR_K8_PSTATE_CTRL:
2352 case MSR_K8_PSTATE_STATUS:
2353 case MSR_K8_PSTATE0:
2354 case MSR_K8_PSTATE1:
2355 case MSR_K8_PSTATE2:
2356 case MSR_K8_PSTATE3:
2357 case MSR_K8_PSTATE4:
2358 case MSR_K8_PSTATE5:
2359 case MSR_K8_PSTATE6:
2360 case MSR_K8_PSTATE7:
2361 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2362 goto fail;
2363 if ( !is_cpufreq_controller(v->domain) )
2365 regs->eax = regs->edx = 0;
2366 break;
2368 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) != 0 )
2369 goto fail;
2370 break;
2371 case MSR_IA32_MISC_ENABLE:
2372 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2373 goto fail;
2374 regs->eax &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
2375 MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
2376 regs->eax |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
2377 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
2378 MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
2379 break;
2380 case MSR_EFER:
2381 case MSR_AMD_PATCHLEVEL:
2382 default:
2383 if ( rdmsr_hypervisor_regs(regs->ecx, &val) )
2385 rdmsr_writeback:
2386 regs->eax = (uint32_t)val;
2387 regs->edx = (uint32_t)(val >> 32);
2388 break;
2391 rc = mce_rdmsr(regs->ecx, &val);
2392 if ( rc < 0 )
2393 goto fail;
2394 if ( rc )
2395 goto rdmsr_writeback;
2397 /* Everyone can read the MSR space. */
2398 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
2399 _p(regs->ecx));*/
2400 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2401 goto fail;
2402 break;
2404 break;
2406 default:
2407 goto fail;
2410 #undef wr_ad
2411 #undef rd_ad
2413 done:
2414 instruction_done(regs, eip, bpmatch);
2415 skip:
2416 return EXCRET_fault_fixed;
2418 fail:
2419 return 0;
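/*
 * Roughly, for the helper below: a normal (expand-up) stack segment is
 * usable over [0, limit], so the topmost byte touched (esp - 1) must not
 * exceed the limit; an expand-down segment (_SEGMENT_EC) is usable only
 * above the limit, so the lowest byte touched (esp - decr) must be
 * strictly greater than it. The first clause guards against the
 * decrement wrapping around zero. Worked example (expand-up segment):
 * limit = 0xffff, esp = 0x20, decr = 0x18 passes, since esp - 1 = 0x1f
 * is within the limit and 0x8 < 0x1f shows no wrap occurred.
 */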
2422 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
2423 unsigned int esp, unsigned int decr)
2424 {
2425 return (((esp - decr) < (esp - 1)) &&
2426 (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
2427 }
2429 static void emulate_gate_op(struct cpu_user_regs *regs)
2431 #ifdef __x86_64__
2432 struct vcpu *v = current;
2433 unsigned int sel, ar, dpl, nparm, opnd_sel;
2434 unsigned int op_default, op_bytes, ad_default, ad_bytes;
2435 unsigned long off, eip, opnd_off, base, limit;
2436 int jump;
2438 /* Check whether this fault is due to the use of a call gate. */
2439 if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
2440 (((ar >> 13) & 3) < (regs->cs & 3)) ||
2441 ((ar & _SEGMENT_TYPE) != 0xc00) )
2443 do_guest_trap(TRAP_gp_fault, regs, 1);
2444 return;
2446 if ( !(ar & _SEGMENT_P) )
2448 do_guest_trap(TRAP_no_segment, regs, 1);
2449 return;
2451 dpl = (ar >> 13) & 3;
2452 nparm = ar & 0x1f;
2454 /*
2455 * Decode instruction (and perhaps operand) to determine RPL,
2456 * whether this is a jump or a call, and the call return offset.
2457 */
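/*
 * For reference, the decode loop below recognises the far-transfer forms
 * that can go through a call gate: 0x9A (CALL ptr16:off, direct), 0xEA
 * (JMP ptr16:off, direct), and 0xFF with ModRM /3 or /5 (far CALL/JMP
 * indirect through memory), plus the usual operand/address-size and
 * segment-override prefixes. Anything else terminates the loop with
 * 'jump' still at -1 and is rejected as a #GP further down.
 */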
2458 if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
2459 !(ar & _SEGMENT_S) ||
2460 !(ar & _SEGMENT_P) ||
2461 !(ar & _SEGMENT_CODE) )
2463 do_guest_trap(TRAP_gp_fault, regs, 1);
2464 return;
2467 op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
2468 ad_default = ad_bytes = op_default;
2469 opnd_sel = opnd_off = 0;
2470 jump = -1;
2471 for ( eip = regs->eip; eip - regs->_eip < 10; )
2473 switch ( insn_fetch(u8, base, eip, limit) )
2475 case 0x66: /* operand-size override */
2476 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
2477 continue;
2478 case 0x67: /* address-size override */
2479 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
2480 continue;
2481 case 0x2e: /* CS override */
2482 opnd_sel = regs->cs;
2483 ASSERT(opnd_sel);
2484 continue;
2485 case 0x3e: /* DS override */
2486 opnd_sel = read_sreg(regs, ds);
2487 if ( !opnd_sel )
2488 opnd_sel = dpl;
2489 continue;
2490 case 0x26: /* ES override */
2491 opnd_sel = read_sreg(regs, es);
2492 if ( !opnd_sel )
2493 opnd_sel = dpl;
2494 continue;
2495 case 0x64: /* FS override */
2496 opnd_sel = read_sreg(regs, fs);
2497 if ( !opnd_sel )
2498 opnd_sel = dpl;
2499 continue;
2500 case 0x65: /* GS override */
2501 opnd_sel = read_sreg(regs, gs);
2502 if ( !opnd_sel )
2503 opnd_sel = dpl;
2504 continue;
2505 case 0x36: /* SS override */
2506 opnd_sel = regs->ss;
2507 if ( !opnd_sel )
2508 opnd_sel = dpl;
2509 continue;
2510 case 0xea:
2511 ++jump;
2512 /* FALLTHROUGH */
2513 case 0x9a:
2514 ++jump;
2515 opnd_sel = regs->cs;
2516 opnd_off = eip;
2517 ad_bytes = ad_default;
2518 eip += op_bytes + 2;
2519 break;
2520 case 0xff:
2522 unsigned int modrm;
2524 switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
2526 case 0x28: case 0x68: case 0xa8:
2527 ++jump;
2528 /* FALLTHROUGH */
2529 case 0x18: case 0x58: case 0x98:
2530 ++jump;
2531 if ( ad_bytes != 2 )
2533 if ( (modrm & 7) == 4 )
2535 unsigned int sib;
2536 sib = insn_fetch(u8, base, eip, limit);
2538 modrm = (modrm & ~7) | (sib & 7);
2539 if ( (sib >>= 3) != 4 )
2540 opnd_off = *(unsigned long *)
2541 decode_register(sib & 7, regs, 0);
2542 opnd_off <<= sib >> 3;
2544 if ( (modrm & 7) != 5 || (modrm & 0xc0) )
2545 opnd_off += *(unsigned long *)
2546 decode_register(modrm & 7, regs, 0);
2547 else
2548 modrm |= 0x87;
2549 if ( !opnd_sel )
2551 switch ( modrm & 7 )
2553 default:
2554 opnd_sel = read_sreg(regs, ds);
2555 break;
2556 case 4: case 5:
2557 opnd_sel = regs->ss;
2558 break;
2562 else
2564 switch ( modrm & 7 )
2566 case 0: case 1: case 7:
2567 opnd_off = regs->ebx;
2568 break;
2569 case 6:
2570 if ( !(modrm & 0xc0) )
2571 modrm |= 0x80;
2572 else
2573 case 2: case 3:
2575 opnd_off = regs->ebp;
2576 if ( !opnd_sel )
2577 opnd_sel = regs->ss;
2579 break;
2581 if ( !opnd_sel )
2582 opnd_sel = read_sreg(regs, ds);
2583 switch ( modrm & 7 )
2585 case 0: case 2: case 4:
2586 opnd_off += regs->esi;
2587 break;
2588 case 1: case 3: case 5:
2589 opnd_off += regs->edi;
2590 break;
2593 switch ( modrm & 0xc0 )
2595 case 0x40:
2596 opnd_off += insn_fetch(s8, base, eip, limit);
2597 break;
2598 case 0x80:
2599 opnd_off += insn_fetch(s32, base, eip, limit);
2600 break;
2602 if ( ad_bytes == 4 )
2603 opnd_off = (unsigned int)opnd_off;
2604 else if ( ad_bytes == 2 )
2605 opnd_off = (unsigned short)opnd_off;
2606 break;
2609 break;
2611 break;
2614 if ( jump < 0 )
2616 fail:
2617 do_guest_trap(TRAP_gp_fault, regs, 1);
2618 skip:
2619 return;
2622 if ( (opnd_sel != regs->cs &&
2623 !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
2624 !(ar & _SEGMENT_S) ||
2625 !(ar & _SEGMENT_P) ||
2626 ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
2628 do_guest_trap(TRAP_gp_fault, regs, 1);
2629 return;
2632 opnd_off += op_bytes;
2633 #define ad_default ad_bytes
2634 opnd_sel = insn_fetch(u16, base, opnd_off, limit);
2635 #undef ad_default
2636 ASSERT((opnd_sel & ~3) == regs->error_code);
2637 if ( dpl < (opnd_sel & 3) )
2639 do_guest_trap(TRAP_gp_fault, regs, 1);
2640 return;
2643 if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
2644 !(ar & _SEGMENT_S) ||
2645 !(ar & _SEGMENT_CODE) ||
2646 (!jump || (ar & _SEGMENT_EC) ?
2647 ((ar >> 13) & 3) > (regs->cs & 3) :
2648 ((ar >> 13) & 3) != (regs->cs & 3)) )
2650 regs->error_code = sel;
2651 do_guest_trap(TRAP_gp_fault, regs, 1);
2652 return;
2654 if ( !(ar & _SEGMENT_P) )
2656 regs->error_code = sel;
2657 do_guest_trap(TRAP_no_segment, regs, 1);
2658 return;
2660 if ( off > limit )
2662 regs->error_code = 0;
2663 do_guest_trap(TRAP_gp_fault, regs, 1);
2664 return;
2667 if ( !jump )
2669 unsigned int ss, esp, *stkp;
2670 int rc;
2671 #define push(item) do \
2672 { \
2673 --stkp; \
2674 esp -= 4; \
2675 rc = __put_user(item, stkp); \
2676 if ( rc ) \
2677 { \
2678 propagate_page_fault((unsigned long)(stkp + 1) - rc, \
2679 PFEC_write_access); \
2680 return; \
2681 } \
2682 } while ( 0 )
2684 if ( ((ar >> 13) & 3) < (regs->cs & 3) )
2686 sel |= (ar >> 13) & 3;
2687 /* Inner stack known only for kernel ring. */
2688 if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
2690 do_guest_trap(TRAP_gp_fault, regs, 1);
2691 return;
2693 esp = v->arch.guest_context.kernel_sp;
2694 ss = v->arch.guest_context.kernel_ss;
2695 if ( (ss & 3) != (sel & 3) ||
2696 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2697 ((ar >> 13) & 3) != (sel & 3) ||
2698 !(ar & _SEGMENT_S) ||
2699 (ar & _SEGMENT_CODE) ||
2700 !(ar & _SEGMENT_WR) )
2702 regs->error_code = ss & ~3;
2703 do_guest_trap(TRAP_invalid_tss, regs, 1);
2704 return;
2706 if ( !(ar & _SEGMENT_P) ||
2707 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
2709 regs->error_code = ss & ~3;
2710 do_guest_trap(TRAP_stack_error, regs, 1);
2711 return;
2713 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2714 if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
2716 do_guest_trap(TRAP_gp_fault, regs, 1);
2717 return;
2719 push(regs->ss);
2720 push(regs->esp);
2721 if ( nparm )
2723 const unsigned int *ustkp;
2725 if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
2726 ((ar >> 13) & 3) != (regs->cs & 3) ||
2727 !(ar & _SEGMENT_S) ||
2728 (ar & _SEGMENT_CODE) ||
2729 !(ar & _SEGMENT_WR) ||
2730 !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
2731 return do_guest_trap(TRAP_gp_fault, regs, 1);
2732 ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
2733 if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
2735 do_guest_trap(TRAP_gp_fault, regs, 1);
2736 return;
2738 do
2740 unsigned int parm;
2742 --ustkp;
2743 rc = __get_user(parm, ustkp);
2744 if ( rc )
2746 propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
2747 return;
2749 push(parm);
2750 } while ( --nparm );
2753 else
2755 sel |= (regs->cs & 3);
2756 esp = regs->esp;
2757 ss = regs->ss;
2758 if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2759 ((ar >> 13) & 3) != (sel & 3) )
2761 do_guest_trap(TRAP_gp_fault, regs, 1);
2762 return;
2764 if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
2766 regs->error_code = 0;
2767 do_guest_trap(TRAP_stack_error, regs, 1);
2768 return;
2770 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2771 if ( !compat_access_ok(stkp - 2, 2 * 4) )
2773 do_guest_trap(TRAP_gp_fault, regs, 1);
2774 return;
2777 push(regs->cs);
2778 push(eip);
2779 #undef push
2780 regs->esp = esp;
2781 regs->ss = ss;
2783 else
2784 sel |= (regs->cs & 3);
2786 regs->cs = sel;
2787 instruction_done(regs, off, 0);
2788 #endif
2791 asmlinkage void do_general_protection(struct cpu_user_regs *regs)
2793 struct vcpu *v = current;
2794 unsigned long fixup;
2796 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
2798 if ( regs->error_code & 1 )
2799 goto hardware_gp;
2801 if ( !guest_mode(regs) )
2802 goto gp_in_kernel;
2804 /*
2805 * Cunning trick to allow arbitrary "INT n" handling.
2807 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
2808 * instruction from trapping to the appropriate vector, when that might not
2809 * be expected by Xen or the guest OS. For example, that entry might be for
2810 * a fault handler (unlike traps, faults don't increment EIP), or might
2811 * expect an error code on the stack (which a software trap never
2812 * provides), or might be a hardware interrupt handler that doesn't like
2813 * being called spuriously.
2815 * Instead, a GPF occurs with the faulting IDT vector in the error code.
2816 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
2817 * clear to indicate that it's a software fault, not hardware.
2819 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
2820 * okay because they can only be triggered by an explicit DPL-checked
2821 * instruction. The DPL specified by the guest OS for these vectors is NOT
2822 * CHECKED!!
2823 */
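/*
 * Concretely, the error code pushed for such a fault follows the usual
 * selector error-code layout: bit 0 (EXT) clear for a software-originated
 * fault, bit 1 (IDT) set because an IDT entry was referenced, and the
 * vector number in bits 3-15. An "int $0x80" from the guest therefore
 * arrives here with error_code == (0x80 << 3) | 2 == 0x402, which the
 * code below decodes back to vector 0x80 via error_code >> 3.
 */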
2824 if ( (regs->error_code & 3) == 2 )
2826 /* This fault must be due to <INT n> instruction. */
2827 const struct trap_info *ti;
2828 unsigned char vector = regs->error_code >> 3;
2829 ti = &v->arch.guest_context.trap_ctxt[vector];
2830 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
2832 regs->eip += 2;
2833 do_guest_trap(vector, regs, 0);
2834 return;
2837 else if ( is_pv_32on64_vcpu(v) && regs->error_code )
2839 emulate_gate_op(regs);
2840 return;
2843 /* Emulate some simple privileged and I/O instructions. */
2844 if ( (regs->error_code == 0) &&
2845 emulate_privileged_op(regs) )
2847 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
2848 return;
2851 #if defined(__i386__)
2852 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
2853 (regs->error_code == 0) &&
2854 gpf_emulate_4gb(regs) )
2856 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
2857 return;
2859 #endif
2861 /* Pass on GPF as is. */
2862 do_guest_trap(TRAP_gp_fault, regs, 1);
2863 return;
2865 gp_in_kernel:
2867 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
2869 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
2870 regs->error_code, _p(regs->eip), _p(fixup));
2871 regs->eip = fixup;
2872 return;
2875 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
2877 hardware_gp:
2878 show_execution_state(regs);
2879 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
2882 static DEFINE_PER_CPU(struct softirq_trap, softirq_trap);
2884 static void nmi_mce_softirq(void)
2886 int cpu = smp_processor_id();
2887 struct softirq_trap *st = &per_cpu(softirq_trap, cpu);
2888 cpumask_t affinity;
2890 BUG_ON(st == NULL);
2891 BUG_ON(st->vcpu == NULL);
2893 /* Set the tmp value unconditionally, so that
2894 * the check in the iret hypercall works. */
2895 st->vcpu->cpu_affinity_tmp = st->vcpu->cpu_affinity;
2897 if ((cpu != st->processor)
2898 || (st->processor != st->vcpu->processor))
2900 /* We are on a different physical CPU.
2901 * Make sure to wake up the vcpu on the
2902 * specified processor.
2903 */
2904 cpus_clear(affinity);
2905 cpu_set(st->processor, affinity);
2906 vcpu_set_affinity(st->vcpu, &affinity);
2908 /* Affinity is restored in the iret hypercall. */
2911 /* Only used to defer wakeup of domain/vcpu to
2912 * a safe (non-NMI/MCE) context.
2913 */
2914 vcpu_kick(st->vcpu);
2915 st->vcpu = NULL;
2918 void async_exception_cleanup(struct vcpu *curr)
2920 int trap;
2922 if ( !curr->async_exception_mask )
2923 return;
2925 /* Restore affinity. */
2926 if ( !cpus_empty(curr->cpu_affinity_tmp) &&
2927 !cpus_equal(curr->cpu_affinity_tmp, curr->cpu_affinity) )
2929 vcpu_set_affinity(curr, &curr->cpu_affinity_tmp);
2930 cpus_clear(curr->cpu_affinity_tmp);
2933 if ( !(curr->async_exception_mask & (curr->async_exception_mask - 1)) )
2934 trap = __scanbit(curr->async_exception_mask, VCPU_TRAP_NONE);
2935 else
2936 for ( trap = VCPU_TRAP_NONE + 1; trap <= VCPU_TRAP_LAST; ++trap )
2937 if ( (curr->async_exception_mask ^
2938 curr->async_exception_state(trap).old_mask) == (1 << trap) )
2939 break;
2940 ASSERT(trap <= VCPU_TRAP_LAST);
2942 /* Inject vMCE to PV guests, including Dom0. */
2943 if ( trap == VCPU_TRAP_MCE )
2945 gdprintk(XENLOG_DEBUG, "MCE: Return from vMCE# trap!\n");
2946 if ( curr->vcpu_id == 0 )
2948 struct domain *d = curr->domain;
2950 if ( !d->arch.vmca_msrs.nr_injection )
2952 printk(XENLOG_WARNING "MCE: ret from vMCE#, "
2953 "no injection node\n");
2954 goto end;
2957 d->arch.vmca_msrs.nr_injection--;
2958 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
2960 struct bank_entry *entry;
2962 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
2963 struct bank_entry, list);
2964 gdprintk(XENLOG_DEBUG, "MCE: delete last injection node\n");
2965 list_del(&entry->list);
2967 else
2968 printk(XENLOG_ERR "MCE: didn't find last injection node\n");
2970 /* further injection */
2971 if ( d->arch.vmca_msrs.nr_injection > 0 &&
2972 guest_has_trap_callback(d, 0, TRAP_machine_check) &&
2973 !test_and_set_bool(curr->mce_pending) )
2975 int cpu = smp_processor_id();
2976 cpumask_t affinity;
2978 curr->cpu_affinity_tmp = curr->cpu_affinity;
2979 cpus_clear(affinity);
2980 cpu_set(cpu, affinity);
2981 printk(XENLOG_DEBUG "MCE: CPU%d set affinity, old %d\n",
2982 cpu, curr->processor);
2983 vcpu_set_affinity(curr, &affinity);
2988 end:
2989 /* Restore previous asynchronous exception mask. */
2990 curr->async_exception_mask = curr->async_exception_state(trap).old_mask;
2993 static void nmi_dom0_report(unsigned int reason_idx)
2995 struct domain *d = dom0;
2997 if ( (d == NULL) || (d->vcpu == NULL) || (d->vcpu[0] == NULL) )
2998 return;
3000 set_bit(reason_idx, nmi_reason(d));
3002 send_guest_trap(d, 0, TRAP_nmi);
3005 static void mem_parity_error(struct cpu_user_regs *regs)
3007 switch ( opt_nmi[0] )
3009 case 'd': /* 'dom0' */
3010 nmi_dom0_report(_XEN_NMIREASON_parity_error);
3011 case 'i': /* 'ignore' */
3012 break;
3013 default: /* 'fatal' */
3014 console_force_unlock();
3015 printk("\n\nNMI - MEMORY ERROR\n");
3016 fatal_trap(TRAP_nmi, regs);
3019 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
3020 mdelay(1);
3021 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
3024 static void io_check_error(struct cpu_user_regs *regs)
3026 switch ( opt_nmi[0] )
3028 case 'd': /* 'dom0' */
3029 nmi_dom0_report(_XEN_NMIREASON_io_error);
3030 case 'i': /* 'ignore' */
3031 break;
3032 default: /* 'fatal' */
3033 console_force_unlock();
3034 printk("\n\nNMI - I/O ERROR\n");
3035 fatal_trap(TRAP_nmi, regs);
3038 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
3039 mdelay(1);
3040 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
3043 static void unknown_nmi_error(unsigned char reason)
3045 switch ( opt_nmi[0] )
3047 case 'd': /* 'dom0' */
3048 nmi_dom0_report(_XEN_NMIREASON_unknown);
3049 case 'i': /* 'ignore' */
3050 break;
3051 default: /* 'fatal' */
3052 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
3053 printk("Dazed and confused, but trying to continue\n");
3054 printk("Do you have a strange power saving mode enabled?\n");
3055 kexec_crash();
3059 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
3060 {
3061 return 0;
3062 }
3064 static nmi_callback_t nmi_callback = dummy_nmi_callback;
3066 asmlinkage void do_nmi(struct cpu_user_regs *regs)
3068 unsigned int cpu = smp_processor_id();
3069 unsigned char reason;
3071 ++nmi_count(cpu);
3073 if ( nmi_callback(regs, cpu) )
3074 return;
3076 if ( nmi_watchdog )
3077 nmi_watchdog_tick(regs);
3079 /* Only the BSP gets external NMIs from the system. */
3080 if ( cpu == 0 )
3082 reason = inb(0x61);
3083 if ( reason & 0x80 )
3084 mem_parity_error(regs);
3085 else if ( reason & 0x40 )
3086 io_check_error(regs);
3087 else if ( !nmi_watchdog )
3088 unknown_nmi_error((unsigned char)(reason&0xff));
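/*
 * The 'reason' byte read above comes from System Control Port B (0x61)
 * on legacy PC hardware: bit 7 reports a memory parity/SERR# error and
 * bit 6 an I/O channel check (IOCHK#), which is why those two bits
 * select mem_parity_error() and io_check_error(). Those handlers in turn
 * toggle bits 2 and 3 of the same port to acknowledge and re-enable the
 * respective error source.
 */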
3092 void set_nmi_callback(nmi_callback_t callback)
3093 {
3094 nmi_callback = callback;
3095 }
3097 void unset_nmi_callback(void)
3098 {
3099 nmi_callback = dummy_nmi_callback;
3100 }
3102 asmlinkage void do_device_not_available(struct cpu_user_regs *regs)
3104 struct vcpu *curr = current;
3106 BUG_ON(!guest_mode(regs));
3108 setup_fpu(curr);
3110 if ( curr->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
3112 do_guest_trap(TRAP_no_device, regs, 0);
3113 curr->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
3115 else
3116 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
3118 return;
3121 asmlinkage void do_debug(struct cpu_user_regs *regs)
3123 struct vcpu *v = current;
3125 DEBUGGER_trap_entry(TRAP_debug, regs);
3127 if ( !guest_mode(regs) )
3129 if ( regs->eflags & X86_EFLAGS_TF )
3131 #ifdef __x86_64__
3132 void sysenter_entry(void);
3133 void sysenter_eflags_saved(void);
3134 /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
3135 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
3136 (regs->rip <= (unsigned long)sysenter_eflags_saved) )
3138 if ( regs->rip == (unsigned long)sysenter_eflags_saved )
3139 regs->eflags &= ~X86_EFLAGS_TF;
3140 goto out;
3142 #endif
3143 if ( !debugger_trap_fatal(TRAP_debug, regs) )
3145 WARN_ON(1);
3146 regs->eflags &= ~X86_EFLAGS_TF;
3149 else
3151 /*
3152 * We ignore watchpoints when they trigger within Xen. This may
3153 * happen when a buffer is passed to us which previously had a
3154 * watchpoint set on it. No need to bump EIP; the only faulting
3155 * trap is an instruction breakpoint, which can't happen to us.
3156 */
3157 WARN_ON(!search_exception_table(regs->eip));
3159 goto out;
3162 /* Save debug status register where guest OS can peek at it */
3163 v->arch.guest_context.debugreg[6] = read_debugreg(6);
3165 ler_enable();
3166 do_guest_trap(TRAP_debug, regs, 0);
3167 return;
3169 out:
3170 ler_enable();
3171 return;
3174 asmlinkage void do_spurious_interrupt_bug(struct cpu_user_regs *regs)
3175 {
3176 }
3178 static void __set_intr_gate(unsigned int n, uint32_t dpl, void *addr)
3180 int i;
3181 /* Keep secondary tables in sync with IRQ updates. */
3182 for ( i = 1; i < NR_CPUS; i++ )
3183 if ( idt_tables[i] != NULL )
3184 _set_gate(&idt_tables[i][n], 14, dpl, addr);
3185 _set_gate(&idt_table[n], 14, dpl, addr);
3188 static void set_swint_gate(unsigned int n, void *addr)
3189 {
3190 __set_intr_gate(n, 3, addr);
3191 }
3193 void set_intr_gate(unsigned int n, void *addr)
3194 {
3195 __set_intr_gate(n, 0, addr);
3196 }
3198 void load_TR(void)
3200 struct tss_struct *tss = &this_cpu(init_tss);
3201 struct desc_ptr old_gdt, tss_gdt = {
3202 .base = (long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY),
3203 .limit = LAST_RESERVED_GDT_BYTE
3204 };
3206 _set_tssldt_desc(
3207 this_cpu(gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
3208 (unsigned long)tss,
3209 offsetof(struct tss_struct, __cacheline_filler) - 1,
3210 9);
3211 #ifdef CONFIG_COMPAT
3212 _set_tssldt_desc(
3213 this_cpu(compat_gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
3214 (unsigned long)tss,
3215 offsetof(struct tss_struct, __cacheline_filler) - 1,
3216 11);
3217 #endif
3219 /* Switch to non-compat GDT (which has B bit clear) to execute LTR. */
3220 asm volatile (
3221 "sgdt %0; lgdt %2; ltr %w1; lgdt %0"
3222 : "=m" (old_gdt) : "rm" (TSS_ENTRY << 3), "m" (tss_gdt) : "memory" );
3225 void __devinit percpu_traps_init(void)
3227 subarch_percpu_traps_init();
3229 if ( !opt_ler )
3230 return;
3232 switch ( boot_cpu_data.x86_vendor )
3234 case X86_VENDOR_INTEL:
3235 switch ( boot_cpu_data.x86 )
3237 case 6:
3238 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
3239 break;
3240 case 15:
3241 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
3242 break;
3244 break;
3245 case X86_VENDOR_AMD:
3246 switch ( boot_cpu_data.x86 )
3248 case 6:
3249 case 15:
3250 case 16:
3251 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
3252 break;
3254 break;
3257 ler_enable();
3260 void __init trap_init(void)
3262 /*
3263 * Note that interrupt gates are always used, rather than trap gates. We
3264 * must have interrupts disabled until DS/ES/FS/GS are saved because the
3265 * first activation must have the "bad" value(s) for these registers and
3266 * we may lose them if another activation is installed before they are
3267 * saved. The page-fault handler also needs interrupts disabled until %cr2
3268 * has been read and saved on the stack.
3269 */
3270 set_intr_gate(TRAP_divide_error,&divide_error);
3271 set_intr_gate(TRAP_debug,&debug);
3272 set_intr_gate(TRAP_nmi,&nmi);
3273 set_swint_gate(TRAP_int3,&int3); /* usable from all privileges */
3274 set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */
3275 set_intr_gate(TRAP_bounds,&bounds);
3276 set_intr_gate(TRAP_invalid_op,&invalid_op);
3277 set_intr_gate(TRAP_no_device,&device_not_available);
3278 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
3279 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
3280 set_intr_gate(TRAP_no_segment,&segment_not_present);
3281 set_intr_gate(TRAP_stack_error,&stack_segment);
3282 set_intr_gate(TRAP_gp_fault,&general_protection);
3283 set_intr_gate(TRAP_page_fault,&page_fault);
3284 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
3285 set_intr_gate(TRAP_copro_error,&coprocessor_error);
3286 set_intr_gate(TRAP_alignment_check,&alignment_check);
3287 set_intr_gate(TRAP_machine_check,&machine_check);
3288 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
3290 /* CPU0 uses the master IDT. */
3291 idt_tables[0] = idt_table;
3293 percpu_traps_init();
3295 cpu_init();
3297 open_softirq(NMI_MCE_SOFTIRQ, nmi_mce_softirq);
3300 long register_guest_nmi_callback(unsigned long address)
3302 struct vcpu *v = current;
3303 struct domain *d = v->domain;
3304 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3306 t->vector = TRAP_nmi;
3307 t->flags = 0;
3308 t->cs = (is_pv_32on64_domain(d) ?
3309 FLAT_COMPAT_KERNEL_CS : FLAT_KERNEL_CS);
3310 t->address = address;
3311 TI_SET_IF(t, 1);
3313 /*
3314 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
3315 * now.
3316 */
3317 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
3318 v->nmi_pending = 1;
3320 return 0;
3323 long unregister_guest_nmi_callback(void)
3325 struct vcpu *v = current;
3326 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3328 memset(t, 0, sizeof(*t));
3330 return 0;
3333 int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3335 struct vcpu *v;
3336 struct trap_info *t;
3338 BUG_ON(d == NULL);
3339 BUG_ON(vcpuid >= d->max_vcpus);
3341 /* Sanity check - XXX should be more fine grained. */
3342 BUG_ON(trap_nr > TRAP_syscall);
3344 v = d->vcpu[vcpuid];
3345 t = &v->arch.guest_context.trap_ctxt[trap_nr];
3347 return (t->address != 0);
3351 int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3353 struct vcpu *v;
3354 struct softirq_trap *st = &per_cpu(softirq_trap, smp_processor_id());
3356 BUG_ON(d == NULL);
3357 BUG_ON(vcpuid >= d->max_vcpus);
3358 v = d->vcpu[vcpuid];
3360 switch (trap_nr) {
3361 case TRAP_nmi:
3362 if ( cmpxchgptr(&st->vcpu, NULL, v) )
3363 return -EBUSY;
3364 if ( !test_and_set_bool(v->nmi_pending) ) {
3365 st->domain = d;
3366 st->processor = v->processor;
3368 /* not safe to wake up a vcpu here */
3369 raise_softirq(NMI_MCE_SOFTIRQ);
3370 return 0;
3372 st->vcpu = NULL;
3373 break;
3375 case TRAP_machine_check:
3376 if ( cmpxchgptr(&st->vcpu, NULL, v) )
3377 return -EBUSY;
3379 /* We are called by the machine check (exception or polling) handlers
3380 * on the physical CPU that reported a machine check error. */
3382 if ( !test_and_set_bool(v->mce_pending) ) {
3383 st->domain = d;
3384 st->vcpu = v;
3385 st->processor = v->processor;
3387 /* not safe to wake up a vcpu here */
3388 raise_softirq(NMI_MCE_SOFTIRQ);
3389 return 0;
3391 st->vcpu = NULL;
3392 break;
3395 /* delivery failed */
3396 return -EIO;
3400 long do_set_trap_table(XEN_GUEST_HANDLE(const_trap_info_t) traps)
3402 struct trap_info cur;
3403 struct vcpu *curr = current;
3404 struct trap_info *dst = curr->arch.guest_context.trap_ctxt;
3405 long rc = 0;
3407 /* If no table is presented then clear the entire virtual IDT. */
3408 if ( guest_handle_is_null(traps) )
3410 memset(dst, 0, 256 * sizeof(*dst));
3411 init_int80_direct_trap(curr);
3412 return 0;
3415 for ( ; ; )
3417 if ( hypercall_preempt_check() )
3419 rc = hypercall_create_continuation(
3420 __HYPERVISOR_set_trap_table, "h", traps);
3421 break;
3424 if ( copy_from_guest(&cur, traps, 1) )
3426 rc = -EFAULT;
3427 break;
3430 if ( cur.address == 0 )
3431 break;
3433 fixup_guest_code_selector(curr->domain, cur.cs);
3435 memcpy(&dst[cur.vector], &cur, sizeof(cur));
3437 if ( cur.vector == 0x80 )
3438 init_int80_direct_trap(curr);
3440 guest_handle_add_offset(traps, 1);
3443 return rc;
3446 long set_debugreg(struct vcpu *v, int reg, unsigned long value)
3448 int i;
3449 struct vcpu *curr = current;
3451 switch ( reg )
3453 case 0:
3454 if ( !access_ok(value, sizeof(long)) )
3455 return -EPERM;
3456 if ( v == curr )
3457 write_debugreg(0, value);
3458 break;
3459 case 1:
3460 if ( !access_ok(value, sizeof(long)) )
3461 return -EPERM;
3462 if ( v == curr )
3463 write_debugreg(1, value);
3464 break;
3465 case 2:
3466 if ( !access_ok(value, sizeof(long)) )
3467 return -EPERM;
3468 if ( v == curr )
3469 write_debugreg(2, value);
3470 break;
3471 case 3:
3472 if ( !access_ok(value, sizeof(long)) )
3473 return -EPERM;
3474 if ( v == curr )
3475 write_debugreg(3, value);
3476 break;
3477 case 6:
3478 /*
3479 * DR6: Bits 4-11,16-31 reserved (set to 1).
3480 * Bit 12 reserved (set to 0).
3481 */
3482 value &= 0xffffefff; /* reserved bits => 0 */
3483 value |= 0xffff0ff0; /* reserved bits => 1 */
3484 if ( v == curr )
3485 write_debugreg(6, value);
3486 break;
3487 case 7:
3488 /*
3489 * DR7: Bit 10 reserved (set to 1).
3490 * Bits 11-12,14-15 reserved (set to 0).
3491 */
3492 value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
3493 value |= DR_CONTROL_RESERVED_ONE; /* reserved bits => 1 */
3494 /*
3495 * Privileged bits:
3496 * GD (bit 13): must be 0.
3497 */
3498 if ( value & DR_GENERAL_DETECT )
3499 return -EPERM;
3500 /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
3501 if ( value & DR7_ACTIVE_MASK )
3503 unsigned int io_enable = 0;
3505 for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
3507 if ( ((value >> i) & 3) == DR_IO )
3509 if ( !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
3510 return -EPERM;
3511 io_enable |= value & (3 << ((i - 16) >> 1));
3513 #ifdef __i386__
3514 if ( ((boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
3515 !boot_cpu_has(X86_FEATURE_LM)) &&
3516 (((value >> i) & 0xc) == DR_LEN_8) )
3517 return -EPERM;
3518 #endif
3521 /* Guest DR5 is a handy stash for I/O intercept information. */
3522 v->arch.guest_context.debugreg[5] = io_enable;
3523 value &= ~io_enable;
3525 /*
3526 * If DR7 was previously clear then we need to load all other
3527 * debug registers at this point as they were not restored during
3528 * context switch.
3529 */
3530 if ( (v == curr) &&
3531 !(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
3533 write_debugreg(0, v->arch.guest_context.debugreg[0]);
3534 write_debugreg(1, v->arch.guest_context.debugreg[1]);
3535 write_debugreg(2, v->arch.guest_context.debugreg[2]);
3536 write_debugreg(3, v->arch.guest_context.debugreg[3]);
3537 write_debugreg(6, v->arch.guest_context.debugreg[6]);
3540 if ( v == curr )
3541 write_debugreg(7, value);
3542 break;
3543 default:
3544 return -EINVAL;
3547 v->arch.guest_context.debugreg[reg] = value;
3548 return 0;
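/*
 * For the DR7 case above: bits 0-7 hold the per-breakpoint local/global
 * enable pairs (DR7_ACTIVE_MASK), and, assuming the usual debugreg.h
 * encoding (DR_CONTROL_SHIFT == 16, DR_CONTROL_SIZE == 4, DR_IO == 2),
 * each breakpoint n owns a 4-bit field at bit 16 + 4*n: two R/W type
 * bits (2 meaning an I/O breakpoint, legal only with CR4.DE set) and two
 * length bits. The loop maps an I/O-type field at bit i back to its
 * enable pair at bit ((i - 16) >> 1) and stashes those enables in
 * debugreg[5], so the DR7 value written to hardware has the I/O
 * breakpoints masked out while do_get_debugreg() can still report them.
 */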
3551 long do_set_debugreg(int reg, unsigned long value)
3553 return set_debugreg(current, reg, value);
3556 unsigned long do_get_debugreg(int reg)
3558 struct vcpu *curr = current;
3560 switch ( reg )
3562 case 0 ... 3:
3563 case 6:
3564 return curr->arch.guest_context.debugreg[reg];
3565 case 7:
3566 return (curr->arch.guest_context.debugreg[7] |
3567 curr->arch.guest_context.debugreg[5]);
3568 case 4 ... 5:
3569 return ((curr->arch.guest_context.ctrlreg[4] & X86_CR4_DE) ?
3570 curr->arch.guest_context.debugreg[reg + 2] : 0);
3573 return -EINVAL;
3576 /*
3577 * Local variables:
3578 * mode: C
3579 * c-set-style: "BSD"
3580 * c-basic-offset: 4
3581 * tab-width: 4
3582 * indent-tabs-mode: nil
3583 * End:
3584 */