debuggers.hg

view xen/arch/x86/traps.c @ 22906:700ac6445812

Now add KDB to the non-kdb tree
author Mukesh Rathor
date Thu Feb 03 15:42:41 2011 -0800 (2011-02-03)
parents 4785c70c2b6d
line source
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <xen/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/bitops.h>
55 #include <asm/desc.h>
56 #include <asm/debugreg.h>
57 #include <asm/smp.h>
58 #include <asm/flushtlb.h>
59 #include <asm/uaccess.h>
60 #include <asm/i387.h>
61 #include <asm/debugger.h>
62 #include <asm/msr.h>
63 #include <asm/shared.h>
64 #include <asm/x86_emulate.h>
65 #include <asm/traps.h>
66 #include <asm/hvm/vpt.h>
67 #include <asm/hypercall.h>
68 #include <asm/mce.h>
69 #include <asm/apic.h>
70 #include <public/arch-x86/cpuid.h>
72 /*
73 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
74 * fatal: Xen prints diagnostic message and then hangs.
75 * dom0: The NMI is virtualised to DOM0.
76 * ignore: The NMI error is cleared and ignored.
77 */
78 #ifdef NDEBUG
79 static char __read_mostly opt_nmi[10] = "dom0";
80 #else
81 static char __read_mostly opt_nmi[10] = "fatal";
82 #endif
83 string_param("nmi", opt_nmi);
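/*
 * Illustration (not part of this file): the option above is consumed from the
 * Xen command line, so booting with e.g. "nmi=ignore" or "nmi=fatal"
 * overrides the build-time default selected above.
 */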
85 DEFINE_PER_CPU(u64, efer);
87 DEFINE_PER_CPU_READ_MOSTLY(u32, ler_msr);
89 DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, gdt_table);
90 #ifdef CONFIG_COMPAT
91 DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, compat_gdt_table);
92 #endif
94 /* Master table, used by CPU0. */
95 idt_entry_t idt_table[IDT_ENTRIES];
97 /* Pointer to the IDT of every CPU. */
98 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
100 void (*ioemul_handle_quirk)(
101 u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
103 static int debug_stack_lines = 20;
104 integer_param("debug_stack_lines", debug_stack_lines);
106 static bool_t __devinitdata opt_ler;
107 boolean_param("ler", opt_ler);
109 #ifdef CONFIG_X86_32
110 #define stack_words_per_line 8
111 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
112 #else
113 #define stack_words_per_line 4
114 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
115 #endif
117 static void show_guest_stack(struct vcpu *v, struct cpu_user_regs *regs)
118 {
119 int i;
120 unsigned long *stack, addr;
121 unsigned long mask = STACK_SIZE;
123 if ( is_hvm_vcpu(v) )
124 return;
126 if ( is_pv_32on64_vcpu(v) )
127 {
128 compat_show_guest_stack(v, regs, debug_stack_lines);
129 return;
130 }
132 if ( vm86_mode(regs) )
133 {
134 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
135 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
136 regs->ss, (uint16_t)(regs->esp & 0xffff));
137 }
138 else
139 {
140 stack = (unsigned long *)regs->esp;
141 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
142 }
144 if ( !access_ok(stack, sizeof(*stack)) )
145 {
146 printk("Guest-inaccessible memory.\n");
147 return;
148 }
150 if ( v != current )
151 {
152 struct vcpu *vcpu;
154 ASSERT(guest_kernel_mode(v, regs));
155 #ifndef __x86_64__
156 addr = read_cr3();
157 for_each_vcpu( v->domain, vcpu )
158 if ( vcpu->arch.cr3 == addr )
159 break;
160 #else
161 vcpu = maddr_get_owner(read_cr3()) == v->domain ? v : NULL;
162 #endif
163 if ( !vcpu )
164 {
165 stack = do_page_walk(v, (unsigned long)stack);
166 if ( (unsigned long)stack < PAGE_SIZE )
167 {
168 printk("Inaccessible guest memory.\n");
169 return;
170 }
171 mask = PAGE_SIZE;
172 }
173 }
175 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
176 {
177 if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & mask )
178 break;
179 if ( __get_user(addr, stack) )
180 {
181 if ( i != 0 )
182 printk("\n ");
183 printk("Fault while accessing guest memory.");
184 i = 1;
185 break;
186 }
187 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
188 printk("\n ");
189 printk(" %p", _p(addr));
190 stack++;
191 }
192 if ( i == 0 )
193 printk("Stack empty.");
194 printk("\n");
195 }
197 #if !defined(CONFIG_FRAME_POINTER)
199 void show_trace(struct cpu_user_regs *regs)
200 {
201 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
203 printk("Xen call trace:\n ");
205 printk("[<%p>]", _p(regs->eip));
206 print_symbol(" %s\n ", regs->eip);
208 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
209 {
210 addr = *stack++;
211 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
212 {
213 printk("[<%p>]", _p(addr));
214 print_symbol(" %s\n ", addr);
215 }
216 }
218 printk("\n");
219 }
221 #else
223 void show_trace(struct cpu_user_regs *regs)
224 {
225 unsigned long *frame, next, addr, low, high;
227 printk("Xen call trace:\n ");
229 printk("[<%p>]", _p(regs->eip));
230 print_symbol(" %s\n ", regs->eip);
232 /* Bounds for range of valid frame pointer. */
233 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
234 high = (low & ~(STACK_SIZE - 1)) +
235 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
237 /* The initial frame pointer. */
238 next = regs->ebp;
240 for ( ; ; )
241 {
242 /* Valid frame pointer? */
243 if ( (next < low) || (next >= high) )
244 {
245 /*
246 * Exception stack frames have a different layout, denoted by an
247 * inverted frame pointer.
248 */
249 next = ~next;
250 if ( (next < low) || (next >= high) )
251 break;
252 frame = (unsigned long *)next;
253 next = frame[0];
254 addr = frame[(offsetof(struct cpu_user_regs, eip) -
255 offsetof(struct cpu_user_regs, ebp))
256 / BYTES_PER_LONG];
257 }
258 else
259 {
260 /* Ordinary stack frame. */
261 frame = (unsigned long *)next;
262 next = frame[0];
263 addr = frame[1];
264 }
266 printk("[<%p>]", _p(addr));
267 print_symbol(" %s\n ", addr);
269 low = (unsigned long)&frame[2];
270 }
272 printk("\n");
273 }
275 #endif
277 void show_stack(struct cpu_user_regs *regs)
278 {
279 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
280 int i;
282 if ( guest_mode(regs) )
283 return show_guest_stack(current, regs);
285 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
287 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
288 {
289 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
290 break;
291 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
292 printk("\n ");
293 addr = *stack++;
294 printk(" %p", _p(addr));
295 }
296 if ( i == 0 )
297 printk("Stack empty.");
298 printk("\n");
300 show_trace(regs);
301 }
303 void show_stack_overflow(unsigned int cpu, unsigned long esp)
304 {
305 #ifdef MEMORY_GUARD
306 unsigned long esp_top, esp_bottom;
307 unsigned long *stack, addr;
309 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
310 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
312 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
313 (void *)esp_top, (void *)esp_bottom, (void *)esp,
314 (void *)per_cpu(init_tss, cpu).esp0);
316 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
317 if ( ((unsigned long)(esp - esp_top) > 512) &&
318 ((unsigned long)(esp_top - esp) > 512) )
319 {
320 printk("No stack overflow detected. Skipping stack trace.\n");
321 return;
322 }
324 if ( esp < esp_top )
325 esp = esp_top;
327 printk("Xen stack overflow (dumping trace %p-%p):\n ",
328 (void *)esp, (void *)esp_bottom);
330 stack = (unsigned long *)esp;
331 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
332 {
333 addr = *stack++;
334 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
335 {
336 printk("%p: [<%p>]", stack, _p(addr));
337 print_symbol(" %s\n ", addr);
338 }
339 }
341 printk("\n");
342 #endif
343 }
345 void show_execution_state(struct cpu_user_regs *regs)
346 {
347 show_registers(regs);
348 show_stack(regs);
349 }
351 void vcpu_show_execution_state(struct vcpu *v)
352 {
353 printk("*** Dumping Dom%d vcpu#%d state: ***\n",
354 v->domain->domain_id, v->vcpu_id);
356 if ( v == current )
357 {
358 show_execution_state(guest_cpu_user_regs());
359 return;
360 }
362 vcpu_pause(v); /* acceptably dangerous */
364 vcpu_show_registers(v);
365 if ( guest_kernel_mode(v, &v->arch.guest_context.user_regs) )
366 show_guest_stack(v, &v->arch.guest_context.user_regs);
368 vcpu_unpause(v);
369 }
371 static char *trapstr(int trapnr)
372 {
373 static char *strings[] = {
374 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
375 "invalid opcode", "device not available", "double fault",
376 "coprocessor segment", "invalid tss", "segment not found",
377 "stack error", "general protection fault", "page fault",
378 "spurious interrupt", "coprocessor error", "alignment check",
379 "machine check", "simd error"
380 };
382 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
383 return "???";
385 return strings[trapnr];
386 }
388 /*
389 * This is called for faults at very unexpected times (e.g., when interrupts
390 * are disabled). In such situations we can't do much that is safe. We try to
391 * print out some tracing and then we just spin.
392 */
393 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
394 {
395 static DEFINE_PER_CPU(char, depth);
397 /*
398 * In some cases, we can end up in a vicious cycle of fatal_trap()s
399 * within fatal_trap()s. We give the problem a couple of iterations to
400 * bottom out, and then we just panic.
401 */
402 if ( ++this_cpu(depth) < 3 )
403 {
404 watchdog_disable();
405 console_start_sync();
407 show_execution_state(regs);
409 if ( trapnr == TRAP_page_fault )
410 {
411 unsigned long cr2 = read_cr2();
412 printk("Faulting linear address: %p\n", _p(cr2));
413 show_page_walk(cr2);
414 }
415 }
417 panic("FATAL TRAP: vector = %d (%s)\n"
418 "[error_code=%04x] %s\n",
419 trapnr, trapstr(trapnr), regs->error_code,
420 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
421 }
423 static void do_guest_trap(
424 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
425 {
426 struct vcpu *v = current;
427 struct trap_bounce *tb;
428 const struct trap_info *ti;
430 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
432 tb = &v->arch.trap_bounce;
433 ti = &v->arch.guest_context.trap_ctxt[trapnr];
435 tb->flags = TBF_EXCEPTION;
436 tb->cs = ti->cs;
437 tb->eip = ti->address;
439 if ( use_error_code )
440 {
441 tb->flags |= TBF_EXCEPTION_ERRCODE;
442 tb->error_code = regs->error_code;
443 }
445 if ( TI_GET_IF(ti) )
446 tb->flags |= TBF_INTERRUPT;
448 if ( unlikely(null_trap_bounce(v, tb)) )
449 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] "
450 "on VCPU %d [ec=%04x]\n",
451 trapstr(trapnr), trapnr, v->vcpu_id, regs->error_code);
452 }
454 static void instruction_done(
455 struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
456 {
457 regs->eip = eip;
458 regs->eflags &= ~X86_EFLAGS_RF;
459 if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
460 {
461 current->arch.guest_context.debugreg[6] |= bpmatch | 0xffff0ff0;
462 if ( regs->eflags & X86_EFLAGS_TF )
463 current->arch.guest_context.debugreg[6] |= 0x4000;
464 do_guest_trap(TRAP_debug, regs, 0);
465 }
466 }
468 static unsigned int check_guest_io_breakpoint(struct vcpu *v,
469 unsigned int port, unsigned int len)
470 {
471 unsigned int width, i, match = 0;
472 unsigned long start;
474 if ( !(v->arch.guest_context.debugreg[5]) ||
475 !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
476 return 0;
478 for ( i = 0; i < 4; i++ )
479 {
480 if ( !(v->arch.guest_context.debugreg[5] &
481 (3 << (i * DR_ENABLE_SIZE))) )
482 continue;
484 start = v->arch.guest_context.debugreg[i];
485 width = 0;
487 switch ( (v->arch.guest_context.debugreg[7] >>
488 (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
489 {
490 case DR_LEN_1: width = 1; break;
491 case DR_LEN_2: width = 2; break;
492 case DR_LEN_4: width = 4; break;
493 case DR_LEN_8: width = 8; break;
494 }
496 if ( (start < (port + len)) && ((start + width) > port) )
497 match |= 1 << i;
498 }
500 return match;
501 }
503 /*
504 * Called from asm to set up the MCE trapbounce info.
505 * Returns 0 if no callback is set up, else 1.
506 */
507 asmlinkage int set_guest_machinecheck_trapbounce(void)
508 {
509 struct vcpu *v = current;
510 struct trap_bounce *tb = &v->arch.trap_bounce;
512 do_guest_trap(TRAP_machine_check, guest_cpu_user_regs(), 0);
513 tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */
514 return !null_trap_bounce(v, tb);
515 }
517 /*
518 * Called from asm to set up the NMI trapbounce info.
519 * Returns 0 if no callback is set up, else 1.
520 */
521 asmlinkage int set_guest_nmi_trapbounce(void)
522 {
523 struct vcpu *v = current;
524 struct trap_bounce *tb = &v->arch.trap_bounce;
525 do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0);
526 tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
527 return !null_trap_bounce(v, tb);
528 }
530 static inline void do_trap(
531 int trapnr, struct cpu_user_regs *regs, int use_error_code)
532 {
533 struct vcpu *curr = current;
534 unsigned long fixup;
536 DEBUGGER_trap_entry(trapnr, regs);
538 if ( guest_mode(regs) )
539 {
540 do_guest_trap(trapnr, regs, use_error_code);
541 return;
542 }
544 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
545 {
546 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
547 trapnr, _p(regs->eip), _p(fixup));
548 regs->eip = fixup;
549 return;
550 }
552 if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) &&
553 is_hvm_vcpu(curr) && curr->arch.hvm_vcpu.fpu_exception_callback )
554 {
555 curr->arch.hvm_vcpu.fpu_exception_callback(
556 curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs);
557 return;
558 }
560 DEBUGGER_trap_fatal(trapnr, regs);
562 show_execution_state(regs);
563 panic("FATAL TRAP: vector = %d (%s)\n"
564 "[error_code=%04x]\n",
565 trapnr, trapstr(trapnr), regs->error_code);
566 }
568 #define DO_ERROR_NOCODE(trapnr, name) \
569 asmlinkage void do_##name(struct cpu_user_regs *regs) \
570 { \
571 do_trap(trapnr, regs, 0); \
572 }
574 #define DO_ERROR(trapnr, name) \
575 asmlinkage void do_##name(struct cpu_user_regs *regs) \
576 { \
577 do_trap(trapnr, regs, 1); \
578 }
580 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
581 DO_ERROR_NOCODE(TRAP_overflow, overflow)
582 DO_ERROR_NOCODE(TRAP_bounds, bounds)
583 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
584 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
585 DO_ERROR( TRAP_no_segment, segment_not_present)
586 DO_ERROR( TRAP_stack_error, stack_segment)
587 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
588 DO_ERROR( TRAP_alignment_check, alignment_check)
589 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
591 int rdmsr_hypervisor_regs(uint32_t idx, uint64_t *val)
592 {
593 struct domain *d = current->domain;
594 /* Optionally shift out of the way of Viridian architectural MSRs. */
595 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
597 idx -= base;
598 if ( idx > 0 )
599 return 0;
601 switch ( idx )
602 {
603 case 0:
604 {
605 *val = 0;
606 break;
607 }
608 default:
609 BUG();
610 }
612 return 1;
613 }
615 int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val)
616 {
617 struct domain *d = current->domain;
618 /* Optionally shift out of the way of Viridian architectural MSRs. */
619 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
621 idx -= base;
622 if ( idx > 0 )
623 return 0;
625 switch ( idx )
626 {
627 case 0:
628 {
629 void *hypercall_page;
630 unsigned long mfn;
631 unsigned long gmfn = val >> 12;
632 unsigned int idx = val & 0xfff;
634 if ( idx > 0 )
635 {
636 gdprintk(XENLOG_WARNING,
637 "Out of range index %u to MSR %08x\n",
638 idx, 0x40000000);
639 return 0;
640 }
642 mfn = gmfn_to_mfn(d, gmfn);
644 if ( !mfn_valid(mfn) ||
645 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
646 {
647 gdprintk(XENLOG_WARNING,
648 "Bad GMFN %lx (MFN %lx) to MSR %08x\n",
649 gmfn, mfn, base + idx);
650 return 0;
651 }
653 hypercall_page = map_domain_page(mfn);
654 hypercall_page_initialise(d, hypercall_page);
655 unmap_domain_page(hypercall_page);
657 put_page_and_type(mfn_to_page(mfn));
658 break;
659 }
661 default:
662 BUG();
663 }
665 return 1;
666 }
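/*
 * Illustrative sketch (not from this file): a guest installs its hypercall
 * page by writing the page's frame number and a page index into the base MSR
 * decoded above, roughly
 *
 *     wrmsr(0x40000000, ((uint64_t)hypercall_gfn << 12) | 0);
 *
 * where hypercall_gfn is a hypothetical guest frame number. With a single
 * hypercall-transfer page, only index 0 is accepted.
 */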
668 int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
669 uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
670 {
671 struct domain *d = current->domain;
672 /* Optionally shift out of the way of Viridian architectural leaves. */
673 uint32_t base = is_viridian_domain(d) ? 0x40000100 : 0x40000000;
675 idx -= base;
676 if ( idx > 3 )
677 return 0;
679 switch ( idx )
680 {
681 case 0:
682 *eax = base + 3; /* Largest leaf */
683 *ebx = XEN_CPUID_SIGNATURE_EBX;
684 *ecx = XEN_CPUID_SIGNATURE_ECX;
685 *edx = XEN_CPUID_SIGNATURE_EDX;
686 break;
688 case 1:
689 *eax = (xen_major_version() << 16) | xen_minor_version();
690 *ebx = 0; /* Reserved */
691 *ecx = 0; /* Reserved */
692 *edx = 0; /* Reserved */
693 break;
695 case 2:
696 *eax = 1; /* Number of hypercall-transfer pages */
697 *ebx = 0x40000000; /* MSR base address */
698 if ( is_viridian_domain(d) )
699 *ebx = 0x40000200;
700 *ecx = 0; /* Features 1 */
701 *edx = 0; /* Features 2 */
702 if ( !is_hvm_vcpu(current) )
703 *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
704 break;
706 case 3:
707 *eax = *ebx = *ecx = *edx = 0;
708 cpuid_time_leaf( sub_idx, eax, ebx, ecx, edx );
709 break;
711 default:
712 BUG();
713 }
715 return 1;
716 }
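/*
 * Illustrative sketch (not part of traps.c): how a guest might probe the
 * leaves handled above. Leaf base+0 returns the largest leaf in EAX and the
 * Xen signature in EBX/ECX/EDX; probe_xen_cpuid() is a hypothetical helper.
 */
#if 0
static int probe_xen_cpuid(uint32_t base)
{
    uint32_t eax, ebx, ecx, edx;

    asm volatile ( "cpuid"
                   : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                   : "0" (base), "2" (0) );

    return (ebx == XEN_CPUID_SIGNATURE_EBX) &&
           (ecx == XEN_CPUID_SIGNATURE_ECX) &&
           (edx == XEN_CPUID_SIGNATURE_EDX);
}
#endif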
718 static void pv_cpuid(struct cpu_user_regs *regs)
719 {
720 uint32_t a, b, c, d;
722 a = regs->eax;
723 b = regs->ebx;
724 c = regs->ecx;
725 d = regs->edx;
727 if ( current->domain->domain_id != 0 )
728 {
729 if ( !cpuid_hypervisor_leaves(a, c, &a, &b, &c, &d) )
730 domain_cpuid(current->domain, a, c, &a, &b, &c, &d);
731 goto out;
732 }
734 asm (
735 "cpuid"
736 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
737 : "0" (a), "1" (b), "2" (c), "3" (d) );
739 if ( (regs->eax & 0x7fffffff) == 1 )
740 {
741 /* Modify Feature Information. */
742 __clear_bit(X86_FEATURE_VME, &d);
743 if ( !cpu_has_apic )
744 __clear_bit(X86_FEATURE_APIC, &d);
745 __clear_bit(X86_FEATURE_PSE, &d);
746 __clear_bit(X86_FEATURE_PGE, &d);
747 __clear_bit(X86_FEATURE_PSE36, &d);
748 }
749 switch ( (uint32_t)regs->eax )
750 {
751 case 1:
752 /* Modify Feature Information. */
753 if ( !cpu_has_sep )
754 __clear_bit(X86_FEATURE_SEP, &d);
755 #ifdef __i386__
756 if ( !supervisor_mode_kernel )
757 __clear_bit(X86_FEATURE_SEP, &d);
758 #endif
759 __clear_bit(X86_FEATURE_DS, &d);
760 __clear_bit(X86_FEATURE_ACC, &d);
761 __clear_bit(X86_FEATURE_PBE, &d);
763 __clear_bit(X86_FEATURE_DTES64 % 32, &c);
764 __clear_bit(X86_FEATURE_MWAIT % 32, &c);
765 __clear_bit(X86_FEATURE_DSCPL % 32, &c);
766 __clear_bit(X86_FEATURE_VMXE % 32, &c);
767 __clear_bit(X86_FEATURE_SMXE % 32, &c);
768 __clear_bit(X86_FEATURE_TM2 % 32, &c);
769 if ( is_pv_32bit_vcpu(current) )
770 __clear_bit(X86_FEATURE_CX16 % 32, &c);
771 __clear_bit(X86_FEATURE_XTPR % 32, &c);
772 __clear_bit(X86_FEATURE_PDCM % 32, &c);
773 __clear_bit(X86_FEATURE_DCA % 32, &c);
774 if ( !cpu_has_xsave )
775 {
776 __clear_bit(X86_FEATURE_XSAVE % 32, &c);
777 __clear_bit(X86_FEATURE_AVX % 32, &c);
778 }
779 if ( !cpu_has_apic )
780 __clear_bit(X86_FEATURE_X2APIC % 32, &c);
781 __set_bit(X86_FEATURE_HYPERVISOR % 32, &c);
782 break;
783 case 0x80000001:
784 /* Modify Feature Information. */
785 if ( is_pv_32bit_vcpu(current) )
786 {
787 __clear_bit(X86_FEATURE_LM % 32, &d);
788 __clear_bit(X86_FEATURE_LAHF_LM % 32, &c);
789 }
790 #ifndef __i386__
791 if ( is_pv_32on64_vcpu(current) &&
792 boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
793 #endif
794 __clear_bit(X86_FEATURE_SYSCALL % 32, &d);
795 __clear_bit(X86_FEATURE_PAGE1GB % 32, &d);
796 __clear_bit(X86_FEATURE_RDTSCP % 32, &d);
798 __clear_bit(X86_FEATURE_SVM % 32, &c);
799 if ( !cpu_has_apic )
800 __clear_bit(X86_FEATURE_EXTAPIC % 32, &c);
801 __clear_bit(X86_FEATURE_OSVW % 32, &c);
802 __clear_bit(X86_FEATURE_IBS % 32, &c);
803 __clear_bit(X86_FEATURE_SKINIT % 32, &c);
804 __clear_bit(X86_FEATURE_WDT % 32, &c);
805 __clear_bit(X86_FEATURE_LWP % 32, &c);
806 __clear_bit(X86_FEATURE_NODEID_MSR % 32, &c);
807 __clear_bit(X86_FEATURE_TOPOEXT % 32, &c);
808 break;
809 case 5: /* MONITOR/MWAIT */
810 case 0xa: /* Architectural Performance Monitor Features */
811 case 0x8000000a: /* SVM revision and features */
812 case 0x8000001b: /* Instruction Based Sampling */
813 case 0x8000001c: /* Light Weight Profiling */
814 case 0x8000001e: /* Extended topology reporting */
815 a = b = c = d = 0;
816 break;
817 default:
818 (void)cpuid_hypervisor_leaves(regs->eax, 0, &a, &b, &c, &d);
819 break;
820 }
822 out:
823 regs->eax = a;
824 regs->ebx = b;
825 regs->ecx = c;
826 regs->edx = d;
827 }
829 static int emulate_invalid_rdtscp(struct cpu_user_regs *regs)
830 {
831 char opcode[3];
832 unsigned long eip, rc;
833 struct vcpu *v = current;
835 eip = regs->eip;
836 if ( (rc = copy_from_user(opcode, (char *)eip, sizeof(opcode))) != 0 )
837 {
838 propagate_page_fault(eip + sizeof(opcode) - rc, 0);
839 return EXCRET_fault_fixed;
840 }
841 if ( memcmp(opcode, "\xf\x1\xf9", sizeof(opcode)) )
842 return 0;
843 eip += sizeof(opcode);
844 pv_soft_rdtsc(v, regs, 1);
845 instruction_done(regs, eip, 0);
846 return EXCRET_fault_fixed;
847 }
849 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
850 {
851 char sig[5], instr[2];
852 unsigned long eip, rc;
854 eip = regs->eip;
856 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
857 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
858 {
859 propagate_page_fault(eip + sizeof(sig) - rc, 0);
860 return EXCRET_fault_fixed;
861 }
862 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
863 return 0;
864 eip += sizeof(sig);
866 /* We only emulate CPUID. */
867 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
868 {
869 propagate_page_fault(eip + sizeof(instr) - rc, 0);
870 return EXCRET_fault_fixed;
871 }
872 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
873 return 0;
874 eip += sizeof(instr);
876 pv_cpuid(regs);
878 instruction_done(regs, eip, 0);
880 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
882 return EXCRET_fault_fixed;
883 }
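/*
 * Illustration (not part of this file): a PV guest reaches the path above by
 * prefixing CPUID with "ud2; .ascii 'xen'". The macro name below is
 * hypothetical; Xen's public headers provide an equivalent definition.
 */
#if 0
#define FORCED_EMULATE_PREFIX ".byte 0x0f,0x0b; .ascii \"xen\"; "

static void forced_cpuid(uint32_t leaf, uint32_t *eax, uint32_t *ebx,
                         uint32_t *ecx, uint32_t *edx)
{
    asm volatile ( FORCED_EMULATE_PREFIX "cpuid"
                   : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                   : "0" (leaf), "2" (0) );
}
#endif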
885 asmlinkage void do_invalid_op(struct cpu_user_regs *regs)
886 {
887 struct bug_frame bug;
888 struct bug_frame_str bug_str;
889 const char *p, *filename, *predicate, *eip = (char *)regs->eip;
890 unsigned long fixup;
891 int id, lineno;
893 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
895 if ( likely(guest_mode(regs)) )
896 {
897 if ( !emulate_invalid_rdtscp(regs) &&
898 !emulate_forced_invalid_op(regs) )
899 do_guest_trap(TRAP_invalid_op, regs, 0);
900 return;
901 }
903 if ( !is_kernel(eip) ||
904 __copy_from_user(&bug, eip, sizeof(bug)) ||
905 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
906 (bug.ret != 0xc2) )
907 goto die;
908 eip += sizeof(bug);
910 /* Decode first pointer argument. */
911 if ( !is_kernel(eip) ||
912 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
913 (bug_str.mov != 0xbc) )
914 goto die;
915 p = bug_str(bug_str, eip);
916 if ( !is_kernel(p) )
917 goto die;
918 eip += sizeof(bug_str);
920 id = bug.id & 3;
922 if ( id == BUGFRAME_run_fn )
923 {
924 void (*fn)(struct cpu_user_regs *) = (void *)p;
925 (*fn)(regs);
926 regs->eip = (unsigned long)eip;
927 return;
928 }
930 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
931 filename = p;
932 lineno = bug.id >> 2;
934 if ( id == BUGFRAME_warn )
935 {
936 printk("Xen WARN at %.50s:%d\n", filename, lineno);
937 show_execution_state(regs);
938 regs->eip = (unsigned long)eip;
939 return;
940 }
942 if ( id == BUGFRAME_bug )
943 {
944 printk("Xen BUG at %.50s:%d\n", filename, lineno);
945 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
946 show_execution_state(regs);
947 panic("Xen BUG at %.50s:%d\n", filename, lineno);
948 }
950 /* ASSERT: decode the predicate string pointer. */
951 ASSERT(id == BUGFRAME_assert);
952 if ( !is_kernel(eip) ||
953 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
954 (bug_str.mov != 0xbc) )
955 goto die;
956 predicate = bug_str(bug_str, eip);
957 eip += sizeof(bug_str);
959 if ( !is_kernel(predicate) )
960 predicate = "<unknown>";
961 printk("Assertion '%s' failed at %.50s:%d\n",
962 predicate, filename, lineno);
963 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
964 show_execution_state(regs);
965 panic("Assertion '%s' failed at %.50s:%d\n",
966 predicate, filename, lineno);
968 die:
969 if ( (fixup = search_exception_table(regs->eip)) != 0 )
970 {
971 regs->eip = fixup;
972 return;
973 }
974 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
975 show_execution_state(regs);
976 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
977 }
979 asmlinkage void do_int3(struct cpu_user_regs *regs)
980 {
981 DEBUGGER_trap_entry(TRAP_int3, regs);
983 if ( !guest_mode(regs) )
984 {
985 debugger_trap_fatal(TRAP_int3, regs);
986 return;
987 }
989 do_guest_trap(TRAP_int3, regs, 0);
990 }
992 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
993 {
994 machine_check_vector(regs, regs->error_code);
995 }
997 static void reserved_bit_page_fault(
998 unsigned long addr, struct cpu_user_regs *regs)
999 {
1000 printk("d%d:v%d: reserved bit in page table (ec=%04X)\n",
1001 current->domain->domain_id, current->vcpu_id, regs->error_code);
1002 show_page_walk(addr);
1003 show_execution_state(regs);
1006 void propagate_page_fault(unsigned long addr, u16 error_code)
1008 struct trap_info *ti;
1009 struct vcpu *v = current;
1010 struct trap_bounce *tb = &v->arch.trap_bounce;
1012 v->arch.guest_context.ctrlreg[2] = addr;
1013 arch_set_cr2(v, addr);
1015 /* Re-set error_code.user flag appropriately for the guest. */
1016 error_code &= ~PFEC_user_mode;
1017 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
1018 error_code |= PFEC_user_mode;
1020 trace_pv_page_fault(addr, error_code);
1022 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
1023 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
1024 tb->error_code = error_code;
1025 tb->cs = ti->cs;
1026 tb->eip = ti->address;
1027 if ( TI_GET_IF(ti) )
1028 tb->flags |= TBF_INTERRUPT;
1029 if ( unlikely(null_trap_bounce(v, tb)) )
1031 printk("d%d:v%d: unhandled page fault (ec=%04X)\n",
1032 v->domain->domain_id, v->vcpu_id, error_code);
1033 show_page_walk(addr);
1036 if ( unlikely(error_code & PFEC_reserved_bit) )
1037 reserved_bit_page_fault(addr, guest_cpu_user_regs());
1040 static int handle_gdt_ldt_mapping_fault(
1041 unsigned long offset, struct cpu_user_regs *regs)
1043 struct vcpu *curr = current;
1044 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
1045 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
1046 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
1048 /*
1049 * If the fault is in another vcpu's area, it cannot be due to
1050 * a GDT/LDT descriptor load. Thus we can reasonably exit immediately, and
1051 * indeed we have to since map_ldt_shadow_page() works correctly only on
1052 * accesses to a vcpu's own area.
1053 */
1054 if ( vcpu_area != curr->vcpu_id )
1055 return 0;
1057 /* Byte offset within the gdt/ldt sub-area. */
1058 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
1060 if ( likely(is_ldt_area) )
1062 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
1063 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
1065 if ( guest_mode(regs) )
1066 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
1067 regs->eip, offset);
1069 else
1071 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
1072 if ( !guest_mode(regs) )
1073 return 0;
1074 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
1075 propagate_page_fault(
1076 curr->arch.guest_context.ldt_base + offset,
1077 regs->error_code);
1080 else
1082 /* GDT fault: handle the fault as #GP(selector). */
1083 regs->error_code = (u16)offset & ~7;
1084 (void)do_general_protection(regs);
1087 return EXCRET_fault_fixed;
1090 #ifdef HYPERVISOR_VIRT_END
1091 #define IN_HYPERVISOR_RANGE(va) \
1092 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
1093 #else
1094 #define IN_HYPERVISOR_RANGE(va) \
1095 (((va) >= HYPERVISOR_VIRT_START))
1096 #endif
1098 static int __spurious_page_fault(
1099 unsigned long addr, unsigned int error_code)
1101 unsigned long mfn, cr3 = read_cr3();
1102 #if CONFIG_PAGING_LEVELS >= 4
1103 l4_pgentry_t l4e, *l4t;
1104 #endif
1105 #if CONFIG_PAGING_LEVELS >= 3
1106 l3_pgentry_t l3e, *l3t;
1107 #endif
1108 l2_pgentry_t l2e, *l2t;
1109 l1_pgentry_t l1e, *l1t;
1110 unsigned int required_flags, disallowed_flags;
1112 /*
1113 * We do not take spurious page faults in IRQ handlers as we do not
1114 * modify page tables in IRQ context. We therefore bail here because
1115 * map_domain_page() is not IRQ-safe.
1116 */
1117 if ( in_irq() )
1118 return 0;
1120 /* Reserved bit violations are never spurious faults. */
1121 if ( error_code & PFEC_reserved_bit )
1122 return 0;
1124 required_flags = _PAGE_PRESENT;
1125 if ( error_code & PFEC_write_access )
1126 required_flags |= _PAGE_RW;
1127 if ( error_code & PFEC_user_mode )
1128 required_flags |= _PAGE_USER;
1130 disallowed_flags = 0;
1131 if ( error_code & PFEC_insn_fetch )
1132 disallowed_flags |= _PAGE_NX;
1134 mfn = cr3 >> PAGE_SHIFT;
1136 #if CONFIG_PAGING_LEVELS >= 4
1137 l4t = map_domain_page(mfn);
1138 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
1139 mfn = l4e_get_pfn(l4e);
1140 unmap_domain_page(l4t);
1141 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
1142 (l4e_get_flags(l4e) & disallowed_flags) )
1143 return 0;
1144 #endif
1146 #if CONFIG_PAGING_LEVELS >= 3
1147 l3t = map_domain_page(mfn);
1148 #if CONFIG_PAGING_LEVELS == 3
1149 l3t += (cr3 & 0xFE0UL) >> 3;
1150 #endif
1151 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
1152 mfn = l3e_get_pfn(l3e);
1153 unmap_domain_page(l3t);
1154 #if CONFIG_PAGING_LEVELS == 3
1155 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1156 return 0;
1157 #else
1158 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
1159 (l3e_get_flags(l3e) & disallowed_flags) )
1160 return 0;
1161 #endif
1162 #endif
1164 l2t = map_domain_page(mfn);
1165 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
1166 mfn = l2e_get_pfn(l2e);
1167 unmap_domain_page(l2t);
1168 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
1169 (l2e_get_flags(l2e) & disallowed_flags) )
1170 return 0;
1171 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1172 return 1;
1174 l1t = map_domain_page(mfn);
1175 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
1176 mfn = l1e_get_pfn(l1e);
1177 unmap_domain_page(l1t);
1178 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
1179 (l1e_get_flags(l1e) & disallowed_flags) )
1180 return 0;
1182 return 1;
1185 static int spurious_page_fault(
1186 unsigned long addr, unsigned int error_code)
1188 unsigned long flags;
1189 int is_spurious;
1191 /*
1192 * Disabling interrupts prevents TLB flushing, and hence prevents
1193 * page tables from becoming invalid under our feet during the walk.
1194 */
1195 local_irq_save(flags);
1196 is_spurious = __spurious_page_fault(addr, error_code);
1197 local_irq_restore(flags);
1199 return is_spurious;
1202 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1204 struct vcpu *v = current;
1205 struct domain *d = v->domain;
1207 /* No fixups in interrupt context or when interrupts are disabled. */
1208 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1209 return 0;
1211 /* Faults from external-mode guests are handled by shadow/hap */
1212 if ( paging_mode_external(d) && guest_mode(regs) )
1214 int ret = paging_fault(addr, regs);
1215 if ( ret == EXCRET_fault_fixed )
1216 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1217 return ret;
1220 if ( !(regs->error_code & PFEC_page_present) &&
1221 (pagefault_by_memadd(addr, regs)) )
1222 return handle_memadd_fault(addr, regs);
1224 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1226 if ( !(regs->error_code & (PFEC_user_mode | PFEC_reserved_bit)) &&
1227 (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1228 return handle_gdt_ldt_mapping_fault(
1229 addr - GDT_LDT_VIRT_START, regs);
1230 return 0;
1233 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
1234 guest_kernel_mode(v, regs) )
1236 unsigned int mbs = PFEC_write_access;
1237 unsigned int mbz = PFEC_reserved_bit | PFEC_insn_fetch;
1239 /* When shadow paging is in use, do not require the present bit: the
1240 page may legitimately be absent from the shadow page tables. */
1241 if ( !paging_mode_enabled(d) )
1242 mbs |= PFEC_page_present;
1244 if ( ((regs->error_code & (mbs | mbz)) == mbs) &&
1245 ptwr_do_page_fault(v, addr, regs) )
1246 return EXCRET_fault_fixed;
1249 /* For non-external shadowed guests, we fix up both their own
1250 * pagefaults and Xen's, since they share the pagetables. */
1251 if ( paging_mode_enabled(d) && !paging_mode_external(d) )
1253 int ret = paging_fault(addr, regs);
1254 if ( ret == EXCRET_fault_fixed )
1255 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1256 return ret;
1259 return 0;
1262 /*
1263 * #PF error code:
1264 * Bit 0: Protection violation (=1) ; Page not present (=0)
1265 * Bit 1: Write access
1266 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1267 * Bit 3: Reserved bit violation
1268 * Bit 4: Instruction fetch
1269 */
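/*
 * Worked example (not part of this file): an error code of 0x0003 therefore
 * denotes a supervisor-mode write to a present page, i.e.
 * PFEC_page_present | PFEC_write_access, with the user-mode, reserved-bit
 * and instruction-fetch bits all clear.
 */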
1270 asmlinkage void do_page_fault(struct cpu_user_regs *regs)
1272 unsigned long addr, fixup;
1273 unsigned int error_code;
1275 addr = read_cr2();
1277 /* fixup_page_fault() might change regs->error_code, so cache it here. */
1278 error_code = regs->error_code;
1280 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1282 perfc_incr(page_faults);
1284 if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1285 return;
1287 if ( unlikely(!guest_mode(regs)) )
1289 if ( spurious_page_fault(addr, error_code) )
1290 return;
1292 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1294 perfc_incr(copy_user_faults);
1295 if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1296 reserved_bit_page_fault(addr, regs);
1297 regs->eip = fixup;
1298 return;
1301 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1303 show_execution_state(regs);
1304 show_page_walk(addr);
1305 panic("FATAL PAGE FAULT\n"
1306 "[error_code=%04x]\n"
1307 "Faulting linear address: %p\n",
1308 error_code, _p(addr));
1311 if ( unlikely(current->domain->arch.suppress_spurious_page_faults
1312 && spurious_page_fault(addr, error_code)) )
1313 return;
1315 propagate_page_fault(addr, regs->error_code);
1318 /*
1319 * Early #PF handler to print CR2, error code, and stack.
1321 * We also deal with spurious faults here, even though they should never happen
1322 * during early boot (an issue was seen once, but was most likely a hardware
1323 * problem).
1324 */
1325 asmlinkage void __init do_early_page_fault(struct cpu_user_regs *regs)
1327 static int stuck;
1328 static unsigned long prev_eip, prev_cr2;
1329 unsigned long cr2 = read_cr2();
1331 BUG_ON(smp_processor_id() != 0);
1333 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1335 prev_eip = regs->eip;
1336 prev_cr2 = cr2;
1337 stuck = 0;
1338 return;
1341 if ( stuck++ == 1000 )
1343 unsigned long *stk = (unsigned long *)regs;
1344 printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1345 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1346 printk("Stack dump: ");
1347 while ( ((long)stk & ((PAGE_SIZE - 1) & ~(BYTES_PER_LONG - 1))) != 0 )
1348 printk("%p ", _p(*stk++));
1349 for ( ; ; ) ;
1353 long do_fpu_taskswitch(int set)
1355 struct vcpu *v = current;
1357 if ( set )
1359 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1360 stts();
1362 else
1364 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1365 if ( v->fpu_dirtied )
1366 clts();
1369 return 0;
1372 static int read_descriptor(unsigned int sel,
1373 const struct vcpu *v,
1374 const struct cpu_user_regs * regs,
1375 unsigned long *base,
1376 unsigned long *limit,
1377 unsigned int *ar,
1378 unsigned int vm86attr)
1380 struct desc_struct desc;
1382 if ( !vm86_mode(regs) )
1384 if ( sel < 4)
1385 desc.b = desc.a = 0;
1386 else if ( __get_user(desc,
1387 (const struct desc_struct *)(!(sel & 4)
1388 ? GDT_VIRT_START(v)
1389 : LDT_VIRT_START(v))
1390 + (sel >> 3)) )
1391 return 0;
1392 if ( !(vm86attr & _SEGMENT_CODE) )
1393 desc.b &= ~_SEGMENT_L;
1395 else
1397 desc.a = (sel << 20) | 0xffff;
1398 desc.b = vm86attr | (sel >> 12);
1401 *ar = desc.b & 0x00f0ff00;
1402 if ( !(desc.b & _SEGMENT_L) )
1404 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1405 (desc.b & 0xff000000));
1406 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1407 if ( desc.b & _SEGMENT_G )
1408 *limit = ((*limit + 1) << 12) - 1;
1409 #ifndef NDEBUG
1410 if ( !vm86_mode(regs) && (sel > 3) )
1412 unsigned int a, l;
1413 unsigned char valid;
1415 asm volatile (
1416 "larl %2,%0 ; setz %1"
1417 : "=r" (a), "=qm" (valid) : "rm" (sel));
1418 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1419 asm volatile (
1420 "lsll %2,%0 ; setz %1"
1421 : "=r" (l), "=qm" (valid) : "rm" (sel));
1422 BUG_ON(valid && (l != *limit));
1424 #endif
1426 else
1428 *base = 0UL;
1429 *limit = ~0UL;
1432 return 1;
1435 #ifdef __x86_64__
1436 static int read_gate_descriptor(unsigned int gate_sel,
1437 const struct vcpu *v,
1438 unsigned int *sel,
1439 unsigned long *off,
1440 unsigned int *ar)
1442 struct desc_struct desc;
1443 const struct desc_struct *pdesc;
1446 pdesc = (const struct desc_struct *)
1447 (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v))
1448 + (gate_sel >> 3);
1449 if ( (gate_sel < 4) ||
1450 ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
1451 __get_user(desc, pdesc) )
1452 return 0;
1454 *sel = (desc.a >> 16) & 0x0000fffc;
1455 *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
1456 *ar = desc.b & 0x0000ffff;
1458 /*
1459 * check_descriptor() clears the DPL field and stores the
1460 * guest requested DPL in the selector's RPL field.
1461 */
1462 if ( *ar & _SEGMENT_DPL )
1463 return 0;
1464 *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
1466 if ( !is_pv_32bit_vcpu(v) )
1468 if ( (*ar & 0x1f00) != 0x0c00 ||
1469 (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
1470 __get_user(desc, pdesc + 1) ||
1471 (desc.b & 0x1f00) )
1472 return 0;
1474 *off |= (unsigned long)desc.a << 32;
1475 return 1;
1478 switch ( *ar & 0x1f00 )
1480 case 0x0400:
1481 *off &= 0xffff;
1482 break;
1483 case 0x0c00:
1484 break;
1485 default:
1486 return 0;
1489 return 1;
1491 #endif
1493 /* Has the guest requested sufficient permission for this I/O access? */
1494 static int guest_io_okay(
1495 unsigned int port, unsigned int bytes,
1496 struct vcpu *v, struct cpu_user_regs *regs)
1498 #if defined(__x86_64__)
1499 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1500 int user_mode = !(v->arch.flags & TF_kernel_mode);
1501 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1502 #elif defined(__i386__)
1503 #define TOGGLE_MODE() ((void)0)
1504 #endif
1506 if ( !vm86_mode(regs) &&
1507 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1508 return 1;
1510 if ( v->arch.iobmp_limit > (port + bytes) )
1512 union { uint8_t bytes[2]; uint16_t mask; } x;
1514 /*
1515 * Grab permission bytes from guest space. Inaccessible bytes are
1516 * read as 0xff (no access allowed).
1517 */
1518 TOGGLE_MODE();
1519 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1520 port>>3, 2) )
1522 default: x.bytes[0] = ~0;
1523 case 1: x.bytes[1] = ~0;
1524 case 0: break;
1526 TOGGLE_MODE();
1528 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1529 return 1;
1532 return 0;
1535 /* Has the administrator granted sufficient permission for this I/O access? */
1536 static int admin_io_okay(
1537 unsigned int port, unsigned int bytes,
1538 struct vcpu *v, struct cpu_user_regs *regs)
1540 /*
1541 * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
1542 * We never permit direct access to that register.
1543 */
1544 if ( (port == 0xcf8) && (bytes == 4) )
1545 return 0;
1547 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1550 static uint32_t guest_io_read(
1551 unsigned int port, unsigned int bytes,
1552 struct vcpu *v, struct cpu_user_regs *regs)
1554 uint32_t data = 0;
1555 unsigned int shift = 0;
1557 if ( admin_io_okay(port, bytes, v, regs) )
1559 switch ( bytes )
1561 case 1: return inb(port);
1562 case 2: return inw(port);
1563 case 4: return inl(port);
1567 while ( bytes != 0 )
1569 unsigned int size = 1;
1570 uint32_t sub_data = 0xff;
1572 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1574 sub_data = pv_pit_handler(port, 0, 0);
1576 else if ( (port == 0xcf8) && (bytes == 4) )
1578 size = 4;
1579 sub_data = v->domain->arch.pci_cf8;
1581 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1583 size = min(bytes, 4 - (port & 3));
1584 if ( size == 3 )
1585 size = 2;
1586 sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size);
1589 if ( size == 4 )
1590 return sub_data;
1592 data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
1593 shift += size * 8;
1594 port += size;
1595 bytes -= size;
1598 return data;
1601 extern void (*pv_rtc_handler)(unsigned int port, uint8_t value);
1603 static void guest_io_write(
1604 unsigned int port, unsigned int bytes, uint32_t data,
1605 struct vcpu *v, struct cpu_user_regs *regs)
1607 if ( admin_io_okay(port, bytes, v, regs) )
1609 switch ( bytes ) {
1610 case 1:
1611 if ( ((port == 0x70) || (port == 0x71)) && pv_rtc_handler )
1612 pv_rtc_handler(port, (uint8_t)data);
1613 outb((uint8_t)data, port);
1614 if ( pv_post_outb_hook )
1615 pv_post_outb_hook(port, (uint8_t)data);
1616 break;
1617 case 2:
1618 outw((uint16_t)data, port);
1619 break;
1620 case 4:
1621 outl(data, port);
1622 break;
1624 return;
1627 while ( bytes != 0 )
1629 unsigned int size = 1;
1631 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1633 pv_pit_handler(port, (uint8_t)data, 1);
1635 else if ( (port == 0xcf8) && (bytes == 4) )
1637 size = 4;
1638 v->domain->arch.pci_cf8 = data;
1640 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1642 size = min(bytes, 4 - (port & 3));
1643 if ( size == 3 )
1644 size = 2;
1645 pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data);
1648 if ( size == 4 )
1649 return;
1651 port += size;
1652 bytes -= size;
1653 data >>= size * 8;
1657 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1658 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1659 __attribute__((__regparm__(1)));
1660 unsigned long guest_to_host_gpr_switch(unsigned long)
1661 __attribute__((__regparm__(1)));
1663 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1665 static inline uint64_t guest_misc_enable(uint64_t val)
1667 val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
1668 MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
1669 val |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
1670 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
1671 MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
1672 return val;
1675 /* Instruction fetch with error handling. */
1676 #define insn_fetch(type, base, eip, limit) \
1677 ({ unsigned long _rc, _ptr = (base) + (eip); \
1678 type _x; \
1679 if ( ad_default < 8 ) \
1680 _ptr = (unsigned int)_ptr; \
1681 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1682 goto fail; \
1683 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1684 { \
1685 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1686 goto skip; \
1687 } \
1688 (eip) += sizeof(_x); _x; })
1690 #if defined(CONFIG_X86_32)
1691 # define read_sreg(regs, sr) ((regs)->sr)
1692 #elif defined(CONFIG_X86_64)
1693 # define read_sreg(regs, sr) read_segment_register(sr)
1694 #endif
1696 static int is_cpufreq_controller(struct domain *d)
1698 return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
1699 (d->domain_id == 0));
1702 #ifdef CONFIG_X86_64
1703 #include "x86_64/mmconfig.h"
1704 #endif
1706 static int emulate_privileged_op(struct cpu_user_regs *regs)
1708 struct vcpu *v = current;
1709 unsigned long *reg, eip = regs->eip;
1710 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1711 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1712 int rc;
1713 unsigned int port, i, data_sel, ar, data, bpmatch = 0;
1714 unsigned int op_bytes, op_default, ad_bytes, ad_default, opsize_prefix= 0;
1715 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1716 ? regs->reg \
1717 : ad_bytes == 4 \
1718 ? (u32)regs->reg \
1719 : (u16)regs->reg)
1720 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1721 ? regs->reg = (val) \
1722 : ad_bytes == 4 \
1723 ? (*(u32 *)&regs->reg = (val)) \
1724 : (*(u16 *)&regs->reg = (val)))
1725 unsigned long code_base, code_limit;
1726 char io_emul_stub[32];
1727 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1728 uint64_t val, msr_content;
1730 if ( !read_descriptor(regs->cs, v, regs,
1731 &code_base, &code_limit, &ar,
1732 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1733 goto fail;
1734 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1735 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1736 if ( !(ar & _SEGMENT_S) ||
1737 !(ar & _SEGMENT_P) ||
1738 !(ar & _SEGMENT_CODE) )
1739 goto fail;
1741 /* emulating only opcodes not allowing SS to be default */
1742 data_sel = read_sreg(regs, ds);
1744 /* Legacy prefixes. */
1745 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1747 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1749 case 0x66: /* operand-size override */
1750 opsize_prefix = 1;
1751 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1752 continue;
1753 case 0x67: /* address-size override */
1754 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1755 continue;
1756 case 0x2e: /* CS override */
1757 data_sel = regs->cs;
1758 continue;
1759 case 0x3e: /* DS override */
1760 data_sel = read_sreg(regs, ds);
1761 continue;
1762 case 0x26: /* ES override */
1763 data_sel = read_sreg(regs, es);
1764 continue;
1765 case 0x64: /* FS override */
1766 data_sel = read_sreg(regs, fs);
1767 lm_ovr = lm_seg_fs;
1768 continue;
1769 case 0x65: /* GS override */
1770 data_sel = read_sreg(regs, gs);
1771 lm_ovr = lm_seg_gs;
1772 continue;
1773 case 0x36: /* SS override */
1774 data_sel = regs->ss;
1775 continue;
1776 case 0xf0: /* LOCK */
1777 lock = 1;
1778 continue;
1779 case 0xf2: /* REPNE/REPNZ */
1780 case 0xf3: /* REP/REPE/REPZ */
1781 rep_prefix = 1;
1782 continue;
1783 default:
1784 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1786 rex = opcode;
1787 continue;
1789 break;
1791 break;
1794 /* REX prefix. */
1795 if ( rex & 8 ) /* REX.W */
1796 op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
1797 modrm_reg = (rex & 4) << 1; /* REX.R */
1798 /* REX.X does not need to be decoded. */
1799 modrm_rm = (rex & 1) << 3; /* REX.B */
1801 if ( opcode == 0x0f )
1802 goto twobyte_opcode;
1804 if ( lock )
1805 goto fail;
1807 /* Input/Output String instructions. */
1808 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1810 unsigned long data_base, data_limit;
1812 if ( rep_prefix && (rd_ad(ecx) == 0) )
1813 goto done;
1815 if ( !(opcode & 2) )
1817 data_sel = read_sreg(regs, es);
1818 lm_ovr = lm_seg_none;
1821 if ( !(ar & _SEGMENT_L) )
1823 if ( !read_descriptor(data_sel, v, regs,
1824 &data_base, &data_limit, &ar,
1825 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|
1826 _SEGMENT_P) )
1827 goto fail;
1828 if ( !(ar & _SEGMENT_S) ||
1829 !(ar & _SEGMENT_P) ||
1830 (opcode & 2 ?
1831 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1832 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1833 goto fail;
1835 #ifdef CONFIG_X86_64
1836 else
1838 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1840 switch ( lm_ovr )
1842 case lm_seg_none:
1843 data_base = 0UL;
1844 break;
1845 case lm_seg_fs:
1846 data_base = v->arch.guest_context.fs_base;
1847 break;
1848 case lm_seg_gs:
1849 if ( guest_kernel_mode(v, regs) )
1850 data_base = v->arch.guest_context.gs_base_kernel;
1851 else
1852 data_base = v->arch.guest_context.gs_base_user;
1853 break;
1856 else
1857 read_descriptor(data_sel, v, regs,
1858 &data_base, &data_limit, &ar,
1859 0);
1860 data_limit = ~0UL;
1861 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1863 #endif
1865 port = (u16)regs->edx;
1867 continue_io_string:
1868 switch ( opcode )
1870 case 0x6c: /* INSB */
1871 op_bytes = 1;
1872 case 0x6d: /* INSW/INSL */
1873 if ( (data_limit < (op_bytes - 1)) ||
1874 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
1875 !guest_io_okay(port, op_bytes, v, regs) )
1876 goto fail;
1877 data = guest_io_read(port, op_bytes, v, regs);
1878 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
1879 &data, op_bytes)) != 0 )
1881 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1882 PFEC_write_access);
1883 return EXCRET_fault_fixed;
1885 wr_ad(edi, regs->edi + (int)((regs->eflags & X86_EFLAGS_DF)
1886 ? -op_bytes : op_bytes));
1887 break;
1889 case 0x6e: /* OUTSB */
1890 op_bytes = 1;
1891 case 0x6f: /* OUTSW/OUTSL */
1892 if ( (data_limit < (op_bytes - 1)) ||
1893 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
1894 !guest_io_okay(port, op_bytes, v, regs) )
1895 goto fail;
1896 if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
1897 op_bytes)) != 0 )
1899 propagate_page_fault(data_base + rd_ad(esi)
1900 + op_bytes - rc, 0);
1901 return EXCRET_fault_fixed;
1903 guest_io_write(port, op_bytes, data, v, regs);
1904 wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF)
1905 ? -op_bytes : op_bytes));
1906 break;
1909 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1911 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1913 if ( !bpmatch && !hypercall_preempt_check() )
1914 goto continue_io_string;
1915 eip = regs->eip;
1918 goto done;
1921 /*
1922 * Very likely to be an I/O instruction (IN/OUT).
1923 * Build an on-stack stub to execute the instruction with full guest
1924 * GPR context. This is needed for some systems which (ab)use IN/OUT
1925 * to communicate with BIOS code in system-management mode.
1926 */
1927 #ifdef __x86_64__
1928 /* movq $host_to_guest_gpr_switch,%rcx */
1929 io_emul_stub[0] = 0x48;
1930 io_emul_stub[1] = 0xb9;
1931 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1932 /* callq *%rcx */
1933 io_emul_stub[10] = 0xff;
1934 io_emul_stub[11] = 0xd1;
1935 #else
1936 /* call host_to_guest_gpr_switch */
1937 io_emul_stub[0] = 0xe8;
1938 *(s32 *)&io_emul_stub[1] =
1939 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1940 /* 7 x nop */
1941 memset(&io_emul_stub[5], 0x90, 7);
1942 #endif
1943 /* data16 or nop */
1944 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1945 /* <io-access opcode> */
1946 io_emul_stub[13] = opcode;
1947 /* imm8 or nop */
1948 io_emul_stub[14] = 0x90;
1949 /* ret (jumps to guest_to_host_gpr_switch) */
1950 io_emul_stub[15] = 0xc3;
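/*
 * Resulting stub layout (x86-64 build shown; illustrative only):
 *   bytes 0-9  : movabs $host_to_guest_gpr_switch, %rcx
 *   bytes 10-11: call *%rcx
 *   byte  12   : 0x66 operand-size prefix for 16-bit accesses, else NOP
 *   byte  13   : the IN/OUT opcode being emulated
 *   byte  14   : imm8 port for the immediate forms, else NOP
 *   byte  15   : ret, returning via guest_to_host_gpr_switch
 */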
1952 /* Handy function-typed pointer to the stub. */
1953 io_emul = (void *)io_emul_stub;
1955 if ( ioemul_handle_quirk )
1956 ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
1958 /* I/O Port and Interrupt Flag instructions. */
1959 switch ( opcode )
1961 case 0xe4: /* IN imm8,%al */
1962 op_bytes = 1;
1963 case 0xe5: /* IN imm8,%eax */
1964 port = insn_fetch(u8, code_base, eip, code_limit);
1965 io_emul_stub[14] = port; /* imm8 */
1966 exec_in:
1967 if ( !guest_io_okay(port, op_bytes, v, regs) )
1968 goto fail;
1969 if ( admin_io_okay(port, op_bytes, v, regs) )
1971 io_emul(regs);
1973 else
1975 if ( op_bytes == 4 )
1976 regs->eax = 0;
1977 else
1978 regs->eax &= ~((1u << (op_bytes * 8)) - 1);
1979 regs->eax |= guest_io_read(port, op_bytes, v, regs);
1981 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1982 goto done;
1984 case 0xec: /* IN %dx,%al */
1985 op_bytes = 1;
1986 case 0xed: /* IN %dx,%eax */
1987 port = (u16)regs->edx;
1988 goto exec_in;
1990 case 0xe6: /* OUT %al,imm8 */
1991 op_bytes = 1;
1992 case 0xe7: /* OUT %eax,imm8 */
1993 port = insn_fetch(u8, code_base, eip, code_limit);
1994 io_emul_stub[14] = port; /* imm8 */
1995 exec_out:
1996 if ( !guest_io_okay(port, op_bytes, v, regs) )
1997 goto fail;
1998 if ( admin_io_okay(port, op_bytes, v, regs) )
2000 if ( (op_bytes == 1) &&
2001 ((port == 0x71) || (port == 0x70)) &&
2002 pv_rtc_handler )
2003 pv_rtc_handler(port, regs->eax);
2004 io_emul(regs);
2005 if ( (op_bytes == 1) && pv_post_outb_hook )
2006 pv_post_outb_hook(port, regs->eax);
2008 else
2010 guest_io_write(port, op_bytes, regs->eax, v, regs);
2012 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
2013 goto done;
2015 case 0xee: /* OUT %al,%dx */
2016 op_bytes = 1;
2017 case 0xef: /* OUT %eax,%dx */
2018 port = (u16)regs->edx;
2019 goto exec_out;
2021 case 0xfa: /* CLI */
2022 case 0xfb: /* STI */
2023 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
2024 goto fail;
2025 /*
2026 * This is just too dangerous to allow, in my opinion. Consider if the
2027 * caller then tries to reenable interrupts using POPF: we can't trap
2028 * that and we'll end up with hard-to-debug lockups. Fast & loose will
2029 * do for us. :-)
2030 */
2031 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
2032 goto done;
2035 /* No decode of this single-byte opcode. */
2036 goto fail;
2038 twobyte_opcode:
2039 /*
2040 * All 2 and 3 byte opcodes, except RDTSC (0x31) and RDTSCP (0x1,0xF9)
2041 * are executable only from guest kernel mode (virtual ring 0).
2042 */
2043 opcode = insn_fetch(u8, code_base, eip, code_limit);
2044 if ( !guest_kernel_mode(v, regs) && (opcode != 0x1) && (opcode != 0x31) )
2045 goto fail;
2047 if ( lock && (opcode & ~3) != 0x20 )
2048 goto fail;
2049 switch ( opcode )
2051 case 0x1: /* RDTSCP and XSETBV */
2052 switch ( insn_fetch(u8, code_base, eip, code_limit) )
2054 case 0xf9: /* RDTSCP */
2055 if ( (v->arch.guest_context.ctrlreg[4] & X86_CR4_TSD) &&
2056 !guest_kernel_mode(v, regs) )
2057 goto fail;
2058 pv_soft_rdtsc(v, regs, 1);
2059 break;
2060 case 0xd1: /* XSETBV */
2062 u64 new_xfeature = (u32)regs->eax | ((u64)regs->edx << 32);
2064 if ( lock || rep_prefix || opsize_prefix
2065 || !(v->arch.guest_context.ctrlreg[4] & X86_CR4_OSXSAVE) )
2067 do_guest_trap(TRAP_invalid_op, regs, 0);
2068 goto skip;
2071 if ( !guest_kernel_mode(v, regs) )
2072 goto fail;
2074 switch ( (u32)regs->ecx )
2076 case XCR_XFEATURE_ENABLED_MASK:
2077 /* bit 0 of XCR0 must be set and reserved bit must not be set */
2078 if ( !(new_xfeature & XSTATE_FP) || (new_xfeature & ~xfeature_mask) )
2079 goto fail;
2081 v->arch.xcr0 = new_xfeature;
2082 v->arch.xcr0_accum |= new_xfeature;
2083 set_xcr0(new_xfeature);
2084 break;
2085 default:
2086 goto fail;
2088 break;
2090 default:
2091 goto fail;
2093 break;
2095 case 0x06: /* CLTS */
2096 (void)do_fpu_taskswitch(0);
2097 break;
2099 case 0x09: /* WBINVD */
2100 /* Ignore the instruction if unprivileged. */
2101 if ( !cache_flush_permitted(v->domain) )
2102 /* Non-physdev domain attempted WBINVD; ignore for now since
2103 newer linux uses this in some start-of-day timing loops */
2105 else
2106 wbinvd();
2107 break;
2109 case 0x20: /* MOV CR?,<reg> */
2110 opcode = insn_fetch(u8, code_base, eip, code_limit);
2111 if ( opcode < 0xc0 )
2112 goto fail;
2113 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2114 modrm_rm |= (opcode >> 0) & 7;
2115 reg = decode_register(modrm_rm, regs, 0);
2116 switch ( modrm_reg )
2118 case 0: /* Read CR0 */
2119 *reg = (read_cr0() & ~X86_CR0_TS) |
2120 v->arch.guest_context.ctrlreg[0];
2121 break;
2123 case 2: /* Read CR2 */
2124 *reg = v->arch.guest_context.ctrlreg[2];
2125 break;
2127 case 3: /* Read CR3 */
2129 unsigned long mfn;
2131 if ( !is_pv_32on64_vcpu(v) )
2133 mfn = pagetable_get_pfn(v->arch.guest_table);
2134 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
2135 v->domain, mfn));
2137 #ifdef CONFIG_COMPAT
2138 else
2140 mfn = l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)));
2141 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
2142 v->domain, mfn));
2144 #endif
2145 /* PTs should not be shared */
2146 BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
2148 break;
2150 case 4: /* Read CR4 */
2151 *reg = v->arch.guest_context.ctrlreg[4];
2152 break;
2154 default:
2155 goto fail;
2157 break;
2159 case 0x21: /* MOV DR?,<reg> */ {
2160 unsigned long res;
2161 opcode = insn_fetch(u8, code_base, eip, code_limit);
2162 if ( opcode < 0xc0 )
2163 goto fail;
2164 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2165 modrm_rm |= (opcode >> 0) & 7;
2166 reg = decode_register(modrm_rm, regs, 0);
2167 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
2168 goto fail;
2169 *reg = res;
2170 break;
2173 case 0x22: /* MOV <reg>,CR? */
2174 opcode = insn_fetch(u8, code_base, eip, code_limit);
2175 if ( opcode < 0xc0 )
2176 goto fail;
2177 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2178 modrm_rm |= (opcode >> 0) & 7;
2179 reg = decode_register(modrm_rm, regs, 0);
2180 switch ( modrm_reg )
2182 case 0: /* Write CR0 */
2183 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
2185 gdprintk(XENLOG_WARNING,
2186 "Attempt to change unmodifiable CR0 flags.\n");
2187 goto fail;
2189 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
2190 break;
2192 case 2: /* Write CR2 */
2193 v->arch.guest_context.ctrlreg[2] = *reg;
2194 arch_set_cr2(v, *reg);
2195 break;
2197 case 3: /* Write CR3 */
2198 domain_lock(v->domain);
2199 if ( !is_pv_32on64_vcpu(v) )
2200 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
2201 #ifdef CONFIG_COMPAT
2202 else
2203 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
2204 #endif
2205 domain_unlock(v->domain);
2206 if ( rc == 0 ) /* not okay */
2207 goto fail;
2208 break;
2210 case 4: /* Write CR4 */
2211 v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(v, *reg);
2212 write_cr4(pv_guest_cr4_to_real_cr4(v));
2213 break;
2215 default:
2216 goto fail;
2218 break;
2220 case 0x23: /* MOV <reg>,DR? */
2221 opcode = insn_fetch(u8, code_base, eip, code_limit);
2222 if ( opcode < 0xc0 )
2223 goto fail;
2224 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2225 modrm_rm |= (opcode >> 0) & 7;
2226 reg = decode_register(modrm_rm, regs, 0);
2227 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
2228 goto fail;
2229 break;
2231 case 0x30: /* WRMSR */ {
2232 uint32_t eax = regs->eax;
2233 uint32_t edx = regs->edx;
2234 msr_content = ((uint64_t)edx << 32) | eax;
2235 switch ( (u32)regs->ecx )
2237 #ifdef CONFIG_X86_64
2238 case MSR_FS_BASE:
2239 if ( is_pv_32on64_vcpu(v) )
2240 goto fail;
2241 if ( wrmsr_safe(MSR_FS_BASE, msr_content) )
2242 goto fail;
2243 v->arch.guest_context.fs_base = msr_content;
2244 break;
2245 case MSR_GS_BASE:
2246 if ( is_pv_32on64_vcpu(v) )
2247 goto fail;
2248 if ( wrmsr_safe(MSR_GS_BASE, msr_content) )
2249 goto fail;
2250 v->arch.guest_context.gs_base_kernel = msr_content;
2251 break;
2252 case MSR_SHADOW_GS_BASE:
2253 if ( is_pv_32on64_vcpu(v) )
2254 goto fail;
2255 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, msr_content) )
2256 goto fail;
2257 v->arch.guest_context.gs_base_user = msr_content;
2258 break;
2259 #endif
2260 case MSR_K7_FID_VID_STATUS:
2261 case MSR_K7_FID_VID_CTL:
2262 case MSR_K8_PSTATE_LIMIT:
2263 case MSR_K8_PSTATE_CTRL:
2264 case MSR_K8_PSTATE_STATUS:
2265 case MSR_K8_PSTATE0:
2266 case MSR_K8_PSTATE1:
2267 case MSR_K8_PSTATE2:
2268 case MSR_K8_PSTATE3:
2269 case MSR_K8_PSTATE4:
2270 case MSR_K8_PSTATE5:
2271 case MSR_K8_PSTATE6:
2272 case MSR_K8_PSTATE7:
2273 case MSR_K8_HWCR:
2274 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2275 goto fail;
2276 if ( !is_cpufreq_controller(v->domain) )
2277 break;
2278 if ( wrmsr_safe(regs->ecx, msr_content) != 0 )
2279 goto fail;
2280 break;
2281 case MSR_AMD64_NB_CFG:
2282 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2283 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
2284 goto fail;
2285 if ( !IS_PRIV(v->domain) )
2286 break;
2287 if ( (rdmsr_safe(MSR_AMD64_NB_CFG, val) != 0) ||
2288 (eax != (uint32_t)val) ||
2289 ((edx ^ (val >> 32)) & ~(1 << (AMD64_NB_CFG_CF8_EXT_ENABLE_BIT - 32))) )
2290 goto invalid;
2291 if ( wrmsr_safe(MSR_AMD64_NB_CFG, msr_content) != 0 )
2292 goto fail;
2293 break;
2294 case MSR_FAM10H_MMIO_CONF_BASE:
2295 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2296 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
2297 goto fail;
2298 if ( !IS_PRIV(v->domain) )
2299 break;
2300 if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) != 0) )
2301 goto fail;
2302 if (
2303 #ifdef CONFIG_X86_64
2304 (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ?
2305 val != msr_content :
2306 #endif
2307 ((val ^ msr_content) &
2308 ~( FAM10H_MMIO_CONF_ENABLE |
2309 (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
2310 FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
2311 ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
2312 FAM10H_MMIO_CONF_BASE_SHIFT))) )
2313 goto invalid;
2314 if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, msr_content) != 0 )
2315 goto fail;
2316 break;
2317 case MSR_IA32_UCODE_REV:
2318 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2319 goto fail;
2320 if ( rdmsr_safe(regs->ecx, val) )
2321 goto fail;
2322 if ( msr_content )
2323 goto invalid;
2324 break;
2325 case MSR_IA32_MISC_ENABLE:
2326 if ( rdmsr_safe(regs->ecx, val) )
2327 goto invalid;
2328 val = guest_misc_enable(val);
2329 if ( msr_content != val )
2330 goto invalid;
2331 break;
2332 case MSR_IA32_MPERF:
2333 case MSR_IA32_APERF:
2334 if (( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) &&
2335 ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) )
2336 goto fail;
2337 if ( !is_cpufreq_controller(v->domain) )
2338 break;
2339 if ( wrmsr_safe(regs->ecx, msr_content ) != 0 )
2340 goto fail;
2341 break;
2342 case MSR_IA32_PERF_CTL:
2343 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2344 goto fail;
2345 if ( !is_cpufreq_controller(v->domain) )
2346 break;
2347 if ( wrmsr_safe(regs->ecx, msr_content) != 0 )
2348 goto fail;
2349 break;
2350 case MSR_IA32_THERM_CONTROL:
2351 case MSR_IA32_ENERGY_PERF_BIAS:
2352 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2353 goto fail;
2354 if ( (v->domain->domain_id != 0) || !is_pinned_vcpu(v) )
2355 break;
2356 if ( wrmsr_safe(regs->ecx, msr_content) != 0 )
2357 goto fail;
2358 break;
2359 default:
2360 if ( wrmsr_hypervisor_regs(regs->ecx, msr_content) )
2361 break;
2363 rc = vmce_wrmsr(regs->ecx, msr_content);
2364 if ( rc < 0 )
2365 goto fail;
2366 if ( rc )
2367 break;
2369 if ( (rdmsr_safe(regs->ecx, val) != 0) || (msr_content != val) )
2370 invalid:
2371 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
2372 "0x%016"PRIx64" to 0x%016"PRIx64".\n",
2373 _p(regs->ecx), val, msr_content);
2374 break;
2376 break;
2379 case 0x31: /* RDTSC */
2380 if ( (v->arch.guest_context.ctrlreg[4] & X86_CR4_TSD) &&
2381 !guest_kernel_mode(v, regs) )
2382 goto fail;
2383 if ( v->domain->arch.vtsc )
2384 pv_soft_rdtsc(v, regs, 0);
2385 else
2386 rdtsc(regs->eax, regs->edx);
2387 break;
2389 case 0x32: /* RDMSR */
2390 switch ( (u32)regs->ecx )
2392 #ifdef CONFIG_X86_64
2393 case MSR_FS_BASE:
2394 if ( is_pv_32on64_vcpu(v) )
2395 goto fail;
2396 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
2397 regs->edx = v->arch.guest_context.fs_base >> 32;
2398 break;
2399 case MSR_GS_BASE:
2400 if ( is_pv_32on64_vcpu(v) )
2401 goto fail;
2402 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
2403 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
2404 break;
2405 case MSR_SHADOW_GS_BASE:
2406 if ( is_pv_32on64_vcpu(v) )
2407 goto fail;
2408 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
2409 regs->edx = v->arch.guest_context.gs_base_user >> 32;
2410 break;
2411 #endif
2412 case MSR_K7_FID_VID_CTL:
2413 case MSR_K7_FID_VID_STATUS:
2414 case MSR_K8_PSTATE_LIMIT:
2415 case MSR_K8_PSTATE_CTRL:
2416 case MSR_K8_PSTATE_STATUS:
2417 case MSR_K8_PSTATE0:
2418 case MSR_K8_PSTATE1:
2419 case MSR_K8_PSTATE2:
2420 case MSR_K8_PSTATE3:
2421 case MSR_K8_PSTATE4:
2422 case MSR_K8_PSTATE5:
2423 case MSR_K8_PSTATE6:
2424 case MSR_K8_PSTATE7:
2425 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2426 goto fail;
2427 if ( !is_cpufreq_controller(v->domain) )
2429 regs->eax = regs->edx = 0;
2430 break;
2432 goto rdmsr_normal;
2433 case MSR_IA32_UCODE_REV:
2434 BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL);
2435 if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
2437 if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) )
2438 goto fail;
2439 sync_core();
2441 goto rdmsr_normal;
2442 case MSR_IA32_MISC_ENABLE:
2443 if ( rdmsr_safe(regs->ecx, msr_content) )
2444 goto fail;
2445 msr_content = guest_misc_enable(msr_content);
2446 regs->eax = (uint32_t)msr_content;
2447 regs->edx = (uint32_t)(msr_content >> 32);
2448 break;
2449 default:
2450 if ( rdmsr_hypervisor_regs(regs->ecx, &val) )
2452 rdmsr_writeback:
2453 regs->eax = (uint32_t)val;
2454 regs->edx = (uint32_t)(val >> 32);
2455 break;
2458 rc = vmce_rdmsr(regs->ecx, &val);
2459 if ( rc < 0 )
2460 goto fail;
2461 if ( rc )
2462 goto rdmsr_writeback;
2464 case MSR_EFER:
2465 rdmsr_normal:
2466 /* Everyone can read the MSR space. */
2467 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
2468 _p(regs->ecx));*/
2469 if ( rdmsr_safe(regs->ecx, msr_content) )
2470 goto fail;
2471 regs->eax = (uint32_t)msr_content;
2472 regs->edx = (uint32_t)(msr_content >> 32);
2473 break;
2475 break;
2477 default:
2478 goto fail;
2481 #undef wr_ad
2482 #undef rd_ad
2484 done:
2485 instruction_done(regs, eip, bpmatch);
2486 skip:
2487 return EXCRET_fault_fixed;
2489 fail:
2490 return 0;
2493 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
2494 unsigned int esp, unsigned int decr)
2496 return (((esp - decr) < (esp - 1)) &&
2497 (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
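/*
 * check_stack_limit() returns nonzero if pushing 'decr' bytes at 'esp'
 * stays within the stack segment: the first clause guards against decr == 0
 * and unsigned wrap-around, and the second applies the limit in the right
 * direction -- for a normal (expand-up) segment the highest byte touched,
 * esp - 1, must not exceed 'limit', while for an expand-down segment
 * (_SEGMENT_EC) the lowest byte touched, esp - decr, must lie strictly
 * above 'limit'.
 */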
2500 static void emulate_gate_op(struct cpu_user_regs *regs)
2502 #ifdef __x86_64__
2503 struct vcpu *v = current;
2504 unsigned int sel, ar, dpl, nparm, opnd_sel;
2505 unsigned int op_default, op_bytes, ad_default, ad_bytes;
2506 unsigned long off, eip, opnd_off, base, limit;
2507 int jump;
2509 /* Check whether this fault is due to the use of a call gate. */
2510 if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
2511 (((ar >> 13) & 3) < (regs->cs & 3)) ||
2512 ((ar & _SEGMENT_TYPE) != 0xc00) )
2514 do_guest_trap(TRAP_gp_fault, regs, 1);
2515 return;
2517 if ( !(ar & _SEGMENT_P) )
2519 do_guest_trap(TRAP_no_segment, regs, 1);
2520 return;
2522 dpl = (ar >> 13) & 3;
2523 nparm = ar & 0x1f;
2525 /*
2526 * Decode instruction (and perhaps operand) to determine RPL,
2527 * whether this is a jump or a call, and the call return offset.
2528 */
2529 if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
2530 !(ar & _SEGMENT_S) ||
2531 !(ar & _SEGMENT_P) ||
2532 !(ar & _SEGMENT_CODE) )
2534 do_guest_trap(TRAP_gp_fault, regs, 1);
2535 return;
2538 op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
2539 ad_default = ad_bytes = op_default;
2540 opnd_sel = opnd_off = 0;
2541 jump = -1;
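/*
 * 'jump' records what kind of far transfer gets decoded below: it stays -1
 * if no far JMP/CALL is found (in which case we raise #GP), ends at 0 for a
 * far CALL (0x9a, or the 0xff /3 forms), and ends positive for a far JMP
 * (0xea, or the 0xff /5 forms).  Only the CALL case pushes a return frame
 * further down.
 */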
2542 for ( eip = regs->eip; eip - regs->_eip < 10; )
2544 switch ( insn_fetch(u8, base, eip, limit) )
2546 case 0x66: /* operand-size override */
2547 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
2548 continue;
2549 case 0x67: /* address-size override */
2550 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
2551 continue;
2552 case 0x2e: /* CS override */
2553 opnd_sel = regs->cs;
2554 ASSERT(opnd_sel);
2555 continue;
2556 case 0x3e: /* DS override */
2557 opnd_sel = read_sreg(regs, ds);
2558 if ( !opnd_sel )
2559 opnd_sel = dpl;
2560 continue;
2561 case 0x26: /* ES override */
2562 opnd_sel = read_sreg(regs, es);
2563 if ( !opnd_sel )
2564 opnd_sel = dpl;
2565 continue;
2566 case 0x64: /* FS override */
2567 opnd_sel = read_sreg(regs, fs);
2568 if ( !opnd_sel )
2569 opnd_sel = dpl;
2570 continue;
2571 case 0x65: /* GS override */
2572 opnd_sel = read_sreg(regs, gs);
2573 if ( !opnd_sel )
2574 opnd_sel = dpl;
2575 continue;
2576 case 0x36: /* SS override */
2577 opnd_sel = regs->ss;
2578 if ( !opnd_sel )
2579 opnd_sel = dpl;
2580 continue;
2581 case 0xea:
2582 ++jump;
2583 /* FALLTHROUGH */
2584 case 0x9a:
2585 ++jump;
2586 opnd_sel = regs->cs;
2587 opnd_off = eip;
2588 ad_bytes = ad_default;
2589 eip += op_bytes + 2;
2590 break;
2591 case 0xff:
2593 unsigned int modrm;
2595 switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
2597 case 0x28: case 0x68: case 0xa8:
2598 ++jump;
2599 /* FALLTHROUGH */
2600 case 0x18: case 0x58: case 0x98:
2601 ++jump;
2602 if ( ad_bytes != 2 )
2604 if ( (modrm & 7) == 4 )
2606 unsigned int sib;
2607 sib = insn_fetch(u8, base, eip, limit);
2609 modrm = (modrm & ~7) | (sib & 7);
2610 if ( (sib >>= 3) != 4 )
2611 opnd_off = *(unsigned long *)
2612 decode_register(sib & 7, regs, 0);
2613 opnd_off <<= sib >> 3;
2615 if ( (modrm & 7) != 5 || (modrm & 0xc0) )
2616 opnd_off += *(unsigned long *)
2617 decode_register(modrm & 7, regs, 0);
2618 else
2619 modrm |= 0x87;
2620 if ( !opnd_sel )
2622 switch ( modrm & 7 )
2624 default:
2625 opnd_sel = read_sreg(regs, ds);
2626 break;
2627 case 4: case 5:
2628 opnd_sel = regs->ss;
2629 break;
2633 else
2635 switch ( modrm & 7 )
2637 case 0: case 1: case 7:
2638 opnd_off = regs->ebx;
2639 break;
2640 case 6:
2641 if ( !(modrm & 0xc0) )
2642 modrm |= 0x80;
2643 else
2644 case 2: case 3:
2646 opnd_off = regs->ebp;
2647 if ( !opnd_sel )
2648 opnd_sel = regs->ss;
2650 break;
2652 if ( !opnd_sel )
2653 opnd_sel = read_sreg(regs, ds);
2654 switch ( modrm & 7 )
2656 case 0: case 2: case 4:
2657 opnd_off += regs->esi;
2658 break;
2659 case 1: case 3: case 5:
2660 opnd_off += regs->edi;
2661 break;
2664 switch ( modrm & 0xc0 )
2666 case 0x40:
2667 opnd_off += insn_fetch(s8, base, eip, limit);
2668 break;
2669 case 0x80:
2670 opnd_off += insn_fetch(s32, base, eip, limit);
2671 break;
2673 if ( ad_bytes == 4 )
2674 opnd_off = (unsigned int)opnd_off;
2675 else if ( ad_bytes == 2 )
2676 opnd_off = (unsigned short)opnd_off;
2677 break;
2680 break;
2682 break;
2685 if ( jump < 0 )
2687 fail:
2688 do_guest_trap(TRAP_gp_fault, regs, 1);
2689 skip:
2690 return;
2693 if ( (opnd_sel != regs->cs &&
2694 !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
2695 !(ar & _SEGMENT_S) ||
2696 !(ar & _SEGMENT_P) ||
2697 ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
2699 do_guest_trap(TRAP_gp_fault, regs, 1);
2700 return;
2703 opnd_off += op_bytes;
2704 #define ad_default ad_bytes
2705 opnd_sel = insn_fetch(u16, base, opnd_off, limit);
2706 #undef ad_default
2707 ASSERT((opnd_sel & ~3) == regs->error_code);
2708 if ( dpl < (opnd_sel & 3) )
2710 do_guest_trap(TRAP_gp_fault, regs, 1);
2711 return;
2714 if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
2715 !(ar & _SEGMENT_S) ||
2716 !(ar & _SEGMENT_CODE) ||
2717 (!jump || (ar & _SEGMENT_EC) ?
2718 ((ar >> 13) & 3) > (regs->cs & 3) :
2719 ((ar >> 13) & 3) != (regs->cs & 3)) )
2721 regs->error_code = sel;
2722 do_guest_trap(TRAP_gp_fault, regs, 1);
2723 return;
2725 if ( !(ar & _SEGMENT_P) )
2727 regs->error_code = sel;
2728 do_guest_trap(TRAP_no_segment, regs, 1);
2729 return;
2731 if ( off > limit )
2733 regs->error_code = 0;
2734 do_guest_trap(TRAP_gp_fault, regs, 1);
2735 return;
2738 if ( !jump )
2740 unsigned int ss, esp, *stkp;
2741 int rc;
2742 #define push(item) do \
2743 { \
2744 --stkp; \
2745 esp -= 4; \
2746 rc = __put_user(item, stkp); \
2747 if ( rc ) \
2748 { \
2749 propagate_page_fault((unsigned long)(stkp + 1) - rc, \
2750 PFEC_write_access); \
2751 return; \
2752 } \
2753 } while ( 0 )
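/*
 * push() emulates a 32-bit push onto the guest stack image: it steps the
 * mapped stack pointer and the emulated esp down by one slot, writes the
 * value with __put_user(), and converts any failure into a guest-visible
 * write page fault via propagate_page_fault() instead of faulting in Xen.
 */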
2755 if ( ((ar >> 13) & 3) < (regs->cs & 3) )
2757 sel |= (ar >> 13) & 3;
2758 /* Inner stack known only for kernel ring. */
2759 if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
2761 do_guest_trap(TRAP_gp_fault, regs, 1);
2762 return;
2764 esp = v->arch.guest_context.kernel_sp;
2765 ss = v->arch.guest_context.kernel_ss;
2766 if ( (ss & 3) != (sel & 3) ||
2767 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2768 ((ar >> 13) & 3) != (sel & 3) ||
2769 !(ar & _SEGMENT_S) ||
2770 (ar & _SEGMENT_CODE) ||
2771 !(ar & _SEGMENT_WR) )
2773 regs->error_code = ss & ~3;
2774 do_guest_trap(TRAP_invalid_tss, regs, 1);
2775 return;
2777 if ( !(ar & _SEGMENT_P) ||
2778 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
2780 regs->error_code = ss & ~3;
2781 do_guest_trap(TRAP_stack_error, regs, 1);
2782 return;
2784 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2785 if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
2787 do_guest_trap(TRAP_gp_fault, regs, 1);
2788 return;
2790 push(regs->ss);
2791 push(regs->esp);
2792 if ( nparm )
2794 const unsigned int *ustkp;
2796 if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
2797 ((ar >> 13) & 3) != (regs->cs & 3) ||
2798 !(ar & _SEGMENT_S) ||
2799 (ar & _SEGMENT_CODE) ||
2800 !(ar & _SEGMENT_WR) ||
2801 !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
2802 return do_guest_trap(TRAP_gp_fault, regs, 1);
2803 ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
2804 if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
2806 do_guest_trap(TRAP_gp_fault, regs, 1);
2807 return;
2809 do
2811 unsigned int parm;
2813 --ustkp;
2814 rc = __get_user(parm, ustkp);
2815 if ( rc )
2817 propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
2818 return;
2820 push(parm);
2821 } while ( --nparm );
2824 else
2826 sel |= (regs->cs & 3);
2827 esp = regs->esp;
2828 ss = regs->ss;
2829 if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2830 ((ar >> 13) & 3) != (sel & 3) )
2832 do_guest_trap(TRAP_gp_fault, regs, 1);
2833 return;
2835 if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
2837 regs->error_code = 0;
2838 do_guest_trap(TRAP_stack_error, regs, 1);
2839 return;
2841 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2842 if ( !compat_access_ok(stkp - 2, 2 * 4) )
2844 do_guest_trap(TRAP_gp_fault, regs, 1);
2845 return;
2848 push(regs->cs);
2849 push(eip);
2850 #undef push
2851 regs->esp = esp;
2852 regs->ss = ss;
2854 else
2855 sel |= (regs->cs & 3);
2857 regs->cs = sel;
2858 instruction_done(regs, off, 0);
2859 #endif
2862 asmlinkage void do_general_protection(struct cpu_user_regs *regs)
2864 struct vcpu *v = current;
2865 unsigned long fixup;
2867 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
2869 if ( regs->error_code & 1 )
2870 goto hardware_gp;
2872 if ( !guest_mode(regs) )
2873 goto gp_in_kernel;
2875 /*
2876 * Cunning trick to allow arbitrary "INT n" handling.
2878 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
2879 * instruction from trapping to the appropriate vector, when that might not
2880 * be expected by Xen or the guest OS. For example, that entry might be for
2881 * a fault handler (unlike traps, faults don't increment EIP), or might
2882 * expect an error code on the stack (which a software trap never
2883 * provides), or might be a hardware interrupt handler that doesn't like
2884 * being called spuriously.
2886 * Instead, a GPF occurs with the faulting IDT vector in the error code.
2887 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
2888 * clear to indicate that it's a software fault, not hardware.
2890 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
2891 * okay because they can only be triggered by an explicit DPL-checked
2892 * instruction. The DPL specified by the guest OS for these vectors is NOT
2893 * CHECKED!!
2894 */
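/*
 * For reference, an IDT-delivered #GP carries the vector in error_code bits
 * 15:3, the IDT flag in bit 1 and the external-event flag in bit 0.  A
 * guest executing e.g. INT 0x80 against a DPL-0 vector therefore arrives
 * here with error_code == (0x80 << 3) | 2 == 0x402, which is why the code
 * below tests (error_code & 3) == 2 and recovers the vector with >> 3.
 */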
2895 if ( (regs->error_code & 3) == 2 )
2897 /* This fault must be due to <INT n> instruction. */
2898 const struct trap_info *ti;
2899 unsigned char vector = regs->error_code >> 3;
2900 ti = &v->arch.guest_context.trap_ctxt[vector];
2901 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
2903 regs->eip += 2;
2904 do_guest_trap(vector, regs, 0);
2905 return;
2908 else if ( is_pv_32on64_vcpu(v) && regs->error_code )
2910 emulate_gate_op(regs);
2911 return;
2914 /* Emulate some simple privileged and I/O instructions. */
2915 if ( (regs->error_code == 0) &&
2916 emulate_privileged_op(regs) )
2918 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
2919 return;
2922 #if defined(__i386__)
2923 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
2924 (regs->error_code == 0) &&
2925 gpf_emulate_4gb(regs) )
2927 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
2928 return;
2930 #endif
2932 /* Pass on GPF as is. */
2933 do_guest_trap(TRAP_gp_fault, regs, 1);
2934 return;
2936 gp_in_kernel:
2938 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
2940 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
2941 regs->error_code, _p(regs->eip), _p(fixup));
2942 regs->eip = fixup;
2943 return;
2946 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
2948 hardware_gp:
2949 show_execution_state(regs);
2950 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
2953 static DEFINE_PER_CPU(struct softirq_trap, softirq_trap);
2955 static void nmi_mce_softirq(void)
2957 int cpu = smp_processor_id();
2958 struct softirq_trap *st = &per_cpu(softirq_trap, cpu);
2959 cpumask_t affinity;
2961 BUG_ON(st == NULL);
2962 BUG_ON(st->vcpu == NULL);
2964 /* Set the tmp value unconditionally, so that
2965 * the check in the iret hypercall works. */
2966 st->vcpu->cpu_affinity_tmp = st->vcpu->cpu_affinity;
2968 if ((cpu != st->processor)
2969 || (st->processor != st->vcpu->processor))
2971 /* We are on a different physical cpu.
2972 * Make sure to wake up the vcpu on the
2973 * specified processor.
2974 */
2975 cpus_clear(affinity);
2976 cpu_set(st->processor, affinity);
2977 vcpu_set_affinity(st->vcpu, &affinity);
2979 /* Affinity is restored in the iret hypercall. */
2982 /* Only used to defer wakeup of domain/vcpu to
2983 * a safe (non-NMI/MCE) context.
2984 */
2985 vcpu_kick(st->vcpu);
2986 st->vcpu = NULL;
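/*
 * nmi_mce_softirq() exists because the NMI/MCE handlers themselves must not
 * wake a vcpu.  send_guest_trap() stashes the target in this CPU's
 * softirq_trap slot and raises NMI_MCE_SOFTIRQ; once here, the vcpu's
 * affinity is temporarily pinned to the processor recorded in the slot so
 * it resumes there, and the original affinity is put back on the iret
 * hypercall path (see async_exception_cleanup() below).
 */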
2989 void async_exception_cleanup(struct vcpu *curr)
2991 int trap;
2993 if ( !curr->async_exception_mask )
2994 return;
2996 /* Restore affinity. */
2997 if ( !cpus_empty(curr->cpu_affinity_tmp) &&
2998 !cpus_equal(curr->cpu_affinity_tmp, curr->cpu_affinity) )
3000 vcpu_set_affinity(curr, &curr->cpu_affinity_tmp);
3001 cpus_clear(curr->cpu_affinity_tmp);
3004 if ( !(curr->async_exception_mask & (curr->async_exception_mask - 1)) )
3005 trap = __scanbit(curr->async_exception_mask, VCPU_TRAP_NONE);
3006 else
3007 for ( trap = VCPU_TRAP_NONE + 1; trap <= VCPU_TRAP_LAST; ++trap )
3008 if ( (curr->async_exception_mask ^
3009 curr->async_exception_state(trap).old_mask) == (1 << trap) )
3010 break;
3011 ASSERT(trap <= VCPU_TRAP_LAST);
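/*
 * async_exception_mask & (async_exception_mask - 1) is zero exactly when a
 * single bit is set, so the common case finds the pending trap with one
 * __scanbit.  With several bits set, the trap being unwound is the one
 * whose saved old_mask differs from the current mask by precisely its own
 * bit.
 */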
3013 /* inject vMCE to PV_Guest including DOM0. */
3014 if ( trap == VCPU_TRAP_MCE )
3016 gdprintk(XENLOG_DEBUG, "MCE: Return from vMCE# trap!\n");
3017 if ( curr->vcpu_id == 0 )
3019 struct domain *d = curr->domain;
3021 if ( !d->arch.vmca_msrs->nr_injection )
3023 printk(XENLOG_WARNING "MCE: ret from vMCE#, "
3024 "no injection node\n");
3025 goto end;
3028 d->arch.vmca_msrs->nr_injection--;
3029 if ( !list_empty(&d->arch.vmca_msrs->impact_header) )
3031 struct bank_entry *entry;
3033 entry = list_entry(d->arch.vmca_msrs->impact_header.next,
3034 struct bank_entry, list);
3035 gdprintk(XENLOG_DEBUG, "MCE: delete last injection node\n");
3036 list_del(&entry->list);
3038 else
3039 printk(XENLOG_ERR "MCE: didn't find last injection node\n");
3041 /* further injection */
3042 if ( d->arch.vmca_msrs->nr_injection > 0 &&
3043 guest_has_trap_callback(d, 0, TRAP_machine_check) &&
3044 !test_and_set_bool(curr->mce_pending) )
3046 int cpu = smp_processor_id();
3047 cpumask_t affinity;
3049 curr->cpu_affinity_tmp = curr->cpu_affinity;
3050 cpus_clear(affinity);
3051 cpu_set(cpu, affinity);
3052 printk(XENLOG_DEBUG "MCE: CPU%d set affinity, old %d\n",
3053 cpu, curr->processor);
3054 vcpu_set_affinity(curr, &affinity);
3059 end:
3060 /* Restore previous asynchronous exception mask. */
3061 curr->async_exception_mask = curr->async_exception_state(trap).old_mask;
3064 static void nmi_dom0_report(unsigned int reason_idx)
3066 struct domain *d = dom0;
3068 if ( (d == NULL) || (d->vcpu == NULL) || (d->vcpu[0] == NULL) )
3069 return;
3071 set_bit(reason_idx, nmi_reason(d));
3073 send_guest_trap(d, 0, TRAP_nmi);
3076 static void mem_parity_error(struct cpu_user_regs *regs)
3078 switch ( opt_nmi[0] )
3080 case 'd': /* 'dom0' */
3081 nmi_dom0_report(_XEN_NMIREASON_parity_error);
3082 case 'i': /* 'ignore' */
3083 break;
3084 default: /* 'fatal' */
3085 console_force_unlock();
3086 printk("\n\nNMI - MEMORY ERROR\n");
3087 fatal_trap(TRAP_nmi, regs);
3090 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
3091 mdelay(1);
3092 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
3095 static void io_check_error(struct cpu_user_regs *regs)
3097 switch ( opt_nmi[0] )
3099 case 'd': /* 'dom0' */
3100 nmi_dom0_report(_XEN_NMIREASON_io_error);
3101 case 'i': /* 'ignore' */
3102 break;
3103 default: /* 'fatal' */
3104 console_force_unlock();
3105 printk("\n\nNMI - I/O ERROR\n");
3106 fatal_trap(TRAP_nmi, regs);
3109 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
3110 mdelay(1);
3111 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
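/*
 * Both handlers above poke the legacy NMI status/control port 0x61: setting
 * bit 2 or bit 3 clears and disables the parity-error or IOCHK source
 * respectively, and writing the bit back to 0 after a short delay re-arms
 * it.  do_nmi() below reads the same port, where bit 7 reports a memory
 * parity error and bit 6 an I/O channel check.
 */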
3114 static void unknown_nmi_error(unsigned char reason)
3116 switch ( opt_nmi[0] )
3118 case 'd': /* 'dom0' */
3119 nmi_dom0_report(_XEN_NMIREASON_unknown);
3120 case 'i': /* 'ignore' */
3121 break;
3122 default: /* 'fatal' */
3123 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
3124 printk("Dazed and confused, but trying to continue\n");
3125 printk("Do you have a strange power saving mode enabled?\n");
3126 kexec_crash();
3130 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
3132 return 0;
3135 static nmi_callback_t nmi_callback = dummy_nmi_callback;
3137 asmlinkage void do_nmi(struct cpu_user_regs *regs)
3139 unsigned int cpu = smp_processor_id();
3140 unsigned char reason;
3142 ++nmi_count(cpu);
3144 if ( nmi_callback(regs, cpu) )
3145 return;
3147 #ifdef XEN_KDB_CONFIG
3148 if (kdb_enabled && kdb_handle_trap_entry(TRAP_nmi, regs))
3149 return;
3150 #endif
3151 if ( nmi_watchdog )
3152 nmi_watchdog_tick(regs);
3154 /* Only the BSP gets external NMIs from the system. */
3155 if ( cpu == 0 )
3157 reason = inb(0x61);
3158 if ( reason & 0x80 )
3159 mem_parity_error(regs);
3160 else if ( reason & 0x40 )
3161 io_check_error(regs);
3162 else if ( !nmi_watchdog )
3163 unknown_nmi_error((unsigned char)(reason&0xff));
3167 void set_nmi_callback(nmi_callback_t callback)
3169 nmi_callback = callback;
3172 void unset_nmi_callback(void)
3174 nmi_callback = dummy_nmi_callback;
3177 asmlinkage void do_device_not_available(struct cpu_user_regs *regs)
3179 struct vcpu *curr = current;
3181 BUG_ON(!guest_mode(regs));
3183 setup_fpu(curr);
3185 if ( curr->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
3187 do_guest_trap(TRAP_no_device, regs, 0);
3188 curr->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
3190 else
3191 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
3193 return;
3196 u64 read_efer(void)
3198 return this_cpu(efer);
3201 void write_efer(u64 val)
3203 this_cpu(efer) = val;
3204 wrmsrl(MSR_EFER, val);
3207 static void ler_enable(void)
3209 u64 debugctl;
3211 if ( !this_cpu(ler_msr) )
3212 return;
3214 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
3215 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | 1);
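/*
 * Bit 0 of IA32_DEBUGCTL is the last-branch-record enable; the CPU clears
 * it when a debug exception is raised, which is why do_debug() below calls
 * ler_enable() on every exit path.  ler_msr itself is the from-IP MSR
 * selected in percpu_traps_init().
 */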
3218 asmlinkage void do_debug(struct cpu_user_regs *regs)
3220 struct vcpu *v = current;
3222 DEBUGGER_trap_entry(TRAP_debug, regs);
3224 if ( !guest_mode(regs) )
3226 if ( regs->eflags & X86_EFLAGS_TF )
3228 #ifdef __x86_64__
3229 void sysenter_entry(void);
3230 void sysenter_eflags_saved(void);
3231 /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
3232 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
3233 (regs->rip <= (unsigned long)sysenter_eflags_saved) )
3235 if ( regs->rip == (unsigned long)sysenter_eflags_saved )
3236 regs->eflags &= ~X86_EFLAGS_TF;
3237 goto out;
3239 #endif
3240 if ( !debugger_trap_fatal(TRAP_debug, regs) )
3242 WARN_ON(1);
3243 regs->eflags &= ~X86_EFLAGS_TF;
3246 else
3248 /*
3249 * We ignore watchpoints when they trigger within Xen. This may
3250 * happen when a buffer is passed to us which previously had a
3251 * watchpoint set on it. No need to bump EIP; the only faulting
3252 * trap is an instruction breakpoint, which can't happen to us.
3253 */
3254 WARN_ON(!search_exception_table(regs->eip));
3256 goto out;
3259 /* Save debug status register where guest OS can peek at it */
3260 v->arch.guest_context.debugreg[6] = read_debugreg(6);
3262 ler_enable();
3263 do_guest_trap(TRAP_debug, regs, 0);
3264 return;
3266 out:
3267 ler_enable();
3268 return;
3271 asmlinkage void do_spurious_interrupt_bug(struct cpu_user_regs *regs)
3275 static void __set_intr_gate(unsigned int n, uint32_t dpl, void *addr)
3277 int i;
3278 /* Keep secondary tables in sync with IRQ updates. */
3279 for ( i = 1; i < NR_CPUS; i++ )
3280 if ( idt_tables[i] != NULL )
3281 _set_gate(&idt_tables[i][n], 14, dpl, addr);
3282 _set_gate(&idt_table[n], 14, dpl, addr);
3285 static void set_swint_gate(unsigned int n, void *addr)
3287 __set_intr_gate(n, 3, addr);
3290 void set_intr_gate(unsigned int n, void *addr)
3292 __set_intr_gate(n, 0, addr);
3295 void load_TR(void)
3297 struct tss_struct *tss = &this_cpu(init_tss);
3298 struct desc_ptr old_gdt, tss_gdt = {
3299 .base = (long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY),
3300 .limit = LAST_RESERVED_GDT_BYTE
3301 };
3303 _set_tssldt_desc(
3304 this_cpu(gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
3305 (unsigned long)tss,
3306 offsetof(struct tss_struct, __cacheline_filler) - 1,
3307 9);
3308 #ifdef CONFIG_COMPAT
3309 _set_tssldt_desc(
3310 this_cpu(compat_gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
3311 (unsigned long)tss,
3312 offsetof(struct tss_struct, __cacheline_filler) - 1,
3313 11);
3314 #endif
3316 /* Switch to non-compat GDT (which has B bit clear) to execute LTR. */
3317 asm volatile (
3318 "sgdt %0; lgdt %2; ltr %w1; lgdt %0"
3319 : "=m" (old_gdt) : "rm" (TSS_ENTRY << 3), "m" (tss_gdt) : "memory" );
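/*
 * Descriptor type 9 above is an available TSS and type 11 a busy one.  LTR
 * only accepts an available TSS and marks it busy as a side effect, so the
 * load is performed against the regular GDT (whose copy still has the busy
 * bit clear) before the previous GDT is restored, as noted in the comment
 * preceding the asm.
 */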
3322 void __devinit percpu_traps_init(void)
3324 subarch_percpu_traps_init();
3326 if ( !opt_ler )
3327 return;
3329 switch ( boot_cpu_data.x86_vendor )
3331 case X86_VENDOR_INTEL:
3332 switch ( boot_cpu_data.x86 )
3334 case 6:
3335 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
3336 break;
3337 case 15:
3338 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
3339 break;
3341 break;
3342 case X86_VENDOR_AMD:
3343 switch ( boot_cpu_data.x86 )
3345 case 6:
3346 case 0xf ... 0x17:
3347 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
3348 break;
3350 break;
3353 ler_enable();
3356 void __init trap_init(void)
3358 /*
3359 * Note that interrupt gates are always used, rather than trap gates. We
3360 * must have interrupts disabled until DS/ES/FS/GS are saved because the
3361 * first activation must have the "bad" value(s) for these registers and
3362 * we may lose them if another activation is installed before they are
3363 * saved. The page-fault handler also needs interrupts disabled until %cr2
3364 * has been read and saved on the stack.
3365 */
3366 set_intr_gate(TRAP_divide_error,&divide_error);
3367 set_intr_gate(TRAP_debug,&debug);
3368 set_intr_gate(TRAP_nmi,&nmi);
3369 set_swint_gate(TRAP_int3,&int3); /* usable from all privileges */
3370 set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */
3371 set_intr_gate(TRAP_bounds,&bounds);
3372 set_intr_gate(TRAP_invalid_op,&invalid_op);
3373 set_intr_gate(TRAP_no_device,&device_not_available);
3374 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
3375 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
3376 set_intr_gate(TRAP_no_segment,&segment_not_present);
3377 set_intr_gate(TRAP_stack_error,&stack_segment);
3378 set_intr_gate(TRAP_gp_fault,&general_protection);
3379 set_intr_gate(TRAP_page_fault,&page_fault);
3380 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
3381 set_intr_gate(TRAP_copro_error,&coprocessor_error);
3382 set_intr_gate(TRAP_alignment_check,&alignment_check);
3383 set_intr_gate(TRAP_machine_check,&machine_check);
3384 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
3386 /* CPU0 uses the master IDT. */
3387 idt_tables[0] = idt_table;
3389 this_cpu(gdt_table) = boot_cpu_gdt_table;
3390 #ifdef CONFIG_COMPAT
3391 this_cpu(compat_gdt_table) = boot_cpu_compat_gdt_table;
3392 #endif
3394 percpu_traps_init();
3396 cpu_init();
3398 open_softirq(NMI_MCE_SOFTIRQ, nmi_mce_softirq);
3401 long register_guest_nmi_callback(unsigned long address)
3403 struct vcpu *v = current;
3404 struct domain *d = v->domain;
3405 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3407 t->vector = TRAP_nmi;
3408 t->flags = 0;
3409 t->cs = (is_pv_32on64_domain(d) ?
3410 FLAT_COMPAT_KERNEL_CS : FLAT_KERNEL_CS);
3411 t->address = address;
3412 TI_SET_IF(t, 1);
3414 /*
3415 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
3416 * now.
3417 */
3418 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
3419 v->nmi_pending = 1;
3421 return 0;
3424 long unregister_guest_nmi_callback(void)
3426 struct vcpu *v = current;
3427 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3429 memset(t, 0, sizeof(*t));
3431 return 0;
3434 int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3436 struct vcpu *v;
3437 struct trap_info *t;
3439 BUG_ON(d == NULL);
3440 BUG_ON(vcpuid >= d->max_vcpus);
3442 /* Sanity check - XXX should be more fine grained. */
3443 BUG_ON(trap_nr > TRAP_syscall);
3445 v = d->vcpu[vcpuid];
3446 t = &v->arch.guest_context.trap_ctxt[trap_nr];
3448 return (t->address != 0);
3452 int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3454 struct vcpu *v;
3455 struct softirq_trap *st = &per_cpu(softirq_trap, smp_processor_id());
3457 BUG_ON(d == NULL);
3458 BUG_ON(vcpuid >= d->max_vcpus);
3459 v = d->vcpu[vcpuid];
3461 switch (trap_nr) {
3462 case TRAP_nmi:
3463 if ( cmpxchgptr(&st->vcpu, NULL, v) )
3464 return -EBUSY;
3465 if ( !test_and_set_bool(v->nmi_pending) ) {
3466 st->domain = d;
3467 st->processor = v->processor;
3469 /* not safe to wake up a vcpu here */
3470 raise_softirq(NMI_MCE_SOFTIRQ);
3471 return 0;
3473 st->vcpu = NULL;
3474 break;
3476 case TRAP_machine_check:
3477 if ( cmpxchgptr(&st->vcpu, NULL, v) )
3478 return -EBUSY;
3480 /* We are called by the machine check (exception or polling) handlers
3481 * on the physical CPU that reported a machine check error. */
3483 if ( !test_and_set_bool(v->mce_pending) ) {
3484 st->domain = d;
3485 st->vcpu = v;
3486 st->processor = v->processor;
3488 /* not safe to wake up a vcpu here */
3489 raise_softirq(NMI_MCE_SOFTIRQ);
3490 return 0;
3492 st->vcpu = NULL;
3493 break;
3496 /* delivery failed */
3497 return -EIO;
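/*
 * send_guest_trap() claims this CPU's softirq_trap slot with cmpxchgptr, so
 * only one NMI/MCE wakeup can be in flight per physical CPU.  If the slot
 * is busy the caller gets -EBUSY; if the vcpu already had the corresponding
 * pending flag set, the slot is released again and -EIO reports that
 * nothing new was delivered.
 */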
3501 long do_set_trap_table(XEN_GUEST_HANDLE(const_trap_info_t) traps)
3503 struct trap_info cur;
3504 struct vcpu *curr = current;
3505 struct trap_info *dst = curr->arch.guest_context.trap_ctxt;
3506 long rc = 0;
3508 /* If no table is presented then clear the entire virtual IDT. */
3509 if ( guest_handle_is_null(traps) )
3511 memset(dst, 0, 256 * sizeof(*dst));
3512 init_int80_direct_trap(curr);
3513 return 0;
3516 for ( ; ; )
3518 if ( hypercall_preempt_check() )
3520 rc = hypercall_create_continuation(
3521 __HYPERVISOR_set_trap_table, "h", traps);
3522 break;
3525 if ( copy_from_guest(&cur, traps, 1) )
3527 rc = -EFAULT;
3528 break;
3531 if ( cur.address == 0 )
3532 break;
3534 fixup_guest_code_selector(curr->domain, cur.cs);
3536 memcpy(&dst[cur.vector], &cur, sizeof(cur));
3538 if ( cur.vector == 0x80 )
3539 init_int80_direct_trap(curr);
3541 guest_handle_add_offset(traps, 1);
3544 return rc;
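/*
 * do_set_trap_table() copies guest trap_info entries one at a time so it
 * can honour hypercall_preempt_check(): when preemption is needed it
 * returns a continuation of __HYPERVISOR_set_trap_table rather than
 * blocking.  A NULL handle clears the whole virtual IDT, an entry with
 * address 0 terminates the table, and installing vector 0x80 refreshes the
 * direct int80 trap setup via init_int80_direct_trap().
 */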
3547 long set_debugreg(struct vcpu *v, int reg, unsigned long value)
3549 int i;
3550 struct vcpu *curr = current;
3552 switch ( reg )
3554 case 0:
3555 if ( !access_ok(value, sizeof(long)) )
3556 return -EPERM;
3557 if ( v == curr )
3558 write_debugreg(0, value);
3559 break;
3560 case 1:
3561 if ( !access_ok(value, sizeof(long)) )
3562 return -EPERM;
3563 if ( v == curr )
3564 write_debugreg(1, value);
3565 break;
3566 case 2:
3567 if ( !access_ok(value, sizeof(long)) )
3568 return -EPERM;
3569 if ( v == curr )
3570 write_debugreg(2, value);
3571 break;
3572 case 3:
3573 if ( !access_ok(value, sizeof(long)) )
3574 return -EPERM;
3575 if ( v == curr )
3576 write_debugreg(3, value);
3577 break;
3578 case 6:
3579 /*
3580 * DR6: Bits 4-11,16-31 reserved (set to 1).
3581 * Bit 12 reserved (set to 0).
3582 */
3583 value &= 0xffffefff; /* reserved bits => 0 */
3584 value |= 0xffff0ff0; /* reserved bits => 1 */
3585 if ( v == curr )
3586 write_debugreg(6, value);
3587 break;
3588 case 7:
3589 /*
3590 * DR7: Bit 10 reserved (set to 1).
3591 * Bits 11-12,14-15 reserved (set to 0).
3592 */
3593 value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
3594 value |= DR_CONTROL_RESERVED_ONE; /* reserved bits => 1 */
3595 /*
3596 * Privileged bits:
3597 * GD (bit 13): must be 0.
3598 */
3599 if ( value & DR_GENERAL_DETECT )
3600 return -EPERM;
3601 /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
3602 if ( value & DR7_ACTIVE_MASK )
3604 unsigned int io_enable = 0;
3606 for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
3608 if ( ((value >> i) & 3) == DR_IO )
3610 if ( !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
3611 return -EPERM;
3612 io_enable |= value & (3 << ((i - 16) >> 1));
3614 #ifdef __i386__
3615 if ( ((boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
3616 !boot_cpu_has(X86_FEATURE_LM)) &&
3617 (((value >> i) & 0xc) == DR_LEN_8) )
3618 return -EPERM;
3619 #endif
3622 /* Guest DR5 is a handy stash for I/O intercept information. */
3623 v->arch.guest_context.debugreg[5] = io_enable;
3624 value &= ~io_enable;
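/*
 * I/O breakpoints (condition bits == DR_IO) are stripped from the value
 * actually loaded into DR7 and parked in debugreg[5], the otherwise unused
 * "DR5" slot -- the "handy stash" mentioned above.  The I/O emulation path
 * checks breakpoints via check_guest_io_breakpoint(), and do_get_debugreg()
 * ORs the stash back in when the guest reads DR7.
 */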
3626 /*
3627 * If DR7 was previously clear then we need to load all other
3628 * debug registers at this point as they were not restored during
3629 * context switch.
3630 */
3631 if ( (v == curr) &&
3632 !(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
3634 write_debugreg(0, v->arch.guest_context.debugreg[0]);
3635 write_debugreg(1, v->arch.guest_context.debugreg[1]);
3636 write_debugreg(2, v->arch.guest_context.debugreg[2]);
3637 write_debugreg(3, v->arch.guest_context.debugreg[3]);
3638 write_debugreg(6, v->arch.guest_context.debugreg[6]);
3641 if ( v == curr )
3642 write_debugreg(7, value);
3643 break;
3644 default:
3645 return -EINVAL;
3648 v->arch.guest_context.debugreg[reg] = value;
3649 return 0;
3652 long do_set_debugreg(int reg, unsigned long value)
3654 return set_debugreg(current, reg, value);
3657 unsigned long do_get_debugreg(int reg)
3659 struct vcpu *curr = current;
3661 switch ( reg )
3663 case 0 ... 3:
3664 case 6:
3665 return curr->arch.guest_context.debugreg[reg];
3666 case 7:
3667 return (curr->arch.guest_context.debugreg[7] |
3668 curr->arch.guest_context.debugreg[5]);
3669 case 4 ... 5:
3670 return ((curr->arch.guest_context.ctrlreg[4] & X86_CR4_DE) ?
3671 curr->arch.guest_context.debugreg[reg + 2] : 0);
3674 return -EINVAL;
3677 /*
3678 * Local variables:
3679 * mode: C
3680 * c-set-style: "BSD"
3681 * c-basic-offset: 4
3682 * tab-width: 4
3683 * indent-tabs-mode: nil
3684 * End:
3685 */