
view xen/arch/x86/traps.c @ 20638:cf4f3e2f425c

Make tsc_mode=3 (pvrdtscp) work correctly.

Initial tsc_mode patch contained a rough cut at pvrdtscp mode. This
patch gets it working correctly. For the record, pvrdtscp mode allows
an application to obtain information from Xen to descale/de-offset
a physical tsc value to obtain "nsec since VM start". Though the
raw tsc value may change across migration due to different Hz rates
and different start times of different physical machines, applying
the pvrdtscp algorithm to a raw tsc value guarantees that the result
always advances at a fixed, known rate (nanoseconds) and is monotonically
increasing. BUT, pvrdtscp will only be fast on physical machines that
support the rdtscp instruction AND on which tsc is "safe"; on other
machines both the rdtsc and rdtscp instructions will be emulated.
Also note that when tsc_mode=3 is enabled, tsc-sensitive applications
that do NOT implement the pvrdtscp algorithm will behave incorrectly.
So, tsc_mode=3 should only be used when all apps are either tsc-resilient
or pvrdtscp-modified, and only has a performance advantage on very
recent generation processors.

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Dec 08 07:48:45 2009 +0000 (2009-12-08)
parents 14d9fb7a3262
children 611f49efe955
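
For readers of the description above, here is a minimal sketch (in C, Xen
coding style) of the descale/de-offset step a pvrdtscp-aware application
would perform on a raw tsc value. It is illustrative only and not part of
this changeset: the parameter names (tsc_offset, tsc_to_ns_mul, tsc_shift)
and the 128-bit intermediate are assumptions; the real scaling values come
from Xen and must be re-fetched whenever the TSC_AUX value returned by
rdtscp changes (e.g. across migration).

#include <stdint.h>

/*
 * Sketch only: convert a raw rdtscp result to "nsec since VM start".
 * tsc_offset    - raw TSC value corresponding to VM start (assumed name)
 * tsc_to_ns_mul - 32.32 fixed-point TSC->ns multiplier (assumed name)
 * tsc_shift     - pre-shift applied before the multiply, pvclock-style
 */
static inline uint64_t pvrdtscp_to_ns(uint64_t raw_tsc, uint64_t tsc_offset,
                                      uint32_t tsc_to_ns_mul, int8_t tsc_shift)
{
    uint64_t delta = raw_tsc - tsc_offset;      /* de-offset */

    if ( tsc_shift < 0 )                        /* descale, as in pvclock */
        delta >>= -tsc_shift;
    else
        delta <<= tsc_shift;

    /* 64x32 multiply; keep bits 32..95 of the product (GCC 128-bit type). */
    return (uint64_t)(((unsigned __int128)delta * tsc_to_ns_mul) >> 32);
}

Because the multiplier and shift are re-derived by Xen after migration, the
result keeps a fixed nanosecond rate and remains monotonic even though the
underlying raw tsc rate may change. On hardware without rdtscp, or where the
TSC is not considered safe, the instruction is emulated instead; see
emulate_invalid_rdtscp() and the RDTSCP case in emulate_privileged_op() in
the source below.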
line source
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <xen/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/bitops.h>
55 #include <asm/desc.h>
56 #include <asm/debugreg.h>
57 #include <asm/smp.h>
58 #include <asm/flushtlb.h>
59 #include <asm/uaccess.h>
60 #include <asm/i387.h>
61 #include <asm/debugger.h>
62 #include <asm/msr.h>
63 #include <asm/shared.h>
64 #include <asm/x86_emulate.h>
65 #include <asm/traps.h>
66 #include <asm/hvm/vpt.h>
67 #include <asm/hypercall.h>
68 #include <public/arch-x86/cpuid.h>
70 /*
71 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
72 * fatal: Xen prints diagnostic message and then hangs.
73 * dom0: The NMI is virtualised to DOM0.
74 * ignore: The NMI error is cleared and ignored.
75 */
76 #ifdef NDEBUG
77 static char __read_mostly opt_nmi[10] = "dom0";
78 #else
79 static char __read_mostly opt_nmi[10] = "fatal";
80 #endif
81 string_param("nmi", opt_nmi);
83 DEFINE_PER_CPU_READ_MOSTLY(u32, ler_msr);
85 /* Master table, used by CPU0. */
86 idt_entry_t idt_table[IDT_ENTRIES];
88 /* Pointer to the IDT of every CPU. */
89 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
91 #define DECLARE_TRAP_HANDLER(_name) \
92 asmlinkage void _name(void); \
93 asmlinkage void do_ ## _name(struct cpu_user_regs *regs)
95 DECLARE_TRAP_HANDLER(divide_error);
96 DECLARE_TRAP_HANDLER(debug);
97 DECLARE_TRAP_HANDLER(nmi);
98 DECLARE_TRAP_HANDLER(int3);
99 DECLARE_TRAP_HANDLER(overflow);
100 DECLARE_TRAP_HANDLER(bounds);
101 DECLARE_TRAP_HANDLER(invalid_op);
102 DECLARE_TRAP_HANDLER(device_not_available);
103 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
104 DECLARE_TRAP_HANDLER(invalid_TSS);
105 DECLARE_TRAP_HANDLER(segment_not_present);
106 DECLARE_TRAP_HANDLER(stack_segment);
107 DECLARE_TRAP_HANDLER(general_protection);
108 DECLARE_TRAP_HANDLER(page_fault);
109 DECLARE_TRAP_HANDLER(coprocessor_error);
110 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
111 DECLARE_TRAP_HANDLER(machine_check);
112 DECLARE_TRAP_HANDLER(alignment_check);
113 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
115 void (*ioemul_handle_quirk)(
116 u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
118 static int debug_stack_lines = 20;
119 integer_param("debug_stack_lines", debug_stack_lines);
121 static int opt_ler;
122 boolean_param("ler", opt_ler);
124 #ifdef CONFIG_X86_32
125 #define stack_words_per_line 8
126 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
127 #else
128 #define stack_words_per_line 4
129 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
130 #endif
132 static void show_guest_stack(struct vcpu *v, struct cpu_user_regs *regs)
133 {
134 int i;
135 unsigned long *stack, addr;
136 unsigned long mask = STACK_SIZE;
138 if ( is_hvm_vcpu(v) )
139 return;
141 if ( is_pv_32on64_vcpu(v) )
142 {
143 compat_show_guest_stack(v, regs, debug_stack_lines);
144 return;
145 }
147 if ( vm86_mode(regs) )
148 {
149 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
150 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
151 regs->ss, (uint16_t)(regs->esp & 0xffff));
152 }
153 else
154 {
155 stack = (unsigned long *)regs->esp;
156 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
157 }
159 if ( !access_ok(stack, sizeof(*stack)) )
160 {
161 printk("Guest-inaccessible memory.\n");
162 return;
163 }
165 if ( v != current )
166 {
167 struct vcpu *vcpu;
169 ASSERT(guest_kernel_mode(v, regs));
170 #ifndef __x86_64__
171 addr = read_cr3();
172 for_each_vcpu( v->domain, vcpu )
173 if ( vcpu->arch.cr3 == addr )
174 break;
175 #else
176 vcpu = maddr_get_owner(read_cr3()) == v->domain ? v : NULL;
177 #endif
178 if ( !vcpu )
179 {
180 stack = do_page_walk(v, (unsigned long)stack);
181 if ( (unsigned long)stack < PAGE_SIZE )
182 {
183 printk("Inaccessible guest memory.\n");
184 return;
185 }
186 mask = PAGE_SIZE;
187 }
188 }
190 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
191 {
192 if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & mask )
193 break;
194 if ( __get_user(addr, stack) )
195 {
196 if ( i != 0 )
197 printk("\n ");
198 printk("Fault while accessing guest memory.");
199 i = 1;
200 break;
201 }
202 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
203 printk("\n ");
204 printk(" %p", _p(addr));
205 stack++;
206 }
207 if ( i == 0 )
208 printk("Stack empty.");
209 printk("\n");
210 }
212 #if !defined(CONFIG_FRAME_POINTER)
214 static void show_trace(struct cpu_user_regs *regs)
215 {
216 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
218 printk("Xen call trace:\n ");
220 printk("[<%p>]", _p(regs->eip));
221 print_symbol(" %s\n ", regs->eip);
223 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
224 {
225 addr = *stack++;
226 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
227 {
228 printk("[<%p>]", _p(addr));
229 print_symbol(" %s\n ", addr);
230 }
231 }
233 printk("\n");
234 }
236 #else
238 static void show_trace(struct cpu_user_regs *regs)
239 {
240 unsigned long *frame, next, addr, low, high;
242 printk("Xen call trace:\n ");
244 printk("[<%p>]", _p(regs->eip));
245 print_symbol(" %s\n ", regs->eip);
247 /* Bounds for range of valid frame pointer. */
248 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
249 high = (low & ~(STACK_SIZE - 1)) +
250 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
252 /* The initial frame pointer. */
253 next = regs->ebp;
255 for ( ; ; )
256 {
257 /* Valid frame pointer? */
258 if ( (next < low) || (next >= high) )
259 {
260 /*
261 * Exception stack frames have a different layout, denoted by an
262 * inverted frame pointer.
263 */
264 next = ~next;
265 if ( (next < low) || (next >= high) )
266 break;
267 frame = (unsigned long *)next;
268 next = frame[0];
269 addr = frame[(offsetof(struct cpu_user_regs, eip) -
270 offsetof(struct cpu_user_regs, ebp))
271 / BYTES_PER_LONG];
272 }
273 else
274 {
275 /* Ordinary stack frame. */
276 frame = (unsigned long *)next;
277 next = frame[0];
278 addr = frame[1];
279 }
281 printk("[<%p>]", _p(addr));
282 print_symbol(" %s\n ", addr);
284 low = (unsigned long)&frame[2];
285 }
287 printk("\n");
288 }
290 #endif
292 void show_stack(struct cpu_user_regs *regs)
293 {
294 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
295 int i;
297 if ( guest_mode(regs) )
298 return show_guest_stack(current, regs);
300 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
302 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
303 {
304 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
305 break;
306 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
307 printk("\n ");
308 addr = *stack++;
309 printk(" %p", _p(addr));
310 }
311 if ( i == 0 )
312 printk("Stack empty.");
313 printk("\n");
315 show_trace(regs);
316 }
318 void show_stack_overflow(unsigned int cpu, unsigned long esp)
319 {
320 #ifdef MEMORY_GUARD
321 unsigned long esp_top, esp_bottom;
322 unsigned long *stack, addr;
324 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
325 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
327 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
328 (void *)esp_top, (void *)esp_bottom, (void *)esp,
329 (void *)per_cpu(init_tss, cpu).esp0);
331 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
332 if ( ((unsigned long)(esp - esp_top) > 512) &&
333 ((unsigned long)(esp_top - esp) > 512) )
334 {
335 printk("No stack overflow detected. Skipping stack trace.\n");
336 return;
337 }
339 if ( esp < esp_top )
340 esp = esp_top;
342 printk("Xen stack overflow (dumping trace %p-%p):\n ",
343 (void *)esp, (void *)esp_bottom);
345 stack = (unsigned long *)esp;
346 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
347 {
348 addr = *stack++;
349 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
350 {
351 printk("%p: [<%p>]", stack, _p(addr));
352 print_symbol(" %s\n ", addr);
353 }
354 }
356 printk("\n");
357 #endif
358 }
360 void show_execution_state(struct cpu_user_regs *regs)
361 {
362 show_registers(regs);
363 show_stack(regs);
364 }
366 void vcpu_show_execution_state(struct vcpu *v)
367 {
368 printk("*** Dumping Dom%d vcpu#%d state: ***\n",
369 v->domain->domain_id, v->vcpu_id);
371 if ( v == current )
372 {
373 show_execution_state(guest_cpu_user_regs());
374 return;
375 }
377 vcpu_pause(v); /* acceptably dangerous */
379 vcpu_show_registers(v);
380 if ( guest_kernel_mode(v, &v->arch.guest_context.user_regs) )
381 show_guest_stack(v, &v->arch.guest_context.user_regs);
383 vcpu_unpause(v);
384 }
386 static char *trapstr(int trapnr)
387 {
388 static char *strings[] = {
389 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
390 "invalid opcode", "device not available", "double fault",
391 "coprocessor segment", "invalid tss", "segment not found",
392 "stack error", "general protection fault", "page fault",
393 "spurious interrupt", "coprocessor error", "alignment check",
394 "machine check", "simd error"
395 };
397 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
398 return "???";
400 return strings[trapnr];
401 }
403 /*
404 * This is called for faults at very unexpected times (e.g., when interrupts
405 * are disabled). In such situations we can't do much that is safe. We try to
406 * print out some tracing and then we just spin.
407 */
408 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
409 {
410 static DEFINE_PER_CPU(char, depth);
412 /*
413 * In some cases, we can end up in a vicious cycle of fatal_trap()s
414 * within fatal_trap()s. We give the problem a couple of iterations to
415 * bottom out, and then we just panic.
416 */
417 if ( ++this_cpu(depth) < 3 )
418 {
419 watchdog_disable();
420 console_start_sync();
422 show_execution_state(regs);
424 if ( trapnr == TRAP_page_fault )
425 {
426 unsigned long cr2 = read_cr2();
427 printk("Faulting linear address: %p\n", _p(cr2));
428 show_page_walk(cr2);
429 }
430 }
432 panic("FATAL TRAP: vector = %d (%s)\n"
433 "[error_code=%04x] %s\n",
434 trapnr, trapstr(trapnr), regs->error_code,
435 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
436 }
438 static void do_guest_trap(
439 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
440 {
441 struct vcpu *v = current;
442 struct trap_bounce *tb;
443 const struct trap_info *ti;
445 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
447 tb = &v->arch.trap_bounce;
448 ti = &v->arch.guest_context.trap_ctxt[trapnr];
450 tb->flags = TBF_EXCEPTION;
451 tb->cs = ti->cs;
452 tb->eip = ti->address;
454 if ( use_error_code )
455 {
456 tb->flags |= TBF_EXCEPTION_ERRCODE;
457 tb->error_code = regs->error_code;
458 }
460 if ( TI_GET_IF(ti) )
461 tb->flags |= TBF_INTERRUPT;
463 if ( unlikely(null_trap_bounce(v, tb)) )
464 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] "
465 "on VCPU %d [ec=%04x]\n",
466 trapstr(trapnr), trapnr, v->vcpu_id, regs->error_code);
467 }
469 static void instruction_done(
470 struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
471 {
472 regs->eip = eip;
473 regs->eflags &= ~X86_EFLAGS_RF;
474 if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
475 {
476 current->arch.guest_context.debugreg[6] |= bpmatch | 0xffff0ff0;
477 if ( regs->eflags & X86_EFLAGS_TF )
478 current->arch.guest_context.debugreg[6] |= 0x4000;
479 do_guest_trap(TRAP_debug, regs, 0);
480 }
481 }
483 static unsigned int check_guest_io_breakpoint(struct vcpu *v,
484 unsigned int port, unsigned int len)
485 {
486 unsigned int width, i, match = 0;
487 unsigned long start;
489 if ( !(v->arch.guest_context.debugreg[5]) ||
490 !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
491 return 0;
493 for ( i = 0; i < 4; i++ )
494 {
495 if ( !(v->arch.guest_context.debugreg[5] &
496 (3 << (i * DR_ENABLE_SIZE))) )
497 continue;
499 start = v->arch.guest_context.debugreg[i];
500 width = 0;
502 switch ( (v->arch.guest_context.debugreg[7] >>
503 (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
504 {
505 case DR_LEN_1: width = 1; break;
506 case DR_LEN_2: width = 2; break;
507 case DR_LEN_4: width = 4; break;
508 case DR_LEN_8: width = 8; break;
509 }
511 if ( (start < (port + len)) && ((start + width) > port) )
512 match |= 1 << i;
513 }
515 return match;
516 }
518 /*
519 * Called from asm to set up the MCE trapbounce info.
520 * Returns 0 if no callback is set up, else 1.
521 */
522 asmlinkage int set_guest_machinecheck_trapbounce(void)
523 {
524 struct vcpu *v = current;
525 struct trap_bounce *tb = &v->arch.trap_bounce;
527 do_guest_trap(TRAP_machine_check, guest_cpu_user_regs(), 0);
528 tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */
529 return !null_trap_bounce(v, tb);
530 }
532 /*
533 * Called from asm to set up the NMI trapbounce info.
534 * Returns 0 if no callback is set up, else 1.
535 */
536 asmlinkage int set_guest_nmi_trapbounce(void)
537 {
538 struct vcpu *v = current;
539 struct trap_bounce *tb = &v->arch.trap_bounce;
540 do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0);
541 tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
542 return !null_trap_bounce(v, tb);
543 }
545 static inline void do_trap(
546 int trapnr, struct cpu_user_regs *regs, int use_error_code)
547 {
548 struct vcpu *curr = current;
549 unsigned long fixup;
551 DEBUGGER_trap_entry(trapnr, regs);
553 if ( guest_mode(regs) )
554 {
555 do_guest_trap(trapnr, regs, use_error_code);
556 return;
557 }
559 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
560 {
561 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
562 trapnr, _p(regs->eip), _p(fixup));
563 regs->eip = fixup;
564 return;
565 }
567 if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) &&
568 is_hvm_vcpu(curr) && curr->arch.hvm_vcpu.fpu_exception_callback )
569 {
570 curr->arch.hvm_vcpu.fpu_exception_callback(
571 curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs);
572 return;
573 }
575 DEBUGGER_trap_fatal(trapnr, regs);
577 show_execution_state(regs);
578 panic("FATAL TRAP: vector = %d (%s)\n"
579 "[error_code=%04x]\n",
580 trapnr, trapstr(trapnr), regs->error_code);
581 }
583 #define DO_ERROR_NOCODE(trapnr, name) \
584 asmlinkage void do_##name(struct cpu_user_regs *regs) \
585 { \
586 do_trap(trapnr, regs, 0); \
587 }
589 #define DO_ERROR(trapnr, name) \
590 asmlinkage void do_##name(struct cpu_user_regs *regs) \
591 { \
592 do_trap(trapnr, regs, 1); \
593 }
595 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
596 DO_ERROR_NOCODE(TRAP_overflow, overflow)
597 DO_ERROR_NOCODE(TRAP_bounds, bounds)
598 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
599 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
600 DO_ERROR( TRAP_no_segment, segment_not_present)
601 DO_ERROR( TRAP_stack_error, stack_segment)
602 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
603 DO_ERROR( TRAP_alignment_check, alignment_check)
604 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
606 int rdmsr_hypervisor_regs(uint32_t idx, uint64_t *val)
607 {
608 struct domain *d = current->domain;
609 /* Optionally shift out of the way of Viridian architectural MSRs. */
610 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
612 idx -= base;
613 if ( idx > 0 )
614 return 0;
616 switch ( idx )
617 {
618 case 0:
619 {
620 *val = 0;
621 break;
622 }
623 default:
624 BUG();
625 }
627 return 1;
628 }
630 int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val)
631 {
632 struct domain *d = current->domain;
633 /* Optionally shift out of the way of Viridian architectural MSRs. */
634 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
636 idx -= base;
637 if ( idx > 0 )
638 return 0;
640 switch ( idx )
641 {
642 case 0:
643 {
644 void *hypercall_page;
645 unsigned long mfn;
646 unsigned long gmfn = val >> 12;
647 unsigned int idx = val & 0xfff;
649 if ( idx > 0 )
650 {
651 gdprintk(XENLOG_WARNING,
652 "Out of range index %u to MSR %08x\n",
653 idx, 0x40000000);
654 return 0;
655 }
657 mfn = gmfn_to_mfn(d, gmfn);
659 if ( !mfn_valid(mfn) ||
660 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
661 {
662 gdprintk(XENLOG_WARNING,
663 "Bad GMFN %lx (MFN %lx) to MSR %08x\n",
664 gmfn, mfn, base + idx);
665 return 0;
666 }
668 hypercall_page = map_domain_page(mfn);
669 hypercall_page_initialise(d, hypercall_page);
670 unmap_domain_page(hypercall_page);
672 put_page_and_type(mfn_to_page(mfn));
673 break;
674 }
676 default:
677 BUG();
678 }
680 return 1;
681 }
683 int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
684 uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
685 {
686 struct domain *d = current->domain;
687 /* Optionally shift out of the way of Viridian architectural leaves. */
688 uint32_t base = is_viridian_domain(d) ? 0x40000100 : 0x40000000;
690 idx -= base;
691 if ( idx > 3 )
692 return 0;
694 switch ( idx )
695 {
696 case 0:
697 *eax = base + 3; /* Largest leaf */
698 *ebx = XEN_CPUID_SIGNATURE_EBX;
699 *ecx = XEN_CPUID_SIGNATURE_ECX;
700 *edx = XEN_CPUID_SIGNATURE_EDX;
701 break;
703 case 1:
704 *eax = (xen_major_version() << 16) | xen_minor_version();
705 *ebx = 0; /* Reserved */
706 *ecx = 0; /* Reserved */
707 *edx = 0; /* Reserved */
708 break;
710 case 2:
711 *eax = 1; /* Number of hypercall-transfer pages */
712 *ebx = 0x40000000; /* MSR base address */
713 if ( is_viridian_domain(d) )
714 *ebx = 0x40000200;
715 *ecx = 0; /* Features 1 */
716 *edx = 0; /* Features 2 */
717 if ( !is_hvm_vcpu(current) )
718 *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
719 break;
721 case 3:
722 *eax = *ebx = *ecx = *edx = 0;
723 cpuid_time_leaf( sub_idx, eax, ebx, ecx, edx );
724 break;
726 default:
727 BUG();
728 }
730 return 1;
731 }
733 static void pv_cpuid(struct cpu_user_regs *regs)
734 {
735 uint32_t a, b, c, d;
737 a = regs->eax;
738 b = regs->ebx;
739 c = regs->ecx;
740 d = regs->edx;
742 if ( current->domain->domain_id != 0 )
743 {
744 if ( !cpuid_hypervisor_leaves(a, c, &a, &b, &c, &d) )
745 domain_cpuid(current->domain, a, c, &a, &b, &c, &d);
746 goto out;
747 }
749 asm (
750 "cpuid"
751 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
752 : "0" (a), "1" (b), "2" (c), "3" (d) );
754 if ( (regs->eax & 0x7fffffff) == 1 )
755 {
756 /* Modify Feature Information. */
757 __clear_bit(X86_FEATURE_VME, &d);
758 if ( !cpu_has_apic )
759 __clear_bit(X86_FEATURE_APIC, &d);
760 __clear_bit(X86_FEATURE_PSE, &d);
761 __clear_bit(X86_FEATURE_PGE, &d);
762 __clear_bit(X86_FEATURE_PSE36, &d);
763 }
764 switch ( (uint32_t)regs->eax )
765 {
766 case 1:
767 /* Modify Feature Information. */
768 if ( !cpu_has_sep )
769 __clear_bit(X86_FEATURE_SEP, &d);
770 #ifdef __i386__
771 if ( !supervisor_mode_kernel )
772 __clear_bit(X86_FEATURE_SEP, &d);
773 #endif
774 __clear_bit(X86_FEATURE_DS, &d);
775 __clear_bit(X86_FEATURE_ACC, &d);
776 __clear_bit(X86_FEATURE_PBE, &d);
778 __clear_bit(X86_FEATURE_DTES64 % 32, &c);
779 __clear_bit(X86_FEATURE_MWAIT % 32, &c);
780 __clear_bit(X86_FEATURE_DSCPL % 32, &c);
781 __clear_bit(X86_FEATURE_VMXE % 32, &c);
782 __clear_bit(X86_FEATURE_SMXE % 32, &c);
783 __clear_bit(X86_FEATURE_TM2 % 32, &c);
784 if ( is_pv_32bit_vcpu(current) )
785 __clear_bit(X86_FEATURE_CX16 % 32, &c);
786 __clear_bit(X86_FEATURE_XTPR % 32, &c);
787 __clear_bit(X86_FEATURE_PDCM % 32, &c);
788 __clear_bit(X86_FEATURE_DCA % 32, &c);
789 __clear_bit(X86_FEATURE_XSAVE % 32, &c);
790 if ( !cpu_has_apic )
791 __clear_bit(X86_FEATURE_X2APIC % 32, &c);
792 __set_bit(X86_FEATURE_HYPERVISOR % 32, &c);
793 break;
794 case 0x80000001:
795 /* Modify Feature Information. */
796 if ( is_pv_32bit_vcpu(current) )
797 {
798 __clear_bit(X86_FEATURE_LM % 32, &d);
799 __clear_bit(X86_FEATURE_LAHF_LM % 32, &c);
800 }
801 #ifndef __i386__
802 if ( is_pv_32on64_vcpu(current) &&
803 boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
804 #endif
805 __clear_bit(X86_FEATURE_SYSCALL % 32, &d);
806 __clear_bit(X86_FEATURE_PAGE1GB % 32, &d);
807 __clear_bit(X86_FEATURE_RDTSCP % 32, &d);
809 __clear_bit(X86_FEATURE_SVME % 32, &c);
810 if ( !cpu_has_apic )
811 __clear_bit(X86_FEATURE_EXTAPICSPACE % 32, &c);
812 __clear_bit(X86_FEATURE_OSVW % 32, &c);
813 __clear_bit(X86_FEATURE_IBS % 32, &c);
814 __clear_bit(X86_FEATURE_SKINIT % 32, &c);
815 __clear_bit(X86_FEATURE_WDT % 32, &c);
816 break;
817 case 5: /* MONITOR/MWAIT */
818 case 0xa: /* Architectural Performance Monitor Features */
819 case 0x8000000a: /* SVM revision and features */
820 case 0x8000001b: /* Instruction Based Sampling */
821 a = b = c = d = 0;
822 break;
823 default:
824 (void)cpuid_hypervisor_leaves(regs->eax, 0, &a, &b, &c, &d);
825 break;
826 }
828 out:
829 regs->eax = a;
830 regs->ebx = b;
831 regs->ecx = c;
832 regs->edx = d;
833 }
835 static int emulate_invalid_rdtscp(struct cpu_user_regs *regs)
836 {
837 char opcode[3];
838 unsigned long eip, rc;
839 struct vcpu *v = current;
841 eip = regs->eip;
842 if ( (rc = copy_from_user(opcode, (char *)eip, sizeof(opcode))) != 0 )
843 {
844 propagate_page_fault(eip + sizeof(opcode) - rc, 0);
845 return EXCRET_fault_fixed;
846 }
847 if ( memcmp(opcode, "\xf\x1\xf9", sizeof(opcode)) )
848 return 0;
849 eip += sizeof(opcode);
850 pv_soft_rdtsc(v, regs, 1);
851 instruction_done(regs, eip, 0);
852 return EXCRET_fault_fixed;
853 }
855 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
856 {
857 char sig[5], instr[2];
858 unsigned long eip, rc;
860 eip = regs->eip;
862 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
863 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
864 {
865 propagate_page_fault(eip + sizeof(sig) - rc, 0);
866 return EXCRET_fault_fixed;
867 }
868 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
869 return 0;
870 eip += sizeof(sig);
872 /* We only emulate CPUID. */
873 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
874 {
875 propagate_page_fault(eip + sizeof(instr) - rc, 0);
876 return EXCRET_fault_fixed;
877 }
878 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
879 return 0;
880 eip += sizeof(instr);
882 pv_cpuid(regs);
884 instruction_done(regs, eip, 0);
886 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
888 return EXCRET_fault_fixed;
889 }
891 asmlinkage void do_invalid_op(struct cpu_user_regs *regs)
892 {
893 struct bug_frame bug;
894 struct bug_frame_str bug_str;
895 const char *filename, *predicate, *eip = (char *)regs->eip;
896 unsigned long fixup;
897 int id, lineno;
899 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
901 if ( likely(guest_mode(regs)) )
902 {
903 if ( !emulate_invalid_rdtscp(regs) &&
904 !emulate_forced_invalid_op(regs) )
905 do_guest_trap(TRAP_invalid_op, regs, 0);
906 return;
907 }
909 if ( !is_kernel(eip) ||
910 __copy_from_user(&bug, eip, sizeof(bug)) ||
911 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
912 (bug.ret != 0xc2) )
913 goto die;
914 eip += sizeof(bug);
916 id = bug.id & 3;
918 if ( id == BUGFRAME_dump )
919 {
920 show_execution_state(regs);
921 regs->eip = (unsigned long)eip;
922 return;
923 }
925 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
926 if ( !is_kernel(eip) ||
927 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
928 (bug_str.mov != 0xbc) )
929 goto die;
930 filename = bug_str(bug_str, eip);
931 eip += sizeof(bug_str);
933 if ( !is_kernel(filename) )
934 filename = "<unknown>";
935 lineno = bug.id >> 2;
937 if ( id == BUGFRAME_warn )
938 {
939 printk("Xen WARN at %.50s:%d\n", filename, lineno);
940 show_execution_state(regs);
941 regs->eip = (unsigned long)eip;
942 return;
943 }
945 if ( id == BUGFRAME_bug )
946 {
947 printk("Xen BUG at %.50s:%d\n", filename, lineno);
948 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
949 show_execution_state(regs);
950 panic("Xen BUG at %.50s:%d\n", filename, lineno);
951 }
953 /* ASSERT: decode the predicate string pointer. */
954 ASSERT(id == BUGFRAME_assert);
955 if ( !is_kernel(eip) ||
956 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
957 (bug_str.mov != 0xbc) )
958 goto die;
959 predicate = bug_str(bug_str, eip);
960 eip += sizeof(bug_str);
962 if ( !is_kernel(predicate) )
963 predicate = "<unknown>";
964 printk("Assertion '%s' failed at %.50s:%d\n",
965 predicate, filename, lineno);
966 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
967 show_execution_state(regs);
968 panic("Assertion '%s' failed at %.50s:%d\n",
969 predicate, filename, lineno);
971 die:
972 if ( (fixup = search_exception_table(regs->eip)) != 0 )
973 {
974 regs->eip = fixup;
975 return;
976 }
977 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
978 show_execution_state(regs);
979 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
980 }
982 asmlinkage void do_int3(struct cpu_user_regs *regs)
983 {
984 DEBUGGER_trap_entry(TRAP_int3, regs);
986 if ( !guest_mode(regs) )
987 {
988 debugger_trap_fatal(TRAP_int3, regs);
989 return;
990 }
992 do_guest_trap(TRAP_int3, regs, 0);
993 }
995 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
996 {
997 machine_check_vector(regs, regs->error_code);
998 }
1000 static void reserved_bit_page_fault(
1001 unsigned long addr, struct cpu_user_regs *regs)
1003 printk("d%d:v%d: reserved bit in page table (ec=%04X)\n",
1004 current->domain->domain_id, current->vcpu_id, regs->error_code);
1005 show_page_walk(addr);
1006 show_execution_state(regs);
1009 void propagate_page_fault(unsigned long addr, u16 error_code)
1011 struct trap_info *ti;
1012 struct vcpu *v = current;
1013 struct trap_bounce *tb = &v->arch.trap_bounce;
1015 v->arch.guest_context.ctrlreg[2] = addr;
1016 arch_set_cr2(v, addr);
1018 /* Re-set error_code.user flag appropriately for the guest. */
1019 error_code &= ~PFEC_user_mode;
1020 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
1021 error_code |= PFEC_user_mode;
1023 trace_pv_page_fault(addr, error_code);
1025 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
1026 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
1027 tb->error_code = error_code;
1028 tb->cs = ti->cs;
1029 tb->eip = ti->address;
1030 if ( TI_GET_IF(ti) )
1031 tb->flags |= TBF_INTERRUPT;
1032 if ( unlikely(null_trap_bounce(v, tb)) )
1034 printk("d%d:v%d: unhandled page fault (ec=%04X)\n",
1035 v->domain->domain_id, v->vcpu_id, error_code);
1036 show_page_walk(addr);
1039 if ( unlikely(error_code & PFEC_reserved_bit) )
1040 reserved_bit_page_fault(addr, guest_cpu_user_regs());
1043 static int handle_gdt_ldt_mapping_fault(
1044 unsigned long offset, struct cpu_user_regs *regs)
1046 struct vcpu *curr = current;
1047 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
1048 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
1049 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
1051 /* Should never fault in another vcpu's area. */
1052 BUG_ON(vcpu_area != curr->vcpu_id);
1054 /* Byte offset within the gdt/ldt sub-area. */
1055 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
1057 if ( likely(is_ldt_area) )
1059 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
1060 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
1062 if ( guest_mode(regs) )
1063 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
1064 regs->eip, offset);
1066 else
1068 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
1069 if ( !guest_mode(regs) )
1070 return 0;
1071 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
1072 propagate_page_fault(
1073 curr->arch.guest_context.ldt_base + offset,
1074 regs->error_code);
1077 else
1079 /* GDT fault: handle the fault as #GP(selector). */
1080 regs->error_code = (u16)offset & ~7;
1081 (void)do_general_protection(regs);
1084 return EXCRET_fault_fixed;
1087 #ifdef HYPERVISOR_VIRT_END
1088 #define IN_HYPERVISOR_RANGE(va) \
1089 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
1090 #else
1091 #define IN_HYPERVISOR_RANGE(va) \
1092 (((va) >= HYPERVISOR_VIRT_START))
1093 #endif
1095 static int __spurious_page_fault(
1096 unsigned long addr, unsigned int error_code)
1098 unsigned long mfn, cr3 = read_cr3();
1099 #if CONFIG_PAGING_LEVELS >= 4
1100 l4_pgentry_t l4e, *l4t;
1101 #endif
1102 #if CONFIG_PAGING_LEVELS >= 3
1103 l3_pgentry_t l3e, *l3t;
1104 #endif
1105 l2_pgentry_t l2e, *l2t;
1106 l1_pgentry_t l1e, *l1t;
1107 unsigned int required_flags, disallowed_flags;
1109 /*
1110 * We do not take spurious page faults in IRQ handlers as we do not
1111 * modify page tables in IRQ context. We therefore bail here because
1112 * map_domain_page() is not IRQ-safe.
1113 */
1114 if ( in_irq() )
1115 return 0;
1117 /* Reserved bit violations are never spurious faults. */
1118 if ( error_code & PFEC_reserved_bit )
1119 return 0;
1121 required_flags = _PAGE_PRESENT;
1122 if ( error_code & PFEC_write_access )
1123 required_flags |= _PAGE_RW;
1124 if ( error_code & PFEC_user_mode )
1125 required_flags |= _PAGE_USER;
1127 disallowed_flags = 0;
1128 if ( error_code & PFEC_insn_fetch )
1129 disallowed_flags |= _PAGE_NX;
1131 mfn = cr3 >> PAGE_SHIFT;
1133 #if CONFIG_PAGING_LEVELS >= 4
1134 l4t = map_domain_page(mfn);
1135 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
1136 mfn = l4e_get_pfn(l4e);
1137 unmap_domain_page(l4t);
1138 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
1139 (l4e_get_flags(l4e) & disallowed_flags) )
1140 return 0;
1141 #endif
1143 #if CONFIG_PAGING_LEVELS >= 3
1144 l3t = map_domain_page(mfn);
1145 #if CONFIG_PAGING_LEVELS == 3
1146 l3t += (cr3 & 0xFE0UL) >> 3;
1147 #endif
1148 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
1149 mfn = l3e_get_pfn(l3e);
1150 unmap_domain_page(l3t);
1151 #if CONFIG_PAGING_LEVELS == 3
1152 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1153 return 0;
1154 #else
1155 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
1156 (l3e_get_flags(l3e) & disallowed_flags) )
1157 return 0;
1158 #endif
1159 #endif
1161 l2t = map_domain_page(mfn);
1162 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
1163 mfn = l2e_get_pfn(l2e);
1164 unmap_domain_page(l2t);
1165 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
1166 (l2e_get_flags(l2e) & disallowed_flags) )
1167 return 0;
1168 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1170 l1e = l1e_empty(); /* define before use in debug tracing */
1171 goto spurious;
1174 l1t = map_domain_page(mfn);
1175 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
1176 mfn = l1e_get_pfn(l1e);
1177 unmap_domain_page(l1t);
1178 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
1179 (l1e_get_flags(l1e) & disallowed_flags) )
1180 return 0;
1182 spurious:
1183 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
1184 "at addr %lx, e/c %04x\n",
1185 current->domain->domain_id, current->vcpu_id,
1186 addr, error_code);
1187 #if CONFIG_PAGING_LEVELS >= 4
1188 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
1189 #endif
1190 #if CONFIG_PAGING_LEVELS >= 3
1191 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
1192 #endif
1193 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
1194 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
1195 return 1;
1198 static int spurious_page_fault(
1199 unsigned long addr, unsigned int error_code)
1201 unsigned long flags;
1202 int is_spurious;
1204 /*
1205 * Disabling interrupts prevents TLB flushing, and hence prevents
1206 * page tables from becoming invalid under our feet during the walk.
1207 */
1208 local_irq_save(flags);
1209 is_spurious = __spurious_page_fault(addr, error_code);
1210 local_irq_restore(flags);
1212 return is_spurious;
1215 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1217 struct vcpu *v = current;
1218 struct domain *d = v->domain;
1220 /* No fixups in interrupt context or when interrupts are disabled. */
1221 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1222 return 0;
1224 /* Faults from external-mode guests are handled by shadow/hap */
1225 if ( paging_mode_external(d) && guest_mode(regs) )
1227 int ret = paging_fault(addr, regs);
1228 if ( ret == EXCRET_fault_fixed )
1229 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1230 return ret;
1233 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1235 if ( !(regs->error_code & PFEC_reserved_bit) &&
1236 (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1237 return handle_gdt_ldt_mapping_fault(
1238 addr - GDT_LDT_VIRT_START, regs);
1239 return 0;
1242 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
1243 guest_kernel_mode(v, regs) &&
1244 /* Do not check if access-protection fault since the page may
1245 legitimately be not present in shadow page tables */
1246 ((regs->error_code & (PFEC_write_access|PFEC_reserved_bit)) ==
1247 PFEC_write_access) &&
1248 ptwr_do_page_fault(v, addr, regs) )
1249 return EXCRET_fault_fixed;
1251 /* For non-external shadowed guests, we fix up both their own
1252 * pagefaults and Xen's, since they share the pagetables. */
1253 if ( paging_mode_enabled(d) && !paging_mode_external(d) )
1255 int ret = paging_fault(addr, regs);
1256 if ( ret == EXCRET_fault_fixed )
1257 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1258 return ret;
1261 return 0;
1264 /*
1265 * #PF error code:
1266 * Bit 0: Protection violation (=1) ; Page not present (=0)
1267 * Bit 1: Write access
1268 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1269 * Bit 3: Reserved bit violation
1270 * Bit 4: Instruction fetch
1271 */
1272 asmlinkage void do_page_fault(struct cpu_user_regs *regs)
1274 unsigned long addr, fixup;
1275 unsigned int error_code;
1277 addr = read_cr2();
1279 /* fixup_page_fault() might change regs->error_code, so cache it here. */
1280 error_code = regs->error_code;
1282 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1284 perfc_incr(page_faults);
1286 if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1287 return;
1289 if ( unlikely(!guest_mode(regs)) )
1291 if ( spurious_page_fault(addr, error_code) )
1292 return;
1294 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1296 perfc_incr(copy_user_faults);
1297 if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1298 reserved_bit_page_fault(addr, regs);
1299 regs->eip = fixup;
1300 return;
1303 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1305 show_execution_state(regs);
1306 show_page_walk(addr);
1307 panic("FATAL PAGE FAULT\n"
1308 "[error_code=%04x]\n"
1309 "Faulting linear address: %p\n",
1310 error_code, _p(addr));
1313 if ( unlikely(current->domain->arch.suppress_spurious_page_faults
1314 && spurious_page_fault(addr, error_code)) )
1315 return;
1317 propagate_page_fault(addr, regs->error_code);
1320 /*
1321 * Early #PF handler to print CR2, error code, and stack.
1323 * We also deal with spurious faults here, even though they should never happen
1324 * during early boot (an issue was seen once, but was most likely a hardware
1325 * problem).
1326 */
1327 asmlinkage void do_early_page_fault(struct cpu_user_regs *regs)
1329 static int stuck;
1330 static unsigned long prev_eip, prev_cr2;
1331 unsigned long cr2 = read_cr2();
1333 BUG_ON(smp_processor_id() != 0);
1335 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1337 prev_eip = regs->eip;
1338 prev_cr2 = cr2;
1339 stuck = 0;
1340 return;
1343 if ( stuck++ == 1000 )
1345 unsigned long *stk = (unsigned long *)regs;
1346 printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1347 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1348 printk("Stack dump: ");
1349 while ( ((long)stk & ((PAGE_SIZE - 1) & ~(BYTES_PER_LONG - 1))) != 0 )
1350 printk("%p ", _p(*stk++));
1351 for ( ; ; ) ;
1355 long do_fpu_taskswitch(int set)
1357 struct vcpu *v = current;
1359 if ( set )
1361 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1362 stts();
1364 else
1366 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1367 if ( v->fpu_dirtied )
1368 clts();
1371 return 0;
1374 static int read_descriptor(unsigned int sel,
1375 const struct vcpu *v,
1376 const struct cpu_user_regs * regs,
1377 unsigned long *base,
1378 unsigned long *limit,
1379 unsigned int *ar,
1380 unsigned int vm86attr)
1382 struct desc_struct desc;
1384 if ( !vm86_mode(regs) )
1386 if ( sel < 4)
1387 desc.b = desc.a = 0;
1388 else if ( __get_user(desc,
1389 (const struct desc_struct *)(!(sel & 4)
1390 ? GDT_VIRT_START(v)
1391 : LDT_VIRT_START(v))
1392 + (sel >> 3)) )
1393 return 0;
1394 if ( !(vm86attr & _SEGMENT_CODE) )
1395 desc.b &= ~_SEGMENT_L;
1397 else
1399 desc.a = (sel << 20) | 0xffff;
1400 desc.b = vm86attr | (sel >> 12);
1403 *ar = desc.b & 0x00f0ff00;
1404 if ( !(desc.b & _SEGMENT_L) )
1406 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1407 (desc.b & 0xff000000));
1408 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1409 if ( desc.b & _SEGMENT_G )
1410 *limit = ((*limit + 1) << 12) - 1;
1411 #ifndef NDEBUG
1412 if ( !vm86_mode(regs) && (sel > 3) )
1414 unsigned int a, l;
1415 unsigned char valid;
1417 asm volatile (
1418 "larl %2,%0 ; setz %1"
1419 : "=r" (a), "=qm" (valid) : "rm" (sel));
1420 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1421 asm volatile (
1422 "lsll %2,%0 ; setz %1"
1423 : "=r" (l), "=qm" (valid) : "rm" (sel));
1424 BUG_ON(valid && (l != *limit));
1426 #endif
1428 else
1430 *base = 0UL;
1431 *limit = ~0UL;
1434 return 1;
1437 #ifdef __x86_64__
1438 static int read_gate_descriptor(unsigned int gate_sel,
1439 const struct vcpu *v,
1440 unsigned int *sel,
1441 unsigned long *off,
1442 unsigned int *ar)
1444 struct desc_struct desc;
1445 const struct desc_struct *pdesc;
1448 pdesc = (const struct desc_struct *)
1449 (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v))
1450 + (gate_sel >> 3);
1451 if ( (gate_sel < 4) ||
1452 ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
1453 __get_user(desc, pdesc) )
1454 return 0;
1456 *sel = (desc.a >> 16) & 0x0000fffc;
1457 *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
1458 *ar = desc.b & 0x0000ffff;
1460 /*
1461 * check_descriptor() clears the DPL field and stores the
1462 * guest requested DPL in the selector's RPL field.
1463 */
1464 if ( *ar & _SEGMENT_DPL )
1465 return 0;
1466 *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
1468 if ( !is_pv_32bit_vcpu(v) )
1470 if ( (*ar & 0x1f00) != 0x0c00 ||
1471 (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
1472 __get_user(desc, pdesc + 1) ||
1473 (desc.b & 0x1f00) )
1474 return 0;
1476 *off |= (unsigned long)desc.a << 32;
1477 return 1;
1480 switch ( *ar & 0x1f00 )
1482 case 0x0400:
1483 *off &= 0xffff;
1484 break;
1485 case 0x0c00:
1486 break;
1487 default:
1488 return 0;
1491 return 1;
1493 #endif
1495 /* Has the guest requested sufficient permission for this I/O access? */
1496 static int guest_io_okay(
1497 unsigned int port, unsigned int bytes,
1498 struct vcpu *v, struct cpu_user_regs *regs)
1500 #if defined(__x86_64__)
1501 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1502 int user_mode = !(v->arch.flags & TF_kernel_mode);
1503 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1504 #elif defined(__i386__)
1505 #define TOGGLE_MODE() ((void)0)
1506 #endif
1508 if ( !vm86_mode(regs) &&
1509 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1510 return 1;
1512 if ( v->arch.iobmp_limit > (port + bytes) )
1514 union { uint8_t bytes[2]; uint16_t mask; } x;
1516 /*
1517 * Grab permission bytes from guest space. Inaccessible bytes are
1518 * read as 0xff (no access allowed).
1519 */
1520 TOGGLE_MODE();
1521 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1522 port>>3, 2) )
1524 default: x.bytes[0] = ~0;
1525 case 1: x.bytes[1] = ~0;
1526 case 0: break;
1528 TOGGLE_MODE();
1530 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1531 return 1;
1534 return 0;
1537 /* Has the administrator granted sufficient permission for this I/O access? */
1538 static int admin_io_okay(
1539 unsigned int port, unsigned int bytes,
1540 struct vcpu *v, struct cpu_user_regs *regs)
1542 /*
1543 * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
1544 * We never permit direct access to that register.
1545 */
1546 if ( (port == 0xcf8) && (bytes == 4) )
1547 return 0;
1549 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1552 static uint32_t guest_io_read(
1553 unsigned int port, unsigned int bytes,
1554 struct vcpu *v, struct cpu_user_regs *regs)
1556 extern uint32_t pci_conf_read(
1557 uint32_t cf8, uint8_t offset, uint8_t bytes);
1559 uint32_t data = 0;
1560 unsigned int shift = 0;
1562 if ( admin_io_okay(port, bytes, v, regs) )
1564 switch ( bytes )
1566 case 1: return inb(port);
1567 case 2: return inw(port);
1568 case 4: return inl(port);
1572 while ( bytes != 0 )
1574 unsigned int size = 1;
1575 uint32_t sub_data = 0xff;
1577 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1579 sub_data = pv_pit_handler(port, 0, 0);
1581 else if ( (port == 0xcf8) && (bytes == 4) )
1583 size = 4;
1584 sub_data = v->domain->arch.pci_cf8;
1586 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1588 size = min(bytes, 4 - (port & 3));
1589 if ( size == 3 )
1590 size = 2;
1591 sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size);
1594 if ( size == 4 )
1595 return sub_data;
1597 data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
1598 shift += size * 8;
1599 port += size;
1600 bytes -= size;
1603 return data;
1606 extern void (*pv_rtc_handler)(unsigned int port, uint8_t value);
1608 static void guest_io_write(
1609 unsigned int port, unsigned int bytes, uint32_t data,
1610 struct vcpu *v, struct cpu_user_regs *regs)
1612 extern void pci_conf_write(
1613 uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data);
1615 if ( admin_io_okay(port, bytes, v, regs) )
1617 switch ( bytes ) {
1618 case 1:
1619 if ( ((port == 0x70) || (port == 0x71)) && pv_rtc_handler )
1620 pv_rtc_handler(port, (uint8_t)data);
1621 outb((uint8_t)data, port);
1622 if ( pv_post_outb_hook )
1623 pv_post_outb_hook(port, (uint8_t)data);
1624 break;
1625 case 2:
1626 outw((uint16_t)data, port);
1627 break;
1628 case 4:
1629 outl(data, port);
1630 break;
1632 return;
1635 while ( bytes != 0 )
1637 unsigned int size = 1;
1639 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1641 pv_pit_handler(port, (uint8_t)data, 1);
1643 else if ( (port == 0xcf8) && (bytes == 4) )
1645 size = 4;
1646 v->domain->arch.pci_cf8 = data;
1648 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1650 size = min(bytes, 4 - (port & 3));
1651 if ( size == 3 )
1652 size = 2;
1653 pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data);
1656 if ( size == 4 )
1657 return;
1659 port += size;
1660 bytes -= size;
1661 data >>= size * 8;
1665 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1666 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1667 __attribute__((__regparm__(1)));
1668 unsigned long guest_to_host_gpr_switch(unsigned long)
1669 __attribute__((__regparm__(1)));
1671 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1673 /* Instruction fetch with error handling. */
1674 #define insn_fetch(type, base, eip, limit) \
1675 ({ unsigned long _rc, _ptr = (base) + (eip); \
1676 type _x; \
1677 if ( ad_default < 8 ) \
1678 _ptr = (unsigned int)_ptr; \
1679 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1680 goto fail; \
1681 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1682 { \
1683 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1684 goto skip; \
1685 } \
1686 (eip) += sizeof(_x); _x; })
1688 #if defined(CONFIG_X86_32)
1689 # define read_sreg(regs, sr) ((regs)->sr)
1690 #elif defined(CONFIG_X86_64)
1691 # define read_sreg(regs, sr) read_segment_register(sr)
1692 #endif
1694 static int is_cpufreq_controller(struct domain *d)
1696 return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
1697 (d->domain_id == 0));
1700 static int emulate_privileged_op(struct cpu_user_regs *regs)
1702 struct vcpu *v = current;
1703 unsigned long *reg, eip = regs->eip;
1704 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1705 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1706 int rc;
1707 unsigned int port, i, data_sel, ar, data, bpmatch = 0;
1708 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1709 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1710 ? regs->reg \
1711 : ad_bytes == 4 \
1712 ? (u32)regs->reg \
1713 : (u16)regs->reg)
1714 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1715 ? regs->reg = (val) \
1716 : ad_bytes == 4 \
1717 ? (*(u32 *)&regs->reg = (val)) \
1718 : (*(u16 *)&regs->reg = (val)))
1719 unsigned long code_base, code_limit;
1720 char io_emul_stub[32];
1721 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1722 uint32_t l, h;
1723 uint64_t val;
1725 if ( !read_descriptor(regs->cs, v, regs,
1726 &code_base, &code_limit, &ar,
1727 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1728 goto fail;
1729 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1730 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1731 if ( !(ar & _SEGMENT_S) ||
1732 !(ar & _SEGMENT_P) ||
1733 !(ar & _SEGMENT_CODE) )
1734 goto fail;
1736 /* emulating only opcodes not allowing SS to be default */
1737 data_sel = read_sreg(regs, ds);
1739 /* Legacy prefixes. */
1740 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1742 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1744 case 0x66: /* operand-size override */
1745 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1746 continue;
1747 case 0x67: /* address-size override */
1748 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1749 continue;
1750 case 0x2e: /* CS override */
1751 data_sel = regs->cs;
1752 continue;
1753 case 0x3e: /* DS override */
1754 data_sel = read_sreg(regs, ds);
1755 continue;
1756 case 0x26: /* ES override */
1757 data_sel = read_sreg(regs, es);
1758 continue;
1759 case 0x64: /* FS override */
1760 data_sel = read_sreg(regs, fs);
1761 lm_ovr = lm_seg_fs;
1762 continue;
1763 case 0x65: /* GS override */
1764 data_sel = read_sreg(regs, gs);
1765 lm_ovr = lm_seg_gs;
1766 continue;
1767 case 0x36: /* SS override */
1768 data_sel = regs->ss;
1769 continue;
1770 case 0xf0: /* LOCK */
1771 lock = 1;
1772 continue;
1773 case 0xf2: /* REPNE/REPNZ */
1774 case 0xf3: /* REP/REPE/REPZ */
1775 rep_prefix = 1;
1776 continue;
1777 default:
1778 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1780 rex = opcode;
1781 continue;
1783 break;
1785 break;
1788 /* REX prefix. */
1789 if ( rex & 8 ) /* REX.W */
1790 op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
1791 modrm_reg = (rex & 4) << 1; /* REX.R */
1792 /* REX.X does not need to be decoded. */
1793 modrm_rm = (rex & 1) << 3; /* REX.B */
1795 if ( opcode == 0x0f )
1796 goto twobyte_opcode;
1798 if ( lock )
1799 goto fail;
1801 /* Input/Output String instructions. */
1802 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1804 unsigned long data_base, data_limit;
1806 if ( rep_prefix && (rd_ad(ecx) == 0) )
1807 goto done;
1809 if ( !(opcode & 2) )
1811 data_sel = read_sreg(regs, es);
1812 lm_ovr = lm_seg_none;
1815 if ( !(ar & _SEGMENT_L) )
1817 if ( !read_descriptor(data_sel, v, regs,
1818 &data_base, &data_limit, &ar,
1819 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|
1820 _SEGMENT_P) )
1821 goto fail;
1822 if ( !(ar & _SEGMENT_S) ||
1823 !(ar & _SEGMENT_P) ||
1824 (opcode & 2 ?
1825 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1826 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1827 goto fail;
1829 #ifdef CONFIG_X86_64
1830 else
1832 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1834 switch ( lm_ovr )
1836 case lm_seg_none:
1837 data_base = 0UL;
1838 break;
1839 case lm_seg_fs:
1840 data_base = v->arch.guest_context.fs_base;
1841 break;
1842 case lm_seg_gs:
1843 if ( guest_kernel_mode(v, regs) )
1844 data_base = v->arch.guest_context.gs_base_kernel;
1845 else
1846 data_base = v->arch.guest_context.gs_base_user;
1847 break;
1850 else
1851 read_descriptor(data_sel, v, regs,
1852 &data_base, &data_limit, &ar,
1853 0);
1854 data_limit = ~0UL;
1855 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1857 #endif
1859 port = (u16)regs->edx;
1861 continue_io_string:
1862 switch ( opcode )
1864 case 0x6c: /* INSB */
1865 op_bytes = 1;
1866 case 0x6d: /* INSW/INSL */
1867 if ( (data_limit < (op_bytes - 1)) ||
1868 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
1869 !guest_io_okay(port, op_bytes, v, regs) )
1870 goto fail;
1871 data = guest_io_read(port, op_bytes, v, regs);
1872 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
1873 &data, op_bytes)) != 0 )
1875 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1876 PFEC_write_access);
1877 return EXCRET_fault_fixed;
1879 wr_ad(edi, regs->edi + (int)((regs->eflags & X86_EFLAGS_DF)
1880 ? -op_bytes : op_bytes));
1881 break;
1883 case 0x6e: /* OUTSB */
1884 op_bytes = 1;
1885 case 0x6f: /* OUTSW/OUTSL */
1886 if ( (data_limit < (op_bytes - 1)) ||
1887 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
1888 !guest_io_okay(port, op_bytes, v, regs) )
1889 goto fail;
1890 if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
1891 op_bytes)) != 0 )
1893 propagate_page_fault(data_base + rd_ad(esi)
1894 + op_bytes - rc, 0);
1895 return EXCRET_fault_fixed;
1897 guest_io_write(port, op_bytes, data, v, regs);
1898 wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF)
1899 ? -op_bytes : op_bytes));
1900 break;
1903 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1905 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1907 if ( !bpmatch && !hypercall_preempt_check() )
1908 goto continue_io_string;
1909 eip = regs->eip;
1912 goto done;
1915 /*
1916 * Very likely to be an I/O instruction (IN/OUT).
1917 * Build an on-stack stub to execute the instruction with full guest
1918 * GPR context. This is needed for some systems which (ab)use IN/OUT
1919 * to communicate with BIOS code in system-management mode.
1920 */
1921 #ifdef __x86_64__
1922 /* movq $host_to_guest_gpr_switch,%rcx */
1923 io_emul_stub[0] = 0x48;
1924 io_emul_stub[1] = 0xb9;
1925 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1926 /* callq *%rcx */
1927 io_emul_stub[10] = 0xff;
1928 io_emul_stub[11] = 0xd1;
1929 #else
1930 /* call host_to_guest_gpr_switch */
1931 io_emul_stub[0] = 0xe8;
1932 *(s32 *)&io_emul_stub[1] =
1933 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1934 /* 7 x nop */
1935 memset(&io_emul_stub[5], 0x90, 7);
1936 #endif
1937 /* data16 or nop */
1938 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1939 /* <io-access opcode> */
1940 io_emul_stub[13] = opcode;
1941 /* imm8 or nop */
1942 io_emul_stub[14] = 0x90;
1943 /* ret (jumps to guest_to_host_gpr_switch) */
1944 io_emul_stub[15] = 0xc3;
1946 /* Handy function-typed pointer to the stub. */
1947 io_emul = (void *)io_emul_stub;
1949 if ( ioemul_handle_quirk )
1950 ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
1952 /* I/O Port and Interrupt Flag instructions. */
1953 switch ( opcode )
1955 case 0xe4: /* IN imm8,%al */
1956 op_bytes = 1;
1957 case 0xe5: /* IN imm8,%eax */
1958 port = insn_fetch(u8, code_base, eip, code_limit);
1959 io_emul_stub[14] = port; /* imm8 */
1960 exec_in:
1961 if ( !guest_io_okay(port, op_bytes, v, regs) )
1962 goto fail;
1963 if ( admin_io_okay(port, op_bytes, v, regs) )
1965 io_emul(regs);
1967 else
1969 if ( op_bytes == 4 )
1970 regs->eax = 0;
1971 else
1972 regs->eax &= ~((1u << (op_bytes * 8)) - 1);
1973 regs->eax |= guest_io_read(port, op_bytes, v, regs);
1975 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1976 goto done;
1978 case 0xec: /* IN %dx,%al */
1979 op_bytes = 1;
1980 case 0xed: /* IN %dx,%eax */
1981 port = (u16)regs->edx;
1982 goto exec_in;
1984 case 0xe6: /* OUT %al,imm8 */
1985 op_bytes = 1;
1986 case 0xe7: /* OUT %eax,imm8 */
1987 port = insn_fetch(u8, code_base, eip, code_limit);
1988 io_emul_stub[14] = port; /* imm8 */
1989 exec_out:
1990 if ( !guest_io_okay(port, op_bytes, v, regs) )
1991 goto fail;
1992 if ( admin_io_okay(port, op_bytes, v, regs) )
1994 if ( (op_bytes == 1) &&
1995 ((port == 0x71) || (port == 0x70)) &&
1996 pv_rtc_handler )
1997 pv_rtc_handler(port, regs->eax);
1998 io_emul(regs);
1999 if ( (op_bytes == 1) && pv_post_outb_hook )
2000 pv_post_outb_hook(port, regs->eax);
2002 else
2004 guest_io_write(port, op_bytes, regs->eax, v, regs);
2006 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
2007 goto done;
2009 case 0xee: /* OUT %al,%dx */
2010 op_bytes = 1;
2011 case 0xef: /* OUT %eax,%dx */
2012 port = (u16)regs->edx;
2013 goto exec_out;
2015 case 0xfa: /* CLI */
2016 case 0xfb: /* STI */
2017 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
2018 goto fail;
2019 /*
2020 * This is just too dangerous to allow, in my opinion. Consider if the
2021 * caller then tries to reenable interrupts using POPF: we can't trap
2022 * that and we'll end up with hard-to-debug lockups. Fast & loose will
2023 * do for us. :-)
2024 */
2025 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
2026 goto done;
2029 /* No decode of this single-byte opcode. */
2030 goto fail;
2032 twobyte_opcode:
2033 /*
2034 * All 2 and 3 byte opcodes, except RDTSC (0x31) and RDTSCP (0x1,0xF9)
2035 * are executable only from guest kernel mode (virtual ring 0).
2036 */
2037 opcode = insn_fetch(u8, code_base, eip, code_limit);
2038 if ( !guest_kernel_mode(v, regs) && (opcode != 0x1) && (opcode != 0x31) )
2039 goto fail;
2041 if ( lock && (opcode & ~3) != 0x20 )
2042 goto fail;
2043 switch ( opcode )
2045 case 0x1: /* RDTSCP */
2046 if ( (v->arch.guest_context.ctrlreg[4] & X86_CR4_TSD) &&
2047 !guest_kernel_mode(v, regs) )
2048 goto fail;
2049 if ( insn_fetch(u8, code_base, eip, code_limit) != 0xf9 )
2050 goto fail;
2051 pv_soft_rdtsc(v, regs, 1);
2052 break;
2054 case 0x06: /* CLTS */
2055 (void)do_fpu_taskswitch(0);
2056 break;
2058 case 0x09: /* WBINVD */
2059 /* Ignore the instruction if unprivileged. */
2060 if ( !cache_flush_permitted(v->domain) )
2061 /* Non-physdev domain attempted WBINVD; ignore for now since
2062 newer linux uses this in some start-of-day timing loops */
2064 else
2065 wbinvd();
2066 break;
2068 case 0x20: /* MOV CR?,<reg> */
2069 opcode = insn_fetch(u8, code_base, eip, code_limit);
2070 if ( opcode < 0xc0 )
2071 goto fail;
2072 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2073 modrm_rm |= (opcode >> 0) & 7;
2074 reg = decode_register(modrm_rm, regs, 0);
2075 switch ( modrm_reg )
2077 case 0: /* Read CR0 */
2078 *reg = (read_cr0() & ~X86_CR0_TS) |
2079 v->arch.guest_context.ctrlreg[0];
2080 break;
2082 case 2: /* Read CR2 */
2083 *reg = v->arch.guest_context.ctrlreg[2];
2084 break;
2086 case 3: /* Read CR3 */
2087 if ( !is_pv_32on64_vcpu(v) )
2088 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
2089 v->domain, pagetable_get_pfn(v->arch.guest_table)));
2090 #ifdef CONFIG_COMPAT
2091 else
2092 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
2093 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
2094 #endif
2095 break;
2097 case 4: /* Read CR4 */
2098 *reg = v->arch.guest_context.ctrlreg[4];
2099 break;
2101 default:
2102 goto fail;
2104 break;
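/*
 * CR reads return the guest's view: CR0 merges the guest-visible bits
 * over the real register with TS masked out, and CR3 translates the
 * machine frame of the current top-level pagetable back into a guest
 * frame number before encoding it in the guest's CR3 format.
 */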
2106 case 0x21: /* MOV DR?,<reg> */ {
2107 unsigned long res;
2108 opcode = insn_fetch(u8, code_base, eip, code_limit);
2109 if ( opcode < 0xc0 )
2110 goto fail;
2111 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2112 modrm_rm |= (opcode >> 0) & 7;
2113 reg = decode_register(modrm_rm, regs, 0);
2114 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
2115 goto fail;
2116 *reg = res;
2117 break;
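/*
 * do_get_debugreg() returns the register value on success; values in the
 * top 256 of the address space are interpreted as -errno, hence the
 * (unsigned long)-256 comparison above.
 */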
2120 case 0x22: /* MOV <reg>,CR? */
2121 opcode = insn_fetch(u8, code_base, eip, code_limit);
2122 if ( opcode < 0xc0 )
2123 goto fail;
2124 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2125 modrm_rm |= (opcode >> 0) & 7;
2126 reg = decode_register(modrm_rm, regs, 0);
2127 switch ( modrm_reg )
2129 case 0: /* Write CR0 */
2130 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
2132 gdprintk(XENLOG_WARNING,
2133 "Attempt to change unmodifiable CR0 flags.\n");
2134 goto fail;
2136 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
2137 break;
2139 case 2: /* Write CR2 */
2140 v->arch.guest_context.ctrlreg[2] = *reg;
2141 arch_set_cr2(v, *reg);
2142 break;
2144 case 3: /* Write CR3 */
2145 domain_lock(v->domain);
2146 if ( !is_pv_32on64_vcpu(v) )
2147 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
2148 #ifdef CONFIG_COMPAT
2149 else
2150 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
2151 #endif
2152 domain_unlock(v->domain);
2153 if ( rc == 0 ) /* not okay */
2154 goto fail;
2155 break;
2157 case 4: /* Write CR4 */
2158 v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(*reg);
2159 write_cr4(pv_guest_cr4_to_real_cr4(v));
2160 break;
2162 default:
2163 goto fail;
2165 break;
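/*
 * CR writes are tightly constrained: CR0 may only toggle TS, CR2 merely
 * updates the saved value, CR3 loads a new top-level pagetable under the
 * domain lock (after gmfn->mfn translation), and CR4 is filtered through
 * pv_guest_cr4_fixup() before reaching hardware.
 */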
2167 case 0x23: /* MOV <reg>,DR? */
2168 opcode = insn_fetch(u8, code_base, eip, code_limit);
2169 if ( opcode < 0xc0 )
2170 goto fail;
2171 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2172 modrm_rm |= (opcode >> 0) & 7;
2173 reg = decode_register(modrm_rm, regs, 0);
2174 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
2175 goto fail;
2176 break;
2178 case 0x30: /* WRMSR */ {
2179 u32 eax = regs->eax;
2180 u32 edx = regs->edx;
2181 u64 val = ((u64)edx << 32) | eax;
2182 switch ( (u32)regs->ecx )
2184 #ifdef CONFIG_X86_64
2185 case MSR_FS_BASE:
2186 if ( is_pv_32on64_vcpu(v) )
2187 goto fail;
2188 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
2189 goto fail;
2190 v->arch.guest_context.fs_base = val;
2191 break;
2192 case MSR_GS_BASE:
2193 if ( is_pv_32on64_vcpu(v) )
2194 goto fail;
2195 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
2196 goto fail;
2197 v->arch.guest_context.gs_base_kernel = val;
2198 break;
2199 case MSR_SHADOW_GS_BASE:
2200 if ( is_pv_32on64_vcpu(v) )
2201 goto fail;
2202 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
2203 goto fail;
2204 v->arch.guest_context.gs_base_user = val;
2205 break;
2206 #endif
2207 case MSR_K7_FID_VID_STATUS:
2208 case MSR_K7_FID_VID_CTL:
2209 case MSR_K8_PSTATE_LIMIT:
2210 case MSR_K8_PSTATE_CTRL:
2211 case MSR_K8_PSTATE_STATUS:
2212 case MSR_K8_PSTATE0:
2213 case MSR_K8_PSTATE1:
2214 case MSR_K8_PSTATE2:
2215 case MSR_K8_PSTATE3:
2216 case MSR_K8_PSTATE4:
2217 case MSR_K8_PSTATE5:
2218 case MSR_K8_PSTATE6:
2219 case MSR_K8_PSTATE7:
2220 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2221 goto fail;
2222 if ( !is_cpufreq_controller(v->domain) )
2223 break;
2224 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2225 goto fail;
2226 break;
2227 case MSR_AMD64_NB_CFG:
2228 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2229 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
2230 goto fail;
2231 if ( !IS_PRIV(v->domain) )
2232 break;
2233 if ( (rdmsr_safe(MSR_AMD64_NB_CFG, l, h) != 0) ||
2234 (eax != l) ||
2235 ((edx ^ h) & ~(1 << (AMD64_NB_CFG_CF8_EXT_ENABLE_BIT - 32))) )
2236 goto invalid;
2237 if ( wrmsr_safe(MSR_AMD64_NB_CFG, eax, edx) != 0 )
2238 goto fail;
2239 break;
2240 case MSR_FAM10H_MMIO_CONF_BASE:
2241 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2242 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
2243 goto fail;
2244 if ( !IS_PRIV(v->domain) )
2245 break;
2246 if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, l, h) != 0) ||
2247 (((((u64)h << 32) | l) ^ val) &
2248 ~( FAM10H_MMIO_CONF_ENABLE |
2249 (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
2250 FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
2251 ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
2252 FAM10H_MMIO_CONF_BASE_SHIFT))) )
2253 goto invalid;
2254 if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, eax, edx) != 0 )
2255 goto fail;
2256 break;
2257 case MSR_IA32_MPERF:
2258 case MSR_IA32_APERF:
2259 case MSR_IA32_PERF_CTL:
2260 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2261 goto fail;
2262 if ( !is_cpufreq_controller(v->domain) )
2263 break;
2264 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2265 goto fail;
2266 break;
2267 case MSR_IA32_THERM_CONTROL:
2268 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2269 goto fail;
2270 if ( (v->domain->domain_id != 0) || !v->domain->is_pinned )
2271 break;
2272 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2273 goto fail;
2274 break;
2275 default:
2276 if ( wrmsr_hypervisor_regs(regs->ecx, val) )
2277 break;
2279 rc = mce_wrmsr(regs->ecx, val);
2280 if ( rc < 0 )
2281 goto fail;
2282 if ( rc )
2283 break;
2285 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
2286 (eax != l) || (edx != h) )
2287 invalid:
2288 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
2289 "%08x:%08x to %08x:%08x.\n",
2290 _p(regs->ecx), h, l, edx, eax);
2291 break;
2293 break;
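/*
 * WRMSR policy: only a whitelist is honoured -- the 64-bit segment bases,
 * AMD/Intel frequency and thermal controls for suitably privileged
 * domains, and a couple of AMD chipset registers for the privileged
 * domain.  Everything else falls through to the Xen/MCE virtual MSR
 * handlers and is otherwise dropped, with a warning only if the attempted
 * value differs from the current one.
 */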
2296 case 0x31: /* RDTSC */
2297 if ( (v->arch.guest_context.ctrlreg[4] & X86_CR4_TSD) &&
2298 !guest_kernel_mode(v, regs) )
2299 goto fail;
2300 if ( v->domain->arch.vtsc )
2301 pv_soft_rdtsc(v, regs, 0);
2302 else
2303 rdtsc(regs->eax, regs->edx);
2304 break;
2306 case 0x32: /* RDMSR */
2307 switch ( (u32)regs->ecx )
2309 #ifdef CONFIG_X86_64
2310 case MSR_FS_BASE:
2311 if ( is_pv_32on64_vcpu(v) )
2312 goto fail;
2313 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
2314 regs->edx = v->arch.guest_context.fs_base >> 32;
2315 break;
2316 case MSR_GS_BASE:
2317 if ( is_pv_32on64_vcpu(v) )
2318 goto fail;
2319 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
2320 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
2321 break;
2322 case MSR_SHADOW_GS_BASE:
2323 if ( is_pv_32on64_vcpu(v) )
2324 goto fail;
2325 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
2326 regs->edx = v->arch.guest_context.gs_base_user >> 32;
2327 break;
2328 #endif
2329 case MSR_K7_FID_VID_CTL:
2330 case MSR_K7_FID_VID_STATUS:
2331 case MSR_K8_PSTATE_LIMIT:
2332 case MSR_K8_PSTATE_CTRL:
2333 case MSR_K8_PSTATE_STATUS:
2334 case MSR_K8_PSTATE0:
2335 case MSR_K8_PSTATE1:
2336 case MSR_K8_PSTATE2:
2337 case MSR_K8_PSTATE3:
2338 case MSR_K8_PSTATE4:
2339 case MSR_K8_PSTATE5:
2340 case MSR_K8_PSTATE6:
2341 case MSR_K8_PSTATE7:
2342 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2343 goto fail;
2344 if ( !is_cpufreq_controller(v->domain) )
2346 regs->eax = regs->edx = 0;
2347 break;
2349 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) != 0 )
2350 goto fail;
2351 break;
2352 case MSR_IA32_MISC_ENABLE:
2353 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2354 goto fail;
2355 regs->eax &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
2356 MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
2357 regs->eax |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
2358 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
2359 MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
2360 break;
2361 case MSR_EFER:
2362 case MSR_AMD_PATCHLEVEL:
2363 default:
2364 if ( rdmsr_hypervisor_regs(regs->ecx, &val) )
2366 rdmsr_writeback:
2367 regs->eax = (uint32_t)val;
2368 regs->edx = (uint32_t)(val >> 32);
2369 break;
2372 rc = mce_rdmsr(regs->ecx, &val);
2373 if ( rc < 0 )
2374 goto fail;
2375 if ( rc )
2376 goto rdmsr_writeback;
2378 /* Everyone can read the MSR space. */
2379 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
2380 _p(regs->ecx));*/
2381 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2382 goto fail;
2383 break;
2385 break;
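/*
 * RDMSR policy: the 64-bit segment bases come from the saved guest
 * context, P-state MSRs read as zero for non-cpufreq-controller domains,
 * MISC_ENABLE is sanitised to hide perfmon/BTS/PEBS and MONITOR, and
 * anything else is satisfied from the Xen/MCE virtual ranges or read
 * directly via rdmsr_safe().
 */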
2387 default:
2388 goto fail;
2391 #undef wr_ad
2392 #undef rd_ad
2394 done:
2395 instruction_done(regs, eip, bpmatch);
2396 skip:
2397 return EXCRET_fault_fixed;
2399 fail:
2400 return 0;
2403 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
2404 unsigned int esp, unsigned int decr)
2406 return (((esp - decr) < (esp - 1)) &&
2407 (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
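/*
 * check_stack_limit(): the first clause guards against the stack pointer
 * wrapping when decr bytes are pushed; the second applies the segment
 * limit the right way round for normal (expand-up) versus expand-down
 * (_SEGMENT_EC) stack segments.
 */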
2410 static void emulate_gate_op(struct cpu_user_regs *regs)
2412 #ifdef __x86_64__
2413 struct vcpu *v = current;
2414 unsigned int sel, ar, dpl, nparm, opnd_sel;
2415 unsigned int op_default, op_bytes, ad_default, ad_bytes;
2416 unsigned long off, eip, opnd_off, base, limit;
2417 int jump;
2419 /* Check whether this fault is due to the use of a call gate. */
2420 if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
2421 (((ar >> 13) & 3) < (regs->cs & 3)) ||
2422 ((ar & _SEGMENT_TYPE) != 0xc00) )
2424 do_guest_trap(TRAP_gp_fault, regs, 1);
2425 return;
2427 if ( !(ar & _SEGMENT_P) )
2429 do_guest_trap(TRAP_no_segment, regs, 1);
2430 return;
2432 dpl = (ar >> 13) & 3;
2433 nparm = ar & 0x1f;
2435 /*
2436 * Decode instruction (and perhaps operand) to determine RPL,
2437 * whether this is a jump or a call, and the call return offset.
2438 */
2439 if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
2440 !(ar & _SEGMENT_S) ||
2441 !(ar & _SEGMENT_P) ||
2442 !(ar & _SEGMENT_CODE) )
2444 do_guest_trap(TRAP_gp_fault, regs, 1);
2445 return;
2448 op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
2449 ad_default = ad_bytes = op_default;
2450 opnd_sel = opnd_off = 0;
2451 jump = -1;
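/*
 * jump starts at -1 and is bumped once for a far CALL and twice for a far
 * JMP by the decode below, so afterwards <0 means no gate-using
 * instruction was found, 0 means CALL (a return frame must be pushed) and
 * >0 means JMP.
 */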
2452 for ( eip = regs->eip; eip - regs->_eip < 10; )
2454 switch ( insn_fetch(u8, base, eip, limit) )
2456 case 0x66: /* operand-size override */
2457 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
2458 continue;
2459 case 0x67: /* address-size override */
2460 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
2461 continue;
2462 case 0x2e: /* CS override */
2463 opnd_sel = regs->cs;
2464 ASSERT(opnd_sel);
2465 continue;
2466 case 0x3e: /* DS override */
2467 opnd_sel = read_sreg(regs, ds);
2468 if ( !opnd_sel )
2469 opnd_sel = dpl;
2470 continue;
2471 case 0x26: /* ES override */
2472 opnd_sel = read_sreg(regs, es);
2473 if ( !opnd_sel )
2474 opnd_sel = dpl;
2475 continue;
2476 case 0x64: /* FS override */
2477 opnd_sel = read_sreg(regs, fs);
2478 if ( !opnd_sel )
2479 opnd_sel = dpl;
2480 continue;
2481 case 0x65: /* GS override */
2482 opnd_sel = read_sreg(regs, gs);
2483 if ( !opnd_sel )
2484 opnd_sel = dpl;
2485 continue;
2486 case 0x36: /* SS override */
2487 opnd_sel = regs->ss;
2488 if ( !opnd_sel )
2489 opnd_sel = dpl;
2490 continue;
2491 case 0xea:
2492 ++jump;
2493 /* FALLTHROUGH */
2494 case 0x9a:
2495 ++jump;
2496 opnd_sel = regs->cs;
2497 opnd_off = eip;
2498 ad_bytes = ad_default;
2499 eip += op_bytes + 2;
2500 break;
2501 case 0xff:
2503 unsigned int modrm;
2505 switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
2507 case 0x28: case 0x68: case 0xa8:
2508 ++jump;
2509 /* FALLTHROUGH */
2510 case 0x18: case 0x58: case 0x98:
2511 ++jump;
2512 if ( ad_bytes != 2 )
2514 if ( (modrm & 7) == 4 )
2516 unsigned int sib;
2517 sib = insn_fetch(u8, base, eip, limit);
2519 modrm = (modrm & ~7) | (sib & 7);
2520 if ( (sib >>= 3) != 4 )
2521 opnd_off = *(unsigned long *)
2522 decode_register(sib & 7, regs, 0);
2523 opnd_off <<= sib >> 3;
2525 if ( (modrm & 7) != 5 || (modrm & 0xc0) )
2526 opnd_off += *(unsigned long *)
2527 decode_register(modrm & 7, regs, 0);
2528 else
2529 modrm |= 0x87;
2530 if ( !opnd_sel )
2532 switch ( modrm & 7 )
2534 default:
2535 opnd_sel = read_sreg(regs, ds);
2536 break;
2537 case 4: case 5:
2538 opnd_sel = regs->ss;
2539 break;
2543 else
2545 switch ( modrm & 7 )
2547 case 0: case 1: case 7:
2548 opnd_off = regs->ebx;
2549 break;
2550 case 6:
2551 if ( !(modrm & 0xc0) )
2552 modrm |= 0x80;
2553 else
2554 case 2: case 3:
2556 opnd_off = regs->ebp;
2557 if ( !opnd_sel )
2558 opnd_sel = regs->ss;
2560 break;
2562 if ( !opnd_sel )
2563 opnd_sel = read_sreg(regs, ds);
2564 switch ( modrm & 7 )
2566 case 0: case 2: case 4:
2567 opnd_off += regs->esi;
2568 break;
2569 case 1: case 3: case 5:
2570 opnd_off += regs->edi;
2571 break;
2574 switch ( modrm & 0xc0 )
2576 case 0x40:
2577 opnd_off += insn_fetch(s8, base, eip, limit);
2578 break;
2579 case 0x80:
2580 opnd_off += insn_fetch(s32, base, eip, limit);
2581 break;
2583 if ( ad_bytes == 4 )
2584 opnd_off = (unsigned int)opnd_off;
2585 else if ( ad_bytes == 2 )
2586 opnd_off = (unsigned short)opnd_off;
2587 break;
2590 break;
2592 break;
2595 if ( jump < 0 )
2597 fail:
2598 do_guest_trap(TRAP_gp_fault, regs, 1);
2599 skip:
2600 return;
2603 if ( (opnd_sel != regs->cs &&
2604 !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
2605 !(ar & _SEGMENT_S) ||
2606 !(ar & _SEGMENT_P) ||
2607 ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
2609 do_guest_trap(TRAP_gp_fault, regs, 1);
2610 return;
2613 opnd_off += op_bytes;
2614 #define ad_default ad_bytes
2615 opnd_sel = insn_fetch(u16, base, opnd_off, limit);
2616 #undef ad_default
2617 ASSERT((opnd_sel & ~3) == regs->error_code);
2618 if ( dpl < (opnd_sel & 3) )
2620 do_guest_trap(TRAP_gp_fault, regs, 1);
2621 return;
2624 if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
2625 !(ar & _SEGMENT_S) ||
2626 !(ar & _SEGMENT_CODE) ||
2627 (!jump || (ar & _SEGMENT_EC) ?
2628 ((ar >> 13) & 3) > (regs->cs & 3) :
2629 ((ar >> 13) & 3) != (regs->cs & 3)) )
2631 regs->error_code = sel;
2632 do_guest_trap(TRAP_gp_fault, regs, 1);
2633 return;
2635 if ( !(ar & _SEGMENT_P) )
2637 regs->error_code = sel;
2638 do_guest_trap(TRAP_no_segment, regs, 1);
2639 return;
2641 if ( off > limit )
2643 regs->error_code = 0;
2644 do_guest_trap(TRAP_gp_fault, regs, 1);
2645 return;
2648 if ( !jump )
2650 unsigned int ss, esp, *stkp;
2651 int rc;
2652 #define push(item) do \
2653 { \
2654 --stkp; \
2655 esp -= 4; \
2656 rc = __put_user(item, stkp); \
2657 if ( rc ) \
2658 { \
2659 propagate_page_fault((unsigned long)(stkp + 1) - rc, \
2660 PFEC_write_access); \
2661 return; \
2662 } \
2663 } while ( 0 )
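/*
 * push() stores one 32-bit item on the new (inner) stack via __put_user,
 * decrementing both the mapped pointer and the virtual esp; a faulting
 * write is converted into a guest page fault at the failing address and
 * aborts the emulation.
 */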
2665 if ( ((ar >> 13) & 3) < (regs->cs & 3) )
2667 sel |= (ar >> 13) & 3;
2668 /* Inner stack known only for kernel ring. */
2669 if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
2671 do_guest_trap(TRAP_gp_fault, regs, 1);
2672 return;
2674 esp = v->arch.guest_context.kernel_sp;
2675 ss = v->arch.guest_context.kernel_ss;
2676 if ( (ss & 3) != (sel & 3) ||
2677 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2678 ((ar >> 13) & 3) != (sel & 3) ||
2679 !(ar & _SEGMENT_S) ||
2680 (ar & _SEGMENT_CODE) ||
2681 !(ar & _SEGMENT_WR) )
2683 regs->error_code = ss & ~3;
2684 do_guest_trap(TRAP_invalid_tss, regs, 1);
2685 return;
2687 if ( !(ar & _SEGMENT_P) ||
2688 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
2690 regs->error_code = ss & ~3;
2691 do_guest_trap(TRAP_stack_error, regs, 1);
2692 return;
2694 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2695 if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
2697 do_guest_trap(TRAP_gp_fault, regs, 1);
2698 return;
2700 push(regs->ss);
2701 push(regs->esp);
2702 if ( nparm )
2704 const unsigned int *ustkp;
2706 if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
2707 ((ar >> 13) & 3) != (regs->cs & 3) ||
2708 !(ar & _SEGMENT_S) ||
2709 (ar & _SEGMENT_CODE) ||
2710 !(ar & _SEGMENT_WR) ||
2711 !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
2712 return do_guest_trap(TRAP_gp_fault, regs, 1);
2713 ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
2714 if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
2716 do_guest_trap(TRAP_gp_fault, regs, 1);
2717 return;
2719 do
2721 unsigned int parm;
2723 --ustkp;
2724 rc = __get_user(parm, ustkp);
2725 if ( rc )
2727 propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
2728 return;
2730 push(parm);
2731 } while ( --nparm );
2734 else
2736 sel |= (regs->cs & 3);
2737 esp = regs->esp;
2738 ss = regs->ss;
2739 if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2740 ((ar >> 13) & 3) != (sel & 3) )
2742 do_guest_trap(TRAP_gp_fault, regs, 1);
2743 return;
2745 if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
2747 regs->error_code = 0;
2748 do_guest_trap(TRAP_stack_error, regs, 1);
2749 return;
2751 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2752 if ( !compat_access_ok(stkp - 2, 2 * 4) )
2754 do_guest_trap(TRAP_gp_fault, regs, 1);
2755 return;
2758 push(regs->cs);
2759 push(eip);
2760 #undef push
2761 regs->esp = esp;
2762 regs->ss = ss;
2764 else
2765 sel |= (regs->cs & 3);
2767 regs->cs = sel;
2768 instruction_done(regs, off, 0);
2769 #endif
2772 asmlinkage void do_general_protection(struct cpu_user_regs *regs)
2774 struct vcpu *v = current;
2775 unsigned long fixup;
2777 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
2779 if ( regs->error_code & 1 )
2780 goto hardware_gp;
2782 if ( !guest_mode(regs) )
2783 goto gp_in_kernel;
2785 /*
2786 * Cunning trick to allow arbitrary "INT n" handling.
2788 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
2789 * instruction from trapping to the appropriate vector, when that might not
2790 * be expected by Xen or the guest OS. For example, that entry might be for
2791 * a fault handler (unlike traps, faults don't increment EIP), or might
2792 * expect an error code on the stack (which a software trap never
2793 * provides), or might be a hardware interrupt handler that doesn't like
2794 * being called spuriously.
2796 * Instead, a GPF occurs with the faulting IDT vector in the error code.
2797 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
2798 * clear to indicate that it's a software fault, not hardware.
2800 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
2801 * okay because they can only be triggered by an explicit DPL-checked
2802 * instruction. The DPL specified by the guest OS for these vectors is NOT
2803 * CHECKED!!
2804 */
2805 if ( (regs->error_code & 3) == 2 )
2807 /* This fault must be due to <INT n> instruction. */
2808 const struct trap_info *ti;
2809 unsigned char vector = regs->error_code >> 3;
2810 ti = &v->arch.guest_context.trap_ctxt[vector];
2811 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
2813 regs->eip += 2;
2814 do_guest_trap(vector, regs, 0);
2815 return;
2818 else if ( is_pv_32on64_vcpu(v) && regs->error_code )
2820 emulate_gate_op(regs);
2821 return;
2824 /* Emulate some simple privileged and I/O instructions. */
2825 if ( (regs->error_code == 0) &&
2826 emulate_privileged_op(regs) )
2828 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
2829 return;
2832 #if defined(__i386__)
2833 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
2834 (regs->error_code == 0) &&
2835 gpf_emulate_4gb(regs) )
2837 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
2838 return;
2840 #endif
2842 /* Pass on GPF as is. */
2843 do_guest_trap(TRAP_gp_fault, regs, 1);
2844 return;
2846 gp_in_kernel:
2848 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
2850 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
2851 regs->error_code, _p(regs->eip), _p(fixup));
2852 regs->eip = fixup;
2853 return;
2856 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
2858 hardware_gp:
2859 show_execution_state(regs);
2860 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
2863 static DEFINE_PER_CPU(struct softirq_trap, softirq_trap);
2865 static void nmi_mce_softirq(void)
2867 int cpu = smp_processor_id();
2868 struct softirq_trap *st = &per_cpu(softirq_trap, cpu);
2869 cpumask_t affinity;
2871 BUG_ON(st == NULL);
2872 BUG_ON(st->vcpu == NULL);
2874 /* Set the tmp value unconditionally, so that
2875 * the check in the iret hypercall works. */
2876 st->vcpu->cpu_affinity_tmp = st->vcpu->cpu_affinity;
2878 if ((cpu != st->processor)
2879 || (st->processor != st->vcpu->processor))
2881 /* We are on a different physical cpu.
2882 * Make sure to wake up the vcpu on the
2883 * specified processor.
2884 */
2885 cpus_clear(affinity);
2886 cpu_set(st->processor, affinity);
2887 vcpu_set_affinity(st->vcpu, &affinity);
2889 /* Affinity is restored in the iret hypercall. */
2892 /* Only used to defer wakeup of domain/vcpu to
2893 * a safe (non-NMI/MCE) context.
2894 */
2895 vcpu_kick(st->vcpu);
2896 st->vcpu = NULL;
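/*
 * async_exception_cleanup() below is invoked from the iret hypercall path
 * once the guest has handled an NMI or MCE: it restores any temporary CPU
 * affinity set up by nmi_mce_softirq(), works out which asynchronous trap
 * is completing from the mask bits, and for vMCE retires one pending
 * injection record (re-arming another if more are queued).
 */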
2899 void async_exception_cleanup(struct vcpu *curr)
2901 int trap;
2903 if ( !curr->async_exception_mask )
2904 return;
2906 /* Restore affinity. */
2907 if ( !cpus_empty(curr->cpu_affinity_tmp) &&
2908 !cpus_equal(curr->cpu_affinity_tmp, curr->cpu_affinity) )
2910 vcpu_set_affinity(curr, &curr->cpu_affinity_tmp);
2911 cpus_clear(curr->cpu_affinity_tmp);
2914 if ( !(curr->async_exception_mask & (curr->async_exception_mask - 1)) )
2915 trap = __scanbit(curr->async_exception_mask, VCPU_TRAP_NONE);
2916 else
2917 for ( trap = VCPU_TRAP_NONE + 1; trap <= VCPU_TRAP_LAST; ++trap )
2918 if ( (curr->async_exception_mask ^
2919 curr->async_exception_state(trap).old_mask) == (1 << trap) )
2920 break;
2921 ASSERT(trap <= VCPU_TRAP_LAST);
2923 /* Inject vMCE into the PV guest, including Dom0. */
2924 if ( trap == VCPU_TRAP_MCE )
2926 gdprintk(XENLOG_DEBUG, "MCE: Return from vMCE# trap!\n");
2927 if ( curr->vcpu_id == 0 )
2929 struct domain *d = curr->domain;
2931 if ( !d->arch.vmca_msrs.nr_injection )
2933 printk(XENLOG_WARNING "MCE: ret from vMCE#, "
2934 "no injection node\n");
2935 goto end;
2938 d->arch.vmca_msrs.nr_injection--;
2939 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
2941 struct bank_entry *entry;
2943 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
2944 struct bank_entry, list);
2945 gdprintk(XENLOG_DEBUG, "MCE: delete last injection node\n");
2946 list_del(&entry->list);
2948 else
2949 printk(XENLOG_ERR "MCE: didn't find last injection node\n");
2951 /* further injection */
2952 if ( d->arch.vmca_msrs.nr_injection > 0 &&
2953 guest_has_trap_callback(d, 0, TRAP_machine_check) &&
2954 !test_and_set_bool(curr->mce_pending) )
2956 int cpu = smp_processor_id();
2957 cpumask_t affinity;
2959 curr->cpu_affinity_tmp = curr->cpu_affinity;
2960 cpus_clear(affinity);
2961 cpu_set(cpu, affinity);
2962 printk(XENLOG_DEBUG "MCE: CPU%d set affinity, old %d\n",
2963 cpu, curr->processor);
2964 vcpu_set_affinity(curr, &affinity);
2969 end:
2970 /* Restore previous asynchronous exception mask. */
2971 curr->async_exception_mask = curr->async_exception_state(trap).old_mask;
2974 static void nmi_dom0_report(unsigned int reason_idx)
2976 struct domain *d = dom0;
2978 if ( (d == NULL) || (d->vcpu == NULL) || (d->vcpu[0] == NULL) )
2979 return;
2981 set_bit(reason_idx, nmi_reason(d));
2983 send_guest_trap(d, 0, TRAP_nmi);
2986 static void mem_parity_error(struct cpu_user_regs *regs)
2988 switch ( opt_nmi[0] )
2990 case 'd': /* 'dom0' */
2991 nmi_dom0_report(_XEN_NMIREASON_parity_error);
2992 case 'i': /* 'ignore' */
2993 break;
2994 default: /* 'fatal' */
2995 console_force_unlock();
2996 printk("\n\nNMI - MEMORY ERROR\n");
2997 fatal_trap(TRAP_nmi, regs);
3000 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
3001 mdelay(1);
3002 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
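/*
 * Port 0x61 is the legacy NMI status/control port: setting bit 2 masks
 * and clears the parity-error latch and clearing it re-enables the check;
 * io_check_error() below does the same with bit 3 for IOCHK-sourced NMIs,
 * and do_nmi() reads bits 7 and 6 to distinguish the two sources.
 */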
3005 static void io_check_error(struct cpu_user_regs *regs)
3007 switch ( opt_nmi[0] )
3009 case 'd': /* 'dom0' */
3010 nmi_dom0_report(_XEN_NMIREASON_io_error);
3011 case 'i': /* 'ignore' */
3012 break;
3013 default: /* 'fatal' */
3014 console_force_unlock();
3015 printk("\n\nNMI - I/O ERROR\n");
3016 fatal_trap(TRAP_nmi, regs);
3019 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
3020 mdelay(1);
3021 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
3024 static void unknown_nmi_error(unsigned char reason)
3026 switch ( opt_nmi[0] )
3028 case 'd': /* 'dom0' */
3029 nmi_dom0_report(_XEN_NMIREASON_unknown);
3030 case 'i': /* 'ignore' */
3031 break;
3032 default: /* 'fatal' */
3033 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
3034 printk("Dazed and confused, but trying to continue\n");
3035 printk("Do you have a strange power saving mode enabled?\n");
3036 kexec_crash();
3040 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
3042 return 0;
3045 static nmi_callback_t nmi_callback = dummy_nmi_callback;
3047 asmlinkage void do_nmi(struct cpu_user_regs *regs)
3049 unsigned int cpu = smp_processor_id();
3050 unsigned char reason;
3052 ++nmi_count(cpu);
3054 if ( nmi_callback(regs, cpu) )
3055 return;
3057 if ( nmi_watchdog )
3058 nmi_watchdog_tick(regs);
3060 /* Only the BSP gets external NMIs from the system. */
3061 if ( cpu == 0 )
3063 reason = inb(0x61);
3064 if ( reason & 0x80 )
3065 mem_parity_error(regs);
3066 else if ( reason & 0x40 )
3067 io_check_error(regs);
3068 else if ( !nmi_watchdog )
3069 unknown_nmi_error((unsigned char)(reason&0xff));
3073 void set_nmi_callback(nmi_callback_t callback)
3075 nmi_callback = callback;
3078 void unset_nmi_callback(void)
3080 nmi_callback = dummy_nmi_callback;
3083 asmlinkage void do_device_not_available(struct cpu_user_regs *regs)
3085 struct vcpu *curr = current;
3087 BUG_ON(!guest_mode(regs));
3089 setup_fpu(curr);
3091 if ( curr->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
3093 do_guest_trap(TRAP_no_device, regs, 0);
3094 curr->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
3096 else
3097 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
3099 return;
3102 asmlinkage void do_debug(struct cpu_user_regs *regs)
3104 struct vcpu *v = current;
3106 DEBUGGER_trap_entry(TRAP_debug, regs);
3108 if ( !guest_mode(regs) )
3110 if ( regs->eflags & X86_EFLAGS_TF )
3112 #ifdef __x86_64__
3113 void sysenter_entry(void);
3114 void sysenter_eflags_saved(void);
3115 /* In the SYSENTER entry path we can't zap TF until EFLAGS is saved. */
3116 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
3117 (regs->rip <= (unsigned long)sysenter_eflags_saved) )
3119 if ( regs->rip == (unsigned long)sysenter_eflags_saved )
3120 regs->eflags &= ~X86_EFLAGS_TF;
3121 goto out;
3123 #endif
3124 if ( !debugger_trap_fatal(TRAP_debug, regs) )
3126 WARN_ON(1);
3127 regs->eflags &= ~X86_EFLAGS_TF;
3130 else
3132 /*
3133 * We ignore watchpoints when they trigger within Xen. This may
3134 * happen when a buffer is passed to us which previously had a
3135 * watchpoint set on it. No need to bump EIP; the only faulting
3136 * trap is an instruction breakpoint, which can't happen to us.
3137 */
3138 WARN_ON(!search_exception_table(regs->eip));
3140 goto out;
3143 /* Save debug status register where guest OS can peek at it */
3144 v->arch.guest_context.debugreg[6] = read_debugreg(6);
3146 ler_enable();
3147 do_guest_trap(TRAP_debug, regs, 0);
3148 return;
3150 out:
3151 ler_enable();
3152 return;
3155 asmlinkage void do_spurious_interrupt_bug(struct cpu_user_regs *regs)
3159 static void __set_intr_gate(unsigned int n, uint32_t dpl, void *addr)
3161 int i;
3162 /* Keep secondary tables in sync with IRQ updates. */
3163 for ( i = 1; i < NR_CPUS; i++ )
3164 if ( idt_tables[i] != NULL )
3165 _set_gate(&idt_tables[i][n], 14, dpl, addr);
3166 _set_gate(&idt_table[n], 14, dpl, addr);
3169 static void set_swint_gate(unsigned int n, void *addr)
3171 __set_intr_gate(n, 3, addr);
3174 void set_intr_gate(unsigned int n, void *addr)
3176 __set_intr_gate(n, 0, addr);
3179 void load_TR(void)
3181 struct tss_struct *tss = &this_cpu(init_tss);
3182 struct desc_ptr old_gdt, tss_gdt = {
3183 .base = (long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY),
3184 .limit = LAST_RESERVED_GDT_BYTE
3185 };
3187 _set_tssldt_desc(
3188 this_cpu(gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
3189 (unsigned long)tss,
3190 offsetof(struct tss_struct, __cacheline_filler) - 1,
3191 9);
3192 #ifdef CONFIG_COMPAT
3193 _set_tssldt_desc(
3194 this_cpu(compat_gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
3195 (unsigned long)tss,
3196 offsetof(struct tss_struct, __cacheline_filler) - 1,
3197 11);
3198 #endif
3200 /* Switch to non-compat GDT (which has B bit clear) to execute LTR. */
3201 asm volatile (
3202 "sgdt %0; lgdt %2; ltr %w1; lgdt %0"
3203 : "=m" (old_gdt) : "rm" (TSS_ENTRY << 3), "m" (tss_gdt) : "memory" );
3206 void __devinit percpu_traps_init(void)
3208 subarch_percpu_traps_init();
3210 if ( !opt_ler )
3211 return;
3213 switch ( boot_cpu_data.x86_vendor )
3215 case X86_VENDOR_INTEL:
3216 switch ( boot_cpu_data.x86 )
3218 case 6:
3219 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
3220 break;
3221 case 15:
3222 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
3223 break;
3225 break;
3226 case X86_VENDOR_AMD:
3227 switch ( boot_cpu_data.x86 )
3229 case 6:
3230 case 15:
3231 case 16:
3232 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
3233 break;
3235 break;
3238 ler_enable();
3241 void __init trap_init(void)
3243 /*
3244 * Note that interrupt gates are always used, rather than trap gates. We
3245 * must have interrupts disabled until DS/ES/FS/GS are saved because the
3246 * first activation must have the "bad" value(s) for these registers and
3247 * we may lose them if another activation is installed before they are
3248 * saved. The page-fault handler also needs interrupts disabled until %cr2
3249 * has been read and saved on the stack.
3250 */
3251 set_intr_gate(TRAP_divide_error,&divide_error);
3252 set_intr_gate(TRAP_debug,&debug);
3253 set_intr_gate(TRAP_nmi,&nmi);
3254 set_swint_gate(TRAP_int3,&int3); /* usable from all privileges */
3255 set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */
3256 set_intr_gate(TRAP_bounds,&bounds);
3257 set_intr_gate(TRAP_invalid_op,&invalid_op);
3258 set_intr_gate(TRAP_no_device,&device_not_available);
3259 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
3260 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
3261 set_intr_gate(TRAP_no_segment,&segment_not_present);
3262 set_intr_gate(TRAP_stack_error,&stack_segment);
3263 set_intr_gate(TRAP_gp_fault,&general_protection);
3264 set_intr_gate(TRAP_page_fault,&page_fault);
3265 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
3266 set_intr_gate(TRAP_copro_error,&coprocessor_error);
3267 set_intr_gate(TRAP_alignment_check,&alignment_check);
3268 set_intr_gate(TRAP_machine_check,&machine_check);
3269 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
3271 /* CPU0 uses the master IDT. */
3272 idt_tables[0] = idt_table;
3274 percpu_traps_init();
3276 cpu_init();
3278 open_softirq(NMI_MCE_SOFTIRQ, nmi_mce_softirq);
3281 long register_guest_nmi_callback(unsigned long address)
3283 struct vcpu *v = current;
3284 struct domain *d = v->domain;
3285 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3287 t->vector = TRAP_nmi;
3288 t->flags = 0;
3289 t->cs = (is_pv_32on64_domain(d) ?
3290 FLAT_COMPAT_KERNEL_CS : FLAT_KERNEL_CS);
3291 t->address = address;
3292 TI_SET_IF(t, 1);
3294 /*
3295 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
3296 * now.
3297 */
3298 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
3299 v->nmi_pending = 1;
3301 return 0;
3304 long unregister_guest_nmi_callback(void)
3306 struct vcpu *v = current;
3307 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3309 memset(t, 0, sizeof(*t));
3311 return 0;
3314 int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3316 struct vcpu *v;
3317 struct trap_info *t;
3319 BUG_ON(d == NULL);
3320 BUG_ON(vcpuid >= d->max_vcpus);
3322 /* Sanity check - XXX should be more fine-grained. */
3323 BUG_ON(trap_nr > TRAP_syscall);
3325 v = d->vcpu[vcpuid];
3326 t = &v->arch.guest_context.trap_ctxt[trap_nr];
3328 return (t->address != 0);
3332 int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3334 struct vcpu *v;
3335 struct softirq_trap *st = &per_cpu(softirq_trap, smp_processor_id());
3337 BUG_ON(d == NULL);
3338 BUG_ON(vcpuid >= d->max_vcpus);
3339 v = d->vcpu[vcpuid];
3341 switch (trap_nr) {
3342 case TRAP_nmi:
3343 if ( cmpxchgptr(&st->vcpu, NULL, v) )
3344 return -EBUSY;
3345 if ( !test_and_set_bool(v->nmi_pending) ) {
3346 st->domain = d;
3347 st->processor = v->processor;
3349 /* not safe to wake up a vcpu here */
3350 raise_softirq(NMI_MCE_SOFTIRQ);
3351 return 0;
3353 st->vcpu = NULL;
3354 break;
3356 case TRAP_machine_check:
3357 if ( cmpxchgptr(&st->vcpu, NULL, v) )
3358 return -EBUSY;
3360 /* We are called by the machine check (exception or polling) handlers
3361 * on the physical CPU that reported a machine check error. */
3363 if ( !test_and_set_bool(v->mce_pending) ) {
3364 st->domain = d;
3365 st->vcpu = v;
3366 st->processor = v->processor;
3368 /* not safe to wake up a vcpu here */
3369 raise_softirq(NMI_MCE_SOFTIRQ);
3370 return 0;
3372 st->vcpu = NULL;
3373 break;
3376 /* delivery failed */
3377 return -EIO;
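/*
 * send_guest_trap() is the NMI/MCE delivery entry point: the per-cpu
 * softirq_trap slot is claimed with cmpxchgptr() so at most one delivery
 * per physical CPU is pending, and the actual vcpu kick is deferred to
 * NMI_MCE_SOFTIRQ because waking a vcpu from NMI/MCE context is unsafe.
 */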
3381 long do_set_trap_table(XEN_GUEST_HANDLE(const_trap_info_t) traps)
3383 struct trap_info cur;
3384 struct vcpu *curr = current;
3385 struct trap_info *dst = curr->arch.guest_context.trap_ctxt;
3386 long rc = 0;
3388 /* If no table is presented then clear the entire virtual IDT. */
3389 if ( guest_handle_is_null(traps) )
3391 memset(dst, 0, 256 * sizeof(*dst));
3392 init_int80_direct_trap(curr);
3393 return 0;
3396 for ( ; ; )
3398 if ( hypercall_preempt_check() )
3400 rc = hypercall_create_continuation(
3401 __HYPERVISOR_set_trap_table, "h", traps);
3402 break;
3405 if ( copy_from_guest(&cur, traps, 1) )
3407 rc = -EFAULT;
3408 break;
3411 if ( cur.address == 0 )
3412 break;
3414 fixup_guest_code_selector(curr->domain, cur.cs);
3416 memcpy(&dst[cur.vector], &cur, sizeof(cur));
3418 if ( cur.vector == 0x80 )
3419 init_int80_direct_trap(curr);
3421 guest_handle_add_offset(traps, 1);
3424 return rc;
3427 long set_debugreg(struct vcpu *v, int reg, unsigned long value)
3429 int i;
3430 struct vcpu *curr = current;
3432 switch ( reg )
3434 case 0:
3435 if ( !access_ok(value, sizeof(long)) )
3436 return -EPERM;
3437 if ( v == curr )
3438 write_debugreg(0, value);
3439 break;
3440 case 1:
3441 if ( !access_ok(value, sizeof(long)) )
3442 return -EPERM;
3443 if ( v == curr )
3444 write_debugreg(1, value);
3445 break;
3446 case 2:
3447 if ( !access_ok(value, sizeof(long)) )
3448 return -EPERM;
3449 if ( v == curr )
3450 write_debugreg(2, value);
3451 break;
3452 case 3:
3453 if ( !access_ok(value, sizeof(long)) )
3454 return -EPERM;
3455 if ( v == curr )
3456 write_debugreg(3, value);
3457 break;
3458 case 6:
3459 /*
3460 * DR6: Bits 4-11,16-31 reserved (set to 1).
3461 * Bit 12 reserved (set to 0).
3462 */
3463 value &= 0xffffefff; /* reserved bits => 0 */
3464 value |= 0xffff0ff0; /* reserved bits => 1 */
3465 if ( v == curr )
3466 write_debugreg(6, value);
3467 break;
3468 case 7:
3469 /*
3470 * DR7: Bit 10 reserved (set to 1).
3471 * Bits 11-12,14-15 reserved (set to 0).
3472 */
3473 value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
3474 value |= DR_CONTROL_RESERVED_ONE; /* reserved bits => 1 */
3475 /*
3476 * Privileged bits:
3477 * GD (bit 13): must be 0.
3478 */
3479 if ( value & DR_GENERAL_DETECT )
3480 return -EPERM;
3481 /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
3482 if ( value & DR7_ACTIVE_MASK )
3484 unsigned int io_enable = 0;
3486 for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
3488 if ( ((value >> i) & 3) == DR_IO )
3490 if ( !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
3491 return -EPERM;
3492 io_enable |= value & (3 << ((i - 16) >> 1));
3494 #ifdef __i386__
3495 if ( ((boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
3496 !boot_cpu_has(X86_FEATURE_LM)) &&
3497 (((value >> i) & 0xc) == DR_LEN_8) )
3498 return -EPERM;
3499 #endif
3502 /* Guest DR5 is a handy stash for I/O intercept information. */
3503 v->arch.guest_context.debugreg[5] = io_enable;
3504 value &= ~io_enable;
3506 /*
3507 * If DR7 was previously clear then we need to load all other
3508 * debug registers at this point as they were not restored during
3509 * context switch.
3510 */
3511 if ( (v == curr) &&
3512 !(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
3514 write_debugreg(0, v->arch.guest_context.debugreg[0]);
3515 write_debugreg(1, v->arch.guest_context.debugreg[1]);
3516 write_debugreg(2, v->arch.guest_context.debugreg[2]);
3517 write_debugreg(3, v->arch.guest_context.debugreg[3]);
3518 write_debugreg(6, v->arch.guest_context.debugreg[6]);
3521 if ( v == curr )
3522 write_debugreg(7, value);
3523 break;
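/*
 * Writing DR7 with any breakpoint enabled walks the four condition
 * fields: I/O breakpoints require CR4.DE and their enable bits are
 * diverted into the debugreg[5] stash rather than hardware, and if DR7
 * was previously inactive the remaining debug registers are loaded first,
 * since the context-switch path skips them while debugging is off.
 */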
3524 default:
3525 return -EINVAL;
3528 v->arch.guest_context.debugreg[reg] = value;
3529 return 0;
3532 long do_set_debugreg(int reg, unsigned long value)
3534 return set_debugreg(current, reg, value);
3537 unsigned long do_get_debugreg(int reg)
3539 struct vcpu *curr = current;
3541 switch ( reg )
3543 case 0 ... 3:
3544 case 6:
3545 return curr->arch.guest_context.debugreg[reg];
3546 case 7:
3547 return (curr->arch.guest_context.debugreg[7] |
3548 curr->arch.guest_context.debugreg[5]);
3549 case 4 ... 5:
3550 return ((curr->arch.guest_context.ctrlreg[4] & X86_CR4_DE) ?
3551 curr->arch.guest_context.debugreg[reg + 2] : 0);
3554 return -EINVAL;
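/*
 * DR4 and DR5 are never stored directly (debugreg[5] doubles as the I/O
 * intercept stash), so reads of them are synthesised from DR6/DR7 or
 * return zero depending on CR4.DE, and DR7 reads fold the stashed I/O
 * intercept bits back in.
 */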
3557 /*
3558 * Local variables:
3559 * mode: C
3560 * c-set-style: "BSD"
3561 * c-basic-offset: 4
3562 * tab-width: 4
3563 * indent-tabs-mode: nil
3564 * End:
3565 */