/root/src/xen/xen/arch/x86/traps.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * arch/x86/traps.c |
3 | | * |
4 | | * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser |
5 | | * |
6 | | * This program is free software; you can redistribute it and/or modify |
7 | | * it under the terms of the GNU General Public License as published by |
8 | | * the Free Software Foundation; either version 2 of the License, or |
9 | | * (at your option) any later version. |
10 | | * |
11 | | * This program is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | | * GNU General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU General Public License |
17 | | * along with this program; If not, see <http://www.gnu.org/licenses/>. |
18 | | */ |
19 | | |
20 | | /* |
21 | | * Copyright (C) 1991, 1992 Linus Torvalds |
22 | | * |
23 | | * Pentium III FXSR, SSE support |
24 | | * Gareth Hughes <gareth@valinux.com>, May 2000 |
25 | | */ |
26 | | |
27 | | #include <xen/init.h> |
28 | | #include <xen/sched.h> |
29 | | #include <xen/lib.h> |
30 | | #include <xen/err.h> |
31 | | #include <xen/errno.h> |
32 | | #include <xen/mm.h> |
33 | | #include <xen/console.h> |
34 | | #include <xen/shutdown.h> |
35 | | #include <xen/guest_access.h> |
36 | | #include <asm/regs.h> |
37 | | #include <xen/delay.h> |
38 | | #include <xen/event.h> |
39 | | #include <xen/spinlock.h> |
40 | | #include <xen/irq.h> |
41 | | #include <xen/perfc.h> |
42 | | #include <xen/softirq.h> |
43 | | #include <xen/domain_page.h> |
44 | | #include <xen/symbols.h> |
45 | | #include <xen/iocap.h> |
46 | | #include <xen/version.h> |
47 | | #include <xen/kexec.h> |
48 | | #include <xen/trace.h> |
49 | | #include <xen/paging.h> |
50 | | #include <xen/virtual_region.h> |
51 | | #include <xen/watchdog.h> |
52 | | #include <xen/livepatch.h> |
53 | | #include <asm/system.h> |
54 | | #include <asm/io.h> |
55 | | #include <asm/atomic.h> |
56 | | #include <xen/bitops.h> |
57 | | #include <asm/desc.h> |
58 | | #include <asm/debugreg.h> |
59 | | #include <asm/smp.h> |
60 | | #include <asm/flushtlb.h> |
61 | | #include <asm/uaccess.h> |
62 | | #include <asm/i387.h> |
63 | | #include <asm/xstate.h> |
64 | | #include <asm/debugger.h> |
65 | | #include <asm/msr.h> |
66 | | #include <asm/nmi.h> |
67 | | #include <asm/shared.h> |
68 | | #include <asm/x86_emulate.h> |
69 | | #include <asm/traps.h> |
70 | | #include <asm/hvm/vpt.h> |
71 | | #include <asm/hypercall.h> |
72 | | #include <asm/mce.h> |
73 | | #include <asm/apic.h> |
74 | | #include <asm/mc146818rtc.h> |
75 | | #include <asm/hpet.h> |
76 | | #include <asm/vpmu.h> |
77 | | #include <public/arch-x86/cpuid.h> |
78 | | #include <asm/cpuid.h> |
79 | | #include <xsm/xsm.h> |
80 | | #include <asm/pv/traps.h> |
81 | | #include <asm/pv/mm.h> |
82 | | |
83 | | /* |
84 | | * opt_nmi: one of 'ignore', 'dom0', or 'fatal'. |
85 | | * fatal: Xen prints a diagnostic message and then hangs. |
86 | | * dom0: The NMI is virtualised to DOM0. |
87 | | * ignore: The NMI error is cleared and ignored. |
88 | | */ |
89 | | #ifdef NDEBUG |
90 | | static char __read_mostly opt_nmi[10] = "dom0"; |
91 | | #else |
92 | | static char __read_mostly opt_nmi[10] = "fatal"; |
93 | | #endif |
94 | | string_param("nmi", opt_nmi); |
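For example (illustrative command line, not part of this file): booting a release build with `nmi=fatal` overrides the `dom0` default above, so the SERR/IOCK/unknown-NMI handlers later in this file fall through to their `default:` cases and call fatal_trap() instead of forwarding the NMI to the hardware domain.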
95 | | |
96 | | DEFINE_PER_CPU(u64, efer); |
97 | | static DEFINE_PER_CPU(unsigned long, last_extable_addr); |
98 | | |
99 | | DEFINE_PER_CPU_READ_MOSTLY(u32, ler_msr); |
100 | | |
101 | | DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, gdt_table); |
102 | | DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, compat_gdt_table); |
103 | | |
104 | | /* Master table, used by CPU0. */ |
105 | | idt_entry_t idt_table[IDT_ENTRIES]; |
106 | | |
107 | | /* Pointer to the IDT of every CPU. */ |
108 | | idt_entry_t *idt_tables[NR_CPUS] __read_mostly; |
109 | | |
110 | | void (*ioemul_handle_quirk)( |
111 | | u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs); |
112 | | |
113 | | static int debug_stack_lines = 20; |
114 | | integer_param("debug_stack_lines", debug_stack_lines); |
115 | | |
116 | | static bool opt_ler; |
117 | | boolean_param("ler", opt_ler); |
118 | | |
119 | 0 | #define stack_words_per_line 4 |
120 | 0 | #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp) |
121 | | |
122 | | static void show_code(const struct cpu_user_regs *regs) |
123 | 0 | { |
124 | 0 | unsigned char insns_before[8] = {}, insns_after[16] = {}; |
125 | 0 | unsigned int i, tmp, missing_before, missing_after; |
126 | 0 |
127 | 0 | if ( guest_mode(regs) ) |
128 | 0 | return; |
129 | 0 |
130 | 0 | stac(); |
131 | 0 |
132 | 0 | /* |
133 | 0 | * Copy forward from regs->rip. In the case of a fault, %ecx contains the |
134 | 0 | * number of bytes remaining to copy. |
135 | 0 | */ |
136 | 0 | asm volatile ("1: rep movsb; 2:" |
137 | 0 | _ASM_EXTABLE(1b, 2b) |
138 | 0 | : "=&c" (missing_after), |
139 | 0 | "=&D" (tmp), "=&S" (tmp) |
140 | 0 | : "0" (ARRAY_SIZE(insns_after)), |
141 | 0 | "1" (insns_after), |
142 | 0 | "2" (regs->rip)); |
143 | 0 |
144 | 0 | /* |
145 | 0 | * Copy backwards from regs->rip - 1. In the case of a fault, %ecx |
146 | 0 | * contains the number of bytes remaining to copy. |
147 | 0 | */ |
148 | 0 | asm volatile ("std;" |
149 | 0 | "1: rep movsb;" |
150 | 0 | "2: cld;" |
151 | 0 | _ASM_EXTABLE(1b, 2b) |
152 | 0 | : "=&c" (missing_before), |
153 | 0 | "=&D" (tmp), "=&S" (tmp) |
154 | 0 | : "0" (ARRAY_SIZE(insns_before)), |
155 | 0 | "1" (insns_before + ARRAY_SIZE(insns_before) - 1), |
156 | 0 | "2" (regs->rip - 1)); |
157 | 0 | clac(); |
158 | 0 |
159 | 0 | printk("Xen code around <%p> (%ps)%s:\n", |
160 | 0 | _p(regs->rip), _p(regs->rip), |
161 | 0 | (missing_before || missing_after) ? " [fault on access]" : ""); |
162 | 0 |
163 | 0 | /* Print bytes from insns_before[]. */ |
164 | 0 | for ( i = 0; i < ARRAY_SIZE(insns_before); ++i ) |
165 | 0 | { |
166 | 0 | if ( i < missing_before ) |
167 | 0 | printk(" --"); |
168 | 0 | else |
169 | 0 | printk(" %02x", insns_before[i]); |
170 | 0 | } |
171 | 0 |
172 | 0 | /* Print the byte under %rip. */ |
173 | 0 | if ( missing_after != ARRAY_SIZE(insns_after) ) |
174 | 0 | printk(" <%02x>", insns_after[0]); |
175 | 0 | else |
176 | 0 | printk(" <-->"); |
177 | 0 |
178 | 0 | /* Print bytes from insns_after[]. */ |
179 | 0 | for ( i = 1; i < ARRAY_SIZE(insns_after); ++i ) |
180 | 0 | { |
181 | 0 | if ( i < (ARRAY_SIZE(insns_after) - missing_after) ) |
182 | 0 | printk(" %02x", insns_after[i]); |
183 | 0 | else |
184 | 0 | printk(" --"); |
185 | 0 | } |
186 | 0 |
187 | 0 | printk("\n"); |
188 | 0 | } |
189 | | |
190 | | static void compat_show_guest_stack(struct vcpu *v, |
191 | | const struct cpu_user_regs *regs, |
192 | | int debug_stack_lines) |
193 | 0 | { |
194 | 0 | unsigned int i, *stack, addr, mask = STACK_SIZE; |
195 | 0 |
196 | 0 | stack = (unsigned int *)(unsigned long)regs->esp; |
197 | 0 | printk("Guest stack trace from esp=%08lx:\n ", (unsigned long)stack); |
198 | 0 |
199 | 0 | if ( !__compat_access_ok(v->domain, stack, sizeof(*stack)) ) |
200 | 0 | { |
201 | 0 | printk("Guest-inaccessible memory.\n"); |
202 | 0 | return; |
203 | 0 | } |
204 | 0 |
205 | 0 | if ( v != current ) |
206 | 0 | { |
207 | 0 | struct vcpu *vcpu; |
208 | 0 | unsigned long mfn; |
209 | 0 |
210 | 0 | ASSERT(guest_kernel_mode(v, regs)); |
211 | 0 | mfn = read_cr3() >> PAGE_SHIFT; |
212 | 0 | for_each_vcpu( v->domain, vcpu ) |
213 | 0 | if ( pagetable_get_pfn(vcpu->arch.guest_table) == mfn ) |
214 | 0 | break; |
215 | 0 | if ( !vcpu ) |
216 | 0 | { |
217 | 0 | stack = do_page_walk(v, (unsigned long)stack); |
218 | 0 | if ( (unsigned long)stack < PAGE_SIZE ) |
219 | 0 | { |
220 | 0 | printk("Inaccessible guest memory.\n"); |
221 | 0 | return; |
222 | 0 | } |
223 | 0 | mask = PAGE_SIZE; |
224 | 0 | } |
225 | 0 | } |
226 | 0 |
227 | 0 | for ( i = 0; i < debug_stack_lines * 8; i++ ) |
228 | 0 | { |
229 | 0 | if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & mask ) |
230 | 0 | break; |
231 | 0 | if ( __get_user(addr, stack) ) |
232 | 0 | { |
233 | 0 | if ( i != 0 ) |
234 | 0 | printk("\n "); |
235 | 0 | printk("Fault while accessing guest memory."); |
236 | 0 | i = 1; |
237 | 0 | break; |
238 | 0 | } |
239 | 0 | if ( (i != 0) && ((i % 8) == 0) ) |
240 | 0 | printk("\n "); |
241 | 0 | printk(" %08x", addr); |
242 | 0 | stack++; |
243 | 0 | } |
244 | 0 | if ( mask == PAGE_SIZE ) |
245 | 0 | { |
246 | 0 | BUILD_BUG_ON(PAGE_SIZE == STACK_SIZE); |
247 | 0 | unmap_domain_page(stack); |
248 | 0 | } |
249 | 0 | if ( i == 0 ) |
250 | 0 | printk("Stack empty."); |
251 | 0 | printk("\n"); |
252 | 0 | } |
253 | | |
254 | | static void show_guest_stack(struct vcpu *v, const struct cpu_user_regs *regs) |
255 | 0 | { |
256 | 0 | int i; |
257 | 0 | unsigned long *stack, addr; |
258 | 0 | unsigned long mask = STACK_SIZE; |
259 | 0 |
260 | 0 | /* Avoid HVM as we don't know what the stack looks like. */ |
261 | 0 | if ( is_hvm_vcpu(v) ) |
262 | 0 | return; |
263 | 0 |
264 | 0 | if ( is_pv_32bit_vcpu(v) ) |
265 | 0 | { |
266 | 0 | compat_show_guest_stack(v, regs, debug_stack_lines); |
267 | 0 | return; |
268 | 0 | } |
269 | 0 |
270 | 0 | stack = (unsigned long *)regs->rsp; |
271 | 0 | printk("Guest stack trace from "__OP"sp=%p:\n ", stack); |
272 | 0 |
273 | 0 | if ( !access_ok(stack, sizeof(*stack)) ) |
274 | 0 | { |
275 | 0 | printk("Guest-inaccessible memory.\n"); |
276 | 0 | return; |
277 | 0 | } |
278 | 0 |
279 | 0 | if ( v != current ) |
280 | 0 | { |
281 | 0 | struct vcpu *vcpu; |
282 | 0 |
283 | 0 | ASSERT(guest_kernel_mode(v, regs)); |
284 | 0 | vcpu = maddr_get_owner(read_cr3()) == v->domain ? v : NULL; |
285 | 0 | if ( !vcpu ) |
286 | 0 | { |
287 | 0 | stack = do_page_walk(v, (unsigned long)stack); |
288 | 0 | if ( (unsigned long)stack < PAGE_SIZE ) |
289 | 0 | { |
290 | 0 | printk("Inaccessible guest memory.\n"); |
291 | 0 | return; |
292 | 0 | } |
293 | 0 | mask = PAGE_SIZE; |
294 | 0 | } |
295 | 0 | } |
296 | 0 |
297 | 0 | for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ ) |
298 | 0 | { |
299 | 0 | if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & mask ) |
300 | 0 | break; |
301 | 0 | if ( __get_user(addr, stack) ) |
302 | 0 | { |
303 | 0 | if ( i != 0 ) |
304 | 0 | printk("\n "); |
305 | 0 | printk("Fault while accessing guest memory."); |
306 | 0 | i = 1; |
307 | 0 | break; |
308 | 0 | } |
309 | 0 | if ( (i != 0) && ((i % stack_words_per_line) == 0) ) |
310 | 0 | printk("\n "); |
311 | 0 | printk(" %p", _p(addr)); |
312 | 0 | stack++; |
313 | 0 | } |
314 | 0 | if ( mask == PAGE_SIZE ) |
315 | 0 | { |
316 | 0 | BUILD_BUG_ON(PAGE_SIZE == STACK_SIZE); |
317 | 0 | unmap_domain_page(stack); |
318 | 0 | } |
319 | 0 | if ( i == 0 ) |
320 | 0 | printk("Stack empty."); |
321 | 0 | printk("\n"); |
322 | 0 | } |
323 | | |
324 | | /* |
325 | | * Notes for get_stack_trace_bottom() and get_stack_dump_bottom() |
326 | | * |
327 | | * Stack pages 0, 1 and 2: |
328 | | * These are all 1-page IST stacks. Each of these stacks have an exception |
329 | | * frame and saved register state at the top. The interesting bound for a |
330 | | * trace is the word adjacent to this, while the bound for a dump is the |
331 | | * very top, including the exception frame. |
332 | | * |
333 | | * Stack pages 3, 4 and 5: |
334 | | * None of these are particularly interesting. With MEMORY_GUARD, page 5 is |
335 | | * explicitly not present, so attempting to dump or trace it is |
336 | | * counterproductive. Without MEMORY_GUARD, it is possible for a call chain |
337 | | * to use the entire primary stack and wander into page 5. In this case, |
338 | | * consider these pages an extension of the primary stack, to aid debugging |
339 | | * the hopefully-rare situations where the primary stack has effectively |
340 | | * been overflowed. |
341 | | * |
342 | | * Stack pages 6 and 7: |
343 | | * These form the primary stack, and have a cpu_info at the top. For a |
344 | | * trace, the interesting bound is adjacent to the cpu_info, while for a |
345 | | * dump, the entire cpu_info is interesting. |
346 | | * |
347 | | * For the cases where the stack should not be inspected, pretend that the |
348 | | * passed stack pointer is already out of reasonable bounds. |
349 | | */ |
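A minimal sketch of the arithmetic the two helpers below rely on, assuming Xen's conventional 8-page (32 KiB) per-CPU stack and the usual page-index computation (both assumptions; neither is defined in this file):

    /* Illustrative only: mirrors the bounds logic, does not replace it. */
    #define EX_PAGE_SHIFT 12
    #define EX_PAGE_SIZE  (1UL << EX_PAGE_SHIFT)
    #define EX_STACK_SIZE (8 * EX_PAGE_SIZE)       /* assumed 8-page stack */

    /* Which of the eight stack pages does this stack pointer live in? */
    static inline unsigned int ex_stack_page(unsigned long sp)
    {
        return (sp & (EX_STACK_SIZE - 1)) >> EX_PAGE_SHIFT;
    }

    /*
     * For sp in pages 6-7 (the primary stack), the dump bottom is the last
     * word of the stack allocation, while the trace bottom also steps below
     * the cpu_info block sitting at the very top, as described above.
     */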
350 | | unsigned long get_stack_trace_bottom(unsigned long sp) |
351 | 0 | { |
352 | 0 | switch ( get_stack_page(sp) ) |
353 | 0 | { |
354 | 0 | case 0 ... 2: |
355 | 0 | return ROUNDUP(sp, PAGE_SIZE) - |
356 | 0 | offsetof(struct cpu_user_regs, es) - sizeof(unsigned long); |
357 | 0 |
358 | 0 | #ifndef MEMORY_GUARD |
359 | | case 3 ... 5: |
360 | | #endif |
361 | 0 | case 6 ... 7: |
362 | 0 | return ROUNDUP(sp, STACK_SIZE) - |
363 | 0 | sizeof(struct cpu_info) - sizeof(unsigned long); |
364 | 0 |
365 | 0 | default: |
366 | 0 | return sp - sizeof(unsigned long); |
367 | 0 | } |
368 | 0 | } |
369 | | |
370 | | unsigned long get_stack_dump_bottom(unsigned long sp) |
371 | 0 | { |
372 | 0 | switch ( get_stack_page(sp) ) |
373 | 0 | { |
374 | 0 | case 0 ... 2: |
375 | 0 | return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long); |
376 | 0 |
377 | 0 | #ifndef MEMORY_GUARD |
378 | | case 3 ... 5: |
379 | | #endif |
380 | 0 | case 6 ... 7: |
381 | 0 | return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long); |
382 | 0 |
383 | 0 | default: |
384 | 0 | return sp - sizeof(unsigned long); |
385 | 0 | } |
386 | 0 | } |
387 | | |
388 | | #if !defined(CONFIG_FRAME_POINTER) |
389 | | |
390 | | /* |
391 | | * Stack trace from pointers found in stack, unaided by frame pointers. For |
392 | | * caller convenience, this has the same prototype as its alternative, and |
393 | | * simply ignores the base pointer parameter. |
394 | | */ |
395 | | static void _show_trace(unsigned long sp, unsigned long __maybe_unused bp) |
396 | | { |
397 | | unsigned long *stack = (unsigned long *)sp, addr; |
398 | | unsigned long *bottom = (unsigned long *)get_stack_trace_bottom(sp); |
399 | | |
400 | | while ( stack <= bottom ) |
401 | | { |
402 | | addr = *stack++; |
403 | | if ( is_active_kernel_text(addr) ) |
404 | | printk(" [<%p>] %pS\n", _p(addr), _p(addr)); |
405 | | } |
406 | | } |
407 | | |
408 | | #else |
409 | | |
410 | | /* Stack trace from frames in the stack, using frame pointers */ |
411 | | static void _show_trace(unsigned long sp, unsigned long bp) |
412 | 0 | { |
413 | 0 | unsigned long *frame, next, addr; |
414 | 0 |
415 | 0 | /* Bounds for range of valid frame pointer. */ |
416 | 0 | unsigned long low = sp, high = get_stack_trace_bottom(sp); |
417 | 0 |
418 | 0 | /* The initial frame pointer. */ |
419 | 0 | next = bp; |
420 | 0 |
421 | 0 | for ( ; ; ) |
422 | 0 | { |
423 | 0 | /* Valid frame pointer? */ |
424 | 0 | if ( (next < low) || (next >= high) ) |
425 | 0 | { |
426 | 0 | /* |
427 | 0 | * Exception stack frames have a different layout, denoted by an |
428 | 0 | * inverted frame pointer. |
429 | 0 | */ |
430 | 0 | next = ~next; |
431 | 0 | if ( (next < low) || (next >= high) ) |
432 | 0 | break; |
433 | 0 | frame = (unsigned long *)next; |
434 | 0 | next = frame[0]; |
435 | 0 | addr = frame[(offsetof(struct cpu_user_regs, rip) - |
436 | 0 | offsetof(struct cpu_user_regs, rbp)) |
437 | 0 | / BYTES_PER_LONG]; |
438 | 0 | } |
439 | 0 | else |
440 | 0 | { |
441 | 0 | /* Ordinary stack frame. */ |
442 | 0 | frame = (unsigned long *)next; |
443 | 0 | next = frame[0]; |
444 | 0 | addr = frame[1]; |
445 | 0 | } |
446 | 0 |
447 | 0 | printk(" [<%p>] %pS\n", _p(addr), _p(addr)); |
448 | 0 |
449 | 0 | low = (unsigned long)&frame[2]; |
450 | 0 | } |
451 | 0 | } |
452 | | |
453 | | #endif |
454 | | |
455 | | static void show_trace(const struct cpu_user_regs *regs) |
456 | 0 | { |
457 | 0 | unsigned long *sp = ESP_BEFORE_EXCEPTION(regs); |
458 | 0 |
459 | 0 | printk("Xen call trace:\n"); |
460 | 0 |
461 | 0 | /* |
462 | 0 | * If RIP looks sensible, or the top of the stack doesn't, print RIP at |
463 | 0 | * the top of the stack trace. |
464 | 0 | */ |
465 | 0 | if ( is_active_kernel_text(regs->rip) || |
466 | 0 | !is_active_kernel_text(*sp) ) |
467 | 0 | printk(" [<%p>] %pS\n", _p(regs->rip), _p(regs->rip)); |
468 | 0 | /* |
469 | 0 | * Else RIP looks bad but the top of the stack looks good. Perhaps we |
470 | 0 | * followed a wild function pointer? Let's assume the top of the stack is a |
471 | 0 | * return address; print it and skip past so _show_trace() doesn't print |
472 | 0 | * it again. |
473 | 0 | */ |
474 | 0 | else |
475 | 0 | { |
476 | 0 | printk(" [<%p>] %pS\n", _p(*sp), _p(*sp)); |
477 | 0 | sp++; |
478 | 0 | } |
479 | 0 |
480 | 0 | _show_trace((unsigned long)sp, regs->rbp); |
481 | 0 |
482 | 0 | printk("\n"); |
483 | 0 | } |
484 | | |
485 | | void show_stack(const struct cpu_user_regs *regs) |
486 | 0 | { |
487 | 0 | unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), *stack_bottom, addr; |
488 | 0 | int i; |
489 | 0 |
490 | 0 | if ( guest_mode(regs) ) |
491 | 0 | return show_guest_stack(current, regs); |
492 | 0 |
493 | 0 | printk("Xen stack trace from "__OP"sp=%p:\n ", stack); |
494 | 0 |
495 | 0 | stack_bottom = _p(get_stack_dump_bottom(regs->rsp)); |
496 | 0 |
497 | 0 | for ( i = 0; i < (debug_stack_lines*stack_words_per_line) && |
498 | 0 | (stack <= stack_bottom); i++ ) |
499 | 0 | { |
500 | 0 | if ( (i != 0) && ((i % stack_words_per_line) == 0) ) |
501 | 0 | printk("\n "); |
502 | 0 | addr = *stack++; |
503 | 0 | printk(" %p", _p(addr)); |
504 | 0 | } |
505 | 0 | if ( i == 0 ) |
506 | 0 | printk("Stack empty."); |
507 | 0 | printk("\n"); |
508 | 0 |
509 | 0 | show_trace(regs); |
510 | 0 | } |
511 | | |
512 | | void show_stack_overflow(unsigned int cpu, const struct cpu_user_regs *regs) |
513 | 0 | { |
514 | 0 | unsigned long esp = regs->rsp; |
515 | 0 | unsigned long curr_stack_base = esp & ~(STACK_SIZE - 1); |
516 | 0 | #ifdef MEMORY_GUARD |
517 | 0 | unsigned long esp_top, esp_bottom; |
518 | 0 | #endif |
519 | 0 |
520 | 0 | if ( _p(curr_stack_base) != stack_base[cpu] ) |
521 | 0 | printk("Current stack base %p differs from expected %p\n", |
522 | 0 | _p(curr_stack_base), stack_base[cpu]); |
523 | 0 |
524 | 0 | #ifdef MEMORY_GUARD |
525 | 0 | esp_bottom = (esp | (STACK_SIZE - 1)) + 1; |
526 | 0 | esp_top = esp_bottom - PRIMARY_STACK_SIZE; |
527 | 0 |
528 | 0 | printk("Valid stack range: %p-%p, sp=%p, tss.rsp0=%p\n", |
529 | 0 | (void *)esp_top, (void *)esp_bottom, (void *)esp, |
530 | 0 | (void *)per_cpu(init_tss, cpu).rsp0); |
531 | 0 |
532 | 0 | /* |
533 | 0 | * Trigger overflow trace if %esp is anywhere within the guard page, or |
534 | 0 | * with fewer than 512 bytes remaining on the primary stack. |
535 | 0 | */ |
536 | 0 | if ( (esp > (esp_top + 512)) || |
537 | 0 | (esp < (esp_top - PAGE_SIZE)) ) |
538 | 0 | { |
539 | 0 | printk("No stack overflow detected. Skipping stack trace.\n"); |
540 | 0 | return; |
541 | 0 | } |
542 | 0 |
543 | 0 | if ( esp < esp_top ) |
544 | 0 | esp = esp_top; |
545 | 0 |
546 | 0 | printk("Xen stack overflow (dumping trace %p-%p):\n", |
547 | 0 | (void *)esp, (void *)esp_bottom); |
548 | 0 |
549 | 0 | _show_trace(esp, regs->rbp); |
550 | 0 |
551 | 0 | printk("\n"); |
552 | 0 | #endif |
553 | 0 | } |
554 | | |
555 | | void show_execution_state(const struct cpu_user_regs *regs) |
556 | 0 | { |
557 | 0 | /* Prevent interleaving of output. */ |
558 | 0 | unsigned long flags = console_lock_recursive_irqsave(); |
559 | 0 |
560 | 0 | show_registers(regs); |
561 | 0 | show_code(regs); |
562 | 0 | show_stack(regs); |
563 | 0 |
564 | 0 | console_unlock_recursive_irqrestore(flags); |
565 | 0 | } |
566 | | |
567 | | void vcpu_show_execution_state(struct vcpu *v) |
568 | 0 | { |
569 | 0 | unsigned long flags; |
570 | 0 |
571 | 0 | printk("*** Dumping Dom%d vcpu#%d state: ***\n", |
572 | 0 | v->domain->domain_id, v->vcpu_id); |
573 | 0 |
574 | 0 | if ( v == current ) |
575 | 0 | { |
576 | 0 | show_execution_state(guest_cpu_user_regs()); |
577 | 0 | return; |
578 | 0 | } |
579 | 0 |
580 | 0 | vcpu_pause(v); /* acceptably dangerous */ |
581 | 0 |
582 | 0 | /* Prevent interleaving of output. */ |
583 | 0 | flags = console_lock_recursive_irqsave(); |
584 | 0 |
585 | 0 | vcpu_show_registers(v); |
586 | 0 | if ( guest_kernel_mode(v, &v->arch.user_regs) ) |
587 | 0 | show_guest_stack(v, &v->arch.user_regs); |
588 | 0 |
589 | 0 | console_unlock_recursive_irqrestore(flags); |
590 | 0 |
591 | 0 | vcpu_unpause(v); |
592 | 0 | } |
593 | | |
594 | | static cpumask_t show_state_mask; |
595 | | static bool opt_show_all; |
596 | | boolean_param("async-show-all", opt_show_all); |
597 | | |
598 | | static int nmi_show_execution_state(const struct cpu_user_regs *regs, int cpu) |
599 | 0 | { |
600 | 0 | if ( !cpumask_test_cpu(cpu, &show_state_mask) ) |
601 | 0 | return 0; |
602 | 0 |
603 | 0 | if ( opt_show_all ) |
604 | 0 | show_execution_state(regs); |
605 | 0 | else |
606 | 0 | printk(XENLOG_ERR "CPU%d @ %04x:%08lx (%pS)\n", cpu, regs->cs, |
607 | 0 | regs->rip, guest_mode(regs) ? _p(regs->rip) : NULL); |
608 | 0 | cpumask_clear_cpu(cpu, &show_state_mask); |
609 | 0 |
610 | 0 | return 1; |
611 | 0 | } |
612 | | |
613 | | const char *trapstr(unsigned int trapnr) |
614 | 0 | { |
615 | 0 | static const char * const strings[] = { |
616 | 0 | "divide error", "debug", "nmi", "bkpt", "overflow", "bounds", |
617 | 0 | "invalid opcode", "device not available", "double fault", |
618 | 0 | "coprocessor segment", "invalid tss", "segment not found", |
619 | 0 | "stack error", "general protection fault", "page fault", |
620 | 0 | "spurious interrupt", "coprocessor error", "alignment check", |
621 | 0 | "machine check", "simd error", "virtualisation exception" |
622 | 0 | }; |
623 | 0 |
624 | 0 | return trapnr < ARRAY_SIZE(strings) ? strings[trapnr] : "???"; |
625 | 0 | } |
626 | | |
627 | | /* |
628 | | * This is called for faults at very unexpected times (e.g., when interrupts |
629 | | * are disabled). In such situations we can't do much that is safe. We try to |
630 | | * print out some tracing and then we just spin. |
631 | | */ |
632 | | void fatal_trap(const struct cpu_user_regs *regs, bool show_remote) |
633 | 0 | { |
634 | 0 | static DEFINE_PER_CPU(char, depth); |
635 | 0 | unsigned int trapnr = regs->entry_vector; |
636 | 0 |
|
637 | 0 | /* Set AC to reduce the chance of further SMAP faults */ |
638 | 0 | stac(); |
639 | 0 |
640 | 0 | /* |
641 | 0 | * In some cases, we can end up in a vicious cycle of fatal_trap()s |
642 | 0 | * within fatal_trap()s. We give the problem a couple of iterations to |
643 | 0 | * bottom out, and then we just panic. |
644 | 0 | */ |
645 | 0 | if ( ++this_cpu(depth) < 3 ) |
646 | 0 | { |
647 | 0 | watchdog_disable(); |
648 | 0 | console_start_sync(); |
649 | 0 |
650 | 0 | show_execution_state(regs); |
651 | 0 |
652 | 0 | if ( trapnr == TRAP_page_fault ) |
653 | 0 | { |
654 | 0 | unsigned long cr2 = read_cr2(); |
655 | 0 | printk("Faulting linear address: %p\n", _p(cr2)); |
656 | 0 | show_page_walk(cr2); |
657 | 0 | } |
658 | 0 |
659 | 0 | if ( show_remote ) |
660 | 0 | { |
661 | 0 | unsigned int msecs, pending; |
662 | 0 |
663 | 0 | cpumask_andnot(&show_state_mask, &cpu_online_map, |
664 | 0 | cpumask_of(smp_processor_id())); |
665 | 0 | set_nmi_callback(nmi_show_execution_state); |
666 | 0 | /* Ensure new callback is set before sending out the NMI. */ |
667 | 0 | smp_wmb(); |
668 | 0 | smp_send_nmi_allbutself(); |
669 | 0 |
670 | 0 | /* Wait at most 10ms for some other CPU to respond. */ |
671 | 0 | msecs = 10; |
672 | 0 | pending = cpumask_weight(&show_state_mask); |
673 | 0 | while ( pending && msecs-- ) |
674 | 0 | { |
675 | 0 | unsigned int left; |
676 | 0 |
677 | 0 | mdelay(1); |
678 | 0 | left = cpumask_weight(&show_state_mask); |
679 | 0 | if ( left < pending ) |
680 | 0 | { |
681 | 0 | pending = left; |
682 | 0 | msecs = 10; |
683 | 0 | } |
684 | 0 | } |
685 | 0 | } |
686 | 0 | } |
687 | 0 |
688 | 0 | panic("FATAL TRAP: vector = %d (%s)\n" |
689 | 0 | "[error_code=%04x] %s", |
690 | 0 | trapnr, trapstr(trapnr), regs->error_code, |
691 | 0 | (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT"); |
692 | 0 | } |
693 | | |
694 | | void do_reserved_trap(struct cpu_user_regs *regs) |
695 | 0 | { |
696 | 0 | unsigned int trapnr = regs->entry_vector; |
697 | 0 |
|
698 | 0 | if ( debugger_trap_fatal(trapnr, regs) ) |
699 | 0 | return; |
700 | 0 |
701 | 0 | show_execution_state(regs); |
702 | 0 | panic("FATAL RESERVED TRAP %#x: %s", trapnr, trapstr(trapnr)); |
703 | 0 | } |
704 | | |
705 | | void do_trap(struct cpu_user_regs *regs) |
706 | 1 | { |
707 | 1 | struct vcpu *curr = current; |
708 | 1 | unsigned int trapnr = regs->entry_vector; |
709 | 1 | unsigned long fixup; |
710 | 1 | |
711 | 1 | if ( regs->error_code & X86_XEC_EXT ) |
712 | 0 | goto hardware_trap; |
713 | 1 | |
714 | 1 | if ( debugger_trap_entry(trapnr, regs) ) |
715 | 0 | return; |
716 | 1 | |
717 | 1 | ASSERT(trapnr < 32); |
718 | 1 | |
719 | 1 | if ( guest_mode(regs) ) |
720 | 0 | { |
721 | 0 | pv_inject_hw_exception(trapnr, |
722 | 0 | (TRAP_HAVE_EC & (1u << trapnr)) |
723 | 0 | ? regs->error_code : X86_EVENT_NO_EC); |
724 | 0 | return; |
725 | 0 | } |
726 | 1 | |
727 | 1 | if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) && |
728 | 0 | system_state >= SYS_STATE_active && is_hvm_vcpu(curr) && |
729 | 0 | curr->arch.hvm_vcpu.fpu_exception_callback ) |
730 | 0 | { |
731 | 0 | curr->arch.hvm_vcpu.fpu_exception_callback( |
732 | 0 | curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs); |
733 | 0 | return; |
734 | 0 | } |
735 | 1 | |
736 | 1 | if ( likely((fixup = search_exception_table(regs)) != 0) ) |
737 | 1 | { |
738 | 1 | dprintk(XENLOG_ERR, "Trap %u: %p [%ps] -> %p\n", |
739 | 1 | trapnr, _p(regs->rip), _p(regs->rip), _p(fixup)); |
740 | 1 | this_cpu(last_extable_addr) = regs->rip; |
741 | 1 | regs->rip = fixup; |
742 | 1 | return; |
743 | 1 | } |
744 | 1 | |
745 | 0 | hardware_trap: |
746 | 0 | if ( debugger_trap_fatal(trapnr, regs) ) |
747 | 0 | return; |
748 | 0 |
749 | 0 | show_execution_state(regs); |
750 | 0 | panic("FATAL TRAP: vector = %d (%s)\n" |
751 | 0 | "[error_code=%04x]", |
752 | 0 | trapnr, trapstr(trapnr), regs->error_code); |
753 | 0 | } |
754 | | |
755 | | /* Returns 0 if not handled, and non-0 for success. */ |
756 | | int rdmsr_hypervisor_regs(uint32_t idx, uint64_t *val) |
757 | 0 | { |
758 | 0 | struct domain *d = current->domain; |
759 | 0 | /* Optionally shift out of the way of Viridian architectural MSRs. */ |
760 | 0 | uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000; |
761 | 0 |
762 | 0 | switch ( idx - base ) |
763 | 0 | { |
764 | 0 | case 0: /* Write hypercall page MSR. Read as zero. */ |
765 | 0 | { |
766 | 0 | *val = 0; |
767 | 0 | return 1; |
768 | 0 | } |
769 | 0 | } |
770 | 0 |
771 | 0 | return 0; |
772 | 0 | } |
773 | | |
774 | | /* Returns 1 if handled, 0 if not and -Exx for error. */ |
775 | | int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val) |
776 | 2 | { |
777 | 2 | struct domain *d = current->domain; |
778 | 2 | /* Optionally shift out of the way of Viridian architectural MSRs. */ |
779 | 2 | uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000; |
780 | 2 | |
781 | 2 | switch ( idx - base ) |
782 | 2 | { |
783 | 2 | case 0: /* Write hypercall page */ |
784 | 2 | { |
785 | 2 | void *hypercall_page; |
786 | 2 | unsigned long gmfn = val >> PAGE_SHIFT; |
787 | 2 | unsigned int page_index = val & (PAGE_SIZE - 1); |
788 | 2 | struct page_info *page; |
789 | 2 | p2m_type_t t; |
790 | 2 | |
791 | 2 | if ( page_index > 0 ) |
792 | 0 | { |
793 | 0 | gdprintk(XENLOG_WARNING, |
794 | 0 | "wrmsr hypercall page index %#x unsupported\n", |
795 | 0 | page_index); |
796 | 0 | return 0; |
797 | 0 | } |
798 | 2 | |
799 | 2 | page = get_page_from_gfn(d, gmfn, &t, P2M_ALLOC); |
800 | 2 | |
801 | 2 | if ( !page || !get_page_type(page, PGT_writable_page) ) |
802 | 0 | { |
803 | 0 | if ( page ) |
804 | 0 | put_page(page); |
805 | 0 |
806 | 0 | if ( p2m_is_paging(t) ) |
807 | 0 | { |
808 | 0 | p2m_mem_paging_populate(d, gmfn); |
809 | 0 | return -ERESTART; |
810 | 0 | } |
811 | 0 |
812 | 0 | gdprintk(XENLOG_WARNING, |
813 | 0 | "Bad GMFN %lx (MFN %lx) to MSR %08x\n", |
814 | 0 | gmfn, page ? page_to_mfn(page) : -1UL, base); |
815 | 0 | return 0; |
816 | 0 | } |
817 | 2 | |
818 | 2 | hypercall_page = __map_domain_page(page); |
819 | 2 | hypercall_page_initialise(d, hypercall_page); |
820 | 2 | unmap_domain_page(hypercall_page); |
821 | 2 | |
822 | 2 | put_page_and_type(page); |
823 | 2 | return 1; |
824 | 2 | } |
825 | 2 | } |
826 | 2 | |
827 | 0 | return 0; |
828 | 2 | } |
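From the guest's side, the two MSR handlers above are driven by a CPUID-then-WRMSR sequence: leaf base+2 reports the hypercall page count and the MSR base, and writing a page-aligned guest-physical address (low 12 bits select the page index, which must be 0 here) installs the page. A hedged guest-side sketch against the non-Viridian base; the function and variable names are invented for illustration:

    #include <stdint.h>

    static inline void cpuid(uint32_t leaf,
                             uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    {
        asm volatile ( "cpuid"
                       : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
                       : "0" (leaf), "2" (0) );
    }

    static inline void wrmsr(uint32_t msr, uint64_t val)
    {
        asm volatile ( "wrmsr"
                       :: "c" (msr), "a" ((uint32_t)val),
                          "d" ((uint32_t)(val >> 32)) );
    }

    void guest_init_hypercall_page(uint64_t page_gpa /* page-aligned GPA */)
    {
        uint32_t eax, ebx, ecx, edx;

        /* Leaf base+2: eax = number of pages, ebx = MSR base address. */
        cpuid(0x40000002, &eax, &ebx, &ecx, &edx);

        /* Index 0 in the low 12 bits; Xen fills the page with stubs. */
        wrmsr(ebx, page_gpa | 0);
    }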
829 | | |
830 | | void cpuid_hypervisor_leaves(const struct vcpu *v, uint32_t leaf, |
831 | | uint32_t subleaf, struct cpuid_leaf *res) |
832 | 9 | { |
833 | 9 | const struct domain *d = v->domain; |
834 | 9 | const struct cpuid_policy *p = d->arch.cpuid; |
835 | 9 | uint32_t base = is_viridian_domain(d) ? 0x40000100 : 0x40000000; |
836 | 9 | uint32_t idx = leaf - base; |
837 | 9 | unsigned int limit = is_viridian_domain(d) ? p->hv2_limit : p->hv_limit; |
838 | 9 | |
839 | 9 | if ( limit == 0 ) |
840 | 9 | /* Default number of leaves */ |
841 | 9 | limit = XEN_CPUID_MAX_NUM_LEAVES; |
842 | 9 | else |
843 | 9 | /* Clamp toolstack value between 2 and MAX_NUM_LEAVES. */ |
844 | 0 | limit = min(max(limit, 2u), XEN_CPUID_MAX_NUM_LEAVES + 0u); |
845 | 9 | |
846 | 9 | if ( idx > limit ) |
847 | 0 | return; |
848 | 9 | |
849 | 9 | switch ( idx ) |
850 | 9 | { |
851 | 4 | case 0: |
852 | 4 | res->a = base + limit; /* Largest leaf */ |
853 | 4 | res->b = XEN_CPUID_SIGNATURE_EBX; |
854 | 4 | res->c = XEN_CPUID_SIGNATURE_ECX; |
855 | 4 | res->d = XEN_CPUID_SIGNATURE_EDX; |
856 | 4 | break; |
857 | 4 | |
858 | 2 | case 1: |
859 | 2 | res->a = (xen_major_version() << 16) | xen_minor_version(); |
860 | 2 | break; |
861 | 4 | |
862 | 2 | case 2: |
863 | 2 | res->a = 1; /* Number of hypercall-transfer pages */ |
864 | 2 | /* MSR base address */ |
865 | 2 | res->b = is_viridian_domain(d) ? 0x40000200 : 0x40000000; |
866 | 2 | if ( is_pv_domain(d) ) /* Features */ |
867 | 0 | res->c |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD; |
868 | 2 | break; |
869 | 4 | |
870 | 0 | case 3: /* Time leaf. */ |
871 | 0 | switch ( subleaf ) |
872 | 0 | { |
873 | 0 | case 0: /* features */ |
874 | 0 | res->a = ((d->arch.vtsc << 0) | |
875 | 0 | (!!host_tsc_is_safe() << 1) | |
876 | 0 | (!!boot_cpu_has(X86_FEATURE_RDTSCP) << 2)); |
877 | 0 | res->b = d->arch.tsc_mode; |
878 | 0 | res->c = d->arch.tsc_khz; |
879 | 0 | res->d = d->arch.incarnation; |
880 | 0 | break; |
881 | 0 |
882 | 0 | case 1: /* scale and offset */ |
883 | 0 | { |
884 | 0 | uint64_t offset; |
885 | 0 |
|
886 | 0 | if ( !d->arch.vtsc ) |
887 | 0 | offset = d->arch.vtsc_offset; |
888 | 0 | else |
889 | 0 | /* offset already applied to value returned by virtual rdtscp */ |
890 | 0 | offset = 0; |
891 | 0 | res->a = offset; |
892 | 0 | res->b = offset >> 32; |
893 | 0 | res->c = d->arch.vtsc_to_ns.mul_frac; |
894 | 0 | res->d = (s8)d->arch.vtsc_to_ns.shift; |
895 | 0 | break; |
896 | 0 | } |
897 | 0 |
898 | 0 | case 2: /* physical cpu_khz */ |
899 | 0 | res->a = cpu_khz; |
900 | 0 | break; |
901 | 0 | } |
902 | 0 | break; |
903 | 0 |
904 | 1 | case 4: /* HVM hypervisor leaf. */ |
905 | 1 | if ( !is_hvm_domain(d) || subleaf != 0 ) |
906 | 1 | break; |
907 | 1 | |
908 | 0 | if ( cpu_has_vmx_apic_reg_virt ) |
909 | 0 | res->a |= XEN_HVM_CPUID_APIC_ACCESS_VIRT; |
910 | 0 |
911 | 0 | /* |
912 | 0 | * We want to claim that x2APIC is virtualized if APIC MSR accesses |
913 | 0 | * are not intercepted. When all three of these are true both rdmsr |
914 | 0 | * and wrmsr in the guest will run without VMEXITs (see |
915 | 0 | * vmx_vlapic_msr_changed()). |
916 | 0 | */ |
917 | 0 | if ( cpu_has_vmx_virtualize_x2apic_mode && |
918 | 0 | cpu_has_vmx_apic_reg_virt && |
919 | 0 | cpu_has_vmx_virtual_intr_delivery ) |
920 | 0 | res->a |= XEN_HVM_CPUID_X2APIC_VIRT; |
921 | 0 |
922 | 0 | /* |
923 | 0 | * Indicate that memory mapped from other domains (either grants or |
924 | 0 | * foreign pages) has valid IOMMU entries. |
925 | 0 | */ |
926 | 0 | res->a |= XEN_HVM_CPUID_IOMMU_MAPPINGS; |
927 | 0 |
928 | 0 | /* Indicate presence of vcpu id and set it in ebx */ |
929 | 0 | res->a |= XEN_HVM_CPUID_VCPU_ID_PRESENT; |
930 | 0 | res->b = v->vcpu_id; |
931 | 0 | break; |
932 | 1 | |
933 | 0 | case 5: /* PV-specific parameters */ |
934 | 0 | if ( is_hvm_domain(d) || subleaf != 0 ) |
935 | 0 | break; |
936 | 0 |
|
937 | 0 | res->b = flsl(get_upper_mfn_bound()) + PAGE_SHIFT; |
938 | 0 | break; |
939 | 0 |
940 | 0 | default: |
941 | 0 | ASSERT_UNREACHABLE(); |
942 | 9 | } |
943 | 9 | } |
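For reference, a guest typically probes these leaves by reading leaf 0x40000000, checking the 12-byte signature returned in ebx/ecx/edx, and then pulling the version from leaf 1 (major in the high 16 bits, minor in the low 16, as the code above shows). A sketch; "XenVMMXenVMM" is Xen's published signature string, the helper names are invented:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static inline void cpuid(uint32_t leaf,
                             uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    {
        asm volatile ( "cpuid"
                       : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
                       : "0" (leaf), "2" (0) );
    }

    int xen_detect(void)
    {
        uint32_t eax, sig[3];
        char str[13];

        cpuid(0x40000000, &eax, &sig[0], &sig[1], &sig[2]);
        memcpy(str, sig, 12);
        str[12] = '\0';

        /* Expect at least base+2 so the hypercall-page leaf exists. */
        if ( strcmp(str, "XenVMMXenVMM") || eax < 0x40000002 )
            return 0;

        cpuid(0x40000001, &eax, &sig[0], &sig[1], &sig[2]);
        printf("Xen %u.%u\n", eax >> 16, eax & 0xffff);
        return 1;
    }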
944 | | |
945 | | void do_invalid_op(struct cpu_user_regs *regs) |
946 | 2 | { |
947 | 2 | const struct bug_frame *bug = NULL; |
948 | 2 | u8 bug_insn[2]; |
949 | 2 | const char *prefix = "", *filename, *predicate, *eip = (char *)regs->rip; |
950 | 2 | unsigned long fixup; |
951 | 2 | int id = -1, lineno; |
952 | 2 | const struct virtual_region *region; |
953 | 2 | |
954 | 2 | if ( debugger_trap_entry(TRAP_invalid_op, regs) ) |
955 | 0 | return; |
956 | 2 | |
957 | 2 | if ( likely(guest_mode(regs)) ) |
958 | 0 | { |
959 | 0 | if ( pv_emulate_invalid_op(regs) ) |
960 | 0 | pv_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC); |
961 | 0 | return; |
962 | 0 | } |
963 | 2 | |
964 | 2 | if ( !is_active_kernel_text(regs->rip) || |
965 | 1 | __copy_from_user(bug_insn, eip, sizeof(bug_insn)) || |
966 | 1 | memcmp(bug_insn, "\xf\xb", sizeof(bug_insn)) ) |
967 | 1 | goto die; |
968 | 2 | |
969 | 1 | region = find_text_region(regs->rip); |
970 | 1 | if ( region ) |
971 | 1 | { |
972 | 1 | for ( id = 0; id < BUGFRAME_NR; id++ ) |
973 | 1 | { |
974 | 1 | const struct bug_frame *b; |
975 | 1 | unsigned int i; |
976 | 1 | |
977 | 1 | for ( i = 0, b = region->frame[id].bugs; |
978 | 1 | i < region->frame[id].n_bugs; b++, i++ ) |
979 | 1 | { |
980 | 1 | if ( bug_loc(b) == eip ) |
981 | 1 | { |
982 | 1 | bug = b; |
983 | 1 | goto found; |
984 | 1 | } |
985 | 1 | } |
986 | 1 | } |
987 | 1 | } |
988 | 1 | |
989 | 1 | found: |
990 | 1 | if ( !bug ) |
991 | 0 | goto die; |
992 | 1 | eip += sizeof(bug_insn); |
993 | 1 | if ( id == BUGFRAME_run_fn ) |
994 | 1 | { |
995 | 1 | void (*fn)(struct cpu_user_regs *) = bug_ptr(bug); |
996 | 1 | |
997 | 1 | fn(regs); |
998 | 1 | regs->rip = (unsigned long)eip; |
999 | 1 | return; |
1000 | 1 | } |
1001 | 1 | |
1002 | 1 | /* WARN, BUG or ASSERT: decode the filename pointer and line number. */ |
1003 | 0 | filename = bug_ptr(bug); |
1004 | 0 | if ( !is_kernel(filename) && !is_patch(filename) ) |
1005 | 0 | goto die; |
1006 | 0 | fixup = strlen(filename); |
1007 | 0 | if ( fixup > 50 ) |
1008 | 0 | { |
1009 | 0 | filename += fixup - 47; |
1010 | 0 | prefix = "..."; |
1011 | 0 | } |
1012 | 0 | lineno = bug_line(bug); |
1013 | 0 |
1014 | 0 | switch ( id ) |
1015 | 0 | { |
1016 | 0 | case BUGFRAME_warn: |
1017 | 0 | printk("Xen WARN at %s%s:%d\n", prefix, filename, lineno); |
1018 | 0 | show_execution_state(regs); |
1019 | 0 | regs->rip = (unsigned long)eip; |
1020 | 0 | return; |
1021 | 0 |
1022 | 0 | case BUGFRAME_bug: |
1023 | 0 | printk("Xen BUG at %s%s:%d\n", prefix, filename, lineno); |
1024 | 0 |
1025 | 0 | if ( debugger_trap_fatal(TRAP_invalid_op, regs) ) |
1026 | 0 | return; |
1027 | 0 |
1028 | 0 | show_execution_state(regs); |
1029 | 0 | panic("Xen BUG at %s%s:%d", prefix, filename, lineno); |
1030 | 0 |
1031 | 0 | case BUGFRAME_assert: |
1032 | 0 | /* ASSERT: decode the predicate string pointer. */ |
1033 | 0 | predicate = bug_msg(bug); |
1034 | 0 | if ( !is_kernel(predicate) && !is_patch(predicate) ) |
1035 | 0 | predicate = "<unknown>"; |
1036 | 0 |
1037 | 0 | printk("Assertion '%s' failed at %s%s:%d\n", |
1038 | 0 | predicate, prefix, filename, lineno); |
1039 | 0 |
1040 | 0 | if ( debugger_trap_fatal(TRAP_invalid_op, regs) ) |
1041 | 0 | return; |
1042 | 0 |
1043 | 0 | show_execution_state(regs); |
1044 | 0 | panic("Assertion '%s' failed at %s%s:%d", |
1045 | 0 | predicate, prefix, filename, lineno); |
1046 | 0 | } |
1047 | 0 |
1048 | 1 | die: |
1049 | 1 | if ( (fixup = search_exception_table(regs)) != 0 ) |
1050 | 1 | { |
1051 | 1 | this_cpu(last_extable_addr) = regs->rip; |
1052 | 1 | regs->rip = fixup; |
1053 | 1 | return; |
1054 | 1 | } |
1055 | 1 | |
1056 | 0 | if ( debugger_trap_fatal(TRAP_invalid_op, regs) ) |
1057 | 0 | return; |
1058 | 0 |
1059 | 0 | show_execution_state(regs); |
1060 | 0 | panic("FATAL TRAP: vector = %d (invalid opcode)", TRAP_invalid_op); |
1061 | 0 | } |
1062 | | |
1063 | | void do_int3(struct cpu_user_regs *regs) |
1064 | 1 | { |
1065 | 1 | if ( debugger_trap_entry(TRAP_int3, regs) ) |
1066 | 0 | return; |
1067 | 1 | |
1068 | 1 | if ( !guest_mode(regs) ) |
1069 | 1 | { |
1070 | 1 | unsigned long fixup; |
1071 | 1 | |
1072 | 1 | if ( (fixup = search_exception_table(regs)) != 0 ) |
1073 | 1 | { |
1074 | 1 | this_cpu(last_extable_addr) = regs->rip; |
1075 | 1 | dprintk(XENLOG_DEBUG, "Trap %u: %p [%ps] -> %p\n", |
1076 | 1 | TRAP_int3, _p(regs->rip), _p(regs->rip), _p(fixup)); |
1077 | 1 | regs->rip = fixup; |
1078 | 1 | return; |
1079 | 1 | } |
1080 | 1 | |
1081 | 0 | if ( !debugger_trap_fatal(TRAP_int3, regs) ) |
1082 | 0 | printk(XENLOG_DEBUG "Hit embedded breakpoint at %p [%ps]\n", |
1083 | 0 | _p(regs->rip), _p(regs->rip)); |
1084 | 0 |
1085 | 0 | return; |
1086 | 1 | } |
1087 | 1 | |
1088 | 0 | pv_inject_hw_exception(TRAP_int3, X86_EVENT_NO_EC); |
1089 | 0 | } |
1090 | | |
1091 | | static void reserved_bit_page_fault(unsigned long addr, |
1092 | | struct cpu_user_regs *regs) |
1093 | 0 | { |
1094 | 0 | printk("%pv: reserved bit in page table (ec=%04X)\n", |
1095 | 0 | current, regs->error_code); |
1096 | 0 | show_page_walk(addr); |
1097 | 0 | show_execution_state(regs); |
1098 | 0 | } |
1099 | | |
1100 | | static int handle_gdt_ldt_mapping_fault(unsigned long offset, |
1101 | | struct cpu_user_regs *regs) |
1102 | 0 | { |
1103 | 0 | struct vcpu *curr = current; |
1104 | 0 | /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */ |
1105 | 0 | unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1; |
1106 | 0 | unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT); |
1107 | 0 |
1108 | 0 | /* |
1109 | 0 | * If the fault is in another vcpu's area, it cannot be due to |
1110 | 0 | * a GDT/LDT descriptor load. Thus we can reasonably exit immediately, and |
1111 | 0 | * indeed we have to since pv_map_ldt_shadow_page() works correctly only on |
1112 | 0 | * accesses to a vcpu's own area. |
1113 | 0 | */ |
1114 | 0 | if ( vcpu_area != curr->vcpu_id ) |
1115 | 0 | return 0; |
1116 | 0 |
1117 | 0 | /* Byte offset within the gdt/ldt sub-area. */ |
1118 | 0 | offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL; |
1119 | 0 |
1120 | 0 | if ( likely(is_ldt_area) ) |
1121 | 0 | { |
1122 | 0 | /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */ |
1123 | 0 | if ( likely(pv_map_ldt_shadow_page(offset)) ) |
1124 | 0 | { |
1125 | 0 | if ( guest_mode(regs) ) |
1126 | 0 | trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT, |
1127 | 0 | regs->rip, offset); |
1128 | 0 | } |
1129 | 0 | else |
1130 | 0 | { |
1131 | 0 | /* In hypervisor mode? Leave it to the #PF handler to fix up. */ |
1132 | 0 | if ( !guest_mode(regs) ) |
1133 | 0 | return 0; |
1134 | 0 |
1135 | 0 | /* Access would have become non-canonical? Pass #GP[sel] back. */ |
1136 | 0 | if ( unlikely(!is_canonical_address( |
1137 | 0 | curr->arch.pv_vcpu.ldt_base + offset)) ) |
1138 | 0 | { |
1139 | 0 | uint16_t ec = (offset & ~(X86_XEC_EXT | X86_XEC_IDT)) | X86_XEC_TI; |
1140 | 0 |
1141 | 0 | pv_inject_hw_exception(TRAP_gp_fault, ec); |
1142 | 0 | } |
1143 | 0 | else |
1144 | 0 | /* else pass the #PF back, with adjusted %cr2. */ |
1145 | 0 | pv_inject_page_fault(regs->error_code, |
1146 | 0 | curr->arch.pv_vcpu.ldt_base + offset); |
1147 | 0 | } |
1148 | 0 | } |
1149 | 0 | else |
1150 | 0 | { |
1151 | 0 | /* GDT fault: handle the fault as #GP(selector). */ |
1152 | 0 | regs->error_code = offset & ~(X86_XEC_EXT | X86_XEC_IDT | X86_XEC_TI); |
1153 | 0 | (void)do_general_protection(regs); |
1154 | 0 | } |
1155 | 0 |
1156 | 0 | return EXCRET_fault_fixed; |
1157 | 0 | } |
1158 | | |
1159 | | #define IN_HYPERVISOR_RANGE(va) \ |
1160 | | (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END)) |
1161 | | |
1162 | | enum pf_type { |
1163 | | real_fault, |
1164 | | smep_fault, |
1165 | | smap_fault, |
1166 | | spurious_fault |
1167 | | }; |
1168 | | |
1169 | | static enum pf_type __page_fault_type(unsigned long addr, |
1170 | | const struct cpu_user_regs *regs) |
1171 | 0 | { |
1172 | 0 | unsigned long mfn, cr3 = read_cr3(); |
1173 | 0 | l4_pgentry_t l4e, *l4t; |
1174 | 0 | l3_pgentry_t l3e, *l3t; |
1175 | 0 | l2_pgentry_t l2e, *l2t; |
1176 | 0 | l1_pgentry_t l1e, *l1t; |
1177 | 0 | unsigned int required_flags, disallowed_flags, page_user; |
1178 | 0 | unsigned int error_code = regs->error_code; |
1179 | 0 |
1180 | 0 | /* |
1181 | 0 | * We do not take spurious page faults in IRQ handlers as we do not |
1182 | 0 | * modify page tables in IRQ context. We therefore bail here because |
1183 | 0 | * map_domain_page() is not IRQ-safe. |
1184 | 0 | */ |
1185 | 0 | if ( in_irq() ) |
1186 | 0 | return real_fault; |
1187 | 0 |
1188 | 0 | /* Reserved bit violations are never spurious faults. */ |
1189 | 0 | if ( error_code & PFEC_reserved_bit ) |
1190 | 0 | return real_fault; |
1191 | 0 |
1192 | 0 | required_flags = _PAGE_PRESENT; |
1193 | 0 | if ( error_code & PFEC_write_access ) |
1194 | 0 | required_flags |= _PAGE_RW; |
1195 | 0 | if ( error_code & PFEC_user_mode ) |
1196 | 0 | required_flags |= _PAGE_USER; |
1197 | 0 |
1198 | 0 | disallowed_flags = 0; |
1199 | 0 | if ( error_code & PFEC_insn_fetch ) |
1200 | 0 | disallowed_flags |= _PAGE_NX_BIT; |
1201 | 0 |
1202 | 0 | page_user = _PAGE_USER; |
1203 | 0 |
1204 | 0 | mfn = cr3 >> PAGE_SHIFT; |
1205 | 0 |
1206 | 0 | l4t = map_domain_page(_mfn(mfn)); |
1207 | 0 | l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]); |
1208 | 0 | mfn = l4e_get_pfn(l4e); |
1209 | 0 | unmap_domain_page(l4t); |
1210 | 0 | if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) || |
1211 | 0 | (l4e_get_flags(l4e) & disallowed_flags) ) |
1212 | 0 | return real_fault; |
1213 | 0 | page_user &= l4e_get_flags(l4e); |
1214 | 0 |
1215 | 0 | l3t = map_domain_page(_mfn(mfn)); |
1216 | 0 | l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]); |
1217 | 0 | mfn = l3e_get_pfn(l3e); |
1218 | 0 | unmap_domain_page(l3t); |
1219 | 0 | if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) || |
1220 | 0 | (l3e_get_flags(l3e) & disallowed_flags) ) |
1221 | 0 | return real_fault; |
1222 | 0 | page_user &= l3e_get_flags(l3e); |
1223 | 0 | if ( l3e_get_flags(l3e) & _PAGE_PSE ) |
1224 | 0 | goto leaf; |
1225 | 0 |
1226 | 0 | l2t = map_domain_page(_mfn(mfn)); |
1227 | 0 | l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]); |
1228 | 0 | mfn = l2e_get_pfn(l2e); |
1229 | 0 | unmap_domain_page(l2t); |
1230 | 0 | if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) || |
1231 | 0 | (l2e_get_flags(l2e) & disallowed_flags) ) |
1232 | 0 | return real_fault; |
1233 | 0 | page_user &= l2e_get_flags(l2e); |
1234 | 0 | if ( l2e_get_flags(l2e) & _PAGE_PSE ) |
1235 | 0 | goto leaf; |
1236 | 0 |
1237 | 0 | l1t = map_domain_page(_mfn(mfn)); |
1238 | 0 | l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]); |
1239 | 0 | mfn = l1e_get_pfn(l1e); |
1240 | 0 | unmap_domain_page(l1t); |
1241 | 0 | if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) || |
1242 | 0 | (l1e_get_flags(l1e) & disallowed_flags) ) |
1243 | 0 | return real_fault; |
1244 | 0 | page_user &= l1e_get_flags(l1e); |
1245 | 0 |
1246 | 0 | leaf: |
1247 | 0 | if ( page_user ) |
1248 | 0 | { |
1249 | 0 | unsigned long cr4 = read_cr4(); |
1250 | 0 | /* |
1251 | 0 | * Supervisor Mode Execution Prevention (SMEP): |
1252 | 0 | * Disallow supervisor execution from user-accessible mappings |
1253 | 0 | */ |
1254 | 0 | if ( (cr4 & X86_CR4_SMEP) && |
1255 | 0 | ((error_code & (PFEC_insn_fetch|PFEC_user_mode)) == PFEC_insn_fetch) ) |
1256 | 0 | return smep_fault; |
1257 | 0 |
1258 | 0 | /* |
1259 | 0 | * Supervisor Mode Access Prevention (SMAP): |
1260 | 0 | * Disallow supervisor access to user-accessible mappings |
1261 | 0 | * A fault is considered as an SMAP violation if the following |
1262 | 0 | * conditions are true: |
1263 | 0 | * - X86_CR4_SMAP is set in CR4 |
1264 | 0 | * - A user page is being accessed |
1265 | 0 | * - CPL=3 or X86_EFLAGS_AC is clear |
1266 | 0 | * - Page fault in kernel mode |
1267 | 0 | */ |
1268 | 0 | if ( (cr4 & X86_CR4_SMAP) && !(error_code & PFEC_user_mode) && |
1269 | 0 | (((regs->cs & 3) == 3) || !(regs->eflags & X86_EFLAGS_AC)) ) |
1270 | 0 | return smap_fault; |
1271 | 0 | } |
1272 | 0 |
1273 | 0 | return spurious_fault; |
1274 | 0 | } |
1275 | | |
1276 | | static enum pf_type spurious_page_fault(unsigned long addr, |
1277 | | const struct cpu_user_regs *regs) |
1278 | 0 | { |
1279 | 0 | unsigned long flags; |
1280 | 0 | enum pf_type pf_type; |
1281 | 0 |
1282 | 0 | /* |
1283 | 0 | * Disabling interrupts prevents TLB flushing, and hence prevents |
1284 | 0 | * page tables from becoming invalid under our feet during the walk. |
1285 | 0 | */ |
1286 | 0 | local_irq_save(flags); |
1287 | 0 | pf_type = __page_fault_type(addr, regs); |
1288 | 0 | local_irq_restore(flags); |
1289 | 0 |
1290 | 0 | return pf_type; |
1291 | 0 | } |
1292 | | |
1293 | | static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs) |
1294 | 0 | { |
1295 | 0 | struct vcpu *v = current; |
1296 | 0 | struct domain *d = v->domain; |
1297 | 0 |
1298 | 0 | /* No fixups in interrupt context or when interrupts are disabled. */ |
1299 | 0 | if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) ) |
1300 | 0 | return 0; |
1301 | 0 |
1302 | 0 | if ( !(regs->error_code & PFEC_page_present) && |
1303 | 0 | (pagefault_by_memadd(addr, regs)) ) |
1304 | 0 | return handle_memadd_fault(addr, regs); |
1305 | 0 |
1306 | 0 | if ( unlikely(IN_HYPERVISOR_RANGE(addr)) ) |
1307 | 0 | { |
1308 | 0 | if ( !(regs->error_code & (PFEC_user_mode | PFEC_reserved_bit)) && |
1309 | 0 | (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) ) |
1310 | 0 | return handle_gdt_ldt_mapping_fault( |
1311 | 0 | addr - GDT_LDT_VIRT_START, regs); |
1312 | 0 | return 0; |
1313 | 0 | } |
1314 | 0 |
1315 | 0 | if ( guest_kernel_mode(v, regs) && |
1316 | 0 | !(regs->error_code & (PFEC_reserved_bit | PFEC_insn_fetch)) && |
1317 | 0 | (regs->error_code & PFEC_write_access) ) |
1318 | 0 | { |
1319 | 0 | bool ptwr, mmio_ro; |
1320 | 0 |
1321 | 0 | ptwr = VM_ASSIST(d, writable_pagetables) && |
1322 | 0 | /* Do not check for an access-protection (present) fault here, since |
1323 | 0 | the page may legitimately not be present in shadow page tables */ |
1324 | 0 | (paging_mode_enabled(d) || |
1325 | 0 | (regs->error_code & PFEC_page_present)); |
1326 | 0 |
1327 | 0 | mmio_ro = is_hardware_domain(d) && |
1328 | 0 | (regs->error_code & PFEC_page_present); |
1329 | 0 |
1330 | 0 | if ( (ptwr || mmio_ro) && pv_ro_page_fault(addr, regs) ) |
1331 | 0 | return EXCRET_fault_fixed; |
1332 | 0 | } |
1333 | 0 |
1334 | 0 | /* |
1335 | 0 | * For non-external shadowed guests, we fix up both their own pagefaults |
1336 | 0 | * and Xen's, since they share the pagetables. This includes hypervisor |
1337 | 0 | * faults, e.g. from copy_to_user(). |
1338 | 0 | */ |
1339 | 0 | if ( paging_mode_enabled(d) && !paging_mode_external(d) ) |
1340 | 0 | { |
1341 | 0 | int ret; |
1342 | 0 |
1343 | 0 | /* Logdirty mode is the only expected paging mode for PV guests. */ |
1344 | 0 | ASSERT(paging_mode_only_log_dirty(d)); |
1345 | 0 |
1346 | 0 | ret = paging_fault(addr, regs); |
1347 | 0 | if ( ret == EXCRET_fault_fixed ) |
1348 | 0 | trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->rip, addr); |
1349 | 0 | return ret; |
1350 | 0 | } |
1351 | 0 |
1352 | 0 | return 0; |
1353 | 0 | } |
1354 | | |
1355 | | /* |
1356 | | * #PF error code: |
1357 | | * Bit 0: Protection violation (=1); Page not present (=0) |
1358 | | * Bit 1: Write access |
1359 | | * Bit 2: User mode (=1) ; Supervisor mode (=0) |
1360 | | * Bit 3: Reserved bit violation |
1361 | | * Bit 4: Instruction fetch |
1362 | | */ |
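A tiny decoder for this layout (bit values exactly as listed above, matching the PFEC_* constants used throughout the file; the helper itself is illustrative):

    /* Illustrative: pretty-print a #PF error code using the bits above. */
    static void decode_pfec(unsigned int ec)
    {
        printk("#PF ec=%04x: %s, %s access, %s mode%s%s\n", ec,
               (ec & 1)  ? "protection violation" : "page not present",
               (ec & 2)  ? "write" : "read",
               (ec & 4)  ? "user" : "supervisor",
               (ec & 8)  ? ", reserved bit set" : "",
               (ec & 16) ? ", instruction fetch" : "");
    }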
1363 | | void do_page_fault(struct cpu_user_regs *regs) |
1364 | 0 | { |
1365 | 0 | unsigned long addr, fixup; |
1366 | 0 | unsigned int error_code; |
1367 | 0 | enum pf_type pf_type; |
1368 | 0 |
1369 | 0 | addr = read_cr2(); |
1370 | 0 |
1371 | 0 | /* fixup_page_fault() might change regs->error_code, so cache it here. */ |
1372 | 0 | error_code = regs->error_code; |
1373 | 0 |
1374 | 0 | if ( debugger_trap_entry(TRAP_page_fault, regs) ) |
1375 | 0 | return; |
1376 | 0 |
1377 | 0 | perfc_incr(page_faults); |
1378 | 0 |
1379 | 0 | if ( unlikely(fixup_page_fault(addr, regs) != 0) ) |
1380 | 0 | return; |
1381 | 0 |
1382 | 0 | if ( unlikely(!guest_mode(regs)) ) |
1383 | 0 | { |
1384 | 0 | pf_type = spurious_page_fault(addr, regs); |
1385 | 0 | if ( (pf_type == smep_fault) || (pf_type == smap_fault) ) |
1386 | 0 | { |
1387 | 0 | console_start_sync(); |
1388 | 0 | printk("Xen SM%cP violation\n", |
1389 | 0 | (pf_type == smep_fault) ? 'E' : 'A'); |
1390 | 0 | fatal_trap(regs, 0); |
1391 | 0 | } |
1392 | 0 |
1393 | 0 | if ( pf_type != real_fault ) |
1394 | 0 | return; |
1395 | 0 |
1396 | 0 | if ( likely((fixup = search_exception_table(regs)) != 0) ) |
1397 | 0 | { |
1398 | 0 | perfc_incr(copy_user_faults); |
1399 | 0 | if ( unlikely(regs->error_code & PFEC_reserved_bit) ) |
1400 | 0 | reserved_bit_page_fault(addr, regs); |
1401 | 0 | this_cpu(last_extable_addr) = regs->rip; |
1402 | 0 | regs->rip = fixup; |
1403 | 0 | return; |
1404 | 0 | } |
1405 | 0 |
1406 | 0 | if ( debugger_trap_fatal(TRAP_page_fault, regs) ) |
1407 | 0 | return; |
1408 | 0 |
1409 | 0 | show_execution_state(regs); |
1410 | 0 | show_page_walk(addr); |
1411 | 0 | panic("FATAL PAGE FAULT\n" |
1412 | 0 | "[error_code=%04x]\n" |
1413 | 0 | "Faulting linear address: %p", |
1414 | 0 | error_code, _p(addr)); |
1415 | 0 | } |
1416 | 0 |
1417 | 0 | if ( unlikely(current->domain->arch.suppress_spurious_page_faults) ) |
1418 | 0 | { |
1419 | 0 | pf_type = spurious_page_fault(addr, regs); |
1420 | 0 | if ( (pf_type == smep_fault) || (pf_type == smap_fault) ) |
1421 | 0 | { |
1422 | 0 | printk(XENLOG_G_ERR "%pv fatal SM%cP violation\n", |
1423 | 0 | current, (pf_type == smep_fault) ? 'E' : 'A'); |
1424 | 0 |
1425 | 0 | domain_crash(current->domain); |
1426 | 0 | } |
1427 | 0 | if ( pf_type != real_fault ) |
1428 | 0 | return; |
1429 | 0 | } |
1430 | 0 |
1431 | 0 | if ( unlikely(regs->error_code & PFEC_reserved_bit) ) |
1432 | 0 | reserved_bit_page_fault(addr, regs); |
1433 | 0 |
1434 | 0 | pv_inject_page_fault(regs->error_code, addr); |
1435 | 0 | } |
1436 | | |
1437 | | /* |
1438 | | * Early #PF handler to print CR2, error code, and stack. |
1439 | | * |
1440 | | * We also deal with spurious faults here, even though they should never happen |
1441 | | * during early boot (an issue was seen once, but was most likely a hardware |
1442 | | * problem). |
1443 | | */ |
1444 | | void __init do_early_page_fault(struct cpu_user_regs *regs) |
1445 | 0 | { |
1446 | 0 | static unsigned int __initdata stuck; |
1447 | 0 | static unsigned long __initdata prev_eip, prev_cr2; |
1448 | 0 | unsigned long cr2 = read_cr2(); |
1449 | 0 |
1450 | 0 | BUG_ON(smp_processor_id() != 0); |
1451 | 0 |
1452 | 0 | if ( (regs->rip != prev_eip) || (cr2 != prev_cr2) ) |
1453 | 0 | { |
1454 | 0 | prev_eip = regs->rip; |
1455 | 0 | prev_cr2 = cr2; |
1456 | 0 | stuck = 0; |
1457 | 0 | return; |
1458 | 0 | } |
1459 | 0 |
1460 | 0 | if ( stuck++ == 1000 ) |
1461 | 0 | { |
1462 | 0 | console_start_sync(); |
1463 | 0 | printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n", |
1464 | 0 | regs->cs, _p(regs->rip), _p(cr2), regs->error_code); |
1465 | 0 | fatal_trap(regs, 0); |
1466 | 0 | } |
1467 | 0 | } |
1468 | | |
1469 | | void do_general_protection(struct cpu_user_regs *regs) |
1470 | 1 | { |
1471 | 1 | struct vcpu *v = current; |
1472 | 1 | unsigned long fixup; |
1473 | 1 | |
1474 | 1 | if ( debugger_trap_entry(TRAP_gp_fault, regs) ) |
1475 | 0 | return; |
1476 | 1 | |
1477 | 1 | if ( regs->error_code & X86_XEC_EXT ) |
1478 | 0 | goto hardware_gp; |
1479 | 1 | |
1480 | 1 | if ( !guest_mode(regs) ) |
1481 | 1 | goto gp_in_kernel; |
1482 | 1 | |
1483 | 1 | /* |
1484 | 1 | * Cunning trick to allow arbitrary "INT n" handling. |
1485 | 1 | * |
1486 | 1 | * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n> |
1487 | 1 | * instruction from trapping to the appropriate vector, when that might not |
1488 | 1 | * be expected by Xen or the guest OS. For example, that entry might be for |
1489 | 1 | * a fault handler (unlike traps, faults don't increment EIP), or might |
1490 | 1 | * expect an error code on the stack (which a software trap never |
1491 | 1 | * provides), or might be a hardware interrupt handler that doesn't like |
1492 | 1 | * being called spuriously. |
1493 | 1 | * |
1494 | 1 | * Instead, a GPF occurs with the faulting IDT vector in the error code. |
1495 | 1 | * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is |
1496 | 1 | * clear (which was already checked above) to indicate that it's a software |
1497 | 1 | * fault, not a hardware one. |
1498 | 1 | * |
1499 | 1 | * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is |
1500 | 1 | * okay because they can only be triggered by an explicit DPL-checked |
1501 | 1 | * instruction. The DPL specified by the guest OS for these vectors is NOT |
1502 | 1 | * CHECKED!! |
1503 | 1 | */ |
1504 | 0 | if ( regs->error_code & X86_XEC_IDT ) |
1505 | 0 | { |
1506 | 0 | /* This fault must be due to <INT n> instruction. */ |
1507 | 0 | const struct trap_info *ti; |
1508 | 0 | unsigned char vector = regs->error_code >> 3; |
1509 | 0 | ti = &v->arch.pv_vcpu.trap_ctxt[vector]; |
1510 | 0 | if ( permit_softint(TI_GET_DPL(ti), v, regs) ) |
1511 | 0 | { |
1512 | 0 | regs->rip += 2; |
1513 | 0 | pv_inject_sw_interrupt(vector); |
1514 | 0 | return; |
1515 | 0 | } |
1516 | 0 | } |
1517 | 0 | else if ( is_pv_32bit_vcpu(v) && regs->error_code ) |
1518 | 0 | { |
1519 | 0 | pv_emulate_gate_op(regs); |
1520 | 0 | return; |
1521 | 0 | } |
1522 | 0 |
1523 | 0 | /* Emulate some simple privileged and I/O instructions. */ |
1524 | 0 | if ( (regs->error_code == 0) && |
1525 | 0 | pv_emulate_privileged_op(regs) ) |
1526 | 0 | { |
1527 | 0 | trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->rip); |
1528 | 0 | return; |
1529 | 0 | } |
1530 | 0 |
1531 | 0 | /* Pass on GPF as is. */ |
1532 | 0 | pv_inject_hw_exception(TRAP_gp_fault, regs->error_code); |
1533 | 0 | return; |
1534 | 0 |
1535 | 1 | gp_in_kernel: |
1536 | 1 | |
1537 | 1 | if ( likely((fixup = search_exception_table(regs)) != 0) ) |
1538 | 1 | { |
1539 | 1 | dprintk(XENLOG_INFO, "GPF (%04x): %p [%ps] -> %p\n", |
1540 | 1 | regs->error_code, _p(regs->rip), _p(regs->rip), _p(fixup)); |
1541 | 1 | this_cpu(last_extable_addr) = regs->rip; |
1542 | 1 | regs->rip = fixup; |
1543 | 1 | return; |
1544 | 1 | } |
1545 | 1 | |
1546 | 0 | hardware_gp: |
1547 | 0 | if ( debugger_trap_fatal(TRAP_gp_fault, regs) ) |
1548 | 0 | return; |
1549 | 0 |
1550 | 0 | show_execution_state(regs); |
1551 | 0 | panic("GENERAL PROTECTION FAULT\n[error_code=%04x]", regs->error_code); |
1552 | 0 | } |
1553 | | |
1554 | | static void pci_serr_softirq(void) |
1555 | 0 | { |
1556 | 0 | printk("\n\nNMI - PCI system error (SERR)\n"); |
1557 | 0 | outb(inb(0x61) & 0x0b, 0x61); /* re-enable the PCI SERR error line. */ |
1558 | 0 | } |
1559 | | |
1560 | | void async_exception_cleanup(struct vcpu *curr) |
1561 | 0 | { |
1562 | 0 | int trap; |
1563 | 0 |
1564 | 0 | if ( !curr->async_exception_mask ) |
1565 | 0 | return; |
1566 | 0 |
1567 | 0 | /* Restore affinity. */ |
1568 | 0 | if ( !cpumask_empty(curr->cpu_hard_affinity_tmp) && |
1569 | 0 | !cpumask_equal(curr->cpu_hard_affinity_tmp, curr->cpu_hard_affinity) ) |
1570 | 0 | { |
1571 | 0 | vcpu_set_hard_affinity(curr, curr->cpu_hard_affinity_tmp); |
1572 | 0 | cpumask_clear(curr->cpu_hard_affinity_tmp); |
1573 | 0 | } |
1574 | 0 |
1575 | 0 | if ( !(curr->async_exception_mask & (curr->async_exception_mask - 1)) ) |
1576 | 0 | trap = __scanbit(curr->async_exception_mask, VCPU_TRAP_NONE); |
1577 | 0 | else |
1578 | 0 | for ( trap = VCPU_TRAP_NONE + 1; trap <= VCPU_TRAP_LAST; ++trap ) |
1579 | 0 | if ( (curr->async_exception_mask ^ |
1580 | 0 | curr->async_exception_state(trap).old_mask) == (1 << trap) ) |
1581 | 0 | break; |
1582 | 0 | if ( unlikely(trap > VCPU_TRAP_LAST) ) |
1583 | 0 | { |
1584 | 0 | ASSERT_UNREACHABLE(); |
1585 | 0 | return; |
1586 | 0 | } |
1587 | 0 |
1588 | 0 | /* Restore previous asynchronous exception mask. */ |
1589 | 0 | curr->async_exception_mask = curr->async_exception_state(trap).old_mask; |
1590 | 0 | } |
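
The `mask & (mask - 1)` test in the function above is the usual single-bit idiom: clearing the lowest set bit yields zero exactly when at most one bit was set, which is what lets the code take the cheap __scanbit() path instead of the search loop. A minimal standalone illustration:

    #include <stdio.h>

    /* True iff at most one bit is set: clearing the lowest set bit leaves 0. */
    static int at_most_one_bit(unsigned int mask)
    {
        return (mask & (mask - 1)) == 0;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               at_most_one_bit(0x0),  /* 1: no bits set */
               at_most_one_bit(0x4),  /* 1: single bit  */
               at_most_one_bit(0x6)); /* 0: two bits    */
        return 0;
    }
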
1591 | | |
1592 | | static void nmi_hwdom_report(unsigned int reason_idx) |
1593 | 0 | { |
1594 | 0 | struct domain *d = hardware_domain; |
1595 | 0 |
1596 | 0 | if ( !d || !d->vcpu || !d->vcpu[0] || !is_pv_domain(d) /* PVH fixme */ ) |
1597 | 0 | return; |
1598 | 0 |
1599 | 0 | set_bit(reason_idx, nmi_reason(d)); |
1600 | 0 |
1601 | 0 | pv_raise_interrupt(d->vcpu[0], TRAP_nmi); |
1602 | 0 | } |
1603 | | |
1604 | | static void pci_serr_error(const struct cpu_user_regs *regs) |
1605 | 0 | { |
1606 | 0 | outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable the PCI SERR error line. */ |
1607 | 0 |
1608 | 0 | switch ( opt_nmi[0] ) |
1609 | 0 | { |
1610 | 0 | case 'd': /* 'dom0' */ |
1611 | 0 | nmi_hwdom_report(_XEN_NMIREASON_pci_serr); |
1612 | 0 | /* fallthrough */ |
1613 | 0 | case 'i': /* 'ignore' */ |
1614 | 0 | /* Would like to print a diagnostic here but can't call printk() |
1615 | 0 | from NMI context -- raise a softirq instead. */ |
1616 | 0 | raise_softirq(PCI_SERR_SOFTIRQ); |
1617 | 0 | break; |
1618 | 0 | default: /* 'fatal' */ |
1619 | 0 | console_force_unlock(); |
1620 | 0 | printk("\n\nNMI - PCI system error (SERR)\n"); |
1621 | 0 | fatal_trap(regs, 0); |
1622 | 0 | } |
1623 | 0 | } |
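
Because printk() can take locks that the interrupted context may already hold, the 'dom0' and 'ignore' paths above only raise PCI_SERR_SOFTIRQ and let pci_serr_softirq() (earlier in this file) do the printing from a safe context. A hedged, Xen-independent sketch of that defer-the-work pattern; all names here are illustrative:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_bool serr_pending;

    /* NMI-like context: record the event, do nothing that can take locks. */
    static void nmi_path(void)
    {
        atomic_store(&serr_pending, 1);
    }

    /* Ordinary context (the "softirq"): safe to log. */
    static void softirq_path(void)
    {
        if ( atomic_exchange(&serr_pending, 0) )
            printf("NMI - PCI system error (SERR)\n");
    }

    int main(void)
    {
        nmi_path();
        softirq_path(); /* prints once */
        softirq_path(); /* flag already consumed: prints nothing */
        return 0;
    }
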
1624 | | |
1625 | | static void io_check_error(const struct cpu_user_regs *regs) |
1626 | 0 | { |
1627 | 0 | switch ( opt_nmi[0] ) |
1628 | 0 | { |
1629 | 0 | case 'd': /* 'dom0' */ |
1630 | 0 | nmi_hwdom_report(_XEN_NMIREASON_io_error); /* fallthrough */ |
1631 | 0 | case 'i': /* 'ignore' */ |
1632 | 0 | break; |
1633 | 0 | default: /* 'fatal' */ |
1634 | 0 | console_force_unlock(); |
1635 | 0 | printk("\n\nNMI - I/O ERROR\n"); |
1636 | 0 | fatal_trap(regs, 0); |
1637 | 0 | } |
1638 | 0 |
1639 | 0 | outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */ |
1640 | 0 | mdelay(1); |
1641 | 0 | outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */ |
1642 | 0 | } |
1643 | | |
1644 | | static void unknown_nmi_error(const struct cpu_user_regs *regs, |
1645 | | unsigned char reason) |
1646 | 0 | { |
1647 | 0 | switch ( opt_nmi[0] ) |
1648 | 0 | { |
1649 | 0 | case 'd': /* 'dom0' */ |
1650 | 0 | nmi_hwdom_report(_XEN_NMIREASON_unknown); /* fallthrough */ |
1651 | 0 | case 'i': /* 'ignore' */ |
1652 | 0 | break; |
1653 | 0 | default: /* 'fatal' */ |
1654 | 0 | console_force_unlock(); |
1655 | 0 | printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); |
1656 | 0 | printk("Do you have a strange power saving mode enabled?\n"); |
1657 | 0 | fatal_trap(regs, 0); |
1658 | 0 | } |
1659 | 0 | } |
1660 | | |
1661 | | static int dummy_nmi_callback(const struct cpu_user_regs *regs, int cpu) |
1662 | 0 | { |
1663 | 0 | return 0; |
1664 | 0 | } |
1665 | | |
1666 | | static nmi_callback_t *nmi_callback = dummy_nmi_callback; |
1667 | | |
1668 | | void do_nmi(const struct cpu_user_regs *regs) |
1669 | 0 | { |
1670 | 0 | unsigned int cpu = smp_processor_id(); |
1671 | 0 | unsigned char reason; |
1672 | 0 | bool handle_unknown = false; |
1673 | 0 |
1674 | 0 | ++nmi_count(cpu); |
1675 | 0 |
1676 | 0 | if ( nmi_callback(regs, cpu) ) |
1677 | 0 | return; |
1678 | 0 |
1679 | 0 | if ( (nmi_watchdog == NMI_NONE) || |
1680 | 0 | (!nmi_watchdog_tick(regs) && watchdog_force) ) |
1681 | 0 | handle_unknown = true; |
1682 | 0 |
1683 | 0 | /* Only the BSP gets external NMIs from the system. */ |
1684 | 0 | if ( cpu == 0 ) |
1685 | 0 | { |
1686 | 0 | reason = inb(0x61); |
1687 | 0 | if ( reason & 0x80 ) |
1688 | 0 | pci_serr_error(regs); |
1689 | 0 | if ( reason & 0x40 ) |
1690 | 0 | io_check_error(regs); |
1691 | 0 | if ( !(reason & 0xc0) && handle_unknown ) |
1692 | 0 | unknown_nmi_error(regs, reason); |
1693 | 0 | } |
1694 | 0 | } |
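
The port-0x61 handling above matches the legacy PC "NMI status and control" register: on read, bit 7 (0x80) reports a PCI SERR# and bit 6 (0x40) an I/O check error, hence the 0xc0 mask for "no known source". A standalone decoding sketch, with the masks named locally:

    #include <stdint.h>
    #include <stdio.h>

    #define NMI_REASON_SERR  0x80u  /* bit 7: PCI system error       */
    #define NMI_REASON_IOCHK 0x40u  /* bit 6: I/O check (IOCK) error */

    static void decode_nmi_reason(uint8_t reason)
    {
        if ( reason & NMI_REASON_SERR )
            printf("PCI SERR asserted\n");
        if ( reason & NMI_REASON_IOCHK )
            printf("I/O check error asserted\n");
        if ( !(reason & (NMI_REASON_SERR | NMI_REASON_IOCHK)) )
            printf("unknown NMI source (reason=%#x)\n", (unsigned int)reason);
    }

    int main(void)
    {
        decode_nmi_reason(0x80); /* SERR */
        decode_nmi_reason(0x00); /* unknown */
        return 0;
    }
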
1695 | | |
1696 | | nmi_callback_t *set_nmi_callback(nmi_callback_t *callback) |
1697 | 2 | { |
1698 | 2 | nmi_callback_t *old_nmi_callback = nmi_callback; |
1699 | 2 | |
1700 | 2 | nmi_callback = callback; |
1701 | 2 | |
1702 | 2 | return old_nmi_callback; |
1703 | 2 | } |
1704 | | |
1705 | | void unset_nmi_callback(void) |
1706 | 0 | { |
1707 | 0 | nmi_callback = dummy_nmi_callback; |
1708 | 0 | } |
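
A caller of set_nmi_callback() is expected to keep the returned pointer and restore it, or call unset_nmi_callback(), when done; a nonzero return from the callback makes do_nmi() return before any reason decoding. A self-contained sketch of that interposition pattern, with the Xen types stubbed out locally:

    #include <stddef.h>
    #include <stdio.h>

    struct cpu_user_regs;                   /* opaque for this sketch */
    typedef int nmi_callback_t(const struct cpu_user_regs *regs, int cpu);

    static int dummy_cb(const struct cpu_user_regs *regs, int cpu)
    {
        (void)regs; (void)cpu;
        return 0;                           /* not handled: keep decoding */
    }

    static nmi_callback_t *cb = dummy_cb;

    static nmi_callback_t *set_cb(nmi_callback_t *new_cb)
    {
        nmi_callback_t *old = cb;
        cb = new_cb;
        return old;
    }

    static int my_handler(const struct cpu_user_regs *regs, int cpu)
    {
        (void)regs;
        printf("NMI on cpu %d claimed\n", cpu);
        return 1;                           /* nonzero: fully handled */
    }

    int main(void)
    {
        nmi_callback_t *old = set_cb(my_handler);  /* interpose */
        int handled = cb(NULL, 0);                 /* simulate do_nmi() dispatch */
        set_cb(old);                               /* restore previous handler */
        return handled ? 0 : 1;
    }
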
1709 | | |
1710 | | void do_device_not_available(struct cpu_user_regs *regs) |
1711 | 0 | { |
1712 | 0 | struct vcpu *curr = current; |
1713 | 0 |
1714 | 0 | BUG_ON(!guest_mode(regs)); |
1715 | 0 |
1716 | 0 | vcpu_restore_fpu_lazy(curr); |
1717 | 0 |
1718 | 0 | if ( curr->arch.pv_vcpu.ctrlreg[0] & X86_CR0_TS ) |
1719 | 0 | { |
1720 | 0 | pv_inject_hw_exception(TRAP_no_device, X86_EVENT_NO_EC); |
1721 | 0 | curr->arch.pv_vcpu.ctrlreg[0] &= ~X86_CR0_TS; |
1722 | 0 | } |
1723 | 0 | else |
1724 | 0 | TRACE_0D(TRC_PV_MATH_STATE_RESTORE); |
1725 | 0 |
1726 | 0 | return; |
1727 | 0 | } |
1728 | | |
1729 | | u64 read_efer(void) |
1730 | 37.0k | { |
1731 | 37.0k | return this_cpu(efer); |
1732 | 37.0k | } |
1733 | | |
1734 | | void write_efer(u64 val) |
1735 | 60 | { |
1736 | 60 | this_cpu(efer) = val; |
1737 | 60 | wrmsrl(MSR_EFER, val); |
1738 | 60 | } |
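
read_efer()/write_efer() implement a shadowed-MSR pattern: every write lands in a per-CPU variable before the hardware, so reads never need a slow RDMSR and the shadow can never lag the MSR. A single-threaded sketch of the idea; wrmsrl_hw() is a stand-in for the real WRMSR, and the bit value is arbitrary:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t efer_shadow;            /* per-CPU in the real code */

    static void wrmsrl_hw(uint64_t val)     /* stand-in for WRMSR */
    {
        printf("WRMSR EFER <- %#llx\n", (unsigned long long)val);
    }

    static uint64_t read_efer_cached(void)
    {
        return efer_shadow;                 /* no RDMSR needed */
    }

    static void write_efer_cached(uint64_t val)
    {
        efer_shadow = val;                  /* keep shadow coherent ... */
        wrmsrl_hw(val);                     /* ... then hit hardware    */
    }

    int main(void)
    {
        write_efer_cached(0x500);           /* e.g. LME|LMA */
        printf("cached: %#llx\n", (unsigned long long)read_efer_cached());
        return 0;
    }
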
1739 | | |
1740 | | static void ler_enable(void) |
1741 | 0 | { |
1742 | 0 | u64 debugctl; |
1743 | 0 |
|
1744 | 0 | if ( !this_cpu(ler_msr) ) |
1745 | 0 | return; |
1746 | 0 |
1747 | 0 | rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); |
1748 | 0 | wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | IA32_DEBUGCTLMSR_LBR); |
1749 | 0 | } |
1750 | | |
1751 | | void do_debug(struct cpu_user_regs *regs) |
1752 | 0 | { |
1753 | 0 | struct vcpu *v = current; |
1754 | 0 |
1755 | 0 | if ( debugger_trap_entry(TRAP_debug, regs) ) |
1756 | 0 | return; |
1757 | 0 |
1758 | 0 | if ( !guest_mode(regs) ) |
1759 | 0 | { |
1760 | 0 | if ( regs->eflags & X86_EFLAGS_TF ) |
1761 | 0 | { |
1762 | 0 | /* In the SYSENTER entry path we can't zap TF until EFLAGS is saved. */ |
1763 | 0 | if ( (regs->rip >= (unsigned long)sysenter_entry) && |
1764 | 0 | (regs->rip <= (unsigned long)sysenter_eflags_saved) ) |
1765 | 0 | { |
1766 | 0 | if ( regs->rip == (unsigned long)sysenter_eflags_saved ) |
1767 | 0 | regs->eflags &= ~X86_EFLAGS_TF; |
1768 | 0 | goto out; |
1769 | 0 | } |
1770 | 0 | if ( !debugger_trap_fatal(TRAP_debug, regs) ) |
1771 | 0 | { |
1772 | 0 | WARN(); |
1773 | 0 | regs->eflags &= ~X86_EFLAGS_TF; |
1774 | 0 | } |
1775 | 0 | } |
1776 | 0 | else |
1777 | 0 | { |
1778 | 0 | /* |
1779 | 0 | * We ignore watchpoints when they trigger within Xen. This may |
1780 | 0 | * happen when a buffer is passed to us which previously had a |
1781 | 0 | * watchpoint set on it. No need to bump EIP; the only faulting |
1782 | 0 | * trap is an instruction breakpoint, which can't happen to us. |
1783 | 0 | */ |
1784 | 0 | WARN_ON(!search_exception_table(regs)); |
1785 | 0 | } |
1786 | 0 | goto out; |
1787 | 0 | } |
1788 | 0 |
1789 | 0 | /* Save the debug status register where the guest OS can peek at it. */ |
1790 | 0 | v->arch.debugreg[6] = read_debugreg(6); |
1791 | 0 |
1792 | 0 | ler_enable(); |
1793 | 0 | pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC); |
1794 | 0 | return; |
1795 | 0 |
1796 | 0 | out: |
1797 | 0 | ler_enable(); |
1798 | 0 | return; |
1799 | 0 | } |
1800 | | |
1801 | | static void __init noinline __set_intr_gate(unsigned int n, |
1802 | | uint32_t dpl, void *addr) |
1803 | 255 | { |
1804 | 255 | _set_gate(&idt_table[n], SYS_DESC_irq_gate, dpl, addr); |
1805 | 255 | } |
1806 | | |
1807 | | static void __init set_swint_gate(unsigned int n, void *addr) |
1808 | 2 | { |
1809 | 2 | __set_intr_gate(n, 3, addr); |
1810 | 2 | } |
1811 | | |
1812 | | static void __init set_intr_gate(unsigned int n, void *addr) |
1813 | 253 | { |
1814 | 253 | __set_intr_gate(n, 0, addr); |
1815 | 253 | } |
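
The dpl threaded through these helpers lands in the gate descriptor's attribute byte. A DPL-3 gate (as set_swint_gate() installs for int3 and into) can be reached by INT n from ring 3, whereas INT n through a DPL-0 gate produces the #GP-with-IDT-bit case handled earlier in this file. A hedged sketch of how that attribute byte packs; the values are architecture-defined, the names local:

    #include <stdio.h>

    /* Attribute byte of an IDT gate: P (bit 7) | DPL (bits 6:5) | S=0 | type. */
    static unsigned int gate_attr(unsigned int dpl, unsigned int type)
    {
        return 0x80 | (dpl << 5) | (type & 0xf);    /* P=1 */
    }

    int main(void)
    {
        /*
         * Type 0xe = 64-bit interrupt gate: the CPU clears IF on entry,
         * unlike a trap gate (0xf) -- which is why the comment in
         * init_idt_traps() below insists on interrupt gates until the
         * segment registers have been saved.
         */
        printf("intr gate, DPL 0: %#x\n", gate_attr(0, 0xe));  /* 0x8e */
        printf("swint gate, DPL 3: %#x\n", gate_attr(3, 0xe)); /* 0xee */
        return 0;
    }
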
1816 | | |
1817 | | void load_TR(void) |
1818 | 0 | { |
1819 | 0 | struct tss_struct *tss = &this_cpu(init_tss); |
1820 | 0 | struct desc_ptr old_gdt, tss_gdt = { |
1821 | 0 | .base = (long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY), |
1822 | 0 | .limit = LAST_RESERVED_GDT_BYTE |
1823 | 0 | }; |
1824 | 0 |
1825 | 0 | _set_tssldt_desc( |
1826 | 0 | this_cpu(gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY, |
1827 | 0 | (unsigned long)tss, |
1828 | 0 | offsetof(struct tss_struct, __cacheline_filler) - 1, |
1829 | 0 | SYS_DESC_tss_avail); |
1830 | 0 | _set_tssldt_desc( |
1831 | 0 | this_cpu(compat_gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY, |
1832 | 0 | (unsigned long)tss, |
1833 | 0 | offsetof(struct tss_struct, __cacheline_filler) - 1, |
1834 | 0 | SYS_DESC_tss_busy); |
1835 | 0 |
1836 | 0 | /* Switch to non-compat GDT (which has B bit clear) to execute LTR. */ |
1837 | 0 | asm volatile ( |
1838 | 0 | "sgdt %0; lgdt %2; ltr %w1; lgdt %0" |
1839 | 0 | : "=m" (old_gdt) : "rm" (TSS_ENTRY << 3), "m" (tss_gdt) : "memory" ); |
1840 | 0 | } |
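
LTR faults unless the referenced TSS descriptor is of the "available" type, and the CPU itself then flips it to "busy"; that is why the compat GDT's copy above is created already busy and the LTR is issued under the non-compat GDT. A tiny sketch of the two type codes involved (architecture-defined values, locally chosen names):

    #include <stdio.h>

    /* x86 system-descriptor type codes for a 64-bit TSS. */
    #define SYS_DESC_TSS_AVAIL 0x9  /* LTR accepts this ...                  */
    #define SYS_DESC_TSS_BUSY  0xB  /* ... and sets this; LTR on busy faults */

    int main(void)
    {
        /* The busy flag is simply bit 1 of the 4-bit type field. */
        printf("%#x -> %#x\n", SYS_DESC_TSS_AVAIL, SYS_DESC_TSS_AVAIL | 2);
        return 0;
    }
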
1841 | | |
1842 | | void percpu_traps_init(void) |
1843 | 13 | { |
1844 | 13 | subarch_percpu_traps_init(); |
1845 | 13 | |
1846 | 13 | if ( !opt_ler ) |
1847 | 13 | return; |
1848 | 13 | |
1849 | 0 | switch ( boot_cpu_data.x86_vendor ) |
1850 | 0 | { |
1851 | 0 | case X86_VENDOR_INTEL: |
1852 | 0 | switch ( boot_cpu_data.x86 ) |
1853 | 0 | { |
1854 | 0 | case 6: |
1855 | 0 | this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP; |
1856 | 0 | break; |
1857 | 0 | case 15: |
1858 | 0 | this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP; |
1859 | 0 | break; |
1860 | 0 | } |
1861 | 0 | break; |
1862 | 0 | case X86_VENDOR_AMD: |
1863 | 0 | switch ( boot_cpu_data.x86 ) |
1864 | 0 | { |
1865 | 0 | case 6: |
1866 | 0 | case 0xf ... 0x17: |
1867 | 0 | this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP; |
1868 | 0 | break; |
1869 | 0 | } |
1870 | 0 | break; |
1871 | 0 | } |
1872 | 0 |
1873 | 0 | ler_enable(); |
1874 | 0 | } |
1875 | | |
1876 | | void __init init_idt_traps(void) |
1877 | 1 | { |
1878 | 1 | /* |
1879 | 1 | * Note that interrupt gates are always used, rather than trap gates. We |
1880 | 1 | * must have interrupts disabled until DS/ES/FS/GS are saved because the |
1881 | 1 | * first activation must have the "bad" value(s) for these registers and |
1882 | 1 | * we may lose them if another activation is installed before they are |
1883 | 1 | * saved. The page-fault handler also needs interrupts disabled until %cr2 |
1884 | 1 | * has been read and saved on the stack. |
1885 | 1 | */ |
1886 | 1 | set_intr_gate(TRAP_divide_error,&divide_error); |
1887 | 1 | set_intr_gate(TRAP_debug,&debug); |
1888 | 1 | set_intr_gate(TRAP_nmi,&nmi); |
1889 | 1 | set_swint_gate(TRAP_int3,&int3); /* usable from all privileges */ |
1890 | 1 | set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */ |
1891 | 1 | set_intr_gate(TRAP_bounds,&bounds); |
1892 | 1 | set_intr_gate(TRAP_invalid_op,&invalid_op); |
1893 | 1 | set_intr_gate(TRAP_no_device,&device_not_available); |
1894 | 1 | set_intr_gate(TRAP_double_fault,&double_fault); |
1895 | 1 | set_intr_gate(TRAP_invalid_tss,&invalid_TSS); |
1896 | 1 | set_intr_gate(TRAP_no_segment,&segment_not_present); |
1897 | 1 | set_intr_gate(TRAP_stack_error,&stack_segment); |
1898 | 1 | set_intr_gate(TRAP_gp_fault,&general_protection); |
1899 | 1 | set_intr_gate(TRAP_page_fault,&early_page_fault); |
1900 | 1 | set_intr_gate(TRAP_copro_error,&coprocessor_error); |
1901 | 1 | set_intr_gate(TRAP_alignment_check,&alignment_check); |
1902 | 1 | set_intr_gate(TRAP_machine_check,&machine_check); |
1903 | 1 | set_intr_gate(TRAP_simd_error,&simd_coprocessor_error); |
1904 | 1 | |
1905 | 1 | /* Specify dedicated interrupt stacks for NMI, #DF, and #MC. */ |
1906 | 1 | set_ist(&idt_table[TRAP_double_fault], IST_DF); |
1907 | 1 | set_ist(&idt_table[TRAP_nmi], IST_NMI); |
1908 | 1 | set_ist(&idt_table[TRAP_machine_check], IST_MCE); |
1909 | 1 | |
1910 | 1 | /* CPU0 uses the master IDT. */ |
1911 | 1 | idt_tables[0] = idt_table; |
1912 | 1 | |
1913 | 1 | this_cpu(gdt_table) = boot_cpu_gdt_table; |
1914 | 1 | this_cpu(compat_gdt_table) = boot_cpu_compat_gdt_table; |
1915 | 1 | } |
1916 | | |
1917 | | extern void (*const autogen_entrypoints[NR_VECTORS])(void); |
1918 | | void __init trap_init(void) |
1919 | 1 | { |
1920 | 1 | unsigned int vector; |
1921 | 1 | |
1922 | 1 | /* Replace early pagefault with real pagefault handler. */ |
1923 | 1 | set_intr_gate(TRAP_page_fault, &page_fault); |
1924 | 1 | |
1925 | 1 | pv_trap_init(); |
1926 | 1 | |
1927 | 257 | for ( vector = 0; vector < NR_VECTORS; ++vector ) |
1928 | 256 | { |
1929 | 256 | if ( autogen_entrypoints[vector] ) |
1930 | 236 | { |
1931 | 236 | /* Found autogen entry: check we won't clobber an existing trap. */ |
1932 | 236 | ASSERT(idt_table[vector].b == 0); |
1933 | 236 | set_intr_gate(vector, autogen_entrypoints[vector]); |
1934 | 236 | } |
1935 | 256 | else |
1936 | 20 | { |
1937 | 20 | /* No entry point: confirm we have an existing trap in place. */ |
1938 | 20 | ASSERT(idt_table[vector].b != 0); |
1939 | 20 | } |
1940 | 256 | } |
1941 | 1 | |
1942 | 1 | percpu_traps_init(); |
1943 | 1 | |
1944 | 1 | cpu_init(); |
1945 | 1 | |
1946 | 1 | open_softirq(PCI_SERR_SOFTIRQ, pci_serr_softirq); |
1947 | 1 | } |
1948 | | |
1949 | | void activate_debugregs(const struct vcpu *curr) |
1950 | 0 | { |
1951 | 0 | ASSERT(curr == current); |
1952 | 0 |
1953 | 0 | write_debugreg(0, curr->arch.debugreg[0]); |
1954 | 0 | write_debugreg(1, curr->arch.debugreg[1]); |
1955 | 0 | write_debugreg(2, curr->arch.debugreg[2]); |
1956 | 0 | write_debugreg(3, curr->arch.debugreg[3]); |
1957 | 0 | write_debugreg(6, curr->arch.debugreg[6]); |
1958 | 0 |
1959 | 0 | /* |
1960 | 0 | * Avoid writing a value that is about to be replaced anyway when we |
1961 | 0 | * are called from set_debugreg() below. Future callers will need to |
1962 | 0 | * take this into account. |
1963 | 0 | */ |
1964 | 0 | if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK ) |
1965 | 0 | write_debugreg(7, curr->arch.debugreg[7]); |
1966 | 0 |
1967 | 0 | if ( boot_cpu_has(X86_FEATURE_DBEXT) ) |
1968 | 0 | { |
1969 | 0 | wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[0]); |
1970 | 0 | wrmsrl(MSR_AMD64_DR1_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[1]); |
1971 | 0 | wrmsrl(MSR_AMD64_DR2_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[2]); |
1972 | 0 | wrmsrl(MSR_AMD64_DR3_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[3]); |
1973 | 0 | } |
1974 | 0 | } |
1975 | | |
1976 | | long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value) |
1977 | 0 | { |
1978 | 0 | int i; |
1979 | 0 | struct vcpu *curr = current; |
1980 | 0 |
1981 | 0 | switch ( reg ) |
1982 | 0 | { |
1983 | 0 | case 0: |
1984 | 0 | if ( !access_ok(value, sizeof(long)) ) |
1985 | 0 | return -EPERM; |
1986 | 0 | if ( v == curr ) |
1987 | 0 | write_debugreg(0, value); |
1988 | 0 | break; |
1989 | 0 | case 1: |
1990 | 0 | if ( !access_ok(value, sizeof(long)) ) |
1991 | 0 | return -EPERM; |
1992 | 0 | if ( v == curr ) |
1993 | 0 | write_debugreg(1, value); |
1994 | 0 | break; |
1995 | 0 | case 2: |
1996 | 0 | if ( !access_ok(value, sizeof(long)) ) |
1997 | 0 | return -EPERM; |
1998 | 0 | if ( v == curr ) |
1999 | 0 | write_debugreg(2, value); |
2000 | 0 | break; |
2001 | 0 | case 3: |
2002 | 0 | if ( !access_ok(value, sizeof(long)) ) |
2003 | 0 | return -EPERM; |
2004 | 0 | if ( v == curr ) |
2005 | 0 | write_debugreg(3, value); |
2006 | 0 | break; |
2007 | 0 | case 6: |
2008 | 0 | /* |
2009 | 0 | * DR6: Bits 4-11,16-31 reserved (set to 1). |
2010 | 0 | * Bit 12 reserved (set to 0). |
2011 | 0 | */ |
2012 | 0 | value &= ~DR_STATUS_RESERVED_ZERO; /* reserved bits => 0 */ |
2013 | 0 | value |= DR_STATUS_RESERVED_ONE; /* reserved bits => 1 */ |
2014 | 0 | if ( v == curr ) |
2015 | 0 | write_debugreg(6, value); |
2016 | 0 | break; |
2017 | 0 | case 7: |
2018 | 0 | /* |
2019 | 0 | * DR7: Bit 10 reserved (set to 1). |
2020 | 0 | * Bits 11-12,14-15 reserved (set to 0). |
2021 | 0 | */ |
2022 | 0 | value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */ |
2023 | 0 | value |= DR_CONTROL_RESERVED_ONE; /* reserved bits => 1 */ |
2024 | 0 | /* |
2025 | 0 | * Privileged bits: |
2026 | 0 | * GD (bit 13): must be 0. |
2027 | 0 | */ |
2028 | 0 | if ( value & DR_GENERAL_DETECT ) |
2029 | 0 | return -EPERM; |
2030 | 0 | /* DR7.{G,L}E = 0 => debugging disabled for this domain. */ |
2031 | 0 | if ( value & DR7_ACTIVE_MASK ) |
2032 | 0 | { |
2033 | 0 | unsigned int io_enable = 0; |
2034 | 0 |
2035 | 0 | for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE ) |
2036 | 0 | { |
2037 | 0 | if ( ((value >> i) & 3) == DR_IO ) |
2038 | 0 | { |
2039 | 0 | if ( !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ) |
2040 | 0 | return -EPERM; |
2041 | 0 | io_enable |= value & (3 << ((i - 16) >> 1)); |
2042 | 0 | } |
2043 | 0 | } |
2044 | 0 |
2045 | 0 | /* Guest DR5 is a handy stash for I/O intercept information. */ |
2046 | 0 | v->arch.debugreg[5] = io_enable; |
2047 | 0 | value &= ~io_enable; |
2048 | 0 |
2049 | 0 | /* |
2050 | 0 | * If DR7 was previously clear then we need to load all other |
2051 | 0 | * debug registers at this point as they were not restored during |
2052 | 0 | * context switch. |
2053 | 0 | */ |
2054 | 0 | if ( (v == curr) && |
2055 | 0 | !(v->arch.debugreg[7] & DR7_ACTIVE_MASK) ) |
2056 | 0 | { |
2057 | 0 | activate_debugregs(v); |
2058 | 0 | break; |
2059 | 0 | } |
2060 | 0 | } |
2061 | 0 | if ( v == curr ) |
2062 | 0 | write_debugreg(7, value); |
2063 | 0 | break; |
2064 | 0 | default: |
2065 | 0 | return -EINVAL; |
2066 | 0 | } |
2067 | 0 |
2068 | 0 | v->arch.debugreg[reg] = value; |
2069 | 0 | return 0; |
2070 | 0 | } |
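
The DR7 scan in case 7 above walks the four 2-bit R/W condition fields, which begin at bit 16 and repeat every 4 bits (the other two bits of each group encode the breakpoint length); the R/W encoding 2 selects an I/O breakpoint, legal only with CR4.DE set. A standalone decoder over the same layout, with the constants written out locally:

    #include <stdint.h>
    #include <stdio.h>

    #define DR_CONTROL_SHIFT 16  /* first R/W field in DR7      */
    #define DR_CONTROL_SIZE  4   /* R/W (2 bits) + LEN (2 bits) */
    #define DR_IO            2   /* R/W encoding: break on I/O  */

    static void scan_dr7(uint32_t dr7)
    {
        unsigned int i, bp;

        for ( i = DR_CONTROL_SHIFT, bp = 0; i < 32; i += DR_CONTROL_SIZE, ++bp )
        {
            unsigned int rw = (dr7 >> i) & 3;

            if ( rw == DR_IO )
                printf("breakpoint %u: I/O breakpoint (needs CR4.DE)\n", bp);
        }
    }

    int main(void)
    {
        scan_dr7(2u << 16);                /* DR0 condition = I/O */
        scan_dr7((2u << 16) | (2u << 28)); /* DR0 and DR3         */
        return 0;
    }
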
2071 | | |
2072 | | void asm_domain_crash_synchronous(unsigned long addr) |
2073 | 0 | { |
2074 | 0 | /* |
2075 | 0 | * We need to clear the AC bit here because in entry.S AC is set |
2076 | 0 | * by ASM_STAC to temporarily allow accesses to user pages, which |
2077 | 0 | * SMAP otherwise prevents. |
2078 | 0 | * |
2079 | 0 | * On some of the code paths that reach this function, clac() is |
2080 | 0 | * not strictly needed, but doing it here rather than at every |
2081 | 0 | * call site of asm_domain_crash_synchronous() reduces code |
2082 | 0 | * redundancy and is harmless. |
2083 | 0 | */ |
2084 | 0 | clac(); |
2085 | 0 |
2086 | 0 | if ( addr == 0 ) |
2087 | 0 | addr = this_cpu(last_extable_addr); |
2088 | 0 |
2089 | 0 | printk("domain_crash_sync called from entry.S: fault at %p %pS\n", |
2090 | 0 | _p(addr), _p(addr)); |
2091 | 0 |
2092 | 0 | __domain_crash_synchronous(); |
2093 | 0 | } |
2094 | | |
2095 | | /* |
2096 | | * Local variables: |
2097 | | * mode: C |
2098 | | * c-file-style: "BSD" |
2099 | | * c-basic-offset: 4 |
2100 | | * tab-width: 4 |
2101 | | * indent-tabs-mode: nil |
2102 | | * End: |
2103 | | */ |