Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/arch/x86/traps.c
Line
Count
Source
1
/******************************************************************************
2
 * arch/x86/traps.c
3
 *
4
 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5
 *
6
 * This program is free software; you can redistribute it and/or modify
7
 * it under the terms of the GNU General Public License as published by
8
 * the Free Software Foundation; either version 2 of the License, or
9
 * (at your option) any later version.
10
 *
11
 * This program is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU General Public License
17
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
18
 */
19
20
/*
21
 *  Copyright (C) 1991, 1992  Linus Torvalds
22
 *
23
 *  Pentium III FXSR, SSE support
24
 * Gareth Hughes <gareth@valinux.com>, May 2000
25
 */
26
27
#include <xen/init.h>
28
#include <xen/sched.h>
29
#include <xen/lib.h>
30
#include <xen/err.h>
31
#include <xen/errno.h>
32
#include <xen/mm.h>
33
#include <xen/console.h>
34
#include <xen/shutdown.h>
35
#include <xen/guest_access.h>
36
#include <asm/regs.h>
37
#include <xen/delay.h>
38
#include <xen/event.h>
39
#include <xen/spinlock.h>
40
#include <xen/irq.h>
41
#include <xen/perfc.h>
42
#include <xen/softirq.h>
43
#include <xen/domain_page.h>
44
#include <xen/symbols.h>
45
#include <xen/iocap.h>
46
#include <xen/version.h>
47
#include <xen/kexec.h>
48
#include <xen/trace.h>
49
#include <xen/paging.h>
50
#include <xen/virtual_region.h>
51
#include <xen/watchdog.h>
52
#include <xen/livepatch.h>
53
#include <asm/system.h>
54
#include <asm/io.h>
55
#include <asm/atomic.h>
56
#include <xen/bitops.h>
57
#include <asm/desc.h>
58
#include <asm/debugreg.h>
59
#include <asm/smp.h>
60
#include <asm/flushtlb.h>
61
#include <asm/uaccess.h>
62
#include <asm/i387.h>
63
#include <asm/xstate.h>
64
#include <asm/debugger.h>
65
#include <asm/msr.h>
66
#include <asm/nmi.h>
67
#include <asm/shared.h>
68
#include <asm/x86_emulate.h>
69
#include <asm/traps.h>
70
#include <asm/hvm/vpt.h>
71
#include <asm/hypercall.h>
72
#include <asm/mce.h>
73
#include <asm/apic.h>
74
#include <asm/mc146818rtc.h>
75
#include <asm/hpet.h>
76
#include <asm/vpmu.h>
77
#include <public/arch-x86/cpuid.h>
78
#include <asm/cpuid.h>
79
#include <xsm/xsm.h>
80
#include <asm/pv/traps.h>
81
#include <asm/pv/mm.h>
82
83
/*
84
 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
85
 *  fatal:  Xen prints diagnostic message and then hangs.
86
 *  dom0:   The NMI is virtualised to DOM0.
87
 *  ignore: The NMI error is cleared and ignored.
88
 */
89
#ifdef NDEBUG
90
static char __read_mostly opt_nmi[10] = "dom0";
91
#else
92
static char __read_mostly opt_nmi[10] = "fatal";
93
#endif
94
string_param("nmi", opt_nmi);
95
96
DEFINE_PER_CPU(u64, efer);
97
static DEFINE_PER_CPU(unsigned long, last_extable_addr);
98
99
DEFINE_PER_CPU_READ_MOSTLY(u32, ler_msr);
100
101
DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, gdt_table);
102
DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, compat_gdt_table);
103
104
/* Master table, used by CPU0. */
105
idt_entry_t idt_table[IDT_ENTRIES];
106
107
/* Pointer to the IDT of every CPU. */
108
idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
109
110
void (*ioemul_handle_quirk)(
111
    u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
112
113
static int debug_stack_lines = 20;
114
integer_param("debug_stack_lines", debug_stack_lines);
115
116
static bool opt_ler;
117
boolean_param("ler", opt_ler);
118
119
0
#define stack_words_per_line 4
120
0
#define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
121
122
static void show_code(const struct cpu_user_regs *regs)
123
0
{
124
0
    unsigned char insns_before[8] = {}, insns_after[16] = {};
125
0
    unsigned int i, tmp, missing_before, missing_after;
126
0
127
0
    if ( guest_mode(regs) )
128
0
        return;
129
0
130
0
    stac();
131
0
132
0
    /*
133
0
     * Copy forward from regs->rip.  In the case of a fault, %ecx contains the
134
0
     * number of bytes remaining to copy.
135
0
     */
136
0
    asm volatile ("1: rep movsb; 2:"
137
0
                  _ASM_EXTABLE(1b, 2b)
138
0
                  : "=&c" (missing_after),
139
0
                    "=&D" (tmp), "=&S" (tmp)
140
0
                  : "0" (ARRAY_SIZE(insns_after)),
141
0
                    "1" (insns_after),
142
0
                    "2" (regs->rip));
143
0
144
0
    /*
145
0
     * Copy backwards from regs->rip - 1.  In the case of a fault, %ecx
146
0
     * contains the number of bytes remaining to copy.
147
0
     */
148
0
    asm volatile ("std;"
149
0
                  "1: rep movsb;"
150
0
                  "2: cld;"
151
0
                  _ASM_EXTABLE(1b, 2b)
152
0
                  : "=&c" (missing_before),
153
0
                    "=&D" (tmp), "=&S" (tmp)
154
0
                  : "0" (ARRAY_SIZE(insns_before)),
155
0
                    "1" (insns_before + ARRAY_SIZE(insns_before) - 1),
156
0
                    "2" (regs->rip - 1));
157
0
    clac();
158
0
159
0
    printk("Xen code around <%p> (%ps)%s:\n",
160
0
           _p(regs->rip), _p(regs->rip),
161
0
           (missing_before || missing_after) ? " [fault on access]" : "");
162
0
163
0
    /* Print bytes from insns_before[]. */
164
0
    for ( i = 0; i < ARRAY_SIZE(insns_before); ++i )
165
0
    {
166
0
        if ( i < missing_before )
167
0
            printk(" --");
168
0
        else
169
0
            printk(" %02x", insns_before[i]);
170
0
    }
171
0
172
0
    /* Print the byte under %rip. */
173
0
    if ( missing_after != ARRAY_SIZE(insns_after) )
174
0
        printk(" <%02x>", insns_after[0]);
175
0
    else
176
0
        printk(" <-->");
177
0
178
0
    /* Print bytes from insns_after[]. */
179
0
    for ( i = 1; i < ARRAY_SIZE(insns_after); ++i )
180
0
    {
181
0
        if ( i < (ARRAY_SIZE(insns_after) - missing_after) )
182
0
            printk(" %02x", insns_after[i]);
183
0
        else
184
0
            printk(" --");
185
0
    }
186
0
187
0
    printk("\n");
188
0
}
189
190
static void compat_show_guest_stack(struct vcpu *v,
191
                                    const struct cpu_user_regs *regs,
192
                                    int debug_stack_lines)
193
0
{
194
0
    unsigned int i, *stack, addr, mask = STACK_SIZE;
195
0
196
0
    stack = (unsigned int *)(unsigned long)regs->esp;
197
0
    printk("Guest stack trace from esp=%08lx:\n ", (unsigned long)stack);
198
0
199
0
    if ( !__compat_access_ok(v->domain, stack, sizeof(*stack)) )
200
0
    {
201
0
        printk("Guest-inaccessible memory.\n");
202
0
        return;
203
0
    }
204
0
205
0
    if ( v != current )
206
0
    {
207
0
        struct vcpu *vcpu;
208
0
        unsigned long mfn;
209
0
210
0
        ASSERT(guest_kernel_mode(v, regs));
211
0
        mfn = read_cr3() >> PAGE_SHIFT;
212
0
        for_each_vcpu( v->domain, vcpu )
213
0
            if ( pagetable_get_pfn(vcpu->arch.guest_table) == mfn )
214
0
                break;
215
0
        if ( !vcpu )
216
0
        {
217
0
            stack = do_page_walk(v, (unsigned long)stack);
218
0
            if ( (unsigned long)stack < PAGE_SIZE )
219
0
            {
220
0
                printk("Inaccessible guest memory.\n");
221
0
                return;
222
0
            }
223
0
            mask = PAGE_SIZE;
224
0
        }
225
0
    }
226
0
227
0
    for ( i = 0; i < debug_stack_lines * 8; i++ )
228
0
    {
229
0
        if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & mask )
230
0
            break;
231
0
        if ( __get_user(addr, stack) )
232
0
        {
233
0
            if ( i != 0 )
234
0
                printk("\n    ");
235
0
            printk("Fault while accessing guest memory.");
236
0
            i = 1;
237
0
            break;
238
0
        }
239
0
        if ( (i != 0) && ((i % 8) == 0) )
240
0
            printk("\n ");
241
0
        printk(" %08x", addr);
242
0
        stack++;
243
0
    }
244
0
    if ( mask == PAGE_SIZE )
245
0
    {
246
0
        BUILD_BUG_ON(PAGE_SIZE == STACK_SIZE);
247
0
        unmap_domain_page(stack);
248
0
    }
249
0
    if ( i == 0 )
250
0
        printk("Stack empty.");
251
0
    printk("\n");
252
0
}
253
254
static void show_guest_stack(struct vcpu *v, const struct cpu_user_regs *regs)
255
0
{
256
0
    int i;
257
0
    unsigned long *stack, addr;
258
0
    unsigned long mask = STACK_SIZE;
259
0
260
0
    /* Avoid HVM as we don't know what the stack looks like. */
261
0
    if ( is_hvm_vcpu(v) )
262
0
        return;
263
0
264
0
    if ( is_pv_32bit_vcpu(v) )
265
0
    {
266
0
        compat_show_guest_stack(v, regs, debug_stack_lines);
267
0
        return;
268
0
    }
269
0
270
0
    stack = (unsigned long *)regs->rsp;
271
0
    printk("Guest stack trace from "__OP"sp=%p:\n  ", stack);
272
0
273
0
    if ( !access_ok(stack, sizeof(*stack)) )
274
0
    {
275
0
        printk("Guest-inaccessible memory.\n");
276
0
        return;
277
0
    }
278
0
279
0
    if ( v != current )
280
0
    {
281
0
        struct vcpu *vcpu;
282
0
283
0
        ASSERT(guest_kernel_mode(v, regs));
284
0
        vcpu = maddr_get_owner(read_cr3()) == v->domain ? v : NULL;
285
0
        if ( !vcpu )
286
0
        {
287
0
            stack = do_page_walk(v, (unsigned long)stack);
288
0
            if ( (unsigned long)stack < PAGE_SIZE )
289
0
            {
290
0
                printk("Inaccessible guest memory.\n");
291
0
                return;
292
0
            }
293
0
            mask = PAGE_SIZE;
294
0
        }
295
0
    }
296
0
297
0
    for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
298
0
    {
299
0
        if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & mask )
300
0
            break;
301
0
        if ( __get_user(addr, stack) )
302
0
        {
303
0
            if ( i != 0 )
304
0
                printk("\n    ");
305
0
            printk("Fault while accessing guest memory.");
306
0
            i = 1;
307
0
            break;
308
0
        }
309
0
        if ( (i != 0) && ((i % stack_words_per_line) == 0) )
310
0
            printk("\n  ");
311
0
        printk(" %p", _p(addr));
312
0
        stack++;
313
0
    }
314
0
    if ( mask == PAGE_SIZE )
315
0
    {
316
0
        BUILD_BUG_ON(PAGE_SIZE == STACK_SIZE);
317
0
        unmap_domain_page(stack);
318
0
    }
319
0
    if ( i == 0 )
320
0
        printk("Stack empty.");
321
0
    printk("\n");
322
0
}
323
324
/*
325
 * Notes for get_stack_trace_bottom() and get_stack_dump_bottom()
326
 *
327
 * Stack pages 0, 1 and 2:
328
 *   These are all 1-page IST stacks.  Each of these stacks has an exception
329
 *   frame and saved register state at the top.  The interesting bound for a
330
 *   trace is the word adjacent to this, while the bound for a dump is the
331
 *   very top, including the exception frame.
332
 *
333
 * Stack pages 3, 4 and 5:
334
 *   None of these are particularly interesting.  With MEMORY_GUARD, page 5 is
335
 *   explicitly not present, so attempting to dump or trace it is
336
 *   counterproductive.  Without MEMORY_GUARD, it is possible for a call chain
337
 *   to use the entire primary stack and wander into page 5.  In this case,
338
 *   consider these pages an extension of the primary stack to aid debugging
339
 *   hopefully rare situations where the primary stack has effectively been
340
 *   overflowed.
341
 *
342
 * Stack pages 6 and 7:
343
 *   These form the primary stack, and have a cpu_info at the top.  For a
344
 *   trace, the interesting bound is adjacent to the cpu_info, while for a
345
 *   dump, the entire cpu_info is interesting.
346
 *
347
 * For the cases where the stack should not be inspected, pretend that the
348
 * passed stack pointer is already out of reasonable bounds.
349
 */
350
unsigned long get_stack_trace_bottom(unsigned long sp)
351
0
{
352
0
    switch ( get_stack_page(sp) )
353
0
    {
354
0
    case 0 ... 2:
355
0
        return ROUNDUP(sp, PAGE_SIZE) -
356
0
            offsetof(struct cpu_user_regs, es) - sizeof(unsigned long);
357
0
358
0
#ifndef MEMORY_GUARD
359
    case 3 ... 5:
360
#endif
361
0
    case 6 ... 7:
362
0
        return ROUNDUP(sp, STACK_SIZE) -
363
0
            sizeof(struct cpu_info) - sizeof(unsigned long);
364
0
365
0
    default:
366
0
        return sp - sizeof(unsigned long);
367
0
    }
368
0
}
369
370
unsigned long get_stack_dump_bottom(unsigned long sp)
371
0
{
372
0
    switch ( get_stack_page(sp) )
373
0
    {
374
0
    case 0 ... 2:
375
0
        return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long);
376
0
377
0
#ifndef MEMORY_GUARD
378
    case 3 ... 5:
379
#endif
380
0
    case 6 ... 7:
381
0
        return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long);
382
0
383
0
    default:
384
0
        return sp - sizeof(unsigned long);
385
0
    }
386
0
}
387
388
#if !defined(CONFIG_FRAME_POINTER)
389
390
/*
391
 * Stack trace from pointers found in stack, unaided by frame pointers.  For
392
 * caller convenience, this has the same prototype as its alternative, and
393
 * simply ignores the base pointer parameter.
394
 */
395
static void _show_trace(unsigned long sp, unsigned long __maybe_unused bp)
396
{
397
    unsigned long *stack = (unsigned long *)sp, addr;
398
    unsigned long *bottom = (unsigned long *)get_stack_trace_bottom(sp);
399
400
    while ( stack <= bottom )
401
    {
402
        addr = *stack++;
403
        if ( is_active_kernel_text(addr) )
404
            printk("   [<%p>] %pS\n", _p(addr), _p(addr));
405
    }
406
}
407
408
#else
409
410
/* Stack trace from frames in the stack, using frame pointers */
411
static void _show_trace(unsigned long sp, unsigned long bp)
412
0
{
413
0
    unsigned long *frame, next, addr;
414
0
415
0
    /* Bounds for range of valid frame pointer. */
416
0
    unsigned long low = sp, high = get_stack_trace_bottom(sp);
417
0
418
0
    /* The initial frame pointer. */
419
0
    next = bp;
420
0
421
0
    for ( ; ; )
422
0
    {
423
0
        /* Valid frame pointer? */
424
0
        if ( (next < low) || (next >= high) )
425
0
        {
426
0
            /*
427
0
             * Exception stack frames have a different layout, denoted by an
428
0
             * inverted frame pointer.
429
0
             */
430
0
            next = ~next;
431
0
            if ( (next < low) || (next >= high) )
432
0
                break;
433
0
            frame = (unsigned long *)next;
434
0
            next  = frame[0];
435
0
            addr  = frame[(offsetof(struct cpu_user_regs, rip) -
436
0
                           offsetof(struct cpu_user_regs, rbp))
437
0
                         / BYTES_PER_LONG];
438
0
        }
439
0
        else
440
0
        {
441
0
            /* Ordinary stack frame. */
442
0
            frame = (unsigned long *)next;
443
0
            next  = frame[0];
444
0
            addr  = frame[1];
445
0
        }
446
0
447
0
        printk("   [<%p>] %pS\n", _p(addr), _p(addr));
448
0
449
0
        low = (unsigned long)&frame[2];
450
0
    }
451
0
}
452
453
#endif
454
455
static void show_trace(const struct cpu_user_regs *regs)
456
0
{
457
0
    unsigned long *sp = ESP_BEFORE_EXCEPTION(regs);
458
0
459
0
    printk("Xen call trace:\n");
460
0
461
0
    /*
462
0
     * If RIP looks sensible, or the top of the stack doesn't, print RIP at
463
0
     * the top of the stack trace.
464
0
     */
465
0
    if ( is_active_kernel_text(regs->rip) ||
466
0
         !is_active_kernel_text(*sp) )
467
0
        printk("   [<%p>] %pS\n", _p(regs->rip), _p(regs->rip));
468
0
    /*
469
0
     * Else RIP looks bad but the top of the stack looks good.  Perhaps we
470
0
     * followed a wild function pointer? Let's assume the top of the stack is a
471
0
     * return address; print it and skip past so _show_trace() doesn't print
472
0
     * it again.
473
0
     */
474
0
    else
475
0
    {
476
0
        printk("   [<%p>] %pS\n", _p(*sp), _p(*sp));
477
0
        sp++;
478
0
    }
479
0
480
0
    _show_trace((unsigned long)sp, regs->rbp);
481
0
482
0
    printk("\n");
483
0
}
484
485
void show_stack(const struct cpu_user_regs *regs)
486
0
{
487
0
    unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), *stack_bottom, addr;
488
0
    int i;
489
0
490
0
    if ( guest_mode(regs) )
491
0
        return show_guest_stack(current, regs);
492
0
493
0
    printk("Xen stack trace from "__OP"sp=%p:\n  ", stack);
494
0
495
0
    stack_bottom = _p(get_stack_dump_bottom(regs->rsp));
496
0
497
0
    for ( i = 0; i < (debug_stack_lines*stack_words_per_line) &&
498
0
              (stack <= stack_bottom); i++ )
499
0
    {
500
0
        if ( (i != 0) && ((i % stack_words_per_line) == 0) )
501
0
            printk("\n  ");
502
0
        addr = *stack++;
503
0
        printk(" %p", _p(addr));
504
0
    }
505
0
    if ( i == 0 )
506
0
        printk("Stack empty.");
507
0
    printk("\n");
508
0
509
0
    show_trace(regs);
510
0
}
511
512
void show_stack_overflow(unsigned int cpu, const struct cpu_user_regs *regs)
513
0
{
514
0
    unsigned long esp = regs->rsp;
515
0
    unsigned long curr_stack_base = esp & ~(STACK_SIZE - 1);
516
0
#ifdef MEMORY_GUARD
517
0
    unsigned long esp_top, esp_bottom;
518
0
#endif
519
0
520
0
    if ( _p(curr_stack_base) != stack_base[cpu] )
521
0
        printk("Current stack base %p differs from expected %p\n",
522
0
               _p(curr_stack_base), stack_base[cpu]);
523
0
524
0
#ifdef MEMORY_GUARD
525
0
    esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
526
0
    esp_top    = esp_bottom - PRIMARY_STACK_SIZE;
527
0
528
0
    printk("Valid stack range: %p-%p, sp=%p, tss.rsp0=%p\n",
529
0
           (void *)esp_top, (void *)esp_bottom, (void *)esp,
530
0
           (void *)per_cpu(init_tss, cpu).rsp0);
531
0
532
0
    /*
533
0
     * Trigger overflow trace if %esp is anywhere within the guard page, or
534
0
     * with fewer than 512 bytes remaining on the primary stack.
535
0
     */
536
0
    if ( (esp > (esp_top + 512)) ||
537
0
         (esp < (esp_top - PAGE_SIZE)) )
538
0
    {
539
0
        printk("No stack overflow detected. Skipping stack trace.\n");
540
0
        return;
541
0
    }
542
0
543
0
    if ( esp < esp_top )
544
0
        esp = esp_top;
545
0
546
0
    printk("Xen stack overflow (dumping trace %p-%p):\n",
547
0
           (void *)esp, (void *)esp_bottom);
548
0
549
0
    _show_trace(esp, regs->rbp);
550
0
551
0
    printk("\n");
552
0
#endif
553
0
}
554
555
void show_execution_state(const struct cpu_user_regs *regs)
556
0
{
557
0
    /* Prevent interleaving of output. */
558
0
    unsigned long flags = console_lock_recursive_irqsave();
559
0
560
0
    show_registers(regs);
561
0
    show_code(regs);
562
0
    show_stack(regs);
563
0
564
0
    console_unlock_recursive_irqrestore(flags);
565
0
}
566
567
void vcpu_show_execution_state(struct vcpu *v)
568
0
{
569
0
    unsigned long flags;
570
0
571
0
    printk("*** Dumping Dom%d vcpu#%d state: ***\n",
572
0
           v->domain->domain_id, v->vcpu_id);
573
0
574
0
    if ( v == current )
575
0
    {
576
0
        show_execution_state(guest_cpu_user_regs());
577
0
        return;
578
0
    }
579
0
580
0
    vcpu_pause(v); /* acceptably dangerous */
581
0
582
0
    /* Prevent interleaving of output. */
583
0
    flags = console_lock_recursive_irqsave();
584
0
585
0
    vcpu_show_registers(v);
586
0
    if ( guest_kernel_mode(v, &v->arch.user_regs) )
587
0
        show_guest_stack(v, &v->arch.user_regs);
588
0
589
0
    console_unlock_recursive_irqrestore(flags);
590
0
591
0
    vcpu_unpause(v);
592
0
}
593
594
static cpumask_t show_state_mask;
595
static bool opt_show_all;
596
boolean_param("async-show-all", opt_show_all);
597
598
static int nmi_show_execution_state(const struct cpu_user_regs *regs, int cpu)
599
0
{
600
0
    if ( !cpumask_test_cpu(cpu, &show_state_mask) )
601
0
        return 0;
602
0
603
0
    if ( opt_show_all )
604
0
        show_execution_state(regs);
605
0
    else
606
0
        printk(XENLOG_ERR "CPU%d @ %04x:%08lx (%pS)\n", cpu, regs->cs,
607
0
               regs->rip, guest_mode(regs) ? _p(regs->rip) : NULL);
608
0
    cpumask_clear_cpu(cpu, &show_state_mask);
609
0
610
0
    return 1;
611
0
}
612
613
const char *trapstr(unsigned int trapnr)
614
0
{
615
0
    static const char * const strings[] = {
616
0
        "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
617
0
        "invalid opcode", "device not available", "double fault",
618
0
        "coprocessor segment", "invalid tss", "segment not found",
619
0
        "stack error", "general protection fault", "page fault",
620
0
        "spurious interrupt", "coprocessor error", "alignment check",
621
0
        "machine check", "simd error", "virtualisation exception"
622
0
    };
623
0
624
0
    return trapnr < ARRAY_SIZE(strings) ? strings[trapnr] : "???";
625
0
}
626
627
/*
628
 * This is called for faults at very unexpected times (e.g., when interrupts
629
 * are disabled). In such situations we can't do much that is safe. We try to
630
 * print out some tracing and then we just spin.
631
 */
632
void fatal_trap(const struct cpu_user_regs *regs, bool show_remote)
633
0
{
634
0
    static DEFINE_PER_CPU(char, depth);
635
0
    unsigned int trapnr = regs->entry_vector;
636
0
637
0
    /* Set AC to reduce chance of further SMAP faults */
638
0
    stac();
639
0
640
0
    /*
641
0
     * In some cases, we can end up in a vicious cycle of fatal_trap()s
642
0
     * within fatal_trap()s. We give the problem a couple of iterations to
643
0
     * bottom out, and then we just panic.
644
0
     */
645
0
    if ( ++this_cpu(depth) < 3 )
646
0
    {
647
0
        watchdog_disable();
648
0
        console_start_sync();
649
0
650
0
        show_execution_state(regs);
651
0
652
0
        if ( trapnr == TRAP_page_fault )
653
0
        {
654
0
            unsigned long cr2 = read_cr2();
655
0
            printk("Faulting linear address: %p\n", _p(cr2));
656
0
            show_page_walk(cr2);
657
0
        }
658
0
659
0
        if ( show_remote )
660
0
        {
661
0
            unsigned int msecs, pending;
662
0
663
0
            cpumask_andnot(&show_state_mask, &cpu_online_map,
664
0
                           cpumask_of(smp_processor_id()));
665
0
            set_nmi_callback(nmi_show_execution_state);
666
0
            /* Ensure new callback is set before sending out the NMI. */
667
0
            smp_wmb();
668
0
            smp_send_nmi_allbutself();
669
0
670
0
            /* Wait at most 10ms for some other CPU to respond. */
671
0
            msecs = 10;
672
0
            pending = cpumask_weight(&show_state_mask);
673
0
            while ( pending && msecs-- )
674
0
            {
675
0
                unsigned int left;
676
0
677
0
                mdelay(1);
678
0
                left = cpumask_weight(&show_state_mask);
679
0
                if ( left < pending )
680
0
                {
681
0
                    pending = left;
682
0
                    msecs = 10;
683
0
                }
684
0
            }
685
0
        }
686
0
    }
687
0
688
0
    panic("FATAL TRAP: vector = %d (%s)\n"
689
0
          "[error_code=%04x] %s",
690
0
          trapnr, trapstr(trapnr), regs->error_code,
691
0
          (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
692
0
}
693
694
void do_reserved_trap(struct cpu_user_regs *regs)
695
0
{
696
0
    unsigned int trapnr = regs->entry_vector;
697
0
698
0
    if ( debugger_trap_fatal(trapnr, regs) )
699
0
        return;
700
0
701
0
    show_execution_state(regs);
702
0
    panic("FATAL RESERVED TRAP %#x: %s", trapnr, trapstr(trapnr));
703
0
}
704
705
void do_trap(struct cpu_user_regs *regs)
706
1
{
707
1
    struct vcpu *curr = current;
708
1
    unsigned int trapnr = regs->entry_vector;
709
1
    unsigned long fixup;
710
1
711
1
    if ( regs->error_code & X86_XEC_EXT )
712
0
        goto hardware_trap;
713
1
714
1
    if ( debugger_trap_entry(trapnr, regs) )
715
0
        return;
716
1
717
1
    ASSERT(trapnr < 32);
718
1
719
1
    if ( guest_mode(regs) )
720
0
    {
721
0
        pv_inject_hw_exception(trapnr,
722
0
                               (TRAP_HAVE_EC & (1u << trapnr))
723
0
                               ? regs->error_code : X86_EVENT_NO_EC);
724
0
        return;
725
0
    }
726
1
727
1
    if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) &&
728
0
         system_state >= SYS_STATE_active && is_hvm_vcpu(curr) &&
729
0
         curr->arch.hvm_vcpu.fpu_exception_callback )
730
0
    {
731
0
        curr->arch.hvm_vcpu.fpu_exception_callback(
732
0
            curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs);
733
0
        return;
734
0
    }
735
1
736
1
    if ( likely((fixup = search_exception_table(regs)) != 0) )
737
1
    {
738
1
        dprintk(XENLOG_ERR, "Trap %u: %p [%ps] -> %p\n",
739
1
                trapnr, _p(regs->rip), _p(regs->rip), _p(fixup));
740
1
        this_cpu(last_extable_addr) = regs->rip;
741
1
        regs->rip = fixup;
742
1
        return;
743
1
    }
744
1
745
0
 hardware_trap:
746
0
    if ( debugger_trap_fatal(trapnr, regs) )
747
0
        return;
748
0
749
0
    show_execution_state(regs);
750
0
    panic("FATAL TRAP: vector = %d (%s)\n"
751
0
          "[error_code=%04x]",
752
0
          trapnr, trapstr(trapnr), regs->error_code);
753
0
}
754
755
/* Returns 0 if not handled, and non-0 for success. */
756
int rdmsr_hypervisor_regs(uint32_t idx, uint64_t *val)
757
0
{
758
0
    struct domain *d = current->domain;
759
0
    /* Optionally shift out of the way of Viridian architectural MSRs. */
760
0
    uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
761
0
762
0
    switch ( idx - base )
763
0
    {
764
0
    case 0: /* Write hypercall page MSR.  Read as zero. */
765
0
    {
766
0
        *val = 0;
767
0
        return 1;
768
0
    }
769
0
    }
770
0
771
0
    return 0;
772
0
}
773
774
/* Returns 1 if handled, 0 if not and -Exx for error. */
775
int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val)
776
2
{
777
2
    struct domain *d = current->domain;
778
2
    /* Optionally shift out of the way of Viridian architectural MSRs. */
779
2
    uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
780
2
781
2
    switch ( idx - base )
782
2
    {
783
2
    case 0: /* Write hypercall page */
784
2
    {
785
2
        void *hypercall_page;
786
2
        unsigned long gmfn = val >> PAGE_SHIFT;
787
2
        unsigned int page_index = val & (PAGE_SIZE - 1);
788
2
        struct page_info *page;
789
2
        p2m_type_t t;
790
2
791
2
        if ( page_index > 0 )
792
0
        {
793
0
            gdprintk(XENLOG_WARNING,
794
0
                     "wrmsr hypercall page index %#x unsupported\n",
795
0
                     page_index);
796
0
            return 0;
797
0
        }
798
2
799
2
        page = get_page_from_gfn(d, gmfn, &t, P2M_ALLOC);
800
2
801
2
        if ( !page || !get_page_type(page, PGT_writable_page) )
802
0
        {
803
0
            if ( page )
804
0
                put_page(page);
805
0
806
0
            if ( p2m_is_paging(t) )
807
0
            {
808
0
                p2m_mem_paging_populate(d, gmfn);
809
0
                return -ERESTART;
810
0
            }
811
0
812
0
            gdprintk(XENLOG_WARNING,
813
0
                     "Bad GMFN %lx (MFN %lx) to MSR %08x\n",
814
0
                     gmfn, page ? page_to_mfn(page) : -1UL, base);
815
0
            return 0;
816
0
        }
817
2
818
2
        hypercall_page = __map_domain_page(page);
819
2
        hypercall_page_initialise(d, hypercall_page);
820
2
        unmap_domain_page(hypercall_page);
821
2
822
2
        put_page_and_type(page);
823
2
        return 1;
824
2
    }
825
2
    }
826
2
827
0
    return 0;
828
2
}
829
830
void cpuid_hypervisor_leaves(const struct vcpu *v, uint32_t leaf,
831
                             uint32_t subleaf, struct cpuid_leaf *res)
832
9
{
833
9
    const struct domain *d = v->domain;
834
9
    const struct cpuid_policy *p = d->arch.cpuid;
835
9
    uint32_t base = is_viridian_domain(d) ? 0x40000100 : 0x40000000;
836
9
    uint32_t idx  = leaf - base;
837
9
    unsigned int limit = is_viridian_domain(d) ? p->hv2_limit : p->hv_limit;
838
9
839
9
    if ( limit == 0 )
840
9
        /* Default number of leaves */
841
9
        limit = XEN_CPUID_MAX_NUM_LEAVES;
842
9
    else
843
9
        /* Clamp toolstack value between 2 and MAX_NUM_LEAVES. */
844
0
        limit = min(max(limit, 2u), XEN_CPUID_MAX_NUM_LEAVES + 0u);
845
9
846
9
    if ( idx > limit )
847
0
        return;
848
9
849
9
    switch ( idx )
850
9
    {
851
4
    case 0:
852
4
        res->a = base + limit; /* Largest leaf */
853
4
        res->b = XEN_CPUID_SIGNATURE_EBX;
854
4
        res->c = XEN_CPUID_SIGNATURE_ECX;
855
4
        res->d = XEN_CPUID_SIGNATURE_EDX;
856
4
        break;
857
4
858
2
    case 1:
859
2
        res->a = (xen_major_version() << 16) | xen_minor_version();
860
2
        break;
861
4
862
2
    case 2:
863
2
        res->a = 1;            /* Number of hypercall-transfer pages */
864
2
                               /* MSR base address */
865
2
        res->b = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
866
2
        if ( is_pv_domain(d) ) /* Features */
867
0
            res->c |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
868
2
        break;
869
4
870
0
    case 3: /* Time leaf. */
871
0
        switch ( subleaf )
872
0
        {
873
0
        case 0: /* features */
874
0
            res->a = ((d->arch.vtsc << 0) |
875
0
                      (!!host_tsc_is_safe() << 1) |
876
0
                      (!!boot_cpu_has(X86_FEATURE_RDTSCP) << 2));
877
0
            res->b = d->arch.tsc_mode;
878
0
            res->c = d->arch.tsc_khz;
879
0
            res->d = d->arch.incarnation;
880
0
            break;
881
0
882
0
        case 1: /* scale and offset */
883
0
        {
884
0
            uint64_t offset;
885
0
886
0
            if ( !d->arch.vtsc )
887
0
                offset = d->arch.vtsc_offset;
888
0
            else
889
0
                /* offset already applied to value returned by virtual rdtscp */
890
0
                offset = 0;
891
0
            res->a = offset;
892
0
            res->b = offset >> 32;
893
0
            res->c = d->arch.vtsc_to_ns.mul_frac;
894
0
            res->d = (s8)d->arch.vtsc_to_ns.shift;
895
0
            break;
896
0
        }
897
0
898
0
        case 2: /* physical cpu_khz */
899
0
            res->a = cpu_khz;
900
0
            break;
901
0
        }
902
0
        break;
903
0
904
1
    case 4: /* HVM hypervisor leaf. */
905
1
        if ( !is_hvm_domain(d) || subleaf != 0 )
906
1
            break;
907
1
908
0
        if ( cpu_has_vmx_apic_reg_virt )
909
0
            res->a |= XEN_HVM_CPUID_APIC_ACCESS_VIRT;
910
0
911
0
        /*
912
0
         * We want to claim that x2APIC is virtualized if APIC MSR accesses
913
0
         * are not intercepted. When all three of these are true both rdmsr
914
0
         * and wrmsr in the guest will run without VMEXITs (see
915
0
         * vmx_vlapic_msr_changed()).
916
0
         */
917
0
        if ( cpu_has_vmx_virtualize_x2apic_mode &&
918
0
             cpu_has_vmx_apic_reg_virt &&
919
0
             cpu_has_vmx_virtual_intr_delivery )
920
0
            res->a |= XEN_HVM_CPUID_X2APIC_VIRT;
921
0
922
0
        /*
923
0
         * Indicate that memory mapped from other domains (either grants or
924
0
         * foreign pages) has valid IOMMU entries.
925
0
         */
926
0
        res->a |= XEN_HVM_CPUID_IOMMU_MAPPINGS;
927
0
928
0
        /* Indicate presence of vcpu id and set it in ebx */
929
0
        res->a |= XEN_HVM_CPUID_VCPU_ID_PRESENT;
930
0
        res->b = v->vcpu_id;
931
0
        break;
932
1
933
0
    case 5: /* PV-specific parameters */
934
0
        if ( is_hvm_domain(d) || subleaf != 0 )
935
0
            break;
936
0
937
0
        res->b = flsl(get_upper_mfn_bound()) + PAGE_SHIFT;
938
0
        break;
939
0
940
0
    default:
941
0
        ASSERT_UNREACHABLE();
942
9
    }
943
9
}
944
945
void do_invalid_op(struct cpu_user_regs *regs)
946
2
{
947
2
    const struct bug_frame *bug = NULL;
948
2
    u8 bug_insn[2];
949
2
    const char *prefix = "", *filename, *predicate, *eip = (char *)regs->rip;
950
2
    unsigned long fixup;
951
2
    int id = -1, lineno;
952
2
    const struct virtual_region *region;
953
2
954
2
    if ( debugger_trap_entry(TRAP_invalid_op, regs) )
955
0
        return;
956
2
957
2
    if ( likely(guest_mode(regs)) )
958
0
    {
959
0
        if ( pv_emulate_invalid_op(regs) )
960
0
            pv_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
961
0
        return;
962
0
    }
963
2
964
2
    if ( !is_active_kernel_text(regs->rip) ||
965
1
         __copy_from_user(bug_insn, eip, sizeof(bug_insn)) ||
966
1
         memcmp(bug_insn, "\xf\xb", sizeof(bug_insn)) )
967
1
        goto die;
968
2
969
1
    region = find_text_region(regs->rip);
970
1
    if ( region )
971
1
    {
972
1
        for ( id = 0; id < BUGFRAME_NR; id++ )
973
1
        {
974
1
            const struct bug_frame *b;
975
1
            unsigned int i;
976
1
977
1
            for ( i = 0, b = region->frame[id].bugs;
978
1
                  i < region->frame[id].n_bugs; b++, i++ )
979
1
            {
980
1
                if ( bug_loc(b) == eip )
981
1
                {
982
1
                    bug = b;
983
1
                    goto found;
984
1
                }
985
1
            }
986
1
        }
987
1
    }
988
1
989
1
 found:
990
1
    if ( !bug )
991
0
        goto die;
992
1
    eip += sizeof(bug_insn);
993
1
    if ( id == BUGFRAME_run_fn )
994
1
    {
995
1
        void (*fn)(struct cpu_user_regs *) = bug_ptr(bug);
996
1
997
1
        fn(regs);
998
1
        regs->rip = (unsigned long)eip;
999
1
        return;
1000
1
    }
1001
1
1002
1
    /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
1003
0
    filename = bug_ptr(bug);
1004
0
    if ( !is_kernel(filename) && !is_patch(filename) )
1005
0
        goto die;
1006
0
    fixup = strlen(filename);
1007
0
    if ( fixup > 50 )
1008
0
    {
1009
0
        filename += fixup - 47;
1010
0
        prefix = "...";
1011
0
    }
1012
0
    lineno = bug_line(bug);
1013
0
1014
0
    switch ( id )
1015
0
    {
1016
0
    case BUGFRAME_warn:
1017
0
        printk("Xen WARN at %s%s:%d\n", prefix, filename, lineno);
1018
0
        show_execution_state(regs);
1019
0
        regs->rip = (unsigned long)eip;
1020
0
        return;
1021
0
1022
0
    case BUGFRAME_bug:
1023
0
        printk("Xen BUG at %s%s:%d\n", prefix, filename, lineno);
1024
0
1025
0
        if ( debugger_trap_fatal(TRAP_invalid_op, regs) )
1026
0
            return;
1027
0
1028
0
        show_execution_state(regs);
1029
0
        panic("Xen BUG at %s%s:%d", prefix, filename, lineno);
1030
0
1031
0
    case BUGFRAME_assert:
1032
0
        /* ASSERT: decode the predicate string pointer. */
1033
0
        predicate = bug_msg(bug);
1034
0
        if ( !is_kernel(predicate) && !is_patch(predicate) )
1035
0
            predicate = "<unknown>";
1036
0
1037
0
        printk("Assertion '%s' failed at %s%s:%d\n",
1038
0
               predicate, prefix, filename, lineno);
1039
0
1040
0
        if ( debugger_trap_fatal(TRAP_invalid_op, regs) )
1041
0
            return;
1042
0
1043
0
        show_execution_state(regs);
1044
0
        panic("Assertion '%s' failed at %s%s:%d",
1045
0
              predicate, prefix, filename, lineno);
1046
0
    }
1047
0
1048
1
 die:
1049
1
    if ( (fixup = search_exception_table(regs)) != 0 )
1050
1
    {
1051
1
        this_cpu(last_extable_addr) = regs->rip;
1052
1
        regs->rip = fixup;
1053
1
        return;
1054
1
    }
1055
1
1056
0
    if ( debugger_trap_fatal(TRAP_invalid_op, regs) )
1057
0
        return;
1058
0
1059
0
    show_execution_state(regs);
1060
0
    panic("FATAL TRAP: vector = %d (invalid opcode)", TRAP_invalid_op);
1061
0
}
1062
1063
void do_int3(struct cpu_user_regs *regs)
1064
1
{
1065
1
    if ( debugger_trap_entry(TRAP_int3, regs) )
1066
0
        return;
1067
1
1068
1
    if ( !guest_mode(regs) )
1069
1
    {
1070
1
        unsigned long fixup;
1071
1
1072
1
        if ( (fixup = search_exception_table(regs)) != 0 )
1073
1
        {
1074
1
            this_cpu(last_extable_addr) = regs->rip;
1075
1
            dprintk(XENLOG_DEBUG, "Trap %u: %p [%ps] -> %p\n",
1076
1
                    TRAP_int3, _p(regs->rip), _p(regs->rip), _p(fixup));
1077
1
            regs->rip = fixup;
1078
1
            return;
1079
1
        }
1080
1
1081
0
        if ( !debugger_trap_fatal(TRAP_int3, regs) )
1082
0
            printk(XENLOG_DEBUG "Hit embedded breakpoint at %p [%ps]\n",
1083
0
                   _p(regs->rip), _p(regs->rip));
1084
0
1085
0
        return;
1086
1
    }
1087
1
1088
0
    pv_inject_hw_exception(TRAP_int3, X86_EVENT_NO_EC);
1089
0
}
1090
1091
static void reserved_bit_page_fault(unsigned long addr,
1092
                                    struct cpu_user_regs *regs)
1093
0
{
1094
0
    printk("%pv: reserved bit in page table (ec=%04X)\n",
1095
0
           current, regs->error_code);
1096
0
    show_page_walk(addr);
1097
0
    show_execution_state(regs);
1098
0
}
1099
1100
static int handle_gdt_ldt_mapping_fault(unsigned long offset,
1101
                                        struct cpu_user_regs *regs)
1102
0
{
1103
0
    struct vcpu *curr = current;
1104
0
    /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
1105
0
    unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
1106
0
    unsigned int vcpu_area   = (offset >> GDT_LDT_VCPU_VA_SHIFT);
1107
0
1108
0
    /*
1109
0
     * If the fault is in another vcpu's area, it cannot be due to
1110
0
     * a GDT/LDT descriptor load. Thus we can reasonably exit immediately, and
1111
0
     * indeed we have to since pv_map_ldt_shadow_page() works correctly only on
1112
0
     * accesses to a vcpu's own area.
1113
0
     */
1114
0
    if ( vcpu_area != curr->vcpu_id )
1115
0
        return 0;
1116
0
1117
0
    /* Byte offset within the gdt/ldt sub-area. */
1118
0
    offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
1119
0
1120
0
    if ( likely(is_ldt_area) )
1121
0
    {
1122
0
        /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
1123
0
        if ( likely(pv_map_ldt_shadow_page(offset)) )
1124
0
        {
1125
0
            if ( guest_mode(regs) )
1126
0
                trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
1127
0
                                    regs->rip, offset);
1128
0
        }
1129
0
        else
1130
0
        {
1131
0
            /* In hypervisor mode? Leave it to the #PF handler to fix up. */
1132
0
            if ( !guest_mode(regs) )
1133
0
                return 0;
1134
0
1135
0
            /* Access would have become non-canonical? Pass #GP[sel] back. */
1136
0
            if ( unlikely(!is_canonical_address(
1137
0
                              curr->arch.pv_vcpu.ldt_base + offset)) )
1138
0
            {
1139
0
                uint16_t ec = (offset & ~(X86_XEC_EXT | X86_XEC_IDT)) | X86_XEC_TI;
1140
0
1141
0
                pv_inject_hw_exception(TRAP_gp_fault, ec);
1142
0
            }
1143
0
            else
1144
0
                /* else pass the #PF back, with adjusted %cr2. */
1145
0
                pv_inject_page_fault(regs->error_code,
1146
0
                                     curr->arch.pv_vcpu.ldt_base + offset);
1147
0
        }
1148
0
    }
1149
0
    else
1150
0
    {
1151
0
        /* GDT fault: handle the fault as #GP(selector). */
1152
0
        regs->error_code = offset & ~(X86_XEC_EXT | X86_XEC_IDT | X86_XEC_TI);
1153
0
        (void)do_general_protection(regs);
1154
0
    }
1155
0
1156
0
    return EXCRET_fault_fixed;
1157
0
}
1158
1159
#define IN_HYPERVISOR_RANGE(va) \
1160
    (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
1161
1162
enum pf_type {
1163
    real_fault,
1164
    smep_fault,
1165
    smap_fault,
1166
    spurious_fault
1167
};
1168
1169
static enum pf_type __page_fault_type(unsigned long addr,
1170
                                      const struct cpu_user_regs *regs)
1171
0
{
1172
0
    unsigned long mfn, cr3 = read_cr3();
1173
0
    l4_pgentry_t l4e, *l4t;
1174
0
    l3_pgentry_t l3e, *l3t;
1175
0
    l2_pgentry_t l2e, *l2t;
1176
0
    l1_pgentry_t l1e, *l1t;
1177
0
    unsigned int required_flags, disallowed_flags, page_user;
1178
0
    unsigned int error_code = regs->error_code;
1179
0
1180
0
    /*
1181
0
     * We do not take spurious page faults in IRQ handlers as we do not
1182
0
     * modify page tables in IRQ context. We therefore bail here because
1183
0
     * map_domain_page() is not IRQ-safe.
1184
0
     */
1185
0
    if ( in_irq() )
1186
0
        return real_fault;
1187
0
1188
0
    /* Reserved bit violations are never spurious faults. */
1189
0
    if ( error_code & PFEC_reserved_bit )
1190
0
        return real_fault;
1191
0
1192
0
    required_flags  = _PAGE_PRESENT;
1193
0
    if ( error_code & PFEC_write_access )
1194
0
        required_flags |= _PAGE_RW;
1195
0
    if ( error_code & PFEC_user_mode )
1196
0
        required_flags |= _PAGE_USER;
1197
0
1198
0
    disallowed_flags = 0;
1199
0
    if ( error_code & PFEC_insn_fetch )
1200
0
        disallowed_flags |= _PAGE_NX_BIT;
1201
0
1202
0
    page_user = _PAGE_USER;
1203
0
1204
0
    mfn = cr3 >> PAGE_SHIFT;
1205
0
1206
0
    l4t = map_domain_page(_mfn(mfn));
1207
0
    l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
1208
0
    mfn = l4e_get_pfn(l4e);
1209
0
    unmap_domain_page(l4t);
1210
0
    if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
1211
0
         (l4e_get_flags(l4e) & disallowed_flags) )
1212
0
        return real_fault;
1213
0
    page_user &= l4e_get_flags(l4e);
1214
0
1215
0
    l3t  = map_domain_page(_mfn(mfn));
1216
0
    l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
1217
0
    mfn = l3e_get_pfn(l3e);
1218
0
    unmap_domain_page(l3t);
1219
0
    if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
1220
0
         (l3e_get_flags(l3e) & disallowed_flags) )
1221
0
        return real_fault;
1222
0
    page_user &= l3e_get_flags(l3e);
1223
0
    if ( l3e_get_flags(l3e) & _PAGE_PSE )
1224
0
        goto leaf;
1225
0
1226
0
    l2t = map_domain_page(_mfn(mfn));
1227
0
    l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
1228
0
    mfn = l2e_get_pfn(l2e);
1229
0
    unmap_domain_page(l2t);
1230
0
    if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
1231
0
         (l2e_get_flags(l2e) & disallowed_flags) )
1232
0
        return real_fault;
1233
0
    page_user &= l2e_get_flags(l2e);
1234
0
    if ( l2e_get_flags(l2e) & _PAGE_PSE )
1235
0
        goto leaf;
1236
0
1237
0
    l1t = map_domain_page(_mfn(mfn));
1238
0
    l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
1239
0
    mfn = l1e_get_pfn(l1e);
1240
0
    unmap_domain_page(l1t);
1241
0
    if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
1242
0
         (l1e_get_flags(l1e) & disallowed_flags) )
1243
0
        return real_fault;
1244
0
    page_user &= l1e_get_flags(l1e);
1245
0
1246
0
leaf:
1247
0
    if ( page_user )
1248
0
    {
1249
0
        unsigned long cr4 = read_cr4();
1250
0
        /*
1251
0
         * Supervisor Mode Execution Prevention (SMEP):
1252
0
         * Disallow supervisor execution from user-accessible mappings
1253
0
         */
1254
0
        if ( (cr4 & X86_CR4_SMEP) &&
1255
0
             ((error_code & (PFEC_insn_fetch|PFEC_user_mode)) == PFEC_insn_fetch) )
1256
0
            return smep_fault;
1257
0
1258
0
        /*
1259
0
         * Supervisor Mode Access Prevention (SMAP):
1260
0
         * Disallow supervisor access to user-accessible mappings
1261
0
         * A fault is considered as an SMAP violation if the following
1262
0
         * conditions are true:
1263
0
         *   - X86_CR4_SMAP is set in CR4
1264
0
         *   - A user page is being accessed
1265
0
         *   - CPL=3 or X86_EFLAGS_AC is clear
1266
0
         *   - Page fault in kernel mode
1267
0
         */
1268
0
        if ( (cr4 & X86_CR4_SMAP) && !(error_code & PFEC_user_mode) &&
1269
0
             (((regs->cs & 3) == 3) || !(regs->eflags & X86_EFLAGS_AC)) )
1270
0
            return smap_fault;
1271
0
    }
1272
0
1273
0
    return spurious_fault;
1274
0
}
1275
1276
static enum pf_type spurious_page_fault(unsigned long addr,
1277
                                        const struct cpu_user_regs *regs)
1278
0
{
1279
0
    unsigned long flags;
1280
0
    enum pf_type pf_type;
1281
0
1282
0
    /*
1283
0
     * Disabling interrupts prevents TLB flushing, and hence prevents
1284
0
     * page tables from becoming invalid under our feet during the walk.
1285
0
     */
1286
0
    local_irq_save(flags);
1287
0
    pf_type = __page_fault_type(addr, regs);
1288
0
    local_irq_restore(flags);
1289
0
1290
0
    return pf_type;
1291
0
}
1292
1293
static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1294
0
{
1295
0
    struct vcpu   *v = current;
1296
0
    struct domain *d = v->domain;
1297
0
1298
0
    /* No fixups in interrupt context or when interrupts are disabled. */
1299
0
    if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1300
0
        return 0;
1301
0
1302
0
    if ( !(regs->error_code & PFEC_page_present) &&
1303
0
          (pagefault_by_memadd(addr, regs)) )
1304
0
        return handle_memadd_fault(addr, regs);
1305
0
1306
0
    if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1307
0
    {
1308
0
        if ( !(regs->error_code & (PFEC_user_mode | PFEC_reserved_bit)) &&
1309
0
             (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1310
0
            return handle_gdt_ldt_mapping_fault(
1311
0
                addr - GDT_LDT_VIRT_START, regs);
1312
0
        return 0;
1313
0
    }
1314
0
1315
0
    if ( guest_kernel_mode(v, regs) &&
1316
0
         !(regs->error_code & (PFEC_reserved_bit | PFEC_insn_fetch)) &&
1317
0
         (regs->error_code & PFEC_write_access) )
1318
0
    {
1319
0
        bool ptwr, mmio_ro;
1320
0
1321
0
        ptwr = VM_ASSIST(d, writable_pagetables) &&
1322
0
               /* Do not check if access-protection fault since the page may
1323
0
                  legitimately be not present in shadow page tables */
1324
0
               (paging_mode_enabled(d) ||
1325
0
                (regs->error_code & PFEC_page_present));
1326
0
1327
0
        mmio_ro = is_hardware_domain(d) &&
1328
0
                  (regs->error_code & PFEC_page_present);
1329
0
1330
0
        if ( (ptwr || mmio_ro) && pv_ro_page_fault(addr, regs) )
1331
0
            return EXCRET_fault_fixed;
1332
0
    }
1333
0
1334
0
    /*
1335
0
     * For non-external shadowed guests, we fix up both their own pagefaults
1336
0
     * and Xen's, since they share the pagetables.  This includes hypervisor
1337
0
     * faults, e.g. from copy_to_user().
1338
0
     */
1339
0
    if ( paging_mode_enabled(d) && !paging_mode_external(d) )
1340
0
    {
1341
0
        int ret;
1342
0
1343
0
        /* Logdirty mode is the only expected paging mode for PV guests. */
1344
0
        ASSERT(paging_mode_only_log_dirty(d));
1345
0
1346
0
        ret = paging_fault(addr, regs);
1347
0
        if ( ret == EXCRET_fault_fixed )
1348
0
            trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->rip, addr);
1349
0
        return ret;
1350
0
    }
1351
0
1352
0
    return 0;
1353
0
}
1354
1355
/*
1356
 * #PF error code:
1357
 *  Bit 0: Protection violation (=1) ; Page not present (=0)
1358
 *  Bit 1: Write access
1359
 *  Bit 2: User mode (=1) ; Supervisor mode (=0)
1360
 *  Bit 3: Reserved bit violation
1361
 *  Bit 4: Instruction fetch
1362
 */
1363
void do_page_fault(struct cpu_user_regs *regs)
1364
0
{
1365
0
    unsigned long addr, fixup;
1366
0
    unsigned int error_code;
1367
0
    enum pf_type pf_type;
1368
0
1369
0
    addr = read_cr2();
1370
0
1371
0
    /* fixup_page_fault() might change regs->error_code, so cache it here. */
1372
0
    error_code = regs->error_code;
1373
0
1374
0
    if ( debugger_trap_entry(TRAP_page_fault, regs) )
1375
0
        return;
1376
0
1377
0
    perfc_incr(page_faults);
1378
0
1379
0
    if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1380
0
        return;
1381
0
1382
0
    if ( unlikely(!guest_mode(regs)) )
1383
0
    {
1384
0
        pf_type = spurious_page_fault(addr, regs);
1385
0
        if ( (pf_type == smep_fault) || (pf_type == smap_fault) )
1386
0
        {
1387
0
            console_start_sync();
1388
0
            printk("Xen SM%cP violation\n",
1389
0
                   (pf_type == smep_fault) ? 'E' : 'A');
1390
0
            fatal_trap(regs, 0);
1391
0
        }
1392
0
1393
0
        if ( pf_type != real_fault )
1394
0
            return;
1395
0
1396
0
        if ( likely((fixup = search_exception_table(regs)) != 0) )
1397
0
        {
1398
0
            perfc_incr(copy_user_faults);
1399
0
            if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1400
0
                reserved_bit_page_fault(addr, regs);
1401
0
            this_cpu(last_extable_addr) = regs->rip;
1402
0
            regs->rip = fixup;
1403
0
            return;
1404
0
        }
1405
0
1406
0
        if ( debugger_trap_fatal(TRAP_page_fault, regs) )
1407
0
            return;
1408
0
1409
0
        show_execution_state(regs);
1410
0
        show_page_walk(addr);
1411
0
        panic("FATAL PAGE FAULT\n"
1412
0
              "[error_code=%04x]\n"
1413
0
              "Faulting linear address: %p",
1414
0
              error_code, _p(addr));
1415
0
    }
1416
0
1417
0
    if ( unlikely(current->domain->arch.suppress_spurious_page_faults) )
1418
0
    {
1419
0
        pf_type = spurious_page_fault(addr, regs);
1420
0
        if ( (pf_type == smep_fault) || (pf_type == smap_fault))
1421
0
        {
1422
0
            printk(XENLOG_G_ERR "%pv fatal SM%cP violation\n",
1423
0
                   current, (pf_type == smep_fault) ? 'E' : 'A');
1424
0
1425
0
            domain_crash(current->domain);
1426
0
        }
1427
0
        if ( pf_type != real_fault )
1428
0
            return;
1429
0
    }
1430
0
1431
0
    if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1432
0
        reserved_bit_page_fault(addr, regs);
1433
0
1434
0
    pv_inject_page_fault(regs->error_code, addr);
1435
0
}
1436
1437
/*
1438
 * Early #PF handler to print CR2, error code, and stack.
1439
 *
1440
 * We also deal with spurious faults here, even though they should never happen
1441
 * during early boot (an issue was seen once, but was most likely a hardware
1442
 * problem).
1443
 */
1444
void __init do_early_page_fault(struct cpu_user_regs *regs)
1445
0
{
1446
0
    static unsigned int __initdata stuck;
1447
0
    static unsigned long __initdata prev_eip, prev_cr2;
1448
0
    unsigned long cr2 = read_cr2();
1449
0
1450
0
    BUG_ON(smp_processor_id() != 0);
1451
0
1452
0
    if ( (regs->rip != prev_eip) || (cr2 != prev_cr2) )
1453
0
    {
1454
0
        prev_eip = regs->rip;
1455
0
        prev_cr2 = cr2;
1456
0
        stuck    = 0;
1457
0
        return;
1458
0
    }
1459
0
1460
0
    if ( stuck++ == 1000 )
1461
0
    {
1462
0
        console_start_sync();
1463
0
        printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1464
0
               regs->cs, _p(regs->rip), _p(cr2), regs->error_code);
1465
0
        fatal_trap(regs, 0);
1466
0
    }
1467
0
}
1468
1469
void do_general_protection(struct cpu_user_regs *regs)
1470
1
{
1471
1
    struct vcpu *v = current;
1472
1
    unsigned long fixup;
1473
1
1474
1
    if ( debugger_trap_entry(TRAP_gp_fault, regs) )
1475
0
        return;
1476
1
1477
1
    if ( regs->error_code & X86_XEC_EXT )
1478
0
        goto hardware_gp;
1479
1
1480
1
    if ( !guest_mode(regs) )
1481
1
        goto gp_in_kernel;
1482
1
1483
1
    /*
1484
1
     * Cunning trick to allow arbitrary "INT n" handling.
1485
1
     *
1486
1
     * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1487
1
     * instruction from trapping to the appropriate vector, when that might not
1488
1
     * be expected by Xen or the guest OS. For example, that entry might be for
1489
1
     * a fault handler (unlike traps, faults don't increment EIP), or might
1490
1
     * expect an error code on the stack (which a software trap never
1491
1
     * provides), or might be a hardware interrupt handler that doesn't like
1492
1
     * being called spuriously.
1493
1
     *
1494
1
     * Instead, a GPF occurs with the faulting IDT vector in the error code.
1495
1
     * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1496
1
     * clear (which was already checked above) to indicate that it's a software
1497
1
     * fault, not a hardware one.
1498
1
     *
1499
1
     * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1500
1
     * okay because they can only be triggered by an explicit DPL-checked
1501
1
     * instruction. The DPL specified by the guest OS for these vectors is NOT
1502
1
     * CHECKED!!
1503
1
     */
1504
0
    if ( regs->error_code & X86_XEC_IDT )
1505
0
    {
1506
0
        /* This fault must be due to <INT n> instruction. */
1507
0
        const struct trap_info *ti;
1508
0
        unsigned char vector = regs->error_code >> 3;
1509
0
        ti = &v->arch.pv_vcpu.trap_ctxt[vector];
1510
0
        if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1511
0
        {
1512
0
            regs->rip += 2;
1513
0
            pv_inject_sw_interrupt(vector);
1514
0
            return;
1515
0
        }
1516
0
    }
1517
0
    else if ( is_pv_32bit_vcpu(v) && regs->error_code )
1518
0
    {
1519
0
        pv_emulate_gate_op(regs);
1520
0
        return;
1521
0
    }
1522
0
1523
0
    /* Emulate some simple privileged and I/O instructions. */
1524
0
    if ( (regs->error_code == 0) &&
1525
0
         pv_emulate_privileged_op(regs) )
1526
0
    {
1527
0
        trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->rip);
1528
0
        return;
1529
0
    }
1530
0
1531
0
    /* Pass on GPF as is. */
1532
0
    pv_inject_hw_exception(TRAP_gp_fault, regs->error_code);
1533
0
    return;
1534
0
1535
1
 gp_in_kernel:
1536
1
1537
1
    if ( likely((fixup = search_exception_table(regs)) != 0) )
1538
1
    {
1539
1
        dprintk(XENLOG_INFO, "GPF (%04x): %p [%ps] -> %p\n",
1540
1
                regs->error_code, _p(regs->rip), _p(regs->rip), _p(fixup));
1541
1
        this_cpu(last_extable_addr) = regs->rip;
1542
1
        regs->rip = fixup;
1543
1
        return;
1544
1
    }
1545
1
1546
0
 hardware_gp:
1547
0
    if ( debugger_trap_fatal(TRAP_gp_fault, regs) )
1548
0
        return;
1549
0
1550
0
    show_execution_state(regs);
1551
0
    panic("GENERAL PROTECTION FAULT\n[error_code=%04x]", regs->error_code);
1552
0
}
1553
1554
static void pci_serr_softirq(void)
1555
0
{
1556
0
    printk("\n\nNMI - PCI system error (SERR)\n");
1557
0
    outb(inb(0x61) & 0x0b, 0x61); /* re-enable the PCI SERR error line. */
1558
0
}
1559
1560
void async_exception_cleanup(struct vcpu *curr)
1561
0
{
1562
0
    int trap;
1563
0
1564
0
    if ( !curr->async_exception_mask )
1565
0
        return;
1566
0
1567
0
    /* Restore affinity.  */
1568
0
    if ( !cpumask_empty(curr->cpu_hard_affinity_tmp) &&
1569
0
         !cpumask_equal(curr->cpu_hard_affinity_tmp, curr->cpu_hard_affinity) )
1570
0
    {
1571
0
        vcpu_set_hard_affinity(curr, curr->cpu_hard_affinity_tmp);
1572
0
        cpumask_clear(curr->cpu_hard_affinity_tmp);
1573
0
    }
1574
0
1575
0
    if ( !(curr->async_exception_mask & (curr->async_exception_mask - 1)) )
1576
0
        trap = __scanbit(curr->async_exception_mask, VCPU_TRAP_NONE);
1577
0
    else
1578
0
        for ( trap = VCPU_TRAP_NONE + 1; trap <= VCPU_TRAP_LAST; ++trap )
1579
0
            if ( (curr->async_exception_mask ^
1580
0
                  curr->async_exception_state(trap).old_mask) == (1 << trap) )
1581
0
                break;
1582
0
    if ( unlikely(trap > VCPU_TRAP_LAST) )
1583
0
    {
1584
0
        ASSERT_UNREACHABLE();
1585
0
        return;
1586
0
    }
1587
0
1588
0
    /* Restore previous asynchronous exception mask. */
1589
0
    curr->async_exception_mask = curr->async_exception_state(trap).old_mask;
1590
0
}
1591
1592
static void nmi_hwdom_report(unsigned int reason_idx)
1593
0
{
1594
0
    struct domain *d = hardware_domain;
1595
0
1596
0
    if ( !d || !d->vcpu || !d->vcpu[0] || !is_pv_domain(d) /* PVH fixme */ )
1597
0
        return;
1598
0
1599
0
    set_bit(reason_idx, nmi_reason(d));
1600
0
1601
0
    pv_raise_interrupt(d->vcpu[0], TRAP_nmi);
1602
0
}
1603
1604
static void pci_serr_error(const struct cpu_user_regs *regs)
1605
0
{
1606
0
    outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable the PCI SERR error line. */
1607
0
1608
0
    switch ( opt_nmi[0] )
1609
0
    {
1610
0
    case 'd': /* 'dom0' */
1611
0
        nmi_hwdom_report(_XEN_NMIREASON_pci_serr);
1612
0
        /* fallthrough */
1613
0
    case 'i': /* 'ignore' */
1614
0
        /* Would like to print a diagnostic here but can't call printk()
1615
0
           from NMI context -- raise a softirq instead. */
1616
0
        raise_softirq(PCI_SERR_SOFTIRQ);
1617
0
        break;
1618
0
    default:  /* 'fatal' */
1619
0
        console_force_unlock();
1620
0
        printk("\n\nNMI - PCI system error (SERR)\n");
1621
0
        fatal_trap(regs, 0);
1622
0
    }
1623
0
}
1624
1625
static void io_check_error(const struct cpu_user_regs *regs)
1626
0
{
1627
0
    switch ( opt_nmi[0] )
1628
0
    {
1629
0
    case 'd': /* 'dom0' */
1630
0
        nmi_hwdom_report(_XEN_NMIREASON_io_error); /* fallthrough */
1631
0
    case 'i': /* 'ignore' */
1632
0
        break;
1633
0
    default:  /* 'fatal' */
1634
0
        console_force_unlock();
1635
0
        printk("\n\nNMI - I/O ERROR\n");
1636
0
        fatal_trap(regs, 0);
1637
0
    }
1638
0
1639
0
    outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1640
0
    mdelay(1);
1641
0
    outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
1642
0
}
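
Both of the handlers above, and pci_serr_softirq() further up, manipulate system control port B (0x61): bit 7 reports a pending SERR, bit 6 a pending I/O channel check, and setting write bits 2 and 3 clears and masks those sources respectively. A hedged sketch of the same clear/re-enable sequence, assuming Xen context (inb()/outb() from asm/io.h) and using illustrative macro names that are not defined in this file:

#define SYSCTL_PORT_B        0x61
#define PORT_B_SERR_STATUS   0x80  /* read: PCI SERR pending (tested in do_nmi() below) */
#define PORT_B_IOCHK_STATUS  0x40  /* read: I/O channel check pending */
#define PORT_B_IOCHK_DISABLE 0x08  /* write: set to clear and mask IOCHK */
#define PORT_B_SERR_DISABLE  0x04  /* write: set to clear and mask SERR */

static void serr_ack_then_reenable(void)
{
    /* Clear the latch and mask the line, as pci_serr_error() does ... */
    outb((inb(SYSCTL_PORT_B) & 0x0f) | PORT_B_SERR_DISABLE, SYSCTL_PORT_B);

    /* ... and unmask it again later, as pci_serr_softirq() does. */
    outb(inb(SYSCTL_PORT_B) & 0x0f & ~PORT_B_SERR_DISABLE, SYSCTL_PORT_B);
}
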
1643
1644
static void unknown_nmi_error(const struct cpu_user_regs *regs,
1645
                              unsigned char reason)
1646
0
{
1647
0
    switch ( opt_nmi[0] )
1648
0
    {
1649
0
    case 'd': /* 'dom0' */
1650
0
        nmi_hwdom_report(_XEN_NMIREASON_unknown); /* fallthrough */
1651
0
    case 'i': /* 'ignore' */
1652
0
        break;
1653
0
    default:  /* 'fatal' */
1654
0
        console_force_unlock();
1655
0
        printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
1656
0
        printk("Do you have a strange power saving mode enabled?\n");
1657
0
        fatal_trap(regs, 0);
1658
0
    }
1659
0
}
1660
1661
static int dummy_nmi_callback(const struct cpu_user_regs *regs, int cpu)
1662
0
{
1663
0
    return 0;
1664
0
}
1665
1666
static nmi_callback_t *nmi_callback = dummy_nmi_callback;
1667
1668
void do_nmi(const struct cpu_user_regs *regs)
1669
0
{
1670
0
    unsigned int cpu = smp_processor_id();
1671
0
    unsigned char reason;
1672
0
    bool handle_unknown = false;
1673
0
1674
0
    ++nmi_count(cpu);
1675
0
1676
0
    if ( nmi_callback(regs, cpu) )
1677
0
        return;
1678
0
1679
0
    if ( (nmi_watchdog == NMI_NONE) ||
1680
0
         (!nmi_watchdog_tick(regs) && watchdog_force) )
1681
0
        handle_unknown = true;
1682
0
1683
0
    /* Only the BSP gets external NMIs from the system. */
1684
0
    if ( cpu == 0 )
1685
0
    {
1686
0
        reason = inb(0x61);
1687
0
        if ( reason & 0x80 )
1688
0
            pci_serr_error(regs);
1689
0
        if ( reason & 0x40 )
1690
0
            io_check_error(regs);
1691
0
        if ( !(reason & 0xc0) && handle_unknown )
1692
0
            unknown_nmi_error(regs, reason);
1693
0
    }
1694
0
}
1695
1696
nmi_callback_t *set_nmi_callback(nmi_callback_t *callback)
1697
2
{
1698
2
    nmi_callback_t *old_nmi_callback = nmi_callback;
1699
2
1700
2
    nmi_callback = callback;
1701
2
1702
2
    return old_nmi_callback;
1703
2
}
1704
1705
void unset_nmi_callback(void)
1706
0
{
1707
0
    nmi_callback = dummy_nmi_callback;
1708
0
}
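
set_nmi_callback() and unset_nmi_callback() are the hooks other Xen subsystems use to intercept NMIs ahead of the default processing in do_nmi(); a non-zero return from the callback short-circuits that handler entirely. A hedged sketch of a caller, assuming only the nmi_callback_t signature visible in dummy_nmi_callback() above (the foo_* names are hypothetical):

static int foo_nmi_handler(const struct cpu_user_regs *regs, int cpu)
{
    if ( !foo_nmi_is_ours(cpu) )        /* hypothetical ownership test */
        return 0;                       /* let do_nmi() carry on as usual */

    foo_handle_nmi(cpu);                /* hypothetical work */
    return 1;                           /* NMI fully consumed here */
}

static void foo_start(void)
{
    set_nmi_callback(foo_nmi_handler);  /* returns the previous callback, ignored here */
}

static void foo_stop(void)
{
    unset_nmi_callback();               /* back to dummy_nmi_callback */
}
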
1709
1710
void do_device_not_available(struct cpu_user_regs *regs)
1711
0
{
1712
0
    struct vcpu *curr = current;
1713
0
1714
0
    BUG_ON(!guest_mode(regs));
1715
0
1716
0
    vcpu_restore_fpu_lazy(curr);
1717
0
1718
0
    if ( curr->arch.pv_vcpu.ctrlreg[0] & X86_CR0_TS )
1719
0
    {
1720
0
        pv_inject_hw_exception(TRAP_no_device, X86_EVENT_NO_EC);
1721
0
        curr->arch.pv_vcpu.ctrlreg[0] &= ~X86_CR0_TS;
1722
0
    }
1723
0
    else
1724
0
        TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
1725
0
1726
0
    return;
1727
0
}
1728
1729
u64 read_efer(void)
1730
37.0k
{
1731
37.0k
    return this_cpu(efer);
1732
37.0k
}
1733
1734
void write_efer(u64 val)
1735
60
{
1736
60
    this_cpu(efer) = val;
1737
60
    wrmsrl(MSR_EFER, val);
1738
60
}
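
read_efer() serves the value from the per-CPU cache, while write_efer() updates that cache and MSR_EFER together, so read-modify-write sequences never need an MSR read. A hedged sketch of typical usage, assuming Xen context and the usual EFER_SCE (syscall enable) bit definition:

static void efer_enable_syscall(void)
{
    uint64_t efer = read_efer();        /* cheap: per-CPU cache, no rdmsr */

    if ( !(efer & EFER_SCE) )
        write_efer(efer | EFER_SCE);    /* cache and MSR_EFER stay in sync */
}
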
1739
1740
static void ler_enable(void)
1741
0
{
1742
0
    u64 debugctl;
1743
0
1744
0
    if ( !this_cpu(ler_msr) )
1745
0
        return;
1746
0
1747
0
    rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
1748
0
    wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | IA32_DEBUGCTLMSR_LBR);
1749
0
}
1750
1751
void do_debug(struct cpu_user_regs *regs)
1752
0
{
1753
0
    struct vcpu *v = current;
1754
0
1755
0
    if ( debugger_trap_entry(TRAP_debug, regs) )
1756
0
        return;
1757
0
1758
0
    if ( !guest_mode(regs) )
1759
0
    {
1760
0
        if ( regs->eflags & X86_EFLAGS_TF )
1761
0
        {
1762
0
            /* In the SYSENTER entry path we can't zap TF until EFLAGS is saved. */
1763
0
            if ( (regs->rip >= (unsigned long)sysenter_entry) &&
1764
0
                 (regs->rip <= (unsigned long)sysenter_eflags_saved) )
1765
0
            {
1766
0
                if ( regs->rip == (unsigned long)sysenter_eflags_saved )
1767
0
                    regs->eflags &= ~X86_EFLAGS_TF;
1768
0
                goto out;
1769
0
            }
1770
0
            if ( !debugger_trap_fatal(TRAP_debug, regs) )
1771
0
            {
1772
0
                WARN();
1773
0
                regs->eflags &= ~X86_EFLAGS_TF;
1774
0
            }
1775
0
        }
1776
0
        else
1777
0
        {
1778
0
            /*
1779
0
             * We ignore watchpoints when they trigger within Xen. This may
1780
0
             * happen when a buffer is passed to us which previously had a
1781
0
             * watchpoint set on it. No need to bump EIP; the only faulting
1782
0
             * trap is an instruction breakpoint, which can't happen to us.
1783
0
             */
1784
0
            WARN_ON(!search_exception_table(regs));
1785
0
        }
1786
0
        goto out;
1787
0
    }
1788
0
1789
0
    /* Save the debug status register where the guest OS can peek at it. */
1790
0
    v->arch.debugreg[6] = read_debugreg(6);
1791
0
1792
0
    ler_enable();
1793
0
    pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
1794
0
    return;
1795
0
1796
0
 out:
1797
0
    ler_enable();
1798
0
    return;
1799
0
}
1800
1801
static void __init noinline __set_intr_gate(unsigned int n,
1802
                                            uint32_t dpl, void *addr)
1803
255
{
1804
255
    _set_gate(&idt_table[n], SYS_DESC_irq_gate, dpl, addr);
1805
255
}
1806
1807
static void __init set_swint_gate(unsigned int n, void *addr)
1808
2
{
1809
2
    __set_intr_gate(n, 3, addr);
1810
2
}
1811
1812
static void __init set_intr_gate(unsigned int n, void *addr)
1813
253
{
1814
253
    __set_intr_gate(n, 0, addr);
1815
253
}
1816
1817
void load_TR(void)
1818
0
{
1819
0
    struct tss_struct *tss = &this_cpu(init_tss);
1820
0
    struct desc_ptr old_gdt, tss_gdt = {
1821
0
        .base = (long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY),
1822
0
        .limit = LAST_RESERVED_GDT_BYTE
1823
0
    };
1824
0
1825
0
    _set_tssldt_desc(
1826
0
        this_cpu(gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
1827
0
        (unsigned long)tss,
1828
0
        offsetof(struct tss_struct, __cacheline_filler) - 1,
1829
0
        SYS_DESC_tss_avail);
1830
0
    _set_tssldt_desc(
1831
0
        this_cpu(compat_gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
1832
0
        (unsigned long)tss,
1833
0
        offsetof(struct tss_struct, __cacheline_filler) - 1,
1834
0
        SYS_DESC_tss_busy);
1835
0
1836
0
    /* Switch to non-compat GDT (which has B bit clear) to execute LTR. */
1837
0
    asm volatile (
1838
0
        "sgdt %0; lgdt %2; ltr %w1; lgdt %0"
1839
0
        : "=m" (old_gdt) : "rm" (TSS_ENTRY << 3), "m" (tss_gdt) : "memory" );
1840
0
}
1841
1842
void percpu_traps_init(void)
1843
13
{
1844
13
    subarch_percpu_traps_init();
1845
13
1846
13
    if ( !opt_ler )
1847
13
        return;
1848
13
1849
0
    switch ( boot_cpu_data.x86_vendor )
1850
0
    {
1851
0
    case X86_VENDOR_INTEL:
1852
0
        switch ( boot_cpu_data.x86 )
1853
0
        {
1854
0
        case 6:
1855
0
            this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
1856
0
            break;
1857
0
        case 15:
1858
0
            this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
1859
0
            break;
1860
0
        }
1861
0
        break;
1862
0
    case X86_VENDOR_AMD:
1863
0
        switch ( boot_cpu_data.x86 )
1864
0
        {
1865
0
        case 6:
1866
0
        case 0xf ... 0x17:
1867
0
            this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
1868
0
            break;
1869
0
        }
1870
0
        break;
1871
0
    }
1872
0
1873
0
    ler_enable();
1874
0
}
1875
1876
void __init init_idt_traps(void)
1877
1
{
1878
1
    /*
1879
1
     * Note that interrupt gates are always used, rather than trap gates. We
1880
1
     * must have interrupts disabled until DS/ES/FS/GS are saved because the
1881
1
     * first activation must have the "bad" value(s) for these registers and
1882
1
     * we may lose them if another activation is installed before they are
1883
1
     * saved. The page-fault handler also needs interrupts disabled until %cr2
1884
1
     * has been read and saved on the stack.
1885
1
     */
1886
1
    set_intr_gate(TRAP_divide_error,&divide_error);
1887
1
    set_intr_gate(TRAP_debug,&debug);
1888
1
    set_intr_gate(TRAP_nmi,&nmi);
1889
1
    set_swint_gate(TRAP_int3,&int3);         /* usable from all privileges */
1890
1
    set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */
1891
1
    set_intr_gate(TRAP_bounds,&bounds);
1892
1
    set_intr_gate(TRAP_invalid_op,&invalid_op);
1893
1
    set_intr_gate(TRAP_no_device,&device_not_available);
1894
1
    set_intr_gate(TRAP_double_fault,&double_fault);
1895
1
    set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
1896
1
    set_intr_gate(TRAP_no_segment,&segment_not_present);
1897
1
    set_intr_gate(TRAP_stack_error,&stack_segment);
1898
1
    set_intr_gate(TRAP_gp_fault,&general_protection);
1899
1
    set_intr_gate(TRAP_page_fault,&early_page_fault);
1900
1
    set_intr_gate(TRAP_copro_error,&coprocessor_error);
1901
1
    set_intr_gate(TRAP_alignment_check,&alignment_check);
1902
1
    set_intr_gate(TRAP_machine_check,&machine_check);
1903
1
    set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
1904
1
1905
1
    /* Specify dedicated interrupt stacks for NMI, #DF, and #MC. */
1906
1
    set_ist(&idt_table[TRAP_double_fault],  IST_DF);
1907
1
    set_ist(&idt_table[TRAP_nmi],           IST_NMI);
1908
1
    set_ist(&idt_table[TRAP_machine_check], IST_MCE);
1909
1
1910
1
    /* CPU0 uses the master IDT. */
1911
1
    idt_tables[0] = idt_table;
1912
1
1913
1
    this_cpu(gdt_table) = boot_cpu_gdt_table;
1914
1
    this_cpu(compat_gdt_table) = boot_cpu_compat_gdt_table;
1915
1
}
1916
1917
extern void (*const autogen_entrypoints[NR_VECTORS])(void);
1918
void __init trap_init(void)
1919
1
{
1920
1
    unsigned int vector;
1921
1
1922
1
    /* Replace early pagefault with real pagefault handler. */
1923
1
    set_intr_gate(TRAP_page_fault, &page_fault);
1924
1
1925
1
    pv_trap_init();
1926
1
1927
257
    for ( vector = 0; vector < NR_VECTORS; ++vector )
1928
256
    {
1929
256
        if ( autogen_entrypoints[vector] )
1930
236
        {
1931
236
            /* Found autogen entry: check we won't clobber an existing trap. */
1932
236
            ASSERT(idt_table[vector].b == 0);
1933
236
            set_intr_gate(vector, autogen_entrypoints[vector]);
1934
236
        }
1935
256
        else
1936
20
        {
1937
20
            /* No entry point: confirm we have an existing trap in place. */
1938
20
            ASSERT(idt_table[vector].b != 0);
1939
20
        }
1940
256
    }
1941
1
1942
1
    percpu_traps_init();
1943
1
1944
1
    cpu_init();
1945
1
1946
1
    open_softirq(PCI_SERR_SOFTIRQ, pci_serr_softirq);
1947
1
}
1948
1949
void activate_debugregs(const struct vcpu *curr)
1950
0
{
1951
0
    ASSERT(curr == current);
1952
0
1953
0
    write_debugreg(0, curr->arch.debugreg[0]);
1954
0
    write_debugreg(1, curr->arch.debugreg[1]);
1955
0
    write_debugreg(2, curr->arch.debugreg[2]);
1956
0
    write_debugreg(3, curr->arch.debugreg[3]);
1957
0
    write_debugreg(6, curr->arch.debugreg[6]);
1958
0
1959
0
    /*
1960
0
     * Avoid writing a DR7 value here that is about to be replaced when we
1961
0
     * are called from set_debugreg() below. Eventual future callers will
1962
0
     * need to take this into account.
1963
0
     */
1964
0
    if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
1965
0
        write_debugreg(7, curr->arch.debugreg[7]);
1966
0
1967
0
    if ( boot_cpu_has(X86_FEATURE_DBEXT) )
1968
0
    {
1969
0
        wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[0]);
1970
0
        wrmsrl(MSR_AMD64_DR1_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[1]);
1971
0
        wrmsrl(MSR_AMD64_DR2_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[2]);
1972
0
        wrmsrl(MSR_AMD64_DR3_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[3]);
1973
0
    }
1974
0
}
1975
1976
long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
1977
0
{
1978
0
    int i;
1979
0
    struct vcpu *curr = current;
1980
0
1981
0
    switch ( reg )
1982
0
    {
1983
0
    case 0:
1984
0
        if ( !access_ok(value, sizeof(long)) )
1985
0
            return -EPERM;
1986
0
        if ( v == curr )
1987
0
            write_debugreg(0, value);
1988
0
        break;
1989
0
    case 1:
1990
0
        if ( !access_ok(value, sizeof(long)) )
1991
0
            return -EPERM;
1992
0
        if ( v == curr )
1993
0
            write_debugreg(1, value);
1994
0
        break;
1995
0
    case 2:
1996
0
        if ( !access_ok(value, sizeof(long)) )
1997
0
            return -EPERM;
1998
0
        if ( v == curr )
1999
0
            write_debugreg(2, value);
2000
0
        break;
2001
0
    case 3:
2002
0
        if ( !access_ok(value, sizeof(long)) )
2003
0
            return -EPERM;
2004
0
        if ( v == curr )
2005
0
            write_debugreg(3, value);
2006
0
        break;
2007
0
    case 6:
2008
0
        /*
2009
0
         * DR6: Bits 4-11,16-31 reserved (set to 1).
2010
0
         *      Bit 12 reserved (set to 0).
2011
0
         */
2012
0
        value &= ~DR_STATUS_RESERVED_ZERO; /* reserved bits => 0 */
2013
0
        value |=  DR_STATUS_RESERVED_ONE;  /* reserved bits => 1 */
2014
0
        if ( v == curr )
2015
0
            write_debugreg(6, value);
2016
0
        break;
2017
0
    case 7:
2018
0
        /*
2019
0
         * DR7: Bit 10 reserved (set to 1).
2020
0
         *      Bits 11-12,14-15 reserved (set to 0).
2021
0
         */
2022
0
        value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
2023
0
        value |=  DR_CONTROL_RESERVED_ONE;  /* reserved bits => 1 */
2024
0
        /*
2025
0
         * Privileged bits:
2026
0
         *      GD (bit 13): must be 0.
2027
0
         */
2028
0
        if ( value & DR_GENERAL_DETECT )
2029
0
            return -EPERM;
2030
0
        /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
2031
0
        if ( value & DR7_ACTIVE_MASK )
2032
0
        {
2033
0
            unsigned int io_enable = 0;
2034
0
2035
0
            for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
2036
0
            {
2037
0
                if ( ((value >> i) & 3) == DR_IO )
2038
0
                {
2039
0
                    if ( !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) )
2040
0
                        return -EPERM;
2041
0
                    io_enable |= value & (3 << ((i - 16) >> 1));
2042
0
                }
2043
0
            }
2044
0
2045
0
            /* Guest DR5 is a handy stash for I/O intercept information. */
2046
0
            v->arch.debugreg[5] = io_enable;
2047
0
            value &= ~io_enable;
2048
0
2049
0
            /*
2050
0
             * If DR7 was previously clear then we need to load all other
2051
0
             * debug registers at this point as they were not restored during
2052
0
             * context switch.
2053
0
             */
2054
0
            if ( (v == curr) &&
2055
0
                 !(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
2056
0
            {
2057
0
                activate_debugregs(v);
2058
0
                break;
2059
0
            }
2060
0
        }
2061
0
        if ( v == curr )
2062
0
            write_debugreg(7, value);
2063
0
        break;
2064
0
    default:
2065
0
        return -EINVAL;
2066
0
    }
2067
0
2068
0
    v->arch.debugreg[reg] = value;
2069
0
    return 0;
2070
0
}
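
The DR7 loop above steps through the four per-breakpoint condition fields: breakpoint n has a 2-bit R/W field at bit DR_CONTROL_SHIFT + n * DR_CONTROL_SIZE, the encoding DR_IO marks an I/O breakpoint (only legal with CR4.DE set), and the matching local/global enable bits sit at bits 2n and 2n+1, which is what value & (3 << ((i - 16) >> 1)) picks out. A standalone sketch of the same walk, with constants set to the architectural values rather than taken from Xen headers:

#include <stdio.h>

#define DR_CONTROL_SHIFT 16   /* bit of the R/W field for breakpoint 0 */
#define DR_CONTROL_SIZE   4   /* one R/W + LEN pair per breakpoint */
#define DR_IO             2   /* R/W encoding: break on I/O port access */

int main(void)
{
    /* Hypothetical guest DR7: breakpoint 1 locally enabled (bit 2), R/W1 = DR_IO. */
    unsigned long value = (2ul << 20) | (1ul << 2);
    unsigned int i, io_enable = 0;

    for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
        if ( ((value >> i) & 3) == DR_IO )
            /* Collect the L/G enable bits of this breakpoint, as set_debugreg() does. */
            io_enable |= value & (3 << ((i - 16) >> 1));

    printf("io_enable = %#x\n", io_enable);   /* prints 0x4: only L1 was set */

    return 0;
}
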
2071
2072
void asm_domain_crash_synchronous(unsigned long addr)
2073
0
{
2074
0
    /*
2075
0
     * We need to clear the AC bit here because in entry.S AC is set
2076
0
     * by ASM_STAC to temporarily allow accesses to user pages, which
2077
0
     * are prevented by SMAP by default.
2078
0
     *
2079
0
     * On some of the code paths that reach this function, clac() is not
2081
0
     * strictly needed, but doing it here rather than at every call site
2082
0
     * of asm_domain_crash_synchronous() reduces code duplication and is
2083
0
     * harmless in any case.
2083
0
     */
2084
0
    clac();
2085
0
2086
0
    if ( addr == 0 )
2087
0
        addr = this_cpu(last_extable_addr);
2088
0
2089
0
    printk("domain_crash_sync called from entry.S: fault at %p %pS\n",
2090
0
           _p(addr), _p(addr));
2091
0
2092
0
    __domain_crash_synchronous();
2093
0
}
2094
2095
/*
2096
 * Local variables:
2097
 * mode: C
2098
 * c-file-style: "BSD"
2099
 * c-basic-offset: 4
2100
 * tab-width: 4
2101
 * indent-tabs-mode: nil
2102
 * End:
2103
 */