Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/arch/x86/mm/shadow/common.c
Line  Count  Source
1
/******************************************************************************
2
 * arch/x86/mm/shadow/common.c
3
 *
4
 * Shadow code that does not need to be multiply compiled.
5
 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6
 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7
 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8
 *
9
 * This program is free software; you can redistribute it and/or modify
10
 * it under the terms of the GNU General Public License as published by
11
 * the Free Software Foundation; either version 2 of the License, or
12
 * (at your option) any later version.
13
 *
14
 * This program is distributed in the hope that it will be useful,
15
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
 * GNU General Public License for more details.
18
 *
19
 * You should have received a copy of the GNU General Public License
20
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
21
 */
22
23
#include <xen/types.h>
24
#include <xen/mm.h>
25
#include <xen/trace.h>
26
#include <xen/sched.h>
27
#include <xen/perfc.h>
28
#include <xen/irq.h>
29
#include <xen/domain_page.h>
30
#include <xen/guest_access.h>
31
#include <xen/keyhandler.h>
32
#include <asm/event.h>
33
#include <asm/page.h>
34
#include <asm/current.h>
35
#include <asm/flushtlb.h>
36
#include <asm/shadow.h>
37
#include <asm/hvm/ioreq.h>
38
#include <xen/numa.h>
39
#include "private.h"
40
41
DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags);
42
43
static int sh_enable_log_dirty(struct domain *, bool log_global);
44
static int sh_disable_log_dirty(struct domain *);
45
static void sh_clean_dirty_bitmap(struct domain *);
46
47
/* Set up the shadow-specific parts of a domain struct at start of day.
48
 * Called for every domain from arch_domain_create() */
49
int shadow_domain_init(struct domain *d, unsigned int domcr_flags)
50
0
{
51
0
    static const struct log_dirty_ops sh_ops = {
52
0
        .enable  = sh_enable_log_dirty,
53
0
        .disable = sh_disable_log_dirty,
54
0
        .clean   = sh_clean_dirty_bitmap,
55
0
    };
56
0
57
0
    INIT_PAGE_LIST_HEAD(&d->arch.paging.shadow.freelist);
58
0
    INIT_PAGE_LIST_HEAD(&d->arch.paging.shadow.pinned_shadows);
59
0
60
0
    /* Use shadow pagetables for log-dirty support */
61
0
    paging_log_dirty_init(d, &sh_ops);
62
0
63
0
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
64
0
    d->arch.paging.shadow.oos_active = 0;
65
0
    d->arch.paging.shadow.oos_off = (domcr_flags & DOMCRF_oos_off) ?  1 : 0;
66
0
#endif
67
0
    d->arch.paging.shadow.pagetable_dying_op = 0;
68
0
69
0
    return 0;
70
0
}
71
72
/* Set up the shadow-specific parts of a vcpu struct. Note: The most important
73
 * job is to initialize the update_paging_modes() function pointer, which is
74
 * used to initialize the rest of the resources. Therefore, it really does not
75
 * matter to have v->arch.paging.mode pointing to any mode, as long as it can
76
 * be compiled.
77
 */
78
void shadow_vcpu_init(struct vcpu *v)
79
0
{
80
0
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
81
0
    int i, j;
82
0
83
0
    for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
84
0
    {
85
0
        v->arch.paging.shadow.oos[i] = INVALID_MFN;
86
0
        v->arch.paging.shadow.oos_snapshot[i] = INVALID_MFN;
87
0
        for ( j = 0; j < SHADOW_OOS_FIXUPS; j++ )
88
0
            v->arch.paging.shadow.oos_fixup[i].smfn[j] = INVALID_MFN;
89
0
    }
90
0
#endif
91
0
92
0
    v->arch.paging.mode = is_pv_vcpu(v) ?
93
0
                          &SHADOW_INTERNAL_NAME(sh_paging_mode, 4) :
94
0
                          &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
95
0
}
96
97
#if SHADOW_AUDIT
98
int shadow_audit_enable = 0;
99
100
static void shadow_audit_key(unsigned char key)
101
0
{
102
0
    shadow_audit_enable = !shadow_audit_enable;
103
0
    printk("%s shadow_audit_enable=%d\n",
104
0
           __func__, shadow_audit_enable);
105
0
}
106
107
static int __init shadow_audit_key_init(void)
108
1
{
109
1
    register_keyhandler('O', shadow_audit_key, "toggle shadow audits", 0);
110
1
    return 0;
111
1
}
112
__initcall(shadow_audit_key_init);
113
#endif /* SHADOW_AUDIT */
114
115
116
/**************************************************************************/
117
/* x86 emulator support for the shadow code
118
 */
119
120
/*
121
 * Callers which pass a known in-range x86_segment can rely on the return
122
 * pointer being valid.  Other callers must explicitly check for errors.
123
 */
124
static struct segment_register *hvm_get_seg_reg(
125
    enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt)
126
0
{
127
0
    unsigned int idx = seg;
128
0
    struct segment_register *seg_reg;
129
0
130
0
    if ( idx >= ARRAY_SIZE(sh_ctxt->seg_reg) )
131
0
        return ERR_PTR(-X86EMUL_UNHANDLEABLE);
132
0
133
0
    seg_reg = &sh_ctxt->seg_reg[idx];
134
0
    if ( !__test_and_set_bit(idx, &sh_ctxt->valid_seg_regs) )
135
0
        hvm_get_segment_register(current, idx, seg_reg);
136
0
    return seg_reg;
137
0
}
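hvm_get_seg_reg() above caches segment registers per emulation: the valid_seg_regs bitmask records which seg_reg[] entries have already been read, so only the first lookup of a segment calls hvm_get_segment_register(). A minimal standalone sketch of the same lazy-caching pattern (simplified types and hypothetical names, not the Xen API):

#include <stdint.h>

#define NR_SEGS 6

struct seg { uint64_t base; };                    /* stand-in for struct segment_register */

struct emu_ctxt {
    uint32_t valid_seg_regs;                      /* bitmask: which entries are cached */
    struct seg seg_reg[NR_SEGS];
};

/* Hypothetical slow path, standing in for hvm_get_segment_register(). */
static void fetch_seg(unsigned int idx, struct seg *out)
{
    out->base = idx * 0x1000u;
}

static struct seg *get_seg_cached(unsigned int idx, struct emu_ctxt *ctxt)
{
    if ( idx >= NR_SEGS )
        return NULL;                              /* out of range: caller must check */
    if ( !(ctxt->valid_seg_regs & (1u << idx)) )
    {
        fetch_seg(idx, &ctxt->seg_reg[idx]);      /* first use: fill the cache slot */
        ctxt->valid_seg_regs |= 1u << idx;
    }
    return &ctxt->seg_reg[idx];                   /* later uses hit the cache */
}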
138
139
static int hvm_translate_virtual_addr(
140
    enum x86_segment seg,
141
    unsigned long offset,
142
    unsigned int bytes,
143
    enum hvm_access_type access_type,
144
    struct sh_emulate_ctxt *sh_ctxt,
145
    unsigned long *linear)
146
0
{
147
0
    const struct segment_register *reg;
148
0
    int okay;
149
0
150
0
    reg = hvm_get_seg_reg(seg, sh_ctxt);
151
0
    if ( IS_ERR(reg) )
152
0
        return -PTR_ERR(reg);
153
0
154
0
    okay = hvm_virtual_to_linear_addr(
155
0
        seg, reg, offset, bytes, access_type,
156
0
        hvm_get_seg_reg(x86_seg_cs, sh_ctxt), linear);
157
0
158
0
    if ( !okay )
159
0
    {
160
0
        /*
161
0
         * Leave exception injection to the caller for non-user segments: We
162
0
         * neither know the exact error code to be used, nor can we easily
163
0
         * determine the kind of exception (#GP or #TS) in that case.
164
0
         */
165
0
        if ( is_x86_user_segment(seg) )
166
0
            x86_emul_hw_exception(
167
0
                (seg == x86_seg_ss) ? TRAP_stack_error : TRAP_gp_fault,
168
0
                0, &sh_ctxt->ctxt);
169
0
        return X86EMUL_EXCEPTION;
170
0
    }
171
0
172
0
    return 0;
173
0
}
174
175
static int
176
hvm_read(enum x86_segment seg,
177
         unsigned long offset,
178
         void *p_data,
179
         unsigned int bytes,
180
         enum hvm_access_type access_type,
181
         struct sh_emulate_ctxt *sh_ctxt)
182
0
{
183
0
    pagefault_info_t pfinfo;
184
0
    unsigned long addr;
185
0
    int rc;
186
0
187
0
    rc = hvm_translate_virtual_addr(
188
0
        seg, offset, bytes, access_type, sh_ctxt, &addr);
189
0
    if ( rc || !bytes )
190
0
        return rc;
191
0
192
0
    if ( access_type == hvm_access_insn_fetch )
193
0
        rc = hvm_fetch_from_guest_linear(p_data, addr, bytes, 0, &pfinfo);
194
0
    else
195
0
        rc = hvm_copy_from_guest_linear(p_data, addr, bytes, 0, &pfinfo);
196
0
197
0
    switch ( rc )
198
0
    {
199
0
    case HVMTRANS_okay:
200
0
        return X86EMUL_OKAY;
201
0
    case HVMTRANS_bad_linear_to_gfn:
202
0
        x86_emul_pagefault(pfinfo.ec, pfinfo.linear, &sh_ctxt->ctxt);
203
0
        return X86EMUL_EXCEPTION;
204
0
    case HVMTRANS_bad_gfn_to_mfn:
205
0
    case HVMTRANS_unhandleable:
206
0
        return X86EMUL_UNHANDLEABLE;
207
0
    case HVMTRANS_gfn_paged_out:
208
0
    case HVMTRANS_gfn_shared:
209
0
        return X86EMUL_RETRY;
210
0
    }
211
0
212
0
    BUG();
213
0
    return X86EMUL_UNHANDLEABLE;
214
0
}
215
216
static int
217
hvm_emulate_read(enum x86_segment seg,
218
                 unsigned long offset,
219
                 void *p_data,
220
                 unsigned int bytes,
221
                 struct x86_emulate_ctxt *ctxt)
222
0
{
223
0
    if ( !is_x86_user_segment(seg) )
224
0
        return X86EMUL_UNHANDLEABLE;
225
0
    return hvm_read(seg, offset, p_data, bytes, hvm_access_read,
226
0
                    container_of(ctxt, struct sh_emulate_ctxt, ctxt));
227
0
}
228
229
static int
230
hvm_emulate_insn_fetch(enum x86_segment seg,
231
                       unsigned long offset,
232
                       void *p_data,
233
                       unsigned int bytes,
234
                       struct x86_emulate_ctxt *ctxt)
235
0
{
236
0
    struct sh_emulate_ctxt *sh_ctxt =
237
0
        container_of(ctxt, struct sh_emulate_ctxt, ctxt);
238
0
    unsigned int insn_off = offset - sh_ctxt->insn_buf_eip;
239
0
240
0
    ASSERT(seg == x86_seg_cs);
241
0
242
0
    /* Fall back if requested bytes are not in the prefetch cache. */
243
0
    if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
244
0
        return hvm_read(seg, offset, p_data, bytes,
245
0
                        hvm_access_insn_fetch, sh_ctxt);
246
0
247
0
    /* Hit the cache. Simple memcpy. */
248
0
    memcpy(p_data, &sh_ctxt->insn_buf[insn_off], bytes);
249
0
    return X86EMUL_OKAY;
250
0
}
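hvm_emulate_insn_fetch() above serves instruction bytes out of the prefetch buffer filled elsewhere (shadow_init_emulation() and shadow_continue_emulation(), later in this file), falling back to hvm_read() only when the requested range lies outside the buffered bytes. A simplified sketch of that hit/miss test (plain C with hypothetical names; the fallback path is omitted):

#include <stdint.h>
#include <string.h>

#define INSN_BUF_SIZE 16

struct insn_cache {
    uint64_t buf_eip;                 /* guest RIP the buffer was fetched from */
    unsigned int buf_bytes;           /* valid bytes in buf[] (0 if the prefetch failed) */
    uint8_t buf[INSN_BUF_SIZE];
};

/* Returns 1 on a cache hit, 0 if the caller must fall back to a guest read. */
static int insn_fetch_cached(const struct insn_cache *c, uint64_t rip,
                             void *dst, unsigned int bytes)
{
    uint64_t off = rip - c->buf_eip;  /* wraps to a huge value if rip < buf_eip */

    if ( off + bytes > c->buf_bytes )
        return 0;                     /* miss: request runs past the cached bytes */

    memcpy(dst, &c->buf[off], bytes); /* hit: plain memcpy, as in the code above */
    return 1;
}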
251
252
static int
253
hvm_emulate_write(enum x86_segment seg,
254
                  unsigned long offset,
255
                  void *p_data,
256
                  unsigned int bytes,
257
                  struct x86_emulate_ctxt *ctxt)
258
0
{
259
0
    struct sh_emulate_ctxt *sh_ctxt =
260
0
        container_of(ctxt, struct sh_emulate_ctxt, ctxt);
261
0
    struct vcpu *v = current;
262
0
    unsigned long addr;
263
0
    int rc;
264
0
265
0
    /* How many emulations could we save if we unshadowed on stack writes? */
266
0
    if ( seg == x86_seg_ss )
267
0
        perfc_incr(shadow_fault_emulate_stack);
268
0
269
0
    rc = hvm_translate_virtual_addr(
270
0
        seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
271
0
    if ( rc || !bytes )
272
0
        return rc;
273
0
274
0
    return v->arch.paging.mode->shadow.x86_emulate_write(
275
0
        v, addr, p_data, bytes, sh_ctxt);
276
0
}
277
278
static int
279
hvm_emulate_cmpxchg(enum x86_segment seg,
280
                    unsigned long offset,
281
                    void *p_old,
282
                    void *p_new,
283
                    unsigned int bytes,
284
                    struct x86_emulate_ctxt *ctxt)
285
0
{
286
0
    struct sh_emulate_ctxt *sh_ctxt =
287
0
        container_of(ctxt, struct sh_emulate_ctxt, ctxt);
288
0
    struct vcpu *v = current;
289
0
    unsigned long addr, old, new;
290
0
    int rc;
291
0
292
0
    if ( bytes > sizeof(long) )
293
0
        return X86EMUL_UNHANDLEABLE;
294
0
295
0
    rc = hvm_translate_virtual_addr(
296
0
        seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
297
0
    if ( rc )
298
0
        return rc;
299
0
300
0
    old = new = 0;
301
0
    memcpy(&old, p_old, bytes);
302
0
    memcpy(&new, p_new, bytes);
303
0
304
0
    return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
305
0
               v, addr, old, new, bytes, sh_ctxt);
306
0
}
307
308
static const struct x86_emulate_ops hvm_shadow_emulator_ops = {
309
    .read       = hvm_emulate_read,
310
    .insn_fetch = hvm_emulate_insn_fetch,
311
    .write      = hvm_emulate_write,
312
    .cmpxchg    = hvm_emulate_cmpxchg,
313
    .cpuid      = hvmemul_cpuid,
314
};
315
316
const struct x86_emulate_ops *shadow_init_emulation(
317
    struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs)
318
0
{
319
0
    struct segment_register *creg, *sreg;
320
0
    struct vcpu *v = current;
321
0
    unsigned long addr;
322
0
323
0
    ASSERT(is_hvm_vcpu(v));
324
0
325
0
    memset(sh_ctxt, 0, sizeof(*sh_ctxt));
326
0
327
0
    sh_ctxt->ctxt.regs = regs;
328
0
    sh_ctxt->ctxt.vendor = v->domain->arch.cpuid->x86_vendor;
329
0
    sh_ctxt->ctxt.lma = hvm_long_mode_active(v);
330
0
331
0
    /* Segment cache initialisation. Primed with CS. */
332
0
    creg = hvm_get_seg_reg(x86_seg_cs, sh_ctxt);
333
0
334
0
    /* Work out the emulation mode. */
335
0
    if ( sh_ctxt->ctxt.lma && creg->l )
336
0
        sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = 64;
337
0
    else
338
0
    {
339
0
        sreg = hvm_get_seg_reg(x86_seg_ss, sh_ctxt);
340
0
        sh_ctxt->ctxt.addr_size = creg->db ? 32 : 16;
341
0
        sh_ctxt->ctxt.sp_size   = sreg->db ? 32 : 16;
342
0
    }
343
0
344
0
    /* Attempt to prefetch whole instruction. */
345
0
    sh_ctxt->insn_buf_eip = regs->rip;
346
0
    sh_ctxt->insn_buf_bytes =
347
0
        (!hvm_translate_virtual_addr(
348
0
            x86_seg_cs, regs->rip, sizeof(sh_ctxt->insn_buf),
349
0
            hvm_access_insn_fetch, sh_ctxt, &addr) &&
350
0
         !hvm_fetch_from_guest_linear(
351
0
             sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0, NULL))
352
0
        ? sizeof(sh_ctxt->insn_buf) : 0;
353
0
354
0
    return &hvm_shadow_emulator_ops;
355
0
}
356
357
/* Update an initialized emulation context to prepare for the next
358
 * instruction */
359
void shadow_continue_emulation(struct sh_emulate_ctxt *sh_ctxt,
360
                               struct cpu_user_regs *regs)
361
0
{
362
0
    struct vcpu *v = current;
363
0
    unsigned long addr, diff;
364
0
365
0
    ASSERT(is_hvm_vcpu(v));
366
0
367
0
    /*
368
0
     * We don't refetch the segment bases, because we don't emulate
369
0
     * writes to segment registers
370
0
     */
371
0
    diff = regs->rip - sh_ctxt->insn_buf_eip;
372
0
    if ( diff > sh_ctxt->insn_buf_bytes )
373
0
    {
374
0
        /* Prefetch more bytes. */
375
0
        sh_ctxt->insn_buf_bytes =
376
0
            (!hvm_translate_virtual_addr(
377
0
                x86_seg_cs, regs->rip, sizeof(sh_ctxt->insn_buf),
378
0
                hvm_access_insn_fetch, sh_ctxt, &addr) &&
379
0
             !hvm_fetch_from_guest_linear(
380
0
                 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0, NULL))
381
0
            ? sizeof(sh_ctxt->insn_buf) : 0;
382
0
        sh_ctxt->insn_buf_eip = regs->rip;
383
0
    }
384
0
}
385
386
387
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
388
/**************************************************************************/
389
/* Out-of-sync shadows. */
390
391
/* From time to time, we let a shadowed pagetable page go out of sync
392
 * with its shadow: the guest is allowed to write directly to the page,
393
 * and those writes are not synchronously reflected in the shadow.
394
 * This lets us avoid many emulations if the guest is writing a lot to a
395
 * pagetable, but it relaxes a pretty important invariant in the shadow
396
 * pagetable design.  Therefore, some rules:
397
 *
398
 * 1. Only L1 pagetables may go out of sync: any page that is shadowed
399
 *    at a higher level must be synchronously updated.  This makes
400
 *    using linear shadow pagetables much less dangerous.
401
 *    That means that: (a) unsyncing code needs to check for higher-level
402
 *    shadows, and (b) promotion code needs to resync.
403
 *
404
 * 2. All shadow operations on a guest page require the page to be brought
405
 *    back into sync before proceeding.  This must be done under the
406
 *    paging lock so that the page is guaranteed to remain synced until
407
 *    the operation completes.
408
 *
409
 *    Exceptions to this rule: the pagefault and invlpg handlers may
410
 *    update only one entry on an out-of-sync page without resyncing it.
411
 *
412
 * 3. Operations on shadows that do not start from a guest page need to
413
 *    be aware that they may be handling an out-of-sync shadow.
414
 *
415
 * 4. Operations that do not normally take the paging lock (fast-path
416
 *    #PF handler, INVLPG) must fall back to a locking, syncing version
417
 *    if they see an out-of-sync table.
418
 *
419
 * 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG)
420
 *    must explicitly resync all relevant pages or update their
421
 *    shadows.
422
 *
423
 * Currently out-of-sync pages are listed in a simple open-addressed
424
 * hash table with a second chance (must resist temptation to radically
425
 * over-engineer hash tables...)  The virtual address of the access
426
 * which caused us to unsync the page is also kept in the hash table, as
427
 * a hint for finding the writable mappings later.
428
 *
429
 * We keep a hash per vcpu, because we want as much as possible to do
430
 * the re-sync on the same vcpu we did the unsync on, so the VA hint
431
 * will be valid.
432
 */
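The comment above describes a per-vcpu open-addressed hash with a "second chance": an entry lives either in its home slot, mfn % SHADOW_OOS_PAGES, or in the next slot, so a lookup probes at most two slots (this is the pattern used by oos_hash_add(), oos_hash_remove(), sh_resync() and friends below). A minimal standalone illustration with plain integers (hypothetical helper, not the Xen code):

#define OOS_PAGES 3                       /* stands in for SHADOW_OOS_PAGES */

/* Return the slot holding mfn, or -1 if it is not in the table.  Only the
 * home slot and the one "second chance" slot ever need to be probed. */
static int oos_lookup(const unsigned long table[OOS_PAGES], unsigned long mfn)
{
    int idx = mfn % OOS_PAGES;

    if ( table[idx] != mfn )
        idx = (idx + 1) % OOS_PAGES;      /* second chance */

    return (table[idx] == mfn) ? idx : -1;
}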
433
434
435
#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
436
static void sh_oos_audit(struct domain *d)
437
{
438
    int idx, expected_idx, expected_idx_alt;
439
    struct page_info *pg;
440
    struct vcpu *v;
441
442
    for_each_vcpu(d, v)
443
    {
444
        for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
445
        {
446
            mfn_t *oos = v->arch.paging.shadow.oos;
447
            if ( !mfn_valid(oos[idx]) )
448
                continue;
449
450
            expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES;
451
            expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES);
452
            if ( idx != expected_idx && idx != expected_idx_alt )
453
            {
454
                printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n",
455
                       __func__, idx, mfn_x(oos[idx]),
456
                       expected_idx, expected_idx_alt);
457
                BUG();
458
            }
459
            pg = mfn_to_page(oos[idx]);
460
            if ( !(pg->count_info & PGC_page_table) )
461
            {
462
                printk("%s: idx %x gmfn %lx not a pt (count %"PRIx32")\n",
463
                       __func__, idx, mfn_x(oos[idx]), pg->count_info);
464
                BUG();
465
            }
466
            if ( !(pg->shadow_flags & SHF_out_of_sync) )
467
            {
468
                printk("%s: idx %x gmfn %lx not marked oos (flags %lx)\n",
469
                       __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
470
                BUG();
471
            }
472
            if ( (pg->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY) )
473
            {
474
                printk("%s: idx %x gmfn %lx shadowed as non-l1 (flags %lx)\n",
475
                       __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
476
                BUG();
477
            }
478
        }
479
    }
480
}
481
#endif
482
483
#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
484
void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn)
485
0
{
486
0
    int idx;
487
0
    struct vcpu *v;
488
0
    mfn_t *oos;
489
0
490
0
    ASSERT(mfn_is_out_of_sync(gmfn));
491
0
492
0
    for_each_vcpu(d, v)
493
0
    {
494
0
        oos = v->arch.paging.shadow.oos;
495
0
        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
496
0
        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
497
0
            idx = (idx + 1) % SHADOW_OOS_PAGES;
498
0
499
0
        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
500
0
            return;
501
0
    }
502
0
503
0
    SHADOW_ERROR("gmfn %lx marked OOS but not in hash table\n", mfn_x(gmfn));
504
0
    BUG();
505
0
}
506
#endif
507
508
/* Update the shadow, but keep the page out of sync. */
509
static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn, mfn_t snpmfn)
510
0
{
511
0
    struct page_info *pg = mfn_to_page(gmfn);
512
0
513
0
    ASSERT(mfn_valid(gmfn));
514
0
    ASSERT(page_is_out_of_sync(pg));
515
0
516
0
    /* Call out to the appropriate per-mode resyncing function */
517
0
    if ( pg->shadow_flags & SHF_L1_32 )
518
0
        SHADOW_INTERNAL_NAME(sh_resync_l1, 2)(v, gmfn, snpmfn);
519
0
    else if ( pg->shadow_flags & SHF_L1_PAE )
520
0
        SHADOW_INTERNAL_NAME(sh_resync_l1, 3)(v, gmfn, snpmfn);
521
0
    else if ( pg->shadow_flags & SHF_L1_64 )
522
0
        SHADOW_INTERNAL_NAME(sh_resync_l1, 4)(v, gmfn, snpmfn);
523
0
}
524
525
526
/*
527
 * Fixup arrays: We limit the maximum number of writable mappings to
528
 * SHADOW_OOS_FIXUPS and store enough information to remove them
529
 * quickly on resync.
530
 */
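As the comment says, each out-of-sync entry carries a small fixed-size array of known writable mappings plus a "next" cursor; oos_fixup_add() below first checks for a duplicate, then reuses the slot at "next" (evicting its previous occupant) and advances the cursor modulo SHADOW_OOS_FIXUPS. A simplified sketch of that ring-buffer bookkeeping (plain types, hypothetical names):

#define FIXUPS 2                          /* stands in for SHADOW_OOS_FIXUPS */
#define NO_MFN (~0UL)                     /* stands in for INVALID_MFN */

struct fixup_ring {
    unsigned long smfn[FIXUPS];           /* shadow page containing the writable PTE */
    unsigned long off[FIXUPS];            /* offset of that PTE within the shadow */
    int next;                             /* slot to reuse on the next insertion */
};

/* Record a writable mapping.  Returns the smfn previously held by the reused
 * slot (NO_MFN if it was free) so the caller can tear that mapping down; the
 * real oos_fixup_add() does the teardown before overwriting the slot. */
static unsigned long fixup_record(struct fixup_ring *f,
                                  unsigned long smfn, unsigned long off)
{
    unsigned long evicted;
    int i;

    for ( i = 0; i < FIXUPS; i++ )
        if ( f->smfn[i] == smfn && f->off[i] == off )
            return NO_MFN;                /* already tracked: nothing to do */

    evicted = f->smfn[f->next];
    f->smfn[f->next] = smfn;
    f->off[f->next] = off;
    f->next = (f->next + 1) % FIXUPS;     /* advance the cursor round-robin */
    return evicted;
}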
531
532
static inline int oos_fixup_flush_gmfn(struct vcpu *v, mfn_t gmfn,
533
                                       struct oos_fixup *fixup)
534
0
{
535
0
    struct domain *d = v->domain;
536
0
    int i;
537
0
    for ( i = 0; i < SHADOW_OOS_FIXUPS; i++ )
538
0
    {
539
0
        if ( !mfn_eq(fixup->smfn[i], INVALID_MFN) )
540
0
        {
541
0
            sh_remove_write_access_from_sl1p(d, gmfn,
542
0
                                             fixup->smfn[i],
543
0
                                             fixup->off[i]);
544
0
            fixup->smfn[i] = INVALID_MFN;
545
0
        }
546
0
    }
547
0
548
0
    /* Always flush the TLBs. See comment on oos_fixup_add(). */
549
0
    return 1;
550
0
}
551
552
void oos_fixup_add(struct domain *d, mfn_t gmfn,
553
                   mfn_t smfn,  unsigned long off)
554
0
{
555
0
    int idx, next;
556
0
    mfn_t *oos;
557
0
    struct oos_fixup *oos_fixup;
558
0
    struct vcpu *v;
559
0
560
0
    perfc_incr(shadow_oos_fixup_add);
561
0
562
0
    for_each_vcpu(d, v)
563
0
    {
564
0
        oos = v->arch.paging.shadow.oos;
565
0
        oos_fixup = v->arch.paging.shadow.oos_fixup;
566
0
        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
567
0
        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
568
0
            idx = (idx + 1) % SHADOW_OOS_PAGES;
569
0
        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
570
0
        {
571
0
            int i;
572
0
            for ( i = 0; i < SHADOW_OOS_FIXUPS; i++ )
573
0
            {
574
0
                if ( mfn_valid(oos_fixup[idx].smfn[i])
575
0
                     && (mfn_x(oos_fixup[idx].smfn[i]) == mfn_x(smfn))
576
0
                     && (oos_fixup[idx].off[i] == off) )
577
0
                    return;
578
0
            }
579
0
580
0
            next = oos_fixup[idx].next;
581
0
582
0
            if ( !mfn_eq(oos_fixup[idx].smfn[next], INVALID_MFN) )
583
0
            {
584
0
                TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_EVICT);
585
0
586
0
                /* Reuse this slot and remove current writable mapping. */
587
0
                sh_remove_write_access_from_sl1p(d, gmfn,
588
0
                                                 oos_fixup[idx].smfn[next],
589
0
                                                 oos_fixup[idx].off[next]);
590
0
                perfc_incr(shadow_oos_fixup_evict);
591
0
                /* We should flush the TLBs now, because we removed a
592
0
                   writable mapping, but since the shadow is already
593
0
                   OOS we have no problem if another vcpu writes to
594
0
                   this page table. We just have to be very careful to
595
0
                   *always* flush the tlbs on resync. */
596
0
            }
597
0
598
0
            oos_fixup[idx].smfn[next] = smfn;
599
0
            oos_fixup[idx].off[next] = off;
600
0
            oos_fixup[idx].next = (next + 1) % SHADOW_OOS_FIXUPS;
601
0
602
0
            TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_ADD);
603
0
            return;
604
0
        }
605
0
    }
606
0
607
0
    SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
608
0
    BUG();
609
0
}
610
611
static int oos_remove_write_access(struct vcpu *v, mfn_t gmfn,
612
                                   struct oos_fixup *fixup)
613
0
{
614
0
    struct domain *d = v->domain;
615
0
    int ftlb = 0;
616
0
617
0
    ftlb |= oos_fixup_flush_gmfn(v, gmfn, fixup);
618
0
619
0
    switch ( sh_remove_write_access(d, gmfn, 0, 0) )
620
0
    {
621
0
    default:
622
0
    case 0:
623
0
        break;
624
0
625
0
    case 1:
626
0
        ftlb |= 1;
627
0
        break;
628
0
629
0
    case -1:
630
0
        /* An unfindable writeable typecount has appeared, probably via a
631
0
         * grant table entry: can't shoot the mapping, so try to unshadow
632
0
         * the page.  If that doesn't work either, the guest is granting
633
0
         * access to its own pagetables and must be killed after all.
634
0
         * This will flush the tlb, so we can return with no worries. */
635
0
        sh_remove_shadows(d, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
636
0
        return 1;
637
0
    }
638
0
639
0
    if ( ftlb )
640
0
        flush_tlb_mask(d->domain_dirty_cpumask);
641
0
642
0
    return 0;
643
0
}
644
645
646
static inline void trace_resync(int event, mfn_t gmfn)
647
0
{
648
0
    if ( tb_init_done )
649
0
    {
650
0
        /* Convert gmfn to gfn */
651
0
        unsigned long gfn = mfn_to_gfn(current->domain, gmfn);
652
0
        __trace_var(event, 0/*!tsc*/, sizeof(gfn), &gfn);
653
0
    }
654
0
}
655
656
/* Pull all the entries on an out-of-sync page back into sync. */
657
static void _sh_resync(struct vcpu *v, mfn_t gmfn,
658
                       struct oos_fixup *fixup, mfn_t snp)
659
0
{
660
0
    struct page_info *pg = mfn_to_page(gmfn);
661
0
662
0
    ASSERT(paging_locked_by_me(v->domain));
663
0
    ASSERT(mfn_is_out_of_sync(gmfn));
664
0
    /* Guest page must be shadowed *only* as L1 when out of sync. */
665
0
    ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask
666
0
             & ~SHF_L1_ANY));
667
0
    ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn)));
668
0
669
0
    SHADOW_PRINTK("%pv gmfn=%"PRI_mfn"\n", v, mfn_x(gmfn));
670
0
671
0
    /* Need to pull write access so the page *stays* in sync. */
672
0
    if ( oos_remove_write_access(v, gmfn, fixup) )
673
0
    {
674
0
        /* Page has been unshadowed. */
675
0
        return;
676
0
    }
677
0
678
0
    /* No more writable mappings of this page, please */
679
0
    pg->shadow_flags &= ~SHF_oos_may_write;
680
0
681
0
    /* Update the shadows with current guest entries. */
682
0
    _sh_resync_l1(v, gmfn, snp);
683
0
684
0
    /* Now we know all the entries are synced, and will stay that way */
685
0
    pg->shadow_flags &= ~SHF_out_of_sync;
686
0
    perfc_incr(shadow_resync);
687
0
    trace_resync(TRC_SHADOW_RESYNC_FULL, gmfn);
688
0
}
689
690
691
/* Add an MFN to the list of out-of-sync guest pagetables */
692
static void oos_hash_add(struct vcpu *v, mfn_t gmfn)
693
0
{
694
0
    int i, idx, oidx, swap = 0;
695
0
    void *gptr, *gsnpptr;
696
0
    mfn_t *oos = v->arch.paging.shadow.oos;
697
0
    mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
698
0
    struct oos_fixup *oos_fixup = v->arch.paging.shadow.oos_fixup;
699
0
    struct oos_fixup fixup = { .next = 0 };
700
0
701
0
    for ( i = 0; i < SHADOW_OOS_FIXUPS; i++ )
702
0
        fixup.smfn[i] = INVALID_MFN;
703
0
704
0
    idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
705
0
    oidx = idx;
706
0
707
0
    if ( mfn_valid(oos[idx])
708
0
         && (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx )
709
0
    {
710
0
        /* Punt the current occupant into the next slot */
711
0
        SWAP(oos[idx], gmfn);
712
0
        SWAP(oos_fixup[idx], fixup);
713
0
        swap = 1;
714
0
        idx = (idx + 1) % SHADOW_OOS_PAGES;
715
0
    }
716
0
    if ( mfn_valid(oos[idx]) )
717
0
    {
718
0
        /* Crush the current occupant. */
719
0
        _sh_resync(v, oos[idx], &oos_fixup[idx], oos_snapshot[idx]);
720
0
        perfc_incr(shadow_unsync_evict);
721
0
    }
722
0
    oos[idx] = gmfn;
723
0
    oos_fixup[idx] = fixup;
724
0
725
0
    if ( swap )
726
0
        SWAP(oos_snapshot[idx], oos_snapshot[oidx]);
727
0
728
0
    gptr = map_domain_page(oos[oidx]);
729
0
    gsnpptr = map_domain_page(oos_snapshot[oidx]);
730
0
    memcpy(gsnpptr, gptr, PAGE_SIZE);
731
0
    unmap_domain_page(gptr);
732
0
    unmap_domain_page(gsnpptr);
733
0
}
734
735
/* Remove an MFN from the list of out-of-sync guest pagetables */
736
static void oos_hash_remove(struct domain *d, mfn_t gmfn)
737
0
{
738
0
    int idx;
739
0
    mfn_t *oos;
740
0
    struct vcpu *v;
741
0
742
0
    SHADOW_PRINTK("d%d gmfn %lx\n", d->domain_id, mfn_x(gmfn));
743
0
744
0
    for_each_vcpu(d, v)
745
0
    {
746
0
        oos = v->arch.paging.shadow.oos;
747
0
        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
748
0
        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
749
0
            idx = (idx + 1) % SHADOW_OOS_PAGES;
750
0
        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
751
0
        {
752
0
            oos[idx] = INVALID_MFN;
753
0
            return;
754
0
        }
755
0
    }
756
0
757
0
    SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
758
0
    BUG();
759
0
}
760
761
mfn_t oos_snapshot_lookup(struct domain *d, mfn_t gmfn)
762
0
{
763
0
    int idx;
764
0
    mfn_t *oos;
765
0
    mfn_t *oos_snapshot;
766
0
    struct vcpu *v;
767
0
768
0
    for_each_vcpu(d, v)
769
0
    {
770
0
        oos = v->arch.paging.shadow.oos;
771
0
        oos_snapshot = v->arch.paging.shadow.oos_snapshot;
772
0
        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
773
0
        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
774
0
            idx = (idx + 1) % SHADOW_OOS_PAGES;
775
0
        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
776
0
        {
777
0
            return oos_snapshot[idx];
778
0
        }
779
0
    }
780
0
781
0
    SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
782
0
    BUG();
783
0
}
784
785
/* Pull a single guest page back into sync */
786
void sh_resync(struct domain *d, mfn_t gmfn)
787
0
{
788
0
    int idx;
789
0
    mfn_t *oos;
790
0
    mfn_t *oos_snapshot;
791
0
    struct oos_fixup *oos_fixup;
792
0
    struct vcpu *v;
793
0
794
0
    for_each_vcpu(d, v)
795
0
    {
796
0
        oos = v->arch.paging.shadow.oos;
797
0
        oos_fixup = v->arch.paging.shadow.oos_fixup;
798
0
        oos_snapshot = v->arch.paging.shadow.oos_snapshot;
799
0
        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
800
0
        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
801
0
            idx = (idx + 1) % SHADOW_OOS_PAGES;
802
0
803
0
        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
804
0
        {
805
0
            _sh_resync(v, gmfn, &oos_fixup[idx], oos_snapshot[idx]);
806
0
            oos[idx] = INVALID_MFN;
807
0
            return;
808
0
        }
809
0
    }
810
0
811
0
    SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
812
0
    BUG();
813
0
}
814
815
/* Figure out whether it's definitely safe not to sync this l1 table,
816
 * by making a call out to the mode in which that shadow was made. */
817
static int sh_skip_sync(struct vcpu *v, mfn_t gl1mfn)
818
0
{
819
0
    struct page_info *pg = mfn_to_page(gl1mfn);
820
0
    if ( pg->shadow_flags & SHF_L1_32 )
821
0
        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 2)(v, gl1mfn);
822
0
    else if ( pg->shadow_flags & SHF_L1_PAE )
823
0
        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 3)(v, gl1mfn);
824
0
    else if ( pg->shadow_flags & SHF_L1_64 )
825
0
        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 4)(v, gl1mfn);
826
0
    SHADOW_ERROR("gmfn %#lx was OOS but not shadowed as an l1.\n",
827
0
                 mfn_x(gl1mfn));
828
0
    BUG();
829
0
}
830
831
832
/* Pull all out-of-sync pages back into sync.  Pages brought out of sync
833
 * on other vcpus are allowed to remain out of sync, but their contents
834
 * will be made safe (TLB flush semantics); pages unsynced by this vcpu
835
 * are brought back into sync and write-protected.  If skip != 0, we try
836
 * to avoid resyncing at all if we think we can get away with it. */
837
void sh_resync_all(struct vcpu *v, int skip, int this, int others)
838
0
{
839
0
    int idx;
840
0
    struct vcpu *other;
841
0
    mfn_t *oos = v->arch.paging.shadow.oos;
842
0
    mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
843
0
    struct oos_fixup *oos_fixup = v->arch.paging.shadow.oos_fixup;
844
0
845
0
    SHADOW_PRINTK("%pv\n", v);
846
0
847
0
    ASSERT(paging_locked_by_me(v->domain));
848
0
849
0
    if ( !this )
850
0
        goto resync_others;
851
0
852
0
    /* First: resync all of this vcpu's oos pages */
853
0
    for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
854
0
        if ( mfn_valid(oos[idx]) )
855
0
        {
856
0
            /* Write-protect and sync contents */
857
0
            _sh_resync(v, oos[idx], &oos_fixup[idx], oos_snapshot[idx]);
858
0
            oos[idx] = INVALID_MFN;
859
0
        }
860
0
861
0
 resync_others:
862
0
    if ( !others )
863
0
        return;
864
0
865
0
    /* Second: make all *other* vcpus' oos pages safe. */
866
0
    for_each_vcpu(v->domain, other)
867
0
    {
868
0
        if ( v == other )
869
0
            continue;
870
0
871
0
        oos = other->arch.paging.shadow.oos;
872
0
        oos_fixup = other->arch.paging.shadow.oos_fixup;
873
0
        oos_snapshot = other->arch.paging.shadow.oos_snapshot;
874
0
875
0
        for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
876
0
        {
877
0
            if ( !mfn_valid(oos[idx]) )
878
0
                continue;
879
0
880
0
            if ( skip )
881
0
            {
882
0
                /* Update the shadows and leave the page OOS. */
883
0
                if ( sh_skip_sync(v, oos[idx]) )
884
0
                    continue;
885
0
                trace_resync(TRC_SHADOW_RESYNC_ONLY, oos[idx]);
886
0
                _sh_resync_l1(other, oos[idx], oos_snapshot[idx]);
887
0
            }
888
0
            else
889
0
            {
890
0
                /* Write-protect and sync contents */
891
0
                _sh_resync(other, oos[idx], &oos_fixup[idx], oos_snapshot[idx]);
892
0
                oos[idx] = INVALID_MFN;
893
0
            }
894
0
        }
895
0
    }
896
0
}
897
898
/* Allow a shadowed page to go out of sync. Unsyncs are traced in
899
 * multi.c:sh_page_fault() */
900
int sh_unsync(struct vcpu *v, mfn_t gmfn)
901
0
{
902
0
    struct page_info *pg;
903
0
904
0
    ASSERT(paging_locked_by_me(v->domain));
905
0
906
0
    SHADOW_PRINTK("%pv gmfn=%"PRI_mfn"\n", v, mfn_x(gmfn));
907
0
908
0
    pg = mfn_to_page(gmfn);
909
0
910
0
    /* Guest page must be shadowed *only* as L1 and *only* once when out
911
0
     * of sync.  Also, get out now if it's already out of sync.
912
0
     * Also, can't safely unsync if some vcpus have paging disabled. */
913
0
    if ( pg->shadow_flags &
914
0
         ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync)
915
0
         || sh_page_has_multiple_shadows(pg)
916
0
         || is_pv_vcpu(v)
917
0
         || !v->domain->arch.paging.shadow.oos_active )
918
0
        return 0;
919
0
920
0
    pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write;
921
0
    oos_hash_add(v, gmfn);
922
0
    perfc_incr(shadow_unsync);
923
0
    TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_UNSYNC);
924
0
    return 1;
925
0
}
926
927
#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
928
929
930
/**************************************************************************/
931
/* Code for "promoting" a guest page to the point where the shadow code is
932
 * willing to let it be treated as a guest page table.  This generally
933
 * involves making sure there are no writable mappings available to the guest
934
 * for this page.
935
 */
936
void shadow_promote(struct domain *d, mfn_t gmfn, unsigned int type)
937
0
{
938
0
    struct page_info *page = mfn_to_page(gmfn);
939
0
940
0
    ASSERT(mfn_valid(gmfn));
941
0
942
0
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
943
0
    /* Is the page already shadowed and out of sync? */
944
0
    if ( page_is_out_of_sync(page) )
945
0
        sh_resync(d, gmfn);
946
0
#endif
947
0
948
0
    /* We should never try to promote a gmfn that has writeable mappings */
949
0
    ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page
950
0
           || (page->u.inuse.type_info & PGT_count_mask) == 0
951
0
           || d->is_shutting_down);
952
0
953
0
    /* Is the page already shadowed? */
954
0
    if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
955
0
        page->shadow_flags = 0;
956
0
957
0
    ASSERT(!test_bit(type, &page->shadow_flags));
958
0
    set_bit(type, &page->shadow_flags);
959
0
    TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PROMOTE);
960
0
}
961
962
void shadow_demote(struct domain *d, mfn_t gmfn, u32 type)
963
0
{
964
0
    struct page_info *page = mfn_to_page(gmfn);
965
0
966
0
    ASSERT(test_bit(_PGC_page_table, &page->count_info));
967
0
    ASSERT(test_bit(type, &page->shadow_flags));
968
0
969
0
    clear_bit(type, &page->shadow_flags);
970
0
971
0
    if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
972
0
    {
973
0
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
974
0
        /* Was the page out of sync? */
975
0
        if ( page_is_out_of_sync(page) )
976
0
        {
977
0
            oos_hash_remove(d, gmfn);
978
0
        }
979
0
#endif
980
0
        clear_bit(_PGC_page_table, &page->count_info);
981
0
    }
982
0
983
0
    TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_DEMOTE);
984
0
}
985
986
/**************************************************************************/
987
/* Validate a pagetable change from the guest and update the shadows.
988
 * Returns a bitmask of SHADOW_SET_* flags. */
989
990
int
991
sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size)
992
0
{
993
0
    int result = 0;
994
0
    struct page_info *page = mfn_to_page(gmfn);
995
0
996
0
    paging_mark_dirty(v->domain, gmfn);
997
0
998
0
    // Determine which types of shadows are affected, and update each.
999
0
    //
1000
0
    // Always validate L1s before L2s to prevent another cpu with a linear
1001
0
    // mapping of this gmfn from seeing a walk that results from
1002
0
    // using the new L2 value and the old L1 value.  (It is OK for such a
1003
0
    // guest to see a walk that uses the old L2 value with the new L1 value,
1004
0
    // as hardware could behave this way if one level of the pagewalk occurs
1005
0
    // before the store, and the next level of the pagewalk occurs after the
1006
0
    // store.
1007
0
    //
1008
0
    // Ditto for L2s before L3s, etc.
1009
0
    //
1010
0
1011
0
    if ( !(page->count_info & PGC_page_table) )
1012
0
        return 0;  /* Not shadowed at all */
1013
0
1014
0
    if ( page->shadow_flags & SHF_L1_32 )
1015
0
        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2)
1016
0
            (v, gmfn, entry, size);
1017
0
    if ( page->shadow_flags & SHF_L2_32 )
1018
0
        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2)
1019
0
            (v, gmfn, entry, size);
1020
0
1021
0
    if ( page->shadow_flags & SHF_L1_PAE )
1022
0
        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3)
1023
0
            (v, gmfn, entry, size);
1024
0
    if ( page->shadow_flags & SHF_L2_PAE )
1025
0
        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3)
1026
0
            (v, gmfn, entry, size);
1027
0
    if ( page->shadow_flags & SHF_L2H_PAE )
1028
0
        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3)
1029
0
            (v, gmfn, entry, size);
1030
0
1031
0
    if ( page->shadow_flags & SHF_L1_64 )
1032
0
        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4)
1033
0
            (v, gmfn, entry, size);
1034
0
    if ( page->shadow_flags & SHF_L2_64 )
1035
0
        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4)
1036
0
            (v, gmfn, entry, size);
1037
0
    if ( page->shadow_flags & SHF_L2H_64 )
1038
0
        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 4)
1039
0
            (v, gmfn, entry, size);
1040
0
    if ( page->shadow_flags & SHF_L3_64 )
1041
0
        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4)
1042
0
            (v, gmfn, entry, size);
1043
0
    if ( page->shadow_flags & SHF_L4_64 )
1044
0
        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4)
1045
0
            (v, gmfn, entry, size);
1046
0
1047
0
    this_cpu(trace_shadow_path_flags) |= (result<<(TRCE_SFLAG_SET_CHANGED));
1048
0
1049
0
    return result;
1050
0
}
1051
1052
1053
void
1054
sh_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
1055
                           void *entry, u32 size)
1056
/* This is the entry point for emulated writes to pagetables in HVM guests and
1057
 * PV translated guests.
1058
 */
1059
0
{
1060
0
    struct domain *d = v->domain;
1061
0
    int rc;
1062
0
1063
0
    ASSERT(paging_locked_by_me(v->domain));
1064
0
    rc = sh_validate_guest_entry(v, gmfn, entry, size);
1065
0
    if ( rc & SHADOW_SET_FLUSH )
1066
0
        /* Need to flush TLBs to pick up shadow PT changes */
1067
0
        flush_tlb_mask(d->domain_dirty_cpumask);
1068
0
    if ( rc & SHADOW_SET_ERROR )
1069
0
    {
1070
0
        /* This page is probably not a pagetable any more: tear it out of the
1071
0
         * shadows, along with any tables that reference it.
1072
0
         * Since the validate call above will have made a "safe" (i.e. zero)
1073
0
         * shadow entry, we can let the domain live even if we can't fully
1074
0
         * unshadow the page. */
1075
0
        sh_remove_shadows(d, gmfn, 0, 0);
1076
0
    }
1077
0
}
1078
1079
1080
/**************************************************************************/
1081
/* Memory management for shadow pages. */
1082
1083
/* Allocating shadow pages
1084
 * -----------------------
1085
 *
1086
 * Most shadow pages are allocated singly, but there is one case where
1087
 * we need to allocate multiple pages together: shadowing 32-bit guest
1088
 * tables on PAE or 64-bit shadows.  A 32-bit guest l1 table covers 4MB
1089
 * of virtual address space, and needs to be shadowed by two PAE/64-bit
1090
 * l1 tables (covering 2MB of virtual address space each).  Similarly, a
1091
 * 32-bit guest l2 table (4GB va) needs to be shadowed by four
1092
 * PAE/64-bit l2 tables (1GB va each).  These multi-page shadows are
1093
 * not contiguous in memory; functions for handling offsets into them are
1094
 * defined in shadow/multi.c (shadow_l1_index() etc.)
1095
 *
1096
 * This table shows the allocation behaviour of the different modes:
1097
 *
1098
 * Xen paging      64b  64b  64b
1099
 * Guest paging    32b  pae  64b
1100
 * PV or HVM       HVM  HVM   *
1101
 * Shadow paging   pae  pae  64b
1102
 *
1103
 * sl1 size         8k   4k   4k
1104
 * sl2 size        16k   4k   4k
1105
 * sl3 size         -    -    4k
1106
 * sl4 size         -    -    4k
1107
 *
1108
 * In HVM guests, the p2m table is built out of shadow pages, and we provide
1109
 * a function for the p2m management to steal pages, in max-order chunks, from
1110
 * the free pool.
1111
 */
1112
1113
const u8 sh_type_to_size[] = {
1114
    1, /* SH_type_none           */
1115
    2, /* SH_type_l1_32_shadow   */
1116
    2, /* SH_type_fl1_32_shadow  */
1117
    4, /* SH_type_l2_32_shadow   */
1118
    1, /* SH_type_l1_pae_shadow  */
1119
    1, /* SH_type_fl1_pae_shadow */
1120
    1, /* SH_type_l2_pae_shadow  */
1121
    1, /* SH_type_l2h_pae_shadow */
1122
    1, /* SH_type_l1_64_shadow   */
1123
    1, /* SH_type_fl1_64_shadow  */
1124
    1, /* SH_type_l2_64_shadow   */
1125
    1, /* SH_type_l2h_64_shadow  */
1126
    1, /* SH_type_l3_64_shadow   */
1127
    1, /* SH_type_l4_64_shadow   */
1128
    1, /* SH_type_p2m_table      */
1129
    1, /* SH_type_monitor_table  */
1130
    1  /* SH_type_oos_snapshot   */
1131
};
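The entries above encode the table in the preceding comment: a 32-bit guest l1 is shadowed by two 4k pages (8k of shadow) and a 32-bit guest l2 by four (16k), while PAE and 64-bit shadows are single pages. A quick arithmetic check, assuming shadow_size() (defined in private.h, not shown here) is backed by this table:

/* Illustrative only: indices follow the SH_type_* order of the array above. */
enum { T_none, T_l1_32, T_fl1_32, T_l2_32 /* , ... */ };

static const unsigned char type_to_size[] = { 1, 2, 2, 4 };

/* A 32-bit guest l1 shadow is 2 pages (8k) and a 32-bit guest l2 shadow is
 * 4 pages (16k), matching the sl1/sl2 rows of the table in the comment. */
static unsigned int shadow_bytes(unsigned int type)
{
    return type_to_size[type] * 4096u;
}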
1132
1133
/* Figure out the least acceptable quantity of shadow memory.
1134
 * The minimum memory requirement for always being able to free up a
1135
 * chunk of memory is very small -- only three max-order chunks per
1136
 * vcpu to hold the top level shadows and pages with Xen mappings in them.
1137
 *
1138
 * But for a guest to be guaranteed to successfully execute a single
1139
 * instruction, we must be able to map a large number (about thirty) VAs
1140
 * at the same time, which means that to guarantee progress, we must
1141
 * allow for more than ninety allocated pages per vcpu.  We round that
1142
 * up to 128 pages, or half a megabyte per vcpu, and add 1 more vcpu's
1143
 * worth to make sure we never return zero. */
1144
static unsigned int shadow_min_acceptable_pages(struct domain *d)
1145
0
{
1146
0
    u32 vcpu_count = 1;
1147
0
    struct vcpu *v;
1148
0
1149
0
    for_each_vcpu(d, v)
1150
0
        vcpu_count++;
1151
0
1152
0
    return (vcpu_count * 128);
1153
0
}
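A quick check of the bound computed above: vcpu_count starts at 1 and is incremented once per vcpu, so a domain with 4 vcpus gets (4 + 1) * 128 = 640 pages, i.e. 2.5 MiB of 4 KiB pages: half a megabyte per vcpu plus one extra vcpu's worth, as the comment describes.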
1154
1155
/* Dispatcher function: call the per-mode function that will unhook the
1156
 * non-Xen mappings in this top-level shadow mfn.  With user_only == 1,
1157
 * unhooks only the user-mode mappings. */
1158
void shadow_unhook_mappings(struct domain *d, mfn_t smfn, int user_only)
1159
0
{
1160
0
    struct page_info *sp = mfn_to_page(smfn);
1161
0
    switch ( sp->u.sh.type )
1162
0
    {
1163
0
    case SH_type_l2_32_shadow:
1164
0
        SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, 2)(d, smfn, user_only);
1165
0
        break;
1166
0
    case SH_type_l2_pae_shadow:
1167
0
    case SH_type_l2h_pae_shadow:
1168
0
        SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings, 3)(d, smfn, user_only);
1169
0
        break;
1170
0
    case SH_type_l4_64_shadow:
1171
0
        SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings, 4)(d, smfn, user_only);
1172
0
        break;
1173
0
    default:
1174
0
        SHADOW_ERROR("top-level shadow has bad type %08x\n", sp->u.sh.type);
1175
0
        BUG();
1176
0
    }
1177
0
}
1178
1179
static inline void trace_shadow_prealloc_unpin(struct domain *d, mfn_t smfn)
1180
0
{
1181
0
    if ( tb_init_done )
1182
0
    {
1183
0
        /* Convert smfn to gfn */
1184
0
        unsigned long gfn;
1185
0
        ASSERT(mfn_valid(smfn));
1186
0
        gfn = mfn_to_gfn(d, backpointer(mfn_to_page(smfn)));
1187
0
        __trace_var(TRC_SHADOW_PREALLOC_UNPIN, 0/*!tsc*/, sizeof(gfn), &gfn);
1188
0
    }
1189
0
}
1190
1191
/* Make sure there are at least count order-sized pages
1192
 * available in the shadow page pool. */
1193
static void _shadow_prealloc(struct domain *d, unsigned int pages)
1194
0
{
1195
0
    struct vcpu *v;
1196
0
    struct page_info *sp, *t;
1197
0
    mfn_t smfn;
1198
0
    int i;
1199
0
1200
0
    if ( d->arch.paging.shadow.free_pages >= pages ) return;
1201
0
1202
0
    /* Shouldn't have enabled shadows if we've no vcpus. */
1203
0
    ASSERT(d->vcpu && d->vcpu[0]);
1204
0
1205
0
    /* Stage one: walk the list of pinned pages, unpinning them */
1206
0
    perfc_incr(shadow_prealloc_1);
1207
0
    foreach_pinned_shadow(d, sp, t)
1208
0
    {
1209
0
        smfn = page_to_mfn(sp);
1210
0
1211
0
        /* Unpin this top-level shadow */
1212
0
        trace_shadow_prealloc_unpin(d, smfn);
1213
0
        sh_unpin(d, smfn);
1214
0
1215
0
        /* See if that freed up enough space */
1216
0
        if ( d->arch.paging.shadow.free_pages >= pages ) return;
1217
0
    }
1218
0
1219
0
    /* Stage two: all shadow pages are in use in hierarchies that are
1220
0
     * loaded in cr3 on some vcpu.  Walk them, unhooking the non-Xen
1221
0
     * mappings. */
1222
0
    perfc_incr(shadow_prealloc_2);
1223
0
1224
0
    for_each_vcpu(d, v)
1225
0
        for ( i = 0 ; i < 4 ; i++ )
1226
0
        {
1227
0
            if ( !pagetable_is_null(v->arch.shadow_table[i]) )
1228
0
            {
1229
0
                TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PREALLOC_UNHOOK);
1230
0
                shadow_unhook_mappings(d,
1231
0
                               pagetable_get_mfn(v->arch.shadow_table[i]), 0);
1232
0
1233
0
                /* See if that freed up enough space */
1234
0
                if ( d->arch.paging.shadow.free_pages >= pages )
1235
0
                {
1236
0
                    flush_tlb_mask(d->domain_dirty_cpumask);
1237
0
                    return;
1238
0
                }
1239
0
            }
1240
0
        }
1241
0
1242
0
    /* Nothing more we can do: all remaining shadows are of pages that
1243
0
     * hold Xen mappings for some vcpu.  This can never happen. */
1244
0
    SHADOW_ERROR("Can't pre-allocate %u shadow pages!\n"
1245
0
                 "  shadow pages total = %u, free = %u, p2m=%u\n",
1246
0
                 pages,
1247
0
                 d->arch.paging.shadow.total_pages,
1248
0
                 d->arch.paging.shadow.free_pages,
1249
0
                 d->arch.paging.shadow.p2m_pages);
1250
0
    BUG();
1251
0
}
1252
1253
/* Make sure there are at least count pages of the order according to
1254
 * type available in the shadow page pool.
1255
 * This must be called before any calls to shadow_alloc().  Since this
1256
 * will free existing shadows to make room, it must be called early enough
1257
 * to avoid freeing shadows that the caller is currently working on. */
1258
void shadow_prealloc(struct domain *d, u32 type, unsigned int count)
1259
0
{
1260
0
    return _shadow_prealloc(d, shadow_size(type) * count);
1261
0
}
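For reference, the intended caller-side sequence (mirroring shadow_alloc_p2m_page() later in this file) is to reserve space first so that the subsequent shadow_alloc() cannot fail:

    shadow_prealloc(d, SH_type_p2m_table, 1);                 /* may evict other shadows */
    pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));  /* now guaranteed to succeed */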
1262
1263
/* Deliberately free all the memory we can: this will tear down all of
1264
 * this domain's shadows */
1265
static void shadow_blow_tables(struct domain *d)
1266
0
{
1267
0
    struct page_info *sp, *t;
1268
0
    struct vcpu *v;
1269
0
    mfn_t smfn;
1270
0
    int i;
1271
0
1272
0
    /* Shouldn't have enabled shadows if we've no vcpus. */
1273
0
    ASSERT(d->vcpu && d->vcpu[0]);
1274
0
1275
0
    /* Pass one: unpin all pinned pages */
1276
0
    foreach_pinned_shadow(d, sp, t)
1277
0
    {
1278
0
        smfn = page_to_mfn(sp);
1279
0
        sh_unpin(d, smfn);
1280
0
    }
1281
0
1282
0
    /* Second pass: unhook entries of in-use shadows */
1283
0
    for_each_vcpu(d, v)
1284
0
        for ( i = 0 ; i < 4 ; i++ )
1285
0
            if ( !pagetable_is_null(v->arch.shadow_table[i]) )
1286
0
                shadow_unhook_mappings(d,
1287
0
                               pagetable_get_mfn(v->arch.shadow_table[i]), 0);
1288
0
1289
0
    /* Make sure everyone sees the unshadowings */
1290
0
    flush_tlb_mask(d->domain_dirty_cpumask);
1291
0
}
1292
1293
void shadow_blow_tables_per_domain(struct domain *d)
1294
0
{
1295
0
    if ( shadow_mode_enabled(d) && d->vcpu != NULL && d->vcpu[0] != NULL ) {
1296
0
        paging_lock(d);
1297
0
        shadow_blow_tables(d);
1298
0
        paging_unlock(d);
1299
0
    }
1300
0
}
1301
1302
#ifndef NDEBUG
1303
/* Blow all shadows of all shadowed domains: this can be used to cause the
1304
 * guest's pagetables to be re-shadowed if we suspect that the shadows
1305
 * have somehow got out of sync */
1306
static void shadow_blow_all_tables(unsigned char c)
1307
0
{
1308
0
    struct domain *d;
1309
0
    printk("'%c' pressed -> blowing all shadow tables\n", c);
1310
0
    rcu_read_lock(&domlist_read_lock);
1311
0
    for_each_domain(d)
1312
0
    {
1313
0
        if ( shadow_mode_enabled(d) && d->vcpu != NULL && d->vcpu[0] != NULL )
1314
0
        {
1315
0
            paging_lock(d);
1316
0
            shadow_blow_tables(d);
1317
0
            paging_unlock(d);
1318
0
        }
1319
0
    }
1320
0
    rcu_read_unlock(&domlist_read_lock);
1321
0
}
1322
1323
/* Register this function in the Xen console keypress table */
1324
static __init int shadow_blow_tables_keyhandler_init(void)
1325
1
{
1326
1
    register_keyhandler('S', shadow_blow_all_tables, "reset shadow pagetables", 1);
1327
1
    return 0;
1328
1
}
1329
__initcall(shadow_blow_tables_keyhandler_init);
1330
#endif /* !NDEBUG */
1331
1332
/* Accessors for the singly-linked list that's used for hash chains */
1333
static inline struct page_info *
1334
next_shadow(const struct page_info *sp)
1335
0
{
1336
0
    return sp->next_shadow ? pdx_to_page(sp->next_shadow) : NULL;
1337
0
}
1338
1339
static inline void
1340
set_next_shadow(struct page_info *sp, struct page_info *next)
1341
0
{
1342
0
    sp->next_shadow = next ? page_to_pdx(next) : 0;
1343
0
}
1344
1345
/* Allocate another shadow's worth of (contiguous, aligned) pages,
1346
 * and fill in the type and backpointer fields of their page_infos.
1347
 * Never fails to allocate. */
1348
mfn_t shadow_alloc(struct domain *d,
1349
                    u32 shadow_type,
1350
                    unsigned long backpointer)
1351
0
{
1352
0
    struct page_info *sp = NULL;
1353
0
    unsigned int pages = shadow_size(shadow_type);
1354
0
    struct page_list_head tmp_list;
1355
0
    cpumask_t mask;
1356
0
    unsigned int i;
1357
0
1358
0
    ASSERT(paging_locked_by_me(d));
1359
0
    ASSERT(shadow_type != SH_type_none);
1360
0
    perfc_incr(shadow_alloc);
1361
0
1362
0
    if ( d->arch.paging.shadow.free_pages < pages )
1363
0
    {
1364
0
        /* If we get here, we failed to allocate. This should never
1365
0
         * happen.  It means that we didn't call shadow_prealloc()
1366
0
         * correctly before we allocated.  We can't recover by calling
1367
0
         * prealloc here, because we might free up higher-level pages
1368
0
         * that the caller is working on. */
1369
0
        SHADOW_ERROR("Can't allocate %i shadow pages!\n", pages);
1370
0
        BUG();
1371
0
    }
1372
0
    d->arch.paging.shadow.free_pages -= pages;
1373
0
1374
0
    /* Backpointers that are MFNs need to be packed into PDXs (PFNs don't) */
1375
0
    switch (shadow_type)
1376
0
    {
1377
0
    case SH_type_fl1_32_shadow:
1378
0
    case SH_type_fl1_pae_shadow:
1379
0
    case SH_type_fl1_64_shadow:
1380
0
        break;
1381
0
    default:
1382
0
        backpointer = pfn_to_pdx(backpointer);
1383
0
        break;
1384
0
    }
1385
0
1386
0
    INIT_PAGE_LIST_HEAD(&tmp_list);
1387
0
1388
0
    /* Init page info fields and clear the pages */
1389
0
    for ( i = 0; i < pages ; i++ )
1390
0
    {
1391
0
        sp = page_list_remove_head(&d->arch.paging.shadow.freelist);
1392
0
        /* Before we overwrite the old contents of this page,
1393
0
         * we need to be sure that no TLB holds a pointer to it. */
1394
0
        cpumask_copy(&mask, d->domain_dirty_cpumask);
1395
0
        tlbflush_filter(&mask, sp->tlbflush_timestamp);
1396
0
        if ( unlikely(!cpumask_empty(&mask)) )
1397
0
        {
1398
0
            perfc_incr(shadow_alloc_tlbflush);
1399
0
            flush_tlb_mask(&mask);
1400
0
        }
1401
0
        /* Now safe to clear the page for reuse */
1402
0
        clear_domain_page(page_to_mfn(sp));
1403
0
        INIT_PAGE_LIST_ENTRY(&sp->list);
1404
0
        page_list_add(sp, &tmp_list);
1405
0
        sp->u.sh.type = shadow_type;
1406
0
        sp->u.sh.pinned = 0;
1407
0
        sp->u.sh.count = 0;
1408
0
        sp->u.sh.head = 0;
1409
0
        sp->v.sh.back = backpointer;
1410
0
        set_next_shadow(sp, NULL);
1411
0
        perfc_incr(shadow_alloc_count);
1412
0
    }
1413
0
    if ( shadow_type >= SH_type_min_shadow
1414
0
         && shadow_type <= SH_type_max_shadow )
1415
0
        sp->u.sh.head = 1;
1416
0
1417
0
    sh_terminate_list(&tmp_list);
1418
0
1419
0
    return page_to_mfn(sp);
1420
0
}
1421
1422
1423
/* Return some shadow pages to the pool. */
1424
void shadow_free(struct domain *d, mfn_t smfn)
1425
0
{
1426
0
    struct page_info *next = NULL, *sp = mfn_to_page(smfn);
1427
0
    struct page_list_head *pin_list;
1428
0
    unsigned int pages;
1429
0
    u32 shadow_type;
1430
0
    int i;
1431
0
1432
0
    ASSERT(paging_locked_by_me(d));
1433
0
    perfc_incr(shadow_free);
1434
0
1435
0
    shadow_type = sp->u.sh.type;
1436
0
    ASSERT(shadow_type != SH_type_none);
1437
0
    ASSERT(sp->u.sh.head || (shadow_type > SH_type_max_shadow));
1438
0
    pages = shadow_size(shadow_type);
1439
0
    pin_list = &d->arch.paging.shadow.pinned_shadows;
1440
0
1441
0
    for ( i = 0; i < pages; i++ )
1442
0
    {
1443
0
#if SHADOW_OPTIMIZATIONS & (SHOPT_WRITABLE_HEURISTIC | SHOPT_FAST_EMULATION)
1444
0
        struct vcpu *v;
1445
0
        for_each_vcpu(d, v)
1446
0
        {
1447
0
#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1448
0
            /* No longer safe to look for a writeable mapping in this shadow */
1449
0
            if ( v->arch.paging.shadow.last_writeable_pte_smfn
1450
0
                 == mfn_x(page_to_mfn(sp)) )
1451
0
                v->arch.paging.shadow.last_writeable_pte_smfn = 0;
1452
0
#endif
1453
0
#if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
1454
0
            v->arch.paging.last_write_emul_ok = 0;
1455
0
#endif
1456
0
        }
1457
0
#endif
1458
0
        /* Get the next page before we overwrite the list header */
1459
0
        if ( i < pages - 1 )
1460
0
            next = page_list_next(sp, pin_list);
1461
0
        /* Strip out the type: this is now a free shadow page */
1462
0
        sp->u.sh.type = sp->u.sh.head = 0;
1463
0
        /* Remember the TLB timestamp so we will know whether to flush
1464
0
         * TLBs when we reuse the page.  Because the destructors leave the
1465
0
         * contents of the pages in place, we can delay TLB flushes until
1466
0
         * just before the allocator hands the page out again. */
1467
0
        page_set_tlbflush_timestamp(sp);
1468
0
        perfc_decr(shadow_alloc_count);
1469
0
        page_list_add_tail(sp, &d->arch.paging.shadow.freelist);
1470
0
        sp = next;
1471
0
    }
1472
0
1473
0
    d->arch.paging.shadow.free_pages += pages;
1474
0
}
1475
1476
/* Divert a page from the pool to be used by the p2m mapping.
1477
 * This action is irreversible: the p2m mapping only ever grows.
1478
 * That's OK because the p2m table only exists for translated domains,
1479
 * and those domains can't ever turn off shadow mode. */
1480
static struct page_info *
1481
shadow_alloc_p2m_page(struct domain *d)
1482
0
{
1483
0
    struct page_info *pg;
1484
0
1485
0
    /* This is called both from the p2m code (which never holds the
1486
0
     * paging lock) and the log-dirty code (which always does). */
1487
0
    paging_lock_recursive(d);
1488
0
1489
0
    if ( d->arch.paging.shadow.total_pages
1490
0
         < shadow_min_acceptable_pages(d) + 1 )
1491
0
    {
1492
0
        if ( !d->arch.paging.p2m_alloc_failed )
1493
0
        {
1494
0
            d->arch.paging.p2m_alloc_failed = 1;
1495
0
            dprintk(XENLOG_ERR, "d%i failed to allocate from shadow pool\n",
1496
0
                    d->domain_id);
1497
0
        }
1498
0
        paging_unlock(d);
1499
0
        return NULL;
1500
0
    }
1501
0
1502
0
    shadow_prealloc(d, SH_type_p2m_table, 1);
1503
0
    pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
1504
0
    d->arch.paging.shadow.p2m_pages++;
1505
0
    d->arch.paging.shadow.total_pages--;
1506
0
1507
0
    paging_unlock(d);
1508
0
1509
0
    /* Unlike shadow pages, mark p2m pages as owned by the domain.
1510
0
     * Marking the domain as the owner would normally allow the guest to
1511
0
     * create mappings of these pages, but these p2m pages will never be
1512
0
     * in the domain's guest-physical address space, and so that is not
1513
0
     * believed to be a concern. */
1514
0
    page_set_owner(pg, d);
1515
0
    pg->count_info |= 1;
1516
0
    return pg;
1517
0
}
1518
1519
static void
1520
shadow_free_p2m_page(struct domain *d, struct page_info *pg)
1521
0
{
1522
0
    ASSERT(page_get_owner(pg) == d);
1523
0
    /* Should have just the one ref we gave it in alloc_p2m_page() */
1524
0
    if ( (pg->count_info & PGC_count_mask) != 1 )
1525
0
    {
1526
0
        SHADOW_ERROR("Odd p2m page count c=%#lx t=%"PRtype_info"\n",
1527
0
                     pg->count_info, pg->u.inuse.type_info);
1528
0
    }
1529
0
    pg->count_info &= ~PGC_count_mask;
1530
0
    pg->u.sh.type = SH_type_p2m_table; /* p2m code reuses type-info */
1531
0
    page_set_owner(pg, NULL);
1532
0
1533
0
    /* This is called both from the p2m code (which never holds the
1534
0
     * paging lock) and the log-dirty code (which always does). */
1535
0
    paging_lock_recursive(d);
1536
0
1537
0
    shadow_free(d, page_to_mfn(pg));
1538
0
    d->arch.paging.shadow.p2m_pages--;
1539
0
    d->arch.paging.shadow.total_pages++;
1540
0
1541
0
    paging_unlock(d);
1542
0
}
1543
1544
int shadow_set_allocation(struct domain *d, unsigned int pages, bool *preempted)
1545
0
{
1546
0
    struct page_info *sp;
1547
0
    unsigned int lower_bound;
1548
0
1549
0
    ASSERT(paging_locked_by_me(d));
1550
0
1551
0
    if ( pages > 0 )
1552
0
    {
1553
0
        /* Check for minimum value. */
1554
0
        if ( pages < d->arch.paging.shadow.p2m_pages )
1555
0
            pages = 0;
1556
0
        else
1557
0
            pages -= d->arch.paging.shadow.p2m_pages;
1558
0
1559
0
        /* Don't allocate less than the minimum acceptable, plus one page per
1560
0
         * megabyte of RAM (for the p2m table) */
1561
0
        lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
1562
0
        if ( pages < lower_bound )
1563
0
            pages = lower_bound;
1564
0
    }
1565
0
1566
0
    SHADOW_PRINTK("current %i target %i\n",
1567
0
                   d->arch.paging.shadow.total_pages, pages);
1568
0
1569
0
    for ( ; ; )
1570
0
    {
1571
0
        if ( d->arch.paging.shadow.total_pages < pages )
1572
0
        {
1573
0
            /* Need to allocate more memory from domheap */
1574
0
            sp = (struct page_info *)
1575
0
                alloc_domheap_page(d, MEMF_no_owner);
1576
0
            if ( sp == NULL )
1577
0
            {
1578
0
                SHADOW_PRINTK("failed to allocate shadow pages.\n");
1579
0
                return -ENOMEM;
1580
0
            }
1581
0
            d->arch.paging.shadow.free_pages++;
1582
0
            d->arch.paging.shadow.total_pages++;
1583
0
            sp->u.sh.type = 0;
1584
0
            sp->u.sh.pinned = 0;
1585
0
            sp->u.sh.count = 0;
1586
0
            sp->tlbflush_timestamp = 0; /* Not in any TLB */
1587
0
            page_list_add_tail(sp, &d->arch.paging.shadow.freelist);
1588
0
        }
1589
0
        else if ( d->arch.paging.shadow.total_pages > pages )
1590
0
        {
1591
0
            /* Need to return memory to domheap */
1592
0
            _shadow_prealloc(d, 1);
1593
0
            sp = page_list_remove_head(&d->arch.paging.shadow.freelist);
1594
0
            ASSERT(sp);
1595
0
            /*
1596
0
             * The pages were allocated anonymously, but the owner field
1597
0
             * gets overwritten normally, so need to clear it here.
1598
0
             */
1599
0
            page_set_owner(sp, NULL);
1600
0
            d->arch.paging.shadow.free_pages--;
1601
0
            d->arch.paging.shadow.total_pages--;
1602
0
            free_domheap_page(sp);
1603
0
        }
1604
0
        else
1605
0
            break;
1606
0
1607
0
        /* Check to see if we need to yield and try again */
1608
0
        if ( preempted && general_preempt_check() )
1609
0
        {
1610
0
            *preempted = true;
1611
0
            return 0;
1612
0
        }
1613
0
    }
1614
0
1615
0
    return 0;
1616
0
}
1617
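/*
 * Note on the lower bound used in shadow_set_allocation() above
 * (illustrative figures, assuming 4 KiB pages so 256 pages == 1 MB):
 * d->tot_pages / 256 reserves roughly one shadow page per MB of guest
 * RAM for the p2m table.  A 2 GB guest (524288 pages) would therefore
 * raise the floor by 524288 / 256 == 2048 pages (8 MB) on top of
 * shadow_min_acceptable_pages(d).
 */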
1618
/* Return the size of the shadow pool, rounded up to the nearest MB */
1619
static unsigned int shadow_get_allocation(struct domain *d)
1620
0
{
1621
0
    unsigned int pg = d->arch.paging.shadow.total_pages
1622
0
        + d->arch.paging.shadow.p2m_pages;
1623
0
    return ((pg >> (20 - PAGE_SHIFT))
1624
0
            + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
1625
0
}
1626
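/*
 * Worked example of the rounding in shadow_get_allocation() above
 * (illustrative figures, assuming 4 KiB pages, i.e. PAGE_SHIFT == 12,
 * so 1 MB == 1 << (20 - 12) == 256 pages):
 *
 *     pg = 256  ->  (256 >> 8) + 0  ==  1 MB
 *     pg = 300  ->  (300 >> 8) + 1  ==  2 MB   (44 leftover pages round up)
 */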
1627
/**************************************************************************/
1628
/* Handling guest writes to pagetables. */
1629
1630
/*
1631
 * Translate a VA to an MFN, injecting a page-fault if we fail.  If the
1632
 * mapping succeeds, a reference will be held on the underlying page.
1633
 */
1634
0
#define BAD_GVA_TO_GFN (~0UL)
1635
0
#define BAD_GFN_TO_MFN (~1UL)
1636
0
#define READONLY_GFN   (~2UL)
1637
static mfn_t emulate_gva_to_mfn(struct vcpu *v, unsigned long vaddr,
1638
                                struct sh_emulate_ctxt *sh_ctxt)
1639
0
{
1640
0
    unsigned long gfn;
1641
0
    struct page_info *page;
1642
0
    mfn_t mfn;
1643
0
    p2m_type_t p2mt;
1644
0
    uint32_t pfec = PFEC_page_present | PFEC_write_access;
1645
0
1646
0
    /* Translate the VA to a GFN. */
1647
0
    gfn = paging_get_hostmode(v)->gva_to_gfn(v, NULL, vaddr, &pfec);
1648
0
    if ( gfn == gfn_x(INVALID_GFN) )
1649
0
    {
1650
0
        x86_emul_pagefault(pfec, vaddr, &sh_ctxt->ctxt);
1651
0
1652
0
        return _mfn(BAD_GVA_TO_GFN);
1653
0
    }
1654
0
1655
0
    /* Translate the GFN to an MFN. */
1656
0
    ASSERT(!paging_locked_by_me(v->domain));
1657
0
1658
0
    page = get_page_from_gfn(v->domain, gfn, &p2mt, P2M_ALLOC);
1659
0
1660
0
    /* Sanity checking. */
1661
0
    if ( page == NULL )
1662
0
    {
1663
0
        return _mfn(BAD_GFN_TO_MFN);
1664
0
    }
1665
0
    if ( p2m_is_discard_write(p2mt) )
1666
0
    {
1667
0
        put_page(page);
1668
0
        return _mfn(READONLY_GFN);
1669
0
    }
1670
0
    if ( !p2m_is_ram(p2mt) )
1671
0
    {
1672
0
        put_page(page);
1673
0
        return _mfn(BAD_GFN_TO_MFN);
1674
0
    }
1675
0
    mfn = page_to_mfn(page);
1676
0
    ASSERT(mfn_valid(mfn));
1677
0
1678
0
    v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
1679
0
1680
0
    return mfn;
1681
0
}
1682
1683
/*
1684
 * Check that the user is allowed to perform this write.  If a mapping is
1685
 * returned, page references will be held on sh_ctxt->mfn[0] and
1686
 * sh_ctxt->mfn[1] iff !INVALID_MFN.
1687
 */
1688
void *sh_emulate_map_dest(struct vcpu *v, unsigned long vaddr,
1689
                          unsigned int bytes,
1690
                          struct sh_emulate_ctxt *sh_ctxt)
1691
0
{
1692
0
    struct domain *d = v->domain;
1693
0
    void *map;
1694
0
1695
0
#ifndef NDEBUG
1696
0
    /* We don't emulate user-mode writes to page tables. */
1697
0
    if ( is_hvm_domain(d) ? hvm_get_cpl(v) == 3
1698
0
                          : !guest_kernel_mode(v, guest_cpu_user_regs()) )
1699
0
    {
1700
0
        gdprintk(XENLOG_DEBUG, "User-mode write to pagetable reached "
1701
0
                 "emulate_map_dest(). This should never happen!\n");
1702
0
        return MAPPING_UNHANDLEABLE;
1703
0
    }
1704
0
#endif
1705
0
1706
0
    sh_ctxt->mfn[0] = emulate_gva_to_mfn(v, vaddr, sh_ctxt);
1707
0
    if ( !mfn_valid(sh_ctxt->mfn[0]) )
1708
0
    {
1709
0
        switch ( mfn_x(sh_ctxt->mfn[0]) )
1710
0
        {
1711
0
        case BAD_GVA_TO_GFN: return MAPPING_EXCEPTION;
1712
0
        case READONLY_GFN:   return MAPPING_SILENT_FAIL;
1713
0
        default:             return MAPPING_UNHANDLEABLE;
1714
0
        }
1715
0
    }
1716
0
1717
0
    /* Unaligned writes mean probably this isn't a pagetable. */
1718
0
    if ( vaddr & (bytes - 1) )
1719
0
        sh_remove_shadows(d, sh_ctxt->mfn[0], 0, 0 /* Slow, can fail. */ );
1720
0
1721
0
    if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) )
1722
0
    {
1723
0
        /* Whole write fits on a single page. */
1724
0
        sh_ctxt->mfn[1] = INVALID_MFN;
1725
0
        map = map_domain_page(sh_ctxt->mfn[0]) + (vaddr & ~PAGE_MASK);
1726
0
    }
1727
0
    else if ( !is_hvm_domain(d) )
1728
0
    {
1729
0
        /*
1730
0
         * Cross-page emulated writes are only supported for HVM guests;
1731
0
         * PV guests ought to know better.
1732
0
         */
1733
0
        put_page(mfn_to_page(sh_ctxt->mfn[0]));
1734
0
        return MAPPING_UNHANDLEABLE;
1735
0
    }
1736
0
    else
1737
0
    {
1738
0
        /* This write crosses a page boundary. Translate the second page. */
1739
0
        sh_ctxt->mfn[1] = emulate_gva_to_mfn(
1740
0
            v, (vaddr + bytes - 1) & PAGE_MASK, sh_ctxt);
1741
0
        if ( !mfn_valid(sh_ctxt->mfn[1]) )
1742
0
        {
1743
0
            put_page(mfn_to_page(sh_ctxt->mfn[0]));
1744
0
            switch ( mfn_x(sh_ctxt->mfn[1]) )
1745
0
            {
1746
0
            case BAD_GVA_TO_GFN: return MAPPING_EXCEPTION;
1747
0
            case READONLY_GFN:   return MAPPING_SILENT_FAIL;
1748
0
            default:             return MAPPING_UNHANDLEABLE;
1749
0
            }
1750
0
        }
1751
0
1752
0
        /* Cross-page writes mean probably not a pagetable. */
1753
0
        sh_remove_shadows(d, sh_ctxt->mfn[1], 0, 0 /* Slow, can fail. */ );
1754
0
1755
0
        map = vmap(sh_ctxt->mfn, 2);
1756
0
        if ( !map )
1757
0
        {
1758
0
            put_page(mfn_to_page(sh_ctxt->mfn[0]));
1759
0
            put_page(mfn_to_page(sh_ctxt->mfn[1]));
1760
0
            return MAPPING_UNHANDLEABLE;
1761
0
        }
1762
0
        map += (vaddr & ~PAGE_MASK);
1763
0
    }
1764
0
1765
0
#if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
1766
0
    /*
1767
0
     * Remember if the bottom bit was clear, so we can choose not to run
1768
0
     * the change through the verify code if it's still clear afterwards.
1769
0
     */
1770
0
    sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT);
1771
0
#endif
1772
0
1773
0
    return map;
1774
0
}
1775
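/*
 * Sketch of the intended calling pattern for sh_emulate_map_dest() /
 * sh_emulate_unmap_dest() (illustrative only: example_emulated_write is
 * not part of this file, and locking and error propagation are
 * simplified relative to the real callers in the per-guest-level code):
 */
static int example_emulated_write(struct vcpu *v, unsigned long vaddr,
                                  const void *src, unsigned int bytes,
                                  struct sh_emulate_ctxt *sh_ctxt)
{
    void *addr = sh_emulate_map_dest(v, vaddr, bytes, sh_ctxt);

    /* On failure the return value is one of the MAPPING_* sentinels
     * rather than a usable mapping. */
    if ( addr == MAPPING_EXCEPTION || addr == MAPPING_SILENT_FAIL ||
         addr == MAPPING_UNHANDLEABLE )
        return -1;                       /* error handling simplified */

    paging_lock(v->domain);
    memcpy(addr, src, bytes);            /* perform the guest's write */
    sh_emulate_unmap_dest(v, addr, bytes, sh_ctxt); /* verify + dirty */
    paging_unlock(v->domain);

    return 0;
}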
1776
/*
1777
 * Tidy up after the emulated write: mark pages dirty, verify the new
1778
 * contents, and undo the mapping.
1779
 */
1780
void sh_emulate_unmap_dest(struct vcpu *v, void *addr, unsigned int bytes,
1781
                           struct sh_emulate_ctxt *sh_ctxt)
1782
0
{
1783
0
    u32 b1 = bytes, b2 = 0, shflags;
1784
0
1785
0
    /*
1786
0
     * We can avoid re-verifying the page contents after the write if:
1787
0
     *  - it was no larger than the PTE type of this pagetable;
1788
0
     *  - it was aligned to the PTE boundaries; and
1789
0
     *  - _PAGE_PRESENT was clear before and after the write.
1790
0
     */
1791
0
    shflags = mfn_to_page(sh_ctxt->mfn[0])->shadow_flags;
1792
0
#if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
1793
0
    if ( sh_ctxt->low_bit_was_clear
1794
0
         && !(*(u8 *)addr & _PAGE_PRESENT)
1795
0
         && ((!(shflags & SHF_32)
1796
0
              /*
1797
0
               * Not shadowed 32-bit: aligned 64-bit writes that leave
1798
0
               * the present bit unset are safe to ignore.
1799
0
               */
1800
0
              && ((unsigned long)addr & 7) == 0
1801
0
              && bytes <= 8)
1802
0
             ||
1803
0
             (!(shflags & (SHF_PAE|SHF_64))
1804
0
              /*
1805
0
               * Not shadowed PAE/64-bit: aligned 32-bit writes that
1806
0
               * leave the present bit unset are safe to ignore.
1807
0
               */
1808
0
              && ((unsigned long)addr & 3) == 0
1809
0
              && bytes <= 4)) )
1810
0
    {
1811
0
        /* Writes with this alignment constraint can't possibly cross pages. */
1812
0
        ASSERT(!mfn_valid(sh_ctxt->mfn[1]));
1813
0
    }
1814
0
    else
1815
0
#endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */
1816
0
    {
1817
0
        if ( unlikely(mfn_valid(sh_ctxt->mfn[1])) )
1818
0
        {
1819
0
            /* Validate as two writes, one to each page. */
1820
0
            b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK);
1821
0
            b2 = bytes - b1;
1822
0
            ASSERT(b2 < bytes);
1823
0
        }
1824
0
        if ( likely(b1 > 0) )
1825
0
            sh_validate_guest_pt_write(v, sh_ctxt->mfn[0], addr, b1);
1826
0
        if ( unlikely(b2 > 0) )
1827
0
            sh_validate_guest_pt_write(v, sh_ctxt->mfn[1], addr + b1, b2);
1828
0
    }
1829
0
1830
0
    paging_mark_dirty(v->domain, sh_ctxt->mfn[0]);
1831
0
    put_page(mfn_to_page(sh_ctxt->mfn[0]));
1832
0
1833
0
    if ( unlikely(mfn_valid(sh_ctxt->mfn[1])) )
1834
0
    {
1835
0
        paging_mark_dirty(v->domain, sh_ctxt->mfn[1]);
1836
0
        put_page(mfn_to_page(sh_ctxt->mfn[1]));
1837
0
        vunmap((void *)((unsigned long)addr & PAGE_MASK));
1838
0
    }
1839
0
    else
1840
0
        unmap_domain_page(addr);
1841
0
1842
0
    atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
1843
0
}
1844
1845
/**************************************************************************/
1846
/* Hash table for storing the guest->shadow mappings.
1847
 * The table itself is an array of pointers to shadows; the shadows are then
1848
 * threaded on a singly-linked list of shadows with the same hash value */
1849
1850
0
#define SHADOW_HASH_BUCKETS 251
1851
/* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
1852
1853
/* Hash function that takes a gfn or mfn, plus another byte of type info */
1854
typedef u32 key_t;
1855
static inline key_t sh_hash(unsigned long n, unsigned int t)
1856
0
{
1857
0
    unsigned char *p = (unsigned char *)&n;
1858
0
    key_t k = t;
1859
0
    int i;
1860
0
    for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
1861
0
    return k % SHADOW_HASH_BUCKETS;
1862
0
}
1863
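/*
 * Usage sketch for sh_hash() above (illustrative, not taken verbatim
 * from this file): the lookup/insert/delete routines below key the
 * table on a shadow's backpointer plus its type, e.g. the bucket for
 * an l1 64-bit shadow of gmfn would be
 *
 *     key_t key = sh_hash(mfn_x(gmfn), SH_type_l1_64_shadow);
 *
 * The fold is the classic sdbm hash: (k<<6) + (k<<16) - k == 65599*k,
 * so each step computes k = byte + 65599*k, and the final reduction
 * modulo the prime 251 spreads keys across every bucket.
 */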
1864
#if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
1865
1866
/* Before we get to the mechanism, define a pair of audit functions
1867
 * that sanity-check the contents of the hash table. */
1868
static void sh_hash_audit_bucket(struct domain *d, int bucket)
1869
/* Audit one bucket of the hash table */
1870
0
{
1871
0
    struct page_info *sp, *x;
1872
0
1873
0
    if ( !(SHADOW_AUDIT_ENABLE) )
1874
0
        return;
1875
0
1876
0
    sp = d->arch.paging.shadow.hash_table[bucket];
1877
0
    while ( sp )
1878
0
    {
1879
0
        /* Not a shadow? */
1880
0
        BUG_ON( (sp->count_info & PGC_count_mask )!= 0 ) ;
1881
0
        /* Bogus type? */
1882
0
        BUG_ON( sp->u.sh.type == 0 );
1883
0
        BUG_ON( sp->u.sh.type > SH_type_max_shadow );
1884
0
        /* Wrong page of a multi-page shadow? */
1885
0
        BUG_ON( !sp->u.sh.head );
1886
0
        /* Wrong bucket? */
1887
0
        BUG_ON( sh_hash(__backpointer(sp), sp->u.sh.type) != bucket );
1888
0
        /* Duplicate entry? */
1889
0
        for ( x = next_shadow(sp); x; x = next_shadow(x) )
1890
0
            BUG_ON( x->v.sh.back == sp->v.sh.back &&
1891
0
                    x->u.sh.type == sp->u.sh.type );
1892
0
        /* Follow the backpointer to the guest pagetable */
1893
0
        if ( sp->u.sh.type != SH_type_fl1_32_shadow
1894
0
             && sp->u.sh.type != SH_type_fl1_pae_shadow
1895
0
             && sp->u.sh.type != SH_type_fl1_64_shadow )
1896
0
        {
1897
0
            struct page_info *gpg = mfn_to_page(backpointer(sp));
1898
0
            /* Bad shadow flags on guest page? */
1899
0
            BUG_ON( !(gpg->shadow_flags & (1<<sp->u.sh.type)) );
1900
0
            /* Bad type count on guest page? */
1901
0
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1902
0
            if ( sp->u.sh.type == SH_type_l1_32_shadow
1903
0
                 || sp->u.sh.type == SH_type_l1_pae_shadow
1904
0
                 || sp->u.sh.type == SH_type_l1_64_shadow )
1905
0
            {
1906
0
                if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1907
0
                     && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1908
0
                {
1909
0
                    if ( !page_is_out_of_sync(gpg) )
1910
0
                    {
1911
0
                        SHADOW_ERROR("MFN %#"PRI_mfn" shadowed (by %#"PRI_mfn")"
1912
0
                                     " and not OOS but has typecount %#lx\n",
1913
0
                                     __backpointer(sp),
1914
0
                                     mfn_x(page_to_mfn(sp)),
1915
0
                                     gpg->u.inuse.type_info);
1916
0
                        BUG();
1917
0
                    }
1918
0
                }
1919
0
            }
1920
0
            else /* Not an l1 */
1921
0
#endif
1922
0
            if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1923
0
                 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1924
0
            {
1925
0
                SHADOW_ERROR("MFN %#"PRI_mfn" shadowed (by %#"PRI_mfn")"
1926
0
                             " but has typecount %#lx\n",
1927
0
                             __backpointer(sp), mfn_x(page_to_mfn(sp)),
1928
0
                             gpg->u.inuse.type_info);
1929
0
                BUG();
1930
0
            }
1931
0
        }
1932
0
        /* That entry was OK; on we go */
1933
0
        sp = next_shadow(sp);
1934
0
    }
1935
0
}
1936
1937
#else
1938
#define sh_hash_audit_bucket(_d, _b) do {} while(0)
1939
#endif /* Hashtable bucket audit */
1940
1941
1942
#if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
1943
1944
static void sh_hash_audit(struct domain *d)
1945
/* Full audit: audit every bucket in the table */
1946
{
1947
    int i;
1948
1949
    if ( !(SHADOW_AUDIT_ENABLE) )
1950
        return;
1951
1952
    for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1953
    {
1954
        sh_hash_audit_bucket(d, i);
1955
    }
1956
}
1957
1958
#else
1959
0
#define sh_hash_audit(_d) do {} while(0)
1960
#endif /* Hashtable bucket audit */
1961
1962
/* Allocate and initialise the table itself.
1963
 * Returns 0 for success, 1 for error. */
1964
static int shadow_hash_alloc(struct domain *d)
1965
0
{
1966
0
    struct page_info **table;
1967
0
1968
0
    ASSERT(paging_locked_by_me(d));
1969
0
    ASSERT(!d->arch.paging.shadow.hash_table);
1970
0
1971
0
    table = xzalloc_array(struct page_info *, SHADOW_HASH_BUCKETS);
1972
0
    if ( !table ) return 1;
1973
0
    d->arch.paging.shadow.hash_table = table;
1974
0
    return 0;
1975
0
}
1976
1977
/* Tear down the hash table and return all memory to Xen.
1978
 * This function does not care whether the table is populated. */
1979
static void shadow_hash_teardown(struct domain *d)
1980
0
{
1981
0
    ASSERT(paging_locked_by_me(d));
1982
0
    ASSERT(d->arch.paging.shadow.hash_table);
1983
0
1984
0
    xfree(d->arch.paging.shadow.hash_table);
1985
0
    d->arch.paging.shadow.hash_table = NULL;
1986
0
}
1987
1988
1989
mfn_t shadow_hash_lookup(struct domain *d, unsigned long n, unsigned int t)
1990
/* Find an entry in the hash table.  Returns the MFN of the shadow,
1991
 * or INVALID_MFN if it doesn't exist */
1992
0
{
1993
0
    struct page_info *sp, *prev;
1994
0
    key_t key;
1995
0
1996
0
    ASSERT(paging_locked_by_me(d));
1997
0
    ASSERT(d->arch.paging.shadow.hash_table);
1998
0
    ASSERT(t);
1999
0
2000
0
    sh_hash_audit(d);
2001
0
2002
0
    perfc_incr(shadow_hash_lookups);
2003
0
    key = sh_hash(n, t);
2004
0
    sh_hash_audit_bucket(d, key);
2005
0
2006
0
    sp = d->arch.paging.shadow.hash_table[key];
2007
0
    prev = NULL;
2008
0
    while(sp)
2009
0
    {
2010
0
        if ( __backpointer(sp) == n && sp->u.sh.type == t )
2011
0
        {
2012
0
            /* Pull-to-front if 'sp' isn't already the head item */
2013
0
            if ( unlikely(sp != d->arch.paging.shadow.hash_table[key]) )
2014
0
            {
2015
0
                if ( unlikely(d->arch.paging.shadow.hash_walking != 0) )
2016
0
                    /* Can't reorder: someone is walking the hash chains */
2017
0
                    return page_to_mfn(sp);
2018
0
                else
2019
0
                {
2020
0
                    ASSERT(prev);
2021
0
                    /* Delete sp from the list */
2022
0
                    prev->next_shadow = sp->next_shadow;
2023
0
                    /* Re-insert it at the head of the list */
2024
0
                    set_next_shadow(sp, d->arch.paging.shadow.hash_table[key]);
2025
0
                    d->arch.paging.shadow.hash_table[key] = sp;
2026
0
                }
2027
0
            }
2028
0
            else
2029
0
            {
2030
0
                perfc_incr(shadow_hash_lookup_head);
2031
0
            }
2032
0
            return page_to_mfn(sp);
2033
0
        }
2034
0
        prev = sp;
2035
0
        sp = next_shadow(sp);
2036
0
    }
2037
0
2038
0
    perfc_incr(shadow_hash_lookup_miss);
2039
0
    return INVALID_MFN;
2040
0
}
2041
2042
void shadow_hash_insert(struct domain *d, unsigned long n, unsigned int t,
2043
                        mfn_t smfn)
2044
/* Put a mapping (n,t)->smfn into the hash table */
2045
0
{
2046
0
    struct page_info *sp;
2047
0
    key_t key;
2048
0
2049
0
    ASSERT(paging_locked_by_me(d));
2050
0
    ASSERT(d->arch.paging.shadow.hash_table);
2051
0
    ASSERT(t);
2052
0
2053
0
    sh_hash_audit(d);
2054
0
2055
0
    perfc_incr(shadow_hash_inserts);
2056
0
    key = sh_hash(n, t);
2057
0
    sh_hash_audit_bucket(d, key);
2058
0
2059
0
    /* Insert this shadow at the top of the bucket */
2060
0
    sp = mfn_to_page(smfn);
2061
0
    set_next_shadow(sp, d->arch.paging.shadow.hash_table[key]);
2062
0
    d->arch.paging.shadow.hash_table[key] = sp;
2063
0
2064
0
    sh_hash_audit_bucket(d, key);
2065
0
}
2066
2067
void shadow_hash_delete(struct domain *d, unsigned long n, unsigned int t,
2068
                        mfn_t smfn)
2069
/* Excise the mapping (n,t)->smfn from the hash table */
2070
0
{
2071
0
    struct page_info *sp, *x;
2072
0
    key_t key;
2073
0
2074
0
    ASSERT(paging_locked_by_me(d));
2075
0
    ASSERT(d->arch.paging.shadow.hash_table);
2076
0
    ASSERT(t);
2077
0
2078
0
    sh_hash_audit(d);
2079
0
2080
0
    perfc_incr(shadow_hash_deletes);
2081
0
    key = sh_hash(n, t);
2082
0
    sh_hash_audit_bucket(d, key);
2083
0
2084
0
    sp = mfn_to_page(smfn);
2085
0
    if ( d->arch.paging.shadow.hash_table[key] == sp )
2086
0
        /* Easy case: we're deleting the head item. */
2087
0
        d->arch.paging.shadow.hash_table[key] = next_shadow(sp);
2088
0
    else
2089
0
    {
2090
0
        /* Need to search for the one we want */
2091
0
        x = d->arch.paging.shadow.hash_table[key];
2092
0
        while ( 1 )
2093
0
        {
2094
0
            ASSERT(x); /* We can't have hit the end, since our target is
2095
0
                        * still in the chain somewhere... */
2096
0
            if ( next_shadow(x) == sp )
2097
0
            {
2098
0
                x->next_shadow = sp->next_shadow;
2099
0
                break;
2100
0
            }
2101
0
            x = next_shadow(x);
2102
0
        }
2103
0
    }
2104
0
    set_next_shadow(sp, NULL);
2105
0
2106
0
    sh_hash_audit_bucket(d, key);
2107
0
}
2108
2109
typedef int (*hash_vcpu_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
2110
typedef int (*hash_domain_callback_t)(struct domain *d, mfn_t smfn, mfn_t other_mfn);
2111
2112
static void hash_vcpu_foreach(struct vcpu *v, unsigned int callback_mask,
2113
                              const hash_vcpu_callback_t callbacks[],
2114
                              mfn_t callback_mfn)
2115
/* Walk the hash table looking at the types of the entries and
2116
 * calling the appropriate callback function for each entry.
2117
 * The mask determines which shadow types we call back for, and the array
2118
 * of callbacks tells us which function to call.
2119
 * Any callback may return non-zero to let us skip the rest of the scan.
2120
 *
2121
 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
2122
 * then return non-zero to terminate the scan. */
2123
0
{
2124
0
    int i, done = 0;
2125
0
    struct domain *d = v->domain;
2126
0
    struct page_info *x;
2127
0
2128
0
    ASSERT(paging_locked_by_me(d));
2129
0
2130
0
    /* Can be called via p2m code &c after shadow teardown. */
2131
0
    if ( unlikely(!d->arch.paging.shadow.hash_table) )
2132
0
        return;
2133
0
2134
0
    /* Say we're here, to stop hash-lookups reordering the chains */
2135
0
    ASSERT(d->arch.paging.shadow.hash_walking == 0);
2136
0
    d->arch.paging.shadow.hash_walking = 1;
2137
0
2138
0
    for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
2139
0
    {
2140
0
        /* WARNING: This is not safe against changes to the hash table.
2141
0
         * The callback *must* return non-zero if it has inserted or
2142
0
         * deleted anything from the hash (lookups are OK, though). */
2143
0
        for ( x = d->arch.paging.shadow.hash_table[i]; x; x = next_shadow(x) )
2144
0
        {
2145
0
            if ( callback_mask & (1 << x->u.sh.type) )
2146
0
            {
2147
0
                ASSERT(x->u.sh.type <= 15);
2148
0
                ASSERT(callbacks[x->u.sh.type] != NULL);
2149
0
                done = callbacks[x->u.sh.type](v, page_to_mfn(x),
2150
0
                                               callback_mfn);
2151
0
                if ( done ) break;
2152
0
            }
2153
0
        }
2154
0
        if ( done ) break;
2155
0
    }
2156
0
    d->arch.paging.shadow.hash_walking = 0;
2157
0
}
2158
2159
static void hash_domain_foreach(struct domain *d,
2160
                                unsigned int callback_mask,
2161
                                const hash_domain_callback_t callbacks[],
2162
                                mfn_t callback_mfn)
2163
/* Walk the hash table looking at the types of the entries and
2164
 * calling the appropriate callback function for each entry.
2165
 * The mask determines which shadow types we call back for, and the array
2166
 * of callbacks tells us which function to call.
2167
 * Any callback may return non-zero to let us skip the rest of the scan.
2168
 *
2169
 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
2170
 * then return non-zero to terminate the scan. */
2171
0
{
2172
0
    int i, done = 0;
2173
0
    struct page_info *x;
2174
0
2175
0
    ASSERT(paging_locked_by_me(d));
2176
0
2177
0
    /* Can be called via p2m code &c after shadow teardown. */
2178
0
    if ( unlikely(!d->arch.paging.shadow.hash_table) )
2179
0
        return;
2180
0
2181
0
    /* Say we're here, to stop hash-lookups reordering the chains */
2182
0
    ASSERT(d->arch.paging.shadow.hash_walking == 0);
2183
0
    d->arch.paging.shadow.hash_walking = 1;
2184
0
2185
0
    for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
2186
0
    {
2187
0
        /* WARNING: This is not safe against changes to the hash table.
2188
0
         * The callback *must* return non-zero if it has inserted or
2189
0
         * deleted anything from the hash (lookups are OK, though). */
2190
0
        for ( x = d->arch.paging.shadow.hash_table[i]; x; x = next_shadow(x) )
2191
0
        {
2192
0
            if ( callback_mask & (1 << x->u.sh.type) )
2193
0
            {
2194
0
                ASSERT(x->u.sh.type <= 15);
2195
0
                ASSERT(callbacks[x->u.sh.type] != NULL);
2196
0
                done = callbacks[x->u.sh.type](d, page_to_mfn(x),
2197
0
                                               callback_mfn);
2198
0
                if ( done ) break;
2199
0
            }
2200
0
        }
2201
0
        if ( done ) break;
2202
0
    }
2203
0
    d->arch.paging.shadow.hash_walking = 0;
2204
0
}
2205
2206
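/*
 * Sketch of the expected calling pattern for hash_domain_foreach()
 * (illustrative only: example_cb and example_walk are not part of this
 * file).  The callback obeys the warning above by doing lookups only
 * and returning zero, and the caller must already hold the paging lock.
 */
static int example_cb(struct domain *d, mfn_t smfn, mfn_t other_mfn)
{
    /* Inspect the shadow at smfn; never insert into or delete from the
     * hash table here while returning zero. */
    return 0;   /* 0 == keep walking; non-zero would end the scan */
}

static void example_walk(struct domain *d, mfn_t gmfn)
{
    /* Indexed by shadow type; entries left NULL are never called,
     * because the mask below only selects l1_64 shadows. */
    static const hash_domain_callback_t cbs[SH_type_unused] = {
        [SH_type_l1_64_shadow] = example_cb,
    };

    ASSERT(paging_locked_by_me(d));
    hash_domain_foreach(d, SHF_L1_64, cbs, gmfn);
}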
2207
/**************************************************************************/
2208
/* Destroy a shadow page: simple dispatcher to call the per-type destructor
2209
 * which will decrement refcounts appropriately and return memory to the
2210
 * free pool. */
2211
2212
void sh_destroy_shadow(struct domain *d, mfn_t smfn)
2213
0
{
2214
0
    struct page_info *sp = mfn_to_page(smfn);
2215
0
    unsigned int t = sp->u.sh.type;
2216
0
2217
0
2218
0
    SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
2219
0
2220
0
    /* Double-check, if we can, that the shadowed page belongs to this
2221
0
     * domain, (by following the back-pointer). */
2222
0
    ASSERT(t == SH_type_fl1_32_shadow  ||
2223
0
           t == SH_type_fl1_pae_shadow ||
2224
0
           t == SH_type_fl1_64_shadow  ||
2225
0
           t == SH_type_monitor_table  ||
2226
0
           (is_pv_32bit_domain(d) && t == SH_type_l4_64_shadow) ||
2227
0
           (page_get_owner(mfn_to_page(backpointer(sp))) == d));
2228
0
2229
0
    /* The down-shifts here are so that the switch statement is on nice
2230
0
     * small numbers that the compiler will enjoy */
2231
0
    switch ( t )
2232
0
    {
2233
0
    case SH_type_l1_32_shadow:
2234
0
    case SH_type_fl1_32_shadow:
2235
0
        SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2)(d, smfn);
2236
0
        break;
2237
0
    case SH_type_l2_32_shadow:
2238
0
        SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2)(d, smfn);
2239
0
        break;
2240
0
2241
0
    case SH_type_l1_pae_shadow:
2242
0
    case SH_type_fl1_pae_shadow:
2243
0
        SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3)(d, smfn);
2244
0
        break;
2245
0
    case SH_type_l2_pae_shadow:
2246
0
    case SH_type_l2h_pae_shadow:
2247
0
        SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3)(d, smfn);
2248
0
        break;
2249
0
2250
0
    case SH_type_l1_64_shadow:
2251
0
    case SH_type_fl1_64_shadow:
2252
0
        SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4)(d, smfn);
2253
0
        break;
2254
0
    case SH_type_l2h_64_shadow:
2255
0
        ASSERT(is_pv_32bit_domain(d));
2256
0
        /* Fall through... */
2257
0
    case SH_type_l2_64_shadow:
2258
0
        SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4)(d, smfn);
2259
0
        break;
2260
0
    case SH_type_l3_64_shadow:
2261
0
        SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4)(d, smfn);
2262
0
        break;
2263
0
    case SH_type_l4_64_shadow:
2264
0
        SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4)(d, smfn);
2265
0
        break;
2266
0
2267
0
    default:
2268
0
        SHADOW_ERROR("tried to destroy shadow of bad type %08lx\n",
2269
0
                     (unsigned long)t);
2270
0
        BUG();
2271
0
    }
2272
0
}
2273
2274
static inline void trace_shadow_wrmap_bf(mfn_t gmfn)
2275
0
{
2276
0
    if ( tb_init_done )
2277
0
    {
2278
0
        /* Convert gmfn to gfn */
2279
0
        unsigned long gfn = mfn_to_gfn(current->domain, gmfn);
2280
0
        __trace_var(TRC_SHADOW_WRMAP_BF, 0/*!tsc*/, sizeof(gfn), &gfn);
2281
0
    }
2282
0
}
2283
2284
/**************************************************************************/
2285
/* Remove all writeable mappings of a guest frame from the shadow tables
2286
 * Returns non-zero if we need to flush TLBs.
2287
 * level and fault_addr describe how we found this to be a pagetable;
2288
 * level==0 means we have some other reason for revoking write access.
2289
 * If level==0 we are allowed to fail, returning -1. */
2290
2291
int sh_remove_write_access(struct domain *d, mfn_t gmfn,
2292
                           unsigned int level,
2293
                           unsigned long fault_addr)
2294
0
{
2295
0
    /* Dispatch table for getting per-type functions */
2296
0
    static const hash_domain_callback_t callbacks[SH_type_unused] = {
2297
0
        NULL, /* none    */
2298
0
        SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* l1_32   */
2299
0
        SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* fl1_32  */
2300
0
        NULL, /* l2_32   */
2301
0
        SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 3), /* l1_pae  */
2302
0
        SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 3), /* fl1_pae */
2303
0
        NULL, /* l2_pae  */
2304
0
        NULL, /* l2h_pae */
2305
0
        SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 4), /* l1_64   */
2306
0
        SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 4), /* fl1_64  */
2307
0
        NULL, /* l2_64   */
2308
0
        NULL, /* l2h_64  */
2309
0
        NULL, /* l3_64   */
2310
0
        NULL, /* l4_64   */
2311
0
        NULL, /* p2m     */
2312
0
        NULL  /* unused  */
2313
0
    };
2314
0
2315
0
    static const unsigned int callback_mask =
2316
0
          SHF_L1_32
2317
0
        | SHF_FL1_32
2318
0
        | SHF_L1_PAE
2319
0
        | SHF_FL1_PAE
2320
0
        | SHF_L1_64
2321
0
        | SHF_FL1_64
2322
0
        ;
2323
0
    struct page_info *pg = mfn_to_page(gmfn);
2324
0
#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
2325
0
    struct vcpu *curr = current;
2326
0
#endif
2327
0
2328
0
    ASSERT(paging_locked_by_me(d));
2329
0
2330
0
    /* Only remove writable mappings if we are doing shadow refcounts.
2331
0
     * In guest refcounting, we trust Xen to already be restricting
2332
0
     * all the writes to the guest page tables, so we do not need to
2333
0
     * do more. */
2334
0
    if ( !shadow_mode_refcounts(d) )
2335
0
        return 0;
2336
0
2337
0
    /* Early exit if it's already a pagetable, or otherwise not writeable */
2338
0
    if ( (sh_mfn_is_a_page_table(gmfn)
2339
0
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2340
0
         /* Unless they've been allowed to go out of sync with their shadows */
2341
0
           && !mfn_oos_may_write(gmfn)
2342
0
#endif
2343
0
         )
2344
0
         || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2345
0
        return 0;
2346
0
2347
0
    TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP);
2348
0
2349
0
    perfc_incr(shadow_writeable);
2350
0
2351
0
    /* If this isn't a "normal" writeable page, the domain is trying to
2352
0
     * put pagetables in special memory of some kind.  We can't allow that. */
2353
0
    if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
2354
0
    {
2355
0
        SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
2356
0
                      PRtype_info "\n",
2357
0
                      mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
2358
0
        domain_crash(d);
2359
0
    }
2360
0
2361
0
#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
2362
0
    if ( curr->domain == d )
2363
0
    {
2364
0
        unsigned long gfn;
2365
0
        /* Heuristic: there is likely to be only one writeable mapping,
2366
0
         * and that mapping is likely to be in the current pagetable,
2367
0
         * in the guest's linear map (on non-HIGHPTE linux and windows) */
2368
0
2369
0
#define GUESS(_a, _h) do {                                              \
2370
0
            if ( curr->arch.paging.mode->shadow.guess_wrmap(            \
2371
0
                     curr, (_a), gmfn) )                                \
2372
0
                perfc_incr(shadow_writeable_h_ ## _h);                  \
2373
0
            if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )        \
2374
0
            {                                                           \
2375
0
                TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND);   \
2376
0
                return 1;                                               \
2377
0
            }                                                           \
2378
0
        } while (0)
2379
0
2380
0
        if ( curr->arch.paging.mode->guest_levels == 2 )
2381
0
        {
2382
0
            if ( level == 1 )
2383
0
                /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
2384
0
                GUESS(0xC0000000UL + (fault_addr >> 10), 1);
2385
0
2386
0
            /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
2387
0
            if ((gfn = mfn_to_gfn(d, gmfn)) < 0x38000 )
2388
0
                GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
2389
0
2390
0
            /* FreeBSD: Linear map at 0xBFC00000 */
2391
0
            if ( level == 1 )
2392
0
                GUESS(0xBFC00000UL
2393
0
                      + ((fault_addr & VADDR_MASK) >> 10), 6);
2394
0
        }
2395
0
        else if ( curr->arch.paging.mode->guest_levels == 3 )
2396
0
        {
2397
0
            /* 32bit PAE w2k3: linear map at 0xC0000000 */
2398
0
            switch ( level )
2399
0
            {
2400
0
            case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
2401
0
            case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
2402
0
            }
2403
0
2404
0
            /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
2405
0
            if ((gfn = mfn_to_gfn(d, gmfn)) < 0x38000 )
2406
0
                GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
2407
0
2408
0
            /* FreeBSD PAE: Linear map at 0xBF800000 */
2409
0
            switch ( level )
2410
0
            {
2411
0
            case 1: GUESS(0xBF800000UL
2412
0
                          + ((fault_addr & VADDR_MASK) >> 9), 6); break;
2413
0
            case 2: GUESS(0xBFDFC000UL
2414
0
                          + ((fault_addr & VADDR_MASK) >> 18), 6); break;
2415
0
            }
2416
0
        }
2417
0
        else if ( curr->arch.paging.mode->guest_levels == 4 )
2418
0
        {
2419
0
            /* 64bit w2k3: linear map at 0xfffff68000000000 */
2420
0
            switch ( level )
2421
0
            {
2422
0
            case 1: GUESS(0xfffff68000000000UL
2423
0
                          + ((fault_addr & VADDR_MASK) >> 9), 3); break;
2424
0
            case 2: GUESS(0xfffff6fb40000000UL
2425
0
                          + ((fault_addr & VADDR_MASK) >> 18), 3); break;
2426
0
            case 3: GUESS(0xfffff6fb7da00000UL
2427
0
                          + ((fault_addr & VADDR_MASK) >> 27), 3); break;
2428
0
            }
2429
0
2430
0
            /* 64bit Linux direct map at 0xffff880000000000; older kernels
2431
0
             * had it at 0xffff810000000000, and older kernels yet had it
2432
0
             * at 0x0000010000000000UL */
2433
0
            gfn = mfn_to_gfn(d, gmfn);
2434
0
            GUESS(0xffff880000000000UL + (gfn << PAGE_SHIFT), 4);
2435
0
            GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
2436
0
            GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4);
2437
0
2438
0
            /*
2439
0
             * 64bit Solaris kernel page map at
2440
0
             * kpm_vbase; 0xfffffe0000000000UL
2441
0
             */
2442
0
            GUESS(0xfffffe0000000000UL + (gfn << PAGE_SHIFT), 4);
2443
0
2444
0
             /* FreeBSD 64bit: linear map 0xffff800000000000 */
2445
0
             switch ( level )
2446
0
             {
2447
0
             case 1: GUESS(0xffff800000000000
2448
0
                           + ((fault_addr & VADDR_MASK) >> 9), 6); break;
2449
0
             case 2: GUESS(0xffff804000000000UL
2450
0
                           + ((fault_addr & VADDR_MASK) >> 18), 6); break;
2451
0
             case 3: GUESS(0xffff804020000000UL
2452
0
                           + ((fault_addr & VADDR_MASK) >> 27), 6); break;
2453
0
             }
2454
0
             /* FreeBSD 64bit: direct map at 0xffffff0000000000 */
2455
0
             GUESS(0xffffff0000000000 + (gfn << PAGE_SHIFT), 6);
2456
0
        }
2457
0
2458
0
#undef GUESS
2459
0
    }
2460
0
2461
0
    if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2462
0
        return 1;
2463
0
2464
0
    /* Second heuristic: on HIGHPTE linux, there are two particular PTEs
2465
0
     * (entries in the fixmap) where linux maps its pagetables.  Since
2466
0
     * we expect to hit them most of the time, we start the search for
2467
0
     * the writeable mapping by looking at the same MFN where the last
2468
0
     * brute-force search succeeded. */
2469
0
2470
0
    if ( (curr->domain == d) &&
2471
0
         (curr->arch.paging.shadow.last_writeable_pte_smfn != 0) )
2472
0
    {
2473
0
        unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask);
2474
0
        mfn_t last_smfn = _mfn(curr->arch.paging.shadow.last_writeable_pte_smfn);
2475
0
        int shtype = mfn_to_page(last_smfn)->u.sh.type;
2476
0
2477
0
        if ( callbacks[shtype] )
2478
0
            callbacks[shtype](d, last_smfn, gmfn);
2479
0
2480
0
        if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count )
2481
0
            perfc_incr(shadow_writeable_h_5);
2482
0
    }
2483
0
2484
0
    if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2485
0
        return 1;
2486
0
2487
0
#endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
2488
0
2489
0
    /* Brute-force search of all the shadows, by walking the hash */
2490
0
    trace_shadow_wrmap_bf(gmfn);
2491
0
    if ( level == 0 )
2492
0
        perfc_incr(shadow_writeable_bf_1);
2493
0
    else
2494
0
        perfc_incr(shadow_writeable_bf);
2495
0
    hash_domain_foreach(d, callback_mask, callbacks, gmfn);
2496
0
2497
0
    /* If that didn't catch the mapping, then there's some non-pagetable
2498
0
     * mapping -- ioreq page, grant mapping, &c. */
2499
0
    if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
2500
0
    {
2501
0
        if ( level == 0 )
2502
0
            return -1;
2503
0
2504
0
        SHADOW_ERROR("can't remove write access to mfn %lx: guest has "
2505
0
                      "%lu special-use mappings of it\n", mfn_x(gmfn),
2506
0
                      (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
2507
0
        domain_crash(d);
2508
0
    }
2509
0
2510
0
    /* We killed at least one writeable mapping, so must flush TLBs. */
2511
0
    return 1;
2512
0
}
2513
2514
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2515
int sh_remove_write_access_from_sl1p(struct domain *d, mfn_t gmfn,
2516
                                     mfn_t smfn, unsigned long off)
2517
0
{
2518
0
    struct page_info *sp = mfn_to_page(smfn);
2519
0
2520
0
    ASSERT(mfn_valid(smfn));
2521
0
    ASSERT(mfn_valid(gmfn));
2522
0
2523
0
    if ( sp->u.sh.type == SH_type_l1_32_shadow
2524
0
         || sp->u.sh.type == SH_type_fl1_32_shadow )
2525
0
    {
2526
0
        return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,2)
2527
0
            (d, gmfn, smfn, off);
2528
0
    }
2529
0
    else if ( sp->u.sh.type == SH_type_l1_pae_shadow
2530
0
              || sp->u.sh.type == SH_type_fl1_pae_shadow )
2531
0
        return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,3)
2532
0
            (d, gmfn, smfn, off);
2533
0
    else if ( sp->u.sh.type == SH_type_l1_64_shadow
2534
0
              || sp->u.sh.type == SH_type_fl1_64_shadow )
2535
0
        return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,4)
2536
0
            (d, gmfn, smfn, off);
2537
0
2538
0
    return 0;
2539
0
}
2540
#endif
2541
2542
/**************************************************************************/
2543
/* Remove all mappings of a guest frame from the shadow tables.
2544
 * Returns non-zero if we need to flush TLBs. */
2545
2546
static int sh_remove_all_mappings(struct domain *d, mfn_t gmfn, gfn_t gfn)
2547
0
{
2548
0
    struct page_info *page = mfn_to_page(gmfn);
2549
0
2550
0
    /* Dispatch table for getting per-type functions */
2551
0
    static const hash_domain_callback_t callbacks[SH_type_unused] = {
2552
0
        NULL, /* none    */
2553
0
        SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* l1_32   */
2554
0
        SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* fl1_32  */
2555
0
        NULL, /* l2_32   */
2556
0
        SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 3), /* l1_pae  */
2557
0
        SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 3), /* fl1_pae */
2558
0
        NULL, /* l2_pae  */
2559
0
        NULL, /* l2h_pae */
2560
0
        SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 4), /* l1_64   */
2561
0
        SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 4), /* fl1_64  */
2562
0
        NULL, /* l2_64   */
2563
0
        NULL, /* l2h_64  */
2564
0
        NULL, /* l3_64   */
2565
0
        NULL, /* l4_64   */
2566
0
        NULL, /* p2m     */
2567
0
        NULL  /* unused  */
2568
0
    };
2569
0
2570
0
    static const unsigned int callback_mask =
2571
0
          SHF_L1_32
2572
0
        | SHF_FL1_32
2573
0
        | SHF_L1_PAE
2574
0
        | SHF_FL1_PAE
2575
0
        | SHF_L1_64
2576
0
        | SHF_FL1_64
2577
0
        ;
2578
0
2579
0
    perfc_incr(shadow_mappings);
2580
0
    if ( sh_check_page_has_no_refs(page) )
2581
0
        return 0;
2582
0
2583
0
    /* Although this is an externally visible function, we do not know
2584
0
     * whether the paging lock will be held when it is called (since it
2585
0
     * can be called via put_page_type when we clear a shadow l1e).*/
2586
0
    paging_lock_recursive(d);
2587
0
2588
0
    /* XXX TODO:
2589
0
     * Heuristics for finding the (probably) single mapping of this gmfn */
2590
0
2591
0
    /* Brute-force search of all the shadows, by walking the hash */
2592
0
    perfc_incr(shadow_mappings_bf);
2593
0
    hash_domain_foreach(d, callback_mask, callbacks, gmfn);
2594
0
2595
0
    /* If that didn't catch the mapping, something is very wrong */
2596
0
    if ( !sh_check_page_has_no_refs(page) )
2597
0
    {
2598
0
        /*
2599
0
         * Don't complain if we're in HVM and there are some extra mappings:
2600
0
         * The qemu helper process has an untyped mapping of this dom's RAM
2601
0
         * and the HVM restore program takes another.
2602
0
         * Also allow one typed refcount for
2603
0
         * - Xen heap pages, to match share_xen_page_with_guest(),
2604
0
         * - ioreq server pages, to match prepare_ring_for_helper().
2605
0
         */
2606
0
        if ( !(shadow_mode_external(d)
2607
0
               && (page->count_info & PGC_count_mask) <= 3
2608
0
               && ((page->u.inuse.type_info & PGT_count_mask)
2609
0
                   == (is_xen_heap_page(page) ||
2610
0
                       is_ioreq_server_page(d, page)))) )
2611
0
        {
2612
0
            SHADOW_ERROR("can't find all mappings of mfn %lx (gfn %lx): "
2613
0
                          "c=%lx t=%lx x=%d i=%d\n", mfn_x(gmfn), gfn_x(gfn),
2614
0
                          page->count_info, page->u.inuse.type_info,
2615
0
                          !!is_xen_heap_page(page), is_ioreq_server_page(d, page));
2616
0
        }
2617
0
    }
2618
0
2619
0
    paging_unlock(d);
2620
0
2621
0
    /* We killed at least one mapping, so must flush TLBs. */
2622
0
    return 1;
2623
0
}
2624
2625
2626
/**************************************************************************/
2627
/* Remove all shadows of a guest frame from the shadow tables */
2628
2629
static int sh_remove_shadow_via_pointer(struct domain *d, mfn_t smfn)
2630
/* Follow this shadow's up-pointer, if it has one, and remove the reference
2631
 * found there.  Returns 1 if that was the only reference to this shadow */
2632
0
{
2633
0
    struct page_info *sp = mfn_to_page(smfn);
2634
0
    mfn_t pmfn;
2635
0
    void *vaddr;
2636
0
    int rc;
2637
0
2638
0
    ASSERT(sp->u.sh.type > 0);
2639
0
    ASSERT(sp->u.sh.type < SH_type_max_shadow);
2640
0
    ASSERT(sh_type_has_up_pointer(d, sp->u.sh.type));
2641
0
2642
0
    if (sp->up == 0) return 0;
2643
0
    pmfn = maddr_to_mfn(sp->up);
2644
0
    ASSERT(mfn_valid(pmfn));
2645
0
    vaddr = map_domain_page(pmfn);
2646
0
    ASSERT(vaddr);
2647
0
    vaddr += sp->up & (PAGE_SIZE-1);
2648
0
    ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
2649
0
2650
0
    /* Is this the only reference to this shadow? */
2651
0
    rc = (sp->u.sh.count == 1) ? 1 : 0;
2652
0
2653
0
    /* Blank the offending entry */
2654
0
    switch (sp->u.sh.type)
2655
0
    {
2656
0
    case SH_type_l1_32_shadow:
2657
0
    case SH_type_l2_32_shadow:
2658
0
        SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 2)(d, vaddr, pmfn);
2659
0
        break;
2660
0
    case SH_type_l1_pae_shadow:
2661
0
    case SH_type_l2_pae_shadow:
2662
0
    case SH_type_l2h_pae_shadow:
2663
0
        SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 3)(d, vaddr, pmfn);
2664
0
        break;
2665
0
    case SH_type_l1_64_shadow:
2666
0
    case SH_type_l2_64_shadow:
2667
0
    case SH_type_l2h_64_shadow:
2668
0
    case SH_type_l3_64_shadow:
2669
0
    case SH_type_l4_64_shadow:
2670
0
        SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 4)(d, vaddr, pmfn);
2671
0
        break;
2672
0
    default: BUG(); /* Some weird unknown shadow type */
2673
0
    }
2674
0
2675
0
    unmap_domain_page(vaddr);
2676
0
    if ( rc )
2677
0
        perfc_incr(shadow_up_pointer);
2678
0
    else
2679
0
        perfc_incr(shadow_unshadow_bf);
2680
0
2681
0
    return rc;
2682
0
}
2683
2684
void sh_remove_shadows(struct domain *d, mfn_t gmfn, int fast, int all)
2685
/* Remove the shadows of this guest page.
2686
 * If fast != 0, just try the quick heuristic, which will remove
2687
 * at most one reference to each shadow of the page.  Otherwise, walk
2688
 * all the shadow tables looking for refs to shadows of this gmfn.
2689
 * If all != 0, kill the domain if we can't find all the shadows.
2690
 * (all != 0 implies fast == 0)
2691
 */
2692
0
{
2693
0
    struct page_info *pg = mfn_to_page(gmfn);
2694
0
    mfn_t smfn;
2695
0
    unsigned char t;
2696
0
2697
0
    /* Dispatch table for getting per-type functions: each level must
2698
0
     * be called with the function to remove a lower-level shadow. */
2699
0
    static const hash_domain_callback_t callbacks[SH_type_unused] = {
2700
0
        NULL, /* none    */
2701
0
        NULL, /* l1_32   */
2702
0
        NULL, /* fl1_32  */
2703
0
        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 2), /* l2_32   */
2704
0
        NULL, /* l1_pae  */
2705
0
        NULL, /* fl1_pae */
2706
0
        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 3), /* l2_pae  */
2707
0
        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 3), /* l2h_pae */
2708
0
        NULL, /* l1_64   */
2709
0
        NULL, /* fl1_64  */
2710
0
        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 4), /* l2_64   */
2711
0
        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 4), /* l2h_64  */
2712
0
        SHADOW_INTERNAL_NAME(sh_remove_l2_shadow, 4), /* l3_64   */
2713
0
        SHADOW_INTERNAL_NAME(sh_remove_l3_shadow, 4), /* l4_64   */
2714
0
        NULL, /* p2m     */
2715
0
        NULL  /* unused  */
2716
0
    };
2717
0
2718
0
    /* Another lookup table, for choosing which mask to use */
2719
0
    static const unsigned int masks[SH_type_unused] = {
2720
0
        0, /* none    */
2721
0
        SHF_L2_32, /* l1_32   */
2722
0
        0, /* fl1_32  */
2723
0
        0, /* l2_32   */
2724
0
        SHF_L2H_PAE | SHF_L2_PAE, /* l1_pae  */
2725
0
        0, /* fl1_pae */
2726
0
        0, /* l2_pae  */
2727
0
        0, /* l2h_pae  */
2728
0
        SHF_L2H_64 | SHF_L2_64, /* l1_64   */
2729
0
        0, /* fl1_64  */
2730
0
        SHF_L3_64, /* l2_64   */
2731
0
        SHF_L3_64, /* l2h_64  */
2732
0
        SHF_L4_64, /* l3_64   */
2733
0
        0, /* l4_64   */
2734
0
        0, /* p2m     */
2735
0
        0  /* unused  */
2736
0
    };
2737
0
2738
0
    ASSERT(!(all && fast));
2739
0
    ASSERT(mfn_valid(gmfn));
2740
0
2741
0
    /* Although this is an externally visible function, we do not know
2742
0
     * whether the paging lock will be held when it is called (since it
2743
0
     * can be called via put_page_type when we clear a shadow l1e).*/
2744
0
    paging_lock_recursive(d);
2745
0
2746
0
    SHADOW_PRINTK("d%d gmfn=%"PRI_mfn"\n", d->domain_id, mfn_x(gmfn));
2747
0
2748
0
    /* Bail out now if the page is not shadowed */
2749
0
    if ( (pg->count_info & PGC_page_table) == 0 )
2750
0
    {
2751
0
        paging_unlock(d);
2752
0
        return;
2753
0
    }
2754
0
2755
0
    /* Search for this shadow in all appropriate shadows */
2756
0
    perfc_incr(shadow_unshadow);
2757
0
2758
0
    /* Lower-level shadows need to be excised from upper-level shadows.
2759
0
     * This call to hash_domain_foreach() looks dangerous but is in fact OK: each
2760
0
     * call will remove at most one shadow, and terminate immediately when
2761
0
     * it does remove it, so we never walk the hash after doing a deletion.  */
2762
0
#define DO_UNSHADOW(_type) do {                                         \
2763
0
    t = (_type);                                                        \
2764
0
    if( !(pg->count_info & PGC_page_table)                              \
2765
0
        || !(pg->shadow_flags & (1 << t)) )                             \
2766
0
        break;                                                          \
2767
0
    smfn = shadow_hash_lookup(d, mfn_x(gmfn), t);                       \
2768
0
    if ( unlikely(!mfn_valid(smfn)) )                                   \
2769
0
    {                                                                   \
2770
0
        SHADOW_ERROR(": gmfn %#lx has flags %#"PRIx32                   \
2771
0
                     " but no type-%#"PRIx32" shadow\n",                \
2772
0
                     mfn_x(gmfn), (uint32_t)pg->shadow_flags, t);       \
2773
0
        break;                                                          \
2774
0
    }                                                                   \
2775
0
    if ( sh_type_is_pinnable(d, t) )                                    \
2776
0
        sh_unpin(d, smfn);                                              \
2777
0
    else if ( sh_type_has_up_pointer(d, t) )                            \
2778
0
        sh_remove_shadow_via_pointer(d, smfn);                          \
2779
0
    if( !fast                                                           \
2780
0
        && (pg->count_info & PGC_page_table)                            \
2781
0
        && (pg->shadow_flags & (1 << t)) )                              \
2782
0
        hash_domain_foreach(d, masks[t], callbacks, smfn);              \
2783
0
} while (0)
2784
0
2785
0
    DO_UNSHADOW(SH_type_l2_32_shadow);
2786
0
    DO_UNSHADOW(SH_type_l1_32_shadow);
2787
0
    DO_UNSHADOW(SH_type_l2h_pae_shadow);
2788
0
    DO_UNSHADOW(SH_type_l2_pae_shadow);
2789
0
    DO_UNSHADOW(SH_type_l1_pae_shadow);
2790
0
    DO_UNSHADOW(SH_type_l4_64_shadow);
2791
0
    DO_UNSHADOW(SH_type_l3_64_shadow);
2792
0
    DO_UNSHADOW(SH_type_l2h_64_shadow);
2793
0
    DO_UNSHADOW(SH_type_l2_64_shadow);
2794
0
    DO_UNSHADOW(SH_type_l1_64_shadow);
2795
0
2796
0
#undef DO_UNSHADOW
2797
0
2798
0
    /* If that didn't catch the shadows, something is wrong */
2799
0
    if ( !fast && all && (pg->count_info & PGC_page_table) )
2800
0
    {
2801
0
        SHADOW_ERROR("can't find all shadows of mfn %"PRI_mfn" "
2802
0
                     "(shadow_flags=%08x)\n",
2803
0
                      mfn_x(gmfn), pg->shadow_flags);
2804
0
        domain_crash(d);
2805
0
    }
2806
0
2807
0
    /* Need to flush TLBs now, so that linear maps are safe next time we
2808
0
     * take a fault. */
2809
0
    flush_tlb_mask(d->domain_dirty_cpumask);
2810
0
2811
0
    paging_unlock(d);
2812
0
}
2813
2814
static void
2815
sh_remove_all_shadows_and_parents(struct domain *d, mfn_t gmfn)
2816
/* Even harsher: this is an HVM page that we think is no longer a pagetable.
2817
 * Unshadow it, and recursively unshadow pages that reference it. */
2818
0
{
2819
0
    sh_remove_shadows(d, gmfn, 0, 1);
2820
0
    /* XXX TODO:
2821
0
     * Rework this hashtable walker to return a linked-list of all
2822
0
     * the shadows it modified, then do breadth-first recursion
2823
0
     * to find the way up to higher-level tables and unshadow them too.
2824
0
     *
2825
0
     * The current code (just tearing down each page's shadows as we
2826
0
     * detect that it is not a pagetable) is correct, but very slow.
2827
0
     * It means extra emulated writes and slows down removal of mappings. */
2828
0
}
2829
2830
/**************************************************************************/
2831
2832
/* Reset the up-pointers of every L3 shadow to 0.
2833
 * This is called when l3 shadows stop being pinnable, to clear out all
2834
 * the list-head bits so the up-pointer field is properly initialised. */
2835
static int sh_clear_up_pointer(struct vcpu *v, mfn_t smfn, mfn_t unused)
2836
0
{
2837
0
    mfn_to_page(smfn)->up = 0;
2838
0
    return 0;
2839
0
}
2840
2841
void sh_reset_l3_up_pointers(struct vcpu *v)
2842
0
{
2843
0
    static const hash_vcpu_callback_t callbacks[SH_type_unused] = {
2844
0
        NULL, /* none    */
2845
0
        NULL, /* l1_32   */
2846
0
        NULL, /* fl1_32  */
2847
0
        NULL, /* l2_32   */
2848
0
        NULL, /* l1_pae  */
2849
0
        NULL, /* fl1_pae */
2850
0
        NULL, /* l2_pae  */
2851
0
        NULL, /* l2h_pae */
2852
0
        NULL, /* l1_64   */
2853
0
        NULL, /* fl1_64  */
2854
0
        NULL, /* l2_64   */
2855
0
        NULL, /* l2h_64  */
2856
0
        sh_clear_up_pointer, /* l3_64   */
2857
0
        NULL, /* l4_64   */
2858
0
        NULL, /* p2m     */
2859
0
        NULL  /* unused  */
2860
0
    };
2861
0
    static const unsigned int callback_mask = SHF_L3_64;
2862
0
2863
0
    hash_vcpu_foreach(v, callback_mask, callbacks, INVALID_MFN);
2864
0
}
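
The table-plus-mask pattern above (and again in shadow_audit_tables() further down) is how hash_vcpu_foreach() dispatches work: only shadow types whose bit is set in the mask have their callback invoked. The following is a minimal sketch of that dispatch, not the real hash-walk code; for_each_shadow_page() and the sp->u.sh.type access are assumptions made purely for illustration.

static void example_hash_dispatch(struct vcpu *v, unsigned int mask,
                                  const hash_vcpu_callback_t callbacks[],
                                  mfn_t data)
{
    struct page_info *sp;

    /* Hypothetical iterator over every shadow page in the hash table. */
    for_each_shadow_page(v->domain, sp)
    {
        unsigned int t = sp->u.sh.type;

        /* Skip types whose bit is not in the mask, or with no callback. */
        if ( (mask & (1u << t)) && callbacks[t] != NULL )
            /* e.g. sh_clear_up_pointer() for the SHF_L3_64 shadows above. */
            callbacks[t](v, page_to_mfn(sp), data);
    }
}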
2865
2866
2867
/**************************************************************************/
2868
2869
static void sh_update_paging_modes(struct vcpu *v)
2870
0
{
2871
0
    struct domain *d = v->domain;
2872
0
    const struct paging_mode *old_mode = v->arch.paging.mode;
2873
0
2874
0
    ASSERT(paging_locked_by_me(d));
2875
0
2876
0
#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
2877
0
    /* Make sure this vcpu has a virtual TLB array allocated */
2878
0
    if ( unlikely(!v->arch.paging.vtlb) )
2879
0
    {
2880
0
        v->arch.paging.vtlb = xzalloc_array(struct shadow_vtlb, VTLB_ENTRIES);
2881
0
        if ( unlikely(!v->arch.paging.vtlb) )
2882
0
        {
2883
0
            SHADOW_ERROR("Could not allocate vTLB space for dom %u vcpu %u\n",
2884
0
                         d->domain_id, v->vcpu_id);
2885
0
            domain_crash(v->domain);
2886
0
            return;
2887
0
        }
2888
0
        spin_lock_init(&v->arch.paging.vtlb_lock);
2889
0
    }
2890
0
#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
2891
0
2892
0
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2893
0
    if ( mfn_eq(v->arch.paging.shadow.oos_snapshot[0], INVALID_MFN) )
2894
0
    {
2895
0
        int i;
2896
0
        for(i = 0; i < SHADOW_OOS_PAGES; i++)
2897
0
        {
2898
0
            shadow_prealloc(d, SH_type_oos_snapshot, 1);
2899
0
            v->arch.paging.shadow.oos_snapshot[i] =
2900
0
                shadow_alloc(d, SH_type_oos_snapshot, 0);
2901
0
        }
2902
0
    }
2903
0
#endif /* OOS */
2904
0
2905
0
    // Valid transitions handled by this function:
2906
0
    // - For PV guests:
2907
0
    //     - after a shadow mode has been changed
2908
0
    // - For HVM guests:
2909
0
    //     - after a shadow mode has been changed
2910
0
    //     - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
2911
0
    //
2912
0
2913
0
    // First, tear down any old shadow tables held by this vcpu.
2914
0
    //
2915
0
    if ( v->arch.paging.mode )
2916
0
        v->arch.paging.mode->shadow.detach_old_tables(v);
2917
0
2918
0
    if ( !is_pv_domain(d) )
2919
0
    {
2920
0
        ///
2921
0
        /// HVM guest
2922
0
        ///
2923
0
        ASSERT(shadow_mode_translate(d));
2924
0
        ASSERT(shadow_mode_external(d));
2925
0
2926
0
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2927
0
        /* Need to resync all our pages now, because if a page goes out
2928
0
         * of sync with paging enabled and is resynced with paging
2929
0
         * disabled, the resync will go wrong. */
2930
0
        shadow_resync_all(v);
2931
0
#endif /* OOS */
2932
0
2933
0
        if ( !hvm_paging_enabled(v) )
2934
0
        {
2935
0
            /* When the guest has CR0.PG clear, we provide a 32-bit, non-PAE
2936
0
             * pagetable for it, mapping 4 GB one-to-one using a single l2
2937
0
             * page of 1024 superpage mappings */
2938
0
            v->arch.guest_table = d->arch.paging.shadow.unpaged_pagetable;
2939
0
            v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 2);
2940
0
        }
2941
0
        else if ( hvm_long_mode_active(v) )
2942
0
        {
2943
0
            // long mode guest...
2944
0
            v->arch.paging.mode =
2945
0
                &SHADOW_INTERNAL_NAME(sh_paging_mode, 4);
2946
0
        }
2947
0
        else if ( hvm_pae_enabled(v) )
2948
0
        {
2949
0
            // 32-bit PAE mode guest...
2950
0
            v->arch.paging.mode =
2951
0
                &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
2952
0
        }
2953
0
        else
2954
0
        {
2955
0
            // 32-bit 2 level guest...
2956
0
            v->arch.paging.mode =
2957
0
                &SHADOW_INTERNAL_NAME(sh_paging_mode, 2);
2958
0
        }
2959
0
2960
0
        if ( pagetable_is_null(v->arch.monitor_table) )
2961
0
        {
2962
0
            mfn_t mmfn = v->arch.paging.mode->shadow.make_monitor_table(v);
2963
0
            v->arch.monitor_table = pagetable_from_mfn(mmfn);
2964
0
            make_cr3(v, mmfn);
2965
0
            hvm_update_host_cr3(v);
2966
0
        }
2967
0
2968
0
        if ( v->arch.paging.mode != old_mode )
2969
0
        {
2970
0
            SHADOW_PRINTK("new paging mode: %pv pe=%d gl=%u "
2971
0
                          "sl=%u (was g=%u s=%u)\n",
2972
0
                          v,
2973
0
                          is_hvm_domain(d) ? hvm_paging_enabled(v) : 1,
2974
0
                          v->arch.paging.mode->guest_levels,
2975
0
                          v->arch.paging.mode->shadow.shadow_levels,
2976
0
                          old_mode ? old_mode->guest_levels : 0,
2977
0
                          old_mode ? old_mode->shadow.shadow_levels : 0);
2978
0
            if ( old_mode &&
2979
0
                 (v->arch.paging.mode->shadow.shadow_levels !=
2980
0
                  old_mode->shadow.shadow_levels) )
2981
0
            {
2982
0
                /* Need to make a new monitor table for the new mode */
2983
0
                mfn_t new_mfn, old_mfn;
2984
0
2985
0
                if ( v != current && vcpu_runnable(v) )
2986
0
                {
2987
0
                    SHADOW_ERROR("Some third party (%pv) is changing "
2988
0
                                 "this HVM vcpu's (%pv) paging mode "
2989
0
                                 "while it is running.\n",
2990
0
                                 current, v);
2991
0
                    /* It's not safe to do that because we can't change
2992
0
                     * the host CR3 for a running domain */
2993
0
                    domain_crash(v->domain);
2994
0
                    return;
2995
0
                }
2996
0
2997
0
                old_mfn = pagetable_get_mfn(v->arch.monitor_table);
2998
0
                v->arch.monitor_table = pagetable_null();
2999
0
                new_mfn = v->arch.paging.mode->shadow.make_monitor_table(v);
3000
0
                v->arch.monitor_table = pagetable_from_mfn(new_mfn);
3001
0
                SHADOW_PRINTK("new monitor table %"PRI_mfn "\n",
3002
0
                               mfn_x(new_mfn));
3003
0
3004
0
                /* Don't be running on the old monitor table when we
3005
0
                 * pull it down!  Switch CR3, and warn the HVM code that
3006
0
                 * its host cr3 has changed. */
3007
0
                make_cr3(v, new_mfn);
3008
0
                if ( v == current )
3009
0
                    write_ptbase(v);
3010
0
                hvm_update_host_cr3(v);
3011
0
                old_mode->shadow.destroy_monitor_table(v, old_mfn);
3012
0
            }
3013
0
        }
3014
0
3015
0
        // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
3016
0
        //        These are HARD: think about the case where two CPUs have
3017
0
        //        different values for CR4.PSE and CR4.PGE at the same time.
3018
0
        //        This *does* happen, at least for CR4.PGE...
3019
0
    }
3020
0
3021
0
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3022
0
    /* We need to check that all the vcpus have paging enabled to
3023
0
     * unsync PTs. */
3024
0
    if ( is_hvm_domain(d) && !d->arch.paging.shadow.oos_off )
3025
0
    {
3026
0
        int pe = 1;
3027
0
        struct vcpu *vptr;
3028
0
3029
0
        for_each_vcpu(d, vptr)
3030
0
        {
3031
0
            if ( !hvm_paging_enabled(vptr) )
3032
0
            {
3033
0
                pe = 0;
3034
0
                break;
3035
0
            }
3036
0
        }
3037
0
3038
0
        d->arch.paging.shadow.oos_active = pe;
3039
0
    }
3040
0
#endif /* OOS */
3041
0
3042
0
    v->arch.paging.mode->update_cr3(v, 0);
3043
0
}
3044
3045
void shadow_update_paging_modes(struct vcpu *v)
3046
0
{
3047
0
    paging_lock(v->domain);
3048
0
    sh_update_paging_modes(v);
3049
0
    paging_unlock(v->domain);
3050
0
}
3051
3052
/**************************************************************************/
3053
/* Turning on and off shadow features */
3054
3055
static void sh_new_mode(struct domain *d, u32 new_mode)
3056
/* Inform all the vcpus that the shadow mode has been changed */
3057
0
{
3058
0
    struct vcpu *v;
3059
0
3060
0
    ASSERT(paging_locked_by_me(d));
3061
0
    ASSERT(d != current->domain);
3062
0
3063
0
    d->arch.paging.mode = new_mode;
3064
0
    for_each_vcpu(d, v)
3065
0
        sh_update_paging_modes(v);
3066
0
}
3067
3068
int shadow_enable(struct domain *d, u32 mode)
3069
/* Turn on "permanent" shadow features: external, translate, refcount.
3070
 * Can only be called once on a domain, and these features cannot be
3071
 * disabled.
3072
 * Returns 0 for success, -errno for failure. */
3073
0
{
3074
0
    unsigned int old_pages;
3075
0
    struct page_info *pg = NULL;
3076
0
    uint32_t *e;
3077
0
    int rv = 0;
3078
0
    struct p2m_domain *p2m = p2m_get_hostp2m(d);
3079
0
3080
0
    mode |= PG_SH_enable;
3081
0
3082
0
    domain_pause(d);
3083
0
3084
0
    /* Sanity check the arguments */
3085
0
    if ( shadow_mode_enabled(d) )
3086
0
    {
3087
0
        rv = -EINVAL;
3088
0
        goto out_unlocked;
3089
0
    }
3090
0
3091
0
    /* Init the shadow memory allocation if the user hasn't done so */
3092
0
    old_pages = d->arch.paging.shadow.total_pages;
3093
0
    if ( old_pages == 0 )
3094
0
    {
3095
0
        paging_lock(d);
3096
0
        rv = shadow_set_allocation(d, 1024, NULL); /* Use at least 4MB */
3097
0
        if ( rv != 0 )
3098
0
        {
3099
0
            shadow_set_allocation(d, 0, NULL);
3100
0
            goto out_locked;
3101
0
        }
3102
0
        paging_unlock(d);
3103
0
    }
3104
0
3105
0
    /* Allow p2m and log-dirty code to borrow shadow memory */
3106
0
    d->arch.paging.alloc_page = shadow_alloc_p2m_page;
3107
0
    d->arch.paging.free_page = shadow_free_p2m_page;
3108
0
3109
0
    /* Init the P2M table.  Must be done before we take the paging lock
3110
0
     * to avoid possible deadlock. */
3111
0
    if ( mode & PG_translate )
3112
0
    {
3113
0
        rv = p2m_alloc_table(p2m);
3114
0
        if (rv != 0)
3115
0
            goto out_unlocked;
3116
0
    }
3117
0
3118
0
    /* HVM domains need an extra pagetable for vcpus that think they
3119
0
     * have paging disabled */
3120
0
    if ( is_hvm_domain(d) )
3121
0
    {
3122
0
        /* Get a single page from the shadow pool.  Take it via the
3123
0
         * P2M interface to make freeing it simpler afterwards. */
3124
0
        pg = shadow_alloc_p2m_page(d);
3125
0
        if ( pg == NULL )
3126
0
        {
3127
0
            rv = -ENOMEM;
3128
0
            goto out_unlocked;
3129
0
        }
3130
0
        /* Fill it with 32-bit, non-PAE superpage entries, each mapping 4MB
3131
0
         * of virtual address space onto the same physical address range */
3132
0
        e = __map_domain_page(pg);
3133
0
        write_32bit_pse_identmap(e);
3134
0
        unmap_domain_page(e);
3135
0
        pg->u.inuse.type_info = PGT_l2_page_table | 1 | PGT_validated;
3136
0
    }
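
As an illustration of what the fill above amounts to (a sketch, not necessarily the body of write_32bit_pse_identmap()): entry i of the single l2 page is a 4MB PSE superpage entry mapping virtual addresses [i*4MB, (i+1)*4MB) onto the identical physical range, so 1024 entries cover the whole 4GB one-to-one.

static void example_pse_identmap_fill(uint32_t *l2)
{
    unsigned int i;

    /* 1024 entries x 4MB superpages = 4GB, with virtual == physical. */
    for ( i = 0; i < 1024; i++ )
        l2[i] = (i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
                _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE;
}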
3137
0
3138
0
    paging_lock(d);
3139
0
3140
0
    /* Sanity check again with the lock held */
3141
0
    if ( shadow_mode_enabled(d) )
3142
0
    {
3143
0
        rv = -EINVAL;
3144
0
        goto out_locked;
3145
0
    }
3146
0
3147
0
    /* Init the hash table */
3148
0
    if ( shadow_hash_alloc(d) != 0 )
3149
0
    {
3150
0
        rv = -ENOMEM;
3151
0
        goto out_locked;
3152
0
    }
3153
0
3154
0
#if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
3155
0
    /* We assume we're dealing with an older 64bit linux guest until we
3156
0
     * see the guest use more than one l4 per vcpu. */
3157
0
    d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
3158
0
#endif
3159
0
3160
0
    /* Record the 1-to-1 pagetable we just made */
3161
0
    if ( is_hvm_domain(d) )
3162
0
        d->arch.paging.shadow.unpaged_pagetable = pagetable_from_page(pg);
3163
0
3164
0
    /* Update the bits */
3165
0
    sh_new_mode(d, mode);
3166
0
3167
0
 out_locked:
3168
0
    paging_unlock(d);
3169
0
 out_unlocked:
3170
0
    if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) )
3171
0
        p2m_teardown(p2m);
3172
0
    if ( rv != 0 && pg != NULL )
3173
0
        shadow_free_p2m_page(d, pg);
3174
0
    domain_unpause(d);
3175
0
    return rv;
3176
0
}
3177
3178
void shadow_teardown(struct domain *d, bool *preempted)
3179
/* Destroy the shadow pagetables of this domain and free its shadow memory.
3180
 * Should only be called for dying domains. */
3181
0
{
3182
0
    struct vcpu *v;
3183
0
    mfn_t mfn;
3184
0
    struct page_info *unpaged_pagetable = NULL;
3185
0
3186
0
    ASSERT(d->is_dying);
3187
0
    ASSERT(d != current->domain);
3188
0
3189
0
    paging_lock(d);
3190
0
3191
0
    if ( shadow_mode_enabled(d) )
3192
0
    {
3193
0
        /* Release the shadow and monitor tables held by each vcpu */
3194
0
        for_each_vcpu(d, v)
3195
0
        {
3196
0
            if ( v->arch.paging.mode )
3197
0
            {
3198
0
                v->arch.paging.mode->shadow.detach_old_tables(v);
3199
0
                if ( shadow_mode_external(d) )
3200
0
                {
3201
0
                    mfn = pagetable_get_mfn(v->arch.monitor_table);
3202
0
                    if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
3203
0
                        v->arch.paging.mode->shadow.destroy_monitor_table(v, mfn);
3204
0
                    v->arch.monitor_table = pagetable_null();
3205
0
                }
3206
0
            }
3207
0
        }
3208
0
    }
3209
0
3210
0
#if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC))
3211
0
    /* Free the virtual-TLB array attached to each vcpu */
3212
0
    for_each_vcpu(d, v)
3213
0
    {
3214
0
#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3215
0
        if ( v->arch.paging.vtlb )
3216
0
        {
3217
0
            xfree(v->arch.paging.vtlb);
3218
0
            v->arch.paging.vtlb = NULL;
3219
0
        }
3220
0
#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3221
0
3222
0
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3223
0
        {
3224
0
            int i;
3225
0
            mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
3226
0
            for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
3227
0
                if ( mfn_valid(oos_snapshot[i]) )
3228
0
                {
3229
0
                    shadow_free(d, oos_snapshot[i]);
3230
0
                    oos_snapshot[i] = INVALID_MFN;
3231
0
                }
3232
0
        }
3233
0
#endif /* OOS */
3234
0
    }
3235
0
#endif /* (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) */
3236
0
3237
0
    if ( d->arch.paging.shadow.total_pages != 0 )
3238
0
    {
3239
0
        /* Destroy all the shadows and release memory to domheap */
3240
0
        shadow_set_allocation(d, 0, preempted);
3241
0
3242
0
        if ( preempted && *preempted )
3243
0
            goto out;
3244
0
3245
0
        /* Release the hash table back to xenheap */
3246
0
        if (d->arch.paging.shadow.hash_table)
3247
0
            shadow_hash_teardown(d);
3248
0
3249
0
        ASSERT(d->arch.paging.shadow.total_pages == 0);
3250
0
    }
3251
0
3252
0
    /* Free the non-paged-vcpus pagetable; must happen after we've
3253
0
     * destroyed any shadows of it or sh_destroy_shadow will get confused. */
3254
0
    if ( !pagetable_is_null(d->arch.paging.shadow.unpaged_pagetable) )
3255
0
    {
3256
0
        ASSERT(is_hvm_domain(d));
3257
0
        for_each_vcpu(d, v)
3258
0
            if ( !hvm_paging_enabled(v) )
3259
0
                v->arch.guest_table = pagetable_null();
3260
0
        unpaged_pagetable =
3261
0
            pagetable_get_page(d->arch.paging.shadow.unpaged_pagetable);
3262
0
        d->arch.paging.shadow.unpaged_pagetable = pagetable_null();
3263
0
    }
3264
0
3265
0
    /* We leave the "permanent" shadow modes enabled, but clear the
3266
0
     * log-dirty mode bit.  We don't want any more mark_dirty()
3267
0
     * calls now that we've torn down the bitmap */
3268
0
    d->arch.paging.mode &= ~PG_log_dirty;
3269
0
3270
0
    if (d->arch.hvm_domain.dirty_vram) {
3271
0
        xfree(d->arch.hvm_domain.dirty_vram->sl1ma);
3272
0
        xfree(d->arch.hvm_domain.dirty_vram->dirty_bitmap);
3273
0
        xfree(d->arch.hvm_domain.dirty_vram);
3274
0
        d->arch.hvm_domain.dirty_vram = NULL;
3275
0
    }
3276
0
3277
0
out:
3278
0
    paging_unlock(d);
3279
0
3280
0
    /* Must be called outside the lock */
3281
0
    if ( unpaged_pagetable )
3282
0
        shadow_free_p2m_page(d, unpaged_pagetable);
3283
0
}
3284
3285
void shadow_final_teardown(struct domain *d)
3286
/* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
3287
0
{
3288
0
    SHADOW_PRINTK("dom %u final teardown starts."
3289
0
                   "  Shadow pages total = %u, free = %u, p2m=%u\n",
3290
0
                   d->domain_id,
3291
0
                   d->arch.paging.shadow.total_pages,
3292
0
                   d->arch.paging.shadow.free_pages,
3293
0
                   d->arch.paging.shadow.p2m_pages);
3294
0
3295
0
    /* Double-check that the domain didn't have any shadow memory.
3296
0
     * It is possible for a domain that never got domain_kill()ed
3297
0
     * to get here with its shadow allocation intact. */
3298
0
    if ( d->arch.paging.shadow.total_pages != 0 )
3299
0
        shadow_teardown(d, NULL);
3300
0
3301
0
    /* It is now safe to pull down the p2m map. */
3302
0
    p2m_teardown(p2m_get_hostp2m(d));
3303
0
    /* Free any shadow memory that the p2m teardown released */
3304
0
    paging_lock(d);
3305
0
    shadow_set_allocation(d, 0, NULL);
3306
0
    SHADOW_PRINTK("dom %u final teardown done."
3307
0
                   "  Shadow pages total = %u, free = %u, p2m=%u\n",
3308
0
                   d->domain_id,
3309
0
                   d->arch.paging.shadow.total_pages,
3310
0
                   d->arch.paging.shadow.free_pages,
3311
0
                   d->arch.paging.shadow.p2m_pages);
3312
0
    paging_unlock(d);
3313
0
}
3314
3315
static int shadow_one_bit_enable(struct domain *d, u32 mode)
3316
/* Turn on a single shadow mode feature */
3317
0
{
3318
0
    ASSERT(paging_locked_by_me(d));
3319
0
3320
0
    /* Sanity check the call */
3321
0
    if ( d == current->domain || (d->arch.paging.mode & mode) == mode )
3322
0
    {
3323
0
        return -EINVAL;
3324
0
    }
3325
0
3326
0
    mode |= PG_SH_enable;
3327
0
3328
0
    if ( d->arch.paging.shadow.total_pages == 0 )
3329
0
    {
3330
0
        /* Init the shadow memory allocation if the user hasn't done so */
3331
0
        if ( shadow_set_allocation(d, 1, NULL) != 0 )
3332
0
        {
3333
0
            shadow_set_allocation(d, 0, NULL);
3334
0
            return -ENOMEM;
3335
0
        }
3336
0
    }
3337
0
3338
0
    /* Allow p2m and log-dirty code to borrow shadow memory */
3339
0
    d->arch.paging.alloc_page = shadow_alloc_p2m_page;
3340
0
    d->arch.paging.free_page = shadow_free_p2m_page;
3341
0
3342
0
    if ( d->arch.paging.mode == 0 )
3343
0
    {
3344
0
        /* Init the shadow hash table */
3345
0
        if ( shadow_hash_alloc(d) != 0 )
3346
0
            return -ENOMEM;
3347
0
    }
3348
0
3349
0
    /* Update the bits */
3350
0
    sh_new_mode(d, d->arch.paging.mode | mode);
3351
0
3352
0
    return 0;
3353
0
}
3354
3355
static int shadow_one_bit_disable(struct domain *d, u32 mode)
3356
/* Turn off a single shadow mode feature */
3357
0
{
3358
0
    struct vcpu *v;
3359
0
    ASSERT(paging_locked_by_me(d));
3360
0
3361
0
    /* Sanity check the call */
3362
0
    if ( d == current->domain || !((d->arch.paging.mode & mode) == mode) )
3363
0
    {
3364
0
        return -EINVAL;
3365
0
    }
3366
0
3367
0
    /* Update the bits */
3368
0
    sh_new_mode(d, d->arch.paging.mode & ~mode);
3369
0
    if ( d->arch.paging.mode == 0 )
3370
0
    {
3371
0
        /* Get this domain off shadows */
3372
0
        SHADOW_PRINTK("un-shadowing of domain %u starts."
3373
0
                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
3374
0
                       d->domain_id,
3375
0
                       d->arch.paging.shadow.total_pages,
3376
0
                       d->arch.paging.shadow.free_pages,
3377
0
                       d->arch.paging.shadow.p2m_pages);
3378
0
        for_each_vcpu(d, v)
3379
0
        {
3380
0
            if ( v->arch.paging.mode )
3381
0
                v->arch.paging.mode->shadow.detach_old_tables(v);
3382
0
            if ( !(v->arch.flags & TF_kernel_mode) )
3383
0
                make_cr3(v, pagetable_get_mfn(v->arch.guest_table_user));
3384
0
            else
3385
0
                make_cr3(v, pagetable_get_mfn(v->arch.guest_table));
3386
0
3387
0
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3388
0
            {
3389
0
                int i;
3390
0
                mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
3391
0
                for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
3392
0
                    if ( mfn_valid(oos_snapshot[i]) )
3393
0
                    {
3394
0
                        shadow_free(d, oos_snapshot[i]);
3395
0
                        oos_snapshot[i] = INVALID_MFN;
3396
0
                    }
3397
0
            }
3398
0
#endif /* OOS */
3399
0
        }
3400
0
3401
0
        /* Pull down the memory allocation */
3402
0
        if ( shadow_set_allocation(d, 0, NULL) != 0 )
3403
0
            BUG(); /* In fact, we will have BUG()ed already */
3404
0
        shadow_hash_teardown(d);
3405
0
        SHADOW_PRINTK("un-shadowing of domain %u done."
3406
0
                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
3407
0
                       d->domain_id,
3408
0
                       d->arch.paging.shadow.total_pages,
3409
0
                       d->arch.paging.shadow.free_pages,
3410
0
                       d->arch.paging.shadow.p2m_pages);
3411
0
    }
3412
0
3413
0
    return 0;
3414
0
}
3415
3416
/* Enable/disable ops for the "test" and "log-dirty" modes */
3417
static int shadow_test_enable(struct domain *d)
3418
0
{
3419
0
    int ret;
3420
0
3421
0
    domain_pause(d);
3422
0
    paging_lock(d);
3423
0
    ret = shadow_one_bit_enable(d, PG_SH_enable);
3424
0
    paging_unlock(d);
3425
0
    domain_unpause(d);
3426
0
3427
0
    return ret;
3428
0
}
3429
3430
static int shadow_test_disable(struct domain *d)
3431
0
{
3432
0
    int ret;
3433
0
3434
0
    domain_pause(d);
3435
0
    paging_lock(d);
3436
0
    ret = shadow_one_bit_disable(d, PG_SH_enable);
3437
0
    paging_unlock(d);
3438
0
    domain_unpause(d);
3439
0
3440
0
    return ret;
3441
0
}
3442
3443
/**************************************************************************/
3444
/* P2M map manipulations */
3445
3446
/* Shadow specific code which should be called when a P2M table entry is updated
3447
 * with new content. It is responsible for updating the entry, as well as other
3448
 * shadow processing jobs.
3449
 */
3450
3451
static void sh_unshadow_for_p2m_change(struct domain *d, unsigned long gfn,
3452
                                       l1_pgentry_t *p, l1_pgentry_t new,
3453
                                       unsigned int level)
3454
0
{
3455
0
    /* The following assertion is to make sure we don't step on 1GB host
3456
0
     * page support of HVM guests. */
3457
0
    ASSERT(!(level > 2 && (l1e_get_flags(*p) & _PAGE_PRESENT) &&
3458
0
             (l1e_get_flags(*p) & _PAGE_PSE)));
3459
0
3460
0
    /* If we're removing an MFN from the p2m, remove it from the shadows too */
3461
0
    if ( level == 1 )
3462
0
    {
3463
0
        mfn_t mfn = l1e_get_mfn(*p);
3464
0
        p2m_type_t p2mt = p2m_flags_to_type(l1e_get_flags(*p));
3465
0
        if ( (p2m_is_valid(p2mt) || p2m_is_grant(p2mt)) && mfn_valid(mfn) )
3466
0
        {
3467
0
            sh_remove_all_shadows_and_parents(d, mfn);
3468
0
            if ( sh_remove_all_mappings(d, mfn, _gfn(gfn)) )
3469
0
                flush_tlb_mask(d->domain_dirty_cpumask);
3470
0
        }
3471
0
    }
3472
0
3473
0
    /* If we're removing a superpage mapping from the p2m, we need to check
3474
0
     * all the pages covered by it.  If they're still there in the new
3475
0
     * scheme, that's OK, but otherwise they must be unshadowed. */
3476
0
    if ( level == 2 && (l1e_get_flags(*p) & _PAGE_PRESENT) &&
3477
0
         (l1e_get_flags(*p) & _PAGE_PSE) )
3478
0
    {
3479
0
        unsigned int i;
3480
0
        cpumask_t flushmask;
3481
0
        mfn_t omfn = l1e_get_mfn(*p);
3482
0
        mfn_t nmfn = l1e_get_mfn(new);
3483
0
        l1_pgentry_t *npte = NULL;
3484
0
        p2m_type_t p2mt = p2m_flags_to_type(l1e_get_flags(*p));
3485
0
        if ( p2m_is_valid(p2mt) && mfn_valid(omfn) )
3486
0
        {
3487
0
            cpumask_clear(&flushmask);
3488
0
3489
0
            /* If we're replacing a superpage with a normal L1 page, map it */
3490
0
            if ( (l1e_get_flags(new) & _PAGE_PRESENT)
3491
0
                 && !(l1e_get_flags(new) & _PAGE_PSE)
3492
0
                 && mfn_valid(nmfn) )
3493
0
                npte = map_domain_page(nmfn);
3494
0
3495
0
            for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3496
0
            {
3497
0
                if ( !npte
3498
0
                     || !p2m_is_ram(p2m_flags_to_type(l1e_get_flags(npte[i])))
3499
0
                     || l1e_get_pfn(npte[i]) != mfn_x(omfn) )
3500
0
                {
3501
0
                    /* This GFN->MFN mapping has gone away */
3502
0
                    sh_remove_all_shadows_and_parents(d, omfn);
3503
0
                    if ( sh_remove_all_mappings(d, omfn,
3504
0
                                                _gfn(gfn + (i << PAGE_SHIFT))) )
3505
0
                        cpumask_or(&flushmask, &flushmask,
3506
0
                                   d->domain_dirty_cpumask);
3507
0
                }
3508
0
                omfn = _mfn(mfn_x(omfn) + 1);
3509
0
            }
3510
0
            flush_tlb_mask(&flushmask);
3511
0
3512
0
            if ( npte )
3513
0
                unmap_domain_page(npte);
3514
0
        }
3515
0
    }
3516
0
}
3517
3518
void
3519
shadow_write_p2m_entry(struct domain *d, unsigned long gfn,
3520
                       l1_pgentry_t *p, l1_pgentry_t new,
3521
                       unsigned int level)
3522
0
{
3523
0
    paging_lock(d);
3524
0
3525
0
    /* If there are any shadows, update them.  But if shadow_teardown()
3526
0
     * has already been called then it's not safe to try. */
3527
0
    if ( likely(d->arch.paging.shadow.total_pages != 0) )
3528
0
         sh_unshadow_for_p2m_change(d, gfn, p, new, level);
3529
0
3530
0
    /* Update the entry with new content */
3531
0
    safe_write_pte(p, new);
3532
0
3533
0
#if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
3534
0
    /* If we're doing FAST_FAULT_PATH, then shadow mode may have
3535
0
       cached the fact that this is an mmio region in the shadow
3536
0
       page tables.  Blow the tables away to remove the cache.
3537
0
       This is pretty heavy handed, but this is a rare operation
3538
0
       (it might happen a dozen times during boot and then never
3539
0
       again), so it doesn't matter too much. */
3540
0
    if ( d->arch.paging.shadow.has_fast_mmio_entries )
3541
0
    {
3542
0
        shadow_blow_tables(d);
3543
0
        d->arch.paging.shadow.has_fast_mmio_entries = 0;
3544
0
    }
3545
0
#endif
3546
0
3547
0
    paging_unlock(d);
3548
0
}
3549
3550
/**************************************************************************/
3551
/* Log-dirty mode support */
3552
3553
/* Shadow specific code which is called in paging_log_dirty_enable().
3554
 * Return 0 if no problem found.
3555
 */
3556
static int sh_enable_log_dirty(struct domain *d, bool log_global)
3557
0
{
3558
0
    int ret;
3559
0
3560
0
    paging_lock(d);
3561
0
    if ( shadow_mode_enabled(d) )
3562
0
    {
3563
0
        /* This domain already has some shadows: need to clear them out
3564
0
         * of the way to make sure that all references to guest memory are
3565
0
         * properly write-protected */
3566
0
        shadow_blow_tables(d);
3567
0
    }
3568
0
3569
0
#if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
3570
0
    /* 32bit PV guests on 64bit xen behave like older 64bit linux: they
3571
0
     * change an l4e instead of cr3 to switch tables.  Give them the
3572
0
     * same optimization */
3573
0
    if ( is_pv_32bit_domain(d) )
3574
0
        d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
3575
0
#endif
3576
0
3577
0
    ret = shadow_one_bit_enable(d, PG_log_dirty);
3578
0
    paging_unlock(d);
3579
0
3580
0
    return ret;
3581
0
}
3582
3583
/* Shadow specific code which is called in paging_log_dirty_disable() */
3584
static int sh_disable_log_dirty(struct domain *d)
3585
0
{
3586
0
    int ret;
3587
0
3588
0
    paging_lock(d);
3589
0
    ret = shadow_one_bit_disable(d, PG_log_dirty);
3590
0
    paging_unlock(d);
3591
0
3592
0
    return ret;
3593
0
}
3594
3595
/* This function is called when we CLEAN log dirty bitmap. See
3596
 * paging_log_dirty_op() for details.
3597
 */
3598
static void sh_clean_dirty_bitmap(struct domain *d)
3599
0
{
3600
0
    paging_lock(d);
3601
0
    /* Need to revoke write access to the domain's pages again.
3602
0
     * In future, we'll have a less heavy-handed approach to this,
3603
0
     * but for now, we just unshadow everything except Xen. */
3604
0
    shadow_blow_tables(d);
3605
0
    paging_unlock(d);
3606
0
}
3607
3608
3609
/**************************************************************************/
3610
/* VRAM dirty tracking support */
3611
int shadow_track_dirty_vram(struct domain *d,
3612
                            unsigned long begin_pfn,
3613
                            unsigned long nr,
3614
                            XEN_GUEST_HANDLE_PARAM(void) guest_dirty_bitmap)
3615
0
{
3616
0
    int rc = 0;
3617
0
    unsigned long end_pfn = begin_pfn + nr;
3618
0
    unsigned long dirty_size = (nr + 7) / 8;
3619
0
    int flush_tlb = 0;
3620
0
    unsigned long i;
3621
0
    p2m_type_t t;
3622
0
    struct sh_dirty_vram *dirty_vram;
3623
0
    struct p2m_domain *p2m = p2m_get_hostp2m(d);
3624
0
    uint8_t *dirty_bitmap = NULL;
3625
0
3626
0
    if ( end_pfn < begin_pfn || end_pfn > p2m->max_mapped_pfn + 1 )
3627
0
        return -EINVAL;
3628
0
3629
0
    /* We perform p2m lookups, so lock the p2m upfront to avoid deadlock */
3630
0
    p2m_lock(p2m_get_hostp2m(d));
3631
0
    paging_lock(d);
3632
0
3633
0
    dirty_vram = d->arch.hvm_domain.dirty_vram;
3634
0
3635
0
    if ( dirty_vram && (!nr ||
3636
0
             ( begin_pfn != dirty_vram->begin_pfn
3637
0
            || end_pfn   != dirty_vram->end_pfn )) )
3638
0
    {
3639
0
        /* Different tracking, tear the previous down. */
3640
0
        gdprintk(XENLOG_INFO, "stopping tracking VRAM %lx - %lx\n", dirty_vram->begin_pfn, dirty_vram->end_pfn);
3641
0
        xfree(dirty_vram->sl1ma);
3642
0
        xfree(dirty_vram->dirty_bitmap);
3643
0
        xfree(dirty_vram);
3644
0
        dirty_vram = d->arch.hvm_domain.dirty_vram = NULL;
3645
0
    }
3646
0
3647
0
    if ( !nr )
3648
0
        goto out;
3649
0
3650
0
    dirty_bitmap = vzalloc(dirty_size);
3651
0
    if ( dirty_bitmap == NULL )
3652
0
    {
3653
0
        rc = -ENOMEM;
3654
0
        goto out;
3655
0
    }
3656
0
    /* This should happen rarely (video mode change),
3657
0
     * no need to be careful. */
3658
0
    if ( !dirty_vram )
3659
0
    {
3660
0
        /* Throw away all the shadows rather than walking through them
3661
0
         * up to nr times getting rid of mappings of each pfn */
3662
0
        shadow_blow_tables(d);
3663
0
3664
0
        gdprintk(XENLOG_INFO, "tracking VRAM %lx - %lx\n", begin_pfn, end_pfn);
3665
0
3666
0
        rc = -ENOMEM;
3667
0
        if ( (dirty_vram = xmalloc(struct sh_dirty_vram)) == NULL )
3668
0
            goto out;
3669
0
        dirty_vram->begin_pfn = begin_pfn;
3670
0
        dirty_vram->end_pfn = end_pfn;
3671
0
        d->arch.hvm_domain.dirty_vram = dirty_vram;
3672
0
3673
0
        if ( (dirty_vram->sl1ma = xmalloc_array(paddr_t, nr)) == NULL )
3674
0
            goto out_dirty_vram;
3675
0
        memset(dirty_vram->sl1ma, ~0, sizeof(paddr_t) * nr);
3676
0
3677
0
        if ( (dirty_vram->dirty_bitmap = xzalloc_array(uint8_t, dirty_size)) == NULL )
3678
0
            goto out_sl1ma;
3679
0
3680
0
        dirty_vram->last_dirty = NOW();
3681
0
3682
0
        /* Tell the caller that this time we could not track dirty bits. */
3683
0
        rc = -ENODATA;
3684
0
    }
3685
0
    else if (dirty_vram->last_dirty == -1)
3686
0
        /* still completely clean, just copy our empty bitmap */
3687
0
        memcpy(dirty_bitmap, dirty_vram->dirty_bitmap, dirty_size);
3688
0
    else
3689
0
    {
3690
0
        unsigned long map_mfn = mfn_x(INVALID_MFN);
3691
0
        void *map_sl1p = NULL;
3692
0
3693
0
        /* Iterate over VRAM to track dirty bits. */
3694
0
        for ( i = 0; i < nr; i++ ) {
3695
0
            mfn_t mfn = get_gfn_query_unlocked(d, begin_pfn + i, &t);
3696
0
            struct page_info *page;
3697
0
            int dirty = 0;
3698
0
            paddr_t sl1ma = dirty_vram->sl1ma[i];
3699
0
3700
0
            if ( mfn_eq(mfn, INVALID_MFN) )
3701
0
            {
3702
0
                dirty = 1;
3703
0
            }
3704
0
            else
3705
0
            {
3706
0
                page = mfn_to_page(mfn);
3707
0
                switch (page->u.inuse.type_info & PGT_count_mask)
3708
0
                {
3709
0
                case 0:
3710
0
                    /* No guest reference, nothing to track. */
3711
0
                    break;
3712
0
                case 1:
3713
0
                    /* One guest reference. */
3714
0
                    if ( sl1ma == INVALID_PADDR )
3715
0
                    {
3716
0
                        /* We don't know which sl1e points to this, too bad. */
3717
0
                        dirty = 1;
3718
0
                        /* TODO: Heuristics for finding the single mapping of
3719
0
                         * this gmfn */
3720
0
                        flush_tlb |= sh_remove_all_mappings(d, mfn,
3721
0
                                                            _gfn(begin_pfn + i));
3722
0
                    }
3723
0
                    else
3724
0
                    {
3725
0
                        /* Hopefully the most common case: only one mapping,
3726
0
                         * whose dirty bit we can use. */
3727
0
                        l1_pgentry_t *sl1e;
3728
0
                        unsigned long sl1mfn = paddr_to_pfn(sl1ma);
3729
0
3730
0
                        if ( sl1mfn != map_mfn )
3731
0
                        {
3732
0
                            if ( map_sl1p )
3733
0
                                unmap_domain_page(map_sl1p);
3734
0
                            map_sl1p = map_domain_page(_mfn(sl1mfn));
3735
0
                            map_mfn = sl1mfn;
3736
0
                        }
3737
0
                        sl1e = map_sl1p + (sl1ma & ~PAGE_MASK);
3738
0
3739
0
                        if ( l1e_get_flags(*sl1e) & _PAGE_DIRTY )
3740
0
                        {
3741
0
                            dirty = 1;
3742
0
                            /* Note: this is atomic, so we may clear a
3743
0
                             * _PAGE_ACCESSED set by another processor. */
3744
0
                            l1e_remove_flags(*sl1e, _PAGE_DIRTY);
3745
0
                            flush_tlb = 1;
3746
0
                        }
3747
0
                    }
3748
0
                    break;
3749
0
                default:
3750
0
                    /* More than one guest reference,
3751
0
                     * we can't afford to track that. */
3752
0
                    dirty = 1;
3753
0
                    break;
3754
0
                }
3755
0
            }
3756
0
3757
0
            if ( dirty )
3758
0
            {
3759
0
                dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8);
3760
0
                dirty_vram->last_dirty = NOW();
3761
0
            }
3762
0
        }
3763
0
3764
0
        if ( map_sl1p )
3765
0
            unmap_domain_page(map_sl1p);
3766
0
3767
0
        memcpy(dirty_bitmap, dirty_vram->dirty_bitmap, dirty_size);
3768
0
        memset(dirty_vram->dirty_bitmap, 0, dirty_size);
3769
0
        if ( dirty_vram->last_dirty + SECONDS(2) < NOW() )
3770
0
        {
3771
0
            /* was clean for more than two seconds, try to disable guest
3772
0
             * write access */
3773
0
            for ( i = begin_pfn; i < end_pfn; i++ )
3774
0
            {
3775
0
                mfn_t mfn = get_gfn_query_unlocked(d, i, &t);
3776
0
                if ( !mfn_eq(mfn, INVALID_MFN) )
3777
0
                    flush_tlb |= sh_remove_write_access(d, mfn, 1, 0);
3778
0
            }
3779
0
            dirty_vram->last_dirty = -1;
3780
0
        }
3781
0
    }
3782
0
    if ( flush_tlb )
3783
0
        flush_tlb_mask(d->domain_dirty_cpumask);
3784
0
    goto out;
3785
0
3786
0
out_sl1ma:
3787
0
    xfree(dirty_vram->sl1ma);
3788
0
out_dirty_vram:
3789
0
    xfree(dirty_vram);
3790
0
    dirty_vram = d->arch.hvm_domain.dirty_vram = NULL;
3791
0
3792
0
out:
3793
0
    paging_unlock(d);
3794
0
    if ( rc == 0 && dirty_bitmap != NULL &&
3795
0
         copy_to_guest(guest_dirty_bitmap, dirty_bitmap, dirty_size) )
3796
0
    {
3797
0
        paging_lock(d);
3798
0
        for ( i = 0; i < dirty_size; i++ )
3799
0
            dirty_vram->dirty_bitmap[i] |= dirty_bitmap[i];
3800
0
        paging_unlock(d);
3801
0
        rc = -EFAULT;
3802
0
    }
3803
0
    vfree(dirty_bitmap);
3804
0
    p2m_unlock(p2m_get_hostp2m(d));
3805
0
    return rc;
3806
0
}
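
The dirty-VRAM bitmap above keeps one bit per pfn in the tracked range, packed LSB-first into bytes, with the byte count rounded up (dirty_size = (nr + 7) / 8). A minimal sketch of that convention follows; the helper names are invented for illustration and are not part of the file.

static inline unsigned long example_bitmap_bytes(unsigned long nr)
{
    return (nr + 7) / 8;            /* same rounding as dirty_size above */
}

static inline void example_mark_dirty(uint8_t *bitmap, unsigned long i)
{
    bitmap[i / 8] |= 1u << (i % 8); /* pfn begin_pfn + i is dirty */
}

static inline bool example_test_dirty(const uint8_t *bitmap, unsigned long i)
{
    return (bitmap[i / 8] >> (i % 8)) & 1;
}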
3807
3808
/**************************************************************************/
3809
/* Shadow-control XEN_DOMCTL dispatcher */
3810
3811
int shadow_domctl(struct domain *d,
3812
                  struct xen_domctl_shadow_op *sc,
3813
                  XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
3814
0
{
3815
0
    int rc;
3816
0
    bool preempted = false;
3817
0
3818
0
    switch ( sc->op )
3819
0
    {
3820
0
    case XEN_DOMCTL_SHADOW_OP_OFF:
3821
0
        if ( d->arch.paging.mode == PG_SH_enable )
3822
0
            if ( (rc = shadow_test_disable(d)) != 0 )
3823
0
                return rc;
3824
0
        return 0;
3825
0
3826
0
    case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
3827
0
        return shadow_test_enable(d);
3828
0
3829
0
    case XEN_DOMCTL_SHADOW_OP_ENABLE:
3830
0
        return paging_enable(d, sc->mode << PG_mode_shift);
3831
0
3832
0
    case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
3833
0
        sc->mb = shadow_get_allocation(d);
3834
0
        return 0;
3835
0
3836
0
    case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
3837
0
        paging_lock(d);
3838
0
        if ( sc->mb == 0 && shadow_mode_enabled(d) )
3839
0
        {
3840
0
            /* Can't set the allocation to zero unless the domain stops using
3841
0
             * shadow pagetables first */
3842
0
            SHADOW_ERROR("Can't set shadow allocation to zero, domain %u"
3843
0
                         " is still using shadows.\n", d->domain_id);
3844
0
            paging_unlock(d);
3845
0
            return -EINVAL;
3846
0
        }
3847
0
        rc = shadow_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
3848
0
        paging_unlock(d);
3849
0
        if ( preempted )
3850
0
            /* Not finished.  Set up to re-run the call. */
3851
0
            rc = hypercall_create_continuation(
3852
0
                __HYPERVISOR_domctl, "h", u_domctl);
3853
0
        else
3854
0
            /* Finished.  Return the new allocation */
3855
0
            sc->mb = shadow_get_allocation(d);
3856
0
        return rc;
3857
0
3858
0
    default:
3859
0
        SHADOW_ERROR("Bad shadow op %u\n", sc->op);
3860
0
        return -EINVAL;
3861
0
    }
3862
0
}
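
A worked example of the MiB-to-pages conversion in the SET_ALLOCATION case above, assuming the usual 4 KiB pages (PAGE_SHIFT == 12); the helper name is illustrative only.

static inline unsigned long example_mb_to_pages(unsigned long mb)
{
    return mb << (20 - 12);   /* 1 MiB = 2^20 bytes = 256 4-KiB pages */
}

/* example_mb_to_pages(4) == 1024, matching the "at least 4MB" minimum that
 * shadow_enable() passes to shadow_set_allocation(). */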
3863
3864
3865
/**************************************************************************/
3866
/* Auditing shadow tables */
3867
3868
#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
3869
3870
void shadow_audit_tables(struct vcpu *v)
3871
{
3872
    /* Dispatch table for getting per-type functions */
3873
    static const hash_vcpu_callback_t callbacks[SH_type_unused] = {
3874
        NULL, /* none    */
3875
        SHADOW_INTERNAL_NAME(sh_audit_l1_table, 2),  /* l1_32   */
3876
        SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 2), /* fl1_32  */
3877
        SHADOW_INTERNAL_NAME(sh_audit_l2_table, 2),  /* l2_32   */
3878
        SHADOW_INTERNAL_NAME(sh_audit_l1_table, 3),  /* l1_pae  */
3879
        SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 3), /* fl1_pae */
3880
        SHADOW_INTERNAL_NAME(sh_audit_l2_table, 3),  /* l2_pae  */
3881
        SHADOW_INTERNAL_NAME(sh_audit_l2_table, 3),  /* l2h_pae */
3882
        SHADOW_INTERNAL_NAME(sh_audit_l1_table, 4),  /* l1_64   */
3883
        SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 4), /* fl1_64  */
3884
        SHADOW_INTERNAL_NAME(sh_audit_l2_table, 4),  /* l2_64   */
3885
        SHADOW_INTERNAL_NAME(sh_audit_l2_table, 4),  /* l2h_64   */
3886
        SHADOW_INTERNAL_NAME(sh_audit_l3_table, 4),  /* l3_64   */
3887
        SHADOW_INTERNAL_NAME(sh_audit_l4_table, 4),  /* l4_64   */
3888
        NULL  /* All the rest */
3889
    };
3890
    unsigned int mask;
3891
3892
    if ( !(SHADOW_AUDIT_ENABLE) )
3893
        return;
3894
3895
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3896
    sh_oos_audit(v->domain);
3897
#endif
3898
3899
    if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
3900
        mask = SHF_page_type_mask; /* Audit every table in the system */
3901
    else 
3902
    {
3903
        /* Audit only the current mode's tables */
3904
        switch ( v->arch.paging.mode->guest_levels )
3905
        {
3906
        case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
3907
        case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
3908
                        |SHF_L2H_PAE); break;
3909
        case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
3910
                        |SHF_L3_64|SHF_L4_64); break;
3911
        default: BUG();
3912
        }
3913
    }
3914
3915
    hash_vcpu_foreach(v, mask, callbacks, INVALID_MFN);
3916
}
3917
3918
#endif /* Shadow audit */
3919
3920
/*
3921
 * Local variables:
3922
 * mode: C
3923
 * c-file-style: "BSD"
3924
 * c-basic-offset: 4
3925
 * indent-tabs-mode: nil
3926
 * End:
3927
 */