Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/arch/x86/mm.c
Line
Count
Source
1
/******************************************************************************
2
 * arch/x86/mm.c
3
 *
4
 * Copyright (c) 2002-2005 K A Fraser
5
 * Copyright (c) 2004 Christian Limpach
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
 * the Free Software Foundation; either version 2 of the License, or
10
 * (at your option) any later version.
11
 *
12
 * This program is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU General Public License
18
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
19
 */
20
21
/*
22
 * A description of the x86 page table API:
23
 *
24
 * Domains trap to do_mmu_update with a list of update requests.
25
 * This is a list of (ptr, val) pairs, where the requested operation
26
 * is *ptr = val.
27
 *
28
 * Reference counting of pages:
29
 * ----------------------------
30
 * Each page has two refcounts: tot_count and type_count.
31
 *
32
 * TOT_COUNT is the obvious reference count. It counts all uses of a
33
 * physical page frame by a domain, including uses as a page directory,
34
 * a page table, or simple mappings via a PTE. This count prevents a
35
 * domain from releasing a frame back to the free pool when it still holds
36
 * a reference to it.
37
 *
38
 * TYPE_COUNT is more subtle. A frame can be put to one of three
39
 * mutually-exclusive uses: it might be used as a page directory, or a
40
 * page table, or it may be mapped writable by the domain [of course, a
41
 * frame may not be used in any of these three ways!].
42
 * So, type_count is a count of the number of times a frame is being
43
 * referred to in its current incarnation. Therefore, a page can only
44
 * change its type when its type count is zero.
45
 *
46
 * Pinning the page type:
47
 * ----------------------
48
 * The type of a page can be pinned/unpinned with the commands
49
 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
50
 * pinning is not reference counted, so it can't be nested).
51
 * This is useful to prevent a page's type count falling to zero, at which
52
 * point safety checks would need to be carried out next time the count
53
 * is increased again.
54
 *
55
 * A further note on writable page mappings:
56
 * -----------------------------------------
57
 * For simplicity, the count of writable mappings for a page may not
58
 * correspond to reality. The 'writable count' is incremented for every
59
 * PTE which maps the page with the _PAGE_RW flag set. However, for
60
 * write access to be possible the page directory entry must also have
61
 * its _PAGE_RW bit set. We do not check this as it complicates the
62
 * reference counting considerably [consider the case of multiple
63
 * directory entries referencing a single page table, some with the RW
64
 * bit set, others not -- it starts getting a bit messy].
65
 * In normal use, this simplification shouldn't be a problem.
66
 * However, the logic can be added if required.
67
 *
68
 * One more note on read-only page mappings:
69
 * -----------------------------------------
70
 * We want domains to be able to map pages for read-only access. The
71
 * main reason is that page tables and directories should be readable
72
 * by a domain, but it would not be safe for them to be writable.
73
 * However, domains have free access to rings 1 & 2 of the Intel
74
 * privilege model. In terms of page protection, these are considered
75
 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
76
 * read-only restrictions are respected in supervisor mode -- if the
77
 * bit is clear then any mapped page is writable.
78
 *
79
 * We get round this by always setting the WP bit and disallowing
80
 * updates to it. This is very unlikely to cause a problem for guest
81
 * OS's, which will generally use the WP bit to simplify copy-on-write
82
 * implementation (in that case, OS wants a fault when it writes to
83
 * an application-supplied buffer).
84
 */
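 * The guest-facing side of the interface described above is not in this file;
 * purely for illustration, the following is a minimal guest-side sketch of
 * submitting one (ptr, val) update and pinning a top-level table.  It assumes
 * the public ABI from xen/include/public/xen.h (struct mmu_update,
 * struct mmuext_op, MMU_NORMAL_PT_UPDATE, MMUEXT_PIN_L4_TABLE, DOMID_SELF)
 * and guest-provided HYPERVISOR_mmu_update()/HYPERVISOR_mmuext_op() wrappers;
 * it is not part of mm.c or of the coverage data.

#include <stdint.h>
#include <xen/xen.h>   /* exact include path depends on the guest environment */

/* One (ptr, val) request: *ptr = val, validated by Xen in do_mmu_update(). */
static int remap_one_pte(uint64_t pte_maddr, uint64_t new_pte_val)
{
    struct mmu_update req = {
        /* The low bits of .ptr select the command. */
        .ptr = pte_maddr | MMU_NORMAL_PT_UPDATE,
        .val = new_pte_val,
    };

    return HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
}

/* Pin a top-level table so its type count cannot fall back to zero. */
static int pin_root_table(xen_pfn_t l4_mfn)
{
    struct mmuext_op op = {
        .cmd = MMUEXT_PIN_L4_TABLE,
        .arg1.mfn = l4_mfn,
    };

    return HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
}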
85
86
#include <xen/init.h>
87
#include <xen/kernel.h>
88
#include <xen/lib.h>
89
#include <xen/mm.h>
90
#include <xen/domain.h>
91
#include <xen/sched.h>
92
#include <xen/err.h>
93
#include <xen/perfc.h>
94
#include <xen/irq.h>
95
#include <xen/softirq.h>
96
#include <xen/domain_page.h>
97
#include <xen/event.h>
98
#include <xen/iocap.h>
99
#include <xen/guest_access.h>
100
#include <xen/pfn.h>
101
#include <xen/vmap.h>
102
#include <xen/xmalloc.h>
103
#include <xen/efi.h>
104
#include <xen/grant_table.h>
105
#include <xen/hypercall.h>
106
#include <asm/paging.h>
107
#include <asm/shadow.h>
108
#include <asm/page.h>
109
#include <asm/flushtlb.h>
110
#include <asm/io.h>
111
#include <asm/ldt.h>
112
#include <asm/x86_emulate.h>
113
#include <asm/e820.h>
114
#include <asm/hypercall.h>
115
#include <asm/shared.h>
116
#include <asm/mem_sharing.h>
117
#include <public/memory.h>
118
#include <public/sched.h>
119
#include <xsm/xsm.h>
120
#include <xen/trace.h>
121
#include <asm/setup.h>
122
#include <asm/fixmap.h>
123
#include <asm/io_apic.h>
124
#include <asm/pci.h>
125
126
#include <asm/hvm/grant_table.h>
127
#include <asm/pv/grant_table.h>
128
129
#include "pv/mm.h"
130
131
/* Override macros from asm/page.h to make them work with mfn_t */
132
#undef mfn_to_page
133
14.5k
#define mfn_to_page(mfn) __mfn_to_page(mfn_x(mfn))
134
#undef page_to_mfn
135
24
#define page_to_mfn(pg) _mfn(__page_to_mfn(pg))
136
137
/* Mapping of the fixmap space needed early. */
138
l1_pgentry_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
139
    l1_fixmap[L1_PAGETABLE_ENTRIES];
140
141
paddr_t __read_mostly mem_hotplug;
142
143
/* Private domain structs for DOMID_XEN and DOMID_IO. */
144
struct domain *dom_xen, *dom_io, *dom_cow;
145
146
/* Frame table size in pages. */
147
unsigned long max_page;
148
unsigned long total_pages;
149
150
bool __read_mostly machine_to_phys_mapping_valid;
151
152
struct rangeset *__read_mostly mmio_ro_ranges;
153
154
static uint32_t base_disallow_mask;
155
/* Global bit is allowed to be set on L1 PTEs. Intended for user mappings. */
156
0
#define L1_DISALLOW_MASK ((base_disallow_mask | _PAGE_GNTTAB) & ~_PAGE_GLOBAL)
157
158
0
#define L2_DISALLOW_MASK base_disallow_mask
159
160
0
#define l3_disallow_mask(d) (!is_pv_32bit_domain(d) ? \
161
0
                             base_disallow_mask : 0xFFFFF198U)
162
163
0
#define L4_DISALLOW_MASK (base_disallow_mask)
164
165
#define l1_disallow_mask(d)                                     \
166
0
    ((d != dom_io) &&                                           \
167
0
     (rangeset_is_empty((d)->iomem_caps) &&                     \
168
0
      rangeset_is_empty((d)->arch.ioport_caps) &&               \
169
0
      !has_arch_pdevs(d) &&                                     \
170
0
      is_pv_domain(d)) ?                                        \
171
0
     L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
172
173
static s8 __read_mostly opt_mmio_relax;
174
175
static int __init parse_mmio_relax(const char *s)
176
0
{
177
0
    if ( !*s )
178
0
        opt_mmio_relax = 1;
179
0
    else
180
0
        opt_mmio_relax = parse_bool(s, NULL);
181
0
    if ( opt_mmio_relax < 0 && strcmp(s, "all") )
182
0
    {
183
0
        opt_mmio_relax = 0;
184
0
        return -EINVAL;
185
0
    }
186
0
187
0
    return 0;
188
0
}
189
custom_param("mmio-relax", parse_mmio_relax);
190
191
static void __init init_frametable_chunk(void *start, void *end)
192
2
{
193
2
    unsigned long s = (unsigned long)start;
194
2
    unsigned long e = (unsigned long)end;
195
2
    unsigned long step;
196
2
    mfn_t mfn;
197
2
198
2
    ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
199
66
    for ( ; s < e; s += step << PAGE_SHIFT )
200
64
    {
201
64
        step = 1UL << (cpu_has_page1gb &&
202
64
                       !(s & ((1UL << L3_PAGETABLE_SHIFT) - 1)) ?
203
1
                       L3_PAGETABLE_SHIFT - PAGE_SHIFT :
204
63
                       L2_PAGETABLE_SHIFT - PAGE_SHIFT);
205
64
        /*
206
64
         * The hardcoded 4 below is arbitrary - just pick whatever you think
207
64
         * is reasonable to waste as a trade-off for using a large page.
208
64
         */
209
65
        while ( step && s + (step << PAGE_SHIFT) > e + (4 << PAGE_SHIFT) )
210
1
            step >>= PAGETABLE_ORDER;
211
64
        mfn = alloc_boot_pages(step, step);
212
64
        map_pages_to_xen(s, mfn_x(mfn), step, PAGE_HYPERVISOR);
213
64
    }
214
2
215
2
    memset(start, 0, end - start);
216
2
    memset(end, -1, s - e);
217
2
}
218
219
void __init init_frametable(void)
220
1
{
221
1
    unsigned int sidx, eidx, nidx;
222
1
    unsigned int max_idx = (max_pdx + PDX_GROUP_COUNT - 1) / PDX_GROUP_COUNT;
223
1
    struct page_info *end_pg, *top_pg;
224
1
225
1
    BUILD_BUG_ON(XEN_VIRT_END > FRAMETABLE_VIRT_START);
226
1
    BUILD_BUG_ON(FRAMETABLE_VIRT_START & ((1UL << L2_PAGETABLE_SHIFT) - 1));
227
1
228
1
    for ( sidx = 0; ; sidx = nidx )
229
2
    {
230
2
        eidx = find_next_zero_bit(pdx_group_valid, max_idx, sidx);
231
2
        nidx = find_next_bit(pdx_group_valid, max_idx, eidx);
232
2
        if ( nidx >= max_idx )
233
1
            break;
234
1
        init_frametable_chunk(pdx_to_page(sidx * PDX_GROUP_COUNT),
235
1
                              pdx_to_page(eidx * PDX_GROUP_COUNT));
236
1
    }
237
1
238
1
    end_pg = pdx_to_page(max_pdx - 1) + 1;
239
0
    top_pg = mem_hotplug ? pdx_to_page(max_idx * PDX_GROUP_COUNT - 1) + 1
240
1
                         : end_pg;
241
1
    init_frametable_chunk(pdx_to_page(sidx * PDX_GROUP_COUNT), top_pg);
242
1
    memset(end_pg, -1, (unsigned long)top_pg - (unsigned long)end_pg);
243
1
}
244
245
#ifndef NDEBUG
246
static unsigned int __read_mostly root_pgt_pv_xen_slots
247
    = ROOT_PAGETABLE_PV_XEN_SLOTS;
248
static l4_pgentry_t __read_mostly split_l4e;
249
#else
250
#define root_pgt_pv_xen_slots ROOT_PAGETABLE_PV_XEN_SLOTS
251
#endif
252
253
void __init arch_init_memory(void)
254
1
{
255
1
    unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn;
256
1
257
1
    /*
258
1
     * Basic guest-accessible flags:
259
1
     *   PRESENT, R/W, USER, A/D, AVAIL[0,1,2], AVAIL_HIGH, NX (if available).
260
1
     */
261
1
    base_disallow_mask =
262
1
        ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED |
263
1
          _PAGE_DIRTY | _PAGE_AVAIL | _PAGE_AVAIL_HIGH | _PAGE_NX);
264
1
265
1
    /*
266
1
     * Initialise our DOMID_XEN domain.
267
1
     * Any Xen-heap pages that we will allow to be mapped will have
268
1
     * their domain field set to dom_xen.
269
1
     * Hidden PCI devices will also be associated with this domain
270
1
     * (but be [partly] controlled by Dom0 nevertheless).
271
1
     */
272
1
    dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0, NULL);
273
1
    BUG_ON(IS_ERR(dom_xen));
274
1
    INIT_LIST_HEAD(&dom_xen->arch.pdev_list);
275
1
276
1
    /*
277
1
     * Initialise our DOMID_IO domain.
278
1
     * This domain owns I/O pages that are within the range of the page_info
279
1
     * array. Mappings occur at the priv of the caller.
280
1
     */
281
1
    dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0, NULL);
282
1
    BUG_ON(IS_ERR(dom_io));
283
1
284
1
    /*
285
1
     * Initialise our COW domain.
286
1
     * This domain owns sharable pages.
287
1
     */
288
1
    dom_cow = domain_create(DOMID_COW, DOMCRF_dummy, 0, NULL);
289
1
    BUG_ON(IS_ERR(dom_cow));
290
1
291
1
    /* First 1MB of RAM is historically marked as I/O. */
292
257
    for ( i = 0; i < 0x100; i++ )
293
256
        share_xen_page_with_guest(mfn_to_page(_mfn(i)),
294
256
                                  dom_io, XENSHARE_writable);
295
1
296
1
    /* Any areas not specified as RAM by the e820 map are considered I/O. */
297
9
    for ( i = 0, pfn = 0; pfn < max_page; i++ )
298
8
    {
299
19
        while ( (i < e820.nr_map) &&
300
19
                (e820.map[i].type != E820_RAM) &&
301
11
                (e820.map[i].type != E820_UNUSABLE) )
302
11
            i++;
303
8
304
8
        if ( i >= e820.nr_map )
305
0
        {
306
0
            /* No more RAM regions: mark as I/O right to end of memory map. */
307
0
            rstart_pfn = rend_pfn = max_page;
308
0
        }
309
8
        else
310
8
        {
311
8
            /* Mark as I/O just up as far as next RAM region. */
312
8
            rstart_pfn = min_t(unsigned long, max_page,
313
8
                               PFN_UP(e820.map[i].addr));
314
8
            rend_pfn   = max_t(unsigned long, rstart_pfn,
315
8
                               PFN_DOWN(e820.map[i].addr + e820.map[i].size));
316
8
        }
317
8
318
8
        /*
319
8
         * Make sure any Xen mappings of RAM holes above 1MB are blown away.
320
8
         * In particular this ensures that RAM holes are respected even in
321
8
         * the statically-initialised 1-16MB mapping area.
322
8
         */
323
8
        iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT));
324
8
        ioend_pfn = min(rstart_pfn, 16UL << (20 - PAGE_SHIFT));
325
8
        if ( iostart_pfn < ioend_pfn )
326
0
            destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn),
327
0
                                 (unsigned long)mfn_to_virt(ioend_pfn));
328
8
329
8
        /* Mark as I/O up to next RAM region. */
330
473k
        for ( ; pfn < rstart_pfn; pfn++ )
331
473k
        {
332
473k
            if ( !mfn_valid(_mfn(pfn)) )
333
458k
                continue;
334
14.2k
            share_xen_page_with_guest(
335
14.2k
                mfn_to_page(_mfn(pfn)), dom_io, XENSHARE_writable);
336
14.2k
        }
337
8
338
8
        /* Skip the RAM region. */
339
8
        pfn = rend_pfn;
340
8
    }
341
1
342
1
    subarch_init_memory();
343
1
344
1
    efi_init_memory();
345
1
346
1
    mem_sharing_init();
347
1
348
1
#ifndef NDEBUG
349
1
    if ( highmem_start )
350
0
    {
351
0
        unsigned long split_va = (unsigned long)__va(highmem_start);
352
0
353
0
        if ( split_va < HYPERVISOR_VIRT_END &&
354
0
             split_va - 1 == (unsigned long)__va(highmem_start - 1) )
355
0
        {
356
0
            root_pgt_pv_xen_slots = l4_table_offset(split_va) -
357
0
                                    ROOT_PAGETABLE_FIRST_XEN_SLOT;
358
0
            ASSERT(root_pgt_pv_xen_slots < ROOT_PAGETABLE_PV_XEN_SLOTS);
359
0
            if ( l4_table_offset(split_va) == l4_table_offset(split_va - 1) )
360
0
            {
361
0
                l3_pgentry_t *l3tab = alloc_xen_pagetable();
362
0
363
0
                if ( l3tab )
364
0
                {
365
0
                    const l3_pgentry_t *l3idle =
366
0
                        l4e_to_l3e(idle_pg_table[l4_table_offset(split_va)]);
367
0
368
0
                    for ( i = 0; i < l3_table_offset(split_va); ++i )
369
0
                        l3tab[i] = l3idle[i];
370
0
                    for ( ; i < L3_PAGETABLE_ENTRIES; ++i )
371
0
                        l3tab[i] = l3e_empty();
372
0
                    split_l4e = l4e_from_pfn(virt_to_mfn(l3tab),
373
0
                                             __PAGE_HYPERVISOR_RW);
374
0
                }
375
0
                else
376
0
                    ++root_pgt_pv_xen_slots;
377
0
            }
378
0
        }
379
0
    }
380
1
#endif
381
1
}
382
383
int page_is_ram_type(unsigned long mfn, unsigned long mem_type)
384
85
{
385
85
    uint64_t maddr = pfn_to_paddr(mfn);
386
85
    int i;
387
85
388
1.20k
    for ( i = 0; i < e820.nr_map; i++ )
389
1.16k
    {
390
1.16k
        switch ( e820.map[i].type )
391
1.16k
        {
392
500
        case E820_RAM:
393
500
            if ( mem_type & RAM_TYPE_CONVENTIONAL )
394
8
                break;
395
492
            continue;
396
498
        case E820_RESERVED:
397
498
            if ( mem_type & RAM_TYPE_RESERVED )
398
330
                break;
399
168
            continue;
400
0
        case E820_UNUSABLE:
401
0
            if ( mem_type & RAM_TYPE_UNUSABLE )
402
0
                break;
403
0
            continue;
404
167
        case E820_ACPI:
405
167
        case E820_NVS:
406
167
            if ( mem_type & RAM_TYPE_ACPI )
407
40
                break;
408
127
            continue;
409
0
        default:
410
0
            /* unknown */
411
0
            continue;
412
1.16k
        }
413
1.16k
414
1.16k
        /* Test the range. */
415
378
        if ( (e820.map[i].addr <= maddr) &&
416
217
             ((e820.map[i].addr + e820.map[i].size) >= (maddr + PAGE_SIZE)) )
417
42
            return 1;
418
378
    }
419
85
420
43
    return 0;
421
85
}
422
423
unsigned long domain_get_maximum_gpfn(struct domain *d)
424
0
{
425
0
    if ( is_hvm_domain(d) )
426
0
        return p2m_get_hostp2m(d)->max_mapped_pfn;
427
0
    /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
428
0
    return (arch_get_max_pfn(d) ?: 1) - 1;
429
0
}
430
431
void share_xen_page_with_guest(
432
    struct page_info *page, struct domain *d, int readonly)
433
28.0k
{
434
28.0k
    if ( page_get_owner(page) == d )
435
107
        return;
436
28.0k
437
27.9k
    set_gpfn_from_mfn(mfn_x(page_to_mfn(page)), INVALID_M2P_ENTRY);
438
27.9k
439
27.9k
    spin_lock(&d->page_alloc_lock);
440
27.9k
441
27.9k
    /* The incremented type count pins as writable or read-only. */
442
27.9k
    page->u.inuse.type_info  = (readonly ? PGT_none : PGT_writable_page);
443
27.9k
    page->u.inuse.type_info |= PGT_validated | 1;
444
27.9k
445
27.9k
    page_set_owner(page, d);
446
27.9k
    smp_wmb(); /* install valid domain ptr before updating refcnt. */
447
27.9k
    ASSERT((page->count_info & ~PGC_xen_heap) == 0);
448
27.9k
449
27.9k
    /* Only add to the allocation list if the domain isn't dying. */
450
27.9k
    if ( !d->is_dying )
451
27.9k
    {
452
27.9k
        page->count_info |= PGC_xen_heap | PGC_allocated | 1;
453
27.9k
        if ( unlikely(d->xenheap_pages++ == 0) )
454
3
            get_knownalive_domain(d);
455
27.9k
        page_list_add_tail(page, &d->xenpage_list);
456
27.9k
    }
457
27.9k
458
27.9k
    spin_unlock(&d->page_alloc_lock);
459
27.9k
}
460
461
int __init unshare_xen_page_with_guest(struct page_info *page,
462
                                       struct domain *d)
463
149
{
464
149
    if ( page_get_owner(page) != d || !is_xen_heap_page(page) )
465
0
        return -EINVAL;
466
149
467
149
    if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
468
149
        put_page(page);
469
149
470
149
    /* Remove the owner and clear the flags. */
471
149
    page->u.inuse.type_info = 0;
472
149
    page_set_owner(page, NULL);
473
149
474
149
    return 0;
475
149
}
476
477
void share_xen_page_with_privileged_guests(
478
    struct page_info *page, int readonly)
479
13.3k
{
480
13.3k
    share_xen_page_with_guest(page, dom_xen, readonly);
481
13.3k
}
482
483
void free_shared_domheap_page(struct page_info *page)
484
0
{
485
0
    if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
486
0
        put_page(page);
487
0
    if ( !test_and_clear_bit(_PGC_xen_heap, &page->count_info) )
488
0
        ASSERT_UNREACHABLE();
489
0
    page->u.inuse.type_info = 0;
490
0
    page_set_owner(page, NULL);
491
0
    free_domheap_page(page);
492
0
}
493
494
void make_cr3(struct vcpu *v, mfn_t mfn)
495
12
{
496
12
    v->arch.cr3 = mfn_x(mfn) << PAGE_SHIFT;
497
12
}
498
499
void write_ptbase(struct vcpu *v)
500
39.5k
{
501
39.5k
    write_cr3(v->arch.cr3);
502
39.5k
}
503
504
/*
505
 * Should be called after CR3 is updated.
506
 *
507
 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
508
 * for HVM guests, arch.monitor_table and hvm's guest CR3.
509
 *
510
 * Update ref counts to shadow tables appropriately.
511
 */
512
void update_cr3(struct vcpu *v)
513
0
{
514
0
    mfn_t cr3_mfn;
515
0
516
0
    if ( paging_mode_enabled(v->domain) )
517
0
    {
518
0
        paging_update_cr3(v);
519
0
        return;
520
0
    }
521
0
522
0
    if ( !(v->arch.flags & TF_kernel_mode) )
523
0
        cr3_mfn = pagetable_get_mfn(v->arch.guest_table_user);
524
0
    else
525
0
        cr3_mfn = pagetable_get_mfn(v->arch.guest_table);
526
0
527
0
    make_cr3(v, cr3_mfn);
528
0
}
529
530
static inline void set_tlbflush_timestamp(struct page_info *page)
531
2
{
532
2
    /*
533
2
     * Record TLB information for flush later. We do not stamp page tables
534
2
     * when running in shadow mode:
535
2
     *  1. Pointless, since it's the shadow pt's which must be tracked.
536
2
     *  2. Shadow mode reuses this field for shadowed page tables to store
537
2
     *     flags info -- we don't want to conflict with that.
538
2
     */
539
2
    if ( !(page->count_info & PGC_page_table) ||
540
0
         !shadow_mode_enabled(page_get_owner(page)) )
541
2
        page_set_tlbflush_timestamp(page);
542
2
}
543
544
const char __section(".bss.page_aligned.const") __aligned(PAGE_SIZE)
545
    zero_page[PAGE_SIZE];
546
547
static void invalidate_shadow_ldt(struct vcpu *v, int flush)
548
0
{
549
0
    l1_pgentry_t *pl1e;
550
0
    unsigned int i;
551
0
    struct page_info *page;
552
0
553
0
    BUG_ON(unlikely(in_irq()));
554
0
555
0
    spin_lock(&v->arch.pv_vcpu.shadow_ldt_lock);
556
0
557
0
    if ( v->arch.pv_vcpu.shadow_ldt_mapcnt == 0 )
558
0
        goto out;
559
0
560
0
    v->arch.pv_vcpu.shadow_ldt_mapcnt = 0;
561
0
    pl1e = pv_ldt_ptes(v);
562
0
563
0
    for ( i = 0; i < 16; i++ )
564
0
    {
565
0
        if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) )
566
0
            continue;
567
0
        page = l1e_get_page(pl1e[i]);
568
0
        l1e_write(&pl1e[i], l1e_empty());
569
0
        ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page);
570
0
        ASSERT_PAGE_IS_DOMAIN(page, v->domain);
571
0
        put_page_and_type(page);
572
0
    }
573
0
574
0
    /* Rid TLBs of stale mappings (guest mappings and shadow mappings). */
575
0
    if ( flush )
576
0
        flush_tlb_mask(v->vcpu_dirty_cpumask);
577
0
578
0
 out:
579
0
    spin_unlock(&v->arch.pv_vcpu.shadow_ldt_lock);
580
0
}
581
582
583
static int alloc_segdesc_page(struct page_info *page)
584
0
{
585
0
    const struct domain *owner = page_get_owner(page);
586
0
    struct desc_struct *descs = __map_domain_page(page);
587
0
    unsigned i;
588
0
589
0
    for ( i = 0; i < 512; i++ )
590
0
        if ( unlikely(!check_descriptor(owner, &descs[i])) )
591
0
            break;
592
0
593
0
    unmap_domain_page(descs);
594
0
595
0
    return i == 512 ? 0 : -EINVAL;
596
0
}
597
598
static int get_page_and_type_from_mfn(
599
    mfn_t mfn, unsigned long type, struct domain *d,
600
    int partial, int preemptible)
601
0
{
602
0
    struct page_info *page = mfn_to_page(mfn);
603
0
    int rc;
604
0
605
0
    if ( likely(partial >= 0) &&
606
0
         unlikely(!get_page_from_mfn(mfn, d)) )
607
0
        return -EINVAL;
608
0
609
0
    rc = (preemptible ?
610
0
          get_page_type_preemptible(page, type) :
611
0
          (get_page_type(page, type) ? 0 : -EINVAL));
612
0
613
0
    if ( unlikely(rc) && partial >= 0 &&
614
0
         (!preemptible || page != current->arch.old_guest_table) )
615
0
        put_page(page);
616
0
617
0
    return rc;
618
0
}
619
620
static void put_data_page(
621
    struct page_info *page, int writeable)
622
0
{
623
0
    if ( writeable )
624
0
        put_page_and_type(page);
625
0
    else
626
0
        put_page(page);
627
0
}
628
629
static bool inc_linear_entries(struct page_info *pg)
630
0
{
631
0
    typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
632
0
633
0
    do {
634
0
        /*
635
0
         * The check below checks for the "linear use" count being non-zero
636
0
         * as well as overflow.  Signed integer overflow is undefined behavior
637
0
         * according to the C spec.  However, as long as linear_pt_count is
638
0
         * smaller in size than 'int', the arithmetic operation of the
639
0
         * increment below won't overflow; rather the result will be truncated
640
0
         * when stored.  Ensure that this is always true.
641
0
         */
642
0
        BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
643
0
        oc = nc++;
644
0
        if ( nc <= 0 )
645
0
            return false;
646
0
        nc = cmpxchg(&pg->linear_pt_count, oc, nc);
647
0
    } while ( oc != nc );
648
0
649
0
    return true;
650
0
}
651
652
static void dec_linear_entries(struct page_info *pg)
653
0
{
654
0
    typeof(pg->linear_pt_count) oc;
655
0
656
0
    oc = arch_fetch_and_add(&pg->linear_pt_count, -1);
657
0
    ASSERT(oc > 0);
658
0
}
659
660
static bool inc_linear_uses(struct page_info *pg)
661
0
{
662
0
    typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
663
0
664
0
    do {
665
0
        /* See the respective comment in inc_linear_entries(). */
666
0
        BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
667
0
        oc = nc--;
668
0
        if ( nc >= 0 )
669
0
            return false;
670
0
        nc = cmpxchg(&pg->linear_pt_count, oc, nc);
671
0
    } while ( oc != nc );
672
0
673
0
    return true;
674
0
}
675
676
static void dec_linear_uses(struct page_info *pg)
677
0
{
678
0
    typeof(pg->linear_pt_count) oc;
679
0
680
0
    oc = arch_fetch_and_add(&pg->linear_pt_count, 1);
681
0
    ASSERT(oc < 0);
682
0
}
683
684
/*
685
 * We allow root tables to map each other (a.k.a. linear page tables). It
686
 * needs some special care with reference counts and access permissions:
687
 *  1. The mapping entry must be read-only, or the guest may get write access
688
 *     to its own PTEs.
689
 *  2. We must only bump the reference counts for an *already validated*
690
 *     L2 table, or we can end up in a deadlock in get_page_type() by waiting
691
 *     on a validation that is required to complete that validation.
692
 *  3. We only need to increment the reference counts for the mapped page
693
 *     frame if it is mapped by a different root table. This is sufficient and
694
 *     also necessary to allow validation of a root table mapping itself.
695
 */
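 * To make rule 1 concrete, here is a hedged guest-side sketch of installing
 * such a linear mapping: one L4 slot is pointed back at the L4 page itself
 * with _PAGE_RW deliberately clear, so get_l4_linear_pagetable() will accept
 * it.  LINEAR_SLOT and the flag combination are illustrative placeholders,
 * and the mmu_update ABI is assumed as in the earlier sketch; this is not
 * part of mm.c or of the coverage data.

#define LINEAR_SLOT 510   /* any L4 slot the guest owns; placeholder value */

static int install_linear_mapping(xen_pfn_t l4_mfn)
{
    struct mmu_update req = {
        /* Machine address of the chosen slot within the L4 page itself. */
        .ptr = (((uint64_t)l4_mfn << PAGE_SHIFT) +
                LINEAR_SLOT * sizeof(uint64_t)) | MMU_NORMAL_PT_UPDATE,
        /* The entry points back at the same L4 page; _PAGE_RW must be clear
         * (rule 1 above), so only present/accessed-style flags are set. */
        .val = ((uint64_t)l4_mfn << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_ACCESSED,
    };

    return HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
}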
696
#define define_get_linear_pagetable(level)                                  \
697
static int                                                                  \
698
get_##level##_linear_pagetable(                                             \
699
0
    level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d)         \
700
0
{                                                                           \
701
0
    unsigned long x, y;                                                     \
702
0
    struct page_info *page;                                                 \
703
0
    unsigned long pfn;                                                      \
704
0
                                                                            \
705
0
    if ( (level##e_get_flags(pde) & _PAGE_RW) )                             \
706
0
    {                                                                       \
707
0
        gdprintk(XENLOG_WARNING,                                            \
708
0
                 "Attempt to create linear p.t. with write perms\n");       \
709
0
        return 0;                                                           \
710
0
    }                                                                       \
711
0
                                                                            \
712
0
    if ( (pfn = level##e_get_pfn(pde)) != pde_pfn )                         \
713
0
    {                                                                       \
714
0
        struct page_info *ptpg = mfn_to_page(_mfn(pde_pfn));                \
715
0
                                                                            \
716
0
        /* Make sure the page table belongs to the correct domain. */       \
717
0
        if ( unlikely(page_get_owner(ptpg) != d) )                          \
718
0
            return 0;                                                       \
719
0
                                                                            \
720
0
        /* Make sure the mapped frame belongs to the correct domain. */     \
721
0
        if ( unlikely(!get_page_from_mfn(_mfn(pfn), d)) )                   \
722
0
            return 0;                                                       \
723
0
                                                                            \
724
0
        /*                                                                  \
725
0
         * Ensure that the mapped frame is an already-validated page table  \
726
0
         * and is not itself having linear entries, as well as that the     \
727
0
         * containing page table is not itself in use as a linear page table \
728
0
         * elsewhere.                                                       \
729
0
         * If so, atomically increment the count (checking for overflow).   \
730
0
         */                                                                 \
731
0
        page = mfn_to_page(_mfn(pfn));                                      \
732
0
        if ( !inc_linear_entries(ptpg) )                                    \
733
0
        {                                                                   \
734
0
            put_page(page);                                                 \
735
0
            return 0;                                                       \
736
0
        }                                                                   \
737
0
        if ( !inc_linear_uses(page) )                                       \
738
0
        {                                                                   \
739
0
            dec_linear_entries(ptpg);                                       \
740
0
            put_page(page);                                                 \
741
0
            return 0;                                                       \
742
0
        }                                                                   \
743
0
        y = page->u.inuse.type_info;                                        \
744
0
        do {                                                                \
745
0
            x = y;                                                          \
746
0
            if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||        \
747
0
                 unlikely((x & (PGT_type_mask|PGT_validated)) !=            \
748
0
                          (PGT_##level##_page_table|PGT_validated)) )       \
749
0
            {                                                               \
750
0
                dec_linear_uses(page);                                      \
751
0
                dec_linear_entries(ptpg);                                   \
752
0
                put_page(page);                                             \
753
0
                return 0;                                                   \
754
0
            }                                                               \
755
0
        }                                                                   \
756
0
        while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );   \
757
0
    }                                                                       \
758
0
                                                                            \
759
0
    return 1;                                                               \
760
0
}
Unexecuted instantiation: mm.c:get_l2_linear_pagetable
Unexecuted instantiation: mm.c:get_l3_linear_pagetable
Unexecuted instantiation: mm.c:get_l4_linear_pagetable
761
762
763
bool is_iomem_page(mfn_t mfn)
764
0
{
765
0
    struct page_info *page;
766
0
767
0
    if ( !mfn_valid(mfn) )
768
0
        return true;
769
0
770
0
    /* Caller must know that it is an iomem page, or a reference is held. */
771
0
    page = mfn_to_page(mfn);
772
0
    ASSERT((page->count_info & PGC_count_mask) != 0);
773
0
774
0
    return (page_get_owner(page) == dom_io);
775
0
}
776
777
static int update_xen_mappings(unsigned long mfn, unsigned int cacheattr)
778
0
{
779
0
    int err = 0;
780
0
    bool alias = mfn >= PFN_DOWN(xen_phys_start) &&
781
0
         mfn < PFN_UP(xen_phys_start + xen_virt_end - XEN_VIRT_START);
782
0
    unsigned long xen_va =
783
0
        XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT);
784
0
785
0
    if ( unlikely(alias) && cacheattr )
786
0
        err = map_pages_to_xen(xen_va, mfn, 1, 0);
787
0
    if ( !err )
788
0
        err = map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
789
0
                     PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
790
0
    if ( unlikely(alias) && !cacheattr && !err )
791
0
        err = map_pages_to_xen(xen_va, mfn, 1, PAGE_HYPERVISOR);
792
0
    return err;
793
0
}
794
795
#ifndef NDEBUG
796
struct mmio_emul_range_ctxt {
797
    const struct domain *d;
798
    unsigned long mfn;
799
};
800
801
static int print_mmio_emul_range(unsigned long s, unsigned long e, void *arg)
802
0
{
803
0
    const struct mmio_emul_range_ctxt *ctxt = arg;
804
0
805
0
    if ( ctxt->mfn > e )
806
0
        return 0;
807
0
808
0
    if ( ctxt->mfn >= s )
809
0
    {
810
0
        static DEFINE_SPINLOCK(last_lock);
811
0
        static const struct domain *last_d;
812
0
        static unsigned long last_s = ~0UL, last_e;
813
0
        bool print = false;
814
0
815
0
        spin_lock(&last_lock);
816
0
        if ( last_d != ctxt->d || last_s != s || last_e != e )
817
0
        {
818
0
            last_d = ctxt->d;
819
0
            last_s = s;
820
0
            last_e = e;
821
0
            print = true;
822
0
        }
823
0
        spin_unlock(&last_lock);
824
0
825
0
        if ( print )
826
0
            printk(XENLOG_G_INFO
827
0
                   "d%d: Forcing write emulation on MFNs %lx-%lx\n",
828
0
                   ctxt->d->domain_id, s, e);
829
0
    }
830
0
831
0
    return 1;
832
0
}
833
#endif
834
835
/*
836
 * get_page_from_l1e returns:
837
 *   0  => success (page not present also counts as such)
838
 *  <0  => error code
839
 *  >0  => the page flags to be flipped
840
 */
841
int
842
get_page_from_l1e(
843
    l1_pgentry_t l1e, struct domain *l1e_owner, struct domain *pg_owner)
844
0
{
845
0
    unsigned long mfn = l1e_get_pfn(l1e);
846
0
    struct page_info *page = mfn_to_page(_mfn(mfn));
847
0
    uint32_t l1f = l1e_get_flags(l1e);
848
0
    struct vcpu *curr = current;
849
0
    struct domain *real_pg_owner;
850
0
    bool write;
851
0
852
0
    if ( !(l1f & _PAGE_PRESENT) )
853
0
        return 0;
854
0
855
0
    if ( unlikely(l1f & l1_disallow_mask(l1e_owner)) )
856
0
    {
857
0
        gdprintk(XENLOG_WARNING, "Bad L1 flags %x\n",
858
0
                 l1f & l1_disallow_mask(l1e_owner));
859
0
        return -EINVAL;
860
0
    }
861
0
862
0
    if ( !mfn_valid(_mfn(mfn)) ||
863
0
         (real_pg_owner = page_get_owner_and_reference(page)) == dom_io )
864
0
    {
865
0
        int flip = 0;
866
0
867
0
        /* Only needed the reference to confirm dom_io ownership. */
868
0
        if ( mfn_valid(_mfn(mfn)) )
869
0
            put_page(page);
870
0
871
0
        /* DOMID_IO reverts to caller for privilege checks. */
872
0
        if ( pg_owner == dom_io )
873
0
            pg_owner = curr->domain;
874
0
875
0
        if ( !iomem_access_permitted(pg_owner, mfn, mfn) )
876
0
        {
877
0
            if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
878
0
            {
879
0
                gdprintk(XENLOG_WARNING,
880
0
                         "d%d non-privileged attempt to map MMIO space %"PRI_mfn"\n",
881
0
                         pg_owner->domain_id, mfn);
882
0
                return -EPERM;
883
0
            }
884
0
            return -EINVAL;
885
0
        }
886
0
887
0
        if ( pg_owner != l1e_owner &&
888
0
             !iomem_access_permitted(l1e_owner, mfn, mfn) )
889
0
        {
890
0
            if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
891
0
            {
892
0
                gdprintk(XENLOG_WARNING,
893
0
                         "d%d attempted to map MMIO space %"PRI_mfn" in d%d to d%d\n",
894
0
                         curr->domain->domain_id, mfn, pg_owner->domain_id,
895
0
                         l1e_owner->domain_id);
896
0
                return -EPERM;
897
0
            }
898
0
            return -EINVAL;
899
0
        }
900
0
901
0
        if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
902
0
        {
903
0
            /* MMIO pages must not be mapped cacheable unless explicitly requested. */
904
0
            switch ( opt_mmio_relax )
905
0
            {
906
0
            case 0:
907
0
                break;
908
0
            case 1:
909
0
                if ( !is_hardware_domain(l1e_owner) )
910
0
                    break;
911
0
                /* fallthrough */
912
0
            case -1:
913
0
                return 0;
914
0
            default:
915
0
                ASSERT_UNREACHABLE();
916
0
            }
917
0
        }
918
0
        else if ( l1f & _PAGE_RW )
919
0
        {
920
0
#ifndef NDEBUG
921
0
            const unsigned long *ro_map;
922
0
            unsigned int seg, bdf;
923
0
924
0
            if ( !pci_mmcfg_decode(mfn, &seg, &bdf) ||
925
0
                 ((ro_map = pci_get_ro_map(seg)) != NULL &&
926
0
                  test_bit(bdf, ro_map)) )
927
0
                printk(XENLOG_G_WARNING
928
0
                       "d%d: Forcing read-only access to MFN %lx\n",
929
0
                       l1e_owner->domain_id, mfn);
930
0
            else
931
0
                rangeset_report_ranges(mmio_ro_ranges, 0, ~0UL,
932
0
                                       print_mmio_emul_range,
933
0
                                       &(struct mmio_emul_range_ctxt){
934
0
                                           .d = l1e_owner,
935
0
                                           .mfn = mfn });
936
0
#endif
937
0
            flip = _PAGE_RW;
938
0
        }
939
0
940
0
        switch ( l1f & PAGE_CACHE_ATTRS )
941
0
        {
942
0
        case 0: /* WB */
943
0
            flip |= _PAGE_PWT | _PAGE_PCD;
944
0
            break;
945
0
        case _PAGE_PWT: /* WT */
946
0
        case _PAGE_PWT | _PAGE_PAT: /* WP */
947
0
            flip |= _PAGE_PCD | (l1f & _PAGE_PAT);
948
0
            break;
949
0
        }
950
0
951
0
        return flip;
952
0
    }
953
0
954
0
    if ( unlikely( (real_pg_owner != pg_owner) &&
955
0
                   (real_pg_owner != dom_cow) ) )
956
0
    {
957
0
        /*
958
0
         * Let privileged domains transfer the right to map their target
959
0
         * domain's pages. This is used to allow stub-domain pvfb export to
960
0
         * dom0, until pvfb supports granted mappings. At that time this
961
0
         * minor hack can go away.
962
0
         */
963
0
        if ( (real_pg_owner == NULL) || (pg_owner == l1e_owner) ||
964
0
             xsm_priv_mapping(XSM_TARGET, pg_owner, real_pg_owner) )
965
0
        {
966
0
            gdprintk(XENLOG_WARNING,
967
0
                     "pg_owner d%d l1e_owner d%d, but real_pg_owner d%d\n",
968
0
                     pg_owner->domain_id, l1e_owner->domain_id,
969
0
                     real_pg_owner ? real_pg_owner->domain_id : -1);
970
0
            goto could_not_pin;
971
0
        }
972
0
        pg_owner = real_pg_owner;
973
0
    }
974
0
975
0
    /*
976
0
     * Extra paranoid check for shared memory. Writable mappings
977
0
     * disallowed (unshare first!)
978
0
     */
979
0
    if ( (l1f & _PAGE_RW) && (real_pg_owner == dom_cow) )
980
0
        goto could_not_pin;
981
0
982
0
    /*
983
0
     * Foreign mappings into guests in shadow external mode don't
984
0
     * contribute to writeable mapping refcounts.  (This allows the
985
0
     * qemu-dm helper process in dom0 to map the domain's memory without
986
0
     * messing up the count of "real" writable mappings.)
987
0
     */
988
0
    write = (l1f & _PAGE_RW) &&
989
0
            ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner));
990
0
    if ( write && !get_page_type(page, PGT_writable_page) )
991
0
    {
992
0
        gdprintk(XENLOG_WARNING, "Could not get page type PGT_writable_page\n");
993
0
        goto could_not_pin;
994
0
    }
995
0
996
0
    if ( pte_flags_to_cacheattr(l1f) !=
997
0
         ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) )
998
0
    {
999
0
        unsigned long x, nx, y = page->count_info;
1000
0
        unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
1001
0
        int err;
1002
0
1003
0
        if ( is_xen_heap_page(page) )
1004
0
        {
1005
0
            if ( write )
1006
0
                put_page_type(page);
1007
0
            put_page(page);
1008
0
            gdprintk(XENLOG_WARNING,
1009
0
                     "Attempt to change cache attributes of Xen heap page\n");
1010
0
            return -EACCES;
1011
0
        }
1012
0
1013
0
        do {
1014
0
            x  = y;
1015
0
            nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
1016
0
        } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
1017
0
1018
0
        err = update_xen_mappings(mfn, cacheattr);
1019
0
        if ( unlikely(err) )
1020
0
        {
1021
0
            cacheattr = y & PGC_cacheattr_mask;
1022
0
            do {
1023
0
                x  = y;
1024
0
                nx = (x & ~PGC_cacheattr_mask) | cacheattr;
1025
0
            } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
1026
0
1027
0
            if ( write )
1028
0
                put_page_type(page);
1029
0
            put_page(page);
1030
0
1031
0
            gdprintk(XENLOG_WARNING, "Error updating mappings for mfn %" PRI_mfn
1032
0
                     " (pfn %" PRI_pfn ", from L1 entry %" PRIpte ") for d%d\n",
1033
0
                     mfn, get_gpfn_from_mfn(mfn),
1034
0
                     l1e_get_intpte(l1e), l1e_owner->domain_id);
1035
0
            return err;
1036
0
        }
1037
0
    }
1038
0
1039
0
    return 0;
1040
0
1041
0
 could_not_pin:
1042
0
    gdprintk(XENLOG_WARNING, "Error getting mfn %" PRI_mfn " (pfn %" PRI_pfn
1043
0
             ") from L1 entry %" PRIpte " for l1e_owner d%d, pg_owner d%d\n",
1044
0
             mfn, get_gpfn_from_mfn(mfn),
1045
0
             l1e_get_intpte(l1e), l1e_owner->domain_id, pg_owner->domain_id);
1046
0
    if ( real_pg_owner != NULL )
1047
0
        put_page(page);
1048
0
    return -EBUSY;
1049
0
}
1050
1051
1052
/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
1053
/*
1054
 * get_page_from_l2e returns:
1055
 *   1 => page not present
1056
 *   0 => success
1057
 *  <0 => error code
1058
 */
1059
define_get_linear_pagetable(l2);
1060
static int
1061
get_page_from_l2e(
1062
    l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
1063
0
{
1064
0
    unsigned long mfn = l2e_get_pfn(l2e);
1065
0
    int rc;
1066
0
1067
0
    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
1068
0
        return 1;
1069
0
1070
0
    if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
1071
0
    {
1072
0
        gdprintk(XENLOG_WARNING, "Bad L2 flags %x\n",
1073
0
                 l2e_get_flags(l2e) & L2_DISALLOW_MASK);
1074
0
        return -EINVAL;
1075
0
    }
1076
0
1077
0
    if ( !(l2e_get_flags(l2e) & _PAGE_PSE) )
1078
0
    {
1079
0
        rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, 0, 0);
1080
0
        if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
1081
0
            rc = 0;
1082
0
        return rc;
1083
0
    }
1084
0
1085
0
    return -EINVAL;
1086
0
}
1087
1088
1089
/*
1090
 * get_page_from_l3e returns:
1091
 *   1 => page not present
1092
 *   0 => success
1093
 *  <0 => error code
1094
 */
1095
define_get_linear_pagetable(l3);
1096
static int
1097
get_page_from_l3e(
1098
    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial)
1099
0
{
1100
0
    int rc;
1101
0
1102
0
    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1103
0
        return 1;
1104
0
1105
0
    if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
1106
0
    {
1107
0
        gdprintk(XENLOG_WARNING, "Bad L3 flags %x\n",
1108
0
                 l3e_get_flags(l3e) & l3_disallow_mask(d));
1109
0
        return -EINVAL;
1110
0
    }
1111
0
1112
0
    rc = get_page_and_type_from_mfn(
1113
0
        l3e_get_mfn(l3e), PGT_l2_page_table, d, partial, 1);
1114
0
    if ( unlikely(rc == -EINVAL) &&
1115
0
         !is_pv_32bit_domain(d) &&
1116
0
         get_l3_linear_pagetable(l3e, pfn, d) )
1117
0
        rc = 0;
1118
0
1119
0
    return rc;
1120
0
}
1121
1122
/*
1123
 * get_page_from_l4e returns:
1124
 *   1 => page not present
1125
 *   0 => success
1126
 *  <0 => error code
1127
 */
1128
define_get_linear_pagetable(l4);
1129
static int
1130
get_page_from_l4e(
1131
    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial)
1132
0
{
1133
0
    int rc;
1134
0
1135
0
    if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
1136
0
        return 1;
1137
0
1138
0
    if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
1139
0
    {
1140
0
        gdprintk(XENLOG_WARNING, "Bad L4 flags %x\n",
1141
0
                 l4e_get_flags(l4e) & L4_DISALLOW_MASK);
1142
0
        return -EINVAL;
1143
0
    }
1144
0
1145
0
    rc = get_page_and_type_from_mfn(
1146
0
        l4e_get_mfn(l4e), PGT_l3_page_table, d, partial, 1);
1147
0
    if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
1148
0
        rc = 0;
1149
0
1150
0
    return rc;
1151
0
}
1152
1153
static int _put_page_type(struct page_info *page, bool preemptible,
1154
                          struct page_info *ptpg);
1155
1156
void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
1157
0
{
1158
0
    unsigned long     pfn = l1e_get_pfn(l1e);
1159
0
    struct page_info *page;
1160
0
    struct domain    *pg_owner;
1161
0
    struct vcpu      *v;
1162
0
1163
0
    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(_mfn(pfn)) )
1164
0
        return;
1165
0
1166
0
    page = mfn_to_page(_mfn(pfn));
1167
0
    pg_owner = page_get_owner(page);
1168
0
1169
0
    /*
1170
0
     * Check if this is a mapping that was established via a grant reference.
1171
0
     * If it was then we should not be here: we require that such mappings are
1172
0
     * explicitly destroyed via the grant-table interface.
1173
0
     *
1174
0
     * The upshot of this is that the guest can end up with active grants that
1175
0
     * it cannot destroy (because it no longer has a PTE to present to the
1176
0
     * grant-table interface). This can lead to subtle hard-to-catch bugs,
1177
0
     * hence a special grant PTE flag can be enabled to catch the bug early.
1178
0
     *
1179
0
     * (Note that the undestroyable active grants are not a security hole in
1180
0
     * Xen. All active grants can safely be cleaned up when the domain dies.)
1181
0
     */
1182
0
    if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
1183
0
         !l1e_owner->is_shutting_down && !l1e_owner->is_dying )
1184
0
    {
1185
0
        gdprintk(XENLOG_WARNING,
1186
0
                 "Attempt to implicitly unmap a granted PTE %" PRIpte "\n",
1187
0
                 l1e_get_intpte(l1e));
1188
0
        domain_crash(l1e_owner);
1189
0
    }
1190
0
1191
0
    /*
1192
0
     * Remember we didn't take a type-count of foreign writable mappings
1193
0
     * to paging-external domains.
1194
0
     */
1195
0
    if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
1196
0
         ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) )
1197
0
    {
1198
0
        put_page_and_type(page);
1199
0
    }
1200
0
    else
1201
0
    {
1202
0
        /* We expect this is rare so we blow the entire shadow LDT. */
1203
0
        if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
1204
0
                       PGT_seg_desc_page)) &&
1205
0
             unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
1206
0
             (l1e_owner == pg_owner) )
1207
0
        {
1208
0
            for_each_vcpu ( pg_owner, v )
1209
0
                invalidate_shadow_ldt(v, 1);
1210
0
        }
1211
0
        put_page(page);
1212
0
    }
1213
0
}
1214
1215
1216
/*
1217
 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
1218
 * Note also that this automatically deals correctly with linear p.t.'s.
1219
 */
1220
static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
1221
0
{
1222
0
    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
1223
0
        return 1;
1224
0
1225
0
    if ( l2e_get_flags(l2e) & _PAGE_PSE )
1226
0
    {
1227
0
        struct page_info *page = l2e_get_page(l2e);
1228
0
        unsigned int i;
1229
0
1230
0
        for ( i = 0; i < (1u << PAGETABLE_ORDER); i++, page++ )
1231
0
            put_page_and_type(page);
1232
0
    }
1233
0
    else
1234
0
    {
1235
0
        struct page_info *pg = l2e_get_page(l2e);
1236
0
        int rc = _put_page_type(pg, false, mfn_to_page(_mfn(pfn)));
1237
0
1238
0
        ASSERT(!rc);
1239
0
        put_page(pg);
1240
0
    }
1241
0
1242
0
    return 0;
1243
0
}
1244
1245
static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
1246
                             int partial, bool defer)
1247
0
{
1248
0
    struct page_info *pg;
1249
0
    int rc;
1250
0
1251
0
    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
1252
0
        return 1;
1253
0
1254
0
    if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) )
1255
0
    {
1256
0
        unsigned long mfn = l3e_get_pfn(l3e);
1257
0
        int writeable = l3e_get_flags(l3e) & _PAGE_RW;
1258
0
1259
0
        ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
1260
0
        do {
1261
0
            put_data_page(mfn_to_page(_mfn(mfn)), writeable);
1262
0
        } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
1263
0
1264
0
        return 0;
1265
0
    }
1266
0
1267
0
    pg = l3e_get_page(l3e);
1268
0
1269
0
    if ( unlikely(partial > 0) )
1270
0
    {
1271
0
        ASSERT(!defer);
1272
0
        return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
1273
0
    }
1274
0
1275
0
    if ( defer )
1276
0
    {
1277
0
        current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
1278
0
        current->arch.old_guest_table = pg;
1279
0
        return 0;
1280
0
    }
1281
0
1282
0
    rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
1283
0
    if ( likely(!rc) )
1284
0
        put_page(pg);
1285
0
1286
0
    return rc;
1287
0
}
1288
1289
static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
1290
                             int partial, bool defer)
1291
0
{
1292
0
    int rc = 1;
1293
0
1294
0
    if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
1295
0
         (l4e_get_pfn(l4e) != pfn) )
1296
0
    {
1297
0
        struct page_info *pg = l4e_get_page(l4e);
1298
0
1299
0
        if ( unlikely(partial > 0) )
1300
0
        {
1301
0
            ASSERT(!defer);
1302
0
            return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
1303
0
        }
1304
0
1305
0
        if ( defer )
1306
0
        {
1307
0
            current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
1308
0
            current->arch.old_guest_table = pg;
1309
0
            return 0;
1310
0
        }
1311
0
1312
0
        rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
1313
0
        if ( likely(!rc) )
1314
0
            put_page(pg);
1315
0
    }
1316
0
1317
0
    return rc;
1318
0
}
1319
1320
static int alloc_l1_table(struct page_info *page)
1321
0
{
1322
0
    struct domain *d = page_get_owner(page);
1323
0
    l1_pgentry_t  *pl1e;
1324
0
    unsigned int   i;
1325
0
    int            ret = 0;
1326
0
1327
0
    pl1e = __map_domain_page(page);
1328
0
1329
0
    for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1330
0
    {
1331
0
        switch ( ret = get_page_from_l1e(pl1e[i], d, d) )
1332
0
        {
1333
0
        default:
1334
0
            goto fail;
1335
0
        case 0:
1336
0
            break;
1337
0
        case _PAGE_RW ... _PAGE_RW | PAGE_CACHE_ATTRS:
1338
0
            ASSERT(!(ret & ~(_PAGE_RW | PAGE_CACHE_ATTRS)));
1339
0
            l1e_flip_flags(pl1e[i], ret);
1340
0
            break;
1341
0
        }
1342
0
1343
0
        pl1e[i] = adjust_guest_l1e(pl1e[i], d);
1344
0
    }
1345
0
1346
0
    unmap_domain_page(pl1e);
1347
0
    return 0;
1348
0
1349
0
 fail:
1350
0
    gdprintk(XENLOG_WARNING, "Failure in alloc_l1_table: slot %#x\n", i);
1351
0
    while ( i-- > 0 )
1352
0
        put_page_from_l1e(pl1e[i], d);
1353
0
1354
0
    unmap_domain_page(pl1e);
1355
0
    return ret;
1356
0
}
1357
1358
static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
1359
0
{
1360
0
    struct page_info *page;
1361
0
    l3_pgentry_t     l3e3;
1362
0
1363
0
    if ( !is_pv_32bit_domain(d) )
1364
0
        return 1;
1365
0
1366
0
    pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
1367
0
1368
0
    /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
1369
0
    l3e3 = pl3e[3];
1370
0
    if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
1371
0
    {
1372
0
        gdprintk(XENLOG_WARNING, "PAE L3 3rd slot is empty\n");
1373
0
        return 0;
1374
0
    }
1375
0
1376
0
    /*
1377
0
     * The Xen-private mappings include linear mappings. The L2 thus cannot
1378
0
     * be shared by multiple L3 tables. The test here is adequate because:
1379
0
     *  1. Cannot appear in slots != 3 because get_page_type() checks the
1380
0
     *     PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
1381
0
     *  2. Cannot appear in another page table's L3:
1382
0
     *     a. alloc_l3_table() calls this function and this check will fail
1383
0
     *     b. mod_l3_entry() disallows updates to slot 3 in an existing table
1384
0
     */
1385
0
    page = l3e_get_page(l3e3);
1386
0
    BUG_ON(page->u.inuse.type_info & PGT_pinned);
1387
0
    BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1388
0
    BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1389
0
    if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1390
0
    {
1391
0
        gdprintk(XENLOG_WARNING, "PAE L3 3rd slot is shared\n");
1392
0
        return 0;
1393
0
    }
1394
0
1395
0
    return 1;
1396
0
}
1397
1398
static int alloc_l2_table(struct page_info *page, unsigned long type,
1399
                          int preemptible)
1400
0
{
1401
0
    struct domain *d = page_get_owner(page);
1402
0
    unsigned long  pfn = mfn_x(page_to_mfn(page));
1403
0
    l2_pgentry_t  *pl2e;
1404
0
    unsigned int   i;
1405
0
    int            rc = 0;
1406
0
1407
0
    pl2e = map_domain_page(_mfn(pfn));
1408
0
1409
0
    for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
1410
0
    {
1411
0
        if ( preemptible && i > page->nr_validated_ptes
1412
0
             && hypercall_preempt_check() )
1413
0
        {
1414
0
            page->nr_validated_ptes = i;
1415
0
            rc = -ERESTART;
1416
0
            break;
1417
0
        }
1418
0
1419
0
        if ( !is_guest_l2_slot(d, type, i) ||
1420
0
             (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
1421
0
            continue;
1422
0
1423
0
        if ( rc < 0 )
1424
0
        {
1425
0
            gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i);
1426
0
            while ( i-- > 0 )
1427
0
                if ( is_guest_l2_slot(d, type, i) )
1428
0
                    put_page_from_l2e(pl2e[i], pfn);
1429
0
            break;
1430
0
        }
1431
0
1432
0
        pl2e[i] = adjust_guest_l2e(pl2e[i], d);
1433
0
    }
1434
0
1435
0
    if ( rc >= 0 && (type & PGT_pae_xen_l2) )
1436
0
        init_xen_pae_l2_slots(pl2e, d);
1437
0
1438
0
    unmap_domain_page(pl2e);
1439
0
    return rc > 0 ? 0 : rc;
1440
0
}
1441
1442
static int alloc_l3_table(struct page_info *page)
1443
0
{
1444
0
    struct domain *d = page_get_owner(page);
1445
0
    unsigned long  pfn = mfn_x(page_to_mfn(page));
1446
0
    l3_pgentry_t  *pl3e;
1447
0
    unsigned int   i;
1448
0
    int            rc = 0, partial = page->partial_pte;
1449
0
1450
0
    pl3e = map_domain_page(_mfn(pfn));
1451
0
1452
0
    /*
1453
0
     * PAE guests allocate full pages, but aren't required to initialize
1454
0
     * more than the first four entries; when running in compatibility
1455
0
     * mode, however, the full page is visible to the MMU, and hence all
1456
0
     * 512 entries must be valid/verified, which is most easily achieved
1457
0
     * by clearing them out.
1458
0
     */
1459
0
    if ( is_pv_32bit_domain(d) )
1460
0
        memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1461
0
1462
0
    for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
1463
0
          i++, partial = 0 )
1464
0
    {
1465
0
        if ( is_pv_32bit_domain(d) && (i == 3) )
1466
0
        {
1467
0
            if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1468
0
                 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
1469
0
                rc = -EINVAL;
1470
0
            else
1471
0
                rc = get_page_and_type_from_mfn(
1472
0
                    l3e_get_mfn(pl3e[i]),
1473
0
                    PGT_l2_page_table | PGT_pae_xen_l2, d, partial, 1);
1474
0
        }
1475
0
        else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial)) > 0 )
1476
0
            continue;
1477
0
1478
0
        if ( rc == -ERESTART )
1479
0
        {
1480
0
            page->nr_validated_ptes = i;
1481
0
            page->partial_pte = partial ?: 1;
1482
0
        }
1483
0
        else if ( rc == -EINTR && i )
1484
0
        {
1485
0
            page->nr_validated_ptes = i;
1486
0
            page->partial_pte = 0;
1487
0
            rc = -ERESTART;
1488
0
        }
1489
0
        if ( rc < 0 )
1490
0
            break;
1491
0
1492
0
        pl3e[i] = adjust_guest_l3e(pl3e[i], d);
1493
0
    }
1494
0
1495
0
    if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
1496
0
        rc = -EINVAL;
1497
0
    if ( rc < 0 && rc != -ERESTART && rc != -EINTR )
1498
0
    {
1499
0
        gdprintk(XENLOG_WARNING, "Failure in alloc_l3_table: slot %#x\n", i);
1500
0
        if ( i )
1501
0
        {
1502
0
            page->nr_validated_ptes = i;
1503
0
            page->partial_pte = 0;
1504
0
            current->arch.old_guest_ptpg = NULL;
1505
0
            current->arch.old_guest_table = page;
1506
0
        }
1507
0
        while ( i-- > 0 )
1508
0
            pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
1509
0
    }
1510
0
1511
0
    unmap_domain_page(pl3e);
1512
0
    return rc > 0 ? 0 : rc;
1513
0
}
1514
1515
void init_xen_pae_l2_slots(l2_pgentry_t *l2t, const struct domain *d)
1516
0
{
1517
0
    memcpy(&l2t[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1518
0
           &compat_idle_pg_table_l2[
1519
0
               l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1520
0
           COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2t));
1521
0
}
1522
1523
/*
1524
 * Fill an L4 with Xen entries.
1525
 *
1526
 * This function must write all ROOT_PAGETABLE_PV_XEN_SLOTS, to clobber any
1527
 * values a guest may have left there from alloc_l4_table().
1528
 *
1529
 * l4t and l4mfn are mandatory, but l4mfn doesn't need to be the mfn under
1530
 * *l4t.  All other parameters are optional and will either fill or zero the
1531
 * appropriate slots.  Pagetables not shared with guests will gain the
1532
 * extended directmap.
1533
 */
1534
void init_xen_l4_slots(l4_pgentry_t *l4t, mfn_t l4mfn,
1535
                       const struct domain *d, mfn_t sl4mfn, bool ro_mpt)
1536
12
{
1537
12
    /*
1538
12
     * PV vcpus need a shortened directmap.  HVM and Idle vcpus get the full
1539
12
     * directmap.
1540
12
     */
1541
12
    bool short_directmap = d && !paging_mode_external(d);
1542
12
1543
12
    /* Slot 256: RO M2P (if applicable). */
1544
12
    l4t[l4_table_offset(RO_MPT_VIRT_START)] =
1545
0
        ro_mpt ? idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]
1546
12
               : l4e_empty();
1547
12
1548
12
    /* Slot 257: PCI MMCFG. */
1549
12
    l4t[l4_table_offset(PCI_MCFG_VIRT_START)] =
1550
12
        idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)];
1551
12
1552
12
    /* Slot 258: Self linear mappings. */
1553
12
    ASSERT(!mfn_eq(l4mfn, INVALID_MFN));
1554
12
    l4t[l4_table_offset(LINEAR_PT_VIRT_START)] =
1555
12
        l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW);
1556
12
1557
12
    /* Slot 259: Shadow linear mappings (if applicable). */
1558
12
    l4t[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1559
12
        mfn_eq(sl4mfn, INVALID_MFN) ? l4e_empty() :
1560
12
        l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR_RW);
1561
12
1562
12
    /* Slot 260: Per-domain mappings (if applicable). */
1563
12
    l4t[l4_table_offset(PERDOMAIN_VIRT_START)] =
1564
12
        d ? l4e_from_page(d->arch.perdomain_l3_pg, __PAGE_HYPERVISOR_RW)
1565
0
          : l4e_empty();
1566
12
1567
12
    /* Slot 261-: text/data/bss, RW M2P, vmap, frametable, directmap. */
1568
12
#ifndef NDEBUG
1569
12
    if ( short_directmap &&
1570
0
         unlikely(root_pgt_pv_xen_slots < ROOT_PAGETABLE_PV_XEN_SLOTS) )
1571
0
    {
1572
0
        /*
1573
0
         * If using highmem-start=, artificially shorten the directmap to
1574
0
         * simulate very large machines.
1575
0
         */
1576
0
        l4_pgentry_t *next;
1577
0
1578
0
        memcpy(&l4t[l4_table_offset(XEN_VIRT_START)],
1579
0
               &idle_pg_table[l4_table_offset(XEN_VIRT_START)],
1580
0
               (ROOT_PAGETABLE_FIRST_XEN_SLOT + root_pgt_pv_xen_slots -
1581
0
                l4_table_offset(XEN_VIRT_START)) * sizeof(*l4t));
1582
0
1583
0
        next = &l4t[ROOT_PAGETABLE_FIRST_XEN_SLOT + root_pgt_pv_xen_slots];
1584
0
1585
0
        if ( l4e_get_intpte(split_l4e) )
1586
0
            *next++ = split_l4e;
1587
0
1588
0
        memset(next, 0,
1589
0
               _p(&l4t[ROOT_PAGETABLE_LAST_XEN_SLOT + 1]) - _p(next));
1590
0
    }
1591
12
    else
1592
12
#endif
1593
12
    {
1594
12
        unsigned int slots = (short_directmap
1595
0
                              ? ROOT_PAGETABLE_PV_XEN_SLOTS
1596
12
                              : ROOT_PAGETABLE_XEN_SLOTS);
1597
12
1598
12
        memcpy(&l4t[l4_table_offset(XEN_VIRT_START)],
1599
12
               &idle_pg_table[l4_table_offset(XEN_VIRT_START)],
1600
12
               (ROOT_PAGETABLE_FIRST_XEN_SLOT + slots -
1601
12
                l4_table_offset(XEN_VIRT_START)) * sizeof(*l4t));
1602
12
    }
1603
12
}
1604
1605
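init_xen_l4_slots() above fills L4 entries by slot number (256 for the RO M2P, 258 for the linear mappings, and so on). On x86-64 with 4-level paging, the slot for a virtual address is simply bits 47:39 of that address, which is what an l4_table_offset()-style helper evaluates to. A standalone sketch follows; the example addresses are only assumed to be representative of the areas named in the comments.

#include <stdio.h>
#include <stdint.h>

/* 4-level x86-64 paging: 9 index bits per level, L4 index at bits 47:39. */
#define TOY_L4_SHIFT       39
#define TOY_PTES_PER_TABLE 512

static unsigned int toy_l4_table_offset(uint64_t va)
{
    return (va >> TOY_L4_SHIFT) & (TOY_PTES_PER_TABLE - 1);
}

int main(void)
{
    uint64_t ro_mpt_like = 0xffff800000000000ULL;  /* assumed RO M2P-like address */
    uint64_t linear_like = 0xffff810000000000ULL;  /* assumed linear-PT-like address */

    printf("slot %u\n", toy_l4_table_offset(ro_mpt_like));   /* 256 */
    printf("slot %u\n", toy_l4_table_offset(linear_like));   /* 258 */
    return 0;
}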
bool fill_ro_mpt(mfn_t mfn)
1606
0
{
1607
0
    l4_pgentry_t *l4tab = map_domain_page(mfn);
1608
0
    bool ret = false;
1609
0
1610
0
    if ( !l4e_get_intpte(l4tab[l4_table_offset(RO_MPT_VIRT_START)]) )
1611
0
    {
1612
0
        l4tab[l4_table_offset(RO_MPT_VIRT_START)] =
1613
0
            idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)];
1614
0
        ret = true;
1615
0
    }
1616
0
    unmap_domain_page(l4tab);
1617
0
1618
0
    return ret;
1619
0
}
1620
1621
void zap_ro_mpt(mfn_t mfn)
1622
0
{
1623
0
    l4_pgentry_t *l4tab = map_domain_page(mfn);
1624
0
1625
0
    l4tab[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty();
1626
0
    unmap_domain_page(l4tab);
1627
0
}
1628
1629
static int alloc_l4_table(struct page_info *page)
1630
0
{
1631
0
    struct domain *d = page_get_owner(page);
1632
0
    unsigned long  pfn = mfn_x(page_to_mfn(page));
1633
0
    l4_pgentry_t  *pl4e = map_domain_page(_mfn(pfn));
1634
0
    unsigned int   i;
1635
0
    int            rc = 0, partial = page->partial_pte;
1636
0
1637
0
    for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
1638
0
          i++, partial = 0 )
1639
0
    {
1640
0
        if ( !is_guest_l4_slot(d, i) ||
1641
0
             (rc = get_page_from_l4e(pl4e[i], pfn, d, partial)) > 0 )
1642
0
            continue;
1643
0
1644
0
        if ( rc == -ERESTART )
1645
0
        {
1646
0
            page->nr_validated_ptes = i;
1647
0
            page->partial_pte = partial ?: 1;
1648
0
        }
1649
0
        else if ( rc < 0 )
1650
0
        {
1651
0
            if ( rc != -EINTR )
1652
0
                gdprintk(XENLOG_WARNING,
1653
0
                         "Failure in alloc_l4_table: slot %#x\n", i);
1654
0
            if ( i )
1655
0
            {
1656
0
                page->nr_validated_ptes = i;
1657
0
                page->partial_pte = 0;
1658
0
                if ( rc == -EINTR )
1659
0
                    rc = -ERESTART;
1660
0
                else
1661
0
                {
1662
0
                    if ( current->arch.old_guest_table )
1663
0
                        page->nr_validated_ptes++;
1664
0
                    current->arch.old_guest_ptpg = NULL;
1665
0
                    current->arch.old_guest_table = page;
1666
0
                }
1667
0
            }
1668
0
        }
1669
0
        if ( rc < 0 )
1670
0
        {
1671
0
            unmap_domain_page(pl4e);
1672
0
            return rc;
1673
0
        }
1674
0
1675
0
        pl4e[i] = adjust_guest_l4e(pl4e[i], d);
1676
0
    }
1677
0
1678
0
    if ( rc >= 0 )
1679
0
    {
1680
0
        init_xen_l4_slots(pl4e, _mfn(pfn),
1681
0
                          d, INVALID_MFN, VM_ASSIST(d, m2p_strict));
1682
0
        atomic_inc(&d->arch.pv_domain.nr_l4_pages);
1683
0
        rc = 0;
1684
0
    }
1685
0
    unmap_domain_page(pl4e);
1686
0
1687
0
    return rc;
1688
0
}
1689
1690
static void free_l1_table(struct page_info *page)
1691
0
{
1692
0
    struct domain *d = page_get_owner(page);
1693
0
    l1_pgentry_t *pl1e;
1694
0
    unsigned int  i;
1695
0
1696
0
    pl1e = __map_domain_page(page);
1697
0
1698
0
    for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1699
0
        put_page_from_l1e(pl1e[i], d);
1700
0
1701
0
    unmap_domain_page(pl1e);
1702
0
}
1703
1704
1705
static int free_l2_table(struct page_info *page, int preemptible)
1706
0
{
1707
0
    struct domain *d = page_get_owner(page);
1708
0
    unsigned long pfn = mfn_x(page_to_mfn(page));
1709
0
    l2_pgentry_t *pl2e;
1710
0
    unsigned int  i = page->nr_validated_ptes - 1;
1711
0
    int err = 0;
1712
0
1713
0
    pl2e = map_domain_page(_mfn(pfn));
1714
0
1715
0
    ASSERT(page->nr_validated_ptes);
1716
0
    do {
1717
0
        if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
1718
0
             put_page_from_l2e(pl2e[i], pfn) == 0 &&
1719
0
             preemptible && i && hypercall_preempt_check() )
1720
0
        {
1721
0
            page->nr_validated_ptes = i;
1722
0
            err = -ERESTART;
1723
0
        }
1724
0
    } while ( !err && i-- );
1725
0
1726
0
    unmap_domain_page(pl2e);
1727
0
1728
0
    if ( !err )
1729
0
        page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1730
0
1731
0
    return err;
1732
0
}
1733
1734
static int free_l3_table(struct page_info *page)
1735
0
{
1736
0
    struct domain *d = page_get_owner(page);
1737
0
    unsigned long pfn = mfn_x(page_to_mfn(page));
1738
0
    l3_pgentry_t *pl3e;
1739
0
    int rc = 0, partial = page->partial_pte;
1740
0
    unsigned int  i = page->nr_validated_ptes - !partial;
1741
0
1742
0
    pl3e = map_domain_page(_mfn(pfn));
1743
0
1744
0
    do {
1745
0
        rc = put_page_from_l3e(pl3e[i], pfn, partial, 0);
1746
0
        if ( rc < 0 )
1747
0
            break;
1748
0
        partial = 0;
1749
0
        if ( rc > 0 )
1750
0
            continue;
1751
0
        pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
1752
0
    } while ( i-- );
1753
0
1754
0
    unmap_domain_page(pl3e);
1755
0
1756
0
    if ( rc == -ERESTART )
1757
0
    {
1758
0
        page->nr_validated_ptes = i;
1759
0
        page->partial_pte = partial ?: -1;
1760
0
    }
1761
0
    else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
1762
0
    {
1763
0
        page->nr_validated_ptes = i + 1;
1764
0
        page->partial_pte = 0;
1765
0
        rc = -ERESTART;
1766
0
    }
1767
0
    return rc > 0 ? 0 : rc;
1768
0
}
1769
1770
static int free_l4_table(struct page_info *page)
1771
0
{
1772
0
    struct domain *d = page_get_owner(page);
1773
0
    unsigned long pfn = mfn_x(page_to_mfn(page));
1774
0
    l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
1775
0
    int rc = 0, partial = page->partial_pte;
1776
0
    unsigned int  i = page->nr_validated_ptes - !partial;
1777
0
1778
0
    do {
1779
0
        if ( is_guest_l4_slot(d, i) )
1780
0
            rc = put_page_from_l4e(pl4e[i], pfn, partial, 0);
1781
0
        if ( rc < 0 )
1782
0
            break;
1783
0
        partial = 0;
1784
0
    } while ( i-- );
1785
0
1786
0
    if ( rc == -ERESTART )
1787
0
    {
1788
0
        page->nr_validated_ptes = i;
1789
0
        page->partial_pte = partial ?: -1;
1790
0
    }
1791
0
    else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
1792
0
    {
1793
0
        page->nr_validated_ptes = i + 1;
1794
0
        page->partial_pte = 0;
1795
0
        rc = -ERESTART;
1796
0
    }
1797
0
1798
0
    unmap_domain_page(pl4e);
1799
0
1800
0
    if ( rc >= 0 )
1801
0
    {
1802
0
        atomic_dec(&d->arch.pv_domain.nr_l4_pages);
1803
0
        rc = 0;
1804
0
    }
1805
0
1806
0
    return rc;
1807
0
}
1808
1809
int page_lock(struct page_info *page)
1810
0
{
1811
0
    unsigned long x, nx;
1812
0
1813
0
    do {
1814
0
        while ( (x = page->u.inuse.type_info) & PGT_locked )
1815
0
            cpu_relax();
1816
0
        nx = x + (1 | PGT_locked);
1817
0
        if ( !(x & PGT_validated) ||
1818
0
             !(x & PGT_count_mask) ||
1819
0
             !(nx & PGT_count_mask) )
1820
0
            return 0;
1821
0
    } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
1822
0
1823
0
    return 1;
1824
0
}
1825
1826
void page_unlock(struct page_info *page)
1827
0
{
1828
0
    unsigned long x, nx, y = page->u.inuse.type_info;
1829
0
1830
0
    do {
1831
0
        x = y;
1832
0
        ASSERT((x & PGT_count_mask) && (x & PGT_locked));
1833
0
1834
0
        nx = x - (1 | PGT_locked);
1835
0
        /* We must not drop the last reference here. */
1836
0
        ASSERT(nx & PGT_count_mask);
1837
0
    } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
1838
0
}
1839
1840
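page_lock() and page_unlock() above fold a lock bit and a reference count into the single type_info word, updated with one cmpxchg so that taking the lock also takes a reference and the count can never drop to zero under the lock holder. A self-contained C11-atomics sketch of that shape follows; TOY_LOCKED, TOY_COUNT_MASK and the omission of the PGT_validated check are simplifications, not Xen's real type_info layout.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TOY_LOCKED      (1UL << 16)        /* invented lock bit */
#define TOY_COUNT_MASK  0xffffUL           /* low bits hold the refcount */

static _Atomic unsigned long toy_info = 1; /* one existing reference */

static bool toy_lock(void)
{
    unsigned long x, nx;

    do {
        /* Spin while somebody else holds the lock bit. */
        while ( (x = atomic_load(&toy_info)) & TOY_LOCKED )
            ;
        nx = x + (1 | TOY_LOCKED);         /* take a ref and the lock at once */
        if ( !(x & TOY_COUNT_MASK) || !(nx & TOY_COUNT_MASK) )
            return false;                  /* unused page or count overflow */
    } while ( !atomic_compare_exchange_weak(&toy_info, &x, nx) );

    return true;
}

static void toy_unlock(void)
{
    unsigned long x = atomic_load(&toy_info), nx;

    do {
        nx = x - (1 | TOY_LOCKED);         /* drop the lock and our ref */
    } while ( !atomic_compare_exchange_weak(&toy_info, &x, nx) );
}

int main(void)
{
    if ( toy_lock() )
    {
        printf("locked,   info=%#lx\n", atomic_load(&toy_info));
        toy_unlock();
    }
    printf("unlocked, info=%#lx\n", atomic_load(&toy_info));
    return 0;
}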
/*
1841
 * PTE flags that a guest may change without re-validating the PTE.
1842
 * All other bits affect translation, caching, or Xen's safety.
1843
 */
1844
#define FASTPATH_FLAG_WHITELIST                                     \
1845
    (_PAGE_NX_BIT | _PAGE_AVAIL_HIGH | _PAGE_AVAIL | _PAGE_GLOBAL | \
1846
     _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER)
1847
1848
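FASTPATH_FLAG_WHITELIST is used below as a negated mask: if the old and new entries differ only in whitelisted bits, the update may skip re-validation. A minimal standalone version of that test (the flag positions are invented; only the xor-and-mask idea is the point):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Invented flag layout for the example. */
#define F_PRESENT   (1u << 0)
#define F_RW        (1u << 1)
#define F_ACCESSED  (1u << 5)
#define F_DIRTY     (1u << 6)

/* Bits a guest may flip without forcing re-validation. */
#define FAST_WHITELIST  (F_ACCESSED | F_DIRTY)

static bool needs_revalidation(uint64_t old_e, uint64_t new_e)
{
    /* Any difference outside the whitelist means the mapping changed. */
    return ((old_e ^ new_e) & ~(uint64_t)FAST_WHITELIST) != 0;
}

int main(void)
{
    uint64_t e = 0x1000 | F_PRESENT | F_RW;

    printf("%d\n", needs_revalidation(e, e | F_DIRTY));   /* 0: fast path */
    printf("%d\n", needs_revalidation(e, e & ~F_RW));     /* 1: full check */
    return 0;
}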
/* Update the L1 entry at pl1e to new value nl1e. */
1849
static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1850
                        unsigned long gl1mfn, int preserve_ad,
1851
                        struct vcpu *pt_vcpu, struct domain *pg_dom)
1852
0
{
1853
0
    l1_pgentry_t ol1e;
1854
0
    struct domain *pt_dom = pt_vcpu->domain;
1855
0
    int rc = 0;
1856
0
1857
0
    if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1858
0
        return -EFAULT;
1859
0
1860
0
    ASSERT(!paging_mode_refcounts(pt_dom));
1861
0
1862
0
    if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1863
0
    {
1864
0
        struct page_info *page = NULL;
1865
0
1866
0
        if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom)) )
1867
0
        {
1868
0
            gdprintk(XENLOG_WARNING, "Bad L1 flags %x\n",
1869
0
                    l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom));
1870
0
            return -EINVAL;
1871
0
        }
1872
0
1873
0
        /* Translate foreign guest address. */
1874
0
        if ( paging_mode_translate(pg_dom) )
1875
0
        {
1876
0
            p2m_type_t p2mt;
1877
0
            p2m_query_t q = l1e_get_flags(nl1e) & _PAGE_RW ?
1878
0
                            P2M_ALLOC | P2M_UNSHARE : P2M_ALLOC;
1879
0
1880
0
            page = get_page_from_gfn(pg_dom, l1e_get_pfn(nl1e), &p2mt, q);
1881
0
1882
0
            if ( p2m_is_paged(p2mt) )
1883
0
            {
1884
0
                if ( page )
1885
0
                    put_page(page);
1886
0
                p2m_mem_paging_populate(pg_dom, l1e_get_pfn(nl1e));
1887
0
                return -ENOENT;
1888
0
            }
1889
0
1890
0
            if ( p2mt == p2m_ram_paging_in && !page )
1891
0
                return -ENOENT;
1892
0
1893
0
            /* Did our attempt to unshare fail? */
1894
0
            if ( (q & P2M_UNSHARE) && p2m_is_shared(p2mt) )
1895
0
            {
1896
0
                /* We could not have obtained a page ref. */
1897
0
                ASSERT(!page);
1898
0
                /* And mem_sharing_notify has already been called. */
1899
0
                return -ENOMEM;
1900
0
            }
1901
0
1902
0
            if ( !page )
1903
0
                return -EINVAL;
1904
0
            nl1e = l1e_from_page(page, l1e_get_flags(nl1e));
1905
0
        }
1906
0
1907
0
        /* Fast path for sufficiently-similar mappings. */
1908
0
        if ( !l1e_has_changed(ol1e, nl1e, ~FASTPATH_FLAG_WHITELIST) )
1909
0
        {
1910
0
            nl1e = adjust_guest_l1e(nl1e, pt_dom);
1911
0
            rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
1912
0
                              preserve_ad);
1913
0
            if ( page )
1914
0
                put_page(page);
1915
0
            return rc ? 0 : -EBUSY;
1916
0
        }
1917
0
1918
0
        switch ( rc = get_page_from_l1e(nl1e, pt_dom, pg_dom) )
1919
0
        {
1920
0
        default:
1921
0
            if ( page )
1922
0
                put_page(page);
1923
0
            return rc;
1924
0
        case 0:
1925
0
            break;
1926
0
        case _PAGE_RW ... _PAGE_RW | PAGE_CACHE_ATTRS:
1927
0
            ASSERT(!(rc & ~(_PAGE_RW | PAGE_CACHE_ATTRS)));
1928
0
            l1e_flip_flags(nl1e, rc);
1929
0
            rc = 0;
1930
0
            break;
1931
0
        }
1932
0
        if ( page )
1933
0
            put_page(page);
1934
0
1935
0
        nl1e = adjust_guest_l1e(nl1e, pt_dom);
1936
0
        if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
1937
0
                                    preserve_ad)) )
1938
0
        {
1939
0
            ol1e = nl1e;
1940
0
            rc = -EBUSY;
1941
0
        }
1942
0
    }
1943
0
    else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
1944
0
                                     preserve_ad)) )
1945
0
    {
1946
0
        return -EBUSY;
1947
0
    }
1948
0
1949
0
    put_page_from_l1e(ol1e, pt_dom);
1950
0
    return rc;
1951
0
}
1952
1953
1954
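mod_l1_entry() funnels its writes through UPDATE_ENTRY with a preserve_ad flag which, as its uses here suggest, keeps accessed/dirty bits that the hardware may set concurrently from being lost by the update. The sketch below shows one plausible way to honour that with a retrying compare-and-swap; it is illustrative only and not the UPDATE_ENTRY implementation, and the F_* constants are invented.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define F_ACCESSED  (1ull << 5)
#define F_DIRTY     (1ull << 6)

static _Atomic uint64_t pte = 0x1003;      /* some existing entry */

/*
 * Write new_e over an entry last read as old_e.  When preserve_ad is set,
 * fold in any A/D bits that appeared in the meantime and retry instead of
 * treating the A/D-only change as a conflict.
 */
static bool toy_update_entry(uint64_t old_e, uint64_t new_e, bool preserve_ad)
{
    for ( ; ; )
    {
        uint64_t expect = old_e;
        uint64_t want   = new_e;

        if ( preserve_ad )
            want |= old_e & (F_ACCESSED | F_DIRTY);

        if ( atomic_compare_exchange_strong(&pte, &expect, want) )
            return true;

        /* Only an A/D-bit change is tolerated; anything else is a conflict. */
        if ( (expect ^ old_e) & ~(uint64_t)(F_ACCESSED | F_DIRTY) )
            return false;

        old_e = expect;                    /* absorb the A/D change and retry */
    }
}

int main(void)
{
    uint64_t seen = atomic_load(&pte);

    atomic_fetch_or(&pte, F_DIRTY);        /* "hardware" dirties the entry */

    printf("updated: %d, pte=%#llx\n",
           toy_update_entry(seen, 0x2003, true),
           (unsigned long long)atomic_load(&pte));  /* dirty bit preserved */
    return 0;
}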
/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1955
static int mod_l2_entry(l2_pgentry_t *pl2e,
1956
                        l2_pgentry_t nl2e,
1957
                        unsigned long pfn,
1958
                        int preserve_ad,
1959
                        struct vcpu *vcpu)
1960
0
{
1961
0
    l2_pgentry_t ol2e;
1962
0
    struct domain *d = vcpu->domain;
1963
0
    struct page_info *l2pg = mfn_to_page(_mfn(pfn));
1964
0
    unsigned long type = l2pg->u.inuse.type_info;
1965
0
    int rc = 0;
1966
0
1967
0
    if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1968
0
    {
1969
0
        gdprintk(XENLOG_WARNING, "L2 update in Xen-private area, slot %#lx\n",
1970
0
                 pgentry_ptr_to_slot(pl2e));
1971
0
        return -EPERM;
1972
0
    }
1973
0
1974
0
    if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1975
0
        return -EFAULT;
1976
0
1977
0
    if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1978
0
    {
1979
0
        if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1980
0
        {
1981
0
            gdprintk(XENLOG_WARNING, "Bad L2 flags %x\n",
1982
0
                    l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1983
0
            return -EINVAL;
1984
0
        }
1985
0
1986
0
        /* Fast path for sufficiently-similar mappings. */
1987
0
        if ( !l2e_has_changed(ol2e, nl2e, ~FASTPATH_FLAG_WHITELIST) )
1988
0
        {
1989
0
            nl2e = adjust_guest_l2e(nl2e, d);
1990
0
            if ( UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, preserve_ad) )
1991
0
                return 0;
1992
0
            return -EBUSY;
1993
0
        }
1994
0
1995
0
        if ( unlikely((rc = get_page_from_l2e(nl2e, pfn, d)) < 0) )
1996
0
            return rc;
1997
0
1998
0
        nl2e = adjust_guest_l2e(nl2e, d);
1999
0
        if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
2000
0
                                    preserve_ad)) )
2001
0
        {
2002
0
            ol2e = nl2e;
2003
0
            rc = -EBUSY;
2004
0
        }
2005
0
    }
2006
0
    else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
2007
0
                                     preserve_ad)) )
2008
0
    {
2009
0
        return -EBUSY;
2010
0
    }
2011
0
2012
0
    put_page_from_l2e(ol2e, pfn);
2013
0
    return rc;
2014
0
}
2015
2016
/* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
2017
static int mod_l3_entry(l3_pgentry_t *pl3e,
2018
                        l3_pgentry_t nl3e,
2019
                        unsigned long pfn,
2020
                        int preserve_ad,
2021
                        struct vcpu *vcpu)
2022
0
{
2023
0
    l3_pgentry_t ol3e;
2024
0
    struct domain *d = vcpu->domain;
2025
0
    int rc = 0;
2026
0
2027
0
    /*
2028
0
     * Disallow updates to final L3 slot. It contains Xen mappings, and it
2029
0
     * would be a pain to ensure they remain continuously valid throughout.
2030
0
     */
2031
0
    if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
2032
0
        return -EINVAL;
2033
0
2034
0
    if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
2035
0
        return -EFAULT;
2036
0
2037
0
    if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
2038
0
    {
2039
0
        if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
2040
0
        {
2041
0
            gdprintk(XENLOG_WARNING, "Bad L3 flags %x\n",
2042
0
                    l3e_get_flags(nl3e) & l3_disallow_mask(d));
2043
0
            return -EINVAL;
2044
0
        }
2045
0
2046
0
        /* Fast path for sufficiently-similar mappings. */
2047
0
        if ( !l3e_has_changed(ol3e, nl3e, ~FASTPATH_FLAG_WHITELIST) )
2048
0
        {
2049
0
            nl3e = adjust_guest_l3e(nl3e, d);
2050
0
            rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, preserve_ad);
2051
0
            return rc ? 0 : -EFAULT;
2052
0
        }
2053
0
2054
0
        rc = get_page_from_l3e(nl3e, pfn, d, 0);
2055
0
        if ( unlikely(rc < 0) )
2056
0
            return rc;
2057
0
        rc = 0;
2058
0
2059
0
        nl3e = adjust_guest_l3e(nl3e, d);
2060
0
        if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
2061
0
                                    preserve_ad)) )
2062
0
        {
2063
0
            ol3e = nl3e;
2064
0
            rc = -EFAULT;
2065
0
        }
2066
0
    }
2067
0
    else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
2068
0
                                     preserve_ad)) )
2069
0
    {
2070
0
        return -EFAULT;
2071
0
    }
2072
0
2073
0
    if ( likely(rc == 0) )
2074
0
        if ( !create_pae_xen_mappings(d, pl3e) )
2075
0
            BUG();
2076
0
2077
0
    put_page_from_l3e(ol3e, pfn, 0, 1);
2078
0
    return rc;
2079
0
}
2080
2081
/* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
2082
static int mod_l4_entry(l4_pgentry_t *pl4e,
2083
                        l4_pgentry_t nl4e,
2084
                        unsigned long pfn,
2085
                        int preserve_ad,
2086
                        struct vcpu *vcpu)
2087
0
{
2088
0
    struct domain *d = vcpu->domain;
2089
0
    l4_pgentry_t ol4e;
2090
0
    int rc = 0;
2091
0
2092
0
    if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
2093
0
    {
2094
0
        gdprintk(XENLOG_WARNING, "L4 update in Xen-private area, slot %#lx\n",
2095
0
                 pgentry_ptr_to_slot(pl4e));
2096
0
        return -EINVAL;
2097
0
    }
2098
0
2099
0
    if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
2100
0
        return -EFAULT;
2101
0
2102
0
    if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
2103
0
    {
2104
0
        if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
2105
0
        {
2106
0
            gdprintk(XENLOG_WARNING, "Bad L4 flags %x\n",
2107
0
                    l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
2108
0
            return -EINVAL;
2109
0
        }
2110
0
2111
0
        /* Fast path for sufficiently-similar mappings. */
2112
0
        if ( !l4e_has_changed(ol4e, nl4e, ~FASTPATH_FLAG_WHITELIST) )
2113
0
        {
2114
0
            nl4e = adjust_guest_l4e(nl4e, d);
2115
0
            rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, preserve_ad);
2116
0
            return rc ? 0 : -EFAULT;
2117
0
        }
2118
0
2119
0
        rc = get_page_from_l4e(nl4e, pfn, d, 0);
2120
0
        if ( unlikely(rc < 0) )
2121
0
            return rc;
2122
0
        rc = 0;
2123
0
2124
0
        nl4e = adjust_guest_l4e(nl4e, d);
2125
0
        if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
2126
0
                                    preserve_ad)) )
2127
0
        {
2128
0
            ol4e = nl4e;
2129
0
            rc = -EFAULT;
2130
0
        }
2131
0
    }
2132
0
    else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
2133
0
                                     preserve_ad)) )
2134
0
    {
2135
0
        return -EFAULT;
2136
0
    }
2137
0
2138
0
    put_page_from_l4e(ol4e, pfn, 0, 1);
2139
0
    return rc;
2140
0
}
2141
2142
static int cleanup_page_cacheattr(struct page_info *page)
2143
150
{
2144
150
    unsigned int cacheattr =
2145
150
        (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base;
2146
150
2147
150
    if ( likely(cacheattr == 0) )
2148
150
        return 0;
2149
150
2150
0
    page->count_info &= ~PGC_cacheattr_mask;
2151
0
2152
0
    BUG_ON(is_xen_heap_page(page));
2153
0
2154
0
    return update_xen_mappings(mfn_x(page_to_mfn(page)), 0);
2155
150
}
2156
2157
void put_page(struct page_info *page)
2158
1.96M
{
2159
1.96M
    unsigned long nx, x, y = page->count_info;
2160
1.96M
2161
1.97M
    do {
2162
1.97M
        ASSERT((y & PGC_count_mask) != 0);
2163
1.97M
        x  = y;
2164
1.97M
        nx = x - 1;
2165
1.97M
    }
2166
1.97M
    while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) );
2167
1.96M
2168
1.96M
    if ( unlikely((nx & PGC_count_mask) == 0) )
2169
150
    {
2170
150
        if ( cleanup_page_cacheattr(page) == 0 )
2171
150
            free_domheap_page(page);
2172
150
        else
2173
0
            gdprintk(XENLOG_WARNING,
2174
0
                     "Leaking mfn %" PRI_mfn "\n", mfn_x(page_to_mfn(page)));
2175
150
    }
2176
1.96M
}
2177
2178
2179
struct domain *page_get_owner_and_reference(struct page_info *page)
2180
1.96M
{
2181
1.96M
    unsigned long x, y = page->count_info;
2182
1.96M
    struct domain *owner;
2183
1.96M
2184
1.97M
    do {
2185
1.97M
        x = y;
2186
1.97M
        /*
2187
1.97M
         * Count ==  0: Page is not allocated, so we cannot take a reference.
2188
1.97M
         * Count == -1: Reference count would wrap, which is invalid.
2189
1.97M
         * Count == -2: Remaining unused ref is reserved for get_page_light().
2190
1.97M
         */
2191
1.97M
        if ( unlikely(((x + 2) & PGC_count_mask) <= 2) )
2192
0
            return NULL;
2193
1.97M
    }
2194
1.97M
    while ( (y = cmpxchg(&page->count_info, x, x + 1)) != x );
2195
1.96M
2196
1.96M
    owner = page_get_owner(page);
2197
1.96M
    ASSERT(owner);
2198
1.96M
2199
1.96M
    return owner;
2200
1.96M
}
2201
2202
2203
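The comment in page_get_owner_and_reference() rejects three counts (0, -1 and -2) with the single test ((x + 2) & PGC_count_mask) <= 2. A quick standalone check of that identity, using an invented 8-bit count field so the wraparound is easy to see:

#include <stdio.h>

#define TOY_COUNT_MASK  0xffu   /* pretend the count field is 8 bits wide */

/* Mirrors the "((x + 2) & mask) <= 2" rejection test. */
static int must_reject(unsigned int count)
{
    return ((count + 2) & TOY_COUNT_MASK) <= 2;
}

int main(void)
{
    /* 0 (free), mask (== -1, would wrap) and mask - 1 (== -2, reserved). */
    unsigned int interesting[] = { 0, TOY_COUNT_MASK, TOY_COUNT_MASK - 1, 1, 100 };

    for ( unsigned int i = 0; i < sizeof(interesting) / sizeof(interesting[0]); i++ )
        printf("count %3u -> %s\n", interesting[i],
               must_reject(interesting[i]) ? "reject" : "take ref");
    return 0;
}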
int get_page(struct page_info *page, struct domain *domain)
2204
1.96M
{
2205
1.96M
    struct domain *owner = page_get_owner_and_reference(page);
2206
1.96M
2207
1.96M
    if ( likely(owner == domain) )
2208
1.98M
        return 1;
2209
1.96M
2210
18.4E
    if ( !paging_mode_refcounts(domain) && !domain->is_dying )
2211
0
        gprintk(XENLOG_INFO,
2212
18.4E
                "Error mfn %"PRI_mfn": rd=%d od=%d caf=%08lx taf=%" PRtype_info "\n",
2213
0
                mfn_x(page_to_mfn(page)), domain->domain_id,
2214
0
                owner ? owner->domain_id : DOMID_INVALID,
2215
0
                page->count_info - !!owner, page->u.inuse.type_info);
2216
18.4E
2217
18.4E
    if ( owner )
2218
0
        put_page(page);
2219
18.4E
2220
18.4E
    return 0;
2221
1.96M
}
2222
2223
/*
2224
 * Special version of get_page() to be used exclusively when
2225
 * - a page is known to already have a non-zero reference count
2226
 * - the page does not need its owner to be checked
2227
 * - it will not be called more than once without dropping the thus
2228
 *   acquired reference again.
2229
 * Due to get_page() reserving one reference, this call cannot fail.
2230
 */
2231
static void get_page_light(struct page_info *page)
2232
0
{
2233
0
    unsigned long x, nx, y = page->count_info;
2234
0
2235
0
    do {
2236
0
        x  = y;
2237
0
        nx = x + 1;
2238
0
        BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
2239
0
        BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
2240
0
        y = cmpxchg(&page->count_info, x, nx);
2241
0
    }
2242
0
    while ( unlikely(y != x) );
2243
0
}
2244
2245
static int alloc_page_type(struct page_info *page, unsigned long type,
2246
                           int preemptible)
2247
0
{
2248
0
    struct domain *owner = page_get_owner(page);
2249
0
    int rc;
2250
0
2251
0
    /* A page table is dirtied when its type count becomes non-zero. */
2252
0
    if ( likely(owner != NULL) )
2253
0
        paging_mark_dirty(owner, page_to_mfn(page));
2254
0
2255
0
    switch ( type & PGT_type_mask )
2256
0
    {
2257
0
    case PGT_l1_page_table:
2258
0
        rc = alloc_l1_table(page);
2259
0
        break;
2260
0
    case PGT_l2_page_table:
2261
0
        rc = alloc_l2_table(page, type, preemptible);
2262
0
        break;
2263
0
    case PGT_l3_page_table:
2264
0
        ASSERT(preemptible);
2265
0
        rc = alloc_l3_table(page);
2266
0
        break;
2267
0
    case PGT_l4_page_table:
2268
0
        ASSERT(preemptible);
2269
0
        rc = alloc_l4_table(page);
2270
0
        break;
2271
0
    case PGT_seg_desc_page:
2272
0
        rc = alloc_segdesc_page(page);
2273
0
        break;
2274
0
    default:
2275
0
        printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%lx\n",
2276
0
               type, page->u.inuse.type_info,
2277
0
               page->count_info);
2278
0
        rc = -EINVAL;
2279
0
        BUG();
2280
0
    }
2281
0
2282
0
    /* No need for atomic update of type_info here: no one else updates it. */
2283
0
    smp_wmb();
2284
0
    switch ( rc )
2285
0
    {
2286
0
    case 0:
2287
0
        page->u.inuse.type_info |= PGT_validated;
2288
0
        break;
2289
0
    case -EINTR:
2290
0
        ASSERT((page->u.inuse.type_info &
2291
0
                (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2292
0
        page->u.inuse.type_info &= ~PGT_count_mask;
2293
0
        break;
2294
0
    default:
2295
0
        ASSERT(rc < 0);
2296
0
        gdprintk(XENLOG_WARNING, "Error while validating mfn %" PRI_mfn
2297
0
                 " (pfn %" PRI_pfn ") for type %" PRtype_info
2298
0
                 ": caf=%08lx taf=%" PRtype_info "\n",
2299
0
                 mfn_x(page_to_mfn(page)),
2300
0
                 get_gpfn_from_mfn(mfn_x(page_to_mfn(page))),
2301
0
                 type, page->count_info, page->u.inuse.type_info);
2302
0
        if ( page != current->arch.old_guest_table )
2303
0
            page->u.inuse.type_info = 0;
2304
0
        else
2305
0
        {
2306
0
            ASSERT((page->u.inuse.type_info &
2307
0
                    (PGT_count_mask | PGT_validated)) == 1);
2308
0
    case -ERESTART:
2309
0
            get_page_light(page);
2310
0
            page->u.inuse.type_info |= PGT_partial;
2311
0
        }
2312
0
        break;
2313
0
    }
2314
0
2315
0
    return rc;
2316
0
}
2317
2318
2319
int free_page_type(struct page_info *page, unsigned long type,
2320
                   int preemptible)
2321
0
{
2322
0
    struct domain *owner = page_get_owner(page);
2323
0
    unsigned long gmfn;
2324
0
    int rc;
2325
0
2326
0
    if ( likely(owner != NULL) && unlikely(paging_mode_enabled(owner)) )
2327
0
    {
2328
0
        /* A page table is dirtied when its type count becomes zero. */
2329
0
        paging_mark_dirty(owner, page_to_mfn(page));
2330
0
2331
0
        ASSERT(!shadow_mode_refcounts(owner));
2332
0
2333
0
        gmfn = mfn_to_gmfn(owner, mfn_x(page_to_mfn(page)));
2334
0
        ASSERT(VALID_M2P(gmfn));
2335
0
        /* Page sharing not supported for shadowed domains */
2336
0
        if ( !SHARED_M2P(gmfn) )
2337
0
            shadow_remove_all_shadows(owner, _mfn(gmfn));
2338
0
    }
2339
0
2340
0
    if ( !(type & PGT_partial) )
2341
0
    {
2342
0
        page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
2343
0
        page->partial_pte = 0;
2344
0
    }
2345
0
2346
0
    switch ( type & PGT_type_mask )
2347
0
    {
2348
0
    case PGT_l1_page_table:
2349
0
        free_l1_table(page);
2350
0
        rc = 0;
2351
0
        break;
2352
0
    case PGT_l2_page_table:
2353
0
        rc = free_l2_table(page, preemptible);
2354
0
        break;
2355
0
    case PGT_l3_page_table:
2356
0
        ASSERT(preemptible);
2357
0
        rc = free_l3_table(page);
2358
0
        break;
2359
0
    case PGT_l4_page_table:
2360
0
        ASSERT(preemptible);
2361
0
        rc = free_l4_table(page);
2362
0
        break;
2363
0
    default:
2364
0
        gdprintk(XENLOG_WARNING, "type %" PRtype_info " mfn %" PRI_mfn "\n",
2365
0
                 type, mfn_x(page_to_mfn(page)));
2366
0
        rc = -EINVAL;
2367
0
        BUG();
2368
0
    }
2369
0
2370
0
    return rc;
2371
0
}
2372
2373
2374
static int _put_final_page_type(struct page_info *page, unsigned long type,
2375
                                bool preemptible, struct page_info *ptpg)
2376
0
{
2377
0
    int rc = free_page_type(page, type, preemptible);
2378
0
2379
0
    /* No need for atomic update of type_info here: no one else updates it. */
2380
0
    if ( rc == 0 )
2381
0
    {
2382
0
        if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) )
2383
0
        {
2384
0
            dec_linear_uses(page);
2385
0
            dec_linear_entries(ptpg);
2386
0
        }
2387
0
        ASSERT(!page->linear_pt_count || page_get_owner(page)->is_dying);
2388
0
        set_tlbflush_timestamp(page);
2389
0
        smp_wmb();
2390
0
        page->u.inuse.type_info--;
2391
0
    }
2392
0
    else if ( rc == -EINTR )
2393
0
    {
2394
0
        ASSERT((page->u.inuse.type_info &
2395
0
                (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2396
0
        set_tlbflush_timestamp(page);
2397
0
        smp_wmb();
2398
0
        page->u.inuse.type_info |= PGT_validated;
2399
0
    }
2400
0
    else
2401
0
    {
2402
0
        BUG_ON(rc != -ERESTART);
2403
0
        smp_wmb();
2404
0
        get_page_light(page);
2405
0
        page->u.inuse.type_info |= PGT_partial;
2406
0
    }
2407
0
2408
0
    return rc;
2409
0
}
2410
2411
2412
static int _put_page_type(struct page_info *page, bool preemptible,
2413
                          struct page_info *ptpg)
2414
2
{
2415
2
    unsigned long nx, x, y = page->u.inuse.type_info;
2416
2
    int rc = 0;
2417
2
2418
2
    for ( ; ; )
2419
2
    {
2420
2
        x  = y;
2421
2
        nx = x - 1;
2422
2
2423
2
        ASSERT((x & PGT_count_mask) != 0);
2424
2
2425
2
        if ( unlikely((nx & PGT_count_mask) == 0) )
2426
2
        {
2427
2
            if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
2428
0
                 likely(nx & (PGT_validated|PGT_partial)) )
2429
0
            {
2430
0
                /*
2431
0
                 * Page-table pages must be unvalidated when count is zero. The
2432
0
                 * 'free' is safe because the refcnt is non-zero and validated
2433
0
                 * bit is clear => other ops will spin or fail.
2434
0
                 */
2435
0
                nx = x & ~(PGT_validated|PGT_partial);
2436
0
                if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
2437
0
                                           x, nx)) != x) )
2438
0
                    continue;
2439
0
                /* We cleared the 'valid bit' so we do the clean up. */
2440
0
                rc = _put_final_page_type(page, x, preemptible, ptpg);
2441
0
                ptpg = NULL;
2442
0
                if ( x & PGT_partial )
2443
0
                    put_page(page);
2444
0
                break;
2445
0
            }
2446
2
2447
2
            if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) )
2448
0
            {
2449
0
                /*
2450
0
                 * set_tlbflush_timestamp() accesses the same union
2451
0
                 * linear_pt_count lives in. Unvalidated page table pages,
2452
0
                 * however, should occur during domain destruction only
2453
0
                 * anyway. Luckily, updating linear_pt_count is no longer
2454
0
                 * necessary for a dying domain.
2455
0
                 */
2456
0
                ASSERT(page_get_owner(page)->is_dying);
2457
0
                ASSERT(page->linear_pt_count < 0);
2458
0
                ASSERT(ptpg->linear_pt_count > 0);
2459
0
                ptpg = NULL;
2460
0
            }
2461
2
2462
2
            set_tlbflush_timestamp(page);
2463
2
        }
2464
0
        else if ( unlikely((nx & (PGT_locked | PGT_count_mask)) ==
2465
0
                           (PGT_locked | 1)) )
2466
0
        {
2467
0
            /*
2468
0
             * We must not drop the second to last reference when the page is
2469
0
             * locked, as page_unlock() doesn't do any cleanup of the type.
2470
0
             */
2471
0
            cpu_relax();
2472
0
            y = page->u.inuse.type_info;
2473
0
            continue;
2474
0
        }
2475
2
2476
2
        if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2477
2
            break;
2478
2
2479
0
        if ( preemptible && hypercall_preempt_check() )
2480
0
            return -EINTR;
2481
0
    }
2482
2
2483
2
    if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) )
2484
0
    {
2485
0
        ASSERT(!rc);
2486
0
        dec_linear_uses(page);
2487
0
        dec_linear_entries(ptpg);
2488
0
    }
2489
2
2490
2
    return rc;
2491
2
}
2492
2493
2494
static int __get_page_type(struct page_info *page, unsigned long type,
2495
                           int preemptible)
2496
14
{
2497
14
    unsigned long nx, x, y = page->u.inuse.type_info;
2498
14
    int rc = 0, iommu_ret = 0;
2499
14
2500
14
    ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
2501
14
    ASSERT(!in_irq());
2502
14
2503
14
    for ( ; ; )
2504
14
    {
2505
14
        x  = y;
2506
14
        nx = x + 1;
2507
14
        if ( unlikely((nx & PGT_count_mask) == 0) )
2508
0
        {
2509
0
            gdprintk(XENLOG_WARNING,
2510
0
                     "Type count overflow on mfn %"PRI_mfn"\n",
2511
0
                     mfn_x(page_to_mfn(page)));
2512
0
            return -EINVAL;
2513
0
        }
2514
14
        else if ( unlikely((x & PGT_count_mask) == 0) )
2515
14
        {
2516
14
            struct domain *d = page_get_owner(page);
2517
14
2518
14
            /*
2519
14
             * Normally we should never let a page go from type count 0
2520
14
             * to type count 1 when it is shadowed. One exception:
2521
14
             * out-of-sync shadowed pages are allowed to become
2522
14
             * writable.
2523
14
             */
2524
14
            if ( d && shadow_mode_enabled(d)
2525
0
                 && (page->count_info & PGC_page_table)
2526
0
                 && !((page->shadow_flags & (1u<<29))
2527
0
                      && type == PGT_writable_page) )
2528
0
               shadow_remove_all_shadows(d, page_to_mfn(page));
2529
14
2530
14
            ASSERT(!(x & PGT_pae_xen_l2));
2531
14
            if ( (x & PGT_type_mask) != type )
2532
13
            {
2533
13
                /*
2534
13
                 * On a type change we check whether stale TLB entries need flushing. This
2535
13
                 * may be unnecessary (e.g., page was GDT/LDT) but those
2536
13
                 * circumstances should be very rare.
2537
13
                 */
2538
13
                cpumask_t *mask = this_cpu(scratch_cpumask);
2539
13
2540
13
                BUG_ON(in_irq());
2541
13
                cpumask_copy(mask, d->domain_dirty_cpumask);
2542
13
2543
13
                /* Don't flush if the timestamp is old enough */
2544
13
                tlbflush_filter(mask, page->tlbflush_timestamp);
2545
13
2546
13
                if ( unlikely(!cpumask_empty(mask)) &&
2547
13
                     /* Shadow mode: track only writable pages. */
2548
1
                     (!shadow_mode_enabled(page_get_owner(page)) ||
2549
0
                      ((nx & PGT_type_mask) == PGT_writable_page)) )
2550
1
                {
2551
1
                    perfc_incr(need_flush_tlb_flush);
2552
1
                    flush_tlb_mask(mask);
2553
1
                }
2554
13
2555
13
                /* We lose existing type and validity. */
2556
13
                nx &= ~(PGT_type_mask | PGT_validated);
2557
13
                nx |= type;
2558
13
2559
13
                /*
2560
13
                 * No special validation needed for writable pages.
2561
13
                 * Page tables and GDT/LDT need to be scanned for validity.
2562
13
                 */
2563
13
                if ( type == PGT_writable_page || type == PGT_shared_page )
2564
13
                    nx |= PGT_validated;
2565
13
            }
2566
14
        }
2567
0
        else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
2568
0
        {
2569
0
            /* Don't log failure if it could be a recursive-mapping attempt. */
2570
0
            if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
2571
0
                 (type == PGT_l1_page_table) )
2572
0
                return -EINVAL;
2573
0
            if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
2574
0
                 (type == PGT_l2_page_table) )
2575
0
                return -EINVAL;
2576
0
            if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
2577
0
                 (type == PGT_l3_page_table) )
2578
0
                return -EINVAL;
2579
0
            gdprintk(XENLOG_WARNING,
2580
0
                     "Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
2581
0
                     "for mfn %" PRI_mfn " (pfn %" PRI_pfn ")\n",
2582
0
                     x, type, mfn_x(page_to_mfn(page)),
2583
0
                     get_gpfn_from_mfn(mfn_x(page_to_mfn(page))));
2584
0
            return -EINVAL;
2585
0
        }
2586
0
        else if ( unlikely(!(x & PGT_validated)) )
2587
0
        {
2588
0
            if ( !(x & PGT_partial) )
2589
0
            {
2590
0
                /* Someone else is updating validation of this page. Wait... */
2591
0
                while ( (y = page->u.inuse.type_info) == x )
2592
0
                {
2593
0
                    if ( preemptible && hypercall_preempt_check() )
2594
0
                        return -EINTR;
2595
0
                    cpu_relax();
2596
0
                }
2597
0
                continue;
2598
0
            }
2599
0
            /* Type ref count was left at 1 when PGT_partial got set. */
2600
0
            ASSERT((x & PGT_count_mask) == 1);
2601
0
            nx = x & ~PGT_partial;
2602
0
        }
2603
14
2604
14
        if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2605
14
            break;
2606
14
2607
0
        if ( preemptible && hypercall_preempt_check() )
2608
0
            return -EINTR;
2609
0
    }
2610
14
2611
14
    if ( unlikely((x & PGT_type_mask) != type) )
2612
13
    {
2613
13
        /* Special pages should not be accessible from devices. */
2614
13
        struct domain *d = page_get_owner(page);
2615
13
        if ( d && is_pv_domain(d) && unlikely(need_iommu(d)) )
2616
0
        {
2617
0
            gfn_t gfn = _gfn(mfn_to_gmfn(d, mfn_x(page_to_mfn(page))));
2618
0
2619
0
            if ( (x & PGT_type_mask) == PGT_writable_page )
2620
0
                iommu_ret = iommu_unmap_page(d, gfn_x(gfn));
2621
0
            else if ( type == PGT_writable_page )
2622
0
                iommu_ret = iommu_map_page(d, gfn_x(gfn),
2623
0
                                           mfn_x(page_to_mfn(page)),
2624
0
                                           IOMMUF_readable|IOMMUF_writable);
2625
0
        }
2626
13
    }
2627
14
2628
14
    if ( unlikely(!(nx & PGT_validated)) )
2629
0
    {
2630
0
        if ( !(x & PGT_partial) )
2631
0
        {
2632
0
            page->nr_validated_ptes = 0;
2633
0
            page->partial_pte = 0;
2634
0
        }
2635
0
        page->linear_pt_count = 0;
2636
0
        rc = alloc_page_type(page, type, preemptible);
2637
0
    }
2638
14
2639
14
    if ( (x & PGT_partial) && !(nx & PGT_partial) )
2640
0
        put_page(page);
2641
14
2642
14
    if ( !rc )
2643
14
        rc = iommu_ret;
2644
14
2645
14
    return rc;
2646
14
}
2647
2648
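__get_page_type() enforces the central rule that a page may only change type while its type count is zero, with the count, the type and a validated flag all packed into one word updated by cmpxchg. The toy model below keeps just that rule; the TOY_* constants are invented, and validation, preemption and IOMMU handling are deliberately left out.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TOY_COUNT_MASK    0x0000ffffUL
#define TOY_TYPE_MASK     0x00ff0000UL
#define TOY_TYPE_NONE     0x00000000UL
#define TOY_TYPE_L1       0x00010000UL
#define TOY_TYPE_WRITABLE 0x00020000UL

static _Atomic unsigned long toy_type_info = TOY_TYPE_NONE;

/* Take a type reference; the type may only switch while the count is zero. */
static bool toy_get_type(unsigned long type)
{
    unsigned long x = atomic_load(&toy_type_info), nx;

    do {
        nx = x + 1;
        if ( !(nx & TOY_COUNT_MASK) )
            return false;                      /* count overflow */
        if ( (x & TOY_COUNT_MASK) == 0 )
            nx = (nx & ~TOY_TYPE_MASK) | type; /* free to retype */
        else if ( (x & TOY_TYPE_MASK) != type )
            return false;                      /* busy with another type */
    } while ( !atomic_compare_exchange_weak(&toy_type_info, &x, nx) );

    return true;
}

static void toy_put_type(void)
{
    atomic_fetch_sub(&toy_type_info, 1);
}

int main(void)
{
    printf("as L1:       %d\n", toy_get_type(TOY_TYPE_L1));        /* 1 */
    printf("as writable: %d\n", toy_get_type(TOY_TYPE_WRITABLE));  /* 0: in use as L1 */
    toy_put_type();
    printf("as writable: %d\n", toy_get_type(TOY_TYPE_WRITABLE));  /* 1: count hit zero */
    return 0;
}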
void put_page_type(struct page_info *page)
2649
2
{
2650
2
    int rc = _put_page_type(page, false, NULL);
2651
2
    ASSERT(rc == 0);
2652
2
    (void)rc;
2653
2
}
2654
2655
int get_page_type(struct page_info *page, unsigned long type)
2656
14
{
2657
14
    int rc = __get_page_type(page, type, 0);
2658
14
    if ( likely(rc == 0) )
2659
14
        return 1;
2660
0
    ASSERT(rc != -EINTR && rc != -ERESTART);
2661
0
    return 0;
2662
14
}
2663
2664
int put_page_type_preemptible(struct page_info *page)
2665
0
{
2666
0
    return _put_page_type(page, true, NULL);
2667
0
}
2668
2669
int get_page_type_preemptible(struct page_info *page, unsigned long type)
2670
0
{
2671
0
    ASSERT(!current->arch.old_guest_table);
2672
0
    return __get_page_type(page, type, 1);
2673
0
}
2674
2675
int put_old_guest_table(struct vcpu *v)
2676
0
{
2677
0
    int rc;
2678
0
2679
0
    if ( !v->arch.old_guest_table )
2680
0
        return 0;
2681
0
2682
0
    switch ( rc = _put_page_type(v->arch.old_guest_table, true,
2683
0
                                 v->arch.old_guest_ptpg) )
2684
0
    {
2685
0
    case -EINTR:
2686
0
    case -ERESTART:
2687
0
        return -ERESTART;
2688
0
    case 0:
2689
0
        put_page(v->arch.old_guest_table);
2690
0
    }
2691
0
2692
0
    v->arch.old_guest_table = NULL;
2693
0
2694
0
    return rc;
2695
0
}
2696
2697
int vcpu_destroy_pagetables(struct vcpu *v)
2698
0
{
2699
0
    unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
2700
0
    struct page_info *page;
2701
0
    l4_pgentry_t *l4tab = NULL;
2702
0
    int rc = put_old_guest_table(v);
2703
0
2704
0
    if ( rc )
2705
0
        return rc;
2706
0
2707
0
    if ( is_pv_32bit_vcpu(v) )
2708
0
    {
2709
0
        l4tab = map_domain_page(_mfn(mfn));
2710
0
        mfn = l4e_get_pfn(*l4tab);
2711
0
    }
2712
0
2713
0
    if ( mfn )
2714
0
    {
2715
0
        page = mfn_to_page(_mfn(mfn));
2716
0
        if ( paging_mode_refcounts(v->domain) )
2717
0
            put_page(page);
2718
0
        else
2719
0
            rc = put_page_and_type_preemptible(page);
2720
0
    }
2721
0
2722
0
    if ( l4tab )
2723
0
    {
2724
0
        if ( !rc )
2725
0
            l4e_write(l4tab, l4e_empty());
2726
0
        unmap_domain_page(l4tab);
2727
0
    }
2728
0
    else if ( !rc )
2729
0
    {
2730
0
        v->arch.guest_table = pagetable_null();
2731
0
2732
0
        /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
2733
0
        mfn = pagetable_get_pfn(v->arch.guest_table_user);
2734
0
        if ( mfn )
2735
0
        {
2736
0
            page = mfn_to_page(_mfn(mfn));
2737
0
            if ( paging_mode_refcounts(v->domain) )
2738
0
                put_page(page);
2739
0
            else
2740
0
                rc = put_page_and_type_preemptible(page);
2741
0
        }
2742
0
        if ( !rc )
2743
0
            v->arch.guest_table_user = pagetable_null();
2744
0
    }
2745
0
2746
0
    v->arch.cr3 = 0;
2747
0
2748
0
    /*
2749
0
     * put_page_and_type_preemptible() is liable to return -EINTR. Our
2750
0
     * callers expect -ERESTART, so convert it over.
2751
0
     */
2752
0
    return rc != -EINTR ? rc : -ERESTART;
2753
0
}
2754
2755
int new_guest_cr3(mfn_t mfn)
2756
0
{
2757
0
    struct vcpu *curr = current;
2758
0
    struct domain *d = curr->domain;
2759
0
    int rc;
2760
0
    mfn_t old_base_mfn;
2761
0
2762
0
    if ( is_pv_32bit_domain(d) )
2763
0
    {
2764
0
        mfn_t gt_mfn = pagetable_get_mfn(curr->arch.guest_table);
2765
0
        l4_pgentry_t *pl4e = map_domain_page(gt_mfn);
2766
0
2767
0
        rc = mod_l4_entry(pl4e,
2768
0
                          l4e_from_mfn(mfn,
2769
0
                                       (_PAGE_PRESENT | _PAGE_RW |
2770
0
                                        _PAGE_USER | _PAGE_ACCESSED)),
2771
0
                          mfn_x(gt_mfn), 0, curr);
2772
0
        unmap_domain_page(pl4e);
2773
0
        switch ( rc )
2774
0
        {
2775
0
        case 0:
2776
0
            break;
2777
0
        case -EINTR:
2778
0
        case -ERESTART:
2779
0
            return -ERESTART;
2780
0
        default:
2781
0
            gdprintk(XENLOG_WARNING,
2782
0
                     "Error while installing new compat baseptr %" PRI_mfn "\n",
2783
0
                     mfn_x(mfn));
2784
0
            return rc;
2785
0
        }
2786
0
2787
0
        invalidate_shadow_ldt(curr, 0);
2788
0
        write_ptbase(curr);
2789
0
2790
0
        return 0;
2791
0
    }
2792
0
2793
0
    rc = put_old_guest_table(curr);
2794
0
    if ( unlikely(rc) )
2795
0
        return rc;
2796
0
2797
0
    old_base_mfn = pagetable_get_mfn(curr->arch.guest_table);
2798
0
    /*
2799
0
     * This is particularly important when getting restarted after the
2800
0
     * previous attempt got preempted in the put-old-MFN phase.
2801
0
     */
2802
0
    if ( mfn_eq(old_base_mfn, mfn) )
2803
0
    {
2804
0
        write_ptbase(curr);
2805
0
        return 0;
2806
0
    }
2807
0
2808
0
    rc = paging_mode_refcounts(d)
2809
0
         ? (get_page_from_mfn(mfn, d) ? 0 : -EINVAL)
2810
0
         : get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, 0, 1);
2811
0
    switch ( rc )
2812
0
    {
2813
0
    case 0:
2814
0
        break;
2815
0
    case -EINTR:
2816
0
    case -ERESTART:
2817
0
        return -ERESTART;
2818
0
    default:
2819
0
        gdprintk(XENLOG_WARNING,
2820
0
                 "Error while installing new baseptr %" PRI_mfn "\n",
2821
0
                 mfn_x(mfn));
2822
0
        return rc;
2823
0
    }
2824
0
2825
0
    invalidate_shadow_ldt(curr, 0);
2826
0
2827
0
    if ( !VM_ASSIST(d, m2p_strict) && !paging_mode_refcounts(d) )
2828
0
        fill_ro_mpt(mfn);
2829
0
    curr->arch.guest_table = pagetable_from_mfn(mfn);
2830
0
    update_cr3(curr);
2831
0
2832
0
    write_ptbase(curr);
2833
0
2834
0
    if ( likely(mfn_x(old_base_mfn) != 0) )
2835
0
    {
2836
0
        struct page_info *page = mfn_to_page(old_base_mfn);
2837
0
2838
0
        if ( paging_mode_refcounts(d) )
2839
0
            put_page(page);
2840
0
        else
2841
0
            switch ( rc = put_page_and_type_preemptible(page) )
2842
0
            {
2843
0
            case -EINTR:
2844
0
                rc = -ERESTART;
2845
0
                /* fallthrough */
2846
0
            case -ERESTART:
2847
0
                curr->arch.old_guest_ptpg = NULL;
2848
0
                curr->arch.old_guest_table = page;
2849
0
                break;
2850
0
            default:
2851
0
                BUG_ON(rc);
2852
0
                break;
2853
0
            }
2854
0
    }
2855
0
2856
0
    return rc;
2857
0
}
2858
2859
static struct domain *get_pg_owner(domid_t domid)
2860
0
{
2861
0
    struct domain *pg_owner = NULL, *curr = current->domain;
2862
0
2863
0
    if ( likely(domid == DOMID_SELF) )
2864
0
    {
2865
0
        pg_owner = rcu_lock_current_domain();
2866
0
        goto out;
2867
0
    }
2868
0
2869
0
    if ( unlikely(domid == curr->domain_id) )
2870
0
    {
2871
0
        gdprintk(XENLOG_WARNING, "Cannot specify itself as foreign domain\n");
2872
0
        goto out;
2873
0
    }
2874
0
2875
0
    switch ( domid )
2876
0
    {
2877
0
    case DOMID_IO:
2878
0
        pg_owner = rcu_lock_domain(dom_io);
2879
0
        break;
2880
0
    case DOMID_XEN:
2881
0
        pg_owner = rcu_lock_domain(dom_xen);
2882
0
        break;
2883
0
    default:
2884
0
        if ( (pg_owner = rcu_lock_domain_by_id(domid)) == NULL )
2885
0
        {
2886
0
            gdprintk(XENLOG_WARNING, "Unknown domain d%d\n", domid);
2887
0
            break;
2888
0
        }
2889
0
        break;
2890
0
    }
2891
0
2892
0
 out:
2893
0
    return pg_owner;
2894
0
}
2895
2896
static void put_pg_owner(struct domain *pg_owner)
2897
0
{
2898
0
    rcu_unlock_domain(pg_owner);
2899
0
}
2900
2901
static inline int vcpumask_to_pcpumask(
2902
    struct domain *d, XEN_GUEST_HANDLE_PARAM(const_void) bmap, cpumask_t *pmask)
2903
0
{
2904
0
    unsigned int vcpu_id, vcpu_bias, offs;
2905
0
    unsigned long vmask;
2906
0
    struct vcpu *v;
2907
0
    bool is_native = !is_pv_32bit_domain(d);
2908
0
2909
0
    cpumask_clear(pmask);
2910
0
    for ( vmask = 0, offs = 0; ; ++offs )
2911
0
    {
2912
0
        vcpu_bias = offs * (is_native ? BITS_PER_LONG : 32);
2913
0
        if ( vcpu_bias >= d->max_vcpus )
2914
0
            return 0;
2915
0
2916
0
        if ( unlikely(is_native ?
2917
0
                      copy_from_guest_offset(&vmask, bmap, offs, 1) :
2918
0
                      copy_from_guest_offset((unsigned int *)&vmask, bmap,
2919
0
                                             offs, 1)) )
2920
0
        {
2921
0
            cpumask_clear(pmask);
2922
0
            return -EFAULT;
2923
0
        }
2924
0
2925
0
        while ( vmask )
2926
0
        {
2927
0
            vcpu_id = find_first_set_bit(vmask);
2928
0
            vmask &= ~(1UL << vcpu_id);
2929
0
            vcpu_id += vcpu_bias;
2930
0
            if ( (vcpu_id >= d->max_vcpus) )
2931
0
                return 0;
2932
0
            if ( ((v = d->vcpu[vcpu_id]) != NULL) )
2933
0
                cpumask_or(pmask, pmask, v->vcpu_dirty_cpumask);
2934
0
        }
2935
0
    }
2936
0
}
2937
2938
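vcpumask_to_pcpumask() above consumes a guest-supplied bitmap word bit by bit, using find_first_set_bit() and clearing each bit as it goes. The same walk in plain C, with __builtin_ctzl standing in for find_first_set_bit and an invented callback:

#include <stdio.h>

/* Call 'fn' once per set bit in 'vmask', lowest bit first. */
static void for_each_set_bit_in_word(unsigned long vmask,
                                     void (*fn)(unsigned int))
{
    while ( vmask )
    {
        unsigned int bit = __builtin_ctzl(vmask);  /* index of lowest set bit */

        vmask &= ~(1UL << bit);                    /* consume it */
        fn(bit);
    }
}

static void report(unsigned int vcpu_id)
{
    printf("vcpu %u is in the mask\n", vcpu_id);
}

int main(void)
{
    for_each_set_bit_in_word(0x29UL, report);      /* bits 0, 3, 5 */
    return 0;
}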
long do_mmuext_op(
2939
    XEN_GUEST_HANDLE_PARAM(mmuext_op_t) uops,
2940
    unsigned int count,
2941
    XEN_GUEST_HANDLE_PARAM(uint) pdone,
2942
    unsigned int foreigndom)
2943
0
{
2944
0
    struct mmuext_op op;
2945
0
    unsigned long type;
2946
0
    unsigned int i, done = 0;
2947
0
    struct vcpu *curr = current;
2948
0
    struct domain *currd = curr->domain;
2949
0
    struct domain *pg_owner;
2950
0
    int rc = put_old_guest_table(curr);
2951
0
2952
0
    if ( unlikely(rc) )
2953
0
    {
2954
0
        if ( likely(rc == -ERESTART) )
2955
0
            rc = hypercall_create_continuation(
2956
0
                     __HYPERVISOR_mmuext_op, "hihi", uops, count, pdone,
2957
0
                     foreigndom);
2958
0
        return rc;
2959
0
    }
2960
0
2961
0
    if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
2962
0
         likely(guest_handle_is_null(uops)) )
2963
0
    {
2964
0
        /*
2965
0
         * See the curr->arch.old_guest_table related
2966
0
         * hypercall_create_continuation() below.
2967
0
         */
2968
0
        return (int)foreigndom;
2969
0
    }
2970
0
2971
0
    if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2972
0
    {
2973
0
        count &= ~MMU_UPDATE_PREEMPTED;
2974
0
        if ( unlikely(!guest_handle_is_null(pdone)) )
2975
0
            (void)copy_from_guest(&done, pdone, 1);
2976
0
    }
2977
0
    else
2978
0
        perfc_incr(calls_to_mmuext_op);
2979
0
2980
0
    if ( unlikely(!guest_handle_okay(uops, count)) )
2981
0
        return -EFAULT;
2982
0
2983
0
    if ( (pg_owner = get_pg_owner(foreigndom)) == NULL )
2984
0
        return -ESRCH;
2985
0
2986
0
    if ( !is_pv_domain(pg_owner) )
2987
0
    {
2988
0
        put_pg_owner(pg_owner);
2989
0
        return -EINVAL;
2990
0
    }
2991
0
2992
0
    rc = xsm_mmuext_op(XSM_TARGET, currd, pg_owner);
2993
0
    if ( rc )
2994
0
    {
2995
0
        put_pg_owner(pg_owner);
2996
0
        return rc;
2997
0
    }
2998
0
2999
0
    for ( i = 0; i < count; i++ )
3000
0
    {
3001
0
        if ( curr->arch.old_guest_table || (i && hypercall_preempt_check()) )
3002
0
        {
3003
0
            rc = -ERESTART;
3004
0
            break;
3005
0
        }
3006
0
3007
0
        if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
3008
0
        {
3009
0
            rc = -EFAULT;
3010
0
            break;
3011
0
        }
3012
0
3013
0
        if ( is_hvm_domain(currd) )
3014
0
        {
3015
0
            switch ( op.cmd )
3016
0
            {
3017
0
            case MMUEXT_PIN_L1_TABLE:
3018
0
            case MMUEXT_PIN_L2_TABLE:
3019
0
            case MMUEXT_PIN_L3_TABLE:
3020
0
            case MMUEXT_PIN_L4_TABLE:
3021
0
            case MMUEXT_UNPIN_TABLE:
3022
0
                break;
3023
0
            default:
3024
0
                rc = -EOPNOTSUPP;
3025
0
                goto done;
3026
0
            }
3027
0
        }
3028
0
3029
0
        rc = 0;
3030
0
3031
0
        switch ( op.cmd )
3032
0
        {
3033
0
            struct page_info *page;
3034
0
            p2m_type_t p2mt;
3035
0
3036
0
        case MMUEXT_PIN_L1_TABLE:
3037
0
            type = PGT_l1_page_table;
3038
0
            goto pin_page;
3039
0
3040
0
        case MMUEXT_PIN_L2_TABLE:
3041
0
            type = PGT_l2_page_table;
3042
0
            goto pin_page;
3043
0
3044
0
        case MMUEXT_PIN_L3_TABLE:
3045
0
            type = PGT_l3_page_table;
3046
0
            goto pin_page;
3047
0
3048
0
        case MMUEXT_PIN_L4_TABLE:
3049
0
            if ( is_pv_32bit_domain(pg_owner) )
3050
0
                break;
3051
0
            type = PGT_l4_page_table;
3052
0
3053
0
        pin_page:
3054
0
            /* Ignore pinning of invalid paging levels. */
3055
0
            if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
3056
0
                break;
3057
0
3058
0
            if ( paging_mode_refcounts(pg_owner) )
3059
0
                break;
3060
0
3061
0
            page = get_page_from_gfn(pg_owner, op.arg1.mfn, NULL, P2M_ALLOC);
3062
0
            if ( unlikely(!page) )
3063
0
            {
3064
0
                rc = -EINVAL;
3065
0
                break;
3066
0
            }
3067
0
3068
0
            rc = get_page_type_preemptible(page, type);
3069
0
            if ( unlikely(rc) )
3070
0
            {
3071
0
                if ( rc == -EINTR )
3072
0
                    rc = -ERESTART;
3073
0
                else if ( rc != -ERESTART )
3074
0
                    gdprintk(XENLOG_WARNING,
3075
0
                             "Error %d while pinning mfn %" PRI_mfn "\n",
3076
0
                             rc, mfn_x(page_to_mfn(page)));
3077
0
                if ( page != curr->arch.old_guest_table )
3078
0
                    put_page(page);
3079
0
                break;
3080
0
            }
3081
0
3082
0
            rc = xsm_memory_pin_page(XSM_HOOK, currd, pg_owner, page);
3083
0
            if ( !rc && unlikely(test_and_set_bit(_PGT_pinned,
3084
0
                                                  &page->u.inuse.type_info)) )
3085
0
            {
3086
0
                gdprintk(XENLOG_WARNING,
3087
0
                         "mfn %" PRI_mfn " already pinned\n",
3088
0
                         mfn_x(page_to_mfn(page)));
3089
0
                rc = -EINVAL;
3090
0
            }
3091
0
3092
0
            if ( unlikely(rc) )
3093
0
                goto pin_drop;
3094
0
3095
0
            /* A page is dirtied when its pin status is set. */
3096
0
            paging_mark_dirty(pg_owner, page_to_mfn(page));
3097
0
3098
0
            /* We can race domain destruction (domain_relinquish_resources). */
3099
0
            if ( unlikely(pg_owner != currd) )
3100
0
            {
3101
0
                bool drop_ref;
3102
0
3103
0
                spin_lock(&pg_owner->page_alloc_lock);
3104
0
                drop_ref = (pg_owner->is_dying &&
3105
0
                            test_and_clear_bit(_PGT_pinned,
3106
0
                                               &page->u.inuse.type_info));
3107
0
                spin_unlock(&pg_owner->page_alloc_lock);
3108
0
                if ( drop_ref )
3109
0
                {
3110
0
        pin_drop:
3111
0
                    if ( type == PGT_l1_page_table )
3112
0
                        put_page_and_type(page);
3113
0
                    else
3114
0
                    {
3115
0
                        curr->arch.old_guest_ptpg = NULL;
3116
0
                        curr->arch.old_guest_table = page;
3117
0
                    }
3118
0
                }
3119
0
            }
3120
0
            break;
3121
0
3122
0
        case MMUEXT_UNPIN_TABLE:
3123
0
            if ( paging_mode_refcounts(pg_owner) )
3124
0
                break;
3125
0
3126
0
            page = get_page_from_gfn(pg_owner, op.arg1.mfn, NULL, P2M_ALLOC);
3127
0
            if ( unlikely(!page) )
3128
0
            {
3129
0
                gdprintk(XENLOG_WARNING,
3130
0
                         "mfn %" PRI_mfn " bad, or bad owner d%d\n",
3131
0
                         op.arg1.mfn, pg_owner->domain_id);
3132
0
                rc = -EINVAL;
3133
0
                break;
3134
0
            }
3135
0
3136
0
            if ( !test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
3137
0
            {
3138
0
                put_page(page);
3139
0
                gdprintk(XENLOG_WARNING,
3140
0
                         "mfn %" PRI_mfn " not pinned\n", op.arg1.mfn);
3141
0
                rc = -EINVAL;
3142
0
                break;
3143
0
            }
3144
0
3145
0
            switch ( rc = put_page_and_type_preemptible(page) )
3146
0
            {
3147
0
            case -EINTR:
3148
0
            case -ERESTART:
3149
0
                curr->arch.old_guest_ptpg = NULL;
3150
0
                curr->arch.old_guest_table = page;
3151
0
                rc = 0;
3152
0
                break;
3153
0
            default:
3154
0
                BUG_ON(rc);
3155
0
                break;
3156
0
            }
3157
0
            put_page(page);
3158
0
3159
0
            /* A page is dirtied when its pin status is cleared. */
3160
0
            paging_mark_dirty(pg_owner, page_to_mfn(page));
3161
0
            break;
3162
0
3163
0
        case MMUEXT_NEW_BASEPTR:
3164
0
            if ( unlikely(currd != pg_owner) )
3165
0
                rc = -EPERM;
3166
0
            else if ( unlikely(paging_mode_translate(currd)) )
3167
0
                rc = -EINVAL;
3168
0
            else
3169
0
                rc = new_guest_cr3(_mfn(op.arg1.mfn));
3170
0
            break;
3171
0
3172
0
        case MMUEXT_NEW_USER_BASEPTR: {
3173
0
            unsigned long old_mfn;
3174
0
3175
0
            if ( unlikely(currd != pg_owner) )
3176
0
                rc = -EPERM;
3177
0
            else if ( unlikely(paging_mode_translate(currd)) )
3178
0
                rc = -EINVAL;
3179
0
            if ( unlikely(rc) )
3180
0
                break;
3181
0
3182
0
            old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
3183
0
            /*
3184
0
             * This is particularly important when getting restarted after the
3185
0
             * previous attempt got preempted in the put-old-MFN phase.
3186
0
             */
3187
0
            if ( old_mfn == op.arg1.mfn )
3188
0
                break;
3189
0
3190
0
            if ( op.arg1.mfn != 0 )
3191
0
            {
3192
0
                rc = get_page_and_type_from_mfn(
3193
0
                    _mfn(op.arg1.mfn), PGT_root_page_table, currd, 0, 1);
3194
0
3195
0
                if ( unlikely(rc) )
3196
0
                {
3197
0
                    if ( rc == -EINTR )
3198
0
                        rc = -ERESTART;
3199
0
                    else if ( rc != -ERESTART )
3200
0
                        gdprintk(XENLOG_WARNING,
3201
0
                                 "Error %d installing new mfn %" PRI_mfn "\n",
3202
0
                                 rc, op.arg1.mfn);
3203
0
                    break;
3204
0
                }
3205
0
3206
0
                if ( VM_ASSIST(currd, m2p_strict) )
3207
0
                    zap_ro_mpt(_mfn(op.arg1.mfn));
3208
0
            }
3209
0
3210
0
            curr->arch.guest_table_user = pagetable_from_pfn(op.arg1.mfn);
3211
0
3212
0
            if ( old_mfn != 0 )
3213
0
            {
3214
0
                page = mfn_to_page(_mfn(old_mfn));
3215
0
3216
0
                switch ( rc = put_page_and_type_preemptible(page) )
3217
0
                {
3218
0
                case -EINTR:
3219
0
                    rc = -ERESTART;
3220
0
                    /* fallthrough */
3221
0
                case -ERESTART:
3222
0
                    curr->arch.old_guest_ptpg = NULL;
3223
0
                    curr->arch.old_guest_table = page;
3224
0
                    break;
3225
0
                default:
3226
0
                    BUG_ON(rc);
3227
0
                    break;
3228
0
                }
3229
0
            }
3230
0
3231
0
            break;
3232
0
        }
3233
0
3234
0
        case MMUEXT_TLB_FLUSH_LOCAL:
3235
0
            if ( likely(currd == pg_owner) )
3236
0
                flush_tlb_local();
3237
0
            else
3238
0
                rc = -EPERM;
3239
0
            break;
3240
0
3241
0
        case MMUEXT_INVLPG_LOCAL:
3242
0
            if ( unlikely(currd != pg_owner) )
3243
0
                rc = -EPERM;
3244
0
            else
3245
0
                paging_invlpg(curr, op.arg1.linear_addr);
3246
0
            break;
3247
0
3248
0
        case MMUEXT_TLB_FLUSH_MULTI:
3249
0
        case MMUEXT_INVLPG_MULTI:
3250
0
        {
3251
0
            cpumask_t *mask = this_cpu(scratch_cpumask);
3252
0
3253
0
            if ( unlikely(currd != pg_owner) )
3254
0
                rc = -EPERM;
3255
0
            else if ( unlikely(vcpumask_to_pcpumask(currd,
3256
0
                                   guest_handle_to_param(op.arg2.vcpumask,
3257
0
                                                         const_void),
3258
0
                                   mask)) )
3259
0
                rc = -EINVAL;
3260
0
            if ( unlikely(rc) )
3261
0
                break;
3262
0
3263
0
            if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
3264
0
                flush_tlb_mask(mask);
3265
0
            else if ( __addr_ok(op.arg1.linear_addr) )
3266
0
                flush_tlb_one_mask(mask, op.arg1.linear_addr);
3267
0
            break;
3268
0
        }
3269
0
3270
0
        case MMUEXT_TLB_FLUSH_ALL:
3271
0
            if ( likely(currd == pg_owner) )
3272
0
                flush_tlb_mask(currd->domain_dirty_cpumask);
3273
0
            else
3274
0
                rc = -EPERM;
3275
0
            break;
3276
0
3277
0
        case MMUEXT_INVLPG_ALL:
3278
0
            if ( unlikely(currd != pg_owner) )
3279
0
                rc = -EPERM;
3280
0
            else if ( __addr_ok(op.arg1.linear_addr) )
3281
0
                flush_tlb_one_mask(currd->domain_dirty_cpumask,
3282
0
                                   op.arg1.linear_addr);
3283
0
            break;
3284
0
3285
0
        case MMUEXT_FLUSH_CACHE:
3286
0
            if ( unlikely(currd != pg_owner) )
3287
0
                rc = -EPERM;
3288
0
            else if ( unlikely(!cache_flush_permitted(currd)) )
3289
0
                rc = -EACCES;
3290
0
            else
3291
0
                wbinvd();
3292
0
            break;
3293
0
3294
0
        case MMUEXT_FLUSH_CACHE_GLOBAL:
3295
0
            if ( unlikely(currd != pg_owner) )
3296
0
                rc = -EPERM;
3297
0
            else if ( likely(cache_flush_permitted(currd)) )
3298
0
            {
3299
0
                unsigned int cpu;
3300
0
                cpumask_t *mask = this_cpu(scratch_cpumask);
3301
0
3302
0
                cpumask_clear(mask);
3303
0
                for_each_online_cpu(cpu)
3304
0
                    if ( !cpumask_intersects(mask,
3305
0
                                             per_cpu(cpu_sibling_mask, cpu)) )
3306
0
                        __cpumask_set_cpu(cpu, mask);
3307
0
                flush_mask(mask, FLUSH_CACHE);
3308
0
            }
3309
0
            else
3310
0
                rc = -EINVAL;
3311
0
            break;
3312
0
3313
0
        case MMUEXT_SET_LDT:
3314
0
        {
3315
0
            unsigned int ents = op.arg2.nr_ents;
3316
0
            unsigned long ptr = ents ? op.arg1.linear_addr : 0;
3317
0
3318
0
            if ( unlikely(currd != pg_owner) )
3319
0
                rc = -EPERM;
3320
0
            else if ( paging_mode_external(currd) )
3321
0
                rc = -EINVAL;
3322
0
            else if ( ((ptr & (PAGE_SIZE - 1)) != 0) || !__addr_ok(ptr) ||
3323
0
                      (ents > 8192) )
3324
0
            {
3325
0
                gdprintk(XENLOG_WARNING,
3326
0
                         "Bad args to SET_LDT: ptr=%lx, ents=%x\n", ptr, ents);
3327
0
                rc = -EINVAL;
3328
0
            }
3329
0
            else if ( (curr->arch.pv_vcpu.ldt_ents != ents) ||
3330
0
                      (curr->arch.pv_vcpu.ldt_base != ptr) )
3331
0
            {
3332
0
                invalidate_shadow_ldt(curr, 0);
3333
0
                flush_tlb_local();
3334
0
                curr->arch.pv_vcpu.ldt_base = ptr;
3335
0
                curr->arch.pv_vcpu.ldt_ents = ents;
3336
0
                load_LDT(curr);
3337
0
            }
3338
0
            break;
3339
0
        }
3340
0
3341
0
        case MMUEXT_CLEAR_PAGE:
3342
0
            page = get_page_from_gfn(pg_owner, op.arg1.mfn, &p2mt, P2M_ALLOC);
3343
0
            if ( unlikely(p2mt != p2m_ram_rw) && page )
3344
0
            {
3345
0
                put_page(page);
3346
0
                page = NULL;
3347
0
            }
3348
0
            if ( !page || !get_page_type(page, PGT_writable_page) )
3349
0
            {
3350
0
                if ( page )
3351
0
                    put_page(page);
3352
0
                gdprintk(XENLOG_WARNING,
3353
0
                         "Error clearing mfn %" PRI_mfn "\n", op.arg1.mfn);
3354
0
                rc = -EINVAL;
3355
0
                break;
3356
0
            }
3357
0
3358
0
            /* A page is dirtied when it's being cleared. */
3359
0
            paging_mark_dirty(pg_owner, page_to_mfn(page));
3360
0
3361
0
            clear_domain_page(page_to_mfn(page));
3362
0
3363
0
            put_page_and_type(page);
3364
0
            break;
3365
0
3366
0
        case MMUEXT_COPY_PAGE:
3367
0
        {
3368
0
            struct page_info *src_page, *dst_page;
3369
0
3370
0
            src_page = get_page_from_gfn(pg_owner, op.arg2.src_mfn, &p2mt,
3371
0
                                         P2M_ALLOC);
3372
0
            if ( unlikely(p2mt != p2m_ram_rw) && src_page )
3373
0
            {
3374
0
                put_page(src_page);
3375
0
                src_page = NULL;
3376
0
            }
3377
0
            if ( unlikely(!src_page) )
3378
0
            {
3379
0
                gdprintk(XENLOG_WARNING,
3380
0
                         "Error copying from mfn %" PRI_mfn "\n",
3381
0
                         op.arg2.src_mfn);
3382
0
                rc = -EINVAL;
3383
0
                break;
3384
0
            }
3385
0
3386
0
            dst_page = get_page_from_gfn(pg_owner, op.arg1.mfn, &p2mt,
3387
0
                                         P2M_ALLOC);
3388
0
            if ( unlikely(p2mt != p2m_ram_rw) && dst_page )
3389
0
            {
3390
0
                put_page(dst_page);
3391
0
                dst_page = NULL;
3392
0
            }
3393
0
            rc = (dst_page &&
3394
0
                  get_page_type(dst_page, PGT_writable_page)) ? 0 : -EINVAL;
3395
0
            if ( unlikely(rc) )
3396
0
            {
3397
0
                put_page(src_page);
3398
0
                if ( dst_page )
3399
0
                    put_page(dst_page);
3400
0
                gdprintk(XENLOG_WARNING,
3401
0
                         "Error copying to mfn %" PRI_mfn "\n", op.arg1.mfn);
3402
0
                break;
3403
0
            }
3404
0
3405
0
            /* A page is dirtied when it's being copied to. */
3406
0
            paging_mark_dirty(pg_owner, page_to_mfn(dst_page));
3407
0
3408
0
            copy_domain_page(page_to_mfn(dst_page), page_to_mfn(src_page));
3409
0
3410
0
            put_page_and_type(dst_page);
3411
0
            put_page(src_page);
3412
0
            break;
3413
0
        }
3414
0
3415
0
        case MMUEXT_MARK_SUPER:
3416
0
        case MMUEXT_UNMARK_SUPER:
3417
0
            rc = -EOPNOTSUPP;
3418
0
            break;
3419
0
3420
0
        default:
3421
0
            rc = -ENOSYS;
3422
0
            break;
3423
0
        }
3424
0
3425
0
 done:
3426
0
        if ( unlikely(rc) )
3427
0
            break;
3428
0
3429
0
        guest_handle_add_offset(uops, 1);
3430
0
    }
3431
0
3432
0
    if ( rc == -ERESTART )
3433
0
    {
3434
0
        ASSERT(i < count);
3435
0
        rc = hypercall_create_continuation(
3436
0
            __HYPERVISOR_mmuext_op, "hihi",
3437
0
            uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
3438
0
    }
3439
0
    else if ( curr->arch.old_guest_table )
3440
0
    {
3441
0
        XEN_GUEST_HANDLE_PARAM(void) null;
3442
0
3443
0
        ASSERT(rc || i == count);
3444
0
        set_xen_guest_handle(null, NULL);
3445
0
        /*
3446
0
         * In order to have a way to communicate the final return value to
3447
0
         * our continuation, we pass this in place of "foreigndom", building
3448
0
         * on the fact that this argument isn't needed anymore.
3449
0
         */
3450
0
        rc = hypercall_create_continuation(
3451
0
                __HYPERVISOR_mmuext_op, "hihi", null,
3452
0
                MMU_UPDATE_PREEMPTED, null, rc);
3453
0
    }
3454
0
3455
0
    put_pg_owner(pg_owner);
3456
0
3457
0
    perfc_add(num_mmuext_ops, i);
3458
0
3459
0
    /* Add incremental work we have done to the @done output parameter. */
3460
0
    if ( unlikely(!guest_handle_is_null(pdone)) )
3461
0
    {
3462
0
        done += i;
3463
0
        copy_to_guest(pdone, &done, 1);
3464
0
    }
3465
0
3466
0
    return rc;
3467
0
}
3468
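The restart path above folds the outstanding work back into the hypercall's count argument: on preemption the continuation is created with (count - i) | MMU_UPDATE_PREEMPTED, and a re-entered call strips that flag and re-reads its earlier progress from pdone. Below is a minimal stand-alone sketch of that round trip; it assumes MMU_UPDATE_PREEMPTED is the top bit of the count word and leaves the actual continuation machinery out.

    #include <assert.h>

    #define MMU_UPDATE_PREEMPTED (1u << 31)   /* assumed: top bit of 'count' */

    /* Encode "work remaining" the way the -ERESTART path above does. */
    static unsigned int encode_remaining(unsigned int count, unsigned int done)
    {
        return (count - done) | MMU_UPDATE_PREEMPTED;
    }

    /* Decode on re-entry: strip the marker, report whether this is a restart. */
    static unsigned int decode_remaining(unsigned int count, int *restarted)
    {
        *restarted = !!(count & MMU_UPDATE_PREEMPTED);
        return count & ~MMU_UPDATE_PREEMPTED;
    }

    int main(void)
    {
        int restarted;
        unsigned int cont = encode_remaining(100, 37);

        assert(decode_remaining(cont, &restarted) == 63 && restarted);
        return 0;
    }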
3469
long do_mmu_update(
3470
    XEN_GUEST_HANDLE_PARAM(mmu_update_t) ureqs,
3471
    unsigned int count,
3472
    XEN_GUEST_HANDLE_PARAM(uint) pdone,
3473
    unsigned int foreigndom)
3474
0
{
3475
0
    struct mmu_update req;
3476
0
    void *va = NULL;
3477
0
    unsigned long gpfn, gmfn, mfn;
3478
0
    struct page_info *page;
3479
0
    unsigned int cmd, i = 0, done = 0, pt_dom;
3480
0
    struct vcpu *curr = current, *v = curr;
3481
0
    struct domain *d = v->domain, *pt_owner = d, *pg_owner;
3482
0
    mfn_t map_mfn = INVALID_MFN;
3483
0
    uint32_t xsm_needed = 0;
3484
0
    uint32_t xsm_checked = 0;
3485
0
    int rc = put_old_guest_table(curr);
3486
0
3487
0
    if ( unlikely(rc) )
3488
0
    {
3489
0
        if ( likely(rc == -ERESTART) )
3490
0
            rc = hypercall_create_continuation(
3491
0
                     __HYPERVISOR_mmu_update, "hihi", ureqs, count, pdone,
3492
0
                     foreigndom);
3493
0
        return rc;
3494
0
    }
3495
0
3496
0
    if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
3497
0
         likely(guest_handle_is_null(ureqs)) )
3498
0
    {
3499
0
        /*
3500
0
         * See the curr->arch.old_guest_table related
3501
0
         * hypercall_create_continuation() below.
3502
0
         */
3503
0
        return (int)foreigndom;
3504
0
    }
3505
0
3506
0
    if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
3507
0
    {
3508
0
        count &= ~MMU_UPDATE_PREEMPTED;
3509
0
        if ( unlikely(!guest_handle_is_null(pdone)) )
3510
0
            (void)copy_from_guest(&done, pdone, 1);
3511
0
    }
3512
0
    else
3513
0
        perfc_incr(calls_to_mmu_update);
3514
0
3515
0
    if ( unlikely(!guest_handle_okay(ureqs, count)) )
3516
0
        return -EFAULT;
3517
0
3518
0
    if ( (pt_dom = foreigndom >> 16) != 0 )
3519
0
    {
3520
0
        /* Pagetables belong to a foreign domain (PFD). */
3521
0
        if ( (pt_owner = rcu_lock_domain_by_id(pt_dom - 1)) == NULL )
3522
0
            return -ESRCH;
3523
0
3524
0
        if ( pt_owner == d )
3525
0
            rcu_unlock_domain(pt_owner);
3526
0
        else if ( !pt_owner->vcpu || (v = pt_owner->vcpu[0]) == NULL )
3527
0
        {
3528
0
            rc = -EINVAL;
3529
0
            goto out;
3530
0
        }
3531
0
    }
3532
0
3533
0
    if ( (pg_owner = get_pg_owner((uint16_t)foreigndom)) == NULL )
3534
0
    {
3535
0
        rc = -ESRCH;
3536
0
        goto out;
3537
0
    }
3538
0
3539
0
    for ( i = 0; i < count; i++ )
3540
0
    {
3541
0
        if ( curr->arch.old_guest_table || (i && hypercall_preempt_check()) )
3542
0
        {
3543
0
            rc = -ERESTART;
3544
0
            break;
3545
0
        }
3546
0
3547
0
        if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
3548
0
        {
3549
0
            rc = -EFAULT;
3550
0
            break;
3551
0
        }
3552
0
3553
0
        cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
3554
0
3555
0
        switch ( cmd )
3556
0
        {
3557
0
            /*
3558
0
             * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
3559
0
             * MMU_PT_UPDATE_PRESERVE_AD: As above but also preserve (OR)
3560
0
             * current A/D bits.
3561
0
             */
3562
0
        case MMU_NORMAL_PT_UPDATE:
3563
0
        case MMU_PT_UPDATE_PRESERVE_AD:
3564
0
        {
3565
0
            p2m_type_t p2mt;
3566
0
3567
0
            rc = -EOPNOTSUPP;
3568
0
            if ( unlikely(paging_mode_refcounts(pt_owner)) )
3569
0
                break;
3570
0
3571
0
            xsm_needed |= XSM_MMU_NORMAL_UPDATE;
3572
0
            if ( get_pte_flags(req.val) & _PAGE_PRESENT )
3573
0
            {
3574
0
                xsm_needed |= XSM_MMU_UPDATE_READ;
3575
0
                if ( get_pte_flags(req.val) & _PAGE_RW )
3576
0
                    xsm_needed |= XSM_MMU_UPDATE_WRITE;
3577
0
            }
3578
0
            if ( xsm_needed != xsm_checked )
3579
0
            {
3580
0
                rc = xsm_mmu_update(XSM_TARGET, d, pt_owner, pg_owner, xsm_needed);
3581
0
                if ( rc )
3582
0
                    break;
3583
0
                xsm_checked = xsm_needed;
3584
0
            }
3585
0
            rc = -EINVAL;
3586
0
3587
0
            req.ptr -= cmd;
3588
0
            gmfn = req.ptr >> PAGE_SHIFT;
3589
0
            page = get_page_from_gfn(pt_owner, gmfn, &p2mt, P2M_ALLOC);
3590
0
3591
0
            if ( p2m_is_paged(p2mt) )
3592
0
            {
3593
0
                ASSERT(!page);
3594
0
                p2m_mem_paging_populate(pt_owner, gmfn);
3595
0
                rc = -ENOENT;
3596
0
                break;
3597
0
            }
3598
0
3599
0
            if ( unlikely(!page) )
3600
0
            {
3601
0
                gdprintk(XENLOG_WARNING,
3602
0
                         "Could not get page for normal update\n");
3603
0
                break;
3604
0
            }
3605
0
3606
0
            mfn = mfn_x(page_to_mfn(page));
3607
0
3608
0
            if ( !mfn_eq(_mfn(mfn), map_mfn) )
3609
0
            {
3610
0
                if ( va )
3611
0
                    unmap_domain_page(va);
3612
0
                va = map_domain_page(_mfn(mfn));
3613
0
                map_mfn = _mfn(mfn);
3614
0
            }
3615
0
            va = _p(((unsigned long)va & PAGE_MASK) + (req.ptr & ~PAGE_MASK));
3616
0
3617
0
            if ( page_lock(page) )
3618
0
            {
3619
0
                switch ( page->u.inuse.type_info & PGT_type_mask )
3620
0
                {
3621
0
                case PGT_l1_page_table:
3622
0
                    rc = mod_l1_entry(va, l1e_from_intpte(req.val), mfn,
3623
0
                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, v,
3624
0
                                      pg_owner);
3625
0
                    break;
3626
0
                case PGT_l2_page_table:
3627
0
                    rc = mod_l2_entry(va, l2e_from_intpte(req.val), mfn,
3628
0
                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
3629
0
                    break;
3630
0
                case PGT_l3_page_table:
3631
0
                    rc = mod_l3_entry(va, l3e_from_intpte(req.val), mfn,
3632
0
                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
3633
0
                    break;
3634
0
                case PGT_l4_page_table:
3635
0
                    rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
3636
0
                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
3637
0
                    break;
3638
0
                case PGT_writable_page:
3639
0
                    perfc_incr(writable_mmu_updates);
3640
0
                    if ( paging_write_guest_entry(v, va, req.val, _mfn(mfn)) )
3641
0
                        rc = 0;
3642
0
                    break;
3643
0
                }
3644
0
                page_unlock(page);
3645
0
                if ( rc == -EINTR )
3646
0
                    rc = -ERESTART;
3647
0
            }
3648
0
            else if ( get_page_type(page, PGT_writable_page) )
3649
0
            {
3650
0
                perfc_incr(writable_mmu_updates);
3651
0
                if ( paging_write_guest_entry(v, va, req.val, _mfn(mfn)) )
3652
0
                    rc = 0;
3653
0
                put_page_type(page);
3654
0
            }
3655
0
3656
0
            put_page(page);
3657
0
        }
3658
0
        break;
3659
0
3660
0
        case MMU_MACHPHYS_UPDATE:
3661
0
            if ( unlikely(d != pt_owner) )
3662
0
            {
3663
0
                rc = -EPERM;
3664
0
                break;
3665
0
            }
3666
0
3667
0
            if ( unlikely(paging_mode_translate(pg_owner)) )
3668
0
            {
3669
0
                rc = -EINVAL;
3670
0
                break;
3671
0
            }
3672
0
3673
0
            mfn = req.ptr >> PAGE_SHIFT;
3674
0
            gpfn = req.val;
3675
0
3676
0
            xsm_needed |= XSM_MMU_MACHPHYS_UPDATE;
3677
0
            if ( xsm_needed != xsm_checked )
3678
0
            {
3679
0
                rc = xsm_mmu_update(XSM_TARGET, d, NULL, pg_owner, xsm_needed);
3680
0
                if ( rc )
3681
0
                    break;
3682
0
                xsm_checked = xsm_needed;
3683
0
            }
3684
0
3685
0
            if ( unlikely(!get_page_from_mfn(_mfn(mfn), pg_owner)) )
3686
0
            {
3687
0
                gdprintk(XENLOG_WARNING,
3688
0
                         "Could not get page for mach->phys update\n");
3689
0
                rc = -EINVAL;
3690
0
                break;
3691
0
            }
3692
0
3693
0
            set_gpfn_from_mfn(mfn, gpfn);
3694
0
3695
0
            paging_mark_dirty(pg_owner, _mfn(mfn));
3696
0
3697
0
            put_page(mfn_to_page(_mfn(mfn)));
3698
0
            break;
3699
0
3700
0
        default:
3701
0
            rc = -ENOSYS;
3702
0
            break;
3703
0
        }
3704
0
3705
0
        if ( unlikely(rc) )
3706
0
            break;
3707
0
3708
0
        guest_handle_add_offset(ureqs, 1);
3709
0
    }
3710
0
3711
0
    if ( rc == -ERESTART )
3712
0
    {
3713
0
        ASSERT(i < count);
3714
0
        rc = hypercall_create_continuation(
3715
0
            __HYPERVISOR_mmu_update, "hihi",
3716
0
            ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
3717
0
    }
3718
0
    else if ( curr->arch.old_guest_table )
3719
0
    {
3720
0
        XEN_GUEST_HANDLE_PARAM(void) null;
3721
0
3722
0
        ASSERT(rc || i == count);
3723
0
        set_xen_guest_handle(null, NULL);
3724
0
        /*
3725
0
         * In order to have a way to communicate the final return value to
3726
0
         * our continuation, we pass this in place of "foreigndom", building
3727
0
         * on the fact that this argument isn't needed anymore.
3728
0
         */
3729
0
        rc = hypercall_create_continuation(
3730
0
                __HYPERVISOR_mmu_update, "hihi", null,
3731
0
                MMU_UPDATE_PREEMPTED, null, rc);
3732
0
    }
3733
0
3734
0
    put_pg_owner(pg_owner);
3735
0
3736
0
    if ( va )
3737
0
        unmap_domain_page(va);
3738
0
3739
0
    perfc_add(num_page_updates, i);
3740
0
3741
0
 out:
3742
0
    if ( pt_owner != d )
3743
0
        rcu_unlock_domain(pt_owner);
3744
0
3745
0
    /* Add incremental work we have done to the @done output parameter. */
3746
0
    if ( unlikely(!guest_handle_is_null(pdone)) )
3747
0
    {
3748
0
        done += i;
3749
0
        copy_to_guest(pdone, &done, 1);
3750
0
    }
3751
0
3752
0
    return rc;
3753
0
}
3754
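A request reaching the switch above carries its command in the low bits of req.ptr (cmd = req.ptr & (sizeof(l1_pgentry_t)-1)), with the remaining bits giving the machine address of the entry to modify. Here is a caller-side sketch of assembling such a request; the structure layout and command values mirror the public mmu_update interface but are reproduced locally, for illustration only.

    #include <stdint.h>

    #define MMU_NORMAL_PT_UPDATE      0  /* command values as in the public interface */
    #define MMU_MACHPHYS_UPDATE       1
    #define MMU_PT_UPDATE_PRESERVE_AD 2

    struct mmu_update_req {
        uint64_t ptr;   /* machine address of the PTE; low 3 bits = command */
        uint64_t val;   /* new contents of the PTE                          */
    };

    static struct mmu_update_req make_pt_update(uint64_t pte_maddr,
                                                uint64_t new_pte,
                                                unsigned int cmd)
    {
        struct mmu_update_req req;

        /* Clear the command bits of the address, then fold the command in. */
        req.ptr = (pte_maddr & ~(uint64_t)(sizeof(uint64_t) - 1)) | cmd;
        req.val = new_pte;
        return req;
    }

    int main(void)
    {
        struct mmu_update_req req =
            make_pt_update(0x12345008, 0xabc0e3, MMU_PT_UPDATE_PRESERVE_AD);

        return req.ptr == (0x12345008 | MMU_PT_UPDATE_PRESERVE_AD) ? 0 : 1;
    }

Batches of these requests are what the count/pdone/foreigndom arguments of do_mmu_update() iterate over.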
3755
int donate_page(
3756
    struct domain *d, struct page_info *page, unsigned int memflags)
3757
0
{
3758
0
    const struct domain *owner = dom_xen;
3759
0
3760
0
    spin_lock(&d->page_alloc_lock);
3761
0
3762
0
    if ( is_xen_heap_page(page) || ((owner = page_get_owner(page)) != NULL) )
3763
0
        goto fail;
3764
0
3765
0
    if ( d->is_dying )
3766
0
        goto fail;
3767
0
3768
0
    if ( page->count_info & ~(PGC_allocated | 1) )
3769
0
        goto fail;
3770
0
3771
0
    if ( !(memflags & MEMF_no_refcount) )
3772
0
    {
3773
0
        if ( d->tot_pages >= d->max_pages )
3774
0
            goto fail;
3775
0
        domain_adjust_tot_pages(d, 1);
3776
0
    }
3777
0
3778
0
    page->count_info = PGC_allocated | 1;
3779
0
    page_set_owner(page, d);
3780
0
    page_list_add_tail(page, &d->page_list);
3781
0
3782
0
    spin_unlock(&d->page_alloc_lock);
3783
0
    return 0;
3784
0
3785
0
 fail:
3786
0
    spin_unlock(&d->page_alloc_lock);
3787
0
    gdprintk(XENLOG_WARNING, "Bad donate mfn %" PRI_mfn
3788
0
             " to d%d (owner d%d) caf=%08lx taf=%" PRtype_info "\n",
3789
0
             mfn_x(page_to_mfn(page)), d->domain_id,
3790
0
             owner ? owner->domain_id : DOMID_INVALID,
3791
0
             page->count_info, page->u.inuse.type_info);
3792
0
    return -EINVAL;
3793
0
}
3794
3795
int steal_page(
3796
    struct domain *d, struct page_info *page, unsigned int memflags)
3797
0
{
3798
0
    unsigned long x, y;
3799
0
    bool drop_dom_ref = false;
3800
0
    const struct domain *owner = dom_xen;
3801
0
3802
0
    if ( paging_mode_external(d) )
3803
0
        return -EOPNOTSUPP;
3804
0
3805
0
    spin_lock(&d->page_alloc_lock);
3806
0
3807
0
    if ( is_xen_heap_page(page) || ((owner = page_get_owner(page)) != d) )
3808
0
        goto fail;
3809
0
3810
0
    /*
3811
0
     * We require there is just one reference (PGC_allocated). We temporarily
3812
0
     * drop this reference now so that we can safely swizzle the owner.
3813
0
     */
3814
0
    y = page->count_info;
3815
0
    do {
3816
0
        x = y;
3817
0
        if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
3818
0
            goto fail;
3819
0
        y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask);
3820
0
    } while ( y != x );
3821
0
3822
0
    /*
3823
0
     * With the sole reference dropped temporarily, no-one can update type
3824
0
     * information. Type count also needs to be zero in this case, but e.g.
3825
0
     * PGT_seg_desc_page may still have PGT_validated set, which we need to
3826
0
     * clear before transferring ownership (as validation criteria vary
3827
0
     * depending on domain type).
3828
0
     */
3829
0
    BUG_ON(page->u.inuse.type_info & (PGT_count_mask | PGT_locked |
3830
0
                                      PGT_pinned));
3831
0
    page->u.inuse.type_info = 0;
3832
0
3833
0
    /* Swizzle the owner then reinstate the PGC_allocated reference. */
3834
0
    page_set_owner(page, NULL);
3835
0
    y = page->count_info;
3836
0
    do {
3837
0
        x = y;
3838
0
        BUG_ON((x & (PGC_count_mask|PGC_allocated)) != PGC_allocated);
3839
0
    } while ( (y = cmpxchg(&page->count_info, x, x | 1)) != x );
3840
0
3841
0
    /* Unlink from original owner. */
3842
0
    if ( !(memflags & MEMF_no_refcount) && !domain_adjust_tot_pages(d, -1) )
3843
0
        drop_dom_ref = true;
3844
0
    page_list_del(page, &d->page_list);
3845
0
3846
0
    spin_unlock(&d->page_alloc_lock);
3847
0
    if ( unlikely(drop_dom_ref) )
3848
0
        put_domain(d);
3849
0
    return 0;
3850
0
3851
0
 fail:
3852
0
    spin_unlock(&d->page_alloc_lock);
3853
0
    gdprintk(XENLOG_WARNING, "Bad steal mfn %" PRI_mfn
3854
0
             " from d%d (owner d%d) caf=%08lx taf=%" PRtype_info "\n",
3855
0
             mfn_x(page_to_mfn(page)), d->domain_id,
3856
0
             owner ? owner->domain_id : DOMID_INVALID,
3857
0
             page->count_info, page->u.inuse.type_info);
3858
0
    return -EINVAL;
3859
0
}
3860
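Both do/while loops above are instances of the usual compare-and-swap retry idiom: read count_info, compute the desired new value, and retry if another CPU changed the word in the meantime. A stand-alone sketch of the first loop (dropping the sole reference while keeping PGC_allocated), written with C11 atomics and with the PGC_* bit positions assumed for the example:

    #include <stdatomic.h>
    #include <stdbool.h>

    #define PGC_allocated  (1ul << 63)        /* assumed 64-bit bit positions */
    #define PGC_count_mask ((1ul << 24) - 1)

    static bool drop_sole_ref(_Atomic unsigned long *count_info)
    {
        unsigned long x = atomic_load(count_info);

        do {
            /* Require exactly one reference plus the allocated flag. */
            if ( (x & (PGC_count_mask | PGC_allocated)) != (1 | PGC_allocated) )
                return false;
            /* Retry until no other CPU changed count_info underneath us. */
        } while ( !atomic_compare_exchange_weak(count_info, &x,
                                                x & ~PGC_count_mask) );

        return true;
    }

    int main(void)
    {
        _Atomic unsigned long count_info = PGC_allocated | 1;

        return drop_sole_ref(&count_info) ? 0 : 1;
    }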
3861
static int __do_update_va_mapping(
3862
    unsigned long va, u64 val64, unsigned long flags, struct domain *pg_owner)
3863
0
{
3864
0
    l1_pgentry_t   val = l1e_from_intpte(val64);
3865
0
    struct vcpu   *v   = current;
3866
0
    struct domain *d   = v->domain;
3867
0
    struct page_info *gl1pg;
3868
0
    l1_pgentry_t  *pl1e;
3869
0
    unsigned long  bmap_ptr;
3870
0
    mfn_t          gl1mfn;
3871
0
    cpumask_t     *mask = NULL;
3872
0
    int            rc;
3873
0
3874
0
    perfc_incr(calls_to_update_va);
3875
0
3876
0
    rc = xsm_update_va_mapping(XSM_TARGET, d, pg_owner, val);
3877
0
    if ( rc )
3878
0
        return rc;
3879
0
3880
0
    rc = -EINVAL;
3881
0
    pl1e = map_guest_l1e(va, &gl1mfn);
3882
0
    if ( unlikely(!pl1e || !get_page_from_mfn(gl1mfn, d)) )
3883
0
        goto out;
3884
0
3885
0
    gl1pg = mfn_to_page(gl1mfn);
3886
0
    if ( !page_lock(gl1pg) )
3887
0
    {
3888
0
        put_page(gl1pg);
3889
0
        goto out;
3890
0
    }
3891
0
3892
0
    if ( (gl1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3893
0
    {
3894
0
        page_unlock(gl1pg);
3895
0
        put_page(gl1pg);
3896
0
        goto out;
3897
0
    }
3898
0
3899
0
    rc = mod_l1_entry(pl1e, val, mfn_x(gl1mfn), 0, v, pg_owner);
3900
0
3901
0
    page_unlock(gl1pg);
3902
0
    put_page(gl1pg);
3903
0
3904
0
 out:
3905
0
    if ( pl1e )
3906
0
        unmap_domain_page(pl1e);
3907
0
3908
0
    switch ( flags & UVMF_FLUSHTYPE_MASK )
3909
0
    {
3910
0
    case UVMF_TLB_FLUSH:
3911
0
        switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3912
0
        {
3913
0
        case UVMF_LOCAL:
3914
0
            flush_tlb_local();
3915
0
            break;
3916
0
        case UVMF_ALL:
3917
0
            mask = d->domain_dirty_cpumask;
3918
0
            break;
3919
0
        default:
3920
0
            mask = this_cpu(scratch_cpumask);
3921
0
            rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
3922
0
                                                                     void),
3923
0
                                      mask);
3924
0
            break;
3925
0
        }
3926
0
        if ( mask )
3927
0
            flush_tlb_mask(mask);
3928
0
        break;
3929
0
3930
0
    case UVMF_INVLPG:
3931
0
        switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3932
0
        {
3933
0
        case UVMF_LOCAL:
3934
0
            paging_invlpg(v, va);
3935
0
            break;
3936
0
        case UVMF_ALL:
3937
0
            mask = d->domain_dirty_cpumask;
3938
0
            break;
3939
0
        default:
3940
0
            mask = this_cpu(scratch_cpumask);
3941
0
            rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
3942
0
                                                                     void),
3943
0
                                      mask);
3944
0
            break;
3945
0
        }
3946
0
        if ( mask )
3947
0
            flush_tlb_one_mask(mask, va);
3948
0
        break;
3949
0
    }
3950
0
3951
0
    return rc;
3952
0
}
3953
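The flags word handled at the end of __do_update_va_mapping() packs two things: the flush type in its low bits (UVMF_FLUSHTYPE_MASK) and, outside the LOCAL/ALL cases, a guest pointer to a vCPU bitmap in the remaining bits. A small decode sketch follows; the UVMF_* values track the public interface but should be treated as assumptions here.

    #include <stdio.h>

    #define UVMF_NONE           0UL          /* assumed public-interface values */
    #define UVMF_TLB_FLUSH      1UL
    #define UVMF_INVLPG         2UL
    #define UVMF_FLUSHTYPE_MASK 3UL
    #define UVMF_LOCAL          (0UL << 2)
    #define UVMF_ALL            (1UL << 2)

    static void decode_uvmf(unsigned long flags)
    {
        unsigned long type     = flags & UVMF_FLUSHTYPE_MASK;
        unsigned long selector = flags & ~UVMF_FLUSHTYPE_MASK;

        printf("flush=%s target=%s\n",
               type == UVMF_TLB_FLUSH ? "tlb" :
               type == UVMF_INVLPG    ? "invlpg" : "none",
               selector == UVMF_ALL   ? "all vcpus" :
               selector == UVMF_LOCAL ? "local" : "vcpu bitmap");
    }

    int main(void)
    {
        decode_uvmf(UVMF_INVLPG | UVMF_ALL);   /* prints: flush=invlpg target=all vcpus */
        return 0;
    }

Because a bitmap pointer shares the word with the flush type, its two low bits are masked away, so in practice a guest-supplied bitmap needs to be at least 4-byte aligned.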
3954
long do_update_va_mapping(unsigned long va, u64 val64,
3955
                          unsigned long flags)
3956
0
{
3957
0
    return __do_update_va_mapping(va, val64, flags, current->domain);
3958
0
}
3959
3960
long do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
3961
                                      unsigned long flags,
3962
                                      domid_t domid)
3963
0
{
3964
0
    struct domain *pg_owner;
3965
0
    int rc;
3966
0
3967
0
    if ( (pg_owner = get_pg_owner(domid)) == NULL )
3968
0
        return -ESRCH;
3969
0
3970
0
    rc = __do_update_va_mapping(va, val64, flags, pg_owner);
3971
0
3972
0
    put_pg_owner(pg_owner);
3973
0
3974
0
    return rc;
3975
0
}
3976
3977
typedef struct e820entry e820entry_t;
3978
DEFINE_XEN_GUEST_HANDLE(e820entry_t);
3979
3980
struct memory_map_context
3981
{
3982
    unsigned int n;
3983
    unsigned long s;
3984
    struct xen_memory_map map;
3985
};
3986
3987
static int _handle_iomem_range(unsigned long s, unsigned long e,
3988
                               struct memory_map_context *ctxt)
3989
0
{
3990
0
    if ( s > ctxt->s && !(s >> (paddr_bits - PAGE_SHIFT)) )
3991
0
    {
3992
0
        e820entry_t ent;
3993
0
        XEN_GUEST_HANDLE_PARAM(e820entry_t) buffer_param;
3994
0
        XEN_GUEST_HANDLE(e820entry_t) buffer;
3995
0
3996
0
        if ( !guest_handle_is_null(ctxt->map.buffer) )
3997
0
        {
3998
0
            if ( ctxt->n + 1 >= ctxt->map.nr_entries )
3999
0
                return -EINVAL;
4000
0
            ent.addr = (uint64_t)ctxt->s << PAGE_SHIFT;
4001
0
            ent.size = (uint64_t)(s - ctxt->s) << PAGE_SHIFT;
4002
0
            ent.type = E820_RESERVED;
4003
0
            buffer_param = guest_handle_cast(ctxt->map.buffer, e820entry_t);
4004
0
            buffer = guest_handle_from_param(buffer_param, e820entry_t);
4005
0
            if ( __copy_to_guest_offset(buffer, ctxt->n, &ent, 1) )
4006
0
                return -EFAULT;
4007
0
        }
4008
0
        ctxt->n++;
4009
0
    }
4010
0
    ctxt->s = e + 1;
4011
0
4012
0
    return 0;
4013
0
}
4014
4015
static int handle_iomem_range(unsigned long s, unsigned long e, void *p)
4016
0
{
4017
0
    int err = 0;
4018
0
4019
0
    do {
4020
0
        unsigned long low = -1UL;
4021
0
        unsigned int i;
4022
0
4023
0
        for ( i = 0; i < nr_ioapics; ++i )
4024
0
        {
4025
0
            unsigned long mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr);
4026
0
4027
0
            if ( mfn >= s && mfn <= e && mfn < low )
4028
0
                low = mfn;
4029
0
        }
4030
0
        if ( !(low + 1) )
4031
0
            break;
4032
0
        if ( s < low )
4033
0
            err = _handle_iomem_range(s, low - 1, p);
4034
0
        s = low + 1;
4035
0
    } while ( !err );
4036
0
4037
0
    return err || s > e ? err : _handle_iomem_range(s, e, p);
4038
0
}
4039
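handle_iomem_range() above splits the requested frame range around the IO-APIC pages: it repeatedly finds the lowest such frame still inside [s, e], reports the portion below it, and resumes just past it. The same structure, reduced to a generic "punch holes in a range" helper (the hole list and the report callback are placeholders for this sketch):

    #include <stdio.h>

    static void report(unsigned long s, unsigned long e)
    {
        printf("range [%#lx, %#lx]\n", s, e);
    }

    static void split_around_holes(unsigned long s, unsigned long e,
                                   const unsigned long *holes, unsigned int nr)
    {
        while ( s <= e )
        {
            unsigned long low = -1UL;
            unsigned int i;

            /* Find the lowest hole still inside [s, e]. */
            for ( i = 0; i < nr; ++i )
                if ( holes[i] >= s && holes[i] <= e && holes[i] < low )
                    low = holes[i];

            if ( low == -1UL )
            {
                report(s, e);          /* no hole left: report the remainder */
                return;
            }
            if ( s < low )
                report(s, low - 1);    /* portion below the hole             */
            s = low + 1;               /* skip the hole frame itself         */
        }
    }

    int main(void)
    {
        const unsigned long holes[] = { 0xfec00 };   /* e.g. an IO-APIC frame */

        split_around_holes(0xfe000, 0xff000, holes, 1);
        return 0;
    }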
4040
int xenmem_add_to_physmap_one(
4041
    struct domain *d,
4042
    unsigned int space,
4043
    union xen_add_to_physmap_batch_extra extra,
4044
    unsigned long idx,
4045
    gfn_t gpfn)
4046
3
{
4047
3
    struct page_info *page = NULL;
4048
3
    unsigned long gfn = 0; /* gcc ... */
4049
3
    unsigned long prev_mfn, old_gpfn;
4050
3
    int rc = 0;
4051
3
    mfn_t mfn = INVALID_MFN;
4052
3
    p2m_type_t p2mt;
4053
3
4054
3
    switch ( space )
4055
3
    {
4056
2
        case XENMAPSPACE_shared_info:
4057
2
            if ( idx == 0 )
4058
2
                mfn = _mfn(virt_to_mfn(d->shared_info));
4059
2
            break;
4060
1
        case XENMAPSPACE_grant_table:
4061
1
            rc = gnttab_map_frame(d, idx, gpfn, &mfn);
4062
1
            if ( rc )
4063
0
                return rc;
4064
1
            break;
4065
0
        case XENMAPSPACE_gmfn_range:
4066
0
        case XENMAPSPACE_gmfn:
4067
0
        {
4068
0
            p2m_type_t p2mt;
4069
0
4070
0
            gfn = idx;
4071
0
            idx = mfn_x(get_gfn_unshare(d, idx, &p2mt));
4072
0
            /* If the page is still shared, exit early */
4073
0
            if ( p2m_is_shared(p2mt) )
4074
0
            {
4075
0
                put_gfn(d, gfn);
4076
0
                return -ENOMEM;
4077
0
            }
4078
0
            if ( !get_page_from_mfn(_mfn(idx), d) )
4079
0
                break;
4080
0
            mfn = _mfn(idx);
4081
0
            page = mfn_to_page(mfn);
4082
0
            break;
4083
0
        }
4084
0
        case XENMAPSPACE_gmfn_foreign:
4085
0
            return p2m_add_foreign(d, idx, gfn_x(gpfn), extra.foreign_domid);
4086
0
        default:
4087
0
            break;
4088
3
    }
4089
3
4090
3
    if ( !paging_mode_translate(d) || mfn_eq(mfn, INVALID_MFN) )
4091
0
    {
4092
0
        rc = -EINVAL;
4093
0
        goto put_both;
4094
0
    }
4095
3
4096
3
    /* Remove previously mapped page if it was present. */
4097
3
    prev_mfn = mfn_x(get_gfn(d, gfn_x(gpfn), &p2mt));
4098
3
    if ( mfn_valid(_mfn(prev_mfn)) )
4099
2
    {
4100
2
        if ( is_xen_heap_mfn(prev_mfn) )
4101
2
            /* Xen heap frames are simply unhooked from this phys slot. */
4102
1
            rc = guest_physmap_remove_page(d, gpfn, _mfn(prev_mfn), PAGE_ORDER_4K);
4103
2
        else
4104
2
            /* Normal domain memory is freed, to avoid leaking memory. */
4105
1
            rc = guest_remove_page(d, gfn_x(gpfn));
4106
2
    }
4107
3
    /* In the XENMAPSPACE_gmfn case we still hold a ref on the old page. */
4108
3
    put_gfn(d, gfn_x(gpfn));
4109
3
4110
3
    if ( rc )
4111
0
        goto put_both;
4112
3
4113
3
    /* Unmap from old location, if any. */
4114
3
    old_gpfn = get_gpfn_from_mfn(mfn_x(mfn));
4115
3
    ASSERT( old_gpfn != SHARED_M2P_ENTRY );
4116
3
    if ( space == XENMAPSPACE_gmfn || space == XENMAPSPACE_gmfn_range )
4117
0
        ASSERT( old_gpfn == gfn );
4118
3
    if ( old_gpfn != INVALID_M2P_ENTRY )
4119
0
        rc = guest_physmap_remove_page(d, _gfn(old_gpfn), mfn, PAGE_ORDER_4K);
4120
3
4121
3
    /* Map at new location. */
4122
3
    if ( !rc )
4123
3
        rc = guest_physmap_add_page(d, gpfn, mfn, PAGE_ORDER_4K);
4124
3
4125
3
 put_both:
4126
3
    /* In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top. */
4127
3
    if ( space == XENMAPSPACE_gmfn || space == XENMAPSPACE_gmfn_range )
4128
0
        put_gfn(d, gfn);
4129
3
4130
3
    if ( page )
4131
0
        put_page(page);
4132
3
4133
3
    return rc;
4134
3
}
4135
4136
long arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
4137
1
{
4138
1
    int rc;
4139
1
4140
1
    switch ( cmd )
4141
1
    {
4142
0
    case XENMEM_set_memory_map:
4143
0
    {
4144
0
        struct xen_foreign_memory_map fmap;
4145
0
        struct domain *d;
4146
0
        struct e820entry *e820;
4147
0
4148
0
        if ( copy_from_guest(&fmap, arg, 1) )
4149
0
            return -EFAULT;
4150
0
4151
0
        if ( fmap.map.nr_entries > E820MAX )
4152
0
            return -EINVAL;
4153
0
4154
0
        d = rcu_lock_domain_by_any_id(fmap.domid);
4155
0
        if ( d == NULL )
4156
0
            return -ESRCH;
4157
0
4158
0
        rc = xsm_domain_memory_map(XSM_TARGET, d);
4159
0
        if ( rc )
4160
0
        {
4161
0
            rcu_unlock_domain(d);
4162
0
            return rc;
4163
0
        }
4164
0
4165
0
        e820 = xmalloc_array(e820entry_t, fmap.map.nr_entries);
4166
0
        if ( e820 == NULL )
4167
0
        {
4168
0
            rcu_unlock_domain(d);
4169
0
            return -ENOMEM;
4170
0
        }
4171
0
4172
0
        if ( copy_from_guest(e820, fmap.map.buffer, fmap.map.nr_entries) )
4173
0
        {
4174
0
            xfree(e820);
4175
0
            rcu_unlock_domain(d);
4176
0
            return -EFAULT;
4177
0
        }
4178
0
4179
0
        spin_lock(&d->arch.e820_lock);
4180
0
        xfree(d->arch.e820);
4181
0
        d->arch.e820 = e820;
4182
0
        d->arch.nr_e820 = fmap.map.nr_entries;
4183
0
        spin_unlock(&d->arch.e820_lock);
4184
0
4185
0
        rcu_unlock_domain(d);
4186
0
        return rc;
4187
0
    }
4188
0
4189
1
    case XENMEM_memory_map:
4190
1
    {
4191
1
        struct xen_memory_map map;
4192
1
        struct domain *d = current->domain;
4193
1
4194
1
        if ( copy_from_guest(&map, arg, 1) )
4195
0
            return -EFAULT;
4196
1
4197
1
        spin_lock(&d->arch.e820_lock);
4198
1
4199
1
        /* Backwards compatibility. */
4200
1
        if ( (d->arch.nr_e820 == 0) || (d->arch.e820 == NULL) )
4201
0
        {
4202
0
            spin_unlock(&d->arch.e820_lock);
4203
0
            return -ENOSYS;
4204
0
        }
4205
1
4206
1
        map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
4207
1
        if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
4208
1
             __copy_to_guest(arg, &map, 1) )
4209
0
        {
4210
0
            spin_unlock(&d->arch.e820_lock);
4211
0
            return -EFAULT;
4212
0
        }
4213
1
4214
1
        spin_unlock(&d->arch.e820_lock);
4215
1
        return 0;
4216
1
    }
4217
1
4218
0
    case XENMEM_machine_memory_map:
4219
0
    {
4220
0
        struct memory_map_context ctxt;
4221
0
        XEN_GUEST_HANDLE(e820entry_t) buffer;
4222
0
        XEN_GUEST_HANDLE_PARAM(e820entry_t) buffer_param;
4223
0
        unsigned int i;
4224
0
        bool store;
4225
0
4226
0
        rc = xsm_machine_memory_map(XSM_PRIV);
4227
0
        if ( rc )
4228
0
            return rc;
4229
0
4230
0
        if ( copy_from_guest(&ctxt.map, arg, 1) )
4231
0
            return -EFAULT;
4232
0
4233
0
        store = !guest_handle_is_null(ctxt.map.buffer);
4234
0
4235
0
        if ( store && ctxt.map.nr_entries < e820.nr_map + 1 )
4236
0
            return -EINVAL;
4237
0
4238
0
        buffer_param = guest_handle_cast(ctxt.map.buffer, e820entry_t);
4239
0
        buffer = guest_handle_from_param(buffer_param, e820entry_t);
4240
0
        if ( store && !guest_handle_okay(buffer, ctxt.map.nr_entries) )
4241
0
            return -EFAULT;
4242
0
4243
0
        for ( i = 0, ctxt.n = 0, ctxt.s = 0; i < e820.nr_map; ++i, ++ctxt.n )
4244
0
        {
4245
0
            unsigned long s = PFN_DOWN(e820.map[i].addr);
4246
0
4247
0
            if ( s > ctxt.s )
4248
0
            {
4249
0
                rc = rangeset_report_ranges(current->domain->iomem_caps,
4250
0
                                            ctxt.s, s - 1,
4251
0
                                            handle_iomem_range, &ctxt);
4252
0
                if ( !rc )
4253
0
                    rc = handle_iomem_range(s, s, &ctxt);
4254
0
                if ( rc )
4255
0
                    return rc;
4256
0
            }
4257
0
            if ( store )
4258
0
            {
4259
0
                if ( ctxt.map.nr_entries <= ctxt.n + (e820.nr_map - i) )
4260
0
                    return -EINVAL;
4261
0
                if ( __copy_to_guest_offset(buffer, ctxt.n, e820.map + i, 1) )
4262
0
                    return -EFAULT;
4263
0
            }
4264
0
            ctxt.s = PFN_UP(e820.map[i].addr + e820.map[i].size);
4265
0
        }
4266
0
4267
0
        if ( ctxt.s )
4268
0
        {
4269
0
            rc = rangeset_report_ranges(current->domain->iomem_caps, ctxt.s,
4270
0
                                        ~0UL, handle_iomem_range, &ctxt);
4271
0
            if ( !rc && ctxt.s )
4272
0
                rc = handle_iomem_range(~0UL, ~0UL, &ctxt);
4273
0
            if ( rc )
4274
0
                return rc;
4275
0
        }
4276
0
4277
0
        ctxt.map.nr_entries = ctxt.n;
4278
0
4279
0
        if ( __copy_to_guest(arg, &ctxt.map, 1) )
4280
0
            return -EFAULT;
4281
0
4282
0
        return 0;
4283
0
    }
4284
0
4285
0
    case XENMEM_machphys_mapping:
4286
0
    {
4287
0
        struct xen_machphys_mapping mapping = {
4288
0
            .v_start = MACH2PHYS_VIRT_START,
4289
0
            .v_end   = MACH2PHYS_VIRT_END,
4290
0
            .max_mfn = MACH2PHYS_NR_ENTRIES - 1
4291
0
        };
4292
0
4293
0
        if ( !mem_hotplug && is_hardware_domain(current->domain) )
4294
0
            mapping.max_mfn = max_page - 1;
4295
0
        if ( copy_to_guest(arg, &mapping, 1) )
4296
0
            return -EFAULT;
4297
0
4298
0
        return 0;
4299
0
    }
4300
0
4301
0
    case XENMEM_set_pod_target:
4302
0
    case XENMEM_get_pod_target:
4303
0
    {
4304
0
        xen_pod_target_t target;
4305
0
        struct domain *d;
4306
0
        struct p2m_domain *p2m;
4307
0
4308
0
        if ( copy_from_guest(&target, arg, 1) )
4309
0
            return -EFAULT;
4310
0
4311
0
        d = rcu_lock_domain_by_any_id(target.domid);
4312
0
        if ( d == NULL )
4313
0
            return -ESRCH;
4314
0
4315
0
        if ( cmd == XENMEM_set_pod_target )
4316
0
            rc = xsm_set_pod_target(XSM_PRIV, d);
4317
0
        else
4318
0
            rc = xsm_get_pod_target(XSM_PRIV, d);
4319
0
4320
0
        if ( rc != 0 )
4321
0
            goto pod_target_out_unlock;
4322
0
4323
0
        if ( cmd == XENMEM_set_pod_target )
4324
0
        {
4325
0
            if ( target.target_pages > d->max_pages )
4326
0
            {
4327
0
                rc = -EINVAL;
4328
0
                goto pod_target_out_unlock;
4329
0
            }
4330
0
4331
0
            rc = p2m_pod_set_mem_target(d, target.target_pages);
4332
0
        }
4333
0
4334
0
        if ( rc == -ERESTART )
4335
0
        {
4336
0
            rc = hypercall_create_continuation(
4337
0
                __HYPERVISOR_memory_op, "lh", cmd, arg);
4338
0
        }
4339
0
        else if ( rc >= 0 )
4340
0
        {
4341
0
            p2m = p2m_get_hostp2m(d);
4342
0
            target.tot_pages       = d->tot_pages;
4343
0
            target.pod_cache_pages = p2m->pod.count;
4344
0
            target.pod_entries     = p2m->pod.entry_count;
4345
0
4346
0
            if ( __copy_to_guest(arg, &target, 1) )
4347
0
            {
4348
0
                rc = -EFAULT;
4349
0
                goto pod_target_out_unlock;
4350
0
            }
4351
0
        }
4352
0
4353
0
    pod_target_out_unlock:
4354
0
        rcu_unlock_domain(d);
4355
0
        return rc;
4356
0
    }
4357
0
4358
0
    default:
4359
0
        return subarch_memory_op(cmd, arg);
4360
1
    }
4361
1
4362
0
    return 0;
4363
1
}
4364
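For the XENMEM_memory_map case above, the guest supplies a buffer plus its capacity and gets back a (possibly clamped) number of e820 entries. A caller-side sketch of preparing that request; the structures are simplified stand-ins for the public headers and the hypercall invocation itself is elided.

    #include <stdint.h>
    #include <string.h>

    struct e820entry_sketch {
        uint64_t addr;
        uint64_t size;
        uint32_t type;
    };

    struct xen_memory_map_sketch {
        unsigned int nr_entries;           /* in: buffer capacity, out: entries used */
        struct e820entry_sketch *buffer;   /* guest buffer to be filled              */
    };

    static void prepare_memory_map_request(struct xen_memory_map_sketch *map,
                                           struct e820entry_sketch *buf,
                                           unsigned int capacity)
    {
        memset(buf, 0, capacity * sizeof(*buf));
        map->nr_entries = capacity;        /* the handler clamps this to its e820 */
        map->buffer = buf;
        /* ...then pass to the memory_op hypercall with cmd XENMEM_memory_map... */
    }

    int main(void)
    {
        struct e820entry_sketch buf[128];
        struct xen_memory_map_sketch map;

        prepare_memory_map_request(&map, buf, 128);
        return map.nr_entries != 128;
    }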
4365
int mmio_ro_emulated_write(
4366
    enum x86_segment seg,
4367
    unsigned long offset,
4368
    void *p_data,
4369
    unsigned int bytes,
4370
    struct x86_emulate_ctxt *ctxt)
4371
0
{
4372
0
    struct mmio_ro_emulate_ctxt *mmio_ro_ctxt = ctxt->data;
4373
0
4374
0
    /* Only allow naturally-aligned stores at the original %cr2 address. */
4375
0
    if ( ((bytes | offset) & (bytes - 1)) || !bytes ||
4376
0
         offset != mmio_ro_ctxt->cr2 )
4377
0
    {
4378
0
        gdprintk(XENLOG_WARNING, "bad access (cr2=%lx, addr=%lx, bytes=%u)\n",
4379
0
                mmio_ro_ctxt->cr2, offset, bytes);
4380
0
        return X86EMUL_UNHANDLEABLE;
4381
0
    }
4382
0
4383
0
    return X86EMUL_OKAY;
4384
0
}
4385
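The expression ((bytes | offset) & (bytes - 1)) used above is non-zero exactly when bytes is not a power of two or offset is not a multiple of bytes, i.e. when the store is not naturally aligned. A quick self-check of that identity:

    #include <assert.h>

    static int naturally_aligned(unsigned long offset, unsigned int bytes)
    {
        return bytes && !((bytes | offset) & (bytes - 1));
    }

    int main(void)
    {
        assert(naturally_aligned(0x1000, 4));    /* aligned 4-byte store      */
        assert(!naturally_aligned(0x1002, 4));   /* misaligned offset         */
        assert(!naturally_aligned(0x1000, 3));   /* width not a power of two  */
        return 0;
    }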
4386
int mmcfg_intercept_write(
4387
    enum x86_segment seg,
4388
    unsigned long offset,
4389
    void *p_data,
4390
    unsigned int bytes,
4391
    struct x86_emulate_ctxt *ctxt)
4392
0
{
4393
0
    struct mmio_ro_emulate_ctxt *mmio_ctxt = ctxt->data;
4394
0
4395
0
    /*
4396
0
     * Only allow naturally-aligned stores no wider than 4 bytes to the
4397
0
     * original %cr2 address.
4398
0
     */
4399
0
    if ( ((bytes | offset) & (bytes - 1)) || bytes > 4 || !bytes ||
4400
0
         offset != mmio_ctxt->cr2 )
4401
0
    {
4402
0
        gdprintk(XENLOG_WARNING, "bad write (cr2=%lx, addr=%lx, bytes=%u)\n",
4403
0
                mmio_ctxt->cr2, offset, bytes);
4404
0
        return X86EMUL_UNHANDLEABLE;
4405
0
    }
4406
0
4407
0
    offset &= 0xfff;
4408
0
    if ( pci_conf_write_intercept(mmio_ctxt->seg, mmio_ctxt->bdf,
4409
0
                                  offset, bytes, p_data) >= 0 )
4410
0
        pci_mmcfg_write(mmio_ctxt->seg, PCI_BUS(mmio_ctxt->bdf),
4411
0
                        PCI_DEVFN2(mmio_ctxt->bdf), offset, bytes,
4412
0
                        *(uint32_t *)p_data);
4413
0
4414
0
    return X86EMUL_OKAY;
4415
0
}
4416
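The offset &= 0xfff step above matches the standard ECAM/MMCFG layout, in which every device function owns a 4 KiB window: the low 12 bits select the configuration-space register, and the bits above select bus, device and function. A small decode of that layout, for reference only:

    #include <stdio.h>

    static void decode_mmcfg(unsigned long mmcfg_off)
    {
        unsigned int bus = (mmcfg_off >> 20) & 0xff;
        unsigned int dev = (mmcfg_off >> 15) & 0x1f;
        unsigned int fn  = (mmcfg_off >> 12) & 0x7;
        unsigned int reg = mmcfg_off & 0xfff;

        printf("%02x:%02x.%u reg %#05x\n", bus, dev, fn, reg);
    }

    int main(void)
    {
        decode_mmcfg(0x00a8c040);   /* bus 0x0a, dev 0x11, fn 4, reg 0x040 */
        return 0;
    }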
4417
void *alloc_xen_pagetable(void)
4418
31
{
4419
31
    if ( system_state != SYS_STATE_early_boot )
4420
18
    {
4421
18
        void *ptr = alloc_xenheap_page();
4422
18
4423
18
        BUG_ON(!hardware_domain && !ptr);
4424
18
        return ptr;
4425
18
    }
4426
31
4427
13
    return mfn_to_virt(mfn_x(alloc_boot_pages(1, 1)));
4428
31
}
4429
4430
void free_xen_pagetable(void *v)
4431
3
{
4432
3
    if ( system_state != SYS_STATE_early_boot )
4433
0
        free_xenheap_page(v);
4434
3
}
4435
4436
static DEFINE_SPINLOCK(map_pgdir_lock);
4437
4438
static l3_pgentry_t *virt_to_xen_l3e(unsigned long v)
4439
12.7k
{
4440
12.7k
    l4_pgentry_t *pl4e;
4441
12.7k
4442
12.7k
    pl4e = &idle_pg_table[l4_table_offset(v)];
4443
12.7k
    if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
4444
2
    {
4445
2
        bool locking = system_state > SYS_STATE_boot;
4446
2
        l3_pgentry_t *pl3e = alloc_xen_pagetable();
4447
2
4448
2
        if ( !pl3e )
4449
0
            return NULL;
4450
2
        clear_page(pl3e);
4451
2
        if ( locking )
4452
0
            spin_lock(&map_pgdir_lock);
4453
2
        if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
4454
2
        {
4455
2
            l4_pgentry_t l4e = l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR);
4456
2
4457
2
            l4e_write(pl4e, l4e);
4458
2
            efi_update_l4_pgtable(l4_table_offset(v), l4e);
4459
2
            pl3e = NULL;
4460
2
        }
4461
2
        if ( locking )
4462
0
            spin_unlock(&map_pgdir_lock);
4463
2
        if ( pl3e )
4464
0
            free_xen_pagetable(pl3e);
4465
2
    }
4466
12.7k
4467
12.7k
    return l4e_to_l3e(*pl4e) + l3_table_offset(v);
4468
12.7k
}
4469
4470
static l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
4471
4.60k
{
4472
4.60k
    l3_pgentry_t *pl3e;
4473
4.60k
4474
4.60k
    pl3e = virt_to_xen_l3e(v);
4475
4.60k
    if ( !pl3e )
4476
0
        return NULL;
4477
4.60k
4478
4.60k
    if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
4479
7
    {
4480
7
        bool locking = system_state > SYS_STATE_boot;
4481
7
        l2_pgentry_t *pl2e = alloc_xen_pagetable();
4482
7
4483
7
        if ( !pl2e )
4484
0
            return NULL;
4485
7
        clear_page(pl2e);
4486
7
        if ( locking )
4487
0
            spin_lock(&map_pgdir_lock);
4488
7
        if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
4489
7
        {
4490
7
            l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR));
4491
7
            pl2e = NULL;
4492
7
        }
4493
7
        if ( locking )
4494
0
            spin_unlock(&map_pgdir_lock);
4495
7
        if ( pl2e )
4496
0
            free_xen_pagetable(pl2e);
4497
7
    }
4498
4.60k
4499
4.60k
    BUG_ON(l3e_get_flags(*pl3e) & _PAGE_PSE);
4500
4.60k
    return l3e_to_l2e(*pl3e) + l2_table_offset(v);
4501
4.60k
}
4502
4503
l1_pgentry_t *virt_to_xen_l1e(unsigned long v)
4504
13
{
4505
13
    l2_pgentry_t *pl2e;
4506
13
4507
13
    pl2e = virt_to_xen_l2e(v);
4508
13
    if ( !pl2e )
4509
0
        return NULL;
4510
13
4511
13
    if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4512
13
    {
4513
13
        bool locking = system_state > SYS_STATE_boot;
4514
13
        l1_pgentry_t *pl1e = alloc_xen_pagetable();
4515
13
4516
13
        if ( !pl1e )
4517
0
            return NULL;
4518
13
        clear_page(pl1e);
4519
13
        if ( locking )
4520
0
            spin_lock(&map_pgdir_lock);
4521
13
        if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4522
13
        {
4523
13
            l2e_write(pl2e, l2e_from_paddr(__pa(pl1e), __PAGE_HYPERVISOR));
4524
13
            pl1e = NULL;
4525
13
        }
4526
13
        if ( locking )
4527
0
            spin_unlock(&map_pgdir_lock);
4528
13
        if ( pl1e )
4529
0
            free_xen_pagetable(pl1e);
4530
13
    }
4531
13
4532
13
    BUG_ON(l2e_get_flags(*pl2e) & _PAGE_PSE);
4533
13
    return l2e_to_l1e(*pl2e) + l1_table_offset(v);
4534
13
}
4535
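virt_to_xen_l3e(), virt_to_xen_l2e() and virt_to_xen_l1e() below all follow the same populate-once pattern: check the entry without the lock, allocate a fresh table, re-check under map_pgdir_lock, and either install the new table or free it because another CPU won the race. A reduced sketch of that idiom, with a pthread mutex standing in for the spinlock and calloc() for alloc_xen_pagetable()/clear_page():

    #include <pthread.h>
    #include <stdlib.h>

    static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;

    static void *populate_once(void **slot)
    {
        void *fresh;

        if ( *slot )                   /* fast path: already populated       */
            return *slot;

        fresh = calloc(1, 4096);       /* allocate outside the lock          */
        if ( !fresh )
            return NULL;

        pthread_mutex_lock(&slot_lock);
        if ( !*slot )                  /* somebody may have raced with us    */
        {
            *slot = fresh;
            fresh = NULL;              /* ownership transferred              */
        }
        pthread_mutex_unlock(&slot_lock);

        free(fresh);                   /* lost the race: discard our copy    */
        return *slot;
    }

    int main(void)
    {
        static void *slot;

        return populate_once(&slot) ? 0 : 1;
    }

Allocating before taking the lock keeps the allocation out of the critical section; the price is the occasional wasted page when two CPUs populate the same slot at once.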
4536
/* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
4537
1.85k
#define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) |  _PAGE_PSE) : (f))
4538
4
#define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))
4539
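A quick illustration of the two macros above, using the architectural bit values (_PAGE_PRESENT = 0x001, _PAGE_PSE = 0x080): present flag words gain or lose the PSE bit, non-present ones pass through unchanged. The SK_-prefixed names are local stand-ins so the sketch stays self-contained.

    #include <assert.h>

    #define SK_PAGE_PRESENT 0x001u
    #define SK_PAGE_PSE     0x080u

    #define sk_l1f_to_lNf(f) (((f) & SK_PAGE_PRESENT) ? ((f) |  SK_PAGE_PSE) : (f))
    #define sk_lNf_to_l1f(f) (((f) & SK_PAGE_PRESENT) ? ((f) & ~SK_PAGE_PSE) : (f))

    int main(void)
    {
        assert(sk_l1f_to_lNf(0x063u) == 0x0e3u);   /* present/RW/A/D gains PSE */
        assert(sk_lNf_to_l1f(0x0e3u) == 0x063u);   /* and the reverse          */
        assert(sk_l1f_to_lNf(0x000u) == 0x000u);   /* non-present: unchanged   */
        return 0;
    }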
4540
/*
4541
 * map_pages_to_xen() can be called with interrupts disabled during
4542
 * early bootstrap. In this case it is safe to use flush_area_local()
4543
 * and avoid locking because only the local CPU is online.
4544
 */
4545
57
#define flush_area(v,f) (!local_irq_is_enabled() ?              \
4546
40
                         flush_area_local((const void *)v, f) : \
4547
17
                         flush_area_all((const void *)v, f))
4548
4549
int map_pages_to_xen(
4550
    unsigned long virt,
4551
    unsigned long mfn,
4552
    unsigned long nr_mfns,
4553
    unsigned int flags)
4554
231
{
4555
231
    bool locking = system_state > SYS_STATE_boot;
4556
231
    l2_pgentry_t *pl2e, ol2e;
4557
231
    l1_pgentry_t *pl1e, ol1e;
4558
231
    unsigned int  i;
4559
231
4560
41
#define flush_flags(oldf) do {                 \
4561
2
    unsigned int o_ = (oldf);                  \
4562
41
    if ( (o_) & _PAGE_GLOBAL )                 \
4563
41
        flush_flags |= FLUSH_TLB_GLOBAL;       \
4564
41
    if ( (flags & _PAGE_PRESENT) &&            \
4565
28
         (((o_) ^ flags) & PAGE_CACHE_ATTRS) ) \
4566
0
    {                                          \
4567
0
        flush_flags |= FLUSH_CACHE;            \
4568
0
        if ( virt >= DIRECTMAP_VIRT_START &&   \
4569
0
             virt < HYPERVISOR_VIRT_END )      \
4570
0
            flush_flags |= FLUSH_VA_VALID;     \
4571
0
    }                                          \
4572
41
} while (0)
4573
231
4574
4.83k
    while ( nr_mfns != 0 )
4575
4.60k
    {
4576
4.60k
        l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt);
4577
4.60k
4578
4.60k
        if ( !pl3e )
4579
0
            return -ENOMEM;
4580
4.60k
        ol3e = *pl3e;
4581
4.60k
4582
4.60k
        if ( cpu_has_page1gb &&
4583
4.60k
             !(((virt >> PAGE_SHIFT) | mfn) &
4584
4.60k
               ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) &&
4585
16
             nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) &&
4586
14
             !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) )
4587
14
        {
4588
14
            /* 1GB-page mapping. */
4589
14
            l3e_write_atomic(pl3e, l3e_from_pfn(mfn, l1f_to_lNf(flags)));
4590
14
4591
14
            if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) )
4592
1
            {
4593
1
                unsigned int flush_flags =
4594
1
                    FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4595
1
4596
1
                if ( l3e_get_flags(ol3e) & _PAGE_PSE )
4597
0
                {
4598
0
                    flush_flags(lNf_to_l1f(l3e_get_flags(ol3e)));
4599
0
                    flush_area(virt, flush_flags);
4600
0
                }
4601
1
                else
4602
1
                {
4603
1
                    pl2e = l3e_to_l2e(ol3e);
4604
513
                    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4605
512
                    {
4606
512
                        ol2e = pl2e[i];
4607
512
                        if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4608
512
                            continue;
4609
0
                        if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4610
0
                            flush_flags(lNf_to_l1f(l2e_get_flags(ol2e)));
4611
0
                        else
4612
0
                        {
4613
0
                            unsigned int j;
4614
0
4615
0
                            pl1e = l2e_to_l1e(ol2e);
4616
0
                            for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ )
4617
0
                                flush_flags(l1e_get_flags(pl1e[j]));
4618
0
                        }
4619
0
                    }
4620
1
                    flush_area(virt, flush_flags);
4621
513
                    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4622
512
                    {
4623
512
                        ol2e = pl2e[i];
4624
512
                        if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) &&
4625
0
                             !(l2e_get_flags(ol2e) & _PAGE_PSE) )
4626
0
                            free_xen_pagetable(l2e_to_l1e(ol2e));
4627
512
                    }
4628
1
                    free_xen_pagetable(pl2e);
4629
1
                }
4630
1
            }
4631
14
4632
14
            virt    += 1UL << L3_PAGETABLE_SHIFT;
4633
14
            mfn     += 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4634
14
            nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4635
14
            continue;
4636
14
        }
4637
4.60k
4638
4.59k
        if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) &&
4639
4.58k
             (l3e_get_flags(ol3e) & _PAGE_PSE) )
4640
0
        {
4641
0
            unsigned int flush_flags =
4642
0
                FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4643
0
4644
0
            /* Skip this PTE if there is no change. */
4645
0
            if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES *
4646
0
                                         L1_PAGETABLE_ENTRIES - 1)) +
4647
0
                  (l2_table_offset(virt) << PAGETABLE_ORDER) +
4648
0
                  l1_table_offset(virt) == mfn) &&
4649
0
                 ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
4650
0
                  ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 )
4651
0
            {
4652
0
                /* We can skip to end of L3 superpage if we got a match. */
4653
0
                i = (1u << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4654
0
                    (mfn & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4655
0
                if ( i > nr_mfns )
4656
0
                    i = nr_mfns;
4657
0
                virt    += i << PAGE_SHIFT;
4658
0
                mfn     += i;
4659
0
                nr_mfns -= i;
4660
0
                continue;
4661
0
            }
4662
0
4663
0
            pl2e = alloc_xen_pagetable();
4664
0
            if ( pl2e == NULL )
4665
0
                return -ENOMEM;
4666
0
4667
0
            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4668
0
                l2e_write(pl2e + i,
4669
0
                          l2e_from_pfn(l3e_get_pfn(ol3e) +
4670
0
                                       (i << PAGETABLE_ORDER),
4671
0
                                       l3e_get_flags(ol3e)));
4672
0
4673
0
            if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4674
0
                flush_flags |= FLUSH_TLB_GLOBAL;
4675
0
4676
0
            if ( locking )
4677
0
                spin_lock(&map_pgdir_lock);
4678
0
            if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) &&
4679
0
                 (l3e_get_flags(*pl3e) & _PAGE_PSE) )
4680
0
            {
4681
0
                l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4682
0
                                                    __PAGE_HYPERVISOR));
4683
0
                pl2e = NULL;
4684
0
            }
4685
0
            if ( locking )
4686
0
                spin_unlock(&map_pgdir_lock);
4687
0
            flush_area(virt, flush_flags);
4688
0
            if ( pl2e )
4689
0
                free_xen_pagetable(pl2e);
4690
0
        }
4691
4.59k
4692
4.59k
        pl2e = virt_to_xen_l2e(virt);
4693
4.59k
        if ( !pl2e )
4694
0
            return -ENOMEM;
4695
4.59k
4696
4.59k
        if ( ((((virt >> PAGE_SHIFT) | mfn) &
4697
4.59k
               ((1u << PAGETABLE_ORDER) - 1)) == 0) &&
4698
1.26k
             (nr_mfns >= (1u << PAGETABLE_ORDER)) &&
4699
1.25k
             !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
4700
1.25k
        {
4701
1.25k
            /* Super-page mapping. */
4702
1.25k
            ol2e = *pl2e;
4703
1.25k
            l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_lNf(flags)));
4704
1.25k
4705
1.25k
            if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4706
2
            {
4707
2
                unsigned int flush_flags =
4708
2
                    FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4709
2
4710
2
                if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4711
2
                {
4712
2
                    flush_flags(lNf_to_l1f(l2e_get_flags(ol2e)));
4713
2
                    flush_area(virt, flush_flags);
4714
2
                }
4715
2
                else
4716
0
                {
4717
0
                    pl1e = l2e_to_l1e(ol2e);
4718
0
                    for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4719
0
                        flush_flags(l1e_get_flags(pl1e[i]));
4720
0
                    flush_area(virt, flush_flags);
4721
0
                    free_xen_pagetable(pl1e);
4722
0
                }
4723
2
            }
4724
1.25k
4725
1.25k
            virt    += 1UL << L2_PAGETABLE_SHIFT;
4726
1.25k
            mfn     += 1UL << PAGETABLE_ORDER;
4727
1.25k
            nr_mfns -= 1UL << PAGETABLE_ORDER;
4728
1.25k
        }
4729
4.59k
        else
4730
3.34k
        {
4731
3.34k
            /* Normal page mapping. */
4732
3.34k
            if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4733
13
            {
4734
13
                pl1e = virt_to_xen_l1e(virt);
4735
13
                if ( pl1e == NULL )
4736
0
                    return -ENOMEM;
4737
13
            }
4738
3.32k
            else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4739
4
            {
4740
4
                unsigned int flush_flags =
4741
4
                    FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4742
4
4743
4
                /* Skip this PTE if there is no change. */
4744
4
                if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
4745
4
                       l1_table_offset(virt)) == mfn) &&
4746
4
                     (((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
4747
4
                       ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
4748
1
                {
4749
1
                    /* We can skip to end of L2 superpage if we got a match. */
4750
1
                    i = (1u << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4751
1
                        (mfn & ((1u << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4752
1
                    if ( i > nr_mfns )
4753
0
                        i = nr_mfns;
4754
1
                    virt    += i << L1_PAGETABLE_SHIFT;
4755
1
                    mfn     += i;
4756
1
                    nr_mfns -= i;
4757
1
                    goto check_l3;
4758
1
                }
4759
4
4760
3
                pl1e = alloc_xen_pagetable();
4761
3
                if ( pl1e == NULL )
4762
0
                    return -ENOMEM;
4763
3
4764
1.53k
                for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4765
3
                    l1e_write(&pl1e[i],
4766
3
                              l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4767
3
                                           lNf_to_l1f(l2e_get_flags(*pl2e))));
4768
3
4769
3
                if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
4770
3
                    flush_flags |= FLUSH_TLB_GLOBAL;
4771
3
4772
3
                if ( locking )
4773
3
                    spin_lock(&map_pgdir_lock);
4774
3
                if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) &&
4775
3
                     (l2e_get_flags(*pl2e) & _PAGE_PSE) )
4776
3
                {
4777
3
                    l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4778
3
                                                        __PAGE_HYPERVISOR));
4779
3
                    pl1e = NULL;
4780
3
                }
4781
3
                if ( locking )
4782
3
                    spin_unlock(&map_pgdir_lock);
4783
3
                flush_area(virt, flush_flags);
4784
3
                if ( pl1e )
4785
0
                    free_xen_pagetable(pl1e);
4786
3
            }
4787
3.34k
4788
3.33k
            pl1e  = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
4789
3.33k
            ol1e  = *pl1e;
4790
3.33k
            l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
4791
3.33k
            if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
4792
39
            {
4793
39
                unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
4794
39
4795
39
                flush_flags(l1e_get_flags(ol1e));
4796
39
                flush_area(virt, flush_flags);
4797
39
            }
4798
3.33k
4799
3.33k
            virt    += 1UL << L1_PAGETABLE_SHIFT;
4800
3.33k
            mfn     += 1UL;
4801
3.33k
            nr_mfns -= 1UL;
4802
3.33k
4803
3.33k
            if ( (flags == PAGE_HYPERVISOR) &&
4804
2.76k
                 ((nr_mfns == 0) ||
4805
2.70k
                  ((((virt >> PAGE_SHIFT) | mfn) &
4806
2.70k
                    ((1u << PAGETABLE_ORDER) - 1)) == 0)) )
4807
62
            {
4808
62
                unsigned long base_mfn;
4809
62
4810
62
                pl1e = l2e_to_l1e(*pl2e);
4811
62
                if ( locking )
4812
26
                    spin_lock(&map_pgdir_lock);
4813
62
                base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
4814
3.52k
                for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
4815
3.51k
                    if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
4816
3.50k
                         (l1e_get_flags(*pl1e) != flags) )
4817
60
                        break;
4818
62
                if ( i == L1_PAGETABLE_ENTRIES )
4819
2
                {
4820
2
                    ol2e = *pl2e;
4821
2
                    l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
4822
2
                                                        l1f_to_lNf(flags)));
4823
2
                    if ( locking )
4824
0
                        spin_unlock(&map_pgdir_lock);
4825
2
                    flush_area(virt - PAGE_SIZE,
4826
2
                               FLUSH_TLB_GLOBAL |
4827
2
                               FLUSH_ORDER(PAGETABLE_ORDER));
4828
2
                    free_xen_pagetable(l2e_to_l1e(ol2e));
4829
2
                }
4830
60
                else if ( locking )
4831
26
                    spin_unlock(&map_pgdir_lock);
4832
62
            }
4833
3.33k
        }
4834
4.59k
4835
4.59k
 check_l3:
4836
4.59k
        if ( cpu_has_page1gb &&
4837
4.59k
             (flags == PAGE_HYPERVISOR) &&
4838
3.89k
             ((nr_mfns == 0) ||
4839
3.72k
              !(((virt >> PAGE_SHIFT) | mfn) &
4840
3.72k
                ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) )
4841
166
        {
4842
166
            unsigned long base_mfn;
4843
166
4844
166
            if ( locking )
4845
26
                spin_lock(&map_pgdir_lock);
4846
166
            ol3e = *pl3e;
4847
166
            pl2e = l3e_to_l2e(ol3e);
4848
166
            base_mfn = l2e_get_pfn(*pl2e) & ~(L2_PAGETABLE_ENTRIES *
4849
166
                                              L1_PAGETABLE_ENTRIES - 1);
4850
1.99k
            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
4851
1.99k
                if ( (l2e_get_pfn(*pl2e) !=
4852
1.99k
                      (base_mfn + (i << PAGETABLE_ORDER))) ||
4853
1.85k
                     (l2e_get_flags(*pl2e) != l1f_to_lNf(flags)) )
4854
166
                    break;
4855
166
            if ( i == L2_PAGETABLE_ENTRIES )
4856
0
            {
4857
0
                l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn,
4858
0
                                                    l1f_to_lNf(flags)));
4859
0
                if ( locking )
4860
0
                    spin_unlock(&map_pgdir_lock);
4861
0
                flush_area(virt - PAGE_SIZE,
4862
0
                           FLUSH_TLB_GLOBAL |
4863
0
                           FLUSH_ORDER(2*PAGETABLE_ORDER));
4864
0
                free_xen_pagetable(l3e_to_l2e(ol3e));
4865
0
            }
4866
166
            else if ( locking )
4867
26
                spin_unlock(&map_pgdir_lock);
4868
166
        }
4869
4.59k
    }
4870
231
4871
231
#undef flush_flags
4872
231
4873
231
    return 0;
4874
231
}
4875
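As a reading aid (not code from mm.c), the 2M-superpage eligibility test used inside map_pages_to_xen() can be factored into a small predicate. PAGETABLE_ORDER is 9 on x86, so the mask below requires both the virtual page number and the mfn to be 512-page (2MB) aligned, with at least one full superpage left to map and no PAT or small-page override requested:

/* Illustration only: mirrors the superpage test in map_pages_to_xen(). */
static inline bool can_use_l2_superpage(unsigned long virt, unsigned long mfn,
                                        unsigned long nr_mfns, unsigned int flags)
{
    return !(((virt >> PAGE_SHIFT) | mfn) & ((1u << PAGETABLE_ORDER) - 1)) &&
           nr_mfns >= (1u << PAGETABLE_ORDER) &&
           !(flags & (_PAGE_PAT | MAP_SMALL_PAGES));
}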
4876
int populate_pt_range(unsigned long virt, unsigned long mfn,
4877
                      unsigned long nr_mfns)
4878
1
{
4879
1
    return map_pages_to_xen(virt, mfn, nr_mfns, MAP_SMALL_PAGES);
4880
1
}
4881
4882
/*
4883
 * Alter the permissions of a range of Xen virtual address space.
4884
 *
4885
 * Does not create new mappings, and does not modify the mfn in existing
4886
 * mappings, but will shatter superpages if necessary, and will destroy
4887
 * mappings if not passed _PAGE_PRESENT.
4888
 *
4889
 * The only flags considered are NX, RW and PRESENT.  All other input flags
4890
 * are ignored.
4891
 *
4892
 * It is an error to call with present flags over an unpopulated range.
4893
 */
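Before the implementation, a hedged usage sketch (not from mm.c): write-protecting an already-mapped, page-aligned range needs only the three flags named above, since everything else is masked off. The ex_* symbols are placeholders.

/* Illustration only: keep the range present, clear RW, set NX. */
static int example_write_protect(unsigned long ex_start, unsigned long ex_end)
{
    return modify_xen_mappings(ex_start, ex_end, _PAGE_PRESENT | _PAGE_NX);
}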
4894
int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
4895
10
{
4896
10
    bool locking = system_state > SYS_STATE_boot;
4897
10
    l2_pgentry_t *pl2e;
4898
10
    l1_pgentry_t *pl1e;
4899
10
    unsigned int  i;
4900
10
    unsigned long v = s;
4901
10
4902
10
    /* Set of valid PTE bits which may be altered. */
4903
10
#define FLAGS_MASK (_PAGE_NX|_PAGE_RW|_PAGE_PRESENT)
4904
10
    nf &= FLAGS_MASK;
4905
10
4906
10
    ASSERT(IS_ALIGNED(s, PAGE_SIZE));
4907
10
    ASSERT(IS_ALIGNED(e, PAGE_SIZE));
4908
10
4909
3.56k
    while ( v < e )
4910
3.55k
    {
4911
3.55k
        l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
4912
3.55k
4913
3.55k
        if ( !pl3e || !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
4914
0
        {
4915
0
            /* Confirm the caller isn't trying to create new mappings. */
4916
0
            ASSERT(!(nf & _PAGE_PRESENT));
4917
0
4918
0
            v += 1UL << L3_PAGETABLE_SHIFT;
4919
0
            v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
4920
0
            continue;
4921
0
        }
4922
3.55k
4923
3.55k
        if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
4924
0
        {
4925
0
            if ( l2_table_offset(v) == 0 &&
4926
0
                 l1_table_offset(v) == 0 &&
4927
0
                 ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) )
4928
0
            {
4929
0
                /* PAGE1GB: whole superpage is modified. */
4930
0
                l3_pgentry_t nl3e = !(nf & _PAGE_PRESENT) ? l3e_empty()
4931
0
                    : l3e_from_pfn(l3e_get_pfn(*pl3e),
4932
0
                                   (l3e_get_flags(*pl3e) & ~FLAGS_MASK) | nf);
4933
0
4934
0
                l3e_write_atomic(pl3e, nl3e);
4935
0
                v += 1UL << L3_PAGETABLE_SHIFT;
4936
0
                continue;
4937
0
            }
4938
0
4939
0
            /* PAGE1GB: shatter the superpage and fall through. */
4940
0
            pl2e = alloc_xen_pagetable();
4941
0
            if ( !pl2e )
4942
0
                return -ENOMEM;
4943
0
            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4944
0
                l2e_write(pl2e + i,
4945
0
                          l2e_from_pfn(l3e_get_pfn(*pl3e) +
4946
0
                                       (i << PAGETABLE_ORDER),
4947
0
                                       l3e_get_flags(*pl3e)));
4948
0
            if ( locking )
4949
0
                spin_lock(&map_pgdir_lock);
4950
0
            if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) &&
4951
0
                 (l3e_get_flags(*pl3e) & _PAGE_PSE) )
4952
0
            {
4953
0
                l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4954
0
                                                    __PAGE_HYPERVISOR));
4955
0
                pl2e = NULL;
4956
0
            }
4957
0
            if ( locking )
4958
0
                spin_unlock(&map_pgdir_lock);
4959
0
            if ( pl2e )
4960
0
                free_xen_pagetable(pl2e);
4961
0
        }
4962
3.55k
4963
3.55k
        /*
4964
3.55k
         * The L3 entry has been verified to be present, and we've dealt with
4965
3.55k
         * 1G pages as well, so the L2 table cannot require allocation.
4966
3.55k
         */
4967
3.55k
        pl2e = l3e_to_l2e(*pl3e) + l2_table_offset(v);
4968
3.55k
4969
3.55k
        if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4970
1.96k
        {
4971
1.96k
            /* Confirm the caller isn't trying to create new mappings. */
4972
1.96k
            ASSERT(!(nf & _PAGE_PRESENT));
4973
1.96k
4974
1.96k
            v += 1UL << L2_PAGETABLE_SHIFT;
4975
1.96k
            v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
4976
1.96k
            continue;
4977
1.96k
        }
4978
3.55k
4979
1.59k
        if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4980
60
        {
4981
60
            if ( (l1_table_offset(v) == 0) &&
4982
60
                 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
4983
57
            {
4984
57
                /* PSE: whole superpage is modified. */
4985
57
                l2_pgentry_t nl2e = !(nf & _PAGE_PRESENT) ? l2e_empty()
4986
0
                    : l2e_from_pfn(l2e_get_pfn(*pl2e),
4987
57
                                   (l2e_get_flags(*pl2e) & ~FLAGS_MASK) | nf);
4988
57
4989
57
                l2e_write_atomic(pl2e, nl2e);
4990
57
                v += 1UL << L2_PAGETABLE_SHIFT;
4991
57
            }
4992
60
            else
4993
3
            {
4994
3
                /* PSE: shatter the superpage and try again. */
4995
3
                pl1e = alloc_xen_pagetable();
4996
3
                if ( !pl1e )
4997
0
                    return -ENOMEM;
4998
1.53k
                for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4999
3
                    l1e_write(&pl1e[i],
5000
3
                              l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
5001
3
                                           l2e_get_flags(*pl2e) & ~_PAGE_PSE));
5002
3
                if ( locking )
5003
0
                    spin_lock(&map_pgdir_lock);
5004
3
                if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) &&
5005
3
                     (l2e_get_flags(*pl2e) & _PAGE_PSE) )
5006
3
                {
5007
3
                    l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
5008
3
                                                        __PAGE_HYPERVISOR));
5009
3
                    pl1e = NULL;
5010
3
                }
5011
3
                if ( locking )
5012
0
                    spin_unlock(&map_pgdir_lock);
5013
3
                if ( pl1e )
5014
0
                    free_xen_pagetable(pl1e);
5015
3
            }
5016
60
        }
5017
1.59k
        else
5018
1.53k
        {
5019
1.53k
            l1_pgentry_t nl1e;
5020
1.53k
5021
1.53k
            /*
5022
1.53k
             * Ordinary 4kB mapping: The L2 entry has been verified to be
5023
1.53k
             * present, and we've dealt with 2M pages as well, so the L1 table
5024
1.53k
             * cannot require allocation.
5025
1.53k
             */
5026
1.53k
            pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
5027
1.53k
5028
1.53k
            /* Confirm the caller isn't trying to create new mappings. */
5029
1.53k
            if ( !(l1e_get_flags(*pl1e) & _PAGE_PRESENT) )
5030
0
                ASSERT(!(nf & _PAGE_PRESENT));
5031
1.53k
5032
1.53k
            nl1e = !(nf & _PAGE_PRESENT) ? l1e_empty()
5033
1.15k
                : l1e_from_pfn(l1e_get_pfn(*pl1e),
5034
1.53k
                               (l1e_get_flags(*pl1e) & ~FLAGS_MASK) | nf);
5035
1.53k
5036
1.53k
            l1e_write_atomic(pl1e, nl1e);
5037
1.53k
            v += PAGE_SIZE;
5038
1.53k
5039
1.53k
            /*
5040
1.53k
             * If we are not destroying mappings, or not done with the L2E,
5041
1.53k
             * skip the empty&free check.
5042
1.53k
             */
5043
1.53k
            if ( (nf & _PAGE_PRESENT) || ((v != e) && (l1_table_offset(v) != 0)) )
5044
1.53k
                continue;
5045
2
            pl1e = l2e_to_l1e(*pl2e);
5046
2
            for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
5047
2
                if ( l1e_get_intpte(pl1e[i]) != 0 )
5048
2
                    break;
5049
2
            if ( i == L1_PAGETABLE_ENTRIES )
5050
0
            {
5051
0
                /* Empty: zap the L2E and free the L1 page. */
5052
0
                l2e_write_atomic(pl2e, l2e_empty());
5053
0
                flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
5054
0
                free_xen_pagetable(pl1e);
5055
0
            }
5056
2
        }
5057
1.59k
5058
1.59k
        /*
5059
1.59k
         * If we are not destroying mappings, or not done with the L3E,
5060
1.59k
         * skip the empty&free check.
5061
1.59k
         */
5062
62
        if ( (nf & _PAGE_PRESENT) ||
5063
59
             ((v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0)) )
5064
59
            continue;
5065
3
        pl2e = l3e_to_l2e(*pl3e);
5066
3
        for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
5067
3
            if ( l2e_get_intpte(pl2e[i]) != 0 )
5068
3
                break;
5069
3
        if ( i == L2_PAGETABLE_ENTRIES )
5070
0
        {
5071
0
            /* Empty: zap the L3E and free the L2 page. */
5072
0
            l3e_write_atomic(pl3e, l3e_empty());
5073
0
            flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
5074
0
            free_xen_pagetable(pl2e);
5075
0
        }
5076
3
    }
5077
10
5078
10
    flush_area(NULL, FLUSH_TLB_GLOBAL);
5079
10
5080
10
#undef FLAGS_MASK
5081
10
    return 0;
5082
10
}
5083
5084
#undef flush_area
5085
5086
int destroy_xen_mappings(unsigned long s, unsigned long e)
5087
7
{
5088
7
    return modify_xen_mappings(s, e, _PAGE_NONE);
5089
7
}
5090
5091
void __set_fixmap(
5092
    enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
5093
37
{
5094
37
    BUG_ON(idx >= __end_of_fixed_addresses);
5095
37
    map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
5096
37
}
5097
5098
void *__init arch_vmap_virt_end(void)
5099
1
{
5100
1
    return (void *)fix_to_virt(__end_of_fixed_addresses);
5101
1
}
5102
5103
void __iomem *ioremap(paddr_t pa, size_t len)
5104
2
{
5105
2
    mfn_t mfn = _mfn(PFN_DOWN(pa));
5106
2
    void *va;
5107
2
5108
2
    WARN_ON(page_is_ram_type(mfn_x(mfn), RAM_TYPE_CONVENTIONAL));
5109
2
5110
2
    /* The low 1MB is always mapped. */
5111
2
    if ( !((pa + len - 1) >> 20) )
5112
1
        va = __va(pa);
5113
2
    else
5114
1
    {
5115
1
        unsigned int offs = pa & (PAGE_SIZE - 1);
5116
1
        unsigned int nr = PFN_UP(offs + len);
5117
1
5118
1
        va = __vmap(&mfn, nr, 1, 1, PAGE_HYPERVISOR_UCMINUS, VMAP_DEFAULT) + offs;
5119
1
    }
5120
2
5121
2
    return (void __force __iomem *)va;
5122
2
}
5123
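A hedged usage sketch (not part of mm.c): mapping a hypothetical device register window above 1MB and reading its first 32-bit register. ex_base is a placeholder physical address, and readl() from asm/io.h is assumed for the MMIO access.

static void __iomem *example_regs;

static int __init example_map_device(paddr_t ex_base)
{
    example_regs = ioremap(ex_base, PAGE_SIZE);
    if ( !example_regs )
        return -ENOMEM;

    /* Assumed readl() MMIO helper; register offset 0 is arbitrary. */
    printk("example device id %#x\n", readl(example_regs));

    return 0;
}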
5124
int create_perdomain_mapping(struct domain *d, unsigned long va,
5125
                             unsigned int nr, l1_pgentry_t **pl1tab,
5126
                             struct page_info **ppg)
5127
13
{
5128
13
    struct page_info *pg;
5129
13
    l3_pgentry_t *l3tab;
5130
13
    l2_pgentry_t *l2tab;
5131
13
    l1_pgentry_t *l1tab;
5132
13
    int rc = 0;
5133
13
5134
13
    ASSERT(va >= PERDOMAIN_VIRT_START &&
5135
13
           va < PERDOMAIN_VIRT_SLOT(PERDOMAIN_SLOTS));
5136
13
5137
13
    if ( !d->arch.perdomain_l3_pg )
5138
1
    {
5139
1
        pg = alloc_domheap_page(d, MEMF_no_owner);
5140
1
        if ( !pg )
5141
0
            return -ENOMEM;
5142
1
        l3tab = __map_domain_page(pg);
5143
1
        clear_page(l3tab);
5144
1
        d->arch.perdomain_l3_pg = pg;
5145
1
        if ( !nr )
5146
1
        {
5147
1
            unmap_domain_page(l3tab);
5148
1
            return 0;
5149
1
        }
5150
1
    }
5151
12
    else if ( !nr )
5152
0
        return 0;
5153
12
    else
5154
12
        l3tab = __map_domain_page(d->arch.perdomain_l3_pg);
5155
13
5156
12
    ASSERT(!l3_table_offset(va ^ (va + nr * PAGE_SIZE - 1)));
5157
12
5158
12
    if ( !(l3e_get_flags(l3tab[l3_table_offset(va)]) & _PAGE_PRESENT) )
5159
1
    {
5160
1
        pg = alloc_domheap_page(d, MEMF_no_owner);
5161
1
        if ( !pg )
5162
0
        {
5163
0
            unmap_domain_page(l3tab);
5164
0
            return -ENOMEM;
5165
0
        }
5166
1
        l2tab = __map_domain_page(pg);
5167
1
        clear_page(l2tab);
5168
1
        l3tab[l3_table_offset(va)] = l3e_from_page(pg, __PAGE_HYPERVISOR_RW);
5169
1
    }
5170
12
    else
5171
11
        l2tab = map_l2t_from_l3e(l3tab[l3_table_offset(va)]);
5172
12
5173
12
    unmap_domain_page(l3tab);
5174
12
5175
12
    if ( !pl1tab && !ppg )
5176
0
    {
5177
0
        unmap_domain_page(l2tab);
5178
0
        return 0;
5179
0
    }
5180
12
5181
36
    for ( l1tab = NULL; !rc && nr--; )
5182
24
    {
5183
24
        l2_pgentry_t *pl2e = l2tab + l2_table_offset(va);
5184
24
5185
24
        if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
5186
1
        {
5187
1
            if ( pl1tab && !IS_NIL(pl1tab) )
5188
0
            {
5189
0
                l1tab = alloc_xenheap_pages(0, MEMF_node(domain_to_node(d)));
5190
0
                if ( !l1tab )
5191
0
                {
5192
0
                    rc = -ENOMEM;
5193
0
                    break;
5194
0
                }
5195
0
                ASSERT(!pl1tab[l2_table_offset(va)]);
5196
0
                pl1tab[l2_table_offset(va)] = l1tab;
5197
0
                pg = virt_to_page(l1tab);
5198
0
            }
5199
1
            else
5200
1
            {
5201
1
                pg = alloc_domheap_page(d, MEMF_no_owner);
5202
1
                if ( !pg )
5203
0
                {
5204
0
                    rc = -ENOMEM;
5205
0
                    break;
5206
0
                }
5207
1
                l1tab = __map_domain_page(pg);
5208
1
            }
5209
1
            clear_page(l1tab);
5210
1
            *pl2e = l2e_from_page(pg, __PAGE_HYPERVISOR_RW);
5211
1
        }
5212
23
        else if ( !l1tab )
5213
11
            l1tab = map_l1t_from_l2e(*pl2e);
5214
24
5215
24
        if ( ppg &&
5216
24
             !(l1e_get_flags(l1tab[l1_table_offset(va)]) & _PAGE_PRESENT) )
5217
24
        {
5218
24
            pg = alloc_domheap_page(d, MEMF_no_owner);
5219
24
            if ( pg )
5220
24
            {
5221
24
                clear_domain_page(page_to_mfn(pg));
5222
24
                if ( !IS_NIL(ppg) )
5223
0
                    *ppg++ = pg;
5224
24
                l1tab[l1_table_offset(va)] =
5225
24
                    l1e_from_page(pg, __PAGE_HYPERVISOR_RW | _PAGE_AVAIL0);
5226
24
                l2e_add_flags(*pl2e, _PAGE_AVAIL0);
5227
24
            }
5228
24
            else
5229
0
                rc = -ENOMEM;
5230
24
        }
5231
24
5232
24
        va += PAGE_SIZE;
5233
24
        if ( rc || !nr || !l1_table_offset(va) )
5234
12
        {
5235
12
            /* Note that this is a no-op for the alloc_xenheap_page() case. */
5236
12
            unmap_domain_page(l1tab);
5237
12
            l1tab = NULL;
5238
12
        }
5239
24
    }
5240
12
5241
12
    ASSERT(!l1tab);
5242
12
    unmap_domain_page(l2tab);
5243
12
5244
12
    return rc;
5245
12
}
5246
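A hedged usage sketch (not from mm.c) of the nr == 0 path above, which merely ensures the per-domain L3 page exists; callers that want backing pages instead pass a non-zero nr together with either a page array or the NIL markers tested via IS_NIL() above.

/* Illustration only: allocate just the per-domain L3 for domain d. */
static int example_init_perdomain(struct domain *d)
{
    return create_perdomain_mapping(d, PERDOMAIN_VIRT_START, 0, NULL, NULL);
}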
5247
void destroy_perdomain_mapping(struct domain *d, unsigned long va,
5248
                               unsigned int nr)
5249
0
{
5250
0
    const l3_pgentry_t *l3tab, *pl3e;
5251
0
5252
0
    ASSERT(va >= PERDOMAIN_VIRT_START &&
5253
0
           va < PERDOMAIN_VIRT_SLOT(PERDOMAIN_SLOTS));
5254
0
    ASSERT(!l3_table_offset(va ^ (va + nr * PAGE_SIZE - 1)));
5255
0
5256
0
    if ( !d->arch.perdomain_l3_pg )
5257
0
        return;
5258
0
5259
0
    l3tab = __map_domain_page(d->arch.perdomain_l3_pg);
5260
0
    pl3e = l3tab + l3_table_offset(va);
5261
0
5262
0
    if ( l3e_get_flags(*pl3e) & _PAGE_PRESENT )
5263
0
    {
5264
0
        const l2_pgentry_t *l2tab = map_l2t_from_l3e(*pl3e);
5265
0
        const l2_pgentry_t *pl2e = l2tab + l2_table_offset(va);
5266
0
        unsigned int i = l1_table_offset(va);
5267
0
5268
0
        while ( nr )
5269
0
        {
5270
0
            if ( l2e_get_flags(*pl2e) & _PAGE_PRESENT )
5271
0
            {
5272
0
                l1_pgentry_t *l1tab = map_l1t_from_l2e(*pl2e);
5273
0
5274
0
                for ( ; nr && i < L1_PAGETABLE_ENTRIES; --nr, ++i )
5275
0
                {
5276
0
                    if ( (l1e_get_flags(l1tab[i]) &
5277
0
                          (_PAGE_PRESENT | _PAGE_AVAIL0)) ==
5278
0
                         (_PAGE_PRESENT | _PAGE_AVAIL0) )
5279
0
                        free_domheap_page(l1e_get_page(l1tab[i]));
5280
0
                    l1tab[i] = l1e_empty();
5281
0
                }
5282
0
5283
0
                unmap_domain_page(l1tab);
5284
0
            }
5285
0
            else if ( nr + i < L1_PAGETABLE_ENTRIES )
5286
0
                break;
5287
0
            else
5288
0
                nr -= L1_PAGETABLE_ENTRIES - i;
5289
0
5290
0
            ++pl2e;
5291
0
            i = 0;
5292
0
        }
5293
0
5294
0
        unmap_domain_page(l2tab);
5295
0
    }
5296
0
5297
0
    unmap_domain_page(l3tab);
5298
0
}
5299
5300
void free_perdomain_mappings(struct domain *d)
5301
0
{
5302
0
    l3_pgentry_t *l3tab;
5303
0
    unsigned int i;
5304
0
5305
0
    if ( !d->arch.perdomain_l3_pg )
5306
0
        return;
5307
0
5308
0
    l3tab = __map_domain_page(d->arch.perdomain_l3_pg);
5309
0
5310
0
    for ( i = 0; i < PERDOMAIN_SLOTS; ++i)
5311
0
        if ( l3e_get_flags(l3tab[i]) & _PAGE_PRESENT )
5312
0
        {
5313
0
            struct page_info *l2pg = l3e_get_page(l3tab[i]);
5314
0
            l2_pgentry_t *l2tab = __map_domain_page(l2pg);
5315
0
            unsigned int j;
5316
0
5317
0
            for ( j = 0; j < L2_PAGETABLE_ENTRIES; ++j )
5318
0
                if ( l2e_get_flags(l2tab[j]) & _PAGE_PRESENT )
5319
0
                {
5320
0
                    struct page_info *l1pg = l2e_get_page(l2tab[j]);
5321
0
5322
0
                    if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 )
5323
0
                    {
5324
0
                        l1_pgentry_t *l1tab = __map_domain_page(l1pg);
5325
0
                        unsigned int k;
5326
0
5327
0
                        for ( k = 0; k < L1_PAGETABLE_ENTRIES; ++k )
5328
0
                            if ( (l1e_get_flags(l1tab[k]) &
5329
0
                                  (_PAGE_PRESENT | _PAGE_AVAIL0)) ==
5330
0
                                 (_PAGE_PRESENT | _PAGE_AVAIL0) )
5331
0
                                free_domheap_page(l1e_get_page(l1tab[k]));
5332
0
5333
0
                        unmap_domain_page(l1tab);
5334
0
                    }
5335
0
5336
0
                    if ( is_xen_heap_page(l1pg) )
5337
0
                        free_xenheap_page(page_to_virt(l1pg));
5338
0
                    else
5339
0
                        free_domheap_page(l1pg);
5340
0
                }
5341
0
5342
0
            unmap_domain_page(l2tab);
5343
0
            free_domheap_page(l2pg);
5344
0
        }
5345
0
5346
0
    unmap_domain_page(l3tab);
5347
0
    free_domheap_page(d->arch.perdomain_l3_pg);
5348
0
    d->arch.perdomain_l3_pg = NULL;
5349
0
}
5350
5351
#ifdef MEMORY_GUARD
5352
5353
static void __memguard_change_range(void *p, unsigned long l, int guard)
5354
12
{
5355
12
    unsigned long _p = (unsigned long)p;
5356
12
    unsigned long _l = (unsigned long)l;
5357
12
    unsigned int flags = __PAGE_HYPERVISOR_RW | MAP_SMALL_PAGES;
5358
12
5359
12
    /* Ensure we are dealing with a page-aligned whole number of pages. */
5360
12
    ASSERT(IS_ALIGNED(_p, PAGE_SIZE));
5361
12
    ASSERT(IS_ALIGNED(_l, PAGE_SIZE));
5362
12
5363
12
    if ( guard )
5364
12
        flags &= ~_PAGE_PRESENT;
5365
12
5366
12
    map_pages_to_xen(
5367
12
        _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
5368
12
}
5369
5370
void memguard_guard_range(void *p, unsigned long l)
5371
12
{
5372
12
    __memguard_change_range(p, l, 1);
5373
12
}
5374
5375
void memguard_unguard_range(void *p, unsigned long l)
5376
0
{
5377
0
    __memguard_change_range(p, l, 0);
5378
0
}
5379
5380
#endif
5381
5382
void memguard_guard_stack(void *p)
5383
12
{
5384
12
    BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
5385
12
    p = (void *)((unsigned long)p + STACK_SIZE -
5386
12
                 PRIMARY_STACK_SIZE - PAGE_SIZE);
5387
12
    memguard_guard_range(p, PAGE_SIZE);
5388
12
}
5389
5390
void memguard_unguard_stack(void *p)
5391
0
{
5392
0
    p = (void *)((unsigned long)p + STACK_SIZE -
5393
0
                 PRIMARY_STACK_SIZE - PAGE_SIZE);
5394
0
    memguard_unguard_range(p, PAGE_SIZE);
5395
0
}
5396
5397
void arch_dump_shared_mem_info(void)
5398
0
{
5399
0
    printk("Shared frames %u -- Saved frames %u\n",
5400
0
            mem_sharing_get_nr_shared_mfns(),
5401
0
            mem_sharing_get_nr_saved_mfns());
5402
0
}
5403
5404
const unsigned long *__init get_platform_badpages(unsigned int *array_size)
5405
15
{
5406
15
    u32 igd_id;
5407
15
    static unsigned long __initdata bad_pages[] = {
5408
15
        0x20050000,
5409
15
        0x20110000,
5410
15
        0x20130000,
5411
15
        0x20138000,
5412
15
        0x40004000,
5413
15
    };
5414
15
5415
15
    *array_size = ARRAY_SIZE(bad_pages);
5416
15
    igd_id = pci_conf_read32(0, 0, 2, 0, 0);
5417
15
    if ( !IS_SNB_GFX(igd_id) )
5418
15
        return NULL;
5419
15
5420
0
    return bad_pages;
5421
15
}
5422
5423
void paging_invlpg(struct vcpu *v, unsigned long va)
5424
0
{
5425
0
    if ( !is_canonical_address(va) )
5426
0
        return;
5427
0
5428
0
    if ( paging_mode_enabled(v->domain) &&
5429
0
         !paging_get_hostmode(v)->invlpg(v, va) )
5430
0
        return;
5431
0
5432
0
    if ( is_pv_vcpu(v) )
5433
0
        flush_tlb_one_local(va);
5434
0
    else
5435
0
        hvm_funcs.invlpg(v, va);
5436
0
}
5437
5438
/* Build a 32bit PSE page table using 4MB pages. */
5439
void write_32bit_pse_identmap(uint32_t *l2)
5440
0
{
5441
0
    unsigned int i;
5442
0
5443
0
    for ( i = 0; i < PAGE_SIZE / sizeof(*l2); i++ )
5444
0
        l2[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
5445
0
                 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
5446
0
}
5447
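A worked example of the loop above (an editor's illustration, not source text): with i = 1 the stored entry is

    l2[1] = (1 << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
            _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE;

i.e. a single 4MB PSE entry identity-mapping physical 0x400000-0x7fffff with the user, writable, accessed and dirty bits set.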
5448
unsigned long get_upper_mfn_bound(void)
5449
0
{
5450
0
    unsigned long max_mfn;
5451
0
5452
0
    max_mfn = mem_hotplug ? PFN_DOWN(mem_hotplug) : max_page;
5453
0
#ifndef CONFIG_BIGMEM
5454
0
    max_mfn = min(max_mfn, 1UL << 32);
5455
0
#endif
5456
0
    return min(max_mfn, 1UL << (paddr_bits - PAGE_SHIFT)) - 1;
5457
0
}
5458
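As a worked example (editor's illustration) of the bound computed above: on a non-BIGMEM build with mem_hotplug unset, max_page = 0x200000 (8GiB of RAM) and paddr_bits = 46, the result is min(0x200000, 1UL << 32, 1UL << (paddr_bits - PAGE_SHIFT)) - 1 = 0x1fffff, i.e. the highest usable MFN.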
5459
/*
5460
 * Local variables:
5461
 * mode: C
5462
 * c-file-style: "BSD"
5463
 * c-basic-offset: 4
5464
 * tab-width: 4
5465
 * indent-tabs-mode: nil
5466
 * End:
5467
 */