debuggers.hg

view xen/arch/x86/mm.c @ 20840:3f8fd65732cc

x86: minor cleanup to arch_memory_op()

There's a function-wide variable rc, so no need to re-declare it in
individual case handling blocks.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Jan 13 08:17:00 2010 +0000 (2010-01-13)
parents b76fe58d0701
children 91358472d8c4
line source
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <asm/shared.h>
112 #include <public/memory.h>
113 #include <public/sched.h>
114 #include <xsm/xsm.h>
115 #include <xen/trace.h>
116 #include <asm/setup.h>
117 #include <asm/mem_sharing.h>
119 /*
120 * Mapping of first 2 or 4 megabytes of memory. This is mapped with 4kB
121 * mappings to avoid type conflicts with fixed-range MTRRs covering the
122 * lowest megabyte of physical memory. In any case the VGA hole should be
123 * mapped with type UC.
124 */
125 l1_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
126 l1_identmap[L1_PAGETABLE_ENTRIES];
128 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
130 /*
131 * PTE updates can be done with ordinary writes except:
132 * 1. Debug builds get extra checking by using CMPXCHG[8B].
133 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
134 */
135 #if !defined(NDEBUG) || defined(__i386__)
136 #define PTE_UPDATE_WITH_CMPXCHG
137 #endif
139 int mem_hotplug = 0;
141 /* Private domain structs for DOMID_XEN and DOMID_IO. */
142 struct domain *dom_xen, *dom_io, *dom_cow;
144 /* Frame table size in pages. */
145 unsigned long max_page;
146 unsigned long total_pages;
148 unsigned long __read_mostly pdx_group_valid[BITS_TO_LONGS(
149 (FRAMETABLE_SIZE / sizeof(*frame_table) + PDX_GROUP_COUNT - 1)
150 / PDX_GROUP_COUNT)] = { [0] = 1 };
152 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
154 int opt_allow_hugepage;
155 boolean_param("allowhugepage", opt_allow_hugepage);
157 #define l1_disallow_mask(d) \
158 ((d != dom_io) && \
159 (rangeset_is_empty((d)->iomem_caps) && \
160 rangeset_is_empty((d)->arch.ioport_caps) && \
161 !has_arch_pdevs(d)) ? \
162 L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
164 #ifdef __x86_64__
165 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
166 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
167 L3_DISALLOW_MASK : \
168 COMPAT_L3_DISALLOW_MASK)
169 #else
170 #define l3_disallow_mask(d) L3_DISALLOW_MASK
171 #endif
173 static void __init init_frametable_chunk(void *start, void *end)
174 {
175 unsigned long s = (unsigned long)start;
176 unsigned long e = (unsigned long)end;
177 unsigned long step, mfn;
179 ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
180 for ( ; s < e; s += step << PAGE_SHIFT )
181 {
182 step = 1UL << (cpu_has_page1gb &&
183 !(s & ((1UL << L3_PAGETABLE_SHIFT) - 1)) ?
184 L3_PAGETABLE_SHIFT - PAGE_SHIFT :
185 L2_PAGETABLE_SHIFT - PAGE_SHIFT);
186 /*
187 * The hardcoded 4 below is arbitrary - just pick whatever you think
188 * is reasonable to waste as a trade-off for using a large page.
189 */
190 while ( step && s + (step << PAGE_SHIFT) > e + (4 << PAGE_SHIFT) )
191 step >>= PAGETABLE_ORDER;
192 do {
193 mfn = alloc_boot_pages(step, step);
194 } while ( !mfn && (step >>= PAGETABLE_ORDER) );
195 if ( !mfn )
196 panic("Not enough memory for frame table");
197 map_pages_to_xen(s, mfn, step, PAGE_HYPERVISOR);
198 }
200 memset(start, 0, end - start);
201 memset(end, -1, s - (unsigned long)end);
202 }
204 void __init init_frametable(void)
205 {
206 unsigned int sidx, eidx, nidx;
207 unsigned int max_idx = (max_pdx + PDX_GROUP_COUNT - 1) / PDX_GROUP_COUNT;
209 #ifdef __x86_64__
210 BUILD_BUG_ON(XEN_VIRT_END > FRAMETABLE_VIRT_END);
211 #endif
212 BUILD_BUG_ON(FRAMETABLE_VIRT_START & ((1UL << L2_PAGETABLE_SHIFT) - 1));
214 for ( sidx = 0; ; sidx = nidx )
215 {
216 eidx = find_next_zero_bit(pdx_group_valid, max_idx, sidx);
217 nidx = find_next_bit(pdx_group_valid, max_idx, eidx);
218 if ( nidx >= max_idx )
219 break;
220 init_frametable_chunk(pdx_to_page(sidx * PDX_GROUP_COUNT),
221 pdx_to_page(eidx * PDX_GROUP_COUNT));
222 }
223 if ( !mem_hotplug )
224 init_frametable_chunk(pdx_to_page(sidx * PDX_GROUP_COUNT),
225 pdx_to_page(max_pdx - 1) + 1);
226 else
227 {
228 init_frametable_chunk(pdx_to_page(sidx *PDX_GROUP_COUNT),
229 pdx_to_page(max_idx * PDX_GROUP_COUNT));
230 memset(pdx_to_page(max_pdx), -1, (unsigned long)pdx_to_page(max_idx) -
231 (unsigned long)(pdx_to_page(max_pdx)));
232 }
233 }
235 void __init arch_init_memory(void)
236 {
237 unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn;
239 /*
240 * Initialise our DOMID_XEN domain.
241 * Any Xen-heap pages that we will allow to be mapped will have
242 * their domain field set to dom_xen.
243 */
244 dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
245 BUG_ON(dom_xen == NULL);
247 /*
248 * Initialise our DOMID_IO domain.
249 * This domain owns I/O pages that are within the range of the page_info
250 * array. Mappings occur at the priv of the caller.
251 */
252 dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
253 BUG_ON(dom_io == NULL);
255 /*
256 * Initialise our DOMID_IO domain.
257 * This domain owns sharable pages.
258 */
259 dom_cow = domain_create(DOMID_COW, DOMCRF_dummy, 0);
260 BUG_ON(dom_cow == NULL);
262 /* First 1MB of RAM is historically marked as I/O. */
263 for ( i = 0; i < 0x100; i++ )
264 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
266 /* Any areas not specified as RAM by the e820 map are considered I/O. */
267 for ( i = 0, pfn = 0; pfn < max_page; i++ )
268 {
269 while ( (i < e820.nr_map) &&
270 (e820.map[i].type != E820_RAM) &&
271 (e820.map[i].type != E820_UNUSABLE) )
272 i++;
274 if ( i >= e820.nr_map )
275 {
276 /* No more RAM regions: mark as I/O right to end of memory map. */
277 rstart_pfn = rend_pfn = max_page;
278 }
279 else
280 {
281 /* Mark as I/O just up as far as next RAM region. */
282 rstart_pfn = min_t(unsigned long, max_page,
283 PFN_UP(e820.map[i].addr));
284 rend_pfn = max_t(unsigned long, rstart_pfn,
285 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
286 }
288 /*
289 * Make sure any Xen mappings of RAM holes above 1MB are blown away.
290 * In particular this ensures that RAM holes are respected even in
291 * the statically-initialised 1-16MB mapping area.
292 */
293 iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT));
294 #if defined(CONFIG_X86_32)
295 ioend_pfn = min_t(unsigned long, rstart_pfn,
296 DIRECTMAP_MBYTES << (20 - PAGE_SHIFT));
297 #else
298 ioend_pfn = min(rstart_pfn, 16UL << (20 - PAGE_SHIFT));
299 #endif
300 if ( iostart_pfn < ioend_pfn )
301 destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn),
302 (unsigned long)mfn_to_virt(ioend_pfn));
304 /* Mark as I/O up to next RAM region. */
305 for ( ; pfn < rstart_pfn; pfn++ )
306 {
307 if ( !mfn_valid(pfn) )
308 continue;
309 share_xen_page_with_guest(
310 mfn_to_page(pfn), dom_io, XENSHARE_writable);
311 }
313 /* Skip the RAM region. */
314 pfn = rend_pfn;
315 }
317 subarch_init_memory();
319 mem_sharing_init();
320 }
322 int page_is_ram_type(unsigned long mfn, unsigned long mem_type)
323 {
324 uint64_t maddr = pfn_to_paddr(mfn);
325 int i;
327 for ( i = 0; i < e820.nr_map; i++ )
328 {
329 switch ( e820.map[i].type )
330 {
331 case E820_RAM:
332 if ( mem_type & RAM_TYPE_CONVENTIONAL )
333 break;
334 continue;
335 case E820_RESERVED:
336 if ( mem_type & RAM_TYPE_RESERVED )
337 break;
338 continue;
339 case E820_UNUSABLE:
340 if ( mem_type & RAM_TYPE_UNUSABLE )
341 break;
342 continue;
343 case E820_ACPI:
344 case E820_NVS:
345 if ( mem_type & RAM_TYPE_ACPI )
346 break;
347 continue;
348 default:
349 /* unknown */
350 continue;
351 }
353 /* Test the range. */
354 if ( (e820.map[i].addr <= maddr) &&
355 ((e820.map[i].addr + e820.map[i].size) >= (maddr + PAGE_SIZE)) )
356 return 1;
357 }
359 return 0;
360 }
362 unsigned long domain_get_maximum_gpfn(struct domain *d)
363 {
364 if ( is_hvm_domain(d) )
365 return d->arch.p2m->max_mapped_pfn;
366 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
367 return arch_get_max_pfn(d) - 1;
368 }
370 void share_xen_page_with_guest(
371 struct page_info *page, struct domain *d, int readonly)
372 {
373 if ( page_get_owner(page) == d )
374 return;
376 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
378 spin_lock(&d->page_alloc_lock);
380 /* The incremented type count pins as writable or read-only. */
381 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
382 page->u.inuse.type_info |= PGT_validated | 1;
384 page_set_owner(page, d);
385 wmb(); /* install valid domain ptr before updating refcnt. */
386 ASSERT((page->count_info & ~PGC_xen_heap) == 0);
388 /* Only add to the allocation list if the domain isn't dying. */
389 if ( !d->is_dying )
390 {
391 page->count_info |= PGC_allocated | 1;
392 if ( unlikely(d->xenheap_pages++ == 0) )
393 get_knownalive_domain(d);
394 page_list_add_tail(page, &d->xenpage_list);
395 }
397 spin_unlock(&d->page_alloc_lock);
398 }
400 void share_xen_page_with_privileged_guests(
401 struct page_info *page, int readonly)
402 {
403 share_xen_page_with_guest(page, dom_xen, readonly);
404 }
406 #if defined(__i386__)
408 #ifdef NDEBUG
409 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
410 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
411 #else
412 /*
413 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
414 * We cannot safely shadow the idle page table, nor shadow page tables
415 * (detected by zero reference count). As required for correctness, we
416 * always shadow PDPTs above 4GB.
417 */
418 #define l3tab_needs_shadow(mfn) \
419 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
420 (mfn_to_page(mfn)->count_info & PGC_count_mask) && \
421 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
422 ((mfn) >= 0x100000))
423 #endif
425 static l1_pgentry_t *fix_pae_highmem_pl1e;
427 /* Cache the address of PAE high-memory fixmap page tables. */
428 static int __init cache_pae_fixmap_address(void)
429 {
430 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
431 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
432 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
433 return 0;
434 }
435 __initcall(cache_pae_fixmap_address);
437 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
439 void make_cr3(struct vcpu *v, unsigned long mfn)
440 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
441 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
442 {
443 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
444 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
445 unsigned int cpu = smp_processor_id();
447 /* Fast path: does this mfn need a shadow at all? */
448 if ( !l3tab_needs_shadow(mfn) )
449 {
450 v->arch.cr3 = mfn << PAGE_SHIFT;
451 /* Cache is no longer in use or valid */
452 cache->high_mfn = 0;
453 return;
454 }
456 /* Caching logic is not interrupt safe. */
457 ASSERT(!in_irq());
459 /* Protects against pae_flush_pgd(). */
460 spin_lock(&cache->lock);
462 cache->inuse_idx ^= 1;
463 cache->high_mfn = mfn;
465 /* Map the guest L3 table and copy to the chosen low-memory cache. */
466 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
467 /* First check the previous high mapping can't be in the TLB.
468 * (i.e. have we loaded CR3 since we last did this?) */
469 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
470 flush_tlb_one_local(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
471 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
472 lowmem_l3tab = cache->table[cache->inuse_idx];
473 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
474 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
475 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
477 v->arch.cr3 = __pa(lowmem_l3tab);
479 spin_unlock(&cache->lock);
480 }
482 #else /* !defined(__i386__) */
484 void make_cr3(struct vcpu *v, unsigned long mfn)
485 {
486 v->arch.cr3 = mfn << PAGE_SHIFT;
487 }
489 #endif /* !defined(__i386__) */
491 void write_ptbase(struct vcpu *v)
492 {
493 write_cr3(v->arch.cr3);
494 }
496 /*
497 * Should be called after CR3 is updated.
498 *
499 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
500 * for HVM guests, arch.monitor_table and hvm's guest CR3.
501 *
502 * Update ref counts to shadow tables appropriately.
503 */
504 void update_cr3(struct vcpu *v)
505 {
506 unsigned long cr3_mfn=0;
508 if ( paging_mode_enabled(v->domain) )
509 {
510 paging_update_cr3(v);
511 return;
512 }
514 #if CONFIG_PAGING_LEVELS == 4
515 if ( !(v->arch.flags & TF_kernel_mode) )
516 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
517 else
518 #endif
519 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
521 make_cr3(v, cr3_mfn);
522 }
525 static void invalidate_shadow_ldt(struct vcpu *v, int flush)
526 {
527 int i;
528 unsigned long pfn;
529 struct page_info *page;
531 BUG_ON(unlikely(in_irq()));
533 spin_lock(&v->arch.shadow_ldt_lock);
535 if ( v->arch.shadow_ldt_mapcnt == 0 )
536 goto out;
538 v->arch.shadow_ldt_mapcnt = 0;
540 for ( i = 16; i < 32; i++ )
541 {
542 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
543 if ( pfn == 0 ) continue;
544 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
545 page = mfn_to_page(pfn);
546 ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page);
547 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
548 put_page_and_type(page);
549 }
551 /* Rid TLBs of stale mappings (guest mappings and shadow mappings). */
552 if ( flush )
553 flush_tlb_mask(&v->vcpu_dirty_cpumask);
555 out:
556 spin_unlock(&v->arch.shadow_ldt_lock);
557 }
560 static int alloc_segdesc_page(struct page_info *page)
561 {
562 struct desc_struct *descs;
563 int i;
565 descs = __map_domain_page(page);
567 for ( i = 0; i < 512; i++ )
568 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
569 goto fail;
571 unmap_domain_page(descs);
572 return 0;
574 fail:
575 unmap_domain_page(descs);
576 return -EINVAL;
577 }
580 /* Map shadow page at offset @off. */
581 int map_ldt_shadow_page(unsigned int off)
582 {
583 struct vcpu *v = current;
584 struct domain *d = v->domain;
585 unsigned long gmfn, mfn;
586 l1_pgentry_t l1e, nl1e;
587 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
588 int okay;
590 BUG_ON(unlikely(in_irq()));
592 guest_get_eff_kern_l1e(v, gva, &l1e);
593 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
594 return 0;
596 gmfn = l1e_get_pfn(l1e);
597 mfn = gmfn_to_mfn(d, gmfn);
598 if ( unlikely(!mfn_valid(mfn)) )
599 return 0;
601 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page);
602 if ( unlikely(!okay) )
603 return 0;
605 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
607 spin_lock(&v->arch.shadow_ldt_lock);
608 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
609 v->arch.shadow_ldt_mapcnt++;
610 spin_unlock(&v->arch.shadow_ldt_lock);
612 return 1;
613 }
616 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
617 {
618 struct page_info *page = mfn_to_page(page_nr);
620 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
621 {
622 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
623 return 0;
624 }
626 return 1;
627 }
630 static int get_page_and_type_from_pagenr(unsigned long page_nr,
631 unsigned long type,
632 struct domain *d,
633 int partial,
634 int preemptible)
635 {
636 struct page_info *page = mfn_to_page(page_nr);
637 int rc;
639 if ( likely(partial >= 0) &&
640 unlikely(!get_page_from_pagenr(page_nr, d)) )
641 return -EINVAL;
643 rc = (preemptible ?
644 get_page_type_preemptible(page, type) :
645 (get_page_type(page, type) ? 0 : -EINVAL));
647 if ( unlikely(rc) && partial >= 0 )
648 put_page(page);
650 return rc;
651 }
653 static int get_data_page(
654 struct page_info *page, struct domain *d, int writeable)
655 {
656 int rc;
658 if ( writeable )
659 rc = get_page_and_type(page, d, PGT_writable_page);
660 else
661 rc = get_page(page, d);
663 return rc;
664 }
666 static void put_data_page(
667 struct page_info *page, int writeable)
668 {
669 if ( writeable )
670 put_page_and_type(page);
671 else
672 put_page(page);
673 }
675 /*
676 * We allow root tables to map each other (a.k.a. linear page tables). It
677 * needs some special care with reference counts and access permissions:
678 * 1. The mapping entry must be read-only, or the guest may get write access
679 * to its own PTEs.
680 * 2. We must only bump the reference counts for an *already validated*
681 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
682 * on a validation that is required to complete that validation.
683 * 3. We only need to increment the reference counts for the mapped page
684 * frame if it is mapped by a different root table. This is sufficient and
685 * also necessary to allow validation of a root table mapping itself.
686 */
687 #define define_get_linear_pagetable(level) \
688 static int \
689 get_##level##_linear_pagetable( \
690 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
691 { \
692 unsigned long x, y; \
693 struct page_info *page; \
694 unsigned long pfn; \
695 \
696 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
697 { \
698 MEM_LOG("Attempt to create linear p.t. with write perms"); \
699 return 0; \
700 } \
701 \
702 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
703 { \
704 /* Make sure the mapped frame belongs to the correct domain. */ \
705 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
706 return 0; \
707 \
708 /* \
709 * Ensure that the mapped frame is an already-validated page table. \
710 * If so, atomically increment the count (checking for overflow). \
711 */ \
712 page = mfn_to_page(pfn); \
713 y = page->u.inuse.type_info; \
714 do { \
715 x = y; \
716 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
717 unlikely((x & (PGT_type_mask|PGT_validated)) != \
718 (PGT_##level##_page_table|PGT_validated)) ) \
719 { \
720 put_page(page); \
721 return 0; \
722 } \
723 } \
724 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
725 } \
726 \
727 return 1; \
728 }
731 int is_iomem_page(unsigned long mfn)
732 {
733 struct page_info *page;
735 if ( !mfn_valid(mfn) )
736 return 1;
738 /* Caller must know that it is an iomem page, or a reference is held. */
739 page = mfn_to_page(mfn);
740 ASSERT((page->count_info & PGC_count_mask) != 0);
742 return (page_get_owner(page) == dom_io);
743 }
745 static void update_xen_mappings(unsigned long mfn, unsigned long cacheattr)
746 {
747 #ifdef __x86_64__
748 bool_t alias = mfn >= PFN_DOWN(xen_phys_start) &&
749 mfn < PFN_UP(xen_phys_start + (unsigned long)_end - XEN_VIRT_START);
750 unsigned long xen_va =
751 XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT);
753 if ( unlikely(alias) && cacheattr )
754 map_pages_to_xen(xen_va, mfn, 1, 0);
755 map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
756 PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
757 if ( unlikely(alias) && !cacheattr )
758 map_pages_to_xen(xen_va, mfn, 1, PAGE_HYPERVISOR);
759 #endif
760 }
762 int
763 get_page_from_l1e(
764 l1_pgentry_t l1e, struct domain *l1e_owner, struct domain *pg_owner)
765 {
766 unsigned long mfn = l1e_get_pfn(l1e);
767 struct page_info *page = mfn_to_page(mfn);
768 uint32_t l1f = l1e_get_flags(l1e);
769 struct vcpu *curr = current;
770 struct domain *real_pg_owner;
772 if ( !(l1f & _PAGE_PRESENT) )
773 return 1;
775 if ( unlikely(l1f & l1_disallow_mask(l1e_owner)) )
776 {
777 MEM_LOG("Bad L1 flags %x", l1f & l1_disallow_mask(l1e_owner));
778 return 0;
779 }
781 if ( !mfn_valid(mfn) ||
782 (real_pg_owner = page_get_owner_and_reference(page)) == dom_io )
783 {
784 /* Only needed the reference to confirm dom_io ownership. */
785 if ( mfn_valid(mfn) )
786 put_page(page);
788 /* DOMID_IO reverts to caller for privilege checks. */
789 if ( pg_owner == dom_io )
790 pg_owner = curr->domain;
792 if ( !iomem_access_permitted(pg_owner, mfn, mfn) )
793 {
794 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
795 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
796 pg_owner->domain_id, mfn);
797 return 0;
798 }
800 return 1;
801 }
803 if ( unlikely(real_pg_owner != pg_owner) )
804 {
805 /*
806 * Let privileged domains transfer the right to map their target
807 * domain's pages. This is used to allow stub-domain pvfb export to
808 * dom0, until pvfb supports granted mappings. At that time this
809 * minor hack can go away.
810 */
811 if ( (real_pg_owner == NULL) || (pg_owner == l1e_owner) ||
812 !IS_PRIV_FOR(pg_owner, real_pg_owner) )
813 goto could_not_pin;
814 pg_owner = real_pg_owner;
815 }
817 /* Foreign mappings into guests in shadow external mode don't
818 * contribute to writeable mapping refcounts. (This allows the
819 * qemu-dm helper process in dom0 to map the domain's memory without
820 * messing up the count of "real" writable mappings.) */
821 if ( (l1f & _PAGE_RW) &&
822 ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) &&
823 !get_page_type(page, PGT_writable_page) )
824 goto could_not_pin;
826 if ( pte_flags_to_cacheattr(l1f) !=
827 ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) )
828 {
829 unsigned long x, nx, y = page->count_info;
830 unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
832 if ( is_xen_heap_page(page) )
833 {
834 if ( (l1f & _PAGE_RW) &&
835 ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) )
836 put_page_type(page);
837 put_page(page);
838 MEM_LOG("Attempt to change cache attributes of Xen heap page");
839 return 0;
840 }
842 while ( ((y & PGC_cacheattr_mask) >> PGC_cacheattr_base) != cacheattr )
843 {
844 x = y;
845 nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
846 y = cmpxchg(&page->count_info, x, nx);
847 }
849 update_xen_mappings(mfn, cacheattr);
850 }
852 return 1;
854 could_not_pin:
855 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
856 " for l1e_owner=%d, pg_owner=%d",
857 mfn, get_gpfn_from_mfn(mfn),
858 l1e_get_intpte(l1e), l1e_owner->domain_id, pg_owner->domain_id);
859 if ( real_pg_owner != NULL )
860 put_page(page);
861 return 0;
862 }
865 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
866 define_get_linear_pagetable(l2);
867 static int
868 get_page_from_l2e(
869 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
870 {
871 unsigned long mfn = l2e_get_pfn(l2e);
872 int rc;
874 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
875 return 1;
877 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
878 {
879 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
880 return -EINVAL;
881 }
883 if ( !(l2e_get_flags(l2e) & _PAGE_PSE) )
884 {
885 rc = get_page_and_type_from_pagenr(mfn, PGT_l1_page_table, d, 0, 0);
886 if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
887 rc = 0;
888 }
889 else if ( !opt_allow_hugepage || (mfn & (L1_PAGETABLE_ENTRIES-1)) )
890 {
891 rc = -EINVAL;
892 }
893 else
894 {
895 unsigned long m = mfn;
896 int writeable = !!(l2e_get_flags(l2e) & _PAGE_RW);
898 do {
899 if ( !mfn_valid(m) ||
900 !get_data_page(mfn_to_page(m), d, writeable) )
901 {
902 while ( m-- > mfn )
903 put_data_page(mfn_to_page(m), writeable);
904 return -EINVAL;
905 }
906 } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
908 rc = 1;
909 }
911 return rc;
912 }
915 define_get_linear_pagetable(l3);
916 static int
917 get_page_from_l3e(
918 l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible)
919 {
920 int rc;
922 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
923 return 1;
925 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
926 {
927 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
928 return -EINVAL;
929 }
931 rc = get_page_and_type_from_pagenr(
932 l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible);
933 if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
934 rc = 0;
936 return rc;
937 }
939 #if CONFIG_PAGING_LEVELS >= 4
940 define_get_linear_pagetable(l4);
941 static int
942 get_page_from_l4e(
943 l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible)
944 {
945 int rc;
947 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
948 return 1;
950 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
951 {
952 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
953 return -EINVAL;
954 }
956 rc = get_page_and_type_from_pagenr(
957 l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible);
958 if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
959 rc = 0;
961 return rc;
962 }
963 #endif /* 4 level */
965 #ifdef __x86_64__
967 #ifdef USER_MAPPINGS_ARE_GLOBAL
968 #define adjust_guest_l1e(pl1e, d) \
969 do { \
970 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
971 likely(!is_pv_32on64_domain(d)) ) \
972 { \
973 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
974 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
975 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
976 MEM_LOG("Global bit is set to kernel page %lx", \
977 l1e_get_pfn((pl1e))); \
978 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
979 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
980 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
981 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
982 } \
983 } while ( 0 )
984 #else
985 #define adjust_guest_l1e(pl1e, d) \
986 do { \
987 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
988 likely(!is_pv_32on64_domain(d)) ) \
989 l1e_add_flags((pl1e), _PAGE_USER); \
990 } while ( 0 )
991 #endif
993 #define adjust_guest_l2e(pl2e, d) \
994 do { \
995 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
996 likely(!is_pv_32on64_domain(d)) ) \
997 l2e_add_flags((pl2e), _PAGE_USER); \
998 } while ( 0 )
1000 #define adjust_guest_l3e(pl3e, d) \
1001 do { \
1002 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
1003 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
1004 _PAGE_USER : \
1005 _PAGE_USER|_PAGE_RW); \
1006 } while ( 0 )
1008 #define adjust_guest_l4e(pl4e, d) \
1009 do { \
1010 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
1011 likely(!is_pv_32on64_domain(d)) ) \
1012 l4e_add_flags((pl4e), _PAGE_USER); \
1013 } while ( 0 )
1015 #else /* !defined(__x86_64__) */
1017 #define adjust_guest_l1e(_p, _d) ((void)(_d))
1018 #define adjust_guest_l2e(_p, _d) ((void)(_d))
1019 #define adjust_guest_l3e(_p, _d) ((void)(_d))
1021 #endif
1023 #ifdef __x86_64__
1024 #define unadjust_guest_l3e(pl3e, d) \
1025 do { \
1026 if ( unlikely(is_pv_32on64_domain(d)) && \
1027 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
1028 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
1029 } while ( 0 )
1030 #else
1031 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
1032 #endif
1034 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
1036 unsigned long pfn = l1e_get_pfn(l1e);
1037 struct page_info *page;
1038 struct domain *pg_owner;
1039 struct vcpu *v;
1041 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(pfn) )
1042 return;
1044 page = mfn_to_page(pfn);
1045 pg_owner = page_get_owner(page);
1047 /*
1048 * Check if this is a mapping that was established via a grant reference.
1049 * If it was then we should not be here: we require that such mappings are
1050 * explicitly destroyed via the grant-table interface.
1052 * The upshot of this is that the guest can end up with active grants that
1053 * it cannot destroy (because it no longer has a PTE to present to the
1054 * grant-table interface). This can lead to subtle hard-to-catch bugs,
1055 * hence a special grant PTE flag can be enabled to catch the bug early.
1057 * (Note that the undestroyable active grants are not a security hole in
1058 * Xen. All active grants can safely be cleaned up when the domain dies.)
1059 */
1060 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
1061 !l1e_owner->is_shutting_down && !l1e_owner->is_dying )
1063 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
1064 l1e_get_intpte(l1e));
1065 domain_crash(l1e_owner);
1068 /* Remember we didn't take a type-count of foreign writable mappings
1069 * to paging-external domains */
1070 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
1071 ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) )
1073 put_page_and_type(page);
1075 else
1077 /* We expect this is rare so we blow the entire shadow LDT. */
1078 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
1079 PGT_seg_desc_page)) &&
1080 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
1081 (l1e_owner == pg_owner) )
1083 for_each_vcpu ( pg_owner, v )
1084 invalidate_shadow_ldt(v, 1);
1086 put_page(page);
1091 /*
1092 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
1093 * Note also that this automatically deals correctly with linear p.t.'s.
1094 */
1095 static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
1097 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
1098 return 1;
1100 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1102 unsigned long mfn = l2e_get_pfn(l2e), m = mfn;
1103 int writeable = l2e_get_flags(l2e) & _PAGE_RW;
1105 ASSERT(!(mfn & (L1_PAGETABLE_ENTRIES-1)));
1106 do {
1107 put_data_page(mfn_to_page(m), writeable);
1108 } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
1110 else
1112 put_page_and_type(l2e_get_page(l2e));
1115 return 0;
1118 static int __put_page_type(struct page_info *, int preemptible);
1120 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
1121 int partial, int preemptible)
1123 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
1124 return 1;
1126 #ifdef __x86_64__
1127 if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) )
1129 unsigned long mfn = l3e_get_pfn(l3e);
1130 int writeable = l3e_get_flags(l3e) & _PAGE_RW;
1132 ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
1133 do {
1134 put_data_page(mfn_to_page(mfn), writeable);
1135 } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
1137 return 0;
1139 #endif
1141 if ( unlikely(partial > 0) )
1142 return __put_page_type(l3e_get_page(l3e), preemptible);
1144 return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
1147 #if CONFIG_PAGING_LEVELS >= 4
1148 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
1149 int partial, int preemptible)
1151 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
1152 (l4e_get_pfn(l4e) != pfn) )
1154 if ( unlikely(partial > 0) )
1155 return __put_page_type(l4e_get_page(l4e), preemptible);
1156 return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
1158 return 1;
1160 #endif
1162 static int alloc_l1_table(struct page_info *page)
1164 struct domain *d = page_get_owner(page);
1165 unsigned long pfn = page_to_mfn(page);
1166 l1_pgentry_t *pl1e;
1167 unsigned int i;
1169 pl1e = map_domain_page(pfn);
1171 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1173 if ( is_guest_l1_slot(i) &&
1174 unlikely(!get_page_from_l1e(pl1e[i], d, d)) )
1175 goto fail;
1177 adjust_guest_l1e(pl1e[i], d);
1180 unmap_domain_page(pl1e);
1181 return 0;
1183 fail:
1184 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
1185 while ( i-- > 0 )
1186 if ( is_guest_l1_slot(i) )
1187 put_page_from_l1e(pl1e[i], d);
1189 unmap_domain_page(pl1e);
1190 return -EINVAL;
1193 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
1195 struct page_info *page;
1196 l3_pgentry_t l3e3;
1197 #ifdef __i386__
1198 l2_pgentry_t *pl2e, l2e;
1199 int i;
1200 #endif
1202 if ( !is_pv_32bit_domain(d) )
1203 return 1;
1205 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
1207 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
1208 l3e3 = pl3e[3];
1209 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
1211 MEM_LOG("PAE L3 3rd slot is empty");
1212 return 0;
1215 /*
1216 * The Xen-private mappings include linear mappings. The L2 thus cannot
1217 * be shared by multiple L3 tables. The test here is adequate because:
1218 * 1. Cannot appear in slots != 3 because get_page_type() checks the
1219 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
1220 * 2. Cannot appear in another page table's L3:
1221 * a. alloc_l3_table() calls this function and this check will fail
1222 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
1223 */
1224 page = l3e_get_page(l3e3);
1225 BUG_ON(page->u.inuse.type_info & PGT_pinned);
1226 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1227 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1228 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1230 MEM_LOG("PAE L3 3rd slot is shared");
1231 return 0;
1234 #ifdef __i386__
1235 /* Xen linear pagetable mappings. */
1236 pl2e = map_domain_page(l3e_get_pfn(l3e3));
1237 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
1239 l2e = l2e_empty();
1240 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
1241 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
1242 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
1244 unmap_domain_page(pl2e);
1245 #endif
1247 return 1;
1250 #ifdef __i386__
1251 /* Flush a pgdir update into low-memory caches. */
1252 static void pae_flush_pgd(
1253 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
1255 struct domain *d = page_get_owner(mfn_to_page(mfn));
1256 struct vcpu *v;
1257 intpte_t _ol3e, _nl3e, _pl3e;
1258 l3_pgentry_t *l3tab_ptr;
1259 struct pae_l3_cache *cache;
1261 if ( unlikely(shadow_mode_enabled(d)) )
1263 cpumask_t m = CPU_MASK_NONE;
1264 /* Re-shadow this l3 table on any vcpus that are using it */
1265 for_each_vcpu ( d, v )
1266 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
1268 paging_update_cr3(v);
1269 cpus_or(m, m, v->vcpu_dirty_cpumask);
1271 flush_tlb_mask(&m);
1274 /* If below 4GB then the pgdir is not shadowed in low memory. */
1275 if ( !l3tab_needs_shadow(mfn) )
1276 return;
1278 for_each_vcpu ( d, v )
1280 cache = &v->arch.pae_l3_cache;
1282 spin_lock(&cache->lock);
1284 if ( cache->high_mfn == mfn )
1286 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1287 _ol3e = l3e_get_intpte(*l3tab_ptr);
1288 _nl3e = l3e_get_intpte(nl3e);
1289 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1290 BUG_ON(_pl3e != _ol3e);
1293 spin_unlock(&cache->lock);
1296 flush_tlb_mask(&d->domain_dirty_cpumask);
1298 #else
1299 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1300 #endif
1302 static int alloc_l2_table(struct page_info *page, unsigned long type,
1303 int preemptible)
1305 struct domain *d = page_get_owner(page);
1306 unsigned long pfn = page_to_mfn(page);
1307 l2_pgentry_t *pl2e;
1308 unsigned int i;
1309 int rc = 0;
1311 pl2e = map_domain_page(pfn);
1313 for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
1315 if ( preemptible && i && hypercall_preempt_check() )
1317 page->nr_validated_ptes = i;
1318 rc = -EAGAIN;
1319 break;
1322 if ( !is_guest_l2_slot(d, type, i) ||
1323 (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
1324 continue;
1326 if ( rc < 0 )
1328 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1329 while ( i-- > 0 )
1330 if ( is_guest_l2_slot(d, type, i) )
1331 put_page_from_l2e(pl2e[i], pfn);
1332 break;
1335 adjust_guest_l2e(pl2e[i], d);
1338 if ( rc >= 0 && (type & PGT_pae_xen_l2) )
1340 /* Xen private mappings. */
1341 #if defined(__i386__)
1342 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1343 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1344 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1345 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1346 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i],
1347 l2e_from_page(perdomain_pt_page(d, i),
1348 __PAGE_HYPERVISOR));
1349 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1350 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
1351 #else
1352 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1353 &compat_idle_pg_table_l2[
1354 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1355 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
1356 #endif
1359 unmap_domain_page(pl2e);
1360 return rc > 0 ? 0 : rc;
1363 static int alloc_l3_table(struct page_info *page, int preemptible)
1365 struct domain *d = page_get_owner(page);
1366 unsigned long pfn = page_to_mfn(page);
1367 l3_pgentry_t *pl3e;
1368 unsigned int i;
1369 int rc = 0, partial = page->partial_pte;
1371 #if CONFIG_PAGING_LEVELS == 3
1372 /*
1373 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1374 * the weird 'extended cr3' format for dealing with high-order address
1375 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1376 */
1377 if ( (pfn >= 0x100000) &&
1378 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1379 d->vcpu && d->vcpu[0] && d->vcpu[0]->is_initialised )
1381 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1382 return -EINVAL;
1384 #endif
1386 pl3e = map_domain_page(pfn);
1388 /*
1389 * PAE guests allocate full pages, but aren't required to initialize
1390 * more than the first four entries; when running in compatibility
1391 * mode, however, the full page is visible to the MMU, and hence all
1392 * 512 entries must be valid/verified, which is most easily achieved
1393 * by clearing them out.
1394 */
1395 if ( is_pv_32on64_domain(d) )
1396 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1398 for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
1399 i++, partial = 0 )
1401 if ( is_pv_32bit_domain(d) && (i == 3) )
1403 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1404 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
1405 rc = -EINVAL;
1406 else
1407 rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1408 PGT_l2_page_table |
1409 PGT_pae_xen_l2,
1410 d, partial, preemptible);
1412 else if ( !is_guest_l3_slot(i) ||
1413 (rc = get_page_from_l3e(pl3e[i], pfn, d,
1414 partial, preemptible)) > 0 )
1415 continue;
1417 if ( rc == -EAGAIN )
1419 page->nr_validated_ptes = i;
1420 page->partial_pte = partial ?: 1;
1422 else if ( rc == -EINTR && i )
1424 page->nr_validated_ptes = i;
1425 page->partial_pte = 0;
1426 rc = -EAGAIN;
1428 if ( rc < 0 )
1429 break;
1431 adjust_guest_l3e(pl3e[i], d);
1434 if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
1435 rc = -EINVAL;
1436 if ( rc < 0 && rc != -EAGAIN && rc != -EINTR )
1438 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1439 while ( i-- > 0 )
1441 if ( !is_guest_l3_slot(i) )
1442 continue;
1443 unadjust_guest_l3e(pl3e[i], d);
1444 put_page_from_l3e(pl3e[i], pfn, 0, 0);
1448 unmap_domain_page(pl3e);
1449 return rc > 0 ? 0 : rc;
1452 #if CONFIG_PAGING_LEVELS >= 4
1453 static int alloc_l4_table(struct page_info *page, int preemptible)
1455 struct domain *d = page_get_owner(page);
1456 unsigned long pfn = page_to_mfn(page);
1457 l4_pgentry_t *pl4e = page_to_virt(page);
1458 unsigned int i;
1459 int rc = 0, partial = page->partial_pte;
1461 for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
1462 i++, partial = 0 )
1464 if ( !is_guest_l4_slot(d, i) ||
1465 (rc = get_page_from_l4e(pl4e[i], pfn, d,
1466 partial, preemptible)) > 0 )
1467 continue;
1469 if ( rc == -EAGAIN )
1471 page->nr_validated_ptes = i;
1472 page->partial_pte = partial ?: 1;
1474 else if ( rc == -EINTR )
1476 if ( i )
1478 page->nr_validated_ptes = i;
1479 page->partial_pte = 0;
1480 rc = -EAGAIN;
1483 else if ( rc < 0 )
1485 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1486 while ( i-- > 0 )
1487 if ( is_guest_l4_slot(d, i) )
1488 put_page_from_l4e(pl4e[i], pfn, 0, 0);
1490 if ( rc < 0 )
1491 return rc;
1493 adjust_guest_l4e(pl4e[i], d);
1496 /* Xen private mappings. */
1497 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1498 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1499 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1500 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1501 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1502 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1503 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1504 __PAGE_HYPERVISOR);
1506 return rc > 0 ? 0 : rc;
1508 #else
1509 #define alloc_l4_table(page, preemptible) (-EINVAL)
1510 #endif
1513 static void free_l1_table(struct page_info *page)
1515 struct domain *d = page_get_owner(page);
1516 unsigned long pfn = page_to_mfn(page);
1517 l1_pgentry_t *pl1e;
1518 unsigned int i;
1520 pl1e = map_domain_page(pfn);
1522 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1523 if ( is_guest_l1_slot(i) )
1524 put_page_from_l1e(pl1e[i], d);
1526 unmap_domain_page(pl1e);
1530 static int free_l2_table(struct page_info *page, int preemptible)
1532 #ifdef __x86_64__
1533 struct domain *d = page_get_owner(page);
1534 #endif
1535 unsigned long pfn = page_to_mfn(page);
1536 l2_pgentry_t *pl2e;
1537 unsigned int i = page->nr_validated_ptes - 1;
1538 int err = 0;
1540 pl2e = map_domain_page(pfn);
1542 ASSERT(page->nr_validated_ptes);
1543 do {
1544 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
1545 put_page_from_l2e(pl2e[i], pfn) == 0 &&
1546 preemptible && i && hypercall_preempt_check() )
1548 page->nr_validated_ptes = i;
1549 err = -EAGAIN;
1551 } while ( !err && i-- );
1553 unmap_domain_page(pl2e);
1555 if ( !err )
1556 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1558 return err;
1561 static int free_l3_table(struct page_info *page, int preemptible)
1563 struct domain *d = page_get_owner(page);
1564 unsigned long pfn = page_to_mfn(page);
1565 l3_pgentry_t *pl3e;
1566 int rc = 0, partial = page->partial_pte;
1567 unsigned int i = page->nr_validated_ptes - !partial;
1569 pl3e = map_domain_page(pfn);
1571 do {
1572 if ( is_guest_l3_slot(i) )
1574 rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible);
1575 if ( rc < 0 )
1576 break;
1577 partial = 0;
1578 if ( rc > 0 )
1579 continue;
1580 unadjust_guest_l3e(pl3e[i], d);
1582 } while ( i-- );
1584 unmap_domain_page(pl3e);
1586 if ( rc == -EAGAIN )
1588 page->nr_validated_ptes = i;
1589 page->partial_pte = partial ?: -1;
1591 else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
1593 page->nr_validated_ptes = i + 1;
1594 page->partial_pte = 0;
1595 rc = -EAGAIN;
1597 return rc > 0 ? 0 : rc;
1600 #if CONFIG_PAGING_LEVELS >= 4
1601 static int free_l4_table(struct page_info *page, int preemptible)
1603 struct domain *d = page_get_owner(page);
1604 unsigned long pfn = page_to_mfn(page);
1605 l4_pgentry_t *pl4e = page_to_virt(page);
1606 int rc = 0, partial = page->partial_pte;
1607 unsigned int i = page->nr_validated_ptes - !partial;
1609 do {
1610 if ( is_guest_l4_slot(d, i) )
1611 rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible);
1612 if ( rc < 0 )
1613 break;
1614 partial = 0;
1615 } while ( i-- );
1617 if ( rc == -EAGAIN )
1619 page->nr_validated_ptes = i;
1620 page->partial_pte = partial ?: -1;
1622 else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
1624 page->nr_validated_ptes = i + 1;
1625 page->partial_pte = 0;
1626 rc = -EAGAIN;
1628 return rc > 0 ? 0 : rc;
1630 #else
1631 #define free_l4_table(page, preemptible) (-EINVAL)
1632 #endif
1634 static int page_lock(struct page_info *page)
1636 unsigned long x, nx;
1638 do {
1639 while ( (x = page->u.inuse.type_info) & PGT_locked )
1640 cpu_relax();
1641 nx = x + (1 | PGT_locked);
1642 if ( !(x & PGT_validated) ||
1643 !(x & PGT_count_mask) ||
1644 !(nx & PGT_count_mask) )
1645 return 0;
1646 } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
1648 return 1;
1651 static void page_unlock(struct page_info *page)
1653 unsigned long x, nx, y = page->u.inuse.type_info;
1655 do {
1656 x = y;
1657 nx = x - (1 | PGT_locked);
1658 } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
1661 /* How to write an entry to the guest pagetables.
1662 * Returns 0 for failure (pointer not valid), 1 for success. */
1663 static inline int update_intpte(intpte_t *p,
1664 intpte_t old,
1665 intpte_t new,
1666 unsigned long mfn,
1667 struct vcpu *v,
1668 int preserve_ad)
1670 int rv = 1;
1671 #ifndef PTE_UPDATE_WITH_CMPXCHG
1672 if ( !preserve_ad )
1674 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1676 else
1677 #endif
1679 intpte_t t = old;
1680 for ( ; ; )
1682 intpte_t _new = new;
1683 if ( preserve_ad )
1684 _new |= old & (_PAGE_ACCESSED | _PAGE_DIRTY);
1686 rv = paging_cmpxchg_guest_entry(v, p, &t, _new, _mfn(mfn));
1687 if ( unlikely(rv == 0) )
1689 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1690 ": saw %" PRIpte, old, _new, t);
1691 break;
1694 if ( t == old )
1695 break;
1697 /* Allowed to change in Accessed/Dirty flags only. */
1698 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1700 old = t;
1703 return rv;
1706 /* Macro that wraps the appropriate type-changes around update_intpte().
1707 * Arguments are: type, ptr, old, new, mfn, vcpu */
1708 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad) \
1709 update_intpte(&_t ## e_get_intpte(*(_p)), \
1710 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1711 (_m), (_v), (_ad))
1713 /* Update the L1 entry at pl1e to new value nl1e. */
1714 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1715 unsigned long gl1mfn, int preserve_ad,
1716 struct vcpu *pt_vcpu, struct domain *pg_dom)
1718 l1_pgentry_t ol1e;
1719 struct domain *pt_dom = pt_vcpu->domain;
1720 unsigned long mfn;
1721 p2m_type_t p2mt;
1722 int rc = 1;
1724 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1725 return 0;
1727 if ( unlikely(paging_mode_refcounts(pt_dom)) )
1729 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu, preserve_ad);
1730 return rc;
1733 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1735 /* Translate foreign guest addresses. */
1736 mfn = mfn_x(gfn_to_mfn(pg_dom, l1e_get_pfn(nl1e), &p2mt));
1737 if ( !p2m_is_ram(p2mt) || unlikely(mfn == INVALID_MFN) )
1738 return 0;
1739 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1740 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1742 if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom)) )
1744 MEM_LOG("Bad L1 flags %x",
1745 l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom));
1746 return 0;
1749 /* Fast path for identical mapping, r/w and presence. */
1750 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1752 adjust_guest_l1e(nl1e, pt_dom);
1753 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
1754 preserve_ad);
1755 return rc;
1758 if ( unlikely(!get_page_from_l1e(nl1e, pt_dom, pg_dom)) )
1759 return 0;
1761 adjust_guest_l1e(nl1e, pt_dom);
1762 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
1763 preserve_ad)) )
1765 ol1e = nl1e;
1766 rc = 0;
1769 else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
1770 preserve_ad)) )
1772 return 0;
1775 put_page_from_l1e(ol1e, pt_dom);
1776 return rc;
1780 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1781 static int mod_l2_entry(l2_pgentry_t *pl2e,
1782 l2_pgentry_t nl2e,
1783 unsigned long pfn,
1784 int preserve_ad,
1785 struct vcpu *vcpu)
1787 l2_pgentry_t ol2e;
1788 struct domain *d = vcpu->domain;
1789 struct page_info *l2pg = mfn_to_page(pfn);
1790 unsigned long type = l2pg->u.inuse.type_info;
1791 int rc = 1;
1793 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1795 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1796 return 0;
1799 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1800 return 0;
1802 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1804 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1806 MEM_LOG("Bad L2 flags %x",
1807 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1808 return 0;
1811 /* Fast path for identical mapping and presence. */
1812 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT) )
1814 adjust_guest_l2e(nl2e, d);
1815 rc = UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, preserve_ad);
1816 return rc;
1819 if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) )
1820 return 0;
1822 adjust_guest_l2e(nl2e, d);
1823 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
1824 preserve_ad)) )
1826 ol2e = nl2e;
1827 rc = 0;
1830 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
1831 preserve_ad)) )
1833 return 0;
1836 put_page_from_l2e(ol2e, pfn);
1837 return rc;
1840 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1841 static int mod_l3_entry(l3_pgentry_t *pl3e,
1842 l3_pgentry_t nl3e,
1843 unsigned long pfn,
1844 int preserve_ad,
1845 int preemptible,
1846 struct vcpu *vcpu)
1848 l3_pgentry_t ol3e;
1849 struct domain *d = vcpu->domain;
1850 int rc = 0;
1852 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1854 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1855 return -EINVAL;
1858 /*
1859 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1860 * would be a pain to ensure they remain continuously valid throughout.
1861 */
1862 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1863 return -EINVAL;
1865 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1866 return -EFAULT;
1868 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1870 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1872 MEM_LOG("Bad L3 flags %x",
1873 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1874 return -EINVAL;
1877 /* Fast path for identical mapping and presence. */
1878 if ( !l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT) )
1880 adjust_guest_l3e(nl3e, d);
1881 rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, preserve_ad);
1882 return rc ? 0 : -EFAULT;
1885 rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible);
1886 if ( unlikely(rc < 0) )
1887 return rc;
1888 rc = 0;
1890 adjust_guest_l3e(nl3e, d);
1891 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
1892 preserve_ad)) )
1894 ol3e = nl3e;
1895 rc = -EFAULT;
1898 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
1899 preserve_ad)) )
1901 return -EFAULT;
1904 if ( likely(rc == 0) )
1906 if ( !create_pae_xen_mappings(d, pl3e) )
1907 BUG();
1909 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1912 put_page_from_l3e(ol3e, pfn, 0, 0);
1913 return rc;
1916 #if CONFIG_PAGING_LEVELS >= 4
1918 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1919 static int mod_l4_entry(l4_pgentry_t *pl4e,
1920 l4_pgentry_t nl4e,
1921 unsigned long pfn,
1922 int preserve_ad,
1923 int preemptible,
1924 struct vcpu *vcpu)
1926 struct domain *d = vcpu->domain;
1927 l4_pgentry_t ol4e;
1928 int rc = 0;
1930 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1932 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1933 return -EINVAL;
1936 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1937 return -EFAULT;
1939 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1941 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1943 MEM_LOG("Bad L4 flags %x",
1944 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1945 return -EINVAL;
1948 /* Fast path for identical mapping and presence. */
1949 if ( !l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT) )
1951 adjust_guest_l4e(nl4e, d);
1952 rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, preserve_ad);
1953 return rc ? 0 : -EFAULT;
1956 rc = get_page_from_l4e(nl4e, pfn, d, 0, preemptible);
1957 if ( unlikely(rc < 0) )
1958 return rc;
1959 rc = 0;
1961 adjust_guest_l4e(nl4e, d);
1962 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
1963 preserve_ad)) )
1965 ol4e = nl4e;
1966 rc = -EFAULT;
1969 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
1970 preserve_ad)) )
1972 return -EFAULT;
1975 put_page_from_l4e(ol4e, pfn, 0, 0);
1976 return rc;
1979 #endif
1981 void put_page(struct page_info *page)
1983 unsigned long nx, x, y = page->count_info;
1985 do {
1986 ASSERT((y & PGC_count_mask) != 0);
1987 x = y;
1988 nx = x - 1;
1990 while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) );
1992 if ( unlikely((nx & PGC_count_mask) == 0) )
1994 cleanup_page_cacheattr(page);
1995 free_domheap_page(page);
2000 struct domain *page_get_owner_and_reference(struct page_info *page)
2002 unsigned long x, y = page->count_info;
2004 do {
2005 x = y;
2006 /*
2007 * Count == 0: Page is not allocated, so we cannot take a reference.
2008 * Count == -1: Reference count would wrap, which is invalid.
2009 * Count == -2: Remaining unused ref is reserved for get_page_light().
2010 */
2011 if ( unlikely(((x + 2) & PGC_count_mask) <= 2) )
2012 return NULL;
2014 while ( (y = cmpxchg(&page->count_info, x, x + 1)) != x );
2016 return page_get_owner(page);
2020 int get_page(struct page_info *page, struct domain *domain)
2022 struct domain *owner = page_get_owner_and_reference(page);
2024 if ( likely(owner == domain) )
2025 return 1;
2027 if ( owner != NULL )
2028 put_page(page);
2030 if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
2031 gdprintk(XENLOG_INFO,
2032 "Error pfn %lx: rd=%p, od=%p, caf=%08lx, taf=%"
2033 PRtype_info "\n",
2034 page_to_mfn(page), domain, owner,
2035 page->count_info, page->u.inuse.type_info);
2036 return 0;
2039 /*
2040 * Special version of get_page() to be used exclusively when
2041 * - a page is known to already have a non-zero reference count
2042 * - the page does not need its owner to be checked
2043 * - it will not be called more than once without dropping the thus
2044 * acquired reference again.
2045 * Due to get_page() reserving one reference, this call cannot fail.
2046 */
2047 static void get_page_light(struct page_info *page)
2049 unsigned long x, nx, y = page->count_info;
2051 do {
2052 x = y;
2053 nx = x + 1;
2054 BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
2055 BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
2056 y = cmpxchg(&page->count_info, x, nx);
2058 while ( unlikely(y != x) );
2061 static int alloc_page_type(struct page_info *page, unsigned long type,
2062 int preemptible)
2064 struct domain *owner = page_get_owner(page);
2065 int rc;
2067 /* A page table is dirtied when its type count becomes non-zero. */
2068 if ( likely(owner != NULL) )
2069 paging_mark_dirty(owner, page_to_mfn(page));
2071 switch ( type & PGT_type_mask )
2073 case PGT_l1_page_table:
2074 rc = alloc_l1_table(page);
2075 break;
2076 case PGT_l2_page_table:
2077 rc = alloc_l2_table(page, type, preemptible);
2078 break;
2079 case PGT_l3_page_table:
2080 rc = alloc_l3_table(page, preemptible);
2081 break;
2082 case PGT_l4_page_table:
2083 rc = alloc_l4_table(page, preemptible);
2084 break;
2085 case PGT_seg_desc_page:
2086 rc = alloc_segdesc_page(page);
2087 break;
2088 default:
2089 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%lx\n",
2090 type, page->u.inuse.type_info,
2091 page->count_info);
2092 rc = -EINVAL;
2093 BUG();
2096 /* No need for atomic update of type_info here: noone else updates it. */
2097 wmb();
2098 if ( rc == -EAGAIN )
2100 get_page_light(page);
2101 page->u.inuse.type_info |= PGT_partial;
2103 else if ( rc == -EINTR )
2105 ASSERT((page->u.inuse.type_info &
2106 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2107 page->u.inuse.type_info &= ~PGT_count_mask;
2109 else if ( rc )
2111 ASSERT(rc < 0);
2112 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
2113 PRtype_info ": caf=%08lx taf=%" PRtype_info,
2114 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
2115 type, page->count_info, page->u.inuse.type_info);
2116 page->u.inuse.type_info = 0;
2118 else
2120 page->u.inuse.type_info |= PGT_validated;
2123 return rc;
2127 int free_page_type(struct page_info *page, unsigned long type,
2128 int preemptible)
2130 struct domain *owner = page_get_owner(page);
2131 unsigned long gmfn;
2132 int rc;
2134 if ( likely(owner != NULL) && unlikely(paging_mode_enabled(owner)) )
2136 /* A page table is dirtied when its type count becomes zero. */
2137 paging_mark_dirty(owner, page_to_mfn(page));
2139 if ( shadow_mode_refcounts(owner) )
2140 return 0;
2142 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
2143 ASSERT(VALID_M2P(gmfn));
2144 /* Page sharing not supported for shadowed domains */
2145 if(!SHARED_M2P(gmfn))
2146 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
2149 if ( !(type & PGT_partial) )
2151 page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
2152 page->partial_pte = 0;
2155 switch ( type & PGT_type_mask )
2157 case PGT_l1_page_table:
2158 free_l1_table(page);
2159 rc = 0;
2160 break;
2161 case PGT_l2_page_table:
2162 rc = free_l2_table(page, preemptible);
2163 break;
2164 case PGT_l3_page_table:
2165 #if CONFIG_PAGING_LEVELS == 3
2166 if ( !(type & PGT_partial) )
2167 page->nr_validated_ptes = L3_PAGETABLE_ENTRIES;
2168 #endif
2169 rc = free_l3_table(page, preemptible);
2170 break;
2171 case PGT_l4_page_table:
2172 rc = free_l4_table(page, preemptible);
2173 break;
2174 default:
2175 MEM_LOG("type %lx pfn %lx\n", type, page_to_mfn(page));
2176 rc = -EINVAL;
2177 BUG();
2180 return rc;
2184 static int __put_final_page_type(
2185 struct page_info *page, unsigned long type, int preemptible)
2187 int rc = free_page_type(page, type, preemptible);
2189 /* No need for atomic update of type_info here: noone else updates it. */
2190 if ( rc == 0 )
2192 /*
2193 * Record TLB information for flush later. We do not stamp page tables
2194 * when running in shadow mode:
2195 * 1. Pointless, since it's the shadow pt's which must be tracked.
2196 * 2. Shadow mode reuses this field for shadowed page tables to
2197 * store flags info -- we don't want to conflict with that.
2198 */
2199 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2200 (page->count_info & PGC_page_table)) )
2201 page->tlbflush_timestamp = tlbflush_current_time();
2202 wmb();
2203 page->u.inuse.type_info--;
2205 else if ( rc == -EINTR )
2207 ASSERT((page->u.inuse.type_info &
2208 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2209 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2210 (page->count_info & PGC_page_table)) )
2211 page->tlbflush_timestamp = tlbflush_current_time();
2212 wmb();
2213 page->u.inuse.type_info |= PGT_validated;
2215 else
2217 BUG_ON(rc != -EAGAIN);
2218 wmb();
2219 get_page_light(page);
2220 page->u.inuse.type_info |= PGT_partial;
2223 return rc;
2227 static int __put_page_type(struct page_info *page,
2228 int preemptible)
2230 unsigned long nx, x, y = page->u.inuse.type_info;
2231 int rc = 0;
2233 for ( ; ; )
2235 x = y;
2236 nx = x - 1;
2238 ASSERT((x & PGT_count_mask) != 0);
2240 if ( unlikely((nx & PGT_count_mask) == 0) )
2242 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
2243 likely(nx & (PGT_validated|PGT_partial)) )
2245 /*
2246 * Page-table pages must be unvalidated when count is zero. The
2247 * 'free' is safe because the refcnt is non-zero and validated
2248 * bit is clear => other ops will spin or fail.
2249 */
2250 nx = x & ~(PGT_validated|PGT_partial);
2251 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
2252 x, nx)) != x) )
2253 continue;
2254 /* We cleared the 'valid bit' so we do the clean up. */
2255 rc = __put_final_page_type(page, x, preemptible);
2256 if ( x & PGT_partial )
2257 put_page(page);
2258 break;
2261 /*
2262 * Record TLB information for flush later. We do not stamp page
2263 * tables when running in shadow mode:
2264 * 1. Pointless, since it's the shadow pt's which must be tracked.
2265 * 2. Shadow mode reuses this field for shadowed page tables to
2266 * store flags info -- we don't want to conflict with that.
2267 */
2268 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2269 (page->count_info & PGC_page_table)) )
2270 page->tlbflush_timestamp = tlbflush_current_time();
2273 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2274 break;
2276 if ( preemptible && hypercall_preempt_check() )
2277 return -EINTR;
2280 return rc;
2284 static int __get_page_type(struct page_info *page, unsigned long type,
2285 int preemptible)
2287 unsigned long nx, x, y = page->u.inuse.type_info;
2288 int rc = 0;
2290 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
2292 for ( ; ; )
2294 x = y;
2295 nx = x + 1;
2296 if ( unlikely((nx & PGT_count_mask) == 0) )
2298 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
2299 return -EINVAL;
2301 else if ( unlikely((x & PGT_count_mask) == 0) )
2303 struct domain *d = page_get_owner(page);
2305 /* Normally we should never let a page go from type count 0
2306 * to type count 1 when it is shadowed. One exception:
2307 * out-of-sync shadowed pages are allowed to become
2308 * writeable. */
2309 if ( d && shadow_mode_enabled(d)
2310 && (page->count_info & PGC_page_table)
2311 && !((page->shadow_flags & (1u<<29))
2312 && type == PGT_writable_page) )
2313 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
2315 ASSERT(!(x & PGT_pae_xen_l2));
2316 if ( (x & PGT_type_mask) != type )
2318 /*
2319 * On type change we check to flush stale TLB entries. This
2320 * may be unnecessary (e.g., page was GDT/LDT) but those
2321 * circumstances should be very rare.
2322 */
2323 cpumask_t mask = d->domain_dirty_cpumask;
2325 /* Don't flush if the timestamp is old enough */
2326 tlbflush_filter(mask, page->tlbflush_timestamp);
2328 if ( unlikely(!cpus_empty(mask)) &&
2329 /* Shadow mode: track only writable pages. */
2330 (!shadow_mode_enabled(page_get_owner(page)) ||
2331 ((nx & PGT_type_mask) == PGT_writable_page)) )
2333 perfc_incr(need_flush_tlb_flush);
2334 flush_tlb_mask(&mask);
2337 /* We lose existing type and validity. */
2338 nx &= ~(PGT_type_mask | PGT_validated);
2339 nx |= type;
2341 /* No special validation needed for writable pages. */
2342 /* Page tables and GDT/LDT need to be scanned for validity. */
2343 if ( type == PGT_writable_page )
2344 nx |= PGT_validated;
2347 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
2349 /* Don't log failure if it could be a recursive-mapping attempt. */
2350 if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
2351 (type == PGT_l1_page_table) )
2352 return -EINVAL;
2353 if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
2354 (type == PGT_l2_page_table) )
2355 return -EINVAL;
2356 if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
2357 (type == PGT_l3_page_table) )
2358 return -EINVAL;
2359 MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
2360 "for mfn %lx (pfn %lx)",
2361 x, type, page_to_mfn(page),
2362 get_gpfn_from_mfn(page_to_mfn(page)));
2363 return -EINVAL;
2365 else if ( unlikely(!(x & PGT_validated)) )
2367 if ( !(x & PGT_partial) )
2369 /* Someone else is updating validation of this page. Wait... */
2370 while ( (y = page->u.inuse.type_info) == x )
2372 if ( preemptible && hypercall_preempt_check() )
2373 return -EINTR;
2374 cpu_relax();
2376 continue;
2378 /* Type ref count was left at 1 when PGT_partial got set. */
2379 ASSERT((x & PGT_count_mask) == 1);
2380 nx = x & ~PGT_partial;
2383 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2384 break;
2386 if ( preemptible && hypercall_preempt_check() )
2387 return -EINTR;
2390 if ( unlikely((x & PGT_type_mask) != type) )
2392 /* Special pages should not be accessible from devices. */
2393 struct domain *d = page_get_owner(page);
2394 if ( d && !is_hvm_domain(d) && unlikely(need_iommu(d)) )
2396 if ( (x & PGT_type_mask) == PGT_writable_page )
2397 iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));
2398 else if ( type == PGT_writable_page )
2399 iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
2400 page_to_mfn(page));
2404 if ( unlikely(!(nx & PGT_validated)) )
2406 if ( !(x & PGT_partial) )
2408 page->nr_validated_ptes = 0;
2409 page->partial_pte = 0;
2411 rc = alloc_page_type(page, type, preemptible);
2414 if ( (x & PGT_partial) && !(nx & PGT_partial) )
2415 put_page(page);
2417 return rc;
2420 void put_page_type(struct page_info *page)
2422 int rc = __put_page_type(page, 0);
2423 ASSERT(rc == 0);
2424 (void)rc;
2427 int get_page_type(struct page_info *page, unsigned long type)
2429 int rc = __get_page_type(page, type, 0);
2430 if ( likely(rc == 0) )
2431 return 1;
2432 ASSERT(rc == -EINVAL);
2433 return 0;
2436 int put_page_type_preemptible(struct page_info *page)
2438 return __put_page_type(page, 1);
2441 int get_page_type_preemptible(struct page_info *page, unsigned long type)
2443 return __get_page_type(page, type, 1);
2446 void cleanup_page_cacheattr(struct page_info *page)
2448 uint32_t cacheattr =
2449 (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base;
2451 if ( likely(cacheattr == 0) )
2452 return;
2454 page->count_info &= ~PGC_cacheattr_mask;
2456 BUG_ON(is_xen_heap_page(page));
2458 update_xen_mappings(page_to_mfn(page), 0);
2462 int new_guest_cr3(unsigned long mfn)
2464 struct vcpu *curr = current;
2465 struct domain *d = curr->domain;
2466 int okay;
2467 unsigned long old_base_mfn;
2469 #ifdef __x86_64__
2470 if ( is_pv_32on64_domain(d) )
2472 okay = paging_mode_refcounts(d)
2473 ? 0 /* Old code was broken, but what should it be? */
2474 : mod_l4_entry(
2475 __va(pagetable_get_paddr(curr->arch.guest_table)),
2476 l4e_from_pfn(
2477 mfn,
2478 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
2479 pagetable_get_pfn(curr->arch.guest_table), 0, 0, curr) == 0;
2480 if ( unlikely(!okay) )
2482 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
2483 return 0;
2486 invalidate_shadow_ldt(curr, 0);
2487 write_ptbase(curr);
2489 return 1;
2491 #endif
2492 okay = paging_mode_refcounts(d)
2493 ? get_page_from_pagenr(mfn, d)
2494 : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
2495 if ( unlikely(!okay) )
2497 MEM_LOG("Error while installing new baseptr %lx", mfn);
2498 return 0;
2501 invalidate_shadow_ldt(curr, 0);
2503 old_base_mfn = pagetable_get_pfn(curr->arch.guest_table);
2505 curr->arch.guest_table = pagetable_from_pfn(mfn);
2506 update_cr3(curr);
2508 write_ptbase(curr);
2510 if ( likely(old_base_mfn != 0) )
2512 if ( paging_mode_refcounts(d) )
2513 put_page(mfn_to_page(old_base_mfn));
2514 else
2515 put_page_and_type(mfn_to_page(old_base_mfn));
2518 return 1;
2521 static struct domain *get_pg_owner(domid_t domid)
2523 struct domain *pg_owner = NULL, *curr = current->domain;
2525 if ( likely(domid == DOMID_SELF) )
2527 pg_owner = rcu_lock_domain(curr);
2528 goto out;
2531 if ( unlikely(domid == curr->domain_id) )
2533 MEM_LOG("Cannot specify itself as foreign domain");
2534 goto out;
2537 if ( unlikely(paging_mode_translate(curr)) )
2539 MEM_LOG("Cannot mix foreign mappings with translated domains");
2540 goto out;
2543 switch ( domid )
2545 case DOMID_IO:
2546 pg_owner = rcu_lock_domain(dom_io);
2547 break;
2548 case DOMID_XEN:
2549 if ( !IS_PRIV(curr) )
2551 MEM_LOG("Cannot set foreign dom");
2552 break;
2554 pg_owner = rcu_lock_domain(dom_xen);
2555 break;
2556 default:
2557 if ( (pg_owner = rcu_lock_domain_by_id(domid)) == NULL )
2559 MEM_LOG("Unknown domain '%u'", domid);
2560 break;
2562 if ( !IS_PRIV_FOR(curr, pg_owner) )
2564 MEM_LOG("Cannot set foreign dom");
2565 rcu_unlock_domain(pg_owner);
2566 pg_owner = NULL;
2568 break;
2571 out:
2572 return pg_owner;
2575 static void put_pg_owner(struct domain *pg_owner)
2577 rcu_unlock_domain(pg_owner);
2580 static inline int vcpumask_to_pcpumask(
2581 struct domain *d, XEN_GUEST_HANDLE(const_void) bmap, cpumask_t *pmask)
2583 unsigned int vcpu_id, vcpu_bias, offs;
2584 unsigned long vmask;
2585 struct vcpu *v;
2586 bool_t is_native = !is_pv_32on64_domain(d);
2588 cpus_clear(*pmask);
2589 for ( vmask = 0, offs = 0; ; ++offs)
2591 vcpu_bias = offs * (is_native ? BITS_PER_LONG : 32);
2592 if ( vcpu_bias >= d->max_vcpus )
2593 return 0;
2595 if ( unlikely(is_native ?
2596 copy_from_guest_offset(&vmask, bmap, offs, 1) :
2597 copy_from_guest_offset((unsigned int *)&vmask, bmap,
2598 offs, 1)) )
2600 cpus_clear(*pmask);
2601 return -EFAULT;
2604 while ( vmask )
2606 vcpu_id = find_first_set_bit(vmask);
2607 vmask &= ~(1UL << vcpu_id);
2608 vcpu_id += vcpu_bias;
2609 if ( (vcpu_id >= d->max_vcpus) )
2610 return 0;
2611 if ( ((v = d->vcpu[vcpu_id]) != NULL) )
2612 cpus_or(*pmask, *pmask, v->vcpu_dirty_cpumask);
2617 #ifdef __i386__
2618 static inline void *fixmap_domain_page(unsigned long mfn)
2620 unsigned int cpu = smp_processor_id();
2621 void *ptr = (void *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
2623 l1e_write(fix_pae_highmem_pl1e - cpu,
2624 l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
2625 flush_tlb_one_local(ptr);
2626 return ptr;
2628 static inline void fixunmap_domain_page(const void *ptr)
2630 unsigned int cpu = virt_to_fix((unsigned long)ptr) - FIX_PAE_HIGHMEM_0;
2632 l1e_write(fix_pae_highmem_pl1e - cpu, l1e_empty());
2633 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
2635 #else
2636 #define fixmap_domain_page(mfn) mfn_to_virt(mfn)
2637 #define fixunmap_domain_page(ptr) ((void)(ptr))
2638 #endif
2640 int do_mmuext_op(
2641 XEN_GUEST_HANDLE(mmuext_op_t) uops,
2642 unsigned int count,
2643 XEN_GUEST_HANDLE(uint) pdone,
2644 unsigned int foreigndom)
2646 struct mmuext_op op;
2647 int rc = 0, i = 0, okay;
2648 unsigned long type;
2649 unsigned int done = 0;
2650 struct vcpu *curr = current;
2651 struct domain *d = curr->domain;
2652 struct domain *pg_owner;
2654 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2656 count &= ~MMU_UPDATE_PREEMPTED;
2657 if ( unlikely(!guest_handle_is_null(pdone)) )
2658 (void)copy_from_guest(&done, pdone, 1);
2660 else
2661 perfc_incr(calls_to_mmuext_op);
2663 if ( unlikely(!guest_handle_okay(uops, count)) )
2665 rc = -EFAULT;
2666 goto out;
2669 if ( (pg_owner = get_pg_owner(foreigndom)) == NULL )
2671 rc = -ESRCH;
2672 goto out;
2675 for ( i = 0; i < count; i++ )
2677 if ( hypercall_preempt_check() )
2679 rc = -EAGAIN;
2680 break;
2683 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2685 MEM_LOG("Bad __copy_from_guest");
2686 rc = -EFAULT;
2687 break;
2690 okay = 1;
2692 switch ( op.cmd )
2694 case MMUEXT_PIN_L1_TABLE:
2695 type = PGT_l1_page_table;
2696 goto pin_page;
2698 case MMUEXT_PIN_L2_TABLE:
2699 type = PGT_l2_page_table;
2700 goto pin_page;
2702 case MMUEXT_PIN_L3_TABLE:
2703 type = PGT_l3_page_table;
2704 goto pin_page;
2706 case MMUEXT_PIN_L4_TABLE:
2707 if ( is_pv_32bit_domain(pg_owner) )
2708 break;
2709 type = PGT_l4_page_table;
2711 pin_page: {
2712 unsigned long mfn;
2713 struct page_info *page;
2715 /* Ignore pinning of invalid paging levels. */
2716 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2717 break;
2719 if ( paging_mode_refcounts(pg_owner) )
2720 break;
2722 mfn = gmfn_to_mfn(pg_owner, op.arg1.mfn);
2723 rc = get_page_and_type_from_pagenr(mfn, type, pg_owner, 0, 1);
2724 okay = !rc;
2725 if ( unlikely(!okay) )
2727 if ( rc == -EINTR )
2728 rc = -EAGAIN;
2729 else if ( rc != -EAGAIN )
2730 MEM_LOG("Error while pinning mfn %lx", mfn);
2731 break;
2734 page = mfn_to_page(mfn);
2736 if ( (rc = xsm_memory_pin_page(d, page)) != 0 )
2738 put_page_and_type(page);
2739 okay = 0;
2740 break;
2743 if ( unlikely(test_and_set_bit(_PGT_pinned,
2744 &page->u.inuse.type_info)) )
2746 MEM_LOG("Mfn %lx already pinned", mfn);
2747 put_page_and_type(page);
2748 okay = 0;
2749 break;
2752 /* A page is dirtied when its pin status is set. */
2753 paging_mark_dirty(pg_owner, mfn);
2755 /* We can race domain destruction (domain_relinquish_resources). */
2756 if ( unlikely(pg_owner != d) )
2758 int drop_ref;
2759 spin_lock(&pg_owner->page_alloc_lock);
2760 drop_ref = (pg_owner->is_dying &&
2761 test_and_clear_bit(_PGT_pinned,
2762 &page->u.inuse.type_info));
2763 spin_unlock(&pg_owner->page_alloc_lock);
2764 if ( drop_ref )
2765 put_page_and_type(page);
2768 break;
2771 case MMUEXT_UNPIN_TABLE: {
2772 unsigned long mfn;
2773 struct page_info *page;
2775 if ( paging_mode_refcounts(pg_owner) )
2776 break;
2778 mfn = gmfn_to_mfn(pg_owner, op.arg1.mfn);
2779 if ( unlikely(!(okay = get_page_from_pagenr(mfn, pg_owner))) )
2781 MEM_LOG("Mfn %lx bad domain", mfn);
2782 break;
2785 page = mfn_to_page(mfn);
2787 if ( !test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
2789 okay = 0;
2790 put_page(page);
2791 MEM_LOG("Mfn %lx not pinned", mfn);
2792 break;
2795 put_page_and_type(page);
2796 put_page(page);
2798 /* A page is dirtied when its pin status is cleared. */
2799 paging_mark_dirty(pg_owner, mfn);
2801 break;
2804 case MMUEXT_NEW_BASEPTR:
2805 okay = new_guest_cr3(gmfn_to_mfn(d, op.arg1.mfn));
2806 break;
2808 #ifdef __x86_64__
2809 case MMUEXT_NEW_USER_BASEPTR: {
2810 unsigned long old_mfn, mfn;
2812 mfn = gmfn_to_mfn(d, op.arg1.mfn);
2813 if ( mfn != 0 )
2815 if ( paging_mode_refcounts(d) )
2816 okay = get_page_from_pagenr(mfn, d);
2817 else
2818 okay = !get_page_and_type_from_pagenr(
2819 mfn, PGT_root_page_table, d, 0, 0);
2820 if ( unlikely(!okay) )
2822 MEM_LOG("Error while installing new mfn %lx", mfn);
2823 break;
2827 old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
2828 curr->arch.guest_table_user = pagetable_from_pfn(mfn);
2830 if ( old_mfn != 0 )
2832 if ( paging_mode_refcounts(d) )
2833 put_page(mfn_to_page(old_mfn));
2834 else
2835 put_page_and_type(mfn_to_page(old_mfn));
2838 break;
2840 #endif
2842 case MMUEXT_TLB_FLUSH_LOCAL:
2843 flush_tlb_local();
2844 break;
2846 case MMUEXT_INVLPG_LOCAL:
2847 if ( !paging_mode_enabled(d)
2848 || paging_invlpg(curr, op.arg1.linear_addr) != 0 )
2849 flush_tlb_one_local(op.arg1.linear_addr);
2850 break;
2852 case MMUEXT_TLB_FLUSH_MULTI:
2853 case MMUEXT_INVLPG_MULTI:
2855 cpumask_t pmask;
2857 if ( unlikely(vcpumask_to_pcpumask(d, op.arg2.vcpumask, &pmask)) )
2859 okay = 0;
2860 break;
2862 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2863 flush_tlb_mask(&pmask);
2864 else
2865 flush_tlb_one_mask(&pmask, op.arg1.linear_addr);
2866 break;
2869 case MMUEXT_TLB_FLUSH_ALL:
2870 flush_tlb_mask(&d->domain_dirty_cpumask);
2871 break;
2873 case MMUEXT_INVLPG_ALL:
2874 flush_tlb_one_mask(&d->domain_dirty_cpumask, op.arg1.linear_addr);
2875 break;
2877 case MMUEXT_FLUSH_CACHE:
2878 if ( unlikely(!cache_flush_permitted(d)) )
2880 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2881 okay = 0;
2883 else
2885 wbinvd();
2887 break;
2889 case MMUEXT_SET_LDT:
2891 unsigned long ptr = op.arg1.linear_addr;
2892 unsigned long ents = op.arg2.nr_ents;
2894 if ( paging_mode_external(d) )
2896 MEM_LOG("ignoring SET_LDT hypercall from external domain");
2897 okay = 0;
2899 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2900 (ents > 8192) ||
2901 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2903 okay = 0;
2904 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2906 else if ( (curr->arch.guest_context.ldt_ents != ents) ||
2907 (curr->arch.guest_context.ldt_base != ptr) )
2909 invalidate_shadow_ldt(curr, 0);
2910 flush_tlb_local();
2911 curr->arch.guest_context.ldt_base = ptr;
2912 curr->arch.guest_context.ldt_ents = ents;
2913 load_LDT(curr);
2914 if ( ents != 0 )
2915 (void)map_ldt_shadow_page(0);
2917 break;
2920 case MMUEXT_CLEAR_PAGE: {
2921 unsigned long mfn;
2922 unsigned char *ptr;
2924 mfn = gmfn_to_mfn(d, op.arg1.mfn);
2925 okay = !get_page_and_type_from_pagenr(
2926 mfn, PGT_writable_page, d, 0, 0);
2927 if ( unlikely(!okay) )
2929 MEM_LOG("Error while clearing mfn %lx", mfn);
2930 break;
2933 /* A page is dirtied when it's being cleared. */
2934 paging_mark_dirty(d, mfn);
2936 ptr = fixmap_domain_page(mfn);
2937 clear_page(ptr);
2938 fixunmap_domain_page(ptr);
2940 put_page_and_type(mfn_to_page(mfn));
2941 break;
2944 case MMUEXT_COPY_PAGE:
2946 const unsigned char *src;
2947 unsigned char *dst;
2948 unsigned long src_mfn, mfn;
2950 src_mfn = gmfn_to_mfn(d, op.arg2.src_mfn);
2951 okay = get_page_from_pagenr(src_mfn, d);
2952 if ( unlikely(!okay) )
2954 MEM_LOG("Error while copying from mfn %lx", src_mfn);
2955 break;
2958 mfn = gmfn_to_mfn(d, op.arg1.mfn);
2959 okay = !get_page_and_type_from_pagenr(
2960 mfn, PGT_writable_page, d, 0, 0);
2961 if ( unlikely(!okay) )
2963 put_page(mfn_to_page(src_mfn));
2964 MEM_LOG("Error while copying to mfn %lx", mfn);
2965 break;
2968 /* A page is dirtied when it's being copied to. */
2969 paging_mark_dirty(d, mfn);
2971 src = map_domain_page(src_mfn);
2972 dst = fixmap_domain_page(mfn);
2973 copy_page(dst, src);
2974 fixunmap_domain_page(dst);
2975 unmap_domain_page(src);
2977 put_page_and_type(mfn_to_page(mfn));
2978 put_page(mfn_to_page(src_mfn));
2979 break;
2982 default:
2983 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2984 rc = -ENOSYS;
2985 okay = 0;
2986 break;
2989 if ( unlikely(!okay) )
2991 rc = rc ? rc : -EINVAL;
2992 break;
2995 guest_handle_add_offset(uops, 1);
2998 if ( rc == -EAGAIN )
2999 rc = hypercall_create_continuation(
3000 __HYPERVISOR_mmuext_op, "hihi",
3001 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
3003 put_pg_owner(pg_owner);
3005 perfc_add(num_mmuext_ops, i);
3007 out:
3008 /* Add incremental work we have done to the @done output parameter. */
3009 if ( unlikely(!guest_handle_is_null(pdone)) )
3011 done += i;
3012 copy_to_guest(pdone, &done, 1);
3015 return rc;
3018 int do_mmu_update(
3019 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
3020 unsigned int count,
3021 XEN_GUEST_HANDLE(uint) pdone,
3022 unsigned int foreigndom)
3024 struct mmu_update req;
3025 void *va;
3026 unsigned long gpfn, gmfn, mfn;
3027 struct page_info *page;
3028 int rc = 0, okay = 1, i = 0;
3029 unsigned int cmd, done = 0, pt_dom;
3030 struct domain *d = current->domain, *pt_owner = d, *pg_owner;
3031 struct vcpu *v = current;
3032 struct domain_mmap_cache mapcache;
3034 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
3036 count &= ~MMU_UPDATE_PREEMPTED;
3037 if ( unlikely(!guest_handle_is_null(pdone)) )
3038 (void)copy_from_guest(&done, pdone, 1);
3040 else
3041 perfc_incr(calls_to_mmu_update);
3043 if ( unlikely(!guest_handle_okay(ureqs, count)) )
3045 rc = -EFAULT;
3046 goto out;
3049 if ( (pt_dom = foreigndom >> 16) != 0 )
3051 /* Pagetables belong to a foreign domain (PFD). */
3052 if ( (pt_owner = rcu_lock_domain_by_id(pt_dom - 1)) == NULL )
3054 rc = -EINVAL;
3055 goto out;
3057 if ( pt_owner == d )
3058 rcu_unlock_domain(pt_owner);
3059 if ( (v = pt_owner->vcpu ? pt_owner->vcpu[0] : NULL) == NULL )
3061 rc = -EINVAL;
3062 goto out;
3064 if ( !IS_PRIV_FOR(d, pt_owner) )
3066 rc = -ESRCH;
3067 goto out;
3071 if ( (pg_owner = get_pg_owner((uint16_t)foreigndom)) == NULL )
3073 rc = -ESRCH;
3074 goto out;
3077 domain_mmap_cache_init(&mapcache);
3079 for ( i = 0; i < count; i++ )
3081 if ( hypercall_preempt_check() )
3083 rc = -EAGAIN;
3084 break;
3087 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
3089 MEM_LOG("Bad __copy_from_guest");
3090 rc = -EFAULT;
3091 break;
3094 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
3095 okay = 0;
3097 switch ( cmd )
3099 /*
3100 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
3101 * MMU_UPDATE_PT_PRESERVE_AD: As above but also preserve (OR)
3102 * current A/D bits.
3103 */
3104 case MMU_NORMAL_PT_UPDATE:
3105 case MMU_PT_UPDATE_PRESERVE_AD:
3107 p2m_type_t p2mt;
3109 rc = xsm_mmu_normal_update(d, pg_owner, req.val);
3110 if ( rc )
3111 break;
3113 req.ptr -= cmd;
3114 gmfn = req.ptr >> PAGE_SHIFT;
3115 mfn = mfn_x(gfn_to_mfn(pt_owner, gmfn, &p2mt));
3116 if ( !p2m_is_valid(p2mt) )
3117 mfn = INVALID_MFN;
3119 if ( p2m_is_paged(p2mt) )
3121 p2m_mem_paging_populate(pg_owner, gmfn);
3123 rc = -ENOENT;
3124 break;
3127 if ( unlikely(!get_page_from_pagenr(mfn, pt_owner)) )
3129 MEM_LOG("Could not get page for normal update");
3130 break;
3133 va = map_domain_page_with_cache(mfn, &mapcache);
3134 va = (void *)((unsigned long)va +
3135 (unsigned long)(req.ptr & ~PAGE_MASK));
3136 page = mfn_to_page(mfn);
3138 if ( page_lock(page) )
3140 switch ( page->u.inuse.type_info & PGT_type_mask )
3142 case PGT_l1_page_table:
3144 l1_pgentry_t l1e = l1e_from_intpte(req.val);
3145 p2m_type_t l1e_p2mt;
3146 gfn_to_mfn(pg_owner, l1e_get_pfn(l1e), &l1e_p2mt);
3148 if ( p2m_is_paged(l1e_p2mt) )
3150 p2m_mem_paging_populate(pg_owner, l1e_get_pfn(l1e));
3152 rc = -ENOENT;
3153 break;
3155 else if ( p2m_ram_paging_in_start == l1e_p2mt )
3157 rc = -ENOENT;
3158 break;
3160 /* XXX: Ugly: pull all the checks into a separate function.
3161 * Don't want to do it now, not to interfere with mem_paging
3162 * patches */
3163 else if ( p2m_ram_shared == l1e_p2mt )
3165 /* Unshare the page for RW foreign mappings */
3166 if(l1e_get_flags(l1e) & _PAGE_RW)
3168 rc = mem_sharing_unshare_page(pg_owner,
3169 l1e_get_pfn(l1e),
3170 0);
3171 if(rc) break;
3175 okay = mod_l1_entry(va, l1e, mfn,
3176 cmd == MMU_PT_UPDATE_PRESERVE_AD, v,
3177 pg_owner);
3179 break;
3180 case PGT_l2_page_table:
3182 l2_pgentry_t l2e = l2e_from_intpte(req.val);
3183 p2m_type_t l2e_p2mt;
3184 gfn_to_mfn(pg_owner, l2e_get_pfn(l2e), &l2e_p2mt);
3186 if ( p2m_is_paged(l2e_p2mt) )
3188 p2m_mem_paging_populate(pg_owner, l2e_get_pfn(l2e));
3190 rc = -ENOENT;
3191 break;
3193 else if ( p2m_ram_paging_in_start == l2e_p2mt )
3195 rc = -ENOENT;
3196 break;
3198 else if ( p2m_ram_shared == l2e_p2mt )
3200 MEM_LOG("Unexpected attempt to map shared page.\n");
3201 rc = -EINVAL;
3202 break;
3206 okay = mod_l2_entry(va, l2e, mfn,
3207 cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
3209 break;
3210 case PGT_l3_page_table:
3212 l3_pgentry_t l3e = l3e_from_intpte(req.val);
3213 p2m_type_t l3e_p2mt;
3214 gfn_to_mfn(pg_owner, l3e_get_pfn(l3e), &l3e_p2mt);
3216 if ( p2m_is_paged(l3e_p2mt) )
3218 p2m_mem_paging_populate(pg_owner, l3e_get_pfn(l3e));
3220 rc = -ENOENT;
3221 break;
3223 else if ( p2m_ram_paging_in_start == l3e_p2mt )
3225 rc = -ENOENT;
3226 break;
3228 else if ( p2m_ram_shared == l3e_p2mt )
3230 MEM_LOG("Unexpected attempt to map shared page.\n");
3231 rc = -EINVAL;
3232 break;
3235 rc = mod_l3_entry(va, l3e, mfn,
3236 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1, v);
3237 okay = !rc;
3239 break;
3240 #if CONFIG_PAGING_LEVELS >= 4
3241 case PGT_l4_page_table:
3243 l4_pgentry_t l4e = l4e_from_intpte(req.val);
3244 p2m_type_t l4e_p2mt;
3245 gfn_to_mfn(pg_owner, l4e_get_pfn(l4e), &l4e_p2mt);
3247 if ( p2m_is_paged(l4e_p2mt) )
3249 p2m_mem_paging_populate(pg_owner, l4e_get_pfn(l4e));
3251 rc = -ENOENT;
3252 break;
3254 else if ( p2m_ram_paging_in_start == l4e_p2mt )
3256 rc = -ENOENT;
3257 break;
3259 else if ( p2m_ram_shared == l4e_p2mt )
3261 MEM_LOG("Unexpected attempt to map shared page.\n");
3262 rc = -EINVAL;
3263 break;
3266 rc = mod_l4_entry(va, l4e, mfn,
3267 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1, v);
3268 okay = !rc;
3270 break;
3271 #endif
3272 case PGT_writable_page:
3273 perfc_incr(writable_mmu_updates);
3274 okay = paging_write_guest_entry(
3275 v, va, req.val, _mfn(mfn));
3276 break;
3278 page_unlock(page);
3279 if ( rc == -EINTR )
3280 rc = -EAGAIN;
3282 else if ( get_page_type(page, PGT_writable_page) )
3284 perfc_incr(writable_mmu_updates);
3285 okay = paging_write_guest_entry(
3286 v, va, req.val, _mfn(mfn));
3287 put_page_type(page);
3290 unmap_domain_page_with_cache(va, &mapcache);
3291 put_page(page);
3293 break;
3295 case MMU_MACHPHYS_UPDATE:
3297 mfn = req.ptr >> PAGE_SHIFT;
3298 gpfn = req.val;
3300 rc = xsm_mmu_machphys_update(d, mfn);
3301 if ( rc )
3302 break;
3304 if ( unlikely(!get_page_from_pagenr(mfn, pg_owner)) )
3306 MEM_LOG("Could not get page for mach->phys update");
3307 break;
3310 if ( unlikely(paging_mode_translate(pg_owner)) )
3312 MEM_LOG("Mach-phys update on auto-translate guest");
3313 break;
3316 set_gpfn_from_mfn(mfn, gpfn);
3317 okay = 1;
3319 paging_mark_dirty(pg_owner, mfn);
3321 put_page(mfn_to_page(mfn));
3322 break;
3324 default:
3325 MEM_LOG("Invalid page update command %x", cmd);
3326 rc = -ENOSYS;
3327 okay = 0;
3328 break;
3331 if ( unlikely(!okay) )
3333 rc = rc ? rc : -EINVAL;
3334 break;
3337 guest_handle_add_offset(ureqs, 1);
3340 if ( rc == -EAGAIN )
3341 rc = hypercall_create_continuation(
3342 __HYPERVISOR_mmu_update, "hihi",
3343 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
3345 put_pg_owner(pg_owner);
3347 domain_mmap_cache_destroy(&mapcache);
3349 perfc_add(num_page_updates, i);
3351 out:
3352 if ( pt_owner && (pt_owner != d) )
3353 rcu_unlock_domain(pt_owner);
3355 /* Add incremental work we have done to the @done output parameter. */
3356 if ( unlikely(!guest_handle_is_null(pdone)) )
3358 done += i;
3359 copy_to_guest(pdone, &done, 1);
3362 return rc;
3366 static int create_grant_pte_mapping(
3367 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
3369 int rc = GNTST_okay;
3370 void *va;
3371 unsigned long gmfn, mfn;
3372 struct page_info *page;
3373 l1_pgentry_t ol1e;
3374 struct domain *d = v->domain;
3376 ASSERT(domain_is_locked(d));
3378 adjust_guest_l1e(nl1e, d);
3380 gmfn = pte_addr >> PAGE_SHIFT;
3381 mfn = gmfn_to_mfn(d, gmfn);
3383 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
3385 MEM_LOG("Could not get page for normal update");
3386 return GNTST_general_error;
3389 va = map_domain_page(mfn);
3390 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
3391 page = mfn_to_page(mfn);
3393 if ( !page_lock(page) )
3395 rc = GNTST_general_error;
3396 goto failed;
3399 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3401 page_unlock(page);
3402 rc = GNTST_general_error;
3403 goto failed;
3406 ol1e = *(l1_pgentry_t *)va;
3407 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) )
3409 page_unlock(page);
3410 rc = GNTST_general_error;
3411 goto failed;
3414 page_unlock(page);
3416 if ( !paging_mode_refcounts(d) )
3417 put_page_from_l1e(ol1e, d);
3419 failed:
3420 unmap_domain_page(va);
3421 put_page(page);
3423 return rc;
3426 static int destroy_grant_pte_mapping(
3427 uint64_t addr, unsigned long frame, struct domain *d)
3429 int rc = GNTST_okay;
3430 void *va;
3431 unsigned long gmfn, mfn;
3432 struct page_info *page;
3433 l1_pgentry_t ol1e;
3435 gmfn = addr >> PAGE_SHIFT;
3436 mfn = gmfn_to_mfn(d, gmfn);
3438 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
3440 MEM_LOG("Could not get page for normal update");
3441 return GNTST_general_error;
3444 va = map_domain_page(mfn);
3445 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
3446 page = mfn_to_page(mfn);
3448 if ( !page_lock(page) )
3450 rc = GNTST_general_error;
3451 goto failed;
3454 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3456 page_unlock(page);
3457 rc = GNTST_general_error;
3458 goto failed;
3461 ol1e = *(l1_pgentry_t *)va;
3463 /* Check that the virtual address supplied is actually mapped to frame. */
3464 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
3466 page_unlock(page);
3467 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
3468 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
3469 rc = GNTST_general_error;
3470 goto failed;
3473 /* Delete pagetable entry. */
3474 if ( unlikely(!UPDATE_ENTRY
3475 (l1,
3476 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
3477 d->vcpu[0] /* Change if we go to per-vcpu shadows. */,
3478 0)) )
3480 page_unlock(page);
3481 MEM_LOG("Cannot delete PTE entry at %p", va);
3482 rc = GNTST_general_error;
3483 goto failed;
3486 page_unlock(page);
3488 failed:
3489 unmap_domain_page(va);
3490 put_page(page);
3491 return rc;
3495 static int create_grant_va_mapping(
3496 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
3498 l1_pgentry_t *pl1e, ol1e;
3499 struct domain *d = v->domain;
3500 unsigned long gl1mfn;
3501 struct page_info *l1pg;
3502 int okay;
3504 ASSERT(domain_is_locked(d));
3506 adjust_guest_l1e(nl1e, d);
3508 pl1e = guest_map_l1e(v, va, &gl1mfn);
3509 if ( !pl1e )
3511 MEM_LOG("Could not find L1 PTE for address %lx", va);
3512 return GNTST_general_error;
3515 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3517 guest_unmap_l1e(v, pl1e);
3518 return GNTST_general_error;
3521 l1pg = mfn_to_page(gl1mfn);
3522 if ( !page_lock(l1pg) )
3524 put_page(l1pg);
3525 guest_unmap_l1e(v, pl1e);
3526 return GNTST_general_error;
3529 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3531 page_unlock(l1pg);
3532 put_page(l1pg);
3533 guest_unmap_l1e(v, pl1e);
3534 return GNTST_general_error;
3537 ol1e = *pl1e;
3538 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0);
3540 page_unlock(l1pg);
3541 put_page(l1pg);
3542 guest_unmap_l1e(v, pl1e);
3544 if ( okay && !paging_mode_refcounts(d) )
3545 put_page_from_l1e(ol1e, d);
3547 return okay ? GNTST_okay : GNTST_general_error;
3550 static int replace_grant_va_mapping(
3551 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
3553 l1_pgentry_t *pl1e, ol1e;
3554 unsigned long gl1mfn;
3555 struct page_info *l1pg;
3556 int rc = 0;
3558 pl1e = guest_map_l1e(v, addr, &gl1mfn);
3559 if ( !pl1e )
3561 MEM_LOG("Could not find L1 PTE for address %lx", addr);
3562 return GNTST_general_error;
3565 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3567 rc = GNTST_general_error;
3568 goto out;
3571 l1pg = mfn_to_page(gl1mfn);
3572 if ( !page_lock(l1pg) )
3574 rc = GNTST_general_error;
3575 put_page(l1pg);
3576 goto out;
3579 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3581 rc = GNTST_general_error;
3582 goto unlock_and_out;
3585 ol1e = *pl1e;
3587 /* Check that the virtual address supplied is actually mapped to frame. */
3588 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
3590 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
3591 l1e_get_pfn(ol1e), addr, frame);
3592 rc = GNTST_general_error;
3593 goto unlock_and_out;
3596 /* Delete pagetable entry. */
3597 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0)) )
3599 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3600 rc = GNTST_general_error;
3601 goto unlock_and_out;
3604 unlock_and_out:
3605 page_unlock(l1pg);
3606 put_page(l1pg);
3607 out:
3608 guest_unmap_l1e(v, pl1e);
3609 return rc;
3612 static int destroy_grant_va_mapping(
3613 unsigned long addr, unsigned long frame, struct vcpu *v)
3615 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
3618 static int create_grant_p2m_mapping(uint64_t addr, unsigned long frame,
3619 unsigned int flags,
3620 unsigned int cache_flags)
3622 p2m_type_t p2mt;
3623 int rc;
3625 if ( cache_flags || (flags & ~GNTMAP_readonly) != GNTMAP_host_map )
3626 return GNTST_general_error;
3628 if ( flags & GNTMAP_readonly )
3629 p2mt = p2m_grant_map_ro;
3630 else
3631 p2mt = p2m_grant_map_rw;
3632 rc = guest_physmap_add_entry(current->domain, addr >> PAGE_SHIFT,
3633 frame, 0, p2mt);
3634 if ( rc )
3635 return GNTST_general_error;
3636 else
3637 return GNTST_okay;
3640 int create_grant_host_mapping(uint64_t addr, unsigned long frame,
3641 unsigned int flags, unsigned int cache_flags)
3643 l1_pgentry_t pte;
3645 if ( paging_mode_external(current->domain) )
3646 return create_grant_p2m_mapping(addr, frame, flags, cache_flags);
3648 pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
3649 if ( (flags & GNTMAP_application_map) )
3650 l1e_add_flags(pte,_PAGE_USER);
3651 if ( !(flags & GNTMAP_readonly) )
3652 l1e_add_flags(pte,_PAGE_RW);
3654 l1e_add_flags(pte,
3655 ((flags >> _GNTMAP_guest_avail0) * _PAGE_AVAIL0)
3656 & _PAGE_AVAIL);
3658 l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5));
3660 if ( flags & GNTMAP_contains_pte )
3661 return create_grant_pte_mapping(addr, pte, current);
3662 return create_grant_va_mapping(addr, pte, current);
3665 static int replace_grant_p2m_mapping(
3666 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
3668 unsigned long gfn = (unsigned long)(addr >> PAGE_SHIFT);
3669 p2m_type_t type;
3670 mfn_t old_mfn;
3672 if ( new_addr != 0 || (flags & GNTMAP_contains_pte) )
3673 return GNTST_general_error;
3675 old_mfn = gfn_to_mfn_current(gfn, &type);
3676 if ( !p2m_is_grant(type) || mfn_x(old_mfn) != frame )
3678 gdprintk(XENLOG_WARNING,
3679 "replace_grant_p2m_mapping: old mapping invalid (type %d, mfn %lx, frame %lx)\n",
3680 type, mfn_x(old_mfn), frame);
3681 return GNTST_general_error;
3683 guest_physmap_remove_page(current->domain, gfn, frame, 0);
3685 return GNTST_okay;
3688 int replace_grant_host_mapping(
3689 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
3691 struct vcpu *curr = current;
3692 l1_pgentry_t *pl1e, ol1e;
3693 unsigned long gl1mfn;
3694 struct page_info *l1pg;
3695 int rc;
3697 if ( paging_mode_external(current->domain) )
3698 return replace_grant_p2m_mapping(addr, frame, new_addr, flags);
3700 if ( flags & GNTMAP_contains_pte )
3702 if ( !new_addr )
3703 return destroy_grant_pte_mapping(addr, frame, curr->domain);
3705 MEM_LOG("Unsupported grant table operation");
3706 return GNTST_general_error;
3709 if ( !new_addr )
3710 return destroy_grant_va_mapping(addr, frame, curr);
3712 pl1e = guest_map_l1e(curr, new_addr, &gl1mfn);
3713 if ( !pl1e )
3715 MEM_LOG("Could not find L1 PTE for address %lx",
3716 (unsigned long)new_addr);
3717 return GNTST_general_error;
3720 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3722 guest_unmap_l1e(curr, pl1e);
3723 return GNTST_general_error;
3726 l1pg = mfn_to_page(gl1mfn);
3727 if ( !page_lock(l1pg) )
3729 put_page(l1pg);
3730 guest_unmap_l1e(curr, pl1e);
3731 return GNTST_general_error;
3734 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3736 page_unlock(l1pg);
3737 put_page(l1pg);
3738 guest_unmap_l1e(curr, pl1e);
3739 return GNTST_general_error;
3742 ol1e = *pl1e;
3744 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(),
3745 gl1mfn, curr, 0)) )
3747 page_unlock(l1pg);
3748 put_page(l1pg);
3749 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3750 guest_unmap_l1e(curr, pl1e);
3751 return GNTST_general_error;
3754 page_unlock(l1pg);
3755 put_page(l1pg);
3756 guest_unmap_l1e(curr, pl1e);
3758 rc = replace_grant_va_mapping(addr, frame, ol1e, curr);
3759 if ( rc && !paging_mode_refcounts(curr->domain) )
3760 put_page_from_l1e(ol1e, curr->domain);
3762 return rc;
3765 int donate_page(
3766 struct domain *d, struct page_info *page, unsigned int memflags)
3768 spin_lock(&d->page_alloc_lock);
3770 if ( is_xen_heap_page(page) || (page_get_owner(page) != NULL) )
3771 goto fail;
3773 if ( d->is_dying )
3774 goto fail;
3776 if ( page->count_info & ~(PGC_allocated | 1) )
3777 goto fail;
3779 if ( !(memflags & MEMF_no_refcount) )
3781 if ( d->tot_pages >= d->max_pages )
3782 goto fail;
3783 d->tot_pages++;
3786 page->count_info = PGC_allocated | 1;
3787 page_set_owner(page, d);
3788 page_list_add_tail(page,&d->page_list);
3790 spin_unlock(&d->page_alloc_lock);
3791 return 0;
3793 fail:
3794 spin_unlock(&d->page_alloc_lock);
3795 MEM_LOG("Bad donate %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info,
3796 (void *)page_to_mfn(page), d, d->domain_id,
3797 page_get_owner(page), page->count_info, page->u.inuse.type_info);
3798 return -1;
3801 int steal_page(
3802 struct domain *d, struct page_info *page, unsigned int memflags)
3804 unsigned long x, y;
3806 spin_lock(&d->page_alloc_lock);
3808 if ( is_xen_heap_page(page) || (page_get_owner(page) != d) )
3809 goto fail;
3811 /*
3812 * We require there is just one reference (PGC_allocated). We temporarily
3813 * drop this reference now so that we can safely swizzle the owner.
3814 */
3815 y = page->count_info;
3816 do {
3817 x = y;
3818 if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
3819 goto fail;
3820 y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask);
3821 } while ( y != x );
3823 /* Swizzle the owner then reinstate the PGC_allocated reference. */
3824 page_set_owner(page, NULL);
3825 y = page->count_info;
3826 do {
3827 x = y;
3828 BUG_ON((x & (PGC_count_mask|PGC_allocated)) != PGC_allocated);
3829 } while ( (y = cmpxchg(&page->count_info, x, x | 1)) != x );
3831 /* Unlink from original owner. */
3832 if ( !(memflags & MEMF_no_refcount) )
3833 d->tot_pages--;
3834 page_list_del(page, &d->page_list);
3836 spin_unlock(&d->page_alloc_lock);
3837 return 0;
3839 fail:
3840 spin_unlock(&d->page_alloc_lock);
3841 MEM_LOG("Bad page %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info,
3842 (void *)page_to_mfn(page), d, d->domain_id,
3843 page_get_owner(page), page->count_info, page->u.inuse.type_info);
3844 return -1;
3847 int page_make_sharable(struct domain *d,
3848 struct page_info *page,
3849 int expected_refcnt)
3851 unsigned long x, nx, y;
3853 /* Acquire ref first, so that the page doesn't dissapear from us */
3854 if(!get_page(page, d))
3855 return -EINVAL;
3857 spin_lock(&d->page_alloc_lock);
3859 /* Change page type and count atomically */
3860 y = page->u.inuse.type_info;
3861 nx = PGT_shared_page | PGT_validated | 1;
3862 do {
3863 x = y;
3864 /* We can only change the type if count is zero, and
3865 type is PGT_none */
3866 if((x & (PGT_type_mask | PGT_count_mask)) != PGT_none)
3868 put_page(page);
3869 spin_unlock(&d->page_alloc_lock);
3870 return -EEXIST;
3872 y = cmpxchg(&page->u.inuse.type_info, x, nx);
3873 } while(x != y);
3875 /* Check if the ref count is 2. The first from PGT_allocated, and the second
3876 * from get_page at the top of this function */
3877 if(page->count_info != (PGC_allocated | (2 + expected_refcnt)))
3879 /* Return type count back to zero */
3880 put_page_and_type(page);
3881 spin_unlock(&d->page_alloc_lock);
3882 return -E2BIG;
3885 page_set_owner(page, dom_cow);
3886 d->tot_pages--;
3887 page_list_del(page, &d->page_list);
3888 spin_unlock(&d->page_alloc_lock);
3890 /* NOTE: We are not putting the page back. In effect this function acquires
3891 * one ref and type ref for the caller */
3893 return 0;
3896 int page_make_private(struct domain *d, struct page_info *page)
3898 unsigned long x, y;
3900 if(!get_page(page, dom_cow))
3901 return -EINVAL;
3903 spin_lock(&d->page_alloc_lock);
3905 /* Change page type and count atomically */
3906 y = page->u.inuse.type_info;
3907 do {
3908 x = y;
3909 /* We can only change the type if count is one */
3910 if((x & (PGT_type_mask | PGT_count_mask)) !=
3911 (PGT_shared_page | 1))
3913 put_page(page);
3914 spin_unlock(&d->page_alloc_lock);
3915 return -EEXIST;
3917 y = cmpxchg(&page->u.inuse.type_info, x, PGT_none);
3918 } while(x != y);
3920 /* We dropped type ref above, drop one ref count too */
3921 put_page(page);
3923 /* Change the owner */
3924 ASSERT(page_get_owner(page) == dom_cow);
3925 page_set_owner(page, d);
3927 d->tot_pages++;
3928 page_list_add_tail(page, &d->page_list);
3929 spin_unlock(&d->page_alloc_lock);
3931 put_page(page);
3933 return 0;
3936 static int __do_update_va_mapping(
3937 unsigned long va, u64 val64, unsigned long flags, struct domain *pg_owner)
3939 l1_pgentry_t val = l1e_from_intpte(val64);
3940 struct vcpu *v = current;
3941 struct domain *d = v->domain;
3942 struct page_info *gl1pg;
3943 l1_pgentry_t *pl1e;
3944 unsigned long bmap_ptr, gl1mfn;
3945 cpumask_t pmask;
3946 int rc;
3948 perfc_incr(calls_to_update_va);
3950 rc = xsm_update_va_mapping(d, pg_owner, val);
3951 if ( rc )
3952 return rc;
3954 rc = -EINVAL;
3955 pl1e = guest_map_l1e(v, va, &gl1mfn);
3956 if ( unlikely(!pl1e || !get_page_from_pagenr(gl1mfn, d)) )
3957 goto out;
3959 gl1pg = mfn_to_page(gl1mfn);
3960 if ( !page_lock(gl1pg) )
3962 put_page(gl1pg);
3963 goto out;
3966 if ( (gl1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3968 page_unlock(gl1pg);
3969 put_page(gl1pg);
3970 goto out;
3973 rc = mod_l1_entry(pl1e, val, gl1mfn, 0, v, pg_owner) ? 0 : -EINVAL;
3975 page_unlock(gl1pg);
3976 put_page(gl1pg);
3978 out:
3979 if ( pl1e )
3980 guest_unmap_l1e(v, pl1e);
3982 switch ( flags & UVMF_FLUSHTYPE_MASK )
3984 case UVMF_TLB_FLUSH:
3985 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3987 case UVMF_LOCAL:
3988 flush_tlb_local();
3989 break;
3990 case UVMF_ALL:
3991 flush_tlb_mask(&d->domain_dirty_cpumask);
3992 break;
3993 default:
3994 rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
3995 void),
3996 &pmask);
3997 flush_tlb_mask(&pmask);
3998 break;
4000 break;
4002 case UVMF_INVLPG:
4003 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
4005 case UVMF_LOCAL:
4006 if ( !paging_mode_enabled(d) ||
4007 (paging_invlpg(v, va) != 0) )
4008 flush_tlb_one_local(va);
4009 break;
4010 case UVMF_ALL:
4011 flush_tlb_one_mask(&d->domain_dirty_cpumask, va);
4012 break;
4013 default:
4014 rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
4015 void),
4016 &pmask);
4017 flush_tlb_one_mask(&pmask, va);
4018 break;
4020 break;
4023 return rc;
4026 int do_update_va_mapping(unsigned long va, u64 val64,
4027 unsigned long flags)
4029 return __do_update_va_mapping(va, val64, flags, current->domain);
4032 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
4033 unsigned long flags,
4034 domid_t domid)
4036 struct domain *pg_owner;
4037 int rc;
4039 if ( (pg_owner = get_pg_owner(domid)) == NULL )
4040 return -ESRCH;
4042 rc = __do_update_va_mapping(va, val64, flags, pg_owner);
4044 put_pg_owner(pg_owner);
4046 return rc;
4051 /*************************
4052 * Descriptor Tables
4053 */
4055 void destroy_gdt(struct vcpu *v)
4057 int i;
4058 unsigned long pfn;
4060 v->arch.guest_context.gdt_ents = 0;
4061 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
4063 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
4064 put_page_and_type(mfn_to_page(pfn));
4065 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
4066 v->arch.guest_context.gdt_frames[i] = 0;
4071 long set_gdt(struct vcpu *v,
4072 unsigned long *frames,
4073 unsigned int entries)
4075 struct domain *d = v->domain;
4076 /* NB. There are 512 8-byte entries per GDT page. */
4077 int i, nr_pages = (entries + 511) / 512;
4078 unsigned long mfn;
4080 if ( entries > FIRST_RESERVED_GDT_ENTRY )
4081 return -EINVAL;
4083 /* Check the pages in the new GDT. */
4084 for ( i = 0; i < nr_pages; i++ )
4086 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
4087 if ( !mfn_valid(mfn) ||
4088 !get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page) )
4089 goto fail;
4092 /* Tear down the old GDT. */
4093 destroy_gdt(v);
4095 /* Install the new GDT. */
4096 v->arch.guest_context.gdt_ents = entries;
4097 for ( i = 0; i < nr_pages; i++ )
4099 v->arch.guest_context.gdt_frames[i] = frames[i];
4100 l1e_write(&v->arch.perdomain_ptes[i],
4101 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
4104 return 0;
4106 fail:
4107 while ( i-- > 0 )
4108 put_page_and_type(mfn_to_page(frames[i]));
4109 return -EINVAL;
4113 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
4115 int nr_pages = (entries + 511) / 512;
4116 unsigned long frames[16];
4117 struct vcpu *curr = current;
4118 long ret;
4120 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
4121 if ( entries > FIRST_RESERVED_GDT_ENTRY )
4122 return -EINVAL;
4124 if ( copy_from_guest(frames, frame_list, nr_pages) )
4125 return -EFAULT;
4127 domain_lock(curr->domain);
4129 if ( (ret = set_gdt(curr, frames, entries)) == 0 )
4130 flush_tlb_local();
4132 domain_unlock(curr->domain);
4134 return ret;
4138 long do_update_descriptor(u64 pa, u64 desc)
4140 struct domain *dom = current->domain;
4141 unsigned long gmfn = pa >> PAGE_SHIFT;
4142 unsigned long mfn;
4143 unsigned int offset;
4144 struct desc_struct *gdt_pent, d;
4145 struct page_info *page;
4146 long ret = -EINVAL;
4148 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
4150 *(u64 *)&d = desc;
4152 mfn = gmfn_to_mfn(dom, gmfn);
4153 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
4154 !mfn_valid(mfn) ||
4155 !check_descriptor(dom, &d) )
4156 return -EINVAL;
4158 page = mfn_to_page(mfn);
4159 if ( unlikely(!get_page(page, dom)) )
4160 return -EINVAL;
4162 /* Check if the given frame is in use in an unsafe context. */
4163 switch ( page->u.inuse.type_info & PGT_type_mask )
4165 case PGT_seg_desc_page:
4166 if ( unlikely(!get_page_type(page, PGT_seg_desc_page)) )
4167 goto out;
4168 break;
4169 default:
4170 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
4171 goto out;
4172 break;
4175 paging_mark_dirty(dom, mfn);
4177 /* All is good so make the update. */
4178 gdt_pent = map_domain_page(mfn);
4179 atomic_write64((uint64_t *)&gdt_pent[offset], *(uint64_t *)&d);
4180 unmap_domain_page(gdt_pent);
4182 put_page_type(page);
4184 ret = 0; /* success */
4186 out:
4187 put_page(page);
4189 return ret;
4192 typedef struct e820entry e820entry_t;
4193 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
4195 struct memory_map_context
4197 unsigned int n;
4198 unsigned long s;
4199 struct xen_memory_map map;
4200 };
4202 static int handle_iomem_range(unsigned long s, unsigned long e, void *p)
4204 struct memory_map_context *ctxt = p;
4206 if ( s > ctxt->s )
4208 e820entry_t ent;
4209 XEN_GUEST_HANDLE(e820entry_t) buffer;
4211 if ( ctxt->n + 1 >= ctxt->map.nr_entries )
4212 return -EINVAL;
4213 ent.addr = (uint64_t)ctxt->s << PAGE_SHIFT;
4214 ent.size = (uint64_t)(s - ctxt->s) << PAGE_SHIFT;
4215 ent.type = E820_RESERVED;
4216 buffer = guest_handle_cast(ctxt->map.buffer, e820entry_t);
4217 if ( __copy_to_guest_offset(buffer, ctxt->n, &ent, 1) < 0 )
4218 return -EFAULT;
4219 ctxt->n++;
4221 ctxt->s = e + 1;
4223 return 0;
4226 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
4228 struct page_info *page = NULL;
4229 int rc;
4231 switch ( op )
4233 case XENMEM_add_to_physmap:
4235 struct xen_add_to_physmap xatp;
4236 unsigned long prev_mfn, mfn = 0, gpfn;
4237 struct domain *d;
4239 if ( copy_from_guest(&xatp, arg, 1) )
4240 return -EFAULT;
4242 rc = rcu_lock_target_domain_by_id(xatp.domid, &d);
4243 if ( rc != 0 )
4244 return rc;
4246 if ( xsm_add_to_physmap(current->domain, d) )
4248 rcu_unlock_domain(d);
4249 return -EPERM;
4252 switch ( xatp.space )
4254 case XENMAPSPACE_shared_info:
4255 if ( xatp.idx == 0 )
4256 mfn = virt_to_mfn(d->shared_info);
4257 break;
4258 case XENMAPSPACE_grant_table:
4259 spin_lock(&d->grant_table->lock);
4261 if ( d->grant_table->gt_version == 0 )
4262 d->grant_table->gt_version = 1;
4264 if ( d->grant_table->gt_version == 2 &&
4265 (xatp.idx & XENMAPIDX_grant_table_status) )
4267 xatp.idx &= ~XENMAPIDX_grant_table_status;
4268 if ( xatp.idx < nr_status_frames(d->grant_table) )
4269 mfn = virt_to_mfn(d->grant_table->status[xatp.idx]);
4271 else
4273 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
4274 (xatp.idx < max_nr_grant_frames) )
4275 gnttab_grow_table(d, xatp.idx + 1);
4277 if ( xatp.idx < nr_grant_frames(d->grant_table) )
4278 mfn = virt_to_mfn(d->grant_table->shared_raw[xatp.idx]);
4281 spin_unlock(&d->grant_table->lock);
4282 break;
4283 case XENMAPSPACE_gmfn:
4285 p2m_type_t p2mt;
4287 xatp.idx = mfn_x(gfn_to_mfn_unshare(d, xatp.idx, &p2mt, 0));
4288 /* If the page is still shared, exit early */
4289 if ( p2m_is_shared(p2mt) )
4291 rcu_unlock_domain(d);
4292 return -ENOMEM;
4294 if ( !get_page_from_pagenr(xatp.idx, d) )
4295 break;
4296 mfn = xatp.idx;
4297 page = mfn_to_page(mfn);
4298 break;
4300 default:
4301 break;
4304 if ( !paging_mode_translate(d) || (mfn == 0) )
4306 if ( page )
4307 put_page(page);
4308 rcu_unlock_domain(d);
4309 return -EINVAL;
4312 domain_lock(d);
4314 if ( page )
4315 put_page(page);
4317 /* Remove previously mapped page if it was present. */
4318 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
4319 if ( mfn_valid(prev_mfn) )
4321 if ( is_xen_heap_mfn(prev_mfn) )
4322 /* Xen heap frames are simply unhooked from this phys slot. */
4323 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
4324 else
4325 /* Normal domain memory is freed, to avoid leaking memory. */
4326 guest_remove_page(d, xatp.gpfn);
4329 /* Unmap from old location, if any. */
4330 gpfn = get_gpfn_from_mfn(mfn);
4331 ASSERT( gpfn != SHARED_M2P_ENTRY );
4332 if ( gpfn != INVALID_M2P_ENTRY )
4333 guest_physmap_remove_page(d, gpfn, mfn, 0);
4335 /* Map at new location. */
4336 rc = guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
4338 domain_unlock(d);
4340 rcu_unlock_domain(d);
4342 return rc;
4345 case XENMEM_set_memory_map:
4347 struct xen_foreign_memory_map fmap;
4348 struct domain *d;
4350 if ( copy_from_guest(&fmap, arg, 1) )
4351 return -EFAULT;
4353 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
4354 return -EINVAL;
4356 rc = rcu_lock_target_domain_by_id(fmap.domid, &d);
4357 if ( rc != 0 )
4358 return rc;
4360 rc = xsm_domain_memory_map(d);
4361 if ( rc )
4363 rcu_unlock_domain(d);
4364 return rc;
4367 rc = copy_from_guest(d->arch.e820, fmap.map.buffer,
4368 fmap.map.nr_entries) ? -EFAULT : 0;
4369 d->arch.nr_e820 = fmap.map.nr_entries;
4371 rcu_unlock_domain(d);
4372 return rc;
4375 case XENMEM_memory_map:
4377 struct xen_memory_map map;
4378 struct domain *d = current->domain;
4380 /* Backwards compatibility. */
4381 if ( d->arch.nr_e820 == 0 )
4382 return -ENOSYS;
4384 if ( copy_from_guest(&map, arg, 1) )
4385 return -EFAULT;
4387 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
4388 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
4389 copy_to_guest(arg, &map, 1) )
4390 return -EFAULT;
4392 return 0;
4395 case XENMEM_machine_memory_map:
4397 struct memory_map_context ctxt;
4398 XEN_GUEST_HANDLE(e820entry_t) buffer;
4399 unsigned int i;
4401 if ( !IS_PRIV(current->domain) )
4402 return -EINVAL;
4404 rc = xsm_machine_memory_map();
4405 if ( rc )
4406 return rc;
4408 if ( copy_from_guest(&ctxt.map, arg, 1) )
4409 return -EFAULT;
4410 if ( ctxt.map.nr_entries < e820.nr_map + 1 )
4411 return -EINVAL;
4413 buffer = guest_handle_cast(ctxt.map.buffer, e820entry_t);
4414 if ( !guest_handle_okay(buffer, ctxt.map.nr_entries) )
4415 return -EFAULT;
4417 for ( i = 0, ctxt.n = 0, ctxt.s = 0; i < e820.nr_map; ++i, ++ctxt.n )
4419 unsigned long s = PFN_DOWN(e820.map[i].addr);
4421 if ( s )
4423 rc = rangeset_report_ranges(current->domain->iomem_caps,
4424 ctxt.s, s - 1,
4425 handle_iomem_range, &ctxt);
4426 if ( !rc )
4427 rc = handle_iomem_range(s, s, &ctxt);
4428 if ( rc )
4429 return rc;
4431 if ( ctxt.map.nr_entries <= ctxt.n + (e820.nr_map - i) )
4432 return -EINVAL;
4433 if ( __copy_to_guest_offset(buffer, ctxt.n, e820.map + i, 1) < 0 )
4434 return -EFAULT;
4435 ctxt.s = PFN_UP(e820.map[i].addr + e820.map[i].size);
4438 if ( ctxt.s )
4440 rc = rangeset_report_ranges(current->domain->iomem_caps, ctxt.s,
4441 ~0UL, handle_iomem_range, &ctxt);
4442 if ( !rc && ctxt.s )
4443 rc = handle_iomem_range(~0UL, ~0UL, &ctxt);
4444 if ( rc )
4445 return rc;
4448 ctxt.map.nr_entries = ctxt.n;
4450 if ( copy_to_guest(arg, &ctxt.map, 1) )
4451 return -EFAULT;
4453 return 0;
4456 case XENMEM_machphys_mapping:
4458 static const struct xen_machphys_mapping mapping = {
4459 .v_start = MACH2PHYS_VIRT_START,
4460 .v_end = MACH2PHYS_VIRT_END,
4461 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
4462 };
4464 if ( copy_to_guest(arg, &mapping, 1) )
4465 return -EFAULT;
4467 return 0;
4470 case XENMEM_set_pod_target:
4471 case XENMEM_get_pod_target:
4473 xen_pod_target_t target;
4474 struct domain *d;
4476 /* Support DOMID_SELF? */
4477 if ( !IS_PRIV(current->domain) )
4478 return -EINVAL;
4480 if ( copy_from_guest(&target, arg, 1) )
4481 return -EFAULT;
4483 rc = rcu_lock_target_domain_by_id(target.domid, &d);
4484 if ( rc != 0 )
4485 return rc;
4487 if ( op == XENMEM_set_pod_target )
4489 if ( target.target_pages > d->max_pages )
4491 rc = -EINVAL;
4492 goto pod_target_out_unlock;
4495 rc = p2m_pod_set_mem_target(d, target.target_pages);
4498 target.tot_pages = d->tot_pages;
4499 target.pod_cache_pages = d->arch.p2m->pod.count;
4500 target.pod_entries = d->arch.p2m->pod.entry_count;
4502 if ( copy_to_guest(arg, &target, 1) )
4504 rc= -EFAULT;
4505 goto pod_target_out_unlock;
4508 pod_target_out_unlock:
4509 rcu_unlock_domain(d);
4510 return rc;
4513 case XENMEM_get_sharing_freed_pages:
4514 return mem_sharing_get_nr_saved_mfns();
4516 default:
4517 return subarch_memory_op(op, arg);
4520 return 0;
4524 /*************************
4525 * Writable Pagetables
4526 */
4528 struct ptwr_emulate_ctxt {
4529 struct x86_emulate_ctxt ctxt;
4530 unsigned long cr2;
4531 l1_pgentry_t pte;
4532 };
4534 static int ptwr_emulated_read(
4535 enum x86_segment seg,
4536 unsigned long offset,
4537 void *p_data,
4538 unsigned int bytes,
4539 struct x86_emulate_ctxt *ctxt)
4541 unsigned int rc;
4542 unsigned long addr = offset;
4544 if ( (rc = copy_from_user(p_data, (void *)addr, bytes)) != 0 )
4546 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
4547 return X86EMUL_EXCEPTION;
4550 return X86EMUL_OKAY;
4553 static int ptwr_emulated_update(
4554 unsigned long addr,
4555 paddr_t old,
4556 paddr_t val,
4557 unsigned int bytes,
4558 unsigned int do_cmpxchg,
4559 struct ptwr_emulate_ctxt *ptwr_ctxt)
4561 unsigned long mfn;
4562 unsigned long unaligned_addr = addr;
4563 struct page_info *page;
4564 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
4565 struct vcpu *v = current;
4566 struct domain *d = v->domain;
4568 /* Only allow naturally-aligned stores within the original %cr2 page. */
4569 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
4571 MEM_LOG("ptwr_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)",
4572 ptwr_ctxt->cr2, addr, bytes);
4573 return X86EMUL_UNHANDLEABLE;
4576 /* Turn a sub-word access into a full-word access. */
4577 if ( bytes != sizeof(paddr_t) )
4579 paddr_t full;
4580 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
4582 /* Align address; read full word. */
4583 addr &= ~(sizeof(paddr_t)-1);
4584 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
4586 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
4587 return X86EMUL_EXCEPTION;
4589 /* Mask out bits provided by caller. */
4590 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
4591 /* Shift the caller value and OR in the missing bits. */
4592 val &= (((paddr_t)1 << (bytes*8)) - 1);
4593 val <<= (offset)*8;
4594 val |= full;
4595 /* Also fill in missing parts of the cmpxchg old value. */
4596 old &= (((paddr_t)1 << (bytes*8)) - 1);
4597 old <<= (offset)*8;
4598 old |= full;
4601 pte = ptwr_ctxt->pte;
4602 mfn = l1e_get_pfn(pte);
4603 page = mfn_to_page(mfn);
4605 /* We are looking only for read-only mappings of p.t. pages. */
4606 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
4607 ASSERT(mfn_valid(mfn));
4608 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
4609 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
4610 ASSERT(page_get_owner(page) == d);
4612 /* Check the new PTE. */
4613 nl1e = l1e_from_intpte(val);
4614 if ( unlikely(!get_page_from_l1e(nl1e, d, d)) )
4616 if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
4617 !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
4619 /*
4620 * If this is an upper-half write to a PAE PTE then we assume that
4621 * the guest has simply got the two writes the wrong way round. We
4622 * zap the PRESENT bit on the assumption that the bottom half will
4623 * be written immediately after we return to the guest.
4624 */
4625 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
4626 l1e_get_intpte(nl1e));
4627 l1e_remove_flags(nl1e, _PAGE_PRESENT);
4629 else
4631 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
4632 return X86EMUL_UNHANDLEABLE;
4636 adjust_guest_l1e(nl1e, d);
4638 /* Checked successfully: do the update (write or cmpxchg). */
4639 pl1e = map_domain_page(mfn);
4640 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
4641 if ( do_cmpxchg )
4643 int okay;
4644 intpte_t t = old;
4645 ol1e = l1e_from_intpte(old);
4647 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
4648 &t, l1e_get_intpte(nl1e), _mfn(mfn));
4649 okay = (okay && t == old);
4651 if ( !okay )
4653 unmap_domain_page(pl1e);
4654 put_page_from_l1e(nl1e, d);
4655 return X86EMUL_CMPXCHG_FAILED;
4658 else
4660 ol1e = *pl1e;
4661 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, 0) )
4662 BUG();
4665 trace_ptwr_emulation(addr, nl1e);
4667 unmap_domain_page(pl1e);
4669 /* Finally, drop the old PTE. */
4670 put_page_from_l1e(ol1e, d);
4672 return X86EMUL_OKAY;
4675 static int ptwr_emulated_write(
4676 enum x86_segment seg,
4677 unsigned long offset,
4678 void *p_data,
4679 unsigned int bytes,
4680 struct x86_emulate_ctxt *ctxt)
4682 paddr_t val = 0;
4684 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
4686 MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)",
4687 offset, bytes);
4688 return X86EMUL_UNHANDLEABLE;
4691 memcpy(&val, p_data, bytes);
4693 return ptwr_emulated_update(
4694 offset, 0, val, bytes, 0,
4695 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
4698 static int ptwr_emulated_cmpxchg(
4699 enum x86_segment seg,
4700 unsigned long offset,
4701 void *p_old,
4702 void *p_new,
4703 unsigned int bytes,
4704 struct x86_emulate_ctxt *ctxt)
4706 paddr_t old = 0, new = 0;
4708 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
4710 MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)",
4711 offset, bytes);
4712 return X86EMUL_UNHANDLEABLE;
4715 memcpy(&old, p_old, bytes);
4716 memcpy(&new, p_new, bytes);
4718 return ptwr_emulated_update(
4719 offset, old, new, bytes, 1,
4720 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
4723 static const struct x86_emulate_ops ptwr_emulate_ops = {
4724 .read = ptwr_emulated_read,
4725 .insn_fetch = ptwr_emulated_read,
4726 .write = ptwr_emulated_write,
4727 .cmpxchg = ptwr_emulated_cmpxchg,
4728 };
4730 /* Write page fault handler: check if guest is trying to modify a PTE. */
4731 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
4732 struct cpu_user_regs *regs)
4734 struct domain *d = v->domain;
4735 struct page_info *page;
4736 l1_pgentry_t pte;
4737 struct ptwr_emulate_ctxt ptwr_ctxt;
4738 int rc;
4740 /* Attempt to read the PTE that maps the VA being accessed. */
4741 guest_get_eff_l1e(v, addr, &pte);
4743 /* We are looking only for read-only mappings of p.t. pages. */
4744 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
4745 !get_page_from_pagenr(l1e_get_pfn(pte), d) )
4746 goto bail;
4748 page = l1e_get_page(pte);
4749 if ( !page_lock(page) )
4751 put_page(page);
4752 goto bail;
4755 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
4757 page_unlock(page);
4758 put_page(page);
4759 goto bail;
4762 ptwr_ctxt.ctxt.regs = regs;
4763 ptwr_ctxt.ctxt.force_writeback = 0;
4764 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
4765 is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
4766 ptwr_ctxt.cr2 = addr;
4767 ptwr_ctxt.pte = pte;
4769 rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
4771 page_unlock(page);
4772 put_page(page);
4774 if ( rc == X86EMUL_UNHANDLEABLE )
4775 goto bail;
4777 perfc_incr(ptwr_emulations);
4778 return EXCRET_fault_fixed;
4780 bail:
4781 return 0;
4784 void free_xen_pagetable(void *v)
4786 if ( early_boot )
4787 return;
4789 if ( is_xen_heap_page(virt_to_page(v)) )
4790 free_xenheap_page(v);
4791 else
4792 free_domheap_page(virt_to_page(v));
4795 /* Convert to from superpage-mapping flags for map_pages_to_xen(). */
4796 #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f))
4797 #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))
4799 /*
4800 * map_pages_to_xen() can be called with interrupts disabled:
4801 * * During early bootstrap; or
4802 * * alloc_xenheap_pages() via memguard_guard_range
4803 * In these cases it is safe to use flush_area_local():
4804 * * Because only the local CPU is online; or
4805 * * Because stale TLB entries do not matter for memguard_[un]guard_range().
4806 */
4807 #define flush_area(v,f) (!local_irq_is_enabled() ? \
4808 flush_area_local((const void *)v, f) : \
4809 flush_area_all((const void *)v, f))
4811 int map_pages_to_xen(
4812 unsigned long virt,
4813 unsigned long mfn,
4814 unsigned long nr_mfns,
4815 unsigned int flags)
4817 l2_pgentry_t *pl2e, ol2e;
4818 l1_pgentry_t *pl1e, ol1e;
4819 unsigned int i;
4821 while ( nr_mfns != 0 )
4823 #ifdef __x86_64__
4824 l3_pgentry_t *pl3e = virt_to_xen_l3e(virt);
4825 l3_pgentry_t ol3e = *pl3e;
4827 if ( cpu_has_page1gb &&
4828 !(((virt >> PAGE_SHIFT) | mfn) &
4829 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) &&
4830 nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) &&
4831 !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) )
4833 /* 1GB-page mapping. */
4834 l3e_write_atomic(pl3e, l3e_from_pfn(mfn, l1f_to_lNf(flags)));
4836 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) )
4838 unsigned int flush_flags =
4839 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4841 if ( l3e_get_flags(ol3e) & _PAGE_PSE )
4843 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4844 flush_flags |= FLUSH_TLB_GLOBAL;
4845 if ( (lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
4846 PAGE_CACHE_ATTRS )
4847 flush_flags |= FLUSH_CACHE;
4848 flush_area(virt, flush_flags);
4850 else
4852 pl2e = l3e_to_l2e(ol3e);
4853 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4855 ol2e = pl2e[i];
4856 if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4857 continue;
4858 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4860 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
4861 flush_flags |= FLUSH_TLB_GLOBAL;
4862 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
4863 PAGE_CACHE_ATTRS )
4864 flush_flags |= FLUSH_CACHE;
4866 else
4868 unsigned int j;
4870 pl1e = l2e_to_l1e(ol2e);
4871 for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ )
4873 ol1e = pl1e[j];
4874 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
4875 flush_flags |= FLUSH_TLB_GLOBAL;
4876 if ( (l1e_get_flags(ol1e) ^ flags) &
4877 PAGE_CACHE_ATTRS )
4878 flush_flags |= FLUSH_CACHE;
4882 flush_area(virt, flush_flags);
4883 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4885 ol2e = pl2e[i];
4886 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) &&
4887 !(l2e_get_flags(ol2e) & _PAGE_PSE) )
4888 free_xen_pagetable(l2e_to_l1e(ol2e));
4890 free_xen_pagetable(pl2e);
4894 virt += 1UL << L3_PAGETABLE_SHIFT;
4895 mfn += 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4896 nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4897 continue;
4900 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) &&
4901 (l3e_get_flags(ol3e) & _PAGE_PSE) )
4903 unsigned int flush_flags =
4904 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4906 /* Skip this PTE if there is no change. */
4907 if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES *
4908 L1_PAGETABLE_ENTRIES - 1)) +
4909 (l2_table_offset(virt) << PAGETABLE_ORDER) +
4910 l1_table_offset(virt) == mfn) &&
4911 ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
4912 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 )
4914 /* We can skip to end of L3 superpage if we got a match. */
4915 i = (1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4916 (mfn & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4917 if ( i > nr_mfns )
4918 i = nr_mfns;
4919 virt += i << PAGE_SHIFT;
4920 mfn += i;
4921 nr_mfns -= i;
4922 continue;
4925 pl2e = alloc_xen_pagetable();
4926 if ( pl2e == NULL )
4927 return -ENOMEM;
4929 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4930 l2e_write(pl2e + i,
4931 l2e_from_pfn(l3e_get_pfn(ol3e) +
4932 (i << PAGETABLE_ORDER),
4933 l3e_get_flags(ol3e)));
4935 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4936 flush_flags |= FLUSH_TLB_GLOBAL;
4938 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4939 __PAGE_HYPERVISOR));
4940 flush_area(virt, flush_flags);
4942 #endif
4944 pl2e = virt_to_xen_l2e(virt);
4946 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
4947 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
4948 !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
4950 /* Super-page mapping. */
4951 ol2e = *pl2e;
4952 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_lNf(flags)));
4954 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4956 unsigned int flush_flags =
4957 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4959 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4961 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
4962 flush_flags |= FLUSH_TLB_GLOBAL;
4963 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
4964 PAGE_CACHE_ATTRS )
4965 flush_flags |= FLUSH_CACHE;
4966 flush_area(virt, flush_flags);
4968 else
4970 pl1e = l2e_to_l1e(ol2e);
4971 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4973 if ( l1e_get_flags(pl1e[i]) & _PAGE_GLOBAL )
4974 flush_flags |= FLUSH_TLB_GLOBAL;
4975 if ( (l1e_get_flags(pl1e[i]) ^ flags) &
4976 PAGE_CACHE_ATTRS )
4977 flush_flags |= FLUSH_CACHE;
4979 flush_area(virt, flush_flags);
4980 free_xen_pagetable(pl1e);
4984 virt += 1UL << L2_PAGETABLE_SHIFT;
4985 mfn += 1UL << PAGETABLE_ORDER;
4986 nr_mfns -= 1UL << PAGETABLE_ORDER;
4988 else
4990 /* Normal page mapping. */
4991 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4993 pl1e = alloc_xen_pagetable();
4994 if ( pl1e == NULL )
4995 return -ENOMEM;
4996 clear_page(pl1e);
4997 l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4998 __PAGE_HYPERVISOR));
5000 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
5002 unsigned int flush_flags =
5003 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
5005 /* Skip this PTE if there is no change. */
5006 if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
5007 l1_table_offset(virt)) == mfn) &&
5008 (((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
5009 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
5011 /* We can skip to end of L2 superpage if we got a match. */
5012 i = (1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) -
5013 (mfn & ((1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
5014 if ( i > nr_mfns )
5015 i = nr_mfns;
5016 virt += i << L1_PAGETABLE_SHIFT;
5017 mfn += i;
5018 nr_mfns -= i;
5019 goto check_l3;
5022 pl1e = alloc_xen_pagetable();
5023 if ( pl1e == NULL )
5024 return -ENOMEM;
5026 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
5027 l1e_write(&pl1e[i],
5028 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
5029 lNf_to_l1f(l2e_get_flags(*pl2e))));
5031 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
5032 flush_flags |= FLUSH_TLB_GLOBAL;
5034 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
5035 __PAGE_HYPERVISOR));
5036 flush_area(virt, flush_flags);
5039 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
5040 ol1e = *pl1e;
5041 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
5042 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
5044 unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
5045 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
5046 flush_flags |= FLUSH_TLB_GLOBAL;
5047 if ( (l1e_get_flags(ol1e) ^ flags) & PAGE_CACHE_ATTRS )
5048 flush_flags |= FLUSH_CACHE;
5049 flush_area(virt, flush_flags);
5052 virt += 1UL << L1_PAGETABLE_SHIFT;
5053 mfn += 1UL;
5054 nr_mfns -= 1UL;
5056 if ( (flags == PAGE_HYPERVISOR) &&
5057 ((nr_mfns == 0) ||
5058 ((((virt >> PAGE_SHIFT) | mfn) &
5059 ((1 << PAGETABLE_ORDER) - 1)) == 0)) )
5061 unsigned long base_mfn;
5062 pl1e = l2e_to_l1e(*pl2e);
5063 base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
5064 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
5065 if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
5066 (l1e_get_flags(*pl1e) != flags) )
5067 break;
5068 if ( i == L1_PAGETABLE_ENTRIES )
5070 ol2e = *pl2e;
5071 l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
5072 l1f_to_lNf(flags)));
5073 flush_area(virt - PAGE_SIZE,
5074 FLUSH_TLB_GLOBAL |
5075 FLUSH_ORDER(PAGETABLE_ORDER));
5076 free_xen_pagetable(l2e_to_l1e(ol2e));
5081 check_l3: ;
5082 #ifdef __x86_64__
5083 if ( cpu_has_page1gb &&
5084 (flags == PAGE_HYPERVISOR) &&
5085 ((nr_mfns == 0) ||
5086 !(((virt >> PAGE_SHIFT) | mfn) &
5087 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) )
5089 unsigned long base_mfn;
5091 ol3e = *pl3e;
5092 pl2e = l3e_to_l2e(ol3e);
5093 base_mfn = l2e_get_pfn(*pl2e) & ~(L2_PAGETABLE_ENTRIES *
5094 L1_PAGETABLE_ENTRIES - 1);
5095 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
5096 if ( (l2e_get_pfn(*pl2e) !=
5097 (base_mfn + (i << PAGETABLE_ORDER))) ||
5098 (l2e_get_flags(*pl2e) != l1f_to_lNf(flags)) )
5099 break;
5100 if ( i == L2_PAGETABLE_ENTRIES )
5102 l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn,
5103 l1f_to_lNf(flags)));
5104 flush_area(virt - PAGE_SIZE,
5105 FLUSH_TLB_GLOBAL |
5106 FLUSH_ORDER(2*PAGETABLE_ORDER));
5107 free_xen_pagetable(l3e_to_l2e(ol3e));
5110 #endif
5113 return 0;
5116 void destroy_xen_mappings(unsigned long s, unsigned long e)
5118 l2_pgentry_t *pl2e;
5119 l1_pgentry_t *pl1e;
5120 unsigned int i;
5121 unsigned long v = s;
5123 ASSERT((s & ~PAGE_MASK) == 0);
5124 ASSERT((e & ~PAGE_MASK) == 0);
5126 while ( v < e )
5128 #ifdef __x86_64__
5129 l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
5131 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
5133 v += 1UL << L3_PAGETABLE_SHIFT;
5134 v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
5135 continue;
5138 if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
5140 if ( l2_table_offset(v) == 0 &&
5141 l1_table_offset(v) == 0 &&
5142 ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) )
5144 /* PAGE1GB: whole superpage is destroyed. */
5145 l3e_write_atomic(pl3e, l3e_empty());
5146 v += 1UL << L3_PAGETABLE_SHIFT;
5147 continue;
5150 /* PAGE1GB: shatter the superpage and fall through. */
5151 pl2e = alloc_xen_pagetable();
5152 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
5153 l2e_write(pl2e + i,
5154 l2e_from_pfn(l3e_get_pfn(*pl3e) +
5155 (i << PAGETABLE_ORDER),
5156 l3e_get_flags(*pl3e)));
5157 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
5158 __PAGE_HYPERVISOR));
5160 #endif
5162 pl2e = virt_to_xen_l2e(v);
5164 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
5166 v += 1UL << L2_PAGETABLE_SHIFT;
5167 v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
5168 continue;
5171 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
5173 if ( (l1_table_offset(v) == 0) &&
5174 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
5176 /* PSE: whole superpage is destroyed. */
5177 l2e_write_atomic(pl2e, l2e_empty());
5178 v += 1UL << L2_PAGETABLE_SHIFT;
5180 else
5182 /* PSE: shatter the superpage and try again. */
5183 pl1e = alloc_xen_pagetable();
5184 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
5185 l1e_write(&pl1e[i],
5186 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
5187 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
5188 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
5189 __PAGE_HYPERVISOR));
5192 else
5194 /* Ordinary 4kB mapping. */
5195 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
5196 l1e_write_atomic(pl1e, l1e_empty());
5197 v += PAGE_SIZE;
5199 /* If we are done with the L2E, check if it is now empty. */
5200 if ( (v != e) && (l1_table_offset(v) != 0) )
5201 continue;
5202 pl1e = l2e_to_l1e(*pl2e);
5203 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
5204 if ( l1e_get_intpte(pl1e[i]) != 0 )
5205 break;
5206 if ( i == L1_PAGETABLE_ENTRIES )
5208 /* Empty: zap the L2E and free the L1 page. */
5209 l2e_write_atomic(pl2e, l2e_empty());
5210 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
5211 free_xen_pagetable(pl1e);
5215 #ifdef __x86_64__
5216 /* If we are done with the L3E, check if it is now empty. */
5217 if ( (v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0) )
5218 continue;
5219 pl2e = l3e_to_l2e(*pl3e);
5220 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
5221 if ( l2e_get_intpte(pl2e[i]) != 0 )
5222 break;
5223 if ( i == L2_PAGETABLE_ENTRIES )
5225 /* Empty: zap the L3E and free the L2 page. */
5226 l3e_write_atomic(pl3e, l3e_empty());
5227 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
5228 free_xen_pagetable(pl2e);
5230 #endif
5233 flush_area(NULL, FLUSH_TLB_GLOBAL);
5236 void __set_fixmap(
5237 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
5239 BUG_ON(idx >= __end_of_fixed_addresses);
5240 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
5243 #ifdef MEMORY_GUARD
5245 void memguard_init(void)
5247 unsigned long start = max_t(unsigned long, xen_phys_start, 1UL << 20);
5248 #ifdef __i386__
5249 map_pages_to_xen(
5250 (unsigned long)__va(start),
5251 start >> PAGE_SHIFT,
5252 (xenheap_phys_end - start) >> PAGE_SHIFT,
5253 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
5254 #else
5255 map_pages_to_xen(
5256 (unsigned long)__va(start),
5257 start >> PAGE_SHIFT,
5258 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
5259 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
5260 BUG_ON(start != xen_phys_start);
5261 map_pages_to_xen(
5262 XEN_VIRT_START,
5263 start >> PAGE_SHIFT,
5264 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
5265 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
5266 #endif
5269 static void __memguard_change_range(void *p, unsigned long l, int guard)
5271 unsigned long _p = (unsigned long)p;
5272 unsigned long _l = (unsigned long)l;
5273 unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
5275 /* Ensure we are dealing with a page-aligned whole number of pages. */
5276 ASSERT((_p&~PAGE_MASK) == 0);
5277 ASSERT((_l&~PAGE_MASK) == 0);
5279 if ( guard )
5280 flags &= ~_PAGE_PRESENT;
5282 map_pages_to_xen(
5283 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
5286 void memguard_guard_range(void *p, unsigned long l)
5288 __memguard_change_range(p, l, 1);
5291 void memguard_unguard_range(void *p, unsigned long l)
5293 __memguard_change_range(p, l, 0);
5296 #endif
5298 void memguard_guard_stack(void *p)
5300 BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
5301 p = (void *)((unsigned long)p + STACK_SIZE -
5302 PRIMARY_STACK_SIZE - PAGE_SIZE);
5303 memguard_guard_range(p, PAGE_SIZE);
5306 /*
5307 * Local variables:
5308 * mode: C
5309 * c-set-style: "BSD"
5310 * c-basic-offset: 4
5311 * tab-width: 4
5312 * indent-tabs-mode: nil
5313 * End:
5314 */