xcp-1.6-updates/xen-4.1.hg

view xen/arch/x86/mm.c @ 23319:ff523faf2be1

x86/mm: fix mod_l1_entry() return value when encountering r/o MMIO page

While putting together the workaround announced in
http://lists.xen.org/archives/html/xen-devel/2012-06/msg00709.html, I
found that mod_l1_entry(), upon encountering a set bit in
mmio_ro_ranges, would return 1 instead of 0 (the removal of the write
permission is supposed to be entirely transparent to the caller, even
more so to the calling guest).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>
xen-unstable changeset: 25487:baa85434d0ec
xen-unstable date: Thu Jun 21 11:30:59 2012 +0200
author Jan Beulich <jbeulich@novell.com>
date Mon Jul 09 10:30:16 2012 +0100 (2012-07-09)
parents c7729c73fefc
children 9d30201cbcc4
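
The convention at the heart of this fix is visible further down in the listing: get_page_from_l1e() returns 1 on success, 0 on failure, and -1 for a frame covered by mmio_ro_ranges, in which case the caller strips _PAGE_RW from the new entry and carries on with the (now read-only) update — see alloc_l1_table() and mod_l1_entry() below. A minimal caller-side sketch of that convention, for orientation only (it mirrors the code in the listing and is not part of the changeset):

    switch ( get_page_from_l1e(nl1e, pt_dom, pg_dom) )
    {
    case 0:                               /* could not take a reference */
        return 0;
    case -1:                              /* r/o MMIO page: drop write access */
        l1e_remove_flags(nl1e, _PAGE_RW);
        break;                            /* then carry on with the update */
    }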
line source
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
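
As an editorial illustration of the two counts described above (a sketch, not part of the file): a frame used as a page table is held with both a general reference and a type reference, and the two are taken and dropped in pairs, which is what get_page_and_type_from_pagenr() and put_page_and_type() below implement.

    /* Sketch: pin a frame as an L1 page table, then release it again. */
    if ( !get_page(page, d) )                        /* raises tot_count */
        return 0;
    if ( !get_page_type(page, PGT_l1_page_table) )   /* raises type_count */
    {
        put_page(page);                              /* undo on failure */
        return 0;
    }
    /* ... the frame may now safely be used as an L1 table ... */
    put_page_and_type(page);                         /* drop both counts */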
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <xen/xmalloc.h>
103 #include <asm/paging.h>
104 #include <asm/shadow.h>
105 #include <asm/page.h>
106 #include <asm/flushtlb.h>
107 #include <asm/io.h>
108 #include <asm/ldt.h>
109 #include <asm/x86_emulate.h>
110 #include <asm/e820.h>
111 #include <asm/hypercall.h>
112 #include <asm/shared.h>
113 #include <public/memory.h>
114 #include <public/sched.h>
115 #include <xsm/xsm.h>
116 #include <xen/trace.h>
117 #include <asm/setup.h>
118 #include <asm/fixmap.h>
119 #include <asm/mem_sharing.h>
121 /*
122 * Mapping of first 2 or 4 megabytes of memory. This is mapped with 4kB
123 * mappings to avoid type conflicts with fixed-range MTRRs covering the
124 * lowest megabyte of physical memory. In any case the VGA hole should be
125 * mapped with type UC.
126 */
127 l1_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
128 l1_identmap[L1_PAGETABLE_ENTRIES];
130 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
132 /*
133 * PTE updates can be done with ordinary writes except:
134 * 1. Debug builds get extra checking by using CMPXCHG[8B].
135 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
136 */
137 #if !defined(NDEBUG) || defined(__i386__)
138 #define PTE_UPDATE_WITH_CMPXCHG
139 #endif
141 bool_t __read_mostly mem_hotplug = 0;
143 /* Private domain structs for DOMID_XEN and DOMID_IO. */
144 struct domain *dom_xen, *dom_io, *dom_cow;
146 /* Frame table size in pages. */
147 unsigned long max_page;
148 unsigned long total_pages;
150 unsigned long __read_mostly pdx_group_valid[BITS_TO_LONGS(
151 (FRAMETABLE_SIZE / sizeof(*frame_table) + PDX_GROUP_COUNT - 1)
152 / PDX_GROUP_COUNT)] = { [0] = 1 };
154 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
156 bool_t __read_mostly opt_allow_superpage;
157 boolean_param("allowsuperpage", opt_allow_superpage);
159 #ifdef __i386__
160 static int get_superpage(unsigned long mfn, struct domain *d);
161 #endif
162 static void put_superpage(unsigned long mfn);
164 #define l1_disallow_mask(d) \
165 ((d != dom_io) && \
166 (rangeset_is_empty((d)->iomem_caps) && \
167 rangeset_is_empty((d)->arch.ioport_caps) && \
168 !has_arch_pdevs(d) && \
169 !is_hvm_domain(d)) ? \
170 L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
172 #ifdef __x86_64__
173 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
174 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
175 L3_DISALLOW_MASK : \
176 COMPAT_L3_DISALLOW_MASK)
177 #else
178 #define l3_disallow_mask(d) L3_DISALLOW_MASK
179 #endif
181 #ifdef __x86_64__
182 static void __init init_spagetable(void)
183 {
184 unsigned long s, start = SPAGETABLE_VIRT_START;
185 unsigned long end = SPAGETABLE_VIRT_END;
186 unsigned long step, mfn;
187 unsigned int max_entries;
189 step = 1UL << PAGETABLE_ORDER;
190 max_entries = (max_pdx + ((1UL<<SUPERPAGE_ORDER)-1)) >> SUPERPAGE_ORDER;
191 end = start + (((max_entries * sizeof(*spage_table)) +
192 ((1UL<<SUPERPAGE_SHIFT)-1)) & (~((1UL<<SUPERPAGE_SHIFT)-1)));
194 for (s = start; s < end; s += step << PAGE_SHIFT)
195 {
196 mfn = alloc_boot_pages(step, step);
197 if ( !mfn )
198 panic("Not enough memory for spage table");
199 map_pages_to_xen(s, mfn, step, PAGE_HYPERVISOR);
200 }
201 memset((void *)start, 0, end - start);
202 }
203 #endif
205 static void __init init_frametable_chunk(void *start, void *end)
206 {
207 unsigned long s = (unsigned long)start;
208 unsigned long e = (unsigned long)end;
209 unsigned long step, mfn;
211 ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
212 for ( ; s < e; s += step << PAGE_SHIFT )
213 {
214 step = 1UL << (cpu_has_page1gb &&
215 !(s & ((1UL << L3_PAGETABLE_SHIFT) - 1)) ?
216 L3_PAGETABLE_SHIFT - PAGE_SHIFT :
217 L2_PAGETABLE_SHIFT - PAGE_SHIFT);
218 /*
219 * The hardcoded 4 below is arbitrary - just pick whatever you think
220 * is reasonable to waste as a trade-off for using a large page.
221 */
222 while ( step && s + (step << PAGE_SHIFT) > e + (4 << PAGE_SHIFT) )
223 step >>= PAGETABLE_ORDER;
224 do {
225 mfn = alloc_boot_pages(step, step);
226 } while ( !mfn && (step >>= PAGETABLE_ORDER) );
227 if ( !mfn )
228 panic("Not enough memory for frame table");
229 map_pages_to_xen(s, mfn, step, PAGE_HYPERVISOR);
230 }
232 memset(start, 0, end - start);
233 memset(end, -1, s - (unsigned long)end);
234 }
236 void __init init_frametable(void)
237 {
238 unsigned int sidx, eidx, nidx;
239 unsigned int max_idx = (max_pdx + PDX_GROUP_COUNT - 1) / PDX_GROUP_COUNT;
241 #ifdef __x86_64__
242 BUILD_BUG_ON(XEN_VIRT_END > FRAMETABLE_VIRT_END);
243 #endif
244 BUILD_BUG_ON(FRAMETABLE_VIRT_START & ((1UL << L2_PAGETABLE_SHIFT) - 1));
246 for ( sidx = 0; ; sidx = nidx )
247 {
248 eidx = find_next_zero_bit(pdx_group_valid, max_idx, sidx);
249 nidx = find_next_bit(pdx_group_valid, max_idx, eidx);
250 if ( nidx >= max_idx )
251 break;
252 init_frametable_chunk(pdx_to_page(sidx * PDX_GROUP_COUNT),
253 pdx_to_page(eidx * PDX_GROUP_COUNT));
254 }
255 if ( !mem_hotplug )
256 init_frametable_chunk(pdx_to_page(sidx * PDX_GROUP_COUNT),
257 pdx_to_page(max_pdx - 1) + 1);
258 else
259 {
260 init_frametable_chunk(pdx_to_page(sidx * PDX_GROUP_COUNT),
261 pdx_to_page(max_idx * PDX_GROUP_COUNT - 1) + 1);
262 memset(pdx_to_page(max_pdx), -1,
263 (unsigned long)pdx_to_page(max_idx * PDX_GROUP_COUNT) -
264 (unsigned long)pdx_to_page(max_pdx));
265 }
266 #ifdef __x86_64__
267 if (opt_allow_superpage)
268 init_spagetable();
269 #endif
270 }
272 void __init arch_init_memory(void)
273 {
274 unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn;
276 /*
277 * Initialise our DOMID_XEN domain.
278 * Any Xen-heap pages that we will allow to be mapped will have
279 * their domain field set to dom_xen.
280 */
281 dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
282 BUG_ON(dom_xen == NULL);
284 /*
285 * Initialise our DOMID_IO domain.
286 * This domain owns I/O pages that are within the range of the page_info
287 * array. Mappings occur at the priv of the caller.
288 */
289 dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
290 BUG_ON(dom_io == NULL);
292 /*
293 * Initialise our DOMID_COW domain.

294 * This domain owns sharable pages.
295 */
296 dom_cow = domain_create(DOMID_COW, DOMCRF_dummy, 0);
297 BUG_ON(dom_cow == NULL);
299 /* First 1MB of RAM is historically marked as I/O. */
300 for ( i = 0; i < 0x100; i++ )
301 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
303 /* Any areas not specified as RAM by the e820 map are considered I/O. */
304 for ( i = 0, pfn = 0; pfn < max_page; i++ )
305 {
306 while ( (i < e820.nr_map) &&
307 (e820.map[i].type != E820_RAM) &&
308 (e820.map[i].type != E820_UNUSABLE) )
309 i++;
311 if ( i >= e820.nr_map )
312 {
313 /* No more RAM regions: mark as I/O right to end of memory map. */
314 rstart_pfn = rend_pfn = max_page;
315 }
316 else
317 {
318 /* Mark as I/O just up as far as next RAM region. */
319 rstart_pfn = min_t(unsigned long, max_page,
320 PFN_UP(e820.map[i].addr));
321 rend_pfn = max_t(unsigned long, rstart_pfn,
322 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
323 }
325 /*
326 * Make sure any Xen mappings of RAM holes above 1MB are blown away.
327 * In particular this ensures that RAM holes are respected even in
328 * the statically-initialised 1-16MB mapping area.
329 */
330 iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT));
331 #if defined(CONFIG_X86_32)
332 ioend_pfn = min_t(unsigned long, rstart_pfn,
333 DIRECTMAP_MBYTES << (20 - PAGE_SHIFT));
334 #else
335 ioend_pfn = min(rstart_pfn, 16UL << (20 - PAGE_SHIFT));
336 #endif
337 if ( iostart_pfn < ioend_pfn )
338 destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn),
339 (unsigned long)mfn_to_virt(ioend_pfn));
341 /* Mark as I/O up to next RAM region. */
342 for ( ; pfn < rstart_pfn; pfn++ )
343 {
344 if ( !mfn_valid(pfn) )
345 continue;
346 share_xen_page_with_guest(
347 mfn_to_page(pfn), dom_io, XENSHARE_writable);
348 }
350 /* Skip the RAM region. */
351 pfn = rend_pfn;
352 }
354 subarch_init_memory();
356 mem_sharing_init();
357 }
359 int page_is_ram_type(unsigned long mfn, unsigned long mem_type)
360 {
361 uint64_t maddr = pfn_to_paddr(mfn);
362 int i;
364 for ( i = 0; i < e820.nr_map; i++ )
365 {
366 switch ( e820.map[i].type )
367 {
368 case E820_RAM:
369 if ( mem_type & RAM_TYPE_CONVENTIONAL )
370 break;
371 continue;
372 case E820_RESERVED:
373 if ( mem_type & RAM_TYPE_RESERVED )
374 break;
375 continue;
376 case E820_UNUSABLE:
377 if ( mem_type & RAM_TYPE_UNUSABLE )
378 break;
379 continue;
380 case E820_ACPI:
381 case E820_NVS:
382 if ( mem_type & RAM_TYPE_ACPI )
383 break;
384 continue;
385 default:
386 /* unknown */
387 continue;
388 }
390 /* Test the range. */
391 if ( (e820.map[i].addr <= maddr) &&
392 ((e820.map[i].addr + e820.map[i].size) >= (maddr + PAGE_SIZE)) )
393 return 1;
394 }
396 return 0;
397 }
399 unsigned long domain_get_maximum_gpfn(struct domain *d)
400 {
401 if ( is_hvm_domain(d) )
402 return p2m_get_hostp2m(d)->max_mapped_pfn;
403 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
404 return arch_get_max_pfn(d) - 1;
405 }
407 void share_xen_page_with_guest(
408 struct page_info *page, struct domain *d, int readonly)
409 {
410 if ( page_get_owner(page) == d )
411 return;
413 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
415 spin_lock(&d->page_alloc_lock);
417 /* The incremented type count pins as writable or read-only. */
418 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
419 page->u.inuse.type_info |= PGT_validated | 1;
421 page_set_owner(page, d);
422 wmb(); /* install valid domain ptr before updating refcnt. */
423 ASSERT((page->count_info & ~PGC_xen_heap) == 0);
425 /* Only add to the allocation list if the domain isn't dying. */
426 if ( !d->is_dying )
427 {
428 page->count_info |= PGC_allocated | 1;
429 if ( unlikely(d->xenheap_pages++ == 0) )
430 get_knownalive_domain(d);
431 page_list_add_tail(page, &d->xenpage_list);
432 }
434 spin_unlock(&d->page_alloc_lock);
435 }
437 void share_xen_page_with_privileged_guests(
438 struct page_info *page, int readonly)
439 {
440 share_xen_page_with_guest(page, dom_xen, readonly);
441 }
443 #if defined(__i386__)
445 #ifdef NDEBUG
446 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
447 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
448 #else
449 /*
450 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
451 * We cannot safely shadow the idle page table, nor shadow page tables
452 * (detected by zero reference count). As required for correctness, we
453 * always shadow PDPTs above 4GB.
454 */
455 #define l3tab_needs_shadow(mfn) \
456 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
457 (mfn_to_page(mfn)->count_info & PGC_count_mask) && \
458 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
459 ((mfn) >= 0x100000))
460 #endif
462 static l1_pgentry_t *fix_pae_highmem_pl1e;
464 /* Cache the address of PAE high-memory fixmap page tables. */
465 static int __init cache_pae_fixmap_address(void)
466 {
467 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
468 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
469 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
470 return 0;
471 }
472 __initcall(cache_pae_fixmap_address);
474 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
476 void make_cr3(struct vcpu *v, unsigned long mfn)
477 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
478 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
479 {
480 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
481 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
482 unsigned int cpu = smp_processor_id();
484 /* Fast path: does this mfn need a shadow at all? */
485 if ( !l3tab_needs_shadow(mfn) )
486 {
487 v->arch.cr3 = mfn << PAGE_SHIFT;
488 /* Cache is no longer in use or valid */
489 cache->high_mfn = 0;
490 return;
491 }
493 /* Caching logic is not interrupt safe. */
494 ASSERT(!in_irq());
496 /* Protects against pae_flush_pgd(). */
497 spin_lock(&cache->lock);
499 cache->inuse_idx ^= 1;
500 cache->high_mfn = mfn;
502 /* Map the guest L3 table and copy to the chosen low-memory cache. */
503 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
504 /* First check the previous high mapping can't be in the TLB.
505 * (i.e. have we loaded CR3 since we last did this?) */
506 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
507 flush_tlb_one_local(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
508 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
509 lowmem_l3tab = cache->table[cache->inuse_idx];
510 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
511 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
512 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
514 v->arch.cr3 = __pa(lowmem_l3tab);
516 spin_unlock(&cache->lock);
517 }
519 #else /* !defined(__i386__) */
521 void make_cr3(struct vcpu *v, unsigned long mfn)
522 {
523 v->arch.cr3 = mfn << PAGE_SHIFT;
524 }
526 #endif /* !defined(__i386__) */
528 void write_ptbase(struct vcpu *v)
529 {
530 write_cr3(v->arch.cr3);
531 }
533 /*
534 * Should be called after CR3 is updated.
535 *
536 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
537 * for HVM guests, arch.monitor_table and hvm's guest CR3.
538 *
539 * Update ref counts to shadow tables appropriately.
540 */
541 void update_cr3(struct vcpu *v)
542 {
543 unsigned long cr3_mfn=0;
545 if ( paging_mode_enabled(v->domain) )
546 {
547 paging_update_cr3(v);
548 return;
549 }
551 #if CONFIG_PAGING_LEVELS == 4
552 if ( !(v->arch.flags & TF_kernel_mode) )
553 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
554 else
555 #endif
556 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
558 make_cr3(v, cr3_mfn);
559 }
562 static void invalidate_shadow_ldt(struct vcpu *v, int flush)
563 {
564 int i;
565 unsigned long pfn;
566 struct page_info *page;
568 BUG_ON(unlikely(in_irq()));
570 spin_lock(&v->arch.shadow_ldt_lock);
572 if ( v->arch.shadow_ldt_mapcnt == 0 )
573 goto out;
575 v->arch.shadow_ldt_mapcnt = 0;
577 for ( i = 16; i < 32; i++ )
578 {
579 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
580 if ( pfn == 0 ) continue;
581 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
582 page = mfn_to_page(pfn);
583 ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page);
584 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
585 put_page_and_type(page);
586 }
588 /* Rid TLBs of stale mappings (guest mappings and shadow mappings). */
589 if ( flush )
590 flush_tlb_mask(&v->vcpu_dirty_cpumask);
592 out:
593 spin_unlock(&v->arch.shadow_ldt_lock);
594 }
597 static int alloc_segdesc_page(struct page_info *page)
598 {
599 struct desc_struct *descs;
600 int i;
602 descs = __map_domain_page(page);
604 for ( i = 0; i < 512; i++ )
605 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
606 goto fail;
608 unmap_domain_page(descs);
609 return 0;
611 fail:
612 unmap_domain_page(descs);
613 return -EINVAL;
614 }
617 /* Map shadow page at offset @off. */
618 int map_ldt_shadow_page(unsigned int off)
619 {
620 struct vcpu *v = current;
621 struct domain *d = v->domain;
622 unsigned long gmfn, mfn;
623 l1_pgentry_t l1e, nl1e;
624 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
625 int okay;
627 BUG_ON(unlikely(in_irq()));
629 guest_get_eff_kern_l1e(v, gva, &l1e);
630 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
631 return 0;
633 gmfn = l1e_get_pfn(l1e);
634 mfn = gmfn_to_mfn(d, gmfn);
635 if ( unlikely(!mfn_valid(mfn)) )
636 return 0;
638 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page);
639 if ( unlikely(!okay) )
640 return 0;
642 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
644 spin_lock(&v->arch.shadow_ldt_lock);
645 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
646 v->arch.shadow_ldt_mapcnt++;
647 spin_unlock(&v->arch.shadow_ldt_lock);
649 return 1;
650 }
653 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
654 {
655 struct page_info *page = mfn_to_page(page_nr);
657 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
658 {
659 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
660 return 0;
661 }
663 return 1;
664 }
667 static int get_page_and_type_from_pagenr(unsigned long page_nr,
668 unsigned long type,
669 struct domain *d,
670 int partial,
671 int preemptible)
672 {
673 struct page_info *page = mfn_to_page(page_nr);
674 int rc;
676 if ( likely(partial >= 0) &&
677 unlikely(!get_page_from_pagenr(page_nr, d)) )
678 return -EINVAL;
680 rc = (preemptible ?
681 get_page_type_preemptible(page, type) :
682 (get_page_type(page, type) ? 0 : -EINVAL));
684 if ( unlikely(rc) && partial >= 0 )
685 put_page(page);
687 return rc;
688 }
690 #ifdef __x86_64__
691 static void put_data_page(
692 struct page_info *page, int writeable)
693 {
694 if ( writeable )
695 put_page_and_type(page);
696 else
697 put_page(page);
698 }
699 #endif
701 /*
702 * We allow root tables to map each other (a.k.a. linear page tables). It
703 * needs some special care with reference counts and access permissions:
704 * 1. The mapping entry must be read-only, or the guest may get write access
705 * to its own PTEs.
706 * 2. We must only bump the reference counts for an *already validated*
707 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
708 * on a validation that is required to complete that validation.
709 * 3. We only need to increment the reference counts for the mapped page
710 * frame if it is mapped by a different root table. This is sufficient and
711 * also necessary to allow validation of a root table mapping itself.
712 */
713 #define define_get_linear_pagetable(level) \
714 static int \
715 get_##level##_linear_pagetable( \
716 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
717 { \
718 unsigned long x, y; \
719 struct page_info *page; \
720 unsigned long pfn; \
721 \
722 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
723 { \
724 MEM_LOG("Attempt to create linear p.t. with write perms"); \
725 return 0; \
726 } \
727 \
728 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
729 { \
730 /* Make sure the mapped frame belongs to the correct domain. */ \
731 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
732 return 0; \
733 \
734 /* \
735 * Ensure that the mapped frame is an already-validated page table. \
736 * If so, atomically increment the count (checking for overflow). \
737 */ \
738 page = mfn_to_page(pfn); \
739 y = page->u.inuse.type_info; \
740 do { \
741 x = y; \
742 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
743 unlikely((x & (PGT_type_mask|PGT_validated)) != \
744 (PGT_##level##_page_table|PGT_validated)) ) \
745 { \
746 put_page(page); \
747 return 0; \
748 } \
749 } \
750 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
751 } \
752 \
753 return 1; \
754 }
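
The macro above instantiates get_l2_linear_pagetable(), get_l3_linear_pagetable() and get_l4_linear_pagetable(). They are only consulted as a fallback when taking the entry as a reference to a next-level table fails, as in this abbreviated excerpt from get_page_from_l2e() below (shown for orientation only):

    rc = get_page_and_type_from_pagenr(mfn, PGT_l1_page_table, d, 0, 0);
    if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
        rc = 0;    /* a read-only self-map of a validated L2 is acceptable */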
757 int is_iomem_page(unsigned long mfn)
758 {
759 struct page_info *page;
761 if ( !mfn_valid(mfn) )
762 return 1;
764 /* Caller must know that it is an iomem page, or a reference is held. */
765 page = mfn_to_page(mfn);
766 ASSERT((page->count_info & PGC_count_mask) != 0);
768 return (page_get_owner(page) == dom_io);
769 }
771 static int update_xen_mappings(unsigned long mfn, unsigned long cacheattr)
772 {
773 int err = 0;
774 #ifdef __x86_64__
775 bool_t alias = mfn >= PFN_DOWN(xen_phys_start) &&
776 mfn < PFN_UP(xen_phys_start + (unsigned long)_end - XEN_VIRT_START);
777 unsigned long xen_va =
778 XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT);
780 if ( unlikely(alias) && cacheattr )
781 err = map_pages_to_xen(xen_va, mfn, 1, 0);
782 if ( !err )
783 err = map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
784 PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
785 if ( unlikely(alias) && !cacheattr && !err )
786 err = map_pages_to_xen(xen_va, mfn, 1, PAGE_HYPERVISOR);
787 #endif
788 return err;
789 }
791 int
792 get_page_from_l1e(
793 l1_pgentry_t l1e, struct domain *l1e_owner, struct domain *pg_owner)
794 {
795 unsigned long mfn = l1e_get_pfn(l1e);
796 struct page_info *page = mfn_to_page(mfn);
797 uint32_t l1f = l1e_get_flags(l1e);
798 struct vcpu *curr = current;
799 struct domain *real_pg_owner;
800 bool_t write;
802 if ( !(l1f & _PAGE_PRESENT) )
803 return 1;
805 if ( unlikely(l1f & l1_disallow_mask(l1e_owner)) )
806 {
807 MEM_LOG("Bad L1 flags %x", l1f & l1_disallow_mask(l1e_owner));
808 return 0;
809 }
811 if ( !mfn_valid(mfn) ||
812 (real_pg_owner = page_get_owner_and_reference(page)) == dom_io )
813 {
814 /* Only needed the reference to confirm dom_io ownership. */
815 if ( mfn_valid(mfn) )
816 put_page(page);
818 /* DOMID_IO reverts to caller for privilege checks. */
819 if ( pg_owner == dom_io )
820 pg_owner = curr->domain;
822 if ( !iomem_access_permitted(pg_owner, mfn, mfn) )
823 {
824 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
825 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
826 pg_owner->domain_id, mfn);
827 return 0;
828 }
830 if ( !(l1f & _PAGE_RW) || IS_PRIV(pg_owner) ||
831 !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
832 return 1;
833 dprintk(XENLOG_G_WARNING,
834 "d%d: Forcing read-only access to MFN %lx\n",
835 l1e_owner->domain_id, mfn);
836 return -1;
837 }
839 if ( unlikely(real_pg_owner != pg_owner) )
840 {
841 /*
842 * Let privileged domains transfer the right to map their target
843 * domain's pages. This is used to allow stub-domain pvfb export to
844 * dom0, until pvfb supports granted mappings. At that time this
845 * minor hack can go away.
846 */
847 if ( (real_pg_owner == NULL) || (pg_owner == l1e_owner) ||
848 !IS_PRIV_FOR(pg_owner, real_pg_owner) )
849 goto could_not_pin;
850 pg_owner = real_pg_owner;
851 }
853 /* Foreign mappings into guests in shadow external mode don't
854 * contribute to writeable mapping refcounts. (This allows the
855 * qemu-dm helper process in dom0 to map the domain's memory without
856 * messing up the count of "real" writable mappings.) */
857 write = (l1f & _PAGE_RW) &&
858 ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner));
859 if ( write && !get_page_type(page, PGT_writable_page) )
860 goto could_not_pin;
862 if ( pte_flags_to_cacheattr(l1f) !=
863 ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) )
864 {
865 unsigned long x, nx, y = page->count_info;
866 unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
868 if ( is_xen_heap_page(page) )
869 {
870 if ( write )
871 put_page_type(page);
872 put_page(page);
873 MEM_LOG("Attempt to change cache attributes of Xen heap page");
874 return 0;
875 }
877 do {
878 x = y;
879 nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
880 } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
882 if ( unlikely(update_xen_mappings(mfn, cacheattr) != 0) )
883 {
884 cacheattr = y & PGC_cacheattr_mask;
885 do {
886 x = y;
887 nx = (x & ~PGC_cacheattr_mask) | cacheattr;
888 } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
890 if ( write )
891 put_page_type(page);
892 put_page(page);
894 MEM_LOG("Error updating mappings for mfn %lx (pfn %lx,"
895 " from L1 entry %" PRIpte ") for %d",
896 mfn, get_gpfn_from_mfn(mfn),
897 l1e_get_intpte(l1e), l1e_owner->domain_id);
898 return 0;
899 }
900 }
902 return 1;
904 could_not_pin:
905 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
906 " for l1e_owner=%d, pg_owner=%d",
907 mfn, get_gpfn_from_mfn(mfn),
908 l1e_get_intpte(l1e), l1e_owner->domain_id, pg_owner->domain_id);
909 if ( real_pg_owner != NULL )
910 put_page(page);
911 return 0;
912 }
915 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
916 define_get_linear_pagetable(l2);
917 static int
918 get_page_from_l2e(
919 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
920 {
921 unsigned long mfn = l2e_get_pfn(l2e);
922 int rc;
924 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
925 return 1;
927 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
928 {
929 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
930 return -EINVAL;
931 }
933 if ( !(l2e_get_flags(l2e) & _PAGE_PSE) )
934 {
935 rc = get_page_and_type_from_pagenr(mfn, PGT_l1_page_table, d, 0, 0);
936 if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
937 rc = 0;
938 return rc;
939 }
941 if ( !opt_allow_superpage )
942 {
943 MEM_LOG("Attempt to map superpage without allowsuperpage "
944 "flag in hypervisor");
945 return -EINVAL;
946 }
948 if ( mfn & (L1_PAGETABLE_ENTRIES-1) )
949 {
950 MEM_LOG("Unaligned superpage map attempt mfn %lx", mfn);
951 return -EINVAL;
952 }
954 return get_superpage(mfn, d);
955 }
958 define_get_linear_pagetable(l3);
959 static int
960 get_page_from_l3e(
961 l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible)
962 {
963 int rc;
965 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
966 return 1;
968 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
969 {
970 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
971 return -EINVAL;
972 }
974 rc = get_page_and_type_from_pagenr(
975 l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible);
976 if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
977 rc = 0;
979 return rc;
980 }
982 #if CONFIG_PAGING_LEVELS >= 4
983 define_get_linear_pagetable(l4);
984 static int
985 get_page_from_l4e(
986 l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible)
987 {
988 int rc;
990 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
991 return 1;
993 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
994 {
995 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
996 return -EINVAL;
997 }
999 rc = get_page_and_type_from_pagenr(
1000 l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible);
1001 if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
1002 rc = 0;
1004 return rc;
1005 }
1006 #endif /* 4 level */
1008 #ifdef __x86_64__
1010 #ifdef USER_MAPPINGS_ARE_GLOBAL
1011 #define adjust_guest_l1e(pl1e, d) \
1012 do { \
1013 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
1014 likely(!is_pv_32on64_domain(d)) ) \
1015 { \
1016 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
1017 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
1018 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
1019 MEM_LOG("Global bit is set to kernel page %lx", \
1020 l1e_get_pfn((pl1e))); \
1021 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
1022 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
1023 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
1024 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
1025 } \
1026 } while ( 0 )
1027 #else
1028 #define adjust_guest_l1e(pl1e, d) \
1029 do { \
1030 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
1031 likely(!is_pv_32on64_domain(d)) ) \
1032 l1e_add_flags((pl1e), _PAGE_USER); \
1033 } while ( 0 )
1034 #endif
1036 #define adjust_guest_l2e(pl2e, d) \
1037 do { \
1038 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
1039 likely(!is_pv_32on64_domain(d)) ) \
1040 l2e_add_flags((pl2e), _PAGE_USER); \
1041 } while ( 0 )
1043 #define adjust_guest_l3e(pl3e, d) \
1044 do { \
1045 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
1046 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
1047 _PAGE_USER : \
1048 _PAGE_USER|_PAGE_RW); \
1049 } while ( 0 )
1051 #define adjust_guest_l4e(pl4e, d) \
1052 do { \
1053 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
1054 likely(!is_pv_32on64_domain(d)) ) \
1055 l4e_add_flags((pl4e), _PAGE_USER); \
1056 } while ( 0 )
1058 #else /* !defined(__x86_64__) */
1060 #define adjust_guest_l1e(_p, _d) ((void)(_d))
1061 #define adjust_guest_l2e(_p, _d) ((void)(_d))
1062 #define adjust_guest_l3e(_p, _d) ((void)(_d))
1064 #endif
1066 #ifdef __x86_64__
1067 #define unadjust_guest_l3e(pl3e, d) \
1068 do { \
1069 if ( unlikely(is_pv_32on64_domain(d)) && \
1070 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
1071 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
1072 } while ( 0 )
1073 #else
1074 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
1075 #endif
1077 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
1078 {
1079 unsigned long pfn = l1e_get_pfn(l1e);
1080 struct page_info *page;
1081 struct domain *pg_owner;
1082 struct vcpu *v;
1084 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(pfn) )
1085 return;
1087 page = mfn_to_page(pfn);
1088 pg_owner = page_get_owner(page);
1090 /*
1091 * Check if this is a mapping that was established via a grant reference.
1092 * If it was then we should not be here: we require that such mappings are
1093 * explicitly destroyed via the grant-table interface.
1095 * The upshot of this is that the guest can end up with active grants that
1096 * it cannot destroy (because it no longer has a PTE to present to the
1097 * grant-table interface). This can lead to subtle hard-to-catch bugs,
1098 * hence a special grant PTE flag can be enabled to catch the bug early.
1100 * (Note that the undestroyable active grants are not a security hole in
1101 * Xen. All active grants can safely be cleaned up when the domain dies.)
1102 */
1103 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
1104 !l1e_owner->is_shutting_down && !l1e_owner->is_dying )
1105 {
1106 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
1107 l1e_get_intpte(l1e));
1108 domain_crash(l1e_owner);
1109 }
1111 /* Remember we didn't take a type-count of foreign writable mappings
1112 * to paging-external domains */
1113 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
1114 ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) )
1115 {
1116 put_page_and_type(page);
1117 }
1118 else
1119 {
1120 /* We expect this is rare so we blow the entire shadow LDT. */
1121 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
1122 PGT_seg_desc_page)) &&
1123 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
1124 (l1e_owner == pg_owner) )
1125 {
1126 for_each_vcpu ( pg_owner, v )
1127 invalidate_shadow_ldt(v, 1);
1128 }
1129 put_page(page);
1130 }
1131 }
1134 /*
1135 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
1136 * Note also that this automatically deals correctly with linear p.t.'s.
1137 */
1138 static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
1140 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
1141 return 1;
1143 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1144 put_superpage(l2e_get_pfn(l2e));
1145 else
1146 put_page_and_type(l2e_get_page(l2e));
1148 return 0;
1151 static int __put_page_type(struct page_info *, int preemptible);
1153 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
1154 int partial, int preemptible)
1156 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
1157 return 1;
1159 #ifdef __x86_64__
1160 if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) )
1162 unsigned long mfn = l3e_get_pfn(l3e);
1163 int writeable = l3e_get_flags(l3e) & _PAGE_RW;
1165 ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
1166 do {
1167 put_data_page(mfn_to_page(mfn), writeable);
1168 } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
1170 return 0;
1172 #endif
1174 if ( unlikely(partial > 0) )
1175 return __put_page_type(l3e_get_page(l3e), preemptible);
1177 return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
1180 #if CONFIG_PAGING_LEVELS >= 4
1181 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
1182 int partial, int preemptible)
1184 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
1185 (l4e_get_pfn(l4e) != pfn) )
1187 if ( unlikely(partial > 0) )
1188 return __put_page_type(l4e_get_page(l4e), preemptible);
1189 return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
1191 return 1;
1193 #endif
1195 static int alloc_l1_table(struct page_info *page)
1197 struct domain *d = page_get_owner(page);
1198 unsigned long pfn = page_to_mfn(page);
1199 l1_pgentry_t *pl1e;
1200 unsigned int i;
1202 pl1e = map_domain_page(pfn);
1204 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1206 if ( is_guest_l1_slot(i) )
1207 switch ( get_page_from_l1e(pl1e[i], d, d) )
1209 case 0:
1210 goto fail;
1211 case -1:
1212 l1e_remove_flags(pl1e[i], _PAGE_RW);
1213 break;
1216 adjust_guest_l1e(pl1e[i], d);
1219 unmap_domain_page(pl1e);
1220 return 0;
1222 fail:
1223 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
1224 while ( i-- > 0 )
1225 if ( is_guest_l1_slot(i) )
1226 put_page_from_l1e(pl1e[i], d);
1228 unmap_domain_page(pl1e);
1229 return -EINVAL;
1232 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
1234 struct page_info *page;
1235 l3_pgentry_t l3e3;
1236 #ifdef __i386__
1237 l2_pgentry_t *pl2e, l2e;
1238 int i;
1239 #endif
1241 if ( !is_pv_32bit_domain(d) )
1242 return 1;
1244 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
1246 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
1247 l3e3 = pl3e[3];
1248 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
1250 MEM_LOG("PAE L3 3rd slot is empty");
1251 return 0;
1254 /*
1255 * The Xen-private mappings include linear mappings. The L2 thus cannot
1256 * be shared by multiple L3 tables. The test here is adequate because:
1257 * 1. Cannot appear in slots != 3 because get_page_type() checks the
1258 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
1259 * 2. Cannot appear in another page table's L3:
1260 * a. alloc_l3_table() calls this function and this check will fail
1261 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
1262 */
1263 page = l3e_get_page(l3e3);
1264 BUG_ON(page->u.inuse.type_info & PGT_pinned);
1265 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1266 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1267 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1269 MEM_LOG("PAE L3 3rd slot is shared");
1270 return 0;
1273 #ifdef __i386__
1274 /* Xen linear pagetable mappings. */
1275 pl2e = map_domain_page(l3e_get_pfn(l3e3));
1276 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
1278 l2e = l2e_empty();
1279 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
1280 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
1281 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
1283 unmap_domain_page(pl2e);
1284 #endif
1286 return 1;
1289 #ifdef __i386__
1290 /* Flush a pgdir update into low-memory caches. */
1291 static void pae_flush_pgd(
1292 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
1294 struct domain *d = page_get_owner(mfn_to_page(mfn));
1295 struct vcpu *v;
1296 intpte_t _ol3e, _nl3e, _pl3e;
1297 l3_pgentry_t *l3tab_ptr;
1298 struct pae_l3_cache *cache;
1300 if ( unlikely(shadow_mode_enabled(d)) )
1302 cpumask_t m = CPU_MASK_NONE;
1303 /* Re-shadow this l3 table on any vcpus that are using it */
1304 for_each_vcpu ( d, v )
1305 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
1307 paging_update_cr3(v);
1308 cpus_or(m, m, v->vcpu_dirty_cpumask);
1310 flush_tlb_mask(&m);
1313 /* If below 4GB then the pgdir is not shadowed in low memory. */
1314 if ( !l3tab_needs_shadow(mfn) )
1315 return;
1317 for_each_vcpu ( d, v )
1319 cache = &v->arch.pae_l3_cache;
1321 spin_lock(&cache->lock);
1323 if ( cache->high_mfn == mfn )
1325 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1326 _ol3e = l3e_get_intpte(*l3tab_ptr);
1327 _nl3e = l3e_get_intpte(nl3e);
1328 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1329 BUG_ON(_pl3e != _ol3e);
1332 spin_unlock(&cache->lock);
1335 flush_tlb_mask(&d->domain_dirty_cpumask);
1337 #else
1338 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1339 #endif
1341 static int alloc_l2_table(struct page_info *page, unsigned long type,
1342 int preemptible)
1344 struct domain *d = page_get_owner(page);
1345 unsigned long pfn = page_to_mfn(page);
1346 l2_pgentry_t *pl2e;
1347 unsigned int i;
1348 int rc = 0;
1350 pl2e = map_domain_page(pfn);
1352 for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
1354 if ( preemptible && i && hypercall_preempt_check() )
1356 page->nr_validated_ptes = i;
1357 rc = -EAGAIN;
1358 break;
1361 if ( !is_guest_l2_slot(d, type, i) ||
1362 (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
1363 continue;
1365 if ( rc < 0 )
1367 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1368 while ( i-- > 0 )
1369 if ( is_guest_l2_slot(d, type, i) )
1370 put_page_from_l2e(pl2e[i], pfn);
1371 break;
1374 adjust_guest_l2e(pl2e[i], d);
1377 if ( rc >= 0 && (type & PGT_pae_xen_l2) )
1379 /* Xen private mappings. */
1380 #if defined(__i386__)
1381 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1382 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1383 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1384 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1385 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i],
1386 l2e_from_page(perdomain_pt_page(d, i),
1387 __PAGE_HYPERVISOR));
1388 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1389 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
1390 #else
1391 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1392 &compat_idle_pg_table_l2[
1393 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1394 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
1395 #endif
1398 unmap_domain_page(pl2e);
1399 return rc > 0 ? 0 : rc;
1402 static int alloc_l3_table(struct page_info *page, int preemptible)
1404 struct domain *d = page_get_owner(page);
1405 unsigned long pfn = page_to_mfn(page);
1406 l3_pgentry_t *pl3e;
1407 unsigned int i;
1408 int rc = 0, partial = page->partial_pte;
1410 #if CONFIG_PAGING_LEVELS == 3
1411 /*
1412 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1413 * the weird 'extended cr3' format for dealing with high-order address
1414 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1415 */
1416 if ( (pfn >= 0x100000) &&
1417 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1418 d->vcpu && d->vcpu[0] && d->vcpu[0]->is_initialised )
1420 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1421 return -EINVAL;
1423 #endif
1425 pl3e = map_domain_page(pfn);
1427 /*
1428 * PAE guests allocate full pages, but aren't required to initialize
1429 * more than the first four entries; when running in compatibility
1430 * mode, however, the full page is visible to the MMU, and hence all
1431 * 512 entries must be valid/verified, which is most easily achieved
1432 * by clearing them out.
1433 */
1434 if ( is_pv_32on64_domain(d) )
1435 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1437 for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
1438 i++, partial = 0 )
1440 if ( is_pv_32bit_domain(d) && (i == 3) )
1442 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1443 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
1444 rc = -EINVAL;
1445 else
1446 rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1447 PGT_l2_page_table |
1448 PGT_pae_xen_l2,
1449 d, partial, preemptible);
1451 else if ( !is_guest_l3_slot(i) ||
1452 (rc = get_page_from_l3e(pl3e[i], pfn, d,
1453 partial, preemptible)) > 0 )
1454 continue;
1456 if ( rc == -EAGAIN )
1458 page->nr_validated_ptes = i;
1459 page->partial_pte = partial ?: 1;
1461 else if ( rc == -EINTR && i )
1463 page->nr_validated_ptes = i;
1464 page->partial_pte = 0;
1465 rc = -EAGAIN;
1467 if ( rc < 0 )
1468 break;
1470 adjust_guest_l3e(pl3e[i], d);
1473 if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
1474 rc = -EINVAL;
1475 if ( rc < 0 && rc != -EAGAIN && rc != -EINTR )
1477 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1478 while ( i-- > 0 )
1480 if ( !is_guest_l3_slot(i) )
1481 continue;
1482 unadjust_guest_l3e(pl3e[i], d);
1483 put_page_from_l3e(pl3e[i], pfn, 0, 0);
1487 unmap_domain_page(pl3e);
1488 return rc > 0 ? 0 : rc;
1491 #if CONFIG_PAGING_LEVELS >= 4
1492 static int alloc_l4_table(struct page_info *page, int preemptible)
1494 struct domain *d = page_get_owner(page);
1495 unsigned long pfn = page_to_mfn(page);
1496 l4_pgentry_t *pl4e = page_to_virt(page);
1497 unsigned int i;
1498 int rc = 0, partial = page->partial_pte;
1500 for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
1501 i++, partial = 0 )
1503 if ( !is_guest_l4_slot(d, i) ||
1504 (rc = get_page_from_l4e(pl4e[i], pfn, d,
1505 partial, preemptible)) > 0 )
1506 continue;
1508 if ( rc == -EAGAIN )
1510 page->nr_validated_ptes = i;
1511 page->partial_pte = partial ?: 1;
1513 else if ( rc == -EINTR )
1515 if ( i )
1517 page->nr_validated_ptes = i;
1518 page->partial_pte = 0;
1519 rc = -EAGAIN;
1522 else if ( rc < 0 )
1524 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1525 while ( i-- > 0 )
1526 if ( is_guest_l4_slot(d, i) )
1527 put_page_from_l4e(pl4e[i], pfn, 0, 0);
1529 if ( rc < 0 )
1530 return rc;
1532 adjust_guest_l4e(pl4e[i], d);
1535 /* Xen private mappings. */
1536 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1537 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1538 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1539 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1540 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1541 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1542 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1543 __PAGE_HYPERVISOR);
1545 return rc > 0 ? 0 : rc;
1547 #else
1548 #define alloc_l4_table(page, preemptible) (-EINVAL)
1549 #endif
1552 static void free_l1_table(struct page_info *page)
1554 struct domain *d = page_get_owner(page);
1555 unsigned long pfn = page_to_mfn(page);
1556 l1_pgentry_t *pl1e;
1557 unsigned int i;
1559 pl1e = map_domain_page(pfn);
1561 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1562 if ( is_guest_l1_slot(i) )
1563 put_page_from_l1e(pl1e[i], d);
1565 unmap_domain_page(pl1e);
1569 static int free_l2_table(struct page_info *page, int preemptible)
1571 #ifdef __x86_64__
1572 struct domain *d = page_get_owner(page);
1573 #endif
1574 unsigned long pfn = page_to_mfn(page);
1575 l2_pgentry_t *pl2e;
1576 unsigned int i = page->nr_validated_ptes - 1;
1577 int err = 0;
1579 pl2e = map_domain_page(pfn);
1581 ASSERT(page->nr_validated_ptes);
1582 do {
1583 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
1584 put_page_from_l2e(pl2e[i], pfn) == 0 &&
1585 preemptible && i && hypercall_preempt_check() )
1587 page->nr_validated_ptes = i;
1588 err = -EAGAIN;
1590 } while ( !err && i-- );
1592 unmap_domain_page(pl2e);
1594 if ( !err )
1595 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1597 return err;
1600 static int free_l3_table(struct page_info *page, int preemptible)
1602 struct domain *d = page_get_owner(page);
1603 unsigned long pfn = page_to_mfn(page);
1604 l3_pgentry_t *pl3e;
1605 int rc = 0, partial = page->partial_pte;
1606 unsigned int i = page->nr_validated_ptes - !partial;
1608 pl3e = map_domain_page(pfn);
1610 do {
1611 if ( is_guest_l3_slot(i) )
1613 rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible);
1614 if ( rc < 0 )
1615 break;
1616 partial = 0;
1617 if ( rc > 0 )
1618 continue;
1619 unadjust_guest_l3e(pl3e[i], d);
1621 } while ( i-- );
1623 unmap_domain_page(pl3e);
1625 if ( rc == -EAGAIN )
1627 page->nr_validated_ptes = i;
1628 page->partial_pte = partial ?: -1;
1630 else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
1632 page->nr_validated_ptes = i + 1;
1633 page->partial_pte = 0;
1634 rc = -EAGAIN;
1636 return rc > 0 ? 0 : rc;
1639 #if CONFIG_PAGING_LEVELS >= 4
1640 static int free_l4_table(struct page_info *page, int preemptible)
1642 struct domain *d = page_get_owner(page);
1643 unsigned long pfn = page_to_mfn(page);
1644 l4_pgentry_t *pl4e = page_to_virt(page);
1645 int rc = 0, partial = page->partial_pte;
1646 unsigned int i = page->nr_validated_ptes - !partial;
1648 do {
1649 if ( is_guest_l4_slot(d, i) )
1650 rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible);
1651 if ( rc < 0 )
1652 break;
1653 partial = 0;
1654 } while ( i-- );
1656 if ( rc == -EAGAIN )
1658 page->nr_validated_ptes = i;
1659 page->partial_pte = partial ?: -1;
1661 else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
1663 page->nr_validated_ptes = i + 1;
1664 page->partial_pte = 0;
1665 rc = -EAGAIN;
1667 return rc > 0 ? 0 : rc;
1669 #else
1670 #define free_l4_table(page, preemptible) (-EINVAL)
1671 #endif
1673 static int page_lock(struct page_info *page)
1675 unsigned long x, nx;
1677 do {
1678 while ( (x = page->u.inuse.type_info) & PGT_locked )
1679 cpu_relax();
1680 nx = x + (1 | PGT_locked);
1681 if ( !(x & PGT_validated) ||
1682 !(x & PGT_count_mask) ||
1683 !(nx & PGT_count_mask) )
1684 return 0;
1685 } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
1687 return 1;
1690 static void page_unlock(struct page_info *page)
1692 unsigned long x, nx, y = page->u.inuse.type_info;
1694 do {
1695 x = y;
1696 nx = x - (1 | PGT_locked);
1697 } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
1700 /* How to write an entry to the guest pagetables.
1701 * Returns 0 for failure (pointer not valid), 1 for success. */
1702 static inline int update_intpte(intpte_t *p,
1703 intpte_t old,
1704 intpte_t new,
1705 unsigned long mfn,
1706 struct vcpu *v,
1707 int preserve_ad)
1709 int rv = 1;
1710 #ifndef PTE_UPDATE_WITH_CMPXCHG
1711 if ( !preserve_ad )
1713 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1715 else
1716 #endif
1718 intpte_t t = old;
1719 for ( ; ; )
1721 intpte_t _new = new;
1722 if ( preserve_ad )
1723 _new |= old & (_PAGE_ACCESSED | _PAGE_DIRTY);
1725 rv = paging_cmpxchg_guest_entry(v, p, &t, _new, _mfn(mfn));
1726 if ( unlikely(rv == 0) )
1728 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1729 ": saw %" PRIpte, old, _new, t);
1730 break;
1733 if ( t == old )
1734 break;
1736 /* Allowed to change in Accessed/Dirty flags only. */
1737 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1739 old = t;
1742 return rv;
1745 /* Macro that wraps the appropriate type-changes around update_intpte().
1746 * Arguments are: type, ptr, old, new, mfn, vcpu */
1747 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad) \
1748 update_intpte(&_t ## e_get_intpte(*(_p)), \
1749 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1750 (_m), (_v), (_ad))
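
For readability, the l1 instantiation of this macro, as used throughout mod_l1_entry() below, expands to (mechanical expansion shown for illustration only):

    update_intpte(&l1e_get_intpte(*(pl1e)),
                  l1e_get_intpte(ol1e), l1e_get_intpte(nl1e),
                  (gl1mfn), (pt_vcpu), (preserve_ad))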
1752 /* Update the L1 entry at pl1e to new value nl1e. */
1753 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1754 unsigned long gl1mfn, int preserve_ad,
1755 struct vcpu *pt_vcpu, struct domain *pg_dom)
1756 {
1757 l1_pgentry_t ol1e;
1758 struct domain *pt_dom = pt_vcpu->domain;
1759 unsigned long mfn;
1760 p2m_type_t p2mt;
1761 int rc = 1;
1763 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1764 return 0;
1766 if ( unlikely(paging_mode_refcounts(pt_dom)) )
1767 {
1768 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu, preserve_ad);
1769 return rc;
1770 }
1772 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1773 {
1774 /* Translate foreign guest addresses. */
1775 mfn = mfn_x(gfn_to_mfn(p2m_get_hostp2m(pg_dom),
1776 l1e_get_pfn(nl1e), &p2mt));
1777 if ( !p2m_is_ram(p2mt) || unlikely(mfn == INVALID_MFN) )
1778 return 0;
1779 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1780 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1782 if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom)) )
1783 {
1784 MEM_LOG("Bad L1 flags %x",
1785 l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom));
1786 return 0;
1787 }
1789 /* Fast path for identical mapping, r/w and presence. */
1790 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1791 {
1792 adjust_guest_l1e(nl1e, pt_dom);
1793 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
1794 preserve_ad);
1795 return rc;
1796 }
1798 switch ( get_page_from_l1e(nl1e, pt_dom, pg_dom) )
1799 {
1800 case 0:
1801 return 0;
1802 case -1:
1803 l1e_remove_flags(nl1e, _PAGE_RW);
1804 rc = 0;
1805 break;
1806 }
1808 adjust_guest_l1e(nl1e, pt_dom);
1809 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
1810 preserve_ad)) )
1811 {
1812 ol1e = nl1e;
1813 rc = 0;
1814 }
1815 }
1816 else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
1817 preserve_ad)) )
1818 {
1819 return 0;
1820 }
1822 put_page_from_l1e(ol1e, pt_dom);
1823 return rc;
1824 }
1827 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1828 static int mod_l2_entry(l2_pgentry_t *pl2e,
1829 l2_pgentry_t nl2e,
1830 unsigned long pfn,
1831 int preserve_ad,
1832 struct vcpu *vcpu)
1834 l2_pgentry_t ol2e;
1835 struct domain *d = vcpu->domain;
1836 struct page_info *l2pg = mfn_to_page(pfn);
1837 unsigned long type = l2pg->u.inuse.type_info;
1838 int rc = 1;
1840 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1842 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1843 return 0;
1846 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1847 return 0;
1849 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1851 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1853 MEM_LOG("Bad L2 flags %x",
1854 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1855 return 0;
1858 /* Fast path for identical mapping and presence. */
1859 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT) )
1861 adjust_guest_l2e(nl2e, d);
1862 rc = UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, preserve_ad);
1863 return rc;
1866 if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) )
1867 return 0;
1869 adjust_guest_l2e(nl2e, d);
1870 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
1871 preserve_ad)) )
1873 ol2e = nl2e;
1874 rc = 0;
1877 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
1878 preserve_ad)) )
1880 return 0;
1883 put_page_from_l2e(ol2e, pfn);
1884 return rc;
1887 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1888 static int mod_l3_entry(l3_pgentry_t *pl3e,
1889 l3_pgentry_t nl3e,
1890 unsigned long pfn,
1891 int preserve_ad,
1892 int preemptible,
1893 struct vcpu *vcpu)
1895 l3_pgentry_t ol3e;
1896 struct domain *d = vcpu->domain;
1897 int rc = 0;
1899 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1901 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1902 return -EINVAL;
1905 /*
1906 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1907 * would be a pain to ensure they remain continuously valid throughout.
1908 */
1909 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1910 return -EINVAL;
1912 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1913 return -EFAULT;
1915 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1917 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1919 MEM_LOG("Bad L3 flags %x",
1920 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1921 return -EINVAL;
1924 /* Fast path for identical mapping and presence. */
1925 if ( !l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT) )
1927 adjust_guest_l3e(nl3e, d);
1928 rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, preserve_ad);
1929 return rc ? 0 : -EFAULT;
1932 rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible);
1933 if ( unlikely(rc < 0) )
1934 return rc;
1935 rc = 0;
1937 adjust_guest_l3e(nl3e, d);
1938 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
1939 preserve_ad)) )
1941 ol3e = nl3e;
1942 rc = -EFAULT;
1945 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
1946 preserve_ad)) )
1948 return -EFAULT;
1951 if ( likely(rc == 0) )
1953 if ( !create_pae_xen_mappings(d, pl3e) )
1954 BUG();
1956 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1959 put_page_from_l3e(ol3e, pfn, 0, 0);
1960 return rc;
1963 #if CONFIG_PAGING_LEVELS >= 4
1965 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1966 static int mod_l4_entry(l4_pgentry_t *pl4e,
1967 l4_pgentry_t nl4e,
1968 unsigned long pfn,
1969 int preserve_ad,
1970 int preemptible,
1971 struct vcpu *vcpu)
1973 struct domain *d = vcpu->domain;
1974 l4_pgentry_t ol4e;
1975 int rc = 0;
1977 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1979 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1980 return -EINVAL;
1983 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1984 return -EFAULT;
1986 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1988 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1990 MEM_LOG("Bad L4 flags %x",
1991 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1992 return -EINVAL;
1995 /* Fast path for identical mapping and presence. */
1996 if ( !l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT) )
1998 adjust_guest_l4e(nl4e, d);
1999 rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, preserve_ad);
2000 return rc ? 0 : -EFAULT;
2003 rc = get_page_from_l4e(nl4e, pfn, d, 0, preemptible);
2004 if ( unlikely(rc < 0) )
2005 return rc;
2006 rc = 0;
2008 adjust_guest_l4e(nl4e, d);
2009 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
2010 preserve_ad)) )
2012 ol4e = nl4e;
2013 rc = -EFAULT;
2016 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
2017 preserve_ad)) )
2019 return -EFAULT;
2022 put_page_from_l4e(ol4e, pfn, 0, 0);
2023 return rc;
2026 #endif
2028 static int cleanup_page_cacheattr(struct page_info *page)
2030 uint32_t cacheattr =
2031 (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base;
2033 if ( likely(cacheattr == 0) )
2034 return 0;
2036 page->count_info &= ~PGC_cacheattr_mask;
2038 BUG_ON(is_xen_heap_page(page));
2040 return update_xen_mappings(page_to_mfn(page), 0);
2043 void put_page(struct page_info *page)
2045 unsigned long nx, x, y = page->count_info;
2047 do {
2048 ASSERT((y & PGC_count_mask) != 0);
2049 x = y;
2050 nx = x - 1;
2052 while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) );
2054 if ( unlikely((nx & PGC_count_mask) == 0) )
2056 if ( cleanup_page_cacheattr(page) == 0 )
2057 free_domheap_page(page);
2058 else
2059 MEM_LOG("Leaking pfn %lx", page_to_mfn(page));
2064 struct domain *page_get_owner_and_reference(struct page_info *page)
2066 unsigned long x, y = page->count_info;
2068 do {
2069 x = y;
2070 /*
2071 * Count == 0: Page is not allocated, so we cannot take a reference.
2072 * Count == -1: Reference count would wrap, which is invalid.
2073 * Count == -2: Remaining unused ref is reserved for get_page_light().
2074 */
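/*
 * Worked example of the check below: with a count field of 0 the
 * expression evaluates to 2, with -1 it wraps to 1 and with -2 it wraps
 * to 0 -- all <= 2 and therefore rejected; any other count yields a
 * value >= 3 and the reference is taken.
 */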
2075 if ( unlikely(((x + 2) & PGC_count_mask) <= 2) )
2076 return NULL;
2078 while ( (y = cmpxchg(&page->count_info, x, x + 1)) != x );
2080 return page_get_owner(page);
2084 int get_page(struct page_info *page, struct domain *domain)
2086 struct domain *owner = page_get_owner_and_reference(page);
2088 if ( likely(owner == domain) )
2089 return 1;
2091 if ( owner != NULL )
2092 put_page(page);
2094 if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
2095 gdprintk(XENLOG_INFO,
2096 "Error pfn %lx: rd=%p, od=%p, caf=%08lx, taf=%"
2097 PRtype_info "\n",
2098 page_to_mfn(page), domain, owner,
2099 page->count_info, page->u.inuse.type_info);
2100 return 0;
2103 /*
2104 * Special version of get_page() to be used exclusively when
2105 * - a page is known to already have a non-zero reference count
2106 * - the page does not need its owner to be checked
2107 * - it will not be called more than once without dropping the thus
2108 * acquired reference again.
2109 * Due to get_page() reserving one reference, this call cannot fail.
2110 */
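/*
 * (In this file it is used when a page is marked PGT_partial in
 * alloc_page_type() and __put_final_page_type(); the extra reference is
 * dropped again when PGT_partial is cleared in __get_page_type() or
 * __put_page_type().)
 */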
2111 static void get_page_light(struct page_info *page)
2113 unsigned long x, nx, y = page->count_info;
2115 do {
2116 x = y;
2117 nx = x + 1;
2118 BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
2119 BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
2120 y = cmpxchg(&page->count_info, x, nx);
2122 while ( unlikely(y != x) );
2125 static int alloc_page_type(struct page_info *page, unsigned long type,
2126 int preemptible)
2128 struct domain *owner = page_get_owner(page);
2129 int rc;
2131 /* A page table is dirtied when its type count becomes non-zero. */
2132 if ( likely(owner != NULL) )
2133 paging_mark_dirty(owner, page_to_mfn(page));
2135 switch ( type & PGT_type_mask )
2137 case PGT_l1_page_table:
2138 rc = alloc_l1_table(page);
2139 break;
2140 case PGT_l2_page_table:
2141 rc = alloc_l2_table(page, type, preemptible);
2142 break;
2143 case PGT_l3_page_table:
2144 rc = alloc_l3_table(page, preemptible);
2145 break;
2146 case PGT_l4_page_table:
2147 rc = alloc_l4_table(page, preemptible);
2148 break;
2149 case PGT_seg_desc_page:
2150 rc = alloc_segdesc_page(page);
2151 break;
2152 default:
2153 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%lx\n",
2154 type, page->u.inuse.type_info,
2155 page->count_info);
2156 rc = -EINVAL;
2157 BUG();
2160 /* No need for atomic update of type_info here: no one else updates it. */
2161 wmb();
2162 if ( rc == -EAGAIN )
2164 get_page_light(page);
2165 page->u.inuse.type_info |= PGT_partial;
2167 else if ( rc == -EINTR )
2169 ASSERT((page->u.inuse.type_info &
2170 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2171 page->u.inuse.type_info &= ~PGT_count_mask;
2173 else if ( rc )
2175 ASSERT(rc < 0);
2176 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
2177 PRtype_info ": caf=%08lx taf=%" PRtype_info,
2178 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
2179 type, page->count_info, page->u.inuse.type_info);
2180 page->u.inuse.type_info = 0;
2182 else
2184 page->u.inuse.type_info |= PGT_validated;
2187 return rc;
2191 int free_page_type(struct page_info *page, unsigned long type,
2192 int preemptible)
2194 struct domain *owner = page_get_owner(page);
2195 unsigned long gmfn;
2196 int rc;
2198 if ( likely(owner != NULL) && unlikely(paging_mode_enabled(owner)) )
2200 /* A page table is dirtied when its type count becomes zero. */
2201 paging_mark_dirty(owner, page_to_mfn(page));
2203 if ( shadow_mode_refcounts(owner) )
2204 return 0;
2206 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
2207 ASSERT(VALID_M2P(gmfn));
2208 /* Page sharing not supported for shadowed domains */
2209 if ( !SHARED_M2P(gmfn) )
2210 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
2213 if ( !(type & PGT_partial) )
2215 page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
2216 page->partial_pte = 0;
2219 switch ( type & PGT_type_mask )
2221 case PGT_l1_page_table:
2222 free_l1_table(page);
2223 rc = 0;
2224 break;
2225 case PGT_l2_page_table:
2226 rc = free_l2_table(page, preemptible);
2227 break;
2228 case PGT_l3_page_table:
2229 #if CONFIG_PAGING_LEVELS == 3
2230 if ( !(type & PGT_partial) )
2231 page->nr_validated_ptes = L3_PAGETABLE_ENTRIES;
2232 #endif
2233 rc = free_l3_table(page, preemptible);
2234 break;
2235 case PGT_l4_page_table:
2236 rc = free_l4_table(page, preemptible);
2237 break;
2238 default:
2239 MEM_LOG("type %lx pfn %lx\n", type, page_to_mfn(page));
2240 rc = -EINVAL;
2241 BUG();
2244 return rc;
2248 static int __put_final_page_type(
2249 struct page_info *page, unsigned long type, int preemptible)
2251 int rc = free_page_type(page, type, preemptible);
2253 /* No need for atomic update of type_info here: no one else updates it. */
2254 if ( rc == 0 )
2256 /*
2257 * Record TLB information for flush later. We do not stamp page tables
2258 * when running in shadow mode:
2259 * 1. Pointless, since it's the shadow PTs that must be tracked.
2260 * 2. Shadow mode reuses this field for shadowed page tables to
2261 * store flags info -- we don't want to conflict with that.
2262 */
2263 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2264 (page->count_info & PGC_page_table)) )
2265 page->tlbflush_timestamp = tlbflush_current_time();
2266 wmb();
2267 page->u.inuse.type_info--;
2269 else if ( rc == -EINTR )
2271 ASSERT((page->u.inuse.type_info &
2272 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2273 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2274 (page->count_info & PGC_page_table)) )
2275 page->tlbflush_timestamp = tlbflush_current_time();
2276 wmb();
2277 page->u.inuse.type_info |= PGT_validated;
2279 else
2281 BUG_ON(rc != -EAGAIN);
2282 wmb();
2283 get_page_light(page);
2284 page->u.inuse.type_info |= PGT_partial;
2287 return rc;
2291 static int __put_page_type(struct page_info *page,
2292 int preemptible)
2294 unsigned long nx, x, y = page->u.inuse.type_info;
2295 int rc = 0;
2297 for ( ; ; )
2299 x = y;
2300 nx = x - 1;
2302 ASSERT((x & PGT_count_mask) != 0);
2304 if ( unlikely((nx & PGT_count_mask) == 0) )
2306 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
2307 likely(nx & (PGT_validated|PGT_partial)) )
2309 /*
2310 * Page-table pages must be unvalidated when count is zero. The
2311 * 'free' is safe because the refcnt is non-zero and validated
2312 * bit is clear => other ops will spin or fail.
2313 */
2314 nx = x & ~(PGT_validated|PGT_partial);
2315 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
2316 x, nx)) != x) )
2317 continue;
2318 /* We cleared the 'valid bit', so we do the cleanup. */
2319 rc = __put_final_page_type(page, x, preemptible);
2320 if ( x & PGT_partial )
2321 put_page(page);
2322 break;
2325 /*
2326 * Record TLB information for flush later. We do not stamp page
2327 * tables when running in shadow mode:
2328 * 1. Pointless, since it's the shadow PTs that must be tracked.
2329 * 2. Shadow mode reuses this field for shadowed page tables to
2330 * store flags info -- we don't want to conflict with that.
2331 */
2332 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2333 (page->count_info & PGC_page_table)) )
2334 page->tlbflush_timestamp = tlbflush_current_time();
2337 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2338 break;
2340 if ( preemptible && hypercall_preempt_check() )
2341 return -EINTR;
2344 return rc;
2348 static int __get_page_type(struct page_info *page, unsigned long type,
2349 int preemptible)
2351 unsigned long nx, x, y = page->u.inuse.type_info;
2352 int rc = 0;
2354 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
2356 for ( ; ; )
2358 x = y;
2359 nx = x + 1;
2360 if ( unlikely((nx & PGT_count_mask) == 0) )
2362 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
2363 return -EINVAL;
2365 else if ( unlikely((x & PGT_count_mask) == 0) )
2367 struct domain *d = page_get_owner(page);
2369 /* Normally we should never let a page go from type count 0
2370 * to type count 1 when it is shadowed. One exception:
2371 * out-of-sync shadowed pages are allowed to become
2372 * writeable. */
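/*
 * (The shadow_flags bit tested here appears to mark the shadow as
 * out-of-sync; only in that case may the page become writable without
 * its shadows being removed first.)
 */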
2373 if ( d && shadow_mode_enabled(d)
2374 && (page->count_info & PGC_page_table)
2375 && !((page->shadow_flags & (1u<<29))
2376 && type == PGT_writable_page) )
2377 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
2379 ASSERT(!(x & PGT_pae_xen_l2));
2380 if ( (x & PGT_type_mask) != type )
2382 /*
2383 * On type change we check to flush stale TLB entries. This
2384 * may be unnecessary (e.g., page was GDT/LDT) but those
2385 * circumstances should be very rare.
2386 */
2387 cpumask_t mask = d->domain_dirty_cpumask;
2389 /* Don't flush if the timestamp is old enough */
2390 tlbflush_filter(mask, page->tlbflush_timestamp);
2392 if ( unlikely(!cpus_empty(mask)) &&
2393 /* Shadow mode: track only writable pages. */
2394 (!shadow_mode_enabled(page_get_owner(page)) ||
2395 ((nx & PGT_type_mask) == PGT_writable_page)) )
2397 perfc_incr(need_flush_tlb_flush);
2398 flush_tlb_mask(&mask);
2401 /* We lose existing type and validity. */
2402 nx &= ~(PGT_type_mask | PGT_validated);
2403 nx |= type;
2405 /* No special validation needed for writable pages. */
2406 /* Page tables and GDT/LDT need to be scanned for validity. */
2407 if ( type == PGT_writable_page || type == PGT_shared_page )
2408 nx |= PGT_validated;
2411 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
2413 /* Don't log failure if it could be a recursive-mapping attempt. */
2414 if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
2415 (type == PGT_l1_page_table) )
2416 return -EINVAL;
2417 if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
2418 (type == PGT_l2_page_table) )
2419 return -EINVAL;
2420 if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
2421 (type == PGT_l3_page_table) )
2422 return -EINVAL;
2423 MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
2424 "for mfn %lx (pfn %lx)",
2425 x, type, page_to_mfn(page),
2426 get_gpfn_from_mfn(page_to_mfn(page)));
2427 return -EINVAL;
2429 else if ( unlikely(!(x & PGT_validated)) )
2431 if ( !(x & PGT_partial) )
2433 /* Someone else is updating validation of this page. Wait... */
2434 while ( (y = page->u.inuse.type_info) == x )
2436 if ( preemptible && hypercall_preempt_check() )
2437 return -EINTR;
2438 cpu_relax();
2440 continue;
2442 /* Type ref count was left at 1 when PGT_partial got set. */
2443 ASSERT((x & PGT_count_mask) == 1);
2444 nx = x & ~PGT_partial;
2447 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2448 break;
2450 if ( preemptible && hypercall_preempt_check() )
2451 return -EINTR;
2454 if ( unlikely((x & PGT_type_mask) != type) )
2456 /* Special pages should not be accessible from devices. */
2457 struct domain *d = page_get_owner(page);
2458 if ( d && !is_hvm_domain(d) && unlikely(need_iommu(d)) )
2460 if ( (x & PGT_type_mask) == PGT_writable_page )
2461 iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));
2462 else if ( type == PGT_writable_page )
2463 iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
2464 page_to_mfn(page),
2465 IOMMUF_readable|IOMMUF_writable);
2469 if ( unlikely(!(nx & PGT_validated)) )
2471 if ( !(x & PGT_partial) )
2473 page->nr_validated_ptes = 0;
2474 page->partial_pte = 0;
2476 rc = alloc_page_type(page, type, preemptible);
2479 if ( (x & PGT_partial) && !(nx & PGT_partial) )
2480 put_page(page);
2482 return rc;
2485 void put_page_type(struct page_info *page)
2487 int rc = __put_page_type(page, 0);
2488 ASSERT(rc == 0);
2489 (void)rc;
2492 int get_page_type(struct page_info *page, unsigned long type)
2494 int rc = __get_page_type(page, type, 0);
2495 if ( likely(rc == 0) )
2496 return 1;
2497 ASSERT(rc == -EINVAL);
2498 return 0;
2501 int put_page_type_preemptible(struct page_info *page)
2503 return __put_page_type(page, 1);
2506 int get_page_type_preemptible(struct page_info *page, unsigned long type)
2508 return __get_page_type(page, type, 1);
2511 static int get_spage_pages(struct page_info *page, struct domain *d)
2513 int i;
2515 for (i = 0; i < (1<<PAGETABLE_ORDER); i++, page++)
2517 if (!get_page_and_type(page, d, PGT_writable_page))
2519 while (--i >= 0)
2520 put_page_and_type(--page);
2521 return 0;
2524 return 1;
2527 static void put_spage_pages(struct page_info *page)
2529 int i;
2531 for (i = 0; i < (1<<PAGETABLE_ORDER); i++, page++)
2533 put_page_and_type(page);
2535 return;
2538 #ifdef __x86_64__
2540 static int mark_superpage(struct spage_info *spage, struct domain *d)
2542 unsigned long x, nx, y = spage->type_info;
2543 int pages_done = 0;
2545 ASSERT(opt_allow_superpage);
2547 do {
2548 x = y;
2549 nx = x + 1;
2550 if ( (x & SGT_type_mask) == SGT_mark )
2552 MEM_LOG("Duplicate superpage mark attempt mfn %lx",
2553 spage_to_mfn(spage));
2554 if ( pages_done )
2555 put_spage_pages(spage_to_page(spage));
2556 return -EINVAL;
2558 if ( (x & SGT_type_mask) == SGT_dynamic )
2560 if ( pages_done )
2562 put_spage_pages(spage_to_page(spage));
2563 pages_done = 0;
2566 else if ( !pages_done )
2568 if ( !get_spage_pages(spage_to_page(spage), d) )
2570 MEM_LOG("Superpage type conflict in mark attempt mfn %lx",
2571 spage_to_mfn(spage));
2572 return -EINVAL;
2574 pages_done = 1;
2576 nx = (nx & ~SGT_type_mask) | SGT_mark;
2578 } while ( (y = cmpxchg(&spage->type_info, x, nx)) != x );
2580 return 0;
2583 static int unmark_superpage(struct spage_info *spage)
2585 unsigned long x, nx, y = spage->type_info;
2586 unsigned long do_pages = 0;
2588 ASSERT(opt_allow_superpage);
2590 do {
2591 x = y;
2592 nx = x - 1;
2593 if ( (x & SGT_type_mask) != SGT_mark )
2595 MEM_LOG("Attempt to unmark unmarked superpage mfn %lx",
2596 spage_to_mfn(spage));
2597 return -EINVAL;
2599 if ( (nx & SGT_count_mask) == 0 )
2601 nx = (nx & ~SGT_type_mask) | SGT_none;
2602 do_pages = 1;
2604 else
2606 nx = (nx & ~SGT_type_mask) | SGT_dynamic;
2608 } while ( (y = cmpxchg(&spage->type_info, x, nx)) != x );
2610 if ( do_pages )
2611 put_spage_pages(spage_to_page(spage));
2613 return 0;
2616 void clear_superpage_mark(struct page_info *page)
2618 struct spage_info *spage;
2620 if ( !opt_allow_superpage )
2621 return;
2623 spage = page_to_spage(page);
2624 if ((spage->type_info & SGT_type_mask) == SGT_mark)
2625 unmark_superpage(spage);
2629 int get_superpage(unsigned long mfn, struct domain *d)
2631 struct spage_info *spage;
2632 unsigned long x, nx, y;
2633 int pages_done = 0;
2635 ASSERT(opt_allow_superpage);
2637 spage = mfn_to_spage(mfn);
2638 y = spage->type_info;
2639 do {
2640 x = y;
2641 nx = x + 1;
2642 if ( (x & SGT_type_mask) != SGT_none )
2644 if ( pages_done )
2646 put_spage_pages(spage_to_page(spage));
2647 pages_done = 0;
2650 else
2652 if ( !get_spage_pages(spage_to_page(spage), d) )
2654 MEM_LOG("Type conflict on superpage mapping mfn %lx",
2655 spage_to_mfn(spage));
2656 return -EINVAL;
2658 pages_done = 1;
2659 nx = (nx & ~SGT_type_mask) | SGT_dynamic;
2661 } while ( (y = cmpxchg(&spage->type_info, x, nx)) != x );
2663 return 0;
2666 static void put_superpage(unsigned long mfn)
2668 struct spage_info *spage;
2669 unsigned long x, nx, y;
2670 unsigned long do_pages = 0;
2672 if ( !opt_allow_superpage )
2674 put_spage_pages(mfn_to_page(mfn));
2675 return;
2678 spage = mfn_to_spage(mfn);
2679 y = spage->type_info;
2680 do {
2681 x = y;
2682 nx = x - 1;
2683 if ((x & SGT_type_mask) == SGT_dynamic)
2685 if ((nx & SGT_count_mask) == 0)
2687 nx = (nx & ~SGT_type_mask) | SGT_none;
2688 do_pages = 1;
2692 } while ((y = cmpxchg(&spage->type_info, x, nx)) != x);
2694 if (do_pages)
2695 put_spage_pages(spage_to_page(spage));
2697 return;
2700 #else /* __i386__ */
2702 void clear_superpage_mark(struct page_info *page)
2706 static int get_superpage(unsigned long mfn, struct domain *d)
2708 return get_spage_pages(mfn_to_page(mfn), d);
2711 static void put_superpage(unsigned long mfn)
2713 put_spage_pages(mfn_to_page(mfn));
2716 #endif
2719 int new_guest_cr3(unsigned long mfn)
2721 struct vcpu *curr = current;
2722 struct domain *d = curr->domain;
2723 int okay;
2724 unsigned long old_base_mfn;
2726 #ifdef __x86_64__
2727 if ( is_pv_32on64_domain(d) )
2729 okay = paging_mode_refcounts(d)
2730 ? 0 /* Old code was broken, but what should it be? */
2731 : mod_l4_entry(
2732 __va(pagetable_get_paddr(curr->arch.guest_table)),
2733 l4e_from_pfn(
2734 mfn,
2735 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
2736 pagetable_get_pfn(curr->arch.guest_table), 0, 0, curr) == 0;
2737 if ( unlikely(!okay) )
2739 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
2740 return 0;
2743 invalidate_shadow_ldt(curr, 0);
2744 write_ptbase(curr);
2746 return 1;
2748 #endif
2749 okay = paging_mode_refcounts(d)
2750 ? get_page_from_pagenr(mfn, d)
2751 : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
2752 if ( unlikely(!okay) )
2754 MEM_LOG("Error while installing new baseptr %lx", mfn);
2755 return 0;
2758 invalidate_shadow_ldt(curr, 0);
2760 old_base_mfn = pagetable_get_pfn(curr->arch.guest_table);
2762 curr->arch.guest_table = pagetable_from_pfn(mfn);
2763 update_cr3(curr);
2765 write_ptbase(curr);
2767 if ( likely(old_base_mfn != 0) )
2769 if ( paging_mode_refcounts(d) )
2770 put_page(mfn_to_page(old_base_mfn));
2771 else
2772 put_page_and_type(mfn_to_page(old_base_mfn));
2775 return 1;
2778 static struct domain *get_pg_owner(domid_t domid)
2780 struct domain *pg_owner = NULL, *curr = current->domain;
2782 if ( likely(domid == DOMID_SELF) )
2784 pg_owner = rcu_lock_current_domain();
2785 goto out;
2788 if ( unlikely(domid == curr->domain_id) )
2790 MEM_LOG("Cannot specify itself as foreign domain");
2791 goto out;
2794 if ( unlikely(paging_mode_translate(curr)) )
2796 MEM_LOG("Cannot mix foreign mappings with translated domains");
2797 goto out;
2800 switch ( domid )
2802 case DOMID_IO:
2803 pg_owner = rcu_lock_domain(dom_io);
2804 break;
2805 case DOMID_XEN:
2806 if ( !IS_PRIV(curr) )
2808 MEM_LOG("Cannot set foreign dom");
2809 break;
2811 pg_owner = rcu_lock_domain(dom_xen);
2812 break;
2813 default:
2814 if ( (pg_owner = rcu_lock_domain_by_id(domid)) == NULL )
2816 MEM_LOG("Unknown domain '%u'", domid);
2817 break;
2819 if ( !IS_PRIV_FOR(curr, pg_owner) )
2821 MEM_LOG("Cannot set foreign dom");
2822 rcu_unlock_domain(pg_owner);
2823 pg_owner = NULL;
2825 break;
2828 out:
2829 return pg_owner;
2832 static void put_pg_owner(struct domain *pg_owner)
2834 rcu_unlock_domain(pg_owner);
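/*
 * Translate a guest-supplied bitmap of vCPU IDs into the set of physical
 * CPUs those vCPUs may have state on (the union of their
 * vcpu_dirty_cpumask), for use by the TLB-flush operations below.
 */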
2837 static inline int vcpumask_to_pcpumask(
2838 struct domain *d, XEN_GUEST_HANDLE(const_void) bmap, cpumask_t *pmask)
2840 unsigned int vcpu_id, vcpu_bias, offs;
2841 unsigned long vmask;
2842 struct vcpu *v;
2843 bool_t is_native = !is_pv_32on64_domain(d);
2845 cpus_clear(*pmask);
2846 for ( vmask = 0, offs = 0; ; ++offs)
2848 vcpu_bias = offs * (is_native ? BITS_PER_LONG : 32);
2849 if ( vcpu_bias >= d->max_vcpus )
2850 return 0;
2852 if ( unlikely(is_native ?
2853 copy_from_guest_offset(&vmask, bmap, offs, 1) :
2854 copy_from_guest_offset((unsigned int *)&vmask, bmap,
2855 offs, 1)) )
2857 cpus_clear(*pmask);
2858 return -EFAULT;
2861 while ( vmask )
2863 vcpu_id = find_first_set_bit(vmask);
2864 vmask &= ~(1UL << vcpu_id);
2865 vcpu_id += vcpu_bias;
2866 if ( (vcpu_id >= d->max_vcpus) )
2867 return 0;
2868 if ( ((v = d->vcpu[vcpu_id]) != NULL) )
2869 cpus_or(*pmask, *pmask, v->vcpu_dirty_cpumask);
2874 #ifdef __i386__
2875 static inline void *fixmap_domain_page(unsigned long mfn)
2877 unsigned int cpu = smp_processor_id();
2878 void *ptr = (void *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
2880 l1e_write(fix_pae_highmem_pl1e - cpu,
2881 l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
2882 flush_tlb_one_local(ptr);
2883 return ptr;
2885 static inline void fixunmap_domain_page(const void *ptr)
2887 unsigned int cpu = virt_to_fix((unsigned long)ptr) - FIX_PAE_HIGHMEM_0;
2889 l1e_write(fix_pae_highmem_pl1e - cpu, l1e_empty());
2890 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
2892 #else
2893 #define fixmap_domain_page(mfn) mfn_to_virt(mfn)
2894 #define fixunmap_domain_page(ptr) ((void)(ptr))
2895 #endif
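/*
 * On 64-bit builds all RAM is covered by the direct map, so
 * fixmap_domain_page() above degenerates to mfn_to_virt(); 32-bit PAE
 * builds use a per-CPU fixmap slot instead.
 */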
2897 int do_mmuext_op(
2898 XEN_GUEST_HANDLE(mmuext_op_t) uops,
2899 unsigned int count,
2900 XEN_GUEST_HANDLE(uint) pdone,
2901 unsigned int foreigndom)
2903 struct mmuext_op op;
2904 int rc = 0, i = 0, okay;
2905 unsigned long type;
2906 unsigned int done = 0;
2907 struct vcpu *curr = current;
2908 struct domain *d = curr->domain;
2909 struct domain *pg_owner;
2911 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2913 count &= ~MMU_UPDATE_PREEMPTED;
2914 if ( unlikely(!guest_handle_is_null(pdone)) )
2915 (void)copy_from_guest(&done, pdone, 1);
2917 else
2918 perfc_incr(calls_to_mmuext_op);
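/*
 * When this hypercall is restarted after preemption, MMU_UPDATE_PREEMPTED
 * is set in @count and @done already holds the work completed so far; see
 * the hypercall_create_continuation() call near the end of this function.
 */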
2920 if ( unlikely(!guest_handle_okay(uops, count)) )
2922 rc = -EFAULT;
2923 goto out;
2926 if ( (pg_owner = get_pg_owner(foreigndom)) == NULL )
2928 rc = -ESRCH;
2929 goto out;
2932 for ( i = 0; i < count; i++ )
2934 if ( hypercall_preempt_check() )
2936 rc = -EAGAIN;
2937 break;
2940 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2942 MEM_LOG("Bad __copy_from_guest");
2943 rc = -EFAULT;
2944 break;
2947 okay = 1;
2949 switch ( op.cmd )
2951 case MMUEXT_PIN_L1_TABLE:
2952 type = PGT_l1_page_table;
2953 goto pin_page;
2955 case MMUEXT_PIN_L2_TABLE:
2956 type = PGT_l2_page_table;
2957 goto pin_page;
2959 case MMUEXT_PIN_L3_TABLE:
2960 type = PGT_l3_page_table;
2961 goto pin_page;
2963 case MMUEXT_PIN_L4_TABLE:
2964 if ( is_pv_32bit_domain(pg_owner) )
2965 break;
2966 type = PGT_l4_page_table;
2968 pin_page: {
2969 unsigned long mfn;
2970 struct page_info *page;
2972 /* Ignore pinning of invalid paging levels. */
2973 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2974 break;
2976 if ( paging_mode_refcounts(pg_owner) )
2977 break;
2979 mfn = gmfn_to_mfn(pg_owner, op.arg1.mfn);
2980 rc = get_page_and_type_from_pagenr(mfn, type, pg_owner, 0, 1);
2981 okay = !rc;
2982 if ( unlikely(!okay) )
2984 if ( rc == -EINTR )
2985 rc = -EAGAIN;
2986 else if ( rc != -EAGAIN )
2987 MEM_LOG("Error while pinning mfn %lx", mfn);
2988 break;
2991 page = mfn_to_page(mfn);
2993 if ( (rc = xsm_memory_pin_page(d, page)) != 0 )
2995 put_page_and_type(page);
2996 okay = 0;
2997 break;
3000 if ( unlikely(test_and_set_bit(_PGT_pinned,
3001 &page->u.inuse.type_info)) )
3003 MEM_LOG("Mfn %lx already pinned", mfn);
3004 put_page_and_type(page);
3005 okay = 0;
3006 break;
3009 /* A page is dirtied when its pin status is set. */
3010 paging_mark_dirty(pg_owner, mfn);
3012 /* We can race domain destruction (domain_relinquish_resources). */
3013 if ( unlikely(pg_owner != d) )
3015 int drop_ref;
3016 spin_lock(&pg_owner->page_alloc_lock);
3017 drop_ref = (pg_owner->is_dying &&
3018 test_and_clear_bit(_PGT_pinned,
3019 &page->u.inuse.type_info));
3020 spin_unlock(&pg_owner->page_alloc_lock);
3021 if ( drop_ref )
3022 put_page_and_type(page);
3025 break;
3028 case MMUEXT_UNPIN_TABLE: {
3029 unsigned long mfn;
3030 struct page_info *page;
3032 if ( paging_mode_refcounts(pg_owner) )
3033 break;
3035 mfn = gmfn_to_mfn(pg_owner, op.arg1.mfn);
3036 if ( unlikely(!(okay = get_page_from_pagenr(mfn, pg_owner))) )
3038 MEM_LOG("Mfn %lx bad domain", mfn);
3039 break;
3042 page = mfn_to_page(mfn);
3044 if ( !test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
3046 okay = 0;
3047 put_page(page);
3048 MEM_LOG("Mfn %lx not pinned", mfn);
3049 break;
3052 put_page_and_type(page);
3053 put_page(page);
3055 /* A page is dirtied when its pin status is cleared. */
3056 paging_mark_dirty(pg_owner, mfn);
3058 break;
3061 case MMUEXT_NEW_BASEPTR:
3062 okay = new_guest_cr3(gmfn_to_mfn(d, op.arg1.mfn));
3063 break;
3065 #ifdef __x86_64__
3066 case MMUEXT_NEW_USER_BASEPTR: {
3067 unsigned long old_mfn, mfn;
3069 mfn = gmfn_to_mfn(d, op.arg1.mfn);
3070 if ( mfn != 0 )
3072 if ( paging_mode_refcounts(d) )
3073 okay = get_page_from_pagenr(mfn, d);
3074 else
3075 okay = !get_page_and_type_from_pagenr(
3076 mfn, PGT_root_page_table, d, 0, 0);
3077 if ( unlikely(!okay) )
3079 MEM_LOG("Error while installing new mfn %lx", mfn);
3080 break;
3084 old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
3085 curr->arch.guest_table_user = pagetable_from_pfn(mfn);
3087 if ( old_mfn != 0 )
3089 if ( paging_mode_refcounts(d) )
3090 put_page(mfn_to_page(old_mfn));
3091 else
3092 put_page_and_type(mfn_to_page(old_mfn));
3095 break;
3097 #endif
3099 case MMUEXT_TLB_FLUSH_LOCAL:
3100 flush_tlb_local();
3101 break;
3103 case MMUEXT_INVLPG_LOCAL:
3104 if ( !paging_mode_enabled(d)
3105 || paging_invlpg(curr, op.arg1.linear_addr) != 0 )
3106 flush_tlb_one_local(op.arg1.linear_addr);
3107 break;
3109 case MMUEXT_TLB_FLUSH_MULTI:
3110 case MMUEXT_INVLPG_MULTI:
3112 cpumask_t pmask;
3114 if ( unlikely(vcpumask_to_pcpumask(d, op.arg2.vcpumask, &pmask)) )
3116 okay = 0;
3117 break;
3119 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
3120 flush_tlb_mask(&pmask);
3121 else
3122 flush_tlb_one_mask(&pmask, op.arg1.linear_addr);
3123 break;
3126 case MMUEXT_TLB_FLUSH_ALL:
3127 flush_tlb_mask(&d->domain_dirty_cpumask);
3128 break;
3130 case MMUEXT_INVLPG_ALL:
3131 flush_tlb_one_mask(&d->domain_dirty_cpumask, op.arg1.linear_addr);
3132 break;
3134 case MMUEXT_FLUSH_CACHE:
3135 if ( unlikely(!cache_flush_permitted(d)) )
3137 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
3138 okay = 0;
3140 else
3142 wbinvd();
3144 break;
3146 case MMUEXT_FLUSH_CACHE_GLOBAL:
3147 if ( unlikely(foreigndom != DOMID_SELF) )
3148 okay = 0;
3149 else if ( likely(cache_flush_permitted(d)) )
3151 unsigned int cpu;
3152 cpumask_t mask = CPU_MASK_NONE;
3154 for_each_online_cpu(cpu)
3155 if ( !cpus_intersects(mask,
3156 per_cpu(cpu_sibling_map, cpu)) )
3157 cpu_set(cpu, mask);
3158 flush_mask(&mask, FLUSH_CACHE);
3160 else
3162 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE_GLOBAL");
3163 okay = 0;
3165 break;
3167 case MMUEXT_SET_LDT:
3169 unsigned long ptr = op.arg1.linear_addr;
3170 unsigned long ents = op.arg2.nr_ents;
3172 if ( paging_mode_external(d) )
3174 MEM_LOG("ignoring SET_LDT hypercall from external domain");
3175 okay = 0;
3177 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
3178 (ents > 8192) ||
3179 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
3181 okay = 0;
3182 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
3184 else if ( (curr->arch.guest_context.ldt_ents != ents) ||
3185 (curr->arch.guest_context.ldt_base != ptr) )
3187 invalidate_shadow_ldt(curr, 0);
3188 flush_tlb_local();
3189 curr->arch.guest_context.ldt_base = ptr;
3190 curr->arch.guest_context.ldt_ents = ents;
3191 load_LDT(curr);
3192 if ( ents != 0 )
3193 (void)map_ldt_shadow_page(0);
3195 break;
3198 case MMUEXT_CLEAR_PAGE: {
3199 unsigned long mfn;
3200 unsigned char *ptr;
3202 mfn = gmfn_to_mfn(d, op.arg1.mfn);
3203 okay = !get_page_and_type_from_pagenr(
3204 mfn, PGT_writable_page, d, 0, 0);
3205 if ( unlikely(!okay) )
3207 MEM_LOG("Error while clearing mfn %lx", mfn);
3208 break;
3211 /* A page is dirtied when it's being cleared. */
3212 paging_mark_dirty(d, mfn);
3214 ptr = fixmap_domain_page(mfn);
3215 clear_page(ptr);
3216 fixunmap_domain_page(ptr);
3218 put_page_and_type(mfn_to_page(mfn));
3219 break;
3222 case MMUEXT_COPY_PAGE:
3224 const unsigned char *src;
3225 unsigned char *dst;
3226 unsigned long src_mfn, mfn;
3228 src_mfn = gmfn_to_mfn(d, op.arg2.src_mfn);
3229 okay = get_page_from_pagenr(src_mfn, d);
3230 if ( unlikely(!okay) )
3232 MEM_LOG("Error while copying from mfn %lx", src_mfn);
3233 break;
3236 mfn = gmfn_to_mfn(d, op.arg1.mfn);
3237 okay = !get_page_and_type_from_pagenr(
3238 mfn, PGT_writable_page, d, 0, 0);
3239 if ( unlikely(!okay) )
3241 put_page(mfn_to_page(src_mfn));
3242 MEM_LOG("Error while copying to mfn %lx", mfn);
3243 break;
3246 /* A page is dirtied when it's being copied to. */
3247 paging_mark_dirty(d, mfn);
3249 src = map_domain_page(src_mfn);
3250 dst = fixmap_domain_page(mfn);
3251 copy_page(dst, src);
3252 fixunmap_domain_page(dst);
3253 unmap_domain_page(src);
3255 put_page_and_type(mfn_to_page(mfn));
3256 put_page(mfn_to_page(src_mfn));
3257 break;
3260 #ifdef __x86_64__
3261 case MMUEXT_MARK_SUPER:
3263 unsigned long mfn;
3264 struct spage_info *spage;
3266 mfn = op.arg1.mfn;
3267 if ( mfn & (L1_PAGETABLE_ENTRIES-1) )
3269 MEM_LOG("Unaligned superpage reference mfn %lx", mfn);
3270 okay = 0;
3271 break;
3274 if ( !opt_allow_superpage )
3276 MEM_LOG("Superpages disallowed");
3277 okay = 0;
3278 rc = -ENOSYS;
3279 break;
3282 spage = mfn_to_spage(mfn);
3283 okay = (mark_superpage(spage, d) >= 0);
3284 break;
3287 case MMUEXT_UNMARK_SUPER:
3289 unsigned long mfn;
3290 struct spage_info *spage;
3292 mfn = op.arg1.mfn;
3293 if ( mfn & (L1_PAGETABLE_ENTRIES-1) )
3295 MEM_LOG("Unaligned superpage reference mfn %lx", mfn);
3296 okay = 0;
3297 break;
3300 if ( !opt_allow_superpage )
3302 MEM_LOG("Superpages disallowed");
3303 okay = 0;
3304 rc = -ENOSYS;
3305 break;
3308 spage = mfn_to_spage(mfn);
3309 okay = (unmark_superpage(spage) >= 0);
3310 break;
3312 #endif
3314 default:
3315 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
3316 rc = -ENOSYS;
3317 okay = 0;
3318 break;
3321 if ( unlikely(!okay) )
3323 rc = rc ? rc : -EINVAL;
3324 break;
3327 guest_handle_add_offset(uops, 1);
3330 if ( rc == -EAGAIN )
3331 rc = hypercall_create_continuation(
3332 __HYPERVISOR_mmuext_op, "hihi",
3333 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
3335 put_pg_owner(pg_owner);
3337 perfc_add(num_mmuext_ops, i);
3339 out:
3340 /* Add incremental work we have done to the @done output parameter. */
3341 if ( unlikely(!guest_handle_is_null(pdone)) )
3343 done += i;
3344 copy_to_guest(pdone, &done, 1);
3347 return rc;
3350 int do_mmu_update(
3351 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
3352 unsigned int count,
3353 XEN_GUEST_HANDLE(uint) pdone,
3354 unsigned int foreigndom)
3356 struct mmu_update req;
3357 void *va;
3358 unsigned long gpfn, gmfn, mfn;
3359 struct page_info *page;
3360 int rc = 0, okay = 1, i = 0;
3361 unsigned int cmd, done = 0, pt_dom;
3362 struct vcpu *v = current;
3363 struct domain *d = v->domain, *pt_owner = d, *pg_owner;
3364 struct domain_mmap_cache mapcache;
3366 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
3368 count &= ~MMU_UPDATE_PREEMPTED;
3369 if ( unlikely(!guest_handle_is_null(pdone)) )
3370 (void)copy_from_guest(&done, pdone, 1);
3372 else
3373 perfc_incr(calls_to_mmu_update);
3375 if ( unlikely(!guest_handle_okay(ureqs, count)) )
3377 rc = -EFAULT;
3378 goto out;
3381 if ( (pt_dom = foreigndom >> 16) != 0 )
3383 /* Pagetables belong to a foreign domain (PFD). */
3384 if ( (pt_owner = rcu_lock_domain_by_id(pt_dom - 1)) == NULL )
3386 rc = -EINVAL;
3387 goto out;
3389 if ( pt_owner == d )
3390 rcu_unlock_domain(pt_owner);
3391 if ( (v = pt_owner->vcpu ? pt_owner->vcpu[0] : NULL) == NULL )
3393 rc = -EINVAL;
3394 goto out;
3396 if ( !IS_PRIV_FOR(d, pt_owner) )
3398 rc = -ESRCH;
3399 goto out;
3403 if ( (pg_owner = get_pg_owner((uint16_t)foreigndom)) == NULL )
3405 rc = -ESRCH;
3406 goto out;
3409 domain_mmap_cache_init(&mapcache);
3411 for ( i = 0; i < count; i++ )
3413 if ( hypercall_preempt_check() )
3415 rc = -EAGAIN;
3416 break;
3419 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
3421 MEM_LOG("Bad __copy_from_guest");
3422 rc = -EFAULT;
3423 break;
3426 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
3427 okay = 0;
3429 switch ( cmd )
3431 /*
3432 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
3433 * MMU_PT_UPDATE_PRESERVE_AD: As above, but also preserve (OR in) the
3434 * current A/D bits.
3435 */
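/*
 * The command is carried in the low bits of req.ptr (extracted above as
 * req.ptr & (sizeof(l1_pgentry_t)-1)); the remaining bits give the frame
 * and offset of the page-table entry to update, while req.val holds the
 * new entry. A rough guest-side sketch (placeholder names, not code from
 * this file):
 *
 *     struct mmu_update u = {
 *         .ptr = pte_addr | MMU_NORMAL_PT_UPDATE,
 *         .val = new_pte,
 *     };
 *     HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
 */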
3436 case MMU_NORMAL_PT_UPDATE:
3437 case MMU_PT_UPDATE_PRESERVE_AD:
3439 p2m_type_t p2mt;
3441 rc = xsm_mmu_normal_update(d, pg_owner, req.val);
3442 if ( rc )
3443 break;
3445 req.ptr -= cmd;
3446 gmfn = req.ptr >> PAGE_SHIFT;
3447 mfn = mfn_x(gfn_to_mfn(p2m_get_hostp2m(pt_owner), gmfn, &p2mt));
3448 if ( !p2m_is_valid(p2mt) )
3449 mfn = INVALID_MFN;
3451 if ( p2m_is_paged(p2mt) )
3453 p2m_mem_paging_populate(p2m_get_hostp2m(pg_owner), gmfn);
3455 rc = -ENOENT;
3456 break;
3459 if ( unlikely(!get_page_from_pagenr(mfn, pt_owner)) )
3461 MEM_LOG("Could not get page for normal update");
3462 break;
3465 va = map_domain_page_with_cache(mfn, &mapcache);
3466 va = (void *)((unsigned long)va +
3467 (unsigned long)(req.ptr & ~PAGE_MASK));
3468 page = mfn_to_page(mfn);
3470 if ( page_lock(page) )
3472 switch ( page->u.inuse.type_info & PGT_type_mask )
3474 case PGT_l1_page_table:
3476 l1_pgentry_t l1e = l1e_from_intpte(req.val);
3477 p2m_type_t l1e_p2mt;
3478 gfn_to_mfn(p2m_get_hostp2m(pg_owner),
3479 l1e_get_pfn(l1e), &l1e_p2mt);
3481 if ( p2m_is_paged(l1e_p2mt) )
3483 p2m_mem_paging_populate(p2m_get_hostp2m(pg_owner),
3484 l1e_get_pfn(l1e));
3485 rc = -ENOENT;
3486 break;
3488 else if ( p2m_ram_paging_in_start == l1e_p2mt )
3490 rc = -ENOENT;
3491 break;
3493 #ifdef __x86_64__
3494 /* XXX: Ugly: pull all the checks into a separate function.
3495 * Not done yet, so as not to interfere with the mem_paging
3496 * patches. */
3497 else if ( p2m_ram_shared == l1e_p2mt )
3499 /* Unshare the page for RW foreign mappings */
3500 if ( l1e_get_flags(l1e) & _PAGE_RW )
3502 rc = mem_sharing_unshare_page(p2m_get_hostp2m(pg_owner),
3503 l1e_get_pfn(l1e),
3504 0);
3505 if ( rc )
3506 break;
3509 #endif
3511 okay = mod_l1_entry(va, l1e, mfn,
3512 cmd == MMU_PT_UPDATE_PRESERVE_AD, v,
3513 pg_owner);
3515 break;
3516 case PGT_l2_page_table:
3518 l2_pgentry_t l2e = l2e_from_intpte(req.val);
3519 p2m_type_t l2e_p2mt;
3520 gfn_to_mfn(p2m_get_hostp2m(pg_owner), l2e_get_pfn(l2e), &l2e_p2mt);
3522 if ( p2m_is_paged(l2e_p2mt) )
3524 p2m_mem_paging_populate(p2m_get_hostp2m(pg_owner),
3525 l2e_get_pfn(l2e));
3526 rc = -ENOENT;
3527 break;
3529 else if ( p2m_ram_paging_in_start == l2e_p2mt )
3531 rc = -ENOENT;
3532 break;
3534 else if ( p2m_ram_shared == l2e_p2mt )
3536 MEM_LOG("Unexpected attempt to map shared page.\n");
3537 rc = -EINVAL;
3538 break;
3542 okay = mod_l2_entry(va, l2e, mfn,
3543 cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
3545 break;
3546 case PGT_l3_page_table:
3548 l3_pgentry_t l3e = l3e_from_intpte(req.val);
3549 p2m_type_t l3e_p2mt;
3550 gfn_to_mfn(p2m_get_hostp2m(pg_owner), l3e_get_pfn(l3e), &l3e_p2mt);
3552 if ( p2m_is_paged(l3e_p2mt) )
3554 p2m_mem_paging_populate(p2m_get_hostp2m(pg_owner),
3555 l3e_get_pfn(l3e));
3556 rc = -ENOENT;
3557 break;
3559 else if ( p2m_ram_paging_in_start == l3e_p2mt )
3561 rc = -ENOENT;
3562 break;
3564 else if ( p2m_ram_shared == l3e_p2mt )
3566 MEM_LOG("Unexpected attempt to map shared page.\n");
3567 rc = -EINVAL;
3568 break;
3571 rc = mod_l3_entry(va, l3e, mfn,
3572 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1, v);
3573 okay = !rc;
3575 break;
3576 #if CONFIG_PAGING_LEVELS >= 4
3577 case PGT_l4_page_table:
3579 l4_pgentry_t l4e = l4e_from_intpte(req.val);
3580 p2m_type_t l4e_p2mt;
3581 gfn_to_mfn(p2m_get_hostp2m(pg_owner),
3582 l4e_get_pfn(l4e), &l4e_p2mt);
3584 if ( p2m_is_paged(l4e_p2mt) )
3586 p2m_mem_paging_populate(p2m_get_hostp2m(pg_owner),
3587 l4e_get_pfn(l4e));
3588 rc = -ENOENT;
3589 break;
3591 else if ( p2m_ram_paging_in_start == l4e_p2mt )
3593 rc = -ENOENT;
3594 break;
3596 else if ( p2m_ram_shared == l4e_p2mt )
3598 MEM_LOG("Unexpected attempt to map shared page.\n");
3599 rc = -EINVAL;
3600 break;
3603 rc = mod_l4_entry(va, l4e, mfn,
3604 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1, v);
3605 okay = !rc;
3607 break;
3608 #endif
3609 case PGT_writable_page:
3610 perfc_incr(writable_mmu_updates);
3611 okay = paging_write_guest_entry(
3612 v, va, req.val, _mfn(mfn));
3613 break;
3615 page_unlock(page);
3616 if ( rc == -EINTR )
3617 rc = -EAGAIN;
3619 else if ( get_page_type(page, PGT_writable_page) )
3621 perfc_incr(writable_mmu_updates);
3622 okay = paging_write_guest_entry(
3623 v, va, req.val, _mfn(mfn));
3624 put_page_type(page);
3627 unmap_domain_page_with_cache(va, &mapcache);
3628 put_page(page);
3630 break;
3632 case MMU_MACHPHYS_UPDATE:
3634 mfn = req.ptr >> PAGE_SHIFT;
3635 gpfn = req.val;
3637 rc = xsm_mmu_machphys_update(d, mfn);
3638 if ( rc )
3639 break;
3641 if ( unlikely(!get_page_from_pagenr(mfn, pg_owner)) )
3643 MEM_LOG("Could not get page for mach->phys update");
3644 break;
3647 if ( unlikely(paging_mode_translate(pg_owner)) )
3649 MEM_LOG("Mach-phys update on auto-translate guest");
3650 break;
3653 set_gpfn_from_mfn(mfn, gpfn);
3654 okay = 1;
3656 paging_mark_dirty(pg_owner, mfn);
3658 put_page(mfn_to_page(mfn));
3659 break;
3661 default:
3662 MEM_LOG("Invalid page update command %x", cmd);
3663 rc = -ENOSYS;
3664 okay = 0;
3665 break;
3668 if ( unlikely(!okay) )
3670 rc = rc ? rc : -EINVAL;
3671 break;
3674 guest_handle_add_offset(ureqs, 1);
3677 if ( rc == -EAGAIN )
3678 rc = hypercall_create_continuation(
3679 __HYPERVISOR_mmu_update, "hihi",
3680 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
3682 put_pg_owner(pg_owner);
3684 domain_mmap_cache_destroy(&mapcache);
3686 perfc_add(num_page_updates, i);
3688 out:
3689 if ( pt_owner && (pt_owner != d) )
3690 rcu_unlock_domain(pt_owner);
3692 /* Add incremental work we have done to the @done output parameter. */
3693 if ( unlikely(!guest_handle_is_null(pdone)) )
3695 done += i;
3696 copy_to_guest(pdone, &done, 1);
3699 return rc;
3703 static int create_grant_pte_mapping(
3704 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
3706 int rc = GNTST_okay;
3707 void *va;
3708 unsigned long gmfn, mfn;
3709 struct page_info *page;
3710 l1_pgentry_t ol1e;
3711 struct domain *d = v->domain;
3713 ASSERT(domain_is_locked(d));
3715 adjust_guest_l1e(nl1e, d);
3717 gmfn = pte_addr >> PAGE_SHIFT;
3718 mfn = gmfn_to_mfn(d, gmfn);
3720 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
3722 MEM_LOG("Could not get page for normal update");
3723 return GNTST_general_error;
3726 va = map_domain_page(mfn);
3727 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
3728 page = mfn_to_page(mfn);
3730 if ( !page_lock(page) )
3732 rc = GNTST_general_error;
3733 goto failed;
3736 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3738 page_unlock(page);
3739 rc = GNTST_general_error;
3740 goto failed;
3743 ol1e = *(l1_pgentry_t *)va;
3744 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) )
3746 page_unlock(page);
3747 rc = GNTST_general_error;
3748 goto failed;
3751 page_unlock(page);
3753 if ( !paging_mode_refcounts(d) )
3754 put_page_from_l1e(ol1e, d);
3756 failed:
3757 unmap_domain_page(va);
3758 put_page(page);
3760 return rc;
3763 static int destroy_grant_pte_mapping(
3764 uint64_t addr, unsigned long frame, struct domain *d)
3766 int rc = GNTST_okay;
3767 void *va;
3768 unsigned long gmfn, mfn;
3769 struct page_info *page;
3770 l1_pgentry_t ol1e;
3772 gmfn = addr >> PAGE_SHIFT;
3773 mfn = gmfn_to_mfn(d, gmfn);
3775 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
3777 MEM_LOG("Could not get page for normal update");
3778 return GNTST_general_error;
3781 va = map_domain_page(mfn);
3782 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
3783 page = mfn_to_page(mfn);
3785 if ( !page_lock(page) )
3787 rc = GNTST_general_error;
3788 goto failed;
3791 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3793 page_unlock(page);
3794 rc = GNTST_general_error;
3795 goto failed;
3798 ol1e = *(l1_pgentry_t *)va;
3800 /* Check that the virtual address supplied is actually mapped to frame. */
3801 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
3803 page_unlock(page);
3804 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
3805 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
3806 rc = GNTST_general_error;
3807 goto failed;
3810 /* Delete pagetable entry. */
3811 if ( unlikely(!UPDATE_ENTRY
3812 (l1,
3813 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
3814 d->vcpu[0] /* Change if we go to per-vcpu shadows. */,
3815 0)) )
3817 page_unlock(page);
3818 MEM_LOG("Cannot delete PTE entry at %p", va);
3819 rc = GNTST_general_error;
3820 goto failed;
3823 page_unlock(page);
3825 failed:
3826 unmap_domain_page(va);
3827 put_page(page);
3828 return rc;
3832 static int create_grant_va_mapping(
3833 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
3835 l1_pgentry_t *pl1e, ol1e;
3836 struct domain *d = v->domain;
3837 unsigned long gl1mfn;
3838 struct page_info *l1pg;
3839 int okay;
3841 ASSERT(domain_is_locked(d));
3843 adjust_guest_l1e(nl1e, d);
3845 pl1e = guest_map_l1e(v, va, &gl1mfn);
3846 if ( !pl1e )
3848 MEM_LOG("Could not find L1 PTE for address %lx", va);
3849 return GNTST_general_error;
3852 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3854 guest_unmap_l1e(v, pl1e);
3855 return GNTST_general_error;
3858 l1pg = mfn_to_page(gl1mfn);
3859 if ( !page_lock(l1pg) )
3861 put_page(l1pg);
3862 guest_unmap_l1e(v, pl1e);
3863 return GNTST_general_error;
3866 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3868 page_unlock(l1pg);
3869 put_page(l1pg);
3870 guest_unmap_l1e(v, pl1e);
3871 return GNTST_general_error;
3874 ol1e = *pl1e;
3875 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0);
3877 page_unlock(l1pg);
3878 put_page(l1pg);
3879 guest_unmap_l1e(v, pl1e);
3881 if ( okay && !paging_mode_refcounts(d) )
3882 put_page_from_l1e(ol1e, d);
3884 return okay ? GNTST_okay : GNTST_general_error;
3887 static int replace_grant_va_mapping(
3888 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
3890 l1_pgentry_t *pl1e, ol1e;
3891 unsigned long gl1mfn;
3892 struct page_info *l1pg;
3893 int rc = 0;
3895 pl1e = guest_map_l1e(v, addr, &gl1mfn);
3896 if ( !pl1e )
3898 MEM_LOG("Could not find L1 PTE for address %lx", addr);
3899 return GNTST_general_error;
3902 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3904 rc = GNTST_general_error;
3905 goto out;
3908 l1pg = mfn_to_page(gl1mfn);
3909 if ( !page_lock(l1pg) )
3911 rc = GNTST_general_error;
3912 put_page(l1pg);
3913 goto out;
3916 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3918 rc = GNTST_general_error;
3919 goto unlock_and_out;
3922 ol1e = *pl1e;
3924 /* Check that the virtual address supplied is actually mapped to frame. */
3925 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
3927 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
3928 l1e_get_pfn(ol1e), addr, frame);
3929 rc = GNTST_general_error;
3930 goto unlock_and_out;
3933 /* Delete pagetable entry. */
3934 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0)) )
3936 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3937 rc = GNTST_general_error;
3938 goto unlock_and_out;
3941 unlock_and_out:
3942 page_unlock(l1pg);
3943 put_page(l1pg);
3944 out:
3945 guest_unmap_l1e(v, pl1e);
3946 return rc;
3949 static int destroy_grant_va_mapping(
3950 unsigned long addr, unsigned long frame, struct vcpu *v)
3952 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
3955 static int create_grant_p2m_mapping(uint64_t addr, unsigned long frame,
3956 unsigned int flags,
3957 unsigned int cache_flags)
3959 p2m_type_t p2mt;
3960 int rc;
3962 if ( cache_flags || (flags & ~GNTMAP_readonly) != GNTMAP_host_map )
3963 return GNTST_general_error;
3965 if ( flags & GNTMAP_readonly )
3966 p2mt = p2m_grant_map_ro;
3967 else
3968 p2mt = p2m_grant_map_rw;
3969 rc = guest_physmap_add_entry(p2m_get_hostp2m(current->domain),
3970 addr >> PAGE_SHIFT, frame, 0, p2mt);
3971 if ( rc )
3972 return GNTST_general_error;
3973 else
3974 return GNTST_okay;
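/*
 * Grant-mapping dispatch: auto-translated (external paging) domains go
 * through the p2m above; GNTMAP_contains_pte callers pass the address of
 * an L1 entry; everything else is a linear address resolved through the
 * guest page tables.
 */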
3977 int create_grant_host_mapping(uint64_t addr, unsigned long frame,
3978 unsigned int flags, unsigned int cache_flags)
3980 l1_pgentry_t pte;
3982 if ( paging_mode_external(current->domain) )
3983 return create_grant_p2m_mapping(addr, frame, flags, cache_flags);
3985 pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
3986 if ( (flags & GNTMAP_application_map) )
3987 l1e_add_flags(pte,_PAGE_USER);
3988 if ( !(flags & GNTMAP_readonly) )
3989 l1e_add_flags(pte,_PAGE_RW);
3991 l1e_add_flags(pte,
3992 ((flags >> _GNTMAP_guest_avail0) * _PAGE_AVAIL0)
3993 & _PAGE_AVAIL);
3995 l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5));
3997 if ( flags & GNTMAP_contains_pte )
3998 return create_grant_pte_mapping(addr, pte, current);
3999 return create_grant_va_mapping(addr, pte, current);
4002 static int replace_grant_p2m_mapping(
4003 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
4005 unsigned long gfn = (unsigned long)(addr >> PAGE_SHIFT);
4006 p2m_type_t type;
4007 mfn_t old_mfn;
4008 struct domain *d = current->domain;
4010 if ( new_addr != 0 || (flags & GNTMAP_contains_pte) )
4011 return GNTST_general_error;
4013 old_mfn = gfn_to_mfn(p2m_get_hostp2m(d), gfn, &type);
4014 if ( !p2m_is_grant(type) || mfn_x(old_mfn) != frame )
4016 gdprintk(XENLOG_WARNING,
4017 "replace_grant_p2m_mapping: old mapping invalid (type %d, mfn %lx, frame %lx)\n",
4018 type, mfn_x(old_mfn), frame);
4019 return GNTST_general_error;
4021 guest_physmap_remove_page(d, gfn, frame, 0);
4023 return GNTST_okay;
4026 int replace_grant_host_mapping(
4027 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
4029 struct vcpu *curr = current;
4030 l1_pgentry_t *pl1e, ol1e;
4031 unsigned long gl1mfn;
4032 struct page_info *l1pg;
4033 int rc;
4035 if ( paging_mode_external(current->domain) )
4036 return replace_grant_p2m_mapping(addr, frame, new_addr, flags);
4038 if ( flags & GNTMAP_contains_pte )
4040 if ( !new_addr )
4041 return destroy_grant_pte_mapping(addr, frame, curr->domain);
4043 MEM_LOG("Unsupported grant table operation");
4044 return GNTST_general_error;
4047 if ( !new_addr )
4048 return destroy_grant_va_mapping(addr, frame, curr);
4050 pl1e = guest_map_l1e(curr, new_addr, &gl1mfn);
4051 if ( !pl1e )
4053 MEM_LOG("Could not find L1 PTE for address %lx",
4054 (unsigned long)new_addr);
4055 return GNTST_general_error;
4058 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
4060 guest_unmap_l1e(curr, pl1e);
4061 return GNTST_general_error;
4064 l1pg = mfn_to_page(gl1mfn);
4065 if ( !page_lock(l1pg) )
4067 put_page(l1pg);
4068 guest_unmap_l1e(curr, pl1e);
4069 return GNTST_general_error;
4072 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
4074 page_unlock(l1pg);
4075 put_page(l1pg);
4076 guest_unmap_l1e(curr, pl1e);
4077 return GNTST_general_error;
4080 ol1e = *pl1e;
4082 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(),
4083 gl1mfn, curr, 0)) )
4085 page_unlock(l1pg);
4086 put_page(l1pg);
4087 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
4088 guest_unmap_l1e(curr, pl1e);
4089 return GNTST_general_error;
4092 page_unlock(l1pg);
4093 put_page(l1pg);
4094 guest_unmap_l1e(curr, pl1e);
4096 rc = replace_grant_va_mapping(addr, frame, ol1e, curr);
4097 if ( rc && !paging_mode_refcounts(curr->domain) )
4098 put_page_from_l1e(ol1e, curr->domain);
4100 return rc;
4103 int donate_page(
4104 struct domain *d, struct page_info *page, unsigned int memflags)
4106 spin_lock(&d->page_alloc_lock);
4108 if ( is_xen_heap_page(page) || (page_get_owner(page) != NULL) )
4109 goto fail;
4111 if ( d->is_dying )
4112 goto fail;
4114 if ( page->count_info & ~(PGC_allocated | 1) )
4115 goto fail;
4117 if ( !(memflags & MEMF_no_refcount) )
4119 if ( d->tot_pages >= d->max_pages )
4120 goto fail;
4121 d->tot_pages++;
4124 page->count_info = PGC_allocated | 1;
4125 page_set_owner(page, d);
4126 page_list_add_tail(page,&d->page_list);
4128 spin_unlock(&d->page_alloc_lock);
4129 return 0;
4131 fail:
4132 spin_unlock(&d->page_alloc_lock);
4133 MEM_LOG("Bad donate %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info,
4134 (void *)page_to_mfn(page), d, d->domain_id,
4135 page_get_owner(page), page->count_info, page->u.inuse.type_info);
4136 return -1;
4139 int steal_page(
4140 struct domain *d, struct page_info *page, unsigned int memflags)
4142 unsigned long x, y;
4143 bool_t drop_dom_ref = 0;
4145 spin_lock(&d->page_alloc_lock);
4147 if ( is_xen_heap_page(page) || (page_get_owner(page) != d) )
4148 goto fail;
4150 /*
4151 * We require there is just one reference (PGC_allocated). We temporarily
4152 * drop this reference now so that we can safely swizzle the owner.
4153 */
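/*
 * The cmpxchg loop below only succeeds when the count is exactly 1 with
 * PGC_allocated set; the count is cleared here and restored to 1 further
 * down, once the owner has been switched.
 */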
4154 y = page->count_info;
4155 do {
4156 x = y;
4157 if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
4158 goto fail;
4159 y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask);
4160 } while ( y != x );
4162 /* Swizzle the owner then reinstate the PGC_allocated reference. */
4163 page_set_owner(page, NULL);
4164 y = page->count_info;
4165 do {
4166 x = y;
4167 BUG_ON((x & (PGC_count_mask|PGC_allocated)) != PGC_allocated);
4168 } while ( (y = cmpxchg(&page->count_info, x, x | 1)) != x );
4170 /* Unlink from original owner. */
4171 if ( !(memflags & MEMF_no_refcount) && !--d->tot_pages )
4172 drop_dom_ref = 1;
4173 page_list_del(page, &d->page_list);
4175 spin_unlock(&d->page_alloc_lock);
4176 if ( unlikely(drop_dom_ref) )
4177 put_domain(d);
4178 return 0;
4180 fail:
4181 spin_unlock(&d->page_alloc_lock);
4182 MEM_LOG("Bad page %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info,
4183 (void *)page_to_mfn(page), d, d->domain_id,
4184 page_get_owner(page), page->count_info, page->u.inuse.type_info);
4185 return -1;
4188 int page_make_sharable(struct domain *d,
4189 struct page_info *page,
4190 int expected_refcnt)
4192 spin_lock(&d->page_alloc_lock);
4194 /* Change page type and count atomically */
4195 if ( !get_page_and_type(page, d, PGT_shared_page) )
4197 spin_unlock(&d->page_alloc_lock);
4198 return -EINVAL;
4201 /* Check it wasn't already sharable and undo if it was */
4202 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
4204 put_page_and_type(page);
4205 spin_unlock(&d->page_alloc_lock);
4206 return -EEXIST;
4209 /* Check that the ref count is 2 + expected_refcnt: one from PGC_allocated,
4210 * one from the get_page_and_type() above, plus any the caller expects. */
4211 if ( page->count_info != (PGC_allocated | (2 + expected_refcnt)) )
4213 /* Return type count back to zero */
4214 put_page_and_type(page);
4215 spin_unlock(&d->page_alloc_lock);
4216 return -E2BIG;
4219 page_set_owner(page, dom_cow);
4220 d->tot_pages--;
4221 page_list_del(page, &d->page_list);
4222 spin_unlock(&d->page_alloc_lock);
4223 return 0;
4226 int page_make_private(struct domain *d, struct page_info *page)
4228 if ( !get_page(page, dom_cow) )
4229 return -EINVAL;
4231 spin_lock(&d->page_alloc_lock);
4233 /* We can only change the type if count is one */
4234 if ( (page->u.inuse.type_info & (PGT_type_mask | PGT_count_mask))
4235 != (PGT_shared_page | 1) )
4237 put_page(page);
4238 spin_unlock(&d->page_alloc_lock);
4239 return -EEXIST;
4242 /* Drop the final typecount */
4243 put_page_and_type(page);
4245 /* Change the owner */
4246 ASSERT(page_get_owner(page) == dom_cow);
4247 page_set_owner(page, d);
4249 d->tot_pages++;
4250 page_list_add_tail(page, &d->page_list);
4251 spin_unlock(&d->page_alloc_lock);
4253 put_page(page);
4255 return 0;
4258 static int __do_update_va_mapping(
4259 unsigned long va, u64 val64, unsigned long flags, struct domain *pg_owner)
4261 l1_pgentry_t val = l1e_from_intpte(val64);
4262 struct vcpu *v = current;
4263 struct domain *d = v->domain;
4264 struct page_info *gl1pg;
4265 l1_pgentry_t *pl1e;
4266 unsigned long bmap_ptr, gl1mfn;
4267 cpumask_t pmask;
4268 int rc;
4270 perfc_incr(calls_to_update_va);
4272 rc = xsm_update_va_mapping(d, pg_owner, val);
4273 if ( rc )
4274 return rc;
4276 rc = -EINVAL;
4277 pl1e = guest_map_l1e(v, va, &gl1mfn);
4278 if ( unlikely(!pl1e || !get_page_from_pagenr(gl1mfn, d)) )
4279 goto out;
4281 gl1pg = mfn_to_page(gl1mfn);
4282 if ( !page_lock(gl1pg) )
4284 put_page(gl1pg);
4285 goto out;
4288 if ( (gl1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
4290 page_unlock(gl1pg);
4291 put_page(gl1pg);
4292 goto out;
4295 rc = mod_l1_entry(pl1e, val, gl1mfn, 0, v, pg_owner) ? 0 : -EINVAL;
4297 page_unlock(gl1pg);
4298 put_page(gl1pg);
4300 out:
4301 if ( pl1e )
4302 guest_unmap_l1e(v, pl1e);
4304 switch ( flags & UVMF_FLUSHTYPE_MASK )
4306 case UVMF_TLB_FLUSH:
4307 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
4309 case UVMF_LOCAL:
4310 flush_tlb_local();
4311 break;
4312 case UVMF_ALL:
4313 flush_tlb_mask(&d->domain_dirty_cpumask);
4314 break;
4315 default:
4316 rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
4317 void),
4318 &pmask);
4319 flush_tlb_mask(&pmask);
4320 break;
4322 break;
4324 case UVMF_INVLPG:
4325 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
4327 case UVMF_LOCAL:
4328 if ( !paging_mode_enabled(d) ||
4329 (paging_invlpg(v, va) != 0) )
4330 flush_tlb_one_local(va);
4331 break;
4332 case UVMF_ALL:
4333 flush_tlb_one_mask(&d->domain_dirty_cpumask, va);
4334 break;
4335 default:
4336 rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
4337 void),
4338 &pmask);
4339 flush_tlb_one_mask(&pmask, va);
4340 break;
4342 break;
4345 return rc;
4348 int do_update_va_mapping(unsigned long va, u64 val64,
4349 unsigned long flags)
4351 return __do_update_va_mapping(va, val64, flags, current->domain);
4354 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
4355 unsigned long flags,
4356 domid_t domid)
4358 struct domain *pg_owner;
4359 int rc;
4361 if ( (pg_owner = get_pg_owner(domid)) == NULL )
4362 return -ESRCH;
4364 rc = __do_update_va_mapping(va, val64, flags, pg_owner);
4366 put_pg_owner(pg_owner);
4368 return rc;
4373 /*************************
4374 * Descriptor Tables
4375 */
4377 void destroy_gdt(struct vcpu *v)
4379 int i;
4380 unsigned long pfn;
4382 v->arch.guest_context.gdt_ents = 0;
4383 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
4385 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
4386 put_page_and_type(mfn_to_page(pfn));
4387 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
4388 v->arch.guest_context.gdt_frames[i] = 0;
4393 long set_gdt(struct vcpu *v,
4394 unsigned long *frames,
4395 unsigned int entries)
4397 struct domain *d = v->domain;
4398 /* NB. There are 512 8-byte entries per GDT page. */
4399 int i, nr_pages = (entries + 511) / 512;
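/* E.g. 1-512 entries fit in one page, 513-1024 need two, and so on. */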
4400 unsigned long mfn;
4402 if ( entries > FIRST_RESERVED_GDT_ENTRY )
4403 return -EINVAL;
4405 /* Check the pages in the new GDT. */
4406 for ( i = 0; i < nr_pages; i++ )
4408 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
4409 if ( !mfn_valid(mfn) ||
4410 !get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page) )
4411 goto fail;
4414 /* Tear down the old GDT. */
4415 destroy_gdt(v);
4417 /* Install the new GDT. */
4418 v->arch.guest_context.gdt_ents = entries;
4419 for ( i = 0; i < nr_pages; i++ )
4421 v->arch.guest_context.gdt_frames[i] = frames[i];
4422 l1e_write(&v->arch.perdomain_ptes[i],
4423 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
4426 return 0;
4428 fail:
4429 while ( i-- > 0 )
4430 put_page_and_type(mfn_to_page(frames[i]));
4431 return -EINVAL;
4435 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
4437 int nr_pages = (entries + 511) / 512;
4438 unsigned long frames[16];
4439 struct vcpu *curr = current;
4440 long ret;
4442 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
4443 if ( entries > FIRST_RESERVED_GDT_ENTRY )
4444 return -EINVAL;
4446 if ( copy_from_guest(frames, frame_list, nr_pages) )
4447 return -EFAULT;
4449 domain_lock(curr->domain);
4451 if ( (ret = set_gdt(curr, frames, entries)) == 0 )
4452 flush_tlb_local();
4454 domain_unlock(curr->domain);
4456 return ret;
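/*
 * Illustrative usage, assuming the usual guest-side wrapper (not defined
 * here): the guest passes a list of its own frame numbers, which set_gdt()
 * translates via gmfn_to_mfn() and pins as PGT_seg_desc_page so no
 * writable mapping of a live descriptor page can exist.  With 512
 * descriptors per page, the FIRST_RESERVED_GDT_ENTRY limit keeps nr_pages
 * within the 16-slot frames[] array above.
 *
 *     unsigned long frames[1] = { gdt_gmfn };
 *
 *     rc = HYPERVISOR_set_gdt(frames, 512);    one page of descriptors
 */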
4460 long do_update_descriptor(u64 pa, u64 desc)
4462 struct domain *dom = current->domain;
4463 unsigned long gmfn = pa >> PAGE_SHIFT;
4464 unsigned long mfn;
4465 unsigned int offset;
4466 struct desc_struct *gdt_pent, d;
4467 struct page_info *page;
4468 long ret = -EINVAL;
4470 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
4472 *(u64 *)&d = desc;
4474 mfn = gmfn_to_mfn(dom, gmfn);
4475 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
4476 !mfn_valid(mfn) ||
4477 !check_descriptor(dom, &d) )
4478 return -EINVAL;
4480 page = mfn_to_page(mfn);
4481 if ( unlikely(!get_page(page, dom)) )
4482 return -EINVAL;
4484 /* Check if the given frame is in use in an unsafe context. */
4485 switch ( page->u.inuse.type_info & PGT_type_mask )
4487 case PGT_seg_desc_page:
4488 if ( unlikely(!get_page_type(page, PGT_seg_desc_page)) )
4489 goto out;
4490 break;
4491 default:
4492 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
4493 goto out;
4494 break;
4497 paging_mark_dirty(dom, mfn);
4499 /* All is good so make the update. */
4500 gdt_pent = map_domain_page(mfn);
4501 atomic_write64((uint64_t *)&gdt_pent[offset], *(uint64_t *)&d);
4502 unmap_domain_page(gdt_pent);
4504 put_page_type(page);
4506 ret = 0; /* success */
4508 out:
4509 put_page(page);
4511 return ret;
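/*
 * An illustrative call, assuming the usual guest-side wrapper: 'pa' is the
 * guest-frame address of the 8-byte descriptor slot (translated above via
 * gmfn_to_mfn() and required to be naturally aligned), and 'desc' must
 * pass check_descriptor().  The store itself uses atomic_write64() so a
 * concurrently loaded GDT/LDT never observes a torn descriptor.
 *
 *     rc = HYPERVISOR_update_descriptor(gdt_pa + sel_offset, new_desc);
 */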
4514 typedef struct e820entry e820entry_t;
4515 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
4517 struct memory_map_context
4519 unsigned int n;
4520 unsigned long s;
4521 struct xen_memory_map map;
4522 };
4524 static int handle_iomem_range(unsigned long s, unsigned long e, void *p)
4526 struct memory_map_context *ctxt = p;
4528 if ( s > ctxt->s )
4530 e820entry_t ent;
4531 XEN_GUEST_HANDLE(e820entry_t) buffer;
4533 if ( ctxt->n + 1 >= ctxt->map.nr_entries )
4534 return -EINVAL;
4535 ent.addr = (uint64_t)ctxt->s << PAGE_SHIFT;
4536 ent.size = (uint64_t)(s - ctxt->s) << PAGE_SHIFT;
4537 ent.type = E820_RESERVED;
4538 buffer = guest_handle_cast(ctxt->map.buffer, e820entry_t);
4539 if ( __copy_to_guest_offset(buffer, ctxt->n, &ent, 1) )
4540 return -EFAULT;
4541 ctxt->n++;
4543 ctxt->s = e + 1;
4545 return 0;
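/*
 * handle_iomem_range() is invoked via rangeset_report_ranges() over the
 * caller's iomem_caps (see XENMEM_machine_memory_map below): 'ctxt->s'
 * tracks the first frame not yet covered, and whenever a permitted range
 * starts above it the hole [ctxt->s, s-1] is written to the guest buffer
 * as an E820_RESERVED entry, while the permitted range itself is skipped
 * (ctxt->s = e + 1).  For example, if the caller may access only MFNs
 * 0xa0-0xbf and 0xf00-0xfff, the frames from 0xc0 to 0xeff surface as a
 * reserved region in the returned machine memory map.
 */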
4548 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
4550 struct page_info *page = NULL;
4551 int rc;
4553 switch ( op )
4555 case XENMEM_add_to_physmap:
4557 struct xen_add_to_physmap xatp;
4558 unsigned long prev_mfn, mfn = 0, gpfn;
4559 struct domain *d;
4561 if ( copy_from_guest(&xatp, arg, 1) )
4562 return -EFAULT;
4564 rc = rcu_lock_target_domain_by_id(xatp.domid, &d);
4565 if ( rc != 0 )
4566 return rc;
4568 if ( xsm_add_to_physmap(current->domain, d) )
4570 rcu_unlock_domain(d);
4571 return -EPERM;
4574 switch ( xatp.space )
4576 case XENMAPSPACE_shared_info:
4577 if ( xatp.idx == 0 )
4578 mfn = virt_to_mfn(d->shared_info);
4579 break;
4580 case XENMAPSPACE_grant_table:
4581 spin_lock(&d->grant_table->lock);
4583 if ( d->grant_table->gt_version == 0 )
4584 d->grant_table->gt_version = 1;
4586 if ( d->grant_table->gt_version == 2 &&
4587 (xatp.idx & XENMAPIDX_grant_table_status) )
4589 xatp.idx &= ~XENMAPIDX_grant_table_status;
4590 if ( xatp.idx < nr_status_frames(d->grant_table) )
4591 mfn = virt_to_mfn(d->grant_table->status[xatp.idx]);
4593 else
4595 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
4596 (xatp.idx < max_nr_grant_frames) )
4597 gnttab_grow_table(d, xatp.idx + 1);
4599 if ( xatp.idx < nr_grant_frames(d->grant_table) )
4600 mfn = virt_to_mfn(d->grant_table->shared_raw[xatp.idx]);
4603 spin_unlock(&d->grant_table->lock);
4604 break;
4605 case XENMAPSPACE_gmfn:
4607 p2m_type_t p2mt;
4609 xatp.idx = mfn_x(gfn_to_mfn_unshare(p2m_get_hostp2m(d),
4610 xatp.idx, &p2mt, 0));
4611 /* If the page is still shared, exit early */
4612 if ( p2m_is_shared(p2mt) )
4614 rcu_unlock_domain(d);
4615 return -ENOMEM;
4617 if ( !get_page_from_pagenr(xatp.idx, d) )
4618 break;
4619 mfn = xatp.idx;
4620 page = mfn_to_page(mfn);
4621 break;
4623 default:
4624 break;
4627 if ( !paging_mode_translate(d) || (mfn == 0) )
4629 if ( page )
4630 put_page(page);
4631 rcu_unlock_domain(d);
4632 return -EINVAL;
4635 domain_lock(d);
4637 if ( page )
4638 put_page(page);
4640 /* Remove previously mapped page if it was present. */
4641 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
4642 if ( mfn_valid(prev_mfn) )
4644 if ( is_xen_heap_mfn(prev_mfn) )
4645 /* Xen heap frames are simply unhooked from this phys slot. */
4646 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
4647 else
4648 /* Normal domain memory is freed, to avoid leaking memory. */
4649 guest_remove_page(d, xatp.gpfn);
4652 /* Unmap from old location, if any. */
4653 gpfn = get_gpfn_from_mfn(mfn);
4654 ASSERT( gpfn != SHARED_M2P_ENTRY );
4655 if ( gpfn != INVALID_M2P_ENTRY )
4656 guest_physmap_remove_page(d, gpfn, mfn, 0);
4658 /* Map at new location. */
4659 rc = guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
4661 domain_unlock(d);
4663 rcu_unlock_domain(d);
4665 return rc;
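/*
 * Illustrative guest-side usage, assuming the usual HYPERVISOR_memory_op
 * wrapper and the public-header layout of struct xen_add_to_physmap: a
 * translated guest asks for its shared-info page, a grant-table frame, or
 * one of its own gmfns to appear at a chosen guest pfn.  The
 * !paging_mode_translate() check above makes the call fail with -EINVAL
 * for ordinary PV guests.
 *
 *     struct xen_add_to_physmap xatp = {
 *         .domid = DOMID_SELF,
 *         .space = XENMAPSPACE_shared_info,
 *         .idx   = 0,
 *         .gpfn  = chosen_gpfn,
 *     };
 *
 *     rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);
 */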
4668 case XENMEM_set_memory_map:
4670 struct xen_foreign_memory_map fmap;
4671 struct domain *d;
4672 struct e820entry *e820;
4674 if ( copy_from_guest(&fmap, arg, 1) )
4675 return -EFAULT;
4677 if ( fmap.map.nr_entries > E820MAX )
4678 return -EINVAL;
4680 rc = rcu_lock_target_domain_by_id(fmap.domid, &d);
4681 if ( rc != 0 )
4682 return rc;
4684 rc = xsm_domain_memory_map(d);
4685 if ( rc )
4687 rcu_unlock_domain(d);
4688 return rc;
4691 e820 = xmalloc_array(e820entry_t, fmap.map.nr_entries);
4692 if ( e820 == NULL )
4694 rcu_unlock_domain(d);
4695 return -ENOMEM;
4698 if ( copy_from_guest(e820, fmap.map.buffer, fmap.map.nr_entries) )
4700 xfree(e820);
4701 rcu_unlock_domain(d);
4702 return -EFAULT;
4705 spin_lock(&d->arch.e820_lock);
4706 xfree(d->arch.e820);
4707 d->arch.e820 = e820;
4708 d->arch.nr_e820 = fmap.map.nr_entries;
4709 spin_unlock(&d->arch.e820_lock);
4711 rcu_unlock_domain(d);
4712 return rc;
4715 case XENMEM_memory_map:
4717 struct xen_memory_map map;
4718 struct domain *d = current->domain;
4720 if ( copy_from_guest(&map, arg, 1) )
4721 return -EFAULT;
4723 spin_lock(&d->arch.e820_lock);
4725 /* Backwards compatibility. */
4726 if ( (d->arch.nr_e820 == 0) ||
4727 (d->arch.e820 == NULL) )
4729 spin_unlock(&d->arch.e820_lock);
4730 return -ENOSYS;
4733 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
4734 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
4735 copy_to_guest(arg, &map, 1) )
4737 spin_unlock(&d->arch.e820_lock);
4738 return -EFAULT;
4741 spin_unlock(&d->arch.e820_lock);
4742 return 0;
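/*
 * Illustrative guest-side query of the pseudo-physical map installed via
 * XENMEM_set_memory_map above, assuming the usual wrapper and the
 * set_xen_guest_handle() helper from the public headers; -ENOSYS is
 * returned when no map has been set for the domain:
 *
 *     struct e820entry ents[E820MAX];
 *     struct xen_memory_map memmap = { .nr_entries = E820MAX };
 *
 *     set_xen_guest_handle(memmap.buffer, ents);
 *     rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
 */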
4745 case XENMEM_machine_memory_map:
4747 struct memory_map_context ctxt;
4748 XEN_GUEST_HANDLE(e820entry_t) buffer;
4749 unsigned int i;
4751 if ( !IS_PRIV(current->domain) )
4752 return -EINVAL;
4754 rc = xsm_machine_memory_map();
4755 if ( rc )
4756 return rc;
4758 if ( copy_from_guest(&ctxt.map, arg, 1) )
4759 return -EFAULT;
4760 if ( ctxt.map.nr_entries < e820.nr_map + 1 )
4761 return -EINVAL;
4763 buffer = guest_handle_cast(ctxt.map.buffer, e820entry_t);
4764 if ( !guest_handle_okay(buffer, ctxt.map.nr_entries) )
4765 return -EFAULT;
4767 for ( i = 0, ctxt.n = 0, ctxt.s = 0; i < e820.nr_map; ++i, ++ctxt.n )
4769 unsigned long s = PFN_DOWN(e820.map[i].addr);
4771 if ( s )
4773 rc = rangeset_report_ranges(current->domain->iomem_caps,
4774 ctxt.s, s - 1,
4775 handle_iomem_range, &ctxt);
4776 if ( !rc )
4777 rc = handle_iomem_range(s, s, &ctxt);
4778 if ( rc )
4779 return rc;
4781 if ( ctxt.map.nr_entries <= ctxt.n + (e820.nr_map - i) )
4782 return -EINVAL;
4783 if ( __copy_to_guest_offset(buffer, ctxt.n, e820.map + i, 1) )
4784 return -EFAULT;
4785 ctxt.s = PFN_UP(e820.map[i].addr + e820.map[i].size);
4788 if ( ctxt.s )
4790 rc = rangeset_report_ranges(current->domain->iomem_caps, ctxt.s,
4791 ~0UL, handle_iomem_range, &ctxt);
4792 if ( !rc && ctxt.s )
4793 rc = handle_iomem_range(~0UL, ~0UL, &ctxt);
4794 if ( rc )
4795 return rc;
4798 ctxt.map.nr_entries = ctxt.n;
4800 if ( copy_to_guest(arg, &ctxt.map, 1) )
4801 return -EFAULT;
4803 return 0;
4806 case XENMEM_machphys_mapping:
4808 struct xen_machphys_mapping mapping = {
4809 .v_start = MACH2PHYS_VIRT_START,
4810 .v_end = MACH2PHYS_VIRT_END,
4811 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
4812 };
4814 if ( !mem_hotplug && current->domain == dom0 )
4815 mapping.max_mfn = max_page - 1;
4816 if ( copy_to_guest(arg, &mapping, 1) )
4817 return -EFAULT;
4819 return 0;
4822 case XENMEM_set_pod_target:
4823 case XENMEM_get_pod_target:
4825 xen_pod_target_t target;
4826 struct domain *d;
4827 struct p2m_domain *p2m;
4829 /* Support DOMID_SELF? */
4830 if ( !IS_PRIV(current->domain) )
4831 return -EINVAL;
4833 if ( copy_from_guest(&target, arg, 1) )
4834 return -EFAULT;
4836 rc = rcu_lock_target_domain_by_id(target.domid, &d);
4837 if ( rc != 0 )
4838 return rc;
4840 if ( op == XENMEM_set_pod_target )
4842 if ( target.target_pages > d->max_pages )
4844 rc = -EINVAL;
4845 goto pod_target_out_unlock;
4848 rc = p2m_pod_set_mem_target(d, target.target_pages);
4851 if ( rc == -EAGAIN )
4853 rc = hypercall_create_continuation(
4854 __HYPERVISOR_memory_op, "lh", op, arg);
4856 else if ( rc >= 0 )
4858 p2m = p2m_get_hostp2m(d);
4859 target.tot_pages = d->tot_pages;
4860 target.pod_cache_pages = p2m->pod.count;
4861 target.pod_entries = p2m->pod.entry_count;
4863 if ( copy_to_guest(arg, &target, 1) )
4865 rc = -EFAULT;
4866 goto pod_target_out_unlock;
4870 pod_target_out_unlock:
4871 rcu_unlock_domain(d);
4872 return rc;
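/*
 * Note on the populate-on-demand (PoD) target pair: the set operation
 * steers the PoD cache towards target_pages and may return a hypercall
 * continuation while p2m_pod_set_mem_target() keeps reporting -EAGAIN;
 * on success both set and get report back tot_pages together with the
 * current PoD cache size and the number of outstanding PoD entries in
 * the host p2m.
 */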
4875 #ifdef __x86_64__
4876 case XENMEM_get_sharing_freed_pages:
4877 return mem_sharing_get_nr_saved_mfns();
4878 #endif
4880 default:
4881 return subarch_memory_op(op, arg);
4884 return 0;
4888 /*************************
4889 * Writable Pagetables
4890 */
4892 struct ptwr_emulate_ctxt {
4893 struct x86_emulate_ctxt ctxt;
4894 unsigned long cr2;
4895 l1_pgentry_t pte;
4896 };
4898 static int ptwr_emulated_read(
4899 enum x86_segment seg,
4900 unsigned long offset,
4901 void *p_data,
4902 unsigned int bytes,
4903 struct x86_emulate_ctxt *ctxt)
4905 unsigned int rc;
4906 unsigned long addr = offset;
4908 if ( (rc = copy_from_user(p_data, (void *)addr, bytes)) != 0 )
4910 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
4911 return X86EMUL_EXCEPTION;
4914 return X86EMUL_OKAY;
4917 static int ptwr_emulated_update(
4918 unsigned long addr,
4919 paddr_t old,
4920 paddr_t val,
4921 unsigned int bytes,
4922 unsigned int do_cmpxchg,
4923 struct ptwr_emulate_ctxt *ptwr_ctxt)
4925 unsigned long mfn;
4926 unsigned long unaligned_addr = addr;
4927 struct page_info *page;
4928 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
4929 struct vcpu *v = current;
4930 struct domain *d = v->domain;
4932 /* Only allow naturally-aligned stores within the original %cr2 page. */
4933 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
4935 MEM_LOG("ptwr_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)",
4936 ptwr_ctxt->cr2, addr, bytes);
4937 return X86EMUL_UNHANDLEABLE;
4940 /* Turn a sub-word access into a full-word access. */
4941 if ( bytes != sizeof(paddr_t) )
4943 paddr_t full;
4944 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
4946 /* Align address; read full word. */
4947 addr &= ~(sizeof(paddr_t)-1);
4948 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
4950 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
4951 return X86EMUL_EXCEPTION;
4953 /* Mask out bits provided by caller. */
4954 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
4955 /* Shift the caller value and OR in the missing bits. */
4956 val &= (((paddr_t)1 << (bytes*8)) - 1);
4957 val <<= (offset)*8;
4958 val |= full;
4959 /* Also fill in missing parts of the cmpxchg old value. */
4960 old &= (((paddr_t)1 << (bytes*8)) - 1);
4961 old <<= (offset)*8;
4962 old |= full;
4965 pte = ptwr_ctxt->pte;
4966 mfn = l1e_get_pfn(pte);
4967 page = mfn_to_page(mfn);
4969 /* We are looking only for read-only mappings of p.t. pages. */
4970 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
4971 ASSERT(mfn_valid(mfn));
4972 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
4973 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
4974 ASSERT(page_get_owner(page) == d);
4976 /* Check the new PTE. */
4977 nl1e = l1e_from_intpte(val);
4978 switch ( get_page_from_l1e(nl1e, d, d) )
4980 case 0:
4981 if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
4982 !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
4984 /*
4985 * If this is an upper-half write to a PAE PTE then we assume that
4986 * the guest has simply got the two writes the wrong way round. We
4987 * zap the PRESENT bit on the assumption that the bottom half will
4988 * be written immediately after we return to the guest.
4989 */
4990 gdprintk(XENLOG_DEBUG, "ptwr_emulate: fixing up invalid PAE PTE %"
4991 PRIpte"\n", l1e_get_intpte(nl1e));
4992 l1e_remove_flags(nl1e, _PAGE_PRESENT);
4994 else
4996 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
4997 return X86EMUL_UNHANDLEABLE;
4999 break;
5000 case -1:
5001 l1e_remove_flags(nl1e, _PAGE_RW);
5002 break;
5005 adjust_guest_l1e(nl1e, d);
5007 /* Checked successfully: do the update (write or cmpxchg). */
5008 pl1e = map_domain_page(mfn);
5009 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
5010 if ( do_cmpxchg )
5012 int okay;
5013 intpte_t t = old;
5014 ol1e = l1e_from_intpte(old);
5016 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
5017 &t, l1e_get_intpte(nl1e), _mfn(mfn));
5018 okay = (okay && t == old);
5020 if ( !okay )
5022 unmap_domain_page(pl1e);
5023 put_page_from_l1e(nl1e, d);
5024 return X86EMUL_CMPXCHG_FAILED;
5027 else
5029 ol1e = *pl1e;
5030 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, 0) )
5031 BUG();
5034 trace_ptwr_emulation(addr, nl1e);
5036 unmap_domain_page(pl1e);
5038 /* Finally, drop the old PTE. */
5039 put_page_from_l1e(ol1e, d);
5041 return X86EMUL_OKAY;
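/*
 * The sub-word handling above widens a 1/2/4-byte store into a full
 * paddr_t update by reading the aligned word and splicing in the caller's
 * bytes.  Restated as a self-contained helper (hypothetical name; only
 * meaningful for bytes < sizeof(paddr_t), which the caller guarantees):
 *
 *     static paddr_t merge_subword(paddr_t full, paddr_t val,
 *                                  unsigned int bytes, unsigned int offset)
 *     {
 *         paddr_t mask = ((((paddr_t)1 << (bytes * 8)) - 1) << (offset * 8));
 *
 *         return (full & ~mask) | ((val << (offset * 8)) & mask);
 *     }
 *
 * The same splice is applied to 'old' so that a sub-word cmpxchg compares
 * against the complete original word.
 */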
5044 static int ptwr_emulated_write(
5045 enum x86_segment seg,
5046 unsigned long offset,
5047 void *p_data,
5048 unsigned int bytes,
5049 struct x86_emulate_ctxt *ctxt)
5051 paddr_t val = 0;
5053 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes - 1)) )
5055 MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)",
5056 offset, bytes);
5057 return X86EMUL_UNHANDLEABLE;
5060 memcpy(&val, p_data, bytes);
5062 return ptwr_emulated_update(
5063 offset, 0, val, bytes, 0,
5064 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
5067 static int ptwr_emulated_cmpxchg(
5068 enum x86_segment seg,
5069 unsigned long offset,
5070 void *p_old,
5071 void *p_new,
5072 unsigned int bytes,
5073 struct x86_emulate_ctxt *ctxt)
5075 paddr_t old = 0, new = 0;
5077 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes - 1)) )
5079 MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)",
5080 offset, bytes);
5081 return X86EMUL_UNHANDLEABLE;
5084 memcpy(&old, p_old, bytes);
5085 memcpy(&new, p_new, bytes);
5087 return ptwr_emulated_update(
5088 offset, old, new, bytes, 1,
5089 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
5092 static const struct x86_emulate_ops ptwr_emulate_ops = {
5093 .read = ptwr_emulated_read,
5094 .insn_fetch = ptwr_emulated_read,
5095 .write = ptwr_emulated_write,
5096 .cmpxchg = ptwr_emulated_cmpxchg,
5097 };
5099 /* Write page fault handler: check if guest is trying to modify a PTE. */
5100 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
5101 struct cpu_user_regs *regs)
5103 struct domain *d = v->domain;
5104 struct page_info *page;
5105 l1_pgentry_t pte;
5106 struct ptwr_emulate_ctxt ptwr_ctxt;
5107 int rc;
5109 /* Attempt to read the PTE that maps the VA being accessed. */
5110 guest_get_eff_l1e(v, addr, &pte);
5112 /* We are looking only for read-only mappings of p.t. pages. */
5113 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
5114 !get_page_from_pagenr(l1e_get_pfn(pte), d) )
5115 goto bail;
5117 page = l1e_get_page(pte);
5118 if ( !page_lock(page) )
5120 put_page(page);
5121 goto bail;
5124 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
5126 page_unlock(page);
5127 put_page(page);
5128 goto bail;
5131 ptwr_ctxt.ctxt.regs = regs;
5132 ptwr_ctxt.ctxt.force_writeback = 0;
5133 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
5134 is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
5135 ptwr_ctxt.cr2 = addr;
5136 ptwr_ctxt.pte = pte;
5138 rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
5140 page_unlock(page);
5141 put_page(page);
5143 if ( rc == X86EMUL_UNHANDLEABLE )
5144 goto bail;
5146 perfc_incr(ptwr_emulations);
5147 return EXCRET_fault_fixed;
5149 bail:
5150 return 0;
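/*
 * Summary of the fault path above: the PTE mapping the faulting VA must be
 * present but read-only, reference a frame owned by the faulting domain,
 * and that frame must currently be typed PGT_l1_page_table.  Only then is
 * the single faulting instruction emulated with ptwr_emulate_ops, so the
 * attempted PTE write is validated through get_page_from_l1e() instead of
 * hitting the page table directly.  Anything else takes the 'bail' path
 * and falls through to normal page-fault handling.
 */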
5153 void free_xen_pagetable(void *v)
5155 if ( system_state == SYS_STATE_early_boot )
5156 return;
5158 if ( is_xen_heap_page(virt_to_page(v)) )
5159 free_xenheap_page(v);
5160 else
5161 free_domheap_page(virt_to_page(v));
5164 /* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
5165 #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f))
5166 #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))
5168 /*
5169 * map_pages_to_xen() can be called with interrupts disabled:
5170 * * During early bootstrap; or
5171 * * alloc_xenheap_pages() via memguard_guard_range
5172 * In these cases it is safe to use flush_area_local():
5173 * * Because only the local CPU is online; or
5174 * * Because stale TLB entries do not matter for memguard_[un]guard_range().
5175 */
5176 #define flush_area(v,f) (!local_irq_is_enabled() ? \
5177 flush_area_local((const void *)v, f) : \
5178 flush_area_all((const void *)v, f))
5180 int map_pages_to_xen(
5181 unsigned long virt,
5182 unsigned long mfn,
5183 unsigned long nr_mfns,
5184 unsigned int flags)
5186 l2_pgentry_t *pl2e, ol2e;
5187 l1_pgentry_t *pl1e, ol1e;
5188 unsigned int i;
5190 while ( nr_mfns != 0 )
5192 #ifdef __x86_64__
5193 l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt);
5195 if ( !pl3e )
5196 return -ENOMEM;
5197 ol3e = *pl3e;
5199 if ( cpu_has_page1gb &&
5200 !(((virt >> PAGE_SHIFT) | mfn) &
5201 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) &&
5202 nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) &&
5203 !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) )
5205 /* 1GB-page mapping. */
5206 l3e_write_atomic(pl3e, l3e_from_pfn(mfn, l1f_to_lNf(flags)));
5208 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) )
5210 unsigned int flush_flags =
5211 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
5213 if ( l3e_get_flags(ol3e) & _PAGE_PSE )
5215 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
5216 flush_flags |= FLUSH_TLB_GLOBAL;
5217 if ( (lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
5218 PAGE_CACHE_ATTRS )
5219 flush_flags |= FLUSH_CACHE;
5220 flush_area(virt, flush_flags);
5222 else
5224 pl2e = l3e_to_l2e(ol3e);
5225 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
5227 ol2e = pl2e[i];
5228 if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
5229 continue;
5230 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
5232 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
5233 flush_flags |= FLUSH_TLB_GLOBAL;
5234 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
5235 PAGE_CACHE_ATTRS )
5236 flush_flags |= FLUSH_CACHE;
5238 else
5240 unsigned int j;
5242 pl1e = l2e_to_l1e(ol2e);
5243 for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ )
5245 ol1e = pl1e[j];
5246 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
5247 flush_flags |= FLUSH_TLB_GLOBAL;
5248 if ( (l1e_get_flags(ol1e) ^ flags) &
5249 PAGE_CACHE_ATTRS )
5250 flush_flags |= FLUSH_CACHE;
5254 flush_area(virt, flush_flags);
5255 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
5257 ol2e = pl2e[i];
5258 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) &&
5259 !(l2e_get_flags(ol2e) & _PAGE_PSE) )
5260 free_xen_pagetable(l2e_to_l1e(ol2e));
5262 free_xen_pagetable(pl2e);
5266 virt += 1UL << L3_PAGETABLE_SHIFT;
5267 mfn += 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
5268 nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
5269 continue;
5272 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) &&
5273 (l3e_get_flags(ol3e) & _PAGE_PSE) )
5275 unsigned int flush_flags =
5276 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
5278 /* Skip this PTE if there is no change. */
5279 if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES *
5280 L1_PAGETABLE_ENTRIES - 1)) +
5281 (l2_table_offset(virt) << PAGETABLE_ORDER) +
5282 l1_table_offset(virt) == mfn) &&
5283 ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
5284 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 )
5286 /* We can skip to end of L3 superpage if we got a match. */
5287 i = (1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) -
5288 (mfn & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
5289 if ( i > nr_mfns )
5290 i = nr_mfns;
5291 virt += i << PAGE_SHIFT;
5292 mfn += i;
5293 nr_mfns -= i;
5294 continue;
5297 pl2e = alloc_xen_pagetable();
5298 if ( pl2e == NULL )
5299 return -ENOMEM;
5301 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
5302 l2e_write(pl2e + i,
5303 l2e_from_pfn(l3e_get_pfn(ol3e) +
5304 (i << PAGETABLE_ORDER),
5305 l3e_get_flags(ol3e)));
5307 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
5308 flush_flags |= FLUSH_TLB_GLOBAL;
5310 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
5311 __PAGE_HYPERVISOR));
5312 flush_area(virt, flush_flags);
5314 #endif
5316 pl2e = virt_to_xen_l2e(virt);
5317 if ( !pl2e )
5318 return -ENOMEM;
5320 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
5321 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
5322 !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
5324 /* Super-page mapping. */
5325 ol2e = *pl2e;
5326 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_lNf(flags)));
5328 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
5330 unsigned int flush_flags =
5331 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
5333 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
5335 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
5336 flush_flags |= FLUSH_TLB_GLOBAL;
5337 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
5338 PAGE_CACHE_ATTRS )
5339 flush_flags |= FLUSH_CACHE;
5340 flush_area(virt, flush_flags);
5342 else
5344 pl1e = l2e_to_l1e(ol2e);
5345 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
5347 if ( l1e_get_flags(pl1e[i]) & _PAGE_GLOBAL )
5348 flush_flags |= FLUSH_TLB_GLOBAL;
5349 if ( (l1e_get_flags(pl1e[i]) ^ flags) &
5350 PAGE_CACHE_ATTRS )
5351 flush_flags |= FLUSH_CACHE;
5353 flush_area(virt, flush_flags);
5354 free_xen_pagetable(pl1e);
5358 virt += 1UL << L2_PAGETABLE_SHIFT;
5359 mfn += 1UL << PAGETABLE_ORDER;
5360 nr_mfns -= 1UL << PAGETABLE_ORDER;
5362 else
5364 /* Normal page mapping. */
5365 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
5367 pl1e = alloc_xen_pagetable();
5368 if ( pl1e == NULL )
5369 return -ENOMEM;
5370 clear_page(pl1e);
5371 l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
5372 __PAGE_HYPERVISOR));
5374 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
5376 unsigned int flush_flags =
5377 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
5379 /* Skip this PTE if there is no change. */
5380 if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
5381 l1_table_offset(virt)) == mfn) &&
5382 (((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
5383 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
5385 /* We can skip to end of L2 superpage if we got a match. */
5386 i = (1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) -
5387 (mfn & ((1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
5388 if ( i > nr_mfns )
5389 i = nr_mfns;
5390 virt += i << L1_PAGETABLE_SHIFT;
5391 mfn += i;
5392 nr_mfns -= i;
5393 goto check_l3;
5396 pl1e = alloc_xen_pagetable();
5397 if ( pl1e == NULL )
5398 return -ENOMEM;
5400 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
5401 l1e_write(&pl1e[i],
5402 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
5403 lNf_to_l1f(l2e_get_flags(*pl2e))));
5405 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
5406 flush_flags |= FLUSH_TLB_GLOBAL;
5408 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
5409 __PAGE_HYPERVISOR));
5410 flush_area(virt, flush_flags);
5413 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
5414 ol1e = *pl1e;
5415 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
5416 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
5418 unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
5419 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
5420 flush_flags |= FLUSH_TLB_GLOBAL;
5421 if ( (l1e_get_flags(ol1e) ^ flags) & PAGE_CACHE_ATTRS )
5422 flush_flags |= FLUSH_CACHE;
5423 flush_area(virt, flush_flags);
5426 virt += 1UL << L1_PAGETABLE_SHIFT;
5427 mfn += 1UL;
5428 nr_mfns -= 1UL;
5430 if ( (flags == PAGE_HYPERVISOR) &&
5431 ((nr_mfns == 0) ||
5432 ((((virt >> PAGE_SHIFT) | mfn) &
5433 ((1 << PAGETABLE_ORDER) - 1)) == 0)) )
5435 unsigned long base_mfn;
5436 pl1e = l2e_to_l1e(*pl2e);
5437 base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
5438 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
5439 if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
5440 (l1e_get_flags(*pl1e) != flags) )
5441 break;
5442 if ( i == L1_PAGETABLE_ENTRIES )
5444 ol2e = *pl2e;
5445 l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
5446 l1f_to_lNf(flags)));
5447 flush_area(virt - PAGE_SIZE,
5448 FLUSH_TLB_GLOBAL |
5449 FLUSH_ORDER(PAGETABLE_ORDER));
5450 free_xen_pagetable(l2e_to_l1e(ol2e));
5455 check_l3: ;
5456 #ifdef __x86_64__
5457 if ( cpu_has_page1gb &&
5458 (flags == PAGE_HYPERVISOR) &&
5459 ((nr_mfns == 0) ||
5460 !(((virt >> PAGE_SHIFT) | mfn) &
5461 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) )
5463 unsigned long base_mfn;
5465 ol3e = *pl3e;
5466 pl2e = l3e_to_l2e(ol3e);
5467 base_mfn = l2e_get_pfn(*pl2e) & ~(L2_PAGETABLE_ENTRIES *
5468 L1_PAGETABLE_ENTRIES - 1);
5469 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
5470 if ( (l2e_get_pfn(*pl2e) !=
5471 (base_mfn + (i << PAGETABLE_ORDER))) ||
5472 (l2e_get_flags(*pl2e) != l1f_to_lNf(flags)) )
5473 break;
5474 if ( i == L2_PAGETABLE_ENTRIES )
5476 l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn,
5477 l1f_to_lNf(flags)));
5478 flush_area(virt - PAGE_SIZE,
5479 FLUSH_TLB_GLOBAL |
5480 FLUSH_ORDER(2*PAGETABLE_ORDER));
5481 free_xen_pagetable(l3e_to_l2e(ol3e));
5484 #endif
5487 return 0;
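/*
 * Illustrative callers of map_pages_to_xen() (compare memguard_init()
 * below): plain PAGE_HYPERVISOR lets the code above use, and later
 * re-merge into, 2MB/1GB superpages, while MAP_SMALL_PAGES (or PAT-based
 * cache attributes in 'flags') forces 4kB mappings throughout.
 *
 *     rc = map_pages_to_xen(vaddr, mfn, nr_mfns, PAGE_HYPERVISOR);
 *     rc = map_pages_to_xen(vaddr, mfn, nr_mfns,
 *                           __PAGE_HYPERVISOR | MAP_SMALL_PAGES);
 */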
5490 void destroy_xen_mappings(unsigned long s, unsigned long e)
5492 l2_pgentry_t *pl2e;
5493 l1_pgentry_t *pl1e;
5494 unsigned int i;
5495 unsigned long v = s;
5497 ASSERT((s & ~PAGE_MASK) == 0);
5498 ASSERT((e & ~PAGE_MASK) == 0);
5500 while ( v < e )
5502 #ifdef __x86_64__
5503 l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
5505 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
5507 v += 1UL << L3_PAGETABLE_SHIFT;
5508 v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
5509 continue;
5512 if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
5514 if ( l2_table_offset(v) == 0 &&
5515 l1_table_offset(v) == 0 &&
5516 ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) )
5518 /* PAGE1GB: whole superpage is destroyed. */
5519 l3e_write_atomic(pl3e, l3e_empty());
5520 v += 1UL << L3_PAGETABLE_SHIFT;
5521 continue;
5524 /* PAGE1GB: shatter the superpage and fall through. */
5525 pl2e = alloc_xen_pagetable();
5526 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
5527 l2e_write(pl2e + i,
5528 l2e_from_pfn(l3e_get_pfn(*pl3e) +
5529 (i << PAGETABLE_ORDER),
5530 l3e_get_flags(*pl3e)));
5531 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
5532 __PAGE_HYPERVISOR));
5534 #endif
5536 pl2e = virt_to_xen_l2e(v);
5538 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
5540 v += 1UL << L2_PAGETABLE_SHIFT;
5541 v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
5542 continue;
5545 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
5547 if ( (l1_table_offset(v) == 0) &&
5548 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
5550 /* PSE: whole superpage is destroyed. */
5551 l2e_write_atomic(pl2e, l2e_empty());
5552 v += 1UL << L2_PAGETABLE_SHIFT;
5554 else
5556 /* PSE: shatter the superpage and try again. */
5557 pl1e = alloc_xen_pagetable();
5558 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
5559 l1e_write(&pl1e[i],
5560 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
5561 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
5562 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
5563 __PAGE_HYPERVISOR));
5566 else
5568 /* Ordinary 4kB mapping. */
5569 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
5570 l1e_write_atomic(pl1e, l1e_empty());
5571 v += PAGE_SIZE;
5573 /* If we are done with the L2E, check if it is now empty. */
5574 if ( (v != e) && (l1_table_offset(v) != 0) )
5575 continue;
5576 pl1e = l2e_to_l1e(*pl2e);
5577 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
5578 if ( l1e_get_intpte(pl1e[i]) != 0 )
5579 break;
5580 if ( i == L1_PAGETABLE_ENTRIES )
5582 /* Empty: zap the L2E and free the L1 page. */
5583 l2e_write_atomic(pl2e, l2e_empty());
5584 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
5585 free_xen_pagetable(pl1e);
5589 #ifdef __x86_64__
5590 /* If we are done with the L3E, check if it is now empty. */
5591 if ( (v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0) )
5592 continue;
5593 pl2e = l3e_to_l2e(*pl3e);
5594 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
5595 if ( l2e_get_intpte(pl2e[i]) != 0 )
5596 break;
5597 if ( i == L2_PAGETABLE_ENTRIES )
5599 /* Empty: zap the L3E and free the L2 page. */
5600 l3e_write_atomic(pl3e, l3e_empty());
5601 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
5602 free_xen_pagetable(pl2e);
5604 #endif
5607 flush_area(NULL, FLUSH_TLB_GLOBAL);
5610 void __set_fixmap(
5611 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
5613 BUG_ON(idx >= __end_of_fixed_addresses);
5614 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
5617 #ifdef MEMORY_GUARD
5619 void memguard_init(void)
5621 unsigned long start = max_t(unsigned long, xen_phys_start, 1UL << 20);
5622 #ifdef __i386__
5623 map_pages_to_xen(
5624 (unsigned long)__va(start),
5625 start >> PAGE_SHIFT,
5626 (xenheap_phys_end - start) >> PAGE_SHIFT,
5627 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
5628 #else
5629 map_pages_to_xen(
5630 (unsigned long)__va(start),
5631 start >> PAGE_SHIFT,
5632 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
5633 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
5634 BUG_ON(start != xen_phys_start);
5635 map_pages_to_xen(
5636 XEN_VIRT_START,
5637 start >> PAGE_SHIFT,
5638 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
5639 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
5640 #endif
5643 static void __memguard_change_range(void *p, unsigned long l, int guard)
5645 unsigned long _p = (unsigned long)p;
5646 unsigned long _l = (unsigned long)l;
5647 unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
5649 /* Ensure we are dealing with a page-aligned whole number of pages. */
5650 ASSERT((_p&~PAGE_MASK) == 0);
5651 ASSERT((_l&~PAGE_MASK) == 0);
5653 if ( guard )
5654 flags &= ~_PAGE_PRESENT;
5656 map_pages_to_xen(
5657 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
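/*
 * Guarding a range simply remaps it with _PAGE_PRESENT cleared (and with
 * MAP_SMALL_PAGES, so only the affected 4kB mappings are shattered), which
 * turns any stray access into a page fault; unguarding restores the normal
 * __PAGE_HYPERVISOR mapping.  memguard_guard_stack() below relies on this
 * to place a guard page just beneath each CPU's primary stack.
 */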
5660 void memguard_guard_range(void *p, unsigned long l)
5662 __memguard_change_range(p, l, 1);
5665 void memguard_unguard_range(void *p, unsigned long l)
5667 __memguard_change_range(p, l, 0);
5670 #endif
5672 void memguard_guard_stack(void *p)
5674 BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
5675 p = (void *)((unsigned long)p + STACK_SIZE -
5676 PRIMARY_STACK_SIZE - PAGE_SIZE);
5677 memguard_guard_range(p, PAGE_SIZE);
5680 void memguard_unguard_stack(void *p)
5682 p = (void *)((unsigned long)p + STACK_SIZE -
5683 PRIMARY_STACK_SIZE - PAGE_SIZE);
5684 memguard_unguard_range(p, PAGE_SIZE);
5687 /*
5688 * Local variables:
5689 * mode: C
5690 * c-set-style: "BSD"
5691 * c-basic-offset: 4
5692 * tab-width: 4
5693 * indent-tabs-mode: nil
5694 * End:
5695 */