debuggers.hg

view xen/arch/x86/mm.c @ 16959:ed8ab1a36b09

x86-64: use 1GB pages in 1:1 mapping if available

At the same time, adjust the 2/4Mb page handling slightly in a few places
(to match the newly added code):
- when re-creating a large page mapping after finding that all small
page mappings in the respective area are using identical flags and
suitable MFNs, the virtual address was already incremented past the
area to be dealt with, which needs to be accounted for in the
invocation of flush_area() in that path
- don't or-in/and-out _PAGE_PSE on non-present pages
- when comparing flags, try to minimise the number of l1f_to_lNf()/
lNf_to_l1f() instances used
- instead of skipping a single page when encountering a big page
mapping equal to what a small page mapping would establish, skip
to the next larger page boundary (see the sketch below)
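
A minimal sketch of that last point (an editor's illustration, not part of
the patch): instead of advancing one 4k page at a time across a superpage
that already provides the desired mapping, round the virtual address up
past that superpage. L2_PAGETABLE_SHIFT is assumed to have its usual Xen
meaning; the helper name is hypothetical.

    /* Advance 'virt' past the 2/4Mb superpage that contains it. */
    static inline unsigned long skip_past_superpage(unsigned long virt)
    {
        return (virt + (1UL << L2_PAGETABLE_SHIFT)) &
               ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
    }

The same arithmetic applies at the 1GB level with L3_PAGETABLE_SHIFT.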

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Jan 28 10:17:05 2008 +0000 (2008-01-28)
parents c360bb765b25
children e23144190f93
line source
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may be used in none of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
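/*
 * Editor's illustration (not part of the original file): how a PV guest
 * typically drives the interface described above. struct mmu_update and
 * HYPERVISOR_mmu_update() are taken from the public headers; the fragment
 * itself, including pte_maddr and new_val, is a hypothetical sketch.
 */
#if 0 /* illustrative only -- never compiled */
    struct mmu_update req;
    unsigned int done = 0;

    /* Request "*ptr = val": the PTE's machine address goes in the upper
     * bits of ptr, the low bits select MMU_NORMAL_PT_UPDATE. */
    req.ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
    req.val = new_val;
    if ( HYPERVISOR_mmu_update(&req, 1, &done, DOMID_SELF) < 0 )
        BUG(); /* Xen rejected the update (disallowed flags, refcounting, ...). */
#endif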
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <asm/shared.h>
112 #include <public/memory.h>
113 #include <xsm/xsm.h>
114 #include <xen/trace.h>
116 /*
117 * Mapping of first 2 or 4 megabytes of memory. This is mapped with 4kB
118 * mappings to avoid type conflicts with fixed-range MTRRs covering the
119 * lowest megabyte of physical memory. In any case the VGA hole should be
120 * mapped with type UC.
121 */
122 l1_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
123 l1_identmap[L1_PAGETABLE_ENTRIES];
125 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
127 /*
128 * PTE updates can be done with ordinary writes except:
129 * 1. Debug builds get extra checking by using CMPXCHG[8B].
130 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
131 */
132 #if !defined(NDEBUG) || defined(CONFIG_X86_PAE)
133 #define PTE_UPDATE_WITH_CMPXCHG
134 #endif
136 /* Used to defer flushing of memory structures. */
137 struct percpu_mm_info {
138 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
139 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
140 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
141 unsigned int deferred_ops;
142 /* If non-NULL, specifies a foreign subject domain for some operations. */
143 struct domain *foreign;
144 };
145 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
147 /*
148 * Returns the current foreign domain; defaults to the currently-executing
149 * domain if a foreign override hasn't been specified.
150 */
151 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
153 /* Private domain structs for DOMID_XEN and DOMID_IO. */
154 static struct domain *dom_xen, *dom_io;
156 /* Frame table and its size in pages. */
157 struct page_info *frame_table;
158 unsigned long max_page;
159 unsigned long total_pages;
161 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
163 #define l1_disallow_mask(d) \
164 ((d != dom_io) && \
165 (rangeset_is_empty((d)->iomem_caps) && \
166 rangeset_is_empty((d)->arch.ioport_caps)) ? \
167 L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
169 #ifdef CONFIG_COMPAT
170 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
171 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
172 L3_DISALLOW_MASK : \
173 COMPAT_L3_DISALLOW_MASK)
174 #else
175 #define l3_disallow_mask(d) L3_DISALLOW_MASK
176 #endif
178 static void queue_deferred_ops(struct domain *d, unsigned int ops)
179 {
180 ASSERT(d == current->domain);
181 this_cpu(percpu_mm_info).deferred_ops |= ops;
182 }
184 void __init init_frametable(void)
185 {
186 unsigned long nr_pages, page_step, i, mfn;
188 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
190 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
191 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
193 for ( i = 0; i < nr_pages; i += page_step )
194 {
195 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
196 if ( mfn == 0 )
197 panic("Not enough memory for frame table\n");
198 map_pages_to_xen(
199 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
200 mfn, page_step, PAGE_HYPERVISOR);
201 }
203 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
204 }
206 void __init arch_init_memory(void)
207 {
208 extern void subarch_init_memory(void);
210 unsigned long i, pfn, rstart_pfn, rend_pfn, ioend_pfn;
212 /*
213 * Initialise our DOMID_XEN domain.
214 * Any Xen-heap pages that we will allow to be mapped will have
215 * their domain field set to dom_xen.
216 */
217 dom_xen = alloc_domain(DOMID_XEN);
218 BUG_ON(dom_xen == NULL);
220 /*
221 * Initialise our DOMID_IO domain.
222 * This domain owns I/O pages that are within the range of the page_info
223 * array. Mappings occur at the privilege level of the caller.
224 */
225 dom_io = alloc_domain(DOMID_IO);
226 BUG_ON(dom_io == NULL);
228 /* First 1MB of RAM is historically marked as I/O. */
229 for ( i = 0; i < 0x100; i++ )
230 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
232 /* Any areas not specified as RAM by the e820 map are considered I/O. */
233 for ( i = 0, pfn = 0; pfn < max_page; i++ )
234 {
235 while ( (i < e820.nr_map) &&
236 (e820.map[i].type != E820_RAM) &&
237 (e820.map[i].type != E820_UNUSABLE) )
238 i++;
240 if ( i >= e820.nr_map )
241 {
242 /* No more RAM regions: mark as I/O right to end of memory map. */
243 rstart_pfn = rend_pfn = max_page;
244 }
245 else
246 {
248 /* Mark as I/O just up to the start of the next RAM region. */
248 rstart_pfn = min_t(unsigned long, max_page,
249 PFN_UP(e820.map[i].addr));
250 rend_pfn = max_t(unsigned long, rstart_pfn,
251 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
252 }
254 /*
255 * Make sure any Xen mappings are blown away.
256 * In particular this ensures that RAM holes are respected even in
257 * the statically-initialised 0-16MB mapping area.
258 */
259 ioend_pfn = rstart_pfn;
260 #if defined(CONFIG_X86_32)
261 ioend_pfn = min_t(unsigned long, ioend_pfn,
262 DIRECTMAP_MBYTES << (20 - PAGE_SHIFT));
263 #endif
264 if ( pfn < ioend_pfn )
265 destroy_xen_mappings((unsigned long)mfn_to_virt(pfn),
266 (unsigned long)mfn_to_virt(ioend_pfn));
268 /* Mark as I/O up to next RAM region. */
269 for ( ; pfn < rstart_pfn; pfn++ )
270 {
271 BUG_ON(!mfn_valid(pfn));
272 share_xen_page_with_guest(
273 mfn_to_page(pfn), dom_io, XENSHARE_writable);
274 }
276 /* Skip the RAM region. */
277 pfn = rend_pfn;
278 }
280 subarch_init_memory();
281 }
283 int memory_is_conventional_ram(paddr_t p)
284 {
285 int i;
287 for ( i = 0; i < e820.nr_map; i++ )
288 {
289 if ( (e820.map[i].type == E820_RAM) &&
290 (e820.map[i].addr <= p) &&
291 ((e820.map[i].addr + e820.map[i].size) > p) )
292 return 1;
293 }
295 return 0;
296 }
298 unsigned long domain_get_maximum_gpfn(struct domain *d)
299 {
300 if ( is_hvm_domain(d) )
301 return d->arch.p2m.max_mapped_pfn;
302 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
303 return arch_get_max_pfn(d) - 1;
304 }
306 void share_xen_page_with_guest(
307 struct page_info *page, struct domain *d, int readonly)
308 {
309 if ( page_get_owner(page) == d )
310 return;
312 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
314 spin_lock(&d->page_alloc_lock);
316 /* The incremented type count pins as writable or read-only. */
317 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
318 page->u.inuse.type_info |= PGT_validated | 1;
320 page_set_owner(page, d);
321 wmb(); /* install valid domain ptr before updating refcnt. */
322 ASSERT(page->count_info == 0);
324 /* Only add to the allocation list if the domain isn't dying. */
325 if ( !d->is_dying )
326 {
327 page->count_info |= PGC_allocated | 1;
328 if ( unlikely(d->xenheap_pages++ == 0) )
329 get_knownalive_domain(d);
330 list_add_tail(&page->list, &d->xenpage_list);
331 }
333 spin_unlock(&d->page_alloc_lock);
334 }
336 void share_xen_page_with_privileged_guests(
337 struct page_info *page, int readonly)
338 {
339 share_xen_page_with_guest(page, dom_xen, readonly);
340 }
342 #if defined(CONFIG_X86_PAE)
344 #ifdef NDEBUG
345 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
346 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
347 #else
348 /*
349 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
350 * We cannot safely shadow the idle page table, nor shadow (v1) page tables
351 * (detected by lack of an owning domain). As required for correctness, we
352 * always shadow PDPTs above 4GB.
353 */
354 #define l3tab_needs_shadow(mfn) \
355 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
356 (page_get_owner(mfn_to_page(mfn)) != NULL) && \
357 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
358 ((mfn) >= 0x100000))
359 #endif
361 static l1_pgentry_t *fix_pae_highmem_pl1e;
363 /* Cache the address of PAE high-memory fixmap page tables. */
364 static int __init cache_pae_fixmap_address(void)
365 {
366 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
367 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
368 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
369 return 0;
370 }
371 __initcall(cache_pae_fixmap_address);
373 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
375 void make_cr3(struct vcpu *v, unsigned long mfn)
376 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
377 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
378 {
379 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
380 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
381 unsigned int cpu = smp_processor_id();
383 /* Fast path: does this mfn need a shadow at all? */
384 if ( !l3tab_needs_shadow(mfn) )
385 {
386 v->arch.cr3 = mfn << PAGE_SHIFT;
387 /* Cache is no longer in use or valid */
388 cache->high_mfn = 0;
389 return;
390 }
392 /* Caching logic is not interrupt safe. */
393 ASSERT(!in_irq());
395 /* Protects against pae_flush_pgd(). */
396 spin_lock(&cache->lock);
398 cache->inuse_idx ^= 1;
399 cache->high_mfn = mfn;
401 /* Map the guest L3 table and copy to the chosen low-memory cache. */
402 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
403 /* First check the previous high mapping can't be in the TLB.
404 * (i.e. have we loaded CR3 since we last did this?) */
405 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
406 flush_tlb_one_local(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
407 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
408 lowmem_l3tab = cache->table[cache->inuse_idx];
409 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
410 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
411 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
413 v->arch.cr3 = __pa(lowmem_l3tab);
415 spin_unlock(&cache->lock);
416 }
418 #else /* !CONFIG_X86_PAE */
420 void make_cr3(struct vcpu *v, unsigned long mfn)
421 {
422 v->arch.cr3 = mfn << PAGE_SHIFT;
423 }
425 #endif /* !CONFIG_X86_PAE */
427 void write_ptbase(struct vcpu *v)
428 {
429 write_cr3(v->arch.cr3);
430 }
432 /*
433 * Should be called after CR3 is updated.
434 *
435 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
436 * for HVM guests, arch.monitor_table and hvm's guest CR3.
437 *
438 * Update ref counts to shadow tables appropriately.
439 */
440 void update_cr3(struct vcpu *v)
441 {
442 unsigned long cr3_mfn=0;
444 if ( paging_mode_enabled(v->domain) )
445 {
446 paging_update_cr3(v);
447 return;
448 }
450 #if CONFIG_PAGING_LEVELS == 4
451 if ( !(v->arch.flags & TF_kernel_mode) )
452 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
453 else
454 #endif
455 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
457 make_cr3(v, cr3_mfn);
458 }
461 static void invalidate_shadow_ldt(struct vcpu *v)
462 {
463 int i;
464 unsigned long pfn;
465 struct page_info *page;
467 if ( v->arch.shadow_ldt_mapcnt == 0 )
468 return;
470 v->arch.shadow_ldt_mapcnt = 0;
472 for ( i = 16; i < 32; i++ )
473 {
474 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
475 if ( pfn == 0 ) continue;
476 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
477 page = mfn_to_page(pfn);
478 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
479 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
480 put_page_and_type(page);
481 }
483 /* Dispose of the (now possibly invalid) mappings from the TLB. */
484 if ( v == current )
485 queue_deferred_ops(v->domain, DOP_FLUSH_TLB | DOP_RELOAD_LDT);
486 else
487 flush_tlb_mask(v->domain->domain_dirty_cpumask);
488 }
491 static int alloc_segdesc_page(struct page_info *page)
492 {
493 struct desc_struct *descs;
494 int i;
496 descs = map_domain_page(page_to_mfn(page));
498 for ( i = 0; i < 512; i++ )
499 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
500 goto fail;
502 unmap_domain_page(descs);
503 return 1;
505 fail:
506 unmap_domain_page(descs);
507 return 0;
508 }
511 /* Map shadow page at offset @off. */
512 int map_ldt_shadow_page(unsigned int off)
513 {
514 struct vcpu *v = current;
515 struct domain *d = v->domain;
516 unsigned long gmfn, mfn;
517 l1_pgentry_t l1e, nl1e;
518 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
519 int okay;
521 BUG_ON(unlikely(in_irq()));
523 guest_get_eff_kern_l1e(v, gva, &l1e);
524 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
525 return 0;
527 gmfn = l1e_get_pfn(l1e);
528 mfn = gmfn_to_mfn(d, gmfn);
529 if ( unlikely(!mfn_valid(mfn)) )
530 return 0;
532 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
533 if ( unlikely(!okay) )
534 return 0;
536 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
538 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
539 v->arch.shadow_ldt_mapcnt++;
541 return 1;
542 }
545 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
546 {
547 struct page_info *page = mfn_to_page(page_nr);
549 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
550 {
551 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
552 return 0;
553 }
555 return 1;
556 }
559 static int get_page_and_type_from_pagenr(unsigned long page_nr,
560 unsigned long type,
561 struct domain *d)
562 {
563 struct page_info *page = mfn_to_page(page_nr);
565 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
566 return 0;
568 if ( unlikely(!get_page_type(page, type)) )
569 {
570 put_page(page);
571 return 0;
572 }
574 return 1;
575 }
577 /*
578 * We allow root tables to map each other (a.k.a. linear page tables). It
579 * needs some special care with reference counts and access permissions:
580 * 1. The mapping entry must be read-only, or the guest may get write access
581 * to its own PTEs.
582 * 2. We must only bump the reference counts for an *already validated*
583 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
584 * on a validation that cannot complete until the current one does.
585 * 3. We only need to increment the reference counts for the mapped page
586 * frame if it is mapped by a different root table. This is sufficient and
587 * also necessary to allow validation of a root table mapping itself.
588 */
589 #define define_get_linear_pagetable(level) \
590 static int \
591 get_##level##_linear_pagetable( \
592 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
593 { \
594 unsigned long x, y; \
595 struct page_info *page; \
596 unsigned long pfn; \
597 \
598 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
599 { \
600 MEM_LOG("Attempt to create linear p.t. with write perms"); \
601 return 0; \
602 } \
603 \
604 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
605 { \
606 /* Make sure the mapped frame belongs to the correct domain. */ \
607 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
608 return 0; \
609 \
610 /* \
611 * Ensure that the mapped frame is an already-validated page table. \
612 * If so, atomically increment the count (checking for overflow). \
613 */ \
614 page = mfn_to_page(pfn); \
615 y = page->u.inuse.type_info; \
616 do { \
617 x = y; \
618 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
619 unlikely((x & (PGT_type_mask|PGT_validated)) != \
620 (PGT_##level##_page_table|PGT_validated)) ) \
621 { \
622 put_page(page); \
623 return 0; \
624 } \
625 } \
626 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
627 } \
628 \
629 return 1; \
630 }
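/*
 * (Editor's note, not in the original file.) The macro above is instantiated
 * below as get_l2_linear_pagetable(), get_l3_linear_pagetable() and
 * get_l4_linear_pagetable(); each get_page_from_lNe() falls back to the
 * corresponding variant when taking an ordinary page-table reference fails,
 * i.e. when the entry may denote a linear (recursive) page-table mapping.
 */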
633 int is_iomem_page(unsigned long mfn)
634 {
635 return (!mfn_valid(mfn) || (page_get_owner(mfn_to_page(mfn)) == dom_io));
636 }
639 int
640 get_page_from_l1e(
641 l1_pgentry_t l1e, struct domain *d)
642 {
643 unsigned long mfn = l1e_get_pfn(l1e);
644 struct page_info *page = mfn_to_page(mfn);
645 uint32_t l1f = l1e_get_flags(l1e);
646 struct vcpu *curr = current;
647 int okay;
649 if ( !(l1f & _PAGE_PRESENT) )
650 return 1;
652 if ( unlikely(l1f & l1_disallow_mask(d)) )
653 {
654 MEM_LOG("Bad L1 flags %x", l1f & l1_disallow_mask(d));
655 return 0;
656 }
658 if ( is_iomem_page(mfn) )
659 {
660 /* DOMID_IO reverts to caller for privilege checks. */
661 if ( d == dom_io )
662 d = curr->domain;
664 if ( !iomem_access_permitted(d, mfn, mfn) )
665 {
666 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
667 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
668 d->domain_id, mfn);
669 return 0;
670 }
672 return 1;
673 }
675 /* Foreign mappings into guests in shadow external mode don't
676 * contribute to writeable mapping refcounts. (This allows the
677 * qemu-dm helper process in dom0 to map the domain's memory without
678 * messing up the count of "real" writable mappings.) */
679 okay = (((l1f & _PAGE_RW) &&
680 !(unlikely(paging_mode_external(d) && (d != curr->domain))))
681 ? get_page_and_type(page, d, PGT_writable_page)
682 : get_page(page, d));
683 if ( !okay )
684 {
685 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
686 " for dom%d",
687 mfn, get_gpfn_from_mfn(mfn),
688 l1e_get_intpte(l1e), d->domain_id);
689 }
690 else if ( pte_flags_to_cacheattr(l1f) !=
691 ((page->count_info >> PGC_cacheattr_base) & 7) )
692 {
693 uint32_t x, nx, y = page->count_info;
694 uint32_t cacheattr = pte_flags_to_cacheattr(l1f);
696 if ( is_xen_heap_page(page) )
697 {
698 if ( (l1f & _PAGE_RW) &&
699 !(unlikely(paging_mode_external(d) &&
700 (d != curr->domain))) )
701 put_page_type(page);
702 put_page(page);
703 MEM_LOG("Attempt to change cache attributes of Xen heap page");
704 return 0;
705 }
707 while ( ((y >> PGC_cacheattr_base) & 7) != cacheattr )
708 {
709 x = y;
710 nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
711 y = cmpxchg(&page->count_info, x, nx);
712 }
714 #ifdef __x86_64__
715 map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
716 PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
717 #endif
718 }
720 return okay;
721 }
724 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
725 define_get_linear_pagetable(l2);
726 static int
727 get_page_from_l2e(
728 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
729 {
730 int rc;
732 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
733 return 1;
735 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
736 {
737 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
738 return 0;
739 }
741 rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d);
742 if ( unlikely(!rc) )
743 rc = get_l2_linear_pagetable(l2e, pfn, d);
745 return rc;
746 }
749 #if CONFIG_PAGING_LEVELS >= 3
750 define_get_linear_pagetable(l3);
751 static int
752 get_page_from_l3e(
753 l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
754 {
755 int rc;
757 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
758 return 1;
760 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
761 {
762 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
763 return 0;
764 }
766 rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d);
767 if ( unlikely(!rc) )
768 rc = get_l3_linear_pagetable(l3e, pfn, d);
770 return rc;
771 }
772 #endif /* 3 level */
774 #if CONFIG_PAGING_LEVELS >= 4
775 define_get_linear_pagetable(l4);
776 static int
777 get_page_from_l4e(
778 l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
779 {
780 int rc;
782 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
783 return 1;
785 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
786 {
787 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
788 return 0;
789 }
791 rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d);
792 if ( unlikely(!rc) )
793 rc = get_l4_linear_pagetable(l4e, pfn, d);
795 return rc;
796 }
797 #endif /* 4 level */
799 #ifdef __x86_64__
801 #ifdef USER_MAPPINGS_ARE_GLOBAL
802 #define adjust_guest_l1e(pl1e, d) \
803 do { \
804 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
805 likely(!is_pv_32on64_domain(d)) ) \
806 { \
807 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
808 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
809 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
810 MEM_LOG("Global bit is set to kernel page %lx", \
811 l1e_get_pfn((pl1e))); \
812 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
813 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
814 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
815 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
816 } \
817 } while ( 0 )
818 #else
819 #define adjust_guest_l1e(pl1e, d) \
820 do { \
821 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
822 likely(!is_pv_32on64_domain(d)) ) \
823 l1e_add_flags((pl1e), _PAGE_USER); \
824 } while ( 0 )
825 #endif
827 #define adjust_guest_l2e(pl2e, d) \
828 do { \
829 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
830 likely(!is_pv_32on64_domain(d)) ) \
831 l2e_add_flags((pl2e), _PAGE_USER); \
832 } while ( 0 )
834 #define adjust_guest_l3e(pl3e, d) \
835 do { \
836 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
837 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
838 _PAGE_USER : \
839 _PAGE_USER|_PAGE_RW); \
840 } while ( 0 )
842 #define adjust_guest_l4e(pl4e, d) \
843 do { \
844 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
845 likely(!is_pv_32on64_domain(d)) ) \
846 l4e_add_flags((pl4e), _PAGE_USER); \
847 } while ( 0 )
849 #else /* !defined(__x86_64__) */
851 #define adjust_guest_l1e(_p, _d) ((void)(_d))
852 #define adjust_guest_l2e(_p, _d) ((void)(_d))
853 #define adjust_guest_l3e(_p, _d) ((void)(_d))
855 #endif
857 #ifdef CONFIG_COMPAT
858 #define unadjust_guest_l3e(pl3e, d) \
859 do { \
860 if ( unlikely(is_pv_32on64_domain(d)) && \
861 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
862 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
863 } while ( 0 )
864 #else
865 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
866 #endif
868 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
869 {
870 unsigned long pfn = l1e_get_pfn(l1e);
871 struct page_info *page;
872 struct domain *e;
873 struct vcpu *v;
875 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(pfn) )
876 return;
878 page = mfn_to_page(pfn);
880 e = page_get_owner(page);
882 /*
883 * Check if this is a mapping that was established via a grant reference.
884 * If it was then we should not be here: we require that such mappings are
885 * explicitly destroyed via the grant-table interface.
886 *
887 * The upshot of this is that the guest can end up with active grants that
888 * it cannot destroy (because it no longer has a PTE to present to the
889 * grant-table interface). This can lead to subtle hard-to-catch bugs,
890 * hence a special grant PTE flag can be enabled to catch the bug early.
891 *
892 * (Note that the undestroyable active grants are not a security hole in
893 * Xen. All active grants can safely be cleaned up when the domain dies.)
894 */
895 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
896 !d->is_shutting_down && !d->is_dying )
897 {
898 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
899 l1e_get_intpte(l1e));
900 domain_crash(d);
901 }
903 /* Remember we didn't take a type-count of foreign writable mappings
904 * to paging-external domains */
905 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
906 !(unlikely((e != d) && paging_mode_external(e))) )
907 {
908 put_page_and_type(page);
909 }
910 else
911 {
912 /* We expect this is rare so we blow the entire shadow LDT. */
913 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
914 PGT_ldt_page)) &&
915 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
916 (d == e) )
917 {
918 for_each_vcpu ( d, v )
919 invalidate_shadow_ldt(v);
920 }
921 put_page(page);
922 }
923 }
926 /*
927 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
928 * Note also that this automatically deals correctly with linear p.t.'s.
929 */
930 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
931 {
932 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
933 (l2e_get_pfn(l2e) != pfn) )
934 put_page_and_type(l2e_get_page(l2e));
935 }
938 #if CONFIG_PAGING_LEVELS >= 3
939 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
940 {
941 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
942 (l3e_get_pfn(l3e) != pfn) )
943 put_page_and_type(l3e_get_page(l3e));
944 }
945 #endif
947 #if CONFIG_PAGING_LEVELS >= 4
948 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
949 {
950 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
951 (l4e_get_pfn(l4e) != pfn) )
952 put_page_and_type(l4e_get_page(l4e));
953 }
954 #endif
956 static int alloc_l1_table(struct page_info *page)
957 {
958 struct domain *d = page_get_owner(page);
959 unsigned long pfn = page_to_mfn(page);
960 l1_pgentry_t *pl1e;
961 int i;
963 pl1e = map_domain_page(pfn);
965 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
966 {
967 if ( is_guest_l1_slot(i) &&
968 unlikely(!get_page_from_l1e(pl1e[i], d)) )
969 goto fail;
971 adjust_guest_l1e(pl1e[i], d);
972 }
974 unmap_domain_page(pl1e);
975 return 1;
977 fail:
978 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
979 while ( i-- > 0 )
980 if ( is_guest_l1_slot(i) )
981 put_page_from_l1e(pl1e[i], d);
983 unmap_domain_page(pl1e);
984 return 0;
985 }
987 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
988 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
989 {
990 struct page_info *page;
991 l2_pgentry_t *pl2e;
992 l3_pgentry_t l3e3;
993 #ifndef CONFIG_COMPAT
994 l2_pgentry_t l2e;
995 int i;
996 #endif
998 if ( !is_pv_32bit_domain(d) )
999 return 1;
1001 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
1003 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
1004 l3e3 = pl3e[3];
1005 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
1006 {
1007 MEM_LOG("PAE L3 3rd slot is empty");
1008 return 0;
1009 }
1011 /*
1012 * The Xen-private mappings include linear mappings. The L2 thus cannot
1013 * be shared by multiple L3 tables. The test here is adequate because:
1014 * 1. Cannot appear in slots != 3 because get_page_type() checks the
1015 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
1016 * 2. Cannot appear in another page table's L3:
1017 * a. alloc_l3_table() calls this function and this check will fail
1018 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
1019 */
1020 page = l3e_get_page(l3e3);
1021 BUG_ON(page->u.inuse.type_info & PGT_pinned);
1022 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1023 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1024 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1025 {
1026 MEM_LOG("PAE L3 3rd slot is shared");
1027 return 0;
1028 }
1030 /* Xen private mappings. */
1031 pl2e = map_domain_page(l3e_get_pfn(l3e3));
1032 #ifndef CONFIG_COMPAT
1033 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1034 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1035 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1036 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1038 l2e = l2e_from_page(
1039 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1040 __PAGE_HYPERVISOR);
1041 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
1043 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
1045 l2e = l2e_empty();
1046 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
1047 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
1048 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
1050 #else
1051 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1052 &compat_idle_pg_table_l2[
1053 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1054 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
1055 #endif
1056 unmap_domain_page(pl2e);
1058 return 1;
1060 #else
1061 # define create_pae_xen_mappings(d, pl3e) (1)
1062 #endif
1064 #ifdef CONFIG_X86_PAE
1065 /* Flush a pgdir update into low-memory caches. */
1066 static void pae_flush_pgd(
1067 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
1069 struct domain *d = page_get_owner(mfn_to_page(mfn));
1070 struct vcpu *v;
1071 intpte_t _ol3e, _nl3e, _pl3e;
1072 l3_pgentry_t *l3tab_ptr;
1073 struct pae_l3_cache *cache;
1075 if ( unlikely(shadow_mode_enabled(d)) )
1077 cpumask_t m = CPU_MASK_NONE;
1078 /* Re-shadow this l3 table on any vcpus that are using it */
1079 for_each_vcpu ( d, v )
1080 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
1082 paging_update_cr3(v);
1083 cpus_or(m, m, v->vcpu_dirty_cpumask);
1085 flush_tlb_mask(m);
1088 /* If below 4GB then the pgdir is not shadowed in low memory. */
1089 if ( !l3tab_needs_shadow(mfn) )
1090 return;
1092 for_each_vcpu ( d, v )
1094 cache = &v->arch.pae_l3_cache;
1096 spin_lock(&cache->lock);
1098 if ( cache->high_mfn == mfn )
1100 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1101 _ol3e = l3e_get_intpte(*l3tab_ptr);
1102 _nl3e = l3e_get_intpte(nl3e);
1103 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1104 BUG_ON(_pl3e != _ol3e);
1107 spin_unlock(&cache->lock);
1110 flush_tlb_mask(d->domain_dirty_cpumask);
1112 #else
1113 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1114 #endif
1116 static int alloc_l2_table(struct page_info *page, unsigned long type)
1118 struct domain *d = page_get_owner(page);
1119 unsigned long pfn = page_to_mfn(page);
1120 l2_pgentry_t *pl2e;
1121 int i;
1123 pl2e = map_domain_page(pfn);
1125 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1127 if ( is_guest_l2_slot(d, type, i) &&
1128 unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) )
1129 goto fail;
1131 adjust_guest_l2e(pl2e[i], d);
1134 #if CONFIG_PAGING_LEVELS == 2
1135 /* Xen private mappings. */
1136 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1137 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1138 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1139 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1140 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
1141 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1142 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1143 l2e_from_page(
1144 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1145 __PAGE_HYPERVISOR);
1146 #endif
1148 unmap_domain_page(pl2e);
1149 return 1;
1151 fail:
1152 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1153 while ( i-- > 0 )
1154 if ( is_guest_l2_slot(d, type, i) )
1155 put_page_from_l2e(pl2e[i], pfn);
1157 unmap_domain_page(pl2e);
1158 return 0;
1162 #if CONFIG_PAGING_LEVELS >= 3
1163 static int alloc_l3_table(struct page_info *page)
1165 struct domain *d = page_get_owner(page);
1166 unsigned long pfn = page_to_mfn(page);
1167 l3_pgentry_t *pl3e;
1168 int i;
1170 #ifdef CONFIG_X86_PAE
1171 /*
1172 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1173 * the weird 'extended cr3' format for dealing with high-order address
1174 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1175 */
1176 if ( (pfn >= 0x100000) &&
1177 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1178 d->vcpu[0] && d->vcpu[0]->is_initialised )
1180 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1181 return 0;
1183 #endif
1185 pl3e = map_domain_page(pfn);
1187 /*
1188 * PAE guests allocate full pages, but aren't required to initialize
1189 * more than the first four entries; when running in compatibility
1190 * mode, however, the full page is visible to the MMU, and hence all
1191 * 512 entries must be valid/verified, which is most easily achieved
1192 * by clearing them out.
1193 */
1194 if ( is_pv_32on64_domain(d) )
1195 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1197 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1199 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
1200 if ( is_pv_32bit_domain(d) && (i == 3) )
1201 {
1202 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1203 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ||
1204 !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1205 PGT_l2_page_table |
1206 PGT_pae_xen_l2,
1207 d) )
1208 goto fail;
1209 }
1210 else
1211 #endif
1212 if ( is_guest_l3_slot(i) &&
1213 unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
1214 goto fail;
1216 adjust_guest_l3e(pl3e[i], d);
1219 if ( !create_pae_xen_mappings(d, pl3e) )
1220 goto fail;
1222 unmap_domain_page(pl3e);
1223 return 1;
1225 fail:
1226 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1227 while ( i-- > 0 )
1228 if ( is_guest_l3_slot(i) )
1229 put_page_from_l3e(pl3e[i], pfn);
1231 unmap_domain_page(pl3e);
1232 return 0;
1234 #else
1235 #define alloc_l3_table(page) (0)
1236 #endif
1238 #if CONFIG_PAGING_LEVELS >= 4
1239 static int alloc_l4_table(struct page_info *page)
1241 struct domain *d = page_get_owner(page);
1242 unsigned long pfn = page_to_mfn(page);
1243 l4_pgentry_t *pl4e = page_to_virt(page);
1244 int i;
1246 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1248 if ( is_guest_l4_slot(d, i) &&
1249 unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
1250 goto fail;
1252 adjust_guest_l4e(pl4e[i], d);
1255 /* Xen private mappings. */
1256 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1257 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1258 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1259 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1260 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1261 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1262 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1263 __PAGE_HYPERVISOR);
1264 if ( is_pv_32on64_domain(d) )
1265 pl4e[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
1266 l4e_from_page(virt_to_page(d->arch.mm_arg_xlat_l3),
1267 __PAGE_HYPERVISOR);
1269 return 1;
1271 fail:
1272 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1273 while ( i-- > 0 )
1274 if ( is_guest_l4_slot(d, i) )
1275 put_page_from_l4e(pl4e[i], pfn);
1277 return 0;
1279 #else
1280 #define alloc_l4_table(page) (0)
1281 #endif
1284 static void free_l1_table(struct page_info *page)
1286 struct domain *d = page_get_owner(page);
1287 unsigned long pfn = page_to_mfn(page);
1288 l1_pgentry_t *pl1e;
1289 int i;
1291 pl1e = map_domain_page(pfn);
1293 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1294 if ( is_guest_l1_slot(i) )
1295 put_page_from_l1e(pl1e[i], d);
1297 unmap_domain_page(pl1e);
1301 static void free_l2_table(struct page_info *page)
1303 #ifdef CONFIG_COMPAT
1304 struct domain *d = page_get_owner(page);
1305 #endif
1306 unsigned long pfn = page_to_mfn(page);
1307 l2_pgentry_t *pl2e;
1308 int i;
1310 pl2e = map_domain_page(pfn);
1312 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1313 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
1314 put_page_from_l2e(pl2e[i], pfn);
1316 unmap_domain_page(pl2e);
1318 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1322 #if CONFIG_PAGING_LEVELS >= 3
1324 static void free_l3_table(struct page_info *page)
1326 struct domain *d = page_get_owner(page);
1327 unsigned long pfn = page_to_mfn(page);
1328 l3_pgentry_t *pl3e;
1329 int i;
1331 pl3e = map_domain_page(pfn);
1333 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1334 if ( is_guest_l3_slot(i) )
1336 put_page_from_l3e(pl3e[i], pfn);
1337 unadjust_guest_l3e(pl3e[i], d);
1340 unmap_domain_page(pl3e);
1343 #endif
1345 #if CONFIG_PAGING_LEVELS >= 4
1347 static void free_l4_table(struct page_info *page)
1349 struct domain *d = page_get_owner(page);
1350 unsigned long pfn = page_to_mfn(page);
1351 l4_pgentry_t *pl4e = page_to_virt(page);
1352 int i;
1354 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1355 if ( is_guest_l4_slot(d, i) )
1356 put_page_from_l4e(pl4e[i], pfn);
1359 #endif
1362 /* How to write an entry to the guest pagetables.
1363 * Returns 0 for failure (pointer not valid), 1 for success. */
1364 static inline int update_intpte(intpte_t *p,
1365 intpte_t old,
1366 intpte_t new,
1367 unsigned long mfn,
1368 struct vcpu *v,
1369 int preserve_ad)
1371 int rv = 1;
1372 #ifndef PTE_UPDATE_WITH_CMPXCHG
1373 if ( !preserve_ad )
1375 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1377 else
1378 #endif
1380 intpte_t t = old;
1381 for ( ; ; )
1383 intpte_t _new = new;
1384 if ( preserve_ad )
1385 _new |= old & (_PAGE_ACCESSED | _PAGE_DIRTY);
1387 rv = paging_cmpxchg_guest_entry(v, p, &t, _new, _mfn(mfn));
1388 if ( unlikely(rv == 0) )
1390 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1391 ": saw %" PRIpte, old, _new, t);
1392 break;
1395 if ( t == old )
1396 break;
1398 /* Allowed to change in Accessed/Dirty flags only. */
1399 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1401 old = t;
1404 return rv;
1407 /* Macro that wraps the appropriate type-changes around update_intpte().
1408 * Arguments are: type, ptr, old, new, mfn, vcpu, preserve_ad */
1409 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad) \
1410 update_intpte(&_t ## e_get_intpte(*(_p)), \
1411 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1412 (_m), (_v), (_ad))
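/*
 * (Editor's note, not in the original file.) Example expansion of the l1
 * variant, to make the token pasting concrete:
 *   UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, preserve_ad)
 * becomes
 *   update_intpte(&l1e_get_intpte(*(pl1e)),
 *                 l1e_get_intpte(ol1e), l1e_get_intpte(nl1e),
 *                 (gl1mfn), (curr), (preserve_ad))
 */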
1414 /* Update the L1 entry at pl1e to new value nl1e. */
1415 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1416 unsigned long gl1mfn, int preserve_ad)
1418 l1_pgentry_t ol1e;
1419 struct vcpu *curr = current;
1420 struct domain *d = curr->domain;
1421 unsigned long mfn;
1423 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1424 return 0;
1426 if ( unlikely(paging_mode_refcounts(d)) )
1427 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, preserve_ad);
1429 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1431 /* Translate foreign guest addresses. */
1432 mfn = gmfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e));
1433 if ( unlikely(mfn == INVALID_MFN) )
1434 return 0;
1435 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1436 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1438 if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(d)) )
1440 MEM_LOG("Bad L1 flags %x",
1441 l1e_get_flags(nl1e) & l1_disallow_mask(d));
1442 return 0;
1445 adjust_guest_l1e(nl1e, d);
1447 /* Fast path for identical mapping, r/w and presence. */
1448 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1449 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1450 preserve_ad);
1452 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1453 return 0;
1455 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1456 preserve_ad)) )
1458 put_page_from_l1e(nl1e, d);
1459 return 0;
1462 else
1464 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1465 preserve_ad)) )
1466 return 0;
1469 put_page_from_l1e(ol1e, d);
1470 return 1;
1474 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1475 static int mod_l2_entry(l2_pgentry_t *pl2e,
1476 l2_pgentry_t nl2e,
1477 unsigned long pfn,
1478 unsigned long type,
1479 int preserve_ad)
1481 l2_pgentry_t ol2e;
1482 struct vcpu *curr = current;
1483 struct domain *d = curr->domain;
1485 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1487 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1488 return 0;
1491 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1492 return 0;
1494 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1496 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1498 MEM_LOG("Bad L2 flags %x",
1499 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1500 return 0;
1503 adjust_guest_l2e(nl2e, d);
1505 /* Fast path for identical mapping and presence. */
1506 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1507 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr, preserve_ad);
1509 if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) )
1510 return 0;
1512 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
1513 preserve_ad)) )
1515 put_page_from_l2e(nl2e, pfn);
1516 return 0;
1519 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
1520 preserve_ad)) )
1522 return 0;
1525 put_page_from_l2e(ol2e, pfn);
1526 return 1;
1529 #if CONFIG_PAGING_LEVELS >= 3
1531 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1532 static int mod_l3_entry(l3_pgentry_t *pl3e,
1533 l3_pgentry_t nl3e,
1534 unsigned long pfn,
1535 int preserve_ad)
1537 l3_pgentry_t ol3e;
1538 struct vcpu *curr = current;
1539 struct domain *d = curr->domain;
1540 int okay;
1542 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1544 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1545 return 0;
1548 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
1549 /*
1550 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1551 * would be a pain to ensure they remain continuously valid throughout.
1552 */
1553 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1554 return 0;
1555 #endif
1557 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1558 return 0;
1560 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1562 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1564 MEM_LOG("Bad L3 flags %x",
1565 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1566 return 0;
1569 adjust_guest_l3e(nl3e, d);
1571 /* Fast path for identical mapping and presence. */
1572 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1573 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad);
1575 if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) )
1576 return 0;
1578 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
1579 preserve_ad)) )
1581 put_page_from_l3e(nl3e, pfn);
1582 return 0;
1585 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
1586 preserve_ad)) )
1588 return 0;
1591 okay = create_pae_xen_mappings(d, pl3e);
1592 BUG_ON(!okay);
1594 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1596 put_page_from_l3e(ol3e, pfn);
1597 return 1;
1600 #endif
1602 #if CONFIG_PAGING_LEVELS >= 4
1604 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1605 static int mod_l4_entry(l4_pgentry_t *pl4e,
1606 l4_pgentry_t nl4e,
1607 unsigned long pfn,
1608 int preserve_ad)
1610 struct vcpu *curr = current;
1611 struct domain *d = curr->domain;
1612 l4_pgentry_t ol4e;
1614 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1616 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1617 return 0;
1620 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1621 return 0;
1623 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1625 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1627 MEM_LOG("Bad L4 flags %x",
1628 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1629 return 0;
1632 adjust_guest_l4e(nl4e, d);
1634 /* Fast path for identical mapping and presence. */
1635 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1636 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad);
1638 if ( unlikely(!get_page_from_l4e(nl4e, pfn, d)) )
1639 return 0;
1641 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
1642 preserve_ad)) )
1644 put_page_from_l4e(nl4e, pfn);
1645 return 0;
1648 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
1649 preserve_ad)) )
1651 return 0;
1654 put_page_from_l4e(ol4e, pfn);
1655 return 1;
1658 #endif
1660 void put_page(struct page_info *page)
1662 u32 nx, x, y = page->count_info;
1664 do {
1665 x = y;
1666 nx = x - 1;
1668 while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) );
1670 if ( unlikely((nx & PGC_count_mask) == 0) )
1672 cleanup_page_cacheattr(page);
1673 free_domheap_page(page);
1678 int get_page(struct page_info *page, struct domain *domain)
1680 u32 x, nx, y = page->count_info;
1681 u32 d, nd = page->u.inuse._domain;
1682 u32 _domain = pickle_domptr(domain);
1684 do {
1685 x = y;
1686 nx = x + 1;
1687 d = nd;
1688 if ( unlikely((x & PGC_count_mask) == 0) || /* Not allocated? */
1689 unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
1690 unlikely(d != _domain) ) /* Wrong owner? */
1692 if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
1693 gdprintk(XENLOG_INFO,
1694 "Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%"
1695 PRtype_info "\n",
1696 page_to_mfn(page), domain, unpickle_domptr(d),
1697 x, page->u.inuse.type_info);
1698 return 0;
1700 asm volatile (
1701 LOCK_PREFIX "cmpxchg8b %3"
1702 : "=d" (nd), "=a" (y), "=c" (d),
1703 "=m" (*(volatile u64 *)(&page->count_info))
1704 : "0" (d), "1" (x), "c" (d), "b" (nx) );
1706 while ( unlikely(nd != d) || unlikely(y != x) );
1708 return 1;
1712 static int alloc_page_type(struct page_info *page, unsigned long type)
1714 struct domain *owner = page_get_owner(page);
1716 /* A page table is dirtied when its type count becomes non-zero. */
1717 if ( likely(owner != NULL) )
1718 paging_mark_dirty(owner, page_to_mfn(page));
1720 switch ( type & PGT_type_mask )
1722 case PGT_l1_page_table:
1723 return alloc_l1_table(page);
1724 case PGT_l2_page_table:
1725 return alloc_l2_table(page, type);
1726 case PGT_l3_page_table:
1727 return alloc_l3_table(page);
1728 case PGT_l4_page_table:
1729 return alloc_l4_table(page);
1730 case PGT_gdt_page:
1731 case PGT_ldt_page:
1732 return alloc_segdesc_page(page);
1733 default:
1734 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1735 type, page->u.inuse.type_info,
1736 page->count_info);
1737 BUG();
1740 return 0;
1744 void free_page_type(struct page_info *page, unsigned long type)
1746 struct domain *owner = page_get_owner(page);
1747 unsigned long gmfn;
1749 if ( likely(owner != NULL) )
1751 /*
1752 * We have to flush before the next use of the linear mapping
1753 * (e.g., update_va_mapping()) or we could end up modifying a page
1754 * that is no longer a page table (and hence screw up ref counts).
1755 */
1756 if ( current->domain == owner )
1757 queue_deferred_ops(owner, DOP_FLUSH_ALL_TLBS);
1758 else
1759 flush_tlb_mask(owner->domain_dirty_cpumask);
1761 if ( unlikely(paging_mode_enabled(owner)) )
1763 /* A page table is dirtied when its type count becomes zero. */
1764 paging_mark_dirty(owner, page_to_mfn(page));
1766 if ( shadow_mode_refcounts(owner) )
1767 return;
1769 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1770 ASSERT(VALID_M2P(gmfn));
1771 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
1775 switch ( type & PGT_type_mask )
1777 case PGT_l1_page_table:
1778 free_l1_table(page);
1779 break;
1781 case PGT_l2_page_table:
1782 free_l2_table(page);
1783 break;
1785 #if CONFIG_PAGING_LEVELS >= 3
1786 case PGT_l3_page_table:
1787 free_l3_table(page);
1788 break;
1789 #endif
1791 #if CONFIG_PAGING_LEVELS >= 4
1792 case PGT_l4_page_table:
1793 free_l4_table(page);
1794 break;
1795 #endif
1797 default:
1798 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1799 type, page_to_mfn(page));
1800 BUG();
1805 void put_page_type(struct page_info *page)
1807 unsigned long nx, x, y = page->u.inuse.type_info;
1809 again:
1810 do {
1811 x = y;
1812 nx = x - 1;
1814 ASSERT((x & PGT_count_mask) != 0);
1816 if ( unlikely((nx & PGT_count_mask) == 0) )
1818 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1819 likely(nx & PGT_validated) )
1821 /*
1822 * Page-table pages must be unvalidated when count is zero. The
1823 * 'free' is safe because the refcnt is non-zero and validated
1824 * bit is clear => other ops will spin or fail.
1825 */
1826 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1827 x & ~PGT_validated)) != x) )
1828 goto again;
1829 /* We cleared the 'valid bit' so we do the clean up. */
1830 free_page_type(page, x);
1831 /* Carry on, but with the 'valid bit' now clear. */
1832 x &= ~PGT_validated;
1833 nx &= ~PGT_validated;
1836 /*
1837 * Record TLB information for flush later. We do not stamp page
1838 * tables when running in shadow mode:
1839 * 1. Pointless, since it's the shadow pt's which must be tracked.
1840 * 2. Shadow mode reuses this field for shadowed page tables to
1841 * store flags info -- we don't want to conflict with that.
1842 */
1843 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
1844 (page->count_info & PGC_page_table)) )
1845 page->tlbflush_timestamp = tlbflush_current_time();
1848 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1852 int get_page_type(struct page_info *page, unsigned long type)
1854 unsigned long nx, x, y = page->u.inuse.type_info;
1856 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
1858 again:
1859 do {
1860 x = y;
1861 nx = x + 1;
1862 if ( unlikely((nx & PGT_count_mask) == 0) )
1864 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1865 return 0;
1867 else if ( unlikely((x & PGT_count_mask) == 0) )
1869 struct domain *d = page_get_owner(page);
1871 /* Never allow a shadowed frame to go from type count 0 to 1 */
1872 if ( d && shadow_mode_enabled(d) )
1873 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
1875 ASSERT(!(x & PGT_pae_xen_l2));
1876 if ( (x & PGT_type_mask) != type )
1878 /*
1879 * On type change we check to flush stale TLB entries. This
1880 * may be unnecessary (e.g., page was GDT/LDT) but those
1881 * circumstances should be very rare.
1882 */
1883 cpumask_t mask = d->domain_dirty_cpumask;
1885 /* Don't flush if the timestamp is old enough */
1886 tlbflush_filter(mask, page->tlbflush_timestamp);
1888 if ( unlikely(!cpus_empty(mask)) &&
1889 /* Shadow mode: track only writable pages. */
1890 (!shadow_mode_enabled(page_get_owner(page)) ||
1891 ((nx & PGT_type_mask) == PGT_writable_page)) )
1893 perfc_incr(need_flush_tlb_flush);
1894 flush_tlb_mask(mask);
1897 /* We lose existing type and validity. */
1898 nx &= ~(PGT_type_mask | PGT_validated);
1899 nx |= type;
1901 /* No special validation needed for writable pages. */
1902 /* Page tables and GDT/LDT need to be scanned for validity. */
1903 if ( type == PGT_writable_page )
1904 nx |= PGT_validated;
1907 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
1909 /* Don't log failure if it could be a recursive-mapping attempt. */
1910 if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
1911 (type == PGT_l1_page_table) )
1912 return 0;
1913 if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
1914 (type == PGT_l2_page_table) )
1915 return 0;
1916 if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
1917 (type == PGT_l3_page_table) )
1918 return 0;
1919 MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
1920 "for mfn %lx (pfn %lx)",
1921 x, type, page_to_mfn(page),
1922 get_gpfn_from_mfn(page_to_mfn(page)));
1923 return 0;
1925 else if ( unlikely(!(x & PGT_validated)) )
1927 /* Someone else is updating validation of this page. Wait... */
1928 while ( (y = page->u.inuse.type_info) == x )
1929 cpu_relax();
1930 goto again;
1933 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1935 if ( unlikely(!(nx & PGT_validated)) )
1937 /* Try to validate page type; drop the new reference on failure. */
1938 if ( unlikely(!alloc_page_type(page, type)) )
1940 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1941 PRtype_info ": caf=%08x taf=%" PRtype_info,
1942 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1943 type, page->count_info, page->u.inuse.type_info);
1944 /* No one else can get a reference. We hold the only ref. */
1945 page->u.inuse.type_info = 0;
1946 return 0;
1949 /* No one else is updating simultaneously. */
1950 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1953 return 1;
1957 void cleanup_page_cacheattr(struct page_info *page)
1959 uint32_t cacheattr = (page->count_info >> PGC_cacheattr_base) & 7;
1961 if ( likely(cacheattr == 0) )
1962 return;
1964 page->count_info &= ~PGC_cacheattr_mask;
1966 BUG_ON(is_xen_heap_page(page));
1968 #ifdef __x86_64__
1969 map_pages_to_xen((unsigned long)page_to_virt(page), page_to_mfn(page),
1970 1, PAGE_HYPERVISOR);
1971 #endif
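/*
 * cleanup_page_cacheattr() above clears any non-default cache attribute
 * recorded in count_info for a domheap page (Xen heap pages never carry
 * one, hence the BUG_ON); on x86-64 the frame's slot in the 1:1 mapping is
 * restored to plain PAGE_HYPERVISOR so stale UC/WC attributes do not
 * linger for the frame's next user.
 */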
1975 int new_guest_cr3(unsigned long mfn)
1977 struct vcpu *v = current;
1978 struct domain *d = v->domain;
1979 int okay;
1980 unsigned long old_base_mfn;
1982 #ifdef CONFIG_COMPAT
1983 if ( is_pv_32on64_domain(d) )
1985 okay = paging_mode_refcounts(d)
1986 ? 0 /* Old code was broken, but what should it be? */
1987 : mod_l4_entry(
1988 __va(pagetable_get_paddr(v->arch.guest_table)),
1989 l4e_from_pfn(
1990 mfn,
1991 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
1992 pagetable_get_pfn(v->arch.guest_table), 0);
1993 if ( unlikely(!okay) )
1995 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
1996 return 0;
1999 invalidate_shadow_ldt(v);
2000 write_ptbase(v);
2002 return 1;
2004 #endif
2005 okay = paging_mode_refcounts(d)
2006 ? get_page_from_pagenr(mfn, d)
2007 : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
2008 if ( unlikely(!okay) )
2010 MEM_LOG("Error while installing new baseptr %lx", mfn);
2011 return 0;
2014 invalidate_shadow_ldt(v);
2016 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2018 v->arch.guest_table = pagetable_from_pfn(mfn);
2019 update_cr3(v);
2021 write_ptbase(v);
2023 if ( likely(old_base_mfn != 0) )
2025 if ( paging_mode_refcounts(d) )
2026 put_page(mfn_to_page(old_base_mfn));
2027 else
2028 put_page_and_type(mfn_to_page(old_base_mfn));
2031 return 1;
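/*
 * Note the ordering in new_guest_cr3() above: a reference (and, outside
 * refcounting paging modes, the PGT_root_page_table type) is taken on the
 * new base, guest_table is switched and write_ptbase() loads it, and only
 * then is the reference on the old base dropped, so the vcpu never runs on
 * a root table whose last reference has already gone.
 */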
2034 static void process_deferred_ops(void)
2036 unsigned int deferred_ops;
2037 struct domain *d = current->domain;
2038 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2040 deferred_ops = info->deferred_ops;
2041 info->deferred_ops = 0;
2043 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
2045 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
2046 flush_tlb_mask(d->domain_dirty_cpumask);
2047 else
2048 flush_tlb_local();
2051 if ( deferred_ops & DOP_RELOAD_LDT )
2052 (void)map_ldt_shadow_page(0);
2054 if ( unlikely(info->foreign != NULL) )
2056 rcu_unlock_domain(info->foreign);
2057 info->foreign = NULL;
2061 static int set_foreigndom(domid_t domid)
2063 struct domain *e, *d = current->domain;
2064 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2065 int okay = 1;
2067 ASSERT(info->foreign == NULL);
2069 if ( likely(domid == DOMID_SELF) )
2070 goto out;
2072 if ( unlikely(domid == d->domain_id) )
2074 MEM_LOG("Cannot specify itself as foreign domain");
2075 okay = 0;
2077 else if ( unlikely(paging_mode_translate(d)) )
2079 MEM_LOG("Cannot mix foreign mappings with translated domains");
2080 okay = 0;
2082 else switch ( domid )
2084 case DOMID_IO:
2085 info->foreign = rcu_lock_domain(dom_io);
2086 break;
2087 case DOMID_XEN:
2088 if (!IS_PRIV(d)) {
2089 MEM_LOG("Cannot set foreign dom");
2090 okay = 0;
2091 break;
2093 info->foreign = rcu_lock_domain(dom_xen);
2094 break;
2095 default:
2096 e = rcu_lock_domain_by_id(domid);
2097 if ( e == NULL )
2099 MEM_LOG("Unknown domain '%u'", domid);
2100 okay = 0;
2101 break;
2103 if (!IS_PRIV_FOR(d, e)) {
2104 MEM_LOG("Cannot set foreign dom");
2105 okay = 0;
2106 rcu_unlock_domain(e);
2107 break;
2109 info->foreign = e;
2110 break;
2113 out:
2114 return okay;
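/*
 * set_foreigndom() above establishes FOREIGNDOM for the rest of the
 * current hypercall: DOMID_SELF leaves the per-cpu foreign pointer NULL,
 * DOMID_IO and DOMID_XEN select the corresponding system domains (the
 * latter for privileged callers only), and any other id must satisfy
 * IS_PRIV_FOR(). The rcu reference taken here is dropped again in
 * process_deferred_ops().
 */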
2117 static inline cpumask_t vcpumask_to_pcpumask(
2118 struct domain *d, unsigned long vmask)
2120 unsigned int vcpu_id;
2121 cpumask_t pmask = CPU_MASK_NONE;
2122 struct vcpu *v;
2124 while ( vmask != 0 )
2126 vcpu_id = find_first_set_bit(vmask);
2127 vmask &= ~(1UL << vcpu_id);
2128 if ( (vcpu_id < MAX_VIRT_CPUS) &&
2129 ((v = d->vcpu[vcpu_id]) != NULL) )
2130 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
2133 return pmask;
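/*
 * A guest-side sketch of how this hypercall is typically driven (for
 * illustration only; mmuext_op and HYPERVISOR_mmuext_op come from the
 * public interface headers, not from this file, and l1_mfn is a
 * hypothetical caller variable):
 *
 *     struct mmuext_op op = {
 *         .cmd      = MMUEXT_PIN_L1_TABLE,
 *         .arg1.mfn = l1_mfn,
 *     };
 *     HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
 *
 * Batches may be preempted: the handler below re-encodes the remaining
 * work as (count - i) | MMU_UPDATE_PREEMPTED and resumes via a hypercall
 * continuation.
 */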
2136 int do_mmuext_op(
2137 XEN_GUEST_HANDLE(mmuext_op_t) uops,
2138 unsigned int count,
2139 XEN_GUEST_HANDLE(uint) pdone,
2140 unsigned int foreigndom)
2142 struct mmuext_op op;
2143 int rc = 0, i = 0, okay;
2144 unsigned long mfn = 0, gmfn = 0, type;
2145 unsigned int done = 0;
2146 struct page_info *page;
2147 struct vcpu *v = current;
2148 struct domain *d = v->domain;
2150 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2152 count &= ~MMU_UPDATE_PREEMPTED;
2153 if ( unlikely(!guest_handle_is_null(pdone)) )
2154 (void)copy_from_guest(&done, pdone, 1);
2156 else
2157 perfc_incr(calls_to_mmuext_op);
2159 if ( unlikely(!guest_handle_okay(uops, count)) )
2161 rc = -EFAULT;
2162 goto out;
2165 if ( !set_foreigndom(foreigndom) )
2167 rc = -ESRCH;
2168 goto out;
2171 LOCK_BIGLOCK(d);
2173 for ( i = 0; i < count; i++ )
2175 if ( hypercall_preempt_check() )
2177 rc = hypercall_create_continuation(
2178 __HYPERVISOR_mmuext_op, "hihi",
2179 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2180 break;
2183 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2185 MEM_LOG("Bad __copy_from_guest");
2186 rc = -EFAULT;
2187 break;
2190 okay = 1;
2191 gmfn = op.arg1.mfn;
2192 mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
2193 page = mfn_to_page(mfn);
2195 switch ( op.cmd )
2197 case MMUEXT_PIN_L1_TABLE:
2198 type = PGT_l1_page_table;
2199 goto pin_page;
2201 case MMUEXT_PIN_L2_TABLE:
2202 type = PGT_l2_page_table;
2203 goto pin_page;
2205 case MMUEXT_PIN_L3_TABLE:
2206 type = PGT_l3_page_table;
2207 goto pin_page;
2209 case MMUEXT_PIN_L4_TABLE:
2210 if ( is_pv_32bit_domain(FOREIGNDOM) )
2211 break;
2212 type = PGT_l4_page_table;
2214 pin_page:
2215 rc = xsm_memory_pin_page(d, page);
2216 if ( rc )
2217 break;
2219 /* Ignore pinning of invalid paging levels. */
2220 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2221 break;
2223 if ( paging_mode_refcounts(FOREIGNDOM) )
2224 break;
2226 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
2227 if ( unlikely(!okay) )
2229 MEM_LOG("Error while pinning mfn %lx", mfn);
2230 break;
2233 if ( unlikely(test_and_set_bit(_PGT_pinned,
2234 &page->u.inuse.type_info)) )
2236 MEM_LOG("Mfn %lx already pinned", mfn);
2237 put_page_and_type(page);
2238 okay = 0;
2239 break;
2242 /* A page is dirtied when its pin status is set. */
2243 paging_mark_dirty(d, mfn);
2245 /* We can race domain destruction (domain_relinquish_resources). */
2246 if ( unlikely(this_cpu(percpu_mm_info).foreign != NULL) )
2248 int drop_ref;
2249 spin_lock(&FOREIGNDOM->page_alloc_lock);
2250 drop_ref = (FOREIGNDOM->is_dying &&
2251 test_and_clear_bit(_PGT_pinned,
2252 &page->u.inuse.type_info));
2253 spin_unlock(&FOREIGNDOM->page_alloc_lock);
2254 if ( drop_ref )
2255 put_page_and_type(page);
2258 break;
2260 case MMUEXT_UNPIN_TABLE:
2261 if ( paging_mode_refcounts(d) )
2262 break;
2264 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2266 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2267 mfn, page_get_owner(page));
2269 else if ( likely(test_and_clear_bit(_PGT_pinned,
2270 &page->u.inuse.type_info)) )
2272 put_page_and_type(page);
2273 put_page(page);
2274 /* A page is dirtied when its pin status is cleared. */
2275 paging_mark_dirty(d, mfn);
2277 else
2279 okay = 0;
2280 put_page(page);
2281 MEM_LOG("Mfn %lx not pinned", mfn);
2283 break;
2285 case MMUEXT_NEW_BASEPTR:
2286 okay = new_guest_cr3(mfn);
2287 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2288 break;
2290 #ifdef __x86_64__
2291 case MMUEXT_NEW_USER_BASEPTR: {
2292 unsigned long old_mfn;
2294 if ( mfn != 0 )
2296 if ( paging_mode_refcounts(d) )
2297 okay = get_page_from_pagenr(mfn, d);
2298 else
2299 okay = get_page_and_type_from_pagenr(
2300 mfn, PGT_root_page_table, d);
2301 if ( unlikely(!okay) )
2303 MEM_LOG("Error while installing new mfn %lx", mfn);
2304 break;
2308 old_mfn = pagetable_get_pfn(v->arch.guest_table_user);
2309 v->arch.guest_table_user = pagetable_from_pfn(mfn);
2311 if ( old_mfn != 0 )
2313 if ( paging_mode_refcounts(d) )
2314 put_page(mfn_to_page(old_mfn));
2315 else
2316 put_page_and_type(mfn_to_page(old_mfn));
2319 break;
2321 #endif
2323 case MMUEXT_TLB_FLUSH_LOCAL:
2324 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2325 break;
2327 case MMUEXT_INVLPG_LOCAL:
2328 if ( !paging_mode_enabled(d)
2329 || paging_invlpg(v, op.arg1.linear_addr) != 0 )
2330 flush_tlb_one_local(op.arg1.linear_addr);
2331 break;
2333 case MMUEXT_TLB_FLUSH_MULTI:
2334 case MMUEXT_INVLPG_MULTI:
2336 unsigned long vmask;
2337 cpumask_t pmask;
2338 if ( unlikely(copy_from_guest(&vmask, op.arg2.vcpumask, 1)) )
2340 okay = 0;
2341 break;
2343 pmask = vcpumask_to_pcpumask(d, vmask);
2344 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2345 flush_tlb_mask(pmask);
2346 else
2347 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2348 break;
2351 case MMUEXT_TLB_FLUSH_ALL:
2352 flush_tlb_mask(d->domain_dirty_cpumask);
2353 break;
2355 case MMUEXT_INVLPG_ALL:
2356 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2357 break;
2359 case MMUEXT_FLUSH_CACHE:
2360 if ( unlikely(!cache_flush_permitted(d)) )
2362 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2363 okay = 0;
2365 else
2367 wbinvd();
2369 break;
2371 case MMUEXT_SET_LDT:
2373 unsigned long ptr = op.arg1.linear_addr;
2374 unsigned long ents = op.arg2.nr_ents;
2376 if ( paging_mode_external(d) )
2378 MEM_LOG("ignoring SET_LDT hypercall from external domain");
2379 okay = 0;
2381 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2382 (ents > 8192) ||
2383 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2385 okay = 0;
2386 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2388 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2389 (v->arch.guest_context.ldt_base != ptr) )
2391 invalidate_shadow_ldt(v);
2392 v->arch.guest_context.ldt_base = ptr;
2393 v->arch.guest_context.ldt_ents = ents;
2394 load_LDT(v);
2395 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2396 if ( ents != 0 )
2397 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2399 break;
2402 default:
2403 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2404 rc = -ENOSYS;
2405 okay = 0;
2406 break;
2409 if ( unlikely(!okay) )
2411 rc = rc ? rc : -EINVAL;
2412 break;
2415 guest_handle_add_offset(uops, 1);
2418 process_deferred_ops();
2420 UNLOCK_BIGLOCK(d);
2422 perfc_add(num_mmuext_ops, i);
2424 out:
2425 /* Add incremental work we have done to the @done output parameter. */
2426 if ( unlikely(!guest_handle_is_null(pdone)) )
2428 done += i;
2429 copy_to_guest(pdone, &done, 1);
2432 return rc;
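/*
 * For a normal PT update the request's 'ptr' field carries the address of
 * the PTE to modify with the sub-command encoded in its low bits (decoded
 * below via sizeof(l1_pgentry_t)-1), and 'val' carries the new entry. A
 * guest-side sketch, for illustration only (mmu_update and
 * HYPERVISOR_mmu_update come from the public interface headers; pte_maddr
 * and new_pte_val are hypothetical caller variables):
 *
 *     struct mmu_update u = {
 *         .ptr = pte_maddr | MMU_NORMAL_PT_UPDATE,
 *         .val = new_pte_val,
 *     };
 *     HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
 */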
2435 int do_mmu_update(
2436 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2437 unsigned int count,
2438 XEN_GUEST_HANDLE(uint) pdone,
2439 unsigned int foreigndom)
2441 struct mmu_update req;
2442 void *va;
2443 unsigned long gpfn, gmfn, mfn;
2444 struct page_info *page;
2445 int rc = 0, okay = 1, i = 0;
2446 unsigned int cmd, done = 0;
2447 struct vcpu *v = current;
2448 struct domain *d = v->domain;
2449 unsigned long type_info;
2450 struct domain_mmap_cache mapcache;
2452 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2454 count &= ~MMU_UPDATE_PREEMPTED;
2455 if ( unlikely(!guest_handle_is_null(pdone)) )
2456 (void)copy_from_guest(&done, pdone, 1);
2458 else
2459 perfc_incr(calls_to_mmu_update);
2461 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2463 rc = -EFAULT;
2464 goto out;
2467 if ( !set_foreigndom(foreigndom) )
2469 rc = -ESRCH;
2470 goto out;
2473 domain_mmap_cache_init(&mapcache);
2475 LOCK_BIGLOCK(d);
2477 for ( i = 0; i < count; i++ )
2479 if ( hypercall_preempt_check() )
2481 rc = hypercall_create_continuation(
2482 __HYPERVISOR_mmu_update, "hihi",
2483 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2484 break;
2487 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2489 MEM_LOG("Bad __copy_from_guest");
2490 rc = -EFAULT;
2491 break;
2494 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2495 okay = 0;
2497 switch ( cmd )
2499 /*
2500 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2501 * MMU_PT_UPDATE_PRESERVE_AD: As above, but also preserve (OR in) the
2502 * current A/D bits.
2503 */
2504 case MMU_NORMAL_PT_UPDATE:
2505 case MMU_PT_UPDATE_PRESERVE_AD:
2506 rc = xsm_mmu_normal_update(d, req.val);
2507 if ( rc )
2508 break;
2510 req.ptr -= cmd;
2511 gmfn = req.ptr >> PAGE_SHIFT;
2512 mfn = gmfn_to_mfn(d, gmfn);
2514 if ( unlikely(!get_page_from_pagenr(mfn, d)) )
2516 MEM_LOG("Could not get page for normal update");
2517 break;
2520 va = map_domain_page_with_cache(mfn, &mapcache);
2521 va = (void *)((unsigned long)va +
2522 (unsigned long)(req.ptr & ~PAGE_MASK));
2523 page = mfn_to_page(mfn);
2525 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2527 case PGT_l1_page_table:
2528 case PGT_l2_page_table:
2529 case PGT_l3_page_table:
2530 case PGT_l4_page_table:
2532 if ( paging_mode_refcounts(d) )
2534 MEM_LOG("mmu update on auto-refcounted domain!");
2535 break;
2538 if ( unlikely(!get_page_type(
2539 page, type_info & (PGT_type_mask|PGT_pae_xen_l2))) )
2540 goto not_a_pt;
2542 switch ( type_info & PGT_type_mask )
2544 case PGT_l1_page_table:
2546 l1_pgentry_t l1e = l1e_from_intpte(req.val);
2547 okay = mod_l1_entry(va, l1e, mfn,
2548 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2550 break;
2551 case PGT_l2_page_table:
2553 l2_pgentry_t l2e = l2e_from_intpte(req.val);
2554 okay = mod_l2_entry(va, l2e, mfn, type_info,
2555 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2557 break;
2558 #if CONFIG_PAGING_LEVELS >= 3
2559 case PGT_l3_page_table:
2561 l3_pgentry_t l3e = l3e_from_intpte(req.val);
2562 okay = mod_l3_entry(va, l3e, mfn,
2563 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2565 break;
2566 #endif
2567 #if CONFIG_PAGING_LEVELS >= 4
2568 case PGT_l4_page_table:
2570 l4_pgentry_t l4e = l4e_from_intpte(req.val);
2571 okay = mod_l4_entry(va, l4e, mfn,
2572 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2574 break;
2575 #endif
2578 put_page_type(page);
2580 break;
2582 default:
2583 not_a_pt:
2585 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2586 break;
2588 okay = paging_write_guest_entry(v, va, req.val, _mfn(mfn));
2590 put_page_type(page);
2592 break;
2595 unmap_domain_page_with_cache(va, &mapcache);
2597 put_page(page);
2598 break;
2600 case MMU_MACHPHYS_UPDATE:
2602 mfn = req.ptr >> PAGE_SHIFT;
2603 gpfn = req.val;
2605 rc = xsm_mmu_machphys_update(d, mfn);
2606 if ( rc )
2607 break;
2609 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2611 MEM_LOG("Could not get page for mach->phys update");
2612 break;
2615 if ( unlikely(paging_mode_translate(FOREIGNDOM)) )
2617 MEM_LOG("Mach-phys update on auto-translate guest");
2618 break;
2621 set_gpfn_from_mfn(mfn, gpfn);
2622 okay = 1;
2624 paging_mark_dirty(FOREIGNDOM, mfn);
2626 put_page(mfn_to_page(mfn));
2627 break;
2629 default:
2630 MEM_LOG("Invalid page update command %x", cmd);
2631 rc = -ENOSYS;
2632 okay = 0;
2633 break;
2636 if ( unlikely(!okay) )
2638 rc = rc ? rc : -EINVAL;
2639 break;
2642 guest_handle_add_offset(ureqs, 1);
2645 process_deferred_ops();
2647 UNLOCK_BIGLOCK(d);
2649 domain_mmap_cache_destroy(&mapcache);
2651 perfc_add(num_page_updates, i);
2653 out:
2654 /* Add incremental work we have done to the @done output parameter. */
2655 if ( unlikely(!guest_handle_is_null(pdone)) )
2657 done += i;
2658 copy_to_guest(pdone, &done, 1);
2661 return rc;
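/*
 * The helpers below back the grant-table map/unmap operations. The
 * *_pte_* variants are used when GNTMAP_contains_pte is set, 'addr'
 * naming the PTE to rewrite (its frame is looked up via gmfn_to_mfn() and
 * must be an L1 page table); the *_va_* variants instead walk the guest's
 * page tables for a linear address via guest_map_l1e(). Either way the
 * write goes through UPDATE_ENTRY(), and any displaced entry is released
 * with put_page_from_l1e() when the domain is not in a refcounting paging
 * mode.
 */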
2665 static int create_grant_pte_mapping(
2666 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
2668 int rc = GNTST_okay;
2669 void *va;
2670 unsigned long gmfn, mfn;
2671 struct page_info *page;
2672 u32 type;
2673 l1_pgentry_t ol1e;
2674 struct domain *d = v->domain;
2676 ASSERT(spin_is_locked(&d->big_lock));
2678 adjust_guest_l1e(nl1e, d);
2680 gmfn = pte_addr >> PAGE_SHIFT;
2681 mfn = gmfn_to_mfn(d, gmfn);
2683 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2685 MEM_LOG("Could not get page for normal update");
2686 return GNTST_general_error;
2689 va = map_domain_page(mfn);
2690 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
2691 page = mfn_to_page(mfn);
2693 type = page->u.inuse.type_info & PGT_type_mask;
2694 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2696 MEM_LOG("Grant map attempted to update a non-L1 page");
2697 rc = GNTST_general_error;
2698 goto failed;
2701 ol1e = *(l1_pgentry_t *)va;
2702 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) )
2704 put_page_type(page);
2705 rc = GNTST_general_error;
2706 goto failed;
2709 if ( !paging_mode_refcounts(d) )
2710 put_page_from_l1e(ol1e, d);
2712 put_page_type(page);
2714 failed:
2715 unmap_domain_page(va);
2716 put_page(page);
2718 return rc;
2721 static int destroy_grant_pte_mapping(
2722 uint64_t addr, unsigned long frame, struct domain *d)
2724 int rc = GNTST_okay;
2725 void *va;
2726 unsigned long gmfn, mfn;
2727 struct page_info *page;
2728 u32 type;
2729 l1_pgentry_t ol1e;
2731 gmfn = addr >> PAGE_SHIFT;
2732 mfn = gmfn_to_mfn(d, gmfn);
2734 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2736 MEM_LOG("Could not get page for normal update");
2737 return GNTST_general_error;
2740 va = map_domain_page(mfn);
2741 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
2742 page = mfn_to_page(mfn);
2744 type = page->u.inuse.type_info & PGT_type_mask;
2745 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2747 MEM_LOG("Grant map attempted to update a non-L1 page");
2748 rc = GNTST_general_error;
2749 goto failed;
2752 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2754 put_page_type(page);
2755 rc = GNTST_general_error;
2756 goto failed;
2759 /* Check that the virtual address supplied is actually mapped to frame. */
2760 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2762 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
2763 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2764 put_page_type(page);
2765 rc = GNTST_general_error;
2766 goto failed;
2769 /* Delete pagetable entry. */
2770 if ( unlikely(!UPDATE_ENTRY
2771 (l1,
2772 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
2773 d->vcpu[0] /* Change if we go to per-vcpu shadows. */,
2774 0)) )
2776 MEM_LOG("Cannot delete PTE entry at %p", va);
2777 put_page_type(page);
2778 rc = GNTST_general_error;
2779 goto failed;
2782 put_page_type(page);
2784 failed:
2785 unmap_domain_page(va);
2786 put_page(page);
2787 return rc;
2791 static int create_grant_va_mapping(
2792 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
2794 l1_pgentry_t *pl1e, ol1e;
2795 struct domain *d = v->domain;
2796 unsigned long gl1mfn;
2797 int okay;
2799 ASSERT(spin_is_locked(&d->big_lock));
2801 adjust_guest_l1e(nl1e, d);
2803 pl1e = guest_map_l1e(v, va, &gl1mfn);
2804 if ( !pl1e )
2806 MEM_LOG("Could not find L1 PTE for address %lx", va);
2807 return GNTST_general_error;
2809 ol1e = *pl1e;
2810 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0);
2811 guest_unmap_l1e(v, pl1e);
2812 pl1e = NULL;
2814 if ( !okay )
2815 return GNTST_general_error;
2817 if ( !paging_mode_refcounts(d) )
2818 put_page_from_l1e(ol1e, d);
2820 return GNTST_okay;
2823 static int replace_grant_va_mapping(
2824 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
2826 l1_pgentry_t *pl1e, ol1e;
2827 unsigned long gl1mfn;
2828 int rc = 0;
2830 pl1e = guest_map_l1e(v, addr, &gl1mfn);
2831 if ( !pl1e )
2833 MEM_LOG("Could not find L1 PTE for address %lx", addr);
2834 return GNTST_general_error;
2836 ol1e = *pl1e;
2838 /* Check that the virtual address supplied is actually mapped to frame. */
2839 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2841 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2842 l1e_get_pfn(ol1e), addr, frame);
2843 rc = GNTST_general_error;
2844 goto out;
2847 /* Delete pagetable entry. */
2848 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0)) )
2850 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2851 rc = GNTST_general_error;
2852 goto out;
2855 out:
2856 guest_unmap_l1e(v, pl1e);
2857 return rc;
2860 static int destroy_grant_va_mapping(
2861 unsigned long addr, unsigned long frame, struct vcpu *v)
2863 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
2866 int create_grant_host_mapping(uint64_t addr, unsigned long frame,
2867 unsigned int flags, unsigned int cache_flags)
2869 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2871 if ( (flags & GNTMAP_application_map) )
2872 l1e_add_flags(pte,_PAGE_USER);
2873 if ( !(flags & GNTMAP_readonly) )
2874 l1e_add_flags(pte,_PAGE_RW);
2876 l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5));
2878 if ( flags & GNTMAP_contains_pte )
2879 return create_grant_pte_mapping(addr, pte, current);
2880 return create_grant_va_mapping(addr, pte, current);
2883 int replace_grant_host_mapping(
2884 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
2886 struct vcpu *curr = current;
2887 l1_pgentry_t *pl1e, ol1e;
2888 unsigned long gl1mfn;
2889 int rc;
2891 if ( flags & GNTMAP_contains_pte )
2893 if ( !new_addr )
2894 return destroy_grant_pte_mapping(addr, frame, curr->domain);
2896 MEM_LOG("Unsupported grant table operation");
2897 return GNTST_general_error;
2900 if ( !new_addr )
2901 return destroy_grant_va_mapping(addr, frame, curr);
2903 pl1e = guest_map_l1e(curr, new_addr, &gl1mfn);
2904 if ( !pl1e )
2906 MEM_LOG("Could not find L1 PTE for address %lx",
2907 (unsigned long)new_addr);
2908 return GNTST_general_error;
2910 ol1e = *pl1e;
2912 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(),
2913 gl1mfn, curr, 0)) )
2915 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2916 guest_unmap_l1e(curr, pl1e);
2917 return GNTST_general_error;
2920 guest_unmap_l1e(curr, pl1e);
2922 rc = replace_grant_va_mapping(addr, frame, ol1e, curr);
2923 if ( rc && !paging_mode_refcounts(curr->domain) )
2924 put_page_from_l1e(ol1e, curr->domain);
2926 return rc;
2929 int steal_page(
2930 struct domain *d, struct page_info *page, unsigned int memflags)
2932 u32 _d, _nd, x, y;
2934 spin_lock(&d->page_alloc_lock);
2936 /*
2937 * The tricky bit: atomically release ownership while there is just one
2938 * benign reference to the page (PGC_allocated). If that reference
2939 * disappears then the deallocation routine will safely spin.
2940 */
2941 _d = pickle_domptr(d);
2942 _nd = page->u.inuse._domain;
2943 y = page->count_info;
2944 do {
2945 x = y;
2946 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2947 (1 | PGC_allocated)) || unlikely(_nd != _d) )
2949 MEM_LOG("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2950 " caf=%08x, taf=%" PRtype_info "\n",
2951 (void *) page_to_mfn(page),
2952 d, d->domain_id, unpickle_domptr(_nd), x,
2953 page->u.inuse.type_info);
2954 spin_unlock(&d->page_alloc_lock);
2955 return -1;
2957 asm volatile (
2958 LOCK_PREFIX "cmpxchg8b %2"
2959 : "=d" (_nd), "=a" (y),
2960 "=m" (*(volatile u64 *)(&page->count_info))
2961 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2962 } while (unlikely(_nd != _d) || unlikely(y != x));
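/*
 * The cmpxchg8b above treats count_info and u.inuse._domain as a single
 * 64-bit quantity (note the u64 cast), clearing the owner while
 * re-checking the reference count in one atomic step; a racing
 * get_page()/put_page() simply forces another trip around the loop.
 */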
2964 /*
2965 * Unlink from 'd'. At least one reference remains (now anonymous), so
2966 * no one else is spinning to try to delete this page from 'd'.
2967 */
2968 if ( !(memflags & MEMF_no_refcount) )
2969 d->tot_pages--;
2970 list_del(&page->list);
2972 spin_unlock(&d->page_alloc_lock);
2974 return 0;
2977 int do_update_va_mapping(unsigned long va, u64 val64,
2978 unsigned long flags)
2980 l1_pgentry_t val = l1e_from_intpte(val64);
2981 struct vcpu *v = current;
2982 struct domain *d = v->domain;
2983 l1_pgentry_t *pl1e;
2984 unsigned long vmask, bmap_ptr, gl1mfn;
2985 cpumask_t pmask;
2986 int rc = 0;
2988 perfc_incr(calls_to_update_va);
2990 if ( unlikely(!__addr_ok(va) && !paging_mode_external(d)) )
2991 return -EINVAL;
2993 rc = xsm_update_va_mapping(current->domain, val);
2994 if ( rc )
2995 return rc;
2997 LOCK_BIGLOCK(d);
2999 pl1e = guest_map_l1e(v, va, &gl1mfn);
3001 if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn, 0)) )
3002 rc = -EINVAL;
3004 if ( pl1e )
3005 guest_unmap_l1e(v, pl1e);
3006 pl1e = NULL;
3008 process_deferred_ops();
3010 UNLOCK_BIGLOCK(d);
3012 switch ( flags & UVMF_FLUSHTYPE_MASK )
3014 case UVMF_TLB_FLUSH:
3015 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3017 case UVMF_LOCAL:
3018 flush_tlb_local();
3019 break;
3020 case UVMF_ALL:
3021 flush_tlb_mask(d->domain_dirty_cpumask);
3022 break;
3023 default:
3024 if ( unlikely(!is_pv_32on64_domain(d) ?
3025 get_user(vmask, (unsigned long *)bmap_ptr) :
3026 get_user(vmask, (unsigned int *)bmap_ptr)) )
3027 rc = -EFAULT;
3028 pmask = vcpumask_to_pcpumask(d, vmask);
3029 flush_tlb_mask(pmask);
3030 break;
3032 break;
3034 case UVMF_INVLPG:
3035 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3037 case UVMF_LOCAL:
3038 if ( !paging_mode_enabled(d) ||
3039 (paging_invlpg(v, va) != 0) )
3040 flush_tlb_one_local(va);
3041 break;
3042 case UVMF_ALL:
3043 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
3044 break;
3045 default:
3046 if ( unlikely(!is_pv_32on64_domain(d) ?
3047 get_user(vmask, (unsigned long *)bmap_ptr) :
3048 get_user(vmask, (unsigned int *)bmap_ptr)) )
3049 rc = -EFAULT;
3050 pmask = vcpumask_to_pcpumask(d, vmask);
3051 flush_tlb_one_mask(pmask, va);
3052 break;
3054 break;
3057 return rc;
3060 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
3061 unsigned long flags,
3062 domid_t domid)
3064 int rc;
3066 if ( !set_foreigndom(domid) )
3067 return -ESRCH;
3069 rc = do_update_va_mapping(va, val64, flags);
3071 BUG_ON(this_cpu(percpu_mm_info).deferred_ops);
3072 process_deferred_ops(); /* only to clear foreigndom */
3074 return rc;
3079 /*************************
3080 * Descriptor Tables
3081 */
3083 void destroy_gdt(struct vcpu *v)
3085 int i;
3086 unsigned long pfn;
3088 v->arch.guest_context.gdt_ents = 0;
3089 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
3091 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
3092 put_page_and_type(mfn_to_page(pfn));
3093 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
3094 v->arch.guest_context.gdt_frames[i] = 0;
3099 long set_gdt(struct vcpu *v,
3100 unsigned long *frames,
3101 unsigned int entries)
3103 struct domain *d = v->domain;
3104 /* NB. There are 512 8-byte entries per GDT page. */
3105 int i, nr_pages = (entries + 511) / 512;
3106 unsigned long mfn;
3108 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3109 return -EINVAL;
3111 /* Check the pages in the new GDT. */
3112 for ( i = 0; i < nr_pages; i++ )
3114 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
3115 if ( !mfn_valid(mfn) ||
3116 !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) )
3117 goto fail;
3120 /* Tear down the old GDT. */
3121 destroy_gdt(v);
3123 /* Install the new GDT. */
3124 v->arch.guest_context.gdt_ents = entries;
3125 for ( i = 0; i < nr_pages; i++ )
3127 v->arch.guest_context.gdt_frames[i] = frames[i];
3128 l1e_write(&v->arch.perdomain_ptes[i],
3129 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
3132 return 0;
3134 fail:
3135 while ( i-- > 0 )
3136 put_page_and_type(mfn_to_page(frames[i]));
3137 return -EINVAL;
3141 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
3143 int nr_pages = (entries + 511) / 512;
3144 unsigned long frames[16];
3145 struct vcpu *curr = current;
3146 long ret;
3148 /* Rechecked in set_gdt(), but ensures a sane limit for copy_from_guest(). */
3149 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3150 return -EINVAL;
3152 if ( copy_from_guest(frames, frame_list, nr_pages) )
3153 return -EFAULT;
3155 LOCK_BIGLOCK(curr->domain);
3157 if ( (ret = set_gdt(curr, frames, entries)) == 0 )
3158 flush_tlb_local();
3160 UNLOCK_BIGLOCK(curr->domain);
3162 return ret;
3166 long do_update_descriptor(u64 pa, u64 desc)
3168 struct domain *dom = current->domain;
3169 unsigned long gmfn = pa >> PAGE_SHIFT;
3170 unsigned long mfn;
3171 unsigned int offset;
3172 struct desc_struct *gdt_pent, d;
3173 struct page_info *page;
3174 long ret = -EINVAL;
3176 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
3178 *(u64 *)&d = desc;
3180 mfn = gmfn_to_mfn(dom, gmfn);
3181 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
3182 !mfn_valid(mfn) ||
3183 !check_descriptor(dom, &d) )
3184 return -EINVAL;
3186 page = mfn_to_page(mfn);
3187 if ( unlikely(!get_page(page, dom)) )
3188 return -EINVAL;
3190 /* Check if the given frame is in use in an unsafe context. */
3191 switch ( page->u.inuse.type_info & PGT_type_mask )
3193 case PGT_gdt_page:
3194 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
3195 goto out;
3196 break;
3197 case PGT_ldt_page:
3198 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
3199 goto out;
3200 break;
3201 default:
3202 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
3203 goto out;
3204 break;
3207 paging_mark_dirty(dom, mfn);
3209 /* All is good so make the update. */
3210 gdt_pent = map_domain_page(mfn);
3211 atomic_write64((uint64_t *)&gdt_pent[offset], *(uint64_t *)&d);
3212 unmap_domain_page(gdt_pent);
3214 put_page_type(page);
3216 ret = 0; /* success */
3218 out:
3219 put_page(page);
3221 return ret;
3224 typedef struct e820entry e820entry_t;
3225 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
3227 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
3229 switch ( op )
3231 case XENMEM_add_to_physmap:
3233 struct xen_add_to_physmap xatp;
3234 unsigned long prev_mfn, mfn = 0, gpfn;
3235 struct domain *d;
3237 if ( copy_from_guest(&xatp, arg, 1) )
3238 return -EFAULT;
3240 if ( xatp.domid == DOMID_SELF )
3241 d = rcu_lock_current_domain();
3242 else {
3243 d = rcu_lock_domain_by_id(xatp.domid);
3244 if ( d == NULL )
3245 return -ESRCH;
3246 if ( !IS_PRIV_FOR(current->domain, d) ) {
3247 rcu_unlock_domain(d);
3248 return -EPERM;
3252 if ( xsm_add_to_physmap(current->domain, d) )
3254 rcu_unlock_domain(d);
3255 return -EPERM;
3258 switch ( xatp.space )
3260 case XENMAPSPACE_shared_info:
3261 if ( xatp.idx == 0 )
3262 mfn = virt_to_mfn(d->shared_info);
3263 /* XXX: assumption here: this is called after the E820 table has been
3264 * built, as we need the E820 map to initialize the MTRRs.
3265 */
3266 if ( is_hvm_domain(d) ) {
3267 extern void init_mtrr_in_hyper(struct vcpu *);
3268 struct vcpu *vs;
3269 for_each_vcpu(d, vs)
3270 init_mtrr_in_hyper(vs);
3272 break;
3273 case XENMAPSPACE_grant_table:
3274 spin_lock(&d->grant_table->lock);
3276 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
3277 (xatp.idx < max_nr_grant_frames) )
3278 gnttab_grow_table(d, xatp.idx + 1);
3280 if ( xatp.idx < nr_grant_frames(d->grant_table) )
3281 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
3283 spin_unlock(&d->grant_table->lock);
3284 break;
3285 default:
3286 break;
3289 if ( !paging_mode_translate(d) || (mfn == 0) )
3291 rcu_unlock_domain(d);
3292 return -EINVAL;
3295 LOCK_BIGLOCK(d);
3297 /* Remove previously mapped page if it was present. */
3298 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3299 if ( mfn_valid(prev_mfn) )
3301 if ( is_xen_heap_mfn(prev_mfn) )
3302 /* Xen heap frames are simply unhooked from this phys slot. */
3303 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
3304 else
3305 /* Normal domain memory is freed, to avoid leaking memory. */
3306 guest_remove_page(d, xatp.gpfn);
3309 /* Unmap from old location, if any. */
3310 gpfn = get_gpfn_from_mfn(mfn);
3311 if ( gpfn != INVALID_M2P_ENTRY )
3312 guest_physmap_remove_page(d, gpfn, mfn);
3314 /* Map at new location. */
3315 guest_physmap_add_page(d, xatp.gpfn, mfn);
3317 UNLOCK_BIGLOCK(d);
3319 rcu_unlock_domain(d);
3321 break;
3324 case XENMEM_set_memory_map:
3326 struct xen_foreign_memory_map fmap;
3327 struct domain *d;
3328 int rc;
3330 if ( copy_from_guest(&fmap, arg, 1) )
3331 return -EFAULT;
3333 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
3334 return -EINVAL;
3336 if ( fmap.domid == DOMID_SELF )
3337 d = rcu_lock_current_domain();
3338 else {
3339 d = rcu_lock_domain_by_id(fmap.domid);
3340 if ( d == NULL )
3341 return -ESRCH;
3342 if ( !IS_PRIV_FOR(current->domain, d) ) {
3343 rcu_unlock_domain(d);
3344 return -EPERM;
3348 rc = xsm_domain_memory_map(d);
3349 if ( rc )
3351 rcu_unlock_domain(d);
3352 return rc;
3355 rc = copy_from_guest(d->arch.e820, fmap.map.buffer,
3356 fmap.map.nr_entries) ? -EFAULT : 0;
3357 d->arch.nr_e820 = fmap.map.nr_entries;
3359 rcu_unlock_domain(d);
3360 return rc;
3363 case XENMEM_memory_map:
3365 struct xen_memory_map map;
3366 struct domain *d = current->domain;
3368 /* Backwards compatibility. */
3369 if ( d->arch.nr_e820 == 0 )
3370 return -ENOSYS;
3372 if ( copy_from_guest(&map, arg, 1) )
3373 return -EFAULT;
3375 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
3376 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
3377 copy_to_guest(arg, &map, 1) )
3378 return -EFAULT;
3380 return 0;
3383 case XENMEM_machine_memory_map:
3385 struct xen_memory_map memmap;
3386 XEN_GUEST_HANDLE(e820entry_t) buffer;
3387 int count;
3388 int rc;
3390 if ( !IS_PRIV(current->domain) )
3391 return -EINVAL;
3393 rc = xsm_machine_memory_map();
3394 if ( rc )
3395 return rc;
3397 if ( copy_from_guest(&memmap, arg, 1) )
3398 return -EFAULT;
3399 if ( memmap.nr_entries < e820.nr_map + 1 )
3400 return -EINVAL;
3402 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
3404 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
3405 if ( copy_to_guest(buffer, e820.map, count) < 0 )
3406 return -EFAULT;
3408 memmap.nr_entries = count;
3410 if ( copy_to_guest(arg, &memmap, 1) )
3411 return -EFAULT;
3413 return 0;
3416 case XENMEM_machphys_mapping:
3418 static const struct xen_machphys_mapping mapping = {
3419 .v_start = MACH2PHYS_VIRT_START,
3420 .v_end = MACH2PHYS_VIRT_END,
3421 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
3422 };
3424 if ( copy_to_guest(arg, &mapping, 1) )
3425 return -EFAULT;
3427 return 0;
3430 default:
3431 return subarch_memory_op(op, arg);
3434 return 0;
3438 /*************************
3439 * Writable Pagetables
3440 */
3442 struct ptwr_emulate_ctxt {
3443 struct x86_emulate_ctxt ctxt;
3444 unsigned long cr2;
3445 l1_pgentry_t pte;
3446 };
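/*
 * The callbacks below implement "writable page tables": when a guest
 * writes through a read-only mapping of one of its own L1 page-table
 * pages, ptwr_do_page_fault() (at the end of this section) feeds the
 * faulting instruction to x86_emulate() with these hooks, so the new PTE
 * is validated via get_page_from_l1e() before the update is committed and
 * the displaced entry's reference dropped.
 */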
3448 static int ptwr_emulated_read(
3449 enum x86_segment seg,
3450 unsigned long offset,
3451 unsigned long *val,
3452 unsigned int bytes,
3453 struct x86_emulate_ctxt *ctxt)
3455 unsigned int rc;
3456 unsigned long addr = offset;
3458 *val = 0;
3459 if ( (rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0 )
3461 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
3462 return X86EMUL_EXCEPTION;
3465 return X86EMUL_OKAY;
3468 static int ptwr_emulated_update(
3469 unsigned long addr,
3470 paddr_t old,
3471 paddr_t val,
3472 unsigned int bytes,
3473 unsigned int do_cmpxchg,
3474 struct ptwr_emulate_ctxt *ptwr_ctxt)
3476 unsigned long mfn;
3477 unsigned long unaligned_addr = addr;
3478 struct page_info *page;
3479 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3480 struct vcpu *v = current;
3481 struct domain *d = v->domain;
3483 /* Only allow naturally-aligned stores within the original %cr2 page. */
3484 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
3486 MEM_LOG("Bad ptwr access (cr2=%lx, addr=%lx, bytes=%u)",
3487 ptwr_ctxt->cr2, addr, bytes);
3488 return X86EMUL_UNHANDLEABLE;
3491 /* Turn a sub-word access into a full-word access. */
3492 if ( bytes != sizeof(paddr_t) )
3494 paddr_t full;
3495 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
3497 /* Align address; read full word. */
3498 addr &= ~(sizeof(paddr_t)-1);
3499 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
3501 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
3502 return X86EMUL_EXCEPTION;
3504 /* Mask out bits provided by caller. */
3505 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3506 /* Shift the caller value and OR in the missing bits. */
3507 val &= (((paddr_t)1 << (bytes*8)) - 1);
3508 val <<= (offset)*8;
3509 val |= full;
3510 /* Also fill in missing parts of the cmpxchg old value. */
3511 old &= (((paddr_t)1 << (bytes*8)) - 1);
3512 old <<= (offset)*8;
3513 old |= full;
3516 pte = ptwr_ctxt->pte;
3517 mfn = l1e_get_pfn(pte);
3518 page = mfn_to_page(mfn);
3520 /* We are looking only for read-only mappings of p.t. pages. */
3521 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3522 ASSERT(mfn_valid(mfn));
3523 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3524 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3525 ASSERT(page_get_owner(page) == d);
3527 /* Check the new PTE. */
3528 nl1e = l1e_from_intpte(val);
3529 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3531 if ( (CONFIG_PAGING_LEVELS >= 3) && is_pv_32bit_domain(d) &&
3532 (bytes == 4) && (unaligned_addr & 4) && !do_cmpxchg &&
3533 (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3535 /*
3536 * If this is an upper-half write to a PAE PTE then we assume that
3537 * the guest has simply got the two writes the wrong way round. We
3538 * zap the PRESENT bit on the assumption that the bottom half will
3539 * be written immediately after we return to the guest.
3540 */
3541 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
3542 l1e_get_intpte(nl1e));
3543 l1e_remove_flags(nl1e, _PAGE_PRESENT);
3545 else
3547 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3548 return X86EMUL_UNHANDLEABLE;
3552 adjust_guest_l1e(nl1e, d);
3554 /* Checked successfully: do the update (write or cmpxchg). */
3555 pl1e = map_domain_page(mfn);
3556 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3557 if ( do_cmpxchg )
3559 int okay;
3560 intpte_t t = old;
3561 ol1e = l1e_from_intpte(old);
3563 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
3564 &t, l1e_get_intpte(nl1e), _mfn(mfn));
3565 okay = (okay && t == old);
3567 if ( !okay )
3569 unmap_domain_page(pl1e);
3570 put_page_from_l1e(nl1e, d);
3571 return X86EMUL_CMPXCHG_FAILED;
3574 else
3576 ol1e = *pl1e;
3577 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, 0) )
3578 BUG();
3581 trace_ptwr_emulation(addr, nl1e);
3583 unmap_domain_page(pl1e);
3585 /* Finally, drop the old PTE. */
3586 put_page_from_l1e(ol1e, d);
3588 return X86EMUL_OKAY;
3591 static int ptwr_emulated_write(
3592 enum x86_segment seg,
3593 unsigned long offset,
3594 unsigned long val,
3595 unsigned int bytes,
3596 struct x86_emulate_ctxt *ctxt)
3598 return ptwr_emulated_update(
3599 offset, 0, val, bytes, 0,
3600 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3603 static int ptwr_emulated_cmpxchg(
3604 enum x86_segment seg,
3605 unsigned long offset,
3606 unsigned long old,
3607 unsigned long new,
3608 unsigned int bytes,
3609 struct x86_emulate_ctxt *ctxt)
3611 return ptwr_emulated_update(
3612 offset, old, new, bytes, 1,
3613 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3616 static int ptwr_emulated_cmpxchg8b(
3617 enum x86_segment seg,
3618 unsigned long offset,
3619 unsigned long old,
3620 unsigned long old_hi,
3621 unsigned long new,
3622 unsigned long new_hi,
3623 struct x86_emulate_ctxt *ctxt)
3625 if ( CONFIG_PAGING_LEVELS == 2 )
3626 return X86EMUL_UNHANDLEABLE;
3627 return ptwr_emulated_update(
3628 offset, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1,
3629 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3632 static struct x86_emulate_ops ptwr_emulate_ops = {
3633 .read = ptwr_emulated_read,
3634 .insn_fetch = ptwr_emulated_read,
3635 .write = ptwr_emulated_write,
3636 .cmpxchg = ptwr_emulated_cmpxchg,
3637 .cmpxchg8b = ptwr_emulated_cmpxchg8b
3638 };
3640 /* Write page fault handler: check if guest is trying to modify a PTE. */
3641 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
3642 struct cpu_user_regs *regs)
3644 struct domain *d = v->domain;
3645 struct page_info *page;
3646 l1_pgentry_t pte;
3647 struct ptwr_emulate_ctxt ptwr_ctxt;
3648 int rc;
3650 LOCK_BIGLOCK(d);
3652 /* Attempt to read the PTE that maps the VA being accessed. */
3653 guest_get_eff_l1e(v, addr, &pte);
3654 page = l1e_get_page(pte);
3656 /* We are looking only for read-only mappings of p.t. pages. */
3657 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
3658 !mfn_valid(l1e_get_pfn(pte)) ||
3659 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3660 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3661 (page_get_owner(page) != d) )
3662 goto bail;
3664 ptwr_ctxt.ctxt.regs = regs;
3665 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
3666 is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
3667 ptwr_ctxt.cr2 = addr;
3668 ptwr_ctxt.pte = pte;
3670 rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
3671 if ( rc == X86EMUL_UNHANDLEABLE )
3672 goto bail;
3674 UNLOCK_BIGLOCK(d);
3675 perfc_incr(ptwr_emulations);
3676 return EXCRET_fault_fixed;
3678 bail:
3679 UNLOCK_BIGLOCK(d);
3680 return 0;
3683 void free_xen_pagetable(void *v)
3685 extern int early_boot;
3687 if ( early_boot )
3688 return;
3690 if ( is_xen_heap_page(virt_to_page(v)) )
3691 free_xenheap_page(v);
3692 else
3693 free_domheap_page(virt_to_page(v));
3696 /* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
3697 #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f))
3698 #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))
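/*
 * These conversions only touch _PAGE_PSE for present entries: for example,
 * l1f_to_lNf(PAGE_HYPERVISOR) is PAGE_HYPERVISOR with _PAGE_PSE set,
 * whereas a non-present flags value passes through unchanged, so no
 * spurious PSE bit is ever introduced into an empty mapping.
 */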
3700 /*
3701 * map_pages_to_xen() can be called with interrupts disabled:
3702 * * During early bootstrap; or
3703 * * From alloc_xenheap_pages() via memguard_guard_range().
3704 * In these cases it is safe to use flush_area_local():
3705 * * Because only the local CPU is online; or
3706 * * Because stale TLB entries do not matter for memguard_[un]guard_range().
3707 */
3708 #define flush_area(v,f) (!local_irq_is_enabled() ? \
3709 flush_area_local((const void *)v, f) : \
3710 flush_area_all((const void *)v, f))
3712 int map_pages_to_xen(
3713 unsigned long virt,
3714 unsigned long mfn,
3715 unsigned long nr_mfns,
3716 unsigned int flags)
3718 l2_pgentry_t *pl2e, ol2e;
3719 l1_pgentry_t *pl1e, ol1e;
3720 unsigned int i;
3722 while ( nr_mfns != 0 )
3724 #ifdef __x86_64__
3725 l3_pgentry_t *pl3e = virt_to_xen_l3e(virt);
3726 l3_pgentry_t ol3e = *pl3e;
3728 if ( cpu_has_page1gb &&
3729 !(((virt >> PAGE_SHIFT) | mfn) &
3730 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) &&
3731 nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) &&
3732 !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) )
3734 /* 1GB-page mapping. */
3735 l3e_write_atomic(pl3e, l3e_from_pfn(mfn, l1f_to_lNf(flags)));
3737 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) )
3739 unsigned int flush_flags =
3740 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
3742 if ( l3e_get_flags(ol3e) & _PAGE_PSE )
3744 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
3745 flush_flags |= FLUSH_TLB_GLOBAL;
3746 if ( (l1f_to_lNf(l3e_get_flags(ol3e)) ^ flags) &
3747 PAGE_CACHE_ATTRS )
3748 flush_flags |= FLUSH_CACHE;
3749 flush_area(virt, flush_flags);
3751 else
3753 pl2e = l3e_to_l2e(ol3e);
3754 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
3756 ol2e = pl2e[i];
3757 if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3758 continue;
3759 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
3761 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
3762 flush_flags |= FLUSH_TLB_GLOBAL;
3763 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
3764 PAGE_CACHE_ATTRS )
3765 flush_flags |= FLUSH_CACHE;
3767 else
3769 unsigned int j;
3771 pl1e = l2e_to_l1e(ol2e);
3772 for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ )
3774 ol1e = pl1e[j];
3775 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
3776 flush_flags |= FLUSH_TLB_GLOBAL;
3777 if ( (l1e_get_flags(ol1e) ^ flags) &
3778 PAGE_CACHE_ATTRS )
3779 flush_flags |= FLUSH_CACHE;
3783 flush_area(virt, flush_flags);
3784 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
3786 ol2e = pl2e[i];
3787 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) &&
3788 !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3789 free_xen_pagetable(l2e_to_l1e(ol2e));
3791 free_xen_pagetable(pl2e);
3795 virt += 1UL << L3_PAGETABLE_SHIFT;
3796 mfn += 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
3797 nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
3798 continue;
3801 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) &&
3802 (l3e_get_flags(ol3e) & _PAGE_PSE) )
3804 unsigned int flush_flags =
3805 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
3807 /* Skip this PTE if there is no change. */
3808 if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES *
3809 L1_PAGETABLE_ENTRIES - 1)) +
3810 (l2_table_offset(virt) << PAGETABLE_ORDER) +
3811 l1_table_offset(virt) == mfn) &&
3812 ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
3813 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 )
3815 /* We can skip to end of L3 superpage if we got a match. */
3816 i = (1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) -
3817 (mfn & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
3818 if ( i > nr_mfns )
3819 i = nr_mfns;
3820 virt += i << PAGE_SHIFT;
3821 mfn += i;
3822 nr_mfns -= i;
3823 continue;
3826 pl2e = alloc_xen_pagetable();
3827 if ( pl2e == NULL )
3828 return -ENOMEM;
3830 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
3831 l2e_write(pl2e + i,
3832 l2e_from_pfn(l3e_get_pfn(ol3e) +
3833 (i << PAGETABLE_ORDER),
3834 l3e_get_flags(ol3e)));
3836 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
3837 flush_flags |= FLUSH_TLB_GLOBAL;
3839 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
3840 __PAGE_HYPERVISOR));
3841 flush_area(virt, flush_flags);
3843 #endif
3845 pl2e = virt_to_xen_l2e(virt);
3847 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3848 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
3849 !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
3851 /* Super-page mapping. */
3852 ol2e = *pl2e;
3853 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_lNf(flags)));
3855 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3857 unsigned int flush_flags =
3858 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
3860 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
3862 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
3863 flush_flags |= FLUSH_TLB_GLOBAL;
3864 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
3865 PAGE_CACHE_ATTRS )
3866 flush_flags |= FLUSH_CACHE;
3867 flush_area(virt, flush_flags);
3869 else
3871 pl1e = l2e_to_l1e(ol2e);
3872 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3874 if ( l1e_get_flags(pl1e[i]) & _PAGE_GLOBAL )
3875 flush_flags |= FLUSH_TLB_GLOBAL;
3876 if ( (l1e_get_flags(pl1e[i]) ^ flags) &
3877 PAGE_CACHE_ATTRS )
3878 flush_flags |= FLUSH_CACHE;
3880 flush_area(virt, flush_flags);
3881 free_xen_pagetable(pl1e);
3885 virt += 1UL << L2_PAGETABLE_SHIFT;
3886 mfn += 1UL << PAGETABLE_ORDER;
3887 nr_mfns -= 1UL << PAGETABLE_ORDER;
3889 else
3891 /* Normal page mapping. */
3892 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3894 pl1e = alloc_xen_pagetable();
3895 if ( pl1e == NULL )
3896 return -ENOMEM;
3897 clear_page(pl1e);
3898 l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3899 __PAGE_HYPERVISOR));
3901 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3903 unsigned int flush_flags =
3904 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
3906 /* Skip this PTE if there is no change. */
3907 if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
3908 l1_table_offset(virt)) == mfn) &&
3909 (((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
3910 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
3912 /* We can skip to end of L2 superpage if we got a match. */
3913 i = (1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) -
3914 (mfn & ((1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
3915 if ( i > nr_mfns )
3916 i = nr_mfns;
3917 virt += i << L1_PAGETABLE_SHIFT;
3918 mfn += i;
3919 nr_mfns -= i;
3920 goto check_l3;
3923 pl1e = alloc_xen_pagetable();
3924 if ( pl1e == NULL )
3925 return -ENOMEM;
3927 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3928 l1e_write(&pl1e[i],
3929 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
3930 lNf_to_l1f(l2e_get_flags(*pl2e))));
3932 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
3933 flush_flags |= FLUSH_TLB_GLOBAL;
3935 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3936 __PAGE_HYPERVISOR));
3937 flush_area(virt, flush_flags);
3940 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3941 ol1e = *pl1e;
3942 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
3943 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3945 unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
3946 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
3947 flush_flags |= FLUSH_TLB_GLOBAL;
3948 if ( (l1e_get_flags(ol1e) ^ flags) & PAGE_CACHE_ATTRS )
3949 flush_flags |= FLUSH_CACHE;
3950 flush_area(virt, flush_flags);
3953 virt += 1UL << L1_PAGETABLE_SHIFT;
3954 mfn += 1UL;
3955 nr_mfns -= 1UL;
3957 if ( (flags == PAGE_HYPERVISOR) &&
3958 ((nr_mfns == 0) ||
3959 ((((virt >> PAGE_SHIFT) | mfn) &
3960 ((1 << PAGETABLE_ORDER) - 1)) == 0)) )
3962 unsigned long base_mfn;
3963 pl1e = l2e_to_l1e(*pl2e);
3964 base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
3965 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
3966 if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
3967 (l1e_get_flags(*pl1e) != flags) )
3968 break;
3969 if ( i == L1_PAGETABLE_ENTRIES )
3971 ol2e = *pl2e;
3972 l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
3973 l1f_to_lNf(flags)));
3974 flush_area(virt - PAGE_SIZE,
3975 FLUSH_TLB_GLOBAL |
3976 FLUSH_ORDER(PAGETABLE_ORDER));
3977 free_xen_pagetable(l2e_to_l1e(ol2e));
3982 check_l3: ;
3983 #ifdef __x86_64__
3984 if ( cpu_has_page1gb &&
3985 (flags == PAGE_HYPERVISOR) &&
3986 ((nr_mfns == 0) ||
3987 !(((virt >> PAGE_SHIFT) | mfn) &
3988 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) )
3990 unsigned long base_mfn;
3992 ol3e = *pl3e;
3993 pl2e = l3e_to_l2e(ol3e);
3994 base_mfn = l2e_get_pfn(*pl2e) & ~(L2_PAGETABLE_ENTRIES *
3995 L1_PAGETABLE_ENTRIES - 1);
3996 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
3997 if ( (l2e_get_pfn(*pl2e) !=
3998 (base_mfn + (i << PAGETABLE_ORDER))) ||
3999 (l2e_get_flags(*pl2e) != l1f_to_lNf(flags)) )
4000 break;
4001 if ( i == L2_PAGETABLE_ENTRIES )
4003 l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn,
4004 l1f_to_lNf(flags)));
4005 flush_area(virt - PAGE_SIZE,
4006 FLUSH_TLB_GLOBAL |
4007 FLUSH_ORDER(2*PAGETABLE_ORDER));
4008 free_xen_pagetable(l3e_to_l2e(ol3e));
4011 #endif
4014 return 0;
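/*
 * In summary, each iteration of map_pages_to_xen() above maps the largest
 * unit the alignment and remaining length allow (a 1GB page when
 * cpu_has_page1gb permits, else a 2/4MB superpage, else a single 4kB
 * page), shattering any existing larger mapping that only partially
 * overlaps. For plain PAGE_HYPERVISOR mappings it also re-coalesces a
 * fully populated, contiguous L1 (or L2) table back into a superpage (or
 * 1GB page) once the last entry falls into place.
 */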
4017 void destroy_xen_mappings(unsigned long s, unsigned long e)
4019 l2_pgentry_t *pl2e;
4020 l1_pgentry_t *pl1e;
4021 unsigned int i;
4022 unsigned long v = s;
4024 ASSERT((s & ~PAGE_MASK) == 0);
4025 ASSERT((e & ~PAGE_MASK) == 0);
4027 while ( v < e )
4029 #ifdef __x86_64__
4030 l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
4032 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
4034 v += 1UL << L3_PAGETABLE_SHIFT;
4035 v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
4036 continue;
4039 if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
4041 if ( l2_table_offset(v) == 0 &&
4042 l1_table_offset(v) == 0 &&
4043 ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) )
4045 /* PAGE1GB: whole superpage is destroyed. */
4046 l3e_write_atomic(pl3e, l3e_empty());
4047 v += 1UL << L3_PAGETABLE_SHIFT;
4048 continue;
4051 /* PAGE1GB: shatter the superpage and fall through. */
4052 pl2e = alloc_xen_pagetable();
4053 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4054 l2e_write(pl2e + i,
4055 l2e_from_pfn(l3e_get_pfn(*pl3e) +
4056 (i << PAGETABLE_ORDER),
4057 l3e_get_flags(*pl3e)));
4058 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4059 __PAGE_HYPERVISOR));
4061 #endif
4063 pl2e = virt_to_xen_l2e(v);
4065 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4067 v += 1UL << L2_PAGETABLE_SHIFT;
4068 v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
4069 continue;
4072 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4074 if ( (l1_table_offset(v) == 0) &&
4075 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
4077 /* PSE: whole superpage is destroyed. */
4078 l2e_write_atomic(pl2e, l2e_empty());
4079 v += 1UL << L2_PAGETABLE_SHIFT;
4081 else
4083 /* PSE: shatter the superpage and try again. */
4084 pl1e = alloc_xen_pagetable();
4085 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4086 l1e_write(&pl1e[i],
4087 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4088 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
4089 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4090 __PAGE_HYPERVISOR));
4093 else
4095 /* Ordinary 4kB mapping. */
4096 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
4097 l1e_write_atomic(pl1e, l1e_empty());
4098 v += PAGE_SIZE;
4100 /* If we are done with the L2E, check if it is now empty. */
4101 if ( (v != e) && (l1_table_offset(v) != 0) )
4102 continue;
4103 pl1e = l2e_to_l1e(*pl2e);
4104 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4105 if ( l1e_get_intpte(pl1e[i]) != 0 )
4106 break;
4107 if ( i == L1_PAGETABLE_ENTRIES )
4109 /* Empty: zap the L2E and free the L1 page. */
4110 l2e_write_atomic(pl2e, l2e_empty());
4111 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4112 free_xen_pagetable(pl1e);
4116 #ifdef __x86_64__
4117 /* If we are done with the L3E, check if it is now empty. */
4118 if ( (v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0) )
4119 continue;
4120 pl2e = l3e_to_l2e(*pl3e);
4121 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4122 if ( l2e_get_intpte(pl2e[i]) != 0 )
4123 break;
4124 if ( i == L2_PAGETABLE_ENTRIES )
4126 /* Empty: zap the L3E and free the L2 page. */
4127 l3e_write_atomic(pl3e, l3e_empty());
4128 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4129 free_xen_pagetable(pl2e);
4131 #endif
4134 flush_area(NULL, FLUSH_TLB_GLOBAL);
4137 void __set_fixmap(
4138 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
4140 BUG_ON(idx >= __end_of_fixed_addresses);
4141 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
4144 #ifdef MEMORY_GUARD
4146 void memguard_init(void)
4148 unsigned long start = max_t(unsigned long, xen_phys_start, 1UL << 20);
4149 map_pages_to_xen(
4150 (unsigned long)__va(start),
4151 start >> PAGE_SHIFT,
4152 (xenheap_phys_end - start) >> PAGE_SHIFT,
4153 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4154 #ifdef __x86_64__
4155 BUG_ON(start != xen_phys_start);
4156 map_pages_to_xen(
4157 XEN_VIRT_START,
4158 start >> PAGE_SHIFT,
4159 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
4160 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4161 #endif
4164 static void __memguard_change_range(void *p, unsigned long l, int guard)
4166 unsigned long _p = (unsigned long)p;
4167 unsigned long _l = (unsigned long)l;
4168 unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
4170 /* Ensure we are dealing with a page-aligned whole number of pages. */
4171 ASSERT((_p&~PAGE_MASK) == 0);
4172 ASSERT((_l&~PAGE_MASK) == 0);
4174 if ( guard )
4175 flags &= ~_PAGE_PRESENT;
4177 map_pages_to_xen(
4178 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
4181 void memguard_guard_range(void *p, unsigned long l)
4183 __memguard_change_range(p, l, 1);
4186 void memguard_unguard_range(void *p, unsigned long l)
4188 __memguard_change_range(p, l, 0);
4191 #endif
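/*
 * memguard_guard_stack() below removes the present bit from the single
 * page immediately beneath the primary stack portion of each STACK_SIZE
 * block, so a stack overrun faults at once instead of silently writing
 * past the bottom of the stack.
 */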
4193 void memguard_guard_stack(void *p)
4195 BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
4196 p = (void *)((unsigned long)p + STACK_SIZE -
4197 PRIMARY_STACK_SIZE - PAGE_SIZE);
4198 memguard_guard_range(p, PAGE_SIZE);
4201 /*
4202 * Local variables:
4203 * mode: C
4204 * c-set-style: "BSD"
4205 * c-basic-offset: 4
4206 * tab-width: 4
4207 * indent-tabs-mode: nil
4208 * End:
4209 */