
view xen/arch/x86/mm.c @ 16964:e23144190f93

x86: Fix 16889:60bb765b25b5 in a couple of respects:
1. Leave bottom-most 1MB permanently mapped.
2. ACPI-table mapping code should be aware that mappings above 1MB of
non-RAM are not permanent.
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Jan 29 09:36:37 2008 +0000 (2008-01-29)
parents ed8ab1a36b09
children 9d29141a5e52
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
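
To make the interface described above concrete, here is a minimal guest-side sketch (an editorial illustration, not part of mm.c). It assumes a PV guest environment with the public Xen headers and the usual hypercall wrappers (HYPERVISOR_mmu_update, HYPERVISOR_mmuext_op); pte_maddr, new_pte_val and l1_mfn are hypothetical values already known to the guest.

#include <xen/xen.h>  /* struct mmu_update, struct mmuext_op; path varies by guest */

/* Illustrative only: submit one (ptr, val) update, then pin an L1 table so
 * its type count cannot fall to zero. */
static int example_update_and_pin(uint64_t pte_maddr, uint64_t new_pte_val,
                                  unsigned long l1_mfn)
{
    struct mmu_update req;
    struct mmuext_op pin;
    int done = 0;
    int rc;

    /* "*ptr = val": the low bits of ptr select the update type. */
    req.ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
    req.val = new_pte_val;
    rc = HYPERVISOR_mmu_update(&req, 1, &done, DOMID_SELF);
    if ( rc < 0 )
        return rc;

    /* Pinning is not reference counted: exactly one pin per page. */
    pin.cmd      = MMUEXT_PIN_L1_TABLE;
    pin.arg1.mfn = l1_mfn;
    return HYPERVISOR_mmuext_op(&pin, 1, NULL, DOMID_SELF);
}
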
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <asm/shared.h>
112 #include <public/memory.h>
113 #include <xsm/xsm.h>
114 #include <xen/trace.h>
116 /*
117 * Mapping of first 2 or 4 megabytes of memory. This is mapped with 4kB
118 * mappings to avoid type conflicts with fixed-range MTRRs covering the
119 * lowest megabyte of physical memory. In any case the VGA hole should be
120 * mapped with type UC.
121 */
122 l1_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
123 l1_identmap[L1_PAGETABLE_ENTRIES];
125 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
127 /*
128 * PTE updates can be done with ordinary writes except:
129 * 1. Debug builds get extra checking by using CMPXCHG[8B].
130 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
131 */
132 #if !defined(NDEBUG) || defined(CONFIG_X86_PAE)
133 #define PTE_UPDATE_WITH_CMPXCHG
134 #endif
136 /* Used to defer flushing of memory structures. */
137 struct percpu_mm_info {
138 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
139 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
140 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
141 unsigned int deferred_ops;
142 /* If non-NULL, specifies a foreign subject domain for some operations. */
143 struct domain *foreign;
144 };
145 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
147 /*
148 * Returns the current foreign domain; defaults to the currently-executing
149 * domain if a foreign override hasn't been specified.
150 */
151 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
153 /* Private domain structs for DOMID_XEN and DOMID_IO. */
154 static struct domain *dom_xen, *dom_io;
156 /* Frame table and its size in pages. */
157 struct page_info *frame_table;
158 unsigned long max_page;
159 unsigned long total_pages;
161 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
163 #define l1_disallow_mask(d) \
164 ((d != dom_io) && \
165 (rangeset_is_empty((d)->iomem_caps) && \
166 rangeset_is_empty((d)->arch.ioport_caps)) ? \
167 L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
169 #ifdef CONFIG_COMPAT
170 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
171 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
172 L3_DISALLOW_MASK : \
173 COMPAT_L3_DISALLOW_MASK)
174 #else
175 #define l3_disallow_mask(d) L3_DISALLOW_MASK
176 #endif
178 static void queue_deferred_ops(struct domain *d, unsigned int ops)
179 {
180 ASSERT(d == current->domain);
181 this_cpu(percpu_mm_info).deferred_ops |= ops;
182 }
184 void __init init_frametable(void)
185 {
186 unsigned long nr_pages, page_step, i, mfn;
188 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
190 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
191 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
193 for ( i = 0; i < nr_pages; i += page_step )
194 {
195 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
196 if ( mfn == 0 )
197 panic("Not enough memory for frame table\n");
198 map_pages_to_xen(
199 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
200 mfn, page_step, PAGE_HYPERVISOR);
201 }
203 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
204 }
206 void __init arch_init_memory(void)
207 {
208 extern void subarch_init_memory(void);
210 unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn;
212 /*
213 * Initialise our DOMID_XEN domain.
214 * Any Xen-heap pages that we will allow to be mapped will have
215 * their domain field set to dom_xen.
216 */
217 dom_xen = alloc_domain(DOMID_XEN);
218 BUG_ON(dom_xen == NULL);
220 /*
221 * Initialise our DOMID_IO domain.
222 * This domain owns I/O pages that are within the range of the page_info
223 * array. Mappings occur at the privilege level of the caller.
224 */
225 dom_io = alloc_domain(DOMID_IO);
226 BUG_ON(dom_io == NULL);
228 /* First 1MB of RAM is historically marked as I/O. */
229 for ( i = 0; i < 0x100; i++ )
230 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
232 /* Any areas not specified as RAM by the e820 map are considered I/O. */
233 for ( i = 0, pfn = 0; pfn < max_page; i++ )
234 {
235 while ( (i < e820.nr_map) &&
236 (e820.map[i].type != E820_RAM) &&
237 (e820.map[i].type != E820_UNUSABLE) )
238 i++;
240 if ( i >= e820.nr_map )
241 {
242 /* No more RAM regions: mark as I/O right to end of memory map. */
243 rstart_pfn = rend_pfn = max_page;
244 }
245 else
246 {
247 /* Mark as I/O just up to the next RAM region. */
248 rstart_pfn = min_t(unsigned long, max_page,
249 PFN_UP(e820.map[i].addr));
250 rend_pfn = max_t(unsigned long, rstart_pfn,
251 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
252 }
254 /*
255 * Make sure any Xen mappings of RAM holes above 1MB are blown away.
256 * In particular this ensures that RAM holes are respected even in
257 * the statically-initialised 1-16MB mapping area.
258 */
259 iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT));
260 ioend_pfn = rstart_pfn;
261 #if defined(CONFIG_X86_32)
262 ioend_pfn = min_t(unsigned long, ioend_pfn,
263 DIRECTMAP_MBYTES << (20 - PAGE_SHIFT));
264 #endif
265 if ( iostart_pfn < ioend_pfn )
266 destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn),
267 (unsigned long)mfn_to_virt(ioend_pfn));
269 /* Mark as I/O up to next RAM region. */
270 for ( ; pfn < rstart_pfn; pfn++ )
271 {
272 BUG_ON(!mfn_valid(pfn));
273 share_xen_page_with_guest(
274 mfn_to_page(pfn), dom_io, XENSHARE_writable);
275 }
277 /* Skip the RAM region. */
278 pfn = rend_pfn;
279 }
281 subarch_init_memory();
282 }
284 int memory_is_conventional_ram(paddr_t p)
285 {
286 int i;
288 for ( i = 0; i < e820.nr_map; i++ )
289 {
290 if ( (e820.map[i].type == E820_RAM) &&
291 (e820.map[i].addr <= p) &&
292 (e820.map[i].size > p) )
293 return 1;
294 }
296 return 0;
297 }
299 unsigned long domain_get_maximum_gpfn(struct domain *d)
300 {
301 if ( is_hvm_domain(d) )
302 return d->arch.p2m.max_mapped_pfn;
303 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
304 return arch_get_max_pfn(d) - 1;
305 }
307 void share_xen_page_with_guest(
308 struct page_info *page, struct domain *d, int readonly)
309 {
310 if ( page_get_owner(page) == d )
311 return;
313 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
315 spin_lock(&d->page_alloc_lock);
317 /* The incremented type count pins as writable or read-only. */
318 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
319 page->u.inuse.type_info |= PGT_validated | 1;
321 page_set_owner(page, d);
322 wmb(); /* install valid domain ptr before updating refcnt. */
323 ASSERT(page->count_info == 0);
325 /* Only add to the allocation list if the domain isn't dying. */
326 if ( !d->is_dying )
327 {
328 page->count_info |= PGC_allocated | 1;
329 if ( unlikely(d->xenheap_pages++ == 0) )
330 get_knownalive_domain(d);
331 list_add_tail(&page->list, &d->xenpage_list);
332 }
334 spin_unlock(&d->page_alloc_lock);
335 }
337 void share_xen_page_with_privileged_guests(
338 struct page_info *page, int readonly)
339 {
340 share_xen_page_with_guest(page, dom_xen, readonly);
341 }
343 #if defined(CONFIG_X86_PAE)
345 #ifdef NDEBUG
346 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
347 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
348 #else
349 /*
350 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
351 * We cannot safely shadow the idle page table, nor shadow (v1) page tables
352 * (detected by lack of an owning domain). As required for correctness, we
353 * always shadow PDPTs above 4GB.
354 */
355 #define l3tab_needs_shadow(mfn) \
356 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
357 (page_get_owner(mfn_to_page(mfn)) != NULL) && \
358 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
359 ((mfn) >= 0x100000))
360 #endif
362 static l1_pgentry_t *fix_pae_highmem_pl1e;
364 /* Cache the address of PAE high-memory fixmap page tables. */
365 static int __init cache_pae_fixmap_address(void)
366 {
367 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
368 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
369 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
370 return 0;
371 }
372 __initcall(cache_pae_fixmap_address);
374 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
376 void make_cr3(struct vcpu *v, unsigned long mfn)
377 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
378 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
379 {
380 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
381 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
382 unsigned int cpu = smp_processor_id();
384 /* Fast path: does this mfn need a shadow at all? */
385 if ( !l3tab_needs_shadow(mfn) )
386 {
387 v->arch.cr3 = mfn << PAGE_SHIFT;
388 /* Cache is no longer in use or valid */
389 cache->high_mfn = 0;
390 return;
391 }
393 /* Caching logic is not interrupt safe. */
394 ASSERT(!in_irq());
396 /* Protects against pae_flush_pgd(). */
397 spin_lock(&cache->lock);
399 cache->inuse_idx ^= 1;
400 cache->high_mfn = mfn;
402 /* Map the guest L3 table and copy to the chosen low-memory cache. */
403 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
404 /* First check the previous high mapping can't be in the TLB.
405 * (i.e. have we loaded CR3 since we last did this?) */
406 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
407 flush_tlb_one_local(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
408 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
409 lowmem_l3tab = cache->table[cache->inuse_idx];
410 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
411 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
412 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
414 v->arch.cr3 = __pa(lowmem_l3tab);
416 spin_unlock(&cache->lock);
417 }
419 #else /* !CONFIG_X86_PAE */
421 void make_cr3(struct vcpu *v, unsigned long mfn)
422 {
423 v->arch.cr3 = mfn << PAGE_SHIFT;
424 }
426 #endif /* !CONFIG_X86_PAE */
428 void write_ptbase(struct vcpu *v)
429 {
430 write_cr3(v->arch.cr3);
431 }
433 /*
434 * Should be called after CR3 is updated.
435 *
436 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
437 * for HVM guests, arch.monitor_table and hvm's guest CR3.
438 *
439 * Update ref counts to shadow tables appropriately.
440 */
441 void update_cr3(struct vcpu *v)
442 {
443 unsigned long cr3_mfn=0;
445 if ( paging_mode_enabled(v->domain) )
446 {
447 paging_update_cr3(v);
448 return;
449 }
451 #if CONFIG_PAGING_LEVELS == 4
452 if ( !(v->arch.flags & TF_kernel_mode) )
453 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
454 else
455 #endif
456 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
458 make_cr3(v, cr3_mfn);
459 }
462 static void invalidate_shadow_ldt(struct vcpu *v)
463 {
464 int i;
465 unsigned long pfn;
466 struct page_info *page;
468 if ( v->arch.shadow_ldt_mapcnt == 0 )
469 return;
471 v->arch.shadow_ldt_mapcnt = 0;
473 for ( i = 16; i < 32; i++ )
474 {
475 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
476 if ( pfn == 0 ) continue;
477 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
478 page = mfn_to_page(pfn);
479 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
480 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
481 put_page_and_type(page);
482 }
484 /* Dispose of the (now possibly invalid) mappings from the TLB. */
485 if ( v == current )
486 queue_deferred_ops(v->domain, DOP_FLUSH_TLB | DOP_RELOAD_LDT);
487 else
488 flush_tlb_mask(v->domain->domain_dirty_cpumask);
489 }
492 static int alloc_segdesc_page(struct page_info *page)
493 {
494 struct desc_struct *descs;
495 int i;
497 descs = map_domain_page(page_to_mfn(page));
499 for ( i = 0; i < 512; i++ )
500 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
501 goto fail;
503 unmap_domain_page(descs);
504 return 1;
506 fail:
507 unmap_domain_page(descs);
508 return 0;
509 }
512 /* Map shadow page at offset @off. */
513 int map_ldt_shadow_page(unsigned int off)
514 {
515 struct vcpu *v = current;
516 struct domain *d = v->domain;
517 unsigned long gmfn, mfn;
518 l1_pgentry_t l1e, nl1e;
519 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
520 int okay;
522 BUG_ON(unlikely(in_irq()));
524 guest_get_eff_kern_l1e(v, gva, &l1e);
525 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
526 return 0;
528 gmfn = l1e_get_pfn(l1e);
529 mfn = gmfn_to_mfn(d, gmfn);
530 if ( unlikely(!mfn_valid(mfn)) )
531 return 0;
533 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
534 if ( unlikely(!okay) )
535 return 0;
537 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
539 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
540 v->arch.shadow_ldt_mapcnt++;
542 return 1;
543 }
546 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
547 {
548 struct page_info *page = mfn_to_page(page_nr);
550 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
551 {
552 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
553 return 0;
554 }
556 return 1;
557 }
560 static int get_page_and_type_from_pagenr(unsigned long page_nr,
561 unsigned long type,
562 struct domain *d)
563 {
564 struct page_info *page = mfn_to_page(page_nr);
566 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
567 return 0;
569 if ( unlikely(!get_page_type(page, type)) )
570 {
571 put_page(page);
572 return 0;
573 }
575 return 1;
576 }
578 /*
579 * We allow root tables to map each other (a.k.a. linear page tables). It
580 * needs some special care with reference counts and access permissions:
581 * 1. The mapping entry must be read-only, or the guest may get write access
582 * to its own PTEs.
583 * 2. We must only bump the reference counts for an *already validated*
584 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
585 * on a validation that is required to complete that validation.
586 * 3. We only need to increment the reference counts for the mapped page
587 * frame if it is mapped by a different root table. This is sufficient and
588 * also necessary to allow validation of a root table mapping itself.
589 */
590 #define define_get_linear_pagetable(level) \
591 static int \
592 get_##level##_linear_pagetable( \
593 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
594 { \
595 unsigned long x, y; \
596 struct page_info *page; \
597 unsigned long pfn; \
598 \
599 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
600 { \
601 MEM_LOG("Attempt to create linear p.t. with write perms"); \
602 return 0; \
603 } \
604 \
605 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
606 { \
607 /* Make sure the mapped frame belongs to the correct domain. */ \
608 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
609 return 0; \
610 \
611 /* \
612 * Ensure that the mapped frame is an already-validated page table. \
613 * If so, atomically increment the count (checking for overflow). \
614 */ \
615 page = mfn_to_page(pfn); \
616 y = page->u.inuse.type_info; \
617 do { \
618 x = y; \
619 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
620 unlikely((x & (PGT_type_mask|PGT_validated)) != \
621 (PGT_##level##_page_table|PGT_validated)) ) \
622 { \
623 put_page(page); \
624 return 0; \
625 } \
626 } \
627 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
628 } \
629 \
630 return 1; \
631 }
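
A hedged guest-side sketch of the rules above (editorial, not part of mm.c): the guest points a slot of one of its own, already-validated L2 tables back at that same table, and the entry must not carry the RW bit. l2_maddr, l2_mfn and slot are hypothetical values; the sketch assumes 8-byte entries (PAE/64-bit layout) and spells the flag bits out because the header that defines them varies by guest environment.

/* Illustrative only: install a read-only self-reference (linear page table).
 * The hypervisor accepts this only because the target table is already
 * validated (rule 2) and the entry is not writable (rule 1). */
static int example_install_linear_l2(uint64_t l2_maddr, unsigned long l2_mfn,
                                     unsigned int slot)
{
    struct mmu_update req;
    uint64_t pte = ((uint64_t)l2_mfn << 12) | 0x1; /* present; RW (bit 1) left clear */

    req.ptr = (l2_maddr + slot * sizeof(uint64_t)) | MMU_NORMAL_PT_UPDATE;
    req.val = pte;
    return HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
}
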
634 int is_iomem_page(unsigned long mfn)
635 {
636 return (!mfn_valid(mfn) || (page_get_owner(mfn_to_page(mfn)) == dom_io));
637 }
640 int
641 get_page_from_l1e(
642 l1_pgentry_t l1e, struct domain *d)
643 {
644 unsigned long mfn = l1e_get_pfn(l1e);
645 struct page_info *page = mfn_to_page(mfn);
646 uint32_t l1f = l1e_get_flags(l1e);
647 struct vcpu *curr = current;
648 int okay;
650 if ( !(l1f & _PAGE_PRESENT) )
651 return 1;
653 if ( unlikely(l1f & l1_disallow_mask(d)) )
654 {
655 MEM_LOG("Bad L1 flags %x", l1f & l1_disallow_mask(d));
656 return 0;
657 }
659 if ( is_iomem_page(mfn) )
660 {
661 /* DOMID_IO reverts to caller for privilege checks. */
662 if ( d == dom_io )
663 d = curr->domain;
665 if ( !iomem_access_permitted(d, mfn, mfn) )
666 {
667 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
668 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
669 d->domain_id, mfn);
670 return 0;
671 }
673 return 1;
674 }
676 /* Foreign mappings into guests in shadow external mode don't
677 * contribute to writeable mapping refcounts. (This allows the
678 * qemu-dm helper process in dom0 to map the domain's memory without
679 * messing up the count of "real" writable mappings.) */
680 okay = (((l1f & _PAGE_RW) &&
681 !(unlikely(paging_mode_external(d) && (d != curr->domain))))
682 ? get_page_and_type(page, d, PGT_writable_page)
683 : get_page(page, d));
684 if ( !okay )
685 {
686 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
687 " for dom%d",
688 mfn, get_gpfn_from_mfn(mfn),
689 l1e_get_intpte(l1e), d->domain_id);
690 }
691 else if ( pte_flags_to_cacheattr(l1f) !=
692 ((page->count_info >> PGC_cacheattr_base) & 7) )
693 {
694 uint32_t x, nx, y = page->count_info;
695 uint32_t cacheattr = pte_flags_to_cacheattr(l1f);
697 if ( is_xen_heap_page(page) )
698 {
699 if ( (l1f & _PAGE_RW) &&
700 !(unlikely(paging_mode_external(d) &&
701 (d != curr->domain))) )
702 put_page_type(page);
703 put_page(page);
704 MEM_LOG("Attempt to change cache attributes of Xen heap page");
705 return 0;
706 }
708 while ( ((y >> PGC_cacheattr_base) & 7) != cacheattr )
709 {
710 x = y;
711 nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
712 y = cmpxchg(&page->count_info, x, nx);
713 }
715 #ifdef __x86_64__
716 map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
717 PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
718 #endif
719 }
721 return okay;
722 }
725 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
726 define_get_linear_pagetable(l2);
727 static int
728 get_page_from_l2e(
729 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
730 {
731 int rc;
733 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
734 return 1;
736 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
737 {
738 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
739 return 0;
740 }
742 rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d);
743 if ( unlikely(!rc) )
744 rc = get_l2_linear_pagetable(l2e, pfn, d);
746 return rc;
747 }
750 #if CONFIG_PAGING_LEVELS >= 3
751 define_get_linear_pagetable(l3);
752 static int
753 get_page_from_l3e(
754 l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
755 {
756 int rc;
758 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
759 return 1;
761 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
762 {
763 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
764 return 0;
765 }
767 rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d);
768 if ( unlikely(!rc) )
769 rc = get_l3_linear_pagetable(l3e, pfn, d);
771 return rc;
772 }
773 #endif /* 3 level */
775 #if CONFIG_PAGING_LEVELS >= 4
776 define_get_linear_pagetable(l4);
777 static int
778 get_page_from_l4e(
779 l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
780 {
781 int rc;
783 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
784 return 1;
786 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
787 {
788 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
789 return 0;
790 }
792 rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d);
793 if ( unlikely(!rc) )
794 rc = get_l4_linear_pagetable(l4e, pfn, d);
796 return rc;
797 }
798 #endif /* 4 level */
800 #ifdef __x86_64__
802 #ifdef USER_MAPPINGS_ARE_GLOBAL
803 #define adjust_guest_l1e(pl1e, d) \
804 do { \
805 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
806 likely(!is_pv_32on64_domain(d)) ) \
807 { \
808 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
809 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
810 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
811 MEM_LOG("Global bit is set to kernel page %lx", \
812 l1e_get_pfn((pl1e))); \
813 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
814 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
815 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
816 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
817 } \
818 } while ( 0 )
819 #else
820 #define adjust_guest_l1e(pl1e, d) \
821 do { \
822 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
823 likely(!is_pv_32on64_domain(d)) ) \
824 l1e_add_flags((pl1e), _PAGE_USER); \
825 } while ( 0 )
826 #endif
828 #define adjust_guest_l2e(pl2e, d) \
829 do { \
830 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
831 likely(!is_pv_32on64_domain(d)) ) \
832 l2e_add_flags((pl2e), _PAGE_USER); \
833 } while ( 0 )
835 #define adjust_guest_l3e(pl3e, d) \
836 do { \
837 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
838 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
839 _PAGE_USER : \
840 _PAGE_USER|_PAGE_RW); \
841 } while ( 0 )
843 #define adjust_guest_l4e(pl4e, d) \
844 do { \
845 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
846 likely(!is_pv_32on64_domain(d)) ) \
847 l4e_add_flags((pl4e), _PAGE_USER); \
848 } while ( 0 )
850 #else /* !defined(__x86_64__) */
852 #define adjust_guest_l1e(_p, _d) ((void)(_d))
853 #define adjust_guest_l2e(_p, _d) ((void)(_d))
854 #define adjust_guest_l3e(_p, _d) ((void)(_d))
856 #endif
858 #ifdef CONFIG_COMPAT
859 #define unadjust_guest_l3e(pl3e, d) \
860 do { \
861 if ( unlikely(is_pv_32on64_domain(d)) && \
862 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
863 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
864 } while ( 0 )
865 #else
866 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
867 #endif
869 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
870 {
871 unsigned long pfn = l1e_get_pfn(l1e);
872 struct page_info *page;
873 struct domain *e;
874 struct vcpu *v;
876 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(pfn) )
877 return;
879 page = mfn_to_page(pfn);
881 e = page_get_owner(page);
883 /*
884 * Check if this is a mapping that was established via a grant reference.
885 * If it was then we should not be here: we require that such mappings are
886 * explicitly destroyed via the grant-table interface.
887 *
888 * The upshot of this is that the guest can end up with active grants that
889 * it cannot destroy (because it no longer has a PTE to present to the
890 * grant-table interface). This can lead to subtle hard-to-catch bugs,
891 * hence a special grant PTE flag can be enabled to catch the bug early.
892 *
893 * (Note that the undestroyable active grants are not a security hole in
894 * Xen. All active grants can safely be cleaned up when the domain dies.)
895 */
896 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
897 !d->is_shutting_down && !d->is_dying )
898 {
899 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
900 l1e_get_intpte(l1e));
901 domain_crash(d);
902 }
904 /* Remember we didn't take a type-count of foreign writable mappings
905 * to paging-external domains */
906 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
907 !(unlikely((e != d) && paging_mode_external(e))) )
908 {
909 put_page_and_type(page);
910 }
911 else
912 {
913 /* We expect this is rare so we blow the entire shadow LDT. */
914 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
915 PGT_ldt_page)) &&
916 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
917 (d == e) )
918 {
919 for_each_vcpu ( d, v )
920 invalidate_shadow_ldt(v);
921 }
922 put_page(page);
923 }
924 }
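
As the comment inside put_page_from_l1e() notes, grant mappings must be torn down through the grant-table interface rather than by simply clearing the PTE. A hedged guest-side sketch of that explicit teardown follows (editorial, not part of mm.c); host_addr and handle are assumed to have come from an earlier GNTTABOP_map_grant_ref.

#include <xen/grant_table.h>  /* gnttab_unmap_grant_ref; path varies by guest */

/* Illustrative only: explicitly unmap a granted page via the grant-table
 * interface, which is the teardown path Xen expects. */
static int example_unmap_grant(uint64_t host_addr, grant_handle_t handle)
{
    struct gnttab_unmap_grant_ref unmap;

    unmap.host_addr    = host_addr; /* address the grant was mapped at */
    unmap.dev_bus_addr = 0;         /* no device (IOMMU) mapping to undo */
    unmap.handle       = handle;    /* handle returned by the map operation */

    if ( HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unmap, 1) )
        return -1;

    return unmap.status;            /* GNTST_okay (0) on success */
}
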
927 /*
928 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
929 * Note also that this automatically deals correctly with linear p.t.'s.
930 */
931 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
932 {
933 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
934 (l2e_get_pfn(l2e) != pfn) )
935 put_page_and_type(l2e_get_page(l2e));
936 }
939 #if CONFIG_PAGING_LEVELS >= 3
940 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
941 {
942 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
943 (l3e_get_pfn(l3e) != pfn) )
944 put_page_and_type(l3e_get_page(l3e));
945 }
946 #endif
948 #if CONFIG_PAGING_LEVELS >= 4
949 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
950 {
951 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
952 (l4e_get_pfn(l4e) != pfn) )
953 put_page_and_type(l4e_get_page(l4e));
954 }
955 #endif
957 static int alloc_l1_table(struct page_info *page)
958 {
959 struct domain *d = page_get_owner(page);
960 unsigned long pfn = page_to_mfn(page);
961 l1_pgentry_t *pl1e;
962 int i;
964 pl1e = map_domain_page(pfn);
966 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
967 {
968 if ( is_guest_l1_slot(i) &&
969 unlikely(!get_page_from_l1e(pl1e[i], d)) )
970 goto fail;
972 adjust_guest_l1e(pl1e[i], d);
973 }
975 unmap_domain_page(pl1e);
976 return 1;
978 fail:
979 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
980 while ( i-- > 0 )
981 if ( is_guest_l1_slot(i) )
982 put_page_from_l1e(pl1e[i], d);
984 unmap_domain_page(pl1e);
985 return 0;
986 }
988 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
989 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
990 {
991 struct page_info *page;
992 l2_pgentry_t *pl2e;
993 l3_pgentry_t l3e3;
994 #ifndef CONFIG_COMPAT
995 l2_pgentry_t l2e;
996 int i;
997 #endif
999 if ( !is_pv_32bit_domain(d) )
1000 return 1;
1002 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
1004 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
1005 l3e3 = pl3e[3];
1006 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
1007 {
1008 MEM_LOG("PAE L3 3rd slot is empty");
1009 return 0;
1010 }
1012 /*
1013 * The Xen-private mappings include linear mappings. The L2 thus cannot
1014 * be shared by multiple L3 tables. The test here is adequate because:
1015 * 1. Cannot appear in slots != 3 because get_page_type() checks the
1016 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
1017 * 2. Cannot appear in another page table's L3:
1018 * a. alloc_l3_table() calls this function and this check will fail
1019 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
1020 */
1021 page = l3e_get_page(l3e3);
1022 BUG_ON(page->u.inuse.type_info & PGT_pinned);
1023 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1024 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1025 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1026 {
1027 MEM_LOG("PAE L3 3rd slot is shared");
1028 return 0;
1029 }
1031 /* Xen private mappings. */
1032 pl2e = map_domain_page(l3e_get_pfn(l3e3));
1033 #ifndef CONFIG_COMPAT
1034 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1035 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1036 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1037 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1038 {
1039 l2e = l2e_from_page(
1040 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1041 __PAGE_HYPERVISOR);
1042 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
1043 }
1044 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
1045 {
1046 l2e = l2e_empty();
1047 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
1048 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
1049 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
1050 }
1051 #else
1052 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1053 &compat_idle_pg_table_l2[
1054 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1055 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
1056 #endif
1057 unmap_domain_page(pl2e);
1059 return 1;
1060 }
1061 #else
1062 # define create_pae_xen_mappings(d, pl3e) (1)
1063 #endif
1065 #ifdef CONFIG_X86_PAE
1066 /* Flush a pgdir update into low-memory caches. */
1067 static void pae_flush_pgd(
1068 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
1069 {
1070 struct domain *d = page_get_owner(mfn_to_page(mfn));
1071 struct vcpu *v;
1072 intpte_t _ol3e, _nl3e, _pl3e;
1073 l3_pgentry_t *l3tab_ptr;
1074 struct pae_l3_cache *cache;
1076 if ( unlikely(shadow_mode_enabled(d)) )
1077 {
1078 cpumask_t m = CPU_MASK_NONE;
1079 /* Re-shadow this l3 table on any vcpus that are using it */
1080 for_each_vcpu ( d, v )
1081 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
1082 {
1083 paging_update_cr3(v);
1084 cpus_or(m, m, v->vcpu_dirty_cpumask);
1085 }
1086 flush_tlb_mask(m);
1087 }
1089 /* If below 4GB then the pgdir is not shadowed in low memory. */
1090 if ( !l3tab_needs_shadow(mfn) )
1091 return;
1093 for_each_vcpu ( d, v )
1094 {
1095 cache = &v->arch.pae_l3_cache;
1097 spin_lock(&cache->lock);
1099 if ( cache->high_mfn == mfn )
1100 {
1101 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1102 _ol3e = l3e_get_intpte(*l3tab_ptr);
1103 _nl3e = l3e_get_intpte(nl3e);
1104 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1105 BUG_ON(_pl3e != _ol3e);
1106 }
1108 spin_unlock(&cache->lock);
1109 }
1111 flush_tlb_mask(d->domain_dirty_cpumask);
1112 }
1113 #else
1114 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1115 #endif
1117 static int alloc_l2_table(struct page_info *page, unsigned long type)
1119 struct domain *d = page_get_owner(page);
1120 unsigned long pfn = page_to_mfn(page);
1121 l2_pgentry_t *pl2e;
1122 int i;
1124 pl2e = map_domain_page(pfn);
1126 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1128 if ( is_guest_l2_slot(d, type, i) &&
1129 unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) )
1130 goto fail;
1132 adjust_guest_l2e(pl2e[i], d);
1135 #if CONFIG_PAGING_LEVELS == 2
1136 /* Xen private mappings. */
1137 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1138 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1139 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1140 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1141 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
1142 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1143 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1144 l2e_from_page(
1145 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1146 __PAGE_HYPERVISOR);
1147 #endif
1149 unmap_domain_page(pl2e);
1150 return 1;
1152 fail:
1153 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1154 while ( i-- > 0 )
1155 if ( is_guest_l2_slot(d, type, i) )
1156 put_page_from_l2e(pl2e[i], pfn);
1158 unmap_domain_page(pl2e);
1159 return 0;
1163 #if CONFIG_PAGING_LEVELS >= 3
1164 static int alloc_l3_table(struct page_info *page)
1166 struct domain *d = page_get_owner(page);
1167 unsigned long pfn = page_to_mfn(page);
1168 l3_pgentry_t *pl3e;
1169 int i;
1171 #ifdef CONFIG_X86_PAE
1172 /*
1173 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1174 * the weird 'extended cr3' format for dealing with high-order address
1175 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1176 */
1177 if ( (pfn >= 0x100000) &&
1178 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1179 d->vcpu[0] && d->vcpu[0]->is_initialised )
1181 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1182 return 0;
1184 #endif
1186 pl3e = map_domain_page(pfn);
1188 /*
1189 * PAE guests allocate full pages, but aren't required to initialize
1190 * more than the first four entries; when running in compatibility
1191 * mode, however, the full page is visible to the MMU, and hence all
1192 * 512 entries must be valid/verified, which is most easily achieved
1193 * by clearing them out.
1194 */
1195 if ( is_pv_32on64_domain(d) )
1196 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1198 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1200 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
1201 if ( is_pv_32bit_domain(d) && (i == 3) )
1203 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1204 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ||
1205 !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1206 PGT_l2_page_table |
1207 PGT_pae_xen_l2,
1208 d) )
1209 goto fail;
1211 else
1212 #endif
1213 if ( is_guest_l3_slot(i) &&
1214 unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
1215 goto fail;
1217 adjust_guest_l3e(pl3e[i], d);
1220 if ( !create_pae_xen_mappings(d, pl3e) )
1221 goto fail;
1223 unmap_domain_page(pl3e);
1224 return 1;
1226 fail:
1227 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1228 while ( i-- > 0 )
1229 if ( is_guest_l3_slot(i) )
1230 put_page_from_l3e(pl3e[i], pfn);
1232 unmap_domain_page(pl3e);
1233 return 0;
1235 #else
1236 #define alloc_l3_table(page) (0)
1237 #endif
1239 #if CONFIG_PAGING_LEVELS >= 4
1240 static int alloc_l4_table(struct page_info *page)
1242 struct domain *d = page_get_owner(page);
1243 unsigned long pfn = page_to_mfn(page);
1244 l4_pgentry_t *pl4e = page_to_virt(page);
1245 int i;
1247 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1249 if ( is_guest_l4_slot(d, i) &&
1250 unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
1251 goto fail;
1253 adjust_guest_l4e(pl4e[i], d);
1256 /* Xen private mappings. */
1257 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1258 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1259 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1260 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1261 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1262 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1263 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1264 __PAGE_HYPERVISOR);
1265 if ( is_pv_32on64_domain(d) )
1266 pl4e[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
1267 l4e_from_page(virt_to_page(d->arch.mm_arg_xlat_l3),
1268 __PAGE_HYPERVISOR);
1270 return 1;
1272 fail:
1273 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1274 while ( i-- > 0 )
1275 if ( is_guest_l4_slot(d, i) )
1276 put_page_from_l4e(pl4e[i], pfn);
1278 return 0;
1280 #else
1281 #define alloc_l4_table(page) (0)
1282 #endif
1285 static void free_l1_table(struct page_info *page)
1286 {
1287 struct domain *d = page_get_owner(page);
1288 unsigned long pfn = page_to_mfn(page);
1289 l1_pgentry_t *pl1e;
1290 int i;
1292 pl1e = map_domain_page(pfn);
1294 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1295 if ( is_guest_l1_slot(i) )
1296 put_page_from_l1e(pl1e[i], d);
1298 unmap_domain_page(pl1e);
1299 }
1302 static void free_l2_table(struct page_info *page)
1304 #ifdef CONFIG_COMPAT
1305 struct domain *d = page_get_owner(page);
1306 #endif
1307 unsigned long pfn = page_to_mfn(page);
1308 l2_pgentry_t *pl2e;
1309 int i;
1311 pl2e = map_domain_page(pfn);
1313 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1314 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
1315 put_page_from_l2e(pl2e[i], pfn);
1317 unmap_domain_page(pl2e);
1319 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1323 #if CONFIG_PAGING_LEVELS >= 3
1325 static void free_l3_table(struct page_info *page)
1327 struct domain *d = page_get_owner(page);
1328 unsigned long pfn = page_to_mfn(page);
1329 l3_pgentry_t *pl3e;
1330 int i;
1332 pl3e = map_domain_page(pfn);
1334 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1335 if ( is_guest_l3_slot(i) )
1337 put_page_from_l3e(pl3e[i], pfn);
1338 unadjust_guest_l3e(pl3e[i], d);
1341 unmap_domain_page(pl3e);
1344 #endif
1346 #if CONFIG_PAGING_LEVELS >= 4
1348 static void free_l4_table(struct page_info *page)
1350 struct domain *d = page_get_owner(page);
1351 unsigned long pfn = page_to_mfn(page);
1352 l4_pgentry_t *pl4e = page_to_virt(page);
1353 int i;
1355 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1356 if ( is_guest_l4_slot(d, i) )
1357 put_page_from_l4e(pl4e[i], pfn);
1360 #endif
1363 /* How to write an entry to the guest pagetables.
1364 * Returns 0 for failure (pointer not valid), 1 for success. */
1365 static inline int update_intpte(intpte_t *p,
1366 intpte_t old,
1367 intpte_t new,
1368 unsigned long mfn,
1369 struct vcpu *v,
1370 int preserve_ad)
1372 int rv = 1;
1373 #ifndef PTE_UPDATE_WITH_CMPXCHG
1374 if ( !preserve_ad )
1376 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1378 else
1379 #endif
1381 intpte_t t = old;
1382 for ( ; ; )
1384 intpte_t _new = new;
1385 if ( preserve_ad )
1386 _new |= old & (_PAGE_ACCESSED | _PAGE_DIRTY);
1388 rv = paging_cmpxchg_guest_entry(v, p, &t, _new, _mfn(mfn));
1389 if ( unlikely(rv == 0) )
1391 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1392 ": saw %" PRIpte, old, _new, t);
1393 break;
1396 if ( t == old )
1397 break;
1399 /* Allowed to change in Accessed/Dirty flags only. */
1400 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1402 old = t;
1405 return rv;
1408 /* Macro that wraps the appropriate type-changes around update_intpte().
1409 * Arguments are: type, ptr, old, new, mfn, vcpu */
1410 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad) \
1411 update_intpte(&_t ## e_get_intpte(*(_p)), \
1412 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1413 (_m), (_v), (_ad))
1415 /* Update the L1 entry at pl1e to new value nl1e. */
1416 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1417 unsigned long gl1mfn, int preserve_ad)
1419 l1_pgentry_t ol1e;
1420 struct vcpu *curr = current;
1421 struct domain *d = curr->domain;
1422 unsigned long mfn;
1424 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1425 return 0;
1427 if ( unlikely(paging_mode_refcounts(d)) )
1428 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, preserve_ad);
1430 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1432 /* Translate foreign guest addresses. */
1433 mfn = gmfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e));
1434 if ( unlikely(mfn == INVALID_MFN) )
1435 return 0;
1436 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1437 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1439 if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(d)) )
1441 MEM_LOG("Bad L1 flags %x",
1442 l1e_get_flags(nl1e) & l1_disallow_mask(d));
1443 return 0;
1446 adjust_guest_l1e(nl1e, d);
1448 /* Fast path for identical mapping, r/w and presence. */
1449 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1450 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1451 preserve_ad);
1453 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1454 return 0;
1456 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1457 preserve_ad)) )
1459 put_page_from_l1e(nl1e, d);
1460 return 0;
1463 else
1465 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1466 preserve_ad)) )
1467 return 0;
1470 put_page_from_l1e(ol1e, d);
1471 return 1;
1475 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1476 static int mod_l2_entry(l2_pgentry_t *pl2e,
1477 l2_pgentry_t nl2e,
1478 unsigned long pfn,
1479 unsigned long type,
1480 int preserve_ad)
1482 l2_pgentry_t ol2e;
1483 struct vcpu *curr = current;
1484 struct domain *d = curr->domain;
1486 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1488 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1489 return 0;
1492 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1493 return 0;
1495 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1497 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1499 MEM_LOG("Bad L2 flags %x",
1500 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1501 return 0;
1504 adjust_guest_l2e(nl2e, d);
1506 /* Fast path for identical mapping and presence. */
1507 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1508 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr, preserve_ad);
1510 if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) )
1511 return 0;
1513 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
1514 preserve_ad)) )
1516 put_page_from_l2e(nl2e, pfn);
1517 return 0;
1520 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
1521 preserve_ad)) )
1523 return 0;
1526 put_page_from_l2e(ol2e, pfn);
1527 return 1;
1530 #if CONFIG_PAGING_LEVELS >= 3
1532 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1533 static int mod_l3_entry(l3_pgentry_t *pl3e,
1534 l3_pgentry_t nl3e,
1535 unsigned long pfn,
1536 int preserve_ad)
1538 l3_pgentry_t ol3e;
1539 struct vcpu *curr = current;
1540 struct domain *d = curr->domain;
1541 int okay;
1543 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1545 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1546 return 0;
1549 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
1550 /*
1551 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1552 * would be a pain to ensure they remain continuously valid throughout.
1553 */
1554 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1555 return 0;
1556 #endif
1558 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1559 return 0;
1561 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1563 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1565 MEM_LOG("Bad L3 flags %x",
1566 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1567 return 0;
1570 adjust_guest_l3e(nl3e, d);
1572 /* Fast path for identical mapping and presence. */
1573 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1574 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad);
1576 if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) )
1577 return 0;
1579 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
1580 preserve_ad)) )
1582 put_page_from_l3e(nl3e, pfn);
1583 return 0;
1586 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
1587 preserve_ad)) )
1589 return 0;
1592 okay = create_pae_xen_mappings(d, pl3e);
1593 BUG_ON(!okay);
1595 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1597 put_page_from_l3e(ol3e, pfn);
1598 return 1;
1601 #endif
1603 #if CONFIG_PAGING_LEVELS >= 4
1605 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1606 static int mod_l4_entry(l4_pgentry_t *pl4e,
1607 l4_pgentry_t nl4e,
1608 unsigned long pfn,
1609 int preserve_ad)
1611 struct vcpu *curr = current;
1612 struct domain *d = curr->domain;
1613 l4_pgentry_t ol4e;
1615 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1617 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1618 return 0;
1621 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1622 return 0;
1624 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1626 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1628 MEM_LOG("Bad L4 flags %x",
1629 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1630 return 0;
1633 adjust_guest_l4e(nl4e, d);
1635 /* Fast path for identical mapping and presence. */
1636 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1637 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad);
1639 if ( unlikely(!get_page_from_l4e(nl4e, pfn, d)) )
1640 return 0;
1642 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
1643 preserve_ad)) )
1645 put_page_from_l4e(nl4e, pfn);
1646 return 0;
1649 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
1650 preserve_ad)) )
1652 return 0;
1655 put_page_from_l4e(ol4e, pfn);
1656 return 1;
1659 #endif
1661 void put_page(struct page_info *page)
1662 {
1663 u32 nx, x, y = page->count_info;
1665 do {
1666 x = y;
1667 nx = x - 1;
1668 }
1669 while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) );
1671 if ( unlikely((nx & PGC_count_mask) == 0) )
1672 {
1673 cleanup_page_cacheattr(page);
1674 free_domheap_page(page);
1675 }
1676 }
1679 int get_page(struct page_info *page, struct domain *domain)
1680 {
1681 u32 x, nx, y = page->count_info;
1682 u32 d, nd = page->u.inuse._domain;
1683 u32 _domain = pickle_domptr(domain);
1685 do {
1686 x = y;
1687 nx = x + 1;
1688 d = nd;
1689 if ( unlikely((x & PGC_count_mask) == 0) || /* Not allocated? */
1690 unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
1691 unlikely(d != _domain) ) /* Wrong owner? */
1692 {
1693 if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
1694 gdprintk(XENLOG_INFO,
1695 "Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%"
1696 PRtype_info "\n",
1697 page_to_mfn(page), domain, unpickle_domptr(d),
1698 x, page->u.inuse.type_info);
1699 return 0;
1700 }
1701 asm volatile (
1702 LOCK_PREFIX "cmpxchg8b %3"
1703 : "=d" (nd), "=a" (y), "=c" (d),
1704 "=m" (*(volatile u64 *)(&page->count_info))
1705 : "0" (d), "1" (x), "c" (d), "b" (nx) );
1706 }
1707 while ( unlikely(nd != d) || unlikely(y != x) );
1709 return 1;
1710 }
1713 static int alloc_page_type(struct page_info *page, unsigned long type)
1715 struct domain *owner = page_get_owner(page);
1717 /* A page table is dirtied when its type count becomes non-zero. */
1718 if ( likely(owner != NULL) )
1719 paging_mark_dirty(owner, page_to_mfn(page));
1721 switch ( type & PGT_type_mask )
1723 case PGT_l1_page_table:
1724 return alloc_l1_table(page);
1725 case PGT_l2_page_table:
1726 return alloc_l2_table(page, type);
1727 case PGT_l3_page_table:
1728 return alloc_l3_table(page);
1729 case PGT_l4_page_table:
1730 return alloc_l4_table(page);
1731 case PGT_gdt_page:
1732 case PGT_ldt_page:
1733 return alloc_segdesc_page(page);
1734 default:
1735 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1736 type, page->u.inuse.type_info,
1737 page->count_info);
1738 BUG();
1741 return 0;
1745 void free_page_type(struct page_info *page, unsigned long type)
1747 struct domain *owner = page_get_owner(page);
1748 unsigned long gmfn;
1750 if ( likely(owner != NULL) )
1752 /*
1753 * We have to flush before the next use of the linear mapping
1754 * (e.g., update_va_mapping()) or we could end up modifying a page
1755 * that is no longer a page table (and hence screw up ref counts).
1756 */
1757 if ( current->domain == owner )
1758 queue_deferred_ops(owner, DOP_FLUSH_ALL_TLBS);
1759 else
1760 flush_tlb_mask(owner->domain_dirty_cpumask);
1762 if ( unlikely(paging_mode_enabled(owner)) )
1764 /* A page table is dirtied when its type count becomes zero. */
1765 paging_mark_dirty(owner, page_to_mfn(page));
1767 if ( shadow_mode_refcounts(owner) )
1768 return;
1770 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1771 ASSERT(VALID_M2P(gmfn));
1772 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
1776 switch ( type & PGT_type_mask )
1778 case PGT_l1_page_table:
1779 free_l1_table(page);
1780 break;
1782 case PGT_l2_page_table:
1783 free_l2_table(page);
1784 break;
1786 #if CONFIG_PAGING_LEVELS >= 3
1787 case PGT_l3_page_table:
1788 free_l3_table(page);
1789 break;
1790 #endif
1792 #if CONFIG_PAGING_LEVELS >= 4
1793 case PGT_l4_page_table:
1794 free_l4_table(page);
1795 break;
1796 #endif
1798 default:
1799 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1800 type, page_to_mfn(page));
1801 BUG();
1806 void put_page_type(struct page_info *page)
1808 unsigned long nx, x, y = page->u.inuse.type_info;
1810 again:
1811 do {
1812 x = y;
1813 nx = x - 1;
1815 ASSERT((x & PGT_count_mask) != 0);
1817 if ( unlikely((nx & PGT_count_mask) == 0) )
1819 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1820 likely(nx & PGT_validated) )
1822 /*
1823 * Page-table pages must be unvalidated when count is zero. The
1824 * 'free' is safe because the refcnt is non-zero and validated
1825 * bit is clear => other ops will spin or fail.
1826 */
1827 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1828 x & ~PGT_validated)) != x) )
1829 goto again;
1830 /* We cleared the 'valid bit' so we do the clean up. */
1831 free_page_type(page, x);
1832 /* Carry on, but with the 'valid bit' now clear. */
1833 x &= ~PGT_validated;
1834 nx &= ~PGT_validated;
1837 /*
1838 * Record TLB information for flush later. We do not stamp page
1839 * tables when running in shadow mode:
1840 * 1. Pointless, since it's the shadow pt's which must be tracked.
1841 * 2. Shadow mode reuses this field for shadowed page tables to
1842 * store flags info -- we don't want to conflict with that.
1843 */
1844 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
1845 (page->count_info & PGC_page_table)) )
1846 page->tlbflush_timestamp = tlbflush_current_time();
1849 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1853 int get_page_type(struct page_info *page, unsigned long type)
1855 unsigned long nx, x, y = page->u.inuse.type_info;
1857 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
1859 again:
1860 do {
1861 x = y;
1862 nx = x + 1;
1863 if ( unlikely((nx & PGT_count_mask) == 0) )
1865 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1866 return 0;
1868 else if ( unlikely((x & PGT_count_mask) == 0) )
1870 struct domain *d = page_get_owner(page);
1872 /* Never allow a shadowed frame to go from type count 0 to 1 */
1873 if ( d && shadow_mode_enabled(d) )
1874 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
1876 ASSERT(!(x & PGT_pae_xen_l2));
1877 if ( (x & PGT_type_mask) != type )
1879 /*
1880 * On type change we check to flush stale TLB entries. This
1881 * may be unnecessary (e.g., page was GDT/LDT) but those
1882 * circumstances should be very rare.
1883 */
1884 cpumask_t mask = d->domain_dirty_cpumask;
1886 /* Don't flush if the timestamp is old enough */
1887 tlbflush_filter(mask, page->tlbflush_timestamp);
1889 if ( unlikely(!cpus_empty(mask)) &&
1890 /* Shadow mode: track only writable pages. */
1891 (!shadow_mode_enabled(page_get_owner(page)) ||
1892 ((nx & PGT_type_mask) == PGT_writable_page)) )
1894 perfc_incr(need_flush_tlb_flush);
1895 flush_tlb_mask(mask);
1898 /* We lose existing type and validity. */
1899 nx &= ~(PGT_type_mask | PGT_validated);
1900 nx |= type;
1902 /* No special validation needed for writable pages. */
1903 /* Page tables and GDT/LDT need to be scanned for validity. */
1904 if ( type == PGT_writable_page )
1905 nx |= PGT_validated;
1908 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
1910 /* Don't log failure if it could be a recursive-mapping attempt. */
1911 if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
1912 (type == PGT_l1_page_table) )
1913 return 0;
1914 if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
1915 (type == PGT_l2_page_table) )
1916 return 0;
1917 if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
1918 (type == PGT_l3_page_table) )
1919 return 0;
1920 MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
1921 "for mfn %lx (pfn %lx)",
1922 x, type, page_to_mfn(page),
1923 get_gpfn_from_mfn(page_to_mfn(page)));
1924 return 0;
1926 else if ( unlikely(!(x & PGT_validated)) )
1928 /* Someone else is updating validation of this page. Wait... */
1929 while ( (y = page->u.inuse.type_info) == x )
1930 cpu_relax();
1931 goto again;
1934 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1936 if ( unlikely(!(nx & PGT_validated)) )
1937 {
1938 /* Try to validate page type; drop the new reference on failure. */
1939 if ( unlikely(!alloc_page_type(page, type)) )
1940 {
1941 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1942 PRtype_info ": caf=%08x taf=%" PRtype_info,
1943 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1944 type, page->count_info, page->u.inuse.type_info);
1945 /* No one else can get a reference. We hold the only ref. */
1946 page->u.inuse.type_info = 0;
1947 return 0;
1948 }
1950 /* No one else is updating simultaneously. */
1951 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1952 }
1954 return 1;
1955 }
1958 void cleanup_page_cacheattr(struct page_info *page)
1959 {
1960 uint32_t cacheattr = (page->count_info >> PGC_cacheattr_base) & 7;
1962 if ( likely(cacheattr == 0) )
1963 return;
1965 page->count_info &= ~PGC_cacheattr_mask;
1967 BUG_ON(is_xen_heap_page(page));
1969 #ifdef __x86_64__
1970 map_pages_to_xen((unsigned long)page_to_virt(page), page_to_mfn(page),
1971 1, PAGE_HYPERVISOR);
1972 #endif
1973 }
1976 int new_guest_cr3(unsigned long mfn)
1978 struct vcpu *v = current;
1979 struct domain *d = v->domain;
1980 int okay;
1981 unsigned long old_base_mfn;
1983 #ifdef CONFIG_COMPAT
1984 if ( is_pv_32on64_domain(d) )
1986 okay = paging_mode_refcounts(d)
1987 ? 0 /* Old code was broken, but what should it be? */
1988 : mod_l4_entry(
1989 __va(pagetable_get_paddr(v->arch.guest_table)),
1990 l4e_from_pfn(
1991 mfn,
1992 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
1993 pagetable_get_pfn(v->arch.guest_table), 0);
1994 if ( unlikely(!okay) )
1996 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
1997 return 0;
2000 invalidate_shadow_ldt(v);
2001 write_ptbase(v);
2003 return 1;
2005 #endif
2006 okay = paging_mode_refcounts(d)
2007 ? get_page_from_pagenr(mfn, d)
2008 : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
2009 if ( unlikely(!okay) )
2011 MEM_LOG("Error while installing new baseptr %lx", mfn);
2012 return 0;
2015 invalidate_shadow_ldt(v);
2017 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2019 v->arch.guest_table = pagetable_from_pfn(mfn);
2020 update_cr3(v);
2022 write_ptbase(v);
2024 if ( likely(old_base_mfn != 0) )
2026 if ( paging_mode_refcounts(d) )
2027 put_page(mfn_to_page(old_base_mfn));
2028 else
2029 put_page_and_type(mfn_to_page(old_base_mfn));
2032 return 1;
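/*
 * Carry out work postponed by the batched MMU hypercalls: any pending
 * local or global TLB flush, a reload of the shadow LDT mapping, and the
 * release of the RCU reference taken on a foreign domain by
 * set_foreigndom().
 */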
2035 static void process_deferred_ops(void)
2037 unsigned int deferred_ops;
2038 struct domain *d = current->domain;
2039 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2041 deferred_ops = info->deferred_ops;
2042 info->deferred_ops = 0;
2044 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
2046 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
2047 flush_tlb_mask(d->domain_dirty_cpumask);
2048 else
2049 flush_tlb_local();
2052 if ( deferred_ops & DOP_RELOAD_LDT )
2053 (void)map_ldt_shadow_page(0);
2055 if ( unlikely(info->foreign != NULL) )
2057 rcu_unlock_domain(info->foreign);
2058 info->foreign = NULL;
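/*
 * Select the domain that the remainder of the current hypercall operates
 * on. DOMID_SELF is the common no-op case; DOMID_IO is open to all
 * callers, DOMID_XEN and explicit domain IDs require the appropriate
 * privilege, and translated (auto-paging) domains may not use foreign
 * mappings at all. The RCU reference taken here is dropped later in
 * process_deferred_ops().
 */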
2062 static int set_foreigndom(domid_t domid)
2064 struct domain *e, *d = current->domain;
2065 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2066 int okay = 1;
2068 ASSERT(info->foreign == NULL);
2070 if ( likely(domid == DOMID_SELF) )
2071 goto out;
2073 if ( unlikely(domid == d->domain_id) )
2075 MEM_LOG("Cannot specify itself as foreign domain");
2076 okay = 0;
2078 else if ( unlikely(paging_mode_translate(d)) )
2080 MEM_LOG("Cannot mix foreign mappings with translated domains");
2081 okay = 0;
2083 else switch ( domid )
2085 case DOMID_IO:
2086 info->foreign = rcu_lock_domain(dom_io);
2087 break;
2088 case DOMID_XEN:
2089 if (!IS_PRIV(d)) {
2090 MEM_LOG("Cannot set foreign dom");
2091 okay = 0;
2092 break;
2094 info->foreign = rcu_lock_domain(dom_xen);
2095 break;
2096 default:
2097 e = rcu_lock_domain_by_id(domid);
2098 if ( e == NULL )
2100 MEM_LOG("Unknown domain '%u'", domid);
2101 okay = 0;
2102 break;
2104 if (!IS_PRIV_FOR(d, e)) {
2105 MEM_LOG("Cannot set foreign dom");
2106 okay = 0;
2107 rcu_unlock_domain(e);
2108 break;
2110 info->foreign = e;
2111 break;
2114 out:
2115 return okay;
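/*
 * Convert a guest-supplied bitmap of vcpu IDs into the set of physical
 * CPUs on which those vcpus may have stale TLB state (the union of their
 * vcpu_dirty_cpumasks), for use as a TLB-flush target.
 */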
2118 static inline cpumask_t vcpumask_to_pcpumask(
2119 struct domain *d, unsigned long vmask)
2121 unsigned int vcpu_id;
2122 cpumask_t pmask = CPU_MASK_NONE;
2123 struct vcpu *v;
2125 while ( vmask != 0 )
2127 vcpu_id = find_first_set_bit(vmask);
2128 vmask &= ~(1UL << vcpu_id);
2129 if ( (vcpu_id < MAX_VIRT_CPUS) &&
2130 ((v = d->vcpu[vcpu_id]) != NULL) )
2131 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
2134 return pmask;
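/*
 * Handler for the mmuext_op hypercall: process a batch of 'count' extended
 * MMU requests (pinning/unpinning page tables, base-pointer switches, TLB
 * and cache flushes, SET_LDT), optionally on behalf of a foreign domain.
 * A minimal guest-side sketch for pinning an L1 table (assuming the usual
 * PV hypercall wrapper, with 'mfn' already naming a valid L1 frame):
 *
 *     struct mmuext_op op;
 *     op.cmd = MMUEXT_PIN_L1_TABLE;
 *     op.arg1.mfn = mfn;
 *     (void)HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
 *
 * Preempted batches are resumed via hypercall continuations, with the count
 * of completed requests returned through 'pdone'.
 */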
2137 int do_mmuext_op(
2138 XEN_GUEST_HANDLE(mmuext_op_t) uops,
2139 unsigned int count,
2140 XEN_GUEST_HANDLE(uint) pdone,
2141 unsigned int foreigndom)
2143 struct mmuext_op op;
2144 int rc = 0, i = 0, okay;
2145 unsigned long mfn = 0, gmfn = 0, type;
2146 unsigned int done = 0;
2147 struct page_info *page;
2148 struct vcpu *v = current;
2149 struct domain *d = v->domain;
2151 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2153 count &= ~MMU_UPDATE_PREEMPTED;
2154 if ( unlikely(!guest_handle_is_null(pdone)) )
2155 (void)copy_from_guest(&done, pdone, 1);
2157 else
2158 perfc_incr(calls_to_mmuext_op);
2160 if ( unlikely(!guest_handle_okay(uops, count)) )
2162 rc = -EFAULT;
2163 goto out;
2166 if ( !set_foreigndom(foreigndom) )
2168 rc = -ESRCH;
2169 goto out;
2172 LOCK_BIGLOCK(d);
2174 for ( i = 0; i < count; i++ )
2176 if ( hypercall_preempt_check() )
2178 rc = hypercall_create_continuation(
2179 __HYPERVISOR_mmuext_op, "hihi",
2180 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2181 break;
2184 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2186 MEM_LOG("Bad __copy_from_guest");
2187 rc = -EFAULT;
2188 break;
2191 okay = 1;
2192 gmfn = op.arg1.mfn;
2193 mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
2194 page = mfn_to_page(mfn);
2196 switch ( op.cmd )
2198 case MMUEXT_PIN_L1_TABLE:
2199 type = PGT_l1_page_table;
2200 goto pin_page;
2202 case MMUEXT_PIN_L2_TABLE:
2203 type = PGT_l2_page_table;
2204 goto pin_page;
2206 case MMUEXT_PIN_L3_TABLE:
2207 type = PGT_l3_page_table;
2208 goto pin_page;
2210 case MMUEXT_PIN_L4_TABLE:
2211 if ( is_pv_32bit_domain(FOREIGNDOM) )
2212 break;
2213 type = PGT_l4_page_table;
2215 pin_page:
2216 rc = xsm_memory_pin_page(d, page);
2217 if ( rc )
2218 break;
2220 /* Ignore pinning of invalid paging levels. */
2221 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2222 break;
2224 if ( paging_mode_refcounts(FOREIGNDOM) )
2225 break;
2227 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
2228 if ( unlikely(!okay) )
2230 MEM_LOG("Error while pinning mfn %lx", mfn);
2231 break;
2234 if ( unlikely(test_and_set_bit(_PGT_pinned,
2235 &page->u.inuse.type_info)) )
2237 MEM_LOG("Mfn %lx already pinned", mfn);
2238 put_page_and_type(page);
2239 okay = 0;
2240 break;
2243 /* A page is dirtied when its pin status is set. */
2244 paging_mark_dirty(d, mfn);
2246 /* We can race domain destruction (domain_relinquish_resources). */
2247 if ( unlikely(this_cpu(percpu_mm_info).foreign != NULL) )
2249 int drop_ref;
2250 spin_lock(&FOREIGNDOM->page_alloc_lock);
2251 drop_ref = (FOREIGNDOM->is_dying &&
2252 test_and_clear_bit(_PGT_pinned,
2253 &page->u.inuse.type_info));
2254 spin_unlock(&FOREIGNDOM->page_alloc_lock);
2255 if ( drop_ref )
2256 put_page_and_type(page);
2259 break;
2261 case MMUEXT_UNPIN_TABLE:
2262 if ( paging_mode_refcounts(d) )
2263 break;
2265 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2267 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2268 mfn, page_get_owner(page));
2270 else if ( likely(test_and_clear_bit(_PGT_pinned,
2271 &page->u.inuse.type_info)) )
2273 put_page_and_type(page);
2274 put_page(page);
2275 /* A page is dirtied when its pin status is cleared. */
2276 paging_mark_dirty(d, mfn);
2278 else
2280 okay = 0;
2281 put_page(page);
2282 MEM_LOG("Mfn %lx not pinned", mfn);
2284 break;
2286 case MMUEXT_NEW_BASEPTR:
2287 okay = new_guest_cr3(mfn);
2288 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2289 break;
2291 #ifdef __x86_64__
2292 case MMUEXT_NEW_USER_BASEPTR: {
2293 unsigned long old_mfn;
2295 if ( mfn != 0 )
2297 if ( paging_mode_refcounts(d) )
2298 okay = get_page_from_pagenr(mfn, d);
2299 else
2300 okay = get_page_and_type_from_pagenr(
2301 mfn, PGT_root_page_table, d);
2302 if ( unlikely(!okay) )
2304 MEM_LOG("Error while installing new mfn %lx", mfn);
2305 break;
2309 old_mfn = pagetable_get_pfn(v->arch.guest_table_user);
2310 v->arch.guest_table_user = pagetable_from_pfn(mfn);
2312 if ( old_mfn != 0 )
2314 if ( paging_mode_refcounts(d) )
2315 put_page(mfn_to_page(old_mfn));
2316 else
2317 put_page_and_type(mfn_to_page(old_mfn));
2320 break;
2322 #endif
2324 case MMUEXT_TLB_FLUSH_LOCAL:
2325 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2326 break;
2328 case MMUEXT_INVLPG_LOCAL:
2329 if ( !paging_mode_enabled(d)
2330 || paging_invlpg(v, op.arg1.linear_addr) != 0 )
2331 flush_tlb_one_local(op.arg1.linear_addr);
2332 break;
2334 case MMUEXT_TLB_FLUSH_MULTI:
2335 case MMUEXT_INVLPG_MULTI:
2337 unsigned long vmask;
2338 cpumask_t pmask;
2339 if ( unlikely(copy_from_guest(&vmask, op.arg2.vcpumask, 1)) )
2341 okay = 0;
2342 break;
2344 pmask = vcpumask_to_pcpumask(d, vmask);
2345 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2346 flush_tlb_mask(pmask);
2347 else
2348 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2349 break;
2352 case MMUEXT_TLB_FLUSH_ALL:
2353 flush_tlb_mask(d->domain_dirty_cpumask);
2354 break;
2356 case MMUEXT_INVLPG_ALL:
2357 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2358 break;
2360 case MMUEXT_FLUSH_CACHE:
2361 if ( unlikely(!cache_flush_permitted(d)) )
2363 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2364 okay = 0;
2366 else
2368 wbinvd();
2370 break;
2372 case MMUEXT_SET_LDT:
2374 unsigned long ptr = op.arg1.linear_addr;
2375 unsigned long ents = op.arg2.nr_ents;
2377 if ( paging_mode_external(d) )
2379 MEM_LOG("ignoring SET_LDT hypercall from external domain");
2380 okay = 0;
2382 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2383 (ents > 8192) ||
2384 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2386 okay = 0;
2387 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2389 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2390 (v->arch.guest_context.ldt_base != ptr) )
2392 invalidate_shadow_ldt(v);
2393 v->arch.guest_context.ldt_base = ptr;
2394 v->arch.guest_context.ldt_ents = ents;
2395 load_LDT(v);
2396 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2397 if ( ents != 0 )
2398 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2400 break;
2403 default:
2404 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2405 rc = -ENOSYS;
2406 okay = 0;
2407 break;
2410 if ( unlikely(!okay) )
2412 rc = rc ? rc : -EINVAL;
2413 break;
2416 guest_handle_add_offset(uops, 1);
2419 process_deferred_ops();
2421 UNLOCK_BIGLOCK(d);
2423 perfc_add(num_mmuext_ops, i);
2425 out:
2426 /* Add incremental work we have done to the @done output parameter. */
2427 if ( unlikely(!guest_handle_is_null(pdone)) )
2429 done += i;
2430 copy_to_guest(pdone, &done, 1);
2433 return rc;
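/*
 * Handler for the mmu_update hypercall: process a batch of (ptr, val)
 * requests. The low bits of 'ptr' encode the command: MMU_NORMAL_PT_UPDATE
 * and MMU_PT_UPDATE_PRESERVE_AD write 'val' into the page-table entry at
 * that address after type validation, while MMU_MACHPHYS_UPDATE sets the
 * M2P entry for frame 'ptr >> PAGE_SHIFT' to 'val'. A minimal guest-side
 * sketch for one PTE write (assuming the usual PV hypercall wrapper, with
 * 'pte_maddr' the machine address of the PTE to modify):
 *
 *     struct mmu_update u;
 *     u.ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
 *     u.val = new_pte_val;
 *     (void)HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
 */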
2436 int do_mmu_update(
2437 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2438 unsigned int count,
2439 XEN_GUEST_HANDLE(uint) pdone,
2440 unsigned int foreigndom)
2442 struct mmu_update req;
2443 void *va;
2444 unsigned long gpfn, gmfn, mfn;
2445 struct page_info *page;
2446 int rc = 0, okay = 1, i = 0;
2447 unsigned int cmd, done = 0;
2448 struct vcpu *v = current;
2449 struct domain *d = v->domain;
2450 unsigned long type_info;
2451 struct domain_mmap_cache mapcache;
2453 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2455 count &= ~MMU_UPDATE_PREEMPTED;
2456 if ( unlikely(!guest_handle_is_null(pdone)) )
2457 (void)copy_from_guest(&done, pdone, 1);
2459 else
2460 perfc_incr(calls_to_mmu_update);
2462 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2464 rc = -EFAULT;
2465 goto out;
2468 if ( !set_foreigndom(foreigndom) )
2470 rc = -ESRCH;
2471 goto out;
2474 domain_mmap_cache_init(&mapcache);
2476 LOCK_BIGLOCK(d);
2478 for ( i = 0; i < count; i++ )
2480 if ( hypercall_preempt_check() )
2482 rc = hypercall_create_continuation(
2483 __HYPERVISOR_mmu_update, "hihi",
2484 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2485 break;
2488 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2490 MEM_LOG("Bad __copy_from_guest");
2491 rc = -EFAULT;
2492 break;
2495 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2496 okay = 0;
2498 switch ( cmd )
2500 /*
2501 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2502 * MMU_PT_UPDATE_PRESERVE_AD: As above, but also preserve (OR in) the
2503 * entry's current accessed/dirty (A/D) bits.
2504 */
2505 case MMU_NORMAL_PT_UPDATE:
2506 case MMU_PT_UPDATE_PRESERVE_AD:
2507 rc = xsm_mmu_normal_update(d, req.val);
2508 if ( rc )
2509 break;
2511 req.ptr -= cmd;
2512 gmfn = req.ptr >> PAGE_SHIFT;
2513 mfn = gmfn_to_mfn(d, gmfn);
2515 if ( unlikely(!get_page_from_pagenr(mfn, d)) )
2517 MEM_LOG("Could not get page for normal update");
2518 break;
2521 va = map_domain_page_with_cache(mfn, &mapcache);
2522 va = (void *)((unsigned long)va +
2523 (unsigned long)(req.ptr & ~PAGE_MASK));
2524 page = mfn_to_page(mfn);
2526 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2528 case PGT_l1_page_table:
2529 case PGT_l2_page_table:
2530 case PGT_l3_page_table:
2531 case PGT_l4_page_table:
2533 if ( paging_mode_refcounts(d) )
2535 MEM_LOG("mmu update on auto-refcounted domain!");
2536 break;
2539 if ( unlikely(!get_page_type(
2540 page, type_info & (PGT_type_mask|PGT_pae_xen_l2))) )
2541 goto not_a_pt;
2543 switch ( type_info & PGT_type_mask )
2545 case PGT_l1_page_table:
2547 l1_pgentry_t l1e = l1e_from_intpte(req.val);
2548 okay = mod_l1_entry(va, l1e, mfn,
2549 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2551 break;
2552 case PGT_l2_page_table:
2554 l2_pgentry_t l2e = l2e_from_intpte(req.val);
2555 okay = mod_l2_entry(va, l2e, mfn, type_info,
2556 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2558 break;
2559 #if CONFIG_PAGING_LEVELS >= 3
2560 case PGT_l3_page_table:
2562 l3_pgentry_t l3e = l3e_from_intpte(req.val);
2563 okay = mod_l3_entry(va, l3e, mfn,
2564 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2566 break;
2567 #endif
2568 #if CONFIG_PAGING_LEVELS >= 4
2569 case PGT_l4_page_table:
2571 l4_pgentry_t l4e = l4e_from_intpte(req.val);
2572 okay = mod_l4_entry(va, l4e, mfn,
2573 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2575 break;
2576 #endif
2579 put_page_type(page);
2581 break;
2583 default:
2584 not_a_pt:
2586 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2587 break;
2589 okay = paging_write_guest_entry(v, va, req.val, _mfn(mfn));
2591 put_page_type(page);
2593 break;
2596 unmap_domain_page_with_cache(va, &mapcache);
2598 put_page(page);
2599 break;
2601 case MMU_MACHPHYS_UPDATE:
2603 mfn = req.ptr >> PAGE_SHIFT;
2604 gpfn = req.val;
2606 rc = xsm_mmu_machphys_update(d, mfn);
2607 if ( rc )
2608 break;
2610 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2612 MEM_LOG("Could not get page for mach->phys update");
2613 break;
2616 if ( unlikely(paging_mode_translate(FOREIGNDOM)) )
2618 MEM_LOG("Mach-phys update on auto-translate guest");
2619 break;
2622 set_gpfn_from_mfn(mfn, gpfn);
2623 okay = 1;
2625 paging_mark_dirty(FOREIGNDOM, mfn);
2627 put_page(mfn_to_page(mfn));
2628 break;
2630 default:
2631 MEM_LOG("Invalid page update command %x", cmd);
2632 rc = -ENOSYS;
2633 okay = 0;
2634 break;
2637 if ( unlikely(!okay) )
2639 rc = rc ? rc : -EINVAL;
2640 break;
2643 guest_handle_add_offset(ureqs, 1);
2646 process_deferred_ops();
2648 UNLOCK_BIGLOCK(d);
2650 domain_mmap_cache_destroy(&mapcache);
2652 perfc_add(num_page_updates, i);
2654 out:
2655 /* Add incremental work we have done to the @done output parameter. */
2656 if ( unlikely(!guest_handle_is_null(pdone)) )
2658 done += i;
2659 copy_to_guest(pdone, &done, 1);
2662 return rc;
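/*
 * Grant-table mapping helper for the GNTMAP_contains_pte case: the guest
 * supplied the machine address of an L1 PTE rather than a virtual address.
 * Verify that the containing frame really is an L1 page table, write the
 * prepared grant PTE into it, and release the entry it replaced.
 */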
2666 static int create_grant_pte_mapping(
2667 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
2669 int rc = GNTST_okay;
2670 void *va;
2671 unsigned long gmfn, mfn;
2672 struct page_info *page;
2673 u32 type;
2674 l1_pgentry_t ol1e;
2675 struct domain *d = v->domain;
2677 ASSERT(spin_is_locked(&d->big_lock));
2679 adjust_guest_l1e(nl1e, d);
2681 gmfn = pte_addr >> PAGE_SHIFT;
2682 mfn = gmfn_to_mfn(d, gmfn);
2684 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2686 MEM_LOG("Could not get page for normal update");
2687 return GNTST_general_error;
2690 va = map_domain_page(mfn);
2691 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
2692 page = mfn_to_page(mfn);
2694 type = page->u.inuse.type_info & PGT_type_mask;
2695 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2697 MEM_LOG("Grant map attempted to update a non-L1 page");
2698 rc = GNTST_general_error;
2699 goto failed;
2702 ol1e = *(l1_pgentry_t *)va;
2703 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) )
2705 put_page_type(page);
2706 rc = GNTST_general_error;
2707 goto failed;
2710 if ( !paging_mode_refcounts(d) )
2711 put_page_from_l1e(ol1e, d);
2713 put_page_type(page);
2715 failed:
2716 unmap_domain_page(va);
2717 put_page(page);
2719 return rc;
2722 static int destroy_grant_pte_mapping(
2723 uint64_t addr, unsigned long frame, struct domain *d)
2725 int rc = GNTST_okay;
2726 void *va;
2727 unsigned long gmfn, mfn;
2728 struct page_info *page;
2729 u32 type;
2730 l1_pgentry_t ol1e;
2732 gmfn = addr >> PAGE_SHIFT;
2733 mfn = gmfn_to_mfn(d, gmfn);
2735 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2737 MEM_LOG("Could not get page for normal update");
2738 return GNTST_general_error;
2741 va = map_domain_page(mfn);
2742 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
2743 page = mfn_to_page(mfn);
2745 type = page->u.inuse.type_info & PGT_type_mask;
2746 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2748 MEM_LOG("Grant map attempted to update a non-L1 page");
2749 rc = GNTST_general_error;
2750 goto failed;
2753 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2755 put_page_type(page);
2756 rc = GNTST_general_error;
2757 goto failed;
2760 /* Check that the virtual address supplied is actually mapped to frame. */
2761 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2763 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
2764 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2765 put_page_type(page);
2766 rc = GNTST_general_error;
2767 goto failed;
2770 /* Delete pagetable entry. */
2771 if ( unlikely(!UPDATE_ENTRY
2772 (l1,
2773 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
2774 d->vcpu[0] /* Change if we go to per-vcpu shadows. */,
2775 0)) )
2777 MEM_LOG("Cannot delete PTE entry at %p", va);
2778 put_page_type(page);
2779 rc = GNTST_general_error;
2780 goto failed;
2783 put_page_type(page);
2785 failed:
2786 unmap_domain_page(va);
2787 put_page(page);
2788 return rc;
2792 static int create_grant_va_mapping(
2793 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
2795 l1_pgentry_t *pl1e, ol1e;
2796 struct domain *d = v->domain;
2797 unsigned long gl1mfn;
2798 int okay;
2800 ASSERT(spin_is_locked(&d->big_lock));
2802 adjust_guest_l1e(nl1e, d);
2804 pl1e = guest_map_l1e(v, va, &gl1mfn);
2805 if ( !pl1e )
2807 MEM_LOG("Could not find L1 PTE for address %lx", va);
2808 return GNTST_general_error;
2810 ol1e = *pl1e;
2811 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0);
2812 guest_unmap_l1e(v, pl1e);
2813 pl1e = NULL;
2815 if ( !okay )
2816 return GNTST_general_error;
2818 if ( !paging_mode_refcounts(d) )
2819 put_page_from_l1e(ol1e, d);
2821 return GNTST_okay;
2824 static int replace_grant_va_mapping(
2825 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
2827 l1_pgentry_t *pl1e, ol1e;
2828 unsigned long gl1mfn;
2829 int rc = 0;
2831 pl1e = guest_map_l1e(v, addr, &gl1mfn);
2832 if ( !pl1e )
2834 MEM_LOG("Could not find L1 PTE for address %lx", addr);
2835 return GNTST_general_error;
2837 ol1e = *pl1e;
2839 /* Check that the virtual address supplied is actually mapped to frame. */
2840 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2842 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2843 l1e_get_pfn(ol1e), addr, frame);
2844 rc = GNTST_general_error;
2845 goto out;
2848 /* Delete pagetable entry. */
2849 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0)) )
2851 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2852 rc = GNTST_general_error;
2853 goto out;
2856 out:
2857 guest_unmap_l1e(v, pl1e);
2858 return rc;
2861 static int destroy_grant_va_mapping(
2862 unsigned long addr, unsigned long frame, struct vcpu *v)
2864 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
2867 int create_grant_host_mapping(uint64_t addr, unsigned long frame,
2868 unsigned int flags, unsigned int cache_flags)
2870 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2872 if ( (flags & GNTMAP_application_map) )
2873 l1e_add_flags(pte,_PAGE_USER);
2874 if ( !(flags & GNTMAP_readonly) )
2875 l1e_add_flags(pte,_PAGE_RW);
2877 l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5));
2879 if ( flags & GNTMAP_contains_pte )
2880 return create_grant_pte_mapping(addr, pte, current);
2881 return create_grant_va_mapping(addr, pte, current);
2884 int replace_grant_host_mapping(
2885 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
2887 struct vcpu *curr = current;
2888 l1_pgentry_t *pl1e, ol1e;
2889 unsigned long gl1mfn;
2890 int rc;
2892 if ( flags & GNTMAP_contains_pte )
2894 if ( !new_addr )
2895 return destroy_grant_pte_mapping(addr, frame, curr->domain);
2897 MEM_LOG("Unsupported grant table operation");
2898 return GNTST_general_error;
2901 if ( !new_addr )
2902 return destroy_grant_va_mapping(addr, frame, curr);
2904 pl1e = guest_map_l1e(curr, new_addr, &gl1mfn);
2905 if ( !pl1e )
2907 MEM_LOG("Could not find L1 PTE for address %lx",
2908 (unsigned long)new_addr);
2909 return GNTST_general_error;
2911 ol1e = *pl1e;
2913 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(),
2914 gl1mfn, curr, 0)) )
2916 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2917 guest_unmap_l1e(curr, pl1e);
2918 return GNTST_general_error;
2921 guest_unmap_l1e(curr, pl1e);
2923 rc = replace_grant_va_mapping(addr, frame, ol1e, curr);
2924 if ( rc && !paging_mode_refcounts(curr->domain) )
2925 put_page_from_l1e(ol1e, curr->domain);
2927 return rc;
2930 int steal_page(
2931 struct domain *d, struct page_info *page, unsigned int memflags)
2933 u32 _d, _nd, x, y;
2935 spin_lock(&d->page_alloc_lock);
2937 /*
2938 * The tricky bit: atomically release ownership while there is just one
2939 * benign reference to the page (PGC_allocated). If that reference
2940 * disappears then the deallocation routine will safely spin.
2941 */
2942 _d = pickle_domptr(d);
2943 _nd = page->u.inuse._domain;
2944 y = page->count_info;
2945 do {
2946 x = y;
2947 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2948 (1 | PGC_allocated)) || unlikely(_nd != _d) )
2950 MEM_LOG("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2951 " caf=%08x, taf=%" PRtype_info "\n",
2952 (void *) page_to_mfn(page),
2953 d, d->domain_id, unpickle_domptr(_nd), x,
2954 page->u.inuse.type_info);
2955 spin_unlock(&d->page_alloc_lock);
2956 return -1;
2958 asm volatile (
2959 LOCK_PREFIX "cmpxchg8b %2"
2960 : "=d" (_nd), "=a" (y),
2961 "=m" (*(volatile u64 *)(&page->count_info))
2962 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2963 } while (unlikely(_nd != _d) || unlikely(y != x));
2965 /*
2966 * Unlink from 'd'. At least one reference remains (now anonymous), so
2967 * no one else is spinning trying to delete this page from 'd'.
2968 */
2969 if ( !(memflags & MEMF_no_refcount) )
2970 d->tot_pages--;
2971 list_del(&page->list);
2973 spin_unlock(&d->page_alloc_lock);
2975 return 0;
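/*
 * Handler for the update_va_mapping hypercall: update the single L1 PTE
 * that maps 'va' in the caller's current address space, then perform the
 * TLB maintenance requested in 'flags' (none, a full flush, or an invlpg,
 * targeting the local CPU, all dirty CPUs, or a vcpu bitmap). A guest-side
 * sketch, assuming the usual PV wrapper and a raw PTE value:
 *
 *     HYPERVISOR_update_va_mapping(va, new_pte_val, UVMF_INVLPG | UVMF_LOCAL);
 */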
2978 int do_update_va_mapping(unsigned long va, u64 val64,
2979 unsigned long flags)
2981 l1_pgentry_t val = l1e_from_intpte(val64);
2982 struct vcpu *v = current;
2983 struct domain *d = v->domain;
2984 l1_pgentry_t *pl1e;
2985 unsigned long vmask, bmap_ptr, gl1mfn;
2986 cpumask_t pmask;
2987 int rc = 0;
2989 perfc_incr(calls_to_update_va);
2991 if ( unlikely(!__addr_ok(va) && !paging_mode_external(d)) )
2992 return -EINVAL;
2994 rc = xsm_update_va_mapping(current->domain, val);
2995 if ( rc )
2996 return rc;
2998 LOCK_BIGLOCK(d);
3000 pl1e = guest_map_l1e(v, va, &gl1mfn);
3002 if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn, 0)) )
3003 rc = -EINVAL;
3005 if ( pl1e )
3006 guest_unmap_l1e(v, pl1e);
3007 pl1e = NULL;
3009 process_deferred_ops();
3011 UNLOCK_BIGLOCK(d);
3013 switch ( flags & UVMF_FLUSHTYPE_MASK )
3015 case UVMF_TLB_FLUSH:
3016 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3018 case UVMF_LOCAL:
3019 flush_tlb_local();
3020 break;
3021 case UVMF_ALL:
3022 flush_tlb_mask(d->domain_dirty_cpumask);
3023 break;
3024 default:
3025 if ( unlikely(!is_pv_32on64_domain(d) ?
3026 get_user(vmask, (unsigned long *)bmap_ptr) :
3027 get_user(vmask, (unsigned int *)bmap_ptr)) )
3028 rc = -EFAULT;
3029 pmask = vcpumask_to_pcpumask(d, vmask);
3030 flush_tlb_mask(pmask);
3031 break;
3033 break;
3035 case UVMF_INVLPG:
3036 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3038 case UVMF_LOCAL:
3039 if ( !paging_mode_enabled(d) ||
3040 (paging_invlpg(v, va) != 0) )
3041 flush_tlb_one_local(va);
3042 break;
3043 case UVMF_ALL:
3044 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
3045 break;
3046 default:
3047 if ( unlikely(!is_pv_32on64_domain(d) ?
3048 get_user(vmask, (unsigned long *)bmap_ptr) :
3049 get_user(vmask, (unsigned int *)bmap_ptr)) )
3050 rc = -EFAULT;
3051 pmask = vcpumask_to_pcpumask(d, vmask);
3052 flush_tlb_one_mask(pmask, va);
3053 break;
3055 break;
3058 return rc;
3061 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
3062 unsigned long flags,
3063 domid_t domid)
3065 int rc;
3067 if ( !set_foreigndom(domid) )
3068 return -ESRCH;
3070 rc = do_update_va_mapping(va, val64, flags);
3072 BUG_ON(this_cpu(percpu_mm_info).deferred_ops);
3073 process_deferred_ops(); /* only to clear foreigndom */
3075 return rc;
3080 /*************************
3081 * Descriptor Tables
3082 */
3084 void destroy_gdt(struct vcpu *v)
3086 int i;
3087 unsigned long pfn;
3089 v->arch.guest_context.gdt_ents = 0;
3090 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
3092 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
3093 put_page_and_type(mfn_to_page(pfn));
3094 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
3095 v->arch.guest_context.gdt_frames[i] = 0;
3100 long set_gdt(struct vcpu *v,
3101 unsigned long *frames,
3102 unsigned int entries)
3104 struct domain *d = v->domain;
3105 /* NB. There are 512 8-byte entries per GDT page. */
3106 int i, nr_pages = (entries + 511) / 512;
3107 unsigned long mfn;
3109 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3110 return -EINVAL;
3112 /* Check the pages in the new GDT. */
3113 for ( i = 0; i < nr_pages; i++ )
3115 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
3116 if ( !mfn_valid(mfn) ||
3117 !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) )
3118 goto fail;
3121 /* Tear down the old GDT. */
3122 destroy_gdt(v);
3124 /* Install the new GDT. */
3125 v->arch.guest_context.gdt_ents = entries;
3126 for ( i = 0; i < nr_pages; i++ )
3128 v->arch.guest_context.gdt_frames[i] = frames[i];
3129 l1e_write(&v->arch.perdomain_ptes[i],
3130 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
3133 return 0;
3135 fail:
3136 while ( i-- > 0 )
3137 put_page_and_type(mfn_to_page(frames[i]));
3138 return -EINVAL;
3142 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
3144 int nr_pages = (entries + 511) / 512;
3145 unsigned long frames[16];
3146 struct vcpu *curr = current;
3147 long ret;
3149 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
3150 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3151 return -EINVAL;
3153 if ( copy_from_guest(frames, frame_list, nr_pages) )
3154 return -EFAULT;
3156 LOCK_BIGLOCK(curr->domain);
3158 if ( (ret = set_gdt(curr, frames, entries)) == 0 )
3159 flush_tlb_local();
3161 UNLOCK_BIGLOCK(curr->domain);
3163 return ret;
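/*
 * Handler for the update_descriptor hypercall: validate a single 8-byte
 * GDT/LDT descriptor with check_descriptor() and, provided the target frame
 * is a GDT page, an LDT page, or ordinary writable memory owned by the
 * caller, write it atomically at the given descriptor address.
 */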
3167 long do_update_descriptor(u64 pa, u64 desc)
3169 struct domain *dom = current->domain;
3170 unsigned long gmfn = pa >> PAGE_SHIFT;
3171 unsigned long mfn;
3172 unsigned int offset;
3173 struct desc_struct *gdt_pent, d;
3174 struct page_info *page;
3175 long ret = -EINVAL;
3177 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
3179 *(u64 *)&d = desc;
3181 mfn = gmfn_to_mfn(dom, gmfn);
3182 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
3183 !mfn_valid(mfn) ||
3184 !check_descriptor(dom, &d) )
3185 return -EINVAL;
3187 page = mfn_to_page(mfn);
3188 if ( unlikely(!get_page(page, dom)) )
3189 return -EINVAL;
3191 /* Check if the given frame is in use in an unsafe context. */
3192 switch ( page->u.inuse.type_info & PGT_type_mask )
3194 case PGT_gdt_page:
3195 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
3196 goto out;
3197 break;
3198 case PGT_ldt_page:
3199 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
3200 goto out;
3201 break;
3202 default:
3203 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
3204 goto out;
3205 break;
3208 paging_mark_dirty(dom, mfn);
3210 /* All is good so make the update. */
3211 gdt_pent = map_domain_page(mfn);
3212 atomic_write64((uint64_t *)&gdt_pent[offset], *(uint64_t *)&d);
3213 unmap_domain_page(gdt_pent);
3215 put_page_type(page);
3217 ret = 0; /* success */
3219 out:
3220 put_page(page);
3222 return ret;
3225 typedef struct e820entry e820entry_t;
3226 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
3228 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
3230 switch ( op )
3232 case XENMEM_add_to_physmap:
3234 struct xen_add_to_physmap xatp;
3235 unsigned long prev_mfn, mfn = 0, gpfn;
3236 struct domain *d;
3238 if ( copy_from_guest(&xatp, arg, 1) )
3239 return -EFAULT;
3241 if ( xatp.domid == DOMID_SELF )
3242 d = rcu_lock_current_domain();
3243 else {
3244 d = rcu_lock_domain_by_id(xatp.domid);
3245 if ( d == NULL )
3246 return -ESRCH;
3247 if ( !IS_PRIV_FOR(current->domain, d) ) {
3248 rcu_unlock_domain(d);
3249 return -EPERM;
3253 if ( xsm_add_to_physmap(current->domain, d) )
3255 rcu_unlock_domain(d);
3256 return -EPERM;
3259 switch ( xatp.space )
3261 case XENMAPSPACE_shared_info:
3262 if ( xatp.idx == 0 )
3263 mfn = virt_to_mfn(d->shared_info);
3264 /* XXX: assumes this is called after the E820 table has been built,
3265 * as the E820 map is needed to initialize the MTRRs.
3266 */
3267 if ( is_hvm_domain(d) ) {
3268 extern void init_mtrr_in_hyper(struct vcpu *);
3269 struct vcpu *vs;
3270 for_each_vcpu(d, vs)
3271 init_mtrr_in_hyper(vs);
3273 break;
3274 case XENMAPSPACE_grant_table:
3275 spin_lock(&d->grant_table->lock);
3277 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
3278 (xatp.idx < max_nr_grant_frames) )
3279 gnttab_grow_table(d, xatp.idx + 1);
3281 if ( xatp.idx < nr_grant_frames(d->grant_table) )
3282 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
3284 spin_unlock(&d->grant_table->lock);
3285 break;
3286 default:
3287 break;
3290 if ( !paging_mode_translate(d) || (mfn == 0) )
3292 rcu_unlock_domain(d);
3293 return -EINVAL;
3296 LOCK_BIGLOCK(d);
3298 /* Remove previously mapped page if it was present. */
3299 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3300 if ( mfn_valid(prev_mfn) )
3302 if ( is_xen_heap_mfn(prev_mfn) )
3303 /* Xen heap frames are simply unhooked from this phys slot. */
3304 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
3305 else
3306 /* Normal domain memory is freed, to avoid leaking memory. */
3307 guest_remove_page(d, xatp.gpfn);
3310 /* Unmap from old location, if any. */
3311 gpfn = get_gpfn_from_mfn(mfn);
3312 if ( gpfn != INVALID_M2P_ENTRY )
3313 guest_physmap_remove_page(d, gpfn, mfn);
3315 /* Map at new location. */
3316 guest_physmap_add_page(d, xatp.gpfn, mfn);
3318 UNLOCK_BIGLOCK(d);
3320 rcu_unlock_domain(d);
3322 break;
3325 case XENMEM_set_memory_map:
3327 struct xen_foreign_memory_map fmap;
3328 struct domain *d;
3329 int rc;
3331 if ( copy_from_guest(&fmap, arg, 1) )
3332 return -EFAULT;
3334 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
3335 return -EINVAL;
3337 if ( fmap.domid == DOMID_SELF )
3338 d = rcu_lock_current_domain();
3339 else {
3340 d = rcu_lock_domain_by_id(fmap.domid);
3341 if ( d == NULL )
3342 return -ESRCH;
3343 if ( !IS_PRIV_FOR(current->domain, d) ) {
3344 rcu_unlock_domain(d);
3345 return -EPERM;
3349 rc = xsm_domain_memory_map(d);
3350 if ( rc )
3352 rcu_unlock_domain(d);
3353 return rc;
3356 rc = copy_from_guest(d->arch.e820, fmap.map.buffer,
3357 fmap.map.nr_entries) ? -EFAULT : 0;
3358 d->arch.nr_e820 = fmap.map.nr_entries;
3360 rcu_unlock_domain(d);
3361 return rc;
3364 case XENMEM_memory_map:
3366 struct xen_memory_map map;
3367 struct domain *d = current->domain;
3369 /* Backwards compatibility. */
3370 if ( d->arch.nr_e820 == 0 )
3371 return -ENOSYS;
3373 if ( copy_from_guest(&map, arg, 1) )
3374 return -EFAULT;
3376 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
3377 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
3378 copy_to_guest(arg, &map, 1) )
3379 return -EFAULT;
3381 return 0;
3384 case XENMEM_machine_memory_map:
3386 struct xen_memory_map memmap;
3387 XEN_GUEST_HANDLE(e820entry_t) buffer;
3388 int count;
3389 int rc;
3391 if ( !IS_PRIV(current->domain) )
3392 return -EINVAL;
3394 rc = xsm_machine_memory_map();
3395 if ( rc )
3396 return rc;
3398 if ( copy_from_guest(&memmap, arg, 1) )
3399 return -EFAULT;
3400 if ( memmap.nr_entries < e820.nr_map + 1 )
3401 return -EINVAL;
3403 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
3405 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
3406 if ( copy_to_guest(buffer, e820.map, count) < 0 )
3407 return -EFAULT;
3409 memmap.nr_entries = count;
3411 if ( copy_to_guest(arg, &memmap, 1) )
3412 return -EFAULT;
3414 return 0;
3417 case XENMEM_machphys_mapping:
3419 static const struct xen_machphys_mapping mapping = {
3420 .v_start = MACH2PHYS_VIRT_START,
3421 .v_end = MACH2PHYS_VIRT_END,
3422 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
3423 };
3425 if ( copy_to_guest(arg, &mapping, 1) )
3426 return -EFAULT;
3428 return 0;
3431 default:
3432 return subarch_memory_op(op, arg);
3435 return 0;
3439 /*************************
3440 * Writable Pagetables
3441 */
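/*
 * PV page-table pages are mapped read-only, so a guest store to one of them
 * faults into ptwr_do_page_fault() below. The faulting instruction is
 * decoded and emulated with x86_emulate(), and the emulated store is
 * validated like any other PTE update (get_page_from_l1e() on the new
 * entry, a reference drop on the old one). The guest thus sees ordinary
 * memory writes while Xen retains full control over page-table contents.
 */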
3443 struct ptwr_emulate_ctxt {
3444 struct x86_emulate_ctxt ctxt;
3445 unsigned long cr2;
3446 l1_pgentry_t pte;
3447 };
3449 static int ptwr_emulated_read(
3450 enum x86_segment seg,
3451 unsigned long offset,
3452 unsigned long *val,
3453 unsigned int bytes,
3454 struct x86_emulate_ctxt *ctxt)
3456 unsigned int rc;
3457 unsigned long addr = offset;
3459 *val = 0;
3460 if ( (rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0 )
3462 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
3463 return X86EMUL_EXCEPTION;
3466 return X86EMUL_OKAY;
3469 static int ptwr_emulated_update(
3470 unsigned long addr,
3471 paddr_t old,
3472 paddr_t val,
3473 unsigned int bytes,
3474 unsigned int do_cmpxchg,
3475 struct ptwr_emulate_ctxt *ptwr_ctxt)
3477 unsigned long mfn;
3478 unsigned long unaligned_addr = addr;
3479 struct page_info *page;
3480 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3481 struct vcpu *v = current;
3482 struct domain *d = v->domain;
3484 /* Only allow naturally-aligned stores within the original %cr2 page. */
3485 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
3487 MEM_LOG("Bad ptwr access (cr2=%lx, addr=%lx, bytes=%u)",
3488 ptwr_ctxt->cr2, addr, bytes);
3489 return X86EMUL_UNHANDLEABLE;
3492 /* Turn a sub-word access into a full-word access. */
3493 if ( bytes != sizeof(paddr_t) )
3495 paddr_t full;
3496 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
3498 /* Align address; read full word. */
3499 addr &= ~(sizeof(paddr_t)-1);
3500 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
3502 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
3503 return X86EMUL_EXCEPTION;
3505 /* Mask out bits provided by caller. */
3506 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3507 /* Shift the caller value and OR in the missing bits. */
3508 val &= (((paddr_t)1 << (bytes*8)) - 1);
3509 val <<= (offset)*8;
3510 val |= full;
3511 /* Also fill in missing parts of the cmpxchg old value. */
3512 old &= (((paddr_t)1 << (bytes*8)) - 1);
3513 old <<= (offset)*8;
3514 old |= full;
3517 pte = ptwr_ctxt->pte;
3518 mfn = l1e_get_pfn(pte);
3519 page = mfn_to_page(mfn);
3521 /* We are looking only for read-only mappings of p.t. pages. */
3522 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3523 ASSERT(mfn_valid(mfn));
3524 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3525 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3526 ASSERT(page_get_owner(page) == d);
3528 /* Check the new PTE. */
3529 nl1e = l1e_from_intpte(val);
3530 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3532 if ( (CONFIG_PAGING_LEVELS >= 3) && is_pv_32bit_domain(d) &&
3533 (bytes == 4) && (unaligned_addr & 4) && !do_cmpxchg &&
3534 (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3536 /*
3537 * If this is an upper-half write to a PAE PTE then we assume that
3538 * the guest has simply got the two writes the wrong way round. We
3539 * zap the PRESENT bit on the assumption that the bottom half will
3540 * be written immediately after we return to the guest.
3541 */
3542 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
3543 l1e_get_intpte(nl1e));
3544 l1e_remove_flags(nl1e, _PAGE_PRESENT);
3546 else
3548 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3549 return X86EMUL_UNHANDLEABLE;
3553 adjust_guest_l1e(nl1e, d);
3555 /* Checked successfully: do the update (write or cmpxchg). */
3556 pl1e = map_domain_page(mfn);
3557 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3558 if ( do_cmpxchg )
3560 int okay;
3561 intpte_t t = old;
3562 ol1e = l1e_from_intpte(old);
3564 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
3565 &t, l1e_get_intpte(nl1e), _mfn(mfn));
3566 okay = (okay && t == old);
3568 if ( !okay )
3570 unmap_domain_page(pl1e);
3571 put_page_from_l1e(nl1e, d);
3572 return X86EMUL_CMPXCHG_FAILED;
3575 else
3577 ol1e = *pl1e;
3578 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, 0) )
3579 BUG();
3582 trace_ptwr_emulation(addr, nl1e);
3584 unmap_domain_page(pl1e);
3586 /* Finally, drop the old PTE. */
3587 put_page_from_l1e(ol1e, d);
3589 return X86EMUL_OKAY;
3592 static int ptwr_emulated_write(
3593 enum x86_segment seg,
3594 unsigned long offset,
3595 unsigned long val,
3596 unsigned int bytes,
3597 struct x86_emulate_ctxt *ctxt)
3599 return ptwr_emulated_update(
3600 offset, 0, val, bytes, 0,
3601 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3604 static int ptwr_emulated_cmpxchg(
3605 enum x86_segment seg,
3606 unsigned long offset,
3607 unsigned long old,
3608 unsigned long new,
3609 unsigned int bytes,
3610 struct x86_emulate_ctxt *ctxt)
3612 return ptwr_emulated_update(
3613 offset, old, new, bytes, 1,
3614 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3617 static int ptwr_emulated_cmpxchg8b(
3618 enum x86_segment seg,
3619 unsigned long offset,
3620 unsigned long old,
3621 unsigned long old_hi,
3622 unsigned long new,
3623 unsigned long new_hi,
3624 struct x86_emulate_ctxt *ctxt)
3626 if ( CONFIG_PAGING_LEVELS == 2 )
3627 return X86EMUL_UNHANDLEABLE;
3628 return ptwr_emulated_update(
3629 offset, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1,
3630 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3633 static struct x86_emulate_ops ptwr_emulate_ops = {
3634 .read = ptwr_emulated_read,
3635 .insn_fetch = ptwr_emulated_read,
3636 .write = ptwr_emulated_write,
3637 .cmpxchg = ptwr_emulated_cmpxchg,
3638 .cmpxchg8b = ptwr_emulated_cmpxchg8b
3639 };
3641 /* Write page fault handler: check if guest is trying to modify a PTE. */
3642 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
3643 struct cpu_user_regs *regs)
3645 struct domain *d = v->domain;
3646 struct page_info *page;
3647 l1_pgentry_t pte;
3648 struct ptwr_emulate_ctxt ptwr_ctxt;
3649 int rc;
3651 LOCK_BIGLOCK(d);
3653 /* Attempt to read the PTE that maps the VA being accessed. */
3654 guest_get_eff_l1e(v, addr, &pte);
3655 page = l1e_get_page(pte);
3657 /* We are looking only for read-only mappings of p.t. pages. */
3658 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
3659 !mfn_valid(l1e_get_pfn(pte)) ||
3660 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3661 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3662 (page_get_owner(page) != d) )
3663 goto bail;
3665 ptwr_ctxt.ctxt.regs = regs;
3666 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
3667 is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
3668 ptwr_ctxt.cr2 = addr;
3669 ptwr_ctxt.pte = pte;
3671 rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
3672 if ( rc == X86EMUL_UNHANDLEABLE )
3673 goto bail;
3675 UNLOCK_BIGLOCK(d);
3676 perfc_incr(ptwr_emulations);
3677 return EXCRET_fault_fixed;
3679 bail:
3680 UNLOCK_BIGLOCK(d);
3681 return 0;
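/*
 * Free a page-table page obtained from alloc_xen_pagetable(). Frees are
 * ignored during early boot; afterwards the page is returned to the Xen
 * heap or the domain heap, depending on where it resides.
 */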
3684 void free_xen_pagetable(void *v)
3686 extern int early_boot;
3688 if ( early_boot )
3689 return;
3691 if ( is_xen_heap_page(virt_to_page(v)) )
3692 free_xenheap_page(v);
3693 else
3694 free_domheap_page(virt_to_page(v));
3697 /* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
3698 #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f))
3699 #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))
3701 /*
3702 * map_pages_to_xen() can be called with interrupts disabled:
3703 * * During early bootstrap; or
3704 * * alloc_xenheap_pages() via memguard_guard_range
3705 * In these cases it is safe to use flush_area_local():
3706 * * Because only the local CPU is online; or
3707 * * Because stale TLB entries do not matter for memguard_[un]guard_range().
3708 */
3709 #define flush_area(v,f) (!local_irq_is_enabled() ? \
3710 flush_area_local((const void *)v, f) : \
3711 flush_area_all((const void *)v, f))
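/*
 * Create or update Xen's own mappings for 'nr_mfns' frames starting at
 * virtual address 'virt'. Superpage (2MB, and 1GB where supported) mappings
 * are used whenever alignment, count and flags allow, with MAP_SMALL_PAGES
 * or PAT forcing 4kB entries; existing superpages are shattered or
 * re-merged as needed, and the TLB/cache flushes implied by the replaced
 * entries' attributes are issued. __set_fixmap() below is the simplest
 * caller, mapping a single frame at a fixed virtual slot.
 */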
3713 int map_pages_to_xen(
3714 unsigned long virt,
3715 unsigned long mfn,
3716 unsigned long nr_mfns,
3717 unsigned int flags)
3719 l2_pgentry_t *pl2e, ol2e;
3720 l1_pgentry_t *pl1e, ol1e;
3721 unsigned int i;
3723 while ( nr_mfns != 0 )
3725 #ifdef __x86_64__
3726 l3_pgentry_t *pl3e = virt_to_xen_l3e(virt);
3727 l3_pgentry_t ol3e = *pl3e;
3729 if ( cpu_has_page1gb &&
3730 !(((virt >> PAGE_SHIFT) | mfn) &
3731 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) &&
3732 nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) &&
3733 !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) )
3735 /* 1GB-page mapping. */
3736 l3e_write_atomic(pl3e, l3e_from_pfn(mfn, l1f_to_lNf(flags)));
3738 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) )
3740 unsigned int flush_flags =
3741 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
3743 if ( l3e_get_flags(ol3e) & _PAGE_PSE )
3745 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
3746 flush_flags |= FLUSH_TLB_GLOBAL;
3747 if ( (l1f_to_lNf(l3e_get_flags(ol3e)) ^ flags) &
3748 PAGE_CACHE_ATTRS )
3749 flush_flags |= FLUSH_CACHE;
3750 flush_area(virt, flush_flags);
3752 else
3754 pl2e = l3e_to_l2e(ol3e);
3755 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
3757 ol2e = pl2e[i];
3758 if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3759 continue;
3760 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
3762 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
3763 flush_flags |= FLUSH_TLB_GLOBAL;
3764 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
3765 PAGE_CACHE_ATTRS )
3766 flush_flags |= FLUSH_CACHE;
3768 else
3770 unsigned int j;
3772 pl1e = l2e_to_l1e(ol2e);
3773 for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ )
3775 ol1e = pl1e[j];
3776 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
3777 flush_flags |= FLUSH_TLB_GLOBAL;
3778 if ( (l1e_get_flags(ol1e) ^ flags) &
3779 PAGE_CACHE_ATTRS )
3780 flush_flags |= FLUSH_CACHE;
3784 flush_area(virt, flush_flags);
3785 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
3787 ol2e = pl2e[i];
3788 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) &&
3789 !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3790 free_xen_pagetable(l2e_to_l1e(ol2e));
3792 free_xen_pagetable(pl2e);
3796 virt += 1UL << L3_PAGETABLE_SHIFT;
3797 mfn += 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
3798 nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
3799 continue;
3802 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) &&
3803 (l3e_get_flags(ol3e) & _PAGE_PSE) )
3805 unsigned int flush_flags =
3806 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
3808 /* Skip this PTE if there is no change. */
3809 if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES *
3810 L1_PAGETABLE_ENTRIES - 1)) +
3811 (l2_table_offset(virt) << PAGETABLE_ORDER) +
3812 l1_table_offset(virt) == mfn) &&
3813 ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
3814 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 )
3816 /* We can skip to end of L3 superpage if we got a match. */
3817 i = (1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) -
3818 (mfn & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
3819 if ( i > nr_mfns )
3820 i = nr_mfns;
3821 virt += i << PAGE_SHIFT;
3822 mfn += i;
3823 nr_mfns -= i;
3824 continue;
3827 pl2e = alloc_xen_pagetable();
3828 if ( pl2e == NULL )
3829 return -ENOMEM;
3831 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
3832 l2e_write(pl2e + i,
3833 l2e_from_pfn(l3e_get_pfn(ol3e) +
3834 (i << PAGETABLE_ORDER),
3835 l3e_get_flags(ol3e)));
3837 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
3838 flush_flags |= FLUSH_TLB_GLOBAL;
3840 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
3841 __PAGE_HYPERVISOR));
3842 flush_area(virt, flush_flags);
3844 #endif
3846 pl2e = virt_to_xen_l2e(virt);
3848 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3849 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
3850 !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
3852 /* Super-page mapping. */
3853 ol2e = *pl2e;
3854 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_lNf(flags)));
3856 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3858 unsigned int flush_flags =
3859 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
3861 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
3863 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
3864 flush_flags |= FLUSH_TLB_GLOBAL;
3865 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
3866 PAGE_CACHE_ATTRS )
3867 flush_flags |= FLUSH_CACHE;
3868 flush_area(virt, flush_flags);
3870 else
3872 pl1e = l2e_to_l1e(ol2e);
3873 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3875 if ( l1e_get_flags(pl1e[i]) & _PAGE_GLOBAL )
3876 flush_flags |= FLUSH_TLB_GLOBAL;
3877 if ( (l1e_get_flags(pl1e[i]) ^ flags) &
3878 PAGE_CACHE_ATTRS )
3879 flush_flags |= FLUSH_CACHE;
3881 flush_area(virt, flush_flags);
3882 free_xen_pagetable(pl1e);
3886 virt += 1UL << L2_PAGETABLE_SHIFT;
3887 mfn += 1UL << PAGETABLE_ORDER;
3888 nr_mfns -= 1UL << PAGETABLE_ORDER;
3890 else
3892 /* Normal page mapping. */
3893 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3895 pl1e = alloc_xen_pagetable();
3896 if ( pl1e == NULL )
3897 return -ENOMEM;
3898 clear_page(pl1e);
3899 l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3900 __PAGE_HYPERVISOR));
3902 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3904 unsigned int flush_flags =
3905 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
3907 /* Skip this PTE if there is no change. */
3908 if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
3909 l1_table_offset(virt)) == mfn) &&
3910 (((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
3911 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
3913 /* We can skip to end of L2 superpage if we got a match. */
3914 i = (1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) -
3915 (mfn & ((1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
3916 if ( i > nr_mfns )
3917 i = nr_mfns;
3918 virt += i << L1_PAGETABLE_SHIFT;
3919 mfn += i;
3920 nr_mfns -= i;
3921 goto check_l3;
3924 pl1e = alloc_xen_pagetable();
3925 if ( pl1e == NULL )
3926 return -ENOMEM;
3928 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3929 l1e_write(&pl1e[i],
3930 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
3931 lNf_to_l1f(l2e_get_flags(*pl2e))));
3933 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
3934 flush_flags |= FLUSH_TLB_GLOBAL;
3936 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3937 __PAGE_HYPERVISOR));
3938 flush_area(virt, flush_flags);
3941 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3942 ol1e = *pl1e;
3943 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
3944 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3946 unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
3947 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
3948 flush_flags |= FLUSH_TLB_GLOBAL;
3949 if ( (l1e_get_flags(ol1e) ^ flags) & PAGE_CACHE_ATTRS )
3950 flush_flags |= FLUSH_CACHE;
3951 flush_area(virt, flush_flags);
3954 virt += 1UL << L1_PAGETABLE_SHIFT;
3955 mfn += 1UL;
3956 nr_mfns -= 1UL;
3958 if ( (flags == PAGE_HYPERVISOR) &&
3959 ((nr_mfns == 0) ||
3960 ((((virt >> PAGE_SHIFT) | mfn) &
3961 ((1 << PAGETABLE_ORDER) - 1)) == 0)) )
3963 unsigned long base_mfn;
3964 pl1e = l2e_to_l1e(*pl2e);
3965 base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
3966 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
3967 if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
3968 (l1e_get_flags(*pl1e) != flags) )
3969 break;
3970 if ( i == L1_PAGETABLE_ENTRIES )
3972 ol2e = *pl2e;
3973 l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
3974 l1f_to_lNf(flags)));
3975 flush_area(virt - PAGE_SIZE,
3976 FLUSH_TLB_GLOBAL |
3977 FLUSH_ORDER(PAGETABLE_ORDER));
3978 free_xen_pagetable(l2e_to_l1e(ol2e));
3983 check_l3: ;
3984 #ifdef __x86_64__
3985 if ( cpu_has_page1gb &&
3986 (flags == PAGE_HYPERVISOR) &&
3987 ((nr_mfns == 0) ||
3988 !(((virt >> PAGE_SHIFT) | mfn) &
3989 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) )
3991 unsigned long base_mfn;
3993 ol3e = *pl3e;
3994 pl2e = l3e_to_l2e(ol3e);
3995 base_mfn = l2e_get_pfn(*pl2e) & ~(L2_PAGETABLE_ENTRIES *
3996 L1_PAGETABLE_ENTRIES - 1);
3997 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
3998 if ( (l2e_get_pfn(*pl2e) !=
3999 (base_mfn + (i << PAGETABLE_ORDER))) ||
4000 (l2e_get_flags(*pl2e) != l1f_to_lNf(flags)) )
4001 break;
4002 if ( i == L2_PAGETABLE_ENTRIES )
4004 l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn,
4005 l1f_to_lNf(flags)));
4006 flush_area(virt - PAGE_SIZE,
4007 FLUSH_TLB_GLOBAL |
4008 FLUSH_ORDER(2*PAGETABLE_ORDER));
4009 free_xen_pagetable(l3e_to_l2e(ol3e));
4012 #endif
4015 return 0;
4018 void destroy_xen_mappings(unsigned long s, unsigned long e)
4020 l2_pgentry_t *pl2e;
4021 l1_pgentry_t *pl1e;
4022 unsigned int i;
4023 unsigned long v = s;
4025 ASSERT((s & ~PAGE_MASK) == 0);
4026 ASSERT((e & ~PAGE_MASK) == 0);
4028 while ( v < e )
4030 #ifdef __x86_64__
4031 l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
4033 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
4035 v += 1UL << L3_PAGETABLE_SHIFT;
4036 v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
4037 continue;
4040 if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
4042 if ( l2_table_offset(v) == 0 &&
4043 l1_table_offset(v) == 0 &&
4044 ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) )
4046 /* PAGE1GB: whole superpage is destroyed. */
4047 l3e_write_atomic(pl3e, l3e_empty());
4048 v += 1UL << L3_PAGETABLE_SHIFT;
4049 continue;
4052 /* PAGE1GB: shatter the superpage and fall through. */
4053 pl2e = alloc_xen_pagetable();
4054 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4055 l2e_write(pl2e + i,
4056 l2e_from_pfn(l3e_get_pfn(*pl3e) +
4057 (i << PAGETABLE_ORDER),
4058 l3e_get_flags(*pl3e)));
4059 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4060 __PAGE_HYPERVISOR));
4062 #endif
4064 pl2e = virt_to_xen_l2e(v);
4066 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4068 v += 1UL << L2_PAGETABLE_SHIFT;
4069 v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
4070 continue;
4073 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4075 if ( (l1_table_offset(v) == 0) &&
4076 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
4078 /* PSE: whole superpage is destroyed. */
4079 l2e_write_atomic(pl2e, l2e_empty());
4080 v += 1UL << L2_PAGETABLE_SHIFT;
4082 else
4084 /* PSE: shatter the superpage and try again. */
4085 pl1e = alloc_xen_pagetable();
4086 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4087 l1e_write(&pl1e[i],
4088 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4089 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
4090 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4091 __PAGE_HYPERVISOR));
4094 else
4096 /* Ordinary 4kB mapping. */
4097 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
4098 l1e_write_atomic(pl1e, l1e_empty());
4099 v += PAGE_SIZE;
4101 /* If we are done with the L2E, check if it is now empty. */
4102 if ( (v != e) && (l1_table_offset(v) != 0) )
4103 continue;
4104 pl1e = l2e_to_l1e(*pl2e);
4105 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4106 if ( l1e_get_intpte(pl1e[i]) != 0 )
4107 break;
4108 if ( i == L1_PAGETABLE_ENTRIES )
4110 /* Empty: zap the L2E and free the L1 page. */
4111 l2e_write_atomic(pl2e, l2e_empty());
4112 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4113 free_xen_pagetable(pl1e);
4117 #ifdef __x86_64__
4118 /* If we are done with the L3E, check if it is now empty. */
4119 if ( (v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0) )
4120 continue;
4121 pl2e = l3e_to_l2e(*pl3e);
4122 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4123 if ( l2e_get_intpte(pl2e[i]) != 0 )
4124 break;
4125 if ( i == L2_PAGETABLE_ENTRIES )
4127 /* Empty: zap the L3E and free the L2 page. */
4128 l3e_write_atomic(pl3e, l3e_empty());
4129 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4130 free_xen_pagetable(pl2e);
4132 #endif
4135 flush_area(NULL, FLUSH_TLB_GLOBAL);
4138 void __set_fixmap(
4139 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
4141 BUG_ON(idx >= __end_of_fixed_addresses);
4142 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
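/*
 * The memory-guard routines catch stray accesses by remapping a range with
 * _PAGE_PRESENT cleared (using MAP_SMALL_PAGES so only the exact pages are
 * affected); unguarding restores the normal mapping. memguard_guard_stack()
 * uses this to place a guard page beneath each primary stack so that an
 * overrun faults immediately.
 */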
4145 #ifdef MEMORY_GUARD
4147 void memguard_init(void)
4149 unsigned long start = max_t(unsigned long, xen_phys_start, 1UL << 20);
4150 map_pages_to_xen(
4151 (unsigned long)__va(start),
4152 start >> PAGE_SHIFT,
4153 (xenheap_phys_end - start) >> PAGE_SHIFT,
4154 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4155 #ifdef __x86_64__
4156 BUG_ON(start != xen_phys_start);
4157 map_pages_to_xen(
4158 XEN_VIRT_START,
4159 start >> PAGE_SHIFT,
4160 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
4161 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4162 #endif
4165 static void __memguard_change_range(void *p, unsigned long l, int guard)
4167 unsigned long _p = (unsigned long)p;
4168 unsigned long _l = (unsigned long)l;
4169 unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
4171 /* Ensure we are dealing with a page-aligned whole number of pages. */
4172 ASSERT((_p&~PAGE_MASK) == 0);
4173 ASSERT((_l&~PAGE_MASK) == 0);
4175 if ( guard )
4176 flags &= ~_PAGE_PRESENT;
4178 map_pages_to_xen(
4179 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
4182 void memguard_guard_range(void *p, unsigned long l)
4184 __memguard_change_range(p, l, 1);
4187 void memguard_unguard_range(void *p, unsigned long l)
4189 __memguard_change_range(p, l, 0);
4192 #endif
4194 void memguard_guard_stack(void *p)
4196 BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
4197 p = (void *)((unsigned long)p + STACK_SIZE -
4198 PRIMARY_STACK_SIZE - PAGE_SIZE);
4199 memguard_guard_range(p, PAGE_SIZE);
4202 /*
4203 * Local variables:
4204 * mode: C
4205 * c-set-style: "BSD"
4206 * c-basic-offset: 4
4207 * tab-width: 4
4208 * indent-tabs-mode: nil
4209 * End:
4210 */