xen/arch/x86/mm.c @ 19946:01ae7dc043ba (debuggers.hg)

x86: extend mmu_update hypercall to allow update of foreign pagetables.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@intel.com>
Author:   Keir Fraser <keir.fraser@citrix.com>
Date:     Tue Jul 07 14:38:59 2009 +0100
Parent:   2f9e1348aa98
Child:    d6c1d7992f43
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may be used in none of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
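/*
 * Illustrative sketch (not part of this file): how a PV guest drives the
 * (ptr, val) interface described above.  A batch of PTE writes is expressed
 * as mmu_update requests, and an L1 table's type can be pinned with a
 * separate mmuext_op.  The hypercall wrapper names follow the usual
 * guest-side convention and are assumptions here; the structures and
 * command numbers come from the public headers (xen/xen.h).
 */
#if 0 /* example only -- not compiled into mm.c */
static void example_update_and_pin(uint64_t pte_machine_addr,
                                   uint64_t new_pte_val,
                                   unsigned long l1_mfn)
{
    struct mmu_update req;
    struct mmuext_op pin;
    int done;

    /* One (ptr, val) pair: the hypervisor validates and performs *ptr = val. */
    req.ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE;
    req.val = new_pte_val;
    (void)HYPERVISOR_mmu_update(&req, 1, &done, DOMID_SELF);

    /* Pin the L1 table's type so its type count cannot fall to zero. */
    pin.cmd = MMUEXT_PIN_L1_TABLE;
    pin.arg1.mfn = l1_mfn;
    (void)HYPERVISOR_mmuext_op(&pin, 1, &done, DOMID_SELF);
}
#endif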
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <asm/shared.h>
112 #include <public/memory.h>
113 #include <public/sched.h>
114 #include <xsm/xsm.h>
115 #include <xen/trace.h>
117 /*
118 * Mapping of first 2 or 4 megabytes of memory. This is mapped with 4kB
119 * mappings to avoid type conflicts with fixed-range MTRRs covering the
120 * lowest megabyte of physical memory. In any case the VGA hole should be
121 * mapped with type UC.
122 */
123 l1_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
124 l1_identmap[L1_PAGETABLE_ENTRIES];
126 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
128 /*
129 * PTE updates can be done with ordinary writes except:
130 * 1. Debug builds get extra checking by using CMPXCHG[8B].
131 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
132 */
133 #if !defined(NDEBUG) || defined(__i386__)
134 #define PTE_UPDATE_WITH_CMPXCHG
135 #endif
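/*
 * Illustrative sketch (example only): with PTE_UPDATE_WITH_CMPXCHG defined,
 * a 64-bit PAE entry is updated via compare-and-exchange, so the 8-byte
 * store is atomic even on 32-bit builds, and the expected old value gives
 * debug builds a cheap consistency check.  update_intpte() further down is
 * the real implementation; this only shows the shape of the loop.
 */
#if 0 /* example only -- not compiled into mm.c */
static void example_cmpxchg_pte(intpte_t *p, intpte_t old, intpte_t new)
{
    intpte_t seen;

    for ( ; ; )
    {
        seen = cmpxchg(p, old, new);  /* CMPXCHG8B on PAE builds */
        if ( seen == old )
            break;                    /* the update took effect */
        old = seen;                   /* e.g. A/D bits changed under our feet */
    }
}
#endif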
137 /* Used to defer flushing of memory structures. */
138 struct percpu_mm_info {
139 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
140 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
141 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
142 unsigned int deferred_ops;
143 /* If non-NULL, specifies a foreign subject domain for some operations. */
144 struct domain *foreign;
145 };
146 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
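/*
 * Illustrative sketch (example only): the deferred_ops flags accumulate over
 * a hypercall batch and are acted on once at the end.  The real consumer,
 * process_deferred_ops(), lives later in this file; roughly it does:
 */
#if 0 /* example only -- not compiled into mm.c */
static void example_process_deferred_ops(void)
{
    struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
    unsigned int deferred_ops = info->deferred_ops;

    info->deferred_ops = 0;

    if ( deferred_ops & (DOP_FLUSH_ALL_TLBS | DOP_FLUSH_TLB) )
    {
        if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
            flush_tlb_mask(&current->domain->domain_dirty_cpumask);
        else
            flush_tlb_local();
    }

    if ( deferred_ops & DOP_RELOAD_LDT )
        (void)map_ldt_shadow_page(0);
}
#endif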
148 /*
149 * Returns the current foreign domain; defaults to the currently-executing
150 * domain if a foreign override hasn't been specified.
151 */
152 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
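/*
 * Illustrative sketch (example only): the changeset above extends
 * do_mmu_update() so a sufficiently privileged guest can update another
 * domain's page tables.  Passing the target domid as the final hypercall
 * argument is what selects the subject domain that FOREIGNDOM evaluates to
 * while the batch is processed.  The wrapper signature follows the usual
 * guest-side convention.
 */
#if 0 /* example only -- not compiled into mm.c */
static void example_foreign_update(domid_t target, struct mmu_update *reqs,
                                   unsigned int nr)
{
    int done;

    /* Inside the hypervisor, FOREIGNDOM now names 'target', not the caller. */
    (void)HYPERVISOR_mmu_update(reqs, nr, &done, target);
}
#endif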
154 /* Private domain structs for DOMID_XEN and DOMID_IO. */
155 struct domain *dom_xen, *dom_io;
157 /* Frame table and its size in pages. */
158 struct page_info *__read_mostly frame_table;
159 unsigned long max_page;
160 unsigned long total_pages;
162 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
164 int opt_allow_hugepage;
165 boolean_param("allowhugepage", opt_allow_hugepage);
167 #define l1_disallow_mask(d) \
168 ((d != dom_io) && \
169 (rangeset_is_empty((d)->iomem_caps) && \
170 rangeset_is_empty((d)->arch.ioport_caps) && \
171 !has_arch_pdevs(d)) ? \
172 L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
174 #ifdef CONFIG_COMPAT
175 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
176 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
177 L3_DISALLOW_MASK : \
178 COMPAT_L3_DISALLOW_MASK)
179 #else
180 #define l3_disallow_mask(d) L3_DISALLOW_MASK
181 #endif
183 void __init init_frametable(void)
184 {
185 unsigned long nr_pages, page_step, i, mfn;
187 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
189 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
190 page_step = 1 << (cpu_has_page1gb ? L3_PAGETABLE_SHIFT - PAGE_SHIFT
191 : L2_PAGETABLE_SHIFT - PAGE_SHIFT);
193 for ( i = 0; i < nr_pages; i += page_step )
194 {
195 /*
196 * The hardcoded 4 below is arbitrary - just pick whatever you think
197 * is reasonable to waste as a trade-off for using a large page.
198 */
199 while (nr_pages + 4 - i < page_step)
200 page_step >>= PAGETABLE_ORDER;
201 mfn = alloc_boot_pages(page_step, page_step);
202 if ( mfn == 0 )
203 panic("Not enough memory for frame table\n");
204 map_pages_to_xen(
205 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
206 mfn, page_step, PAGE_HYPERVISOR);
207 }
209 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
210 }
212 void __init arch_init_memory(void)
213 {
214 extern void subarch_init_memory(void);
216 unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn;
218 /*
219 * Initialise our DOMID_XEN domain.
220 * Any Xen-heap pages that we will allow to be mapped will have
221 * their domain field set to dom_xen.
222 */
223 dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
224 BUG_ON(dom_xen == NULL);
226 /*
227 * Initialise our DOMID_IO domain.
228 * This domain owns I/O pages that are within the range of the page_info
229 * array. Mappings occur at the privilege level of the caller.
230 */
231 dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
232 BUG_ON(dom_io == NULL);
234 /* First 1MB of RAM is historically marked as I/O. */
235 for ( i = 0; i < 0x100; i++ )
236 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
238 /* Any areas not specified as RAM by the e820 map are considered I/O. */
239 for ( i = 0, pfn = 0; pfn < max_page; i++ )
240 {
241 while ( (i < e820.nr_map) &&
242 (e820.map[i].type != E820_RAM) &&
243 (e820.map[i].type != E820_UNUSABLE) )
244 i++;
246 if ( i >= e820.nr_map )
247 {
248 /* No more RAM regions: mark as I/O right to end of memory map. */
249 rstart_pfn = rend_pfn = max_page;
250 }
251 else
252 {
253 /* Mark as I/O just up to the next RAM region. */
254 rstart_pfn = min_t(unsigned long, max_page,
255 PFN_UP(e820.map[i].addr));
256 rend_pfn = max_t(unsigned long, rstart_pfn,
257 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
258 }
260 /*
261 * Make sure any Xen mappings of RAM holes above 1MB are blown away.
262 * In particular this ensures that RAM holes are respected even in
263 * the statically-initialised 1-16MB mapping area.
264 */
265 iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT));
266 ioend_pfn = rstart_pfn;
267 #if defined(CONFIG_X86_32)
268 ioend_pfn = min_t(unsigned long, ioend_pfn,
269 DIRECTMAP_MBYTES << (20 - PAGE_SHIFT));
270 #endif
271 if ( iostart_pfn < ioend_pfn )
272 destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn),
273 (unsigned long)mfn_to_virt(ioend_pfn));
275 /* Mark as I/O up to next RAM region. */
276 for ( ; pfn < rstart_pfn; pfn++ )
277 {
278 BUG_ON(!mfn_valid(pfn));
279 share_xen_page_with_guest(
280 mfn_to_page(pfn), dom_io, XENSHARE_writable);
281 }
283 /* Skip the RAM region. */
284 pfn = rend_pfn;
285 }
287 subarch_init_memory();
288 }
290 int page_is_ram_type(unsigned long mfn, unsigned long mem_type)
291 {
292 uint64_t maddr = pfn_to_paddr(mfn);
293 int i;
295 for ( i = 0; i < e820.nr_map; i++ )
296 {
297 switch ( e820.map[i].type )
298 {
299 case E820_RAM:
300 if ( mem_type & RAM_TYPE_CONVENTIONAL )
301 break;
302 continue;
303 case E820_RESERVED:
304 if ( mem_type & RAM_TYPE_RESERVED )
305 break;
306 continue;
307 case E820_UNUSABLE:
308 if ( mem_type & RAM_TYPE_UNUSABLE )
309 break;
310 continue;
311 case E820_ACPI:
312 case E820_NVS:
313 if ( mem_type & RAM_TYPE_ACPI )
314 break;
315 continue;
316 default:
317 /* unknown */
318 continue;
319 }
321 /* Test the range. */
322 if ( (e820.map[i].addr <= maddr) &&
323 ((e820.map[i].addr + e820.map[i].size) >= (maddr + PAGE_SIZE)) )
324 return 1;
325 }
327 return 0;
328 }
330 unsigned long domain_get_maximum_gpfn(struct domain *d)
331 {
332 if ( is_hvm_domain(d) )
333 return d->arch.p2m->max_mapped_pfn;
334 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
335 return arch_get_max_pfn(d) - 1;
336 }
338 void share_xen_page_with_guest(
339 struct page_info *page, struct domain *d, int readonly)
340 {
341 if ( page_get_owner(page) == d )
342 return;
344 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
346 spin_lock(&d->page_alloc_lock);
348 /* The incremented type count pins as writable or read-only. */
349 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
350 page->u.inuse.type_info |= PGT_validated | 1;
352 page_set_owner(page, d);
353 wmb(); /* install valid domain ptr before updating refcnt. */
354 ASSERT((page->count_info & ~PGC_xen_heap) == 0);
356 /* Only add to the allocation list if the domain isn't dying. */
357 if ( !d->is_dying )
358 {
359 page->count_info |= PGC_allocated | 1;
360 if ( unlikely(d->xenheap_pages++ == 0) )
361 get_knownalive_domain(d);
362 page_list_add_tail(page, &d->xenpage_list);
363 }
365 spin_unlock(&d->page_alloc_lock);
366 }
368 void share_xen_page_with_privileged_guests(
369 struct page_info *page, int readonly)
370 {
371 share_xen_page_with_guest(page, dom_xen, readonly);
372 }
374 #if defined(__i386__)
376 #ifdef NDEBUG
377 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
378 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
379 #else
380 /*
381 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
382 * We cannot safely shadow the idle page table, nor shadow page tables
383 * (detected by zero reference count). As required for correctness, we
384 * always shadow PDPTs above 4GB.
385 */
386 #define l3tab_needs_shadow(mfn) \
387 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
388 (mfn_to_page(mfn)->count_info & PGC_count_mask) && \
389 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
390 ((mfn) >= 0x100000))
391 #endif
393 static l1_pgentry_t *fix_pae_highmem_pl1e;
395 /* Cache the address of PAE high-memory fixmap page tables. */
396 static int __init cache_pae_fixmap_address(void)
397 {
398 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
399 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
400 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
401 return 0;
402 }
403 __initcall(cache_pae_fixmap_address);
405 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
407 void make_cr3(struct vcpu *v, unsigned long mfn)
408 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
409 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
410 {
411 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
412 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
413 unsigned int cpu = smp_processor_id();
415 /* Fast path: does this mfn need a shadow at all? */
416 if ( !l3tab_needs_shadow(mfn) )
417 {
418 v->arch.cr3 = mfn << PAGE_SHIFT;
419 /* Cache is no longer in use or valid */
420 cache->high_mfn = 0;
421 return;
422 }
424 /* Caching logic is not interrupt safe. */
425 ASSERT(!in_irq());
427 /* Protects against pae_flush_pgd(). */
428 spin_lock(&cache->lock);
430 cache->inuse_idx ^= 1;
431 cache->high_mfn = mfn;
433 /* Map the guest L3 table and copy to the chosen low-memory cache. */
434 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
435 /* First check the previous high mapping can't be in the TLB.
436 * (i.e. have we loaded CR3 since we last did this?) */
437 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
438 flush_tlb_one_local(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
439 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
440 lowmem_l3tab = cache->table[cache->inuse_idx];
441 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
442 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
443 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
445 v->arch.cr3 = __pa(lowmem_l3tab);
447 spin_unlock(&cache->lock);
448 }
450 #else /* !defined(__i386__) */
452 void make_cr3(struct vcpu *v, unsigned long mfn)
453 {
454 v->arch.cr3 = mfn << PAGE_SHIFT;
455 }
457 #endif /* !defined(__i386__) */
459 void write_ptbase(struct vcpu *v)
460 {
461 write_cr3(v->arch.cr3);
462 }
464 /*
465 * Should be called after CR3 is updated.
466 *
467 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
468 * for HVM guests, arch.monitor_table and hvm's guest CR3.
469 *
470 * Update ref counts to shadow tables appropriately.
471 */
472 void update_cr3(struct vcpu *v)
473 {
474 unsigned long cr3_mfn=0;
476 if ( paging_mode_enabled(v->domain) )
477 {
478 paging_update_cr3(v);
479 return;
480 }
482 #if CONFIG_PAGING_LEVELS == 4
483 if ( !(v->arch.flags & TF_kernel_mode) )
484 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
485 else
486 #endif
487 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
489 make_cr3(v, cr3_mfn);
490 }
493 static void invalidate_shadow_ldt(struct vcpu *v, int flush)
494 {
495 int i;
496 unsigned long pfn;
497 struct page_info *page;
499 BUG_ON(unlikely(in_irq()));
501 spin_lock(&v->arch.shadow_ldt_lock);
503 if ( v->arch.shadow_ldt_mapcnt == 0 )
504 goto out;
506 v->arch.shadow_ldt_mapcnt = 0;
508 for ( i = 16; i < 32; i++ )
509 {
510 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
511 if ( pfn == 0 ) continue;
512 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
513 page = mfn_to_page(pfn);
514 ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page);
515 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
516 put_page_and_type(page);
517 }
519 /* Rid TLBs of stale mappings (guest mappings and shadow mappings). */
520 if ( flush )
521 flush_tlb_mask(&v->vcpu_dirty_cpumask);
523 out:
524 spin_unlock(&v->arch.shadow_ldt_lock);
525 }
528 static int alloc_segdesc_page(struct page_info *page)
529 {
530 struct desc_struct *descs;
531 int i;
533 descs = map_domain_page(page_to_mfn(page));
535 for ( i = 0; i < 512; i++ )
536 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
537 goto fail;
539 unmap_domain_page(descs);
540 return 0;
542 fail:
543 unmap_domain_page(descs);
544 return -EINVAL;
545 }
548 /* Map shadow page at offset @off. */
549 int map_ldt_shadow_page(unsigned int off)
550 {
551 struct vcpu *v = current;
552 struct domain *d = v->domain;
553 unsigned long gmfn, mfn;
554 l1_pgentry_t l1e, nl1e;
555 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
556 int okay;
558 BUG_ON(unlikely(in_irq()));
560 guest_get_eff_kern_l1e(v, gva, &l1e);
561 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
562 return 0;
564 gmfn = l1e_get_pfn(l1e);
565 mfn = gmfn_to_mfn(d, gmfn);
566 if ( unlikely(!mfn_valid(mfn)) )
567 return 0;
569 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page);
570 if ( unlikely(!okay) )
571 return 0;
573 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
575 spin_lock(&v->arch.shadow_ldt_lock);
576 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
577 v->arch.shadow_ldt_mapcnt++;
578 spin_unlock(&v->arch.shadow_ldt_lock);
580 return 1;
581 }
584 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
585 {
586 struct page_info *page = mfn_to_page(page_nr);
588 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
589 {
590 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
591 return 0;
592 }
594 return 1;
595 }
598 static int get_page_and_type_from_pagenr(unsigned long page_nr,
599 unsigned long type,
600 struct domain *d,
601 int partial,
602 int preemptible)
603 {
604 struct page_info *page = mfn_to_page(page_nr);
605 int rc;
607 if ( likely(partial >= 0) &&
608 unlikely(!get_page_from_pagenr(page_nr, d)) )
609 return -EINVAL;
611 rc = (preemptible ?
612 get_page_type_preemptible(page, type) :
613 (get_page_type(page, type) ? 0 : -EINVAL));
615 if ( unlikely(rc) && partial >= 0 )
616 put_page(page);
618 return rc;
619 }
621 static int get_data_page(
622 struct page_info *page, struct domain *d, int writeable)
623 {
624 int rc;
626 if ( writeable )
627 rc = get_page_and_type(page, d, PGT_writable_page);
628 else
629 rc = get_page(page, d);
631 return rc;
632 }
634 static void put_data_page(
635 struct page_info *page, int writeable)
636 {
637 if ( writeable )
638 put_page_and_type(page);
639 else
640 put_page(page);
641 }
643 /*
644 * We allow root tables to map each other (a.k.a. linear page tables). It
645 * needs some special care with reference counts and access permissions:
646 * 1. The mapping entry must be read-only, or the guest may get write access
647 * to its own PTEs.
648 * 2. We must only bump the reference counts for an *already validated*
649 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
650 * on a validation that is required to complete that validation.
651 * 3. We only need to increment the reference counts for the mapped page
652 * frame if it is mapped by a different root table. This is sufficient and
653 * also necessary to allow validation of a root table mapping itself.
654 */
655 #define define_get_linear_pagetable(level) \
656 static int \
657 get_##level##_linear_pagetable( \
658 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
659 { \
660 unsigned long x, y; \
661 struct page_info *page; \
662 unsigned long pfn; \
663 \
664 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
665 { \
666 MEM_LOG("Attempt to create linear p.t. with write perms"); \
667 return 0; \
668 } \
669 \
670 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
671 { \
672 /* Make sure the mapped frame belongs to the correct domain. */ \
673 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
674 return 0; \
675 \
676 /* \
677 * Ensure that the mapped frame is an already-validated page table. \
678 * If so, atomically increment the count (checking for overflow). \
679 */ \
680 page = mfn_to_page(pfn); \
681 y = page->u.inuse.type_info; \
682 do { \
683 x = y; \
684 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
685 unlikely((x & (PGT_type_mask|PGT_validated)) != \
686 (PGT_##level##_page_table|PGT_validated)) ) \
687 { \
688 put_page(page); \
689 return 0; \
690 } \
691 } \
692 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
693 } \
694 \
695 return 1; \
696 }
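/*
 * Illustrative sketch (example only): a guest installs a linear page table
 * by pointing one slot of a root table back at the table itself.  Per the
 * rules above, the entry must not carry _PAGE_RW, and the referenced table
 * must already be validated (e.g. pinned) or the checks generated by
 * define_get_linear_pagetable() will reject it.  The slot arithmetic shown
 * is only meant to illustrate the shape of the request.
 */
#if 0 /* example only -- not compiled into mm.c */
static void example_install_linear_mapping(unsigned long root_mfn,
                                           unsigned int slot)
{
    struct mmu_update req;
    int done;

    /* ptr: machine address of the root-table slot being written. */
    req.ptr = (((uint64_t)root_mfn << PAGE_SHIFT)
               + slot * sizeof(intpte_t)) | MMU_NORMAL_PT_UPDATE;
    /* val: the root table's own MFN, present but read-only (no _PAGE_RW). */
    req.val = ((uint64_t)root_mfn << PAGE_SHIFT) | _PAGE_PRESENT;

    (void)HYPERVISOR_mmu_update(&req, 1, &done, DOMID_SELF);
}
#endif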
699 int is_iomem_page(unsigned long mfn)
700 {
701 struct page_info *page;
703 if ( !mfn_valid(mfn) )
704 return 1;
706 /* Caller must know that it is an iomem page, or a reference is held. */
707 page = mfn_to_page(mfn);
708 ASSERT((page->count_info & PGC_count_mask) != 0);
710 return (page_get_owner(page) == dom_io);
711 }
713 static void update_xen_mappings(unsigned long mfn, unsigned long cacheattr)
714 {
715 #ifdef __x86_64__
716 bool_t alias = mfn >= PFN_DOWN(xen_phys_start) &&
717 mfn < PFN_UP(xen_phys_start + (unsigned long)_end - XEN_VIRT_START);
718 unsigned long xen_va =
719 XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT);
721 if ( unlikely(alias) && cacheattr )
722 map_pages_to_xen(xen_va, mfn, 1, 0);
723 map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
724 PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
725 if ( unlikely(alias) && !cacheattr )
726 map_pages_to_xen(xen_va, mfn, 1, PAGE_HYPERVISOR);
727 #endif
728 }
730 int
731 get_page_from_l1e(
732 l1_pgentry_t l1e, struct domain *l1e_owner, struct domain *pg_owner)
733 {
734 unsigned long mfn = l1e_get_pfn(l1e);
735 struct page_info *page = mfn_to_page(mfn);
736 uint32_t l1f = l1e_get_flags(l1e);
737 struct vcpu *curr = current;
738 struct domain *real_pg_owner;
740 if ( !(l1f & _PAGE_PRESENT) )
741 return 1;
743 if ( unlikely(l1f & l1_disallow_mask(l1e_owner)) )
744 {
745 MEM_LOG("Bad L1 flags %x", l1f & l1_disallow_mask(l1e_owner));
746 return 0;
747 }
749 if ( !mfn_valid(mfn) ||
750 (real_pg_owner = page_get_owner_and_reference(page)) == dom_io )
751 {
752 /* Only needed the reference to confirm dom_io ownership. */
753 if ( mfn_valid(mfn) )
754 put_page(page);
756 /* DOMID_IO reverts to caller for privilege checks. */
757 if ( pg_owner == dom_io )
758 pg_owner = curr->domain;
760 if ( !iomem_access_permitted(pg_owner, mfn, mfn) )
761 {
762 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
763 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
764 pg_owner->domain_id, mfn);
765 return 0;
766 }
768 return 1;
769 }
771 if ( real_pg_owner == NULL )
772 goto could_not_pin;
774 if ( unlikely(real_pg_owner != pg_owner) )
775 {
776 /*
777 * Let privileged domains transfer the right to map their target
778 * domain's pages. This is used to allow stub-domain pvfb export to
779 * dom0, until pvfb supports granted mappings. At that time this
780 * minor hack can go away.
781 */
782 if ( (pg_owner == l1e_owner) || !IS_PRIV_FOR(pg_owner, real_pg_owner) )
783 goto could_not_pin;
784 pg_owner = real_pg_owner;
785 }
787 /* Foreign mappings into guests in shadow external mode don't
788 * contribute to writeable mapping refcounts. (This allows the
789 * qemu-dm helper process in dom0 to map the domain's memory without
790 * messing up the count of "real" writable mappings.) */
791 if ( (l1f & _PAGE_RW) &&
792 ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) &&
793 !get_page_type(page, PGT_writable_page) )
794 goto could_not_pin;
796 if ( pte_flags_to_cacheattr(l1f) !=
797 ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) )
798 {
799 unsigned long x, nx, y = page->count_info;
800 unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
802 if ( is_xen_heap_page(page) )
803 {
804 if ( (l1f & _PAGE_RW) &&
805 ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) )
806 put_page_type(page);
807 put_page(page);
808 MEM_LOG("Attempt to change cache attributes of Xen heap page");
809 return 0;
810 }
812 while ( ((y & PGC_cacheattr_mask) >> PGC_cacheattr_base) != cacheattr )
813 {
814 x = y;
815 nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
816 y = cmpxchg(&page->count_info, x, nx);
817 }
819 update_xen_mappings(mfn, cacheattr);
820 }
822 return 1;
824 could_not_pin:
825 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
826 " for l1e_owner=%d, pg_owner=%d",
827 mfn, get_gpfn_from_mfn(mfn),
828 l1e_get_intpte(l1e), l1e_owner->domain_id, pg_owner->domain_id);
829 if ( real_pg_owner != NULL )
830 put_page(page);
831 return 0;
832 }
835 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
836 define_get_linear_pagetable(l2);
837 static int
838 get_page_from_l2e(
839 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
840 {
841 unsigned long mfn = l2e_get_pfn(l2e);
842 int rc;
844 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
845 return 1;
847 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
848 {
849 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
850 return -EINVAL;
851 }
853 if ( !(l2e_get_flags(l2e) & _PAGE_PSE) )
854 {
855 rc = get_page_and_type_from_pagenr(mfn, PGT_l1_page_table, d, 0, 0);
856 if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
857 rc = 0;
858 }
859 else if ( !opt_allow_hugepage || (mfn & (L1_PAGETABLE_ENTRIES-1)) )
860 {
861 rc = -EINVAL;
862 }
863 else
864 {
865 unsigned long m = mfn;
866 int writeable = !!(l2e_get_flags(l2e) & _PAGE_RW);
868 do {
869 rc = get_data_page(mfn_to_page(m), d, writeable);
870 if ( unlikely(!rc) )
871 {
872 while ( m-- > mfn )
873 put_data_page(mfn_to_page(m), writeable);
874 return -EINVAL;
875 }
876 } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
877 }
879 return rc;
880 }
883 define_get_linear_pagetable(l3);
884 static int
885 get_page_from_l3e(
886 l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible)
887 {
888 int rc;
890 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
891 return 1;
893 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
894 {
895 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
896 return -EINVAL;
897 }
899 rc = get_page_and_type_from_pagenr(
900 l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible);
901 if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
902 rc = 0;
904 return rc;
905 }
907 #if CONFIG_PAGING_LEVELS >= 4
908 define_get_linear_pagetable(l4);
909 static int
910 get_page_from_l4e(
911 l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible)
912 {
913 int rc;
915 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
916 return 1;
918 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
919 {
920 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
921 return -EINVAL;
922 }
924 rc = get_page_and_type_from_pagenr(
925 l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible);
926 if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
927 rc = 0;
929 return rc;
930 }
931 #endif /* 4 level */
933 #ifdef __x86_64__
935 #ifdef USER_MAPPINGS_ARE_GLOBAL
936 #define adjust_guest_l1e(pl1e, d) \
937 do { \
938 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
939 likely(!is_pv_32on64_domain(d)) ) \
940 { \
941 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
942 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
943 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
944 MEM_LOG("Global bit is set to kernel page %lx", \
945 l1e_get_pfn((pl1e))); \
946 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
947 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
948 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
949 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
950 } \
951 } while ( 0 )
952 #else
953 #define adjust_guest_l1e(pl1e, d) \
954 do { \
955 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
956 likely(!is_pv_32on64_domain(d)) ) \
957 l1e_add_flags((pl1e), _PAGE_USER); \
958 } while ( 0 )
959 #endif
961 #define adjust_guest_l2e(pl2e, d) \
962 do { \
963 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
964 likely(!is_pv_32on64_domain(d)) ) \
965 l2e_add_flags((pl2e), _PAGE_USER); \
966 } while ( 0 )
968 #define adjust_guest_l3e(pl3e, d) \
969 do { \
970 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
971 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
972 _PAGE_USER : \
973 _PAGE_USER|_PAGE_RW); \
974 } while ( 0 )
976 #define adjust_guest_l4e(pl4e, d) \
977 do { \
978 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
979 likely(!is_pv_32on64_domain(d)) ) \
980 l4e_add_flags((pl4e), _PAGE_USER); \
981 } while ( 0 )
983 #else /* !defined(__x86_64__) */
985 #define adjust_guest_l1e(_p, _d) ((void)(_d))
986 #define adjust_guest_l2e(_p, _d) ((void)(_d))
987 #define adjust_guest_l3e(_p, _d) ((void)(_d))
989 #endif
991 #ifdef CONFIG_COMPAT
992 #define unadjust_guest_l3e(pl3e, d) \
993 do { \
994 if ( unlikely(is_pv_32on64_domain(d)) && \
995 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
996 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
997 } while ( 0 )
998 #else
999 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
1000 #endif
1002 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
1004 unsigned long pfn = l1e_get_pfn(l1e);
1005 struct page_info *page;
1006 struct domain *pg_owner;
1007 struct vcpu *v;
1009 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(pfn) )
1010 return;
1012 page = mfn_to_page(pfn);
1013 pg_owner = page_get_owner(page);
1015 /*
1016 * Check if this is a mapping that was established via a grant reference.
1017 * If it was then we should not be here: we require that such mappings are
1018 * explicitly destroyed via the grant-table interface.
1020 * The upshot of this is that the guest can end up with active grants that
1021 * it cannot destroy (because it no longer has a PTE to present to the
1022 * grant-table interface). This can lead to subtle hard-to-catch bugs,
1023 * hence a special grant PTE flag can be enabled to catch the bug early.
1025 * (Note that the undestroyable active grants are not a security hole in
1026 * Xen. All active grants can safely be cleaned up when the domain dies.)
1027 */
1028 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
1029 !l1e_owner->is_shutting_down && !l1e_owner->is_dying )
1031 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
1032 l1e_get_intpte(l1e));
1033 domain_crash(l1e_owner);
1036 /* Remember we didn't take a type-count of foreign writable mappings
1037 * to paging-external domains */
1038 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
1039 ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) )
1041 put_page_and_type(page);
1043 else
1045 /* We expect this is rare so we blow the entire shadow LDT. */
1046 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
1047 PGT_seg_desc_page)) &&
1048 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
1049 (l1e_owner == pg_owner) )
1051 for_each_vcpu ( pg_owner, v )
1052 invalidate_shadow_ldt(v, 1);
1054 put_page(page);
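/*
 * Illustrative sketch (example only): as the comment in put_page_from_l1e()
 * notes, a PTE that was established through a grant reference must be torn
 * down via the grant-table interface, not by simply overwriting the PTE,
 * otherwise the grant handle stays active until the domain dies.  'handle'
 * is whatever the earlier GNTTABOP_map_grant_ref returned.
 */
#if 0 /* example only -- not compiled into mm.c */
static void example_unmap_granted_pte(uint64_t host_addr, grant_handle_t handle)
{
    struct gnttab_unmap_grant_ref unmap;

    unmap.host_addr    = host_addr;  /* address the grant was mapped at */
    unmap.dev_bus_addr = 0;
    unmap.handle       = handle;

    (void)HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unmap, 1);
}
#endif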
1059 /*
1060 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
1061 * Note also that this automatically deals correctly with linear p.t.'s.
1062 */
1063 static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
1065 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
1066 return 1;
1068 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1070 unsigned long mfn = l2e_get_pfn(l2e), m = mfn;
1071 int writeable = l2e_get_flags(l2e) & _PAGE_RW;
1073 ASSERT(!(mfn & (L1_PAGETABLE_ENTRIES-1)));
1074 do {
1075 put_data_page(mfn_to_page(m), writeable);
1076 } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
1078 else
1080 put_page_and_type(l2e_get_page(l2e));
1083 return 0;
1086 static int __put_page_type(struct page_info *, int preemptible);
1088 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
1089 int partial, int preemptible)
1091 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
1092 return 1;
1094 #ifdef __x86_64__
1095 if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) )
1097 unsigned long mfn = l3e_get_pfn(l3e);
1098 int writeable = l3e_get_flags(l3e) & _PAGE_RW;
1100 ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
1101 do {
1102 put_data_page(mfn_to_page(mfn), writeable);
1103 } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
1105 return 0;
1107 #endif
1109 if ( unlikely(partial > 0) )
1110 return __put_page_type(l3e_get_page(l3e), preemptible);
1112 return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
1115 #if CONFIG_PAGING_LEVELS >= 4
1116 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
1117 int partial, int preemptible)
1119 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
1120 (l4e_get_pfn(l4e) != pfn) )
1122 if ( unlikely(partial > 0) )
1123 return __put_page_type(l4e_get_page(l4e), preemptible);
1124 return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
1126 return 1;
1128 #endif
1130 static int alloc_l1_table(struct page_info *page)
1132 struct domain *d = page_get_owner(page);
1133 unsigned long pfn = page_to_mfn(page);
1134 l1_pgentry_t *pl1e;
1135 unsigned int i;
1137 pl1e = map_domain_page(pfn);
1139 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1141 if ( is_guest_l1_slot(i) &&
1142 unlikely(!get_page_from_l1e(pl1e[i], d, d)) )
1143 goto fail;
1145 adjust_guest_l1e(pl1e[i], d);
1148 unmap_domain_page(pl1e);
1149 return 0;
1151 fail:
1152 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
1153 while ( i-- > 0 )
1154 if ( is_guest_l1_slot(i) )
1155 put_page_from_l1e(pl1e[i], d);
1157 unmap_domain_page(pl1e);
1158 return -EINVAL;
1161 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
1163 struct page_info *page;
1164 l2_pgentry_t *pl2e;
1165 l3_pgentry_t l3e3;
1166 #ifndef CONFIG_COMPAT
1167 l2_pgentry_t l2e;
1168 int i;
1169 #endif
1171 if ( !is_pv_32bit_domain(d) )
1172 return 1;
1174 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
1176 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
1177 l3e3 = pl3e[3];
1178 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
1180 MEM_LOG("PAE L3 3rd slot is empty");
1181 return 0;
1184 /*
1185 * The Xen-private mappings include linear mappings. The L2 thus cannot
1186 * be shared by multiple L3 tables. The test here is adequate because:
1187 * 1. Cannot appear in slots != 3 because get_page_type() checks the
1188 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
1189 * 2. Cannot appear in another page table's L3:
1190 * a. alloc_l3_table() calls this function and this check will fail
1191 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
1192 */
1193 page = l3e_get_page(l3e3);
1194 BUG_ON(page->u.inuse.type_info & PGT_pinned);
1195 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1196 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1197 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1199 MEM_LOG("PAE L3 3rd slot is shared");
1200 return 0;
1203 /* Xen private mappings. */
1204 pl2e = map_domain_page(l3e_get_pfn(l3e3));
1205 #ifndef CONFIG_COMPAT
1206 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1207 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1208 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1209 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1211 l2e = l2e_from_page(perdomain_pt_page(d, i), __PAGE_HYPERVISOR);
1212 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
1214 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
1216 l2e = l2e_empty();
1217 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
1218 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
1219 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
1221 #else
1222 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1223 &compat_idle_pg_table_l2[
1224 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1225 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
1226 #endif
1227 unmap_domain_page(pl2e);
1229 return 1;
1232 #ifdef __i386__
1233 /* Flush a pgdir update into low-memory caches. */
1234 static void pae_flush_pgd(
1235 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
1237 struct domain *d = page_get_owner(mfn_to_page(mfn));
1238 struct vcpu *v;
1239 intpte_t _ol3e, _nl3e, _pl3e;
1240 l3_pgentry_t *l3tab_ptr;
1241 struct pae_l3_cache *cache;
1243 if ( unlikely(shadow_mode_enabled(d)) )
1245 cpumask_t m = CPU_MASK_NONE;
1246 /* Re-shadow this l3 table on any vcpus that are using it */
1247 for_each_vcpu ( d, v )
1248 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
1250 paging_update_cr3(v);
1251 cpus_or(m, m, v->vcpu_dirty_cpumask);
1253 flush_tlb_mask(&m);
1256 /* If below 4GB then the pgdir is not shadowed in low memory. */
1257 if ( !l3tab_needs_shadow(mfn) )
1258 return;
1260 for_each_vcpu ( d, v )
1262 cache = &v->arch.pae_l3_cache;
1264 spin_lock(&cache->lock);
1266 if ( cache->high_mfn == mfn )
1268 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1269 _ol3e = l3e_get_intpte(*l3tab_ptr);
1270 _nl3e = l3e_get_intpte(nl3e);
1271 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1272 BUG_ON(_pl3e != _ol3e);
1275 spin_unlock(&cache->lock);
1278 flush_tlb_mask(&d->domain_dirty_cpumask);
1280 #else
1281 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1282 #endif
1284 static int alloc_l2_table(struct page_info *page, unsigned long type,
1285 int preemptible)
1287 struct domain *d = page_get_owner(page);
1288 unsigned long pfn = page_to_mfn(page);
1289 l2_pgentry_t *pl2e;
1290 unsigned int i;
1291 int rc = 0;
1293 pl2e = map_domain_page(pfn);
1295 for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
1297 if ( preemptible && i && hypercall_preempt_check() )
1299 page->nr_validated_ptes = i;
1300 rc = -EAGAIN;
1301 break;
1304 if ( !is_guest_l2_slot(d, type, i) ||
1305 (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
1306 continue;
1308 if ( rc < 0 )
1310 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1311 while ( i-- > 0 )
1312 if ( is_guest_l2_slot(d, type, i) )
1313 put_page_from_l2e(pl2e[i], pfn);
1314 break;
1317 adjust_guest_l2e(pl2e[i], d);
1320 unmap_domain_page(pl2e);
1321 return rc > 0 ? 0 : rc;
1324 static int alloc_l3_table(struct page_info *page, int preemptible)
1326 struct domain *d = page_get_owner(page);
1327 unsigned long pfn = page_to_mfn(page);
1328 l3_pgentry_t *pl3e;
1329 unsigned int i;
1330 int rc = 0, partial = page->partial_pte;
1332 #if CONFIG_PAGING_LEVELS == 3
1333 /*
1334 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1335 * the weird 'extended cr3' format for dealing with high-order address
1336 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1337 */
1338 if ( (pfn >= 0x100000) &&
1339 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1340 d->vcpu && d->vcpu[0] && d->vcpu[0]->is_initialised )
1342 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1343 return -EINVAL;
1345 #endif
1347 pl3e = map_domain_page(pfn);
1349 /*
1350 * PAE guests allocate full pages, but aren't required to initialize
1351 * more than the first four entries; when running in compatibility
1352 * mode, however, the full page is visible to the MMU, and hence all
1353 * 512 entries must be valid/verified, which is most easily achieved
1354 * by clearing them out.
1355 */
1356 if ( is_pv_32on64_domain(d) )
1357 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1359 for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
1360 i++, partial = 0 )
1362 if ( is_pv_32bit_domain(d) && (i == 3) )
1364 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1365 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
1366 rc = -EINVAL;
1367 else
1368 rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1369 PGT_l2_page_table |
1370 PGT_pae_xen_l2,
1371 d, partial, preemptible);
1373 else if ( !is_guest_l3_slot(i) ||
1374 (rc = get_page_from_l3e(pl3e[i], pfn, d,
1375 partial, preemptible)) > 0 )
1376 continue;
1378 if ( rc == -EAGAIN )
1380 page->nr_validated_ptes = i;
1381 page->partial_pte = partial ?: 1;
1383 else if ( rc == -EINTR && i )
1385 page->nr_validated_ptes = i;
1386 page->partial_pte = 0;
1387 rc = -EAGAIN;
1389 if ( rc < 0 )
1390 break;
1392 adjust_guest_l3e(pl3e[i], d);
1395 if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
1396 rc = -EINVAL;
1397 if ( rc < 0 && rc != -EAGAIN && rc != -EINTR )
1399 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1400 while ( i-- > 0 )
1402 if ( !is_guest_l3_slot(i) )
1403 continue;
1404 unadjust_guest_l3e(pl3e[i], d);
1405 put_page_from_l3e(pl3e[i], pfn, 0, 0);
1409 unmap_domain_page(pl3e);
1410 return rc > 0 ? 0 : rc;
1413 #if CONFIG_PAGING_LEVELS >= 4
1414 static int alloc_l4_table(struct page_info *page, int preemptible)
1416 struct domain *d = page_get_owner(page);
1417 unsigned long pfn = page_to_mfn(page);
1418 l4_pgentry_t *pl4e = page_to_virt(page);
1419 unsigned int i;
1420 int rc = 0, partial = page->partial_pte;
1422 for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
1423 i++, partial = 0 )
1425 if ( !is_guest_l4_slot(d, i) ||
1426 (rc = get_page_from_l4e(pl4e[i], pfn, d,
1427 partial, preemptible)) > 0 )
1428 continue;
1430 if ( rc == -EAGAIN )
1432 page->nr_validated_ptes = i;
1433 page->partial_pte = partial ?: 1;
1435 else if ( rc == -EINTR )
1437 if ( i )
1439 page->nr_validated_ptes = i;
1440 page->partial_pte = 0;
1441 rc = -EAGAIN;
1444 else if ( rc < 0 )
1446 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1447 while ( i-- > 0 )
1448 if ( is_guest_l4_slot(d, i) )
1449 put_page_from_l4e(pl4e[i], pfn, 0, 0);
1451 if ( rc < 0 )
1452 return rc;
1454 adjust_guest_l4e(pl4e[i], d);
1457 /* Xen private mappings. */
1458 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1459 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1460 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1461 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1462 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1463 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1464 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1465 __PAGE_HYPERVISOR);
1467 return rc > 0 ? 0 : rc;
1469 #else
1470 #define alloc_l4_table(page, preemptible) (-EINVAL)
1471 #endif
1474 static void free_l1_table(struct page_info *page)
1476 struct domain *d = page_get_owner(page);
1477 unsigned long pfn = page_to_mfn(page);
1478 l1_pgentry_t *pl1e;
1479 unsigned int i;
1481 pl1e = map_domain_page(pfn);
1483 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1484 if ( is_guest_l1_slot(i) )
1485 put_page_from_l1e(pl1e[i], d);
1487 unmap_domain_page(pl1e);
1491 static int free_l2_table(struct page_info *page, int preemptible)
1493 #ifdef CONFIG_COMPAT
1494 struct domain *d = page_get_owner(page);
1495 #endif
1496 unsigned long pfn = page_to_mfn(page);
1497 l2_pgentry_t *pl2e;
1498 unsigned int i = page->nr_validated_ptes - 1;
1499 int err = 0;
1501 pl2e = map_domain_page(pfn);
1503 ASSERT(page->nr_validated_ptes);
1504 do {
1505 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
1506 put_page_from_l2e(pl2e[i], pfn) == 0 &&
1507 preemptible && i && hypercall_preempt_check() )
1509 page->nr_validated_ptes = i;
1510 err = -EAGAIN;
1512 } while ( !err && i-- );
1514 unmap_domain_page(pl2e);
1516 if ( !err )
1517 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1519 return err;
1522 static int free_l3_table(struct page_info *page, int preemptible)
1524 struct domain *d = page_get_owner(page);
1525 unsigned long pfn = page_to_mfn(page);
1526 l3_pgentry_t *pl3e;
1527 int rc = 0, partial = page->partial_pte;
1528 unsigned int i = page->nr_validated_ptes - !partial;
1530 pl3e = map_domain_page(pfn);
1532 do {
1533 if ( is_guest_l3_slot(i) )
1535 rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible);
1536 if ( rc < 0 )
1537 break;
1538 partial = 0;
1539 if ( rc > 0 )
1540 continue;
1541 unadjust_guest_l3e(pl3e[i], d);
1543 } while ( i-- );
1545 unmap_domain_page(pl3e);
1547 if ( rc == -EAGAIN )
1549 page->nr_validated_ptes = i;
1550 page->partial_pte = partial ?: -1;
1552 else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
1554 page->nr_validated_ptes = i + 1;
1555 page->partial_pte = 0;
1556 rc = -EAGAIN;
1558 return rc > 0 ? 0 : rc;
1561 #if CONFIG_PAGING_LEVELS >= 4
1562 static int free_l4_table(struct page_info *page, int preemptible)
1564 struct domain *d = page_get_owner(page);
1565 unsigned long pfn = page_to_mfn(page);
1566 l4_pgentry_t *pl4e = page_to_virt(page);
1567 int rc = 0, partial = page->partial_pte;
1568 unsigned int i = page->nr_validated_ptes - !partial;
1570 do {
1571 if ( is_guest_l4_slot(d, i) )
1572 rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible);
1573 if ( rc < 0 )
1574 break;
1575 partial = 0;
1576 } while ( i-- );
1578 if ( rc == -EAGAIN )
1580 page->nr_validated_ptes = i;
1581 page->partial_pte = partial ?: -1;
1583 else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
1585 page->nr_validated_ptes = i + 1;
1586 page->partial_pte = 0;
1587 rc = -EAGAIN;
1589 return rc > 0 ? 0 : rc;
1591 #else
1592 #define free_l4_table(page, preemptible) (-EINVAL)
1593 #endif
1595 static int page_lock(struct page_info *page)
1597 unsigned long x, nx;
1599 do {
1600 while ( (x = page->u.inuse.type_info) & PGT_locked )
1601 cpu_relax();
1602 nx = x + (1 | PGT_locked);
1603 if ( !(x & PGT_validated) ||
1604 !(x & PGT_count_mask) ||
1605 !(nx & PGT_count_mask) )
1606 return 0;
1607 } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
1609 return 1;
1612 static void page_unlock(struct page_info *page)
1614 unsigned long x, nx, y = page->u.inuse.type_info;
1616 do {
1617 x = y;
1618 nx = x - (1 | PGT_locked);
1619 } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
1622 /* How to write an entry to the guest pagetables.
1623 * Returns 0 for failure (pointer not valid), 1 for success. */
1624 static inline int update_intpte(intpte_t *p,
1625 intpte_t old,
1626 intpte_t new,
1627 unsigned long mfn,
1628 struct vcpu *v,
1629 int preserve_ad)
1631 int rv = 1;
1632 #ifndef PTE_UPDATE_WITH_CMPXCHG
1633 if ( !preserve_ad )
1635 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1637 else
1638 #endif
1640 intpte_t t = old;
1641 for ( ; ; )
1643 intpte_t _new = new;
1644 if ( preserve_ad )
1645 _new |= old & (_PAGE_ACCESSED | _PAGE_DIRTY);
1647 rv = paging_cmpxchg_guest_entry(v, p, &t, _new, _mfn(mfn));
1648 if ( unlikely(rv == 0) )
1650 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1651 ": saw %" PRIpte, old, _new, t);
1652 break;
1655 if ( t == old )
1656 break;
1658 /* Allowed to change in Accessed/Dirty flags only. */
1659 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1661 old = t;
1664 return rv;
1667 /* Macro that wraps the appropriate type-changes around update_intpte().
1668 * Arguments are: type, ptr, old, new, mfn, vcpu */
1669 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad) \
1670 update_intpte(&_t ## e_get_intpte(*(_p)), \
1671 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1672 (_m), (_v), (_ad))
1674 /* Update the L1 entry at pl1e to new value nl1e. */
1675 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1676 unsigned long gl1mfn, int preserve_ad,
1677 struct vcpu *vcpu)
1679 l1_pgentry_t ol1e;
1680 struct domain *d = vcpu->domain;
1681 unsigned long mfn;
1682 p2m_type_t p2mt;
1683 int rc = 1;
1685 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1686 return 0;
1688 if ( unlikely(paging_mode_refcounts(d)) )
1690 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu, preserve_ad);
1691 return rc;
1694 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1696 /* Translate foreign guest addresses. */
1697 mfn = mfn_x(gfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e), &p2mt));
1698 if ( !p2m_is_ram(p2mt) || unlikely(mfn == INVALID_MFN) )
1699 return 0;
1700 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1701 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1703 if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(d)) )
1705 MEM_LOG("Bad L1 flags %x",
1706 l1e_get_flags(nl1e) & l1_disallow_mask(d));
1707 return 0;
1710 /* Fast path for identical mapping, r/w and presence. */
1711 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1713 adjust_guest_l1e(nl1e, d);
1714 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu,
1715 preserve_ad);
1716 return rc;
1719 if ( unlikely(!get_page_from_l1e(nl1e, d, FOREIGNDOM)) )
1720 return 0;
1722 adjust_guest_l1e(nl1e, d);
1723 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu,
1724 preserve_ad)) )
1726 ol1e = nl1e;
1727 rc = 0;
1730 else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu,
1731 preserve_ad)) )
1733 return 0;
1736 put_page_from_l1e(ol1e, d);
1737 return rc;
1741 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1742 static int mod_l2_entry(l2_pgentry_t *pl2e,
1743 l2_pgentry_t nl2e,
1744 unsigned long pfn,
1745 int preserve_ad,
1746 struct vcpu *vcpu)
1748 l2_pgentry_t ol2e;
1749 struct domain *d = vcpu->domain;
1750 struct page_info *l2pg = mfn_to_page(pfn);
1751 unsigned long type = l2pg->u.inuse.type_info;
1752 int rc = 1;
1754 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1756 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1757 return 0;
1760 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1761 return 0;
1763 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1765 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1767 MEM_LOG("Bad L2 flags %x",
1768 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1769 return 0;
1772 /* Fast path for identical mapping and presence. */
1773 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT) )
1775 adjust_guest_l2e(nl2e, d);
1776 rc = UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, preserve_ad);
1777 return rc;
1780 if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) )
1781 return 0;
1783 adjust_guest_l2e(nl2e, d);
1784 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
1785 preserve_ad)) )
1787 ol2e = nl2e;
1788 rc = 0;
1791 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
1792 preserve_ad)) )
1794 return 0;
1797 put_page_from_l2e(ol2e, pfn);
1798 return rc;
1801 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1802 static int mod_l3_entry(l3_pgentry_t *pl3e,
1803 l3_pgentry_t nl3e,
1804 unsigned long pfn,
1805 int preserve_ad,
1806 int preemptible,
1807 struct vcpu *vcpu)
1809 l3_pgentry_t ol3e;
1810 struct domain *d = vcpu->domain;
1811 int rc = 0;
1813 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1815 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1816 return -EINVAL;
1819 /*
1820 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1821 * would be a pain to ensure they remain continuously valid throughout.
1822 */
1823 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1824 return -EINVAL;
1826 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1827 return -EFAULT;
1829 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1831 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1833 MEM_LOG("Bad L3 flags %x",
1834 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1835 return -EINVAL;
1838 /* Fast path for identical mapping and presence. */
1839 if ( !l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT) )
1841 adjust_guest_l3e(nl3e, d);
1842 rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, preserve_ad);
1843 return rc ? 0 : -EFAULT;
1846 rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible);
1847 if ( unlikely(rc < 0) )
1848 return rc;
1849 rc = 0;
1851 adjust_guest_l3e(nl3e, d);
1852 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
1853 preserve_ad)) )
1855 ol3e = nl3e;
1856 rc = -EFAULT;
1859 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
1860 preserve_ad)) )
1862 return -EFAULT;
1865 if ( likely(rc == 0) )
1867 if ( !create_pae_xen_mappings(d, pl3e) )
1868 BUG();
1870 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1873 put_page_from_l3e(ol3e, pfn, 0, 0);
1874 return rc;
1877 #if CONFIG_PAGING_LEVELS >= 4
1879 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1880 static int mod_l4_entry(l4_pgentry_t *pl4e,
1881 l4_pgentry_t nl4e,
1882 unsigned long pfn,
1883 int preserve_ad,
1884 int preemptible,
1885 struct vcpu *vcpu)
1887 struct domain *d = vcpu->domain;
1888 l4_pgentry_t ol4e;
1889 int rc = 0;
1891 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1893 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1894 return -EINVAL;
1897 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1898 return -EFAULT;
1900 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1902 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1904 MEM_LOG("Bad L4 flags %x",
1905 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1906 return -EINVAL;
1909 /* Fast path for identical mapping and presence. */
1910 if ( !l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT) )
1912 adjust_guest_l4e(nl4e, d);
1913 rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, preserve_ad);
1914 return rc ? 0 : -EFAULT;
1917 rc = get_page_from_l4e(nl4e, pfn, d, 0, preemptible);
1918 if ( unlikely(rc < 0) )
1919 return rc;
1920 rc = 0;
1922 adjust_guest_l4e(nl4e, d);
1923 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
1924 preserve_ad)) )
1926 ol4e = nl4e;
1927 rc = -EFAULT;
1930 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
1931 preserve_ad)) )
1933 return -EFAULT;
1936 put_page_from_l4e(ol4e, pfn, 0, 0);
1937 return rc;
1940 #endif
1942 void put_page(struct page_info *page)
1944 unsigned long nx, x, y = page->count_info;
1946 do {
1947 ASSERT((y & PGC_count_mask) != 0);
1948 x = y;
1949 nx = x - 1;
1951 while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) );
1953 if ( unlikely((nx & PGC_count_mask) == 0) )
1955 cleanup_page_cacheattr(page);
1956 free_domheap_page(page);
1961 struct domain *page_get_owner_and_reference(struct page_info *page)
1963 unsigned long x, y = page->count_info;
1965 do {
1966 x = y;
1967 /*
1968 * Count == 0: Page is not allocated, so we cannot take a reference.
1969 * Count == -1: Reference count would wrap, which is invalid.
1970 * Count == -2: Remaining unused ref is reserved for get_page_light().
1971 */
1972 if ( unlikely(((x + 2) & PGC_count_mask) <= 2) )
1973 return NULL;
1975 while ( (y = cmpxchg(&page->count_info, x, x + 1)) != x );
1977 return page_get_owner(page);
1981 int get_page(struct page_info *page, struct domain *domain)
1983 struct domain *owner = page_get_owner_and_reference(page);
1985 if ( likely(owner == domain) )
1986 return 1;
1988 if ( owner != NULL )
1989 put_page(page);
1991 if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
1992 gdprintk(XENLOG_INFO,
1993 "Error pfn %lx: rd=%p, od=%p, caf=%08lx, taf=%"
1994 PRtype_info "\n",
1995 page_to_mfn(page), domain, owner,
1996 page->count_info, page->u.inuse.type_info);
1997 return 0;
2000 /*
2001 * Special version of get_page() to be used exclusively when
2002 * - a page is known to already have a non-zero reference count
2003 * - the page does not need its owner to be checked
2004 * - it will not be called more than once without dropping the thus
2005 * acquired reference again.
2006 * Due to get_page() reserving one reference, this call cannot fail.
2007 */
2008 static void get_page_light(struct page_info *page)
2010 unsigned long x, nx, y = page->count_info;
2012 do {
2013 x = y;
2014 nx = x + 1;
2015 BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
2016 BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
2017 y = cmpxchg(&page->count_info, x, nx);
2019 while ( unlikely(y != x) );
2022 static int alloc_page_type(struct page_info *page, unsigned long type,
2023 int preemptible)
2025 struct domain *owner = page_get_owner(page);
2026 int rc;
2028 /* A page table is dirtied when its type count becomes non-zero. */
2029 if ( likely(owner != NULL) )
2030 paging_mark_dirty(owner, page_to_mfn(page));
2032 switch ( type & PGT_type_mask )
2034 case PGT_l1_page_table:
2035 rc = alloc_l1_table(page);
2036 break;
2037 case PGT_l2_page_table:
2038 rc = alloc_l2_table(page, type, preemptible);
2039 break;
2040 case PGT_l3_page_table:
2041 rc = alloc_l3_table(page, preemptible);
2042 break;
2043 case PGT_l4_page_table:
2044 rc = alloc_l4_table(page, preemptible);
2045 break;
2046 case PGT_seg_desc_page:
2047 rc = alloc_segdesc_page(page);
2048 break;
2049 default:
2050 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%lx\n",
2051 type, page->u.inuse.type_info,
2052 page->count_info);
2053 rc = -EINVAL;
2054 BUG();
2057 /* No need for atomic update of type_info here: no one else updates it. */
2058 wmb();
2059 if ( rc == -EAGAIN )
2061 get_page_light(page);
2062 page->u.inuse.type_info |= PGT_partial;
2064 else if ( rc == -EINTR )
2066 ASSERT((page->u.inuse.type_info &
2067 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2068 page->u.inuse.type_info &= ~PGT_count_mask;
2070 else if ( rc )
2072 ASSERT(rc < 0);
2073 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
2074 PRtype_info ": caf=%08lx taf=%" PRtype_info,
2075 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
2076 type, page->count_info, page->u.inuse.type_info);
2077 page->u.inuse.type_info = 0;
2079 else
2081 page->u.inuse.type_info |= PGT_validated;
2084 return rc;
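/*
 * Callers of the preemptible validation path therefore see three
 * outcomes: -EAGAIN (validation parked with PGT_partial plus a light
 * reference), -EINTR (validation wound back completely), or 0/-error.
 * Hedged sketch of how a preemptible caller reacts, mirroring the
 * MMUEXT_PIN_* handling in do_mmuext_op() below:
 *
 *     rc = get_page_and_type_from_pagenr(mfn, type, d, 0, 1);
 *     if ( rc == -EINTR )
 *         rc = -EAGAIN;        // restart this operation from scratch
 *     if ( rc == -EAGAIN )
 *         ...create a hypercall continuation and bail out...
 */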
2088 int free_page_type(struct page_info *page, unsigned long type,
2089 int preemptible)
2091 struct domain *owner = page_get_owner(page);
2092 unsigned long gmfn;
2093 int rc;
2095 if ( likely(owner != NULL) && unlikely(paging_mode_enabled(owner)) )
2097 /* A page table is dirtied when its type count becomes zero. */
2098 paging_mark_dirty(owner, page_to_mfn(page));
2100 if ( shadow_mode_refcounts(owner) )
2101 return 0;
2103 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
2104 ASSERT(VALID_M2P(gmfn));
2105 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
2108 if ( !(type & PGT_partial) )
2110 page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
2111 page->partial_pte = 0;
2114 switch ( type & PGT_type_mask )
2116 case PGT_l1_page_table:
2117 free_l1_table(page);
2118 rc = 0;
2119 break;
2120 case PGT_l2_page_table:
2121 rc = free_l2_table(page, preemptible);
2122 break;
2123 case PGT_l3_page_table:
2124 #if CONFIG_PAGING_LEVELS == 3
2125 if ( !(type & PGT_partial) )
2126 page->nr_validated_ptes = L3_PAGETABLE_ENTRIES;
2127 #endif
2128 rc = free_l3_table(page, preemptible);
2129 break;
2130 case PGT_l4_page_table:
2131 rc = free_l4_table(page, preemptible);
2132 break;
2133 default:
2134 MEM_LOG("type %lx pfn %lx\n", type, page_to_mfn(page));
2135 rc = -EINVAL;
2136 BUG();
2139 return rc;
2143 static int __put_final_page_type(
2144 struct page_info *page, unsigned long type, int preemptible)
2146 int rc = free_page_type(page, type, preemptible);
2148 /* No need for atomic update of type_info here: no one else updates it. */
2149 if ( rc == 0 )
2151 /*
2152 * Record TLB information for flush later. We do not stamp page tables
2153 * when running in shadow mode:
2154 * 1. Pointless, since it is the shadow page tables that must be tracked.
2155 * 2. Shadow mode reuses this field for shadowed page tables to
2156 * store flags info -- we don't want to conflict with that.
2157 */
2158 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2159 (page->count_info & PGC_page_table)) )
2160 page->tlbflush_timestamp = tlbflush_current_time();
2161 wmb();
2162 page->u.inuse.type_info--;
2164 else if ( rc == -EINTR )
2166 ASSERT((page->u.inuse.type_info &
2167 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2168 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2169 (page->count_info & PGC_page_table)) )
2170 page->tlbflush_timestamp = tlbflush_current_time();
2171 wmb();
2172 page->u.inuse.type_info |= PGT_validated;
2174 else
2176 BUG_ON(rc != -EAGAIN);
2177 wmb();
2178 get_page_light(page);
2179 page->u.inuse.type_info |= PGT_partial;
2182 return rc;
2186 static int __put_page_type(struct page_info *page,
2187 int preemptible)
2189 unsigned long nx, x, y = page->u.inuse.type_info;
2190 int rc = 0;
2192 for ( ; ; )
2194 x = y;
2195 nx = x - 1;
2197 ASSERT((x & PGT_count_mask) != 0);
2199 if ( unlikely((nx & PGT_count_mask) == 0) )
2201 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
2202 likely(nx & (PGT_validated|PGT_partial)) )
2204 /*
2205 * Page-table pages must be unvalidated when count is zero. The
2206 * 'free' is safe because the refcnt is non-zero and validated
2207 * bit is clear => other ops will spin or fail.
2208 */
2209 nx = x & ~(PGT_validated|PGT_partial);
2210 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
2211 x, nx)) != x) )
2212 continue;
2213 /* We cleared the 'validated' bit, so we do the cleanup. */
2214 rc = __put_final_page_type(page, x, preemptible);
2215 if ( x & PGT_partial )
2216 put_page(page);
2217 break;
2220 /*
2221 * Record TLB information for flush later. We do not stamp page
2222 * tables when running in shadow mode:
2223 * 1. Pointless, since it is the shadow page tables that must be tracked.
2224 * 2. Shadow mode reuses this field for shadowed page tables to
2225 * store flags info -- we don't want to conflict with that.
2226 */
2227 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2228 (page->count_info & PGC_page_table)) )
2229 page->tlbflush_timestamp = tlbflush_current_time();
2232 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2233 break;
2235 if ( preemptible && hypercall_preempt_check() )
2236 return -EINTR;
2239 return rc;
2243 static int __get_page_type(struct page_info *page, unsigned long type,
2244 int preemptible)
2246 unsigned long nx, x, y = page->u.inuse.type_info;
2247 int rc = 0;
2249 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
2251 for ( ; ; )
2253 x = y;
2254 nx = x + 1;
2255 if ( unlikely((nx & PGT_count_mask) == 0) )
2257 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
2258 return -EINVAL;
2260 else if ( unlikely((x & PGT_count_mask) == 0) )
2262 struct domain *d = page_get_owner(page);
2264 /* Normally we should never let a page go from type count 0
2265 * to type count 1 when it is shadowed. One exception:
2266 * out-of-sync shadowed pages are allowed to become
2267 * writable. */
2268 if ( d && shadow_mode_enabled(d)
2269 && (page->count_info & PGC_page_table)
2270 && !((page->shadow_flags & (1u<<29))
2271 && type == PGT_writable_page) )
2272 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
2274 ASSERT(!(x & PGT_pae_xen_l2));
2275 if ( (x & PGT_type_mask) != type )
2277 /*
2278 * On a type change we check whether stale TLB entries need to be
2279 * flushed. This may be unnecessary (e.g., the page was a GDT/LDT)
2280 * but such circumstances should be very rare.
2281 */
2282 cpumask_t mask = d->domain_dirty_cpumask;
2284 /* Don't flush if the timestamp is old enough */
2285 tlbflush_filter(mask, page->tlbflush_timestamp);
2287 if ( unlikely(!cpus_empty(mask)) &&
2288 /* Shadow mode: track only writable pages. */
2289 (!shadow_mode_enabled(page_get_owner(page)) ||
2290 ((nx & PGT_type_mask) == PGT_writable_page)) )
2292 perfc_incr(need_flush_tlb_flush);
2293 flush_tlb_mask(&mask);
2296 /* We lose existing type and validity. */
2297 nx &= ~(PGT_type_mask | PGT_validated);
2298 nx |= type;
2300 /* No special validation needed for writable pages. */
2301 /* Page tables and GDT/LDT need to be scanned for validity. */
2302 if ( type == PGT_writable_page )
2303 nx |= PGT_validated;
2306 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
2308 /* Don't log failure if it could be a recursive-mapping attempt. */
2309 if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
2310 (type == PGT_l1_page_table) )
2311 return -EINVAL;
2312 if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
2313 (type == PGT_l2_page_table) )
2314 return -EINVAL;
2315 if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
2316 (type == PGT_l3_page_table) )
2317 return -EINVAL;
2318 MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
2319 "for mfn %lx (pfn %lx)",
2320 x, type, page_to_mfn(page),
2321 get_gpfn_from_mfn(page_to_mfn(page)));
2322 return -EINVAL;
2324 else if ( unlikely(!(x & PGT_validated)) )
2326 if ( !(x & PGT_partial) )
2328 /* Someone else is updating validation of this page. Wait... */
2329 while ( (y = page->u.inuse.type_info) == x )
2331 if ( preemptible && hypercall_preempt_check() )
2332 return -EINTR;
2333 cpu_relax();
2335 continue;
2337 /* Type ref count was left at 1 when PGT_partial got set. */
2338 ASSERT((x & PGT_count_mask) == 1);
2339 nx = x & ~PGT_partial;
2342 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2343 break;
2345 if ( preemptible && hypercall_preempt_check() )
2346 return -EINTR;
2349 if ( unlikely((x & PGT_type_mask) != type) )
2351 /* Special pages should not be accessible from devices. */
2352 struct domain *d = page_get_owner(page);
2353 if ( d && unlikely(need_iommu(d)) )
2355 if ( (x & PGT_type_mask) == PGT_writable_page )
2356 iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));
2357 else if ( type == PGT_writable_page )
2358 iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
2359 page_to_mfn(page));
2363 if ( unlikely(!(nx & PGT_validated)) )
2365 if ( !(x & PGT_partial) )
2367 page->nr_validated_ptes = 0;
2368 page->partial_pte = 0;
2370 rc = alloc_page_type(page, type, preemptible);
2373 if ( (x & PGT_partial) && !(nx & PGT_partial) )
2374 put_page(page);
2376 return rc;
2379 void put_page_type(struct page_info *page)
2381 int rc = __put_page_type(page, 0);
2382 ASSERT(rc == 0);
2383 (void)rc;
2386 int get_page_type(struct page_info *page, unsigned long type)
2388 int rc = __get_page_type(page, type, 0);
2389 if ( likely(rc == 0) )
2390 return 1;
2391 ASSERT(rc == -EINVAL);
2392 return 0;
2395 int put_page_type_preemptible(struct page_info *page)
2397 return __put_page_type(page, 1);
2400 int get_page_type_preemptible(struct page_info *page, unsigned long type)
2402 return __get_page_type(page, type, 1);
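/*
 * Typical pairing of the non-preemptible wrappers, as used by the
 * writable-page fallback in do_mmu_update() below (sketch only):
 *
 *     if ( get_page_type(page, PGT_writable_page) )
 *     {
 *         ...treat the frame as ordinary data...
 *         put_page_type(page);
 *     }
 */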
2405 void cleanup_page_cacheattr(struct page_info *page)
2407 uint32_t cacheattr =
2408 (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base;
2410 if ( likely(cacheattr == 0) )
2411 return;
2413 page->count_info &= ~PGC_cacheattr_mask;
2415 BUG_ON(is_xen_heap_page(page));
2417 update_xen_mappings(page_to_mfn(page), 0);
2421 int new_guest_cr3(unsigned long mfn)
2423 struct vcpu *curr = current;
2424 struct domain *d = curr->domain;
2425 int okay;
2426 unsigned long old_base_mfn;
2428 #ifdef CONFIG_COMPAT
2429 if ( is_pv_32on64_domain(d) )
2431 okay = paging_mode_refcounts(d)
2432 ? 0 /* Old code was broken, but what should it be? */
2433 : mod_l4_entry(
2434 __va(pagetable_get_paddr(curr->arch.guest_table)),
2435 l4e_from_pfn(
2436 mfn,
2437 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
2438 pagetable_get_pfn(curr->arch.guest_table), 0, 0, curr) == 0;
2439 if ( unlikely(!okay) )
2441 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
2442 return 0;
2445 invalidate_shadow_ldt(curr, 0);
2446 write_ptbase(curr);
2448 return 1;
2450 #endif
2451 okay = paging_mode_refcounts(d)
2452 ? get_page_from_pagenr(mfn, d)
2453 : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
2454 if ( unlikely(!okay) )
2456 MEM_LOG("Error while installing new baseptr %lx", mfn);
2457 return 0;
2460 invalidate_shadow_ldt(curr, 0);
2462 old_base_mfn = pagetable_get_pfn(curr->arch.guest_table);
2464 curr->arch.guest_table = pagetable_from_pfn(mfn);
2465 update_cr3(curr);
2467 write_ptbase(curr);
2469 if ( likely(old_base_mfn != 0) )
2471 if ( paging_mode_refcounts(d) )
2472 put_page(mfn_to_page(old_base_mfn));
2473 else
2474 put_page_and_type(mfn_to_page(old_base_mfn));
2477 return 1;
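/*
 * new_guest_cr3() backs the MMUEXT_NEW_BASEPTR operation handled in
 * do_mmuext_op() below. Hedged guest-side sketch (assumes the
 * conventional HYPERVISOR_mmuext_op wrapper; names outside this file
 * are placeholders):
 *
 *     struct mmuext_op op;
 *     op.cmd = MMUEXT_NEW_BASEPTR;
 *     op.arg1.mfn = new_top_level_mfn;
 *     HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
 */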
2480 static void process_deferred_ops(void)
2482 unsigned int deferred_ops;
2483 struct domain *d = current->domain;
2484 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2486 deferred_ops = info->deferred_ops;
2487 info->deferred_ops = 0;
2489 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
2491 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
2492 flush_tlb_mask(&d->domain_dirty_cpumask);
2493 else
2494 flush_tlb_local();
2497 /*
2498 * Do this after flushing TLBs, to ensure we see fresh LDT mappings
2499 * via the linear pagetable mapping.
2500 */
2501 if ( deferred_ops & DOP_RELOAD_LDT )
2502 (void)map_ldt_shadow_page(0);
2504 if ( unlikely(info->foreign != NULL) )
2506 rcu_unlock_domain(info->foreign);
2507 info->foreign = NULL;
2511 static int set_foreigndom(domid_t domid)
2513 struct domain *e, *d = current->domain;
2514 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2515 int okay = 1;
2517 ASSERT(info->foreign == NULL);
2519 if ( likely(domid == DOMID_SELF) )
2520 goto out;
2522 if ( unlikely(domid == d->domain_id) )
2524 MEM_LOG("Cannot specify itself as foreign domain");
2525 okay = 0;
2527 else if ( unlikely(paging_mode_translate(d)) )
2529 MEM_LOG("Cannot mix foreign mappings with translated domains");
2530 okay = 0;
2532 else switch ( domid )
2534 case DOMID_IO:
2535 info->foreign = rcu_lock_domain(dom_io);
2536 break;
2537 case DOMID_XEN:
2538 if ( !IS_PRIV(d) )
2539 MEM_LOG("Cannot set foreign dom");
2540 okay = 0;
2541 break;
2543 info->foreign = rcu_lock_domain(dom_xen);
2544 break;
2545 default:
2546 if ( (e = rcu_lock_domain_by_id(domid)) == NULL )
2548 MEM_LOG("Unknown domain '%u'", domid);
2549 okay = 0;
2550 break;
2552 if ( !IS_PRIV_FOR(d, e) )
2554 MEM_LOG("Cannot set foreign dom");
2555 okay = 0;
2556 rcu_unlock_domain(e);
2557 break;
2559 info->foreign = e;
2560 break;
2563 out:
2564 return okay;
2567 static inline int vcpumask_to_pcpumask(
2568 struct domain *d, XEN_GUEST_HANDLE(const_void) bmap, cpumask_t *pmask)
2570 unsigned int vcpu_id, vcpu_bias, offs;
2571 unsigned long vmask;
2572 struct vcpu *v;
2573 bool_t is_native = !is_pv_32on64_domain(d);
2575 cpus_clear(*pmask);
2576 for ( vmask = 0, offs = 0; ; ++offs)
2578 vcpu_bias = offs * (is_native ? BITS_PER_LONG : 32);
2579 if ( vcpu_bias >= d->max_vcpus )
2580 return 0;
2582 if ( unlikely(is_native ?
2583 copy_from_guest_offset(&vmask, bmap, offs, 1) :
2584 copy_from_guest_offset((unsigned int *)&vmask, bmap,
2585 offs, 1)) )
2587 cpus_clear(*pmask);
2588 return -EFAULT;
2591 while ( vmask )
2593 vcpu_id = find_first_set_bit(vmask);
2594 vmask &= ~(1UL << vcpu_id);
2595 vcpu_id += vcpu_bias;
2596 if ( (vcpu_id >= d->max_vcpus) )
2597 return 0;
2598 if ( ((v = d->vcpu[vcpu_id]) != NULL) )
2599 cpus_or(*pmask, *pmask, v->vcpu_dirty_cpumask);
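/*
 * The bitmap consumed above is one bit per vCPU, read in word-sized
 * chunks (BITS_PER_LONG for native guests, 32 for 32-on-64 guests).
 * Hedged guest-side sketch for MMUEXT_INVLPG_MULTI (assumes the usual
 * HYPERVISOR_mmuext_op wrapper):
 *
 *     unsigned long vmask = 1UL << 2;      // target vCPU 2 only
 *     struct mmuext_op op;
 *     op.cmd = MMUEXT_INVLPG_MULTI;
 *     op.arg1.linear_addr = va;
 *     set_xen_guest_handle(op.arg2.vcpumask, &vmask);
 *     HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
 */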
2604 #ifdef __i386__
2605 static inline void *fixmap_domain_page(unsigned long mfn)
2607 unsigned int cpu = smp_processor_id();
2608 void *ptr = (void *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
2610 l1e_write(fix_pae_highmem_pl1e - cpu,
2611 l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
2612 flush_tlb_one_local(ptr);
2613 return ptr;
2615 static inline void fixunmap_domain_page(const void *ptr)
2617 unsigned int cpu = virt_to_fix((unsigned long)ptr) - FIX_PAE_HIGHMEM_0;
2619 l1e_write(fix_pae_highmem_pl1e - cpu, l1e_empty());
2620 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
2622 #else
2623 #define fixmap_domain_page(mfn) mfn_to_virt(mfn)
2624 #define fixunmap_domain_page(ptr) ((void)(ptr))
2625 #endif
2627 int do_mmuext_op(
2628 XEN_GUEST_HANDLE(mmuext_op_t) uops,
2629 unsigned int count,
2630 XEN_GUEST_HANDLE(uint) pdone,
2631 unsigned int foreigndom)
2633 struct mmuext_op op;
2634 int rc = 0, i = 0, okay;
2635 unsigned long mfn = 0, gmfn = 0, type;
2636 unsigned int done = 0;
2637 struct page_info *page;
2638 struct vcpu *curr = current;
2639 struct domain *d = curr->domain;
2641 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2643 count &= ~MMU_UPDATE_PREEMPTED;
2644 if ( unlikely(!guest_handle_is_null(pdone)) )
2645 (void)copy_from_guest(&done, pdone, 1);
2647 else
2648 perfc_incr(calls_to_mmuext_op);
2650 if ( unlikely(!guest_handle_okay(uops, count)) )
2652 rc = -EFAULT;
2653 goto out;
2656 if ( !set_foreigndom(foreigndom) )
2658 rc = -ESRCH;
2659 goto out;
2662 for ( i = 0; i < count; i++ )
2664 if ( hypercall_preempt_check() )
2666 rc = -EAGAIN;
2667 break;
2670 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2672 MEM_LOG("Bad __copy_from_guest");
2673 rc = -EFAULT;
2674 break;
2677 okay = 1;
2678 gmfn = op.arg1.mfn;
2679 mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
2680 page = mfn_to_page(mfn);
2682 switch ( op.cmd )
2684 case MMUEXT_PIN_L1_TABLE:
2685 type = PGT_l1_page_table;
2686 goto pin_page;
2688 case MMUEXT_PIN_L2_TABLE:
2689 type = PGT_l2_page_table;
2690 goto pin_page;
2692 case MMUEXT_PIN_L3_TABLE:
2693 type = PGT_l3_page_table;
2694 goto pin_page;
2696 case MMUEXT_PIN_L4_TABLE:
2697 if ( is_pv_32bit_domain(FOREIGNDOM) )
2698 break;
2699 type = PGT_l4_page_table;
2701 pin_page:
2702 rc = xsm_memory_pin_page(d, page);
2703 if ( rc )
2704 break;
2706 /* Ignore pinning of invalid paging levels. */
2707 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2708 break;
2710 if ( paging_mode_refcounts(FOREIGNDOM) )
2711 break;
2713 rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 0, 1);
2714 okay = !rc;
2715 if ( unlikely(!okay) )
2717 if ( rc == -EINTR )
2718 rc = -EAGAIN;
2719 else if ( rc != -EAGAIN )
2720 MEM_LOG("Error while pinning mfn %lx", mfn);
2721 break;
2724 if ( unlikely(test_and_set_bit(_PGT_pinned,
2725 &page->u.inuse.type_info)) )
2727 MEM_LOG("Mfn %lx already pinned", mfn);
2728 put_page_and_type(page);
2729 okay = 0;
2730 break;
2733 /* A page is dirtied when its pin status is set. */
2734 paging_mark_dirty(d, mfn);
2736 /* We can race domain destruction (domain_relinquish_resources). */
2737 if ( unlikely(this_cpu(percpu_mm_info).foreign != NULL) )
2739 int drop_ref;
2740 spin_lock(&FOREIGNDOM->page_alloc_lock);
2741 drop_ref = (FOREIGNDOM->is_dying &&
2742 test_and_clear_bit(_PGT_pinned,
2743 &page->u.inuse.type_info));
2744 spin_unlock(&FOREIGNDOM->page_alloc_lock);
2745 if ( drop_ref )
2746 put_page_and_type(page);
2749 break;
2751 case MMUEXT_UNPIN_TABLE:
2752 if ( paging_mode_refcounts(d) )
2753 break;
2755 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2757 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2758 mfn, page_get_owner(page));
2760 else if ( likely(test_and_clear_bit(_PGT_pinned,
2761 &page->u.inuse.type_info)) )
2763 put_page_and_type(page);
2764 put_page(page);
2765 if ( !rc )
2767 /* A page is dirtied when its pin status is cleared. */
2768 paging_mark_dirty(d, mfn);
2771 else
2773 okay = 0;
2774 put_page(page);
2775 MEM_LOG("Mfn %lx not pinned", mfn);
2777 break;
2779 case MMUEXT_NEW_BASEPTR:
2780 okay = new_guest_cr3(mfn);
2781 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2782 break;
2784 #ifdef __x86_64__
2785 case MMUEXT_NEW_USER_BASEPTR: {
2786 unsigned long old_mfn;
2788 if ( mfn != 0 )
2790 if ( paging_mode_refcounts(d) )
2791 okay = get_page_from_pagenr(mfn, d);
2792 else
2793 okay = !get_page_and_type_from_pagenr(
2794 mfn, PGT_root_page_table, d, 0, 0);
2795 if ( unlikely(!okay) )
2797 MEM_LOG("Error while installing new mfn %lx", mfn);
2798 break;
2802 old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
2803 curr->arch.guest_table_user = pagetable_from_pfn(mfn);
2805 if ( old_mfn != 0 )
2807 if ( paging_mode_refcounts(d) )
2808 put_page(mfn_to_page(old_mfn));
2809 else
2810 put_page_and_type(mfn_to_page(old_mfn));
2813 break;
2815 #endif
2817 case MMUEXT_TLB_FLUSH_LOCAL:
2818 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2819 break;
2821 case MMUEXT_INVLPG_LOCAL:
2822 if ( !paging_mode_enabled(d)
2823 || paging_invlpg(curr, op.arg1.linear_addr) != 0 )
2824 flush_tlb_one_local(op.arg1.linear_addr);
2825 break;
2827 case MMUEXT_TLB_FLUSH_MULTI:
2828 case MMUEXT_INVLPG_MULTI:
2830 cpumask_t pmask;
2832 if ( unlikely(vcpumask_to_pcpumask(d, op.arg2.vcpumask, &pmask)) )
2834 okay = 0;
2835 break;
2837 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2838 flush_tlb_mask(&pmask);
2839 else
2840 flush_tlb_one_mask(&pmask, op.arg1.linear_addr);
2841 break;
2844 case MMUEXT_TLB_FLUSH_ALL:
2845 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
2846 break;
2848 case MMUEXT_INVLPG_ALL:
2849 flush_tlb_one_mask(&d->domain_dirty_cpumask, op.arg1.linear_addr);
2850 break;
2852 case MMUEXT_FLUSH_CACHE:
2853 if ( unlikely(!cache_flush_permitted(d)) )
2855 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2856 okay = 0;
2858 else
2860 wbinvd();
2862 break;
2864 case MMUEXT_SET_LDT:
2866 unsigned long ptr = op.arg1.linear_addr;
2867 unsigned long ents = op.arg2.nr_ents;
2869 if ( paging_mode_external(d) )
2871 MEM_LOG("ignoring SET_LDT hypercall from external domain");
2872 okay = 0;
2874 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2875 (ents > 8192) ||
2876 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2878 okay = 0;
2879 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2881 else if ( (curr->arch.guest_context.ldt_ents != ents) ||
2882 (curr->arch.guest_context.ldt_base != ptr) )
2884 invalidate_shadow_ldt(curr, 0);
2885 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2886 curr->arch.guest_context.ldt_base = ptr;
2887 curr->arch.guest_context.ldt_ents = ents;
2888 load_LDT(curr);
2889 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2890 if ( ents != 0 )
2891 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2893 break;
2896 case MMUEXT_CLEAR_PAGE:
2898 unsigned char *ptr;
2900 okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
2901 FOREIGNDOM, 0, 0);
2902 if ( unlikely(!okay) )
2904 MEM_LOG("Error while clearing mfn %lx", mfn);
2905 break;
2908 /* A page is dirtied when it's being cleared. */
2909 paging_mark_dirty(d, mfn);
2911 ptr = fixmap_domain_page(mfn);
2912 clear_page(ptr);
2913 fixunmap_domain_page(ptr);
2915 put_page_and_type(page);
2916 break;
2919 case MMUEXT_COPY_PAGE:
2921 const unsigned char *src;
2922 unsigned char *dst;
2923 unsigned long src_mfn;
2925 src_mfn = gmfn_to_mfn(FOREIGNDOM, op.arg2.src_mfn);
2926 okay = get_page_from_pagenr(src_mfn, FOREIGNDOM);
2927 if ( unlikely(!okay) )
2929 MEM_LOG("Error while copying from mfn %lx", src_mfn);
2930 break;
2933 okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
2934 FOREIGNDOM, 0, 0);
2935 if ( unlikely(!okay) )
2937 put_page(mfn_to_page(src_mfn));
2938 MEM_LOG("Error while copying to mfn %lx", mfn);
2939 break;
2942 /* A page is dirtied when it's being copied to. */
2943 paging_mark_dirty(d, mfn);
2945 src = map_domain_page(src_mfn);
2946 dst = fixmap_domain_page(mfn);
2947 copy_page(dst, src);
2948 fixunmap_domain_page(dst);
2949 unmap_domain_page(src);
2951 put_page_and_type(page);
2952 put_page(mfn_to_page(src_mfn));
2953 break;
2956 default:
2957 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2958 rc = -ENOSYS;
2959 okay = 0;
2960 break;
2963 if ( unlikely(!okay) )
2965 rc = rc ? rc : -EINVAL;
2966 break;
2969 guest_handle_add_offset(uops, 1);
2972 if ( rc == -EAGAIN )
2973 rc = hypercall_create_continuation(
2974 __HYPERVISOR_mmuext_op, "hihi",
2975 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2977 process_deferred_ops();
2979 perfc_add(num_mmuext_ops, i);
2981 out:
2982 /* Add incremental work we have done to the @done output parameter. */
2983 if ( unlikely(!guest_handle_is_null(pdone)) )
2985 done += i;
2986 copy_to_guest(pdone, &done, 1);
2989 return rc;
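/*
 * Hedged guest-side example of the MMUEXT_PIN_* path above (assumes the
 * usual HYPERVISOR_mmuext_op wrapper): a PV guest pins a freshly built
 * L1 table once it no longer maps the frame writable:
 *
 *     struct mmuext_op op;
 *     op.cmd = MMUEXT_PIN_L1_TABLE;
 *     op.arg1.mfn = l1_table_mfn;
 *     if ( HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) )
 *         BUG();
 */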
2992 int do_mmu_update(
2993 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2994 unsigned int count,
2995 XEN_GUEST_HANDLE(uint) pdone,
2996 unsigned int foreigndom)
2998 struct mmu_update req;
2999 void *va;
3000 unsigned long gpfn, gmfn, mfn;
3001 struct page_info *page;
3002 int rc = 0, okay = 1, i = 0;
3003 unsigned int cmd, done = 0, pt_dom;
3004 struct domain *d = current->domain, *pt_owner = d;
3005 struct vcpu *v = current;
3006 struct domain_mmap_cache mapcache;
3008 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
3010 count &= ~MMU_UPDATE_PREEMPTED;
3011 if ( unlikely(!guest_handle_is_null(pdone)) )
3012 (void)copy_from_guest(&done, pdone, 1);
3014 else
3015 perfc_incr(calls_to_mmu_update);
3017 if ( unlikely(!guest_handle_okay(ureqs, count)) )
3019 rc = -EFAULT;
3020 goto out;
3023 if ( (pt_dom = foreigndom >> 16) != 0 )
3025 /* Pagetables belong to a foreign domain (PFD, upper 16 bits of foreigndom). */
3026 if ( (pt_owner = rcu_lock_domain_by_id(pt_dom - 1)) == NULL )
3028 rc = -EINVAL;
3029 goto out;
3031 if ( pt_owner == d )
3032 rcu_unlock_domain(pt_owner);
3033 if ( (v = pt_owner->vcpu ? pt_owner->vcpu[0] : NULL) == NULL )
3035 rc = -EINVAL;
3036 goto out;
3038 if ( !IS_PRIV_FOR(d, pt_owner) )
3040 rc = -ESRCH;
3041 goto out;
3045 if ( !set_foreigndom((uint16_t)foreigndom) )
3047 rc = -ESRCH;
3048 goto out;
3051 domain_mmap_cache_init(&mapcache);
3053 for ( i = 0; i < count; i++ )
3055 if ( hypercall_preempt_check() )
3057 rc = -EAGAIN;
3058 break;
3061 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
3063 MEM_LOG("Bad __copy_from_guest");
3064 rc = -EFAULT;
3065 break;
3068 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
3069 okay = 0;
3071 switch ( cmd )
3073 /*
3074 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
3075 * MMU_PT_UPDATE_PRESERVE_AD: As above but also preserve (OR)
3076 * current A/D bits.
3077 */
3078 case MMU_NORMAL_PT_UPDATE:
3079 case MMU_PT_UPDATE_PRESERVE_AD:
3080 rc = xsm_mmu_normal_update(d, FOREIGNDOM, req.val);
3081 if ( rc )
3082 break;
3084 req.ptr -= cmd;
3085 gmfn = req.ptr >> PAGE_SHIFT;
3086 mfn = gmfn_to_mfn(pt_owner, gmfn);
3088 if ( unlikely(!get_page_from_pagenr(mfn, pt_owner)) )
3090 MEM_LOG("Could not get page for normal update");
3091 break;
3094 va = map_domain_page_with_cache(mfn, &mapcache);
3095 va = (void *)((unsigned long)va +
3096 (unsigned long)(req.ptr & ~PAGE_MASK));
3097 page = mfn_to_page(mfn);
3099 if ( page_lock(page) )
3101 switch ( page->u.inuse.type_info & PGT_type_mask )
3103 case PGT_l1_page_table:
3105 l1_pgentry_t l1e = l1e_from_intpte(req.val);
3106 okay = mod_l1_entry(va, l1e, mfn,
3107 cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
3109 break;
3110 case PGT_l2_page_table:
3112 l2_pgentry_t l2e = l2e_from_intpte(req.val);
3113 okay = mod_l2_entry(va, l2e, mfn,
3114 cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
3116 break;
3117 case PGT_l3_page_table:
3119 l3_pgentry_t l3e = l3e_from_intpte(req.val);
3120 rc = mod_l3_entry(va, l3e, mfn,
3121 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1, v);
3122 okay = !rc;
3124 break;
3125 #if CONFIG_PAGING_LEVELS >= 4
3126 case PGT_l4_page_table:
3128 l4_pgentry_t l4e = l4e_from_intpte(req.val);
3129 rc = mod_l4_entry(va, l4e, mfn,
3130 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1, v);
3131 okay = !rc;
3133 break;
3134 #endif
3135 case PGT_writable_page:
3136 perfc_incr(writable_mmu_updates);
3137 okay = paging_write_guest_entry(
3138 v, va, req.val, _mfn(mfn));
3139 break;
3141 page_unlock(page);
3142 if ( rc == -EINTR )
3143 rc = -EAGAIN;
3145 else if ( get_page_type(page, PGT_writable_page) )
3147 perfc_incr(writable_mmu_updates);
3148 okay = paging_write_guest_entry(
3149 v, va, req.val, _mfn(mfn));
3150 put_page_type(page);
3153 unmap_domain_page_with_cache(va, &mapcache);
3154 put_page(page);
3155 break;
3157 case MMU_MACHPHYS_UPDATE:
3159 mfn = req.ptr >> PAGE_SHIFT;
3160 gpfn = req.val;
3162 rc = xsm_mmu_machphys_update(d, mfn);
3163 if ( rc )
3164 break;
3166 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
3168 MEM_LOG("Could not get page for mach->phys update");
3169 break;
3172 if ( unlikely(paging_mode_translate(FOREIGNDOM)) )
3174 MEM_LOG("Mach-phys update on auto-translate guest");
3175 break;
3178 set_gpfn_from_mfn(mfn, gpfn);
3179 okay = 1;
3181 paging_mark_dirty(FOREIGNDOM, mfn);
3183 put_page(mfn_to_page(mfn));
3184 break;
3186 default:
3187 MEM_LOG("Invalid page update command %x", cmd);
3188 rc = -ENOSYS;
3189 okay = 0;
3190 break;
3193 if ( unlikely(!okay) )
3195 rc = rc ? rc : -EINVAL;
3196 break;
3199 guest_handle_add_offset(ureqs, 1);
3202 if ( rc == -EAGAIN )
3203 rc = hypercall_create_continuation(
3204 __HYPERVISOR_mmu_update, "hihi",
3205 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
3207 process_deferred_ops();
3209 domain_mmap_cache_destroy(&mapcache);
3211 perfc_add(num_page_updates, i);
3213 out:
3214 if ( pt_owner && (pt_owner != d) )
3215 rcu_unlock_domain(pt_owner);
3217 /* Add incremental work we have done to the @done output parameter. */
3218 if ( unlikely(!guest_handle_is_null(pdone)) )
3220 done += i;
3221 copy_to_guest(pdone, &done, 1);
3224 return rc;
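/*
 * Foreign-pagetable encoding used above: the upper 16 bits of
 * 'foreigndom' select the pagetable owner (PFD) as domain_id + 1, so a
 * value of 0 keeps the historical meaning of "current domain"; the lower
 * 16 bits are still handed to set_foreigndom() as before. Hedged
 * caller-side sketch (assumes the usual HYPERVISOR_mmu_update wrapper;
 * domain 7 and pte_machine_addr are placeholders):
 *
 *     struct mmu_update req;
 *     req.ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE;
 *     req.val = new_pte_value;
 *     HYPERVISOR_mmu_update(&req, 1, NULL, ((7 + 1) << 16) | DOMID_SELF);
 */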
3228 static int create_grant_pte_mapping(
3229 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
3231 int rc = GNTST_okay;
3232 void *va;
3233 unsigned long gmfn, mfn;
3234 struct page_info *page;
3235 l1_pgentry_t ol1e;
3236 struct domain *d = v->domain;
3238 ASSERT(domain_is_locked(d));
3240 adjust_guest_l1e(nl1e, d);
3242 gmfn = pte_addr >> PAGE_SHIFT;
3243 mfn = gmfn_to_mfn(d, gmfn);
3245 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
3247 MEM_LOG("Could not get page for normal update");
3248 return GNTST_general_error;
3251 va = map_domain_page(mfn);
3252 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
3253 page = mfn_to_page(mfn);
3255 if ( !page_lock(page) )
3257 rc = GNTST_general_error;
3258 goto failed;
3261 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3263 page_unlock(page);
3264 rc = GNTST_general_error;
3265 goto failed;
3268 ol1e = *(l1_pgentry_t *)va;
3269 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) )
3271 page_unlock(page);
3272 rc = GNTST_general_error;
3273 goto failed;
3276 page_unlock(page);
3278 if ( !paging_mode_refcounts(d) )
3279 put_page_from_l1e(ol1e, d);
3281 failed:
3282 unmap_domain_page(va);
3283 put_page(page);
3285 return rc;
3288 static int destroy_grant_pte_mapping(
3289 uint64_t addr, unsigned long frame, struct domain *d)
3291 int rc = GNTST_okay;
3292 void *va;
3293 unsigned long gmfn, mfn;
3294 struct page_info *page;
3295 l1_pgentry_t ol1e;
3297 gmfn = addr >> PAGE_SHIFT;
3298 mfn = gmfn_to_mfn(d, gmfn);
3300 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
3302 MEM_LOG("Could not get page for normal update");
3303 return GNTST_general_error;
3306 va = map_domain_page(mfn);
3307 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
3308 page = mfn_to_page(mfn);
3310 if ( !page_lock(page) )
3312 rc = GNTST_general_error;
3313 goto failed;
3316 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3318 page_unlock(page);
3319 rc = GNTST_general_error;
3320 goto failed;
3323 ol1e = *(l1_pgentry_t *)va;
3325 /* Check that the virtual address supplied is actually mapped to frame. */
3326 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
3328 page_unlock(page);
3329 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
3330 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
3331 rc = GNTST_general_error;
3332 goto failed;
3335 /* Delete pagetable entry. */
3336 if ( unlikely(!UPDATE_ENTRY
3337 (l1,
3338 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
3339 d->vcpu[0] /* Change if we go to per-vcpu shadows. */,
3340 0)) )
3342 page_unlock(page);
3343 MEM_LOG("Cannot delete PTE entry at %p", va);
3344 rc = GNTST_general_error;
3345 goto failed;
3348 page_unlock(page);
3350 failed:
3351 unmap_domain_page(va);
3352 put_page(page);
3353 return rc;
3357 static int create_grant_va_mapping(
3358 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
3360 l1_pgentry_t *pl1e, ol1e;
3361 struct domain *d = v->domain;
3362 unsigned long gl1mfn;
3363 struct page_info *l1pg;
3364 int okay;
3366 ASSERT(domain_is_locked(d));
3368 adjust_guest_l1e(nl1e, d);
3370 pl1e = guest_map_l1e(v, va, &gl1mfn);
3371 if ( !pl1e )
3373 MEM_LOG("Could not find L1 PTE for address %lx", va);
3374 return GNTST_general_error;
3377 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3379 guest_unmap_l1e(v, pl1e);
3380 return GNTST_general_error;
3383 l1pg = mfn_to_page(gl1mfn);
3384 if ( !page_lock(l1pg) )
3386 put_page(l1pg);
3387 guest_unmap_l1e(v, pl1e);
3388 return GNTST_general_error;
3391 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3393 page_unlock(l1pg);
3394 put_page(l1pg);
3395 guest_unmap_l1e(v, pl1e);
3396 return GNTST_general_error;
3399 ol1e = *pl1e;
3400 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0);
3402 page_unlock(l1pg);
3403 put_page(l1pg);
3404 guest_unmap_l1e(v, pl1e);
3406 if ( okay && !paging_mode_refcounts(d) )
3407 put_page_from_l1e(ol1e, d);
3409 return okay ? GNTST_okay : GNTST_general_error;
3412 static int replace_grant_va_mapping(
3413 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
3415 l1_pgentry_t *pl1e, ol1e;
3416 unsigned long gl1mfn;
3417 struct page_info *l1pg;
3418 int rc = 0;
3420 pl1e = guest_map_l1e(v, addr, &gl1mfn);
3421 if ( !pl1e )
3423 MEM_LOG("Could not find L1 PTE for address %lx", addr);
3424 return GNTST_general_error;
3427 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3429 rc = GNTST_general_error;
3430 goto out;
3433 l1pg = mfn_to_page(gl1mfn);
3434 if ( !page_lock(l1pg) )
3436 rc = GNTST_general_error;
3437 put_page(l1pg);
3438 goto out;
3441 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3443 rc = GNTST_general_error;
3444 goto unlock_and_out;
3447 ol1e = *pl1e;
3449 /* Check that the virtual address supplied is actually mapped to frame. */
3450 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
3452 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
3453 l1e_get_pfn(ol1e), addr, frame);
3454 rc = GNTST_general_error;
3455 goto unlock_and_out;
3458 /* Delete pagetable entry. */
3459 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0)) )
3461 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3462 rc = GNTST_general_error;
3463 goto unlock_and_out;
3466 unlock_and_out:
3467 page_unlock(l1pg);
3468 put_page(l1pg);
3469 out:
3470 guest_unmap_l1e(v, pl1e);
3471 return rc;
3474 static int destroy_grant_va_mapping(
3475 unsigned long addr, unsigned long frame, struct vcpu *v)
3477 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
3480 int create_grant_host_mapping(uint64_t addr, unsigned long frame,
3481 unsigned int flags, unsigned int cache_flags)
3483 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
3485 if ( (flags & GNTMAP_application_map) )
3486 l1e_add_flags(pte, _PAGE_USER);
3487 if ( !(flags & GNTMAP_readonly) )
3488 l1e_add_flags(pte, _PAGE_RW);
3490 l1e_add_flags(pte,
3491 ((flags >> _GNTMAP_guest_avail0) * _PAGE_AVAIL0)
3492 & _PAGE_AVAIL);
3494 l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5));
3496 if ( flags & GNTMAP_contains_pte )
3497 return create_grant_pte_mapping(addr, pte, current);
3498 return create_grant_va_mapping(addr, pte, current);
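/*
 * Summary of the flag composition above (sketch, same information as the
 * code): starting from GRANT_PTE_FLAGS, the grant flags translate into
 * extra PTE bits --
 *
 *     pte flags = GRANT_PTE_FLAGS
 *                 | (flags & GNTMAP_application_map ? _PAGE_USER : 0)
 *                 | (flags & GNTMAP_readonly        ? 0 : _PAGE_RW)
 *                 | avail bits taken from _GNTMAP_guest_avail0
 *                 | cache attributes taken from cache_flags;
 *
 * GNTMAP_contains_pte then only selects whether 'addr' is the machine
 * address of a PTE or a linear address in the guest.
 */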
3501 int replace_grant_host_mapping(
3502 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
3504 struct vcpu *curr = current;
3505 l1_pgentry_t *pl1e, ol1e;
3506 unsigned long gl1mfn;
3507 struct page_info *l1pg;
3508 int rc;
3510 if ( flags & GNTMAP_contains_pte )
3512 if ( !new_addr )
3513 return destroy_grant_pte_mapping(addr, frame, curr->domain);
3515 MEM_LOG("Unsupported grant table operation");
3516 return GNTST_general_error;
3519 if ( !new_addr )
3520 return destroy_grant_va_mapping(addr, frame, curr);
3522 pl1e = guest_map_l1e(curr, new_addr, &gl1mfn);
3523 if ( !pl1e )
3525 MEM_LOG("Could not find L1 PTE for address %lx",
3526 (unsigned long)new_addr);
3527 return GNTST_general_error;
3530 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3532 guest_unmap_l1e(curr, pl1e);
3533 return GNTST_general_error;
3536 l1pg = mfn_to_page(gl1mfn);
3537 if ( !page_lock(l1pg) )
3539 put_page(l1pg);
3540 guest_unmap_l1e(curr, pl1e);
3541 return GNTST_general_error;
3544 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3546 page_unlock(l1pg);
3547 put_page(l1pg);
3548 guest_unmap_l1e(curr, pl1e);
3549 return GNTST_general_error;
3552 ol1e = *pl1e;
3554 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(),
3555 gl1mfn, curr, 0)) )
3557 page_unlock(l1pg);
3558 put_page(l1pg);
3559 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3560 guest_unmap_l1e(curr, pl1e);
3561 return GNTST_general_error;
3564 page_unlock(l1pg);
3565 put_page(l1pg);
3566 guest_unmap_l1e(curr, pl1e);
3568 rc = replace_grant_va_mapping(addr, frame, ol1e, curr);
3569 if ( rc && !paging_mode_refcounts(curr->domain) )
3570 put_page_from_l1e(ol1e, curr->domain);
3572 return rc;
3575 int donate_page(
3576 struct domain *d, struct page_info *page, unsigned int memflags)
3578 spin_lock(&d->page_alloc_lock);
3580 if ( is_xen_heap_page(page) || (page_get_owner(page) != NULL) )
3581 goto fail;
3583 if ( d->is_dying )
3584 goto fail;
3586 if ( page->count_info & ~(PGC_allocated | 1) )
3587 goto fail;
3589 if ( !(memflags & MEMF_no_refcount) )
3591 if ( d->tot_pages >= d->max_pages )
3592 goto fail;
3593 d->tot_pages++;
3596 page->count_info = PGC_allocated | 1;
3597 page_set_owner(page, d);
3598 page_list_add_tail(page,&d->page_list);
3600 spin_unlock(&d->page_alloc_lock);
3601 return 0;
3603 fail:
3604 spin_unlock(&d->page_alloc_lock);
3605 MEM_LOG("Bad donate %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info,
3606 (void *)page_to_mfn(page), d, d->domain_id,
3607 page_get_owner(page), page->count_info, page->u.inuse.type_info);
3608 return -1;
3611 int steal_page(
3612 struct domain *d, struct page_info *page, unsigned int memflags)
3614 unsigned long x, y;
3616 spin_lock(&d->page_alloc_lock);
3618 if ( is_xen_heap_page(page) || (page_get_owner(page) != d) )
3619 goto fail;
3621 /*
3622 * We require that there be just one reference (PGC_allocated). We temporarily
3623 * drop this reference now so that we can safely swizzle the owner.
3624 */
3625 y = page->count_info;
3626 do {
3627 x = y;
3628 if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
3629 goto fail;
3630 y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask);
3631 } while ( y != x );
3633 /* Swizzle the owner then reinstate the PGC_allocated reference. */
3634 page_set_owner(page, NULL);
3635 y = page->count_info;
3636 do {
3637 x = y;
3638 BUG_ON((x & (PGC_count_mask|PGC_allocated)) != PGC_allocated);
3639 } while ( (y = cmpxchg(&page->count_info, x, x | 1)) != x );
3641 /* Unlink from original owner. */
3642 if ( !(memflags & MEMF_no_refcount) )
3643 d->tot_pages--;
3644 page_list_del(page, &d->page_list);
3646 spin_unlock(&d->page_alloc_lock);
3647 return 0;
3649 fail:
3650 spin_unlock(&d->page_alloc_lock);
3651 MEM_LOG("Bad page %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info,
3652 (void *)page_to_mfn(page), d, d->domain_id,
3653 page_get_owner(page), page->count_info, page->u.inuse.type_info);
3654 return -1;
3657 int do_update_va_mapping(unsigned long va, u64 val64,
3658 unsigned long flags)
3660 l1_pgentry_t val = l1e_from_intpte(val64);
3661 struct vcpu *v = current;
3662 struct domain *d = v->domain;
3663 struct page_info *gl1pg;
3664 l1_pgentry_t *pl1e;
3665 unsigned long bmap_ptr, gl1mfn;
3666 cpumask_t pmask;
3667 int rc;
3669 perfc_incr(calls_to_update_va);
3671 rc = xsm_update_va_mapping(d, FOREIGNDOM, val);
3672 if ( rc )
3673 return rc;
3675 rc = -EINVAL;
3676 pl1e = guest_map_l1e(v, va, &gl1mfn);
3677 if ( unlikely(!pl1e || !get_page_from_pagenr(gl1mfn, d)) )
3678 goto out;
3680 gl1pg = mfn_to_page(gl1mfn);
3681 if ( !page_lock(gl1pg) )
3683 put_page(gl1pg);
3684 goto out;
3687 if ( (gl1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3689 page_unlock(gl1pg);
3690 put_page(gl1pg);
3691 goto out;
3694 rc = mod_l1_entry(pl1e, val, gl1mfn, 0, v) ? 0 : -EINVAL;
3696 page_unlock(gl1pg);
3697 put_page(gl1pg);
3699 out:
3700 if ( pl1e )
3701 guest_unmap_l1e(v, pl1e);
3703 switch ( flags & UVMF_FLUSHTYPE_MASK )
3705 case UVMF_TLB_FLUSH:
3706 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3708 case UVMF_LOCAL:
3709 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
3710 break;
3711 case UVMF_ALL:
3712 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
3713 break;
3714 default:
3715 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_ALL_TLBS )
3716 break;
3717 rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
3718 void),
3719 &pmask);
3720 if ( cpu_isset(smp_processor_id(), pmask) )
3721 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
3722 flush_tlb_mask(&pmask);
3723 break;
3725 break;
3727 case UVMF_INVLPG:
3728 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_ALL_TLBS )
3729 break;
3730 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3732 case UVMF_LOCAL:
3733 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_TLB )
3734 break;
3735 if ( !paging_mode_enabled(d) ||
3736 (paging_invlpg(v, va) != 0) )
3737 flush_tlb_one_local(va);
3738 break;
3739 case UVMF_ALL:
3740 flush_tlb_one_mask(&d->domain_dirty_cpumask, va);
3741 break;
3742 default:
3743 rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
3744 void),
3745 &pmask);
3746 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_TLB )
3747 cpu_clear(smp_processor_id(), pmask);
3748 flush_tlb_one_mask(&pmask, va);
3749 break;
3751 break;
3754 process_deferred_ops();
3756 return rc;
3759 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
3760 unsigned long flags,
3761 domid_t domid)
3763 int rc;
3765 if ( !set_foreigndom(domid) )
3766 return -ESRCH;
3768 rc = do_update_va_mapping(va, val64, flags);
3770 BUG_ON(this_cpu(percpu_mm_info).deferred_ops);
3771 process_deferred_ops(); /* only to clear foreigndom */
3773 return rc;
3778 /*************************
3779 * Descriptor Tables
3780 */
3782 void destroy_gdt(struct vcpu *v)
3784 int i;
3785 unsigned long pfn;
3787 v->arch.guest_context.gdt_ents = 0;
3788 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
3790 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
3791 put_page_and_type(mfn_to_page(pfn));
3792 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
3793 v->arch.guest_context.gdt_frames[i] = 0;
3798 long set_gdt(struct vcpu *v,
3799 unsigned long *frames,
3800 unsigned int entries)
3802 struct domain *d = v->domain;
3803 /* NB. There are 512 8-byte entries per GDT page. */
3804 int i, nr_pages = (entries + 511) / 512;
3805 unsigned long mfn;
3807 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3808 return -EINVAL;
3810 /* Check the pages in the new GDT. */
3811 for ( i = 0; i < nr_pages; i++ )
3813 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
3814 if ( !mfn_valid(mfn) ||
3815 !get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page) )
3816 goto fail;
3819 /* Tear down the old GDT. */
3820 destroy_gdt(v);
3822 /* Install the new GDT. */
3823 v->arch.guest_context.gdt_ents = entries;
3824 for ( i = 0; i < nr_pages; i++ )
3826 v->arch.guest_context.gdt_frames[i] = frames[i];
3827 l1e_write(&v->arch.perdomain_ptes[i],
3828 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
3831 return 0;
3833 fail:
3834 while ( i-- > 0 )
3835 put_page_and_type(mfn_to_page(frames[i]));
3836 return -EINVAL;
3840 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
3842 int nr_pages = (entries + 511) / 512;
3843 unsigned long frames[16];
3844 struct vcpu *curr = current;
3845 long ret;
3847 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
3848 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3849 return -EINVAL;
3851 if ( copy_from_guest(frames, frame_list, nr_pages) )
3852 return -EFAULT;
3854 domain_lock(curr->domain);
3856 if ( (ret = set_gdt(curr, frames, entries)) == 0 )
3857 flush_tlb_local();
3859 domain_unlock(curr->domain);
3861 return ret;
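/*
 * Sizing note: one 4kB frame holds 512 eight-byte descriptors, so
 * 'entries' GDT entries occupy (entries + 511) / 512 frames, capped at
 * FIRST_RESERVED_GDT_ENTRY. Hedged guest-side sketch (assumes the usual
 * HYPERVISOR_set_gdt wrapper; the guest must have dropped writable
 * mappings of the frame first):
 *
 *     unsigned long frames[1] = { gdt_gfn };
 *     if ( HYPERVISOR_set_gdt(frames, 512) )
 *         BUG();
 */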
3865 long do_update_descriptor(u64 pa, u64 desc)
3867 struct domain *dom = current->domain;
3868 unsigned long gmfn = pa >> PAGE_SHIFT;
3869 unsigned long mfn;
3870 unsigned int offset;
3871 struct desc_struct *gdt_pent, d;
3872 struct page_info *page;
3873 long ret = -EINVAL;
3875 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
3877 *(u64 *)&d = desc;
3879 mfn = gmfn_to_mfn(dom, gmfn);
3880 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
3881 !mfn_valid(mfn) ||
3882 !check_descriptor(dom, &d) )
3883 return -EINVAL;
3885 page = mfn_to_page(mfn);
3886 if ( unlikely(!get_page(page, dom)) )
3887 return -EINVAL;
3889 /* Check if the given frame is in use in an unsafe context. */
3890 switch ( page->u.inuse.type_info & PGT_type_mask )
3892 case PGT_seg_desc_page:
3893 if ( unlikely(!get_page_type(page, PGT_seg_desc_page)) )
3894 goto out;
3895 break;
3896 default:
3897 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
3898 goto out;
3899 break;
3902 paging_mark_dirty(dom, mfn);
3904 /* All is good so make the update. */
3905 gdt_pent = map_domain_page(mfn);
3906 atomic_write64((uint64_t *)&gdt_pent[offset], *(uint64_t *)&d);
3907 unmap_domain_page(gdt_pent);
3909 put_page_type(page);
3911 ret = 0; /* success */
3913 out:
3914 put_page(page);
3916 return ret;
3919 typedef struct e820entry e820entry_t;
3920 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
3922 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
3924 struct page_info *page = NULL;
3925 int rc;
3927 switch ( op )
3929 case XENMEM_add_to_physmap:
3931 struct xen_add_to_physmap xatp;
3932 unsigned long prev_mfn, mfn = 0, gpfn;
3933 struct domain *d;
3935 if ( copy_from_guest(&xatp, arg, 1) )
3936 return -EFAULT;
3938 rc = rcu_lock_target_domain_by_id(xatp.domid, &d);
3939 if ( rc != 0 )
3940 return rc;
3942 if ( xsm_add_to_physmap(current->domain, d) )
3944 rcu_unlock_domain(d);
3945 return -EPERM;
3948 switch ( xatp.space )
3950 case XENMAPSPACE_shared_info:
3951 if ( xatp.idx == 0 )
3952 mfn = virt_to_mfn(d->shared_info);
3953 break;
3954 case XENMAPSPACE_grant_table:
3955 spin_lock(&d->grant_table->lock);
3957 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
3958 (xatp.idx < max_nr_grant_frames) )
3959 gnttab_grow_table(d, xatp.idx + 1);
3961 if ( xatp.idx < nr_grant_frames(d->grant_table) )
3962 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
3964 spin_unlock(&d->grant_table->lock);
3965 break;
3966 case XENMAPSPACE_gmfn:
3967 xatp.idx = gmfn_to_mfn(d, xatp.idx);
3968 if ( !get_page_from_pagenr(xatp.idx, d) )
3969 break;
3970 mfn = xatp.idx;
3971 page = mfn_to_page(mfn);
3972 break;
3973 default:
3974 break;
3977 if ( !paging_mode_translate(d) || (mfn == 0) )
3979 if ( page )
3980 put_page(page);
3981 rcu_unlock_domain(d);
3982 return -EINVAL;
3985 domain_lock(d);
3987 /* Remove previously mapped page if it was present. */
3988 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3989 if ( mfn_valid(prev_mfn) )
3991 if ( is_xen_heap_mfn(prev_mfn) )
3992 /* Xen heap frames are simply unhooked from this phys slot. */
3993 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
3994 else
3995 /* Normal domain memory is freed, to avoid leaking memory. */
3996 guest_remove_page(d, xatp.gpfn);
3999 /* Unmap from old location, if any. */
4000 gpfn = get_gpfn_from_mfn(mfn);
4001 if ( gpfn != INVALID_M2P_ENTRY )
4002 guest_physmap_remove_page(d, gpfn, mfn, 0);
4004 /* Map at new location. */
4005 guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
4007 domain_unlock(d);
4009 if ( page )
4010 put_page(page);
4012 rcu_unlock_domain(d);
4014 break;
4017 case XENMEM_set_memory_map:
4019 struct xen_foreign_memory_map fmap;
4020 struct domain *d;
4021 int rc;
4023 if ( copy_from_guest(&fmap, arg, 1) )
4024 return -EFAULT;
4026 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
4027 return -EINVAL;
4029 rc = rcu_lock_target_domain_by_id(fmap.domid, &d);
4030 if ( rc != 0 )
4031 return rc;
4033 rc = xsm_domain_memory_map(d);
4034 if ( rc )
4036 rcu_unlock_domain(d);
4037 return rc;
4040 rc = copy_from_guest(d->arch.e820, fmap.map.buffer,
4041 fmap.map.nr_entries) ? -EFAULT : 0;
4042 d->arch.nr_e820 = fmap.map.nr_entries;
4044 rcu_unlock_domain(d);
4045 return rc;
4048 case XENMEM_memory_map:
4050 struct xen_memory_map map;
4051 struct domain *d = current->domain;
4053 /* Backwards compatibility. */
4054 if ( d->arch.nr_e820 == 0 )
4055 return -ENOSYS;
4057 if ( copy_from_guest(&map, arg, 1) )
4058 return -EFAULT;
4060 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
4061 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
4062 copy_to_guest(arg, &map, 1) )
4063 return -EFAULT;
4065 return 0;
4068 case XENMEM_machine_memory_map:
4070 struct xen_memory_map memmap;
4071 XEN_GUEST_HANDLE(e820entry_t) buffer;
4072 int count;
4073 int rc;
4075 if ( !IS_PRIV(current->domain) )
4076 return -EINVAL;
4078 rc = xsm_machine_memory_map();
4079 if ( rc )
4080 return rc;
4082 if ( copy_from_guest(&memmap, arg, 1) )
4083 return -EFAULT;
4084 if ( memmap.nr_entries < e820.nr_map + 1 )
4085 return -EINVAL;
4087 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
4089 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
4090 if ( copy_to_guest(buffer, e820.map, count) < 0 )
4091 return -EFAULT;
4093 memmap.nr_entries = count;
4095 if ( copy_to_guest(arg, &memmap, 1) )
4096 return -EFAULT;
4098 return 0;
4101 case XENMEM_machphys_mapping:
4103 static const struct xen_machphys_mapping mapping = {
4104 .v_start = MACH2PHYS_VIRT_START,
4105 .v_end = MACH2PHYS_VIRT_END,
4106 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
4107 };
4109 if ( copy_to_guest(arg, &mapping, 1) )
4110 return -EFAULT;
4112 return 0;
4115 case XENMEM_set_pod_target:
4116 case XENMEM_get_pod_target:
4118 xen_pod_target_t target;
4119 struct domain *d;
4121 /* Support DOMID_SELF? */
4122 if ( !IS_PRIV(current->domain) )
4123 return -EINVAL;
4125 if ( copy_from_guest(&target, arg, 1) )
4126 return -EFAULT;
4128 rc = rcu_lock_target_domain_by_id(target.domid, &d);
4129 if ( rc != 0 )
4130 return rc;
4132 if ( op == XENMEM_set_pod_target )
4134 if ( target.target_pages > d->max_pages )
4136 rc = -EINVAL;
4137 goto pod_target_out_unlock;
4140 rc = p2m_pod_set_mem_target(d, target.target_pages);
4143 target.tot_pages = d->tot_pages;
4144 target.pod_cache_pages = d->arch.p2m->pod.count;
4145 target.pod_entries = d->arch.p2m->pod.entry_count;
4147 if ( copy_to_guest(arg, &target, 1) )
4149 rc = -EFAULT;
4150 goto pod_target_out_unlock;
4153 pod_target_out_unlock:
4154 rcu_unlock_domain(d);
4155 return rc;
4158 default:
4159 return subarch_memory_op(op, arg);
4162 return 0;
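/*
 * Hedged caller-side sketch for the XENMEM_add_to_physmap case above
 * (assumes the usual HYPERVISOR_memory_op wrapper): a translated guest
 * maps its shared-info frame at a guest pfn of its choosing:
 *
 *     struct xen_add_to_physmap xatp;
 *     xatp.domid = DOMID_SELF;
 *     xatp.space = XENMAPSPACE_shared_info;
 *     xatp.idx   = 0;
 *     xatp.gpfn  = chosen_gpfn;
 *     if ( HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp) )
 *         BUG();
 */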
4166 /*************************
4167 * Writable Pagetables
4168 */
4170 struct ptwr_emulate_ctxt {
4171 struct x86_emulate_ctxt ctxt;
4172 unsigned long cr2;
4173 l1_pgentry_t pte;
4174 };
4176 static int ptwr_emulated_read(
4177 enum x86_segment seg,
4178 unsigned long offset,
4179 void *p_data,
4180 unsigned int bytes,
4181 struct x86_emulate_ctxt *ctxt)
4183 unsigned int rc;
4184 unsigned long addr = offset;
4186 if ( (rc = copy_from_user(p_data, (void *)addr, bytes)) != 0 )
4188 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
4189 return X86EMUL_EXCEPTION;
4192 return X86EMUL_OKAY;
4195 static int ptwr_emulated_update(
4196 unsigned long addr,
4197 paddr_t old,
4198 paddr_t val,
4199 unsigned int bytes,
4200 unsigned int do_cmpxchg,
4201 struct ptwr_emulate_ctxt *ptwr_ctxt)
4203 unsigned long mfn;
4204 unsigned long unaligned_addr = addr;
4205 struct page_info *page;
4206 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
4207 struct vcpu *v = current;
4208 struct domain *d = v->domain;
4210 /* Only allow naturally-aligned stores within the original %cr2 page. */
4211 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
4213 MEM_LOG("ptwr_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)",
4214 ptwr_ctxt->cr2, addr, bytes);
4215 return X86EMUL_UNHANDLEABLE;
4218 /* Turn a sub-word access into a full-word access. */
4219 if ( bytes != sizeof(paddr_t) )
4221 paddr_t full;
4222 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
4224 /* Align address; read full word. */
4225 addr &= ~(sizeof(paddr_t)-1);
4226 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
4228 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
4229 return X86EMUL_EXCEPTION;
4231 /* Mask out bits provided by caller. */
4232 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
4233 /* Shift the caller value and OR in the missing bits. */
4234 val &= (((paddr_t)1 << (bytes*8)) - 1);
4235 val <<= (offset)*8;
4236 val |= full;
4237 /* Also fill in missing parts of the cmpxchg old value. */
4238 old &= (((paddr_t)1 << (bytes*8)) - 1);
4239 old <<= (offset)*8;
4240 old |= full;
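/*
 * Worked example of the merge above (illustrative): a 4-byte store of
 * 0xAABBCCDD at byte offset 4 of an 8-byte PTE currently holding
 * 0x1111222233334444:
 *
 *     full  = 0x1111222233334444 & ~(0xffffffffULL << 32)
 *           = 0x0000000033334444
 *     val   = 0xAABBCCDDULL << 32 = 0xAABBCCDD00000000
 *     val  |= full                = 0xAABBCCDD33334444
 *
 * so the emulated update (and, for cmpxchg, the old value) always covers
 * the full, aligned PTE.
 */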
4243 pte = ptwr_ctxt->pte;
4244 mfn = l1e_get_pfn(pte);
4245 page = mfn_to_page(mfn);
4247 /* We are looking only for read-only mappings of p.t. pages. */
4248 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
4249 ASSERT(mfn_valid(mfn));
4250 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
4251 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
4252 ASSERT(page_get_owner(page) == d);
4254 /* Check the new PTE. */
4255 nl1e = l1e_from_intpte(val);
4256 if ( unlikely(!get_page_from_l1e(nl1e, d, d)) )
4258 if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
4259 !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
4261 /*
4262 * If this is an upper-half write to a PAE PTE then we assume that
4263 * the guest has simply got the two writes the wrong way round. We
4264 * zap the PRESENT bit on the assumption that the bottom half will
4265 * be written immediately after we return to the guest.
4266 */
4267 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
4268 l1e_get_intpte(nl1e));
4269 l1e_remove_flags(nl1e, _PAGE_PRESENT);
4271 else
4273 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
4274 return X86EMUL_UNHANDLEABLE;
4278 adjust_guest_l1e(nl1e, d);
4280 /* Checked successfully: do the update (write or cmpxchg). */
4281 pl1e = map_domain_page(mfn);
4282 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
4283 if ( do_cmpxchg )
4285 int okay;
4286 intpte_t t = old;
4287 ol1e = l1e_from_intpte(old);
4289 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
4290 &t, l1e_get_intpte(nl1e), _mfn(mfn));
4291 okay = (okay && t == old);
4293 if ( !okay )
4295 unmap_domain_page(pl1e);
4296 put_page_from_l1e(nl1e, d);
4297 return X86EMUL_CMPXCHG_FAILED;
4300 else
4302 ol1e = *pl1e;
4303 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, 0) )
4304 BUG();
4307 trace_ptwr_emulation(addr, nl1e);
4309 unmap_domain_page(pl1e);
4311 /* Finally, drop the old PTE. */
4312 put_page_from_l1e(ol1e, d);
4314 return X86EMUL_OKAY;
4317 static int ptwr_emulated_write(
4318 enum x86_segment seg,
4319 unsigned long offset,
4320 void *p_data,
4321 unsigned int bytes,
4322 struct x86_emulate_ctxt *ctxt)
4324 paddr_t val = 0;
4326 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
4328 MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)",
4329 offset, bytes);
4330 return X86EMUL_UNHANDLEABLE;
4333 memcpy(&val, p_data, bytes);
4335 return ptwr_emulated_update(
4336 offset, 0, val, bytes, 0,
4337 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
4340 static int ptwr_emulated_cmpxchg(
4341 enum x86_segment seg,
4342 unsigned long offset,
4343 void *p_old,
4344 void *p_new,
4345 unsigned int bytes,
4346 struct x86_emulate_ctxt *ctxt)
4348 paddr_t old = 0, new = 0;
4350 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
4352 MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)",
4353 offset, bytes);
4354 return X86EMUL_UNHANDLEABLE;
4357 memcpy(&old, p_old, bytes);
4358 memcpy(&new, p_new, bytes);
4360 return ptwr_emulated_update(
4361 offset, old, new, bytes, 1,
4362 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
4365 static struct x86_emulate_ops ptwr_emulate_ops = {
4366 .read = ptwr_emulated_read,
4367 .insn_fetch = ptwr_emulated_read,
4368 .write = ptwr_emulated_write,
4369 .cmpxchg = ptwr_emulated_cmpxchg,
4370 };
4372 /* Write page fault handler: check if guest is trying to modify a PTE. */
4373 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
4374 struct cpu_user_regs *regs)
4376 struct domain *d = v->domain;
4377 struct page_info *page;
4378 l1_pgentry_t pte;
4379 struct ptwr_emulate_ctxt ptwr_ctxt;
4380 int rc;
4382 /* Attempt to read the PTE that maps the VA being accessed. */
4383 guest_get_eff_l1e(v, addr, &pte);
4385 /* We are looking only for read-only mappings of p.t. pages. */
4386 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
4387 !get_page_from_pagenr(l1e_get_pfn(pte), d) )
4388 goto bail;
4390 page = l1e_get_page(pte);
4391 if ( !page_lock(page) )
4392 {
4393 put_page(page);
4394 goto bail;
4395 }
4397 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
4398 {
4399 page_unlock(page);
4400 put_page(page);
4401 goto bail;
4402 }
4404 ptwr_ctxt.ctxt.regs = regs;
4405 ptwr_ctxt.ctxt.force_writeback = 0;
4406 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
4407 is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
4408 ptwr_ctxt.cr2 = addr;
4409 ptwr_ctxt.pte = pte;
4411 rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
4413 page_unlock(page);
4414 put_page(page);
4416 if ( rc == X86EMUL_UNHANDLEABLE )
4417 goto bail;
4419 perfc_incr(ptwr_emulations);
4420 return EXCRET_fault_fixed;
4422 bail:
4423 return 0;
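/*
 * Condensed restatement of the qualifying checks above, for illustration
 * only (the real path also takes a page reference, the page lock, and an
 * ownership check via get_page_from_pagenr()): a write fault is emulated
 * here only if the VA is mapped present but read-only and the frame
 * behind it is currently typed as an L1 page table.
 */
static inline int ptwr_candidate_example(l1_pgentry_t pte,
                                         const struct page_info *page)
{
    return ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) == _PAGE_PRESENT) &&
           ((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
}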
4426 void free_xen_pagetable(void *v)
4428 extern int early_boot;
4430 if ( early_boot )
4431 return;
4433 if ( is_xen_heap_page(virt_to_page(v)) )
4434 free_xenheap_page(v);
4435 else
4436 free_domheap_page(virt_to_page(v));
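/*
 * Illustrative pairing, never called: Xen page-table pages come from
 * alloc_xen_pagetable() and are returned through free_xen_pagetable(),
 * which picks free_xenheap_page() or free_domheap_page() depending on
 * which heap backed the allocation.  Early-boot allocations are simply
 * never freed, hence the early_boot bail-out above.
 */
static inline void xen_pagetable_alloc_free_example(void)
{
    void *pg = alloc_xen_pagetable();

    if ( pg != NULL )
    {
        clear_page(pg);  /* callers normally clear a fresh table page */
        free_xen_pagetable(pg);
    }
}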
4439 /* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
4440 #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f))
4441 #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))
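/*
 * Worked example for the two macros above (illustration only): bit 7 is
 * PAT in an L1 entry but PSE in an L2/L3 entry, so flags expressed in L1
 * form gain _PAGE_PSE when used for a superpage mapping and lose it again
 * when the mapping is shattered back to 4k.  Non-present flag words pass
 * through unchanged.
 */
static inline void lNf_conversion_example(void)
{
    unsigned int l1_flags = __PAGE_HYPERVISOR;    /* present, RW, A, D    */
    unsigned int l2_flags = l1f_to_lNf(l1_flags); /* same, plus _PAGE_PSE */

    ASSERT(l2_flags & _PAGE_PSE);
    ASSERT(lNf_to_l1f(l2_flags) == l1_flags);
}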
4443 /*
4444 * map_pages_to_xen() can be called with interrupts disabled:
4445 * * During early bootstrap; or
4446 * From alloc_xenheap_pages() via memguard_guard_range().
4447 * In these cases it is safe to use flush_area_local():
4448 * * Because only the local CPU is online; or
4449 * * Because stale TLB entries do not matter for memguard_[un]guard_range().
4450 */
4451 #define flush_area(v,f) (!local_irq_is_enabled() ? \
4452 flush_area_local((const void *)v, f) : \
4453 flush_area_all((const void *)v, f))
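/*
 * Sketch of how the callers below assemble their flush requests
 * (illustration only; 'some_virt' is a placeholder): the order says how
 * many 4k pages the span covers, and FLUSH_TLB_GLOBAL / FLUSH_CACHE are
 * OR-ed in only when a global mapping or a cacheability change was
 * involved.
 */
static inline void flush_area_usage_example(unsigned long some_virt,
                                            int was_global, int cache_change)
{
    unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);

    if ( was_global )
        flush_flags |= FLUSH_TLB_GLOBAL;
    if ( cache_change )
        flush_flags |= FLUSH_CACHE;

    flush_area(some_virt, flush_flags);
}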
4455 int map_pages_to_xen(
4456 unsigned long virt,
4457 unsigned long mfn,
4458 unsigned long nr_mfns,
4459 unsigned int flags)
4461 l2_pgentry_t *pl2e, ol2e;
4462 l1_pgentry_t *pl1e, ol1e;
4463 unsigned int i;
4465 while ( nr_mfns != 0 )
4467 #ifdef __x86_64__
4468 l3_pgentry_t *pl3e = virt_to_xen_l3e(virt);
4469 l3_pgentry_t ol3e = *pl3e;
4471 if ( cpu_has_page1gb &&
4472 !(((virt >> PAGE_SHIFT) | mfn) &
4473 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) &&
4474 nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) &&
4475 !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) )
4477 /* 1GB-page mapping. */
4478 l3e_write_atomic(pl3e, l3e_from_pfn(mfn, l1f_to_lNf(flags)));
4480 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) )
4482 unsigned int flush_flags =
4483 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4485 if ( l3e_get_flags(ol3e) & _PAGE_PSE )
4487 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4488 flush_flags |= FLUSH_TLB_GLOBAL;
4489 if ( (lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
4490 PAGE_CACHE_ATTRS )
4491 flush_flags |= FLUSH_CACHE;
4492 flush_area(virt, flush_flags);
4494 else
4496 pl2e = l3e_to_l2e(ol3e);
4497 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4499 ol2e = pl2e[i];
4500 if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4501 continue;
4502 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4504 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
4505 flush_flags |= FLUSH_TLB_GLOBAL;
4506 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
4507 PAGE_CACHE_ATTRS )
4508 flush_flags |= FLUSH_CACHE;
4510 else
4512 unsigned int j;
4514 pl1e = l2e_to_l1e(ol2e);
4515 for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ )
4517 ol1e = pl1e[j];
4518 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
4519 flush_flags |= FLUSH_TLB_GLOBAL;
4520 if ( (l1e_get_flags(ol1e) ^ flags) &
4521 PAGE_CACHE_ATTRS )
4522 flush_flags |= FLUSH_CACHE;
4526 flush_area(virt, flush_flags);
4527 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4529 ol2e = pl2e[i];
4530 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) &&
4531 !(l2e_get_flags(ol2e) & _PAGE_PSE) )
4532 free_xen_pagetable(l2e_to_l1e(ol2e));
4534 free_xen_pagetable(pl2e);
4538 virt += 1UL << L3_PAGETABLE_SHIFT;
4539 mfn += 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4540 nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4541 continue;
4544 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) &&
4545 (l3e_get_flags(ol3e) & _PAGE_PSE) )
4547 unsigned int flush_flags =
4548 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4550 /* Skip this PTE if there is no change. */
4551 if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES *
4552 L1_PAGETABLE_ENTRIES - 1)) +
4553 (l2_table_offset(virt) << PAGETABLE_ORDER) +
4554 l1_table_offset(virt) == mfn) &&
4555 ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
4556 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 )
4558 /* We can skip to end of L3 superpage if we got a match. */
4559 i = (1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4560 (mfn & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4561 if ( i > nr_mfns )
4562 i = nr_mfns;
4563 virt += i << PAGE_SHIFT;
4564 mfn += i;
4565 nr_mfns -= i;
4566 continue;
4569 pl2e = alloc_xen_pagetable();
4570 if ( pl2e == NULL )
4571 return -ENOMEM;
4573 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4574 l2e_write(pl2e + i,
4575 l2e_from_pfn(l3e_get_pfn(ol3e) +
4576 (i << PAGETABLE_ORDER),
4577 l3e_get_flags(ol3e)));
4579 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4580 flush_flags |= FLUSH_TLB_GLOBAL;
4582 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4583 __PAGE_HYPERVISOR));
4584 flush_area(virt, flush_flags);
4586 #endif
4588 pl2e = virt_to_xen_l2e(virt);
4590 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
4591 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
4592 !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
4594 /* Super-page mapping. */
4595 ol2e = *pl2e;
4596 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_lNf(flags)));
4598 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4600 unsigned int flush_flags =
4601 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4603 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4605 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
4606 flush_flags |= FLUSH_TLB_GLOBAL;
4607 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
4608 PAGE_CACHE_ATTRS )
4609 flush_flags |= FLUSH_CACHE;
4610 flush_area(virt, flush_flags);
4612 else
4614 pl1e = l2e_to_l1e(ol2e);
4615 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4617 if ( l1e_get_flags(pl1e[i]) & _PAGE_GLOBAL )
4618 flush_flags |= FLUSH_TLB_GLOBAL;
4619 if ( (l1e_get_flags(pl1e[i]) ^ flags) &
4620 PAGE_CACHE_ATTRS )
4621 flush_flags |= FLUSH_CACHE;
4623 flush_area(virt, flush_flags);
4624 free_xen_pagetable(pl1e);
4628 virt += 1UL << L2_PAGETABLE_SHIFT;
4629 mfn += 1UL << PAGETABLE_ORDER;
4630 nr_mfns -= 1UL << PAGETABLE_ORDER;
4632 else
4634 /* Normal page mapping. */
4635 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4637 pl1e = alloc_xen_pagetable();
4638 if ( pl1e == NULL )
4639 return -ENOMEM;
4640 clear_page(pl1e);
4641 l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4642 __PAGE_HYPERVISOR));
4644 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4646 unsigned int flush_flags =
4647 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4649 /* Skip this PTE if there is no change. */
4650 if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
4651 l1_table_offset(virt)) == mfn) &&
4652 (((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
4653 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
4655 /* We can skip to end of L2 superpage if we got a match. */
4656 i = (1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4657 (mfn & ((1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4658 if ( i > nr_mfns )
4659 i = nr_mfns;
4660 virt += i << L1_PAGETABLE_SHIFT;
4661 mfn += i;
4662 nr_mfns -= i;
4663 goto check_l3;
4666 pl1e = alloc_xen_pagetable();
4667 if ( pl1e == NULL )
4668 return -ENOMEM;
4670 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4671 l1e_write(&pl1e[i],
4672 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4673 lNf_to_l1f(l2e_get_flags(*pl2e))));
4675 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
4676 flush_flags |= FLUSH_TLB_GLOBAL;
4678 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4679 __PAGE_HYPERVISOR));
4680 flush_area(virt, flush_flags);
4683 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
4684 ol1e = *pl1e;
4685 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
4686 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
4688 unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
4689 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
4690 flush_flags |= FLUSH_TLB_GLOBAL;
4691 if ( (l1e_get_flags(ol1e) ^ flags) & PAGE_CACHE_ATTRS )
4692 flush_flags |= FLUSH_CACHE;
4693 flush_area(virt, flush_flags);
4696 virt += 1UL << L1_PAGETABLE_SHIFT;
4697 mfn += 1UL;
4698 nr_mfns -= 1UL;
4700 if ( (flags == PAGE_HYPERVISOR) &&
4701 ((nr_mfns == 0) ||
4702 ((((virt >> PAGE_SHIFT) | mfn) &
4703 ((1 << PAGETABLE_ORDER) - 1)) == 0)) )
4705 unsigned long base_mfn;
4706 pl1e = l2e_to_l1e(*pl2e);
4707 base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
4708 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
4709 if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
4710 (l1e_get_flags(*pl1e) != flags) )
4711 break;
4712 if ( i == L1_PAGETABLE_ENTRIES )
4714 ol2e = *pl2e;
4715 l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
4716 l1f_to_lNf(flags)));
4717 flush_area(virt - PAGE_SIZE,
4718 FLUSH_TLB_GLOBAL |
4719 FLUSH_ORDER(PAGETABLE_ORDER));
4720 free_xen_pagetable(l2e_to_l1e(ol2e));
4725 check_l3: ;
4726 #ifdef __x86_64__
4727 if ( cpu_has_page1gb &&
4728 (flags == PAGE_HYPERVISOR) &&
4729 ((nr_mfns == 0) ||
4730 !(((virt >> PAGE_SHIFT) | mfn) &
4731 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) )
4733 unsigned long base_mfn;
4735 ol3e = *pl3e;
4736 pl2e = l3e_to_l2e(ol3e);
4737 base_mfn = l2e_get_pfn(*pl2e) & ~(L2_PAGETABLE_ENTRIES *
4738 L1_PAGETABLE_ENTRIES - 1);
4739 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
4740 if ( (l2e_get_pfn(*pl2e) !=
4741 (base_mfn + (i << PAGETABLE_ORDER))) ||
4742 (l2e_get_flags(*pl2e) != l1f_to_lNf(flags)) )
4743 break;
4744 if ( i == L2_PAGETABLE_ENTRIES )
4746 l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn,
4747 l1f_to_lNf(flags)));
4748 flush_area(virt - PAGE_SIZE,
4749 FLUSH_TLB_GLOBAL |
4750 FLUSH_ORDER(2*PAGETABLE_ORDER));
4751 free_xen_pagetable(l3e_to_l2e(ol3e));
4754 #endif
4757 return 0;
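/*
 * Usage sketch (the virtual address and MFNs are hypothetical): map four
 * frames read/write for Xen, then map four more with MAP_SMALL_PAGES so
 * the superpage paths above are skipped and the range can later be
 * re-permissioned one page at a time, as the memguard code below relies
 * on.
 */
static inline int map_pages_to_xen_usage_example(unsigned long va,
                                                 unsigned long mfn)
{
    int rc = map_pages_to_xen(va, mfn, 4, PAGE_HYPERVISOR);

    if ( rc == 0 )
        rc = map_pages_to_xen(va + (4UL << PAGE_SHIFT), mfn + 4, 4,
                              __PAGE_HYPERVISOR | MAP_SMALL_PAGES);
    return rc;
}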
4760 void destroy_xen_mappings(unsigned long s, unsigned long e)
4762 l2_pgentry_t *pl2e;
4763 l1_pgentry_t *pl1e;
4764 unsigned int i;
4765 unsigned long v = s;
4767 ASSERT((s & ~PAGE_MASK) == 0);
4768 ASSERT((e & ~PAGE_MASK) == 0);
4770 while ( v < e )
4772 #ifdef __x86_64__
4773 l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
4775 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
4777 v += 1UL << L3_PAGETABLE_SHIFT;
4778 v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
4779 continue;
4782 if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
4784 if ( l2_table_offset(v) == 0 &&
4785 l1_table_offset(v) == 0 &&
4786 ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) )
4788 /* PAGE1GB: whole superpage is destroyed. */
4789 l3e_write_atomic(pl3e, l3e_empty());
4790 v += 1UL << L3_PAGETABLE_SHIFT;
4791 continue;
4794 /* PAGE1GB: shatter the superpage and fall through. */
4795 pl2e = alloc_xen_pagetable();
4796 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4797 l2e_write(pl2e + i,
4798 l2e_from_pfn(l3e_get_pfn(*pl3e) +
4799 (i << PAGETABLE_ORDER),
4800 l3e_get_flags(*pl3e)));
4801 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4802 __PAGE_HYPERVISOR));
4804 #endif
4806 pl2e = virt_to_xen_l2e(v);
4808 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4810 v += 1UL << L2_PAGETABLE_SHIFT;
4811 v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
4812 continue;
4815 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4817 if ( (l1_table_offset(v) == 0) &&
4818 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
4820 /* PSE: whole superpage is destroyed. */
4821 l2e_write_atomic(pl2e, l2e_empty());
4822 v += 1UL << L2_PAGETABLE_SHIFT;
4824 else
4826 /* PSE: shatter the superpage and try again. */
4827 pl1e = alloc_xen_pagetable();
4828 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4829 l1e_write(&pl1e[i],
4830 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4831 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
4832 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4833 __PAGE_HYPERVISOR));
4836 else
4838 /* Ordinary 4kB mapping. */
4839 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
4840 l1e_write_atomic(pl1e, l1e_empty());
4841 v += PAGE_SIZE;
4843 /* If we are done with the L2E, check if it is now empty. */
4844 if ( (v != e) && (l1_table_offset(v) != 0) )
4845 continue;
4846 pl1e = l2e_to_l1e(*pl2e);
4847 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4848 if ( l1e_get_intpte(pl1e[i]) != 0 )
4849 break;
4850 if ( i == L1_PAGETABLE_ENTRIES )
4852 /* Empty: zap the L2E and free the L1 page. */
4853 l2e_write_atomic(pl2e, l2e_empty());
4854 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4855 free_xen_pagetable(pl1e);
4859 #ifdef __x86_64__
4860 /* If we are done with the L3E, check if it is now empty. */
4861 if ( (v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0) )
4862 continue;
4863 pl2e = l3e_to_l2e(*pl3e);
4864 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4865 if ( l2e_get_intpte(pl2e[i]) != 0 )
4866 break;
4867 if ( i == L2_PAGETABLE_ENTRIES )
4869 /* Empty: zap the L3E and free the L2 page. */
4870 l3e_write_atomic(pl3e, l3e_empty());
4871 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4872 free_xen_pagetable(pl2e);
4874 #endif
4877 flush_area(NULL, FLUSH_TLB_GLOBAL);
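/*
 * Usage sketch (range is hypothetical): destroy_xen_mappings() takes a
 * page-aligned [start, end) virtual range and, as above, shatters any
 * superpage that straddles a boundary before zapping entries and freeing
 * emptied intermediate tables.
 */
static inline void unmap_xen_range_example(unsigned long va,
                                           unsigned long nr_pages)
{
    destroy_xen_mappings(va, va + (nr_pages << PAGE_SHIFT));
}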
4880 void __set_fixmap(
4881 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
4883 BUG_ON(idx >= __end_of_fixed_addresses);
4884 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
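/*
 * Usage sketch: a fixmap slot provides a compile-time virtual address for
 * a single frame, so mapping it is just the one-page map_pages_to_xen()
 * call above.  'idx' and 'mfn' are placeholders, and the use of
 * PAGE_HYPERVISOR_NOCACHE (the usual choice for device registers) is an
 * assumption, not something this file defines.
 */
static inline void *fixmap_usage_example(enum fixed_addresses idx,
                                         unsigned long mfn)
{
    __set_fixmap(idx, mfn, PAGE_HYPERVISOR_NOCACHE);
    return (void *)fix_to_virt(idx);
}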
4887 #ifdef MEMORY_GUARD
4889 void memguard_init(void)
4891 unsigned long start = max_t(unsigned long, xen_phys_start, 1UL << 20);
4892 #ifdef __i386__
4893 map_pages_to_xen(
4894 (unsigned long)__va(start),
4895 start >> PAGE_SHIFT,
4896 (xenheap_phys_end - start) >> PAGE_SHIFT,
4897 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4898 #else
4899 map_pages_to_xen(
4900 (unsigned long)__va(start),
4901 start >> PAGE_SHIFT,
4902 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
4903 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4904 BUG_ON(start != xen_phys_start);
4905 map_pages_to_xen(
4906 XEN_VIRT_START,
4907 start >> PAGE_SHIFT,
4908 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
4909 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4910 #endif
4913 static void __memguard_change_range(void *p, unsigned long l, int guard)
4915 unsigned long _p = (unsigned long)p;
4916 unsigned long _l = (unsigned long)l;
4917 unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
4919 /* Ensure we are dealing with a page-aligned whole number of pages. */
4920 ASSERT((_p&~PAGE_MASK) == 0);
4921 ASSERT((_l&~PAGE_MASK) == 0);
4923 if ( guard )
4924 flags &= ~_PAGE_PRESENT;
4926 map_pages_to_xen(
4927 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
4930 void memguard_guard_range(void *p, unsigned long l)
4932 __memguard_change_range(p, l, 1);
4935 void memguard_unguard_range(void *p, unsigned long l)
4937 __memguard_change_range(p, l, 0);
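/*
 * Usage sketch (the guarded object is hypothetical): temporarily unmap a
 * page-aligned redzone so any stray access takes a page fault, then
 * restore the mapping.  Lengths must be whole pages, as the ASSERTs in
 * __memguard_change_range() require.
 */
static inline void memguard_usage_example(void *redzone)
{
    memguard_guard_range(redzone, PAGE_SIZE);
    /* ... any access to 'redzone' faults here ... */
    memguard_unguard_range(redzone, PAGE_SIZE);
}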
4940 #endif
4942 void memguard_guard_stack(void *p)
4944 BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
4945 p = (void *)((unsigned long)p + STACK_SIZE -
4946 PRIMARY_STACK_SIZE - PAGE_SIZE);
4947 memguard_guard_range(p, PAGE_SIZE);
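/*
 * Layout sketch for the guard placed above ('stack' is a placeholder for
 * a per-cpu stack base): the top PRIMARY_STACK_SIZE bytes of each
 * STACK_SIZE allocation form the primary stack, and the single page
 * immediately below it is unmapped so a primary-stack overflow faults
 * instead of silently running into whatever sits lower in the allocation.
 */
static inline void *stack_guard_page_example(void *stack)
{
    unsigned long base = (unsigned long)stack;

    BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
    return (void *)(base + STACK_SIZE - PRIMARY_STACK_SIZE - PAGE_SIZE);
}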
4950 /*
4951 * Local variables:
4952 * mode: C
4953 * c-set-style: "BSD"
4954 * c-basic-offset: 4
4955 * tab-width: 4
4956 * indent-tabs-mode: nil
4957 * End:
4958 */