Repository: debuggers.hg
File: xen/arch/x86/mm.c @ 16369:ff2edb1fd9f2

x86: Change cache attributes of Xen 1:1 page mappings in response to
guest mapping requests.
Based on a patch by Jan Beulich <jbeulich@novell.com>
Signed-off-by: Keir Fraser <keir@xensource.com>

Author:   Keir Fraser <keir@xensource.com>
Date:     Wed Nov 07 11:44:05 2007 +0000 (2007-11-07)
Parents:  05f257f4f3c7
Children: 5b8730c78454
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [a frame may,
42 * of course, be used in none of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OSes, which will generally use the WP bit to simplify copy-on-write
83 * implementations (in that case, the OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
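/*
 * Editor's note -- illustrative sketch, not part of this file.  The guest
 * side of the (ptr, val) interface described above is the mmu_update
 * hypercall: the guest batches requests into an array of struct mmu_update
 * and Xen validates and applies them in do_mmu_update().  The sketch below
 * assumes the usual PV-guest hypercall wrapper and public headers; the
 * variable names are placeholders.
 */
#if 0 /* example only, not compiled as part of this file */
static void example_update_one_pte(uint64_t pte_machine_addr,
                                   uint64_t new_pte_val)
{
    struct mmu_update req;
    int success_count = 0;

    /* Low bits of 'ptr' select the command; MMU_NORMAL_PT_UPDATE means "*ptr = val". */
    req.ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE;
    req.val = new_pte_val;

    if ( HYPERVISOR_mmu_update(&req, 1, &success_count, DOMID_SELF) != 0 ||
         success_count != 1 )
    {
        /* The update was rejected, e.g. by the refcounting rules above. */
    }
}
#endif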
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <asm/shared.h>
112 #include <public/memory.h>
113 #include <xsm/xsm.h>
114 #include <xen/trace.h>
116 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
118 /*
119 * PTE updates can be done with ordinary writes except:
120 * 1. Debug builds get extra checking by using CMPXCHG[8B].
121 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
122 */
123 #if !defined(NDEBUG) || defined(CONFIG_X86_PAE)
124 #define PTE_UPDATE_WITH_CMPXCHG
125 #endif
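/*
 * Editor's note -- illustrative sketch, not part of this file.  PAE builds
 * force PTE_UPDATE_WITH_CMPXCHG because a PAE PTE is 64 bits wide while the
 * build is 32-bit: a plain assignment would be two 32-bit stores, and the MMU
 * could observe a torn, half-written entry in between.  The real update path
 * is update_intpte() further down, which goes through
 * paging_cmpxchg_guest_entry(); the free-standing sketch below shows the same
 * idea with a GCC builtin (an assumption, used purely for illustration).
 */
#if 0 /* example only */
static inline int example_atomic_pte_update(volatile uint64_t *pte,
                                            uint64_t old, uint64_t new)
{
    /* Single 8-byte compare-and-exchange: fails if the entry changed
     * underneath us (e.g. the hardware set the Accessed/Dirty bits). */
    return __sync_bool_compare_and_swap(pte, old, new);
}
#endif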
127 /* Used to defer flushing of memory structures. */
128 struct percpu_mm_info {
129 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
130 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
131 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
132 unsigned int deferred_ops;
133 /* If non-NULL, specifies a foreign subject domain for some operations. */
134 struct domain *foreign;
135 };
136 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
138 /*
139 * Returns the current foreign domain; defaults to the currently-executing
140 * domain if a foreign override hasn't been specified.
141 */
142 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
144 /* Private domain structs for DOMID_XEN and DOMID_IO. */
145 static struct domain *dom_xen, *dom_io;
147 /* Frame table and its size in pages. */
148 struct page_info *frame_table;
149 unsigned long max_page;
150 unsigned long total_pages;
152 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
154 #define l1_disallow_mask(d) \
155 ((d != dom_io) && \
156 (rangeset_is_empty((d)->iomem_caps) && \
157 rangeset_is_empty((d)->arch.ioport_caps)) ? \
158 L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
160 #ifdef CONFIG_COMPAT
161 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
162 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
163 L3_DISALLOW_MASK : \
164 COMPAT_L3_DISALLOW_MASK)
165 #else
166 #define l3_disallow_mask(d) L3_DISALLOW_MASK
167 #endif
169 static void queue_deferred_ops(struct domain *d, unsigned int ops)
170 {
171 ASSERT(d == current->domain);
172 this_cpu(percpu_mm_info).deferred_ops |= ops;
173 }
175 void __init init_frametable(void)
176 {
177 unsigned long nr_pages, page_step, i, mfn;
179 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
181 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
182 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
184 for ( i = 0; i < nr_pages; i += page_step )
185 {
186 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
187 if ( mfn == 0 )
188 panic("Not enough memory for frame table\n");
189 map_pages_to_xen(
190 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
191 mfn, page_step, PAGE_HYPERVISOR);
192 }
194 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
195 }
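/*
 * Editor's note -- worked example, not part of this file.  Assuming
 * sizeof(struct page_info) == 32 bytes (an assumption; the exact size depends
 * on the build), a machine with 4GB of RAM has max_page == 0x100000 frames,
 * so:
 *
 *   nr_pages  = PFN_UP(0x100000 * 32)                   = 8192 pages (32MB)
 *   page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT = 512 frames (2MB)
 *
 * i.e. on builds where an L2 entry covers 2MB, the loop above maps the frame
 * table in 2MB chunks, performing 16 boot-page allocations in this example.
 */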
197 void __init arch_init_memory(void)
198 {
199 extern void subarch_init_memory(void);
201 unsigned long i, pfn, rstart_pfn, rend_pfn;
203 /*
204 * Initialise our DOMID_XEN domain.
205 * Any Xen-heap pages that we will allow to be mapped will have
206 * their domain field set to dom_xen.
207 */
208 dom_xen = alloc_domain(DOMID_XEN);
209 BUG_ON(dom_xen == NULL);
211 /*
212 * Initialise our DOMID_IO domain.
213 * This domain owns I/O pages that are within the range of the page_info
214 * array. Mappings occur at the privilege level of the caller.
215 */
216 dom_io = alloc_domain(DOMID_IO);
217 BUG_ON(dom_io == NULL);
219 /* First 1MB of RAM is historically marked as I/O. */
220 for ( i = 0; i < 0x100; i++ )
221 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
223 /* Any areas not specified as RAM by the e820 map are considered I/O. */
224 for ( i = 0, pfn = 0; pfn < max_page; i++ )
225 {
226 while ( (i < e820.nr_map) &&
227 (e820.map[i].type != E820_RAM) &&
228 (e820.map[i].type != E820_UNUSABLE) )
229 i++;
231 if ( i >= e820.nr_map )
232 {
233 /* No more RAM regions: mark as I/O right to end of memory map. */
234 rstart_pfn = rend_pfn = max_page;
235 }
236 else
237 {
238 /* Mark as I/O just up as far as next RAM region. */
239 rstart_pfn = min_t(unsigned long, max_page,
240 PFN_UP(e820.map[i].addr));
241 rend_pfn = max_t(unsigned long, rstart_pfn,
242 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
243 }
245 /* Mark as I/O up to next RAM region. */
246 for ( ; pfn < rstart_pfn; pfn++ )
247 {
248 BUG_ON(!mfn_valid(pfn));
249 share_xen_page_with_guest(
250 mfn_to_page(pfn), dom_io, XENSHARE_writable);
251 }
253 /* Skip the RAM region. */
254 pfn = rend_pfn;
255 }
257 subarch_init_memory();
258 }
260 int memory_is_conventional_ram(paddr_t p)
261 {
262 int i;
264 for ( i = 0; i < e820.nr_map; i++ )
265 {
266 if ( (e820.map[i].type == E820_RAM) &&
267 (e820.map[i].addr <= p) &&
268 ((e820.map[i].addr + e820.map[i].size) > p) )
269 return 1;
270 }
272 return 0;
273 }
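/*
 * Editor's note -- worked example, not part of this file.  The containment
 * test above asks whether p falls inside [addr, addr + size).  For an e820
 * entry { addr = 0x100000, size = 0x3ff00000, type = E820_RAM } (1MB..1GB,
 * values chosen for illustration):
 *
 *   p = 0x00200000: 0x100000 <= p and 0x40000000 > p      -> conventional RAM
 *   p = 0x40000000: 0x100000 <= p but 0x40000000 > p fails -> not RAM
 */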
275 unsigned long domain_get_maximum_gpfn(struct domain *d)
276 {
277 if ( is_hvm_domain(d) )
278 return d->arch.p2m.max_mapped_pfn;
279 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
280 return arch_get_max_pfn(d) - 1;
281 }
283 void share_xen_page_with_guest(
284 struct page_info *page, struct domain *d, int readonly)
285 {
286 if ( page_get_owner(page) == d )
287 return;
289 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
291 spin_lock(&d->page_alloc_lock);
293 /* The incremented type count pins as writable or read-only. */
294 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
295 page->u.inuse.type_info |= PGT_validated | 1;
297 page_set_owner(page, d);
298 wmb(); /* install valid domain ptr before updating refcnt. */
299 ASSERT(page->count_info == 0);
301 /* Only add to the allocation list if the domain isn't dying. */
302 if ( !d->is_dying )
303 {
304 page->count_info |= PGC_allocated | 1;
305 if ( unlikely(d->xenheap_pages++ == 0) )
306 get_knownalive_domain(d);
307 list_add_tail(&page->list, &d->xenpage_list);
308 }
310 spin_unlock(&d->page_alloc_lock);
311 }
313 void share_xen_page_with_privileged_guests(
314 struct page_info *page, int readonly)
315 {
316 share_xen_page_with_guest(page, dom_xen, readonly);
317 }
319 #if defined(CONFIG_X86_PAE)
321 #ifdef NDEBUG
322 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
323 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
324 #else
325 /*
326 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
327 * We cannot safely shadow the idle page table, nor shadow (v1) page tables
328 * (detected by lack of an owning domain). As required for correctness, we
329 * always shadow PDPTs above 4GB.
330 */
331 #define l3tab_needs_shadow(mfn) \
332 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
333 (page_get_owner(mfn_to_page(mfn)) != NULL) && \
334 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
335 ((mfn) >= 0x100000))
336 #endif
338 static l1_pgentry_t *fix_pae_highmem_pl1e;
340 /* Cache the address of PAE high-memory fixmap page tables. */
341 static int __init cache_pae_fixmap_address(void)
342 {
343 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
344 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
345 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
346 return 0;
347 }
348 __initcall(cache_pae_fixmap_address);
350 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
352 void make_cr3(struct vcpu *v, unsigned long mfn)
353 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
354 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
355 {
356 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
357 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
358 unsigned int cpu = smp_processor_id();
360 /* Fast path: does this mfn need a shadow at all? */
361 if ( !l3tab_needs_shadow(mfn) )
362 {
363 v->arch.cr3 = mfn << PAGE_SHIFT;
364 /* Cache is no longer in use or valid */
365 cache->high_mfn = 0;
366 return;
367 }
369 /* Caching logic is not interrupt safe. */
370 ASSERT(!in_irq());
372 /* Protects against pae_flush_pgd(). */
373 spin_lock(&cache->lock);
375 cache->inuse_idx ^= 1;
376 cache->high_mfn = mfn;
378 /* Map the guest L3 table and copy to the chosen low-memory cache. */
379 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
380 /* First check the previous high mapping can't be in the TLB.
381 * (i.e. have we loaded CR3 since we last did this?) */
382 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
383 flush_tlb_one_local(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
384 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
385 lowmem_l3tab = cache->table[cache->inuse_idx];
386 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
387 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
388 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
390 v->arch.cr3 = __pa(lowmem_l3tab);
392 spin_unlock(&cache->lock);
393 }
395 #else /* !CONFIG_X86_PAE */
397 void make_cr3(struct vcpu *v, unsigned long mfn)
398 {
399 v->arch.cr3 = mfn << PAGE_SHIFT;
400 }
402 #endif /* !CONFIG_X86_PAE */
404 void write_ptbase(struct vcpu *v)
405 {
406 write_cr3(v->arch.cr3);
407 }
409 /*
410 * Should be called after CR3 is updated.
411 *
412 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
413 * for HVM guests, arch.monitor_table and hvm's guest CR3.
414 *
415 * Update ref counts to shadow tables appropriately.
416 */
417 void update_cr3(struct vcpu *v)
418 {
419 unsigned long cr3_mfn=0;
421 if ( paging_mode_enabled(v->domain) )
422 {
423 paging_update_cr3(v);
424 return;
425 }
427 #if CONFIG_PAGING_LEVELS == 4
428 if ( !(v->arch.flags & TF_kernel_mode) )
429 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
430 else
431 #endif
432 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
434 make_cr3(v, cr3_mfn);
435 }
438 static void invalidate_shadow_ldt(struct vcpu *v)
439 {
440 int i;
441 unsigned long pfn;
442 struct page_info *page;
444 if ( v->arch.shadow_ldt_mapcnt == 0 )
445 return;
447 v->arch.shadow_ldt_mapcnt = 0;
449 for ( i = 16; i < 32; i++ )
450 {
451 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
452 if ( pfn == 0 ) continue;
453 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
454 page = mfn_to_page(pfn);
455 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
456 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
457 put_page_and_type(page);
458 }
460 /* Dispose of the (now possibly invalid) mappings from the TLB. */
461 if ( v == current )
462 queue_deferred_ops(v->domain, DOP_FLUSH_TLB | DOP_RELOAD_LDT);
463 else
464 flush_tlb_mask(v->domain->domain_dirty_cpumask);
465 }
468 static int alloc_segdesc_page(struct page_info *page)
469 {
470 struct desc_struct *descs;
471 int i;
473 descs = map_domain_page(page_to_mfn(page));
475 for ( i = 0; i < 512; i++ )
476 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
477 goto fail;
479 unmap_domain_page(descs);
480 return 1;
482 fail:
483 unmap_domain_page(descs);
484 return 0;
485 }
488 /* Map shadow page at offset @off. */
489 int map_ldt_shadow_page(unsigned int off)
490 {
491 struct vcpu *v = current;
492 struct domain *d = v->domain;
493 unsigned long gmfn, mfn;
494 l1_pgentry_t l1e, nl1e;
495 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
496 int okay;
498 BUG_ON(unlikely(in_irq()));
500 guest_get_eff_kern_l1e(v, gva, &l1e);
501 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
502 return 0;
504 gmfn = l1e_get_pfn(l1e);
505 mfn = gmfn_to_mfn(d, gmfn);
506 if ( unlikely(!mfn_valid(mfn)) )
507 return 0;
509 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
510 if ( unlikely(!okay) )
511 return 0;
513 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
515 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
516 v->arch.shadow_ldt_mapcnt++;
518 return 1;
519 }
522 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
523 {
524 struct page_info *page = mfn_to_page(page_nr);
526 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
527 {
528 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
529 return 0;
530 }
532 return 1;
533 }
536 static int get_page_and_type_from_pagenr(unsigned long page_nr,
537 unsigned long type,
538 struct domain *d)
539 {
540 struct page_info *page = mfn_to_page(page_nr);
542 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
543 return 0;
545 if ( unlikely(!get_page_type(page, type)) )
546 {
547 put_page(page);
548 return 0;
549 }
551 return 1;
552 }
554 /*
555 * We allow root tables to map each other (a.k.a. linear page tables). It
556 * needs some special care with reference counts and access permissions:
557 * 1. The mapping entry must be read-only, or the guest may get write access
558 * to its own PTEs.
559 * 2. We must only bump the reference counts for an *already validated*
560 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
561 * on a validation that is required to complete that validation.
562 * 3. We only need to increment the reference counts for the mapped page
563 * frame if it is mapped by a different root table. This is sufficient and
564 * also necessary to allow validation of a root table mapping itself.
565 */
566 #define define_get_linear_pagetable(level) \
567 static int \
568 get_##level##_linear_pagetable( \
569 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
570 { \
571 unsigned long x, y; \
572 struct page_info *page; \
573 unsigned long pfn; \
574 \
575 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
576 { \
577 MEM_LOG("Attempt to create linear p.t. with write perms"); \
578 return 0; \
579 } \
580 \
581 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
582 { \
583 /* Make sure the mapped frame belongs to the correct domain. */ \
584 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
585 return 0; \
586 \
587 /* \
588 * Ensure that the mapped frame is an already-validated page table. \
589 * If so, atomically increment the count (checking for overflow). \
590 */ \
591 page = mfn_to_page(pfn); \
592 y = page->u.inuse.type_info; \
593 do { \
594 x = y; \
595 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
596 unlikely((x & (PGT_type_mask|PGT_validated)) != \
597 (PGT_##level##_page_table|PGT_validated)) ) \
598 { \
599 put_page(page); \
600 return 0; \
601 } \
602 } \
603 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
604 } \
605 \
606 return 1; \
607 }
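/*
 * Editor's note -- illustrative sketch, not part of this file.  A typical use
 * of the linear-pagetable rules above is a guest installing a "self map":
 * pointing one slot of a root table back at the table itself so that all of
 * its lower-level tables become visible through that virtual window.  Per
 * rule 1 the entry must not carry _PAGE_RW, or get_l2_linear_pagetable()
 * (generated just below) will refuse it.  Guest-side sketch, reusing the
 * mmu_update wrapper shown near the top of the file; SELF_SLOT and
 * l2_table_maddr are placeholders:
 */
#if 0 /* example only */
    struct mmu_update req;
    int success_count = 0, rc;

    req.ptr = l2_table_maddr + SELF_SLOT * sizeof(l2_pgentry_t);
    req.val = l2_table_maddr | _PAGE_PRESENT;   /* note: no _PAGE_RW */
    rc = HYPERVISOR_mmu_update(&req, 1, &success_count, DOMID_SELF);
#endif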
610 int is_iomem_page(unsigned long mfn)
611 {
612 return (!mfn_valid(mfn) || (page_get_owner(mfn_to_page(mfn)) == dom_io));
613 }
616 int
617 get_page_from_l1e(
618 l1_pgentry_t l1e, struct domain *d)
619 {
620 unsigned long mfn = l1e_get_pfn(l1e);
621 struct page_info *page = mfn_to_page(mfn);
622 uint32_t l1f = l1e_get_flags(l1e);
623 int okay;
625 if ( !(l1f & _PAGE_PRESENT) )
626 return 1;
628 if ( unlikely(l1f & l1_disallow_mask(d)) )
629 {
630 MEM_LOG("Bad L1 flags %x", l1f & l1_disallow_mask(d));
631 return 0;
632 }
634 if ( is_iomem_page(mfn) )
635 {
636 /* DOMID_IO reverts to caller for privilege checks. */
637 if ( d == dom_io )
638 d = current->domain;
640 if ( !iomem_access_permitted(d, mfn, mfn) )
641 {
642 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
643 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
644 d->domain_id, mfn);
645 return 0;
646 }
648 /* No reference counting for out-of-range I/O pages. */
649 if ( !mfn_valid(mfn) )
650 return 1;
652 d = dom_io;
653 }
655 /* Foreign mappings into guests in shadow external mode don't
656 * contribute to writeable mapping refcounts. (This allows the
657 * qemu-dm helper process in dom0 to map the domain's memory without
658 * messing up the count of "real" writable mappings.) */
659 okay = (((l1f & _PAGE_RW) &&
660 !(unlikely(paging_mode_external(d) && (d != current->domain))))
661 ? get_page_and_type(page, d, PGT_writable_page)
662 : get_page(page, d));
663 if ( !okay )
664 {
665 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
666 " for dom%d",
667 mfn, get_gpfn_from_mfn(mfn),
668 l1e_get_intpte(l1e), d->domain_id);
669 }
670 else if ( (pte_flags_to_cacheattr(l1f) !=
671 ((page->count_info >> PGC_cacheattr_base) & 7)) &&
672 !is_iomem_page(mfn) )
673 {
674 uint32_t x, nx, y = page->count_info;
675 uint32_t cacheattr = pte_flags_to_cacheattr(l1f);
677 if ( is_xen_heap_frame(page) )
678 {
679 if ( (l1f & _PAGE_RW) &&
680 !(unlikely(paging_mode_external(d) &&
681 (d != current->domain))) )
682 put_page_type(page);
683 put_page(page);
684 MEM_LOG("Attempt to change cache attributes of Xen heap page");
685 return 0;
686 }
688 while ( ((y >> PGC_cacheattr_base) & 7) != cacheattr )
689 {
690 x = y;
691 nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
692 y = cmpxchg(&page->count_info, x, nx);
693 }
695 #ifdef __x86_64__
696 map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
697 PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
698 #endif
699 }
701 return okay;
702 }
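/*
 * Editor's note -- illustrative sketch, not part of this file.  The
 * cache-attribute handling above (the subject of this changeset) packs the
 * PTE caching bits into a small integer kept in page->count_info under
 * PGC_cacheattr_mask, and then remaps the page in Xen's own 1:1 mapping with
 * the same attributes so the guest mapping and the hypervisor mapping cannot
 * disagree.  The helpers live in a header that is not part of this file; the
 * sketch below shows their expected shape and is an assumption, not a quote:
 */
#if 0 /* example only */
static inline uint32_t example_pte_flags_to_cacheattr(uint32_t flags)
{
    /* PAT is PTE bit 7, PCD is bit 4, PWT is bit 3 -> pack into 3 bits. */
    return ((flags >> 5) & 4) | ((flags >> 3) & 3);
}

static inline uint32_t example_cacheattr_to_pte_flags(uint32_t cacheattr)
{
    return ((cacheattr & 4) << 5) | ((cacheattr & 3) << 3);
}
#endif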
705 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
706 define_get_linear_pagetable(l2);
707 static int
708 get_page_from_l2e(
709 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
710 {
711 int rc;
713 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
714 return 1;
716 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
717 {
718 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
719 return 0;
720 }
722 rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d);
723 if ( unlikely(!rc) )
724 rc = get_l2_linear_pagetable(l2e, pfn, d);
726 return rc;
727 }
730 #if CONFIG_PAGING_LEVELS >= 3
731 define_get_linear_pagetable(l3);
732 static int
733 get_page_from_l3e(
734 l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
735 {
736 int rc;
738 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
739 return 1;
741 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
742 {
743 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
744 return 0;
745 }
747 rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d);
748 if ( unlikely(!rc) )
749 rc = get_l3_linear_pagetable(l3e, pfn, d);
751 return rc;
752 }
753 #endif /* 3 level */
755 #if CONFIG_PAGING_LEVELS >= 4
756 define_get_linear_pagetable(l4);
757 static int
758 get_page_from_l4e(
759 l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
760 {
761 int rc;
763 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
764 return 1;
766 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
767 {
768 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
769 return 0;
770 }
772 rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d);
773 if ( unlikely(!rc) )
774 rc = get_l4_linear_pagetable(l4e, pfn, d);
776 return rc;
777 }
778 #endif /* 4 level */
780 #ifdef __x86_64__
782 #ifdef USER_MAPPINGS_ARE_GLOBAL
783 #define adjust_guest_l1e(pl1e, d) \
784 do { \
785 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
786 likely(!is_pv_32on64_domain(d)) ) \
787 { \
788 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
789 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
790 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
791 MEM_LOG("Global bit is set to kernel page %lx", \
792 l1e_get_pfn((pl1e))); \
793 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
794 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
795 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
796 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
797 } \
798 } while ( 0 )
799 #else
800 #define adjust_guest_l1e(pl1e, d) \
801 do { \
802 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
803 likely(!is_pv_32on64_domain(d)) ) \
804 l1e_add_flags((pl1e), _PAGE_USER); \
805 } while ( 0 )
806 #endif
808 #define adjust_guest_l2e(pl2e, d) \
809 do { \
810 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
811 likely(!is_pv_32on64_domain(d)) ) \
812 l2e_add_flags((pl2e), _PAGE_USER); \
813 } while ( 0 )
815 #define adjust_guest_l3e(pl3e, d) \
816 do { \
817 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
818 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
819 _PAGE_USER : \
820 _PAGE_USER|_PAGE_RW); \
821 } while ( 0 )
823 #define adjust_guest_l4e(pl4e, d) \
824 do { \
825 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
826 likely(!is_pv_32on64_domain(d)) ) \
827 l4e_add_flags((pl4e), _PAGE_USER); \
828 } while ( 0 )
830 #else /* !defined(__x86_64__) */
832 #define adjust_guest_l1e(_p, _d) ((void)(_d))
833 #define adjust_guest_l2e(_p, _d) ((void)(_d))
834 #define adjust_guest_l3e(_p, _d) ((void)(_d))
836 #endif
838 #ifdef CONFIG_COMPAT
839 #define unadjust_guest_l3e(pl3e, d) \
840 do { \
841 if ( unlikely(is_pv_32on64_domain(d)) && \
842 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
843 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
844 } while ( 0 )
845 #else
846 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
847 #endif
849 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
850 {
851 unsigned long pfn = l1e_get_pfn(l1e);
852 struct page_info *page = mfn_to_page(pfn);
853 struct domain *e;
854 struct vcpu *v;
856 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(pfn) )
857 return;
859 e = page_get_owner(page);
861 /*
862 * Check if this is a mapping that was established via a grant reference.
863 * If it was then we should not be here: we require that such mappings are
864 * explicitly destroyed via the grant-table interface.
865 *
866 * The upshot of this is that the guest can end up with active grants that
867 * it cannot destroy (because it no longer has a PTE to present to the
868 * grant-table interface). This can lead to subtle hard-to-catch bugs,
869 * hence a special grant PTE flag can be enabled to catch the bug early.
870 *
871 * (Note that the undestroyable active grants are not a security hole in
872 * Xen. All active grants can safely be cleaned up when the domain dies.)
873 */
874 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
875 !d->is_shutting_down && !d->is_dying )
876 {
877 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
878 l1e_get_intpte(l1e));
879 domain_crash(d);
880 }
882 /* Remember we didn't take a type-count of foreign writable mappings
883 * to paging-external domains */
884 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
885 !(unlikely((e != d) && paging_mode_external(e))) )
886 {
887 put_page_and_type(page);
888 }
889 else
890 {
891 /* We expect this is rare so we blow the entire shadow LDT. */
892 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
893 PGT_ldt_page)) &&
894 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
895 (d == e) )
896 {
897 for_each_vcpu ( d, v )
898 invalidate_shadow_ldt(v);
899 }
900 put_page(page);
901 }
902 }
905 /*
906 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
907 * Note also that this automatically deals correctly with linear p.t.'s.
908 */
909 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
910 {
911 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
912 (l2e_get_pfn(l2e) != pfn) )
913 put_page_and_type(l2e_get_page(l2e));
914 }
917 #if CONFIG_PAGING_LEVELS >= 3
918 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
919 {
920 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
921 (l3e_get_pfn(l3e) != pfn) )
922 put_page_and_type(l3e_get_page(l3e));
923 }
924 #endif
926 #if CONFIG_PAGING_LEVELS >= 4
927 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
928 {
929 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
930 (l4e_get_pfn(l4e) != pfn) )
931 put_page_and_type(l4e_get_page(l4e));
932 }
933 #endif
935 static int alloc_l1_table(struct page_info *page)
936 {
937 struct domain *d = page_get_owner(page);
938 unsigned long pfn = page_to_mfn(page);
939 l1_pgentry_t *pl1e;
940 int i;
942 pl1e = map_domain_page(pfn);
944 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
945 {
946 if ( is_guest_l1_slot(i) &&
947 unlikely(!get_page_from_l1e(pl1e[i], d)) )
948 goto fail;
950 adjust_guest_l1e(pl1e[i], d);
951 }
953 unmap_domain_page(pl1e);
954 return 1;
956 fail:
957 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
958 while ( i-- > 0 )
959 if ( is_guest_l1_slot(i) )
960 put_page_from_l1e(pl1e[i], d);
962 unmap_domain_page(pl1e);
963 return 0;
964 }
966 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
967 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
968 {
969 struct page_info *page;
970 l2_pgentry_t *pl2e;
971 l3_pgentry_t l3e3;
972 #ifndef CONFIG_COMPAT
973 l2_pgentry_t l2e;
974 int i;
975 #endif
977 if ( !is_pv_32bit_domain(d) )
978 return 1;
980 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
982 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
983 l3e3 = pl3e[3];
984 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
985 {
986 MEM_LOG("PAE L3 3rd slot is empty");
987 return 0;
988 }
990 /*
991 * The Xen-private mappings include linear mappings. The L2 thus cannot
992 * be shared by multiple L3 tables. The test here is adequate because:
993 * 1. Cannot appear in slots != 3 because get_page_type() checks the
994 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
995 * 2. Cannot appear in another page table's L3:
996 * a. alloc_l3_table() calls this function and this check will fail
997 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
998 */
999 page = l3e_get_page(l3e3);
1000 BUG_ON(page->u.inuse.type_info & PGT_pinned);
1001 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1002 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1003 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1004 {
1005 MEM_LOG("PAE L3 3rd slot is shared");
1006 return 0;
1007 }
1009 /* Xen private mappings. */
1010 pl2e = map_domain_page(l3e_get_pfn(l3e3));
1011 #ifndef CONFIG_COMPAT
1012 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1013 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1014 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1015 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1016 {
1017 l2e = l2e_from_page(
1018 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1019 __PAGE_HYPERVISOR);
1020 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
1021 }
1022 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
1023 {
1024 l2e = l2e_empty();
1025 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
1026 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
1027 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
1028 }
1029 #else
1030 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1031 &compat_idle_pg_table_l2[
1032 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1033 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
1034 #endif
1035 unmap_domain_page(pl2e);
1037 return 1;
1038 }
1039 #else
1040 # define create_pae_xen_mappings(d, pl3e) (1)
1041 #endif
1043 #ifdef CONFIG_X86_PAE
1044 /* Flush a pgdir update into low-memory caches. */
1045 static void pae_flush_pgd(
1046 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
1048 struct domain *d = page_get_owner(mfn_to_page(mfn));
1049 struct vcpu *v;
1050 intpte_t _ol3e, _nl3e, _pl3e;
1051 l3_pgentry_t *l3tab_ptr;
1052 struct pae_l3_cache *cache;
1054 if ( unlikely(shadow_mode_enabled(d)) )
1056 cpumask_t m = CPU_MASK_NONE;
1057 /* Re-shadow this l3 table on any vcpus that are using it */
1058 for_each_vcpu ( d, v )
1059 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
1061 paging_update_cr3(v);
1062 cpus_or(m, m, v->vcpu_dirty_cpumask);
1064 flush_tlb_mask(m);
1067 /* If below 4GB then the pgdir is not shadowed in low memory. */
1068 if ( !l3tab_needs_shadow(mfn) )
1069 return;
1071 for_each_vcpu ( d, v )
1073 cache = &v->arch.pae_l3_cache;
1075 spin_lock(&cache->lock);
1077 if ( cache->high_mfn == mfn )
1079 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1080 _ol3e = l3e_get_intpte(*l3tab_ptr);
1081 _nl3e = l3e_get_intpte(nl3e);
1082 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1083 BUG_ON(_pl3e != _ol3e);
1086 spin_unlock(&cache->lock);
1089 flush_tlb_mask(d->domain_dirty_cpumask);
1091 #else
1092 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1093 #endif
1095 static int alloc_l2_table(struct page_info *page, unsigned long type)
1097 struct domain *d = page_get_owner(page);
1098 unsigned long pfn = page_to_mfn(page);
1099 l2_pgentry_t *pl2e;
1100 int i;
1102 pl2e = map_domain_page(pfn);
1104 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1106 if ( is_guest_l2_slot(d, type, i) &&
1107 unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) )
1108 goto fail;
1110 adjust_guest_l2e(pl2e[i], d);
1113 #if CONFIG_PAGING_LEVELS == 2
1114 /* Xen private mappings. */
1115 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1116 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1117 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1118 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1119 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
1120 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1121 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1122 l2e_from_page(
1123 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1124 __PAGE_HYPERVISOR);
1125 #endif
1127 unmap_domain_page(pl2e);
1128 return 1;
1130 fail:
1131 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1132 while ( i-- > 0 )
1133 if ( is_guest_l2_slot(d, type, i) )
1134 put_page_from_l2e(pl2e[i], pfn);
1136 unmap_domain_page(pl2e);
1137 return 0;
1141 #if CONFIG_PAGING_LEVELS >= 3
1142 static int alloc_l3_table(struct page_info *page)
1144 struct domain *d = page_get_owner(page);
1145 unsigned long pfn = page_to_mfn(page);
1146 l3_pgentry_t *pl3e;
1147 int i;
1149 #ifdef CONFIG_X86_PAE
1150 /*
1151 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1152 * the weird 'extended cr3' format for dealing with high-order address
1153 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1154 */
1155 if ( (pfn >= 0x100000) &&
1156 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1157 d->vcpu[0] && d->vcpu[0]->is_initialised )
1159 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1160 return 0;
1162 #endif
1164 pl3e = map_domain_page(pfn);
1166 /*
1167 * PAE guests allocate full pages, but aren't required to initialize
1168 * more than the first four entries; when running in compatibility
1169 * mode, however, the full page is visible to the MMU, and hence all
1170 * 512 entries must be valid/verified, which is most easily achieved
1171 * by clearing them out.
1172 */
1173 if ( is_pv_32on64_domain(d) )
1174 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1176 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1178 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
1179 if ( is_pv_32bit_domain(d) && (i == 3) )
1181 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1182 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ||
1183 !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1184 PGT_l2_page_table |
1185 PGT_pae_xen_l2,
1186 d) )
1187 goto fail;
1189 else
1190 #endif
1191 if ( is_guest_l3_slot(i) &&
1192 unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
1193 goto fail;
1195 adjust_guest_l3e(pl3e[i], d);
1198 if ( !create_pae_xen_mappings(d, pl3e) )
1199 goto fail;
1201 unmap_domain_page(pl3e);
1202 return 1;
1204 fail:
1205 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1206 while ( i-- > 0 )
1207 if ( is_guest_l3_slot(i) )
1208 put_page_from_l3e(pl3e[i], pfn);
1210 unmap_domain_page(pl3e);
1211 return 0;
1213 #else
1214 #define alloc_l3_table(page) (0)
1215 #endif
1217 #if CONFIG_PAGING_LEVELS >= 4
1218 static int alloc_l4_table(struct page_info *page)
1220 struct domain *d = page_get_owner(page);
1221 unsigned long pfn = page_to_mfn(page);
1222 l4_pgentry_t *pl4e = page_to_virt(page);
1223 int i;
1225 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1227 if ( is_guest_l4_slot(d, i) &&
1228 unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
1229 goto fail;
1231 adjust_guest_l4e(pl4e[i], d);
1234 /* Xen private mappings. */
1235 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1236 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1237 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1238 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1239 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1240 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1241 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1242 __PAGE_HYPERVISOR);
1243 if ( is_pv_32on64_domain(d) )
1244 pl4e[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
1245 l4e_from_page(virt_to_page(d->arch.mm_arg_xlat_l3),
1246 __PAGE_HYPERVISOR);
1248 return 1;
1250 fail:
1251 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1252 while ( i-- > 0 )
1253 if ( is_guest_l4_slot(d, i) )
1254 put_page_from_l4e(pl4e[i], pfn);
1256 return 0;
1258 #else
1259 #define alloc_l4_table(page) (0)
1260 #endif
1263 static void free_l1_table(struct page_info *page)
1265 struct domain *d = page_get_owner(page);
1266 unsigned long pfn = page_to_mfn(page);
1267 l1_pgentry_t *pl1e;
1268 int i;
1270 pl1e = map_domain_page(pfn);
1272 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1273 if ( is_guest_l1_slot(i) )
1274 put_page_from_l1e(pl1e[i], d);
1276 unmap_domain_page(pl1e);
1280 static void free_l2_table(struct page_info *page)
1282 #ifdef CONFIG_COMPAT
1283 struct domain *d = page_get_owner(page);
1284 #endif
1285 unsigned long pfn = page_to_mfn(page);
1286 l2_pgentry_t *pl2e;
1287 int i;
1289 pl2e = map_domain_page(pfn);
1291 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1292 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
1293 put_page_from_l2e(pl2e[i], pfn);
1295 unmap_domain_page(pl2e);
1297 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1301 #if CONFIG_PAGING_LEVELS >= 3
1303 static void free_l3_table(struct page_info *page)
1305 struct domain *d = page_get_owner(page);
1306 unsigned long pfn = page_to_mfn(page);
1307 l3_pgentry_t *pl3e;
1308 int i;
1310 pl3e = map_domain_page(pfn);
1312 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1313 if ( is_guest_l3_slot(i) )
1315 put_page_from_l3e(pl3e[i], pfn);
1316 unadjust_guest_l3e(pl3e[i], d);
1319 unmap_domain_page(pl3e);
1322 #endif
1324 #if CONFIG_PAGING_LEVELS >= 4
1326 static void free_l4_table(struct page_info *page)
1328 struct domain *d = page_get_owner(page);
1329 unsigned long pfn = page_to_mfn(page);
1330 l4_pgentry_t *pl4e = page_to_virt(page);
1331 int i;
1333 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1334 if ( is_guest_l4_slot(d, i) )
1335 put_page_from_l4e(pl4e[i], pfn);
1338 #endif
1341 /* How to write an entry to the guest pagetables.
1342 * Returns 0 for failure (pointer not valid), 1 for success. */
1343 static inline int update_intpte(intpte_t *p,
1344 intpte_t old,
1345 intpte_t new,
1346 unsigned long mfn,
1347 struct vcpu *v)
1349 int rv = 1;
1350 #ifndef PTE_UPDATE_WITH_CMPXCHG
1351 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1352 #else
1354 intpte_t t = old;
1355 for ( ; ; )
1357 rv = paging_cmpxchg_guest_entry(v, p, &t, new, _mfn(mfn));
1358 if ( unlikely(rv == 0) )
1360 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1361 ": saw %" PRIpte, old, new, t);
1362 break;
1365 if ( t == old )
1366 break;
1368 /* Allowed to change in Accessed/Dirty flags only. */
1369 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1371 old = t;
1374 #endif
1375 return rv;
1378 /* Macro that wraps the appropriate type-changes around update_intpte().
1379 * Arguments are: type, ptr, old, new, mfn, vcpu */
1380 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v) \
1381 update_intpte(&_t ## e_get_intpte(*(_p)), \
1382 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1383 (_m), (_v))
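/*
 * Editor's note -- expansion example, not part of this file.  The token
 * pasting in UPDATE_ENTRY() can be hard to read; for the L1 case,
 * UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current) expands to roughly:
 */
#if 0 /* example only */
    update_intpte(&l1e_get_intpte(*(pl1e)),
                  l1e_get_intpte(ol1e), l1e_get_intpte(nl1e),
                  (gl1mfn), (current));
#endif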
1385 /* Update the L1 entry at pl1e to new value nl1e. */
1386 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1387 unsigned long gl1mfn)
1389 l1_pgentry_t ol1e;
1390 struct domain *d = current->domain;
1391 unsigned long mfn;
1393 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1394 return 0;
1396 if ( unlikely(paging_mode_refcounts(d)) )
1397 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current);
1399 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1401 /* Translate foreign guest addresses. */
1402 mfn = gmfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e));
1403 if ( unlikely(mfn == INVALID_MFN) )
1404 return 0;
1405 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1406 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1408 if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(d)) )
1410 MEM_LOG("Bad L1 flags %x",
1411 l1e_get_flags(nl1e) & l1_disallow_mask(d));
1412 return 0;
1415 adjust_guest_l1e(nl1e, d);
1417 /* Fast path for identical mapping, r/w and presence. */
1418 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1419 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current);
1421 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1422 return 0;
1424 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current)) )
1426 put_page_from_l1e(nl1e, d);
1427 return 0;
1430 else
1432 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current)) )
1433 return 0;
1436 put_page_from_l1e(ol1e, d);
1437 return 1;
1441 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1442 static int mod_l2_entry(l2_pgentry_t *pl2e,
1443 l2_pgentry_t nl2e,
1444 unsigned long pfn,
1445 unsigned long type)
1447 l2_pgentry_t ol2e;
1448 struct domain *d = current->domain;
1450 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1452 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1453 return 0;
1456 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1457 return 0;
1459 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1461 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1463 MEM_LOG("Bad L2 flags %x",
1464 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1465 return 0;
1468 adjust_guest_l2e(nl2e, d);
1470 /* Fast path for identical mapping and presence. */
1471 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1472 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current);
1474 if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) )
1475 return 0;
1477 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current)) )
1479 put_page_from_l2e(nl2e, pfn);
1480 return 0;
1483 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current)) )
1485 return 0;
1488 put_page_from_l2e(ol2e, pfn);
1489 return 1;
1492 #if CONFIG_PAGING_LEVELS >= 3
1494 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1495 static int mod_l3_entry(l3_pgentry_t *pl3e,
1496 l3_pgentry_t nl3e,
1497 unsigned long pfn)
1499 l3_pgentry_t ol3e;
1500 struct domain *d = current->domain;
1501 int okay;
1503 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1505 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1506 return 0;
1509 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
1510 /*
1511 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1512 * would be a pain to ensure they remain continuously valid throughout.
1513 */
1514 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1515 return 0;
1516 #endif
1518 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1519 return 0;
1521 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1523 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1525 MEM_LOG("Bad L3 flags %x",
1526 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1527 return 0;
1530 adjust_guest_l3e(nl3e, d);
1532 /* Fast path for identical mapping and presence. */
1533 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1534 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current);
1536 if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) )
1537 return 0;
1539 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current)) )
1541 put_page_from_l3e(nl3e, pfn);
1542 return 0;
1545 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current)) )
1547 return 0;
1550 okay = create_pae_xen_mappings(d, pl3e);
1551 BUG_ON(!okay);
1553 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1555 put_page_from_l3e(ol3e, pfn);
1556 return 1;
1559 #endif
1561 #if CONFIG_PAGING_LEVELS >= 4
1563 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1564 static int mod_l4_entry(struct domain *d,
1565 l4_pgentry_t *pl4e,
1566 l4_pgentry_t nl4e,
1567 unsigned long pfn)
1569 l4_pgentry_t ol4e;
1571 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1573 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1574 return 0;
1577 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1578 return 0;
1580 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1582 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1584 MEM_LOG("Bad L4 flags %x",
1585 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1586 return 0;
1589 adjust_guest_l4e(nl4e, current->domain);
1591 /* Fast path for identical mapping and presence. */
1592 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1593 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current);
1595 if ( unlikely(!get_page_from_l4e(nl4e, pfn, current->domain)) )
1596 return 0;
1598 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current)) )
1600 put_page_from_l4e(nl4e, pfn);
1601 return 0;
1604 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current)) )
1606 return 0;
1609 put_page_from_l4e(ol4e, pfn);
1610 return 1;
1613 #endif
1615 static int alloc_page_type(struct page_info *page, unsigned long type)
1617 struct domain *owner = page_get_owner(page);
1619 /* A page table is dirtied when its type count becomes non-zero. */
1620 if ( likely(owner != NULL) )
1621 paging_mark_dirty(owner, page_to_mfn(page));
1623 switch ( type & PGT_type_mask )
1625 case PGT_l1_page_table:
1626 return alloc_l1_table(page);
1627 case PGT_l2_page_table:
1628 return alloc_l2_table(page, type);
1629 case PGT_l3_page_table:
1630 return alloc_l3_table(page);
1631 case PGT_l4_page_table:
1632 return alloc_l4_table(page);
1633 case PGT_gdt_page:
1634 case PGT_ldt_page:
1635 return alloc_segdesc_page(page);
1636 default:
1637 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1638 type, page->u.inuse.type_info,
1639 page->count_info);
1640 BUG();
1643 return 0;
1647 void free_page_type(struct page_info *page, unsigned long type)
1649 struct domain *owner = page_get_owner(page);
1650 unsigned long gmfn;
1652 if ( likely(owner != NULL) )
1654 /*
1655 * We have to flush before the next use of the linear mapping
1656 * (e.g., update_va_mapping()) or we could end up modifying a page
1657 * that is no longer a page table (and hence screw up ref counts).
1658 */
1659 if ( current->domain == owner )
1660 queue_deferred_ops(owner, DOP_FLUSH_ALL_TLBS);
1661 else
1662 flush_tlb_mask(owner->domain_dirty_cpumask);
1664 if ( unlikely(paging_mode_enabled(owner)) )
1666 /* A page table is dirtied when its type count becomes zero. */
1667 paging_mark_dirty(owner, page_to_mfn(page));
1669 if ( shadow_mode_refcounts(owner) )
1670 return;
1672 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1673 ASSERT(VALID_M2P(gmfn));
1674 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
1678 switch ( type & PGT_type_mask )
1680 case PGT_l1_page_table:
1681 free_l1_table(page);
1682 break;
1684 case PGT_l2_page_table:
1685 free_l2_table(page);
1686 break;
1688 #if CONFIG_PAGING_LEVELS >= 3
1689 case PGT_l3_page_table:
1690 free_l3_table(page);
1691 break;
1692 #endif
1694 #if CONFIG_PAGING_LEVELS >= 4
1695 case PGT_l4_page_table:
1696 free_l4_table(page);
1697 break;
1698 #endif
1700 default:
1701 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1702 type, page_to_mfn(page));
1703 BUG();
1708 void put_page_type(struct page_info *page)
1710 unsigned long nx, x, y = page->u.inuse.type_info;
1712 again:
1713 do {
1714 x = y;
1715 nx = x - 1;
1717 ASSERT((x & PGT_count_mask) != 0);
1719 if ( unlikely((nx & PGT_count_mask) == 0) )
1721 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1722 likely(nx & PGT_validated) )
1724 /*
1725 * Page-table pages must be unvalidated when count is zero. The
1726 * 'free' is safe because the refcnt is non-zero and validated
1727 * bit is clear => other ops will spin or fail.
1728 */
1729 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1730 x & ~PGT_validated)) != x) )
1731 goto again;
1732 /* We cleared the 'valid bit' so we do the clean up. */
1733 free_page_type(page, x);
1734 /* Carry on, but with the 'valid bit' now clear. */
1735 x &= ~PGT_validated;
1736 nx &= ~PGT_validated;
1739 /*
1740 * Record TLB information for flush later. We do not stamp page
1741 * tables when running in shadow mode:
1742 * 1. Pointless, since it's the shadow pt's which must be tracked.
1743 * 2. Shadow mode reuses this field for shadowed page tables to
1744 * store flags info -- we don't want to conflict with that.
1745 */
1746 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
1747 (page->count_info & PGC_page_table)) )
1748 page->tlbflush_timestamp = tlbflush_current_time();
1751 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1755 int get_page_type(struct page_info *page, unsigned long type)
1757 unsigned long nx, x, y = page->u.inuse.type_info;
1759 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
1761 again:
1762 do {
1763 x = y;
1764 nx = x + 1;
1765 if ( unlikely((nx & PGT_count_mask) == 0) )
1767 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1768 return 0;
1770 else if ( unlikely((x & PGT_count_mask) == 0) )
1772 struct domain *d = page_get_owner(page);
1774 /* Never allow a shadowed frame to go from type count 0 to 1 */
1775 if ( d && shadow_mode_enabled(d) )
1776 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
1778 ASSERT(!(x & PGT_pae_xen_l2));
1779 if ( (x & PGT_type_mask) != type )
1781 /*
1782 * On type change we check to flush stale TLB entries. This
1783 * may be unnecessary (e.g., page was GDT/LDT) but those
1784 * circumstances should be very rare.
1785 */
1786 cpumask_t mask = d->domain_dirty_cpumask;
1788 /* Don't flush if the timestamp is old enough */
1789 tlbflush_filter(mask, page->tlbflush_timestamp);
1791 if ( unlikely(!cpus_empty(mask)) &&
1792 /* Shadow mode: track only writable pages. */
1793 (!shadow_mode_enabled(page_get_owner(page)) ||
1794 ((nx & PGT_type_mask) == PGT_writable_page)) )
1796 perfc_incr(need_flush_tlb_flush);
1797 flush_tlb_mask(mask);
1800 /* We lose existing type, back pointer, and validity. */
1801 nx &= ~(PGT_type_mask | PGT_validated);
1802 nx |= type;
1804 /* No special validation needed for writable pages. */
1805 /* Page tables and GDT/LDT need to be scanned for validity. */
1806 if ( type == PGT_writable_page )
1807 nx |= PGT_validated;
1810 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
1812 /* Don't log failure if it could be a recursive-mapping attempt. */
1813 if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
1814 (type == PGT_l1_page_table) )
1815 return 0;
1816 if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
1817 (type == PGT_l2_page_table) )
1818 return 0;
1819 if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
1820 (type == PGT_l3_page_table) )
1821 return 0;
1822 MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
1823 "for mfn %lx (pfn %lx)",
1824 x, type, page_to_mfn(page),
1825 get_gpfn_from_mfn(page_to_mfn(page)));
1826 return 0;
1828 else if ( unlikely(!(x & PGT_validated)) )
1830 /* Someone else is updating validation of this page. Wait... */
1831 while ( (y = page->u.inuse.type_info) == x )
1832 cpu_relax();
1833 goto again;
1836 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1838 if ( unlikely(!(nx & PGT_validated)) )
1840 /* Try to validate page type; drop the new reference on failure. */
1841 if ( unlikely(!alloc_page_type(page, type)) )
1843 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1844 PRtype_info ": caf=%08x taf=%" PRtype_info,
1845 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1846 type, page->count_info, page->u.inuse.type_info);
1847 /* No one else can get a reference. We hold the only ref. */
1848 page->u.inuse.type_info = 0;
1849 return 0;
1852 /* No one else is updating simultaneously. */
1853 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1856 return 1;
1860 void cleanup_page_cacheattr(struct page_info *page)
1862 uint32_t cacheattr = (page->count_info >> PGC_cacheattr_base) & 7;
1864 if ( likely(cacheattr == 0) )
1865 return;
1867 page->count_info &= ~PGC_cacheattr_mask;
1869 BUG_ON(is_xen_heap_frame(page));
1871 #ifdef __x86_64__
1872 map_pages_to_xen((unsigned long)page_to_virt(page), page_to_mfn(page),
1873 1, PAGE_HYPERVISOR);
1874 #endif
1878 int new_guest_cr3(unsigned long mfn)
1880 struct vcpu *v = current;
1881 struct domain *d = v->domain;
1882 int okay;
1883 unsigned long old_base_mfn;
1885 #ifdef CONFIG_COMPAT
1886 if ( is_pv_32on64_domain(d) )
1888 okay = paging_mode_refcounts(d)
1889 ? 0 /* Old code was broken, but what should it be? */
1890 : mod_l4_entry(
1891 d,
1892 __va(pagetable_get_paddr(v->arch.guest_table)),
1893 l4e_from_pfn(
1894 mfn,
1895 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
1896 pagetable_get_pfn(v->arch.guest_table));
1897 if ( unlikely(!okay) )
1899 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
1900 return 0;
1903 invalidate_shadow_ldt(v);
1904 write_ptbase(v);
1906 return 1;
1908 #endif
1909 okay = paging_mode_refcounts(d)
1910 ? get_page_from_pagenr(mfn, d)
1911 : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1912 if ( unlikely(!okay) )
1914 MEM_LOG("Error while installing new baseptr %lx", mfn);
1915 return 0;
1918 invalidate_shadow_ldt(v);
1920 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1922 v->arch.guest_table = pagetable_from_pfn(mfn);
1923 update_cr3(v);
1925 write_ptbase(v);
1927 if ( likely(old_base_mfn != 0) )
1929 if ( paging_mode_refcounts(d) )
1930 put_page(mfn_to_page(old_base_mfn));
1931 else
1932 put_page_and_type(mfn_to_page(old_base_mfn));
1935 return 1;
1938 static void process_deferred_ops(void)
1940 unsigned int deferred_ops;
1941 struct domain *d = current->domain;
1942 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
1944 deferred_ops = info->deferred_ops;
1945 info->deferred_ops = 0;
1947 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
1949 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
1950 flush_tlb_mask(d->domain_dirty_cpumask);
1951 else
1952 flush_tlb_local();
1955 if ( deferred_ops & DOP_RELOAD_LDT )
1956 (void)map_ldt_shadow_page(0);
1958 if ( unlikely(info->foreign != NULL) )
1960 rcu_unlock_domain(info->foreign);
1961 info->foreign = NULL;
1965 static int set_foreigndom(domid_t domid)
1967 struct domain *e, *d = current->domain;
1968 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
1969 int okay = 1;
1971 ASSERT(info->foreign == NULL);
1973 if ( likely(domid == DOMID_SELF) )
1974 goto out;
1976 if ( unlikely(domid == d->domain_id) )
1978 MEM_LOG("Dom %u tried to specify itself as foreign domain",
1979 d->domain_id);
1980 okay = 0;
1982 else if ( unlikely(paging_mode_translate(d)) )
1984 MEM_LOG("Cannot mix foreign mappings with translated domains");
1985 okay = 0;
1987 else if ( !IS_PRIV(d) )
1989 switch ( domid )
1991 case DOMID_IO:
1992 info->foreign = rcu_lock_domain(dom_io);
1993 break;
1994 default:
1995 MEM_LOG("Dom %u cannot set foreign dom", d->domain_id);
1996 okay = 0;
1997 break;
2000 else
2002 info->foreign = e = rcu_lock_domain_by_id(domid);
2003 if ( e == NULL )
2005 switch ( domid )
2007 case DOMID_XEN:
2008 info->foreign = rcu_lock_domain(dom_xen);
2009 break;
2010 case DOMID_IO:
2011 info->foreign = rcu_lock_domain(dom_io);
2012 break;
2013 default:
2014 MEM_LOG("Unknown domain '%u'", domid);
2015 okay = 0;
2016 break;
2021 out:
2022 return okay;
2025 static inline cpumask_t vcpumask_to_pcpumask(
2026 struct domain *d, unsigned long vmask)
2028 unsigned int vcpu_id;
2029 cpumask_t pmask = CPU_MASK_NONE;
2030 struct vcpu *v;
2032 while ( vmask != 0 )
2034 vcpu_id = find_first_set_bit(vmask);
2035 vmask &= ~(1UL << vcpu_id);
2036 if ( (vcpu_id < MAX_VIRT_CPUS) &&
2037 ((v = d->vcpu[vcpu_id]) != NULL) )
2038 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
2041 return pmask;
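/*
 * Illustrative sketch (not part of this file): the same find-first-set walk
 * as vcpumask_to_pcpumask() above, but over plain uint64_t masks so it
 * compiles standalone.  EXAMPLE_MAX_VCPUS and the per-vcpu dirty masks are
 * invented for the example; the real code works on Xen's cpumask_t and the
 * domain's vcpu array.
 */
#include <stdint.h>

#define EXAMPLE_MAX_VCPUS 64

static uint64_t example_vcpumask_to_pcpumask(
    uint64_t vmask, const uint64_t vcpu_dirty_pcpus[EXAMPLE_MAX_VCPUS])
{
    uint64_t pmask = 0;

    while ( vmask != 0 )
    {
        /* Peel off the lowest set vcpu bit, exactly as the loop above does. */
        unsigned int vcpu_id = (unsigned int)__builtin_ctzll(vmask);
        vmask &= ~(1ULL << vcpu_id);
        if ( vcpu_id < EXAMPLE_MAX_VCPUS )
            pmask |= vcpu_dirty_pcpus[vcpu_id];
    }

    return pmask;
}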
2044 int do_mmuext_op(
2045 XEN_GUEST_HANDLE(mmuext_op_t) uops,
2046 unsigned int count,
2047 XEN_GUEST_HANDLE(uint) pdone,
2048 unsigned int foreigndom)
2050 struct mmuext_op op;
2051 int rc = 0, i = 0, okay;
2052 unsigned long mfn = 0, gmfn = 0, type;
2053 unsigned int done = 0;
2054 struct page_info *page;
2055 struct vcpu *v = current;
2056 struct domain *d = v->domain;
2058 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2060 count &= ~MMU_UPDATE_PREEMPTED;
2061 if ( unlikely(!guest_handle_is_null(pdone)) )
2062 (void)copy_from_guest(&done, pdone, 1);
2064 else
2065 perfc_incr(calls_to_mmuext_op);
2067 if ( unlikely(!guest_handle_okay(uops, count)) )
2069 rc = -EFAULT;
2070 goto out;
2073 if ( !set_foreigndom(foreigndom) )
2075 rc = -ESRCH;
2076 goto out;
2079 LOCK_BIGLOCK(d);
2081 for ( i = 0; i < count; i++ )
2083 if ( hypercall_preempt_check() )
2085 rc = hypercall_create_continuation(
2086 __HYPERVISOR_mmuext_op, "hihi",
2087 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2088 break;
2091 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2093 MEM_LOG("Bad __copy_from_guest");
2094 rc = -EFAULT;
2095 break;
2098 okay = 1;
2099 gmfn = op.arg1.mfn;
2100 mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
2101 page = mfn_to_page(mfn);
2103 switch ( op.cmd )
2105 case MMUEXT_PIN_L1_TABLE:
2106 type = PGT_l1_page_table;
2107 goto pin_page;
2109 case MMUEXT_PIN_L2_TABLE:
2110 type = PGT_l2_page_table;
2111 goto pin_page;
2113 case MMUEXT_PIN_L3_TABLE:
2114 type = PGT_l3_page_table;
2115 goto pin_page;
2117 case MMUEXT_PIN_L4_TABLE:
2118 if ( is_pv_32bit_domain(FOREIGNDOM) )
2119 break;
2120 type = PGT_l4_page_table;
2122 pin_page:
2123 rc = xsm_memory_pin_page(current->domain, page);
2124 if ( rc )
2125 break;
2127 /* Ignore pinning of invalid paging levels. */
2128 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2129 break;
2131 if ( paging_mode_refcounts(FOREIGNDOM) )
2132 break;
2134 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
2135 if ( unlikely(!okay) )
2137 MEM_LOG("Error while pinning mfn %lx", mfn);
2138 break;
2141 if ( unlikely(test_and_set_bit(_PGT_pinned,
2142 &page->u.inuse.type_info)) )
2144 MEM_LOG("Mfn %lx already pinned", mfn);
2145 put_page_and_type(page);
2146 okay = 0;
2147 break;
2150 /* A page is dirtied when its pin status is set. */
2151 paging_mark_dirty(d, mfn);
2153 /* We can race domain destruction (domain_relinquish_resources). */
2154 if ( unlikely(this_cpu(percpu_mm_info).foreign != NULL) )
2156 int drop_ref;
2157 spin_lock(&FOREIGNDOM->page_alloc_lock);
2158 drop_ref = (FOREIGNDOM->is_dying &&
2159 test_and_clear_bit(_PGT_pinned,
2160 &page->u.inuse.type_info));
2161 spin_unlock(&FOREIGNDOM->page_alloc_lock);
2162 if ( drop_ref )
2163 put_page_and_type(page);
2166 break;
2168 case MMUEXT_UNPIN_TABLE:
2169 if ( paging_mode_refcounts(d) )
2170 break;
2172 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2174 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2175 mfn, page_get_owner(page));
2177 else if ( likely(test_and_clear_bit(_PGT_pinned,
2178 &page->u.inuse.type_info)) )
2180 put_page_and_type(page);
2181 put_page(page);
2182 /* A page is dirtied when its pin status is cleared. */
2183 paging_mark_dirty(d, mfn);
2185 else
2187 okay = 0;
2188 put_page(page);
2189 MEM_LOG("Mfn %lx not pinned", mfn);
2191 break;
2193 case MMUEXT_NEW_BASEPTR:
2194 okay = new_guest_cr3(mfn);
2195 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2196 break;
2198 #ifdef __x86_64__
2199 case MMUEXT_NEW_USER_BASEPTR: {
2200 unsigned long old_mfn;
2202 if ( mfn != 0 )
2204 if ( paging_mode_refcounts(d) )
2205 okay = get_page_from_pagenr(mfn, d);
2206 else
2207 okay = get_page_and_type_from_pagenr(
2208 mfn, PGT_root_page_table, d);
2209 if ( unlikely(!okay) )
2211 MEM_LOG("Error while installing new mfn %lx", mfn);
2212 break;
2216 old_mfn = pagetable_get_pfn(v->arch.guest_table_user);
2217 v->arch.guest_table_user = pagetable_from_pfn(mfn);
2219 if ( old_mfn != 0 )
2221 if ( paging_mode_refcounts(d) )
2222 put_page(mfn_to_page(old_mfn));
2223 else
2224 put_page_and_type(mfn_to_page(old_mfn));
2227 break;
2229 #endif
2231 case MMUEXT_TLB_FLUSH_LOCAL:
2232 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2233 break;
2235 case MMUEXT_INVLPG_LOCAL:
2236 if ( !paging_mode_enabled(d)
2237 || paging_invlpg(v, op.arg1.linear_addr) != 0 )
2238 flush_tlb_one_local(op.arg1.linear_addr);
2239 break;
2241 case MMUEXT_TLB_FLUSH_MULTI:
2242 case MMUEXT_INVLPG_MULTI:
2244 unsigned long vmask;
2245 cpumask_t pmask;
2246 if ( unlikely(copy_from_guest(&vmask, op.arg2.vcpumask, 1)) )
2248 okay = 0;
2249 break;
2251 pmask = vcpumask_to_pcpumask(d, vmask);
2252 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2253 flush_tlb_mask(pmask);
2254 else
2255 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2256 break;
2259 case MMUEXT_TLB_FLUSH_ALL:
2260 flush_tlb_mask(d->domain_dirty_cpumask);
2261 break;
2263 case MMUEXT_INVLPG_ALL:
2264 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2265 break;
2267 case MMUEXT_FLUSH_CACHE:
2268 if ( unlikely(!cache_flush_permitted(d)) )
2270 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2271 okay = 0;
2273 else
2275 wbinvd();
2277 break;
2279 case MMUEXT_SET_LDT:
2281 unsigned long ptr = op.arg1.linear_addr;
2282 unsigned long ents = op.arg2.nr_ents;
2284 if ( paging_mode_external(d) )
2286 MEM_LOG("ignoring SET_LDT hypercall from external "
2287 "domain %u", d->domain_id);
2288 okay = 0;
2290 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2291 (ents > 8192) ||
2292 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2294 okay = 0;
2295 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2297 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2298 (v->arch.guest_context.ldt_base != ptr) )
2300 invalidate_shadow_ldt(v);
2301 v->arch.guest_context.ldt_base = ptr;
2302 v->arch.guest_context.ldt_ents = ents;
2303 load_LDT(v);
2304 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2305 if ( ents != 0 )
2306 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2308 break;
2311 default:
2312 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2313 rc = -ENOSYS;
2314 okay = 0;
2315 break;
2318 if ( unlikely(!okay) )
2320 rc = rc ? rc : -EINVAL;
2321 break;
2324 guest_handle_add_offset(uops, 1);
2327 process_deferred_ops();
2329 UNLOCK_BIGLOCK(d);
2331 perfc_add(num_mmuext_ops, i);
2333 out:
2334 /* Add incremental work we have done to the @done output parameter. */
2335 if ( unlikely(!guest_handle_is_null(pdone)) )
2337 done += i;
2338 copy_to_guest(pdone, &done, 1);
2341 return rc;
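/*
 * Illustrative sketch (not part of this file): the preemption encoding used
 * by do_mmuext_op() above and do_mmu_update() below.  When a continuation is
 * created, the number of *remaining* ops is passed back with
 * MMU_UPDATE_PREEMPTED or'd into it; on re-entry the flag is stripped and
 * the running '@done' total is refreshed from pdone.  The bit position used
 * here is an assumption for the example only; the real definition lives in
 * Xen's headers.
 */
#include <stdint.h>

#define EXAMPLE_MMU_UPDATE_PREEMPTED (1u << 31)    /* assumed flag bit */

static inline uint32_t example_encode_remaining(uint32_t total, uint32_t done)
{
    /* Count argument for the continuation call. */
    return (total - done) | EXAMPLE_MMU_UPDATE_PREEMPTED;
}

static inline uint32_t example_decode_count(uint32_t count, int *was_preempted)
{
    /* On re-entry: detect the flag and recover the plain op count. */
    *was_preempted = (count & EXAMPLE_MMU_UPDATE_PREEMPTED) != 0;
    return count & ~EXAMPLE_MMU_UPDATE_PREEMPTED;
}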
2344 int do_mmu_update(
2345 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2346 unsigned int count,
2347 XEN_GUEST_HANDLE(uint) pdone,
2348 unsigned int foreigndom)
2350 struct mmu_update req;
2351 void *va;
2352 unsigned long gpfn, gmfn, mfn;
2353 struct page_info *page;
2354 int rc = 0, okay = 1, i = 0;
2355 unsigned int cmd, done = 0;
2356 struct vcpu *v = current;
2357 struct domain *d = v->domain;
2358 unsigned long type_info;
2359 struct domain_mmap_cache mapcache;
2361 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2363 count &= ~MMU_UPDATE_PREEMPTED;
2364 if ( unlikely(!guest_handle_is_null(pdone)) )
2365 (void)copy_from_guest(&done, pdone, 1);
2367 else
2368 perfc_incr(calls_to_mmu_update);
2370 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2372 rc = -EFAULT;
2373 goto out;
2376 if ( !set_foreigndom(foreigndom) )
2378 rc = -ESRCH;
2379 goto out;
2382 domain_mmap_cache_init(&mapcache);
2384 LOCK_BIGLOCK(d);
2386 for ( i = 0; i < count; i++ )
2388 if ( hypercall_preempt_check() )
2390 rc = hypercall_create_continuation(
2391 __HYPERVISOR_mmu_update, "hihi",
2392 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2393 break;
2396 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2398 MEM_LOG("Bad __copy_from_guest");
2399 rc = -EFAULT;
2400 break;
2403 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2404 okay = 0;
2406 switch ( cmd )
2408 /*
2409 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2410 */
2411 case MMU_NORMAL_PT_UPDATE:
2413 rc = xsm_mmu_normal_update(current->domain, req.val);
2414 if ( rc )
2415 break;
2417 gmfn = req.ptr >> PAGE_SHIFT;
2418 mfn = gmfn_to_mfn(d, gmfn);
2420 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2422 MEM_LOG("Could not get page for normal update");
2423 break;
2426 va = map_domain_page_with_cache(mfn, &mapcache);
2427 va = (void *)((unsigned long)va +
2428 (unsigned long)(req.ptr & ~PAGE_MASK));
2429 page = mfn_to_page(mfn);
2431 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2433 case PGT_l1_page_table:
2434 case PGT_l2_page_table:
2435 case PGT_l3_page_table:
2436 case PGT_l4_page_table:
2438 if ( paging_mode_refcounts(d) )
2440 MEM_LOG("mmu update on auto-refcounted domain!");
2441 break;
2444 if ( unlikely(!get_page_type(
2445 page, type_info & (PGT_type_mask|PGT_pae_xen_l2))) )
2446 goto not_a_pt;
2448 switch ( type_info & PGT_type_mask )
2450 case PGT_l1_page_table:
2452 l1_pgentry_t l1e = l1e_from_intpte(req.val);
2453 okay = mod_l1_entry(va, l1e, mfn);
2455 break;
2456 case PGT_l2_page_table:
2458 l2_pgentry_t l2e = l2e_from_intpte(req.val);
2459 okay = mod_l2_entry(va, l2e, mfn, type_info);
2461 break;
2462 #if CONFIG_PAGING_LEVELS >= 3
2463 case PGT_l3_page_table:
2465 l3_pgentry_t l3e = l3e_from_intpte(req.val);
2466 okay = mod_l3_entry(va, l3e, mfn);
2468 break;
2469 #endif
2470 #if CONFIG_PAGING_LEVELS >= 4
2471 case PGT_l4_page_table:
2473 l4_pgentry_t l4e = l4e_from_intpte(req.val);
2474 okay = mod_l4_entry(d, va, l4e, mfn);
2476 break;
2477 #endif
2480 put_page_type(page);
2482 break;
2484 default:
2485 not_a_pt:
2487 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2488 break;
2490 okay = paging_write_guest_entry(v, va, req.val, _mfn(mfn));
2492 put_page_type(page);
2494 break;
2497 unmap_domain_page_with_cache(va, &mapcache);
2499 put_page(page);
2500 break;
2502 case MMU_MACHPHYS_UPDATE:
2504 mfn = req.ptr >> PAGE_SHIFT;
2505 gpfn = req.val;
2507 rc = xsm_mmu_machphys_update(current->domain, mfn);
2508 if ( rc )
2509 break;
2511 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2513 MEM_LOG("Could not get page for mach->phys update");
2514 break;
2517 if ( unlikely(paging_mode_translate(FOREIGNDOM)) )
2519 MEM_LOG("Mach-phys update on auto-translate guest");
2520 break;
2523 set_gpfn_from_mfn(mfn, gpfn);
2524 okay = 1;
2526 paging_mark_dirty(FOREIGNDOM, mfn);
2528 put_page(mfn_to_page(mfn));
2529 break;
2531 default:
2532 MEM_LOG("Invalid page update command %x", cmd);
2533 rc = -ENOSYS;
2534 okay = 0;
2535 break;
2538 if ( unlikely(!okay) )
2540 rc = rc ? rc : -EINVAL;
2541 break;
2544 guest_handle_add_offset(ureqs, 1);
2547 process_deferred_ops();
2549 UNLOCK_BIGLOCK(d);
2551 domain_mmap_cache_destroy(&mapcache);
2553 perfc_add(num_page_updates, i);
2555 out:
2556 /* Add incremental work we have done to the @done output parameter. */
2557 if ( unlikely(!guest_handle_is_null(pdone)) )
2559 done += i;
2560 copy_to_guest(pdone, &done, 1);
2563 return rc;
2567 static int create_grant_pte_mapping(
2568 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
2570 int rc = GNTST_okay;
2571 void *va;
2572 unsigned long gmfn, mfn;
2573 struct page_info *page;
2574 u32 type;
2575 l1_pgentry_t ol1e;
2576 struct domain *d = v->domain;
2578 ASSERT(spin_is_locked(&d->big_lock));
2580 adjust_guest_l1e(nl1e, d);
2582 gmfn = pte_addr >> PAGE_SHIFT;
2583 mfn = gmfn_to_mfn(d, gmfn);
2585 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2587 MEM_LOG("Could not get page for normal update");
2588 return GNTST_general_error;
2591 va = map_domain_page(mfn);
2592 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
2593 page = mfn_to_page(mfn);
2595 type = page->u.inuse.type_info & PGT_type_mask;
2596 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2598 MEM_LOG("Grant map attempted to update a non-L1 page");
2599 rc = GNTST_general_error;
2600 goto failed;
2603 ol1e = *(l1_pgentry_t *)va;
2604 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v) )
2606 put_page_type(page);
2607 rc = GNTST_general_error;
2608 goto failed;
2611 if ( !paging_mode_refcounts(d) )
2612 put_page_from_l1e(ol1e, d);
2614 put_page_type(page);
2616 failed:
2617 unmap_domain_page(va);
2618 put_page(page);
2620 return rc;
2623 static int destroy_grant_pte_mapping(
2624 uint64_t addr, unsigned long frame, struct domain *d)
2626 int rc = GNTST_okay;
2627 void *va;
2628 unsigned long gmfn, mfn;
2629 struct page_info *page;
2630 u32 type;
2631 l1_pgentry_t ol1e;
2633 gmfn = addr >> PAGE_SHIFT;
2634 mfn = gmfn_to_mfn(d, gmfn);
2636 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2638 MEM_LOG("Could not get page for normal update");
2639 return GNTST_general_error;
2642 va = map_domain_page(mfn);
2643 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
2644 page = mfn_to_page(mfn);
2646 type = page->u.inuse.type_info & PGT_type_mask;
2647 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2649 MEM_LOG("Grant map attempted to update a non-L1 page");
2650 rc = GNTST_general_error;
2651 goto failed;
2654 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2656 put_page_type(page);
2657 rc = GNTST_general_error;
2658 goto failed;
2661 /* Check that the virtual address supplied is actually mapped to frame. */
2662 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2664 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
2665 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2666 put_page_type(page);
2667 rc = GNTST_general_error;
2668 goto failed;
2671 /* Delete pagetable entry. */
2672 if ( unlikely(!UPDATE_ENTRY(l1,
2673 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
2674 d->vcpu[0] /* Change if we go to per-vcpu shadows. */)) )
2676 MEM_LOG("Cannot delete PTE entry at %p", va);
2677 put_page_type(page);
2678 rc = GNTST_general_error;
2679 goto failed;
2682 put_page_type(page);
2684 failed:
2685 unmap_domain_page(va);
2686 put_page(page);
2687 return rc;
2691 static int create_grant_va_mapping(
2692 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
2694 l1_pgentry_t *pl1e, ol1e;
2695 struct domain *d = v->domain;
2696 unsigned long gl1mfn;
2697 int okay;
2699 ASSERT(spin_is_locked(&d->big_lock));
2701 adjust_guest_l1e(nl1e, d);
2703 pl1e = guest_map_l1e(v, va, &gl1mfn);
2704 if ( !pl1e )
2706 MEM_LOG("Could not find L1 PTE for address %lx", va);
2707 return GNTST_general_error;
2709 ol1e = *pl1e;
2710 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v);
2711 guest_unmap_l1e(v, pl1e);
2712 pl1e = NULL;
2714 if ( !okay )
2715 return GNTST_general_error;
2717 if ( !paging_mode_refcounts(d) )
2718 put_page_from_l1e(ol1e, d);
2720 return GNTST_okay;
2723 static int replace_grant_va_mapping(
2724 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
2726 l1_pgentry_t *pl1e, ol1e;
2727 unsigned long gl1mfn;
2728 int rc = 0;
2730 pl1e = guest_map_l1e(v, addr, &gl1mfn);
2731 if ( !pl1e )
2733 MEM_LOG("Could not find L1 PTE for address %lx", addr);
2734 return GNTST_general_error;
2736 ol1e = *pl1e;
2738 /* Check that the virtual address supplied is actually mapped to frame. */
2739 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2741 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2742 l1e_get_pfn(ol1e), addr, frame);
2743 rc = GNTST_general_error;
2744 goto out;
2747 /* Delete pagetable entry. */
2748 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v)) )
2750 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2751 rc = GNTST_general_error;
2752 goto out;
2755 out:
2756 guest_unmap_l1e(v, pl1e);
2757 return rc;
2760 static int destroy_grant_va_mapping(
2761 unsigned long addr, unsigned long frame, struct vcpu *v)
2763 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
2766 int create_grant_host_mapping(
2767 uint64_t addr, unsigned long frame, unsigned int flags)
2769 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2771 if ( (flags & GNTMAP_application_map) )
2772 l1e_add_flags(pte,_PAGE_USER);
2773 if ( !(flags & GNTMAP_readonly) )
2774 l1e_add_flags(pte,_PAGE_RW);
2776 if ( flags & GNTMAP_contains_pte )
2777 return create_grant_pte_mapping(addr, pte, current);
2778 return create_grant_va_mapping(addr, pte, current);
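/*
 * Illustrative sketch (not part of this file): the flag translation performed
 * by create_grant_host_mapping() above, where grant-map flags select whether
 * the resulting PTE is user-accessible and writable.  The EX_* bit values are
 * stand-ins chosen for the example, not the real GNTMAP_ or _PAGE_ constants.
 */
#include <stdint.h>

#define EX_GNTMAP_readonly         (1u << 0)
#define EX_GNTMAP_application_map  (1u << 1)

#define EX_PTE_RW    (1u << 1)
#define EX_PTE_USER  (1u << 2)

static uint32_t example_grant_flags_to_pte_flags(uint32_t base_flags,
                                                 unsigned int gntmap_flags)
{
    uint32_t pte_flags = base_flags;       /* mirrors GRANT_PTE_FLAGS */

    if ( gntmap_flags & EX_GNTMAP_application_map )
        pte_flags |= EX_PTE_USER;          /* guest userspace may use it */
    if ( !(gntmap_flags & EX_GNTMAP_readonly) )
        pte_flags |= EX_PTE_RW;            /* writable unless read-only */

    return pte_flags;
}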
2781 int replace_grant_host_mapping(
2782 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
2784 l1_pgentry_t *pl1e, ol1e;
2785 unsigned long gl1mfn;
2786 int rc;
2788 if ( flags & GNTMAP_contains_pte )
2790 if ( !new_addr )
2791 return destroy_grant_pte_mapping(addr, frame, current->domain);
2793 MEM_LOG("Unsupported grant table operation");
2794 return GNTST_general_error;
2797 if ( !new_addr )
2798 return destroy_grant_va_mapping(addr, frame, current);
2800 pl1e = guest_map_l1e(current, new_addr, &gl1mfn);
2801 if ( !pl1e )
2803 MEM_LOG("Could not find L1 PTE for address %lx",
2804 (unsigned long)new_addr);
2805 return GNTST_general_error;
2807 ol1e = *pl1e;
2809 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(),
2810 gl1mfn, current)) )
2812 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2813 guest_unmap_l1e(current, pl1e);
2814 return GNTST_general_error;
2817 guest_unmap_l1e(current, pl1e);
2819 rc = replace_grant_va_mapping(addr, frame, ol1e, current);
2820 if ( rc && !paging_mode_refcounts(current->domain) )
2821 put_page_from_l1e(ol1e, current->domain);
2823 return rc;
2826 int steal_page(
2827 struct domain *d, struct page_info *page, unsigned int memflags)
2829 u32 _d, _nd, x, y;
2831 spin_lock(&d->page_alloc_lock);
2833 /*
2834 * The tricky bit: atomically release ownership while there is just one
2835 * benign reference to the page (PGC_allocated). If that reference
2836 * disappears then the deallocation routine will safely spin.
2837 */
2838 _d = pickle_domptr(d);
2839 _nd = page->u.inuse._domain;
2840 y = page->count_info;
2841 do {
2842 x = y;
2843 if (unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2844 (1 | PGC_allocated)) || unlikely(_nd != _d)) {
2845 MEM_LOG("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2846 " caf=%08x, taf=%" PRtype_info "\n",
2847 (void *) page_to_mfn(page),
2848 d, d->domain_id, unpickle_domptr(_nd), x,
2849 page->u.inuse.type_info);
2850 spin_unlock(&d->page_alloc_lock);
2851 return -1;
2853 __asm__ __volatile__(
2854 LOCK_PREFIX "cmpxchg8b %2"
2855 : "=d" (_nd), "=a" (y),
2856 "=m" (*(volatile u64 *)(&page->count_info))
2857 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2858 } while (unlikely(_nd != _d) || unlikely(y != x));
2860 /*
2861 * Unlink from 'd'. At least one reference remains (now anonymous), so
2862 * no one else is spinning to try to delete this page from 'd'.
2863 */
2864 if ( !(memflags & MEMF_no_refcount) )
2865 d->tot_pages--;
2866 list_del(&page->list);
2868 spin_unlock(&d->page_alloc_lock);
2870 return 0;
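/*
 * Illustrative sketch (not part of this file): the "tricky bit" in
 * steal_page() above works because the owner pointer and the count field sit
 * next to each other, so one wide compare-and-swap can verify that the page
 * holds exactly one (PGC_allocated) reference and is still owned by 'd', and
 * atomically anonymise the owner while leaving that reference in place.
 * Here the pair is modelled as a single uint64_t (owner in the high half,
 * count in the low half) and GCC's __atomic builtin stands in for the inline
 * cmpxchg8b; the EX_* constants are invented for the example.
 */
#include <stdbool.h>
#include <stdint.h>

#define EX_PGC_allocated  (1u << 31)
#define EX_PGC_count_mask 0x7fffffffu

struct ex_page {
    uint64_t owner_and_count;   /* owner id in high 32 bits, count in low */
};

static bool example_steal_page(struct ex_page *pg, uint32_t owner_id)
{
    uint64_t old, new;

    old = __atomic_load_n(&pg->owner_and_count, __ATOMIC_RELAXED);
    do {
        uint32_t count = (uint32_t)old;
        if ( ((count & (EX_PGC_count_mask | EX_PGC_allocated)) !=
              (1 | EX_PGC_allocated)) ||
             ((uint32_t)(old >> 32) != owner_id) )
            return false;               /* extra refs or not ours: bail out */
        /* Keep the count bits unchanged; set the owner to 0 (anonymous). */
        new = old & 0xffffffffull;
    } while ( !__atomic_compare_exchange_n(&pg->owner_and_count, &old, new,
                                           false, __ATOMIC_SEQ_CST,
                                           __ATOMIC_SEQ_CST) );
    return true;
}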
2873 int do_update_va_mapping(unsigned long va, u64 val64,
2874 unsigned long flags)
2876 l1_pgentry_t val = l1e_from_intpte(val64);
2877 struct vcpu *v = current;
2878 struct domain *d = v->domain;
2879 l1_pgentry_t *pl1e;
2880 unsigned long vmask, bmap_ptr, gl1mfn;
2881 cpumask_t pmask;
2882 int rc = 0;
2884 perfc_incr(calls_to_update_va);
2886 if ( unlikely(!__addr_ok(va) && !paging_mode_external(d)) )
2887 return -EINVAL;
2889 rc = xsm_update_va_mapping(current->domain, val);
2890 if ( rc )
2891 return rc;
2893 LOCK_BIGLOCK(d);
2895 pl1e = guest_map_l1e(v, va, &gl1mfn);
2897 if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn)) )
2898 rc = -EINVAL;
2900 if ( pl1e )
2901 guest_unmap_l1e(v, pl1e);
2902 pl1e = NULL;
2904 process_deferred_ops();
2906 UNLOCK_BIGLOCK(d);
2908 switch ( flags & UVMF_FLUSHTYPE_MASK )
2910 case UVMF_TLB_FLUSH:
2911 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2913 case UVMF_LOCAL:
2914 flush_tlb_local();
2915 break;
2916 case UVMF_ALL:
2917 flush_tlb_mask(d->domain_dirty_cpumask);
2918 break;
2919 default:
2920 if ( unlikely(!is_pv_32on64_domain(d) ?
2921 get_user(vmask, (unsigned long *)bmap_ptr) :
2922 get_user(vmask, (unsigned int *)bmap_ptr)) )
2923 rc = -EFAULT;
2924 pmask = vcpumask_to_pcpumask(d, vmask);
2925 flush_tlb_mask(pmask);
2926 break;
2928 break;
2930 case UVMF_INVLPG:
2931 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2933 case UVMF_LOCAL:
2934 if ( !paging_mode_enabled(d)
2935 || (paging_invlpg(current, va) != 0) )
2936 flush_tlb_one_local(va);
2937 break;
2938 case UVMF_ALL:
2939 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
2940 break;
2941 default:
2942 if ( unlikely(!is_pv_32on64_domain(d) ?
2943 get_user(vmask, (unsigned long *)bmap_ptr) :
2944 get_user(vmask, (unsigned int *)bmap_ptr)) )
2945 rc = -EFAULT;
2946 pmask = vcpumask_to_pcpumask(d, vmask);
2947 flush_tlb_one_mask(pmask, va);
2948 break;
2950 break;
2953 return rc;
2956 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
2957 unsigned long flags,
2958 domid_t domid)
2960 int rc;
2962 if ( unlikely(!IS_PRIV(current->domain)) )
2963 return -EPERM;
2965 if ( !set_foreigndom(domid) )
2966 return -ESRCH;
2968 rc = do_update_va_mapping(va, val64, flags);
2970 BUG_ON(this_cpu(percpu_mm_info).deferred_ops);
2971 process_deferred_ops(); /* only to clear foreigndom */
2973 return rc;
2978 /*************************
2979 * Descriptor Tables
2980 */
2982 void destroy_gdt(struct vcpu *v)
2984 int i;
2985 unsigned long pfn;
2987 v->arch.guest_context.gdt_ents = 0;
2988 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
2990 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
2991 put_page_and_type(mfn_to_page(pfn));
2992 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
2993 v->arch.guest_context.gdt_frames[i] = 0;
2998 long set_gdt(struct vcpu *v,
2999 unsigned long *frames,
3000 unsigned int entries)
3002 struct domain *d = v->domain;
3003 /* NB. There are 512 8-byte entries per GDT page. */
3004 int i, nr_pages = (entries + 511) / 512;
3005 unsigned long mfn;
3007 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3008 return -EINVAL;
3010 /* Check the pages in the new GDT. */
3011 for ( i = 0; i < nr_pages; i++ ) {
3012 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
3013 if ( !mfn_valid(mfn) ||
3014 !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) )
3015 goto fail;
3018 /* Tear down the old GDT. */
3019 destroy_gdt(v);
3021 /* Install the new GDT. */
3022 v->arch.guest_context.gdt_ents = entries;
3023 for ( i = 0; i < nr_pages; i++ )
3025 v->arch.guest_context.gdt_frames[i] = frames[i];
3026 l1e_write(&v->arch.perdomain_ptes[i],
3027 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
3030 return 0;
3032 fail:
3033 while ( i-- > 0 )
3034 put_page_and_type(mfn_to_page(frames[i]));
3035 return -EINVAL;
3039 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
3041 int nr_pages = (entries + 511) / 512;
3042 unsigned long frames[16];
3043 long ret;
3045 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
3046 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3047 return -EINVAL;
3049 if ( copy_from_guest(frames, frame_list, nr_pages) )
3050 return -EFAULT;
3052 LOCK_BIGLOCK(current->domain);
3054 if ( (ret = set_gdt(current, frames, entries)) == 0 )
3055 flush_tlb_local();
3057 UNLOCK_BIGLOCK(current->domain);
3059 return ret;
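/*
 * Illustrative sketch (not part of this file): the sizing arithmetic used by
 * set_gdt()/do_set_gdt() above.  With 8-byte descriptors and 4kB pages there
 * are 512 entries per GDT page, so the frame count is a round-up division;
 * the entry total is separately bounded by FIRST_RESERVED_GDT_ENTRY in the
 * real code.
 */
#define EX_ENTRIES_PER_GDT_PAGE 512u

static unsigned int example_gdt_pages(unsigned int entries)
{
    /* Round up: 1..512 entries -> 1 page, 513 -> 2 pages, and so on. */
    return (entries + EX_ENTRIES_PER_GDT_PAGE - 1) / EX_ENTRIES_PER_GDT_PAGE;
}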
3063 long do_update_descriptor(u64 pa, u64 desc)
3065 struct domain *dom = current->domain;
3066 unsigned long gmfn = pa >> PAGE_SHIFT;
3067 unsigned long mfn;
3068 unsigned int offset;
3069 struct desc_struct *gdt_pent, d;
3070 struct page_info *page;
3071 long ret = -EINVAL;
3073 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
3075 *(u64 *)&d = desc;
3077 LOCK_BIGLOCK(dom);
3079 mfn = gmfn_to_mfn(dom, gmfn);
3080 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
3081 !mfn_valid(mfn) ||
3082 !check_descriptor(dom, &d) )
3084 UNLOCK_BIGLOCK(dom);
3085 return -EINVAL;
3088 page = mfn_to_page(mfn);
3089 if ( unlikely(!get_page(page, dom)) )
3091 UNLOCK_BIGLOCK(dom);
3092 return -EINVAL;
3095 /* Check if the given frame is in use in an unsafe context. */
3096 switch ( page->u.inuse.type_info & PGT_type_mask )
3098 case PGT_gdt_page:
3099 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
3100 goto out;
3101 break;
3102 case PGT_ldt_page:
3103 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
3104 goto out;
3105 break;
3106 default:
3107 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
3108 goto out;
3109 break;
3112 paging_mark_dirty(dom, mfn);
3114 /* All is good so make the update. */
3115 gdt_pent = map_domain_page(mfn);
3116 memcpy(&gdt_pent[offset], &d, 8);
3117 unmap_domain_page(gdt_pent);
3119 put_page_type(page);
3121 ret = 0; /* success */
3123 out:
3124 put_page(page);
3126 UNLOCK_BIGLOCK(dom);
3128 return ret;
3131 typedef struct e820entry e820entry_t;
3132 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
3134 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
3136 switch ( op )
3138 case XENMEM_add_to_physmap:
3140 struct xen_add_to_physmap xatp;
3141 unsigned long prev_mfn, mfn = 0, gpfn;
3142 struct domain *d;
3144 if ( copy_from_guest(&xatp, arg, 1) )
3145 return -EFAULT;
3147 if ( xatp.domid == DOMID_SELF )
3148 d = rcu_lock_current_domain();
3149 else if ( !IS_PRIV(current->domain) )
3150 return -EPERM;
3151 else if ( (d = rcu_lock_domain_by_id(xatp.domid)) == NULL )
3152 return -ESRCH;
3154 if ( xsm_add_to_physmap(current->domain, d) )
3156 rcu_unlock_domain(d);
3157 return -EPERM;
3160 switch ( xatp.space )
3162 case XENMAPSPACE_shared_info:
3163 if ( xatp.idx == 0 )
3164 mfn = virt_to_mfn(d->shared_info);
3165 /* XXX: assumption here: this is called after the E820 table is built;
3166 * we need the E820 to initialize the MTRRs.
3167 */
3168 if ( is_hvm_domain(d) ) {
3169 extern void init_mtrr_in_hyper(struct vcpu *);
3170 struct vcpu *vs;
3171 for_each_vcpu(d, vs)
3172 init_mtrr_in_hyper(vs);
3174 break;
3175 case XENMAPSPACE_grant_table:
3176 spin_lock(&d->grant_table->lock);
3178 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
3179 (xatp.idx < max_nr_grant_frames) )
3180 gnttab_grow_table(d, xatp.idx + 1);
3182 if ( xatp.idx < nr_grant_frames(d->grant_table) )
3183 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
3185 spin_unlock(&d->grant_table->lock);
3186 break;
3187 default:
3188 break;
3191 if ( !paging_mode_translate(d) || (mfn == 0) )
3193 rcu_unlock_domain(d);
3194 return -EINVAL;
3197 LOCK_BIGLOCK(d);
3199 /* Remove previously mapped page if it was present. */
3200 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3201 if ( mfn_valid(prev_mfn) )
3203 if ( is_xen_heap_frame(mfn_to_page(prev_mfn)) )
3204 /* Xen heap frames are simply unhooked from this phys slot. */
3205 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
3206 else
3207 /* Normal domain memory is freed, to avoid leaking memory. */
3208 guest_remove_page(d, xatp.gpfn);
3211 /* Unmap from old location, if any. */
3212 gpfn = get_gpfn_from_mfn(mfn);
3213 if ( gpfn != INVALID_M2P_ENTRY )
3214 guest_physmap_remove_page(d, gpfn, mfn);
3216 /* Map at new location. */
3217 guest_physmap_add_page(d, xatp.gpfn, mfn);
3219 UNLOCK_BIGLOCK(d);
3221 rcu_unlock_domain(d);
3223 break;
3226 case XENMEM_set_memory_map:
3228 struct xen_foreign_memory_map fmap;
3229 struct domain *d;
3230 int rc;
3232 if ( copy_from_guest(&fmap, arg, 1) )
3233 return -EFAULT;
3235 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
3236 return -EINVAL;
3238 if ( fmap.domid == DOMID_SELF )
3239 d = rcu_lock_current_domain();
3240 else if ( !IS_PRIV(current->domain) )
3241 return -EPERM;
3242 else if ( (d = rcu_lock_domain_by_id(fmap.domid)) == NULL )
3243 return -ESRCH;
3245 rc = xsm_domain_memory_map(d);
3246 if ( rc )
3248 rcu_unlock_domain(d);
3249 return rc;
3252 rc = copy_from_guest(d->arch.e820, fmap.map.buffer,
3253 fmap.map.nr_entries) ? -EFAULT : 0;
3254 d->arch.nr_e820 = fmap.map.nr_entries;
3256 rcu_unlock_domain(d);
3257 return rc;
3260 case XENMEM_memory_map:
3262 struct xen_memory_map map;
3263 struct domain *d = current->domain;
3265 /* Backwards compatibility. */
3266 if ( d->arch.nr_e820 == 0 )
3267 return -ENOSYS;
3269 if ( copy_from_guest(&map, arg, 1) )
3270 return -EFAULT;
3272 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
3273 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
3274 copy_to_guest(arg, &map, 1) )
3275 return -EFAULT;
3277 return 0;
3280 case XENMEM_machine_memory_map:
3282 struct xen_memory_map memmap;
3283 XEN_GUEST_HANDLE(e820entry_t) buffer;
3284 int count;
3285 int rc;
3287 if ( !IS_PRIV(current->domain) )
3288 return -EINVAL;
3290 rc = xsm_machine_memory_map();
3291 if ( rc )
3292 return rc;
3294 if ( copy_from_guest(&memmap, arg, 1) )
3295 return -EFAULT;
3296 if ( memmap.nr_entries < e820.nr_map + 1 )
3297 return -EINVAL;
3299 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
3301 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
3302 if ( copy_to_guest(buffer, e820.map, count) < 0 )
3303 return -EFAULT;
3305 memmap.nr_entries = count;
3307 if ( copy_to_guest(arg, &memmap, 1) )
3308 return -EFAULT;
3310 return 0;
3313 case XENMEM_machphys_mapping:
3315 static const struct xen_machphys_mapping mapping = {
3316 .v_start = MACH2PHYS_VIRT_START,
3317 .v_end = MACH2PHYS_VIRT_END,
3318 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
3319 };
3321 if ( copy_to_guest(arg, &mapping, 1) )
3322 return -EFAULT;
3324 return 0;
3327 default:
3328 return subarch_memory_op(op, arg);
3331 return 0;
3335 /*************************
3336 * Writable Pagetables
3337 */
3339 struct ptwr_emulate_ctxt {
3340 struct x86_emulate_ctxt ctxt;
3341 unsigned long cr2;
3342 l1_pgentry_t pte;
3343 };
3345 static int ptwr_emulated_read(
3346 enum x86_segment seg,
3347 unsigned long offset,
3348 unsigned long *val,
3349 unsigned int bytes,
3350 struct x86_emulate_ctxt *ctxt)
3352 unsigned int rc;
3353 unsigned long addr = offset;
3355 *val = 0;
3356 if ( (rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0 )
3358 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
3359 return X86EMUL_EXCEPTION;
3362 return X86EMUL_OKAY;
3365 static int ptwr_emulated_update(
3366 unsigned long addr,
3367 paddr_t old,
3368 paddr_t val,
3369 unsigned int bytes,
3370 unsigned int do_cmpxchg,
3371 struct ptwr_emulate_ctxt *ptwr_ctxt)
3373 unsigned long mfn;
3374 unsigned long unaligned_addr = addr;
3375 struct page_info *page;
3376 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3377 struct vcpu *v = current;
3378 struct domain *d = v->domain;
3380 /* Only allow naturally-aligned stores within the original %cr2 page. */
3381 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
3383 MEM_LOG("Bad ptwr access (cr2=%lx, addr=%lx, bytes=%u)",
3384 ptwr_ctxt->cr2, addr, bytes);
3385 return X86EMUL_UNHANDLEABLE;
3388 /* Turn a sub-word access into a full-word access. */
3389 if ( bytes != sizeof(paddr_t) )
3391 paddr_t full;
3392 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
3394 /* Align address; read full word. */
3395 addr &= ~(sizeof(paddr_t)-1);
3396 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
3398 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
3399 return X86EMUL_EXCEPTION;
3401 /* Mask out bits provided by caller. */
3402 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3403 /* Shift the caller value and OR in the missing bits. */
3404 val &= (((paddr_t)1 << (bytes*8)) - 1);
3405 val <<= (offset)*8;
3406 val |= full;
3407 /* Also fill in missing parts of the cmpxchg old value. */
3408 old &= (((paddr_t)1 << (bytes*8)) - 1);
3409 old <<= (offset)*8;
3410 old |= full;
3413 pte = ptwr_ctxt->pte;
3414 mfn = l1e_get_pfn(pte);
3415 page = mfn_to_page(mfn);
3417 /* We are looking only for read-only mappings of p.t. pages. */
3418 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3419 ASSERT(mfn_valid(mfn));
3420 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3421 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3422 ASSERT(page_get_owner(page) == d);
3424 /* Check the new PTE. */
3425 nl1e = l1e_from_intpte(val);
3426 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3428 if ( (CONFIG_PAGING_LEVELS >= 3) && is_pv_32bit_domain(d) &&
3429 (bytes == 4) && (unaligned_addr & 4) && !do_cmpxchg &&
3430 (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3432 /*
3433 * If this is an upper-half write to a PAE PTE then we assume that
3434 * the guest has simply got the two writes the wrong way round. We
3435 * zap the PRESENT bit on the assumption that the bottom half will
3436 * be written immediately after we return to the guest.
3437 */
3438 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
3439 l1e_get_intpte(nl1e));
3440 l1e_remove_flags(nl1e, _PAGE_PRESENT);
3442 else
3444 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3445 return X86EMUL_UNHANDLEABLE;
3449 adjust_guest_l1e(nl1e, d);
3451 /* Checked successfully: do the update (write or cmpxchg). */
3452 pl1e = map_domain_page(mfn);
3453 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3454 if ( do_cmpxchg )
3456 int okay;
3457 intpte_t t = old;
3458 ol1e = l1e_from_intpte(old);
3460 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
3461 &t, l1e_get_intpte(nl1e), _mfn(mfn));
3462 okay = (okay && t == old);
3464 if ( !okay )
3466 unmap_domain_page(pl1e);
3467 put_page_from_l1e(nl1e, d);
3468 return X86EMUL_CMPXCHG_FAILED;
3471 else
3473 ol1e = *pl1e;
3474 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v) )
3475 BUG();
3478 trace_ptwr_emulation(addr, nl1e);
3480 unmap_domain_page(pl1e);
3482 /* Finally, drop the old PTE. */
3483 put_page_from_l1e(ol1e, d);
3485 return X86EMUL_OKAY;
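/*
 * Illustrative sketch (not part of this file): the sub-word widening done at
 * the top of ptwr_emulated_update() above.  A 1-, 2- or 4-byte guest write is
 * merged into the full PTE word by masking out the bytes the guest supplied
 * and or-ing in its shifted value; this is what lets a 4-byte write to half
 * of a PAE PTE be emulated as a full 8-byte update.  Standalone example with
 * uint64_t standing in for paddr_t/intpte_t.
 */
#include <stdint.h>

static uint64_t example_merge_partial_write(uint64_t full_word,
                                            uint64_t guest_val,
                                            unsigned int bytes,
                                            unsigned int byte_offset)
{
    /* Mask covering only the bytes the guest actually wrote. */
    uint64_t mask = ((bytes < 8) ? (((uint64_t)1 << (bytes * 8)) - 1)
                                 : ~(uint64_t)0) << (byte_offset * 8);

    /* Shift the caller's value into place and drop anything outside it. */
    guest_val = (guest_val << (byte_offset * 8)) & mask;

    /* Keep the untouched bytes from the existing word, insert the new ones. */
    return (full_word & ~mask) | guest_val;
}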
3488 static int ptwr_emulated_write(
3489 enum x86_segment seg,
3490 unsigned long offset,
3491 unsigned long val,
3492 unsigned int bytes,
3493 struct x86_emulate_ctxt *ctxt)
3495 return ptwr_emulated_update(
3496 offset, 0, val, bytes, 0,
3497 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3500 static int ptwr_emulated_cmpxchg(
3501 enum x86_segment seg,
3502 unsigned long offset,
3503 unsigned long old,
3504 unsigned long new,
3505 unsigned int bytes,
3506 struct x86_emulate_ctxt *ctxt)
3508 return ptwr_emulated_update(
3509 offset, old, new, bytes, 1,
3510 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3513 static int ptwr_emulated_cmpxchg8b(
3514 enum x86_segment seg,
3515 unsigned long offset,
3516 unsigned long old,
3517 unsigned long old_hi,
3518 unsigned long new,
3519 unsigned long new_hi,
3520 struct x86_emulate_ctxt *ctxt)
3522 if ( CONFIG_PAGING_LEVELS == 2 )
3523 return X86EMUL_UNHANDLEABLE;
3524 return ptwr_emulated_update(
3525 offset, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1,
3526 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3529 static struct x86_emulate_ops ptwr_emulate_ops = {
3530 .read = ptwr_emulated_read,
3531 .insn_fetch = ptwr_emulated_read,
3532 .write = ptwr_emulated_write,
3533 .cmpxchg = ptwr_emulated_cmpxchg,
3534 .cmpxchg8b = ptwr_emulated_cmpxchg8b
3535 };
3537 /* Write page fault handler: check if guest is trying to modify a PTE. */
3538 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
3539 struct cpu_user_regs *regs)
3541 struct domain *d = v->domain;
3542 struct page_info *page;
3543 l1_pgentry_t pte;
3544 struct ptwr_emulate_ctxt ptwr_ctxt;
3545 int rc;
3547 LOCK_BIGLOCK(d);
3549 /* Attempt to read the PTE that maps the VA being accessed. */
3550 guest_get_eff_l1e(v, addr, &pte);
3551 page = l1e_get_page(pte);
3553 /* We are looking only for read-only mappings of p.t. pages. */
3554 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
3555 !mfn_valid(l1e_get_pfn(pte)) ||
3556 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3557 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3558 (page_get_owner(page) != d) )
3559 goto bail;
3561 ptwr_ctxt.ctxt.regs = regs;
3562 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
3563 is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
3564 ptwr_ctxt.cr2 = addr;
3565 ptwr_ctxt.pte = pte;
3567 rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
3568 if ( rc == X86EMUL_UNHANDLEABLE )
3569 goto bail;
3571 UNLOCK_BIGLOCK(d);
3572 perfc_incr(ptwr_emulations);
3573 return EXCRET_fault_fixed;
3575 bail:
3576 UNLOCK_BIGLOCK(d);
3577 return 0;
3580 void free_xen_pagetable(void *v)
3582 extern int early_boot;
3584 BUG_ON(early_boot);
3586 if ( is_xen_heap_frame(virt_to_page(v)) )
3587 free_xenheap_page(v);
3588 else
3589 free_domheap_page(virt_to_page(v));
3592 /* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
3593 #define l1f_to_l2f(f) ((f) | _PAGE_PSE)
3594 #define l2f_to_l1f(f) ((f) & ~_PAGE_PSE)
3596 /*
3597 * map_pages_to_xen() can be called with interrupts disabled:
3598 * * During early bootstrap; or
3599 * * alloc_xenheap_pages() via memguard_guard_range
3600 * In these cases it is safe to use flush_area_local():
3601 * * Because only the local CPU is online; or
3602 * * Because stale TLB entries do not matter for memguard_[un]guard_range().
3603 */
3604 #define flush_area(v,f) (!local_irq_is_enabled() ? \
3605 flush_area_local((const void *)v, f) : \
3606 flush_area_all((const void *)v, f))
3608 int map_pages_to_xen(
3609 unsigned long virt,
3610 unsigned long mfn,
3611 unsigned long nr_mfns,
3612 unsigned int flags)
3614 l2_pgentry_t *pl2e, ol2e;
3615 l1_pgentry_t *pl1e, ol1e;
3616 unsigned int i;
3618 while ( nr_mfns != 0 )
3620 pl2e = virt_to_xen_l2e(virt);
3622 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3623 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
3624 !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
3626 /* Super-page mapping. */
3627 ol2e = *pl2e;
3628 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_l2f(flags)));
3630 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3632 unsigned int flush_flags =
3633 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
3635 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
3637 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
3638 flush_flags |= FLUSH_TLB_GLOBAL;
3639 if ( (l2e_get_flags(ol2e) ^ l1f_to_l2f(flags)) &
3640 l1f_to_l2f(PAGE_CACHE_ATTRS) )
3641 flush_flags |= FLUSH_CACHE;
3642 flush_area(virt, flush_flags);
3644 else
3646 pl1e = l2e_to_l1e(ol2e);
3647 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3649 if ( l1e_get_flags(pl1e[i]) & _PAGE_GLOBAL )
3650 flush_flags |= FLUSH_TLB_GLOBAL;
3651 if ( (l1e_get_flags(pl1e[i]) ^ flags) &
3652 PAGE_CACHE_ATTRS )
3653 flush_flags |= FLUSH_CACHE;
3655 flush_area(virt, flush_flags);
3656 free_xen_pagetable(pl1e);
3660 virt += 1UL << L2_PAGETABLE_SHIFT;
3661 mfn += 1UL << PAGETABLE_ORDER;
3662 nr_mfns -= 1UL << PAGETABLE_ORDER;
3664 else
3666 /* Normal page mapping. */
3667 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3669 pl1e = alloc_xen_pagetable();
3670 if ( pl1e == NULL )
3671 return -ENOMEM;
3672 clear_page(pl1e);
3673 l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3674 __PAGE_HYPERVISOR));
3676 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3678 unsigned int flush_flags =
3679 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
3681 /* Skip this PTE if there is no change. */
3682 if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
3683 l1_table_offset(virt)) == mfn) &&
3684 (((l2f_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
3685 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
3687 virt += 1UL << L1_PAGETABLE_SHIFT;
3688 mfn += 1UL;
3689 nr_mfns -= 1UL;
3690 continue;
3693 pl1e = alloc_xen_pagetable();
3694 if ( pl1e == NULL )
3695 return -ENOMEM;
3697 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3698 l1e_write(&pl1e[i],
3699 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
3700 l2f_to_l1f(l2e_get_flags(*pl2e))));
3702 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
3703 flush_flags |= FLUSH_TLB_GLOBAL;
3705 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3706 __PAGE_HYPERVISOR));
3707 flush_area(virt, flush_flags);
3710 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3711 ol1e = *pl1e;
3712 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
3713 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3715 unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
3716 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
3717 flush_flags |= FLUSH_TLB_GLOBAL;
3718 if ( (l1e_get_flags(ol1e) ^ flags) & PAGE_CACHE_ATTRS )
3719 flush_flags |= FLUSH_CACHE;
3720 flush_area(virt, flush_flags);
3723 virt += 1UL << L1_PAGETABLE_SHIFT;
3724 mfn += 1UL;
3725 nr_mfns -= 1UL;
3727 if ( (flags == PAGE_HYPERVISOR) &&
3728 ((nr_mfns == 0) ||
3729 ((((virt >> PAGE_SHIFT) | mfn) &
3730 ((1 << PAGETABLE_ORDER) - 1)) == 0)) )
3732 unsigned long base_mfn;
3733 pl1e = l2e_to_l1e(*pl2e);
3734 base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
3735 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
3736 if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
3737 (l1e_get_flags(*pl1e) != flags) )
3738 break;
3739 if ( i == L1_PAGETABLE_ENTRIES )
3741 ol2e = *pl2e;
3742 l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
3743 l1f_to_l2f(flags)));
3744 flush_area(virt, (FLUSH_TLB_GLOBAL |
3745 FLUSH_ORDER(PAGETABLE_ORDER)));
3746 free_xen_pagetable(l2e_to_l1e(ol2e));
3752 return 0;
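/*
 * Illustrative sketch (not part of this file): the superpage test at the top
 * of map_pages_to_xen() above.  A run can be mapped with a single PSE entry
 * only if the virtual address and the mfn are both superpage-aligned and at
 * least a superpage's worth of frames remains.  The constants below assume
 * 4kB pages and 512-entry L1 tables (PAGETABLE_ORDER of 9), matching the
 * shifts used in the code above.
 */
#include <stdbool.h>

#define EX_PAGE_SHIFT       12
#define EX_PAGETABLE_ORDER   9

static bool example_can_use_superpage(unsigned long virt, unsigned long mfn,
                                      unsigned long nr_mfns)
{
    unsigned long align_mask = (1UL << EX_PAGETABLE_ORDER) - 1;

    /* Both the pfn of 'virt' and 'mfn' must share the same alignment... */
    return ((((virt >> EX_PAGE_SHIFT) | mfn) & align_mask) == 0) &&
           /* ...and there must be enough frames left to fill the superpage. */
           (nr_mfns >= (1UL << EX_PAGETABLE_ORDER));
}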
3755 void destroy_xen_mappings(unsigned long s, unsigned long e)
3757 l2_pgentry_t *pl2e;
3758 l1_pgentry_t *pl1e;
3759 unsigned int i;
3760 unsigned long v = s;
3762 ASSERT((s & ~PAGE_MASK) == 0);
3763 ASSERT((e & ~PAGE_MASK) == 0);
3765 while ( v < e )
3767 pl2e = virt_to_xen_l2e(v);
3769 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3771 v += 1UL << L2_PAGETABLE_SHIFT;
3772 v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
3773 continue;
3776 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3778 if ( (l1_table_offset(v) == 0) &&
3779 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
3781 /* PSE: whole superpage is destroyed. */
3782 l2e_write_atomic(pl2e, l2e_empty());
3783 v += 1UL << L2_PAGETABLE_SHIFT;
3785 else
3787 /* PSE: shatter the superpage and try again. */
3788 pl1e = alloc_xen_pagetable();
3789 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3790 l1e_write(&pl1e[i],
3791 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
3792 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
3793 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3794 __PAGE_HYPERVISOR));
3797 else
3799 /* Ordinary 4kB mapping. */
3800 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
3801 l1e_write_atomic(pl1e, l1e_empty());
3802 v += PAGE_SIZE;
3804 /* If we are done with the L2E, check if it is now empty. */
3805 if ( (v != e) && (l1_table_offset(v) != 0) )
3806 continue;
3807 pl1e = l2e_to_l1e(*pl2e);
3808 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3809 if ( l1e_get_intpte(pl1e[i]) != 0 )
3810 break;
3811 if ( i == L1_PAGETABLE_ENTRIES )
3813 /* Empty: zap the L2E and free the L1 page. */
3814 l2e_write_atomic(pl2e, l2e_empty());
3815 flush_all(FLUSH_TLB_GLOBAL); /* flush before free */
3816 free_xen_pagetable(pl1e);
3821 flush_all(FLUSH_TLB_GLOBAL);
3824 void __set_fixmap(
3825 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
3827 BUG_ON(idx >= __end_of_fixed_addresses);
3828 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
3831 #ifdef MEMORY_GUARD
3833 void memguard_init(void)
3835 map_pages_to_xen(
3836 (unsigned long)__va(xen_phys_start),
3837 xen_phys_start >> PAGE_SHIFT,
3838 (xenheap_phys_end - xen_phys_start) >> PAGE_SHIFT,
3839 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3840 #ifdef __x86_64__
3841 map_pages_to_xen(
3842 XEN_VIRT_START,
3843 xen_phys_start >> PAGE_SHIFT,
3844 (__pa(&_end) + PAGE_SIZE - 1 - xen_phys_start) >> PAGE_SHIFT,
3845 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3846 #endif
3849 static void __memguard_change_range(void *p, unsigned long l, int guard)
3851 unsigned long _p = (unsigned long)p;
3852 unsigned long _l = (unsigned long)l;
3853 unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3855 /* Ensure we are dealing with a page-aligned whole number of pages. */
3856 ASSERT((_p&~PAGE_MASK) == 0);
3857 ASSERT((_l&~PAGE_MASK) == 0);
3859 if ( guard )
3860 flags &= ~_PAGE_PRESENT;
3862 map_pages_to_xen(
3863 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3866 void memguard_guard_range(void *p, unsigned long l)
3868 __memguard_change_range(p, l, 1);
3871 void memguard_unguard_range(void *p, unsigned long l)
3873 __memguard_change_range(p, l, 0);
3876 #endif
3878 void memguard_guard_stack(void *p)
3880 BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
3881 p = (void *)((unsigned long)p + STACK_SIZE -
3882 PRIMARY_STACK_SIZE - PAGE_SIZE);
3883 memguard_guard_range(p, PAGE_SIZE);
3886 /*
3887 * Local variables:
3888 * mode: C
3889 * c-set-style: "BSD"
3890 * c-basic-offset: 4
3891 * tab-width: 4
3892 * indent-tabs-mode: nil
3893 * End:
3894 */