
view xen/arch/x86/mm.c @ 10954:43ff88825b1a

[XEN] Allow add_to_physmap to be applied to DOMID_SELF.
Also sanitise handling of existing mappings of Xen heap
frames.
Signed-off-by: Steven Smith <ssmith@xensource.com>
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Thu Aug 03 14:20:45 2006 +0100 (2006-08-03)
parents 37f206c7405a
children 7e9699af7e12
line source
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame need not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, the OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
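/*
 * Illustrative guest-side sketch (not part of this file): how a paravirtual
 * guest typically drives the (ptr, val) interface described above.  It uses
 * the public mmu_update_t layout and MMU_NORMAL_PT_UPDATE from
 * <public/xen.h>; HYPERVISOR_mmu_update() is the guest's hypercall wrapper
 * (not defined here), and pte_maddr/new_pte are hypothetical values chosen
 * by the guest.  Pinning a top-level table goes through the companion
 * do_mmuext_op() hypercall (MMUEXT_PIN_*_TABLE).
 */
#if 0 /* example only -- this is guest code, not hypervisor code */
static int example_set_pte(uint64_t pte_maddr, uint64_t new_pte)
{
    struct mmu_update req;
    req.ptr = pte_maddr | MMU_NORMAL_PT_UPDATE; /* requested op is "*ptr = val" */
    req.val = new_pte;
    /* One request, no per-request success count, act on our own page tables. */
    return HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
}
#endif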
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/shadow.h>
103 #include <asm/page.h>
104 #include <asm/flushtlb.h>
105 #include <asm/io.h>
106 #include <asm/ldt.h>
107 #include <asm/x86_emulate.h>
108 #include <public/memory.h>
110 #ifdef VERBOSE
111 #define MEM_LOG(_f, _a...) \
112 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
113 current->domain->domain_id , __LINE__ , ## _a )
114 #else
115 #define MEM_LOG(_f, _a...) ((void)0)
116 #endif
118 /*
119 * PTE updates can be done with ordinary writes except:
120 * 1. Debug builds get extra checking by using CMPXCHG[8B].
121 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
122 */
123 #if !defined(NDEBUG) || defined(CONFIG_X86_PAE)
124 #define PTE_UPDATE_WITH_CMPXCHG
125 #endif
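/*
 * Illustrative note (not in the original source): on PAE a PTE is 64 bits
 * wide, but a plain 64-bit assignment compiles to two 32-bit stores on i386,
 * so a present PTE could briefly be visible to the hardware page walker in a
 * half-updated state.  CMPXCHG8B writes all eight bytes atomically; debug
 * builds additionally use the compare to assert that nothing but the
 * Accessed/Dirty bits changed underneath us (see update_l1e() below).
 */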
127 /*
128 * Both do_mmuext_op() and do_mmu_update():
129 * We steal the m.s.b. of the @count parameter to indicate whether this
130 * invocation of do_mmu_update() is resuming a previously preempted call.
131 */
132 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
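/*
 * Illustrative note (not in the original source): on preemption, do_mmuext_op()
 * below re-queues itself via
 * hypercall_create_continuation(..., (count - i) | MMU_UPDATE_PREEMPTED, ...);
 * the resumed invocation detects the flag, clears it with
 * count &= ~MMU_UPDATE_PREEMPTED, and reloads the running total from @pdone.
 * do_mmu_update() follows the same pattern.
 */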
134 static void free_l2_table(struct page_info *page);
135 static void free_l1_table(struct page_info *page);
137 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
138 unsigned long type);
139 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
141 /* Used to defer flushing of memory structures. */
142 static struct {
143 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
144 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
145 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
146 unsigned int deferred_ops;
147 /* If non-NULL, specifies a foreign subject domain for some operations. */
148 struct domain *foreign;
149 } __cacheline_aligned percpu_info[NR_CPUS];
151 /*
152 * Returns the current foreign domain; defaults to the currently-executing
153 * domain if a foreign override hasn't been specified.
154 */
155 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ?: current->domain)
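/*
 * Illustrative note (not in the original source): the GNU "a ?: b" form above
 * evaluates to the per-CPU foreign pointer when it is non-NULL, and to
 * current->domain otherwise, without evaluating the first operand twice.
 */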
157 /* Private domain structs for DOMID_XEN and DOMID_IO. */
158 static struct domain *dom_xen, *dom_io;
160 /* Frame table and its size in pages. */
161 struct page_info *frame_table;
162 unsigned long max_page;
163 unsigned long total_pages;
165 void __init init_frametable(void)
166 {
167 unsigned long nr_pages, page_step, i, mfn;
169 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
171 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
172 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
174 for ( i = 0; i < nr_pages; i += page_step )
175 {
176 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
177 if ( mfn == 0 )
178 panic("Not enough memory for frame table\n");
179 map_pages_to_xen(
180 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
181 mfn, page_step, PAGE_HYPERVISOR);
182 }
184 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
185 }
187 void arch_init_memory(void)
188 {
189 extern void subarch_init_memory(void);
191 unsigned long i, pfn, rstart_pfn, rend_pfn;
193 memset(percpu_info, 0, sizeof(percpu_info));
195 /*
196 * Initialise our DOMID_XEN domain.
197 * Any Xen-heap pages that we will allow to be mapped will have
198 * their domain field set to dom_xen.
199 */
200 dom_xen = alloc_domain(DOMID_XEN);
201 BUG_ON(dom_xen == NULL);
203 /*
204 * Initialise our DOMID_IO domain.
205 * This domain owns I/O pages that are within the range of the page_info
206 * array. Mappings occur at the privilege level of the caller.
207 */
208 dom_io = alloc_domain(DOMID_IO);
209 BUG_ON(dom_io == NULL);
211 /* First 1MB of RAM is historically marked as I/O. */
212 for ( i = 0; i < 0x100; i++ )
213 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
215 /* Any areas not specified as RAM by the e820 map are considered I/O. */
216 for ( i = 0, pfn = 0; i < e820.nr_map; i++ )
217 {
218 if ( e820.map[i].type != E820_RAM )
219 continue;
220 /* Every page from cursor to start of next RAM region is I/O. */
221 rstart_pfn = PFN_UP(e820.map[i].addr);
222 rend_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
223 for ( ; pfn < rstart_pfn; pfn++ )
224 {
225 BUG_ON(!mfn_valid(pfn));
226 share_xen_page_with_guest(
227 mfn_to_page(pfn), dom_io, XENSHARE_writable);
228 }
229 /* Skip the RAM region. */
230 pfn = rend_pfn;
231 }
232 BUG_ON(pfn != max_page);
234 subarch_init_memory();
235 }
237 void share_xen_page_with_guest(
238 struct page_info *page, struct domain *d, int readonly)
239 {
240 if ( page_get_owner(page) == d )
241 return;
243 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
245 spin_lock(&d->page_alloc_lock);
247 /* The incremented type count pins as writable or read-only. */
248 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
249 page->u.inuse.type_info |= PGT_validated | 1;
251 page_set_owner(page, d);
252 wmb(); /* install valid domain ptr before updating refcnt. */
253 ASSERT(page->count_info == 0);
254 page->count_info |= PGC_allocated | 1;
256 if ( unlikely(d->xenheap_pages++ == 0) )
257 get_knownalive_domain(d);
258 list_add_tail(&page->list, &d->xenpage_list);
260 spin_unlock(&d->page_alloc_lock);
261 }
263 void share_xen_page_with_privileged_guests(
264 struct page_info *page, int readonly)
265 {
266 share_xen_page_with_guest(page, dom_xen, readonly);
267 }
269 #if defined(CONFIG_X86_PAE)
271 #ifdef NDEBUG
272 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
273 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
274 #else
275 /*
276 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
277 * We cannot safely shadow the idle page table, nor shadow-mode page tables
278 * (detected by lack of an owning domain). As required for correctness, we
279 * always shadow PDPTs above 4GB.
280 */
281 #define l3tab_needs_shadow(mfn) \
282 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
283 (page_get_owner(mfn_to_page(mfn)) != NULL) && \
284 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
285 ((mfn) >= 0x100000))
286 #endif
288 static l1_pgentry_t *fix_pae_highmem_pl1e;
290 /* Cache the address of PAE high-memory fixmap page tables. */
291 static int __init cache_pae_fixmap_address(void)
292 {
293 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
294 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
295 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
296 return 0;
297 }
298 __initcall(cache_pae_fixmap_address);
300 static void __write_ptbase(unsigned long mfn)
301 {
302 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
303 struct pae_l3_cache *cache = &current->arch.pae_l3_cache;
304 unsigned int cpu = smp_processor_id();
306 /* Fast path 1: does this mfn need a shadow at all? */
307 if ( !l3tab_needs_shadow(mfn) )
308 {
309 write_cr3(mfn << PAGE_SHIFT);
310 /* Cache is no longer in use or valid (/after/ write to %cr3). */
311 cache->high_mfn = 0;
312 return;
313 }
315 /* Caching logic is not interrupt safe. */
316 ASSERT(!in_irq());
318 /* Fast path 2: is this mfn already cached? */
319 if ( cache->high_mfn == mfn )
320 {
321 write_cr3(__pa(cache->table[cache->inuse_idx]));
322 return;
323 }
325 /* Protects against pae_flush_pgd(). */
326 spin_lock(&cache->lock);
328 cache->inuse_idx ^= 1;
329 cache->high_mfn = mfn;
331 /* Map the guest L3 table and copy to the chosen low-memory cache. */
332 *(fix_pae_highmem_pl1e - cpu) = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
333 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
334 lowmem_l3tab = cache->table[cache->inuse_idx];
335 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
336 *(fix_pae_highmem_pl1e - cpu) = l1e_empty();
338 /* Install the low-memory L3 table in CR3. */
339 write_cr3(__pa(lowmem_l3tab));
341 spin_unlock(&cache->lock);
342 }
344 #else /* !CONFIG_X86_PAE */
346 static void __write_ptbase(unsigned long mfn)
347 {
348 write_cr3(mfn << PAGE_SHIFT);
349 }
351 #endif /* !CONFIG_X86_PAE */
353 void write_ptbase(struct vcpu *v)
354 {
355 __write_ptbase(pagetable_get_pfn(v->arch.monitor_table));
356 }
358 void invalidate_shadow_ldt(struct vcpu *v)
359 {
360 int i;
361 unsigned long pfn;
362 struct page_info *page;
364 if ( v->arch.shadow_ldt_mapcnt == 0 )
365 return;
367 v->arch.shadow_ldt_mapcnt = 0;
369 for ( i = 16; i < 32; i++ )
370 {
371 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
372 if ( pfn == 0 ) continue;
373 v->arch.perdomain_ptes[i] = l1e_empty();
374 page = mfn_to_page(pfn);
375 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
376 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
377 put_page_and_type(page);
378 }
380 /* Dispose of the (now possibly invalid) mappings from the TLB. */
381 percpu_info[v->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
382 }
385 static int alloc_segdesc_page(struct page_info *page)
386 {
387 struct desc_struct *descs;
388 int i;
390 descs = map_domain_page(page_to_mfn(page));
392 for ( i = 0; i < 512; i++ )
393 if ( unlikely(!check_descriptor(&descs[i])) )
394 goto fail;
396 unmap_domain_page(descs);
397 return 1;
399 fail:
400 unmap_domain_page(descs);
401 return 0;
402 }
405 /* Map shadow page at offset @off. */
406 int map_ldt_shadow_page(unsigned int off)
407 {
408 struct vcpu *v = current;
409 struct domain *d = v->domain;
410 unsigned long gmfn, mfn;
411 l1_pgentry_t l1e, nl1e;
412 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
413 int res;
415 #if defined(__x86_64__)
416 /* If in user mode, switch to kernel mode just to read LDT mapping. */
417 int user_mode = !(v->arch.flags & TF_kernel_mode);
418 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
419 #elif defined(__i386__)
420 #define TOGGLE_MODE() ((void)0)
421 #endif
423 BUG_ON(unlikely(in_irq()));
425 shadow_sync_va(v, gva);
427 TOGGLE_MODE();
428 __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
429 sizeof(l1e));
430 TOGGLE_MODE();
432 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
433 return 0;
435 gmfn = l1e_get_pfn(l1e);
436 mfn = gmfn_to_mfn(d, gmfn);
437 if ( unlikely(!VALID_MFN(mfn)) )
438 return 0;
440 res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
442 if ( !res && unlikely(shadow_mode_refcounts(d)) )
443 {
444 shadow_lock(d);
445 shadow_remove_all_write_access(d, gmfn, mfn);
446 res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
447 shadow_unlock(d);
448 }
450 if ( unlikely(!res) )
451 return 0;
453 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
455 v->arch.perdomain_ptes[off + 16] = nl1e;
456 v->arch.shadow_ldt_mapcnt++;
458 return 1;
459 }
462 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
463 {
464 struct page_info *page = mfn_to_page(page_nr);
466 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
467 {
468 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
469 return 0;
470 }
472 return 1;
473 }
476 static int get_page_and_type_from_pagenr(unsigned long page_nr,
477 unsigned long type,
478 struct domain *d)
479 {
480 struct page_info *page = mfn_to_page(page_nr);
482 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
483 return 0;
485 if ( unlikely(!get_page_type(page, type)) )
486 {
487 put_page(page);
488 return 0;
489 }
491 return 1;
492 }
494 #ifndef CONFIG_X86_PAE /* We do not support guest linear mappings on PAE. */
495 /*
496 * We allow root tables to map each other (a.k.a. linear page tables). It
497 * needs some special care with reference counts and access permissions:
498 * 1. The mapping entry must be read-only, or the guest may get write access
499 * to its own PTEs.
500 * 2. We must only bump the reference counts for an *already validated*
501 * L2 table, or we can end up in a deadlock in get_page_type(), waiting
502 * on a validation that cannot complete until this validation itself completes.
503 * 3. We only need to increment the reference counts for the mapped page
504 * frame if it is mapped by a different root table. This is sufficient and
505 * also necessary to allow validation of a root table mapping itself.
506 */
507 static int
508 get_linear_pagetable(
509 root_pgentry_t re, unsigned long re_pfn, struct domain *d)
510 {
511 unsigned long x, y;
512 struct page_info *page;
513 unsigned long pfn;
515 ASSERT( !shadow_mode_refcounts(d) );
517 if ( (root_get_flags(re) & _PAGE_RW) )
518 {
519 MEM_LOG("Attempt to create linear p.t. with write perms");
520 return 0;
521 }
523 if ( (pfn = root_get_pfn(re)) != re_pfn )
524 {
525 /* Make sure the mapped frame belongs to the correct domain. */
526 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
527 return 0;
529 /*
530 * Make sure that the mapped frame is an already-validated L2 table.
531 * If so, atomically increment the count (checking for overflow).
532 */
533 page = mfn_to_page(pfn);
534 y = page->u.inuse.type_info;
535 do {
536 x = y;
537 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
538 unlikely((x & (PGT_type_mask|PGT_validated)) !=
539 (PGT_root_page_table|PGT_validated)) )
540 {
541 put_page(page);
542 return 0;
543 }
544 }
545 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
546 }
548 return 1;
549 }
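/*
 * Illustrative sketch (not in the original source): the kind of entry the
 * checks above accept.  A guest wanting a linear ("recursive") mapping of its
 * own root table writes, into one of that table's own slots, a PTE value of
 * roughly (root_mfn << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_ACCESSED -- with
 * _PAGE_RW deliberately clear (rule 1 above).  In that self-referencing case
 * root_get_pfn(re) == re_pfn, so by rule 3 no extra reference is taken.
 */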
550 #endif /* !CONFIG_X86_PAE */
552 int
553 get_page_from_l1e(
554 l1_pgentry_t l1e, struct domain *d)
555 {
556 unsigned long mfn = l1e_get_pfn(l1e);
557 struct page_info *page = mfn_to_page(mfn);
558 int okay;
560 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
561 return 1;
563 if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) )
564 {
565 MEM_LOG("Bad L1 flags %x", l1e_get_flags(l1e) & L1_DISALLOW_MASK);
566 return 0;
567 }
569 if ( unlikely(!mfn_valid(mfn)) ||
570 unlikely(page_get_owner(page) == dom_io) )
571 {
572 /* DOMID_IO reverts to caller for privilege checks. */
573 if ( d == dom_io )
574 d = current->domain;
576 if ( !iomem_access_permitted(d, mfn, mfn) )
577 {
578 MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn);
579 return 0;
580 }
582 /* No reference counting for out-of-range I/O pages. */
583 if ( !mfn_valid(mfn) )
584 return 1;
586 d = dom_io;
587 }
589 okay = ((l1e_get_flags(l1e) & _PAGE_RW) ?
590 get_page_and_type(page, d, PGT_writable_page) :
591 get_page(page, d));
592 if ( !okay )
593 {
594 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
595 " for dom%d",
596 mfn, get_gpfn_from_mfn(mfn),
597 l1e_get_intpte(l1e), d->domain_id);
598 }
600 return okay;
601 }
604 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
605 static int
606 get_page_from_l2e(
607 l2_pgentry_t l2e, unsigned long pfn,
608 struct domain *d, unsigned long vaddr)
609 {
610 int rc;
612 ASSERT(!shadow_mode_refcounts(d));
614 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
615 return 1;
617 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
618 {
619 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
620 return 0;
621 }
623 vaddr >>= L2_PAGETABLE_SHIFT;
624 vaddr <<= PGT_va_shift;
625 rc = get_page_and_type_from_pagenr(
626 l2e_get_pfn(l2e), PGT_l1_page_table | vaddr, d);
627 #if CONFIG_PAGING_LEVELS == 2
628 if ( unlikely(!rc) )
629 rc = get_linear_pagetable(l2e, pfn, d);
630 #endif
631 return rc;
632 }
635 #if CONFIG_PAGING_LEVELS >= 3
636 static int
637 get_page_from_l3e(
638 l3_pgentry_t l3e, unsigned long pfn,
639 struct domain *d, unsigned long vaddr)
640 {
641 int rc;
643 ASSERT(!shadow_mode_refcounts(d));
645 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
646 return 1;
648 if ( unlikely((l3e_get_flags(l3e) & L3_DISALLOW_MASK)) )
649 {
650 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & L3_DISALLOW_MASK);
651 return 0;
652 }
654 vaddr >>= L3_PAGETABLE_SHIFT;
655 vaddr <<= PGT_va_shift;
656 rc = get_page_and_type_from_pagenr(
657 l3e_get_pfn(l3e),
658 PGT_l2_page_table | vaddr, d);
659 return rc;
660 }
661 #endif /* 3 level */
663 #if CONFIG_PAGING_LEVELS >= 4
664 static int
665 get_page_from_l4e(
666 l4_pgentry_t l4e, unsigned long pfn,
667 struct domain *d, unsigned long vaddr)
668 {
669 int rc;
671 ASSERT( !shadow_mode_refcounts(d) );
673 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
674 return 1;
676 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
677 {
678 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
679 return 0;
680 }
682 vaddr >>= L4_PAGETABLE_SHIFT;
683 vaddr <<= PGT_va_shift;
684 rc = get_page_and_type_from_pagenr(
685 l4e_get_pfn(l4e),
686 PGT_l3_page_table | vaddr, d);
688 if ( unlikely(!rc) )
689 rc = get_linear_pagetable(l4e, pfn, d);
691 return rc;
692 }
693 #endif /* 4 level */
696 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
697 {
698 unsigned long pfn = l1e_get_pfn(l1e);
699 struct page_info *page = mfn_to_page(pfn);
700 struct domain *e;
701 struct vcpu *v;
703 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(pfn) )
704 return;
706 e = page_get_owner(page);
708 /*
709 * Check if this is a mapping that was established via a grant reference.
710 * If it was then we should not be here: we require that such mappings are
711 * explicitly destroyed via the grant-table interface.
712 *
713 * The upshot of this is that the guest can end up with active grants that
714 * it cannot destroy (because it no longer has a PTE to present to the
715 * grant-table interface). This can lead to subtle hard-to-catch bugs,
716 * hence a special grant PTE flag can be enabled to catch the bug early.
717 *
718 * (Note that the undestroyable active grants are not a security hole in
719 * Xen. All active grants can safely be cleaned up when the domain dies.)
720 */
721 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
722 !(d->domain_flags & (DOMF_shutdown|DOMF_dying)) )
723 {
724 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
725 l1e_get_intpte(l1e));
726 domain_crash(d);
727 }
729 if ( l1e_get_flags(l1e) & _PAGE_RW )
730 {
731 put_page_and_type(page);
732 }
733 else
734 {
735 /* We expect this is rare so we blow the entire shadow LDT. */
736 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
737 PGT_ldt_page)) &&
738 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
739 (d == e) )
740 {
741 for_each_vcpu ( d, v )
742 invalidate_shadow_ldt(v);
743 }
744 put_page(page);
745 }
746 }
749 /*
750 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
751 * Note also that this automatically deals correctly with linear p.t.'s.
752 */
753 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
754 {
755 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
756 (l2e_get_pfn(l2e) != pfn) )
757 put_page_and_type(mfn_to_page(l2e_get_pfn(l2e)));
758 }
761 #if CONFIG_PAGING_LEVELS >= 3
762 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
763 {
764 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
765 (l3e_get_pfn(l3e) != pfn) )
766 put_page_and_type(mfn_to_page(l3e_get_pfn(l3e)));
767 }
768 #endif
770 #if CONFIG_PAGING_LEVELS >= 4
771 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
772 {
773 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
774 (l4e_get_pfn(l4e) != pfn) )
775 put_page_and_type(mfn_to_page(l4e_get_pfn(l4e)));
776 }
777 #endif
779 static int alloc_l1_table(struct page_info *page)
780 {
781 struct domain *d = page_get_owner(page);
782 unsigned long pfn = page_to_mfn(page);
783 l1_pgentry_t *pl1e;
784 int i;
786 ASSERT(!shadow_mode_refcounts(d));
788 pl1e = map_domain_page(pfn);
790 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
791 if ( is_guest_l1_slot(i) &&
792 unlikely(!get_page_from_l1e(pl1e[i], d)) )
793 goto fail;
795 unmap_domain_page(pl1e);
796 return 1;
798 fail:
799 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
800 while ( i-- > 0 )
801 if ( is_guest_l1_slot(i) )
802 put_page_from_l1e(pl1e[i], d);
804 unmap_domain_page(pl1e);
805 return 0;
806 }
808 #ifdef CONFIG_X86_PAE
809 static int create_pae_xen_mappings(l3_pgentry_t *pl3e)
810 {
811 struct page_info *page;
812 l2_pgentry_t *pl2e;
813 l3_pgentry_t l3e3;
814 int i;
816 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
818 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
819 l3e3 = pl3e[3];
820 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
821 {
822 MEM_LOG("PAE L3 3rd slot is empty");
823 return 0;
824 }
826 /*
827 * The Xen-private mappings include linear mappings. The L2 thus cannot
828 * be shared by multiple L3 tables. The test here is adequate because:
829 * 1. Cannot appear in slots != 3 because the page would then have an
830 * unknown va backpointer, which get_page_type() explicitly disallows.
831 * 2. Cannot appear in another page table's L3:
832 * a. alloc_l3_table() calls this function and this check will fail
833 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
834 */
835 page = l3e_get_page(l3e3);
836 BUG_ON(page->u.inuse.type_info & PGT_pinned);
837 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
838 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
839 {
840 MEM_LOG("PAE L3 3rd slot is shared");
841 return 0;
842 }
844 /* Xen private mappings. */
845 pl2e = map_domain_page(l3e_get_pfn(l3e3));
846 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
847 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
848 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
849 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
850 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
851 l2e_from_page(
852 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
853 __PAGE_HYPERVISOR);
854 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
855 pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
856 (l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ?
857 l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR) :
858 l2e_empty();
859 unmap_domain_page(pl2e);
861 return 1;
862 }
864 /* Flush a pgdir update into low-memory caches. */
865 static void pae_flush_pgd(
866 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
867 {
868 struct domain *d = page_get_owner(mfn_to_page(mfn));
869 struct vcpu *v;
870 intpte_t _ol3e, _nl3e, _pl3e;
871 l3_pgentry_t *l3tab_ptr;
872 struct pae_l3_cache *cache;
874 /* If below 4GB then the pgdir is not shadowed in low memory. */
875 if ( !l3tab_needs_shadow(mfn) )
876 return;
878 for_each_vcpu ( d, v )
879 {
880 cache = &v->arch.pae_l3_cache;
882 spin_lock(&cache->lock);
884 if ( cache->high_mfn == mfn )
885 {
886 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
887 _ol3e = l3e_get_intpte(*l3tab_ptr);
888 _nl3e = l3e_get_intpte(nl3e);
889 _pl3e = cmpxchg((intpte_t *)l3tab_ptr, _ol3e, _nl3e);
890 BUG_ON(_pl3e != _ol3e);
891 }
893 spin_unlock(&cache->lock);
894 }
896 flush_tlb_mask(d->domain_dirty_cpumask);
897 }
899 static inline int l1_backptr(
900 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
901 {
902 unsigned long l2_backptr = l2_type & PGT_va_mask;
903 ASSERT(l2_backptr != PGT_va_unknown);
904 ASSERT(l2_backptr != PGT_va_mutable);
905 *backptr =
906 ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
907 (offset_in_l2 << L2_PAGETABLE_SHIFT);
908 return 1;
909 }
911 #elif CONFIG_X86_64
912 # define create_pae_xen_mappings(pl3e) (1)
913 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
915 static inline int l1_backptr(
916 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
917 {
918 unsigned long l2_backptr = l2_type & PGT_va_mask;
919 ASSERT(l2_backptr != PGT_va_unknown);
920 ASSERT(l2_backptr != PGT_va_mutable);
921 *backptr = ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
922 (offset_in_l2 << L2_PAGETABLE_SHIFT);
923 return 1;
924 }
926 static inline int l2_backptr(
927 unsigned long *backptr, unsigned long offset_in_l3, unsigned long l3_type)
928 {
929 unsigned long l3_backptr = l3_type & PGT_va_mask;
930 ASSERT(l3_backptr != PGT_va_unknown);
931 ASSERT(l3_backptr != PGT_va_mutable);
932 *backptr = ((l3_backptr >> PGT_va_shift) << L4_PAGETABLE_SHIFT) |
933 (offset_in_l3 << L3_PAGETABLE_SHIFT);
934 return 1;
935 }
937 static inline int l3_backptr(
938 unsigned long *backptr, unsigned long offset_in_l4, unsigned long l4_type)
939 {
940 *backptr = (offset_in_l4 << L4_PAGETABLE_SHIFT);
941 return 1;
942 }
943 #else
944 # define create_pae_xen_mappings(pl3e) (1)
945 # define l1_backptr(bp,l2o,l2t) \
946 ({ *(bp) = (unsigned long)(l2o) << L2_PAGETABLE_SHIFT; 1; })
947 #endif
949 static int alloc_l2_table(struct page_info *page, unsigned long type)
950 {
951 struct domain *d = page_get_owner(page);
952 unsigned long pfn = page_to_mfn(page);
953 unsigned long vaddr;
954 l2_pgentry_t *pl2e;
955 int i;
957 /* See the code in shadow_promote() to understand why this is here. */
958 if ( (PGT_base_page_table == PGT_l2_page_table) &&
959 unlikely(shadow_mode_refcounts(d)) )
960 return 1;
961 ASSERT(!shadow_mode_refcounts(d));
963 pl2e = map_domain_page(pfn);
965 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
966 {
967 if ( !l1_backptr(&vaddr, i, type) )
968 goto fail;
969 if ( is_guest_l2_slot(type, i) &&
970 unlikely(!get_page_from_l2e(pl2e[i], pfn, d, vaddr)) )
971 goto fail;
972 }
974 #if CONFIG_PAGING_LEVELS == 2
975 /* Xen private mappings. */
976 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
977 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
978 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
979 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
980 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
981 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
982 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
983 l2e_from_page(
984 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
985 __PAGE_HYPERVISOR);
986 #endif
988 unmap_domain_page(pl2e);
989 return 1;
991 fail:
992 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
993 while ( i-- > 0 )
994 if ( is_guest_l2_slot(type, i) )
995 put_page_from_l2e(pl2e[i], pfn);
997 unmap_domain_page(pl2e);
998 return 0;
999 }
1002 #if CONFIG_PAGING_LEVELS >= 3
1003 static int alloc_l3_table(struct page_info *page, unsigned long type)
1005 struct domain *d = page_get_owner(page);
1006 unsigned long pfn = page_to_mfn(page);
1007 unsigned long vaddr;
1008 l3_pgentry_t *pl3e;
1009 int i;
1011 /* See the code in shadow_promote() to understand why this is here. */
1012 if ( (PGT_base_page_table == PGT_l3_page_table) &&
1013 shadow_mode_refcounts(d) )
1014 return 1;
1015 ASSERT(!shadow_mode_refcounts(d));
1017 #ifdef CONFIG_X86_PAE
1018 /*
1019 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1020 * the weird 'extended cr3' format for dealing with high-order address
1021 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1022 */
1023 if ( (pfn >= 0x100000) &&
1024 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1025 d->vcpu[0] && test_bit(_VCPUF_initialised, &d->vcpu[0]->vcpu_flags) )
1027 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1028 return 0;
1030 #endif
1032 pl3e = map_domain_page(pfn);
1033 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1035 #if CONFIG_PAGING_LEVELS >= 4
1036 if ( !l2_backptr(&vaddr, i, type) )
1037 goto fail;
1038 #else
1039 vaddr = (unsigned long)i << L3_PAGETABLE_SHIFT;
1040 #endif
1041 if ( is_guest_l3_slot(i) &&
1042 unlikely(!get_page_from_l3e(pl3e[i], pfn, d, vaddr)) )
1043 goto fail;
1046 if ( !create_pae_xen_mappings(pl3e) )
1047 goto fail;
1049 unmap_domain_page(pl3e);
1050 return 1;
1052 fail:
1053 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1054 while ( i-- > 0 )
1055 if ( is_guest_l3_slot(i) )
1056 put_page_from_l3e(pl3e[i], pfn);
1058 unmap_domain_page(pl3e);
1059 return 0;
1061 #else
1062 #define alloc_l3_table(page, type) (0)
1063 #endif
1065 #if CONFIG_PAGING_LEVELS >= 4
1066 static int alloc_l4_table(struct page_info *page, unsigned long type)
1068 struct domain *d = page_get_owner(page);
1069 unsigned long pfn = page_to_mfn(page);
1070 l4_pgentry_t *pl4e = page_to_virt(page);
1071 unsigned long vaddr;
1072 int i;
1074 /* See the code in shadow_promote() to understand why this is here. */
1075 if ( (PGT_base_page_table == PGT_l4_page_table) &&
1076 shadow_mode_refcounts(d) )
1077 return 1;
1078 ASSERT(!shadow_mode_refcounts(d));
1080 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1082 if ( !l3_backptr(&vaddr, i, type) )
1083 goto fail;
1085 if ( is_guest_l4_slot(i) &&
1086 unlikely(!get_page_from_l4e(pl4e[i], pfn, d, vaddr)) )
1087 goto fail;
1090 /* Xen private mappings. */
1091 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1092 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1093 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1094 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1095 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1096 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1097 l4e_from_page(
1098 virt_to_page(page_get_owner(page)->arch.mm_perdomain_l3),
1099 __PAGE_HYPERVISOR);
1101 return 1;
1103 fail:
1104 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1105 while ( i-- > 0 )
1106 if ( is_guest_l4_slot(i) )
1107 put_page_from_l4e(pl4e[i], pfn);
1109 return 0;
1111 #else
1112 #define alloc_l4_table(page, type) (0)
1113 #endif
1116 static void free_l1_table(struct page_info *page)
1118 struct domain *d = page_get_owner(page);
1119 unsigned long pfn = page_to_mfn(page);
1120 l1_pgentry_t *pl1e;
1121 int i;
1123 pl1e = map_domain_page(pfn);
1125 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1126 if ( is_guest_l1_slot(i) )
1127 put_page_from_l1e(pl1e[i], d);
1129 unmap_domain_page(pl1e);
1133 static void free_l2_table(struct page_info *page)
1135 unsigned long pfn = page_to_mfn(page);
1136 l2_pgentry_t *pl2e;
1137 int i;
1139 pl2e = map_domain_page(pfn);
1141 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1142 if ( is_guest_l2_slot(page->u.inuse.type_info, i) )
1143 put_page_from_l2e(pl2e[i], pfn);
1145 unmap_domain_page(pl2e);
1149 #if CONFIG_PAGING_LEVELS >= 3
1151 static void free_l3_table(struct page_info *page)
1153 unsigned long pfn = page_to_mfn(page);
1154 l3_pgentry_t *pl3e;
1155 int i;
1157 pl3e = map_domain_page(pfn);
1159 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1160 if ( is_guest_l3_slot(i) )
1161 put_page_from_l3e(pl3e[i], pfn);
1163 unmap_domain_page(pl3e);
1166 #endif
1168 #if CONFIG_PAGING_LEVELS >= 4
1170 static void free_l4_table(struct page_info *page)
1172 unsigned long pfn = page_to_mfn(page);
1173 l4_pgentry_t *pl4e = page_to_virt(page);
1174 int i;
1176 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1177 if ( is_guest_l4_slot(i) )
1178 put_page_from_l4e(pl4e[i], pfn);
1181 #endif
1183 static inline int update_l1e(l1_pgentry_t *pl1e,
1184 l1_pgentry_t ol1e,
1185 l1_pgentry_t nl1e)
1187 #ifndef PTE_UPDATE_WITH_CMPXCHG
1188 return !__copy_to_user(pl1e, &nl1e, sizeof(nl1e));
1189 #else
1190 intpte_t o = l1e_get_intpte(ol1e);
1191 intpte_t n = l1e_get_intpte(nl1e);
1193 for ( ; ; )
1195 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
1197 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1198 ": saw %" PRIpte,
1199 l1e_get_intpte(ol1e),
1200 l1e_get_intpte(nl1e),
1201 o);
1202 return 0;
1205 if ( o == l1e_get_intpte(ol1e) )
1206 break;
1208 /* Allowed to change in Accessed/Dirty flags only. */
1209 BUG_ON((o ^ l1e_get_intpte(ol1e)) &
1210 ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY));
1211 ol1e = l1e_from_intpte(o);
1214 return 1;
1215 #endif
1219 /* Update the L1 entry at pl1e to new value nl1e. */
1220 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
1222 l1_pgentry_t ol1e;
1223 struct domain *d = current->domain;
1225 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1226 return 0;
1228 if ( unlikely(shadow_mode_refcounts(d)) )
1229 return update_l1e(pl1e, ol1e, nl1e);
1231 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1233 if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
1235 MEM_LOG("Bad L1 flags %x",
1236 l1e_get_flags(nl1e) & L1_DISALLOW_MASK);
1237 return 0;
1240 /* Fast path for identical mapping, r/w and presence. */
1241 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT))
1242 return update_l1e(pl1e, ol1e, nl1e);
1244 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1245 return 0;
1247 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1249 put_page_from_l1e(nl1e, d);
1250 return 0;
1253 else
1255 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1256 return 0;
1259 put_page_from_l1e(ol1e, d);
1260 return 1;
1263 #ifndef PTE_UPDATE_WITH_CMPXCHG
1264 #define UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; })
1265 #else
1266 #define UPDATE_ENTRY(_t,_p,_o,_n) ({ \
1267 for ( ; ; ) \
1268 { \
1269 intpte_t __o = cmpxchg((intpte_t *)(_p), \
1270 _t ## e_get_intpte(_o), \
1271 _t ## e_get_intpte(_n)); \
1272 if ( __o == _t ## e_get_intpte(_o) ) \
1273 break; \
1274 /* Allowed to change in Accessed/Dirty flags only. */ \
1275 BUG_ON((__o ^ _t ## e_get_intpte(_o)) & \
1276 ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY)); \
1277 _o = _t ## e_from_intpte(__o); \
1278 } \
1279 1; })
1280 #endif
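/*
 * Illustrative expansion (not in the original source): UPDATE_ENTRY(l2, p, o, n)
 * token-pastes the level prefix onto the accessors, i.e. it retries
 *     cmpxchg((intpte_t *)p, l2e_get_intpte(o), l2e_get_intpte(n))
 * until the exchange succeeds, folding in any concurrent changes that are
 * confined to the Accessed/Dirty bits (anything else triggers the BUG_ON).
 */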
1282 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1283 static int mod_l2_entry(l2_pgentry_t *pl2e,
1284 l2_pgentry_t nl2e,
1285 unsigned long pfn,
1286 unsigned long type)
1288 l2_pgentry_t ol2e;
1289 unsigned long vaddr = 0;
1291 if ( unlikely(!is_guest_l2_slot(type,pgentry_ptr_to_slot(pl2e))) )
1293 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1294 return 0;
1297 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1298 return 0;
1300 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1302 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1304 MEM_LOG("Bad L2 flags %x",
1305 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1306 return 0;
1309 /* Fast path for identical mapping and presence. */
1310 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1311 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
1313 if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) ||
1314 unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) )
1315 return 0;
1317 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1319 put_page_from_l2e(nl2e, pfn);
1320 return 0;
1323 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1325 return 0;
1328 put_page_from_l2e(ol2e, pfn);
1329 return 1;
1333 #if CONFIG_PAGING_LEVELS >= 3
1335 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1336 static int mod_l3_entry(l3_pgentry_t *pl3e,
1337 l3_pgentry_t nl3e,
1338 unsigned long pfn,
1339 unsigned long type)
1341 l3_pgentry_t ol3e;
1342 unsigned long vaddr;
1343 int okay;
1345 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1347 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1348 return 0;
1351 #ifdef CONFIG_X86_PAE
1352 /*
1353 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1354 * would be a pain to ensure they remain continuously valid throughout.
1355 */
1356 if ( pgentry_ptr_to_slot(pl3e) >= 3 )
1357 return 0;
1358 #endif
1360 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1361 return 0;
1363 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1365 if ( unlikely(l3e_get_flags(nl3e) & L3_DISALLOW_MASK) )
1367 MEM_LOG("Bad L3 flags %x",
1368 l3e_get_flags(nl3e) & L3_DISALLOW_MASK);
1369 return 0;
1372 /* Fast path for identical mapping and presence. */
1373 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1374 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
1376 #if CONFIG_PAGING_LEVELS >= 4
1377 if ( unlikely(!l2_backptr(&vaddr, pgentry_ptr_to_slot(pl3e), type)) ||
1378 unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1379 return 0;
1380 #else
1381 vaddr = (((unsigned long)pl3e & ~PAGE_MASK) / sizeof(l3_pgentry_t))
1382 << L3_PAGETABLE_SHIFT;
1383 if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1384 return 0;
1385 #endif
1387 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1389 put_page_from_l3e(nl3e, pfn);
1390 return 0;
1393 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1395 return 0;
1398 okay = create_pae_xen_mappings(pl3e);
1399 BUG_ON(!okay);
1401 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1403 put_page_from_l3e(ol3e, pfn);
1404 return 1;
1407 #endif
1409 #if CONFIG_PAGING_LEVELS >= 4
1411 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1412 static int mod_l4_entry(l4_pgentry_t *pl4e,
1413 l4_pgentry_t nl4e,
1414 unsigned long pfn,
1415 unsigned long type)
1417 l4_pgentry_t ol4e;
1418 unsigned long vaddr;
1420 if ( unlikely(!is_guest_l4_slot(pgentry_ptr_to_slot(pl4e))) )
1422 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1423 return 0;
1426 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1427 return 0;
1429 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1431 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1433 MEM_LOG("Bad L4 flags %x",
1434 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1435 return 0;
1438 /* Fast path for identical mapping and presence. */
1439 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1440 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
1442 if ( unlikely(!l3_backptr(&vaddr, pgentry_ptr_to_slot(pl4e), type)) ||
1443 unlikely(!get_page_from_l4e(nl4e, pfn, current->domain, vaddr)) )
1444 return 0;
1446 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1448 put_page_from_l4e(nl4e, pfn);
1449 return 0;
1452 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1454 return 0;
1457 put_page_from_l4e(ol4e, pfn);
1458 return 1;
1461 #endif
1463 int alloc_page_type(struct page_info *page, unsigned long type)
1465 struct domain *owner = page_get_owner(page);
1467 if ( owner != NULL )
1468 mark_dirty(owner, page_to_mfn(page));
1470 switch ( type & PGT_type_mask )
1472 case PGT_l1_page_table:
1473 return alloc_l1_table(page);
1474 case PGT_l2_page_table:
1475 return alloc_l2_table(page, type);
1476 case PGT_l3_page_table:
1477 return alloc_l3_table(page, type);
1478 case PGT_l4_page_table:
1479 return alloc_l4_table(page, type);
1480 case PGT_gdt_page:
1481 case PGT_ldt_page:
1482 return alloc_segdesc_page(page);
1483 default:
1484 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1485 type, page->u.inuse.type_info,
1486 page->count_info);
1487 BUG();
1490 return 0;
1494 void free_page_type(struct page_info *page, unsigned long type)
1496 struct domain *owner = page_get_owner(page);
1497 unsigned long gmfn;
1499 if ( likely(owner != NULL) )
1501 /*
1502 * We have to flush before the next use of the linear mapping
1503 * (e.g., update_va_mapping()) or we could end up modifying a page
1504 * that is no longer a page table (and hence screw up ref counts).
1505 */
1506 percpu_info[smp_processor_id()].deferred_ops |= DOP_FLUSH_ALL_TLBS;
1508 if ( unlikely(shadow_mode_enabled(owner)) )
1510 /* Raw page tables are rewritten during save/restore. */
1511 if ( !shadow_mode_translate(owner) )
1512 mark_dirty(owner, page_to_mfn(page));
1514 if ( shadow_mode_refcounts(owner) )
1515 return;
1517 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1518 ASSERT(VALID_M2P(gmfn));
1519 remove_shadow(owner, gmfn, type & PGT_type_mask);
1523 switch ( type & PGT_type_mask )
1525 case PGT_l1_page_table:
1526 free_l1_table(page);
1527 break;
1529 case PGT_l2_page_table:
1530 free_l2_table(page);
1531 break;
1533 #if CONFIG_PAGING_LEVELS >= 3
1534 case PGT_l3_page_table:
1535 free_l3_table(page);
1536 break;
1537 #endif
1539 #if CONFIG_PAGING_LEVELS >= 4
1540 case PGT_l4_page_table:
1541 free_l4_table(page);
1542 break;
1543 #endif
1545 default:
1546 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1547 type, page_to_mfn(page));
1548 BUG();
1553 void put_page_type(struct page_info *page)
1555 unsigned long nx, x, y = page->u.inuse.type_info;
1557 again:
1558 do {
1559 x = y;
1560 nx = x - 1;
1562 ASSERT((x & PGT_count_mask) != 0);
1564 /*
1565 * The page should always be validated while a reference is held. The
1566 * exception is during domain destruction, when we forcibly invalidate
1567 * page-table pages if we detect a referential loop.
1568 * See domain.c:relinquish_list().
1569 */
1570 ASSERT((x & PGT_validated) ||
1571 test_bit(_DOMF_dying, &page_get_owner(page)->domain_flags));
1573 if ( unlikely((nx & PGT_count_mask) == 0) )
1575 /* Record TLB information for flush later. Races are harmless. */
1576 page->tlbflush_timestamp = tlbflush_current_time();
1578 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1579 likely(nx & PGT_validated) )
1581 /*
1582 * Page-table pages must be unvalidated when count is zero. The
1583 * 'free' is safe because the refcnt is non-zero and validated
1584 * bit is clear => other ops will spin or fail.
1585 */
1586 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1587 x & ~PGT_validated)) != x) )
1588 goto again;
1589 /* We cleared the 'valid bit' so we do the clean up. */
1590 free_page_type(page, x);
1591 /* Carry on, but with the 'valid bit' now clear. */
1592 x &= ~PGT_validated;
1593 nx &= ~PGT_validated;
1596 else if ( unlikely((nx & (PGT_pinned|PGT_type_mask|PGT_count_mask)) ==
1597 (PGT_pinned|PGT_l1_page_table|1)) )
1599 /* Page is now only pinned. Make the back pointer mutable again. */
1600 nx |= PGT_va_mutable;
1603 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1607 int get_page_type(struct page_info *page, unsigned long type)
1609 unsigned long nx, x, y = page->u.inuse.type_info;
1611 again:
1612 do {
1613 x = y;
1614 nx = x + 1;
1615 if ( unlikely((nx & PGT_count_mask) == 0) )
1617 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1618 return 0;
1620 else if ( unlikely((x & PGT_count_mask) == 0) )
1622 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
1624 if ( (x & PGT_type_mask) != (type & PGT_type_mask) )
1626 /*
1627 * On a type change we check whether stale TLB
1628 * entries must be flushed. This may be unnecessary (e.g., page
1629 * was GDT/LDT) but those circumstances should be
1630 * very rare.
1631 */
1632 cpumask_t mask =
1633 page_get_owner(page)->domain_dirty_cpumask;
1634 tlbflush_filter(mask, page->tlbflush_timestamp);
1636 if ( unlikely(!cpus_empty(mask)) )
1638 perfc_incrc(need_flush_tlb_flush);
1639 flush_tlb_mask(mask);
1643 /* We lose existing type, back pointer, and validity. */
1644 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
1645 nx |= type;
1647 /* No special validation needed for writable pages. */
1648 /* Page tables and GDT/LDT need to be scanned for validity. */
1649 if ( type == PGT_writable_page )
1650 nx |= PGT_validated;
1653 else
1655 if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
1657 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
1659 if ( (current->domain == page_get_owner(page)) &&
1660 ((x & PGT_type_mask) == PGT_writable_page) )
1662 /*
1663 * This ensures functions like set_gdt() see up-to-date
1664 * type info without needing to clean up writable p.t.
1665 * state on the fast path. We take this path only
1666 * when the current type is writable because:
1667 * 1. It's the only type that this path can decrement.
1668 * 2. If we take this path more liberally then we can
1669 * enter a recursive loop via get_page_from_l1e()
1670 * during pagetable revalidation.
1671 */
1672 LOCK_BIGLOCK(current->domain);
1673 cleanup_writable_pagetable(current->domain);
1674 y = page->u.inuse.type_info;
1675 UNLOCK_BIGLOCK(current->domain);
1676 /* Can we make progress now? */
1677 if ( ((y & PGT_type_mask) == (type & PGT_type_mask)) ||
1678 ((y & PGT_count_mask) == 0) )
1679 goto again;
1681 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1682 ((type & PGT_type_mask) != PGT_l1_page_table) )
1683 MEM_LOG("Bad type (saw %" PRtype_info
1684 " != exp %" PRtype_info ") "
1685 "for mfn %lx (pfn %lx)",
1686 x, type, page_to_mfn(page),
1687 get_gpfn_from_mfn(page_to_mfn(page)));
1688 return 0;
1690 else if ( (x & PGT_va_mask) == PGT_va_mutable )
1692 /* The va backpointer is mutable, hence we update it. */
1693 nx &= ~PGT_va_mask;
1694 nx |= type; /* we know the actual type is correct */
1696 else if ( (type & PGT_va_mask) != PGT_va_mutable )
1698 ASSERT((type & PGT_va_mask) != (x & PGT_va_mask));
1699 #ifdef CONFIG_X86_PAE
1700 /* We use backptr as extra typing. Cannot be unknown. */
1701 if ( (type & PGT_type_mask) == PGT_l2_page_table )
1702 return 0;
1703 #endif
1704 /* Fixme: add code to propagate va_unknown to subtables. */
1705 if ( ((type & PGT_type_mask) >= PGT_l2_page_table) &&
1706 !shadow_mode_refcounts(page_get_owner(page)) )
1707 return 0;
1708 /* This table is possibly mapped at multiple locations. */
1709 nx &= ~PGT_va_mask;
1710 nx |= PGT_va_unknown;
1713 if ( unlikely(!(x & PGT_validated)) )
1715 /* Someone else is updating validation of this page. Wait... */
1716 while ( (y = page->u.inuse.type_info) == x )
1717 cpu_relax();
1718 goto again;
1722 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1724 if ( unlikely(!(nx & PGT_validated)) )
1726 /* Try to validate page type; drop the new reference on failure. */
1727 if ( unlikely(!alloc_page_type(page, type)) )
1729 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1730 PRtype_info ": caf=%08x taf=%" PRtype_info,
1731 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1732 type, page->count_info, page->u.inuse.type_info);
1733 /* No one else can get a reference. We hold the only ref. */
1734 page->u.inuse.type_info = 0;
1735 return 0;
1738 /* No one else is updating simultaneously. */
1739 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1742 return 1;
1746 int new_guest_cr3(unsigned long mfn)
1748 struct vcpu *v = current;
1749 struct domain *d = v->domain;
1750 int okay;
1751 unsigned long old_base_mfn;
1753 ASSERT(writable_pagetable_in_sync(d));
1755 if ( shadow_mode_refcounts(d) )
1757 okay = get_page_from_pagenr(mfn, d);
1758 if ( unlikely(!okay) )
1760 MEM_LOG("Error while installing new baseptr %lx", mfn);
1761 return 0;
1764 else
1766 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1767 if ( unlikely(!okay) )
1769 /* Switch to idle pagetable: this VCPU has no active p.t. now. */
1770 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1771 v->arch.guest_table = pagetable_null();
1772 update_pagetables(v);
1773 write_cr3(__pa(idle_pg_table));
1774 if ( old_base_mfn != 0 )
1775 put_page_and_type(mfn_to_page(old_base_mfn));
1777 /* Retry the validation with no active p.t. for this VCPU. */
1778 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1779 if ( !okay )
1781 /* Failure here is unrecoverable: the VCPU has no pagetable! */
1782 MEM_LOG("Fatal error while installing new baseptr %lx", mfn);
1783 domain_crash(d);
1784 percpu_info[v->processor].deferred_ops = 0;
1785 return 0;
1790 invalidate_shadow_ldt(v);
1792 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1793 v->arch.guest_table = pagetable_from_pfn(mfn);
1794 update_pagetables(v); /* update shadow_table and monitor_table */
1796 write_ptbase(v);
1798 if ( likely(old_base_mfn != 0) )
1800 if ( shadow_mode_refcounts(d) )
1801 put_page(mfn_to_page(old_base_mfn));
1802 else
1803 put_page_and_type(mfn_to_page(old_base_mfn));
1806 /* CR3 also holds a ref to its shadow... */
1807 if ( shadow_mode_enabled(d) )
1809 if ( v->arch.monitor_shadow_ref )
1810 put_shadow_ref(v->arch.monitor_shadow_ref);
1811 v->arch.monitor_shadow_ref =
1812 pagetable_get_pfn(v->arch.monitor_table);
1813 ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
1814 get_shadow_ref(v->arch.monitor_shadow_ref);
1817 return 1;
1820 static void process_deferred_ops(unsigned int cpu)
1822 unsigned int deferred_ops;
1823 struct domain *d = current->domain;
1825 deferred_ops = percpu_info[cpu].deferred_ops;
1826 percpu_info[cpu].deferred_ops = 0;
1828 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
1830 if ( shadow_mode_enabled(d) )
1831 shadow_sync_all(d);
1832 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
1833 flush_tlb_mask(d->domain_dirty_cpumask);
1834 else
1835 local_flush_tlb();
1838 if ( deferred_ops & DOP_RELOAD_LDT )
1839 (void)map_ldt_shadow_page(0);
1841 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1843 put_domain(percpu_info[cpu].foreign);
1844 percpu_info[cpu].foreign = NULL;
1848 static int set_foreigndom(unsigned int cpu, domid_t domid)
1850 struct domain *e, *d = current->domain;
1851 int okay = 1;
1853 ASSERT(percpu_info[cpu].foreign == NULL);
1855 if ( likely(domid == DOMID_SELF) )
1856 goto out;
1858 if ( domid == d->domain_id )
1860 MEM_LOG("Dom %u tried to specify itself as foreign domain",
1861 d->domain_id);
1862 okay = 0;
1864 else if ( !IS_PRIV(d) )
1866 switch ( domid )
1868 case DOMID_IO:
1869 get_knownalive_domain(dom_io);
1870 percpu_info[cpu].foreign = dom_io;
1871 break;
1872 default:
1873 MEM_LOG("Dom %u cannot set foreign dom", d->domain_id);
1874 okay = 0;
1875 break;
1878 else
1880 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1881 if ( e == NULL )
1883 switch ( domid )
1885 case DOMID_XEN:
1886 get_knownalive_domain(dom_xen);
1887 percpu_info[cpu].foreign = dom_xen;
1888 break;
1889 case DOMID_IO:
1890 get_knownalive_domain(dom_io);
1891 percpu_info[cpu].foreign = dom_io;
1892 break;
1893 default:
1894 MEM_LOG("Unknown domain '%u'", domid);
1895 okay = 0;
1896 break;
1901 out:
1902 return okay;
1905 static inline cpumask_t vcpumask_to_pcpumask(
1906 struct domain *d, unsigned long vmask)
1908 unsigned int vcpu_id;
1909 cpumask_t pmask = CPU_MASK_NONE;
1910 struct vcpu *v;
1912 while ( vmask != 0 )
1914 vcpu_id = find_first_set_bit(vmask);
1915 vmask &= ~(1UL << vcpu_id);
1916 if ( (vcpu_id < MAX_VIRT_CPUS) &&
1917 ((v = d->vcpu[vcpu_id]) != NULL) )
1918 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
1921 return pmask;
1924 int do_mmuext_op(
1925 XEN_GUEST_HANDLE(mmuext_op_t) uops,
1926 unsigned int count,
1927 XEN_GUEST_HANDLE(uint) pdone,
1928 unsigned int foreigndom)
1930 struct mmuext_op op;
1931 int rc = 0, i = 0, okay, cpu = smp_processor_id();
1932 unsigned long mfn, type;
1933 unsigned int done = 0;
1934 struct page_info *page;
1935 struct vcpu *v = current;
1936 struct domain *d = v->domain;
1938 LOCK_BIGLOCK(d);
1940 cleanup_writable_pagetable(d);
1942 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1944 count &= ~MMU_UPDATE_PREEMPTED;
1945 if ( unlikely(!guest_handle_is_null(pdone)) )
1946 (void)copy_from_guest(&done, pdone, 1);
1949 if ( !set_foreigndom(cpu, foreigndom) )
1951 rc = -ESRCH;
1952 goto out;
1955 if ( unlikely(!guest_handle_okay(uops, count)) )
1957 rc = -EFAULT;
1958 goto out;
1961 for ( i = 0; i < count; i++ )
1963 if ( hypercall_preempt_check() )
1965 rc = hypercall_create_continuation(
1966 __HYPERVISOR_mmuext_op, "hihi",
1967 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
1968 break;
1971 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
1973 MEM_LOG("Bad __copy_from_guest");
1974 rc = -EFAULT;
1975 break;
1978 okay = 1;
1979 mfn = op.arg1.mfn;
1980 page = mfn_to_page(mfn);
1982 switch ( op.cmd )
1984 case MMUEXT_PIN_L1_TABLE:
1985 type = PGT_l1_page_table | PGT_va_mutable;
1986 goto pin_page;
1988 case MMUEXT_PIN_L2_TABLE:
1989 case MMUEXT_PIN_L3_TABLE:
1990 case MMUEXT_PIN_L4_TABLE:
1991 /* Ignore pinning of subdirectories. */
1992 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) != (CONFIG_PAGING_LEVELS - 1) )
1993 break;
1995 type = PGT_root_page_table;
1997 pin_page:
1998 if ( shadow_mode_refcounts(FOREIGNDOM) )
1999 break;
2001 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
2002 if ( unlikely(!okay) )
2004 MEM_LOG("Error while pinning mfn %lx", mfn);
2005 break;
2008 if ( unlikely(test_and_set_bit(_PGT_pinned,
2009 &page->u.inuse.type_info)) )
2011 MEM_LOG("Mfn %lx already pinned", mfn);
2012 put_page_and_type(page);
2013 okay = 0;
2014 break;
2017 break;
2019 case MMUEXT_UNPIN_TABLE:
2020 if ( shadow_mode_refcounts(d) )
2021 break;
2023 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2025 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2026 mfn, page_get_owner(page));
2028 else if ( likely(test_and_clear_bit(_PGT_pinned,
2029 &page->u.inuse.type_info)) )
2031 put_page_and_type(page);
2032 put_page(page);
2034 else
2036 okay = 0;
2037 put_page(page);
2038 MEM_LOG("Mfn %lx not pinned", mfn);
2040 break;
2042 case MMUEXT_NEW_BASEPTR:
2043 mfn = gmfn_to_mfn(current->domain, mfn);
2044 okay = new_guest_cr3(mfn);
2045 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
2046 break;
2048 #ifdef __x86_64__
2049 case MMUEXT_NEW_USER_BASEPTR:
2050 okay = get_page_and_type_from_pagenr(
2051 mfn, PGT_root_page_table, d);
2052 if ( unlikely(!okay) )
2054 MEM_LOG("Error while installing new mfn %lx", mfn);
2056 else
2058 unsigned long old_mfn =
2059 pagetable_get_pfn(v->arch.guest_table_user);
2060 v->arch.guest_table_user = pagetable_from_pfn(mfn);
2061 if ( old_mfn != 0 )
2062 put_page_and_type(mfn_to_page(old_mfn));
2064 break;
2065 #endif
2067 case MMUEXT_TLB_FLUSH_LOCAL:
2068 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
2069 break;
2071 case MMUEXT_INVLPG_LOCAL:
2072 if ( shadow_mode_enabled(d) )
2073 shadow_invlpg(v, op.arg1.linear_addr);
2074 local_flush_tlb_one(op.arg1.linear_addr);
2075 break;
2077 case MMUEXT_TLB_FLUSH_MULTI:
2078 case MMUEXT_INVLPG_MULTI:
2080 unsigned long vmask;
2081 cpumask_t pmask;
2082 if ( unlikely(get_user(vmask, (unsigned long *)op.arg2.vcpumask)) )
2084 okay = 0;
2085 break;
2087 pmask = vcpumask_to_pcpumask(d, vmask);
2088 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2089 flush_tlb_mask(pmask);
2090 else
2091 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2092 break;
2095 case MMUEXT_TLB_FLUSH_ALL:
2096 flush_tlb_mask(d->domain_dirty_cpumask);
2097 break;
2099 case MMUEXT_INVLPG_ALL:
2100 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2101 break;
2103 case MMUEXT_FLUSH_CACHE:
2104 if ( unlikely(!cache_flush_permitted(d)) )
2106 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2107 okay = 0;
2109 else
2111 wbinvd();
2113 break;
2115 case MMUEXT_SET_LDT:
2117 unsigned long ptr = op.arg1.linear_addr;
2118 unsigned long ents = op.arg2.nr_ents;
2120 if ( shadow_mode_external(d) )
2122 MEM_LOG("ignoring SET_LDT hypercall from external "
2123 "domain %u", d->domain_id);
2124 okay = 0;
2126 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2127 (ents > 8192) ||
2128 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2130 okay = 0;
2131 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2133 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2134 (v->arch.guest_context.ldt_base != ptr) )
2136 invalidate_shadow_ldt(v);
2137 v->arch.guest_context.ldt_base = ptr;
2138 v->arch.guest_context.ldt_ents = ents;
2139 load_LDT(v);
2140 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
2141 if ( ents != 0 )
2142 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
2144 break;
2147 default:
2148 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2149 okay = 0;
2150 break;
2153 if ( unlikely(!okay) )
2155 rc = -EINVAL;
2156 break;
2159 guest_handle_add_offset(uops, 1);
2162 out:
2163 process_deferred_ops(cpu);
2165 /* Add incremental work we have done to the @done output parameter. */
2166 done += i;
2167 if ( unlikely(!guest_handle_is_null(pdone)) )
2168 copy_to_guest(pdone, &done, 1);
2170 UNLOCK_BIGLOCK(d);
2171 return rc;
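/*
 * Illustrative guest-side usage of the hypercall handled above (a sketch
 * only, assuming the conventional HYPERVISOR_mmuext_op wrapper; it is not
 * part of this file). Pinning the root page table of a 2-level guest (a
 * 4-level guest would use MMUEXT_PIN_L4_TABLE) might look like:
 *
 *     struct mmuext_op op;
 *     op.cmd      = MMUEXT_PIN_L2_TABLE;
 *     op.arg1.mfn = root_table_mfn;
 *     HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
 *
 * If the batch is preempted, the continuation created above transparently
 * resumes it with the remaining count.
 */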
2174 int do_mmu_update(
2175 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2176 unsigned int count,
2177 XEN_GUEST_HANDLE(uint) pdone,
2178 unsigned int foreigndom)
2180 struct mmu_update req;
2181 void *va;
2182 unsigned long gpfn, gmfn, mfn;
2183 struct page_info *page;
2184 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
2185 unsigned int cmd, done = 0;
2186 struct vcpu *v = current;
2187 struct domain *d = v->domain;
2188 unsigned long type_info;
2189 struct domain_mmap_cache mapcache, sh_mapcache;
2191 LOCK_BIGLOCK(d);
2193 cleanup_writable_pagetable(d);
2195 if ( unlikely(shadow_mode_enabled(d)) )
2196 check_pagetable(v, "pre-mmu"); /* debug */
2198 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2200 count &= ~MMU_UPDATE_PREEMPTED;
2201 if ( unlikely(!guest_handle_is_null(pdone)) )
2202 (void)copy_from_guest(&done, pdone, 1);
2205 domain_mmap_cache_init(&mapcache);
2206 domain_mmap_cache_init(&sh_mapcache);
2208 if ( !set_foreigndom(cpu, foreigndom) )
2210 rc = -ESRCH;
2211 goto out;
2214 perfc_incrc(calls_to_mmu_update);
2215 perfc_addc(num_page_updates, count);
2216 perfc_incr_histo(bpt_updates, count, PT_UPDATES);
2218 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2220 rc = -EFAULT;
2221 goto out;
2224 for ( i = 0; i < count; i++ )
2226 if ( hypercall_preempt_check() )
2228 rc = hypercall_create_continuation(
2229 __HYPERVISOR_mmu_update, "hihi",
2230 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2231 break;
2234 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2236 MEM_LOG("Bad __copy_from_guest");
2237 rc = -EFAULT;
2238 break;
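/*
 * Page-table entries are at least sizeof(l1_pgentry_t)-aligned, so the
 * low bits of req.ptr are free to carry the command code.
 */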
2241 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2242 okay = 0;
2244 switch ( cmd )
2246 /*
2247 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2248 */
2249 case MMU_NORMAL_PT_UPDATE:
2251 gmfn = req.ptr >> PAGE_SHIFT;
2252 mfn = gmfn_to_mfn(d, gmfn);
2254 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2256 MEM_LOG("Could not get page for normal update");
2257 break;
2260 va = map_domain_page_with_cache(mfn, &mapcache);
2261 va = (void *)((unsigned long)va +
2262 (unsigned long)(req.ptr & ~PAGE_MASK));
2263 page = mfn_to_page(mfn);
2265 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2267 case PGT_l1_page_table:
2268 case PGT_l2_page_table:
2269 case PGT_l3_page_table:
2270 case PGT_l4_page_table:
2272 ASSERT(!shadow_mode_refcounts(d));
2273 if ( unlikely(!get_page_type(
2274 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2275 goto not_a_pt;
2277 switch ( type_info & PGT_type_mask )
2279 case PGT_l1_page_table:
2281 l1_pgentry_t l1e = l1e_from_intpte(req.val);
2282 okay = mod_l1_entry(va, l1e);
2283 if ( okay && unlikely(shadow_mode_enabled(d)) )
2284 shadow_l1_normal_pt_update(
2285 d, req.ptr, l1e, &sh_mapcache);
2287 break;
2288 case PGT_l2_page_table:
2290 l2_pgentry_t l2e = l2e_from_intpte(req.val);
2291 okay = mod_l2_entry(
2292 (l2_pgentry_t *)va, l2e, mfn, type_info);
2293 if ( okay && unlikely(shadow_mode_enabled(d)) )
2294 shadow_l2_normal_pt_update(
2295 d, req.ptr, l2e, &sh_mapcache);
2297 break;
2298 #if CONFIG_PAGING_LEVELS >= 3
2299 case PGT_l3_page_table:
2301 l3_pgentry_t l3e = l3e_from_intpte(req.val);
2302 okay = mod_l3_entry(va, l3e, mfn, type_info);
2303 if ( okay && unlikely(shadow_mode_enabled(d)) )
2304 shadow_l3_normal_pt_update(
2305 d, req.ptr, l3e, &sh_mapcache);
2307 break;
2308 #endif
2309 #if CONFIG_PAGING_LEVELS >= 4
2310 case PGT_l4_page_table:
2312 l4_pgentry_t l4e = l4e_from_intpte(req.val);
2313 okay = mod_l4_entry(va, l4e, mfn, type_info);
2314 if ( okay && unlikely(shadow_mode_enabled(d)) )
2315 shadow_l4_normal_pt_update(
2316 d, req.ptr, l4e, &sh_mapcache);
2318 break;
2319 #endif
2322 put_page_type(page);
2324 break;
2326 default:
2327 not_a_pt:
2329 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2330 break;
2332 if ( shadow_mode_enabled(d) )
2334 shadow_lock(d);
2335 __mark_dirty(d, mfn);
2336 if ( page_is_page_table(page) && !page_out_of_sync(page) )
2337 shadow_mark_mfn_out_of_sync(v, gmfn, mfn);
2340 *(intpte_t *)va = req.val;
2341 okay = 1;
2343 if ( shadow_mode_enabled(d) )
2344 shadow_unlock(d);
2346 put_page_type(page);
2348 break;
2351 unmap_domain_page_with_cache(va, &mapcache);
2353 put_page(page);
2354 break;
2356 case MMU_MACHPHYS_UPDATE:
2358 if ( shadow_mode_translate(FOREIGNDOM) )
2360 MEM_LOG("can't mutate m2p table of translate mode guest");
2361 break;
2364 mfn = req.ptr >> PAGE_SHIFT;
2365 gpfn = req.val;
2367 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2369 MEM_LOG("Could not get page for mach->phys update");
2370 break;
2373 set_gpfn_from_mfn(mfn, gpfn);
2374 okay = 1;
2376 mark_dirty(FOREIGNDOM, mfn);
2378 put_page(mfn_to_page(mfn));
2379 break;
2381 default:
2382 MEM_LOG("Invalid page update command %x", cmd);
2383 break;
2386 if ( unlikely(!okay) )
2388 rc = -EINVAL;
2389 break;
2392 guest_handle_add_offset(ureqs, 1);
2395 out:
2396 domain_mmap_cache_destroy(&mapcache);
2397 domain_mmap_cache_destroy(&sh_mapcache);
2399 process_deferred_ops(cpu);
2401 /* Add incremental work we have done to the @done output parameter. */
2402 done += i;
2403 if ( unlikely(!guest_handle_is_null(pdone)) )
2404 copy_to_guest(pdone, &done, 1);
2406 if ( unlikely(shadow_mode_enabled(d)) )
2407 check_pagetable(v, "post-mmu"); /* debug */
2409 UNLOCK_BIGLOCK(d);
2410 return rc;
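/*
 * Illustrative guest-side usage of do_mmu_update (a sketch only, assuming
 * the conventional HYPERVISOR_mmu_update wrapper; not part of this file):
 *
 *     mmu_update_t u;
 *     u.ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE;
 *     u.val = new_pte_value;
 *     HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
 *
 * The low bits of ptr select the command, and the handler above validates
 * the new entry and keeps any shadow page tables in sync.
 */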
2414 static int create_grant_pte_mapping(
2415 unsigned long pte_addr, l1_pgentry_t _nl1e, struct vcpu *v)
2417 int rc = GNTST_okay;
2418 void *va;
2419 unsigned long gmfn, mfn;
2420 struct page_info *page;
2421 u32 type_info;
2422 l1_pgentry_t ol1e;
2423 struct domain *d = v->domain;
2425 ASSERT(spin_is_locked(&d->big_lock));
2426 ASSERT(!shadow_mode_refcounts(d));
2428 gmfn = pte_addr >> PAGE_SHIFT;
2429 mfn = gmfn_to_mfn(d, gmfn);
2431 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2433 MEM_LOG("Could not get page for normal update");
2434 return GNTST_general_error;
2437 va = map_domain_page(mfn);
2438 va = (void *)((unsigned long)va + (pte_addr & ~PAGE_MASK));
2439 page = mfn_to_page(mfn);
2441 type_info = page->u.inuse.type_info;
2442 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2443 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2445 MEM_LOG("Grant map attempted to update a non-L1 page");
2446 rc = GNTST_general_error;
2447 goto failed;
2450 ol1e = *(l1_pgentry_t *)va;
2451 if ( !update_l1e(va, ol1e, _nl1e) )
2453 put_page_type(page);
2454 rc = GNTST_general_error;
2455 goto failed;
2458 put_page_from_l1e(ol1e, d);
2460 if ( unlikely(shadow_mode_enabled(d)) )
2462 struct domain_mmap_cache sh_mapcache;
2463 domain_mmap_cache_init(&sh_mapcache);
2464 shadow_l1_normal_pt_update(d, pte_addr, _nl1e, &sh_mapcache);
2465 domain_mmap_cache_destroy(&sh_mapcache);
2468 put_page_type(page);
2470 failed:
2471 unmap_domain_page(va);
2472 put_page(page);
2473 return rc;
2476 static int destroy_grant_pte_mapping(
2477 unsigned long addr, unsigned long frame, struct domain *d)
2479 int rc = GNTST_okay;
2480 void *va;
2481 unsigned long gmfn, mfn;
2482 struct page_info *page;
2483 u32 type_info;
2484 l1_pgentry_t ol1e;
2486 ASSERT(!shadow_mode_refcounts(d));
2488 gmfn = addr >> PAGE_SHIFT;
2489 mfn = gmfn_to_mfn(d, gmfn);
2491 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2493 MEM_LOG("Could not get page for normal update");
2494 return GNTST_general_error;
2497 va = map_domain_page(mfn);
2498 va = (void *)((unsigned long)va + (addr & ~PAGE_MASK));
2499 page = mfn_to_page(mfn);
2501 type_info = page->u.inuse.type_info;
2502 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2503 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2505 MEM_LOG("Grant map attempted to update a non-L1 page");
2506 rc = GNTST_general_error;
2507 goto failed;
2510 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2512 put_page_type(page);
2513 rc = GNTST_general_error;
2514 goto failed;
2517 /* Check that the PTE being destroyed does in fact map the expected frame. */
2518 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2520 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2521 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2522 put_page_type(page);
2523 rc = GNTST_general_error;
2524 goto failed;
2527 /* Delete pagetable entry. */
2528 if ( unlikely(!update_l1e((l1_pgentry_t *)va, ol1e, l1e_empty())) )
2530 MEM_LOG("Cannot delete PTE entry at %p", va);
2531 put_page_type(page);
2532 rc = GNTST_general_error;
2533 goto failed;
2536 if ( unlikely(shadow_mode_enabled(d)) )
2538 struct domain_mmap_cache sh_mapcache;
2539 domain_mmap_cache_init(&sh_mapcache);
2540 shadow_l1_normal_pt_update(d, addr, l1e_empty(), &sh_mapcache);
2541 domain_mmap_cache_destroy(&sh_mapcache);
2544 put_page_type(page);
2546 failed:
2547 unmap_domain_page(va);
2548 put_page(page);
2549 return rc;
2553 static int create_grant_va_mapping(
2554 unsigned long va, l1_pgentry_t _nl1e, struct vcpu *v)
2556 l1_pgentry_t *pl1e, ol1e;
2557 struct domain *d = v->domain;
2559 ASSERT(spin_is_locked(&d->big_lock));
2560 ASSERT(!shadow_mode_refcounts(d));
2562 /*
2563 * This is actually overkill - we don't need to sync the L1 itself,
2564 * just everything involved in getting to this L1 (i.e. we need
2565 * linear_pg_table[l1_linear_offset(va)] to be in sync)...
2566 */
2567 __shadow_sync_va(v, va);
2569 pl1e = &linear_pg_table[l1_linear_offset(va)];
2571 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ||
2572 !update_l1e(pl1e, ol1e, _nl1e) )
2573 return GNTST_general_error;
2575 put_page_from_l1e(ol1e, d);
2577 if ( unlikely(shadow_mode_enabled(d)) )
2578 shadow_do_update_va_mapping(va, _nl1e, v);
2580 return GNTST_okay;
2583 static int destroy_grant_va_mapping(
2584 unsigned long addr, unsigned long frame)
2586 l1_pgentry_t *pl1e, ol1e;
2588 pl1e = &linear_pg_table[l1_linear_offset(addr)];
2590 if ( unlikely(__get_user(ol1e.l1, &pl1e->l1) != 0) )
2592 MEM_LOG("Could not find PTE entry for address %lx", addr);
2593 return GNTST_general_error;
2596 /*
2597 * Check that the virtual address supplied is actually mapped to
2598 * frame.
2599 */
2600 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2602 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2603 l1e_get_pfn(ol1e), addr, frame);
2604 return GNTST_general_error;
2607 /* Delete pagetable entry. */
2608 if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty())) )
2610 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2611 return GNTST_general_error;
2614 return 0;
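/*
 * Grant mappings come in two flavours, selected by GNTMAP_contains_pte:
 * either the caller supplies the machine address of the PTE to rewrite
 * (the *_pte variants above), or a linear address whose PTE is reached
 * through linear_pg_table (the *_va variants).
 */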
2617 int create_grant_host_mapping(
2618 unsigned long addr, unsigned long frame, unsigned int flags)
2620 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2622 if ( (flags & GNTMAP_application_map) )
2623 l1e_add_flags(pte,_PAGE_USER);
2624 if ( !(flags & GNTMAP_readonly) )
2625 l1e_add_flags(pte,_PAGE_RW);
2627 if ( flags & GNTMAP_contains_pte )
2628 return create_grant_pte_mapping(addr, pte, current);
2629 return create_grant_va_mapping(addr, pte, current);
2632 int destroy_grant_host_mapping(
2633 unsigned long addr, unsigned long frame, unsigned int flags)
2635 if ( flags & GNTMAP_contains_pte )
2636 return destroy_grant_pte_mapping(addr, frame, current->domain);
2637 return destroy_grant_va_mapping(addr, frame);
2640 int steal_page(
2641 struct domain *d, struct page_info *page, unsigned int memflags)
2643 u32 _d, _nd, x, y;
2645 spin_lock(&d->page_alloc_lock);
2647 /*
2648 * The tricky bit: atomically release ownership while there is just one
2649 * benign reference to the page (PGC_allocated). If that reference
2650 * disappears then the deallocation routine will safely spin.
2651 */
2652 _d = pickle_domptr(d);
2653 _nd = page->u.inuse._domain;
2654 y = page->count_info;
2655 do {
2656 x = y;
2657 if (unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2658 (1 | PGC_allocated)) || unlikely(_nd != _d)) {
2659 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2660 " caf=%08x, taf=%" PRtype_info "\n",
2661 (void *) page_to_mfn(page),
2662 d, d->domain_id, unpickle_domptr(_nd), x,
2663 page->u.inuse.type_info);
2664 spin_unlock(&d->page_alloc_lock);
2665 return -1;
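/*
 * count_info and u.inuse._domain are adjacent 32-bit fields, so one
 * cmpxchg8b on &page->count_info checks (count == 1|PGC_allocated,
 * owner == d) and clears the owner in a single atomic step: EDX:EAX is
 * the expected (_domain, count_info) pair, ECX:EBX the replacement with
 * only the owner changed to NULL (anonymous).
 */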
2667 __asm__ __volatile__(
2668 LOCK_PREFIX "cmpxchg8b %2"
2669 : "=d" (_nd), "=a" (y),
2670 "=m" (*(volatile u64 *)(&page->count_info))
2671 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2672 } while (unlikely(_nd != _d) || unlikely(y != x));
2674 /*
2675 * Unlink from 'd'. At least one reference remains (now anonymous), so
2676 * no one else is spinning to try to delete this page from 'd'.
2677 */
2678 if ( !(memflags & MEMF_no_refcount) )
2679 d->tot_pages--;
2680 list_del(&page->list);
2682 spin_unlock(&d->page_alloc_lock);
2684 return 0;
2687 int do_update_va_mapping(unsigned long va, u64 val64,
2688 unsigned long flags)
2690 l1_pgentry_t val = l1e_from_intpte(val64);
2691 struct vcpu *v = current;
2692 struct domain *d = v->domain;
2693 unsigned int cpu = smp_processor_id();
2694 unsigned long vmask, bmap_ptr;
2695 cpumask_t pmask;
2696 int rc = 0;
2698 perfc_incrc(calls_to_update_va);
2700 if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
2701 return -EINVAL;
2703 LOCK_BIGLOCK(d);
2705 cleanup_writable_pagetable(d);
2707 if ( unlikely(shadow_mode_enabled(d)) )
2708 check_pagetable(v, "pre-va"); /* debug */
2710 if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
2711 val)) )
2712 rc = -EINVAL;
2714 if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
2716 if ( unlikely(percpu_info[cpu].foreign &&
2717 (shadow_mode_translate(d) ||
2718 shadow_mode_translate(percpu_info[cpu].foreign))) )
2720 /*
2721 * The foreign domain's pfns are in a different namespace. There's
2722 * not enough information in just a gpte to figure out how to
2723 * (re-)shadow this entry.
2724 */
2725 domain_crash(d);
2728 rc = shadow_do_update_va_mapping(va, val, v);
2730 check_pagetable(v, "post-va"); /* debug */
2733 switch ( flags & UVMF_FLUSHTYPE_MASK )
2735 case UVMF_TLB_FLUSH:
2736 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2738 case UVMF_LOCAL:
2739 if ( unlikely(shadow_mode_enabled(d)) )
2740 shadow_sync_all(d);
2741 local_flush_tlb();
2742 break;
2743 case UVMF_ALL:
2744 flush_tlb_mask(d->domain_dirty_cpumask);
2745 break;
2746 default:
2747 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2748 rc = -EFAULT;
2749 pmask = vcpumask_to_pcpumask(d, vmask);
2750 flush_tlb_mask(pmask);
2751 break;
2753 break;
2755 case UVMF_INVLPG:
2756 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2758 case UVMF_LOCAL:
2759 if ( unlikely(shadow_mode_enabled(d)) )
2760 shadow_invlpg(current, va);
2761 local_flush_tlb_one(va);
2762 break;
2763 case UVMF_ALL:
2764 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
2765 break;
2766 default:
2767 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2768 rc = -EFAULT;
2769 pmask = vcpumask_to_pcpumask(d, vmask);
2770 flush_tlb_one_mask(pmask, va);
2771 break;
2773 break;
2776 process_deferred_ops(cpu);
2778 UNLOCK_BIGLOCK(d);
2780 return rc;
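/*
 * Illustrative guest-side usage (a sketch only, assuming the conventional
 * HYPERVISOR_update_va_mapping wrapper; not part of this file): to replace
 * the PTE for one of its own virtual addresses and invalidate just that
 * translation locally, a guest would issue
 *
 *     HYPERVISOR_update_va_mapping(va, new_pte, UVMF_INVLPG | UVMF_LOCAL);
 *
 * The UVMF_* flags select one of the flush paths in the switch above.
 */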
2783 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
2784 unsigned long flags,
2785 domid_t domid)
2787 unsigned int cpu = smp_processor_id();
2788 int rc;
2790 if ( unlikely(!IS_PRIV(current->domain)) )
2791 return -EPERM;
2793 if ( !set_foreigndom(cpu, domid) )
2794 return -ESRCH;
2796 rc = do_update_va_mapping(va, val64, flags);
2798 return rc;
2803 /*************************
2804 * Descriptor Tables
2805 */
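/*
 * Guests supply their own GDT frames. set_gdt() type-checks each frame as
 * PGT_gdt_page and maps it via the per-domain PTEs; do_update_descriptor()
 * vets every descriptor write with check_descriptor() so a guest cannot
 * install an unsafe segment.
 */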
2807 void destroy_gdt(struct vcpu *v)
2809 int i;
2810 unsigned long pfn;
2812 v->arch.guest_context.gdt_ents = 0;
2813 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
2815 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
2816 put_page_and_type(mfn_to_page(pfn));
2817 v->arch.perdomain_ptes[i] = l1e_empty();
2818 v->arch.guest_context.gdt_frames[i] = 0;
2823 long set_gdt(struct vcpu *v,
2824 unsigned long *frames,
2825 unsigned int entries)
2827 struct domain *d = v->domain;
2828 /* NB. There are 512 8-byte entries per GDT page. */
2829 int i, nr_pages = (entries + 511) / 512;
2830 unsigned long mfn;
2832 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2833 return -EINVAL;
2835 shadow_sync_all(d);
2837 /* Check the pages in the new GDT. */
2838 for ( i = 0; i < nr_pages; i++ ) {
2839 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
2840 if ( !mfn_valid(mfn) ||
2841 !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) )
2842 goto fail;
2845 /* Tear down the old GDT. */
2846 destroy_gdt(v);
2848 /* Install the new GDT. */
2849 v->arch.guest_context.gdt_ents = entries;
2850 for ( i = 0; i < nr_pages; i++ )
2852 v->arch.guest_context.gdt_frames[i] = frames[i];
2853 v->arch.perdomain_ptes[i] =
2854 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR);
2857 return 0;
2859 fail:
2860 while ( i-- > 0 )
2861 put_page_and_type(mfn_to_page(frames[i]));
2862 return -EINVAL;
2866 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
2868 int nr_pages = (entries + 511) / 512;
2869 unsigned long frames[16];
2870 long ret;
2872 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_guest(). */
2873 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2874 return -EINVAL;
2876 if ( copy_from_guest((unsigned long *)frames, frame_list, nr_pages) )
2877 return -EFAULT;
2879 LOCK_BIGLOCK(current->domain);
2881 if ( (ret = set_gdt(current, frames, entries)) == 0 )
2882 local_flush_tlb();
2884 UNLOCK_BIGLOCK(current->domain);
2886 return ret;
2890 long do_update_descriptor(u64 pa, u64 desc)
2892 struct domain *dom = current->domain;
2893 unsigned long gmfn = pa >> PAGE_SHIFT;
2894 unsigned long mfn;
2895 unsigned int offset;
2896 struct desc_struct *gdt_pent, d;
2897 struct page_info *page;
2898 long ret = -EINVAL;
2900 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
2902 *(u64 *)&d = desc;
2904 LOCK_BIGLOCK(dom);
2906 if ( !VALID_MFN(mfn = gmfn_to_mfn(dom, gmfn)) ||
2907 (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
2908 !mfn_valid(mfn) ||
2909 !check_descriptor(&d) )
2911 UNLOCK_BIGLOCK(dom);
2912 return -EINVAL;
2915 page = mfn_to_page(mfn);
2916 if ( unlikely(!get_page(page, dom)) )
2918 UNLOCK_BIGLOCK(dom);
2919 return -EINVAL;
2922 /* Check if the given frame is in use in an unsafe context. */
2923 switch ( page->u.inuse.type_info & PGT_type_mask )
2925 case PGT_gdt_page:
2926 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
2927 goto out;
2928 break;
2929 case PGT_ldt_page:
2930 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
2931 goto out;
2932 break;
2933 default:
2934 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2935 goto out;
2936 break;
2939 if ( shadow_mode_enabled(dom) )
2941 shadow_lock(dom);
2943 __mark_dirty(dom, mfn);
2945 if ( page_is_page_table(page) && !page_out_of_sync(page) )
2946 shadow_mark_mfn_out_of_sync(current, gmfn, mfn);
2949 /* All is good so make the update. */
2950 gdt_pent = map_domain_page(mfn);
2951 memcpy(&gdt_pent[offset], &d, 8);
2952 unmap_domain_page(gdt_pent);
2954 if ( shadow_mode_enabled(dom) )
2955 shadow_unlock(dom);
2957 put_page_type(page);
2959 ret = 0; /* success */
2961 out:
2962 put_page(page);
2964 UNLOCK_BIGLOCK(dom);
2966 return ret;
2969 typedef struct e820entry e820entry_t;
2970 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
2972 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
2974 switch ( op )
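/*
 * XENMEM_add_to_physmap plugs a Xen-provided frame (the shared-info page
 * or a grant-table frame) into a translated guest's pseudo-physical map at
 * the requested gpfn, displacing whatever was mapped there before.
 */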
2976 case XENMEM_add_to_physmap:
2978 struct xen_add_to_physmap xatp;
2979 unsigned long prev_mfn, mfn = 0, gpfn;
2980 struct domain *d;
2982 if ( copy_from_guest(&xatp, arg, 1) )
2983 return -EFAULT;
2985 if ( xatp.domid == DOMID_SELF )
2987 d = current->domain;
2988 get_knownalive_domain(d);
2990 else if ( !IS_PRIV(current->domain) )
2991 return -EPERM;
2992 else if ( (d = find_domain_by_id(xatp.domid)) == NULL )
2993 return -ESRCH;
2995 switch ( xatp.space )
2997 case XENMAPSPACE_shared_info:
2998 if ( xatp.idx == 0 )
2999 mfn = virt_to_mfn(d->shared_info);
3000 break;
3001 case XENMAPSPACE_grant_table:
3002 if ( xatp.idx < NR_GRANT_FRAMES )
3003 mfn = virt_to_mfn(d->grant_table->shared) + xatp.idx;
3004 break;
3005 default:
3006 break;
3009 if ( !shadow_mode_translate(d) || (mfn == 0) )
3011 put_domain(d);
3012 return -EINVAL;
3015 LOCK_BIGLOCK(d);
3017 /* Remove previously mapped page if it was present. */
3018 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3019 if ( mfn_valid(prev_mfn) )
3021 if ( IS_XEN_HEAP_FRAME(mfn_to_page(prev_mfn)) )
3022 /* Xen heap frames are simply unhooked from this phys slot. */
3023 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
3024 else
3025 /* Normal domain memory is freed, to avoid leaking memory. */
3026 guest_remove_page(d, xatp.gpfn);
3029 /* Unmap from old location, if any. */
3030 gpfn = get_gpfn_from_mfn(mfn);
3031 if ( gpfn != INVALID_M2P_ENTRY )
3032 guest_physmap_remove_page(d, gpfn, mfn);
3034 /* Map at new location. */
3035 guest_physmap_add_page(d, xatp.gpfn, mfn);
3037 UNLOCK_BIGLOCK(d);
3039 put_domain(d);
3041 break;
3044 case XENMEM_memory_map:
3046 return -ENOSYS;
3049 case XENMEM_machine_memory_map:
3051 struct xen_memory_map memmap;
3052 XEN_GUEST_HANDLE(e820entry_t) buffer;
3053 int count;
3055 if ( !IS_PRIV(current->domain) )
3056 return -EINVAL;
3058 if ( copy_from_guest(&memmap, arg, 1) )
3059 return -EFAULT;
3060 if ( memmap.nr_entries < e820.nr_map + 1 )
3061 return -EINVAL;
3063 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
3065 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
3066 if ( copy_to_guest(buffer, &e820.map[0], count) < 0 )
3067 return -EFAULT;
3069 memmap.nr_entries = count;
3071 if ( copy_to_guest(arg, &memmap, 1) )
3072 return -EFAULT;
3074 return 0;
3077 case XENMEM_machphys_mapping:
3079 struct xen_machphys_mapping mapping = {
3080 .v_start = MACH2PHYS_VIRT_START,
3081 .v_end = MACH2PHYS_VIRT_END,
3082 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
3083 };
3085 if ( copy_to_guest(arg, &mapping, 1) )
3086 return -EFAULT;
3088 return 0;
3091 default:
3092 return subarch_memory_op(op, arg);
3095 return 0;
3099 /*************************
3100 * Writable Pagetables
3101 */
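/*
 * Overview: when a guest write-faults on one of its own L1 pagetable pages,
 * ptwr_do_page_fault() snapshots the page, detaches it from the address
 * space if necessary, and temporarily grants the guest write access so a
 * burst of updates can proceed without trapping. ptwr_flush() later
 * re-write-protects the page and revalidate_l1() audits each entry against
 * the snapshot so that reference counts stay correct.
 */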
3103 #ifdef VVERBOSE
3104 int ptwr_debug = 0x0;
3105 #define PTWR_PRINTK(_f, _a...) \
3106 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
3107 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
3108 #else
3109 #define PTWR_PRINTK(_f, _a...) ((void)0)
3110 #endif
3113 #ifdef PERF_ARRAYS
3115 /**************** writable pagetables profiling functions *****************/
3117 #define ptwr_eip_buckets 256
3119 int ptwr_eip_stat_threshold[] = {1, 10, 50, 100, L1_PAGETABLE_ENTRIES};
3121 #define ptwr_eip_stat_thresholdN (sizeof(ptwr_eip_stat_threshold)/sizeof(int))
3123 typedef struct {
3124 unsigned long eip;
3125 domid_t id;
3126 u32 val[ptwr_eip_stat_thresholdN];
3127 } ptwr_eip_stat_t;
3129 ptwr_eip_stat_t ptwr_eip_stats[ptwr_eip_buckets];
3131 static inline unsigned int ptwr_eip_stat_hash( unsigned long eip, domid_t id )
3133 return (((unsigned long) id) ^ eip ^ (eip>>8) ^ (eip>>16) ^ (eip>>24)) %
3134 ptwr_eip_buckets;
3137 static void ptwr_eip_stat_inc(u32 *n)
3139 unsigned int i, j;
3141 if ( ++(*n) != 0 )
3142 return;
3144 *n = ~0;
3146 /* Re-scale all buckets. */
3147 for ( i = 0; i < ptwr_eip_buckets; i++ )
3148 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
3149 ptwr_eip_stats[i].val[j] >>= 1;
3152 static void ptwr_eip_stat_update(unsigned long eip, domid_t id, int modified)
3154 unsigned int i, j, b;
3156 i = b = ptwr_eip_stat_hash(eip, id);
3158 do
3160 if ( !ptwr_eip_stats[i].eip )
3162 /* doesn't exist */
3163 ptwr_eip_stats[i].eip = eip;
3164 ptwr_eip_stats[i].id = id;
3165 memset(ptwr_eip_stats[i].val, 0, sizeof(ptwr_eip_stats[i].val));
3168 if ( ptwr_eip_stats[i].eip == eip && ptwr_eip_stats[i].id == id)
3170 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
3171 if ( modified <= ptwr_eip_stat_threshold[j] )
3172 break;
3173 BUG_ON(j >= ptwr_eip_stat_thresholdN);
3174 ptwr_eip_stat_inc(&ptwr_eip_stats[i].val[j]);
3175 return;
3178 i = (i+1) % ptwr_eip_buckets;
3180 while ( i != b );
3182 printk("ptwr_eip_stat: too many EIPs in use!\n");
3184 ptwr_eip_stat_print();
3185 ptwr_eip_stat_reset();
3188 void ptwr_eip_stat_reset(void)
3190 memset(ptwr_eip_stats, 0, sizeof(ptwr_eip_stats));
3193 void ptwr_eip_stat_print(void)
3195 struct domain *e;
3196 domid_t d;
3197 unsigned int i, j;
3199 for_each_domain( e )
3201 d = e->domain_id;
3203 for ( i = 0; i < ptwr_eip_buckets; i++ )
3205 if ( !ptwr_eip_stats[i].eip || ptwr_eip_stats[i].id != d )
3206 continue;
3208 printk("D %5d eip %p ",
3209 ptwr_eip_stats[i].id, (void *)ptwr_eip_stats[i].eip);
3211 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
3212 printk("<=%u %4u \t",
3213 ptwr_eip_stat_threshold[j],
3214 ptwr_eip_stats[i].val[j]);
3215 printk("\n");
3220 #else /* PERF_ARRAYS */
3222 #define ptwr_eip_stat_update(eip, id, modified) ((void)0)
3224 #endif
3226 /*******************************************************************/
3228 /* Re-validate a given p.t. page, given its prior snapshot */
3229 int revalidate_l1(
3230 struct domain *d, l1_pgentry_t *l1page, l1_pgentry_t *snapshot)
3232 l1_pgentry_t ol1e, nl1e;
3233 int modified = 0, i;
3235 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3237 ol1e = snapshot[i];
3238 nl1e = l1page[i];
3240 if ( likely(l1e_get_intpte(ol1e) == l1e_get_intpte(nl1e)) )
3241 continue;
3243 /* Update number of entries modified. */
3244 modified++;
3246 /*
3247 * Fast path for PTEs that have merely been write-protected
3248 * (e.g., during a Unix fork()). A strict reduction in privilege.
3249 */
3250 if ( likely(l1e_get_intpte(ol1e) == (l1e_get_intpte(nl1e)|_PAGE_RW)) )
3252 if ( likely(l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3253 put_page_type(mfn_to_page(l1e_get_pfn(nl1e)));
3254 continue;
3257 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3259 /*
3260 * Make the remaining p.t. entries consistent before crashing, so that
3261 * the reference counts remain correct.
3262 */
3263 memcpy(&l1page[i], &snapshot[i],
3264 (L1_PAGETABLE_ENTRIES - i) * sizeof(l1_pgentry_t));
3266 /* Crash the offending domain. */
3267 MEM_LOG("ptwr: Could not revalidate l1 page");
3268 domain_crash(d);
3269 break;
3272 put_page_from_l1e(ol1e, d);
3275 return modified;
3279 /* Flush the given writable p.t. page and write-protect it again. */
3280 void ptwr_flush(struct domain *d, const int which)
3282 unsigned long l1va;
3283 l1_pgentry_t *pl1e, pte, *ptep;
3284 l2_pgentry_t *pl2e;
3285 unsigned int modified;
3287 #ifdef CONFIG_X86_64
3288 struct vcpu *v = current;
3289 int user_mode = !(v->arch.flags & TF_kernel_mode);
3290 #endif
3292 ASSERT(!shadow_mode_enabled(d));
3294 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
3295 /* Don't use write_ptbase: it may switch to guest_user on x86/64! */
3296 __write_ptbase(pagetable_get_pfn(
3297 d->arch.ptwr[which].vcpu->arch.guest_table));
3298 else
3299 TOGGLE_MODE();
3301 l1va = d->arch.ptwr[which].l1va;
3302 ptep = (l1_pgentry_t *)&linear_pg_table[l1_linear_offset(l1va)];
3304 /*
3305 * STEP 1. Write-protect the p.t. page so no more updates can occur.
3306 */
3308 if ( unlikely(__get_user(pte.l1, &ptep->l1)) )
3310 MEM_LOG("ptwr: Could not read pte at %p", ptep);
3311 /*
3312 * Really a bug: we were able to read this PTE during the initial fault,
3313 * and the pagetables cannot have changed in the meantime.
3314 */
3315 BUG();
3317 PTWR_PRINTK("[%c] disconnected_l1va at %p is %"PRIpte"\n",
3318 PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte));
3319 l1e_remove_flags(pte, _PAGE_RW);
3321 /* Write-protect the p.t. page in the guest page table. */
3322 if ( unlikely(__put_user(pte, ptep)) )
3324 MEM_LOG("ptwr: Could not update pte at %p", ptep);
3325 /*
3326 * Really a bug. We could write this PTE during the initial fault,
3327 * and pagetables can't have changed meantime.
3328 */
3329 BUG();
3332 /* Ensure that there are no stale writable mappings in any TLB. */
3333 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
3334 flush_tlb_one_mask(d->domain_dirty_cpumask, l1va);
3335 PTWR_PRINTK("[%c] disconnected_l1va at %p now %"PRIpte"\n",
3336 PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte));
3338 /*
3339 * STEP 2. Validate any modified PTEs.
3340 */
3342 if ( likely(d == current->domain) )
3344 pl1e = map_domain_page(l1e_get_pfn(pte));
3345 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
3346 unmap_domain_page(pl1e);
3347 perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
3348 ptwr_eip_stat_update(d->arch.ptwr[which].eip, d->domain_id, modified);
3349 d->arch.ptwr[which].prev_nr_updates = modified;
3351 else
3353 /*
3354 * Must make a temporary global mapping, since we are running in the
3355 * wrong address space, so no access to our own mapcache.
3356 */
3357 pl1e = map_domain_page_global(l1e_get_pfn(pte));
3358 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
3359 unmap_domain_page_global(pl1e);
3362 /*
3363 * STEP 3. Reattach the L1 p.t. page into the current address space.
3364 */
3366 if ( which == PTWR_PT_ACTIVE )
3368 pl2e = &__linear_l2_table[d->arch.ptwr[which].l2_idx];
3369 l2e_add_flags(*pl2e, _PAGE_PRESENT);
3372 /*
3373 * STEP 4. Final tidy-up.
3374 */
3376 d->arch.ptwr[which].l1va = 0;
3378 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
3379 write_ptbase(current);
3380 else
3381 TOGGLE_MODE();
3384 static int ptwr_emulated_update(
3385 unsigned long addr,
3386 paddr_t old,
3387 paddr_t val,
3388 unsigned int bytes,
3389 unsigned int do_cmpxchg)
3391 unsigned long pfn, l1va;
3392 struct page_info *page;
3393 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3394 struct domain *d = current->domain;
3396 /* Aligned access only, thank you. */
3397 if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) )
3399 MEM_LOG("ptwr_emulate: Unaligned or bad size ptwr access (%d, %lx)",
3400 bytes, addr);
3401 return X86EMUL_UNHANDLEABLE;
3404 /* Turn a sub-word access into a full-word access. */
3405 if ( bytes != sizeof(paddr_t) )
3407 paddr_t full;
3408 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
3410 /* Align address; read full word. */
3411 addr &= ~(sizeof(paddr_t)-1);
3412 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
3414 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
3415 return X86EMUL_PROPAGATE_FAULT;
3417 /* Mask out bits provided by caller. */
3418 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3419 /* Shift the caller value and OR in the missing bits. */
3420 val &= (((paddr_t)1 << (bytes*8)) - 1);
3421 val <<= (offset)*8;
3422 val |= full;
3423 /* Also fill in missing parts of the cmpxchg old value. */
3424 old &= (((paddr_t)1 << (bytes*8)) - 1);
3425 old <<= (offset)*8;
3426 old |= full;
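/*
 * Worked example (sizeof(paddr_t) == 8): a 2-byte write of 0xbeef at byte
 * offset 2 reads the containing 8-byte word, clears bits 16..31, shifts
 * the caller's value left by 16 and ORs it in, producing a whole-PTE value
 * (and matching cmpxchg old value) for the checks below.
 */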
3429 #if 0 /* XXX KAF: I don't think this can happen. */
3430 /*
3431 * We must not emulate an update to a PTE that is temporarily marked
3432 * writable by the batched ptwr logic, else we can corrupt page refcnts!
3433 */
3434 if ( ((l1va = d->arch.ptwr[PTWR_PT_ACTIVE].l1va) != 0) &&
3435 (l1_linear_offset(l1va) == l1_linear_offset(addr)) )
3436 ptwr_flush(d, PTWR_PT_ACTIVE);
3437 if ( ((l1va = d->arch.ptwr[PTWR_PT_INACTIVE].l1va) != 0) &&
3438 (l1_linear_offset(l1va) == l1_linear_offset(addr)) )
3439 ptwr_flush(d, PTWR_PT_INACTIVE);
3440 #else
3441 BUG_ON(((l1va = d->arch.ptwr[PTWR_PT_ACTIVE].l1va) != 0) &&
3442 (l1_linear_offset(l1va) == l1_linear_offset(addr)));
3443 BUG_ON(((l1va = d->arch.ptwr[PTWR_PT_INACTIVE].l1va) != 0) &&
3444 (l1_linear_offset(l1va) == l1_linear_offset(addr)));
3445 #endif
3447 /* Read the PTE that maps the page being updated. */
3448 if ( __copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
3449 sizeof(pte)) )
3451 MEM_LOG("ptwr_emulate: Cannot read thru linear_pg_table");
3452 return X86EMUL_UNHANDLEABLE;
3455 pfn = l1e_get_pfn(pte);
3456 page = mfn_to_page(pfn);
3458 /* We are looking only for read-only mappings of p.t. pages. */
3459 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3460 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3461 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3462 ASSERT(page_get_owner(page) == d);
3464 /* Check the new PTE. */
3465 nl1e = l1e_from_intpte(val);
3466 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3468 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3469 return X86EMUL_UNHANDLEABLE;
3472 /* Checked successfully: do the update (write or cmpxchg). */
3473 pl1e = map_domain_page(page_to_mfn(page));
3474 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3475 if ( do_cmpxchg )
3477 ol1e = l1e_from_intpte(old);
3478 if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
3480 unmap_domain_page(pl1e);
3481 put_page_from_l1e(nl1e, d);
3482 return X86EMUL_CMPXCHG_FAILED;
3485 else
3487 ol1e = *pl1e;
3488 if ( !update_l1e(pl1e, ol1e, nl1e) )
3489 BUG();
3491 unmap_domain_page(pl1e);
3493 /* Finally, drop the old PTE. */
3494 put_page_from_l1e(ol1e, d);
3496 return X86EMUL_CONTINUE;
3499 static int ptwr_emulated_write(
3500 unsigned long addr,
3501 unsigned long val,
3502 unsigned int bytes,
3503 struct x86_emulate_ctxt *ctxt)
3505 return ptwr_emulated_update(addr, 0, val, bytes, 0);
3508 static int ptwr_emulated_cmpxchg(
3509 unsigned long addr,
3510 unsigned long old,
3511 unsigned long new,
3512 unsigned int bytes,
3513 struct x86_emulate_ctxt *ctxt)
3515 return ptwr_emulated_update(addr, old, new, bytes, 1);
3518 static int ptwr_emulated_cmpxchg8b(
3519 unsigned long addr,
3520 unsigned long old,
3521 unsigned long old_hi,
3522 unsigned long new,
3523 unsigned long new_hi,
3524 struct x86_emulate_ctxt *ctxt)
3526 if ( CONFIG_PAGING_LEVELS == 2 )
3527 return X86EMUL_UNHANDLEABLE;
3528 else
3529 return ptwr_emulated_update(
3530 addr, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1);
3533 static struct x86_emulate_ops ptwr_emulate_ops = {
3534 .read_std = x86_emulate_read_std,
3535 .write_std = x86_emulate_write_std,
3536 .read_emulated = x86_emulate_read_std,
3537 .write_emulated = ptwr_emulated_write,
3538 .cmpxchg_emulated = ptwr_emulated_cmpxchg,
3539 .cmpxchg8b_emulated = ptwr_emulated_cmpxchg8b
3540 };
3542 /* Write page fault handler: check if guest is trying to modify a PTE. */
3543 int ptwr_do_page_fault(struct domain *d, unsigned long addr,
3544 struct cpu_user_regs *regs)
3546 unsigned long pfn;
3547 struct page_info *page;
3548 l1_pgentry_t *pl1e, pte;
3549 l2_pgentry_t *pl2e, l2e;
3550 int which, flags;
3551 unsigned long l2_idx;
3552 struct x86_emulate_ctxt emul_ctxt;
3554 ASSERT(!shadow_mode_enabled(d));
3556 /*
3557 * Attempt to read the PTE that maps the VA being accessed. By checking for
3558 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
3559 * NB. The L2 entry cannot be detached due to existing ptwr work: the
3560 * caller already checked that.
3561 */
3562 pl2e = &__linear_l2_table[l2_linear_offset(addr)];
3563 if ( __copy_from_user(&l2e, pl2e, sizeof(l2e)) ||
3564 !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
3565 __copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
3566 sizeof(pte)) )
3568 return 0;
3571 pfn = l1e_get_pfn(pte);
3572 page = mfn_to_page(pfn);
3574 #ifdef CONFIG_X86_64
3575 #define WRPT_PTE_FLAGS (_PAGE_RW | _PAGE_PRESENT | _PAGE_USER)
3576 #else
3577 #define WRPT_PTE_FLAGS (_PAGE_RW | _PAGE_PRESENT)
3578 #endif
3580 /*
3581 * Check the required flags for a valid wrpt mapping. If the page is
3582 * already writable then we can return straight to the guest (SMP race).
3583 * We decide whether or not to propagate the fault by testing for write
3584 * permission in the page directories, which we do by writing back through the linear mapping.
3585 */
3586 if ( (flags = l1e_get_flags(pte) & WRPT_PTE_FLAGS) == WRPT_PTE_FLAGS )
3587 return __put_user(
3588 pte.l1, &linear_pg_table[l1_linear_offset(addr)].l1) ?
3589 0 : EXCRET_not_a_fault;
3591 /* We are looking only for read-only mappings of p.t. pages. */
3592 if ( ((flags | _PAGE_RW) != WRPT_PTE_FLAGS) ||
3593 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3594 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3595 (page_get_owner(page) != d) )
3597 return 0;
3600 #if 0 /* Leave this in as useful for debugging */
3601 goto emulate;
3602 #endif
3604 PTWR_PRINTK("ptwr_page_fault on l1 pt at va %lx, pfn %lx, eip %lx\n",
3605 addr, pfn, (unsigned long)regs->eip);
3607 /* Get the L2 index at which this L1 p.t. is always mapped. */
3608 l2_idx = page->u.inuse.type_info & PGT_va_mask;
3609 if ( unlikely(l2_idx >= PGT_va_unknown) )
3610 goto emulate; /* Urk! This L1 is mapped in multiple L2 slots! */
3611 l2_idx >>= PGT_va_shift;
3613 if ( unlikely(l2_idx == l2_linear_offset(addr)) )
3614 goto emulate; /* Urk! Pagetable maps itself! */
3616 /*
3617 * Is the L1 p.t. mapped into the current address space? If so we call it
3618 * an ACTIVE p.t., otherwise it is INACTIVE.
3619 */
3620 pl2e = &__linear_l2_table[l2_idx];
3621 which = PTWR_PT_INACTIVE;
3623 if ( (__get_user(l2e.l2, &pl2e->l2) == 0) && (l2e_get_pfn(l2e) == pfn) )
3625 /*
3626 * Check the PRESENT bit to set ACTIVE mode.
3627 * If the PRESENT bit is clear, we may be conflicting with the current
3628 * ACTIVE p.t. (it may be the same p.t. mapped at another virt addr).
3629 * The ptwr_flush call below will restore the PRESENT bit.
3630 */
3631 if ( likely(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
3632 (d->arch.ptwr[PTWR_PT_ACTIVE].l1va &&
3633 (l2_idx == d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx)) )
3634 which = PTWR_PT_ACTIVE;
3637 /*
3638 * Multi-processor guest? Then ensure that the page table is hooked into
3639 * at most one L2, and also ensure that there is only one mapping of the
3640 * page table itself (or there can be conflicting writable mappings from
3641 * other VCPUs).
3642 */
3643 if ( d->vcpu[0]->next_in_list != NULL )
3645 if ( /* Hooked into at most one L2 table (which this VCPU maps)? */
3646 ((page->u.inuse.type_info & PGT_count_mask) !=
3647 (!!(page->u.inuse.type_info & PGT_pinned) +
3648 (which == PTWR_PT_ACTIVE))) ||
3649 /* PTEs are mapped read-only in only one place? */
3650 ((page->count_info & PGC_count_mask) !=
3651 (!!(page->count_info & PGC_allocated) + /* alloc count */
3652 (page->u.inuse.type_info & PGT_count_mask) + /* type count */
3653 1)) ) /* map count */
3655 /* Could be conflicting writable mappings from other VCPUs. */
3656 cleanup_writable_pagetable(d);
3657 goto emulate;
3661 /*
3662 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
3663 * time. If there is already one, we must flush it out.
3664 */
3665 if ( d->arch.ptwr[which].l1va )
3666 ptwr_flush(d, which);
3668 /*
3669 * If last batch made no updates then we are probably stuck. Emulate this
3670 * update to ensure we make progress.
3671 */
3672 if ( d->arch.ptwr[which].prev_nr_updates == 0 )
3674 /* Ensure that we don't get stuck in an emulation-only rut. */
3675 d->arch.ptwr[which].prev_nr_updates = 1;
3676 goto emulate;
3679 PTWR_PRINTK("[%c] batched ptwr_page_fault at va %lx, pt for %08lx, "
3680 "pfn %lx\n", PTWR_PRINT_WHICH, addr,
3681 l2_idx << L2_PAGETABLE_SHIFT, pfn);
3683 /* For safety, disconnect the L1 p.t. page from current space. */
3684 if ( which == PTWR_PT_ACTIVE )
3686 l2e_remove_flags(l2e, _PAGE_PRESENT);
3687 if ( unlikely(__copy_to_user(pl2e, &l2e, sizeof(l2e))) )
3689 MEM_LOG("ptwr: Could not unhook l2e at %p", pl2e);
3690 domain_crash(d);
3691 return 0;
3693 flush_tlb_mask(d->domain_dirty_cpumask);
3696 /* Temporarily map the L1 page, and make a copy of it. */
3697 pl1e = map_domain_page(pfn);
3698 memcpy(d->arch.ptwr[which].page, pl1e, PAGE_SIZE);
3699 unmap_domain_page(pl1e);
3701 /* Finally, make the p.t. page writable by the guest OS. */
3702 l1e_add_flags(pte, _PAGE_RW);
3703 if ( unlikely(__put_user(pte.l1,
3704 &linear_pg_table[l1_linear_offset(addr)].l1)) )
3706 MEM_LOG("ptwr: Could not update pte at %p",
3707 &linear_pg_table[l1_linear_offset(addr)]);
3708 domain_crash(d);
3709 return 0;
3712 /*
3713 * Now record the writable pagetable state *after* any accesses that can
3714 * cause a recursive page fault (i.e., those via the *_user() accessors).
3715 * Otherwise we can enter ptwr_flush() with half-done ptwr state.
3716 */
3717 d->arch.ptwr[which].l1va = addr | 1;
3718 d->arch.ptwr[which].l2_idx = l2_idx;
3719 d->arch.ptwr[which].vcpu = current;
3720 #ifdef PERF_ARRAYS
3721 d->arch.ptwr[which].eip = regs->eip;
3722 #endif
3724 return EXCRET_fault_fixed;
3726 emulate:
3727 emul_ctxt.regs = guest_cpu_user_regs();
3728 emul_ctxt.cr2 = addr;
3729 emul_ctxt.mode = X86EMUL_MODE_HOST;
3730 if ( x86_emulate_memop(&emul_ctxt, &ptwr_emulate_ops) )
3731 return 0;
3732 perfc_incrc(ptwr_emulations);
3733 return EXCRET_fault_fixed;
3736 int ptwr_init(struct domain *d)
3738 void *x = alloc_xenheap_page();
3739 void *y = alloc_xenheap_page();
3741 if ( (x == NULL) || (y == NULL) )
3743 free_xenheap_page(x);
3744 free_xenheap_page(y);
3745 return -ENOMEM;
3748 d->arch.ptwr[PTWR_PT_ACTIVE].page = x;
3749 d->arch.ptwr[PTWR_PT_INACTIVE].page = y;
3751 return 0;
3754 void ptwr_destroy(struct domain *d)
3756 LOCK_BIGLOCK(d);
3757 cleanup_writable_pagetable(d);
3758 UNLOCK_BIGLOCK(d);
3759 free_xenheap_page(d->arch.ptwr[PTWR_PT_ACTIVE].page);
3760 free_xenheap_page(d->arch.ptwr[PTWR_PT_INACTIVE].page);
3763 void cleanup_writable_pagetable(struct domain *d)
3765 if ( unlikely(!VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
3766 return;
3768 if ( unlikely(shadow_mode_enabled(d)) )
3770 shadow_sync_all(d);
3772 else
3774 if ( d->arch.ptwr[PTWR_PT_ACTIVE].l1va )
3775 ptwr_flush(d, PTWR_PT_ACTIVE);
3776 if ( d->arch.ptwr[PTWR_PT_INACTIVE].l1va )
3777 ptwr_flush(d, PTWR_PT_INACTIVE);
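/*
 * Map nr_mfns machine frames, starting at mfn, at virtual address virt in
 * Xen's own page tables. Superpage (PSE) mappings are used whenever the
 * alignment and remaining length allow, unless MAP_SMALL_PAGES forces
 * 4kB mappings, as the memguard code below requires.
 */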
3781 int map_pages_to_xen(
3782 unsigned long virt,
3783 unsigned long mfn,
3784 unsigned long nr_mfns,
3785 unsigned long flags)
3787 l2_pgentry_t *pl2e, ol2e;
3788 l1_pgentry_t *pl1e, ol1e;
3789 unsigned int i;
3791 unsigned int map_small_pages = !!(flags & MAP_SMALL_PAGES);
3792 flags &= ~MAP_SMALL_PAGES;
3794 while ( nr_mfns != 0 )
3796 pl2e = virt_to_xen_l2e(virt);
3798 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3799 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
3800 !map_small_pages )
3802 /* Super-page mapping. */
3803 ol2e = *pl2e;
3804 *pl2e = l2e_from_pfn(mfn, flags|_PAGE_PSE);
3806 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3808 local_flush_tlb_pge();
3809 if ( !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3810 free_xen_pagetable(l2e_get_page(ol2e));
3813 virt += 1UL << L2_PAGETABLE_SHIFT;
3814 mfn += 1UL << PAGETABLE_ORDER;
3815 nr_mfns -= 1UL << PAGETABLE_ORDER;
3817 else
3819 /* Normal page mapping. */
3820 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3822 pl1e = page_to_virt(alloc_xen_pagetable());
3823 clear_page(pl1e);
3824 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3826 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3828 pl1e = page_to_virt(alloc_xen_pagetable());
3829 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3830 pl1e[i] = l1e_from_pfn(
3831 l2e_get_pfn(*pl2e) + i,
3832 l2e_get_flags(*pl2e) & ~_PAGE_PSE);
3833 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3834 local_flush_tlb_pge();
3837 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3838 ol1e = *pl1e;
3839 *pl1e = l1e_from_pfn(mfn, flags);
3840 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3841 local_flush_tlb_one(virt);
3843 virt += 1UL << L1_PAGETABLE_SHIFT;
3844 mfn += 1UL;
3845 nr_mfns -= 1UL;
3849 return 0;
3852 void __set_fixmap(
3853 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
3855 BUG_ON(idx >= __end_of_fixed_addresses);
3856 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
3859 #ifdef MEMORY_GUARD
3861 void memguard_init(void)
3863 map_pages_to_xen(
3864 PAGE_OFFSET, 0, xenheap_phys_end >> PAGE_SHIFT,
3865 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
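/*
 * Guarding a range remaps it with 4kB granularity but without
 * _PAGE_PRESENT, so any stray access faults immediately; unguarding
 * restores the normal hypervisor mapping.
 */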
3868 static void __memguard_change_range(void *p, unsigned long l, int guard)
3870 unsigned long _p = (unsigned long)p;
3871 unsigned long _l = (unsigned long)l;
3872 unsigned long flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3874 /* Ensure we are dealing with a page-aligned whole number of pages. */
3875 ASSERT((_p&PAGE_MASK) != 0);
3876 ASSERT((_l&PAGE_MASK) != 0);
3877 ASSERT((_p&~PAGE_MASK) == 0);
3878 ASSERT((_l&~PAGE_MASK) == 0);
3880 if ( guard )
3881 flags &= ~_PAGE_PRESENT;
3883 map_pages_to_xen(
3884 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3887 void memguard_guard_range(void *p, unsigned long l)
3889 __memguard_change_range(p, l, 1);
3892 void memguard_unguard_range(void *p, unsigned long l)
3894 __memguard_change_range(p, l, 0);
3897 #endif
3899 void memguard_guard_stack(void *p)
3901 BUILD_BUG_ON((DEBUG_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
3902 p = (void *)((unsigned long)p + STACK_SIZE - DEBUG_STACK_SIZE - PAGE_SIZE);
3903 memguard_guard_range(p, PAGE_SIZE);
3906 /*
3907 * Local variables:
3908 * mode: C
3909 * c-set-style: "BSD"
3910 * c-basic-offset: 4
3911 * tab-width: 4
3912 * indent-tabs-mode: nil
3913 * End:
3914 */