
view xen/arch/x86/mm.c @ 19826:2f9e1348aa98

x86_64: allow more vCPU-s per guest

Since the shared info layout is fixed, guests are required to use
VCPUOP_register_vcpu_info prior to booting any vCPU beyond the
traditional limit of 32.

MAX_VIRT_CPUS, being an implementation detail of the hypervisor, is no
longer exposed in the public headers.

The tools changes are clearly incomplete (and done only so things would
build again), and the current state of the tools (using scalar variables
all over the place to represent vCPU bitmaps) very likely doesn't permit
booting DomU-s with more than the traditional number of vCPU-s. Testing
of the extended functionality was done with Dom0 (96 vCPU-s, as well as
128 vCPU-s out of which the kernel elected - by way of a simple kernel
side patch - to use only some, resulting in a sparse bitmap).

The ia64 changes are only there to make things build, and are
build-tested only (the tools part only as far as the build would go
without encountering unrelated problems in the blktap code).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Jun 18 10:14:16 2009 +0100 (2009-06-18)
parents cecc76506afc
children 01ae7dc043ba
line source
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
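Illustration only, not part of this file: a PV guest typically drives the interface described above by batching (ptr, val) pairs into a single mmu_update hypercall. struct mmu_update and MMU_NORMAL_PT_UPDATE come from the public Xen interface, HYPERVISOR_mmu_update is the usual guest-side wrapper, and the addresses and values below are invented for the example.

    /* Guest-side sketch: request two PTE updates in one batch. */
    struct mmu_update req[2];
    req[0].ptr = pte0_maddr | MMU_NORMAL_PT_UPDATE; /* request "*ptr = val" */
    req[0].val = new_pte0;
    req[1].ptr = pte1_maddr | MMU_NORMAL_PT_UPDATE;
    req[1].val = new_pte1;
    if ( HYPERVISOR_mmu_update(req, 2, NULL, DOMID_SELF) < 0 )
        /* the hypervisor refused one of the updates */;

The validation and reference counting described in the comment above are applied by the hypervisor while it processes such a batch.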
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <asm/shared.h>
112 #include <public/memory.h>
113 #include <xsm/xsm.h>
114 #include <xen/trace.h>
116 /*
117 * Mapping of first 2 or 4 megabytes of memory. This is mapped with 4kB
118 * mappings to avoid type conflicts with fixed-range MTRRs covering the
119 * lowest megabyte of physical memory. In any case the VGA hole should be
120 * mapped with type UC.
121 */
122 l1_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
123 l1_identmap[L1_PAGETABLE_ENTRIES];
125 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
127 /*
128 * PTE updates can be done with ordinary writes except:
129 * 1. Debug builds get extra checking by using CMPXCHG[8B].
130 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
131 */
132 #if !defined(NDEBUG) || defined(__i386__)
133 #define PTE_UPDATE_WITH_CMPXCHG
134 #endif
136 /* Used to defer flushing of memory structures. */
137 struct percpu_mm_info {
138 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
139 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
140 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
141 unsigned int deferred_ops;
142 /* If non-NULL, specifies a foreign subject domain for some operations. */
143 struct domain *foreign;
144 };
145 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
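Illustration only (hypothetical snippet, not part of this file): hypercall paths accumulate work in this per-CPU structure and act on it once at the end of a batch, along the lines of:

    /* Defer a local TLB flush while a batch of updates is applied... */
    this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
    /* ...and honour it once the batch is done. */
    if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_TLB )
        flush_tlb_local();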
147 /*
148 * Returns the current foreign domain; defaults to the currently-executing
149 * domain if a foreign override hasn't been specified.
150 */
151 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
153 /* Private domain structs for DOMID_XEN and DOMID_IO. */
154 struct domain *dom_xen, *dom_io;
156 /* Frame table and its size in pages. */
157 struct page_info *__read_mostly frame_table;
158 unsigned long max_page;
159 unsigned long total_pages;
161 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
163 int opt_allow_hugepage;
164 boolean_param("allowhugepage", opt_allow_hugepage);
166 #define l1_disallow_mask(d) \
167 ((d != dom_io) && \
168 (rangeset_is_empty((d)->iomem_caps) && \
169 rangeset_is_empty((d)->arch.ioport_caps) && \
170 !has_arch_pdevs(d)) ? \
171 L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
173 #ifdef CONFIG_COMPAT
174 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
175 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
176 L3_DISALLOW_MASK : \
177 COMPAT_L3_DISALLOW_MASK)
178 #else
179 #define l3_disallow_mask(d) L3_DISALLOW_MASK
180 #endif
182 void __init init_frametable(void)
183 {
184 unsigned long nr_pages, page_step, i, mfn;
186 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
188 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
189 page_step = 1 << (cpu_has_page1gb ? L3_PAGETABLE_SHIFT - PAGE_SHIFT
190 : L2_PAGETABLE_SHIFT - PAGE_SHIFT);
192 for ( i = 0; i < nr_pages; i += page_step )
193 {
194 /*
195 * The hardcoded 4 below is arbitrary - just pick whatever you think
196 * is reasonable to waste as a trade-off for using a large page.
197 */
198 while (nr_pages + 4 - i < page_step)
199 page_step >>= PAGETABLE_ORDER;
200 mfn = alloc_boot_pages(page_step, page_step);
201 if ( mfn == 0 )
202 panic("Not enough memory for frame table\n");
203 map_pages_to_xen(
204 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
205 mfn, page_step, PAGE_HYPERVISOR);
206 }
208 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
209 }
211 void __init arch_init_memory(void)
212 {
213 extern void subarch_init_memory(void);
215 unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn;
217 /*
218 * Initialise our DOMID_XEN domain.
219 * Any Xen-heap pages that we will allow to be mapped will have
220 * their domain field set to dom_xen.
221 */
222 dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
223 BUG_ON(dom_xen == NULL);
225 /*
226 * Initialise our DOMID_IO domain.
227 * This domain owns I/O pages that are within the range of the page_info
228 * array. Mappings occur at the priv of the caller.
229 */
230 dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
231 BUG_ON(dom_io == NULL);
233 /* First 1MB of RAM is historically marked as I/O. */
234 for ( i = 0; i < 0x100; i++ )
235 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
237 /* Any areas not specified as RAM by the e820 map are considered I/O. */
238 for ( i = 0, pfn = 0; pfn < max_page; i++ )
239 {
240 while ( (i < e820.nr_map) &&
241 (e820.map[i].type != E820_RAM) &&
242 (e820.map[i].type != E820_UNUSABLE) )
243 i++;
245 if ( i >= e820.nr_map )
246 {
247 /* No more RAM regions: mark as I/O right to end of memory map. */
248 rstart_pfn = rend_pfn = max_page;
249 }
250 else
251 {
252 /* Mark as I/O just up as far as next RAM region. */
253 rstart_pfn = min_t(unsigned long, max_page,
254 PFN_UP(e820.map[i].addr));
255 rend_pfn = max_t(unsigned long, rstart_pfn,
256 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
257 }
259 /*
260 * Make sure any Xen mappings of RAM holes above 1MB are blown away.
261 * In particular this ensures that RAM holes are respected even in
262 * the statically-initialised 1-16MB mapping area.
263 */
264 iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT));
265 ioend_pfn = rstart_pfn;
266 #if defined(CONFIG_X86_32)
267 ioend_pfn = min_t(unsigned long, ioend_pfn,
268 DIRECTMAP_MBYTES << (20 - PAGE_SHIFT));
269 #endif
270 if ( iostart_pfn < ioend_pfn )
271 destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn),
272 (unsigned long)mfn_to_virt(ioend_pfn));
274 /* Mark as I/O up to next RAM region. */
275 for ( ; pfn < rstart_pfn; pfn++ )
276 {
277 BUG_ON(!mfn_valid(pfn));
278 share_xen_page_with_guest(
279 mfn_to_page(pfn), dom_io, XENSHARE_writable);
280 }
282 /* Skip the RAM region. */
283 pfn = rend_pfn;
284 }
286 subarch_init_memory();
287 }
289 int page_is_ram_type(unsigned long mfn, unsigned long mem_type)
290 {
291 uint64_t maddr = pfn_to_paddr(mfn);
292 int i;
294 for ( i = 0; i < e820.nr_map; i++ )
295 {
296 switch ( e820.map[i].type )
297 {
298 case E820_RAM:
299 if ( mem_type & RAM_TYPE_CONVENTIONAL )
300 break;
301 continue;
302 case E820_RESERVED:
303 if ( mem_type & RAM_TYPE_RESERVED )
304 break;
305 continue;
306 case E820_UNUSABLE:
307 if ( mem_type & RAM_TYPE_UNUSABLE )
308 break;
309 continue;
310 case E820_ACPI:
311 case E820_NVS:
312 if ( mem_type & RAM_TYPE_ACPI )
313 break;
314 continue;
315 default:
316 /* unknown */
317 continue;
318 }
320 /* Test the range. */
321 if ( (e820.map[i].addr <= maddr) &&
322 ((e820.map[i].addr + e820.map[i].size) >= (maddr + PAGE_SIZE)) )
323 return 1;
324 }
326 return 0;
327 }
329 unsigned long domain_get_maximum_gpfn(struct domain *d)
330 {
331 if ( is_hvm_domain(d) )
332 return d->arch.p2m->max_mapped_pfn;
333 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
334 return arch_get_max_pfn(d) - 1;
335 }
337 void share_xen_page_with_guest(
338 struct page_info *page, struct domain *d, int readonly)
339 {
340 if ( page_get_owner(page) == d )
341 return;
343 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
345 spin_lock(&d->page_alloc_lock);
347 /* The incremented type count pins as writable or read-only. */
348 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
349 page->u.inuse.type_info |= PGT_validated | 1;
351 page_set_owner(page, d);
352 wmb(); /* install valid domain ptr before updating refcnt. */
353 ASSERT((page->count_info & ~PGC_xen_heap) == 0);
355 /* Only add to the allocation list if the domain isn't dying. */
356 if ( !d->is_dying )
357 {
358 page->count_info |= PGC_allocated | 1;
359 if ( unlikely(d->xenheap_pages++ == 0) )
360 get_knownalive_domain(d);
361 page_list_add_tail(page, &d->xenpage_list);
362 }
364 spin_unlock(&d->page_alloc_lock);
365 }
367 void share_xen_page_with_privileged_guests(
368 struct page_info *page, int readonly)
369 {
370 share_xen_page_with_guest(page, dom_xen, readonly);
371 }
373 #if defined(__i386__)
375 #ifdef NDEBUG
376 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
377 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
378 #else
379 /*
380 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
381 * We cannot safely shadow the idle page table, nor shadow page tables
382 * (detected by zero reference count). As required for correctness, we
383 * always shadow PDPTs above 4GB.
384 */
385 #define l3tab_needs_shadow(mfn) \
386 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
387 (mfn_to_page(mfn)->count_info & PGC_count_mask) && \
388 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
389 ((mfn) >= 0x100000))
390 #endif
392 static l1_pgentry_t *fix_pae_highmem_pl1e;
394 /* Cache the address of PAE high-memory fixmap page tables. */
395 static int __init cache_pae_fixmap_address(void)
396 {
397 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
398 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
399 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
400 return 0;
401 }
402 __initcall(cache_pae_fixmap_address);
404 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
406 void make_cr3(struct vcpu *v, unsigned long mfn)
407 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
408 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
409 {
410 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
411 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
412 unsigned int cpu = smp_processor_id();
414 /* Fast path: does this mfn need a shadow at all? */
415 if ( !l3tab_needs_shadow(mfn) )
416 {
417 v->arch.cr3 = mfn << PAGE_SHIFT;
418 /* Cache is no longer in use or valid */
419 cache->high_mfn = 0;
420 return;
421 }
423 /* Caching logic is not interrupt safe. */
424 ASSERT(!in_irq());
426 /* Protects against pae_flush_pgd(). */
427 spin_lock(&cache->lock);
429 cache->inuse_idx ^= 1;
430 cache->high_mfn = mfn;
432 /* Map the guest L3 table and copy to the chosen low-memory cache. */
433 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
434 /* First check the previous high mapping can't be in the TLB.
435 * (i.e. have we loaded CR3 since we last did this?) */
436 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
437 flush_tlb_one_local(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
438 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
439 lowmem_l3tab = cache->table[cache->inuse_idx];
440 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
441 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
442 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
444 v->arch.cr3 = __pa(lowmem_l3tab);
446 spin_unlock(&cache->lock);
447 }
449 #else /* !defined(__i386__) */
451 void make_cr3(struct vcpu *v, unsigned long mfn)
452 {
453 v->arch.cr3 = mfn << PAGE_SHIFT;
454 }
456 #endif /* !defined(__i386__) */
458 void write_ptbase(struct vcpu *v)
459 {
460 write_cr3(v->arch.cr3);
461 }
463 /*
464 * Should be called after CR3 is updated.
465 *
466 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
467 * for HVM guests, arch.monitor_table and hvm's guest CR3.
468 *
469 * Update ref counts to shadow tables appropriately.
470 */
471 void update_cr3(struct vcpu *v)
472 {
473 unsigned long cr3_mfn=0;
475 if ( paging_mode_enabled(v->domain) )
476 {
477 paging_update_cr3(v);
478 return;
479 }
481 #if CONFIG_PAGING_LEVELS == 4
482 if ( !(v->arch.flags & TF_kernel_mode) )
483 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
484 else
485 #endif
486 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
488 make_cr3(v, cr3_mfn);
489 }
492 static void invalidate_shadow_ldt(struct vcpu *v, int flush)
493 {
494 int i;
495 unsigned long pfn;
496 struct page_info *page;
498 BUG_ON(unlikely(in_irq()));
500 spin_lock(&v->arch.shadow_ldt_lock);
502 if ( v->arch.shadow_ldt_mapcnt == 0 )
503 goto out;
505 v->arch.shadow_ldt_mapcnt = 0;
507 for ( i = 16; i < 32; i++ )
508 {
509 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
510 if ( pfn == 0 ) continue;
511 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
512 page = mfn_to_page(pfn);
513 ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page);
514 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
515 put_page_and_type(page);
516 }
518 /* Rid TLBs of stale mappings (guest mappings and shadow mappings). */
519 if ( flush )
520 flush_tlb_mask(&v->vcpu_dirty_cpumask);
522 out:
523 spin_unlock(&v->arch.shadow_ldt_lock);
524 }
527 static int alloc_segdesc_page(struct page_info *page)
528 {
529 struct desc_struct *descs;
530 int i;
532 descs = map_domain_page(page_to_mfn(page));
534 for ( i = 0; i < 512; i++ )
535 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
536 goto fail;
538 unmap_domain_page(descs);
539 return 0;
541 fail:
542 unmap_domain_page(descs);
543 return -EINVAL;
544 }
547 /* Map shadow page at offset @off. */
548 int map_ldt_shadow_page(unsigned int off)
549 {
550 struct vcpu *v = current;
551 struct domain *d = v->domain;
552 unsigned long gmfn, mfn;
553 l1_pgentry_t l1e, nl1e;
554 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
555 int okay;
557 BUG_ON(unlikely(in_irq()));
559 guest_get_eff_kern_l1e(v, gva, &l1e);
560 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
561 return 0;
563 gmfn = l1e_get_pfn(l1e);
564 mfn = gmfn_to_mfn(d, gmfn);
565 if ( unlikely(!mfn_valid(mfn)) )
566 return 0;
568 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page);
569 if ( unlikely(!okay) )
570 return 0;
572 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
574 spin_lock(&v->arch.shadow_ldt_lock);
575 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
576 v->arch.shadow_ldt_mapcnt++;
577 spin_unlock(&v->arch.shadow_ldt_lock);
579 return 1;
580 }
583 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
584 {
585 struct page_info *page = mfn_to_page(page_nr);
587 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
588 {
589 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
590 return 0;
591 }
593 return 1;
594 }
597 static int get_page_and_type_from_pagenr(unsigned long page_nr,
598 unsigned long type,
599 struct domain *d,
600 int partial,
601 int preemptible)
602 {
603 struct page_info *page = mfn_to_page(page_nr);
604 int rc;
606 if ( likely(partial >= 0) &&
607 unlikely(!get_page_from_pagenr(page_nr, d)) )
608 return -EINVAL;
610 rc = (preemptible ?
611 get_page_type_preemptible(page, type) :
612 (get_page_type(page, type) ? 0 : -EINVAL));
614 if ( unlikely(rc) && partial >= 0 )
615 put_page(page);
617 return rc;
618 }
620 static int get_data_page(
621 struct page_info *page, struct domain *d, int writeable)
622 {
623 int rc;
625 if ( writeable )
626 rc = get_page_and_type(page, d, PGT_writable_page);
627 else
628 rc = get_page(page, d);
630 return rc;
631 }
633 static void put_data_page(
634 struct page_info *page, int writeable)
635 {
636 if ( writeable )
637 put_page_and_type(page);
638 else
639 put_page(page);
640 }
642 /*
643 * We allow root tables to map each other (a.k.a. linear page tables). It
644 * needs some special care with reference counts and access permissions:
645 * 1. The mapping entry must be read-only, or the guest may get write access
646 * to its own PTEs.
647 * 2. We must only bump the reference counts for an *already validated*
648 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
649 * on a validation that is required to complete that validation.
650 * 3. We only need to increment the reference counts for the mapped page
651 * frame if it is mapped by a different root table. This is sufficient and
652 * also necessary to allow validation of a root table mapping itself.
653 */
654 #define define_get_linear_pagetable(level) \
655 static int \
656 get_##level##_linear_pagetable( \
657 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
658 { \
659 unsigned long x, y; \
660 struct page_info *page; \
661 unsigned long pfn; \
662 \
663 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
664 { \
665 MEM_LOG("Attempt to create linear p.t. with write perms"); \
666 return 0; \
667 } \
668 \
669 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
670 { \
671 /* Make sure the mapped frame belongs to the correct domain. */ \
672 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
673 return 0; \
674 \
675 /* \
676 * Ensure that the mapped frame is an already-validated page table. \
677 * If so, atomically increment the count (checking for overflow). \
678 */ \
679 page = mfn_to_page(pfn); \
680 y = page->u.inuse.type_info; \
681 do { \
682 x = y; \
683 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
684 unlikely((x & (PGT_type_mask|PGT_validated)) != \
685 (PGT_##level##_page_table|PGT_validated)) ) \
686 { \
687 put_page(page); \
688 return 0; \
689 } \
690 } \
691 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
692 } \
693 \
694 return 1; \
695 }
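To make rule 1 above concrete, here is a hypothetical guest-side example of establishing a linear mapping on x86-64: an L4 slot of the guest's top-level table is pointed back at that same table with a read-only entry. The slot index, MFN and flag names are illustrative.

    /* Sketch: self-referencing, read-only L4 entry => linear page tables. */
    struct mmu_update req;
    uint64_t l4_maddr = (uint64_t)l4_mfn << PAGE_SHIFT;
    req.ptr = l4_maddr + LINEAR_SLOT * sizeof(uint64_t); /* address of the slot */
    req.val = l4_maddr | _PAGE_PRESENT;                  /* note: no _PAGE_RW */
    HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);

Because the mapped frame is the same frame being modified, rule 3 means no extra reference is taken; had _PAGE_RW been set, get_l4_linear_pagetable() would refuse the entry.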
698 int is_iomem_page(unsigned long mfn)
699 {
700 struct page_info *page;
702 if ( !mfn_valid(mfn) )
703 return 1;
705 /* Caller must know that it is an iomem page, or a reference is held. */
706 page = mfn_to_page(mfn);
707 ASSERT((page->count_info & PGC_count_mask) != 0);
709 return (page_get_owner(page) == dom_io);
710 }
712 static void update_xen_mappings(unsigned long mfn, unsigned long cacheattr)
713 {
714 #ifdef __x86_64__
715 bool_t alias = mfn >= PFN_DOWN(xen_phys_start) &&
716 mfn < PFN_UP(xen_phys_start + (unsigned long)_end - XEN_VIRT_START);
717 unsigned long xen_va =
718 XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT);
720 if ( unlikely(alias) && cacheattr )
721 map_pages_to_xen(xen_va, mfn, 1, 0);
722 map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
723 PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
724 if ( unlikely(alias) && !cacheattr )
725 map_pages_to_xen(xen_va, mfn, 1, PAGE_HYPERVISOR);
726 #endif
727 }
729 int
730 get_page_from_l1e(
731 l1_pgentry_t l1e, struct domain *l1e_owner, struct domain *pg_owner)
732 {
733 unsigned long mfn = l1e_get_pfn(l1e);
734 struct page_info *page = mfn_to_page(mfn);
735 uint32_t l1f = l1e_get_flags(l1e);
736 struct vcpu *curr = current;
737 struct domain *real_pg_owner;
739 if ( !(l1f & _PAGE_PRESENT) )
740 return 1;
742 if ( unlikely(l1f & l1_disallow_mask(l1e_owner)) )
743 {
744 MEM_LOG("Bad L1 flags %x", l1f & l1_disallow_mask(l1e_owner));
745 return 0;
746 }
748 if ( !mfn_valid(mfn) ||
749 (real_pg_owner = page_get_owner_and_reference(page)) == dom_io )
750 {
751 /* Only needed the reference to confirm dom_io ownership. */
752 if ( mfn_valid(mfn) )
753 put_page(page);
755 /* DOMID_IO reverts to caller for privilege checks. */
756 if ( pg_owner == dom_io )
757 pg_owner = curr->domain;
759 if ( !iomem_access_permitted(pg_owner, mfn, mfn) )
760 {
761 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
762 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
763 pg_owner->domain_id, mfn);
764 return 0;
765 }
767 return 1;
768 }
770 if ( real_pg_owner == NULL )
771 goto could_not_pin;
773 if ( unlikely(real_pg_owner != pg_owner) )
774 {
775 /*
776 * Let privileged domains transfer the right to map their target
777 * domain's pages. This is used to allow stub-domain pvfb export to
778 * dom0, until pvfb supports granted mappings. At that time this
779 * minor hack can go away.
780 */
781 if ( (pg_owner == l1e_owner) || !IS_PRIV_FOR(pg_owner, real_pg_owner) )
782 goto could_not_pin;
783 pg_owner = real_pg_owner;
784 }
786 /* Foreign mappings into guests in shadow external mode don't
787 * contribute to writeable mapping refcounts. (This allows the
788 * qemu-dm helper process in dom0 to map the domain's memory without
789 * messing up the count of "real" writable mappings.) */
790 if ( (l1f & _PAGE_RW) &&
791 ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) &&
792 !get_page_type(page, PGT_writable_page) )
793 goto could_not_pin;
795 if ( pte_flags_to_cacheattr(l1f) !=
796 ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) )
797 {
798 unsigned long x, nx, y = page->count_info;
799 unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
801 if ( is_xen_heap_page(page) )
802 {
803 if ( (l1f & _PAGE_RW) &&
804 ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) )
805 put_page_type(page);
806 put_page(page);
807 MEM_LOG("Attempt to change cache attributes of Xen heap page");
808 return 0;
809 }
811 while ( ((y & PGC_cacheattr_mask) >> PGC_cacheattr_base) != cacheattr )
812 {
813 x = y;
814 nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
815 y = cmpxchg(&page->count_info, x, nx);
816 }
818 update_xen_mappings(mfn, cacheattr);
819 }
821 return 1;
823 could_not_pin:
824 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
825 " for l1e_owner=%d, pg_owner=%d",
826 mfn, get_gpfn_from_mfn(mfn),
827 l1e_get_intpte(l1e), l1e_owner->domain_id, pg_owner->domain_id);
828 if ( real_pg_owner != NULL )
829 put_page(page);
830 return 0;
831 }
834 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
835 define_get_linear_pagetable(l2);
836 static int
837 get_page_from_l2e(
838 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
839 {
840 unsigned long mfn = l2e_get_pfn(l2e);
841 int rc;
843 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
844 return 1;
846 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
847 {
848 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
849 return -EINVAL;
850 }
852 if ( !(l2e_get_flags(l2e) & _PAGE_PSE) )
853 {
854 rc = get_page_and_type_from_pagenr(mfn, PGT_l1_page_table, d, 0, 0);
855 if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
856 rc = 0;
857 }
858 else if ( !opt_allow_hugepage || (mfn & (L1_PAGETABLE_ENTRIES-1)) )
859 {
860 rc = -EINVAL;
861 }
862 else
863 {
864 unsigned long m = mfn;
865 int writeable = !!(l2e_get_flags(l2e) & _PAGE_RW);
867 do {
868 rc = get_data_page(mfn_to_page(m), d, writeable);
869 if ( unlikely(!rc) )
870 {
871 while ( m-- > mfn )
872 put_data_page(mfn_to_page(m), writeable);
873 return -EINVAL;
874 }
875 } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
876 }
878 return rc;
879 }
882 define_get_linear_pagetable(l3);
883 static int
884 get_page_from_l3e(
885 l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible)
886 {
887 int rc;
889 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
890 return 1;
892 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
893 {
894 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
895 return -EINVAL;
896 }
898 rc = get_page_and_type_from_pagenr(
899 l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible);
900 if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
901 rc = 0;
903 return rc;
904 }
906 #if CONFIG_PAGING_LEVELS >= 4
907 define_get_linear_pagetable(l4);
908 static int
909 get_page_from_l4e(
910 l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible)
911 {
912 int rc;
914 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
915 return 1;
917 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
918 {
919 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
920 return -EINVAL;
921 }
923 rc = get_page_and_type_from_pagenr(
924 l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible);
925 if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
926 rc = 0;
928 return rc;
929 }
930 #endif /* 4 level */
932 #ifdef __x86_64__
934 #ifdef USER_MAPPINGS_ARE_GLOBAL
935 #define adjust_guest_l1e(pl1e, d) \
936 do { \
937 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
938 likely(!is_pv_32on64_domain(d)) ) \
939 { \
940 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
941 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
942 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
943 MEM_LOG("Global bit is set to kernel page %lx", \
944 l1e_get_pfn((pl1e))); \
945 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
946 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
947 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
948 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
949 } \
950 } while ( 0 )
951 #else
952 #define adjust_guest_l1e(pl1e, d) \
953 do { \
954 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
955 likely(!is_pv_32on64_domain(d)) ) \
956 l1e_add_flags((pl1e), _PAGE_USER); \
957 } while ( 0 )
958 #endif
960 #define adjust_guest_l2e(pl2e, d) \
961 do { \
962 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
963 likely(!is_pv_32on64_domain(d)) ) \
964 l2e_add_flags((pl2e), _PAGE_USER); \
965 } while ( 0 )
967 #define adjust_guest_l3e(pl3e, d) \
968 do { \
969 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
970 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
971 _PAGE_USER : \
972 _PAGE_USER|_PAGE_RW); \
973 } while ( 0 )
975 #define adjust_guest_l4e(pl4e, d) \
976 do { \
977 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
978 likely(!is_pv_32on64_domain(d)) ) \
979 l4e_add_flags((pl4e), _PAGE_USER); \
980 } while ( 0 )
982 #else /* !defined(__x86_64__) */
984 #define adjust_guest_l1e(_p, _d) ((void)(_d))
985 #define adjust_guest_l2e(_p, _d) ((void)(_d))
986 #define adjust_guest_l3e(_p, _d) ((void)(_d))
988 #endif
990 #ifdef CONFIG_COMPAT
991 #define unadjust_guest_l3e(pl3e, d) \
992 do { \
993 if ( unlikely(is_pv_32on64_domain(d)) && \
994 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
995 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
996 } while ( 0 )
997 #else
998 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
999 #endif
1001 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
1003 unsigned long pfn = l1e_get_pfn(l1e);
1004 struct page_info *page;
1005 struct domain *pg_owner;
1006 struct vcpu *v;
1008 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(pfn) )
1009 return;
1011 page = mfn_to_page(pfn);
1012 pg_owner = page_get_owner(page);
1014 /*
1015 * Check if this is a mapping that was established via a grant reference.
1016 * If it was then we should not be here: we require that such mappings are
1017 * explicitly destroyed via the grant-table interface.
1019 * The upshot of this is that the guest can end up with active grants that
1020 * it cannot destroy (because it no longer has a PTE to present to the
1021 * grant-table interface). This can lead to subtle hard-to-catch bugs,
1022 * hence a special grant PTE flag can be enabled to catch the bug early.
1024 * (Note that the undestroyable active grants are not a security hole in
1025 * Xen. All active grants can safely be cleaned up when the domain dies.)
1026 */
1027 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
1028 !l1e_owner->is_shutting_down && !l1e_owner->is_dying )
1030 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
1031 l1e_get_intpte(l1e));
1032 domain_crash(l1e_owner);
1035 /* Remember we didn't take a type-count of foreign writable mappings
1036 * to paging-external domains */
1037 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
1038 ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) )
1040 put_page_and_type(page);
1042 else
1044 /* We expect this is rare so we blow the entire shadow LDT. */
1045 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
1046 PGT_seg_desc_page)) &&
1047 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
1048 (l1e_owner == pg_owner) )
1050 for_each_vcpu ( pg_owner, v )
1051 invalidate_shadow_ldt(v, 1);
1053 put_page(page);
1058 /*
1059 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
1060 * Note also that this automatically deals correctly with linear p.t.'s.
1061 */
1062 static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
1064 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
1065 return 1;
1067 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1069 unsigned long mfn = l2e_get_pfn(l2e), m = mfn;
1070 int writeable = l2e_get_flags(l2e) & _PAGE_RW;
1072 ASSERT(!(mfn & (L1_PAGETABLE_ENTRIES-1)));
1073 do {
1074 put_data_page(mfn_to_page(m), writeable);
1075 } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
1077 else
1079 put_page_and_type(l2e_get_page(l2e));
1082 return 0;
1085 static int __put_page_type(struct page_info *, int preemptible);
1087 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
1088 int partial, int preemptible)
1090 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
1091 return 1;
1093 #ifdef __x86_64__
1094 if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) )
1096 unsigned long mfn = l3e_get_pfn(l3e);
1097 int writeable = l3e_get_flags(l3e) & _PAGE_RW;
1099 ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
1100 do {
1101 put_data_page(mfn_to_page(mfn), writeable);
1102 } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
1104 return 0;
1106 #endif
1108 if ( unlikely(partial > 0) )
1109 return __put_page_type(l3e_get_page(l3e), preemptible);
1111 return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
1114 #if CONFIG_PAGING_LEVELS >= 4
1115 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
1116 int partial, int preemptible)
1118 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
1119 (l4e_get_pfn(l4e) != pfn) )
1121 if ( unlikely(partial > 0) )
1122 return __put_page_type(l4e_get_page(l4e), preemptible);
1123 return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
1125 return 1;
1127 #endif
1129 static int alloc_l1_table(struct page_info *page)
1131 struct domain *d = page_get_owner(page);
1132 unsigned long pfn = page_to_mfn(page);
1133 l1_pgentry_t *pl1e;
1134 unsigned int i;
1136 pl1e = map_domain_page(pfn);
1138 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1140 if ( is_guest_l1_slot(i) &&
1141 unlikely(!get_page_from_l1e(pl1e[i], d, d)) )
1142 goto fail;
1144 adjust_guest_l1e(pl1e[i], d);
1147 unmap_domain_page(pl1e);
1148 return 0;
1150 fail:
1151 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
1152 while ( i-- > 0 )
1153 if ( is_guest_l1_slot(i) )
1154 put_page_from_l1e(pl1e[i], d);
1156 unmap_domain_page(pl1e);
1157 return -EINVAL;
1160 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
1162 struct page_info *page;
1163 l2_pgentry_t *pl2e;
1164 l3_pgentry_t l3e3;
1165 #ifndef CONFIG_COMPAT
1166 l2_pgentry_t l2e;
1167 int i;
1168 #endif
1170 if ( !is_pv_32bit_domain(d) )
1171 return 1;
1173 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
1175 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
1176 l3e3 = pl3e[3];
1177 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
1179 MEM_LOG("PAE L3 3rd slot is empty");
1180 return 0;
1183 /*
1184 * The Xen-private mappings include linear mappings. The L2 thus cannot
1185 * be shared by multiple L3 tables. The test here is adequate because:
1186 * 1. Cannot appear in slots != 3 because get_page_type() checks the
1187 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
1188 * 2. Cannot appear in another page table's L3:
1189 * a. alloc_l3_table() calls this function and this check will fail
1190 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
1191 */
1192 page = l3e_get_page(l3e3);
1193 BUG_ON(page->u.inuse.type_info & PGT_pinned);
1194 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1195 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1196 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1198 MEM_LOG("PAE L3 3rd slot is shared");
1199 return 0;
1202 /* Xen private mappings. */
1203 pl2e = map_domain_page(l3e_get_pfn(l3e3));
1204 #ifndef CONFIG_COMPAT
1205 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1206 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1207 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1208 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1210 l2e = l2e_from_page(perdomain_pt_page(d, i), __PAGE_HYPERVISOR);
1211 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
1213 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
1215 l2e = l2e_empty();
1216 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
1217 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
1218 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
1220 #else
1221 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1222 &compat_idle_pg_table_l2[
1223 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1224 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
1225 #endif
1226 unmap_domain_page(pl2e);
1228 return 1;
1231 #ifdef __i386__
1232 /* Flush a pgdir update into low-memory caches. */
1233 static void pae_flush_pgd(
1234 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
1236 struct domain *d = page_get_owner(mfn_to_page(mfn));
1237 struct vcpu *v;
1238 intpte_t _ol3e, _nl3e, _pl3e;
1239 l3_pgentry_t *l3tab_ptr;
1240 struct pae_l3_cache *cache;
1242 if ( unlikely(shadow_mode_enabled(d)) )
1244 cpumask_t m = CPU_MASK_NONE;
1245 /* Re-shadow this l3 table on any vcpus that are using it */
1246 for_each_vcpu ( d, v )
1247 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
1249 paging_update_cr3(v);
1250 cpus_or(m, m, v->vcpu_dirty_cpumask);
1252 flush_tlb_mask(&m);
1255 /* If below 4GB then the pgdir is not shadowed in low memory. */
1256 if ( !l3tab_needs_shadow(mfn) )
1257 return;
1259 for_each_vcpu ( d, v )
1261 cache = &v->arch.pae_l3_cache;
1263 spin_lock(&cache->lock);
1265 if ( cache->high_mfn == mfn )
1267 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1268 _ol3e = l3e_get_intpte(*l3tab_ptr);
1269 _nl3e = l3e_get_intpte(nl3e);
1270 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1271 BUG_ON(_pl3e != _ol3e);
1274 spin_unlock(&cache->lock);
1277 flush_tlb_mask(&d->domain_dirty_cpumask);
1279 #else
1280 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1281 #endif
1283 static int alloc_l2_table(struct page_info *page, unsigned long type,
1284 int preemptible)
1286 struct domain *d = page_get_owner(page);
1287 unsigned long pfn = page_to_mfn(page);
1288 l2_pgentry_t *pl2e;
1289 unsigned int i;
1290 int rc = 0;
1292 pl2e = map_domain_page(pfn);
1294 for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
1296 if ( preemptible && i && hypercall_preempt_check() )
1298 page->nr_validated_ptes = i;
1299 rc = -EAGAIN;
1300 break;
1303 if ( !is_guest_l2_slot(d, type, i) ||
1304 (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
1305 continue;
1307 if ( rc < 0 )
1309 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1310 while ( i-- > 0 )
1311 if ( is_guest_l2_slot(d, type, i) )
1312 put_page_from_l2e(pl2e[i], pfn);
1313 break;
1316 adjust_guest_l2e(pl2e[i], d);
1319 unmap_domain_page(pl2e);
1320 return rc > 0 ? 0 : rc;
1323 static int alloc_l3_table(struct page_info *page, int preemptible)
1325 struct domain *d = page_get_owner(page);
1326 unsigned long pfn = page_to_mfn(page);
1327 l3_pgentry_t *pl3e;
1328 unsigned int i;
1329 int rc = 0, partial = page->partial_pte;
1331 #if CONFIG_PAGING_LEVELS == 3
1332 /*
1333 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1334 * the weird 'extended cr3' format for dealing with high-order address
1335 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1336 */
1337 if ( (pfn >= 0x100000) &&
1338 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1339 d->vcpu && d->vcpu[0] && d->vcpu[0]->is_initialised )
1341 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1342 return -EINVAL;
1344 #endif
1346 pl3e = map_domain_page(pfn);
1348 /*
1349 * PAE guests allocate full pages, but aren't required to initialize
1350 * more than the first four entries; when running in compatibility
1351 * mode, however, the full page is visible to the MMU, and hence all
1352 * 512 entries must be valid/verified, which is most easily achieved
1353 * by clearing them out.
1354 */
1355 if ( is_pv_32on64_domain(d) )
1356 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1358 for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
1359 i++, partial = 0 )
1361 if ( is_pv_32bit_domain(d) && (i == 3) )
1363 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1364 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
1365 rc = -EINVAL;
1366 else
1367 rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1368 PGT_l2_page_table |
1369 PGT_pae_xen_l2,
1370 d, partial, preemptible);
1372 else if ( !is_guest_l3_slot(i) ||
1373 (rc = get_page_from_l3e(pl3e[i], pfn, d,
1374 partial, preemptible)) > 0 )
1375 continue;
1377 if ( rc == -EAGAIN )
1379 page->nr_validated_ptes = i;
1380 page->partial_pte = partial ?: 1;
1382 else if ( rc == -EINTR && i )
1384 page->nr_validated_ptes = i;
1385 page->partial_pte = 0;
1386 rc = -EAGAIN;
1388 if ( rc < 0 )
1389 break;
1391 adjust_guest_l3e(pl3e[i], d);
1394 if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
1395 rc = -EINVAL;
1396 if ( rc < 0 && rc != -EAGAIN && rc != -EINTR )
1398 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1399 while ( i-- > 0 )
1401 if ( !is_guest_l3_slot(i) )
1402 continue;
1403 unadjust_guest_l3e(pl3e[i], d);
1404 put_page_from_l3e(pl3e[i], pfn, 0, 0);
1408 unmap_domain_page(pl3e);
1409 return rc > 0 ? 0 : rc;
1412 #if CONFIG_PAGING_LEVELS >= 4
1413 static int alloc_l4_table(struct page_info *page, int preemptible)
1415 struct domain *d = page_get_owner(page);
1416 unsigned long pfn = page_to_mfn(page);
1417 l4_pgentry_t *pl4e = page_to_virt(page);
1418 unsigned int i;
1419 int rc = 0, partial = page->partial_pte;
1421 for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
1422 i++, partial = 0 )
1424 if ( !is_guest_l4_slot(d, i) ||
1425 (rc = get_page_from_l4e(pl4e[i], pfn, d,
1426 partial, preemptible)) > 0 )
1427 continue;
1429 if ( rc == -EAGAIN )
1431 page->nr_validated_ptes = i;
1432 page->partial_pte = partial ?: 1;
1434 else if ( rc == -EINTR )
1436 if ( i )
1438 page->nr_validated_ptes = i;
1439 page->partial_pte = 0;
1440 rc = -EAGAIN;
1443 else if ( rc < 0 )
1445 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1446 while ( i-- > 0 )
1447 if ( is_guest_l4_slot(d, i) )
1448 put_page_from_l4e(pl4e[i], pfn, 0, 0);
1450 if ( rc < 0 )
1451 return rc;
1453 adjust_guest_l4e(pl4e[i], d);
1456 /* Xen private mappings. */
1457 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1458 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1459 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1460 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1461 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1462 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1463 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1464 __PAGE_HYPERVISOR);
1466 return rc > 0 ? 0 : rc;
1468 #else
1469 #define alloc_l4_table(page, preemptible) (-EINVAL)
1470 #endif
1473 static void free_l1_table(struct page_info *page)
1475 struct domain *d = page_get_owner(page);
1476 unsigned long pfn = page_to_mfn(page);
1477 l1_pgentry_t *pl1e;
1478 unsigned int i;
1480 pl1e = map_domain_page(pfn);
1482 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1483 if ( is_guest_l1_slot(i) )
1484 put_page_from_l1e(pl1e[i], d);
1486 unmap_domain_page(pl1e);
1490 static int free_l2_table(struct page_info *page, int preemptible)
1492 #ifdef CONFIG_COMPAT
1493 struct domain *d = page_get_owner(page);
1494 #endif
1495 unsigned long pfn = page_to_mfn(page);
1496 l2_pgentry_t *pl2e;
1497 unsigned int i = page->nr_validated_ptes - 1;
1498 int err = 0;
1500 pl2e = map_domain_page(pfn);
1502 ASSERT(page->nr_validated_ptes);
1503 do {
1504 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
1505 put_page_from_l2e(pl2e[i], pfn) == 0 &&
1506 preemptible && i && hypercall_preempt_check() )
1508 page->nr_validated_ptes = i;
1509 err = -EAGAIN;
1511 } while ( !err && i-- );
1513 unmap_domain_page(pl2e);
1515 if ( !err )
1516 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1518 return err;
1521 static int free_l3_table(struct page_info *page, int preemptible)
1523 struct domain *d = page_get_owner(page);
1524 unsigned long pfn = page_to_mfn(page);
1525 l3_pgentry_t *pl3e;
1526 int rc = 0, partial = page->partial_pte;
1527 unsigned int i = page->nr_validated_ptes - !partial;
1529 pl3e = map_domain_page(pfn);
1531 do {
1532 if ( is_guest_l3_slot(i) )
1534 rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible);
1535 if ( rc < 0 )
1536 break;
1537 partial = 0;
1538 if ( rc > 0 )
1539 continue;
1540 unadjust_guest_l3e(pl3e[i], d);
1542 } while ( i-- );
1544 unmap_domain_page(pl3e);
1546 if ( rc == -EAGAIN )
1548 page->nr_validated_ptes = i;
1549 page->partial_pte = partial ?: -1;
1551 else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
1553 page->nr_validated_ptes = i + 1;
1554 page->partial_pte = 0;
1555 rc = -EAGAIN;
1557 return rc > 0 ? 0 : rc;
1560 #if CONFIG_PAGING_LEVELS >= 4
1561 static int free_l4_table(struct page_info *page, int preemptible)
1563 struct domain *d = page_get_owner(page);
1564 unsigned long pfn = page_to_mfn(page);
1565 l4_pgentry_t *pl4e = page_to_virt(page);
1566 int rc = 0, partial = page->partial_pte;
1567 unsigned int i = page->nr_validated_ptes - !partial;
1569 do {
1570 if ( is_guest_l4_slot(d, i) )
1571 rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible);
1572 if ( rc < 0 )
1573 break;
1574 partial = 0;
1575 } while ( i-- );
1577 if ( rc == -EAGAIN )
1579 page->nr_validated_ptes = i;
1580 page->partial_pte = partial ?: -1;
1582 else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
1584 page->nr_validated_ptes = i + 1;
1585 page->partial_pte = 0;
1586 rc = -EAGAIN;
1588 return rc > 0 ? 0 : rc;
1590 #else
1591 #define free_l4_table(page, preemptible) (-EINVAL)
1592 #endif
1594 static int page_lock(struct page_info *page)
1595 {
1596 unsigned long x, nx;
1598 do {
1599 while ( (x = page->u.inuse.type_info) & PGT_locked )
1600 cpu_relax();
1601 nx = x + (1 | PGT_locked);
1602 if ( !(x & PGT_validated) ||
1603 !(x & PGT_count_mask) ||
1604 !(nx & PGT_count_mask) )
1605 return 0;
1606 } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
1608 return 1;
1609 }
1611 static void page_unlock(struct page_info *page)
1612 {
1613 unsigned long x, nx, y = page->u.inuse.type_info;
1615 do {
1616 x = y;
1617 nx = x - (1 | PGT_locked);
1618 } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
1619 }
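A usage sketch (variable names invented for illustration): code that edits a guest page-table page directly is expected to bracket the modification with these helpers, so the type count keeps the page alive and PGT_locked serialises concurrent writers.

    if ( page_lock(pt_page) )
    {
        l1e_write(pl1e, nl1e);   /* edit one entry of the locked page */
        page_unlock(pt_page);
    }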
1621 /* How to write an entry to the guest pagetables.
1622 * Returns 0 for failure (pointer not valid), 1 for success. */
1623 static inline int update_intpte(intpte_t *p,
1624 intpte_t old,
1625 intpte_t new,
1626 unsigned long mfn,
1627 struct vcpu *v,
1628 int preserve_ad)
1630 int rv = 1;
1631 #ifndef PTE_UPDATE_WITH_CMPXCHG
1632 if ( !preserve_ad )
1634 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1636 else
1637 #endif
1639 intpte_t t = old;
1640 for ( ; ; )
1642 intpte_t _new = new;
1643 if ( preserve_ad )
1644 _new |= old & (_PAGE_ACCESSED | _PAGE_DIRTY);
1646 rv = paging_cmpxchg_guest_entry(v, p, &t, _new, _mfn(mfn));
1647 if ( unlikely(rv == 0) )
1649 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1650 ": saw %" PRIpte, old, _new, t);
1651 break;
1654 if ( t == old )
1655 break;
1657 /* Allowed to change in Accessed/Dirty flags only. */
1658 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1660 old = t;
1663 return rv;
1666 /* Macro that wraps the appropriate type-changes around update_intpte().
1667 * Arguments are: type, ptr, old, new, mfn, vcpu */
1668 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad) \
1669 update_intpte(&_t ## e_get_intpte(*(_p)), \
1670 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1671 (_m), (_v), (_ad))
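For reference, an invocation such as UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, ad) expands to update_intpte(&l1e_get_intpte(*(pl1e)), l1e_get_intpte(ol1e), l1e_get_intpte(nl1e), (mfn), (v), (ad)), so every page-table level funnels through the single intpte-based helper above.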
1673 /* Update the L1 entry at pl1e to new value nl1e. */
1674 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1675 unsigned long gl1mfn, int preserve_ad,
1676 struct vcpu *vcpu)
1678 l1_pgentry_t ol1e;
1679 struct domain *d = vcpu->domain;
1680 unsigned long mfn;
1681 p2m_type_t p2mt;
1682 int rc = 1;
1684 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1685 return 0;
1687 if ( unlikely(paging_mode_refcounts(d)) )
1689 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu, preserve_ad);
1690 return rc;
1693 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1695 /* Translate foreign guest addresses. */
1696 mfn = mfn_x(gfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e), &p2mt));
1697 if ( !p2m_is_ram(p2mt) || unlikely(mfn == INVALID_MFN) )
1698 return 0;
1699 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1700 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1702 if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(d)) )
1704 MEM_LOG("Bad L1 flags %x",
1705 l1e_get_flags(nl1e) & l1_disallow_mask(d));
1706 return 0;
1709 /* Fast path for identical mapping, r/w and presence. */
1710 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1712 adjust_guest_l1e(nl1e, d);
1713 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu,
1714 preserve_ad);
1715 return rc;
1718 if ( unlikely(!get_page_from_l1e(nl1e, d, FOREIGNDOM)) )
1719 return 0;
1721 adjust_guest_l1e(nl1e, d);
1722 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu,
1723 preserve_ad)) )
1725 ol1e = nl1e;
1726 rc = 0;
1729 else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu,
1730 preserve_ad)) )
1732 return 0;
1735 put_page_from_l1e(ol1e, d);
1736 return rc;
1740 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1741 static int mod_l2_entry(l2_pgentry_t *pl2e,
1742 l2_pgentry_t nl2e,
1743 unsigned long pfn,
1744 int preserve_ad,
1745 struct vcpu *vcpu)
1747 l2_pgentry_t ol2e;
1748 struct domain *d = vcpu->domain;
1749 struct page_info *l2pg = mfn_to_page(pfn);
1750 unsigned long type = l2pg->u.inuse.type_info;
1751 int rc = 1;
1753 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1755 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1756 return 0;
1759 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1760 return 0;
1762 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1764 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1766 MEM_LOG("Bad L2 flags %x",
1767 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1768 return 0;
1771 /* Fast path for identical mapping and presence. */
1772 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT) )
1774 adjust_guest_l2e(nl2e, d);
1775 rc = UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, preserve_ad);
1776 return rc;
1779 if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) )
1780 return 0;
1782 adjust_guest_l2e(nl2e, d);
1783 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
1784 preserve_ad)) )
1786 ol2e = nl2e;
1787 rc = 0;
1790 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
1791 preserve_ad)) )
1793 return 0;
1796 put_page_from_l2e(ol2e, pfn);
1797 return rc;
1800 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1801 static int mod_l3_entry(l3_pgentry_t *pl3e,
1802 l3_pgentry_t nl3e,
1803 unsigned long pfn,
1804 int preserve_ad,
1805 int preemptible,
1806 struct vcpu *vcpu)
1808 l3_pgentry_t ol3e;
1809 struct domain *d = vcpu->domain;
1810 int rc = 0;
1812 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1814 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1815 return -EINVAL;
1818 /*
1819 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1820 * would be a pain to ensure they remain continuously valid throughout.
1821 */
1822 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1823 return -EINVAL;
1825 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1826 return -EFAULT;
1828 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1830 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1832 MEM_LOG("Bad L3 flags %x",
1833 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1834 return -EINVAL;
1837 /* Fast path for identical mapping and presence. */
1838 if ( !l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT) )
1840 adjust_guest_l3e(nl3e, d);
1841 rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, preserve_ad);
1842 return rc ? 0 : -EFAULT;
1845 rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible);
1846 if ( unlikely(rc < 0) )
1847 return rc;
1848 rc = 0;
1850 adjust_guest_l3e(nl3e, d);
1851 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
1852 preserve_ad)) )
1854 ol3e = nl3e;
1855 rc = -EFAULT;
1858 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
1859 preserve_ad)) )
1861 return -EFAULT;
1864 if ( likely(rc == 0) )
1866 if ( !create_pae_xen_mappings(d, pl3e) )
1867 BUG();
1869 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1872 put_page_from_l3e(ol3e, pfn, 0, 0);
1873 return rc;
1876 #if CONFIG_PAGING_LEVELS >= 4
1878 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1879 static int mod_l4_entry(l4_pgentry_t *pl4e,
1880 l4_pgentry_t nl4e,
1881 unsigned long pfn,
1882 int preserve_ad,
1883 int preemptible,
1884 struct vcpu *vcpu)
1886 struct domain *d = vcpu->domain;
1887 l4_pgentry_t ol4e;
1888 int rc = 0;
1890 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1892 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1893 return -EINVAL;
1896 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1897 return -EFAULT;
1899 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1901 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1903 MEM_LOG("Bad L4 flags %x",
1904 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1905 return -EINVAL;
1908 /* Fast path for identical mapping and presence. */
1909 if ( !l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT) )
1911 adjust_guest_l4e(nl4e, d);
1912 rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, preserve_ad);
1913 return rc ? 0 : -EFAULT;
1916 rc = get_page_from_l4e(nl4e, pfn, d, 0, preemptible);
1917 if ( unlikely(rc < 0) )
1918 return rc;
1919 rc = 0;
1921 adjust_guest_l4e(nl4e, d);
1922 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
1923 preserve_ad)) )
1925 ol4e = nl4e;
1926 rc = -EFAULT;
1929 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
1930 preserve_ad)) )
1932 return -EFAULT;
1935 put_page_from_l4e(ol4e, pfn, 0, 0);
1936 return rc;
1939 #endif
1941 void put_page(struct page_info *page)
1942 {
1943 unsigned long nx, x, y = page->count_info;
1945 do {
1946 ASSERT((y & PGC_count_mask) != 0);
1947 x = y;
1948 nx = x - 1;
1949 }
1950 while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) );
1952 if ( unlikely((nx & PGC_count_mask) == 0) )
1953 {
1954 cleanup_page_cacheattr(page);
1955 free_domheap_page(page);
1956 }
1957 }
1960 struct domain *page_get_owner_and_reference(struct page_info *page)
1962 unsigned long x, y = page->count_info;
1964 do {
1965 x = y;
1966 /*
1967 * Count == 0: Page is not allocated, so we cannot take a reference.
1968 * Count == -1: Reference count would wrap, which is invalid.
1969 * Count == -2: Remaining unused ref is reserved for get_page_light().
1970 */
1971 if ( unlikely(((x + 2) & PGC_count_mask) <= 2) )
1972 return NULL;
1974 while ( (y = cmpxchg(&page->count_info, x, x + 1)) != x );
1976 return page_get_owner(page);
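/*
 * Note on the check above: the reference count occupies the low-order bits
 * of count_info selected by PGC_count_mask, so ((x + 2) & PGC_count_mask)
 * is (count + 2) modulo the width of the count field.  That value is <= 2
 * exactly when count is 0, -1 (all ones) or -2 -- the three cases listed in
 * the comment: unallocated page, imminent wrap-around, and the one spare
 * reference reserved for get_page_light().
 */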
1980 int get_page(struct page_info *page, struct domain *domain)
1982 struct domain *owner = page_get_owner_and_reference(page);
1984 if ( likely(owner == domain) )
1985 return 1;
1987 if ( owner != NULL )
1988 put_page(page);
1990 if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
1991 gdprintk(XENLOG_INFO,
1992 "Error pfn %lx: rd=%p, od=%p, caf=%08lx, taf=%"
1993 PRtype_info "\n",
1994 page_to_mfn(page), domain, owner,
1995 page->count_info, page->u.inuse.type_info);
1996 return 0;
1999 /*
2000 * Special version of get_page() to be used exclusively when
2001 * - a page is known to already have a non-zero reference count
2002 * - the page does not need its owner to be checked
2003 * - it will not be called more than once without the reference thus
2004 * acquired being dropped again in between.
2005 * Due to get_page() reserving one reference, this call cannot fail.
2006 */
2007 static void get_page_light(struct page_info *page)
2009 unsigned long x, nx, y = page->count_info;
2011 do {
2012 x = y;
2013 nx = x + 1;
2014 BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
2015 BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
2016 y = cmpxchg(&page->count_info, x, nx);
2018 while ( unlikely(y != x) );
2021 static int alloc_page_type(struct page_info *page, unsigned long type,
2022 int preemptible)
2024 struct domain *owner = page_get_owner(page);
2025 int rc;
2027 /* A page table is dirtied when its type count becomes non-zero. */
2028 if ( likely(owner != NULL) )
2029 paging_mark_dirty(owner, page_to_mfn(page));
2031 switch ( type & PGT_type_mask )
2033 case PGT_l1_page_table:
2034 rc = alloc_l1_table(page);
2035 break;
2036 case PGT_l2_page_table:
2037 rc = alloc_l2_table(page, type, preemptible);
2038 break;
2039 case PGT_l3_page_table:
2040 rc = alloc_l3_table(page, preemptible);
2041 break;
2042 case PGT_l4_page_table:
2043 rc = alloc_l4_table(page, preemptible);
2044 break;
2045 case PGT_seg_desc_page:
2046 rc = alloc_segdesc_page(page);
2047 break;
2048 default:
2049 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%lx\n",
2050 type, page->u.inuse.type_info,
2051 page->count_info);
2052 rc = -EINVAL;
2053 BUG();
2056 /* No need for atomic update of type_info here: no one else updates it. */
2057 wmb();
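/*
 * Outcomes below: -EAGAIN means validation was preempted part-way, so the
 * page keeps its type reference, gains an extra "light" general reference
 * and is flagged PGT_partial so the work can be resumed (or unwound) later.
 * -EINTR means we were preempted before any partial state existed, so the
 * just-taken type count is dropped and the caller turns the result into a
 * hypercall continuation.  Any other error resets type_info entirely;
 * success sets PGT_validated.
 */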
2058 if ( rc == -EAGAIN )
2060 get_page_light(page);
2061 page->u.inuse.type_info |= PGT_partial;
2063 else if ( rc == -EINTR )
2065 ASSERT((page->u.inuse.type_info &
2066 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2067 page->u.inuse.type_info &= ~PGT_count_mask;
2069 else if ( rc )
2071 ASSERT(rc < 0);
2072 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
2073 PRtype_info ": caf=%08lx taf=%" PRtype_info,
2074 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
2075 type, page->count_info, page->u.inuse.type_info);
2076 page->u.inuse.type_info = 0;
2078 else
2080 page->u.inuse.type_info |= PGT_validated;
2083 return rc;
2087 int free_page_type(struct page_info *page, unsigned long type,
2088 int preemptible)
2090 struct domain *owner = page_get_owner(page);
2091 unsigned long gmfn;
2092 int rc;
2094 if ( likely(owner != NULL) && unlikely(paging_mode_enabled(owner)) )
2096 /* A page table is dirtied when its type count becomes zero. */
2097 paging_mark_dirty(owner, page_to_mfn(page));
2099 if ( shadow_mode_refcounts(owner) )
2100 return 0;
2102 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
2103 ASSERT(VALID_M2P(gmfn));
2104 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
2107 if ( !(type & PGT_partial) )
2109 page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
2110 page->partial_pte = 0;
2113 switch ( type & PGT_type_mask )
2115 case PGT_l1_page_table:
2116 free_l1_table(page);
2117 rc = 0;
2118 break;
2119 case PGT_l2_page_table:
2120 rc = free_l2_table(page, preemptible);
2121 break;
2122 case PGT_l3_page_table:
2123 #if CONFIG_PAGING_LEVELS == 3
2124 if ( !(type & PGT_partial) )
2125 page->nr_validated_ptes = L3_PAGETABLE_ENTRIES;
2126 #endif
2127 rc = free_l3_table(page, preemptible);
2128 break;
2129 case PGT_l4_page_table:
2130 rc = free_l4_table(page, preemptible);
2131 break;
2132 default:
2133 MEM_LOG("type %lx pfn %lx\n", type, page_to_mfn(page));
2134 rc = -EINVAL;
2135 BUG();
2138 return rc;
2142 static int __put_final_page_type(
2143 struct page_info *page, unsigned long type, int preemptible)
2145 int rc = free_page_type(page, type, preemptible);
2147 /* No need for atomic update of type_info here: no one else updates it. */
2148 if ( rc == 0 )
2150 /*
2151 * Record TLB information for flush later. We do not stamp page tables
2152 * when running in shadow mode:
2153 * 1. Pointless, since it's the shadow page tables that must be tracked.
2154 * 2. Shadow mode reuses this field for shadowed page tables to
2155 * store flags info -- we don't want to conflict with that.
2156 */
2157 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2158 (page->count_info & PGC_page_table)) )
2159 page->tlbflush_timestamp = tlbflush_current_time();
2160 wmb();
2161 page->u.inuse.type_info--;
2163 else if ( rc == -EINTR )
2165 ASSERT((page->u.inuse.type_info &
2166 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2167 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2168 (page->count_info & PGC_page_table)) )
2169 page->tlbflush_timestamp = tlbflush_current_time();
2170 wmb();
2171 page->u.inuse.type_info |= PGT_validated;
2173 else
2175 BUG_ON(rc != -EAGAIN);
2176 wmb();
2177 get_page_light(page);
2178 page->u.inuse.type_info |= PGT_partial;
2181 return rc;
2185 static int __put_page_type(struct page_info *page,
2186 int preemptible)
2188 unsigned long nx, x, y = page->u.inuse.type_info;
2189 int rc = 0;
2191 for ( ; ; )
2193 x = y;
2194 nx = x - 1;
2196 ASSERT((x & PGT_count_mask) != 0);
2198 if ( unlikely((nx & PGT_count_mask) == 0) )
2200 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
2201 likely(nx & (PGT_validated|PGT_partial)) )
2203 /*
2204 * Page-table pages must be unvalidated when count is zero. The
2205 * 'free' is safe because the refcnt is non-zero and validated
2206 * bit is clear => other ops will spin or fail.
2207 */
2208 nx = x & ~(PGT_validated|PGT_partial);
2209 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
2210 x, nx)) != x) )
2211 continue;
2212 /* We cleared the 'valid bit' so we do the clean up. */
2213 rc = __put_final_page_type(page, x, preemptible);
2214 if ( x & PGT_partial )
2215 put_page(page);
2216 break;
2219 /*
2220 * Record TLB information for flush later. We do not stamp page
2221 * tables when running in shadow mode:
2222 * 1. Pointless, since it's the shadow page tables that must be tracked.
2223 * 2. Shadow mode reuses this field for shadowed page tables to
2224 * store flags info -- we don't want to conflict with that.
2225 */
2226 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2227 (page->count_info & PGC_page_table)) )
2228 page->tlbflush_timestamp = tlbflush_current_time();
2231 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2232 break;
2234 if ( preemptible && hypercall_preempt_check() )
2235 return -EINTR;
2238 return rc;
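/*
 * PGT_partial, as used in the alloc/free paths and in __get_page_type() /
 * __put_page_type(), marks a page whose type (de)validation was preempted
 * part-way through.  nr_validated_ptes and partial_pte record how far the
 * walk got, and the extra general reference taken via get_page_light()
 * keeps the frame alive until the preempted operation is resumed; that
 * reference is dropped again once the partial flag is cleared (see the
 * put_page() calls guarded by PGT_partial).
 */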
2242 static int __get_page_type(struct page_info *page, unsigned long type,
2243 int preemptible)
2245 unsigned long nx, x, y = page->u.inuse.type_info;
2246 int rc = 0;
2248 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
2250 for ( ; ; )
2252 x = y;
2253 nx = x + 1;
2254 if ( unlikely((nx & PGT_count_mask) == 0) )
2256 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
2257 return -EINVAL;
2259 else if ( unlikely((x & PGT_count_mask) == 0) )
2261 struct domain *d = page_get_owner(page);
2263 /* Normally we should never let a page go from type count 0
2264 * to type count 1 when it is shadowed. One exception:
2265 * out-of-sync shadowed pages are allowed to become
2266 * writable. */
2267 if ( d && shadow_mode_enabled(d)
2268 && (page->count_info & PGC_page_table)
2269 && !((page->shadow_flags & (1u<<29))
2270 && type == PGT_writable_page) )
2271 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
2273 ASSERT(!(x & PGT_pae_xen_l2));
2274 if ( (x & PGT_type_mask) != type )
2276 /*
2277 * On a type change we check whether stale TLB entries must be flushed. This
2278 * may be unnecessary (e.g., page was GDT/LDT) but those
2279 * circumstances should be very rare.
2280 */
2281 cpumask_t mask = d->domain_dirty_cpumask;
2283 /* Don't flush if the timestamp is old enough */
2284 tlbflush_filter(mask, page->tlbflush_timestamp);
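/*
 * tlbflush_filter() drops from the mask any CPU that has flushed its TLB
 * since the page's tlbflush_timestamp was recorded, so only CPUs that may
 * still cache a stale translation are considered for the flush below.
 */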
2286 if ( unlikely(!cpus_empty(mask)) &&
2287 /* Shadow mode: track only writable pages. */
2288 (!shadow_mode_enabled(page_get_owner(page)) ||
2289 ((nx & PGT_type_mask) == PGT_writable_page)) )
2291 perfc_incr(need_flush_tlb_flush);
2292 flush_tlb_mask(&mask);
2295 /* We lose existing type and validity. */
2296 nx &= ~(PGT_type_mask | PGT_validated);
2297 nx |= type;
2299 /* No special validation needed for writable pages. */
2300 /* Page tables and GDT/LDT need to be scanned for validity. */
2301 if ( type == PGT_writable_page )
2302 nx |= PGT_validated;
2305 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
2307 /* Don't log failure if it could be a recursive-mapping attempt. */
2308 if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
2309 (type == PGT_l1_page_table) )
2310 return -EINVAL;
2311 if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
2312 (type == PGT_l2_page_table) )
2313 return -EINVAL;
2314 if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
2315 (type == PGT_l3_page_table) )
2316 return -EINVAL;
2317 MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
2318 "for mfn %lx (pfn %lx)",
2319 x, type, page_to_mfn(page),
2320 get_gpfn_from_mfn(page_to_mfn(page)));
2321 return -EINVAL;
2323 else if ( unlikely(!(x & PGT_validated)) )
2325 if ( !(x & PGT_partial) )
2327 /* Someone else is updating validation of this page. Wait... */
2328 while ( (y = page->u.inuse.type_info) == x )
2330 if ( preemptible && hypercall_preempt_check() )
2331 return -EINTR;
2332 cpu_relax();
2334 continue;
2336 /* Type ref count was left at 1 when PGT_partial got set. */
2337 ASSERT((x & PGT_count_mask) == 1);
2338 nx = x & ~PGT_partial;
2341 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2342 break;
2344 if ( preemptible && hypercall_preempt_check() )
2345 return -EINTR;
2348 if ( unlikely((x & PGT_type_mask) != type) )
2350 /* Special pages should not be accessible from devices. */
2351 struct domain *d = page_get_owner(page);
2352 if ( d && unlikely(need_iommu(d)) )
2354 if ( (x & PGT_type_mask) == PGT_writable_page )
2355 iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));
2356 else if ( type == PGT_writable_page )
2357 iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
2358 page_to_mfn(page));
2362 if ( unlikely(!(nx & PGT_validated)) )
2364 if ( !(x & PGT_partial) )
2366 page->nr_validated_ptes = 0;
2367 page->partial_pte = 0;
2369 rc = alloc_page_type(page, type, preemptible);
2372 if ( (x & PGT_partial) && !(nx & PGT_partial) )
2373 put_page(page);
2375 return rc;
2378 void put_page_type(struct page_info *page)
2380 int rc = __put_page_type(page, 0);
2381 ASSERT(rc == 0);
2382 (void)rc;
2385 int get_page_type(struct page_info *page, unsigned long type)
2387 int rc = __get_page_type(page, type, 0);
2388 if ( likely(rc == 0) )
2389 return 1;
2390 ASSERT(rc == -EINVAL);
2391 return 0;
2394 int put_page_type_preemptible(struct page_info *page)
2396 return __put_page_type(page, 1);
2399 int get_page_type_preemptible(struct page_info *page, unsigned long type)
2401 return __get_page_type(page, type, 1);
2404 void cleanup_page_cacheattr(struct page_info *page)
2406 uint32_t cacheattr =
2407 (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base;
2409 if ( likely(cacheattr == 0) )
2410 return;
2412 page->count_info &= ~PGC_cacheattr_mask;
2414 BUG_ON(is_xen_heap_page(page));
2416 update_xen_mappings(page_to_mfn(page), 0);
2420 int new_guest_cr3(unsigned long mfn)
2422 struct vcpu *curr = current;
2423 struct domain *d = curr->domain;
2424 int okay;
2425 unsigned long old_base_mfn;
2427 #ifdef CONFIG_COMPAT
2428 if ( is_pv_32on64_domain(d) )
2430 okay = paging_mode_refcounts(d)
2431 ? 0 /* Old code was broken, but what should it be? */
2432 : mod_l4_entry(
2433 __va(pagetable_get_paddr(curr->arch.guest_table)),
2434 l4e_from_pfn(
2435 mfn,
2436 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
2437 pagetable_get_pfn(curr->arch.guest_table), 0, 0, curr) == 0;
2438 if ( unlikely(!okay) )
2440 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
2441 return 0;
2444 invalidate_shadow_ldt(curr, 0);
2445 write_ptbase(curr);
2447 return 1;
2449 #endif
2450 okay = paging_mode_refcounts(d)
2451 ? get_page_from_pagenr(mfn, d)
2452 : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
2453 if ( unlikely(!okay) )
2455 MEM_LOG("Error while installing new baseptr %lx", mfn);
2456 return 0;
2459 invalidate_shadow_ldt(curr, 0);
2461 old_base_mfn = pagetable_get_pfn(curr->arch.guest_table);
2463 curr->arch.guest_table = pagetable_from_pfn(mfn);
2464 update_cr3(curr);
2466 write_ptbase(curr);
2468 if ( likely(old_base_mfn != 0) )
2470 if ( paging_mode_refcounts(d) )
2471 put_page(mfn_to_page(old_base_mfn));
2472 else
2473 put_page_and_type(mfn_to_page(old_base_mfn));
2476 return 1;
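/*
 * Ordering note: the reference (and, outside refcounting paging modes, the
 * PGT_root_page_table type) on the new base table is taken before it is
 * installed, and the reference on the old base is dropped only after
 * update_cr3()/write_ptbase() have switched away from it, so neither table
 * can be freed while it may still be in use.
 */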
2479 static void process_deferred_ops(void)
2481 unsigned int deferred_ops;
2482 struct domain *d = current->domain;
2483 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2485 deferred_ops = info->deferred_ops;
2486 info->deferred_ops = 0;
2488 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
2490 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
2491 flush_tlb_mask(&d->domain_dirty_cpumask);
2492 else
2493 flush_tlb_local();
2496 /*
2497 * Do this after flushing TLBs, to ensure we see fresh LDT mappings
2498 * via the linear pagetable mapping.
2499 */
2500 if ( deferred_ops & DOP_RELOAD_LDT )
2501 (void)map_ldt_shadow_page(0);
2503 if ( unlikely(info->foreign != NULL) )
2505 rcu_unlock_domain(info->foreign);
2506 info->foreign = NULL;
2510 static int set_foreigndom(domid_t domid)
2512 struct domain *e, *d = current->domain;
2513 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2514 int okay = 1;
2516 ASSERT(info->foreign == NULL);
2518 if ( likely(domid == DOMID_SELF) )
2519 goto out;
2521 if ( unlikely(domid == d->domain_id) )
2523 MEM_LOG("Cannot specify itself as foreign domain");
2524 okay = 0;
2526 else if ( unlikely(paging_mode_translate(d)) )
2528 MEM_LOG("Cannot mix foreign mappings with translated domains");
2529 okay = 0;
2531 else switch ( domid )
2533 case DOMID_IO:
2534 info->foreign = rcu_lock_domain(dom_io);
2535 break;
2536 case DOMID_XEN:
2537 if (!IS_PRIV(d)) {
2538 MEM_LOG("Cannot set foreign dom");
2539 okay = 0;
2540 break;
2542 info->foreign = rcu_lock_domain(dom_xen);
2543 break;
2544 default:
2545 if ( (e = rcu_lock_domain_by_id(domid)) == NULL )
2547 MEM_LOG("Unknown domain '%u'", domid);
2548 okay = 0;
2549 break;
2551 if ( !IS_PRIV_FOR(d, e) )
2553 MEM_LOG("Cannot set foreign dom");
2554 okay = 0;
2555 rcu_unlock_domain(e);
2556 break;
2558 info->foreign = e;
2559 break;
2562 out:
2563 return okay;
2566 static inline int vcpumask_to_pcpumask(
2567 struct domain *d, XEN_GUEST_HANDLE(const_void) bmap, cpumask_t *pmask)
2569 unsigned int vcpu_id, vcpu_bias, offs;
2570 unsigned long vmask;
2571 struct vcpu *v;
2572 bool_t is_native = !is_pv_32on64_domain(d);
2574 cpus_clear(*pmask);
2575 for ( vmask = 0, offs = 0; ; ++offs)
2577 vcpu_bias = offs * (is_native ? BITS_PER_LONG : 32);
2578 if ( vcpu_bias >= d->max_vcpus )
2579 return 0;
2581 if ( unlikely(is_native ?
2582 copy_from_guest_offset(&vmask, bmap, offs, 1) :
2583 copy_from_guest_offset((unsigned int *)&vmask, bmap,
2584 offs, 1)) )
2586 cpus_clear(*pmask);
2587 return -EFAULT;
2590 while ( vmask )
2592 vcpu_id = find_first_set_bit(vmask);
2593 vmask &= ~(1UL << vcpu_id);
2594 vcpu_id += vcpu_bias;
2595 if ( (vcpu_id >= d->max_vcpus) )
2596 return 0;
2597 if ( ((v = d->vcpu[vcpu_id]) != NULL) )
2598 cpus_or(*pmask, *pmask, v->vcpu_dirty_cpumask);
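/*
 * Illustrative sketch -- not part of mm.c: the find_first_set_bit()-and-
 * clear loop above is the usual way of visiting each set bit of a bitmap
 * word in turn.  A stand-alone equivalent using a GCC builtin:
 */
#if 0
static void for_each_set_bit_in_word(unsigned long word,
                                     void (*fn)(unsigned int bit))
{
    while ( word != 0 )
    {
        unsigned int bit = __builtin_ctzl(word);   /* lowest set bit */

        word &= ~(1UL << bit);                     /* clear it */
        fn(bit);                                   /* e.g. add vCPU's pCPUs */
    }
}
#endif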
2603 #ifdef __i386__
2604 static inline void *fixmap_domain_page(unsigned long mfn)
2606 unsigned int cpu = smp_processor_id();
2607 void *ptr = (void *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
2609 l1e_write(fix_pae_highmem_pl1e - cpu,
2610 l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
2611 flush_tlb_one_local(ptr);
2612 return ptr;
2614 static inline void fixunmap_domain_page(const void *ptr)
2616 unsigned int cpu = virt_to_fix((unsigned long)ptr) - FIX_PAE_HIGHMEM_0;
2618 l1e_write(fix_pae_highmem_pl1e - cpu, l1e_empty());
2619 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
2621 #else
2622 #define fixmap_domain_page(mfn) mfn_to_virt(mfn)
2623 #define fixunmap_domain_page(ptr) ((void)(ptr))
2624 #endif
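/*
 * On 32-bit builds an arbitrary MFN may lie outside Xen's direct mapping,
 * so fixmap_domain_page() temporarily maps it at a per-CPU fixmap slot and
 * flushes the local TLB entry; on 64-bit every frame is reachable through
 * the direct map, so mfn_to_virt() suffices and the unmap is a no-op.
 */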
2626 int do_mmuext_op(
2627 XEN_GUEST_HANDLE(mmuext_op_t) uops,
2628 unsigned int count,
2629 XEN_GUEST_HANDLE(uint) pdone,
2630 unsigned int foreigndom)
2632 struct mmuext_op op;
2633 int rc = 0, i = 0, okay;
2634 unsigned long mfn = 0, gmfn = 0, type;
2635 unsigned int done = 0;
2636 struct page_info *page;
2637 struct vcpu *curr = current;
2638 struct domain *d = curr->domain;
2640 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2642 count &= ~MMU_UPDATE_PREEMPTED;
2643 if ( unlikely(!guest_handle_is_null(pdone)) )
2644 (void)copy_from_guest(&done, pdone, 1);
2646 else
2647 perfc_incr(calls_to_mmuext_op);
2649 if ( unlikely(!guest_handle_okay(uops, count)) )
2651 rc = -EFAULT;
2652 goto out;
2655 if ( !set_foreigndom(foreigndom) )
2657 rc = -ESRCH;
2658 goto out;
2661 for ( i = 0; i < count; i++ )
2663 if ( hypercall_preempt_check() )
2665 rc = -EAGAIN;
2666 break;
2669 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2671 MEM_LOG("Bad __copy_from_guest");
2672 rc = -EFAULT;
2673 break;
2676 okay = 1;
2677 gmfn = op.arg1.mfn;
2678 mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
2679 page = mfn_to_page(mfn);
2681 switch ( op.cmd )
2683 case MMUEXT_PIN_L1_TABLE:
2684 type = PGT_l1_page_table;
2685 goto pin_page;
2687 case MMUEXT_PIN_L2_TABLE:
2688 type = PGT_l2_page_table;
2689 goto pin_page;
2691 case MMUEXT_PIN_L3_TABLE:
2692 type = PGT_l3_page_table;
2693 goto pin_page;
2695 case MMUEXT_PIN_L4_TABLE:
2696 if ( is_pv_32bit_domain(FOREIGNDOM) )
2697 break;
2698 type = PGT_l4_page_table;
2700 pin_page:
2701 rc = xsm_memory_pin_page(d, page);
2702 if ( rc )
2703 break;
2705 /* Ignore pinning of invalid paging levels. */
2706 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2707 break;
2709 if ( paging_mode_refcounts(FOREIGNDOM) )
2710 break;
2712 rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 0, 1);
2713 okay = !rc;
2714 if ( unlikely(!okay) )
2716 if ( rc == -EINTR )
2717 rc = -EAGAIN;
2718 else if ( rc != -EAGAIN )
2719 MEM_LOG("Error while pinning mfn %lx", mfn);
2720 break;
2723 if ( unlikely(test_and_set_bit(_PGT_pinned,
2724 &page->u.inuse.type_info)) )
2726 MEM_LOG("Mfn %lx already pinned", mfn);
2727 put_page_and_type(page);
2728 okay = 0;
2729 break;
2732 /* A page is dirtied when its pin status is set. */
2733 paging_mark_dirty(d, mfn);
2735 /* We can race domain destruction (domain_relinquish_resources). */
2736 if ( unlikely(this_cpu(percpu_mm_info).foreign != NULL) )
2738 int drop_ref;
2739 spin_lock(&FOREIGNDOM->page_alloc_lock);
2740 drop_ref = (FOREIGNDOM->is_dying &&
2741 test_and_clear_bit(_PGT_pinned,
2742 &page->u.inuse.type_info));
2743 spin_unlock(&FOREIGNDOM->page_alloc_lock);
2744 if ( drop_ref )
2745 put_page_and_type(page);
2748 break;
2750 case MMUEXT_UNPIN_TABLE:
2751 if ( paging_mode_refcounts(d) )
2752 break;
2754 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2756 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2757 mfn, page_get_owner(page));
2759 else if ( likely(test_and_clear_bit(_PGT_pinned,
2760 &page->u.inuse.type_info)) )
2762 put_page_and_type(page);
2763 put_page(page);
2764 if ( !rc )
2766 /* A page is dirtied when its pin status is cleared. */
2767 paging_mark_dirty(d, mfn);
2770 else
2772 okay = 0;
2773 put_page(page);
2774 MEM_LOG("Mfn %lx not pinned", mfn);
2776 break;
2778 case MMUEXT_NEW_BASEPTR:
2779 okay = new_guest_cr3(mfn);
2780 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2781 break;
2783 #ifdef __x86_64__
2784 case MMUEXT_NEW_USER_BASEPTR: {
2785 unsigned long old_mfn;
2787 if ( mfn != 0 )
2789 if ( paging_mode_refcounts(d) )
2790 okay = get_page_from_pagenr(mfn, d);
2791 else
2792 okay = !get_page_and_type_from_pagenr(
2793 mfn, PGT_root_page_table, d, 0, 0);
2794 if ( unlikely(!okay) )
2796 MEM_LOG("Error while installing new mfn %lx", mfn);
2797 break;
2801 old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
2802 curr->arch.guest_table_user = pagetable_from_pfn(mfn);
2804 if ( old_mfn != 0 )
2806 if ( paging_mode_refcounts(d) )
2807 put_page(mfn_to_page(old_mfn));
2808 else
2809 put_page_and_type(mfn_to_page(old_mfn));
2812 break;
2814 #endif
2816 case MMUEXT_TLB_FLUSH_LOCAL:
2817 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2818 break;
2820 case MMUEXT_INVLPG_LOCAL:
2821 if ( !paging_mode_enabled(d)
2822 || paging_invlpg(curr, op.arg1.linear_addr) != 0 )
2823 flush_tlb_one_local(op.arg1.linear_addr);
2824 break;
2826 case MMUEXT_TLB_FLUSH_MULTI:
2827 case MMUEXT_INVLPG_MULTI:
2829 cpumask_t pmask;
2831 if ( unlikely(vcpumask_to_pcpumask(d, op.arg2.vcpumask, &pmask)) )
2833 okay = 0;
2834 break;
2836 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2837 flush_tlb_mask(&pmask);
2838 else
2839 flush_tlb_one_mask(&pmask, op.arg1.linear_addr);
2840 break;
2843 case MMUEXT_TLB_FLUSH_ALL:
2844 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
2845 break;
2847 case MMUEXT_INVLPG_ALL:
2848 flush_tlb_one_mask(&d->domain_dirty_cpumask, op.arg1.linear_addr);
2849 break;
2851 case MMUEXT_FLUSH_CACHE:
2852 if ( unlikely(!cache_flush_permitted(d)) )
2854 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2855 okay = 0;
2857 else
2859 wbinvd();
2861 break;
2863 case MMUEXT_SET_LDT:
2865 unsigned long ptr = op.arg1.linear_addr;
2866 unsigned long ents = op.arg2.nr_ents;
2868 if ( paging_mode_external(d) )
2870 MEM_LOG("ignoring SET_LDT hypercall from external domain");
2871 okay = 0;
2873 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2874 (ents > 8192) ||
2875 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2877 okay = 0;
2878 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2880 else if ( (curr->arch.guest_context.ldt_ents != ents) ||
2881 (curr->arch.guest_context.ldt_base != ptr) )
2883 invalidate_shadow_ldt(curr, 0);
2884 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2885 curr->arch.guest_context.ldt_base = ptr;
2886 curr->arch.guest_context.ldt_ents = ents;
2887 load_LDT(curr);
2888 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2889 if ( ents != 0 )
2890 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2892 break;
2895 case MMUEXT_CLEAR_PAGE:
2897 unsigned char *ptr;
2899 okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
2900 FOREIGNDOM, 0, 0);
2901 if ( unlikely(!okay) )
2903 MEM_LOG("Error while clearing mfn %lx", mfn);
2904 break;
2907 /* A page is dirtied when it's being cleared. */
2908 paging_mark_dirty(d, mfn);
2910 ptr = fixmap_domain_page(mfn);
2911 clear_page(ptr);
2912 fixunmap_domain_page(ptr);
2914 put_page_and_type(page);
2915 break;
2918 case MMUEXT_COPY_PAGE:
2920 const unsigned char *src;
2921 unsigned char *dst;
2922 unsigned long src_mfn;
2924 src_mfn = gmfn_to_mfn(FOREIGNDOM, op.arg2.src_mfn);
2925 okay = get_page_from_pagenr(src_mfn, FOREIGNDOM);
2926 if ( unlikely(!okay) )
2928 MEM_LOG("Error while copying from mfn %lx", src_mfn);
2929 break;
2932 okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
2933 FOREIGNDOM, 0, 0);
2934 if ( unlikely(!okay) )
2936 put_page(mfn_to_page(src_mfn));
2937 MEM_LOG("Error while copying to mfn %lx", mfn);
2938 break;
2941 /* A page is dirtied when it's being copied to. */
2942 paging_mark_dirty(d, mfn);
2944 src = map_domain_page(src_mfn);
2945 dst = fixmap_domain_page(mfn);
2946 copy_page(dst, src);
2947 fixunmap_domain_page(dst);
2948 unmap_domain_page(src);
2950 put_page_and_type(page);
2951 put_page(mfn_to_page(src_mfn));
2952 break;
2955 default:
2956 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2957 rc = -ENOSYS;
2958 okay = 0;
2959 break;
2962 if ( unlikely(!okay) )
2964 rc = rc ? rc : -EINVAL;
2965 break;
2968 guest_handle_add_offset(uops, 1);
2971 if ( rc == -EAGAIN )
2972 rc = hypercall_create_continuation(
2973 __HYPERVISOR_mmuext_op, "hihi",
2974 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2976 process_deferred_ops();
2978 perfc_add(num_mmuext_ops, i);
2980 out:
2981 /* Add incremental work we have done to the @done output parameter. */
2982 if ( unlikely(!guest_handle_is_null(pdone)) )
2984 done += i;
2985 copy_to_guest(pdone, &done, 1);
2988 return rc;
2991 int do_mmu_update(
2992 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2993 unsigned int count,
2994 XEN_GUEST_HANDLE(uint) pdone,
2995 unsigned int foreigndom)
2997 struct mmu_update req;
2998 void *va;
2999 unsigned long gpfn, gmfn, mfn;
3000 struct page_info *page;
3001 int rc = 0, okay = 1, i = 0;
3002 unsigned int cmd, done = 0;
3003 struct domain *d = current->domain;
3004 struct domain_mmap_cache mapcache;
3006 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
3008 count &= ~MMU_UPDATE_PREEMPTED;
3009 if ( unlikely(!guest_handle_is_null(pdone)) )
3010 (void)copy_from_guest(&done, pdone, 1);
3012 else
3013 perfc_incr(calls_to_mmu_update);
3015 if ( unlikely(!guest_handle_okay(ureqs, count)) )
3017 rc = -EFAULT;
3018 goto out;
3021 if ( !set_foreigndom(foreigndom) )
3023 rc = -ESRCH;
3024 goto out;
3027 domain_mmap_cache_init(&mapcache);
3029 for ( i = 0; i < count; i++ )
3031 if ( hypercall_preempt_check() )
3033 rc = -EAGAIN;
3034 break;
3037 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
3039 MEM_LOG("Bad __copy_from_guest");
3040 rc = -EFAULT;
3041 break;
3044 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
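/*
 * The machine address in req.ptr must be aligned to the size of a
 * page-table entry, so its low bits are free to carry the sub-command
 * (MMU_NORMAL_PT_UPDATE etc.); it is extracted here and masked back off
 * (req.ptr -= cmd) before the address is used.
 */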
3045 okay = 0;
3047 switch ( cmd )
3049 /*
3050 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
3051 * MMU_PT_UPDATE_PRESERVE_AD: As above but also preserve (OR)
3052 * current A/D bits.
3053 */
3054 case MMU_NORMAL_PT_UPDATE:
3055 case MMU_PT_UPDATE_PRESERVE_AD:
3056 rc = xsm_mmu_normal_update(d, FOREIGNDOM, req.val);
3057 if ( rc )
3058 break;
3060 req.ptr -= cmd;
3061 gmfn = req.ptr >> PAGE_SHIFT;
3062 mfn = gmfn_to_mfn(d, gmfn);
3064 if ( unlikely(!get_page_from_pagenr(mfn, d)) )
3066 MEM_LOG("Could not get page for normal update");
3067 break;
3070 va = map_domain_page_with_cache(mfn, &mapcache);
3071 va = (void *)((unsigned long)va +
3072 (unsigned long)(req.ptr & ~PAGE_MASK));
3073 page = mfn_to_page(mfn);
3075 if ( page_lock(page) )
3077 switch ( page->u.inuse.type_info & PGT_type_mask )
3079 case PGT_l1_page_table:
3081 l1_pgentry_t l1e = l1e_from_intpte(req.val);
3082 okay = mod_l1_entry(va, l1e, mfn,
3083 cmd == MMU_PT_UPDATE_PRESERVE_AD,
3084 current);
3086 break;
3087 case PGT_l2_page_table:
3089 l2_pgentry_t l2e = l2e_from_intpte(req.val);
3090 okay = mod_l2_entry(va, l2e, mfn,
3091 cmd == MMU_PT_UPDATE_PRESERVE_AD,
3092 current);
3094 break;
3095 case PGT_l3_page_table:
3097 l3_pgentry_t l3e = l3e_from_intpte(req.val);
3098 rc = mod_l3_entry(va, l3e, mfn,
3099 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1,
3100 current);
3101 okay = !rc;
3103 break;
3104 #if CONFIG_PAGING_LEVELS >= 4
3105 case PGT_l4_page_table:
3107 l4_pgentry_t l4e = l4e_from_intpte(req.val);
3108 rc = mod_l4_entry(va, l4e, mfn,
3109 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1,
3110 current);
3111 okay = !rc;
3113 break;
3114 #endif
3115 case PGT_writable_page:
3116 perfc_incr(writable_mmu_updates);
3117 okay = paging_write_guest_entry(
3118 current, va, req.val, _mfn(mfn));
3119 break;
3121 page_unlock(page);
3122 if ( rc == -EINTR )
3123 rc = -EAGAIN;
3125 else if ( get_page_type(page, PGT_writable_page) )
3127 perfc_incr(writable_mmu_updates);
3128 okay = paging_write_guest_entry(
3129 current, va, req.val, _mfn(mfn));
3130 put_page_type(page);
3133 unmap_domain_page_with_cache(va, &mapcache);
3134 put_page(page);
3135 break;
3137 case MMU_MACHPHYS_UPDATE:
3139 mfn = req.ptr >> PAGE_SHIFT;
3140 gpfn = req.val;
3142 rc = xsm_mmu_machphys_update(d, mfn);
3143 if ( rc )
3144 break;
3146 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
3148 MEM_LOG("Could not get page for mach->phys update");
3149 break;
3152 if ( unlikely(paging_mode_translate(FOREIGNDOM)) )
3154 MEM_LOG("Mach-phys update on auto-translate guest");
3155 break;
3158 set_gpfn_from_mfn(mfn, gpfn);
3159 okay = 1;
3161 paging_mark_dirty(FOREIGNDOM, mfn);
3163 put_page(mfn_to_page(mfn));
3164 break;
3166 default:
3167 MEM_LOG("Invalid page update command %x", cmd);
3168 rc = -ENOSYS;
3169 okay = 0;
3170 break;
3173 if ( unlikely(!okay) )
3175 rc = rc ? rc : -EINVAL;
3176 break;
3179 guest_handle_add_offset(ureqs, 1);
3182 if ( rc == -EAGAIN )
3183 rc = hypercall_create_continuation(
3184 __HYPERVISOR_mmu_update, "hihi",
3185 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
3187 process_deferred_ops();
3189 domain_mmap_cache_destroy(&mapcache);
3191 perfc_add(num_page_updates, i);
3193 out:
3194 /* Add incremental work we have done to the @done output parameter. */
3195 if ( unlikely(!guest_handle_is_null(pdone)) )
3197 done += i;
3198 copy_to_guest(pdone, &done, 1);
3201 return rc;
3205 static int create_grant_pte_mapping(
3206 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
3208 int rc = GNTST_okay;
3209 void *va;
3210 unsigned long gmfn, mfn;
3211 struct page_info *page;
3212 l1_pgentry_t ol1e;
3213 struct domain *d = v->domain;
3215 ASSERT(domain_is_locked(d));
3217 adjust_guest_l1e(nl1e, d);
3219 gmfn = pte_addr >> PAGE_SHIFT;
3220 mfn = gmfn_to_mfn(d, gmfn);
3222 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
3224 MEM_LOG("Could not get page for normal update");
3225 return GNTST_general_error;
3228 va = map_domain_page(mfn);
3229 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
3230 page = mfn_to_page(mfn);
3232 if ( !page_lock(page) )
3234 rc = GNTST_general_error;
3235 goto failed;
3238 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3240 page_unlock(page);
3241 rc = GNTST_general_error;
3242 goto failed;
3245 ol1e = *(l1_pgentry_t *)va;
3246 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) )
3248 page_unlock(page);
3249 rc = GNTST_general_error;
3250 goto failed;
3253 page_unlock(page);
3255 if ( !paging_mode_refcounts(d) )
3256 put_page_from_l1e(ol1e, d);
3258 failed:
3259 unmap_domain_page(va);
3260 put_page(page);
3262 return rc;
3265 static int destroy_grant_pte_mapping(
3266 uint64_t addr, unsigned long frame, struct domain *d)
3268 int rc = GNTST_okay;
3269 void *va;
3270 unsigned long gmfn, mfn;
3271 struct page_info *page;
3272 l1_pgentry_t ol1e;
3274 gmfn = addr >> PAGE_SHIFT;
3275 mfn = gmfn_to_mfn(d, gmfn);
3277 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
3279 MEM_LOG("Could not get page for normal update");
3280 return GNTST_general_error;
3283 va = map_domain_page(mfn);
3284 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
3285 page = mfn_to_page(mfn);
3287 if ( !page_lock(page) )
3289 rc = GNTST_general_error;
3290 goto failed;
3293 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3295 page_unlock(page);
3296 rc = GNTST_general_error;
3297 goto failed;
3300 ol1e = *(l1_pgentry_t *)va;
3302 /* Check that the virtual address supplied is actually mapped to frame. */
3303 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
3305 page_unlock(page);
3306 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
3307 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
3308 rc = GNTST_general_error;
3309 goto failed;
3312 /* Delete pagetable entry. */
3313 if ( unlikely(!UPDATE_ENTRY
3314 (l1,
3315 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
3316 d->vcpu[0] /* Change if we go to per-vcpu shadows. */,
3317 0)) )
3319 page_unlock(page);
3320 MEM_LOG("Cannot delete PTE entry at %p", va);
3321 rc = GNTST_general_error;
3322 goto failed;
3325 page_unlock(page);
3327 failed:
3328 unmap_domain_page(va);
3329 put_page(page);
3330 return rc;
3334 static int create_grant_va_mapping(
3335 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
3337 l1_pgentry_t *pl1e, ol1e;
3338 struct domain *d = v->domain;
3339 unsigned long gl1mfn;
3340 struct page_info *l1pg;
3341 int okay;
3343 ASSERT(domain_is_locked(d));
3345 adjust_guest_l1e(nl1e, d);
3347 pl1e = guest_map_l1e(v, va, &gl1mfn);
3348 if ( !pl1e )
3350 MEM_LOG("Could not find L1 PTE for address %lx", va);
3351 return GNTST_general_error;
3354 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3356 guest_unmap_l1e(v, pl1e);
3357 return GNTST_general_error;
3360 l1pg = mfn_to_page(gl1mfn);
3361 if ( !page_lock(l1pg) )
3363 put_page(l1pg);
3364 guest_unmap_l1e(v, pl1e);
3365 return GNTST_general_error;
3368 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3370 page_unlock(l1pg);
3371 put_page(l1pg);
3372 guest_unmap_l1e(v, pl1e);
3373 return GNTST_general_error;
3376 ol1e = *pl1e;
3377 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0);
3379 page_unlock(l1pg);
3380 put_page(l1pg);
3381 guest_unmap_l1e(v, pl1e);
3383 if ( okay && !paging_mode_refcounts(d) )
3384 put_page_from_l1e(ol1e, d);
3386 return okay ? GNTST_okay : GNTST_general_error;
3389 static int replace_grant_va_mapping(
3390 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
3392 l1_pgentry_t *pl1e, ol1e;
3393 unsigned long gl1mfn;
3394 struct page_info *l1pg;
3395 int rc = 0;
3397 pl1e = guest_map_l1e(v, addr, &gl1mfn);
3398 if ( !pl1e )
3400 MEM_LOG("Could not find L1 PTE for address %lx", addr);
3401 return GNTST_general_error;
3404 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3406 rc = GNTST_general_error;
3407 goto out;
3410 l1pg = mfn_to_page(gl1mfn);
3411 if ( !page_lock(l1pg) )
3413 rc = GNTST_general_error;
3414 put_page(l1pg);
3415 goto out;
3418 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3420 rc = GNTST_general_error;
3421 goto unlock_and_out;
3424 ol1e = *pl1e;
3426 /* Check that the virtual address supplied is actually mapped to frame. */
3427 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
3429 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
3430 l1e_get_pfn(ol1e), addr, frame);
3431 rc = GNTST_general_error;
3432 goto unlock_and_out;
3435 /* Delete pagetable entry. */
3436 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0)) )
3438 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3439 rc = GNTST_general_error;
3440 goto unlock_and_out;
3443 unlock_and_out:
3444 page_unlock(l1pg);
3445 put_page(l1pg);
3446 out:
3447 guest_unmap_l1e(v, pl1e);
3448 return rc;
3451 static int destroy_grant_va_mapping(
3452 unsigned long addr, unsigned long frame, struct vcpu *v)
3454 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
3457 int create_grant_host_mapping(uint64_t addr, unsigned long frame,
3458 unsigned int flags, unsigned int cache_flags)
3460 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
3462 if ( (flags & GNTMAP_application_map) )
3463 l1e_add_flags(pte,_PAGE_USER);
3464 if ( !(flags & GNTMAP_readonly) )
3465 l1e_add_flags(pte,_PAGE_RW);
3467 l1e_add_flags(pte,
3468 ((flags >> _GNTMAP_guest_avail0) * _PAGE_AVAIL0)
3469 & _PAGE_AVAIL);
3471 l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5));
3473 if ( flags & GNTMAP_contains_pte )
3474 return create_grant_pte_mapping(addr, pte, current);
3475 return create_grant_va_mapping(addr, pte, current);
3478 int replace_grant_host_mapping(
3479 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
3481 struct vcpu *curr = current;
3482 l1_pgentry_t *pl1e, ol1e;
3483 unsigned long gl1mfn;
3484 struct page_info *l1pg;
3485 int rc;
3487 if ( flags & GNTMAP_contains_pte )
3489 if ( !new_addr )
3490 return destroy_grant_pte_mapping(addr, frame, curr->domain);
3492 MEM_LOG("Unsupported grant table operation");
3493 return GNTST_general_error;
3496 if ( !new_addr )
3497 return destroy_grant_va_mapping(addr, frame, curr);
3499 pl1e = guest_map_l1e(curr, new_addr, &gl1mfn);
3500 if ( !pl1e )
3502 MEM_LOG("Could not find L1 PTE for address %lx",
3503 (unsigned long)new_addr);
3504 return GNTST_general_error;
3507 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3509 guest_unmap_l1e(curr, pl1e);
3510 return GNTST_general_error;
3513 l1pg = mfn_to_page(gl1mfn);
3514 if ( !page_lock(l1pg) )
3516 put_page(l1pg);
3517 guest_unmap_l1e(curr, pl1e);
3518 return GNTST_general_error;
3521 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3523 page_unlock(l1pg);
3524 put_page(l1pg);
3525 guest_unmap_l1e(curr, pl1e);
3526 return GNTST_general_error;
3529 ol1e = *pl1e;
3531 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(),
3532 gl1mfn, curr, 0)) )
3534 page_unlock(l1pg);
3535 put_page(l1pg);
3536 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3537 guest_unmap_l1e(curr, pl1e);
3538 return GNTST_general_error;
3541 page_unlock(l1pg);
3542 put_page(l1pg);
3543 guest_unmap_l1e(curr, pl1e);
3545 rc = replace_grant_va_mapping(addr, frame, ol1e, curr);
3546 if ( rc && !paging_mode_refcounts(curr->domain) )
3547 put_page_from_l1e(ol1e, curr->domain);
3549 return rc;
3552 int donate_page(
3553 struct domain *d, struct page_info *page, unsigned int memflags)
3555 spin_lock(&d->page_alloc_lock);
3557 if ( is_xen_heap_page(page) || (page_get_owner(page) != NULL) )
3558 goto fail;
3560 if ( d->is_dying )
3561 goto fail;
3563 if ( page->count_info & ~(PGC_allocated | 1) )
3564 goto fail;
3566 if ( !(memflags & MEMF_no_refcount) )
3568 if ( d->tot_pages >= d->max_pages )
3569 goto fail;
3570 d->tot_pages++;
3573 page->count_info = PGC_allocated | 1;
3574 page_set_owner(page, d);
3575 page_list_add_tail(page,&d->page_list);
3577 spin_unlock(&d->page_alloc_lock);
3578 return 0;
3580 fail:
3581 spin_unlock(&d->page_alloc_lock);
3582 MEM_LOG("Bad donate %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info,
3583 (void *)page_to_mfn(page), d, d->domain_id,
3584 page_get_owner(page), page->count_info, page->u.inuse.type_info);
3585 return -1;
3588 int steal_page(
3589 struct domain *d, struct page_info *page, unsigned int memflags)
3591 unsigned long x, y;
3593 spin_lock(&d->page_alloc_lock);
3595 if ( is_xen_heap_page(page) || (page_get_owner(page) != d) )
3596 goto fail;
3598 /*
3599 * We require there is just one reference (PGC_allocated). We temporarily
3600 * drop this reference now so that we can safely swizzle the owner.
3601 */
3602 y = page->count_info;
3603 do {
3604 x = y;
3605 if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
3606 goto fail;
3607 y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask);
3608 } while ( y != x );
3610 /* Swizzle the owner then reinstate the PGC_allocated reference. */
3611 page_set_owner(page, NULL);
3612 y = page->count_info;
3613 do {
3614 x = y;
3615 BUG_ON((x & (PGC_count_mask|PGC_allocated)) != PGC_allocated);
3616 } while ( (y = cmpxchg(&page->count_info, x, x | 1)) != x );
3618 /* Unlink from original owner. */
3619 if ( !(memflags & MEMF_no_refcount) )
3620 d->tot_pages--;
3621 page_list_del(page, &d->page_list);
3623 spin_unlock(&d->page_alloc_lock);
3624 return 0;
3626 fail:
3627 spin_unlock(&d->page_alloc_lock);
3628 MEM_LOG("Bad page %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info,
3629 (void *)page_to_mfn(page), d, d->domain_id,
3630 page_get_owner(page), page->count_info, page->u.inuse.type_info);
3631 return -1;
3634 int do_update_va_mapping(unsigned long va, u64 val64,
3635 unsigned long flags)
3637 l1_pgentry_t val = l1e_from_intpte(val64);
3638 struct vcpu *v = current;
3639 struct domain *d = v->domain;
3640 struct page_info *gl1pg;
3641 l1_pgentry_t *pl1e;
3642 unsigned long bmap_ptr, gl1mfn;
3643 cpumask_t pmask;
3644 int rc;
3646 perfc_incr(calls_to_update_va);
3648 rc = xsm_update_va_mapping(d, FOREIGNDOM, val);
3649 if ( rc )
3650 return rc;
3652 rc = -EINVAL;
3653 pl1e = guest_map_l1e(v, va, &gl1mfn);
3654 if ( unlikely(!pl1e || !get_page_from_pagenr(gl1mfn, d)) )
3655 goto out;
3657 gl1pg = mfn_to_page(gl1mfn);
3658 if ( !page_lock(gl1pg) )
3660 put_page(gl1pg);
3661 goto out;
3664 if ( (gl1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3666 page_unlock(gl1pg);
3667 put_page(gl1pg);
3668 goto out;
3671 rc = mod_l1_entry(pl1e, val, gl1mfn, 0, v) ? 0 : -EINVAL;
3673 page_unlock(gl1pg);
3674 put_page(gl1pg);
3676 out:
3677 if ( pl1e )
3678 guest_unmap_l1e(v, pl1e);
3680 switch ( flags & UVMF_FLUSHTYPE_MASK )
3682 case UVMF_TLB_FLUSH:
3683 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3685 case UVMF_LOCAL:
3686 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
3687 break;
3688 case UVMF_ALL:
3689 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
3690 break;
3691 default:
3692 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_ALL_TLBS )
3693 break;
3694 rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
3695 void),
3696 &pmask);
3697 if ( cpu_isset(smp_processor_id(), pmask) )
3698 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
3699 flush_tlb_mask(&pmask);
3700 break;
3702 break;
3704 case UVMF_INVLPG:
3705 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_ALL_TLBS )
3706 break;
3707 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3709 case UVMF_LOCAL:
3710 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_TLB )
3711 break;
3712 if ( !paging_mode_enabled(d) ||
3713 (paging_invlpg(v, va) != 0) )
3714 flush_tlb_one_local(va);
3715 break;
3716 case UVMF_ALL:
3717 flush_tlb_one_mask(&d->domain_dirty_cpumask, va);
3718 break;
3719 default:
3720 rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
3721 void),
3722 &pmask);
3723 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_TLB )
3724 cpu_clear(smp_processor_id(), pmask);
3725 flush_tlb_one_mask(&pmask, va);
3726 break;
3728 break;
3731 process_deferred_ops();
3733 return rc;
3736 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
3737 unsigned long flags,
3738 domid_t domid)
3740 int rc;
3742 if ( !set_foreigndom(domid) )
3743 return -ESRCH;
3745 rc = do_update_va_mapping(va, val64, flags);
3747 BUG_ON(this_cpu(percpu_mm_info).deferred_ops);
3748 process_deferred_ops(); /* only to clear foreigndom */
3750 return rc;
3755 /*************************
3756 * Descriptor Tables
3757 */
3759 void destroy_gdt(struct vcpu *v)
3761 int i;
3762 unsigned long pfn;
3764 v->arch.guest_context.gdt_ents = 0;
3765 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
3767 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
3768 put_page_and_type(mfn_to_page(pfn));
3769 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
3770 v->arch.guest_context.gdt_frames[i] = 0;
3775 long set_gdt(struct vcpu *v,
3776 unsigned long *frames,
3777 unsigned int entries)
3779 struct domain *d = v->domain;
3780 /* NB. There are 512 8-byte entries per GDT page. */
3781 int i, nr_pages = (entries + 511) / 512;
3782 unsigned long mfn;
3784 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3785 return -EINVAL;
3787 /* Check the pages in the new GDT. */
3788 for ( i = 0; i < nr_pages; i++ )
3790 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
3791 if ( !mfn_valid(mfn) ||
3792 !get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page) )
3793 goto fail;
3796 /* Tear down the old GDT. */
3797 destroy_gdt(v);
3799 /* Install the new GDT. */
3800 v->arch.guest_context.gdt_ents = entries;
3801 for ( i = 0; i < nr_pages; i++ )
3803 v->arch.guest_context.gdt_frames[i] = frames[i];
3804 l1e_write(&v->arch.perdomain_ptes[i],
3805 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
3808 return 0;
3810 fail:
3811 while ( i-- > 0 )
3812 put_page_and_type(mfn_to_page(frames[i]));
3813 return -EINVAL;
3817 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
3819 int nr_pages = (entries + 511) / 512;
3820 unsigned long frames[16];
3821 struct vcpu *curr = current;
3822 long ret;
3824 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
3825 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3826 return -EINVAL;
3828 if ( copy_from_guest(frames, frame_list, nr_pages) )
3829 return -EFAULT;
3831 domain_lock(curr->domain);
3833 if ( (ret = set_gdt(curr, frames, entries)) == 0 )
3834 flush_tlb_local();
3836 domain_unlock(curr->domain);
3838 return ret;
3842 long do_update_descriptor(u64 pa, u64 desc)
3844 struct domain *dom = current->domain;
3845 unsigned long gmfn = pa >> PAGE_SHIFT;
3846 unsigned long mfn;
3847 unsigned int offset;
3848 struct desc_struct *gdt_pent, d;
3849 struct page_info *page;
3850 long ret = -EINVAL;
3852 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
3854 *(u64 *)&d = desc;
3856 mfn = gmfn_to_mfn(dom, gmfn);
3857 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
3858 !mfn_valid(mfn) ||
3859 !check_descriptor(dom, &d) )
3860 return -EINVAL;
3862 page = mfn_to_page(mfn);
3863 if ( unlikely(!get_page(page, dom)) )
3864 return -EINVAL;
3866 /* Check if the given frame is in use in an unsafe context. */
3867 switch ( page->u.inuse.type_info & PGT_type_mask )
3869 case PGT_seg_desc_page:
3870 if ( unlikely(!get_page_type(page, PGT_seg_desc_page)) )
3871 goto out;
3872 break;
3873 default:
3874 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
3875 goto out;
3876 break;
3879 paging_mark_dirty(dom, mfn);
3881 /* All is good so make the update. */
3882 gdt_pent = map_domain_page(mfn);
3883 atomic_write64((uint64_t *)&gdt_pent[offset], *(uint64_t *)&d);
3884 unmap_domain_page(gdt_pent);
3886 put_page_type(page);
3888 ret = 0; /* success */
3890 out:
3891 put_page(page);
3893 return ret;
3896 typedef struct e820entry e820entry_t;
3897 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
3899 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
3901 struct page_info *page = NULL;
3902 int rc;
3904 switch ( op )
3906 case XENMEM_add_to_physmap:
3908 struct xen_add_to_physmap xatp;
3909 unsigned long prev_mfn, mfn = 0, gpfn;
3910 struct domain *d;
3912 if ( copy_from_guest(&xatp, arg, 1) )
3913 return -EFAULT;
3915 rc = rcu_lock_target_domain_by_id(xatp.domid, &d);
3916 if ( rc != 0 )
3917 return rc;
3919 if ( xsm_add_to_physmap(current->domain, d) )
3921 rcu_unlock_domain(d);
3922 return -EPERM;
3925 switch ( xatp.space )
3927 case XENMAPSPACE_shared_info:
3928 if ( xatp.idx == 0 )
3929 mfn = virt_to_mfn(d->shared_info);
3930 break;
3931 case XENMAPSPACE_grant_table:
3932 spin_lock(&d->grant_table->lock);
3934 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
3935 (xatp.idx < max_nr_grant_frames) )
3936 gnttab_grow_table(d, xatp.idx + 1);
3938 if ( xatp.idx < nr_grant_frames(d->grant_table) )
3939 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
3941 spin_unlock(&d->grant_table->lock);
3942 break;
3943 case XENMAPSPACE_gmfn:
3944 xatp.idx = gmfn_to_mfn(d, xatp.idx);
3945 if ( !get_page_from_pagenr(xatp.idx, d) )
3946 break;
3947 mfn = xatp.idx;
3948 page = mfn_to_page(mfn);
3949 break;
3950 default:
3951 break;
3954 if ( !paging_mode_translate(d) || (mfn == 0) )
3956 if ( page )
3957 put_page(page);
3958 rcu_unlock_domain(d);
3959 return -EINVAL;
3962 domain_lock(d);
3964 /* Remove previously mapped page if it was present. */
3965 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3966 if ( mfn_valid(prev_mfn) )
3968 if ( is_xen_heap_mfn(prev_mfn) )
3969 /* Xen heap frames are simply unhooked from this phys slot. */
3970 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
3971 else
3972 /* Normal domain memory is freed, to avoid leaking memory. */
3973 guest_remove_page(d, xatp.gpfn);
3976 /* Unmap from old location, if any. */
3977 gpfn = get_gpfn_from_mfn(mfn);
3978 if ( gpfn != INVALID_M2P_ENTRY )
3979 guest_physmap_remove_page(d, gpfn, mfn, 0);
3981 /* Map at new location. */
3982 guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
3984 domain_unlock(d);
3986 if ( page )
3987 put_page(page);
3989 rcu_unlock_domain(d);
3991 break;
3994 case XENMEM_set_memory_map:
3996 struct xen_foreign_memory_map fmap;
3997 struct domain *d;
3998 int rc;
4000 if ( copy_from_guest(&fmap, arg, 1) )
4001 return -EFAULT;
4003 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
4004 return -EINVAL;
4006 rc = rcu_lock_target_domain_by_id(fmap.domid, &d);
4007 if ( rc != 0 )
4008 return rc;
4010 rc = xsm_domain_memory_map(d);
4011 if ( rc )
4013 rcu_unlock_domain(d);
4014 return rc;
4017 rc = copy_from_guest(d->arch.e820, fmap.map.buffer,
4018 fmap.map.nr_entries) ? -EFAULT : 0;
4019 d->arch.nr_e820 = fmap.map.nr_entries;
4021 rcu_unlock_domain(d);
4022 return rc;
4025 case XENMEM_memory_map:
4027 struct xen_memory_map map;
4028 struct domain *d = current->domain;
4030 /* Backwards compatibility. */
4031 if ( d->arch.nr_e820 == 0 )
4032 return -ENOSYS;
4034 if ( copy_from_guest(&map, arg, 1) )
4035 return -EFAULT;
4037 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
4038 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
4039 copy_to_guest(arg, &map, 1) )
4040 return -EFAULT;
4042 return 0;
4045 case XENMEM_machine_memory_map:
4047 struct xen_memory_map memmap;
4048 XEN_GUEST_HANDLE(e820entry_t) buffer;
4049 int count;
4050 int rc;
4052 if ( !IS_PRIV(current->domain) )
4053 return -EINVAL;
4055 rc = xsm_machine_memory_map();
4056 if ( rc )
4057 return rc;
4059 if ( copy_from_guest(&memmap, arg, 1) )
4060 return -EFAULT;
4061 if ( memmap.nr_entries < e820.nr_map + 1 )
4062 return -EINVAL;
4064 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
4066 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
4067 if ( copy_to_guest(buffer, e820.map, count) < 0 )
4068 return -EFAULT;
4070 memmap.nr_entries = count;
4072 if ( copy_to_guest(arg, &memmap, 1) )
4073 return -EFAULT;
4075 return 0;
4078 case XENMEM_machphys_mapping:
4080 static const struct xen_machphys_mapping mapping = {
4081 .v_start = MACH2PHYS_VIRT_START,
4082 .v_end = MACH2PHYS_VIRT_END,
4083 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
4084 };
4086 if ( copy_to_guest(arg, &mapping, 1) )
4087 return -EFAULT;
4089 return 0;
4092 case XENMEM_set_pod_target:
4093 case XENMEM_get_pod_target:
4095 xen_pod_target_t target;
4096 struct domain *d;
4098 /* Support DOMID_SELF? */
4099 if ( !IS_PRIV(current->domain) )
4100 return -EINVAL;
4102 if ( copy_from_guest(&target, arg, 1) )
4103 return -EFAULT;
4105 rc = rcu_lock_target_domain_by_id(target.domid, &d);
4106 if ( rc != 0 )
4107 return rc;
4109 if ( op == XENMEM_set_pod_target )
4111 if ( target.target_pages > d->max_pages )
4113 rc = -EINVAL;
4114 goto pod_target_out_unlock;
4117 rc = p2m_pod_set_mem_target(d, target.target_pages);
4120 target.tot_pages = d->tot_pages;
4121 target.pod_cache_pages = d->arch.p2m->pod.count;
4122 target.pod_entries = d->arch.p2m->pod.entry_count;
4124 if ( copy_to_guest(arg, &target, 1) )
4126 rc = -EFAULT;
4127 goto pod_target_out_unlock;
4130 pod_target_out_unlock:
4131 rcu_unlock_domain(d);
4132 return rc;
4135 default:
4136 return subarch_memory_op(op, arg);
4139 return 0;
4143 /*************************
4144 * Writable Pagetables
4145 */
4147 struct ptwr_emulate_ctxt {
4148 struct x86_emulate_ctxt ctxt;
4149 unsigned long cr2;
4150 l1_pgentry_t pte;
4151 };
4153 static int ptwr_emulated_read(
4154 enum x86_segment seg,
4155 unsigned long offset,
4156 void *p_data,
4157 unsigned int bytes,
4158 struct x86_emulate_ctxt *ctxt)
4160 unsigned int rc;
4161 unsigned long addr = offset;
4163 if ( (rc = copy_from_user(p_data, (void *)addr, bytes)) != 0 )
4165 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
4166 return X86EMUL_EXCEPTION;
4169 return X86EMUL_OKAY;
4172 static int ptwr_emulated_update(
4173 unsigned long addr,
4174 paddr_t old,
4175 paddr_t val,
4176 unsigned int bytes,
4177 unsigned int do_cmpxchg,
4178 struct ptwr_emulate_ctxt *ptwr_ctxt)
4180 unsigned long mfn;
4181 unsigned long unaligned_addr = addr;
4182 struct page_info *page;
4183 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
4184 struct vcpu *v = current;
4185 struct domain *d = v->domain;
4187 /* Only allow naturally-aligned stores within the original %cr2 page. */
4188 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
4190 MEM_LOG("ptwr_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)",
4191 ptwr_ctxt->cr2, addr, bytes);
4192 return X86EMUL_UNHANDLEABLE;
4195 /* Turn a sub-word access into a full-word access. */
4196 if ( bytes != sizeof(paddr_t) )
4198 paddr_t full;
4199 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
4201 /* Align address; read full word. */
4202 addr &= ~(sizeof(paddr_t)-1);
4203 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
4205 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
4206 return X86EMUL_EXCEPTION;
4208 /* Mask out bits provided by caller. */
4209 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
4210 /* Shift the caller value and OR in the missing bits. */
4211 val &= (((paddr_t)1 << (bytes*8)) - 1);
4212 val <<= (offset)*8;
4213 val |= full;
4214 /* Also fill in missing parts of the cmpxchg old value. */
4215 old &= (((paddr_t)1 << (bytes*8)) - 1);
4216 old <<= (offset)*8;
4217 old |= full;
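/*
 * Worked example of the widening above: a 4-byte write of val 0xAABBCCDD at
 * offset 4 within an 8-byte PTE currently holding 0x1122334455667788 masks
 * 'full' down to 0x0000000055667788 and shifts val up to 0xAABBCCDD00000000,
 * yielding the full-word value 0xAABBCCDD55667788; 'old' is widened the same
 * way so that the cmpxchg path compares whole PTEs.
 */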
4220 pte = ptwr_ctxt->pte;
4221 mfn = l1e_get_pfn(pte);
4222 page = mfn_to_page(mfn);
4224 /* We are looking only for read-only mappings of p.t. pages. */
4225 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
4226 ASSERT(mfn_valid(mfn));
4227 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
4228 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
4229 ASSERT(page_get_owner(page) == d);
4231 /* Check the new PTE. */
4232 nl1e = l1e_from_intpte(val);
4233 if ( unlikely(!get_page_from_l1e(nl1e, d, d)) )
4235 if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
4236 !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
4238 /*
4239 * If this is an upper-half write to a PAE PTE then we assume that
4240 * the guest has simply got the two writes the wrong way round. We
4241 * zap the PRESENT bit on the assumption that the bottom half will
4242 * be written immediately after we return to the guest.
4243 */
4244 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
4245 l1e_get_intpte(nl1e));
4246 l1e_remove_flags(nl1e, _PAGE_PRESENT);
4248 else
4250 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
4251 return X86EMUL_UNHANDLEABLE;
4255 adjust_guest_l1e(nl1e, d);
4257 /* Checked successfully: do the update (write or cmpxchg). */
4258 pl1e = map_domain_page(mfn);
4259 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
4260 if ( do_cmpxchg )
4262 int okay;
4263 intpte_t t = old;
4264 ol1e = l1e_from_intpte(old);
4266 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
4267 &t, l1e_get_intpte(nl1e), _mfn(mfn));
4268 okay = (okay && t == old);
4270 if ( !okay )
4272 unmap_domain_page(pl1e);
4273 put_page_from_l1e(nl1e, d);
4274 return X86EMUL_CMPXCHG_FAILED;
4277 else
4279 ol1e = *pl1e;
4280 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, 0) )
4281 BUG();
4284 trace_ptwr_emulation(addr, nl1e);
4286 unmap_domain_page(pl1e);
4288 /* Finally, drop the old PTE. */
4289 put_page_from_l1e(ol1e, d);
4291 return X86EMUL_OKAY;
4294 static int ptwr_emulated_write(
4295 enum x86_segment seg,
4296 unsigned long offset,
4297 void *p_data,
4298 unsigned int bytes,
4299 struct x86_emulate_ctxt *ctxt)
4301 paddr_t val = 0;
4303 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
4305 MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)",
4306 offset, bytes);
4307 return X86EMUL_UNHANDLEABLE;
4310 memcpy(&val, p_data, bytes);
4312 return ptwr_emulated_update(
4313 offset, 0, val, bytes, 0,
4314 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
4317 static int ptwr_emulated_cmpxchg(
4318 enum x86_segment seg,
4319 unsigned long offset,
4320 void *p_old,
4321 void *p_new,
4322 unsigned int bytes,
4323 struct x86_emulate_ctxt *ctxt)
4325 paddr_t old = 0, new = 0;
4327 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
4329 MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)",
4330 offset, bytes);
4331 return X86EMUL_UNHANDLEABLE;
4334 memcpy(&old, p_old, bytes);
4335 memcpy(&new, p_new, bytes);
4337 return ptwr_emulated_update(
4338 offset, old, new, bytes, 1,
4339 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
4342 static struct x86_emulate_ops ptwr_emulate_ops = {
4343 .read = ptwr_emulated_read,
4344 .insn_fetch = ptwr_emulated_read,
4345 .write = ptwr_emulated_write,
4346 .cmpxchg = ptwr_emulated_cmpxchg,
4347 };
4349 /* Write page fault handler: check if guest is trying to modify a PTE. */
4350 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
4351 struct cpu_user_regs *regs)
4353 struct domain *d = v->domain;
4354 struct page_info *page;
4355 l1_pgentry_t pte;
4356 struct ptwr_emulate_ctxt ptwr_ctxt;
4357 int rc;
4359 /* Attempt to read the PTE that maps the VA being accessed. */
4360 guest_get_eff_l1e(v, addr, &pte);
4362 /* We are looking only for read-only mappings of p.t. pages. */
4363 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
4364 !get_page_from_pagenr(l1e_get_pfn(pte), d) )
4365 goto bail;
4367 page = l1e_get_page(pte);
4368 if ( !page_lock(page) )
4370 put_page(page);
4371 goto bail;
4374 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
4376 page_unlock(page);
4377 put_page(page);
4378 goto bail;
4381 ptwr_ctxt.ctxt.regs = regs;
4382 ptwr_ctxt.ctxt.force_writeback = 0;
4383 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
4384 is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
4385 ptwr_ctxt.cr2 = addr;
4386 ptwr_ctxt.pte = pte;
4388 rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
4390 page_unlock(page);
4391 put_page(page);
4393 if ( rc == X86EMUL_UNHANDLEABLE )
4394 goto bail;
4396 perfc_incr(ptwr_emulations);
4397 return EXCRET_fault_fixed;
4399 bail:
4400 return 0;
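 /*
  * Summary of the handler above: fetch the effective L1E for the faulting
  * address, insist that it is a present but read-only mapping of an L1 page
  * table owned by the faulting domain, take a reference and the page lock,
  * then emulate the faulting instruction with the restricted callbacks above.
  * Success is reported as EXCRET_fault_fixed; any other outcome returns 0 so
  * the caller (presumably the ordinary #PF path) handles the fault as usual.
  */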
4403 void free_xen_pagetable(void *v)
4405 extern int early_boot;
4407 if ( early_boot )
4408 return;
4410 if ( is_xen_heap_page(virt_to_page(v)) )
4411 free_xenheap_page(v);
4412 else
4413 free_domheap_page(virt_to_page(v));
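 /*
  * During early boot the page-table pages presumably come from memory that
  * was never handed to the runtime allocators, so there is nothing to free;
  * afterwards the page is returned to whichever heap (xenheap or domheap) it
  * was originally allocated from.
  */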
 4416 /* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
4417 #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f))
4418 #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))
4420 /*
4421 * map_pages_to_xen() can be called with interrupts disabled:
4422 * * During early bootstrap; or
4423 * * alloc_xenheap_pages() via memguard_guard_range
4424 * In these cases it is safe to use flush_area_local():
4425 * * Because only the local CPU is online; or
4426 * * Because stale TLB entries do not matter for memguard_[un]guard_range().
4427 */
4428 #define flush_area(v,f) (!local_irq_is_enabled() ? \
4429 flush_area_local((const void *)v, f) : \
4430 flush_area_all((const void *)v, f))
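 /*
  * flush_area() therefore degrades to a purely local flush exactly in the
  * irq-disabled cases enumerated above, and otherwise uses flush_area_all(),
  * which presumably involves the other online CPUs as well.
  */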
4432 int map_pages_to_xen(
4433 unsigned long virt,
4434 unsigned long mfn,
4435 unsigned long nr_mfns,
4436 unsigned int flags)
4438 l2_pgentry_t *pl2e, ol2e;
4439 l1_pgentry_t *pl1e, ol1e;
4440 unsigned int i;
4442 while ( nr_mfns != 0 )
4444 #ifdef __x86_64__
4445 l3_pgentry_t *pl3e = virt_to_xen_l3e(virt);
4446 l3_pgentry_t ol3e = *pl3e;
4448 if ( cpu_has_page1gb &&
4449 !(((virt >> PAGE_SHIFT) | mfn) &
4450 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) &&
4451 nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) &&
4452 !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) )
4454 /* 1GB-page mapping. */
4455 l3e_write_atomic(pl3e, l3e_from_pfn(mfn, l1f_to_lNf(flags)));
4457 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) )
4459 unsigned int flush_flags =
4460 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4462 if ( l3e_get_flags(ol3e) & _PAGE_PSE )
4464 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4465 flush_flags |= FLUSH_TLB_GLOBAL;
4466 if ( (lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
4467 PAGE_CACHE_ATTRS )
4468 flush_flags |= FLUSH_CACHE;
4469 flush_area(virt, flush_flags);
4471 else
4473 pl2e = l3e_to_l2e(ol3e);
4474 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4476 ol2e = pl2e[i];
4477 if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4478 continue;
4479 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4481 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
4482 flush_flags |= FLUSH_TLB_GLOBAL;
4483 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
4484 PAGE_CACHE_ATTRS )
4485 flush_flags |= FLUSH_CACHE;
4487 else
4489 unsigned int j;
4491 pl1e = l2e_to_l1e(ol2e);
4492 for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ )
4494 ol1e = pl1e[j];
4495 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
4496 flush_flags |= FLUSH_TLB_GLOBAL;
4497 if ( (l1e_get_flags(ol1e) ^ flags) &
4498 PAGE_CACHE_ATTRS )
4499 flush_flags |= FLUSH_CACHE;
4503 flush_area(virt, flush_flags);
4504 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4506 ol2e = pl2e[i];
4507 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) &&
4508 !(l2e_get_flags(ol2e) & _PAGE_PSE) )
4509 free_xen_pagetable(l2e_to_l1e(ol2e));
4511 free_xen_pagetable(pl2e);
4515 virt += 1UL << L3_PAGETABLE_SHIFT;
4516 mfn += 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4517 nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4518 continue;
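         /*
          * An existing 1GB superpage already covers this virtual address:
          * either it already provides the requested mapping (in which case
          * the rest of the superpage can simply be skipped), or it must be
          * shattered into a freshly allocated L2 table before remapping.
          */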
4521 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) &&
4522 (l3e_get_flags(ol3e) & _PAGE_PSE) )
4524 unsigned int flush_flags =
4525 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
 4527 /* Skip this L3E if the 1GB superpage already provides the requested mapping. */
4528 if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES *
4529 L1_PAGETABLE_ENTRIES - 1)) +
4530 (l2_table_offset(virt) << PAGETABLE_ORDER) +
4531 l1_table_offset(virt) == mfn) &&
4532 ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
4533 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 )
4535 /* We can skip to end of L3 superpage if we got a match. */
4536 i = (1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4537 (mfn & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4538 if ( i > nr_mfns )
4539 i = nr_mfns;
4540 virt += i << PAGE_SHIFT;
4541 mfn += i;
4542 nr_mfns -= i;
4543 continue;
4546 pl2e = alloc_xen_pagetable();
4547 if ( pl2e == NULL )
4548 return -ENOMEM;
4550 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4551 l2e_write(pl2e + i,
4552 l2e_from_pfn(l3e_get_pfn(ol3e) +
4553 (i << PAGETABLE_ORDER),
4554 l3e_get_flags(ol3e)));
4556 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4557 flush_flags |= FLUSH_TLB_GLOBAL;
4559 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4560 __PAGE_HYPERVISOR));
4561 flush_area(virt, flush_flags);
4563 #endif
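         /*
          * From here on the same logic is applied one level down: install a
          * 2MB superpage when alignment, length and flags allow, otherwise
          * fall back to individual 4kB entries, shattering or skipping an
          * existing 2MB superpage as appropriate.
          */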
4565 pl2e = virt_to_xen_l2e(virt);
4567 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
4568 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
4569 !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
4571 /* Super-page mapping. */
4572 ol2e = *pl2e;
4573 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_lNf(flags)));
4575 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4577 unsigned int flush_flags =
4578 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4580 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4582 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
4583 flush_flags |= FLUSH_TLB_GLOBAL;
4584 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
4585 PAGE_CACHE_ATTRS )
4586 flush_flags |= FLUSH_CACHE;
4587 flush_area(virt, flush_flags);
4589 else
4591 pl1e = l2e_to_l1e(ol2e);
4592 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4594 if ( l1e_get_flags(pl1e[i]) & _PAGE_GLOBAL )
4595 flush_flags |= FLUSH_TLB_GLOBAL;
4596 if ( (l1e_get_flags(pl1e[i]) ^ flags) &
4597 PAGE_CACHE_ATTRS )
4598 flush_flags |= FLUSH_CACHE;
4600 flush_area(virt, flush_flags);
4601 free_xen_pagetable(pl1e);
4605 virt += 1UL << L2_PAGETABLE_SHIFT;
4606 mfn += 1UL << PAGETABLE_ORDER;
4607 nr_mfns -= 1UL << PAGETABLE_ORDER;
4609 else
4611 /* Normal page mapping. */
4612 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4614 pl1e = alloc_xen_pagetable();
4615 if ( pl1e == NULL )
4616 return -ENOMEM;
4617 clear_page(pl1e);
4618 l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4619 __PAGE_HYPERVISOR));
4621 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4623 unsigned int flush_flags =
4624 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
 4626 /* Skip this L2E if the 2MB superpage already provides the requested mapping. */
4627 if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
4628 l1_table_offset(virt)) == mfn) &&
4629 (((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
4630 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
4632 /* We can skip to end of L2 superpage if we got a match. */
4633 i = (1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4634 (mfn & ((1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4635 if ( i > nr_mfns )
4636 i = nr_mfns;
4637 virt += i << L1_PAGETABLE_SHIFT;
4638 mfn += i;
4639 nr_mfns -= i;
4640 goto check_l3;
4643 pl1e = alloc_xen_pagetable();
4644 if ( pl1e == NULL )
4645 return -ENOMEM;
4647 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4648 l1e_write(&pl1e[i],
4649 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4650 lNf_to_l1f(l2e_get_flags(*pl2e))));
4652 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
4653 flush_flags |= FLUSH_TLB_GLOBAL;
4655 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4656 __PAGE_HYPERVISOR));
4657 flush_area(virt, flush_flags);
4660 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
4661 ol1e = *pl1e;
4662 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
4663 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
4665 unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
4666 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
4667 flush_flags |= FLUSH_TLB_GLOBAL;
4668 if ( (l1e_get_flags(ol1e) ^ flags) & PAGE_CACHE_ATTRS )
4669 flush_flags |= FLUSH_CACHE;
4670 flush_area(virt, flush_flags);
4673 virt += 1UL << L1_PAGETABLE_SHIFT;
4674 mfn += 1UL;
4675 nr_mfns -= 1UL;
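             /*
              * Opportunistically coalesce back into a 2MB superpage: once the
              * mapping has just crossed a 2MB boundary (or the request is
              * complete) and every L1 entry in this table maps consecutive
              * frames with identical PAGE_HYPERVISOR flags, replace the L2
              * entry with a superpage and free the now-redundant L1 table.
              */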
4677 if ( (flags == PAGE_HYPERVISOR) &&
4678 ((nr_mfns == 0) ||
4679 ((((virt >> PAGE_SHIFT) | mfn) &
4680 ((1 << PAGETABLE_ORDER) - 1)) == 0)) )
4682 unsigned long base_mfn;
4683 pl1e = l2e_to_l1e(*pl2e);
4684 base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
4685 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
4686 if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
4687 (l1e_get_flags(*pl1e) != flags) )
4688 break;
4689 if ( i == L1_PAGETABLE_ENTRIES )
4691 ol2e = *pl2e;
4692 l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
4693 l1f_to_lNf(flags)));
4694 flush_area(virt - PAGE_SIZE,
4695 FLUSH_TLB_GLOBAL |
4696 FLUSH_ORDER(PAGETABLE_ORDER));
4697 free_xen_pagetable(l2e_to_l1e(ol2e));
4702 check_l3: ;
4703 #ifdef __x86_64__
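         /*
          * Similarly, once a 1GB boundary is reached (or the request is
          * complete) and all L2 entries form one contiguous, uniformly
          * flagged region, fold them back into a single 1GB L3 superpage
          * and free the L2 table.
          */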
4704 if ( cpu_has_page1gb &&
4705 (flags == PAGE_HYPERVISOR) &&
4706 ((nr_mfns == 0) ||
4707 !(((virt >> PAGE_SHIFT) | mfn) &
4708 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) )
4710 unsigned long base_mfn;
4712 ol3e = *pl3e;
4713 pl2e = l3e_to_l2e(ol3e);
4714 base_mfn = l2e_get_pfn(*pl2e) & ~(L2_PAGETABLE_ENTRIES *
4715 L1_PAGETABLE_ENTRIES - 1);
4716 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
4717 if ( (l2e_get_pfn(*pl2e) !=
4718 (base_mfn + (i << PAGETABLE_ORDER))) ||
4719 (l2e_get_flags(*pl2e) != l1f_to_lNf(flags)) )
4720 break;
4721 if ( i == L2_PAGETABLE_ENTRIES )
4723 l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn,
4724 l1f_to_lNf(flags)));
4725 flush_area(virt - PAGE_SIZE,
4726 FLUSH_TLB_GLOBAL |
4727 FLUSH_ORDER(2*PAGETABLE_ORDER));
4728 free_xen_pagetable(l3e_to_l2e(ol3e));
4731 #endif
4734 return 0;
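 /*
  * A minimal usage sketch (not part of the original file; the variable names
  * are illustrative only): a caller wanting sixteen contiguous frames
  * starting at frame 'mfn' mapped at virtual address 'va' would do roughly
  *
  *     if ( map_pages_to_xen(va, mfn, 16, PAGE_HYPERVISOR) )
  *         panic("out of memory for Xen page tables\n");
  *
  * Passing MAP_SMALL_PAGES in addition forces 4kB mappings even where a 2MB
  * or 1GB superpage would otherwise be used, as the memguard code below does.
  */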
4737 void destroy_xen_mappings(unsigned long s, unsigned long e)
4739 l2_pgentry_t *pl2e;
4740 l1_pgentry_t *pl1e;
4741 unsigned int i;
4742 unsigned long v = s;
4744 ASSERT((s & ~PAGE_MASK) == 0);
4745 ASSERT((e & ~PAGE_MASK) == 0);
4747 while ( v < e )
4749 #ifdef __x86_64__
4750 l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
4752 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
4754 v += 1UL << L3_PAGETABLE_SHIFT;
4755 v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
4756 continue;
4759 if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
4761 if ( l2_table_offset(v) == 0 &&
4762 l1_table_offset(v) == 0 &&
4763 ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) )
4765 /* PAGE1GB: whole superpage is destroyed. */
4766 l3e_write_atomic(pl3e, l3e_empty());
4767 v += 1UL << L3_PAGETABLE_SHIFT;
4768 continue;
4771 /* PAGE1GB: shatter the superpage and fall through. */
4772 pl2e = alloc_xen_pagetable();
4773 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4774 l2e_write(pl2e + i,
4775 l2e_from_pfn(l3e_get_pfn(*pl3e) +
4776 (i << PAGETABLE_ORDER),
4777 l3e_get_flags(*pl3e)));
4778 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4779 __PAGE_HYPERVISOR));
4781 #endif
4783 pl2e = virt_to_xen_l2e(v);
4785 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4787 v += 1UL << L2_PAGETABLE_SHIFT;
4788 v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
4789 continue;
4792 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4794 if ( (l1_table_offset(v) == 0) &&
4795 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
4797 /* PSE: whole superpage is destroyed. */
4798 l2e_write_atomic(pl2e, l2e_empty());
4799 v += 1UL << L2_PAGETABLE_SHIFT;
4801 else
4803 /* PSE: shatter the superpage and try again. */
4804 pl1e = alloc_xen_pagetable();
4805 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4806 l1e_write(&pl1e[i],
4807 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4808 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
4809 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4810 __PAGE_HYPERVISOR));
4813 else
4815 /* Ordinary 4kB mapping. */
4816 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
4817 l1e_write_atomic(pl1e, l1e_empty());
4818 v += PAGE_SIZE;
4820 /* If we are done with the L2E, check if it is now empty. */
4821 if ( (v != e) && (l1_table_offset(v) != 0) )
4822 continue;
4823 pl1e = l2e_to_l1e(*pl2e);
4824 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4825 if ( l1e_get_intpte(pl1e[i]) != 0 )
4826 break;
4827 if ( i == L1_PAGETABLE_ENTRIES )
4829 /* Empty: zap the L2E and free the L1 page. */
4830 l2e_write_atomic(pl2e, l2e_empty());
4831 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4832 free_xen_pagetable(pl1e);
4836 #ifdef __x86_64__
4837 /* If we are done with the L3E, check if it is now empty. */
4838 if ( (v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0) )
4839 continue;
4840 pl2e = l3e_to_l2e(*pl3e);
4841 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4842 if ( l2e_get_intpte(pl2e[i]) != 0 )
4843 break;
4844 if ( i == L2_PAGETABLE_ENTRIES )
4846 /* Empty: zap the L3E and free the L2 page. */
4847 l3e_write_atomic(pl3e, l3e_empty());
4848 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4849 free_xen_pagetable(pl2e);
4851 #endif
4854 flush_area(NULL, FLUSH_TLB_GLOBAL);
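 /*
  * destroy_xen_mappings() thus removes every mapping in [s, e): superpages
  * that the range only partially covers are first shattered, L1/L2 tables
  * that become completely empty are zapped and freed (with a global TLB
  * flush before each free), and a final global flush covers the ordinary
  * entries cleared along the way.  Only the page-table pages are freed; the
  * frames that were mapped are left untouched.
  */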
4857 void __set_fixmap(
4858 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
4860 BUG_ON(idx >= __end_of_fixed_addresses);
4861 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
4864 #ifdef MEMORY_GUARD
4866 void memguard_init(void)
4868 unsigned long start = max_t(unsigned long, xen_phys_start, 1UL << 20);
4869 #ifdef __i386__
4870 map_pages_to_xen(
4871 (unsigned long)__va(start),
4872 start >> PAGE_SHIFT,
4873 (xenheap_phys_end - start) >> PAGE_SHIFT,
4874 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4875 #else
4876 map_pages_to_xen(
4877 (unsigned long)__va(start),
4878 start >> PAGE_SHIFT,
4879 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
4880 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4881 BUG_ON(start != xen_phys_start);
4882 map_pages_to_xen(
4883 XEN_VIRT_START,
4884 start >> PAGE_SHIFT,
4885 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
4886 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4887 #endif
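 /*
  * Both variants re-map the relevant ranges with MAP_SMALL_PAGES so that
  * every page is backed by an individual L1 entry; presumably this is what
  * later lets memguard_[un]guard_range() flip single pages without having to
  * shatter a superpage at guard time.
  */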
4890 static void __memguard_change_range(void *p, unsigned long l, int guard)
4892 unsigned long _p = (unsigned long)p;
4893 unsigned long _l = (unsigned long)l;
4894 unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
4896 /* Ensure we are dealing with a page-aligned whole number of pages. */
4897 ASSERT((_p&~PAGE_MASK) == 0);
4898 ASSERT((_l&~PAGE_MASK) == 0);
4900 if ( guard )
4901 flags &= ~_PAGE_PRESENT;
4903 map_pages_to_xen(
4904 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
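 /*
  * Guarding a range simply re-maps it in place with _PAGE_PRESENT cleared, so
  * any subsequent access takes a page fault; unguarding restores an ordinary
  * hypervisor mapping.  This is presumably why the flush_area() comment above
  * notes that stale TLB entries do not matter here: at worst they delay the
  * point at which the guard takes effect.
  */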
4907 void memguard_guard_range(void *p, unsigned long l)
4909 __memguard_change_range(p, l, 1);
4912 void memguard_unguard_range(void *p, unsigned long l)
4914 __memguard_change_range(p, l, 0);
4917 #endif
4919 void memguard_guard_stack(void *p)
4921 BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
4922 p = (void *)((unsigned long)p + STACK_SIZE -
4923 PRIMARY_STACK_SIZE - PAGE_SIZE);
4924 memguard_guard_range(p, PAGE_SIZE);
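 /*
  * This places a single guard page immediately below the primary stack area
  * at the top of the stack block passed in, so a stack overrun faults instead
  * of silently corrupting the rest of the allocation; the BUILD_BUG_ON()
  * above checks that the primary stack plus guard page fit within STACK_SIZE.
  */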
4927 /*
4928 * Local variables:
4929 * mode: C
4930 * c-set-style: "BSD"
4931 * c-basic-offset: 4
4932 * tab-width: 4
4933 * indent-tabs-mode: nil
4934 * End:
4935 */