
view xen/arch/x86/mm.c @ 20964:a3fa6d444b25

Fix domain reference leaks

Besides two unlikely/rarely hit ones in x86 code, the main offender
was tmh_client_from_cli_id(), which didn't even have a counterpart
(although it had a comment correctly noting that it causes d->refcnt to
get incremented). Unfortunately(?) this required a bit of code
restructuring (as I needed to change the code anyway, I also fixed
a couple of missing bounds checks which would sooner or later be
reported as security vulnerabilities), so I would hope Dan could give
it his blessing before it gets applied.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Feb 10 09:18:43 2010 +0000 (2010-02-10)
parents d311d1efc25e
children 9a1d7caa2024
line source
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
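/*
 * Illustrative aside: a minimal guest-side sketch of the (ptr, val) update
 * interface described above, assuming the public struct mmu_update and the
 * guest's usual HYPERVISOR_mmu_update() wrapper (both defined outside this
 * file); pte0_maddr, new_pte0 and handle_update_failure() are hypothetical
 * placeholders.  Each request asks Xen to perform *ptr = val after
 * validating the new entry against the tot_count/type_count rules set out
 * above.
 *
 *     struct mmu_update req[2];
 *
 *     req[0].ptr = pte0_maddr;   // machine address of the first PTE
 *     req[0].val = new_pte0;     // requested new contents
 *     req[1].ptr = pte1_maddr;
 *     req[1].val = new_pte1;
 *
 *     if ( HYPERVISOR_mmu_update(req, 2, NULL, DOMID_SELF) < 0 )
 *         // At least one update was refused (bad flags, refcount rules...).
 *         handle_update_failure();
 */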
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <asm/shared.h>
112 #include <public/memory.h>
113 #include <public/sched.h>
114 #include <xsm/xsm.h>
115 #include <xen/trace.h>
116 #include <asm/setup.h>
117 #include <asm/mem_sharing.h>
119 /*
120 * Mapping of first 2 or 4 megabytes of memory. This is mapped with 4kB
121 * mappings to avoid type conflicts with fixed-range MTRRs covering the
122 * lowest megabyte of physical memory. In any case the VGA hole should be
123 * mapped with type UC.
124 */
125 l1_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
126 l1_identmap[L1_PAGETABLE_ENTRIES];
128 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
130 /*
131 * PTE updates can be done with ordinary writes except:
132 * 1. Debug builds get extra checking by using CMPXCHG[8B].
133 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
134 */
135 #if !defined(NDEBUG) || defined(__i386__)
136 #define PTE_UPDATE_WITH_CMPXCHG
137 #endif
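/*
 * Illustrative aside: with PTE_UPDATE_WITH_CMPXCHG defined, a PTE is
 * replaced via a compare-and-exchange loop rather than a plain store, so
 * the (8-byte, on PAE) entry is written atomically and concurrent hardware
 * updates of the Accessed/Dirty bits are detected.  A simplified sketch of
 * the idiom used by update_intpte() later in this file (p, old and new are
 * placeholders for the entry pointer and its old/new values):
 *
 *     intpte_t t = old;
 *
 *     for ( ; ; )
 *     {
 *         intpte_t seen = cmpxchg(p, t, new);   // attempt *p: t -> new
 *         if ( seen == t )
 *             break;                            // update took effect
 *         // Only the Accessed/Dirty bits may legitimately have changed.
 *         t = seen;
 *     }
 */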
139 int mem_hotplug = 0;
141 /* Private domain structs for DOMID_XEN and DOMID_IO. */
142 struct domain *dom_xen, *dom_io, *dom_cow;
144 /* Frame table size in pages. */
145 unsigned long max_page;
146 unsigned long total_pages;
148 unsigned long __read_mostly pdx_group_valid[BITS_TO_LONGS(
149 (FRAMETABLE_SIZE / sizeof(*frame_table) + PDX_GROUP_COUNT - 1)
150 / PDX_GROUP_COUNT)] = { [0] = 1 };
152 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
154 int opt_allow_hugepage;
155 boolean_param("allowhugepage", opt_allow_hugepage);
157 #define l1_disallow_mask(d) \
158 ((d != dom_io) && \
159 (rangeset_is_empty((d)->iomem_caps) && \
160 rangeset_is_empty((d)->arch.ioport_caps) && \
161 !has_arch_pdevs(d)) ? \
162 L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
164 #ifdef __x86_64__
165 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
166 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
167 L3_DISALLOW_MASK : \
168 COMPAT_L3_DISALLOW_MASK)
169 #else
170 #define l3_disallow_mask(d) L3_DISALLOW_MASK
171 #endif
173 static void __init init_frametable_chunk(void *start, void *end)
174 {
175 unsigned long s = (unsigned long)start;
176 unsigned long e = (unsigned long)end;
177 unsigned long step, mfn;
179 ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
180 for ( ; s < e; s += step << PAGE_SHIFT )
181 {
182 step = 1UL << (cpu_has_page1gb &&
183 !(s & ((1UL << L3_PAGETABLE_SHIFT) - 1)) ?
184 L3_PAGETABLE_SHIFT - PAGE_SHIFT :
185 L2_PAGETABLE_SHIFT - PAGE_SHIFT);
186 /*
187 * The hardcoded 4 below is arbitrary - just pick whatever you think
188 * is reasonable to waste as a trade-off for using a large page.
189 */
190 while ( step && s + (step << PAGE_SHIFT) > e + (4 << PAGE_SHIFT) )
191 step >>= PAGETABLE_ORDER;
192 do {
193 mfn = alloc_boot_pages(step, step);
194 } while ( !mfn && (step >>= PAGETABLE_ORDER) );
195 if ( !mfn )
196 panic("Not enough memory for frame table");
197 map_pages_to_xen(s, mfn, step, PAGE_HYPERVISOR);
198 }
200 memset(start, 0, end - start);
201 memset(end, -1, s - (unsigned long)end);
202 }
204 void __init init_frametable(void)
205 {
206 unsigned int sidx, eidx, nidx;
207 unsigned int max_idx = (max_pdx + PDX_GROUP_COUNT - 1) / PDX_GROUP_COUNT;
209 #ifdef __x86_64__
210 BUILD_BUG_ON(XEN_VIRT_END > FRAMETABLE_VIRT_END);
211 #endif
212 BUILD_BUG_ON(FRAMETABLE_VIRT_START & ((1UL << L2_PAGETABLE_SHIFT) - 1));
214 for ( sidx = 0; ; sidx = nidx )
215 {
216 eidx = find_next_zero_bit(pdx_group_valid, max_idx, sidx);
217 nidx = find_next_bit(pdx_group_valid, max_idx, eidx);
218 if ( nidx >= max_idx )
219 break;
220 init_frametable_chunk(pdx_to_page(sidx * PDX_GROUP_COUNT),
221 pdx_to_page(eidx * PDX_GROUP_COUNT));
222 }
223 if ( !mem_hotplug )
224 init_frametable_chunk(pdx_to_page(sidx * PDX_GROUP_COUNT),
225 pdx_to_page(max_pdx - 1) + 1);
226 else
227 {
228 init_frametable_chunk(pdx_to_page(sidx * PDX_GROUP_COUNT),
229 pdx_to_page(max_idx * PDX_GROUP_COUNT - 1) + 1);
230 memset(pdx_to_page(max_pdx), -1,
231 (unsigned long)pdx_to_page(max_idx * PDX_GROUP_COUNT) -
232 (unsigned long)pdx_to_page(max_pdx));
233 }
234 }
236 void __init arch_init_memory(void)
237 {
238 unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn;
240 /*
241 * Initialise our DOMID_XEN domain.
242 * Any Xen-heap pages that we will allow to be mapped will have
243 * their domain field set to dom_xen.
244 */
245 dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
246 BUG_ON(dom_xen == NULL);
248 /*
249 * Initialise our DOMID_IO domain.
250 * This domain owns I/O pages that are within the range of the page_info
251 * array. Mappings occur at the privilege level of the caller.
252 */
253 dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
254 BUG_ON(dom_io == NULL);
256 /*
257 * Initialise our DOMID_COW domain.
258 * This domain owns sharable pages.
259 */
260 dom_cow = domain_create(DOMID_COW, DOMCRF_dummy, 0);
261 BUG_ON(dom_cow == NULL);
263 /* First 1MB of RAM is historically marked as I/O. */
264 for ( i = 0; i < 0x100; i++ )
265 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
267 /* Any areas not specified as RAM by the e820 map are considered I/O. */
268 for ( i = 0, pfn = 0; pfn < max_page; i++ )
269 {
270 while ( (i < e820.nr_map) &&
271 (e820.map[i].type != E820_RAM) &&
272 (e820.map[i].type != E820_UNUSABLE) )
273 i++;
275 if ( i >= e820.nr_map )
276 {
277 /* No more RAM regions: mark as I/O right to end of memory map. */
278 rstart_pfn = rend_pfn = max_page;
279 }
280 else
281 {
282 /* Mark as I/O just up as far as next RAM region. */
283 rstart_pfn = min_t(unsigned long, max_page,
284 PFN_UP(e820.map[i].addr));
285 rend_pfn = max_t(unsigned long, rstart_pfn,
286 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
287 }
289 /*
290 * Make sure any Xen mappings of RAM holes above 1MB are blown away.
291 * In particular this ensures that RAM holes are respected even in
292 * the statically-initialised 1-16MB mapping area.
293 */
294 iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT));
295 #if defined(CONFIG_X86_32)
296 ioend_pfn = min_t(unsigned long, rstart_pfn,
297 DIRECTMAP_MBYTES << (20 - PAGE_SHIFT));
298 #else
299 ioend_pfn = min(rstart_pfn, 16UL << (20 - PAGE_SHIFT));
300 #endif
301 if ( iostart_pfn < ioend_pfn )
302 destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn),
303 (unsigned long)mfn_to_virt(ioend_pfn));
305 /* Mark as I/O up to next RAM region. */
306 for ( ; pfn < rstart_pfn; pfn++ )
307 {
308 if ( !mfn_valid(pfn) )
309 continue;
310 share_xen_page_with_guest(
311 mfn_to_page(pfn), dom_io, XENSHARE_writable);
312 }
314 /* Skip the RAM region. */
315 pfn = rend_pfn;
316 }
318 subarch_init_memory();
320 mem_sharing_init();
321 }
323 int page_is_ram_type(unsigned long mfn, unsigned long mem_type)
324 {
325 uint64_t maddr = pfn_to_paddr(mfn);
326 int i;
328 for ( i = 0; i < e820.nr_map; i++ )
329 {
330 switch ( e820.map[i].type )
331 {
332 case E820_RAM:
333 if ( mem_type & RAM_TYPE_CONVENTIONAL )
334 break;
335 continue;
336 case E820_RESERVED:
337 if ( mem_type & RAM_TYPE_RESERVED )
338 break;
339 continue;
340 case E820_UNUSABLE:
341 if ( mem_type & RAM_TYPE_UNUSABLE )
342 break;
343 continue;
344 case E820_ACPI:
345 case E820_NVS:
346 if ( mem_type & RAM_TYPE_ACPI )
347 break;
348 continue;
349 default:
350 /* unknown */
351 continue;
352 }
354 /* Test the range. */
355 if ( (e820.map[i].addr <= maddr) &&
356 ((e820.map[i].addr + e820.map[i].size) >= (maddr + PAGE_SIZE)) )
357 return 1;
358 }
360 return 0;
361 }
363 unsigned long domain_get_maximum_gpfn(struct domain *d)
364 {
365 if ( is_hvm_domain(d) )
366 return d->arch.p2m->max_mapped_pfn;
367 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
368 return arch_get_max_pfn(d) - 1;
369 }
371 void share_xen_page_with_guest(
372 struct page_info *page, struct domain *d, int readonly)
373 {
374 if ( page_get_owner(page) == d )
375 return;
377 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
379 spin_lock(&d->page_alloc_lock);
381 /* The incremented type count pins as writable or read-only. */
382 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
383 page->u.inuse.type_info |= PGT_validated | 1;
385 page_set_owner(page, d);
386 wmb(); /* install valid domain ptr before updating refcnt. */
387 ASSERT((page->count_info & ~PGC_xen_heap) == 0);
389 /* Only add to the allocation list if the domain isn't dying. */
390 if ( !d->is_dying )
391 {
392 page->count_info |= PGC_allocated | 1;
393 if ( unlikely(d->xenheap_pages++ == 0) )
394 get_knownalive_domain(d);
395 page_list_add_tail(page, &d->xenpage_list);
396 }
398 spin_unlock(&d->page_alloc_lock);
399 }
401 void share_xen_page_with_privileged_guests(
402 struct page_info *page, int readonly)
403 {
404 share_xen_page_with_guest(page, dom_xen, readonly);
405 }
407 #if defined(__i386__)
409 #ifdef NDEBUG
410 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
411 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
412 #else
413 /*
414 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
415 * We cannot safely shadow the idle page table, nor shadow page tables
416 * (detected by zero reference count). As required for correctness, we
417 * always shadow PDPTs above 4GB.
418 */
419 #define l3tab_needs_shadow(mfn) \
420 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
421 (mfn_to_page(mfn)->count_info & PGC_count_mask) && \
422 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
423 ((mfn) >= 0x100000))
424 #endif
426 static l1_pgentry_t *fix_pae_highmem_pl1e;
428 /* Cache the address of PAE high-memory fixmap page tables. */
429 static int __init cache_pae_fixmap_address(void)
430 {
431 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
432 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
433 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
434 return 0;
435 }
436 __initcall(cache_pae_fixmap_address);
438 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
440 void make_cr3(struct vcpu *v, unsigned long mfn)
441 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
442 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
443 {
444 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
445 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
446 unsigned int cpu = smp_processor_id();
448 /* Fast path: does this mfn need a shadow at all? */
449 if ( !l3tab_needs_shadow(mfn) )
450 {
451 v->arch.cr3 = mfn << PAGE_SHIFT;
452 /* Cache is no longer in use or valid */
453 cache->high_mfn = 0;
454 return;
455 }
457 /* Caching logic is not interrupt safe. */
458 ASSERT(!in_irq());
460 /* Protects against pae_flush_pgd(). */
461 spin_lock(&cache->lock);
463 cache->inuse_idx ^= 1;
464 cache->high_mfn = mfn;
466 /* Map the guest L3 table and copy to the chosen low-memory cache. */
467 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
468 /* First check the previous high mapping can't be in the TLB.
469 * (i.e. have we loaded CR3 since we last did this?) */
470 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
471 flush_tlb_one_local(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
472 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
473 lowmem_l3tab = cache->table[cache->inuse_idx];
474 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
475 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
476 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
478 v->arch.cr3 = __pa(lowmem_l3tab);
480 spin_unlock(&cache->lock);
481 }
483 #else /* !defined(__i386__) */
485 void make_cr3(struct vcpu *v, unsigned long mfn)
486 {
487 v->arch.cr3 = mfn << PAGE_SHIFT;
488 }
490 #endif /* !defined(__i386__) */
492 void write_ptbase(struct vcpu *v)
493 {
494 write_cr3(v->arch.cr3);
495 }
497 /*
498 * Should be called after CR3 is updated.
499 *
500 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
501 * for HVM guests, arch.monitor_table and hvm's guest CR3.
502 *
503 * Update ref counts to shadow tables appropriately.
504 */
505 void update_cr3(struct vcpu *v)
506 {
507 unsigned long cr3_mfn=0;
509 if ( paging_mode_enabled(v->domain) )
510 {
511 paging_update_cr3(v);
512 return;
513 }
515 #if CONFIG_PAGING_LEVELS == 4
516 if ( !(v->arch.flags & TF_kernel_mode) )
517 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
518 else
519 #endif
520 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
522 make_cr3(v, cr3_mfn);
523 }
526 static void invalidate_shadow_ldt(struct vcpu *v, int flush)
527 {
528 int i;
529 unsigned long pfn;
530 struct page_info *page;
532 BUG_ON(unlikely(in_irq()));
534 spin_lock(&v->arch.shadow_ldt_lock);
536 if ( v->arch.shadow_ldt_mapcnt == 0 )
537 goto out;
539 v->arch.shadow_ldt_mapcnt = 0;
541 for ( i = 16; i < 32; i++ )
542 {
543 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
544 if ( pfn == 0 ) continue;
545 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
546 page = mfn_to_page(pfn);
547 ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page);
548 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
549 put_page_and_type(page);
550 }
552 /* Rid TLBs of stale mappings (guest mappings and shadow mappings). */
553 if ( flush )
554 flush_tlb_mask(&v->vcpu_dirty_cpumask);
556 out:
557 spin_unlock(&v->arch.shadow_ldt_lock);
558 }
561 static int alloc_segdesc_page(struct page_info *page)
562 {
563 struct desc_struct *descs;
564 int i;
566 descs = __map_domain_page(page);
568 for ( i = 0; i < 512; i++ )
569 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
570 goto fail;
572 unmap_domain_page(descs);
573 return 0;
575 fail:
576 unmap_domain_page(descs);
577 return -EINVAL;
578 }
581 /* Map shadow page at offset @off. */
582 int map_ldt_shadow_page(unsigned int off)
583 {
584 struct vcpu *v = current;
585 struct domain *d = v->domain;
586 unsigned long gmfn, mfn;
587 l1_pgentry_t l1e, nl1e;
588 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
589 int okay;
591 BUG_ON(unlikely(in_irq()));
593 guest_get_eff_kern_l1e(v, gva, &l1e);
594 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
595 return 0;
597 gmfn = l1e_get_pfn(l1e);
598 mfn = gmfn_to_mfn(d, gmfn);
599 if ( unlikely(!mfn_valid(mfn)) )
600 return 0;
602 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page);
603 if ( unlikely(!okay) )
604 return 0;
606 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
608 spin_lock(&v->arch.shadow_ldt_lock);
609 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
610 v->arch.shadow_ldt_mapcnt++;
611 spin_unlock(&v->arch.shadow_ldt_lock);
613 return 1;
614 }
617 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
618 {
619 struct page_info *page = mfn_to_page(page_nr);
621 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
622 {
623 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
624 return 0;
625 }
627 return 1;
628 }
631 static int get_page_and_type_from_pagenr(unsigned long page_nr,
632 unsigned long type,
633 struct domain *d,
634 int partial,
635 int preemptible)
636 {
637 struct page_info *page = mfn_to_page(page_nr);
638 int rc;
640 if ( likely(partial >= 0) &&
641 unlikely(!get_page_from_pagenr(page_nr, d)) )
642 return -EINVAL;
644 rc = (preemptible ?
645 get_page_type_preemptible(page, type) :
646 (get_page_type(page, type) ? 0 : -EINVAL));
648 if ( unlikely(rc) && partial >= 0 )
649 put_page(page);
651 return rc;
652 }
654 static int get_data_page(
655 struct page_info *page, struct domain *d, int writeable)
656 {
657 int rc;
659 if ( writeable )
660 rc = get_page_and_type(page, d, PGT_writable_page);
661 else
662 rc = get_page(page, d);
664 return rc;
665 }
667 static void put_data_page(
668 struct page_info *page, int writeable)
669 {
670 if ( writeable )
671 put_page_and_type(page);
672 else
673 put_page(page);
674 }
676 /*
677 * We allow root tables to map each other (a.k.a. linear page tables). It
678 * needs some special care with reference counts and access permissions:
679 * 1. The mapping entry must be read-only, or the guest may get write access
680 * to its own PTEs.
681 * 2. We must only bump the reference counts for an *already validated*
682 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
683 * on a validation that is required to complete that validation.
684 * 3. We only need to increment the reference counts for the mapped page
685 * frame if it is mapped by a different root table. This is sufficient and
686 * also necessary to allow validation of a root table mapping itself.
687 */
688 #define define_get_linear_pagetable(level) \
689 static int \
690 get_##level##_linear_pagetable( \
691 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
692 { \
693 unsigned long x, y; \
694 struct page_info *page; \
695 unsigned long pfn; \
696 \
697 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
698 { \
699 MEM_LOG("Attempt to create linear p.t. with write perms"); \
700 return 0; \
701 } \
702 \
703 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
704 { \
705 /* Make sure the mapped frame belongs to the correct domain. */ \
706 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
707 return 0; \
708 \
709 /* \
710 * Ensure that the mapped frame is an already-validated page table. \
711 * If so, atomically increment the count (checking for overflow). \
712 */ \
713 page = mfn_to_page(pfn); \
714 y = page->u.inuse.type_info; \
715 do { \
716 x = y; \
717 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
718 unlikely((x & (PGT_type_mask|PGT_validated)) != \
719 (PGT_##level##_page_table|PGT_validated)) ) \
720 { \
721 put_page(page); \
722 return 0; \
723 } \
724 } \
725 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
726 } \
727 \
728 return 1; \
729 }
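/*
 * Illustrative aside: a "linear" mapping as validated above is a root-table
 * slot pointing back at a root table (possibly itself), which gives a PV
 * guest read-only access to its own page-table pages through virtual
 * addresses.  A hypothetical x86-64 guest sketch, assuming a reserved slot
 * number SELF_SLOT, the root table's machine address l4_maddr, and the
 * guest's usual HYPERVISOR_mmu_update() wrapper (all names outside this
 * file):
 *
 *     struct mmu_update req;
 *
 *     // Rule 1 above: the entry must not be writable (_PAGE_RW clear).
 *     req.ptr = l4_maddr + SELF_SLOT * sizeof(l4_pgentry_t);
 *     req.val = l4_maddr | _PAGE_PRESENT;
 *
 *     if ( HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF) < 0 )
 *         // Rejected: e.g. _PAGE_RW was set, or the L4 is not yet validated.
 *         handle_update_failure();
 */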
732 int is_iomem_page(unsigned long mfn)
733 {
734 struct page_info *page;
736 if ( !mfn_valid(mfn) )
737 return 1;
739 /* Caller must know that it is an iomem page, or a reference is held. */
740 page = mfn_to_page(mfn);
741 ASSERT((page->count_info & PGC_count_mask) != 0);
743 return (page_get_owner(page) == dom_io);
744 }
746 static void update_xen_mappings(unsigned long mfn, unsigned long cacheattr)
747 {
748 #ifdef __x86_64__
749 bool_t alias = mfn >= PFN_DOWN(xen_phys_start) &&
750 mfn < PFN_UP(xen_phys_start + (unsigned long)_end - XEN_VIRT_START);
751 unsigned long xen_va =
752 XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT);
754 if ( unlikely(alias) && cacheattr )
755 map_pages_to_xen(xen_va, mfn, 1, 0);
756 map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
757 PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
758 if ( unlikely(alias) && !cacheattr )
759 map_pages_to_xen(xen_va, mfn, 1, PAGE_HYPERVISOR);
760 #endif
761 }
763 int
764 get_page_from_l1e(
765 l1_pgentry_t l1e, struct domain *l1e_owner, struct domain *pg_owner)
766 {
767 unsigned long mfn = l1e_get_pfn(l1e);
768 struct page_info *page = mfn_to_page(mfn);
769 uint32_t l1f = l1e_get_flags(l1e);
770 struct vcpu *curr = current;
771 struct domain *real_pg_owner;
773 if ( !(l1f & _PAGE_PRESENT) )
774 return 1;
776 if ( unlikely(l1f & l1_disallow_mask(l1e_owner)) )
777 {
778 MEM_LOG("Bad L1 flags %x", l1f & l1_disallow_mask(l1e_owner));
779 return 0;
780 }
782 if ( !mfn_valid(mfn) ||
783 (real_pg_owner = page_get_owner_and_reference(page)) == dom_io )
784 {
785 /* Only needed the reference to confirm dom_io ownership. */
786 if ( mfn_valid(mfn) )
787 put_page(page);
789 /* DOMID_IO reverts to caller for privilege checks. */
790 if ( pg_owner == dom_io )
791 pg_owner = curr->domain;
793 if ( !iomem_access_permitted(pg_owner, mfn, mfn) )
794 {
795 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
796 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
797 pg_owner->domain_id, mfn);
798 return 0;
799 }
801 return 1;
802 }
804 if ( unlikely(real_pg_owner != pg_owner) )
805 {
806 /*
807 * Let privileged domains transfer the right to map their target
808 * domain's pages. This is used to allow stub-domain pvfb export to
809 * dom0, until pvfb supports granted mappings. At that time this
810 * minor hack can go away.
811 */
812 if ( (real_pg_owner == NULL) || (pg_owner == l1e_owner) ||
813 !IS_PRIV_FOR(pg_owner, real_pg_owner) )
814 goto could_not_pin;
815 pg_owner = real_pg_owner;
816 }
818 /* Foreign mappings into guests in shadow external mode don't
819 * contribute to writeable mapping refcounts. (This allows the
820 * qemu-dm helper process in dom0 to map the domain's memory without
821 * messing up the count of "real" writable mappings.) */
822 if ( (l1f & _PAGE_RW) &&
823 ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) &&
824 !get_page_type(page, PGT_writable_page) )
825 goto could_not_pin;
827 if ( pte_flags_to_cacheattr(l1f) !=
828 ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) )
829 {
830 unsigned long x, nx, y = page->count_info;
831 unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
833 if ( is_xen_heap_page(page) )
834 {
835 if ( (l1f & _PAGE_RW) &&
836 ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) )
837 put_page_type(page);
838 put_page(page);
839 MEM_LOG("Attempt to change cache attributes of Xen heap page");
840 return 0;
841 }
843 while ( ((y & PGC_cacheattr_mask) >> PGC_cacheattr_base) != cacheattr )
844 {
845 x = y;
846 nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
847 y = cmpxchg(&page->count_info, x, nx);
848 }
850 update_xen_mappings(mfn, cacheattr);
851 }
853 return 1;
855 could_not_pin:
856 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
857 " for l1e_owner=%d, pg_owner=%d",
858 mfn, get_gpfn_from_mfn(mfn),
859 l1e_get_intpte(l1e), l1e_owner->domain_id, pg_owner->domain_id);
860 if ( real_pg_owner != NULL )
861 put_page(page);
862 return 0;
863 }
866 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
867 define_get_linear_pagetable(l2);
868 static int
869 get_page_from_l2e(
870 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
871 {
872 unsigned long mfn = l2e_get_pfn(l2e);
873 int rc;
875 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
876 return 1;
878 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
879 {
880 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
881 return -EINVAL;
882 }
884 if ( !(l2e_get_flags(l2e) & _PAGE_PSE) )
885 {
886 rc = get_page_and_type_from_pagenr(mfn, PGT_l1_page_table, d, 0, 0);
887 if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
888 rc = 0;
889 }
890 else if ( !opt_allow_hugepage || (mfn & (L1_PAGETABLE_ENTRIES-1)) )
891 {
892 rc = -EINVAL;
893 }
894 else
895 {
896 unsigned long m = mfn;
897 int writeable = !!(l2e_get_flags(l2e) & _PAGE_RW);
899 do {
900 if ( !mfn_valid(m) ||
901 !get_data_page(mfn_to_page(m), d, writeable) )
902 {
903 while ( m-- > mfn )
904 put_data_page(mfn_to_page(m), writeable);
905 return -EINVAL;
906 }
907 } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
909 rc = 1;
910 }
912 return rc;
913 }
916 define_get_linear_pagetable(l3);
917 static int
918 get_page_from_l3e(
919 l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible)
920 {
921 int rc;
923 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
924 return 1;
926 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
927 {
928 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
929 return -EINVAL;
930 }
932 rc = get_page_and_type_from_pagenr(
933 l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible);
934 if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
935 rc = 0;
937 return rc;
938 }
940 #if CONFIG_PAGING_LEVELS >= 4
941 define_get_linear_pagetable(l4);
942 static int
943 get_page_from_l4e(
944 l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible)
945 {
946 int rc;
948 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
949 return 1;
951 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
952 {
953 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
954 return -EINVAL;
955 }
957 rc = get_page_and_type_from_pagenr(
958 l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible);
959 if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
960 rc = 0;
962 return rc;
963 }
964 #endif /* 4 level */
966 #ifdef __x86_64__
968 #ifdef USER_MAPPINGS_ARE_GLOBAL
969 #define adjust_guest_l1e(pl1e, d) \
970 do { \
971 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
972 likely(!is_pv_32on64_domain(d)) ) \
973 { \
974 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
975 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
976 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
977 MEM_LOG("Global bit is set to kernel page %lx", \
978 l1e_get_pfn((pl1e))); \
979 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
980 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
981 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
982 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
983 } \
984 } while ( 0 )
985 #else
986 #define adjust_guest_l1e(pl1e, d) \
987 do { \
988 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
989 likely(!is_pv_32on64_domain(d)) ) \
990 l1e_add_flags((pl1e), _PAGE_USER); \
991 } while ( 0 )
992 #endif
994 #define adjust_guest_l2e(pl2e, d) \
995 do { \
996 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
997 likely(!is_pv_32on64_domain(d)) ) \
998 l2e_add_flags((pl2e), _PAGE_USER); \
999 } while ( 0 )
1001 #define adjust_guest_l3e(pl3e, d) \
1002 do { \
1003 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
1004 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
1005 _PAGE_USER : \
1006 _PAGE_USER|_PAGE_RW); \
1007 } while ( 0 )
1009 #define adjust_guest_l4e(pl4e, d) \
1010 do { \
1011 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
1012 likely(!is_pv_32on64_domain(d)) ) \
1013 l4e_add_flags((pl4e), _PAGE_USER); \
1014 } while ( 0 )
1016 #else /* !defined(__x86_64__) */
1018 #define adjust_guest_l1e(_p, _d) ((void)(_d))
1019 #define adjust_guest_l2e(_p, _d) ((void)(_d))
1020 #define adjust_guest_l3e(_p, _d) ((void)(_d))
1022 #endif
1024 #ifdef __x86_64__
1025 #define unadjust_guest_l3e(pl3e, d) \
1026 do { \
1027 if ( unlikely(is_pv_32on64_domain(d)) && \
1028 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
1029 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
1030 } while ( 0 )
1031 #else
1032 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
1033 #endif
1035 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
1037 unsigned long pfn = l1e_get_pfn(l1e);
1038 struct page_info *page;
1039 struct domain *pg_owner;
1040 struct vcpu *v;
1042 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(pfn) )
1043 return;
1045 page = mfn_to_page(pfn);
1046 pg_owner = page_get_owner(page);
1048 /*
1049 * Check if this is a mapping that was established via a grant reference.
1050 * If it was then we should not be here: we require that such mappings are
1051 * explicitly destroyed via the grant-table interface.
1053 * The upshot of this is that the guest can end up with active grants that
1054 * it cannot destroy (because it no longer has a PTE to present to the
1055 * grant-table interface). This can lead to subtle hard-to-catch bugs,
1056 * hence a special grant PTE flag can be enabled to catch the bug early.
1058 * (Note that the undestroyable active grants are not a security hole in
1059 * Xen. All active grants can safely be cleaned up when the domain dies.)
1060 */
1061 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
1062 !l1e_owner->is_shutting_down && !l1e_owner->is_dying )
1064 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
1065 l1e_get_intpte(l1e));
1066 domain_crash(l1e_owner);
1069 /* Remember we didn't take a type-count of foreign writable mappings
1070 * to paging-external domains */
1071 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
1072 ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) )
1074 put_page_and_type(page);
1076 else
1078 /* We expect this is rare so we blow the entire shadow LDT. */
1079 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
1080 PGT_seg_desc_page)) &&
1081 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
1082 (l1e_owner == pg_owner) )
1084 for_each_vcpu ( pg_owner, v )
1085 invalidate_shadow_ldt(v, 1);
1087 put_page(page);
1092 /*
1093 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
1094 * Note also that this automatically deals correctly with linear p.t.'s.
1095 */
1096 static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
1098 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
1099 return 1;
1101 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1103 unsigned long mfn = l2e_get_pfn(l2e), m = mfn;
1104 int writeable = l2e_get_flags(l2e) & _PAGE_RW;
1106 ASSERT(!(mfn & (L1_PAGETABLE_ENTRIES-1)));
1107 do {
1108 put_data_page(mfn_to_page(m), writeable);
1109 } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
1111 else
1113 put_page_and_type(l2e_get_page(l2e));
1116 return 0;
1119 static int __put_page_type(struct page_info *, int preemptible);
1121 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
1122 int partial, int preemptible)
1124 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
1125 return 1;
1127 #ifdef __x86_64__
1128 if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) )
1130 unsigned long mfn = l3e_get_pfn(l3e);
1131 int writeable = l3e_get_flags(l3e) & _PAGE_RW;
1133 ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
1134 do {
1135 put_data_page(mfn_to_page(mfn), writeable);
1136 } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
1138 return 0;
1140 #endif
1142 if ( unlikely(partial > 0) )
1143 return __put_page_type(l3e_get_page(l3e), preemptible);
1145 return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
1148 #if CONFIG_PAGING_LEVELS >= 4
1149 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
1150 int partial, int preemptible)
1152 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
1153 (l4e_get_pfn(l4e) != pfn) )
1155 if ( unlikely(partial > 0) )
1156 return __put_page_type(l4e_get_page(l4e), preemptible);
1157 return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
1159 return 1;
1161 #endif
1163 static int alloc_l1_table(struct page_info *page)
1165 struct domain *d = page_get_owner(page);
1166 unsigned long pfn = page_to_mfn(page);
1167 l1_pgentry_t *pl1e;
1168 unsigned int i;
1170 pl1e = map_domain_page(pfn);
1172 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1174 if ( is_guest_l1_slot(i) &&
1175 unlikely(!get_page_from_l1e(pl1e[i], d, d)) )
1176 goto fail;
1178 adjust_guest_l1e(pl1e[i], d);
1181 unmap_domain_page(pl1e);
1182 return 0;
1184 fail:
1185 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
1186 while ( i-- > 0 )
1187 if ( is_guest_l1_slot(i) )
1188 put_page_from_l1e(pl1e[i], d);
1190 unmap_domain_page(pl1e);
1191 return -EINVAL;
1194 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
1196 struct page_info *page;
1197 l3_pgentry_t l3e3;
1198 #ifdef __i386__
1199 l2_pgentry_t *pl2e, l2e;
1200 int i;
1201 #endif
1203 if ( !is_pv_32bit_domain(d) )
1204 return 1;
1206 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
1208 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
1209 l3e3 = pl3e[3];
1210 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
1212 MEM_LOG("PAE L3 3rd slot is empty");
1213 return 0;
1216 /*
1217 * The Xen-private mappings include linear mappings. The L2 thus cannot
1218 * be shared by multiple L3 tables. The test here is adequate because:
1219 * 1. Cannot appear in slots != 3 because get_page_type() checks the
1220 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
1221 * 2. Cannot appear in another page table's L3:
1222 * a. alloc_l3_table() calls this function and this check will fail
1223 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
1224 */
1225 page = l3e_get_page(l3e3);
1226 BUG_ON(page->u.inuse.type_info & PGT_pinned);
1227 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1228 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1229 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1231 MEM_LOG("PAE L3 3rd slot is shared");
1232 return 0;
1235 #ifdef __i386__
1236 /* Xen linear pagetable mappings. */
1237 pl2e = map_domain_page(l3e_get_pfn(l3e3));
1238 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
1240 l2e = l2e_empty();
1241 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
1242 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
1243 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
1245 unmap_domain_page(pl2e);
1246 #endif
1248 return 1;
1251 #ifdef __i386__
1252 /* Flush a pgdir update into low-memory caches. */
1253 static void pae_flush_pgd(
1254 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
1256 struct domain *d = page_get_owner(mfn_to_page(mfn));
1257 struct vcpu *v;
1258 intpte_t _ol3e, _nl3e, _pl3e;
1259 l3_pgentry_t *l3tab_ptr;
1260 struct pae_l3_cache *cache;
1262 if ( unlikely(shadow_mode_enabled(d)) )
1264 cpumask_t m = CPU_MASK_NONE;
1265 /* Re-shadow this l3 table on any vcpus that are using it */
1266 for_each_vcpu ( d, v )
1267 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
1269 paging_update_cr3(v);
1270 cpus_or(m, m, v->vcpu_dirty_cpumask);
1272 flush_tlb_mask(&m);
1275 /* If below 4GB then the pgdir is not shadowed in low memory. */
1276 if ( !l3tab_needs_shadow(mfn) )
1277 return;
1279 for_each_vcpu ( d, v )
1281 cache = &v->arch.pae_l3_cache;
1283 spin_lock(&cache->lock);
1285 if ( cache->high_mfn == mfn )
1287 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1288 _ol3e = l3e_get_intpte(*l3tab_ptr);
1289 _nl3e = l3e_get_intpte(nl3e);
1290 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1291 BUG_ON(_pl3e != _ol3e);
1294 spin_unlock(&cache->lock);
1297 flush_tlb_mask(&d->domain_dirty_cpumask);
1299 #else
1300 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1301 #endif
1303 static int alloc_l2_table(struct page_info *page, unsigned long type,
1304 int preemptible)
1306 struct domain *d = page_get_owner(page);
1307 unsigned long pfn = page_to_mfn(page);
1308 l2_pgentry_t *pl2e;
1309 unsigned int i;
1310 int rc = 0;
1312 pl2e = map_domain_page(pfn);
1314 for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
1316 if ( preemptible && i && hypercall_preempt_check() )
1318 page->nr_validated_ptes = i;
1319 rc = -EAGAIN;
1320 break;
1323 if ( !is_guest_l2_slot(d, type, i) ||
1324 (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
1325 continue;
1327 if ( rc < 0 )
1329 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1330 while ( i-- > 0 )
1331 if ( is_guest_l2_slot(d, type, i) )
1332 put_page_from_l2e(pl2e[i], pfn);
1333 break;
1336 adjust_guest_l2e(pl2e[i], d);
1339 if ( rc >= 0 && (type & PGT_pae_xen_l2) )
1341 /* Xen private mappings. */
1342 #if defined(__i386__)
1343 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1344 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1345 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1346 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1347 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i],
1348 l2e_from_page(perdomain_pt_page(d, i),
1349 __PAGE_HYPERVISOR));
1350 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1351 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
1352 #else
1353 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1354 &compat_idle_pg_table_l2[
1355 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1356 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
1357 #endif
1360 unmap_domain_page(pl2e);
1361 return rc > 0 ? 0 : rc;
1364 static int alloc_l3_table(struct page_info *page, int preemptible)
1366 struct domain *d = page_get_owner(page);
1367 unsigned long pfn = page_to_mfn(page);
1368 l3_pgentry_t *pl3e;
1369 unsigned int i;
1370 int rc = 0, partial = page->partial_pte;
1372 #if CONFIG_PAGING_LEVELS == 3
1373 /*
1374 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1375 * the weird 'extended cr3' format for dealing with high-order address
1376 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1377 */
1378 if ( (pfn >= 0x100000) &&
1379 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1380 d->vcpu && d->vcpu[0] && d->vcpu[0]->is_initialised )
1382 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1383 return -EINVAL;
1385 #endif
1387 pl3e = map_domain_page(pfn);
1389 /*
1390 * PAE guests allocate full pages, but aren't required to initialize
1391 * more than the first four entries; when running in compatibility
1392 * mode, however, the full page is visible to the MMU, and hence all
1393 * 512 entries must be valid/verified, which is most easily achieved
1394 * by clearing them out.
1395 */
1396 if ( is_pv_32on64_domain(d) )
1397 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1399 for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
1400 i++, partial = 0 )
1402 if ( is_pv_32bit_domain(d) && (i == 3) )
1404 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1405 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
1406 rc = -EINVAL;
1407 else
1408 rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1409 PGT_l2_page_table |
1410 PGT_pae_xen_l2,
1411 d, partial, preemptible);
1413 else if ( !is_guest_l3_slot(i) ||
1414 (rc = get_page_from_l3e(pl3e[i], pfn, d,
1415 partial, preemptible)) > 0 )
1416 continue;
1418 if ( rc == -EAGAIN )
1420 page->nr_validated_ptes = i;
1421 page->partial_pte = partial ?: 1;
1423 else if ( rc == -EINTR && i )
1425 page->nr_validated_ptes = i;
1426 page->partial_pte = 0;
1427 rc = -EAGAIN;
1429 if ( rc < 0 )
1430 break;
1432 adjust_guest_l3e(pl3e[i], d);
1435 if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
1436 rc = -EINVAL;
1437 if ( rc < 0 && rc != -EAGAIN && rc != -EINTR )
1439 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1440 while ( i-- > 0 )
1442 if ( !is_guest_l3_slot(i) )
1443 continue;
1444 unadjust_guest_l3e(pl3e[i], d);
1445 put_page_from_l3e(pl3e[i], pfn, 0, 0);
1449 unmap_domain_page(pl3e);
1450 return rc > 0 ? 0 : rc;
1453 #if CONFIG_PAGING_LEVELS >= 4
1454 static int alloc_l4_table(struct page_info *page, int preemptible)
1456 struct domain *d = page_get_owner(page);
1457 unsigned long pfn = page_to_mfn(page);
1458 l4_pgentry_t *pl4e = page_to_virt(page);
1459 unsigned int i;
1460 int rc = 0, partial = page->partial_pte;
1462 for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
1463 i++, partial = 0 )
1465 if ( !is_guest_l4_slot(d, i) ||
1466 (rc = get_page_from_l4e(pl4e[i], pfn, d,
1467 partial, preemptible)) > 0 )
1468 continue;
1470 if ( rc == -EAGAIN )
1472 page->nr_validated_ptes = i;
1473 page->partial_pte = partial ?: 1;
1475 else if ( rc == -EINTR )
1477 if ( i )
1479 page->nr_validated_ptes = i;
1480 page->partial_pte = 0;
1481 rc = -EAGAIN;
1484 else if ( rc < 0 )
1486 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1487 while ( i-- > 0 )
1488 if ( is_guest_l4_slot(d, i) )
1489 put_page_from_l4e(pl4e[i], pfn, 0, 0);
1491 if ( rc < 0 )
1492 return rc;
1494 adjust_guest_l4e(pl4e[i], d);
1497 /* Xen private mappings. */
1498 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1499 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1500 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1501 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1502 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1503 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1504 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1505 __PAGE_HYPERVISOR);
1507 return rc > 0 ? 0 : rc;
1509 #else
1510 #define alloc_l4_table(page, preemptible) (-EINVAL)
1511 #endif
1514 static void free_l1_table(struct page_info *page)
1516 struct domain *d = page_get_owner(page);
1517 unsigned long pfn = page_to_mfn(page);
1518 l1_pgentry_t *pl1e;
1519 unsigned int i;
1521 pl1e = map_domain_page(pfn);
1523 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1524 if ( is_guest_l1_slot(i) )
1525 put_page_from_l1e(pl1e[i], d);
1527 unmap_domain_page(pl1e);
1531 static int free_l2_table(struct page_info *page, int preemptible)
1533 #ifdef __x86_64__
1534 struct domain *d = page_get_owner(page);
1535 #endif
1536 unsigned long pfn = page_to_mfn(page);
1537 l2_pgentry_t *pl2e;
1538 unsigned int i = page->nr_validated_ptes - 1;
1539 int err = 0;
1541 pl2e = map_domain_page(pfn);
1543 ASSERT(page->nr_validated_ptes);
1544 do {
1545 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
1546 put_page_from_l2e(pl2e[i], pfn) == 0 &&
1547 preemptible && i && hypercall_preempt_check() )
1549 page->nr_validated_ptes = i;
1550 err = -EAGAIN;
1552 } while ( !err && i-- );
1554 unmap_domain_page(pl2e);
1556 if ( !err )
1557 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1559 return err;
1562 static int free_l3_table(struct page_info *page, int preemptible)
1564 struct domain *d = page_get_owner(page);
1565 unsigned long pfn = page_to_mfn(page);
1566 l3_pgentry_t *pl3e;
1567 int rc = 0, partial = page->partial_pte;
1568 unsigned int i = page->nr_validated_ptes - !partial;
1570 pl3e = map_domain_page(pfn);
1572 do {
1573 if ( is_guest_l3_slot(i) )
1575 rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible);
1576 if ( rc < 0 )
1577 break;
1578 partial = 0;
1579 if ( rc > 0 )
1580 continue;
1581 unadjust_guest_l3e(pl3e[i], d);
1583 } while ( i-- );
1585 unmap_domain_page(pl3e);
1587 if ( rc == -EAGAIN )
1589 page->nr_validated_ptes = i;
1590 page->partial_pte = partial ?: -1;
1592 else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
1594 page->nr_validated_ptes = i + 1;
1595 page->partial_pte = 0;
1596 rc = -EAGAIN;
1598 return rc > 0 ? 0 : rc;
1601 #if CONFIG_PAGING_LEVELS >= 4
1602 static int free_l4_table(struct page_info *page, int preemptible)
1604 struct domain *d = page_get_owner(page);
1605 unsigned long pfn = page_to_mfn(page);
1606 l4_pgentry_t *pl4e = page_to_virt(page);
1607 int rc = 0, partial = page->partial_pte;
1608 unsigned int i = page->nr_validated_ptes - !partial;
1610 do {
1611 if ( is_guest_l4_slot(d, i) )
1612 rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible);
1613 if ( rc < 0 )
1614 break;
1615 partial = 0;
1616 } while ( i-- );
1618 if ( rc == -EAGAIN )
1620 page->nr_validated_ptes = i;
1621 page->partial_pte = partial ?: -1;
1623 else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
1625 page->nr_validated_ptes = i + 1;
1626 page->partial_pte = 0;
1627 rc = -EAGAIN;
1629 return rc > 0 ? 0 : rc;
1631 #else
1632 #define free_l4_table(page, preemptible) (-EINVAL)
1633 #endif
1635 static int page_lock(struct page_info *page)
1637 unsigned long x, nx;
1639 do {
1640 while ( (x = page->u.inuse.type_info) & PGT_locked )
1641 cpu_relax();
1642 nx = x + (1 | PGT_locked);
1643 if ( !(x & PGT_validated) ||
1644 !(x & PGT_count_mask) ||
1645 !(nx & PGT_count_mask) )
1646 return 0;
1647 } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
1649 return 1;
1652 static void page_unlock(struct page_info *page)
1654 unsigned long x, nx, y = page->u.inuse.type_info;
1656 do {
1657 x = y;
1658 nx = x - (1 | PGT_locked);
1659 } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
1662 /* How to write an entry to the guest pagetables.
1663 * Returns 0 for failure (pointer not valid), 1 for success. */
1664 static inline int update_intpte(intpte_t *p,
1665 intpte_t old,
1666 intpte_t new,
1667 unsigned long mfn,
1668 struct vcpu *v,
1669 int preserve_ad)
1671 int rv = 1;
1672 #ifndef PTE_UPDATE_WITH_CMPXCHG
1673 if ( !preserve_ad )
1675 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1677 else
1678 #endif
1680 intpte_t t = old;
1681 for ( ; ; )
1683 intpte_t _new = new;
1684 if ( preserve_ad )
1685 _new |= old & (_PAGE_ACCESSED | _PAGE_DIRTY);
1687 rv = paging_cmpxchg_guest_entry(v, p, &t, _new, _mfn(mfn));
1688 if ( unlikely(rv == 0) )
1690 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1691 ": saw %" PRIpte, old, _new, t);
1692 break;
1695 if ( t == old )
1696 break;
1698 /* Allowed to change in Accessed/Dirty flags only. */
1699 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1701 old = t;
1704 return rv;
1707 /* Macro that wraps the appropriate type-changes around update_intpte().
1708 * Arguments are: type, ptr, old, new, mfn, vcpu */
1709 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad) \
1710 update_intpte(&_t ## e_get_intpte(*(_p)), \
1711 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1712 (_m), (_v), (_ad))
1714 /* Update the L1 entry at pl1e to new value nl1e. */
1715 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1716 unsigned long gl1mfn, int preserve_ad,
1717 struct vcpu *pt_vcpu, struct domain *pg_dom)
1719 l1_pgentry_t ol1e;
1720 struct domain *pt_dom = pt_vcpu->domain;
1721 unsigned long mfn;
1722 p2m_type_t p2mt;
1723 int rc = 1;
1725 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1726 return 0;
1728 if ( unlikely(paging_mode_refcounts(pt_dom)) )
1730 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu, preserve_ad);
1731 return rc;
1734 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1736 /* Translate foreign guest addresses. */
1737 mfn = mfn_x(gfn_to_mfn(pg_dom, l1e_get_pfn(nl1e), &p2mt));
1738 if ( !p2m_is_ram(p2mt) || unlikely(mfn == INVALID_MFN) )
1739 return 0;
1740 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1741 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1743 if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom)) )
1745 MEM_LOG("Bad L1 flags %x",
1746 l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom));
1747 return 0;
1750 /* Fast path for identical mapping, r/w and presence. */
1751 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1753 adjust_guest_l1e(nl1e, pt_dom);
1754 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
1755 preserve_ad);
1756 return rc;
1759 if ( unlikely(!get_page_from_l1e(nl1e, pt_dom, pg_dom)) )
1760 return 0;
1762 adjust_guest_l1e(nl1e, pt_dom);
1763 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
1764 preserve_ad)) )
1766 ol1e = nl1e;
1767 rc = 0;
1770 else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
1771 preserve_ad)) )
1773 return 0;
1776 put_page_from_l1e(ol1e, pt_dom);
1777 return rc;
1781 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1782 static int mod_l2_entry(l2_pgentry_t *pl2e,
1783 l2_pgentry_t nl2e,
1784 unsigned long pfn,
1785 int preserve_ad,
1786 struct vcpu *vcpu)
1788 l2_pgentry_t ol2e;
1789 struct domain *d = vcpu->domain;
1790 struct page_info *l2pg = mfn_to_page(pfn);
1791 unsigned long type = l2pg->u.inuse.type_info;
1792 int rc = 1;
1794 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1796 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1797 return 0;
1800 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1801 return 0;
1803 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1805 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1807 MEM_LOG("Bad L2 flags %x",
1808 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1809 return 0;
1812 /* Fast path for identical mapping and presence. */
1813 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT) )
1815 adjust_guest_l2e(nl2e, d);
1816 rc = UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, preserve_ad);
1817 return rc;
1820 if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) )
1821 return 0;
1823 adjust_guest_l2e(nl2e, d);
1824 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
1825 preserve_ad)) )
1827 ol2e = nl2e;
1828 rc = 0;
1831 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
1832 preserve_ad)) )
1834 return 0;
1837 put_page_from_l2e(ol2e, pfn);
1838 return rc;
1841 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1842 static int mod_l3_entry(l3_pgentry_t *pl3e,
1843 l3_pgentry_t nl3e,
1844 unsigned long pfn,
1845 int preserve_ad,
1846 int preemptible,
1847 struct vcpu *vcpu)
1849 l3_pgentry_t ol3e;
1850 struct domain *d = vcpu->domain;
1851 int rc = 0;
1853 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1855 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1856 return -EINVAL;
1859 /*
1860 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1861 * would be a pain to ensure they remain continuously valid throughout.
1862 */
1863 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1864 return -EINVAL;
1866 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1867 return -EFAULT;
1869 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1871 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1873 MEM_LOG("Bad L3 flags %x",
1874 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1875 return -EINVAL;
1878 /* Fast path for identical mapping and presence. */
1879 if ( !l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT) )
1881 adjust_guest_l3e(nl3e, d);
1882 rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, preserve_ad);
1883 return rc ? 0 : -EFAULT;
1886 rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible);
1887 if ( unlikely(rc < 0) )
1888 return rc;
1889 rc = 0;
1891 adjust_guest_l3e(nl3e, d);
1892 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
1893 preserve_ad)) )
1895 ol3e = nl3e;
1896 rc = -EFAULT;
1899 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
1900 preserve_ad)) )
1902 return -EFAULT;
1905 if ( likely(rc == 0) )
1907 if ( !create_pae_xen_mappings(d, pl3e) )
1908 BUG();
1910 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1913 put_page_from_l3e(ol3e, pfn, 0, 0);
1914 return rc;
1917 #if CONFIG_PAGING_LEVELS >= 4
1919 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1920 static int mod_l4_entry(l4_pgentry_t *pl4e,
1921 l4_pgentry_t nl4e,
1922 unsigned long pfn,
1923 int preserve_ad,
1924 int preemptible,
1925 struct vcpu *vcpu)
1927 struct domain *d = vcpu->domain;
1928 l4_pgentry_t ol4e;
1929 int rc = 0;
1931 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1933 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1934 return -EINVAL;
1937 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1938 return -EFAULT;
1940 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1942 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1944 MEM_LOG("Bad L4 flags %x",
1945 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1946 return -EINVAL;
1949 /* Fast path for identical mapping and presence. */
1950 if ( !l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT) )
1952 adjust_guest_l4e(nl4e, d);
1953 rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, preserve_ad);
1954 return rc ? 0 : -EFAULT;
1957 rc = get_page_from_l4e(nl4e, pfn, d, 0, preemptible);
1958 if ( unlikely(rc < 0) )
1959 return rc;
1960 rc = 0;
1962 adjust_guest_l4e(nl4e, d);
1963 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
1964 preserve_ad)) )
1966 ol4e = nl4e;
1967 rc = -EFAULT;
1970 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
1971 preserve_ad)) )
1973 return -EFAULT;
1976 put_page_from_l4e(ol4e, pfn, 0, 0);
1977 return rc;
1980 #endif
1982 void put_page(struct page_info *page)
1984 unsigned long nx, x, y = page->count_info;
1986 do {
1987 ASSERT((y & PGC_count_mask) != 0);
1988 x = y;
1989 nx = x - 1;
1991 while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) );
1993 if ( unlikely((nx & PGC_count_mask) == 0) )
1995 cleanup_page_cacheattr(page);
1996 free_domheap_page(page);
2001 struct domain *page_get_owner_and_reference(struct page_info *page)
2003 unsigned long x, y = page->count_info;
2005 do {
2006 x = y;
2007 /*
2008 * Count == 0: Page is not allocated, so we cannot take a reference.
2009 * Count == -1: Reference count would wrap, which is invalid.
2010 * Count == -2: Remaining unused ref is reserved for get_page_light().
2011 */
2012 if ( unlikely(((x + 2) & PGC_count_mask) <= 2) )
2013 return NULL;
2015 while ( (y = cmpxchg(&page->count_info, x, x + 1)) != x );
2017 return page_get_owner(page);
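/*
 * Illustrative, standalone sketch (assumption-laden, not Xen code) of the
 * acquire loop in page_get_owner_and_reference() above.  DEMO_COUNT_MASK
 * and a bare unsigned long stand in for PGC_count_mask and
 * page->count_info; the (x + 2) test folds the three refused values
 * (count 0, -1 and -2) into one masked comparison.
 */
#include <stdbool.h>

#define DEMO_COUNT_MASK 0x00ffffffUL /* hypothetical: low 24 bits hold the count */

static bool demo_try_get_ref(unsigned long *count_info)
{
    unsigned long x, y = *count_info;

    do {
        x = y;
        /* 0 (unallocated), -1 (would wrap) and -2 (reserved ref) all land
         * in [0, 2] once 2 is added and the count mask is applied. */
        if ( ((x + 2) & DEMO_COUNT_MASK) <= 2 )
            return false;
    } while ( (y = __sync_val_compare_and_swap(count_info, x, x + 1)) != x );

    return true;
}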
2021 int get_page(struct page_info *page, struct domain *domain)
2023 struct domain *owner = page_get_owner_and_reference(page);
2025 if ( likely(owner == domain) )
2026 return 1;
2028 if ( owner != NULL )
2029 put_page(page);
2031 if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
2032 gdprintk(XENLOG_INFO,
2033 "Error pfn %lx: rd=%p, od=%p, caf=%08lx, taf=%"
2034 PRtype_info "\n",
2035 page_to_mfn(page), domain, owner,
2036 page->count_info, page->u.inuse.type_info);
2037 return 0;
2040 /*
2041 * Special version of get_page() to be used exclusively when
2042 * - a page is known to already have a non-zero reference count
2043 * - the page does not need its owner to be checked
2044 * - it will not be called more than once without dropping the thus
2045 * acquired reference again.
2046 * Due to get_page() reserving one reference, this call cannot fail.
2047 */
2048 static void get_page_light(struct page_info *page)
2050 unsigned long x, nx, y = page->count_info;
2052 do {
2053 x = y;
2054 nx = x + 1;
2055 BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
2056 BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
2057 y = cmpxchg(&page->count_info, x, nx);
2059 while ( unlikely(y != x) );
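/*
 * Companion sketch for get_page_light() above (again illustrative only,
 * reusing the hypothetical DEMO_COUNT_MASK from the previous sketch):
 * because the caller must already hold a reference, the increment needs
 * no owner lookup and no zero-count guard, only sanity assertions.
 */
#include <assert.h>

static void demo_get_ref_light(unsigned long *count_info)
{
    unsigned long x, nx, y = *count_info;

    do {
        x = y;
        nx = x + 1;
        assert(x & DEMO_COUNT_MASK);  /* caller already holds a reference */
        assert(nx & DEMO_COUNT_MASK); /* the count must not wrap to zero  */
    } while ( (y = __sync_val_compare_and_swap(count_info, x, nx)) != x );
}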
2062 static int alloc_page_type(struct page_info *page, unsigned long type,
2063 int preemptible)
2065 struct domain *owner = page_get_owner(page);
2066 int rc;
2068 /* A page table is dirtied when its type count becomes non-zero. */
2069 if ( likely(owner != NULL) )
2070 paging_mark_dirty(owner, page_to_mfn(page));
2072 switch ( type & PGT_type_mask )
2074 case PGT_l1_page_table:
2075 rc = alloc_l1_table(page);
2076 break;
2077 case PGT_l2_page_table:
2078 rc = alloc_l2_table(page, type, preemptible);
2079 break;
2080 case PGT_l3_page_table:
2081 rc = alloc_l3_table(page, preemptible);
2082 break;
2083 case PGT_l4_page_table:
2084 rc = alloc_l4_table(page, preemptible);
2085 break;
2086 case PGT_seg_desc_page:
2087 rc = alloc_segdesc_page(page);
2088 break;
2089 default:
2090 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%lx\n",
2091 type, page->u.inuse.type_info,
2092 page->count_info);
2093 rc = -EINVAL;
2094 BUG();
2097 /* No need for atomic update of type_info here: no one else updates it. */
2098 wmb();
2099 if ( rc == -EAGAIN )
2101 get_page_light(page);
2102 page->u.inuse.type_info |= PGT_partial;
2104 else if ( rc == -EINTR )
2106 ASSERT((page->u.inuse.type_info &
2107 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2108 page->u.inuse.type_info &= ~PGT_count_mask;
2110 else if ( rc )
2112 ASSERT(rc < 0);
2113 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
2114 PRtype_info ": caf=%08lx taf=%" PRtype_info,
2115 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
2116 type, page->count_info, page->u.inuse.type_info);
2117 page->u.inuse.type_info = 0;
2119 else
2121 page->u.inuse.type_info |= PGT_validated;
2124 return rc;
2128 int free_page_type(struct page_info *page, unsigned long type,
2129 int preemptible)
2131 struct domain *owner = page_get_owner(page);
2132 unsigned long gmfn;
2133 int rc;
2135 if ( likely(owner != NULL) && unlikely(paging_mode_enabled(owner)) )
2137 /* A page table is dirtied when its type count becomes zero. */
2138 paging_mark_dirty(owner, page_to_mfn(page));
2140 if ( shadow_mode_refcounts(owner) )
2141 return 0;
2143 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
2144 ASSERT(VALID_M2P(gmfn));
2145 /* Page sharing is not supported for shadowed domains. */
2146 if ( !SHARED_M2P(gmfn) )
2147 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
2150 if ( !(type & PGT_partial) )
2152 page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
2153 page->partial_pte = 0;
2156 switch ( type & PGT_type_mask )
2158 case PGT_l1_page_table:
2159 free_l1_table(page);
2160 rc = 0;
2161 break;
2162 case PGT_l2_page_table:
2163 rc = free_l2_table(page, preemptible);
2164 break;
2165 case PGT_l3_page_table:
2166 #if CONFIG_PAGING_LEVELS == 3
2167 if ( !(type & PGT_partial) )
2168 page->nr_validated_ptes = L3_PAGETABLE_ENTRIES;
2169 #endif
2170 rc = free_l3_table(page, preemptible);
2171 break;
2172 case PGT_l4_page_table:
2173 rc = free_l4_table(page, preemptible);
2174 break;
2175 default:
2176 MEM_LOG("type %lx pfn %lx", type, page_to_mfn(page));
2177 rc = -EINVAL;
2178 BUG();
2181 return rc;
2185 static int __put_final_page_type(
2186 struct page_info *page, unsigned long type, int preemptible)
2188 int rc = free_page_type(page, type, preemptible);
2190 /* No need for atomic update of type_info here: no one else updates it. */
2191 if ( rc == 0 )
2193 /*
2194 * Record TLB information for flush later. We do not stamp page tables
2195 * when running in shadow mode:
2196 * 1. Pointless, since it is the shadow PTs that must be tracked.
2197 * 2. Shadow mode reuses this field for shadowed page tables to
2198 * store flags info -- we don't want to conflict with that.
2199 */
2200 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2201 (page->count_info & PGC_page_table)) )
2202 page->tlbflush_timestamp = tlbflush_current_time();
2203 wmb();
2204 page->u.inuse.type_info--;
2206 else if ( rc == -EINTR )
2208 ASSERT((page->u.inuse.type_info &
2209 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2210 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2211 (page->count_info & PGC_page_table)) )
2212 page->tlbflush_timestamp = tlbflush_current_time();
2213 wmb();
2214 page->u.inuse.type_info |= PGT_validated;
2216 else
2218 BUG_ON(rc != -EAGAIN);
2219 wmb();
2220 get_page_light(page);
2221 page->u.inuse.type_info |= PGT_partial;
2224 return rc;
2228 static int __put_page_type(struct page_info *page,
2229 int preemptible)
2231 unsigned long nx, x, y = page->u.inuse.type_info;
2232 int rc = 0;
2234 for ( ; ; )
2236 x = y;
2237 nx = x - 1;
2239 ASSERT((x & PGT_count_mask) != 0);
2241 if ( unlikely((nx & PGT_count_mask) == 0) )
2243 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
2244 likely(nx & (PGT_validated|PGT_partial)) )
2246 /*
2247 * Page-table pages must be unvalidated when count is zero. The
2248 * 'free' is safe because the refcnt is non-zero and validated
2249 * bit is clear => other ops will spin or fail.
2250 */
2251 nx = x & ~(PGT_validated|PGT_partial);
2252 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
2253 x, nx)) != x) )
2254 continue;
2255 /* We cleared the 'valid' bit, so we do the cleanup. */
2256 rc = __put_final_page_type(page, x, preemptible);
2257 if ( x & PGT_partial )
2258 put_page(page);
2259 break;
2262 /*
2263 * Record TLB information for flush later. We do not stamp page
2264 * tables when running in shadow mode:
2265 * 1. Pointless, since it is the shadow PTs that must be tracked.
2266 * 2. Shadow mode reuses this field for shadowed page tables to
2267 * store flags info -- we don't want to conflict with that.
2268 */
2269 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2270 (page->count_info & PGC_page_table)) )
2271 page->tlbflush_timestamp = tlbflush_current_time();
2274 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2275 break;
2277 if ( preemptible && hypercall_preempt_check() )
2278 return -EINTR;
2281 return rc;
2285 static int __get_page_type(struct page_info *page, unsigned long type,
2286 int preemptible)
2288 unsigned long nx, x, y = page->u.inuse.type_info;
2289 int rc = 0;
2291 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
2293 for ( ; ; )
2295 x = y;
2296 nx = x + 1;
2297 if ( unlikely((nx & PGT_count_mask) == 0) )
2299 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
2300 return -EINVAL;
2302 else if ( unlikely((x & PGT_count_mask) == 0) )
2304 struct domain *d = page_get_owner(page);
2306 /* Normally we should never let a page go from type count 0
2307 * to type count 1 when it is shadowed. One exception:
2308 * out-of-sync shadowed pages are allowed to become
2309 * writeable. */
2310 if ( d && shadow_mode_enabled(d)
2311 && (page->count_info & PGC_page_table)
2312 && !((page->shadow_flags & (1u<<29))
2313 && type == PGT_writable_page) )
2314 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
2316 ASSERT(!(x & PGT_pae_xen_l2));
2317 if ( (x & PGT_type_mask) != type )
2319 /*
2320 * On type change we check to flush stale TLB entries. This
2321 * may be unnecessary (e.g., page was GDT/LDT) but those
2322 * circumstances should be very rare.
2323 */
2324 cpumask_t mask = d->domain_dirty_cpumask;
2326 /* Don't flush if the timestamp is old enough */
2327 tlbflush_filter(mask, page->tlbflush_timestamp);
2329 if ( unlikely(!cpus_empty(mask)) &&
2330 /* Shadow mode: track only writable pages. */
2331 (!shadow_mode_enabled(page_get_owner(page)) ||
2332 ((nx & PGT_type_mask) == PGT_writable_page)) )
2334 perfc_incr(need_flush_tlb_flush);
2335 flush_tlb_mask(&mask);
2338 /* We lose existing type and validity. */
2339 nx &= ~(PGT_type_mask | PGT_validated);
2340 nx |= type;
2342 /* No special validation needed for writable pages. */
2343 /* Page tables and GDT/LDT need to be scanned for validity. */
2344 if ( type == PGT_writable_page )
2345 nx |= PGT_validated;
2348 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
2350 /* Don't log failure if it could be a recursive-mapping attempt. */
2351 if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
2352 (type == PGT_l1_page_table) )
2353 return -EINVAL;
2354 if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
2355 (type == PGT_l2_page_table) )
2356 return -EINVAL;
2357 if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
2358 (type == PGT_l3_page_table) )
2359 return -EINVAL;
2360 MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
2361 "for mfn %lx (pfn %lx)",
2362 x, type, page_to_mfn(page),
2363 get_gpfn_from_mfn(page_to_mfn(page)));
2364 return -EINVAL;
2366 else if ( unlikely(!(x & PGT_validated)) )
2368 if ( !(x & PGT_partial) )
2370 /* Someone else is updating validation of this page. Wait... */
2371 while ( (y = page->u.inuse.type_info) == x )
2373 if ( preemptible && hypercall_preempt_check() )
2374 return -EINTR;
2375 cpu_relax();
2377 continue;
2379 /* Type ref count was left at 1 when PGT_partial got set. */
2380 ASSERT((x & PGT_count_mask) == 1);
2381 nx = x & ~PGT_partial;
2384 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2385 break;
2387 if ( preemptible && hypercall_preempt_check() )
2388 return -EINTR;
2391 if ( unlikely((x & PGT_type_mask) != type) )
2393 /* Special pages should not be accessible from devices. */
2394 struct domain *d = page_get_owner(page);
2395 if ( d && !is_hvm_domain(d) && unlikely(need_iommu(d)) )
2397 if ( (x & PGT_type_mask) == PGT_writable_page )
2398 iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));
2399 else if ( type == PGT_writable_page )
2400 iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
2401 page_to_mfn(page));
2405 if ( unlikely(!(nx & PGT_validated)) )
2407 if ( !(x & PGT_partial) )
2409 page->nr_validated_ptes = 0;
2410 page->partial_pte = 0;
2412 rc = alloc_page_type(page, type, preemptible);
2415 if ( (x & PGT_partial) && !(nx & PGT_partial) )
2416 put_page(page);
2418 return rc;
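/*
 * Hedged sketch of the central rule __get_page_type() enforces above: the
 * packed type_info word may only switch its type field while its use
 * count is zero; with a non-zero count the requested type must match the
 * current one (modulo PGT_pae_xen_l2).  DEMO_TYPE_MASK and
 * DEMO_TYPE_COUNT_MASK are hypothetical stand-ins for PGT_type_mask and
 * PGT_count_mask, whose real values live in asm/mm.h.
 */
#include <stdbool.h>

#define DEMO_TYPE_MASK       0xe0000000UL /* hypothetical type field */
#define DEMO_TYPE_COUNT_MASK 0x0fffffffUL /* hypothetical use count  */

static bool demo_type_change_allowed(unsigned long type_info,
                                     unsigned long wanted_type)
{
    if ( (type_info & DEMO_TYPE_COUNT_MASK) == 0 )
        return true;                                    /* free to re-type */
    return (type_info & DEMO_TYPE_MASK) == wanted_type; /* else must match */
}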
2421 void put_page_type(struct page_info *page)
2423 int rc = __put_page_type(page, 0);
2424 ASSERT(rc == 0);
2425 (void)rc;
2428 int get_page_type(struct page_info *page, unsigned long type)
2430 int rc = __get_page_type(page, type, 0);
2431 if ( likely(rc == 0) )
2432 return 1;
2433 ASSERT(rc == -EINVAL);
2434 return 0;
2437 int put_page_type_preemptible(struct page_info *page)
2439 return __put_page_type(page, 1);
2442 int get_page_type_preemptible(struct page_info *page, unsigned long type)
2444 return __get_page_type(page, type, 1);
2447 void cleanup_page_cacheattr(struct page_info *page)
2449 uint32_t cacheattr =
2450 (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base;
2452 if ( likely(cacheattr == 0) )
2453 return;
2455 page->count_info &= ~PGC_cacheattr_mask;
2457 BUG_ON(is_xen_heap_page(page));
2459 update_xen_mappings(page_to_mfn(page), 0);
2463 int new_guest_cr3(unsigned long mfn)
2465 struct vcpu *curr = current;
2466 struct domain *d = curr->domain;
2467 int okay;
2468 unsigned long old_base_mfn;
2470 #ifdef __x86_64__
2471 if ( is_pv_32on64_domain(d) )
2473 okay = paging_mode_refcounts(d)
2474 ? 0 /* Old code was broken, but what should it be? */
2475 : mod_l4_entry(
2476 __va(pagetable_get_paddr(curr->arch.guest_table)),
2477 l4e_from_pfn(
2478 mfn,
2479 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
2480 pagetable_get_pfn(curr->arch.guest_table), 0, 0, curr) == 0;
2481 if ( unlikely(!okay) )
2483 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
2484 return 0;
2487 invalidate_shadow_ldt(curr, 0);
2488 write_ptbase(curr);
2490 return 1;
2492 #endif
2493 okay = paging_mode_refcounts(d)
2494 ? get_page_from_pagenr(mfn, d)
2495 : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
2496 if ( unlikely(!okay) )
2498 MEM_LOG("Error while installing new baseptr %lx", mfn);
2499 return 0;
2502 invalidate_shadow_ldt(curr, 0);
2504 old_base_mfn = pagetable_get_pfn(curr->arch.guest_table);
2506 curr->arch.guest_table = pagetable_from_pfn(mfn);
2507 update_cr3(curr);
2509 write_ptbase(curr);
2511 if ( likely(old_base_mfn != 0) )
2513 if ( paging_mode_refcounts(d) )
2514 put_page(mfn_to_page(old_base_mfn));
2515 else
2516 put_page_and_type(mfn_to_page(old_base_mfn));
2519 return 1;
2522 static struct domain *get_pg_owner(domid_t domid)
2524 struct domain *pg_owner = NULL, *curr = current->domain;
2526 if ( likely(domid == DOMID_SELF) )
2528 pg_owner = rcu_lock_domain(curr);
2529 goto out;
2532 if ( unlikely(domid == curr->domain_id) )
2534 MEM_LOG("Cannot specify itself as foreign domain");
2535 goto out;
2538 if ( unlikely(paging_mode_translate(curr)) )
2540 MEM_LOG("Cannot mix foreign mappings with translated domains");
2541 goto out;
2544 switch ( domid )
2546 case DOMID_IO:
2547 pg_owner = rcu_lock_domain(dom_io);
2548 break;
2549 case DOMID_XEN:
2550 if ( !IS_PRIV(curr) )
2552 MEM_LOG("Cannot set foreign dom");
2553 break;
2555 pg_owner = rcu_lock_domain(dom_xen);
2556 break;
2557 default:
2558 if ( (pg_owner = rcu_lock_domain_by_id(domid)) == NULL )
2560 MEM_LOG("Unknown domain '%u'", domid);
2561 break;
2563 if ( !IS_PRIV_FOR(curr, pg_owner) )
2565 MEM_LOG("Cannot set foreign dom");
2566 rcu_unlock_domain(pg_owner);
2567 pg_owner = NULL;
2569 break;
2572 out:
2573 return pg_owner;
2576 static void put_pg_owner(struct domain *pg_owner)
2578 rcu_unlock_domain(pg_owner);
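/*
 * Minimal usage sketch of the pairing required by the two helpers above:
 * every successful get_pg_owner() must be balanced by put_pg_owner(), as
 * do_mmuext_op() and do_mmu_update() below do.  demo_with_pg_owner() is a
 * hypothetical caller shown only to illustrate that pattern.
 */
static int demo_with_pg_owner(domid_t domid)
{
    struct domain *pg_owner = get_pg_owner(domid);

    if ( pg_owner == NULL )
        return -ESRCH;

    /* ... operate on pages owned by pg_owner ... */

    put_pg_owner(pg_owner);
    return 0;
}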
2581 static inline int vcpumask_to_pcpumask(
2582 struct domain *d, XEN_GUEST_HANDLE(const_void) bmap, cpumask_t *pmask)
2584 unsigned int vcpu_id, vcpu_bias, offs;
2585 unsigned long vmask;
2586 struct vcpu *v;
2587 bool_t is_native = !is_pv_32on64_domain(d);
2589 cpus_clear(*pmask);
2590 for ( vmask = 0, offs = 0; ; ++offs )
2592 vcpu_bias = offs * (is_native ? BITS_PER_LONG : 32);
2593 if ( vcpu_bias >= d->max_vcpus )
2594 return 0;
2596 if ( unlikely(is_native ?
2597 copy_from_guest_offset(&vmask, bmap, offs, 1) :
2598 copy_from_guest_offset((unsigned int *)&vmask, bmap,
2599 offs, 1)) )
2601 cpus_clear(*pmask);
2602 return -EFAULT;
2605 while ( vmask )
2607 vcpu_id = find_first_set_bit(vmask);
2608 vmask &= ~(1UL << vcpu_id);
2609 vcpu_id += vcpu_bias;
2610 if ( (vcpu_id >= d->max_vcpus) )
2611 return 0;
2612 if ( ((v = d->vcpu[vcpu_id]) != NULL) )
2613 cpus_or(*pmask, *pmask, v->vcpu_dirty_cpumask);
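/*
 * Standalone sketch of the bitmap walk in the inner loop of
 * vcpumask_to_pcpumask() above: peel off set bits one at a time via
 * find-first-set.  __builtin_ctzl() stands in for find_first_set_bit(),
 * and the callback is a hypothetical placeholder for the per-vCPU work.
 */
static void demo_for_each_set_bit(unsigned long vmask,
                                  void (*fn)(unsigned int bit))
{
    while ( vmask )
    {
        unsigned int bit = __builtin_ctzl(vmask); /* lowest set bit */
        vmask &= ~(1UL << bit);
        fn(bit);
    }
}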
2618 #ifdef __i386__
2619 static inline void *fixmap_domain_page(unsigned long mfn)
2621 unsigned int cpu = smp_processor_id();
2622 void *ptr = (void *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
2624 l1e_write(fix_pae_highmem_pl1e - cpu,
2625 l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
2626 flush_tlb_one_local(ptr);
2627 return ptr;
2629 static inline void fixunmap_domain_page(const void *ptr)
2631 unsigned int cpu = virt_to_fix((unsigned long)ptr) - FIX_PAE_HIGHMEM_0;
2633 l1e_write(fix_pae_highmem_pl1e - cpu, l1e_empty());
2634 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
2636 #else
2637 #define fixmap_domain_page(mfn) mfn_to_virt(mfn)
2638 #define fixunmap_domain_page(ptr) ((void)(ptr))
2639 #endif
2641 int do_mmuext_op(
2642 XEN_GUEST_HANDLE(mmuext_op_t) uops,
2643 unsigned int count,
2644 XEN_GUEST_HANDLE(uint) pdone,
2645 unsigned int foreigndom)
2647 struct mmuext_op op;
2648 int rc = 0, i = 0, okay;
2649 unsigned long type;
2650 unsigned int done = 0;
2651 struct vcpu *curr = current;
2652 struct domain *d = curr->domain;
2653 struct domain *pg_owner;
2655 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2657 count &= ~MMU_UPDATE_PREEMPTED;
2658 if ( unlikely(!guest_handle_is_null(pdone)) )
2659 (void)copy_from_guest(&done, pdone, 1);
2661 else
2662 perfc_incr(calls_to_mmuext_op);
2664 if ( unlikely(!guest_handle_okay(uops, count)) )
2666 rc = -EFAULT;
2667 goto out;
2670 if ( (pg_owner = get_pg_owner(foreigndom)) == NULL )
2672 rc = -ESRCH;
2673 goto out;
2676 for ( i = 0; i < count; i++ )
2678 if ( hypercall_preempt_check() )
2680 rc = -EAGAIN;
2681 break;
2684 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2686 MEM_LOG("Bad __copy_from_guest");
2687 rc = -EFAULT;
2688 break;
2691 okay = 1;
2693 switch ( op.cmd )
2695 case MMUEXT_PIN_L1_TABLE:
2696 type = PGT_l1_page_table;
2697 goto pin_page;
2699 case MMUEXT_PIN_L2_TABLE:
2700 type = PGT_l2_page_table;
2701 goto pin_page;
2703 case MMUEXT_PIN_L3_TABLE:
2704 type = PGT_l3_page_table;
2705 goto pin_page;
2707 case MMUEXT_PIN_L4_TABLE:
2708 if ( is_pv_32bit_domain(pg_owner) )
2709 break;
2710 type = PGT_l4_page_table;
2712 pin_page: {
2713 unsigned long mfn;
2714 struct page_info *page;
2716 /* Ignore pinning of invalid paging levels. */
2717 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2718 break;
2720 if ( paging_mode_refcounts(pg_owner) )
2721 break;
2723 mfn = gmfn_to_mfn(pg_owner, op.arg1.mfn);
2724 rc = get_page_and_type_from_pagenr(mfn, type, pg_owner, 0, 1);
2725 okay = !rc;
2726 if ( unlikely(!okay) )
2728 if ( rc == -EINTR )
2729 rc = -EAGAIN;
2730 else if ( rc != -EAGAIN )
2731 MEM_LOG("Error while pinning mfn %lx", mfn);
2732 break;
2735 page = mfn_to_page(mfn);
2737 if ( (rc = xsm_memory_pin_page(d, page)) != 0 )
2739 put_page_and_type(page);
2740 okay = 0;
2741 break;
2744 if ( unlikely(test_and_set_bit(_PGT_pinned,
2745 &page->u.inuse.type_info)) )
2747 MEM_LOG("Mfn %lx already pinned", mfn);
2748 put_page_and_type(page);
2749 okay = 0;
2750 break;
2753 /* A page is dirtied when its pin status is set. */
2754 paging_mark_dirty(pg_owner, mfn);
2756 /* We can race domain destruction (domain_relinquish_resources). */
2757 if ( unlikely(pg_owner != d) )
2759 int drop_ref;
2760 spin_lock(&pg_owner->page_alloc_lock);
2761 drop_ref = (pg_owner->is_dying &&
2762 test_and_clear_bit(_PGT_pinned,
2763 &page->u.inuse.type_info));
2764 spin_unlock(&pg_owner->page_alloc_lock);
2765 if ( drop_ref )
2766 put_page_and_type(page);
2769 break;
2772 case MMUEXT_UNPIN_TABLE: {
2773 unsigned long mfn;
2774 struct page_info *page;
2776 if ( paging_mode_refcounts(pg_owner) )
2777 break;
2779 mfn = gmfn_to_mfn(pg_owner, op.arg1.mfn);
2780 if ( unlikely(!(okay = get_page_from_pagenr(mfn, pg_owner))) )
2782 MEM_LOG("Mfn %lx bad domain", mfn);
2783 break;
2786 page = mfn_to_page(mfn);
2788 if ( !test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
2790 okay = 0;
2791 put_page(page);
2792 MEM_LOG("Mfn %lx not pinned", mfn);
2793 break;
2796 put_page_and_type(page);
2797 put_page(page);
2799 /* A page is dirtied when its pin status is cleared. */
2800 paging_mark_dirty(pg_owner, mfn);
2802 break;
2805 case MMUEXT_NEW_BASEPTR:
2806 okay = new_guest_cr3(gmfn_to_mfn(d, op.arg1.mfn));
2807 break;
2809 #ifdef __x86_64__
2810 case MMUEXT_NEW_USER_BASEPTR: {
2811 unsigned long old_mfn, mfn;
2813 mfn = gmfn_to_mfn(d, op.arg1.mfn);
2814 if ( mfn != 0 )
2816 if ( paging_mode_refcounts(d) )
2817 okay = get_page_from_pagenr(mfn, d);
2818 else
2819 okay = !get_page_and_type_from_pagenr(
2820 mfn, PGT_root_page_table, d, 0, 0);
2821 if ( unlikely(!okay) )
2823 MEM_LOG("Error while installing new mfn %lx", mfn);
2824 break;
2828 old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
2829 curr->arch.guest_table_user = pagetable_from_pfn(mfn);
2831 if ( old_mfn != 0 )
2833 if ( paging_mode_refcounts(d) )
2834 put_page(mfn_to_page(old_mfn));
2835 else
2836 put_page_and_type(mfn_to_page(old_mfn));
2839 break;
2841 #endif
2843 case MMUEXT_TLB_FLUSH_LOCAL:
2844 flush_tlb_local();
2845 break;
2847 case MMUEXT_INVLPG_LOCAL:
2848 if ( !paging_mode_enabled(d)
2849 || paging_invlpg(curr, op.arg1.linear_addr) != 0 )
2850 flush_tlb_one_local(op.arg1.linear_addr);
2851 break;
2853 case MMUEXT_TLB_FLUSH_MULTI:
2854 case MMUEXT_INVLPG_MULTI:
2856 cpumask_t pmask;
2858 if ( unlikely(vcpumask_to_pcpumask(d, op.arg2.vcpumask, &pmask)) )
2860 okay = 0;
2861 break;
2863 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2864 flush_tlb_mask(&pmask);
2865 else
2866 flush_tlb_one_mask(&pmask, op.arg1.linear_addr);
2867 break;
2870 case MMUEXT_TLB_FLUSH_ALL:
2871 flush_tlb_mask(&d->domain_dirty_cpumask);
2872 break;
2874 case MMUEXT_INVLPG_ALL:
2875 flush_tlb_one_mask(&d->domain_dirty_cpumask, op.arg1.linear_addr);
2876 break;
2878 case MMUEXT_FLUSH_CACHE:
2879 if ( unlikely(!cache_flush_permitted(d)) )
2881 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2882 okay = 0;
2884 else
2886 wbinvd();
2888 break;
2890 case MMUEXT_SET_LDT:
2892 unsigned long ptr = op.arg1.linear_addr;
2893 unsigned long ents = op.arg2.nr_ents;
2895 if ( paging_mode_external(d) )
2897 MEM_LOG("ignoring SET_LDT hypercall from external domain");
2898 okay = 0;
2900 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2901 (ents > 8192) ||
2902 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2904 okay = 0;
2905 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2907 else if ( (curr->arch.guest_context.ldt_ents != ents) ||
2908 (curr->arch.guest_context.ldt_base != ptr) )
2910 invalidate_shadow_ldt(curr, 0);
2911 flush_tlb_local();
2912 curr->arch.guest_context.ldt_base = ptr;
2913 curr->arch.guest_context.ldt_ents = ents;
2914 load_LDT(curr);
2915 if ( ents != 0 )
2916 (void)map_ldt_shadow_page(0);
2918 break;
2921 case MMUEXT_CLEAR_PAGE: {
2922 unsigned long mfn;
2923 unsigned char *ptr;
2925 mfn = gmfn_to_mfn(d, op.arg1.mfn);
2926 okay = !get_page_and_type_from_pagenr(
2927 mfn, PGT_writable_page, d, 0, 0);
2928 if ( unlikely(!okay) )
2930 MEM_LOG("Error while clearing mfn %lx", mfn);
2931 break;
2934 /* A page is dirtied when it's being cleared. */
2935 paging_mark_dirty(d, mfn);
2937 ptr = fixmap_domain_page(mfn);
2938 clear_page(ptr);
2939 fixunmap_domain_page(ptr);
2941 put_page_and_type(mfn_to_page(mfn));
2942 break;
2945 case MMUEXT_COPY_PAGE:
2947 const unsigned char *src;
2948 unsigned char *dst;
2949 unsigned long src_mfn, mfn;
2951 src_mfn = gmfn_to_mfn(d, op.arg2.src_mfn);
2952 okay = get_page_from_pagenr(src_mfn, d);
2953 if ( unlikely(!okay) )
2955 MEM_LOG("Error while copying from mfn %lx", src_mfn);
2956 break;
2959 mfn = gmfn_to_mfn(d, op.arg1.mfn);
2960 okay = !get_page_and_type_from_pagenr(
2961 mfn, PGT_writable_page, d, 0, 0);
2962 if ( unlikely(!okay) )
2964 put_page(mfn_to_page(src_mfn));
2965 MEM_LOG("Error while copying to mfn %lx", mfn);
2966 break;
2969 /* A page is dirtied when it's being copied to. */
2970 paging_mark_dirty(d, mfn);
2972 src = map_domain_page(src_mfn);
2973 dst = fixmap_domain_page(mfn);
2974 copy_page(dst, src);
2975 fixunmap_domain_page(dst);
2976 unmap_domain_page(src);
2978 put_page_and_type(mfn_to_page(mfn));
2979 put_page(mfn_to_page(src_mfn));
2980 break;
2983 default:
2984 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2985 rc = -ENOSYS;
2986 okay = 0;
2987 break;
2990 if ( unlikely(!okay) )
2992 rc = rc ? rc : -EINVAL;
2993 break;
2996 guest_handle_add_offset(uops, 1);
2999 if ( rc == -EAGAIN )
3000 rc = hypercall_create_continuation(
3001 __HYPERVISOR_mmuext_op, "hihi",
3002 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
3004 put_pg_owner(pg_owner);
3006 perfc_add(num_mmuext_ops, i);
3008 out:
3009 /* Add incremental work we have done to the @done output parameter. */
3010 if ( unlikely(!guest_handle_is_null(pdone)) )
3012 done += i;
3013 copy_to_guest(pdone, &done, 1);
3016 return rc;
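/*
 * Sketch of the continuation encoding used just above: when do_mmuext_op()
 * is preempted, the count of remaining operations is handed back to the
 * hypercall with MMU_UPDATE_PREEMPTED OR'd in; on re-entry the flag is
 * stripped and the running total is re-read from pdone.  The flag value
 * and helpers below are hypothetical stand-ins, for illustration only.
 */
#define DEMO_PREEMPTED_FLAG (1U << 31) /* stand-in for MMU_UPDATE_PREEMPTED */

static unsigned int demo_encode_remaining(unsigned int count, unsigned int done_now)
{
    return (count - done_now) | DEMO_PREEMPTED_FLAG;
}

static unsigned int demo_decode_remaining(unsigned int count, int *was_preempted)
{
    *was_preempted = !!(count & DEMO_PREEMPTED_FLAG);
    return count & ~DEMO_PREEMPTED_FLAG;
}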
3019 int do_mmu_update(
3020 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
3021 unsigned int count,
3022 XEN_GUEST_HANDLE(uint) pdone,
3023 unsigned int foreigndom)
3025 struct mmu_update req;
3026 void *va;
3027 unsigned long gpfn, gmfn, mfn;
3028 struct page_info *page;
3029 int rc = 0, okay = 1, i = 0;
3030 unsigned int cmd, done = 0, pt_dom;
3031 struct domain *d = current->domain, *pt_owner = d, *pg_owner;
3032 struct vcpu *v = current;
3033 struct domain_mmap_cache mapcache;
3035 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
3037 count &= ~MMU_UPDATE_PREEMPTED;
3038 if ( unlikely(!guest_handle_is_null(pdone)) )
3039 (void)copy_from_guest(&done, pdone, 1);
3041 else
3042 perfc_incr(calls_to_mmu_update);
3044 if ( unlikely(!guest_handle_okay(ureqs, count)) )
3046 rc = -EFAULT;
3047 goto out;
3050 if ( (pt_dom = foreigndom >> 16) != 0 )
3052 /* Pagetables belong to a foreign domain (PFD). */
3053 if ( (pt_owner = rcu_lock_domain_by_id(pt_dom - 1)) == NULL )
3055 rc = -EINVAL;
3056 goto out;
3058 if ( pt_owner == d )
3059 rcu_unlock_domain(pt_owner);
3060 if ( (v = pt_owner->vcpu ? pt_owner->vcpu[0] : NULL) == NULL )
3062 rc = -EINVAL;
3063 goto out;
3065 if ( !IS_PRIV_FOR(d, pt_owner) )
3067 rc = -ESRCH;
3068 goto out;
3072 if ( (pg_owner = get_pg_owner((uint16_t)foreigndom)) == NULL )
3074 rc = -ESRCH;
3075 goto out;
3078 domain_mmap_cache_init(&mapcache);
3080 for ( i = 0; i < count; i++ )
3082 if ( hypercall_preempt_check() )
3084 rc = -EAGAIN;
3085 break;
3088 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
3090 MEM_LOG("Bad __copy_from_guest");
3091 rc = -EFAULT;
3092 break;
3095 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
3096 okay = 0;
3098 switch ( cmd )
3100 /*
3101 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
3102 * MMU_PT_UPDATE_PRESERVE_AD: As above but also preserve (OR)
3103 * current A/D bits.
3104 */
3105 case MMU_NORMAL_PT_UPDATE:
3106 case MMU_PT_UPDATE_PRESERVE_AD:
3108 p2m_type_t p2mt;
3110 rc = xsm_mmu_normal_update(d, pg_owner, req.val);
3111 if ( rc )
3112 break;
3114 req.ptr -= cmd;
3115 gmfn = req.ptr >> PAGE_SHIFT;
3116 mfn = mfn_x(gfn_to_mfn(pt_owner, gmfn, &p2mt));
3117 if ( !p2m_is_valid(p2mt) )
3118 mfn = INVALID_MFN;
3120 if ( p2m_is_paged(p2mt) )
3122 p2m_mem_paging_populate(pg_owner, gmfn);
3124 rc = -ENOENT;
3125 break;
3128 if ( unlikely(!get_page_from_pagenr(mfn, pt_owner)) )
3130 MEM_LOG("Could not get page for normal update");
3131 break;
3134 va = map_domain_page_with_cache(mfn, &mapcache);
3135 va = (void *)((unsigned long)va +
3136 (unsigned long)(req.ptr & ~PAGE_MASK));
3137 page = mfn_to_page(mfn);
3139 if ( page_lock(page) )
3141 switch ( page->u.inuse.type_info & PGT_type_mask )
3143 case PGT_l1_page_table:
3145 l1_pgentry_t l1e = l1e_from_intpte(req.val);
3146 p2m_type_t l1e_p2mt;
3147 gfn_to_mfn(pg_owner, l1e_get_pfn(l1e), &l1e_p2mt);
3149 if ( p2m_is_paged(l1e_p2mt) )
3151 p2m_mem_paging_populate(pg_owner, l1e_get_pfn(l1e));
3153 rc = -ENOENT;
3154 break;
3156 else if ( p2m_ram_paging_in_start == l1e_p2mt )
3158 rc = -ENOENT;
3159 break;
3161 /* XXX: Ugly: pull all these checks into a separate function.
3162 * Not doing it now, so as not to interfere with the mem_paging
3163 * patches. */
3164 else if ( p2m_ram_shared == l1e_p2mt )
3166 /* Unshare the page for RW foreign mappings */
3167 if ( l1e_get_flags(l1e) & _PAGE_RW )
3169 rc = mem_sharing_unshare_page(pg_owner,
3170 l1e_get_pfn(l1e),
3171 0);
3172 if ( rc ) break;
3176 okay = mod_l1_entry(va, l1e, mfn,
3177 cmd == MMU_PT_UPDATE_PRESERVE_AD, v,
3178 pg_owner);
3180 break;
3181 case PGT_l2_page_table:
3183 l2_pgentry_t l2e = l2e_from_intpte(req.val);
3184 p2m_type_t l2e_p2mt;
3185 gfn_to_mfn(pg_owner, l2e_get_pfn(l2e), &l2e_p2mt);
3187 if ( p2m_is_paged(l2e_p2mt) )
3189 p2m_mem_paging_populate(pg_owner, l2e_get_pfn(l2e));
3191 rc = -ENOENT;
3192 break;
3194 else if ( p2m_ram_paging_in_start == l2e_p2mt )
3196 rc = -ENOENT;
3197 break;
3199 else if ( p2m_ram_shared == l2e_p2mt )
3201 MEM_LOG("Unexpected attempt to map shared page.");
3202 rc = -EINVAL;
3203 break;
3207 okay = mod_l2_entry(va, l2e, mfn,
3208 cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
3210 break;
3211 case PGT_l3_page_table:
3213 l3_pgentry_t l3e = l3e_from_intpte(req.val);
3214 p2m_type_t l3e_p2mt;
3215 gfn_to_mfn(pg_owner, l3e_get_pfn(l3e), &l3e_p2mt);
3217 if ( p2m_is_paged(l3e_p2mt) )
3219 p2m_mem_paging_populate(pg_owner, l3e_get_pfn(l3e));
3221 rc = -ENOENT;
3222 break;
3224 else if ( p2m_ram_paging_in_start == l3e_p2mt )
3226 rc = -ENOENT;
3227 break;
3229 else if ( p2m_ram_shared == l3e_p2mt )
3231 MEM_LOG("Unexpected attempt to map shared page.");
3232 rc = -EINVAL;
3233 break;
3236 rc = mod_l3_entry(va, l3e, mfn,
3237 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1, v);
3238 okay = !rc;
3240 break;
3241 #if CONFIG_PAGING_LEVELS >= 4
3242 case PGT_l4_page_table:
3244 l4_pgentry_t l4e = l4e_from_intpte(req.val);
3245 p2m_type_t l4e_p2mt;
3246 gfn_to_mfn(pg_owner, l4e_get_pfn(l4e), &l4e_p2mt);
3248 if ( p2m_is_paged(l4e_p2mt) )
3250 p2m_mem_paging_populate(pg_owner, l4e_get_pfn(l4e));
3252 rc = -ENOENT;
3253 break;
3255 else if ( p2m_ram_paging_in_start == l4e_p2mt )
3257 rc = -ENOENT;
3258 break;
3260 else if ( p2m_ram_shared == l4e_p2mt )
3262 MEM_LOG("Unexpected attempt to map shared page.");
3263 rc = -EINVAL;
3264 break;
3267 rc = mod_l4_entry(va, l4e, mfn,
3268 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1, v);
3269 okay = !rc;
3271 break;
3272 #endif
3273 case PGT_writable_page:
3274 perfc_incr(writable_mmu_updates);
3275 okay = paging_write_guest_entry(
3276 v, va, req.val, _mfn(mfn));
3277 break;
3279 page_unlock(page);
3280 if ( rc == -EINTR )
3281 rc = -EAGAIN;
3283 else if ( get_page_type(page, PGT_writable_page) )
3285 perfc_incr(writable_mmu_updates);
3286 okay = paging_write_guest_entry(
3287 v, va, req.val, _mfn(mfn));
3288 put_page_type(page);
3291 unmap_domain_page_with_cache(va, &mapcache);
3292 put_page(page);
3294 break;
3296 case MMU_MACHPHYS_UPDATE:
3298 mfn = req.ptr >> PAGE_SHIFT;
3299 gpfn = req.val;
3301 rc = xsm_mmu_machphys_update(d, mfn);
3302 if ( rc )
3303 break;
3305 if ( unlikely(!get_page_from_pagenr(mfn, pg_owner)) )
3307 MEM_LOG("Could not get page for mach->phys update");
3308 break;
3311 if ( unlikely(paging_mode_translate(pg_owner)) )
3313 MEM_LOG("Mach-phys update on auto-translate guest");
3314 break;
3317 set_gpfn_from_mfn(mfn, gpfn);
3318 okay = 1;
3320 paging_mark_dirty(pg_owner, mfn);
3322 put_page(mfn_to_page(mfn));
3323 break;
3325 default:
3326 MEM_LOG("Invalid page update command %x", cmd);
3327 rc = -ENOSYS;
3328 okay = 0;
3329 break;
3332 if ( unlikely(!okay) )
3334 rc = rc ? rc : -EINVAL;
3335 break;
3338 guest_handle_add_offset(ureqs, 1);
3341 if ( rc == -EAGAIN )
3342 rc = hypercall_create_continuation(
3343 __HYPERVISOR_mmu_update, "hihi",
3344 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
3346 put_pg_owner(pg_owner);
3348 domain_mmap_cache_destroy(&mapcache);
3350 perfc_add(num_page_updates, i);
3352 out:
3353 if ( pt_owner && (pt_owner != d) )
3354 rcu_unlock_domain(pt_owner);
3356 /* Add incremental work we have done to the @done output parameter. */
3357 if ( unlikely(!guest_handle_is_null(pdone)) )
3359 done += i;
3360 copy_to_guest(pdone, &done, 1);
3363 return rc;
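/*
 * Sketch of the request encoding handled above: page-table entries are
 * sizeof(l1_pgentry_t)-aligned, so the low bits of req.ptr are free to
 * carry the sub-command (MMU_NORMAL_PT_UPDATE, MMU_PT_UPDATE_PRESERVE_AD,
 * MMU_MACHPHYS_UPDATE), and do_mmu_update() recovers both pieces with a
 * mask and a subtraction.  The helper below mirrors that arithmetic on a
 * plain uint64_t; entry_size must be a power of two.
 */
#include <stdint.h>

static void demo_split_mmu_req(uint64_t ptr, unsigned int entry_size,
                               unsigned int *cmd, uint64_t *aligned_ptr)
{
    *cmd = (unsigned int)(ptr & (entry_size - 1)); /* command in low bits    */
    *aligned_ptr = ptr - *cmd;                     /* machine address of PTE */
}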
3367 static int create_grant_pte_mapping(
3368 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
3370 int rc = GNTST_okay;
3371 void *va;
3372 unsigned long gmfn, mfn;
3373 struct page_info *page;
3374 l1_pgentry_t ol1e;
3375 struct domain *d = v->domain;
3377 ASSERT(domain_is_locked(d));
3379 adjust_guest_l1e(nl1e, d);
3381 gmfn = pte_addr >> PAGE_SHIFT;
3382 mfn = gmfn_to_mfn(d, gmfn);
3384 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
3386 MEM_LOG("Could not get page for normal update");
3387 return GNTST_general_error;
3390 va = map_domain_page(mfn);
3391 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
3392 page = mfn_to_page(mfn);
3394 if ( !page_lock(page) )
3396 rc = GNTST_general_error;
3397 goto failed;
3400 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3402 page_unlock(page);
3403 rc = GNTST_general_error;
3404 goto failed;
3407 ol1e = *(l1_pgentry_t *)va;
3408 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) )
3410 page_unlock(page);
3411 rc = GNTST_general_error;
3412 goto failed;
3415 page_unlock(page);
3417 if ( !paging_mode_refcounts(d) )
3418 put_page_from_l1e(ol1e, d);
3420 failed:
3421 unmap_domain_page(va);
3422 put_page(page);
3424 return rc;
3427 static int destroy_grant_pte_mapping(
3428 uint64_t addr, unsigned long frame, struct domain *d)
3430 int rc = GNTST_okay;
3431 void *va;
3432 unsigned long gmfn, mfn;
3433 struct page_info *page;
3434 l1_pgentry_t ol1e;
3436 gmfn = addr >> PAGE_SHIFT;
3437 mfn = gmfn_to_mfn(d, gmfn);
3439 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
3441 MEM_LOG("Could not get page for normal update");
3442 return GNTST_general_error;
3445 va = map_domain_page(mfn);
3446 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
3447 page = mfn_to_page(mfn);
3449 if ( !page_lock(page) )
3451 rc = GNTST_general_error;
3452 goto failed;
3455 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3457 page_unlock(page);
3458 rc = GNTST_general_error;
3459 goto failed;
3462 ol1e = *(l1_pgentry_t *)va;
3464 /* Check that the virtual address supplied is actually mapped to frame. */
3465 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
3467 page_unlock(page);
3468 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
3469 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
3470 rc = GNTST_general_error;
3471 goto failed;
3474 /* Delete pagetable entry. */
3475 if ( unlikely(!UPDATE_ENTRY
3476 (l1,
3477 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
3478 d->vcpu[0] /* Change if we go to per-vcpu shadows. */,
3479 0)) )
3481 page_unlock(page);
3482 MEM_LOG("Cannot delete PTE entry at %p", va);
3483 rc = GNTST_general_error;
3484 goto failed;
3487 page_unlock(page);
3489 failed:
3490 unmap_domain_page(va);
3491 put_page(page);
3492 return rc;
3496 static int create_grant_va_mapping(
3497 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
3499 l1_pgentry_t *pl1e, ol1e;
3500 struct domain *d = v->domain;
3501 unsigned long gl1mfn;
3502 struct page_info *l1pg;
3503 int okay;
3505 ASSERT(domain_is_locked(d));
3507 adjust_guest_l1e(nl1e, d);
3509 pl1e = guest_map_l1e(v, va, &gl1mfn);
3510 if ( !pl1e )
3512 MEM_LOG("Could not find L1 PTE for address %lx", va);
3513 return GNTST_general_error;
3516 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3518 guest_unmap_l1e(v, pl1e);
3519 return GNTST_general_error;
3522 l1pg = mfn_to_page(gl1mfn);
3523 if ( !page_lock(l1pg) )
3525 put_page(l1pg);
3526 guest_unmap_l1e(v, pl1e);
3527 return GNTST_general_error;
3530 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3532 page_unlock(l1pg);
3533 put_page(l1pg);
3534 guest_unmap_l1e(v, pl1e);
3535 return GNTST_general_error;
3538 ol1e = *pl1e;
3539 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0);
3541 page_unlock(l1pg);
3542 put_page(l1pg);
3543 guest_unmap_l1e(v, pl1e);
3545 if ( okay && !paging_mode_refcounts(d) )
3546 put_page_from_l1e(ol1e, d);
3548 return okay ? GNTST_okay : GNTST_general_error;
3551 static int replace_grant_va_mapping(
3552 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
3554 l1_pgentry_t *pl1e, ol1e;
3555 unsigned long gl1mfn;
3556 struct page_info *l1pg;
3557 int rc = 0;
3559 pl1e = guest_map_l1e(v, addr, &gl1mfn);
3560 if ( !pl1e )
3562 MEM_LOG("Could not find L1 PTE for address %lx", addr);
3563 return GNTST_general_error;
3566 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3568 rc = GNTST_general_error;
3569 goto out;
3572 l1pg = mfn_to_page(gl1mfn);
3573 if ( !page_lock(l1pg) )
3575 rc = GNTST_general_error;
3576 put_page(l1pg);
3577 goto out;
3580 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3582 rc = GNTST_general_error;
3583 goto unlock_and_out;
3586 ol1e = *pl1e;
3588 /* Check that the virtual address supplied is actually mapped to frame. */
3589 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
3591 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
3592 l1e_get_pfn(ol1e), addr, frame);
3593 rc = GNTST_general_error;
3594 goto unlock_and_out;
3597 /* Delete pagetable entry. */
3598 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0)) )
3600 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3601 rc = GNTST_general_error;
3602 goto unlock_and_out;
3605 unlock_and_out:
3606 page_unlock(l1pg);
3607 put_page(l1pg);
3608 out:
3609 guest_unmap_l1e(v, pl1e);
3610 return rc;
3613 static int destroy_grant_va_mapping(
3614 unsigned long addr, unsigned long frame, struct vcpu *v)
3616 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
3619 static int create_grant_p2m_mapping(uint64_t addr, unsigned long frame,
3620 unsigned int flags,
3621 unsigned int cache_flags)
3623 p2m_type_t p2mt;
3624 int rc;
3626 if ( cache_flags || (flags & ~GNTMAP_readonly) != GNTMAP_host_map )
3627 return GNTST_general_error;
3629 if ( flags & GNTMAP_readonly )
3630 p2mt = p2m_grant_map_ro;
3631 else
3632 p2mt = p2m_grant_map_rw;
3633 rc = guest_physmap_add_entry(current->domain, addr >> PAGE_SHIFT,
3634 frame, 0, p2mt);
3635 if ( rc )
3636 return GNTST_general_error;
3637 else
3638 return GNTST_okay;
3641 int create_grant_host_mapping(uint64_t addr, unsigned long frame,
3642 unsigned int flags, unsigned int cache_flags)
3644 l1_pgentry_t pte;
3646 if ( paging_mode_external(current->domain) )
3647 return create_grant_p2m_mapping(addr, frame, flags, cache_flags);
3649 pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
3650 if ( (flags & GNTMAP_application_map) )
3651 l1e_add_flags(pte,_PAGE_USER);
3652 if ( !(flags & GNTMAP_readonly) )
3653 l1e_add_flags(pte,_PAGE_RW);
3655 l1e_add_flags(pte,
3656 ((flags >> _GNTMAP_guest_avail0) * _PAGE_AVAIL0)
3657 & _PAGE_AVAIL);
3659 l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5));
3661 if ( flags & GNTMAP_contains_pte )
3662 return create_grant_pte_mapping(addr, pte, current);
3663 return create_grant_va_mapping(addr, pte, current);
3666 static int replace_grant_p2m_mapping(
3667 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
3669 unsigned long gfn = (unsigned long)(addr >> PAGE_SHIFT);
3670 p2m_type_t type;
3671 mfn_t old_mfn;
3673 if ( new_addr != 0 || (flags & GNTMAP_contains_pte) )
3674 return GNTST_general_error;
3676 old_mfn = gfn_to_mfn_current(gfn, &type);
3677 if ( !p2m_is_grant(type) || mfn_x(old_mfn) != frame )
3679 gdprintk(XENLOG_WARNING,
3680 "replace_grant_p2m_mapping: old mapping invalid (type %d, mfn %lx, frame %lx)\n",
3681 type, mfn_x(old_mfn), frame);
3682 return GNTST_general_error;
3684 guest_physmap_remove_page(current->domain, gfn, frame, 0);
3686 return GNTST_okay;
3689 int replace_grant_host_mapping(
3690 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
3692 struct vcpu *curr = current;
3693 l1_pgentry_t *pl1e, ol1e;
3694 unsigned long gl1mfn;
3695 struct page_info *l1pg;
3696 int rc;
3698 if ( paging_mode_external(current->domain) )
3699 return replace_grant_p2m_mapping(addr, frame, new_addr, flags);
3701 if ( flags & GNTMAP_contains_pte )
3703 if ( !new_addr )
3704 return destroy_grant_pte_mapping(addr, frame, curr->domain);
3706 MEM_LOG("Unsupported grant table operation");
3707 return GNTST_general_error;
3710 if ( !new_addr )
3711 return destroy_grant_va_mapping(addr, frame, curr);
3713 pl1e = guest_map_l1e(curr, new_addr, &gl1mfn);
3714 if ( !pl1e )
3716 MEM_LOG("Could not find L1 PTE for address %lx",
3717 (unsigned long)new_addr);
3718 return GNTST_general_error;
3721 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3723 guest_unmap_l1e(curr, pl1e);
3724 return GNTST_general_error;
3727 l1pg = mfn_to_page(gl1mfn);
3728 if ( !page_lock(l1pg) )
3730 put_page(l1pg);
3731 guest_unmap_l1e(curr, pl1e);
3732 return GNTST_general_error;
3735 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3737 page_unlock(l1pg);
3738 put_page(l1pg);
3739 guest_unmap_l1e(curr, pl1e);
3740 return GNTST_general_error;
3743 ol1e = *pl1e;
3745 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(),
3746 gl1mfn, curr, 0)) )
3748 page_unlock(l1pg);
3749 put_page(l1pg);
3750 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3751 guest_unmap_l1e(curr, pl1e);
3752 return GNTST_general_error;
3755 page_unlock(l1pg);
3756 put_page(l1pg);
3757 guest_unmap_l1e(curr, pl1e);
3759 rc = replace_grant_va_mapping(addr, frame, ol1e, curr);
3760 if ( rc && !paging_mode_refcounts(curr->domain) )
3761 put_page_from_l1e(ol1e, curr->domain);
3763 return rc;
3766 int donate_page(
3767 struct domain *d, struct page_info *page, unsigned int memflags)
3769 spin_lock(&d->page_alloc_lock);
3771 if ( is_xen_heap_page(page) || (page_get_owner(page) != NULL) )
3772 goto fail;
3774 if ( d->is_dying )
3775 goto fail;
3777 if ( page->count_info & ~(PGC_allocated | 1) )
3778 goto fail;
3780 if ( !(memflags & MEMF_no_refcount) )
3782 if ( d->tot_pages >= d->max_pages )
3783 goto fail;
3784 d->tot_pages++;
3787 page->count_info = PGC_allocated | 1;
3788 page_set_owner(page, d);
3789 page_list_add_tail(page,&d->page_list);
3791 spin_unlock(&d->page_alloc_lock);
3792 return 0;
3794 fail:
3795 spin_unlock(&d->page_alloc_lock);
3796 MEM_LOG("Bad donate %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info,
3797 (void *)page_to_mfn(page), d, d->domain_id,
3798 page_get_owner(page), page->count_info, page->u.inuse.type_info);
3799 return -1;
3802 int steal_page(
3803 struct domain *d, struct page_info *page, unsigned int memflags)
3805 unsigned long x, y;
3806 bool_t drop_dom_ref = 0;
3808 spin_lock(&d->page_alloc_lock);
3810 if ( is_xen_heap_page(page) || (page_get_owner(page) != d) )
3811 goto fail;
3813 /*
3814 * We require there is just one reference (PGC_allocated). We temporarily
3815 * drop this reference now so that we can safely swizzle the owner.
3816 */
3817 y = page->count_info;
3818 do {
3819 x = y;
3820 if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
3821 goto fail;
3822 y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask);
3823 } while ( y != x );
3825 /* Swizzle the owner then reinstate the PGC_allocated reference. */
3826 page_set_owner(page, NULL);
3827 y = page->count_info;
3828 do {
3829 x = y;
3830 BUG_ON((x & (PGC_count_mask|PGC_allocated)) != PGC_allocated);
3831 } while ( (y = cmpxchg(&page->count_info, x, x | 1)) != x );
3833 /* Unlink from original owner. */
3834 if ( !(memflags & MEMF_no_refcount) && !--d->tot_pages )
3835 drop_dom_ref = 1;
3836 page_list_del(page, &d->page_list);
3838 spin_unlock(&d->page_alloc_lock);
3839 if ( unlikely(drop_dom_ref) )
3840 put_domain(d);
3841 return 0;
3843 fail:
3844 spin_unlock(&d->page_alloc_lock);
3845 MEM_LOG("Bad page %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info,
3846 (void *)page_to_mfn(page), d, d->domain_id,
3847 page_get_owner(page), page->count_info, page->u.inuse.type_info);
3848 return -1;
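/*
 * Standalone sketch of the two-step count_info dance in steal_page()
 * above: first cmpxchg the count from exactly (PGC_allocated | 1) down to
 * zero so that no new references can be taken, swizzle the owner, then
 * reinstate the single reference.  The DEMO_* constants are hypothetical
 * stand-ins for PGC_allocated and PGC_count_mask.
 */
#include <stdbool.h>

#define DEMO_PGC_ALLOCATED  (1UL << 31)
#define DEMO_PGC_COUNT_MASK 0x7fffffffUL

static bool demo_steal_count(unsigned long *count_info)
{
    unsigned long x, y = *count_info;

    /* Step 1: drop the single reference (count 1 -> 0). */
    do {
        x = y;
        if ( (x & (DEMO_PGC_COUNT_MASK | DEMO_PGC_ALLOCATED)) !=
             (1 | DEMO_PGC_ALLOCATED) )
            return false;
        y = __sync_val_compare_and_swap(count_info, x,
                                        x & ~DEMO_PGC_COUNT_MASK);
    } while ( y != x );

    /* ... the real code swizzles page_set_owner() at this point ... */

    /* Step 2: reinstate the reference (count 0 -> 1). */
    y = *count_info;
    do {
        x = y;
    } while ( (y = __sync_val_compare_and_swap(count_info, x, x | 1)) != x );

    return true;
}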
3851 int page_make_sharable(struct domain *d,
3852 struct page_info *page,
3853 int expected_refcnt)
3855 unsigned long x, nx, y;
3857 /* Acquire a ref first, so that the page doesn't disappear from under us. */
3858 if ( !get_page(page, d) )
3859 return -EINVAL;
3861 spin_lock(&d->page_alloc_lock);
3863 /* Change page type and count atomically */
3864 y = page->u.inuse.type_info;
3865 nx = PGT_shared_page | PGT_validated | 1;
3866 do {
3867 x = y;
3868 /* We can only change the type if the count is zero and the
3869 type is PGT_none. */
3870 if ( (x & (PGT_type_mask | PGT_count_mask)) != PGT_none )
3872 put_page(page);
3873 spin_unlock(&d->page_alloc_lock);
3874 return -EEXIST;
3876 y = cmpxchg(&page->u.inuse.type_info, x, nx);
3877 } while ( x != y );
3879 /* Check that the ref count is 2 + expected_refcnt: one from PGC_allocated,
3880 * and one from the get_page() at the top of this function. */
3881 if ( page->count_info != (PGC_allocated | (2 + expected_refcnt)) )
3883 /* Return type count back to zero */
3884 put_page_and_type(page);
3885 spin_unlock(&d->page_alloc_lock);
3886 return -E2BIG;
3889 page_set_owner(page, dom_cow);
3890 d->tot_pages--;
3891 page_list_del(page, &d->page_list);
3892 spin_unlock(&d->page_alloc_lock);
3894 /* NOTE: We are not putting the page back. In effect this function
3895 * acquires one general ref and one type ref on behalf of the caller. */
3897 return 0;
3900 int page_make_private(struct domain *d, struct page_info *page)
3902 unsigned long x, y;
3904 if ( !get_page(page, dom_cow) )
3905 return -EINVAL;
3907 spin_lock(&d->page_alloc_lock);
3909 /* Change page type and count atomically */
3910 y = page->u.inuse.type_info;
3911 do {
3912 x = y;
3913 /* We can only change the type if it is PGT_shared_page with a type count of exactly one. */
3914 if ( (x & (PGT_type_mask | PGT_count_mask)) !=
3915 (PGT_shared_page | 1) )
3917 put_page(page);
3918 spin_unlock(&d->page_alloc_lock);
3919 return -EEXIST;
3921 y = cmpxchg(&page->u.inuse.type_info, x, PGT_none);
3922 } while ( x != y );
3924 /* We dropped the type ref above; drop one general ref too. */
3925 put_page(page);
3927 /* Change the owner */
3928 ASSERT(page_get_owner(page) == dom_cow);
3929 page_set_owner(page, d);
3931 d->tot_pages++;
3932 page_list_add_tail(page, &d->page_list);
3933 spin_unlock(&d->page_alloc_lock);
3935 put_page(page);
3937 return 0;
3940 static int __do_update_va_mapping(
3941 unsigned long va, u64 val64, unsigned long flags, struct domain *pg_owner)
3943 l1_pgentry_t val = l1e_from_intpte(val64);
3944 struct vcpu *v = current;
3945 struct domain *d = v->domain;
3946 struct page_info *gl1pg;
3947 l1_pgentry_t *pl1e;
3948 unsigned long bmap_ptr, gl1mfn;
3949 cpumask_t pmask;
3950 int rc;
3952 perfc_incr(calls_to_update_va);
3954 rc = xsm_update_va_mapping(d, pg_owner, val);
3955 if ( rc )
3956 return rc;
3958 rc = -EINVAL;
3959 pl1e = guest_map_l1e(v, va, &gl1mfn);
3960 if ( unlikely(!pl1e || !get_page_from_pagenr(gl1mfn, d)) )
3961 goto out;
3963 gl1pg = mfn_to_page(gl1mfn);
3964 if ( !page_lock(gl1pg) )
3966 put_page(gl1pg);
3967 goto out;
3970 if ( (gl1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3972 page_unlock(gl1pg);
3973 put_page(gl1pg);
3974 goto out;
3977 rc = mod_l1_entry(pl1e, val, gl1mfn, 0, v, pg_owner) ? 0 : -EINVAL;
3979 page_unlock(gl1pg);
3980 put_page(gl1pg);
3982 out:
3983 if ( pl1e )
3984 guest_unmap_l1e(v, pl1e);
3986 switch ( flags & UVMF_FLUSHTYPE_MASK )
3988 case UVMF_TLB_FLUSH:
3989 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3991 case UVMF_LOCAL:
3992 flush_tlb_local();
3993 break;
3994 case UVMF_ALL:
3995 flush_tlb_mask(&d->domain_dirty_cpumask);
3996 break;
3997 default:
3998 rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
3999 void),
4000 &pmask);
4001 flush_tlb_mask(&pmask);
4002 break;
4004 break;
4006 case UVMF_INVLPG:
4007 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
4009 case UVMF_LOCAL:
4010 if ( !paging_mode_enabled(d) ||
4011 (paging_invlpg(v, va) != 0) )
4012 flush_tlb_one_local(va);
4013 break;
4014 case UVMF_ALL:
4015 flush_tlb_one_mask(&d->domain_dirty_cpumask, va);
4016 break;
4017 default:
4018 rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
4019 void),
4020 &pmask);
4021 flush_tlb_one_mask(&pmask, va);
4022 break;
4024 break;
4027 return rc;
4030 int do_update_va_mapping(unsigned long va, u64 val64,
4031 unsigned long flags)
4033 return __do_update_va_mapping(va, val64, flags, current->domain);
4036 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
4037 unsigned long flags,
4038 domid_t domid)
4040 struct domain *pg_owner;
4041 int rc;
4043 if ( (pg_owner = get_pg_owner(domid)) == NULL )
4044 return -ESRCH;
4046 rc = __do_update_va_mapping(va, val64, flags, pg_owner);
4048 put_pg_owner(pg_owner);
4050 return rc;
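/*
 * Sketch of the flags word decoded by __do_update_va_mapping() above: the
 * low bits select the flush type (UVMF_TLB_FLUSH/UVMF_INVLPG) while the
 * remaining bits either name the scope (UVMF_LOCAL/UVMF_ALL) or are
 * reinterpreted as a guest pointer to a vCPU bitmap (bmap_ptr).  The mask
 * value below is a hypothetical stand-in for UVMF_FLUSHTYPE_MASK, used
 * only to show the split.
 */
#define DEMO_FLUSHTYPE_MASK 0x3UL /* stand-in for UVMF_FLUSHTYPE_MASK */

static void demo_split_uvmf_flags(unsigned long flags,
                                  unsigned long *flush_type,
                                  unsigned long *bmap_ptr)
{
    *flush_type = flags & DEMO_FLUSHTYPE_MASK;  /* none / TLB flush / INVLPG */
    *bmap_ptr   = flags & ~DEMO_FLUSHTYPE_MASK; /* LOCAL, ALL, or guest ptr  */
}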
4055 /*************************
4056 * Descriptor Tables
4057 */
4059 void destroy_gdt(struct vcpu *v)
4061 int i;
4062 unsigned long pfn;
4064 v->arch.guest_context.gdt_ents = 0;
4065 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
4067 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
4068 put_page_and_type(mfn_to_page(pfn));
4069 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
4070 v->arch.guest_context.gdt_frames[i] = 0;
4075 long set_gdt(struct vcpu *v,
4076 unsigned long *frames,
4077 unsigned int entries)
4079 struct domain *d = v->domain;
4080 /* NB. There are 512 8-byte entries per GDT page. */
4081 int i, nr_pages = (entries + 511) / 512;
4082 unsigned long mfn;
4084 if ( entries > FIRST_RESERVED_GDT_ENTRY )
4085 return -EINVAL;
4087 /* Check the pages in the new GDT. */
4088 for ( i = 0; i < nr_pages; i++ )
4090 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
4091 if ( !mfn_valid(mfn) ||
4092 !get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page) )
4093 goto fail;
4096 /* Tear down the old GDT. */
4097 destroy_gdt(v);
4099 /* Install the new GDT. */
4100 v->arch.guest_context.gdt_ents = entries;
4101 for ( i = 0; i < nr_pages; i++ )
4103 v->arch.guest_context.gdt_frames[i] = frames[i];
4104 l1e_write(&v->arch.perdomain_ptes[i],
4105 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
4108 return 0;
4110 fail:
4111 while ( i-- > 0 )
4112 put_page_and_type(mfn_to_page(frames[i]));
4113 return -EINVAL;
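/*
 * Worked sketch of the sizing arithmetic used by set_gdt() above and
 * do_set_gdt() below: 8-byte descriptors mean a 4kB frame holds 512
 * entries, so the page count is a round-up division by 512.
 */
static unsigned int demo_gdt_nr_pages(unsigned int entries)
{
    return (entries + 511) / 512; /* e.g. 1 -> 1 page, 513 -> 2 pages */
}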
4117 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
4119 int nr_pages = (entries + 511) / 512;
4120 unsigned long frames[16];
4121 struct vcpu *curr = current;
4122 long ret;
4124 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
4125 if ( entries > FIRST_RESERVED_GDT_ENTRY )
4126 return -EINVAL;
4128 if ( copy_from_guest(frames, frame_list, nr_pages) )
4129 return -EFAULT;
4131 domain_lock(curr->domain);
4133 if ( (ret = set_gdt(curr, frames, entries)) == 0 )
4134 flush_tlb_local();
4136 domain_unlock(curr->domain);
4138 return ret;
4142 long do_update_descriptor(u64 pa, u64 desc)
4144 struct domain *dom = current->domain;
4145 unsigned long gmfn = pa >> PAGE_SHIFT;
4146 unsigned long mfn;
4147 unsigned int offset;
4148 struct desc_struct *gdt_pent, d;
4149 struct page_info *page;
4150 long ret = -EINVAL;
4152 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
4154 *(u64 *)&d = desc;
4156 mfn = gmfn_to_mfn(dom, gmfn);
4157 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
4158 !mfn_valid(mfn) ||
4159 !check_descriptor(dom, &d) )
4160 return -EINVAL;
4162 page = mfn_to_page(mfn);
4163 if ( unlikely(!get_page(page, dom)) )
4164 return -EINVAL;
4166 /* Check if the given frame is in use in an unsafe context. */
4167 switch ( page->u.inuse.type_info & PGT_type_mask )
4169 case PGT_seg_desc_page:
4170 if ( unlikely(!get_page_type(page, PGT_seg_desc_page)) )
4171 goto out;
4172 break;
4173 default:
4174 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
4175 goto out;
4176 break;
4179 paging_mark_dirty(dom, mfn);
4181 /* All is good so make the update. */
4182 gdt_pent = map_domain_page(mfn);
4183 atomic_write64((uint64_t *)&gdt_pent[offset], *(uint64_t *)&d);
4184 unmap_domain_page(gdt_pent);
4186 put_page_type(page);
4188 ret = 0; /* success */
4190 out:
4191 put_page(page);
4193 return ret;
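/*
 * Sketch of the address checks performed by do_update_descriptor() above:
 * the guest-supplied physical address must be aligned to the 8-byte
 * descriptor size, and the slot within the frame is just the page offset
 * divided by that size.  DEMO_PAGE_SIZE and DEMO_DESC_SIZE are assumed
 * values standing in for PAGE_SIZE and sizeof(struct desc_struct).
 */
#include <stdbool.h>
#include <stdint.h>

#define DEMO_PAGE_SIZE 4096u
#define DEMO_DESC_SIZE 8u

static bool demo_desc_slot(uint64_t pa, unsigned int *offset)
{
    if ( (pa % DEMO_DESC_SIZE) != 0 ) /* must be descriptor-aligned */
        return false;
    *offset = (unsigned int)(pa & (DEMO_PAGE_SIZE - 1)) / DEMO_DESC_SIZE;
    return true;
}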
4196 typedef struct e820entry e820entry_t;
4197 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
4199 struct memory_map_context
4201 unsigned int n;
4202 unsigned long s;
4203 struct xen_memory_map map;
4204 };
4206 static int handle_iomem_range(unsigned long s, unsigned long e, void *p)
4208 struct memory_map_context *ctxt = p;
4210 if ( s > ctxt->s )
4212 e820entry_t ent;
4213 XEN_GUEST_HANDLE(e820entry_t) buffer;
4215 if ( ctxt->n + 1 >= ctxt->map.nr_entries )
4216 return -EINVAL;
4217 ent.addr = (uint64_t)ctxt->s << PAGE_SHIFT;
4218 ent.size = (uint64_t)(s - ctxt->s) << PAGE_SHIFT;
4219 ent.type = E820_RESERVED;
4220 buffer = guest_handle_cast(ctxt->map.buffer, e820entry_t);
4221 if ( __copy_to_guest_offset(buffer, ctxt->n, &ent, 1) < 0 )
4222 return -EFAULT;
4223 ctxt->n++;
4225 ctxt->s = e + 1;
4227 return 0;
4230 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
4232 struct page_info *page = NULL;
4233 int rc;
4235 switch ( op )
4237 case XENMEM_add_to_physmap:
4239 struct xen_add_to_physmap xatp;
4240 unsigned long prev_mfn, mfn = 0, gpfn;
4241 struct domain *d;
4243 if ( copy_from_guest(&xatp, arg, 1) )
4244 return -EFAULT;
4246 rc = rcu_lock_target_domain_by_id(xatp.domid, &d);
4247 if ( rc != 0 )
4248 return rc;
4250 if ( xsm_add_to_physmap(current->domain, d) )
4252 rcu_unlock_domain(d);
4253 return -EPERM;
4256 switch ( xatp.space )
4258 case XENMAPSPACE_shared_info:
4259 if ( xatp.idx == 0 )
4260 mfn = virt_to_mfn(d->shared_info);
4261 break;
4262 case XENMAPSPACE_grant_table:
4263 spin_lock(&d->grant_table->lock);
4265 if ( d->grant_table->gt_version == 0 )
4266 d->grant_table->gt_version = 1;
4268 if ( d->grant_table->gt_version == 2 &&
4269 (xatp.idx & XENMAPIDX_grant_table_status) )
4271 xatp.idx &= ~XENMAPIDX_grant_table_status;
4272 if ( xatp.idx < nr_status_frames(d->grant_table) )
4273 mfn = virt_to_mfn(d->grant_table->status[xatp.idx]);
4275 else
4277 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
4278 (xatp.idx < max_nr_grant_frames) )
4279 gnttab_grow_table(d, xatp.idx + 1);
4281 if ( xatp.idx < nr_grant_frames(d->grant_table) )
4282 mfn = virt_to_mfn(d->grant_table->shared_raw[xatp.idx]);
4285 spin_unlock(&d->grant_table->lock);
4286 break;
4287 case XENMAPSPACE_gmfn:
4289 p2m_type_t p2mt;
4291 xatp.idx = mfn_x(gfn_to_mfn_unshare(d, xatp.idx, &p2mt, 0));
4292 /* If the page is still shared, exit early */
4293 if ( p2m_is_shared(p2mt) )
4295 rcu_unlock_domain(d);
4296 return -ENOMEM;
4298 if ( !get_page_from_pagenr(xatp.idx, d) )
4299 break;
4300 mfn = xatp.idx;
4301 page = mfn_to_page(mfn);
4302 break;
4304 default:
4305 break;
4308 if ( !paging_mode_translate(d) || (mfn == 0) )
4310 if ( page )
4311 put_page(page);
4312 rcu_unlock_domain(d);
4313 return -EINVAL;
4316 domain_lock(d);
4318 if ( page )
4319 put_page(page);
4321 /* Remove previously mapped page if it was present. */
4322 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
4323 if ( mfn_valid(prev_mfn) )
4325 if ( is_xen_heap_mfn(prev_mfn) )
4326 /* Xen heap frames are simply unhooked from this phys slot. */
4327 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
4328 else
4329 /* Normal domain memory is freed, to avoid leaking memory. */
4330 guest_remove_page(d, xatp.gpfn);
4333 /* Unmap from old location, if any. */
4334 gpfn = get_gpfn_from_mfn(mfn);
4335 ASSERT( gpfn != SHARED_M2P_ENTRY );
4336 if ( gpfn != INVALID_M2P_ENTRY )
4337 guest_physmap_remove_page(d, gpfn, mfn, 0);
4339 /* Map at new location. */
4340 rc = guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
4342 domain_unlock(d);
4344 rcu_unlock_domain(d);
4346 return rc;
4349 case XENMEM_set_memory_map:
4351 struct xen_foreign_memory_map fmap;
4352 struct domain *d;
4354 if ( copy_from_guest(&fmap, arg, 1) )
4355 return -EFAULT;
4357 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
4358 return -EINVAL;
4360 rc = rcu_lock_target_domain_by_id(fmap.domid, &d);
4361 if ( rc != 0 )
4362 return rc;
4364 rc = xsm_domain_memory_map(d);
4365 if ( rc )
4367 rcu_unlock_domain(d);
4368 return rc;
4371 rc = copy_from_guest(d->arch.e820, fmap.map.buffer,
4372 fmap.map.nr_entries) ? -EFAULT : 0;
4373 d->arch.nr_e820 = fmap.map.nr_entries;
4375 rcu_unlock_domain(d);
4376 return rc;
4379 case XENMEM_memory_map:
4381 struct xen_memory_map map;
4382 struct domain *d = current->domain;
4384 /* Backwards compatibility. */
4385 if ( d->arch.nr_e820 == 0 )
4386 return -ENOSYS;
4388 if ( copy_from_guest(&map, arg, 1) )
4389 return -EFAULT;
4391 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
4392 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
4393 copy_to_guest(arg, &map, 1) )
4394 return -EFAULT;
4396 return 0;
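/*
 * XENMEM_machine_memory_map reports the host e820 to a privileged caller,
 * with the gaps between RAM entries filled in from the I/O-memory ranges
 * the caller is permitted to access (current->domain->iomem_caps).
 */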
4399 case XENMEM_machine_memory_map:
4401 struct memory_map_context ctxt;
4402 XEN_GUEST_HANDLE(e820entry_t) buffer;
4403 unsigned int i;
4405 if ( !IS_PRIV(current->domain) )
4406 return -EINVAL;
4408 rc = xsm_machine_memory_map();
4409 if ( rc )
4410 return rc;
4412 if ( copy_from_guest(&ctxt.map, arg, 1) )
4413 return -EFAULT;
4414 if ( ctxt.map.nr_entries < e820.nr_map + 1 )
4415 return -EINVAL;
4417 buffer = guest_handle_cast(ctxt.map.buffer, e820entry_t);
4418 if ( !guest_handle_okay(buffer, ctxt.map.nr_entries) )
4419 return -EFAULT;
4421 for ( i = 0, ctxt.n = 0, ctxt.s = 0; i < e820.nr_map; ++i, ++ctxt.n )
4423 unsigned long s = PFN_DOWN(e820.map[i].addr);
4425 if ( s )
4427 rc = rangeset_report_ranges(current->domain->iomem_caps,
4428 ctxt.s, s - 1,
4429 handle_iomem_range, &ctxt);
4430 if ( !rc )
4431 rc = handle_iomem_range(s, s, &ctxt);
4432 if ( rc )
4433 return rc;
4435 if ( ctxt.map.nr_entries <= ctxt.n + (e820.nr_map - i) )
4436 return -EINVAL;
4437 if ( __copy_to_guest_offset(buffer, ctxt.n, e820.map + i, 1) < 0 )
4438 return -EFAULT;
4439 ctxt.s = PFN_UP(e820.map[i].addr + e820.map[i].size);
4442 if ( ctxt.s )
4444 rc = rangeset_report_ranges(current->domain->iomem_caps, ctxt.s,
4445 ~0UL, handle_iomem_range, &ctxt);
4446 if ( !rc && ctxt.s )
4447 rc = handle_iomem_range(~0UL, ~0UL, &ctxt);
4448 if ( rc )
4449 return rc;
4452 ctxt.map.nr_entries = ctxt.n;
4454 if ( copy_to_guest(arg, &ctxt.map, 1) )
4455 return -EFAULT;
4457 return 0;
4460 case XENMEM_machphys_mapping:
4462 struct xen_machphys_mapping mapping = {
4463 .v_start = MACH2PHYS_VIRT_START,
4464 .v_end = MACH2PHYS_VIRT_END,
4465 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
4466 };
4468 if ( !mem_hotplug )
4469 mapping.max_mfn = max_page - 1;
4470 if ( copy_to_guest(arg, &mapping, 1) )
4471 return -EFAULT;
4473 return 0;
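/*
 * XENMEM_{set,get}_pod_target adjusts or queries the populate-on-demand
 * state of the target domain: a new target is validated against
 * d->max_pages and applied via p2m_pod_set_mem_target(), and the current
 * PoD cache/entry counts are copied back to the caller.
 */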
4476 case XENMEM_set_pod_target:
4477 case XENMEM_get_pod_target:
4479 xen_pod_target_t target;
4480 struct domain *d;
4482 /* Support DOMID_SELF? */
4483 if ( !IS_PRIV(current->domain) )
4484 return -EINVAL;
4486 if ( copy_from_guest(&target, arg, 1) )
4487 return -EFAULT;
4489 rc = rcu_lock_target_domain_by_id(target.domid, &d);
4490 if ( rc != 0 )
4491 return rc;
4493 if ( op == XENMEM_set_pod_target )
4495 if ( target.target_pages > d->max_pages )
4497 rc = -EINVAL;
4498 goto pod_target_out_unlock;
4501 rc = p2m_pod_set_mem_target(d, target.target_pages);
4504 target.tot_pages = d->tot_pages;
4505 target.pod_cache_pages = d->arch.p2m->pod.count;
4506 target.pod_entries = d->arch.p2m->pod.entry_count;
4508 if ( copy_to_guest(arg, &target, 1) )
4510 rc = -EFAULT;
4511 goto pod_target_out_unlock;
4514 pod_target_out_unlock:
4515 rcu_unlock_domain(d);
4516 return rc;
4519 case XENMEM_get_sharing_freed_pages:
4520 return mem_sharing_get_nr_saved_mfns();
4522 default:
4523 return subarch_memory_op(op, arg);
4526 return 0;
4530 /*************************
4531 * Writable Pagetables
4532 */
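/*
 * PV guests retain linear mappings of their own page tables, but pages of
 * page-table type are only ever mapped read-only.  A guest write to such a
 * mapping therefore faults into ptwr_do_page_fault(), which emulates the
 * single faulting instruction via x86_emulate() and validates the new PTE
 * with get_page_from_l1e() before committing it.
 */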
4534 struct ptwr_emulate_ctxt {
4535 struct x86_emulate_ctxt ctxt;
4536 unsigned long cr2;
4537 l1_pgentry_t pte;
4538 };
4540 static int ptwr_emulated_read(
4541 enum x86_segment seg,
4542 unsigned long offset,
4543 void *p_data,
4544 unsigned int bytes,
4545 struct x86_emulate_ctxt *ctxt)
4547 unsigned int rc;
4548 unsigned long addr = offset;
4550 if ( (rc = copy_from_user(p_data, (void *)addr, bytes)) != 0 )
4552 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
4553 return X86EMUL_EXCEPTION;
4556 return X86EMUL_OKAY;
4559 static int ptwr_emulated_update(
4560 unsigned long addr,
4561 paddr_t old,
4562 paddr_t val,
4563 unsigned int bytes,
4564 unsigned int do_cmpxchg,
4565 struct ptwr_emulate_ctxt *ptwr_ctxt)
4567 unsigned long mfn;
4568 unsigned long unaligned_addr = addr;
4569 struct page_info *page;
4570 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
4571 struct vcpu *v = current;
4572 struct domain *d = v->domain;
4574 /* Only allow naturally-aligned stores within the original %cr2 page. */
4575 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
4577 MEM_LOG("ptwr_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)",
4578 ptwr_ctxt->cr2, addr, bytes);
4579 return X86EMUL_UNHANDLEABLE;
4582 /* Turn a sub-word access into a full-word access. */
4583 if ( bytes != sizeof(paddr_t) )
4585 paddr_t full;
4586 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
4588 /* Align address; read full word. */
4589 addr &= ~(sizeof(paddr_t)-1);
4590 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
4592 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
4593 return X86EMUL_EXCEPTION;
4595 /* Mask out bits provided by caller. */
4596 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
4597 /* Shift the caller value and OR in the missing bits. */
4598 val &= (((paddr_t)1 << (bytes*8)) - 1);
4599 val <<= (offset)*8;
4600 val |= full;
4601 /* Also fill in missing parts of the cmpxchg old value. */
4602 old &= (((paddr_t)1 << (bytes*8)) - 1);
4603 old <<= (offset)*8;
4604 old |= full;
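/*
 * Illustrative case, assuming a 64-bit paddr_t: a 4-byte write at byte
 * offset 4 of a PTE is widened by reading the whole 8-byte entry, masking
 * out bits 32-63, and OR-ing in the caller's value shifted left by 32, so
 * the update below always operates on a full, naturally aligned word.
 */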
4607 pte = ptwr_ctxt->pte;
4608 mfn = l1e_get_pfn(pte);
4609 page = mfn_to_page(mfn);
4611 /* We are looking only for read-only mappings of p.t. pages. */
4612 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
4613 ASSERT(mfn_valid(mfn));
4614 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
4615 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
4616 ASSERT(page_get_owner(page) == d);
4618 /* Check the new PTE. */
4619 nl1e = l1e_from_intpte(val);
4620 if ( unlikely(!get_page_from_l1e(nl1e, d, d)) )
4622 if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
4623 !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
4625 /*
4626 * If this is an upper-half write to a PAE PTE then we assume that
4627 * the guest has simply got the two writes the wrong way round. We
4628 * zap the PRESENT bit on the assumption that the bottom half will
4629 * be written immediately after we return to the guest.
4630 */
4631 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
4632 l1e_get_intpte(nl1e));
4633 l1e_remove_flags(nl1e, _PAGE_PRESENT);
4635 else
4637 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
4638 return X86EMUL_UNHANDLEABLE;
4642 adjust_guest_l1e(nl1e, d);
4644 /* Checked successfully: do the update (write or cmpxchg). */
4645 pl1e = map_domain_page(mfn);
4646 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
4647 if ( do_cmpxchg )
4649 int okay;
4650 intpte_t t = old;
4651 ol1e = l1e_from_intpte(old);
4653 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
4654 &t, l1e_get_intpte(nl1e), _mfn(mfn));
4655 okay = (okay && t == old);
4657 if ( !okay )
4659 unmap_domain_page(pl1e);
4660 put_page_from_l1e(nl1e, d);
4661 return X86EMUL_CMPXCHG_FAILED;
4664 else
4666 ol1e = *pl1e;
4667 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, 0) )
4668 BUG();
4671 trace_ptwr_emulation(addr, nl1e);
4673 unmap_domain_page(pl1e);
4675 /* Finally, drop the old PTE. */
4676 put_page_from_l1e(ol1e, d);
4678 return X86EMUL_OKAY;
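/*
 * The write and cmpxchg callbacks below simply pack the emulator-supplied
 * byte buffers into paddr_t values (rejecting oversized or
 * non-power-of-two accesses) and forward to ptwr_emulated_update().
 */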
4681 static int ptwr_emulated_write(
4682 enum x86_segment seg,
4683 unsigned long offset,
4684 void *p_data,
4685 unsigned int bytes,
4686 struct x86_emulate_ctxt *ctxt)
4688 paddr_t val = 0;
4690 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes - 1)) )
4692 MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)",
4693 offset, bytes);
4694 return X86EMUL_UNHANDLEABLE;
4697 memcpy(&val, p_data, bytes);
4699 return ptwr_emulated_update(
4700 offset, 0, val, bytes, 0,
4701 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
4704 static int ptwr_emulated_cmpxchg(
4705 enum x86_segment seg,
4706 unsigned long offset,
4707 void *p_old,
4708 void *p_new,
4709 unsigned int bytes,
4710 struct x86_emulate_ctxt *ctxt)
4712 paddr_t old = 0, new = 0;
4714 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes - 1)) )
4716 MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)",
4717 offset, bytes);
4718 return X86EMUL_UNHANDLEABLE;
4721 memcpy(&old, p_old, bytes);
4722 memcpy(&new, p_new, bytes);
4724 return ptwr_emulated_update(
4725 offset, old, new, bytes, 1,
4726 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
4729 static const struct x86_emulate_ops ptwr_emulate_ops = {
4730 .read = ptwr_emulated_read,
4731 .insn_fetch = ptwr_emulated_read,
4732 .write = ptwr_emulated_write,
4733 .cmpxchg = ptwr_emulated_cmpxchg,
4734 };
4736 /* Write page fault handler: check if guest is trying to modify a PTE. */
4737 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
4738 struct cpu_user_regs *regs)
4740 struct domain *d = v->domain;
4741 struct page_info *page;
4742 l1_pgentry_t pte;
4743 struct ptwr_emulate_ctxt ptwr_ctxt;
4744 int rc;
4746 /* Attempt to read the PTE that maps the VA being accessed. */
4747 guest_get_eff_l1e(v, addr, &pte);
4749 /* We are looking only for read-only mappings of p.t. pages. */
4750 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
4751 !get_page_from_pagenr(l1e_get_pfn(pte), d) )
4752 goto bail;
4754 page = l1e_get_page(pte);
4755 if ( !page_lock(page) )
4757 put_page(page);
4758 goto bail;
4761 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
4763 page_unlock(page);
4764 put_page(page);
4765 goto bail;
4768 ptwr_ctxt.ctxt.regs = regs;
4769 ptwr_ctxt.ctxt.force_writeback = 0;
4770 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
4771 is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
4772 ptwr_ctxt.cr2 = addr;
4773 ptwr_ctxt.pte = pte;
4775 rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
4777 page_unlock(page);
4778 put_page(page);
4780 if ( rc == X86EMUL_UNHANDLEABLE )
4781 goto bail;
4783 perfc_incr(ptwr_emulations);
4784 return EXCRET_fault_fixed;
4786 bail:
4787 return 0;
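/*
 * free_xen_pagetable() must cope with pages from either allocator: Xen-heap
 * pages go back via free_xenheap_page(), anything else via
 * free_domheap_page().  During early boot (when page tables come from the
 * boot allocator) nothing is freed here.
 */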
4790 void free_xen_pagetable(void *v)
4792 if ( early_boot )
4793 return;
4795 if ( is_xen_heap_page(virt_to_page(v)) )
4796 free_xenheap_page(v);
4797 else
4798 free_domheap_page(virt_to_page(v));
4801 /* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
4802 #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f))
4803 #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))
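/*
 * e.g. l1f_to_lNf(PAGE_HYPERVISOR) yields flags with _PAGE_PSE set, suitable
 * for installing a present 2MB/1GB superpage entry, while lNf_to_l1f()
 * strips _PAGE_PSE again when a superpage is shattered into 4kB entries.
 */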
4805 /*
4806 * map_pages_to_xen() can be called with interrupts disabled:
4807 * * During early bootstrap; or
4808 * * From alloc_xenheap_pages(), via memguard_guard_range().
4809 * In these cases it is safe to use flush_area_local():
4810 * * Because only the local CPU is online; or
4811 * * Because stale TLB entries do not matter for memguard_[un]guard_range().
4812 */
4813 #define flush_area(v,f) (!local_irq_is_enabled() ? \
4814 flush_area_local((const void *)v, f) : \
4815 flush_area_all((const void *)v, f))
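/*
 * map_pages_to_xen() creates mappings in Xen's own address range.  It uses
 * 1GB (where cpu_has_page1gb) and 2MB superpages whenever virt, mfn and
 * nr_mfns are suitably aligned and the flags permit it, shatters existing
 * superpages that only partially match the request, and opportunistically
 * re-merges fully populated, uniform L1/L2 tables back into superpages.
 */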
4817 int map_pages_to_xen(
4818 unsigned long virt,
4819 unsigned long mfn,
4820 unsigned long nr_mfns,
4821 unsigned int flags)
4823 l2_pgentry_t *pl2e, ol2e;
4824 l1_pgentry_t *pl1e, ol1e;
4825 unsigned int i;
4827 while ( nr_mfns != 0 )
4829 #ifdef __x86_64__
4830 l3_pgentry_t *pl3e = virt_to_xen_l3e(virt);
4831 l3_pgentry_t ol3e = *pl3e;
4833 if ( cpu_has_page1gb &&
4834 !(((virt >> PAGE_SHIFT) | mfn) &
4835 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) &&
4836 nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) &&
4837 !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) )
4839 /* 1GB-page mapping. */
4840 l3e_write_atomic(pl3e, l3e_from_pfn(mfn, l1f_to_lNf(flags)));
4842 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) )
4844 unsigned int flush_flags =
4845 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4847 if ( l3e_get_flags(ol3e) & _PAGE_PSE )
4849 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4850 flush_flags |= FLUSH_TLB_GLOBAL;
4851 if ( (lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
4852 PAGE_CACHE_ATTRS )
4853 flush_flags |= FLUSH_CACHE;
4854 flush_area(virt, flush_flags);
4856 else
4858 pl2e = l3e_to_l2e(ol3e);
4859 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4861 ol2e = pl2e[i];
4862 if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4863 continue;
4864 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4866 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
4867 flush_flags |= FLUSH_TLB_GLOBAL;
4868 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
4869 PAGE_CACHE_ATTRS )
4870 flush_flags |= FLUSH_CACHE;
4872 else
4874 unsigned int j;
4876 pl1e = l2e_to_l1e(ol2e);
4877 for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ )
4879 ol1e = pl1e[j];
4880 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
4881 flush_flags |= FLUSH_TLB_GLOBAL;
4882 if ( (l1e_get_flags(ol1e) ^ flags) &
4883 PAGE_CACHE_ATTRS )
4884 flush_flags |= FLUSH_CACHE;
4888 flush_area(virt, flush_flags);
4889 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4891 ol2e = pl2e[i];
4892 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) &&
4893 !(l2e_get_flags(ol2e) & _PAGE_PSE) )
4894 free_xen_pagetable(l2e_to_l1e(ol2e));
4896 free_xen_pagetable(pl2e);
4900 virt += 1UL << L3_PAGETABLE_SHIFT;
4901 mfn += 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4902 nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4903 continue;
4906 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) &&
4907 (l3e_get_flags(ol3e) & _PAGE_PSE) )
4909 unsigned int flush_flags =
4910 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4912 /* Skip this PTE if there is no change. */
4913 if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES *
4914 L1_PAGETABLE_ENTRIES - 1)) +
4915 (l2_table_offset(virt) << PAGETABLE_ORDER) +
4916 l1_table_offset(virt) == mfn) &&
4917 ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
4918 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 )
4920 /* We can skip to end of L3 superpage if we got a match. */
4921 i = (1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4922 (mfn & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4923 if ( i > nr_mfns )
4924 i = nr_mfns;
4925 virt += i << PAGE_SHIFT;
4926 mfn += i;
4927 nr_mfns -= i;
4928 continue;
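/*
 * Shatter the existing 1GB mapping: build an L2 table replicating it in
 * 2MB chunks and atomically swap it into the L3 entry before proceeding.
 */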
4931 pl2e = alloc_xen_pagetable();
4932 if ( pl2e == NULL )
4933 return -ENOMEM;
4935 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4936 l2e_write(pl2e + i,
4937 l2e_from_pfn(l3e_get_pfn(ol3e) +
4938 (i << PAGETABLE_ORDER),
4939 l3e_get_flags(ol3e)));
4941 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4942 flush_flags |= FLUSH_TLB_GLOBAL;
4944 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4945 __PAGE_HYPERVISOR));
4946 flush_area(virt, flush_flags);
4948 #endif
4950 pl2e = virt_to_xen_l2e(virt);
4952 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
4953 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
4954 !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
4956 /* Super-page mapping. */
4957 ol2e = *pl2e;
4958 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_lNf(flags)));
4960 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4962 unsigned int flush_flags =
4963 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4965 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4967 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
4968 flush_flags |= FLUSH_TLB_GLOBAL;
4969 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
4970 PAGE_CACHE_ATTRS )
4971 flush_flags |= FLUSH_CACHE;
4972 flush_area(virt, flush_flags);
4974 else
4976 pl1e = l2e_to_l1e(ol2e);
4977 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4979 if ( l1e_get_flags(pl1e[i]) & _PAGE_GLOBAL )
4980 flush_flags |= FLUSH_TLB_GLOBAL;
4981 if ( (l1e_get_flags(pl1e[i]) ^ flags) &
4982 PAGE_CACHE_ATTRS )
4983 flush_flags |= FLUSH_CACHE;
4985 flush_area(virt, flush_flags);
4986 free_xen_pagetable(pl1e);
4990 virt += 1UL << L2_PAGETABLE_SHIFT;
4991 mfn += 1UL << PAGETABLE_ORDER;
4992 nr_mfns -= 1UL << PAGETABLE_ORDER;
4994 else
4996 /* Normal page mapping. */
4997 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4999 pl1e = alloc_xen_pagetable();
5000 if ( pl1e == NULL )
5001 return -ENOMEM;
5002 clear_page(pl1e);
5003 l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
5004 __PAGE_HYPERVISOR));
5006 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
5008 unsigned int flush_flags =
5009 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
5011 /* Skip this PTE if there is no change. */
5012 if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
5013 l1_table_offset(virt)) == mfn) &&
5014 (((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
5015 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
5017 /* We can skip to end of L2 superpage if we got a match. */
5018 i = (1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) -
5019 (mfn & ((1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
5020 if ( i > nr_mfns )
5021 i = nr_mfns;
5022 virt += i << L1_PAGETABLE_SHIFT;
5023 mfn += i;
5024 nr_mfns -= i;
5025 goto check_l3;
5028 pl1e = alloc_xen_pagetable();
5029 if ( pl1e == NULL )
5030 return -ENOMEM;
5032 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
5033 l1e_write(&pl1e[i],
5034 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
5035 lNf_to_l1f(l2e_get_flags(*pl2e))));
5037 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
5038 flush_flags |= FLUSH_TLB_GLOBAL;
5040 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
5041 __PAGE_HYPERVISOR));
5042 flush_area(virt, flush_flags);
5045 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
5046 ol1e = *pl1e;
5047 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
5048 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
5050 unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
5051 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
5052 flush_flags |= FLUSH_TLB_GLOBAL;
5053 if ( (l1e_get_flags(ol1e) ^ flags) & PAGE_CACHE_ATTRS )
5054 flush_flags |= FLUSH_CACHE;
5055 flush_area(virt, flush_flags);
5058 virt += 1UL << L1_PAGETABLE_SHIFT;
5059 mfn += 1UL;
5060 nr_mfns -= 1UL;
5062 if ( (flags == PAGE_HYPERVISOR) &&
5063 ((nr_mfns == 0) ||
5064 ((((virt >> PAGE_SHIFT) | mfn) &
5065 ((1 << PAGETABLE_ORDER) - 1)) == 0)) )
5067 unsigned long base_mfn;
5068 pl1e = l2e_to_l1e(*pl2e);
5069 base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
5070 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
5071 if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
5072 (l1e_get_flags(*pl1e) != flags) )
5073 break;
5074 if ( i == L1_PAGETABLE_ENTRIES )
5076 ol2e = *pl2e;
5077 l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
5078 l1f_to_lNf(flags)));
5079 flush_area(virt - PAGE_SIZE,
5080 FLUSH_TLB_GLOBAL |
5081 FLUSH_ORDER(PAGETABLE_ORDER));
5082 free_xen_pagetable(l2e_to_l1e(ol2e));
5087 check_l3: ;
5088 #ifdef __x86_64__
5089 if ( cpu_has_page1gb &&
5090 (flags == PAGE_HYPERVISOR) &&
5091 ((nr_mfns == 0) ||
5092 !(((virt >> PAGE_SHIFT) | mfn) &
5093 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) )
5095 unsigned long base_mfn;
5097 ol3e = *pl3e;
5098 pl2e = l3e_to_l2e(ol3e);
5099 base_mfn = l2e_get_pfn(*pl2e) & ~(L2_PAGETABLE_ENTRIES *
5100 L1_PAGETABLE_ENTRIES - 1);
5101 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
5102 if ( (l2e_get_pfn(*pl2e) !=
5103 (base_mfn + (i << PAGETABLE_ORDER))) ||
5104 (l2e_get_flags(*pl2e) != l1f_to_lNf(flags)) )
5105 break;
5106 if ( i == L2_PAGETABLE_ENTRIES )
5108 l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn,
5109 l1f_to_lNf(flags)));
5110 flush_area(virt - PAGE_SIZE,
5111 FLUSH_TLB_GLOBAL |
5112 FLUSH_ORDER(2*PAGETABLE_ORDER));
5113 free_xen_pagetable(l3e_to_l2e(ol3e));
5116 #endif
5119 return 0;
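/*
 * destroy_xen_mappings() removes all mappings in the page-aligned range
 * [s, e) from Xen's page tables, shattering superpages that straddle the
 * boundaries and freeing any intermediate tables left completely empty.
 */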
5122 void destroy_xen_mappings(unsigned long s, unsigned long e)
5124 l2_pgentry_t *pl2e;
5125 l1_pgentry_t *pl1e;
5126 unsigned int i;
5127 unsigned long v = s;
5129 ASSERT((s & ~PAGE_MASK) == 0);
5130 ASSERT((e & ~PAGE_MASK) == 0);
5132 while ( v < e )
5134 #ifdef __x86_64__
5135 l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
5137 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
5139 v += 1UL << L3_PAGETABLE_SHIFT;
5140 v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
5141 continue;
5144 if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
5146 if ( l2_table_offset(v) == 0 &&
5147 l1_table_offset(v) == 0 &&
5148 ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) )
5150 /* PAGE1GB: whole superpage is destroyed. */
5151 l3e_write_atomic(pl3e, l3e_empty());
5152 v += 1UL << L3_PAGETABLE_SHIFT;
5153 continue;
5156 /* PAGE1GB: shatter the superpage and fall through. */
5157 pl2e = alloc_xen_pagetable();
5158 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
5159 l2e_write(pl2e + i,
5160 l2e_from_pfn(l3e_get_pfn(*pl3e) +
5161 (i << PAGETABLE_ORDER),
5162 l3e_get_flags(*pl3e)));
5163 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
5164 __PAGE_HYPERVISOR));
5166 #endif
5168 pl2e = virt_to_xen_l2e(v);
5170 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
5172 v += 1UL << L2_PAGETABLE_SHIFT;
5173 v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
5174 continue;
5177 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
5179 if ( (l1_table_offset(v) == 0) &&
5180 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
5182 /* PSE: whole superpage is destroyed. */
5183 l2e_write_atomic(pl2e, l2e_empty());
5184 v += 1UL << L2_PAGETABLE_SHIFT;
5186 else
5188 /* PSE: shatter the superpage and try again. */
5189 pl1e = alloc_xen_pagetable();
5190 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
5191 l1e_write(&pl1e[i],
5192 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
5193 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
5194 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
5195 __PAGE_HYPERVISOR));
5198 else
5200 /* Ordinary 4kB mapping. */
5201 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
5202 l1e_write_atomic(pl1e, l1e_empty());
5203 v += PAGE_SIZE;
5205 /* If we are done with the L2E, check if it is now empty. */
5206 if ( (v != e) && (l1_table_offset(v) != 0) )
5207 continue;
5208 pl1e = l2e_to_l1e(*pl2e);
5209 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
5210 if ( l1e_get_intpte(pl1e[i]) != 0 )
5211 break;
5212 if ( i == L1_PAGETABLE_ENTRIES )
5214 /* Empty: zap the L2E and free the L1 page. */
5215 l2e_write_atomic(pl2e, l2e_empty());
5216 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
5217 free_xen_pagetable(pl1e);
5221 #ifdef __x86_64__
5222 /* If we are done with the L3E, check if it is now empty. */
5223 if ( (v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0) )
5224 continue;
5225 pl2e = l3e_to_l2e(*pl3e);
5226 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
5227 if ( l2e_get_intpte(pl2e[i]) != 0 )
5228 break;
5229 if ( i == L2_PAGETABLE_ENTRIES )
5231 /* Empty: zap the L3E and free the L2 page. */
5232 l3e_write_atomic(pl3e, l3e_empty());
5233 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
5234 free_xen_pagetable(pl2e);
5236 #endif
5239 flush_area(NULL, FLUSH_TLB_GLOBAL);
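/*
 * __set_fixmap() binds a single MFN to one of the compile-time fixmap
 * slots, e.g. (illustratively) __set_fixmap(FIX_APIC_BASE, mfn,
 * PAGE_HYPERVISOR_NOCACHE) to map the local APIC registers uncached.
 */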
5242 void __set_fixmap(
5243 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
5245 BUG_ON(idx >= __end_of_fixed_addresses);
5246 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
5249 #ifdef MEMORY_GUARD
5251 void memguard_init(void)
5253 unsigned long start = max_t(unsigned long, xen_phys_start, 1UL << 20);
5254 #ifdef __i386__
5255 map_pages_to_xen(
5256 (unsigned long)__va(start),
5257 start >> PAGE_SHIFT,
5258 (xenheap_phys_end - start) >> PAGE_SHIFT,
5259 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
5260 #else
5261 map_pages_to_xen(
5262 (unsigned long)__va(start),
5263 start >> PAGE_SHIFT,
5264 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
5265 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
5266 BUG_ON(start != xen_phys_start);
5267 map_pages_to_xen(
5268 XEN_VIRT_START,
5269 start >> PAGE_SHIFT,
5270 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
5271 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
5272 #endif
5275 static void __memguard_change_range(void *p, unsigned long l, int guard)
5277 unsigned long _p = (unsigned long)p;
5278 unsigned long _l = (unsigned long)l;
5279 unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
5281 /* Ensure we are dealing with a page-aligned whole number of pages. */
5282 ASSERT((_p&~PAGE_MASK) == 0);
5283 ASSERT((_l&~PAGE_MASK) == 0);
5285 if ( guard )
5286 flags &= ~_PAGE_PRESENT;
5288 map_pages_to_xen(
5289 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
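/*
 * Guarding a range simply remaps it with _PAGE_PRESENT clear (and small
 * pages forced), so any stray access faults immediately; unguarding
 * restores an ordinary __PAGE_HYPERVISOR mapping.
 */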
5292 void memguard_guard_range(void *p, unsigned long l)
5294 __memguard_change_range(p, l, 1);
5297 void memguard_unguard_range(void *p, unsigned long l)
5299 __memguard_change_range(p, l, 0);
5302 #endif
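/*
 * Place a guard page immediately below the primary stack area within each
 * stack allocation, so that overrunning the primary stack faults rather
 * than silently corrupting adjacent data.
 */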
5304 void memguard_guard_stack(void *p)
5306 BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
5307 p = (void *)((unsigned long)p + STACK_SIZE -
5308 PRIMARY_STACK_SIZE - PAGE_SIZE);
5309 memguard_guard_range(p, PAGE_SIZE);
5312 /*
5313 * Local variables:
5314 * mode: C
5315 * c-set-style: "BSD"
5316 * c-basic-offset: 4
5317 * tab-width: 4
5318 * indent-tabs-mode: nil
5319 * End:
5320 */