view xen/arch/x86/mm/p2m.c @ 20924:6ade83cb21ca (debuggers.hg)

xentrace: Trace p2m events

Add more tracing to aid in debugging ballooning / PoD:
* Nested page faults for EPT/NPT systems
* set_p2m_entry
* Decrease reservation (for ballooning)
* PoD populate, zero reclaim, superpage splinter

Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Feb 03 09:35:23 2010 +0000 (2010-02-03)
parents 47ec2d131c22
children 4a3e131f7498
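
For reference when decoding the trace records this changeset adds: the payloads passed to __trace_var() in the listing below have the layouts sketched here. This is a minimal consumer-side sketch assembled from the on-stack structs visible in the file; the struct and field names are invented for illustration, the per-record header that the trace subsystem prepends is not shown, and field packing is assumed to match the hypervisor's layout.

/* Sketch of the PoD/p2m trace payloads emitted via __trace_var() below. */
#include <stdint.h>

struct pod_zero_reclaim_rec {   /* TRC_MEM_POD_ZERO_RECLAIM, TRC_MEM_POD_POPULATE */
    uint64_t gfn, mfn;
    int d:16, order:16;         /* domain id, page order (0 or 9) */
};

struct set_p2m_entry_rec {      /* TRC_MEM_SET_P2M_ENTRY */
    uint64_t gfn, mfn;
    int p2mt;                   /* p2m type of the new entry */
    int d:16, order:16;
};

struct pod_splinter_rec {       /* TRC_MEM_POD_SUPERPAGE_SPLINTER */
    uint64_t gfn;
    int d:16;
};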
line source
1 /******************************************************************************
2 * arch/x86/mm/p2m.c
3 *
4 * physical-to-machine mappings for automatically-translated domains.
5 *
6 * Parts of this code are Copyright (c) 2009 by Citrix Systems, Inc. (Patrick Colp)
7 * Parts of this code are Copyright (c) 2007 by Advanced Micro Devices.
8 * Parts of this code are Copyright (c) 2006-2007 by XenSource Inc.
9 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
10 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 */
27 #include <asm/domain.h>
28 #include <asm/page.h>
29 #include <asm/paging.h>
30 #include <asm/p2m.h>
31 #include <asm/hvm/vmx/vmx.h> /* ept_p2m_init() */
32 #include <xen/iommu.h>
33 #include <asm/mem_event.h>
34 #include <public/mem_event.h>
35 #include <asm/mem_sharing.h>
36 #include <xen/event.h>
38 /* Debugging and auditing of the P2M code? */
39 #define P2M_AUDIT 0
40 #define P2M_DEBUGGING 0
42 /* Printouts */
43 #define P2M_PRINTK(_f, _a...) \
44 debugtrace_printk("p2m: %s(): " _f, __func__, ##_a)
45 #define P2M_ERROR(_f, _a...) \
46 printk("pg error: %s(): " _f, __func__, ##_a)
47 #if P2M_DEBUGGING
48 #define P2M_DEBUG(_f, _a...) \
49 debugtrace_printk("p2mdebug: %s(): " _f, __func__, ##_a)
50 #else
51 #define P2M_DEBUG(_f, _a...) do { (void)(_f); } while(0)
52 #endif
55 /* Override macros from asm/page.h to make them work with mfn_t */
56 #undef mfn_to_page
57 #define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
58 #undef mfn_valid
59 #define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
60 #undef page_to_mfn
61 #define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
64 /* PTE flags for the various types of p2m entry */
65 #define P2M_BASE_FLAGS \
66 (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED)
68 #define SUPERPAGE_PAGES (1UL << 9)
69 #define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0)
71 static unsigned long p2m_type_to_flags(p2m_type_t t)
72 {
73 unsigned long flags;
74 #ifdef __x86_64__
75 flags = (unsigned long)(t & 0x3fff) << 9;
76 #else
77 flags = (t & 0x7UL) << 9;
78 #endif
79 #ifndef HAVE_GRANT_MAP_P2M
80 BUG_ON(p2m_is_grant(t));
81 #endif
82 switch(t)
83 {
84 case p2m_invalid:
85 default:
86 return flags;
87 case p2m_ram_rw:
88 case p2m_grant_map_rw:
89 return flags | P2M_BASE_FLAGS | _PAGE_RW;
90 case p2m_ram_logdirty:
91 return flags | P2M_BASE_FLAGS;
92 case p2m_ram_ro:
93 case p2m_grant_map_ro:
94 return flags | P2M_BASE_FLAGS;
95 case p2m_ram_shared:
96 return flags | P2M_BASE_FLAGS;
97 case p2m_mmio_dm:
98 return flags;
99 case p2m_mmio_direct:
100 return flags | P2M_BASE_FLAGS | _PAGE_RW | _PAGE_PCD;
101 case p2m_populate_on_demand:
102 return flags;
103 }
104 }
106 #if P2M_AUDIT
107 static void audit_p2m(struct domain *d);
108 #else
109 # define audit_p2m(_d) do { (void)(_d); } while(0)
110 #endif /* P2M_AUDIT */
112 // Find the next level's P2M entry, checking for out-of-range gfn's...
113 // Returns NULL on error.
114 //
115 static l1_pgentry_t *
116 p2m_find_entry(void *table, unsigned long *gfn_remainder,
117 unsigned long gfn, u32 shift, u32 max)
118 {
119 u32 index;
121 index = *gfn_remainder >> shift;
122 if ( index >= max )
123 {
124 P2M_DEBUG("gfn=0x%lx out of range "
125 "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
126 gfn, *gfn_remainder, shift, index, max);
127 return NULL;
128 }
129 *gfn_remainder &= (1 << shift) - 1;
130 return (l1_pgentry_t *)table + index;
131 }
133 // Walk one level of the P2M table, allocating a new table if required.
134 // Returns 0 on error.
135 //
136 static int
137 p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
138 unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
139 u32 max, unsigned long type)
140 {
141 l1_pgentry_t *l1_entry;
142 l1_pgentry_t *p2m_entry;
143 l1_pgentry_t new_entry;
144 void *next;
145 int i;
146 ASSERT(d->arch.p2m->alloc_page);
148 if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
149 shift, max)) )
150 return 0;
152 /* PoD: Not present doesn't imply empty. */
153 if ( !l1e_get_flags(*p2m_entry) )
154 {
155 struct page_info *pg = d->arch.p2m->alloc_page(d);
156 if ( pg == NULL )
157 return 0;
158 page_list_add_tail(pg, &d->arch.p2m->pages);
159 pg->u.inuse.type_info = type | 1 | PGT_validated;
160 pg->count_info |= 1;
162 new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
163 __PAGE_HYPERVISOR|_PAGE_USER);
165 switch ( type ) {
166 case PGT_l3_page_table:
167 paging_write_p2m_entry(d, gfn,
168 p2m_entry, *table_mfn, new_entry, 4);
169 break;
170 case PGT_l2_page_table:
171 #if CONFIG_PAGING_LEVELS == 3
172 /* for PAE mode, PDPE only has PCD/PWT/P bits available */
173 new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT);
174 #endif
175 paging_write_p2m_entry(d, gfn,
176 p2m_entry, *table_mfn, new_entry, 3);
177 break;
178 case PGT_l1_page_table:
179 paging_write_p2m_entry(d, gfn,
180 p2m_entry, *table_mfn, new_entry, 2);
181 break;
182 default:
183 BUG();
184 break;
185 }
186 }
188 ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE));
190 /* split single large page into 4KB pages in P2M table */
191 if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
192 {
193 unsigned long flags, pfn;
194 struct page_info *pg = d->arch.p2m->alloc_page(d);
195 if ( pg == NULL )
196 return 0;
197 page_list_add_tail(pg, &d->arch.p2m->pages);
198 pg->u.inuse.type_info = PGT_l1_page_table | 1 | PGT_validated;
199 pg->count_info |= 1;
201 /* New splintered mappings inherit the flags of the old superpage,
202 * with a little reorganisation for the _PAGE_PSE_PAT bit. */
203 flags = l1e_get_flags(*p2m_entry);
204 pfn = l1e_get_pfn(*p2m_entry);
205 if ( pfn & 1 ) /* ==> _PAGE_PSE_PAT was set */
206 pfn -= 1; /* Clear it; _PAGE_PSE becomes _PAGE_PAT */
207 else
208 flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */
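/* Background: in a 2MB (PSE) entry the PAT attribute is bit 12, which shows
* up as bit 0 of the value returned by l1e_get_pfn(); in a 4KB entry PAT is
* bit 7, the same bit position as _PAGE_PSE. That is why the pfn and flags
* are adjusted above before the superpage is splintered into 4KB mappings. */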
210 l1_entry = __map_domain_page(pg);
211 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
212 {
213 new_entry = l1e_from_pfn(pfn + i, flags);
214 paging_write_p2m_entry(d, gfn,
215 l1_entry+i, *table_mfn, new_entry, 1);
216 }
217 unmap_domain_page(l1_entry);
219 new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
220 __PAGE_HYPERVISOR|_PAGE_USER);
221 paging_write_p2m_entry(d, gfn,
222 p2m_entry, *table_mfn, new_entry, 2);
223 }
225 *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
226 next = map_domain_page(mfn_x(*table_mfn));
227 unmap_domain_page(*table);
228 *table = next;
230 return 1;
231 }
233 /*
234 * Populate-on-demand functionality
235 */
236 static
237 int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
238 unsigned int page_order, p2m_type_t p2mt);
240 static int
241 p2m_pod_cache_add(struct domain *d,
242 struct page_info *page,
243 unsigned long order)
244 {
245 int i;
246 struct page_info *p;
247 struct p2m_domain *p2md = d->arch.p2m;
249 #ifndef NDEBUG
250 mfn_t mfn;
252 mfn = page_to_mfn(page);
254 /* Check to make sure this is a contiguous region */
255 if( mfn_x(mfn) & ((1 << order) - 1) )
256 {
257 printk("%s: mfn %lx not aligned order %lu! (mask %lx)\n",
258 __func__, mfn_x(mfn), order, ((1UL << order) - 1));
259 return -1;
260 }
262 for(i=0; i < 1 << order ; i++) {
263 struct domain * od;
265 p = mfn_to_page(_mfn(mfn_x(mfn) + i));
266 od = page_get_owner(p);
267 if(od != d)
268 {
269 printk("%s: mfn %lx expected owner d%d, got owner d%d!\n",
270 __func__, mfn_x(mfn), d->domain_id,
271 od?od->domain_id:-1);
272 return -1;
273 }
274 }
275 #endif
277 ASSERT(p2m_locked_by_me(p2md));
279 /*
280 * Pages from domain_alloc and returned by the balloon driver aren't
281 * guaranteed to be zero; but by reclaiming zero pages, we implicitly
282 * promise to provide zero pages. So we scrub pages before using.
283 */
284 for ( i = 0; i < (1 << order); i++ )
285 {
286 char *b = map_domain_page(mfn_x(page_to_mfn(page)) + i);
287 clear_page(b);
288 unmap_domain_page(b);
289 }
291 spin_lock(&d->page_alloc_lock);
293 /* First, take all pages off the domain list */
294 for(i=0; i < 1 << order ; i++)
295 {
296 p = page + i;
297 page_list_del(p, &d->page_list);
298 }
300 /* Then add the first one to the appropriate populate-on-demand list */
301 switch(order)
302 {
303 case 9:
304 page_list_add_tail(page, &p2md->pod.super); /* lock: page_alloc */
305 p2md->pod.count += 1 << order;
306 break;
307 case 0:
308 page_list_add_tail(page, &p2md->pod.single); /* lock: page_alloc */
309 p2md->pod.count += 1 ;
310 break;
311 default:
312 BUG();
313 }
315 /* Ensure that the PoD cache has never been emptied.
316 * This may cause "zombie domains" since the page will never be freed. */
317 BUG_ON( d->arch.relmem != RELMEM_not_started );
319 spin_unlock(&d->page_alloc_lock);
321 return 0;
322 }
324 /* Get a page of size order from the populate-on-demand cache. Will break
325 * down 2-meg pages into singleton pages automatically. Returns null if
326 * a superpage is requested and no superpages are available. Must be called
327 * with the d->page_alloc_lock held. */
328 static struct page_info * p2m_pod_cache_get(struct domain *d,
329 unsigned long order)
330 {
331 struct p2m_domain *p2md = d->arch.p2m;
332 struct page_info *p = NULL;
333 int i;
335 if ( order == 9 && page_list_empty(&p2md->pod.super) )
336 {
337 return NULL;
338 }
339 else if ( order == 0 && page_list_empty(&p2md->pod.single) )
340 {
341 unsigned long mfn;
342 struct page_info *q;
344 BUG_ON( page_list_empty(&p2md->pod.super) );
346 /* Break up a superpage to make single pages. NB count doesn't
347 * need to be adjusted. */
348 p = page_list_remove_head(&p2md->pod.super);
349 mfn = mfn_x(page_to_mfn(p));
351 for ( i=0; i<SUPERPAGE_PAGES; i++ )
352 {
353 q = mfn_to_page(_mfn(mfn+i));
354 page_list_add_tail(q, &p2md->pod.single);
355 }
356 }
358 switch ( order )
359 {
360 case 9:
361 BUG_ON( page_list_empty(&p2md->pod.super) );
362 p = page_list_remove_head(&p2md->pod.super);
363 p2md->pod.count -= 1 << order; /* Lock: page_alloc */
364 break;
365 case 0:
366 BUG_ON( page_list_empty(&p2md->pod.single) );
367 p = page_list_remove_head(&p2md->pod.single);
368 p2md->pod.count -= 1;
369 break;
370 default:
371 BUG();
372 }
374 /* Put the pages back on the domain page_list */
375 for ( i = 0 ; i < (1 << order) ; i++ )
376 {
377 BUG_ON(page_get_owner(p + i) != d);
378 page_list_add_tail(p + i, &d->page_list);
379 }
381 return p;
382 }
384 /* Set the size of the cache, allocating or freeing as necessary. */
385 static int
386 p2m_pod_set_cache_target(struct domain *d, unsigned long pod_target)
387 {
388 struct p2m_domain *p2md = d->arch.p2m;
389 int ret = 0;
391 /* Increasing the target */
392 while ( pod_target > p2md->pod.count )
393 {
394 struct page_info * page;
395 int order;
397 if ( (pod_target - p2md->pod.count) >= SUPERPAGE_PAGES )
398 order = 9;
399 else
400 order = 0;
401 retry:
402 page = alloc_domheap_pages(d, order, 0);
403 if ( unlikely(page == NULL) )
404 {
405 if ( order == 9 )
406 {
407 /* If we can't allocate a superpage, try singleton pages */
408 order = 0;
409 goto retry;
410 }
412 printk("%s: Unable to allocate domheap page for pod cache. target %lu cachesize %d\n",
413 __func__, pod_target, p2md->pod.count);
414 ret = -ENOMEM;
415 goto out;
416 }
418 p2m_pod_cache_add(d, page, order);
419 }
421 /* Decreasing the target */
422 /* We hold the p2m lock here, so we don't need to worry about
423 * cache disappearing under our feet. */
424 while ( pod_target < p2md->pod.count )
425 {
426 struct page_info * page;
427 int order, i;
429 /* Grab the lock before checking that pod.super is empty, or the last
430 * entries may disappear before we grab the lock. */
431 spin_lock(&d->page_alloc_lock);
433 if ( (p2md->pod.count - pod_target) > SUPERPAGE_PAGES
434 && !page_list_empty(&p2md->pod.super) )
435 order = 9;
436 else
437 order = 0;
439 page = p2m_pod_cache_get(d, order);
441 ASSERT(page != NULL);
443 spin_unlock(&d->page_alloc_lock);
445 /* Then free them */
446 for ( i = 0 ; i < (1 << order) ; i++ )
447 {
448 /* Copied from common/memory.c:guest_remove_page() */
449 if ( unlikely(!get_page(page+i, d)) )
450 {
451 gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id);
452 ret = -EINVAL;
453 goto out;
454 }
456 if ( test_and_clear_bit(_PGT_pinned, &(page+i)->u.inuse.type_info) )
457 put_page_and_type(page+i);
459 if ( test_and_clear_bit(_PGC_allocated, &(page+i)->count_info) )
460 put_page(page+i);
462 put_page(page+i);
463 }
464 }
466 out:
467 return ret;
468 }
470 /*
471 * The "right behavior" here requires some careful thought. First, some
472 * definitions:
473 * + M: static_max
474 * + B: number of pages the balloon driver has ballooned down to.
475 * + P: Number of populated pages.
476 * + T: Old target
477 * + T': New target
478 *
479 * The following equations should hold:
480 * 0 <= P <= T <= B <= M
481 * d->arch.p2m->pod.entry_count == B - P
482 * d->tot_pages == P + d->arch.p2m->pod.count
483 *
484 * Now we have the following potential cases to cover:
485 * B <T': Set the PoD cache size equal to the number of outstanding PoD
486 * entries. The balloon driver will deflate the balloon to give back
487 * the remainder of the ram to the guest OS.
488 * T <T'<B : Increase PoD cache size.
489 * T'<T<=B : Here we have a choice. We can decrease the size of the cache
490 * and get the memory right away. However, that means every time we
491 * reduce the memory target we risk the guest attempting to populate the
492 * memory before the balloon driver has reached its new target. Safer to
493 * never reduce the cache size here, but only when the balloon driver frees
494 * PoD ranges.
495 *
496 * If there are many zero pages, we could reach the target also by doing
497 * zero sweeps and marking the ranges PoD; but the balloon driver will have
498 * to free this memory eventually anyway, so we don't actually gain that much
499 * by doing so.
500 *
501 * NB that the equation (B<T') may require adjustment to the cache
502 * size as PoD pages are freed as well; i.e., freeing a PoD-backed
503 * entry when pod.entry_count == pod.count requires us to reduce both
504 * pod.entry_count and pod.count.
505 */
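/* Worked example with illustrative numbers (not drawn from a real guest):
* with M=1024, B=768, P=512 we have pod.entry_count = B - P = 256 and
* d->tot_pages = P + pod.count. If the new target is T' = 900 (> B), then
* p2m_pod_set_mem_target() below computes pod_target = T' - P = 388, clamps
* it to entry_count = 256, and sizes the cache to 256; the remaining
* 900 - 768 = 132 pages come from the balloon driver deflating, as described
* in the B < T' case above. */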
506 int
507 p2m_pod_set_mem_target(struct domain *d, unsigned long target)
508 {
509 unsigned pod_target;
510 struct p2m_domain *p2md = d->arch.p2m;
511 int ret = 0;
512 unsigned long populated;
514 p2m_lock(p2md);
516 /* P == B: Nothing to do. */
517 if ( p2md->pod.entry_count == 0 )
518 goto out;
520 /* Don't do anything if the domain is being torn down */
521 if ( d->is_dying )
522 goto out;
524 /* T' < B: Don't reduce the cache size; let the balloon driver
525 * take care of it. */
526 if ( target < d->tot_pages )
527 goto out;
529 populated = d->tot_pages - p2md->pod.count;
531 pod_target = target - populated;
533 /* B < T': Set the cache size equal to # of outstanding entries,
534 * let the balloon driver fill in the rest. */
535 if ( pod_target > p2md->pod.entry_count )
536 pod_target = p2md->pod.entry_count;
538 ASSERT( pod_target >= p2md->pod.count );
540 ret = p2m_pod_set_cache_target(d, pod_target);
542 out:
543 p2m_unlock(p2md);
545 return ret;
546 }
548 void
549 p2m_pod_empty_cache(struct domain *d)
550 {
551 struct p2m_domain *p2md = d->arch.p2m;
552 struct page_info *page;
554 /* After this barrier no new PoD activities can happen. */
555 BUG_ON(!d->is_dying);
556 spin_barrier(&p2md->lock);
558 spin_lock(&d->page_alloc_lock);
560 while ( (page = page_list_remove_head(&p2md->pod.super)) )
561 {
562 int i;
564 for ( i = 0 ; i < SUPERPAGE_PAGES ; i++ )
565 {
566 BUG_ON(page_get_owner(page + i) != d);
567 page_list_add_tail(page + i, &d->page_list);
568 }
570 p2md->pod.count -= SUPERPAGE_PAGES;
571 }
573 while ( (page = page_list_remove_head(&p2md->pod.single)) )
574 {
575 BUG_ON(page_get_owner(page) != d);
576 page_list_add_tail(page, &d->page_list);
578 p2md->pod.count -= 1;
579 }
581 BUG_ON(p2md->pod.count != 0);
583 spin_unlock(&d->page_alloc_lock);
584 }
586 /* This function is needed for two reasons:
587 * + To properly handle clearing of PoD entries
588 * + To "steal back" memory being freed for the PoD cache, rather than
589 * releasing it.
590 *
591 * Once both of these tasks have been completed, we can return and
592 * allow decrease_reservation() to handle everything else.
593 */
594 int
595 p2m_pod_decrease_reservation(struct domain *d,
596 xen_pfn_t gpfn,
597 unsigned int order)
598 {
599 struct p2m_domain *p2md = d->arch.p2m;
600 int ret=0;
601 int i;
603 int steal_for_cache = 0;
604 int pod = 0, nonpod = 0, ram = 0;
607 /* If we don't have any outstanding PoD entries, let things take their
608 * course */
609 if ( p2md->pod.entry_count == 0 )
610 goto out;
612 /* Figure out if we need to steal some freed memory for our cache */
613 steal_for_cache = ( p2md->pod.entry_count > p2md->pod.count );
615 p2m_lock(p2md);
616 audit_p2m(d);
618 if ( unlikely(d->is_dying) )
619 goto out_unlock;
621 /* See what's in here. */
622 /* FIXME: Add contiguous; query for PSE entries? */
623 for ( i=0; i<(1<<order); i++)
624 {
625 p2m_type_t t;
627 gfn_to_mfn_query(d, gpfn + i, &t);
629 if ( t == p2m_populate_on_demand )
630 pod++;
631 else
632 {
633 nonpod++;
634 if ( p2m_is_ram(t) )
635 ram++;
636 }
637 }
639 /* No populate-on-demand? Don't need to steal anything? Then we're done!*/
640 if(!pod && !steal_for_cache)
641 goto out_unlock;
643 if ( !nonpod )
644 {
645 /* All PoD: Mark the whole region invalid and tell caller
646 * we're done. */
647 set_p2m_entry(d, gpfn, _mfn(INVALID_MFN), order, p2m_invalid);
648 p2md->pod.entry_count-=(1<<order); /* Lock: p2m */
649 BUG_ON(p2md->pod.entry_count < 0);
650 ret = 1;
651 goto out_entry_check;
652 }
654 /* FIXME: Steal contig 2-meg regions for cache */
656 /* Process as long as:
657 * + There are PoD entries to handle, or
658 * + There is ram left, and we want to steal it
659 */
660 for ( i=0;
661 i<(1<<order) && (pod>0 || (steal_for_cache && ram > 0));
662 i++)
663 {
664 mfn_t mfn;
665 p2m_type_t t;
667 mfn = gfn_to_mfn_query(d, gpfn + i, &t);
668 if ( t == p2m_populate_on_demand )
669 {
670 set_p2m_entry(d, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid);
671 p2md->pod.entry_count--; /* Lock: p2m */
672 BUG_ON(p2md->pod.entry_count < 0);
673 pod--;
674 }
675 else if ( steal_for_cache && p2m_is_ram(t) )
676 {
677 struct page_info *page;
679 ASSERT(mfn_valid(mfn));
681 page = mfn_to_page(mfn);
683 set_p2m_entry(d, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid);
684 set_gpfn_from_mfn(mfn_x(mfn), INVALID_M2P_ENTRY);
686 p2m_pod_cache_add(d, page, 0);
688 steal_for_cache = ( p2md->pod.entry_count > p2md->pod.count );
690 nonpod--;
691 ram--;
692 }
693 }
695 /* If there are no more non-PoD entries, tell decrease_reservation() that
696 * there's nothing left to do. */
697 if ( nonpod == 0 )
698 ret = 1;
700 out_entry_check:
701 /* If we've reduced our "liabilities" beyond our "assets", free some */
702 if ( p2md->pod.entry_count < p2md->pod.count )
703 {
704 p2m_pod_set_cache_target(d, p2md->pod.entry_count);
705 }
707 out_unlock:
708 audit_p2m(d);
709 p2m_unlock(p2md);
711 out:
712 return ret;
713 }
715 void
716 p2m_pod_dump_data(struct domain *d)
717 {
718 struct p2m_domain *p2md = d->arch.p2m;
720 printk(" PoD entries=%d cachesize=%d\n",
721 p2md->pod.entry_count, p2md->pod.count);
722 }
725 /* Search for all-zero superpages to be reclaimed as superpages for the
726 * PoD cache. Must be called w/ p2m lock held, page_alloc lock not held. */
727 static int
728 p2m_pod_zero_check_superpage(struct domain *d, unsigned long gfn)
729 {
730 mfn_t mfn, mfn0 = _mfn(INVALID_MFN);
731 p2m_type_t type, type0 = 0;
732 unsigned long * map = NULL;
733 int ret=0, reset = 0;
734 int i, j;
735 int max_ref = 1;
737 if ( !superpage_aligned(gfn) )
738 goto out;
740 /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
741 if ( paging_mode_shadow(d) )
742 max_ref++;
744 /* Look up the mfns, checking to make sure they're the same mfn
745 * and aligned, and mapping them. */
746 for ( i=0; i<SUPERPAGE_PAGES; i++ )
747 {
749 mfn = gfn_to_mfn_query(d, gfn + i, &type);
751 if ( i == 0 )
752 {
753 mfn0 = mfn;
754 type0 = type;
755 }
757 /* Conditions that must be met for superpage-superpage:
758 * + All gfns are ram types
759 * + All gfns have the same type
760 * + All of the mfns are allocated to a domain
761 * + None of the mfns are used as pagetables, or allocated via xenheap
762 * + The first mfn is 2-meg aligned
763 * + All the other mfns are in sequence
764 * Adding for good measure:
765 * + None of the mfns are likely to be mapped elsewhere (refcount
766 * 2 or less for shadow, 1 for hap)
767 */
768 if ( !p2m_is_ram(type)
769 || type != type0
770 || ( (mfn_to_page(mfn)->count_info & PGC_allocated) == 0 )
771 || ( (mfn_to_page(mfn)->count_info & (PGC_page_table|PGC_xen_heap)) != 0 )
772 || ( (mfn_to_page(mfn)->count_info & PGC_xen_heap ) != 0 )
773 || ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > max_ref )
774 || !( ( i == 0 && superpage_aligned(mfn_x(mfn0)) )
775 || ( i != 0 && mfn_x(mfn) == (mfn_x(mfn0) + i) ) ) )
776 goto out;
777 }
779 /* Now, do a quick check to see if it may be zero before unmapping. */
780 for ( i=0; i<SUPERPAGE_PAGES; i++ )
781 {
782 /* Quick zero-check */
783 map = map_domain_page(mfn_x(mfn0) + i);
785 for ( j=0; j<16; j++ )
786 if( *(map+j) != 0 )
787 break;
789 unmap_domain_page(map);
791 if ( j < 16 )
792 goto out;
794 }
796 /* Try to remove the page, restoring old mapping if it fails. */
797 set_p2m_entry(d, gfn,
798 _mfn(POPULATE_ON_DEMAND_MFN), 9,
799 p2m_populate_on_demand);
801 /* Make sure none of the MFNs are used elsewhere... for example, mapped
802 * via the grant table interface, or by qemu. Allow one refcount for
803 * being allocated to the domain. */
804 for ( i=0; i < SUPERPAGE_PAGES; i++ )
805 {
806 mfn = _mfn(mfn_x(mfn0) + i);
807 if ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > 1 )
808 {
809 reset = 1;
810 goto out_reset;
811 }
812 }
814 /* Finally, do a full zero-check */
815 for ( i=0; i < SUPERPAGE_PAGES; i++ )
816 {
817 map = map_domain_page(mfn_x(mfn0) + i);
819 for ( j=0; j<PAGE_SIZE/sizeof(*map); j++ )
820 if( *(map+j) != 0 )
821 {
822 reset = 1;
823 break;
824 }
826 unmap_domain_page(map);
828 if ( reset )
829 goto out_reset;
830 }
832 if ( tb_init_done )
833 {
834 struct {
835 u64 gfn, mfn;
836 int d:16,order:16;
837 } t;
839 t.gfn = gfn;
840 t.mfn = mfn_x(mfn);
841 t.d = d->domain_id;
842 t.order = 9;
844 __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), (unsigned char *)&t);
845 }
847 /* Finally! We've passed all the checks, and can add the mfn superpage
848 * back on the PoD cache, and account for the new p2m PoD entries */
849 p2m_pod_cache_add(d, mfn_to_page(mfn0), 9);
850 d->arch.p2m->pod.entry_count += SUPERPAGE_PAGES;
852 out_reset:
853 if ( reset )
854 set_p2m_entry(d, gfn, mfn0, 9, type0);
856 out:
857 return ret;
858 }
860 static void
861 p2m_pod_zero_check(struct domain *d, unsigned long *gfns, int count)
862 {
863 mfn_t mfns[count];
864 p2m_type_t types[count];
865 unsigned long * map[count];
867 int i, j;
868 int max_ref = 1;
870 /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
871 if ( paging_mode_shadow(d) )
872 max_ref++;
874 /* First, get the gfn list, translate to mfns, and map the pages. */
875 for ( i=0; i<count; i++ )
876 {
877 mfns[i] = gfn_to_mfn_query(d, gfns[i], types + i);
878 /* If this is ram, and not a pagetable or from the xen heap, and probably not mapped
879 elsewhere, map it; otherwise, skip. */
880 if ( p2m_is_ram(types[i])
881 && ( (mfn_to_page(mfns[i])->count_info & PGC_allocated) != 0 )
882 && ( (mfn_to_page(mfns[i])->count_info & (PGC_page_table|PGC_xen_heap)) == 0 )
883 && ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) <= max_ref ) )
884 map[i] = map_domain_page(mfn_x(mfns[i]));
885 else
886 map[i] = NULL;
887 }
889 /* Then, go through and check for zeroed pages, removing write permission
890 * for those with zeroes. */
891 for ( i=0; i<count; i++ )
892 {
893 if(!map[i])
894 continue;
896 /* Quick zero-check */
897 for ( j=0; j<16; j++ )
898 if( *(map[i]+j) != 0 )
899 break;
901 if ( j < 16 )
902 {
903 unmap_domain_page(map[i]);
904 map[i] = NULL;
905 continue;
906 }
908 /* Try to remove the page, restoring old mapping if it fails. */
909 set_p2m_entry(d, gfns[i],
910 _mfn(POPULATE_ON_DEMAND_MFN), 0,
911 p2m_populate_on_demand);
913 /* See if the page was successfully unmapped. (Allow one refcount
914 * for being allocated to a domain.) */
915 if ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) > 1 )
916 {
917 unmap_domain_page(map[i]);
918 map[i] = NULL;
920 set_p2m_entry(d, gfns[i], mfns[i], 0, types[i]);
922 continue;
923 }
924 }
926 /* Now check each page for real */
927 for ( i=0; i < count; i++ )
928 {
929 if(!map[i])
930 continue;
932 for ( j=0; j<PAGE_SIZE/sizeof(*map[i]); j++ )
933 if( *(map[i]+j) != 0 )
934 break;
936 unmap_domain_page(map[i]);
938 /* See comment in p2m_pod_zero_check_superpage() re gnttab
939 * check timing. */
940 if ( j < PAGE_SIZE/sizeof(*map[i]) )
941 {
942 set_p2m_entry(d, gfns[i], mfns[i], 0, types[i]);
943 }
944 else
945 {
946 if ( tb_init_done )
947 {
948 struct {
949 u64 gfn, mfn;
950 int d:16,order:16;
951 } t;
953 t.gfn = gfns[i];
954 t.mfn = mfn_x(mfns[i]);
955 t.d = d->domain_id;
956 t.order = 0;
958 __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), (unsigned char *)&t);
959 }
961 /* Add to cache, and account for the new p2m PoD entry */
962 p2m_pod_cache_add(d, mfn_to_page(mfns[i]), 0);
963 d->arch.p2m->pod.entry_count++;
964 }
965 }
967 }
969 #define POD_SWEEP_LIMIT 1024
970 static void
971 p2m_pod_emergency_sweep_super(struct domain *d)
972 {
973 struct p2m_domain *p2md = d->arch.p2m;
974 unsigned long i, start, limit;
976 if ( p2md->pod.reclaim_super == 0 )
977 {
978 p2md->pod.reclaim_super = (p2md->pod.max_guest>>9)<<9;
979 p2md->pod.reclaim_super -= SUPERPAGE_PAGES;
980 }
982 start = p2md->pod.reclaim_super;
983 limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
985 for ( i=p2md->pod.reclaim_super ; i > 0 ; i-=SUPERPAGE_PAGES )
986 {
987 p2m_pod_zero_check_superpage(d, i);
988 /* Stop if we're past our limit and we have found *something*.
989 *
990 * NB that this is a zero-sum game; we're increasing our cache size
991 * by increasing our 'debt'. Since we hold the p2m lock,
992 * (entry_count - count) must remain the same. */
993 if ( !page_list_empty(&p2md->pod.super) && i < limit )
994 break;
995 }
997 p2md->pod.reclaim_super = i ? i - SUPERPAGE_PAGES : 0;
999 }
1001 #define POD_SWEEP_STRIDE 16
1002 static void
1003 p2m_pod_emergency_sweep(struct domain *d)
1004 {
1005 struct p2m_domain *p2md = d->arch.p2m;
1006 unsigned long gfns[POD_SWEEP_STRIDE];
1007 unsigned long i, j=0, start, limit;
1008 p2m_type_t t;
1011 if ( p2md->pod.reclaim_single == 0 )
1012 p2md->pod.reclaim_single = p2md->pod.max_guest;
1014 start = p2md->pod.reclaim_single;
1015 limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
1017 /* FIXME: Figure out how to avoid superpages */
1018 for ( i=p2md->pod.reclaim_single ; i > 0 ; i-- )
1019 {
1020 gfn_to_mfn_query(d, i, &t );
1021 if ( p2m_is_ram(t) )
1022 {
1023 gfns[j] = i;
1024 j++;
1025 BUG_ON(j > POD_SWEEP_STRIDE);
1026 if ( j == POD_SWEEP_STRIDE )
1027 {
1028 p2m_pod_zero_check(d, gfns, j);
1029 j = 0;
1030 }
1031 }
1032 /* Stop if we're past our limit and we have found *something*.
1033 *
1034 * NB that this is a zero-sum game; we're increasing our cache size
1035 * by re-increasing our 'debt'. Since we hold the p2m lock,
1036 * (entry_count - count) must remain the same. */
1037 if ( p2md->pod.count > 0 && i < limit )
1038 break;
1039 }
1041 if ( j )
1042 p2m_pod_zero_check(d, gfns, j);
1044 p2md->pod.reclaim_single = i ? i - 1 : i;
1046 }
1048 int
1049 p2m_pod_demand_populate(struct domain *d, unsigned long gfn,
1050 unsigned int order,
1051 p2m_query_t q)
1053 struct page_info *p = NULL; /* Compiler warnings */
1054 unsigned long gfn_aligned;
1055 mfn_t mfn;
1056 struct p2m_domain *p2md = d->arch.p2m;
1057 int i;
1059 ASSERT(p2m_locked_by_me(d->arch.p2m));
1061 /* This check is done with the p2m lock held. This will make sure that
1062 * even if d->is_dying changes under our feet, p2m_pod_empty_cache()
1063 * won't start until we're done. */
1064 if ( unlikely(d->is_dying) )
1065 goto out_fail;
1067 /* If we're low, start a sweep */
1068 if ( order == 9 && page_list_empty(&p2md->pod.super) )
1069 p2m_pod_emergency_sweep_super(d);
1071 if ( page_list_empty(&p2md->pod.single) &&
1072 ( ( order == 0 )
1073 || (order == 9 && page_list_empty(&p2md->pod.super) ) ) )
1074 p2m_pod_emergency_sweep(d);
1076 /* Keep track of the highest gfn demand-populated by a guest fault */
1077 if ( q == p2m_guest && gfn > p2md->pod.max_guest )
1078 p2md->pod.max_guest = gfn;
1080 spin_lock(&d->page_alloc_lock);
1082 if ( p2md->pod.count == 0 )
1083 goto out_of_memory;
1085 /* Get a page from the cache. A NULL return value indicates that the
1086 * 2-meg range should be marked singleton PoD, and retried */
1087 if ( (p = p2m_pod_cache_get(d, order)) == NULL )
1088 goto remap_and_retry;
1090 mfn = page_to_mfn(p);
1092 BUG_ON((mfn_x(mfn) & ((1 << order)-1)) != 0);
1094 spin_unlock(&d->page_alloc_lock);
1096 gfn_aligned = (gfn >> order) << order;
1098 set_p2m_entry(d, gfn_aligned, mfn, order, p2m_ram_rw);
1100 for( i = 0 ; i < (1UL << order) ; i++ )
1101 set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_aligned + i);
1103 p2md->pod.entry_count -= (1 << order); /* Lock: p2m */
1104 BUG_ON(p2md->pod.entry_count < 0);
1106 if ( tb_init_done )
1108 struct {
1109 u64 gfn, mfn;
1110 int d:16,order:16;
1111 } t;
1113 t.gfn = gfn;
1114 t.mfn = mfn_x(mfn);
1115 t.d = d->domain_id;
1116 t.order = order;
1118 __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), (unsigned char *)&t);
1121 return 0;
1122 out_of_memory:
1123 spin_unlock(&d->page_alloc_lock);
1125 printk("%s: Out of populate-on-demand memory! tot_pages %" PRIu32 " pod_entries %" PRIi32 "\n",
1126 __func__, d->tot_pages, p2md->pod.entry_count);
1127 domain_crash(d);
1128 out_fail:
1129 return -1;
1130 remap_and_retry:
1131 BUG_ON(order != 9);
1132 spin_unlock(&d->page_alloc_lock);
1134 /* Remap this 2-meg region in singleton chunks */
1135 gfn_aligned = (gfn>>order)<<order;
1136 for(i=0; i<(1<<order); i++)
1137 set_p2m_entry(d, gfn_aligned+i, _mfn(POPULATE_ON_DEMAND_MFN), 0,
1138 p2m_populate_on_demand);
1139 if ( tb_init_done )
1141 struct {
1142 u64 gfn;
1143 int d:16;
1144 } t;
1146 t.gfn = gfn;
1147 t.d = d->domain_id;
1149 __trace_var(TRC_MEM_POD_SUPERPAGE_SPLINTER, 0, sizeof(t), (unsigned char *)&t);
1152 return 0;
1155 /* Non-ept "lock-and-check" wrapper */
1156 static int p2m_pod_check_and_populate(struct domain *d, unsigned long gfn,
1157 l1_pgentry_t *p2m_entry, int order,
1158 p2m_query_t q)
1160 /* Only take the lock if we don't already have it. Otherwise it
1161 * wouldn't be safe to do p2m lookups with the p2m lock held */
1162 int do_locking = !p2m_locked_by_me(d->arch.p2m);
1163 int r;
1165 if ( do_locking )
1166 p2m_lock(d->arch.p2m);
1168 audit_p2m(d);
1170 /* Check to make sure this is still PoD */
1171 if ( p2m_flags_to_type(l1e_get_flags(*p2m_entry)) != p2m_populate_on_demand )
1173 if ( do_locking )
1174 p2m_unlock(d->arch.p2m);
1175 return 0;
1178 r = p2m_pod_demand_populate(d, gfn, order, q);
1180 audit_p2m(d);
1181 if ( do_locking )
1182 p2m_unlock(d->arch.p2m);
1184 return r;
1187 // Returns 0 on error (out of memory)
1188 static int
1189 p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
1190 unsigned int page_order, p2m_type_t p2mt)
1192 // XXX -- this might be able to be faster iff current->domain == d
1193 mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
1194 void *table =map_domain_page(mfn_x(table_mfn));
1195 unsigned long i, gfn_remainder = gfn;
1196 l1_pgentry_t *p2m_entry;
1197 l1_pgentry_t entry_content;
1198 l2_pgentry_t l2e_content;
1199 int rv=0;
1201 if ( tb_init_done )
1203 struct {
1204 u64 gfn, mfn;
1205 int p2mt;
1206 int d:16,order:16;
1207 } t;
1209 t.gfn = gfn;
1210 t.mfn = mfn_x(mfn);
1211 t.p2mt = p2mt;
1212 t.d = d->domain_id;
1213 t.order = page_order;
1215 __trace_var(TRC_MEM_SET_P2M_ENTRY, 0, sizeof(t), (unsigned char *)&t);
1218 #if CONFIG_PAGING_LEVELS >= 4
1219 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1220 L4_PAGETABLE_SHIFT - PAGE_SHIFT,
1221 L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
1222 goto out;
1223 #endif
1224 /*
1225 * When using PAE Xen, we only allow 33 bits of pseudo-physical
1226 * address in translated guests (i.e. 8 GBytes). This restriction
1227 * comes from wanting to map the P2M table into the 16MB RO_MPT hole
1228 * in Xen's address space for translated PV guests.
1229 * When using AMD's NPT on PAE Xen, we are restricted to 4GB.
1230 */
1231 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1232 L3_PAGETABLE_SHIFT - PAGE_SHIFT,
1233 ((CONFIG_PAGING_LEVELS == 3)
1234 ? (d->arch.hvm_domain.hap_enabled ? 4 : 8)
1235 : L3_PAGETABLE_ENTRIES),
1236 PGT_l2_page_table) )
1237 goto out;
1239 if ( page_order == 0 )
1241 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1242 L2_PAGETABLE_SHIFT - PAGE_SHIFT,
1243 L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
1244 goto out;
1246 p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
1247 0, L1_PAGETABLE_ENTRIES);
1248 ASSERT(p2m_entry);
1250 if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
1251 entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt));
1252 else
1253 entry_content = l1e_empty();
1255 /* level 1 entry */
1256 paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 1);
1258 else
1260 p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
1261 L2_PAGETABLE_SHIFT - PAGE_SHIFT,
1262 L2_PAGETABLE_ENTRIES);
1263 ASSERT(p2m_entry);
1265 /* FIXME: Deal with 4k replaced by 2meg pages */
1266 if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
1267 !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
1269 P2M_ERROR("configure P2M table 4KB L2 entry with large page\n");
1270 domain_crash(d);
1271 goto out;
1274 if ( mfn_valid(mfn) || p2m_is_magic(p2mt) )
1275 l2e_content = l2e_from_pfn(mfn_x(mfn),
1276 p2m_type_to_flags(p2mt) | _PAGE_PSE);
1277 else
1278 l2e_content = l2e_empty();
1280 entry_content.l1 = l2e_content.l2;
1281 paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 2);
1284 /* Track the highest gfn for which we have ever had a valid mapping */
1285 if ( mfn_valid(mfn)
1286 && (gfn + (1UL << page_order) - 1 > d->arch.p2m->max_mapped_pfn) )
1287 d->arch.p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
1289 if ( iommu_enabled && need_iommu(d) )
1291 if ( p2mt == p2m_ram_rw )
1292 for ( i = 0; i < (1UL << page_order); i++ )
1293 iommu_map_page(d, gfn+i, mfn_x(mfn)+i );
1294 else
1295 for ( int i = 0; i < (1UL << page_order); i++ )
1296 iommu_unmap_page(d, gfn+i);
1299 /* Success */
1300 rv = 1;
1302 out:
1303 unmap_domain_page(table);
1304 return rv;
1307 static mfn_t
1308 p2m_gfn_to_mfn(struct domain *d, unsigned long gfn, p2m_type_t *t,
1309 p2m_query_t q)
1311 mfn_t mfn;
1312 paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
1313 l2_pgentry_t *l2e;
1314 l1_pgentry_t *l1e;
1316 ASSERT(paging_mode_translate(d));
1318 /* XXX This is for compatibility with the old model, where anything not
1319 * XXX marked as RAM was considered to be emulated MMIO space.
1320 * XXX Once we start explicitly registering MMIO regions in the p2m
1321 * XXX we will return p2m_invalid for unmapped gfns */
1322 *t = p2m_mmio_dm;
1324 mfn = pagetable_get_mfn(d->arch.phys_table);
1326 if ( gfn > d->arch.p2m->max_mapped_pfn )
1327 /* This pfn is higher than the highest the p2m map currently holds */
1328 return _mfn(INVALID_MFN);
1330 #if CONFIG_PAGING_LEVELS >= 4
1332 l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn));
1333 l4e += l4_table_offset(addr);
1334 if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
1336 unmap_domain_page(l4e);
1337 return _mfn(INVALID_MFN);
1339 mfn = _mfn(l4e_get_pfn(*l4e));
1340 unmap_domain_page(l4e);
1342 #endif
1344 l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
1345 #if CONFIG_PAGING_LEVELS == 3
1346 /* On PAE hosts the p2m has eight l3 entries, not four (see
1347 * shadow_set_p2m_entry()) so we can't use l3_table_offset.
1348 * Instead, just count the number of l3es from zero. It's safe
1349 * to do this because we already checked that the gfn is within
1350 * the bounds of the p2m. */
1351 l3e += (addr >> L3_PAGETABLE_SHIFT);
1352 #else
1353 l3e += l3_table_offset(addr);
1354 #endif
1355 if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
1357 unmap_domain_page(l3e);
1358 return _mfn(INVALID_MFN);
1360 mfn = _mfn(l3e_get_pfn(*l3e));
1361 unmap_domain_page(l3e);
1364 l2e = map_domain_page(mfn_x(mfn));
1365 l2e += l2_table_offset(addr);
1367 pod_retry_l2:
1368 if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
1370 /* PoD: Try to populate a 2-meg chunk */
1371 if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand )
1373 if ( q != p2m_query ) {
1374 if ( !p2m_pod_check_and_populate(d, gfn,
1375 (l1_pgentry_t *)l2e, 9, q) )
1376 goto pod_retry_l2;
1377 } else
1378 *t = p2m_populate_on_demand;
1381 unmap_domain_page(l2e);
1382 return _mfn(INVALID_MFN);
1384 else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) )
1386 mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr));
1387 *t = p2m_flags_to_type(l2e_get_flags(*l2e));
1388 unmap_domain_page(l2e);
1390 ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
1391 return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
1394 mfn = _mfn(l2e_get_pfn(*l2e));
1395 unmap_domain_page(l2e);
1397 l1e = map_domain_page(mfn_x(mfn));
1398 l1e += l1_table_offset(addr);
1399 pod_retry_l1:
1400 if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
1402 /* PoD: Try to populate */
1403 if ( p2m_flags_to_type(l1e_get_flags(*l1e)) == p2m_populate_on_demand )
1405 if ( q != p2m_query ) {
1406 if ( !p2m_pod_check_and_populate(d, gfn,
1407 (l1_pgentry_t *)l1e, 0, q) )
1408 goto pod_retry_l1;
1409 } else
1410 *t = p2m_populate_on_demand;
1413 unmap_domain_page(l1e);
1414 return _mfn(INVALID_MFN);
1416 mfn = _mfn(l1e_get_pfn(*l1e));
1417 *t = p2m_flags_to_type(l1e_get_flags(*l1e));
1418 unmap_domain_page(l1e);
1420 ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
1421 return (p2m_is_valid(*t) || p2m_is_grant(*t)) ? mfn : _mfn(INVALID_MFN);
1424 /* Read the current domain's p2m table (through the linear mapping). */
1425 static mfn_t p2m_gfn_to_mfn_current(unsigned long gfn, p2m_type_t *t,
1426 p2m_query_t q)
1428 mfn_t mfn = _mfn(INVALID_MFN);
1429 p2m_type_t p2mt = p2m_mmio_dm;
1430 paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
1431 /* XXX This is for compatibility with the old model, where anything not
1432 * XXX marked as RAM was considered to be emulated MMIO space.
1433 * XXX Once we start explicitly registering MMIO regions in the p2m
1434 * XXX we will return p2m_invalid for unmapped gfns */
1436 if ( gfn <= current->domain->arch.p2m->max_mapped_pfn )
1438 l1_pgentry_t l1e = l1e_empty(), *p2m_entry;
1439 l2_pgentry_t l2e = l2e_empty();
1440 int ret;
1442 ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START)
1443 / sizeof(l1_pgentry_t));
1445 /*
1446 * Read & process L2
1447 */
1448 p2m_entry = &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START)
1449 + l2_linear_offset(addr)];
1451 pod_retry_l2:
1452 ret = __copy_from_user(&l2e,
1453 p2m_entry,
1454 sizeof(l2e));
1455 if ( ret != 0
1456 || !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
1458 if( (l2e_get_flags(l2e) & _PAGE_PSE)
1459 && ( p2m_flags_to_type(l2e_get_flags(l2e))
1460 == p2m_populate_on_demand ) )
1462 /* The read has succeeded, so we know that the mapping
1463 * exists at this point. */
1464 if ( q != p2m_query )
1466 if ( !p2m_pod_check_and_populate(current->domain, gfn,
1467 p2m_entry, 9, q) )
1468 goto pod_retry_l2;
1470 /* Allocate failed. */
1471 p2mt = p2m_invalid;
1472 printk("%s: Allocate failed!\n", __func__);
1473 goto out;
1475 else
1477 p2mt = p2m_populate_on_demand;
1478 goto out;
1482 goto pod_retry_l1;
1485 if (l2e_get_flags(l2e) & _PAGE_PSE)
1487 p2mt = p2m_flags_to_type(l2e_get_flags(l2e));
1488 ASSERT(l2e_get_pfn(l2e) != INVALID_MFN || !p2m_is_ram(p2mt));
1490 if ( p2m_is_valid(p2mt) )
1491 mfn = _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr));
1492 else
1493 p2mt = p2m_mmio_dm;
1495 goto out;
1498 /*
1499 * Read and process L1
1500 */
1502 /* Need to __copy_from_user because the p2m is sparse and this
1503 * part might not exist */
1504 pod_retry_l1:
1505 p2m_entry = &phys_to_machine_mapping[gfn];
1507 ret = __copy_from_user(&l1e,
1508 p2m_entry,
1509 sizeof(l1e));
1511 if ( ret == 0 ) {
1512 p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
1513 ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
1515 if ( p2m_flags_to_type(l1e_get_flags(l1e))
1516 == p2m_populate_on_demand )
1518 /* The read has succeeded, so we know that the mapping
1519 * exists at this point. */
1520 if ( q != p2m_query )
1522 if ( !p2m_pod_check_and_populate(current->domain, gfn,
1523 (l1_pgentry_t *)p2m_entry, 0, q) )
1524 goto pod_retry_l1;
1526 /* Allocate failed. */
1527 p2mt = p2m_invalid;
1528 goto out;
1530 else
1532 p2mt = p2m_populate_on_demand;
1533 goto out;
1537 if ( p2m_is_valid(p2mt) || p2m_is_grant(p2mt) )
1538 mfn = _mfn(l1e_get_pfn(l1e));
1539 else
1540 /* XXX see above */
1541 p2mt = p2m_mmio_dm;
1544 out:
1545 *t = p2mt;
1546 return mfn;
1549 /* Init the datastructures for later use by the p2m code */
1550 int p2m_init(struct domain *d)
1552 struct p2m_domain *p2m;
1554 p2m = xmalloc(struct p2m_domain);
1555 if ( p2m == NULL )
1556 return -ENOMEM;
1558 d->arch.p2m = p2m;
1560 memset(p2m, 0, sizeof(*p2m));
1561 p2m_lock_init(p2m);
1562 INIT_PAGE_LIST_HEAD(&p2m->pages);
1563 INIT_PAGE_LIST_HEAD(&p2m->pod.super);
1564 INIT_PAGE_LIST_HEAD(&p2m->pod.single);
1566 p2m->set_entry = p2m_set_entry;
1567 p2m->get_entry = p2m_gfn_to_mfn;
1568 p2m->get_entry_current = p2m_gfn_to_mfn_current;
1569 p2m->change_entry_type_global = p2m_change_type_global;
1571 if ( is_hvm_domain(d) && d->arch.hvm_domain.hap_enabled &&
1572 (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
1573 ept_p2m_init(d);
1575 return 0;
1578 void p2m_change_entry_type_global(struct domain *d,
1579 p2m_type_t ot, p2m_type_t nt)
1581 struct p2m_domain *p2m = d->arch.p2m;
1583 p2m_lock(p2m);
1584 p2m->change_entry_type_global(d, ot, nt);
1585 p2m_unlock(p2m);
1588 static
1589 int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
1590 unsigned int page_order, p2m_type_t p2mt)
1592 unsigned long todo = 1ul << page_order;
1593 unsigned int order;
1594 int rc = 1;
1596 while ( todo )
1598 if ( is_hvm_domain(d) && d->arch.hvm_domain.hap_enabled )
1599 order = (((gfn | mfn_x(mfn) | todo) & (SUPERPAGE_PAGES - 1)) == 0) ?
1600 9 : 0;
1601 else
1602 order = 0;
1603 if ( !d->arch.p2m->set_entry(d, gfn, mfn, order, p2mt) )
1604 rc = 0;
1605 gfn += 1ul << order;
1606 if ( mfn_x(mfn) != INVALID_MFN )
1607 mfn = _mfn(mfn_x(mfn) + (1ul << order));
1608 todo -= 1ul << order;
1611 return rc;
1614 // Allocate a new p2m table for a domain.
1615 //
1616 // The structure of the p2m table is that of a pagetable for xen (i.e. it is
1617 // controlled by CONFIG_PAGING_LEVELS).
1618 //
1619 // The alloc_page and free_page functions will be used to get memory to
1620 // build the p2m, and to release it again at the end of day.
1621 //
1622 // Returns 0 for success or -errno.
1623 //
1624 int p2m_alloc_table(struct domain *d,
1625 struct page_info * (*alloc_page)(struct domain *d),
1626 void (*free_page)(struct domain *d, struct page_info *pg))
1629 mfn_t mfn = _mfn(INVALID_MFN);
1630 struct page_info *page, *p2m_top;
1631 unsigned int page_count = 0;
1632 unsigned long gfn = -1UL;
1633 struct p2m_domain *p2m = d->arch.p2m;
1635 p2m_lock(p2m);
1637 if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
1639 P2M_ERROR("p2m already allocated for this domain\n");
1640 p2m_unlock(p2m);
1641 return -EINVAL;
1644 P2M_PRINTK("allocating p2m table\n");
1646 p2m->alloc_page = alloc_page;
1647 p2m->free_page = free_page;
1649 p2m_top = p2m->alloc_page(d);
1650 if ( p2m_top == NULL )
1652 p2m_unlock(p2m);
1653 return -ENOMEM;
1655 page_list_add_tail(p2m_top, &p2m->pages);
1657 p2m_top->count_info = 1;
1658 p2m_top->u.inuse.type_info =
1659 #if CONFIG_PAGING_LEVELS == 4
1660 PGT_l4_page_table
1661 #else
1662 PGT_l3_page_table
1663 #endif
1664 | 1 | PGT_validated;
1666 d->arch.phys_table = pagetable_from_mfn(page_to_mfn(p2m_top));
1668 P2M_PRINTK("populating p2m table\n");
1670 /* Initialise physmap tables for slot zero. Other code assumes this. */
1671 if ( !set_p2m_entry(d, 0, _mfn(INVALID_MFN), 0,
1672 p2m_invalid) )
1673 goto error;
1675 /* Copy all existing mappings from the page list and m2p */
1676 page_list_for_each(page, &d->page_list)
1678 mfn = page_to_mfn(page);
1679 gfn = get_gpfn_from_mfn(mfn_x(mfn));
1680 /* Pages should not be shared that early */
1681 ASSERT(gfn != SHARED_M2P_ENTRY);
1682 page_count++;
1683 if (
1684 #ifdef __x86_64__
1685 (gfn != 0x5555555555555555L)
1686 #else
1687 (gfn != 0x55555555L)
1688 #endif
1689 && gfn != INVALID_M2P_ENTRY
1690 && !set_p2m_entry(d, gfn, mfn, 0, p2m_ram_rw) )
1691 goto error;
1694 P2M_PRINTK("p2m table initialised (%u pages)\n", page_count);
1695 p2m_unlock(p2m);
1696 return 0;
1698 error:
1699 P2M_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%"
1700 PRI_mfn "\n", gfn, mfn_x(mfn));
1701 p2m_unlock(p2m);
1702 return -ENOMEM;
1705 void p2m_teardown(struct domain *d)
1706 /* Return all the p2m pages to Xen.
1707 * We know we don't have any extra mappings to these pages */
1709 struct page_info *pg;
1710 struct p2m_domain *p2m = d->arch.p2m;
1711 unsigned long gfn;
1712 p2m_type_t t;
1713 mfn_t mfn;
1715 p2m_lock(p2m);
1716 for(gfn=0; gfn < p2m->max_mapped_pfn; gfn++)
1718 mfn = p2m->get_entry(d, gfn, &t, p2m_query);
1719 if(mfn_valid(mfn) && (t == p2m_ram_shared))
1720 BUG_ON(mem_sharing_unshare_page(d, gfn, MEM_SHARING_DESTROY_GFN));
1722 d->arch.phys_table = pagetable_null();
1724 while ( (pg = page_list_remove_head(&p2m->pages)) )
1725 p2m->free_page(d, pg);
1726 p2m_unlock(p2m);
1729 void p2m_final_teardown(struct domain *d)
1731 xfree(d->arch.p2m);
1732 d->arch.p2m = NULL;
1735 #if P2M_AUDIT
1736 static void audit_p2m(struct domain *d)
1738 struct page_info *page;
1739 struct domain *od;
1740 unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
1741 int entry_count = 0;
1742 mfn_t p2mfn;
1743 unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
1744 int test_linear;
1745 p2m_type_t type;
1747 if ( !paging_mode_translate(d) )
1748 return;
1750 //P2M_PRINTK("p2m audit starts\n");
1752 test_linear = ( (d == current->domain)
1753 && !pagetable_is_null(current->arch.monitor_table) );
1754 if ( test_linear )
1755 flush_tlb_local();
1757 spin_lock(&d->page_alloc_lock);
1759 /* Audit part one: walk the domain's page allocation list, checking
1760 * the m2p entries. */
1761 page_list_for_each ( page, &d->page_list )
1763 mfn = mfn_x(page_to_mfn(page));
1765 // P2M_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
1767 od = page_get_owner(page);
1769 if ( od != d )
1771 P2M_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
1772 mfn, od, (od?od->domain_id:-1), d, d->domain_id);
1773 continue;
1776 gfn = get_gpfn_from_mfn(mfn);
1777 if ( gfn == INVALID_M2P_ENTRY )
1779 orphans_i++;
1780 //P2M_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
1781 // mfn);
1782 continue;
1785 if ( gfn == 0x55555555 )
1787 orphans_d++;
1788 //P2M_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
1789 // mfn);
1790 continue;
1793 if ( gfn == SHARED_P2M_ENTRY)
1795 P2M_PRINTK("shared mfn (%lx) on domain page list!\n",
1796 mfn);
1797 continue;
1800 p2mfn = gfn_to_mfn_type_foreign(d, gfn, &type, p2m_query);
1801 if ( mfn_x(p2mfn) != mfn )
1803 mpbad++;
1804 P2M_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
1805 " (-> gfn %#lx)\n",
1806 mfn, gfn, mfn_x(p2mfn),
1807 (mfn_valid(p2mfn)
1808 ? get_gpfn_from_mfn(mfn_x(p2mfn))
1809 : -1u));
1810 /* This m2p entry is stale: the domain has another frame in
1811 * this physical slot. No great disaster, but for neatness,
1812 * blow away the m2p entry. */
1813 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
1816 if ( test_linear && (gfn <= d->arch.p2m->max_mapped_pfn) )
1818 lp2mfn = mfn_x(gfn_to_mfn_query(d, gfn, &type));
1819 if ( lp2mfn != mfn_x(p2mfn) )
1821 P2M_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
1822 "(!= mfn %#lx)\n", gfn, lp2mfn, mfn_x(p2mfn));
1826 // P2M_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
1827 // mfn, gfn, p2mfn, lp2mfn);
1830 spin_unlock(&d->page_alloc_lock);
1832 /* Audit part two: walk the domain's p2m table, checking the entries. */
1833 if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
1835 l2_pgentry_t *l2e;
1836 l1_pgentry_t *l1e;
1837 int i1, i2;
1839 #if CONFIG_PAGING_LEVELS == 4
1840 l4_pgentry_t *l4e;
1841 l3_pgentry_t *l3e;
1842 int i3, i4;
1843 l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
1844 #else /* CONFIG_PAGING_LEVELS == 3 */
1845 l3_pgentry_t *l3e;
1846 int i3;
1847 l3e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
1848 #endif
1850 gfn = 0;
1851 #if CONFIG_PAGING_LEVELS >= 4
1852 for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
1854 if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
1856 gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
1857 continue;
1859 l3e = map_domain_page(mfn_x(_mfn(l4e_get_pfn(l4e[i4]))));
1860 #endif
1861 for ( i3 = 0;
1862 i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
1863 i3++ )
1865 if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
1867 gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
1868 continue;
1870 l2e = map_domain_page(mfn_x(_mfn(l3e_get_pfn(l3e[i3]))));
1871 for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
1873 if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
1875 if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE)
1876 && ( p2m_flags_to_type(l2e_get_flags(l2e[i2]))
1877 == p2m_populate_on_demand ) )
1878 entry_count+=SUPERPAGE_PAGES;
1879 gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
1880 continue;
1883 /* check for super page */
1884 if ( l2e_get_flags(l2e[i2]) & _PAGE_PSE )
1886 mfn = l2e_get_pfn(l2e[i2]);
1887 ASSERT(mfn_valid(_mfn(mfn)));
1888 for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++)
1890 m2pfn = get_gpfn_from_mfn(mfn+i1);
1891 /* Allow shared M2Ps */
1892 if ( (m2pfn != (gfn + i1)) &&
1893 (m2pfn != SHARED_M2P_ENTRY) )
1895 pmbad++;
1896 P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
1897 " -> gfn %#lx\n", gfn+i1, mfn+i1,
1898 m2pfn);
1899 BUG();
1902 gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
1903 continue;
1906 l1e = map_domain_page(mfn_x(_mfn(l2e_get_pfn(l2e[i2]))));
1908 for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
1910 p2m_type_t type;
1912 type = p2m_flags_to_type(l1e_get_flags(l1e[i1]));
1913 if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
1915 if ( type == p2m_populate_on_demand )
1916 entry_count++;
1917 continue;
1919 mfn = l1e_get_pfn(l1e[i1]);
1920 ASSERT(mfn_valid(_mfn(mfn)));
1921 m2pfn = get_gpfn_from_mfn(mfn);
1922 if ( m2pfn != gfn &&
1923 type != p2m_mmio_direct &&
1924 !p2m_is_grant(type) &&
1925 !p2m_is_shared(type) )
1927 pmbad++;
1928 printk("mismatch: gfn %#lx -> mfn %#lx"
1929 " -> gfn %#lx\n", gfn, mfn, m2pfn);
1930 P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
1931 " -> gfn %#lx\n", gfn, mfn, m2pfn);
1932 BUG();
1935 unmap_domain_page(l1e);
1937 unmap_domain_page(l2e);
1939 #if CONFIG_PAGING_LEVELS >= 4
1940 unmap_domain_page(l3e);
1942 #endif
1944 #if CONFIG_PAGING_LEVELS == 4
1945 unmap_domain_page(l4e);
1946 #else /* CONFIG_PAGING_LEVELS == 3 */
1947 unmap_domain_page(l3e);
1948 #endif
1952 if ( entry_count != d->arch.p2m->pod.entry_count )
1954 printk("%s: refcounted entry count %d, audit count %d!\n",
1955 __func__,
1956 d->arch.p2m->pod.entry_count,
1957 entry_count);
1958 BUG();
1961 //P2M_PRINTK("p2m audit complete\n");
1962 //if ( orphans_i | orphans_d | mpbad | pmbad )
1963 // P2M_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
1964 // orphans_i + orphans_d, orphans_i, orphans_d,
1965 if ( mpbad | pmbad )
1966 P2M_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
1967 pmbad, mpbad);
1969 #endif /* P2M_AUDIT */
1973 static void
1974 p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn,
1975 unsigned int page_order)
1977 unsigned long i;
1978 mfn_t mfn_return;
1979 p2m_type_t t;
1981 if ( !paging_mode_translate(d) )
1983 if ( need_iommu(d) )
1984 for ( i = 0; i < (1 << page_order); i++ )
1985 iommu_unmap_page(d, mfn + i);
1986 return;
1989 P2M_DEBUG("removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
1991 for ( i = 0; i < (1UL << page_order); i++ )
1993 mfn_return = d->arch.p2m->get_entry(d, gfn + i, &t, p2m_query);
1994 if ( !p2m_is_grant(t) )
1995 set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY);
1996 ASSERT( !p2m_is_valid(t) || mfn + i == mfn_x(mfn_return) );
1998 set_p2m_entry(d, gfn, _mfn(INVALID_MFN), page_order, p2m_invalid);
2001 void
2002 guest_physmap_remove_page(struct domain *d, unsigned long gfn,
2003 unsigned long mfn, unsigned int page_order)
2005 p2m_lock(d->arch.p2m);
2006 audit_p2m(d);
2007 p2m_remove_page(d, gfn, mfn, page_order);
2008 audit_p2m(d);
2009 p2m_unlock(d->arch.p2m);
2012 #if CONFIG_PAGING_LEVELS == 3
2013 static int gfn_check_limit(
2014 struct domain *d, unsigned long gfn, unsigned int order)
2016 /*
2017 * 32bit AMD nested paging does not support over 4GB guest due to
2018 * hardware translation limit. This limitation is checked by comparing
2019 * gfn with 0xfffffUL.
2020 */
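/* Arithmetic behind the constant: 0x100000 guest frames x 4KB per frame
* = 4GB, so any gfn above 0xfffffUL lies beyond the 4GB boundary. */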
2021 if ( !paging_mode_hap(d) || ((gfn + (1ul << order)) <= 0x100000UL) ||
2022 (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
2023 return 0;
2025 if ( !test_and_set_bool(d->arch.hvm_domain.svm.npt_4gb_warning) )
2026 dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond"
2027 " 4GB: specify 'hap=0' domain config option.\n",
2028 d->domain_id);
2030 return -EINVAL;
2032 #else
2033 #define gfn_check_limit(d, g, o) 0
2034 #endif
2036 int
2037 guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
2038 unsigned int order)
2040 struct p2m_domain *p2md = d->arch.p2m;
2041 unsigned long i;
2042 p2m_type_t ot;
2043 mfn_t omfn;
2044 int pod_count = 0;
2045 int rc = 0;
2047 BUG_ON(!paging_mode_translate(d));
2049 rc = gfn_check_limit(d, gfn, order);
2050 if ( rc != 0 )
2051 return rc;
2053 p2m_lock(p2md);
2054 audit_p2m(d);
2056 P2M_DEBUG("mark pod gfn=%#lx\n", gfn);
2058 /* Make sure all gpfns are unused */
2059 for ( i = 0; i < (1UL << order); i++ )
2061 omfn = gfn_to_mfn_query(d, gfn + i, &ot);
2062 if ( p2m_is_ram(ot) )
2064 printk("%s: gfn_to_mfn returned type %d!\n",
2065 __func__, ot);
2066 rc = -EBUSY;
2067 goto out;
2069 else if ( ot == p2m_populate_on_demand )
2071 /* Count how many PoD entries we'll be replacing if successful */
2072 pod_count++;
2076 /* Now, actually do the two-way mapping */
2077 if ( !set_p2m_entry(d, gfn, _mfn(POPULATE_ON_DEMAND_MFN), order,
2078 p2m_populate_on_demand) )
2079 rc = -EINVAL;
2080 else
2082 p2md->pod.entry_count += 1 << order; /* Lock: p2m */
2083 p2md->pod.entry_count -= pod_count;
2084 BUG_ON(p2md->pod.entry_count < 0);
2087 audit_p2m(d);
2088 p2m_unlock(p2md);
2090 out:
2091 return rc;
2095 int
2096 guest_physmap_add_entry(struct domain *d, unsigned long gfn,
2097 unsigned long mfn, unsigned int page_order,
2098 p2m_type_t t)
2100 unsigned long i, ogfn;
2101 p2m_type_t ot;
2102 mfn_t omfn;
2103 int pod_count = 0;
2104 int rc = 0;
2106 if ( !paging_mode_translate(d) )
2108 if ( need_iommu(d) && t == p2m_ram_rw )
2110 for ( i = 0; i < (1 << page_order); i++ )
2111 if ( (rc = iommu_map_page(d, mfn + i, mfn + i)) != 0 )
2113 while ( i-- > 0 )
2114 iommu_unmap_page(d, mfn + i);
2115 return rc;
2118 return 0;
2121 rc = gfn_check_limit(d, gfn, page_order);
2122 if ( rc != 0 )
2123 return rc;
2125 p2m_lock(d->arch.p2m);
2126 audit_p2m(d);
2128 P2M_DEBUG("adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
2130 /* First, remove m->p mappings for existing p->m mappings */
2131 for ( i = 0; i < (1UL << page_order); i++ )
2133 omfn = gfn_to_mfn_query(d, gfn + i, &ot);
2134 if ( p2m_is_grant(ot) )
2136 /* Really shouldn't be unmapping grant maps this way */
2137 domain_crash(d);
2138 p2m_unlock(d->arch.p2m);
2139 return -EINVAL;
2141 else if ( p2m_is_ram(ot) )
2143 ASSERT(mfn_valid(omfn));
2144 set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
2146 else if ( ot == p2m_populate_on_demand )
2148 /* Count how many PoD entries we'll be replacing if successful */
2149 pod_count++;
2153 /* Then, look for m->p mappings for this range and deal with them */
2154 for ( i = 0; i < (1UL << page_order); i++ )
2156 if ( page_get_owner(mfn_to_page(_mfn(mfn + i))) != d )
2157 continue;
2158 ogfn = mfn_to_gfn(d, _mfn(mfn+i));
2159 if (
2160 #ifdef __x86_64__
2161 (ogfn != 0x5555555555555555L)
2162 #else
2163 (ogfn != 0x55555555L)
2164 #endif
2165 && (ogfn != INVALID_M2P_ENTRY)
2166 && (ogfn != gfn + i) )
2168 /* This machine frame is already mapped at another physical
2169 * address */
2170 P2M_DEBUG("aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
2171 mfn + i, ogfn, gfn + i);
2172 omfn = gfn_to_mfn_query(d, ogfn, &ot);
2173 /* If we get here, we know the local domain owns the page,
2174 so it can't have been grant mapped in. */
2175 BUG_ON( p2m_is_grant(ot) );
2176 if ( p2m_is_ram(ot) )
2178 ASSERT(mfn_valid(omfn));
2179 P2M_DEBUG("old gfn=%#lx -> mfn %#lx\n",
2180 ogfn, mfn_x(omfn));
2181 if ( mfn_x(omfn) == (mfn + i) )
2182 p2m_remove_page(d, ogfn, mfn + i, 0);
2187 /* Now, actually do the two-way mapping */
2188 if ( mfn_valid(_mfn(mfn)) )
2190 if ( !set_p2m_entry(d, gfn, _mfn(mfn), page_order, t) )
2191 rc = -EINVAL;
2192 if ( !p2m_is_grant(t) )
2194 for ( i = 0; i < (1UL << page_order); i++ )
2195 set_gpfn_from_mfn(mfn+i, gfn+i);
2198 else
2200 gdprintk(XENLOG_WARNING, "Adding bad mfn to p2m map (%#lx -> %#lx)\n",
2201 gfn, mfn);
2202 if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), page_order,
2203 p2m_invalid) )
2204 rc = -EINVAL;
2205 else
2207 d->arch.p2m->pod.entry_count -= pod_count; /* Lock: p2m */
2208 BUG_ON(d->arch.p2m->pod.entry_count < 0);
2212 audit_p2m(d);
2213 p2m_unlock(d->arch.p2m);
2215 return rc;
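/* A minimal sketch (hypothetical helper, not part of the p2m interface):
 * giving a guest one new 4kB RAM page by allocating from the domheap and
 * installing it read-write.  Error handling is simplified: on failure the
 * freshly allocated page is simply leaked here. */
static int example_add_fresh_ram_page(struct domain *d, unsigned long gfn)
{
    struct page_info *page = alloc_domheap_page(d, 0);

    if ( page == NULL )
        return -ENOMEM;

    return guest_physmap_add_entry(d, gfn, mfn_x(page_to_mfn(page)),
                                   0 /* 4kB page */, p2m_ram_rw);
}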
2218 /* Walk the whole p2m table, changing any entries of the old type
2219 * to the new type. This is used in hardware-assisted paging to
2220 * quickly enable or disable log-dirty tracking */
2221 void p2m_change_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt)
2223 unsigned long mfn, gfn, flags;
2224 l1_pgentry_t l1e_content;
2225 l1_pgentry_t *l1e;
2226 l2_pgentry_t *l2e;
2227 mfn_t l1mfn, l2mfn;
2228 unsigned long i1, i2, i3;
2229 l3_pgentry_t *l3e;
2230 #if CONFIG_PAGING_LEVELS == 4
2231 l4_pgentry_t *l4e;
2232 unsigned long i4;
2233 #endif /* CONFIG_PAGING_LEVELS == 4 */
2235 BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
2237 if ( !paging_mode_translate(d) )
2238 return;
2240 if ( pagetable_get_pfn(d->arch.phys_table) == 0 )
2241 return;
2243 ASSERT(p2m_locked_by_me(d->arch.p2m));
2245 #if CONFIG_PAGING_LEVELS == 4
2246 l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
2247 #else /* CONFIG_PAGING_LEVELS == 3 */
2248 l3e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
2249 #endif
2251 #if CONFIG_PAGING_LEVELS >= 4
2252 for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
2254 if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
2256 continue;
2258 l3e = map_domain_page(l4e_get_pfn(l4e[i4]));
2259 #endif
2260 for ( i3 = 0;
2261 i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
2262 i3++ )
2264 if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
2266 continue;
2268 l2mfn = _mfn(l3e_get_pfn(l3e[i3]));
2269 l2e = map_domain_page(l3e_get_pfn(l3e[i3]));
2270 for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
2272 if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
2274 continue;
2277 if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE) )
2279 flags = l2e_get_flags(l2e[i2]);
2280 if ( p2m_flags_to_type(flags) != ot )
2281 continue;
2282 mfn = l2e_get_pfn(l2e[i2]);
2283 /* Do not use get_gpfn_from_mfn because it may return
2284 SHARED_M2P_ENTRY */
2285 gfn = (i2 + (i3
2286 #if CONFIG_PAGING_LEVELS >= 4
2287 + (i4 * L3_PAGETABLE_ENTRIES)
2288 #endif
2290 * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES;
2291 flags = p2m_type_to_flags(nt);
2292 l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
2293 paging_write_p2m_entry(d, gfn, (l1_pgentry_t *)&l2e[i2],
2294 l2mfn, l1e_content, 2);
2295 continue;
2298 l1mfn = _mfn(l2e_get_pfn(l2e[i2]));
2299 l1e = map_domain_page(mfn_x(l1mfn));
2301 for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
2303 flags = l1e_get_flags(l1e[i1]);
2304 if ( p2m_flags_to_type(flags) != ot )
2305 continue;
2306 mfn = l1e_get_pfn(l1e[i1]);
2307 gfn = i1 + (i2 + (i3
2308 #if CONFIG_PAGING_LEVELS >= 4
2309 + (i4 * L3_PAGETABLE_ENTRIES)
2310 #endif
2312 * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES;
2313 /* create a new l1e entry with the new type */
2314 flags = p2m_type_to_flags(nt);
2315 l1e_content = l1e_from_pfn(mfn, flags);
2316 paging_write_p2m_entry(d, gfn, &l1e[i1],
2317 l1mfn, l1e_content, 1);
2319 unmap_domain_page(l1e);
2321 unmap_domain_page(l2e);
2323 #if CONFIG_PAGING_LEVELS >= 4
2324 unmap_domain_page(l3e);
2326 #endif
2328 #if CONFIG_PAGING_LEVELS == 4
2329 unmap_domain_page(l4e);
2330 #else /* CONFIG_PAGING_LEVELS == 3 */
2331 unmap_domain_page(l3e);
2332 #endif
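/* Illustrative sketch (hypothetical caller, not from this file): the global
 * walk above expects the p2m lock to be held, e.g. when switching log-dirty
 * tracking on for a HAP guest. */
static void example_enable_logdirty(struct domain *d)
{
    p2m_lock(d->arch.p2m);
    p2m_change_type_global(d, p2m_ram_rw, p2m_ram_logdirty);
    p2m_unlock(d->arch.p2m);
    /* A real caller would also flush TLBs/EPT afterwards so the read-only
     * permissions take effect on all vcpus. */
}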
2336 /* Modify the p2m type of a single gfn from ot to nt, returning the
2337 * entry's previous type */
2338 p2m_type_t p2m_change_type(struct domain *d, unsigned long gfn,
2339 p2m_type_t ot, p2m_type_t nt)
2341 p2m_type_t pt;
2342 mfn_t mfn;
2344 BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
2346 p2m_lock(d->arch.p2m);
2348 mfn = gfn_to_mfn(d, gfn, &pt);
2349 if ( pt == ot )
2350 set_p2m_entry(d, gfn, mfn, 0, nt);
2352 p2m_unlock(d->arch.p2m);
2354 return pt;
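/* A minimal sketch (hypothetical, simplified): the single-gfn variant is
 * the natural tool for the write-fault side of log-dirty, converting one
 * page back to read-write once the dirty bit has been recorded. */
static void example_handle_logdirty_write(struct domain *d, unsigned long gfn)
{
    /* The type is only flipped if the entry is still p2m_ram_logdirty. */
    if ( p2m_change_type(d, gfn, p2m_ram_logdirty, p2m_ram_rw)
         == p2m_ram_logdirty )
    {
        /* A real handler would mark the frame in the log-dirty bitmap and
         * flush the stale, read-only mapping here. */
    }
}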
2357 int
2358 set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
2360 int rc = 0;
2361 p2m_type_t ot;
2362 mfn_t omfn;
2364 if ( !paging_mode_translate(d) )
2365 return 0;
2367 omfn = gfn_to_mfn_query(d, gfn, &ot);
2368 if ( p2m_is_grant(ot) )
2370 domain_crash(d);
2371 return 0;
2373 else if ( p2m_is_ram(ot) )
2375 ASSERT(mfn_valid(omfn));
2376 set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
2379 P2M_DEBUG("set mmio %lx %lx\n", gfn, mfn_x(mfn));
2380 p2m_lock(d->arch.p2m);
2381 rc = set_p2m_entry(d, gfn, mfn, 0, p2m_mmio_direct);
2382 p2m_unlock(d->arch.p2m);
2383 if ( 0 == rc )
2384 gdprintk(XENLOG_ERR,
2385 "set_mmio_p2m_entry: set_p2m_entry failed! mfn=%08lx\n",
2386 gmfn_to_mfn(d, gfn));
2387 return rc;
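/* Illustrative sketch (hypothetical helper): direct-mapping a small,
 * page-aligned MMIO range for a passthrough device one frame at a time,
 * with teardown of partial work via clear_mmio_p2m_entry().  Note that
 * set_mmio_p2m_entry() returns nonzero on success. */
static int example_map_mmio_range(struct domain *d, unsigned long gfn,
                                  unsigned long mfn, unsigned long nr)
{
    unsigned long i;

    for ( i = 0; i < nr; i++ )
        if ( !set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i)) )
        {
            while ( i-- > 0 )
                clear_mmio_p2m_entry(d, gfn + i);
            return -EINVAL;
        }

    return 0;
}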
2390 int
2391 clear_mmio_p2m_entry(struct domain *d, unsigned long gfn)
2393 int rc = 0;
2394 unsigned long mfn;
2396 if ( !paging_mode_translate(d) )
2397 return 0;
2399 mfn = gmfn_to_mfn(d, gfn);
2400 if ( INVALID_MFN == mfn )
2402 gdprintk(XENLOG_ERR,
2403 "clear_mmio_p2m_entry: gfn_to_mfn failed! gfn=%08lx\n", gfn);
2404 return 0;
2406 p2m_lock(d->arch.p2m);
2407 rc = set_p2m_entry(d, gfn, _mfn(INVALID_MFN), 0, 0);
2408 p2m_unlock(d->arch.p2m);
2410 return rc;
2413 int
2414 set_shared_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
2416 int rc = 0;
2417 p2m_type_t ot;
2418 mfn_t omfn;
2420 if ( !paging_mode_translate(d) )
2421 return 0;
2423 omfn = gfn_to_mfn_query(d, gfn, &ot);
2424 /* At the moment we only allow the p2m entry to change if the gfn has
2425 * already been made shareable first */
2426 ASSERT(p2m_is_shared(ot));
2427 ASSERT(mfn_valid(omfn));
2428 /* XXX: M2P translations have to be handled properly for shared pages */
2429 set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
2431 P2M_DEBUG("set shared %lx %lx\n", gfn, mfn_x(mfn));
2432 rc = set_p2m_entry(d, gfn, mfn, 0, p2m_ram_shared);
2433 if ( 0 == rc )
2434 gdprintk(XENLOG_ERR,
2435 "set_mmio_p2m_entry: set_p2m_entry failed! mfn=%08lx\n",
2436 gmfn_to_mfn(d, gfn));
2437 return rc;
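/* Overview of the mem-paging helpers below, as implemented here: a pager
 * first nominates a gfn (type becomes p2m_ram_paging_out) and then evicts
 * it (the frame is freed and the entry becomes p2m_ram_paged).  A later
 * access calls p2m_mem_paging_populate(), which flags the entry
 * p2m_ram_paging_in_start and sends a request to the pager; the pager
 * allocates a fresh frame via p2m_mem_paging_prep() (p2m_ram_paging_in),
 * is expected to restore the page contents, and finally
 * p2m_mem_paging_resume() switches the entry back to p2m_ram_rw and
 * unpauses the faulting vcpu. */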
2440 int p2m_mem_paging_nominate(struct domain *d, unsigned long gfn)
2442 struct page_info *page;
2443 p2m_type_t p2mt;
2444 mfn_t mfn;
2445 int ret;
2447 mfn = gfn_to_mfn(d, gfn, &p2mt);
2449 /* Check if mfn is valid */
2450 ret = -EINVAL;
2451 if ( !mfn_valid(mfn) )
2452 goto out;
2454 /* Check p2m type */
2455 ret = -EAGAIN;
2456 if ( !p2m_is_pageable(p2mt) )
2457 goto out;
2459 /* Check for io memory page */
2460 if ( is_iomem_page(mfn_x(mfn)) )
2461 goto out;
2463 /* Check page count and type */
2464 page = mfn_to_page(mfn);
2465 if ( (page->count_info & (PGC_count_mask | PGC_allocated)) !=
2466 (1 | PGC_allocated) )
2467 goto out;
2469 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_none )
2470 goto out;
2472 /* Fix p2m entry */
2473 p2m_lock(d->arch.p2m);
2474 set_p2m_entry(d, gfn, mfn, 0, p2m_ram_paging_out);
2475 p2m_unlock(d->arch.p2m);
2477 ret = 0;
2479 out:
2480 return ret;
2483 int p2m_mem_paging_evict(struct domain *d, unsigned long gfn)
2485 struct page_info *page;
2486 p2m_type_t p2mt;
2487 mfn_t mfn;
2489 /* Get mfn */
2490 mfn = gfn_to_mfn(d, gfn, &p2mt);
2491 if ( unlikely(!mfn_valid(mfn)) )
2492 return -EINVAL;
2494 if ( (p2mt == p2m_ram_paged) || (p2mt == p2m_ram_paging_in) ||
2495 (p2mt == p2m_ram_paging_in_start) )
2496 return -EINVAL;
2498 /* Get the page so it doesn't get modified under Xen's feet */
2499 page = mfn_to_page(mfn);
2500 if ( unlikely(!get_page(page, d)) )
2501 return -EINVAL;
2503 /* Decrement guest domain's ref count of the page */
2504 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
2505 put_page(page);
2507 /* Remove mapping from p2m table */
2508 p2m_lock(d->arch.p2m);
2509 set_p2m_entry(d, gfn, _mfn(PAGING_MFN), 0, p2m_ram_paged);
2510 p2m_unlock(d->arch.p2m);
2512 /* Put the page back so it gets freed */
2513 put_page(page);
2515 return 0;
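/* A minimal sketch (hypothetical helper): the nominate/evict pair as a
 * single page-out step.  Nomination fails with -EAGAIN for pages that are
 * not currently pageable, in which case a pager would just pick another
 * gfn. */
static int example_page_out_one(struct domain *d, unsigned long gfn)
{
    int rc = p2m_mem_paging_nominate(d, gfn);

    if ( rc == 0 )
        rc = p2m_mem_paging_evict(d, gfn);

    return rc;
}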
2518 void p2m_mem_paging_populate(struct domain *d, unsigned long gfn)
2520 struct vcpu *v = current;
2521 mem_event_request_t req;
2522 p2m_type_t p2mt;
2524 memset(&req, 0, sizeof(req));
2526 /* Check that there's space on the ring for this request */
2527 if ( mem_event_check_ring(d) )
2528 return;
2530 /* Fix p2m mapping */
2531 /* XXX: It seems inefficient to have this here, as it's only needed
2532 * in one case (an EPT guest accessing a page that is being paged out) */
2533 gfn_to_mfn(d, gfn, &p2mt);
2534 if ( p2mt != p2m_ram_paging_out )
2536 p2m_lock(d->arch.p2m);
2537 set_p2m_entry(d, gfn, _mfn(PAGING_MFN), 0, p2m_ram_paging_in_start);
2538 p2m_unlock(d->arch.p2m);
2541 /* Pause the current vcpu only if it belongs to the target domain */
2542 if ( v->domain->domain_id == d->domain_id )
2544 vcpu_pause_nosync(v);
2545 req.flags |= MEM_EVENT_FLAG_VCPU_PAUSED;
2548 /* Send request to pager */
2549 req.gfn = gfn;
2550 req.p2mt = p2mt;
2551 req.vcpu_id = v->vcpu_id;
2553 mem_event_put_request(d, &req);
2556 int p2m_mem_paging_prep(struct domain *d, unsigned long gfn)
2558 struct page_info *page;
2560 /* Get a free page */
2561 page = alloc_domheap_page(d, 0);
2562 if ( unlikely(page == NULL) )
2563 return -EINVAL;
2565 /* Fix p2m mapping */
2566 p2m_lock(d->arch.p2m);
2567 set_p2m_entry(d, gfn, page_to_mfn(page), 0, p2m_ram_paging_in);
2568 p2m_unlock(d->arch.p2m);
2570 return 0;
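/* Note: prep only installs an empty, freshly allocated frame as
 * p2m_ram_paging_in; the pager is expected to copy the saved contents back
 * into that frame (e.g. through a foreign mapping of the gfn) before
 * posting the response that p2m_mem_paging_resume() consumes below. */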
2573 void p2m_mem_paging_resume(struct domain *d)
2575 mem_event_response_t rsp;
2576 p2m_type_t p2mt;
2577 mfn_t mfn;
2579 /* Pull the response off the ring */
2580 mem_event_get_response(d, &rsp);
2582 /* Fix p2m entry */
2583 mfn = gfn_to_mfn(d, rsp.gfn, &p2mt);
2584 p2m_lock(d->arch.p2m);
2585 set_p2m_entry(d, rsp.gfn, mfn, 0, p2m_ram_rw);
2586 p2m_unlock(d->arch.p2m);
2588 /* Unpause domain */
2589 if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED )
2590 vcpu_unpause(d->vcpu[rsp.vcpu_id]);
2592 /* Unpause any vcpus that were paused because the ring was full */
2593 mem_event_unpause_vcpus(d);
2597 /*
2598 * Local variables:
2599 * mode: C
2600 * c-set-style: "BSD"
2601 * c-basic-offset: 4
2602 * indent-tabs-mode: nil
2603 * End:
2604 */