debuggers.hg

view xen/arch/x86/memory.c @ 3668:d55d523078f7

bitkeeper revision 1.1159.212.77 (4202221693AFbvFZWeMHHIjQfbzTIQ)

More x86_64 progress. Many more gaps filled in. Next step is DOM0
construction.
Signed-off-by: keir.fraser@cl.cam.ac.uk
author kaf24@scramble.cl.cam.ac.uk
date Thu Feb 03 13:07:34 2005 +0000 (2005-02-03)
parents 060c1ea52343
children 677cb76cff18
line source
1 /******************************************************************************
2 * arch/x86/memory.c
3 *
4 * Copyright (c) 2002-2004 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame need not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
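/*
 * Illustrative sketch only, not hypervisor code: how a guest might batch
 * page-table updates through the (ptr, val) interface described above. It
 * assumes a guest-side hypercall wrapper HYPERVISOR_mmu_update(req, count,
 * done) corresponding to do_mmu_update() below; the arrays and addresses
 * are hypothetical.
 */
#if 0
static void guest_example_batch_pte_updates(
    unsigned long *pte_maddr, unsigned long *new_val, int n)
{
    mmu_update_t req[16];
    int i, done;

    for ( i = 0; (i < n) && (i < 16); i++ )
    {
        /* The command is carried in the low bits of 'ptr'; normal write here. */
        req[i].ptr = pte_maddr[i] | MMU_NORMAL_PT_UPDATE;
        req[i].val = new_val[i];
    }

    (void)HYPERVISOR_mmu_update(req, i, &done);
}
#endif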
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/sched.h>
93 #include <xen/errno.h>
94 #include <xen/perfc.h>
95 #include <xen/irq.h>
96 #include <xen/softirq.h>
97 #include <asm/shadow.h>
98 #include <asm/page.h>
99 #include <asm/flushtlb.h>
100 #include <asm/io.h>
101 #include <asm/uaccess.h>
102 #include <asm/domain_page.h>
103 #include <asm/ldt.h>
105 #ifdef VERBOSE
106 #define MEM_LOG(_f, _a...) \
107 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
108 current->domain->id , __LINE__ , ## _a )
109 #else
110 #define MEM_LOG(_f, _a...) ((void)0)
111 #endif
113 static int alloc_l2_table(struct pfn_info *page);
114 static int alloc_l1_table(struct pfn_info *page);
115 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
116 static int get_page_and_type_from_pagenr(unsigned long page_nr,
117 u32 type,
118 struct domain *d);
120 static void free_l2_table(struct pfn_info *page);
121 static void free_l1_table(struct pfn_info *page);
123 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
124 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
126 /* Used to defer flushing of memory structures. */
127 static struct {
128 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
129 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
130 unsigned long deferred_ops;
131 /* If non-NULL, specifies a foreign subject domain for some operations. */
132 struct domain *foreign;
133 } __cacheline_aligned percpu_info[NR_CPUS];
135 /*
136 * Returns the current foreign domain; defaults to the currently-executing
137 * domain if a foreign override hasn't been specified.
138 */
139 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
141 /* Private domain structs for DOMID_XEN and DOMID_IO. */
142 static struct domain *dom_xen, *dom_io;
144 /* Frame table and its size in pages. */
145 struct pfn_info *frame_table;
146 unsigned long frame_table_size;
147 unsigned long max_page;
149 void __init init_frametable(void)
150 {
151 unsigned long i, p;
153 frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
154 frame_table_size = max_page * sizeof(struct pfn_info);
155 frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
157 for ( i = 0; i < frame_table_size; i += (4UL << 20) )
158 {
159 p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
160 if ( p == 0 )
161 panic("Not enough memory for frame table\n");
162 map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p,
163 4UL << 20, PAGE_HYPERVISOR);
164 }
166 memset(frame_table, 0, frame_table_size);
167 }
169 void arch_init_memory(void)
170 {
171 extern void subarch_init_memory(struct domain *);
173 memset(percpu_info, 0, sizeof(percpu_info));
175 /*
176 * Initialise our DOMID_XEN domain.
177 * Any Xen-heap pages that we will allow to be mapped will have
178 * their domain field set to dom_xen.
179 */
180 dom_xen = alloc_domain_struct();
181 atomic_set(&dom_xen->refcnt, 1);
182 dom_xen->id = DOMID_XEN;
184 /*
185 * Initialise our DOMID_IO domain.
186 * This domain owns no pages but is considered a special case when
187 * mapping I/O pages, as the mappings occur at the privilege level of the caller.
188 */
189 dom_io = alloc_domain_struct();
190 atomic_set(&dom_io->refcnt, 1);
191 dom_io->id = DOMID_IO;
193 subarch_init_memory(dom_xen);
194 }
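/*
 * The shadow LDT occupies slots 16-31 of the per-domain mapping area
 * (mm.perdomain_ptes). map_ldt_shadow_page() below populates those slots on
 * demand; this routine tears them all down again and drops the page
 * references that were taken when the slots were mapped.
 */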
196 static void __invalidate_shadow_ldt(struct exec_domain *d)
197 {
198 int i;
199 unsigned long pfn;
200 struct pfn_info *page;
202 d->mm.shadow_ldt_mapcnt = 0;
204 for ( i = 16; i < 32; i++ )
205 {
206 pfn = l1_pgentry_to_pagenr(d->mm.perdomain_ptes[i]);
207 if ( pfn == 0 ) continue;
208 d->mm.perdomain_ptes[i] = mk_l1_pgentry(0);
209 page = &frame_table[pfn];
210 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
211 ASSERT_PAGE_IS_DOMAIN(page, d->domain);
212 put_page_and_type(page);
213 }
215 /* Dispose of the (now possibly invalid) mappings from the TLB. */
216 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
217 }
220 static inline void invalidate_shadow_ldt(struct exec_domain *d)
221 {
222 if ( d->mm.shadow_ldt_mapcnt != 0 )
223 __invalidate_shadow_ldt(d);
224 }
227 static int alloc_segdesc_page(struct pfn_info *page)
228 {
229 unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
230 int i;
232 for ( i = 0; i < 512; i++ )
233 if ( unlikely(!check_descriptor(&descs[i*2])) )
234 goto fail;
236 unmap_domain_mem(descs);
237 return 1;
239 fail:
240 unmap_domain_mem(descs);
241 return 0;
242 }
245 /* Map shadow page at offset @off. */
246 int map_ldt_shadow_page(unsigned int off)
247 {
248 struct exec_domain *ed = current;
249 struct domain *d = ed->domain;
250 unsigned long l1e;
252 if ( unlikely(in_irq()) )
253 BUG();
255 __get_user(l1e, (unsigned long *)&linear_pg_table[(ed->mm.ldt_base >>
256 PAGE_SHIFT) + off]);
258 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
259 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
260 d, PGT_ldt_page)) )
261 return 0;
263 ed->mm.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
264 ed->mm.shadow_ldt_mapcnt++;
266 return 1;
267 }
270 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
271 {
272 struct pfn_info *page = &frame_table[page_nr];
274 if ( unlikely(!pfn_is_ram(page_nr)) )
275 {
276 MEM_LOG("Pfn %08lx is not RAM", page_nr);
277 return 0;
278 }
280 if ( unlikely(!get_page(page, d)) )
281 {
282 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
283 return 0;
284 }
286 return 1;
287 }
290 static int get_page_and_type_from_pagenr(unsigned long page_nr,
291 u32 type,
292 struct domain *d)
293 {
294 struct pfn_info *page = &frame_table[page_nr];
296 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
297 return 0;
299 if ( unlikely(!get_page_type(page, type)) )
300 {
301 #ifdef VERBOSE
302 if ( (type & PGT_type_mask) != PGT_l1_page_table )
303 MEM_LOG("Bad page type for pfn %08lx (%08x)",
304 page_nr, page->u.inuse.type_info);
305 #endif
306 put_page(page);
307 return 0;
308 }
310 return 1;
311 }
314 /*
315 * We allow L2 tables to map each other (a.k.a. linear page tables). This
316 * needs some special care with reference counts and access permissions:
317 * 1. The mapping entry must be read-only, or the guest may get write access
318 * to its own PTEs.
319 * 2. We must only bump the reference counts for an *already validated*
320 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
321 * on a validation that is required to complete that validation.
322 * 3. We only need to increment the reference counts for the mapped page
323 * frame if it is mapped by a different L2 table. This is sufficient and
324 * also necessary to allow validation of an L2 table mapping itself.
325 */
326 static int
327 get_linear_pagetable(
328 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
329 {
330 u32 x, y;
331 struct pfn_info *page;
333 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
334 {
335 MEM_LOG("Attempt to create linear p.t. with write perms");
336 return 0;
337 }
339 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
340 {
341 /* Make sure the mapped frame belongs to the correct domain. */
342 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
343 return 0;
345 /*
346 * Make sure that the mapped frame is an already-validated L2 table.
347 * If so, atomically increment the count (checking for overflow).
348 */
349 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
350 y = page->u.inuse.type_info;
351 do {
352 x = y;
353 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
354 unlikely((x & (PGT_type_mask|PGT_validated)) !=
355 (PGT_l2_page_table|PGT_validated)) )
356 {
357 put_page(page);
358 return 0;
359 }
360 }
361 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
362 }
364 return 1;
365 }
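/*
 * Illustrative sketch only, not hypervisor code: the kind of self-
 * referencing entry a guest would install to obtain a linear page table,
 * following the rules above get_linear_pagetable(). Per rule 1 the entry
 * must not have _PAGE_RW set. 'l2_maddr' is hypothetical.
 */
#if 0
static l2_pgentry_t guest_example_linear_l2e(unsigned long l2_maddr)
{
    /* Map the L2 table into itself, read-only. */
    return mk_l2_pgentry((l2_maddr & PAGE_MASK) | _PAGE_PRESENT);
}
#endif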
368 static int
369 get_page_from_l1e(
370 l1_pgentry_t l1e, struct domain *d)
371 {
372 unsigned long l1v = l1_pgentry_val(l1e);
373 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
374 struct pfn_info *page = &frame_table[pfn];
375 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
377 if ( !(l1v & _PAGE_PRESENT) )
378 return 1;
380 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
381 {
382 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
383 return 0;
384 }
386 if ( unlikely(!pfn_is_ram(pfn)) )
387 {
388 /* Revert to caller privileges if FD == DOMID_IO. */
389 if ( d == dom_io )
390 d = current->domain;
392 if ( IS_PRIV(d) )
393 return 1;
395 if ( IS_CAPABLE_PHYSDEV(d) )
396 return domain_iomem_in_pfn(d, pfn);
398 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
399 return 0;
400 }
402 return ((l1v & _PAGE_RW) ?
403 get_page_and_type(page, d, PGT_writable_page) :
404 get_page(page, d));
405 }
408 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
409 static int
410 get_page_from_l2e(
411 l2_pgentry_t l2e, unsigned long pfn,
412 struct domain *d, unsigned long va_idx)
413 {
414 int rc;
416 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
417 return 1;
419 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
420 {
421 MEM_LOG("Bad L2 page type settings %04lx",
422 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
423 return 0;
424 }
426 rc = get_page_and_type_from_pagenr(
427 l2_pgentry_to_pagenr(l2e),
428 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
430 if ( unlikely(!rc) )
431 return get_linear_pagetable(l2e, pfn, d);
433 return 1;
434 }
437 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
438 {
439 unsigned long l1v = l1_pgentry_val(l1e);
440 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
441 struct pfn_info *page = &frame_table[pfn];
442 struct domain *e;
444 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
445 return;
447 e = page->u.inuse.domain;
448 if ( unlikely(e != d) )
449 {
450 /*
451 * Unmap a foreign page that may have been mapped via a grant table.
452 * Note that this can fail for a privileged domain that can map foreign
453 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
454 * counted via a grant entry and some counted directly in the page
455 * structure's reference count. Note that reference counts won't get
456 * dangerously confused as long as we always try to decrement the
457 * grant entry first. We may end up with a mismatch between which
458 * mappings and which unmappings are counted via the grant entry, but
459 * really it doesn't matter as privileged domains have carte blanche.
460 */
461 if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
462 return;
463 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
464 }
466 if ( l1v & _PAGE_RW )
467 {
468 put_page_and_type(page);
469 }
470 else
471 {
472 /* We expect this is rare so we blow the entire shadow LDT. */
473 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
474 PGT_ldt_page)) &&
475 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
476 invalidate_shadow_ldt(e->exec_domain[0]);
477 put_page(page);
478 }
479 }
482 /*
483 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
484 * Note also that this automatically deals correctly with linear p.t.'s.
485 */
486 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
487 {
488 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
489 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
490 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
491 }
494 static int alloc_l2_table(struct pfn_info *page)
495 {
496 struct domain *d = page->u.inuse.domain;
497 unsigned long page_nr = page_to_pfn(page);
498 l2_pgentry_t *pl2e;
499 int i;
501 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
503 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
504 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
505 goto fail;
507 #if defined(__i386__)
508 /* Now we add our private high mappings. */
509 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
510 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
511 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
512 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
513 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
514 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
515 mk_l2_pgentry(__pa(page->u.inuse.domain->mm_perdomain_pt) |
516 __PAGE_HYPERVISOR);
517 #endif
519 unmap_domain_mem(pl2e);
520 return 1;
522 fail:
523 while ( i-- > 0 )
524 put_page_from_l2e(pl2e[i], page_nr);
526 unmap_domain_mem(pl2e);
527 return 0;
528 }
531 static int alloc_l1_table(struct pfn_info *page)
532 {
533 struct domain *d = page->u.inuse.domain;
534 unsigned long page_nr = page_to_pfn(page);
535 l1_pgentry_t *pl1e;
536 int i;
538 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
540 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
541 if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
542 goto fail;
544 unmap_domain_mem(pl1e);
545 return 1;
547 fail:
548 while ( i-- > 0 )
549 put_page_from_l1e(pl1e[i], d);
551 unmap_domain_mem(pl1e);
552 return 0;
553 }
556 static void free_l2_table(struct pfn_info *page)
557 {
558 unsigned long page_nr = page - frame_table;
559 l2_pgentry_t *pl2e;
560 int i;
562 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
564 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
565 put_page_from_l2e(pl2e[i], page_nr);
567 unmap_domain_mem(pl2e);
568 }
571 static void free_l1_table(struct pfn_info *page)
572 {
573 struct domain *d = page->u.inuse.domain;
574 unsigned long page_nr = page - frame_table;
575 l1_pgentry_t *pl1e;
576 int i;
578 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
580 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
581 put_page_from_l1e(pl1e[i], d);
583 unmap_domain_mem(pl1e);
584 }
587 static inline int update_l2e(l2_pgentry_t *pl2e,
588 l2_pgentry_t ol2e,
589 l2_pgentry_t nl2e)
590 {
591 unsigned long o = cmpxchg((unsigned long *)pl2e,
592 l2_pgentry_val(ol2e),
593 l2_pgentry_val(nl2e));
594 if ( o != l2_pgentry_val(ol2e) )
595 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
596 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
597 return (o == l2_pgentry_val(ol2e));
598 }
601 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
602 static int mod_l2_entry(l2_pgentry_t *pl2e,
603 l2_pgentry_t nl2e,
604 unsigned long pfn)
605 {
606 l2_pgentry_t ol2e;
607 unsigned long _ol2e;
609 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
610 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
611 {
612 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
613 return 0;
614 }
616 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
617 return 0;
618 ol2e = mk_l2_pgentry(_ol2e);
620 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
621 {
622 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
623 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
624 return update_l2e(pl2e, ol2e, nl2e);
626 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
627 ((unsigned long)pl2e &
628 ~PAGE_MASK) >> 2)) )
629 return 0;
631 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
632 {
633 put_page_from_l2e(nl2e, pfn);
634 return 0;
635 }
637 put_page_from_l2e(ol2e, pfn);
638 return 1;
639 }
641 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
642 return 0;
644 put_page_from_l2e(ol2e, pfn);
645 return 1;
646 }
649 static inline int update_l1e(l1_pgentry_t *pl1e,
650 l1_pgentry_t ol1e,
651 l1_pgentry_t nl1e)
652 {
653 unsigned long o = l1_pgentry_val(ol1e);
654 unsigned long n = l1_pgentry_val(nl1e);
656 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
657 unlikely(o != l1_pgentry_val(ol1e)) )
658 {
659 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
660 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
661 return 0;
662 }
664 return 1;
665 }
668 /* Update the L1 entry at pl1e to new value nl1e. */
669 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
670 {
671 l1_pgentry_t ol1e;
672 unsigned long _ol1e;
673 struct domain *d = current->domain;
675 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
676 {
677 MEM_LOG("Bad get_user\n");
678 return 0;
679 }
681 ol1e = mk_l1_pgentry(_ol1e);
683 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
684 {
685 /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
686 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
687 return update_l1e(pl1e, ol1e, nl1e);
689 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
690 return 0;
692 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
693 {
694 put_page_from_l1e(nl1e, d);
695 return 0;
696 }
698 put_page_from_l1e(ol1e, d);
699 return 1;
700 }
702 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
703 return 0;
705 put_page_from_l1e(ol1e, d);
706 return 1;
707 }
710 int alloc_page_type(struct pfn_info *page, unsigned int type)
711 {
712 switch ( type )
713 {
714 case PGT_l1_page_table:
715 return alloc_l1_table(page);
716 case PGT_l2_page_table:
717 return alloc_l2_table(page);
718 case PGT_gdt_page:
719 case PGT_ldt_page:
720 return alloc_segdesc_page(page);
721 default:
722 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
723 type, page->u.inuse.type_info,
724 page->count_info);
725 BUG();
726 }
728 return 0;
729 }
732 void free_page_type(struct pfn_info *page, unsigned int type)
733 {
734 struct domain *d = page->u.inuse.domain;
736 switch ( type )
737 {
738 case PGT_l1_page_table:
739 free_l1_table(page);
740 break;
742 case PGT_l2_page_table:
743 free_l2_table(page);
744 break;
746 default:
747 BUG();
748 }
750 if ( unlikely(d->exec_domain[0]->mm.shadow_mode) &&
751 (get_shadow_status(&d->exec_domain[0]->mm, page_to_pfn(page)) & PSH_shadowed) )
752 {
753 unshadow_table(page_to_pfn(page), type);
754 put_shadow_status(&d->exec_domain[0]->mm);
755 }
756 }
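/*
 * Summary of the type_info fields manipulated by put_page_type() and
 * get_page_type() below: PGT_count_mask holds the type reference count,
 * PGT_type_mask the current type, PGT_validated is set once the page
 * contents have been checked for that type, PGT_pinned is set by
 * MMUEXT_PIN_L?_TABLE, and PGT_va_mask holds the 'va backpointer' -- the L2
 * slot at which an L1 table is mapped (PGT_va_mutable until it is fixed,
 * PGT_va_unknown if the table is mapped at multiple locations).
 */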
759 void put_page_type(struct pfn_info *page)
760 {
761 u32 nx, x, y = page->u.inuse.type_info;
763 again:
764 do {
765 x = y;
766 nx = x - 1;
768 ASSERT((x & PGT_count_mask) != 0);
770 /*
771 * The page should always be validated while a reference is held. The
772 * exception is during domain destruction, when we forcibly invalidate
773 * page-table pages if we detect a referential loop.
774 * See domain.c:relinquish_list().
775 */
776 ASSERT((x & PGT_validated) ||
777 test_bit(DF_DYING, &page->u.inuse.domain->d_flags));
779 if ( unlikely((nx & PGT_count_mask) == 0) )
780 {
781 /* Record TLB information for flush later. Races are harmless. */
782 page->tlbflush_timestamp = tlbflush_current_time();
784 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
785 likely(nx & PGT_validated) )
786 {
787 /*
788 * Page-table pages must be unvalidated when count is zero. The
789 * 'free' is safe because the refcnt is non-zero and validated
790 * bit is clear => other ops will spin or fail.
791 */
792 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
793 x & ~PGT_validated)) != x) )
794 goto again;
795 /* We cleared the 'valid bit' so we do the clear up. */
796 free_page_type(page, x & PGT_type_mask);
797 /* Carry on, but with the 'valid bit' now clear. */
798 x &= ~PGT_validated;
799 nx &= ~PGT_validated;
800 }
801 }
802 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
803 (PGT_pinned | 1)) )
804 {
805 /* Page is now only pinned. Make the back pointer mutable again. */
806 nx |= PGT_va_mutable;
807 }
808 }
809 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
810 }
813 int get_page_type(struct pfn_info *page, u32 type)
814 {
815 u32 nx, x, y = page->u.inuse.type_info;
817 again:
818 do {
819 x = y;
820 nx = x + 1;
821 if ( unlikely((nx & PGT_count_mask) == 0) )
822 {
823 MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
824 return 0;
825 }
826 else if ( unlikely((x & PGT_count_mask) == 0) )
827 {
828 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
829 {
830 /*
831 * On type change we check to flush stale TLB entries. This
832 * may be unnecessary (e.g., page was GDT/LDT) but those
833 * circumstances should be very rare.
834 */
835 struct domain *d = page->u.inuse.domain;
836 if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
837 page->tlbflush_timestamp)) )
838 {
839 perfc_incr(need_flush_tlb_flush);
840 flush_tlb_cpu(d->exec_domain[0]->processor);
841 }
843 /* We lose existing type, back pointer, and validity. */
844 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
845 nx |= type;
847 /* No special validation needed for writable pages. */
848 /* Page tables and GDT/LDT need to be scanned for validity. */
849 if ( type == PGT_writable_page )
850 nx |= PGT_validated;
851 }
852 }
853 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
854 {
855 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
856 {
857 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
858 ((type & PGT_type_mask) != PGT_l1_page_table) )
859 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
860 x & PGT_type_mask, type, page_to_pfn(page));
861 return 0;
862 }
863 else if ( (x & PGT_va_mask) == PGT_va_mutable )
864 {
865 /* The va backpointer is mutable, hence we update it. */
866 nx &= ~PGT_va_mask;
867 nx |= type; /* we know the actual type is correct */
868 }
869 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
870 {
871 /* This table is potentially mapped at multiple locations. */
872 nx &= ~PGT_va_mask;
873 nx |= PGT_va_unknown;
874 }
875 }
876 else if ( unlikely(!(x & PGT_validated)) )
877 {
878 /* Someone else is updating validation of this page. Wait... */
879 while ( (y = page->u.inuse.type_info) == x )
880 {
881 rep_nop();
882 barrier();
883 }
884 goto again;
885 }
886 }
887 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
889 if ( unlikely(!(nx & PGT_validated)) )
890 {
891 /* Try to validate page type; drop the new reference on failure. */
892 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
893 {
894 MEM_LOG("Error while validating pfn %08lx for type %08x."
895 " caf=%08x taf=%08x\n",
896 page_to_pfn(page), type,
897 page->count_info,
898 page->u.inuse.type_info);
899 /* No one else can get a reference. We hold the only ref. */
900 page->u.inuse.type_info = 0;
901 return 0;
902 }
904 /* No one else is updating simultaneously. */
905 __set_bit(_PGT_validated, &page->u.inuse.type_info);
906 }
908 return 1;
909 }
912 int new_guest_cr3(unsigned long pfn)
913 {
914 struct exec_domain *ed = current;
915 struct domain *d = ed->domain;
916 int okay, cpu = smp_processor_id();
917 unsigned long old_base_pfn;
919 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
920 if ( likely(okay) )
921 {
922 invalidate_shadow_ldt(ed);
924 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
925 old_base_pfn = pagetable_val(ed->mm.pagetable) >> PAGE_SHIFT;
926 ed->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
928 shadow_mk_pagetable(&ed->mm);
930 write_ptbase(&ed->mm);
932 put_page_and_type(&frame_table[old_base_pfn]);
933 }
934 else
935 {
936 MEM_LOG("Error while installing new baseptr %08lx", pfn);
937 }
939 return okay;
940 }
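/*
 * Illustrative sketch only, not hypervisor code: how a guest might request
 * the page-table switch performed by new_guest_cr3(). It assumes a
 * guest-side hypercall wrapper HYPERVISOR_mmu_update(req, count, done)
 * corresponding to do_mmu_update() below; 'new_l2_maddr' is hypothetical.
 */
#if 0
static void guest_example_new_baseptr(unsigned long new_l2_maddr)
{
    mmu_update_t req;
    int done;

    /* Frame address in 'ptr', sub-command in the low bits of 'val'. */
    req.ptr = (new_l2_maddr & PAGE_MASK) | MMU_EXTENDED_COMMAND;
    req.val = MMUEXT_NEW_BASEPTR;
    (void)HYPERVISOR_mmu_update(&req, 1, &done);
}
#endif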
942 static int do_extended_command(unsigned long ptr, unsigned long val)
943 {
944 int okay = 1, cpu = smp_processor_id();
945 unsigned int cmd = val & MMUEXT_CMD_MASK;
946 unsigned long pfn = ptr >> PAGE_SHIFT;
947 struct pfn_info *page = &frame_table[pfn];
948 struct exec_domain *ed = current;
949 struct domain *d = ed->domain, *nd, *e;
950 u32 x, y;
951 domid_t domid;
952 grant_ref_t gntref;
954 switch ( cmd )
955 {
956 case MMUEXT_PIN_L1_TABLE:
957 case MMUEXT_PIN_L2_TABLE:
958 /*
959 * We insist that, if you pin an L1 page, it's the first thing that
960 * you do to it. This is because we require the backptr to still be
961 * mutable. This assumption seems safe.
962 */
963 okay = get_page_and_type_from_pagenr(
964 pfn,
965 ((cmd==MMUEXT_PIN_L2_TABLE) ?
966 PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
967 FOREIGNDOM);
969 if ( unlikely(!okay) )
970 {
971 MEM_LOG("Error while pinning pfn %08lx", pfn);
972 break;
973 }
975 if ( unlikely(test_and_set_bit(_PGT_pinned,
976 &page->u.inuse.type_info)) )
977 {
978 MEM_LOG("Pfn %08lx already pinned", pfn);
979 put_page_and_type(page);
980 okay = 0;
981 break;
982 }
984 break;
986 case MMUEXT_UNPIN_TABLE:
987 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
988 {
989 MEM_LOG("Page %08lx bad domain (dom=%p)",
990 ptr, page->u.inuse.domain);
991 }
992 else if ( likely(test_and_clear_bit(_PGT_pinned,
993 &page->u.inuse.type_info)) )
994 {
995 put_page_and_type(page);
996 put_page(page);
997 }
998 else
999 {
1000 okay = 0;
1001 put_page(page);
1002 MEM_LOG("Pfn %08lx not pinned", pfn);
1004 break;
1006 case MMUEXT_NEW_BASEPTR:
1007 okay = new_guest_cr3(pfn);
1008 break;
1010 case MMUEXT_TLB_FLUSH:
1011 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1012 break;
1014 case MMUEXT_INVLPG:
1015 __flush_tlb_one(ptr);
1016 break;
1018 case MMUEXT_FLUSH_CACHE:
1019 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1021 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1022 okay = 0;
1024 else
1026 wbinvd();
1028 break;
1030 case MMUEXT_SET_LDT:
1032 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
1033 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1034 (ents > 8192) ||
1035 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
1036 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
1038 okay = 0;
1039 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
1041 else if ( (ed->mm.ldt_ents != ents) ||
1042 (ed->mm.ldt_base != ptr) )
1044 invalidate_shadow_ldt(ed);
1045 ed->mm.ldt_base = ptr;
1046 ed->mm.ldt_ents = ents;
1047 load_LDT(ed);
1048 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1049 if ( ents != 0 )
1050 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1052 break;
1055 case MMUEXT_SET_FOREIGNDOM:
1056 domid = (domid_t)(val >> 16);
1058 if ( (e = percpu_info[cpu].foreign) != NULL )
1059 put_domain(e);
1060 percpu_info[cpu].foreign = NULL;
1062 if ( !IS_PRIV(d) )
1064 switch ( domid )
1066 case DOMID_IO:
1067 get_knownalive_domain(dom_io);
1068 percpu_info[cpu].foreign = dom_io;
1069 break;
1070 default:
1071 MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
1072 okay = 0;
1073 break;
1076 else
1078 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1079 if ( e == NULL )
1081 switch ( domid )
1083 case DOMID_XEN:
1084 get_knownalive_domain(dom_xen);
1085 percpu_info[cpu].foreign = dom_xen;
1086 break;
1087 case DOMID_IO:
1088 get_knownalive_domain(dom_io);
1089 percpu_info[cpu].foreign = dom_io;
1090 break;
1091 default:
1092 MEM_LOG("Unknown domain '%u'", domid);
1093 okay = 0;
1094 break;
1098 break;
1100 case MMUEXT_TRANSFER_PAGE:
1101 domid = (domid_t)(val >> 16);
1102 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
1104 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
1105 unlikely(!pfn_is_ram(pfn)) ||
1106 unlikely((e = find_domain_by_id(domid)) == NULL) )
1108 MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
1109 okay = 0;
1110 break;
1113 spin_lock(&d->page_alloc_lock);
1115 /*
1116 * The tricky bit: atomically release ownership while there is just one
1117 * benign reference to the page (PGC_allocated). If that reference
1118 * disappears then the deallocation routine will safely spin.
1119 */
1120 nd = page->u.inuse.domain;
1121 y = page->count_info;
1122 do {
1123 x = y;
1124 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1125 (1|PGC_allocated)) ||
1126 unlikely(nd != d) )
1128 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1129 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1130 d, d->id, nd, x, page->u.inuse.type_info);
1131 spin_unlock(&d->page_alloc_lock);
1132 put_domain(e);
1133 return 0;
1135 __asm__ __volatile__(
1136 LOCK_PREFIX "cmpxchg8b %2"
1137 : "=d" (nd), "=a" (y),
1138 "=m" (*(volatile u64 *)(&page->count_info))
1139 : "0" (d), "1" (x), "c" (NULL), "b" (x) );
1141 while ( unlikely(nd != d) || unlikely(y != x) );
1143 /*
1144 * Unlink from 'd'. At least one reference remains (now anonymous), so
1145 * no one else is spinning to try to delete this page from 'd'.
1146 */
1147 d->tot_pages--;
1148 list_del(&page->list);
1150 spin_unlock(&d->page_alloc_lock);
1152 spin_lock(&e->page_alloc_lock);
1154 /*
1155 * Check that 'e' will accept the page and has reservation headroom.
1156 * Also, a domain mustn't have PGC_allocated pages when it is dying.
1157 */
1158 ASSERT(e->tot_pages <= e->max_pages);
1159 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1160 unlikely(e->tot_pages == e->max_pages) ||
1161 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
1163 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1164 "provided a bad grant ref, or is dying (%08lx).\n",
1165 e->tot_pages, e->max_pages, e->d_flags);
1166 spin_unlock(&e->page_alloc_lock);
1167 put_domain(e);
1168 okay = 0;
1169 break;
1172 /* Okay, add the page to 'e'. */
1173 if ( unlikely(e->tot_pages++ == 0) )
1174 get_knownalive_domain(e);
1175 list_add_tail(&page->list, &e->page_list);
1176 page->u.inuse.domain = e;
1178 spin_unlock(&e->page_alloc_lock);
1180 /* Transfer is all done: tell the guest about its new page frame. */
1181 gnttab_notify_transfer(e, gntref, pfn);
1183 put_domain(e);
1184 break;
1186 case MMUEXT_REASSIGN_PAGE:
1187 if ( unlikely(!IS_PRIV(d)) )
1189 MEM_LOG("Dom %u has no reassignment priv", d->id);
1190 okay = 0;
1191 break;
1194 e = percpu_info[cpu].foreign;
1195 if ( unlikely(e == NULL) )
1197 MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
1198 okay = 0;
1199 break;
1202 /*
1203 * Grab both page_list locks, in order. This prevents the page from
1204 * disappearing elsewhere while we modify the owner, and we'll need
1205 * both locks if we're successful so that we can change lists.
1206 */
1207 if ( d < e )
1209 spin_lock(&d->page_alloc_lock);
1210 spin_lock(&e->page_alloc_lock);
1212 else
1214 spin_lock(&e->page_alloc_lock);
1215 spin_lock(&d->page_alloc_lock);
1218 /* A domain shouldn't have PGC_allocated pages when it is dying. */
1219 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1220 unlikely(IS_XEN_HEAP_FRAME(page)) )
1222 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
1223 okay = 0;
1224 goto reassign_fail;
1227 /*
1228 * The tricky bit: atomically change owner while there is just one
1229 * benign reference to the page (PGC_allocated). If that reference
1230 * disappears then the deallocation routine will safely spin.
1231 */
1232 nd = page->u.inuse.domain;
1233 y = page->count_info;
1234 do {
1235 x = y;
1236 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1237 (1|PGC_allocated)) ||
1238 unlikely(nd != d) )
1240 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1241 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1242 d, d->id, nd, x, page->u.inuse.type_info);
1243 okay = 0;
1244 goto reassign_fail;
1246 __asm__ __volatile__(
1247 LOCK_PREFIX "cmpxchg8b %3"
1248 : "=d" (nd), "=a" (y), "=c" (e),
1249 "=m" (*(volatile u64 *)(&page->count_info))
1250 : "0" (d), "1" (x), "c" (e), "b" (x) );
1252 while ( unlikely(nd != d) || unlikely(y != x) );
1254 /*
1255 * Unlink from 'd'. We transferred at least one reference to 'e', so
1256 * no one else is spinning to try to delete this page from 'd'.
1257 */
1258 d->tot_pages--;
1259 list_del(&page->list);
1261 /*
1262 * Add the page to 'e'. Someone may already have removed the last
1263 * reference and want to remove the page from 'e'. However, we have
1264 * the lock so they'll spin waiting for us.
1265 */
1266 if ( unlikely(e->tot_pages++ == 0) )
1267 get_knownalive_domain(e);
1268 list_add_tail(&page->list, &e->page_list);
1270 reassign_fail:
1271 spin_unlock(&d->page_alloc_lock);
1272 spin_unlock(&e->page_alloc_lock);
1273 break;
1275 case MMUEXT_CLEAR_FOREIGNDOM:
1276 if ( (e = percpu_info[cpu].foreign) != NULL )
1277 put_domain(e);
1278 percpu_info[cpu].foreign = NULL;
1279 break;
1281 default:
1282 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
1283 okay = 0;
1284 break;
1287 return okay;
1290 int do_mmu_update(
1291 mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
1293 /*
1294 * We steal the m.s.b. of the @count parameter to indicate whether this
1295 * invocation of do_mmu_update() is resuming a previously preempted call.
1296 * We steal the next 15 bits to remember the current FOREIGNDOM.
1297 */
1298 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
1299 #define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
1300 #define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
1302 mmu_update_t req;
1303 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
1304 struct pfn_info *page;
1305 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
1306 unsigned int cmd, done = 0;
1307 unsigned long prev_spfn = 0;
1308 l1_pgentry_t *prev_spl1e = 0;
1309 struct exec_domain *ed = current;
1310 struct domain *d = ed->domain;
1311 u32 type_info;
1312 domid_t domid;
1314 LOCK_BIGLOCK(d);
1316 cleanup_writable_pagetable(d);
1318 /*
1319 * If we are resuming after preemption, read how much work we have already
1320 * done. This allows us to set the @done output parameter correctly.
1321 * We also reset FOREIGNDOM here.
1322 */
1323 if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
1325 if ( !(count & MMU_UPDATE_PREEMPTED) )
1327 /* Count overflow into private FOREIGNDOM field. */
1328 MEM_LOG("do_mmu_update count is too large");
1329 rc = -EINVAL;
1330 goto out;
1332 count &= ~MMU_UPDATE_PREEMPTED;
1333 domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
1334 count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
1335 if ( unlikely(pdone != NULL) )
1336 (void)get_user(done, pdone);
1337 if ( (domid != current->domain->id) &&
1338 !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
1340 rc = -EINVAL;
1341 goto out;
1345 perfc_incrc(calls_to_mmu_update);
1346 perfc_addc(num_page_updates, count);
1348 if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
1350 rc = -EFAULT;
1351 goto out;
1354 for ( i = 0; i < count; i++ )
1356 if ( hypercall_preempt_check() )
1358 rc = hypercall_create_continuation(
1359 __HYPERVISOR_mmu_update, 3, ureqs,
1360 (count - i) |
1361 (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) |
1362 MMU_UPDATE_PREEMPTED, pdone);
1363 break;
1366 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
1368 MEM_LOG("Bad __copy_from_user");
1369 rc = -EFAULT;
1370 break;
1373 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
1374 pfn = req.ptr >> PAGE_SHIFT;
1376 okay = 0;
1378 switch ( cmd )
1380 /*
1381 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
1382 */
1383 case MMU_NORMAL_PT_UPDATE:
1384 if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
1386 MEM_LOG("Could not get page for normal update");
1387 break;
1390 if ( likely(prev_pfn == pfn) )
1392 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
1394 else
1396 if ( prev_pfn != 0 )
1397 unmap_domain_mem((void *)va);
1398 va = (unsigned long)map_domain_mem(req.ptr);
1399 prev_pfn = pfn;
1402 page = &frame_table[pfn];
1403 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
1405 case PGT_l1_page_table:
1406 if ( likely(get_page_type(
1407 page, type_info & (PGT_type_mask|PGT_va_mask))) )
1409 okay = mod_l1_entry((l1_pgentry_t *)va,
1410 mk_l1_pgentry(req.val));
1412 if ( unlikely(ed->mm.shadow_mode) && okay &&
1413 (get_shadow_status(&ed->mm, page-frame_table) &
1414 PSH_shadowed) )
1416 shadow_l1_normal_pt_update(
1417 req.ptr, req.val, &prev_spfn, &prev_spl1e);
1418 put_shadow_status(&ed->mm);
1421 put_page_type(page);
1423 break;
1424 case PGT_l2_page_table:
1425 if ( likely(get_page_type(page, PGT_l2_page_table)) )
1427 okay = mod_l2_entry((l2_pgentry_t *)va,
1428 mk_l2_pgentry(req.val),
1429 pfn);
1431 if ( unlikely(ed->mm.shadow_mode) && okay &&
1432 (get_shadow_status(&ed->mm, page-frame_table) &
1433 PSH_shadowed) )
1435 shadow_l2_normal_pt_update(req.ptr, req.val);
1436 put_shadow_status(&ed->mm);
1439 put_page_type(page);
1441 break;
1442 default:
1443 if ( likely(get_page_type(page, PGT_writable_page)) )
1445 *(unsigned long *)va = req.val;
1446 okay = 1;
1447 put_page_type(page);
1449 break;
1452 put_page(page);
1453 break;
1455 case MMU_MACHPHYS_UPDATE:
1456 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
1458 MEM_LOG("Could not get page for mach->phys update");
1459 break;
1462 machine_to_phys_mapping[pfn] = req.val;
1463 okay = 1;
1465 /*
1466 * If in log-dirty mode, mark the corresponding pseudo-physical
1467 * page as dirty.
1468 */
1469 if ( unlikely(ed->mm.shadow_mode == SHM_logdirty) &&
1470 mark_dirty(&ed->mm, pfn) )
1471 ed->mm.shadow_dirty_block_count++;
1473 put_page(&frame_table[pfn]);
1474 break;
1476 /*
1477 * MMU_EXTENDED_COMMAND: Extended command is specified
1478 * in the least-significant bits of the 'value' field.
1479 */
1480 case MMU_EXTENDED_COMMAND:
1481 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
1482 okay = do_extended_command(req.ptr, req.val);
1483 break;
1485 default:
1486 MEM_LOG("Invalid page update command %08lx", req.ptr);
1487 break;
1490 if ( unlikely(!okay) )
1492 rc = -EINVAL;
1493 break;
1496 ureqs++;
1499 out:
1500 if ( prev_pfn != 0 )
1501 unmap_domain_mem((void *)va);
1503 if ( unlikely(prev_spl1e != 0) )
1504 unmap_domain_mem((void *)prev_spl1e);
1506 deferred_ops = percpu_info[cpu].deferred_ops;
1507 percpu_info[cpu].deferred_ops = 0;
1509 if ( deferred_ops & DOP_FLUSH_TLB )
1510 local_flush_tlb();
1512 if ( deferred_ops & DOP_RELOAD_LDT )
1513 (void)map_ldt_shadow_page(0);
1515 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1517 put_domain(percpu_info[cpu].foreign);
1518 percpu_info[cpu].foreign = NULL;
1521 /* Add incremental work we have done to the @done output parameter. */
1522 if ( unlikely(pdone != NULL) )
1523 __put_user(done + i, pdone);
1525 UNLOCK_BIGLOCK(d);
1526 return rc;
1530 int do_update_va_mapping(unsigned long page_nr,
1531 unsigned long val,
1532 unsigned long flags)
1534 struct exec_domain *ed = current;
1535 struct domain *d = ed->domain;
1536 int err = 0;
1537 unsigned int cpu = ed->processor;
1538 unsigned long deferred_ops;
1540 perfc_incrc(calls_to_update_va);
1542 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
1543 return -EINVAL;
1545 LOCK_BIGLOCK(d);
1547 cleanup_writable_pagetable(d);
1549 /*
1550 * XXX When we make this support 4MB superpages we should also deal with
1551 * the case of updating L2 entries.
1552 */
1554 if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
1555 mk_l1_pgentry(val))) )
1556 err = -EINVAL;
1558 if ( unlikely(ed->mm.shadow_mode) )
1560 unsigned long sval;
1562 l1pte_propagate_from_guest(&ed->mm, &val, &sval);
1564 if ( unlikely(__put_user(sval, ((unsigned long *)(
1565 &shadow_linear_pg_table[page_nr])))) )
1567 /*
1568 * Since L2's are guaranteed RW, failure indicates the page was not
1569 * shadowed, so ignore.
1570 */
1571 perfc_incrc(shadow_update_va_fail);
1574 /*
1575 * If we're in log-dirty mode then we need to note that we've updated
1576 * the PTE in the PT-holding page. We need the machine frame number
1577 * for this.
1578 */
1579 if ( ed->mm.shadow_mode == SHM_logdirty )
1580 mark_dirty(&current->mm, va_to_l1mfn(page_nr << PAGE_SHIFT));
1582 check_pagetable(&ed->mm, ed->mm.pagetable, "va"); /* debug */
1585 deferred_ops = percpu_info[cpu].deferred_ops;
1586 percpu_info[cpu].deferred_ops = 0;
1588 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
1589 unlikely(flags & UVMF_FLUSH_TLB) )
1590 local_flush_tlb();
1591 else if ( unlikely(flags & UVMF_INVLPG) )
1592 __flush_tlb_one(page_nr << PAGE_SHIFT);
1594 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
1595 (void)map_ldt_shadow_page(0);
1597 UNLOCK_BIGLOCK(d);
1599 return err;
1602 int do_update_va_mapping_otherdomain(unsigned long page_nr,
1603 unsigned long val,
1604 unsigned long flags,
1605 domid_t domid)
1607 unsigned int cpu = smp_processor_id();
1608 struct domain *d;
1609 int rc;
1611 if ( unlikely(!IS_PRIV(current->domain)) )
1612 return -EPERM;
1614 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
1615 if ( unlikely(d == NULL) )
1617 MEM_LOG("Unknown domain '%u'", domid);
1618 return -ESRCH;
1621 rc = do_update_va_mapping(page_nr, val, flags);
1623 put_domain(d);
1624 percpu_info[cpu].foreign = NULL;
1626 return rc;
1631 /*************************
1632 * Writable Pagetables
1633 */
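/*
 * Guest page-table pages are normally mapped read-only. The machinery below
 * lets a guest write its own L1 tables directly: ptwr_do_page_fault()
 * catches the write fault, snapshots the page, temporarily grants the guest
 * a writable mapping (disconnecting an ACTIVE page from the current L2),
 * and ptwr_flush() later re-protects the page, compares it against the
 * snapshot and revalidates any modified PTEs. At most one ACTIVE and one
 * INACTIVE page may be open per CPU at any time.
 */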
1635 ptwr_info_t ptwr_info[NR_CPUS];
1637 #ifdef VERBOSE
1638 int ptwr_debug = 0x0;
1639 #define PTWR_PRINTK(_f, _a...) \
1640 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
1641 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
1642 #else
1643 #define PTWR_PRINTK(_f, _a...) ((void)0)
1644 #endif
1646 /* Flush the given writable p.t. page and write-protect it again. */
1647 void ptwr_flush(const int which)
1649 unsigned long sstat, spte, pte, *ptep, l1va;
1650 l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e;
1651 l2_pgentry_t *pl2e;
1652 int i, cpu = smp_processor_id();
1653 struct exec_domain *ed = current;
1654 struct domain *d = ed->domain;
1656 l1va = ptwr_info[cpu].ptinfo[which].l1va;
1657 ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
1659 /*
1660 * STEP 1. Write-protect the p.t. page so no more updates can occur.
1661 */
1663 if ( unlikely(__get_user(pte, ptep)) )
1665 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1666 /*
1667 * Really a bug. We could read this PTE during the initial fault,
1668 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
1669 */
1670 BUG();
1672 PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
1673 PTWR_PRINT_WHICH, ptep, pte);
1674 pte &= ~_PAGE_RW;
1676 if ( unlikely(ed->mm.shadow_mode) )
1678 /* Write-protect the p.t. page in the shadow page table. */
1679 l1pte_propagate_from_guest(&ed->mm, &pte, &spte);
1680 __put_user(
1681 spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
1683 /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
1684 sstat = get_shadow_status(&ed->mm, pte >> PAGE_SHIFT);
1685 if ( sstat & PSH_shadowed )
1686 sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
1689 /* Write-protect the p.t. page in the guest page table. */
1690 if ( unlikely(__put_user(pte, ptep)) )
1692 MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
1693 /*
1694 * Really a bug. We could write this PTE during the initial fault,
1695 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
1696 */
1697 BUG();
1700 /* Ensure that there are no stale writable mappings in any TLB. */
1701 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
1702 #if 1
1703 __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
1704 #else
1705 flush_tlb_all();
1706 #endif
1707 PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
1708 PTWR_PRINT_WHICH, ptep, pte);
1710 /*
1711 * STEP 2. Validate any modified PTEs.
1712 */
1714 pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
1715 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1717 ol1e = ptwr_info[cpu].ptinfo[which].page[i];
1718 nl1e = pl1e[i];
1720 if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
1721 continue;
1723 /*
1724 * Fast path for PTEs that have merely been write-protected
1725 * (e.g., during a Unix fork()). A strict reduction in privilege.
1726 */
1727 if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
1729 if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
1731 if ( unlikely(sl1e != NULL) )
1732 l1pte_propagate_from_guest(
1733 &ed->mm, &l1_pgentry_val(nl1e),
1734 &l1_pgentry_val(sl1e[i]));
1735 put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
1737 continue;
1740 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
1742 MEM_LOG("ptwr: Could not re-validate l1 page\n");
1743 /*
1744 * Make the remaining p.t's consistent before crashing, so the
1745 * reference counts are correct.
1746 */
1747 memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
1748 (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
1749 unmap_domain_mem(pl1e);
1750 ptwr_info[cpu].ptinfo[which].l1va = 0;
1751 UNLOCK_BIGLOCK(d);
1752 domain_crash();
1755 if ( unlikely(sl1e != NULL) )
1756 l1pte_propagate_from_guest(
1757 &ed->mm, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
1759 if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
1760 put_page_from_l1e(ol1e, d);
1762 unmap_domain_mem(pl1e);
1764 /*
1765 * STEP 3. Reattach the L1 p.t. page into the current address space.
1766 */
1768 if ( (which == PTWR_PT_ACTIVE) && likely(!ed->mm.shadow_mode) )
1770 pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
1771 *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
1774 /*
1775 * STEP 4. Final tidy-up.
1776 */
1778 ptwr_info[cpu].ptinfo[which].l1va = 0;
1780 if ( unlikely(sl1e != NULL) )
1782 unmap_domain_mem(sl1e);
1783 put_shadow_status(&ed->mm);
1787 /* Write page fault handler: check if guest is trying to modify a PTE. */
1788 int ptwr_do_page_fault(unsigned long addr)
1790 unsigned long pte, pfn, l2e;
1791 struct pfn_info *page;
1792 l2_pgentry_t *pl2e;
1793 int which, cpu = smp_processor_id();
1794 u32 l2_idx;
1796 /*
1797 * Attempt to read the PTE that maps the VA being accessed. By checking for
1798 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
1799 */
1800 if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
1801 _PAGE_PRESENT) ||
1802 __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
1804 return 0;
1807 pfn = pte >> PAGE_SHIFT;
1808 page = &frame_table[pfn];
1810 /* We are looking only for read-only mappings of p.t. pages. */
1811 if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
1812 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
1814 return 0;
1817 /* Get the L2 index at which this L1 p.t. is always mapped. */
1818 l2_idx = page->u.inuse.type_info & PGT_va_mask;
1819 if ( unlikely(l2_idx >= PGT_va_unknown) )
1821 domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
1823 l2_idx >>= PGT_va_shift;
1825 if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
1827 MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
1828 domain_crash();
1831 /*
1832 * Is the L1 p.t. mapped into the current address space? If so we call it
1833 * an ACTIVE p.t., otherwise it is INACTIVE.
1834 */
1835 pl2e = &linear_l2_table[l2_idx];
1836 l2e = l2_pgentry_val(*pl2e);
1837 which = PTWR_PT_INACTIVE;
1838 if ( (l2e >> PAGE_SHIFT) == pfn )
1840 /* Check the PRESENT bit to set ACTIVE. */
1841 if ( likely(l2e & _PAGE_PRESENT) )
1842 which = PTWR_PT_ACTIVE;
1843 else {
1844 /*
1845 * If the PRESENT bit is clear, we may be conflicting with
1846 * the current ACTIVE p.t. (it may be the same p.t. mapped
1847 * at another virt addr).
1848 * The ptwr_flush call below will restore the PRESENT bit.
1849 */
1850 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
1851 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
1852 which = PTWR_PT_ACTIVE;
1856 PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
1857 "pfn %08lx\n", PTWR_PRINT_WHICH,
1858 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
1860 /*
1861 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
1862 * time. If there is already one, we must flush it out.
1863 */
1864 if ( ptwr_info[cpu].ptinfo[which].l1va )
1865 ptwr_flush(which);
1867 ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
1868 ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
1870 /* For safety, disconnect the L1 p.t. page from current space. */
1871 if ( (which == PTWR_PT_ACTIVE) && likely(!current->mm.shadow_mode) )
1873 *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
1874 #if 1
1875 flush_tlb(); /* XXX Multi-CPU guests? */
1876 #else
1877 flush_tlb_all();
1878 #endif
1881 /* Temporarily map the L1 page, and make a copy of it. */
1882 ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
1883 memcpy(ptwr_info[cpu].ptinfo[which].page,
1884 ptwr_info[cpu].ptinfo[which].pl1e,
1885 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
1887 /* Finally, make the p.t. page writable by the guest OS. */
1888 pte |= _PAGE_RW;
1889 PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
1890 &linear_pg_table[addr>>PAGE_SHIFT], pte);
1891 if ( unlikely(__put_user(pte, (unsigned long *)
1892 &linear_pg_table[addr>>PAGE_SHIFT])) )
1894 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
1895 &linear_pg_table[addr>>PAGE_SHIFT]);
1896 /* Toss the writable pagetable state and crash. */
1897 unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
1898 ptwr_info[cpu].ptinfo[which].l1va = 0;
1899 domain_crash();
1902 return EXCRET_fault_fixed;
1905 static __init int ptwr_init(void)
1907 int i;
1909 for ( i = 0; i < smp_num_cpus; i++ )
1911 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
1912 (void *)alloc_xenheap_page();
1913 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
1914 (void *)alloc_xenheap_page();
1917 return 0;
1919 __initcall(ptwr_init);
1924 /************************************************************************/
1925 /************************************************************************/
1926 /************************************************************************/
1928 #ifndef NDEBUG
1930 void ptwr_status(void)
1932 unsigned long pte, *ptep, pfn;
1933 struct pfn_info *page;
1934 int cpu = smp_processor_id();
1936 ptep = (unsigned long *)&linear_pg_table
1937 [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
1939 if ( __get_user(pte, ptep) ) {
1940 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1941 domain_crash();
1944 pfn = pte >> PAGE_SHIFT;
1945 page = &frame_table[pfn];
1946 printk("need to alloc l1 page %p\n", page);
1947 /* make pt page writable */
1948 printk("need to make read-only l1-page at %p is %08lx\n",
1949 ptep, pte);
1951 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
1952 return;
1954 if ( __get_user(pte, (unsigned long *)
1955 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
1956 MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
1957 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
1958 domain_crash();
1960 pfn = pte >> PAGE_SHIFT;
1961 page = &frame_table[pfn];
1964 void audit_domain(struct domain *d)
1966 int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
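/*
 * NB. 'adjust', 'scan_for_pfn' and 'scan_for_pfn_remote' below are GCC
 * nested functions, local to audit_domain(). 'adjust' applies a signed
 * delta to a page's general reference count (and, if 'adjtype' is set,
 * to its type count), complaining if either would fall below zero.
 */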
1968 void adjust (struct pfn_info *page, int dir, int adjtype)
1970 int count = page->count_info & PGC_count_mask;
1972 if ( adjtype )
1974 int tcount = page->u.inuse.type_info & PGT_count_mask;
1976 ttot++;
1978 tcount += dir;
1980 if ( tcount < 0 )
1982 /* This will only come out once. */
1983 printk("Audit %d: type count whent below zero pfn=%x "
1984 "taf=%x otaf=%x\n",
1985 d->id, page-frame_table,
1986 page->u.inuse.type_info,
1987 page->tlbflush_timestamp);
1990 page->u.inuse.type_info =
1991 (page->u.inuse.type_info & ~PGT_count_mask) |
1992 (tcount & PGT_count_mask);
1995 ctot++;
1996 count += dir;
1997 if ( count < 0 )
1999 /* This will only come out once. */
2000 printk("Audit %d: general count whent below zero pfn=%x "
2001 "taf=%x otaf=%x\n",
2002 d->id, page-frame_table,
2003 page->u.inuse.type_info,
2004 page->tlbflush_timestamp);
2007 page->count_info =
2008 (page->count_info & ~PGC_count_mask) |
2009 (count & PGC_count_mask);
2013 void scan_for_pfn(struct domain *d, unsigned long xpfn)
2015 unsigned long pfn, *pt;
2016 struct list_head *list_ent;
2017 struct pfn_info *page;
2018 int i;
2020 list_ent = d->page_list.next;
2021 for ( i = 0; (list_ent != &d->page_list); i++ )
2023 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2024 page = &frame_table[pfn];
2026 switch ( page->u.inuse.type_info & PGT_type_mask )
2028 case PGT_l1_page_table:
2029 case PGT_l2_page_table:
2030 pt = map_domain_mem(pfn<<PAGE_SHIFT);
2031 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2032 if ( (pt[i] & _PAGE_PRESENT) &&
2033 ((pt[i] >> PAGE_SHIFT) == xpfn) )
2034 printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
2035 d->id, i, pfn, page->u.inuse.type_info,
2036 page->count_info);
2037 unmap_domain_mem(pt);
2040 list_ent = frame_table[pfn].list.next;
2045 void scan_for_pfn_remote(unsigned long xpfn)
2047 struct domain *e;
2048 for_each_domain ( e )
2049 scan_for_pfn( e, xpfn );
2052 int i;
2053 unsigned long pfn;
2054 struct list_head *list_ent;
2055 struct pfn_info *page;
2057 if ( d != current->domain )
2058 domain_pause(d);
2059 synchronise_pagetables(~0UL);
2061 printk("pt base=%lx sh_info=%x\n",
2062 pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT,
2063 virt_to_page(d->shared_info)-frame_table);
2065 spin_lock(&d->page_alloc_lock);
2067 /* PHASE 0 */
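/* PHASE 0: every page must belong to d and its type count must not exceed
 * its general count; each page's original type_info is stashed in
 * tlbflush_timestamp so later diagnostics can report it (the 'ot'/'otaf'
 * fields). */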
2069 list_ent = d->page_list.next;
2070 for ( i = 0; (list_ent != &d->page_list); i++ )
2072 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2073 page = &frame_table[pfn];
2075 if ( page->u.inuse.domain != d )
2076 BUG();
2078 if ( (page->u.inuse.type_info & PGT_count_mask) >
2079 (page->count_info & PGC_count_mask) )
2080 printk("taf > caf %x %x pfn=%lx\n",
2081 page->u.inuse.type_info, page->count_info, pfn );
2083 #if 0 /* SYSV shared memory pages plus writeable files. */
2084 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
2085 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2087 printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
2088 pfn,
2089 page->u.inuse.type_info,
2090 page->count_info );
2091 scan_for_pfn_remote(pfn);
2093 #endif
2094 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
2095 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2097 printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
2098 pfn,
2099 page->u.inuse.type_info,
2100 page->count_info );
2103 /* Use tlbflush_timestamp to store original type_info. */
2104 page->tlbflush_timestamp = page->u.inuse.type_info;
2106 list_ent = frame_table[pfn].list.next;
2110 /* PHASE 1 */
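/* PHASE 1: subtract every reference we can account for -- the base page
 * table, pinned L1/L2 tables, page-table pages referenced from L2 entries,
 * and frames mapped by present L1 entries (skipping low-memory, I/O and
 * foreign mappings). */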
2112 adjust(&frame_table[pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT], -1, 1);
2114 list_ent = d->page_list.next;
2115 for ( i = 0; (list_ent != &d->page_list); i++ )
2117 unsigned long *pt;
2118 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2119 page = &frame_table[pfn];
2121 if ( page->u.inuse.domain != d )
2122 BUG();
2124 switch ( page->u.inuse.type_info & PGT_type_mask )
2126 case PGT_l2_page_table:
2128 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2129 printk("Audit %d: L2 not validated %x\n",
2130 d->id, page->u.inuse.type_info);
2132 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2133 printk("Audit %d: L2 not pinned %x\n",
2134 d->id, page->u.inuse.type_info);
2135 else
2136 adjust( page, -1, 1 );
2138 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2140 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2142 if ( pt[i] & _PAGE_PRESENT )
2144 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2145 struct pfn_info *l1page = &frame_table[l1pfn];
2147 if ( l1page->u.inuse.domain != d )
2149 printk("L2: Skip bizarre page belonging to other "
2150 "dom %p\n", l1page->u.inuse.domain);
2151 continue;
2154 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2155 PGT_l2_page_table )
2156 printk("Audit %d: [%x] Found %s Linear PT "
2157 "t=%x pfn=%lx\n", d->id, i,
2158 (l1pfn==pfn) ? "Self" : "Other",
2159 l1page->u.inuse.type_info,
2160 l1pfn);
2161 else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
2162 PGT_l1_page_table )
2163 printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
2164 d->id, i,
2165 l1page->u.inuse.type_info,
2166 l1pfn);
2168 adjust(l1page, -1, 1);
2172 unmap_domain_mem(pt);
2174 break;
2177 case PGT_l1_page_table:
2179 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2180 adjust( page, -1, 1 );
2182 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2183 printk("Audit %d: L1 not validated %x\n",
2184 d->id, page->u.inuse.type_info);
2185 #if 0
2186 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2187 printk("Audit %d: L1 not pinned %x\n",
2188 d->id, page->u.inuse.type_info);
2189 #endif
2190 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2192 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2194 if ( pt[i] & _PAGE_PRESENT )
2196 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2197 struct pfn_info *l1page = &frame_table[l1pfn];
2199 if ( l1pfn < 0x100 )
2201 lowmem_mappings++;
2202 continue;
2205 if ( l1pfn > max_page )
2207 io_mappings++;
2208 continue;
2211 if ( pt[i] & _PAGE_RW )
2214 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2215 PGT_l1_page_table ||
2216 (l1page->u.inuse.type_info & PGT_type_mask) ==
2217 PGT_l2_page_table )
2218 printk("Audit %d: [%x] Illegal RW t=%x pfn=%lx\n",
2219 d->id, i,
2220 l1page->u.inuse.type_info,
2221 l1pfn);
2225 if ( l1page->u.inuse.domain != d )
2227 printk("Audit %d: [%lx,%x] Skip foreign page dom=%lx "
2228 "pfn=%lx c=%08x t=%08x m2p=%lx\n",
2229 d->id, pfn, i,
2230 (unsigned long)l1page->u.inuse.domain,
2231 l1pfn,
2232 l1page->count_info,
2233 l1page->u.inuse.type_info,
2234 machine_to_phys_mapping[l1pfn]);
2235 continue;
2238 adjust(l1page, -1, 0);
2242 unmap_domain_mem(pt);
2244 break;
2247 list_ent = frame_table[pfn].list.next;
2250 if ( (io_mappings > 0) || (lowmem_mappings > 0) )
2251 printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
2252 d->id, lowmem_mappings, io_mappings);
2254 /* PHASE 2 */
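/* PHASE 2: with all accounted-for references removed, every page-table page
 * should now have a zero type count and every page a general count of exactly
 * one; anything else is reported, and scan_for_pfn_remote() hunts for other
 * domains still mapping the frame. */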
2256 ctot = ttot = 0;
2257 list_ent = d->page_list.next;
2258 for ( i = 0; (list_ent != &d->page_list); i++ )
2260 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2261 page = &frame_table[pfn];
2263 switch ( page->u.inuse.type_info & PGT_type_mask)
2265 case PGT_l1_page_table:
2266 case PGT_l2_page_table:
2267 if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
2269 printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
2270 d->id, page->u.inuse.type_info,
2271 page->tlbflush_timestamp,
2272 page->count_info, pfn );
2273 scan_for_pfn_remote(pfn);
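/* Fall through: page-table pages are also subject to the general-count
 * check below. */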
2275 default:
2276 if ( (page->count_info & PGC_count_mask) != 1 )
2278 printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
2279 d->id,
2280 page->count_info,
2281 page->u.inuse.type_info,
2282 page->tlbflush_timestamp, pfn );
2283 scan_for_pfn_remote(pfn);
2285 break;
2288 list_ent = frame_table[pfn].list.next;
2291 /* PHASE 3 */
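/* PHASE 3: restore the counts by re-applying, with +1, the adjustments that
 * PHASE 1 subtracted, and clear the scratch use of tlbflush_timestamp. */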
2293 list_ent = d->page_list.next;
2294 for ( i = 0; (list_ent != &d->page_list); i++ )
2296 unsigned long *pt;
2297 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2298 page = &frame_table[pfn];
2300 switch ( page->u.inuse.type_info & PGT_type_mask )
2302 case PGT_l2_page_table:
2303 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2304 adjust( page, 1, 1 );
2306 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2308 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2310 if ( pt[i] & _PAGE_PRESENT )
2312 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2313 struct pfn_info *l1page = &frame_table[l1pfn];
2315 if ( l1page->u.inuse.domain == d )
2316 adjust(l1page, 1, 1);
2320 unmap_domain_mem(pt);
2321 break;
2323 case PGT_l1_page_table:
2324 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2325 adjust( page, 1, 1 );
2327 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2329 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2331 if ( pt[i] & _PAGE_PRESENT )
2333 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2334 struct pfn_info *l1page = &frame_table[l1pfn];
2336 if ( (l1page->u.inuse.domain != d) ||
2337 (l1pfn < 0x100) || (l1pfn > max_page) )
2338 continue;
2340 adjust(l1page, 1, 0);
2344 unmap_domain_mem(pt);
2345 break;
2349 page->tlbflush_timestamp = 0;
2351 list_ent = frame_table[pfn].list.next;
2354 spin_unlock(&d->page_alloc_lock);
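/* Restore the reference on the base page table that was dropped at the start
 * of PHASE 1. */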
2356 adjust(&frame_table[pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT], 1, 1);
2358 printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );
2360 if ( d != current->domain )
2361 domain_unpause(d);
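/* Audit every domain in the system. */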
2364 void audit_domains(void)
2366 struct domain *d;
2367 for_each_domain ( d )
2368 audit_domain(d);
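/* Keyhandler-style wrapper: lets a console debug key trigger a full audit. */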
2371 void audit_domains_key(unsigned char key)
2373 audit_domains();
2376 #endif