debuggers.hg

view xen/arch/x86/memory.c @ 2660:4d484af4aca0

bitkeeper revision 1.1159.1.219 (416a9e3bO5_6f1e0AMMNVRk0kCNWpQ)

Bug fix, from code inspection.

The MMUEXT_TRANSFER_PAGE error case would have inappropriately triggered an
ASSERT, because a "break" statement was caught by the enclosing do/while loop
rather than by the intended switch statement.
author mafetter@fleming.research
date Mon Oct 11 14:52:43 2004 +0000 (2004-10-11)
parents 215824d97bfc
children b7b15f4a7ebc
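
For readers unfamiliar with the pitfall described in the commit message: in C, a "break" inside a do/while (or any loop) exits only that loop; it never terminates an enclosing switch case. The sketch below is a minimal, hypothetical illustration of the pattern (variable names are illustrative, not taken from the Xen code). The fix in this revision records the failure in 'okay' and re-tests it after the loop, which is the "if (!okay) break;" visible after the cmpxchg8b loop in the MMUEXT_TRANSFER_PAGE case of the listing below.

    switch ( cmd )
    {
    case MMUEXT_TRANSFER_PAGE:
        do {
            if ( error_detected )
            {
                okay = 0;
                break;      /* exits only the do/while, NOT the switch */
            }
            /* ... retry the atomic ownership update ... */
        } while ( retry );

        if ( !okay )
            break;          /* the fix: leave the switch before the success
                               path below can trip an ASSERT */

        /* ... success path: unlink the page from the source domain ... */
        break;
    }
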
line source
1 /******************************************************************************
2 * arch/x86/memory.c
3 *
4 * Copyright (c) 2002-2004 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, the OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
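/*
 * Editorial sketch, not part of the original file: how a guest might batch
 * requests for the interface described above. Each request is a (ptr, val)
 * pair; the low bits of 'ptr' select the command, and MMU_EXTENDED_COMMAND
 * requests carry their sub-command in the low bits of 'val' (see
 * do_mmu_update() and do_extended_command() below). The guest-side hypercall
 * wrapper name and the variable names are assumed for illustration; they are
 * not defined in this file.
 *
 *     mmu_update_t req[2];
 *
 *     req[0].ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;    [*pte_maddr = new_pte]
 *     req[0].val = new_pte;
 *
 *     req[1].ptr = (l2_mfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
 *     req[1].val = MMUEXT_PIN_L2_TABLE;                 [pin l2_mfn as an L2]
 *
 *     (void)HYPERVISOR_mmu_update(req, 2, NULL);
 */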
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/lib.h>
90 #include <xen/mm.h>
91 #include <xen/sched.h>
92 #include <xen/errno.h>
93 #include <xen/perfc.h>
94 #include <xen/irq.h>
95 #include <xen/softirq.h>
96 #include <asm/shadow.h>
97 #include <asm/page.h>
98 #include <asm/flushtlb.h>
99 #include <asm/io.h>
100 #include <asm/uaccess.h>
101 #include <asm/domain_page.h>
102 #include <asm/ldt.h>
104 #ifdef VERBOSE
105 #define MEM_LOG(_f, _a...) \
106 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
107 current->domain , __LINE__ , ## _a )
108 #else
109 #define MEM_LOG(_f, _a...) ((void)0)
110 #endif
112 static int alloc_l2_table(struct pfn_info *page);
113 static int alloc_l1_table(struct pfn_info *page);
114 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
115 static int get_page_and_type_from_pagenr(unsigned long page_nr,
116 u32 type,
117 struct domain *d);
119 static void free_l2_table(struct pfn_info *page);
120 static void free_l1_table(struct pfn_info *page);
122 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
123 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
125 /* Used to defer flushing of memory structures. */
126 static struct {
127 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
128 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
129 unsigned long deferred_ops;
130 unsigned long cr0;
131 /* If non-NULL, specifies a foreign subject domain for some operations. */
132 struct domain *foreign;
133 } percpu_info[NR_CPUS] __cacheline_aligned;
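/*
 * Editorial note, not part of the original file: deferred_ops is consumed at
 * the end of do_mmu_update() and do_update_va_mapping() below, where
 * DOP_FLUSH_TLB results in a local_flush_tlb() and DOP_RELOAD_LDT re-maps the
 * shadow LDT via map_ldt_shadow_page(0).
 */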
135 /*
136 * Returns the current foreign domain; defaults to the currently-executing
137 * domain if a foreign override hasn't been specified.
138 */
139 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current)
141 /* Private domain structs for DOMID_XEN and DOMID_IO. */
142 static struct domain *dom_xen, *dom_io;
144 void arch_init_memory(void)
145 {
146 unsigned long mfn;
148 /*
149 * We are rather picky about the layout of 'struct pfn_info'. The
150 * count_info and domain fields must be adjacent, as we perform atomic
151 * 64-bit operations on them. Also, just for sanity, we assert the size
152 * of the structure here.
153 */
154 if ( (offsetof(struct pfn_info, u.inuse.domain) !=
155 (offsetof(struct pfn_info, count_info) + sizeof(u32))) ||
156 (sizeof(struct pfn_info) != 24) )
157 {
158 printk("Weird pfn_info layout (%ld,%ld,%d)\n",
159 offsetof(struct pfn_info, count_info),
160 offsetof(struct pfn_info, u.inuse.domain),
161 sizeof(struct pfn_info));
162 for ( ; ; ) ;
163 }
165 memset(percpu_info, 0, sizeof(percpu_info));
168 /* Initialise to a magic value of 0x55555555 so it is easier to spot bugs later. */
168 memset(machine_to_phys_mapping, 0x55, 4<<20);
170 /*
171 * Initialise our DOMID_XEN domain.
172 * Any Xen-heap pages that we will allow to be mapped will have
173 * their domain field set to dom_xen.
174 */
175 dom_xen = alloc_domain_struct();
176 atomic_set(&dom_xen->refcnt, 1);
177 dom_xen->domain = DOMID_XEN;
179 /*
180 * Initialise our DOMID_IO domain.
181 * This domain owns no pages but is considered a special case when
182 * mapping I/O pages, as such mappings occur with the privileges of the caller.
183 */
184 dom_io = alloc_domain_struct();
185 atomic_set(&dom_io->refcnt, 1);
186 dom_io->domain = DOMID_IO;
188 /* M2P table is mappable read-only by privileged domains. */
189 for ( mfn = virt_to_phys(&machine_to_phys_mapping[0<<20])>>PAGE_SHIFT;
190 mfn < virt_to_phys(&machine_to_phys_mapping[1<<20])>>PAGE_SHIFT;
191 mfn++ )
192 {
193 frame_table[mfn].count_info = PGC_allocated | 1;
194 frame_table[mfn].u.inuse.type_info = PGT_gdt_page | 1; /* non-RW */
195 frame_table[mfn].u.inuse.domain = dom_xen;
196 }
197 }
199 static void __invalidate_shadow_ldt(struct domain *d)
200 {
201 int i;
202 unsigned long pfn;
203 struct pfn_info *page;
205 d->mm.shadow_ldt_mapcnt = 0;
207 for ( i = 16; i < 32; i++ )
208 {
209 pfn = l1_pgentry_to_pagenr(d->mm.perdomain_pt[i]);
210 if ( pfn == 0 ) continue;
211 d->mm.perdomain_pt[i] = mk_l1_pgentry(0);
212 page = &frame_table[pfn];
213 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
214 ASSERT_PAGE_IS_DOMAIN(page, d);
215 put_page_and_type(page);
216 }
218 /* Dispose of the (now possibly invalid) mappings from the TLB. */
219 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
220 }
223 static inline void invalidate_shadow_ldt(struct domain *d)
224 {
225 if ( d->mm.shadow_ldt_mapcnt != 0 )
226 __invalidate_shadow_ldt(d);
227 }
230 static int alloc_segdesc_page(struct pfn_info *page)
231 {
232 unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
233 int i;
235 for ( i = 0; i < 512; i++ )
236 if ( unlikely(!check_descriptor(&descs[i*2])) )
237 goto fail;
239 unmap_domain_mem(descs);
240 return 1;
242 fail:
243 unmap_domain_mem(descs);
244 return 0;
245 }
248 /* Map shadow page at offset @off. */
249 int map_ldt_shadow_page(unsigned int off)
250 {
251 struct domain *d = current;
252 unsigned long l1e;
254 if ( unlikely(in_irq()) )
255 BUG();
257 __get_user(l1e, (unsigned long *)&linear_pg_table[(d->mm.ldt_base >>
258 PAGE_SHIFT) + off]);
260 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
261 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
262 d, PGT_ldt_page)) )
263 return 0;
265 d->mm.perdomain_pt[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
266 d->mm.shadow_ldt_mapcnt++;
268 return 1;
269 }
272 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
273 {
274 struct pfn_info *page = &frame_table[page_nr];
276 if ( unlikely(!pfn_is_ram(page_nr)) )
277 {
278 MEM_LOG("Pfn %08lx is not RAM", page_nr);
279 return 0;
280 }
282 if ( unlikely(!get_page(page, d)) )
283 {
284 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
285 return 0;
286 }
288 return 1;
289 }
292 static int get_page_and_type_from_pagenr(unsigned long page_nr,
293 u32 type,
294 struct domain *d)
295 {
296 struct pfn_info *page = &frame_table[page_nr];
298 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
299 return 0;
301 if ( unlikely(!get_page_type(page, type)) )
302 {
303 #ifdef VERBOSE
304 if ( (type & PGT_type_mask) != PGT_l1_page_table )
305 MEM_LOG("Bad page type for pfn %08lx (%08x)",
306 page_nr, page->u.inuse.type_info);
307 #endif
308 put_page(page);
309 return 0;
310 }
312 return 1;
313 }
316 /*
317 * We allow L2 tables to map each other (a.k.a. linear page tables). This
318 * needs some special care with reference counts and access permissions:
319 * 1. The mapping entry must be read-only, or the guest may get write access
320 * to its own PTEs.
321 * 2. We must only bump the reference counts for an *already validated*
322 * L2 table, or we can end up in a deadlock in get_page_type(), waiting
323 * on a validation that cannot complete until the current one does.
324 * 3. We only need to increment the reference counts for the mapped page
325 * frame if it is mapped by a different L2 table. This is sufficient and
326 * also necessary to allow validation of an L2 table mapping itself.
327 */
328 static int
329 get_linear_pagetable(
330 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
331 {
332 u32 x, y;
333 struct pfn_info *page;
335 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
336 {
337 MEM_LOG("Attempt to create linear p.t. with write perms");
338 return 0;
339 }
341 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
342 {
343 /* Make sure the mapped frame belongs to the correct domain. */
344 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
345 return 0;
347 /*
348 * Make sure that the mapped frame is an already-validated L2 table.
349 * If so, atomically increment the count (checking for overflow).
350 */
351 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
352 y = page->u.inuse.type_info;
353 do {
354 x = y;
355 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
356 unlikely((x & (PGT_type_mask|PGT_validated)) !=
357 (PGT_l2_page_table|PGT_validated)) )
358 {
359 put_page(page);
360 return 0;
361 }
362 }
363 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
364 }
366 return 1;
367 }
370 static inline int
371 readonly_page_from_l1e(
372 l1_pgentry_t l1e)
373 {
374 struct pfn_info *page = &frame_table[l1_pgentry_to_pagenr(l1e)];
375 unsigned long l1v = l1_pgentry_val(l1e);
377 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(l1v >> PAGE_SHIFT) )
378 return 0;
379 put_page_type(page);
380 return 1;
381 }
383 static int
384 get_page_from_l1e(
385 l1_pgentry_t l1e, struct domain *d)
386 {
387 unsigned long l1v = l1_pgentry_val(l1e);
388 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
389 struct pfn_info *page = &frame_table[pfn];
390 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
392 if ( !(l1v & _PAGE_PRESENT) )
393 return 1;
395 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
396 {
397 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
398 return 0;
399 }
401 if ( unlikely(!pfn_is_ram(pfn)) )
402 {
403 /* Revert to caller privileges if FD == DOMID_IO. */
404 if ( d == dom_io )
405 d = current;
407 if ( IS_PRIV(d) )
408 return 1;
410 if ( IS_CAPABLE_PHYSDEV(d) )
411 return domain_iomem_in_pfn(d, pfn);
413 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
414 return 0;
415 }
417 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
418 return 0;
420 if ( l1v & _PAGE_RW )
421 {
422 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
423 return 0;
424 }
426 return 1;
427 }
430 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
431 static int
432 get_page_from_l2e(
433 l2_pgentry_t l2e, unsigned long pfn,
434 struct domain *d, unsigned long va_idx)
435 {
436 int rc;
438 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
439 return 1;
441 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
442 {
443 MEM_LOG("Bad L2 page type settings %04lx",
444 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
445 return 0;
446 }
448 rc = get_page_and_type_from_pagenr(
449 l2_pgentry_to_pagenr(l2e),
450 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
452 if ( unlikely(!rc) )
453 return get_linear_pagetable(l2e, pfn, d);
455 return 1;
456 }
459 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
460 {
461 unsigned long l1v = l1_pgentry_val(l1e);
462 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
463 struct pfn_info *page = &frame_table[pfn];
464 struct domain *e = page->u.inuse.domain;
466 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
467 return;
469 if ( unlikely(e != d) )
470 {
471 /*
472 * Unmap a foreign page that may have been mapped via a grant table.
473 * Note that this can fail for a privileged domain that can map foreign
474 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
475 * counted via a grant entry and some counted directly in the page
476 * structure's reference count. Note that reference counts won't get
477 * dangerously confused as long as we always try to decrement the
478 * grant entry first. We may end up with a mismatch between which
479 * mappings and which unmappings are counted via the grant entry, but
480 * really it doesn't matter as privileged domains have carte blanche.
481 */
482 if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
483 return;
484 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
485 }
487 if ( l1v & _PAGE_RW )
488 {
489 put_page_and_type(page);
490 }
491 else
492 {
493 /* We expect this is rare so we blow the entire shadow LDT. */
494 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
495 PGT_ldt_page)) &&
496 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
497 invalidate_shadow_ldt(e);
498 put_page(page);
499 }
500 }
503 /*
504 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
505 * Note also that this automatically deals correctly with linear p.t.'s.
506 */
507 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
508 {
509 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
510 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
511 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
512 }
515 static int alloc_l2_table(struct pfn_info *page)
516 {
517 struct domain *d = page->u.inuse.domain;
518 unsigned long page_nr = page_to_pfn(page);
519 l2_pgentry_t *pl2e;
520 int i;
522 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
524 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) {
525 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
526 goto fail;
527 }
529 #if defined(__i386__)
530 /* Now we add our private high mappings. */
531 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
532 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
533 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
534 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
535 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
536 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
537 mk_l2_pgentry(__pa(page->u.inuse.domain->mm.perdomain_pt) |
538 __PAGE_HYPERVISOR);
539 #endif
541 unmap_domain_mem(pl2e);
542 return 1;
544 fail:
545 while ( i-- > 0 )
546 put_page_from_l2e(pl2e[i], page_nr);
548 unmap_domain_mem(pl2e);
549 return 0;
550 }
553 static int alloc_l1_table(struct pfn_info *page)
554 {
555 struct domain *d = page->u.inuse.domain;
556 unsigned long page_nr = page_to_pfn(page);
557 l1_pgentry_t *pl1e;
558 int i;
560 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
562 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
563 if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
564 goto fail;
566 unmap_domain_mem(pl1e);
567 return 1;
569 fail:
570 while ( i-- > 0 )
571 put_page_from_l1e(pl1e[i], d);
573 unmap_domain_mem(pl1e);
574 return 0;
575 }
578 static void free_l2_table(struct pfn_info *page)
579 {
580 unsigned long page_nr = page - frame_table;
581 l2_pgentry_t *pl2e;
582 int i;
584 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
586 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
587 put_page_from_l2e(pl2e[i], page_nr);
589 unmap_domain_mem(pl2e);
590 }
593 static void free_l1_table(struct pfn_info *page)
594 {
595 struct domain *d = page->u.inuse.domain;
596 unsigned long page_nr = page - frame_table;
597 l1_pgentry_t *pl1e;
598 int i;
600 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
602 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
603 put_page_from_l1e(pl1e[i], d);
605 unmap_domain_mem(pl1e);
606 }
609 static inline int update_l2e(l2_pgentry_t *pl2e,
610 l2_pgentry_t ol2e,
611 l2_pgentry_t nl2e)
612 {
613 unsigned long o = cmpxchg((unsigned long *)pl2e,
614 l2_pgentry_val(ol2e),
615 l2_pgentry_val(nl2e));
616 if ( o != l2_pgentry_val(ol2e) )
617 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
618 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
619 return (o == l2_pgentry_val(ol2e));
620 }
623 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
624 static int mod_l2_entry(l2_pgentry_t *pl2e,
625 l2_pgentry_t nl2e,
626 unsigned long pfn)
627 {
628 l2_pgentry_t ol2e;
629 unsigned long _ol2e;
631 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
632 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
633 {
634 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
635 return 0;
636 }
638 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
639 return 0;
640 ol2e = mk_l2_pgentry(_ol2e);
642 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
643 {
644 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
645 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
646 return update_l2e(pl2e, ol2e, nl2e);
648 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current,
649 ((unsigned long)pl2e &
650 ~PAGE_MASK) >> 2)) )
651 return 0;
653 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
654 {
655 put_page_from_l2e(nl2e, pfn);
656 return 0;
657 }
659 put_page_from_l2e(ol2e, pfn);
660 return 1;
661 }
663 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
664 return 0;
666 put_page_from_l2e(ol2e, pfn);
667 return 1;
668 }
671 static inline int update_l1e(l1_pgentry_t *pl1e,
672 l1_pgentry_t ol1e,
673 l1_pgentry_t nl1e)
674 {
675 unsigned long o = l1_pgentry_val(ol1e);
676 unsigned long n = l1_pgentry_val(nl1e);
678 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
679 unlikely(o != l1_pgentry_val(ol1e)) )
680 {
681 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
682 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
683 return 0;
684 }
686 return 1;
687 }
690 /* Update the L1 entry at pl1e to new value nl1e. */
691 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
692 {
693 l1_pgentry_t ol1e;
694 unsigned long _ol1e;
695 struct domain *d = current;
697 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
698 {
699 MEM_LOG("Bad get_user\n");
700 return 0;
701 }
703 ol1e = mk_l1_pgentry(_ol1e);
705 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
706 {
707 /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
708 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
709 return update_l1e(pl1e, ol1e, nl1e);
711 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
712 return 0;
714 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
715 {
716 put_page_from_l1e(nl1e, d);
717 return 0;
718 }
720 put_page_from_l1e(ol1e, d);
721 return 1;
722 }
724 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
725 return 0;
727 put_page_from_l1e(ol1e, d);
728 return 1;
729 }
732 int alloc_page_type(struct pfn_info *page, unsigned int type)
733 {
734 switch ( type )
735 {
736 case PGT_l1_page_table:
737 return alloc_l1_table(page);
738 case PGT_l2_page_table:
739 return alloc_l2_table(page);
740 case PGT_gdt_page:
741 case PGT_ldt_page:
742 return alloc_segdesc_page(page);
743 default:
744 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
745 type, page->u.inuse.type_info,
746 page->count_info);
747 BUG();
748 }
750 return 0;
751 }
754 void free_page_type(struct pfn_info *page, unsigned int type)
755 {
756 struct domain *d = page->u.inuse.domain;
758 switch ( type )
759 {
760 case PGT_l1_page_table:
761 free_l1_table(page);
762 break;
764 case PGT_l2_page_table:
765 free_l2_table(page);
766 break;
768 default:
769 BUG();
770 }
772 if ( unlikely(d->mm.shadow_mode) &&
773 (get_shadow_status(&d->mm, page_to_pfn(page)) & PSH_shadowed) )
774 {
775 unshadow_table(page_to_pfn(page), type);
776 put_shadow_status(&d->mm);
777 }
778 }
781 void put_page_type(struct pfn_info *page)
782 {
783 u32 nx, x, y = page->u.inuse.type_info;
785 again:
786 do {
787 x = y;
788 nx = x - 1;
790 ASSERT((x & PGT_count_mask) != 0);
792 /*
793 * The page should always be validated while a reference is held. The
794 * exception is during domain destruction, when we forcibly invalidate
795 * page-table pages if we detect a referential loop.
796 * See domain.c:relinquish_list().
797 */
798 ASSERT((x & PGT_validated) ||
799 test_bit(DF_DYING, &page->u.inuse.domain->flags));
801 if ( unlikely((nx & PGT_count_mask) == 0) )
802 {
803 /* Record TLB information for flush later. Races are harmless. */
804 page->tlbflush_timestamp = tlbflush_clock;
806 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
807 likely(nx & PGT_validated) )
808 {
809 /*
810 * Page-table pages must be unvalidated when count is zero. The
811 * 'free' is safe because the refcnt is non-zero and validated
812 * bit is clear => other ops will spin or fail.
813 */
814 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
815 x & ~PGT_validated)) != x) )
816 goto again;
817 /* We cleared the 'valid bit' so we do the clean-up. */
818 free_page_type(page, x & PGT_type_mask);
819 /* Carry on, but with the 'valid bit' now clear. */
820 x &= ~PGT_validated;
821 nx &= ~PGT_validated;
822 }
823 }
824 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
825 (PGT_pinned | 1)) )
826 {
827 /* Page is now only pinned. Make the back pointer mutable again. */
828 nx |= PGT_va_mutable;
829 }
830 }
831 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
832 }
835 int get_page_type(struct pfn_info *page, u32 type)
836 {
837 u32 nx, x, y = page->u.inuse.type_info;
839 again:
840 do {
841 x = y;
842 nx = x + 1;
843 if ( unlikely((nx & PGT_count_mask) == 0) )
844 {
845 MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
846 return 0;
847 }
848 else if ( unlikely((x & PGT_count_mask) == 0) )
849 {
850 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
851 {
852 /*
853 * On type change we check whether we must flush stale TLB entries. This
854 * may be unnecessary (e.g., page was GDT/LDT) but those
855 * circumstances should be very rare.
856 */
857 struct domain *d = page->u.inuse.domain;
858 if ( unlikely(NEED_FLUSH(tlbflush_time[d->processor],
859 page->tlbflush_timestamp)) )
860 {
861 perfc_incr(need_flush_tlb_flush);
862 flush_tlb_cpu(d->processor);
863 }
865 /* We lose existing type, back pointer, and validity. */
866 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
867 nx |= type;
869 /* No special validation needed for writable pages. */
870 /* Page tables and GDT/LDT need to be scanned for validity. */
871 if ( type == PGT_writable_page )
872 nx |= PGT_validated;
873 }
874 }
875 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
876 {
877 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
878 {
879 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
880 ((type & PGT_type_mask) != PGT_l1_page_table) )
881 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
882 x & PGT_type_mask, type, page_to_pfn(page));
883 return 0;
884 }
885 else if ( (x & PGT_va_mask) == PGT_va_mutable )
886 {
887 /* The va backpointer is mutable, hence we update it. */
888 nx &= ~PGT_va_mask;
889 nx |= type; /* we know the actual type is correct */
890 }
891 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
892 {
893 /* This table is potentially mapped at multiple locations. */
894 nx &= ~PGT_va_mask;
895 nx |= PGT_va_unknown;
896 }
897 }
898 else if ( unlikely(!(x & PGT_validated)) )
899 {
900 /* Someone else is updating validation of this page. Wait... */
901 while ( (y = page->u.inuse.type_info) == x )
902 {
903 rep_nop();
904 barrier();
905 }
906 goto again;
907 }
908 }
909 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
911 if ( unlikely(!(nx & PGT_validated)) )
912 {
913 /* Try to validate page type; drop the new reference on failure. */
914 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
915 {
916 MEM_LOG("Error while validating pfn %08lx for type %08x."
917 " caf=%08x taf=%08x\n",
918 page_to_pfn(page), type,
919 page->count_info,
920 page->u.inuse.type_info);
921 /* No one else can get a reference. We hold the only ref. */
922 page->u.inuse.type_info = 0;
923 return 0;
924 }
926 /* No one else is updating simultaneously. */
927 __set_bit(_PGT_validated, &page->u.inuse.type_info);
928 }
930 return 1;
931 }
934 static int do_extended_command(unsigned long ptr, unsigned long val)
935 {
936 int okay = 1, cpu = smp_processor_id();
937 unsigned int cmd = val & MMUEXT_CMD_MASK;
938 unsigned long pfn = ptr >> PAGE_SHIFT;
939 unsigned long old_base_pfn;
940 struct pfn_info *page = &frame_table[pfn];
941 struct domain *d = current, *nd, *e;
942 u32 x, y;
943 domid_t domid;
944 grant_ref_t gntref;
946 switch ( cmd )
947 {
948 case MMUEXT_PIN_L1_TABLE:
949 case MMUEXT_PIN_L2_TABLE:
950 /*
951 * We insist that, if you pin an L1 page, it's the first thing that
952 * you do to it. This is because we require the backptr to still be
953 * mutable. This assumption seems safe.
954 */
955 okay = get_page_and_type_from_pagenr(
956 pfn,
957 ((cmd==MMUEXT_PIN_L2_TABLE) ?
958 PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
959 FOREIGNDOM);
961 if ( unlikely(!okay) )
962 {
963 MEM_LOG("Error while pinning pfn %08lx", pfn);
964 break;
965 }
967 if ( unlikely(test_and_set_bit(_PGT_pinned,
968 &page->u.inuse.type_info)) )
969 {
970 MEM_LOG("Pfn %08lx already pinned", pfn);
971 put_page_and_type(page);
972 okay = 0;
973 break;
974 }
976 break;
978 case MMUEXT_UNPIN_TABLE:
979 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
980 {
981 MEM_LOG("Page %08lx bad domain (dom=%p)",
982 ptr, page->u.inuse.domain);
983 }
984 else if ( likely(test_and_clear_bit(_PGT_pinned,
985 &page->u.inuse.type_info)) )
986 {
987 put_page_and_type(page);
988 put_page(page);
989 }
990 else
991 {
992 okay = 0;
993 put_page(page);
994 MEM_LOG("Pfn %08lx not pinned", pfn);
995 }
996 break;
998 case MMUEXT_NEW_BASEPTR:
999 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
1000 if ( likely(okay) )
1001 {
1002 invalidate_shadow_ldt(d);
1004 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
1005 old_base_pfn = pagetable_val(d->mm.pagetable) >> PAGE_SHIFT;
1006 d->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
1008 shadow_mk_pagetable(&d->mm);
1010 write_ptbase(&d->mm);
1012 put_page_and_type(&frame_table[old_base_pfn]);
1014 /*
1015 * Note that we tick the clock /after/ dropping the old base's
1016 * reference count. If the page tables got freed then this will
1017 * avoid unnecessary TLB flushes when the pages are reused. */
1018 tlb_clocktick();
1019 }
1020 else
1021 {
1022 MEM_LOG("Error while installing new baseptr %08lx", ptr);
1023 }
1024 break;
1026 case MMUEXT_TLB_FLUSH:
1027 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1028 break;
1030 case MMUEXT_INVLPG:
1031 __flush_tlb_one(ptr);
1032 break;
1034 case MMUEXT_FLUSH_CACHE:
1035 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1036 {
1037 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1038 okay = 0;
1039 }
1040 else
1041 {
1042 wbinvd();
1043 }
1044 break;
1046 case MMUEXT_SET_LDT:
1047 {
1048 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
1049 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1050 (ents > 8192) ||
1051 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
1052 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
1053 {
1054 okay = 0;
1055 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
1056 }
1057 else if ( (d->mm.ldt_ents != ents) ||
1058 (d->mm.ldt_base != ptr) )
1059 {
1060 invalidate_shadow_ldt(d);
1061 d->mm.ldt_base = ptr;
1062 d->mm.ldt_ents = ents;
1063 load_LDT(d);
1064 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1065 if ( ents != 0 )
1066 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1067 }
1068 break;
1069 }
1071 case MMUEXT_SET_FOREIGNDOM:
1072 domid = (domid_t)(val >> 16);
1074 if ( (e = percpu_info[cpu].foreign) != NULL )
1075 put_domain(e);
1076 percpu_info[cpu].foreign = NULL;
1078 if ( !IS_PRIV(d) )
1079 {
1080 switch ( domid )
1081 {
1082 case DOMID_IO:
1083 get_knownalive_domain(dom_io);
1084 percpu_info[cpu].foreign = dom_io;
1085 break;
1086 default:
1087 MEM_LOG("Dom %u cannot set foreign dom\n", d->domain);
1088 okay = 0;
1089 break;
1090 }
1091 }
1092 else
1093 {
1094 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1095 if ( e == NULL )
1096 {
1097 switch ( domid )
1098 {
1099 case DOMID_XEN:
1100 get_knownalive_domain(dom_xen);
1101 percpu_info[cpu].foreign = dom_xen;
1102 break;
1103 case DOMID_IO:
1104 get_knownalive_domain(dom_io);
1105 percpu_info[cpu].foreign = dom_io;
1106 break;
1107 default:
1108 MEM_LOG("Unknown domain '%u'", domid);
1109 okay = 0;
1110 break;
1111 }
1112 }
1113 }
1114 break;
1116 case MMUEXT_TRANSFER_PAGE:
1117 domid = (domid_t)(val >> 16);
1118 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
1120 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
1121 unlikely(!pfn_is_ram(pfn)) ||
1122 unlikely((e = find_domain_by_id(domid)) == NULL) )
1124 MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
1125 okay = 0;
1126 break;
1129 spin_lock(&d->page_alloc_lock);
1131 /*
1132 * The tricky bit: atomically release ownership while there is just one
1133 * benign reference to the page (PGC_allocated). If that reference
1134 * disappears then the deallocation routine will safely spin.
1135 */
1136 nd = page->u.inuse.domain;
1137 y = page->count_info;
1138 do {
1139 x = y;
1140 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1141 (1|PGC_allocated)) ||
1142 unlikely(nd != d) )
1144 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1145 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1146 d, d->domain, nd, x, page->u.inuse.type_info);
1147 spin_unlock(&d->page_alloc_lock);
1148 put_domain(e);
1149 okay = 0;
1150 break;
1152 __asm__ __volatile__(
1153 LOCK_PREFIX "cmpxchg8b %2"
1154 : "=d" (nd), "=a" (y),
1155 "=m" (*(volatile u64 *)(&page->count_info))
1156 : "0" (d), "1" (x), "c" (NULL), "b" (x) );
1158 while ( unlikely(nd != d) || unlikely(y != x) );
1159 if (!okay) break;
1161 /*
1162 * Unlink from 'd'. At least one reference remains (now anonymous), so
1163 * no one else is spinning to try to delete this page from 'd'.
1164 */
1165 d->tot_pages--;
1166 list_del(&page->list);
1168 spin_unlock(&d->page_alloc_lock);
1170 spin_lock(&e->page_alloc_lock);
1172 /*
1173 * Check that 'e' will accept the page and has reservation headroom.
1174 * Also, a domain mustn't have PGC_allocated pages when it is dying.
1175 */
1176 ASSERT(e->tot_pages <= e->max_pages);
1177 if ( unlikely(test_bit(DF_DYING, &e->flags)) ||
1178 unlikely(e->tot_pages == e->max_pages) ||
1179 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
1181 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1182 "provided a bad grant ref, or is dying (%08lx).\n",
1183 e->tot_pages, e->max_pages, e->flags);
1184 spin_unlock(&e->page_alloc_lock);
1185 put_domain(e);
1186 okay = 0;
1187 break;
1190 /* Okay, add the page to 'e'. */
1191 if ( unlikely(e->tot_pages++ == 0) )
1192 get_knownalive_domain(e);
1193 list_add_tail(&page->list, &e->page_list);
1194 page->u.inuse.domain = e;
1196 spin_unlock(&e->page_alloc_lock);
1198 /* Transfer is all done: tell the guest about its new page frame. */
1199 gnttab_notify_transfer(e, gntref, pfn);
1201 put_domain(e);
1202 break;
1204 case MMUEXT_REASSIGN_PAGE:
1205 if ( unlikely(!IS_PRIV(d)) )
1207 MEM_LOG("Dom %u has no reassignment priv", d->domain);
1208 okay = 0;
1209 break;
1212 e = percpu_info[cpu].foreign;
1213 if ( unlikely(e == NULL) )
1215 MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
1216 okay = 0;
1217 break;
1220 /*
1221 * Grab both page_list locks, in order. This prevents the page from
1222 * disappearing elsewhere while we modify the owner, and we'll need
1223 * both locks if we're successful so that we can change lists.
1224 */
1225 if ( d < e )
1227 spin_lock(&d->page_alloc_lock);
1228 spin_lock(&e->page_alloc_lock);
1230 else
1232 spin_lock(&e->page_alloc_lock);
1233 spin_lock(&d->page_alloc_lock);
1236 /* A domain shouldn't have PGC_allocated pages when it is dying. */
1237 if ( unlikely(test_bit(DF_DYING, &e->flags)) ||
1238 unlikely(IS_XEN_HEAP_FRAME(page)) )
1240 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
1241 okay = 0;
1242 goto reassign_fail;
1245 /*
1246 * The tricky bit: atomically change owner while there is just one
1247 * benign reference to the page (PGC_allocated). If that reference
1248 * disappears then the deallocation routine will safely spin.
1249 */
1250 nd = page->u.inuse.domain;
1251 y = page->count_info;
1252 do {
1253 x = y;
1254 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1255 (1|PGC_allocated)) ||
1256 unlikely(nd != d) )
1258 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1259 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1260 d, d->domain, nd, x, page->u.inuse.type_info);
1261 okay = 0;
1262 goto reassign_fail;
1264 __asm__ __volatile__(
1265 LOCK_PREFIX "cmpxchg8b %3"
1266 : "=d" (nd), "=a" (y), "=c" (e),
1267 "=m" (*(volatile u64 *)(&page->count_info))
1268 : "0" (d), "1" (x), "c" (e), "b" (x) );
1270 while ( unlikely(nd != d) || unlikely(y != x) );
1272 /*
1273 * Unlink from 'd'. We transferred at least one reference to 'e', so
1274 * no one else is spinning to try to delete this page from 'd'.
1275 */
1276 d->tot_pages--;
1277 list_del(&page->list);
1279 /*
1280 * Add the page to 'e'. Someone may already have removed the last
1281 * reference and want to remove the page from 'e'. However, we have
1282 * the lock so they'll spin waiting for us.
1283 */
1284 if ( unlikely(e->tot_pages++ == 0) )
1285 get_knownalive_domain(e);
1286 list_add_tail(&page->list, &e->page_list);
1288 reassign_fail:
1289 spin_unlock(&d->page_alloc_lock);
1290 spin_unlock(&e->page_alloc_lock);
1291 break;
1293 case MMUEXT_CLEAR_FOREIGNDOM:
1294 if ( (e = percpu_info[cpu].foreign) != NULL )
1295 put_domain(e);
1296 percpu_info[cpu].foreign = NULL;
1297 break;
1299 default:
1300 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
1301 okay = 0;
1302 break;
1303 }
1305 return okay;
1306 }
1309 int do_mmu_update(mmu_update_t *ureqs, int count, int *success_count)
1311 mmu_update_t req;
1312 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
1313 struct pfn_info *page;
1314 int rc = 0, okay = 1, i, cpu = smp_processor_id();
1315 unsigned int cmd;
1316 unsigned long prev_spfn = 0;
1317 l1_pgentry_t *prev_spl1e = 0;
1318 struct domain *d = current;
1319 u32 type_info;
1321 perfc_incrc(calls_to_mmu_update);
1322 perfc_addc(num_page_updates, count);
1324 cleanup_writable_pagetable(d, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
1326 if ( unlikely(!access_ok(VERIFY_READ, ureqs, count * sizeof(req))) )
1327 return -EFAULT;
1329 for ( i = 0; i < count; i++ )
1331 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
1333 MEM_LOG("Bad __copy_from_user");
1334 rc = -EFAULT;
1335 break;
1338 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
1339 pfn = req.ptr >> PAGE_SHIFT;
1341 okay = 0;
1343 switch ( cmd )
1345 /*
1346 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
1347 */
1348 case MMU_NORMAL_PT_UPDATE:
1349 if ( unlikely(!get_page_from_pagenr(pfn, current)) )
1351 MEM_LOG("Could not get page for normal update");
1352 break;
1355 if ( likely(prev_pfn == pfn) )
1357 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
1359 else
1361 if ( prev_pfn != 0 )
1362 unmap_domain_mem((void *)va);
1363 va = (unsigned long)map_domain_mem(req.ptr);
1364 prev_pfn = pfn;
1367 page = &frame_table[pfn];
1368 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
1370 case PGT_l1_page_table:
1371 if ( likely(get_page_type(
1372 page, type_info & (PGT_type_mask|PGT_va_mask))) )
1374 okay = mod_l1_entry((l1_pgentry_t *)va,
1375 mk_l1_pgentry(req.val));
1377 if ( unlikely(d->mm.shadow_mode) && okay &&
1378 (get_shadow_status(&d->mm, page-frame_table) &
1379 PSH_shadowed) )
1381 shadow_l1_normal_pt_update(
1382 req.ptr, req.val, &prev_spfn, &prev_spl1e);
1383 put_shadow_status(&d->mm);
1386 put_page_type(page);
1388 break;
1389 case PGT_l2_page_table:
1390 if ( likely(get_page_type(page, PGT_l2_page_table)) )
1392 okay = mod_l2_entry((l2_pgentry_t *)va,
1393 mk_l2_pgentry(req.val),
1394 pfn);
1396 if ( unlikely(d->mm.shadow_mode) && okay &&
1397 (get_shadow_status(&d->mm, page-frame_table) &
1398 PSH_shadowed) )
1400 shadow_l2_normal_pt_update(req.ptr, req.val);
1401 put_shadow_status(&d->mm);
1404 put_page_type(page);
1406 break;
1407 default:
1408 if ( likely(get_page_type(page, PGT_writable_page)) )
1410 *(unsigned long *)va = req.val;
1411 okay = 1;
1412 put_page_type(page);
1414 break;
1417 put_page(page);
1418 break;
1420 case MMU_MACHPHYS_UPDATE:
1421 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
1423 MEM_LOG("Could not get page for mach->phys update");
1424 break;
1427 machine_to_phys_mapping[pfn] = req.val;
1428 okay = 1;
1430 /*
1431 * If in log-dirty mode, mark the corresponding pseudo-physical
1432 * page as dirty.
1433 */
1434 if ( unlikely(d->mm.shadow_mode == SHM_logdirty) &&
1435 mark_dirty(&d->mm, pfn) )
1436 d->mm.shadow_dirty_block_count++;
1438 put_page(&frame_table[pfn]);
1439 break;
1441 /*
1442 * MMU_EXTENDED_COMMAND: Extended command is specified
1443 * in the least-significant bits of the 'value' field.
1444 */
1445 case MMU_EXTENDED_COMMAND:
1446 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
1447 okay = do_extended_command(req.ptr, req.val);
1448 break;
1450 default:
1451 MEM_LOG("Invalid page update command %08lx", req.ptr);
1452 break;
1455 if ( unlikely(!okay) )
1457 rc = -EINVAL;
1458 break;
1461 ureqs++;
1464 if ( prev_pfn != 0 )
1465 unmap_domain_mem((void *)va);
1467 if ( unlikely(prev_spl1e != 0) )
1468 unmap_domain_mem((void *)prev_spl1e);
1470 deferred_ops = percpu_info[cpu].deferred_ops;
1471 percpu_info[cpu].deferred_ops = 0;
1473 if ( deferred_ops & DOP_FLUSH_TLB )
1474 local_flush_tlb();
1476 if ( deferred_ops & DOP_RELOAD_LDT )
1477 (void)map_ldt_shadow_page(0);
1479 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1481 put_domain(percpu_info[cpu].foreign);
1482 percpu_info[cpu].foreign = NULL;
1485 if ( unlikely(success_count != NULL) )
1486 put_user(count, success_count);
1488 return rc;
1492 int do_update_va_mapping(unsigned long page_nr,
1493 unsigned long val,
1494 unsigned long flags)
1496 struct domain *d = current;
1497 int err = 0;
1498 unsigned int cpu = d->processor;
1499 unsigned long deferred_ops;
1501 perfc_incrc(calls_to_update_va);
1503 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
1504 return -EINVAL;
1506 cleanup_writable_pagetable(d, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
1508 /*
1509 * XXX When we make this support 4MB superpages we should also deal with
1510 * the case of updating L2 entries.
1511 */
1513 if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
1514 mk_l1_pgentry(val))) )
1515 err = -EINVAL;
1517 if ( unlikely(d->mm.shadow_mode) )
1519 unsigned long sval;
1521 l1pte_no_fault(&d->mm, &val, &sval);
1523 if ( unlikely(__put_user(sval, ((unsigned long *)(
1524 &shadow_linear_pg_table[page_nr])))) )
1526 /*
1527 * Since L2s are guaranteed RW, failure indicates the page was not
1528 * shadowed, so ignore.
1529 */
1530 perfc_incrc(shadow_update_va_fail);
1533 /*
1534 * If we're in log-dirty mode then we need to note that we've updated
1535 * the PTE in the PT-holding page. We need the machine frame number
1536 * for this.
1537 */
1538 if ( d->mm.shadow_mode == SHM_logdirty )
1539 mark_dirty( &current->mm, va_to_l1mfn(page_nr<<PAGE_SHIFT) );
1541 check_pagetable(d, d->mm.pagetable, "va"); /* debug */
1544 deferred_ops = percpu_info[cpu].deferred_ops;
1545 percpu_info[cpu].deferred_ops = 0;
1547 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
1548 unlikely(flags & UVMF_FLUSH_TLB) )
1549 local_flush_tlb();
1550 else if ( unlikely(flags & UVMF_INVLPG) )
1551 __flush_tlb_one(page_nr << PAGE_SHIFT);
1553 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
1554 (void)map_ldt_shadow_page(0);
1556 return err;
1559 int do_update_va_mapping_otherdomain(unsigned long page_nr,
1560 unsigned long val,
1561 unsigned long flags,
1562 domid_t domid)
1564 unsigned int cpu = smp_processor_id();
1565 struct domain *d;
1566 int rc;
1568 if ( unlikely(!IS_PRIV(current)) )
1569 return -EPERM;
1571 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
1572 if ( unlikely(d == NULL) )
1574 MEM_LOG("Unknown domain '%u'", domid);
1575 return -ESRCH;
1578 rc = do_update_va_mapping(page_nr, val, flags);
1580 put_domain(d);
1581 percpu_info[cpu].foreign = NULL;
1583 return rc;
1588 /*************************
1589 * Writable Pagetables
1590 */
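/*
 * Editorial summary, not part of the original file, based on the code below:
 * when a guest writes directly to one of its own L1 page-table pages,
 * ptwr_do_page_fault() catches the fault, snapshots the page, marks the
 * mapping writable (for the 'active' case it first unhooks the L1 from the
 * L2 so the table is no longer live), and lets the guest continue. A later
 * fault or cleanup point calls ptwr_flush(), which write-protects the page
 * again, compares it against the snapshot, and re-validates only the entries
 * that changed via get_page_from_l1e()/put_page_from_l1e().
 */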
1592 ptwr_info_t ptwr_info[NR_CPUS] =
1593 { [ 0 ... NR_CPUS-1 ] =
1595 .ptinfo[PTWR_PT_ACTIVE].l1va = 0,
1596 .ptinfo[PTWR_PT_ACTIVE].page = 0,
1597 .ptinfo[PTWR_PT_INACTIVE].l1va = 0,
1598 .ptinfo[PTWR_PT_INACTIVE].page = 0,
1600 };
1602 #ifdef VERBOSE
1603 int ptwr_debug = 0x0;
1604 #define PTWR_PRINTK(_f, _a...) \
1605 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
1606 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
1607 #else
1608 #define PTWR_PRINTK(_f, _a...) ((void)0)
1609 #endif
1611 void ptwr_flush(const int which)
1613 unsigned long pte, *ptep, l1va;
1614 l1_pgentry_t *pl1e;
1615 l2_pgentry_t *pl2e, nl2e;
1616 int cpu = smp_processor_id();
1617 int i;
1619 l1va = ptwr_info[cpu].ptinfo[which].l1va;
1620 ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
1622 /* make pt page write protected */
1623 if ( unlikely(__get_user(pte, ptep)) ) {
1624 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1625 domain_crash();
1627 PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
1628 PTWR_PRINT_WHICH, ptep, pte);
1629 pte &= ~_PAGE_RW;
1631 if ( unlikely(current->mm.shadow_mode) ) {
1632 unsigned long spte;
1633 l1pte_no_fault(&current->mm, &pte, &spte);
1634 __put_user( spte, (unsigned long *)&shadow_linear_pg_table
1635 [l1va>>PAGE_SHIFT] );
1638 if ( unlikely(__put_user(pte, ptep)) ) {
1639 MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
1640 domain_crash();
1642 __flush_tlb_one(l1va);
1643 PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
1644 PTWR_PRINT_WHICH, ptep, pte);
1646 pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
1647 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) {
1648 l1_pgentry_t ol1e, nl1e;
1649 ol1e = ptwr_info[cpu].ptinfo[which].page[i];
1650 nl1e = pl1e[i];
1651 if (likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)))
1652 continue;
1653 if (likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e) | _PAGE_RW))
1654 && readonly_page_from_l1e(nl1e))
1655 continue;
1656 if (unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT))
1657 put_page_from_l1e(ol1e, current);
1658 if (unlikely(!get_page_from_l1e(nl1e, current))) {
1659 MEM_LOG("ptwr: Could not re-validate l1 page\n");
1660 domain_crash();
1663 unmap_domain_mem(pl1e);
1665 if (which == PTWR_PT_ACTIVE && likely(!current->mm.shadow_mode)) {
1666 /* reconnect l1 page (no need if shadow mode) */
1667 pl2e = &linear_l2_table[ptwr_info[cpu].active_pteidx];
1668 nl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
1669 update_l2e(pl2e, *pl2e, nl2e);
1672 if ( unlikely(current->mm.shadow_mode) )
1674 unsigned long sstat =
1675 get_shadow_status(&current->mm, pte >> PAGE_SHIFT);
1677 if ( sstat & PSH_shadowed )
1679 int i;
1680 unsigned long spfn = sstat & PSH_pfn_mask;
1681 l1_pgentry_t *sl1e = map_domain_mem( spfn << PAGE_SHIFT );
1683 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1685 l1pte_no_fault(&current->mm,
1686 &l1_pgentry_val(
1687 ptwr_info[cpu].ptinfo[which].page[i]),
1688 &l1_pgentry_val(sl1e[i]));
1690 unmap_domain_mem(sl1e);
1691 put_shadow_status(&current->mm);
1696 ptwr_info[cpu].ptinfo[which].l1va = 0;
1699 int ptwr_do_page_fault(unsigned long addr)
1701 /* write page fault, check if we're trying to modify an l1 page table */
1702 unsigned long pte, pfn;
1703 struct pfn_info *page;
1704 l2_pgentry_t *pl2e, nl2e;
1705 int cpu = smp_processor_id();
1706 int which;
1708 #if 0
1709 PTWR_PRINTK("get user %p for va %08lx\n",
1710 &linear_pg_table[addr>>PAGE_SHIFT], addr);
1711 #endif
1713 /* Testing for page_present in the L2 avoids lots of unnecessary fixups */
1714 if ( (l2_pgentry_val(linear_l2_table[addr >> L2_PAGETABLE_SHIFT]) &
1715 _PAGE_PRESENT) &&
1716 (__get_user(pte, (unsigned long *)
1717 &linear_pg_table[addr >> PAGE_SHIFT]) == 0) )
1719 if ( (pte & _PAGE_RW) && (pte & _PAGE_PRESENT) )
1720 return 0; /* we can't help. Maybe shadow mode can? */
1722 pfn = pte >> PAGE_SHIFT;
1723 #if 0
1724 PTWR_PRINTK("check pte %08lx = pfn %08lx for va %08lx\n",
1725 pte, pfn, addr);
1726 #endif
1727 page = &frame_table[pfn];
1728 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
1730 u32 va_mask = page->u.inuse.type_info & PGT_va_mask;
1732 if ( unlikely(va_mask >= PGT_va_unknown) )
1733 domain_crash();
1734 va_mask >>= PGT_va_shift;
1736 pl2e = &linear_l2_table[va_mask];
1738 which = (l2_pgentry_val(*pl2e) >> PAGE_SHIFT != pfn) ?
1739 PTWR_PT_INACTIVE : PTWR_PT_ACTIVE;
1741 PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
1742 "pt for %08x, pfn %08lx\n", PTWR_PRINT_WHICH,
1743 addr, va_mask << L2_PAGETABLE_SHIFT, pfn);
1745 if ( ptwr_info[cpu].ptinfo[which].l1va )
1746 ptwr_flush(which);
1747 ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
1749 if (which == PTWR_PT_ACTIVE) {
1750 ptwr_info[cpu].active_pteidx = va_mask;
1751 if ( likely(!current->mm.shadow_mode) ) {
1752 /* disconnect l1 page (unnecessary in shadow mode) */
1753 nl2e = mk_l2_pgentry((l2_pgentry_val(*pl2e) &
1754 ~_PAGE_PRESENT));
1755 update_l2e(pl2e, *pl2e, nl2e);
1756 flush_tlb();
1760 ptwr_info[cpu].ptinfo[which].pl1e =
1761 map_domain_mem(pfn << PAGE_SHIFT);
1762 memcpy(ptwr_info[cpu].ptinfo[which].page,
1763 ptwr_info[cpu].ptinfo[which].pl1e,
1764 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
1766 /* make pt page writable */
1767 pte |= _PAGE_RW;
1768 PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
1769 &linear_pg_table[addr>>PAGE_SHIFT], pte);
1770 if ( unlikely(__put_user(pte, (unsigned long *)
1771 &linear_pg_table[addr>>PAGE_SHIFT])) ) {
1772 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
1773 &linear_pg_table[addr>>PAGE_SHIFT]);
1774 domain_crash();
1777 /* maybe fall through to shadow mode to propagate writeable L1 */
1778 return ( !current->mm.shadow_mode );
1781 return 0;
1784 static __init int ptwr_init(void)
1786 int i;
1788 for ( i = 0; i < smp_num_cpus; i++ )
1790 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
1791 (void *)alloc_xenheap_page();
1792 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
1793 (void *)alloc_xenheap_page();
1794 machine_to_phys_mapping[virt_to_phys(
1795 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page)>>PAGE_SHIFT] =
1796 INVALID_P2M_ENTRY;
1797 machine_to_phys_mapping[virt_to_phys(
1798 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page)>>PAGE_SHIFT] =
1799 INVALID_P2M_ENTRY;
1802 return 0;
1804 __initcall(ptwr_init);
1806 #ifndef NDEBUG
1807 void ptwr_status(void)
1809 unsigned long pte, *ptep, pfn;
1810 struct pfn_info *page;
1811 int cpu = smp_processor_id();
1813 ptep = (unsigned long *)&linear_pg_table
1814 [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
1816 if ( __get_user(pte, ptep) ) {
1817 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1818 domain_crash();
1821 pfn = pte >> PAGE_SHIFT;
1822 page = &frame_table[pfn];
1823 printk("need to alloc l1 page %p\n", page);
1824 /* make pt page writable */
1825 printk("need to make read-only l1-page at %p is %08lx\n",
1826 ptep, pte);
1828 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
1829 return;
1831 if ( __get_user(pte, (unsigned long *)
1832 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
1833 MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
1834 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
1835 domain_crash();
1837 pfn = pte >> PAGE_SHIFT;
1838 page = &frame_table[pfn];
1842 /************************************************************************/
1845 void audit_domain(struct domain *d)
1847 int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
1849 void adjust (struct pfn_info *page, int dir, int adjtype)
1851 int count = page->count_info & PGC_count_mask;
1853 if ( adjtype )
1855 int tcount = page->u.inuse.type_info & PGT_count_mask;
1857 ttot++;
1859 tcount += dir;
1861 if ( tcount < 0 )
1863 /* This will only come out once. */
1864 printk("Audit %d: type count whent below zero pfn=%x "
1865 "taf=%x otaf=%x\n",
1866 d->domain, page-frame_table,
1867 page->u.inuse.type_info,
1868 page->tlbflush_timestamp);
1871 page->u.inuse.type_info =
1872 (page->u.inuse.type_info & ~PGT_count_mask) |
1873 (tcount & PGT_count_mask);
1876 ctot++;
1877 count += dir;
1878 if ( count < 0 )
1880 /* This will only come out once. */
1881 printk("Audit %d: general count whent below zero pfn=%x "
1882 "taf=%x otaf=%x\n",
1883 d->domain, page-frame_table,
1884 page->u.inuse.type_info,
1885 page->tlbflush_timestamp);
1888 page->count_info =
1889 (page->count_info & ~PGC_count_mask) |
1890 (count & PGC_count_mask);
1894 void scan_for_pfn(struct domain *d, unsigned long xpfn)
1896 unsigned long pfn, *pt;
1897 struct list_head *list_ent;
1898 struct pfn_info *page;
1899 int i;
1901 list_ent = d->page_list.next;
1902 for ( i = 0; (list_ent != &d->page_list); i++ )
1904 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
1905 page = &frame_table[pfn];
1907 switch ( page->u.inuse.type_info & PGT_type_mask )
1909 case PGT_l1_page_table:
1910 case PGT_l2_page_table:
1911 pt = map_domain_mem(pfn<<PAGE_SHIFT);
1912 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1913 if ( (pt[i] & _PAGE_PRESENT) &&
1914 ((pt[i] >> PAGE_SHIFT) == xpfn) )
1915 printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
1916 d->domain, i, pfn, page->u.inuse.type_info,
1917 page->count_info);
1918 unmap_domain_mem(pt);
1921 list_ent = frame_table[pfn].list.next;
1926 void scan_for_pfn_remote(unsigned long xpfn)
1928 struct domain *e;
1929 for_each_domain ( e )
1930 scan_for_pfn( e, xpfn );
1933 int i;
1934 unsigned long pfn;
1935 struct list_head *list_ent;
1936 struct pfn_info *page;
1938 if ( d != current )
1939 domain_pause(d);
1940 synchronise_pagetables(~0UL);
1942 printk("pt base=%lx sh_info=%x\n",
1943 pagetable_val(d->mm.pagetable)>>PAGE_SHIFT,
1944 virt_to_page(d->shared_info)-frame_table);
1946 spin_lock(&d->page_alloc_lock);
1948 /* PHASE 0 */
1950 list_ent = d->page_list.next;
1951 for ( i = 0; (list_ent != &d->page_list); i++ )
1953 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
1954 page = &frame_table[pfn];
1956 if ( page->u.inuse.domain != d )
1957 BUG();
1959 if ( (page->u.inuse.type_info & PGT_count_mask) >
1960 (page->count_info & PGC_count_mask) )
1961 printk("taf > caf %x %x pfn=%lx\n",
1962 page->u.inuse.type_info, page->count_info, pfn );
1964 #if 0 /* SYSV shared memory pages plus writeable files. */
1965 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
1966 (page->u.inuse.type_info & PGT_count_mask) > 1 )
1968 printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
1969 pfn,
1970 page->u.inuse.type_info,
1971 page->count_info );
1972 scan_for_pfn_remote(pfn);
1974 #endif
1975 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
1976 (page->u.inuse.type_info & PGT_count_mask) > 1 )
1978 printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
1979 pfn,
1980 page->u.inuse.type_info,
1981 page->count_info );
1984 /* Use tlbflush_timestamp to store original type_info. */
1985 page->tlbflush_timestamp = page->u.inuse.type_info;
1987 list_ent = frame_table[pfn].list.next;
1991 /* PHASE 1 */
1993 adjust(&frame_table[pagetable_val(d->mm.pagetable)>>PAGE_SHIFT], -1, 1);
1995 list_ent = d->page_list.next;
1996 for ( i = 0; (list_ent != &d->page_list); i++ )
1998 unsigned long *pt;
1999 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2000 page = &frame_table[pfn];
2002 if ( page->u.inuse.domain != d )
2003 BUG();
2005 switch ( page->u.inuse.type_info & PGT_type_mask )
2007 case PGT_l2_page_table:
2009 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2010 printk("Audit %d: L2 not validated %x\n",
2011 d->domain, page->u.inuse.type_info);
2013 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2014 printk("Audit %d: L2 not pinned %x\n",
2015 d->domain, page->u.inuse.type_info);
2016 else
2017 adjust( page, -1, 1 );
2019 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2021 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2023 if ( pt[i] & _PAGE_PRESENT )
2025 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2026 struct pfn_info *l1page = &frame_table[l1pfn];
2028 if ( l1page->u.inuse.domain != d )
2030 printk("L2: Skip bizarre page belonging to other "
2031 "dom %p\n", l1page->u.inuse.domain);
2032 continue;
2035 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2036 PGT_l2_page_table )
2037 printk("Audit %d: [%x] Found %s Linear PT "
2038 "t=%x pfn=%lx\n", d->domain, i,
2039 (l1pfn==pfn) ? "Self" : "Other",
2040 l1page->u.inuse.type_info,
2041 l1pfn);
2042 else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
2043 PGT_l1_page_table )
2044 printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
2045 d->domain, i,
2046 l1page->u.inuse.type_info,
2047 l1pfn);
2049 adjust(l1page, -1, 1);
2053 unmap_domain_mem(pt);
2055 break;
2058 case PGT_l1_page_table:
2060 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2061 adjust( page, -1, 1 );
2063 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2064 printk("Audit %d: L1 not validated %x\n",
2065 d->domain, page->u.inuse.type_info);
2066 #if 0
2067 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2068 printk("Audit %d: L1 not pinned %x\n",
2069 d->domain, page->u.inuse.type_info);
2070 #endif
2071 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2073 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2075 if ( pt[i] & _PAGE_PRESENT )
2077 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2078 struct pfn_info *l1page = &frame_table[l1pfn];
2080 if ( l1pfn < 0x100 )
2082 lowmem_mappings++;
2083 continue;
2086 if ( l1pfn > max_page )
2088 io_mappings++;
2089 continue;
2092 if ( pt[i] & _PAGE_RW )
2095 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2096 PGT_l1_page_table ||
2097 (l1page->u.inuse.type_info & PGT_type_mask) ==
2098 PGT_l2_page_table )
2099 printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n",
2100 d->domain, i,
2101 l1page->u.inuse.type_info,
2102 l1pfn);
2106 if ( l1page->u.inuse.domain != d )
2108 printk("Audit %d: [%lx,%x] Skip foreign page dom=%lx "
2109 "pfn=%lx c=%08x t=%08x m2p=%lx\n",
2110 d->domain, pfn, i,
2111 (unsigned long)l1page->u.inuse.domain,
2112 l1pfn,
2113 l1page->count_info,
2114 l1page->u.inuse.type_info,
2115 machine_to_phys_mapping[l1pfn]);
2116 continue;
2119 adjust(l1page, -1, 0);
2123 unmap_domain_mem(pt);
2125 break;
2128 list_ent = frame_table[pfn].list.next;
2131 if ( (io_mappings > 0) || (lowmem_mappings > 0) )
2132 printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
2133 d->domain, lowmem_mappings, io_mappings);
2135 /* PHASE 2 */
2137 ctot = ttot = 0;
2138 list_ent = d->page_list.next;
2139 for ( i = 0; (list_ent != &d->page_list); i++ )
2141 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2142 page = &frame_table[pfn];
2144 switch ( page->u.inuse.type_info & PGT_type_mask)
2146 case PGT_l1_page_table:
2147 case PGT_l2_page_table:
2148 if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
2150 printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
2151 d->domain, page->u.inuse.type_info,
2152 page->tlbflush_timestamp,
2153 page->count_info, pfn );
2154 scan_for_pfn_remote(pfn);
2156 default:
2157 if ( (page->count_info & PGC_count_mask) != 1 )
2159 printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
2160 d->domain,
2161 page->count_info,
2162 page->u.inuse.type_info,
2163 page->tlbflush_timestamp, pfn );
2164 scan_for_pfn_remote(pfn);
2166 break;
2169 list_ent = frame_table[pfn].list.next;
2172 /* PHASE 3 */
2174 list_ent = d->page_list.next;
2175 for ( i = 0; (list_ent != &d->page_list); i++ )
2177 unsigned long *pt;
2178 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2179 page = &frame_table[pfn];
2181 switch ( page->u.inuse.type_info & PGT_type_mask )
2183 case PGT_l2_page_table:
2184 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2185 adjust( page, 1, 1 );
2187 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2189 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2191 if ( pt[i] & _PAGE_PRESENT )
2193 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2194 struct pfn_info *l1page = &frame_table[l1pfn];
2196 if ( l1page->u.inuse.domain == d)
2197 adjust(l1page, 1, 1);
2201 unmap_domain_mem(pt);
2202 break;
2204 case PGT_l1_page_table:
2205 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2206 adjust( page, 1, 1 );
2208 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2210 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2212 if ( pt[i] & _PAGE_PRESENT )
2214 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2215 struct pfn_info *l1page = &frame_table[l1pfn];
2217 if ( (l1page->u.inuse.domain != d) ||
2218 (l1pfn < 0x100) || (l1pfn > max_page) )
2219 continue;
2221 adjust(l1page, 1, 0);
2225 unmap_domain_mem(pt);
2226 break;
2230 page->tlbflush_timestamp = 0;
2232 list_ent = frame_table[pfn].list.next;
2235 spin_unlock(&d->page_alloc_lock);
2237 adjust(&frame_table[pagetable_val(d->mm.pagetable)>>PAGE_SHIFT], 1, 1);
2239 printk("Audit %d: Done. ctot=%d ttot=%d\n",d->domain, ctot, ttot );
2241 if ( d != current )
2242 domain_unpause(d);
2245 void audit_domains(void)
2247 struct domain *d;
2248 for_each_domain ( d )
2249 audit_domain(d);
2252 void audit_domains_key(unsigned char key, void *dev_id,
2253 struct pt_regs *regs)
2255 open_softirq(MEMAUDIT_SOFTIRQ, audit_domains);
2256 raise_softirq(MEMAUDIT_SOFTIRQ);
2260 #endif