debuggers.hg

view xen/arch/x86/memory.c @ 2663:f0ed7653341e

bitkeeper revision 1.1159.1.222 (416c00558HL2Jw-kOYa6NaZn4JiaJQ)

Cleaned up the writable p.t. code and fixed a bug when shadow mode
is enabled.
author kaf24@freefall.cl.cam.ac.uk
date Tue Oct 12 16:03:33 2004 +0000 (2004-10-12)
parents b7b15f4a7ebc
children 0174982516f6
line source
1 /******************************************************************************
2 * arch/x86/memory.c
3 *
4 * Copyright (c) 2002-2004 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may be used in none of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
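/*
 * A minimal guest-side sketch of the interface described above, assuming a
 * hypercall wrapper HYPERVISOR_mmu_update() taking the same arguments as
 * do_mmu_update() below; the command encodings (MMU_NORMAL_PT_UPDATE,
 * MMU_EXTENDED_COMMAND, MMUEXT_PIN_L1_TABLE) are the ones handled later in
 * this file, and the machine addresses are assumed to be known to the guest.
 */
#if 0 /* illustration only -- not built as part of Xen */
static void example_batched_update(unsigned long pte_maddr,
                                   unsigned long new_pte_val,
                                   unsigned long l1_frame_maddr)
{
    mmu_update_t req[2];
    int done;

    /* Request 0: a normal update, i.e. *pte_maddr = new_pte_val. */
    req[0].ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
    req[0].val = new_pte_val;

    /* Request 1: extended command -- pin the frame at l1_frame_maddr as an L1. */
    req[1].ptr = l1_frame_maddr | MMU_EXTENDED_COMMAND;
    req[1].val = MMUEXT_PIN_L1_TABLE;

    (void)HYPERVISOR_mmu_update(req, 2, &done);
}
#endif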
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/lib.h>
90 #include <xen/mm.h>
91 #include <xen/sched.h>
92 #include <xen/errno.h>
93 #include <xen/perfc.h>
94 #include <xen/irq.h>
95 #include <xen/softirq.h>
96 #include <asm/shadow.h>
97 #include <asm/page.h>
98 #include <asm/flushtlb.h>
99 #include <asm/io.h>
100 #include <asm/uaccess.h>
101 #include <asm/domain_page.h>
102 #include <asm/ldt.h>
104 #ifdef VERBOSE
105 #define MEM_LOG(_f, _a...) \
106 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
107 current->domain , __LINE__ , ## _a )
108 #else
109 #define MEM_LOG(_f, _a...) ((void)0)
110 #endif
112 static int alloc_l2_table(struct pfn_info *page);
113 static int alloc_l1_table(struct pfn_info *page);
114 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
115 static int get_page_and_type_from_pagenr(unsigned long page_nr,
116 u32 type,
117 struct domain *d);
119 static void free_l2_table(struct pfn_info *page);
120 static void free_l1_table(struct pfn_info *page);
122 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
123 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
125 /* Used to defer flushing of memory structures. */
126 static struct {
127 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
128 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
129 unsigned long deferred_ops;
130 unsigned long cr0;
131 /* If non-NULL, specifies a foreign subject domain for some operations. */
132 struct domain *foreign;
133 } percpu_info[NR_CPUS] __cacheline_aligned;
135 /*
136 * Returns the current foreign domain; defaults to the currently-executing
137 * domain if a foreign override hasn't been specified.
138 */
139 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current)
141 /* Private domain structs for DOMID_XEN and DOMID_IO. */
142 static struct domain *dom_xen, *dom_io;
144 void arch_init_memory(void)
145 {
146 unsigned long mfn;
148 /*
149 * We are rather picky about the layout of 'struct pfn_info'. The
150 * count_info and domain fields must be adjacent, as we perform atomic
151 * 64-bit operations on them. Also, just for sanity, we assert the size
152 * of the structure here.
153 */
154 if ( (offsetof(struct pfn_info, u.inuse.domain) !=
155 (offsetof(struct pfn_info, count_info) + sizeof(u32))) ||
156 (sizeof(struct pfn_info) != 24) )
157 {
158 printk("Weird pfn_info layout (%ld,%ld,%d)\n",
159 offsetof(struct pfn_info, count_info),
160 offsetof(struct pfn_info, u.inuse.domain),
161 sizeof(struct pfn_info));
162 for ( ; ; ) ;
163 }
165 memset(percpu_info, 0, sizeof(percpu_info));
167 /* Initialise to a magic of 0x55555555 so easier to spot bugs later. */
168 memset(machine_to_phys_mapping, 0x55, 4<<20);
170 /*
171 * Initialise our DOMID_XEN domain.
172 * Any Xen-heap pages that we will allow to be mapped will have
173 * their domain field set to dom_xen.
174 */
175 dom_xen = alloc_domain_struct();
176 atomic_set(&dom_xen->refcnt, 1);
177 dom_xen->domain = DOMID_XEN;
179 /*
180 * Initialise our DOMID_IO domain.
181 * This domain owns no pages but is considered a special case when
182 * mapping I/O pages, as the mappings occur at the privilege level of the caller.
183 */
184 dom_io = alloc_domain_struct();
185 atomic_set(&dom_io->refcnt, 1);
186 dom_io->domain = DOMID_IO;
188 /* M2P table is mappable read-only by privileged domains. */
189 for ( mfn = virt_to_phys(&machine_to_phys_mapping[0<<20])>>PAGE_SHIFT;
190 mfn < virt_to_phys(&machine_to_phys_mapping[1<<20])>>PAGE_SHIFT;
191 mfn++ )
192 {
193 frame_table[mfn].count_info = PGC_allocated | 1;
194 frame_table[mfn].u.inuse.type_info = PGT_gdt_page | 1; /* non-RW */
195 frame_table[mfn].u.inuse.domain = dom_xen;
196 }
197 }
199 static void __invalidate_shadow_ldt(struct domain *d)
200 {
201 int i;
202 unsigned long pfn;
203 struct pfn_info *page;
205 d->mm.shadow_ldt_mapcnt = 0;
207 for ( i = 16; i < 32; i++ )
208 {
209 pfn = l1_pgentry_to_pagenr(d->mm.perdomain_pt[i]);
210 if ( pfn == 0 ) continue;
211 d->mm.perdomain_pt[i] = mk_l1_pgentry(0);
212 page = &frame_table[pfn];
213 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
214 ASSERT_PAGE_IS_DOMAIN(page, d);
215 put_page_and_type(page);
216 }
218 /* Dispose of the (now possibly invalid) mappings from the TLB. */
219 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
220 }
223 static inline void invalidate_shadow_ldt(struct domain *d)
224 {
225 if ( d->mm.shadow_ldt_mapcnt != 0 )
226 __invalidate_shadow_ldt(d);
227 }
230 static int alloc_segdesc_page(struct pfn_info *page)
231 {
232 unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
233 int i;
235 for ( i = 0; i < 512; i++ )
236 if ( unlikely(!check_descriptor(&descs[i*2])) )
237 goto fail;
239 unmap_domain_mem(descs);
240 return 1;
242 fail:
243 unmap_domain_mem(descs);
244 return 0;
245 }
248 /* Map shadow page at offset @off. */
249 int map_ldt_shadow_page(unsigned int off)
250 {
251 struct domain *d = current;
252 unsigned long l1e;
254 if ( unlikely(in_irq()) )
255 BUG();
257 __get_user(l1e, (unsigned long *)&linear_pg_table[(d->mm.ldt_base >>
258 PAGE_SHIFT) + off]);
260 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
261 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
262 d, PGT_ldt_page)) )
263 return 0;
265 d->mm.perdomain_pt[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
266 d->mm.shadow_ldt_mapcnt++;
268 return 1;
269 }
272 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
273 {
274 struct pfn_info *page = &frame_table[page_nr];
276 if ( unlikely(!pfn_is_ram(page_nr)) )
277 {
278 MEM_LOG("Pfn %08lx is not RAM", page_nr);
279 return 0;
280 }
282 if ( unlikely(!get_page(page, d)) )
283 {
284 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
285 return 0;
286 }
288 return 1;
289 }
292 static int get_page_and_type_from_pagenr(unsigned long page_nr,
293 u32 type,
294 struct domain *d)
295 {
296 struct pfn_info *page = &frame_table[page_nr];
298 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
299 return 0;
301 if ( unlikely(!get_page_type(page, type)) )
302 {
303 #ifdef VERBOSE
304 if ( (type & PGT_type_mask) != PGT_l1_page_table )
305 MEM_LOG("Bad page type for pfn %08lx (%08x)",
306 page_nr, page->u.inuse.type_info);
307 #endif
308 put_page(page);
309 return 0;
310 }
312 return 1;
313 }
316 /*
317 * We allow L2 tables to map each other (a.k.a. linear page tables). This
318 * needs some special care with reference counts and access permissions:
319 * 1. The mapping entry must be read-only, or the guest may get write access
320 * to its own PTEs.
321 * 2. We must only bump the reference counts for an *already validated*
322 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
323 * on a validation that is required to complete that validation.
324 * 3. We only need to increment the reference counts for the mapped page
325 * frame if it is mapped by a different L2 table. This is sufficient and
326 * also necessary to allow validation of an L2 table mapping itself.
327 */
328 static int
329 get_linear_pagetable(
330 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
331 {
332 u32 x, y;
333 struct pfn_info *page;
335 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
336 {
337 MEM_LOG("Attempt to create linear p.t. with write perms");
338 return 0;
339 }
341 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
342 {
343 /* Make sure the mapped frame belongs to the correct domain. */
344 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
345 return 0;
347 /*
348 * Make sure that the mapped frame is an already-validated L2 table.
349 * If so, atomically increment the count (checking for overflow).
350 */
351 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
352 y = page->u.inuse.type_info;
353 do {
354 x = y;
355 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
356 unlikely((x & (PGT_type_mask|PGT_validated)) !=
357 (PGT_l2_page_table|PGT_validated)) )
358 {
359 put_page(page);
360 return 0;
361 }
362 }
363 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
364 }
366 return 1;
367 }
370 static int
371 get_page_from_l1e(
372 l1_pgentry_t l1e, struct domain *d)
373 {
374 unsigned long l1v = l1_pgentry_val(l1e);
375 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
376 struct pfn_info *page = &frame_table[pfn];
377 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
379 if ( !(l1v & _PAGE_PRESENT) )
380 return 1;
382 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
383 {
384 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
385 return 0;
386 }
388 if ( unlikely(!pfn_is_ram(pfn)) )
389 {
390 /* Revert to caller privileges if FD == DOMID_IO. */
391 if ( d == dom_io )
392 d = current;
394 if ( IS_PRIV(d) )
395 return 1;
397 if ( IS_CAPABLE_PHYSDEV(d) )
398 return domain_iomem_in_pfn(d, pfn);
400 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
401 return 0;
402 }
404 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
405 return 0;
407 if ( l1v & _PAGE_RW )
408 {
409 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
410 return 0;
411 }
413 return 1;
414 }
417 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
418 static int
419 get_page_from_l2e(
420 l2_pgentry_t l2e, unsigned long pfn,
421 struct domain *d, unsigned long va_idx)
422 {
423 int rc;
425 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
426 return 1;
428 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
429 {
430 MEM_LOG("Bad L2 page type settings %04lx",
431 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
432 return 0;
433 }
435 rc = get_page_and_type_from_pagenr(
436 l2_pgentry_to_pagenr(l2e),
437 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
439 if ( unlikely(!rc) )
440 return get_linear_pagetable(l2e, pfn, d);
442 return 1;
443 }
446 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
447 {
448 unsigned long l1v = l1_pgentry_val(l1e);
449 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
450 struct pfn_info *page = &frame_table[pfn];
451 struct domain *e = page->u.inuse.domain;
453 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
454 return;
456 if ( unlikely(e != d) )
457 {
458 /*
459 * Unmap a foreign page that may have been mapped via a grant table.
460 * Note that this can fail for a privileged domain that can map foreign
461 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
462 * counted via a grant entry and some counted directly in the page
463 * structure's reference count. Note that reference counts won't get
464 * dangerously confused as long as we always try to decrement the
465 * grant entry first. We may end up with a mismatch between which
466 * mappings and which unmappings are counted via the grant entry, but
467 * really it doesn't matter as privileged domains have carte blanche.
468 */
469 if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
470 return;
471 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
472 }
474 if ( l1v & _PAGE_RW )
475 {
476 put_page_and_type(page);
477 }
478 else
479 {
480 /* We expect this is rare so we blow the entire shadow LDT. */
481 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
482 PGT_ldt_page)) &&
483 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
484 invalidate_shadow_ldt(e);
485 put_page(page);
486 }
487 }
490 /*
491 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
492 * Note also that this automatically deals correctly with linear p.t.'s.
493 */
494 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
495 {
496 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
497 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
498 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
499 }
502 static int alloc_l2_table(struct pfn_info *page)
503 {
504 struct domain *d = page->u.inuse.domain;
505 unsigned long page_nr = page_to_pfn(page);
506 l2_pgentry_t *pl2e;
507 int i;
509 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
511 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) {
512 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
513 goto fail;
514 }
516 #if defined(__i386__)
517 /* Now we add our private high mappings. */
518 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
519 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
520 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
521 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
522 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
523 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
524 mk_l2_pgentry(__pa(page->u.inuse.domain->mm.perdomain_pt) |
525 __PAGE_HYPERVISOR);
526 #endif
528 unmap_domain_mem(pl2e);
529 return 1;
531 fail:
532 while ( i-- > 0 )
533 put_page_from_l2e(pl2e[i], page_nr);
535 unmap_domain_mem(pl2e);
536 return 0;
537 }
540 static int alloc_l1_table(struct pfn_info *page)
541 {
542 struct domain *d = page->u.inuse.domain;
543 unsigned long page_nr = page_to_pfn(page);
544 l1_pgentry_t *pl1e;
545 int i;
547 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
549 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
550 if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
551 goto fail;
553 unmap_domain_mem(pl1e);
554 return 1;
556 fail:
557 while ( i-- > 0 )
558 put_page_from_l1e(pl1e[i], d);
560 unmap_domain_mem(pl1e);
561 return 0;
562 }
565 static void free_l2_table(struct pfn_info *page)
566 {
567 unsigned long page_nr = page - frame_table;
568 l2_pgentry_t *pl2e;
569 int i;
571 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
573 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
574 put_page_from_l2e(pl2e[i], page_nr);
576 unmap_domain_mem(pl2e);
577 }
580 static void free_l1_table(struct pfn_info *page)
581 {
582 struct domain *d = page->u.inuse.domain;
583 unsigned long page_nr = page - frame_table;
584 l1_pgentry_t *pl1e;
585 int i;
587 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
589 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
590 put_page_from_l1e(pl1e[i], d);
592 unmap_domain_mem(pl1e);
593 }
596 static inline int update_l2e(l2_pgentry_t *pl2e,
597 l2_pgentry_t ol2e,
598 l2_pgentry_t nl2e)
599 {
600 unsigned long o = cmpxchg((unsigned long *)pl2e,
601 l2_pgentry_val(ol2e),
602 l2_pgentry_val(nl2e));
603 if ( o != l2_pgentry_val(ol2e) )
604 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
605 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
606 return (o == l2_pgentry_val(ol2e));
607 }
610 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
611 static int mod_l2_entry(l2_pgentry_t *pl2e,
612 l2_pgentry_t nl2e,
613 unsigned long pfn)
614 {
615 l2_pgentry_t ol2e;
616 unsigned long _ol2e;
618 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
619 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
620 {
621 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
622 return 0;
623 }
625 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
626 return 0;
627 ol2e = mk_l2_pgentry(_ol2e);
629 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
630 {
631 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
632 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
633 return update_l2e(pl2e, ol2e, nl2e);
635 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current,
636 ((unsigned long)pl2e &
637 ~PAGE_MASK) >> 2)) )
638 return 0;
640 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
641 {
642 put_page_from_l2e(nl2e, pfn);
643 return 0;
644 }
646 put_page_from_l2e(ol2e, pfn);
647 return 1;
648 }
650 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
651 return 0;
653 put_page_from_l2e(ol2e, pfn);
654 return 1;
655 }
658 static inline int update_l1e(l1_pgentry_t *pl1e,
659 l1_pgentry_t ol1e,
660 l1_pgentry_t nl1e)
661 {
662 unsigned long o = l1_pgentry_val(ol1e);
663 unsigned long n = l1_pgentry_val(nl1e);
665 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
666 unlikely(o != l1_pgentry_val(ol1e)) )
667 {
668 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
669 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
670 return 0;
671 }
673 return 1;
674 }
677 /* Update the L1 entry at pl1e to new value nl1e. */
678 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
679 {
680 l1_pgentry_t ol1e;
681 unsigned long _ol1e;
682 struct domain *d = current;
684 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
685 {
686 MEM_LOG("Bad get_user\n");
687 return 0;
688 }
690 ol1e = mk_l1_pgentry(_ol1e);
692 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
693 {
694 /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
695 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
696 return update_l1e(pl1e, ol1e, nl1e);
698 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
699 return 0;
701 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
702 {
703 put_page_from_l1e(nl1e, d);
704 return 0;
705 }
707 put_page_from_l1e(ol1e, d);
708 return 1;
709 }
711 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
712 return 0;
714 put_page_from_l1e(ol1e, d);
715 return 1;
716 }
719 int alloc_page_type(struct pfn_info *page, unsigned int type)
720 {
721 switch ( type )
722 {
723 case PGT_l1_page_table:
724 return alloc_l1_table(page);
725 case PGT_l2_page_table:
726 return alloc_l2_table(page);
727 case PGT_gdt_page:
728 case PGT_ldt_page:
729 return alloc_segdesc_page(page);
730 default:
731 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
732 type, page->u.inuse.type_info,
733 page->count_info);
734 BUG();
735 }
737 return 0;
738 }
741 void free_page_type(struct pfn_info *page, unsigned int type)
742 {
743 struct domain *d = page->u.inuse.domain;
745 switch ( type )
746 {
747 case PGT_l1_page_table:
748 free_l1_table(page);
749 break;
751 case PGT_l2_page_table:
752 free_l2_table(page);
753 break;
755 default:
756 BUG();
757 }
759 if ( unlikely(d->mm.shadow_mode) &&
760 (get_shadow_status(&d->mm, page_to_pfn(page)) & PSH_shadowed) )
761 {
762 unshadow_table(page_to_pfn(page), type);
763 put_shadow_status(&d->mm);
764 }
765 }
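/*
 * get_page_type() and put_page_type() below implement the TYPE_COUNT scheme
 * described at the top of this file. Taking a typed reference on a page whose
 * type count is zero and whose current type differs installs the new type
 * and, except for PGT_writable_page, validates the page contents via
 * alloc_page_type(). Further references to the same type simply increment the
 * count, while references requesting a conflicting type fail. Dropping the
 * last typed reference on a validated page-table page clears PGT_validated
 * and undoes the validation via free_page_type().
 */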
768 void put_page_type(struct pfn_info *page)
769 {
770 u32 nx, x, y = page->u.inuse.type_info;
772 again:
773 do {
774 x = y;
775 nx = x - 1;
777 ASSERT((x & PGT_count_mask) != 0);
779 /*
780 * The page should always be validated while a reference is held. The
781 * exception is during domain destruction, when we forcibly invalidate
782 * page-table pages if we detect a referential loop.
783 * See domain.c:relinquish_list().
784 */
785 ASSERT((x & PGT_validated) ||
786 test_bit(DF_DYING, &page->u.inuse.domain->flags));
788 if ( unlikely((nx & PGT_count_mask) == 0) )
789 {
790 /* Record TLB information for flush later. Races are harmless. */
791 page->tlbflush_timestamp = tlbflush_clock;
793 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
794 likely(nx & PGT_validated) )
795 {
796 /*
797 * Page-table pages must be unvalidated when count is zero. The
798 * 'free' is safe because the refcnt is non-zero and validated
799 * bit is clear => other ops will spin or fail.
800 */
801 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
802 x & ~PGT_validated)) != x) )
803 goto again;
804 /* We cleared the 'valid bit' so we do the clear up. */
805 free_page_type(page, x & PGT_type_mask);
806 /* Carry on, but with the 'valid bit' now clear. */
807 x &= ~PGT_validated;
808 nx &= ~PGT_validated;
809 }
810 }
811 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
812 (PGT_pinned | 1)) )
813 {
814 /* Page is now only pinned. Make the back pointer mutable again. */
815 nx |= PGT_va_mutable;
816 }
817 }
818 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
819 }
822 int get_page_type(struct pfn_info *page, u32 type)
823 {
824 u32 nx, x, y = page->u.inuse.type_info;
826 again:
827 do {
828 x = y;
829 nx = x + 1;
830 if ( unlikely((nx & PGT_count_mask) == 0) )
831 {
832 MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
833 return 0;
834 }
835 else if ( unlikely((x & PGT_count_mask) == 0) )
836 {
837 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
838 {
839 /*
840 * On type change we check to flush stale TLB entries. This
841 * may be unnecessary (e.g., page was GDT/LDT) but those
842 * circumstances should be very rare.
843 */
844 struct domain *d = page->u.inuse.domain;
845 if ( unlikely(NEED_FLUSH(tlbflush_time[d->processor],
846 page->tlbflush_timestamp)) )
847 {
848 perfc_incr(need_flush_tlb_flush);
849 flush_tlb_cpu(d->processor);
850 }
852 /* We lose existing type, back pointer, and validity. */
853 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
854 nx |= type;
856 /* No special validation needed for writable pages. */
857 /* Page tables and GDT/LDT need to be scanned for validity. */
858 if ( type == PGT_writable_page )
859 nx |= PGT_validated;
860 }
861 }
862 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
863 {
864 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
865 {
866 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
867 ((type & PGT_type_mask) != PGT_l1_page_table) )
868 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
869 x & PGT_type_mask, type, page_to_pfn(page));
870 return 0;
871 }
872 else if ( (x & PGT_va_mask) == PGT_va_mutable )
873 {
874 /* The va backpointer is mutable, hence we update it. */
875 nx &= ~PGT_va_mask;
876 nx |= type; /* we know the actual type is correct */
877 }
878 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
879 {
880 /* This table is potentially mapped at multiple locations. */
881 nx &= ~PGT_va_mask;
882 nx |= PGT_va_unknown;
883 }
884 }
885 else if ( unlikely(!(x & PGT_validated)) )
886 {
887 /* Someone else is updating validation of this page. Wait... */
888 while ( (y = page->u.inuse.type_info) == x )
889 {
890 rep_nop();
891 barrier();
892 }
893 goto again;
894 }
895 }
896 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
898 if ( unlikely(!(nx & PGT_validated)) )
899 {
900 /* Try to validate page type; drop the new reference on failure. */
901 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
902 {
903 MEM_LOG("Error while validating pfn %08lx for type %08x."
904 " caf=%08x taf=%08x\n",
905 page_to_pfn(page), type,
906 page->count_info,
907 page->u.inuse.type_info);
908 /* No one else can get a reference. We hold the only ref. */
909 page->u.inuse.type_info = 0;
910 return 0;
911 }
913 /* No one else is updating simultaneously. */
914 __set_bit(_PGT_validated, &page->u.inuse.type_info);
915 }
917 return 1;
918 }
921 static int do_extended_command(unsigned long ptr, unsigned long val)
922 {
923 int okay = 1, cpu = smp_processor_id();
924 unsigned int cmd = val & MMUEXT_CMD_MASK;
925 unsigned long pfn = ptr >> PAGE_SHIFT;
926 unsigned long old_base_pfn;
927 struct pfn_info *page = &frame_table[pfn];
928 struct domain *d = current, *nd, *e;
929 u32 x, y;
930 domid_t domid;
931 grant_ref_t gntref;
933 switch ( cmd )
934 {
935 case MMUEXT_PIN_L1_TABLE:
936 case MMUEXT_PIN_L2_TABLE:
937 /*
938 * We insist that, if you pin an L1 page, it's the first thing that
939 * you do to it. This is because we require the backptr to still be
940 * mutable. This assumption seems safe.
941 */
942 okay = get_page_and_type_from_pagenr(
943 pfn,
944 ((cmd==MMUEXT_PIN_L2_TABLE) ?
945 PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
946 FOREIGNDOM);
948 if ( unlikely(!okay) )
949 {
950 MEM_LOG("Error while pinning pfn %08lx", pfn);
951 break;
952 }
954 if ( unlikely(test_and_set_bit(_PGT_pinned,
955 &page->u.inuse.type_info)) )
956 {
957 MEM_LOG("Pfn %08lx already pinned", pfn);
958 put_page_and_type(page);
959 okay = 0;
960 break;
961 }
963 break;
965 case MMUEXT_UNPIN_TABLE:
966 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
967 {
968 MEM_LOG("Page %08lx bad domain (dom=%p)",
969 ptr, page->u.inuse.domain);
970 }
971 else if ( likely(test_and_clear_bit(_PGT_pinned,
972 &page->u.inuse.type_info)) )
973 {
974 put_page_and_type(page);
975 put_page(page);
976 }
977 else
978 {
979 okay = 0;
980 put_page(page);
981 MEM_LOG("Pfn %08lx not pinned", pfn);
982 }
983 break;
985 case MMUEXT_NEW_BASEPTR:
986 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
987 if ( likely(okay) )
988 {
989 invalidate_shadow_ldt(d);
991 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
992 old_base_pfn = pagetable_val(d->mm.pagetable) >> PAGE_SHIFT;
993 d->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
995 shadow_mk_pagetable(&d->mm);
997 write_ptbase(&d->mm);
999 put_page_and_type(&frame_table[old_base_pfn]);
1001 /*
1002 * Note that we tick the clock /after/ dropping the old base's
1003 * reference count. If the page tables got freed then this will
1004 * avoid unnecessary TLB flushes when the pages are reused. */
1005 tlb_clocktick();
1006 }
1007 else
1008 {
1009 MEM_LOG("Error while installing new baseptr %08lx", ptr);
1010 }
1011 break;
1013 case MMUEXT_TLB_FLUSH:
1014 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1015 break;
1017 case MMUEXT_INVLPG:
1018 __flush_tlb_one(ptr);
1019 break;
1021 case MMUEXT_FLUSH_CACHE:
1022 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1023 {
1024 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1025 okay = 0;
1026 }
1027 else
1028 {
1029 wbinvd();
1030 }
1031 break;
1033 case MMUEXT_SET_LDT:
1034 {
1035 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
1036 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1037 (ents > 8192) ||
1038 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
1039 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
1040 {
1041 okay = 0;
1042 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
1043 }
1044 else if ( (d->mm.ldt_ents != ents) ||
1045 (d->mm.ldt_base != ptr) )
1046 {
1047 invalidate_shadow_ldt(d);
1048 d->mm.ldt_base = ptr;
1049 d->mm.ldt_ents = ents;
1050 load_LDT(d);
1051 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1052 if ( ents != 0 )
1053 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1054 }
1055 break;
1056 }
1058 case MMUEXT_SET_FOREIGNDOM:
1059 domid = (domid_t)(val >> 16);
1061 if ( (e = percpu_info[cpu].foreign) != NULL )
1062 put_domain(e);
1063 percpu_info[cpu].foreign = NULL;
1065 if ( !IS_PRIV(d) )
1066 {
1067 switch ( domid )
1068 {
1069 case DOMID_IO:
1070 get_knownalive_domain(dom_io);
1071 percpu_info[cpu].foreign = dom_io;
1072 break;
1073 default:
1074 MEM_LOG("Dom %u cannot set foreign dom\n", d->domain);
1075 okay = 0;
1076 break;
1077 }
1078 }
1079 else
1080 {
1081 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1082 if ( e == NULL )
1083 {
1084 switch ( domid )
1085 {
1086 case DOMID_XEN:
1087 get_knownalive_domain(dom_xen);
1088 percpu_info[cpu].foreign = dom_xen;
1089 break;
1090 case DOMID_IO:
1091 get_knownalive_domain(dom_io);
1092 percpu_info[cpu].foreign = dom_io;
1093 break;
1094 default:
1095 MEM_LOG("Unknown domain '%u'", domid);
1096 okay = 0;
1097 break;
1098 }
1099 }
1100 }
1101 break;
1103 case MMUEXT_TRANSFER_PAGE:
1104 domid = (domid_t)(val >> 16);
1105 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
1107 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
1108 unlikely(!pfn_is_ram(pfn)) ||
1109 unlikely((e = find_domain_by_id(domid)) == NULL) )
1110 {
1111 MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
1112 okay = 0;
1113 break;
1114 }
1116 spin_lock(&d->page_alloc_lock);
1118 /*
1119 * The tricky bit: atomically release ownership while there is just one
1120 * benign reference to the page (PGC_allocated). If that reference
1121 * disappears then the deallocation routine will safely spin.
1122 */
1123 nd = page->u.inuse.domain;
1124 y = page->count_info;
1125 do {
1126 x = y;
1127 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1128 (1|PGC_allocated)) ||
1129 unlikely(nd != d) )
1130 {
1131 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1132 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1133 d, d->domain, nd, x, page->u.inuse.type_info);
1134 spin_unlock(&d->page_alloc_lock);
1135 put_domain(e);
1136 return 0;
1137 }
1138 __asm__ __volatile__(
1139 LOCK_PREFIX "cmpxchg8b %2"
1140 : "=d" (nd), "=a" (y),
1141 "=m" (*(volatile u64 *)(&page->count_info))
1142 : "0" (d), "1" (x), "c" (NULL), "b" (x) );
1143 }
1144 while ( unlikely(nd != d) || unlikely(y != x) );
1146 /*
1147 * Unlink from 'd'. At least one reference remains (now anonymous), so
1148 * no one else is spinning to try to delete this page from 'd'.
1149 */
1150 d->tot_pages--;
1151 list_del(&page->list);
1153 spin_unlock(&d->page_alloc_lock);
1155 spin_lock(&e->page_alloc_lock);
1157 /*
1158 * Check that 'e' will accept the page and has reservation headroom.
1159 * Also, a domain mustn't have PGC_allocated pages when it is dying.
1160 */
1161 ASSERT(e->tot_pages <= e->max_pages);
1162 if ( unlikely(test_bit(DF_DYING, &e->flags)) ||
1163 unlikely(e->tot_pages == e->max_pages) ||
1164 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
1165 {
1166 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1167 "provided a bad grant ref, or is dying (%08lx).\n",
1168 e->tot_pages, e->max_pages, e->flags);
1169 spin_unlock(&e->page_alloc_lock);
1170 put_domain(e);
1171 okay = 0;
1172 break;
1173 }
1175 /* Okay, add the page to 'e'. */
1176 if ( unlikely(e->tot_pages++ == 0) )
1177 get_knownalive_domain(e);
1178 list_add_tail(&page->list, &e->page_list);
1179 page->u.inuse.domain = e;
1181 spin_unlock(&e->page_alloc_lock);
1183 /* Transfer is all done: tell the guest about its new page frame. */
1184 gnttab_notify_transfer(e, gntref, pfn);
1186 put_domain(e);
1187 break;
1189 case MMUEXT_REASSIGN_PAGE:
1190 if ( unlikely(!IS_PRIV(d)) )
1191 {
1192 MEM_LOG("Dom %u has no reassignment priv", d->domain);
1193 okay = 0;
1194 break;
1195 }
1197 e = percpu_info[cpu].foreign;
1198 if ( unlikely(e == NULL) )
1199 {
1200 MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
1201 okay = 0;
1202 break;
1203 }
1205 /*
1206 * Grab both page_list locks, in order. This prevents the page from
1207 * disappearing elsewhere while we modify the owner, and we'll need
1208 * both locks if we're successful so that we can change lists.
1209 */
1210 if ( d < e )
1211 {
1212 spin_lock(&d->page_alloc_lock);
1213 spin_lock(&e->page_alloc_lock);
1214 }
1215 else
1216 {
1217 spin_lock(&e->page_alloc_lock);
1218 spin_lock(&d->page_alloc_lock);
1219 }
1221 /* A domain shouldn't have PGC_allocated pages when it is dying. */
1222 if ( unlikely(test_bit(DF_DYING, &e->flags)) ||
1223 unlikely(IS_XEN_HEAP_FRAME(page)) )
1224 {
1225 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
1226 okay = 0;
1227 goto reassign_fail;
1228 }
1230 /*
1231 * The tricky bit: atomically change owner while there is just one
1232 * benign reference to the page (PGC_allocated). If that reference
1233 * disappears then the deallocation routine will safely spin.
1234 */
1235 nd = page->u.inuse.domain;
1236 y = page->count_info;
1237 do {
1238 x = y;
1239 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1240 (1|PGC_allocated)) ||
1241 unlikely(nd != d) )
1242 {
1243 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1244 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1245 d, d->domain, nd, x, page->u.inuse.type_info);
1246 okay = 0;
1247 goto reassign_fail;
1248 }
1249 __asm__ __volatile__(
1250 LOCK_PREFIX "cmpxchg8b %3"
1251 : "=d" (nd), "=a" (y), "=c" (e),
1252 "=m" (*(volatile u64 *)(&page->count_info))
1253 : "0" (d), "1" (x), "c" (e), "b" (x) );
1254 }
1255 while ( unlikely(nd != d) || unlikely(y != x) );
1257 /*
1258 * Unlink from 'd'. We transferred at least one reference to 'e', so
1259 * no one else is spinning to try to delete this page from 'd'.
1260 */
1261 d->tot_pages--;
1262 list_del(&page->list);
1264 /*
1265 * Add the page to 'e'. Someone may already have removed the last
1266 * reference and want to remove the page from 'e'. However, we have
1267 * the lock so they'll spin waiting for us.
1268 */
1269 if ( unlikely(e->tot_pages++ == 0) )
1270 get_knownalive_domain(e);
1271 list_add_tail(&page->list, &e->page_list);
1273 reassign_fail:
1274 spin_unlock(&d->page_alloc_lock);
1275 spin_unlock(&e->page_alloc_lock);
1276 break;
1278 case MMUEXT_CLEAR_FOREIGNDOM:
1279 if ( (e = percpu_info[cpu].foreign) != NULL )
1280 put_domain(e);
1281 percpu_info[cpu].foreign = NULL;
1282 break;
1284 default:
1285 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
1286 okay = 0;
1287 break;
1288 }
1290 return okay;
1291 }
1294 int do_mmu_update(mmu_update_t *ureqs, int count, int *success_count)
1296 mmu_update_t req;
1297 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
1298 struct pfn_info *page;
1299 int rc = 0, okay = 1, i, cpu = smp_processor_id();
1300 unsigned int cmd;
1301 unsigned long prev_spfn = 0;
1302 l1_pgentry_t *prev_spl1e = 0;
1303 struct domain *d = current;
1304 u32 type_info;
1306 perfc_incrc(calls_to_mmu_update);
1307 perfc_addc(num_page_updates, count);
1309 cleanup_writable_pagetable(d, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
1311 if ( unlikely(!access_ok(VERIFY_READ, ureqs, count * sizeof(req))) )
1312 return -EFAULT;
1314 for ( i = 0; i < count; i++ )
1316 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
1318 MEM_LOG("Bad __copy_from_user");
1319 rc = -EFAULT;
1320 break;
1323 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
1324 pfn = req.ptr >> PAGE_SHIFT;
1326 okay = 0;
1328 switch ( cmd )
1330 /*
1331 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
1332 */
1333 case MMU_NORMAL_PT_UPDATE:
1334 if ( unlikely(!get_page_from_pagenr(pfn, current)) )
1336 MEM_LOG("Could not get page for normal update");
1337 break;
1340 if ( likely(prev_pfn == pfn) )
1342 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
1344 else
1346 if ( prev_pfn != 0 )
1347 unmap_domain_mem((void *)va);
1348 va = (unsigned long)map_domain_mem(req.ptr);
1349 prev_pfn = pfn;
1352 page = &frame_table[pfn];
1353 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
1355 case PGT_l1_page_table:
1356 if ( likely(get_page_type(
1357 page, type_info & (PGT_type_mask|PGT_va_mask))) )
1359 okay = mod_l1_entry((l1_pgentry_t *)va,
1360 mk_l1_pgentry(req.val));
1362 if ( unlikely(d->mm.shadow_mode) && okay &&
1363 (get_shadow_status(&d->mm, page-frame_table) &
1364 PSH_shadowed) )
1366 shadow_l1_normal_pt_update(
1367 req.ptr, req.val, &prev_spfn, &prev_spl1e);
1368 put_shadow_status(&d->mm);
1371 put_page_type(page);
1373 break;
1374 case PGT_l2_page_table:
1375 if ( likely(get_page_type(page, PGT_l2_page_table)) )
1377 okay = mod_l2_entry((l2_pgentry_t *)va,
1378 mk_l2_pgentry(req.val),
1379 pfn);
1381 if ( unlikely(d->mm.shadow_mode) && okay &&
1382 (get_shadow_status(&d->mm, page-frame_table) &
1383 PSH_shadowed) )
1385 shadow_l2_normal_pt_update(req.ptr, req.val);
1386 put_shadow_status(&d->mm);
1389 put_page_type(page);
1391 break;
1392 default:
1393 if ( likely(get_page_type(page, PGT_writable_page)) )
1395 *(unsigned long *)va = req.val;
1396 okay = 1;
1397 put_page_type(page);
1399 break;
1402 put_page(page);
1403 break;
1405 case MMU_MACHPHYS_UPDATE:
1406 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
1408 MEM_LOG("Could not get page for mach->phys update");
1409 break;
1412 machine_to_phys_mapping[pfn] = req.val;
1413 okay = 1;
1415 /*
1416 * If in log-dirty mode, mark the corresponding pseudo-physical
1417 * page as dirty.
1418 */
1419 if ( unlikely(d->mm.shadow_mode == SHM_logdirty) &&
1420 mark_dirty(&d->mm, pfn) )
1421 d->mm.shadow_dirty_block_count++;
1423 put_page(&frame_table[pfn]);
1424 break;
1426 /*
1427 * MMU_EXTENDED_COMMAND: Extended command is specified
1428 * in the least-significant bits of the 'value' field.
1429 */
1430 case MMU_EXTENDED_COMMAND:
1431 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
1432 okay = do_extended_command(req.ptr, req.val);
1433 break;
1435 default:
1436 MEM_LOG("Invalid page update command %08lx", req.ptr);
1437 break;
1440 if ( unlikely(!okay) )
1442 rc = -EINVAL;
1443 break;
1446 ureqs++;
1449 if ( prev_pfn != 0 )
1450 unmap_domain_mem((void *)va);
1452 if ( unlikely(prev_spl1e != 0) )
1453 unmap_domain_mem((void *)prev_spl1e);
1455 deferred_ops = percpu_info[cpu].deferred_ops;
1456 percpu_info[cpu].deferred_ops = 0;
1458 if ( deferred_ops & DOP_FLUSH_TLB )
1459 local_flush_tlb();
1461 if ( deferred_ops & DOP_RELOAD_LDT )
1462 (void)map_ldt_shadow_page(0);
1464 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1466 put_domain(percpu_info[cpu].foreign);
1467 percpu_info[cpu].foreign = NULL;
1470 if ( unlikely(success_count != NULL) )
1471 put_user(count, success_count);
1473 return rc;
1477 int do_update_va_mapping(unsigned long page_nr,
1478 unsigned long val,
1479 unsigned long flags)
1481 struct domain *d = current;
1482 int err = 0;
1483 unsigned int cpu = d->processor;
1484 unsigned long deferred_ops;
1486 perfc_incrc(calls_to_update_va);
1488 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
1489 return -EINVAL;
1491 cleanup_writable_pagetable(d, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
1493 /*
1494 * XXX When we make this support 4MB superpages we should also deal with
1495 * the case of updating L2 entries.
1496 */
1498 if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
1499 mk_l1_pgentry(val))) )
1500 err = -EINVAL;
1502 if ( unlikely(d->mm.shadow_mode) )
1504 unsigned long sval;
1506 l1pte_no_fault(&d->mm, &val, &sval);
1508 if ( unlikely(__put_user(sval, ((unsigned long *)(
1509 &shadow_linear_pg_table[page_nr])))) )
1511 /*
1512 * Since L2's are guaranteed RW, failure indicates the page was not
1513 * shadowed, so ignore.
1514 */
1515 perfc_incrc(shadow_update_va_fail);
1518 /*
1519 * If we're in log-dirty mode then we need to note that we've updated
1520 * the PTE in the PT-holding page. We need the machine frame number
1521 * for this.
1522 */
1523 if ( d->mm.shadow_mode == SHM_logdirty )
1524 mark_dirty( &current->mm, va_to_l1mfn(page_nr<<PAGE_SHIFT) );
1526 check_pagetable(d, d->mm.pagetable, "va"); /* debug */
1529 deferred_ops = percpu_info[cpu].deferred_ops;
1530 percpu_info[cpu].deferred_ops = 0;
1532 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
1533 unlikely(flags & UVMF_FLUSH_TLB) )
1534 local_flush_tlb();
1535 else if ( unlikely(flags & UVMF_INVLPG) )
1536 __flush_tlb_one(page_nr << PAGE_SHIFT);
1538 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
1539 (void)map_ldt_shadow_page(0);
1541 return err;
1544 int do_update_va_mapping_otherdomain(unsigned long page_nr,
1545 unsigned long val,
1546 unsigned long flags,
1547 domid_t domid)
1549 unsigned int cpu = smp_processor_id();
1550 struct domain *d;
1551 int rc;
1553 if ( unlikely(!IS_PRIV(current)) )
1554 return -EPERM;
1556 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
1557 if ( unlikely(d == NULL) )
1559 MEM_LOG("Unknown domain '%u'", domid);
1560 return -ESRCH;
1563 rc = do_update_va_mapping(page_nr, val, flags);
1565 put_domain(d);
1566 percpu_info[cpu].foreign = NULL;
1568 return rc;
1573 /*************************
1574 * Writable Pagetables
1575 */
1577 ptwr_info_t ptwr_info[NR_CPUS];
1579 #ifdef VERBOSE
1580 int ptwr_debug = 0x0;
1581 #define PTWR_PRINTK(_f, _a...) \
1582 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
1583 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
1584 #else
1585 #define PTWR_PRINTK(_f, _a...) ((void)0)
1586 #endif
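/*
 * Overview of the writable-pagetable code below: a guest write to one of its
 * own (read-only) L1 page-table pages faults into ptwr_do_page_fault(), which
 * records the page, disconnects it from the current address space if it is
 * the ACTIVE one, snapshots its contents into ptwr_info[], and temporarily
 * maps it writable for the guest. ptwr_flush() later write-protects the page
 * again, compares it against the snapshot, revalidates only the entries that
 * actually changed, and (in the ACTIVE case) reconnects the L1 into the L2.
 */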
1588 /* Flush the given writable p.t. page and write-protect it again. */
1589 void ptwr_flush(const int which)
1591 unsigned long sstat, spte, pte, *ptep, l1va;
1592 l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e;
1593 l2_pgentry_t *pl2e, nl2e;
1594 int i, cpu = smp_processor_id();
1595 struct domain *d = current;
1597 l1va = ptwr_info[cpu].ptinfo[which].l1va;
1598 ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
1600 /*
1601 * STEP 1. Write-protect the p.t. page so no more updates can occur.
1602 */
1604 if ( unlikely(__get_user(pte, ptep)) )
1606 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1607 domain_crash();
1609 PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
1610 PTWR_PRINT_WHICH, ptep, pte);
1611 pte &= ~_PAGE_RW;
1613 if ( unlikely(d->mm.shadow_mode) )
1615 /* Write-protect the p.t. page in the shadow page table. */
1616 l1pte_no_fault(&d->mm, &pte, &spte);
1617 __put_user(
1618 spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
1620 /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
1621 sstat = get_shadow_status(&d->mm, pte >> PAGE_SHIFT);
1622 if ( sstat & PSH_shadowed )
1623 sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
1626 /* Write-protect the p.t. page in the guest page table. */
1627 if ( unlikely(__put_user(pte, ptep)) )
1629 MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
1630 domain_crash();
1633 /* Ensure that there are no stale writable mappings in any TLB. */
1634 __flush_tlb_one(l1va);
1635 PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
1636 PTWR_PRINT_WHICH, ptep, pte);
1638 /*
1639 * STEP 2. Validate any modified PTEs.
1640 */
1642 pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
1643 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1645 ol1e = ptwr_info[cpu].ptinfo[which].page[i];
1646 nl1e = pl1e[i];
1648 if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
1649 continue;
1651 /*
1652 * Fast path for PTEs that have merely been write-protected
1653 * (e.g., during a Unix fork()). A strict reduction in privilege.
1654 */
1655 if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
1657 if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
1659 if ( unlikely(sl1e != NULL) )
1660 l1pte_no_fault(
1661 &d->mm, &l1_pgentry_val(nl1e),
1662 &l1_pgentry_val(sl1e[i]));
1663 put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
1665 continue;
1668 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
1670 MEM_LOG("ptwr: Could not re-validate l1 page\n");
1671 domain_crash();
1674 if ( unlikely(sl1e != NULL) )
1675 l1pte_no_fault(
1676 &d->mm, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
1678 if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
1679 put_page_from_l1e(ol1e, d);
1681 unmap_domain_mem(pl1e);
1683 /*
1684 * STEP 3. Reattach the L1 p.t. page into the current address space.
1685 */
1687 if ( (which == PTWR_PT_ACTIVE) && likely(!d->mm.shadow_mode) )
1689 pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
1690 nl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
1691 update_l2e(pl2e, *pl2e, nl2e);
1694 /*
1695 * STEP 4. Final tidy-up.
1696 */
1698 ptwr_info[cpu].ptinfo[which].l1va = 0;
1700 if ( unlikely(sl1e != NULL) )
1702 unmap_domain_mem(sl1e);
1703 put_shadow_status(&d->mm);
1707 /* Write page fault handler: check if guest is trying to modify a PTE. */
1708 int ptwr_do_page_fault(unsigned long addr)
1710 unsigned long pte, pfn;
1711 struct pfn_info *page;
1712 l2_pgentry_t *pl2e, nl2e;
1713 int which, cpu = smp_processor_id();
1714 u32 l2_idx;
1716 /*
1717 * Attempt to read the PTE that maps the VA being accessed. By checking for
1718 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
1719 */
1720 if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
1721 _PAGE_PRESENT) ||
1722 __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
1723 return 0;
1725 pfn = pte >> PAGE_SHIFT;
1726 page = &frame_table[pfn];
1728 /* We are looking only for read-only mappings of p.t. pages. */
1729 if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
1730 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
1731 return 0;
1733 /* Get the L2 index at which this L1 p.t. is always mapped. */
1734 l2_idx = page->u.inuse.type_info & PGT_va_mask;
1735 if ( unlikely(l2_idx >= PGT_va_unknown) )
1736 domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
1737 l2_idx >>= PGT_va_shift;
1739 /*
1740 * Is the L1 p.t. mapped into the current address space? If so we call it
1741 * an ACTIVE p.t., otherwise it is INACTIVE.
1742 */
1743 pl2e = &linear_l2_table[l2_idx];
1744 which = (l2_pgentry_val(*pl2e) >> PAGE_SHIFT != pfn) ?
1745 PTWR_PT_INACTIVE : PTWR_PT_ACTIVE;
1747 PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
1748 "pfn %08lx\n", PTWR_PRINT_WHICH,
1749 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
1751 /*
1752 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
1753 * time. If there is already one, we must flush it out.
1754 */
1755 if ( ptwr_info[cpu].ptinfo[which].l1va )
1756 ptwr_flush(which);
1758 ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
1759 ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
1761 /* For safety, disconnect the L1 p.t. page from current space. */
1762 if ( (which == PTWR_PT_ACTIVE) && likely(!current->mm.shadow_mode) )
1764 nl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) & ~_PAGE_PRESENT);
1765 update_l2e(pl2e, *pl2e, nl2e);
1766 flush_tlb();
1769 /* Temporarily map the L1 page, and make a copy of it. */
1770 ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
1771 memcpy(ptwr_info[cpu].ptinfo[which].page,
1772 ptwr_info[cpu].ptinfo[which].pl1e,
1773 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
1775 /* Finally, make the p.t. page writable by the guest OS. */
1776 pte |= _PAGE_RW;
1777 PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
1778 &linear_pg_table[addr>>PAGE_SHIFT], pte);
1779 if ( unlikely(__put_user(pte, (unsigned long *)
1780 &linear_pg_table[addr>>PAGE_SHIFT])) )
1782 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
1783 &linear_pg_table[addr>>PAGE_SHIFT]);
1784 domain_crash();
1787 /* Maybe fall through to shadow mode to propagate writable L1. */
1788 return !current->mm.shadow_mode;
1791 static __init int ptwr_init(void)
1793 int i;
1795 for ( i = 0; i < smp_num_cpus; i++ )
1797 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
1798 (void *)alloc_xenheap_page();
1799 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
1800 (void *)alloc_xenheap_page();
1803 return 0;
1805 __initcall(ptwr_init);
1810 /************************************************************************/
1811 /************************************************************************/
1812 /************************************************************************/
1814 #ifndef NDEBUG
1816 void ptwr_status(void)
1818 unsigned long pte, *ptep, pfn;
1819 struct pfn_info *page;
1820 int cpu = smp_processor_id();
1822 ptep = (unsigned long *)&linear_pg_table
1823 [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
1825 if ( __get_user(pte, ptep) ) {
1826 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1827 domain_crash();
1830 pfn = pte >> PAGE_SHIFT;
1831 page = &frame_table[pfn];
1832 printk("need to alloc l1 page %p\n", page);
1833 /* make pt page writable */
1834 printk("need to make read-only l1-page at %p is %08lx\n",
1835 ptep, pte);
1837 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
1838 return;
1840 if ( __get_user(pte, (unsigned long *)
1841 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
1842 MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
1843 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
1844 domain_crash();
1846 pfn = pte >> PAGE_SHIFT;
1847 page = &frame_table[pfn];
1850 void audit_domain(struct domain *d)
1852 int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
1854 void adjust (struct pfn_info *page, int dir, int adjtype)
1856 int count = page->count_info & PGC_count_mask;
1858 if ( adjtype )
1860 int tcount = page->u.inuse.type_info & PGT_count_mask;
1862 ttot++;
1864 tcount += dir;
1866 if ( tcount < 0 )
1868 /* This will only come out once. */
1869 printk("Audit %d: type count went below zero pfn=%x "
1870 "taf=%x otaf=%x\n",
1871 d->domain, page-frame_table,
1872 page->u.inuse.type_info,
1873 page->tlbflush_timestamp);
1876 page->u.inuse.type_info =
1877 (page->u.inuse.type_info & ~PGT_count_mask) |
1878 (tcount & PGT_count_mask);
1881 ctot++;
1882 count += dir;
1883 if ( count < 0 )
1885 /* This will only come out once. */
1886 printk("Audit %d: general count went below zero pfn=%x "
1887 "taf=%x otaf=%x\n",
1888 d->domain, page-frame_table,
1889 page->u.inuse.type_info,
1890 page->tlbflush_timestamp);
1893 page->count_info =
1894 (page->count_info & ~PGC_count_mask) |
1895 (count & PGC_count_mask);
1899 void scan_for_pfn(struct domain *d, unsigned long xpfn)
1901 unsigned long pfn, *pt;
1902 struct list_head *list_ent;
1903 struct pfn_info *page;
1904 int i;
1906 list_ent = d->page_list.next;
1907 for ( i = 0; (list_ent != &d->page_list); i++ )
1909 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
1910 page = &frame_table[pfn];
1912 switch ( page->u.inuse.type_info & PGT_type_mask )
1914 case PGT_l1_page_table:
1915 case PGT_l2_page_table:
1916 pt = map_domain_mem(pfn<<PAGE_SHIFT);
1917 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1918 if ( (pt[i] & _PAGE_PRESENT) &&
1919 ((pt[i] >> PAGE_SHIFT) == xpfn) )
1920 printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
1921 d->domain, i, pfn, page->u.inuse.type_info,
1922 page->count_info);
1923 unmap_domain_mem(pt);
1926 list_ent = frame_table[pfn].list.next;
1931 void scan_for_pfn_remote(unsigned long xpfn)
1933 struct domain *e;
1934 for_each_domain ( e )
1935 scan_for_pfn( e, xpfn );
1938 int i;
1939 unsigned long pfn;
1940 struct list_head *list_ent;
1941 struct pfn_info *page;
1943 if ( d != current )
1944 domain_pause(d);
1945 synchronise_pagetables(~0UL);
1947 printk("pt base=%lx sh_info=%x\n",
1948 pagetable_val(d->mm.pagetable)>>PAGE_SHIFT,
1949 virt_to_page(d->shared_info)-frame_table);
1951 spin_lock(&d->page_alloc_lock);
1953 /* PHASE 0 */
1955 list_ent = d->page_list.next;
1956 for ( i = 0; (list_ent != &d->page_list); i++ )
1958 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
1959 page = &frame_table[pfn];
1961 if ( page->u.inuse.domain != d )
1962 BUG();
1964 if ( (page->u.inuse.type_info & PGT_count_mask) >
1965 (page->count_info & PGC_count_mask) )
1966 printk("taf > caf %x %x pfn=%lx\n",
1967 page->u.inuse.type_info, page->count_info, pfn );
1969 #if 0 /* SYSV shared memory pages plus writeable files. */
1970 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
1971 (page->u.inuse.type_info & PGT_count_mask) > 1 )
1973 printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
1974 pfn,
1975 page->u.inuse.type_info,
1976 page->count_info );
1977 scan_for_pfn_remote(pfn);
1979 #endif
1980 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
1981 (page->u.inuse.type_info & PGT_count_mask) > 1 )
1983 printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
1984 pfn,
1985 page->u.inuse.type_info,
1986 page->count_info );
1989 /* Use tlbflush_timestamp to store original type_info. */
1990 page->tlbflush_timestamp = page->u.inuse.type_info;
1992 list_ent = frame_table[pfn].list.next;
1996 /* PHASE 1 */
1998 adjust(&frame_table[pagetable_val(d->mm.pagetable)>>PAGE_SHIFT], -1, 1);
2000 list_ent = d->page_list.next;
2001 for ( i = 0; (list_ent != &d->page_list); i++ )
2003 unsigned long *pt;
2004 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2005 page = &frame_table[pfn];
2007 if ( page->u.inuse.domain != d )
2008 BUG();
2010 switch ( page->u.inuse.type_info & PGT_type_mask )
2012 case PGT_l2_page_table:
2014 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2015 printk("Audit %d: L2 not validated %x\n",
2016 d->domain, page->u.inuse.type_info);
2018 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2019 printk("Audit %d: L2 not pinned %x\n",
2020 d->domain, page->u.inuse.type_info);
2021 else
2022 adjust( page, -1, 1 );
2024 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2026 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2028 if ( pt[i] & _PAGE_PRESENT )
2030 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2031 struct pfn_info *l1page = &frame_table[l1pfn];
2033 if ( l1page->u.inuse.domain != d )
2035 printk("L2: Skip bizarre page belonging to other "
2036 "dom %p\n", l1page->u.inuse.domain);
2037 continue;
2040 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2041 PGT_l2_page_table )
2042 printk("Audit %d: [%x] Found %s Linear PT "
2043 "t=%x pfn=%lx\n", d->domain, i,
2044 (l1pfn==pfn) ? "Self" : "Other",
2045 l1page->u.inuse.type_info,
2046 l1pfn);
2047 else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
2048 PGT_l1_page_table )
2049 printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
2050 d->domain, i,
2051 l1page->u.inuse.type_info,
2052 l1pfn);
2054 adjust(l1page, -1, 1);
2058 unmap_domain_mem(pt);
2060 break;
2063 case PGT_l1_page_table:
2065 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2066 adjust( page, -1, 1 );
2068 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2069 printk("Audit %d: L1 not validated %x\n",
2070 d->domain, page->u.inuse.type_info);
2071 #if 0
2072 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2073 printk("Audit %d: L1 not pinned %x\n",
2074 d->domain, page->u.inuse.type_info);
2075 #endif
2076 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2078 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2080 if ( pt[i] & _PAGE_PRESENT )
2082 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2083 struct pfn_info *l1page = &frame_table[l1pfn];
2085 if ( l1pfn < 0x100 )
2087 lowmem_mappings++;
2088 continue;
2091 if ( l1pfn > max_page )
2093 io_mappings++;
2094 continue;
2097 if ( pt[i] & _PAGE_RW )
2100 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2101 PGT_l1_page_table ||
2102 (l1page->u.inuse.type_info & PGT_type_mask) ==
2103 PGT_l2_page_table )
2104 printk("Audit %d: [%x] Illegal RW t=%x pfn=%lx\n",
2105 d->domain, i,
2106 l1page->u.inuse.type_info,
2107 l1pfn);
2111 if ( l1page->u.inuse.domain != d )
2113 printk("Audit %d: [%lx,%x] Skip foreign page dom=%lx "
2114 "pfn=%lx c=%08x t=%08x m2p=%lx\n",
2115 d->domain, pfn, i,
2116 (unsigned long)l1page->u.inuse.domain,
2117 l1pfn,
2118 l1page->count_info,
2119 l1page->u.inuse.type_info,
2120 machine_to_phys_mapping[l1pfn]);
2121 continue;
2124 adjust(l1page, -1, 0);
2128 unmap_domain_mem(pt);
2130 break;
2133 list_ent = frame_table[pfn].list.next;
2136 if ( (io_mappings > 0) || (lowmem_mappings > 0) )
2137 printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
2138 d->domain, lowmem_mappings, io_mappings);
2140 /* PHASE 2 */
2142 ctot = ttot = 0;
2143 list_ent = d->page_list.next;
2144 for ( i = 0; (list_ent != &d->page_list); i++ )
2146 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2147 page = &frame_table[pfn];
2149 switch ( page->u.inuse.type_info & PGT_type_mask)
2151 case PGT_l1_page_table:
2152 case PGT_l2_page_table:
2153 if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
2155 printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
2156 d->domain, page->u.inuse.type_info,
2157 page->tlbflush_timestamp,
2158 page->count_info, pfn );
2159 scan_for_pfn_remote(pfn);
2161 default:
2162 if ( (page->count_info & PGC_count_mask) != 1 )
2164 printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
2165 d->domain,
2166 page->count_info,
2167 page->u.inuse.type_info,
2168 page->tlbflush_timestamp, pfn );
2169 scan_for_pfn_remote(pfn);
2171 break;
2174 list_ent = frame_table[pfn].list.next;
2177 /* PHASE 3 */
2179 list_ent = d->page_list.next;
2180 for ( i = 0; (list_ent != &d->page_list); i++ )
2182 unsigned long *pt;
2183 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2184 page = &frame_table[pfn];
2186 switch ( page->u.inuse.type_info & PGT_type_mask )
2188 case PGT_l2_page_table:
2189 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2190 adjust( page, 1, 1 );
2192 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2194 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2196 if ( pt[i] & _PAGE_PRESENT )
2198 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2199 struct pfn_info *l1page = &frame_table[l1pfn];
2201 if ( l1page->u.inuse.domain == d)
2202 adjust(l1page, 1, 1);
2206 unmap_domain_mem(pt);
2207 break;
2209 case PGT_l1_page_table:
2210 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2211 adjust( page, 1, 1 );
2213 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2215 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2217 if ( pt[i] & _PAGE_PRESENT )
2219 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2220 struct pfn_info *l1page = &frame_table[l1pfn];
2222 if ( (l1page->u.inuse.domain != d) ||
2223 (l1pfn < 0x100) || (l1pfn > max_page) )
2224 continue;
2226 adjust(l1page, 1, 0);
2230 unmap_domain_mem(pt);
2231 break;
2235 page->tlbflush_timestamp = 0;
2237 list_ent = frame_table[pfn].list.next;
2240 spin_unlock(&d->page_alloc_lock);
2242 adjust(&frame_table[pagetable_val(d->mm.pagetable)>>PAGE_SHIFT], 1, 1);
2244 printk("Audit %d: Done. ctot=%d ttot=%d\n",d->domain, ctot, ttot );
2246 if ( d != current )
2247 domain_unpause(d);
2250 void audit_domains(void)
2252 struct domain *d;
2253 for_each_domain ( d )
2254 audit_domain(d);
2257 void audit_domains_key(unsigned char key, void *dev_id,
2258 struct pt_regs *regs)
2260 open_softirq(MEMAUDIT_SOFTIRQ, audit_domains);
2261 raise_softirq(MEMAUDIT_SOFTIRQ);
2264 #endif