debuggers.hg

view xen/arch/x86/memory.c @ 2637:0dfd459518e4

bitkeeper revision 1.1159.1.205 (4162aff3DKXHUIthGYqb0hkSmWnxQw)

Clean up memory auditing, and always audit a domain before
destroying it. (debug builds only)
author kaf24@freefall.cl.cam.ac.uk
date Tue Oct 05 14:30:11 2004 +0000 (2004-10-05)
parents a4fbb98f00cb
children 71985683e5f3
line source
1 /******************************************************************************
2 * arch/x86/memory.c
3 *
4 * Copyright (c) 2002-2004 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
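/*
 * Editor's note: the sketch below (added for illustration; not part of the
 * original file, hence kept under '#if 0') shows what a batch of update
 * requests for do_mmu_update() looks like from the guest's point of view.
 * The command is carried in the two least-significant bits of 'ptr' for
 * ordinary updates, and in the least-significant bits of 'val' for
 * MMU_EXTENDED_COMMAND.  The guest-side wrapper name HYPERVISOR_mmu_update()
 * is assumed from the public interface; the request encoding itself matches
 * what do_mmu_update() decodes below.
 */
#if 0
static void example_batched_mmu_update(unsigned long pte_maddr,
                                       unsigned long new_pte_val,
                                       unsigned long l2_mfn)
{
    mmu_update_t req[2];

    /* Request 0: normal PT update, i.e. *pte_maddr = new_pte_val. */
    req[0].ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
    req[0].val = new_pte_val;

    /* Request 1: extended command -- pin the L2 table in frame l2_mfn. */
    req[1].ptr = (l2_mfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
    req[1].val = MMUEXT_PIN_L2_TABLE;

    (void)HYPERVISOR_mmu_update(req, 2, NULL);
}
#endif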
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/lib.h>
90 #include <xen/mm.h>
91 #include <xen/sched.h>
92 #include <xen/errno.h>
93 #include <xen/perfc.h>
94 #include <xen/irq.h>
95 #include <xen/softirq.h>
96 #include <asm/shadow.h>
97 #include <asm/page.h>
98 #include <asm/flushtlb.h>
99 #include <asm/io.h>
100 #include <asm/uaccess.h>
101 #include <asm/domain_page.h>
102 #include <asm/ldt.h>
104 #ifdef VERBOSE
105 #define MEM_LOG(_f, _a...) \
106 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
107 current->domain , __LINE__ , ## _a )
108 #else
109 #define MEM_LOG(_f, _a...) ((void)0)
110 #endif
112 static int alloc_l2_table(struct pfn_info *page);
113 static int alloc_l1_table(struct pfn_info *page);
114 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
115 static int get_page_and_type_from_pagenr(unsigned long page_nr,
116 u32 type,
117 struct domain *d);
119 static void free_l2_table(struct pfn_info *page);
120 static void free_l1_table(struct pfn_info *page);
122 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
123 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
125 /* Used to defer flushing of memory structures. */
126 static struct {
127 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
128 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
129 unsigned long deferred_ops;
130 unsigned long cr0;
131 /* If non-NULL, specifies a foreign subject domain for some operations. */
132 struct domain *foreign;
133 } percpu_info[NR_CPUS] __cacheline_aligned;
135 /*
136 * Returns the current foreign domain; defaults to the currently-executing
137 * domain if a foreign override hasn't been specified.
138 */
139 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current)
141 /* Private domain structs for DOMID_XEN and DOMID_IO. */
142 static struct domain *dom_xen, *dom_io;
144 void arch_init_memory(void)
145 {
146 unsigned long mfn;
148 /*
149 * We are rather picky about the layout of 'struct pfn_info'. The
150 * count_info and domain fields must be adjacent, as we perform atomic
151 * 64-bit operations on them. Also, just for sanity, we assert the size
152 * of the structure here.
153 */
154 if ( (offsetof(struct pfn_info, u.inuse.domain) !=
155 (offsetof(struct pfn_info, count_info) + sizeof(u32))) ||
156 (sizeof(struct pfn_info) != 24) )
157 {
158 printk("Weird pfn_info layout (%ld,%ld,%d)\n",
159 offsetof(struct pfn_info, count_info),
160 offsetof(struct pfn_info, u.inuse.domain),
161 sizeof(struct pfn_info));
162 for ( ; ; ) ;
163 }
165 memset(percpu_info, 0, sizeof(percpu_info));
167 for ( mfn = 0; mfn < max_page; mfn++ )
168 frame_table[mfn].count_info |= PGC_always_set;
170 /* Initialise to a magic of 0x55555555 so it's easier to spot bugs later. */
171 memset(machine_to_phys_mapping, 0x55, 4<<20);
173 /*
174 * Initialise our DOMID_XEN domain.
175 * Any Xen-heap pages that we will allow to be mapped will have
176 * their domain field set to dom_xen.
177 */
178 dom_xen = alloc_domain_struct();
179 atomic_set(&dom_xen->refcnt, 1);
180 dom_xen->domain = DOMID_XEN;
182 /*
183 * Initialise our DOMID_IO domain.
184 * This domain owns no pages but is considered a special case when
185 * mapping I/O pages, as the mappings occur at the privilege of the caller.
186 */
187 dom_io = alloc_domain_struct();
188 atomic_set(&dom_io->refcnt, 1);
189 dom_io->domain = DOMID_IO;
191 /* M2P table is mappable read-only by privileged domains. */
192 for ( mfn = virt_to_phys(&machine_to_phys_mapping[0<<20])>>PAGE_SHIFT;
193 mfn < virt_to_phys(&machine_to_phys_mapping[1<<20])>>PAGE_SHIFT;
194 mfn++ )
195 {
196 frame_table[mfn].count_info |= PGC_allocated | 1;
197 frame_table[mfn].u.inuse.type_info = PGT_gdt_page | 1; /* non-RW */
198 frame_table[mfn].u.inuse.domain = dom_xen;
199 }
200 }
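/*
 * Editor's note (descriptive comment added for clarity; not in the original
 * file): the functions below implement the shadow LDT.  Slots 16-31 of a
 * domain's perdomain_pt hold writable mappings of the frames backing the
 * guest's LDT; map_ldt_shadow_page() fills a slot lazily on first use, and
 * __invalidate_shadow_ldt() tears all slots down again (dropping the
 * per-frame type and general references) when the LDT is replaced or its
 * frames must be released.
 */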
202 static void __invalidate_shadow_ldt(struct domain *d)
203 {
204 int i;
205 unsigned long pfn;
206 struct pfn_info *page;
208 d->mm.shadow_ldt_mapcnt = 0;
210 for ( i = 16; i < 32; i++ )
211 {
212 pfn = l1_pgentry_to_pagenr(d->mm.perdomain_pt[i]);
213 if ( pfn == 0 ) continue;
214 d->mm.perdomain_pt[i] = mk_l1_pgentry(0);
215 page = &frame_table[pfn];
216 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
217 ASSERT_PAGE_IS_DOMAIN(page, d);
218 put_page_and_type(page);
219 }
221 /* Dispose of the (now possibly invalid) mappings from the TLB. */
222 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
223 }
226 static inline void invalidate_shadow_ldt(struct domain *d)
227 {
228 if ( d->mm.shadow_ldt_mapcnt != 0 )
229 __invalidate_shadow_ldt(d);
230 }
233 static int alloc_segdesc_page(struct pfn_info *page)
234 {
235 unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
236 int i;
238 for ( i = 0; i < 512; i++ )
239 if ( unlikely(!check_descriptor(&descs[i*2])) )
240 goto fail;
242 unmap_domain_mem(descs);
243 return 1;
245 fail:
246 unmap_domain_mem(descs);
247 return 0;
248 }
251 /* Map shadow page at offset @off. */
252 int map_ldt_shadow_page(unsigned int off)
253 {
254 struct domain *d = current;
255 unsigned long l1e;
257 if ( unlikely(in_irq()) )
258 BUG();
260 __get_user(l1e, (unsigned long *)&linear_pg_table[(d->mm.ldt_base >>
261 PAGE_SHIFT) + off]);
263 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
264 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
265 d, PGT_ldt_page)) )
266 return 0;
268 d->mm.perdomain_pt[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
269 d->mm.shadow_ldt_mapcnt++;
271 return 1;
272 }
275 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
276 {
277 struct pfn_info *page = &frame_table[page_nr];
279 if ( unlikely(!pfn_is_ram(page_nr)) )
280 {
281 MEM_LOG("Pfn %08lx is not RAM", page_nr);
282 return 0;
283 }
285 if ( unlikely(!get_page(page, d)) )
286 {
287 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
288 return 0;
289 }
291 return 1;
292 }
295 static int get_page_and_type_from_pagenr(unsigned long page_nr,
296 u32 type,
297 struct domain *d)
298 {
299 struct pfn_info *page = &frame_table[page_nr];
301 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
302 return 0;
304 if ( unlikely(!get_page_type(page, type)) )
305 {
306 #ifdef VERBOSE
307 if ( (type & PGT_type_mask) != PGT_l1_page_table )
308 MEM_LOG("Bad page type for pfn %08lx (%08x)",
309 page_nr, page->u.inuse.type_info);
310 #endif
311 put_page(page);
312 return 0;
313 }
315 return 1;
316 }
319 /*
320 * We allow L2 tables to map each other (a.k.a. linear page tables). This
321 * needs some special care with reference counts and access permissions:
322 * 1. The mapping entry must be read-only, or the guest may get write access
323 * to its own PTEs.
324 * 2. We must only bump the reference counts for an *already validated*
325 * L2 table, or we can end up in a deadlock in get_page_type(), waiting
326 * on a validation that cannot complete until our own validation does.
327 * 3. We only need to increment the reference counts for the mapped page
328 * frame if it is mapped by a different L2 table. This is sufficient and
329 * also necessary to allow validation of an L2 table mapping itself.
330 */
331 static int
332 get_linear_pagetable(
333 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
334 {
335 u32 x, y;
336 struct pfn_info *page;
338 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
339 {
340 MEM_LOG("Attempt to create linear p.t. with write perms");
341 return 0;
342 }
344 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
345 {
346 /* Make sure the mapped frame belongs to the correct domain. */
347 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
348 return 0;
350 /*
351 * Make sure that the mapped frame is an already-validated L2 table.
352 * If so, atomically increment the count (checking for overflow).
353 */
354 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
355 y = page->u.inuse.type_info;
356 do {
357 x = y;
358 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
359 unlikely((x & (PGT_type_mask|PGT_validated)) !=
360 (PGT_l2_page_table|PGT_validated)) )
361 {
362 put_page(page);
363 return 0;
364 }
365 }
366 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
367 }
369 return 1;
370 }
373 static inline int
374 readonly_page_from_l1e(
375 l1_pgentry_t l1e)
376 {
377 struct pfn_info *page = &frame_table[l1_pgentry_to_pagenr(l1e)];
378 unsigned long l1v = l1_pgentry_val(l1e);
380 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(l1v >> PAGE_SHIFT) )
381 return 0;
382 put_page_type(page);
383 return 1;
384 }
386 static int
387 get_page_from_l1e(
388 l1_pgentry_t l1e, struct domain *d)
389 {
390 unsigned long l1v = l1_pgentry_val(l1e);
391 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
392 struct pfn_info *page = &frame_table[pfn];
393 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
395 if ( !(l1v & _PAGE_PRESENT) )
396 return 1;
398 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
399 {
400 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
401 return 0;
402 }
404 if ( unlikely(!pfn_is_ram(pfn)) )
405 {
406 /* SPECIAL CASE 1. Mapping an I/O page. */
408 /* Revert to caller privileges if FD == DOMID_IO. */
409 if ( d == dom_io )
410 d = current;
412 if ( IS_PRIV(d) )
413 return 1;
415 if ( IS_CAPABLE_PHYSDEV(d) )
416 return domain_iomem_in_pfn(d, pfn);
418 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
419 return 0;
420 }
422 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
423 {
424 /* SPECIAL CASE 2. Mapping a foreign page via a grant table. */
426 int rc;
427 struct domain *e;
428 u32 count_info;
429 /*
430 * Yuk! Amazingly this is the simplest way to get a guaranteed atomic
431 * snapshot of a 64-bit value on IA32. x86/64 solves this of course!
432 * Basically it's a no-op CMPXCHG, to get us the current contents.
433 * No need for LOCK prefix -- we know that count_info is never zero
434 * because it contains PGC_always_set.
435 */
436 ASSERT(test_bit(_PGC_always_set, &page->count_info));
437 __asm__ __volatile__(
438 "cmpxchg8b %2"
439 : "=d" (e), "=a" (count_info),
440 "=m" (*(volatile u64 *)(&page->count_info))
441 : "0" (0), "1" (0), "c" (0), "b" (0) );
442 if ( unlikely((count_info & PGC_count_mask) == 0) ||
443 unlikely(e == NULL) || unlikely(!get_domain(e)) )
444 return 0;
445 rc = gnttab_try_map(
446 e, d, pfn, (l1v & _PAGE_RW) ? GNTTAB_MAP_RW : GNTTAB_MAP_RO);
447 put_domain(e);
448 return rc;
449 }
451 if ( l1v & _PAGE_RW )
452 {
453 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
454 return 0;
455 }
457 return 1;
458 }
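/*
 * Editor's note: illustrative sketch (not in the original file, hence under
 * '#if 0') of the cmpxchg8b trick used in the grant-table path above.  A
 * compare-exchange whose 'expected' value can never match the real one
 * (count_info always contains PGC_always_set, so the 64-bit word is never
 * zero) is guaranteed to fail -- and on failure it hands back the current
 * contents, giving an atomic snapshot of (count_info, owning domain)
 * without taking any lock.  The helper name and the use of a compiler
 * builtin here are assumptions for illustration only.
 */
#if 0
static inline u64 snapshot_count_and_owner(struct pfn_info *page)
{
    u64 expected = 0;  /* can never equal the live value */

    /* Fails, but atomically copies the current 64-bit value into 'expected'. */
    __atomic_compare_exchange_n((volatile u64 *)&page->count_info,
                                &expected, 0, 0,
                                __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);

    /* Low 32 bits: count_info; high 32 bits: u.inuse.domain (little endian). */
    return expected;
}
#endif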
461 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
462 static int
463 get_page_from_l2e(
464 l2_pgentry_t l2e, unsigned long pfn,
465 struct domain *d, unsigned long va_idx)
466 {
467 int rc;
469 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
470 return 1;
472 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
473 {
474 MEM_LOG("Bad L2 page type settings %04lx",
475 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
476 return 0;
477 }
479 rc = get_page_and_type_from_pagenr(
480 l2_pgentry_to_pagenr(l2e),
481 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
483 if ( unlikely(!rc) )
484 return get_linear_pagetable(l2e, pfn, d);
486 return 1;
487 }
490 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
491 {
492 unsigned long l1v = l1_pgentry_val(l1e);
493 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
494 struct pfn_info *page = &frame_table[pfn];
495 struct domain *e = page->u.inuse.domain;
497 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
498 return;
500 if ( unlikely(e != d) )
501 {
502 /*
503 * Unmap a foreign page that may have been mapped via a grant table.
504 * Note that this can fail for a privileged domain that can map foreign
505 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
506 * counted via a grant entry and some counted directly in the page
507 * structure's reference count. Note that reference counts won't get
508 * dangerously confused as long as we always try to decrement the
509 * grant entry first. We may end up with a mismatch between which
510 * mappings and which unmappings are counted via the grant entry, but
511 * really it doesn't matter as privileged domains have carte blanche.
512 */
513 if ( likely(gnttab_try_map(e, d, pfn, (l1v & _PAGE_RW) ?
514 GNTTAB_UNMAP_RW : GNTTAB_UNMAP_RO)) )
515 return;
516 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
517 }
519 if ( l1v & _PAGE_RW )
520 {
521 put_page_and_type(page);
522 }
523 else
524 {
525 /* We expect this is rare so we blow the entire shadow LDT. */
526 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
527 PGT_ldt_page)) &&
528 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
529 invalidate_shadow_ldt(e);
530 put_page(page);
531 }
532 }
535 /*
536 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
537 * Note also that this automatically deals correctly with linear p.t.'s.
538 */
539 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
540 {
541 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
542 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
543 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
544 }
547 static int alloc_l2_table(struct pfn_info *page)
548 {
549 struct domain *d = page->u.inuse.domain;
550 unsigned long page_nr = page_to_pfn(page);
551 l2_pgentry_t *pl2e;
552 int i;
554 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
556 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) {
557 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
558 goto fail;
559 }
561 #if defined(__i386__)
562 /* Now we add our private high mappings. */
563 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
564 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
565 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
566 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
567 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
568 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
569 mk_l2_pgentry(__pa(page->u.inuse.domain->mm.perdomain_pt) |
570 __PAGE_HYPERVISOR);
571 #endif
573 unmap_domain_mem(pl2e);
574 return 1;
576 fail:
577 while ( i-- > 0 )
578 put_page_from_l2e(pl2e[i], page_nr);
580 unmap_domain_mem(pl2e);
581 return 0;
582 }
585 static int alloc_l1_table(struct pfn_info *page)
586 {
587 struct domain *d = page->u.inuse.domain;
588 unsigned long page_nr = page_to_pfn(page);
589 l1_pgentry_t *pl1e;
590 int i;
592 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
594 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
595 if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
596 goto fail;
598 unmap_domain_mem(pl1e);
599 return 1;
601 fail:
602 while ( i-- > 0 )
603 put_page_from_l1e(pl1e[i], d);
605 unmap_domain_mem(pl1e);
606 return 0;
607 }
610 static void free_l2_table(struct pfn_info *page)
611 {
612 unsigned long page_nr = page - frame_table;
613 l2_pgentry_t *pl2e;
614 int i;
616 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
618 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
619 put_page_from_l2e(pl2e[i], page_nr);
621 unmap_domain_mem(pl2e);
622 }
625 static void free_l1_table(struct pfn_info *page)
626 {
627 struct domain *d = page->u.inuse.domain;
628 unsigned long page_nr = page - frame_table;
629 l1_pgentry_t *pl1e;
630 int i;
632 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
634 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
635 put_page_from_l1e(pl1e[i], d);
637 unmap_domain_mem(pl1e);
638 }
641 static inline int update_l2e(l2_pgentry_t *pl2e,
642 l2_pgentry_t ol2e,
643 l2_pgentry_t nl2e)
644 {
645 unsigned long o = cmpxchg((unsigned long *)pl2e,
646 l2_pgentry_val(ol2e),
647 l2_pgentry_val(nl2e));
648 if ( o != l2_pgentry_val(ol2e) )
649 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
650 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
651 return (o == l2_pgentry_val(ol2e));
652 }
655 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
656 static int mod_l2_entry(l2_pgentry_t *pl2e,
657 l2_pgentry_t nl2e,
658 unsigned long pfn)
659 {
660 l2_pgentry_t ol2e;
661 unsigned long _ol2e;
663 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
664 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
665 {
666 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
667 return 0;
668 }
670 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
671 return 0;
672 ol2e = mk_l2_pgentry(_ol2e);
674 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
675 {
676 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
677 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
678 return update_l2e(pl2e, ol2e, nl2e);
680 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current,
681 ((unsigned long)pl2e &
682 ~PAGE_MASK) >> 2)) )
683 return 0;
685 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
686 {
687 put_page_from_l2e(nl2e, pfn);
688 return 0;
689 }
691 put_page_from_l2e(ol2e, pfn);
692 return 1;
693 }
695 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
696 return 0;
698 put_page_from_l2e(ol2e, pfn);
699 return 1;
700 }
703 static inline int update_l1e(l1_pgentry_t *pl1e,
704 l1_pgentry_t ol1e,
705 l1_pgentry_t nl1e)
706 {
707 unsigned long o = l1_pgentry_val(ol1e);
708 unsigned long n = l1_pgentry_val(nl1e);
710 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
711 unlikely(o != l1_pgentry_val(ol1e)) )
712 {
713 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
714 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
715 return 0;
716 }
718 return 1;
719 }
722 /* Update the L1 entry at pl1e to new value nl1e. */
723 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
724 {
725 l1_pgentry_t ol1e;
726 unsigned long _ol1e;
727 struct domain *d = current;
729 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
730 {
731 MEM_LOG("Bad get_user\n");
732 return 0;
733 }
735 ol1e = mk_l1_pgentry(_ol1e);
737 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
738 {
739 /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
740 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
741 return update_l1e(pl1e, ol1e, nl1e);
743 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
744 return 0;
746 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
747 {
748 put_page_from_l1e(nl1e, d);
749 return 0;
750 }
752 put_page_from_l1e(ol1e, d);
753 return 1;
754 }
756 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
757 return 0;
759 put_page_from_l1e(ol1e, d);
760 return 1;
761 }
764 int alloc_page_type(struct pfn_info *page, unsigned int type)
765 {
766 switch ( type )
767 {
768 case PGT_l1_page_table:
769 return alloc_l1_table(page);
770 case PGT_l2_page_table:
771 return alloc_l2_table(page);
772 case PGT_gdt_page:
773 case PGT_ldt_page:
774 return alloc_segdesc_page(page);
775 default:
776 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
777 type, page->u.inuse.type_info,
778 page->count_info);
779 BUG();
780 }
782 return 0;
783 }
786 void free_page_type(struct pfn_info *page, unsigned int type)
787 {
788 struct domain *d = page->u.inuse.domain;
790 switch ( type )
791 {
792 case PGT_l1_page_table:
793 free_l1_table(page);
794 break;
796 case PGT_l2_page_table:
797 free_l2_table(page);
798 break;
800 default:
801 BUG();
802 }
804 if ( unlikely(d->mm.shadow_mode) &&
805 (get_shadow_status(&d->mm, page_to_pfn(page)) & PSH_shadowed) )
806 {
807 unshadow_table(page_to_pfn(page), type);
808 put_shadow_status(&d->mm);
809 }
810 }
813 void put_page_type(struct pfn_info *page)
814 {
815 u32 nx, x, y = page->u.inuse.type_info;
817 again:
818 do {
819 x = y;
820 nx = x - 1;
822 ASSERT((x & PGT_count_mask) != 0);
824 /*
825 * The page should always be validated while a reference is held. The
826 * exception is during domain destruction, when we forcibly invalidate
827 * page-table pages if we detect a referential loop.
828 * See domain.c:relinquish_list().
829 */
830 ASSERT((x & PGT_validated) ||
831 test_bit(DF_DYING, &page->u.inuse.domain->flags));
833 if ( unlikely((nx & PGT_count_mask) == 0) )
834 {
835 /* Record TLB information for flush later. Races are harmless. */
836 page->tlbflush_timestamp = tlbflush_clock;
838 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
839 likely(nx & PGT_validated) )
840 {
841 /*
842 * Page-table pages must be unvalidated when count is zero. The
843 * 'free' is safe because the refcnt is non-zero and validated
844 * bit is clear => other ops will spin or fail.
845 */
846 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
847 x & ~PGT_validated)) != x) )
848 goto again;
849 /* We cleared the 'valid bit' so we do the clear up. */
850 free_page_type(page, x & PGT_type_mask);
851 /* Carry on, but with the 'valid bit' now clear. */
852 x &= ~PGT_validated;
853 nx &= ~PGT_validated;
854 }
855 }
856 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
857 (PGT_pinned | 1)) )
858 {
859 /* Page is now only pinned. Make the back pointer mutable again. */
860 nx |= PGT_va_mutable;
861 }
862 }
863 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
864 }
867 int get_page_type(struct pfn_info *page, u32 type)
868 {
869 u32 nx, x, y = page->u.inuse.type_info;
871 again:
872 do {
873 x = y;
874 nx = x + 1;
875 if ( unlikely((nx & PGT_count_mask) == 0) )
876 {
877 MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
878 return 0;
879 }
880 else if ( unlikely((x & PGT_count_mask) == 0) )
881 {
882 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
883 {
884 /*
885 * On a type change we check whether stale TLB entries must be
886 * flushed. This may be unnecessary (e.g., page was GDT/LDT) but those
887 * circumstances should be very rare.
888 */
889 struct domain *d = page->u.inuse.domain;
890 if ( unlikely(NEED_FLUSH(tlbflush_time[d->processor],
891 page->tlbflush_timestamp)) )
892 {
893 perfc_incr(need_flush_tlb_flush);
894 flush_tlb_cpu(d->processor);
895 }
897 /* We lose existing type, back pointer, and validity. */
898 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
899 nx |= type;
901 /* No special validation needed for writable pages. */
902 /* Page tables and GDT/LDT need to be scanned for validity. */
903 if ( type == PGT_writable_page )
904 nx |= PGT_validated;
905 }
906 }
907 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
908 {
909 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
910 {
911 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
912 ((type & PGT_type_mask) != PGT_l1_page_table) )
913 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
914 x & PGT_type_mask, type, page_to_pfn(page));
915 return 0;
916 }
917 else if ( (x & PGT_va_mask) == PGT_va_mutable )
918 {
919 /* The va backpointer is mutable, hence we update it. */
920 nx &= ~PGT_va_mask;
921 nx |= type; /* we know the actual type is correct */
922 }
923 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
924 {
925 /* This table is potentially mapped at multiple locations. */
926 nx &= ~PGT_va_mask;
927 nx |= PGT_va_unknown;
928 }
929 }
930 else if ( unlikely(!(x & PGT_validated)) )
931 {
932 /* Someone else is updating validation of this page. Wait... */
933 while ( (y = page->u.inuse.type_info) == x )
934 {
935 rep_nop();
936 barrier();
937 }
938 goto again;
939 }
940 }
941 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
943 if ( unlikely(!(nx & PGT_validated)) )
944 {
945 /* Try to validate page type; drop the new reference on failure. */
946 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
947 {
948 MEM_LOG("Error while validating pfn %08lx for type %08x."
949 " caf=%08x taf=%08x\n",
950 page_to_pfn(page), type,
951 page->count_info,
952 page->u.inuse.type_info);
953 /* No one else can get a reference. We hold the only ref. */
954 page->u.inuse.type_info = 0;
955 return 0;
956 }
958 /* No one else is updating simultaneously. */
959 __set_bit(_PGT_validated, &page->u.inuse.type_info);
960 }
962 return 1;
963 }
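/*
 * Editor's note: the skeleton below (illustrative only, not in the original
 * file) abstracts the lock-free pattern shared by get_page_type() and
 * put_page_type() above: snapshot type_info, derive the new value from the
 * snapshot, and retry the cmpxchg until no other CPU has modified the word
 * in the meantime.
 */
#if 0
static inline void type_info_update_skeleton(struct pfn_info *page)
{
    u32 x, nx, y = page->u.inuse.type_info;

    do {
        x  = y;     /* snapshot of the current word */
        nx = x;     /* ... compute the desired new value from 'x' here ... */
    }
    while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
}
#endif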
966 static int do_extended_command(unsigned long ptr, unsigned long val)
967 {
968 int okay = 1, cpu = smp_processor_id();
969 unsigned int cmd = val & MMUEXT_CMD_MASK;
970 unsigned long pfn = ptr >> PAGE_SHIFT;
971 unsigned long old_base_pfn;
972 struct pfn_info *page = &frame_table[pfn];
973 struct domain *d = current, *nd, *e;
974 u32 x, y;
975 domid_t domid;
976 grant_ref_t gntref;
978 switch ( cmd )
979 {
980 case MMUEXT_PIN_L1_TABLE:
981 case MMUEXT_PIN_L2_TABLE:
982 /*
983 * We insist that, if you pin an L1 page, it's the first thing that
984 * you do to it. This is because we require the backptr to still be
985 * mutable. This assumption seems safe.
986 */
987 okay = get_page_and_type_from_pagenr(
988 pfn,
989 ((cmd==MMUEXT_PIN_L2_TABLE) ?
990 PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
991 FOREIGNDOM);
993 if ( unlikely(!okay) )
994 {
995 MEM_LOG("Error while pinning pfn %08lx", pfn);
996 break;
997 }
999 if ( unlikely(test_and_set_bit(_PGT_pinned,
1000 &page->u.inuse.type_info)) )
1002 MEM_LOG("Pfn %08lx already pinned", pfn);
1003 put_page_and_type(page);
1004 okay = 0;
1005 break;
1008 break;
1010 case MMUEXT_UNPIN_TABLE:
1011 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
1013 MEM_LOG("Page %08lx bad domain (dom=%p)",
1014 ptr, page->u.inuse.domain);
1016 else if ( likely(test_and_clear_bit(_PGT_pinned,
1017 &page->u.inuse.type_info)) )
1019 put_page_and_type(page);
1020 put_page(page);
1022 else
1024 okay = 0;
1025 put_page(page);
1026 MEM_LOG("Pfn %08lx not pinned", pfn);
1028 break;
1030 case MMUEXT_NEW_BASEPTR:
1031 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
1032 if ( likely(okay) )
1034 invalidate_shadow_ldt(d);
1036 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
1037 old_base_pfn = pagetable_val(d->mm.pagetable) >> PAGE_SHIFT;
1038 d->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
1040 shadow_mk_pagetable(&d->mm);
1042 write_ptbase(&d->mm);
1044 put_page_and_type(&frame_table[old_base_pfn]);
1046 /*
1047 * Note that we tick the clock /after/ dropping the old base's
1048 * reference count. If the page tables got freed then this will
1049 * avoid unnecessary TLB flushes when the pages are reused. */
1050 tlb_clocktick();
1052 else
1054 MEM_LOG("Error while installing new baseptr %08lx", ptr);
1056 break;
1058 case MMUEXT_TLB_FLUSH:
1059 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1060 break;
1062 case MMUEXT_INVLPG:
1063 __flush_tlb_one(ptr);
1064 break;
1066 case MMUEXT_FLUSH_CACHE:
1067 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1069 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1070 okay = 0;
1072 else
1074 wbinvd();
1076 break;
1078 case MMUEXT_SET_LDT:
1080 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
1081 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1082 (ents > 8192) ||
1083 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
1084 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
1086 okay = 0;
1087 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
1089 else if ( (d->mm.ldt_ents != ents) ||
1090 (d->mm.ldt_base != ptr) )
1092 invalidate_shadow_ldt(d);
1093 d->mm.ldt_base = ptr;
1094 d->mm.ldt_ents = ents;
1095 load_LDT(d);
1096 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1097 if ( ents != 0 )
1098 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1100 break;
1103 case MMUEXT_SET_FOREIGNDOM:
1104 domid = (domid_t)(val >> 16);
1106 if ( (e = percpu_info[cpu].foreign) != NULL )
1107 put_domain(e);
1108 percpu_info[cpu].foreign = NULL;
1110 if ( !IS_PRIV(d) )
1112 switch ( domid )
1114 case DOMID_IO:
1115 get_knownalive_domain(dom_io);
1116 percpu_info[cpu].foreign = dom_io;
1117 break;
1118 default:
1119 MEM_LOG("Dom %u cannot set foreign dom\n", d->domain);
1120 okay = 0;
1121 break;
1124 else
1126 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1127 if ( e == NULL )
1129 switch ( domid )
1131 case DOMID_XEN:
1132 get_knownalive_domain(dom_xen);
1133 percpu_info[cpu].foreign = dom_xen;
1134 break;
1135 case DOMID_IO:
1136 get_knownalive_domain(dom_io);
1137 percpu_info[cpu].foreign = dom_io;
1138 break;
1139 default:
1140 MEM_LOG("Unknown domain '%u'", domid);
1141 okay = 0;
1142 break;
1146 break;
1148 case MMUEXT_TRANSFER_PAGE:
1149 domid = (domid_t)(val >> 16);
1150 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
1152 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
1153 unlikely(!pfn_is_ram(pfn)) ||
1154 unlikely((e = find_domain_by_id(domid)) == NULL) )
1156 MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
1157 okay = 0;
1158 break;
1161 spin_lock(&d->page_alloc_lock);
1163 /*
1164 * The tricky bit: atomically release ownership while there is just one
1165 * benign reference to the page (PGC_allocated). If that reference
1166 * disappears then the deallocation routine will safely spin.
1167 */
1168 nd = page->u.inuse.domain;
1169 y = page->count_info;
1170 do {
1171 x = y;
1172 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1173 (1|PGC_allocated)) ||
1174 unlikely(nd != d) )
1176 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1177 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1178 d, d->domain, nd, x, page->u.inuse.type_info);
1179 spin_unlock(&d->page_alloc_lock);
1180 put_domain(e);
1181 okay = 0;
1182 break;
1184 __asm__ __volatile__(
1185 LOCK_PREFIX "cmpxchg8b %2"
1186 : "=d" (nd), "=a" (y),
1187 "=m" (*(volatile u64 *)(&page->count_info))
1188 : "0" (d), "1" (x), "c" (NULL), "b" (x) );
1190 while ( unlikely(nd != d) || unlikely(y != x) );
1192 /*
1193 * Unlink from 'd'. At least one reference remains (now anonymous), so
1194 * noone else is spinning to try to delete this page from 'd'.
1195 */
1196 d->tot_pages--;
1197 list_del(&page->list);
1199 spin_unlock(&d->page_alloc_lock);
1201 spin_lock(&e->page_alloc_lock);
1203 /*
1204 * Check that 'e' will accept the page and has reservation headroom.
1205 * Also, a domain mustn't have PGC_allocated pages when it is dying.
1206 */
1207 ASSERT(e->tot_pages <= e->max_pages);
1208 if ( unlikely(test_bit(DF_DYING, &e->flags)) ||
1209 unlikely(e->tot_pages == e->max_pages) ||
1210 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
1212 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1213 "provided a bad grant ref, or is dying (%08lx).\n",
1214 e->tot_pages, e->max_pages, e->flags);
1215 spin_unlock(&e->page_alloc_lock);
1216 put_domain(e);
1217 okay = 0;
1218 break;
1221 /* Okay, add the page to 'e'. */
1222 if ( unlikely(e->tot_pages++ == 0) )
1223 get_knownalive_domain(e);
1224 list_add_tail(&page->list, &e->page_list);
1225 page->u.inuse.domain = e;
1227 spin_unlock(&e->page_alloc_lock);
1229 /* Transfer is all done: tell the guest about its new page frame. */
1230 gnttab_notify_transfer(e, gntref, pfn);
1232 put_domain(e);
1233 break;
1235 case MMUEXT_REASSIGN_PAGE:
1236 if ( unlikely(!IS_PRIV(d)) )
1238 MEM_LOG("Dom %u has no reassignment priv", d->domain);
1239 okay = 0;
1240 break;
1243 e = percpu_info[cpu].foreign;
1244 if ( unlikely(e == NULL) )
1246 MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
1247 okay = 0;
1248 break;
1251 /*
1252 * Grab both page_list locks, in order. This prevents the page from
1253 * disappearing elsewhere while we modify the owner, and we'll need
1254 * both locks if we're successful so that we can change lists.
1255 */
1256 if ( d < e )
1258 spin_lock(&d->page_alloc_lock);
1259 spin_lock(&e->page_alloc_lock);
1261 else
1263 spin_lock(&e->page_alloc_lock);
1264 spin_lock(&d->page_alloc_lock);
1267 /* A domain shouldn't have PGC_allocated pages when it is dying. */
1268 if ( unlikely(test_bit(DF_DYING, &e->flags)) ||
1269 unlikely(IS_XEN_HEAP_FRAME(page)) )
1271 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
1272 okay = 0;
1273 goto reassign_fail;
1276 /*
1277 * The tricky bit: atomically change owner while there is just one
1278 * benign reference to the page (PGC_allocated). If that reference
1279 * disappears then the deallocation routine will safely spin.
1280 */
1281 nd = page->u.inuse.domain;
1282 y = page->count_info;
1283 do {
1284 x = y;
1285 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1286 (1|PGC_allocated)) ||
1287 unlikely(nd != d) )
1289 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1290 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1291 d, d->domain, nd, x, page->u.inuse.type_info);
1292 okay = 0;
1293 goto reassign_fail;
1295 __asm__ __volatile__(
1296 LOCK_PREFIX "cmpxchg8b %3"
1297 : "=d" (nd), "=a" (y), "=c" (e),
1298 "=m" (*(volatile u64 *)(&page->count_info))
1299 : "0" (d), "1" (x), "c" (e), "b" (x) );
1301 while ( unlikely(nd != d) || unlikely(y != x) );
1303 /*
1304 * Unlink from 'd'. We transferred at least one reference to 'e', so
1305 * no one else is spinning to try to delete this page from 'd'.
1306 */
1307 d->tot_pages--;
1308 list_del(&page->list);
1310 /*
1311 * Add the page to 'e'. Someone may already have removed the last
1312 * reference and want to remove the page from 'e'. However, we have
1313 * the lock so they'll spin waiting for us.
1314 */
1315 if ( unlikely(e->tot_pages++ == 0) )
1316 get_knownalive_domain(e);
1317 list_add_tail(&page->list, &e->page_list);
1319 reassign_fail:
1320 spin_unlock(&d->page_alloc_lock);
1321 spin_unlock(&e->page_alloc_lock);
1322 break;
1324 case MMUEXT_CLEAR_FOREIGNDOM:
1325 if ( (e = percpu_info[cpu].foreign) != NULL )
1326 put_domain(e);
1327 percpu_info[cpu].foreign = NULL;
1328 break;
1330 default:
1331 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
1332 okay = 0;
1333 break;
1336 return okay;
1340 int do_mmu_update(mmu_update_t *ureqs, int count, int *success_count)
1342 mmu_update_t req;
1343 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
1344 struct pfn_info *page;
1345 int rc = 0, okay = 1, i, cpu = smp_processor_id();
1346 unsigned int cmd;
1347 unsigned long prev_spfn = 0;
1348 l1_pgentry_t *prev_spl1e = 0;
1349 struct domain *d = current;
1350 u32 type_info;
1352 perfc_incrc(calls_to_mmu_update);
1353 perfc_addc(num_page_updates, count);
1355 cleanup_writable_pagetable(d, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
1357 if ( unlikely(!access_ok(VERIFY_READ, ureqs, count * sizeof(req))) )
1358 return -EFAULT;
1360 for ( i = 0; i < count; i++ )
1362 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
1364 MEM_LOG("Bad __copy_from_user");
1365 rc = -EFAULT;
1366 break;
1369 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
1370 pfn = req.ptr >> PAGE_SHIFT;
1372 okay = 0;
1374 switch ( cmd )
1376 /*
1377 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
1378 */
1379 case MMU_NORMAL_PT_UPDATE:
1380 if ( unlikely(!get_page_from_pagenr(pfn, current)) )
1382 MEM_LOG("Could not get page for normal update");
1383 break;
1386 if ( likely(prev_pfn == pfn) )
1388 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
1390 else
1392 if ( prev_pfn != 0 )
1393 unmap_domain_mem((void *)va);
1394 va = (unsigned long)map_domain_mem(req.ptr);
1395 prev_pfn = pfn;
1398 page = &frame_table[pfn];
1399 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
1401 case PGT_l1_page_table:
1402 if ( likely(get_page_type(
1403 page, type_info & (PGT_type_mask|PGT_va_mask))) )
1405 okay = mod_l1_entry((l1_pgentry_t *)va,
1406 mk_l1_pgentry(req.val));
1408 if ( unlikely(d->mm.shadow_mode) && okay &&
1409 (get_shadow_status(&d->mm, page-frame_table) &
1410 PSH_shadowed) )
1412 shadow_l1_normal_pt_update(
1413 req.ptr, req.val, &prev_spfn, &prev_spl1e);
1414 put_shadow_status(&d->mm);
1417 put_page_type(page);
1419 break;
1420 case PGT_l2_page_table:
1421 if ( likely(get_page_type(page, PGT_l2_page_table)) )
1423 okay = mod_l2_entry((l2_pgentry_t *)va,
1424 mk_l2_pgentry(req.val),
1425 pfn);
1427 if ( unlikely(d->mm.shadow_mode) && okay &&
1428 (get_shadow_status(&d->mm, page-frame_table) &
1429 PSH_shadowed) )
1431 shadow_l2_normal_pt_update(req.ptr, req.val);
1432 put_shadow_status(&d->mm);
1435 put_page_type(page);
1437 break;
1438 default:
1439 if ( likely(get_page_type(page, PGT_writable_page)) )
1441 *(unsigned long *)va = req.val;
1442 okay = 1;
1443 put_page_type(page);
1445 break;
1448 put_page(page);
1449 break;
1451 case MMU_MACHPHYS_UPDATE:
1452 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
1454 MEM_LOG("Could not get page for mach->phys update");
1455 break;
1458 machine_to_phys_mapping[pfn] = req.val;
1459 okay = 1;
1461 /*
1462 * If in log-dirty mode, mark the corresponding pseudo-physical
1463 * page as dirty.
1464 */
1465 if ( unlikely(d->mm.shadow_mode == SHM_logdirty) &&
1466 mark_dirty(&d->mm, pfn) )
1467 d->mm.shadow_dirty_block_count++;
1469 put_page(&frame_table[pfn]);
1470 break;
1472 /*
1473 * MMU_EXTENDED_COMMAND: Extended command is specified
1474 * in the least-significant bits of the 'value' field.
1475 */
1476 case MMU_EXTENDED_COMMAND:
1477 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
1478 okay = do_extended_command(req.ptr, req.val);
1479 break;
1481 default:
1482 MEM_LOG("Invalid page update command %08lx", req.ptr);
1483 break;
1486 if ( unlikely(!okay) )
1488 rc = -EINVAL;
1489 break;
1492 ureqs++;
1495 if ( prev_pfn != 0 )
1496 unmap_domain_mem((void *)va);
1498 if ( unlikely(prev_spl1e != 0) )
1499 unmap_domain_mem((void *)prev_spl1e);
1501 deferred_ops = percpu_info[cpu].deferred_ops;
1502 percpu_info[cpu].deferred_ops = 0;
1504 if ( deferred_ops & DOP_FLUSH_TLB )
1505 local_flush_tlb();
1507 if ( deferred_ops & DOP_RELOAD_LDT )
1508 (void)map_ldt_shadow_page(0);
1510 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1512 put_domain(percpu_info[cpu].foreign);
1513 percpu_info[cpu].foreign = NULL;
1516 if ( unlikely(success_count != NULL) )
1517 put_user(count, success_count);
1519 return rc;
1523 int do_update_va_mapping(unsigned long page_nr,
1524 unsigned long val,
1525 unsigned long flags)
1527 struct domain *d = current;
1528 int err = 0;
1529 unsigned int cpu = d->processor;
1530 unsigned long deferred_ops;
1532 perfc_incrc(calls_to_update_va);
1534 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
1535 return -EINVAL;
1537 cleanup_writable_pagetable(d, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
1539 /*
1540 * XXX When we make this support 4MB superpages we should also deal with
1541 * the case of updating L2 entries.
1542 */
1544 if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
1545 mk_l1_pgentry(val))) )
1546 err = -EINVAL;
1548 if ( unlikely(d->mm.shadow_mode) )
1550 unsigned long sval;
1552 l1pte_no_fault(&d->mm, &val, &sval);
1554 if ( unlikely(__put_user(sval, ((unsigned long *)(
1555 &shadow_linear_pg_table[page_nr])))) )
1557 /*
1558 * Since L2's are guaranteed RW, failure indicates the page was not
1559 * shadowed, so ignore.
1560 */
1561 perfc_incrc(shadow_update_va_fail);
1564 /*
1565 * If we're in log-dirty mode then we need to note that we've updated
1566 * the PTE in the PT-holding page. We need the machine frame number
1567 * for this.
1568 */
1569 if ( d->mm.shadow_mode == SHM_logdirty )
1570 mark_dirty( &current->mm, va_to_l1mfn(page_nr<<PAGE_SHIFT) );
1572 check_pagetable(d, d->mm.pagetable, "va"); /* debug */
1575 deferred_ops = percpu_info[cpu].deferred_ops;
1576 percpu_info[cpu].deferred_ops = 0;
1578 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
1579 unlikely(flags & UVMF_FLUSH_TLB) )
1580 local_flush_tlb();
1581 else if ( unlikely(flags & UVMF_INVLPG) )
1582 __flush_tlb_one(page_nr << PAGE_SHIFT);
1584 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
1585 (void)map_ldt_shadow_page(0);
1587 return err;
1590 int do_update_va_mapping_otherdomain(unsigned long page_nr,
1591 unsigned long val,
1592 unsigned long flags,
1593 domid_t domid)
1595 unsigned int cpu = smp_processor_id();
1596 struct domain *d;
1597 int rc;
1599 if ( unlikely(!IS_PRIV(current)) )
1600 return -EPERM;
1602 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
1603 if ( unlikely(d == NULL) )
1605 MEM_LOG("Unknown domain '%u'", domid);
1606 return -ESRCH;
1609 rc = do_update_va_mapping(page_nr, val, flags);
1611 put_domain(d);
1612 percpu_info[cpu].foreign = NULL;
1614 return rc;
1619 /*************************
1620 * Writable Pagetables
1621 */
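/*
 * Editor's note (descriptive summary added for clarity; not in the original
 * file): a write fault on one of the guest's own L1 page-table pages lands
 * in ptwr_do_page_fault(), which classifies the page as ACTIVE (currently
 * hooked into the L2) or INACTIVE, snapshots its contents, detaches the L2
 * entry if it is ACTIVE, and maps the page writable so the faulting write
 * can proceed.  ptwr_flush() later re-protects the page, diffs it against
 * the snapshot, adjusts references only for the entries that actually
 * changed, and reconnects the L2 entry for the ACTIVE slot.
 */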
1623 ptwr_info_t ptwr_info[NR_CPUS] =
1624 { [ 0 ... NR_CPUS-1 ] =
1626 .ptinfo[PTWR_PT_ACTIVE].l1va = 0,
1627 .ptinfo[PTWR_PT_ACTIVE].page = 0,
1628 .ptinfo[PTWR_PT_INACTIVE].l1va = 0,
1629 .ptinfo[PTWR_PT_INACTIVE].page = 0,
1631 };
1633 #ifdef VERBOSE
1634 int ptwr_debug = 0x0;
1635 #define PTWR_PRINTK(w, x) if ( unlikely(ptwr_debug & (w)) ) printk x
1636 #define PP_ALL 0xff
1637 #else
1638 #define PTWR_PRINTK(w, x)
1639 #endif
1641 void ptwr_flush(const int which)
1643 unsigned long pte, *ptep;
1644 l1_pgentry_t *pl1e;
1645 l2_pgentry_t *pl2e, nl2e;
1646 int cpu = smp_processor_id();
1647 int i;
1649 ptep = (unsigned long *)&linear_pg_table
1650 [ptwr_info[cpu].ptinfo[which].l1va>>PAGE_SHIFT];
1652 /* make pt page write protected */
1653 if ( unlikely(__get_user(pte, ptep)) ) {
1654 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1655 domain_crash();
1657 PTWR_PRINTK(PP_ALL, ("disconnected_l1va at %p is %08lx\n",
1658 ptep, pte));
1659 pte &= ~_PAGE_RW;
1660 if ( unlikely(__put_user(pte, ptep)) ) {
1661 MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
1662 domain_crash();
1664 __flush_tlb_one(ptwr_info[cpu].ptinfo[which].l1va);
1665 PTWR_PRINTK(PP_ALL, ("disconnected_l1va at %p now %08lx\n",
1666 ptep, pte));
1668 pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
1669 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) {
1670 l1_pgentry_t ol1e, nl1e;
1671 ol1e = ptwr_info[cpu].ptinfo[which].page[i];
1672 nl1e = pl1e[i];
1673 if (likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)))
1674 continue;
1675 if (likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e) | _PAGE_RW))
1676 && readonly_page_from_l1e(nl1e))
1677 continue;
1678 if (unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT))
1679 put_page_from_l1e(ol1e, current);
1680 if (unlikely(!get_page_from_l1e(nl1e, current))) {
1681 MEM_LOG("ptwr: Could not re-validate l1 page\n");
1682 domain_crash();
1685 unmap_domain_mem(pl1e);
1687 if (which == PTWR_PT_ACTIVE) {
1688 /* reconnect l1 page */
1689 pl2e = &linear_l2_table[ptwr_info[cpu].active_pteidx];
1690 nl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
1691 update_l2e(pl2e, *pl2e, nl2e);
1694 if ( unlikely(current->mm.shadow_mode) )
1696 unsigned long spte;
1697 unsigned long sstat =
1698 get_shadow_status(&current->mm, pte >> PAGE_SHIFT);
1700 if ( sstat & PSH_shadowed )
1702 int i;
1703 unsigned long spfn = sstat & PSH_pfn_mask;
1704 l1_pgentry_t *sl1e = map_domain_mem( spfn << PAGE_SHIFT );
1706 for( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1708 l1pte_no_fault(&current->mm,
1709 &l1_pgentry_val(
1710 ptwr_info[cpu].ptinfo[which].page[i]),
1711 &l1_pgentry_val(sl1e[i]));
1713 unmap_domain_mem(sl1e);
1714 put_shadow_status(&current->mm);
1717 l1pte_no_fault(&current->mm, &pte, &spte);
1718 __put_user(spte, (unsigned long *)&shadow_linear_pg_table
1719 [ptwr_info[cpu].ptinfo[which].l1va>>PAGE_SHIFT]);
1722 ptwr_info[cpu].ptinfo[which].l1va = 0;
1725 int ptwr_do_page_fault(unsigned long addr)
1727 /* write page fault, check if we're trying to modify an l1 page table */
1728 unsigned long pte, pfn;
1729 struct pfn_info *page;
1730 l2_pgentry_t *pl2e, nl2e;
1731 int cpu = smp_processor_id();
1732 int which;
1734 #if 0
1735 PTWR_PRINTK(PP_ALL, ("get user %p for va %08lx\n",
1736 &linear_pg_table[addr>>PAGE_SHIFT], addr));
1737 #endif
1739 /* Testing for page_present in the L2 avoids lots of unnecessary fixups */
1740 if ( (l2_pgentry_val(linear_l2_table[addr >> L2_PAGETABLE_SHIFT]) &
1741 _PAGE_PRESENT) &&
1742 (__get_user(pte, (unsigned long *)
1743 &linear_pg_table[addr >> PAGE_SHIFT]) == 0) )
1745 if( (pte & _PAGE_RW) && (pte & _PAGE_PRESENT) )
1746 return 0; /* we can't help. Maybe shadow mode can? */
1748 pfn = pte >> PAGE_SHIFT;
1749 #if 0
1750 PTWR_PRINTK(PP_ALL, ("check pte %08lx = pfn %08lx for va %08lx\n", pte,
1751 pfn, addr));
1752 #endif
1753 page = &frame_table[pfn];
1754 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
1756 u32 va_mask = page->u.inuse.type_info & PGT_va_mask;
1758 if ( unlikely(va_mask >= PGT_va_unknown) )
1759 domain_crash();
1760 va_mask >>= PGT_va_shift;
1762 pl2e = &linear_l2_table[va_mask];
1763 PTWR_PRINTK(PP_ALL, ("page_fault on l1 pt at va %08lx, pt for %08x"
1764 ", pfn %08lx\n", addr,
1765 va_mask << L2_PAGETABLE_SHIFT, pfn));
1767 which = (l2_pgentry_val(*pl2e) >> PAGE_SHIFT != pfn) ?
1768 PTWR_PT_INACTIVE : PTWR_PT_ACTIVE;
1770 if ( ptwr_info[cpu].ptinfo[which].l1va )
1771 ptwr_flush(which);
1772 ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
1774 if (which == PTWR_PT_ACTIVE) {
1775 ptwr_info[cpu].active_pteidx = va_mask;
1776 /* disconnect l1 page */
1777 nl2e = mk_l2_pgentry((l2_pgentry_val(*pl2e) & ~_PAGE_PRESENT));
1778 update_l2e(pl2e, *pl2e, nl2e);
1779 flush_tlb();
1782 ptwr_info[cpu].ptinfo[which].pl1e =
1783 map_domain_mem(pfn << PAGE_SHIFT);
1784 memcpy(ptwr_info[cpu].ptinfo[which].page,
1785 ptwr_info[cpu].ptinfo[which].pl1e,
1786 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
1788 /* make pt page writable */
1789 pte |= _PAGE_RW;
1790 PTWR_PRINTK(PP_ALL, ("update %p pte to %08lx\n",
1791 &linear_pg_table[addr>>PAGE_SHIFT], pte));
1792 if ( unlikely(__put_user(pte, (unsigned long *)
1793 &linear_pg_table[addr>>PAGE_SHIFT])) ) {
1794 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
1795 &linear_pg_table[addr>>PAGE_SHIFT]);
1796 domain_crash();
1799 /* maybe fall through to shadow mode to propagate */
1800 return ( !current->mm.shadow_mode );
1803 return 0;
1806 static __init int ptwr_init(void)
1808 int i;
1810 for ( i = 0; i < smp_num_cpus; i++ )
1812 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
1813 (void *)alloc_xenheap_page();
1814 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
1815 (void *)alloc_xenheap_page();
1816 machine_to_phys_mapping[virt_to_phys(
1817 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page)>>PAGE_SHIFT] =
1818 INVALID_P2M_ENTRY;
1819 machine_to_phys_mapping[virt_to_phys(
1820 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page)>>PAGE_SHIFT] =
1821 INVALID_P2M_ENTRY;
1824 return 0;
1826 __initcall(ptwr_init);
1828 #ifndef NDEBUG
1829 void ptwr_status(void)
1831 unsigned long pte, *ptep, pfn;
1832 struct pfn_info *page;
1833 int cpu = smp_processor_id();
1835 ptep = (unsigned long *)&linear_pg_table
1836 [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
1838 if ( __get_user(pte, ptep) ) {
1839 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1840 domain_crash();
1843 pfn = pte >> PAGE_SHIFT;
1844 page = &frame_table[pfn];
1845 printk("need to alloc l1 page %p\n", page);
1846 /* make pt page writable */
1847 printk("need to make read-only l1-page at %p is %08lx\n",
1848 ptep, pte);
1850 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
1851 return;
1853 if ( __get_user(pte, (unsigned long *)
1854 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
1855 MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
1856 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
1857 domain_crash();
1859 pfn = pte >> PAGE_SHIFT;
1860 page = &frame_table[pfn];
1864 /************************************************************************/
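/*
 * Editor's note (descriptive comment added for clarity; not in the original
 * file): audit_domain() cross-checks the reference counts of every page a
 * domain owns.  Phase 0 sanity-checks ownership and stashes each page's
 * original type_info in tlbflush_timestamp; phase 1 walks the page tables
 * and subtracts every reference they should account for (via adjust());
 * phase 2 reports any count that did not reach its expected residual value;
 * phase 3 walks the tables again to add the references back and restore the
 * saved state.
 */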
1867 void audit_domain(struct domain *d)
1869 int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
1871 void adjust (struct pfn_info *page, int dir, int adjtype)
1873 int count = page->count_info & PGC_count_mask;
1875 if ( adjtype )
1877 int tcount = page->u.inuse.type_info & PGT_count_mask;
1879 ttot++;
1881 tcount += dir;
1883 if ( tcount < 0 )
1885 /* This will only come out once. */
1886 printk("Audit %d: type count whent below zero pfn=%x "
1887 "taf=%x otaf=%x\n",
1888 d->domain, page-frame_table,
1889 page->u.inuse.type_info,
1890 page->tlbflush_timestamp);
1893 page->u.inuse.type_info =
1894 (page->u.inuse.type_info & ~PGT_count_mask) |
1895 (tcount & PGT_count_mask);
1898 ctot++;
1899 count += dir;
1900 if ( count < 0 )
1902 /* This will only come out once. */
1903 printk("Audit %d: general count whent below zero pfn=%x "
1904 "taf=%x otaf=%x\n",
1905 d->domain, page-frame_table,
1906 page->u.inuse.type_info,
1907 page->tlbflush_timestamp);
1910 page->count_info =
1911 (page->count_info & ~PGC_count_mask) |
1912 (count & PGC_count_mask);
1916 void scan_for_pfn(struct domain *d, unsigned long xpfn)
1918 unsigned long pfn, *pt;
1919 struct list_head *list_ent;
1920 struct pfn_info *page;
1921 int i;
1923 list_ent = d->page_list.next;
1924 for ( i = 0; (list_ent != &d->page_list); i++ )
1926 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
1927 page = &frame_table[pfn];
1929 switch ( page->u.inuse.type_info & PGT_type_mask )
1931 case PGT_l1_page_table:
1932 case PGT_l2_page_table:
1933 pt = map_domain_mem(pfn<<PAGE_SHIFT);
1934 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1935 if ( (pt[i] & _PAGE_PRESENT) &&
1936 ((pt[i] >> PAGE_SHIFT) == xpfn) )
1937 printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
1938 d->domain, i, pfn, page->u.inuse.type_info,
1939 page->count_info);
1940 unmap_domain_mem(pt);
1943 list_ent = frame_table[pfn].list.next;
1948 void scan_for_pfn_remote(unsigned long xpfn)
1950 struct domain *e;
1951 for_each_domain ( e )
1952 scan_for_pfn( e, xpfn );
1955 int i;
1956 unsigned long pfn;
1957 struct list_head *list_ent;
1958 struct pfn_info *page;
1960 if ( d != current )
1961 domain_pause(d);
1962 synchronise_pagetables(~0UL);
1964 printk("pt base=%lx sh_info=%x\n",
1965 pagetable_val(d->mm.pagetable)>>PAGE_SHIFT,
1966 virt_to_page(d->shared_info)-frame_table);
1968 spin_lock(&d->page_alloc_lock);
1970 /* PHASE 0 */
1972 list_ent = d->page_list.next;
1973 for ( i = 0; (list_ent != &d->page_list); i++ )
1975 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
1976 page = &frame_table[pfn];
1978 if ( page->u.inuse.domain != d )
1979 BUG();
1981 if ( (page->u.inuse.type_info & PGT_count_mask) >
1982 (page->count_info & PGC_count_mask) )
1983 printk("taf > caf %x %x pfn=%lx\n",
1984 page->u.inuse.type_info, page->count_info, pfn );
1986 #if 0 /* SYSV shared memory pages plus writeable files. */
1987 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
1988 (page->u.inuse.type_info & PGT_count_mask) > 1 )
1990 printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
1991 pfn,
1992 page->u.inuse.type_info,
1993 page->count_info );
1994 scan_for_pfn_remote(pfn);
1996 #endif
1997 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
1998 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2000 printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
2001 pfn,
2002 page->u.inuse.type_info,
2003 page->count_info );
2006 /* Use tlbflush_timestamp to store original type_info. */
2007 page->tlbflush_timestamp = page->u.inuse.type_info;
2009 list_ent = frame_table[pfn].list.next;
2013 /* PHASE 1 */
2015 adjust(&frame_table[pagetable_val(d->mm.pagetable)>>PAGE_SHIFT], -1, 1);
2017 list_ent = d->page_list.next;
2018 for ( i = 0; (list_ent != &d->page_list); i++ )
2020 unsigned long *pt;
2021 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2022 page = &frame_table[pfn];
2024 if ( page->u.inuse.domain != d )
2025 BUG();
2027 switch ( page->u.inuse.type_info & PGT_type_mask )
2029 case PGT_l2_page_table:
2031 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2032 printk("Audit %d: L2 not validated %x\n",
2033 d->domain, page->u.inuse.type_info);
2035 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2036 printk("Audit %d: L2 not pinned %x\n",
2037 d->domain, page->u.inuse.type_info);
2038 else
2039 adjust( page, -1, 1 );
2041 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2043 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2045 if ( pt[i] & _PAGE_PRESENT )
2047 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2048 struct pfn_info *l1page = &frame_table[l1pfn];
2050 if ( l1page->u.inuse.domain != d )
2052 printk("L2: Skip bizarre page belonging to other "
2053 "dom %p\n", l1page->u.inuse.domain);
2054 continue;
2057 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2058 PGT_l2_page_table )
2059 printk("Audit %d: [%x] Found %s Linear PT "
2060 "t=%x pfn=%lx\n", d->domain, i,
2061 (l1pfn==pfn) ? "Self" : "Other",
2062 l1page->u.inuse.type_info,
2063 l1pfn);
2064 else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
2065 PGT_l1_page_table )
2066 printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
2067 d->domain, i,
2068 l1page->u.inuse.type_info,
2069 l1pfn);
2071 adjust(l1page, -1, 1);
2075 unmap_domain_mem(pt);
2077 break;
2080 case PGT_l1_page_table:
2082 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2083 adjust( page, -1, 1 );
2085 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2086 printk("Audit %d: L1 not validated %x\n",
2087 d->domain, page->u.inuse.type_info);
2088 #if 0
2089 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2090 printk("Audit %d: L1 not pinned %x\n",
2091 d->domain, page->u.inuse.type_info);
2092 #endif
2093 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2095 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2097 if ( pt[i] & _PAGE_PRESENT )
2099 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2100 struct pfn_info *l1page = &frame_table[l1pfn];
2102 if ( l1pfn < 0x100 )
2104 lowmem_mappings++;
2105 continue;
2108 if ( l1pfn > max_page )
2110 io_mappings++;
2111 continue;
2114 if ( pt[i] & _PAGE_RW )
2117 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2118 PGT_l1_page_table ||
2119 (l1page->u.inuse.type_info & PGT_type_mask) ==
2120 PGT_l2_page_table )
2121 printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n",
2122 d->domain, i,
2123 l1page->u.inuse.type_info,
2124 l1pfn);
2128 if ( l1page->u.inuse.domain != d )
2130 printk("Audit %d: [%lx,%x] Skip foreign page dom=%lx "
2131 "pfn=%lx c=%08x t=%08x m2p=%lx\n",
2132 d->domain, pfn, i,
2133 (unsigned long)l1page->u.inuse.domain,
2134 l1pfn,
2135 l1page->count_info,
2136 l1page->u.inuse.type_info,
2137 machine_to_phys_mapping[l1pfn]);
2138 continue;
2141 adjust(l1page, -1, 0);
2145 unmap_domain_mem(pt);
2147 break;
2150 list_ent = frame_table[pfn].list.next;
2153 if ( (io_mappings > 0) || (lowmem_mappings > 0) )
2154 printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
2155 d->domain, lowmem_mappings, io_mappings);
2157 /* PHASE 2 */
2159 ctot = ttot = 0;
2160 list_ent = d->page_list.next;
2161 for ( i = 0; (list_ent != &d->page_list); i++ )
2163 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2164 page = &frame_table[pfn];
2166 switch ( page->u.inuse.type_info & PGT_type_mask)
2168 case PGT_l1_page_table:
2169 case PGT_l2_page_table:
2170 if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
2172 printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
2173 d->domain, page->u.inuse.type_info,
2174 page->tlbflush_timestamp,
2175 page->count_info, pfn );
2176 scan_for_pfn_remote(pfn);
2178 default:
2179 if ( (page->count_info & PGC_count_mask) != 1 )
2181 printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
2182 d->domain,
2183 page->count_info,
2184 page->u.inuse.type_info,
2185 page->tlbflush_timestamp, pfn );
2186 scan_for_pfn_remote(pfn);
2188 break;
2191 list_ent = frame_table[pfn].list.next;
2194 /* PHASE 3 */
2196 list_ent = d->page_list.next;
2197 for ( i = 0; (list_ent != &d->page_list); i++ )
2199 unsigned long *pt;
2200 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2201 page = &frame_table[pfn];
2203 switch ( page->u.inuse.type_info & PGT_type_mask )
2205 case PGT_l2_page_table:
2206 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2207 adjust( page, 1, 1 );
2209 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2211 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2213 if ( pt[i] & _PAGE_PRESENT )
2215 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2216 struct pfn_info *l1page = &frame_table[l1pfn];
2218 if ( l1page->u.inuse.domain == d)
2219 adjust(l1page, 1, 1);
2223 unmap_domain_mem(pt);
2224 break;
2226 case PGT_l1_page_table:
2227 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2228 adjust( page, 1, 1 );
2230 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2232 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2234 if ( pt[i] & _PAGE_PRESENT )
2236 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2237 struct pfn_info *l1page = &frame_table[l1pfn];
2239 if ( (l1page->u.inuse.domain != d) ||
2240 (l1pfn < 0x100) || (l1pfn > max_page) )
2241 continue;
2243 adjust(l1page, 1, 0);
2247 unmap_domain_mem(pt);
2248 break;
2252 page->tlbflush_timestamp = 0;
2254 list_ent = frame_table[pfn].list.next;
2257 spin_unlock(&d->page_alloc_lock);
2259 adjust(&frame_table[pagetable_val(d->mm.pagetable)>>PAGE_SHIFT], 1, 1);
2261 printk("Audit %d: Done. ctot=%d ttot=%d\n",d->domain, ctot, ttot );
2263 if ( d != current )
2264 domain_unpause(d);
2267 void audit_domains(void)
2269 struct domain *d;
2270 for_each_domain ( d )
2271 audit_domain(d);
2274 void audit_domains_key(unsigned char key, void *dev_id,
2275 struct pt_regs *regs)
2277 open_softirq(MEMAUDIT_SOFTIRQ, audit_domains);
2278 raise_softirq(MEMAUDIT_SOFTIRQ);
2282 #endif