debuggers.hg

view xen/arch/x86/memory.c @ 3129:e0351a3744a5

bitkeeper revision 1.1159.187.4 (41a471c8NjyQJy-vepqpb8H7LdzHzA)

Allow preemption of long-running hypercalls for softirq processing.
author kaf24@scramble.cl.cam.ac.uk
date Wed Nov 24 11:34:32 2004 +0000 (2004-11-24)
parents fef4b77be191
children 2754a2ed61c3 2fae9947de6f b013a6b30d9e 3aad77958a08
1 /******************************************************************************
2 * arch/x86/memory.c
3 *
4 * Copyright (c) 2002-2004 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame need not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OSes, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, the OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
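/*
 * Illustrative sketch (not part of the original memory.c): how a guest
 * might build one (ptr, val) request for do_mmu_update() below. The
 * mmu_update_t fields and the MMU_NORMAL_PT_UPDATE encoding in the low
 * bits of 'ptr' follow the decoding in do_mmu_update(); the
 * HYPERVISOR_mmu_update() wrapper name is an assumed guest-side stub.
 */
#if 0 /* example only -- never compiled as part of this file */
static int example_set_pte(unsigned long pte_machine_addr,
                           unsigned long new_pte_val)
{
    mmu_update_t req;
    int done;

    /* Low bits of 'ptr' select the command; the rest addresses the PTE. */
    req.ptr = (pte_machine_addr & ~(sizeof(l1_pgentry_t)-1)) |
              MMU_NORMAL_PT_UPDATE;
    req.val = new_pte_val;              /* requested operation: *ptr = val */

    return HYPERVISOR_mmu_update(&req, 1, &done);
}
#endif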
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/lib.h>
90 #include <xen/mm.h>
91 #include <xen/sched.h>
92 #include <xen/errno.h>
93 #include <xen/perfc.h>
94 #include <xen/irq.h>
95 #include <xen/softirq.h>
96 #include <asm/shadow.h>
97 #include <asm/page.h>
98 #include <asm/flushtlb.h>
99 #include <asm/io.h>
100 #include <asm/uaccess.h>
101 #include <asm/domain_page.h>
102 #include <asm/ldt.h>
104 #ifdef VERBOSE
105 #define MEM_LOG(_f, _a...) \
106 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
107 current->id , __LINE__ , ## _a )
108 #else
109 #define MEM_LOG(_f, _a...) ((void)0)
110 #endif
112 static int alloc_l2_table(struct pfn_info *page);
113 static int alloc_l1_table(struct pfn_info *page);
114 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
115 static int get_page_and_type_from_pagenr(unsigned long page_nr,
116 u32 type,
117 struct domain *d);
119 static void free_l2_table(struct pfn_info *page);
120 static void free_l1_table(struct pfn_info *page);
122 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
123 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
125 /* Used to defer flushing of memory structures. */
126 static struct {
127 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
128 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
129 unsigned long deferred_ops;
130 unsigned long cr0;
131 /* If non-NULL, specifies a foreign subject domain for some operations. */
132 struct domain *foreign;
133 } __cacheline_aligned percpu_info[NR_CPUS];
135 /*
136 * Returns the current foreign domain; defaults to the currently-executing
137 * domain if a foreign override hasn't been specified.
138 */
139 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current)
141 /* Private domain structs for DOMID_XEN and DOMID_IO. */
142 static struct domain *dom_xen, *dom_io;
144 void arch_init_memory(void)
145 {
146 unsigned long mfn;
148 /*
149 * We are rather picky about the layout of 'struct pfn_info'. The
150 * count_info and domain fields must be adjacent, as we perform atomic
151 * 64-bit operations on them. Also, just for sanity, we assert the size
152 * of the structure here.
153 */
154 if ( (offsetof(struct pfn_info, u.inuse.domain) !=
155 (offsetof(struct pfn_info, count_info) + sizeof(u32))) ||
156 (sizeof(struct pfn_info) != 24) )
157 {
158 printk("Weird pfn_info layout (%ld,%ld,%d)\n",
159 offsetof(struct pfn_info, count_info),
160 offsetof(struct pfn_info, u.inuse.domain),
161 sizeof(struct pfn_info));
162 for ( ; ; ) ;
163 }
165 memset(percpu_info, 0, sizeof(percpu_info));
167 /* Initialise to a magic of 0x55555555 so easier to spot bugs later. */
168 memset(machine_to_phys_mapping, 0x55, 4<<20);
170 /*
171 * Initialise our DOMID_XEN domain.
172 * Any Xen-heap pages that we will allow to be mapped will have
173 * their domain field set to dom_xen.
174 */
175 dom_xen = alloc_domain_struct();
176 atomic_set(&dom_xen->refcnt, 1);
177 dom_xen->id = DOMID_XEN;
179 /*
180 * Initialise our DOMID_IO domain.
181 * This domain owns no pages but is considered a special case when
182 * mapping I/O pages, as the mappings occur at the privilege level of the caller.
183 */
184 dom_io = alloc_domain_struct();
185 atomic_set(&dom_io->refcnt, 1);
186 dom_io->id = DOMID_IO;
188 /* M2P table is mappable read-only by privileged domains. */
189 for ( mfn = virt_to_phys(&machine_to_phys_mapping[0<<20])>>PAGE_SHIFT;
190 mfn < virt_to_phys(&machine_to_phys_mapping[1<<20])>>PAGE_SHIFT;
191 mfn++ )
192 {
193 frame_table[mfn].count_info = PGC_allocated | 1;
194 frame_table[mfn].u.inuse.type_info = PGT_gdt_page | 1; /* non-RW */
195 frame_table[mfn].u.inuse.domain = dom_xen;
196 }
197 }
199 static void __invalidate_shadow_ldt(struct domain *d)
200 {
201 int i;
202 unsigned long pfn;
203 struct pfn_info *page;
205 d->mm.shadow_ldt_mapcnt = 0;
207 for ( i = 16; i < 32; i++ )
208 {
209 pfn = l1_pgentry_to_pagenr(d->mm.perdomain_pt[i]);
210 if ( pfn == 0 ) continue;
211 d->mm.perdomain_pt[i] = mk_l1_pgentry(0);
212 page = &frame_table[pfn];
213 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
214 ASSERT_PAGE_IS_DOMAIN(page, d);
215 put_page_and_type(page);
216 }
218 /* Dispose of the (now possibly invalid) mappings from the TLB. */
219 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
220 }
223 static inline void invalidate_shadow_ldt(struct domain *d)
224 {
225 if ( d->mm.shadow_ldt_mapcnt != 0 )
226 __invalidate_shadow_ldt(d);
227 }
230 static int alloc_segdesc_page(struct pfn_info *page)
231 {
232 unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
233 int i;
235 for ( i = 0; i < 512; i++ )
236 if ( unlikely(!check_descriptor(&descs[i*2])) )
237 goto fail;
239 unmap_domain_mem(descs);
240 return 1;
242 fail:
243 unmap_domain_mem(descs);
244 return 0;
245 }
248 /* Map shadow page at offset @off. */
249 int map_ldt_shadow_page(unsigned int off)
250 {
251 struct domain *d = current;
252 unsigned long l1e;
254 if ( unlikely(in_irq()) )
255 BUG();
257 __get_user(l1e, (unsigned long *)&linear_pg_table[(d->mm.ldt_base >>
258 PAGE_SHIFT) + off]);
260 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
261 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
262 d, PGT_ldt_page)) )
263 return 0;
265 d->mm.perdomain_pt[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
266 d->mm.shadow_ldt_mapcnt++;
268 return 1;
269 }
272 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
273 {
274 struct pfn_info *page = &frame_table[page_nr];
276 if ( unlikely(!pfn_is_ram(page_nr)) )
277 {
278 MEM_LOG("Pfn %08lx is not RAM", page_nr);
279 return 0;
280 }
282 if ( unlikely(!get_page(page, d)) )
283 {
284 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
285 return 0;
286 }
288 return 1;
289 }
292 static int get_page_and_type_from_pagenr(unsigned long page_nr,
293 u32 type,
294 struct domain *d)
295 {
296 struct pfn_info *page = &frame_table[page_nr];
298 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
299 return 0;
301 if ( unlikely(!get_page_type(page, type)) )
302 {
303 #ifdef VERBOSE
304 if ( (type & PGT_type_mask) != PGT_l1_page_table )
305 MEM_LOG("Bad page type for pfn %08lx (%08x)",
306 page_nr, page->u.inuse.type_info);
307 #endif
308 put_page(page);
309 return 0;
310 }
312 return 1;
313 }
316 /*
317 * We allow L2 tables to map each other (a.k.a. linear page tables). This
318 * needs some special care with reference counts and access permissions:
319 * 1. The mapping entry must be read-only, or the guest may get write access
320 * to its own PTEs.
321 * 2. We must only bump the reference counts for an *already validated*
322 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
323 * on a validation that is required to complete that validation.
324 * 3. We only need to increment the reference counts for the mapped page
325 * frame if it is mapped by a different L2 table. This is sufficient and
326 * also necessary to allow validation of an L2 table mapping itself.
327 */
328 static int
329 get_linear_pagetable(
330 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
331 {
332 u32 x, y;
333 struct pfn_info *page;
335 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
336 {
337 MEM_LOG("Attempt to create linear p.t. with write perms");
338 return 0;
339 }
341 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
342 {
343 /* Make sure the mapped frame belongs to the correct domain. */
344 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
345 return 0;
347 /*
348 * Make sure that the mapped frame is an already-validated L2 table.
349 * If so, atomically increment the count (checking for overflow).
350 */
351 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
352 y = page->u.inuse.type_info;
353 do {
354 x = y;
355 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
356 unlikely((x & (PGT_type_mask|PGT_validated)) !=
357 (PGT_l2_page_table|PGT_validated)) )
358 {
359 put_page(page);
360 return 0;
361 }
362 }
363 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
364 }
366 return 1;
367 }
370 static int
371 get_page_from_l1e(
372 l1_pgentry_t l1e, struct domain *d)
373 {
374 unsigned long l1v = l1_pgentry_val(l1e);
375 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
376 struct pfn_info *page = &frame_table[pfn];
377 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
379 if ( !(l1v & _PAGE_PRESENT) )
380 return 1;
382 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
383 {
384 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
385 return 0;
386 }
388 if ( unlikely(!pfn_is_ram(pfn)) )
389 {
390 /* Revert to caller privileges if FD == DOMID_IO. */
391 if ( d == dom_io )
392 d = current;
394 if ( IS_PRIV(d) )
395 return 1;
397 if ( IS_CAPABLE_PHYSDEV(d) )
398 return domain_iomem_in_pfn(d, pfn);
400 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
401 return 0;
402 }
404 return ((l1v & _PAGE_RW) ?
405 get_page_and_type(page, d, PGT_writable_page) :
406 get_page(page, d));
407 }
410 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
411 static int
412 get_page_from_l2e(
413 l2_pgentry_t l2e, unsigned long pfn,
414 struct domain *d, unsigned long va_idx)
415 {
416 int rc;
418 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
419 return 1;
421 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
422 {
423 MEM_LOG("Bad L2 page type settings %04lx",
424 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
425 return 0;
426 }
428 rc = get_page_and_type_from_pagenr(
429 l2_pgentry_to_pagenr(l2e),
430 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
432 if ( unlikely(!rc) )
433 return get_linear_pagetable(l2e, pfn, d);
435 return 1;
436 }
439 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
440 {
441 unsigned long l1v = l1_pgentry_val(l1e);
442 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
443 struct pfn_info *page = &frame_table[pfn];
444 struct domain *e = page->u.inuse.domain;
446 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
447 return;
449 if ( unlikely(e != d) )
450 {
451 /*
452 * Unmap a foreign page that may have been mapped via a grant table.
453 * Note that this can fail for a privileged domain that can map foreign
454 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
455 * counted via a grant entry and some counted directly in the page
456 * structure's reference count. Note that reference counts won't get
457 * dangerously confused as long as we always try to decrement the
458 * grant entry first. We may end up with a mismatch between which
459 * mappings and which unmappings are counted via the grant entry, but
460 * really it doesn't matter as privileged domains have carte blanche.
461 */
462 if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
463 return;
464 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
465 }
467 if ( l1v & _PAGE_RW )
468 {
469 put_page_and_type(page);
470 }
471 else
472 {
473 /* We expect this to be rare, so we blow away the entire shadow LDT. */
474 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
475 PGT_ldt_page)) &&
476 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
477 invalidate_shadow_ldt(e);
478 put_page(page);
479 }
480 }
483 /*
484 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
485 * Note also that this automatically deals correctly with linear p.t.'s.
486 */
487 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
488 {
489 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
490 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
491 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
492 }
495 static int alloc_l2_table(struct pfn_info *page)
496 {
497 struct domain *d = page->u.inuse.domain;
498 unsigned long page_nr = page_to_pfn(page);
499 l2_pgentry_t *pl2e;
500 int i;
502 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
504 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) {
505 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
506 goto fail;
507 }
509 #if defined(__i386__)
510 /* Now we add our private high mappings. */
511 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
512 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
513 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
514 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
515 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
516 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
517 mk_l2_pgentry(__pa(page->u.inuse.domain->mm.perdomain_pt) |
518 __PAGE_HYPERVISOR);
519 #endif
521 unmap_domain_mem(pl2e);
522 return 1;
524 fail:
525 while ( i-- > 0 )
526 put_page_from_l2e(pl2e[i], page_nr);
528 unmap_domain_mem(pl2e);
529 return 0;
530 }
533 static int alloc_l1_table(struct pfn_info *page)
534 {
535 struct domain *d = page->u.inuse.domain;
536 unsigned long page_nr = page_to_pfn(page);
537 l1_pgentry_t *pl1e;
538 int i;
540 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
542 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
543 if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
544 goto fail;
546 unmap_domain_mem(pl1e);
547 return 1;
549 fail:
550 while ( i-- > 0 )
551 put_page_from_l1e(pl1e[i], d);
553 unmap_domain_mem(pl1e);
554 return 0;
555 }
558 static void free_l2_table(struct pfn_info *page)
559 {
560 unsigned long page_nr = page - frame_table;
561 l2_pgentry_t *pl2e;
562 int i;
564 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
566 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
567 put_page_from_l2e(pl2e[i], page_nr);
569 unmap_domain_mem(pl2e);
570 }
573 static void free_l1_table(struct pfn_info *page)
574 {
575 struct domain *d = page->u.inuse.domain;
576 unsigned long page_nr = page - frame_table;
577 l1_pgentry_t *pl1e;
578 int i;
580 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
582 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
583 put_page_from_l1e(pl1e[i], d);
585 unmap_domain_mem(pl1e);
586 }
589 static inline int update_l2e(l2_pgentry_t *pl2e,
590 l2_pgentry_t ol2e,
591 l2_pgentry_t nl2e)
592 {
593 unsigned long o = cmpxchg((unsigned long *)pl2e,
594 l2_pgentry_val(ol2e),
595 l2_pgentry_val(nl2e));
596 if ( o != l2_pgentry_val(ol2e) )
597 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
598 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
599 return (o == l2_pgentry_val(ol2e));
600 }
603 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
604 static int mod_l2_entry(l2_pgentry_t *pl2e,
605 l2_pgentry_t nl2e,
606 unsigned long pfn)
607 {
608 l2_pgentry_t ol2e;
609 unsigned long _ol2e;
611 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
612 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
613 {
614 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
615 return 0;
616 }
618 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
619 return 0;
620 ol2e = mk_l2_pgentry(_ol2e);
622 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
623 {
624 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
625 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
626 return update_l2e(pl2e, ol2e, nl2e);
628 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current,
629 ((unsigned long)pl2e &
630 ~PAGE_MASK) >> 2)) )
631 return 0;
633 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
634 {
635 put_page_from_l2e(nl2e, pfn);
636 return 0;
637 }
639 put_page_from_l2e(ol2e, pfn);
640 return 1;
641 }
643 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
644 return 0;
646 put_page_from_l2e(ol2e, pfn);
647 return 1;
648 }
651 static inline int update_l1e(l1_pgentry_t *pl1e,
652 l1_pgentry_t ol1e,
653 l1_pgentry_t nl1e)
654 {
655 unsigned long o = l1_pgentry_val(ol1e);
656 unsigned long n = l1_pgentry_val(nl1e);
658 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
659 unlikely(o != l1_pgentry_val(ol1e)) )
660 {
661 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
662 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
663 return 0;
664 }
666 return 1;
667 }
670 /* Update the L1 entry at pl1e to new value nl1e. */
671 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
672 {
673 l1_pgentry_t ol1e;
674 unsigned long _ol1e;
675 struct domain *d = current;
677 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
678 {
679 MEM_LOG("Bad get_user\n");
680 return 0;
681 }
683 ol1e = mk_l1_pgentry(_ol1e);
685 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
686 {
687 /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
688 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
689 return update_l1e(pl1e, ol1e, nl1e);
691 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
692 return 0;
694 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
695 {
696 put_page_from_l1e(nl1e, d);
697 return 0;
698 }
700 put_page_from_l1e(ol1e, d);
701 return 1;
702 }
704 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
705 return 0;
707 put_page_from_l1e(ol1e, d);
708 return 1;
709 }
712 int alloc_page_type(struct pfn_info *page, unsigned int type)
713 {
714 switch ( type )
715 {
716 case PGT_l1_page_table:
717 return alloc_l1_table(page);
718 case PGT_l2_page_table:
719 return alloc_l2_table(page);
720 case PGT_gdt_page:
721 case PGT_ldt_page:
722 return alloc_segdesc_page(page);
723 default:
724 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
725 type, page->u.inuse.type_info,
726 page->count_info);
727 BUG();
728 }
730 return 0;
731 }
734 void free_page_type(struct pfn_info *page, unsigned int type)
735 {
736 struct domain *d = page->u.inuse.domain;
738 switch ( type )
739 {
740 case PGT_l1_page_table:
741 free_l1_table(page);
742 break;
744 case PGT_l2_page_table:
745 free_l2_table(page);
746 break;
748 default:
749 BUG();
750 }
752 if ( unlikely(d->mm.shadow_mode) &&
753 (get_shadow_status(&d->mm, page_to_pfn(page)) & PSH_shadowed) )
754 {
755 unshadow_table(page_to_pfn(page), type);
756 put_shadow_status(&d->mm);
757 }
758 }
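/*
 * Summary of the type_info word manipulated by get_page_type() and
 * put_page_type() below: it packs a use count (PGT_count_mask), the
 * current type (PGT_type_mask), the PGT_validated and PGT_pinned flags,
 * and, for L1 tables, a virtual-address backpointer selected by
 * PGT_va_mask. Both routines update the whole word atomically with
 * cmpxchg loops; the exact bit layout lives in the asm headers, not here.
 */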
761 void put_page_type(struct pfn_info *page)
762 {
763 u32 nx, x, y = page->u.inuse.type_info;
765 again:
766 do {
767 x = y;
768 nx = x - 1;
770 ASSERT((x & PGT_count_mask) != 0);
772 /*
773 * The page should always be validated while a reference is held. The
774 * exception is during domain destruction, when we forcibly invalidate
775 * page-table pages if we detect a referential loop.
776 * See domain.c:relinquish_list().
777 */
778 ASSERT((x & PGT_validated) ||
779 test_bit(DF_DYING, &page->u.inuse.domain->flags));
781 if ( unlikely((nx & PGT_count_mask) == 0) )
782 {
783 /* Record TLB information for flush later. Races are harmless. */
784 page->tlbflush_timestamp = tlbflush_current_time();
786 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
787 likely(nx & PGT_validated) )
788 {
789 /*
790 * Page-table pages must be unvalidated when count is zero. The
791 * 'free' is safe because the refcnt is non-zero and validated
792 * bit is clear => other ops will spin or fail.
793 */
794 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
795 x & ~PGT_validated)) != x) )
796 goto again;
797 /* We cleared the 'valid bit', so now we do the clean-up. */
798 free_page_type(page, x & PGT_type_mask);
799 /* Carry on, but with the 'valid bit' now clear. */
800 x &= ~PGT_validated;
801 nx &= ~PGT_validated;
802 }
803 }
804 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
805 (PGT_pinned | 1)) )
806 {
807 /* Page is now only pinned. Make the back pointer mutable again. */
808 nx |= PGT_va_mutable;
809 }
810 }
811 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
812 }
815 int get_page_type(struct pfn_info *page, u32 type)
816 {
817 u32 nx, x, y = page->u.inuse.type_info;
819 again:
820 do {
821 x = y;
822 nx = x + 1;
823 if ( unlikely((nx & PGT_count_mask) == 0) )
824 {
825 MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
826 return 0;
827 }
828 else if ( unlikely((x & PGT_count_mask) == 0) )
829 {
830 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
831 {
832 /*
833 * On type change we check to flush stale TLB entries. This
834 * may be unnecessary (e.g., page was GDT/LDT) but those
835 * circumstances should be very rare.
836 */
837 struct domain *d = page->u.inuse.domain;
838 if ( unlikely(NEED_FLUSH(tlbflush_time[d->processor],
839 page->tlbflush_timestamp)) )
840 {
841 perfc_incr(need_flush_tlb_flush);
842 flush_tlb_cpu(d->processor);
843 }
845 /* We lose existing type, back pointer, and validity. */
846 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
847 nx |= type;
849 /* No special validation needed for writable pages. */
850 /* Page tables and GDT/LDT need to be scanned for validity. */
851 if ( type == PGT_writable_page )
852 nx |= PGT_validated;
853 }
854 }
855 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
856 {
857 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
858 {
859 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
860 ((type & PGT_type_mask) != PGT_l1_page_table) )
861 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
862 x & PGT_type_mask, type, page_to_pfn(page));
863 return 0;
864 }
865 else if ( (x & PGT_va_mask) == PGT_va_mutable )
866 {
867 /* The va backpointer is mutable, hence we update it. */
868 nx &= ~PGT_va_mask;
869 nx |= type; /* we know the actual type is correct */
870 }
871 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
872 {
873 /* This table is potentially mapped at multiple locations. */
874 nx &= ~PGT_va_mask;
875 nx |= PGT_va_unknown;
876 }
877 }
878 else if ( unlikely(!(x & PGT_validated)) )
879 {
880 /* Someone else is updating validation of this page. Wait... */
881 while ( (y = page->u.inuse.type_info) == x )
882 {
883 rep_nop();
884 barrier();
885 }
886 goto again;
887 }
888 }
889 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
891 if ( unlikely(!(nx & PGT_validated)) )
892 {
893 /* Try to validate page type; drop the new reference on failure. */
894 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
895 {
896 MEM_LOG("Error while validating pfn %08lx for type %08x."
897 " caf=%08x taf=%08x\n",
898 page_to_pfn(page), type,
899 page->count_info,
900 page->u.inuse.type_info);
901 /* No one else can get a reference. We hold the only ref. */
902 page->u.inuse.type_info = 0;
903 return 0;
904 }
906 /* No one else is updating simultaneously. */
907 __set_bit(_PGT_validated, &page->u.inuse.type_info);
908 }
910 return 1;
911 }
914 static int do_extended_command(unsigned long ptr, unsigned long val)
915 {
916 int okay = 1, cpu = smp_processor_id();
917 unsigned int cmd = val & MMUEXT_CMD_MASK;
918 unsigned long pfn = ptr >> PAGE_SHIFT;
919 unsigned long old_base_pfn;
920 struct pfn_info *page = &frame_table[pfn];
921 struct domain *d = current, *nd, *e;
922 u32 x, y;
923 domid_t domid;
924 grant_ref_t gntref;
926 switch ( cmd )
927 {
928 case MMUEXT_PIN_L1_TABLE:
929 case MMUEXT_PIN_L2_TABLE:
930 /*
931 * We insist that, if you pin an L1 page, it's the first thing that
932 * you do to it. This is because we require the backptr to still be
933 * mutable. This assumption seems safe.
934 */
935 okay = get_page_and_type_from_pagenr(
936 pfn,
937 ((cmd==MMUEXT_PIN_L2_TABLE) ?
938 PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
939 FOREIGNDOM);
941 if ( unlikely(!okay) )
942 {
943 MEM_LOG("Error while pinning pfn %08lx", pfn);
944 break;
945 }
947 if ( unlikely(test_and_set_bit(_PGT_pinned,
948 &page->u.inuse.type_info)) )
949 {
950 MEM_LOG("Pfn %08lx already pinned", pfn);
951 put_page_and_type(page);
952 okay = 0;
953 break;
954 }
956 break;
958 case MMUEXT_UNPIN_TABLE:
959 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
960 {
961 MEM_LOG("Page %08lx bad domain (dom=%p)",
962 ptr, page->u.inuse.domain);
963 }
964 else if ( likely(test_and_clear_bit(_PGT_pinned,
965 &page->u.inuse.type_info)) )
966 {
967 put_page_and_type(page);
968 put_page(page);
969 }
970 else
971 {
972 okay = 0;
973 put_page(page);
974 MEM_LOG("Pfn %08lx not pinned", pfn);
975 }
976 break;
978 case MMUEXT_NEW_BASEPTR:
979 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
980 if ( likely(okay) )
981 {
982 invalidate_shadow_ldt(d);
984 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
985 old_base_pfn = pagetable_val(d->mm.pagetable) >> PAGE_SHIFT;
986 d->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
988 shadow_mk_pagetable(&d->mm);
990 write_ptbase(&d->mm);
992 put_page_and_type(&frame_table[old_base_pfn]);
993 }
994 else
995 {
996 MEM_LOG("Error while installing new baseptr %08lx", ptr);
997 }
998 break;
1000 case MMUEXT_TLB_FLUSH:
1001 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1002 break;
1004 case MMUEXT_INVLPG:
1005 __flush_tlb_one(ptr);
1006 break;
1008 case MMUEXT_FLUSH_CACHE:
1009 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1011 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1012 okay = 0;
1014 else
1016 wbinvd();
1018 break;
1020 case MMUEXT_SET_LDT:
1022 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
1023 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1024 (ents > 8192) ||
1025 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
1026 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
1028 okay = 0;
1029 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
1031 else if ( (d->mm.ldt_ents != ents) ||
1032 (d->mm.ldt_base != ptr) )
1034 invalidate_shadow_ldt(d);
1035 d->mm.ldt_base = ptr;
1036 d->mm.ldt_ents = ents;
1037 load_LDT(d);
1038 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1039 if ( ents != 0 )
1040 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1042 break;
1045 case MMUEXT_SET_FOREIGNDOM:
1046 domid = (domid_t)(val >> 16);
1048 if ( (e = percpu_info[cpu].foreign) != NULL )
1049 put_domain(e);
1050 percpu_info[cpu].foreign = NULL;
1052 if ( !IS_PRIV(d) )
1054 switch ( domid )
1056 case DOMID_IO:
1057 get_knownalive_domain(dom_io);
1058 percpu_info[cpu].foreign = dom_io;
1059 break;
1060 default:
1061 MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
1062 okay = 0;
1063 break;
1066 else
1068 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1069 if ( e == NULL )
1071 switch ( domid )
1073 case DOMID_XEN:
1074 get_knownalive_domain(dom_xen);
1075 percpu_info[cpu].foreign = dom_xen;
1076 break;
1077 case DOMID_IO:
1078 get_knownalive_domain(dom_io);
1079 percpu_info[cpu].foreign = dom_io;
1080 break;
1081 default:
1082 MEM_LOG("Unknown domain '%u'", domid);
1083 okay = 0;
1084 break;
1088 break;
1090 case MMUEXT_TRANSFER_PAGE:
1091 domid = (domid_t)(val >> 16);
1092 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
1094 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
1095 unlikely(!pfn_is_ram(pfn)) ||
1096 unlikely((e = find_domain_by_id(domid)) == NULL) )
1098 MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
1099 okay = 0;
1100 break;
1103 spin_lock(&d->page_alloc_lock);
1105 /*
1106 * The tricky bit: atomically release ownership while there is just one
1107 * benign reference to the page (PGC_allocated). If that reference
1108 * disappears then the deallocation routine will safely spin.
1109 */
1110 nd = page->u.inuse.domain;
1111 y = page->count_info;
1112 do {
1113 x = y;
1114 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1115 (1|PGC_allocated)) ||
1116 unlikely(nd != d) )
1118 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1119 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1120 d, d->id, nd, x, page->u.inuse.type_info);
1121 spin_unlock(&d->page_alloc_lock);
1122 put_domain(e);
1123 return 0;
1125 __asm__ __volatile__(
1126 LOCK_PREFIX "cmpxchg8b %2"
1127 : "=d" (nd), "=a" (y),
1128 "=m" (*(volatile u64 *)(&page->count_info))
1129 : "0" (d), "1" (x), "c" (NULL), "b" (x) );
1131 while ( unlikely(nd != d) || unlikely(y != x) );
1133 /*
1134 * Unlink from 'd'. At least one reference remains (now anonymous), so
1135 * no one else is spinning to try to delete this page from 'd'.
1136 */
1137 d->tot_pages--;
1138 list_del(&page->list);
1140 spin_unlock(&d->page_alloc_lock);
1142 spin_lock(&e->page_alloc_lock);
1144 /*
1145 * Check that 'e' will accept the page and has reservation headroom.
1146 * Also, a domain mustn't have PGC_allocated pages when it is dying.
1147 */
1148 ASSERT(e->tot_pages <= e->max_pages);
1149 if ( unlikely(test_bit(DF_DYING, &e->flags)) ||
1150 unlikely(e->tot_pages == e->max_pages) ||
1151 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
1153 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1154 "provided a bad grant ref, or is dying (%08lx).\n",
1155 e->tot_pages, e->max_pages, e->flags);
1156 spin_unlock(&e->page_alloc_lock);
1157 put_domain(e);
1158 okay = 0;
1159 break;
1162 /* Okay, add the page to 'e'. */
1163 if ( unlikely(e->tot_pages++ == 0) )
1164 get_knownalive_domain(e);
1165 list_add_tail(&page->list, &e->page_list);
1166 page->u.inuse.domain = e;
1168 spin_unlock(&e->page_alloc_lock);
1170 /* Transfer is all done: tell the guest about its new page frame. */
1171 gnttab_notify_transfer(e, gntref, pfn);
1173 put_domain(e);
1174 break;
1176 case MMUEXT_REASSIGN_PAGE:
1177 if ( unlikely(!IS_PRIV(d)) )
1179 MEM_LOG("Dom %u has no reassignment priv", d->id);
1180 okay = 0;
1181 break;
1184 e = percpu_info[cpu].foreign;
1185 if ( unlikely(e == NULL) )
1187 MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
1188 okay = 0;
1189 break;
1192 /*
1193 * Grab both page_list locks, in order. This prevents the page from
1194 * disappearing elsewhere while we modify the owner, and we'll need
1195 * both locks if we're successful so that we can change lists.
1196 */
1197 if ( d < e )
1199 spin_lock(&d->page_alloc_lock);
1200 spin_lock(&e->page_alloc_lock);
1202 else
1204 spin_lock(&e->page_alloc_lock);
1205 spin_lock(&d->page_alloc_lock);
1208 /* A domain shouldn't have PGC_allocated pages when it is dying. */
1209 if ( unlikely(test_bit(DF_DYING, &e->flags)) ||
1210 unlikely(IS_XEN_HEAP_FRAME(page)) )
1212 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
1213 okay = 0;
1214 goto reassign_fail;
1217 /*
1218 * The tricky bit: atomically change owner while there is just one
1219 * benign reference to the page (PGC_allocated). If that reference
1220 * disappears then the deallocation routine will safely spin.
1221 */
1222 nd = page->u.inuse.domain;
1223 y = page->count_info;
1224 do {
1225 x = y;
1226 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1227 (1|PGC_allocated)) ||
1228 unlikely(nd != d) )
1230 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1231 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1232 d, d->id, nd, x, page->u.inuse.type_info);
1233 okay = 0;
1234 goto reassign_fail;
1236 __asm__ __volatile__(
1237 LOCK_PREFIX "cmpxchg8b %3"
1238 : "=d" (nd), "=a" (y), "=c" (e),
1239 "=m" (*(volatile u64 *)(&page->count_info))
1240 : "0" (d), "1" (x), "c" (e), "b" (x) );
1242 while ( unlikely(nd != d) || unlikely(y != x) );
1244 /*
1245 * Unlink from 'd'. We transferred at least one reference to 'e', so
1246 * no one else is spinning to try to delete this page from 'd'.
1247 */
1248 d->tot_pages--;
1249 list_del(&page->list);
1251 /*
1252 * Add the page to 'e'. Someone may already have removed the last
1253 * reference and want to remove the page from 'e'. However, we have
1254 * the lock so they'll spin waiting for us.
1255 */
1256 if ( unlikely(e->tot_pages++ == 0) )
1257 get_knownalive_domain(e);
1258 list_add_tail(&page->list, &e->page_list);
1260 reassign_fail:
1261 spin_unlock(&d->page_alloc_lock);
1262 spin_unlock(&e->page_alloc_lock);
1263 break;
1265 case MMUEXT_CLEAR_FOREIGNDOM:
1266 if ( (e = percpu_info[cpu].foreign) != NULL )
1267 put_domain(e);
1268 percpu_info[cpu].foreign = NULL;
1269 break;
1271 default:
1272 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
1273 okay = 0;
1274 break;
1277 return okay;
1278 }
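/*
 * Illustrative sketch (not part of the original memory.c): encoding an
 * extended command for the handler above. The target frame sits in the
 * upper bits of 'ptr' (with the low bits selecting MMU_EXTENDED_COMMAND),
 * and the MMUEXT_* sub-command sits in the low bits of 'val', matching
 * the 'val & MMUEXT_CMD_MASK' decode in do_extended_command(). The
 * HYPERVISOR_mmu_update() wrapper name is an assumed guest-side stub.
 */
#if 0 /* example only -- never compiled as part of this file */
static int example_pin_l2_table(unsigned long l2_mfn)
{
    mmu_update_t req;
    int done;

    req.ptr = (l2_mfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
    req.val = MMUEXT_PIN_L2_TABLE;

    return HYPERVISOR_mmu_update(&req, 1, &done);
}
#endif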
1281 int do_mmu_update(mmu_update_t *ureqs, int count, int *success_count)
1283 mmu_update_t req;
1284 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
1285 struct pfn_info *page;
1286 int rc = 0, okay = 1, i, cpu = smp_processor_id();
1287 unsigned int cmd;
1288 unsigned long prev_spfn = 0;
1289 l1_pgentry_t *prev_spl1e = 0;
1290 struct domain *d = current;
1291 u32 type_info;
1293 perfc_incrc(calls_to_mmu_update);
1294 perfc_addc(num_page_updates, count);
1296 cleanup_writable_pagetable(d, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
1298 if ( unlikely(!access_ok(VERIFY_READ, ureqs, count * sizeof(req))) )
1299 return -EFAULT;
1301 for ( i = 0; i < count; i++ )
1303 hypercall_may_preempt(
1304 __HYPERVISOR_mmu_update, 3, ureqs, count-i, success_count);
1306 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
1308 MEM_LOG("Bad __copy_from_user");
1309 rc = -EFAULT;
1310 break;
1313 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
1314 pfn = req.ptr >> PAGE_SHIFT;
1316 okay = 0;
1318 switch ( cmd )
1320 /*
1321 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
1322 */
1323 case MMU_NORMAL_PT_UPDATE:
1324 if ( unlikely(!get_page_from_pagenr(pfn, current)) )
1326 MEM_LOG("Could not get page for normal update");
1327 break;
1330 if ( likely(prev_pfn == pfn) )
1332 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
1334 else
1336 if ( prev_pfn != 0 )
1337 unmap_domain_mem((void *)va);
1338 va = (unsigned long)map_domain_mem(req.ptr);
1339 prev_pfn = pfn;
1342 page = &frame_table[pfn];
1343 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
1345 case PGT_l1_page_table:
1346 if ( likely(get_page_type(
1347 page, type_info & (PGT_type_mask|PGT_va_mask))) )
1349 okay = mod_l1_entry((l1_pgentry_t *)va,
1350 mk_l1_pgentry(req.val));
1352 if ( unlikely(d->mm.shadow_mode) && okay &&
1353 (get_shadow_status(&d->mm, page-frame_table) &
1354 PSH_shadowed) )
1356 shadow_l1_normal_pt_update(
1357 req.ptr, req.val, &prev_spfn, &prev_spl1e);
1358 put_shadow_status(&d->mm);
1361 put_page_type(page);
1363 break;
1364 case PGT_l2_page_table:
1365 if ( likely(get_page_type(page, PGT_l2_page_table)) )
1367 okay = mod_l2_entry((l2_pgentry_t *)va,
1368 mk_l2_pgentry(req.val),
1369 pfn);
1371 if ( unlikely(d->mm.shadow_mode) && okay &&
1372 (get_shadow_status(&d->mm, page-frame_table) &
1373 PSH_shadowed) )
1375 shadow_l2_normal_pt_update(req.ptr, req.val);
1376 put_shadow_status(&d->mm);
1379 put_page_type(page);
1381 break;
1382 default:
1383 if ( likely(get_page_type(page, PGT_writable_page)) )
1385 *(unsigned long *)va = req.val;
1386 okay = 1;
1387 put_page_type(page);
1389 break;
1392 put_page(page);
1393 break;
1395 case MMU_MACHPHYS_UPDATE:
1396 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
1398 MEM_LOG("Could not get page for mach->phys update");
1399 break;
1402 machine_to_phys_mapping[pfn] = req.val;
1403 okay = 1;
1405 /*
1406 * If in log-dirty mode, mark the corresponding pseudo-physical
1407 * page as dirty.
1408 */
1409 if ( unlikely(d->mm.shadow_mode == SHM_logdirty) &&
1410 mark_dirty(&d->mm, pfn) )
1411 d->mm.shadow_dirty_block_count++;
1413 put_page(&frame_table[pfn]);
1414 break;
1416 /*
1417 * MMU_EXTENDED_COMMAND: Extended command is specified
1418 * in the least-significant bits of the 'value' field.
1419 */
1420 case MMU_EXTENDED_COMMAND:
1421 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
1422 okay = do_extended_command(req.ptr, req.val);
1423 break;
1425 default:
1426 MEM_LOG("Invalid page update command %08lx", req.ptr);
1427 break;
1430 if ( unlikely(!okay) )
1432 rc = -EINVAL;
1433 break;
1436 ureqs++;
1439 if ( prev_pfn != 0 )
1440 unmap_domain_mem((void *)va);
1442 if ( unlikely(prev_spl1e != 0) )
1443 unmap_domain_mem((void *)prev_spl1e);
1445 deferred_ops = percpu_info[cpu].deferred_ops;
1446 percpu_info[cpu].deferred_ops = 0;
1448 if ( deferred_ops & DOP_FLUSH_TLB )
1449 local_flush_tlb();
1451 if ( deferred_ops & DOP_RELOAD_LDT )
1452 (void)map_ldt_shadow_page(0);
1454 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1456 put_domain(percpu_info[cpu].foreign);
1457 percpu_info[cpu].foreign = NULL;
1460 if ( unlikely(success_count != NULL) )
1461 put_user(i, success_count);
1463 return rc;
1467 int do_update_va_mapping(unsigned long page_nr,
1468 unsigned long val,
1469 unsigned long flags)
1471 struct domain *d = current;
1472 int err = 0;
1473 unsigned int cpu = d->processor;
1474 unsigned long deferred_ops;
1476 perfc_incrc(calls_to_update_va);
1478 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
1479 return -EINVAL;
1481 cleanup_writable_pagetable(d, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
1483 /*
1484 * XXX When we make this support 4MB superpages we should also deal with
1485 * the case of updating L2 entries.
1486 */
1488 if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
1489 mk_l1_pgentry(val))) )
1490 err = -EINVAL;
1492 if ( unlikely(d->mm.shadow_mode) )
1494 unsigned long sval;
1496 l1pte_propagate_from_guest(&d->mm, &val, &sval);
1498 if ( unlikely(__put_user(sval, ((unsigned long *)(
1499 &shadow_linear_pg_table[page_nr])))) )
1501 /*
1502 * Since L2s are guaranteed RW, failure indicates the page was not
1503 * shadowed, so ignore.
1504 */
1505 perfc_incrc(shadow_update_va_fail);
1508 /*
1509 * If we're in log-dirty mode then we need to note that we've updated
1510 * the PTE in the PT-holding page. We need the machine frame number
1511 * for this.
1512 */
1513 if ( d->mm.shadow_mode == SHM_logdirty )
1514 mark_dirty(&current->mm, va_to_l1mfn(page_nr << PAGE_SHIFT));
1516 check_pagetable(&d->mm, d->mm.pagetable, "va"); /* debug */
1519 deferred_ops = percpu_info[cpu].deferred_ops;
1520 percpu_info[cpu].deferred_ops = 0;
1522 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
1523 unlikely(flags & UVMF_FLUSH_TLB) )
1524 local_flush_tlb();
1525 else if ( unlikely(flags & UVMF_INVLPG) )
1526 __flush_tlb_one(page_nr << PAGE_SHIFT);
1528 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
1529 (void)map_ldt_shadow_page(0);
1531 return err;
1532 }
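/*
 * Illustrative sketch (not part of the original memory.c): a guest
 * updating a single PTE in its current address space via this hypercall.
 * Note the first argument is the virtual page number (va >> PAGE_SHIFT),
 * not the raw address. HYPERVISOR_update_va_mapping() is an assumed
 * guest-side stub taking the same arguments as do_update_va_mapping().
 */
#if 0 /* example only -- never compiled as part of this file */
static int example_remap_va(unsigned long va, unsigned long new_pte_val)
{
    /* Install the new PTE and flush just this VA from the TLB. */
    return HYPERVISOR_update_va_mapping(va >> PAGE_SHIFT,
                                        new_pte_val, UVMF_INVLPG);
}
#endif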
1534 int do_update_va_mapping_otherdomain(unsigned long page_nr,
1535 unsigned long val,
1536 unsigned long flags,
1537 domid_t domid)
1539 unsigned int cpu = smp_processor_id();
1540 struct domain *d;
1541 int rc;
1543 if ( unlikely(!IS_PRIV(current)) )
1544 return -EPERM;
1546 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
1547 if ( unlikely(d == NULL) )
1549 MEM_LOG("Unknown domain '%u'", domid);
1550 return -ESRCH;
1553 rc = do_update_va_mapping(page_nr, val, flags);
1555 put_domain(d);
1556 percpu_info[cpu].foreign = NULL;
1558 return rc;
1563 /*************************
1564 * Writable Pagetables
1565 */
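/*
 * Overview of the mechanism implemented below: a guest write to one of
 * its own (read-only) L1 page-table pages faults into
 * ptwr_do_page_fault(), which disconnects the L1 from the current
 * address space if it is ACTIVE, snapshots its contents, and temporarily
 * gives the guest a writable mapping. ptwr_flush() later write-protects
 * the page again, compares it against the snapshot, revalidates any
 * modified PTEs, and reconnects it. At most one ACTIVE and one INACTIVE
 * page per CPU may be detached in this way at any time.
 */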
1567 ptwr_info_t ptwr_info[NR_CPUS];
1569 #ifdef VERBOSE
1570 int ptwr_debug = 0x0;
1571 #define PTWR_PRINTK(_f, _a...) \
1572 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
1573 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
1574 #else
1575 #define PTWR_PRINTK(_f, _a...) ((void)0)
1576 #endif
1578 /* Flush the given writable p.t. page and write-protect it again. */
1579 void ptwr_flush(const int which)
1581 unsigned long sstat, spte, pte, *ptep, l1va;
1582 l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e;
1583 l2_pgentry_t *pl2e;
1584 int i, cpu = smp_processor_id();
1585 struct domain *d = current;
1587 l1va = ptwr_info[cpu].ptinfo[which].l1va;
1588 ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
1590 /*
1591 * STEP 1. Write-protect the p.t. page so no more updates can occur.
1592 */
1594 if ( unlikely(__get_user(pte, ptep)) )
1596 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1597 /*
1598 * Really a bug. We could read this PTE during the initial fault,
1599 * and pagetables can't have changed in the meantime. XXX Multi-CPU guests?
1600 */
1601 BUG();
1603 PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
1604 PTWR_PRINT_WHICH, ptep, pte);
1605 pte &= ~_PAGE_RW;
1607 if ( unlikely(d->mm.shadow_mode) )
1609 /* Write-protect the p.t. page in the shadow page table. */
1610 l1pte_propagate_from_guest(&d->mm, &pte, &spte);
1611 __put_user(
1612 spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
1614 /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
1615 sstat = get_shadow_status(&d->mm, pte >> PAGE_SHIFT);
1616 if ( sstat & PSH_shadowed )
1617 sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
1620 /* Write-protect the p.t. page in the guest page table. */
1621 if ( unlikely(__put_user(pte, ptep)) )
1623 MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
1624 /*
1625 * Really a bug. We could write this PTE during the initial fault,
1626 * and pagetables can't have changed in the meantime. XXX Multi-CPU guests?
1627 */
1628 BUG();
1631 /* Ensure that there are no stale writable mappings in any TLB. */
1632 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
1633 __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
1634 PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
1635 PTWR_PRINT_WHICH, ptep, pte);
1637 /*
1638 * STEP 2. Validate any modified PTEs.
1639 */
1641 pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
1642 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1644 ol1e = ptwr_info[cpu].ptinfo[which].page[i];
1645 nl1e = pl1e[i];
1647 if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
1648 continue;
1650 /*
1651 * Fast path for PTEs that have merely been write-protected
1652 * (e.g., during a Unix fork()). A strict reduction in privilege.
1653 */
1654 if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
1656 if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
1658 if ( unlikely(sl1e != NULL) )
1659 l1pte_propagate_from_guest(
1660 &d->mm, &l1_pgentry_val(nl1e),
1661 &l1_pgentry_val(sl1e[i]));
1662 put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
1664 continue;
1667 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
1669 MEM_LOG("ptwr: Could not re-validate l1 page\n");
1670 /*
1671 * Make the remaining p.t's consistent before crashing, so the
1672 * reference counts are correct.
1673 */
1674 memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
1675 (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
1676 unmap_domain_mem(pl1e);
1677 ptwr_info[cpu].ptinfo[which].l1va = 0;
1678 domain_crash();
1681 if ( unlikely(sl1e != NULL) )
1682 l1pte_propagate_from_guest(
1683 &d->mm, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
1685 if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
1686 put_page_from_l1e(ol1e, d);
1688 unmap_domain_mem(pl1e);
1690 /*
1691 * STEP 3. Reattach the L1 p.t. page into the current address space.
1692 */
1694 if ( (which == PTWR_PT_ACTIVE) && likely(!d->mm.shadow_mode) )
1696 pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
1697 *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
1700 /*
1701 * STEP 4. Final tidy-up.
1702 */
1704 ptwr_info[cpu].ptinfo[which].l1va = 0;
1706 if ( unlikely(sl1e != NULL) )
1708 unmap_domain_mem(sl1e);
1709 put_shadow_status(&d->mm);
1713 /* Write page fault handler: check if guest is trying to modify a PTE. */
1714 int ptwr_do_page_fault(unsigned long addr)
1716 unsigned long pte, pfn, l2e;
1717 struct pfn_info *page;
1718 l2_pgentry_t *pl2e;
1719 int which, cpu = smp_processor_id();
1720 u32 l2_idx;
1722 /*
1723 * Attempt to read the PTE that maps the VA being accessed. By checking for
1724 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
1725 */
1726 if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
1727 _PAGE_PRESENT) ||
1728 __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
1729 return 0;
1731 pfn = pte >> PAGE_SHIFT;
1732 page = &frame_table[pfn];
1734 /* We are looking only for read-only mappings of p.t. pages. */
1735 if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
1736 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
1737 return 0;
1739 /* Get the L2 index at which this L1 p.t. is always mapped. */
1740 l2_idx = page->u.inuse.type_info & PGT_va_mask;
1741 if ( unlikely(l2_idx >= PGT_va_unknown) )
1742 domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
1743 l2_idx >>= PGT_va_shift;
1745 if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
1747 MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
1748 domain_crash();
1751 /*
1752 * Is the L1 p.t. mapped into the current address space? If so we call it
1753 * an ACTIVE p.t., otherwise it is INACTIVE.
1754 */
1755 pl2e = &linear_l2_table[l2_idx];
1756 l2e = l2_pgentry_val(*pl2e);
1757 which = PTWR_PT_INACTIVE;
1758 if ( (l2e >> PAGE_SHIFT) == pfn )
1760 /*
1761 * If the PRESENT bit is clear, we may be conflicting with the current
1762 * ACTIVE p.t. (it may be the same p.t. mapped at another virt addr).
1763 */
1764 if ( unlikely(!(l2e & _PAGE_PRESENT)) &&
1765 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va )
1766 ptwr_flush(PTWR_PT_ACTIVE);
1768 /* Now do a final check of the PRESENT bit to set ACTIVE. */
1769 if ( likely(l2e & _PAGE_PRESENT) )
1770 which = PTWR_PT_ACTIVE;
1773 PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
1774 "pfn %08lx\n", PTWR_PRINT_WHICH,
1775 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
1777 /*
1778 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
1779 * time. If there is already one, we must flush it out.
1780 */
1781 if ( ptwr_info[cpu].ptinfo[which].l1va )
1782 ptwr_flush(which);
1784 ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
1785 ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
1787 /* For safety, disconnect the L1 p.t. page from current space. */
1788 if ( (which == PTWR_PT_ACTIVE) && likely(!current->mm.shadow_mode) )
1790 *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
1791 flush_tlb(); /* XXX Multi-CPU guests? */
1794 /* Temporarily map the L1 page, and make a copy of it. */
1795 ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
1796 memcpy(ptwr_info[cpu].ptinfo[which].page,
1797 ptwr_info[cpu].ptinfo[which].pl1e,
1798 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
1800 /* Finally, make the p.t. page writable by the guest OS. */
1801 pte |= _PAGE_RW;
1802 PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
1803 &linear_pg_table[addr>>PAGE_SHIFT], pte);
1804 if ( unlikely(__put_user(pte, (unsigned long *)
1805 &linear_pg_table[addr>>PAGE_SHIFT])) )
1807 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
1808 &linear_pg_table[addr>>PAGE_SHIFT]);
1809 /* Toss the writable pagetable state and crash. */
1810 unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
1811 ptwr_info[cpu].ptinfo[which].l1va = 0;
1812 domain_crash();
1815 return EXCRET_fault_fixed;
1818 static __init int ptwr_init(void)
1820 int i;
1822 for ( i = 0; i < smp_num_cpus; i++ )
1824 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
1825 (void *)alloc_xenheap_page();
1826 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
1827 (void *)alloc_xenheap_page();
1830 return 0;
1832 __initcall(ptwr_init);
1837 /************************************************************************/
1838 /************************************************************************/
1839 /************************************************************************/
1841 #ifndef NDEBUG
1843 void ptwr_status(void)
1845 unsigned long pte, *ptep, pfn;
1846 struct pfn_info *page;
1847 int cpu = smp_processor_id();
1849 ptep = (unsigned long *)&linear_pg_table
1850 [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
1852 if ( __get_user(pte, ptep) ) {
1853 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1854 domain_crash();
1857 pfn = pte >> PAGE_SHIFT;
1858 page = &frame_table[pfn];
1859 printk("need to alloc l1 page %p\n", page);
1860 /* make pt page writable */
1861 printk("need to make read-only l1-page at %p is %08lx\n",
1862 ptep, pte);
1864 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
1865 return;
1867 if ( __get_user(pte, (unsigned long *)
1868 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
1869 MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
1870 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
1871 domain_crash();
1873 pfn = pte >> PAGE_SHIFT;
1874 page = &frame_table[pfn];
1877 void audit_domain(struct domain *d)
1879 int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
1881 void adjust (struct pfn_info *page, int dir, int adjtype)
1883 int count = page->count_info & PGC_count_mask;
1885 if ( adjtype )
1887 int tcount = page->u.inuse.type_info & PGT_count_mask;
1889 ttot++;
1891 tcount += dir;
1893 if ( tcount < 0 )
1895 /* This will only come out once. */
1896 printk("Audit %d: type count whent below zero pfn=%x "
1897 "taf=%x otaf=%x\n",
1898 d->id, page-frame_table,
1899 page->u.inuse.type_info,
1900 page->tlbflush_timestamp);
1903 page->u.inuse.type_info =
1904 (page->u.inuse.type_info & ~PGT_count_mask) |
1905 (tcount & PGT_count_mask);
1908 ctot++;
1909 count += dir;
1910 if ( count < 0 )
1912 /* This will only come out once. */
1913 printk("Audit %d: general count whent below zero pfn=%x "
1914 "taf=%x otaf=%x\n",
1915 d->id, page-frame_table,
1916 page->u.inuse.type_info,
1917 page->tlbflush_timestamp);
1920 page->count_info =
1921 (page->count_info & ~PGC_count_mask) |
1922 (count & PGC_count_mask);
1926 void scan_for_pfn(struct domain *d, unsigned long xpfn)
1928 unsigned long pfn, *pt;
1929 struct list_head *list_ent;
1930 struct pfn_info *page;
1931 int i;
1933 list_ent = d->page_list.next;
1934 for ( i = 0; (list_ent != &d->page_list); i++ )
1936 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
1937 page = &frame_table[pfn];
1939 switch ( page->u.inuse.type_info & PGT_type_mask )
1941 case PGT_l1_page_table:
1942 case PGT_l2_page_table:
1943 pt = map_domain_mem(pfn<<PAGE_SHIFT);
1944 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1945 if ( (pt[i] & _PAGE_PRESENT) &&
1946 ((pt[i] >> PAGE_SHIFT) == xpfn) )
1947 printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
1948 d->id, i, pfn, page->u.inuse.type_info,
1949 page->count_info);
1950 unmap_domain_mem(pt);
1953 list_ent = frame_table[pfn].list.next;
1958 void scan_for_pfn_remote(unsigned long xpfn)
1960 struct domain *e;
1961 for_each_domain ( e )
1962 scan_for_pfn( e, xpfn );
1965 int i;
1966 unsigned long pfn;
1967 struct list_head *list_ent;
1968 struct pfn_info *page;
1970 if ( d != current )
1971 domain_pause(d);
1972 synchronise_pagetables(~0UL);
1974 printk("pt base=%lx sh_info=%x\n",
1975 pagetable_val(d->mm.pagetable)>>PAGE_SHIFT,
1976 virt_to_page(d->shared_info)-frame_table);
1978 spin_lock(&d->page_alloc_lock);
1980 /* PHASE 0 */
1982 list_ent = d->page_list.next;
1983 for ( i = 0; (list_ent != &d->page_list); i++ )
1985 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
1986 page = &frame_table[pfn];
1988 if ( page->u.inuse.domain != d )
1989 BUG();
1991 if ( (page->u.inuse.type_info & PGT_count_mask) >
1992 (page->count_info & PGC_count_mask) )
1993 printk("taf > caf %x %x pfn=%lx\n",
1994 page->u.inuse.type_info, page->count_info, pfn );
1996 #if 0 /* SYSV shared memory pages plus writeable files. */
1997 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
1998 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2000 printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
2001 pfn,
2002 page->u.inuse.type_info,
2003 page->count_info );
2004 scan_for_pfn_remote(pfn);
2006 #endif
2007 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
2008 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2010 printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
2011 pfn,
2012 page->u.inuse.type_info,
2013 page->count_info );
2016 /* Use tlbflush_timestamp to store original type_info. */
2017 page->tlbflush_timestamp = page->u.inuse.type_info;
2019 list_ent = frame_table[pfn].list.next;
2023 /* PHASE 1 */
2025 adjust(&frame_table[pagetable_val(d->mm.pagetable)>>PAGE_SHIFT], -1, 1);
2027 list_ent = d->page_list.next;
2028 for ( i = 0; (list_ent != &d->page_list); i++ )
2030 unsigned long *pt;
2031 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2032 page = &frame_table[pfn];
2034 if ( page->u.inuse.domain != d )
2035 BUG();
2037 switch ( page->u.inuse.type_info & PGT_type_mask )
2039 case PGT_l2_page_table:
2041 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2042 printk("Audit %d: L2 not validated %x\n",
2043 d->id, page->u.inuse.type_info);
2045 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2046 printk("Audit %d: L2 not pinned %x\n",
2047 d->id, page->u.inuse.type_info);
2048 else
2049 adjust( page, -1, 1 );
2051 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2053 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2055 if ( pt[i] & _PAGE_PRESENT )
2057 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2058 struct pfn_info *l1page = &frame_table[l1pfn];
2060 if ( l1page->u.inuse.domain != d )
2062 printk("L2: Skip bizarre page belonging to other "
2063 "dom %p\n", l1page->u.inuse.domain);
2064 continue;
2067 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2068 PGT_l2_page_table )
2069 printk("Audit %d: [%x] Found %s Linear PT "
2070 "t=%x pfn=%lx\n", d->id, i,
2071 (l1pfn==pfn) ? "Self" : "Other",
2072 l1page->u.inuse.type_info,
2073 l1pfn);
2074 else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
2075 PGT_l1_page_table )
2076 printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
2077 d->id, i,
2078 l1page->u.inuse.type_info,
2079 l1pfn);
2081 adjust(l1page, -1, 1);
2085 unmap_domain_mem(pt);
2087 break;
2090 case PGT_l1_page_table:
2092 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2093 adjust( page, -1, 1 );
2095 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2096 printk("Audit %d: L1 not validated %x\n",
2097 d->id, page->u.inuse.type_info);
2098 #if 0
2099 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2100 printk("Audit %d: L1 not pinned %x\n",
2101 d->id, page->u.inuse.type_info);
2102 #endif
2103 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2105 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2107 if ( pt[i] & _PAGE_PRESENT )
2109 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2110 struct pfn_info *l1page = &frame_table[l1pfn];
2112 if ( l1pfn < 0x100 )
2114 lowmem_mappings++;
2115 continue;
2118 if ( l1pfn > max_page )
2120 io_mappings++;
2121 continue;
2124 if ( pt[i] & _PAGE_RW )
2127 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2128 PGT_l1_page_table ||
2129 (l1page->u.inuse.type_info & PGT_type_mask) ==
2130 PGT_l2_page_table )
2131 printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n",
2132 d->id, i,
2133 l1page->u.inuse.type_info,
2134 l1pfn);
2138 if ( l1page->u.inuse.domain != d )
2140 printk("Audit %d: [%lx,%x] Skip foreign page dom=%lx "
2141 "pfn=%lx c=%08x t=%08x m2p=%lx\n",
2142 d->id, pfn, i,
2143 (unsigned long)l1page->u.inuse.domain,
2144 l1pfn,
2145 l1page->count_info,
2146 l1page->u.inuse.type_info,
2147 machine_to_phys_mapping[l1pfn]);
2148 continue;
2151 adjust(l1page, -1, 0);
2155 unmap_domain_mem(pt);
2157 break;
2160 list_ent = frame_table[pfn].list.next;
2163 if ( (io_mappings > 0) || (lowmem_mappings > 0) )
2164 printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
2165 d->id, lowmem_mappings, io_mappings);
2167 /* PHASE 2 */
2169 ctot = ttot = 0;
2170 list_ent = d->page_list.next;
2171 for ( i = 0; (list_ent != &d->page_list); i++ )
2173 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2174 page = &frame_table[pfn];
2176 switch ( page->u.inuse.type_info & PGT_type_mask)
2178 case PGT_l1_page_table:
2179 case PGT_l2_page_table:
2180 if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
2182 printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
2183 d->id, page->u.inuse.type_info,
2184 page->tlbflush_timestamp,
2185 page->count_info, pfn );
2186 scan_for_pfn_remote(pfn);
2188 default:
2189 if ( (page->count_info & PGC_count_mask) != 1 )
2191 printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
2192 d->id,
2193 page->count_info,
2194 page->u.inuse.type_info,
2195 page->tlbflush_timestamp, pfn );
2196 scan_for_pfn_remote(pfn);
2198 break;
2201 list_ent = frame_table[pfn].list.next;
2204 /* PHASE 3 */
2206 list_ent = d->page_list.next;
2207 for ( i = 0; (list_ent != &d->page_list); i++ )
2209 unsigned long *pt;
2210 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2211 page = &frame_table[pfn];
2213 switch ( page->u.inuse.type_info & PGT_type_mask )
2215 case PGT_l2_page_table:
2216 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2217 adjust( page, 1, 1 );
2219 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2221 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2223 if ( pt[i] & _PAGE_PRESENT )
2225 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2226 struct pfn_info *l1page = &frame_table[l1pfn];
2228 if ( l1page->u.inuse.domain == d)
2229 adjust(l1page, 1, 1);
2233 unmap_domain_mem(pt);
2234 break;
2236 case PGT_l1_page_table:
2237 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2238 adjust( page, 1, 1 );
2240 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2242 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2244 if ( pt[i] & _PAGE_PRESENT )
2246 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2247 struct pfn_info *l1page = &frame_table[l1pfn];
2249 if ( (l1page->u.inuse.domain != d) ||
2250 (l1pfn < 0x100) || (l1pfn > max_page) )
2251 continue;
2253 adjust(l1page, 1, 0);
2257 unmap_domain_mem(pt);
2258 break;
2262 page->tlbflush_timestamp = 0;
2264 list_ent = frame_table[pfn].list.next;
2267 spin_unlock(&d->page_alloc_lock);
2269 adjust(&frame_table[pagetable_val(d->mm.pagetable)>>PAGE_SHIFT], 1, 1);
2271 printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );
2273 if ( d != current )
2274 domain_unpause(d);
2277 void audit_domains(void)
2279 struct domain *d;
2280 for_each_domain ( d )
2281 audit_domain(d);
2284 void audit_domains_key(unsigned char key)
2286 audit_domains();
2289 #endif