
view xen/arch/x86/memory.c @ 3740:d3e70af90f15

bitkeeper revision 1.1159.212.115 (4207c574hv18R_VTm-3a9w_AZzNBWw)

Force hypercall continuation arguments to size of longs.
Signed-off-by: keir.fraser@cl.cam.ac.uk
author kaf24@scramble.cl.cam.ac.uk
date Mon Feb 07 19:45:56 2005 +0000 (2005-02-07)
parents d93748c50893
children b37e9180a101
line source
1 /* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
2 /******************************************************************************
3 * arch/x86/memory.c
4 *
5 * Copyright (c) 2002-2004 K A Fraser
6 * Copyright (c) 2004 Christian Limpach
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
23 /*
24 * A description of the x86 page table API:
25 *
26 * Domains trap to do_mmu_update with a list of update requests.
27 * This is a list of (ptr, val) pairs, where the requested operation
28 * is *ptr = val.
29 *
30 * Reference counting of pages:
31 * ----------------------------
32 * Each page has two refcounts: tot_count and type_count.
33 *
34 * TOT_COUNT is the obvious reference count. It counts all uses of a
35 * physical page frame by a domain, including uses as a page directory,
36 * a page table, or simple mappings via a PTE. This count prevents a
37 * domain from releasing a frame back to the free pool when it still holds
38 * a reference to it.
39 *
40 * TYPE_COUNT is more subtle. A frame can be put to one of three
41 * mutually-exclusive uses: it might be used as a page directory, or a
42 * page table, or it may be mapped writable by the domain [of course, a
43 * frame may also be used in none of these three ways!].
44 * So, type_count is a count of the number of times a frame is being
45 * referred to in its current incarnation. Therefore, a page can only
46 * change its type when its type count is zero.
47 *
48 * Pinning the page type:
49 * ----------------------
50 * The type of a page can be pinned/unpinned with the commands
51 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
52 * pinning is not reference counted, so it can't be nested).
53 * This is useful to prevent a page's type count falling to zero, at which
54 * point safety checks would need to be carried out next time the count
55 * is increased again.
56 *
57 * A further note on writable page mappings:
58 * -----------------------------------------
59 * For simplicity, the count of writable mappings for a page may not
60 * correspond to reality. The 'writable count' is incremented for every
61 * PTE which maps the page with the _PAGE_RW flag set. However, for
62 * write access to be possible the page directory entry must also have
63 * its _PAGE_RW bit set. We do not check this as it complicates the
64 * reference counting considerably [consider the case of multiple
65 * directory entries referencing a single page table, some with the RW
66 * bit set, others not -- it starts getting a bit messy].
67 * In normal use, this simplification shouldn't be a problem.
68 * However, the logic can be added if required.
69 *
70 * One more note on read-only page mappings:
71 * -----------------------------------------
72 * We want domains to be able to map pages for read-only access. The
73 * main reason is that page tables and directories should be readable
74 * by a domain, but it would not be safe for them to be writable.
75 * However, domains have free access to rings 1 & 2 of the Intel
76 * privilege model. In terms of page protection, these are considered
77 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
78 * read-only restrictions are respected in supervisor mode -- if the
79 * bit is clear then any mapped page is writable.
80 *
81 * We get round this by always setting the WP bit and disallowing
82 * updates to it. This is very unlikely to cause a problem for guest
83 * OSes, which will generally use the WP bit to simplify their copy-on-write
84 * implementation (in that case, the OS wants a fault when it writes to
85 * an application-supplied buffer).
86 */
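/*
 * Illustrative, non-normative sketch of the (ptr, val) interface described
 * above, as seen from a guest. The mmu_update_t layout and the MMU_ and
 * MMUEXT_ encodings come from Xen's public interface headers; the
 * HYPERVISOR_mmu_update() wrapper name is an assumption (it is supplied by a
 * guest OS port, not by this file). Error handling is omitted.
 */
#if 0 /* illustrative example -- not compiled as part of memory.c */
void example_update_and_pin(unsigned long pte_machine_addr,
                            unsigned long new_pte,
                            unsigned long l2_mfn)
{
    mmu_update_t req[2];

    /* Request 1: a normal page-table update, i.e. *ptr = val. */
    req[0].ptr = pte_machine_addr;   /* machine address of the PTE to write */
    req[0].val = new_pte;            /* new PTE contents                    */

    /* Request 2: pin the L2 table so its type count cannot fall to zero. */
    req[1].ptr = (l2_mfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
    req[1].val = MMUEXT_PIN_L2_TABLE;

    (void)HYPERVISOR_mmu_update(req, 2, NULL);
}
#endif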
88 #include <xen/config.h>
89 #include <xen/init.h>
90 #include <xen/kernel.h>
91 #include <xen/lib.h>
92 #include <xen/mm.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <asm/shadow.h>
99 #include <asm/page.h>
100 #include <asm/flushtlb.h>
101 #include <asm/io.h>
102 #include <asm/uaccess.h>
103 #include <asm/domain_page.h>
104 #include <asm/ldt.h>
106 #ifdef VERBOSE
107 #define MEM_LOG(_f, _a...) \
108 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
109 current->domain->id , __LINE__ , ## _a )
110 #else
111 #define MEM_LOG(_f, _a...) ((void)0)
112 #endif
114 static int alloc_l2_table(struct pfn_info *page);
115 static int alloc_l1_table(struct pfn_info *page);
116 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
117 static int get_page_and_type_from_pagenr(unsigned long page_nr,
118 u32 type,
119 struct domain *d);
121 static void free_l2_table(struct pfn_info *page);
122 static void free_l1_table(struct pfn_info *page);
124 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
125 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
127 /* Used to defer flushing of memory structures. */
128 static struct {
129 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
130 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
131 unsigned long deferred_ops;
132 /* If non-NULL, specifies a foreign subject domain for some operations. */
133 struct domain *foreign;
134 } __cacheline_aligned percpu_info[NR_CPUS];
136 /*
137 * Returns the current foreign domain; defaults to the currently-executing
138 * domain if a foreign override hasn't been specified.
139 */
140 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
142 /* Private domain structs for DOMID_XEN and DOMID_IO. */
143 static struct domain *dom_xen, *dom_io;
145 /* Frame table and its size in pages. */
146 struct pfn_info *frame_table;
147 unsigned long frame_table_size;
148 unsigned long max_page;
150 void __init init_frametable(void)
151 {
152 unsigned long i, p;
154 frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
155 frame_table_size = max_page * sizeof(struct pfn_info);
156 frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
158 for ( i = 0; i < frame_table_size; i += (4UL << 20) )
159 {
160 p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
161 if ( p == 0 )
162 panic("Not enough memory for frame table\n");
163 map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p,
164 4UL << 20, PAGE_HYPERVISOR);
165 }
167 memset(frame_table, 0, frame_table_size);
168 }
170 void arch_init_memory(void)
171 {
172 extern void subarch_init_memory(struct domain *);
174 memset(percpu_info, 0, sizeof(percpu_info));
176 /*
177 * Initialise our DOMID_XEN domain.
178 * Any Xen-heap pages that we will allow to be mapped will have
179 * their domain field set to dom_xen.
180 */
181 dom_xen = alloc_domain_struct();
182 atomic_set(&dom_xen->refcnt, 1);
183 dom_xen->id = DOMID_XEN;
185 /*
186 * Initialise our DOMID_IO domain.
187 * This domain owns no pages but is considered a special case when
188 * mapping I/O pages, as the mappings occur at the privilege level of the caller.
189 */
190 dom_io = alloc_domain_struct();
191 atomic_set(&dom_io->refcnt, 1);
192 dom_io->id = DOMID_IO;
194 subarch_init_memory(dom_xen);
195 }
197 void write_ptbase(struct exec_domain *ed)
198 {
199 struct domain *d = ed->domain;
200 unsigned long pa;
202 #ifdef CONFIG_VMX
203 if ( unlikely(d->arch.shadow_mode) )
204 pa = ((d->arch.shadow_mode == SHM_full_32) ?
205 pagetable_val(ed->arch.monitor_table) :
206 pagetable_val(ed->arch.shadow_table));
207 else
208 pa = pagetable_val(ed->arch.pagetable);
209 #else
210 if ( unlikely(d->arch.shadow_mode) )
211 pa = pagetable_val(ed->arch.shadow_table);
212 else
213 pa = pagetable_val(ed->arch.pagetable);
214 #endif
216 write_cr3(pa);
217 }
219 static void __invalidate_shadow_ldt(struct exec_domain *d)
220 {
221 int i;
222 unsigned long pfn;
223 struct pfn_info *page;
225 d->arch.shadow_ldt_mapcnt = 0;
227 for ( i = 16; i < 32; i++ )
228 {
229 pfn = l1_pgentry_to_pagenr(d->arch.perdomain_ptes[i]);
230 if ( pfn == 0 ) continue;
231 d->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
232 page = &frame_table[pfn];
233 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
234 ASSERT_PAGE_IS_DOMAIN(page, d->domain);
235 put_page_and_type(page);
236 }
238 /* Dispose of the (now possibly invalid) mappings from the TLB. */
239 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
240 }
243 static inline void invalidate_shadow_ldt(struct exec_domain *d)
244 {
245 if ( d->arch.shadow_ldt_mapcnt != 0 )
246 __invalidate_shadow_ldt(d);
247 }
250 static int alloc_segdesc_page(struct pfn_info *page)
251 {
252 unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
253 int i;
255 for ( i = 0; i < 512; i++ )
256 if ( unlikely(!check_descriptor(&descs[i*2])) )
257 goto fail;
259 unmap_domain_mem(descs);
260 return 1;
262 fail:
263 unmap_domain_mem(descs);
264 return 0;
265 }
268 /* Map shadow page at offset @off. */
269 int map_ldt_shadow_page(unsigned int off)
270 {
271 struct exec_domain *ed = current;
272 struct domain *d = ed->domain;
273 unsigned long l1e;
275 if ( unlikely(in_irq()) )
276 BUG();
278 __get_user(l1e, (unsigned long *)&linear_pg_table[(ed->arch.ldt_base >>
279 PAGE_SHIFT) + off]);
281 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
282 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
283 d, PGT_ldt_page)) )
284 return 0;
286 ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
287 ed->arch.shadow_ldt_mapcnt++;
289 return 1;
290 }
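/*
 * Illustrative, non-normative sketch: the shadow LDT slots above are filled
 * in lazily, on first access, after a guest has described its LDT with the
 * MMUEXT_SET_LDT extended command (handled later in do_extended_command()).
 * The hypercall wrapper name below is an assumption taken from a guest OS
 * port; 'ldt_base' must be page-aligned and 'ents' is the number of
 * descriptors (at most 8192).
 */
#if 0 /* illustrative example -- not compiled as part of memory.c */
void example_set_ldt(unsigned long ldt_base, unsigned long ents)
{
    mmu_update_t req;

    req.ptr = ldt_base | MMU_EXTENDED_COMMAND;  /* command in low bits of ptr */
    req.val = MMUEXT_SET_LDT | (ents << MMUEXT_CMD_SHIFT);

    (void)HYPERVISOR_mmu_update(&req, 1, NULL);
}
#endif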
293 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
294 {
295 struct pfn_info *page = &frame_table[page_nr];
297 if ( unlikely(!pfn_is_ram(page_nr)) )
298 {
299 MEM_LOG("Pfn %08lx is not RAM", page_nr);
300 return 0;
301 }
303 if ( unlikely(!get_page(page, d)) )
304 {
305 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
306 return 0;
307 }
309 return 1;
310 }
313 static int get_page_and_type_from_pagenr(unsigned long page_nr,
314 u32 type,
315 struct domain *d)
316 {
317 struct pfn_info *page = &frame_table[page_nr];
319 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
320 return 0;
322 if ( unlikely(!get_page_type(page, type)) )
323 {
324 #ifdef VERBOSE
325 if ( (type & PGT_type_mask) != PGT_l1_page_table )
326 MEM_LOG("Bad page type for pfn %08lx (%08x)",
327 page_nr, page->u.inuse.type_info);
328 #endif
329 put_page(page);
330 return 0;
331 }
333 return 1;
334 }
337 /*
338 * We allow L2 tables to map each other (a.k.a. linear page tables). This
339 * needs some special care with reference counts and access permissions:
340 * 1. The mapping entry must be read-only, or the guest may get write access
341 * to its own PTEs.
342 * 2. We must only bump the reference counts for an *already validated*
343 * L2 table, or we can end up in a deadlock in get_page_type(), waiting
344 * on a validation that cannot complete until the current validation does.
345 * 3. We only need to increment the reference counts for the mapped page
346 * frame if it is mapped by a different L2 table. This is sufficient and
347 * also necessary to allow validation of an L2 table mapping itself.
348 */
349 static int
350 get_linear_pagetable(
351 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
352 {
353 u32 x, y;
354 struct pfn_info *page;
356 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
357 {
358 MEM_LOG("Attempt to create linear p.t. with write perms");
359 return 0;
360 }
362 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
363 {
364 /* Make sure the mapped frame belongs to the correct domain. */
365 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
366 return 0;
368 /*
369 * Make sure that the mapped frame is an already-validated L2 table.
370 * If so, atomically increment the count (checking for overflow).
371 */
372 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
373 y = page->u.inuse.type_info;
374 do {
375 x = y;
376 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
377 unlikely((x & (PGT_type_mask|PGT_validated)) !=
378 (PGT_l2_page_table|PGT_validated)) )
379 {
380 put_page(page);
381 return 0;
382 }
383 }
384 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
385 }
387 return 1;
388 }
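/*
 * Illustrative, non-normative sketch of a guest request that satisfies the
 * linear-pagetable rules above: one of the guest's own L2 slots is made to
 * map an already-validated L2 frame (possibly the L2 itself), and the entry
 * must be read-only (rule 1). The hypercall wrapper name and the 'slot'
 * parameter are assumptions for the example.
 */
#if 0 /* illustrative example -- not compiled as part of memory.c */
void example_install_linear_pt(unsigned long l2_mfn, unsigned int slot)
{
    mmu_update_t req;

    /* Machine address of the L2 entry to be written. */
    req.ptr = (l2_mfn << PAGE_SHIFT) + (slot * sizeof(l2_pgentry_t));

    /* Map the L2 frame itself; _PAGE_RW must NOT be set (rule 1). */
    req.val = (l2_mfn << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_ACCESSED;

    (void)HYPERVISOR_mmu_update(&req, 1, NULL);
}
#endif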
391 static int
392 get_page_from_l1e(
393 l1_pgentry_t l1e, struct domain *d)
394 {
395 unsigned long l1v = l1_pgentry_val(l1e);
396 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
397 struct pfn_info *page = &frame_table[pfn];
398 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
400 if ( !(l1v & _PAGE_PRESENT) )
401 return 1;
403 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
404 {
405 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
406 return 0;
407 }
409 if ( unlikely(!pfn_is_ram(pfn)) )
410 {
411 /* Revert to caller privileges if FD == DOMID_IO. */
412 if ( d == dom_io )
413 d = current->domain;
415 if ( IS_PRIV(d) )
416 return 1;
418 if ( IS_CAPABLE_PHYSDEV(d) )
419 return domain_iomem_in_pfn(d, pfn);
421 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
422 return 0;
423 }
425 return ((l1v & _PAGE_RW) ?
426 get_page_and_type(page, d, PGT_writable_page) :
427 get_page(page, d));
428 }
431 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
432 static int
433 get_page_from_l2e(
434 l2_pgentry_t l2e, unsigned long pfn,
435 struct domain *d, unsigned long va_idx)
436 {
437 int rc;
439 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
440 return 1;
442 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
443 {
444 MEM_LOG("Bad L2 page type settings %04lx",
445 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
446 return 0;
447 }
449 rc = get_page_and_type_from_pagenr(
450 l2_pgentry_to_pagenr(l2e),
451 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
453 if ( unlikely(!rc) )
454 return get_linear_pagetable(l2e, pfn, d);
456 return 1;
457 }
460 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
461 {
462 unsigned long l1v = l1_pgentry_val(l1e);
463 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
464 struct pfn_info *page = &frame_table[pfn];
465 struct domain *e;
467 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
468 return;
470 e = page_get_owner(page);
471 if ( unlikely(e != d) )
472 {
473 /*
474 * Unmap a foreign page that may have been mapped via a grant table.
475 * Note that this can fail for a privileged domain that can map foreign
476 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
477 * counted via a grant entry and some counted directly in the page
478 * structure's reference count. Note that reference counts won't get
479 * dangerously confused as long as we always try to decrement the
480 * grant entry first. We may end up with a mismatch between which
481 * mappings and which unmappings are counted via the grant entry, but
482 * really it doesn't matter as privileged domains have carte blanche.
483 */
484 if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
485 return;
486 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
487 }
489 if ( l1v & _PAGE_RW )
490 {
491 put_page_and_type(page);
492 }
493 else
494 {
495 /* We expect this is rare so we blow the entire shadow LDT. */
496 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
497 PGT_ldt_page)) &&
498 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
499 invalidate_shadow_ldt(e->exec_domain[0]);
500 put_page(page);
501 }
502 }
505 /*
506 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
507 * Note also that this automatically deals correctly with linear p.t.'s.
508 */
509 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
510 {
511 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
512 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
513 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
514 }
517 static int alloc_l2_table(struct pfn_info *page)
518 {
519 struct domain *d = page_get_owner(page);
520 unsigned long page_nr = page_to_pfn(page);
521 l2_pgentry_t *pl2e;
522 int i;
524 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
526 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
527 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
528 goto fail;
530 #if defined(__i386__)
531 /* Now we add our private high mappings. */
532 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
533 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
534 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
535 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
536 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
537 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
538 mk_l2_pgentry(__pa(page_get_owner(page)->arch.mm_perdomain_pt) |
539 __PAGE_HYPERVISOR);
540 #endif
542 unmap_domain_mem(pl2e);
543 return 1;
545 fail:
546 while ( i-- > 0 )
547 put_page_from_l2e(pl2e[i], page_nr);
549 unmap_domain_mem(pl2e);
550 return 0;
551 }
554 static int alloc_l1_table(struct pfn_info *page)
555 {
556 struct domain *d = page_get_owner(page);
557 unsigned long page_nr = page_to_pfn(page);
558 l1_pgentry_t *pl1e;
559 int i;
561 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
563 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
564 if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
565 goto fail;
567 unmap_domain_mem(pl1e);
568 return 1;
570 fail:
571 while ( i-- > 0 )
572 put_page_from_l1e(pl1e[i], d);
574 unmap_domain_mem(pl1e);
575 return 0;
576 }
579 static void free_l2_table(struct pfn_info *page)
580 {
581 unsigned long page_nr = page - frame_table;
582 l2_pgentry_t *pl2e;
583 int i;
585 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
587 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
588 put_page_from_l2e(pl2e[i], page_nr);
590 unmap_domain_mem(pl2e);
591 }
594 static void free_l1_table(struct pfn_info *page)
595 {
596 struct domain *d = page_get_owner(page);
597 unsigned long page_nr = page - frame_table;
598 l1_pgentry_t *pl1e;
599 int i;
601 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
603 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
604 put_page_from_l1e(pl1e[i], d);
606 unmap_domain_mem(pl1e);
607 }
610 static inline int update_l2e(l2_pgentry_t *pl2e,
611 l2_pgentry_t ol2e,
612 l2_pgentry_t nl2e)
613 {
614 unsigned long o = cmpxchg((unsigned long *)pl2e,
615 l2_pgentry_val(ol2e),
616 l2_pgentry_val(nl2e));
617 if ( o != l2_pgentry_val(ol2e) )
618 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
619 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
620 return (o == l2_pgentry_val(ol2e));
621 }
624 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
625 static int mod_l2_entry(l2_pgentry_t *pl2e,
626 l2_pgentry_t nl2e,
627 unsigned long pfn)
628 {
629 l2_pgentry_t ol2e;
630 unsigned long _ol2e;
632 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
633 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
634 {
635 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
636 return 0;
637 }
639 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
640 return 0;
641 ol2e = mk_l2_pgentry(_ol2e);
643 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
644 {
645 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
646 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
647 return update_l2e(pl2e, ol2e, nl2e);
649 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
650 ((unsigned long)pl2e &
651 ~PAGE_MASK) >> 2)) )
652 return 0;
654 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
655 {
656 put_page_from_l2e(nl2e, pfn);
657 return 0;
658 }
660 put_page_from_l2e(ol2e, pfn);
661 return 1;
662 }
664 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
665 return 0;
667 put_page_from_l2e(ol2e, pfn);
668 return 1;
669 }
672 static inline int update_l1e(l1_pgentry_t *pl1e,
673 l1_pgentry_t ol1e,
674 l1_pgentry_t nl1e)
675 {
676 unsigned long o = l1_pgentry_val(ol1e);
677 unsigned long n = l1_pgentry_val(nl1e);
679 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
680 unlikely(o != l1_pgentry_val(ol1e)) )
681 {
682 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
683 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
684 return 0;
685 }
687 return 1;
688 }
691 /* Update the L1 entry at pl1e to new value nl1e. */
692 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
693 {
694 l1_pgentry_t ol1e;
695 unsigned long _ol1e;
696 struct domain *d = current->domain;
698 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
699 {
700 MEM_LOG("Bad get_user\n");
701 return 0;
702 }
704 ol1e = mk_l1_pgentry(_ol1e);
706 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
707 {
708 /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
709 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
710 return update_l1e(pl1e, ol1e, nl1e);
712 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
713 return 0;
715 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
716 {
717 put_page_from_l1e(nl1e, d);
718 return 0;
719 }
721 put_page_from_l1e(ol1e, d);
722 return 1;
723 }
725 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
726 return 0;
728 put_page_from_l1e(ol1e, d);
729 return 1;
730 }
733 int alloc_page_type(struct pfn_info *page, unsigned int type)
734 {
735 switch ( type )
736 {
737 case PGT_l1_page_table:
738 return alloc_l1_table(page);
739 case PGT_l2_page_table:
740 return alloc_l2_table(page);
741 case PGT_gdt_page:
742 case PGT_ldt_page:
743 return alloc_segdesc_page(page);
744 default:
745 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
746 type, page->u.inuse.type_info,
747 page->count_info);
748 BUG();
749 }
751 return 0;
752 }
755 void free_page_type(struct pfn_info *page, unsigned int type)
756 {
757 struct domain *d = page_get_owner(page);
759 switch ( type )
760 {
761 case PGT_l1_page_table:
762 free_l1_table(page);
763 break;
765 case PGT_l2_page_table:
766 free_l2_table(page);
767 break;
769 default:
770 BUG();
771 }
773 if ( unlikely(d->arch.shadow_mode) &&
774 (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) )
775 {
776 unshadow_table(page_to_pfn(page), type);
777 put_shadow_status(d);
778 }
779 }
782 void put_page_type(struct pfn_info *page)
783 {
784 u32 nx, x, y = page->u.inuse.type_info;
786 again:
787 do {
788 x = y;
789 nx = x - 1;
791 ASSERT((x & PGT_count_mask) != 0);
793 /*
794 * The page should always be validated while a reference is held. The
795 * exception is during domain destruction, when we forcibly invalidate
796 * page-table pages if we detect a referential loop.
797 * See domain.c:relinquish_list().
798 */
799 ASSERT((x & PGT_validated) ||
800 test_bit(DF_DYING, &page_get_owner(page)->d_flags));
802 if ( unlikely((nx & PGT_count_mask) == 0) )
803 {
804 /* Record TLB information for flush later. Races are harmless. */
805 page->tlbflush_timestamp = tlbflush_current_time();
807 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
808 likely(nx & PGT_validated) )
809 {
810 /*
811 * Page-table pages must be unvalidated when count is zero. The
812 * 'free' is safe because the refcnt is non-zero and validated
813 * bit is clear => other ops will spin or fail.
814 */
815 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
816 x & ~PGT_validated)) != x) )
817 goto again;
818 /* We cleared the 'valid bit' so we do the clear up. */
819 free_page_type(page, x & PGT_type_mask);
820 /* Carry on, but with the 'valid bit' now clear. */
821 x &= ~PGT_validated;
822 nx &= ~PGT_validated;
823 }
824 }
825 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
826 (PGT_pinned | 1)) )
827 {
828 /* Page is now only pinned. Make the back pointer mutable again. */
829 nx |= PGT_va_mutable;
830 }
831 }
832 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
833 }
836 int get_page_type(struct pfn_info *page, u32 type)
837 {
838 u32 nx, x, y = page->u.inuse.type_info;
840 again:
841 do {
842 x = y;
843 nx = x + 1;
844 if ( unlikely((nx & PGT_count_mask) == 0) )
845 {
846 MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
847 return 0;
848 }
849 else if ( unlikely((x & PGT_count_mask) == 0) )
850 {
851 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
852 {
853 /*
854 * On type change we check whether stale TLB entries must be flushed. This
855 * may be unnecessary (e.g., page was GDT/LDT) but those
856 * circumstances should be very rare.
857 */
858 struct domain *d = page_get_owner(page);
859 if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
860 page->tlbflush_timestamp)) )
861 {
862 perfc_incr(need_flush_tlb_flush);
863 flush_tlb_cpu(d->exec_domain[0]->processor);
864 }
866 /* We lose existing type, back pointer, and validity. */
867 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
868 nx |= type;
870 /* No special validation needed for writable pages. */
871 /* Page tables and GDT/LDT need to be scanned for validity. */
872 if ( type == PGT_writable_page )
873 nx |= PGT_validated;
874 }
875 }
876 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
877 {
878 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
879 {
880 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
881 ((type & PGT_type_mask) != PGT_l1_page_table) )
882 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
883 x & PGT_type_mask, type, page_to_pfn(page));
884 return 0;
885 }
886 else if ( (x & PGT_va_mask) == PGT_va_mutable )
887 {
888 /* The va backpointer is mutable, hence we update it. */
889 nx &= ~PGT_va_mask;
890 nx |= type; /* we know the actual type is correct */
891 }
892 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
893 {
894 /* This table is potentially mapped at multiple locations. */
895 nx &= ~PGT_va_mask;
896 nx |= PGT_va_unknown;
897 }
898 }
899 else if ( unlikely(!(x & PGT_validated)) )
900 {
901 /* Someone else is updating validation of this page. Wait... */
902 while ( (y = page->u.inuse.type_info) == x )
903 {
904 rep_nop();
905 barrier();
906 }
907 goto again;
908 }
909 }
910 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
912 if ( unlikely(!(nx & PGT_validated)) )
913 {
914 /* Try to validate page type; drop the new reference on failure. */
915 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
916 {
917 MEM_LOG("Error while validating pfn %08lx for type %08x."
918 " caf=%08x taf=%08x\n",
919 page_to_pfn(page), type,
920 page->count_info,
921 page->u.inuse.type_info);
922 /* No one else can get a reference: we hold the only ref. */
923 page->u.inuse.type_info = 0;
924 return 0;
925 }
927 /* No one else is updating simultaneously. */
928 __set_bit(_PGT_validated, &page->u.inuse.type_info);
929 }
931 return 1;
932 }
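/*
 * Illustrative, non-normative sketch of the reference discipline enforced by
 * get_page_type()/put_page_type() above: the first type reference triggers
 * validation via alloc_page_type(), and dropping the last one triggers
 * free_page_type(), unless the page has been pinned with
 * MMUEXT_PIN_L?_TABLE. 'do_something_with_l1()' is a made-up placeholder.
 */
#if 0 /* illustrative example -- not compiled as part of memory.c */
static int example_use_l1_frame(unsigned long pfn, struct domain *d)
{
    struct pfn_info *page = &frame_table[pfn];

    /* tot_count++ and type_count++; validates the L1 on the 0 -> 1 edge. */
    if ( unlikely(!get_page_and_type(page, d,
                                     PGT_l1_page_table|PGT_va_mutable)) )
        return 0;

    do_something_with_l1(pfn);   /* hypothetical work with the validated L1 */

    /* type_count-- and tot_count--; invalidates the L1 on the 1 -> 0 edge. */
    put_page_and_type(page);
    return 1;
}
#endif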
935 int new_guest_cr3(unsigned long pfn)
936 {
937 struct exec_domain *ed = current;
938 struct domain *d = ed->domain;
939 int okay, cpu = smp_processor_id();
940 unsigned long old_base_pfn;
942 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
943 if ( likely(okay) )
944 {
945 invalidate_shadow_ldt(ed);
947 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
948 old_base_pfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT;
949 ed->arch.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
951 shadow_mk_pagetable(ed);
953 write_ptbase(ed);
955 put_page_and_type(&frame_table[old_base_pfn]);
956 }
957 else
958 {
959 MEM_LOG("Error while installing new baseptr %08lx", pfn);
960 }
962 return okay;
963 }
965 static int do_extended_command(unsigned long ptr, unsigned long val)
966 {
966 {
967 int okay = 1, cpu = smp_processor_id();
968 unsigned int cmd = val & MMUEXT_CMD_MASK;
969 unsigned long pfn = ptr >> PAGE_SHIFT;
970 struct pfn_info *page = &frame_table[pfn];
971 struct exec_domain *ed = current;
972 struct domain *d = ed->domain, *nd, *e;
973 u32 x, y;
974 domid_t domid;
975 grant_ref_t gntref;
977 switch ( cmd )
978 {
979 case MMUEXT_PIN_L1_TABLE:
980 case MMUEXT_PIN_L2_TABLE:
981 /*
982 * We insist that, if you pin an L1 page, it's the first thing that
983 * you do to it. This is because we require the backptr to still be
984 * mutable. This assumption seems safe.
985 */
986 okay = get_page_and_type_from_pagenr(
987 pfn,
988 ((cmd==MMUEXT_PIN_L2_TABLE) ?
989 PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
990 FOREIGNDOM);
992 if ( unlikely(!okay) )
993 {
994 MEM_LOG("Error while pinning pfn %08lx", pfn);
995 break;
996 }
998 if ( unlikely(test_and_set_bit(_PGT_pinned,
999 &page->u.inuse.type_info)) )
1000 {
1001 MEM_LOG("Pfn %08lx already pinned", pfn);
1002 put_page_and_type(page);
1003 okay = 0;
1004 break;
1005 }
1007 break;
1009 case MMUEXT_UNPIN_TABLE:
1010 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
1011 {
1012 MEM_LOG("Page %08lx bad domain (dom=%p)",
1013 ptr, page_get_owner(page));
1014 }
1015 else if ( likely(test_and_clear_bit(_PGT_pinned,
1016 &page->u.inuse.type_info)) )
1017 {
1018 put_page_and_type(page);
1019 put_page(page);
1020 }
1021 else
1022 {
1023 okay = 0;
1024 put_page(page);
1025 MEM_LOG("Pfn %08lx not pinned", pfn);
1026 }
1027 break;
1029 case MMUEXT_NEW_BASEPTR:
1030 okay = new_guest_cr3(pfn);
1031 break;
1033 case MMUEXT_TLB_FLUSH:
1034 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1035 break;
1037 case MMUEXT_INVLPG:
1038 __flush_tlb_one(ptr);
1039 break;
1041 case MMUEXT_FLUSH_CACHE:
1042 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1043 {
1044 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1045 okay = 0;
1046 }
1047 else
1048 {
1049 wbinvd();
1050 }
1051 break;
1053 case MMUEXT_SET_LDT:
1054 {
1055 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
1056 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1057 (ents > 8192) ||
1058 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
1059 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
1060 {
1061 okay = 0;
1062 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
1063 }
1064 else if ( (ed->arch.ldt_ents != ents) ||
1065 (ed->arch.ldt_base != ptr) )
1066 {
1067 invalidate_shadow_ldt(ed);
1068 ed->arch.ldt_base = ptr;
1069 ed->arch.ldt_ents = ents;
1070 load_LDT(ed);
1071 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1072 if ( ents != 0 )
1073 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1074 }
1075 break;
1076 }
1078 case MMUEXT_SET_FOREIGNDOM:
1079 domid = (domid_t)(val >> 16);
1081 if ( (e = percpu_info[cpu].foreign) != NULL )
1082 put_domain(e);
1083 percpu_info[cpu].foreign = NULL;
1085 if ( !IS_PRIV(d) )
1086 {
1087 switch ( domid )
1088 {
1089 case DOMID_IO:
1090 get_knownalive_domain(dom_io);
1091 percpu_info[cpu].foreign = dom_io;
1092 break;
1093 default:
1094 MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
1095 okay = 0;
1096 break;
1097 }
1098 }
1099 else
1100 {
1101 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1102 if ( e == NULL )
1103 {
1104 switch ( domid )
1105 {
1106 case DOMID_XEN:
1107 get_knownalive_domain(dom_xen);
1108 percpu_info[cpu].foreign = dom_xen;
1109 break;
1110 case DOMID_IO:
1111 get_knownalive_domain(dom_io);
1112 percpu_info[cpu].foreign = dom_io;
1113 break;
1114 default:
1115 MEM_LOG("Unknown domain '%u'", domid);
1116 okay = 0;
1117 break;
1118 }
1119 }
1120 }
1121 break;
1123 case MMUEXT_TRANSFER_PAGE:
1124 domid = (domid_t)(val >> 16);
1125 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
1127 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
1128 unlikely(!pfn_is_ram(pfn)) ||
1129 unlikely((e = find_domain_by_id(domid)) == NULL) )
1130 {
1131 MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
1132 okay = 0;
1133 break;
1134 }
1136 spin_lock(&d->page_alloc_lock);
1138 /*
1139 * The tricky bit: atomically release ownership while there is just one
1140 * benign reference to the page (PGC_allocated). If that reference
1141 * disappears then the deallocation routine will safely spin.
1142 */
1143 nd = page_get_owner(page);
1144 y = page->count_info;
1145 do {
1146 x = y;
1147 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1148 (1|PGC_allocated)) ||
1149 unlikely(nd != d) )
1150 {
1151 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1152 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1153 d, d->id, nd, x, page->u.inuse.type_info);
1154 spin_unlock(&d->page_alloc_lock);
1155 put_domain(e);
1156 return 0;
1157 }
1158 __asm__ __volatile__(
1159 LOCK_PREFIX "cmpxchg8b %2"
1160 : "=d" (nd), "=a" (y),
1161 "=m" (*(volatile u64 *)(&page->count_info))
1162 : "0" (d), "1" (x), "c" (NULL), "b" (x) );
1163 }
1164 while ( unlikely(nd != d) || unlikely(y != x) );
1166 /*
1167 * Unlink from 'd'. At least one reference remains (now anonymous), so
1168 * no one else is spinning to try to delete this page from 'd'.
1169 */
1170 d->tot_pages--;
1171 list_del(&page->list);
1173 spin_unlock(&d->page_alloc_lock);
1175 spin_lock(&e->page_alloc_lock);
1177 /*
1178 * Check that 'e' will accept the page and has reservation headroom.
1179 * Also, a domain mustn't have PGC_allocated pages when it is dying.
1180 */
1181 ASSERT(e->tot_pages <= e->max_pages);
1182 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1183 unlikely(e->tot_pages == e->max_pages) ||
1184 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
1185 {
1186 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1187 "provided a bad grant ref, or is dying (%08lx).\n",
1188 e->tot_pages, e->max_pages, e->d_flags);
1189 spin_unlock(&e->page_alloc_lock);
1190 put_domain(e);
1191 okay = 0;
1192 break;
1193 }
1195 /* Okay, add the page to 'e'. */
1196 if ( unlikely(e->tot_pages++ == 0) )
1197 get_knownalive_domain(e);
1198 list_add_tail(&page->list, &e->page_list);
1199 page_set_owner(page, e);
1201 spin_unlock(&e->page_alloc_lock);
1203 /* Transfer is all done: tell the guest about its new page frame. */
1204 gnttab_notify_transfer(e, gntref, pfn);
1206 put_domain(e);
1207 break;
1209 case MMUEXT_REASSIGN_PAGE:
1210 if ( unlikely(!IS_PRIV(d)) )
1211 {
1212 MEM_LOG("Dom %u has no reassignment priv", d->id);
1213 okay = 0;
1214 break;
1215 }
1217 e = percpu_info[cpu].foreign;
1218 if ( unlikely(e == NULL) )
1219 {
1220 MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
1221 okay = 0;
1222 break;
1223 }
1225 /*
1226 * Grab both page_list locks, in order. This prevents the page from
1227 * disappearing elsewhere while we modify the owner, and we'll need
1228 * both locks if we're successful so that we can change lists.
1229 */
1230 if ( d < e )
1231 {
1232 spin_lock(&d->page_alloc_lock);
1233 spin_lock(&e->page_alloc_lock);
1234 }
1235 else
1236 {
1237 spin_lock(&e->page_alloc_lock);
1238 spin_lock(&d->page_alloc_lock);
1239 }
1241 /* A domain shouldn't have PGC_allocated pages when it is dying. */
1242 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1243 unlikely(IS_XEN_HEAP_FRAME(page)) )
1244 {
1245 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
1246 okay = 0;
1247 goto reassign_fail;
1248 }
1250 /*
1251 * The tricky bit: atomically change owner while there is just one
1252 * benign reference to the page (PGC_allocated). If that reference
1253 * disappears then the deallocation routine will safely spin.
1254 */
1255 nd = page_get_owner(page);
1256 y = page->count_info;
1257 do {
1258 x = y;
1259 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1260 (1|PGC_allocated)) ||
1261 unlikely(nd != d) )
1262 {
1263 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1264 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1265 d, d->id, nd, x, page->u.inuse.type_info);
1266 okay = 0;
1267 goto reassign_fail;
1268 }
1269 __asm__ __volatile__(
1270 LOCK_PREFIX "cmpxchg8b %3"
1271 : "=d" (nd), "=a" (y), "=c" (e),
1272 "=m" (*(volatile u64 *)(&page->count_info))
1273 : "0" (d), "1" (x), "c" (e), "b" (x) );
1274 }
1275 while ( unlikely(nd != d) || unlikely(y != x) );
1277 /*
1278 * Unlink from 'd'. We transferred at least one reference to 'e', so
1279 * no one else is spinning to try to delete this page from 'd'.
1280 */
1281 d->tot_pages--;
1282 list_del(&page->list);
1284 /*
1285 * Add the page to 'e'. Someone may already have removed the last
1286 * reference and want to remove the page from 'e'. However, we have
1287 * the lock so they'll spin waiting for us.
1288 */
1289 if ( unlikely(e->tot_pages++ == 0) )
1290 get_knownalive_domain(e);
1291 list_add_tail(&page->list, &e->page_list);
1293 reassign_fail:
1294 spin_unlock(&d->page_alloc_lock);
1295 spin_unlock(&e->page_alloc_lock);
1296 break;
1298 case MMUEXT_CLEAR_FOREIGNDOM:
1299 if ( (e = percpu_info[cpu].foreign) != NULL )
1300 put_domain(e);
1301 percpu_info[cpu].foreign = NULL;
1302 break;
1304 default:
1305 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
1306 okay = 0;
1307 break;
1308 }
1310 return okay;
1311 }
1313 int do_mmu_update(
1314 mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
1315 {
1316 /*
1317 * We steal the m.s.b. of the @count parameter to indicate whether this
1318 * invocation of do_mmu_update() is resuming a previously preempted call.
1319 * We steal the next 15 bits to remember the current FOREIGNDOM.
1320 */
1321 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
1322 #define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
1323 #define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
1325 mmu_update_t req;
1326 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
1327 struct pfn_info *page;
1328 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
1329 unsigned int cmd, done = 0;
1330 unsigned long prev_spfn = 0;
1331 l1_pgentry_t *prev_spl1e = 0;
1332 struct exec_domain *ed = current;
1333 struct domain *d = ed->domain;
1334 u32 type_info;
1335 domid_t domid;
1337 LOCK_BIGLOCK(d);
1339 cleanup_writable_pagetable(d);
1341 /*
1342 * If we are resuming after preemption, read how much work we have already
1343 * done. This allows us to set the @done output parameter correctly.
1344 * We also reset FOREIGNDOM here.
1345 */
1346 if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
1347 {
1348 if ( !(count & MMU_UPDATE_PREEMPTED) )
1349 {
1350 /* Count overflow into private FOREIGNDOM field. */
1351 MEM_LOG("do_mmu_update count is too large");
1352 rc = -EINVAL;
1353 goto out;
1354 }
1355 count &= ~MMU_UPDATE_PREEMPTED;
1356 domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
1357 count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
1358 if ( unlikely(pdone != NULL) )
1359 (void)get_user(done, pdone);
1360 if ( (domid != current->domain->id) &&
1361 !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
1362 {
1363 rc = -EINVAL;
1364 goto out;
1365 }
1366 }
1368 perfc_incrc(calls_to_mmu_update);
1369 perfc_addc(num_page_updates, count);
1371 if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
1372 {
1373 rc = -EFAULT;
1374 goto out;
1375 }
1377 for ( i = 0; i < count; i++ )
1378 {
1379 if ( hypercall_preempt_check() )
1380 {
1381 rc = hypercall3_create_continuation(
1382 __HYPERVISOR_mmu_update, ureqs,
1383 (count - i) |
1384 (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) |
1385 MMU_UPDATE_PREEMPTED, pdone);
1386 break;
1387 }
1389 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
1390 {
1391 MEM_LOG("Bad __copy_from_user");
1392 rc = -EFAULT;
1393 break;
1394 }
1396 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
1397 pfn = req.ptr >> PAGE_SHIFT;
1399 okay = 0;
1401 switch ( cmd )
1402 {
1403 /*
1404 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
1405 */
1406 case MMU_NORMAL_PT_UPDATE:
1407 if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
1408 {
1409 MEM_LOG("Could not get page for normal update");
1410 break;
1411 }
1413 if ( likely(prev_pfn == pfn) )
1414 {
1415 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
1416 }
1417 else
1418 {
1419 if ( prev_pfn != 0 )
1420 unmap_domain_mem((void *)va);
1421 va = (unsigned long)map_domain_mem(req.ptr);
1422 prev_pfn = pfn;
1423 }
1425 page = &frame_table[pfn];
1426 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
1427 {
1428 case PGT_l1_page_table:
1429 if ( likely(get_page_type(
1430 page, type_info & (PGT_type_mask|PGT_va_mask))) )
1431 {
1432 okay = mod_l1_entry((l1_pgentry_t *)va,
1433 mk_l1_pgentry(req.val));
1435 if ( unlikely(d->arch.shadow_mode) && okay &&
1436 (get_shadow_status(d, page-frame_table) &
1437 PSH_shadowed) )
1438 {
1439 shadow_l1_normal_pt_update(
1440 req.ptr, req.val, &prev_spfn, &prev_spl1e);
1441 put_shadow_status(d);
1442 }
1444 put_page_type(page);
1445 }
1446 break;
1447 case PGT_l2_page_table:
1448 if ( likely(get_page_type(page, PGT_l2_page_table)) )
1449 {
1450 okay = mod_l2_entry((l2_pgentry_t *)va,
1451 mk_l2_pgentry(req.val),
1452 pfn);
1454 if ( unlikely(d->arch.shadow_mode) && okay &&
1455 (get_shadow_status(d, page-frame_table) &
1456 PSH_shadowed) )
1457 {
1458 shadow_l2_normal_pt_update(req.ptr, req.val);
1459 put_shadow_status(d);
1460 }
1462 put_page_type(page);
1463 }
1464 break;
1465 default:
1466 if ( likely(get_page_type(page, PGT_writable_page)) )
1467 {
1468 *(unsigned long *)va = req.val;
1469 okay = 1;
1470 put_page_type(page);
1471 }
1472 break;
1473 }
1475 put_page(page);
1476 break;
1478 case MMU_MACHPHYS_UPDATE:
1479 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
1480 {
1481 MEM_LOG("Could not get page for mach->phys update");
1482 break;
1483 }
1485 machine_to_phys_mapping[pfn] = req.val;
1486 okay = 1;
1488 /*
1489 * If in log-dirty mode, mark the corresponding pseudo-physical
1490 * page as dirty.
1491 */
1492 if ( unlikely(d->arch.shadow_mode == SHM_logdirty) &&
1493 mark_dirty(d, pfn) )
1494 d->arch.shadow_dirty_block_count++;
1496 put_page(&frame_table[pfn]);
1497 break;
1499 /*
1500 * MMU_EXTENDED_COMMAND: Extended command is specified
1501 * in the least-significant bits of the 'value' field.
1502 */
1503 case MMU_EXTENDED_COMMAND:
1504 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
1505 okay = do_extended_command(req.ptr, req.val);
1506 break;
1508 default:
1509 MEM_LOG("Invalid page update command %08lx", req.ptr);
1510 break;
1511 }
1513 if ( unlikely(!okay) )
1514 {
1515 rc = -EINVAL;
1516 break;
1517 }
1519 ureqs++;
1520 }
1522 out:
1523 if ( prev_pfn != 0 )
1524 unmap_domain_mem((void *)va);
1526 if ( unlikely(prev_spl1e != 0) )
1527 unmap_domain_mem((void *)prev_spl1e);
1529 deferred_ops = percpu_info[cpu].deferred_ops;
1530 percpu_info[cpu].deferred_ops = 0;
1532 if ( deferred_ops & DOP_FLUSH_TLB )
1533 local_flush_tlb();
1535 if ( deferred_ops & DOP_RELOAD_LDT )
1536 (void)map_ldt_shadow_page(0);
1538 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1539 {
1540 put_domain(percpu_info[cpu].foreign);
1541 percpu_info[cpu].foreign = NULL;
1542 }
1544 /* Add incremental work we have done to the @done output parameter. */
1545 if ( unlikely(pdone != NULL) )
1546 __put_user(done + i, pdone);
1548 UNLOCK_BIGLOCK(d);
1549 return rc;
1550 }
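/*
 * Worked example of the continuation encoding used above, assuming a 32-bit
 * 'count': bit 31 is MMU_UPDATE_PREEMPTED and bits 30..16 hold the FOREIGNDOM
 * id. A call preempted with 10 requests outstanding on behalf of domain 5 is
 * therefore resumed with
 *
 *     count = 10 | (5 << MMU_UPDATE_PREEMPT_FDOM_SHIFT) | MMU_UPDATE_PREEMPTED
 *
 * and the resume path at the top of do_mmu_update() recovers the pieces by
 * clearing MMU_UPDATE_PREEMPTED, taking
 * 'domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT', and then masking off
 * MMU_UPDATE_PREEMPT_FDOM_MASK to leave the remaining request count.
 */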
1553 int do_update_va_mapping(unsigned long page_nr,
1554 unsigned long val,
1555 unsigned long flags)
1556 {
1557 struct exec_domain *ed = current;
1558 struct domain *d = ed->domain;
1559 int err = 0;
1560 unsigned int cpu = ed->processor;
1561 unsigned long deferred_ops;
1563 perfc_incrc(calls_to_update_va);
1565 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
1566 return -EINVAL;
1568 LOCK_BIGLOCK(d);
1570 cleanup_writable_pagetable(d);
1572 /*
1573 * XXX When we make this support 4MB superpages we should also deal with
1574 * the case of updating L2 entries.
1575 */
1577 if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
1578 mk_l1_pgentry(val))) )
1579 err = -EINVAL;
1581 if ( unlikely(d->arch.shadow_mode) )
1582 {
1583 unsigned long sval;
1585 l1pte_propagate_from_guest(d, &val, &sval);
1587 if ( unlikely(__put_user(sval, ((unsigned long *)(
1588 &shadow_linear_pg_table[page_nr])))) )
1589 {
1590 /*
1591 * Since L2's are guaranteed RW, failure indicates the page was not
1592 * shadowed, so ignore.
1593 */
1594 perfc_incrc(shadow_update_va_fail);
1595 }
1597 /*
1598 * If we're in log-dirty mode then we need to note that we've updated
1599 * the PTE in the PT-holding page. We need the machine frame number
1600 * for this.
1601 */
1602 if ( d->arch.shadow_mode == SHM_logdirty )
1603 mark_dirty(d, va_to_l1mfn(page_nr << PAGE_SHIFT));
1605 check_pagetable(d, ed->arch.pagetable, "va"); /* debug */
1606 }
1608 deferred_ops = percpu_info[cpu].deferred_ops;
1609 percpu_info[cpu].deferred_ops = 0;
1611 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
1612 unlikely(flags & UVMF_FLUSH_TLB) )
1613 local_flush_tlb();
1614 else if ( unlikely(flags & UVMF_INVLPG) )
1615 __flush_tlb_one(page_nr << PAGE_SHIFT);
1617 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
1618 (void)map_ldt_shadow_page(0);
1620 UNLOCK_BIGLOCK(d);
1622 return err;
1623 }
1625 int do_update_va_mapping_otherdomain(unsigned long page_nr,
1626 unsigned long val,
1627 unsigned long flags,
1628 domid_t domid)
1629 {
1630 unsigned int cpu = smp_processor_id();
1631 struct domain *d;
1632 int rc;
1634 if ( unlikely(!IS_PRIV(current->domain)) )
1635 return -EPERM;
1637 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
1638 if ( unlikely(d == NULL) )
1639 {
1640 MEM_LOG("Unknown domain '%u'", domid);
1641 return -ESRCH;
1642 }
1644 rc = do_update_va_mapping(page_nr, val, flags);
1646 put_domain(d);
1647 percpu_info[cpu].foreign = NULL;
1649 return rc;
1650 }
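/*
 * Illustrative, non-normative sketch of guest-side use of the two hypercalls
 * above, assuming the HYPERVISOR_update_va_mapping wrappers supplied by a
 * guest OS port (the wrapper names are assumptions, not part of this file).
 * The first argument is the virtual page number (va >> PAGE_SHIFT), the
 * second the new PTE contents, and the flags choose the TLB maintenance done
 * at the end of the call (UVMF_FLUSH_TLB: full local flush; UVMF_INVLPG:
 * flush only this entry).
 */
#if 0 /* illustrative example -- not compiled as part of memory.c */
void example_update_own_mapping(unsigned long va, unsigned long new_pte)
{
    (void)HYPERVISOR_update_va_mapping(va >> PAGE_SHIFT, new_pte, UVMF_INVLPG);
}

void example_update_other_domain(unsigned long va, unsigned long new_pte,
                                 domid_t victim)
{
    /* Privileged callers only -- see the IS_PRIV() check above. */
    (void)HYPERVISOR_update_va_mapping_otherdomain(
        va >> PAGE_SHIFT, new_pte, UVMF_FLUSH_TLB, victim);
}
#endif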
1654 /*************************
1655 * Writable Pagetables
1656 */
1658 ptwr_info_t ptwr_info[NR_CPUS];
1660 #ifdef VERBOSE
1661 int ptwr_debug = 0x0;
1662 #define PTWR_PRINTK(_f, _a...) \
1663 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
1664 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
1665 #else
1666 #define PTWR_PRINTK(_f, _a...) ((void)0)
1667 #endif
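/*
 * Overview of the mechanism implemented by ptwr_do_page_fault() and
 * ptwr_flush() below: when a guest writes to a read-only L1 page-table page,
 * the fault handler snapshots the page, disconnects it from the current
 * address space if it is ACTIVE, and gives the guest a writable mapping of
 * it. ptwr_flush() is invoked later (e.g. via cleanup_writable_pagetable()
 * at the start of do_mmu_update(), or when another page-table page of the
 * same class faults); it write-protects the page again, compares it against
 * the snapshot, and revalidates only the entries that changed.
 */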
1669 /* Flush the given writable p.t. page and write-protect it again. */
1670 void ptwr_flush(const int which)
1672 unsigned long sstat, spte, pte, *ptep, l1va;
1673 l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e;
1674 l2_pgentry_t *pl2e;
1675 int i, cpu = smp_processor_id();
1676 struct exec_domain *ed = current;
1677 struct domain *d = ed->domain;
1679 l1va = ptwr_info[cpu].ptinfo[which].l1va;
1680 ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
1682 /*
1683 * STEP 1. Write-protect the p.t. page so no more updates can occur.
1684 */
1686 if ( unlikely(__get_user(pte, ptep)) )
1688 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1689 /*
1690 * Really a bug. We could read this PTE during the initial fault,
1691 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
1692 */
1693 BUG();
1695 PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
1696 PTWR_PRINT_WHICH, ptep, pte);
1697 pte &= ~_PAGE_RW;
1699 if ( unlikely(d->arch.shadow_mode) )
1701 /* Write-protect the p.t. page in the shadow page table. */
1702 l1pte_propagate_from_guest(d, &pte, &spte);
1703 __put_user(
1704 spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
1706 /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
1707 sstat = get_shadow_status(d, pte >> PAGE_SHIFT);
1708 if ( sstat & PSH_shadowed )
1709 sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
1712 /* Write-protect the p.t. page in the guest page table. */
1713 if ( unlikely(__put_user(pte, ptep)) )
1715 MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
1716 /*
1717 * Really a bug. We could write this PTE during the initial fault,
1718 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
1719 */
1720 BUG();
1723 /* Ensure that there are no stale writable mappings in any TLB. */
1724 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
1725 #if 1
1726 __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
1727 #else
1728 flush_tlb_all();
1729 #endif
1730 PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
1731 PTWR_PRINT_WHICH, ptep, pte);
1733 /*
1734 * STEP 2. Validate any modified PTEs.
1735 */
1737 pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
1738 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1740 ol1e = ptwr_info[cpu].ptinfo[which].page[i];
1741 nl1e = pl1e[i];
1743 if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
1744 continue;
1746 /*
1747 * Fast path for PTEs that have merely been write-protected
1748 * (e.g., during a Unix fork()). A strict reduction in privilege.
1749 */
1750 if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
1752 if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
1754 if ( unlikely(sl1e != NULL) )
1755 l1pte_propagate_from_guest(
1756 d, &l1_pgentry_val(nl1e),
1757 &l1_pgentry_val(sl1e[i]));
1758 put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
1760 continue;
1763 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
1765 MEM_LOG("ptwr: Could not re-validate l1 page\n");
1766 /*
1767 * Make the remaining p.t's consistent before crashing, so the
1768 * reference counts are correct.
1769 */
1770 memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
1771 (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
1772 unmap_domain_mem(pl1e);
1773 ptwr_info[cpu].ptinfo[which].l1va = 0;
1774 UNLOCK_BIGLOCK(d);
1775 domain_crash();
1778 if ( unlikely(sl1e != NULL) )
1779 l1pte_propagate_from_guest(
1780 d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
1782 if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
1783 put_page_from_l1e(ol1e, d);
1785 unmap_domain_mem(pl1e);
1787 /*
1788 * STEP 3. Reattach the L1 p.t. page into the current address space.
1789 */
1791 if ( (which == PTWR_PT_ACTIVE) && likely(!d->arch.shadow_mode) )
1793 pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
1794 *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
1797 /*
1798 * STEP 4. Final tidy-up.
1799 */
1801 ptwr_info[cpu].ptinfo[which].l1va = 0;
1803 if ( unlikely(sl1e != NULL) )
1805 unmap_domain_mem(sl1e);
1806 put_shadow_status(d);
1810 /* Write page fault handler: check if guest is trying to modify a PTE. */
1811 int ptwr_do_page_fault(unsigned long addr)
1813 unsigned long pte, pfn, l2e;
1814 struct pfn_info *page;
1815 l2_pgentry_t *pl2e;
1816 int which, cpu = smp_processor_id();
1817 u32 l2_idx;
1819 /*
1820 * Attempt to read the PTE that maps the VA being accessed. By checking for
1821 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
1822 */
1823 if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
1824 _PAGE_PRESENT) ||
1825 __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
1827 return 0;
1830 pfn = pte >> PAGE_SHIFT;
1831 page = &frame_table[pfn];
1833 /* We are looking only for read-only mappings of p.t. pages. */
1834 if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
1835 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
1837 return 0;
1840 /* Get the L2 index at which this L1 p.t. is always mapped. */
1841 l2_idx = page->u.inuse.type_info & PGT_va_mask;
1842 if ( unlikely(l2_idx >= PGT_va_unknown) )
1844 domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
1846 l2_idx >>= PGT_va_shift;
1848 if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
1850 MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
1851 domain_crash();
1854 /*
1855 * Is the L1 p.t. mapped into the current address space? If so we call it
1856 * an ACTIVE p.t., otherwise it is INACTIVE.
1857 */
1858 pl2e = &linear_l2_table[l2_idx];
1859 l2e = l2_pgentry_val(*pl2e);
1860 which = PTWR_PT_INACTIVE;
1861 if ( (l2e >> PAGE_SHIFT) == pfn )
1863 /* Check the PRESENT bit to set ACTIVE. */
1864 if ( likely(l2e & _PAGE_PRESENT) )
1865 which = PTWR_PT_ACTIVE;
1866 else {
1867 /*
1868 * If the PRESENT bit is clear, we may be conflicting with
1869 * the current ACTIVE p.t. (it may be the same p.t. mapped
1870 * at another virt addr).
1871 * The ptwr_flush call below will restore the PRESENT bit.
1872 */
1873 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
1874 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
1875 which = PTWR_PT_ACTIVE;
1879 PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
1880 "pfn %08lx\n", PTWR_PRINT_WHICH,
1881 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
1883 /*
1884 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
1885 * time. If there is already one, we must flush it out.
1886 */
1887 if ( ptwr_info[cpu].ptinfo[which].l1va )
1888 ptwr_flush(which);
1890 ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
1891 ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
1893 /* For safety, disconnect the L1 p.t. page from current space. */
1894 if ( (which == PTWR_PT_ACTIVE) &&
1895 likely(!current->domain->arch.shadow_mode) )
1897 *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
1898 #if 1
1899 flush_tlb(); /* XXX Multi-CPU guests? */
1900 #else
1901 flush_tlb_all();
1902 #endif
1905 /* Temporarily map the L1 page, and make a copy of it. */
1906 ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
1907 memcpy(ptwr_info[cpu].ptinfo[which].page,
1908 ptwr_info[cpu].ptinfo[which].pl1e,
1909 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
1911 /* Finally, make the p.t. page writable by the guest OS. */
1912 pte |= _PAGE_RW;
1913 PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
1914 &linear_pg_table[addr>>PAGE_SHIFT], pte);
1915 if ( unlikely(__put_user(pte, (unsigned long *)
1916 &linear_pg_table[addr>>PAGE_SHIFT])) )
1918 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
1919 &linear_pg_table[addr>>PAGE_SHIFT]);
1920 /* Toss the writable pagetable state and crash. */
1921 unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
1922 ptwr_info[cpu].ptinfo[which].l1va = 0;
1923 domain_crash();
1926 return EXCRET_fault_fixed;
1929 static __init int ptwr_init(void)
1931 int i;
1933 for ( i = 0; i < smp_num_cpus; i++ )
1935 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
1936 (void *)alloc_xenheap_page();
1937 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
1938 (void *)alloc_xenheap_page();
1941 return 0;
1943 __initcall(ptwr_init);
1948 /************************************************************************/
1949 /************************************************************************/
1950 /************************************************************************/
1952 #ifndef NDEBUG
1954 void ptwr_status(void)
1956 unsigned long pte, *ptep, pfn;
1957 struct pfn_info *page;
1958 int cpu = smp_processor_id();
1960 ptep = (unsigned long *)&linear_pg_table
1961 [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
1963 if ( __get_user(pte, ptep) ) {
1964 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1965 domain_crash();
1968 pfn = pte >> PAGE_SHIFT;
1969 page = &frame_table[pfn];
1970 printk("need to alloc l1 page %p\n", page);
1971 /* make pt page writable */
1972 printk("need to make read-only l1-page at %p is %08lx\n",
1973 ptep, pte);
1975 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
1976 return;
1978 if ( __get_user(pte, (unsigned long *)
1979 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
1980 MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
1981 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
1982 domain_crash();
1984 pfn = pte >> PAGE_SHIFT;
1985 page = &frame_table[pfn];
1988 void audit_domain(struct domain *d)
1990 int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
1992 void adjust (struct pfn_info *page, int dir, int adjtype)
1994 int count = page->count_info & PGC_count_mask;
1996 if ( adjtype )
1998 int tcount = page->u.inuse.type_info & PGT_count_mask;
2000 ttot++;
2002 tcount += dir;
2004 if ( tcount < 0 )
2006 /* This will only come out once. */
2007 printk("Audit %d: type count whent below zero pfn=%x "
2008 "taf=%x otaf=%x\n",
2009 d->id, page-frame_table,
2010 page->u.inuse.type_info,
2011 page->tlbflush_timestamp);
2014 page->u.inuse.type_info =
2015 (page->u.inuse.type_info & ~PGT_count_mask) |
2016 (tcount & PGT_count_mask);
2019 ctot++;
2020 count += dir;
2021 if ( count < 0 )
2023 /* This will only come out once. */
2024 printk("Audit %d: general count whent below zero pfn=%x "
2025 "taf=%x otaf=%x\n",
2026 d->id, page-frame_table,
2027 page->u.inuse.type_info,
2028 page->tlbflush_timestamp);
2031 page->count_info =
2032 (page->count_info & ~PGC_count_mask) |
2033 (count & PGC_count_mask);
    void scan_for_pfn(struct domain *d, unsigned long xpfn)
    {
        unsigned long pfn, *pt;
        struct list_head *list_ent;
        struct pfn_info *page;
        int i;

        list_ent = d->page_list.next;
        for ( i = 0; (list_ent != &d->page_list); i++ )
        {
            pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
            page = &frame_table[pfn];

            switch ( page->u.inuse.type_info & PGT_type_mask )
            {
            case PGT_l1_page_table:
            case PGT_l2_page_table:
                pt = map_domain_mem(pfn<<PAGE_SHIFT);
                for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
                    if ( (pt[i] & _PAGE_PRESENT) &&
                         ((pt[i] >> PAGE_SHIFT) == xpfn) )
                        printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
                               d->id, i, pfn, page->u.inuse.type_info,
                               page->count_info);
                unmap_domain_mem(pt);
                break;
            }

            list_ent = frame_table[pfn].list.next;
        }
    }
    void scan_for_pfn_remote(unsigned long xpfn)
    {
        struct domain *e;
        for_each_domain ( e )
            scan_for_pfn( e, xpfn );
    }
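
    /*
     * Main body of audit_domain().  The target domain is paused (unless it
     * is the caller's own domain) and synchronise_pagetables() is invoked
     * before the page list is walked, under d->page_alloc_lock, in the four
     * phases numbered 0 to 3 below.
     */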
    int i;
    unsigned long pfn;
    struct list_head *list_ent;
    struct pfn_info *page;

    if ( d != current->domain )
        domain_pause(d);
    synchronise_pagetables(~0UL);

    printk("pt base=%lx sh_info=%x\n",
           pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT,
           virt_to_page(d->shared_info)-frame_table);

    spin_lock(&d->page_alloc_lock);
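
    /*
     * PHASE 0: sanity checks.  Every page on d->page_list must be owned by
     * d and must not have a type count exceeding its general count; stray
     * type counts on untyped pages are also reported.  Each page's current
     * type_info is stashed in its borrowed tlbflush_timestamp field so that
     * later messages can show the pre-audit value (the 'ot'/'otaf' fields).
     */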
    /* PHASE 0 */

    list_ent = d->page_list.next;
    for ( i = 0; (list_ent != &d->page_list); i++ )
    {
        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
        page = &frame_table[pfn];

        if ( page_get_owner(page) != d )
            BUG();

        if ( (page->u.inuse.type_info & PGT_count_mask) >
             (page->count_info & PGC_count_mask) )
            printk("taf > caf %x %x pfn=%lx\n",
                   page->u.inuse.type_info, page->count_info, pfn );

#if 0 /* SYSV shared memory pages plus writeable files. */
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
             (page->u.inuse.type_info & PGT_count_mask) > 1 )
        {
            printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
                   pfn,
                   page->u.inuse.type_info,
                   page->count_info );
            scan_for_pfn_remote(pfn);
        }
#endif
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
             (page->u.inuse.type_info & PGT_count_mask) > 1 )
        {
            printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
                   pfn,
                   page->u.inuse.type_info,
                   page->count_info );
        }

        /* Use tlbflush_timestamp to store original type_info. */
        page->tlbflush_timestamp = page->u.inuse.type_info;

        list_ent = frame_table[pfn].list.next;
    }
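
    /*
     * PHASE 1: for every reference the audit can account for -- the current
     * page-table base, each pinned L1/L2 table, and each present L2/L1 entry
     * that points at a page owned by d -- the corresponding count is
     * decremented via adjust(..., -1, ...).  Mappings of low memory
     * (pfn < 0x100) and of frames beyond max_page (assumed to be I/O
     * mappings) are only counted, not adjusted.
     */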
    /* PHASE 1 */

    adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], -1, 1);

    list_ent = d->page_list.next;
    for ( i = 0; (list_ent != &d->page_list); i++ )
    {
        unsigned long *pt;
        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
        page = &frame_table[pfn];

        if ( page_get_owner(page) != d )
            BUG();

        switch ( page->u.inuse.type_info & PGT_type_mask )
        {
        case PGT_l2_page_table:

            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
                printk("Audit %d: L2 not validated %x\n",
                       d->id, page->u.inuse.type_info);

            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
                printk("Audit %d: L2 not pinned %x\n",
                       d->id, page->u.inuse.type_info);
            else
                adjust( page, -1, 1 );

            pt = map_domain_mem( pfn<<PAGE_SHIFT );

            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
            {
                if ( pt[i] & _PAGE_PRESENT )
                {
                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
                    struct pfn_info *l1page = &frame_table[l1pfn];

                    if ( page_get_owner(l1page) != d )
                    {
                        printk("L2: Skip bizarre page belonging to other "
                               "dom %p\n", page_get_owner(l1page));
                        continue;
                    }

                    if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
                         PGT_l2_page_table )
                        printk("Audit %d: [%x] Found %s Linear PT "
                               "t=%x pfn=%lx\n", d->id, i,
                               (l1pfn==pfn) ? "Self" : "Other",
                               l1page->u.inuse.type_info,
                               l1pfn);
                    else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
                              PGT_l1_page_table )
                        printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
                               d->id, i,
                               l1page->u.inuse.type_info,
                               l1pfn);

                    adjust(l1page, -1, 1);
                }
            }

            unmap_domain_mem(pt);

            break;
        case PGT_l1_page_table:

            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
                adjust( page, -1, 1 );

            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
                printk("Audit %d: L1 not validated %x\n",
                       d->id, page->u.inuse.type_info);
#if 0
            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
                printk("Audit %d: L1 not pinned %x\n",
                       d->id, page->u.inuse.type_info);
#endif
            pt = map_domain_mem( pfn<<PAGE_SHIFT );

            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
            {
                if ( pt[i] & _PAGE_PRESENT )
                {
                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
                    struct pfn_info *l1page = &frame_table[l1pfn];

                    if ( l1pfn < 0x100 )
                    {
                        lowmem_mappings++;
                        continue;
                    }

                    if ( l1pfn > max_page )
                    {
                        io_mappings++;
                        continue;
                    }

                    if ( pt[i] & _PAGE_RW )
                    {
                        if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
                             PGT_l1_page_table ||
                             (l1page->u.inuse.type_info & PGT_type_mask) ==
                             PGT_l2_page_table )
                            printk("Audit %d: [%x] Illegal RW t=%x pfn=%lx\n",
                                   d->id, i,
                                   l1page->u.inuse.type_info,
                                   l1pfn);
                    }

                    if ( page_get_owner(l1page) != d )
                    {
                        printk("Audit %d: [%lx,%x] Skip foreign page dom=%p "
                               "pfn=%lx c=%08x t=%08x m2p=%lx\n",
                               d->id, pfn, i,
                               page_get_owner(l1page),
                               l1pfn,
                               l1page->count_info,
                               l1page->u.inuse.type_info,
                               machine_to_phys_mapping[l1pfn]);
                        continue;
                    }

                    adjust(l1page, -1, 0);
                }
            }

            unmap_domain_mem(pt);

            break;
        }

        list_ent = frame_table[pfn].list.next;
    }

    if ( (io_mappings > 0) || (lowmem_mappings > 0) )
        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
               d->id, lowmem_mappings, io_mappings);
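
    /*
     * PHASE 2: with every accounted-for reference removed, a consistent
     * page-table page should now show a type count of zero, and every page a
     * general count of one.  Anything else is reported, and
     * scan_for_pfn_remote() is used to hunt down whoever still maps the
     * offending frame.
     */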
    /* PHASE 2 */

    ctot = ttot = 0;
    list_ent = d->page_list.next;
    for ( i = 0; (list_ent != &d->page_list); i++ )
    {
        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
        page = &frame_table[pfn];

        switch ( page->u.inuse.type_info & PGT_type_mask)
        {
        case PGT_l1_page_table:
        case PGT_l2_page_table:
            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
            {
                printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
                       d->id, page->u.inuse.type_info,
                       page->tlbflush_timestamp,
                       page->count_info, pfn );
                scan_for_pfn_remote(pfn);
            }
        default:
            if ( (page->count_info & PGC_count_mask) != 1 )
            {
                printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
                       d->id,
                       page->count_info,
                       page->u.inuse.type_info,
                       page->tlbflush_timestamp, pfn );
                scan_for_pfn_remote(pfn);
            }
            break;
        }

        list_ent = frame_table[pfn].list.next;
    }
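
    /*
     * PHASE 3: undo phase 1.  The walk is repeated with adjust(..., +1, ...)
     * so the counts return to their pre-audit values, and the borrowed
     * tlbflush_timestamp field is cleared again.
     */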
    /* PHASE 3 */

    list_ent = d->page_list.next;
    for ( i = 0; (list_ent != &d->page_list); i++ )
    {
        unsigned long *pt;
        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
        page = &frame_table[pfn];

        switch ( page->u.inuse.type_info & PGT_type_mask )
        {
        case PGT_l2_page_table:
            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
                adjust( page, 1, 1 );

            pt = map_domain_mem( pfn<<PAGE_SHIFT );

            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
            {
                if ( pt[i] & _PAGE_PRESENT )
                {
                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
                    struct pfn_info *l1page = &frame_table[l1pfn];

                    if ( page_get_owner(l1page) == d )
                        adjust(l1page, 1, 1);
                }
            }

            unmap_domain_mem(pt);
            break;

        case PGT_l1_page_table:
            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
                adjust( page, 1, 1 );

            pt = map_domain_mem( pfn<<PAGE_SHIFT );

            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
            {
                if ( pt[i] & _PAGE_PRESENT )
                {
                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
                    struct pfn_info *l1page = &frame_table[l1pfn];

                    if ( (page_get_owner(l1page) != d) ||
                         (l1pfn < 0x100) || (l1pfn > max_page) )
                        continue;

                    adjust(l1page, 1, 0);
                }
            }

            unmap_domain_mem(pt);
            break;
        }

        page->tlbflush_timestamp = 0;

        list_ent = frame_table[pfn].list.next;
    }

    spin_unlock(&d->page_alloc_lock);

    adjust(&frame_table[pagetable_val(
        d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], 1, 1);

    printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );

    if ( d != current->domain )
        domain_unpause(d);
}
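
/*
 * audit_domains() simply audits every domain in turn.  audit_domains_key()
 * is a thin wrapper whose signature matches a debug-key handler, presumably
 * so the full audit can be triggered from the console key-handling code.
 */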
void audit_domains(void)
{
    struct domain *d;
    for_each_domain ( d )
        audit_domain(d);
}

void audit_domains_key(unsigned char key)
{
    audit_domains();
}

#endif