
xen/arch/x86/memory.c @ 3620:6d98eb831816

bitkeeper revision 1.1159.212.52 (41fa6980PfhDt-hKCfacnyHcFB7DNQ)

Make page allocator 64-bit safe.
Signed-off-by: keir.fraser@cl.cam.ac.uk
author kaf24@scramble.cl.cam.ac.uk
date Fri Jan 28 16:34:08 2005 +0000 (2005-01-28)
parents eef1949801b8
children fec8b1778268
1 /******************************************************************************
2 * arch/x86/memory.c
3 *
4 * Copyright (c) 2002-2004 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame might be used in none of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
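/*
 * Illustrative sketch (not part of this file): how a guest might batch
 * several (ptr, val) requests into one do_mmu_update() hypercall, as
 * described above. Only the mmu_update_t (ptr, val) layout is taken from
 * the interface used in this file; the HYPERVISOR_mmu_update() wrapper,
 * the virt_to_machine() helper and the batch size are assumptions of the
 * example.
 */
#if 0 /* example only -- guest-side code */
static void example_batch_pte_updates(l1_pgentry_t *pte[], unsigned long val[],
                                      unsigned int n)
{
    mmu_update_t req[16];
    unsigned int i, k = 0;

    for ( i = 0; i < n; i++ )
    {
        /* 'ptr' is the machine address of the PTE to update; its low bits
         * select the command (MMU_NORMAL_PT_UPDATE). 'val' is the new PTE. */
        req[k].ptr = virt_to_machine(pte[i]);
        req[k].val = val[i];
        if ( (++k == 16) || (i == (n - 1)) )
        {
            (void)HYPERVISOR_mmu_update(req, k, NULL); /* one trap per batch */
            k = 0;
        }
    }
}
#endif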
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/sched.h>
93 #include <xen/errno.h>
94 #include <xen/perfc.h>
95 #include <xen/irq.h>
96 #include <xen/softirq.h>
97 #include <asm/shadow.h>
98 #include <asm/page.h>
99 #include <asm/flushtlb.h>
100 #include <asm/io.h>
101 #include <asm/uaccess.h>
102 #include <asm/domain_page.h>
103 #include <asm/ldt.h>
105 #ifdef VERBOSE
106 #define MEM_LOG(_f, _a...) \
107 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
108 current->domain->id , __LINE__ , ## _a )
109 #else
110 #define MEM_LOG(_f, _a...) ((void)0)
111 #endif
113 static int alloc_l2_table(struct pfn_info *page);
114 static int alloc_l1_table(struct pfn_info *page);
115 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
116 static int get_page_and_type_from_pagenr(unsigned long page_nr,
117 u32 type,
118 struct domain *d);
120 static void free_l2_table(struct pfn_info *page);
121 static void free_l1_table(struct pfn_info *page);
123 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
124 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
126 /* Used to defer flushing of memory structures. */
127 static struct {
128 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
129 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
130 unsigned long deferred_ops;
131 /* If non-NULL, specifies a foreign subject domain for some operations. */
132 struct domain *foreign;
133 } __cacheline_aligned percpu_info[NR_CPUS];
135 /*
136 * Returns the current foreign domain; defaults to the currently-executing
137 * domain if a foreign override hasn't been specified.
138 */
139 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
141 /* Private domain structs for DOMID_XEN and DOMID_IO. */
142 static struct domain *dom_xen, *dom_io;
144 /* Frame table and its size in pages. */
145 struct pfn_info *frame_table;
146 unsigned long frame_table_size;
147 unsigned long max_page;
149 void __init init_frametable(void)
150 {
151 #if defined(__i386__)
152 unsigned long i, p;
153 #endif
155 frame_table_size = max_page * sizeof(struct pfn_info);
156 frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
158 #if defined(__x86_64__)
159 frame_table = __va(alloc_boot_pages(frame_table_size, 4UL << 20));
160 #elif defined(__i386__)
161 frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
163 for ( i = 0; i < frame_table_size; i += (4UL << 20) )
164 {
165 p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
166 if ( p == 0 )
167 panic("Not enough memory for frame table\n");
168 idle_pg_table[(FRAMETABLE_VIRT_START + i) >> L2_PAGETABLE_SHIFT] =
169 mk_l2_pgentry(p | __PAGE_HYPERVISOR | _PAGE_PSE);
170 }
171 #endif
173 memset(frame_table, 0, frame_table_size);
174 }
176 void arch_init_memory(void)
177 {
178 unsigned long i;
180 /*
181 * We are rather picky about the layout of 'struct pfn_info'. The
182 * count_info and domain fields must be adjacent, as we perform atomic
183 * 64-bit operations on them. Also, just for sanity, we assert the size
184 * of the structure here.
185 */
186 if ( (offsetof(struct pfn_info, u.inuse.domain) !=
187 (offsetof(struct pfn_info, count_info) + sizeof(u32))) ||
188 (sizeof(struct pfn_info) != 24) )
189 {
190 printk("Weird pfn_info layout (%ld,%ld,%d)\n",
191 offsetof(struct pfn_info, count_info),
192 offsetof(struct pfn_info, u.inuse.domain),
193 sizeof(struct pfn_info));
194 for ( ; ; ) ;
195 }
197 memset(percpu_info, 0, sizeof(percpu_info));
199 /* Initialise to a magic of 0x55555555 so that bugs are easier to spot later. */
200 memset(machine_to_phys_mapping, 0x55, 4<<20);
202 /*
203 * Initialise our DOMID_XEN domain.
204 * Any Xen-heap pages that we will allow to be mapped will have
205 * their domain field set to dom_xen.
206 */
207 dom_xen = alloc_domain_struct();
208 atomic_set(&dom_xen->refcnt, 1);
209 dom_xen->id = DOMID_XEN;
211 /*
212 * Initialise our DOMID_IO domain.
213 * This domain owns no pages but is considered a special case when
214 * mapping I/O pages, as the mappings occur at the privilege level of the caller.
215 */
216 dom_io = alloc_domain_struct();
217 atomic_set(&dom_io->refcnt, 1);
218 dom_io->id = DOMID_IO;
220 /* M2P table is mappable read-only by privileged domains. */
221 for ( i = 0; i < 1024; i++ )
222 {
223 frame_table[m2p_start_mfn+i].count_info = PGC_allocated | 1;
224 /* Use the GDT page type to ensure non-privileged domains can only
225 map it read-only. */
226 frame_table[m2p_start_mfn+i].u.inuse.type_info = PGT_gdt_page | 1;
227 frame_table[m2p_start_mfn+i].u.inuse.domain = dom_xen;
228 }
229 }
231 static void __invalidate_shadow_ldt(struct exec_domain *d)
232 {
233 int i;
234 unsigned long pfn;
235 struct pfn_info *page;
237 d->mm.shadow_ldt_mapcnt = 0;
239 for ( i = 16; i < 32; i++ )
240 {
241 pfn = l1_pgentry_to_pagenr(d->mm.perdomain_ptes[i]);
242 if ( pfn == 0 ) continue;
243 d->mm.perdomain_ptes[i] = mk_l1_pgentry(0);
244 page = &frame_table[pfn];
245 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
246 ASSERT_PAGE_IS_DOMAIN(page, d->domain);
247 put_page_and_type(page);
248 }
250 /* Dispose of the (now possibly invalid) mappings from the TLB. */
251 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
252 }
255 static inline void invalidate_shadow_ldt(struct exec_domain *d)
256 {
257 if ( d->mm.shadow_ldt_mapcnt != 0 )
258 __invalidate_shadow_ldt(d);
259 }
262 static int alloc_segdesc_page(struct pfn_info *page)
263 {
264 unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
265 int i;
267 for ( i = 0; i < 512; i++ )
268 if ( unlikely(!check_descriptor(&descs[i*2])) )
269 goto fail;
271 unmap_domain_mem(descs);
272 return 1;
274 fail:
275 unmap_domain_mem(descs);
276 return 0;
277 }
280 /* Map shadow page at offset @off. */
281 int map_ldt_shadow_page(unsigned int off)
282 {
283 struct exec_domain *ed = current;
284 struct domain *d = ed->domain;
285 unsigned long l1e;
287 if ( unlikely(in_irq()) )
288 BUG();
290 __get_user(l1e, (unsigned long *)&linear_pg_table[(ed->mm.ldt_base >>
291 PAGE_SHIFT) + off]);
293 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
294 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
295 d, PGT_ldt_page)) )
296 return 0;
298 ed->mm.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
299 ed->mm.shadow_ldt_mapcnt++;
301 return 1;
302 }
305 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
306 {
307 struct pfn_info *page = &frame_table[page_nr];
309 if ( unlikely(!pfn_is_ram(page_nr)) )
310 {
311 MEM_LOG("Pfn %08lx is not RAM", page_nr);
312 return 0;
313 }
315 if ( unlikely(!get_page(page, d)) )
316 {
317 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
318 return 0;
319 }
321 return 1;
322 }
325 static int get_page_and_type_from_pagenr(unsigned long page_nr,
326 u32 type,
327 struct domain *d)
328 {
329 struct pfn_info *page = &frame_table[page_nr];
331 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
332 return 0;
334 if ( unlikely(!get_page_type(page, type)) )
335 {
336 #ifdef VERBOSE
337 if ( (type & PGT_type_mask) != PGT_l1_page_table )
338 MEM_LOG("Bad page type for pfn %08lx (%08x)",
339 page_nr, page->u.inuse.type_info);
340 #endif
341 put_page(page);
342 return 0;
343 }
345 return 1;
346 }
349 /*
350 * We allow L2 tables to map each other (a.k.a. linear page tables). This
351 * needs some special care with reference counts and access permissions:
352 * 1. The mapping entry must be read-only, or the guest may get write access
353 * to its own PTEs.
354 * 2. We must only bump the reference counts for an *already validated*
355 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
356 * on a validation that cannot complete until this one does.
357 * 3. We only need to increment the reference counts for the mapped page
358 * frame if it is mapped by a different L2 table. This is sufficient and
359 * also necessary to allow validation of an L2 table mapping itself.
360 */
361 static int
362 get_linear_pagetable(
363 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
364 {
365 u32 x, y;
366 struct pfn_info *page;
368 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
369 {
370 MEM_LOG("Attempt to create linear p.t. with write perms");
371 return 0;
372 }
374 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
375 {
376 /* Make sure the mapped frame belongs to the correct domain. */
377 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
378 return 0;
380 /*
381 * Make sure that the mapped frame is an already-validated L2 table.
382 * If so, atomically increment the count (checking for overflow).
383 */
384 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
385 y = page->u.inuse.type_info;
386 do {
387 x = y;
388 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
389 unlikely((x & (PGT_type_mask|PGT_validated)) !=
390 (PGT_l2_page_table|PGT_validated)) )
391 {
392 put_page(page);
393 return 0;
394 }
395 }
396 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
397 }
399 return 1;
400 }
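/*
 * Illustrative sketch (not part of this file): the rules checked by
 * get_linear_pagetable() above, seen from the guest side. A guest creates a
 * linear (self-referencing) page table by pointing one of its own L2 slots
 * at its L2 frame with a read-only entry (rule 1); Xen only accepts this if
 * the target L2 is already validated (rule 2), and only takes an extra
 * reference when the target is a different L2 frame (rule 3). The slot
 * index, the flag choice and the HYPERVISOR_mmu_update()/virt_to_machine()
 * helpers are assumptions of the example.
 */
#if 0 /* example only -- guest-side code */
static int example_install_linear_pt(l2_pgentry_t *l2_base,
                                     unsigned long l2_mfn,
                                     unsigned int slot)
{
    mmu_update_t req;

    req.ptr = virt_to_machine(&l2_base[slot]);
    /* Read-only: present but no _PAGE_RW, as required by rule 1. */
    req.val = (l2_mfn << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_ACCESSED;

    return HYPERVISOR_mmu_update(&req, 1, NULL);
}
#endif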
403 static int
404 get_page_from_l1e(
405 l1_pgentry_t l1e, struct domain *d)
406 {
407 unsigned long l1v = l1_pgentry_val(l1e);
408 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
409 struct pfn_info *page = &frame_table[pfn];
410 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
412 if ( !(l1v & _PAGE_PRESENT) )
413 return 1;
415 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
416 {
417 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
418 return 0;
419 }
421 if ( unlikely(!pfn_is_ram(pfn)) )
422 {
423 /* Revert to caller privileges if FD == DOMID_IO. */
424 if ( d == dom_io )
425 d = current->domain;
427 if ( IS_PRIV(d) )
428 return 1;
430 if ( IS_CAPABLE_PHYSDEV(d) )
431 return domain_iomem_in_pfn(d, pfn);
433 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
434 return 0;
435 }
437 return ((l1v & _PAGE_RW) ?
438 get_page_and_type(page, d, PGT_writable_page) :
439 get_page(page, d));
440 }
443 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
444 static int
445 get_page_from_l2e(
446 l2_pgentry_t l2e, unsigned long pfn,
447 struct domain *d, unsigned long va_idx)
448 {
449 int rc;
451 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
452 return 1;
454 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
455 {
456 MEM_LOG("Bad L2 page type settings %04lx",
457 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
458 return 0;
459 }
461 rc = get_page_and_type_from_pagenr(
462 l2_pgentry_to_pagenr(l2e),
463 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
465 if ( unlikely(!rc) )
466 return get_linear_pagetable(l2e, pfn, d);
468 return 1;
469 }
472 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
473 {
474 unsigned long l1v = l1_pgentry_val(l1e);
475 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
476 struct pfn_info *page = &frame_table[pfn];
477 struct domain *e;
479 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
480 return;
482 e = page->u.inuse.domain;
483 if ( unlikely(e != d) )
484 {
485 /*
486 * Unmap a foreign page that may have been mapped via a grant table.
487 * Note that this can fail for a privileged domain that can map foreign
488 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
489 * counted via a grant entry and some counted directly in the page
490 * structure's reference count. Note that reference counts won't get
491 * dangerously confused as long as we always try to decrement the
492 * grant entry first. We may end up with a mismatch between which
493 * mappings and which unmappings are counted via the grant entry, but
494 * really it doesn't matter as privileged domains have carte blanche.
495 */
496 if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
497 return;
498 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
499 }
501 if ( l1v & _PAGE_RW )
502 {
503 put_page_and_type(page);
504 }
505 else
506 {
507 /* We expect this is rare so we blow the entire shadow LDT. */
508 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
509 PGT_ldt_page)) &&
510 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
511 invalidate_shadow_ldt(e->exec_domain[0]);
512 put_page(page);
513 }
514 }
517 /*
518 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
519 * Note also that this automatically deals correctly with linear p.t.'s.
520 */
521 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
522 {
523 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
524 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
525 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
526 }
529 static int alloc_l2_table(struct pfn_info *page)
530 {
531 struct domain *d = page->u.inuse.domain;
532 unsigned long page_nr = page_to_pfn(page);
533 l2_pgentry_t *pl2e;
534 int i;
536 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
538 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
539 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
540 goto fail;
542 #if defined(__i386__)
543 /* Now we add our private high mappings. */
544 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
545 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
546 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
547 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
548 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
549 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
550 mk_l2_pgentry(__pa(page->u.inuse.domain->mm_perdomain_pt) |
551 __PAGE_HYPERVISOR);
552 #endif
554 unmap_domain_mem(pl2e);
555 return 1;
557 fail:
558 while ( i-- > 0 )
559 put_page_from_l2e(pl2e[i], page_nr);
561 unmap_domain_mem(pl2e);
562 return 0;
563 }
566 static int alloc_l1_table(struct pfn_info *page)
567 {
568 struct domain *d = page->u.inuse.domain;
569 unsigned long page_nr = page_to_pfn(page);
570 l1_pgentry_t *pl1e;
571 int i;
573 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
575 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
576 if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
577 goto fail;
579 unmap_domain_mem(pl1e);
580 return 1;
582 fail:
583 while ( i-- > 0 )
584 put_page_from_l1e(pl1e[i], d);
586 unmap_domain_mem(pl1e);
587 return 0;
588 }
591 static void free_l2_table(struct pfn_info *page)
592 {
593 unsigned long page_nr = page - frame_table;
594 l2_pgentry_t *pl2e;
595 int i;
597 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
599 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
600 put_page_from_l2e(pl2e[i], page_nr);
602 unmap_domain_mem(pl2e);
603 }
606 static void free_l1_table(struct pfn_info *page)
607 {
608 struct domain *d = page->u.inuse.domain;
609 unsigned long page_nr = page - frame_table;
610 l1_pgentry_t *pl1e;
611 int i;
613 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
615 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
616 put_page_from_l1e(pl1e[i], d);
618 unmap_domain_mem(pl1e);
619 }
622 static inline int update_l2e(l2_pgentry_t *pl2e,
623 l2_pgentry_t ol2e,
624 l2_pgentry_t nl2e)
625 {
626 unsigned long o = cmpxchg((unsigned long *)pl2e,
627 l2_pgentry_val(ol2e),
628 l2_pgentry_val(nl2e));
629 if ( o != l2_pgentry_val(ol2e) )
630 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
631 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
632 return (o == l2_pgentry_val(ol2e));
633 }
636 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
637 static int mod_l2_entry(l2_pgentry_t *pl2e,
638 l2_pgentry_t nl2e,
639 unsigned long pfn)
640 {
641 l2_pgentry_t ol2e;
642 unsigned long _ol2e;
644 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
645 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
646 {
647 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
648 return 0;
649 }
651 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
652 return 0;
653 ol2e = mk_l2_pgentry(_ol2e);
655 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
656 {
657 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
658 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
659 return update_l2e(pl2e, ol2e, nl2e);
661 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
662 ((unsigned long)pl2e &
663 ~PAGE_MASK) >> 2)) )
664 return 0;
666 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
667 {
668 put_page_from_l2e(nl2e, pfn);
669 return 0;
670 }
672 put_page_from_l2e(ol2e, pfn);
673 return 1;
674 }
676 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
677 return 0;
679 put_page_from_l2e(ol2e, pfn);
680 return 1;
681 }
684 static inline int update_l1e(l1_pgentry_t *pl1e,
685 l1_pgentry_t ol1e,
686 l1_pgentry_t nl1e)
687 {
688 unsigned long o = l1_pgentry_val(ol1e);
689 unsigned long n = l1_pgentry_val(nl1e);
691 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
692 unlikely(o != l1_pgentry_val(ol1e)) )
693 {
694 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
695 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
696 return 0;
697 }
699 return 1;
700 }
703 /* Update the L1 entry at pl1e to new value nl1e. */
704 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
705 {
706 l1_pgentry_t ol1e;
707 unsigned long _ol1e;
708 struct domain *d = current->domain;
710 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
711 {
712 MEM_LOG("Bad get_user\n");
713 return 0;
714 }
716 ol1e = mk_l1_pgentry(_ol1e);
718 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
719 {
720 /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
721 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
722 return update_l1e(pl1e, ol1e, nl1e);
724 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
725 return 0;
727 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
728 {
729 put_page_from_l1e(nl1e, d);
730 return 0;
731 }
733 put_page_from_l1e(ol1e, d);
734 return 1;
735 }
737 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
738 return 0;
740 put_page_from_l1e(ol1e, d);
741 return 1;
742 }
745 int alloc_page_type(struct pfn_info *page, unsigned int type)
746 {
747 switch ( type )
748 {
749 case PGT_l1_page_table:
750 return alloc_l1_table(page);
751 case PGT_l2_page_table:
752 return alloc_l2_table(page);
753 case PGT_gdt_page:
754 case PGT_ldt_page:
755 return alloc_segdesc_page(page);
756 default:
757 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
758 type, page->u.inuse.type_info,
759 page->count_info);
760 BUG();
761 }
763 return 0;
764 }
767 void free_page_type(struct pfn_info *page, unsigned int type)
768 {
769 struct domain *d = page->u.inuse.domain;
771 switch ( type )
772 {
773 case PGT_l1_page_table:
774 free_l1_table(page);
775 break;
777 case PGT_l2_page_table:
778 free_l2_table(page);
779 break;
781 default:
782 BUG();
783 }
785 if ( unlikely(d->exec_domain[0]->mm.shadow_mode) &&
786 (get_shadow_status(&d->exec_domain[0]->mm, page_to_pfn(page)) & PSH_shadowed) )
787 {
788 unshadow_table(page_to_pfn(page), type);
789 put_shadow_status(&d->exec_domain[0]->mm);
790 }
791 }
794 void put_page_type(struct pfn_info *page)
795 {
796 u32 nx, x, y = page->u.inuse.type_info;
798 again:
799 do {
800 x = y;
801 nx = x - 1;
803 ASSERT((x & PGT_count_mask) != 0);
805 /*
806 * The page should always be validated while a reference is held. The
807 * exception is during domain destruction, when we forcibly invalidate
808 * page-table pages if we detect a referential loop.
809 * See domain.c:relinquish_list().
810 */
811 ASSERT((x & PGT_validated) ||
812 test_bit(DF_DYING, &page->u.inuse.domain->d_flags));
814 if ( unlikely((nx & PGT_count_mask) == 0) )
815 {
816 /* Record TLB information for flush later. Races are harmless. */
817 page->tlbflush_timestamp = tlbflush_current_time();
819 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
820 likely(nx & PGT_validated) )
821 {
822 /*
823 * Page-table pages must be unvalidated when count is zero. The
824 * 'free' is safe because the refcnt is non-zero and validated
825 * bit is clear => other ops will spin or fail.
826 */
827 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
828 x & ~PGT_validated)) != x) )
829 goto again;
830 /* We cleared the 'valid bit' so we do the clean-up. */
831 free_page_type(page, x & PGT_type_mask);
832 /* Carry on, but with the 'valid bit' now clear. */
833 x &= ~PGT_validated;
834 nx &= ~PGT_validated;
835 }
836 }
837 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
838 (PGT_pinned | 1)) )
839 {
840 /* Page is now only pinned. Make the back pointer mutable again. */
841 nx |= PGT_va_mutable;
842 }
843 }
844 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
845 }
848 int get_page_type(struct pfn_info *page, u32 type)
849 {
850 u32 nx, x, y = page->u.inuse.type_info;
852 again:
853 do {
854 x = y;
855 nx = x + 1;
856 if ( unlikely((nx & PGT_count_mask) == 0) )
857 {
858 MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
859 return 0;
860 }
861 else if ( unlikely((x & PGT_count_mask) == 0) )
862 {
863 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
864 {
865 /*
866 * On a type change we check whether stale TLB entries need to be
867 * flushed. This may be unnecessary (e.g., the page was a GDT/LDT)
868 * but those circumstances should be very rare.
869 */
870 struct domain *d = page->u.inuse.domain;
871 if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
872 page->tlbflush_timestamp)) )
873 {
874 perfc_incr(need_flush_tlb_flush);
875 flush_tlb_cpu(d->exec_domain[0]->processor);
876 }
878 /* We lose existing type, back pointer, and validity. */
879 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
880 nx |= type;
882 /* No special validation needed for writable pages. */
883 /* Page tables and GDT/LDT need to be scanned for validity. */
884 if ( type == PGT_writable_page )
885 nx |= PGT_validated;
886 }
887 }
888 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
889 {
890 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
891 {
892 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
893 ((type & PGT_type_mask) != PGT_l1_page_table) )
894 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
895 x & PGT_type_mask, type, page_to_pfn(page));
896 return 0;
897 }
898 else if ( (x & PGT_va_mask) == PGT_va_mutable )
899 {
900 /* The va backpointer is mutable, hence we update it. */
901 nx &= ~PGT_va_mask;
902 nx |= type; /* we know the actual type is correct */
903 }
904 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
905 {
906 /* This table is potentially mapped at multiple locations. */
907 nx &= ~PGT_va_mask;
908 nx |= PGT_va_unknown;
909 }
910 }
911 else if ( unlikely(!(x & PGT_validated)) )
912 {
913 /* Someone else is updating validation of this page. Wait... */
914 while ( (y = page->u.inuse.type_info) == x )
915 {
916 rep_nop();
917 barrier();
918 }
919 goto again;
920 }
921 }
922 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
924 if ( unlikely(!(nx & PGT_validated)) )
925 {
926 /* Try to validate page type; drop the new reference on failure. */
927 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
928 {
929 MEM_LOG("Error while validating pfn %08lx for type %08x."
930 " caf=%08x taf=%08x\n",
931 page_to_pfn(page), type,
932 page->count_info,
933 page->u.inuse.type_info);
934 /* No one else can get a reference. We hold the only ref. */
935 page->u.inuse.type_info = 0;
936 return 0;
937 }
939 /* No one else is updating simultaneously. */
940 __set_bit(_PGT_validated, &page->u.inuse.type_info);
941 }
943 return 1;
944 }
947 int new_guest_cr3(unsigned long pfn)
948 {
949 struct exec_domain *ed = current;
950 struct domain *d = ed->domain;
951 int okay, cpu = smp_processor_id();
952 unsigned long old_base_pfn;
954 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
955 if ( likely(okay) )
956 {
957 invalidate_shadow_ldt(ed);
959 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
960 old_base_pfn = pagetable_val(ed->mm.pagetable) >> PAGE_SHIFT;
961 ed->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
963 shadow_mk_pagetable(&ed->mm);
965 write_ptbase(&ed->mm);
967 put_page_and_type(&frame_table[old_base_pfn]);
968 }
969 else
970 {
971 MEM_LOG("Error while installing new baseptr %08lx", pfn);
972 }
974 return okay;
975 }
977 static int do_extended_command(unsigned long ptr, unsigned long val)
978 {
979 int okay = 1, cpu = smp_processor_id();
980 unsigned int cmd = val & MMUEXT_CMD_MASK;
981 unsigned long pfn = ptr >> PAGE_SHIFT;
982 struct pfn_info *page = &frame_table[pfn];
983 struct exec_domain *ed = current;
984 struct domain *d = ed->domain, *nd, *e;
985 u32 x, y;
986 domid_t domid;
987 grant_ref_t gntref;
989 switch ( cmd )
990 {
991 case MMUEXT_PIN_L1_TABLE:
992 case MMUEXT_PIN_L2_TABLE:
993 /*
994 * We insist that, if you pin an L1 page, it's the first thing that
995 * you do to it. This is because we require the backptr to still be
996 * mutable. This assumption seems safe.
997 */
998 okay = get_page_and_type_from_pagenr(
999 pfn,
1000 ((cmd==MMUEXT_PIN_L2_TABLE) ?
1001 PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
1002 FOREIGNDOM);
1004 if ( unlikely(!okay) )
1006 MEM_LOG("Error while pinning pfn %08lx", pfn);
1007 break;
1010 if ( unlikely(test_and_set_bit(_PGT_pinned,
1011 &page->u.inuse.type_info)) )
1013 MEM_LOG("Pfn %08lx already pinned", pfn);
1014 put_page_and_type(page);
1015 okay = 0;
1016 break;
1019 break;
1021 case MMUEXT_UNPIN_TABLE:
1022 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
1024 MEM_LOG("Page %08lx bad domain (dom=%p)",
1025 ptr, page->u.inuse.domain);
1027 else if ( likely(test_and_clear_bit(_PGT_pinned,
1028 &page->u.inuse.type_info)) )
1030 put_page_and_type(page);
1031 put_page(page);
1033 else
1035 okay = 0;
1036 put_page(page);
1037 MEM_LOG("Pfn %08lx not pinned", pfn);
1039 break;
1041 case MMUEXT_NEW_BASEPTR:
1042 okay = new_guest_cr3(pfn);
1043 break;
1045 case MMUEXT_TLB_FLUSH:
1046 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1047 break;
1049 case MMUEXT_INVLPG:
1050 __flush_tlb_one(ptr);
1051 break;
1053 case MMUEXT_FLUSH_CACHE:
1054 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1056 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1057 okay = 0;
1059 else
1061 wbinvd();
1063 break;
1065 case MMUEXT_SET_LDT:
1067 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
1068 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1069 (ents > 8192) ||
1070 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
1071 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
1073 okay = 0;
1074 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
1076 else if ( (ed->mm.ldt_ents != ents) ||
1077 (ed->mm.ldt_base != ptr) )
1079 invalidate_shadow_ldt(ed);
1080 ed->mm.ldt_base = ptr;
1081 ed->mm.ldt_ents = ents;
1082 load_LDT(ed);
1083 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1084 if ( ents != 0 )
1085 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1087 break;
1090 case MMUEXT_SET_FOREIGNDOM:
1091 domid = (domid_t)(val >> 16);
1093 if ( (e = percpu_info[cpu].foreign) != NULL )
1094 put_domain(e);
1095 percpu_info[cpu].foreign = NULL;
1097 if ( !IS_PRIV(d) )
1099 switch ( domid )
1101 case DOMID_IO:
1102 get_knownalive_domain(dom_io);
1103 percpu_info[cpu].foreign = dom_io;
1104 break;
1105 default:
1106 MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
1107 okay = 0;
1108 break;
1111 else
1113 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1114 if ( e == NULL )
1116 switch ( domid )
1118 case DOMID_XEN:
1119 get_knownalive_domain(dom_xen);
1120 percpu_info[cpu].foreign = dom_xen;
1121 break;
1122 case DOMID_IO:
1123 get_knownalive_domain(dom_io);
1124 percpu_info[cpu].foreign = dom_io;
1125 break;
1126 default:
1127 MEM_LOG("Unknown domain '%u'", domid);
1128 okay = 0;
1129 break;
1133 break;
1135 case MMUEXT_TRANSFER_PAGE:
1136 domid = (domid_t)(val >> 16);
1137 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
1139 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
1140 unlikely(!pfn_is_ram(pfn)) ||
1141 unlikely((e = find_domain_by_id(domid)) == NULL) )
1143 MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
1144 okay = 0;
1145 break;
1148 spin_lock(&d->page_alloc_lock);
1150 /*
1151 * The tricky bit: atomically release ownership while there is just one
1152 * benign reference to the page (PGC_allocated). If that reference
1153 * disappears then the deallocation routine will safely spin.
1154 */
1155 nd = page->u.inuse.domain;
1156 y = page->count_info;
1157 do {
1158 x = y;
1159 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1160 (1|PGC_allocated)) ||
1161 unlikely(nd != d) )
1163 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1164 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1165 d, d->id, nd, x, page->u.inuse.type_info);
1166 spin_unlock(&d->page_alloc_lock);
1167 put_domain(e);
1168 return 0;
1170 __asm__ __volatile__(
1171 LOCK_PREFIX "cmpxchg8b %2"
1172 : "=d" (nd), "=a" (y),
1173 "=m" (*(volatile u64 *)(&page->count_info))
1174 : "0" (d), "1" (x), "c" (NULL), "b" (x) );
1176 while ( unlikely(nd != d) || unlikely(y != x) );
1178 /*
1179 * Unlink from 'd'. At least one reference remains (now anonymous), so
1180 * no one else is spinning to try to delete this page from 'd'.
1181 */
1182 d->tot_pages--;
1183 list_del(&page->list);
1185 spin_unlock(&d->page_alloc_lock);
1187 spin_lock(&e->page_alloc_lock);
1189 /*
1190 * Check that 'e' will accept the page and has reservation headroom.
1191 * Also, a domain mustn't have PGC_allocated pages when it is dying.
1192 */
1193 ASSERT(e->tot_pages <= e->max_pages);
1194 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1195 unlikely(e->tot_pages == e->max_pages) ||
1196 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
1198 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1199 "provided a bad grant ref, or is dying (%08lx).\n",
1200 e->tot_pages, e->max_pages, e->d_flags);
1201 spin_unlock(&e->page_alloc_lock);
1202 put_domain(e);
1203 okay = 0;
1204 break;
1207 /* Okay, add the page to 'e'. */
1208 if ( unlikely(e->tot_pages++ == 0) )
1209 get_knownalive_domain(e);
1210 list_add_tail(&page->list, &e->page_list);
1211 page->u.inuse.domain = e;
1213 spin_unlock(&e->page_alloc_lock);
1215 /* Transfer is all done: tell the guest about its new page frame. */
1216 gnttab_notify_transfer(e, gntref, pfn);
1218 put_domain(e);
1219 break;
1221 case MMUEXT_REASSIGN_PAGE:
1222 if ( unlikely(!IS_PRIV(d)) )
1224 MEM_LOG("Dom %u has no reassignment priv", d->id);
1225 okay = 0;
1226 break;
1229 e = percpu_info[cpu].foreign;
1230 if ( unlikely(e == NULL) )
1232 MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
1233 okay = 0;
1234 break;
1237 /*
1238 * Grab both page_list locks, in order. This prevents the page from
1239 * disappearing elsewhere while we modify the owner, and we'll need
1240 * both locks if we're successful so that we can change lists.
1241 */
1242 if ( d < e )
1244 spin_lock(&d->page_alloc_lock);
1245 spin_lock(&e->page_alloc_lock);
1247 else
1249 spin_lock(&e->page_alloc_lock);
1250 spin_lock(&d->page_alloc_lock);
1253 /* A domain shouldn't have PGC_allocated pages when it is dying. */
1254 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1255 unlikely(IS_XEN_HEAP_FRAME(page)) )
1257 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
1258 okay = 0;
1259 goto reassign_fail;
1262 /*
1263 * The tricky bit: atomically change owner while there is just one
1264 * benign reference to the page (PGC_allocated). If that reference
1265 * disappears then the deallocation routine will safely spin.
1266 */
1267 nd = page->u.inuse.domain;
1268 y = page->count_info;
1269 do {
1270 x = y;
1271 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1272 (1|PGC_allocated)) ||
1273 unlikely(nd != d) )
1275 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1276 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1277 d, d->id, nd, x, page->u.inuse.type_info);
1278 okay = 0;
1279 goto reassign_fail;
1281 __asm__ __volatile__(
1282 LOCK_PREFIX "cmpxchg8b %3"
1283 : "=d" (nd), "=a" (y), "=c" (e),
1284 "=m" (*(volatile u64 *)(&page->count_info))
1285 : "0" (d), "1" (x), "c" (e), "b" (x) );
1287 while ( unlikely(nd != d) || unlikely(y != x) );
1289 /*
1290 * Unlink from 'd'. We transferred at least one reference to 'e', so
1291 * no one else is spinning to try to delete this page from 'd'.
1292 */
1293 d->tot_pages--;
1294 list_del(&page->list);
1296 /*
1297 * Add the page to 'e'. Someone may already have removed the last
1298 * reference and want to remove the page from 'e'. However, we have
1299 * the lock so they'll spin waiting for us.
1300 */
1301 if ( unlikely(e->tot_pages++ == 0) )
1302 get_knownalive_domain(e);
1303 list_add_tail(&page->list, &e->page_list);
1305 reassign_fail:
1306 spin_unlock(&d->page_alloc_lock);
1307 spin_unlock(&e->page_alloc_lock);
1308 break;
1310 case MMUEXT_CLEAR_FOREIGNDOM:
1311 if ( (e = percpu_info[cpu].foreign) != NULL )
1312 put_domain(e);
1313 percpu_info[cpu].foreign = NULL;
1314 break;
1316 default:
1317 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
1318 okay = 0;
1319 break;
1322 return okay;
1325 int do_mmu_update(
1326 mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
1328 /*
1329 * We steal the m.s.b. of the @count parameter to indicate whether this
1330 * invocation of do_mmu_update() is resuming a previously preempted call.
1331 * We steal the next 15 bits to remember the current FOREIGNDOM.
1332 */
1333 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
1334 #define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
1335 #define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
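/*
 * Worked example of the encoding above (assuming a 32-bit int): resuming
 * with 100 requests still to do on behalf of FOREIGNDOM id 5 passes
 *     count = 100 | (5 << MMU_UPDATE_PREEMPT_FDOM_SHIFT) | MMU_UPDATE_PREEMPTED
 *           = 0x80050064
 * i.e. the top bit marks the resumption and bits 16-30 carry the domain id.
 */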
1337 mmu_update_t req;
1338 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
1339 struct pfn_info *page;
1340 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
1341 unsigned int cmd, done = 0;
1342 unsigned long prev_spfn = 0;
1343 l1_pgentry_t *prev_spl1e = 0;
1344 struct exec_domain *ed = current;
1345 struct domain *d = ed->domain;
1346 u32 type_info;
1347 domid_t domid;
1349 LOCK_BIGLOCK(d);
1351 cleanup_writable_pagetable(d);
1353 /*
1354 * If we are resuming after preemption, read how much work we have already
1355 * done. This allows us to set the @done output parameter correctly.
1356 * We also reset FOREIGNDOM here.
1357 */
1358 if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
1360 if ( !(count & MMU_UPDATE_PREEMPTED) )
1362 /* Count overflow into private FOREIGNDOM field. */
1363 MEM_LOG("do_mmu_update count is too large");
1364 rc = -EINVAL;
1365 goto out;
1367 count &= ~MMU_UPDATE_PREEMPTED;
1368 domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
1369 count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
1370 if ( unlikely(pdone != NULL) )
1371 (void)get_user(done, pdone);
1372 if ( (domid != current->domain->id) &&
1373 !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
1375 rc = -EINVAL;
1376 goto out;
1380 perfc_incrc(calls_to_mmu_update);
1381 perfc_addc(num_page_updates, count);
1383 if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
1385 rc = -EFAULT;
1386 goto out;
1389 for ( i = 0; i < count; i++ )
1391 if ( hypercall_preempt_check() )
1393 rc = hypercall_create_continuation(
1394 __HYPERVISOR_mmu_update, 3, ureqs,
1395 (count - i) |
1396 (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) |
1397 MMU_UPDATE_PREEMPTED, pdone);
1398 break;
1401 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
1403 MEM_LOG("Bad __copy_from_user");
1404 rc = -EFAULT;
1405 break;
1408 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
1409 pfn = req.ptr >> PAGE_SHIFT;
1411 okay = 0;
1413 switch ( cmd )
1415 /*
1416 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
1417 */
1418 case MMU_NORMAL_PT_UPDATE:
1419 if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
1421 MEM_LOG("Could not get page for normal update");
1422 break;
1425 if ( likely(prev_pfn == pfn) )
1427 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
1429 else
1431 if ( prev_pfn != 0 )
1432 unmap_domain_mem((void *)va);
1433 va = (unsigned long)map_domain_mem(req.ptr);
1434 prev_pfn = pfn;
1437 page = &frame_table[pfn];
1438 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
1440 case PGT_l1_page_table:
1441 if ( likely(get_page_type(
1442 page, type_info & (PGT_type_mask|PGT_va_mask))) )
1444 okay = mod_l1_entry((l1_pgentry_t *)va,
1445 mk_l1_pgentry(req.val));
1447 if ( unlikely(ed->mm.shadow_mode) && okay &&
1448 (get_shadow_status(&ed->mm, page-frame_table) &
1449 PSH_shadowed) )
1451 shadow_l1_normal_pt_update(
1452 req.ptr, req.val, &prev_spfn, &prev_spl1e);
1453 put_shadow_status(&ed->mm);
1456 put_page_type(page);
1458 break;
1459 case PGT_l2_page_table:
1460 if ( likely(get_page_type(page, PGT_l2_page_table)) )
1462 okay = mod_l2_entry((l2_pgentry_t *)va,
1463 mk_l2_pgentry(req.val),
1464 pfn);
1466 if ( unlikely(ed->mm.shadow_mode) && okay &&
1467 (get_shadow_status(&ed->mm, page-frame_table) &
1468 PSH_shadowed) )
1470 shadow_l2_normal_pt_update(req.ptr, req.val);
1471 put_shadow_status(&ed->mm);
1474 put_page_type(page);
1476 break;
1477 default:
1478 if ( likely(get_page_type(page, PGT_writable_page)) )
1480 *(unsigned long *)va = req.val;
1481 okay = 1;
1482 put_page_type(page);
1484 break;
1487 put_page(page);
1488 break;
1490 case MMU_MACHPHYS_UPDATE:
1491 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
1493 MEM_LOG("Could not get page for mach->phys update");
1494 break;
1497 machine_to_phys_mapping[pfn] = req.val;
1498 okay = 1;
1500 /*
1501 * If in log-dirty mode, mark the corresponding pseudo-physical
1502 * page as dirty.
1503 */
1504 if ( unlikely(ed->mm.shadow_mode == SHM_logdirty) &&
1505 mark_dirty(&ed->mm, pfn) )
1506 ed->mm.shadow_dirty_block_count++;
1508 put_page(&frame_table[pfn]);
1509 break;
1511 /*
1512 * MMU_EXTENDED_COMMAND: Extended command is specified
1513 * in the least-significant bits of the 'value' field.
1514 */
1515 case MMU_EXTENDED_COMMAND:
1516 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
1517 okay = do_extended_command(req.ptr, req.val);
1518 break;
1520 default:
1521 MEM_LOG("Invalid page update command %08lx", req.ptr);
1522 break;
1525 if ( unlikely(!okay) )
1527 rc = -EINVAL;
1528 break;
1531 ureqs++;
1534 out:
1535 if ( prev_pfn != 0 )
1536 unmap_domain_mem((void *)va);
1538 if ( unlikely(prev_spl1e != 0) )
1539 unmap_domain_mem((void *)prev_spl1e);
1541 deferred_ops = percpu_info[cpu].deferred_ops;
1542 percpu_info[cpu].deferred_ops = 0;
1544 if ( deferred_ops & DOP_FLUSH_TLB )
1545 local_flush_tlb();
1547 if ( deferred_ops & DOP_RELOAD_LDT )
1548 (void)map_ldt_shadow_page(0);
1550 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1552 put_domain(percpu_info[cpu].foreign);
1553 percpu_info[cpu].foreign = NULL;
1556 /* Add incremental work we have done to the @done output parameter. */
1557 if ( unlikely(pdone != NULL) )
1558 __put_user(done + i, pdone);
1560 UNLOCK_BIGLOCK(d);
1561 return rc;
1565 int do_update_va_mapping(unsigned long page_nr,
1566 unsigned long val,
1567 unsigned long flags)
1569 struct exec_domain *ed = current;
1570 struct domain *d = ed->domain;
1571 int err = 0;
1572 unsigned int cpu = ed->processor;
1573 unsigned long deferred_ops;
1575 perfc_incrc(calls_to_update_va);
1577 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
1578 return -EINVAL;
1580 LOCK_BIGLOCK(d);
1582 cleanup_writable_pagetable(d);
1584 /*
1585 * XXX When we make this support 4MB superpages we should also deal with
1586 * the case of updating L2 entries.
1587 */
1589 if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
1590 mk_l1_pgentry(val))) )
1591 err = -EINVAL;
1593 if ( unlikely(ed->mm.shadow_mode) )
1595 unsigned long sval;
1597 l1pte_propagate_from_guest(&ed->mm, &val, &sval);
1599 if ( unlikely(__put_user(sval, ((unsigned long *)(
1600 &shadow_linear_pg_table[page_nr])))) )
1602 /*
1603 * Since L2's are guaranteed RW, failure indicates the page was not
1604 * shadowed, so ignore.
1605 */
1606 perfc_incrc(shadow_update_va_fail);
1609 /*
1610 * If we're in log-dirty mode then we need to note that we've updated
1611 * the PTE in the PT-holding page. We need the machine frame number
1612 * for this.
1613 */
1614 if ( ed->mm.shadow_mode == SHM_logdirty )
1615 mark_dirty(&current->mm, va_to_l1mfn(page_nr << PAGE_SHIFT));
1617 check_pagetable(&ed->mm, ed->mm.pagetable, "va"); /* debug */
1620 deferred_ops = percpu_info[cpu].deferred_ops;
1621 percpu_info[cpu].deferred_ops = 0;
1623 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
1624 unlikely(flags & UVMF_FLUSH_TLB) )
1625 local_flush_tlb();
1626 else if ( unlikely(flags & UVMF_INVLPG) )
1627 __flush_tlb_one(page_nr << PAGE_SHIFT);
1629 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
1630 (void)map_ldt_shadow_page(0);
1632 UNLOCK_BIGLOCK(d);
1634 return err;
1637 int do_update_va_mapping_otherdomain(unsigned long page_nr,
1638 unsigned long val,
1639 unsigned long flags,
1640 domid_t domid)
1642 unsigned int cpu = smp_processor_id();
1643 struct domain *d;
1644 int rc;
1646 if ( unlikely(!IS_PRIV(current->domain)) )
1647 return -EPERM;
1649 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
1650 if ( unlikely(d == NULL) )
1652 MEM_LOG("Unknown domain '%u'", domid);
1653 return -ESRCH;
1656 rc = do_update_va_mapping(page_nr, val, flags);
1658 put_domain(d);
1659 percpu_info[cpu].foreign = NULL;
1661 return rc;
1666 /*************************
1667 * Writable Pagetables
1668 */
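/*
 * Summary of the flow implemented below (derived from ptwr_do_page_fault()
 * and ptwr_flush()): a write fault on a read-only L1 page-table page causes
 * that page to be disconnected from the current address space (ACTIVE case),
 * snapshotted into ptwr_info[], and temporarily made writable to the guest.
 * At the next flush point the page is write-protected again and only the
 * entries that differ from the snapshot are re-validated via
 * get_page_from_l1e()/put_page_from_l1e().
 */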
1670 ptwr_info_t ptwr_info[NR_CPUS];
1672 #ifdef VERBOSE
1673 int ptwr_debug = 0x0;
1674 #define PTWR_PRINTK(_f, _a...) \
1675 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
1676 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
1677 #else
1678 #define PTWR_PRINTK(_f, _a...) ((void)0)
1679 #endif
1681 /* Flush the given writable p.t. page and write-protect it again. */
1682 void ptwr_flush(const int which)
1684 unsigned long sstat, spte, pte, *ptep, l1va;
1685 l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e;
1686 l2_pgentry_t *pl2e;
1687 int i, cpu = smp_processor_id();
1688 struct exec_domain *ed = current;
1689 struct domain *d = ed->domain;
1691 l1va = ptwr_info[cpu].ptinfo[which].l1va;
1692 ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
1694 /*
1695 * STEP 1. Write-protect the p.t. page so no more updates can occur.
1696 */
1698 if ( unlikely(__get_user(pte, ptep)) )
1700 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1701 /*
1702 * Really a bug. We could read this PTE during the initial fault,
1703 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
1704 */
1705 BUG();
1707 PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
1708 PTWR_PRINT_WHICH, ptep, pte);
1709 pte &= ~_PAGE_RW;
1711 if ( unlikely(ed->mm.shadow_mode) )
1713 /* Write-protect the p.t. page in the shadow page table. */
1714 l1pte_propagate_from_guest(&ed->mm, &pte, &spte);
1715 __put_user(
1716 spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
1718 /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
1719 sstat = get_shadow_status(&ed->mm, pte >> PAGE_SHIFT);
1720 if ( sstat & PSH_shadowed )
1721 sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
1724 /* Write-protect the p.t. page in the guest page table. */
1725 if ( unlikely(__put_user(pte, ptep)) )
1727 MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
1728 /*
1729 * Really a bug. We could write this PTE during the initial fault,
1730 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
1731 */
1732 BUG();
1735 /* Ensure that there are no stale writable mappings in any TLB. */
1736 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
1737 #if 1
1738 __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
1739 #else
1740 flush_tlb_all();
1741 #endif
1742 PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
1743 PTWR_PRINT_WHICH, ptep, pte);
1745 /*
1746 * STEP 2. Validate any modified PTEs.
1747 */
1749 pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
1750 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1752 ol1e = ptwr_info[cpu].ptinfo[which].page[i];
1753 nl1e = pl1e[i];
1755 if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
1756 continue;
1758 /*
1759 * Fast path for PTEs that have merely been write-protected
1760 * (e.g., during a Unix fork()). A strict reduction in privilege.
1761 */
1762 if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
1764 if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
1766 if ( unlikely(sl1e != NULL) )
1767 l1pte_propagate_from_guest(
1768 &ed->mm, &l1_pgentry_val(nl1e),
1769 &l1_pgentry_val(sl1e[i]));
1770 put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
1772 continue;
1775 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
1777 MEM_LOG("ptwr: Could not re-validate l1 page\n");
1778 /*
1779 * Make the remaining p.t's consistent before crashing, so the
1780 * reference counts are correct.
1781 */
1782 memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
1783 (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
1784 unmap_domain_mem(pl1e);
1785 ptwr_info[cpu].ptinfo[which].l1va = 0;
1786 UNLOCK_BIGLOCK(d);
1787 domain_crash();
1790 if ( unlikely(sl1e != NULL) )
1791 l1pte_propagate_from_guest(
1792 &ed->mm, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
1794 if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
1795 put_page_from_l1e(ol1e, d);
1797 unmap_domain_mem(pl1e);
1799 /*
1800 * STEP 3. Reattach the L1 p.t. page into the current address space.
1801 */
1803 if ( (which == PTWR_PT_ACTIVE) && likely(!ed->mm.shadow_mode) )
1805 pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
1806 *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
1809 /*
1810 * STEP 4. Final tidy-up.
1811 */
1813 ptwr_info[cpu].ptinfo[which].l1va = 0;
1815 if ( unlikely(sl1e != NULL) )
1817 unmap_domain_mem(sl1e);
1818 put_shadow_status(&ed->mm);
1822 /* Write page fault handler: check if guest is trying to modify a PTE. */
1823 int ptwr_do_page_fault(unsigned long addr)
1825 unsigned long pte, pfn, l2e;
1826 struct pfn_info *page;
1827 l2_pgentry_t *pl2e;
1828 int which, cpu = smp_processor_id();
1829 u32 l2_idx;
1831 /*
1832 * Attempt to read the PTE that maps the VA being accessed. By checking for
1833 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
1834 */
1835 if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
1836 _PAGE_PRESENT) ||
1837 __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
1839 return 0;
1842 pfn = pte >> PAGE_SHIFT;
1843 page = &frame_table[pfn];
1845 /* We are looking only for read-only mappings of p.t. pages. */
1846 if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
1847 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
1849 return 0;
1852 /* Get the L2 index at which this L1 p.t. is always mapped. */
1853 l2_idx = page->u.inuse.type_info & PGT_va_mask;
1854 if ( unlikely(l2_idx >= PGT_va_unknown) )
1856 domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
1858 l2_idx >>= PGT_va_shift;
1860 if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
1862 MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
1863 domain_crash();
1866 /*
1867 * Is the L1 p.t. mapped into the current address space? If so we call it
1868 * an ACTIVE p.t., otherwise it is INACTIVE.
1869 */
1870 pl2e = &linear_l2_table[l2_idx];
1871 l2e = l2_pgentry_val(*pl2e);
1872 which = PTWR_PT_INACTIVE;
1873 if ( (l2e >> PAGE_SHIFT) == pfn )
1875 /* Check the PRESENT bit to set ACTIVE. */
1876 if ( likely(l2e & _PAGE_PRESENT) )
1877 which = PTWR_PT_ACTIVE;
1878 else {
1879 /*
1880 * If the PRESENT bit is clear, we may be conflicting with
1881 * the current ACTIVE p.t. (it may be the same p.t. mapped
1882 * at another virt addr).
1883 * The ptwr_flush call below will restore the PRESENT bit.
1884 */
1885 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
1886 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
1887 which = PTWR_PT_ACTIVE;
1891 PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
1892 "pfn %08lx\n", PTWR_PRINT_WHICH,
1893 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
1895 /*
1896 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
1897 * time. If there is already one, we must flush it out.
1898 */
1899 if ( ptwr_info[cpu].ptinfo[which].l1va )
1900 ptwr_flush(which);
1902 ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
1903 ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
1905 /* For safety, disconnect the L1 p.t. page from current space. */
1906 if ( (which == PTWR_PT_ACTIVE) && likely(!current->mm.shadow_mode) )
1908 *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
1909 #if 1
1910 flush_tlb(); /* XXX Multi-CPU guests? */
1911 #else
1912 flush_tlb_all();
1913 #endif
1916 /* Temporarily map the L1 page, and make a copy of it. */
1917 ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
1918 memcpy(ptwr_info[cpu].ptinfo[which].page,
1919 ptwr_info[cpu].ptinfo[which].pl1e,
1920 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
1922 /* Finally, make the p.t. page writable by the guest OS. */
1923 pte |= _PAGE_RW;
1924 PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
1925 &linear_pg_table[addr>>PAGE_SHIFT], pte);
1926 if ( unlikely(__put_user(pte, (unsigned long *)
1927 &linear_pg_table[addr>>PAGE_SHIFT])) )
1929 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
1930 &linear_pg_table[addr>>PAGE_SHIFT]);
1931 /* Toss the writable pagetable state and crash. */
1932 unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
1933 ptwr_info[cpu].ptinfo[which].l1va = 0;
1934 domain_crash();
1937 return EXCRET_fault_fixed;
1940 static __init int ptwr_init(void)
1942 int i;
1944 for ( i = 0; i < smp_num_cpus; i++ )
1946 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
1947 (void *)alloc_xenheap_page();
1948 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
1949 (void *)alloc_xenheap_page();
1952 return 0;
1954 __initcall(ptwr_init);
1959 /************************************************************************/
1960 /************************************************************************/
1961 /************************************************************************/
1963 #ifndef NDEBUG
1965 void ptwr_status(void)
1967 unsigned long pte, *ptep, pfn;
1968 struct pfn_info *page;
1969 int cpu = smp_processor_id();
1971 ptep = (unsigned long *)&linear_pg_table
1972 [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
1974 if ( __get_user(pte, ptep) ) {
1975 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1976 domain_crash();
1979 pfn = pte >> PAGE_SHIFT;
1980 page = &frame_table[pfn];
1981 printk("need to alloc l1 page %p\n", page);
1982 /* make pt page writable */
1983 printk("need to make read-only l1-page at %p is %08lx\n",
1984 ptep, pte);
1986 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
1987 return;
1989 if ( __get_user(pte, (unsigned long *)
1990 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
1991 MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
1992 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
1993 domain_crash();
1995 pfn = pte >> PAGE_SHIFT;
1996 page = &frame_table[pfn];
1999 void audit_domain(struct domain *d)
2001 int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
2003 void adjust (struct pfn_info *page, int dir, int adjtype)
2005 int count = page->count_info & PGC_count_mask;
2007 if ( adjtype )
2009 int tcount = page->u.inuse.type_info & PGT_count_mask;
2011 ttot++;
2013 tcount += dir;
2015 if ( tcount < 0 )
2017 /* This will only come out once. */
2018 printk("Audit %d: type count whent below zero pfn=%x "
2019 "taf=%x otaf=%x\n",
2020 d->id, page-frame_table,
2021 page->u.inuse.type_info,
2022 page->tlbflush_timestamp);
2025 page->u.inuse.type_info =
2026 (page->u.inuse.type_info & ~PGT_count_mask) |
2027 (tcount & PGT_count_mask);
2030 ctot++;
2031 count += dir;
2032 if ( count < 0 )
2034 /* This will only come out once. */
2035 printk("Audit %d: general count whent below zero pfn=%x "
2036 "taf=%x otaf=%x\n",
2037 d->id, page-frame_table,
2038 page->u.inuse.type_info,
2039 page->tlbflush_timestamp);
2042 page->count_info =
2043 (page->count_info & ~PGC_count_mask) |
2044 (count & PGC_count_mask);
2048 void scan_for_pfn(struct domain *d, unsigned long xpfn)
2049 {
2050 unsigned long pfn, *pt;
2051 struct list_head *list_ent;
2052 struct pfn_info *page;
2053 int i;
2055 list_ent = d->page_list.next;
2056 for ( i = 0; (list_ent != &d->page_list); i++ )
2057 {
2058 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2059 page = &frame_table[pfn];
2061 switch ( page->u.inuse.type_info & PGT_type_mask )
2062 {
2063 case PGT_l1_page_table:
2064 case PGT_l2_page_table:
2065 pt = map_domain_mem(pfn<<PAGE_SHIFT);
2066 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2067 if ( (pt[i] & _PAGE_PRESENT) &&
2068 ((pt[i] >> PAGE_SHIFT) == xpfn) )
2069 printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
2070 d->id, i, pfn, page->u.inuse.type_info,
2071 page->count_info);
2072 unmap_domain_mem(pt);
2073 }
2075 list_ent = frame_table[pfn].list.next;
2076 }
2078 }
2080 void scan_for_pfn_remote(unsigned long xpfn)
2081 {
2082 struct domain *e;
2083 for_each_domain ( e )
2084 scan_for_pfn( e, xpfn );
2085 }
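/*
 * End of the nested helper functions (a GCC extension); the declarations
 * below are the locals of audit_domain()'s main body.
 */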
2087 int i;
2088 unsigned long pfn;
2089 struct list_head *list_ent;
2090 struct pfn_info *page;
2092 if ( d != current->domain )
2093 domain_pause(d);
2094 synchronise_pagetables(~0UL);
2096 printk("pt base=%lx sh_info=%x\n",
2097 pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT,
2098 virt_to_page(d->shared_info)-frame_table);
2100 spin_lock(&d->page_alloc_lock);
2102 /* PHASE 0 */
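/*
 * PHASE 0: sanity checks. Every page on d->page_list must belong to d;
 * warn if a page's type count exceeds its general count, or if an
 * untyped page has a type count above one. The original type_info is
 * stashed in tlbflush_timestamp so later phases can report it.
 */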
2104 list_ent = d->page_list.next;
2105 for ( i = 0; (list_ent != &d->page_list); i++ )
2106 {
2107 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2108 page = &frame_table[pfn];
2110 if ( page->u.inuse.domain != d )
2111 BUG();
2113 if ( (page->u.inuse.type_info & PGT_count_mask) >
2114 (page->count_info & PGC_count_mask) )
2115 printk("taf > caf %x %x pfn=%lx\n",
2116 page->u.inuse.type_info, page->count_info, pfn );
2118 #if 0 /* SYSV shared memory pages plus writeable files. */
2119 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
2120 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2121 {
2122 printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
2123 pfn,
2124 page->u.inuse.type_info,
2125 page->count_info );
2126 scan_for_pfn_remote(pfn);
2127 }
2128 #endif
2129 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
2130 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2131 {
2132 printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
2133 pfn,
2134 page->u.inuse.type_info,
2135 page->count_info );
2136 }
2138 /* Use tlbflush_timestamp to store original type_info. */
2139 page->tlbflush_timestamp = page->u.inuse.type_info;
2141 list_ent = frame_table[pfn].list.next;
2142 }
2145 /* PHASE 1 */
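/*
 * PHASE 1: subtract from each page's counts the references accounted
 * for by the page-table base, by pinned L1/L2 tables, and by every
 * present page-table entry. For a consistent domain this leaves every
 * page-table page with a zero type count and every page with a general
 * count of exactly one, which is what PHASE 2 checks.
 */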
2147 adjust(&frame_table[pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT], -1, 1);
2149 list_ent = d->page_list.next;
2150 for ( i = 0; (list_ent != &d->page_list); i++ )
2151 {
2152 unsigned long *pt;
2153 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2154 page = &frame_table[pfn];
2156 if ( page->u.inuse.domain != d )
2157 BUG();
2159 switch ( page->u.inuse.type_info & PGT_type_mask )
2160 {
2161 case PGT_l2_page_table:
2163 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2164 printk("Audit %d: L2 not validated %x\n",
2165 d->id, page->u.inuse.type_info);
2167 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2168 printk("Audit %d: L2 not pinned %x\n",
2169 d->id, page->u.inuse.type_info);
2170 else
2171 adjust( page, -1, 1 );
2173 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2175 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2176 {
2177 if ( pt[i] & _PAGE_PRESENT )
2178 {
2179 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2180 struct pfn_info *l1page = &frame_table[l1pfn];
2182 if ( l1page->u.inuse.domain != d )
2183 {
2184 printk("L2: Skip bizarre page belonging to other "
2185 "dom %p\n", l1page->u.inuse.domain);
2186 continue;
2187 }
2189 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2190 PGT_l2_page_table )
2191 printk("Audit %d: [%x] Found %s Linear PT "
2192 "t=%x pfn=%lx\n", d->id, i,
2193 (l1pfn==pfn) ? "Self" : "Other",
2194 l1page->u.inuse.type_info,
2195 l1pfn);
2196 else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
2197 PGT_l1_page_table )
2198 printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
2199 d->id, i,
2200 l1page->u.inuse.type_info,
2201 l1pfn);
2203 adjust(l1page, -1, 1);
2204 }
2205 }
2207 unmap_domain_mem(pt);
2209 break;
2212 case PGT_l1_page_table:
2214 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2215 adjust( page, -1, 1 );
2217 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2218 printk("Audit %d: L1 not validated %x\n",
2219 d->id, page->u.inuse.type_info);
2220 #if 0
2221 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2222 printk("Audit %d: L1 not pinned %x\n",
2223 d->id, page->u.inuse.type_info);
2224 #endif
2225 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2227 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2228 {
2229 if ( pt[i] & _PAGE_PRESENT )
2230 {
2231 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2232 struct pfn_info *l1page = &frame_table[l1pfn];
2234 if ( l1pfn < 0x100 )
2235 {
2236 lowmem_mappings++;
2237 continue;
2238 }
2240 if ( l1pfn > max_page )
2241 {
2242 io_mappings++;
2243 continue;
2244 }
2246 if ( pt[i] & _PAGE_RW )
2247 {
2249 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2250 PGT_l1_page_table ||
2251 (l1page->u.inuse.type_info & PGT_type_mask) ==
2252 PGT_l2_page_table )
2253 printk("Audit %d: [%x] Illegal RW t=%x pfn=%lx\n",
2254 d->id, i,
2255 l1page->u.inuse.type_info,
2256 l1pfn);
2258 }
2260 if ( l1page->u.inuse.domain != d )
2261 {
2262 printk("Audit %d: [%lx,%x] Skip foreign page dom=%lx "
2263 "pfn=%lx c=%08x t=%08x m2p=%lx\n",
2264 d->id, pfn, i,
2265 (unsigned long)l1page->u.inuse.domain,
2266 l1pfn,
2267 l1page->count_info,
2268 l1page->u.inuse.type_info,
2269 machine_to_phys_mapping[l1pfn]);
2270 continue;
2271 }
2273 adjust(l1page, -1, 0);
2274 }
2275 }
2277 unmap_domain_mem(pt);
2279 break;
2280 }
2282 list_ent = frame_table[pfn].list.next;
2283 }
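/*
 * Mappings of low memory (frames below 0x100) and of frames beyond
 * max_page (e.g. I/O space) are not audited above; they are only
 * counted and summarised here.
 */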
2285 if ( (io_mappings > 0) || (lowmem_mappings > 0) )
2286 printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
2287 d->id, lowmem_mappings, io_mappings);
2289 /* PHASE 2 */
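/*
 * PHASE 2: report anomalies. Any L1/L2 page-table page whose type count
 * did not reach zero, or any page whose general count is not exactly
 * one, still has references PHASE 1 could not account for;
 * scan_for_pfn_remote() then searches every domain's page tables for
 * the culprit mappings.
 */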
2291 ctot = ttot = 0;
2292 list_ent = d->page_list.next;
2293 for ( i = 0; (list_ent != &d->page_list); i++ )
2294 {
2295 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2296 page = &frame_table[pfn];
2298 switch ( page->u.inuse.type_info & PGT_type_mask)
2299 {
2300 case PGT_l1_page_table:
2301 case PGT_l2_page_table:
2302 if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
2303 {
2304 printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
2305 d->id, page->u.inuse.type_info,
2306 page->tlbflush_timestamp,
2307 page->count_info, pfn );
2308 scan_for_pfn_remote(pfn);
2309 }
2310 default:
2311 if ( (page->count_info & PGC_count_mask) != 1 )
2312 {
2313 printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
2314 d->id,
2315 page->count_info,
2316 page->u.inuse.type_info,
2317 page->tlbflush_timestamp, pfn );
2318 scan_for_pfn_remote(pfn);
2319 }
2320 break;
2321 }
2323 list_ent = frame_table[pfn].list.next;
2324 }
2326 /* PHASE 3 */
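/*
 * PHASE 3: undo the PHASE 1 adjustments for pinned tables and present
 * page-table entries, and clear the tlbflush_timestamp scratch values.
 * The reference on the page-table base itself is restored after the
 * page_alloc_lock is dropped, just before the summary is printed.
 */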
2328 list_ent = d->page_list.next;
2329 for ( i = 0; (list_ent != &d->page_list); i++ )
2330 {
2331 unsigned long *pt;
2332 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2333 page = &frame_table[pfn];
2335 switch ( page->u.inuse.type_info & PGT_type_mask )
2336 {
2337 case PGT_l2_page_table:
2338 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2339 adjust( page, 1, 1 );
2341 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2343 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2344 {
2345 if ( pt[i] & _PAGE_PRESENT )
2346 {
2347 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2348 struct pfn_info *l1page = &frame_table[l1pfn];
2350 if ( l1page->u.inuse.domain == d)
2351 adjust(l1page, 1, 1);
2352 }
2353 }
2355 unmap_domain_mem(pt);
2356 break;
2358 case PGT_l1_page_table:
2359 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2360 adjust( page, 1, 1 );
2362 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2364 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2365 {
2366 if ( pt[i] & _PAGE_PRESENT )
2367 {
2368 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2369 struct pfn_info *l1page = &frame_table[l1pfn];
2371 if ( (l1page->u.inuse.domain != d) ||
2372 (l1pfn < 0x100) || (l1pfn > max_page) )
2373 continue;
2375 adjust(l1page, 1, 0);
2376 }
2377 }
2379 unmap_domain_mem(pt);
2380 break;
2381 }
2384 page->tlbflush_timestamp = 0;
2386 list_ent = frame_table[pfn].list.next;
2387 }
2389 spin_unlock(&d->page_alloc_lock);
2391 adjust(&frame_table[pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT], 1, 1);
2393 printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );
2395 if ( d != current->domain )
2396 domain_unpause(d);
2397 }
2399 void audit_domains(void)
2400 {
2401 struct domain *d;
2402 for_each_domain ( d )
2403 audit_domain(d);
2404 }
2406 void audit_domains_key(unsigned char key)
2407 {
2408 audit_domains();
2409 }
2411 #endif