
view xen/arch/x86/memory.c @ 3645:fd1dd0663b09

bitkeeper revision 1.1159.212.68 (42001e4d1AQiGV2pdPTNrs2AU2LjsQ)

Merge pb001.cl.cam.ac.uk:/auto/groups/xeno-xenod/BK/xen-unstable.bk
into pb001.cl.cam.ac.uk:/auto/groups/xeno/users/iap10/xeno-clone/xen-unstable.bk
author iap10@pb001.cl.cam.ac.uk
date Wed Feb 02 00:26:53 2005 +0000 (2005-02-02)
parents fec8b1778268 e6af5d8f8b39
children 060c1ea52343
1 /******************************************************************************
2 * arch/x86/memory.c
3 *
4 * Copyright (c) 2002-2004 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
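* Illustrative sketch (guest side, assuming the interface wrapper
* HYPERVISOR_mmu_update(req, count, success_count) and the command encoding
* used by do_mmu_update() below, where the low bits of 'ptr' select the
* command):
*
*     mmu_update_t req[2];
*     req[0].ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;   (write a PTE)
*     req[0].val = new_pte;
*     req[1].ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
*     req[1].val = pfn;                                (update the M2P slot)
*     HYPERVISOR_mmu_update(req, 2, NULL);
*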
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame need not be used in any of these three ways at all!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
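* Illustrative sketch (guest side, assuming the same HYPERVISOR_mmu_update
* wrapper as above): pinning goes through the extended-command path, roughly
*     req.ptr = (l2_mfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
*     req.val = MMUEXT_PIN_L2_TABLE;
*     HYPERVISOR_mmu_update(&req, 1, NULL);
* and is undone with MMUEXT_UNPIN_TABLE on the same frame.
*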
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, the OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/sched.h>
93 #include <xen/errno.h>
94 #include <xen/perfc.h>
95 #include <xen/irq.h>
96 #include <xen/softirq.h>
97 #include <asm/shadow.h>
98 #include <asm/page.h>
99 #include <asm/flushtlb.h>
100 #include <asm/io.h>
101 #include <asm/uaccess.h>
102 #include <asm/domain_page.h>
103 #include <asm/ldt.h>
105 #ifdef VERBOSE
106 #define MEM_LOG(_f, _a...) \
107 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
108 current->domain->id , __LINE__ , ## _a )
109 #else
110 #define MEM_LOG(_f, _a...) ((void)0)
111 #endif
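/*
 * NB. MEM_LOG() compiles away to a no-op unless VERBOSE is defined; when it
 * is, e.g. MEM_LOG("Bad pfn %08lx", pfn) prints the current domain id plus
 * this file and line number along with the message.
 */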
113 static int alloc_l2_table(struct pfn_info *page);
114 static int alloc_l1_table(struct pfn_info *page);
115 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
116 static int get_page_and_type_from_pagenr(unsigned long page_nr,
117 u32 type,
118 struct domain *d);
120 static void free_l2_table(struct pfn_info *page);
121 static void free_l1_table(struct pfn_info *page);
123 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
124 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
126 /* Used to defer flushing of memory structures. */
127 static struct {
128 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
129 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
130 unsigned long deferred_ops;
131 /* If non-NULL, specifies a foreign subject domain for some operations. */
132 struct domain *foreign;
133 } __cacheline_aligned percpu_info[NR_CPUS];
135 /*
136 * Returns the current foreign domain; defaults to the currently-executing
137 * domain if a foreign override hasn't been specified.
138 */
139 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
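/*
 * NB. FOREIGNDOM uses the GNU 'x ? : y' extension: it evaluates to the
 * per-CPU foreign override when one has been set (via MMUEXT_SET_FOREIGNDOM)
 * and to current->domain otherwise.
 */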
141 /* Private domain structs for DOMID_XEN and DOMID_IO. */
142 static struct domain *dom_xen, *dom_io;
144 /* Frame table and its size in pages. */
145 struct pfn_info *frame_table;
146 unsigned long frame_table_size;
147 unsigned long max_page;
149 void __init init_frametable(void)
150 {
151 unsigned long i, p;
153 frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
154 frame_table_size = max_page * sizeof(struct pfn_info);
155 frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
157 for ( i = 0; i < frame_table_size; i += (4UL << 20) )
158 {
159 p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
160 if ( p == 0 )
161 panic("Not enough memory for frame table\n");
162 map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p,
163 4UL << 20, PAGE_HYPERVISOR);
164 }
166 memset(frame_table, 0, frame_table_size);
167 }
169 void arch_init_memory(void)
170 {
171 #ifdef __i386__
172 unsigned long i;
174 /*
175 * We are rather picky about the layout of 'struct pfn_info'. The
176 * count_info and domain fields must be adjacent, as we perform atomic
177 * 64-bit operations on them. Also, just for sanity, we assert the size
178 * of the structure here.
179 */
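/*
 * (The 64-bit requirement comes from the cmpxchg8b updates of owner and
 * count in MMUEXT_TRANSFER_PAGE and MMUEXT_REASSIGN_PAGE, later in this
 * file.)
 */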
180 if ( (offsetof(struct pfn_info, u.inuse.domain) !=
181 (offsetof(struct pfn_info, count_info) + sizeof(u32))) ||
182 (sizeof(struct pfn_info) != 24) )
183 {
184 printk("Weird pfn_info layout (%ld,%ld,%d)\n",
185 offsetof(struct pfn_info, count_info),
186 offsetof(struct pfn_info, u.inuse.domain),
187 sizeof(struct pfn_info));
188 for ( ; ; ) ;
189 }
191 memset(percpu_info, 0, sizeof(percpu_info));
193 /* Initialise to a magic of 0x55555555 so easier to spot bugs later. */
194 memset(machine_to_phys_mapping, 0x55, 4<<20);
196 /*
197 * Initialise our DOMID_XEN domain.
198 * Any Xen-heap pages that we will allow to be mapped will have
199 * their domain field set to dom_xen.
200 */
201 dom_xen = alloc_domain_struct();
202 atomic_set(&dom_xen->refcnt, 1);
203 dom_xen->id = DOMID_XEN;
205 /*
206 * Initialise our DOMID_IO domain.
207 * This domain owns no pages but is considered a special case when
208 * mapping I/O pages, as the mappings occur at the privilege level of the caller.
209 */
210 dom_io = alloc_domain_struct();
211 atomic_set(&dom_io->refcnt, 1);
212 dom_io->id = DOMID_IO;
214 /* M2P table is mappable read-only by privileged domains. */
215 for ( i = 0; i < 1024; i++ )
216 {
217 frame_table[m2p_start_mfn+i].count_info = PGC_allocated | 1;
218 /* Use the GDT page type to make sure it can only be mapped read-only
219 by non-privileged domains. */
220 frame_table[m2p_start_mfn+i].u.inuse.type_info = PGT_gdt_page | 1;
221 frame_table[m2p_start_mfn+i].u.inuse.domain = dom_xen;
222 }
223 #endif
224 }
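/*
 * NB. The shadow LDT occupies slots 16-31 of the per-domain PTE array:
 * map_ldt_shadow_page() populates them on demand, and the routine below
 * tears them all down again.
 */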
226 static void __invalidate_shadow_ldt(struct exec_domain *d)
227 {
228 int i;
229 unsigned long pfn;
230 struct pfn_info *page;
232 d->mm.shadow_ldt_mapcnt = 0;
234 for ( i = 16; i < 32; i++ )
235 {
236 pfn = l1_pgentry_to_pagenr(d->mm.perdomain_ptes[i]);
237 if ( pfn == 0 ) continue;
238 d->mm.perdomain_ptes[i] = mk_l1_pgentry(0);
239 page = &frame_table[pfn];
240 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
241 ASSERT_PAGE_IS_DOMAIN(page, d->domain);
242 put_page_and_type(page);
243 }
245 /* Dispose of the (now possibly invalid) mappings from the TLB. */
246 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
247 }
250 static inline void invalidate_shadow_ldt(struct exec_domain *d)
251 {
252 if ( d->mm.shadow_ldt_mapcnt != 0 )
253 __invalidate_shadow_ldt(d);
254 }
257 static int alloc_segdesc_page(struct pfn_info *page)
258 {
259 unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
260 int i;
262 for ( i = 0; i < 512; i++ )
263 if ( unlikely(!check_descriptor(&descs[i*2])) )
264 goto fail;
266 unmap_domain_mem(descs);
267 return 1;
269 fail:
270 unmap_domain_mem(descs);
271 return 0;
272 }
275 /* Map shadow page at offset @off. */
276 int map_ldt_shadow_page(unsigned int off)
277 {
278 struct exec_domain *ed = current;
279 struct domain *d = ed->domain;
280 unsigned long l1e;
282 if ( unlikely(in_irq()) )
283 BUG();
285 __get_user(l1e, (unsigned long *)&linear_pg_table[(ed->mm.ldt_base >>
286 PAGE_SHIFT) + off]);
288 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
289 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
290 d, PGT_ldt_page)) )
291 return 0;
293 ed->mm.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
294 ed->mm.shadow_ldt_mapcnt++;
296 return 1;
297 }
300 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
301 {
302 struct pfn_info *page = &frame_table[page_nr];
304 if ( unlikely(!pfn_is_ram(page_nr)) )
305 {
306 MEM_LOG("Pfn %08lx is not RAM", page_nr);
307 return 0;
308 }
310 if ( unlikely(!get_page(page, d)) )
311 {
312 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
313 return 0;
314 }
316 return 1;
317 }
320 static int get_page_and_type_from_pagenr(unsigned long page_nr,
321 u32 type,
322 struct domain *d)
323 {
324 struct pfn_info *page = &frame_table[page_nr];
326 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
327 return 0;
329 if ( unlikely(!get_page_type(page, type)) )
330 {
331 #ifdef VERBOSE
332 if ( (type & PGT_type_mask) != PGT_l1_page_table )
333 MEM_LOG("Bad page type for pfn %08lx (%08x)",
334 page_nr, page->u.inuse.type_info);
335 #endif
336 put_page(page);
337 return 0;
338 }
340 return 1;
341 }
344 /*
345 * We allow L2 tables to map each other (a.k.a. linear page tables). This
346 * needs some special care with reference counts and access permissions:
347 * 1. The mapping entry must be read-only, or the guest may get write access
348 * to its own PTEs.
349 * 2. We must only bump the reference counts for an *already validated*
350 * L2 table, or we can end up in a deadlock in get_page_type(), waiting
351 * on a validation that cannot complete until our own validation does.
352 * 3. We only need to increment the reference counts for the mapped page
353 * frame if it is mapped by a different L2 table. This is sufficient and
354 * also necessary to allow validation of an L2 table mapping itself.
355 */
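/*
 * The checks below implement the rules above: the _PAGE_RW test is rule 1,
 * the PGT_validated requirement inside the cmpxchg loop is rule 2, and the
 * pfn comparison that skips self-references is rule 3.
 */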
356 static int
357 get_linear_pagetable(
358 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
359 {
360 u32 x, y;
361 struct pfn_info *page;
363 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
364 {
365 MEM_LOG("Attempt to create linear p.t. with write perms");
366 return 0;
367 }
369 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
370 {
371 /* Make sure the mapped frame belongs to the correct domain. */
372 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
373 return 0;
375 /*
376 * Make sure that the mapped frame is an already-validated L2 table.
377 * If so, atomically increment the count (checking for overflow).
378 */
379 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
380 y = page->u.inuse.type_info;
381 do {
382 x = y;
383 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
384 unlikely((x & (PGT_type_mask|PGT_validated)) !=
385 (PGT_l2_page_table|PGT_validated)) )
386 {
387 put_page(page);
388 return 0;
389 }
390 }
391 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
392 }
394 return 1;
395 }
398 static int
399 get_page_from_l1e(
400 l1_pgentry_t l1e, struct domain *d)
401 {
402 unsigned long l1v = l1_pgentry_val(l1e);
403 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
404 struct pfn_info *page = &frame_table[pfn];
405 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
407 if ( !(l1v & _PAGE_PRESENT) )
408 return 1;
410 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
411 {
412 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
413 return 0;
414 }
416 if ( unlikely(!pfn_is_ram(pfn)) )
417 {
418 /* Revert to caller privileges if FD == DOMID_IO. */
419 if ( d == dom_io )
420 d = current->domain;
422 if ( IS_PRIV(d) )
423 return 1;
425 if ( IS_CAPABLE_PHYSDEV(d) )
426 return domain_iomem_in_pfn(d, pfn);
428 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
429 return 0;
430 }
432 return ((l1v & _PAGE_RW) ?
433 get_page_and_type(page, d, PGT_writable_page) :
434 get_page(page, d));
435 }
438 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
439 static int
440 get_page_from_l2e(
441 l2_pgentry_t l2e, unsigned long pfn,
442 struct domain *d, unsigned long va_idx)
443 {
444 int rc;
446 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
447 return 1;
449 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
450 {
451 MEM_LOG("Bad L2 page type settings %04lx",
452 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
453 return 0;
454 }
456 rc = get_page_and_type_from_pagenr(
457 l2_pgentry_to_pagenr(l2e),
458 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
460 if ( unlikely(!rc) )
461 return get_linear_pagetable(l2e, pfn, d);
463 return 1;
464 }
467 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
468 {
469 unsigned long l1v = l1_pgentry_val(l1e);
470 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
471 struct pfn_info *page = &frame_table[pfn];
472 struct domain *e;
474 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
475 return;
477 e = page->u.inuse.domain;
478 if ( unlikely(e != d) )
479 {
480 /*
481 * Unmap a foreign page that may have been mapped via a grant table.
482 * Note that this can fail for a privileged domain that can map foreign
483 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
484 * counted via a grant entry and some counted directly in the page
485 * structure's reference count. Note that reference counts won't get
486 * dangerously confused as long as we always try to decrement the
487 * grant entry first. We may end up with a mismatch between which
488 * mappings and which unmappings are counted via the grant entry, but
489 * really it doesn't matter as privileged domains have carte blanche.
490 */
491 if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
492 return;
493 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
494 }
496 if ( l1v & _PAGE_RW )
497 {
498 put_page_and_type(page);
499 }
500 else
501 {
502 /* We expect this to be rare, so we blow away the entire shadow LDT. */
503 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
504 PGT_ldt_page)) &&
505 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
506 invalidate_shadow_ldt(e->exec_domain[0]);
507 put_page(page);
508 }
509 }
512 /*
513 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
514 * Note also that this automatically deals correctly with linear p.t.'s.
515 */
516 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
517 {
518 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
519 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
520 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
521 }
524 static int alloc_l2_table(struct pfn_info *page)
525 {
526 struct domain *d = page->u.inuse.domain;
527 unsigned long page_nr = page_to_pfn(page);
528 l2_pgentry_t *pl2e;
529 int i;
531 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
533 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
534 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
535 goto fail;
537 #if defined(__i386__)
538 /* Now we add our private high mappings. */
539 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
540 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
541 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
542 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
543 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
544 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
545 mk_l2_pgentry(__pa(page->u.inuse.domain->mm_perdomain_pt) |
546 __PAGE_HYPERVISOR);
547 #endif
549 unmap_domain_mem(pl2e);
550 return 1;
552 fail:
553 while ( i-- > 0 )
554 put_page_from_l2e(pl2e[i], page_nr);
556 unmap_domain_mem(pl2e);
557 return 0;
558 }
561 static int alloc_l1_table(struct pfn_info *page)
562 {
563 struct domain *d = page->u.inuse.domain;
564 unsigned long page_nr = page_to_pfn(page);
565 l1_pgentry_t *pl1e;
566 int i;
568 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
570 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
571 if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
572 goto fail;
574 unmap_domain_mem(pl1e);
575 return 1;
577 fail:
578 while ( i-- > 0 )
579 put_page_from_l1e(pl1e[i], d);
581 unmap_domain_mem(pl1e);
582 return 0;
583 }
586 static void free_l2_table(struct pfn_info *page)
587 {
588 unsigned long page_nr = page - frame_table;
589 l2_pgentry_t *pl2e;
590 int i;
592 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
594 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
595 put_page_from_l2e(pl2e[i], page_nr);
597 unmap_domain_mem(pl2e);
598 }
601 static void free_l1_table(struct pfn_info *page)
602 {
603 struct domain *d = page->u.inuse.domain;
604 unsigned long page_nr = page - frame_table;
605 l1_pgentry_t *pl1e;
606 int i;
608 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
610 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
611 put_page_from_l1e(pl1e[i], d);
613 unmap_domain_mem(pl1e);
614 }
617 static inline int update_l2e(l2_pgentry_t *pl2e,
618 l2_pgentry_t ol2e,
619 l2_pgentry_t nl2e)
620 {
621 unsigned long o = cmpxchg((unsigned long *)pl2e,
622 l2_pgentry_val(ol2e),
623 l2_pgentry_val(nl2e));
624 if ( o != l2_pgentry_val(ol2e) )
625 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
626 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
627 return (o == l2_pgentry_val(ol2e));
628 }
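/*
 * NB. update_l2e() (and update_l1e() below) publish the new entry with a
 * cmpxchg against the previously-read value, so a racing write makes the
 * update fail rather than be silently lost; callers treat this as an error.
 */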
631 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
632 static int mod_l2_entry(l2_pgentry_t *pl2e,
633 l2_pgentry_t nl2e,
634 unsigned long pfn)
635 {
636 l2_pgentry_t ol2e;
637 unsigned long _ol2e;
639 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
640 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
641 {
642 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
643 return 0;
644 }
646 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
647 return 0;
648 ol2e = mk_l2_pgentry(_ol2e);
650 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
651 {
652 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
653 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
654 return update_l2e(pl2e, ol2e, nl2e);
656 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
657 ((unsigned long)pl2e &
658 ~PAGE_MASK) >> 2)) )
659 return 0;
661 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
662 {
663 put_page_from_l2e(nl2e, pfn);
664 return 0;
665 }
667 put_page_from_l2e(ol2e, pfn);
668 return 1;
669 }
671 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
672 return 0;
674 put_page_from_l2e(ol2e, pfn);
675 return 1;
676 }
679 static inline int update_l1e(l1_pgentry_t *pl1e,
680 l1_pgentry_t ol1e,
681 l1_pgentry_t nl1e)
682 {
683 unsigned long o = l1_pgentry_val(ol1e);
684 unsigned long n = l1_pgentry_val(nl1e);
686 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
687 unlikely(o != l1_pgentry_val(ol1e)) )
688 {
689 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
690 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
691 return 0;
692 }
694 return 1;
695 }
698 /* Update the L1 entry at pl1e to new value nl1e. */
699 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
700 {
701 l1_pgentry_t ol1e;
702 unsigned long _ol1e;
703 struct domain *d = current->domain;
705 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
706 {
707 MEM_LOG("Bad get_user\n");
708 return 0;
709 }
711 ol1e = mk_l1_pgentry(_ol1e);
713 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
714 {
715 /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
716 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
717 return update_l1e(pl1e, ol1e, nl1e);
719 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
720 return 0;
722 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
723 {
724 put_page_from_l1e(nl1e, d);
725 return 0;
726 }
728 put_page_from_l1e(ol1e, d);
729 return 1;
730 }
732 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
733 return 0;
735 put_page_from_l1e(ol1e, d);
736 return 1;
737 }
740 int alloc_page_type(struct pfn_info *page, unsigned int type)
741 {
742 switch ( type )
743 {
744 case PGT_l1_page_table:
745 return alloc_l1_table(page);
746 case PGT_l2_page_table:
747 return alloc_l2_table(page);
748 case PGT_gdt_page:
749 case PGT_ldt_page:
750 return alloc_segdesc_page(page);
751 default:
752 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
753 type, page->u.inuse.type_info,
754 page->count_info);
755 BUG();
756 }
758 return 0;
759 }
762 void free_page_type(struct pfn_info *page, unsigned int type)
763 {
764 struct domain *d = page->u.inuse.domain;
766 switch ( type )
767 {
768 case PGT_l1_page_table:
769 free_l1_table(page);
770 break;
772 case PGT_l2_page_table:
773 free_l2_table(page);
774 break;
776 default:
777 BUG();
778 }
780 if ( unlikely(d->exec_domain[0]->mm.shadow_mode) &&
781 (get_shadow_status(&d->exec_domain[0]->mm, page_to_pfn(page)) & PSH_shadowed) )
782 {
783 unshadow_table(page_to_pfn(page), type);
784 put_shadow_status(&d->exec_domain[0]->mm);
785 }
786 }
789 void put_page_type(struct pfn_info *page)
790 {
791 u32 nx, x, y = page->u.inuse.type_info;
793 again:
794 do {
795 x = y;
796 nx = x - 1;
798 ASSERT((x & PGT_count_mask) != 0);
800 /*
801 * The page should always be validated while a reference is held. The
802 * exception is during domain destruction, when we forcibly invalidate
803 * page-table pages if we detect a referential loop.
804 * See domain.c:relinquish_list().
805 */
806 ASSERT((x & PGT_validated) ||
807 test_bit(DF_DYING, &page->u.inuse.domain->d_flags));
809 if ( unlikely((nx & PGT_count_mask) == 0) )
810 {
811 /* Record TLB information for flush later. Races are harmless. */
812 page->tlbflush_timestamp = tlbflush_current_time();
814 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
815 likely(nx & PGT_validated) )
816 {
817 /*
818 * Page-table pages must be unvalidated when count is zero. The
819 * 'free' is safe because the refcnt is non-zero and validated
820 * bit is clear => other ops will spin or fail.
821 */
822 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
823 x & ~PGT_validated)) != x) )
824 goto again;
825 /* We cleared the 'valid bit' so we must do the cleanup. */
826 free_page_type(page, x & PGT_type_mask);
827 /* Carry on, but with the 'valid bit' now clear. */
828 x &= ~PGT_validated;
829 nx &= ~PGT_validated;
830 }
831 }
832 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
833 (PGT_pinned | 1)) )
834 {
835 /* Page is now only pinned. Make the back pointer mutable again. */
836 nx |= PGT_va_mutable;
837 }
838 }
839 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
840 }
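/*
 * NB. type_info packs the use count (PGT_count_mask), the current type
 * (PGT_type_mask), the 'va backpointer' of L1 tables (PGT_va_mask) and the
 * PGT_validated/PGT_pinned flags; get_page_type() and put_page_type()
 * maintain the whole word with lock-free cmpxchg loops.
 */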
843 int get_page_type(struct pfn_info *page, u32 type)
844 {
845 u32 nx, x, y = page->u.inuse.type_info;
847 again:
848 do {
849 x = y;
850 nx = x + 1;
851 if ( unlikely((nx & PGT_count_mask) == 0) )
852 {
853 MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
854 return 0;
855 }
856 else if ( unlikely((x & PGT_count_mask) == 0) )
857 {
858 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
859 {
860 /*
861 * On type change we check to flush stale TLB entries. This
862 * may be unnecessary (e.g., page was GDT/LDT) but those
863 * circumstances should be very rare.
864 */
865 struct domain *d = page->u.inuse.domain;
866 if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
867 page->tlbflush_timestamp)) )
868 {
869 perfc_incr(need_flush_tlb_flush);
870 flush_tlb_cpu(d->exec_domain[0]->processor);
871 }
873 /* We lose existing type, back pointer, and validity. */
874 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
875 nx |= type;
877 /* No special validation needed for writable pages. */
878 /* Page tables and GDT/LDT need to be scanned for validity. */
879 if ( type == PGT_writable_page )
880 nx |= PGT_validated;
881 }
882 }
883 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
884 {
885 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
886 {
887 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
888 ((type & PGT_type_mask) != PGT_l1_page_table) )
889 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
890 x & PGT_type_mask, type, page_to_pfn(page));
891 return 0;
892 }
893 else if ( (x & PGT_va_mask) == PGT_va_mutable )
894 {
895 /* The va backpointer is mutable, hence we update it. */
896 nx &= ~PGT_va_mask;
897 nx |= type; /* we know the actual type is correct */
898 }
899 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
900 {
901 /* This table is potentially mapped at multiple locations. */
902 nx &= ~PGT_va_mask;
903 nx |= PGT_va_unknown;
904 }
905 }
906 else if ( unlikely(!(x & PGT_validated)) )
907 {
908 /* Someone else is updating validation of this page. Wait... */
909 while ( (y = page->u.inuse.type_info) == x )
910 {
911 rep_nop();
912 barrier();
913 }
914 goto again;
915 }
916 }
917 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
919 if ( unlikely(!(nx & PGT_validated)) )
920 {
921 /* Try to validate page type; drop the new reference on failure. */
922 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
923 {
924 MEM_LOG("Error while validating pfn %08lx for type %08x."
925 " caf=%08x taf=%08x\n",
926 page_to_pfn(page), type,
927 page->count_info,
928 page->u.inuse.type_info);
929 /* No one else can get a reference. We hold the only ref. */
930 page->u.inuse.type_info = 0;
931 return 0;
932 }
934 /* No one else is updating simultaneously. */
935 __set_bit(_PGT_validated, &page->u.inuse.type_info);
936 }
938 return 1;
939 }
942 int new_guest_cr3(unsigned long pfn)
943 {
944 struct exec_domain *ed = current;
945 struct domain *d = ed->domain;
946 int okay, cpu = smp_processor_id();
947 unsigned long old_base_pfn;
949 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
950 if ( likely(okay) )
951 {
952 invalidate_shadow_ldt(ed);
954 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
955 old_base_pfn = pagetable_val(ed->mm.pagetable) >> PAGE_SHIFT;
956 ed->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
958 shadow_mk_pagetable(&ed->mm);
960 write_ptbase(&ed->mm);
962 put_page_and_type(&frame_table[old_base_pfn]);
963 }
964 else
965 {
966 MEM_LOG("Error while installing new baseptr %08lx", pfn);
967 }
969 return okay;
970 }
972 static int do_extended_command(unsigned long ptr, unsigned long val)
973 {
974 int okay = 1, cpu = smp_processor_id();
975 unsigned int cmd = val & MMUEXT_CMD_MASK;
976 unsigned long pfn = ptr >> PAGE_SHIFT;
977 struct pfn_info *page = &frame_table[pfn];
978 struct exec_domain *ed = current;
979 struct domain *d = ed->domain, *nd, *e;
980 u32 x, y;
981 domid_t domid;
982 grant_ref_t gntref;
984 switch ( cmd )
985 {
986 case MMUEXT_PIN_L1_TABLE:
987 case MMUEXT_PIN_L2_TABLE:
988 /*
989 * We insist that, if you pin an L1 page, it's the first thing that
990 * you do to it. This is because we require the backptr to still be
991 * mutable. This assumption seems safe.
992 */
993 okay = get_page_and_type_from_pagenr(
994 pfn,
995 ((cmd==MMUEXT_PIN_L2_TABLE) ?
996 PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
997 FOREIGNDOM);
999 if ( unlikely(!okay) )
1001 MEM_LOG("Error while pinning pfn %08lx", pfn);
1002 break;
1005 if ( unlikely(test_and_set_bit(_PGT_pinned,
1006 &page->u.inuse.type_info)) )
1008 MEM_LOG("Pfn %08lx already pinned", pfn);
1009 put_page_and_type(page);
1010 okay = 0;
1011 break;
1014 break;
1016 case MMUEXT_UNPIN_TABLE:
1017 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
1019 MEM_LOG("Page %08lx bad domain (dom=%p)",
1020 ptr, page->u.inuse.domain);
1022 else if ( likely(test_and_clear_bit(_PGT_pinned,
1023 &page->u.inuse.type_info)) )
1025 put_page_and_type(page);
1026 put_page(page);
1028 else
1030 okay = 0;
1031 put_page(page);
1032 MEM_LOG("Pfn %08lx not pinned", pfn);
1034 break;
1036 case MMUEXT_NEW_BASEPTR:
1037 okay = new_guest_cr3(pfn);
1038 break;
1040 case MMUEXT_TLB_FLUSH:
1041 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1042 break;
1044 case MMUEXT_INVLPG:
1045 __flush_tlb_one(ptr);
1046 break;
1048 case MMUEXT_FLUSH_CACHE:
1049 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1051 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1052 okay = 0;
1054 else
1056 wbinvd();
1058 break;
1060 case MMUEXT_SET_LDT:
1062 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
1063 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1064 (ents > 8192) ||
1065 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
1066 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
1068 okay = 0;
1069 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
1071 else if ( (ed->mm.ldt_ents != ents) ||
1072 (ed->mm.ldt_base != ptr) )
1074 invalidate_shadow_ldt(ed);
1075 ed->mm.ldt_base = ptr;
1076 ed->mm.ldt_ents = ents;
1077 load_LDT(ed);
1078 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1079 if ( ents != 0 )
1080 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1082 break;
1085 case MMUEXT_SET_FOREIGNDOM:
1086 domid = (domid_t)(val >> 16);
1088 if ( (e = percpu_info[cpu].foreign) != NULL )
1089 put_domain(e);
1090 percpu_info[cpu].foreign = NULL;
1092 if ( !IS_PRIV(d) )
1094 switch ( domid )
1096 case DOMID_IO:
1097 get_knownalive_domain(dom_io);
1098 percpu_info[cpu].foreign = dom_io;
1099 break;
1100 default:
1101 MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
1102 okay = 0;
1103 break;
1106 else
1108 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1109 if ( e == NULL )
1111 switch ( domid )
1113 case DOMID_XEN:
1114 get_knownalive_domain(dom_xen);
1115 percpu_info[cpu].foreign = dom_xen;
1116 break;
1117 case DOMID_IO:
1118 get_knownalive_domain(dom_io);
1119 percpu_info[cpu].foreign = dom_io;
1120 break;
1121 default:
1122 MEM_LOG("Unknown domain '%u'", domid);
1123 okay = 0;
1124 break;
1128 break;
1130 case MMUEXT_TRANSFER_PAGE:
1131 domid = (domid_t)(val >> 16);
1132 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
1134 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
1135 unlikely(!pfn_is_ram(pfn)) ||
1136 unlikely((e = find_domain_by_id(domid)) == NULL) )
1138 MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
1139 okay = 0;
1140 break;
1143 spin_lock(&d->page_alloc_lock);
1145 /*
1146 * The tricky bit: atomically release ownership while there is just one
1147 * benign reference to the page (PGC_allocated). If that reference
1148 * disappears then the deallocation routine will safely spin.
1149 */
1150 nd = page->u.inuse.domain;
1151 y = page->count_info;
1152 do {
1153 x = y;
1154 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1155 (1|PGC_allocated)) ||
1156 unlikely(nd != d) )
1158 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1159 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1160 d, d->id, nd, x, page->u.inuse.type_info);
1161 spin_unlock(&d->page_alloc_lock);
1162 put_domain(e);
1163 return 0;
1165 __asm__ __volatile__(
1166 LOCK_PREFIX "cmpxchg8b %2"
1167 : "=d" (nd), "=a" (y),
1168 "=m" (*(volatile u64 *)(&page->count_info))
1169 : "0" (d), "1" (x), "c" (NULL), "b" (x) );
1171 while ( unlikely(nd != d) || unlikely(y != x) );
1173 /*
1174 * Unlink from 'd'. At least one reference remains (now anonymous), so
1175 * no one else is spinning to try to delete this page from 'd'.
1176 */
1177 d->tot_pages--;
1178 list_del(&page->list);
1180 spin_unlock(&d->page_alloc_lock);
1182 spin_lock(&e->page_alloc_lock);
1184 /*
1185 * Check that 'e' will accept the page and has reservation headroom.
1186 * Also, a domain mustn't have PGC_allocated pages when it is dying.
1187 */
1188 ASSERT(e->tot_pages <= e->max_pages);
1189 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1190 unlikely(e->tot_pages == e->max_pages) ||
1191 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
1193 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1194 "provided a bad grant ref, or is dying (%08lx).\n",
1195 e->tot_pages, e->max_pages, e->d_flags);
1196 spin_unlock(&e->page_alloc_lock);
1197 put_domain(e);
1198 okay = 0;
1199 break;
1202 /* Okay, add the page to 'e'. */
1203 if ( unlikely(e->tot_pages++ == 0) )
1204 get_knownalive_domain(e);
1205 list_add_tail(&page->list, &e->page_list);
1206 page->u.inuse.domain = e;
1208 spin_unlock(&e->page_alloc_lock);
1210 /* Transfer is all done: tell the guest about its new page frame. */
1211 gnttab_notify_transfer(e, gntref, pfn);
1213 put_domain(e);
1214 break;
1216 case MMUEXT_REASSIGN_PAGE:
1217 if ( unlikely(!IS_PRIV(d)) )
1219 MEM_LOG("Dom %u has no reassignment priv", d->id);
1220 okay = 0;
1221 break;
1224 e = percpu_info[cpu].foreign;
1225 if ( unlikely(e == NULL) )
1227 MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
1228 okay = 0;
1229 break;
1232 /*
1233 * Grab both page_list locks, in order. This prevents the page from
1234 * disappearing elsewhere while we modify the owner, and we'll need
1235 * both locks if we're successful so that we can change lists.
1236 */
1237 if ( d < e )
1239 spin_lock(&d->page_alloc_lock);
1240 spin_lock(&e->page_alloc_lock);
1242 else
1244 spin_lock(&e->page_alloc_lock);
1245 spin_lock(&d->page_alloc_lock);
1248 /* A domain shouldn't have PGC_allocated pages when it is dying. */
1249 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1250 unlikely(IS_XEN_HEAP_FRAME(page)) )
1252 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
1253 okay = 0;
1254 goto reassign_fail;
1257 /*
1258 * The tricky bit: atomically change owner while there is just one
1259 * benign reference to the page (PGC_allocated). If that reference
1260 * disappears then the deallocation routine will safely spin.
1261 */
1262 nd = page->u.inuse.domain;
1263 y = page->count_info;
1264 do {
1265 x = y;
1266 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1267 (1|PGC_allocated)) ||
1268 unlikely(nd != d) )
1270 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1271 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1272 d, d->id, nd, x, page->u.inuse.type_info);
1273 okay = 0;
1274 goto reassign_fail;
1276 __asm__ __volatile__(
1277 LOCK_PREFIX "cmpxchg8b %3"
1278 : "=d" (nd), "=a" (y), "=c" (e),
1279 "=m" (*(volatile u64 *)(&page->count_info))
1280 : "0" (d), "1" (x), "c" (e), "b" (x) );
1282 while ( unlikely(nd != d) || unlikely(y != x) );
1284 /*
1285 * Unlink from 'd'. We transferred at least one reference to 'e', so
1286 * no one else is spinning to try to delete this page from 'd'.
1287 */
1288 d->tot_pages--;
1289 list_del(&page->list);
1291 /*
1292 * Add the page to 'e'. Someone may already have removed the last
1293 * reference and want to remove the page from 'e'. However, we have
1294 * the lock so they'll spin waiting for us.
1295 */
1296 if ( unlikely(e->tot_pages++ == 0) )
1297 get_knownalive_domain(e);
1298 list_add_tail(&page->list, &e->page_list);
1300 reassign_fail:
1301 spin_unlock(&d->page_alloc_lock);
1302 spin_unlock(&e->page_alloc_lock);
1303 break;
1305 case MMUEXT_CLEAR_FOREIGNDOM:
1306 if ( (e = percpu_info[cpu].foreign) != NULL )
1307 put_domain(e);
1308 percpu_info[cpu].foreign = NULL;
1309 break;
1311 default:
1312 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
1313 okay = 0;
1314 break;
1317 return okay;
1320 int do_mmu_update(
1321 mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
1323 /*
1324 * We steal the m.s.b. of the @count parameter to indicate whether this
1325 * invocation of do_mmu_update() is resuming a previously preempted call.
1326 * We steal the next 15 bits to remember the current FOREIGNDOM.
1327 */
1328 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
1329 #define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
1330 #define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
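/*
 * Illustrative encoding (it matches the continuation created below): a
 * preempted call is resumed with
 *     (count - i) | MMU_UPDATE_PREEMPTED
 *                 | (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT)
 * so both the remaining work and the foreign-domain override survive the
 * preemption.
 */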
1332 mmu_update_t req;
1333 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
1334 struct pfn_info *page;
1335 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
1336 unsigned int cmd, done = 0;
1337 unsigned long prev_spfn = 0;
1338 l1_pgentry_t *prev_spl1e = 0;
1339 struct exec_domain *ed = current;
1340 struct domain *d = ed->domain;
1341 u32 type_info;
1342 domid_t domid;
1344 LOCK_BIGLOCK(d);
1346 cleanup_writable_pagetable(d);
1348 /*
1349 * If we are resuming after preemption, read how much work we have already
1350 * done. This allows us to set the @done output parameter correctly.
1351 * We also reset FOREIGNDOM here.
1352 */
1353 if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
1355 if ( !(count & MMU_UPDATE_PREEMPTED) )
1357 /* Count overflow into private FOREIGNDOM field. */
1358 MEM_LOG("do_mmu_update count is too large");
1359 rc = -EINVAL;
1360 goto out;
1362 count &= ~MMU_UPDATE_PREEMPTED;
1363 domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
1364 count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
1365 if ( unlikely(pdone != NULL) )
1366 (void)get_user(done, pdone);
1367 if ( (domid != current->domain->id) &&
1368 !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
1370 rc = -EINVAL;
1371 goto out;
1375 perfc_incrc(calls_to_mmu_update);
1376 perfc_addc(num_page_updates, count);
1378 if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
1380 rc = -EFAULT;
1381 goto out;
1384 for ( i = 0; i < count; i++ )
1386 if ( hypercall_preempt_check() )
1388 rc = hypercall_create_continuation(
1389 __HYPERVISOR_mmu_update, 3, ureqs,
1390 (count - i) |
1391 (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) |
1392 MMU_UPDATE_PREEMPTED, pdone);
1393 break;
1396 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
1398 MEM_LOG("Bad __copy_from_user");
1399 rc = -EFAULT;
1400 break;
1403 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
1404 pfn = req.ptr >> PAGE_SHIFT;
1406 okay = 0;
1408 switch ( cmd )
1410 /*
1411 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
1412 */
1413 case MMU_NORMAL_PT_UPDATE:
1414 if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
1416 MEM_LOG("Could not get page for normal update");
1417 break;
1420 if ( likely(prev_pfn == pfn) )
1422 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
1424 else
1426 if ( prev_pfn != 0 )
1427 unmap_domain_mem((void *)va);
1428 va = (unsigned long)map_domain_mem(req.ptr);
1429 prev_pfn = pfn;
1432 page = &frame_table[pfn];
1433 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
1435 case PGT_l1_page_table:
1436 if ( likely(get_page_type(
1437 page, type_info & (PGT_type_mask|PGT_va_mask))) )
1439 okay = mod_l1_entry((l1_pgentry_t *)va,
1440 mk_l1_pgentry(req.val));
1442 if ( unlikely(ed->mm.shadow_mode) && okay &&
1443 (get_shadow_status(&ed->mm, page-frame_table) &
1444 PSH_shadowed) )
1446 shadow_l1_normal_pt_update(
1447 req.ptr, req.val, &prev_spfn, &prev_spl1e);
1448 put_shadow_status(&ed->mm);
1451 put_page_type(page);
1453 break;
1454 case PGT_l2_page_table:
1455 if ( likely(get_page_type(page, PGT_l2_page_table)) )
1457 okay = mod_l2_entry((l2_pgentry_t *)va,
1458 mk_l2_pgentry(req.val),
1459 pfn);
1461 if ( unlikely(ed->mm.shadow_mode) && okay &&
1462 (get_shadow_status(&ed->mm, page-frame_table) &
1463 PSH_shadowed) )
1465 shadow_l2_normal_pt_update(req.ptr, req.val);
1466 put_shadow_status(&ed->mm);
1469 put_page_type(page);
1471 break;
1472 default:
1473 if ( likely(get_page_type(page, PGT_writable_page)) )
1475 *(unsigned long *)va = req.val;
1476 okay = 1;
1477 put_page_type(page);
1479 break;
1482 put_page(page);
1483 break;
1485 case MMU_MACHPHYS_UPDATE:
1486 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
1488 MEM_LOG("Could not get page for mach->phys update");
1489 break;
1492 machine_to_phys_mapping[pfn] = req.val;
1493 okay = 1;
1495 /*
1496 * If in log-dirty mode, mark the corresponding pseudo-physical
1497 * page as dirty.
1498 */
1499 if ( unlikely(ed->mm.shadow_mode == SHM_logdirty) &&
1500 mark_dirty(&ed->mm, pfn) )
1501 ed->mm.shadow_dirty_block_count++;
1503 put_page(&frame_table[pfn]);
1504 break;
1506 /*
1507 * MMU_EXTENDED_COMMAND: Extended command is specified
1508 * in the least-significant bits of the 'value' field.
1509 */
1510 case MMU_EXTENDED_COMMAND:
1511 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
1512 okay = do_extended_command(req.ptr, req.val);
1513 break;
1515 default:
1516 MEM_LOG("Invalid page update command %08lx", req.ptr);
1517 break;
1520 if ( unlikely(!okay) )
1522 rc = -EINVAL;
1523 break;
1526 ureqs++;
1529 out:
1530 if ( prev_pfn != 0 )
1531 unmap_domain_mem((void *)va);
1533 if ( unlikely(prev_spl1e != 0) )
1534 unmap_domain_mem((void *)prev_spl1e);
1536 deferred_ops = percpu_info[cpu].deferred_ops;
1537 percpu_info[cpu].deferred_ops = 0;
1539 if ( deferred_ops & DOP_FLUSH_TLB )
1540 local_flush_tlb();
1542 if ( deferred_ops & DOP_RELOAD_LDT )
1543 (void)map_ldt_shadow_page(0);
1545 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1547 put_domain(percpu_info[cpu].foreign);
1548 percpu_info[cpu].foreign = NULL;
1551 /* Add incremental work we have done to the @done output parameter. */
1552 if ( unlikely(pdone != NULL) )
1553 __put_user(done + i, pdone);
1555 UNLOCK_BIGLOCK(d);
1556 return rc;
1560 int do_update_va_mapping(unsigned long page_nr,
1561 unsigned long val,
1562 unsigned long flags)
1564 struct exec_domain *ed = current;
1565 struct domain *d = ed->domain;
1566 int err = 0;
1567 unsigned int cpu = ed->processor;
1568 unsigned long deferred_ops;
1570 perfc_incrc(calls_to_update_va);
1572 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
1573 return -EINVAL;
1575 LOCK_BIGLOCK(d);
1577 cleanup_writable_pagetable(d);
1579 /*
1580 * XXX When we make this support 4MB superpages we should also deal with
1581 * the case of updating L2 entries.
1582 */
1584 if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
1585 mk_l1_pgentry(val))) )
1586 err = -EINVAL;
1588 if ( unlikely(ed->mm.shadow_mode) )
1590 unsigned long sval;
1592 l1pte_propagate_from_guest(&ed->mm, &val, &sval);
1594 if ( unlikely(__put_user(sval, ((unsigned long *)(
1595 &shadow_linear_pg_table[page_nr])))) )
1597 /*
1598 * Since L2s are guaranteed RW, failure indicates the page was not
1599 * shadowed, so ignore.
1600 */
1601 perfc_incrc(shadow_update_va_fail);
1604 /*
1605 * If we're in log-dirty mode then we need to note that we've updated
1606 * the PTE in the PT-holding page. We need the machine frame number
1607 * for this.
1608 */
1609 if ( ed->mm.shadow_mode == SHM_logdirty )
1610 mark_dirty(&current->mm, va_to_l1mfn(page_nr << PAGE_SHIFT));
1612 check_pagetable(&ed->mm, ed->mm.pagetable, "va"); /* debug */
1615 deferred_ops = percpu_info[cpu].deferred_ops;
1616 percpu_info[cpu].deferred_ops = 0;
1618 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
1619 unlikely(flags & UVMF_FLUSH_TLB) )
1620 local_flush_tlb();
1621 else if ( unlikely(flags & UVMF_INVLPG) )
1622 __flush_tlb_one(page_nr << PAGE_SHIFT);
1624 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
1625 (void)map_ldt_shadow_page(0);
1627 UNLOCK_BIGLOCK(d);
1629 return err;
1632 int do_update_va_mapping_otherdomain(unsigned long page_nr,
1633 unsigned long val,
1634 unsigned long flags,
1635 domid_t domid)
1637 unsigned int cpu = smp_processor_id();
1638 struct domain *d;
1639 int rc;
1641 if ( unlikely(!IS_PRIV(current->domain)) )
1642 return -EPERM;
1644 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
1645 if ( unlikely(d == NULL) )
1647 MEM_LOG("Unknown domain '%u'", domid);
1648 return -ESRCH;
1651 rc = do_update_va_mapping(page_nr, val, flags);
1653 put_domain(d);
1654 percpu_info[cpu].foreign = NULL;
1656 return rc;
1661 /*************************
1662 * Writable Pagetables
1663 */
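/*
 * Overview: a write fault on a read-only L1 page-table page is caught by
 * ptwr_do_page_fault(), which disconnects the page (clearing _PAGE_PRESENT
 * in its L2 slot in the ACTIVE case), snapshots its contents and gives the
 * guest a transient writable mapping. ptwr_flush() later re-protects the
 * page and revalidates only the PTEs that changed relative to the snapshot,
 * via get_page_from_l1e()/put_page_from_l1e().
 */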
1665 ptwr_info_t ptwr_info[NR_CPUS];
1667 #ifdef VERBOSE
1668 int ptwr_debug = 0x0;
1669 #define PTWR_PRINTK(_f, _a...) \
1670 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
1671 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
1672 #else
1673 #define PTWR_PRINTK(_f, _a...) ((void)0)
1674 #endif
1676 /* Flush the given writable p.t. page and write-protect it again. */
1677 void ptwr_flush(const int which)
1679 unsigned long sstat, spte, pte, *ptep, l1va;
1680 l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e;
1681 l2_pgentry_t *pl2e;
1682 int i, cpu = smp_processor_id();
1683 struct exec_domain *ed = current;
1684 struct domain *d = ed->domain;
1686 l1va = ptwr_info[cpu].ptinfo[which].l1va;
1687 ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
1689 /*
1690 * STEP 1. Write-protect the p.t. page so no more updates can occur.
1691 */
1693 if ( unlikely(__get_user(pte, ptep)) )
1695 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1696 /*
1697 * Really a bug. We could read this PTE during the initial fault,
1698 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
1699 */
1700 BUG();
1702 PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
1703 PTWR_PRINT_WHICH, ptep, pte);
1704 pte &= ~_PAGE_RW;
1706 if ( unlikely(ed->mm.shadow_mode) )
1708 /* Write-protect the p.t. page in the shadow page table. */
1709 l1pte_propagate_from_guest(&ed->mm, &pte, &spte);
1710 __put_user(
1711 spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
1713 /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
1714 sstat = get_shadow_status(&ed->mm, pte >> PAGE_SHIFT);
1715 if ( sstat & PSH_shadowed )
1716 sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
1719 /* Write-protect the p.t. page in the guest page table. */
1720 if ( unlikely(__put_user(pte, ptep)) )
1722 MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
1723 /*
1724 * Really a bug. We could write this PTE during the initial fault,
1725 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
1726 */
1727 BUG();
1730 /* Ensure that there are no stale writable mappings in any TLB. */
1731 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
1732 #if 1
1733 __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
1734 #else
1735 flush_tlb_all();
1736 #endif
1737 PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
1738 PTWR_PRINT_WHICH, ptep, pte);
1740 /*
1741 * STEP 2. Validate any modified PTEs.
1742 */
1744 pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
1745 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1747 ol1e = ptwr_info[cpu].ptinfo[which].page[i];
1748 nl1e = pl1e[i];
1750 if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
1751 continue;
1753 /*
1754 * Fast path for PTEs that have merely been write-protected
1755 * (e.g., during a Unix fork()). A strict reduction in privilege.
1756 */
1757 if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
1759 if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
1761 if ( unlikely(sl1e != NULL) )
1762 l1pte_propagate_from_guest(
1763 &ed->mm, &l1_pgentry_val(nl1e),
1764 &l1_pgentry_val(sl1e[i]));
1765 put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
1767 continue;
1770 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
1772 MEM_LOG("ptwr: Could not re-validate l1 page\n");
1773 /*
1774 * Make the remaining p.t's consistent before crashing, so the
1775 * reference counts are correct.
1776 */
1777 memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
1778 (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
1779 unmap_domain_mem(pl1e);
1780 ptwr_info[cpu].ptinfo[which].l1va = 0;
1781 UNLOCK_BIGLOCK(d);
1782 domain_crash();
1785 if ( unlikely(sl1e != NULL) )
1786 l1pte_propagate_from_guest(
1787 &ed->mm, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
1789 if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
1790 put_page_from_l1e(ol1e, d);
1792 unmap_domain_mem(pl1e);
1794 /*
1795 * STEP 3. Reattach the L1 p.t. page into the current address space.
1796 */
1798 if ( (which == PTWR_PT_ACTIVE) && likely(!ed->mm.shadow_mode) )
1800 pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
1801 *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
1804 /*
1805 * STEP 4. Final tidy-up.
1806 */
1808 ptwr_info[cpu].ptinfo[which].l1va = 0;
1810 if ( unlikely(sl1e != NULL) )
1812 unmap_domain_mem(sl1e);
1813 put_shadow_status(&ed->mm);
1817 /* Write page fault handler: check if guest is trying to modify a PTE. */
1818 int ptwr_do_page_fault(unsigned long addr)
1820 unsigned long pte, pfn, l2e;
1821 struct pfn_info *page;
1822 l2_pgentry_t *pl2e;
1823 int which, cpu = smp_processor_id();
1824 u32 l2_idx;
1826 /*
1827 * Attempt to read the PTE that maps the VA being accessed. By checking for
1828 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
1829 */
1830 if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
1831 _PAGE_PRESENT) ||
1832 __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
1834 return 0;
1837 pfn = pte >> PAGE_SHIFT;
1838 page = &frame_table[pfn];
1840 /* We are looking only for read-only mappings of p.t. pages. */
1841 if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
1842 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
1844 return 0;
1847 /* Get the L2 index at which this L1 p.t. is always mapped. */
1848 l2_idx = page->u.inuse.type_info & PGT_va_mask;
1849 if ( unlikely(l2_idx >= PGT_va_unknown) )
1851 domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
1853 l2_idx >>= PGT_va_shift;
1855 if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
1857 MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
1858 domain_crash();
1861 /*
1862 * Is the L1 p.t. mapped into the current address space? If so we call it
1863 * an ACTIVE p.t., otherwise it is INACTIVE.
1864 */
1865 pl2e = &linear_l2_table[l2_idx];
1866 l2e = l2_pgentry_val(*pl2e);
1867 which = PTWR_PT_INACTIVE;
1868 if ( (l2e >> PAGE_SHIFT) == pfn )
1870 /* Check the PRESENT bit to set ACTIVE. */
1871 if ( likely(l2e & _PAGE_PRESENT) )
1872 which = PTWR_PT_ACTIVE;
1873 else {
1874 /*
1875 * If the PRESENT bit is clear, we may be conflicting with
1876 * the current ACTIVE p.t. (it may be the same p.t. mapped
1877 * at another virt addr).
1878 * The ptwr_flush call below will restore the PRESENT bit.
1879 */
1880 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
1881 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
1882 which = PTWR_PT_ACTIVE;
1886 PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
1887 "pfn %08lx\n", PTWR_PRINT_WHICH,
1888 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
1890 /*
1891 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
1892 * time. If there is already one, we must flush it out.
1893 */
1894 if ( ptwr_info[cpu].ptinfo[which].l1va )
1895 ptwr_flush(which);
1897 ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
1898 ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
1900 /* For safety, disconnect the L1 p.t. page from current space. */
1901 if ( (which == PTWR_PT_ACTIVE) && likely(!current->mm.shadow_mode) )
1903 *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
1904 #if 1
1905 flush_tlb(); /* XXX Multi-CPU guests? */
1906 #else
1907 flush_tlb_all();
1908 #endif
1911 /* Temporarily map the L1 page, and make a copy of it. */
1912 ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
1913 memcpy(ptwr_info[cpu].ptinfo[which].page,
1914 ptwr_info[cpu].ptinfo[which].pl1e,
1915 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
1917 /* Finally, make the p.t. page writable by the guest OS. */
1918 pte |= _PAGE_RW;
1919 PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
1920 &linear_pg_table[addr>>PAGE_SHIFT], pte);
1921 if ( unlikely(__put_user(pte, (unsigned long *)
1922 &linear_pg_table[addr>>PAGE_SHIFT])) )
1924 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
1925 &linear_pg_table[addr>>PAGE_SHIFT]);
1926 /* Toss the writable pagetable state and crash. */
1927 unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
1928 ptwr_info[cpu].ptinfo[which].l1va = 0;
1929 domain_crash();
1932 return EXCRET_fault_fixed;
1935 static __init int ptwr_init(void)
1937 int i;
1939 for ( i = 0; i < smp_num_cpus; i++ )
1941 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
1942 (void *)alloc_xenheap_page();
1943 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
1944 (void *)alloc_xenheap_page();
1947 return 0;
1949 __initcall(ptwr_init);
1954 /************************************************************************/
1955 /************************************************************************/
1956 /************************************************************************/
1958 #ifndef NDEBUG
1960 void ptwr_status(void)
1962 unsigned long pte, *ptep, pfn;
1963 struct pfn_info *page;
1964 int cpu = smp_processor_id();
1966 ptep = (unsigned long *)&linear_pg_table
1967 [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
1969 if ( __get_user(pte, ptep) ) {
1970 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1971 domain_crash();
1974 pfn = pte >> PAGE_SHIFT;
1975 page = &frame_table[pfn];
1976 printk("need to alloc l1 page %p\n", page);
1977 /* make pt page writable */
1978 printk("need to make read-only l1-page at %p is %08lx\n",
1979 ptep, pte);
1981 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
1982 return;
1984 if ( __get_user(pte, (unsigned long *)
1985 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
1986 MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
1987 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
1988 domain_crash();
1990 pfn = pte >> PAGE_SHIFT;
1991 page = &frame_table[pfn];
1994 void audit_domain(struct domain *d)
1996 int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
1998 void adjust (struct pfn_info *page, int dir, int adjtype)
2000 int count = page->count_info & PGC_count_mask;
2002 if ( adjtype )
2004 int tcount = page->u.inuse.type_info & PGT_count_mask;
2006 ttot++;
2008 tcount += dir;
2010 if ( tcount < 0 )
2012 /* This will only come out once. */
2013 printk("Audit %d: type count went below zero pfn=%x "
2014 "taf=%x otaf=%x\n",
2015 d->id, page-frame_table,
2016 page->u.inuse.type_info,
2017 page->tlbflush_timestamp);
2020 page->u.inuse.type_info =
2021 (page->u.inuse.type_info & ~PGT_count_mask) |
2022 (tcount & PGT_count_mask);
2025 ctot++;
2026 count += dir;
2027 if ( count < 0 )
2029 /* This will only come out once. */
2030 printk("Audit %d: general count went below zero pfn=%x "
2031 "taf=%x otaf=%x\n",
2032 d->id, page-frame_table,
2033 page->u.inuse.type_info,
2034 page->tlbflush_timestamp);
2037 page->count_info =
2038 (page->count_info & ~PGC_count_mask) |
2039 (count & PGC_count_mask);
    void scan_for_pfn(struct domain *d, unsigned long xpfn)
    {
        unsigned long pfn, *pt;
        struct list_head *list_ent;
        struct pfn_info *page;
        int i;

        list_ent = d->page_list.next;
        for ( i = 0; (list_ent != &d->page_list); i++ )
        {
            pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
            page = &frame_table[pfn];

            switch ( page->u.inuse.type_info & PGT_type_mask )
            {
            case PGT_l1_page_table:
            case PGT_l2_page_table:
                pt = map_domain_mem(pfn<<PAGE_SHIFT);
                for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
                    if ( (pt[i] & _PAGE_PRESENT) &&
                         ((pt[i] >> PAGE_SHIFT) == xpfn) )
                        printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
                               d->id, i, pfn, page->u.inuse.type_info,
                               page->count_info);
                unmap_domain_mem(pt);
            }

            list_ent = frame_table[pfn].list.next;
        }
    }

    void scan_for_pfn_remote(unsigned long xpfn)
    {
        struct domain *e;
        for_each_domain ( e )
            scan_for_pfn( e, xpfn );
    }
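
    /*
     * scan_for_pfn() walks a domain's page list and reports every present
     * L1/L2 entry that maps the given pfn; scan_for_pfn_remote() repeats
     * that search across all domains.  Note that the loop variable 'i' is
     * reused as both the page-list index and the page-table-entry index
     * inside the switch; the walk still terminates because list_ent, not
     * 'i', controls the outer loop.
     */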
    int i;
    unsigned long pfn;
    struct list_head *list_ent;
    struct pfn_info *page;

    if ( d != current->domain )
        domain_pause(d);
    synchronise_pagetables(~0UL);

    printk("pt base=%lx sh_info=%x\n",
           pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT,
           virt_to_page(d->shared_info)-frame_table);

    spin_lock(&d->page_alloc_lock);
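
    /*
     * The audit proper runs in three passes over d's page list, with the
     * domain paused (unless it is the caller's own) and the page allocator
     * lock held:
     *   Phase 0 - sanity-check ownership and count relationships, and
     *             stash each page's original type_info in
     *             tlbflush_timestamp.
     *   Phase 1 - walk every L1/L2 page table and subtract the references
     *             they account for, via adjust().
     *   Phase 2 - with those references removed, the remaining type count
     *             should be zero and the general count one; report
     *             anything else and hunt for the stray mappings.
     *   Phase 3 - re-add the references removed in Phase 1, restoring the
     *             original counts, and clear the stashed timestamps.
     */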
    /* PHASE 0 */

    list_ent = d->page_list.next;
    for ( i = 0; (list_ent != &d->page_list); i++ )
    {
        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
        page = &frame_table[pfn];

        if ( page->u.inuse.domain != d )
            BUG();

        if ( (page->u.inuse.type_info & PGT_count_mask) >
             (page->count_info & PGC_count_mask) )
            printk("taf > caf %x %x pfn=%lx\n",
                   page->u.inuse.type_info, page->count_info, pfn );

#if 0   /* SYSV shared memory pages plus writeable files. */
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
             (page->u.inuse.type_info & PGT_count_mask) > 1 )
        {
            printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
                   pfn,
                   page->u.inuse.type_info,
                   page->count_info );
            scan_for_pfn_remote(pfn);
        }
#endif

        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
             (page->u.inuse.type_info & PGT_count_mask) > 1 )
        {
            printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
                   pfn,
                   page->u.inuse.type_info,
                   page->count_info );
        }

        /* Use tlbflush_timestamp to store original type_info. */
        page->tlbflush_timestamp = page->u.inuse.type_info;

        list_ent = frame_table[pfn].list.next;
    }
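
    /*
     * The tlbflush_timestamp field is reused above as scratch space for
     * the pre-audit type_info; Phase 2 prints it back as the "ot="
     * (original type) value, and Phase 3 zeroes it again before the
     * function returns.
     */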
    /* PHASE 1 */

    adjust(&frame_table[pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT], -1, 1);

    list_ent = d->page_list.next;
    for ( i = 0; (list_ent != &d->page_list); i++ )
    {
        unsigned long *pt;
        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
        page = &frame_table[pfn];

        if ( page->u.inuse.domain != d )
            BUG();

        switch ( page->u.inuse.type_info & PGT_type_mask )
        {
        case PGT_l2_page_table:

            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
                printk("Audit %d: L2 not validated %x\n",
                       d->id, page->u.inuse.type_info);

            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
                printk("Audit %d: L2 not pinned %x\n",
                       d->id, page->u.inuse.type_info);
            else
                adjust( page, -1, 1 );

            pt = map_domain_mem( pfn<<PAGE_SHIFT );

            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
            {
                if ( pt[i] & _PAGE_PRESENT )
                {
                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
                    struct pfn_info *l1page = &frame_table[l1pfn];

                    if ( l1page->u.inuse.domain != d )
                    {
                        printk("L2: Skip bizarre page belonging to other "
                               "dom %p\n", l1page->u.inuse.domain);
                        continue;
                    }

                    if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
                         PGT_l2_page_table )
                        printk("Audit %d: [%x] Found %s Linear PT "
                               "t=%x pfn=%lx\n", d->id, i,
                               (l1pfn==pfn) ? "Self" : "Other",
                               l1page->u.inuse.type_info,
                               l1pfn);
                    else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
                              PGT_l1_page_table )
                        printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
                               d->id, i,
                               l1page->u.inuse.type_info,
                               l1pfn);

                    adjust(l1page, -1, 1);
                }
            }

            unmap_domain_mem(pt);

            break;

        case PGT_l1_page_table:

            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
                adjust( page, -1, 1 );

            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
                printk("Audit %d: L1 not validated %x\n",
                       d->id, page->u.inuse.type_info);
#if 0
            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
                printk("Audit %d: L1 not pinned %x\n",
                       d->id, page->u.inuse.type_info);
#endif
            pt = map_domain_mem( pfn<<PAGE_SHIFT );

            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
            {
                if ( pt[i] & _PAGE_PRESENT )
                {
                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
                    struct pfn_info *l1page = &frame_table[l1pfn];

                    if ( l1pfn < 0x100 )
                    {
                        lowmem_mappings++;
                        continue;
                    }

                    if ( l1pfn > max_page )
                    {
                        io_mappings++;
                        continue;
                    }

                    if ( pt[i] & _PAGE_RW )
                    {
                        if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
                             PGT_l1_page_table ||
                             (l1page->u.inuse.type_info & PGT_type_mask) ==
                             PGT_l2_page_table )
                            printk("Audit %d: [%x] Illegal RW t=%x pfn=%lx\n",
                                   d->id, i,
                                   l1page->u.inuse.type_info,
                                   l1pfn);
                    }

                    if ( l1page->u.inuse.domain != d )
                    {
                        printk("Audit %d: [%lx,%x] Skip foreign page dom=%lx "
                               "pfn=%lx c=%08x t=%08x m2p=%lx\n",
                               d->id, pfn, i,
                               (unsigned long)l1page->u.inuse.domain,
                               l1pfn,
                               l1page->count_info,
                               l1page->u.inuse.type_info,
                               machine_to_phys_mapping[l1pfn]);
                        continue;
                    }

                    adjust(l1page, -1, 0);
                }
            }

            unmap_domain_mem(pt);

            break;
        }

        list_ent = frame_table[pfn].list.next;
    }

    if ( (io_mappings > 0) || (lowmem_mappings > 0) )
        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
               d->id, lowmem_mappings, io_mappings);
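
    /*
     * Phase 1 deliberately skips PTEs pointing below pfn 0x100 (the first
     * 1MB) or above max_page (presumably I/O mappings); they are only
     * tallied and reported above, and take no part in the count
     * adjustments made here.
     */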
    /* PHASE 2 */

    ctot = ttot = 0;
    list_ent = d->page_list.next;
    for ( i = 0; (list_ent != &d->page_list); i++ )
    {
        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
        page = &frame_table[pfn];

        switch ( page->u.inuse.type_info & PGT_type_mask)
        {
        case PGT_l1_page_table:
        case PGT_l2_page_table:
            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
            {
                printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
                       d->id, page->u.inuse.type_info,
                       page->tlbflush_timestamp,
                       page->count_info, pfn );
                scan_for_pfn_remote(pfn);
            }
        default:
            if ( (page->count_info & PGC_count_mask) != 1 )
            {
                printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
                       d->id,
                       page->count_info,
                       page->u.inuse.type_info,
                       page->tlbflush_timestamp, pfn );
                scan_for_pfn_remote(pfn);
            }
            break;
        }

        list_ent = frame_table[pfn].list.next;
    }
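
    /*
     * Note the fall-through in the switch above: after the type-count
     * check, L1/L2 page-table pages continue into the default arm and so
     * are also subject to the general-count check against the expected
     * value of one.
     */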
    /* PHASE 3 */

    list_ent = d->page_list.next;
    for ( i = 0; (list_ent != &d->page_list); i++ )
    {
        unsigned long *pt;
        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
        page = &frame_table[pfn];

        switch ( page->u.inuse.type_info & PGT_type_mask )
        {
        case PGT_l2_page_table:
            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
                adjust( page, 1, 1 );

            pt = map_domain_mem( pfn<<PAGE_SHIFT );

            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
            {
                if ( pt[i] & _PAGE_PRESENT )
                {
                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
                    struct pfn_info *l1page = &frame_table[l1pfn];

                    if ( l1page->u.inuse.domain == d)
                        adjust(l1page, 1, 1);
                }
            }

            unmap_domain_mem(pt);
            break;

        case PGT_l1_page_table:
            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
                adjust( page, 1, 1 );

            pt = map_domain_mem( pfn<<PAGE_SHIFT );

            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
            {
                if ( pt[i] & _PAGE_PRESENT )
                {
                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
                    struct pfn_info *l1page = &frame_table[l1pfn];

                    if ( (l1page->u.inuse.domain != d) ||
                         (l1pfn < 0x100) || (l1pfn > max_page) )
                        continue;

                    adjust(l1page, 1, 0);
                }
            }

            unmap_domain_mem(pt);
            break;
        }

        page->tlbflush_timestamp = 0;

        list_ent = frame_table[pfn].list.next;
    }

    spin_unlock(&d->page_alloc_lock);

    adjust(&frame_table[pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT], 1, 1);

    printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );

    if ( d != current->domain )
        domain_unpause(d);
}
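
/*
 * audit_domain() treats the domain's current page-table base specially:
 * its counts are dropped by one before Phase 1 and restored after the
 * audit, presumably mirroring the extra reference it holds by virtue of
 * being the installed page-table base.  audit_domains() below simply runs
 * the audit over every domain, and audit_domains_key() appears to be the
 * debug keypress entry point for it.
 */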
void audit_domains(void)
{
    struct domain *d;
    for_each_domain ( d )
        audit_domain(d);
}

void audit_domains_key(unsigned char key)
{
    audit_domains();
}

#endif