debuggers.hg

view xen/arch/x86/memory.c @ 3632:fec8b1778268

bitkeeper revision 1.1159.212.60 (41febc4bKKSkh9u-Zes9v2CmBuLZxA)

More bootstrap fixes for x86/64. Next thing to do is sort out the IDT and
get traps.c working; then we can get rid of a bunch of dummy labels from
end of boot/x86_64.S. We're also going to need some kind of entry.S before
we can safely enable interrupts. Also bear in mind that not all of physical
RAM may be mapped (only first 1GB) and no m2p table is yet allocated or
mapped. Plenty to be done!
author kaf24@viper.(none)
date Mon Jan 31 23:16:27 2005 +0000 (2005-01-31)
parents 6d98eb831816
children 9a9c5a491401 e6af5d8f8b39 fd1dd0663b09
line source
1 /******************************************************************************
2 * arch/x86/memory.c
3 *
4 * Copyright (c) 2002-2004 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may be used in none of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, the OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
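/*
 * [Editorial sketch -- not part of the original file.] A guest-side view of
 * the (ptr, val) request list described above, using only names visible in
 * this file (mmu_update_t, MMU_NORMAL_PT_UPDATE, MMU_MACHPHYS_UPDATE,
 * do_mmu_update). The variables pte_maddr, new_pte, frame_maddr and
 * pseudophys_pfn are hypothetical placeholders.
 *
 *     mmu_update_t req[2];
 *     unsigned int done = 0;
 *
 *     // *ptr = val on a page-table entry; the command is encoded in ptr's low bits.
 *     req[0].ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
 *     req[0].val = new_pte;
 *
 *     // Update the machine-to-physical entry for a frame.
 *     req[1].ptr = frame_maddr | MMU_MACHPHYS_UPDATE;
 *     req[1].val = pseudophys_pfn;
 *
 *     // The guest submits the batch via its mmu_update hypercall wrapper;
 *     // Xen then runs do_mmu_update(req, 2, &done) on its behalf.
 */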
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/sched.h>
93 #include <xen/errno.h>
94 #include <xen/perfc.h>
95 #include <xen/irq.h>
96 #include <xen/softirq.h>
97 #include <asm/shadow.h>
98 #include <asm/page.h>
99 #include <asm/flushtlb.h>
100 #include <asm/io.h>
101 #include <asm/uaccess.h>
102 #include <asm/domain_page.h>
103 #include <asm/ldt.h>
105 #ifdef VERBOSE
106 #define MEM_LOG(_f, _a...) \
107 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
108 current->domain->id , __LINE__ , ## _a )
109 #else
110 #define MEM_LOG(_f, _a...) ((void)0)
111 #endif
113 static int alloc_l2_table(struct pfn_info *page);
114 static int alloc_l1_table(struct pfn_info *page);
115 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
116 static int get_page_and_type_from_pagenr(unsigned long page_nr,
117 u32 type,
118 struct domain *d);
120 static void free_l2_table(struct pfn_info *page);
121 static void free_l1_table(struct pfn_info *page);
123 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
124 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
126 /* Used to defer flushing of memory structures. */
127 static struct {
128 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
129 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
130 unsigned long deferred_ops;
131 /* If non-NULL, specifies a foreign subject domain for some operations. */
132 struct domain *foreign;
133 } __cacheline_aligned percpu_info[NR_CPUS];
135 /*
136 * Returns the current foreign domain; defaults to the currently-executing
137 * domain if a foreign override hasn't been specified.
138 */
139 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
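/*
 * Editorial note: 'a ? : b' is the GNU C "omitted middle operand" form of the
 * conditional operator. It evaluates to 'a' when 'a' is non-zero (here, a
 * non-NULL foreign domain pointer) and to 'b' otherwise, evaluating 'a' once.
 */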
141 /* Private domain structs for DOMID_XEN and DOMID_IO. */
142 static struct domain *dom_xen, *dom_io;
144 /* Frame table and its size in pages. */
145 struct pfn_info *frame_table;
146 unsigned long frame_table_size;
147 unsigned long max_page;
149 void __init init_frametable(void)
150 {
151 unsigned long i, p;
153 frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
154 frame_table_size = max_page * sizeof(struct pfn_info);
155 frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
157 for ( i = 0; i < frame_table_size; i += (4UL << 20) )
158 {
159 p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
160 if ( p == 0 )
161 panic("Not enough memory for frame table\n");
162 map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p,
163 4UL << 20, PAGE_HYPERVISOR);
164 }
166 memset(frame_table, 0, frame_table_size);
167 }
169 void arch_init_memory(void)
170 {
171 unsigned long i;
173 /*
174 * We are rather picky about the layout of 'struct pfn_info'. The
175 * count_info and domain fields must be adjacent, as we perform atomic
176 * 64-bit operations on them. Also, just for sanity, we assert the size
177 * of the structure here.
178 */
179 if ( (offsetof(struct pfn_info, u.inuse.domain) !=
180 (offsetof(struct pfn_info, count_info) + sizeof(u32))) ||
181 (sizeof(struct pfn_info) != 24) )
182 {
183 printk("Weird pfn_info layout (%ld,%ld,%d)\n",
184 offsetof(struct pfn_info, count_info),
185 offsetof(struct pfn_info, u.inuse.domain),
186 sizeof(struct pfn_info));
187 for ( ; ; ) ;
188 }
190 memset(percpu_info, 0, sizeof(percpu_info));
192 /* Initialise to a magic of 0x55555555 so it's easier to spot bugs later. */
193 memset(machine_to_phys_mapping, 0x55, 4<<20);
195 /*
196 * Initialise our DOMID_XEN domain.
197 * Any Xen-heap pages that we will allow to be mapped will have
198 * their domain field set to dom_xen.
199 */
200 dom_xen = alloc_domain_struct();
201 atomic_set(&dom_xen->refcnt, 1);
202 dom_xen->id = DOMID_XEN;
204 /*
205 * Initialise our DOMID_IO domain.
206 * This domain owns no pages but is considered a special case when
207 * mapping I/O pages, as the mappings are made with the privileges of the caller.
208 */
209 dom_io = alloc_domain_struct();
210 atomic_set(&dom_io->refcnt, 1);
211 dom_io->id = DOMID_IO;
213 /* M2P table is mappable read-only by privileged domains. */
214 for ( i = 0; i < 1024; i++ )
215 {
216 frame_table[m2p_start_mfn+i].count_info = PGC_allocated | 1;
217 /* Set the GDT page type to make sure these frames are only ever
218 mapped read-only by non-privileged domains. */
219 frame_table[m2p_start_mfn+i].u.inuse.type_info = PGT_gdt_page | 1;
220 frame_table[m2p_start_mfn+i].u.inuse.domain = dom_xen;
221 }
222 }
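/*
 * [Editorial note.] The pfn_info layout check in arch_init_memory() above
 * matters because the page transfer/reassignment paths later in this file
 * (the LOCK'ed cmpxchg8b sequences in do_extended_command()) treat
 * { count_info, u.inuse.domain } as one 64-bit word: owner and reference
 * count are compared and swapped in a single atomic operation, which only
 * works if the two 32-bit fields are adjacent.
 */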
224 static void __invalidate_shadow_ldt(struct exec_domain *d)
225 {
226 int i;
227 unsigned long pfn;
228 struct pfn_info *page;
230 d->mm.shadow_ldt_mapcnt = 0;
232 for ( i = 16; i < 32; i++ )
233 {
234 pfn = l1_pgentry_to_pagenr(d->mm.perdomain_ptes[i]);
235 if ( pfn == 0 ) continue;
236 d->mm.perdomain_ptes[i] = mk_l1_pgentry(0);
237 page = &frame_table[pfn];
238 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
239 ASSERT_PAGE_IS_DOMAIN(page, d->domain);
240 put_page_and_type(page);
241 }
243 /* Dispose of the (now possibly invalid) mappings from the TLB. */
244 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
245 }
248 static inline void invalidate_shadow_ldt(struct exec_domain *d)
249 {
250 if ( d->mm.shadow_ldt_mapcnt != 0 )
251 __invalidate_shadow_ldt(d);
252 }
255 static int alloc_segdesc_page(struct pfn_info *page)
256 {
257 unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
258 int i;
260 for ( i = 0; i < 512; i++ )
261 if ( unlikely(!check_descriptor(&descs[i*2])) )
262 goto fail;
264 unmap_domain_mem(descs);
265 return 1;
267 fail:
268 unmap_domain_mem(descs);
269 return 0;
270 }
273 /* Map shadow page at offset @off. */
274 int map_ldt_shadow_page(unsigned int off)
275 {
276 struct exec_domain *ed = current;
277 struct domain *d = ed->domain;
278 unsigned long l1e;
280 if ( unlikely(in_irq()) )
281 BUG();
283 __get_user(l1e, (unsigned long *)&linear_pg_table[(ed->mm.ldt_base >>
284 PAGE_SHIFT) + off]);
286 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
287 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
288 d, PGT_ldt_page)) )
289 return 0;
291 ed->mm.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
292 ed->mm.shadow_ldt_mapcnt++;
294 return 1;
295 }
298 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
299 {
300 struct pfn_info *page = &frame_table[page_nr];
302 if ( unlikely(!pfn_is_ram(page_nr)) )
303 {
304 MEM_LOG("Pfn %08lx is not RAM", page_nr);
305 return 0;
306 }
308 if ( unlikely(!get_page(page, d)) )
309 {
310 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
311 return 0;
312 }
314 return 1;
315 }
318 static int get_page_and_type_from_pagenr(unsigned long page_nr,
319 u32 type,
320 struct domain *d)
321 {
322 struct pfn_info *page = &frame_table[page_nr];
324 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
325 return 0;
327 if ( unlikely(!get_page_type(page, type)) )
328 {
329 #ifdef VERBOSE
330 if ( (type & PGT_type_mask) != PGT_l1_page_table )
331 MEM_LOG("Bad page type for pfn %08lx (%08x)",
332 page_nr, page->u.inuse.type_info);
333 #endif
334 put_page(page);
335 return 0;
336 }
338 return 1;
339 }
342 /*
343 * We allow L2 tables to map each other (a.k.a. linear page tables). This
344 * needs some special care with reference counts and access permissions:
345 * 1. The mapping entry must be read-only, or the guest may get write access
346 * to its own PTEs.
347 * 2. We must only bump the reference counts for an *already validated*
348 * L2 table, or we can end up in a deadlock in get_page_type(), waiting
349 * on a validation that cannot itself complete until this one does.
350 * 3. We only need to increment the reference counts for the mapped page
351 * frame if it is mapped by a different L2 table. This is sufficient and
352 * also necessary to allow validation of an L2 table mapping itself.
353 */
354 static int
355 get_linear_pagetable(
356 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
357 {
358 u32 x, y;
359 struct pfn_info *page;
361 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
362 {
363 MEM_LOG("Attempt to create linear p.t. with write perms");
364 return 0;
365 }
367 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
368 {
369 /* Make sure the mapped frame belongs to the correct domain. */
370 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
371 return 0;
373 /*
374 * Make sure that the mapped frame is an already-validated L2 table.
375 * If so, atomically increment the count (checking for overflow).
376 */
377 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
378 y = page->u.inuse.type_info;
379 do {
380 x = y;
381 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
382 unlikely((x & (PGT_type_mask|PGT_validated)) !=
383 (PGT_l2_page_table|PGT_validated)) )
384 {
385 put_page(page);
386 return 0;
387 }
388 }
389 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
390 }
392 return 1;
393 }
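/*
 * [Editorial illustration.] Concretely, a linear page table mapping is an L2
 * slot whose target frame is itself an L2 page table -- usually the very L2
 * containing the slot, which exposes every page-table page through a fixed
 * window of virtual address space. Xen installs its own instance of this in
 * alloc_l2_table() below (the LINEAR_PT_VIRT_START slot). A guest requesting
 * such a mapping through mod_l2_entry() is subject to the rules above: the
 * entry must be read-only and the target L2 must already be validated.
 */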
396 static int
397 get_page_from_l1e(
398 l1_pgentry_t l1e, struct domain *d)
399 {
400 unsigned long l1v = l1_pgentry_val(l1e);
401 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
402 struct pfn_info *page = &frame_table[pfn];
403 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
405 if ( !(l1v & _PAGE_PRESENT) )
406 return 1;
408 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
409 {
410 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
411 return 0;
412 }
414 if ( unlikely(!pfn_is_ram(pfn)) )
415 {
416 /* Revert to caller privileges if FD == DOMID_IO. */
417 if ( d == dom_io )
418 d = current->domain;
420 if ( IS_PRIV(d) )
421 return 1;
423 if ( IS_CAPABLE_PHYSDEV(d) )
424 return domain_iomem_in_pfn(d, pfn);
426 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
427 return 0;
428 }
430 return ((l1v & _PAGE_RW) ?
431 get_page_and_type(page, d, PGT_writable_page) :
432 get_page(page, d));
433 }
436 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
437 static int
438 get_page_from_l2e(
439 l2_pgentry_t l2e, unsigned long pfn,
440 struct domain *d, unsigned long va_idx)
441 {
442 int rc;
444 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
445 return 1;
447 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
448 {
449 MEM_LOG("Bad L2 page type settings %04lx",
450 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
451 return 0;
452 }
454 rc = get_page_and_type_from_pagenr(
455 l2_pgentry_to_pagenr(l2e),
456 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
458 if ( unlikely(!rc) )
459 return get_linear_pagetable(l2e, pfn, d);
461 return 1;
462 }
465 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
466 {
467 unsigned long l1v = l1_pgentry_val(l1e);
468 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
469 struct pfn_info *page = &frame_table[pfn];
470 struct domain *e;
472 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
473 return;
475 e = page->u.inuse.domain;
476 if ( unlikely(e != d) )
477 {
478 /*
479 * Unmap a foreign page that may have been mapped via a grant table.
480 * Note that this can fail for a privileged domain that can map foreign
481 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
482 * counted via a grant entry and some counted directly in the page
483 * structure's reference count. Note that reference counts won't get
484 * dangerously confused as long as we always try to decrement the
485 * grant entry first. We may end up with a mismatch between which
486 * mappings and which unmappings are counted via the grant entry, but
487 * really it doesn't matter as privileged domains have carte blanche.
488 */
489 if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
490 return;
491 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
492 }
494 if ( l1v & _PAGE_RW )
495 {
496 put_page_and_type(page);
497 }
498 else
499 {
500 /* We expect this to be rare, so we just blow away the entire shadow LDT. */
501 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
502 PGT_ldt_page)) &&
503 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
504 invalidate_shadow_ldt(e->exec_domain[0]);
505 put_page(page);
506 }
507 }
510 /*
511 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
512 * Note also that this automatically deals correctly with linear p.t.'s.
513 */
514 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
515 {
516 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
517 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
518 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
519 }
522 static int alloc_l2_table(struct pfn_info *page)
523 {
524 struct domain *d = page->u.inuse.domain;
525 unsigned long page_nr = page_to_pfn(page);
526 l2_pgentry_t *pl2e;
527 int i;
529 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
531 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
532 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
533 goto fail;
535 #if defined(__i386__)
536 /* Now we add our private high mappings. */
537 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
538 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
539 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
540 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
541 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
542 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
543 mk_l2_pgentry(__pa(page->u.inuse.domain->mm_perdomain_pt) |
544 __PAGE_HYPERVISOR);
545 #endif
547 unmap_domain_mem(pl2e);
548 return 1;
550 fail:
551 while ( i-- > 0 )
552 put_page_from_l2e(pl2e[i], page_nr);
554 unmap_domain_mem(pl2e);
555 return 0;
556 }
559 static int alloc_l1_table(struct pfn_info *page)
560 {
561 struct domain *d = page->u.inuse.domain;
562 unsigned long page_nr = page_to_pfn(page);
563 l1_pgentry_t *pl1e;
564 int i;
566 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
568 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
569 if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
570 goto fail;
572 unmap_domain_mem(pl1e);
573 return 1;
575 fail:
576 while ( i-- > 0 )
577 put_page_from_l1e(pl1e[i], d);
579 unmap_domain_mem(pl1e);
580 return 0;
581 }
584 static void free_l2_table(struct pfn_info *page)
585 {
586 unsigned long page_nr = page - frame_table;
587 l2_pgentry_t *pl2e;
588 int i;
590 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
592 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
593 put_page_from_l2e(pl2e[i], page_nr);
595 unmap_domain_mem(pl2e);
596 }
599 static void free_l1_table(struct pfn_info *page)
600 {
601 struct domain *d = page->u.inuse.domain;
602 unsigned long page_nr = page - frame_table;
603 l1_pgentry_t *pl1e;
604 int i;
606 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
608 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
609 put_page_from_l1e(pl1e[i], d);
611 unmap_domain_mem(pl1e);
612 }
615 static inline int update_l2e(l2_pgentry_t *pl2e,
616 l2_pgentry_t ol2e,
617 l2_pgentry_t nl2e)
618 {
619 unsigned long o = cmpxchg((unsigned long *)pl2e,
620 l2_pgentry_val(ol2e),
621 l2_pgentry_val(nl2e));
622 if ( o != l2_pgentry_val(ol2e) )
623 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
624 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
625 return (o == l2_pgentry_val(ol2e));
626 }
629 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
630 static int mod_l2_entry(l2_pgentry_t *pl2e,
631 l2_pgentry_t nl2e,
632 unsigned long pfn)
633 {
634 l2_pgentry_t ol2e;
635 unsigned long _ol2e;
637 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
638 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
639 {
640 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
641 return 0;
642 }
644 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
645 return 0;
646 ol2e = mk_l2_pgentry(_ol2e);
648 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
649 {
650 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
651 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
652 return update_l2e(pl2e, ol2e, nl2e);
654 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
655 ((unsigned long)pl2e &
656 ~PAGE_MASK) >> 2)) )
657 return 0;
659 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
660 {
661 put_page_from_l2e(nl2e, pfn);
662 return 0;
663 }
665 put_page_from_l2e(ol2e, pfn);
666 return 1;
667 }
669 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
670 return 0;
672 put_page_from_l2e(ol2e, pfn);
673 return 1;
674 }
677 static inline int update_l1e(l1_pgentry_t *pl1e,
678 l1_pgentry_t ol1e,
679 l1_pgentry_t nl1e)
680 {
681 unsigned long o = l1_pgentry_val(ol1e);
682 unsigned long n = l1_pgentry_val(nl1e);
684 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
685 unlikely(o != l1_pgentry_val(ol1e)) )
686 {
687 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
688 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
689 return 0;
690 }
692 return 1;
693 }
696 /* Update the L1 entry at pl1e to new value nl1e. */
697 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
698 {
699 l1_pgentry_t ol1e;
700 unsigned long _ol1e;
701 struct domain *d = current->domain;
703 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
704 {
705 MEM_LOG("Bad get_user\n");
706 return 0;
707 }
709 ol1e = mk_l1_pgentry(_ol1e);
711 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
712 {
713 /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
714 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
715 return update_l1e(pl1e, ol1e, nl1e);
717 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
718 return 0;
720 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
721 {
722 put_page_from_l1e(nl1e, d);
723 return 0;
724 }
726 put_page_from_l1e(ol1e, d);
727 return 1;
728 }
730 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
731 return 0;
733 put_page_from_l1e(ol1e, d);
734 return 1;
735 }
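/*
 * [Editorial example.] The '& ~0xffc' test in mod_l1_entry() above takes the
 * fast path only when the old and new PTEs differ solely in bits 2-11, i.e.
 * in attribute bits other than _PAGE_PRESENT and _PAGE_RW, so no reference
 * counts can change. For instance ol1e = 0x12345067 and nl1e = 0x12345027
 * differ only in the dirty bit (0x40); 0x40 & ~0xffc == 0, so update_l1e()
 * is called directly. A change to the frame number (bits 12-31), _PAGE_RW
 * (bit 1) or _PAGE_PRESENT (bit 0) forces the full get/put path. The L2
 * variant uses '& ~0xffe', additionally treating _PAGE_RW as an
 * attribute-only change at that level.
 */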
738 int alloc_page_type(struct pfn_info *page, unsigned int type)
739 {
740 switch ( type )
741 {
742 case PGT_l1_page_table:
743 return alloc_l1_table(page);
744 case PGT_l2_page_table:
745 return alloc_l2_table(page);
746 case PGT_gdt_page:
747 case PGT_ldt_page:
748 return alloc_segdesc_page(page);
749 default:
750 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
751 type, page->u.inuse.type_info,
752 page->count_info);
753 BUG();
754 }
756 return 0;
757 }
760 void free_page_type(struct pfn_info *page, unsigned int type)
761 {
762 struct domain *d = page->u.inuse.domain;
764 switch ( type )
765 {
766 case PGT_l1_page_table:
767 free_l1_table(page);
768 break;
770 case PGT_l2_page_table:
771 free_l2_table(page);
772 break;
774 default:
775 BUG();
776 }
778 if ( unlikely(d->exec_domain[0]->mm.shadow_mode) &&
779 (get_shadow_status(&d->exec_domain[0]->mm, page_to_pfn(page)) & PSH_shadowed) )
780 {
781 unshadow_table(page_to_pfn(page), type);
782 put_shadow_status(&d->exec_domain[0]->mm);
783 }
784 }
787 void put_page_type(struct pfn_info *page)
788 {
789 u32 nx, x, y = page->u.inuse.type_info;
791 again:
792 do {
793 x = y;
794 nx = x - 1;
796 ASSERT((x & PGT_count_mask) != 0);
798 /*
799 * The page should always be validated while a reference is held. The
800 * exception is during domain destruction, when we forcibly invalidate
801 * page-table pages if we detect a referential loop.
802 * See domain.c:relinquish_list().
803 */
804 ASSERT((x & PGT_validated) ||
805 test_bit(DF_DYING, &page->u.inuse.domain->d_flags));
807 if ( unlikely((nx & PGT_count_mask) == 0) )
808 {
809 /* Record TLB information for flush later. Races are harmless. */
810 page->tlbflush_timestamp = tlbflush_current_time();
812 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
813 likely(nx & PGT_validated) )
814 {
815 /*
816 * Page-table pages must be unvalidated when count is zero. The
817 * 'free' is safe because the refcnt is non-zero and validated
818 * bit is clear => other ops will spin or fail.
819 */
820 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
821 x & ~PGT_validated)) != x) )
822 goto again;
823 /* We cleared the 'valid bit' so we must do the clean-up. */
824 free_page_type(page, x & PGT_type_mask);
825 /* Carry on, but with the 'valid bit' now clear. */
826 x &= ~PGT_validated;
827 nx &= ~PGT_validated;
828 }
829 }
830 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
831 (PGT_pinned | 1)) )
832 {
833 /* Page is now only pinned. Make the back pointer mutable again. */
834 nx |= PGT_va_mutable;
835 }
836 }
837 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
838 }
841 int get_page_type(struct pfn_info *page, u32 type)
842 {
843 u32 nx, x, y = page->u.inuse.type_info;
845 again:
846 do {
847 x = y;
848 nx = x + 1;
849 if ( unlikely((nx & PGT_count_mask) == 0) )
850 {
851 MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
852 return 0;
853 }
854 else if ( unlikely((x & PGT_count_mask) == 0) )
855 {
856 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
857 {
858 /*
859 * On type change we check to flush stale TLB entries. This
860 * may be unnecessary (e.g., page was GDT/LDT) but those
861 * circumstances should be very rare.
862 */
863 struct domain *d = page->u.inuse.domain;
864 if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
865 page->tlbflush_timestamp)) )
866 {
867 perfc_incr(need_flush_tlb_flush);
868 flush_tlb_cpu(d->exec_domain[0]->processor);
869 }
871 /* We lose existing type, back pointer, and validity. */
872 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
873 nx |= type;
875 /* No special validation needed for writable pages. */
876 /* Page tables and GDT/LDT need to be scanned for validity. */
877 if ( type == PGT_writable_page )
878 nx |= PGT_validated;
879 }
880 }
881 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
882 {
883 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
884 {
885 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
886 ((type & PGT_type_mask) != PGT_l1_page_table) )
887 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
888 x & PGT_type_mask, type, page_to_pfn(page));
889 return 0;
890 }
891 else if ( (x & PGT_va_mask) == PGT_va_mutable )
892 {
893 /* The va backpointer is mutable, hence we update it. */
894 nx &= ~PGT_va_mask;
895 nx |= type; /* we know the actual type is correct */
896 }
897 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
898 {
899 /* This table is potentially mapped at multiple locations. */
900 nx &= ~PGT_va_mask;
901 nx |= PGT_va_unknown;
902 }
903 }
904 else if ( unlikely(!(x & PGT_validated)) )
905 {
906 /* Someone else is updating validation of this page. Wait... */
907 while ( (y = page->u.inuse.type_info) == x )
908 {
909 rep_nop();
910 barrier();
911 }
912 goto again;
913 }
914 }
915 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
917 if ( unlikely(!(nx & PGT_validated)) )
918 {
919 /* Try to validate page type; drop the new reference on failure. */
920 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
921 {
922 MEM_LOG("Error while validating pfn %08lx for type %08x."
923 " caf=%08x taf=%08x\n",
924 page_to_pfn(page), type,
925 page->count_info,
926 page->u.inuse.type_info);
927 /* No one else can get a reference. We hold the only ref. */
928 page->u.inuse.type_info = 0;
929 return 0;
930 }
932 /* No one else is updating simultaneously. */
933 __set_bit(_PGT_validated, &page->u.inuse.type_info);
934 }
936 return 1;
937 }
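/*
 * [Editorial note.] get_page_type() and put_page_type() above share the same
 * lock-free update idiom on type_info:
 *
 *     y = page->u.inuse.type_info;
 *     do {
 *         x  = y;              // snapshot the whole word
 *         nx = ...;            // compute the desired successor of x
 *     } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
 *
 * If another CPU modifies the word between the snapshot and the cmpxchg, the
 * cmpxchg fails and returns the fresh value, and the loop recomputes nx from
 * it; the count, type, va backpointer and PGT_validated flag therefore always
 * move together as one consistent 32-bit word.
 */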
940 int new_guest_cr3(unsigned long pfn)
941 {
942 struct exec_domain *ed = current;
943 struct domain *d = ed->domain;
944 int okay, cpu = smp_processor_id();
945 unsigned long old_base_pfn;
947 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
948 if ( likely(okay) )
949 {
950 invalidate_shadow_ldt(ed);
952 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
953 old_base_pfn = pagetable_val(ed->mm.pagetable) >> PAGE_SHIFT;
954 ed->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
956 shadow_mk_pagetable(&ed->mm);
958 write_ptbase(&ed->mm);
960 put_page_and_type(&frame_table[old_base_pfn]);
961 }
962 else
963 {
964 MEM_LOG("Error while installing new baseptr %08lx", pfn);
965 }
967 return okay;
968 }
970 static int do_extended_command(unsigned long ptr, unsigned long val)
971 {
972 int okay = 1, cpu = smp_processor_id();
973 unsigned int cmd = val & MMUEXT_CMD_MASK;
974 unsigned long pfn = ptr >> PAGE_SHIFT;
975 struct pfn_info *page = &frame_table[pfn];
976 struct exec_domain *ed = current;
977 struct domain *d = ed->domain, *nd, *e;
978 u32 x, y;
979 domid_t domid;
980 grant_ref_t gntref;
982 switch ( cmd )
983 {
984 case MMUEXT_PIN_L1_TABLE:
985 case MMUEXT_PIN_L2_TABLE:
986 /*
987 * We insist that, if you pin an L1 page, it's the first thing that
988 * you do to it. This is because we require the backptr to still be
989 * mutable. This assumption seems safe.
990 */
991 okay = get_page_and_type_from_pagenr(
992 pfn,
993 ((cmd==MMUEXT_PIN_L2_TABLE) ?
994 PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
995 FOREIGNDOM);
997 if ( unlikely(!okay) )
998 {
999 MEM_LOG("Error while pinning pfn %08lx", pfn);
1000 break;
1003 if ( unlikely(test_and_set_bit(_PGT_pinned,
1004 &page->u.inuse.type_info)) )
1006 MEM_LOG("Pfn %08lx already pinned", pfn);
1007 put_page_and_type(page);
1008 okay = 0;
1009 break;
1012 break;
1014 case MMUEXT_UNPIN_TABLE:
1015 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
1017 MEM_LOG("Page %08lx bad domain (dom=%p)",
1018 ptr, page->u.inuse.domain);
1020 else if ( likely(test_and_clear_bit(_PGT_pinned,
1021 &page->u.inuse.type_info)) )
1023 put_page_and_type(page);
1024 put_page(page);
1026 else
1028 okay = 0;
1029 put_page(page);
1030 MEM_LOG("Pfn %08lx not pinned", pfn);
1032 break;
1034 case MMUEXT_NEW_BASEPTR:
1035 okay = new_guest_cr3(pfn);
1036 break;
1038 case MMUEXT_TLB_FLUSH:
1039 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1040 break;
1042 case MMUEXT_INVLPG:
1043 __flush_tlb_one(ptr);
1044 break;
1046 case MMUEXT_FLUSH_CACHE:
1047 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1049 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1050 okay = 0;
1052 else
1054 wbinvd();
1056 break;
1058 case MMUEXT_SET_LDT:
1060 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
1061 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1062 (ents > 8192) ||
1063 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
1064 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
1066 okay = 0;
1067 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
1069 else if ( (ed->mm.ldt_ents != ents) ||
1070 (ed->mm.ldt_base != ptr) )
1072 invalidate_shadow_ldt(ed);
1073 ed->mm.ldt_base = ptr;
1074 ed->mm.ldt_ents = ents;
1075 load_LDT(ed);
1076 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1077 if ( ents != 0 )
1078 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1080 break;
1083 case MMUEXT_SET_FOREIGNDOM:
1084 domid = (domid_t)(val >> 16);
1086 if ( (e = percpu_info[cpu].foreign) != NULL )
1087 put_domain(e);
1088 percpu_info[cpu].foreign = NULL;
1090 if ( !IS_PRIV(d) )
1092 switch ( domid )
1094 case DOMID_IO:
1095 get_knownalive_domain(dom_io);
1096 percpu_info[cpu].foreign = dom_io;
1097 break;
1098 default:
1099 MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
1100 okay = 0;
1101 break;
1104 else
1106 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1107 if ( e == NULL )
1109 switch ( domid )
1111 case DOMID_XEN:
1112 get_knownalive_domain(dom_xen);
1113 percpu_info[cpu].foreign = dom_xen;
1114 break;
1115 case DOMID_IO:
1116 get_knownalive_domain(dom_io);
1117 percpu_info[cpu].foreign = dom_io;
1118 break;
1119 default:
1120 MEM_LOG("Unknown domain '%u'", domid);
1121 okay = 0;
1122 break;
1126 break;
1128 case MMUEXT_TRANSFER_PAGE:
1129 domid = (domid_t)(val >> 16);
1130 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
1132 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
1133 unlikely(!pfn_is_ram(pfn)) ||
1134 unlikely((e = find_domain_by_id(domid)) == NULL) )
1136 MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
1137 okay = 0;
1138 break;
1141 spin_lock(&d->page_alloc_lock);
1143 /*
1144 * The tricky bit: atomically release ownership while there is just one
1145 * benign reference to the page (PGC_allocated). If that reference
1146 * disappears then the deallocation routine will safely spin.
1147 */
1148 nd = page->u.inuse.domain;
1149 y = page->count_info;
1150 do {
1151 x = y;
1152 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1153 (1|PGC_allocated)) ||
1154 unlikely(nd != d) )
1156 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1157 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1158 d, d->id, nd, x, page->u.inuse.type_info);
1159 spin_unlock(&d->page_alloc_lock);
1160 put_domain(e);
1161 return 0;
1163 __asm__ __volatile__(
1164 LOCK_PREFIX "cmpxchg8b %2"
1165 : "=d" (nd), "=a" (y),
1166 "=m" (*(volatile u64 *)(&page->count_info))
1167 : "0" (d), "1" (x), "c" (NULL), "b" (x) );
1169 while ( unlikely(nd != d) || unlikely(y != x) );
1171 /*
1172 * Unlink from 'd'. At least one reference remains (now anonymous), so
1173 * no one else is spinning to try to delete this page from 'd'.
1174 */
1175 d->tot_pages--;
1176 list_del(&page->list);
1178 spin_unlock(&d->page_alloc_lock);
1180 spin_lock(&e->page_alloc_lock);
1182 /*
1183 * Check that 'e' will accept the page and has reservation headroom.
1184 * Also, a domain mustn't have PGC_allocated pages when it is dying.
1185 */
1186 ASSERT(e->tot_pages <= e->max_pages);
1187 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1188 unlikely(e->tot_pages == e->max_pages) ||
1189 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
1191 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1192 "provided a bad grant ref, or is dying (%08lx).\n",
1193 e->tot_pages, e->max_pages, e->d_flags);
1194 spin_unlock(&e->page_alloc_lock);
1195 put_domain(e);
1196 okay = 0;
1197 break;
1200 /* Okay, add the page to 'e'. */
1201 if ( unlikely(e->tot_pages++ == 0) )
1202 get_knownalive_domain(e);
1203 list_add_tail(&page->list, &e->page_list);
1204 page->u.inuse.domain = e;
1206 spin_unlock(&e->page_alloc_lock);
1208 /* Transfer is all done: tell the guest about its new page frame. */
1209 gnttab_notify_transfer(e, gntref, pfn);
1211 put_domain(e);
1212 break;
1214 case MMUEXT_REASSIGN_PAGE:
1215 if ( unlikely(!IS_PRIV(d)) )
1217 MEM_LOG("Dom %u has no reassignment priv", d->id);
1218 okay = 0;
1219 break;
1222 e = percpu_info[cpu].foreign;
1223 if ( unlikely(e == NULL) )
1225 MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
1226 okay = 0;
1227 break;
1230 /*
1231 * Grab both page_list locks, in order. This prevents the page from
1232 * disappearing elsewhere while we modify the owner, and we'll need
1233 * both locks if we're successful so that we can change lists.
1234 */
1235 if ( d < e )
1237 spin_lock(&d->page_alloc_lock);
1238 spin_lock(&e->page_alloc_lock);
1240 else
1242 spin_lock(&e->page_alloc_lock);
1243 spin_lock(&d->page_alloc_lock);
1246 /* A domain shouldn't have PGC_allocated pages when it is dying. */
1247 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1248 unlikely(IS_XEN_HEAP_FRAME(page)) )
1250 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
1251 okay = 0;
1252 goto reassign_fail;
1255 /*
1256 * The tricky bit: atomically change owner while there is just one
1257 * benign reference to the page (PGC_allocated). If that reference
1258 * disappears then the deallocation routine will safely spin.
1259 */
1260 nd = page->u.inuse.domain;
1261 y = page->count_info;
1262 do {
1263 x = y;
1264 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1265 (1|PGC_allocated)) ||
1266 unlikely(nd != d) )
1268 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1269 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1270 d, d->id, nd, x, page->u.inuse.type_info);
1271 okay = 0;
1272 goto reassign_fail;
1274 __asm__ __volatile__(
1275 LOCK_PREFIX "cmpxchg8b %3"
1276 : "=d" (nd), "=a" (y), "=c" (e),
1277 "=m" (*(volatile u64 *)(&page->count_info))
1278 : "0" (d), "1" (x), "c" (e), "b" (x) );
1280 while ( unlikely(nd != d) || unlikely(y != x) );
1282 /*
1283 * Unlink from 'd'. We transferred at least one reference to 'e', so
1284 * no one else is spinning to try to delete this page from 'd'.
1285 */
1286 d->tot_pages--;
1287 list_del(&page->list);
1289 /*
1290 * Add the page to 'e'. Someone may already have removed the last
1291 * reference and want to remove the page from 'e'. However, we have
1292 * the lock so they'll spin waiting for us.
1293 */
1294 if ( unlikely(e->tot_pages++ == 0) )
1295 get_knownalive_domain(e);
1296 list_add_tail(&page->list, &e->page_list);
1298 reassign_fail:
1299 spin_unlock(&d->page_alloc_lock);
1300 spin_unlock(&e->page_alloc_lock);
1301 break;
1303 case MMUEXT_CLEAR_FOREIGNDOM:
1304 if ( (e = percpu_info[cpu].foreign) != NULL )
1305 put_domain(e);
1306 percpu_info[cpu].foreign = NULL;
1307 break;
1309 default:
1310 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
1311 okay = 0;
1312 break;
1315 return okay;
1318 int do_mmu_update(
1319 mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
1321 /*
1322 * We steal the m.s.b. of the @count parameter to indicate whether this
1323 * invocation of do_mmu_update() is resuming a previously preempted call.
1324 * We steal the next 15 bits to remember the current FOREIGNDOM.
1325 */
1326 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
1327 #define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
1328 #define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
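/*
 * [Editorial note.] With 32-bit ints this packs the preemption state as:
 * bits 0-15 hold the number of requests still to be processed, bits 16-30
 * hold the FOREIGNDOM id saved across the preemption, and bit 31
 * (MMU_UPDATE_PREEMPTED) marks the call as a continuation. The continuation
 * is built below as
 *   (count - i) | (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT)
 *               | MMU_UPDATE_PREEMPTED.
 */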
1330 mmu_update_t req;
1331 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
1332 struct pfn_info *page;
1333 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
1334 unsigned int cmd, done = 0;
1335 unsigned long prev_spfn = 0;
1336 l1_pgentry_t *prev_spl1e = 0;
1337 struct exec_domain *ed = current;
1338 struct domain *d = ed->domain;
1339 u32 type_info;
1340 domid_t domid;
1342 LOCK_BIGLOCK(d);
1344 cleanup_writable_pagetable(d);
1346 /*
1347 * If we are resuming after preemption, read how much work we have already
1348 * done. This allows us to set the @done output parameter correctly.
1349 * We also reset FOREIGNDOM here.
1350 */
1351 if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
1353 if ( !(count & MMU_UPDATE_PREEMPTED) )
1355 /* The count has overflowed into the private FOREIGNDOM field. */
1356 MEM_LOG("do_mmu_update count is too large");
1357 rc = -EINVAL;
1358 goto out;
1360 count &= ~MMU_UPDATE_PREEMPTED;
1361 domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
1362 count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
1363 if ( unlikely(pdone != NULL) )
1364 (void)get_user(done, pdone);
1365 if ( (domid != current->domain->id) &&
1366 !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
1368 rc = -EINVAL;
1369 goto out;
1373 perfc_incrc(calls_to_mmu_update);
1374 perfc_addc(num_page_updates, count);
1376 if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
1378 rc = -EFAULT;
1379 goto out;
1382 for ( i = 0; i < count; i++ )
1384 if ( hypercall_preempt_check() )
1386 rc = hypercall_create_continuation(
1387 __HYPERVISOR_mmu_update, 3, ureqs,
1388 (count - i) |
1389 (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) |
1390 MMU_UPDATE_PREEMPTED, pdone);
1391 break;
1394 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
1396 MEM_LOG("Bad __copy_from_user");
1397 rc = -EFAULT;
1398 break;
1401 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
1402 pfn = req.ptr >> PAGE_SHIFT;
1404 okay = 0;
1406 switch ( cmd )
1408 /*
1409 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
1410 */
1411 case MMU_NORMAL_PT_UPDATE:
1412 if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
1414 MEM_LOG("Could not get page for normal update");
1415 break;
1418 if ( likely(prev_pfn == pfn) )
1420 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
1422 else
1424 if ( prev_pfn != 0 )
1425 unmap_domain_mem((void *)va);
1426 va = (unsigned long)map_domain_mem(req.ptr);
1427 prev_pfn = pfn;
1430 page = &frame_table[pfn];
1431 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
1433 case PGT_l1_page_table:
1434 if ( likely(get_page_type(
1435 page, type_info & (PGT_type_mask|PGT_va_mask))) )
1437 okay = mod_l1_entry((l1_pgentry_t *)va,
1438 mk_l1_pgentry(req.val));
1440 if ( unlikely(ed->mm.shadow_mode) && okay &&
1441 (get_shadow_status(&ed->mm, page-frame_table) &
1442 PSH_shadowed) )
1444 shadow_l1_normal_pt_update(
1445 req.ptr, req.val, &prev_spfn, &prev_spl1e);
1446 put_shadow_status(&ed->mm);
1449 put_page_type(page);
1451 break;
1452 case PGT_l2_page_table:
1453 if ( likely(get_page_type(page, PGT_l2_page_table)) )
1455 okay = mod_l2_entry((l2_pgentry_t *)va,
1456 mk_l2_pgentry(req.val),
1457 pfn);
1459 if ( unlikely(ed->mm.shadow_mode) && okay &&
1460 (get_shadow_status(&ed->mm, page-frame_table) &
1461 PSH_shadowed) )
1463 shadow_l2_normal_pt_update(req.ptr, req.val);
1464 put_shadow_status(&ed->mm);
1467 put_page_type(page);
1469 break;
1470 default:
1471 if ( likely(get_page_type(page, PGT_writable_page)) )
1473 *(unsigned long *)va = req.val;
1474 okay = 1;
1475 put_page_type(page);
1477 break;
1480 put_page(page);
1481 break;
1483 case MMU_MACHPHYS_UPDATE:
1484 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
1486 MEM_LOG("Could not get page for mach->phys update");
1487 break;
1490 machine_to_phys_mapping[pfn] = req.val;
1491 okay = 1;
1493 /*
1494 * If in log-dirty mode, mark the corresponding pseudo-physical
1495 * page as dirty.
1496 */
1497 if ( unlikely(ed->mm.shadow_mode == SHM_logdirty) &&
1498 mark_dirty(&ed->mm, pfn) )
1499 ed->mm.shadow_dirty_block_count++;
1501 put_page(&frame_table[pfn]);
1502 break;
1504 /*
1505 * MMU_EXTENDED_COMMAND: Extended command is specified
1506 * in the least-significant bits of the 'value' field.
1507 */
1508 case MMU_EXTENDED_COMMAND:
1509 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
1510 okay = do_extended_command(req.ptr, req.val);
1511 break;
1513 default:
1514 MEM_LOG("Invalid page update command %08lx", req.ptr);
1515 break;
1518 if ( unlikely(!okay) )
1520 rc = -EINVAL;
1521 break;
1524 ureqs++;
1527 out:
1528 if ( prev_pfn != 0 )
1529 unmap_domain_mem((void *)va);
1531 if ( unlikely(prev_spl1e != 0) )
1532 unmap_domain_mem((void *)prev_spl1e);
1534 deferred_ops = percpu_info[cpu].deferred_ops;
1535 percpu_info[cpu].deferred_ops = 0;
1537 if ( deferred_ops & DOP_FLUSH_TLB )
1538 local_flush_tlb();
1540 if ( deferred_ops & DOP_RELOAD_LDT )
1541 (void)map_ldt_shadow_page(0);
1543 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1545 put_domain(percpu_info[cpu].foreign);
1546 percpu_info[cpu].foreign = NULL;
1549 /* Add incremental work we have done to the @done output parameter. */
1550 if ( unlikely(pdone != NULL) )
1551 __put_user(done + i, pdone);
1553 UNLOCK_BIGLOCK(d);
1554 return rc;
1558 int do_update_va_mapping(unsigned long page_nr,
1559 unsigned long val,
1560 unsigned long flags)
1562 struct exec_domain *ed = current;
1563 struct domain *d = ed->domain;
1564 int err = 0;
1565 unsigned int cpu = ed->processor;
1566 unsigned long deferred_ops;
1568 perfc_incrc(calls_to_update_va);
1570 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
1571 return -EINVAL;
1573 LOCK_BIGLOCK(d);
1575 cleanup_writable_pagetable(d);
1577 /*
1578 * XXX When we make this support 4MB superpages we should also deal with
1579 * the case of updating L2 entries.
1580 */
1582 if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
1583 mk_l1_pgentry(val))) )
1584 err = -EINVAL;
1586 if ( unlikely(ed->mm.shadow_mode) )
1588 unsigned long sval;
1590 l1pte_propagate_from_guest(&ed->mm, &val, &sval);
1592 if ( unlikely(__put_user(sval, ((unsigned long *)(
1593 &shadow_linear_pg_table[page_nr])))) )
1595 /*
1596 * Since L2's are guaranteed RW, failure indicates the page was not
1597 * shadowed, so ignore.
1598 */
1599 perfc_incrc(shadow_update_va_fail);
1602 /*
1603 * If we're in log-dirty mode then we need to note that we've updated
1604 * the PTE in the PT-holding page. We need the machine frame number
1605 * for this.
1606 */
1607 if ( ed->mm.shadow_mode == SHM_logdirty )
1608 mark_dirty(&current->mm, va_to_l1mfn(page_nr << PAGE_SHIFT));
1610 check_pagetable(&ed->mm, ed->mm.pagetable, "va"); /* debug */
1613 deferred_ops = percpu_info[cpu].deferred_ops;
1614 percpu_info[cpu].deferred_ops = 0;
1616 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
1617 unlikely(flags & UVMF_FLUSH_TLB) )
1618 local_flush_tlb();
1619 else if ( unlikely(flags & UVMF_INVLPG) )
1620 __flush_tlb_one(page_nr << PAGE_SHIFT);
1622 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
1623 (void)map_ldt_shadow_page(0);
1625 UNLOCK_BIGLOCK(d);
1627 return err;
1630 int do_update_va_mapping_otherdomain(unsigned long page_nr,
1631 unsigned long val,
1632 unsigned long flags,
1633 domid_t domid)
1635 unsigned int cpu = smp_processor_id();
1636 struct domain *d;
1637 int rc;
1639 if ( unlikely(!IS_PRIV(current->domain)) )
1640 return -EPERM;
1642 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
1643 if ( unlikely(d == NULL) )
1645 MEM_LOG("Unknown domain '%u'", domid);
1646 return -ESRCH;
1649 rc = do_update_va_mapping(page_nr, val, flags);
1651 put_domain(d);
1652 percpu_info[cpu].foreign = NULL;
1654 return rc;
1659 /*************************
1660 * Writable Pagetables
1661 */
1663 ptwr_info_t ptwr_info[NR_CPUS];
1665 #ifdef VERBOSE
1666 int ptwr_debug = 0x0;
1667 #define PTWR_PRINTK(_f, _a...) \
1668 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
1669 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
1670 #else
1671 #define PTWR_PRINTK(_f, _a...) ((void)0)
1672 #endif
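/*
 * [Editorial summary.] The writable-pagetable machinery below works in two
 * halves. ptwr_do_page_fault() catches a write fault on a read-only L1
 * page-table page, disconnects that page from the address space if it is
 * currently in use, takes a snapshot copy of it, and gives the guest a
 * writable mapping. ptwr_flush() later write-protects the page again, diffs
 * it against the snapshot, and re-runs the reference-counting checks
 * (get_page_from_l1e) on every entry the guest changed before reattaching
 * the page.
 */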
1674 /* Flush the given writable p.t. page and write-protect it again. */
1675 void ptwr_flush(const int which)
1677 unsigned long sstat, spte, pte, *ptep, l1va;
1678 l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e;
1679 l2_pgentry_t *pl2e;
1680 int i, cpu = smp_processor_id();
1681 struct exec_domain *ed = current;
1682 struct domain *d = ed->domain;
1684 l1va = ptwr_info[cpu].ptinfo[which].l1va;
1685 ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
1687 /*
1688 * STEP 1. Write-protect the p.t. page so no more updates can occur.
1689 */
1691 if ( unlikely(__get_user(pte, ptep)) )
1693 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1694 /*
1695 * Really a bug. We could read this PTE during the initial fault,
1696 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
1697 */
1698 BUG();
1700 PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
1701 PTWR_PRINT_WHICH, ptep, pte);
1702 pte &= ~_PAGE_RW;
1704 if ( unlikely(ed->mm.shadow_mode) )
1706 /* Write-protect the p.t. page in the shadow page table. */
1707 l1pte_propagate_from_guest(&ed->mm, &pte, &spte);
1708 __put_user(
1709 spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
1711 /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
1712 sstat = get_shadow_status(&ed->mm, pte >> PAGE_SHIFT);
1713 if ( sstat & PSH_shadowed )
1714 sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
1717 /* Write-protect the p.t. page in the guest page table. */
1718 if ( unlikely(__put_user(pte, ptep)) )
1720 MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
1721 /*
1722 * Really a bug. We could write this PTE during the initial fault,
1723 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
1724 */
1725 BUG();
1728 /* Ensure that there are no stale writable mappings in any TLB. */
1729 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
1730 #if 1
1731 __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
1732 #else
1733 flush_tlb_all();
1734 #endif
1735 PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
1736 PTWR_PRINT_WHICH, ptep, pte);
1738 /*
1739 * STEP 2. Validate any modified PTEs.
1740 */
1742 pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
1743 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1745 ol1e = ptwr_info[cpu].ptinfo[which].page[i];
1746 nl1e = pl1e[i];
1748 if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
1749 continue;
1751 /*
1752 * Fast path for PTEs that have merely been write-protected
1753 * (e.g., during a Unix fork()). A strict reduction in privilege.
1754 */
1755 if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
1757 if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
1759 if ( unlikely(sl1e != NULL) )
1760 l1pte_propagate_from_guest(
1761 &ed->mm, &l1_pgentry_val(nl1e),
1762 &l1_pgentry_val(sl1e[i]));
1763 put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
1765 continue;
1768 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
1770 MEM_LOG("ptwr: Could not re-validate l1 page\n");
1771 /*
1772 * Make the remaining p.t's consistent before crashing, so the
1773 * reference counts are correct.
1774 */
1775 memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
1776 (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
1777 unmap_domain_mem(pl1e);
1778 ptwr_info[cpu].ptinfo[which].l1va = 0;
1779 UNLOCK_BIGLOCK(d);
1780 domain_crash();
1783 if ( unlikely(sl1e != NULL) )
1784 l1pte_propagate_from_guest(
1785 &ed->mm, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
1787 if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
1788 put_page_from_l1e(ol1e, d);
1790 unmap_domain_mem(pl1e);
1792 /*
1793 * STEP 3. Reattach the L1 p.t. page into the current address space.
1794 */
1796 if ( (which == PTWR_PT_ACTIVE) && likely(!ed->mm.shadow_mode) )
1798 pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
1799 *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
1802 /*
1803 * STEP 4. Final tidy-up.
1804 */
1806 ptwr_info[cpu].ptinfo[which].l1va = 0;
1808 if ( unlikely(sl1e != NULL) )
1810 unmap_domain_mem(sl1e);
1811 put_shadow_status(&ed->mm);
1815 /* Write page fault handler: check if guest is trying to modify a PTE. */
1816 int ptwr_do_page_fault(unsigned long addr)
1818 unsigned long pte, pfn, l2e;
1819 struct pfn_info *page;
1820 l2_pgentry_t *pl2e;
1821 int which, cpu = smp_processor_id();
1822 u32 l2_idx;
1824 /*
1825 * Attempt to read the PTE that maps the VA being accessed. By checking for
1826 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
1827 */
1828 if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
1829 _PAGE_PRESENT) ||
1830 __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
1832 return 0;
1835 pfn = pte >> PAGE_SHIFT;
1836 page = &frame_table[pfn];
1838 /* We are looking only for read-only mappings of p.t. pages. */
1839 if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
1840 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
1842 return 0;
1845 /* Get the L2 index at which this L1 p.t. is always mapped. */
1846 l2_idx = page->u.inuse.type_info & PGT_va_mask;
1847 if ( unlikely(l2_idx >= PGT_va_unknown) )
1849 domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
1851 l2_idx >>= PGT_va_shift;
1853 if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
1855 MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
1856 domain_crash();
1859 /*
1860 * Is the L1 p.t. mapped into the current address space? If so we call it
1861 * an ACTIVE p.t., otherwise it is INACTIVE.
1862 */
1863 pl2e = &linear_l2_table[l2_idx];
1864 l2e = l2_pgentry_val(*pl2e);
1865 which = PTWR_PT_INACTIVE;
1866 if ( (l2e >> PAGE_SHIFT) == pfn )
1868 /* Check the PRESENT bit to set ACTIVE. */
1869 if ( likely(l2e & _PAGE_PRESENT) )
1870 which = PTWR_PT_ACTIVE;
1871 else {
1872 /*
1873 * If the PRESENT bit is clear, we may be conflicting with
1874 * the current ACTIVE p.t. (it may be the same p.t. mapped
1875 * at another virt addr).
1876 * The ptwr_flush call below will restore the PRESENT bit.
1877 */
1878 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
1879 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
1880 which = PTWR_PT_ACTIVE;
1884 PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
1885 "pfn %08lx\n", PTWR_PRINT_WHICH,
1886 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
1888 /*
1889 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
1890 * time. If there is already one, we must flush it out.
1891 */
1892 if ( ptwr_info[cpu].ptinfo[which].l1va )
1893 ptwr_flush(which);
1895 ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
1896 ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
1898 /* For safety, disconnect the L1 p.t. page from current space. */
1899 if ( (which == PTWR_PT_ACTIVE) && likely(!current->mm.shadow_mode) )
1901 *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
1902 #if 1
1903 flush_tlb(); /* XXX Multi-CPU guests? */
1904 #else
1905 flush_tlb_all();
1906 #endif
1909 /* Temporarily map the L1 page, and make a copy of it. */
1910 ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
1911 memcpy(ptwr_info[cpu].ptinfo[which].page,
1912 ptwr_info[cpu].ptinfo[which].pl1e,
1913 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
1915 /* Finally, make the p.t. page writable by the guest OS. */
1916 pte |= _PAGE_RW;
1917 PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
1918 &linear_pg_table[addr>>PAGE_SHIFT], pte);
1919 if ( unlikely(__put_user(pte, (unsigned long *)
1920 &linear_pg_table[addr>>PAGE_SHIFT])) )
1922 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
1923 &linear_pg_table[addr>>PAGE_SHIFT]);
1924 /* Toss the writable pagetable state and crash. */
1925 unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
1926 ptwr_info[cpu].ptinfo[which].l1va = 0;
1927 domain_crash();
1930 return EXCRET_fault_fixed;
1933 static __init int ptwr_init(void)
1935 int i;
1937 for ( i = 0; i < smp_num_cpus; i++ )
1939 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
1940 (void *)alloc_xenheap_page();
1941 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
1942 (void *)alloc_xenheap_page();
1945 return 0;
1947 __initcall(ptwr_init);
1952 /************************************************************************/
1953 /************************************************************************/
1954 /************************************************************************/
1956 #ifndef NDEBUG
1958 void ptwr_status(void)
1960 unsigned long pte, *ptep, pfn;
1961 struct pfn_info *page;
1962 int cpu = smp_processor_id();
1964 ptep = (unsigned long *)&linear_pg_table
1965 [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
1967 if ( __get_user(pte, ptep) ) {
1968 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1969 domain_crash();
1972 pfn = pte >> PAGE_SHIFT;
1973 page = &frame_table[pfn];
1974 printk("need to alloc l1 page %p\n", page);
1975 /* make pt page writable */
1976 printk("need to make read-only l1-page at %p is %08lx\n",
1977 ptep, pte);
1979 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
1980 return;
1982 if ( __get_user(pte, (unsigned long *)
1983 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
1984 MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
1985 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
1986 domain_crash();
1988 pfn = pte >> PAGE_SHIFT;
1989 page = &frame_table[pfn];
1992 void audit_domain(struct domain *d)
1994 int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
1996 void adjust (struct pfn_info *page, int dir, int adjtype)
1998 int count = page->count_info & PGC_count_mask;
2000 if ( adjtype )
2002 int tcount = page->u.inuse.type_info & PGT_count_mask;
2004 ttot++;
2006 tcount += dir;
2008 if ( tcount < 0 )
2010 /* This will only come out once. */
2011 printk("Audit %d: type count went below zero pfn=%x "
2012 "taf=%x otaf=%x\n",
2013 d->id, page-frame_table,
2014 page->u.inuse.type_info,
2015 page->tlbflush_timestamp);
2018 page->u.inuse.type_info =
2019 (page->u.inuse.type_info & ~PGT_count_mask) |
2020 (tcount & PGT_count_mask);
2023 ctot++;
2024 count += dir;
2025 if ( count < 0 )
2027 /* This will only come out once. */
2028 printk("Audit %d: general count went below zero pfn=%x "
2029 "taf=%x otaf=%x\n",
2030 d->id, page-frame_table,
2031 page->u.inuse.type_info,
2032 page->tlbflush_timestamp);
2035 page->count_info =
2036 (page->count_info & ~PGC_count_mask) |
2037 (count & PGC_count_mask);
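    /*
     * scan_for_pfn(): walk every L1/L2 pagetable page owned by a domain and
     * report any present entry that maps the given pfn. scan_for_pfn_remote()
     * does the same across all domains; it is used below to find who is
     * holding an unexpected reference.
     */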
    void scan_for_pfn(struct domain *d, unsigned long xpfn)
    {
        unsigned long pfn, *pt;
        struct list_head *list_ent;
        struct pfn_info *page;
        int i;

        list_ent = d->page_list.next;
        for ( i = 0; (list_ent != &d->page_list); i++ )
        {
            pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
            page = &frame_table[pfn];

            switch ( page->u.inuse.type_info & PGT_type_mask )
            {
            case PGT_l1_page_table:
            case PGT_l2_page_table:
                pt = map_domain_mem(pfn<<PAGE_SHIFT);
                for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
                    if ( (pt[i] & _PAGE_PRESENT) &&
                         ((pt[i] >> PAGE_SHIFT) == xpfn) )
                        printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
                               d->id, i, pfn, page->u.inuse.type_info,
                               page->count_info);
                unmap_domain_mem(pt);
                break;
            }

            list_ent = frame_table[pfn].list.next;
        }
    }

    void scan_for_pfn_remote(unsigned long xpfn)
    {
        struct domain *e;
        for_each_domain ( e )
            scan_for_pfn( e, xpfn );
    }
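    /*
     * Main body of audit_domain(). Pause the target domain (unless we are
     * auditing ourselves) and synchronise pagetables on all CPUs, presumably
     * so the walk below sees a stable view; the page list is then traversed
     * with the domain's page_alloc_lock held.
     */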
    int i;
    unsigned long pfn;
    struct list_head *list_ent;
    struct pfn_info *page;

    if ( d != current->domain )
        domain_pause(d);
    synchronise_pagetables(~0UL);

    printk("pt base=%lx sh_info=%x\n",
           pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT,
           virt_to_page(d->shared_info)-frame_table);

    spin_lock(&d->page_alloc_lock);

    /* PHASE 0 */
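    /*
     * Check that every page on the domain's list really belongs to it, that
     * no page's type count exceeds its general count, and warn about pages
     * whose type count looks suspicious. The original type_info is saved in
     * tlbflush_timestamp so it can be reported in later phases.
     */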
    list_ent = d->page_list.next;
    for ( i = 0; (list_ent != &d->page_list); i++ )
    {
        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
        page = &frame_table[pfn];

        if ( page->u.inuse.domain != d )
            BUG();

        if ( (page->u.inuse.type_info & PGT_count_mask) >
             (page->count_info & PGC_count_mask) )
            printk("taf > caf %x %x pfn=%lx\n",
                   page->u.inuse.type_info, page->count_info, pfn );

#if 0 /* SYSV shared memory pages plus writeable files. */
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
             (page->u.inuse.type_info & PGT_count_mask) > 1 )
        {
            printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
                   pfn,
                   page->u.inuse.type_info,
                   page->count_info );
            scan_for_pfn_remote(pfn);
        }
#endif
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
             (page->u.inuse.type_info & PGT_count_mask) > 1 )
        {
            printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
                   pfn,
                   page->u.inuse.type_info,
                   page->count_info );
        }

        /* Use tlbflush_timestamp to store original type_info. */
        page->tlbflush_timestamp = page->u.inuse.type_info;

        list_ent = frame_table[pfn].list.next;
    }

    /* PHASE 1 */
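    /*
     * Walk every L2 and L1 pagetable page the domain owns and decrement the
     * reference counts for each reference found (including the reference
     * taken by the domain's base pagetable and by pinning). Low-memory and
     * I/O mappings are only counted; writable mappings of pagetable pages
     * are flagged as illegal.
     */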
    adjust(&frame_table[pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT], -1, 1);

    list_ent = d->page_list.next;
    for ( i = 0; (list_ent != &d->page_list); i++ )
    {
        unsigned long *pt;
        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
        page = &frame_table[pfn];

        if ( page->u.inuse.domain != d )
            BUG();

        switch ( page->u.inuse.type_info & PGT_type_mask )
        {
        case PGT_l2_page_table:

            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
                printk("Audit %d: L2 not validated %x\n",
                       d->id, page->u.inuse.type_info);

            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
                printk("Audit %d: L2 not pinned %x\n",
                       d->id, page->u.inuse.type_info);
            else
                adjust( page, -1, 1 );

            pt = map_domain_mem( pfn<<PAGE_SHIFT );

            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
            {
                if ( pt[i] & _PAGE_PRESENT )
                {
                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
                    struct pfn_info *l1page = &frame_table[l1pfn];

                    if ( l1page->u.inuse.domain != d )
                    {
                        printk("L2: Skip bizarre page belonging to other "
                               "dom %p\n", l1page->u.inuse.domain);
                        continue;
                    }

                    if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
                         PGT_l2_page_table )
                        printk("Audit %d: [%x] Found %s Linear PT "
                               "t=%x pfn=%lx\n", d->id, i,
                               (l1pfn==pfn) ? "Self" : "Other",
                               l1page->u.inuse.type_info,
                               l1pfn);
                    else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
                              PGT_l1_page_table )
                        printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
                               d->id, i,
                               l1page->u.inuse.type_info,
                               l1pfn);

                    adjust(l1page, -1, 1);
                }
            }

            unmap_domain_mem(pt);

            break;

        case PGT_l1_page_table:

            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
                adjust( page, -1, 1 );

            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
                printk("Audit %d: L1 not validated %x\n",
                       d->id, page->u.inuse.type_info);
#if 0
            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
                printk("Audit %d: L1 not pinned %x\n",
                       d->id, page->u.inuse.type_info);
#endif
            pt = map_domain_mem( pfn<<PAGE_SHIFT );

            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
            {
                if ( pt[i] & _PAGE_PRESENT )
                {
                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
                    struct pfn_info *l1page = &frame_table[l1pfn];

                    if ( l1pfn < 0x100 )
                    {
                        lowmem_mappings++;
                        continue;
                    }

                    if ( l1pfn > max_page )
                    {
                        io_mappings++;
                        continue;
                    }

                    if ( pt[i] & _PAGE_RW )
                    {
                        if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
                             PGT_l1_page_table ||
                             (l1page->u.inuse.type_info & PGT_type_mask) ==
                             PGT_l2_page_table )
                            printk("Audit %d: [%x] Illegal RW t=%x pfn=%lx\n",
                                   d->id, i,
                                   l1page->u.inuse.type_info,
                                   l1pfn);
                    }

                    if ( l1page->u.inuse.domain != d )
                    {
                        printk("Audit %d: [%lx,%x] Skip foreign page dom=%lx "
                               "pfn=%lx c=%08x t=%08x m2p=%lx\n",
                               d->id, pfn, i,
                               (unsigned long)l1page->u.inuse.domain,
                               l1pfn,
                               l1page->count_info,
                               l1page->u.inuse.type_info,
                               machine_to_phys_mapping[l1pfn]);
                        continue;
                    }

                    adjust(l1page, -1, 0);
                }
            }

            unmap_domain_mem(pt);

            break;
        }

        list_ent = frame_table[pfn].list.next;
    }

    if ( (io_mappings > 0) || (lowmem_mappings > 0) )
        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
               d->id, lowmem_mappings, io_mappings);

    /* PHASE 2 */
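    /*
     * With every discovered reference now subtracted, a pagetable page
     * should have a type count of zero and every page a general count of
     * exactly one (presumably the allocation reference). Anything else is
     * reported, and scan_for_pfn_remote() hunts for the stray reference.
     */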
    ctot = ttot = 0;
    list_ent = d->page_list.next;
    for ( i = 0; (list_ent != &d->page_list); i++ )
    {
        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
        page = &frame_table[pfn];

        switch ( page->u.inuse.type_info & PGT_type_mask )
        {
        case PGT_l1_page_table:
        case PGT_l2_page_table:
            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
            {
                printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
                       d->id, page->u.inuse.type_info,
                       page->tlbflush_timestamp,
                       page->count_info, pfn );
                scan_for_pfn_remote(pfn);
            }
            /* Fall through: pagetable pages get the general check too. */
        default:
            if ( (page->count_info & PGC_count_mask) != 1 )
            {
                printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
                       d->id,
                       page->count_info,
                       page->u.inuse.type_info,
                       page->tlbflush_timestamp, pfn );
                scan_for_pfn_remote(pfn);
            }
            break;
        }

        list_ent = frame_table[pfn].list.next;
    }

    /* PHASE 3 */
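    /*
     * Undo PHASE 1: walk the pagetables again and re-increment the counts
     * for every reference found, clearing the tlbflush_timestamp scratch
     * value as we go. The base-pagetable reference is restored after the
     * lock is dropped, mirroring the decrement at the start of PHASE 1.
     */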
    list_ent = d->page_list.next;
    for ( i = 0; (list_ent != &d->page_list); i++ )
    {
        unsigned long *pt;
        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
        page = &frame_table[pfn];

        switch ( page->u.inuse.type_info & PGT_type_mask )
        {
        case PGT_l2_page_table:
            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
                adjust( page, 1, 1 );

            pt = map_domain_mem( pfn<<PAGE_SHIFT );

            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
            {
                if ( pt[i] & _PAGE_PRESENT )
                {
                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
                    struct pfn_info *l1page = &frame_table[l1pfn];

                    if ( l1page->u.inuse.domain == d )
                        adjust(l1page, 1, 1);
                }
            }

            unmap_domain_mem(pt);
            break;

        case PGT_l1_page_table:
            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
                adjust( page, 1, 1 );

            pt = map_domain_mem( pfn<<PAGE_SHIFT );

            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
            {
                if ( pt[i] & _PAGE_PRESENT )
                {
                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
                    struct pfn_info *l1page = &frame_table[l1pfn];

                    if ( (l1page->u.inuse.domain != d) ||
                         (l1pfn < 0x100) || (l1pfn > max_page) )
                        continue;

                    adjust(l1page, 1, 0);
                }
            }

            unmap_domain_mem(pt);
            break;
        }

        page->tlbflush_timestamp = 0;

        list_ent = frame_table[pfn].list.next;
    }

    spin_unlock(&d->page_alloc_lock);

    adjust(&frame_table[pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT], 1, 1);

    printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );

    if ( d != current->domain )
        domain_unpause(d);
}
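/*
 * Convenience wrappers: audit every domain in the system, and a keypress
 * hook (presumably registered with the debug key handler elsewhere) that
 * triggers a full audit.
 */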
void audit_domains(void)
{
    struct domain *d;
    for_each_domain ( d )
        audit_domain(d);
}

void audit_domains_key(unsigned char key)
{
    audit_domains();
}

#endif