debuggers.hg

view xen/arch/x86/memory.c @ 3755:ea98f0bb6510

bitkeeper revision 1.1159.212.127 (4208b02bTdSR4AVYRg8diDkKZmIVUg)

General shadow code cleanup.

Fixed compilation problems when SHADOW_DEBUG is enabled.
Fixed compilation problems when CONFIG_VMX is undefined.

Simplified l1pte_write_fault and l1pte_read_fault.
Name change: spfn => smfn (shadow machine frame numbers).

In general, the terms pfn and gpfn now refer to pages in the
guest's idea of physical frames (which differs for full shadow
guests). mfn always refers to a machine frame number.

One bug fix for check_pagetable(): if we're using writable page tables
along with shadow mode, don't check the currently writable page table
page -- check its snapshot instead.

Signed-off-by: michael.fetterman@cl.cam.ac.uk
author mafetter@fleming.research
date Tue Feb 08 12:27:23 2005 +0000 (2005-02-08)
parents 23e7cf28ddb3
children
line source
1 /* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
2 /******************************************************************************
3 * arch/x86/memory.c
4 *
5 * Copyright (c) 2002-2004 K A Fraser
6 * Copyright (c) 2004 Christian Limpach
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
23 /*
24 * A description of the x86 page table API:
25 *
26 * Domains trap to do_mmu_update with a list of update requests.
27 * This is a list of (ptr, val) pairs, where the requested operation
28 * is *ptr = val.
29 *
30 * Reference counting of pages:
31 * ----------------------------
32 * Each page has two refcounts: tot_count and type_count.
33 *
34 * TOT_COUNT is the obvious reference count. It counts all uses of a
35 * physical page frame by a domain, including uses as a page directory,
36 * a page table, or simple mappings via a PTE. This count prevents a
37 * domain from releasing a frame back to the free pool when it still holds
38 * a reference to it.
39 *
40 * TYPE_COUNT is more subtle. A frame can be put to one of three
41 * mutually-exclusive uses: it might be used as a page directory, or a
42 * page table, or it may be mapped writable by the domain [of course, a
43 * frame need not be used in any of these three ways!].
44 * So, type_count is a count of the number of times a frame is being
45 * referred to in its current incarnation. Therefore, a page can only
46 * change its type when its type count is zero.
47 *
48 * Pinning the page type:
49 * ----------------------
50 * The type of a page can be pinned/unpinned with the commands
51 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
52 * pinning is not reference counted, so it can't be nested).
53 * This is useful to prevent a page's type count falling to zero, at which
54 * point safety checks would need to be carried out next time the count
55 * is increased again.
56 *
57 * A further note on writable page mappings:
58 * -----------------------------------------
59 * For simplicity, the count of writable mappings for a page may not
60 * correspond to reality. The 'writable count' is incremented for every
61 * PTE which maps the page with the _PAGE_RW flag set. However, for
62 * write access to be possible the page directory entry must also have
63 * its _PAGE_RW bit set. We do not check this as it complicates the
64 * reference counting considerably [consider the case of multiple
65 * directory entries referencing a single page table, some with the RW
66 * bit set, others not -- it starts getting a bit messy].
67 * In normal use, this simplification shouldn't be a problem.
68 * However, the logic can be added if required.
69 *
70 * One more note on read-only page mappings:
71 * -----------------------------------------
72 * We want domains to be able to map pages for read-only access. The
73 * main reason is that page tables and directories should be readable
74 * by a domain, but it would not be safe for them to be writable.
75 * However, domains have free access to rings 1 & 2 of the Intel
76 * privilege model. In terms of page protection, these are considered
77 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
78 * read-only restrictions are respected in supervisor mode -- if the
79 * bit is clear then any mapped page is writable.
80 *
81 * We get round this by always setting the WP bit and disallowing
82 * updates to it. This is very unlikely to cause a problem for guest
83 * OS's, which will generally use the WP bit to simplify copy-on-write
84 * implementation (in that case, the OS wants a fault when it writes to
85 * an application-supplied buffer).
86 */
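To make the request format described above concrete, the sketch below shows how a paravirtualised guest might batch two page-table writes into a single hypercall. It is illustrative only: the HYPERVISOR_mmu_update() wrapper, the example function name, and the machine addresses are assumptions rather than anything defined in this file; only the (ptr, val) pairing, the MMU_NORMAL_PT_UPDATE command in the low bits of ptr, and the mmu_update_t layout come from the interface handled by do_mmu_update() further down.

/* Illustrative guest-side sketch; not part of memory.c. */
static int example_update_two_ptes(unsigned long pte0_ma, unsigned long val0,
                                   unsigned long pte1_ma, unsigned long val1)
{
    mmu_update_t req[2];
    unsigned int done = 0;

    /* ptr holds the machine address of the PTE; its low bits encode the command. */
    req[0].ptr = pte0_ma | MMU_NORMAL_PT_UPDATE;
    req[0].val = val0;
    req[1].ptr = pte1_ma | MMU_NORMAL_PT_UPDATE;
    req[1].val = val1;

    /* HYPERVISOR_mmu_update() is the guest's hypercall stub (assumed here). */
    return HYPERVISOR_mmu_update(req, 2, &done);
}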
88 #include <xen/config.h>
89 #include <xen/init.h>
90 #include <xen/kernel.h>
91 #include <xen/lib.h>
92 #include <xen/mm.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <asm/shadow.h>
99 #include <asm/page.h>
100 #include <asm/flushtlb.h>
101 #include <asm/io.h>
102 #include <asm/uaccess.h>
103 #include <asm/domain_page.h>
104 #include <asm/ldt.h>
106 #ifdef VERBOSE
107 #define MEM_LOG(_f, _a...) \
108 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
109 current->domain->id , __LINE__ , ## _a )
110 #else
111 #define MEM_LOG(_f, _a...) ((void)0)
112 #endif
114 static int alloc_l2_table(struct pfn_info *page);
115 static int alloc_l1_table(struct pfn_info *page);
116 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
117 static int get_page_and_type_from_pagenr(unsigned long page_nr,
118 u32 type,
119 struct domain *d);
121 static void free_l2_table(struct pfn_info *page);
122 static void free_l1_table(struct pfn_info *page);
124 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
125 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
127 /* Used to defer flushing of memory structures. */
128 static struct {
129 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
130 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
131 unsigned long deferred_ops;
132 /* If non-NULL, specifies a foreign subject domain for some operations. */
133 struct domain *foreign;
134 } __cacheline_aligned percpu_info[NR_CPUS];
136 /*
137 * Returns the current foreign domain; defaults to the currently-executing
138 * domain if a foreign override hasn't been specified.
139 */
140 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
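/* For example, mod_l1_entry() below validates a new mapping against FOREIGNDOM
 * rather than current->domain, so a privileged domain that has issued
 * MMUEXT_SET_FOREIGNDOM can install mappings of another domain's frames. */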
142 /* Private domain structs for DOMID_XEN and DOMID_IO. */
143 static struct domain *dom_xen, *dom_io;
145 /* Frame table and its size in bytes. */
146 struct pfn_info *frame_table;
147 unsigned long frame_table_size;
148 unsigned long max_page;
150 void __init init_frametable(void)
151 {
152 unsigned long i, p;
154 frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
155 frame_table_size = max_page * sizeof(struct pfn_info);
156 frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
158 for ( i = 0; i < frame_table_size; i += (4UL << 20) )
159 {
160 p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
161 if ( p == 0 )
162 panic("Not enough memory for frame table\n");
163 map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p,
164 4UL << 20, PAGE_HYPERVISOR);
165 }
167 memset(frame_table, 0, frame_table_size);
168 }
170 void arch_init_memory(void)
171 {
172 extern void subarch_init_memory(struct domain *);
174 memset(percpu_info, 0, sizeof(percpu_info));
176 /*
177 * Initialise our DOMID_XEN domain.
178 * Any Xen-heap pages that we will allow to be mapped will have
179 * their domain field set to dom_xen.
180 */
181 dom_xen = alloc_domain_struct();
182 atomic_set(&dom_xen->refcnt, 1);
183 dom_xen->id = DOMID_XEN;
185 /*
186 * Initialise our DOMID_IO domain.
187 * This domain owns no pages but is considered a special case when
188 mapping I/O pages, as the mappings occur at the privilege level of the caller.
189 */
190 dom_io = alloc_domain_struct();
191 atomic_set(&dom_io->refcnt, 1);
192 dom_io->id = DOMID_IO;
194 subarch_init_memory(dom_xen);
195 }
197 void write_ptbase(struct exec_domain *ed)
198 {
199 struct domain *d = ed->domain;
200 unsigned long pa;
202 #ifdef CONFIG_VMX
203 if ( unlikely(shadow_mode(d)) )
204 pa = ((shadow_mode(d) == SHM_full_32) ?
205 pagetable_val(ed->arch.monitor_table) :
206 pagetable_val(ed->arch.shadow_table));
207 else
208 pa = pagetable_val(ed->arch.pagetable);
209 #else
210 if ( unlikely(shadow_mode(d)) )
211 pa = pagetable_val(ed->arch.shadow_table);
212 else
213 pa = pagetable_val(ed->arch.pagetable);
214 #endif
216 write_cr3(pa);
217 }
219 static void __invalidate_shadow_ldt(struct exec_domain *d)
220 {
221 int i;
222 unsigned long pfn;
223 struct pfn_info *page;
225 d->arch.shadow_ldt_mapcnt = 0;
227 for ( i = 16; i < 32; i++ )
228 {
229 pfn = l1_pgentry_to_pagenr(d->arch.perdomain_ptes[i]);
230 if ( pfn == 0 ) continue;
231 d->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
232 page = &frame_table[pfn];
233 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
234 ASSERT_PAGE_IS_DOMAIN(page, d->domain);
235 put_page_and_type(page);
236 }
238 /* Dispose of the (now possibly invalid) mappings from the TLB. */
239 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
240 }
243 static inline void invalidate_shadow_ldt(struct exec_domain *d)
244 {
245 if ( d->arch.shadow_ldt_mapcnt != 0 )
246 __invalidate_shadow_ldt(d);
247 }
250 static int alloc_segdesc_page(struct pfn_info *page)
251 {
252 struct desc_struct *descs;
253 int i;
255 descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
257 for ( i = 0; i < 512; i++ )
258 if ( unlikely(!check_descriptor(&descs[i])) )
259 goto fail;
261 unmap_domain_mem(descs);
262 return 1;
264 fail:
265 unmap_domain_mem(descs);
266 return 0;
267 }
270 /* Map shadow page at offset @off. */
271 int map_ldt_shadow_page(unsigned int off)
272 {
273 struct exec_domain *ed = current;
274 struct domain *d = ed->domain;
275 unsigned long l1e;
277 if ( unlikely(in_irq()) )
278 BUG();
280 __get_user(l1e, (unsigned long *)&linear_pg_table[(ed->arch.ldt_base >>
281 PAGE_SHIFT) + off]);
283 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
284 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
285 d, PGT_ldt_page)) )
286 return 0;
288 ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
289 ed->arch.shadow_ldt_mapcnt++;
291 return 1;
292 }
295 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
296 {
297 struct pfn_info *page = &frame_table[page_nr];
299 if ( unlikely(!pfn_is_ram(page_nr)) )
300 {
301 MEM_LOG("Pfn %08lx is not RAM", page_nr);
302 return 0;
303 }
305 if ( unlikely(!get_page(page, d)) )
306 {
307 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
308 return 0;
309 }
311 return 1;
312 }
315 static int get_page_and_type_from_pagenr(unsigned long page_nr,
316 u32 type,
317 struct domain *d)
318 {
319 struct pfn_info *page = &frame_table[page_nr];
321 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
322 return 0;
324 if ( unlikely(!get_page_type(page, type)) )
325 {
326 #ifdef VERBOSE
327 if ( (type & PGT_type_mask) != PGT_l1_page_table )
328 MEM_LOG("Bad page type for pfn %08lx (%08x)",
329 page_nr, page->u.inuse.type_info);
330 #endif
331 put_page(page);
332 return 0;
333 }
335 return 1;
336 }
339 /*
340 * We allow L2 tables to map each other (a.k.a. linear page tables). This
341 * needs some special care with reference counts and access permissions:
342 * 1. The mapping entry must be read-only, or the guest may get write access
343 * to its own PTEs.
344 * 2. We must only bump the reference counts for an *already validated*
345 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
346 * on a validation that cannot complete until this one does.
347 * 3. We only need to increment the reference counts for the mapped page
348 * frame if it is mapped by a different L2 table. This is sufficient and
349 * also necessary to allow validation of an L2 table mapping itself.
350 */
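/*
 * Illustration of rule 1 (the values are hypothetical, not from this file):
 * a guest whose L2 table lives in machine frame l2_mfn may install a
 * read-only self-mapping of that table, e.g.
 *
 *     req.ptr = (l2_mfn << PAGE_SHIFT) + (slot * sizeof(l2_pgentry_t));
 *     req.val = (l2_mfn << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_ACCESSED;
 *
 * whereas the same request with _PAGE_RW set is rejected below, since a
 * writable linear mapping would let the guest rewrite its own PTEs directly.
 */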
351 static int
352 get_linear_pagetable(
353 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
354 {
355 u32 x, y;
356 struct pfn_info *page;
358 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
359 {
360 MEM_LOG("Attempt to create linear p.t. with write perms");
361 return 0;
362 }
364 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
365 {
366 /* Make sure the mapped frame belongs to the correct domain. */
367 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
368 return 0;
370 /*
371 * Make sure that the mapped frame is an already-validated L2 table.
372 * If so, atomically increment the count (checking for overflow).
373 */
374 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
375 y = page->u.inuse.type_info;
376 do {
377 x = y;
378 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
379 unlikely((x & (PGT_type_mask|PGT_validated)) !=
380 (PGT_l2_page_table|PGT_validated)) )
381 {
382 put_page(page);
383 return 0;
384 }
385 }
386 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
387 }
389 return 1;
390 }
393 static int
394 get_page_from_l1e(
395 l1_pgentry_t l1e, struct domain *d)
396 {
397 unsigned long l1v = l1_pgentry_val(l1e);
398 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
399 struct pfn_info *page = &frame_table[pfn];
400 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
402 if ( !(l1v & _PAGE_PRESENT) )
403 return 1;
405 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
406 {
407 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
408 return 0;
409 }
411 if ( unlikely(!pfn_is_ram(pfn)) )
412 {
413 /* Revert to caller privileges if FD == DOMID_IO. */
414 if ( d == dom_io )
415 d = current->domain;
417 if ( IS_PRIV(d) )
418 return 1;
420 if ( IS_CAPABLE_PHYSDEV(d) )
421 return domain_iomem_in_pfn(d, pfn);
423 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
424 return 0;
425 }
427 return ((l1v & _PAGE_RW) ?
428 get_page_and_type(page, d, PGT_writable_page) :
429 get_page(page, d));
430 }
433 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
434 static int
435 get_page_from_l2e(
436 l2_pgentry_t l2e, unsigned long pfn,
437 struct domain *d, unsigned long va_idx)
438 {
439 int rc;
441 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
442 return 1;
444 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
445 {
446 MEM_LOG("Bad L2 page type settings %04lx",
447 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
448 return 0;
449 }
451 rc = get_page_and_type_from_pagenr(
452 l2_pgentry_to_pagenr(l2e),
453 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
455 if ( unlikely(!rc) )
456 return get_linear_pagetable(l2e, pfn, d);
458 return 1;
459 }
462 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
463 {
464 unsigned long l1v = l1_pgentry_val(l1e);
465 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
466 struct pfn_info *page = &frame_table[pfn];
467 struct domain *e;
469 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
470 return;
472 e = page_get_owner(page);
473 if ( unlikely(e != d) )
474 {
475 /*
476 * Unmap a foreign page that may have been mapped via a grant table.
477 * Note that this can fail for a privileged domain that can map foreign
478 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
479 * counted via a grant entry and some counted directly in the page
480 * structure's reference count. Note that reference counts won't get
481 * dangerously confused as long as we always try to decrement the
482 * grant entry first. We may end up with a mismatch between which
483 * mappings and which unmappings are counted via the grant entry, but
484 * really it doesn't matter as privileged domains have carte blanche.
485 */
486 if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
487 return;
488 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
489 }
491 if ( l1v & _PAGE_RW )
492 {
493 put_page_and_type(page);
494 }
495 else
496 {
497 /* We expect this is rare so we blow the entire shadow LDT. */
498 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
499 PGT_ldt_page)) &&
500 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
501 invalidate_shadow_ldt(e->exec_domain[0]);
502 put_page(page);
503 }
504 }
507 /*
508 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
509 * Note also that this automatically deals correctly with linear p.t.'s.
510 */
511 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
512 {
513 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
514 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
515 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
516 }
519 static int alloc_l2_table(struct pfn_info *page)
520 {
521 struct domain *d = page_get_owner(page);
522 unsigned long page_nr = page_to_pfn(page);
523 l2_pgentry_t *pl2e;
524 int i;
526 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
528 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
529 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
530 goto fail;
532 #if defined(__i386__)
533 /* Now we add our private high mappings. */
534 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
535 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
536 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
537 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
538 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
539 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
540 mk_l2_pgentry(__pa(page_get_owner(page)->arch.mm_perdomain_pt) |
541 __PAGE_HYPERVISOR);
542 #endif
544 unmap_domain_mem(pl2e);
545 return 1;
547 fail:
548 while ( i-- > 0 )
549 put_page_from_l2e(pl2e[i], page_nr);
551 unmap_domain_mem(pl2e);
552 return 0;
553 }
556 static int alloc_l1_table(struct pfn_info *page)
557 {
558 struct domain *d = page_get_owner(page);
559 unsigned long page_nr = page_to_pfn(page);
560 l1_pgentry_t *pl1e;
561 int i;
563 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
565 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
566 if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
567 goto fail;
569 unmap_domain_mem(pl1e);
570 return 1;
572 fail:
573 while ( i-- > 0 )
574 put_page_from_l1e(pl1e[i], d);
576 unmap_domain_mem(pl1e);
577 return 0;
578 }
581 static void free_l2_table(struct pfn_info *page)
582 {
583 unsigned long page_nr = page - frame_table;
584 l2_pgentry_t *pl2e;
585 int i;
587 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
589 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
590 put_page_from_l2e(pl2e[i], page_nr);
592 unmap_domain_mem(pl2e);
593 }
596 static void free_l1_table(struct pfn_info *page)
597 {
598 struct domain *d = page_get_owner(page);
599 unsigned long page_nr = page - frame_table;
600 l1_pgentry_t *pl1e;
601 int i;
603 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
605 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
606 put_page_from_l1e(pl1e[i], d);
608 unmap_domain_mem(pl1e);
609 }
612 static inline int update_l2e(l2_pgentry_t *pl2e,
613 l2_pgentry_t ol2e,
614 l2_pgentry_t nl2e)
615 {
616 unsigned long o = cmpxchg((unsigned long *)pl2e,
617 l2_pgentry_val(ol2e),
618 l2_pgentry_val(nl2e));
619 if ( o != l2_pgentry_val(ol2e) )
620 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
621 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
622 return (o == l2_pgentry_val(ol2e));
623 }
626 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
627 static int mod_l2_entry(l2_pgentry_t *pl2e,
628 l2_pgentry_t nl2e,
629 unsigned long pfn)
630 {
631 l2_pgentry_t ol2e;
632 unsigned long _ol2e;
634 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
635 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
636 {
637 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
638 return 0;
639 }
641 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
642 return 0;
643 ol2e = mk_l2_pgentry(_ol2e);
645 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
646 {
647 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
648 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
649 return update_l2e(pl2e, ol2e, nl2e);
651 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
652 ((unsigned long)pl2e &
653 ~PAGE_MASK) >> 2)) )
654 return 0;
656 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
657 {
658 put_page_from_l2e(nl2e, pfn);
659 return 0;
660 }
662 put_page_from_l2e(ol2e, pfn);
663 return 1;
664 }
666 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
667 return 0;
669 put_page_from_l2e(ol2e, pfn);
670 return 1;
671 }
674 static inline int update_l1e(l1_pgentry_t *pl1e,
675 l1_pgentry_t ol1e,
676 l1_pgentry_t nl1e)
677 {
678 unsigned long o = l1_pgentry_val(ol1e);
679 unsigned long n = l1_pgentry_val(nl1e);
681 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
682 unlikely(o != l1_pgentry_val(ol1e)) )
683 {
684 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
685 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
686 return 0;
687 }
689 return 1;
690 }
693 /* Update the L1 entry at pl1e to new value nl1e. */
694 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
695 {
696 l1_pgentry_t ol1e;
697 unsigned long _ol1e;
698 struct domain *d = current->domain;
700 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
701 {
702 MEM_LOG("Bad get_user\n");
703 return 0;
704 }
706 ol1e = mk_l1_pgentry(_ol1e);
708 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
709 {
710 /* Same mapping (bits 12-31), r/w (bit 1), and presence (bit 0)? */
711 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
712 return update_l1e(pl1e, ol1e, nl1e);
714 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
715 return 0;
717 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
718 {
719 put_page_from_l1e(nl1e, d);
720 return 0;
721 }
723 put_page_from_l1e(ol1e, d);
724 return 1;
725 }
727 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
728 return 0;
730 put_page_from_l1e(ol1e, d);
731 return 1;
732 }
735 int alloc_page_type(struct pfn_info *page, unsigned int type)
736 {
737 switch ( type )
738 {
739 case PGT_l1_page_table:
740 return alloc_l1_table(page);
741 case PGT_l2_page_table:
742 return alloc_l2_table(page);
743 case PGT_gdt_page:
744 case PGT_ldt_page:
745 return alloc_segdesc_page(page);
746 default:
747 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
748 type, page->u.inuse.type_info,
749 page->count_info);
750 BUG();
751 }
753 return 0;
754 }
757 void free_page_type(struct pfn_info *page, unsigned int type)
758 {
759 struct domain *d = page_get_owner(page);
761 switch ( type )
762 {
763 case PGT_l1_page_table:
764 free_l1_table(page);
765 break;
767 case PGT_l2_page_table:
768 free_l2_table(page);
769 break;
771 default:
772 BUG();
773 }
775 if ( unlikely(shadow_mode(d)) &&
776 (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) )
777 {
778 unshadow_table(page_to_pfn(page), type);
779 put_shadow_status(d);
780 }
781 }
784 void put_page_type(struct pfn_info *page)
785 {
786 u32 nx, x, y = page->u.inuse.type_info;
788 again:
789 do {
790 x = y;
791 nx = x - 1;
793 ASSERT((x & PGT_count_mask) != 0);
795 /*
796 * The page should always be validated while a reference is held. The
797 * exception is during domain destruction, when we forcibly invalidate
798 * page-table pages if we detect a referential loop.
799 * See domain.c:relinquish_list().
800 */
801 ASSERT((x & PGT_validated) ||
802 test_bit(DF_DYING, &page_get_owner(page)->d_flags));
804 if ( unlikely((nx & PGT_count_mask) == 0) )
805 {
806 /* Record TLB information for flush later. Races are harmless. */
807 page->tlbflush_timestamp = tlbflush_current_time();
809 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
810 likely(nx & PGT_validated) )
811 {
812 /*
813 * Page-table pages must be unvalidated when count is zero. The
814 * 'free' is safe because the refcnt is non-zero and validated
815 * bit is clear => other ops will spin or fail.
816 */
817 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
818 x & ~PGT_validated)) != x) )
819 goto again;
820 /* We cleared the 'valid bit' so we do the clean-up. */
821 free_page_type(page, x & PGT_type_mask);
822 /* Carry on, but with the 'valid bit' now clear. */
823 x &= ~PGT_validated;
824 nx &= ~PGT_validated;
825 }
826 }
827 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
828 (PGT_pinned | 1)) )
829 {
830 /* Page is now only pinned. Make the back pointer mutable again. */
831 nx |= PGT_va_mutable;
832 }
833 }
834 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
835 }
838 int get_page_type(struct pfn_info *page, u32 type)
839 {
840 u32 nx, x, y = page->u.inuse.type_info;
842 again:
843 do {
844 x = y;
845 nx = x + 1;
846 if ( unlikely((nx & PGT_count_mask) == 0) )
847 {
848 MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
849 return 0;
850 }
851 else if ( unlikely((x & PGT_count_mask) == 0) )
852 {
853 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
854 {
855 /*
856 * On a type change we check whether stale TLB entries must be flushed.
857 * This may be unnecessary (e.g., the page was a GDT/LDT) but those
858 * circumstances should be very rare.
859 */
860 struct domain *d = page_get_owner(page);
861 if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
862 page->tlbflush_timestamp)) )
863 {
864 perfc_incr(need_flush_tlb_flush);
865 flush_tlb_cpu(d->exec_domain[0]->processor);
866 }
868 /* We lose existing type, back pointer, and validity. */
869 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
870 nx |= type;
872 /* No special validation needed for writable pages. */
873 /* Page tables and GDT/LDT need to be scanned for validity. */
874 if ( type == PGT_writable_page )
875 nx |= PGT_validated;
876 }
877 }
878 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
879 {
880 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
881 {
882 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
883 ((type & PGT_type_mask) != PGT_l1_page_table) )
884 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
885 x & PGT_type_mask, type, page_to_pfn(page));
886 return 0;
887 }
888 else if ( (x & PGT_va_mask) == PGT_va_mutable )
889 {
890 /* The va backpointer is mutable, hence we update it. */
891 nx &= ~PGT_va_mask;
892 nx |= type; /* we know the actual type is correct */
893 }
894 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
895 {
896 /* This table is potentially mapped at multiple locations. */
897 nx &= ~PGT_va_mask;
898 nx |= PGT_va_unknown;
899 }
900 }
901 else if ( unlikely(!(x & PGT_validated)) )
902 {
903 /* Someone else is updating validation of this page. Wait... */
904 while ( (y = page->u.inuse.type_info) == x )
905 {
906 rep_nop();
907 barrier();
908 }
909 goto again;
910 }
911 }
912 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
914 if ( unlikely(!(nx & PGT_validated)) )
915 {
916 /* Try to validate page type; drop the new reference on failure. */
917 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
918 {
919 MEM_LOG("Error while validating pfn %08lx for type %08x."
920 " caf=%08x taf=%08x\n",
921 page_to_pfn(page), type,
922 page->count_info,
923 page->u.inuse.type_info);
924 /* No one else can get a reference. We hold the only ref. */
925 page->u.inuse.type_info = 0;
926 return 0;
927 }
929 /* No one else is updating simultaneously. */
930 __set_bit(_PGT_validated, &page->u.inuse.type_info);
931 }
933 return 1;
934 }
937 int new_guest_cr3(unsigned long pfn)
938 {
939 struct exec_domain *ed = current;
940 struct domain *d = ed->domain;
941 int okay, cpu = smp_processor_id();
942 unsigned long old_base_pfn;
944 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
945 if ( likely(okay) )
946 {
947 invalidate_shadow_ldt(ed);
949 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
950 old_base_pfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT;
951 ed->arch.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
953 shadow_mk_pagetable(ed);
955 write_ptbase(ed);
957 put_page_and_type(&frame_table[old_base_pfn]);
958 }
959 else
960 {
961 MEM_LOG("Error while installing new baseptr %08lx", pfn);
962 }
964 return okay;
965 }
967 static int do_extended_command(unsigned long ptr, unsigned long val)
968 {
969 int okay = 1, cpu = smp_processor_id();
970 unsigned int cmd = val & MMUEXT_CMD_MASK;
971 unsigned long pfn = ptr >> PAGE_SHIFT;
972 struct pfn_info *page = &frame_table[pfn];
973 struct exec_domain *ed = current;
974 struct domain *d = ed->domain, *nd, *e;
975 u32 x, y;
976 domid_t domid;
977 grant_ref_t gntref;
979 switch ( cmd )
980 {
981 case MMUEXT_PIN_L1_TABLE:
982 case MMUEXT_PIN_L2_TABLE:
983 /*
984 * We insist that, if you pin an L1 page, it's the first thing that
985 * you do to it. This is because we require the backptr to still be
986 * mutable. This assumption seems safe.
987 */
988 okay = get_page_and_type_from_pagenr(
989 pfn,
990 ((cmd==MMUEXT_PIN_L2_TABLE) ?
991 PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
992 FOREIGNDOM);
994 if ( unlikely(!okay) )
995 {
996 MEM_LOG("Error while pinning pfn %08lx", pfn);
997 break;
998 }
1000 if ( unlikely(test_and_set_bit(_PGT_pinned,
1001 &page->u.inuse.type_info)) )
1003 MEM_LOG("Pfn %08lx already pinned", pfn);
1004 put_page_and_type(page);
1005 okay = 0;
1006 break;
1009 break;
1011 case MMUEXT_UNPIN_TABLE:
1012 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
1014 MEM_LOG("Page %08lx bad domain (dom=%p)",
1015 ptr, page_get_owner(page));
1017 else if ( likely(test_and_clear_bit(_PGT_pinned,
1018 &page->u.inuse.type_info)) )
1020 put_page_and_type(page);
1021 put_page(page);
1023 else
1025 okay = 0;
1026 put_page(page);
1027 MEM_LOG("Pfn %08lx not pinned", pfn);
1029 break;
1031 case MMUEXT_NEW_BASEPTR:
1032 okay = new_guest_cr3(pfn);
1033 break;
1035 case MMUEXT_TLB_FLUSH:
1036 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1037 break;
1039 case MMUEXT_INVLPG:
1040 __flush_tlb_one(ptr);
1041 break;
1043 case MMUEXT_FLUSH_CACHE:
1044 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1046 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1047 okay = 0;
1049 else
1051 wbinvd();
1053 break;
1055 case MMUEXT_SET_LDT:
1057 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
1058 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1059 (ents > 8192) ||
1060 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
1061 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
1063 okay = 0;
1064 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
1066 else if ( (ed->arch.ldt_ents != ents) ||
1067 (ed->arch.ldt_base != ptr) )
1069 invalidate_shadow_ldt(ed);
1070 ed->arch.ldt_base = ptr;
1071 ed->arch.ldt_ents = ents;
1072 load_LDT(ed);
1073 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1074 if ( ents != 0 )
1075 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1077 break;
1080 case MMUEXT_SET_FOREIGNDOM:
1081 domid = (domid_t)(val >> 16);
1083 if ( (e = percpu_info[cpu].foreign) != NULL )
1084 put_domain(e);
1085 percpu_info[cpu].foreign = NULL;
1087 if ( !IS_PRIV(d) )
1089 switch ( domid )
1091 case DOMID_IO:
1092 get_knownalive_domain(dom_io);
1093 percpu_info[cpu].foreign = dom_io;
1094 break;
1095 default:
1096 MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
1097 okay = 0;
1098 break;
1101 else
1103 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1104 if ( e == NULL )
1106 switch ( domid )
1108 case DOMID_XEN:
1109 get_knownalive_domain(dom_xen);
1110 percpu_info[cpu].foreign = dom_xen;
1111 break;
1112 case DOMID_IO:
1113 get_knownalive_domain(dom_io);
1114 percpu_info[cpu].foreign = dom_io;
1115 break;
1116 default:
1117 MEM_LOG("Unknown domain '%u'", domid);
1118 okay = 0;
1119 break;
1123 break;
1125 case MMUEXT_TRANSFER_PAGE:
1126 domid = (domid_t)(val >> 16);
1127 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
1129 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
1130 unlikely(!pfn_is_ram(pfn)) ||
1131 unlikely((e = find_domain_by_id(domid)) == NULL) )
1133 MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
1134 okay = 0;
1135 break;
1138 spin_lock(&d->page_alloc_lock);
1140 /*
1141 * The tricky bit: atomically release ownership while there is just one
1142 * benign reference to the page (PGC_allocated). If that reference
1143 * disappears then the deallocation routine will safely spin.
1144 */
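/*
 * Note on the inline cmpxchg8b below: it treats count_info and the owner
 * word that follows it as one 64-bit quantity, so the exchange succeeds only
 * if the count is still the single PGC_allocated reference checked above and
 * the owner is still 'd'; on success the owner is atomically cleared to NULL
 * while count_info is left unchanged.
 */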
1145 nd = page_get_owner(page);
1146 y = page->count_info;
1147 do {
1148 x = y;
1149 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1150 (1|PGC_allocated)) ||
1151 unlikely(nd != d) )
1153 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1154 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1155 d, d->id, nd, x, page->u.inuse.type_info);
1156 spin_unlock(&d->page_alloc_lock);
1157 put_domain(e);
1158 return 0;
1160 __asm__ __volatile__(
1161 LOCK_PREFIX "cmpxchg8b %2"
1162 : "=d" (nd), "=a" (y),
1163 "=m" (*(volatile u64 *)(&page->count_info))
1164 : "0" (d), "1" (x), "c" (NULL), "b" (x) );
1166 while ( unlikely(nd != d) || unlikely(y != x) );
1168 /*
1169 * Unlink from 'd'. At least one reference remains (now anonymous), so
1170 * no one else is spinning to try to delete this page from 'd'.
1171 */
1172 d->tot_pages--;
1173 list_del(&page->list);
1175 spin_unlock(&d->page_alloc_lock);
1177 spin_lock(&e->page_alloc_lock);
1179 /*
1180 * Check that 'e' will accept the page and has reservation headroom.
1181 * Also, a domain mustn't have PGC_allocated pages when it is dying.
1182 */
1183 ASSERT(e->tot_pages <= e->max_pages);
1184 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1185 unlikely(e->tot_pages == e->max_pages) ||
1186 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
1188 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1189 "provided a bad grant ref, or is dying (%08lx).\n",
1190 e->tot_pages, e->max_pages, e->d_flags);
1191 spin_unlock(&e->page_alloc_lock);
1192 put_domain(e);
1193 okay = 0;
1194 break;
1197 /* Okay, add the page to 'e'. */
1198 if ( unlikely(e->tot_pages++ == 0) )
1199 get_knownalive_domain(e);
1200 list_add_tail(&page->list, &e->page_list);
1201 page_set_owner(page, e);
1203 spin_unlock(&e->page_alloc_lock);
1205 /* Transfer is all done: tell the guest about its new page frame. */
1206 gnttab_notify_transfer(e, gntref, pfn);
1208 put_domain(e);
1209 break;
1211 case MMUEXT_REASSIGN_PAGE:
1212 if ( unlikely(!IS_PRIV(d)) )
1214 MEM_LOG("Dom %u has no reassignment priv", d->id);
1215 okay = 0;
1216 break;
1219 e = percpu_info[cpu].foreign;
1220 if ( unlikely(e == NULL) )
1222 MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
1223 okay = 0;
1224 break;
1227 /*
1228 * Grab both page_list locks, in order. This prevents the page from
1229 * disappearing elsewhere while we modify the owner, and we'll need
1230 * both locks if we're successful so that we can change lists.
1231 */
1232 if ( d < e )
1234 spin_lock(&d->page_alloc_lock);
1235 spin_lock(&e->page_alloc_lock);
1237 else
1239 spin_lock(&e->page_alloc_lock);
1240 spin_lock(&d->page_alloc_lock);
1243 /* A domain shouldn't have PGC_allocated pages when it is dying. */
1244 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1245 unlikely(IS_XEN_HEAP_FRAME(page)) )
1247 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
1248 okay = 0;
1249 goto reassign_fail;
1252 /*
1253 * The tricky bit: atomically change owner while there is just one
1254 * benign reference to the page (PGC_allocated). If that reference
1255 * disappears then the deallocation routine will safely spin.
1256 */
1257 nd = page_get_owner(page);
1258 y = page->count_info;
1259 do {
1260 x = y;
1261 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1262 (1|PGC_allocated)) ||
1263 unlikely(nd != d) )
1265 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1266 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1267 d, d->id, nd, x, page->u.inuse.type_info);
1268 okay = 0;
1269 goto reassign_fail;
1271 __asm__ __volatile__(
1272 LOCK_PREFIX "cmpxchg8b %3"
1273 : "=d" (nd), "=a" (y), "=c" (e),
1274 "=m" (*(volatile u64 *)(&page->count_info))
1275 : "0" (d), "1" (x), "c" (e), "b" (x) );
1277 while ( unlikely(nd != d) || unlikely(y != x) );
1279 /*
1280 * Unlink from 'd'. We transferred at least one reference to 'e', so
1281 * no one else is spinning to try to delete this page from 'd'.
1282 */
1283 d->tot_pages--;
1284 list_del(&page->list);
1286 /*
1287 * Add the page to 'e'. Someone may already have removed the last
1288 * reference and want to remove the page from 'e'. However, we have
1289 * the lock so they'll spin waiting for us.
1290 */
1291 if ( unlikely(e->tot_pages++ == 0) )
1292 get_knownalive_domain(e);
1293 list_add_tail(&page->list, &e->page_list);
1295 reassign_fail:
1296 spin_unlock(&d->page_alloc_lock);
1297 spin_unlock(&e->page_alloc_lock);
1298 break;
1300 case MMUEXT_CLEAR_FOREIGNDOM:
1301 if ( (e = percpu_info[cpu].foreign) != NULL )
1302 put_domain(e);
1303 percpu_info[cpu].foreign = NULL;
1304 break;
1306 default:
1307 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
1308 okay = 0;
1309 break;
1312 return okay;
1315 int do_mmu_update(
1316 mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
1318 /*
1319 * We steal the m.s.b. of the @count parameter to indicate whether this
1320 * invocation of do_mmu_update() is resuming a previously preempted call.
1321 * We steal the next 15 bits to remember the current FOREIGNDOM.
1322 */
1323 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
1324 #define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
1325 #define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
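/*
 * Worked example of this encoding: a resumed call with 100 requests still
 * outstanding on behalf of foreign domain 5 passes
 *     count = 100 | (5 << MMU_UPDATE_PREEMPT_FDOM_SHIFT) | MMU_UPDATE_PREEMPTED,
 * i.e. the top bit flags the resumption, the next 15 bits carry the domain
 * id, and the low 16 bits hold the number of requests remaining.
 */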
1327 mmu_update_t req;
1328 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
1329 struct pfn_info *page;
1330 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
1331 unsigned int cmd, done = 0;
1332 unsigned long prev_smfn = 0;
1333 l1_pgentry_t *prev_spl1e = 0;
1334 struct exec_domain *ed = current;
1335 struct domain *d = ed->domain;
1336 u32 type_info;
1337 domid_t domid;
1339 LOCK_BIGLOCK(d);
1341 cleanup_writable_pagetable(d);
1343 if ( unlikely(shadow_mode(d)) )
1344 check_pagetable(d, ed->arch.pagetable, "pre-mmu"); /* debug */
1346 /*
1347 * If we are resuming after preemption, read how much work we have already
1348 * done. This allows us to set the @done output parameter correctly.
1349 * We also reset FOREIGNDOM here.
1350 */
1351 if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
1353 if ( !(count & MMU_UPDATE_PREEMPTED) )
1355 /* Count overflow into private FOREIGNDOM field. */
1356 MEM_LOG("do_mmu_update count is too large");
1357 rc = -EINVAL;
1358 goto out;
1360 count &= ~MMU_UPDATE_PREEMPTED;
1361 domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
1362 count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
1363 if ( unlikely(pdone != NULL) )
1364 (void)get_user(done, pdone);
1365 if ( (domid != current->domain->id) &&
1366 !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
1368 rc = -EINVAL;
1369 goto out;
1373 perfc_incrc(calls_to_mmu_update);
1374 perfc_addc(num_page_updates, count);
1376 if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
1378 rc = -EFAULT;
1379 goto out;
1382 for ( i = 0; i < count; i++ )
1384 if ( hypercall_preempt_check() )
1386 rc = hypercall3_create_continuation(
1387 __HYPERVISOR_mmu_update, ureqs,
1388 (count - i) |
1389 (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) |
1390 MMU_UPDATE_PREEMPTED, pdone);
1391 break;
1394 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
1396 MEM_LOG("Bad __copy_from_user");
1397 rc = -EFAULT;
1398 break;
1401 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
1402 pfn = req.ptr >> PAGE_SHIFT;
1404 okay = 0;
1406 switch ( cmd )
1408 /*
1409 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
1410 */
1411 case MMU_NORMAL_PT_UPDATE:
1412 if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
1414 MEM_LOG("Could not get page for normal update");
1415 break;
1418 if ( likely(prev_pfn == pfn) )
1420 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
1422 else
1424 if ( prev_pfn != 0 )
1425 unmap_domain_mem((void *)va);
1426 va = (unsigned long)map_domain_mem(req.ptr);
1427 prev_pfn = pfn;
1430 page = &frame_table[pfn];
1431 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
1433 case PGT_l1_page_table:
1434 if ( likely(get_page_type(
1435 page, type_info & (PGT_type_mask|PGT_va_mask))) )
1437 okay = mod_l1_entry((l1_pgentry_t *)va,
1438 mk_l1_pgentry(req.val));
1440 if ( unlikely(shadow_mode(d)) && okay &&
1441 (get_shadow_status(d, page-frame_table) &
1442 PSH_shadowed) )
1444 shadow_l1_normal_pt_update(
1445 req.ptr, req.val, &prev_smfn, &prev_spl1e);
1446 put_shadow_status(d);
1449 put_page_type(page);
1451 break;
1452 case PGT_l2_page_table:
1453 if ( likely(get_page_type(page, PGT_l2_page_table)) )
1455 okay = mod_l2_entry((l2_pgentry_t *)va,
1456 mk_l2_pgentry(req.val),
1457 pfn);
1459 if ( unlikely(shadow_mode(d)) && okay &&
1460 (get_shadow_status(d, page-frame_table) &
1461 PSH_shadowed) )
1463 shadow_l2_normal_pt_update(req.ptr, req.val);
1464 put_shadow_status(d);
1467 put_page_type(page);
1469 break;
1470 default:
1471 if ( likely(get_page_type(page, PGT_writable_page)) )
1473 *(unsigned long *)va = req.val;
1474 okay = 1;
1475 put_page_type(page);
1477 break;
1480 put_page(page);
1481 break;
1483 case MMU_MACHPHYS_UPDATE:
1484 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
1486 MEM_LOG("Could not get page for mach->phys update");
1487 break;
1490 machine_to_phys_mapping[pfn] = req.val;
1491 okay = 1;
1493 /*
1494 * If in log-dirty mode, mark the corresponding pseudo-physical
1495 * page as dirty.
1496 */
1497 if ( unlikely(shadow_mode(d) == SHM_logdirty) &&
1498 mark_dirty(d, pfn) )
1499 d->arch.shadow_dirty_block_count++;
1501 put_page(&frame_table[pfn]);
1502 break;
1504 /*
1505 * MMU_EXTENDED_COMMAND: Extended command is specified
1506 * in the least-significant bits of the 'value' field.
1507 */
1508 case MMU_EXTENDED_COMMAND:
1509 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
1510 okay = do_extended_command(req.ptr, req.val);
1511 break;
1513 default:
1514 MEM_LOG("Invalid page update command %08lx", req.ptr);
1515 break;
1518 if ( unlikely(!okay) )
1520 rc = -EINVAL;
1521 break;
1524 ureqs++;
1527 out:
1528 if ( prev_pfn != 0 )
1529 unmap_domain_mem((void *)va);
1531 if ( unlikely(prev_spl1e != 0) )
1532 unmap_domain_mem((void *)prev_spl1e);
1534 deferred_ops = percpu_info[cpu].deferred_ops;
1535 percpu_info[cpu].deferred_ops = 0;
1537 if ( deferred_ops & DOP_FLUSH_TLB )
1538 local_flush_tlb();
1540 if ( deferred_ops & DOP_RELOAD_LDT )
1541 (void)map_ldt_shadow_page(0);
1543 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1545 put_domain(percpu_info[cpu].foreign);
1546 percpu_info[cpu].foreign = NULL;
1549 /* Add incremental work we have done to the @done output parameter. */
1550 if ( unlikely(pdone != NULL) )
1551 __put_user(done + i, pdone);
1553 if ( unlikely(shadow_mode(d)) )
1554 check_pagetable(d, ed->arch.pagetable, "post-mmu"); /* debug */
1556 UNLOCK_BIGLOCK(d);
1557 return rc;
1561 int do_update_va_mapping(unsigned long page_nr,
1562 unsigned long val,
1563 unsigned long flags)
1565 struct exec_domain *ed = current;
1566 struct domain *d = ed->domain;
1567 int err = 0;
1568 unsigned int cpu = ed->processor;
1569 unsigned long deferred_ops;
1571 perfc_incrc(calls_to_update_va);
1573 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
1574 return -EINVAL;
1576 LOCK_BIGLOCK(d);
1578 cleanup_writable_pagetable(d);
1580 /*
1581 * XXX When we make this support 4MB superpages we should also deal with
1582 * the case of updating L2 entries.
1583 */
1585 if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
1586 mk_l1_pgentry(val))) )
1587 err = -EINVAL;
1589 if ( unlikely(shadow_mode(d)) )
1591 unsigned long sval = 0;
1593 l1pte_propagate_from_guest(d, &val, &sval);
1595 if ( unlikely(__put_user(sval, ((unsigned long *)(
1596 &shadow_linear_pg_table[page_nr])))) )
1598 /*
1599 * Since L2's are guaranteed RW, failure indicates the page was not
1600 * shadowed, so ignore.
1601 */
1602 perfc_incrc(shadow_update_va_fail);
1605 /*
1606 * If we're in log-dirty mode then we need to note that we've updated
1607 * the PTE in the PT-holding page. We need the machine frame number
1608 * for this.
1609 */
1610 if ( shadow_mode(d) == SHM_logdirty )
1611 mark_dirty(d, va_to_l1mfn(page_nr << PAGE_SHIFT));
1613 check_pagetable(d, ed->arch.pagetable, "va"); /* debug */
1616 deferred_ops = percpu_info[cpu].deferred_ops;
1617 percpu_info[cpu].deferred_ops = 0;
1619 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
1620 unlikely(flags & UVMF_FLUSH_TLB) )
1621 local_flush_tlb();
1622 else if ( unlikely(flags & UVMF_INVLPG) )
1623 __flush_tlb_one(page_nr << PAGE_SHIFT);
1625 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
1626 (void)map_ldt_shadow_page(0);
1628 UNLOCK_BIGLOCK(d);
1630 return err;
1633 int do_update_va_mapping_otherdomain(unsigned long page_nr,
1634 unsigned long val,
1635 unsigned long flags,
1636 domid_t domid)
1638 unsigned int cpu = smp_processor_id();
1639 struct domain *d;
1640 int rc;
1642 if ( unlikely(!IS_PRIV(current->domain)) )
1643 return -EPERM;
1645 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
1646 if ( unlikely(d == NULL) )
1648 MEM_LOG("Unknown domain '%u'", domid);
1649 return -ESRCH;
1652 rc = do_update_va_mapping(page_nr, val, flags);
1654 put_domain(d);
1655 percpu_info[cpu].foreign = NULL;
1657 return rc;
1662 /*************************
1663 * Descriptor Tables
1664 */
1666 void destroy_gdt(struct exec_domain *ed)
1668 int i;
1669 unsigned long pfn;
1671 for ( i = 0; i < 16; i++ )
1673 if ( (pfn = l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[i])) != 0 )
1674 put_page_and_type(&frame_table[pfn]);
1675 ed->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
1680 long set_gdt(struct exec_domain *ed,
1681 unsigned long *frames,
1682 unsigned int entries)
1684 struct domain *d = ed->domain;
1685 /* NB. There are 512 8-byte entries per GDT page. */
1686 int i = 0, nr_pages = (entries + 511) / 512;
1687 struct desc_struct *vgdt;
1688 unsigned long pfn;
1690 /* Check the first page in the new GDT. */
1691 if ( (pfn = frames[0]) >= max_page )
1692 goto fail;
1694 /* The first page is special because Xen owns a range of entries in it. */
1695 if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
1697 /* GDT checks failed: try zapping the Xen reserved entries. */
1698 if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) )
1699 goto fail;
1700 vgdt = map_domain_mem(pfn << PAGE_SHIFT);
1701 memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0,
1702 NR_RESERVED_GDT_ENTRIES*8);
1703 unmap_domain_mem(vgdt);
1704 put_page_and_type(&frame_table[pfn]);
1706 /* Okay, we zapped the entries. Now try the GDT checks again. */
1707 if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
1708 goto fail;
1711 /* Check the remaining pages in the new GDT. */
1712 for ( i = 1; i < nr_pages; i++ )
1713 if ( ((pfn = frames[i]) >= max_page) ||
1714 !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
1715 goto fail;
1717 /* Copy reserved GDT entries to the new GDT. */
1718 vgdt = map_domain_mem(frames[0] << PAGE_SHIFT);
1719 memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY,
1720 gdt_table + FIRST_RESERVED_GDT_ENTRY,
1721 NR_RESERVED_GDT_ENTRIES*8);
1722 unmap_domain_mem(vgdt);
1724 /* Tear down the old GDT. */
1725 destroy_gdt(ed);
1727 /* Install the new GDT. */
1728 for ( i = 0; i < nr_pages; i++ )
1729 ed->arch.perdomain_ptes[i] =
1730 mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR);
1732 SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed));
1733 SET_GDT_ENTRIES(ed, entries);
1735 return 0;
1737 fail:
1738 while ( i-- > 0 )
1739 put_page_and_type(&frame_table[frames[i]]);
1740 return -EINVAL;
1744 long do_set_gdt(unsigned long *frame_list, unsigned int entries)
1746 int nr_pages = (entries + 511) / 512;
1747 unsigned long frames[16];
1748 long ret;
1750 if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) )
1751 return -EINVAL;
1753 if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
1754 return -EFAULT;
1756 LOCK_BIGLOCK(current->domain);
1758 if ( (ret = set_gdt(current, frames, entries)) == 0 )
1760 local_flush_tlb();
1761 __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt));
1764 UNLOCK_BIGLOCK(current->domain);
1766 return ret;
1770 long do_update_descriptor(
1771 unsigned long pa, unsigned long word1, unsigned long word2)
1773 unsigned long pfn = pa >> PAGE_SHIFT;
1774 struct desc_struct *gdt_pent, d;
1775 struct pfn_info *page;
1776 struct exec_domain *ed;
1777 long ret = -EINVAL;
1779 d.a = (u32)word1;
1780 d.b = (u32)word2;
1782 LOCK_BIGLOCK(current->domain);
1784 if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(&d) ) {
1785 UNLOCK_BIGLOCK(current->domain);
1786 return -EINVAL;
1789 page = &frame_table[pfn];
1790 if ( unlikely(!get_page(page, current->domain)) ) {
1791 UNLOCK_BIGLOCK(current->domain);
1792 return -EINVAL;
1795 /* Check if the given frame is in use in an unsafe context. */
1796 switch ( page->u.inuse.type_info & PGT_type_mask )
1798 case PGT_gdt_page:
1799 /* Disallow updates of Xen-reserved descriptors in the current GDT. */
1800 for_each_exec_domain(current->domain, ed) {
1801 if ( (l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[0]) == pfn) &&
1802 (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
1803 (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
1804 goto out;
1806 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
1807 goto out;
1808 break;
1809 case PGT_ldt_page:
1810 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
1811 goto out;
1812 break;
1813 default:
1814 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
1815 goto out;
1816 break;
1819 /* All is good so make the update. */
1820 gdt_pent = map_domain_mem(pa);
1821 memcpy(gdt_pent, &d, 8);
1822 unmap_domain_mem(gdt_pent);
1824 put_page_type(page);
1826 ret = 0; /* success */
1828 out:
1829 put_page(page);
1831 UNLOCK_BIGLOCK(current->domain);
1833 return ret;
1838 /*************************
1839 * Writable Pagetables
1840 */
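/*
 * Overview of the mechanism implemented below: when a guest writes to one of
 * its own (read-only) L1 page-table pages, ptwr_do_page_fault() unhooks the
 * page from its L2 slot if it is part of the current address space, takes a
 * snapshot of its contents, and gives the guest a temporarily writable
 * mapping so that further PTE writes proceed without faulting. ptwr_flush()
 * later write-protects the page again and revalidates every entry that
 * changed relative to the snapshot, fixing the reference counts via
 * get_page_from_l1e()/put_page_from_l1e().
 */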
1842 ptwr_info_t ptwr_info[NR_CPUS];
1844 #ifdef VERBOSE
1845 int ptwr_debug = 0x0;
1846 #define PTWR_PRINTK(_f, _a...) \
1847 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
1848 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
1849 #else
1850 #define PTWR_PRINTK(_f, _a...) ((void)0)
1851 #endif
1853 /* Flush the given writable p.t. page and write-protect it again. */
1854 void ptwr_flush(const int which)
1856 unsigned long sstat, spte, pte, *ptep, l1va;
1857 l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e;
1858 l2_pgentry_t *pl2e;
1859 int i, cpu = smp_processor_id();
1860 struct exec_domain *ed = current;
1861 struct domain *d = ed->domain;
1863 l1va = ptwr_info[cpu].ptinfo[which].l1va;
1864 ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
1866 /*
1867 * STEP 1. Write-protect the p.t. page so no more updates can occur.
1868 */
1870 if ( unlikely(__get_user(pte, ptep)) )
1872 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1873 /*
1874 * Really a bug. We could read this PTE during the initial fault,
1875 * and pagetables can't have changed in the meantime. XXX Multi-CPU guests?
1876 */
1877 BUG();
1879 PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
1880 PTWR_PRINT_WHICH, ptep, pte);
1881 pte &= ~_PAGE_RW;
1883 if ( unlikely(shadow_mode(d)) )
1885 /* Write-protect the p.t. page in the shadow page table. */
1886 l1pte_propagate_from_guest(d, &pte, &spte);
1887 __put_user(
1888 spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
1890 /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
1891 sstat = get_shadow_status(d, pte >> PAGE_SHIFT);
1892 if ( sstat & PSH_shadowed )
1893 sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
1896 /* Write-protect the p.t. page in the guest page table. */
1897 if ( unlikely(__put_user(pte, ptep)) )
1899 MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
1900 /*
1901 * Really a bug. We could write this PTE during the initial fault,
1902 * and pagetables can't have changed in the meantime. XXX Multi-CPU guests?
1903 */
1904 BUG();
1907 /* Ensure that there are no stale writable mappings in any TLB. */
1908 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
1909 #if 1
1910 __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
1911 #else
1912 flush_tlb_all();
1913 #endif
1914 PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
1915 PTWR_PRINT_WHICH, ptep, pte);
1917 /*
1918 * STEP 2. Validate any modified PTEs.
1919 */
1921 pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
1922 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1924 ol1e = ptwr_info[cpu].ptinfo[which].page[i];
1925 nl1e = pl1e[i];
1927 if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
1928 continue;
1930 /*
1931 * Fast path for PTEs that have merely been write-protected
1932 * (e.g., during a Unix fork()). A strict reduction in privilege.
1933 */
1934 if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
1936 if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
1938 if ( unlikely(sl1e != NULL) )
1939 l1pte_propagate_from_guest(
1940 d, &l1_pgentry_val(nl1e),
1941 &l1_pgentry_val(sl1e[i]));
1942 put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
1944 continue;
1947 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
1949 MEM_LOG("ptwr: Could not re-validate l1 page\n");
1950 /*
1951 * Make the remaining p.t's consistent before crashing, so the
1952 * reference counts are correct.
1953 */
1954 memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
1955 (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
1956 unmap_domain_mem(pl1e);
1957 ptwr_info[cpu].ptinfo[which].l1va = 0;
1958 UNLOCK_BIGLOCK(d);
1959 domain_crash();
1962 if ( unlikely(sl1e != NULL) )
1963 l1pte_propagate_from_guest(
1964 d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
1966 if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
1967 put_page_from_l1e(ol1e, d);
1969 unmap_domain_mem(pl1e);
1971 /*
1972 * STEP 3. Reattach the L1 p.t. page into the current address space.
1973 */
1975 if ( (which == PTWR_PT_ACTIVE) && likely(!shadow_mode(d)) )
1977 pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
1978 *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
1981 /*
1982 * STEP 4. Final tidy-up.
1983 */
1985 ptwr_info[cpu].ptinfo[which].l1va = 0;
1987 if ( unlikely(sl1e != NULL) )
1989 unmap_domain_mem(sl1e);
1990 put_shadow_status(d);
1994 /* Write page fault handler: check if guest is trying to modify a PTE. */
1995 int ptwr_do_page_fault(unsigned long addr)
1997 unsigned long pte, pfn, l2e;
1998 struct pfn_info *page;
1999 l2_pgentry_t *pl2e;
2000 int which, cpu = smp_processor_id();
2001 u32 l2_idx;
2003 /*
2004 * Attempt to read the PTE that maps the VA being accessed. By checking for
2005 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
2006 */
2007 if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
2008 _PAGE_PRESENT) ||
2009 __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
2011 return 0;
2014 pfn = pte >> PAGE_SHIFT;
2015 page = &frame_table[pfn];
2017 /* We are looking only for read-only mappings of p.t. pages. */
2018 if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
2019 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
2021 return 0;
2024 /* Get the L2 index at which this L1 p.t. is always mapped. */
2025 l2_idx = page->u.inuse.type_info & PGT_va_mask;
2026 if ( unlikely(l2_idx >= PGT_va_unknown) )
2028 domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
2030 l2_idx >>= PGT_va_shift;
2032 if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
2034 MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
2035 domain_crash();
2038 /*
2039 * Is the L1 p.t. mapped into the current address space? If so we call it
2040 * an ACTIVE p.t., otherwise it is INACTIVE.
2041 */
2042 pl2e = &linear_l2_table[l2_idx];
2043 l2e = l2_pgentry_val(*pl2e);
2044 which = PTWR_PT_INACTIVE;
2045 if ( (l2e >> PAGE_SHIFT) == pfn )
2047 /* Check the PRESENT bit to set ACTIVE. */
2048 if ( likely(l2e & _PAGE_PRESENT) )
2049 which = PTWR_PT_ACTIVE;
2050 else {
2051 /*
2052 * If the PRESENT bit is clear, we may be conflicting with
2053 * the current ACTIVE p.t. (it may be the same p.t. mapped
2054 * at another virt addr).
2055 * The ptwr_flush call below will restore the PRESENT bit.
2056 */
2057 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
2058 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
2059 which = PTWR_PT_ACTIVE;
2063 PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
2064 "pfn %08lx\n", PTWR_PRINT_WHICH,
2065 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
2067 /*
2068 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
2069 * time. If there is already one, we must flush it out.
2070 */
2071 if ( ptwr_info[cpu].ptinfo[which].l1va )
2072 ptwr_flush(which);
2074 ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
2075 ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
2077 /* For safety, disconnect the L1 p.t. page from current space. */
2078 if ( (which == PTWR_PT_ACTIVE) &&
2079 likely(!shadow_mode(current->domain)) )
2081 *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
2082 #if 1
2083 flush_tlb(); /* XXX Multi-CPU guests? */
2084 #else
2085 flush_tlb_all();
2086 #endif
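/*
 * With _PAGE_PRESENT clear and the local TLB flushed, no stale
 * writable mapping of the p.t. page remains on this CPU.  As the XXX
 * above notes, other CPUs' TLBs are not flushed here, which only
 * matters for multi-CPU guests.
 */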
2089 /* Temporarily map the L1 page, and make a copy of it. */
2090 ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
2091 memcpy(ptwr_info[cpu].ptinfo[which].page,
2092 ptwr_info[cpu].ptinfo[which].pl1e,
2093 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
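/*
 * This snapshot is what ptwr_flush() later compares against the live
 * page, entry by entry, to discover exactly which PTEs the guest
 * modified while it held the page writable.
 */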
2095 /* Finally, make the p.t. page writable by the guest OS. */
2096 pte |= _PAGE_RW;
2097 PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
2098 &linear_pg_table[addr>>PAGE_SHIFT], pte);
2099 if ( unlikely(__put_user(pte, (unsigned long *)
2100 &linear_pg_table[addr>>PAGE_SHIFT])) )
2102 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
2103 &linear_pg_table[addr>>PAGE_SHIFT]);
2104 /* Toss the writable pagetable state and crash. */
2105 unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
2106 ptwr_info[cpu].ptinfo[which].l1va = 0;
2107 domain_crash();
2110 return EXCRET_fault_fixed;
2113 static __init int ptwr_init(void)
2115 int i;
2117 for ( i = 0; i < smp_num_cpus; i++ )
2119 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
2120 (void *)alloc_xenheap_page();
2121 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
2122 (void *)alloc_xenheap_page();
2125 return 0;
2127 __initcall(ptwr_init);
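/*
 * ptwr_init() runs once at boot and gives each CPU its own pair of
 * snapshot pages, one for the ACTIVE slot and one for the INACTIVE
 * slot used by ptwr_do_page_fault() and ptwr_flush() above.
 */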
2132 /************************************************************************/
2133 /************************************************************************/
2134 /************************************************************************/
2136 #ifndef NDEBUG
2138 void ptwr_status(void)
2140 unsigned long pte, *ptep, pfn;
2141 struct pfn_info *page;
2142 int cpu = smp_processor_id();
2144 ptep = (unsigned long *)&linear_pg_table
2145 [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
2147 if ( __get_user(pte, ptep) ) {
2148 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
2149 domain_crash();
2152 pfn = pte >> PAGE_SHIFT;
2153 page = &frame_table[pfn];
2154 printk("need to alloc l1 page %p\n", page);
2155 /* make pt page writable */
2156 printk("need to make read-only l1-page at %p is %08lx\n",
2157 ptep, pte);
2159 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
2160 return;
2162 if ( __get_user(pte, (unsigned long *)
2163 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
2164 MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
2165 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
2166 domain_crash();
2168 pfn = pte >> PAGE_SHIFT;
2169 page = &frame_table[pfn];
2172 void audit_domain(struct domain *d)
2174 int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
2176 void adjust (struct pfn_info *page, int dir, int adjtype)
2178 int count = page->count_info & PGC_count_mask;
2180 if ( adjtype )
2182 int tcount = page->u.inuse.type_info & PGT_count_mask;
2184 ttot++;
2186 tcount += dir;
2188 if ( tcount < 0 )
2190 /* This will only come out once. */
2191 printk("Audit %d: type count whent below zero pfn=%x "
2192 "taf=%x otaf=%x\n",
2193 d->id, page-frame_table,
2194 page->u.inuse.type_info,
2195 page->tlbflush_timestamp);
2198 page->u.inuse.type_info =
2199 (page->u.inuse.type_info & ~PGT_count_mask) |
2200 (tcount & PGT_count_mask);
2203 ctot++;
2204 count += dir;
2205 if ( count < 0 )
2207 /* This will only come out once. */
2208 printk("Audit %d: general count whent below zero pfn=%x "
2209 "taf=%x otaf=%x\n",
2210 d->id, page-frame_table,
2211 page->u.inuse.type_info,
2212 page->tlbflush_timestamp);
2215 page->count_info =
2216 (page->count_info & ~PGC_count_mask) |
2217 (count & PGC_count_mask);
2221 void scan_for_pfn(struct domain *d, unsigned long xpfn)
2223 unsigned long pfn, *pt;
2224 struct list_head *list_ent;
2225 struct pfn_info *page;
2226 int i;
2228 list_ent = d->page_list.next;
2229 for ( i = 0; (list_ent != &d->page_list); i++ )
2231 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2232 page = &frame_table[pfn];
2234 switch ( page->u.inuse.type_info & PGT_type_mask )
2236 case PGT_l1_page_table:
2237 case PGT_l2_page_table:
2238 pt = map_domain_mem(pfn<<PAGE_SHIFT);
2239 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2240 if ( (pt[i] & _PAGE_PRESENT) &&
2241 ((pt[i] >> PAGE_SHIFT) == xpfn) )
2242 printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
2243 d->id, i, pfn, page->u.inuse.type_info,
2244 page->count_info);
2245 unmap_domain_mem(pt);
2248 list_ent = frame_table[pfn].list.next;
2253 void scan_for_pfn_remote(unsigned long xpfn)
2255 struct domain *e;
2256 for_each_domain ( e )
2257 scan_for_pfn( e, xpfn );
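/*
 * The two scanners above walk the L1/L2 page-table pages of one
 * domain, or of every domain, looking for present entries that
 * reference xpfn; they are used below to track down who still holds a
 * mapping whenever a count mismatch is reported.
 */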
2260 int i;
2261 unsigned long pfn;
2262 struct list_head *list_ent;
2263 struct pfn_info *page;
2265 if ( d != current->domain )
2266 domain_pause(d);
2267 synchronise_pagetables(~0UL);
2269 printk("pt base=%lx sh_info=%x\n",
2270 pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT,
2271 virt_to_page(d->shared_info)-frame_table);
2273 spin_lock(&d->page_alloc_lock);
2275 /* PHASE 0 */
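/*
 * Phase 0: sanity-check every page owned by the domain (ownership,
 * type count vs. general count) and stash the original type_info in
 * tlbflush_timestamp so it can be reported as "ot"/"otaf" later.
 */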
2277 list_ent = d->page_list.next;
2278 for ( i = 0; (list_ent != &d->page_list); i++ )
2280 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2281 page = &frame_table[pfn];
2283 if ( page_get_owner(page) != d )
2284 BUG();
2286 if ( (page->u.inuse.type_info & PGT_count_mask) >
2287 (page->count_info & PGC_count_mask) )
2288 printk("taf > caf %x %x pfn=%lx\n",
2289 page->u.inuse.type_info, page->count_info, pfn );
2291 #if 0 /* SYSV shared memory pages plus writeable files. */
2292 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
2293 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2295 printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
2296 pfn,
2297 page->u.inuse.type_info,
2298 page->count_info );
2299 scan_for_pfn_remote(pfn);
2301 #endif
2302 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
2303 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2305 printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
2306 pfn,
2307 page->u.inuse.type_info,
2308 page->count_info );
2311 /* Use tlbflush_timestamp to store original type_info. */
2312 page->tlbflush_timestamp = page->u.inuse.type_info;
2314 list_ent = frame_table[pfn].list.next;
2318 /* PHASE 1 */
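/*
 * Phase 1: subtract every reference the audit can account for -- the
 * base page table, pinned L1/L2 pages, and each present L2/L1 entry
 * that references one of this domain's pages (low-memory and I/O
 * mappings are merely counted and reported).  Whatever is left over
 * is examined in phase 2.
 */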
2320 adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], -1, 1);
2322 list_ent = d->page_list.next;
2323 for ( i = 0; (list_ent != &d->page_list); i++ )
2325 unsigned long *pt;
2326 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2327 page = &frame_table[pfn];
2329 if ( page_get_owner(page) != d )
2330 BUG();
2332 switch ( page->u.inuse.type_info & PGT_type_mask )
2334 case PGT_l2_page_table:
2336 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2337 printk("Audit %d: L2 not validated %x\n",
2338 d->id, page->u.inuse.type_info);
2340 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2341 printk("Audit %d: L2 not pinned %x\n",
2342 d->id, page->u.inuse.type_info);
2343 else
2344 adjust( page, -1, 1 );
2346 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2348 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2350 if ( pt[i] & _PAGE_PRESENT )
2352 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2353 struct pfn_info *l1page = &frame_table[l1pfn];
2355 if ( page_get_owner(l1page) != d )
2357 printk("L2: Skip bizarre page belonging to other "
2358 "dom %p\n", page_get_owner(l1page));
2359 continue;
2362 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2363 PGT_l2_page_table )
2364 printk("Audit %d: [%x] Found %s Linear PT "
2365 "t=%x pfn=%lx\n", d->id, i,
2366 (l1pfn==pfn) ? "Self" : "Other",
2367 l1page->u.inuse.type_info,
2368 l1pfn);
2369 else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
2370 PGT_l1_page_table )
2371 printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
2372 d->id, i,
2373 l1page->u.inuse.type_info,
2374 l1pfn);
2376 adjust(l1page, -1, 1);
2380 unmap_domain_mem(pt);
2382 break;
2385 case PGT_l1_page_table:
2387 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2388 adjust( page, -1, 1 );
2390 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2391 printk("Audit %d: L1 not validated %x\n",
2392 d->id, page->u.inuse.type_info);
2393 #if 0
2394 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2395 printk("Audit %d: L1 not pinned %x\n",
2396 d->id, page->u.inuse.type_info);
2397 #endif
2398 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2400 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2402 if ( pt[i] & _PAGE_PRESENT )
2404 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2405 struct pfn_info *l1page = &frame_table[l1pfn];
2407 if ( l1pfn < 0x100 )
2409 lowmem_mappings++;
2410 continue;
2413 if ( l1pfn > max_page )
2415 io_mappings++;
2416 continue;
2419 if ( pt[i] & _PAGE_RW )
2422 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2423 PGT_l1_page_table ||
2424 (l1page->u.inuse.type_info & PGT_type_mask) ==
2425 PGT_l2_page_table )
2426 printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n",
2427 d->id, i,
2428 l1page->u.inuse.type_info,
2429 l1pfn);
2433 if ( page_get_owner(l1page) != d )
2435 printk("Audit %d: [%lx,%x] Skip foreign page dom=%p "
2436 "pfn=%lx c=%08x t=%08x m2p=%lx\n",
2437 d->id, pfn, i,
2438 page_get_owner(l1page),
2439 l1pfn,
2440 l1page->count_info,
2441 l1page->u.inuse.type_info,
2442 machine_to_phys_mapping[l1pfn]);
2443 continue;
2446 adjust(l1page, -1, 0);
2450 unmap_domain_mem(pt);
2452 break;
2455 list_ent = frame_table[pfn].list.next;
2458 if ( (io_mappings > 0) || (lowmem_mappings > 0) )
2459 printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
2460 d->id, lowmem_mappings, io_mappings);
2462 /* PHASE 2 */
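/*
 * Phase 2: with all accounted references subtracted, type counts on
 * page-table pages should now be zero and general counts should be
 * down to the single allocation reference.  Anything else is reported,
 * and scan_for_pfn_remote() hunts for the mappings responsible.
 */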
2464 ctot = ttot = 0;
2465 list_ent = d->page_list.next;
2466 for ( i = 0; (list_ent != &d->page_list); i++ )
2468 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2469 page = &frame_table[pfn];
2471 switch ( page->u.inuse.type_info & PGT_type_mask)
2473 case PGT_l1_page_table:
2474 case PGT_l2_page_table:
2475 if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
2477 printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
2478 d->id, page->u.inuse.type_info,
2479 page->tlbflush_timestamp,
2480 page->count_info, pfn );
2481 scan_for_pfn_remote(pfn);
2483 default:
2484 if ( (page->count_info & PGC_count_mask) != 1 )
2486 printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
2487 d->id,
2488 page->count_info,
2489 page->u.inuse.type_info,
2490 page->tlbflush_timestamp, pfn );
2491 scan_for_pfn_remote(pfn);
2493 break;
2496 list_ent = frame_table[pfn].list.next;
2499 /* PHASE 3 */
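/*
 * Phase 3: put everything back -- re-apply the phase 1 adjustments
 * with dir=+1 and clear the tlbflush_timestamp scratch values, so the
 * audit leaves a healthy domain's counts exactly as it found them.
 */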
2500 list_ent = d->page_list.next;
2501 for ( i = 0; (list_ent != &d->page_list); i++ )
2503 unsigned long *pt;
2504 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2505 page = &frame_table[pfn];
2507 switch ( page->u.inuse.type_info & PGT_type_mask )
2509 case PGT_l2_page_table:
2510 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2511 adjust( page, 1, 1 );
2513 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2515 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2517 if ( pt[i] & _PAGE_PRESENT )
2519 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2520 struct pfn_info *l1page;
2522 if (l1pfn>max_page)
2523 continue;
2525 l1page = &frame_table[l1pfn];
2527 if ( page_get_owner(l1page) == d )
2528 adjust(l1page, 1, 1);
2532 unmap_domain_mem(pt);
2533 break;
2535 case PGT_l1_page_table:
2536 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2537 adjust( page, 1, 1 );
2539 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2541 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2543 if ( pt[i] & _PAGE_PRESENT )
2545 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2546 struct pfn_info *l1page;
2548 if (l1pfn>max_page)
2549 continue;
2551 l1page = &frame_table[l1pfn];
2553 if ( (page_get_owner(l1page) != d) ||
2554 (l1pfn < 0x100) || (l1pfn > max_page) )
2555 continue;
2557 adjust(l1page, 1, 0);
2561 unmap_domain_mem(pt);
2562 break;
2566 page->tlbflush_timestamp = 0;
2568 list_ent = frame_table[pfn].list.next;
2571 spin_unlock(&d->page_alloc_lock);
2573 adjust(&frame_table[pagetable_val(
2574 d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], 1, 1);
2576 printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );
2578 if ( d != current->domain )
2579 domain_unpause(d);
2582 void audit_domains(void)
2584 struct domain *d;
2585 for_each_domain ( d )
2586 audit_domain(d);
2589 void audit_domains_key(unsigned char key)
2591 audit_domains();
2594 #endif