view xen/arch/x86/mm.c @ 4629:6375127fdf23

bitkeeper revision 1.1311.1.1 (426641eeBv97w6sl983zxeR4Dc3Utg)

Clean up page table handling. Add macros to access page table
entries, and fix up plenty of places in the code to use the page
table types instead of "unsigned long".

Signed-off-by: Gerd Knorr <kraxel@bytesex.org>
Signed-off-by: michael.fetterman@cl.cam.ac.uk
author mafetter@fleming.research
date Wed Apr 20 11:50:06 2005 +0000 (2005-04-20)
parents eb5407610fab
children 1803018b3b05
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may be used in none of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OSes, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, the OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
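/*
 * [Editor's illustrative sketch, not part of mm.c] To make the API above
 * concrete, this is roughly what a paravirtualised guest does when it wants
 * to change one of its own PTEs and keep a page table pinned as an L2. It
 * assumes the guest-side hypercall wrappers and the mmu_update_t /
 * struct mmuext_op layouts from Xen's public headers; exact names and
 * signatures may differ between releases.
 */
void example_update_and_pin(unsigned long pte_machine_addr,
                            unsigned long new_pte_val,
                            unsigned long l2_mfn)
{
    mmu_update_t req;
    struct mmuext_op op;
    unsigned int done = 0;

    /* (ptr, val) pair: the low bits of ptr select the command;
     * MMU_NORMAL_PT_UPDATE (0) means "*ptr = val" after validation. */
    req.ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE;
    req.val = new_pte_val;
    (void)HYPERVISOR_mmu_update(&req, 1, &done, DOMID_SELF);

    /* Pinning bumps the L2's type count so it cannot fall to zero and
     * force revalidation the next time it is used as a page table. */
    op.cmd = MMUEXT_PIN_L2_TABLE;
    op.mfn = l2_mfn;
    (void)HYPERVISOR_mmuext_op(&op, 1, &done, DOMID_SELF);
}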
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/sched.h>
93 #include <xen/errno.h>
94 #include <xen/perfc.h>
95 #include <xen/irq.h>
96 #include <xen/softirq.h>
97 #include <asm/shadow.h>
98 #include <asm/page.h>
99 #include <asm/flushtlb.h>
100 #include <asm/io.h>
101 #include <asm/uaccess.h>
102 #include <asm/domain_page.h>
103 #include <asm/ldt.h>
104 #include <asm/x86_emulate.h>
106 #ifdef VERBOSE
107 #define MEM_LOG(_f, _a...) \
108 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
109 current->domain->id , __LINE__ , ## _a )
110 #else
111 #define MEM_LOG(_f, _a...) ((void)0)
112 #endif
114 /*
115 * Both do_mmuext_op() and do_mmu_update():
116 * We steal the m.s.b. of the @count parameter to indicate whether this
117 * invocation is resuming a previously preempted call.
118 */
119 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
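/*
 * [Editor's note] ~(~0U>>1) is simply the most significant bit of an
 * unsigned int (0x80000000 for a 32-bit count). A preempted call is
 * restarted further down as, e.g., (count - i) | MMU_UPDATE_PREEMPTED,
 * and the resumed invocation recovers the remaining work with
 * count &= ~MMU_UPDATE_PREEMPTED.
 */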
121 static void free_l2_table(struct pfn_info *page);
122 static void free_l1_table(struct pfn_info *page);
124 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
125 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
127 /* Used to defer flushing of memory structures. */
128 static struct {
129 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
130 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
131 unsigned int deferred_ops;
132 /* If non-NULL, specifies a foreign subject domain for some operations. */
133 struct domain *foreign;
134 } __cacheline_aligned percpu_info[NR_CPUS];
136 /*
137 * Returns the current foreign domain; defaults to the currently-executing
138 * domain if a foreign override hasn't been specified.
139 */
140 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
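/*
 * [Editor's note] The "x ? : y" form is the GCC conditional-with-omitted-
 * middle-operand extension: it evaluates to x when x is non-NULL and to y
 * otherwise, without evaluating x twice.
 */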
142 /* Private domain structs for DOMID_XEN and DOMID_IO. */
143 static struct domain *dom_xen, *dom_io;
145 /* Frame table and its size in pages. */
146 struct pfn_info *frame_table;
147 unsigned long frame_table_size;
148 unsigned long max_page;
150 void __init init_frametable(void)
151 {
152 unsigned long i, p;
154 frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
155 frame_table_size = max_page * sizeof(struct pfn_info);
156 frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
158 for ( i = 0; i < frame_table_size; i += (4UL << 20) )
159 {
160 p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
161 if ( p == 0 )
162 panic("Not enough memory for frame table\n");
163 map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p,
164 4UL << 20, PAGE_HYPERVISOR);
165 }
167 memset(frame_table, 0, frame_table_size);
168 }
170 void arch_init_memory(void)
171 {
172 extern void subarch_init_memory(struct domain *);
174 unsigned long i, j, pfn, nr_pfns;
175 struct pfn_info *page;
177 memset(percpu_info, 0, sizeof(percpu_info));
179 /*
180 * Initialise our DOMID_XEN domain.
181 * Any Xen-heap pages that we will allow to be mapped will have
182 * their domain field set to dom_xen.
183 */
184 dom_xen = alloc_domain_struct();
185 atomic_set(&dom_xen->refcnt, 1);
186 dom_xen->id = DOMID_XEN;
188 /*
189 * Initialise our DOMID_IO domain.
190 * This domain owns I/O pages that are within the range of the pfn_info
191 * array. Mappings occur at the privilege of the caller.
192 */
193 dom_io = alloc_domain_struct();
194 atomic_set(&dom_io->refcnt, 1);
195 dom_io->id = DOMID_IO;
197 /* First 1MB of RAM is historically marked as I/O. */
198 for ( i = 0; i < 0x100; i++ )
199 {
200 page = &frame_table[i];
201 page->count_info = PGC_allocated | 1;
202 page->u.inuse.type_info = PGT_writable_page | PGT_validated | 1;
203 page_set_owner(page, dom_io);
204 }
206 /* Any non-RAM areas in the e820 map are considered to be for I/O. */
207 for ( i = 0; i < e820.nr_map; i++ )
208 {
209 if ( e820.map[i].type == E820_RAM )
210 continue;
211 pfn = e820.map[i].addr >> PAGE_SHIFT;
212 nr_pfns = (e820.map[i].size +
213 (e820.map[i].addr & ~PAGE_MASK) +
214 ~PAGE_MASK) >> PAGE_SHIFT;
215 for ( j = 0; j < nr_pfns; j++ )
216 {
217 if ( !pfn_valid(pfn+j) )
218 continue;
219 page = &frame_table[pfn+j];
220 page->count_info = PGC_allocated | 1;
221 page->u.inuse.type_info = PGT_writable_page | PGT_validated | 1;
222 page_set_owner(page, dom_io);
223 }
224 }
226 subarch_init_memory(dom_xen);
227 }
229 void write_ptbase(struct exec_domain *ed)
230 {
231 write_cr3(pagetable_val(ed->arch.monitor_table));
232 }
234 void invalidate_shadow_ldt(struct exec_domain *d)
235 {
236 int i;
237 unsigned long pfn;
238 struct pfn_info *page;
240 if ( d->arch.shadow_ldt_mapcnt == 0 )
241 return;
243 d->arch.shadow_ldt_mapcnt = 0;
245 for ( i = 16; i < 32; i++ )
246 {
247 pfn = l1e_get_pfn(d->arch.perdomain_ptes[i]);
248 if ( pfn == 0 ) continue;
249 d->arch.perdomain_ptes[i] = l1e_empty();
250 page = &frame_table[pfn];
251 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
252 ASSERT_PAGE_IS_DOMAIN(page, d->domain);
253 put_page_and_type(page);
254 }
256 /* Dispose of the (now possibly invalid) mappings from the TLB. */
257 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
258 }
261 static int alloc_segdesc_page(struct pfn_info *page)
262 {
263 struct desc_struct *descs;
264 int i;
266 descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
268 for ( i = 0; i < 512; i++ )
269 if ( unlikely(!check_descriptor(&descs[i])) )
270 goto fail;
272 unmap_domain_mem(descs);
273 return 1;
275 fail:
276 unmap_domain_mem(descs);
277 return 0;
278 }
281 /* Map shadow page at offset @off. */
282 int map_ldt_shadow_page(unsigned int off)
283 {
284 struct exec_domain *ed = current;
285 struct domain *d = ed->domain;
286 unsigned long gpfn, gmfn;
287 l1_pgentry_t l1e, nl1e;
288 unsigned long gva = ed->arch.ldt_base + (off << PAGE_SHIFT);
289 int res;
291 #if defined(__x86_64__)
292 /* If in user mode, switch to kernel mode just to read LDT mapping. */
293 extern void toggle_guest_mode(struct exec_domain *);
294 int user_mode = !(ed->arch.flags & TF_kernel_mode);
295 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(ed)
296 #elif defined(__i386__)
297 #define TOGGLE_MODE() ((void)0)
298 #endif
300 BUG_ON(unlikely(in_irq()));
302 shadow_sync_va(ed, gva);
304 TOGGLE_MODE();
305 __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
306 sizeof(l1e));
307 TOGGLE_MODE();
309 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
310 return 0;
312 gpfn = l1e_get_pfn(l1e);
313 gmfn = __gpfn_to_mfn(d, gpfn);
314 if ( unlikely(!VALID_MFN(gmfn)) )
315 return 0;
317 res = get_page_and_type(&frame_table[gmfn], d, PGT_ldt_page);
319 if ( !res && unlikely(shadow_mode_enabled(d)) )
320 {
321 shadow_lock(d);
322 shadow_remove_all_write_access(d, gpfn, gmfn);
323 res = get_page_and_type(&frame_table[gmfn], d, PGT_ldt_page);
324 shadow_unlock(d);
325 }
327 if ( unlikely(!res) )
328 return 0;
330 nl1e = l1e_create_pfn(gmfn, l1e_get_flags(l1e) | _PAGE_RW);
332 ed->arch.perdomain_ptes[off + 16] = nl1e;
333 ed->arch.shadow_ldt_mapcnt++;
335 return 1;
336 }
339 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
340 {
341 struct pfn_info *page = &frame_table[page_nr];
343 if ( unlikely(!pfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
344 {
345 MEM_LOG("Could not get page ref for pfn %p", page_nr);
346 return 0;
347 }
349 return 1;
350 }
353 static int get_page_and_type_from_pagenr(unsigned long page_nr,
354 u32 type,
355 struct domain *d)
356 {
357 struct pfn_info *page = &frame_table[page_nr];
359 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
360 return 0;
362 if ( unlikely(!get_page_type(page, type)) )
363 {
364 if ( (type & PGT_type_mask) != PGT_l1_page_table )
365 MEM_LOG("Bad page type for pfn %p (%08x)",
366 page_nr, page->u.inuse.type_info);
367 put_page(page);
368 return 0;
369 }
371 return 1;
372 }
375 /*
376 * We allow root tables to map each other (a.k.a. linear page tables). It
377 * needs some special care with reference counts and access permissions:
378 * 1. The mapping entry must be read-only, or the guest may get write access
379 * to its own PTEs.
380 * 2. We must only bump the reference counts for an *already validated*
381 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
382 * on a validation that is required to complete that validation.
383 * 3. We only need to increment the reference counts for the mapped page
384 * frame if it is mapped by a different root table. This is sufficient and
385 * also necessary to allow validation of a root table mapping itself.
386 */
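/*
 * [Editor's illustrative note] A "linear page table" entry is a root-table
 * entry that points back at a root table (possibly itself). Once such a
 * self-reference exists, the L1 pages reachable from that root show up in a
 * fixed virtual window -- linear_pg_table[] used throughout this file is
 * exactly that window, set up via the LINEAR_PT_VIRT_START slot in
 * alloc_l2_table()/alloc_l4_table(). Rules 1-3 above are what keep a
 * guest-created self-reference safe (read-only, already validated, and
 * counted only when it maps a different root).
 */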
387 static int
388 get_linear_pagetable(
389 root_pgentry_t re, unsigned long re_pfn, struct domain *d)
390 {
391 u32 x, y;
392 struct pfn_info *page;
393 unsigned long pfn;
395 ASSERT( !shadow_mode_enabled(d) );
397 if ( (root_get_flags(re) & _PAGE_RW) )
398 {
399 MEM_LOG("Attempt to create linear p.t. with write perms");
400 return 0;
401 }
403 if ( (pfn = root_get_pfn(re)) != re_pfn )
404 {
405 /* Make sure the mapped frame belongs to the correct domain. */
406 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
407 return 0;
409 /*
410 * Make sure that the mapped frame is an already-validated L2 table.
411 * If so, atomically increment the count (checking for overflow).
412 */
413 page = &frame_table[pfn];
414 y = page->u.inuse.type_info;
415 do {
416 x = y;
417 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
418 unlikely((x & (PGT_type_mask|PGT_validated)) !=
419 (PGT_root_page_table|PGT_validated)) )
420 {
421 put_page(page);
422 return 0;
423 }
424 }
425 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
426 }
428 return 1;
429 }
432 int
433 get_page_from_l1e(
434 l1_pgentry_t l1e, struct domain *d)
435 {
436 unsigned long mfn = l1e_get_pfn(l1e);
437 struct pfn_info *page = &frame_table[mfn];
438 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
440 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
441 return 1;
443 if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) )
444 {
445 MEM_LOG("Bad L1 type settings %p %p", l1e_get_value(l1e),
446 l1e_get_value(l1e) & L1_DISALLOW_MASK);
447 return 0;
448 }
450 if ( unlikely(!pfn_valid(mfn)) ||
451 unlikely(page_get_owner(page) == dom_io) )
452 {
453 /* DOMID_IO reverts to caller for privilege checks. */
454 if ( d == dom_io )
455 d = current->domain;
457 if ( (!IS_PRIV(d)) &&
458 (!IS_CAPABLE_PHYSDEV(d) || !domain_iomem_in_pfn(d, mfn)) )
459 {
460 MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn);
461 return 0;
462 }
464 /* No reference counting for out-of-range I/O pages. */
465 if ( !pfn_valid(mfn) )
466 return 1;
468 d = dom_io;
469 }
471 return ((l1e_get_flags(l1e) & _PAGE_RW) ?
472 get_page_and_type(page, d, PGT_writable_page) :
473 get_page(page, d));
474 }
477 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
478 static int
479 get_page_from_l2e(
480 l2_pgentry_t l2e, unsigned long pfn,
481 struct domain *d, unsigned long va_idx)
482 {
483 int rc;
485 ASSERT( !shadow_mode_enabled(d) );
487 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
488 return 1;
490 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
491 {
492 MEM_LOG("Bad L2 page type settings %p",
493 l2e_get_value(l2e) & L2_DISALLOW_MASK);
494 return 0;
495 }
497 rc = get_page_and_type_from_pagenr(
498 l2e_get_pfn(l2e),
499 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
501 #if defined(__i386__)
502 return rc ? rc : get_linear_pagetable(l2e, pfn, d);
503 #elif defined(__x86_64__)
504 return rc;
505 #endif
506 }
509 #ifdef __x86_64__
511 static int
512 get_page_from_l3e(
513 l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
514 {
515 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
516 return 1;
518 if ( unlikely((l3e_get_flags(l3e) & L3_DISALLOW_MASK)) )
519 {
520 MEM_LOG("Bad L3 page type settings %p",
521 l3e_get_value(l3e) & L3_DISALLOW_MASK);
522 return 0;
523 }
525 return get_page_and_type_from_pagenr(
526 l3e_get_pfn(l3e), PGT_l2_page_table, d);
527 }
530 static int
531 get_page_from_l4e(
532 l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
533 {
534 int rc;
536 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
537 return 1;
539 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
540 {
541 MEM_LOG("Bad L4 page type settings %p",
542 l4e_get_value(l4e) & L4_DISALLOW_MASK);
543 return 0;
544 }
546 rc = get_page_and_type_from_pagenr(
547 l4e_get_pfn(l4e), PGT_l3_page_table, d);
549 if ( unlikely(!rc) )
550 return get_linear_pagetable(l4e, pfn, d);
552 return 1;
553 }
555 #endif /* __x86_64__ */
558 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
559 {
560 unsigned long pfn = l1e_get_pfn(l1e);
561 struct pfn_info *page = &frame_table[pfn];
562 struct domain *e;
564 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !pfn_valid(pfn) )
565 return;
567 e = page_get_owner(page);
568 if ( unlikely(e != d) )
569 {
570 /*
571 * Unmap a foreign page that may have been mapped via a grant table.
572 * Note that this can fail for a privileged domain that can map foreign
573 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
574 * counted via a grant entry and some counted directly in the page
575 * structure's reference count. Note that reference counts won't get
576 * dangerously confused as long as we always try to decrement the
577 * grant entry first. We may end up with a mismatch between which
578 * mappings and which unmappings are counted via the grant entry, but
579 * really it doesn't matter as privileged domains have carte blanche.
580 */
581 if (likely(gnttab_check_unmap(e, d, pfn,
582 !(l1e_get_flags(l1e) & _PAGE_RW))))
583 return;
584 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
585 }
587 if ( l1e_get_flags(l1e) & _PAGE_RW )
588 {
589 put_page_and_type(page);
590 }
591 else
592 {
593 /* We expect this is rare so we blow the entire shadow LDT. */
594 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
595 PGT_ldt_page)) &&
596 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
598 // XXX SMP BUG?
599 invalidate_shadow_ldt(e->exec_domain[0]);
600 put_page(page);
601 }
602 }
605 /*
606 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
607 * Note also that this automatically deals correctly with linear p.t.'s.
608 */
609 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
610 {
611 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
612 (l2e_get_pfn(l2e) != pfn) )
613 put_page_and_type(&frame_table[l2e_get_pfn(l2e)]);
614 }
617 #ifdef __x86_64__
619 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
620 {
621 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
622 (l3e_get_pfn(l3e) != pfn) )
623 put_page_and_type(&frame_table[l3e_get_pfn(l3e)]);
624 }
627 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
628 {
629 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
630 (l4e_get_pfn(l4e) != pfn) )
631 put_page_and_type(&frame_table[l4e_get_pfn(l4e)]);
632 }
634 #endif /* __x86_64__ */
637 static int alloc_l1_table(struct pfn_info *page)
638 {
639 struct domain *d = page_get_owner(page);
640 unsigned long pfn = page_to_pfn(page);
641 l1_pgentry_t *pl1e;
642 int i;
644 ASSERT( !shadow_mode_enabled(d) );
646 pl1e = map_domain_mem(pfn << PAGE_SHIFT);
648 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
649 if ( is_guest_l1_slot(i) &&
650 unlikely(!get_page_from_l1e(pl1e[i], d)) )
651 goto fail;
653 unmap_domain_mem(pl1e);
654 return 1;
656 fail:
657 while ( i-- > 0 )
658 if ( is_guest_l1_slot(i) )
659 put_page_from_l1e(pl1e[i], d);
661 unmap_domain_mem(pl1e);
662 return 0;
663 }
666 static int alloc_l2_table(struct pfn_info *page)
667 {
668 struct domain *d = page_get_owner(page);
669 unsigned long pfn = page_to_pfn(page);
670 l2_pgentry_t *pl2e;
671 int i;
673 if ( (PGT_base_page_table == PGT_l2_page_table) &&
674 shadow_mode_enabled(d) )
675 return 1;
676 ASSERT( !shadow_mode_enabled(d) );
678 pl2e = map_domain_mem(pfn << PAGE_SHIFT);
680 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
681 if ( is_guest_l2_slot(i) &&
682 unlikely(!get_page_from_l2e(pl2e[i], pfn, d, i)) )
683 goto fail;
685 #if defined(__i386__)
686 /* Xen private mappings. */
687 memcpy(&pl2e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
688 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
689 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
690 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
691 l2e_create_pfn(pfn, __PAGE_HYPERVISOR);
692 pl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
693 l2e_create_phys(__pa(page_get_owner(page)->arch.mm_perdomain_pt),
694 __PAGE_HYPERVISOR);
695 #endif
697 unmap_domain_mem(pl2e);
698 return 1;
700 fail:
701 while ( i-- > 0 )
702 if ( is_guest_l2_slot(i) )
703 put_page_from_l2e(pl2e[i], pfn);
705 unmap_domain_mem(pl2e);
706 return 0;
707 }
710 #ifdef __x86_64__
712 static int alloc_l3_table(struct pfn_info *page)
713 {
714 struct domain *d = page_get_owner(page);
715 unsigned long pfn = page_to_pfn(page);
716 l3_pgentry_t *pl3e = page_to_virt(page);
717 int i;
719 ASSERT( !shadow_mode_enabled(d) );
721 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
722 if ( is_guest_l3_slot(i) &&
723 unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
724 goto fail;
726 return 1;
728 fail:
729 while ( i-- > 0 )
730 if ( is_guest_l3_slot(i) )
731 put_page_from_l3e(pl3e[i], pfn);
733 return 0;
734 }
737 static int alloc_l4_table(struct pfn_info *page)
738 {
739 struct domain *d = page_get_owner(page);
740 unsigned long pfn = page_to_pfn(page);
741 l4_pgentry_t *pl4e = page_to_virt(page);
742 int i;
744 if ( (PGT_base_page_table == PGT_l4_page_table) &&
745 shadow_mode_enabled(d) )
746 return 1;
747 ASSERT( !shadow_mode_enabled(d) );
749 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
750 if ( is_guest_l4_slot(i) &&
751 unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
752 goto fail;
754 /* Xen private mappings. */
755 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
756 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
757 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
758 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
759 l4e_create_pfn(pfn, __PAGE_HYPERVISOR);
760 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
761 l4e_create_phys(__pa(page_get_owner(page)->arch.mm_perdomain_l3),
762 __PAGE_HYPERVISOR);
764 return 1;
766 fail:
767 while ( i-- > 0 )
768 if ( is_guest_l4_slot(i) )
769 put_page_from_l4e(pl4e[i], pfn);
771 return 0;
772 }
774 #endif /* __x86_64__ */
777 static void free_l1_table(struct pfn_info *page)
778 {
779 struct domain *d = page_get_owner(page);
780 unsigned long pfn = page_to_pfn(page);
781 l1_pgentry_t *pl1e;
782 int i;
784 pl1e = map_domain_mem(pfn << PAGE_SHIFT);
786 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
787 if ( is_guest_l1_slot(i) )
788 put_page_from_l1e(pl1e[i], d);
790 unmap_domain_mem(pl1e);
791 }
794 static void free_l2_table(struct pfn_info *page)
795 {
796 unsigned long pfn = page_to_pfn(page);
797 l2_pgentry_t *pl2e;
798 int i;
800 pl2e = map_domain_mem(pfn << PAGE_SHIFT);
802 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
803 if ( is_guest_l2_slot(i) )
804 put_page_from_l2e(pl2e[i], pfn);
806 unmap_domain_mem(pl2e);
807 }
810 #ifdef __x86_64__
812 static void free_l3_table(struct pfn_info *page)
813 {
814 unsigned long pfn = page_to_pfn(page);
815 l3_pgentry_t *pl3e = page_to_virt(page);
816 int i;
818 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
819 if ( is_guest_l3_slot(i) )
820 put_page_from_l3e(pl3e[i], pfn);
821 }
824 static void free_l4_table(struct pfn_info *page)
825 {
826 unsigned long pfn = page_to_pfn(page);
827 l4_pgentry_t *pl4e = page_to_virt(page);
828 int i;
830 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
831 if ( is_guest_l4_slot(i) )
832 put_page_from_l4e(pl4e[i], pfn);
833 }
835 #endif /* __x86_64__ */
838 static inline int update_l1e(l1_pgentry_t *pl1e,
839 l1_pgentry_t ol1e,
840 l1_pgentry_t nl1e)
841 {
842 /* FIXME: breaks with PAE */
843 unsigned long o = l1e_get_value(ol1e);
844 unsigned long n = l1e_get_value(nl1e);
846 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
847 unlikely(o != l1e_get_value(ol1e)) )
848 {
849 MEM_LOG("Failed to update %p -> %p: saw %p",
850 l1e_get_value(ol1e), l1e_get_value(nl1e), o);
851 return 0;
852 }
854 return 1;
855 }
858 /* Update the L1 entry at pl1e to new value nl1e. */
859 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
860 {
861 l1_pgentry_t ol1e;
862 struct domain *d = current->domain;
864 ASSERT( !shadow_mode_enabled(d) );
866 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
867 return 0;
869 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
870 {
871 if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
872 {
873 MEM_LOG("Bad L1 type settings %p",
874 l1e_get_value(nl1e) & L1_DISALLOW_MASK);
875 return 0;
876 }
878 /* Fast path for identical mapping, r/w and presence. */
879 if ( !l1e_has_changed(&ol1e, &nl1e, _PAGE_RW | _PAGE_PRESENT))
880 return update_l1e(pl1e, ol1e, nl1e);
882 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
883 return 0;
885 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
886 {
887 put_page_from_l1e(nl1e, d);
888 return 0;
889 }
890 }
891 else
892 {
893 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
894 return 0;
895 }
897 put_page_from_l1e(ol1e, d);
898 return 1;
899 }
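/*
 * [Editor's note] UPDATE_ENTRY(l2, p, o, n) below expands to a cmpxchg on
 * the raw entry word: it atomically replaces the value previously read (o)
 * with n, evaluating to 1 on success and to 0 (after a MEM_LOG) if another
 * CPU changed the entry in the meantime. update_l1e() above is the same
 * idea, but goes through cmpxchg_user() because the L1 pointer may
 * reference the potentially-faulting linear_pg_table mapping.
 */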
902 #define UPDATE_ENTRY(_t,_p,_o,_n) ({ \
903 unsigned long __o = cmpxchg((unsigned long *)(_p), \
904 _t ## e_get_value(_o), \
905 _t ## e_get_value(_n)); \
906 if ( __o != _t ## e_get_value(_o) ) \
907 MEM_LOG("Failed to update %p -> %p: saw %p", \
908 _t ## e_get_value(_o), _t ## e_get_value(_n), __o); \
909 (__o == _t ## e_get_value(_o)); })
912 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
913 static int mod_l2_entry(l2_pgentry_t *pl2e,
914 l2_pgentry_t nl2e,
915 unsigned long pfn)
916 {
917 l2_pgentry_t ol2e;
919 if ( unlikely(!is_guest_l2_slot(pgentry_ptr_to_slot(pl2e))) )
920 {
921 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
922 return 0;
923 }
925 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
926 return 0;
928 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
929 {
930 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
931 {
932 MEM_LOG("Bad L2 type settings %p",
933 l2e_get_value(nl2e) & L2_DISALLOW_MASK);
934 return 0;
935 }
937 /* Fast path for identical mapping and presence. */
938 if ( !l2e_has_changed(&ol2e, &nl2e, _PAGE_PRESENT))
939 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
941 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
942 ((unsigned long)pl2e &
943 ~PAGE_MASK) >> 2)) )
944 return 0;
946 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
947 {
948 put_page_from_l2e(nl2e, pfn);
949 return 0;
950 }
951 }
952 else
953 {
954 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
955 return 0;
956 }
958 put_page_from_l2e(ol2e, pfn);
959 return 1;
960 }
963 #ifdef __x86_64__
965 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
966 static int mod_l3_entry(l3_pgentry_t *pl3e,
967 l3_pgentry_t nl3e,
968 unsigned long pfn)
969 {
970 l3_pgentry_t ol3e;
972 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
973 {
974 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
975 return 0;
976 }
978 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
979 return 0;
981 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
982 {
983 if ( unlikely(l3e_get_flags(nl3e) & L3_DISALLOW_MASK) )
984 {
985 MEM_LOG("Bad L3 type settings %p",
986 l3e_get_value(nl3e) & L3_DISALLOW_MASK);
987 return 0;
988 }
990 /* Fast path for identical mapping and presence. */
991 if (!l3e_has_changed(&ol3e, &nl3e, _PAGE_PRESENT))
992 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
994 if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain)) )
995 return 0;
997 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
998 {
999 put_page_from_l3e(nl3e, pfn);
1000 return 0;
1001 }
1003 put_page_from_l3e(ol3e, pfn);
1004 return 1;
1005 }
1007 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1008 return 0;
1010 put_page_from_l3e(ol3e, pfn);
1011 return 1;
1012 }
1015 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1016 static int mod_l4_entry(l4_pgentry_t *pl4e,
1017 l4_pgentry_t nl4e,
1018 unsigned long pfn)
1019 {
1020 l4_pgentry_t ol4e;
1022 if ( unlikely(!is_guest_l4_slot(pgentry_ptr_to_slot(pl4e))) )
1023 {
1024 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1025 return 0;
1026 }
1028 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1029 return 0;
1031 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1032 {
1033 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1034 {
1035 MEM_LOG("Bad L4 type settings %p",
1036 l4e_get_value(nl4e) & L4_DISALLOW_MASK);
1037 return 0;
1038 }
1040 /* Fast path for identical mapping and presence. */
1041 if (!l4e_has_changed(&ol4e, &nl4e, _PAGE_PRESENT))
1042 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
1044 if ( unlikely(!get_page_from_l4e(nl4e, pfn, current->domain)) )
1045 return 0;
1047 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1048 {
1049 put_page_from_l4e(nl4e, pfn);
1050 return 0;
1051 }
1053 put_page_from_l4e(ol4e, pfn);
1054 return 1;
1055 }
1057 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1058 return 0;
1060 put_page_from_l4e(ol4e, pfn);
1061 return 1;
1062 }
1064 #endif /* __x86_64__ */
1067 int alloc_page_type(struct pfn_info *page, unsigned int type)
1068 {
1069 switch ( type )
1070 {
1071 case PGT_l1_page_table:
1072 return alloc_l1_table(page);
1073 case PGT_l2_page_table:
1074 return alloc_l2_table(page);
1075 #ifdef __x86_64__
1076 case PGT_l3_page_table:
1077 return alloc_l3_table(page);
1078 case PGT_l4_page_table:
1079 return alloc_l4_table(page);
1080 #endif
1081 case PGT_gdt_page:
1082 case PGT_ldt_page:
1083 return alloc_segdesc_page(page);
1084 default:
1085 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
1086 type, page->u.inuse.type_info,
1087 page->count_info);
1088 BUG();
1089 }
1091 return 0;
1092 }
1095 void free_page_type(struct pfn_info *page, unsigned int type)
1096 {
1097 struct domain *owner = page_get_owner(page);
1098 if ( likely(owner != NULL) && unlikely(shadow_mode_enabled(owner)) )
1099 return;
1101 switch ( type )
1102 {
1103 case PGT_l1_page_table:
1104 free_l1_table(page);
1105 break;
1107 case PGT_l2_page_table:
1108 free_l2_table(page);
1109 break;
1111 #ifdef __x86_64__
1112 case PGT_l3_page_table:
1113 free_l3_table(page);
1114 break;
1116 case PGT_l4_page_table:
1117 free_l4_table(page);
1118 break;
1119 #endif
1121 default:
1122 BUG();
1123 }
1124 }
1127 void put_page_type(struct pfn_info *page)
1128 {
1129 u32 nx, x, y = page->u.inuse.type_info;
1131 again:
1132 do {
1133 x = y;
1134 nx = x - 1;
1136 ASSERT((x & PGT_count_mask) != 0);
1138 /*
1139 * The page should always be validated while a reference is held. The
1140 * exception is during domain destruction, when we forcibly invalidate
1141 * page-table pages if we detect a referential loop.
1142 * See domain.c:relinquish_list().
1143 */
1144 ASSERT((x & PGT_validated) ||
1145 test_bit(DF_DYING, &page_get_owner(page)->d_flags));
1147 if ( unlikely((nx & PGT_count_mask) == 0) )
1148 {
1149 /* Record TLB information for flush later. Races are harmless. */
1150 page->tlbflush_timestamp = tlbflush_current_time();
1152 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1153 likely(nx & PGT_validated) )
1154 {
1155 /*
1156 * Page-table pages must be unvalidated when count is zero. The
1157 * 'free' is safe because the refcnt is non-zero and validated
1158 * bit is clear => other ops will spin or fail.
1159 */
1160 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1161 x & ~PGT_validated)) != x) )
1162 goto again;
1163 /* We cleared the 'valid bit' so we do the clean up. */
1164 free_page_type(page, x & PGT_type_mask);
1165 /* Carry on, but with the 'valid bit' now clear. */
1166 x &= ~PGT_validated;
1167 nx &= ~PGT_validated;
1168 }
1169 }
1170 else if ( unlikely(((nx & (PGT_pinned | PGT_count_mask)) ==
1171 (PGT_pinned | 1)) &&
1172 ((nx & PGT_type_mask) != PGT_writable_page)) )
1173 {
1174 /* Page is now only pinned. Make the back pointer mutable again. */
1175 nx |= PGT_va_mutable;
1176 }
1177 }
1178 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1179 }
1182 int get_page_type(struct pfn_info *page, u32 type)
1183 {
1184 u32 nx, x, y = page->u.inuse.type_info;
1186 again:
1187 do {
1188 x = y;
1189 nx = x + 1;
1190 if ( unlikely((nx & PGT_count_mask) == 0) )
1191 {
1192 MEM_LOG("Type count overflow on pfn %p", page_to_pfn(page));
1193 return 0;
1194 }
1195 else if ( unlikely((x & PGT_count_mask) == 0) )
1196 {
1197 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
1198 {
1199 /*
1200 * On type change we check to flush stale TLB entries. This
1201 * may be unnecessary (e.g., page was GDT/LDT) but those
1202 * circumstances should be very rare.
1203 */
1204 unsigned long cpuset = tlbflush_filter_cpuset(
1205 page_get_owner(page)->cpuset, page->tlbflush_timestamp);
1207 if ( unlikely(cpuset != 0) )
1208 {
1209 perfc_incrc(need_flush_tlb_flush);
1210 flush_tlb_mask(cpuset);
1211 }
1213 /* We lose existing type, back pointer, and validity. */
1214 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
1215 nx |= type;
1217 /* No special validation needed for writable pages. */
1218 /* Page tables and GDT/LDT need to be scanned for validity. */
1219 if ( type == PGT_writable_page )
1220 nx |= PGT_validated;
1221 }
1222 }
1223 else
1224 {
1225 if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
1226 {
1227 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
1228 {
1229 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1230 ((type & PGT_type_mask) != PGT_l1_page_table) )
1231 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %p",
1232 x, type, page_to_pfn(page));
1233 return 0;
1234 }
1235 else if ( (x & PGT_va_mask) == PGT_va_mutable )
1236 {
1237 /* The va backpointer is mutable, hence we update it. */
1238 nx &= ~PGT_va_mask;
1239 nx |= type; /* we know the actual type is correct */
1240 }
1241 else if ( ((type & PGT_va_mask) != PGT_va_mutable) &&
1242 ((type & PGT_va_mask) != (x & PGT_va_mask)) )
1243 {
1244 /* This table is potentially mapped at multiple locations. */
1245 nx &= ~PGT_va_mask;
1246 nx |= PGT_va_unknown;
1247 }
1248 }
1249 if ( unlikely(!(x & PGT_validated)) )
1250 {
1251 /* Someone else is updating validation of this page. Wait... */
1252 while ( (y = page->u.inuse.type_info) == x )
1253 cpu_relax();
1254 goto again;
1255 }
1256 }
1257 }
1258 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1260 if ( unlikely(!(nx & PGT_validated)) )
1261 {
1262 /* Try to validate page type; drop the new reference on failure. */
1263 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
1264 {
1265 MEM_LOG("Error while validating pfn %p for type %08x."
1266 " caf=%08x taf=%08x",
1267 page_to_pfn(page), type,
1268 page->count_info,
1269 page->u.inuse.type_info);
1270 /* No one else can get a reference. We hold the only ref. */
1271 page->u.inuse.type_info = 0;
1272 return 0;
1273 }
1275 /* No one else is updating simultaneously. */
1276 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1277 }
1279 return 1;
1280 }
1283 int new_guest_cr3(unsigned long mfn)
1284 {
1285 struct exec_domain *ed = current;
1286 struct domain *d = ed->domain;
1287 int okay;
1288 unsigned long old_base_mfn;
1290 if ( shadow_mode_enabled(d) )
1291 okay = get_page_from_pagenr(mfn, d);
1292 else
1293 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1295 if ( likely(okay) )
1296 {
1297 invalidate_shadow_ldt(ed);
1299 old_base_mfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
1300 ed->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
1301 update_pagetables(ed); /* update shadow_table and monitor_table */
1303 write_ptbase(ed);
1305 if ( shadow_mode_enabled(d) )
1306 put_page(&frame_table[old_base_mfn]);
1307 else
1308 put_page_and_type(&frame_table[old_base_mfn]);
1310 // CR3 holds its own ref to its shadow...
1311 //
1312 if ( shadow_mode_enabled(d) )
1313 {
1314 if ( ed->arch.monitor_shadow_ref )
1315 put_shadow_ref(ed->arch.monitor_shadow_ref);
1316 ed->arch.monitor_shadow_ref =
1317 pagetable_val(ed->arch.monitor_table) >> PAGE_SHIFT;
1318 ASSERT(page_get_owner(&frame_table[ed->arch.monitor_shadow_ref]) == NULL);
1319 get_shadow_ref(ed->arch.monitor_shadow_ref);
1320 }
1321 }
1322 else
1323 {
1324 MEM_LOG("Error while installing new baseptr %p", mfn);
1325 }
1327 return okay;
1328 }
1330 static void process_deferred_ops(unsigned int cpu)
1331 {
1332 unsigned int deferred_ops;
1333 struct domain *d = current->domain;
1335 deferred_ops = percpu_info[cpu].deferred_ops;
1336 percpu_info[cpu].deferred_ops = 0;
1338 if ( deferred_ops & DOP_FLUSH_TLB )
1339 {
1340 if ( shadow_mode_enabled(d) )
1341 shadow_sync_all(d);
1342 local_flush_tlb();
1343 }
1345 if ( deferred_ops & DOP_RELOAD_LDT )
1346 (void)map_ldt_shadow_page(0);
1348 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1349 {
1350 put_domain(percpu_info[cpu].foreign);
1351 percpu_info[cpu].foreign = NULL;
1352 }
1353 }
1355 static int set_foreigndom(unsigned int cpu, domid_t domid)
1356 {
1357 struct domain *e, *d = current->domain;
1358 int okay = 1;
1360 if ( (e = percpu_info[cpu].foreign) != NULL )
1361 put_domain(e);
1362 percpu_info[cpu].foreign = NULL;
1364 if ( domid == DOMID_SELF )
1365 goto out;
1367 if ( !IS_PRIV(d) )
1368 {
1369 switch ( domid )
1370 {
1371 case DOMID_IO:
1372 get_knownalive_domain(dom_io);
1373 percpu_info[cpu].foreign = dom_io;
1374 break;
1375 default:
1376 MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
1377 okay = 0;
1378 break;
1379 }
1380 }
1381 else
1382 {
1383 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1384 if ( e == NULL )
1385 {
1386 switch ( domid )
1387 {
1388 case DOMID_XEN:
1389 get_knownalive_domain(dom_xen);
1390 percpu_info[cpu].foreign = dom_xen;
1391 break;
1392 case DOMID_IO:
1393 get_knownalive_domain(dom_io);
1394 percpu_info[cpu].foreign = dom_io;
1395 break;
1396 default:
1397 MEM_LOG("Unknown domain '%u'", domid);
1398 okay = 0;
1399 break;
1400 }
1401 }
1402 }
1404 out:
1405 return okay;
1406 }
1408 static inline unsigned long vcpuset_to_pcpuset(
1409 struct domain *d, unsigned long vset)
1410 {
1411 unsigned int vcpu;
1412 unsigned long pset = 0;
1413 struct exec_domain *ed;
1415 while ( vset != 0 )
1416 {
1417 vcpu = find_first_set_bit(vset);
1418 vset &= ~(1UL << vcpu);
1419 if ( (vcpu < MAX_VIRT_CPUS) &&
1420 ((ed = d->exec_domain[vcpu]) != NULL) )
1421 pset |= 1UL << ed->processor;
1422 }
1424 return pset;
1425 }
1427 int do_mmuext_op(
1428 struct mmuext_op *uops,
1429 unsigned int count,
1430 unsigned int *pdone,
1431 unsigned int foreigndom)
1433 struct mmuext_op op;
1434 int rc = 0, i = 0, okay, cpu = smp_processor_id();
1435 unsigned int type, done = 0;
1436 struct pfn_info *page;
1437 struct exec_domain *ed = current;
1438 struct domain *d = ed->domain, *e;
1439 u32 x, y, _d, _nd;
1441 LOCK_BIGLOCK(d);
1443 cleanup_writable_pagetable(d);
1445 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1447 count &= ~MMU_UPDATE_PREEMPTED;
1448 if ( unlikely(pdone != NULL) )
1449 (void)get_user(done, pdone);
1452 if ( !set_foreigndom(cpu, foreigndom) )
1454 rc = -EINVAL;
1455 goto out;
1458 if ( unlikely(!array_access_ok(VERIFY_READ, uops, count, sizeof(op))) )
1460 rc = -EFAULT;
1461 goto out;
1464 for ( i = 0; i < count; i++ )
1466 if ( hypercall_preempt_check() )
1468 rc = hypercall4_create_continuation(
1469 __HYPERVISOR_mmuext_op, uops,
1470 (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
1471 break;
1474 if ( unlikely(__copy_from_user(&op, uops, sizeof(op)) != 0) )
1476 MEM_LOG("Bad __copy_from_user");
1477 rc = -EFAULT;
1478 break;
1481 okay = 1;
1482 page = &frame_table[op.mfn];
1484 switch ( op.cmd )
1486 case MMUEXT_PIN_L1_TABLE:
1487 type = PGT_l1_page_table | PGT_va_mutable;
1489 pin_page:
1490 if ( shadow_mode_enabled(FOREIGNDOM) )
1491 type = PGT_writable_page;
1493 okay = get_page_and_type_from_pagenr(op.mfn, type, FOREIGNDOM);
1494 if ( unlikely(!okay) )
1496 MEM_LOG("Error while pinning MFN %p", op.mfn);
1497 break;
1500 if ( unlikely(test_and_set_bit(_PGT_pinned,
1501 &page->u.inuse.type_info)) )
1503 MEM_LOG("MFN %p already pinned", op.mfn);
1504 put_page_and_type(page);
1505 okay = 0;
1506 break;
1509 break;
1511 case MMUEXT_PIN_L2_TABLE:
1512 type = PGT_l2_page_table;
1513 goto pin_page;
1515 #ifdef __x86_64__
1516 case MMUEXT_PIN_L3_TABLE:
1517 type = PGT_l3_page_table;
1518 goto pin_page;
1520 case MMUEXT_PIN_L4_TABLE:
1521 type = PGT_l4_page_table;
1522 goto pin_page;
1523 #endif /* __x86_64__ */
1525 case MMUEXT_UNPIN_TABLE:
1526 if ( unlikely(!(okay = get_page_from_pagenr(op.mfn, FOREIGNDOM))) )
1528 MEM_LOG("MFN %p bad domain (dom=%p)",
1529 op.mfn, page_get_owner(page));
1531 else if ( likely(test_and_clear_bit(_PGT_pinned,
1532 &page->u.inuse.type_info)) )
1534 put_page_and_type(page);
1535 put_page(page);
1537 else
1539 okay = 0;
1540 put_page(page);
1541 MEM_LOG("MFN %p not pinned", op.mfn);
1543 break;
1545 case MMUEXT_NEW_BASEPTR:
1546 okay = new_guest_cr3(op.mfn);
1547 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
1548 break;
1550 #ifdef __x86_64__
1551 case MMUEXT_NEW_USER_BASEPTR:
1552 okay = get_page_and_type_from_pagenr(
1553 op.mfn, PGT_root_page_table, d);
1554 if ( unlikely(!okay) )
1556 MEM_LOG("Error while installing new MFN %p", op.mfn);
1558 else
1560 unsigned long old_mfn =
1561 pagetable_val(ed->arch.guest_table_user) >> PAGE_SHIFT;
1562 ed->arch.guest_table_user = mk_pagetable(op.mfn << PAGE_SHIFT);
1563 if ( old_mfn != 0 )
1564 put_page_and_type(&frame_table[old_mfn]);
1566 break;
1567 #endif
1569 case MMUEXT_TLB_FLUSH_LOCAL:
1570 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1571 break;
1573 case MMUEXT_INVLPG_LOCAL:
1574 if ( shadow_mode_enabled(d) )
1575 shadow_invlpg(ed, op.linear_addr);
1576 local_flush_tlb_one(op.linear_addr);
1577 break;
1579 case MMUEXT_TLB_FLUSH_MULTI:
1580 case MMUEXT_INVLPG_MULTI:
1582 unsigned long vset, pset;
1583 if ( unlikely(get_user(vset, (unsigned long *)op.cpuset)) )
1585 okay = 0;
1586 break;
1588 pset = vcpuset_to_pcpuset(d, vset);
1589 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
1591 BUG_ON(shadow_mode_enabled(d) && ((pset & d->cpuset) != (1<<cpu)));
1592 flush_tlb_mask(pset & d->cpuset);
1594 else
1596 BUG_ON(shadow_mode_enabled(d) && ((pset & d->cpuset) != (1<<cpu)));
1597 flush_tlb_one_mask(pset & d->cpuset, op.linear_addr);
1599 break;
1602 case MMUEXT_TLB_FLUSH_ALL:
1603 BUG_ON(shadow_mode_enabled(d) && (d->cpuset != (1<<cpu)));
1604 flush_tlb_mask(d->cpuset);
1605 break;
1607 case MMUEXT_INVLPG_ALL:
1608 BUG_ON(shadow_mode_enabled(d) && (d->cpuset != (1<<cpu)));
1609 flush_tlb_one_mask(d->cpuset, op.linear_addr);
1610 break;
1612 case MMUEXT_FLUSH_CACHE:
1613 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1615 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1616 okay = 0;
1618 else
1620 wbinvd();
1622 break;
1624 case MMUEXT_SET_LDT:
1626 if ( shadow_mode_external(d) )
1628 // ignore this request from an external domain...
1629 MEM_LOG("ignoring SET_LDT hypercall from external "
1630 "domain %u\n", d->id);
1631 okay = 0;
1632 break;
1635 unsigned long ptr = op.linear_addr;
1636 unsigned long ents = op.nr_ents;
1637 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1638 (ents > 8192) ||
1639 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
1640 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
1642 okay = 0;
1643 MEM_LOG("Bad args to SET_LDT: ptr=%p, ents=%p", ptr, ents);
1645 else if ( (ed->arch.ldt_ents != ents) ||
1646 (ed->arch.ldt_base != ptr) )
1648 invalidate_shadow_ldt(ed);
1649 ed->arch.ldt_base = ptr;
1650 ed->arch.ldt_ents = ents;
1651 load_LDT(ed);
1652 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1653 if ( ents != 0 )
1654 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1656 break;
1659 case MMUEXT_REASSIGN_PAGE:
1660 if ( unlikely(!IS_PRIV(d)) )
1662 MEM_LOG("Dom %u has no reassignment priv", d->id);
1663 okay = 0;
1664 break;
1667 e = percpu_info[cpu].foreign;
1668 if ( unlikely(e == NULL) )
1670 MEM_LOG("No FOREIGNDOM to reassign MFN %p to", op.mfn);
1671 okay = 0;
1672 break;
1675 /*
1676 * Grab both page_list locks, in order. This prevents the page from
1677 * disappearing elsewhere while we modify the owner, and we'll need
1678 * both locks if we're successful so that we can change lists.
1679 */
1680 if ( d < e )
1682 spin_lock(&d->page_alloc_lock);
1683 spin_lock(&e->page_alloc_lock);
1685 else
1687 spin_lock(&e->page_alloc_lock);
1688 spin_lock(&d->page_alloc_lock);
1691 /* A domain shouldn't have PGC_allocated pages when it is dying. */
1692 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1693 unlikely(IS_XEN_HEAP_FRAME(page)) )
1695 MEM_LOG("Reassign page is Xen heap, or dest dom is dying.");
1696 okay = 0;
1697 goto reassign_fail;
1700 /*
1701 * The tricky bit: atomically change owner while there is just one
1702 * benign reference to the page (PGC_allocated). If that reference
1703 * disappears then the deallocation routine will safely spin.
1704 */
1705 _d = pickle_domptr(d);
1706 _nd = page->u.inuse._domain;
1707 y = page->count_info;
1708 do {
1709 x = y;
1710 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1711 (1|PGC_allocated)) ||
1712 unlikely(_nd != _d) )
1714 MEM_LOG("Bad page values %p: ed=%p(%u), sd=%p,"
1715 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1716 d, d->id, unpickle_domptr(_nd), x,
1717 page->u.inuse.type_info);
1718 okay = 0;
1719 goto reassign_fail;
1721 __asm__ __volatile__(
1722 LOCK_PREFIX "cmpxchg8b %3"
1723 : "=d" (_nd), "=a" (y), "=c" (e),
1724 "=m" (*(volatile u64 *)(&page->count_info))
1725 : "0" (_d), "1" (x), "c" (e), "b" (x) );
1727 while ( unlikely(_nd != _d) || unlikely(y != x) );
1729 /*
1730 * Unlink from 'd'. We transferred at least one reference to 'e',
1731 * so noone else is spinning to try to delete this page from 'd'.
1732 */
1733 d->tot_pages--;
1734 list_del(&page->list);
1736 /*
1737 * Add the page to 'e'. Someone may already have removed the last
1738 * reference and want to remove the page from 'e'. However, we have
1739 * the lock so they'll spin waiting for us.
1740 */
1741 if ( unlikely(e->tot_pages++ == 0) )
1742 get_knownalive_domain(e);
1743 list_add_tail(&page->list, &e->page_list);
1745 reassign_fail:
1746 spin_unlock(&d->page_alloc_lock);
1747 spin_unlock(&e->page_alloc_lock);
1748 break;
1750 default:
1751 MEM_LOG("Invalid extended pt command 0x%p", op.cmd);
1752 okay = 0;
1753 break;
1756 if ( unlikely(!okay) )
1758 rc = -EINVAL;
1759 break;
1762 uops++;
1765 out:
1766 process_deferred_ops(cpu);
1768 /* Add incremental work we have done to the @done output parameter. */
1769 if ( unlikely(pdone != NULL) )
1770 __put_user(done + i, pdone);
1772 UNLOCK_BIGLOCK(d);
1773 return rc;
1776 int do_mmu_update(
1777 mmu_update_t *ureqs,
1778 unsigned int count,
1779 unsigned int *pdone,
1780 unsigned int foreigndom)
1782 mmu_update_t req;
1783 unsigned long va = 0, mfn, prev_mfn = 0, gpfn;
1784 struct pfn_info *page;
1785 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
1786 unsigned int cmd, done = 0;
1787 struct exec_domain *ed = current;
1788 struct domain *d = ed->domain;
1789 u32 type_info;
1791 LOCK_BIGLOCK(d);
1793 cleanup_writable_pagetable(d);
1795 if ( unlikely(shadow_mode_enabled(d)) )
1796 check_pagetable(ed, "pre-mmu"); /* debug */
1798 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1800 count &= ~MMU_UPDATE_PREEMPTED;
1801 if ( unlikely(pdone != NULL) )
1802 (void)get_user(done, pdone);
1805 if ( !set_foreigndom(cpu, foreigndom) )
1807 rc = -EINVAL;
1808 goto out;
1811 perfc_incrc(calls_to_mmu_update);
1812 perfc_addc(num_page_updates, count);
1813 perfc_incr_histo(bpt_updates, count, PT_UPDATES);
1815 if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
1817 rc = -EFAULT;
1818 goto out;
1821 for ( i = 0; i < count; i++ )
1823 if ( hypercall_preempt_check() )
1825 rc = hypercall4_create_continuation(
1826 __HYPERVISOR_mmu_update, ureqs,
1827 (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
1828 break;
1831 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
1833 MEM_LOG("Bad __copy_from_user");
1834 rc = -EFAULT;
1835 break;
1838 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
1839 mfn = req.ptr >> PAGE_SHIFT;
1841 okay = 0;
1843 switch ( cmd )
1845 /*
1846 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
1847 */
1848 case MMU_NORMAL_PT_UPDATE:
1849 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
1851 MEM_LOG("Could not get page for normal update");
1852 break;
1855 if ( likely(prev_mfn == mfn) )
1857 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
1859 else
1861 if ( prev_mfn != 0 )
1862 unmap_domain_mem((void *)va);
1863 va = (unsigned long)map_domain_mem(req.ptr);
1864 prev_mfn = mfn;
1867 page = &frame_table[mfn];
1868 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
1870 case PGT_l1_page_table:
1871 ASSERT(!shadow_mode_enabled(d));
1872 if ( likely(get_page_type(
1873 page, type_info & (PGT_type_mask|PGT_va_mask))) )
1875 l1_pgentry_t pte;
1877 /* FIXME: doesn't work with PAE */
1878 pte = l1e_create_phys(req.val, req.val);
1879 okay = mod_l1_entry((l1_pgentry_t *)va, pte);
1880 put_page_type(page);
1882 break;
1883 case PGT_l2_page_table:
1884 ASSERT(!shadow_mode_enabled(d));
1885 if ( likely(get_page_type(page, PGT_l2_page_table)) )
1887 l2_pgentry_t l2e;
1889 /* FIXME: doesn't work with PAE */
1890 l2e = l2e_create_phys(req.val, req.val);
1891 okay = mod_l2_entry((l2_pgentry_t *)va, l2e, mfn);
1892 put_page_type(page);
1894 break;
1895 #ifdef __x86_64__
1896 case PGT_l3_page_table:
1897 ASSERT(!shadow_mode_enabled(d));
1898 if ( likely(get_page_type(page, PGT_l3_page_table)) )
1900 l3_pgentry_t l3e;
1902 /* FIXME: doesn't work with PAE */
1903 l3e = l3e_create_phys(req.val,req.val);
1904 okay = mod_l3_entry((l3_pgentry_t *)va, l3e, mfn);
1905 put_page_type(page);
1907 break;
1908 case PGT_l4_page_table:
1909 ASSERT(!shadow_mode_enabled(d));
1910 if ( likely(get_page_type(page, PGT_l4_page_table)) )
1912 l4_pgentry_t l4e;
1914 l4e = l4e_create_phys(req.val,req.val);
1915 okay = mod_l4_entry((l4_pgentry_t *)va, l4e, mfn);
1916 put_page_type(page);
1918 break;
1919 #endif /* __x86_64__ */
1920 default:
1921 if ( likely(get_page_type(page, PGT_writable_page)) )
1923 if ( shadow_mode_enabled(d) )
1925 shadow_lock(d);
1927 if ( shadow_mode_log_dirty(d) )
1928 __mark_dirty(d, mfn);
1930 gpfn = __mfn_to_gpfn(d, mfn);
1931 ASSERT(VALID_M2P(gpfn));
1933 if ( page_is_page_table(page) &&
1934 !page_out_of_sync(page) )
1936 shadow_mark_mfn_out_of_sync(ed, gpfn, mfn);
1940 *(unsigned long *)va = req.val;
1941 okay = 1;
1943 if ( shadow_mode_enabled(d) )
1944 shadow_unlock(d);
1946 put_page_type(page);
1948 break;
1951 put_page(page);
1952 break;
1954 case MMU_MACHPHYS_UPDATE:
1956 // HACK ALERT... Need to think about this some more...
1957 //
1958 if ( unlikely(shadow_mode_translate(FOREIGNDOM) && IS_PRIV(d)) )
1960 rc = FOREIGNDOM->next_io_page++;
1961 printk("privileged guest dom%d requests mfn=%p for dom%d, gets pfn=%p\n",
1962 d->id, mfn, FOREIGNDOM->id, rc);
1963 set_machinetophys(mfn, rc);
1964 set_p2m_entry(FOREIGNDOM, rc, mfn);
1965 okay = 1;
1966 break;
1969 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
1971 MEM_LOG("Could not get page for mach->phys update");
1972 break;
1975 if ( unlikely(shadow_mode_translate(FOREIGNDOM) && !IS_PRIV(d)) )
1977 MEM_LOG("can't mutate the m2p of translated guests");
1978 break;
1981 set_machinetophys(mfn, req.val);
1982 okay = 1;
1984 /*
1985 * If in log-dirty mode, mark the corresponding
1986 * page as dirty.
1987 */
1988 if ( unlikely(shadow_mode_log_dirty(FOREIGNDOM)) &&
1989 mark_dirty(FOREIGNDOM, mfn) )
1990 FOREIGNDOM->arch.shadow_dirty_block_count++;
1992 put_page(&frame_table[mfn]);
1993 break;
1995 default:
1996 MEM_LOG("Invalid page update command %p", req.ptr);
1997 break;
2000 if ( unlikely(!okay) )
2002 rc = -EINVAL;
2003 break;
2006 ureqs++;
2009 out:
2010 if ( prev_mfn != 0 )
2011 unmap_domain_mem((void *)va);
2013 process_deferred_ops(cpu);
2015 /* Add incremental work we have done to the @done output parameter. */
2016 if ( unlikely(pdone != NULL) )
2017 __put_user(done + i, pdone);
2019 if ( unlikely(shadow_mode_enabled(d)) )
2020 check_pagetable(ed, "post-mmu"); /* debug */
2022 UNLOCK_BIGLOCK(d);
2023 return rc;
2026 /* This function assumes the caller is holding the domain's BIGLOCK
2027 * and is running in a shadow mode
2028 */
2029 int update_shadow_va_mapping(unsigned long va,
2030 l1_pgentry_t val,
2031 struct exec_domain *ed,
2032 struct domain *d)
2034 unsigned long l1mfn;
2035 l1_pgentry_t spte;
2036 int rc = 0;
2038 check_pagetable(ed, "pre-va"); /* debug */
2039 shadow_lock(d);
2041 // This is actually overkill - we don't need to sync the L1 itself,
2042 // just everything involved in getting to this L1 (i.e. we need
2043 // linear_pg_table[l1_linear_offset(va)] to be in sync)...
2044 //
2045 __shadow_sync_va(ed, va);
2047 #if 1 /* keep check_pagetables() happy */
2048 /*
2049 * However, the above doesn't guarantee that there's no snapshot of
2050 * the L1 table in question; it just says that the relevant L2 and L1
2051 * entries for VA are in-sync. There might still be a snapshot.
2053 * The checking code in _check_pagetables() assumes that no one will
2054 * mutate the shadow of a page that has a snapshot. It's actually
2055 * OK to not sync this page, but it seems simpler to:
2056 * 1) keep all code paths the same, and
2057 * 2) maintain the invariant for _check_pagetables(), rather than try
2058 * to teach it about this boundary case.
2059 * So we flush this L1 page, if it's out of sync.
2060 */
2061 l1mfn = l2e_get_pfn(linear_l2_table(ed)[l2_table_offset(va)]);
2062 if ( mfn_out_of_sync(l1mfn) )
2064 perfc_incrc(extra_va_update_sync);
2065 __shadow_sync_mfn(d, l1mfn);
2067 #endif /* keep check_pagetables() happy */
2069 if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
2070 &val, sizeof(val))))
2072 rc = -EINVAL;
2073 goto out;
2076 // also need to update the shadow
2078 l1pte_propagate_from_guest(d, val, &spte);
2079 shadow_set_l1e(va, spte, 0);
2081 /*
2082 * If we're in log-dirty mode then we need to note that we've updated
2083 * the PTE in the PT-holding page. We need the machine frame number
2084 * for this.
2085 */
2086 if ( shadow_mode_log_dirty(d) )
2087 mark_dirty(d, va_to_l1mfn(ed, va));
2089 out:
2090 shadow_unlock(d);
2091 check_pagetable(ed, "post-va"); /* debug */
2093 return rc;
2096 int update_grant_va_mapping(unsigned long va,
2097 l1_pgentry_t _nl1e,
2098 struct domain *d,
2099 struct exec_domain *ed)
2101 /* Caller must:
2102 * . own d's BIGLOCK
2103 * . already have 'get_page' correctly on the to-be-installed nl1e
2104 * . be responsible for flushing the TLB
2105 * . check PTE being installed isn't DISALLOWED
2106 */
2108 int rc = 0;
2109 l1_pgentry_t *pl1e;
2110 l1_pgentry_t ol1e;
2112 cleanup_writable_pagetable(d);
2114 pl1e = &linear_pg_table[l1_linear_offset(va)];
2116 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
2117 rc = -EINVAL;
2118 else
2120 if ( update_l1e(pl1e, ol1e, _nl1e) )
2122 put_page_from_l1e(ol1e, d);
2123 if ( l1e_get_flags(ol1e) & _PAGE_PRESENT )
2124 rc = 0; /* Caller needs to invalidate TLB entry */
2125 else
2126 rc = 1; /* Caller need not invalidate TLB entry */
2128 else
2129 rc = -EINVAL;
2132 if ( unlikely(shadow_mode_enabled(d)) )
2133 update_shadow_va_mapping(va, _nl1e, ed, d);
2135 return rc;
2139 int do_update_va_mapping(unsigned long va,
2140 l1_pgentry_t val,
2141 unsigned long flags)
2143 struct exec_domain *ed = current;
2144 struct domain *d = ed->domain;
2145 unsigned int cpu = ed->processor;
2146 unsigned long vset, pset, bmap_ptr;
2147 int rc = 0;
2149 perfc_incrc(calls_to_update_va);
2151 if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
2152 return -EINVAL;
2154 LOCK_BIGLOCK(d);
2156 cleanup_writable_pagetable(d);
2158 if ( unlikely(shadow_mode_enabled(d)) )
2160 if ( unlikely(percpu_info[cpu].foreign &&
2161 (shadow_mode_translate(d) ||
2162 shadow_mode_translate(percpu_info[cpu].foreign))) )
2164 // The foreign domain's pfn's are in a different namespace.
2165 // There's not enough information in just a gpte to figure out
2166 // how to (re-)shadow this entry.
2167 //
2168 domain_crash();
2171 rc = update_shadow_va_mapping(va, val, ed, d);
2173 else if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
2174 val)) )
2175 rc = -EINVAL;
2177 switch ( flags & UVMF_FLUSHTYPE_MASK )
2179 case UVMF_TLB_FLUSH:
2180 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2182 case UVMF_LOCAL:
2183 if ( unlikely(shadow_mode_enabled(d)) )
2184 shadow_sync_all(d);
2185 local_flush_tlb();
2186 break;
2187 case UVMF_ALL:
2188 BUG_ON(shadow_mode_enabled(d) && (d->cpuset != (1<<cpu)));
2189 flush_tlb_mask(d->cpuset);
2190 break;
2191 default:
2192 if ( unlikely(get_user(vset, (unsigned long *)bmap_ptr)) )
2193 rc = -EFAULT;
2194 pset = vcpuset_to_pcpuset(d, vset);
2195 flush_tlb_mask(pset & d->cpuset);
2196 break;
2198 break;
2200 case UVMF_INVLPG:
2201 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2203 case UVMF_LOCAL:
2204 if ( unlikely(shadow_mode_enabled(d)) )
2205 shadow_invlpg(current, va);
2206 local_flush_tlb_one(va);
2207 break;
2208 case UVMF_ALL:
2209 BUG_ON(shadow_mode_enabled(d) && (d->cpuset != (1<<cpu)));
2210 flush_tlb_one_mask(d->cpuset, va);
2211 break;
2212 default:
2213 if ( unlikely(get_user(vset, (unsigned long *)bmap_ptr)) )
2214 rc = -EFAULT;
2215 pset = vcpuset_to_pcpuset(d, vset);
2216 BUG_ON(shadow_mode_enabled(d) && (pset != (1<<cpu)));
2217 flush_tlb_one_mask(pset & d->cpuset, va);
2218 break;
2220 break;
2223 process_deferred_ops(cpu);
2225 UNLOCK_BIGLOCK(d);
2227 return rc;
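/*
 * As do_update_va_mapping(), but applied to another domain's address
 * space. Only privileged domains may use this. The target domain is
 * recorded in percpu_info[cpu].foreign so the update code resolves its
 * frames correctly; that reference is released again when
 * do_update_va_mapping() runs process_deferred_ops().
 */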
2230 int do_update_va_mapping_otherdomain(unsigned long va,
2231 l1_pgentry_t val,
2232 unsigned long flags,
2233 domid_t domid)
2235 unsigned int cpu = smp_processor_id();
2236 struct domain *d;
2237 int rc;
2239 if ( unlikely(!IS_PRIV(current->domain)) )
2240 return -EPERM;
2242 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
2243 if ( unlikely(d == NULL) )
2245 MEM_LOG("Unknown domain '%u'", domid);
2246 return -ESRCH;
2249 rc = do_update_va_mapping(va, val, flags);
2251 return rc;
2256 /*************************
2257 * Descriptor Tables
2258 */
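/* Drop the frame references backing 'ed's GDT and clear its perdomain PTEs. */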
2260 void destroy_gdt(struct exec_domain *ed)
2262 int i;
2263 unsigned long pfn;
2265 for ( i = 0; i < 16; i++ )
2267 if ( (pfn = l1e_get_pfn(ed->arch.perdomain_ptes[i])) != 0 )
2268 put_page_and_type(&frame_table[pfn]);
2269 ed->arch.perdomain_ptes[i] = l1e_empty();
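/*
 * Validate and install a new GDT for 'ed'. The first frame is special:
 * Xen owns a range of reserved descriptors in it, which are re-copied
 * from the hypervisor's gdt_table once the frame passes its type checks.
 * On success the old GDT is torn down and the new frames are mapped
 * through ed->arch.perdomain_ptes.
 */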
2274 long set_gdt(struct exec_domain *ed,
2275 unsigned long *frames,
2276 unsigned int entries)
2278 struct domain *d = ed->domain;
2279 /* NB. There are 512 8-byte entries per GDT page. */
2280 int i = 0, nr_pages = (entries + 511) / 512;
2281 struct desc_struct *vgdt;
2282 unsigned long pfn;
2284 /* Check the first page in the new GDT. */
2285 if ( (pfn = frames[0]) >= max_page )
2286 goto fail;
2288 shadow_sync_all(d);
2290 /* The first page is special because Xen owns a range of entries in it. */
2291 if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
2293 /* GDT checks failed: try zapping the Xen reserved entries. */
2294 if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) )
2295 goto fail;
2296 vgdt = map_domain_mem(pfn << PAGE_SHIFT);
2297 memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0,
2298 NR_RESERVED_GDT_ENTRIES*8);
2299 unmap_domain_mem(vgdt);
2300 put_page_and_type(&frame_table[pfn]);
2302 /* Okay, we zapped the entries. Now try the GDT checks again. */
2303 if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
2304 goto fail;
2307 /* Check the remaining pages in the new GDT. */
2308 for ( i = 1; i < nr_pages; i++ )
2309 if ( ((pfn = frames[i]) >= max_page) ||
2310 !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
2311 goto fail;
2313 /* Copy reserved GDT entries to the new GDT. */
2314 vgdt = map_domain_mem(frames[0] << PAGE_SHIFT);
2315 memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY,
2316 gdt_table + FIRST_RESERVED_GDT_ENTRY,
2317 NR_RESERVED_GDT_ENTRIES*8);
2318 unmap_domain_mem(vgdt);
2320 /* Tear down the old GDT. */
2321 destroy_gdt(ed);
2323 /* Install the new GDT. */
2324 for ( i = 0; i < nr_pages; i++ )
2325 ed->arch.perdomain_ptes[i] =
2326 l1e_create_pfn(frames[i], __PAGE_HYPERVISOR);
2328 SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed));
2329 SET_GDT_ENTRIES(ed, entries);
2331 return 0;
2333 fail:
2334 while ( i-- > 0 )
2335 put_page_and_type(&frame_table[frames[i]]);
2336 return -EINVAL;
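/*
 * Hypercall wrapper around set_gdt(): copies the frame list from guest
 * memory, installs the GDT under the domain's BIGLOCK, and reloads the
 * GDTR on success.
 */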
2340 long do_set_gdt(unsigned long *frame_list, unsigned int entries)
2342 int nr_pages = (entries + 511) / 512;
2343 unsigned long frames[16];
2344 long ret;
2346 if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) )
2347 return -EINVAL;
2349 if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
2350 return -EFAULT;
2352 LOCK_BIGLOCK(current->domain);
2354 if ( (ret = set_gdt(current, frames, entries)) == 0 )
2356 local_flush_tlb();
2357 __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt));
2360 UNLOCK_BIGLOCK(current->domain);
2362 return ret;
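/*
 * Update a single 8-byte descriptor at guest-physical address 'pa'.
 * The containing frame must be a GDT page, an LDT page, or a plain
 * writable page owned by the caller, and the descriptor itself must
 * pass check_descriptor(); updates to the Xen-reserved range of an
 * installed GDT are refused.
 */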
2366 long do_update_descriptor(unsigned long pa, u64 desc)
2368 struct domain *dom = current->domain;
2369 unsigned long gpfn = pa >> PAGE_SHIFT;
2370 unsigned long mfn;
2371 struct desc_struct *gdt_pent, d;
2372 struct pfn_info *page;
2373 struct exec_domain *ed;
2374 long ret = -EINVAL;
2376 *(u64 *)&d = desc;
2378 LOCK_BIGLOCK(dom);
2380 if ( !VALID_MFN(mfn = __gpfn_to_mfn(dom, gpfn)) ) {
2381 UNLOCK_BIGLOCK(dom);
2382 return -EINVAL;
2385 if ( (pa & 7) || (mfn >= max_page) || !check_descriptor(&d) ) {
2386 UNLOCK_BIGLOCK(dom);
2387 return -EINVAL;
2390 page = &frame_table[mfn];
2391 if ( unlikely(!get_page(page, dom)) ) {
2392 UNLOCK_BIGLOCK(dom);
2393 return -EINVAL;
2396 /* Check if the given frame is in use in an unsafe context. */
2397 switch ( page->u.inuse.type_info & PGT_type_mask )
2399 case PGT_gdt_page:
2400 /* Disallow updates of Xen-reserved descriptors in the current GDT. */
2401 for_each_exec_domain(dom, ed) {
2402 if ( (l1e_get_pfn(ed->arch.perdomain_ptes[0]) == mfn) &&
2403 (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
2404 (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
2405 goto out;
2407 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
2408 goto out;
2409 break;
2410 case PGT_ldt_page:
2411 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
2412 goto out;
2413 break;
2414 default:
2415 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2416 goto out;
2417 break;
2420 if ( shadow_mode_enabled(dom) )
2422 shadow_lock(dom);
2424 if ( shadow_mode_log_dirty(dom) )
2425 __mark_dirty(dom, mfn);
2427 if ( page_is_page_table(page) && !page_out_of_sync(page) )
2428 shadow_mark_mfn_out_of_sync(current, gpfn, mfn);
2431 /* All is good so make the update. */
2432 gdt_pent = map_domain_mem((mfn << PAGE_SHIFT) | (pa & ~PAGE_MASK));
2433 memcpy(gdt_pent, &d, 8);
2434 unmap_domain_mem(gdt_pent);
2436 if ( shadow_mode_enabled(dom) )
2437 shadow_unlock(dom);
2439 put_page_type(page);
2441 ret = 0; /* success */
2443 out:
2444 put_page(page);
2446 UNLOCK_BIGLOCK(dom);
2448 return ret;
2453 /*************************
2454 * Writable Pagetables
2455 */
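/*
 * Overview: when a guest write-faults on one of its own L1 pagetable
 * pages, ptwr_do_page_fault() can temporarily disconnect that L1 from
 * the address space, grant the guest a writable mapping of it, and keep
 * a snapshot of its old contents. ptwr_flush() later re-protects the
 * page and revalidates only the entries that actually changed. One
 * ACTIVE slot (an L1 currently hooked into the running page table) and
 * one INACTIVE slot are tracked per domain; anything more complex falls
 * back to the x86 emulator below.
 */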
2457 #ifdef VERBOSE
2458 int ptwr_debug = 0x0;
2459 #define PTWR_PRINTK(_f, _a...) \
2460 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
2461 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
2462 #else
2463 #define PTWR_PRINTK(_f, _a...) ((void)0)
2464 #endif
2466 /* Flush the given writable p.t. page and write-protect it again. */
2467 void ptwr_flush(struct domain *d, const int which)
2469 unsigned long pte, *ptep, l1va;
2470 l1_pgentry_t *pl1e, ol1e, nl1e;
2471 l2_pgentry_t *pl2e;
2472 int i;
2473 unsigned int modified = 0;
2475 // not supported in combination with various shadow modes!
2476 ASSERT( !shadow_mode_enabled(d) );
2478 l1va = d->arch.ptwr[which].l1va;
2479 ptep = (unsigned long *)&linear_pg_table[l1_linear_offset(l1va)];
2481 /*
2482 * STEP 1. Write-protect the p.t. page so no more updates can occur.
2483 */
2485 if ( unlikely(__get_user(pte, ptep)) )
2487 MEM_LOG("ptwr: Could not read pte at %p", ptep);
2488 /*
2489 * Really a bug: we were able to read this PTE during the initial fault,
2490 * and the pagetables can't have changed in the meantime.
2491 */
2492 BUG();
2494 PTWR_PRINTK("[%c] disconnected_l1va at %p is %p\n",
2495 PTWR_PRINT_WHICH, ptep, pte);
2496 pte &= ~_PAGE_RW;
2498 /* Write-protect the p.t. page in the guest page table. */
2499 if ( unlikely(__put_user(pte, ptep)) )
2501 MEM_LOG("ptwr: Could not update pte at %p", ptep);
2502 /*
2503 * Really a bug: we were able to write this PTE during the initial fault,
2504 * and the pagetables can't have changed in the meantime.
2505 */
2506 BUG();
2509 /* Ensure that there are no stale writable mappings in any TLB. */
2510 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
2511 local_flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
2512 PTWR_PRINTK("[%c] disconnected_l1va at %p now %p\n",
2513 PTWR_PRINT_WHICH, ptep, pte);
2515 /*
2516 * STEP 2. Validate any modified PTEs.
2517 */
2519 pl1e = d->arch.ptwr[which].pl1e;
2520 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2522 ol1e = d->arch.ptwr[which].page[i];
2523 nl1e = pl1e[i];
2525 if ( likely(l1e_get_value(ol1e) == l1e_get_value(nl1e)) )
2526 continue;
2528 /* Update number of entries modified. */
2529 modified++;
2531 /*
2532 * Fast path for PTEs that have merely been write-protected
2533 * (e.g., during a Unix fork()). A strict reduction in privilege.
2534 */
2535 if ( likely(l1e_get_value(ol1e) == (l1e_get_value(nl1e)|_PAGE_RW)) )
2537 if ( likely(l1e_get_flags(nl1e) & _PAGE_PRESENT) )
2538 put_page_type(&frame_table[l1e_get_pfn(nl1e)]);
2539 continue;
2542 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
2544 MEM_LOG("ptwr: Could not re-validate l1 page\n");
2545 /*
2546 * Make the remaining p.t. entries consistent before crashing, so the
2547 * reference counts are correct.
2548 */
2549 memcpy(&pl1e[i], &d->arch.ptwr[which].page[i],
2550 (L1_PAGETABLE_ENTRIES - i) * sizeof(l1_pgentry_t));
2551 domain_crash();
2552 break;
2555 put_page_from_l1e(ol1e, d);
2557 unmap_domain_mem(pl1e);
2559 perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
2560 d->arch.ptwr[which].prev_nr_updates = modified;
2562 /*
2563 * STEP 3. Reattach the L1 p.t. page into the current address space.
2564 */
2566 if ( which == PTWR_PT_ACTIVE )
2568 pl2e = &__linear_l2_table[d->arch.ptwr[which].l2_idx];
2569 l2e_add_flags(pl2e, _PAGE_PRESENT);
2572 /*
2573 * STEP 4. Final tidy-up.
2574 */
2576 d->arch.ptwr[which].l1va = 0;
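/*
 * Emulated update of a single PTE in a read-only-mapped pagetable page.
 * The new value is validated with get_page_from_l1e() before being
 * written (or cmpxchg'd) directly into the machine frame, and the
 * reference held by the old PTE is then dropped.
 */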
2579 static int ptwr_emulated_update(
2580 unsigned long addr,
2581 unsigned long old,
2582 unsigned long val,
2583 unsigned int bytes,
2584 unsigned int do_cmpxchg)
2586 unsigned long pfn;
2587 struct pfn_info *page;
2588 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
2589 struct domain *d = current->domain;
2591 /* Aligned access only, thank you. */
2592 if ( !access_ok(VERIFY_WRITE, addr, bytes) || ((addr & (bytes-1)) != 0) )
2594 MEM_LOG("ptwr_emulate: Unaligned or bad size ptwr access (%d, %p)\n",
2595 bytes, addr);
2596 return X86EMUL_UNHANDLEABLE;
2599 /* Turn a sub-word access into a full-word access. */
2600 /* FIXME: needs tweaks for PAE */
2601 if ( (addr & ((BITS_PER_LONG/8)-1)) != 0 )
2603 int rc;
2604 unsigned long full;
2605 unsigned int mask = addr & ((BITS_PER_LONG/8)-1);
2606 /* Align address; read full word. */
2607 addr &= ~((BITS_PER_LONG/8)-1);
2608 if ( (rc = x86_emulate_read_std(addr, &full, BITS_PER_LONG/8)) )
2609 return rc;
2610 /* Mask out bits provided by caller. */
2611 full &= ~(((1UL << (bytes*8)) - 1UL) << (mask*8));
2612 /* Shift the caller value and OR in the missing bits. */
2613 val &= (1UL << (bytes*8)) - 1UL;
2614 val <<= mask*8;
2615 val |= full;
2618 /* Read the PTE that maps the page being updated. */
2619 if (__copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
2620 sizeof(pte)))
2622 MEM_LOG("ptwr_emulate: Cannot read through linear_pg_table\n");
2623 return X86EMUL_UNHANDLEABLE;
2626 pfn = l1e_get_pfn(pte);
2627 page = &frame_table[pfn];
2629 /* We are looking only for read-only mappings of p.t. pages. */
2630 if ( ((l1e_get_flags(pte) & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
2631 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
2632 (page_get_owner(page) != d) )
2634 MEM_LOG("ptwr_emulate: Page is mistyped or bad pte (%p, %x)\n",
2635 pte, page->u.inuse.type_info);
2636 return X86EMUL_UNHANDLEABLE;
2639 /* Check the new PTE. */
2640 nl1e = l1e_create_phys(val, val & ~PAGE_MASK);
2641 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
2642 return X86EMUL_UNHANDLEABLE;
2644 /* Checked successfully: do the update (write or cmpxchg). */
2645 pl1e = map_domain_mem(page_to_phys(page) + (addr & ~PAGE_MASK));
2646 if ( do_cmpxchg )
2648 ol1e = l1e_create_phys(old, old & ~PAGE_MASK);
2649 if ( cmpxchg((unsigned long *)pl1e, old, val) != old )
2651 unmap_domain_mem(pl1e);
2652 put_page_from_l1e(nl1e, d);
2653 return X86EMUL_CMPXCHG_FAILED;
2656 else
2658 ol1e = *pl1e;
2659 *pl1e = nl1e;
2661 unmap_domain_mem(pl1e);
2663 /* Propagate update to shadow cache. */
2664 if ( unlikely(shadow_mode_enabled(d)) )
2666 BUG(); // XXX fix me...
2667 #if 0
2668 sstat = get_shadow_status(d, page_to_pfn(page));
2669 if ( sstat & PSH_shadowed )
2671 sl1e = map_domain_mem(
2672 ((sstat & PSH_pfn_mask) << PAGE_SHIFT) + (addr & ~PAGE_MASK));
2673 l1pte_propagate_from_guest(d, &nl1e, sl1e);
2674 unmap_domain_mem(sl1e);
2676 #endif
2679 /* Finally, drop the old PTE. */
2680 put_page_from_l1e(ol1e, d);
2682 return X86EMUL_CONTINUE;
2685 static int ptwr_emulated_write(
2686 unsigned long addr,
2687 unsigned long val,
2688 unsigned int bytes)
2690 return ptwr_emulated_update(addr, 0, val, bytes, 0);
2693 static int ptwr_emulated_cmpxchg(
2694 unsigned long addr,
2695 unsigned long old,
2696 unsigned long new,
2697 unsigned int bytes)
2699 return ptwr_emulated_update(addr, old, new, bytes, 1);
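/*
 * Glue for the x86 instruction emulator: reads go straight to guest
 * memory, while emulated writes and cmpxchgs of pagetable pages are
 * routed through the validating helpers above.
 */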
2702 static struct x86_mem_emulator ptwr_mem_emulator = {
2703 .read_std = x86_emulate_read_std,
2704 .write_std = x86_emulate_write_std,
2705 .read_emulated = x86_emulate_read_std,
2706 .write_emulated = ptwr_emulated_write,
2707 .cmpxchg_emulated = ptwr_emulated_cmpxchg
2708 };
2710 /* Write page fault handler: check if guest is trying to modify a PTE. */
2711 int ptwr_do_page_fault(struct domain *d, unsigned long addr)
2713 unsigned long pfn;
2714 struct pfn_info *page;
2715 l1_pgentry_t pte;
2716 l2_pgentry_t *pl2e;
2717 int which;
2718 u32 l2_idx;
2720 if ( unlikely(shadow_mode_enabled(d)) )
2721 return 0;
2723 /*
2724 * Attempt to read the PTE that maps the VA being accessed. By checking for
2725 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
2726 */
2727 if ( !(l2e_get_flags(__linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
2728 _PAGE_PRESENT) ||
2729 __copy_from_user(&pte,&linear_pg_table[l1_linear_offset(addr)],
2730 sizeof(pte)) )
2732 return 0;
2735 pfn = l1e_get_pfn(pte);
2736 page = &frame_table[pfn];
2738 /* We are looking only for read-only mappings of p.t. pages. */
2739 if ( ((l1e_get_flags(pte) & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
2740 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
2741 (page_get_owner(page) != d) )
2743 return 0;
2746 /* x86/64: Writable pagetable code needs auditing. Use emulator for now. */
2747 #if defined(__x86_64__)
2748 goto emulate;
2749 #endif
2751 /* Writable pagetables are not yet SMP safe. Use emulator for now. */
2752 if ( d->exec_domain[0]->ed_next_list != NULL )
2753 goto emulate;
2755 /* Get the L2 index at which this L1 p.t. is always mapped. */
2756 l2_idx = page->u.inuse.type_info & PGT_va_mask;
2757 if ( unlikely(l2_idx >= PGT_va_unknown) )
2758 goto emulate; /* Urk! This L1 is mapped in multiple L2 slots! */
2759 l2_idx >>= PGT_va_shift;
2761 if ( unlikely(l2_idx == (addr >> L2_PAGETABLE_SHIFT)) )
2762 goto emulate; /* Urk! Pagetable maps itself! */
2764 /*
2765 * Is the L1 p.t. mapped into the current address space? If so we call it
2766 * an ACTIVE p.t., otherwise it is INACTIVE.
2767 */
2768 pl2e = &__linear_l2_table[l2_idx];
2769 which = PTWR_PT_INACTIVE;
2770 if ( (l2e_get_pfn(*pl2e)) == pfn )
2772 /*
2773 * Check the PRESENT bit to set ACTIVE mode.
2774 * If the PRESENT bit is clear, we may be conflicting with the current
2775 * ACTIVE p.t. (it may be the same p.t. mapped at another virt addr).
2776 * The ptwr_flush call below will restore the PRESENT bit.
2777 */
2778 if ( likely(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ||
2779 (d->arch.ptwr[PTWR_PT_ACTIVE].l1va &&
2780 (l2_idx == d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx)) )
2781 which = PTWR_PT_ACTIVE;
2784 PTWR_PRINTK("[%c] page_fault on l1 pt at va %p, pt for %08x, "
2785 "pfn %p\n", PTWR_PRINT_WHICH,
2786 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
2788 /*
2789 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
2790 * time. If there is already one, we must flush it out.
2791 */
2792 if ( d->arch.ptwr[which].l1va )
2793 ptwr_flush(d, which);
2795 /*
2796 * If last batch made no updates then we are probably stuck. Emulate this
2797 * update to ensure we make progress.
2798 */
2799 if ( d->arch.ptwr[which].prev_nr_updates == 0 )
2800 goto emulate;
2802 d->arch.ptwr[which].l1va = addr | 1;
2803 d->arch.ptwr[which].l2_idx = l2_idx;
2805 /* For safety, disconnect the L1 p.t. page from current space. */
2806 if ( which == PTWR_PT_ACTIVE )
2808 l2e_remove_flags(pl2e, _PAGE_PRESENT);
2809 local_flush_tlb(); /* XXX Multi-CPU guests? */
2812 /* Temporarily map the L1 page, and make a copy of it. */
2813 d->arch.ptwr[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
2814 memcpy(d->arch.ptwr[which].page,
2815 d->arch.ptwr[which].pl1e,
2816 L1_PAGETABLE_ENTRIES * sizeof(l1_pgentry_t));
2818 /* Finally, make the p.t. page writable by the guest OS. */
2819 l1e_add_flags(&pte, _PAGE_RW);
2820 PTWR_PRINTK("[%c] update %p pte to %p\n", PTWR_PRINT_WHICH,
2821 &linear_pg_table[addr>>PAGE_SHIFT], pte);
2822 if ( unlikely(__copy_to_user(&linear_pg_table[addr>>PAGE_SHIFT],
2823 &pte, sizeof(pte))) )
2825 MEM_LOG("ptwr: Could not update pte at %p", (unsigned long *)
2826 &linear_pg_table[addr>>PAGE_SHIFT]);
2827 /* Toss the writable pagetable state and crash. */
2828 unmap_domain_mem(d->arch.ptwr[which].pl1e);
2829 d->arch.ptwr[which].l1va = 0;
2830 domain_crash();
2831 return 0;
2834 return EXCRET_fault_fixed;
2836 emulate:
2837 if ( x86_emulate_memop(get_execution_context(), addr,
2838 &ptwr_mem_emulator, BITS_PER_LONG/8) )
2839 return 0;
2840 perfc_incrc(ptwr_emulations);
2841 return EXCRET_fault_fixed;
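/*
 * Allocate the two per-domain snapshot pages used by ptwr_flush() for
 * the ACTIVE and INACTIVE writable-pagetable slots.
 */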
2844 int ptwr_init(struct domain *d)
2846 void *x = (void *)alloc_xenheap_page();
2847 void *y = (void *)alloc_xenheap_page();
2849 if ( (x == NULL) || (y == NULL) )
2851 if ( x != NULL )
2852 free_xenheap_page((unsigned long)x);
2853 if ( y != NULL )
2854 free_xenheap_page((unsigned long)y);
2855 return -ENOMEM;
2858 d->arch.ptwr[PTWR_PT_ACTIVE].page = x;
2859 d->arch.ptwr[PTWR_PT_INACTIVE].page = y;
2861 return 0;
2864 void ptwr_destroy(struct domain *d)
2866 cleanup_writable_pagetable(d);
2867 free_xenheap_page((unsigned long)d->arch.ptwr[PTWR_PT_ACTIVE].page);
2868 free_xenheap_page((unsigned long)d->arch.ptwr[PTWR_PT_INACTIVE].page);
2873 /************************************************************************/
2874 /************************************************************************/
2875 /************************************************************************/
2877 /* Graveyard: stuff below may be useful in future. */
2878 #if 0
2879 case MMUEXT_TRANSFER_PAGE:
2880 domid = (domid_t)(val >> 16);
2881 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
2883 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
2884 unlikely(!pfn_valid(pfn)) ||
2885 unlikely((e = find_domain_by_id(domid)) == NULL) )
2887 MEM_LOG("Bad frame (%p) or bad domid (%d).\n", pfn, domid);
2888 okay = 0;
2889 break;
2892 spin_lock(&d->page_alloc_lock);
2894 /*
2895 * The tricky bit: atomically release ownership while there is just one
2896 * benign reference to the page (PGC_allocated). If that reference
2897 * disappears then the deallocation routine will safely spin.
2898 */
2899 _d = pickle_domptr(d);
2900 _nd = page->u.inuse._domain;
2901 y = page->count_info;
2902 do {
2903 x = y;
2904 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2905 (1|PGC_allocated)) ||
2906 unlikely(_nd != _d) )
2908 MEM_LOG("Bad page values %p: ed=%p(%u), sd=%p,"
2909 " caf=%08x, taf=%08x\n", page_to_pfn(page),
2910 d, d->id, unpickle_domptr(_nd), x,
2911 page->u.inuse.type_info);
2912 spin_unlock(&d->page_alloc_lock);
2913 put_domain(e);
2914 return 0;
2916 __asm__ __volatile__(
2917 LOCK_PREFIX "cmpxchg8b %2"
2918 : "=d" (_nd), "=a" (y),
2919 "=m" (*(volatile u64 *)(&page->count_info))
2920 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2922 while ( unlikely(_nd != _d) || unlikely(y != x) );
2924 /*
2925 * Unlink from 'd'. At least one reference remains (now anonymous), so
2926 * no one else is spinning to try to delete this page from 'd'.
2927 */
2928 d->tot_pages--;
2929 list_del(&page->list);
2931 spin_unlock(&d->page_alloc_lock);
2933 spin_lock(&e->page_alloc_lock);
2935 /*
2936 * Check that 'e' will accept the page and has reservation headroom.
2937 * Also, a domain mustn't have PGC_allocated pages when it is dying.
2938 */
2939 ASSERT(e->tot_pages <= e->max_pages);
2940 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
2941 unlikely(e->tot_pages == e->max_pages) ||
2942 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
2944 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
2945 "provided a bad grant ref, or is dying (%p).\n",
2946 e->tot_pages, e->max_pages, e->d_flags);
2947 spin_unlock(&e->page_alloc_lock);
2948 put_domain(e);
2949 okay = 0;
2950 break;
2953 /* Okay, add the page to 'e'. */
2954 if ( unlikely(e->tot_pages++ == 0) )
2955 get_knownalive_domain(e);
2956 list_add_tail(&page->list, &e->page_list);
2957 page_set_owner(page, e);
2959 spin_unlock(&e->page_alloc_lock);
2961 /* Transfer is all done: tell the guest about its new page frame. */
2962 gnttab_notify_transfer(e, d, gntref, pfn);
2964 put_domain(e);
2965 break;
2966 #endif
2968 /*
2969 * Local variables:
2970 * mode: C
2971 * c-set-style: "BSD"
2972 * c-basic-offset: 4
2973 * tab-width: 4
2974 * indent-tabs-mode: nil
2975 * End:
2976 */