
view xen/arch/x86/mm/shadow/multi.c @ 16725:25771a7c2907

x86, hvm: Add a perf counter for CR0.WP=0 emulation.
Signed-off-by: Kevin Tian <kevin.tian@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Jan 08 09:57:59 2008 +0000 (2008-01-08)
parents e818c24cec03
children cff4c8a1aa28
1 /******************************************************************************
2 * arch/x86/mm/shadow/multi.c
3 *
4 * Simple, mostly-synchronous shadow page tables.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/domain_page.h>
31 #include <asm/page.h>
32 #include <asm/current.h>
33 #include <asm/shadow.h>
34 #include <asm/flushtlb.h>
35 #include <asm/hvm/hvm.h>
36 #include <asm/hvm/cacheattr.h>
37 #include <asm/mtrr.h>
38 #include "private.h"
39 #include "types.h"
41 /* THINGS TO DO LATER:
42 *
43 * TEARDOWN HEURISTICS
44 * Also: have a heuristic for when to destroy a previous paging-mode's
45 * shadows. When a guest is done with its start-of-day 32-bit tables
46 * and reuses the memory, we want to drop those shadows. Start with
47 * shadows in a page in two modes as a hint, but beware of clever tricks
48 * like reusing a pagetable for both PAE and 64-bit during boot...
49 *
50 * PAE LINEAR MAPS
51 * Rework shadow_get_l*e() to have the option of using map_domain_page()
52 * instead of linear maps. Add appropriate unmap_l*e calls in the users.
53 * Then we can test the speed difference made by linear maps. If the
54 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
55 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
56 * to share l2h pages again.
57 *
58 * GUEST_WALK_TABLES TLB FLUSH COALESCE
59 * guest_walk_tables can do up to three remote TLB flushes as it walks to
60 * the first l1 of a new pagetable. Should coalesce the flushes to the end,
61 * and if we do flush, re-do the walk. If anything has changed, then
62 * pause all the other vcpus and do the walk *again*.
63 *
64 * PSE disabled / PSE36
65 * We don't support any modes other than PSE enabled, PSE36 disabled.
66 * Neither of those would be hard to change, but we'd need to be able to
67 * deal with shadows made in one mode and used in another.
68 */
70 #define FETCH_TYPE_PREFETCH 1
71 #define FETCH_TYPE_DEMAND 2
72 #define FETCH_TYPE_WRITE 4
73 typedef enum {
74 ft_prefetch = FETCH_TYPE_PREFETCH,
75 ft_demand_read = FETCH_TYPE_DEMAND,
76 ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
77 } fetch_type_t;
79 #ifdef DEBUG_TRACE_DUMP
80 static char *fetch_type_names[] = {
81 [ft_prefetch] "prefetch",
82 [ft_demand_read] "demand read",
83 [ft_demand_write] "demand write",
84 };
85 #endif
87 /**************************************************************************/
88 /* Hash table mapping from guest pagetables to shadows
89 *
90 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
91 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
92 * shadow L1 which maps its "splinters".
93 */
95 static inline mfn_t
96 get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
97 /* Look for FL1 shadows in the hash table */
98 {
99 mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
100 return smfn;
101 }
103 static inline mfn_t
104 get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
105 /* Look for shadows in the hash table */
106 {
107 mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
108 perfc_incr(shadow_get_shadow_status);
109 return smfn;
110 }
112 static inline void
113 set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
114 /* Put an FL1 shadow into the hash table */
115 {
116 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
117 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
119 shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
120 }
122 static inline void
123 set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
124 /* Put a shadow into the hash table */
125 {
126 struct domain *d = v->domain;
127 int res;
129 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
130 d->domain_id, v->vcpu_id, mfn_x(gmfn),
131 shadow_type, mfn_x(smfn));
133 /* 32-on-64 PV guests don't own their l4 pages so can't get_page them */
134 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
135 {
136 res = get_page(mfn_to_page(gmfn), d);
137 ASSERT(res == 1);
138 }
140 shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
141 }
143 static inline void
144 delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
145 /* Remove a shadow from the hash table */
146 {
147 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
148 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
149 shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
150 }
152 static inline void
153 delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
154 /* Remove a shadow from the hash table */
155 {
156 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
157 v->domain->domain_id, v->vcpu_id,
158 mfn_x(gmfn), shadow_type, mfn_x(smfn));
159 shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
160 /* 32-on-64 PV guests don't own their l4 pages; see set_shadow_status */
161 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
162 put_page(mfn_to_page(gmfn));
163 }
165 /**************************************************************************/
166 /* CPU feature support querying */
168 static inline int
169 guest_supports_superpages(struct vcpu *v)
170 {
171 /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
172 * CR4.PSE is set or the guest is in PAE or long mode.
173 * It's also used in the dummy PT for vcpus with CR0.PG cleared. */
174 return (is_hvm_vcpu(v) &&
175 (GUEST_PAGING_LEVELS != 2
176 || !hvm_paging_enabled(v)
177 || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
178 }
180 static inline int
181 guest_supports_nx(struct vcpu *v)
182 {
183 if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
184 return 0;
185 if ( !is_hvm_vcpu(v) )
186 return cpu_has_nx;
187 return hvm_nx_enabled(v);
188 }
191 /**************************************************************************/
192 /* Functions for walking the guest page tables */
194 /* Flags that are needed in a pagetable entry, with the sense of NX inverted */
195 static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec)
196 {
197 static uint32_t flags[] = {
198 /* I/F - Usr Wr */
199 /* 0 0 0 0 */ _PAGE_PRESENT,
200 /* 0 0 0 1 */ _PAGE_PRESENT|_PAGE_RW,
201 /* 0 0 1 0 */ _PAGE_PRESENT|_PAGE_USER,
202 /* 0 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
203 /* 0 1 0 0 */ _PAGE_PRESENT,
204 /* 0 1 0 1 */ _PAGE_PRESENT|_PAGE_RW,
205 /* 0 1 1 0 */ _PAGE_PRESENT|_PAGE_USER,
206 /* 0 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
207 /* 1 0 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
208 /* 1 0 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
209 /* 1 0 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
210 /* 1 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
211 /* 1 1 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
212 /* 1 1 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
213 /* 1 1 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
214 /* 1 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
215 };
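/* Indexing sketch: bit 0 of pfec (PFEC_page_present) is dropped by the
 * ">> 1" below, so the table index is built from the Write, User,
 * Reserved and Insn-fetch bits of the fault code. E.g. a user-mode write
 * fault has pfec = PFEC_page_present|PFEC_write_access|PFEC_user_mode
 * = 0x7, giving index 3: the walk then demands
 * _PAGE_PRESENT|_PAGE_RW|_PAGE_USER at every level. */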
217 /* Don't demand not-NX if the CPU wouldn't enforce it. */
218 if ( !guest_supports_nx(v) )
219 pfec &= ~PFEC_insn_fetch;
221 /* Don't demand R/W if the CPU wouldn't enforce it. */
222 if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v))
223 && !(pfec & PFEC_user_mode) )
224 pfec &= ~PFEC_write_access;
226 return flags[(pfec & 0x1f) >> 1];
227 }
229 /* Modify a guest pagetable entry to set the Accessed and Dirty bits.
230 * Returns non-zero if it actually writes to guest memory. */
231 static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
232 {
233 guest_intpte_t old, new;
235 old = *(guest_intpte_t *)walk_p;
236 new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
237 if ( old != new )
238 {
239 /* Write the new entry into the walk, and try to write it back
240 * into the guest table as well. If the guest table has changed
241 * under our feet then leave it alone. */
242 *(guest_intpte_t *)walk_p = new;
243 if ( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old )
244 return 1;
245 }
246 return 0;
247 }
249 /* Walk the guest pagetables, after the manner of a hardware walker.
250 *
251 * Inputs: a vcpu, a virtual address, a walk_t to fill, a
252 * pointer to a pagefault code, and a flag "shadow_op".
253 *
254 * We walk the vcpu's guest pagetables, filling the walk_t with what we
255 * see and adding any Accessed and Dirty bits that are needed in the
256 * guest entries. Using the pagefault code, we check the permissions as
257 * we go. For the purposes of reading pagetables we treat all non-RAM
258 * memory as containing zeroes.
259 *
260 * If "shadow_op" is non-zero, we are serving a genuine guest memory access,
261 * and must (a) be under the shadow lock, and (b) remove write access
262 * from any guest PT pages we see, as we will be shadowing them soon
263 * and will rely on the contents' not having changed.
264 *
265 * Returns 0 for success, or the set of permission bits that we failed on
266 * if the walk did not complete.
267 * N.B. This is different from the old return code but almost no callers
268 * checked the old return code anyway.
269 */
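/* (For example, _PAGE_PRESENT in the return value means some level was
 * not present or mapped to non-RAM; _PAGE_RW alone means every level was
 * present but a write was attempted through an entry without write
 * permission.) */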
270 static uint32_t
271 guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw,
272 uint32_t pfec, int shadow_op)
273 {
274 struct domain *d = v->domain;
275 p2m_type_t p2mt;
276 guest_l1e_t *l1p = NULL;
277 guest_l2e_t *l2p = NULL;
278 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
279 guest_l3e_t *l3p = NULL;
280 guest_l4e_t *l4p;
281 #endif
282 uint32_t gflags, mflags, rc = 0;
283 int pse;
285 ASSERT(!shadow_op || shadow_locked_by_me(d));
287 perfc_incr(shadow_guest_walk);
288 memset(gw, 0, sizeof(*gw));
289 gw->va = va;
291 /* Mandatory bits that must be set in every entry. We invert NX, to
292 * calculate as if there were an "X" bit that allowed access.
293 * We will accumulate, in rc, the set of flags that are missing. */
294 mflags = mandatory_flags(v, pfec);
296 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
297 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
299 /* Get the l4e from the top level table and check its flags*/
300 gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
301 l4p = ((guest_l4e_t *)v->arch.paging.shadow.guest_vtable);
302 gw->l4e = l4p[guest_l4_table_offset(va)];
303 gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
304 rc |= ((gflags & mflags) ^ mflags);
305 if ( rc & _PAGE_PRESENT ) goto out;
307 /* Map the l3 table */
308 gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
309 if ( !p2m_is_ram(p2mt) )
310 {
311 rc |= _PAGE_PRESENT;
312 goto out;
313 }
314 ASSERT(mfn_valid(gw->l3mfn));
315 /* This mfn is a pagetable: make sure the guest can't write to it. */
316 if ( shadow_op && sh_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
317 flush_tlb_mask(d->domain_dirty_cpumask);
318 /* Get the l3e and check its flags*/
319 l3p = sh_map_domain_page(gw->l3mfn);
320 gw->l3e = l3p[guest_l3_table_offset(va)];
321 gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
322 rc |= ((gflags & mflags) ^ mflags);
323 if ( rc & _PAGE_PRESENT )
324 goto out;
326 #else /* PAE only... */
328 /* Get l3e from the cache of the top level table and check its flags */
329 gw->l3e = v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)];
330 if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) )
331 {
332 rc |= _PAGE_PRESENT;
333 goto out;
334 }
336 #endif /* PAE or 64... */
338 /* Map the l2 table */
339 gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
340 if ( !p2m_is_ram(p2mt) )
341 {
342 rc |= _PAGE_PRESENT;
343 goto out;
344 }
345 ASSERT(mfn_valid(gw->l2mfn));
346 /* This mfn is a pagetable: make sure the guest can't write to it. */
347 if ( shadow_op && sh_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
348 flush_tlb_mask(d->domain_dirty_cpumask);
349 /* Get the l2e */
350 l2p = sh_map_domain_page(gw->l2mfn);
351 gw->l2e = l2p[guest_l2_table_offset(va)];
353 #else /* 32-bit only... */
355 /* Get l2e from the top level table */
356 gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
357 l2p = ((guest_l2e_t *)v->arch.paging.shadow.guest_vtable);
358 gw->l2e = l2p[guest_l2_table_offset(va)];
360 #endif /* All levels... */
362 gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
363 rc |= ((gflags & mflags) ^ mflags);
364 if ( rc & _PAGE_PRESENT )
365 goto out;
367 pse = (guest_supports_superpages(v) &&
368 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE));
370 if ( pse )
371 {
372 /* Special case: this guest VA is in a PSE superpage, so there's
373 * no guest l1e. We make one up so that the propagation code
374 * can generate a shadow l1 table. Start with the gfn of the
375 * first 4k-page of the superpage. */
376 gfn_t start = guest_l2e_get_gfn(gw->l2e);
377 /* Grant full access in the l1e, since all the guest entry's
378 * access controls are enforced in the shadow l2e. */
379 int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
380 _PAGE_ACCESSED|_PAGE_DIRTY);
381 /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
382 * of the level 1. */
383 if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) )
384 flags |= _PAGE_PAT;
385 /* Copy the cache-control bits to the l1 as well, because we
386 * can't represent PAT in the (non-PSE) shadow l2e. :(
387 * This could cause problems if a guest ever maps an area of
388 * memory with superpages using more than one caching mode. */
389 flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD);
390 /* Increment the pfn by the right number of 4k pages.
391 * The ~0x1 is to mask out the PAT bit mentioned above. */
392 start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
393 gw->l1e = guest_l1e_from_gfn(start, flags);
394 gw->l1mfn = _mfn(INVALID_MFN);
395 }
396 else
397 {
398 /* Not a superpage: carry on and find the l1e. */
399 gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
400 if ( !p2m_is_ram(p2mt) )
401 {
402 rc |= _PAGE_PRESENT;
403 goto out;
404 }
405 ASSERT(mfn_valid(gw->l1mfn));
406 /* This mfn is a pagetable: make sure the guest can't write to it. */
407 if ( shadow_op
408 && sh_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
409 flush_tlb_mask(d->domain_dirty_cpumask);
410 l1p = sh_map_domain_page(gw->l1mfn);
411 gw->l1e = l1p[guest_l1_table_offset(va)];
412 gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
413 rc |= ((gflags & mflags) ^ mflags);
414 }
416 /* Go back and set accessed and dirty bits only if the walk was a
417 * success. Although the PRMs say higher-level _PAGE_ACCESSED bits
418 * get set whenever a lower-level PT is used, at least some hardware
419 * walkers behave this way. */
420 if ( rc == 0 )
421 {
422 #if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
423 if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
424 paging_mark_dirty(d, mfn_x(gw->l4mfn));
425 if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
426 paging_mark_dirty(d, mfn_x(gw->l3mfn));
427 #endif
428 if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
429 (pse && (pfec & PFEC_write_access))) )
430 paging_mark_dirty(d, mfn_x(gw->l2mfn));
431 if ( !pse )
432 {
433 if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e,
434 (pfec & PFEC_write_access)) )
435 paging_mark_dirty(d, mfn_x(gw->l1mfn));
436 }
437 }
439 out:
440 #if GUEST_PAGING_LEVELS == 4
441 if ( l3p ) sh_unmap_domain_page(l3p);
442 #endif
443 #if GUEST_PAGING_LEVELS >= 3
444 if ( l2p ) sh_unmap_domain_page(l2p);
445 #endif
446 if ( l1p ) sh_unmap_domain_page(l1p);
448 return rc;
449 }
451 /* Given a walk_t, translate the gw->va into the guest's notion of the
452 * corresponding frame number. */
453 static inline gfn_t
454 guest_walk_to_gfn(walk_t *gw)
455 {
456 if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
457 return _gfn(INVALID_GFN);
458 return guest_l1e_get_gfn(gw->l1e);
459 }
461 /* Given a walk_t, translate the gw->va into the guest's notion of the
462 * corresponding physical address. */
463 static inline paddr_t
464 guest_walk_to_gpa(walk_t *gw)
465 {
466 if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
467 return 0;
468 return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK);
469 }
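/* Sketch of typical use of the walker and the translators above (the
 * local names here are illustrative only):
 *
 *     walk_t gw;
 *     paddr_t gpa = 0;
 *     if ( guest_walk_tables(v, va, &gw, PFEC_page_present, 0) == 0 )
 *         gpa = guest_walk_to_gpa(&gw);
 *
 * i.e. the translation is only meaningful when the walk returned 0
 * (no missing permission bits). */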
471 #if 0 /* Keep for debugging */
472 /* Pretty-print the contents of a guest-walk */
473 static inline void print_gw(walk_t *gw)
474 {
475 SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
476 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
477 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
478 SHADOW_PRINTK(" l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
479 SHADOW_PRINTK(" l4e=%" SH_PRI_gpte "\n", gw->l4e.l4);
480 SHADOW_PRINTK(" l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
481 #endif /* PAE or 64... */
482 SHADOW_PRINTK(" l3e=%" SH_PRI_gpte "\n", gw->l3e.l3);
483 #endif /* All levels... */
484 SHADOW_PRINTK(" l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
485 SHADOW_PRINTK(" l2e=%" SH_PRI_gpte "\n", gw->l2e.l2);
486 SHADOW_PRINTK(" l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
487 SHADOW_PRINTK(" l1e=%" SH_PRI_gpte "\n", gw->l1e.l1);
488 }
489 #endif /* 0 */
491 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
492 /* Lightweight audit: pass all the shadows associated with this guest walk
493 * through the audit mechanisms */
494 static void sh_audit_gw(struct vcpu *v, walk_t *gw)
495 {
496 mfn_t smfn;
498 if ( !(SHADOW_AUDIT_ENABLE) )
499 return;
501 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
502 if ( mfn_valid(gw->l4mfn)
503 && mfn_valid((smfn = get_shadow_status(v, gw->l4mfn,
504 SH_type_l4_shadow))) )
505 (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
506 if ( mfn_valid(gw->l3mfn)
507 && mfn_valid((smfn = get_shadow_status(v, gw->l3mfn,
508 SH_type_l3_shadow))) )
509 (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
510 #endif /* PAE or 64... */
511 if ( mfn_valid(gw->l2mfn) )
512 {
513 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
514 SH_type_l2_shadow))) )
515 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
516 #if GUEST_PAGING_LEVELS == 3
517 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
518 SH_type_l2h_shadow))) )
519 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
520 #endif
521 }
522 if ( mfn_valid(gw->l1mfn)
523 && mfn_valid((smfn = get_shadow_status(v, gw->l1mfn,
524 SH_type_l1_shadow))) )
525 (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
526 else if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PRESENT)
527 && (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)
528 && mfn_valid(
529 (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(gw->l2e)))) )
530 (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
531 }
533 #else
534 #define sh_audit_gw(_v, _gw) do {} while(0)
535 #endif /* audit code */
538 #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS) && (CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS)
539 void *
540 sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
541 unsigned long *gl1mfn)
542 {
543 void *pl1e = NULL;
544 walk_t gw;
546 ASSERT(shadow_mode_translate(v->domain));
548 // XXX -- this is expensive, but it's easy to cobble together...
549 // FIXME!
551 shadow_lock(v->domain);
552 if ( guest_walk_tables(v, addr, &gw, PFEC_page_present, 1) == 0
553 && mfn_valid(gw.l1mfn) )
554 {
555 if ( gl1mfn )
556 *gl1mfn = mfn_x(gw.l1mfn);
557 pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
558 (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
559 }
561 shadow_unlock(v->domain);
563 return pl1e;
564 }
566 void
567 sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
568 {
569 walk_t gw;
571 ASSERT(shadow_mode_translate(v->domain));
573 // XXX -- this is expensive, but it's easy to cobble together...
574 // FIXME!
576 shadow_lock(v->domain);
577 (void) guest_walk_tables(v, addr, &gw, PFEC_page_present, 1);
578 *(guest_l1e_t *)eff_l1e = gw.l1e;
579 shadow_unlock(v->domain);
580 }
581 #endif /* CONFIG==SHADOW==GUEST */
583 /**************************************************************************/
584 /* Functions to compute the correct index into a shadow page, given an
585 * index into the guest page (as returned by guest_index()).
586 * This is trivial when the shadow and guest use the same sized PTEs, but
587 * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
588 * PAE- or 64-bit shadows).
589 *
590 * These functions also increment the shadow mfn, when necessary. When PTE
591 * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
592 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
593 * use simple pointer arithmetic on a pointer to the guest L1e to figure out
594 * which shadow page we really want. Similarly, when PTE sizes are
595 * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
596 * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
597 * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
598 * space.)
599 *
600 * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
601 * of shadow (to store both the shadow, and the info that would normally be
602 * stored in page_info fields). This arrangement allows the shadow and the
603 * "page_info" fields to always be stored in the same page (in fact, in
604 * the same cache line), avoiding an extra call to map_domain_page().
605 */
607 static inline u32
608 guest_index(void *ptr)
609 {
610 return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
611 }
613 static u32
614 shadow_l1_index(mfn_t *smfn, u32 guest_index)
615 {
616 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
617 *smfn = _mfn(mfn_x(*smfn) +
618 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
619 return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
620 #else
621 return guest_index;
622 #endif
623 }
625 static u32
626 shadow_l2_index(mfn_t *smfn, u32 guest_index)
627 {
628 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
629 // Because we use 2 shadow l2 entries for each guest entry, the number of
630 // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
631 //
632 *smfn = _mfn(mfn_x(*smfn) +
633 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
635 // We multiply by two to get the index of the first of the two entries
636 // used to shadow the specified guest entry.
637 return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
638 #else
639 return guest_index;
640 #endif
641 }
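/* Worked example for the mismatched (32-bit guest on PAE/64-bit shadow)
 * case above: a 32-bit guest l1 has 1024 entries but a shadow l1 page
 * holds only 512, so guest index 612 selects the second shadow page
 * (612 / 512 == 1) at offset 100 (612 % 512). For l2s, each guest entry
 * takes two shadow entries, so guest index 300 selects shadow page
 * 300 / 256 == 1, entry (300 % 256) * 2 == 88. */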
643 #if GUEST_PAGING_LEVELS >= 4
645 static u32
646 shadow_l3_index(mfn_t *smfn, u32 guest_index)
647 {
648 return guest_index;
649 }
651 static u32
652 shadow_l4_index(mfn_t *smfn, u32 guest_index)
653 {
654 return guest_index;
655 }
657 #endif // GUEST_PAGING_LEVELS >= 4
659 extern u32 get_pat_flags(struct vcpu *v,
660 u32 gl1e_flags,
661 paddr_t gpaddr,
662 paddr_t spaddr);
664 unsigned char pat_type_2_pte_flags(unsigned char pat_type);
665 /**************************************************************************/
666 /* Function which computes shadow entries from their corresponding guest
667 * entries. This is the "heart" of the shadow code. It operates using
668 * level-1 shadow types, but handles all levels of entry.
669 * Don't call it directly, but use the four wrappers below.
670 */
672 static always_inline void
673 _sh_propagate(struct vcpu *v,
674 guest_intpte_t guest_intpte,
675 mfn_t target_mfn,
676 void *shadow_entry_ptr,
677 int level,
678 fetch_type_t ft,
679 p2m_type_t p2mt)
680 {
681 guest_l1e_t guest_entry = { guest_intpte };
682 shadow_l1e_t *sp = shadow_entry_ptr;
683 struct domain *d = v->domain;
684 gfn_t target_gfn = guest_l1e_get_gfn(guest_entry);
685 u32 pass_thru_flags;
686 u32 gflags, sflags;
688 /* We don't shadow PAE l3s */
689 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
691 /* Check there's something for the shadows to map to */
692 if ( !p2m_is_valid(p2mt) )
693 {
694 *sp = shadow_l1e_empty();
695 goto done;
696 }
698 gflags = guest_l1e_get_flags(guest_entry);
700 if ( unlikely(!(gflags & _PAGE_PRESENT)) )
701 {
702 /* If a guest l1 entry is not present, shadow with the magic
703 * guest-not-present entry. */
704 if ( level == 1 )
705 *sp = sh_l1e_gnp();
706 else
707 *sp = shadow_l1e_empty();
708 goto done;
709 }
711 if ( level == 1 && p2mt == p2m_mmio_dm )
712 {
713 /* Guest l1e maps emulated MMIO space */
714 *sp = sh_l1e_mmio(target_gfn, gflags);
715 if ( !d->arch.paging.shadow.has_fast_mmio_entries )
716 d->arch.paging.shadow.has_fast_mmio_entries = 1;
717 goto done;
718 }
720 // Must have a valid target_mfn unless this is a prefetch or an l1
721 // pointing at MMIO space. In the case of a prefetch, an invalid
722 // mfn means that we can not usefully shadow anything, and so we
723 // return early.
724 //
725 if ( !mfn_valid(target_mfn)
726 && !(level == 1 && (!shadow_mode_refcounts(d)
727 || p2mt == p2m_mmio_direct)) )
728 {
729 ASSERT((ft == ft_prefetch));
730 *sp = shadow_l1e_empty();
731 goto done;
732 }
734 // Propagate bits from the guest to the shadow.
735 // Some of these may be overwritten, below.
736 // Since we know the guest's PRESENT bit is set, we also set the shadow's
737 // SHADOW_PRESENT bit.
738 //
739 pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER |
740 _PAGE_RW | _PAGE_PRESENT);
741 if ( guest_supports_nx(v) )
742 pass_thru_flags |= _PAGE_NX_BIT;
743 if ( !shadow_mode_refcounts(d) && !mfn_valid(target_mfn) )
744 pass_thru_flags |= _PAGE_PAT | _PAGE_PCD | _PAGE_PWT;
745 sflags = gflags & pass_thru_flags;
747 /*
748 * For HVM domains with direct access to MMIO areas, set the correct
749 * caching attributes in the shadows to match what was asked for.
750 */
751 if ( (level == 1) && is_hvm_domain(d) &&
752 !list_empty(&(domain_hvm_iommu(d)->pdev_list)) &&
753 !is_xen_heap_mfn(mfn_x(target_mfn)) )
754 {
755 unsigned int type;
756 if ( hvm_get_mem_pinned_cacheattr(d, gfn_x(target_gfn), &type) )
757 sflags |= pat_type_2_pte_flags(type);
758 else if ( d->arch.hvm_domain.is_in_uc_mode )
759 sflags |= pat_type_2_pte_flags(PAT_TYPE_UNCACHABLE);
760 else
761 sflags |= get_pat_flags(v,
762 gflags,
763 gfn_to_paddr(target_gfn),
764 mfn_x(target_mfn) << PAGE_SHIFT);
765 }
767 // Set the A&D bits for higher level shadows.
768 // Higher level entries do not, strictly speaking, have dirty bits, but
769 // since we use shadow linear tables, each of these entries may, at some
770 // point in time, also serve as a shadow L1 entry.
771 // By setting both the A&D bits in each of these, we eliminate the burden
772 // on the hardware to update these bits on initial accesses.
773 //
774 if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
775 sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
777 // If the A or D bit has not yet been set in the guest, then we must
778 // prevent the corresponding kind of access.
779 //
780 if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
781 sflags &= ~_PAGE_PRESENT;
783 /* D bits exist in L1es and PSE L2es */
784 if ( unlikely(((level == 1) ||
785 ((level == 2) &&
786 (gflags & _PAGE_PSE) &&
787 guest_supports_superpages(v)))
788 && !(gflags & _PAGE_DIRTY)) )
789 sflags &= ~_PAGE_RW;
791 // shadow_mode_log_dirty support
792 //
793 // Only allow the guest write access to a page a) on a demand fault,
794 // or b) if the page is already marked as dirty.
795 //
796 // (We handle log-dirty entirely inside the shadow code, without using the
797 // p2m_ram_logdirty p2m type: only HAP uses that.)
798 if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
799 {
800 if ( mfn_valid(target_mfn) ) {
801 if ( ft & FETCH_TYPE_WRITE )
802 paging_mark_dirty(d, mfn_x(target_mfn));
803 else if ( !sh_mfn_is_dirty(d, target_mfn) )
804 sflags &= ~_PAGE_RW;
805 }
806 }
808 /* Read-only memory */
809 if ( p2mt == p2m_ram_ro )
810 sflags &= ~_PAGE_RW;
812 // protect guest page tables
813 //
814 if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
815 {
816 if ( shadow_mode_trap_reads(d) )
817 {
818 // if we are trapping both reads & writes, then mark this page
819 // as not present...
820 //
821 sflags &= ~_PAGE_PRESENT;
822 }
823 else
824 {
825 // otherwise, just prevent any writes...
826 //
827 sflags &= ~_PAGE_RW;
828 }
829 }
831 // PV guests in 64-bit mode use two different page tables for user vs
832 // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
833 // It is always shadowed as present...
834 if ( (GUEST_PAGING_LEVELS == 4) && !is_pv_32on64_domain(d)
835 && !is_hvm_domain(d) )
836 {
837 sflags |= _PAGE_USER;
838 }
840 *sp = shadow_l1e_from_mfn(target_mfn, sflags);
842 done:
843 SHADOW_DEBUG(PROPAGATE,
844 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
845 fetch_type_names[ft], level, guest_entry.l1, sp->l1);
846 }
849 /* These four wrappers give us a little bit of type-safety back around
850 * the use of void-* pointers and intpte types in _sh_propagate(), and
851 * allow the compiler to optimize out some level checks. */
853 #if GUEST_PAGING_LEVELS >= 4
854 static void
855 l4e_propagate_from_guest(struct vcpu *v,
856 guest_l4e_t gl4e,
857 mfn_t sl3mfn,
858 shadow_l4e_t *sl4e,
859 fetch_type_t ft)
860 {
861 _sh_propagate(v, gl4e.l4, sl3mfn, sl4e, 4, ft, p2m_ram_rw);
862 }
864 static void
865 l3e_propagate_from_guest(struct vcpu *v,
866 guest_l3e_t gl3e,
867 mfn_t sl2mfn,
868 shadow_l3e_t *sl3e,
869 fetch_type_t ft)
870 {
871 _sh_propagate(v, gl3e.l3, sl2mfn, sl3e, 3, ft, p2m_ram_rw);
872 }
873 #endif // GUEST_PAGING_LEVELS >= 4
875 static void
876 l2e_propagate_from_guest(struct vcpu *v,
877 guest_l2e_t gl2e,
878 mfn_t sl1mfn,
879 shadow_l2e_t *sl2e,
880 fetch_type_t ft)
881 {
882 _sh_propagate(v, gl2e.l2, sl1mfn, sl2e, 2, ft, p2m_ram_rw);
883 }
885 static void
886 l1e_propagate_from_guest(struct vcpu *v,
887 guest_l1e_t gl1e,
888 mfn_t gmfn,
889 shadow_l1e_t *sl1e,
890 fetch_type_t ft,
891 p2m_type_t p2mt)
892 {
893 _sh_propagate(v, gl1e.l1, gmfn, sl1e, 1, ft, p2mt);
894 }
897 /**************************************************************************/
898 /* These functions update shadow entries (and do bookkeeping on the shadow
899 * tables they are in). It is intended that they are the only
900 * functions which ever write (non-zero) data onto a shadow page.
901 */
903 static inline void safe_write_entry(void *dst, void *src)
904 /* Copy one PTE safely when processors might be running on the
905 * destination pagetable. This does *not* give safety against
906 * concurrent writes (that's what the shadow lock is for), just
907 * stops the hardware picking up partially written entries. */
908 {
909 volatile unsigned long *d = dst;
910 unsigned long *s = src;
911 ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
912 #if CONFIG_PAGING_LEVELS == 3
913 /* In PAE mode, pagetable entries are larger
914 * than machine words, so won't get written atomically. We need to make
915 * sure any other cpu running on these shadows doesn't see a
916 * half-written entry. Do this by marking the entry not-present first,
917 * then writing the high word before the low word. */
918 BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
919 d[0] = 0;
920 d[1] = s[1];
921 d[0] = s[0];
922 #else
923 /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
924 * which will be an atomic write, since the entry is aligned. */
925 BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
926 *d = *s;
927 #endif
928 }
931 static inline void
932 shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
933 /* This function does the actual writes to shadow pages.
934 * It must not be called directly, since it doesn't do the bookkeeping
935 * that shadow_set_l*e() functions do. */
936 {
937 shadow_l1e_t *dst = d;
938 shadow_l1e_t *src = s;
939 void *map = NULL;
940 int i;
942 /* Because we mirror access rights at all levels in the shadow, an
943 * l2 (or higher) entry with the RW bit cleared will leave us with
944 * no write access through the linear map.
945 * We detect that by writing to the shadow with copy_to_user() and
946 * using map_domain_page() to get a writeable mapping if we need to. */
947 if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
948 {
949 perfc_incr(shadow_linear_map_failed);
950 map = sh_map_domain_page(mfn);
951 ASSERT(map != NULL);
952 dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
953 }
956 for ( i = 0; i < entries; i++ )
957 safe_write_entry(dst++, src++);
959 if ( map != NULL ) sh_unmap_domain_page(map);
960 }
962 static inline int
963 perms_strictly_increased(u32 old_flags, u32 new_flags)
964 /* Given the flags of two entries, are the new flags a strict
965 * increase in rights over the old ones? */
966 {
967 u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
968 u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
969 /* Flip the NX bit, since it's the only one that decreases rights;
970 * we calculate as if it were an "X" bit. */
971 of ^= _PAGE_NX_BIT;
972 nf ^= _PAGE_NX_BIT;
973 /* If the changed bits are all set in the new flags, then rights strictly
974 * increased between old and new. */
975 return ((of | (of ^ nf)) == nf);
976 }
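/* Example: going from (PRESENT|RW) to (PRESENT|RW|USER) is a strict
 * increase (the only changed bit, USER, is set in the new flags), but
 * going from (PRESENT|RW) to (PRESENT|USER) is not, because RW was lost.
 * The shadow_set_l*e() callers below use this to decide whether a TLB
 * flush can be skipped when replacing an entry. */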
978 static int inline
979 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
980 {
981 int res;
982 mfn_t mfn;
983 struct domain *owner;
985 ASSERT(!sh_l1e_is_magic(sl1e));
987 if ( !shadow_mode_refcounts(d) )
988 return 1;
990 res = get_page_from_l1e(sl1e, d);
992 // If a privileged domain is attempting to install a map of a page it does
993 // not own, we let it succeed anyway.
994 //
995 if ( unlikely(!res) &&
996 IS_PRIV(d) &&
997 !shadow_mode_translate(d) &&
998 mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
999 (owner = page_get_owner(mfn_to_page(mfn))) &&
1000 (d != owner) )
1001 {
1002 res = get_page_from_l1e(sl1e, owner);
1003 SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
1004 "which is owned by domain %d: %s\n",
1005 d->domain_id, mfn_x(mfn), owner->domain_id,
1006 res ? "success" : "failed");
1007 }
1009 if ( unlikely(!res) )
1010 {
1011 perfc_incr(shadow_get_page_fail);
1012 SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n");
1013 }
1015 return res;
1016 }
1018 static void inline
1019 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
1020 {
1021 if ( !shadow_mode_refcounts(d) )
1022 return;
1024 put_page_from_l1e(sl1e, d);
1025 }
1027 #if GUEST_PAGING_LEVELS >= 4
1028 static int shadow_set_l4e(struct vcpu *v,
1029 shadow_l4e_t *sl4e,
1030 shadow_l4e_t new_sl4e,
1031 mfn_t sl4mfn)
1033 int flags = 0, ok;
1034 shadow_l4e_t old_sl4e;
1035 paddr_t paddr;
1036 ASSERT(sl4e != NULL);
1037 old_sl4e = *sl4e;
1039 if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
1041 paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
1042 | (((unsigned long)sl4e) & ~PAGE_MASK));
1044 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
1046 /* About to install a new reference */
1047 mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
1048 ok = sh_get_ref(v, sl3mfn, paddr);
1049 /* Are we pinning l3 shadows to handle weird linux behaviour? */
1050 if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
1051 ok |= sh_pin(v, sl3mfn);
1052 if ( !ok )
1054 domain_crash(v->domain);
1055 return SHADOW_SET_ERROR;
1059 /* Write the new entry */
1060 shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
1061 flags |= SHADOW_SET_CHANGED;
1063 if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
1065 /* We lost a reference to an old mfn. */
1066 mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
1067 if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
1068 || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
1069 shadow_l4e_get_flags(new_sl4e)) )
1071 flags |= SHADOW_SET_FLUSH;
1073 sh_put_ref(v, osl3mfn, paddr);
1075 return flags;
1078 static int shadow_set_l3e(struct vcpu *v,
1079 shadow_l3e_t *sl3e,
1080 shadow_l3e_t new_sl3e,
1081 mfn_t sl3mfn)
1083 int flags = 0;
1084 shadow_l3e_t old_sl3e;
1085 paddr_t paddr;
1086 ASSERT(sl3e != NULL);
1087 old_sl3e = *sl3e;
1089 if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
1091 paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1092 | (((unsigned long)sl3e) & ~PAGE_MASK));
1094 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
1095 /* About to install a new reference */
1096 if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
1098 domain_crash(v->domain);
1099 return SHADOW_SET_ERROR;
1102 /* Write the new entry */
1103 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
1104 flags |= SHADOW_SET_CHANGED;
1106 if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
1108 /* We lost a reference to an old mfn. */
1109 mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
1110 if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
1111 !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
1112 shadow_l3e_get_flags(new_sl3e)) )
1114 flags |= SHADOW_SET_FLUSH;
1116 sh_put_ref(v, osl2mfn, paddr);
1118 return flags;
1120 #endif /* GUEST_PAGING_LEVELS >= 4 */
1122 static int shadow_set_l2e(struct vcpu *v,
1123 shadow_l2e_t *sl2e,
1124 shadow_l2e_t new_sl2e,
1125 mfn_t sl2mfn)
1127 int flags = 0;
1128 shadow_l2e_t old_sl2e;
1129 paddr_t paddr;
1131 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1132 /* In 2-on-3 we work with pairs of l2es pointing at two-page
1133 * shadows. Reference counting and up-pointers track from the first
1134 * page of the shadow to the first l2e, so make sure that we're
1135 * working with those:
1136 * Align the pointer down so it's pointing at the first of the pair */
1137 sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
1138 /* Align the mfn of the shadow entry too */
1139 new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
1140 #endif
1142 ASSERT(sl2e != NULL);
1143 old_sl2e = *sl2e;
1145 if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
1147 paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
1148 | (((unsigned long)sl2e) & ~PAGE_MASK));
1150 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
1151 /* About to install a new reference */
1152 if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) )
1154 domain_crash(v->domain);
1155 return SHADOW_SET_ERROR;
1158 /* Write the new entry */
1159 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1161 shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
1162 /* The l1 shadow is two pages long and needs to be pointed to by
1163 * two adjacent l2es. The pair have the same flags, but point
1164 * at odd and even MFNs */
1165 ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
1166 pair[1].l2 |= (1<<PAGE_SHIFT);
1167 shadow_write_entries(sl2e, &pair, 2, sl2mfn);
1169 #else /* normal case */
1170 shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1171 #endif
1172 flags |= SHADOW_SET_CHANGED;
1174 if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1176 /* We lost a reference to an old mfn. */
1177 mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1178 if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1179 !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1180 shadow_l2e_get_flags(new_sl2e)) )
1182 flags |= SHADOW_SET_FLUSH;
1184 sh_put_ref(v, osl1mfn, paddr);
1186 return flags;
1189 static int shadow_set_l1e(struct vcpu *v,
1190 shadow_l1e_t *sl1e,
1191 shadow_l1e_t new_sl1e,
1192 mfn_t sl1mfn)
1194 int flags = 0;
1195 struct domain *d = v->domain;
1196 shadow_l1e_t old_sl1e;
1197 ASSERT(sl1e != NULL);
1199 old_sl1e = *sl1e;
1201 if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1203 if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1204 && !sh_l1e_is_magic(new_sl1e) )
1206 /* About to install a new reference */
1207 if ( shadow_mode_refcounts(d) ) {
1208 if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
1210 /* Doesn't look like a pagetable. */
1211 flags |= SHADOW_SET_ERROR;
1212 new_sl1e = shadow_l1e_empty();
1217 /* Write the new entry */
1218 shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1219 flags |= SHADOW_SET_CHANGED;
1221 if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1222 && !sh_l1e_is_magic(old_sl1e) )
1224 /* We lost a reference to an old mfn. */
1225 /* N.B. Unlike higher-level sets, never need an extra flush
1226 * when writing an l1e. Because it points to the same guest frame
1227 * as the guest l1e did, it's the guest's responsibility to
1228 * trigger a flush later. */
1229 if ( shadow_mode_refcounts(d) )
1231 shadow_put_page_from_l1e(old_sl1e, d);
1234 return flags;
1238 /**************************************************************************/
1239 /* Macros to walk pagetables. These take the shadow of a pagetable and
1240 * walk every "interesting" entry. That is, they don't touch Xen mappings,
1241 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
1242 * second entry (since pairs of entries are managed together). For multi-page
1243 * shadows they walk all pages.
1245 * Arguments are an MFN, the variable to point to each entry, a variable
1246 * to indicate that we are done (we will shortcut to the end of the scan
1247 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
1248 * and the code.
1250 * WARNING: These macros have side-effects. They change the values of both
1251 * the pointer and the MFN. */
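/* Sketch of a typical invocation (the caller-declared names here are
 * illustrative only):
 *
 *     int done = 0;
 *     shadow_l1e_t *sl1e;
 *     guest_l1e_t *gl1p = ...;   // may be NULL if no guest entry is tracked
 *     SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1p, done,
 *     {
 *         // examine or rewrite *sl1e here; set done to stop early
 *     });
 *
 * where sl1mfn is the mfn of the shadow l1 being walked. */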
1253 static inline void increment_ptr_to_guest_entry(void *ptr)
1254 {
1255 if ( ptr )
1256 {
1257 guest_l1e_t **entry = ptr;
1258 (*entry)++;
1259 }
1260 }
1262 /* All kinds of l1: touch all entries */
1263 #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1264 do { \
1265 int _i; \
1266 shadow_l1e_t *_sp = map_shadow_page((_sl1mfn)); \
1267 ASSERT(mfn_to_shadow_page(_sl1mfn)->type == SH_type_l1_shadow \
1268 || mfn_to_shadow_page(_sl1mfn)->type == SH_type_fl1_shadow); \
1269 for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1270 { \
1271 (_sl1e) = _sp + _i; \
1272 if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1273 {_code} \
1274 if ( _done ) break; \
1275 increment_ptr_to_guest_entry(_gl1p); \
1276 } \
1277 unmap_shadow_page(_sp); \
1278 } while (0)
1280 /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1281 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1282 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1283 do { \
1284 int __done = 0; \
1285 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1286 ({ (__done = _done); }), _code); \
1287 _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1288 if ( !__done ) \
1289 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1290 ({ (__done = _done); }), _code); \
1291 } while (0)
1292 #else /* Everything else; l1 shadows are only one page */
1293 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1294 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1295 #endif
1298 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1300 /* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1301 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1302 do { \
1303 int _i, _j, __done = 0; \
1304 int _xen = !shadow_mode_external(_dom); \
1305 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1306 for ( _j = 0; _j < 4 && !__done; _j++ ) \
1307 { \
1308 shadow_l2e_t *_sp = map_shadow_page(_sl2mfn); \
1309 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1310 if ( (!(_xen)) \
1311 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1312 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1313 { \
1314 (_sl2e) = _sp + _i; \
1315 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1316 {_code} \
1317 if ( (__done = (_done)) ) break; \
1318 increment_ptr_to_guest_entry(_gl2p); \
1319 } \
1320 unmap_shadow_page(_sp); \
1321 _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1322 } \
1323 } while (0)
1325 #elif GUEST_PAGING_LEVELS == 2
1327 /* 32-bit on 32-bit: avoid Xen entries */
1328 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1329 do { \
1330 int _i; \
1331 int _xen = !shadow_mode_external(_dom); \
1332 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1333 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1334 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1335 if ( (!(_xen)) \
1336 || \
1337 (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1338 { \
1339 (_sl2e) = _sp + _i; \
1340 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1341 {_code} \
1342 if ( _done ) break; \
1343 increment_ptr_to_guest_entry(_gl2p); \
1344 } \
1345 unmap_shadow_page(_sp); \
1346 } while (0)
1348 #elif GUEST_PAGING_LEVELS == 3
1350 /* PAE: if it's an l2h, don't touch Xen mappings */
1351 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1352 do { \
1353 int _i; \
1354 int _xen = !shadow_mode_external(_dom); \
1355 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1356 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_pae_shadow \
1357 || mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_pae_shadow);\
1358 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1359 if ( (!(_xen)) \
1360 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_pae_shadow\
1361 || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1362 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1363 { \
1364 (_sl2e) = _sp + _i; \
1365 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1366 {_code} \
1367 if ( _done ) break; \
1368 increment_ptr_to_guest_entry(_gl2p); \
1369 } \
1370 unmap_shadow_page(_sp); \
1371 } while (0)
1373 #else
1375 /* 64-bit l2: touch all entries except for PAE compat guests. */
1376 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1377 do { \
1378 int _i; \
1379 int _xen = !shadow_mode_external(_dom); \
1380 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1381 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_64_shadow || \
1382 mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_64_shadow); \
1383 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1384 { \
1385 if ( (!(_xen)) \
1386 || !is_pv_32on64_domain(_dom) \
1387 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_64_shadow \
1388 || (_i < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom)) ) \
1389 { \
1390 (_sl2e) = _sp + _i; \
1391 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1392 {_code} \
1393 if ( _done ) break; \
1394 increment_ptr_to_guest_entry(_gl2p); \
1395 } \
1396 } \
1397 unmap_shadow_page(_sp); \
1398 } while (0)
1400 #endif /* different kinds of l2 */
1402 #if GUEST_PAGING_LEVELS == 4
1404 /* 64-bit l3: touch all entries */
1405 #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1406 do { \
1407 int _i; \
1408 shadow_l3e_t *_sp = map_shadow_page((_sl3mfn)); \
1409 ASSERT(mfn_to_shadow_page(_sl3mfn)->type == SH_type_l3_64_shadow); \
1410 for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1411 { \
1412 (_sl3e) = _sp + _i; \
1413 if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1414 {_code} \
1415 if ( _done ) break; \
1416 increment_ptr_to_guest_entry(_gl3p); \
1417 } \
1418 unmap_shadow_page(_sp); \
1419 } while (0)
1421 /* 64-bit l4: avoid Xen mappings */
1422 #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _dom, _code) \
1423 do { \
1424 shadow_l4e_t *_sp = map_shadow_page((_sl4mfn)); \
1425 int _xen = !shadow_mode_external(_dom); \
1426 int _i; \
1427 ASSERT(mfn_to_shadow_page(_sl4mfn)->type == SH_type_l4_64_shadow); \
1428 for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1429 { \
1430 if ( (!(_xen)) || is_guest_l4_slot(_dom, _i) ) \
1431 { \
1432 (_sl4e) = _sp + _i; \
1433 if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1434 {_code} \
1435 if ( _done ) break; \
1436 } \
1437 increment_ptr_to_guest_entry(_gl4p); \
1438 } \
1439 unmap_shadow_page(_sp); \
1440 } while (0)
1442 #endif
1446 /**************************************************************************/
1447 /* Functions to install Xen mappings and linear mappings in shadow pages */
1449 // XXX -- this function should probably be moved to shadow-common.c, but that
1450 // probably wants to wait until the shadow types have been moved from
1451 // shadow-types.h to shadow-private.h
1452 //
1453 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1454 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1456 struct domain *d = v->domain;
1457 shadow_l4e_t *sl4e;
1459 sl4e = sh_map_domain_page(sl4mfn);
1460 ASSERT(sl4e != NULL);
1461 ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1463 /* Copy the common Xen mappings from the idle domain */
1464 memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1465 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1466 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1468 /* Install the per-domain mappings for this domain */
1469 sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1470 shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1471 __PAGE_HYPERVISOR);
1473 /* Linear mapping */
1474 sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1475 shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1477 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1479 // linear tables may not be used with translated PV guests
1480 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1481 shadow_l4e_empty();
1483 else
1485 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1486 shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1489 if ( shadow_mode_translate(v->domain) )
1491 /* install domain-specific P2M table */
1492 sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1493 shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1494 __PAGE_HYPERVISOR);
1497 if ( is_pv_32on64_domain(v->domain) )
1499 /* install compat arg xlat entry */
1500 sl4e[shadow_l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
1501 shadow_l4e_from_mfn(
1502 page_to_mfn(virt_to_page(d->arch.mm_arg_xlat_l3)),
1503 __PAGE_HYPERVISOR);
1506 sh_unmap_domain_page(sl4e);
1508 #endif
1510 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1511 // For 3-on-3 PV guests, we need to make sure the xen mappings are in
1512 // place, which means that we need to populate the l2h entry in the l3
1513 // table.
1515 static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn)
1517 struct domain *d = v->domain;
1518 shadow_l2e_t *sl2e;
1519 #if CONFIG_PAGING_LEVELS == 3
1520 int i;
1521 #else
1523 if ( !is_pv_32on64_vcpu(v) )
1524 return;
1525 #endif
1527 sl2e = sh_map_domain_page(sl2hmfn);
1528 ASSERT(sl2e != NULL);
1529 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1531 #if CONFIG_PAGING_LEVELS == 3
1533 /* Copy the common Xen mappings from the idle domain */
1534 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1535 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1536 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1538 /* Install the per-domain mappings for this domain */
1539 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1540 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1541 shadow_l2e_from_mfn(
1542 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1543 __PAGE_HYPERVISOR);
1545 /* We don't set up a linear mapping here because we can't until this
1546 * l2h is installed in an l3e. sh_update_linear_entries() handles
1547 * the linear mappings when CR3 (and so the fourth l3e) is loaded.
1548 * We zero them here, just as a safety measure.
1549 */
1550 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1551 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1552 shadow_l2e_empty();
1553 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1554 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1555 shadow_l2e_empty();
1557 if ( shadow_mode_translate(d) )
1559 /* Install the domain-specific p2m table */
1560 l3_pgentry_t *p2m;
1561 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1562 p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1563 for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1565 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1566 (l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1567 ? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1568 __PAGE_HYPERVISOR)
1569 : shadow_l2e_empty();
1571 sh_unmap_domain_page(p2m);
1574 #else
1576 /* Copy the common Xen mappings from the idle domain */
1577 memcpy(
1578 &sl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1579 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1580 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*sl2e));
1582 #endif
1584 sh_unmap_domain_page(sl2e);
1586 #endif
1589 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1590 void sh_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn)
1592 struct domain *d = v->domain;
1593 shadow_l2e_t *sl2e;
1594 int i;
1596 sl2e = sh_map_domain_page(sl2mfn);
1597 ASSERT(sl2e != NULL);
1598 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1600 /* Copy the common Xen mappings from the idle domain */
1601 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1602 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1603 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1605 /* Install the per-domain mappings for this domain */
1606 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1607 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1608 shadow_l2e_from_mfn(
1609 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1610 __PAGE_HYPERVISOR);
1612 /* Linear mapping */
1613 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1614 shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
1616 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1618 // linear tables may not be used with translated PV guests
1619 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1620 shadow_l2e_empty();
1622 else
1624 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1625 shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
1628 if ( shadow_mode_translate(d) )
1630 /* install domain-specific P2M table */
1631 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] =
1632 shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1633 __PAGE_HYPERVISOR);
1636 sh_unmap_domain_page(sl2e);
1638 #endif
1642 /**************************************************************************/
1643 /* Create a shadow of a given guest page.
1644 */
1645 static mfn_t
1646 sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1648 mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1649 SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1650 mfn_x(gmfn), shadow_type, mfn_x(smfn));
1652 if ( shadow_type != SH_type_l2_32_shadow
1653 && shadow_type != SH_type_l2_pae_shadow
1654 && shadow_type != SH_type_l2h_pae_shadow
1655 && shadow_type != SH_type_l4_64_shadow )
1656 /* Lower-level shadow, not yet linked from a higher level */
1657 mfn_to_shadow_page(smfn)->up = 0;
1659 #if GUEST_PAGING_LEVELS == 4
1660 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
1661 if ( shadow_type == SH_type_l4_64_shadow &&
1662 unlikely(v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
1664 /* We're shadowing a new l4, but we've been assuming the guest uses
1665 * only one l4 per vcpu and context switches using an l4 entry.
1666 * Count the number of active l4 shadows. If there are enough
1667 * of them, decide that this isn't an old linux guest, and stop
1668 * pinning l3es. This is not very quick but it doesn't happen
1669 * very often. */
1670 struct list_head *l, *t;
1671 struct shadow_page_info *sp;
1672 struct vcpu *v2;
1673 int l4count = 0, vcpus = 0;
1674 list_for_each(l, &v->domain->arch.paging.shadow.pinned_shadows)
1676 sp = list_entry(l, struct shadow_page_info, list);
1677 if ( sp->type == SH_type_l4_64_shadow )
1678 l4count++;
1680 for_each_vcpu ( v->domain, v2 )
1681 vcpus++;
1682 if ( l4count > 2 * vcpus )
1684 /* Unpin all the pinned l3 tables, and don't pin any more. */
1685 list_for_each_safe(l, t, &v->domain->arch.paging.shadow.pinned_shadows)
1687 sp = list_entry(l, struct shadow_page_info, list);
1688 if ( sp->type == SH_type_l3_64_shadow )
1689 sh_unpin(v, shadow_page_to_mfn(sp));
1691 v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
1694 #endif
1695 #endif
1697 // Create the Xen mappings...
1698 if ( !shadow_mode_external(v->domain) )
1700 switch (shadow_type)
1702 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1703 case SH_type_l4_shadow:
1704 sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1705 #endif
1706 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1707 case SH_type_l2h_shadow:
1708 sh_install_xen_entries_in_l2h(v, smfn); break;
1709 #endif
1710 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1711 case SH_type_l2_shadow:
1712 sh_install_xen_entries_in_l2(v, gmfn, smfn); break;
1713 #endif
1714 default: /* Do nothing */ break;
1718 shadow_promote(v, gmfn, shadow_type);
1719 set_shadow_status(v, gmfn, shadow_type, smfn);
1721 return smfn;
1724 /* Make a splintered superpage shadow */
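/* (An fl1 shadow maps a guest superpage as individual 4k l1 entries; it has
 * no backing guest l1 table, so it is keyed by the gfn of the superpage
 * rather than by a guest table mfn.) */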
1725 static mfn_t
1726 make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1728 mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
1729 (unsigned long) gfn_x(gfn));
1731 SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n",
1732 gfn_x(gfn), mfn_x(smfn));
1734 set_fl1_shadow_status(v, gfn, smfn);
1735 return smfn;
1739 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1740 mfn_t
1741 sh_make_monitor_table(struct vcpu *v)
1743 struct domain *d = v->domain;
1745 ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1747 /* Guarantee we can get the memory we need */
1748 shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS - 1);
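/* (CONFIG_PAGING_LEVELS - 1 pages is the most we allocate below: the
 * top-level monitor table plus, depending on the build, an l3 in slot 0
 * and an l2 for the Xen/compat mappings.) */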
1750 #if CONFIG_PAGING_LEVELS == 4
1752 mfn_t m4mfn;
1753 m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1754 sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1755 /* Remember the level of this table */
1756 mfn_to_page(m4mfn)->shadow_flags = 4;
1757 #if SHADOW_PAGING_LEVELS < 4
1758 // Install a monitor l3 table in slot 0 of the l4 table.
1759 // This is used for shadow linear maps.
1761 mfn_t m3mfn;
1762 l4_pgentry_t *l4e;
1763 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1764 mfn_to_page(m3mfn)->shadow_flags = 3;
1765 l4e = sh_map_domain_page(m4mfn);
1766 l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1767 sh_unmap_domain_page(l4e);
1768 if ( is_pv_32on64_vcpu(v) )
1770 // Install a monitor l2 table in slot 3 of the l3 table.
1771 // This is used for all Xen entries.
1772 mfn_t m2mfn;
1773 l3_pgentry_t *l3e;
1774 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1775 mfn_to_page(m2mfn)->shadow_flags = 2;
1776 l3e = sh_map_domain_page(m3mfn);
1777 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1778 sh_install_xen_entries_in_l2h(v, m2mfn);
1779 sh_unmap_domain_page(l3e);
1782 #endif /* SHADOW_PAGING_LEVELS < 4 */
1783 return m4mfn;
1786 #elif CONFIG_PAGING_LEVELS == 3
1789 mfn_t m3mfn, m2mfn;
1790 l3_pgentry_t *l3e;
1791 l2_pgentry_t *l2e;
1792 int i;
1794 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1795 /* Remember the level of this table */
1796 mfn_to_page(m3mfn)->shadow_flags = 3;
1798 // Install a monitor l2 table in slot 3 of the l3 table.
1799 // This is used for all Xen entries, including linear maps
1800 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1801 mfn_to_page(m2mfn)->shadow_flags = 2;
1802 l3e = sh_map_domain_page(m3mfn);
1803 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1804 sh_install_xen_entries_in_l2h(v, m2mfn);
1805 /* Install the monitor's own linear map */
1806 l2e = sh_map_domain_page(m2mfn);
1807 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1808 l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1809 (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1810 ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1811 : l2e_empty();
1812 sh_unmap_domain_page(l2e);
1813 sh_unmap_domain_page(l3e);
1815 SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1816 return m3mfn;
1819 #elif CONFIG_PAGING_LEVELS == 2
1822 mfn_t m2mfn;
1823 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1824 sh_install_xen_entries_in_l2(v, m2mfn, m2mfn);
1825 /* Remember the level of this table */
1826 mfn_to_page(m2mfn)->shadow_flags = 2;
1827 return m2mfn;
1830 #else
1831 #error this should not happen
1832 #endif /* CONFIG_PAGING_LEVELS */
1834 #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
1836 /**************************************************************************/
1837 /* These functions also take a virtual address and return the level-N
1838 * shadow table mfn and entry, but they create the shadow pagetables if
1839 * they are needed. The "demand" argument is non-zero when handling
1840 * a demand fault (so we know what to do about accessed bits &c).
1841 * If the necessary tables are not present in the guest, they return NULL. */
1843 /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
1844 * more levels than the guest, the upper levels are always fixed and do not
1845 * reflect any information from the guest, so we do not use these functions
1846 * to access them. */
1848 #if GUEST_PAGING_LEVELS >= 4
1849 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
1850 walk_t *gw,
1851 mfn_t *sl4mfn)
1853 /* There is always a shadow of the top level table. Get it. */
1854 *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1855 /* Reading the top level table is always valid. */
1856 return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
1859 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
1860 walk_t *gw,
1861 mfn_t *sl3mfn,
1862 fetch_type_t ft)
1864 mfn_t sl4mfn;
1865 shadow_l4e_t *sl4e;
1866 if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */
1867 /* Get the l4e */
1868 sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
1869 ASSERT(sl4e != NULL);
1870 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1872 *sl3mfn = shadow_l4e_get_mfn(*sl4e);
1873 ASSERT(mfn_valid(*sl3mfn));
1875 else
1877 int r;
1878 shadow_l4e_t new_sl4e;
1879 /* No l3 shadow installed: find and install it. */
1880 *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
1881 if ( !mfn_valid(*sl3mfn) )
1883 /* No l3 shadow of this page exists at all: make one. */
1884 *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
1886 /* Install the new sl3 table in the sl4e */
1887 l4e_propagate_from_guest(v, gw->l4e, *sl3mfn, &new_sl4e, ft);
1888 r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
1889 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1890 if ( r & SHADOW_SET_ERROR )
1891 return NULL;
1893 /* Now follow it down a level. Guaranteed to succeed. */
1894 return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
1896 #endif /* GUEST_PAGING_LEVELS >= 4 */
1899 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
1900 walk_t *gw,
1901 mfn_t *sl2mfn,
1902 fetch_type_t ft)
1904 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
1905 mfn_t sl3mfn = _mfn(INVALID_MFN);
1906 shadow_l3e_t *sl3e;
1907 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
1908 /* Get the l3e */
1909 sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
1910 if ( sl3e == NULL ) return NULL;
1911 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1913 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1914 ASSERT(mfn_valid(*sl2mfn));
1916 else
1918 int r;
1919 shadow_l3e_t new_sl3e;
1920 unsigned int t = SH_type_l2_shadow;
1922 /* Tag compat L2 containing hypervisor (m2p) mappings */
1923 if ( is_pv_32on64_domain(v->domain) &&
1924 guest_l4_table_offset(gw->va) == 0 &&
1925 guest_l3_table_offset(gw->va) == 3 )
1926 t = SH_type_l2h_shadow;
1928 /* No l2 shadow installed: find and install it. */
1929 *sl2mfn = get_shadow_status(v, gw->l2mfn, t);
1930 if ( !mfn_valid(*sl2mfn) )
1932 /* No l2 shadow of this page exists at all: make one. */
1933 *sl2mfn = sh_make_shadow(v, gw->l2mfn, t);
1935 /* Install the new sl2 table in the sl3e */
1936 l3e_propagate_from_guest(v, gw->l3e, *sl2mfn, &new_sl3e, ft);
1937 r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
1938 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1939 if ( r & SHADOW_SET_ERROR )
1940 return NULL;
1942 /* Now follow it down a level. Guaranteed to succeed. */
1943 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1944 #elif GUEST_PAGING_LEVELS == 3 /* PAE... */
1945 /* We never demand-shadow PAE l3es: they are only created in
1946 * sh_update_cr3(). Check if the relevant sl3e is present. */
1947 shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table)
1948 + shadow_l3_linear_offset(gw->va);
1949 if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
1950 return NULL;
1951 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1952 ASSERT(mfn_valid(*sl2mfn));
1953 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1954 #else /* 32bit... */
1955 /* There is always a shadow of the top level table. Get it. */
1956 *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1957 /* This next line is important: the guest l2 has a 16k
1958 * shadow, and we need to return the right mfn of the four. This
1959 * call will set it for us as a side-effect. */
1960 (void) shadow_l2_index(sl2mfn, guest_l2_table_offset(gw->va));
1961 /* Reading the top level table is always valid. */
1962 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1963 #endif
1967 static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
1968 walk_t *gw,
1969 mfn_t *sl1mfn,
1970 fetch_type_t ft)
1972 mfn_t sl2mfn;
1973 shadow_l2e_t *sl2e;
1975 /* Get the l2e */
1976 sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
1977 if ( sl2e == NULL ) return NULL;
1978 /* Install the sl1 in the l2e if it wasn't there or if we need to
1979 * re-do it to fix a PSE dirty bit. */
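/* (For a PSE superpage the guest dirty bit lives in the l2e, so a demand
 * write arriving through a not-yet-writable sl2e falls into the
 * re-propagate path below rather than reusing the existing fl1.) */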
1980 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
1981 && likely(ft != ft_demand_write
1982 || (shadow_l2e_get_flags(*sl2e) & _PAGE_RW)
1983 || !(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
1985 *sl1mfn = shadow_l2e_get_mfn(*sl2e);
1986 ASSERT(mfn_valid(*sl1mfn));
1988 else
1990 shadow_l2e_t new_sl2e;
1991 int r, flags = guest_l2e_get_flags(gw->l2e);
1992 /* No l1 shadow installed: find and install it. */
1993 if ( !(flags & _PAGE_PRESENT) )
1994 return NULL; /* No guest page. */
1995 if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
1997 /* Splintering a superpage */
1998 gfn_t l2gfn = guest_l2e_get_gfn(gw->l2e);
1999 *sl1mfn = get_fl1_shadow_status(v, l2gfn);
2000 if ( !mfn_valid(*sl1mfn) )
2002 /* No fl1 shadow of this superpage exists at all: make one. */
2003 *sl1mfn = make_fl1_shadow(v, l2gfn);
2006 else
2008 /* Shadowing an actual guest l1 table */
2009 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
2010 *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
2011 if ( !mfn_valid(*sl1mfn) )
2013 /* No l1 shadow of this page exists at all: make one. */
2014 *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
2017 /* Install the new sl1 table in the sl2e */
2018 l2e_propagate_from_guest(v, gw->l2e, *sl1mfn, &new_sl2e, ft);
2019 r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
2020 ASSERT((r & SHADOW_SET_FLUSH) == 0);
2021 if ( r & SHADOW_SET_ERROR )
2022 return NULL;
2023 /* This next line is important: in 32-on-PAE and 32-on-64 modes,
2024 * the guest l1 table has an 8k shadow, and we need to return
2025 * the right mfn of the pair. This call will set it for us as a
2026 * side-effect. (In all other cases, it's a no-op and will be
2027 * compiled out.) */
2028 (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
2030 /* Now follow it down a level. Guaranteed to succeed. */
2031 return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
2036 /**************************************************************************/
2037 /* Destructors for shadow tables:
2038 * Unregister the shadow, decrement refcounts of any entries present in it,
2039 * and release the memory.
2041 * N.B. These destructors do not clear the contents of the shadows.
2042 * This allows us to delay TLB shootdowns until the page is being reused.
2043 * See shadow_alloc() and shadow_free() for how this is handled.
2044 */
2046 #if GUEST_PAGING_LEVELS >= 4
2047 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
2049 shadow_l4e_t *sl4e;
2050 u32 t = mfn_to_shadow_page(smfn)->type;
2051 mfn_t gmfn, sl4mfn;
2053 SHADOW_DEBUG(DESTROY_SHADOW,
2054 "%s(%05lx)\n", __func__, mfn_x(smfn));
2055 ASSERT(t == SH_type_l4_shadow);
2057 /* Record that the guest page isn't shadowed any more (in this type) */
2058 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2059 delete_shadow_status(v, gmfn, t, smfn);
2060 shadow_demote(v, gmfn, t);
2061 /* Decrement refcounts of all the old entries */
2062 sl4mfn = smfn;
2063 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2064 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
2066 sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
2067 (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
2068 | ((unsigned long)sl4e & ~PAGE_MASK));
2070 });
2072 /* Put the memory back in the pool */
2073 shadow_free(v->domain, smfn);
2076 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
2078 shadow_l3e_t *sl3e;
2079 u32 t = mfn_to_shadow_page(smfn)->type;
2080 mfn_t gmfn, sl3mfn;
2082 SHADOW_DEBUG(DESTROY_SHADOW,
2083 "%s(%05lx)\n", __func__, mfn_x(smfn));
2084 ASSERT(t == SH_type_l3_shadow);
2086 /* Record that the guest page isn't shadowed any more (in this type) */
2087 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2088 delete_shadow_status(v, gmfn, t, smfn);
2089 shadow_demote(v, gmfn, t);
2091 /* Decrement refcounts of all the old entries */
2092 sl3mfn = smfn;
2093 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
2094 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
2095 sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
2096 (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
2097 | ((unsigned long)sl3e & ~PAGE_MASK));
2098 });
2100 /* Put the memory back in the pool */
2101 shadow_free(v->domain, smfn);
2103 #endif /* GUEST_PAGING_LEVELS >= 4 */
2106 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
2108 shadow_l2e_t *sl2e;
2109 u32 t = mfn_to_shadow_page(smfn)->type;
2110 mfn_t gmfn, sl2mfn;
2112 SHADOW_DEBUG(DESTROY_SHADOW,
2113 "%s(%05lx)\n", __func__, mfn_x(smfn));
2115 #if GUEST_PAGING_LEVELS >= 3
2116 ASSERT(t == SH_type_l2_shadow || t == SH_type_l2h_shadow);
2117 #else
2118 ASSERT(t == SH_type_l2_shadow);
2119 #endif
2121 /* Record that the guest page isn't shadowed any more (in this type) */
2122 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2123 delete_shadow_status(v, gmfn, t, smfn);
2124 shadow_demote(v, gmfn, t);
2126 /* Decrement refcounts of all the old entries */
2127 sl2mfn = smfn;
2128 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2129 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
2130 sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
2131 (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
2132 | ((unsigned long)sl2e & ~PAGE_MASK));
2133 });
2135 /* Put the memory back in the pool */
2136 shadow_free(v->domain, smfn);
2139 void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
2141 struct domain *d = v->domain;
2142 shadow_l1e_t *sl1e;
2143 u32 t = mfn_to_shadow_page(smfn)->type;
2145 SHADOW_DEBUG(DESTROY_SHADOW,
2146 "%s(%05lx)\n", __func__, mfn_x(smfn));
2147 ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow);
2149 /* Record that the guest page isn't shadowed any more (in this type) */
2150 if ( t == SH_type_fl1_shadow )
2152 gfn_t gfn = _gfn(mfn_to_shadow_page(smfn)->backpointer);
2153 delete_fl1_shadow_status(v, gfn, smfn);
2155 else
2157 mfn_t gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2158 delete_shadow_status(v, gmfn, t, smfn);
2159 shadow_demote(v, gmfn, t);
2162 if ( shadow_mode_refcounts(d) )
2164 /* Decrement refcounts of all the old entries */
2165 mfn_t sl1mfn = smfn;
2166 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
2167 if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
2168 && !sh_l1e_is_magic(*sl1e) )
2169 shadow_put_page_from_l1e(*sl1e, d);
2170 });
2173 /* Put the memory back in the pool */
2174 shadow_free(v->domain, smfn);
2177 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
2178 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
2180 struct domain *d = v->domain;
2181 ASSERT(mfn_to_shadow_page(mmfn)->type == SH_type_monitor_table);
2183 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
2184 /* Need to destroy the l3 monitor page in slot 0 too */
2186 mfn_t m3mfn;
2187 l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
2188 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2189 m3mfn = _mfn(l4e_get_pfn(l4e[0]));
2190 if ( is_pv_32on64_vcpu(v) )
2192 /* Need to destroy the l2 monitor page in slot 3 too */
2193 l3_pgentry_t *l3e = sh_map_domain_page(m3mfn);
2194 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2195 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2196 sh_unmap_domain_page(l3e);
2198 shadow_free(d, m3mfn);
2199 sh_unmap_domain_page(l4e);
2201 #elif CONFIG_PAGING_LEVELS == 3
2202 /* Need to destroy the l2 monitor page in slot 3 too */
2204 l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
2205 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2206 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2207 sh_unmap_domain_page(l3e);
2209 #endif
2211 /* Put the memory back in the pool */
2212 shadow_free(d, mmfn);
2214 #endif
2216 /**************************************************************************/
2217 /* Functions to destroy non-Xen mappings in a pagetable hierarchy.
2218 * These are called from common code when we are running out of shadow
2219 * memory, and unpinning all the top-level shadows hasn't worked.
2221 * This implementation is pretty crude and slow, but we hope that it won't
2222 * be called very often. */
2224 #if GUEST_PAGING_LEVELS == 2
2226 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
2228 shadow_l2e_t *sl2e;
2229 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2230 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2231 });
2234 #elif GUEST_PAGING_LEVELS == 3
2236 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
2237 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
2239 shadow_l2e_t *sl2e;
2240 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2241 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2242 });
2245 #elif GUEST_PAGING_LEVELS == 4
2247 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
2249 shadow_l4e_t *sl4e;
2250 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2251 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
2252 });
2255 #endif
2257 /**************************************************************************/
2258 /* Internal translation functions.
2259 * These functions require a pointer to the shadow entry that will be updated.
2260 */
2262 /* These functions take a new guest entry, translate it to shadow and write
2263 * the shadow entry.
2265 * They return the same bitmaps as the shadow_set_lXe() functions.
2266 */
2268 #if GUEST_PAGING_LEVELS >= 4
2269 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2271 shadow_l4e_t new_sl4e;
2272 guest_l4e_t new_gl4e = *(guest_l4e_t *)new_ge;
2273 shadow_l4e_t *sl4p = se;
2274 mfn_t sl3mfn = _mfn(INVALID_MFN);
2275 struct domain *d = v->domain;
2276 p2m_type_t p2mt;
2277 int result = 0;
2279 perfc_incr(shadow_validate_gl4e_calls);
2281 if ( guest_l4e_get_flags(new_gl4e) & _PAGE_PRESENT )
2283 gfn_t gl3gfn = guest_l4e_get_gfn(new_gl4e);
2284 mfn_t gl3mfn = gfn_to_mfn(d, gl3gfn, &p2mt);
2285 if ( p2m_is_ram(p2mt) )
2286 sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
2287 else
2288 result |= SHADOW_SET_ERROR;
2290 l4e_propagate_from_guest(v, new_gl4e, sl3mfn, &new_sl4e, ft_prefetch);
2292 // check for updates to xen reserved slots
2293 if ( !shadow_mode_external(d) )
2295 int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2296 sizeof(shadow_l4e_t));
2297 int reserved_xen_slot = !is_guest_l4_slot(d, shadow_index);
2299 if ( unlikely(reserved_xen_slot) )
2301 // attempt by the guest to write to a xen reserved slot
2302 //
2303 SHADOW_PRINTK("%s out-of-range update "
2304 "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2305 __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2306 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2308 SHADOW_ERROR("out-of-range l4e update\n");
2309 result |= SHADOW_SET_ERROR;
2312 // do not call shadow_set_l4e...
2313 return result;
2317 result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2318 return result;
2322 static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2324 shadow_l3e_t new_sl3e;
2325 guest_l3e_t new_gl3e = *(guest_l3e_t *)new_ge;
2326 shadow_l3e_t *sl3p = se;
2327 mfn_t sl2mfn = _mfn(INVALID_MFN);
2328 p2m_type_t p2mt;
2329 int result = 0;
2331 perfc_incr(shadow_validate_gl3e_calls);
2333 if ( guest_l3e_get_flags(new_gl3e) & _PAGE_PRESENT )
2335 gfn_t gl2gfn = guest_l3e_get_gfn(new_gl3e);
2336 mfn_t gl2mfn = gfn_to_mfn(v->domain, gl2gfn, &p2mt);
2337 if ( p2m_is_ram(p2mt) )
2338 sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
2339 else
2340 result |= SHADOW_SET_ERROR;
2342 l3e_propagate_from_guest(v, new_gl3e, sl2mfn, &new_sl3e, ft_prefetch);
2343 result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2345 return result;
2347 #endif // GUEST_PAGING_LEVELS >= 4
2349 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2351 shadow_l2e_t new_sl2e;
2352 guest_l2e_t new_gl2e = *(guest_l2e_t *)new_ge;
2353 shadow_l2e_t *sl2p = se;
2354 mfn_t sl1mfn = _mfn(INVALID_MFN);
2355 p2m_type_t p2mt;
2356 int result = 0;
2358 perfc_incr(shadow_validate_gl2e_calls);
2360 if ( guest_l2e_get_flags(new_gl2e) & _PAGE_PRESENT )
2362 gfn_t gl1gfn = guest_l2e_get_gfn(new_gl2e);
2363 if ( guest_supports_superpages(v) &&
2364 (guest_l2e_get_flags(new_gl2e) & _PAGE_PSE) )
2366 // superpage -- need to look up the shadow L1 which holds the
2367 // splitters...
2368 sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2369 #if 0
2370 // XXX - it's possible that we want to do some kind of prefetch
2371 // for superpage fl1's here, but this is *not* on the demand path,
2372 // so we'll hold off trying that for now...
2373 //
2374 if ( !mfn_valid(sl1mfn) )
2375 sl1mfn = make_fl1_shadow(v, gl1gfn);
2376 #endif
2378 else
2380 mfn_t gl1mfn = gfn_to_mfn(v->domain, gl1gfn, &p2mt);
2381 if ( p2m_is_ram(p2mt) )
2382 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2383 else
2384 result |= SHADOW_SET_ERROR;
2387 l2e_propagate_from_guest(v, new_gl2e, sl1mfn, &new_sl2e, ft_prefetch);
2389 // check for updates to xen reserved slots in PV guests...
2390 // XXX -- need to revisit this for PV 3-on-4 guests.
2391 //
2392 #if SHADOW_PAGING_LEVELS < 4
2393 #if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2394 if ( !shadow_mode_external(v->domain) )
2396 int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2397 sizeof(shadow_l2e_t));
2398 int reserved_xen_slot;
2400 #if SHADOW_PAGING_LEVELS == 3
2401 reserved_xen_slot =
2402 ((mfn_to_shadow_page(sl2mfn)->type == SH_type_l2h_pae_shadow) &&
2403 (shadow_index
2404 >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
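/* i.e. in PAE shadows only the l2h (the high l2, covering the top of the
 * address space) holds Xen entries, and only in the slots at or above
 * L2_PAGETABLE_FIRST_XEN_SLOT within that page. */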
2405 #else /* SHADOW_PAGING_LEVELS == 2 */
2406 reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2407 #endif
2409 if ( unlikely(reserved_xen_slot) )
2411 // attempt by the guest to write to a xen reserved slot
2412 //
2413 SHADOW_PRINTK("%s out-of-range update "
2414 "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2415 __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2416 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2418 SHADOW_ERROR("out-of-range l2e update\n");
2419 result |= SHADOW_SET_ERROR;
2422 // do not call shadow_set_l2e...
2423 return result;
2426 #endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2427 #endif /* SHADOW_PAGING_LEVELS < 4 */
2429 result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2431 return result;
2434 static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
2436 shadow_l1e_t new_sl1e;
2437 guest_l1e_t new_gl1e = *(guest_l1e_t *)new_ge;
2438 shadow_l1e_t *sl1p = se;
2439 gfn_t gfn;
2440 mfn_t gmfn;
2441 p2m_type_t p2mt;
2442 int result = 0;
2444 perfc_incr(shadow_validate_gl1e_calls);
2446 gfn = guest_l1e_get_gfn(new_gl1e);
2447 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2449 l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt);
2451 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
2452 return result;
2456 /**************************************************************************/
2457 /* Functions which translate and install the shadows of arbitrary guest
2458 * entries that we have just seen the guest write. */
2461 static inline int
2462 sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2463 void *new_gp, u32 size, u32 sh_type,
2464 u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2465 int (*validate_ge)(struct vcpu *v, void *ge,
2466 mfn_t smfn, void *se))
2467 /* Generic function for mapping and validating. */
2469 mfn_t smfn, smfn2, map_mfn;
2470 shadow_l1e_t *sl1p;
2471 u32 shadow_idx, guest_idx;
2472 int result = 0;
2474 /* Align address and size to guest entry boundaries */
2475 size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2476 new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2477 size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2478 ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
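/* N.B. the l1e types are used generically here: within one paging mode,
 * guest entries are the same size at every level (and so are shadow
 * entries), so stepping in sizeof(guest_l1e_t) works for any table. */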
2480 /* Map the shadow page */
2481 smfn = get_shadow_status(v, gmfn, sh_type);
2482 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2483 guest_idx = guest_index(new_gp);
2484 map_mfn = smfn;
2485 shadow_idx = shadow_index(&map_mfn, guest_idx);
2486 sl1p = map_shadow_page(map_mfn);
2488 /* Validate one entry at a time */
2489 while ( size )
2491 smfn2 = smfn;
2492 guest_idx = guest_index(new_gp);
2493 shadow_idx = shadow_index(&smfn2, guest_idx);
2494 if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2496 /* We have moved to another page of the shadow */
2497 map_mfn = smfn2;
2498 unmap_shadow_page(sl1p);
2499 sl1p = map_shadow_page(map_mfn);
2501 result |= validate_ge(v,
2502 new_gp,
2503 map_mfn,
2504 &sl1p[shadow_idx]);
2505 size -= sizeof(guest_l1e_t);
2506 new_gp += sizeof(guest_l1e_t);
2508 unmap_shadow_page(sl1p);
2509 return result;
2513 int
2514 sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2515 void *new_gl4p, u32 size)
2517 #if GUEST_PAGING_LEVELS >= 4
2518 return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2519 SH_type_l4_shadow,
2520 shadow_l4_index,
2521 validate_gl4e);
2522 #else // ! GUEST_PAGING_LEVELS >= 4
2523 SHADOW_ERROR("called in wrong paging mode!\n");
2524 BUG();
2525 return 0;
2526 #endif
2529 int
2530 sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2531 void *new_gl3p, u32 size)
2533 #if GUEST_PAGING_LEVELS >= 4
2534 return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2535 SH_type_l3_shadow,
2536 shadow_l3_index,
2537 validate_gl3e);
2538 #else // ! GUEST_PAGING_LEVELS >= 4
2539 SHADOW_ERROR("called in wrong paging mode!\n");
2540 BUG();
2541 return 0;
2542 #endif
2545 int
2546 sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2547 void *new_gl2p, u32 size)
2549 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2550 SH_type_l2_shadow,
2551 shadow_l2_index,
2552 validate_gl2e);
2555 int
2556 sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2557 void *new_gl2p, u32 size)
2559 #if GUEST_PAGING_LEVELS >= 3
2560 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2561 SH_type_l2h_shadow,
2562 shadow_l2_index,
2563 validate_gl2e);
2564 #else /* Non-PAE guests don't have different kinds of l2 table */
2565 SHADOW_ERROR("called in wrong paging mode!\n");
2566 BUG();
2567 return 0;
2568 #endif
2571 int
2572 sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2573 void *new_gl1p, u32 size)
2575 return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2576 SH_type_l1_shadow,
2577 shadow_l1_index,
2578 validate_gl1e);
2582 /**************************************************************************/
2583 /* Optimization: If we see two emulated writes of zeros to the same
2584 * page-table without another kind of page fault in between, we guess
2585 * that this is a batch of changes (for process destruction) and
2586 * unshadow the page so we don't take a pagefault on every entry. This
2587 * should also make finding writeable mappings of pagetables much
2588 * easier. */
2590 /* Look to see if this is the second emulated write in a row to this
2591 * page, and unshadow/unhook if it is */
2592 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
2594 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2595 if ( v->arch.paging.shadow.last_emulated_mfn == mfn_x(gmfn) &&
2596 sh_mfn_is_a_page_table(gmfn) )
2598 u32 flags = mfn_to_page(gmfn)->shadow_flags;
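/* Only pages with no top-level (l2/l4) shadows take this path; presumably
 * a top-level shadow may still be in use as a root and is better handled
 * by the unpin/unhook paths. */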
2599 if ( !(flags & (SHF_L2_32|SHF_L2_PAE|SHF_L2H_PAE|SHF_L4_64)) )
2601 perfc_incr(shadow_early_unshadow);
2602 sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
2605 v->arch.paging.shadow.last_emulated_mfn = mfn_x(gmfn);
2606 #endif
2609 /* Stop counting towards early unshadows, as we've seen a real page fault */
2610 static inline void reset_early_unshadow(struct vcpu *v)
2612 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2613 v->arch.paging.shadow.last_emulated_mfn = INVALID_MFN;
2614 #endif
2619 /**************************************************************************/
2620 /* Optimization: Prefetch multiple L1 entries. This is called after we have
2621 * demand-faulted a shadow l1e in the fault handler, to see if it's
2622 * worth fetching some more.
2623 */
2625 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2627 /* XXX magic number */
2628 #define PREFETCH_DISTANCE 32
2630 static void sh_prefetch(struct vcpu *v, walk_t *gw,
2631 shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
2633 int i, dist;
2634 gfn_t gfn;
2635 mfn_t gmfn;
2636 guest_l1e_t *gl1p = NULL, gl1e;
2637 shadow_l1e_t sl1e;
2638 u32 gflags;
2639 p2m_type_t p2mt;
2641 /* Prefetch no further than the end of the _shadow_ l1 MFN */
2642 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
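/* (dist is the number of l1e slots from ptr_sl1e to the end of this shadow
 * page, counting the entry we have just demand-faulted.) */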
2643 /* And no more than a maximum fetches-per-fault */
2644 if ( dist > PREFETCH_DISTANCE )
2645 dist = PREFETCH_DISTANCE;
2647 if ( mfn_valid(gw->l1mfn) )
2649 /* Normal guest page; grab the next guest entry */
2650 gl1p = sh_map_domain_page(gw->l1mfn);
2651 gl1p += guest_l1_table_offset(gw->va);
2654 for ( i = 1; i < dist ; i++ )
2656 /* No point in prefetching if there's already a shadow */
2657 if ( ptr_sl1e[i].l1 != 0 )
2658 break;
2660 if ( mfn_valid(gw->l1mfn) )
2662 /* Normal guest page; grab the next guest entry */
2663 gl1e = gl1p[i];
2664 /* Not worth continuing if we hit an entry that will need another
2665 * fault for A/D-bit propagation anyway */
2666 gflags = guest_l1e_get_flags(gl1e);
2667 if ( (gflags & _PAGE_PRESENT)
2668 && (!(gflags & _PAGE_ACCESSED)
2669 || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
2670 break;
2672 else
2674 /* Fragmented superpage, unless we've been called wrongly */
2675 ASSERT(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE);
2676 /* Increment the l1e's GFN by the right number of guest pages */
2677 gl1e = guest_l1e_from_gfn(
2678 _gfn(gfn_x(guest_l1e_get_gfn(gw->l1e)) + i),
2679 guest_l1e_get_flags(gw->l1e));
2682 /* Look at the gfn that the l1e is pointing at */
2683 gfn = guest_l1e_get_gfn(gl1e);
2684 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2686 /* Propagate the entry. */
2687 l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt);
2688 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
2690 if ( gl1p != NULL )
2691 sh_unmap_domain_page(gl1p);
2694 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
2697 /**************************************************************************/
2698 /* Entry points into the shadow code */
2700 /* Called from pagefault handler in Xen, and from the HVM trap handlers
2701 * for pagefaults. Returns 1 if this fault was an artefact of the
2702 * shadow code (and the guest should retry) or 0 if it is not (and the
2703 * fault should be handled elsewhere or passed to the guest). */
2705 static int sh_page_fault(struct vcpu *v,
2706 unsigned long va,
2707 struct cpu_user_regs *regs)
2709 struct domain *d = v->domain;
2710 walk_t gw;
2711 gfn_t gfn;
2712 mfn_t gmfn, sl1mfn=_mfn(0);
2713 shadow_l1e_t sl1e, *ptr_sl1e;
2714 paddr_t gpa;
2715 struct sh_emulate_ctxt emul_ctxt;
2716 struct x86_emulate_ops *emul_ops;
2717 int r;
2718 fetch_type_t ft = 0;
2719 p2m_type_t p2mt;
2721 SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u, rip=%lx\n",
2722 v->domain->domain_id, v->vcpu_id, va, regs->error_code,
2723 regs->rip);
2725 perfc_incr(shadow_fault);
2726 //
2727 // XXX: Need to think about eventually mapping superpages directly in the
2728 // shadow (when possible), as opposed to splintering them into a
2729 // bunch of 4K maps.
2730 //
2732 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
2733 if ( (regs->error_code & PFEC_reserved_bit) )
2735 /* The only reasons for reserved bits to be set in shadow entries
2736 * are the two "magic" shadow_l1e entries. */
2737 if ( likely((__copy_from_user(&sl1e,
2738 (sh_linear_l1_table(v)
2739 + shadow_l1_linear_offset(va)),
2740 sizeof(sl1e)) == 0)
2741 && sh_l1e_is_magic(sl1e)) )
2743 if ( sh_l1e_is_gnp(sl1e) )
2745 /* Not-present in a guest PT: pass to the guest as
2746 * a not-present fault (by flipping two bits). */
2747 ASSERT(regs->error_code & PFEC_page_present);
2748 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
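/* The XOR clears PFEC_reserved_bit and PFEC_page_present (both known to
 * be set here), leaving an ordinary not-present error code for the guest. */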
2749 reset_early_unshadow(v);
2750 perfc_incr(shadow_fault_fast_gnp);
2751 SHADOW_PRINTK("fast path not-present\n");
2752 return 0;
2754 else
2756 /* Magic MMIO marker: extract gfn for MMIO address */
2757 ASSERT(sh_l1e_is_mmio(sl1e));
2758 gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
2759 << PAGE_SHIFT)
2760 | (va & ~PAGE_MASK);
2762 perfc_incr(shadow_fault_fast_mmio);
2763 SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
2764 reset_early_unshadow(v);
2765 handle_mmio(gpa);
2766 return EXCRET_fault_fixed;
2768 else
2770 /* This should be exceptionally rare: another vcpu has fixed
2771 * the tables between the fault and our reading the l1e.
2772 * Retry and let the hardware give us the right fault next time. */
2773 perfc_incr(shadow_fault_fast_fail);
2774 SHADOW_PRINTK("fast path false alarm!\n");
2775 return EXCRET_fault_fixed;
2778 #endif /* SHOPT_FAST_FAULT_PATH */
2780 /* Detect if this page fault happened while we were already in Xen
2781 * doing a shadow operation. If that happens, the only thing we can
2782 * do is let Xen's normal fault handlers try to fix it. In any case,
2783 * a diagnostic trace of the fault will be more useful than
2784 * a BUG() when we try to take the lock again. */
2785 if ( unlikely(shadow_locked_by_me(d)) )
2787 SHADOW_ERROR("Recursive shadow fault: lock was taken by %s\n",
2788 d->arch.paging.shadow.locker_function);
2789 return 0;
2792 shadow_lock(d);
2794 shadow_audit_tables(v);
2796 if ( guest_walk_tables(v, va, &gw, regs->error_code, 1) != 0 )
2798 perfc_incr(shadow_fault_bail_real_fault);
2799 goto not_a_shadow_fault;
2802 /* It's possible that the guest has put pagetables in memory that it has
2803 * already used for some special purpose (ioreq pages, or granted pages).
2804 * If that happens we'll have killed the guest already but it's still not
2805 * safe to propagate entries out of the guest PT so get out now. */
2806 if ( unlikely(d->is_shutting_down) )
2808 SHADOW_PRINTK("guest is shutting down\n");
2809 shadow_unlock(d);
2810 return 0;
2813 sh_audit_gw(v, &gw);
2815 /* What kind of access are we dealing with? */
2816 ft = ((regs->error_code & PFEC_write_access)
2817 ? ft_demand_write : ft_demand_read);
2819 /* What mfn is the guest trying to access? */
2820 gfn = guest_l1e_get_gfn(gw.l1e);
2821 gmfn = gfn_to_mfn(d, gfn, &p2mt);
2823 if ( shadow_mode_refcounts(d) &&
2824 (!p2m_is_valid(p2mt) || (!p2m_is_mmio(p2mt) && !mfn_valid(gmfn))) )
2826 perfc_incr(shadow_fault_bail_bad_gfn);
2827 SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
2828 gfn_x(gfn), mfn_x(gmfn));
2829 goto not_a_shadow_fault;
2832 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
2833 /* Remember this successful VA->GFN translation for later. */
2834 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn),
2835 regs->error_code | PFEC_page_present);
2836 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
2838 /* Make sure there is enough free shadow memory to build a chain of
2839 * shadow tables. (We never allocate a top-level shadow on this path,
2840 * only a 32b l1, pae l1, or 64b l3+2+1. Note that while
2841 * SH_type_l1_shadow isn't correct in the latter case, all page
2842 * tables are the same size there.) */
2843 shadow_prealloc(d,
2844 SH_type_l1_shadow,
2845 GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
2847 /* Acquire the shadow. This must happen before we figure out the rights
2848 * for the shadow entry, since we might promote a page here. */
2849 ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
2850 if ( unlikely(ptr_sl1e == NULL) )
2852 /* Couldn't get the sl1e! Since we know the guest entries
2853 * are OK, this can only have been caused by a failed
2854 * shadow_set_l*e(), which will have crashed the guest.
2855 * Get out of the fault handler immediately. */
2856 ASSERT(d->is_shutting_down);
2857 shadow_unlock(d);
2858 return 0;
2861 /* Calculate the shadow entry and write it */
2862 l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt);
2863 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
2865 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2866 /* Prefetch some more shadow entries */
2867 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
2868 #endif
2870 /* Need to emulate accesses to page tables */
2871 if ( sh_mfn_is_a_page_table(gmfn) )
2873 if ( ft == ft_demand_write )
2875 perfc_incr(shadow_fault_emulate_write);
2876 goto emulate;
2878 else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
2880 perfc_incr(shadow_fault_emulate_read);
2881 goto emulate;
2885 /* Need to hand off device-model MMIO and writes to read-only
2886 * memory to the device model */
2887 if ( p2mt == p2m_mmio_dm
2888 || (p2mt == p2m_ram_ro && ft == ft_demand_write) )
2890 gpa = guest_walk_to_gpa(&gw);
2891 goto mmio;
2894 /* In HVM guests, we force CR0.WP always to be set, so that the
2895 * pagetables are always write-protected. If the guest thinks
2896 * CR0.WP is clear, we must emulate faulting supervisor writes to
2897 * allow the guest to write through read-only PTEs. Emulate if the
2898 * fault was a non-user write to a present page. */
2899 if ( is_hvm_domain(d)
2900 && unlikely(!hvm_wp_enabled(v))
2901 && regs->error_code == (PFEC_write_access|PFEC_page_present) )
2903 perfc_incr(shadow_fault_emulate_wp);
2904 goto emulate;
2907 perfc_incr(shadow_fault_fixed);
2908 d->arch.paging.log_dirty.fault_count++;
2909 reset_early_unshadow(v);
2911 done:
2912 sh_audit_gw(v, &gw);
2913 SHADOW_PRINTK("fixed\n");
2914 shadow_audit_tables(v);
2915 shadow_unlock(d);
2916 return EXCRET_fault_fixed;
2918 emulate:
2919 if ( !shadow_mode_refcounts(d) || !guest_mode(regs) )
2920 goto not_a_shadow_fault;
2922 /*
2923 * We do not emulate user writes. Instead we use them as a hint that the
2924 * page is no longer a page table. This behaviour differs from native, but
2925 * it seems very unlikely that any OS grants user access to page tables.
2926 */
2927 if ( (regs->error_code & PFEC_user_mode) )
2929 SHADOW_PRINTK("user-mode fault to PT, unshadowing mfn %#lx\n",
2930 mfn_x(gmfn));
2931 perfc_incr(shadow_fault_emulate_failed);
2932 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
2933 goto done;
2936 if ( is_hvm_domain(d) )
2938 /*
2939 * If we are in the middle of injecting an exception or interrupt then
2940 * we should not emulate: it is not the instruction at %eip that caused
2941 * the fault. Furthermore it is almost certainly the case the handler
2942 * stack is currently considered to be a page table, so we should
2943 * unshadow the faulting page before exiting.
2944 */
2945 if ( unlikely(hvm_event_pending(v)) )
2947 gdprintk(XENLOG_DEBUG, "write to pagetable during event "
2948 "injection: cr2=%#lx, mfn=%#lx\n",
2949 va, mfn_x(gmfn));
2950 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
2951 goto done;
2955 SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n",
2956 (unsigned long)regs->eip, (unsigned long)regs->esp);
2958 /*
2959 * We don't need to hold the lock for the whole emulation; we will
2960 * take it again when we write to the pagetables.
2961 */
2962 sh_audit_gw(v, &gw);
2963 shadow_audit_tables(v);
2964 shadow_unlock(d);
2966 emul_ops = shadow_init_emulation(&emul_ctxt, regs);
2968 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
2970 /*
2971 * NB. We do not unshadow on X86EMUL_EXCEPTION. It's not clear that it
2972 * would be a good unshadow hint. If we *do* decide to unshadow-on-fault
2973 * then it must be 'failable': we cannot require the unshadow to succeed.
2974 */
2975 if ( r == X86EMUL_UNHANDLEABLE )
2977 SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
2978 mfn_x(gmfn));
2979 perfc_incr(shadow_fault_emulate_failed);
2980 /* If this is actually a page table, then we have a bug, and need
2981 * to support more operations in the emulator. More likely,
2982 * though, this is a hint that this page should not be shadowed. */
2983 shadow_remove_all_shadows(v, gmfn);
2986 #if GUEST_PAGING_LEVELS == 3 /* PAE guest */
2987 if ( r == X86EMUL_OKAY ) {
2988 int i;
2989 /* Emulate up to four extra instructions in the hope of catching
2990 * the "second half" of a 64-bit pagetable write. */
2991 for ( i = 0 ; i < 4 ; i++ )
2993 shadow_continue_emulation(&emul_ctxt, regs);
2994 v->arch.paging.last_write_was_pt = 0;
2995 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
2996 if ( r == X86EMUL_OKAY )
2998 if ( v->arch.paging.last_write_was_pt )
3000 perfc_incr(shadow_em_ex_pt);
3001 break; /* Don't emulate past the other half of the write */
3003 else
3004 perfc_incr(shadow_em_ex_non_pt);
3006 else
3008 perfc_incr(shadow_em_ex_fail);
3009 break; /* Don't emulate again if we failed! */
3013 #endif /* PAE guest */
3015 SHADOW_PRINTK("emulated\n");
3016 return EXCRET_fault_fixed;
3018 mmio:
3019 if ( !guest_mode(regs) )
3020 goto not_a_shadow_fault;
3021 perfc_incr(shadow_fault_mmio);
3022 sh_audit_gw(v, &gw);
3023 SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
3024 shadow_audit_tables(v);
3025 reset_early_unshadow(v);
3026 shadow_unlock(d);
3027 handle_mmio(gpa);
3028 return EXCRET_fault_fixed;
3030 not_a_shadow_fault:
3031 sh_audit_gw(v, &gw);
3032 SHADOW_PRINTK("not a shadow fault\n");
3033 shadow_audit_tables(v);
3034 reset_early_unshadow(v);
3035 shadow_unlock(d);
3036 return 0;
3040 static int
3041 sh_invlpg(struct vcpu *v, unsigned long va)
3042 /* Called when the guest requests an invlpg. Returns 1 if the invlpg
3043 * instruction should be issued on the hardware, or 0 if it's safe not
3044 * to do so. */
3046 shadow_l2e_t sl2e;
3048 perfc_incr(shadow_invlpg);
3050 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3051 /* No longer safe to use cached gva->gfn translations */
3052 vtlb_flush(v);
3053 #endif
3055 /* First check that we can safely read the shadow l2e. On SMP/PAE linux,
3056 * as many as 6% of invlpg calls can arrive before we have shadowed the
3057 * relevant l2. */
3058 #if SHADOW_PAGING_LEVELS == 4
3060 shadow_l3e_t sl3e;
3061 if ( !(shadow_l4e_get_flags(
3062 sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
3063 & _PAGE_PRESENT) )
3064 return 0;
3065 /* This must still be a copy-from-user because we don't have the
3066 * shadow lock, and the higher-level shadows might disappear
3067 * under our feet. */
3068 if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
3069 + shadow_l3_linear_offset(va)),
3070 sizeof (sl3e)) != 0 )
3072 perfc_incr(shadow_invlpg_fault);
3073 return 0;
3075 if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
3076 return 0;
3078 #elif SHADOW_PAGING_LEVELS == 3
3079 if ( !(l3e_get_flags(v->arch.paging.shadow.l3table[shadow_l3_linear_offset(va)])
3080 & _PAGE_PRESENT) )
3081 // no need to flush anything if there's no SL2...
3082 return 0;
3083 #endif
3085 /* This must still be a copy-from-user because we don't have the shadow
3086 * lock, and the higher-level shadows might disappear under our feet. */
3087 if ( __copy_from_user(&sl2e,
3088 sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
3089 sizeof (sl2e)) != 0 )
3091 perfc_incr(shadow_invlpg_fault);
3092 return 0;
3095 // If there's nothing shadowed for this particular sl2e, then
3096 // there is no need to do an invlpg, either...
3097 //
3098 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3099 return 0;
3101 // Check to see if the SL2 is a splintered superpage...
3102 // If so, then we'll need to flush the entire TLB (because that's
3103 // easier than invalidating all of the individual 4K pages).
3104 //
3105 if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type
3106 == SH_type_fl1_shadow )
3108 flush_tlb_local();
3109 return 0;
3112 return 1;
3116 static unsigned long
3117 sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
3118 /* Called to translate a guest virtual address to what the *guest*
3119 * pagetables would map it to. */
3121 walk_t gw;
3122 gfn_t gfn;
3124 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3125 /* Check the vTLB cache first */
3126 unsigned long vtlb_gfn = vtlb_lookup(v, va, pfec[0]);
3127 if ( VALID_GFN(vtlb_gfn) )
3128 return vtlb_gfn;
3129 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3131 if ( guest_walk_tables(v, va, &gw, pfec[0], 0) != 0 )
3133 if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
3134 pfec[0] &= ~PFEC_page_present;
3135 return INVALID_GFN;
3137 gfn = guest_walk_to_gfn(&gw);
3139 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3140 /* Remember this successful VA->GFN translation for later. */
3141 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn), pfec[0]);
3142 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3144 return gfn_x(gfn);
3148 static inline void
3149 sh_update_linear_entries(struct vcpu *v)
3150 /* Sync up all the linear mappings for this vcpu's pagetables */
3152 struct domain *d = v->domain;
3154 /* Linear pagetables in PV guests
3155 * ------------------------------
3157 * Guest linear pagetables, which map the guest pages, are at
3158 * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
3159 * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
3160 * are set up at shadow creation time, but (of course!) the PAE case
3161 * is subtler. Normal linear mappings are made by having an entry
3162 * in the top-level table that points to itself (shadow linear) or
3163 * to the guest top-level table (guest linear). For PAE, to set up
3164 * a linear map requires us to copy the four top-level entries into
3165 * level-2 entries. That means that every time we change a PAE l3e,
3166 * we need to reflect the change into the copy.
3168 * Linear pagetables in HVM guests
3169 * -------------------------------
3171 * For HVM guests, the linear pagetables are installed in the monitor
3172 * tables (since we can't put them in the shadow). Shadow linear
3173 * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
3174 * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
3175 * a linear pagetable of the monitor tables themselves. We have
3176 * the same issue of having to re-copy PAE l3 entries whenever we use
3177 * PAE shadows.
3179 * Because HVM guests run on the same monitor tables regardless of the
3180 * shadow tables in use, the linear mapping of the shadow tables has to
3181 * be updated every time v->arch.shadow_table changes.
3182 */
3184 /* Don't try to update the monitor table if it doesn't exist */
3185 if ( shadow_mode_external(d)
3186 && pagetable_get_pfn(v->arch.monitor_table) == 0 )
3187 return;
3189 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3191 /* For PV, one l4e points at the guest l4, one points at the shadow
3192 * l4. No maintenance required.
3193 * For HVM, just need to update the l4e that points to the shadow l4. */
3195 if ( shadow_mode_external(d) )
3197 /* Use the linear map if we can; otherwise make a new mapping */
3198 if ( v == current )
3200 __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3201 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3202 __PAGE_HYPERVISOR);
3204 else
3206 l4_pgentry_t *ml4e;
3207 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3208 ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3209 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3210 __PAGE_HYPERVISOR);
3211 sh_unmap_domain_page(ml4e);
3215 #elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3217 /* PV: XXX
3219 * HVM: To give ourselves a linear map of the shadows, we need to
3220 * extend a PAE shadow to 4 levels. We do this by having a monitor
3221 * l3 in slot 0 of the monitor l4 table, and copying the PAE l3
3222 * entries into it. Then, by having the monitor l4e for shadow
3223 * pagetables also point to the monitor l4, we can use it to access
3224 * the shadows.
3225 */
3227 if ( shadow_mode_external(d) )
3229 /* Install copies of the shadow l3es into the monitor l3 table.
3230 * The monitor l3 table is hooked into slot 0 of the monitor
3231 * l4 table, so we use l3 linear indices 0 to 3 */
3232 shadow_l3e_t *sl3e;
3233 l3_pgentry_t *ml3e;
3234 mfn_t l3mfn;
3235 int i;
3237 /* Use linear mappings if we can; otherwise make new mappings */
3238 if ( v == current )
3240 ml3e = __linear_l3_table;
3241 l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0]));
3243 else
3245 l4_pgentry_t *ml4e;
3246 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3247 ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT);
3248 l3mfn = _mfn(l4e_get_pfn(ml4e[0]));
3249 ml3e = sh_map_domain_page(l3mfn);
3250 sh_unmap_domain_page(ml4e);
3253 /* Shadow l3 tables are made up by sh_update_cr3 */
3254 sl3e = v->arch.paging.shadow.l3table;
3256 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3258 ml3e[i] =
3259 (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
3260 ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
3261 __PAGE_HYPERVISOR)
3262 : l3e_empty();
3265 if ( v != current )
3266 sh_unmap_domain_page(ml3e);
3268 else
3269 domain_crash(d); /* XXX */
3271 #elif CONFIG_PAGING_LEVELS == 3
3273 /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
3274 * entries in the shadow, and the shadow's l3 entries into the
3275 * shadow-linear-map l2 entries in the shadow. This is safe to do
3276 * because Xen does not let guests share high-slot l2 tables between l3s,
3277 * so we know we're not treading on anyone's toes.
3279 * HVM: need to copy the shadow's l3 entries into the
3280 * shadow-linear-map l2 entries in the monitor table. This is safe
3281 * because we have one monitor table for each vcpu. The monitor's
3282 * own l3es don't need to be copied because they never change.
3283 * XXX That might change if we start stuffing things into the rest
3284 * of the monitor's virtual address space.
3285 */
3287 l2_pgentry_t *l2e, new_l2e;
3288 shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
3289 int i;
3290 int unmap_l2e = 0;
3292 #if GUEST_PAGING_LEVELS == 2
3294 /* Shadow l3 tables were built by sh_update_cr3 */
3295 BUG_ON(!shadow_mode_external(d)); /* PV 2-on-3 is unsupported */
3296 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3298 #else /* GUEST_PAGING_LEVELS == 3 */
3300 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3301 guest_l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e;
3303 #endif /* GUEST_PAGING_LEVELS */
3305 /* Choose where to write the entries, using linear maps if possible */
3306 if ( shadow_mode_external(d) )
3308 if ( v == current )
3310 /* From the monitor tables, it's safe to use linear maps
3311 * to update monitor l2s */
3312 l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
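/* (The offset of 3 * L2_PAGETABLE_ENTRIES selects the fourth page of the
 * linear l2 table, i.e. the high l2 that l3 slot 3 points at.) */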
3314 else
3316 /* Map the monitor table's high l2 */
3317 l3_pgentry_t *l3e;
3318 l3e = sh_map_domain_page(
3319 pagetable_get_mfn(v->arch.monitor_table));
3320 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
3321 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
3322 unmap_l2e = 1;
3323 sh_unmap_domain_page(l3e);
3326 else
3328 /* Map the shadow table's high l2 */
3329 ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
3330 l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
3331 unmap_l2e = 1;
3334 /* Write linear mapping of guest (only in PV, and only when
3335 * not translated). */
3336 if ( !shadow_mode_translate(d) )
3338 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3340 new_l2e =
3341 ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
3342 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
3343 __PAGE_HYPERVISOR)
3344 : l2e_empty());
3345 safe_write_entry(
3346 &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
3347 &new_l2e);
3351 /* Write linear mapping of shadow. */
3352 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3354 new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
3355 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
3356 __PAGE_HYPERVISOR)
3357 : l2e_empty();
3358 safe_write_entry(
3359 &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
3360 &new_l2e);
3363 if ( unmap_l2e )
3364 sh_unmap_domain_page(l2e);
3367 #elif CONFIG_PAGING_LEVELS == 2
3369 /* For PV, one l2e points at the guest l2, one points at the shadow
3370 * l2. No maintenance required.
3371 * For HVM, just need to update the l2e that points to the shadow l2. */
3373 if ( shadow_mode_external(d) )
3375 /* Use the linear map if we can; otherwise make a new mapping */
3376 if ( v == current )
3378 __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3379 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3380 __PAGE_HYPERVISOR);
3382 else
3384 l2_pgentry_t *ml2e;
3385 ml2e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3386 ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
3387 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3388 __PAGE_HYPERVISOR);
3389 sh_unmap_domain_page(ml2e);
3393 #else
3394 #error this should not happen
3395 #endif
3397 if ( shadow_mode_external(d) )
3399 /*
3400 * Having modified the linear pagetable mapping, flush local host TLBs.
3401 * This was not needed when vmenter/vmexit always had the side effect
3402 * of flushing host TLBs but, with ASIDs, it is possible to finish
3403 * this CR3 update, vmenter the guest, vmexit due to a page fault,
3404 * without an intervening host TLB flush. Then the page fault code
3405 * could use the linear pagetable to read a top-level shadow page
3406 * table entry. But, without this change, it would fetch the wrong
3407 * value due to a stale TLB.
3408 */
3409 flush_tlb_local();
3414 /* Removes vcpu->arch.paging.shadow.guest_vtable and vcpu->arch.shadow_table[].
3415 * Does all appropriate management/bookkeeping/refcounting/etc...
3416 */
3417 static void
3418 sh_detach_old_tables(struct vcpu *v)
3420 mfn_t smfn;
3421 int i = 0;
3423 ////
3424 //// vcpu->arch.paging.shadow.guest_vtable
3425 ////
3427 #if GUEST_PAGING_LEVELS == 3
3428 /* PAE guests don't have a mapping of the guest top-level table */
3429 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3430 #else
3431 if ( v->arch.paging.shadow.guest_vtable )
3433 struct domain *d = v->domain;
3434 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3435 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3436 v->arch.paging.shadow.guest_vtable = NULL;
3438 #endif
3441 ////
3442 //// vcpu->arch.shadow_table[]
3443 ////
3445 #if GUEST_PAGING_LEVELS == 3
3446 /* PAE guests have four shadow_table entries */
3447 for ( i = 0 ; i < 4 ; i++ )
3448 #endif
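/* (For non-PAE guests the block below runs exactly once, with i == 0.) */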
3450 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3451 if ( mfn_x(smfn) )
3452 sh_put_ref(v, smfn, 0);
3453 v->arch.shadow_table[i] = pagetable_null();
3457 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
3458 static void
3459 sh_set_toplevel_shadow(struct vcpu *v,
3460 int slot,
3461 mfn_t gmfn,
3462 unsigned int root_type)
3464 mfn_t smfn;
3465 pagetable_t old_entry, new_entry;
3467 struct domain *d = v->domain;
3469 /* Remember the old contents of this slot */
3470 old_entry = v->arch.shadow_table[slot];
3472 /* Now figure out the new contents: is this a valid guest MFN? */
3473 if ( !mfn_valid(gmfn) )
3475 new_entry = pagetable_null();
3476 goto install_new_entry;
3479 /* Guest mfn is valid: shadow it and install the shadow */
3480 smfn = get_shadow_status(v, gmfn, root_type);
3481 if ( !mfn_valid(smfn) )
3483 /* Make sure there's enough free shadow memory. */
3484 shadow_prealloc(d, root_type, 1);
3485 /* Shadow the page. */
3486 smfn = sh_make_shadow(v, gmfn, root_type);
3488 ASSERT(mfn_valid(smfn));
3490 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
3491 /* Once again OK to unhook entries from this table if we see fork/exit */
3492 ASSERT(sh_mfn_is_a_page_table(gmfn));
3493 mfn_to_page(gmfn)->shadow_flags &= ~SHF_unhooked_mappings;
3494 #endif
3496 /* Pin the shadow and put it (back) on the list of pinned shadows */
3497 if ( sh_pin(v, smfn) == 0 )
3499 SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
3500 domain_crash(v->domain);
3503 /* Take a ref to this page: it will be released in sh_detach_old_tables()
3504 * or the next call to sh_set_toplevel_shadow() */
3505 if ( !sh_get_ref(v, smfn, 0) )
3507 SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
3508 domain_crash(v->domain);
3511 new_entry = pagetable_from_mfn(smfn);
3513 install_new_entry:
3514 /* Done. Install it */
3515 SHADOW_PRINTK("%u/%u [%u] gmfn %#"PRI_mfn" smfn %#"PRI_mfn"\n",
3516 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
3517 mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry)));
3518 v->arch.shadow_table[slot] = new_entry;
3520 /* Decrement the refcount of the old contents of this slot */
3521 if ( !pagetable_is_null(old_entry) )
3522 sh_put_ref(v, pagetable_get_mfn(old_entry), 0);
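    /* Ordering note: the new shadow is looked up (or created), pinned and
     * referenced before the old slot's reference is dropped, so the slot
     * never transiently holds a shadow with no reference behind it. */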
3526 static void
3527 sh_update_cr3(struct vcpu *v, int do_locking)
3528 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
3529 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
3530 * if appropriate).
3531 * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works;
3532 * this function will call hvm_update_guest_cr(v, 3) to tell them where the
3533 * shadow tables are.
3534 * If do_locking != 0, assume we are being called from outside the
3535 * shadow code, and must take and release the shadow lock; otherwise
3536 * that is the caller's responsibility.
3537 */
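    /* In outline, the body below: picks the active guest top-level table,
     * refreshes guest_vtable (or the cached PAE l3es), revokes write access
     * to the new top level and installs fresh shadow_table[] entries,
     * rebuilds the PAE l3table if SHADOW_PAGING_LEVELS == 3, sets
     * v->arch.cr3 and (for HVM) hw_cr[3], fixes up the linear mappings, and
     * flushes the virtual TLB. */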
3539 struct domain *d = v->domain;
3540 mfn_t gmfn;
3541 #if GUEST_PAGING_LEVELS == 3
3542 guest_l3e_t *gl3e;
3543 u32 guest_idx=0;
3544 int i;
3545 #endif
3547 /* Don't do anything on an uninitialised vcpu */
3548 if ( !is_hvm_domain(d) && !v->is_initialised )
3550 ASSERT(v->arch.cr3 == 0);
3551 return;
3554 if ( do_locking ) shadow_lock(v->domain);
3556 ASSERT(shadow_locked_by_me(v->domain));
3557 ASSERT(v->arch.paging.mode);
3559 ////
3560 //// vcpu->arch.guest_table is already set
3561 ////
3563 #ifndef NDEBUG
3564 /* Double-check that the HVM code has sent us a sane guest_table */
3565 if ( is_hvm_domain(d) )
3567 ASSERT(shadow_mode_external(d));
3568 if ( hvm_paging_enabled(v) )
3569 ASSERT(pagetable_get_pfn(v->arch.guest_table));
3570 else
3571 ASSERT(v->arch.guest_table.pfn
3572 == d->arch.paging.shadow.unpaged_pagetable.pfn);
3574 #endif
3576 SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
3577 d->domain_id, v->vcpu_id,
3578 (unsigned long)pagetable_get_pfn(v->arch.guest_table));
3580 #if GUEST_PAGING_LEVELS == 4
3581 if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32on64_vcpu(v) )
3582 gmfn = pagetable_get_mfn(v->arch.guest_table_user);
3583 else
3584 #endif
3585 gmfn = pagetable_get_mfn(v->arch.guest_table);
3588 ////
3589 //// vcpu->arch.paging.shadow.guest_vtable
3590 ////
3591 #if GUEST_PAGING_LEVELS == 4
3592 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3594 if ( v->arch.paging.shadow.guest_vtable )
3595 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3596 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
3597 /* PAGING_LEVELS==4 implies 64-bit, which means that
3598 * map_domain_page_global can't fail */
3599 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL);
3601 else
3602 v->arch.paging.shadow.guest_vtable = __linear_l4_table;
3603 #elif GUEST_PAGING_LEVELS == 3
3604 /* On PAE guests we don't use a mapping of the guest's own top-level
3605 * table. We cache the current state of that table and shadow that,
3606 * until the next CR3 write makes us refresh our cache. */
3607 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3609 if ( shadow_mode_external(d) )
3610 /* Find where in the page the l3 table is */
3611 guest_idx = guest_index((void *)v->arch.hvm_vcpu.guest_cr[3]);
3612 else
3613 /* PV guest: l3 is at the start of a page */
3614 guest_idx = 0;
3616 // Ignore the low 2 bits of guest_idx -- they are really just
3617 // cache control.
3618 guest_idx &= ~3;
3620 gl3e = ((guest_l3e_t *)sh_map_domain_page(gmfn)) + guest_idx;
3621 for ( i = 0; i < 4 ; i++ )
3622 v->arch.paging.shadow.gl3e[i] = gl3e[i];
3623 sh_unmap_domain_page(gl3e);
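    /* From here on the shadow code works from this cached copy in
     * v->arch.paging.shadow.gl3e, not from the guest's own l3 page; the
     * cache is only refreshed on the next CR3 write (see comment above). */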
3624 #elif GUEST_PAGING_LEVELS == 2
3625 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3627 if ( v->arch.paging.shadow.guest_vtable )
3628 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3629 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
3630 /* Does this really need map_domain_page_global? Handle the
3631 * error properly if so. */
3632 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); /* XXX */
3634 else
3635 v->arch.paging.shadow.guest_vtable = __linear_l2_table;
3636 #else
3637 #error this should never happen
3638 #endif
3640 #if 0
3641 printk("%s %s %d gmfn=%05lx shadow.guest_vtable=%p\n",
3642 __func__, __FILE__, __LINE__, gmfn, v->arch.paging.shadow.guest_vtable);
3643 #endif
3645 ////
3646 //// vcpu->arch.shadow_table[]
3647 ////
3649 /* We revoke write access to the new guest toplevel page(s) before we
3650 * replace the old shadow pagetable(s), so that we can safely use the
3651 * (old) shadow linear maps in the writeable mapping heuristics. */
3652 #if GUEST_PAGING_LEVELS == 2
3653 if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 )
3654 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3655 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
3656 #elif GUEST_PAGING_LEVELS == 3
3657 /* PAE guests have four shadow_table entries, based on the
3658 * current values of the guest's four l3es. */
3660 int flush = 0;
3661 gfn_t gl2gfn;
3662 mfn_t gl2mfn;
3663 p2m_type_t p2mt;
3664 guest_l3e_t *gl3e = (guest_l3e_t*)&v->arch.paging.shadow.gl3e;
3665 /* First, make all four entries read-only. */
3666 for ( i = 0; i < 4; i++ )
3668 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3670 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3671 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
3672 if ( p2m_is_ram(p2mt) )
3673 flush |= sh_remove_write_access(v, gl2mfn, 2, 0);
3676 if ( flush )
3677 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3678 /* Now install the new shadows. */
3679 for ( i = 0; i < 4; i++ )
3681 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3683 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3684 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
3685 if ( p2m_is_ram(p2mt) )
3686 sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
3687 ? SH_type_l2h_shadow
3688 : SH_type_l2_shadow);
3689 else
3690 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
3692 else
3693 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
3696 #elif GUEST_PAGING_LEVELS == 4
3697 if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
3698 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3699 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
3700 #else
3701 #error This should never happen
3702 #endif
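    /* The 3-level case above is deliberately two-pass: all four guest l2
     * frames are made read-only first (with at most one TLB flush), and only
     * then are the four shadows installed, mirroring the single
     * remove-write-access-then-install step used for 2- and 4-level guests. */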
3704 #if (CONFIG_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
3705 #endif
3707 ///
3708 /// v->arch.paging.shadow.l3table
3709 ///
3710 #if SHADOW_PAGING_LEVELS == 3
3712 mfn_t smfn;
3713 int i;
3714 for ( i = 0; i < 4; i++ )
3716 #if GUEST_PAGING_LEVELS == 2
3717 /* 2-on-3: make a PAE l3 that points at the four-page l2 */
3718 smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
3719 #else
3720 /* 3-on-3: make a PAE l3 that points at the four l2 pages */
3721 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3722 #endif
3723 v->arch.paging.shadow.l3table[i] =
3724 (mfn_x(smfn) == 0)
3725 ? shadow_l3e_empty()
3726 : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
3729 #endif /* SHADOW_PAGING_LEVELS == 3 */
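    /* The 2-on-3 case relies on the l2 shadow of a 2-level guest being
     * allocated as four consecutive pages, so shadow_table[0]'s pfn plus i
     * addresses page i of that shadow; this appears to be an allocation
     * invariant of the shadow pool rather than something checked here. */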
3732 ///
3733 /// v->arch.cr3
3734 ///
3735 if ( shadow_mode_external(d) )
3737 make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
3739 else // not shadow_mode_external...
3741 /* We don't support PV except guest == shadow == config levels */
3742 BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
3743 #if SHADOW_PAGING_LEVELS == 3
3744 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
3745 * Don't use make_cr3 because (a) we know it's below 4GB, and
3746 * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
3747 ASSERT(virt_to_maddr(&v->arch.paging.shadow.l3table) <= 0xffffffe0ULL);
3748 v->arch.cr3 = virt_to_maddr(&v->arch.paging.shadow.l3table);
3749 #else
3750 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3751 make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
3752 #endif
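    /* The 0xffffffe0 bound in the ASSERT above reflects the hardware PAE CR3
     * format: CR3 holds a 32-byte-aligned physical address of the 4-entry l3
     * table, expressible in 32 bits, so the table must sit below 4GB. */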
3756 ///
3757 /// v->arch.hvm_vcpu.hw_cr[3]
3758 ///
3759 if ( shadow_mode_external(d) )
3761 ASSERT(is_hvm_domain(d));
3762 #if SHADOW_PAGING_LEVELS == 3
3763 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
3764 v->arch.hvm_vcpu.hw_cr[3] =
3765 virt_to_maddr(&v->arch.paging.shadow.l3table);
3766 #else
3767 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3768 v->arch.hvm_vcpu.hw_cr[3] =
3769 pagetable_get_paddr(v->arch.shadow_table[0]);
3770 #endif
3771 hvm_update_guest_cr(v, 3);
3774 /* Fix up the linear pagetable mappings */
3775 sh_update_linear_entries(v);
3777 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3778 /* No longer safe to use cached gva->gfn translations */
3779 vtlb_flush(v);
3780 #endif
3782 /* Release the lock, if we took it (otherwise it's the caller's problem) */
3783 if ( do_locking ) shadow_unlock(v->domain);
3787 /**************************************************************************/
3788 /* Functions to revoke guest rights */
3790 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3791 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
3792 /* Look up this vaddr in the current shadow and see if it's a writeable
3793 * mapping of this gmfn. If so, revoke its write permission. Returns 1 if it worked. */
3795 shadow_l1e_t sl1e, *sl1p;
3796 shadow_l2e_t *sl2p;
3797 #if SHADOW_PAGING_LEVELS >= 3
3798 shadow_l3e_t *sl3p;
3799 #if SHADOW_PAGING_LEVELS >= 4
3800 shadow_l4e_t *sl4p;
3801 #endif
3802 #endif
3803 mfn_t sl1mfn;
3804 int r;
3806 /* Carefully look in the shadow linear map for the l1e we expect */
3807 #if SHADOW_PAGING_LEVELS >= 4
3808 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
3809 if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
3810 return 0;
3811 sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
3812 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3813 return 0;
3814 #elif SHADOW_PAGING_LEVELS == 3
3815 sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table)
3816 + shadow_l3_linear_offset(vaddr);
3817 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3818 return 0;
3819 #endif
3820 sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
3821 if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
3822 return 0;
3823 sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
3824 sl1e = *sl1p;
3825 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
3826 != (_PAGE_PRESENT|_PAGE_RW))
3827 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
3828 return 0;
3830 /* Found it! Need to remove its write permissions. */
3831 sl1mfn = shadow_l2e_get_mfn(*sl2p);
3832 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
3833 r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
3834 ASSERT( !(r & SHADOW_SET_ERROR) );
3835 return 1;
3837 #endif
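/* sh_guess_wrmap() is the fast path of the writeable-mapping heuristic: it
 * is exported below as .shadow.guess_wrmap, and a caller that has a guessed
 * vaddr for the writeable mapping can try it before falling back to a full
 * search of the shadows. */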
3839 int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn,
3840 mfn_t readonly_mfn)
3841 /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
3843 shadow_l1e_t *sl1e;
3844 int done = 0;
3845 int flags;
3846 mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
3848 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3850 flags = shadow_l1e_get_flags(*sl1e);
3851 if ( (flags & _PAGE_PRESENT)
3852 && (flags & _PAGE_RW)
3853 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
3855 shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW);
3856 (void) shadow_set_l1e(v, sl1e, ro_sl1e, sl1mfn);
3857 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3858 /* Remember the last shadow that we shot a writeable mapping in */
3859 v->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
3860 #endif
3861 if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
3862 & PGT_count_mask) == 0 )
3863 /* This breaks us cleanly out of the FOREACH macro */
3864 done = 1;
3866 });
3867 return done;
3871 int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
3872 /* Excises all mappings to guest frame from this shadow l1 table */
3874 shadow_l1e_t *sl1e;
3875 int done = 0;
3876 int flags;
3878 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3880 flags = shadow_l1e_get_flags(*sl1e);
3881 if ( (flags & _PAGE_PRESENT)
3882 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
3884 (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
3885 if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
3886 /* This breaks us cleanly out of the FOREACH macro */
3887 done = 1;
3889 });
3890 return done;
3893 /**************************************************************************/
3894 /* Functions to excise all pointers to shadows from higher-level shadows. */
3896 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
3897 /* Blank out a single shadow entry */
3899 switch ( mfn_to_shadow_page(smfn)->type )
3901 case SH_type_l1_shadow:
3902 (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
3903 case SH_type_l2_shadow:
3904 #if GUEST_PAGING_LEVELS >= 3
3905 case SH_type_l2h_shadow:
3906 #endif
3907 (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
3908 #if GUEST_PAGING_LEVELS >= 4
3909 case SH_type_l3_shadow:
3910 (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
3911 case SH_type_l4_shadow:
3912 (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
3913 #endif
3914 default: BUG(); /* Called with the wrong kind of shadow. */
3918 int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
3919 /* Remove all mappings of this l1 shadow from this l2 shadow */
3921 shadow_l2e_t *sl2e;
3922 int done = 0;
3923 int flags;
3925 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, v->domain,
3927 flags = shadow_l2e_get_flags(*sl2e);
3928 if ( (flags & _PAGE_PRESENT)
3929 && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
3931 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
3932 if ( mfn_to_shadow_page(sl1mfn)->type == 0 )
3933 /* This breaks us cleanly out of the FOREACH macro */
3934 done = 1;
3936 });
3937 return done;
3940 #if GUEST_PAGING_LEVELS >= 4
3941 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
3942 /* Remove all mappings of this l2 shadow from this l3 shadow */
3944 shadow_l3e_t *sl3e;
3945 int done = 0;
3946 int flags;
3948 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
3950 flags = shadow_l3e_get_flags(*sl3e);
3951 if ( (flags & _PAGE_PRESENT)
3952 && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
3954 (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
3955 if ( mfn_to_shadow_page(sl2mfn)->type == 0 )
3956 /* This breaks us cleanly out of the FOREACH macro */
3957 done = 1;
3959 });
3960 return done;
3963 int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
3964 /* Remove all mappings of this l3 shadow from this l4 shadow */
3966 shadow_l4e_t *sl4e;
3967 int done = 0;
3968 int flags;
3970 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, v->domain,
3972 flags = shadow_l4e_get_flags(*sl4e);
3973 if ( (flags & _PAGE_PRESENT)
3974 && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
3976 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
3977 if ( mfn_to_shadow_page(sl3mfn)->type == 0 )
3978 /* This breaks us cleanly out of the FOREACH macro */
3979 done = 1;
3981 });
3982 return done;
3984 #endif /* 64bit guest */
3986 /**************************************************************************/
3987 /* Handling HVM guest writes to pagetables */
3989 /* Translate a VA to an MFN, injecting a page-fault if we fail */
3990 #define BAD_GVA_TO_GFN (~0UL)
3991 #define BAD_GFN_TO_MFN (~1UL)
3992 static mfn_t emulate_gva_to_mfn(struct vcpu *v,
3993 unsigned long vaddr,
3994 struct sh_emulate_ctxt *sh_ctxt)
3996 unsigned long gfn;
3997 mfn_t mfn;
3998 p2m_type_t p2mt;
3999 uint32_t pfec = PFEC_page_present | PFEC_write_access;
4001 /* Translate the VA to a GFN */
4002 gfn = sh_gva_to_gfn(v, vaddr, &pfec);
4003 if ( gfn == INVALID_GFN )
4005 if ( is_hvm_vcpu(v) )
4006 hvm_inject_exception(TRAP_page_fault, pfec, vaddr);
4007 else
4008 propagate_page_fault(vaddr, pfec);
4009 return _mfn(BAD_GVA_TO_GFN);
4012 /* Translate the GFN to an MFN */
4013 mfn = gfn_to_mfn(v->domain, _gfn(gfn), &p2mt);
4014 if ( p2m_is_ram(p2mt) )
4016 ASSERT(mfn_valid(mfn));
4017 v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
4018 return mfn;
4021 return _mfn(BAD_GFN_TO_MFN);
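/* Note on the encoding above: BAD_GVA_TO_GFN (~0UL) and BAD_GFN_TO_MFN
 * (~1UL) both fail mfn_valid(), so callers can test the result with
 * mfn_valid() and then compare mfn_x() against the two values to tell a
 * guest-visible fault from an unhandleable translation. */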
4024 /* Check that the user is allowed to perform this write.
4025 * Returns a mapped pointer to write to, or an error value below. */
4026 #define MAPPING_UNHANDLEABLE ((void *)0)
4027 #define MAPPING_EXCEPTION ((void *)1)
4028 #define emulate_map_dest_failed(rc) ((unsigned long)(rc) <= 1)
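/* Similarly, MAPPING_UNHANDLEABLE and MAPPING_EXCEPTION are addresses (0 and
 * 1) that can never be returned as real mappings, so emulate_map_dest_failed()
 * catches both error cases with a single unsigned comparison. */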
4029 static void *emulate_map_dest(struct vcpu *v,
4030 unsigned long vaddr,
4031 u32 bytes,
4032 struct sh_emulate_ctxt *sh_ctxt)
4034 struct segment_register *sreg;
4035 unsigned long offset;
4036 void *map = NULL;
4038 /* We don't emulate user-mode writes to page tables */
4039 sreg = hvm_get_seg_reg(x86_seg_ss, sh_ctxt);
4040 if ( sreg->attr.fields.dpl == 3 )
4041 return MAPPING_UNHANDLEABLE;
4043 sh_ctxt->mfn1 = emulate_gva_to_mfn(v, vaddr, sh_ctxt);
4044 if ( !mfn_valid(sh_ctxt->mfn1) )
4045 return ((mfn_x(sh_ctxt->mfn1) == BAD_GVA_TO_GFN) ?
4046 MAPPING_EXCEPTION : MAPPING_UNHANDLEABLE);
4048 /* An unaligned write probably means this isn't a pagetable */
4049 if ( vaddr & (bytes - 1) )
4050 sh_remove_shadows(v, sh_ctxt->mfn1, 0, 0 /* Slow, can fail */ );
4052 if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) )
4054 /* Whole write fits on a single page */
4055 sh_ctxt->mfn2 = _mfn(INVALID_MFN);
4056 map = sh_map_domain_page(sh_ctxt->mfn1) + (vaddr & ~PAGE_MASK);
4058 else
4060 /* Cross-page emulated writes are only supported for HVM guests;
4061 * PV guests ought to know better */
4062 if ( !is_hvm_vcpu(v) )
4063 return MAPPING_UNHANDLEABLE;
4065 /* This write crosses a page boundary. Translate the second page */
4066 sh_ctxt->mfn2 = emulate_gva_to_mfn(v, (vaddr + bytes - 1) & PAGE_MASK,
4067 sh_ctxt);
4068 if ( !mfn_valid(sh_ctxt->mfn2) )
4069 return ((mfn_x(sh_ctxt->mfn2) == BAD_GVA_TO_GFN) ?
4070 MAPPING_EXCEPTION : MAPPING_UNHANDLEABLE);
4072 /* A cross-page write probably means this isn't a pagetable */
4073 sh_remove_shadows(v, sh_ctxt->mfn2, 0, 0 /* Slow, can fail */ );
4075 /* Hack: we map the pages into the vcpu's LDT space, since we
4076 * know that we're not going to need the LDT for HVM guests,
4077 * and only HVM guests are allowed unaligned writes. */
4078 ASSERT(is_hvm_vcpu(v));
4079 map = (void *)LDT_VIRT_START(v);
4080 offset = l1_linear_offset((unsigned long) map);
4081 l1e_write(&__linear_l1_table[offset],
4082 l1e_from_pfn(mfn_x(sh_ctxt->mfn1), __PAGE_HYPERVISOR));
4083 l1e_write(&__linear_l1_table[offset + 1],
4084 l1e_from_pfn(mfn_x(sh_ctxt->mfn2), __PAGE_HYPERVISOR));
4085 flush_tlb_local();
4086 map += (vaddr & ~PAGE_MASK);
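        /* The two l1e writes above map the two frames into consecutive pages
         * of the vcpu's (unused) LDT area, giving a virtually contiguous
         * window across the page boundary; emulate_unmap_dest() tears this
         * mapping down again once the write has been validated. */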
4089 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4090 /* Remember if the bottom bit was clear, so we can choose not to run
4091 * the change through the verify code if it's still clear afterwards */
4092 sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT);
4093 #endif
4095 return map;
4098 /* Tidy up after the emulated write: mark pages dirty, verify the new
4099 * contents, and undo the mapping */
4100 static void emulate_unmap_dest(struct vcpu *v,
4101 void *addr,
4102 u32 bytes,
4103 struct sh_emulate_ctxt *sh_ctxt)
4105 u32 b1 = bytes, b2 = 0, shflags;
4107 ASSERT(mfn_valid(sh_ctxt->mfn1));
4109 /* If we are writing lots of PTE-aligned zeros, we might want to unshadow */
4110 if ( likely(bytes >= 4)
4111 && (*(u32 *)addr == 0)
4112 && ((unsigned long) addr & ((sizeof (guest_intpte_t)) - 1)) == 0 )
4113 check_for_early_unshadow(v, sh_ctxt->mfn1);
4114 else
4115 reset_early_unshadow(v);
4117 /* We can avoid re-verifying the page contents after the write if:
4118 * - the write was no larger than one PTE of this pagetable's type;
4119 * - it was aligned to a PTE boundary; and
4120 * - _PAGE_PRESENT was clear before and after the write. */
4121 shflags = mfn_to_page(sh_ctxt->mfn1)->shadow_flags;
4122 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4123 if ( sh_ctxt->low_bit_was_clear
4124 && !(*(u8 *)addr & _PAGE_PRESENT)
4125 && ((!(shflags & SHF_32)
4126 /* Not shadowed 32-bit: aligned 64-bit writes that leave
4127 * the present bit unset are safe to ignore. */
4128 && ((unsigned long)addr & 7) == 0
4129 && bytes <= 8)
4130 ||
4131 (!(shflags & (SHF_PAE|SHF_64))
4132 /* Not shadowed PAE/64-bit: aligned 32-bit writes that
4133 * leave the present bit unset are safe to ignore. */
4134 && ((unsigned long)addr & 3) == 0
4135 && bytes <= 4)) )
4137 /* Writes with this alignment constraint can't possibly cross pages */
4138 ASSERT(!mfn_valid(sh_ctxt->mfn2));
4140 else
4141 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */
4143 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4145 /* Validate as two writes, one to each page */
4146 b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK);
4147 b2 = bytes - b1;
4148 ASSERT(b2 < bytes);
4150 if ( likely(b1 > 0) )
4151 sh_validate_guest_pt_write(v, sh_ctxt->mfn1, addr, b1);
4152 if ( unlikely(b2 > 0) )
4153 sh_validate_guest_pt_write(v, sh_ctxt->mfn2, addr + b1, b2);
4156 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn1));
4158 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4160 unsigned long offset;
4161 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn2));
4162 /* Undo the hacky two-frame contiguous map. */
4163 ASSERT(((unsigned long) addr & PAGE_MASK) == LDT_VIRT_START(v));
4164 offset = l1_linear_offset((unsigned long) addr);
4165 l1e_write(&__linear_l1_table[offset], l1e_empty());
4166 l1e_write(&__linear_l1_table[offset + 1], l1e_empty());
4167 flush_tlb_all();
4169 else
4170 sh_unmap_domain_page(addr);
4173 int
4174 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
4175 u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
4177 void *addr;
4179 /* Unaligned writes are only acceptable on HVM */
4180 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4181 return X86EMUL_UNHANDLEABLE;
4183 shadow_lock(v->domain);
4184 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4185 if ( emulate_map_dest_failed(addr) )
4187 shadow_unlock(v->domain);
4188 return ((addr == MAPPING_EXCEPTION) ?
4189 X86EMUL_EXCEPTION : X86EMUL_UNHANDLEABLE);
4192 memcpy(addr, src, bytes);
4194 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
4195 shadow_audit_tables(v);
4196 shadow_unlock(v->domain);
4197 return X86EMUL_OKAY;
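/* The emulated-write path is: take the shadow lock, map the destination with
 * emulate_map_dest(), perform the access, then let emulate_unmap_dest()
 * re-validate the shadows and mark the page(s) dirty before unlocking.  The
 * cmpxchg variants below follow the same pattern. */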
4200 int
4201 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
4202 unsigned long old, unsigned long new,
4203 unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
4205 void *addr;
4206 unsigned long prev;
4207 int rv = X86EMUL_OKAY;
4209 /* Unaligned writes are only acceptable on HVM */
4210 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4211 return X86EMUL_UNHANDLEABLE;
4213 shadow_lock(v->domain);
4215 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4216 if ( emulate_map_dest_failed(addr) )
4218 shadow_unlock(v->domain);
4219 return ((addr == MAPPING_EXCEPTION) ?
4220 X86EMUL_EXCEPTION : X86EMUL_UNHANDLEABLE);
4223 switch ( bytes )
4225 case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
4226 case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
4227 case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
4228 case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
4229 default:
4230 SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
4231 prev = ~old;
4234 if ( prev != old )
4235 rv = X86EMUL_CMPXCHG_FAILED;
4237 SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
4238 " wanted %#lx now %#lx bytes %u\n",
4239 vaddr, prev, old, new, *(unsigned long *)addr, bytes);
4241 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
4242 shadow_audit_tables(v);
4243 shadow_unlock(v->domain);
4244 return rv;
4247 int
4248 sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
4249 unsigned long old_lo, unsigned long old_hi,
4250 unsigned long new_lo, unsigned long new_hi,
4251 struct sh_emulate_ctxt *sh_ctxt)
4253 void *addr;
4254 u64 old, new, prev;
4255 int rv = X86EMUL_OKAY;
4257 /* Unaligned writes are only acceptable on HVM */
4258 if ( (vaddr & 7) && !is_hvm_vcpu(v) )
4259 return X86EMUL_UNHANDLEABLE;
4261 shadow_lock(v->domain);
4263 addr = emulate_map_dest(v, vaddr, 8, sh_ctxt);
4264 if ( emulate_map_dest_failed(addr) )
4266 shadow_unlock(v->domain);
4267 return ((addr == MAPPING_EXCEPTION) ?
4268 X86EMUL_EXCEPTION : X86EMUL_UNHANDLEABLE);
4271 old = (((u64) old_hi) << 32) | (u64) old_lo;
4272 new = (((u64) new_hi) << 32) | (u64) new_lo;
4273 prev = cmpxchg(((u64 *)addr), old, new);
4275 if ( prev != old )
4276 rv = X86EMUL_CMPXCHG_FAILED;
4278 emulate_unmap_dest(v, addr, 8, sh_ctxt);
4279 shadow_audit_tables(v);
4280 shadow_unlock(v->domain);
4281 return rv;
4285 /**************************************************************************/
4286 /* Audit tools */
4288 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
4290 #define AUDIT_FAIL(_level, _fmt, _a...) do { \
4291 printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
4292 "gl" #_level "mfn = %" PRI_mfn \
4293 " sl" #_level "mfn = %" PRI_mfn \
4294 " &gl" #_level "e = %p &sl" #_level "e = %p" \
4295 " gl" #_level "e = %" SH_PRI_gpte \
4296 " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
4297 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4298 _level, guest_index(gl ## _level ## e), \
4299 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4300 gl ## _level ## e, sl ## _level ## e, \
4301 gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
4302 ##_a); \
4303 BUG(); \
4304 done = 1; \
4305 } while (0)
4308 static char * sh_audit_flags(struct vcpu *v, int level,
4309 int gflags, int sflags)
4310 /* Common code for auditing flag bits */
4312 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
4313 return "shadow is present but guest is not present";
4314 if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
4315 return "global bit set in PV shadow";
4316 if ( level == 2 && (sflags & _PAGE_PSE) )
4317 return "PS bit set in shadow";
4318 #if SHADOW_PAGING_LEVELS == 3
4319 if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */
4320 #endif
4321 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
4322 return "accessed bit not propagated";
4323 if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
4324 && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
4325 return "dirty bit not propagated";
4326 if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
4327 return "user/supervisor bit does not match";
4328 if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
4329 return "NX bit does not match";
4330 if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
4331 return "shadow grants write access but guest does not";
4332 return NULL;
4335 static inline mfn_t
4336 audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn)
4337 /* Convert this gfn to an mfn in the manner appropriate for the
4338 * guest pagetable it's used in (gmfn) */
4340 p2m_type_t p2mt;
4341 if ( !shadow_mode_translate(v->domain) )
4342 return _mfn(gfn_x(gfn));
4344 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask)
4345 != PGT_writable_page )
4346 return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */
4347 else
4348 return gfn_to_mfn(v->domain, gfn, &p2mt);
4352 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4354 guest_l1e_t *gl1e, *gp;
4355 shadow_l1e_t *sl1e;
4356 mfn_t mfn, gmfn, gl1mfn;
4357 gfn_t gfn;
4358 char *s;
4359 int done = 0;
4361 /* Follow the backpointer */
4362 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
4363 gl1e = gp = sh_map_domain_page(gl1mfn);
4364 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
4366 if ( sh_l1e_is_magic(*sl1e) )
4368 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
4369 if ( sh_l1e_is_gnp(*sl1e) )
4371 if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
4372 AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
4374 else
4376 ASSERT(sh_l1e_is_mmio(*sl1e));
4377 gfn = sh_l1e_mmio_get_gfn(*sl1e);
4378 if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
4379 AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
4380 " but guest gfn is %" SH_PRI_gfn,
4381 gfn_x(gfn),
4382 gfn_x(guest_l1e_get_gfn(*gl1e)));
4384 #endif
4386 else
4388 s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
4389 shadow_l1e_get_flags(*sl1e));
4390 if ( s ) AUDIT_FAIL(1, "%s", s);
4392 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4394 gfn = guest_l1e_get_gfn(*gl1e);
4395 mfn = shadow_l1e_get_mfn(*sl1e);
4396 gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
4397 if ( mfn_x(gmfn) != mfn_x(mfn) )
4398 AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
4399 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4400 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4403 });
4404 sh_unmap_domain_page(gp);
4405 return done;
4408 int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4410 guest_l1e_t *gl1e, e;
4411 shadow_l1e_t *sl1e;
4412 mfn_t gl1mfn = _mfn(INVALID_MFN);
4413 int f;
4414 int done = 0;
4416 /* fl1 has no useful backpointer: all we can check are flags */
4417 e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
4418 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
4419 f = shadow_l1e_get_flags(*sl1e);
4420 f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
4421 if ( !(f == 0
4422 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
4423 _PAGE_ACCESSED|_PAGE_DIRTY)
4424 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
4425 || sh_l1e_is_magic(*sl1e)) )
4426 AUDIT_FAIL(1, "fl1e has bad flags");
4427 });
4428 return 0;
4431 int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
4433 guest_l2e_t *gl2e, *gp;
4434 shadow_l2e_t *sl2e;
4435 mfn_t mfn, gmfn, gl2mfn;
4436 gfn_t gfn;
4437 char *s;
4438 int done = 0;
4440 /* Follow the backpointer */
4441 gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
4442 gl2e = gp = sh_map_domain_page(gl2mfn);
4443 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
4445 s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
4446 shadow_l2e_get_flags(*sl2e));
4447 if ( s ) AUDIT_FAIL(2, "%s", s);
4449 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4451 gfn = guest_l2e_get_gfn(*gl2e);
4452 mfn = shadow_l2e_get_mfn(*sl2e);
4453 gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
4454 ? get_fl1_shadow_status(v, gfn)
4455 : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn),
4456 SH_type_l1_shadow);
4457 if ( mfn_x(gmfn) != mfn_x(mfn) )
4458 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
4459 " (--> %" PRI_mfn ")"
4460 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4461 gfn_x(gfn),
4462 (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
4463 : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
4464 mfn_x(gmfn), mfn_x(mfn));
4466 });
4467 sh_unmap_domain_page(gp);
4468 return 0;
4471 #if GUEST_PAGING_LEVELS >= 4
4472 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
4474 guest_l3e_t *gl3e, *gp;
4475 shadow_l3e_t *sl3e;
4476 mfn_t mfn, gmfn, gl3mfn;
4477 gfn_t gfn;
4478 char *s;
4479 int done = 0;
4481 /* Follow the backpointer */
4482 gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
4483 gl3e = gp = sh_map_domain_page(gl3mfn);
4484 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
4486 s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
4487 shadow_l3e_get_flags(*sl3e));
4488 if ( s ) AUDIT_FAIL(3, "%s", s);
4490 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4492 gfn = guest_l3e_get_gfn(*gl3e);
4493 mfn = shadow_l3e_get_mfn(*sl3e);
4494 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn),
4495 ((GUEST_PAGING_LEVELS == 3 ||
4496 is_pv_32on64_vcpu(v))
4497 && !shadow_mode_external(v->domain)
4498 && (guest_index(gl3e) % 4) == 3)
4499 ? SH_type_l2h_shadow
4500 : SH_type_l2_shadow);
4501 if ( mfn_x(gmfn) != mfn_x(mfn) )
4502 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
4503 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4504 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4506 });
4507 sh_unmap_domain_page(gp);
4508 return 0;
4511 int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
4513 guest_l4e_t *gl4e, *gp;
4514 shadow_l4e_t *sl4e;
4515 mfn_t mfn, gmfn, gl4mfn;
4516 gfn_t gfn;
4517 char *s;
4518 int done = 0;
4520 /* Follow the backpointer */
4521 gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
4522 gl4e = gp = sh_map_domain_page(gl4mfn);
4523 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
4525 s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
4526 shadow_l4e_get_flags(*sl4e));
4527 if ( s ) AUDIT_FAIL(4, "%s", s);
4529 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4531 gfn = guest_l4e_get_gfn(*gl4e);
4532 mfn = shadow_l4e_get_mfn(*sl4e);
4533 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn),
4534 SH_type_l3_shadow);
4535 if ( mfn_x(gmfn) != mfn_x(mfn) )
4536 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
4537 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4538 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4540 });
4541 sh_unmap_domain_page(gp);
4542 return 0;
4544 #endif /* GUEST_PAGING_LEVELS >= 4 */
4547 #undef AUDIT_FAIL
4549 #endif /* Audit code */
4551 /**************************************************************************/
4552 /* Entry points into this mode of the shadow code.
4553 * This will all be mangled by the preprocessor to uniquify everything. */
4554 struct paging_mode sh_paging_mode = {
4555 .page_fault = sh_page_fault,
4556 .invlpg = sh_invlpg,
4557 .gva_to_gfn = sh_gva_to_gfn,
4558 .update_cr3 = sh_update_cr3,
4559 .update_paging_modes = shadow_update_paging_modes,
4560 .write_p2m_entry = shadow_write_p2m_entry,
4561 .write_guest_entry = shadow_write_guest_entry,
4562 .cmpxchg_guest_entry = shadow_cmpxchg_guest_entry,
4563 .guest_map_l1e = sh_guest_map_l1e,
4564 .guest_get_eff_l1e = sh_guest_get_eff_l1e,
4565 .guest_levels = GUEST_PAGING_LEVELS,
4566 .shadow.detach_old_tables = sh_detach_old_tables,
4567 .shadow.x86_emulate_write = sh_x86_emulate_write,
4568 .shadow.x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
4569 .shadow.x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
4570 .shadow.make_monitor_table = sh_make_monitor_table,
4571 .shadow.destroy_monitor_table = sh_destroy_monitor_table,
4572 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4573 .shadow.guess_wrmap = sh_guess_wrmap,
4574 #endif
4575 .shadow.shadow_levels = SHADOW_PAGING_LEVELS,
4576 };
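/* As the comment above says, the names here are uniquified by the
 * preprocessor; each GUEST/SHADOW paging-level combination presumably gets
 * its own copy of sh_paging_mode, selected at runtime when a paging mode is
 * chosen for a vcpu. */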
4578 /*
4579 * Local variables:
4580 * mode: C
4581 * c-set-style: "BSD"
4582 * c-basic-offset: 4
4583 * indent-tabs-mode: nil
4584 * End:
4585 */