
view xen/arch/x86/mm/shadow/common.c @ 21034:e9146e92df62

Revert C/S 20966 "Disable memory mapping warnings when stub domain is used."

It was earlier resolved by C/S 20720 and C/S 20751.
This fix was backported to xen-3.4-testing.

Signed-off-by: Daniel Kiper <dkiper@net-space.pl>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Mar 01 12:00:11 2010 +0000 (2010-03-01)
parents 50f7b55b69fc
children 19cc497863a4
line source
1 /******************************************************************************
2 * arch/x86/mm/shadow/common.c
3 *
4 * Shadow code that does not need to be multiply compiled.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/irq.h>
31 #include <xen/domain_page.h>
32 #include <xen/guest_access.h>
33 #include <xen/keyhandler.h>
34 #include <asm/event.h>
35 #include <asm/page.h>
36 #include <asm/current.h>
37 #include <asm/flushtlb.h>
38 #include <asm/shadow.h>
39 #include <xen/numa.h>
40 #include "private.h"
42 DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags);
44 /* Set up the shadow-specific parts of a domain struct at start of day.
45 * Called for every domain from arch_domain_create() */
46 void shadow_domain_init(struct domain *d, unsigned int domcr_flags)
47 {
48 int i;
49 shadow_lock_init(d);
50 for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
51 INIT_PAGE_LIST_HEAD(&d->arch.paging.shadow.freelists[i]);
52 INIT_PAGE_LIST_HEAD(&d->arch.paging.shadow.p2m_freelist);
53 INIT_PAGE_LIST_HEAD(&d->arch.paging.shadow.pinned_shadows);
55 /* Use shadow pagetables for log-dirty support */
56 paging_log_dirty_init(d, shadow_enable_log_dirty,
57 shadow_disable_log_dirty, shadow_clean_dirty_bitmap);
59 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
60 d->arch.paging.shadow.oos_active = 0;
61 d->arch.paging.shadow.oos_off = (domcr_flags & DOMCRF_oos_off) ? 1 : 0;
62 #endif
63 }
65 /* Set up the shadow-specific parts of a vcpu struct. Note: The most important
66 * job is to initialize the update_paging_modes() function pointer, which is
67 * used to initialize the rest of the resources. Therefore, it really does not
68 * matter to have v->arch.paging.mode pointing to any mode, as long as it can
69 * be compiled.
70 */
71 void shadow_vcpu_init(struct vcpu *v)
72 {
73 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
74 int i, j;
76 for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
77 {
78 v->arch.paging.shadow.oos[i] = _mfn(INVALID_MFN);
79 v->arch.paging.shadow.oos_snapshot[i] = _mfn(INVALID_MFN);
80 for ( j = 0; j < SHADOW_OOS_FIXUPS; j++ )
81 v->arch.paging.shadow.oos_fixup[i].smfn[j] = _mfn(INVALID_MFN);
82 }
83 #endif
85 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
86 }
88 #if SHADOW_AUDIT
89 int shadow_audit_enable = 0;
91 static void shadow_audit_key(unsigned char key)
92 {
93 shadow_audit_enable = !shadow_audit_enable;
94 printk("%s shadow_audit_enable=%d\n",
95 __func__, shadow_audit_enable);
96 }
98 static struct keyhandler shadow_audit_keyhandler = {
99 .u.fn = shadow_audit_key,
100 .desc = "toggle shadow audits"
101 };
103 static int __init shadow_audit_key_init(void)
104 {
105 register_keyhandler('O', &shadow_audit_keyhandler);
106 return 0;
107 }
108 __initcall(shadow_audit_key_init);
109 #endif /* SHADOW_AUDIT */
111 int _shadow_mode_refcounts(struct domain *d)
112 {
113 return shadow_mode_refcounts(d);
114 }
117 /**************************************************************************/
118 /* x86 emulator support for the shadow code
119 */
121 struct segment_register *hvm_get_seg_reg(
122 enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt)
123 {
124 struct segment_register *seg_reg = &sh_ctxt->seg_reg[seg];
125 if ( !__test_and_set_bit(seg, &sh_ctxt->valid_seg_regs) )
126 hvm_get_segment_register(current, seg, seg_reg);
127 return seg_reg;
128 }
130 static int hvm_translate_linear_addr(
131 enum x86_segment seg,
132 unsigned long offset,
133 unsigned int bytes,
134 enum hvm_access_type access_type,
135 struct sh_emulate_ctxt *sh_ctxt,
136 unsigned long *paddr)
137 {
138 struct segment_register *reg = hvm_get_seg_reg(seg, sh_ctxt);
139 int okay;
141 okay = hvm_virtual_to_linear_addr(
142 seg, reg, offset, bytes, access_type, sh_ctxt->ctxt.addr_size, paddr);
144 if ( !okay )
145 {
146 hvm_inject_exception(TRAP_gp_fault, 0, 0);
147 return X86EMUL_EXCEPTION;
148 }
150 return 0;
151 }
153 static int
154 hvm_read(enum x86_segment seg,
155 unsigned long offset,
156 void *p_data,
157 unsigned int bytes,
158 enum hvm_access_type access_type,
159 struct sh_emulate_ctxt *sh_ctxt)
160 {
161 unsigned long addr;
162 int rc;
164 rc = hvm_translate_linear_addr(
165 seg, offset, bytes, access_type, sh_ctxt, &addr);
166 if ( rc )
167 return rc;
169 if ( access_type == hvm_access_insn_fetch )
170 rc = hvm_fetch_from_guest_virt(p_data, addr, bytes, 0);
171 else
172 rc = hvm_copy_from_guest_virt(p_data, addr, bytes, 0);
174 switch ( rc )
175 {
176 case HVMCOPY_okay:
177 return X86EMUL_OKAY;
178 case HVMCOPY_bad_gva_to_gfn:
179 return X86EMUL_EXCEPTION;
180 case HVMCOPY_bad_gfn_to_mfn:
181 case HVMCOPY_unhandleable:
182 return X86EMUL_UNHANDLEABLE;
183 }
185 BUG();
186 return X86EMUL_UNHANDLEABLE;
187 }
189 static int
190 hvm_emulate_read(enum x86_segment seg,
191 unsigned long offset,
192 void *p_data,
193 unsigned int bytes,
194 struct x86_emulate_ctxt *ctxt)
195 {
196 if ( !is_x86_user_segment(seg) )
197 return X86EMUL_UNHANDLEABLE;
198 return hvm_read(seg, offset, p_data, bytes, hvm_access_read,
199 container_of(ctxt, struct sh_emulate_ctxt, ctxt));
200 }
202 static int
203 hvm_emulate_insn_fetch(enum x86_segment seg,
204 unsigned long offset,
205 void *p_data,
206 unsigned int bytes,
207 struct x86_emulate_ctxt *ctxt)
208 {
209 struct sh_emulate_ctxt *sh_ctxt =
210 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
211 unsigned int insn_off = offset - sh_ctxt->insn_buf_eip;
213 ASSERT(seg == x86_seg_cs);
215 /* Fall back if requested bytes are not in the prefetch cache. */
216 if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
217 return hvm_read(seg, offset, p_data, bytes,
218 hvm_access_insn_fetch, sh_ctxt);
220 /* Hit the cache. Simple memcpy. */
221 memcpy(p_data, &sh_ctxt->insn_buf[insn_off], bytes);
222 return X86EMUL_OKAY;
223 }
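/*
 * Illustrative sketch, not part of the original file: the cache-hit test
 * used by hvm_emulate_insn_fetch() above, pulled out on its own.  The
 * parameter names buf_eip and buf_bytes stand in for sh_ctxt->insn_buf_eip
 * and sh_ctxt->insn_buf_bytes.
 */
static inline int insn_fetch_cache_hit_example(unsigned long offset,
                                               unsigned int bytes,
                                               unsigned long buf_eip,
                                               unsigned int buf_bytes)
{
    unsigned int insn_off = offset - buf_eip;
    /* Hit only if the whole requested range lies inside the prefetch buffer;
     * an offset below buf_eip wraps insn_off to a huge value and so misses. */
    return (insn_off + bytes) <= buf_bytes;
}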
225 static int
226 hvm_emulate_write(enum x86_segment seg,
227 unsigned long offset,
228 void *p_data,
229 unsigned int bytes,
230 struct x86_emulate_ctxt *ctxt)
231 {
232 struct sh_emulate_ctxt *sh_ctxt =
233 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
234 struct vcpu *v = current;
235 unsigned long addr;
236 int rc;
238 if ( !is_x86_user_segment(seg) )
239 return X86EMUL_UNHANDLEABLE;
241 /* How many emulations could we save if we unshadowed on stack writes? */
242 if ( seg == x86_seg_ss )
243 perfc_incr(shadow_fault_emulate_stack);
245 rc = hvm_translate_linear_addr(
246 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
247 if ( rc )
248 return rc;
250 return v->arch.paging.mode->shadow.x86_emulate_write(
251 v, addr, p_data, bytes, sh_ctxt);
252 }
254 static int
255 hvm_emulate_cmpxchg(enum x86_segment seg,
256 unsigned long offset,
257 void *p_old,
258 void *p_new,
259 unsigned int bytes,
260 struct x86_emulate_ctxt *ctxt)
261 {
262 struct sh_emulate_ctxt *sh_ctxt =
263 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
264 struct vcpu *v = current;
265 unsigned long addr, old[2], new[2];
266 int rc;
268 if ( !is_x86_user_segment(seg) )
269 return X86EMUL_UNHANDLEABLE;
271 rc = hvm_translate_linear_addr(
272 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
273 if ( rc )
274 return rc;
276 old[0] = new[0] = 0;
277 memcpy(old, p_old, bytes);
278 memcpy(new, p_new, bytes);
280 if ( bytes <= sizeof(long) )
281 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
282 v, addr, old[0], new[0], bytes, sh_ctxt);
284 #ifdef __i386__
285 if ( bytes == 8 )
286 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
287 v, addr, old[0], old[1], new[0], new[1], sh_ctxt);
288 #endif
290 return X86EMUL_UNHANDLEABLE;
291 }
293 static const struct x86_emulate_ops hvm_shadow_emulator_ops = {
294 .read = hvm_emulate_read,
295 .insn_fetch = hvm_emulate_insn_fetch,
296 .write = hvm_emulate_write,
297 .cmpxchg = hvm_emulate_cmpxchg,
298 };
300 static int
301 pv_emulate_read(enum x86_segment seg,
302 unsigned long offset,
303 void *p_data,
304 unsigned int bytes,
305 struct x86_emulate_ctxt *ctxt)
306 {
307 unsigned int rc;
309 if ( !is_x86_user_segment(seg) )
310 return X86EMUL_UNHANDLEABLE;
312 if ( (rc = copy_from_user(p_data, (void *)offset, bytes)) != 0 )
313 {
314 propagate_page_fault(offset + bytes - rc, 0); /* read fault */
315 return X86EMUL_EXCEPTION;
316 }
318 return X86EMUL_OKAY;
319 }
321 static int
322 pv_emulate_write(enum x86_segment seg,
323 unsigned long offset,
324 void *p_data,
325 unsigned int bytes,
326 struct x86_emulate_ctxt *ctxt)
327 {
328 struct sh_emulate_ctxt *sh_ctxt =
329 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
330 struct vcpu *v = current;
331 if ( !is_x86_user_segment(seg) )
332 return X86EMUL_UNHANDLEABLE;
333 return v->arch.paging.mode->shadow.x86_emulate_write(
334 v, offset, p_data, bytes, sh_ctxt);
335 }
337 static int
338 pv_emulate_cmpxchg(enum x86_segment seg,
339 unsigned long offset,
340 void *p_old,
341 void *p_new,
342 unsigned int bytes,
343 struct x86_emulate_ctxt *ctxt)
344 {
345 struct sh_emulate_ctxt *sh_ctxt =
346 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
347 unsigned long old[2], new[2];
348 struct vcpu *v = current;
350 if ( !is_x86_user_segment(seg) )
351 return X86EMUL_UNHANDLEABLE;
353 old[0] = new[0] = 0;
354 memcpy(old, p_old, bytes);
355 memcpy(new, p_new, bytes);
357 if ( bytes <= sizeof(long) )
358 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
359 v, offset, old[0], new[0], bytes, sh_ctxt);
361 #ifdef __i386__
362 if ( bytes == 8 )
363 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
364 v, offset, old[0], old[1], new[0], new[1], sh_ctxt);
365 #endif
367 return X86EMUL_UNHANDLEABLE;
368 }
370 static const struct x86_emulate_ops pv_shadow_emulator_ops = {
371 .read = pv_emulate_read,
372 .insn_fetch = pv_emulate_read,
373 .write = pv_emulate_write,
374 .cmpxchg = pv_emulate_cmpxchg,
375 };
377 const struct x86_emulate_ops *shadow_init_emulation(
378 struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs)
379 {
380 struct segment_register *creg, *sreg;
381 struct vcpu *v = current;
382 unsigned long addr;
384 sh_ctxt->ctxt.regs = regs;
385 sh_ctxt->ctxt.force_writeback = 0;
387 if ( !is_hvm_vcpu(v) )
388 {
389 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = BITS_PER_LONG;
390 return &pv_shadow_emulator_ops;
391 }
393 /* Segment cache initialisation. Primed with CS. */
394 sh_ctxt->valid_seg_regs = 0;
395 creg = hvm_get_seg_reg(x86_seg_cs, sh_ctxt);
397 /* Work out the emulation mode. */
398 if ( hvm_long_mode_enabled(v) && creg->attr.fields.l )
399 {
400 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = 64;
401 }
402 else
403 {
404 sreg = hvm_get_seg_reg(x86_seg_ss, sh_ctxt);
405 sh_ctxt->ctxt.addr_size = creg->attr.fields.db ? 32 : 16;
406 sh_ctxt->ctxt.sp_size = sreg->attr.fields.db ? 32 : 16;
407 }
409 /* Attempt to prefetch whole instruction. */
410 sh_ctxt->insn_buf_eip = regs->eip;
411 sh_ctxt->insn_buf_bytes =
412 (!hvm_translate_linear_addr(
413 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
414 hvm_access_insn_fetch, sh_ctxt, &addr) &&
415 !hvm_fetch_from_guest_virt_nofault(
416 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0))
417 ? sizeof(sh_ctxt->insn_buf) : 0;
419 return &hvm_shadow_emulator_ops;
420 }
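/*
 * Illustrative sketch, not part of the original file: a hedged outline of how
 * a caller (such as the page-fault handler in multi.c) is expected to drive
 * the emulation context set up above, assuming the generic x86_emulate()
 * entry point.  This is not the actual sh_page_fault() code.
 */
static inline int shadow_emulate_one_example(struct sh_emulate_ctxt *emul_ctxt,
                                             struct cpu_user_regs *regs)
{
    const struct x86_emulate_ops *emul_ops;

    /* Pick PV or HVM ops and prime the segment/prefetch caches. */
    emul_ops = shadow_init_emulation(emul_ctxt, regs);

    /* Run one instruction through the emulator using those callbacks. */
    return x86_emulate(&emul_ctxt->ctxt, emul_ops);
}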
422 /* Update an initialized emulation context to prepare for the next
423 * instruction */
424 void shadow_continue_emulation(struct sh_emulate_ctxt *sh_ctxt,
425 struct cpu_user_regs *regs)
426 {
427 struct vcpu *v = current;
428 unsigned long addr, diff;
430 /* We don't refetch the segment bases, because we don't emulate
431 * writes to segment registers */
433 if ( is_hvm_vcpu(v) )
434 {
435 diff = regs->eip - sh_ctxt->insn_buf_eip;
436 if ( diff > sh_ctxt->insn_buf_bytes )
437 {
438 /* Prefetch more bytes. */
439 sh_ctxt->insn_buf_bytes =
440 (!hvm_translate_linear_addr(
441 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
442 hvm_access_insn_fetch, sh_ctxt, &addr) &&
443 !hvm_fetch_from_guest_virt_nofault(
444 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0))
445 ? sizeof(sh_ctxt->insn_buf) : 0;
446 sh_ctxt->insn_buf_eip = regs->eip;
447 }
448 }
449 }
452 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
453 /**************************************************************************/
454 /* Out-of-sync shadows. */
456 /* From time to time, we let a shadowed pagetable page go out of sync
457 * with its shadow: the guest is allowed to write directly to the page,
458 * and those writes are not synchronously reflected in the shadow.
459 * This lets us avoid many emulations if the guest is writing a lot to a
460 * pagetable, but it relaxes a pretty important invariant in the shadow
461 * pagetable design. Therefore, some rules:
462 *
463 * 1. Only L1 pagetables may go out of sync: any page that is shadowed
464 * at a higher level must be synchronously updated. This makes
465 * using linear shadow pagetables much less dangerous.
466 * That means that: (a) unsyncing code needs to check for higher-level
467 * shadows, and (b) promotion code needs to resync.
468 *
469 * 2. All shadow operations on a guest page require the page to be brought
470 * back into sync before proceeding. This must be done under the
471 * shadow lock so that the page is guaranteed to remain synced until
472 * the operation completes.
473 *
474 * Exceptions to this rule: the pagefault and invlpg handlers may
475 * update only one entry on an out-of-sync page without resyncing it.
476 *
477 * 3. Operations on shadows that do not start from a guest page need to
478 * be aware that they may be handling an out-of-sync shadow.
479 *
480 * 4. Operations that do not normally take the shadow lock (fast-path
481 * #PF handler, INVLPG) must fall back to a locking, syncing version
482 * if they see an out-of-sync table.
483 *
484 * 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG)
485 * must explicitly resync all relevant pages or update their
486 * shadows.
487 *
488 * Currently out-of-sync pages are listed in a simple open-addressed
489 * hash table with a second chance (must resist temptation to radically
490 * over-engineer hash tables...) The virtual address of the access
491 * which caused us to unsync the page is also kept in the hash table, as
492 * a hint for finding the writable mappings later.
493 *
494 * We keep a hash per vcpu, because we want as much as possible to do
495 * the re-sync on the same vcpu we did the unsync on, so the VA hint
496 * will be valid.
497 */
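/*
 * Illustrative sketch, not part of the original file: the open-addressed,
 * second-chance probe that oos_hash_add(), oos_hash_remove(),
 * oos_snapshot_lookup() and sh_resync() below all repeat inline.  Returns
 * the hash slot holding gmfn, or -1 if it is not present.
 */
static inline int oos_hash_probe_example(const mfn_t *oos, mfn_t gmfn)
{
    int idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;

    if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
        idx = (idx + 1) % SHADOW_OOS_PAGES;   /* one second-chance probe */

    return (mfn_x(oos[idx]) == mfn_x(gmfn)) ? idx : -1;
}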
500 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
501 static void sh_oos_audit(struct domain *d)
502 {
503 int idx, expected_idx, expected_idx_alt;
504 struct page_info *pg;
505 struct vcpu *v;
507 for_each_vcpu(d, v)
508 {
509 for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
510 {
511 mfn_t *oos = v->arch.paging.shadow.oos;
512 if ( !mfn_valid(oos[idx]) )
513 continue;
515 expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES;
516 expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES);
517 if ( idx != expected_idx && idx != expected_idx_alt )
518 {
519 printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n",
520 __func__, idx, mfn_x(oos[idx]),
521 expected_idx, expected_idx_alt);
522 BUG();
523 }
524 pg = mfn_to_page(oos[idx]);
525 if ( !(pg->count_info & PGC_page_table) )
526 {
527 printk("%s: idx %x gmfn %lx not a pt (count %"PRIx32")\n",
528 __func__, idx, mfn_x(oos[idx]), pg->count_info);
529 BUG();
530 }
531 if ( !(pg->shadow_flags & SHF_out_of_sync) )
532 {
533 printk("%s: idx %x gmfn %lx not marked oos (flags %lx)\n",
534 __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
535 BUG();
536 }
537 if ( (pg->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY) )
538 {
539 printk("%s: idx %x gmfn %lx shadowed as non-l1 (flags %lx)\n",
540 __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
541 BUG();
542 }
543 }
544 }
545 }
546 #endif
548 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
549 void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn)
550 {
551 int idx;
552 struct vcpu *v;
553 mfn_t *oos;
555 ASSERT(mfn_is_out_of_sync(gmfn));
557 for_each_vcpu(d, v)
558 {
559 oos = v->arch.paging.shadow.oos;
560 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
561 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
562 idx = (idx + 1) % SHADOW_OOS_PAGES;
564 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
565 return;
566 }
568 SHADOW_ERROR("gmfn %lx marked OOS but not in hash table\n", mfn_x(gmfn));
569 BUG();
570 }
571 #endif
573 /* Update the shadow, but keep the page out of sync. */
574 static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn, mfn_t snpmfn)
575 {
576 struct page_info *pg = mfn_to_page(gmfn);
578 ASSERT(mfn_valid(gmfn));
579 ASSERT(page_is_out_of_sync(pg));
581 /* Call out to the appropriate per-mode resyncing function */
582 if ( pg->shadow_flags & SHF_L1_32 )
583 SHADOW_INTERNAL_NAME(sh_resync_l1, 2)(v, gmfn, snpmfn);
584 else if ( pg->shadow_flags & SHF_L1_PAE )
585 SHADOW_INTERNAL_NAME(sh_resync_l1, 3)(v, gmfn, snpmfn);
586 #if CONFIG_PAGING_LEVELS >= 4
587 else if ( pg->shadow_flags & SHF_L1_64 )
588 SHADOW_INTERNAL_NAME(sh_resync_l1, 4)(v, gmfn, snpmfn);
589 #endif
590 }
593 /*
594 * Fixup arrays: We limit the maximum number of writable mappings to
595 * SHADOW_OOS_FIXUPS and store enough information to remove them
596 * quickly on resync.
597 */
599 static inline int oos_fixup_flush_gmfn(struct vcpu *v, mfn_t gmfn,
600 struct oos_fixup *fixup)
601 {
602 int i;
603 for ( i = 0; i < SHADOW_OOS_FIXUPS; i++ )
604 {
605 if ( mfn_x(fixup->smfn[i]) != INVALID_MFN )
606 {
607 sh_remove_write_access_from_sl1p(v, gmfn,
608 fixup->smfn[i],
609 fixup->off[i]);
610 fixup->smfn[i] = _mfn(INVALID_MFN);
611 }
612 }
614 /* Always flush the TLBs. See comment on oos_fixup_add(). */
615 return 1;
616 }
618 void oos_fixup_add(struct vcpu *v, mfn_t gmfn,
619 mfn_t smfn, unsigned long off)
620 {
621 int idx, next;
622 mfn_t *oos;
623 struct oos_fixup *oos_fixup;
624 struct domain *d = v->domain;
626 perfc_incr(shadow_oos_fixup_add);
628 for_each_vcpu(d, v)
629 {
630 oos = v->arch.paging.shadow.oos;
631 oos_fixup = v->arch.paging.shadow.oos_fixup;
632 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
633 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
634 idx = (idx + 1) % SHADOW_OOS_PAGES;
635 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
636 {
637 int i;
638 for ( i = 0; i < SHADOW_OOS_FIXUPS; i++ )
639 {
640 if ( mfn_valid(oos_fixup[idx].smfn[i])
641 && (mfn_x(oos_fixup[idx].smfn[i]) == mfn_x(smfn))
642 && (oos_fixup[idx].off[i] == off) )
643 return;
644 }
646 next = oos_fixup[idx].next;
648 if ( mfn_x(oos_fixup[idx].smfn[next]) != INVALID_MFN )
649 {
650 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_EVICT);
652 /* Reuse this slot and remove current writable mapping. */
653 sh_remove_write_access_from_sl1p(v, gmfn,
654 oos_fixup[idx].smfn[next],
655 oos_fixup[idx].off[next]);
656 perfc_incr(shadow_oos_fixup_evict);
657 /* We should flush the TLBs now, because we removed a
658 writable mapping, but since the shadow is already
659 OOS we have no problem if another vcpu writes to
660 this page table. We just have to be very careful to
661 *always* flush the tlbs on resync. */
662 }
664 oos_fixup[idx].smfn[next] = smfn;
665 oos_fixup[idx].off[next] = off;
666 oos_fixup[idx].next = (next + 1) % SHADOW_OOS_FIXUPS;
668 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_ADD);
669 return;
670 }
671 }
673 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
674 BUG();
675 }
677 static int oos_remove_write_access(struct vcpu *v, mfn_t gmfn,
678 struct oos_fixup *fixup)
679 {
680 int ftlb = 0;
682 ftlb |= oos_fixup_flush_gmfn(v, gmfn, fixup);
684 switch ( sh_remove_write_access(v, gmfn, 0, 0) )
685 {
686 default:
687 case 0:
688 break;
690 case 1:
691 ftlb |= 1;
692 break;
694 case -1:
695 /* An unfindable writeable typecount has appeared, probably via a
696 * grant table entry: can't shoot the mapping, so try to unshadow
697 * the page. If that doesn't work either, the guest is granting
698 * his pagetables and must be killed after all.
699 * This will flush the tlb, so we can return with no worries. */
700 sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
701 return 1;
702 }
704 if ( ftlb )
705 flush_tlb_mask(&v->domain->domain_dirty_cpumask);
707 return 0;
708 }
711 static inline void trace_resync(int event, mfn_t gmfn)
712 {
713 if ( tb_init_done )
714 {
715 /* Convert gmfn to gfn */
716 unsigned long gfn = mfn_to_gfn(current->domain, gmfn);
717 __trace_var(event, 0/*!tsc*/, sizeof(gfn), (unsigned char*)&gfn);
718 }
719 }
721 /* Pull all the entries on an out-of-sync page back into sync. */
722 static void _sh_resync(struct vcpu *v, mfn_t gmfn,
723 struct oos_fixup *fixup, mfn_t snp)
724 {
725 struct page_info *pg = mfn_to_page(gmfn);
727 ASSERT(shadow_locked_by_me(v->domain));
728 ASSERT(mfn_is_out_of_sync(gmfn));
729 /* Guest page must be shadowed *only* as L1 when out of sync. */
730 ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask
731 & ~SHF_L1_ANY));
732 ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn)));
734 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
735 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
737 /* Need to pull write access so the page *stays* in sync. */
738 if ( oos_remove_write_access(v, gmfn, fixup) )
739 {
740 /* Page has been unshadowed. */
741 return;
742 }
744 /* No more writable mappings of this page, please */
745 pg->shadow_flags &= ~SHF_oos_may_write;
747 /* Update the shadows with current guest entries. */
748 _sh_resync_l1(v, gmfn, snp);
750 /* Now we know all the entries are synced, and will stay that way */
751 pg->shadow_flags &= ~SHF_out_of_sync;
752 perfc_incr(shadow_resync);
753 trace_resync(TRC_SHADOW_RESYNC_FULL, gmfn);
754 }
757 /* Add an MFN to the list of out-of-sync guest pagetables */
758 static void oos_hash_add(struct vcpu *v, mfn_t gmfn)
759 {
760 int i, idx, oidx, swap = 0;
761 void *gptr, *gsnpptr;
762 mfn_t *oos = v->arch.paging.shadow.oos;
763 mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
764 struct oos_fixup *oos_fixup = v->arch.paging.shadow.oos_fixup;
765 struct oos_fixup fixup = { .next = 0 };
767 for (i = 0; i < SHADOW_OOS_FIXUPS; i++ )
768 fixup.smfn[i] = _mfn(INVALID_MFN);
770 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
771 oidx = idx;
773 if ( mfn_valid(oos[idx])
774 && (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx )
775 {
776 /* Punt the current occupant into the next slot */
777 SWAP(oos[idx], gmfn);
778 SWAP(oos_fixup[idx], fixup);
779 swap = 1;
780 idx = (idx + 1) % SHADOW_OOS_PAGES;
781 }
782 if ( mfn_valid(oos[idx]) )
783 {
784 /* Crush the current occupant. */
785 _sh_resync(v, oos[idx], &oos_fixup[idx], oos_snapshot[idx]);
786 perfc_incr(shadow_unsync_evict);
787 }
788 oos[idx] = gmfn;
789 oos_fixup[idx] = fixup;
791 if ( swap )
792 SWAP(oos_snapshot[idx], oos_snapshot[oidx]);
794 gptr = sh_map_domain_page(oos[oidx]);
795 gsnpptr = sh_map_domain_page(oos_snapshot[oidx]);
796 memcpy(gsnpptr, gptr, PAGE_SIZE);
797 sh_unmap_domain_page(gptr);
798 sh_unmap_domain_page(gsnpptr);
799 }
801 /* Remove an MFN from the list of out-of-sync guest pagetables */
802 static void oos_hash_remove(struct vcpu *v, mfn_t gmfn)
803 {
804 int idx;
805 mfn_t *oos;
806 struct domain *d = v->domain;
808 SHADOW_PRINTK("D%dV%d gmfn %lx\n",
809 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
811 for_each_vcpu(d, v)
812 {
813 oos = v->arch.paging.shadow.oos;
814 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
815 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
816 idx = (idx + 1) % SHADOW_OOS_PAGES;
817 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
818 {
819 oos[idx] = _mfn(INVALID_MFN);
820 return;
821 }
822 }
824 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
825 BUG();
826 }
828 mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn)
829 {
830 int idx;
831 mfn_t *oos;
832 mfn_t *oos_snapshot;
833 struct domain *d = v->domain;
835 for_each_vcpu(d, v)
836 {
837 oos = v->arch.paging.shadow.oos;
838 oos_snapshot = v->arch.paging.shadow.oos_snapshot;
839 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
840 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
841 idx = (idx + 1) % SHADOW_OOS_PAGES;
842 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
843 {
844 return oos_snapshot[idx];
845 }
846 }
848 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
849 BUG();
850 return _mfn(INVALID_MFN);
851 }
853 /* Pull a single guest page back into sync */
854 void sh_resync(struct vcpu *v, mfn_t gmfn)
855 {
856 int idx;
857 mfn_t *oos;
858 mfn_t *oos_snapshot;
859 struct oos_fixup *oos_fixup;
860 struct domain *d = v->domain;
862 for_each_vcpu(d, v)
863 {
864 oos = v->arch.paging.shadow.oos;
865 oos_fixup = v->arch.paging.shadow.oos_fixup;
866 oos_snapshot = v->arch.paging.shadow.oos_snapshot;
867 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
868 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
869 idx = (idx + 1) % SHADOW_OOS_PAGES;
871 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
872 {
873 _sh_resync(v, gmfn, &oos_fixup[idx], oos_snapshot[idx]);
874 oos[idx] = _mfn(INVALID_MFN);
875 return;
876 }
877 }
879 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
880 BUG();
881 }
883 /* Figure out whether it's definitely safe not to sync this l1 table,
884 * by making a call out to the mode in which that shadow was made. */
885 static int sh_skip_sync(struct vcpu *v, mfn_t gl1mfn)
886 {
887 struct page_info *pg = mfn_to_page(gl1mfn);
888 if ( pg->shadow_flags & SHF_L1_32 )
889 return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 2)(v, gl1mfn);
890 else if ( pg->shadow_flags & SHF_L1_PAE )
891 return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 3)(v, gl1mfn);
892 #if CONFIG_PAGING_LEVELS >= 4
893 else if ( pg->shadow_flags & SHF_L1_64 )
894 return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 4)(v, gl1mfn);
895 #endif
896 SHADOW_ERROR("gmfn 0x%lx was OOS but not shadowed as an l1.\n",
897 mfn_x(gl1mfn));
898 BUG();
899 return 0; /* BUG() is no longer __attribute__((noreturn)). */
900 }
903 /* Pull all out-of-sync pages back into sync. Pages brought out of sync
904 * on other vcpus are allowed to remain out of sync, but their contents
905 * will be made safe (TLB flush semantics); pages unsynced by this vcpu
906 * are brought back into sync and write-protected. If skip != 0, we try
907 * to avoid resyncing at all if we think we can get away with it. */
908 void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking)
909 {
910 int idx;
911 struct vcpu *other;
912 mfn_t *oos = v->arch.paging.shadow.oos;
913 mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
914 struct oos_fixup *oos_fixup = v->arch.paging.shadow.oos_fixup;
916 SHADOW_PRINTK("d=%d, v=%d\n", v->domain->domain_id, v->vcpu_id);
918 ASSERT(do_locking || shadow_locked_by_me(v->domain));
920 if ( !this )
921 goto resync_others;
923 if ( do_locking )
924 shadow_lock(v->domain);
926 /* First: resync all of this vcpu's oos pages */
927 for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
928 if ( mfn_valid(oos[idx]) )
929 {
930 /* Write-protect and sync contents */
931 _sh_resync(v, oos[idx], &oos_fixup[idx], oos_snapshot[idx]);
932 oos[idx] = _mfn(INVALID_MFN);
933 }
935 if ( do_locking )
936 shadow_unlock(v->domain);
938 resync_others:
939 if ( !others )
940 return;
942 /* Second: make all *other* vcpus' oos pages safe. */
943 for_each_vcpu(v->domain, other)
944 {
945 if ( v == other )
946 continue;
948 if ( do_locking )
949 shadow_lock(v->domain);
951 oos = other->arch.paging.shadow.oos;
952 oos_fixup = other->arch.paging.shadow.oos_fixup;
953 oos_snapshot = other->arch.paging.shadow.oos_snapshot;
955 for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
956 {
957 if ( !mfn_valid(oos[idx]) )
958 continue;
960 if ( skip )
961 {
962 /* Update the shadows and leave the page OOS. */
963 if ( sh_skip_sync(v, oos[idx]) )
964 continue;
965 trace_resync(TRC_SHADOW_RESYNC_ONLY, oos[idx]);
966 _sh_resync_l1(other, oos[idx], oos_snapshot[idx]);
967 }
968 else
969 {
970 /* Write-protect and sync contents */
971 _sh_resync(other, oos[idx], &oos_fixup[idx], oos_snapshot[idx]);
972 oos[idx] = _mfn(INVALID_MFN);
973 }
974 }
976 if ( do_locking )
977 shadow_unlock(v->domain);
978 }
979 }
981 /* Allow a shadowed page to go out of sync. Unsyncs are traced in
982 * multi.c:sh_page_fault() */
983 int sh_unsync(struct vcpu *v, mfn_t gmfn)
984 {
985 struct page_info *pg;
987 ASSERT(shadow_locked_by_me(v->domain));
989 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
990 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
992 pg = mfn_to_page(gmfn);
994 /* Guest page must be shadowed *only* as L1 and *only* once when out
995 * of sync. Also, get out now if it's already out of sync.
996 * Also, can't safely unsync if some vcpus have paging disabled.*/
997 if ( pg->shadow_flags &
998 ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync)
999 || sh_page_has_multiple_shadows(pg)
1000 || !is_hvm_domain(v->domain)
1001 || !v->domain->arch.paging.shadow.oos_active )
1002 return 0;
1004 pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write;
1005 oos_hash_add(v, gmfn);
1006 perfc_incr(shadow_unsync);
1007 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_UNSYNC);
1008 return 1;
1011 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
1014 /**************************************************************************/
1015 /* Code for "promoting" a guest page to the point where the shadow code is
1016 * willing to let it be treated as a guest page table. This generally
1017 * involves making sure there are no writable mappings available to the guest
1018 * for this page.
1019 */
1020 void shadow_promote(struct vcpu *v, mfn_t gmfn, unsigned int type)
1022 struct page_info *page = mfn_to_page(gmfn);
1024 ASSERT(mfn_valid(gmfn));
1026 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1027 /* Is the page already shadowed and out of sync? */
1028 if ( page_is_out_of_sync(page) )
1029 sh_resync(v, gmfn);
1030 #endif
1032 /* We should never try to promote a gmfn that has writeable mappings */
1033 ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page
1034 || (page->u.inuse.type_info & PGT_count_mask) == 0
1035 || v->domain->is_shutting_down);
1037 /* Is the page already shadowed? */
1038 if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
1039 page->shadow_flags = 0;
1041 ASSERT(!test_bit(type, &page->shadow_flags));
1042 set_bit(type, &page->shadow_flags);
1043 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PROMOTE);
1046 void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
1048 struct page_info *page = mfn_to_page(gmfn);
1050 ASSERT(test_bit(_PGC_page_table, &page->count_info));
1051 ASSERT(test_bit(type, &page->shadow_flags));
1053 clear_bit(type, &page->shadow_flags);
1055 if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
1057 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1058 /* Was the page out of sync? */
1059 if ( page_is_out_of_sync(page) )
1061 oos_hash_remove(v, gmfn);
1063 #endif
1064 clear_bit(_PGC_page_table, &page->count_info);
1067 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_DEMOTE);
1070 /**************************************************************************/
1071 /* Validate a pagetable change from the guest and update the shadows.
1072 * Returns a bitmask of SHADOW_SET_* flags. */
1074 int
1075 sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size)
1077 int result = 0;
1078 struct page_info *page = mfn_to_page(gmfn);
1080 paging_mark_dirty(v->domain, mfn_x(gmfn));
1082 // Determine which types of shadows are affected, and update each.
1083 //
1084 // Always validate L1s before L2s to prevent another cpu with a linear
1085 // mapping of this gmfn from seeing a walk that results from
1086 // using the new L2 value and the old L1 value. (It is OK for such a
1087 // guest to see a walk that uses the old L2 value with the new L1 value,
1088 // as hardware could behave this way if one level of the pagewalk occurs
1089 // before the store, and the next level of the pagewalk occurs after the
1090 // store.)
1091 //
1092 // Ditto for L2s before L3s, etc.
1093 //
1095 if ( !(page->count_info & PGC_page_table) )
1096 return 0; /* Not shadowed at all */
1098 if ( page->shadow_flags & SHF_L1_32 )
1099 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2)
1100 (v, gmfn, entry, size);
1101 if ( page->shadow_flags & SHF_L2_32 )
1102 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2)
1103 (v, gmfn, entry, size);
1105 if ( page->shadow_flags & SHF_L1_PAE )
1106 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3)
1107 (v, gmfn, entry, size);
1108 if ( page->shadow_flags & SHF_L2_PAE )
1109 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3)
1110 (v, gmfn, entry, size);
1111 if ( page->shadow_flags & SHF_L2H_PAE )
1112 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3)
1113 (v, gmfn, entry, size);
1115 #if CONFIG_PAGING_LEVELS >= 4
1116 if ( page->shadow_flags & SHF_L1_64 )
1117 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4)
1118 (v, gmfn, entry, size);
1119 if ( page->shadow_flags & SHF_L2_64 )
1120 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4)
1121 (v, gmfn, entry, size);
1122 if ( page->shadow_flags & SHF_L2H_64 )
1123 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 4)
1124 (v, gmfn, entry, size);
1125 if ( page->shadow_flags & SHF_L3_64 )
1126 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4)
1127 (v, gmfn, entry, size);
1128 if ( page->shadow_flags & SHF_L4_64 )
1129 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4)
1130 (v, gmfn, entry, size);
1131 #else /* 32-bit hypervisor does not support 64-bit guests */
1132 ASSERT((page->shadow_flags
1133 & (SHF_L4_64|SHF_L3_64|SHF_L2H_64|SHF_L2_64|SHF_L1_64)) == 0);
1134 #endif
1135 this_cpu(trace_shadow_path_flags) |= (result<<(TRCE_SFLAG_SET_CHANGED));
1137 return result;
1141 void
1142 sh_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
1143 void *entry, u32 size)
1144 /* This is the entry point for emulated writes to pagetables in HVM guests and
1145 * PV translated guests.
1146 */
1148 struct domain *d = v->domain;
1149 int rc;
1151 ASSERT(shadow_locked_by_me(v->domain));
1152 rc = sh_validate_guest_entry(v, gmfn, entry, size);
1153 if ( rc & SHADOW_SET_FLUSH )
1154 /* Need to flush TLBs to pick up shadow PT changes */
1155 flush_tlb_mask(&d->domain_dirty_cpumask);
1156 if ( rc & SHADOW_SET_ERROR )
1158 /* This page is probably not a pagetable any more: tear it out of the
1159 * shadows, along with any tables that reference it.
1160 * Since the validate call above will have made a "safe" (i.e. zero)
1161 * shadow entry, we can let the domain live even if we can't fully
1162 * unshadow the page. */
1163 sh_remove_shadows(v, gmfn, 0, 0);
1167 int shadow_write_guest_entry(struct vcpu *v, intpte_t *p,
1168 intpte_t new, mfn_t gmfn)
1169 /* Write a new value into the guest pagetable, and update the shadows
1170 * appropriately. Returns 0 if we page-faulted, 1 for success. */
1172 int failed;
1173 shadow_lock(v->domain);
1174 failed = __copy_to_user(p, &new, sizeof(new));
1175 if ( failed != sizeof(new) )
1176 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
1177 shadow_unlock(v->domain);
1178 return (failed == 0);
1181 int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
1182 intpte_t *old, intpte_t new, mfn_t gmfn)
1183 /* Cmpxchg a new value into the guest pagetable, and update the shadows
1184 * appropriately. Returns 0 if we page-faulted, 1 if not.
1185 * N.B. caller should check the value of "old" to see if the
1186 * cmpxchg itself was successful. */
1188 int failed;
1189 intpte_t t = *old;
1190 shadow_lock(v->domain);
1191 failed = cmpxchg_user(p, t, new);
1192 if ( t == *old )
1193 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
1194 *old = t;
1195 shadow_unlock(v->domain);
1196 return (failed == 0);
1200 /**************************************************************************/
1201 /* Memory management for shadow pages. */
1203 /* Allocating shadow pages
1204 * -----------------------
1206 * Most shadow pages are allocated singly, but there is one case where
1207 * we need to allocate multiple pages together: shadowing 32-bit guest
1208 * tables on PAE or 64-bit shadows. A 32-bit guest l1 table covers 4MB
1209 * of virtual address space, and needs to be shadowed by two PAE/64-bit
1210 * l1 tables (covering 2MB of virtual address space each). Similarly, a
1211 * 32-bit guest l2 table (4GB va) needs to be shadowed by four
1212 * PAE/64-bit l2 tables (1GB va each). These multi-page shadows are
1213 * contiguous and aligned; functions for handling offsets into them are
1214 * defined in shadow.c (shadow_l1_index() etc.)
1216 * This table shows the allocation behaviour of the different modes:
1218 * Xen paging pae pae 64b 64b 64b
1219 * Guest paging 32b pae 32b pae 64b
1220 * PV or HVM HVM * HVM HVM *
1221 * Shadow paging pae pae pae pae 64b
1223 * sl1 size 8k 4k 8k 4k 4k
1224 * sl2 size 16k 4k 16k 4k 4k
1225 * sl3 size - - - - 4k
1226 * sl4 size - - - - 4k
1228 * We allocate memory from xen in four-page units and break them down
1229 * with a simple buddy allocator. Can't use the xen allocator to handle
1230 * this as it only works for contiguous zones, and a domain's shadow
1231 * pool is made of fragments.
1233 * In HVM guests, the p2m table is built out of shadow pages, and we provide
1234 * a function for the p2m management to steal pages, in max-order chunks, from
1235 * the free pool. We don't provide for giving them back, yet.
1236 */
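/*
 * Illustrative sketch, not part of the original file: how the size table
 * above maps onto buddy-allocator orders.  Shadows of 32-bit guest l1s
 * (including fl1s) are 8k, i.e. order 1; shadows of 32-bit guest l2s are
 * 16k, i.e. order 2; every other shadow type is a single 4k page, order 0.
 */
static inline unsigned int shadow_order_to_pages_example(unsigned int order)
{
    return 1u << order;    /* order 0 -> 1 page, 1 -> 2 pages, 2 -> 4 pages */
}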
1238 /* Figure out the least acceptable quantity of shadow memory.
1239 * The minimum memory requirement for always being able to free up a
1240 * chunk of memory is very small -- only three max-order chunks per
1241 * vcpu to hold the top level shadows and pages with Xen mappings in them.
1243 * But for a guest to be guaranteed to successfully execute a single
1244 * instruction, we must be able to map a large number (about thirty) VAs
1245 * at the same time, which means that to guarantee progress, we must
1246 * allow for more than ninety allocated pages per vcpu. We round that
1247 * up to 128 pages, or half a megabyte per vcpu, and add 1 more vcpu's
1248 * worth to make sure we never return zero. */
1249 static unsigned int shadow_min_acceptable_pages(struct domain *d)
1251 u32 vcpu_count = 1;
1252 struct vcpu *v;
1254 for_each_vcpu(d, v)
1255 vcpu_count++;
1257 return (vcpu_count * 128);
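/*
 * Illustrative worked example, not part of the original file: for a domain
 * with 4 vcpus the loop above counts 1 + 4 = 5, so the minimum is
 * 5 * 128 = 640 pages, i.e. 2.5MB of 4k pages -- half a megabyte per vcpu
 * plus one extra vcpu's worth, as the comment above describes.
 */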
1260 /* Figure out the order of allocation needed for a given shadow type */
1261 static inline u32
1262 shadow_order(unsigned int shadow_type)
1264 static const u32 type_to_order[SH_type_unused] = {
1265 0, /* SH_type_none */
1266 1, /* SH_type_l1_32_shadow */
1267 1, /* SH_type_fl1_32_shadow */
1268 2, /* SH_type_l2_32_shadow */
1269 0, /* SH_type_l1_pae_shadow */
1270 0, /* SH_type_fl1_pae_shadow */
1271 0, /* SH_type_l2_pae_shadow */
1272 0, /* SH_type_l2h_pae_shadow */
1273 0, /* SH_type_l1_64_shadow */
1274 0, /* SH_type_fl1_64_shadow */
1275 0, /* SH_type_l2_64_shadow */
1276 0, /* SH_type_l2h_64_shadow */
1277 0, /* SH_type_l3_64_shadow */
1278 0, /* SH_type_l4_64_shadow */
1279 2, /* SH_type_p2m_table */
1280 0, /* SH_type_monitor_table */
1281 0 /* SH_type_oos_snapshot */
1282 };
1283 ASSERT(shadow_type < SH_type_unused);
1284 return type_to_order[shadow_type];
1287 static inline unsigned int
1288 shadow_max_order(struct domain *d)
1290 return is_hvm_domain(d) ? SHADOW_MAX_ORDER : 0;
1293 /* Do we have a total of count pages of the requested order free? */
1294 static inline int space_is_available(
1295 struct domain *d,
1296 unsigned int order,
1297 unsigned int count)
1299 for ( ; order <= shadow_max_order(d); ++order )
1301 unsigned int n = count;
1302 const struct page_info *sp;
1304 page_list_for_each ( sp, &d->arch.paging.shadow.freelists[order] )
1305 if ( --n == 0 )
1306 return 1;
1307 count = (count + 1) >> 1;
1310 return 0;
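/*
 * Illustrative sketch, not part of the original file: the per-order demand
 * adjustment applied by space_is_available() above.  One free order-(n+1)
 * chunk can be split into two order-n chunks, so when moving up an order the
 * number of chunks still needed halves, rounding up (e.g. a request for two
 * order-2 chunks is satisfied by a single free order-3 chunk).
 */
static inline unsigned int chunks_needed_at_next_order_example(unsigned int count)
{
    return (count + 1) >> 1;
}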
1313 /* Dispatcher function: call the per-mode function that will unhook the
1314 * non-Xen mappings in this top-level shadow mfn */
1315 static void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
1317 struct page_info *sp = mfn_to_page(smfn);
1318 switch ( sp->u.sh.type )
1320 case SH_type_l2_32_shadow:
1321 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, 2)(v,smfn);
1322 break;
1323 case SH_type_l2_pae_shadow:
1324 case SH_type_l2h_pae_shadow:
1325 SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings, 3)(v,smfn);
1326 break;
1327 #if CONFIG_PAGING_LEVELS >= 4
1328 case SH_type_l4_64_shadow:
1329 SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings, 4)(v,smfn);
1330 break;
1331 #endif
1332 default:
1333 SHADOW_ERROR("top-level shadow has bad type %08x\n", sp->u.sh.type);
1334 BUG();
1338 static inline void trace_shadow_prealloc_unpin(struct domain *d, mfn_t smfn)
1340 if ( tb_init_done )
1342 /* Convert smfn to gfn */
1343 unsigned long gfn;
1344 ASSERT(mfn_valid(smfn));
1345 gfn = mfn_to_gfn(d, backpointer(mfn_to_page(smfn)));
1346 __trace_var(TRC_SHADOW_PREALLOC_UNPIN, 0/*!tsc*/,
1347 sizeof(gfn), (unsigned char*)&gfn);
1351 /* Make sure there are at least count order-sized pages
1352 * available in the shadow page pool. */
1353 static void _shadow_prealloc(
1354 struct domain *d,
1355 unsigned int order,
1356 unsigned int count)
1358 /* Need a vcpu for calling unpins; for now, since we don't have
1359 * per-vcpu shadows, any will do */
1360 struct vcpu *v, *v2;
1361 struct page_info *sp, *t;
1362 mfn_t smfn;
1363 int i;
1365 ASSERT(order <= shadow_max_order(d));
1366 if ( space_is_available(d, order, count) ) return;
1368 v = current;
1369 if ( v->domain != d )
1370 v = d->vcpu[0];
1371 ASSERT(v != NULL); /* Shouldn't have enabled shadows if we've no vcpus */
1373 /* Stage one: walk the list of pinned pages, unpinning them */
1374 perfc_incr(shadow_prealloc_1);
1375 page_list_for_each_safe_reverse(sp, t, &d->arch.paging.shadow.pinned_shadows)
1377 smfn = page_to_mfn(sp);
1379 /* Unpin this top-level shadow */
1380 trace_shadow_prealloc_unpin(d, smfn);
1381 sh_unpin(v, smfn);
1383 /* See if that freed up enough space */
1384 if ( space_is_available(d, order, count) ) return;
1387 /* Stage two: all shadow pages are in use in hierarchies that are
1388 * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
1389 * mappings. */
1390 perfc_incr(shadow_prealloc_2);
1392 for_each_vcpu(d, v2)
1393 for ( i = 0 ; i < 4 ; i++ )
1395 if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
1397 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PREALLOC_UNHOOK);
1398 shadow_unhook_mappings(v,
1399 pagetable_get_mfn(v2->arch.shadow_table[i]));
1401 /* See if that freed up enough space */
1402 if ( space_is_available(d, order, count) )
1404 flush_tlb_mask(&d->domain_dirty_cpumask);
1405 return;
1410 /* Nothing more we can do: all remaining shadows are of pages that
1411 * hold Xen mappings for some vcpu. This can never happen. */
1412 SHADOW_ERROR("Can't pre-allocate %u order-%u shadow pages!\n"
1413 " shadow pages total = %u, free = %u, p2m=%u\n",
1414 count, order,
1415 d->arch.paging.shadow.total_pages,
1416 d->arch.paging.shadow.free_pages,
1417 d->arch.paging.shadow.p2m_pages);
1418 BUG();
1421 /* Make sure there are at least count pages of the order according to
1422 * type available in the shadow page pool.
1423 * This must be called before any calls to shadow_alloc(). Since this
1424 * will free existing shadows to make room, it must be called early enough
1425 * to avoid freeing shadows that the caller is currently working on. */
1426 void shadow_prealloc(struct domain *d, u32 type, unsigned int count)
1428 return _shadow_prealloc(d, shadow_order(type), count);
1431 /* Deliberately free all the memory we can: this will tear down all of
1432 * this domain's shadows */
1433 static void shadow_blow_tables(struct domain *d)
1435 struct page_info *sp, *t;
1436 struct vcpu *v = d->vcpu[0];
1437 mfn_t smfn;
1438 int i;
1440 ASSERT(v != NULL);
1442 /* Pass one: unpin all pinned pages */
1443 page_list_for_each_safe_reverse(sp, t, &d->arch.paging.shadow.pinned_shadows)
1445 smfn = page_to_mfn(sp);
1446 sh_unpin(v, smfn);
1449 /* Second pass: unhook entries of in-use shadows */
1450 for_each_vcpu(d, v)
1451 for ( i = 0 ; i < 4 ; i++ )
1452 if ( !pagetable_is_null(v->arch.shadow_table[i]) )
1453 shadow_unhook_mappings(v,
1454 pagetable_get_mfn(v->arch.shadow_table[i]));
1456 /* Make sure everyone sees the unshadowings */
1457 flush_tlb_mask(&d->domain_dirty_cpumask);
1460 void shadow_blow_tables_per_domain(struct domain *d)
1462 if ( shadow_mode_enabled(d) && d->vcpu != NULL && d->vcpu[0] != NULL ) {
1463 shadow_lock(d);
1464 shadow_blow_tables(d);
1465 shadow_unlock(d);
1469 #ifndef NDEBUG
1470 /* Blow all shadows of all shadowed domains: this can be used to cause the
1471 * guest's pagetables to be re-shadowed if we suspect that the shadows
1472 * have somehow got out of sync */
1473 static void shadow_blow_all_tables(unsigned char c)
1475 struct domain *d;
1476 printk("'%c' pressed -> blowing all shadow tables\n", c);
1477 rcu_read_lock(&domlist_read_lock);
1478 for_each_domain(d)
1480 if ( shadow_mode_enabled(d) && d->vcpu != NULL && d->vcpu[0] != NULL )
1482 shadow_lock(d);
1483 shadow_blow_tables(d);
1484 shadow_unlock(d);
1487 rcu_read_unlock(&domlist_read_lock);
1490 static struct keyhandler shadow_blow_all_tables_keyhandler = {
1491 .u.fn = shadow_blow_all_tables,
1492 .desc = "reset shadow pagetables"
1493 };
1495 /* Register this function in the Xen console keypress table */
1496 static __init int shadow_blow_tables_keyhandler_init(void)
1498 register_keyhandler('S', &shadow_blow_all_tables_keyhandler);
1499 return 0;
1501 __initcall(shadow_blow_tables_keyhandler_init);
1502 #endif /* !NDEBUG */
1504 static inline struct page_info *
1505 next_shadow(const struct page_info *sp)
1507 return sp->next_shadow ? pdx_to_page(sp->next_shadow) : NULL;
1510 static inline void
1511 set_next_shadow(struct page_info *sp, struct page_info *next)
1513 sp->next_shadow = next ? page_to_pdx(next) : 0;
1516 /* Allocate another shadow's worth of (contiguous, aligned) pages,
1517 * and fill in the type and backpointer fields of their page_infos.
1518 * Never fails to allocate. */
1519 mfn_t shadow_alloc(struct domain *d,
1520 u32 shadow_type,
1521 unsigned long backpointer)
1523 struct page_info *sp = NULL;
1524 unsigned int order = shadow_order(shadow_type);
1525 cpumask_t mask;
1526 void *p;
1527 int i;
1529 ASSERT(shadow_locked_by_me(d));
1530 if (shadow_type == SH_type_p2m_table && order > shadow_max_order(d))
1531 order = shadow_max_order(d);
1532 ASSERT(order <= shadow_max_order(d));
1533 ASSERT(shadow_type != SH_type_none);
1534 perfc_incr(shadow_alloc);
1536 /* Find smallest order which can satisfy the request. */
1537 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
1538 if ( (sp = page_list_remove_head(&d->arch.paging.shadow.freelists[i])) )
1539 goto found;
1541 /* If we get here, we failed to allocate. This should never happen.
1542 * It means that we didn't call shadow_prealloc() correctly before
1543 * we allocated. We can't recover by calling prealloc here, because
1544 * we might free up higher-level pages that the caller is working on. */
1545 SHADOW_ERROR("Can't allocate %i shadow pages!\n", 1 << order);
1546 BUG();
1548 found:
1549 /* We may have to halve the chunk a number of times. */
1550 while ( i != order )
1552 i--;
1553 sp->v.free.order = i;
1554 page_list_add_tail(sp, &d->arch.paging.shadow.freelists[i]);
1555 sp += 1 << i;
1557 d->arch.paging.shadow.free_pages -= 1 << order;
1559 switch (shadow_type)
1561 case SH_type_fl1_32_shadow:
1562 case SH_type_fl1_pae_shadow:
1563 case SH_type_fl1_64_shadow:
1564 break;
1565 default:
1566 backpointer = pfn_to_pdx(backpointer);
1567 break;
1570 /* Init page info fields and clear the pages */
1571 for ( i = 0; i < 1<<order ; i++ )
1573 /* Before we overwrite the old contents of this page,
1574 * we need to be sure that no TLB holds a pointer to it. */
1575 mask = d->domain_dirty_cpumask;
1576 tlbflush_filter(mask, sp[i].tlbflush_timestamp);
1577 if ( unlikely(!cpus_empty(mask)) )
1579 perfc_incr(shadow_alloc_tlbflush);
1580 flush_tlb_mask(&mask);
1582 /* Now safe to clear the page for reuse */
1583 p = __map_domain_page(sp+i);
1584 ASSERT(p != NULL);
1585 clear_page(p);
1586 sh_unmap_domain_page(p);
1587 INIT_PAGE_LIST_ENTRY(&sp[i].list);
1588 sp[i].u.sh.type = shadow_type;
1589 sp[i].u.sh.pinned = 0;
1590 sp[i].u.sh.count = 0;
1591 sp[i].v.sh.back = backpointer;
1592 set_next_shadow(&sp[i], NULL);
1593 perfc_incr(shadow_alloc_count);
1595 return page_to_mfn(sp);
1599 /* Return some shadow pages to the pool. */
1600 void shadow_free(struct domain *d, mfn_t smfn)
1602 struct page_info *sp = mfn_to_page(smfn);
1603 u32 shadow_type;
1604 unsigned long order;
1605 unsigned long mask;
1606 int i;
1608 ASSERT(shadow_locked_by_me(d));
1609 perfc_incr(shadow_free);
1611 shadow_type = sp->u.sh.type;
1612 ASSERT(shadow_type != SH_type_none);
1613 ASSERT(shadow_type != SH_type_p2m_table);
1614 order = shadow_order(shadow_type);
1616 d->arch.paging.shadow.free_pages += 1 << order;
1618 for ( i = 0; i < 1<<order; i++ )
1620 #if SHADOW_OPTIMIZATIONS & (SHOPT_WRITABLE_HEURISTIC | SHOPT_FAST_EMULATION)
1621 struct vcpu *v;
1622 for_each_vcpu(d, v)
1624 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1625 /* No longer safe to look for a writeable mapping in this shadow */
1626 if ( v->arch.paging.shadow.last_writeable_pte_smfn == mfn_x(smfn) + i )
1627 v->arch.paging.shadow.last_writeable_pte_smfn = 0;
1628 #endif
1629 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
1630 v->arch.paging.last_write_emul_ok = 0;
1631 #endif
1633 #endif
1634 /* Strip out the type: this is now a free shadow page */
1635 sp[i].u.sh.type = 0;
1636 /* Remember the TLB timestamp so we will know whether to flush
1637 * TLBs when we reuse the page. Because the destructors leave the
1638 * contents of the pages in place, we can delay TLB flushes until
1639 * just before the allocator hands the page out again. */
1640 sp[i].tlbflush_timestamp = tlbflush_current_time();
1641 perfc_decr(shadow_alloc_count);
1644 /* Merge chunks as far as possible. */
1645 for ( ; order < shadow_max_order(d); ++order )
1647 mask = 1 << order;
1648 if ( (mfn_x(page_to_mfn(sp)) & mask) ) {
1649 /* Merge with predecessor block? */
1650 if ( ((sp-mask)->u.sh.type != PGT_none) ||
1651 ((sp-mask)->v.free.order != order) )
1652 break;
1653 sp -= mask;
1654 page_list_del(sp, &d->arch.paging.shadow.freelists[order]);
1655 } else {
1656 /* Merge with successor block? */
1657 if ( ((sp+mask)->u.sh.type != PGT_none) ||
1658 ((sp+mask)->v.free.order != order) )
1659 break;
1660 page_list_del(sp + mask, &d->arch.paging.shadow.freelists[order]);
1664 sp->v.free.order = order;
1665 page_list_add_tail(sp, &d->arch.paging.shadow.freelists[order]);
1668 /* Divert some memory from the pool to be used by the p2m mapping.
1669 * This action is irreversible: the p2m mapping only ever grows.
1670 * That's OK because the p2m table only exists for translated domains,
1671 * and those domains can't ever turn off shadow mode.
1672 * Also, we only ever allocate a max-order chunk, so as to preserve
1673 * the invariant that shadow_prealloc() always works.
1674 * Returns 0 iff it can't get a chunk (the caller should then
1675 * free up some pages in domheap and call sh_set_allocation);
1676 * returns non-zero on success.
1677 */
1678 static int
1679 sh_alloc_p2m_pages(struct domain *d)
1681 struct page_info *pg;
1682 u32 i;
1683 unsigned int order = shadow_max_order(d);
1685 ASSERT(shadow_locked_by_me(d));
1687 if ( d->arch.paging.shadow.total_pages
1688 < (shadow_min_acceptable_pages(d) + (1 << order)) )
1689 return 0; /* Not enough shadow memory: need to increase it first */
1691 shadow_prealloc(d, SH_type_p2m_table, 1);
1692 pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
1693 d->arch.paging.shadow.p2m_pages += (1 << order);
1694 d->arch.paging.shadow.total_pages -= (1 << order);
1695 for (i = 0; i < (1U << order); i++)
1697 /* Unlike shadow pages, mark p2m pages as owned by the domain.
1698 * Marking the domain as the owner would normally allow the guest to
1699 * create mappings of these pages, but these p2m pages will never be
1700 * in the domain's guest-physical address space, and so that is not
1701 * believed to be a concern.
1702 */
1703 page_set_owner(&pg[i], d);
1704 pg[i].count_info |= 1;
1705 page_list_add_tail(&pg[i], &d->arch.paging.shadow.p2m_freelist);
1707 return 1;
1710 // Returns 0 if no memory is available...
1711 static struct page_info *
1712 shadow_alloc_p2m_page(struct domain *d)
1714 struct page_info *pg;
1715 mfn_t mfn;
1716 void *p;
1718 shadow_lock(d);
1720 if ( page_list_empty(&d->arch.paging.shadow.p2m_freelist) &&
1721 !sh_alloc_p2m_pages(d) )
1723 shadow_unlock(d);
1724 return NULL;
1726 pg = page_list_remove_head(&d->arch.paging.shadow.p2m_freelist);
1728 shadow_unlock(d);
1730 mfn = page_to_mfn(pg);
1731 p = sh_map_domain_page(mfn);
1732 clear_page(p);
1733 sh_unmap_domain_page(p);
1735 return pg;
1738 static void
1739 shadow_free_p2m_page(struct domain *d, struct page_info *pg)
1741 ASSERT(page_get_owner(pg) == d);
1742 /* Should have just the one ref we gave it in alloc_p2m_page() */
1743 if ( (pg->count_info & PGC_count_mask) != 1 )
1745 SHADOW_ERROR("Odd p2m page count c=%#lx t=%"PRtype_info"\n",
1746 pg->count_info, pg->u.inuse.type_info);
1748 pg->count_info &= ~PGC_count_mask;
1749 /* Free should not decrement domain's total allocation, since
1750 * these pages were allocated without an owner. */
1751 page_set_owner(pg, NULL);
1752 free_domheap_pages(pg, 0);
1753 d->arch.paging.shadow.p2m_pages--;
1754 perfc_decr(shadow_alloc_count);
1757 #if CONFIG_PAGING_LEVELS == 3
1758 static void p2m_install_entry_in_monitors(struct domain *d,
1759 l3_pgentry_t *l3e)
1760 /* Special case, only used for external-mode domains on PAE hosts:
1761 * update the mapping of the p2m table. Once again, this is trivial in
1762 * other paging modes (one top-level entry points to the top-level p2m,
1763 * no maintenance needed), but PAE makes life difficult by needing to
1764 * copy the eight l3es of the p2m table into eight l2h slots in the
1765 * monitor table. This function makes fresh copies when a p2m l3e
1766 * changes. */
1768 l2_pgentry_t *ml2e;
1769 struct vcpu *v;
1770 unsigned int index;
1772 index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
1773 ASSERT(index < MACHPHYS_MBYTES>>1);
1775 for_each_vcpu(d, v)
1777 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
1778 continue;
1779 ASSERT(shadow_mode_external(v->domain));
1781 SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
1782 d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
1784 if ( v == current ) /* OK to use linear map of monitor_table */
1785 ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
1786 else
1788 l3_pgentry_t *ml3e;
1789 ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
1790 ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
1791 ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
1792 ml2e += l2_table_offset(RO_MPT_VIRT_START);
1793 sh_unmap_domain_page(ml3e);
1795 ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
1796 if ( v != current )
1797 sh_unmap_domain_page(ml2e);
1800 #endif
1802 /* Set the pool of shadow pages to the required number of pages.
1803 * Input will be rounded up to at least shadow_min_acceptable_pages(),
1804 * plus space for the p2m table.
1805 * Returns 0 for success, non-zero for failure. */
1806 static unsigned int sh_set_allocation(struct domain *d,
1807 unsigned int pages,
1808 int *preempted)
1810 struct page_info *sp;
1811 unsigned int lower_bound;
1812 unsigned int j, order = shadow_max_order(d);
1814 ASSERT(shadow_locked_by_me(d));
1816 /* Don't allocate less than the minimum acceptable, plus one page per
1817 * megabyte of RAM (for the p2m table) */
1818 lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
1819 if ( pages > 0 && pages < lower_bound )
1820 pages = lower_bound;
1821 /* Round up to largest block size */
1822 pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
1824 SHADOW_PRINTK("current %i target %i\n",
1825 d->arch.paging.shadow.total_pages, pages);
1827 while ( d->arch.paging.shadow.total_pages != pages )
1829 if ( d->arch.paging.shadow.total_pages < pages )
1831 /* Need to allocate more memory from domheap */
1832 sp = (struct page_info *)
1833 alloc_domheap_pages(NULL, order, MEMF_node(domain_to_node(d)));
1834 if ( sp == NULL )
1836 SHADOW_PRINTK("failed to allocate shadow pages.\n");
1837 return -ENOMEM;
1839 d->arch.paging.shadow.free_pages += 1 << order;
1840 d->arch.paging.shadow.total_pages += 1 << order;
1841 for ( j = 0; j < 1U << order; j++ )
1843 sp[j].u.sh.type = 0;
1844 sp[j].u.sh.pinned = 0;
1845 sp[j].u.sh.count = 0;
1846 sp[j].tlbflush_timestamp = 0; /* Not in any TLB */
1848 sp->v.free.order = order;
1849 page_list_add_tail(sp, &d->arch.paging.shadow.freelists[order]);
1851 else if ( d->arch.paging.shadow.total_pages > pages )
1853 /* Need to return memory to domheap */
1854 _shadow_prealloc(d, order, 1);
1855 sp = page_list_remove_head(&d->arch.paging.shadow.freelists[order]);
1856 ASSERT(sp);
1857 /*
1858 * The pages were allocated anonymously, but the owner field
1859 * gets overwritten normally, so need to clear it here.
1860 */
1861 for ( j = 0; j < 1U << order; j++ )
1862 page_set_owner(&((struct page_info *)sp)[j], NULL);
1863 d->arch.paging.shadow.free_pages -= 1 << order;
1864 d->arch.paging.shadow.total_pages -= 1 << order;
1865 free_domheap_pages((struct page_info *)sp, order);
1868 /* Check to see if we need to yield and try again */
1869 if ( preempted && hypercall_preempt_check() )
1871 *preempted = 1;
1872 return 0;
1876 return 0;
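/* Worked example of the sizing above (illustrative note, not part of the
 * original file; assumes 4kB pages, i.e. PAGE_SHIFT == 12):
 *  - a domain with tot_pages == 262144 (1GB of RAM) contributes
 *    262144 / 256 == 1024 pages (one page per MB) towards the p2m,
 *  - so any non-zero request below shadow_min_acceptable_pages(d) + 1024
 *    is bumped up to that lower bound,
 *  - and the result is then rounded up to a multiple of
 *    (1 << SHADOW_MAX_ORDER) pages, so the pool always grows and shrinks
 *    in whole maximum-order blocks. */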
1879 /* Return the size of the shadow pool, rounded up to the nearest MB */
1880 static unsigned int shadow_get_allocation(struct domain *d)
1882 unsigned int pg = d->arch.paging.shadow.total_pages;
1883 return ((pg >> (20 - PAGE_SHIFT))
1884 + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
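/* Worked example (illustrative note, not from the original file; assumes
 * PAGE_SHIFT == 12, so 20 - PAGE_SHIFT == 8, i.e. 256 pages per MB):
 *   total_pages == 1024  ->  1024 >> 8 == 4, remainder 0  ->  4 MB
 *   total_pages == 1025  ->  1025 >> 8 == 4, remainder 1  ->  5 MB */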
1887 /**************************************************************************/
1888 /* Hash table for storing the guest->shadow mappings.
1889 * The table itself is an array of pointers to shadows; the shadows are then
1890 * threaded on a singly-linked list of shadows with the same hash value */
1892 #define SHADOW_HASH_BUCKETS 251
1893 /* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
1895 /* Hash function that takes a gfn or mfn, plus another byte of type info */
1896 typedef u32 key_t;
1897 static inline key_t sh_hash(unsigned long n, unsigned int t)
1899 unsigned char *p = (unsigned char *)&n;
1900 key_t k = t;
1901 int i;
1902 for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
1903 return k % SHADOW_HASH_BUCKETS;
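/* Equivalent restatement of sh_hash() above (illustrative sketch, not part
 * of the original file): the per-byte update (k<<6) + (k<<16) - k is just
 * k * 65599, the classic sdbm string hash, seeded with the type byte and
 * reduced modulo the bucket count.  The helper name below is hypothetical. */
static inline key_t sh_hash_example(unsigned long n, unsigned int t)
{
    const unsigned char *p = (const unsigned char *)&n;
    key_t k = t;
    unsigned int i;
    for ( i = 0; i < sizeof(n); i++ )
        k = (key_t)p[i] + 65599u * k;   /* same value as (k<<6)+(k<<16)-k */
    return k % SHADOW_HASH_BUCKETS;
}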
1906 #if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
1908 /* Before we get to the mechanism, define a pair of audit functions
1909 * that sanity-check the contents of the hash table. */
1910 static void sh_hash_audit_bucket(struct domain *d, int bucket)
1911 /* Audit one bucket of the hash table */
1913 struct page_info *sp, *x;
1915 if ( !(SHADOW_AUDIT_ENABLE) )
1916 return;
1918 sp = d->arch.paging.shadow.hash_table[bucket];
1919 while ( sp )
1921 /* Not a shadow? */
1922 BUG_ON( (sp->count_info & PGC_count_mask )!= 0 ) ;
1923 /* Bogus type? */
1924 BUG_ON( sp->u.sh.type == 0 );
1925 BUG_ON( sp->u.sh.type > SH_type_max_shadow );
1926 /* Wrong bucket? */
1927 BUG_ON( sh_hash(__backpointer(sp), sp->u.sh.type) != bucket );
1928 /* Duplicate entry? */
1929 for ( x = next_shadow(sp); x; x = next_shadow(x) )
1930 BUG_ON( x->v.sh.back == sp->v.sh.back &&
1931 x->u.sh.type == sp->u.sh.type );
1932 /* Follow the backpointer to the guest pagetable */
1933 if ( sp->u.sh.type != SH_type_fl1_32_shadow
1934 && sp->u.sh.type != SH_type_fl1_pae_shadow
1935 && sp->u.sh.type != SH_type_fl1_64_shadow )
1937 struct page_info *gpg = mfn_to_page(backpointer(sp));
1938 /* Bad shadow flags on guest page? */
1939 BUG_ON( !(gpg->shadow_flags & (1<<sp->u.sh.type)) );
1940 /* Bad type count on guest page? */
1941 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1942 if ( sp->u.sh.type == SH_type_l1_32_shadow
1943 || sp->u.sh.type == SH_type_l1_pae_shadow
1944 || sp->u.sh.type == SH_type_l1_64_shadow )
1946 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1947 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1949 if ( !page_is_out_of_sync(gpg) )
1951 SHADOW_ERROR("MFN %#"PRI_mfn" shadowed (by %#"PRI_mfn")"
1952 " and not OOS but has typecount %#lx\n",
1953 __backpointer(sp),
1954 mfn_x(page_to_mfn(sp)),
1955 gpg->u.inuse.type_info);
1956 BUG();
1960 else /* Not an l1 */
1961 #endif
1962 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1963 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1965 SHADOW_ERROR("MFN %#"PRI_mfn" shadowed (by %#"PRI_mfn")"
1966 " but has typecount %#lx\n",
1967 __backpointer(sp), mfn_x(page_to_mfn(sp)),
1968 gpg->u.inuse.type_info);
1969 BUG();
1972 /* That entry was OK; on we go */
1973 sp = next_shadow(sp);
1977 #else
1978 #define sh_hash_audit_bucket(_d, _b) do {} while(0)
1979 #endif /* Hashtable bucket audit */
1982 #if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
1984 static void sh_hash_audit(struct domain *d)
1985 /* Full audit: audit every bucket in the table */
1987 int i;
1989 if ( !(SHADOW_AUDIT_ENABLE) )
1990 return;
1992 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1994 sh_hash_audit_bucket(d, i);
1998 #else
1999 #define sh_hash_audit(_d) do {} while(0)
2000 #endif /* Hashtable bucket audit */
2002 /* Allocate and initialise the table itself.
2003 * Returns 0 for success, 1 for error. */
2004 static int shadow_hash_alloc(struct domain *d)
2006 struct page_info **table;
2008 ASSERT(shadow_locked_by_me(d));
2009 ASSERT(!d->arch.paging.shadow.hash_table);
2011 table = xmalloc_array(struct page_info *, SHADOW_HASH_BUCKETS);
2012 if ( !table ) return 1;
2013 memset(table, 0,
2014 SHADOW_HASH_BUCKETS * sizeof (struct page_info *));
2015 d->arch.paging.shadow.hash_table = table;
2016 return 0;
2019 /* Tear down the hash table and return all memory to Xen.
2020 * This function does not care whether the table is populated. */
2021 static void shadow_hash_teardown(struct domain *d)
2023 ASSERT(shadow_locked_by_me(d));
2024 ASSERT(d->arch.paging.shadow.hash_table);
2026 xfree(d->arch.paging.shadow.hash_table);
2027 d->arch.paging.shadow.hash_table = NULL;
2031 mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
2032 /* Find an entry in the hash table. Returns the MFN of the shadow,
2033 * or INVALID_MFN if it doesn't exist */
2035 struct domain *d = v->domain;
2036 struct page_info *sp, *prev;
2037 key_t key;
2039 ASSERT(shadow_locked_by_me(d));
2040 ASSERT(d->arch.paging.shadow.hash_table);
2041 ASSERT(t);
2043 sh_hash_audit(d);
2045 perfc_incr(shadow_hash_lookups);
2046 key = sh_hash(n, t);
2047 sh_hash_audit_bucket(d, key);
2049 sp = d->arch.paging.shadow.hash_table[key];
2050 prev = NULL;
2051 while(sp)
2053 if ( __backpointer(sp) == n && sp->u.sh.type == t )
2055 /* Pull-to-front if 'sp' isn't already the head item */
2056 if ( unlikely(sp != d->arch.paging.shadow.hash_table[key]) )
2058 if ( unlikely(d->arch.paging.shadow.hash_walking != 0) )
2059 /* Can't reorder: someone is walking the hash chains */
2060 return page_to_mfn(sp);
2061 else
2063 ASSERT(prev);
2064 /* Delete sp from the list */
2065 prev->next_shadow = sp->next_shadow;
2066 /* Re-insert it at the head of the list */
2067 set_next_shadow(sp, d->arch.paging.shadow.hash_table[key]);
2068 d->arch.paging.shadow.hash_table[key] = sp;
2071 else
2073 perfc_incr(shadow_hash_lookup_head);
2075 return page_to_mfn(sp);
2077 prev = sp;
2078 sp = next_shadow(sp);
2081 perfc_incr(shadow_hash_lookup_miss);
2082 return _mfn(INVALID_MFN);
2085 void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t,
2086 mfn_t smfn)
2087 /* Put a mapping (n,t)->smfn into the hash table */
2089 struct domain *d = v->domain;
2090 struct page_info *sp;
2091 key_t key;
2093 ASSERT(shadow_locked_by_me(d));
2094 ASSERT(d->arch.paging.shadow.hash_table);
2095 ASSERT(t);
2097 sh_hash_audit(d);
2099 perfc_incr(shadow_hash_inserts);
2100 key = sh_hash(n, t);
2101 sh_hash_audit_bucket(d, key);
2103 /* Insert this shadow at the top of the bucket */
2104 sp = mfn_to_page(smfn);
2105 set_next_shadow(sp, d->arch.paging.shadow.hash_table[key]);
2106 d->arch.paging.shadow.hash_table[key] = sp;
2108 sh_hash_audit_bucket(d, key);
2111 void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
2112 mfn_t smfn)
2113 /* Excise the mapping (n,t)->smfn from the hash table */
2115 struct domain *d = v->domain;
2116 struct page_info *sp, *x;
2117 key_t key;
2119 ASSERT(shadow_locked_by_me(d));
2120 ASSERT(d->arch.paging.shadow.hash_table);
2121 ASSERT(t);
2123 sh_hash_audit(d);
2125 perfc_incr(shadow_hash_deletes);
2126 key = sh_hash(n, t);
2127 sh_hash_audit_bucket(d, key);
2129 sp = mfn_to_page(smfn);
2130 if ( d->arch.paging.shadow.hash_table[key] == sp )
2131 /* Easy case: we're deleting the head item. */
2132 d->arch.paging.shadow.hash_table[key] = next_shadow(sp);
2133 else
2135 /* Need to search for the one we want */
2136 x = d->arch.paging.shadow.hash_table[key];
2137 while ( 1 )
2139 ASSERT(x); /* We can't have hit the end, since our target is
2140 * still in the chain somewhere... */
2141 if ( next_shadow(x) == sp )
2143 x->next_shadow = sp->next_shadow;
2144 break;
2146 x = next_shadow(x);
2149 set_next_shadow(sp, NULL);
2151 sh_hash_audit_bucket(d, key);
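/* Usage sketch (illustrative, not from the original file): shadows are
 * keyed by (backpointer, shadow type), so callers pair the operations
 * above roughly as
 *     shadow_hash_insert(v, mfn_x(gmfn), SH_type_l1_pae_shadow, smfn);
 *     ...
 *     smfn = shadow_hash_lookup(v, mfn_x(gmfn), SH_type_l1_pae_shadow);
 *     ...
 *     shadow_hash_delete(v, mfn_x(gmfn), SH_type_l1_pae_shadow, smfn);
 * always with the shadow lock held, as the ASSERTs above require. */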
2154 typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
2156 static void hash_foreach(struct vcpu *v,
2157 unsigned int callback_mask,
2158 const hash_callback_t callbacks[],
2159 mfn_t callback_mfn)
2160 /* Walk the hash table looking at the types of the entries and
2161 * calling the appropriate callback function for each entry.
2162 * The mask determines which shadow types we call back for, and the array
2163 * of callbacks tells us which function to call.
2164 * Any callback may return non-zero to let us skip the rest of the scan.
2166 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
2167 * then return non-zero to terminate the scan. */
2169 int i, done = 0;
2170 struct domain *d = v->domain;
2171 struct page_info *x;
2173 /* Say we're here, to stop hash-lookups reordering the chains */
2174 ASSERT(shadow_locked_by_me(d));
2175 ASSERT(d->arch.paging.shadow.hash_walking == 0);
2176 d->arch.paging.shadow.hash_walking = 1;
2178 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
2180 /* WARNING: This is not safe against changes to the hash table.
2181 * The callback *must* return non-zero if it has inserted or
2182 * deleted anything from the hash (lookups are OK, though). */
2183 for ( x = d->arch.paging.shadow.hash_table[i]; x; x = next_shadow(x) )
2185 if ( callback_mask & (1 << x->u.sh.type) )
2187 ASSERT(x->u.sh.type <= 15);
2188 ASSERT(callbacks[x->u.sh.type] != NULL);
2189 done = callbacks[x->u.sh.type](v, page_to_mfn(x),
2190 callback_mfn);
2191 if ( done ) break;
2194 if ( done ) break;
2196 d->arch.paging.shadow.hash_walking = 0;
2200 /**************************************************************************/
2201 /* Destroy a shadow page: simple dispatcher to call the per-type destructor
2202 * which will decrement refcounts appropriately and return memory to the
2203 * free pool. */
2205 void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
2207 struct page_info *sp = mfn_to_page(smfn);
2208 unsigned int t = sp->u.sh.type;
2211 SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
2213 /* Double-check, if we can, that the shadowed page belongs to this
2214 * domain, (by following the back-pointer). */
2215 ASSERT(t == SH_type_fl1_32_shadow ||
2216 t == SH_type_fl1_pae_shadow ||
2217 t == SH_type_fl1_64_shadow ||
2218 t == SH_type_monitor_table ||
2219 (is_pv_32on64_vcpu(v) && t == SH_type_l4_64_shadow) ||
2220 (page_get_owner(mfn_to_page(backpointer(sp)))
2221 == v->domain));
2223 /* The down-shifts here are so that the switch statement is on nice
2224 * small numbers that the compiler will enjoy */
2225 switch ( t )
2227 case SH_type_l1_32_shadow:
2228 case SH_type_fl1_32_shadow:
2229 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2)(v, smfn);
2230 break;
2231 case SH_type_l2_32_shadow:
2232 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2)(v, smfn);
2233 break;
2235 case SH_type_l1_pae_shadow:
2236 case SH_type_fl1_pae_shadow:
2237 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3)(v, smfn);
2238 break;
2239 case SH_type_l2_pae_shadow:
2240 case SH_type_l2h_pae_shadow:
2241 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3)(v, smfn);
2242 break;
2244 #if CONFIG_PAGING_LEVELS >= 4
2245 case SH_type_l1_64_shadow:
2246 case SH_type_fl1_64_shadow:
2247 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4)(v, smfn);
2248 break;
2249 case SH_type_l2h_64_shadow:
2250 ASSERT(is_pv_32on64_vcpu(v));
2251 /* Fall through... */
2252 case SH_type_l2_64_shadow:
2253 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4)(v, smfn);
2254 break;
2255 case SH_type_l3_64_shadow:
2256 SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4)(v, smfn);
2257 break;
2258 case SH_type_l4_64_shadow:
2259 SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4)(v, smfn);
2260 break;
2261 #endif
2262 default:
2263 SHADOW_ERROR("tried to destroy shadow of bad type %08lx\n",
2264 (unsigned long)t);
2265 BUG();
2269 static inline void trace_shadow_wrmap_bf(mfn_t gmfn)
2271 if ( tb_init_done )
2273 /* Convert gmfn to gfn */
2274 unsigned long gfn = mfn_to_gfn(current->domain, gmfn);
2275 __trace_var(TRC_SHADOW_WRMAP_BF, 0/*!tsc*/, sizeof(gfn), (unsigned char*)&gfn);
2279 /**************************************************************************/
2280 /* Remove all writeable mappings of a guest frame from the shadow tables
2281 * Returns non-zero if we need to flush TLBs.
2282 * level and fault_addr describe how we found this to be a pagetable;
2283 * level==0 means we have some other reason for revoking write access.
2284 * If level==0 we are allowed to fail, returning -1. */
2286 int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
2287 unsigned int level,
2288 unsigned long fault_addr)
2290 /* Dispatch table for getting per-type functions */
2291 static const hash_callback_t callbacks[SH_type_unused] = {
2292 NULL, /* none */
2293 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* l1_32 */
2294 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* fl1_32 */
2295 NULL, /* l2_32 */
2296 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 3), /* l1_pae */
2297 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 3), /* fl1_pae */
2298 NULL, /* l2_pae */
2299 NULL, /* l2h_pae */
2300 #if CONFIG_PAGING_LEVELS >= 4
2301 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 4), /* l1_64 */
2302 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 4), /* fl1_64 */
2303 #else
2304 NULL, /* l1_64 */
2305 NULL, /* fl1_64 */
2306 #endif
2307 NULL, /* l2_64 */
2308 NULL, /* l2h_64 */
2309 NULL, /* l3_64 */
2310 NULL, /* l4_64 */
2311 NULL, /* p2m */
2312 NULL /* unused */
2313 };
2315 static unsigned int callback_mask =
2316 1 << SH_type_l1_32_shadow
2317 | 1 << SH_type_fl1_32_shadow
2318 | 1 << SH_type_l1_pae_shadow
2319 | 1 << SH_type_fl1_pae_shadow
2320 | 1 << SH_type_l1_64_shadow
2321 | 1 << SH_type_fl1_64_shadow
2323 struct page_info *pg = mfn_to_page(gmfn);
2325 ASSERT(shadow_locked_by_me(v->domain));
2327 /* Only remove writable mappings if we are doing shadow refcounts.
2328 * In guest refcounting, we trust Xen to already be restricting
2329 * all the writes to the guest page tables, so we do not need to
2330 * do more. */
2331 if ( !shadow_mode_refcounts(v->domain) )
2332 return 0;
2334 /* Early exit if it's already a pagetable, or otherwise not writeable */
2335 if ( (sh_mfn_is_a_page_table(gmfn)
2336 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2337 /* Unless they've been allowed to go out of sync with their shadows */
2338 && !mfn_oos_may_write(gmfn)
2339 #endif
2341 || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2342 return 0;
2344 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP);
2346 perfc_incr(shadow_writeable);
2348 /* If this isn't a "normal" writeable page, the domain is trying to
2349 * put pagetables in special memory of some kind. We can't allow that. */
2350 if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
2352 SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
2353 PRtype_info "\n",
2354 mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
2355 domain_crash(v->domain);
2358 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
2359 if ( v == current )
2361 unsigned long gfn;
2362 /* Heuristic: there is likely to be only one writeable mapping,
2363 * and that mapping is likely to be in the current pagetable,
2364 * in the guest's linear map (on non-HIGHPTE linux and windows)*/
2366 #define GUESS(_a, _h) do { \
2367 if ( v->arch.paging.mode->shadow.guess_wrmap(v, (_a), gmfn) ) \
2368 perfc_incr(shadow_writeable_h_ ## _h); \
2369 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
2370 { \
2371 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND); \
2372 return 1; \
2373 } \
2374 } while (0)
2376 if ( v->arch.paging.mode->guest_levels == 2 )
2378 if ( level == 1 )
2379 /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
2380 GUESS(0xC0000000UL + (fault_addr >> 10), 1);
2382 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
2383 if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
2384 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
2386 /* FreeBSD: Linear map at 0xBFC00000 */
2387 if ( level == 1 )
2388 GUESS(0xBFC00000UL
2389 + ((fault_addr & VADDR_MASK) >> 10), 6);
2391 else if ( v->arch.paging.mode->guest_levels == 3 )
2393 /* 32bit PAE w2k3: linear map at 0xC0000000 */
2394 switch ( level )
2396 case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
2397 case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
2400 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
2401 if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
2402 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
2404 /* FreeBSD PAE: Linear map at 0xBF800000 */
2405 switch ( level )
2407 case 1: GUESS(0xBF800000UL
2408 + ((fault_addr & VADDR_MASK) >> 9), 6); break;
2409 case 2: GUESS(0xBFDFC000UL
2410 + ((fault_addr & VADDR_MASK) >> 18), 6); break;
2413 #if CONFIG_PAGING_LEVELS >= 4
2414 else if ( v->arch.paging.mode->guest_levels == 4 )
2416 /* 64bit w2k3: linear map at 0xfffff68000000000 */
2417 switch ( level )
2419 case 1: GUESS(0xfffff68000000000UL
2420 + ((fault_addr & VADDR_MASK) >> 9), 3); break;
2421 case 2: GUESS(0xfffff6fb40000000UL
2422 + ((fault_addr & VADDR_MASK) >> 18), 3); break;
2423 case 3: GUESS(0xfffff6fb7da00000UL
2424 + ((fault_addr & VADDR_MASK) >> 27), 3); break;
2427 /* 64bit Linux direct map at 0xffff880000000000; older kernels
2428 * had it at 0xffff810000000000, and still older kernels had it
2429 * at 0x0000010000000000UL */
2430 gfn = mfn_to_gfn(v->domain, gmfn);
2431 GUESS(0xffff880000000000UL + (gfn << PAGE_SHIFT), 4);
2432 GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
2433 GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4);
2435 /*
2436 * 64bit Solaris kernel page map at
2437 * kpm_vbase; 0xfffffe0000000000UL
2438 */
2439 GUESS(0xfffffe0000000000UL + (gfn << PAGE_SHIFT), 4);
2441 /* FreeBSD 64bit: linear map 0xffff800000000000 */
2442 switch ( level )
2444 case 1: GUESS(0xffff800000000000
2445 + ((fault_addr & VADDR_MASK) >> 9), 6); break;
2446 case 2: GUESS(0xffff804000000000UL
2447 + ((fault_addr & VADDR_MASK) >> 18), 6); break;
2448 case 3: GUESS(0xffff804020000000UL
2449 + ((fault_addr & VADDR_MASK) >> 27), 6); break;
2451 /* FreeBSD 64bit: direct map at 0xffffff0000000000 */
2452 GUESS(0xffffff0000000000 + (gfn << PAGE_SHIFT), 6);
2454 #endif /* CONFIG_PAGING_LEVELS >= 4 */
2456 #undef GUESS
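/* Why the shifts in the guesses above work (illustrative note, not from
 * the original file): in a guest's linear/recursive pagetable map, the
 * l1e covering virtual address A lives at map_base + (A >> PAGE_SHIFT) *
 * sizeof(pte).  With 4-byte PTEs (2-level guests) that is
 * map_base + (A >> 10); with 8-byte PTEs (PAE and 64-bit guests) it is
 * map_base + (A >> 9), and each higher pagetable level divides the
 * address by another 512, giving the >> 18 and >> 27 guesses used for
 * l2 and l3 entries. */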
2459 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2460 return 1;
2462 /* Second heuristic: on HIGHPTE linux, there are two particular PTEs
2463 * (entries in the fixmap) where linux maps its pagetables. Since
2464 * we expect to hit them most of the time, we start the search for
2465 * the writeable mapping by looking at the same MFN where the last
2466 * brute-force search succeeded. */
2468 if ( v->arch.paging.shadow.last_writeable_pte_smfn != 0 )
2470 unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask);
2471 mfn_t last_smfn = _mfn(v->arch.paging.shadow.last_writeable_pte_smfn);
2472 int shtype = mfn_to_page(last_smfn)->u.sh.type;
2474 if ( callbacks[shtype] )
2475 callbacks[shtype](v, last_smfn, gmfn);
2477 if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count )
2478 perfc_incr(shadow_writeable_h_5);
2481 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2482 return 1;
2484 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
2486 /* Brute-force search of all the shadows, by walking the hash */
2487 trace_shadow_wrmap_bf(gmfn);
2488 if ( level == 0 )
2489 perfc_incr(shadow_writeable_bf_1);
2490 else
2491 perfc_incr(shadow_writeable_bf);
2492 hash_foreach(v, callback_mask, callbacks, gmfn);
2494 /* If that didn't catch the mapping, then there's some non-pagetable
2495 * mapping -- ioreq page, grant mapping, &c. */
2496 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
2498 if ( level == 0 )
2499 return -1;
2501 SHADOW_ERROR("can't remove write access to mfn %lx: guest has "
2502 "%lu special-use mappings of it\n", mfn_x(gmfn),
2503 (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
2504 domain_crash(v->domain);
2507 /* We killed at least one writeable mapping, so must flush TLBs. */
2508 return 1;
2511 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2512 int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
2513 mfn_t smfn, unsigned long off)
2515 struct page_info *sp = mfn_to_page(smfn);
2517 ASSERT(mfn_valid(smfn));
2518 ASSERT(mfn_valid(gmfn));
2520 if ( sp->u.sh.type == SH_type_l1_32_shadow
2521 || sp->u.sh.type == SH_type_fl1_32_shadow )
2523 return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,2)
2524 (v, gmfn, smfn, off);
2526 #if CONFIG_PAGING_LEVELS >= 3
2527 else if ( sp->u.sh.type == SH_type_l1_pae_shadow
2528 || sp->u.sh.type == SH_type_fl1_pae_shadow )
2529 return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,3)
2530 (v, gmfn, smfn, off);
2531 #if CONFIG_PAGING_LEVELS >= 4
2532 else if ( sp->u.sh.type == SH_type_l1_64_shadow
2533 || sp->u.sh.type == SH_type_fl1_64_shadow )
2534 return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,4)
2535 (v, gmfn, smfn, off);
2536 #endif
2537 #endif
2539 return 0;
2541 #endif
2543 /**************************************************************************/
2544 /* Remove all mappings of a guest frame from the shadow tables.
2545 * Returns non-zero if we need to flush TLBs. */
2547 int sh_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
2549 struct page_info *page = mfn_to_page(gmfn);
2550 int expected_count, do_locking;
2552 /* Dispatch table for getting per-type functions */
2553 static const hash_callback_t callbacks[SH_type_unused] = {
2554 NULL, /* none */
2555 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* l1_32 */
2556 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* fl1_32 */
2557 NULL, /* l2_32 */
2558 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 3), /* l1_pae */
2559 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 3), /* fl1_pae */
2560 NULL, /* l2_pae */
2561 NULL, /* l2h_pae */
2562 #if CONFIG_PAGING_LEVELS >= 4
2563 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 4), /* l1_64 */
2564 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 4), /* fl1_64 */
2565 #else
2566 NULL, /* l1_64 */
2567 NULL, /* fl1_64 */
2568 #endif
2569 NULL, /* l2_64 */
2570 NULL, /* l2h_64 */
2571 NULL, /* l3_64 */
2572 NULL, /* l4_64 */
2573 NULL, /* p2m */
2574 NULL /* unused */
2575 };
2577 static unsigned int callback_mask =
2578 1 << SH_type_l1_32_shadow
2579 | 1 << SH_type_fl1_32_shadow
2580 | 1 << SH_type_l1_pae_shadow
2581 | 1 << SH_type_fl1_pae_shadow
2582 | 1 << SH_type_l1_64_shadow
2583 | 1 << SH_type_fl1_64_shadow
2586 perfc_incr(shadow_mappings);
2587 if ( (page->count_info & PGC_count_mask) == 0 )
2588 return 0;
2590 /* Although this is an externally visible function, we do not know
2591 * whether the shadow lock will be held when it is called (since it
2592 * can be called via put_page_type when we clear a shadow l1e).
2593 * If the lock isn't held, take it for the duration of the call. */
2594 do_locking = !shadow_locked_by_me(v->domain);
2595 if ( do_locking ) shadow_lock(v->domain);
2597 /* XXX TODO:
2598 * Heuristics for finding the (probably) single mapping of this gmfn */
2600 /* Brute-force search of all the shadows, by walking the hash */
2601 perfc_incr(shadow_mappings_bf);
2602 hash_foreach(v, callback_mask, callbacks, gmfn);
2604 /* If that didn't catch the mapping, something is very wrong */
2605 expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
2606 if ( (page->count_info & PGC_count_mask) != expected_count )
2608 /* Don't complain if we're in HVM and there are some extra mappings:
2609 * The qemu helper process has an untyped mapping of this dom's RAM
2610 * and the HVM restore program takes another. */
2611 if ( !(shadow_mode_external(v->domain)
2612 && (page->count_info & PGC_count_mask) <= 3
2613 && (page->u.inuse.type_info & PGT_count_mask) == 0) )
2615 SHADOW_ERROR("can't find all mappings of mfn %lx: "
2616 "c=%08lx t=%08lx\n", mfn_x(gmfn),
2617 page->count_info, page->u.inuse.type_info);
2621 if ( do_locking ) shadow_unlock(v->domain);
2623 /* We killed at least one mapping, so must flush TLBs. */
2624 return 1;
2628 /**************************************************************************/
2629 /* Remove all shadows of a guest frame from the shadow tables */
2631 static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
2632 /* Follow this shadow's up-pointer, if it has one, and remove the reference
2633 * found there. Returns 1 if that was the only reference to this shadow */
2635 struct page_info *sp = mfn_to_page(smfn);
2636 mfn_t pmfn;
2637 void *vaddr;
2638 int rc;
2640 ASSERT(sp->u.sh.type > 0);
2641 ASSERT(sp->u.sh.type < SH_type_max_shadow);
2642 ASSERT(sp->u.sh.type != SH_type_l2_32_shadow);
2643 ASSERT(sp->u.sh.type != SH_type_l2_pae_shadow);
2644 ASSERT(sp->u.sh.type != SH_type_l2h_pae_shadow);
2645 ASSERT(sp->u.sh.type != SH_type_l4_64_shadow);
2647 if (sp->up == 0) return 0;
2648 pmfn = _mfn(sp->up >> PAGE_SHIFT);
2649 ASSERT(mfn_valid(pmfn));
2650 vaddr = sh_map_domain_page(pmfn);
2651 ASSERT(vaddr);
2652 vaddr += sp->up & (PAGE_SIZE-1);
2653 ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
2655 /* Is this the only reference to this shadow? */
2656 rc = (sp->u.sh.count == 1) ? 1 : 0;
2658 /* Blank the offending entry */
2659 switch (sp->u.sh.type)
2661 case SH_type_l1_32_shadow:
2662 case SH_type_l2_32_shadow:
2663 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 2)(v, vaddr, pmfn);
2664 break;
2665 case SH_type_l1_pae_shadow:
2666 case SH_type_l2_pae_shadow:
2667 case SH_type_l2h_pae_shadow:
2668 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 3)(v, vaddr, pmfn);
2669 break;
2670 #if CONFIG_PAGING_LEVELS >= 4
2671 case SH_type_l1_64_shadow:
2672 case SH_type_l2_64_shadow:
2673 case SH_type_l2h_64_shadow:
2674 case SH_type_l3_64_shadow:
2675 case SH_type_l4_64_shadow:
2676 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 4)(v, vaddr, pmfn);
2677 break;
2678 #endif
2679 default: BUG(); /* Some weird unknown shadow type */
2682 sh_unmap_domain_page(vaddr);
2683 if ( rc )
2684 perfc_incr(shadow_up_pointer);
2685 else
2686 perfc_incr(shadow_unshadow_bf);
2688 return rc;
2691 void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
2692 /* Remove the shadows of this guest page.
2693 * If fast != 0, just try the quick heuristic, which will remove
2694 * at most one reference to each shadow of the page. Otherwise, walk
2695 * all the shadow tables looking for refs to shadows of this gmfn.
2696 * If all != 0, kill the domain if we can't find all the shadows.
2697 * (all != 0 implies fast == 0)
2698 */
2700 struct page_info *pg = mfn_to_page(gmfn);
2701 mfn_t smfn;
2702 int do_locking;
2703 unsigned char t;
2705 /* Dispatch table for getting per-type functions: each level must
2706 * be called with the function to remove a lower-level shadow. */
2707 static const hash_callback_t callbacks[SH_type_unused] = {
2708 NULL, /* none */
2709 NULL, /* l1_32 */
2710 NULL, /* fl1_32 */
2711 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 2), /* l2_32 */
2712 NULL, /* l1_pae */
2713 NULL, /* fl1_pae */
2714 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 3), /* l2_pae */
2715 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 3), /* l2h_pae */
2716 NULL, /* l1_64 */
2717 NULL, /* fl1_64 */
2718 #if CONFIG_PAGING_LEVELS >= 4
2719 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 4), /* l2_64 */
2720 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 4), /* l2h_64 */
2721 SHADOW_INTERNAL_NAME(sh_remove_l2_shadow, 4), /* l3_64 */
2722 SHADOW_INTERNAL_NAME(sh_remove_l3_shadow, 4), /* l4_64 */
2723 #else
2724 NULL, /* l2_64 */
2725 NULL, /* l2h_64 */
2726 NULL, /* l3_64 */
2727 NULL, /* l4_64 */
2728 #endif
2729 NULL, /* p2m */
2730 NULL /* unused */
2731 };
2733 /* Another lookup table, for choosing which mask to use */
2734 static unsigned int masks[SH_type_unused] = {
2735 0, /* none */
2736 1 << SH_type_l2_32_shadow, /* l1_32 */
2737 0, /* fl1_32 */
2738 0, /* l2_32 */
2739 ((1 << SH_type_l2h_pae_shadow)
2740 | (1 << SH_type_l2_pae_shadow)), /* l1_pae */
2741 0, /* fl1_pae */
2742 0, /* l2_pae */
2743 0, /* l2h_pae */
2744 ((1 << SH_type_l2h_64_shadow)
2745 | (1 << SH_type_l2_64_shadow)), /* l1_64 */
2746 0, /* fl1_64 */
2747 1 << SH_type_l3_64_shadow, /* l2_64 */
2748 1 << SH_type_l3_64_shadow, /* l2h_64 */
2749 1 << SH_type_l4_64_shadow, /* l3_64 */
2750 0, /* l4_64 */
2751 0, /* p2m */
2752 0 /* unused */
2753 };
2755 ASSERT(!(all && fast));
2756 ASSERT(mfn_valid(gmfn));
2758 /* Although this is an externally visible function, we do not know
2759 * whether the shadow lock will be held when it is called (since it
2760 * can be called via put_page_type when we clear a shadow l1e).
2761 * If the lock isn't held, take it for the duration of the call. */
2762 do_locking = !shadow_locked_by_me(v->domain);
2763 if ( do_locking ) shadow_lock(v->domain);
2765 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
2766 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
2768 /* Bail out now if the page is not shadowed */
2769 if ( (pg->count_info & PGC_page_table) == 0 )
2771 if ( do_locking ) shadow_unlock(v->domain);
2772 return;
2775 /* Search for this shadow in all appropriate shadows */
2776 perfc_incr(shadow_unshadow);
2778 /* Lower-level shadows need to be excised from upper-level shadows.
2779 * This call to hash_foreach() looks dangerous but is in fact OK: each
2780 * call will remove at most one shadow, and terminate immediately when
2781 * it does remove it, so we never walk the hash after doing a deletion. */
2782 #define DO_UNSHADOW(_type) do { \
2783 t = (_type); \
2784 if( !(pg->count_info & PGC_page_table) \
2785 || !(pg->shadow_flags & (1 << t)) ) \
2786 break; \
2787 smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
2788 if ( unlikely(!mfn_valid(smfn)) ) \
2789 { \
2790 SHADOW_ERROR(": gmfn %#lx has flags 0x%"PRIx32 \
2791 " but no type-0x%"PRIx32" shadow\n", \
2792 mfn_x(gmfn), (uint32_t)pg->shadow_flags, t); \
2793 break; \
2794 } \
2795 if ( sh_type_is_pinnable(v, t) ) \
2796 sh_unpin(v, smfn); \
2797 else \
2798 sh_remove_shadow_via_pointer(v, smfn); \
2799 if( !fast \
2800 && (pg->count_info & PGC_page_table) \
2801 && (pg->shadow_flags & (1 << t)) ) \
2802 hash_foreach(v, masks[t], callbacks, smfn); \
2803 } while (0)
2805 DO_UNSHADOW(SH_type_l2_32_shadow);
2806 DO_UNSHADOW(SH_type_l1_32_shadow);
2807 DO_UNSHADOW(SH_type_l2h_pae_shadow);
2808 DO_UNSHADOW(SH_type_l2_pae_shadow);
2809 DO_UNSHADOW(SH_type_l1_pae_shadow);
2810 #if CONFIG_PAGING_LEVELS >= 4
2811 DO_UNSHADOW(SH_type_l4_64_shadow);
2812 DO_UNSHADOW(SH_type_l3_64_shadow);
2813 DO_UNSHADOW(SH_type_l2h_64_shadow);
2814 DO_UNSHADOW(SH_type_l2_64_shadow);
2815 DO_UNSHADOW(SH_type_l1_64_shadow);
2816 #endif
2818 #undef DO_UNSHADOW
2820 /* If that didn't catch the shadows, something is wrong */
2821 if ( !fast && all && (pg->count_info & PGC_page_table) )
2823 SHADOW_ERROR("can't find all shadows of mfn %05lx "
2824 "(shadow_flags=%08x)\n",
2825 mfn_x(gmfn), pg->shadow_flags);
2826 domain_crash(v->domain);
2829 /* Need to flush TLBs now, so that linear maps are safe next time we
2830 * take a fault. */
2831 flush_tlb_mask(&v->domain->domain_dirty_cpumask);
2833 if ( do_locking ) shadow_unlock(v->domain);
2836 static void
2837 sh_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
2838 /* Even harsher: this is an HVM page that we think is no longer a pagetable.
2839 * Unshadow it, and recursively unshadow pages that reference it. */
2841 sh_remove_shadows(v, gmfn, 0, 1);
2842 /* XXX TODO:
2843 * Rework this hashtable walker to return a linked-list of all
2844 * the shadows it modified, then do breadth-first recursion
2845 * to find the way up to higher-level tables and unshadow them too.
2847 * The current code (just tearing down each page's shadows as we
2848 * detect that it is not a pagetable) is correct, but very slow.
2849 * It means extra emulated writes and slows down removal of mappings. */
2852 /**************************************************************************/
2854 static void sh_update_paging_modes(struct vcpu *v)
2856 struct domain *d = v->domain;
2857 const struct paging_mode *old_mode = v->arch.paging.mode;
2859 ASSERT(shadow_locked_by_me(d));
2861 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
2862 /* Make sure this vcpu has a virtual TLB array allocated */
2863 if ( unlikely(!v->arch.paging.vtlb) )
2865 v->arch.paging.vtlb = xmalloc_array(struct shadow_vtlb, VTLB_ENTRIES);
2866 if ( unlikely(!v->arch.paging.vtlb) )
2868 SHADOW_ERROR("Could not allocate vTLB space for dom %u vcpu %u\n",
2869 d->domain_id, v->vcpu_id);
2870 domain_crash(v->domain);
2871 return;
2873 memset(v->arch.paging.vtlb, 0,
2874 VTLB_ENTRIES * sizeof (struct shadow_vtlb));
2875 spin_lock_init(&v->arch.paging.vtlb_lock);
2877 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
2879 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2880 if ( mfn_x(v->arch.paging.shadow.oos_snapshot[0]) == INVALID_MFN )
2882 int i;
2883 for(i = 0; i < SHADOW_OOS_PAGES; i++)
2885 shadow_prealloc(d, SH_type_oos_snapshot, 1);
2886 v->arch.paging.shadow.oos_snapshot[i] =
2887 shadow_alloc(d, SH_type_oos_snapshot, 0);
2890 #endif /* OOS */
2892 // Valid transitions handled by this function:
2893 // - For PV guests:
2894 // - after a shadow mode has been changed
2895 // - For HVM guests:
2896 // - after a shadow mode has been changed
2897 // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
2898 //
2900 // First, tear down any old shadow tables held by this vcpu.
2901 //
2902 if ( v->arch.paging.mode )
2903 v->arch.paging.mode->shadow.detach_old_tables(v);
2905 if ( !is_hvm_domain(d) )
2907 ///
2908 /// PV guest
2909 ///
2910 #if CONFIG_PAGING_LEVELS == 4
2911 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 4);
2912 #else /* CONFIG_PAGING_LEVELS == 3 */
2913 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
2914 #endif
2916 else
2918 ///
2919 /// HVM guest
2920 ///
2921 ASSERT(shadow_mode_translate(d));
2922 ASSERT(shadow_mode_external(d));
2924 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2925 /* Need to resync all our pages now, because if a page goes out
2926 * of sync with paging enabled and is resynced with paging
2927 * disabled, the resync will go wrong. */
2928 shadow_resync_all(v, 0);
2929 #endif /* OOS */
2931 if ( !hvm_paging_enabled(v) )
2933 /* When the guest has CR0.PG clear, we provide a 32-bit, non-PAE
2934 * pagetable for it, mapping 4 GB one-to-one using a single l2
2935 * page of 1024 superpage mappings */
2936 v->arch.guest_table = d->arch.paging.shadow.unpaged_pagetable;
2937 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 2);
2939 else
2941 #ifdef __x86_64__
2942 if ( hvm_long_mode_enabled(v) )
2944 // long mode guest...
2945 v->arch.paging.mode =
2946 &SHADOW_INTERNAL_NAME(sh_paging_mode, 4);
2948 else
2949 #endif
2950 if ( hvm_pae_enabled(v) )
2952 // 32-bit PAE mode guest...
2953 v->arch.paging.mode =
2954 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
2956 else
2958 // 32-bit 2 level guest...
2959 v->arch.paging.mode =
2960 &SHADOW_INTERNAL_NAME(sh_paging_mode, 2);
2964 if ( pagetable_is_null(v->arch.monitor_table) )
2966 mfn_t mmfn = v->arch.paging.mode->shadow.make_monitor_table(v);
2967 v->arch.monitor_table = pagetable_from_mfn(mmfn);
2968 make_cr3(v, mfn_x(mmfn));
2969 hvm_update_host_cr3(v);
2972 if ( v->arch.paging.mode != old_mode )
2974 SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d gl=%u "
2975 "(was g=%u s=%u)\n",
2976 d->domain_id, v->vcpu_id,
2977 is_hvm_domain(d) ? hvm_paging_enabled(v) : 1,
2978 v->arch.paging.mode->guest_levels,
2979 v->arch.paging.mode->shadow.shadow_levels,
2980 old_mode ? old_mode->guest_levels : 0,
2981 old_mode ? old_mode->shadow.shadow_levels : 0);
2982 if ( old_mode &&
2983 (v->arch.paging.mode->shadow.shadow_levels !=
2984 old_mode->shadow.shadow_levels) )
2986 /* Need to make a new monitor table for the new mode */
2987 mfn_t new_mfn, old_mfn;
2989 if ( v != current && vcpu_runnable(v) )
2991 SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
2992 "this HVM vcpu's (d=%u v=%u) paging mode "
2993 "while it is running.\n",
2994 current->domain->domain_id, current->vcpu_id,
2995 v->domain->domain_id, v->vcpu_id);
2996 /* It's not safe to do that because we can't change
2997 * the host CR3 for a running domain */
2998 domain_crash(v->domain);
2999 return;
3002 old_mfn = pagetable_get_mfn(v->arch.monitor_table);
3003 v->arch.monitor_table = pagetable_null();
3004 new_mfn = v->arch.paging.mode->shadow.make_monitor_table(v);
3005 v->arch.monitor_table = pagetable_from_mfn(new_mfn);
3006 SHADOW_PRINTK("new monitor table %"PRI_mfn "\n",
3007 mfn_x(new_mfn));
3009 /* Don't be running on the old monitor table when we
3010 * pull it down! Switch CR3, and warn the HVM code that
3011 * its host cr3 has changed. */
3012 make_cr3(v, mfn_x(new_mfn));
3013 if ( v == current )
3014 write_ptbase(v);
3015 hvm_update_host_cr3(v);
3016 old_mode->shadow.destroy_monitor_table(v, old_mfn);
3020 // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
3021 // These are HARD: think about the case where two CPU's have
3022 // different values for CR4.PSE and CR4.PGE at the same time.
3023 // This *does* happen, at least for CR4.PGE...
3026 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3027 /* We need to check that all the vcpus have paging enabled to
3028 * unsync PTs. */
3029 if ( is_hvm_domain(d) && !d->arch.paging.shadow.oos_off )
3031 int pe = 1;
3032 struct vcpu *vptr;
3034 for_each_vcpu(d, vptr)
3036 if ( !hvm_paging_enabled(vptr) )
3038 pe = 0;
3039 break;
3043 d->arch.paging.shadow.oos_active = pe;
3045 #endif /* OOS */
3047 v->arch.paging.mode->update_cr3(v, 0);
3050 void shadow_update_paging_modes(struct vcpu *v)
3052 shadow_lock(v->domain);
3053 sh_update_paging_modes(v);
3054 shadow_unlock(v->domain);
3057 /**************************************************************************/
3058 /* Turning on and off shadow features */
3060 static void sh_new_mode(struct domain *d, u32 new_mode)
3061 /* Inform all the vcpus that the shadow mode has been changed */
3063 struct vcpu *v;
3065 ASSERT(shadow_locked_by_me(d));
3066 ASSERT(d != current->domain);
3067 d->arch.paging.mode = new_mode;
3068 for_each_vcpu(d, v)
3069 sh_update_paging_modes(v);
3072 int shadow_enable(struct domain *d, u32 mode)
3073 /* Turn on "permanent" shadow features: external, translate, refcount.
3074 * Can only be called once on a domain, and these features cannot be
3075 * disabled.
3076 * Returns 0 for success, -errno for failure. */
3078 unsigned int old_pages;
3079 struct page_info *pg = NULL;
3080 uint32_t *e;
3081 int i, rv = 0;
3083 mode |= PG_SH_enable;
3085 domain_pause(d);
3087 /* Sanity check the arguments */
3088 if ( (d == current->domain) ||
3089 shadow_mode_enabled(d) ||
3090 ((mode & PG_translate) && !(mode & PG_refcounts)) ||
3091 ((mode & PG_external) && !(mode & PG_translate)) )
3093 rv = -EINVAL;
3094 goto out_unlocked;
3097 /* Init the shadow memory allocation if the user hasn't done so */
3098 old_pages = d->arch.paging.shadow.total_pages;
3099 if ( old_pages == 0 )
3101 unsigned int r;
3102 shadow_lock(d);
3103 r = sh_set_allocation(d, 1024, NULL); /* Use at least 4MB */
3104 if ( r != 0 )
3106 sh_set_allocation(d, 0, NULL);
3107 rv = -ENOMEM;
3108 goto out_locked;
3110 shadow_unlock(d);
3113 /* Init the P2M table. Must be done before we take the shadow lock
3114 * to avoid possible deadlock. */
3115 if ( mode & PG_translate )
3117 rv = p2m_alloc_table(d, shadow_alloc_p2m_page, shadow_free_p2m_page);
3118 if (rv != 0)
3119 goto out_unlocked;
3122 /* HVM domains need an extra pagetable for vcpus that think they
3123 * have paging disabled */
3124 if ( is_hvm_domain(d) )
3126 /* Get a single page from the shadow pool. Take it via the
3127 * P2M interface to make freeing it simpler afterwards. */
3128 pg = shadow_alloc_p2m_page(d);
3129 if ( pg == NULL )
3131 rv = -ENOMEM;
3132 goto out_unlocked;
3134 /* Fill it with 32-bit, non-PAE superpage entries, each mapping 4MB
3135 * of virtual address space onto the same physical address range */
3136 e = __map_domain_page(pg);
3137 for ( i = 0; i < PAGE_SIZE / sizeof(*e); i++ )
3138 e[i] = ((0x400000U * i)
3139 | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER
3140 | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
3141 sh_unmap_domain_page(e);
3142 pg->u.inuse.type_info = PGT_l2_page_table | 1 | PGT_validated;
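/* Worked check of the loop above (illustrative note, not from the original
 * file): with 4kB pages and 4-byte entries there are
 * PAGE_SIZE / sizeof(*e) == 1024 slots, and slot i maps the 4MB virtual
 * range [i * 0x400000, (i + 1) * 0x400000) onto the identical physical
 * range via the PSE bit, i.e. 1024 * 4MB == 4GB mapped one-to-one. */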
3145 shadow_lock(d);
3147 /* Sanity check again with the lock held */
3148 if ( shadow_mode_enabled(d) )
3150 rv = -EINVAL;
3151 goto out_locked;
3154 /* Init the hash table */
3155 if ( shadow_hash_alloc(d) != 0 )
3157 rv = -ENOMEM;
3158 goto out_locked;
3161 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
3162 /* We assume we're dealing with an older 64bit linux guest until we
3163 * see the guest use more than one l4 per vcpu. */
3164 d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
3165 #endif
3167 /* Record the 1-to-1 pagetable we just made */
3168 if ( is_hvm_domain(d) )
3169 d->arch.paging.shadow.unpaged_pagetable = pagetable_from_page(pg);
3171 /* Update the bits */
3172 sh_new_mode(d, mode);
3174 out_locked:
3175 shadow_unlock(d);
3176 out_unlocked:
3177 if ( rv != 0 && !pagetable_is_null(d->arch.phys_table) )
3178 p2m_teardown(d);
3179 if ( rv != 0 && pg != NULL )
3180 shadow_free_p2m_page(d, pg);
3181 domain_unpause(d);
3182 return rv;
3185 void shadow_teardown(struct domain *d)
3186 /* Destroy the shadow pagetables of this domain and free its shadow memory.
3187 * Should only be called for dying domains. */
3189 struct vcpu *v;
3190 mfn_t mfn;
3191 struct page_info *pg;
3193 ASSERT(d->is_dying);
3194 ASSERT(d != current->domain);
3196 if ( !shadow_locked_by_me(d) )
3197 shadow_lock(d); /* Keep various asserts happy */
3199 if ( shadow_mode_enabled(d) )
3201 /* Release the shadow and monitor tables held by each vcpu */
3202 for_each_vcpu(d, v)
3204 if ( v->arch.paging.mode )
3206 v->arch.paging.mode->shadow.detach_old_tables(v);
3207 if ( shadow_mode_external(d) )
3209 mfn = pagetable_get_mfn(v->arch.monitor_table);
3210 if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
3211 v->arch.paging.mode->shadow.destroy_monitor_table(v, mfn);
3212 v->arch.monitor_table = pagetable_null();
3218 #if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC))
3219 /* Free the virtual-TLB array attached to each vcpu */
3220 for_each_vcpu(d, v)
3222 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3223 if ( v->arch.paging.vtlb )
3225 xfree(v->arch.paging.vtlb);
3226 v->arch.paging.vtlb = NULL;
3228 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3230 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3232 int i;
3233 mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
3234 for(i = 0; i < SHADOW_OOS_PAGES; i++)
3235 if ( mfn_valid(oos_snapshot[i]) )
3236 shadow_free(d, oos_snapshot[i]);
3238 #endif /* OOS */
3240 #endif /* (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) */
3242 while ( (pg = page_list_remove_head(&d->arch.paging.shadow.p2m_freelist)) )
3243 shadow_free_p2m_page(d, pg);
3245 if ( d->arch.paging.shadow.total_pages != 0 )
3247 SHADOW_PRINTK("teardown of domain %u starts."
3248 " Shadow pages total = %u, free = %u, p2m=%u\n",
3249 d->domain_id,
3250 d->arch.paging.shadow.total_pages,
3251 d->arch.paging.shadow.free_pages,
3252 d->arch.paging.shadow.p2m_pages);
3253 /* Destroy all the shadows and release memory to domheap */
3254 sh_set_allocation(d, 0, NULL);
3255 /* Release the hash table back to xenheap */
3256 if (d->arch.paging.shadow.hash_table)
3257 shadow_hash_teardown(d);
3258 /* Should not have any more memory held */
3259 SHADOW_PRINTK("teardown done."
3260 " Shadow pages total = %u, free = %u, p2m=%u\n",
3261 d->arch.paging.shadow.total_pages,
3262 d->arch.paging.shadow.free_pages,
3263 d->arch.paging.shadow.p2m_pages);
3264 ASSERT(d->arch.paging.shadow.total_pages == 0);
3267 /* Free the non-paged-vcpus pagetable; must happen after we've
3268 * destroyed any shadows of it or sh_destroy_shadow will get confused. */
3269 if ( !pagetable_is_null(d->arch.paging.shadow.unpaged_pagetable) )
3271 for_each_vcpu(d, v)
3273 ASSERT(is_hvm_vcpu(v));
3274 if ( !hvm_paging_enabled(v) )
3275 v->arch.guest_table = pagetable_null();
3277 shadow_free_p2m_page(d,
3278 pagetable_get_page(d->arch.paging.shadow.unpaged_pagetable));
3279 d->arch.paging.shadow.unpaged_pagetable = pagetable_null();
3282 /* We leave the "permanent" shadow modes enabled, but clear the
3283 * log-dirty mode bit. We don't want any more mark_dirty()
3284 * calls now that we've torn down the bitmap */
3285 d->arch.paging.mode &= ~PG_log_dirty;
3287 if (d->arch.hvm_domain.dirty_vram) {
3288 xfree(d->arch.hvm_domain.dirty_vram->sl1ma);
3289 xfree(d->arch.hvm_domain.dirty_vram->dirty_bitmap);
3290 xfree(d->arch.hvm_domain.dirty_vram);
3291 d->arch.hvm_domain.dirty_vram = NULL;
3294 shadow_unlock(d);
3297 void shadow_final_teardown(struct domain *d)
3298 /* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
3300 SHADOW_PRINTK("dom %u final teardown starts."
3301 " Shadow pages total = %u, free = %u, p2m=%u\n",
3302 d->domain_id,
3303 d->arch.paging.shadow.total_pages,
3304 d->arch.paging.shadow.free_pages,
3305 d->arch.paging.shadow.p2m_pages);
3307 /* Double-check that the domain didn't have any shadow memory.
3308 * It is possible for a domain that never got domain_kill()ed
3309 * to get here with its shadow allocation intact. */
3310 if ( d->arch.paging.shadow.total_pages != 0 )
3311 shadow_teardown(d);
3313 /* It is now safe to pull down the p2m map. */
3314 p2m_teardown(d);
3316 SHADOW_PRINTK("dom %u final teardown done."
3317 " Shadow pages total = %u, free = %u, p2m=%u\n",
3318 d->domain_id,
3319 d->arch.paging.shadow.total_pages,
3320 d->arch.paging.shadow.free_pages,
3321 d->arch.paging.shadow.p2m_pages);
3324 static int shadow_one_bit_enable(struct domain *d, u32 mode)
3325 /* Turn on a single shadow mode feature */
3327 ASSERT(shadow_locked_by_me(d));
3329 /* Sanity check the call */
3330 if ( d == current->domain || (d->arch.paging.mode & mode) == mode )
3332 return -EINVAL;
3335 mode |= PG_SH_enable;
3337 if ( d->arch.paging.mode == 0 )
3339 /* Init the shadow memory allocation and the hash table */
3340 if ( sh_set_allocation(d, 1, NULL) != 0
3341 || shadow_hash_alloc(d) != 0 )
3343 sh_set_allocation(d, 0, NULL);
3344 return -ENOMEM;
3348 /* Update the bits */
3349 sh_new_mode(d, d->arch.paging.mode | mode);
3351 return 0;
3354 static int shadow_one_bit_disable(struct domain *d, u32 mode)
3355 /* Turn off a single shadow mode feature */
3357 struct vcpu *v;
3358 ASSERT(shadow_locked_by_me(d));
3360 /* Sanity check the call */
3361 if ( d == current->domain || !((d->arch.paging.mode & mode) == mode) )
3363 return -EINVAL;
3366 /* Update the bits */
3367 sh_new_mode(d, d->arch.paging.mode & ~mode);
3368 if ( d->arch.paging.mode == 0 )
3370 /* Get this domain off shadows */
3371 SHADOW_PRINTK("un-shadowing of domain %u starts."
3372 " Shadow pages total = %u, free = %u, p2m=%u\n",
3373 d->domain_id,
3374 d->arch.paging.shadow.total_pages,
3375 d->arch.paging.shadow.free_pages,
3376 d->arch.paging.shadow.p2m_pages);
3377 for_each_vcpu(d, v)
3379 if ( v->arch.paging.mode )
3380 v->arch.paging.mode->shadow.detach_old_tables(v);
3381 #if CONFIG_PAGING_LEVELS == 4
3382 if ( !(v->arch.flags & TF_kernel_mode) )
3383 make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
3384 else
3385 #endif
3386 make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
3390 /* Pull down the memory allocation */
3391 if ( sh_set_allocation(d, 0, NULL) != 0 )
3393 // XXX - How can this occur?
3394 // Seems like a bug to return an error now that we've
3395 // disabled the relevant shadow mode.
3396 //
3397 return -ENOMEM;
3399 shadow_hash_teardown(d);
3400 SHADOW_PRINTK("un-shadowing of domain %u done."
3401 " Shadow pages total = %u, free = %u, p2m=%u\n",
3402 d->domain_id,
3403 d->arch.paging.shadow.total_pages,
3404 d->arch.paging.shadow.free_pages,
3405 d->arch.paging.shadow.p2m_pages);
3408 return 0;
3411 /* Enable/disable ops for the "test" and "log-dirty" modes */
3412 static int shadow_test_enable(struct domain *d)
3414 int ret;
3416 domain_pause(d);
3417 shadow_lock(d);
3418 ret = shadow_one_bit_enable(d, PG_SH_enable);
3419 shadow_unlock(d);
3420 domain_unpause(d);
3422 return ret;
3425 static int shadow_test_disable(struct domain *d)
3427 int ret;
3429 domain_pause(d);
3430 shadow_lock(d);
3431 ret = shadow_one_bit_disable(d, PG_SH_enable);
3432 shadow_unlock(d);
3433 domain_unpause(d);
3435 return ret;
3438 /**************************************************************************/
3439 /* P2M map manipulations */
3441 /* Shadow specific code which should be called when a P2M table entry is
3442 * updated with new content. It is responsible for updating the entry, as
3443 * well as for other shadow processing jobs.
3444 */
3446 static void sh_unshadow_for_p2m_change(struct vcpu *v, unsigned long gfn,
3447 l1_pgentry_t *p, mfn_t table_mfn,
3448 l1_pgentry_t new, unsigned int level)
3450 struct domain *d = v->domain;
3452 /* If we're removing an MFN from the p2m, remove it from the shadows too */
3453 if ( level == 1 )
3455 mfn_t mfn = _mfn(l1e_get_pfn(*p));
3456 p2m_type_t p2mt = p2m_flags_to_type(l1e_get_flags(*p));
3457 if ( (p2m_is_valid(p2mt) || p2m_is_grant(p2mt)) && mfn_valid(mfn) )
3459 sh_remove_all_shadows_and_parents(v, mfn);
3460 if ( sh_remove_all_mappings(v, mfn) )
3461 flush_tlb_mask(&d->domain_dirty_cpumask);
3465 /* If we're removing a superpage mapping from the p2m, we need to check
3466 * all the pages covered by it. If they're still there in the new
3467 * scheme, that's OK, but otherwise they must be unshadowed. */
3468 if ( level == 2 && (l1e_get_flags(*p) & _PAGE_PRESENT) &&
3469 (l1e_get_flags(*p) & _PAGE_PSE) )
3471 unsigned int i;
3472 cpumask_t flushmask;
3473 mfn_t omfn = _mfn(l1e_get_pfn(*p));
3474 mfn_t nmfn = _mfn(l1e_get_pfn(new));
3475 l1_pgentry_t *npte = NULL;
3476 p2m_type_t p2mt = p2m_flags_to_type(l1e_get_flags(*p));
3477 if ( p2m_is_valid(p2mt) && mfn_valid(omfn) )
3479 cpus_clear(flushmask);
3481 /* If we're replacing a superpage with a normal L1 page, map it */
3482 if ( (l1e_get_flags(new) & _PAGE_PRESENT)
3483 && !(l1e_get_flags(new) & _PAGE_PSE)
3484 && mfn_valid(nmfn) )
3485 npte = map_domain_page(mfn_x(nmfn));
3487 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3489 if ( !npte
3490 || !p2m_is_ram(p2m_flags_to_type(l1e_get_flags(npte[i])))
3491 || l1e_get_pfn(npte[i]) != mfn_x(omfn) )
3493 /* This GFN->MFN mapping has gone away */
3494 sh_remove_all_shadows_and_parents(v, omfn);
3495 if ( sh_remove_all_mappings(v, omfn) )
3496 cpus_or(flushmask, flushmask, d->domain_dirty_cpumask);
3498 omfn = _mfn(mfn_x(omfn) + 1);
3500 flush_tlb_mask(&flushmask);
3502 if ( npte )
3503 unmap_domain_page(npte);
3508 void
3509 shadow_write_p2m_entry(struct vcpu *v, unsigned long gfn,
3510 l1_pgentry_t *p, mfn_t table_mfn,
3511 l1_pgentry_t new, unsigned int level)
3513 struct domain *d = v->domain;
3515 shadow_lock(d);
3517 /* If there are any shadows, update them. But if shadow_teardown()
3518 * has already been called then it's not safe to try. */
3519 if ( likely(d->arch.paging.shadow.total_pages != 0) )
3520 sh_unshadow_for_p2m_change(v, gfn, p, table_mfn, new, level);
3522 /* Update the entry with new content */
3523 safe_write_pte(p, new);
3525 /* install P2M in monitors for PAE Xen */
3526 #if CONFIG_PAGING_LEVELS == 3
3527 if ( level == 3 )
3528 /* We have written to the p2m l3: need to sync the per-vcpu
3529 * copies of it in the monitor tables */
3530 p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p);
3531 #endif
3533 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
3534 /* If we're doing FAST_FAULT_PATH, then shadow mode may have
3535 cached the fact that this is an mmio region in the shadow
3536 page tables. Blow the tables away to remove the cache.
3537 This is pretty heavy handed, but this is a rare operation
3538 (it might happen a dozen times during boot and then never
3539 again), so it doesn't matter too much. */
3540 if ( d->arch.paging.shadow.has_fast_mmio_entries )
3542 shadow_blow_tables(d);
3543 d->arch.paging.shadow.has_fast_mmio_entries = 0;
3545 #endif
3547 shadow_unlock(d);
3550 /**************************************************************************/
3551 /* Log-dirty mode support */
3553 /* Shadow specific code which is called in paging_log_dirty_enable().
3554 * Return 0 if no problem found.
3555 */
3556 int shadow_enable_log_dirty(struct domain *d)
3558 int ret;
3560 /* shadow lock is required here */
3561 shadow_lock(d);
3562 if ( shadow_mode_enabled(d) )
3564 /* This domain already has some shadows: need to clear them out
3565 * of the way to make sure that all references to guest memory are
3566 * properly write-protected */
3567 shadow_blow_tables(d);
3570 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
3571 /* 32bit PV guests on 64bit xen behave like older 64bit linux: they
3572 * change an l4e instead of cr3 to switch tables. Give them the
3573 * same optimization */
3574 if ( is_pv_32on64_domain(d) )
3575 d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
3576 #endif
3578 ret = shadow_one_bit_enable(d, PG_log_dirty);
3579 shadow_unlock(d);
3581 return ret;
3584 /* shadow specific code which is called in paging_log_dirty_disable() */
3585 int shadow_disable_log_dirty(struct domain *d)
3587 int ret;
3589 /* shadow lock is required here */
3590 shadow_lock(d);
3591 ret = shadow_one_bit_disable(d, PG_log_dirty);
3592 shadow_unlock(d);
3594 return ret;
3597 /* This function is called when we CLEAN the log-dirty bitmap. See
3598 * paging_log_dirty_op() for details.
3599 */
3600 void shadow_clean_dirty_bitmap(struct domain *d)
3602 shadow_lock(d);
3603 /* Need to revoke write access to the domain's pages again.
3604 * In future, we'll have a less heavy-handed approach to this,
3605 * but for now, we just unshadow everything except Xen. */
3606 shadow_blow_tables(d);
3607 shadow_unlock(d);
3611 /**************************************************************************/
3612 /* VRAM dirty tracking support */
3613 int shadow_track_dirty_vram(struct domain *d,
3614 unsigned long begin_pfn,
3615 unsigned long nr,
3616 XEN_GUEST_HANDLE_64(uint8) dirty_bitmap)
3618 int rc;
3619 unsigned long end_pfn = begin_pfn + nr;
3620 unsigned long dirty_size = (nr + 7) / 8;
3621 int flush_tlb = 0;
3622 unsigned long i;
3623 p2m_type_t t;
3624 struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
3626 if (end_pfn < begin_pfn
3627 || begin_pfn > d->arch.p2m->max_mapped_pfn
3628 || end_pfn >= d->arch.p2m->max_mapped_pfn)
3629 return -EINVAL;
3631 shadow_lock(d);
3633 if ( dirty_vram && (!nr ||
3634 ( begin_pfn != dirty_vram->begin_pfn
3635 || end_pfn != dirty_vram->end_pfn )) )
3637 /* Different tracking, tear the previous down. */
3638 gdprintk(XENLOG_INFO, "stopping tracking VRAM %lx - %lx\n", dirty_vram->begin_pfn, dirty_vram->end_pfn);
3639 xfree(dirty_vram->sl1ma);
3640 xfree(dirty_vram->dirty_bitmap);
3641 xfree(dirty_vram);
3642 dirty_vram = d->arch.hvm_domain.dirty_vram = NULL;
3643 }
3645 if ( !nr )
3646 {
3647 rc = 0;
3648 goto out;
3649 }
3651 /* This should happen only rarely (on a video mode change),
3652 * so there is no need to be particularly efficient about it. */
3653 if ( !dirty_vram )
3654 {
3655 /* Throw away all the shadows rather than walking through them
3656 * up to nr times getting rid of mappings of each pfn */
3657 shadow_blow_tables(d);
3659 gdprintk(XENLOG_INFO, "tracking VRAM %lx - %lx\n", begin_pfn, end_pfn);
3661 rc = -ENOMEM;
3662 if ( (dirty_vram = xmalloc(struct sh_dirty_vram)) == NULL )
3663 goto out;
3664 dirty_vram->begin_pfn = begin_pfn;
3665 dirty_vram->end_pfn = end_pfn;
3666 d->arch.hvm_domain.dirty_vram = dirty_vram;
3668 if ( (dirty_vram->sl1ma = xmalloc_array(paddr_t, nr)) == NULL )
3669 goto out_dirty_vram;
3670 memset(dirty_vram->sl1ma, ~0, sizeof(paddr_t) * nr);
3672 if ( (dirty_vram->dirty_bitmap = xmalloc_array(uint8_t, dirty_size)) == NULL )
3673 goto out_sl1ma;
3674 memset(dirty_vram->dirty_bitmap, 0, dirty_size);
3676 dirty_vram->last_dirty = NOW();
3678 /* Tell the caller that this time we could not track dirty bits. */
3679 rc = -ENODATA;
3680 }
3681 else if (dirty_vram->last_dirty == -1)
3682 {
3683 /* still completely clean, just copy our empty bitmap */
3684 rc = -EFAULT;
3685 if ( copy_to_guest(dirty_bitmap, dirty_vram->dirty_bitmap, dirty_size) == 0 )
3686 rc = 0;
3687 }
3688 else
3689 {
3690 #ifdef __i386__
3691 unsigned long map_mfn = INVALID_MFN;
3692 void *map_sl1p = NULL;
3693 #endif
3695 /* Iterate over VRAM to track dirty bits. */
3696 for ( i = 0; i < nr; i++ ) {
3697 mfn_t mfn = gfn_to_mfn(d, begin_pfn + i, &t);
3698 struct page_info *page;
3699 int dirty = 0;
3700 paddr_t sl1ma = dirty_vram->sl1ma[i];
3702 if (mfn_x(mfn) == INVALID_MFN)
3703 {
3704 dirty = 1;
3705 }
3706 else
3707 {
3708 page = mfn_to_page(mfn);
3709 switch (page->u.inuse.type_info & PGT_count_mask)
3710 {
3711 case 0:
3712 /* No guest reference, nothing to track. */
3713 break;
3714 case 1:
3715 /* One guest reference. */
3716 if ( sl1ma == INVALID_PADDR )
3717 {
3718 /* We don't know which sl1e points to this, too bad. */
3719 dirty = 1;
3720 /* TODO: Heuristics for finding the single mapping of
3721 * this gmfn */
3722 flush_tlb |= sh_remove_all_mappings(d->vcpu[0], mfn);
3723 }
3724 else
3725 {
3726 /* Hopefully the most common case: only one mapping,
3727 * whose dirty bit we can use. */
3728 l1_pgentry_t *sl1e;
3729 #ifdef __i386__
3730 void *sl1p = map_sl1p;
3731 unsigned long sl1mfn = paddr_to_pfn(sl1ma);
3733 if ( sl1mfn != map_mfn ) {
3734 if ( map_sl1p )
3735 sh_unmap_domain_page(map_sl1p);
3736 map_sl1p = sl1p = sh_map_domain_page(_mfn(sl1mfn));
3737 map_mfn = sl1mfn;
3738 }
3739 sl1e = sl1p + (sl1ma & ~PAGE_MASK);
3740 #else
3741 sl1e = maddr_to_virt(sl1ma);
3742 #endif
3744 if ( l1e_get_flags(*sl1e) & _PAGE_DIRTY )
3745 {
3746 dirty = 1;
3747 /* Note: this is atomic, so we may clear a
3748 * _PAGE_ACCESSED set by another processor. */
3749 l1e_remove_flags(*sl1e, _PAGE_DIRTY);
3750 flush_tlb = 1;
3751 }
3752 }
3753 break;
3754 default:
3755 /* More than one guest reference:
3756 * we cannot afford to track that. */
3757 dirty = 1;
3758 break;
3759 }
3760 }
3762 if ( dirty )
3763 {
3764 dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8);
3765 dirty_vram->last_dirty = NOW();
3766 }
3767 }
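/* Illustrative sketch -- not part of common.c; hypothetical names, plain C,
 * kept under #if 0.  The loop above keys its behaviour off the guest
 * reference count of each frame: zero references cannot be dirty, exactly
 * one lets us consult the lone shadow l1e's dirty bit, and anything more
 * (or an unknown l1e) is simply assumed dirty because tracking every
 * mapping would cost too much.  The heuristic in isolation: */
#if 0
/* Returns 1 if the frame must be reported dirty, 0 if it is known clean. */
static int toy_frame_dirty(unsigned int guest_refs, int have_single_sl1e,
                           int sl1e_dirty_bit)
{
    if ( guest_refs == 0 )
        return 0;                 /* no guest mapping, nothing to track */
    if ( guest_refs == 1 && have_single_sl1e )
        return sl1e_dirty_bit;    /* trust the single mapping's dirty bit */
    return 1;                     /* unknown or multiple mappings: assume dirty */
}
#endif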
3769 #ifdef __i386__
3770 if ( map_sl1p )
3771 sh_unmap_domain_page(map_sl1p);
3772 #endif
3774 rc = -EFAULT;
3775 if ( copy_to_guest(dirty_bitmap, dirty_vram->dirty_bitmap, dirty_size) == 0 ) {
3776 memset(dirty_vram->dirty_bitmap, 0, dirty_size);
3777 if (dirty_vram->last_dirty + SECONDS(2) < NOW())
3778 {
3779 /* was clean for more than two seconds, try to disable guest
3780 * write access */
3781 for ( i = begin_pfn; i < end_pfn; i++ ) {
3782 mfn_t mfn = gfn_to_mfn(d, i, &t);
3783 if (mfn_x(mfn) != INVALID_MFN)
3784 flush_tlb |= sh_remove_write_access(d->vcpu[0], mfn, 1, 0);
3785 }
3786 dirty_vram->last_dirty = -1;
3787 }
3788 rc = 0;
3789 }
3791 if ( flush_tlb )
3792 flush_tlb_mask(&d->domain_dirty_cpumask);
3793 goto out;
3794 }
3795 out_sl1ma:
3796 xfree(dirty_vram->sl1ma);
3797 out_dirty_vram:
3798 xfree(dirty_vram);
3799 dirty_vram = d->arch.hvm_domain.dirty_vram = NULL;
3801 out:
3802 shadow_unlock(d);
3803 return rc;
3804 }
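/* Illustrative sketch -- not part of common.c; the vram_* helpers are
 * hypothetical, plain C, kept under #if 0.  The function above sizes its
 * bitmap as (nr + 7) / 8 bytes -- one bit per pfn in
 * [begin_pfn, begin_pfn + nr), rounded up to whole bytes -- and indexes it
 * relative to begin_pfn.  The same arithmetic in isolation: */
#if 0
#include <stddef.h>
#include <stdint.h>

/* One bit per pfn, rounded up to a whole number of bytes. */
static size_t vram_bitmap_bytes(unsigned long nr)
{
    return (nr + 7) / 8;
}

static void vram_set_dirty(uint8_t *bitmap, unsigned long begin_pfn,
                           unsigned long pfn)
{
    unsigned long i = pfn - begin_pfn;  /* bit index is relative to the range */
    bitmap[i / 8] |= 1 << (i % 8);
}

static int vram_test_dirty(const uint8_t *bitmap, unsigned long begin_pfn,
                           unsigned long pfn)
{
    unsigned long i = pfn - begin_pfn;
    return (bitmap[i / 8] >> (i % 8)) & 1;
}

/* e.g. tracking 2048 pfns (8MiB of VRAM with 4KiB pages) needs
 * vram_bitmap_bytes(2048) == 256 bytes of bitmap. */
#endif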
3806 /**************************************************************************/
3807 /* Shadow-control XEN_DOMCTL dispatcher */
3809 int shadow_domctl(struct domain *d,
3810 xen_domctl_shadow_op_t *sc,
3811 XEN_GUEST_HANDLE(void) u_domctl)
3812 {
3813 int rc, preempted = 0;
3815 switch ( sc->op )
3816 {
3817 case XEN_DOMCTL_SHADOW_OP_OFF:
3818 if ( d->arch.paging.mode == PG_SH_enable )
3819 if ( (rc = shadow_test_disable(d)) != 0 )
3820 return rc;
3821 return 0;
3823 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
3824 return shadow_test_enable(d);
3826 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
3827 return shadow_enable(d, PG_refcounts|PG_translate);
3829 case XEN_DOMCTL_SHADOW_OP_ENABLE:
3830 return shadow_enable(d, sc->mode << PG_mode_shift);
3832 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
3833 sc->mb = shadow_get_allocation(d);
3834 return 0;
3836 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
3837 shadow_lock(d);
3838 if ( sc->mb == 0 && shadow_mode_enabled(d) )
3839 {
3840 /* Can't set the allocation to zero unless the domain stops using
3841 * shadow pagetables first */
3842 SHADOW_ERROR("Can't set shadow allocation to zero, domain %u"
3843 " is still using shadows.\n", d->domain_id);
3844 shadow_unlock(d);
3845 return -EINVAL;
3846 }
3847 rc = sh_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
3848 shadow_unlock(d);
3849 if ( preempted )
3850 /* Not finished. Set up to re-run the call. */
3851 rc = hypercall_create_continuation(
3852 __HYPERVISOR_domctl, "h", u_domctl);
3853 else
3854 /* Finished. Return the new allocation */
3855 sc->mb = shadow_get_allocation(d);
3856 return rc;
3858 default:
3859 SHADOW_ERROR("Bad shadow op %u\n", sc->op);
3860 return -EINVAL;
3861 }
3862 }
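/* Illustrative sketch -- not part of common.c; toy_* names are hypothetical,
 * plain C, kept under #if 0.  The SET/GET_ALLOCATION cases above convert
 * between the megabyte count in sc->mb and a page count with a shift of
 * (20 - PAGE_SHIFT): with 4KiB pages (PAGE_SHIFT == 12) that is 256 pages
 * per MiB.  The conversion in isolation: */
#if 0
#define TOY_PAGE_SHIFT 12   /* 4KiB pages, as on x86 */

static unsigned long toy_mb_to_pages(unsigned long mb)
{
    return mb << (20 - TOY_PAGE_SHIFT);     /* 1 MiB = 2^20 bytes */
}

static unsigned long toy_pages_to_mb(unsigned long pages)
{
    return pages >> (20 - TOY_PAGE_SHIFT);  /* integer shift, rounds down */
}

/* e.g. toy_mb_to_pages(16) == 4096 and toy_pages_to_mb(4096) == 16. */
#endif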
3865 /**************************************************************************/
3866 /* Auditing shadow tables */
3868 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
3870 void shadow_audit_tables(struct vcpu *v)
3871 {
3872 /* Dispatch table for getting per-type functions */
3873 static const hash_callback_t callbacks[SH_type_unused] = {
3874 NULL, /* none */
3875 SHADOW_INTERNAL_NAME(sh_audit_l1_table, 2), /* l1_32 */
3876 SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 2), /* fl1_32 */
3877 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 2), /* l2_32 */
3878 SHADOW_INTERNAL_NAME(sh_audit_l1_table, 3), /* l1_pae */
3879 SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 3), /* fl1_pae */
3880 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 3), /* l2_pae */
3881 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 3), /* l2h_pae */
3882 #if CONFIG_PAGING_LEVELS >= 4
3883 SHADOW_INTERNAL_NAME(sh_audit_l1_table, 4), /* l1_64 */
3884 SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 4), /* fl1_64 */
3885 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 4), /* l2_64 */
3886 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 4), /* l2h_64 */
3887 SHADOW_INTERNAL_NAME(sh_audit_l3_table, 4), /* l3_64 */
3888 SHADOW_INTERNAL_NAME(sh_audit_l4_table, 4), /* l4_64 */
3889 #endif /* CONFIG_PAGING_LEVELS >= 4 */
3890 NULL /* All the rest */
3891 };
3892 unsigned int mask;
3894 if ( !(SHADOW_AUDIT_ENABLE) )
3895 return;
3897 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3898 sh_oos_audit(v->domain);
3899 #endif
3901 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
3902 mask = ~1; /* Audit every table in the system */
3903 else
3904 {
3905 /* Audit only the current mode's tables */
3906 switch ( v->arch.paging.mode->guest_levels )
3907 {
3908 case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
3909 case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
3910 |SHF_L2H_PAE); break;
3911 case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
3912 |SHF_L3_64|SHF_L4_64); break;
3913 default: BUG();
3914 }
3915 }
3917 hash_foreach(v, ~1, callbacks, _mfn(INVALID_MFN));
3918 }
3920 #endif /* Shadow audit */
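/* Illustrative sketch -- not part of common.c; toy_* names are hypothetical,
 * plain C, kept under #if 0.  The audit routine above pairs a callback table
 * indexed by shadow type with a bitmask of the types worth visiting for the
 * current paging mode.  A generic model of that "handler table plus type
 * mask" dispatch: */
#if 0
enum toy_type { TOY_NONE, TOY_L1, TOY_L2, TOY_NR_TYPES };

typedef void (*toy_audit_fn)(unsigned int idx);

static void toy_audit_l1(unsigned int idx) { (void)idx; /* check an L1-style object */ }
static void toy_audit_l2(unsigned int idx) { (void)idx; /* check an L2-style object */ }

static const toy_audit_fn toy_callbacks[TOY_NR_TYPES] = {
    [TOY_L1] = toy_audit_l1,
    [TOY_L2] = toy_audit_l2,
    /* entries left NULL are skipped */
};

/* Visit only the objects whose type is selected by 'mask'
 * (bit t set => audit objects of type t). */
static void toy_audit_all(unsigned int mask, unsigned int nr_objects,
                          const enum toy_type *type_of)
{
    unsigned int i;

    for ( i = 0; i < nr_objects; i++ )
    {
        enum toy_type t = type_of[i];

        if ( (mask & (1u << t)) && toy_callbacks[t] )
            toy_callbacks[t](i);
    }
}
#endif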
3922 /*
3923 * Local variables:
3924 * mode: C
3925 * c-set-style: "BSD"
3926 * c-basic-offset: 4
3927 * indent-tabs-mode: nil
3928 * End:
3929 */