debuggers.hg: xen/arch/x86/shadow.c @ 4671:18a8f5216548

bitkeeper revision 1.1366 (4268c126o36cKcnzrSkVxkbrPsoz1g)

Clean up shadow destruction and fix domain destroy when shadow mode
is disabled.
Signed-off-by: Keir Fraser <keir@xensource.com>

author   kaf24@firebug.cl.cam.ac.uk
date     Fri Apr 22 09:17:26 2005 +0000 (2005-04-22)
parents  744349042cd0
children ccc4ee412321

1 /******************************************************************************
2 * arch/x86/shadow.c
3 *
4 * Copyright (c) 2005 Michael A Fetterman
5 * Based on an earlier implementation by Ian Pratt et al
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
23 #include <xen/config.h>
24 #include <xen/types.h>
25 #include <xen/mm.h>
26 #include <asm/shadow.h>
27 #include <asm/domain_page.h>
28 #include <asm/page.h>
29 #include <xen/event.h>
30 #include <xen/sched.h>
31 #include <xen/trace.h>
33 static void shadow_free_snapshot(struct domain *d,
34 struct out_of_sync_entry *entry);
35 static void remove_out_of_sync_entries(struct domain *d, unsigned long smfn);
36 static void free_writable_pte_predictions(struct domain *d);
38 /********
40 There's a per-domain shadow table spin lock which works fine for SMP
41 hosts. We don't have to worry about interrupts as no shadow operations
42 happen in an interrupt context. It's probably not quite ready for SMP
43 guest operation as we have to worry about synchronisation between gpte
44 and spte updates. It's possible that this might only happen in a
45 hypercall context, in which case we'll probably have a per-domain
46 hypercall lock anyhow (at least initially).
48 ********/
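/*
 * A minimal sketch of the locking discipline described above; the helper
 * name example_locked_shadow_op() is hypothetical and shown for
 * illustration only. It mirrors the pattern used by shadow_mode_enable()
 * and shadow_invlpg() later in this file.
 */
static inline void example_locked_shadow_op(struct domain *d)
{
    shadow_lock(d);      /* serialises all updates to d->arch shadow state */
    /* ... examine or modify shadow page tables here (never from an
     *     interrupt context, per the comment above) ...                   */
    shadow_unlock(d);
}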
50 static inline int
51 shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
52 unsigned long new_type)
53 {
54 struct pfn_info *page = pfn_to_page(gmfn);
55 int pinned = 0, okay = 1;
57 if ( page_out_of_sync(page) )
58 {
59 // Don't know how long ago this snapshot was taken.
60 // Can't trust it to be recent enough.
61 //
62 __shadow_sync_mfn(d, gmfn);
63 }
65 if ( unlikely(page_is_page_table(page)) )
66 return 1;
68 FSH_LOG("%s: gpfn=%p gmfn=%p nt=%p", __func__, gpfn, gmfn, new_type);
70 if ( !shadow_remove_all_write_access(d, gpfn, gmfn) )
71 {
72 FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%p gmfn=%p",
73 __func__, gpfn, gmfn);
74 #if 1 || defined(LIVE_DANGEROUSLY)
75 set_bit(_PGC_page_table, &page->count_info);
76 return 1;
77 #endif
78 return 0;
80 }
82 // To convert this page for use as a page table, the writable count
83 // should now be zero. Test this by grabbing the page as a page table,
84 // and then immediately releasing. This will also deal with any
85 // necessary TLB flushing issues for us.
86 //
87 // The cruft here about pinning doesn't really work right. This
88 // needs rethinking/rewriting... Need to gracefully deal with the
89 // TLB flushes required when promoting a writable page, and also deal
90 // with any outstanding (external) writable refs to this page (by
91 // refusing to promote it). The pinning headache complicates this
92 // code -- it would all get much simpler if we stopped using
93 // shadow_lock() and move the shadow code to BIGLOCK().
94 //
95 if ( unlikely(!get_page(page, d)) )
96 BUG(); // XXX -- needs more thought for a graceful failure
97 if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
98 {
99 pinned = 1;
100 put_page_and_type(page);
101 }
102 if ( get_page_type(page, PGT_base_page_table) )
103 {
104 set_bit(_PGC_page_table, &page->count_info);
105 put_page_type(page);
106 }
107 else
108 {
109 printk("shadow_promote: get_page_type failed "
110 "dom%d gpfn=%p gmfn=%p t=%x\n",
111 d->id, gpfn, gmfn, new_type);
112 okay = 0;
113 }
115 // Now put the type back to writable...
116 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
117 BUG(); // XXX -- needs more thought for a graceful failure
118 if ( unlikely(pinned) )
119 {
120 if ( unlikely(test_and_set_bit(_PGT_pinned,
121 &page->u.inuse.type_info)) )
122 BUG(); // hmm... someone pinned this again?
123 }
124 else
125 put_page_and_type(page);
127 return okay;
128 }
130 static inline void
131 shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
132 {
133 ASSERT(frame_table[gmfn].count_info & PGC_page_table);
135 if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none )
136 {
137 clear_bit(_PGC_page_table, &frame_table[gmfn].count_info);
139 if ( page_out_of_sync(pfn_to_page(gmfn)) )
140 {
141 remove_out_of_sync_entries(d, gmfn);
142 }
143 }
144 }
146 /*
147 * Things in shadow mode that collect get_page() refs to the domain's
148 * pages are:
149 * - PGC_allocated takes a gen count, just like normal.
150 * - A writable page can be pinned (paravirtualized guests may consider
151 * these pages to be L1s or L2s, and don't know the difference).
152 * Pinning a page takes a gen count (but, for domains in shadow mode,
153 * it *doesn't* take a type count)
154 * - CR3 grabs a ref to whatever it points at, just like normal.
155 * - Shadow mode grabs an initial gen count for itself, as a placeholder
156 * for whatever references will exist.
157 * - Shadow PTEs that point to a page take a gen count, just like regular
158 * PTEs. However, they don't get a type count, as get_page_type() is
159 * hardwired to keep writable pages' counts at 1 for domains in shadow
160 * mode.
161 * - Whenever we shadow a page, the entry in the shadow hash grabs a
162 * general ref to the page.
163 * - Whenever a page goes out of sync, the out of sync entry grabs a
164 * general ref to the page.
165 */
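/*
 * A minimal sketch of the general-ref pairing implied by the rules above;
 * the helper name example_hold_shadow_ref() is hypothetical, for
 * illustration only. get_shadow_ref()/put_shadow_ref() are the primitives
 * used throughout this file (e.g. in free_shadow_l2_table() and
 * shadow_l2_table()).
 */
static inline void example_hold_shadow_ref(unsigned long smfn)
{
    if ( !get_shadow_ref(smfn) )   /* take a general ref on the shadow page */
        BUG();
    /* ... the shadow at smfn cannot be freed while the ref is held ...     */
    put_shadow_ref(smfn);          /* drop the ref when done                */
}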
166 /*
167 * pfn_info fields for pages allocated as shadow pages:
168 *
169 * All 32 bits of count_info are a simple count of refs to this shadow
170 * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table),
171 * c) if it's a pinned shadow root pgtable, d) outstanding out-of-sync
172 * references.
173 *
174 * u.inuse._domain is left NULL, to prevent some random domain from
175 * accidentally gaining permission to map this page.
176 *
177 * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
178 * shadowed.
179 * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
180 * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
181 * currently exists because this is a shadow of a root page, and we
182 * don't want to let those disappear just because no CR3 is currently pointing
183 * at it.
184 *
185 * tlbflush_timestamp holds a min & max index of valid page table entries
186 * within the shadow page.
187 */
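/*
 * A small sketch of the tlbflush_timestamp min/max encoding mentioned
 * above, using the SHADOW_ENCODE_MIN_MAX/SHADOW_MIN/SHADOW_MAX macros that
 * this file relies on (they are defined in the shadow header, not here).
 * The helper name example_minmax_encoding() is hypothetical.
 */
static inline void example_minmax_encoding(unsigned long smfn)
{
    /* Record that only entries 1..6 of this shadow page are valid. */
    u32 min_max = SHADOW_ENCODE_MIN_MAX(1, 6);

    pfn_to_page(smfn)->tlbflush_timestamp = min_max;

    ASSERT(SHADOW_MIN(min_max) == 1);   /* lowest valid entry index  */
    ASSERT(SHADOW_MAX(min_max) == 6);   /* highest valid entry index */
}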
189 static inline unsigned long
190 alloc_shadow_page(struct domain *d,
191 unsigned long gpfn, unsigned long gmfn,
192 u32 psh_type)
193 {
194 struct pfn_info *page;
195 unsigned long smfn;
196 int pin = 0;
198 // Currently, we only keep pre-zero'ed pages around for use as L1's...
199 // This will change. Soon.
200 //
201 if ( psh_type == PGT_l1_shadow )
202 {
203 if ( !list_empty(&d->arch.free_shadow_frames) )
204 {
205 struct list_head *entry = d->arch.free_shadow_frames.next;
206 page = list_entry(entry, struct pfn_info, list);
207 list_del(entry);
208 perfc_decr(free_l1_pages);
209 }
210 else
211 {
212 page = alloc_domheap_page(NULL);
213 void *l1 = map_domain_mem(page_to_pfn(page) << PAGE_SHIFT);
214 memset(l1, 0, PAGE_SIZE);
215 unmap_domain_mem(l1);
216 }
217 }
218 else
219 page = alloc_domheap_page(NULL);
221 if ( unlikely(page == NULL) )
222 {
223 printk("Couldn't alloc shadow page! dom%d count=%d\n",
224 d->id, d->arch.shadow_page_count);
225 printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
226 perfc_value(shadow_l1_pages),
227 perfc_value(shadow_l2_pages),
228 perfc_value(hl2_table_pages),
229 perfc_value(snapshot_pages));
230 BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
231 }
233 smfn = page_to_pfn(page);
235 ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
236 page->u.inuse.type_info = psh_type | gmfn;
237 page->count_info = 0;
238 page->tlbflush_timestamp = 0;
240 switch ( psh_type )
241 {
242 case PGT_l1_shadow:
243 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
244 goto fail;
245 perfc_incr(shadow_l1_pages);
246 d->arch.shadow_page_count++;
247 break;
249 case PGT_l2_shadow:
250 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
251 goto fail;
252 perfc_incr(shadow_l2_pages);
253 d->arch.shadow_page_count++;
254 if ( PGT_l2_page_table == PGT_root_page_table )
255 pin = 1;
257 break;
259 case PGT_hl2_shadow:
260 // Treat an hl2 as an L1 for purposes of promotion.
261 // For external mode domains, treat them as an L2 for purposes of
262 // pinning.
263 //
264 if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) )
265 goto fail;
266 perfc_incr(hl2_table_pages);
267 d->arch.hl2_page_count++;
268 if ( shadow_mode_external(d) &&
269 (PGT_l2_page_table == PGT_root_page_table) )
270 pin = 1;
272 break;
274 case PGT_snapshot:
275 perfc_incr(snapshot_pages);
276 d->arch.snapshot_page_count++;
277 break;
279 default:
280 printk("Alloc shadow weird page type type=%08x\n", psh_type);
281 BUG();
282 break;
283 }
285 // Don't add a new shadow of something that already has a snapshot.
286 //
287 ASSERT( (psh_type == PGT_snapshot) || !mfn_out_of_sync(gmfn) );
289 set_shadow_status(d, gpfn, gmfn, smfn, psh_type);
291 if ( pin )
292 shadow_pin(smfn);
294 return smfn;
296 fail:
297 FSH_LOG("promotion of pfn=%p mfn=%p failed! external gnttab refs?",
298 gpfn, gmfn);
299 free_domheap_page(page);
300 return 0;
301 }
303 static void inline
304 free_shadow_l1_table(struct domain *d, unsigned long smfn)
305 {
306 l1_pgentry_t *pl1e = map_domain_mem(smfn << PAGE_SHIFT);
307 int i;
308 struct pfn_info *spage = pfn_to_page(smfn);
309 u32 min_max = spage->tlbflush_timestamp;
310 int min = SHADOW_MIN(min_max);
311 int max = SHADOW_MAX(min_max);
313 for ( i = min; i <= max; i++ )
314 {
315 put_page_from_l1e(pl1e[i], d);
316 pl1e[i] = l1e_empty();
317 }
319 unmap_domain_mem(pl1e);
320 }
322 static void inline
323 free_shadow_hl2_table(struct domain *d, unsigned long smfn)
324 {
325 l1_pgentry_t *hl2 = map_domain_mem(smfn << PAGE_SHIFT);
326 int i, limit;
328 SH_VVLOG("%s: smfn=%p freed", __func__, smfn);
330 #ifdef __i386__
331 if ( shadow_mode_external(d) )
332 limit = L2_PAGETABLE_ENTRIES;
333 else
334 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
335 #else
336 limit = 0; /* XXX x86/64 XXX */
337 #endif
339 for ( i = 0; i < limit; i++ )
340 {
341 if ( l1e_get_flags(hl2[i]) & _PAGE_PRESENT )
342 put_page(pfn_to_page(l1e_get_pfn(hl2[i])));
343 }
345 unmap_domain_mem(hl2);
346 }
348 static void inline
349 free_shadow_l2_table(struct domain *d, unsigned long smfn)
350 {
351 unsigned long *pl2e = map_domain_mem(smfn << PAGE_SHIFT);
352 int i, external = shadow_mode_external(d);
354 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
355 if ( external || is_guest_l2_slot(i) )
356 if ( pl2e[i] & _PAGE_PRESENT )
357 put_shadow_ref(pl2e[i] >> PAGE_SHIFT);
359 if ( (PGT_base_page_table == PGT_l2_page_table) &&
360 shadow_mode_translate(d) && !external )
361 {
362 // free the ref to the hl2
363 //
364 put_shadow_ref(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]
365 >> PAGE_SHIFT);
366 }
368 unmap_domain_mem(pl2e);
369 }
371 void free_shadow_page(unsigned long smfn)
372 {
373 struct pfn_info *page = &frame_table[smfn];
374 unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;
375 struct domain *d = page_get_owner(pfn_to_page(gmfn));
376 unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
377 unsigned long type = page->u.inuse.type_info & PGT_type_mask;
379 SH_VVLOG("%s: free'ing smfn=%p", __func__, smfn);
381 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
383 delete_shadow_status(d, gpfn, gmfn, type);
385 switch ( type )
386 {
387 case PGT_l1_shadow:
388 perfc_decr(shadow_l1_pages);
389 shadow_demote(d, gpfn, gmfn);
390 free_shadow_l1_table(d, smfn);
391 break;
393 case PGT_l2_shadow:
394 perfc_decr(shadow_l2_pages);
395 shadow_demote(d, gpfn, gmfn);
396 free_shadow_l2_table(d, smfn);
397 break;
399 case PGT_hl2_shadow:
400 perfc_decr(hl2_table_pages);
401 shadow_demote(d, gpfn, gmfn);
402 free_shadow_hl2_table(d, smfn);
403 break;
405 case PGT_snapshot:
406 perfc_decr(snapshot_pages);
407 break;
409 default:
410 printk("Free shadow weird page type mfn=%08x type=%08x\n",
411 page-frame_table, page->u.inuse.type_info);
412 break;
413 }
415 d->arch.shadow_page_count--;
417 // No TLB flushes are needed the next time this page gets allocated.
418 //
419 page->tlbflush_timestamp = 0;
420 page->u.free.cpu_mask = 0;
422 if ( type == PGT_l1_shadow )
423 {
424 list_add(&page->list, &d->arch.free_shadow_frames);
425 perfc_incr(free_l1_pages);
426 }
427 else
428 free_domheap_page(page);
429 }
431 static void inline
432 release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
433 {
434 struct pfn_info *page;
436 page = &frame_table[entry->gmfn];
438 // Decrement ref count of guest & shadow pages
439 //
440 put_page(page);
442 // Only use entries that have low bits clear...
443 //
444 if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
445 {
446 put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT);
447 entry->writable_pl1e = -2;
448 }
449 else
450 ASSERT( entry->writable_pl1e == -1 );
452 // Free the snapshot
453 //
454 shadow_free_snapshot(d, entry);
455 }
457 static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn)
458 {
459 struct out_of_sync_entry *entry = d->arch.out_of_sync;
460 struct out_of_sync_entry **prev = &d->arch.out_of_sync;
461 struct out_of_sync_entry *found = NULL;
463 // NB: Be careful not to call something that manipulates this list
464 // while walking it. Collect the results into a separate list
465 // first, then walk that list.
466 //
467 while ( entry )
468 {
469 if ( entry->gmfn == gmfn )
470 {
471 // remove from out of sync list
472 *prev = entry->next;
474 // add to found list
475 entry->next = found;
476 found = entry;
478 entry = *prev;
479 continue;
480 }
481 prev = &entry->next;
482 entry = entry->next;
483 }
485 prev = NULL;
486 entry = found;
487 while ( entry )
488 {
489 release_out_of_sync_entry(d, entry);
491 prev = &entry->next;
492 entry = entry->next;
493 }
495 // Add found list to free list
496 if ( prev )
497 {
498 *prev = d->arch.out_of_sync_free;
499 d->arch.out_of_sync_free = found;
500 }
501 }
503 static void free_out_of_sync_state(struct domain *d)
504 {
505 struct out_of_sync_entry *entry;
507 // NB: Be careful not to call something that manipulates this list
508 // while walking it. Remove one item at a time, and always
509 // restart from start of list.
510 //
511 while ( (entry = d->arch.out_of_sync) )
512 {
513 d->arch.out_of_sync = entry->next;
514 release_out_of_sync_entry(d, entry);
516 entry->next = d->arch.out_of_sync_free;
517 d->arch.out_of_sync_free = entry;
518 }
519 }
521 static void free_shadow_pages(struct domain *d)
522 {
523 int i;
524 struct shadow_status *x;
525 struct exec_domain *ed;
527 /*
528 * WARNING! The shadow page table must not currently be in use!
529 * e.g., you are expected to have paused the domain and synchronized CR3.
530 */
532 if( !d->arch.shadow_ht ) return;
534 shadow_audit(d, 1);
536 // first, remove any outstanding refs from out_of_sync entries...
537 //
538 free_out_of_sync_state(d);
540 // second, remove any outstanding refs from ed->arch.shadow_table...
541 //
542 for_each_exec_domain(d, ed)
543 {
544 if ( pagetable_val(ed->arch.shadow_table) )
545 {
546 put_shadow_ref(pagetable_val(ed->arch.shadow_table) >> PAGE_SHIFT);
547 ed->arch.shadow_table = mk_pagetable(0);
548 }
549 }
551 // For external shadows, remove the monitor table's refs
552 //
553 if ( shadow_mode_external(d) )
554 {
555 for_each_exec_domain(d, ed)
556 {
557 l2_pgentry_t *mpl2e = ed->arch.monitor_vtable;
559 if ( mpl2e )
560 {
561 l2_pgentry_t hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
562 l2_pgentry_t smfn = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
564 if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
565 {
566 put_shadow_ref(l2e_get_pfn(hl2e));
567 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
568 }
569 if ( l2e_get_flags(smfn) & _PAGE_PRESENT )
570 {
571 put_shadow_ref(l2e_get_pfn(smfn));
572 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
573 }
574 }
575 }
576 }
578 // Now, the only refs to shadow pages that are left are from the shadow
579 // pages themselves. We just unpin the pinned pages, and the rest
580 // should automatically disappear.
581 //
582 // NB: Beware: each explicit or implicit call to free_shadow_page
583 // can/will result in the hash bucket getting rewritten out from
584 // under us... First, collect the list of pinned pages, then
585 // free them.
586 //
587 #define PINNED(_x) (frame_table[_x].u.inuse.type_info & PGT_pinned)
588 for ( i = 0; i < shadow_ht_buckets; i++ )
589 {
590 u32 count;
591 unsigned long *mfn_list;
593 /* Skip empty buckets. */
594 if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
595 continue;
597 count = 0;
598 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
599 if ( PINNED(x->smfn) )
600 count++;
601 if ( !count )
602 continue;
604 mfn_list = xmalloc_array(unsigned long, count);
605 count = 0;
606 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
607 if ( PINNED(x->smfn) )
608 mfn_list[count++] = x->smfn;
610 while ( count )
611 {
612 shadow_unpin(mfn_list[--count]);
613 }
614 xfree(mfn_list);
615 }
616 #undef PINNED
618 shadow_audit(d, 0);
620 SH_LOG("Free shadow table.");
621 }
623 void shadow_mode_init(void)
624 {
625 }
627 int _shadow_mode_enabled(struct domain *d)
628 {
629 return shadow_mode_enabled(d);
630 }
632 static void alloc_monitor_pagetable(struct exec_domain *ed)
633 {
634 unsigned long mmfn;
635 l2_pgentry_t *mpl2e;
636 struct pfn_info *mmfn_info;
637 struct domain *d = ed->domain;
639 ASSERT(pagetable_val(ed->arch.monitor_table) == 0);
641 mmfn_info = alloc_domheap_page(NULL);
642 ASSERT(mmfn_info != NULL);
644 mmfn = (unsigned long) (mmfn_info - frame_table);
645 mpl2e = (l2_pgentry_t *) map_domain_mem(mmfn << PAGE_SHIFT);
646 memset(mpl2e, 0, PAGE_SIZE);
648 #ifdef __i386__ /* XXX screws x86/64 build */
649 memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
650 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
651 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
652 #endif
654 mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
655 l2e_create_phys(__pa(d->arch.mm_perdomain_pt),
656 __PAGE_HYPERVISOR);
658 // map the phys_to_machine map into the Read-Only MPT space for this domain
659 mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
660 l2e_create_phys(pagetable_val(d->arch.phys_table),
661 __PAGE_HYPERVISOR);
663 // Don't (yet) have mappings for these...
664 // Don't want to accidentally see the idle_pg_table's linear mapping.
665 //
666 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
667 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
669 ed->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT);
670 ed->arch.monitor_vtable = mpl2e;
671 }
673 /*
674 * Free the pages for monitor_table and hl2_table
675 */
676 void free_monitor_pagetable(struct exec_domain *ed)
677 {
678 l2_pgentry_t *mpl2e, hl2e, sl2e;
679 unsigned long mfn;
681 ASSERT( pagetable_val(ed->arch.monitor_table) );
683 mpl2e = ed->arch.monitor_vtable;
685 /*
686 * First get the mfn for hl2_table by looking at monitor_table
687 */
688 hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
689 if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
690 {
691 mfn = l2e_get_pfn(hl2e);
692 ASSERT(mfn);
693 put_shadow_ref(mfn);
694 }
696 sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
697 if ( l2e_get_flags(sl2e) & _PAGE_PRESENT )
698 {
699 mfn = l2e_get_pfn(sl2e);
700 ASSERT(mfn);
701 put_shadow_ref(mfn);
702 }
704 unmap_domain_mem(mpl2e);
706 /*
707 * Then free monitor_table.
708 */
709 mfn = (pagetable_val(ed->arch.monitor_table)) >> PAGE_SHIFT;
710 free_domheap_page(&frame_table[mfn]);
712 ed->arch.monitor_table = mk_pagetable(0);
713 ed->arch.monitor_vtable = 0;
714 }
716 int
717 set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn)
718 {
719 unsigned long phystab = pagetable_val(d->arch.phys_table);
720 l2_pgentry_t *l2, l2e;
721 l1_pgentry_t *l1;
722 struct pfn_info *l1page;
723 unsigned long va = pfn << PAGE_SHIFT;
725 ASSERT( phystab );
727 l2 = map_domain_mem(phystab);
728 l2e = l2[l2_table_offset(va)];
729 if ( !l2e_get_value(l2e) ) /* FIXME: check present bit? */
730 {
731 l1page = alloc_domheap_page(NULL);
732 if ( !l1page )
733 return 0;
735 l1 = map_domain_mem(page_to_pfn(l1page) << PAGE_SHIFT);
736 memset(l1, 0, PAGE_SIZE);
737 unmap_domain_mem(l1);
739 l2e = l2e_create_pfn(page_to_pfn(l1page), __PAGE_HYPERVISOR);
740 l2[l2_table_offset(va)] = l2e;
741 }
742 unmap_domain_mem(l2);
744 l1 = map_domain_mem(l2e_get_phys(l2e));
745 l1[l1_table_offset(va)] = l1e_create_pfn(mfn, __PAGE_HYPERVISOR);
746 unmap_domain_mem(l1);
748 return 1;
749 }
751 static int
752 alloc_p2m_table(struct domain *d)
753 {
754 struct list_head *list_ent;
755 struct pfn_info *page, *l2page;
756 l2_pgentry_t *l2;
757 unsigned long mfn, pfn;
759 l2page = alloc_domheap_page(NULL);
760 if ( !l2page )
761 return 0;
762 d->arch.phys_table = mk_pagetable(page_to_pfn(l2page) << PAGE_SHIFT);
763 l2 = map_domain_mem(page_to_pfn(l2page) << PAGE_SHIFT);
764 memset(l2, 0, PAGE_SIZE);
765 unmap_domain_mem(l2);
767 list_ent = d->page_list.next;
768 while ( list_ent != &d->page_list )
769 {
770 page = list_entry(list_ent, struct pfn_info, list);
771 mfn = page_to_pfn(page);
772 pfn = machine_to_phys_mapping[mfn];
773 ASSERT(pfn != INVALID_M2P_ENTRY);
774 ASSERT(pfn < (1u<<20));
776 set_p2m_entry(d, pfn, mfn);
778 list_ent = page->list.next;
779 }
781 list_ent = d->xenpage_list.next;
782 while ( list_ent != &d->xenpage_list )
783 {
784 page = list_entry(list_ent, struct pfn_info, list);
785 mfn = page_to_pfn(page);
786 pfn = machine_to_phys_mapping[mfn];
787 if ( (pfn != INVALID_M2P_ENTRY) &&
788 (pfn < (1u<<20)) )
789 {
790 set_p2m_entry(d, pfn, mfn);
791 }
793 list_ent = page->list.next;
794 }
796 return 1;
797 }
799 static void
800 free_p2m_table(struct domain *d)
801 {
802 // uh, this needs some work... :)
803 BUG();
804 }
806 int __shadow_mode_enable(struct domain *d, unsigned int mode)
807 {
808 struct exec_domain *ed;
809 int new_modes = (mode & ~d->arch.shadow_mode);
811 // Gotta be adding something to call this function.
812 ASSERT(new_modes);
814 // can't take anything away by calling this function.
815 ASSERT(!(d->arch.shadow_mode & ~mode));
817 for_each_exec_domain(d, ed)
818 {
819 invalidate_shadow_ldt(ed);
821 // We need to set these up for __update_pagetables().
822 // See the comment there.
824 /*
825 * arch.guest_vtable
826 */
827 if ( ed->arch.guest_vtable &&
828 (ed->arch.guest_vtable != __linear_l2_table) )
829 {
830 unmap_domain_mem(ed->arch.guest_vtable);
831 }
832 if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
833 ed->arch.guest_vtable = __linear_l2_table;
834 else
835 ed->arch.guest_vtable = NULL;
837 /*
838 * arch.shadow_vtable
839 */
840 if ( ed->arch.shadow_vtable &&
841 (ed->arch.shadow_vtable != __shadow_linear_l2_table) )
842 {
843 unmap_domain_mem(ed->arch.shadow_vtable);
844 }
845 if ( !(mode & SHM_external) )
846 ed->arch.shadow_vtable = __shadow_linear_l2_table;
847 else
848 ed->arch.shadow_vtable = NULL;
850 /*
851 * arch.hl2_vtable
852 */
853 if ( ed->arch.hl2_vtable &&
854 (ed->arch.hl2_vtable != __linear_hl2_table) )
855 {
856 unmap_domain_mem(ed->arch.hl2_vtable);
857 }
858 if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
859 ed->arch.hl2_vtable = __linear_hl2_table;
860 else
861 ed->arch.hl2_vtable = NULL;
863 /*
864 * arch.monitor_table & arch.monitor_vtable
865 */
866 if ( ed->arch.monitor_vtable )
867 {
868 free_monitor_pagetable(ed);
869 }
870 if ( mode & SHM_external )
871 {
872 alloc_monitor_pagetable(ed);
873 }
874 }
876 if ( new_modes & SHM_enable )
877 {
878 ASSERT( !d->arch.shadow_ht );
879 d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets);
880 if ( d->arch.shadow_ht == NULL )
881 goto nomem;
883 memset(d->arch.shadow_ht, 0,
884 shadow_ht_buckets * sizeof(struct shadow_status));
885 }
887 if ( new_modes & SHM_log_dirty )
888 {
889 ASSERT( !d->arch.shadow_dirty_bitmap );
890 d->arch.shadow_dirty_bitmap_size = (d->max_pages + 63) & ~63;
891 d->arch.shadow_dirty_bitmap =
892 xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size /
893 (8 * sizeof(unsigned long)));
894 if ( d->arch.shadow_dirty_bitmap == NULL )
895 {
896 d->arch.shadow_dirty_bitmap_size = 0;
897 goto nomem;
898 }
899 memset(d->arch.shadow_dirty_bitmap, 0,
900 d->arch.shadow_dirty_bitmap_size/8);
901 }
903 if ( new_modes & SHM_translate )
904 {
905 if ( !(new_modes & SHM_external) )
906 {
907 ASSERT( !pagetable_val(d->arch.phys_table) );
908 if ( !alloc_p2m_table(d) )
909 {
910 printk("alloc_p2m_table failed (out-of-memory?)\n");
911 goto nomem;
912 }
913 }
914 else
915 {
916 // external guests provide their own memory for their P2M maps.
917 //
918 ASSERT( d == page_get_owner(&frame_table[pagetable_val(
919 d->arch.phys_table)>>PAGE_SHIFT]) );
920 }
921 }
923 printk("audit1\n");
924 _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK);
925 printk("audit1 done\n");
927 // Get rid of any shadow pages from any previous shadow mode.
928 //
929 free_shadow_pages(d);
931 printk("audit2\n");
932 _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK);
933 printk("audit2 done\n");
935 // Turn off writable page tables.
936 // They don't mix with shadow mode,
937 // and shadow mode offers a superset of that functionality.
938 //
939 vm_assist(d, VMASST_CMD_disable, VMASST_TYPE_writable_pagetables);
941 /*
942 * Tear down its counts by disassembling its page-table-based ref counts.
943 * Also remove CR3's gcount/tcount.
944 * That leaves things like GDTs and LDTs and external refs intact.
945 *
946 * Most pages will be writable tcount=0.
947 * Some will still be L1 tcount=0 or L2 tcount=0.
948 * Maybe some pages will be type none tcount=0.
949 * Pages granted external writable refs (via grant tables?) will
950 * still have a non-zero tcount. That's OK.
951 *
952 * gcounts will generally be 1 for PGC_allocated.
953 * GDTs and LDTs will have additional gcounts.
954 * Any grant-table based refs will still be in the gcount.
955 *
956 * We attempt to grab writable refs to each page (thus setting its type).
957 * Immediately put back those type refs.
958 *
959 * Assert that no pages are left with L1/L2/L3/L4 type.
960 */
961 audit_adjust_pgtables(d, -1, 1);
962 d->arch.shadow_mode = mode;
964 struct list_head *list_ent = d->page_list.next;
965 while ( list_ent != &d->page_list )
966 {
967 struct pfn_info *page = list_entry(list_ent, struct pfn_info, list);
968 if ( !get_page_type(page, PGT_writable_page) )
969 BUG();
970 put_page_type(page);
972 list_ent = page->list.next;
973 }
975 audit_adjust_pgtables(d, 1, 1);
977 printk("audit3\n");
978 _audit_domain(d, AUDIT_ALREADY_LOCKED);
979 printk("audit3 done\n");
981 return 0;
983 nomem:
984 if ( (new_modes & SHM_enable) && (d->arch.shadow_ht != NULL) )
985 {
986 xfree(d->arch.shadow_ht);
987 d->arch.shadow_ht = NULL;
988 }
989 if ( (new_modes & SHM_log_dirty) && (d->arch.shadow_dirty_bitmap != NULL) )
990 {
991 xfree(d->arch.shadow_dirty_bitmap);
992 d->arch.shadow_dirty_bitmap = NULL;
993 }
994 if ( (new_modes & SHM_translate) && !(new_modes & SHM_external) &&
995 pagetable_val(d->arch.phys_table) )
996 {
997 free_p2m_table(d);
998 }
999 return -ENOMEM;
1002 int shadow_mode_enable(struct domain *d, unsigned int mode)
1004 int rc;
1005 shadow_lock(d);
1006 rc = __shadow_mode_enable(d, mode);
1007 shadow_unlock(d);
1008 return rc;
1011 static void
1012 translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn)
1014 int i;
1015 l1_pgentry_t *l1;
1017 l1 = map_domain_mem(l1mfn << PAGE_SHIFT);
1018 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
1020 if ( is_guest_l1_slot(i) &&
1021 (l1e_get_flags(l1[i]) & _PAGE_PRESENT) )
1023 unsigned long mfn = l1e_get_pfn(l1[i]);
1024 unsigned long gpfn = __mfn_to_gpfn(d, mfn);
1025 ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
1026 l1[i] = l1e_create_pfn(gpfn, l1e_get_flags(l1[i]));
1029 unmap_domain_mem(l1);
1032 // This is not general enough to handle arbitrary pagetables
1033 // with shared L1 pages, etc., but it is sufficient for bringing
1034 // up dom0.
1035 //
1036 void
1037 translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn)
1039 int i;
1040 l2_pgentry_t *l2;
1042 ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d));
1044 l2 = map_domain_mem(l2mfn << PAGE_SHIFT);
1045 for (i = 0; i < L2_PAGETABLE_ENTRIES; i++)
1047 if ( is_guest_l2_slot(i) &&
1048 (l2e_get_flags(l2[i]) & _PAGE_PRESENT) )
1050 unsigned long mfn = l2e_get_pfn(l2[i]);
1051 unsigned long gpfn = __mfn_to_gpfn(d, mfn);
1052 ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
1053 l2[i] = l2e_create_pfn(gpfn, l2e_get_flags(l2[i]));
1054 translate_l1pgtable(d, p2m, mfn);
1057 unmap_domain_mem(l2);
1060 static void free_shadow_ht_entries(struct domain *d)
1062 struct shadow_status *x, *n;
1064 SH_VLOG("freed tables count=%d l1=%d l2=%d",
1065 d->arch.shadow_page_count, perfc_value(shadow_l1_pages),
1066 perfc_value(shadow_l2_pages));
1068 n = d->arch.shadow_ht_extras;
1069 while ( (x = n) != NULL )
1071 d->arch.shadow_extras_count--;
1072 n = *((struct shadow_status **)(&x[shadow_ht_extra_size]));
1073 xfree(x);
1076 d->arch.shadow_ht_extras = NULL;
1077 d->arch.shadow_ht_free = NULL;
1079 ASSERT(d->arch.shadow_extras_count == 0);
1080 SH_LOG("freed extras, now %d", d->arch.shadow_extras_count);
1082 if ( d->arch.shadow_dirty_bitmap != NULL )
1084 xfree(d->arch.shadow_dirty_bitmap);
1085 d->arch.shadow_dirty_bitmap = 0;
1086 d->arch.shadow_dirty_bitmap_size = 0;
1089 xfree(d->arch.shadow_ht);
1090 d->arch.shadow_ht = NULL;
1093 static void free_out_of_sync_entries(struct domain *d)
1095 struct out_of_sync_entry *x, *n;
1097 n = d->arch.out_of_sync_extras;
1098 while ( (x = n) != NULL )
1100 d->arch.out_of_sync_extras_count--;
1101 n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size]));
1102 xfree(x);
1105 d->arch.out_of_sync_extras = NULL;
1106 d->arch.out_of_sync_free = NULL;
1107 d->arch.out_of_sync = NULL;
1109 ASSERT(d->arch.out_of_sync_extras_count == 0);
1110 FSH_LOG("freed extra out_of_sync entries, now %d",
1111 d->arch.out_of_sync_extras_count);
1114 void __shadow_mode_disable(struct domain *d)
1116 if ( unlikely(!shadow_mode_enabled(d)) )
1117 return;
1119 /*
1120 * Currently this does not fix up page ref counts, so it is valid to call
1121 * only when a domain is being destroyed.
1122 */
1123 BUG_ON(!test_bit(DF_DYING, &d->d_flags));
1124 d->arch.shadow_tainted_refcnts = 1;
1126 free_shadow_pages(d);
1127 free_writable_pte_predictions(d);
1129 #ifndef NDEBUG
1130 int i;
1131 for ( i = 0; i < shadow_ht_buckets; i++ )
1133 if ( d->arch.shadow_ht[i].gpfn_and_flags != 0 )
1135 printk("%s: d->arch.shadow_ht[%x].gpfn_and_flags=%p\n",
1136 __func__, i, d->arch.shadow_ht[i].gpfn_and_flags);
1137 BUG();
1140 #endif
1142 d->arch.shadow_mode = 0;
1144 free_shadow_ht_entries(d);
1145 free_out_of_sync_entries(d);
1148 static int shadow_mode_table_op(
1149 struct domain *d, dom0_shadow_control_t *sc)
1151 unsigned int op = sc->op;
1152 int i, rc = 0;
1153 struct exec_domain *ed;
1155 ASSERT(spin_is_locked(&d->arch.shadow_lock));
1157 SH_VLOG("shadow mode table op %p %p count %d",
1158 pagetable_val(d->exec_domain[0]->arch.guest_table), /* XXX SMP */
1159 pagetable_val(d->exec_domain[0]->arch.shadow_table), /* XXX SMP */
1160 d->arch.shadow_page_count);
1162 shadow_audit(d, 1);
1164 switch ( op )
1166 case DOM0_SHADOW_CONTROL_OP_FLUSH:
1167 free_shadow_pages(d);
1169 d->arch.shadow_fault_count = 0;
1170 d->arch.shadow_dirty_count = 0;
1171 d->arch.shadow_dirty_net_count = 0;
1172 d->arch.shadow_dirty_block_count = 0;
1174 break;
1176 case DOM0_SHADOW_CONTROL_OP_CLEAN:
1177 free_shadow_pages(d);
1179 sc->stats.fault_count = d->arch.shadow_fault_count;
1180 sc->stats.dirty_count = d->arch.shadow_dirty_count;
1181 sc->stats.dirty_net_count = d->arch.shadow_dirty_net_count;
1182 sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count;
1184 d->arch.shadow_fault_count = 0;
1185 d->arch.shadow_dirty_count = 0;
1186 d->arch.shadow_dirty_net_count = 0;
1187 d->arch.shadow_dirty_block_count = 0;
1189 if ( (d->max_pages > sc->pages) ||
1190 (sc->dirty_bitmap == NULL) ||
1191 (d->arch.shadow_dirty_bitmap == NULL) )
1193 rc = -EINVAL;
1194 break;
1197 sc->pages = d->max_pages;
1199 #define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
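/*
 * Worked numbers for the #define above: 'chunk' counts bits of the dirty
 * bitmap, so each pass of the loop below copies and then clears at most
 * chunk/8 == 1024 bytes, i.e. the 1kB per iteration that the comment
 * refers to.
 */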
1200 for ( i = 0; i < d->max_pages; i += chunk )
1202 int bytes = ((((d->max_pages - i) > chunk) ?
1203 chunk : (d->max_pages - i)) + 7) / 8;
1205 if (copy_to_user(
1206 sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
1207 d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
1208 bytes))
1210 // copy_to_user can fail when copying to guest app memory.
1211 // app should zero buffer after mallocing, and pin it
1212 rc = -EINVAL;
1213 memset(
1214 d->arch.shadow_dirty_bitmap +
1215 (i/(8*sizeof(unsigned long))),
1216 0, (d->max_pages/8) - (i/(8*sizeof(unsigned long))));
1217 break;
1220 memset(
1221 d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
1222 0, bytes);
1225 break;
1227 case DOM0_SHADOW_CONTROL_OP_PEEK:
1228 sc->stats.fault_count = d->arch.shadow_fault_count;
1229 sc->stats.dirty_count = d->arch.shadow_dirty_count;
1230 sc->stats.dirty_net_count = d->arch.shadow_dirty_net_count;
1231 sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count;
1233 if ( (d->max_pages > sc->pages) ||
1234 (sc->dirty_bitmap == NULL) ||
1235 (d->arch.shadow_dirty_bitmap == NULL) )
1237 rc = -EINVAL;
1238 break;
1241 sc->pages = d->max_pages;
1242 if (copy_to_user(
1243 sc->dirty_bitmap, d->arch.shadow_dirty_bitmap, (d->max_pages+7)/8))
1245 rc = -EINVAL;
1246 break;
1249 break;
1251 default:
1252 rc = -EINVAL;
1253 break;
1256 SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count);
1257 shadow_audit(d, 1);
1259 for_each_exec_domain(d,ed)
1260 __update_pagetables(ed);
1262 return rc;
1265 int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
1267 unsigned int op = sc->op;
1268 int rc = 0;
1269 struct exec_domain *ed;
1271 if ( unlikely(d == current->domain) )
1273 DPRINTK("Don't try to do a shadow op on yourself!\n");
1274 return -EINVAL;
1277 domain_pause(d);
1279 shadow_lock(d);
1281 switch ( op )
1283 case DOM0_SHADOW_CONTROL_OP_OFF:
1284 __shadow_mode_disable(d);
1285 break;
1287 case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
1288 free_shadow_pages(d);
1289 rc = __shadow_mode_enable(d, SHM_enable);
1290 break;
1292 case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
1293 free_shadow_pages(d);
1294 rc = __shadow_mode_enable(
1295 d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty);
1296 break;
1298 case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
1299 free_shadow_pages(d);
1300 rc = __shadow_mode_enable(
1301 d, d->arch.shadow_mode|SHM_enable|SHM_translate);
1302 break;
1304 default:
1305 rc = shadow_mode_enabled(d) ? shadow_mode_table_op(d, sc) : -EINVAL;
1306 break;
1309 shadow_unlock(d);
1311 for_each_exec_domain(d,ed)
1312 update_pagetables(ed);
1314 domain_unpause(d);
1316 return rc;
1319 /*
1320 * XXX KAF: Why is this VMX specific?
1321 */
1322 void vmx_shadow_clear_state(struct domain *d)
1324 SH_VVLOG("%s:", __func__);
1325 shadow_lock(d);
1326 free_shadow_pages(d);
1327 shadow_unlock(d);
1328 update_pagetables(d->exec_domain[0]);
1331 unsigned long
1332 gpfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
1334 ASSERT( shadow_mode_translate(d) );
1336 perfc_incrc(gpfn_to_mfn_foreign);
1338 unsigned long va = gpfn << PAGE_SHIFT;
1339 unsigned long phystab = pagetable_val(d->arch.phys_table);
1340 l2_pgentry_t *l2 = map_domain_mem(phystab);
1341 l2_pgentry_t l2e = l2[l2_table_offset(va)];
1342 unmap_domain_mem(l2);
1343 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
1345 printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%p) => 0 l2e=%p\n",
1346 d->id, gpfn, l2e_get_value(l2e));
1347 return INVALID_MFN;
1349 unsigned long l1tab = l2e_get_phys(l2e);
1350 l1_pgentry_t *l1 = map_domain_mem(l1tab);
1351 l1_pgentry_t l1e = l1[l1_table_offset(va)];
1352 unmap_domain_mem(l1);
1354 #if 0
1355 printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%p) => %p phystab=%p l2e=%p l1tab=%p, l1e=%p\n",
1356 d->id, gpfn, l1_pgentry_val(l1e) >> PAGE_SHIFT, phystab, l2e, l1tab, l1e);
1357 #endif
1359 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
1361 printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%p) => 0 l1e=%p\n",
1362 d->id, gpfn, l1e_get_value(l1e));
1363 return INVALID_MFN;
1366 return l1e_get_pfn(l1e);
1369 static unsigned long
1370 shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
1371 unsigned long smfn)
1373 unsigned long hl2mfn;
1374 l1_pgentry_t *hl2;
1375 int limit;
1377 ASSERT(PGT_base_page_table == PGT_l2_page_table);
1379 if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
1381 printk("Couldn't alloc an HL2 shadow for pfn=%p mfn=%p\n", gpfn, gmfn);
1382 BUG(); /* XXX Deal gracefully with failure. */
1385 SH_VVLOG("shadow_hl2_table(gpfn=%p, gmfn=%p, smfn=%p) => %p",
1386 gpfn, gmfn, smfn, hl2mfn);
1387 perfc_incrc(shadow_hl2_table_count);
1389 hl2 = map_domain_mem(hl2mfn << PAGE_SHIFT);
1391 #ifdef __i386__
1392 if ( shadow_mode_external(d) )
1393 limit = L2_PAGETABLE_ENTRIES;
1394 else
1395 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
1396 #else
1397 limit = 0; /* XXX x86/64 XXX */
1398 #endif
1400 memset(hl2, 0, limit * sizeof(l1_pgentry_t));
1402 if ( !shadow_mode_external(d) )
1404 memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
1405 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
1407 // Setup easy access to the GL2, SL2, and HL2 frames.
1408 //
1409 hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
1410 l1e_create_pfn(gmfn, __PAGE_HYPERVISOR);
1411 hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1412 l1e_create_pfn(smfn, __PAGE_HYPERVISOR);
1413 hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
1414 l1e_create_pfn(hl2mfn, __PAGE_HYPERVISOR);
1417 unmap_domain_mem(hl2);
1419 return hl2mfn;
1422 /*
1423 * This could take and use a snapshot, and validate the entire page at
1424 * once, or it could continue to fault in entries one at a time...
1425 * Might be worth investigating...
1426 */
1427 static unsigned long shadow_l2_table(
1428 struct domain *d, unsigned long gpfn, unsigned long gmfn)
1430 unsigned long smfn;
1431 l2_pgentry_t *spl2e;
1433 SH_VVLOG("shadow_l2_table(gpfn=%p, gmfn=%p)", gpfn, gmfn);
1435 perfc_incrc(shadow_l2_table_count);
1437 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
1439 printk("Couldn't alloc an L2 shadow for pfn=%p mfn=%p\n", gpfn, gmfn);
1440 BUG(); /* XXX Deal gracefully with failure. */
1443 spl2e = (l2_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
1445 /* Install hypervisor and 2x linear p.t. mappings. */
1446 if ( (PGT_base_page_table == PGT_l2_page_table) &&
1447 !shadow_mode_external(d) )
1449 /*
1450 * We could proactively fill in PDEs for pages that are already
1451 * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
1452 * (restriction required for coherence of the accessed bit). However,
1453 * we tried it and it didn't help performance. This is simpler.
1454 */
1455 memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
1457 /* Install hypervisor and 2x linear p.t. mappings. */
1458 memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
1459 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
1460 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
1462 spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1463 l2e_create_pfn(smfn, __PAGE_HYPERVISOR);
1465 spl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
1466 l2e_create_phys(__pa(page_get_owner(&frame_table[gmfn])->arch.mm_perdomain_pt),
1467 __PAGE_HYPERVISOR);
1469 if ( shadow_mode_translate(d) ) // NB: not external
1471 unsigned long hl2mfn;
1473 spl2e[l2_table_offset(RO_MPT_VIRT_START)] =
1474 l2e_create_phys(pagetable_val(d->arch.phys_table),
1475 __PAGE_HYPERVISOR);
1477 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
1478 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
1480 // shadow_mode_translate (but not external) sl2 tables hold a
1481 // ref to their hl2.
1482 //
1483 if ( !get_shadow_ref(hl2mfn) )
1484 BUG();
1486 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1487 l2e_create_pfn(hl2mfn, __PAGE_HYPERVISOR);
1489 else
1490 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1491 l2e_create_pfn(gmfn, __PAGE_HYPERVISOR);
1493 else
1495 memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));
1498 unmap_domain_mem(spl2e);
1500 SH_VLOG("shadow_l2_table(%p -> %p)", gmfn, smfn);
1501 return smfn;
1504 void shadow_map_l1_into_current_l2(unsigned long va)
1506 struct exec_domain *ed = current;
1507 struct domain *d = ed->domain;
1508 l1_pgentry_t *gpl1e, *spl1e;
1509 l2_pgentry_t gl2e, sl2e;
1510 unsigned long gl1pfn, gl1mfn, sl1mfn;
1511 int i, init_table = 0;
1513 __guest_get_l2e(ed, va, &gl2e);
1514 ASSERT(l2e_get_flags(gl2e) & _PAGE_PRESENT);
1515 gl1pfn = l2e_get_pfn(gl2e);
1517 if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
1519 /* This L1 is NOT already shadowed so we need to shadow it. */
1520 SH_VVLOG("4a: l1 not shadowed");
1522 gl1mfn = __gpfn_to_mfn(d, gl1pfn);
1523 if ( unlikely(!VALID_MFN(gl1mfn)) )
1525 // Attempt to use an invalid pfn as an L1 page.
1526 // XXX this needs to be more graceful!
1527 BUG();
1530 if ( unlikely(!(sl1mfn =
1531 alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
1533 printk("Couldn't alloc an L1 shadow for pfn=%p mfn=%p\n",
1534 gl1pfn, gl1mfn);
1535 BUG(); /* XXX Need to deal gracefully with failure. */
1538 perfc_incrc(shadow_l1_table_count);
1539 init_table = 1;
1541 else
1543 /* This L1 is shadowed already, but the L2 entry is missing. */
1544 SH_VVLOG("4b: was shadowed, l2 missing (%p)", sl1mfn);
1547 #ifndef NDEBUG
1548 l2_pgentry_t old_sl2e;
1549 __shadow_get_l2e(ed, va, &old_sl2e);
1550 ASSERT( !(l2e_get_flags(old_sl2e) & _PAGE_PRESENT) );
1551 #endif
1553 if ( !get_shadow_ref(sl1mfn) )
1554 BUG();
1555 l2pde_general(d, &gl2e, &sl2e, sl1mfn);
1556 __guest_set_l2e(ed, va, gl2e);
1557 __shadow_set_l2e(ed, va, sl2e);
1559 if ( init_table )
1561 gpl1e = &(linear_pg_table[l1_linear_offset(va) &
1562 ~(L1_PAGETABLE_ENTRIES-1)]);
1564 spl1e = &(shadow_linear_pg_table[l1_linear_offset(va) &
1565 ~(L1_PAGETABLE_ENTRIES-1)]);
1567 l1_pgentry_t sl1e;
1568 int index = l1_table_offset(va);
1569 int min = 1, max = 0;
1571 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1573 l1pte_propagate_from_guest(d, gpl1e[i], &sl1e);
1574 if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
1575 !shadow_get_page_from_l1e(sl1e, d) )
1576 sl1e = l1e_empty();
1577 if ( l1e_get_value(sl1e) == 0 ) /* FIXME: check flags? */
1579 // First copy entries from 0 until first invalid.
1580 // Then copy entries from index until first invalid.
1581 //
1582 if ( i < index ) {
1583 i = index - 1;
1584 continue;
1586 break;
1588 spl1e[i] = sl1e;
1589 if ( unlikely(i < min) )
1590 min = i;
1591 if ( likely(i > max) )
1592 max = i;
1595 frame_table[sl1mfn].tlbflush_timestamp =
1596 SHADOW_ENCODE_MIN_MAX(min, max);
1600 void shadow_invlpg(struct exec_domain *ed, unsigned long va)
1602 struct domain *d = ed->domain;
1603 l1_pgentry_t gpte, spte;
1605 ASSERT(shadow_mode_enabled(d));
1607 shadow_lock(d);
1609 __shadow_sync_va(ed, va);
1611 // XXX mafetter: will need to think about 4MB pages...
1613 // It's not strictly necessary to update the shadow here,
1614 // but it might save a fault later.
1615 //
1616 if (__copy_from_user(&gpte, &linear_pg_table[va >> PAGE_SHIFT],
1617 sizeof(gpte))) {
1618 perfc_incrc(shadow_invlpg_faults);
1619 return;
1621 l1pte_propagate_from_guest(d, gpte, &spte);
1622 shadow_set_l1e(va, spte, 1);
1624 shadow_unlock(d);
1627 struct out_of_sync_entry *
1628 shadow_alloc_oos_entry(struct domain *d)
1630 struct out_of_sync_entry *f, *extra;
1631 unsigned size, i;
1633 if ( unlikely(d->arch.out_of_sync_free == NULL) )
1635 FSH_LOG("Allocate more fullshadow tuple blocks.");
1637 size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
1638 extra = xmalloc_bytes(size);
1640 /* XXX Should be more graceful here. */
1641 if ( extra == NULL )
1642 BUG();
1644 memset(extra, 0, size);
1646 /* Record the allocation block so it can be correctly freed later. */
1647 d->arch.out_of_sync_extras_count++;
1648 *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) =
1649 d->arch.out_of_sync_extras;
1650 d->arch.out_of_sync_extras = &extra[0];
1652 /* Thread a free chain through the newly-allocated nodes. */
1653 for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
1654 extra[i].next = &extra[i+1];
1655 extra[i].next = NULL;
1657 /* Add the new nodes to the free list. */
1658 d->arch.out_of_sync_free = &extra[0];
1661 /* Allocate a new node from the quicklist. */
1662 f = d->arch.out_of_sync_free;
1663 d->arch.out_of_sync_free = f->next;
1665 return f;
1668 static inline unsigned long
1669 shadow_make_snapshot(
1670 struct domain *d, unsigned long gpfn, unsigned long gmfn)
1672 unsigned long smfn, sl1mfn = 0;
1673 void *original, *snapshot;
1674 u32 min_max = 0;
1675 int min, max, length;
1677 if ( test_and_set_bit(_PGC_out_of_sync, &frame_table[gmfn].count_info) )
1679 ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
1680 return SHADOW_SNAPSHOT_ELSEWHERE;
1683 perfc_incrc(shadow_make_snapshot);
1685 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
1687 printk("Couldn't alloc fullshadow snapshot for pfn=%p mfn=%p!\n"
1688 "Dom%d snapshot_count_count=%d\n",
1689 gpfn, gmfn, d->id, d->arch.snapshot_page_count);
1690 BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
1693 if ( !get_shadow_ref(smfn) )
1694 BUG();
1696 if ( shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow )
1697 min_max = pfn_to_page(sl1mfn)->tlbflush_timestamp;
1698 pfn_to_page(smfn)->tlbflush_timestamp = min_max;
1700 min = SHADOW_MIN(min_max);
1701 max = SHADOW_MAX(min_max);
1702 length = max - min + 1;
1703 perfc_incr_histo(snapshot_copies, length, PT_UPDATES);
1705 min *= sizeof(l1_pgentry_t);
1706 length *= sizeof(l1_pgentry_t);
1708 original = map_domain_mem(gmfn << PAGE_SHIFT);
1709 snapshot = map_domain_mem(smfn << PAGE_SHIFT);
1710 memcpy(snapshot + min, original + min, length);
1711 unmap_domain_mem(original);
1712 unmap_domain_mem(snapshot);
1714 return smfn;
1717 static void
1718 shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry)
1720 void *snapshot;
1722 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
1723 return;
1725 // Clear the out_of_sync bit.
1726 //
1727 clear_bit(_PGC_out_of_sync, &frame_table[entry->gmfn].count_info);
1729 // XXX Need to think about how to protect the domain's
1730 // information less expensively.
1731 //
1732 snapshot = map_domain_mem(entry->snapshot_mfn << PAGE_SHIFT);
1733 memset(snapshot, 0, PAGE_SIZE);
1734 unmap_domain_mem(snapshot);
1736 put_shadow_ref(entry->snapshot_mfn);
1739 struct out_of_sync_entry *
1740 shadow_mark_mfn_out_of_sync(struct exec_domain *ed, unsigned long gpfn,
1741 unsigned long mfn)
1743 struct domain *d = ed->domain;
1744 struct pfn_info *page = &frame_table[mfn];
1745 struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
1747 ASSERT(spin_is_locked(&d->arch.shadow_lock));
1748 ASSERT(pfn_valid(mfn));
1749 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page);
1751 FSH_LOG("%s(gpfn=%p, mfn=%p) c=%p t=%p", __func__,
1752 gpfn, mfn, page->count_info, page->u.inuse.type_info);
1754 // XXX this will require some more thought... Cross-domain sharing and
1755 // modification of page tables? Hmm...
1756 //
1757 if ( d != page_get_owner(page) )
1758 BUG();
1760 perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
1762 entry->gpfn = gpfn;
1763 entry->gmfn = mfn;
1764 entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
1765 entry->writable_pl1e = -1;
1767 // increment guest's ref count to represent the entry in the
1768 // full shadow out-of-sync list.
1769 //
1770 get_page(page, d);
1772 // Add to the out-of-sync list
1773 //
1774 entry->next = d->arch.out_of_sync;
1775 d->arch.out_of_sync = entry;
1777 return entry;
1780 void shadow_mark_va_out_of_sync(
1781 struct exec_domain *ed, unsigned long gpfn, unsigned long mfn, unsigned long va)
1783 struct out_of_sync_entry *entry =
1784 shadow_mark_mfn_out_of_sync(ed, gpfn, mfn);
1785 l2_pgentry_t sl2e;
1787 // We need the address of shadow PTE that maps @va.
1788 // It might not exist yet. Make sure it's there.
1789 //
1790 __shadow_get_l2e(ed, va, &sl2e);
1791 if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
1793 // either this L1 isn't shadowed yet, or the shadow isn't linked into
1794 // the current L2.
1795 shadow_map_l1_into_current_l2(va);
1796 __shadow_get_l2e(ed, va, &sl2e);
1798 ASSERT(l2e_get_flags(sl2e) & _PAGE_PRESENT);
1800 // NB: this is stored as a machine address.
1801 entry->writable_pl1e =
1802 l2e_get_phys(sl2e) | (sizeof(l1_pgentry_t) * l1_table_offset(va));
1803 ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
1805 // Increment shadow's page count to represent the reference
1806 // inherent in entry->writable_pl1e
1807 //
1808 if ( !get_shadow_ref(l2e_get_pfn(sl2e)) )
1809 BUG();
1811 FSH_LOG("mark_out_of_sync(va=%p -> writable_pl1e=%p)",
1812 va, entry->writable_pl1e);
1815 /*
1816 * Returns 1 if the snapshot for @gmfn exists and its @index'th entry matches.
1817 * Returns 0 otherwise.
1818 */
1819 static int snapshot_entry_matches(
1820 struct exec_domain *ed, unsigned long gmfn, unsigned index)
1822 unsigned long gpfn = __mfn_to_gpfn(ed->domain, gmfn);
1823 unsigned long smfn = __shadow_status(ed->domain, gpfn, PGT_snapshot);
1824 unsigned long *guest, *snapshot;
1825 int compare;
1827 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
1829 perfc_incrc(snapshot_entry_matches_calls);
1831 if ( !smfn )
1832 return 0;
1834 guest = map_domain_mem(gmfn << PAGE_SHIFT);
1835 snapshot = map_domain_mem(smfn << PAGE_SHIFT);
1837 // This could probably be smarter, but this is sufficient for
1838 // our current needs.
1839 //
1840 compare = (guest[index] == snapshot[index]);
1842 unmap_domain_mem(guest);
1843 unmap_domain_mem(snapshot);
1845 #ifdef PERF_COUNTERS
1846 if ( compare )
1847 perfc_incrc(snapshot_entry_matches_true);
1848 #endif
1850 return compare;
1853 /*
1854 * Returns 1 if va's shadow mapping is out-of-sync.
1855 * Returns 0 otherwise.
1856 */
1857 int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va)
1859 struct domain *d = ed->domain;
1860 unsigned long l2mfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
1861 l2_pgentry_t l2e;
1862 unsigned long l1mfn;
1864 ASSERT(spin_is_locked(&d->arch.shadow_lock));
1866 perfc_incrc(shadow_out_of_sync_calls);
1868 if ( page_out_of_sync(&frame_table[l2mfn]) &&
1869 !snapshot_entry_matches(ed, l2mfn, l2_table_offset(va)) )
1870 return 1;
1872 __guest_get_l2e(ed, va, &l2e);
1873 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
1874 return 0;
1876 l1mfn = __gpfn_to_mfn(d, l2e_get_pfn(l2e));
1878 // If the l1 pfn is invalid, it can't be out of sync...
1879 if ( !VALID_MFN(l1mfn) )
1880 return 0;
1882 if ( page_out_of_sync(&frame_table[l1mfn]) &&
1883 !snapshot_entry_matches(ed, l1mfn, l1_table_offset(va)) )
1884 return 1;
1886 return 0;
1889 #define GPFN_TO_GPTEPAGE(_gpfn) ((_gpfn) / (PAGE_SIZE / sizeof(l1_pgentry_t)))
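/*
 * Worked example of GPFN_TO_GPTEPAGE, assuming 4kB pages and 4-byte
 * (non-PAE i386) l1_pgentry_t entries: PAGE_SIZE / sizeof(l1_pgentry_t)
 * is then 1024, so gpfn 0x12345 is covered by the guest pte page
 * 0x12345 / 1024 == 0x48.
 */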
1890 static inline unsigned long
1891 predict_writable_pte_page(struct domain *d, unsigned long gpfn)
1893 return __shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), PGT_writable_pred);
1896 static inline void
1897 increase_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
1899 unsigned long score = prediction & PGT_score_mask;
1900 int create = (score == 0);
1902 // saturating addition
1903 score = (score + (1u << PGT_score_shift)) & PGT_score_mask;
1904 score = score ? score : PGT_score_mask;
1906 prediction = (prediction & PGT_mfn_mask) | score;
1908 //printk("increase gpfn=%p pred=%p create=%d\n", gpfn, prediction, create);
1909 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
1911 if ( create )
1912 perfc_incr(writable_pte_predictions);
1915 static inline void
1916 decrease_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
1918 unsigned long score = prediction & PGT_score_mask;
1919 ASSERT(score);
1921 // divide score by 2... We don't like bad predictions.
1922 //
1923 score = (score >> 1) & PGT_score_mask;
1925 prediction = (prediction & PGT_mfn_mask) | score;
1927 //printk("decrease gpfn=%p pred=%p score=%p\n", gpfn, prediction, score);
1929 if ( score )
1930 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
1931 else
1933 delete_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, PGT_writable_pred);
1934 perfc_decr(writable_pte_predictions);
1938 static void
1939 free_writable_pte_predictions(struct domain *d)
1941 int i;
1942 struct shadow_status *x;
1944 for ( i = 0; i < shadow_ht_buckets; i++ )
1946 u32 count;
1947 unsigned long *gpfn_list;
1949 /* Skip empty buckets. */
1950 if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
1951 continue;
1953 count = 0;
1954 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
1955 if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
1956 count++;
1958 gpfn_list = xmalloc_array(unsigned long, count);
1959 count = 0;
1960 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
1961 if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
1962 gpfn_list[count++] = x->gpfn_and_flags & PGT_mfn_mask;
1964 while ( count )
1966 count--;
1967 delete_shadow_status(d, gpfn_list[count], 0, PGT_writable_pred);
1970 xfree(gpfn_list);
1974 static u32 remove_all_write_access_in_ptpage(
1975 struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn,
1976 unsigned long readonly_gpfn, unsigned long readonly_gmfn,
1977 u32 max_refs_to_find, unsigned long prediction)
1979 l1_pgentry_t *pt = map_domain_mem(pt_mfn << PAGE_SHIFT);
1980 l1_pgentry_t match;
1981 unsigned long flags = _PAGE_RW | _PAGE_PRESENT;
1982 int i;
1983 u32 found = 0;
1984 int is_l1_shadow =
1985 ((frame_table[pt_mfn].u.inuse.type_info & PGT_type_mask) ==
1986 PGT_l1_shadow);
1988 match = l1e_create_pfn(readonly_gmfn, flags);
1990 // returns true if all refs have been found and fixed.
1991 //
1992 int fix_entry(int i)
1994 l1_pgentry_t old = pt[i];
1995 l1_pgentry_t new = old;
1997 l1e_remove_flags(&new,_PAGE_RW);
1998 if ( is_l1_shadow && !shadow_get_page_from_l1e(new, d) )
1999 BUG();
2000 found++;
2001 pt[i] = new;
2002 if ( is_l1_shadow )
2003 put_page_from_l1e(old, d);
2005 #if 0
2006 printk("removed write access to pfn=%p mfn=%p in smfn=%p entry %x "
2007 "is_l1_shadow=%d\n",
2008 readonly_gpfn, readonly_gmfn, pt_mfn, i, is_l1_shadow);
2009 #endif
2011 return (found == max_refs_to_find);
2014 i = readonly_gpfn & (L1_PAGETABLE_ENTRIES - 1);
2015 if ( !l1e_has_changed(&pt[i], &match, flags) && fix_entry(i) )
2017 perfc_incrc(remove_write_fast_exit);
2018 increase_writable_pte_prediction(d, readonly_gpfn, prediction);
2019 unmap_domain_mem(pt);
2020 return found;
2023 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
2025 if ( unlikely(!l1e_has_changed(&pt[i], &match, flags)) && fix_entry(i) )
2026 break;
2029 unmap_domain_mem(pt);
2031 return found;
2032 #undef MATCH_ENTRY
2035 int shadow_remove_all_write_access(
2036 struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
2038 int i;
2039 struct shadow_status *a;
2040 u32 found = 0, fixups, write_refs;
2041 unsigned long prediction, predicted_gpfn, predicted_smfn;
2043 ASSERT(spin_is_locked(&d->arch.shadow_lock));
2044 ASSERT(VALID_MFN(readonly_gmfn));
2046 perfc_incrc(remove_write_access);
2048 // If it's not a writable page, then no writable refs can be outstanding.
2049 //
2050 if ( (frame_table[readonly_gmfn].u.inuse.type_info & PGT_type_mask) !=
2051 PGT_writable_page )
2053 perfc_incrc(remove_write_not_writable);
2054 return 1;
2057 // How many outstanding writable PTEs for this page are there?
2058 //
2059 write_refs =
2060 (frame_table[readonly_gmfn].u.inuse.type_info & PGT_count_mask);
2061 if ( write_refs &&
2062 (frame_table[readonly_gmfn].u.inuse.type_info & PGT_pinned) )
2064 write_refs--;
2067 if ( write_refs == 0 )
2069 perfc_incrc(remove_write_no_work);
2070 return 1;
2073 // Before searching all the L1 page tables, check the typical culprit first
2074 //
2075 if ( (prediction = predict_writable_pte_page(d, readonly_gpfn)) )
2077 predicted_gpfn = prediction & PGT_mfn_mask;
2078 if ( (predicted_smfn = __shadow_status(d, predicted_gpfn, PGT_l1_shadow)) &&
2079 (fixups = remove_all_write_access_in_ptpage(d, predicted_gpfn, predicted_smfn, readonly_gpfn, readonly_gmfn, write_refs, prediction)) )
2081 found += fixups;
2082 if ( found == write_refs )
2084 perfc_incrc(remove_write_predicted);
2085 return 1;
2088 else
2090 perfc_incrc(remove_write_bad_prediction);
2091 decrease_writable_pte_prediction(d, readonly_gpfn, prediction);
2095 // Search all the shadow L1 page tables...
2096 //
2097 for (i = 0; i < shadow_ht_buckets; i++)
2099 a = &d->arch.shadow_ht[i];
2100 while ( a && a->gpfn_and_flags )
2102 if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow )
2104 found += remove_all_write_access_in_ptpage(d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn, readonly_gpfn, readonly_gmfn, write_refs - found, a->gpfn_and_flags & PGT_mfn_mask);
2105 if ( found == write_refs )
2106 return 1;
2109 a = a->next;
2113 FSH_LOG("%s: looking for %d refs, found %d refs",
2114 __func__, write_refs, found);
2116 return 0;
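/*
 * [Editor's sketch -- not part of the original file.]  The write_refs
 * computation from shadow_remove_all_write_access() in isolation: when the
 * page is pinned, one type reference is held by the pin rather than by a
 * writable PTE, so it is discounted before deciding how many mappings still
 * need revoking.  The struct and masks below are stand-ins for Xen's
 * pfn_info/PGT_* definitions.
 */
#include <stdint.h>

#define SKETCH_PGT_count_mask 0x0000ffffu    /* assumed field width */
#define SKETCH_PGT_pinned     0x08000000u    /* assumed bit position */

struct sketch_pfn_info { uint32_t type_info; };

static uint32_t sketch_outstanding_write_refs(const struct sketch_pfn_info *page)
{
    uint32_t refs = page->type_info & SKETCH_PGT_count_mask;

    if ( refs && (page->type_info & SKETCH_PGT_pinned) )
        refs--;                     /* the pin accounts for one reference */

    return refs;
}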
2119 static u32 remove_all_access_in_page(
2120 struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn)
2122 l1_pgentry_t *pl1e = map_domain_mem(l1mfn << PAGE_SHIFT);
2123 l1_pgentry_t match;
2124 unsigned long flags = _PAGE_PRESENT;
2125 int i;
2126 u32 count = 0;
2127 int is_l1_shadow =
2128 ((frame_table[l1mfn].u.inuse.type_info & PGT_type_mask) ==
2129 PGT_l1_shadow);
2131 match = l1e_create_pfn(forbidden_gmfn, flags);
2133 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
2135 if ( unlikely(!l1e_has_changed(&pl1e[i], &match, flags)) )
2137 l1_pgentry_t ol1e = pl1e[i];
2138 pl1e[i] = l1e_empty();
2139 count++;
2141 if ( is_l1_shadow )
2142 put_page_from_l1e(ol1e, d);
2143 else /* must be an hl2 page */
2144 put_page(&frame_table[forbidden_gmfn]);
2148 unmap_domain_mem(pl1e);
2150 return count;
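/*
 * [Editor's sketch -- not part of the original file.]  The "match template"
 * idiom used by remove_all_access_in_page() above (and by
 * remove_all_write_access_in_ptpage()): build a reference entry for the
 * target frame, then treat "l1e_has_changed() is false" as "this PTE is
 * present and maps the target frame".  Modelled here with plain integers
 * instead of l1_pgentry_t; the bit layout is the usual 32-bit x86 PTE.
 */
#include <stdbool.h>
#include <stdint.h>

#define SKETCH_PAGE_SHIFT   12
#define SKETCH_PAGE_PRESENT 0x001u

static bool sketch_entry_maps_frame(uint32_t pte, uint32_t mfn, uint32_t flags)
{
    uint32_t match = (mfn << SKETCH_PAGE_SHIFT) | flags;
    uint32_t care  = ~((1u << SKETCH_PAGE_SHIFT) - 1) | flags;  /* frame + flags */

    /* "Unchanged with respect to the template" == references the frame. */
    return (pte & care) == (match & care);
}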
2153 u32 shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn)
2155 int i;
2156 struct shadow_status *a;
2157 u32 count = 0;
2159 if ( unlikely(!shadow_mode_enabled(d)) )
2160 return 0;
2162 ASSERT(spin_is_locked(&d->arch.shadow_lock));
2163 perfc_incrc(remove_all_access);
2165 for (i = 0; i < shadow_ht_buckets; i++)
2167 a = &d->arch.shadow_ht[i];
2168 while ( a && a->gpfn_and_flags )
2170 switch (a->gpfn_and_flags & PGT_type_mask)
2172 case PGT_l1_shadow:
2173 case PGT_l2_shadow:
2174 case PGT_l3_shadow:
2175 case PGT_l4_shadow:
2176 case PGT_hl2_shadow:
2177 count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn);
2178 break;
2179 case PGT_snapshot:
2180 case PGT_writable_pred:
2181 // these can't hold refs to the forbidden page
2182 break;
2183 default:
2184 BUG();
2187 a = a->next;
2191 return count;
2194 static int resync_all(struct domain *d, u32 stype)
2196 struct out_of_sync_entry *entry;
2197 unsigned i;
2198 unsigned long smfn;
2199 void *guest, *shadow, *snapshot;
2200 int need_flush = 0, external = shadow_mode_external(d);
2201 int unshadow;
2202 int changed;
2204 ASSERT(spin_is_locked(&d->arch.shadow_lock));
2206 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
2208 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
2209 continue;
2211 if ( !(smfn = __shadow_status(d, entry->gpfn, stype)) )
2212 continue;
2214 FSH_LOG("resyncing t=%p gpfn=%p gmfn=%p smfn=%p snapshot_mfn=%p",
2215 stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
2217 // Compare guest's new contents to its snapshot, validating
2218 // and updating its shadow as appropriate.
2219 //
2220 guest = map_domain_mem(entry->gmfn << PAGE_SHIFT);
2221 snapshot = map_domain_mem(entry->snapshot_mfn << PAGE_SHIFT);
2222 shadow = map_domain_mem(smfn << PAGE_SHIFT);
2223 unshadow = 0;
2225 switch ( stype ) {
2226 case PGT_l1_shadow:
2228 u32 min_max_shadow = pfn_to_page(smfn)->tlbflush_timestamp;
2229 int min_shadow = SHADOW_MIN(min_max_shadow);
2230 int max_shadow = SHADOW_MAX(min_max_shadow);
2232 u32 min_max_snapshot =
2233 pfn_to_page(entry->snapshot_mfn)->tlbflush_timestamp;
2234 int min_snapshot = SHADOW_MIN(min_max_snapshot);
2235 int max_snapshot = SHADOW_MAX(min_max_snapshot);
2237 l1_pgentry_t *guest1 = guest;
2238 l1_pgentry_t *shadow1 = shadow;
2239 l1_pgentry_t *snapshot1 = snapshot;
2241 changed = 0;
2243 for ( i = min_shadow; i <= max_shadow; i++ )
2245 if ( (i < min_snapshot) || (i > max_snapshot) ||
2246 l1e_has_changed(&guest1[i], &snapshot1[i], PAGE_FLAG_MASK) )
2248 need_flush |= validate_pte_change(d, guest1[i], &shadow1[i]);
2250 // can't update snapshots of linear page tables -- they
2251 // are used multiple times...
2252 //
2253 // snapshot[i] = new_pte;
2255 changed++;
2258 perfc_incrc(resync_l1);
2259 perfc_incr_histo(wpt_updates, changed, PT_UPDATES);
2260 perfc_incr_histo(l1_entries_checked, max_shadow - min_shadow + 1, PT_UPDATES);
2261 break;
2263 case PGT_l2_shadow:
2265 int max = -1;
2267 l2_pgentry_t *guest2 = guest;
2268 l2_pgentry_t *shadow2 = shadow;
2269 l2_pgentry_t *snapshot2 = snapshot;
2271 changed = 0;
2272 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
2274 if ( !is_guest_l2_slot(i) && !external )
2275 continue;
2277 l2_pgentry_t new_pde = guest2[i];
2278 if ( l2e_has_changed(&new_pde, &snapshot2[i], PAGE_FLAG_MASK))
2280 need_flush |= validate_pde_change(d, new_pde, &shadow2[i]);
2282 // can't update snapshots of linear page tables -- they
2283 // are used multiple times...
2284 //
2285 // snapshot[i] = new_pde;
2287 changed++;
2289 if ( l2e_get_value(new_pde) != 0 ) /* FIXME: check flags? */
2290 max = i;
2292 // XXX - This hack works for linux guests.
2293 // Need a better solution long term.
2294 if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) &&
2295 unlikely(l2e_get_value(new_pde) != 0) &&
2296 !unshadow &&
2297 (frame_table[smfn].u.inuse.type_info & PGT_pinned) )
2298 unshadow = 1;
2300 if ( max == -1 )
2301 unshadow = 1;
2302 perfc_incrc(resync_l2);
2303 perfc_incr_histo(shm_l2_updates, changed, PT_UPDATES);
2304 break;
2306 case PGT_hl2_shadow:
2308 l2_pgentry_t *guest2 = guest;
2309 l2_pgentry_t *snapshot2 = snapshot;
2310 l1_pgentry_t *shadow2 = shadow;
2312 changed = 0;
2313 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
2315 if ( !is_guest_l2_slot(i) && !external )
2316 continue;
2318 l2_pgentry_t new_pde = guest2[i];
2319 if ( l2e_has_changed(&new_pde, &snapshot2[i], PAGE_FLAG_MASK) )
2321 need_flush |= validate_hl2e_change(d, new_pde, &shadow2[i]);
2323 // can't update snapshots of linear page tables -- they
2324 // are used multiple times...
2325 //
2326 // snapshot[i] = new_pde;
2328 changed++;
2331 perfc_incrc(resync_hl2);
2332 perfc_incr_histo(shm_hl2_updates, changed, PT_UPDATES);
2333 break;
2335 default:
2336 BUG();
2339 unmap_domain_mem(shadow);
2340 unmap_domain_mem(snapshot);
2341 unmap_domain_mem(guest);
2343 if ( unlikely(unshadow) )
2345 perfc_incrc(unshadow_l2_count);
2346 shadow_unpin(smfn);
2347 if ( unlikely(shadow_mode_external(d)) )
2349 unsigned long hl2mfn;
2351 if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) &&
2352 (frame_table[hl2mfn].u.inuse.type_info & PGT_pinned) )
2353 shadow_unpin(hl2mfn);
2358 return need_flush;
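/*
 * [Editor's sketch -- not part of the original file.]  resync_all()'s L1 case
 * only walks the slots between SHADOW_MIN() and SHADOW_MAX() of the page's
 * tlbflush_timestamp field, which is reused as a packed "slots that may be
 * dirty" range for shadow and snapshot pages.  The 16/16-bit split below is
 * an assumed encoding for illustration; the real macros live in asm/shadow.h
 * and may pack the bounds differently.
 */
#include <stdint.h>

static uint32_t sketch_encode_min_max(uint32_t min, uint32_t max)
{
    return (max << 16) | (min & 0xffffu);
}

static uint32_t sketch_min(uint32_t enc) { return enc & 0xffffu; }
static uint32_t sketch_max(uint32_t enc) { return enc >> 16; }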
2361 void __shadow_sync_all(struct domain *d)
2363 struct out_of_sync_entry *entry;
2364 int need_flush = 0;
2366 perfc_incrc(shadow_sync_all);
2368 ASSERT(spin_is_locked(&d->arch.shadow_lock));
2370 // First, remove all write permissions to the page tables
2371 //
2372 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
2374 // Skip entries that have low bits set... Those aren't
2375 // real PTEs.
2376 //
2377 if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
2378 continue;
2380 l1_pgentry_t *ppte = map_domain_mem(entry->writable_pl1e);
2381 l1_pgentry_t opte = *ppte;
2382 l1_pgentry_t npte = opte;
2383 l1e_remove_flags(&npte, _PAGE_RW);
2385 if ( (l1e_get_flags(npte) & _PAGE_PRESENT) &&
2386 !shadow_get_page_from_l1e(npte, d) )
2387 BUG();
2388 *ppte = npte;
2389 put_page_from_l1e(opte, d);
2391 unmap_domain_mem(ppte);
2394 // XXX mafetter: SMP
2395 //
2396 // With the current algorithm, we've gotta flush all the TLBs
2397 // before we can safely continue. I don't think we want to
2398 // do it this way, so I think we should consider making
2399 // entirely private copies of the shadow for each vcpu, and/or
2400 // possibly having a mix of private and shared shadow state
2401 // (any path from a PTE that grants write access to an out-of-sync
2402 // page table page needs to be vcpu private).
2403 //
2404 #if 0 // this should be enabled for SMP guests...
2405 flush_tlb_mask(((1 << smp_num_cpus) - 1) & ~(1 << smp_processor_id()));
2406 #endif
2407 need_flush = 1;
2409 // Second, resync all L1 pages, then L2 pages, etc...
2410 //
2411 need_flush |= resync_all(d, PGT_l1_shadow);
2412 if ( shadow_mode_translate(d) )
2413 need_flush |= resync_all(d, PGT_hl2_shadow);
2414 need_flush |= resync_all(d, PGT_l2_shadow);
2416 if ( need_flush && !unlikely(shadow_mode_external(d)) )
2417 local_flush_tlb();
2419 free_out_of_sync_state(d);
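/*
 * [Editor's sketch -- not part of the original file.]  The alignment test at
 * the top of __shadow_sync_all()'s first loop: writable_pl1e normally holds
 * the machine address of the PTE that granted write access, and such an
 * address is always entry-size aligned, so values with low bits set can
 * safely double as "no single writable PTE recorded" markers, as the comment
 * above notes.  An entry size of 4 bytes assumes 32-bit, non-PAE page tables.
 */
#include <stdbool.h>

#define SKETCH_PTE_SIZE 4u

static bool sketch_is_real_pte_address(unsigned long writable_pl1e)
{
    return (writable_pl1e & (SKETCH_PTE_SIZE - 1)) == 0;
}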
2422 int shadow_fault(unsigned long va, struct xen_regs *regs)
2424 l1_pgentry_t gpte, spte, orig_gpte;
2425 struct exec_domain *ed = current;
2426 struct domain *d = ed->domain;
2427 l2_pgentry_t gpde;
2429 spte = l1e_empty();
2431 SH_VVLOG("shadow_fault( va=%p, code=%lu )", va, regs->error_code);
2432 perfc_incrc(shadow_fault_calls);
2434 check_pagetable(ed, "pre-sf");
2436 /*
2437 * Don't let someone else take the guest's table pages out-of-sync.
2438 */
2439 shadow_lock(d);
2441 /* XXX - FIX THIS COMMENT!!!
2442 * STEP 1. Check to see if this fault might have been caused by an
2443 * out-of-sync table page entry, or if we should pass this
2444 * fault onto the guest.
2445 */
2446 __shadow_sync_va(ed, va);
2448 /*
2449 * STEP 2. Check the guest PTE.
2450 */
2451 __guest_get_l2e(ed, va, &gpde);
2452 if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
2454 SH_VVLOG("shadow_fault - EXIT: L1 not present");
2455 perfc_incrc(shadow_fault_bail_pde_not_present);
2456 goto fail;
2459 // This can't fault because we hold the shadow lock and we've ensured that
2460 // the mapping is in-sync, so the check of the PDE's present bit, above,
2461 // covers this access.
2462 //
2463 orig_gpte = gpte = linear_pg_table[l1_linear_offset(va)];
2464 if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_PRESENT)) )
2466 SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)", gpte);
2467 perfc_incrc(shadow_fault_bail_pte_not_present);
2468 goto fail;
2471 /* Write fault? */
2472 if ( regs->error_code & 2 )
2474 if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) )
2476 /* Write fault on a read-only mapping. */
2477 SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%lx)", gpte);
2478 perfc_incrc(shadow_fault_bail_ro_mapping);
2479 goto fail;
2482 if ( !l1pte_write_fault(ed, &gpte, &spte, va) )
2484 SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
2485 perfc_incrc(write_fault_bail);
2486 shadow_unlock(d);
2487 return 0;
2490 else
2492 if ( !l1pte_read_fault(d, &gpte, &spte) )
2494 SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
2495 perfc_incrc(read_fault_bail);
2496 shadow_unlock(d);
2497 return 0;
2501 /*
2502 * STEP 3. Write the modified shadow PTE and guest PTE back to the tables.
2503 */
2505 /* XXX Watch out for read-only L2 entries! (not used in Linux). */
2506 if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
2507 &gpte, sizeof(gpte))) )
2509 printk("shadow_fault() failed, crashing domain %d "
2510 "due to a read-only L2 page table (gpde=%p), va=%p\n",
2511 d->id, l2e_get_value(gpde), va);
2512 domain_crash_synchronous();
2515 // if necessary, record the page table page as dirty
2516 if ( unlikely(shadow_mode_log_dirty(d)) &&
2517 l1e_has_changed(&orig_gpte, &gpte, PAGE_FLAG_MASK))
2518 mark_dirty(d, __gpfn_to_mfn(d, l2e_get_pfn(gpde)));
2520 shadow_set_l1e(va, spte, 1);
2522 perfc_incrc(shadow_fault_fixed);
2523 d->arch.shadow_fault_count++;
2525 shadow_unlock(d);
2527 check_pagetable(ed, "post-sf");
2528 return EXCRET_fault_fixed;
2530 fail:
2531 shadow_unlock(d);
2532 return 0;
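/*
 * [Editor's sketch -- not part of the original file.]  The meaning of the
 * "regs->error_code & 2" test in shadow_fault(): x86 hardware sets bit 0 of
 * the page-fault error code when the faulting page was present (protection
 * violation), bit 1 when the access was a write, and bit 2 when it came from
 * user mode, so bit 1 selects the write-fault path above.
 */
#define SKETCH_PFEC_PRESENT 0x1ul
#define SKETCH_PFEC_WRITE   0x2ul
#define SKETCH_PFEC_USER    0x4ul

static int sketch_is_write_fault(unsigned long error_code)
{
    return (error_code & SKETCH_PFEC_WRITE) != 0;
}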
2535 /*
2536 * What lives where in the 32-bit address space in the various shadow modes,
2537 * and what it uses to get/maintain that mapping.
2539 * SHADOW MODE:    none          enable           translate         external
2541 * 4KB things:
2542 * guest_vtable    lin_l2        mapped per gl2   lin_l2 via hl2    mapped per gl2
2543 * shadow_vtable   n/a           sh_lin_l2        sh_lin_l2         mapped per gl2
2544 * hl2_vtable      n/a           n/a              lin_hl2 via hl2   mapped per gl2
2545 * monitor_vtable  n/a           n/a              n/a               mapped once
2547 * 4MB things:
2548 * guest_linear    lin via gl2   lin via gl2      lin via hl2       lin via hl2
2549 * shadow_linear   n/a           sh_lin via sl2   sh_lin via sl2    sh_lin via sl2
2550 * monitor_linear  n/a           n/a              n/a               ???
2551 * perdomain       perdomain     perdomain        perdomain         perdomain
2552 * R/O M2P         R/O M2P       R/O M2P          n/a               n/a
2553 * R/W M2P         R/W M2P       R/W M2P          R/W M2P           R/W M2P
2554 * P2M             n/a           n/a              R/O M2P           R/O M2P
2556 * NB:
2557 * update_pagetables(), __update_pagetables(), shadow_mode_enable(),
2558 * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
2559 * all play a part in maintaining these mappings.
2560 */
2561 void __update_pagetables(struct exec_domain *ed)
2563 struct domain *d = ed->domain;
2564 unsigned long gmfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
2565 unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
2566 unsigned long smfn, hl2mfn, old_smfn;
2568 int max_mode = ( shadow_mode_external(d) ? SHM_external
2569 : shadow_mode_translate(d) ? SHM_translate
2570 : shadow_mode_enabled(d) ? SHM_enable
2571 : 0 );
2573 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
2574 ASSERT( max_mode );
2576 /*
2577 * arch.guest_vtable
2578 */
2579 if ( max_mode & (SHM_enable | SHM_external) )
2581 if ( likely(ed->arch.guest_vtable != NULL) )
2582 unmap_domain_mem(ed->arch.guest_vtable);
2583 ed->arch.guest_vtable = map_domain_mem(gmfn << PAGE_SHIFT);
2586 /*
2587 * arch.shadow_table
2588 */
2589 if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) )
2590 smfn = shadow_l2_table(d, gpfn, gmfn);
2591 if ( !get_shadow_ref(smfn) )
2592 BUG();
2593 old_smfn = pagetable_val(ed->arch.shadow_table) >> PAGE_SHIFT;
2594 ed->arch.shadow_table = mk_pagetable(smfn << PAGE_SHIFT);
2595 if ( old_smfn )
2596 put_shadow_ref(old_smfn);
2598 SH_VVLOG("__update_pagetables(gmfn=%p, smfn=%p)", gmfn, smfn);
2600 /*
2601 * arch.shadow_vtable
2602 */
2603 if ( max_mode == SHM_external )
2605 if ( ed->arch.shadow_vtable )
2606 unmap_domain_mem(ed->arch.shadow_vtable);
2607 ed->arch.shadow_vtable = map_domain_mem(smfn << PAGE_SHIFT);
2610 /*
2611 * arch.hl2_vtable
2612 */
2614 // if max_mode == SHM_translate, then the hl2 is already installed
2615 // correctly in its smfn, and there's nothing to do.
2616 //
2617 if ( max_mode == SHM_external )
2619 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
2620 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
2621 if ( ed->arch.hl2_vtable )
2622 unmap_domain_mem(ed->arch.hl2_vtable);
2623 ed->arch.hl2_vtable = map_domain_mem(hl2mfn << PAGE_SHIFT);
2626 /*
2627 * fixup pointers in monitor table, as necessary
2628 */
2629 if ( max_mode == SHM_external )
2631 l2_pgentry_t *mpl2e = ed->arch.monitor_vtable;
2632 l2_pgentry_t old_hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
2633 l2_pgentry_t old_sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
2635 ASSERT( shadow_mode_translate(d) );
2637 if ( !get_shadow_ref(hl2mfn) )
2638 BUG();
2639 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
2640 l2e_create_pfn(hl2mfn, __PAGE_HYPERVISOR);
2641 if ( l2e_get_flags(old_hl2e) & _PAGE_PRESENT )
2642 put_shadow_ref(l2e_get_pfn(old_hl2e));
2644 if ( !get_shadow_ref(smfn) )
2645 BUG();
2646 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
2647 l2e_create_pfn(smfn, __PAGE_HYPERVISOR);
2648 if ( l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
2649 put_shadow_ref(l2e_get_pfn(old_sl2e));
2651 // XXX - maybe this can be optimized somewhat??
2652 local_flush_tlb();
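/*
 * [Editor's sketch -- not part of the original file.]  The reference-count
 * ordering used when __update_pagetables() installs a new arch.shadow_table
 * (and again for the hl2/smfn slots in the monitor table): take the
 * reference on the new frame *before* dropping the old one, so that when the
 * old and new frames are the same the count never transiently reaches zero.
 * get_ref()/put_ref() stand in for get_shadow_ref()/put_shadow_ref().
 */
static void sketch_swap_shadow_ref(unsigned long *slot, unsigned long new_mfn,
                                   int (*get_ref)(unsigned long),
                                   void (*put_ref)(unsigned long))
{
    unsigned long old_mfn = *slot;

    if ( !get_ref(new_mfn) )    /* the real code BUG()s on failure */
        return;

    *slot = new_mfn;

    if ( old_mfn )
        put_ref(old_mfn);       /* safe even if old_mfn == new_mfn */
}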
2657 /************************************************************************/
2658 /************************************************************************/
2659 /************************************************************************/
2661 #if SHADOW_DEBUG
2663 // BUG: these are not SMP safe...
2664 static int sh_l2_present;
2665 static int sh_l1_present;
2666 char * sh_check_name;
2667 int shadow_status_noswap;
2669 #define v2m(_ed, _adr) ({ \
2670 unsigned long _a = (unsigned long)(_adr); \
2671 l2_pgentry_t _pde = shadow_linear_l2_table(_ed)[l2_table_offset(_a)]; \
2672 unsigned long _pa = -1; \
2673 if ( l2e_get_flags(_pde) & _PAGE_PRESENT ) \
2674 { \
2675 l1_pgentry_t _pte; \
2676 _pte = shadow_linear_pg_table[l1_linear_offset(_a)]; \
2677 if ( l1e_get_flags(_pte) & _PAGE_PRESENT ) \
2678 _pa = l1e_get_phys(_pte); \
2679 } \
2680 _pa | (_a & ~PAGE_MASK); \
2681 })
2683 #define FAIL(_f, _a...) \
2684 do { \
2685 printk("XXX %s-FAIL (%d,%d,%d)" _f " at %s(%d)\n", \
2686 sh_check_name, level, l2_idx, l1_idx, ## _a, \
2687 __FILE__, __LINE__); \
2688 printk("g=%08lx s=%08lx &g=%08lx &s=%08lx" \
2689 " v2m(&g)=%08lx v2m(&s)=%08lx ea=%08lx\n", \
2690 gpte, spte, pgpte, pspte, \
2691 v2m(ed, pgpte), v2m(ed, pspte), \
2692 (l2_idx << L2_PAGETABLE_SHIFT) | \
2693 (l1_idx << L1_PAGETABLE_SHIFT)); \
2694 errors++; \
2695 } while ( 0 )
2697 static int check_pte(
2698 struct exec_domain *ed, l1_pgentry_t *pgpte, l1_pgentry_t *pspte,
2699 int level, int l2_idx, int l1_idx, int oos_ptes)
2701 struct domain *d = ed->domain;
2702 l1_pgentry_t gpte = *pgpte;
2703 l1_pgentry_t spte = *pspte;
2704 unsigned long mask, gpfn, smfn, gmfn;
2705 int errors = 0;
2706 int page_table_page;
2708 if ( (l1e_get_value(spte) == 0) ||
2709 (l1e_get_value(spte) == 0xdeadface) ||
2710 (l1e_get_value(spte) == 0x00000E00) )
2711 return errors; /* always safe */
2713 if ( !(l1e_get_flags(spte) & _PAGE_PRESENT) )
2714 FAIL("Non zero not present spte");
2716 if ( level == 2 ) sh_l2_present++;
2717 if ( level == 1 ) sh_l1_present++;
2719 if ( !(l1e_get_flags(gpte) & _PAGE_PRESENT) )
2720 FAIL("Guest not present yet shadow is");
2722 mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|PAGE_MASK);
2724 if ( (l1e_get_value(spte) & mask) != (l1e_get_value(gpte) & mask) )
2725 FAIL("Corrupt?");
2727 if ( (level == 1) &&
2728 (l1e_get_flags(spte) & _PAGE_DIRTY) &&
2729 !(l1e_get_flags(gpte) & _PAGE_DIRTY) && !oos_ptes )
2730 FAIL("Dirty coherence");
2732 if ( (l1e_get_flags(spte) & _PAGE_ACCESSED) &&
2733 !(l1e_get_flags(gpte) & _PAGE_ACCESSED) && !oos_ptes )
2734 FAIL("Accessed coherence");
2736 if ( l1e_get_flags(spte) & _PAGE_GLOBAL )
2737 FAIL("global bit set in shadow");
2739 smfn = l1e_get_pfn(spte);
2740 gpfn = l1e_get_pfn(gpte);
2741 gmfn = __gpfn_to_mfn(d, gpfn);
2743 if ( !VALID_MFN(gmfn) )
2744 FAIL("invalid gpfn=%p gpte=%p\n", __func__, gpfn,
2745 l1e_get_value(gpte));
2747 page_table_page = mfn_is_page_table(gmfn);
2749 if ( (l1e_get_flags(spte) & _PAGE_RW ) &&
2750 !(l1e_get_flags(gpte) & _PAGE_RW) && !oos_ptes )
2752 printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d "
2753 "oos_ptes=%d\n",
2754 gpfn, gmfn, smfn,
2755 frame_table[gmfn].u.inuse.type_info,
2756 page_table_page, oos_ptes);
2757 FAIL("RW coherence");
2760 if ( (level == 1) &&
2761 (l1e_get_flags(spte) & _PAGE_RW ) &&
2762 !((l1e_get_flags(gpte) & _PAGE_RW) &&
2763 (l1e_get_flags(gpte) & _PAGE_DIRTY)) &&
2764 !oos_ptes )
2766 printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d "
2767 "oos_ptes=%d\n",
2768 gpfn, gmfn, smfn,
2769 frame_table[gmfn].u.inuse.type_info,
2770 page_table_page, oos_ptes);
2771 FAIL("RW2 coherence");
2774 if ( gmfn == smfn )
2776 if ( level > 1 )
2777 FAIL("Linear map ???"); /* XXX this will fail on BSD */
2779 else
2781 if ( level < 2 )
2782 FAIL("Shadow in L1 entry?");
2784 if ( level == 2 )
2786 if ( __shadow_status(d, gpfn, PGT_l1_shadow) != smfn )
2787 FAIL("smfn problem gpfn=%p smfn=%p", gpfn,
2788 __shadow_status(d, gpfn, PGT_l1_shadow));
2790 else
2791 BUG(); // XXX -- not handled yet.
2794 return errors;
2796 #undef FAIL
2797 #undef v2m
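/*
 * [Editor's sketch -- not part of the original file.]  The coherence mask
 * built by check_pte() above: a shadow entry is allowed to differ from its
 * guest entry in the frame number (mfn vs. gpfn), in RW (cleared for write
 * protection), in ACCESSED/DIRTY (the shadow may lag behind the guest), and
 * in GLOBAL (never propagated to shadows); every other bit must match.  Bit
 * values are the standard 32-bit x86 PTE layout.
 */
#define SKETCH_PAGE_RW       0x002u
#define SKETCH_PAGE_ACCESSED 0x020u
#define SKETCH_PAGE_DIRTY    0x040u
#define SKETCH_PAGE_GLOBAL   0x100u
#define SKETCH_FRAME_MASK    0xfffff000u

#define SKETCH_MUST_MATCH                                              \
    (~(SKETCH_PAGE_GLOBAL | SKETCH_PAGE_DIRTY | SKETCH_PAGE_ACCESSED | \
       SKETCH_PAGE_RW | SKETCH_FRAME_MASK))

static int sketch_flags_coherent(unsigned long gpte, unsigned long spte)
{
    return ((gpte ^ spte) & SKETCH_MUST_MATCH) == 0;
}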
2799 static int check_l1_table(
2800 struct exec_domain *ed, unsigned long gpfn,
2801 unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
2803 struct domain *d = ed->domain;
2804 int i;
2805 l1_pgentry_t *gpl1e, *spl1e;
2806 int errors = 0, oos_ptes = 0;
2808 if ( page_out_of_sync(pfn_to_page(gmfn)) )
2810 gmfn = __shadow_status(d, gpfn, PGT_snapshot);
2811 oos_ptes = 1;
2812 ASSERT(gmfn);
2815 gpl1e = map_domain_mem(gmfn << PAGE_SHIFT);
2816 spl1e = map_domain_mem(smfn << PAGE_SHIFT);
2818 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2819 errors += check_pte(ed, &gpl1e[i], &spl1e[i], 1, l2_idx, i, oos_ptes);
2821 unmap_domain_mem(spl1e);
2822 unmap_domain_mem(gpl1e);
2824 return errors;
2827 #define FAILPT(_f, _a...) \
2828 do { \
2829 printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \
2830 errors++; \
2831 } while ( 0 )
2833 int check_l2_table(
2834 struct exec_domain *ed, unsigned long gmfn, unsigned long smfn, int oos_pdes)
2836 struct domain *d = ed->domain;
2837 l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_mem(gmfn << PAGE_SHIFT);
2838 l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
2839 l2_pgentry_t match;
2840 int i;
2841 int errors = 0;
2842 int limit;
2844 if ( !oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != d) )
2845 FAILPT("domain doesn't own page");
2846 if ( oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != NULL) )
2847 FAILPT("bogus owner for snapshot page");
2848 if ( page_get_owner(pfn_to_page(smfn)) != NULL )
2849 FAILPT("shadow page mfn=0x%08x is owned by someone, domid=%d",
2850 smfn, page_get_owner(pfn_to_page(smfn))->id);
2852 #if 0
2853 if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
2854 &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
2855 ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
2856 DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) )
2858 for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2859 i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT);
2860 i++ )
2861 printk("+++ (%d) %p %p\n",i,
2862 l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]));
2863 FAILPT("hypervisor entries inconsistent");
2866 if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
2867 l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
2868 FAILPT("hypervisor linear map inconsistent");
2869 #endif
2871 match = l2e_create_pfn(smfn, __PAGE_HYPERVISOR);
2872 if ( !shadow_mode_external(d) &&
2873 l2e_has_changed(&spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT],
2874 &match, PAGE_FLAG_MASK))
2876 FAILPT("hypervisor shadow linear map inconsistent %p %p",
2877 l2e_get_value(spl2e[SH_LINEAR_PT_VIRT_START >>
2878 L2_PAGETABLE_SHIFT]),
2879 l2e_get_value(match));
2882 match = l2e_create_phys(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
2883 if ( !shadow_mode_external(d) &&
2884 l2e_has_changed(&spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT],
2885 &match, PAGE_FLAG_MASK))
2887 FAILPT("hypervisor per-domain map inconsistent saw %p, expected (va=%p) %p",
2888 l2e_get_value(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]),
2889 d->arch.mm_perdomain_pt,
2890 l2e_get_value(match));
2893 #ifdef __i386__
2894 if ( shadow_mode_external(d) )
2895 limit = L2_PAGETABLE_ENTRIES;
2896 else
2897 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2898 #else
2899 limit = 0; /* XXX x86/64 XXX */
2900 #endif
2902 /* Check the whole L2. */
2903 for ( i = 0; i < limit; i++ )
2904 errors += check_pte(ed,
2905 (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */
2906 (l1_pgentry_t*)(&spl2e[i]),
2907 2, i, 0, 0);
2909 unmap_domain_mem(spl2e);
2910 unmap_domain_mem(gpl2e);
2912 #if 1
2913 if ( errors )
2914 printk("check_l2_table returning %d errors\n", errors);
2915 #endif
2917 return errors;
2919 #undef FAILPT
2921 int _check_pagetable(struct exec_domain *ed, char *s)
2923 struct domain *d = ed->domain;
2924 pagetable_t pt = ed->arch.guest_table;
2925 unsigned long gptbase = pagetable_val(pt);
2926 unsigned long ptbase_pfn, smfn;
2927 unsigned long i;
2928 l2_pgentry_t *gpl2e, *spl2e;
2929 unsigned long ptbase_mfn = 0;
2930 int errors = 0, limit, oos_pdes = 0;
2932 //_audit_domain(d, AUDIT_QUIET);
2933 shadow_lock(d);
2935 sh_check_name = s;
2936 //SH_VVLOG("%s-PT Audit", s);
2937 sh_l2_present = sh_l1_present = 0;
2938 perfc_incrc(check_pagetable);
2940 ptbase_mfn = gptbase >> PAGE_SHIFT;
2941 ptbase_pfn = __mfn_to_gpfn(d, ptbase_mfn);
2943 if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
2945 printk("%s-PT %p not shadowed\n", s, gptbase);
2946 goto out;
2948 if ( page_out_of_sync(pfn_to_page(ptbase_mfn)) )
2950 ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
2951 oos_pdes = 1;
2952 ASSERT(ptbase_mfn);
2955 errors += check_l2_table(ed, ptbase_mfn, smfn, oos_pdes);
2957 gpl2e = (l2_pgentry_t *) map_domain_mem( ptbase_mfn << PAGE_SHIFT );
2958 spl2e = (l2_pgentry_t *) map_domain_mem( smfn << PAGE_SHIFT );
2960 /* Go back and recurse. */
2961 #ifdef __i386__
2962 if ( shadow_mode_external(d) )
2963 limit = L2_PAGETABLE_ENTRIES;
2964 else
2965 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2966 #else
2967 limit = 0; /* XXX x86/64 XXX */
2968 #endif
2970 for ( i = 0; i < limit; i++ )
2972 unsigned long gl1pfn = l2e_get_pfn(gpl2e[i]);
2973 unsigned long gl1mfn = __gpfn_to_mfn(d, gl1pfn);
2974 unsigned long sl1mfn = l2e_get_pfn(spl2e[i]);
2976 if ( l2e_get_value(spl2e[i]) != 0 ) /* FIXME: check flags? */
2978 errors += check_l1_table(ed, gl1pfn, gl1mfn, sl1mfn, i);
2982 unmap_domain_mem(spl2e);
2983 unmap_domain_mem(gpl2e);
2985 #if 0
2986 SH_VVLOG("PT verified : l2_present = %d, l1_present = %d",
2987 sh_l2_present, sh_l1_present);
2988 #endif
2990 out:
2991 if ( errors )
2992 BUG();
2994 shadow_unlock(d);
2996 return errors;
2999 int _check_all_pagetables(struct exec_domain *ed, char *s)
3001 struct domain *d = ed->domain;
3002 int i;
3003 struct shadow_status *a;
3004 unsigned long gmfn;
3005 int errors = 0;
3007 shadow_status_noswap = 1;
3009 sh_check_name = s;
3010 SH_VVLOG("%s-PT Audit domid=%d", s, d->id);
3011 sh_l2_present = sh_l1_present = 0;
3012 perfc_incrc(check_all_pagetables);
3014 for (i = 0; i < shadow_ht_buckets; i++)
3016 a = &d->arch.shadow_ht[i];
3017 while ( a && a->gpfn_and_flags )
3019 gmfn = __gpfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
3021 switch ( a->gpfn_and_flags & PGT_type_mask )
3023 case PGT_l1_shadow:
3024 errors += check_l1_table(ed, a->gpfn_and_flags & PGT_mfn_mask,
3025 gmfn, a->smfn, 0);
3026 break;
3027 case PGT_l2_shadow:
3028 errors += check_l2_table(ed, gmfn, a->smfn,
3029 page_out_of_sync(pfn_to_page(gmfn)));
3030 break;
3031 case PGT_l3_shadow:
3032 case PGT_l4_shadow:
3033 case PGT_hl2_shadow:
3034 BUG(); // XXX - ought to fix this...
3035 break;
3036 case PGT_snapshot:
3037 case PGT_writable_pred:
3038 break;
3039 default:
3040 errors++;
3041 printk("unexpected shadow type %p, gpfn=%p, "
3042 "gmfn=%p smfn=%p\n",
3043 a->gpfn_and_flags & PGT_type_mask,
3044 a->gpfn_and_flags & PGT_mfn_mask,
3045 gmfn, a->smfn);
3046 BUG();
3048 a = a->next;
3052 shadow_status_noswap = 0;
3054 if ( errors )
3055 BUG();
3057 return errors;
3060 #endif // SHADOW_DEBUG
3062 /*
3063 * Local variables:
3064 * mode: C
3065 * c-set-style: "BSD"
3066 * c-basic-offset: 4
3067 * tab-width: 4
3068 * indent-tabs-mode: nil
3069 * End:
3070 */