debuggers.hg

view xen/arch/x86/shadow.c @ 4629:6375127fdf23

bitkeeper revision 1.1311.1.1 (426641eeBv97w6sl983zxeR4Dc3Utg)

Cleanup page table handling. Add macros to access page table
entries, fixup plenty of places in the code to use the page
table types instead of "unsigned long".

Signed-off-by: Gerd Knorr <kraxel@bytesex.org>
Signed-off-by: michael.fetterman@cl.cam.ac.uk
author mafetter@fleming.research
date Wed Apr 20 11:50:06 2005 +0000 (2005-04-20)
parents b4ebb22003b1
children 1803018b3b05
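The accessor style this changeset introduces is visible throughout the listing below. As a rough, editor-added sketch only (the macro definitions themselves live in the page-table headers, not in this file; "mfn" here stands for any machine frame number), code that used to poke raw "unsigned long" values now goes through typed entries:
    /* old style: raw bit-twiddling on an unsigned long PTE value */
    unsigned long pte   = (mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR;
    unsigned long frame = (pte & _PAGE_PRESENT) ? (pte >> PAGE_SHIFT) : 0;
    /* new style: a typed l1_pgentry_t plus accessor macros, as used below */
    l1_pgentry_t  l1e    = l1e_create_pfn(mfn, __PAGE_HYPERVISOR);
    unsigned long frame2 = (l1e_get_flags(l1e) & _PAGE_PRESENT)
                           ? l1e_get_pfn(l1e) : 0;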
line source
1 /******************************************************************************
2 * arch/x86/shadow.c
3 *
4 * Copyright (c) 2005 Michael A Fetterman
5 * Based on an earlier implementation by Ian Pratt et al
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
23 #include <xen/config.h>
24 #include <xen/types.h>
25 #include <xen/mm.h>
26 #include <asm/shadow.h>
27 #include <asm/domain_page.h>
28 #include <asm/page.h>
29 #include <xen/event.h>
30 #include <xen/sched.h>
31 #include <xen/trace.h>
33 static void shadow_free_snapshot(struct domain *d,
34 struct out_of_sync_entry *entry);
35 static void remove_out_of_sync_entries(struct domain *d, unsigned long smfn);
37 /********
39 There's a per-domain shadow table spin lock which works fine for SMP
40 hosts. We don't have to worry about interrupts as no shadow operations
41 happen in an interrupt context. It's probably not quite ready for SMP
42 guest operation as we have to worry about synchronisation between gpte
43 and spte updates. It's possible that this might only happen in a
44 hypercall context, in which case we'll probably have a per-domain
45 hypercall lock anyhow (at least initially).
47 ********/
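/*
 * Editor's illustrative sketch -- not part of the original file. The lock
 * discipline described above is the pattern used by shadow_mode_enable() and
 * shadow_invlpg() later in this file; the function name below is made up
 * purely for illustration.
 */
#if 0
static int example_locked_shadow_op(struct domain *d)
{
    int rc;
    shadow_lock(d);   /* one spinlock per domain; never taken in IRQ context */
    rc = 0;           /* ... inspect/update gptes and sptes here ... */
    shadow_unlock(d);
    return rc;
}
#endif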
49 static inline int
50 shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
51 unsigned long new_type)
52 {
53 struct pfn_info *page = pfn_to_page(gmfn);
54 int pinned = 0, okay = 1;
56 if ( page_out_of_sync(page) )
57 {
58 // Don't know how long ago this snapshot was taken.
59 // Can't trust it to be recent enough.
60 //
61 __shadow_sync_mfn(d, gmfn);
62 }
64 if ( unlikely(page_is_page_table(page)) )
65 return 1;
67 FSH_LOG("%s: gpfn=%p gmfn=%p nt=%p", __func__, gpfn, gmfn, new_type);
69 if ( !shadow_remove_all_write_access(d, gpfn, gmfn) )
70 {
71 FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%p gmfn=%p",
72 __func__, gpfn, gmfn);
73 #if 1 || defined(LIVE_DANGEROUSLY)
74 set_bit(_PGC_page_table, &page->count_info);
75 return 1;
76 #endif
77 return 0;
79 }
81 // To convert this page for use as a page table, the writable count
82 // should now be zero. Test this by grabbing the page as a page table,
83 // and then immediately releasing it. This will also deal with any
84 // necessary TLB flushing issues for us.
85 //
86 // The cruft here about pinning doesn't really work right. This
87 // needs rethinking/rewriting... Need to gracefully deal with the
88 // TLB flushes required when promoting a writable page, and also deal
89 // with any outstanding (external) writable refs to this page (by
90 // refusing to promote it). The pinning headache complicates this
91 code -- it would all get much simpler if we stopped using
92 shadow_lock() and moved the shadow code to BIGLOCK().
93 //
94 if ( unlikely(!get_page(page, d)) )
95 BUG(); // XXX -- needs more thought for a graceful failure
96 if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
97 {
98 pinned = 1;
99 put_page_and_type(page);
100 }
101 if ( get_page_type(page, PGT_base_page_table) )
102 {
103 set_bit(_PGC_page_table, &page->count_info);
104 put_page_type(page);
105 }
106 else
107 {
108 printk("shadow_promote: get_page_type failed "
109 "dom%d gpfn=%p gmfn=%p t=%x\n",
110 d->id, gpfn, gmfn, new_type);
111 okay = 0;
112 }
114 // Now put the type back to writable...
115 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
116 BUG(); // XXX -- needs more thought for a graceful failure
117 if ( unlikely(pinned) )
118 {
119 if ( unlikely(test_and_set_bit(_PGT_pinned,
120 &page->u.inuse.type_info)) )
121 BUG(); // hmm... someone pinned this again?
122 }
123 else
124 put_page_and_type(page);
126 return okay;
127 }
129 static inline void
130 shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
131 {
132 ASSERT(frame_table[gmfn].count_info & PGC_page_table);
134 if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none )
135 {
136 clear_bit(_PGC_page_table, &frame_table[gmfn].count_info);
138 if ( page_out_of_sync(pfn_to_page(gmfn)) )
139 {
140 remove_out_of_sync_entries(d, gmfn);
141 }
142 }
143 }
145 /*
146 * Things in shadow mode that collect get_page() refs to the domain's
147 * pages are:
148 * - PGC_allocated takes a gen count, just like normal.
149 * - A writable page can be pinned (paravirtualized guests may consider
150 * these pages to be L1s or L2s, and don't know the difference).
151 * Pinning a page takes a gen count (but, for domains in shadow mode,
152 * it *doesn't* take a type count)
153 * - CR3 grabs a ref to whatever it points at, just like normal.
154 * - Shadow mode grabs an initial gen count for itself, as a placeholder
155 * for whatever references will exist.
156 * - Shadow PTEs that point to a page take a gen count, just like regular
157 * PTEs. However, they don't get a type count, as get_page_type() is
158 * hardwired to keep writable pages' counts at 1 for domains in shadow
159 * mode.
160 * - Whenever we shadow a page, the entry in the shadow hash grabs a
161 * general ref to the page.
162 * - Whenever a page goes out of sync, the out of sync entry grabs a
163 * general ref to the page.
164 */
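/*
 * Editor's illustrative sketch -- not part of the original file. The last two
 * bullets above are paraphrased from shadow_mark_mfn_out_of_sync() and
 * release_out_of_sync_entry() further down: marking a page out of sync takes
 * a general ref on the guest page, and retiring the entry drops it again.
 */
#if 0
    /* when marking (shadow_mark_mfn_out_of_sync): */
    get_page(page, d);                  /* general ref held by the OOS entry */
    entry->next = d->arch.out_of_sync;
    d->arch.out_of_sync = entry;
    /* when releasing (release_out_of_sync_entry): */
    put_page(page);                     /* drop that general ref */
#endif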
165 /*
166 * pfn_info fields for pages allocated as shadow pages:
167 *
168 * All 32 bits of count_info are a simple count of refs to this shadow
169 * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table),
170 * c) the pin, if it's a pinned shadow root pgtable, and d) outstanding out-of-sync
171 * references.
172 *
173 * u.inuse._domain is left NULL, to prevent accidentally allowing some random
174 * domain from gaining permissions to map this page.
175 *
176 * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
177 * shadowed.
178 * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
179 * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
180 * currently exists because this is a shadow of a root page, and we
181 * don't want to let those disappear just because no CR3 is currently pointing
182 * at it.
183 *
184 * tlbflush_timestamp holds a min & max index of valid page table entries
185 * within the shadow page.
186 */
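/*
 * Editor's illustrative sketch -- not part of the original file. These are
 * the assignments alloc_shadow_page() makes below, and how
 * free_shadow_l1_table() later decodes the min/max window packed into
 * tlbflush_timestamp.
 */
#if 0
    page->u.inuse.type_info  = psh_type | gmfn; /* shadow type | shadowed mfn */
    page->count_info         = 0;     /* refs from shadows, CR3s, pins, OOS   */
    page->tlbflush_timestamp = 0;     /* later SHADOW_ENCODE_MIN_MAX(min,max) */
    /* decoding the valid-entry window when freeing a shadow L1: */
    u32 min_max = page->tlbflush_timestamp;
    int min = SHADOW_MIN(min_max), max = SHADOW_MAX(min_max);
#endif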
188 static inline unsigned long
189 alloc_shadow_page(struct domain *d,
190 unsigned long gpfn, unsigned long gmfn,
191 u32 psh_type)
192 {
193 struct pfn_info *page;
194 unsigned long smfn;
195 int pin = 0;
197 // Currently, we only keep pre-zero'ed pages around for use as L1's...
198 // This will change. Soon.
199 //
200 if ( psh_type == PGT_l1_shadow )
201 {
202 if ( !list_empty(&d->arch.free_shadow_frames) )
203 {
204 struct list_head *entry = d->arch.free_shadow_frames.next;
205 page = list_entry(entry, struct pfn_info, list);
206 list_del(entry);
207 perfc_decr(free_l1_pages);
208 }
209 else
210 {
211 page = alloc_domheap_page(NULL);
212 void *l1 = map_domain_mem(page_to_pfn(page) << PAGE_SHIFT);
213 memset(l1, 0, PAGE_SIZE);
214 unmap_domain_mem(l1);
215 }
216 }
217 else
218 page = alloc_domheap_page(NULL);
220 if ( unlikely(page == NULL) )
221 {
222 printk("Couldn't alloc shadow page! dom%d count=%d\n",
223 d->id, d->arch.shadow_page_count);
224 printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
225 perfc_value(shadow_l1_pages),
226 perfc_value(shadow_l2_pages),
227 perfc_value(hl2_table_pages),
228 perfc_value(snapshot_pages));
229 BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
230 }
232 smfn = page_to_pfn(page);
234 ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
235 page->u.inuse.type_info = psh_type | gmfn;
236 page->count_info = 0;
237 page->tlbflush_timestamp = 0;
239 switch ( psh_type )
240 {
241 case PGT_l1_shadow:
242 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
243 goto fail;
244 perfc_incr(shadow_l1_pages);
245 d->arch.shadow_page_count++;
246 break;
248 case PGT_l2_shadow:
249 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
250 goto fail;
251 perfc_incr(shadow_l2_pages);
252 d->arch.shadow_page_count++;
253 if ( PGT_l2_page_table == PGT_root_page_table )
254 pin = 1;
256 break;
258 case PGT_hl2_shadow:
259 // Treat an hl2 as an L1 for purposes of promotion.
260 // For external mode domains, treat them as an L2 for purposes of
261 // pinning.
262 //
263 if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) )
264 goto fail;
265 perfc_incr(hl2_table_pages);
266 d->arch.hl2_page_count++;
267 if ( shadow_mode_external(d) &&
268 (PGT_l2_page_table == PGT_root_page_table) )
269 pin = 1;
271 break;
273 case PGT_snapshot:
274 perfc_incr(snapshot_pages);
275 d->arch.snapshot_page_count++;
276 break;
278 default:
279 printk("Alloc shadow weird page type type=%08x\n", psh_type);
280 BUG();
281 break;
282 }
284 // Don't add a new shadow of something that already has a snapshot.
285 //
286 ASSERT( (psh_type == PGT_snapshot) || !mfn_out_of_sync(gmfn) );
288 set_shadow_status(d, gpfn, gmfn, smfn, psh_type);
290 if ( pin )
291 shadow_pin(smfn);
293 return smfn;
295 fail:
296 FSH_LOG("promotion of pfn=%p mfn=%p failed! external gnttab refs?",
297 gpfn, gmfn);
298 free_domheap_page(page);
299 return 0;
300 }
302 static void inline
303 free_shadow_l1_table(struct domain *d, unsigned long smfn)
304 {
305 l1_pgentry_t *pl1e = map_domain_mem(smfn << PAGE_SHIFT);
306 int i;
307 struct pfn_info *spage = pfn_to_page(smfn);
308 u32 min_max = spage->tlbflush_timestamp;
309 int min = SHADOW_MIN(min_max);
310 int max = SHADOW_MAX(min_max);
312 for ( i = min; i <= max; i++ )
313 {
314 put_page_from_l1e(pl1e[i], d);
315 pl1e[i] = l1e_empty();
316 }
318 unmap_domain_mem(pl1e);
319 }
321 static void inline
322 free_shadow_hl2_table(struct domain *d, unsigned long smfn)
323 {
324 l1_pgentry_t *hl2 = map_domain_mem(smfn << PAGE_SHIFT);
325 int i, limit;
327 SH_VVLOG("%s: smfn=%p freed\n", __func__, smfn);
329 #ifdef __i386__
330 if ( shadow_mode_external(d) )
331 limit = L2_PAGETABLE_ENTRIES;
332 else
333 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
334 #else
335 limit = 0; /* XXX x86/64 XXX */
336 #endif
338 for ( i = 0; i < limit; i++ )
339 {
340 if ( l1e_get_flags(hl2[i]) & _PAGE_PRESENT )
341 put_page(pfn_to_page(l1e_get_pfn(hl2[i])));
342 }
344 unmap_domain_mem(hl2);
345 }
347 static void inline
348 free_shadow_l2_table(struct domain *d, unsigned long smfn)
349 {
350 unsigned long *pl2e = map_domain_mem(smfn << PAGE_SHIFT);
351 int i, external = shadow_mode_external(d);
353 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
354 if ( external || is_guest_l2_slot(i) )
355 if ( pl2e[i] & _PAGE_PRESENT )
356 put_shadow_ref(pl2e[i] >> PAGE_SHIFT);
358 if ( (PGT_base_page_table == PGT_l2_page_table) &&
359 shadow_mode_translate(d) && !external )
360 {
361 // free the ref to the hl2
362 //
363 put_shadow_ref(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]
364 >> PAGE_SHIFT);
365 }
367 unmap_domain_mem(pl2e);
368 }
370 void free_shadow_page(unsigned long smfn)
371 {
372 struct pfn_info *page = &frame_table[smfn];
373 unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;
374 struct domain *d = page_get_owner(pfn_to_page(gmfn));
375 unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
376 unsigned long type = page->u.inuse.type_info & PGT_type_mask;
378 SH_VVLOG("%s: free'ing smfn=%p", __func__, smfn);
380 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
382 delete_shadow_status(d, gpfn, gmfn, type);
384 switch ( type )
385 {
386 case PGT_l1_shadow:
387 perfc_decr(shadow_l1_pages);
388 shadow_demote(d, gpfn, gmfn);
389 free_shadow_l1_table(d, smfn);
390 break;
392 case PGT_l2_shadow:
393 perfc_decr(shadow_l2_pages);
394 shadow_demote(d, gpfn, gmfn);
395 free_shadow_l2_table(d, smfn);
396 break;
398 case PGT_hl2_shadow:
399 perfc_decr(hl2_table_pages);
400 shadow_demote(d, gpfn, gmfn);
401 free_shadow_hl2_table(d, smfn);
402 break;
404 case PGT_snapshot:
405 perfc_decr(snapshot_pages);
406 break;
408 default:
409 printk("Free shadow weird page type mfn=%08x type=%08x\n",
410 page-frame_table, page->u.inuse.type_info);
411 break;
412 }
414 d->arch.shadow_page_count--;
416 // No TLB flushes are needed the next time this page gets allocated.
417 //
418 page->tlbflush_timestamp = 0;
419 page->u.free.cpu_mask = 0;
421 if ( type == PGT_l1_shadow )
422 {
423 list_add(&page->list, &d->arch.free_shadow_frames);
424 perfc_incr(free_l1_pages);
425 }
426 else
427 free_domheap_page(page);
428 }
430 static void inline
431 release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
432 {
433 struct pfn_info *page;
435 page = &frame_table[entry->gmfn];
437 // Decrement ref count of guest & shadow pages
438 //
439 put_page(page);
441 // Only use entries that have low bits clear...
442 //
443 if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
444 {
445 put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT);
446 entry->writable_pl1e = -2;
447 }
448 else
449 ASSERT( entry->writable_pl1e == -1 );
451 // Free the snapshot
452 //
453 shadow_free_snapshot(d, entry);
454 }
456 static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn)
457 {
458 struct out_of_sync_entry *entry = d->arch.out_of_sync;
459 struct out_of_sync_entry **prev = &d->arch.out_of_sync;
460 struct out_of_sync_entry *found = NULL;
462 // NB: Be careful not to call something that manipulates this list
463 // while walking it. Collect the results into a separate list
464 // first, then walk that list.
465 //
466 while ( entry )
467 {
468 if ( entry->gmfn == gmfn )
469 {
470 // remove from out of sync list
471 *prev = entry->next;
473 // add to found list
474 entry->next = found;
475 found = entry;
477 entry = *prev;
478 continue;
479 }
480 prev = &entry->next;
481 entry = entry->next;
482 }
484 prev = NULL;
485 entry = found;
486 while ( entry )
487 {
488 release_out_of_sync_entry(d, entry);
490 prev = &entry->next;
491 entry = entry->next;
492 }
494 // Add found list to free list
495 if ( prev )
496 {
497 *prev = d->arch.out_of_sync_free;
498 d->arch.out_of_sync_free = found;
499 }
500 }
502 static void free_out_of_sync_state(struct domain *d)
503 {
504 struct out_of_sync_entry *entry;
506 // NB: Be careful not to call something that manipulates this list
507 // while walking it. Remove one item at a time, and always
508 // restart from start of list.
509 //
510 while ( (entry = d->arch.out_of_sync) )
511 {
512 d->arch.out_of_sync = entry->next;
513 release_out_of_sync_entry(d, entry);
515 entry->next = d->arch.out_of_sync_free;
516 d->arch.out_of_sync_free = entry;
517 }
518 }
520 static void free_shadow_pages(struct domain *d)
521 {
522 int i;
523 struct shadow_status *x;
524 struct exec_domain *ed;
526 /*
527 * WARNING! The shadow page table must not currently be in use!
528 * e.g., you are expected to have paused the domain and synchronized CR3.
529 */
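/*
 * Editor's illustrative sketch -- not part of the original file. The expected
 * calling sequence, as shadow_mode_control() below does it: pause the domain
 * and hold the shadow lock around the free, then refresh the pagetables.
 */
#if 0
    domain_pause(d);
    shadow_lock(d);
    free_shadow_pages(d);
    shadow_unlock(d);
    for_each_exec_domain(d, ed)
        update_pagetables(ed);
    domain_unpause(d);
#endif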
531 if( !d->arch.shadow_ht ) return;
533 shadow_audit(d, 1);
535 // first, remove any outstanding refs from out_of_sync entries...
536 //
537 free_out_of_sync_state(d);
539 // second, remove any outstanding refs from ed->arch.shadow_table...
540 //
541 for_each_exec_domain(d, ed)
542 {
543 if ( pagetable_val(ed->arch.shadow_table) )
544 {
545 put_shadow_ref(pagetable_val(ed->arch.shadow_table) >> PAGE_SHIFT);
546 ed->arch.shadow_table = mk_pagetable(0);
547 }
548 }
550 // For external shadows, remove the monitor table's refs
551 //
552 if ( shadow_mode_external(d) )
553 {
554 for_each_exec_domain(d, ed)
555 {
556 l2_pgentry_t *mpl2e = ed->arch.monitor_vtable;
557 l2_pgentry_t hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
558 l2_pgentry_t smfn = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
560 if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
561 {
562 put_shadow_ref(l2e_get_pfn(hl2e));
563 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
564 }
565 if ( l2e_get_flags(smfn) & _PAGE_PRESENT )
566 {
567 put_shadow_ref(l2e_get_pfn(smfn));
568 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
569 }
570 }
571 }
573 // Now, the only refs to shadow pages that are left are from the shadow
574 // pages themselves. We just unpin the pinned pages, and the rest
575 // should automatically disappear.
576 //
577 // NB: Beware: each explicit or implicit call to free_shadow_page
578 // can/will result in the hash bucket getting rewritten out from
579 // under us... First, collect the list of pinned pages, then
580 // free them.
581 //
582 #define PINNED(_x) (frame_table[_x].u.inuse.type_info & PGT_pinned)
583 for ( i = 0; i < shadow_ht_buckets; i++ )
584 {
585 u32 count;
586 unsigned long *mfn_list;
588 /* Skip empty buckets. */
589 x = &d->arch.shadow_ht[i];
590 if ( x->gpfn_and_flags == 0 )
591 continue;
593 count = 0;
594 for ( ; x != NULL; x = x->next )
595 if ( PINNED(x->smfn) )
596 count++;
597 if ( !count )
598 continue;
600 mfn_list = xmalloc_array(unsigned long, count);
601 count = 0;
602 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
603 if ( PINNED(x->smfn) )
604 mfn_list[count++] = x->smfn;
606 while ( count )
607 {
608 shadow_unpin(mfn_list[--count]);
609 }
610 xfree(mfn_list);
611 }
612 #undef PINNED
614 shadow_audit(d, 0);
616 SH_LOG("Free shadow table.");
617 }
619 void shadow_mode_init(void)
620 {
621 }
623 int _shadow_mode_enabled(struct domain *d)
624 {
625 return shadow_mode_enabled(d);
626 }
628 static void alloc_monitor_pagetable(struct exec_domain *ed)
629 {
630 unsigned long mmfn;
631 l2_pgentry_t *mpl2e;
632 struct pfn_info *mmfn_info;
633 struct domain *d = ed->domain;
635 ASSERT(pagetable_val(ed->arch.monitor_table) == 0);
637 mmfn_info = alloc_domheap_page(NULL);
638 ASSERT(mmfn_info != NULL);
640 mmfn = (unsigned long) (mmfn_info - frame_table);
641 mpl2e = (l2_pgentry_t *) map_domain_mem(mmfn << PAGE_SHIFT);
642 memset(mpl2e, 0, PAGE_SIZE);
644 #ifdef __i386__ /* XXX screws x86/64 build */
645 memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
646 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
647 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
648 #endif
650 mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
651 l2e_create_phys(__pa(d->arch.mm_perdomain_pt),
652 __PAGE_HYPERVISOR);
654 // map the phys_to_machine map into the Read-Only MPT space for this domain
655 mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
656 l2e_create_phys(pagetable_val(d->arch.phys_table),
657 __PAGE_HYPERVISOR);
659 // Don't (yet) have mappings for these...
660 // Don't want to accidentally see the idle_pg_table's linear mapping.
661 //
662 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
663 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
665 ed->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT);
666 ed->arch.monitor_vtable = mpl2e;
667 }
669 /*
670 * Free the pages for monitor_table and hl2_table
671 */
672 void free_monitor_pagetable(struct exec_domain *ed)
673 {
674 l2_pgentry_t *mpl2e, hl2e, sl2e;
675 unsigned long mfn;
677 ASSERT( pagetable_val(ed->arch.monitor_table) );
678 ASSERT( shadow_mode_external(ed->domain) );
680 mpl2e = ed->arch.monitor_vtable;
682 /*
683 * First get the mfn for hl2_table by looking at monitor_table
684 */
685 hl2e = mpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT];
686 if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
687 {
688 mfn = l2e_get_pfn(hl2e);
689 ASSERT(mfn);
690 put_shadow_ref(mfn);
691 }
693 sl2e = mpl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT];
694 if ( l2e_get_flags(sl2e) & _PAGE_PRESENT )
695 {
696 mfn = l2e_get_pfn(sl2e);
697 ASSERT(mfn);
698 put_shadow_ref(mfn);
699 }
701 unmap_domain_mem(mpl2e);
703 /*
704 * Then free monitor_table.
705 */
706 mfn = (pagetable_val(ed->arch.monitor_table)) >> PAGE_SHIFT;
707 free_domheap_page(&frame_table[mfn]);
709 ed->arch.monitor_table = mk_pagetable(0);
710 ed->arch.monitor_vtable = 0;
711 }
713 int
714 set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn)
715 {
716 unsigned long phystab = pagetable_val(d->arch.phys_table);
717 l2_pgentry_t *l2, l2e;
718 l1_pgentry_t *l1;
719 struct pfn_info *l1page;
720 unsigned long va = pfn << PAGE_SHIFT;
722 ASSERT( phystab );
724 l2 = map_domain_mem(phystab);
725 l2e = l2[l2_table_offset(va)];
726 if ( !l2e_get_value(l2e) ) /* FIXME: check present bit? */
727 {
728 l1page = alloc_domheap_page(NULL);
729 if ( !l1page )
730 return 0;
732 l1 = map_domain_mem(page_to_pfn(l1page) << PAGE_SHIFT);
733 memset(l1, 0, PAGE_SIZE);
734 unmap_domain_mem(l1);
736 l2e = l2e_create_pfn(page_to_pfn(l1page), __PAGE_HYPERVISOR);
737 l2[l2_table_offset(va)] = l2e;
738 }
739 unmap_domain_mem(l2);
741 l1 = map_domain_mem(l2e_get_phys(l2e));
742 l1[l1_table_offset(va)] = l1e_create_pfn(mfn, __PAGE_HYPERVISOR);
743 unmap_domain_mem(l1);
745 return 1;
746 }
748 static int
749 alloc_p2m_table(struct domain *d)
750 {
751 struct list_head *list_ent;
752 struct pfn_info *page, *l2page;
753 l2_pgentry_t *l2;
754 unsigned long mfn, pfn;
756 l2page = alloc_domheap_page(NULL);
757 if ( !l2page )
758 return 0;
759 d->arch.phys_table = mk_pagetable(page_to_pfn(l2page) << PAGE_SHIFT);
760 l2 = map_domain_mem(page_to_pfn(l2page) << PAGE_SHIFT);
761 memset(l2, 0, PAGE_SIZE);
762 unmap_domain_mem(l2);
764 list_ent = d->page_list.next;
765 while ( list_ent != &d->page_list )
766 {
767 page = list_entry(list_ent, struct pfn_info, list);
768 mfn = page_to_pfn(page);
769 pfn = machine_to_phys_mapping[mfn];
770 ASSERT(pfn != INVALID_M2P_ENTRY);
771 ASSERT(pfn < (1u<<20));
773 set_p2m_entry(d, pfn, mfn);
775 list_ent = page->list.next;
776 }
778 list_ent = d->xenpage_list.next;
779 while ( list_ent != &d->xenpage_list )
780 {
781 page = list_entry(list_ent, struct pfn_info, list);
782 mfn = page_to_pfn(page);
783 pfn = machine_to_phys_mapping[mfn];
784 if ( (pfn != INVALID_M2P_ENTRY) &&
785 (pfn < (1u<<20)) )
786 {
787 set_p2m_entry(d, pfn, mfn);
788 }
790 list_ent = page->list.next;
791 }
793 return 1;
794 }
796 static void
797 free_p2m_table(struct domain *d)
798 {
799 // uh, this needs some work... :)
800 BUG();
801 }
803 int __shadow_mode_enable(struct domain *d, unsigned int mode)
804 {
805 struct exec_domain *ed;
806 int new_modes = (mode & ~d->arch.shadow_mode);
808 // Gotta be adding something to call this function.
809 ASSERT(new_modes);
811 // can't take anything away by calling this function.
812 ASSERT(!(d->arch.shadow_mode & ~mode));
814 for_each_exec_domain(d, ed)
815 {
816 invalidate_shadow_ldt(ed);
818 // We need to set these up for __update_pagetables().
819 // See the comment there.
821 /*
822 * arch.guest_vtable
823 */
824 if ( ed->arch.guest_vtable &&
825 (ed->arch.guest_vtable != __linear_l2_table) )
826 {
827 unmap_domain_mem(ed->arch.guest_vtable);
828 }
829 if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
830 ed->arch.guest_vtable = __linear_l2_table;
831 else
832 ed->arch.guest_vtable = NULL;
834 /*
835 * arch.shadow_vtable
836 */
837 if ( ed->arch.shadow_vtable &&
838 (ed->arch.shadow_vtable != __shadow_linear_l2_table) )
839 {
840 unmap_domain_mem(ed->arch.shadow_vtable);
841 }
842 if ( !(mode & SHM_external) )
843 ed->arch.shadow_vtable = __shadow_linear_l2_table;
844 else
845 ed->arch.shadow_vtable = NULL;
847 /*
848 * arch.hl2_vtable
849 */
850 if ( ed->arch.hl2_vtable &&
851 (ed->arch.hl2_vtable != __linear_hl2_table) )
852 {
853 unmap_domain_mem(ed->arch.hl2_vtable);
854 }
855 if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
856 ed->arch.hl2_vtable = __linear_hl2_table;
857 else
858 ed->arch.hl2_vtable = NULL;
860 /*
861 * arch.monitor_table & arch.monitor_vtable
862 */
863 if ( ed->arch.monitor_vtable )
864 {
865 free_monitor_pagetable(ed);
866 }
867 if ( mode & SHM_external )
868 {
869 alloc_monitor_pagetable(ed);
870 }
871 }
873 if ( new_modes & SHM_enable )
874 {
875 ASSERT( !d->arch.shadow_ht );
876 d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets);
877 if ( d->arch.shadow_ht == NULL )
878 goto nomem;
880 memset(d->arch.shadow_ht, 0,
881 shadow_ht_buckets * sizeof(struct shadow_status));
882 }
884 if ( new_modes & SHM_log_dirty )
885 {
886 ASSERT( !d->arch.shadow_dirty_bitmap );
887 d->arch.shadow_dirty_bitmap_size = (d->max_pages + 63) & ~63;
888 d->arch.shadow_dirty_bitmap =
889 xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size /
890 (8 * sizeof(unsigned long)));
891 if ( d->arch.shadow_dirty_bitmap == NULL )
892 {
893 d->arch.shadow_dirty_bitmap_size = 0;
894 goto nomem;
895 }
896 memset(d->arch.shadow_dirty_bitmap, 0,
897 d->arch.shadow_dirty_bitmap_size/8);
898 }
900 if ( new_modes & SHM_translate )
901 {
902 if ( !(new_modes & SHM_external) )
903 {
904 ASSERT( !pagetable_val(d->arch.phys_table) );
905 if ( !alloc_p2m_table(d) )
906 {
907 printk("alloc_p2m_table failed (out-of-memory?)\n");
908 goto nomem;
909 }
910 }
911 else
912 {
913 // external guests provide their own memory for their P2M maps.
914 //
915 ASSERT( d == page_get_owner(&frame_table[pagetable_val(
916 d->arch.phys_table)>>PAGE_SHIFT]) );
917 }
918 }
920 printk("audit1\n");
921 _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK);
922 printk("audit1 done\n");
924 // Get rid of any shadow pages from any previous shadow mode.
925 //
926 free_shadow_pages(d);
928 printk("audit2\n");
929 _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK);
930 printk("audit2 done\n");
932 // Turn off writable page tables.
933 // They don't mix with shadow mode.
934 // And shadow mode offers a superset of functionality.
935 //
936 vm_assist(d, VMASST_CMD_disable, VMASST_TYPE_writable_pagetables);
938 /*
939 * Tear down its counts by disassembling its page-table-based ref counts.
940 * Also remove CR3's gcount/tcount.
941 * That leaves things like GDTs and LDTs and external refs intact.
942 *
943 * Most pages will be writable tcount=0.
944 * Some will still be L1 tcount=0 or L2 tcount=0.
945 * Maybe some pages will be type none tcount=0.
946 * Pages granted external writable refs (via grant tables?) will
947 * still have a non-zero tcount. That's OK.
948 *
949 * gcounts will generally be 1 for PGC_allocated.
950 * GDTs and LDTs will have additional gcounts.
951 * Any grant-table based refs will still be in the gcount.
952 *
953 * We attempt to grab writable refs to each page (thus setting its type).
954 * Immediately put back those type refs.
955 *
956 * Assert that no pages are left with L1/L2/L3/L4 type.
957 */
958 audit_adjust_pgtables(d, -1, 1);
959 d->arch.shadow_mode = mode;
961 struct list_head *list_ent = d->page_list.next;
962 while ( list_ent != &d->page_list )
963 {
964 struct pfn_info *page = list_entry(list_ent, struct pfn_info, list);
965 if ( !get_page_type(page, PGT_writable_page) )
966 BUG();
967 put_page_type(page);
969 list_ent = page->list.next;
970 }
972 audit_adjust_pgtables(d, 1, 1);
974 printk("audit3\n");
975 _audit_domain(d, AUDIT_ALREADY_LOCKED);
976 printk("audit3 done\n");
978 return 0;
980 nomem:
981 if ( (new_modes & SHM_enable) && (d->arch.shadow_ht != NULL) )
982 {
983 xfree(d->arch.shadow_ht);
984 d->arch.shadow_ht = NULL;
985 }
986 if ( (new_modes & SHM_log_dirty) && (d->arch.shadow_dirty_bitmap != NULL) )
987 {
988 xfree(d->arch.shadow_dirty_bitmap);
989 d->arch.shadow_dirty_bitmap = NULL;
990 }
991 if ( (new_modes & SHM_translate) && !(new_modes & SHM_external) &&
992 pagetable_val(d->arch.phys_table) )
993 {
994 free_p2m_table(d);
995 }
996 return -ENOMEM;
997 }
999 int shadow_mode_enable(struct domain *d, unsigned int mode)
1001 int rc;
1002 shadow_lock(d);
1003 rc = __shadow_mode_enable(d, mode);
1004 shadow_unlock(d);
1005 return rc;
1008 static void
1009 translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn)
1011 int i;
1012 l1_pgentry_t *l1;
1014 l1 = map_domain_mem(l1mfn << PAGE_SHIFT);
1015 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
1017 if ( is_guest_l1_slot(i) &&
1018 (l1e_get_flags(l1[i]) & _PAGE_PRESENT) )
1020 unsigned long mfn = l1e_get_pfn(l1[i]);
1021 unsigned long gpfn = __mfn_to_gpfn(d, mfn);
1022 ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
1023 l1[i] = l1e_create_pfn(gpfn, l1e_get_flags(l1[i]));
1026 unmap_domain_mem(l1);
1029 // This is not general enough to handle arbitrary pagetables
1030 // with shared L1 pages, etc., but it is sufficient for bringing
1031 // up dom0.
1032 //
1033 void
1034 translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn)
1036 int i;
1037 l2_pgentry_t *l2;
1039 ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d));
1041 l2 = map_domain_mem(l2mfn << PAGE_SHIFT);
1042 for (i = 0; i < L2_PAGETABLE_ENTRIES; i++)
1044 if ( is_guest_l2_slot(i) &&
1045 (l2e_get_flags(l2[i]) & _PAGE_PRESENT) )
1047 unsigned long mfn = l2e_get_pfn(l2[i]);
1048 unsigned long gpfn = __mfn_to_gpfn(d, mfn);
1049 ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
1050 l2[i] = l2e_create_pfn(gpfn, l2e_get_flags(l2[i]));
1051 translate_l1pgtable(d, p2m, mfn);
1054 unmap_domain_mem(l2);
1057 static void free_shadow_ht_entries(struct domain *d)
1059 struct shadow_status *x, *n;
1061 SH_VLOG("freed tables count=%d l1=%d l2=%d",
1062 d->arch.shadow_page_count, perfc_value(shadow_l1_pages),
1063 perfc_value(shadow_l2_pages));
1065 n = d->arch.shadow_ht_extras;
1066 while ( (x = n) != NULL )
1068 d->arch.shadow_extras_count--;
1069 n = *((struct shadow_status **)(&x[shadow_ht_extra_size]));
1070 xfree(x);
1073 d->arch.shadow_ht_extras = NULL;
1074 d->arch.shadow_ht_free = NULL;
1076 ASSERT(d->arch.shadow_extras_count == 0);
1077 SH_LOG("freed extras, now %d", d->arch.shadow_extras_count);
1079 if ( d->arch.shadow_dirty_bitmap != NULL )
1081 xfree(d->arch.shadow_dirty_bitmap);
1082 d->arch.shadow_dirty_bitmap = 0;
1083 d->arch.shadow_dirty_bitmap_size = 0;
1086 xfree(d->arch.shadow_ht);
1087 d->arch.shadow_ht = NULL;
1090 static void free_out_of_sync_entries(struct domain *d)
1092 struct out_of_sync_entry *x, *n;
1094 n = d->arch.out_of_sync_extras;
1095 while ( (x = n) != NULL )
1097 d->arch.out_of_sync_extras_count--;
1098 n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size]));
1099 xfree(x);
1102 d->arch.out_of_sync_extras = NULL;
1103 d->arch.out_of_sync_free = NULL;
1104 d->arch.out_of_sync = NULL;
1106 ASSERT(d->arch.out_of_sync_extras_count == 0);
1107 FSH_LOG("freed extra out_of_sync entries, now %d",
1108 d->arch.out_of_sync_extras_count);
1111 void __shadow_mode_disable(struct domain *d)
1113 // This needs rethinking for the full shadow mode stuff.
1114 //
1115 // Among other things, ref counts need to be restored to a sensible
1116 // state for a non-shadow-mode guest...
1117 // This is probably easiest to do by stealing code from audit_domain().
1118 //
1119 BUG();
1121 free_shadow_pages(d);
1123 d->arch.shadow_mode = 0;
1125 free_shadow_ht_entries(d);
1126 free_out_of_sync_entries(d);
1129 static int shadow_mode_table_op(
1130 struct domain *d, dom0_shadow_control_t *sc)
1132 unsigned int op = sc->op;
1133 int i, rc = 0;
1134 struct exec_domain *ed;
1136 ASSERT(spin_is_locked(&d->arch.shadow_lock));
1138 SH_VLOG("shadow mode table op %p %p count %d",
1139 pagetable_val(d->exec_domain[0]->arch.guest_table), /* XXX SMP */
1140 pagetable_val(d->exec_domain[0]->arch.shadow_table), /* XXX SMP */
1141 d->arch.shadow_page_count);
1143 shadow_audit(d, 1);
1145 switch ( op )
1147 case DOM0_SHADOW_CONTROL_OP_FLUSH:
1148 free_shadow_pages(d);
1150 d->arch.shadow_fault_count = 0;
1151 d->arch.shadow_dirty_count = 0;
1152 d->arch.shadow_dirty_net_count = 0;
1153 d->arch.shadow_dirty_block_count = 0;
1155 break;
1157 case DOM0_SHADOW_CONTROL_OP_CLEAN:
1158 free_shadow_pages(d);
1160 sc->stats.fault_count = d->arch.shadow_fault_count;
1161 sc->stats.dirty_count = d->arch.shadow_dirty_count;
1162 sc->stats.dirty_net_count = d->arch.shadow_dirty_net_count;
1163 sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count;
1165 d->arch.shadow_fault_count = 0;
1166 d->arch.shadow_dirty_count = 0;
1167 d->arch.shadow_dirty_net_count = 0;
1168 d->arch.shadow_dirty_block_count = 0;
1170 if ( (d->max_pages > sc->pages) ||
1171 (sc->dirty_bitmap == NULL) ||
1172 (d->arch.shadow_dirty_bitmap == NULL) )
1174 rc = -EINVAL;
1175 break;
1178 sc->pages = d->max_pages;
1180 #define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
1181 for ( i = 0; i < d->max_pages; i += chunk )
1183 int bytes = ((((d->max_pages - i) > chunk) ?
1184 chunk : (d->max_pages - i)) + 7) / 8;
1186 if (copy_to_user(
1187 sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
1188 d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
1189 bytes))
1191 // copy_to_user can fail when copying to guest app memory.
1192 // app should zero buffer after mallocing, and pin it
1193 rc = -EINVAL;
1194 memset(
1195 d->arch.shadow_dirty_bitmap +
1196 (i/(8*sizeof(unsigned long))),
1197 0, (d->max_pages/8) - (i/(8*sizeof(unsigned long))));
1198 break;
1201 memset(
1202 d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
1203 0, bytes);
1206 break;
1208 case DOM0_SHADOW_CONTROL_OP_PEEK:
1209 sc->stats.fault_count = d->arch.shadow_fault_count;
1210 sc->stats.dirty_count = d->arch.shadow_dirty_count;
1211 sc->stats.dirty_net_count = d->arch.shadow_dirty_net_count;
1212 sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count;
1214 if ( (d->max_pages > sc->pages) ||
1215 (sc->dirty_bitmap == NULL) ||
1216 (d->arch.shadow_dirty_bitmap == NULL) )
1218 rc = -EINVAL;
1219 break;
1222 sc->pages = d->max_pages;
1223 if (copy_to_user(
1224 sc->dirty_bitmap, d->arch.shadow_dirty_bitmap, (d->max_pages+7)/8))
1226 rc = -EINVAL;
1227 break;
1230 break;
1232 default:
1233 rc = -EINVAL;
1234 break;
1237 SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count);
1238 shadow_audit(d, 1);
1240 for_each_exec_domain(d,ed)
1241 __update_pagetables(ed);
1243 return rc;
1246 int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
1248 unsigned int op = sc->op;
1249 int rc = 0;
1250 struct exec_domain *ed;
1252 if ( unlikely(d == current->domain) )
1254 DPRINTK("Don't try to do a shadow op on yourself!\n");
1255 return -EINVAL;
1258 domain_pause(d);
1260 shadow_lock(d);
1262 switch ( op )
1264 case DOM0_SHADOW_CONTROL_OP_OFF:
1265 shadow_mode_disable(d);
1266 break;
1268 case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
1269 free_shadow_pages(d);
1270 rc = __shadow_mode_enable(d, SHM_enable);
1271 break;
1273 case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
1274 free_shadow_pages(d);
1275 rc = __shadow_mode_enable(d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty);
1276 break;
1278 case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
1279 free_shadow_pages(d);
1280 rc = __shadow_mode_enable(d, d->arch.shadow_mode|SHM_enable|SHM_translate);
1281 break;
1283 default:
1284 rc = shadow_mode_enabled(d) ? shadow_mode_table_op(d, sc) : -EINVAL;
1285 break;
1288 shadow_unlock(d);
1290 for_each_exec_domain(d,ed)
1291 update_pagetables(ed);
1293 domain_unpause(d);
1295 return rc;
1298 /*
1299 * XXX KAF: Why is this VMX specific?
1300 */
1301 void vmx_shadow_clear_state(struct domain *d)
1303 SH_VVLOG("%s:", __func__);
1304 shadow_lock(d);
1305 free_shadow_pages(d);
1306 shadow_unlock(d);
1307 update_pagetables(d->exec_domain[0]);
1310 unsigned long
1311 gpfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
1313 ASSERT( shadow_mode_translate(d) );
1315 perfc_incrc(gpfn_to_mfn_foreign);
1317 unsigned long va = gpfn << PAGE_SHIFT;
1318 unsigned long phystab = pagetable_val(d->arch.phys_table);
1319 l2_pgentry_t *l2 = map_domain_mem(phystab);
1320 l2_pgentry_t l2e = l2[l2_table_offset(va)];
1321 unmap_domain_mem(l2);
1322 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
1324 printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%p) => 0 l2e=%p\n",
1325 d->id, gpfn, l2e_get_value(l2e));
1326 return INVALID_MFN;
1328 unsigned long l1tab = l2e_get_phys(l2e);
1329 l1_pgentry_t *l1 = map_domain_mem(l1tab);
1330 l1_pgentry_t l1e = l1[l1_table_offset(va)];
1331 unmap_domain_mem(l1);
1333 #if 0
1334 printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%p) => %p phystab=%p l2e=%p l1tab=%p, l1e=%p\n",
1335 d->id, gpfn, l1_pgentry_val(l1e) >> PAGE_SHIFT, phystab, l2e, l1tab, l1e);
1336 #endif
1338 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
1340 printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%p) => 0 l1e=%p\n",
1341 d->id, gpfn, l1e_get_value(l1e));
1342 return INVALID_MFN;
1345 return l1e_get_pfn(l1e);
1348 static unsigned long
1349 shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
1350 unsigned long smfn)
1352 unsigned long hl2mfn;
1353 l1_pgentry_t *hl2;
1354 int limit;
1356 ASSERT(PGT_base_page_table == PGT_l2_page_table);
1358 if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
1360 printk("Couldn't alloc an HL2 shadow for pfn=%p mfn=%p\n", gpfn, gmfn);
1361 BUG(); /* XXX Deal gracefully with failure. */
1364 SH_VVLOG("shadow_hl2_table(gpfn=%p, gmfn=%p, smfn=%p) => %p",
1365 gpfn, gmfn, smfn, hl2mfn);
1366 perfc_incrc(shadow_hl2_table_count);
1368 hl2 = map_domain_mem(hl2mfn << PAGE_SHIFT);
1370 #ifdef __i386__
1371 if ( shadow_mode_external(d) )
1372 limit = L2_PAGETABLE_ENTRIES;
1373 else
1374 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
1375 #else
1376 limit = 0; /* XXX x86/64 XXX */
1377 #endif
1379 memset(hl2, 0, limit * sizeof(l1_pgentry_t));
1381 if ( !shadow_mode_external(d) )
1383 memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
1384 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
1386 // Setup easy access to the GL2, SL2, and HL2 frames.
1387 //
1388 hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
1389 l1e_create_pfn(gmfn, __PAGE_HYPERVISOR);
1390 hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1391 l1e_create_pfn(smfn, __PAGE_HYPERVISOR);
1392 hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
1393 l1e_create_pfn(hl2mfn, __PAGE_HYPERVISOR);
1396 unmap_domain_mem(hl2);
1398 return hl2mfn;
1401 /*
1402 * This could take and use a snapshot, and validate the entire page at
1403 * once, or it could continue to fault in entries one at a time...
1404 * Might be worth investigating...
1405 */
1406 static unsigned long shadow_l2_table(
1407 struct domain *d, unsigned long gpfn, unsigned long gmfn)
1409 unsigned long smfn;
1410 l2_pgentry_t *spl2e;
1412 SH_VVLOG("shadow_l2_table(gpfn=%p, gmfn=%p)", gpfn, gmfn);
1414 perfc_incrc(shadow_l2_table_count);
1416 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
1418 printk("Couldn't alloc an L2 shadow for pfn=%p mfn=%p\n", gpfn, gmfn);
1419 BUG(); /* XXX Deal gracefully with failure. */
1422 spl2e = (l2_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
1424 /* Install hypervisor and 2x linear p.t. mappings. */
1425 if ( (PGT_base_page_table == PGT_l2_page_table) &&
1426 !shadow_mode_external(d) )
1428 /*
1429 * We could proactively fill in PDEs for pages that are already
1430 * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
1431 * (restriction required for coherence of the accessed bit). However,
1432 * we tried it and it didn't help performance. This is simpler.
1433 */
1434 memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
1436 /* Install hypervisor and 2x linear p.t. mappings. */
1437 memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
1438 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
1439 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
1441 spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1442 l2e_create_pfn(smfn, __PAGE_HYPERVISOR);
1444 spl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
1445 l2e_create_phys(__pa(page_get_owner(&frame_table[gmfn])->arch.mm_perdomain_pt),
1446 __PAGE_HYPERVISOR);
1448 if ( shadow_mode_translate(d) ) // NB: not external
1450 unsigned long hl2mfn;
1452 spl2e[l2_table_offset(RO_MPT_VIRT_START)] =
1453 l2e_create_phys(pagetable_val(d->arch.phys_table),
1454 __PAGE_HYPERVISOR);
1456 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
1457 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
1459 // shadow_mode_translate (but not external) sl2 tables hold a
1460 // ref to their hl2.
1461 //
1462 if ( !get_shadow_ref(hl2mfn) )
1463 BUG();
1465 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1466 l2e_create_pfn(hl2mfn, __PAGE_HYPERVISOR);
1468 else
1469 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1470 l2e_create_pfn(gmfn, __PAGE_HYPERVISOR);
1472 else
1474 memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));
1477 unmap_domain_mem(spl2e);
1479 SH_VLOG("shadow_l2_table(%p -> %p)", gmfn, smfn);
1480 return smfn;
1483 void shadow_map_l1_into_current_l2(unsigned long va)
1485 struct exec_domain *ed = current;
1486 struct domain *d = ed->domain;
1487 l1_pgentry_t *gpl1e, *spl1e;
1488 l2_pgentry_t gl2e, sl2e;
1489 unsigned long gl1pfn, gl1mfn, sl1mfn;
1490 int i, init_table = 0;
1492 __guest_get_l2e(ed, va, &gl2e);
1493 ASSERT(l2e_get_flags(gl2e) & _PAGE_PRESENT);
1494 gl1pfn = l2e_get_pfn(gl2e);
1496 if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
1498 /* This L1 is NOT already shadowed so we need to shadow it. */
1499 SH_VVLOG("4a: l1 not shadowed");
1501 gl1mfn = __gpfn_to_mfn(d, gl1pfn);
1502 if ( unlikely(!VALID_MFN(gl1mfn)) )
1504 // Attempt to use an invalid pfn as an L1 page.
1505 // XXX this needs to be more graceful!
1506 BUG();
1509 if ( unlikely(!(sl1mfn =
1510 alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
1512 printk("Couldn't alloc an L1 shadow for pfn=%p mfn=%p\n",
1513 gl1pfn, gl1mfn);
1514 BUG(); /* XXX Need to deal gracefully with failure. */
1517 perfc_incrc(shadow_l1_table_count);
1518 init_table = 1;
1520 else
1522 /* This L1 is shadowed already, but the L2 entry is missing. */
1523 SH_VVLOG("4b: was shadowed, l2 missing (%p)", sl1mfn);
1526 #ifndef NDEBUG
1527 l2_pgentry_t old_sl2e;
1528 __shadow_get_l2e(ed, va, &old_sl2e);
1529 ASSERT( !(l2e_get_flags(old_sl2e) & _PAGE_PRESENT) );
1530 #endif
1532 if ( !get_shadow_ref(sl1mfn) )
1533 BUG();
1534 l2pde_general(d, &gl2e, &sl2e, sl1mfn);
1535 __guest_set_l2e(ed, va, gl2e);
1536 __shadow_set_l2e(ed, va, sl2e);
1538 if ( init_table )
1540 gpl1e = &(linear_pg_table[l1_linear_offset(va) &
1541 ~(L1_PAGETABLE_ENTRIES-1)]);
1543 spl1e = &(shadow_linear_pg_table[l1_linear_offset(va) &
1544 ~(L1_PAGETABLE_ENTRIES-1)]);
1546 l1_pgentry_t sl1e;
1547 int index = l1_table_offset(va);
1548 int min = 1, max = 0;
1550 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1552 l1pte_propagate_from_guest(d, gpl1e[i], &sl1e);
1553 if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
1554 !shadow_get_page_from_l1e(sl1e, d) )
1555 sl1e = l1e_empty();
1556 if ( l1e_get_value(sl1e) == 0 ) /* FIXME: check flags? */
1558 // First copy entries from 0 until first invalid.
1559 // Then copy entries from index until first invalid.
1560 //
1561 if ( i < index ) {
1562 i = index - 1;
1563 continue;
1565 break;
1567 spl1e[i] = sl1e;
1568 if ( unlikely(i < min) )
1569 min = i;
1570 if ( likely(i > max) )
1571 max = i;
1574 frame_table[sl1mfn].tlbflush_timestamp =
1575 SHADOW_ENCODE_MIN_MAX(min, max);
1579 void shadow_invlpg(struct exec_domain *ed, unsigned long va)
1581 struct domain *d = ed->domain;
1582 l1_pgentry_t gpte, spte;
1584 ASSERT(shadow_mode_enabled(d));
1586 shadow_lock(d);
1588 __shadow_sync_va(ed, va);
1590 // XXX mafetter: will need to think about 4MB pages...
1592 // It's not strictly necessary to update the shadow here,
1593 // but it might save a fault later.
1594 //
1595 if (__copy_from_user(&gpte, &linear_pg_table[va >> PAGE_SHIFT],
1596 sizeof(gpte))) {
1597 perfc_incrc(shadow_invlpg_faults);
1598 return;
1600 l1pte_propagate_from_guest(d, gpte, &spte);
1601 shadow_set_l1e(va, spte, 1);
1603 shadow_unlock(d);
1606 struct out_of_sync_entry *
1607 shadow_alloc_oos_entry(struct domain *d)
1609 struct out_of_sync_entry *f, *extra;
1610 unsigned size, i;
1612 if ( unlikely(d->arch.out_of_sync_free == NULL) )
1614 FSH_LOG("Allocate more fullshadow tuple blocks.");
1616 size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
1617 extra = xmalloc_bytes(size);
1619 /* XXX Should be more graceful here. */
1620 if ( extra == NULL )
1621 BUG();
1623 memset(extra, 0, size);
1625 /* Record the allocation block so it can be correctly freed later. */
1626 d->arch.out_of_sync_extras_count++;
1627 *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) =
1628 d->arch.out_of_sync_extras;
1629 d->arch.out_of_sync_extras = &extra[0];
1631 /* Thread a free chain through the newly-allocated nodes. */
1632 for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
1633 extra[i].next = &extra[i+1];
1634 extra[i].next = NULL;
1636 /* Add the new nodes to the free list. */
1637 d->arch.out_of_sync_free = &extra[0];
1640 /* Allocate a new node from the quicklist. */
1641 f = d->arch.out_of_sync_free;
1642 d->arch.out_of_sync_free = f->next;
1644 return f;
1647 static inline unsigned long
1648 shadow_make_snapshot(
1649 struct domain *d, unsigned long gpfn, unsigned long gmfn)
1651 unsigned long smfn, sl1mfn;
1652 void *original, *snapshot;
1653 u32 min_max = 0;
1654 int min, max, length;
1656 if ( test_and_set_bit(_PGC_out_of_sync, &frame_table[gmfn].count_info) )
1658 ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
1659 return SHADOW_SNAPSHOT_ELSEWHERE;
1662 perfc_incrc(shadow_make_snapshot);
1664 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
1666 printk("Couldn't alloc fullshadow snapshot for pfn=%p mfn=%p!\n"
1667 "Dom%d snapshot_count_count=%d\n",
1668 gpfn, gmfn, d->id, d->arch.snapshot_page_count);
1669 BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
1672 if ( !get_shadow_ref(smfn) )
1673 BUG();
1675 if ( shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow )
1676 min_max = pfn_to_page(sl1mfn)->tlbflush_timestamp;
1677 pfn_to_page(smfn)->tlbflush_timestamp = min_max;
1679 min = SHADOW_MIN(min_max);
1680 max = SHADOW_MAX(min_max);
1681 length = max - min + 1;
1682 perfc_incr_histo(snapshot_copies, length, PT_UPDATES);
1684 min *= sizeof(l1_pgentry_t);
1685 length *= sizeof(l1_pgentry_t);
1687 original = map_domain_mem(gmfn << PAGE_SHIFT);
1688 snapshot = map_domain_mem(smfn << PAGE_SHIFT);
1689 memcpy(snapshot + min, original + min, length);
1690 unmap_domain_mem(original);
1691 unmap_domain_mem(snapshot);
1693 return smfn;
1696 static void
1697 shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry)
1699 void *snapshot;
1701 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
1702 return;
1704 // Clear the out_of_sync bit.
1705 //
1706 clear_bit(_PGC_out_of_sync, &frame_table[entry->gmfn].count_info);
1708 // XXX Need to think about how to protect the domain's
1709 // information less expensively.
1710 //
1711 snapshot = map_domain_mem(entry->snapshot_mfn << PAGE_SHIFT);
1712 memset(snapshot, 0, PAGE_SIZE);
1713 unmap_domain_mem(snapshot);
1715 put_shadow_ref(entry->snapshot_mfn);
1718 struct out_of_sync_entry *
1719 shadow_mark_mfn_out_of_sync(struct exec_domain *ed, unsigned long gpfn,
1720 unsigned long mfn)
1722 struct domain *d = ed->domain;
1723 struct pfn_info *page = &frame_table[mfn];
1724 struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
1726 ASSERT(spin_is_locked(&d->arch.shadow_lock));
1727 ASSERT(pfn_valid(mfn));
1728 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page);
1730 FSH_LOG("%s(gpfn=%p, mfn=%p) c=%p t=%p", __func__,
1731 gpfn, mfn, page->count_info, page->u.inuse.type_info);
1733 // XXX this will require some more thought... Cross-domain sharing and
1734 // modification of page tables? Hmm...
1735 //
1736 if ( d != page_get_owner(page) )
1737 BUG();
1739 perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
1741 entry->gpfn = gpfn;
1742 entry->gmfn = mfn;
1743 entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
1744 entry->writable_pl1e = -1;
1746 // increment guest's ref count to represent the entry in the
1747 // full shadow out-of-sync list.
1748 //
1749 get_page(page, d);
1751 // Add to the out-of-sync list
1752 //
1753 entry->next = d->arch.out_of_sync;
1754 d->arch.out_of_sync = entry;
1756 return entry;
1759 void shadow_mark_va_out_of_sync(
1760 struct exec_domain *ed, unsigned long gpfn, unsigned long mfn, unsigned long va)
1762 struct out_of_sync_entry *entry =
1763 shadow_mark_mfn_out_of_sync(ed, gpfn, mfn);
1764 l2_pgentry_t sl2e;
1766 // We need the address of shadow PTE that maps @va.
1767 // It might not exist yet. Make sure it's there.
1768 //
1769 __shadow_get_l2e(ed, va, &sl2e);
1770 if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
1772 // either this L1 isn't shadowed yet, or the shadow isn't linked into
1773 // the current L2.
1774 shadow_map_l1_into_current_l2(va);
1775 __shadow_get_l2e(ed, va, &sl2e);
1777 ASSERT(l2e_get_flags(sl2e) & _PAGE_PRESENT);
1779 // NB: this is stored as a machine address.
1780 entry->writable_pl1e =
1781 l2e_get_phys(sl2e) | (sizeof(l1_pgentry_t) * l1_table_offset(va));
1782 ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
1784 // Increment shadow's page count to represent the reference
1785 // inherent in entry->writable_pl1e
1786 //
1787 if ( !get_shadow_ref(l2e_get_pfn(sl2e)) )
1788 BUG();
1790 FSH_LOG("mark_out_of_sync(va=%p -> writable_pl1e=%p)",
1791 va, entry->writable_pl1e);
1794 /*
1795 * Returns 1 if the snapshot for @gmfn exists and its @index'th entry matches.
1796 * Returns 0 otherwise.
1797 */
1798 static int snapshot_entry_matches(
1799 struct exec_domain *ed, unsigned long gmfn, unsigned index)
1801 unsigned long gpfn = __mfn_to_gpfn(ed->domain, gmfn);
1802 unsigned long smfn = __shadow_status(ed->domain, gpfn, PGT_snapshot);
1803 unsigned long *guest, *snapshot;
1804 int compare;
1806 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
1808 perfc_incrc(snapshot_entry_matches_calls);
1810 if ( !smfn )
1811 return 0;
1813 guest = map_domain_mem(gmfn << PAGE_SHIFT);
1814 snapshot = map_domain_mem(smfn << PAGE_SHIFT);
1816 // This could probably be smarter, but this is sufficient for
1817 // our current needs.
1818 //
1819 compare = (guest[index] == snapshot[index]);
1821 unmap_domain_mem(guest);
1822 unmap_domain_mem(snapshot);
1824 #ifdef PERF_COUNTERS
1825 if ( compare )
1826 perfc_incrc(snapshot_entry_matches_true);
1827 #endif
1829 return compare;
1832 /*
1833 * Returns 1 if va's shadow mapping is out-of-sync.
1834 * Returns 0 otherwise.
1835 */
1836 int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va)
1838 struct domain *d = ed->domain;
1839 unsigned long l2mfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
1840 l2_pgentry_t l2e;
1841 unsigned long l1mfn;
1843 ASSERT(spin_is_locked(&d->arch.shadow_lock));
1845 perfc_incrc(shadow_out_of_sync_calls);
1847 if ( page_out_of_sync(&frame_table[l2mfn]) &&
1848 !snapshot_entry_matches(ed, l2mfn, l2_table_offset(va)) )
1849 return 1;
1851 __guest_get_l2e(ed, va, &l2e);
1852 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
1853 return 0;
1855 l1mfn = __gpfn_to_mfn(d, l2e_get_pfn(l2e));
1857 // If the l1 pfn is invalid, it can't be out of sync...
1858 if ( !VALID_MFN(l1mfn) )
1859 return 0;
1861 if ( page_out_of_sync(&frame_table[l1mfn]) &&
1862 !snapshot_entry_matches(ed, l1mfn, l1_table_offset(va)) )
1863 return 1;
1865 return 0;
1868 #define GPFN_TO_GPTEPAGE(_gpfn) ((_gpfn) / (PAGE_SIZE / sizeof(l1_pgentry_t)))
1869 static inline unsigned long
1870 predict_writable_pte_page(struct domain *d, unsigned long gpfn)
1872 return __shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), PGT_writable_pred);
1875 static inline void
1876 increase_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
1878 unsigned long score = prediction & PGT_score_mask;
1879 int create = (score == 0);
1881 // saturating addition
1882 score = (score + (1u << PGT_score_shift)) & PGT_score_mask;
1883 score = score ? score : PGT_score_mask;
1885 prediction = (prediction & PGT_mfn_mask) | score;
1887 //printk("increase gpfn=%p pred=%p create=%d\n", gpfn, prediction, create);
1888 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
1890 if ( create )
1891 perfc_incr(writable_pte_predictions);
1894 static inline void
1895 decrease_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
1897 unsigned long score = prediction & PGT_score_mask;
1898 ASSERT(score);
1900 // divide score by 2... We don't like bad predictions.
1901 //
1902 score = (score >> 1) & PGT_score_mask;
1904 prediction = (prediction & PGT_mfn_mask) | score;
1906 //printk("decrease gpfn=%p pred=%p score=%p\n", gpfn, prediction, score);
1908 if ( score )
1909 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
1910 else
1912 delete_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, PGT_writable_pred);
1913 perfc_decr(writable_pte_predictions);
1917 static u32 remove_all_write_access_in_ptpage(
1918 struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn,
1919 unsigned long readonly_gpfn, unsigned long readonly_gmfn,
1920 u32 max_refs_to_find, unsigned long prediction)
1922 l1_pgentry_t *pt = map_domain_mem(pt_mfn << PAGE_SHIFT);
1923 l1_pgentry_t match;
1924 unsigned long flags = _PAGE_RW | _PAGE_PRESENT;
1925 int i;
1926 u32 found = 0;
1927 int is_l1_shadow =
1928 ((frame_table[pt_mfn].u.inuse.type_info & PGT_type_mask) ==
1929 PGT_l1_shadow);
1931 match = l1e_create_pfn(readonly_gmfn, flags);
1933 // returns true if all refs have been found and fixed.
1934 //
1935 int fix_entry(int i)
1937 l1_pgentry_t old = pt[i];
1938 l1_pgentry_t new = old;
1940 l1e_remove_flags(&new,_PAGE_RW);
1941 if ( is_l1_shadow && !shadow_get_page_from_l1e(new, d) )
1942 BUG();
1943 found++;
1944 pt[i] = new;
1945 if ( is_l1_shadow )
1946 put_page_from_l1e(old, d);
1948 #if 0
1949 printk("removed write access to pfn=%p mfn=%p in smfn=%p entry %x "
1950 "is_l1_shadow=%d\n",
1951 readonly_gpfn, readonly_gmfn, pt_mfn, i, is_l1_shadow);
1952 #endif
1954 return (found == max_refs_to_find);
1957 i = readonly_gpfn & (L1_PAGETABLE_ENTRIES - 1);
1958 if ( !l1e_has_changed(&pt[i], &match, flags) && fix_entry(i) )
1960 perfc_incrc(remove_write_fast_exit);
1961 increase_writable_pte_prediction(d, readonly_gpfn, prediction);
1962 unmap_domain_mem(pt);
1963 return found;
1966 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
1968 if ( unlikely(!l1e_has_changed(&pt[i], &match, flags)) && fix_entry(i) )
1969 break;
1972 unmap_domain_mem(pt);
1974 return found;
1975 #undef MATCH_ENTRY
1978 int shadow_remove_all_write_access(
1979 struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
1981 int i;
1982 struct shadow_status *a;
1983 u32 found = 0, fixups, write_refs;
1984 unsigned long prediction, predicted_gpfn, predicted_smfn;
1986 ASSERT(spin_is_locked(&d->arch.shadow_lock));
1987 ASSERT(VALID_MFN(readonly_gmfn));
1989 perfc_incrc(remove_write_access);
1991 // If it's not a writable page, then no writable refs can be outstanding.
1992 //
1993 if ( (frame_table[readonly_gmfn].u.inuse.type_info & PGT_type_mask) !=
1994 PGT_writable_page )
1996 perfc_incrc(remove_write_not_writable);
1997 return 1;
2000 // How many outstanding writable PTEs for this page are there?
2001 //
2002 write_refs =
2003 (frame_table[readonly_gmfn].u.inuse.type_info & PGT_count_mask);
2004 if ( write_refs &&
2005 (frame_table[readonly_gmfn].u.inuse.type_info & PGT_pinned) )
2007 write_refs--;
2010 if ( write_refs == 0 )
2012 perfc_incrc(remove_write_no_work);
2013 return 1;
2016 // Before searching all the L1 page tables, check the typical culprit first
2017 //
2018 if ( (prediction = predict_writable_pte_page(d, readonly_gpfn)) )
2020 predicted_gpfn = prediction & PGT_mfn_mask;
2021 if ( (predicted_smfn = __shadow_status(d, predicted_gpfn, PGT_l1_shadow)) &&
2022 (fixups = remove_all_write_access_in_ptpage(d, predicted_gpfn, predicted_smfn, readonly_gpfn, readonly_gmfn, write_refs, prediction)) )
2024 found += fixups;
2025 if ( found == write_refs )
2027 perfc_incrc(remove_write_predicted);
2028 return 1;
2031 else
2033 perfc_incrc(remove_write_bad_prediction);
2034 decrease_writable_pte_prediction(d, readonly_gpfn, prediction);
2038 // Search all the shadow L1 page tables...
2039 //
2040 for (i = 0; i < shadow_ht_buckets; i++)
2042 a = &d->arch.shadow_ht[i];
2043 while ( a && a->gpfn_and_flags )
2045 if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow )
2047 found += remove_all_write_access_in_ptpage(d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn, readonly_gpfn, readonly_gmfn, write_refs - found, a->gpfn_and_flags & PGT_mfn_mask);
2048 if ( found == write_refs )
2049 return 1;
2052 a = a->next;
2056 FSH_LOG("%s: looking for %d refs, found %d refs",
2057 __func__, write_refs, found);
2059 return 0;
2062 static u32 remove_all_access_in_page(
2063 struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn)
2065 l1_pgentry_t *pl1e = map_domain_mem(l1mfn << PAGE_SHIFT);
2066 l1_pgentry_t match;
2067 unsigned long flags = _PAGE_PRESENT;
2068 int i;
2069 u32 count = 0;
2070 int is_l1_shadow =
2071 ((frame_table[l1mfn].u.inuse.type_info & PGT_type_mask) ==
2072 PGT_l1_shadow);
2074 match = l1e_create_pfn(forbidden_gmfn, flags);
2076 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
2078 if ( unlikely(!l1e_has_changed(&pl1e[i], &match, flags)) )
2080 l1_pgentry_t ol1e = pl1e[i];
2081 pl1e[i] = l1e_empty();
2082 count++;
2084 if ( is_l1_shadow )
2085 put_page_from_l1e(ol1e, d);
2086 else /* must be an hl2 page */
2087 put_page(&frame_table[forbidden_gmfn]);
2091 unmap_domain_mem(pl1e);
2093 return count;
2096 u32 shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn)
2098 int i;
2099 struct shadow_status *a;
2100 u32 count = 0;
2102 ASSERT(spin_is_locked(&d->arch.shadow_lock));
2103 perfc_incrc(remove_all_access);
2105 for (i = 0; i < shadow_ht_buckets; i++)
2107 a = &d->arch.shadow_ht[i];
2108 while ( a && a->gpfn_and_flags )
2110 switch (a->gpfn_and_flags & PGT_type_mask)
2112 case PGT_l1_shadow:
2113 case PGT_l2_shadow:
2114 case PGT_l3_shadow:
2115 case PGT_l4_shadow:
2116 case PGT_hl2_shadow:
2117 count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn);
2118 break;
2119 case PGT_snapshot:
2120 case PGT_writable_pred:
2121 // these can't hold refs to the forbidden page
2122 break;
2123 default:
2124 BUG();
2127 a = a->next;
2131 return count;
2134 static int resync_all(struct domain *d, u32 stype)
2136 struct out_of_sync_entry *entry;
2137 unsigned i;
2138 unsigned long smfn;
2139 void *guest, *shadow, *snapshot;
2140 int need_flush = 0, external = shadow_mode_external(d);
2141 int unshadow;
2142 int changed;
2144 ASSERT(spin_is_locked(&d->arch.shadow_lock));
2146 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
2148 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
2149 continue;
2151 if ( !(smfn = __shadow_status(d, entry->gpfn, stype)) )
2152 continue;
2154 FSH_LOG("resyncing t=%p gpfn=%p gmfn=%p smfn=%p snapshot_mfn=%p",
2155 stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
2157 // Compare guest's new contents to its snapshot, validating
2158 // and updating its shadow as appropriate.
2159 //
2160 guest = map_domain_mem(entry->gmfn << PAGE_SHIFT);
2161 snapshot = map_domain_mem(entry->snapshot_mfn << PAGE_SHIFT);
2162 shadow = map_domain_mem(smfn << PAGE_SHIFT);
2163 unshadow = 0;
2165 switch ( stype ) {
2166 case PGT_l1_shadow:
2168 u32 min_max_shadow = pfn_to_page(smfn)->tlbflush_timestamp;
2169 int min_shadow = SHADOW_MIN(min_max_shadow);
2170 int max_shadow = SHADOW_MAX(min_max_shadow);
2172 u32 min_max_snapshot =
2173 pfn_to_page(entry->snapshot_mfn)->tlbflush_timestamp;
2174 int min_snapshot = SHADOW_MIN(min_max_snapshot);
2175 int max_snapshot = SHADOW_MAX(min_max_snapshot);
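// For shadow L1 pages and their snapshots, the tlbflush_timestamp field is
// reused to hold a packed [min,max] range of interesting entries (decoded
// with SHADOW_MIN/SHADOW_MAX), so the loop below only scans that window of
// the L1 rather than every slot.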
2177 l1_pgentry_t *guest1 = guest;
2178 l1_pgentry_t *shadow1 = shadow;
2179 l1_pgentry_t *snapshot1 = snapshot;
2181 changed = 0;
2183 for ( i = min_shadow; i <= max_shadow; i++ )
2185 if ( (i < min_snapshot) || (i > max_snapshot) ||
2186 l1e_has_changed(&guest1[i], &snapshot1[i], PAGE_FLAG_MASK) )
2188 need_flush |= validate_pte_change(d, guest1[i], &shadow1[i]);
2190 // can't update snapshots of linear page tables -- they
2191 // are used multiple times...
2192 //
2193 // snapshot[i] = new_pte;
2195 changed++;
2198 perfc_incrc(resync_l1);
2199 perfc_incr_histo(wpt_updates, changed, PT_UPDATES);
2200 perfc_incr_histo(l1_entries_checked, max_shadow - min_shadow + 1, PT_UPDATES);
2201 break;
2203 case PGT_l2_shadow:
2205 int max = -1;
2207 l2_pgentry_t *guest2 = guest;
2208 l2_pgentry_t *shadow2 = shadow;
2209 l2_pgentry_t *snapshot2 = snapshot;
2211 changed = 0;
2212 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
2214 if ( !is_guest_l2_slot(i) && !external )
2215 continue;
2217 l2_pgentry_t new_pde = guest2[i];
2218 if ( l2e_has_changed(&new_pde, &snapshot2[i], PAGE_FLAG_MASK))
2220 need_flush |= validate_pde_change(d, new_pde, &shadow2[i]);
2222 // can't update snapshots of linear page tables -- they
2223 // are used multiple times...
2224 //
2225 // snapshot[i] = new_pde;
2227 changed++;
2229 if ( l2e_get_value(new_pde) != 0 ) /* FIXME: check flags? */
2230 max = i;
2232 // XXX - This hack works for linux guests.
2233 // Need a better solution long term.
2234 if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) &&
2235 unlikely(l2e_get_value(new_pde) != 0) &&
2236 !unshadow &&
2237 (frame_table[smfn].u.inuse.type_info & PGT_pinned) )
2238 unshadow = 1;
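// If no guest PDE in range was non-zero at all, the L2 is empty and its
// shadow is not worth keeping; unshadow it as well.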
2240 if ( max == -1 )
2241 unshadow = 1;
2242 perfc_incrc(resync_l2);
2243 perfc_incr_histo(shm_l2_updates, changed, PT_UPDATES);
2244 break;
2246 case PGT_hl2_shadow:
2248 l2_pgentry_t *guest2 = guest;
2249 l2_pgentry_t *snapshot2 = snapshot;
2250 l1_pgentry_t *shadow2 = shadow;
2252 changed = 0;
2253 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
2255 if ( !is_guest_l2_slot(i) && !external )
2256 continue;
2258 l2_pgentry_t new_pde = guest2[i];
2259 if ( l2e_has_changed(&new_pde, &snapshot2[i], PAGE_FLAG_MASK) )
2261 need_flush |= validate_hl2e_change(d, new_pde, &shadow2[i]);
2263 // can't update snapshots of linear page tables -- they
2264 // are used multiple times...
2265 //
2266 // snapshot[i] = new_pde;
2268 changed++;
2271 perfc_incrc(resync_hl2);
2272 perfc_incr_histo(shm_hl2_updates, changed, PT_UPDATES);
2273 break;
2275 default:
2276 BUG();
2279 unmap_domain_mem(shadow);
2280 unmap_domain_mem(snapshot);
2281 unmap_domain_mem(guest);
2283 if ( unlikely(unshadow) )
2285 perfc_incrc(unshadow_l2_count);
2286 shadow_unpin(smfn);
2287 if ( unlikely(shadow_mode_external(d)) )
2289 unsigned long hl2mfn;
2291 if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) &&
2292 (frame_table[hl2mfn].u.inuse.type_info & PGT_pinned) )
2293 shadow_unpin(hl2mfn);
2298 return need_flush;
2301 void __shadow_sync_all(struct domain *d)
2303 struct out_of_sync_entry *entry;
2304 int need_flush = 0;
2306 perfc_incrc(shadow_sync_all);
2308 ASSERT(spin_is_locked(&d->arch.shadow_lock));
2310 // First, remove all write permissions to the page tables
2311 //
2312 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
2314 // Skip entries that have low bits set... Those aren't
2315 // real PTEs.
2316 //
2317 if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
2318 continue;
2320 l1_pgentry_t *ppte = map_domain_mem(entry->writable_pl1e);
2321 l1_pgentry_t opte = *ppte;
2322 l1_pgentry_t npte = opte;
2323 l1e_remove_flags(&npte, _PAGE_RW);
2325 if ( (l1e_get_flags(npte) & _PAGE_PRESENT) &&
2326 !shadow_get_page_from_l1e(npte, d) )
2327 BUG();
2328 *ppte = npte;
2329 put_page_from_l1e(opte, d);
2331 unmap_domain_mem(ppte);
2334 // XXX mafetter: SMP
2335 //
2336 // With the current algorithm, we've gotta flush all the TLBs
2337 // before we can safely continue. I don't think we want to
2338 // do it this way, so I think we should consider making
2339 // entirely private copies of the shadow for each vcpu, and/or
2340 // possibly having a mix of private and shared shadow state
2341 // (any path from a PTE that grants write access to an out-of-sync
2342 // page table page needs to be vcpu private).
2343 //
2344 #if 0 // this should be enabled for SMP guests...
2345 flush_tlb_mask(((1 << smp_num_cpus) - 1) & ~(1 << smp_processor_id()));
2346 #endif
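// For example, with smp_num_cpus == 4 and smp_processor_id() == 2, the mask
// above is ((1 << 4) - 1) & ~(1 << 2) == 0xf & ~0x4 == 0xb: flush CPUs 0, 1
// and 3, but not the local CPU.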
2347 need_flush = 1;
2349 // Second, resync all L1 pages, then L2 pages, etc...
2350 //
2351 need_flush |= resync_all(d, PGT_l1_shadow);
2352 if ( shadow_mode_translate(d) )
2353 need_flush |= resync_all(d, PGT_hl2_shadow);
2354 need_flush |= resync_all(d, PGT_l2_shadow);
2356 if ( need_flush && !unlikely(shadow_mode_external(d)) )
2357 local_flush_tlb();
2359 free_out_of_sync_state(d);
2362 int shadow_fault(unsigned long va, struct xen_regs *regs)
2364 l1_pgentry_t gpte, spte, orig_gpte;
2365 struct exec_domain *ed = current;
2366 struct domain *d = ed->domain;
2367 l2_pgentry_t gpde;
2369 spte = l1e_empty();
2371 SH_VVLOG("shadow_fault( va=%p, code=%lu )", va, regs->error_code );
2372 perfc_incrc(shadow_fault_calls);
2374 check_pagetable(ed, "pre-sf");
2376 /*
2377 * Don't let someone else take the guest's table pages out-of-sync.
2378 */
2379 shadow_lock(d);
2381 /*
2382 * STEP 1. Bring any out-of-sync page table state relevant to this va
2383 * back in sync, so that the guest and shadow entries examined below
2384 * reflect the guest's current page tables.
2385 */
2386 __shadow_sync_va(ed, va);
2388 /*
2389 * STEP 2. Check the guest PTE.
2390 */
2391 __guest_get_l2e(ed, va, &gpde);
2392 if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
2394 SH_VVLOG("shadow_fault - EXIT: L1 not present" );
2395 perfc_incrc(shadow_fault_bail_pde_not_present);
2396 goto fail;
2399 // This can't fault because we hold the shadow lock and we've ensured that
2400 // the mapping is in-sync, so the check of the PDE's present bit, above,
2401 // covers this access.
2402 //
2403 orig_gpte = gpte = linear_pg_table[l1_linear_offset(va)];
2404 if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_PRESENT)) )
2406 SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)", l1e_get_value(gpte));
2407 perfc_incrc(shadow_fault_bail_pte_not_present);
2408 goto fail;
2411 /* Write fault? */
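/* (Bit 1 of the hardware page-fault error code is set for write accesses.) */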
2412 if ( regs->error_code & 2 )
2414 if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) )
2416 /* Write fault on a read-only mapping. */
2417 SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%lx)", l1e_get_value(gpte));
2418 perfc_incrc(shadow_fault_bail_ro_mapping);
2419 goto fail;
2422 if ( !l1pte_write_fault(ed, &gpte, &spte, va) )
2424 SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
2425 perfc_incrc(write_fault_bail);
2426 shadow_unlock(d);
2427 return 0;
2430 else
2432 if ( !l1pte_read_fault(d, &gpte, &spte) )
2434 SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
2435 perfc_incrc(read_fault_bail);
2436 shadow_unlock(d);
2437 return 0;
2441 /*
2442 * STEP 3. Write the modified shadow PTE and guest PTE back to the tables.
2443 */
2445 /* XXX Watch out for read-only L2 entries! (not used in Linux). */
2446 if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
2447 &gpte, sizeof(gpte))) )
2449 printk("shadow_fault() failed, crashing domain %d "
2450 "due to a read-only L2 page table (gpde=%p), va=%p\n",
2451 d->id, l2e_get_value(gpde), va);
2452 domain_crash_synchronous();
2455 // if necessary, record the page table page as dirty
2456 if ( unlikely(shadow_mode_log_dirty(d)) &&
2457 l1e_has_changed(&orig_gpte, &gpte, PAGE_FLAG_MASK))
2458 mark_dirty(d, __gpfn_to_mfn(d, l2e_get_pfn(gpde)));
2460 shadow_set_l1e(va, spte, 1);
2462 perfc_incrc(shadow_fault_fixed);
2463 d->arch.shadow_fault_count++;
2465 shadow_unlock(d);
2467 check_pagetable(ed, "post-sf");
2468 return EXCRET_fault_fixed;
2470 fail:
2471 shadow_unlock(d);
2472 return 0;
2475 /*
2476 * What lives where in the 32-bit address space in the various shadow modes,
2477 * and what it uses to get/maintain that mapping.
2479 * SHADOW MODE:      none          enable          translate        external
2481 * 4KB things:
2482 * guest_vtable      lin_l2        mapped per gl2  lin_l2 via hl2   mapped per gl2
2483 * shadow_vtable     n/a           sh_lin_l2       sh_lin_l2        mapped per gl2
2484 * hl2_vtable        n/a           n/a             lin_hl2 via hl2  mapped per gl2
2485 * monitor_vtable    n/a           n/a             n/a              mapped once
2487 * 4MB things:
2488 * guest_linear      lin via gl2   lin via gl2     lin via hl2      lin via hl2
2489 * shadow_linear     n/a           sh_lin via sl2  sh_lin via sl2   sh_lin via sl2
2490 * monitor_linear    n/a           n/a             n/a              ???
2491 * perdomain         perdomain     perdomain       perdomain        perdomain
2492 * R/O M2P           R/O M2P       R/O M2P         n/a              n/a
2493 * R/W M2P           R/W M2P       R/W M2P         R/W M2P          R/W M2P
2494 * P2M               n/a           n/a             R/O M2P          R/O M2P
2496 * NB:
2497 * update_pagetables(), __update_pagetables(), shadow_mode_enable(),
2498 * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
2499 * all play a part in maintaining these mappings.
2500 */
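/*
 * Illustrative sketch only -- the helper name shadow_max_mode() is
 * hypothetical and nothing in this file uses it -- showing how the columns
 * of the table above are selected for a domain, mirroring the mode
 * computation in __update_pagetables() below.
 */
#if 0 /* illustration, not built */
static inline int shadow_max_mode(struct domain *d) /* hypothetical helper */
{
    return ( shadow_mode_external(d)  ? SHM_external
             : shadow_mode_translate(d) ? SHM_translate
             : shadow_mode_enabled(d)   ? SHM_enable
             : 0 );
}
#endif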
2501 void __update_pagetables(struct exec_domain *ed)
2503 struct domain *d = ed->domain;
2504 unsigned long gmfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
2505 unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
2506 unsigned long smfn, hl2mfn, old_smfn;
2508 int max_mode = ( shadow_mode_external(d) ? SHM_external
2509 : shadow_mode_translate(d) ? SHM_translate
2510 : shadow_mode_enabled(d) ? SHM_enable
2511 : 0 );
2513 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
2514 ASSERT( max_mode );
2516 /*
2517 * arch.guest_vtable
2518 */
2519 if ( max_mode & (SHM_enable | SHM_external) )
2521 if ( likely(ed->arch.guest_vtable != NULL) )
2522 unmap_domain_mem(ed->arch.guest_vtable);
2523 ed->arch.guest_vtable = map_domain_mem(gmfn << PAGE_SHIFT);
2526 /*
2527 * arch.shadow_table
2528 */
2529 if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) )
2530 smfn = shadow_l2_table(d, gpfn, gmfn);
2531 if ( !get_shadow_ref(smfn) )
2532 BUG();
2533 old_smfn = pagetable_val(ed->arch.shadow_table) >> PAGE_SHIFT;
2534 ed->arch.shadow_table = mk_pagetable(smfn << PAGE_SHIFT);
2535 if ( old_smfn )
2536 put_shadow_ref(old_smfn);
2538 SH_VVLOG("__update_pagetables(gmfn=%p, smfn=%p)", gmfn, smfn);
2540 /*
2541 * arch.shadow_vtable
2542 */
2543 if ( max_mode == SHM_external )
2545 if ( ed->arch.shadow_vtable )
2546 unmap_domain_mem(ed->arch.shadow_vtable);
2547 ed->arch.shadow_vtable = map_domain_mem(smfn << PAGE_SHIFT);
2550 /*
2551 * arch.hl2_vtable
2552 */
2554 // if max_mode == SHM_translate, then the hl2 is already installed
2555 // correctly in its smfn, and there's nothing to do.
2556 //
2557 if ( max_mode == SHM_external )
2559 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
2560 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
2561 if ( ed->arch.hl2_vtable )
2562 unmap_domain_mem(ed->arch.hl2_vtable);
2563 ed->arch.hl2_vtable = map_domain_mem(hl2mfn << PAGE_SHIFT);
2566 /*
2567 * fixup pointers in monitor table, as necessary
2568 */
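//
// In external mode, the monitor table's LINEAR_PT slot must point at the
// domain's current hl2 and its SH_LINEAR_PT slot at the current shadow L2;
// the code below takes references on the new pages before dropping the
// references held by the old entries.
//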
2569 if ( max_mode == SHM_external )
2571 l2_pgentry_t *mpl2e = ed->arch.monitor_vtable;
2572 l2_pgentry_t old_hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
2573 l2_pgentry_t old_sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
2575 ASSERT( shadow_mode_translate(d) );
2577 if ( !get_shadow_ref(hl2mfn) )
2578 BUG();
2579 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
2580 l2e_create_pfn(hl2mfn, __PAGE_HYPERVISOR);
2581 if ( l2e_get_flags(old_hl2e) & _PAGE_PRESENT )
2582 put_shadow_ref(l2e_get_pfn(old_hl2e));
2584 if ( !get_shadow_ref(smfn) )
2585 BUG();
2586 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
2587 l2e_create_pfn(smfn, __PAGE_HYPERVISOR);
2588 if ( l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
2589 put_shadow_ref(l2e_get_pfn(old_sl2e));
2591 // XXX - maybe this can be optimized somewhat??
2592 local_flush_tlb();
2597 /************************************************************************/
2598 /************************************************************************/
2599 /************************************************************************/
2601 #if SHADOW_DEBUG
2603 // BUG: these are not SMP safe...
2604 static int sh_l2_present;
2605 static int sh_l1_present;
2606 char * sh_check_name;
2607 int shadow_status_noswap;
2609 #define v2m(adr) ({ \
2610 unsigned long _a = (unsigned long)(adr); \
2611 l1_pgentry_t _pte = shadow_linear_pg_table[_a >> PAGE_SHIFT]; \
2612 unsigned long _pa = l1e_get_phys(_pte); \
2613 _pa | (_a & ~PAGE_MASK); \
2614 })
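// v2m() translates a virtual address into the machine address it maps to by
// looking it up in the shadow linear page table; it exists only to make the
// FAIL() diagnostics below more informative.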
2616 #define FAIL(_f, _a...) \
2617 do { \
2618 printk("XXX %s-FAIL (%d,%d,%d)" _f "\n" \
2619 "g=%08lx s=%08lx &g=%08lx &s=%08lx" \
2620 " v2m(&g)=%08lx v2m(&s)=%08lx ea=%08lx\n", \
2621 sh_check_name, level, l2_idx, l1_idx, ## _a , \
2622 gpte, spte, pgpte, pspte, \
2623 v2m(pgpte), v2m(pspte), \
2624 (l2_idx << L2_PAGETABLE_SHIFT) | \
2625 (l1_idx << L1_PAGETABLE_SHIFT)); \
2626 errors++; \
2627 } while ( 0 )
2629 static int check_pte(
2630 struct domain *d, l1_pgentry_t *pgpte, l1_pgentry_t *pspte,
2631 int level, int l2_idx, int l1_idx, int oos_ptes)
2633 l1_pgentry_t gpte = *pgpte;
2634 l1_pgentry_t spte = *pspte;
2635 unsigned long mask, gpfn, smfn, gmfn;
2636 int errors = 0;
2637 int page_table_page;
2639 if ( (l1e_get_value(spte) == 0) ||
2640 (l1e_get_value(spte) == 0xdeadface) ||
2641 (l1e_get_value(spte) == 0x00000E00) )
2642 return errors; /* always safe */
2644 if ( !(l1e_get_flags(spte) & _PAGE_PRESENT) )
2645 FAIL("Non zero not present spte");
2647 if ( level == 2 ) sh_l2_present++;
2648 if ( level == 1 ) sh_l1_present++;
2650 if ( !(l1e_get_flags(gpte) & _PAGE_PRESENT) )
2651 FAIL("Guest not present yet shadow is");
2653 mask = ~(_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW);
2655 if ( l1e_has_changed(&spte, &gpte, mask) )
2656 FAIL("Corrupt?");
2658 if ( (level == 1) &&
2659 (l1e_get_flags(spte) & _PAGE_DIRTY ) &&
2660 !(l1e_get_flags(gpte) & _PAGE_DIRTY) && !oos_ptes )
2661 FAIL("Dirty coherence");
2663 if ( (l1e_get_flags(spte) & _PAGE_ACCESSED ) &&
2664 !(l1e_get_flags(gpte) & _PAGE_ACCESSED) && !oos_ptes )
2665 FAIL("Accessed coherence");
2667 smfn = l1e_get_pfn(spte);
2668 gpfn = l1e_get_pfn(gpte);
2669 gmfn = __gpfn_to_mfn(d, gpfn);
2671 if ( !VALID_MFN(gmfn) )
2672 FAIL("invalid gpfn=%p gpte=%p\n", gpfn,
2673 l1e_get_value(gpte));
2675 page_table_page = mfn_is_page_table(gmfn);
2677 if ( (l1e_get_flags(spte) & _PAGE_RW ) &&
2678 !(l1e_get_flags(gpte) & _PAGE_RW) && !oos_ptes )
2680 printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d "
2681 "oos_ptes=%d\n",
2682 gpfn, gmfn, smfn,
2683 frame_table[gmfn].u.inuse.type_info,
2684 page_table_page, oos_ptes);
2685 FAIL("RW coherence");
2688 if ( (level == 1) &&
2689 (l1e_get_flags(spte) & _PAGE_RW ) &&
2690 !((l1e_get_flags(gpte) & _PAGE_RW) &&
2691 (l1e_get_flags(gpte) & _PAGE_DIRTY)) &&
2692 !oos_ptes )
2694 printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d "
2695 "oos_ptes=%d\n",
2696 gpfn, gmfn, smfn,
2697 frame_table[gmfn].u.inuse.type_info,
2698 page_table_page, oos_ptes);
2699 FAIL("RW2 coherence");
2702 if ( gmfn == smfn )
2704 if ( level > 1 )
2705 FAIL("Linear map ???"); /* XXX this will fail on BSD */
2707 else
2709 if ( level < 2 )
2710 FAIL("Shadow in L1 entry?");
2712 if ( level == 2 )
2714 if ( __shadow_status(d, gpfn, PGT_l1_shadow) != smfn )
2715 FAIL("smfn problem gpfn=%p smfn=%p", gpfn,
2716 __shadow_status(d, gpfn, PGT_l1_shadow));
2718 else
2719 BUG(); // XXX -- not handled yet.
2722 return errors;
2725 static int check_l1_table(
2726 struct domain *d, unsigned long gpfn,
2727 unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
2729 int i;
2730 l1_pgentry_t *gpl1e, *spl1e;
2731 int errors = 0, oos_ptes = 0;
2733 if ( page_out_of_sync(pfn_to_page(gmfn)) )
2735 gmfn = __shadow_status(d, gpfn, PGT_snapshot);
2736 oos_ptes = 1;
2737 ASSERT(gmfn);
2740 gpl1e = map_domain_mem(gmfn << PAGE_SHIFT);
2741 spl1e = map_domain_mem(smfn << PAGE_SHIFT);
2743 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2744 errors += check_pte(d, &gpl1e[i], &spl1e[i], 1, l2_idx, i, oos_ptes);
2746 unmap_domain_mem(spl1e);
2747 unmap_domain_mem(gpl1e);
2749 return errors;
2752 #define FAILPT(_f, _a...) \
2753 do { \
2754 printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \
2755 errors++; \
2756 } while ( 0 )
2758 int check_l2_table(
2759 struct domain *d, unsigned long gmfn, unsigned long smfn, int oos_pdes)
2761 l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_mem(gmfn << PAGE_SHIFT);
2762 l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
2763 l2_pgentry_t match;
2764 int i;
2765 int errors = 0;
2766 int limit;
2768 if ( !oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != d) )
2769 FAILPT("domain doesn't own page");
2770 if ( oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != NULL) )
2771 FAILPT("bogus owner for snapshot page");
2772 if ( page_get_owner(pfn_to_page(smfn)) != NULL )
2773 FAILPT("shadow page mfn=0x%08x is owned by someone, domid=%d",
2774 smfn, page_get_owner(pfn_to_page(smfn))->id);
2776 #if 0
2777 if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
2778 &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
2779 ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
2780 DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) )
2782 for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2783 i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT);
2784 i++ )
2785 printk("+++ (%d) %p %p\n",i,
2786 l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]));
2787 FAILPT("hypervisor entries inconsistent");
2790 if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
2791 l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
2792 FAILPT("hypervisor linear map inconsistent");
2793 #endif
2795 match = l2e_create_pfn(smfn, __PAGE_HYPERVISOR);
2796 if ( !shadow_mode_external(d) &&
2797 l2e_has_changed(&spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT],
2798 &match, PAGE_FLAG_MASK))
2800 FAILPT("hypervisor shadow linear map inconsistent %p %p",
2801 l2e_get_value(spl2e[SH_LINEAR_PT_VIRT_START >>
2802 L2_PAGETABLE_SHIFT]),
2803 l2e_get_value(match));
2806 match = l2e_create_phys(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
2807 if ( !shadow_mode_external(d) &&
2808 l2e_has_changed(&spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT],
2809 &match, PAGE_FLAG_MASK))
2811 FAILPT("hypervisor per-domain map inconsistent saw %p, expected (va=%p) %p",
2812 l2e_get_value(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]),
2813 d->arch.mm_perdomain_pt,
2814 l2e_get_value(match));
2817 #ifdef __i386__
2818 if ( shadow_mode_external(d) )
2819 limit = L2_PAGETABLE_ENTRIES;
2820 else
2821 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2822 #else
2823 limit = 0; /* XXX x86/64 XXX */
2824 #endif
2826 /* Check the whole L2. */
2827 for ( i = 0; i < limit; i++ )
2828 errors += check_pte(d,
2829 (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */
2830 (l1_pgentry_t*)(&spl2e[i]),
2831 2, i, 0, 0);
2833 unmap_domain_mem(spl2e);
2834 unmap_domain_mem(gpl2e);
2836 #if 1
2837 if ( errors )
2838 printk("check_l2_table returning %d errors\n", errors);
2839 #endif
2841 return errors;
2844 int _check_pagetable(struct exec_domain *ed, char *s)
2846 struct domain *d = ed->domain;
2847 pagetable_t pt = ed->arch.guest_table;
2848 unsigned long gptbase = pagetable_val(pt);
2849 unsigned long ptbase_pfn, smfn;
2850 unsigned long i;
2851 l2_pgentry_t *gpl2e, *spl2e;
2852 unsigned long ptbase_mfn = 0;
2853 int errors = 0, limit, oos_pdes = 0;
2855 //_audit_domain(d, AUDIT_QUIET);
2856 shadow_lock(d);
2858 sh_check_name = s;
2859 //SH_VVLOG("%s-PT Audit", s);
2860 sh_l2_present = sh_l1_present = 0;
2861 perfc_incrc(check_pagetable);
2863 ptbase_mfn = gptbase >> PAGE_SHIFT;
2864 ptbase_pfn = __mfn_to_gpfn(d, ptbase_mfn);
2866 if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
2868 printk("%s-PT %p not shadowed\n", s, gptbase);
2869 goto out;
2871 if ( page_out_of_sync(pfn_to_page(ptbase_mfn)) )
2873 ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
2874 oos_pdes = 1;
2875 ASSERT(ptbase_mfn);
2878 errors += check_l2_table(d, ptbase_mfn, smfn, oos_pdes);
2880 gpl2e = (l2_pgentry_t *) map_domain_mem( ptbase_mfn << PAGE_SHIFT );
2881 spl2e = (l2_pgentry_t *) map_domain_mem( smfn << PAGE_SHIFT );
2883 /* Go back and recurse. */
2884 #ifdef __i386__
2885 if ( shadow_mode_external(d) )
2886 limit = L2_PAGETABLE_ENTRIES;
2887 else
2888 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2889 #else
2890 limit = 0; /* XXX x86/64 XXX */
2891 #endif
2893 for ( i = 0; i < limit; i++ )
2895 unsigned long gl1pfn = l2e_get_pfn(gpl2e[i]);
2896 unsigned long gl1mfn = __gpfn_to_mfn(d, gl1pfn);
2897 unsigned long sl1mfn = l2e_get_pfn(spl2e[i]);
2899 if ( l2e_get_value(spl2e[i]) != 0 ) /* FIXME: check flags? */
2901 errors += check_l1_table(d, gl1pfn, gl1mfn, sl1mfn, i);
2905 unmap_domain_mem(spl2e);
2906 unmap_domain_mem(gpl2e);
2908 #if 0
2909 SH_VVLOG("PT verified : l2_present = %d, l1_present = %d",
2910 sh_l2_present, sh_l1_present);
2911 #endif
2913 out:
2914 if ( errors )
2915 BUG();
2917 shadow_unlock(d);
2919 return errors;
2922 int _check_all_pagetables(struct exec_domain *ed, char *s)
2924 struct domain *d = ed->domain;
2925 int i;
2926 struct shadow_status *a;
2927 unsigned long gmfn;
2928 int errors = 0;
2930 shadow_status_noswap = 1;
2932 sh_check_name = s;
2933 SH_VVLOG("%s-PT Audit domid=%d", s, d->id);
2934 sh_l2_present = sh_l1_present = 0;
2935 perfc_incrc(check_all_pagetables);
2937 for (i = 0; i < shadow_ht_buckets; i++)
2939 a = &d->arch.shadow_ht[i];
2940 while ( a && a->gpfn_and_flags )
2942 gmfn = __gpfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
2944 switch ( a->gpfn_and_flags & PGT_type_mask )
2946 case PGT_l1_shadow:
2947 errors += check_l1_table(d, a->gpfn_and_flags & PGT_mfn_mask,
2948 gmfn, a->smfn, 0);
2949 break;
2950 case PGT_l2_shadow:
2951 errors += check_l2_table(d, gmfn, a->smfn,
2952 page_out_of_sync(pfn_to_page(gmfn)));
2953 break;
2954 case PGT_l3_shadow:
2955 case PGT_l4_shadow:
2956 case PGT_hl2_shadow:
2957 BUG(); // XXX - ought to fix this...
2958 break;
2959 case PGT_snapshot:
2960 case PGT_writable_pred:
2961 break;
2962 default:
2963 errors++;
2964 printk("unexpected shadow type %p, gpfn=%p, "
2965 "gmfn=%p smfn=%p\n",
2966 a->gpfn_and_flags & PGT_type_mask,
2967 a->gpfn_and_flags & PGT_mfn_mask,
2968 gmfn, a->smfn);
2969 BUG();
2971 a = a->next;
2975 shadow_status_noswap = 0;
2977 if ( errors )
2978 BUG();
2980 return errors;
2983 #endif // SHADOW_DEBUG
2985 /*
2986 * Local variables:
2987 * mode: C
2988 * c-set-style: "BSD"
2989 * c-basic-offset: 4
2990 * tab-width: 4
2991 * indent-tabs-mode: nil
2992 * End:
2993 */