
view xen/arch/x86/shadow.c @ 4662:9a768d11cc7b

bitkeeper revision 1.1358 (4267e561Ml7gO0DQYGp9EYRUYPBDHA)

Merge burn.cl.cam.ac.uk:/auto/groups/xeno-xenod/BK/xen-unstable.bk
into burn.cl.cam.ac.uk:/local/scratch-1/maf46/xen-unstable.bk

Signed-off-by: michael.fetterman@cl.cam.ac.uk
author maf46@burn.cl.cam.ac.uk
date Thu Apr 21 17:39:45 2005 +0000 (2005-04-21)
parents 1803018b3b05 8e987582b901
children 43d58d3eeaa5
line source
1 /******************************************************************************
2 * arch/x86/shadow.c
3 *
4 * Copyright (c) 2005 Michael A Fetterman
5 * Based on an earlier implementation by Ian Pratt et al
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
23 #include <xen/config.h>
24 #include <xen/types.h>
25 #include <xen/mm.h>
26 #include <asm/shadow.h>
27 #include <asm/domain_page.h>
28 #include <asm/page.h>
29 #include <xen/event.h>
30 #include <xen/sched.h>
31 #include <xen/trace.h>
33 static void shadow_free_snapshot(struct domain *d,
34 struct out_of_sync_entry *entry);
35 static void remove_out_of_sync_entries(struct domain *d, unsigned long smfn);
36 static void free_writable_pte_predictions(struct domain *d);
38 /********
40 There's a per-domain shadow table spin lock which works fine for SMP
41 hosts. We don't have to worry about interrupts as no shadow operations
42 happen in an interrupt context. It's probably not quite ready for SMP
43 guest operation as we have to worry about synchronisation between gpte
44 and spte updates. It's possible that this might only happen in a
45 hypercall context, in which case we'll probably have a per-domain
46 hypercall lock anyhow (at least initially).
48 ********/
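/*
 * Illustrative sketch (not part of the original code): the external
 * entry points below that touch shadow state bracket their work with
 * the per-domain lock described above, e.g. as shadow_mode_enable()
 * and shadow_invlpg() do:
 *
 *     shadow_lock(d);
 *     ... manipulate d->arch.shadow_ht, shadow refs, etc. ...
 *     shadow_unlock(d);
 */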
50 static inline int
51 shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
52 unsigned long new_type)
53 {
54 struct pfn_info *page = pfn_to_page(gmfn);
55 int pinned = 0, okay = 1;
57 if ( page_out_of_sync(page) )
58 {
59 // Don't know how long ago this snapshot was taken.
60 // Can't trust it to be recent enough.
61 //
62 __shadow_sync_mfn(d, gmfn);
63 }
65 if ( unlikely(page_is_page_table(page)) )
66 return 1;
68 FSH_LOG("%s: gpfn=%p gmfn=%p nt=%p", __func__, gpfn, gmfn, new_type);
70 if ( !shadow_remove_all_write_access(d, gpfn, gmfn) )
71 {
72 FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%p gmfn=%p",
73 __func__, gpfn, gmfn);
74 #if 1 || defined(LIVE_DANGEROUSLY)
75 set_bit(_PGC_page_table, &page->count_info);
76 return 1;
77 #endif
78 return 0;
80 }
82 // To convert this page for use as a page table, the writable count
83 // should now be zero. Test this by grabbing the page as a page table,
84 // and then immediately releasing. This will also deal with any
85 // necessary TLB flushing issues for us.
86 //
87 // The cruft here about pinning doesn't really work right. This
88 // needs rethinking/rewriting... Need to gracefully deal with the
89 // TLB flushes required when promoting a writable page, and also deal
90 // with any outstanding (external) writable refs to this page (by
91 // refusing to promote it). The pinning headache complicates this
92 code -- it would all get much simpler if we stopped using
93 // shadow_lock() and move the shadow code to BIGLOCK().
94 //
95 if ( unlikely(!get_page(page, d)) )
96 BUG(); // XXX -- needs more thought for a graceful failure
97 if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
98 {
99 pinned = 1;
100 put_page_and_type(page);
101 }
102 if ( get_page_type(page, PGT_base_page_table) )
103 {
104 set_bit(_PGC_page_table, &page->count_info);
105 put_page_type(page);
106 }
107 else
108 {
109 printk("shadow_promote: get_page_type failed "
110 "dom%d gpfn=%p gmfn=%p t=%x\n",
111 d->id, gpfn, gmfn, new_type);
112 okay = 0;
113 }
115 // Now put the type back to writable...
116 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
117 BUG(); // XXX -- needs more thought for a graceful failure
118 if ( unlikely(pinned) )
119 {
120 if ( unlikely(test_and_set_bit(_PGT_pinned,
121 &page->u.inuse.type_info)) )
122 BUG(); // hmm... someone pinned this again?
123 }
124 else
125 put_page_and_type(page);
127 return okay;
128 }
130 static inline void
131 shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
132 {
133 ASSERT(frame_table[gmfn].count_info & PGC_page_table);
135 if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none )
136 {
137 clear_bit(_PGC_page_table, &frame_table[gmfn].count_info);
139 if ( page_out_of_sync(pfn_to_page(gmfn)) )
140 {
141 remove_out_of_sync_entries(d, gmfn);
142 }
143 }
144 }
146 /*
147 * Things in shadow mode that collect get_page() refs to the domain's
148 * pages are:
149 * - PGC_allocated takes a gen count, just like normal.
150 * - A writable page can be pinned (paravirtualized guests may consider
151 * these pages to be L1s or L2s, and don't know the difference).
152 * Pinning a page takes a gen count (but, for domains in shadow mode,
153 * it *doesn't* take a type count)
154 * - CR3 grabs a ref to whatever it points at, just like normal.
155 * - Shadow mode grabs an initial gen count for itself, as a placeholder
156 * for whatever references will exist.
157 * - Shadow PTEs that point to a page take a gen count, just like regular
158 * PTEs. However, they don't get a type count, as get_page_type() is
159 * hardwired to keep writable pages' counts at 1 for domains in shadow
160 * mode.
161 * - Whenever we shadow a page, the entry in the shadow hash grabs a
162 * general ref to the page.
163 * - Whenever a page goes out of sync, the out of sync entry grabs a
164 * general ref to the page.
165 */
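/*
 * Sketch of the last rule above as it appears later in this file
 * (for orientation only): shadow_mark_mfn_out_of_sync() takes the
 * general ref when an out-of-sync entry is created, and
 * release_out_of_sync_entry() drops it again:
 *
 *     get_page(page, d);   // when adding to d->arch.out_of_sync
 *     ...
 *     put_page(page);      // in release_out_of_sync_entry()
 */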
166 /*
167 * pfn_info fields for pages allocated as shadow pages:
168 *
169 * All 32 bits of count_info are a simple count of refs to this shadow
170 * from a) other shadow pages, b) current CR3s (aka ed->arch.shadow_table),
171 * c) a pin, if it's a pinned shadow root pgtable, and d) outstanding out-of-sync
172 * references.
173 *
174 * u.inuse._domain is left NULL, to prevent accidentally allowing some random
175 * domain to gain permission to map this page.
176 *
177 * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
178 * shadowed.
179 * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
180 * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
181 * currently exists because this is a shadow of a root page, and we
182 * don't want to let those disappear just because no CR3 is currently pointing
183 * at it.
184 *
185 * tlbflush_timestamp holds a min & max index of valid page table entries
186 * within the shadow page.
187 */
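/*
 * Minimal decode sketch for the fields described above (illustrative
 * only; these expressions mirror the ones used by free_shadow_page()
 * and free_shadow_l1_table() below):
 *
 *     u32 type = spage->u.inuse.type_info & PGT_type_mask;          // kind of shadow
 *     unsigned long gmfn = spage->u.inuse.type_info & PGT_mfn_mask; // page being shadowed
 *     int min = SHADOW_MIN(spage->tlbflush_timestamp);              // first valid entry
 *     int max = SHADOW_MAX(spage->tlbflush_timestamp);              // last valid entry
 */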
189 static inline unsigned long
190 alloc_shadow_page(struct domain *d,
191 unsigned long gpfn, unsigned long gmfn,
192 u32 psh_type)
193 {
194 struct pfn_info *page;
195 unsigned long smfn;
196 int pin = 0;
198 // Currently, we only keep pre-zero'ed pages around for use as L1's...
199 // This will change. Soon.
200 //
201 if ( psh_type == PGT_l1_shadow )
202 {
203 if ( !list_empty(&d->arch.free_shadow_frames) )
204 {
205 struct list_head *entry = d->arch.free_shadow_frames.next;
206 page = list_entry(entry, struct pfn_info, list);
207 list_del(entry);
208 perfc_decr(free_l1_pages);
209 }
210 else
211 {
212 page = alloc_domheap_page(NULL);
213 void *l1 = map_domain_mem(page_to_pfn(page) << PAGE_SHIFT);
214 memset(l1, 0, PAGE_SIZE);
215 unmap_domain_mem(l1);
216 }
217 }
218 else
219 page = alloc_domheap_page(NULL);
221 if ( unlikely(page == NULL) )
222 {
223 printk("Couldn't alloc shadow page! dom%d count=%d\n",
224 d->id, d->arch.shadow_page_count);
225 printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
226 perfc_value(shadow_l1_pages),
227 perfc_value(shadow_l2_pages),
228 perfc_value(hl2_table_pages),
229 perfc_value(snapshot_pages));
230 BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
231 }
233 smfn = page_to_pfn(page);
235 ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
236 page->u.inuse.type_info = psh_type | gmfn;
237 page->count_info = 0;
238 page->tlbflush_timestamp = 0;
240 switch ( psh_type )
241 {
242 case PGT_l1_shadow:
243 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
244 goto fail;
245 perfc_incr(shadow_l1_pages);
246 d->arch.shadow_page_count++;
247 break;
249 case PGT_l2_shadow:
250 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
251 goto fail;
252 perfc_incr(shadow_l2_pages);
253 d->arch.shadow_page_count++;
254 if ( PGT_l2_page_table == PGT_root_page_table )
255 pin = 1;
257 break;
259 case PGT_hl2_shadow:
260 // Treat an hl2 as an L1 for purposes of promotion.
261 // For external mode domains, treat them as an L2 for purposes of
262 // pinning.
263 //
264 if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) )
265 goto fail;
266 perfc_incr(hl2_table_pages);
267 d->arch.hl2_page_count++;
268 if ( shadow_mode_external(d) &&
269 (PGT_l2_page_table == PGT_root_page_table) )
270 pin = 1;
272 break;
274 case PGT_snapshot:
275 perfc_incr(snapshot_pages);
276 d->arch.snapshot_page_count++;
277 break;
279 default:
280 printk("Alloc shadow weird page type type=%08x\n", psh_type);
281 BUG();
282 break;
283 }
285 // Don't add a new shadow of something that already has a snapshot.
286 //
287 ASSERT( (psh_type == PGT_snapshot) || !mfn_out_of_sync(gmfn) );
289 set_shadow_status(d, gpfn, gmfn, smfn, psh_type);
291 if ( pin )
292 shadow_pin(smfn);
294 return smfn;
296 fail:
297 FSH_LOG("promotion of pfn=%p mfn=%p failed! external gnttab refs?",
298 gpfn, gmfn);
299 free_domheap_page(page);
300 return 0;
301 }
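/*
 * Typical use of alloc_shadow_page() (a sketch of what the callers
 * below, e.g. shadow_l2_table() and shadow_make_snapshot(), do):
 *
 *     if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
 *         BUG();   // XXX failure is not yet handled gracefully
 *     spl2e = map_domain_mem(smfn << PAGE_SHIFT);
 */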
303 static void inline
304 free_shadow_l1_table(struct domain *d, unsigned long smfn)
305 {
306 l1_pgentry_t *pl1e = map_domain_mem(smfn << PAGE_SHIFT);
307 int i;
308 struct pfn_info *spage = pfn_to_page(smfn);
309 u32 min_max = spage->tlbflush_timestamp;
310 int min = SHADOW_MIN(min_max);
311 int max = SHADOW_MAX(min_max);
313 for ( i = min; i <= max; i++ )
314 {
315 put_page_from_l1e(pl1e[i], d);
316 pl1e[i] = l1e_empty();
317 }
319 unmap_domain_mem(pl1e);
320 }
322 static void inline
323 free_shadow_hl2_table(struct domain *d, unsigned long smfn)
324 {
325 l1_pgentry_t *hl2 = map_domain_mem(smfn << PAGE_SHIFT);
326 int i, limit;
328 SH_VVLOG("%s: smfn=%p freed\n", __func__, smfn);
330 #ifdef __i386__
331 if ( shadow_mode_external(d) )
332 limit = L2_PAGETABLE_ENTRIES;
333 else
334 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
335 #else
336 limit = 0; /* XXX x86/64 XXX */
337 #endif
339 for ( i = 0; i < limit; i++ )
340 {
341 if ( l1e_get_flags(hl2[i]) & _PAGE_PRESENT )
342 put_page(pfn_to_page(l1e_get_pfn(hl2[i])));
343 }
345 unmap_domain_mem(hl2);
346 }
348 static void inline
349 free_shadow_l2_table(struct domain *d, unsigned long smfn)
350 {
351 unsigned long *pl2e = map_domain_mem(smfn << PAGE_SHIFT);
352 int i, external = shadow_mode_external(d);
354 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
355 if ( external || is_guest_l2_slot(i) )
356 if ( pl2e[i] & _PAGE_PRESENT )
357 put_shadow_ref(pl2e[i] >> PAGE_SHIFT);
359 if ( (PGT_base_page_table == PGT_l2_page_table) &&
360 shadow_mode_translate(d) && !external )
361 {
362 // free the ref to the hl2
363 //
364 put_shadow_ref(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]
365 >> PAGE_SHIFT);
366 }
368 unmap_domain_mem(pl2e);
369 }
371 void free_shadow_page(unsigned long smfn)
372 {
373 struct pfn_info *page = &frame_table[smfn];
374 unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;
375 struct domain *d = page_get_owner(pfn_to_page(gmfn));
376 unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
377 unsigned long type = page->u.inuse.type_info & PGT_type_mask;
379 SH_VVLOG("%s: free'ing smfn=%p", __func__, smfn);
381 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
383 delete_shadow_status(d, gpfn, gmfn, type);
385 switch ( type )
386 {
387 case PGT_l1_shadow:
388 perfc_decr(shadow_l1_pages);
389 shadow_demote(d, gpfn, gmfn);
390 free_shadow_l1_table(d, smfn);
391 break;
393 case PGT_l2_shadow:
394 perfc_decr(shadow_l2_pages);
395 shadow_demote(d, gpfn, gmfn);
396 free_shadow_l2_table(d, smfn);
397 break;
399 case PGT_hl2_shadow:
400 perfc_decr(hl2_table_pages);
401 shadow_demote(d, gpfn, gmfn);
402 free_shadow_hl2_table(d, smfn);
403 break;
405 case PGT_snapshot:
406 perfc_decr(snapshot_pages);
407 break;
409 default:
410 printk("Free shadow weird page type mfn=%08x type=%08x\n",
411 page-frame_table, page->u.inuse.type_info);
412 break;
413 }
415 d->arch.shadow_page_count--;
417 // No TLB flushes are needed the next time this page gets allocated.
418 //
419 page->tlbflush_timestamp = 0;
420 page->u.free.cpu_mask = 0;
422 if ( type == PGT_l1_shadow )
423 {
424 list_add(&page->list, &d->arch.free_shadow_frames);
425 perfc_incr(free_l1_pages);
426 }
427 else
428 free_domheap_page(page);
429 }
431 static void inline
432 release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
433 {
434 struct pfn_info *page;
436 page = &frame_table[entry->gmfn];
438 // Decrement ref count of guest & shadow pages
439 //
440 put_page(page);
442 // Only use entries that have low bits clear...
443 //
444 if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
445 {
446 put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT);
447 entry->writable_pl1e = -2;
448 }
449 else
450 ASSERT( entry->writable_pl1e == -1 );
452 // Free the snapshot
453 //
454 shadow_free_snapshot(d, entry);
455 }
457 static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn)
458 {
459 struct out_of_sync_entry *entry = d->arch.out_of_sync;
460 struct out_of_sync_entry **prev = &d->arch.out_of_sync;
461 struct out_of_sync_entry *found = NULL;
463 // NB: Be careful not to call something that manipulates this list
464 // while walking it. Collect the results into a separate list
465 // first, then walk that list.
466 //
467 while ( entry )
468 {
469 if ( entry->gmfn == gmfn )
470 {
471 // remove from out of sync list
472 *prev = entry->next;
474 // add to found list
475 entry->next = found;
476 found = entry;
478 entry = *prev;
479 continue;
480 }
481 prev = &entry->next;
482 entry = entry->next;
483 }
485 prev = NULL;
486 entry = found;
487 while ( entry )
488 {
489 release_out_of_sync_entry(d, entry);
491 prev = &entry->next;
492 entry = entry->next;
493 }
495 // Add found list to free list
496 if ( prev )
497 {
498 *prev = d->arch.out_of_sync_free;
499 d->arch.out_of_sync_free = found;
500 }
501 }
503 static void free_out_of_sync_state(struct domain *d)
504 {
505 struct out_of_sync_entry *entry;
507 // NB: Be careful not to call something that manipulates this list
508 // while walking it. Remove one item at a time, and always
509 // restart from start of list.
510 //
511 while ( (entry = d->arch.out_of_sync) )
512 {
513 d->arch.out_of_sync = entry->next;
514 release_out_of_sync_entry(d, entry);
516 entry->next = d->arch.out_of_sync_free;
517 d->arch.out_of_sync_free = entry;
518 }
519 }
521 static void free_shadow_pages(struct domain *d)
522 {
523 int i;
524 struct shadow_status *x;
525 struct exec_domain *ed;
527 /*
528 * WARNING! The shadow page table must not currently be in use!
529 * i.e., you are expected to have paused the domain and synchronized CR3.
530 */
532 if( !d->arch.shadow_ht ) return;
534 shadow_audit(d, 1);
536 // first, remove any outstanding refs from out_of_sync entries...
537 //
538 free_out_of_sync_state(d);
540 // second, remove any outstanding refs from ed->arch.shadow_table...
541 //
542 for_each_exec_domain(d, ed)
543 {
544 if ( pagetable_val(ed->arch.shadow_table) )
545 {
546 put_shadow_ref(pagetable_val(ed->arch.shadow_table) >> PAGE_SHIFT);
547 ed->arch.shadow_table = mk_pagetable(0);
548 }
549 }
551 // For external shadows, remove the monitor table's refs
552 //
553 if ( shadow_mode_external(d) )
554 {
555 for_each_exec_domain(d, ed)
556 {
557 l2_pgentry_t *mpl2e = ed->arch.monitor_vtable;
559 if ( mpl2e )
560 {
561 l2_pgentry_t hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
562 l2_pgentry_t smfn = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
564 if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
565 {
566 put_shadow_ref(l2e_get_pfn(hl2e));
567 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
568 }
569 if ( l2e_get_flags(smfn) & _PAGE_PRESENT )
570 {
571 put_shadow_ref(l2e_get_pfn(smfn));
572 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
573 }
574 }
575 }
576 }
578 // Now, the only refs to shadow pages that are left are from the shadow
579 // pages themselves. We just unpin the pinned pages, and the rest
580 // should automatically disappear.
581 //
582 // NB: Beware: each explicit or implicit call to free_shadow_page
583 // can/will result in the hash bucket getting rewritten out from
584 // under us... First, collect the list of pinned pages, then
585 // free them.
586 //
587 #define PINNED(_x) (frame_table[_x].u.inuse.type_info & PGT_pinned)
588 for ( i = 0; i < shadow_ht_buckets; i++ )
589 {
590 u32 count;
591 unsigned long *mfn_list;
593 /* Skip empty buckets. */
594 if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
595 continue;
597 count = 0;
598 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
599 if ( PINNED(x->smfn) )
600 count++;
601 if ( !count )
602 continue;
604 mfn_list = xmalloc_array(unsigned long, count);
605 count = 0;
606 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
607 if ( PINNED(x->smfn) )
608 mfn_list[count++] = x->smfn;
610 while ( count )
611 {
612 shadow_unpin(mfn_list[--count]);
613 }
614 xfree(mfn_list);
615 }
616 #undef PINNED
618 shadow_audit(d, 0);
620 SH_LOG("Free shadow table.");
621 }
623 void shadow_mode_init(void)
624 {
625 }
627 int _shadow_mode_enabled(struct domain *d)
628 {
629 return shadow_mode_enabled(d);
630 }
632 static void alloc_monitor_pagetable(struct exec_domain *ed)
633 {
634 unsigned long mmfn;
635 l2_pgentry_t *mpl2e;
636 struct pfn_info *mmfn_info;
637 struct domain *d = ed->domain;
639 ASSERT(pagetable_val(ed->arch.monitor_table) == 0);
641 mmfn_info = alloc_domheap_page(NULL);
642 ASSERT(mmfn_info != NULL);
644 mmfn = (unsigned long) (mmfn_info - frame_table);
645 mpl2e = (l2_pgentry_t *) map_domain_mem(mmfn << PAGE_SHIFT);
646 memset(mpl2e, 0, PAGE_SIZE);
648 #ifdef __i386__ /* XXX screws x86/64 build */
649 memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
650 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
651 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
652 #endif
654 mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
655 l2e_create_phys(__pa(d->arch.mm_perdomain_pt),
656 __PAGE_HYPERVISOR);
658 // map the phys_to_machine map into the Read-Only MPT space for this domain
659 mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
660 l2e_create_phys(pagetable_val(d->arch.phys_table),
661 __PAGE_HYPERVISOR);
663 // Don't (yet) have mappings for these...
664 // Don't want to accidentally see the idle_pg_table's linear mapping.
665 //
666 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
667 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
669 ed->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT);
670 ed->arch.monitor_vtable = mpl2e;
671 }
673 /*
674 * Free the pages for monitor_table and hl2_table
675 */
676 void free_monitor_pagetable(struct exec_domain *ed)
677 {
678 l2_pgentry_t *mpl2e, hl2e, sl2e;
679 unsigned long mfn;
681 ASSERT( pagetable_val(ed->arch.monitor_table) );
683 mpl2e = ed->arch.monitor_vtable;
685 /*
686 * First get the mfn for hl2_table by looking at monitor_table
687 */
688 hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
689 if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
690 {
691 mfn = l2e_get_pfn(hl2e);
692 ASSERT(mfn);
693 put_shadow_ref(mfn);
694 }
696 sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
697 if ( l2e_get_flags(sl2e) & _PAGE_PRESENT )
698 {
699 mfn = l2e_get_pfn(sl2e);
700 ASSERT(mfn);
701 put_shadow_ref(mfn);
702 }
704 unmap_domain_mem(mpl2e);
706 /*
707 * Then free monitor_table.
708 */
709 mfn = (pagetable_val(ed->arch.monitor_table)) >> PAGE_SHIFT;
710 free_domheap_page(&frame_table[mfn]);
712 ed->arch.monitor_table = mk_pagetable(0);
713 ed->arch.monitor_vtable = 0;
714 }
716 int
717 set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn)
718 {
719 unsigned long phystab = pagetable_val(d->arch.phys_table);
720 l2_pgentry_t *l2, l2e;
721 l1_pgentry_t *l1;
722 struct pfn_info *l1page;
723 unsigned long va = pfn << PAGE_SHIFT;
725 ASSERT( phystab );
727 l2 = map_domain_mem(phystab);
728 l2e = l2[l2_table_offset(va)];
729 if ( !l2e_get_value(l2e) ) /* FIXME: check present bit? */
730 {
731 l1page = alloc_domheap_page(NULL);
732 if ( !l1page )
733 return 0;
735 l1 = map_domain_mem(page_to_pfn(l1page) << PAGE_SHIFT);
736 memset(l1, 0, PAGE_SIZE);
737 unmap_domain_mem(l1);
739 l2e = l2e_create_pfn(page_to_pfn(l1page), __PAGE_HYPERVISOR);
740 l2[l2_table_offset(va)] = l2e;
741 }
742 unmap_domain_mem(l2);
744 l1 = map_domain_mem(l2e_get_phys(l2e));
745 l1[l1_table_offset(va)] = l1e_create_pfn(mfn, __PAGE_HYPERVISOR);
746 unmap_domain_mem(l1);
748 return 1;
749 }
751 static int
752 alloc_p2m_table(struct domain *d)
753 {
754 struct list_head *list_ent;
755 struct pfn_info *page, *l2page;
756 l2_pgentry_t *l2;
757 unsigned long mfn, pfn;
759 l2page = alloc_domheap_page(NULL);
760 if ( !l2page )
761 return 0;
762 d->arch.phys_table = mk_pagetable(page_to_pfn(l2page) << PAGE_SHIFT);
763 l2 = map_domain_mem(page_to_pfn(l2page) << PAGE_SHIFT);
764 memset(l2, 0, PAGE_SIZE);
765 unmap_domain_mem(l2);
767 list_ent = d->page_list.next;
768 while ( list_ent != &d->page_list )
769 {
770 page = list_entry(list_ent, struct pfn_info, list);
771 mfn = page_to_pfn(page);
772 pfn = machine_to_phys_mapping[mfn];
773 ASSERT(pfn != INVALID_M2P_ENTRY);
774 ASSERT(pfn < (1u<<20));
776 set_p2m_entry(d, pfn, mfn);
778 list_ent = page->list.next;
779 }
781 list_ent = d->xenpage_list.next;
782 while ( list_ent != &d->xenpage_list )
783 {
784 page = list_entry(list_ent, struct pfn_info, list);
785 mfn = page_to_pfn(page);
786 pfn = machine_to_phys_mapping[mfn];
787 if ( (pfn != INVALID_M2P_ENTRY) &&
788 (pfn < (1u<<20)) )
789 {
790 set_p2m_entry(d, pfn, mfn);
791 }
793 list_ent = page->list.next;
794 }
796 return 1;
797 }
799 static void
800 free_p2m_table(struct domain *d)
801 {
802 // uh, this needs some work... :)
803 BUG();
804 }
806 int __shadow_mode_enable(struct domain *d, unsigned int mode)
807 {
808 struct exec_domain *ed;
809 int new_modes = (mode & ~d->arch.shadow_mode);
811 // Gotta be adding something to call this function.
812 ASSERT(new_modes);
814 // can't take anything away by calling this function.
815 ASSERT(!(d->arch.shadow_mode & ~mode));
817 for_each_exec_domain(d, ed)
818 {
819 invalidate_shadow_ldt(ed);
821 // We need to set these up for __update_pagetables().
822 // See the comment there.
824 /*
825 * arch.guest_vtable
826 */
827 if ( ed->arch.guest_vtable &&
828 (ed->arch.guest_vtable != __linear_l2_table) )
829 {
830 unmap_domain_mem(ed->arch.guest_vtable);
831 }
832 if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
833 ed->arch.guest_vtable = __linear_l2_table;
834 else
835 ed->arch.guest_vtable = NULL;
837 /*
838 * arch.shadow_vtable
839 */
840 if ( ed->arch.shadow_vtable &&
841 (ed->arch.shadow_vtable != __shadow_linear_l2_table) )
842 {
843 unmap_domain_mem(ed->arch.shadow_vtable);
844 }
845 if ( !(mode & SHM_external) )
846 ed->arch.shadow_vtable = __shadow_linear_l2_table;
847 else
848 ed->arch.shadow_vtable = NULL;
850 /*
851 * arch.hl2_vtable
852 */
853 if ( ed->arch.hl2_vtable &&
854 (ed->arch.hl2_vtable != __linear_hl2_table) )
855 {
856 unmap_domain_mem(ed->arch.hl2_vtable);
857 }
858 if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
859 ed->arch.hl2_vtable = __linear_hl2_table;
860 else
861 ed->arch.hl2_vtable = NULL;
863 /*
864 * arch.monitor_table & arch.monitor_vtable
865 */
866 if ( ed->arch.monitor_vtable )
867 {
868 free_monitor_pagetable(ed);
869 }
870 if ( mode & SHM_external )
871 {
872 alloc_monitor_pagetable(ed);
873 }
874 }
876 if ( new_modes & SHM_enable )
877 {
878 ASSERT( !d->arch.shadow_ht );
879 d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets);
880 if ( d->arch.shadow_ht == NULL )
881 goto nomem;
883 memset(d->arch.shadow_ht, 0,
884 shadow_ht_buckets * sizeof(struct shadow_status));
885 }
887 if ( new_modes & SHM_log_dirty )
888 {
889 ASSERT( !d->arch.shadow_dirty_bitmap );
890 d->arch.shadow_dirty_bitmap_size = (d->max_pages + 63) & ~63;
891 d->arch.shadow_dirty_bitmap =
892 xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size /
893 (8 * sizeof(unsigned long)));
894 if ( d->arch.shadow_dirty_bitmap == NULL )
895 {
896 d->arch.shadow_dirty_bitmap_size = 0;
897 goto nomem;
898 }
899 memset(d->arch.shadow_dirty_bitmap, 0,
900 d->arch.shadow_dirty_bitmap_size/8);
901 }
903 if ( new_modes & SHM_translate )
904 {
905 if ( !(new_modes & SHM_external) )
906 {
907 ASSERT( !pagetable_val(d->arch.phys_table) );
908 if ( !alloc_p2m_table(d) )
909 {
910 printk("alloc_p2m_table failed (out-of-memory?)\n");
911 goto nomem;
912 }
913 }
914 else
915 {
916 // external guests provide their own memory for their P2M maps.
917 //
918 ASSERT( d == page_get_owner(&frame_table[pagetable_val(
919 d->arch.phys_table)>>PAGE_SHIFT]) );
920 }
921 }
923 printk("audit1\n");
924 _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK);
925 printk("audit1 done\n");
927 // Get rid of any shadow pages from any previous shadow mode.
928 //
929 free_shadow_pages(d);
931 printk("audit2\n");
932 _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK);
933 printk("audit2 done\n");
935 // Turn off writable page tables.
936 // It doesn't mix with shadow mode.
937 // And shadow mode offers a superset of functionality.
938 //
939 vm_assist(d, VMASST_CMD_disable, VMASST_TYPE_writable_pagetables);
941 /*
942 * Tear down its counts by disassembling its page-table-based ref counts.
943 * Also remove CR3's gcount/tcount.
944 * That leaves things like GDTs and LDTs and external refs intact.
945 *
946 * Most pages will be writable tcount=0.
947 * Some will still be L1 tcount=0 or L2 tcount=0.
948 * Maybe some pages will be type none tcount=0.
949 * Pages granted external writable refs (via grant tables?) will
950 * still have a non-zero tcount. That's OK.
951 *
952 * gcounts will generally be 1 for PGC_allocated.
953 * GDTs and LDTs will have additional gcounts.
954 * Any grant-table based refs will still be in the gcount.
955 *
956 * We attempt to grab writable refs to each page (thus setting its type).
957 * Immediately put back those type refs.
958 *
959 * Assert that no pages are left with L1/L2/L3/L4 type.
960 */
961 audit_adjust_pgtables(d, -1, 1);
962 d->arch.shadow_mode = mode;
964 struct list_head *list_ent = d->page_list.next;
965 while ( list_ent != &d->page_list )
966 {
967 struct pfn_info *page = list_entry(list_ent, struct pfn_info, list);
968 if ( !get_page_type(page, PGT_writable_page) )
969 BUG();
970 put_page_type(page);
972 list_ent = page->list.next;
973 }
975 audit_adjust_pgtables(d, 1, 1);
977 printk("audit3\n");
978 _audit_domain(d, AUDIT_ALREADY_LOCKED);
979 printk("audit3 done\n");
981 return 0;
983 nomem:
984 if ( (new_modes & SHM_enable) && (d->arch.shadow_ht != NULL) )
985 {
986 xfree(d->arch.shadow_ht);
987 d->arch.shadow_ht = NULL;
988 }
989 if ( (new_modes & SHM_log_dirty) && (d->arch.shadow_dirty_bitmap != NULL) )
990 {
991 xfree(d->arch.shadow_dirty_bitmap);
992 d->arch.shadow_dirty_bitmap = NULL;
993 }
994 if ( (new_modes & SHM_translate) && !(new_modes & SHM_external) &&
995 pagetable_val(d->arch.phys_table) )
996 {
997 free_p2m_table(d);
998 }
999 return -ENOMEM;
1002 int shadow_mode_enable(struct domain *d, unsigned int mode)
1004 int rc;
1005 shadow_lock(d);
1006 rc = __shadow_mode_enable(d, mode);
1007 shadow_unlock(d);
1008 return rc;
1011 static void
1012 translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn)
1014 int i;
1015 l1_pgentry_t *l1;
1017 l1 = map_domain_mem(l1mfn << PAGE_SHIFT);
1018 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
1020 if ( is_guest_l1_slot(i) &&
1021 (l1e_get_flags(l1[i]) & _PAGE_PRESENT) )
1023 unsigned long mfn = l1e_get_pfn(l1[i]);
1024 unsigned long gpfn = __mfn_to_gpfn(d, mfn);
1025 ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
1026 l1[i] = l1e_create_pfn(gpfn, l1e_get_flags(l1[i]));
1029 unmap_domain_mem(l1);
1032 // This is not general enough to handle arbitrary pagetables
1033 // with shared L1 pages, etc., but it is sufficient for bringing
1034 // up dom0.
1035 //
1036 void
1037 translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn)
1039 int i;
1040 l2_pgentry_t *l2;
1042 ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d));
1044 l2 = map_domain_mem(l2mfn << PAGE_SHIFT);
1045 for (i = 0; i < L2_PAGETABLE_ENTRIES; i++)
1047 if ( is_guest_l2_slot(i) &&
1048 (l2e_get_flags(l2[i]) & _PAGE_PRESENT) )
1050 unsigned long mfn = l2e_get_pfn(l2[i]);
1051 unsigned long gpfn = __mfn_to_gpfn(d, mfn);
1052 ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
1053 l2[i] = l2e_create_pfn(gpfn, l2e_get_flags(l2[i]));
1054 translate_l1pgtable(d, p2m, mfn);
1057 unmap_domain_mem(l2);
1060 static void free_shadow_ht_entries(struct domain *d)
1062 struct shadow_status *x, *n;
1064 SH_VLOG("freed tables count=%d l1=%d l2=%d",
1065 d->arch.shadow_page_count, perfc_value(shadow_l1_pages),
1066 perfc_value(shadow_l2_pages));
1068 n = d->arch.shadow_ht_extras;
1069 while ( (x = n) != NULL )
1071 d->arch.shadow_extras_count--;
1072 n = *((struct shadow_status **)(&x[shadow_ht_extra_size]));
1073 xfree(x);
1076 d->arch.shadow_ht_extras = NULL;
1077 d->arch.shadow_ht_free = NULL;
1079 ASSERT(d->arch.shadow_extras_count == 0);
1080 SH_LOG("freed extras, now %d", d->arch.shadow_extras_count);
1082 if ( d->arch.shadow_dirty_bitmap != NULL )
1084 xfree(d->arch.shadow_dirty_bitmap);
1085 d->arch.shadow_dirty_bitmap = 0;
1086 d->arch.shadow_dirty_bitmap_size = 0;
1089 xfree(d->arch.shadow_ht);
1090 d->arch.shadow_ht = NULL;
1093 static void free_out_of_sync_entries(struct domain *d)
1095 struct out_of_sync_entry *x, *n;
1097 n = d->arch.out_of_sync_extras;
1098 while ( (x = n) != NULL )
1100 d->arch.out_of_sync_extras_count--;
1101 n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size]));
1102 xfree(x);
1105 d->arch.out_of_sync_extras = NULL;
1106 d->arch.out_of_sync_free = NULL;
1107 d->arch.out_of_sync = NULL;
1109 ASSERT(d->arch.out_of_sync_extras_count == 0);
1110 FSH_LOG("freed extra out_of_sync entries, now %d",
1111 d->arch.out_of_sync_extras_count);
1114 void shadow_mode_destroy(struct domain *d)
1116 shadow_lock(d);
1118 free_shadow_pages(d);
1119 free_writable_pte_predictions(d);
1121 #ifndef NDEBUG
1122 int i;
1123 for ( i = 0; i < shadow_ht_buckets; i++ )
1125 if ( d->arch.shadow_ht[i].gpfn_and_flags != 0 )
1127 printk("%s: d->arch.shadow_ht[%x].gpfn_and_flags=%p\n",
1128 i, d->arch.shadow_ht[i].gpfn_and_flags);
1129 BUG();
1132 #endif
1134 d->arch.shadow_mode = 0;
1136 free_shadow_ht_entries(d);
1137 free_out_of_sync_entries(d);
1139 shadow_unlock(d);
1142 void __shadow_mode_disable(struct domain *d)
1144 // This needs rethinking for the full shadow mode stuff.
1145 //
1146 // Among other things, ref counts need to be restored to a sensible
1147 // state for a non-shadow-mode guest...
1148 // This is probably easiest to do by stealing code from audit_domain().
1149 //
1150 BUG();
1152 free_shadow_pages(d);
1154 d->arch.shadow_mode = 0;
1156 free_shadow_ht_entries(d);
1157 free_out_of_sync_entries(d);
1160 static int shadow_mode_table_op(
1161 struct domain *d, dom0_shadow_control_t *sc)
1163 unsigned int op = sc->op;
1164 int i, rc = 0;
1165 struct exec_domain *ed;
1167 ASSERT(spin_is_locked(&d->arch.shadow_lock));
1169 SH_VLOG("shadow mode table op %p %p count %d",
1170 pagetable_val(d->exec_domain[0]->arch.guest_table), /* XXX SMP */
1171 pagetable_val(d->exec_domain[0]->arch.shadow_table), /* XXX SMP */
1172 d->arch.shadow_page_count);
1174 shadow_audit(d, 1);
1176 switch ( op )
1178 case DOM0_SHADOW_CONTROL_OP_FLUSH:
1179 free_shadow_pages(d);
1181 d->arch.shadow_fault_count = 0;
1182 d->arch.shadow_dirty_count = 0;
1183 d->arch.shadow_dirty_net_count = 0;
1184 d->arch.shadow_dirty_block_count = 0;
1186 break;
1188 case DOM0_SHADOW_CONTROL_OP_CLEAN:
1189 free_shadow_pages(d);
1191 sc->stats.fault_count = d->arch.shadow_fault_count;
1192 sc->stats.dirty_count = d->arch.shadow_dirty_count;
1193 sc->stats.dirty_net_count = d->arch.shadow_dirty_net_count;
1194 sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count;
1196 d->arch.shadow_fault_count = 0;
1197 d->arch.shadow_dirty_count = 0;
1198 d->arch.shadow_dirty_net_count = 0;
1199 d->arch.shadow_dirty_block_count = 0;
1201 if ( (d->max_pages > sc->pages) ||
1202 (sc->dirty_bitmap == NULL) ||
1203 (d->arch.shadow_dirty_bitmap == NULL) )
1205 rc = -EINVAL;
1206 break;
1209 sc->pages = d->max_pages;
1211 #define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
1212 for ( i = 0; i < d->max_pages; i += chunk )
1214 int bytes = ((((d->max_pages - i) > chunk) ?
1215 chunk : (d->max_pages - i)) + 7) / 8;
1217 if (copy_to_user(
1218 sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
1219 d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
1220 bytes))
1222 // copy_to_user can fail when copying to guest app memory.
1223 // app should zero buffer after mallocing, and pin it
1224 rc = -EINVAL;
1225 memset(
1226 d->arch.shadow_dirty_bitmap +
1227 (i/(8*sizeof(unsigned long))),
1228 0, (d->max_pages/8) - (i/(8*sizeof(unsigned long))));
1229 break;
1232 memset(
1233 d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
1234 0, bytes);
1237 break;
1239 case DOM0_SHADOW_CONTROL_OP_PEEK:
1240 sc->stats.fault_count = d->arch.shadow_fault_count;
1241 sc->stats.dirty_count = d->arch.shadow_dirty_count;
1242 sc->stats.dirty_net_count = d->arch.shadow_dirty_net_count;
1243 sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count;
1245 if ( (d->max_pages > sc->pages) ||
1246 (sc->dirty_bitmap == NULL) ||
1247 (d->arch.shadow_dirty_bitmap == NULL) )
1249 rc = -EINVAL;
1250 break;
1253 sc->pages = d->max_pages;
1254 if (copy_to_user(
1255 sc->dirty_bitmap, d->arch.shadow_dirty_bitmap, (d->max_pages+7)/8))
1257 rc = -EINVAL;
1258 break;
1261 break;
1263 default:
1264 rc = -EINVAL;
1265 break;
1268 SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count);
1269 shadow_audit(d, 1);
1271 for_each_exec_domain(d,ed)
1272 __update_pagetables(ed);
1274 return rc;
1277 int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
1279 unsigned int op = sc->op;
1280 int rc = 0;
1281 struct exec_domain *ed;
1283 if ( unlikely(d == current->domain) )
1285 DPRINTK("Don't try to do a shadow op on yourself!\n");
1286 return -EINVAL;
1289 domain_pause(d);
1291 shadow_lock(d);
1293 switch ( op )
1295 case DOM0_SHADOW_CONTROL_OP_OFF:
1296 shadow_mode_disable(d);
1297 break;
1299 case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
1300 free_shadow_pages(d);
1301 rc = __shadow_mode_enable(d, SHM_enable);
1302 break;
1304 case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
1305 free_shadow_pages(d);
1306 rc = __shadow_mode_enable(d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty);
1307 break;
1309 case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
1310 free_shadow_pages(d);
1311 rc = __shadow_mode_enable(d, d->arch.shadow_mode|SHM_enable|SHM_translate);
1312 break;
1314 default:
1315 rc = shadow_mode_enabled(d) ? shadow_mode_table_op(d, sc) : -EINVAL;
1316 break;
1319 shadow_unlock(d);
1321 for_each_exec_domain(d,ed)
1322 update_pagetables(ed);
1324 domain_unpause(d);
1326 return rc;
1329 /*
1330 * XXX KAF: Why is this VMX specific?
1331 */
1332 void vmx_shadow_clear_state(struct domain *d)
1334 SH_VVLOG("%s:", __func__);
1335 shadow_lock(d);
1336 free_shadow_pages(d);
1337 shadow_unlock(d);
1338 update_pagetables(d->exec_domain[0]);
1341 unsigned long
1342 gpfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
1344 ASSERT( shadow_mode_translate(d) );
1346 perfc_incrc(gpfn_to_mfn_foreign);
1348 unsigned long va = gpfn << PAGE_SHIFT;
1349 unsigned long phystab = pagetable_val(d->arch.phys_table);
1350 l2_pgentry_t *l2 = map_domain_mem(phystab);
1351 l2_pgentry_t l2e = l2[l2_table_offset(va)];
1352 unmap_domain_mem(l2);
1353 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
1355 printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%p) => 0 l2e=%p\n",
1356 d->id, gpfn, l2e_get_value(l2e));
1357 return INVALID_MFN;
1359 unsigned long l1tab = l2e_get_phys(l2e);
1360 l1_pgentry_t *l1 = map_domain_mem(l1tab);
1361 l1_pgentry_t l1e = l1[l1_table_offset(va)];
1362 unmap_domain_mem(l1);
1364 #if 0
1365 printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%p) => %p phystab=%p l2e=%p l1tab=%p, l1e=%p\n",
1366 d->id, gpfn, l1_pgentry_val(l1e) >> PAGE_SHIFT, phystab, l2e, l1tab, l1e);
1367 #endif
1369 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
1371 printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%p) => 0 l1e=%p\n",
1372 d->id, gpfn, l1e_get_value(l1e));
1373 return INVALID_MFN;
1376 return l1e_get_pfn(l1e);
1379 static unsigned long
1380 shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
1381 unsigned long smfn)
1383 unsigned long hl2mfn;
1384 l1_pgentry_t *hl2;
1385 int limit;
1387 ASSERT(PGT_base_page_table == PGT_l2_page_table);
1389 if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
1391 printk("Couldn't alloc an HL2 shadow for pfn=%p mfn=%p\n", gpfn, gmfn);
1392 BUG(); /* XXX Deal gracefully with failure. */
1395 SH_VVLOG("shadow_hl2_table(gpfn=%p, gmfn=%p, smfn=%p) => %p",
1396 gpfn, gmfn, smfn, hl2mfn);
1397 perfc_incrc(shadow_hl2_table_count);
1399 hl2 = map_domain_mem(hl2mfn << PAGE_SHIFT);
1401 #ifdef __i386__
1402 if ( shadow_mode_external(d) )
1403 limit = L2_PAGETABLE_ENTRIES;
1404 else
1405 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
1406 #else
1407 limit = 0; /* XXX x86/64 XXX */
1408 #endif
1410 memset(hl2, 0, limit * sizeof(l1_pgentry_t));
1412 if ( !shadow_mode_external(d) )
1414 memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
1415 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
1417 // Set up easy access to the GL2, SL2, and HL2 frames.
1418 //
1419 hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
1420 l1e_create_pfn(gmfn, __PAGE_HYPERVISOR);
1421 hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1422 l1e_create_pfn(smfn, __PAGE_HYPERVISOR);
1423 hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
1424 l1e_create_pfn(hl2mfn, __PAGE_HYPERVISOR);
1427 unmap_domain_mem(hl2);
1429 return hl2mfn;
1432 /*
1433 * This could take and use a snapshot, and validate the entire page at
1434 * once, or it could continue to fault in entries one at a time...
1435 * Might be worth investigating...
1436 */
1437 static unsigned long shadow_l2_table(
1438 struct domain *d, unsigned long gpfn, unsigned long gmfn)
1440 unsigned long smfn;
1441 l2_pgentry_t *spl2e;
1443 SH_VVLOG("shadow_l2_table(gpfn=%p, gmfn=%p)", gpfn, gmfn);
1445 perfc_incrc(shadow_l2_table_count);
1447 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
1449 printk("Couldn't alloc an L2 shadow for pfn=%p mfn=%p\n", gpfn, gmfn);
1450 BUG(); /* XXX Deal gracefully with failure. */
1453 spl2e = (l2_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
1455 /* Install hypervisor and 2x linear p.t. mappings. */
1456 if ( (PGT_base_page_table == PGT_l2_page_table) &&
1457 !shadow_mode_external(d) )
1459 /*
1460 * We could proactively fill in PDEs for pages that are already
1461 * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
1462 * (restriction required for coherence of the accessed bit). However,
1463 * we tried it and it didn't help performance. This is simpler.
1464 */
1465 memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
1467 /* Install hypervisor and 2x linear p.t. mappings. */
1468 memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
1469 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
1470 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
1472 spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1473 l2e_create_pfn(smfn, __PAGE_HYPERVISOR);
1475 spl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
1476 l2e_create_phys(__pa(page_get_owner(&frame_table[gmfn])->arch.mm_perdomain_pt),
1477 __PAGE_HYPERVISOR);
1479 if ( shadow_mode_translate(d) ) // NB: not external
1481 unsigned long hl2mfn;
1483 spl2e[l2_table_offset(RO_MPT_VIRT_START)] =
1484 l2e_create_phys(pagetable_val(d->arch.phys_table),
1485 __PAGE_HYPERVISOR);
1487 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
1488 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
1490 // shadow_mode_translate (but not external) sl2 tables hold a
1491 // ref to their hl2.
1492 //
1493 if ( !get_shadow_ref(hl2mfn) )
1494 BUG();
1496 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1497 l2e_create_pfn(hl2mfn, __PAGE_HYPERVISOR);
1499 else
1500 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1501 l2e_create_pfn(gmfn, __PAGE_HYPERVISOR);
1503 else
1505 memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));
1508 unmap_domain_mem(spl2e);
1510 SH_VLOG("shadow_l2_table(%p -> %p)", gmfn, smfn);
1511 return smfn;
1514 void shadow_map_l1_into_current_l2(unsigned long va)
1516 struct exec_domain *ed = current;
1517 struct domain *d = ed->domain;
1518 l1_pgentry_t *gpl1e, *spl1e;
1519 l2_pgentry_t gl2e, sl2e;
1520 unsigned long gl1pfn, gl1mfn, sl1mfn;
1521 int i, init_table = 0;
1523 __guest_get_l2e(ed, va, &gl2e);
1524 ASSERT(l2e_get_flags(gl2e) & _PAGE_PRESENT);
1525 gl1pfn = l2e_get_pfn(gl2e);
1527 if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
1529 /* This L1 is NOT already shadowed so we need to shadow it. */
1530 SH_VVLOG("4a: l1 not shadowed");
1532 gl1mfn = __gpfn_to_mfn(d, gl1pfn);
1533 if ( unlikely(!VALID_MFN(gl1mfn)) )
1535 // Attempt to use an invalid pfn as an L1 page.
1536 // XXX this needs to be more graceful!
1537 BUG();
1540 if ( unlikely(!(sl1mfn =
1541 alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
1543 printk("Couldn't alloc an L1 shadow for pfn=%p mfn=%p\n",
1544 gl1pfn, gl1mfn);
1545 BUG(); /* XXX Need to deal gracefully with failure. */
1548 perfc_incrc(shadow_l1_table_count);
1549 init_table = 1;
1551 else
1553 /* This L1 is shadowed already, but the L2 entry is missing. */
1554 SH_VVLOG("4b: was shadowed, l2 missing (%p)", sl1mfn);
1557 #ifndef NDEBUG
1558 l2_pgentry_t old_sl2e;
1559 __shadow_get_l2e(ed, va, &old_sl2e);
1560 ASSERT( !(l2e_get_flags(old_sl2e) & _PAGE_PRESENT) );
1561 #endif
1563 if ( !get_shadow_ref(sl1mfn) )
1564 BUG();
1565 l2pde_general(d, &gl2e, &sl2e, sl1mfn);
1566 __guest_set_l2e(ed, va, gl2e);
1567 __shadow_set_l2e(ed, va, sl2e);
1569 if ( init_table )
1571 gpl1e = &(linear_pg_table[l1_linear_offset(va) &
1572 ~(L1_PAGETABLE_ENTRIES-1)]);
1574 spl1e = &(shadow_linear_pg_table[l1_linear_offset(va) &
1575 ~(L1_PAGETABLE_ENTRIES-1)]);
1577 l1_pgentry_t sl1e;
1578 int index = l1_table_offset(va);
1579 int min = 1, max = 0;
1581 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1583 l1pte_propagate_from_guest(d, gpl1e[i], &sl1e);
1584 if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
1585 !shadow_get_page_from_l1e(sl1e, d) )
1586 sl1e = l1e_empty();
1587 if ( l1e_get_value(sl1e) == 0 ) /* FIXME: check flags? */
1589 // First copy entries from 0 until first invalid.
1590 // Then copy entries from index until first invalid.
1591 //
1592 if ( i < index ) {
1593 i = index - 1;
1594 continue;
1596 break;
1598 spl1e[i] = sl1e;
1599 if ( unlikely(i < min) )
1600 min = i;
1601 if ( likely(i > max) )
1602 max = i;
1605 frame_table[sl1mfn].tlbflush_timestamp =
1606 SHADOW_ENCODE_MIN_MAX(min, max);
1610 void shadow_invlpg(struct exec_domain *ed, unsigned long va)
1612 struct domain *d = ed->domain;
1613 l1_pgentry_t gpte, spte;
1615 ASSERT(shadow_mode_enabled(d));
1617 shadow_lock(d);
1619 __shadow_sync_va(ed, va);
1621 // XXX mafetter: will need to think about 4MB pages...
1623 // It's not strictly necessary to update the shadow here,
1624 // but it might save a fault later.
1625 //
1626 if (__copy_from_user(&gpte, &linear_pg_table[va >> PAGE_SHIFT],
1627 sizeof(gpte))) {
1628 perfc_incrc(shadow_invlpg_faults);
1629 return;
1631 l1pte_propagate_from_guest(d, gpte, &spte);
1632 shadow_set_l1e(va, spte, 1);
1634 shadow_unlock(d);
1637 struct out_of_sync_entry *
1638 shadow_alloc_oos_entry(struct domain *d)
1640 struct out_of_sync_entry *f, *extra;
1641 unsigned size, i;
1643 if ( unlikely(d->arch.out_of_sync_free == NULL) )
1645 FSH_LOG("Allocate more fullshadow tuple blocks.");
1647 size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
1648 extra = xmalloc_bytes(size);
1650 /* XXX Should be more graceful here. */
1651 if ( extra == NULL )
1652 BUG();
1654 memset(extra, 0, size);
1656 /* Record the allocation block so it can be correctly freed later. */
1657 d->arch.out_of_sync_extras_count++;
1658 *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) =
1659 d->arch.out_of_sync_extras;
1660 d->arch.out_of_sync_extras = &extra[0];
1662 /* Thread a free chain through the newly-allocated nodes. */
1663 for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
1664 extra[i].next = &extra[i+1];
1665 extra[i].next = NULL;
1667 /* Add the new nodes to the free list. */
1668 d->arch.out_of_sync_free = &extra[0];
1671 /* Allocate a new node from the quicklist. */
1672 f = d->arch.out_of_sync_free;
1673 d->arch.out_of_sync_free = f->next;
1675 return f;
1678 static inline unsigned long
1679 shadow_make_snapshot(
1680 struct domain *d, unsigned long gpfn, unsigned long gmfn)
1682 unsigned long smfn, sl1mfn = 0;
1683 void *original, *snapshot;
1684 u32 min_max = 0;
1685 int min, max, length;
1687 if ( test_and_set_bit(_PGC_out_of_sync, &frame_table[gmfn].count_info) )
1689 ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
1690 return SHADOW_SNAPSHOT_ELSEWHERE;
1693 perfc_incrc(shadow_make_snapshot);
1695 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
1697 printk("Couldn't alloc fullshadow snapshot for pfn=%p mfn=%p!\n"
1698 "Dom%d snapshot_count_count=%d\n",
1699 gpfn, gmfn, d->id, d->arch.snapshot_page_count);
1700 BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
1703 if ( !get_shadow_ref(smfn) )
1704 BUG();
1706 if ( shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow )
1707 min_max = pfn_to_page(sl1mfn)->tlbflush_timestamp;
1708 pfn_to_page(smfn)->tlbflush_timestamp = min_max;
1710 min = SHADOW_MIN(min_max);
1711 max = SHADOW_MAX(min_max);
1712 length = max - min + 1;
1713 perfc_incr_histo(snapshot_copies, length, PT_UPDATES);
1715 min *= sizeof(l1_pgentry_t);
1716 length *= sizeof(l1_pgentry_t);
1718 original = map_domain_mem(gmfn << PAGE_SHIFT);
1719 snapshot = map_domain_mem(smfn << PAGE_SHIFT);
1720 memcpy(snapshot + min, original + min, length);
1721 unmap_domain_mem(original);
1722 unmap_domain_mem(snapshot);
1724 return smfn;
1727 static void
1728 shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry)
1730 void *snapshot;
1732 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
1733 return;
1735 // Clear the out_of_sync bit.
1736 //
1737 clear_bit(_PGC_out_of_sync, &frame_table[entry->gmfn].count_info);
1739 // XXX Need to think about how to protect the domain's
1740 // information less expensively.
1741 //
1742 snapshot = map_domain_mem(entry->snapshot_mfn << PAGE_SHIFT);
1743 memset(snapshot, 0, PAGE_SIZE);
1744 unmap_domain_mem(snapshot);
1746 put_shadow_ref(entry->snapshot_mfn);
1749 struct out_of_sync_entry *
1750 shadow_mark_mfn_out_of_sync(struct exec_domain *ed, unsigned long gpfn,
1751 unsigned long mfn)
1753 struct domain *d = ed->domain;
1754 struct pfn_info *page = &frame_table[mfn];
1755 struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
1757 ASSERT(spin_is_locked(&d->arch.shadow_lock));
1758 ASSERT(pfn_valid(mfn));
1759 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page);
1761 FSH_LOG("%s(gpfn=%p, mfn=%p) c=%p t=%p", __func__,
1762 gpfn, mfn, page->count_info, page->u.inuse.type_info);
1764 // XXX this will require some more thought... Cross-domain sharing and
1765 // modification of page tables? Hmm...
1766 //
1767 if ( d != page_get_owner(page) )
1768 BUG();
1770 perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
1772 entry->gpfn = gpfn;
1773 entry->gmfn = mfn;
1774 entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
1775 entry->writable_pl1e = -1;
1777 // increment guest's ref count to represent the entry in the
1778 // full shadow out-of-sync list.
1779 //
1780 get_page(page, d);
1782 // Add to the out-of-sync list
1783 //
1784 entry->next = d->arch.out_of_sync;
1785 d->arch.out_of_sync = entry;
1787 return entry;
1790 void shadow_mark_va_out_of_sync(
1791 struct exec_domain *ed, unsigned long gpfn, unsigned long mfn, unsigned long va)
1793 struct out_of_sync_entry *entry =
1794 shadow_mark_mfn_out_of_sync(ed, gpfn, mfn);
1795 l2_pgentry_t sl2e;
1797 // We need the address of the shadow PTE that maps @va.
1798 // It might not exist yet. Make sure it's there.
1799 //
1800 __shadow_get_l2e(ed, va, &sl2e);
1801 if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
1803 // either this L1 isn't shadowed yet, or the shadow isn't linked into
1804 // the current L2.
1805 shadow_map_l1_into_current_l2(va);
1806 __shadow_get_l2e(ed, va, &sl2e);
1808 ASSERT(l2e_get_flags(sl2e) & _PAGE_PRESENT);
1810 // NB: this is stored as a machine address.
1811 entry->writable_pl1e =
1812 l2e_get_phys(sl2e) | (sizeof(l1_pgentry_t) * l1_table_offset(va));
1813 ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
1815 // Increment shadow's page count to represent the reference
1816 // inherent in entry->writable_pl1e
1817 //
1818 if ( !get_shadow_ref(l2e_get_pfn(sl2e)) )
1819 BUG();
1821 FSH_LOG("mark_out_of_sync(va=%p -> writable_pl1e=%p)",
1822 va, entry->writable_pl1e);
1825 /*
1826 * Returns 1 if the snapshot for @gmfn exists and its @index'th entry matches.
1827 * Returns 0 otherwise.
1828 */
1829 static int snapshot_entry_matches(
1830 struct exec_domain *ed, unsigned long gmfn, unsigned index)
1832 unsigned long gpfn = __mfn_to_gpfn(ed->domain, gmfn);
1833 unsigned long smfn = __shadow_status(ed->domain, gpfn, PGT_snapshot);
1834 unsigned long *guest, *snapshot;
1835 int compare;
1837 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
1839 perfc_incrc(snapshot_entry_matches_calls);
1841 if ( !smfn )
1842 return 0;
1844 guest = map_domain_mem(gmfn << PAGE_SHIFT);
1845 snapshot = map_domain_mem(smfn << PAGE_SHIFT);
1847 // This could probably be smarter, but this is sufficient for
1848 // our current needs.
1849 //
1850 compare = (guest[index] == snapshot[index]);
1852 unmap_domain_mem(guest);
1853 unmap_domain_mem(snapshot);
1855 #ifdef PERF_COUNTERS
1856 if ( compare )
1857 perfc_incrc(snapshot_entry_matches_true);
1858 #endif
1860 return compare;
1863 /*
1864 * Returns 1 if va's shadow mapping is out-of-sync.
1865 * Returns 0 otherwise.
1866 */
1867 int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va)
1869 struct domain *d = ed->domain;
1870 unsigned long l2mfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
1871 l2_pgentry_t l2e;
1872 unsigned long l1mfn;
1874 ASSERT(spin_is_locked(&d->arch.shadow_lock));
1876 perfc_incrc(shadow_out_of_sync_calls);
1878 if ( page_out_of_sync(&frame_table[l2mfn]) &&
1879 !snapshot_entry_matches(ed, l2mfn, l2_table_offset(va)) )
1880 return 1;
1882 __guest_get_l2e(ed, va, &l2e);
1883 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
1884 return 0;
1886 l1mfn = __gpfn_to_mfn(d, l2e_get_pfn(l2e));
1888 // If the l1 pfn is invalid, it can't be out of sync...
1889 if ( !VALID_MFN(l1mfn) )
1890 return 0;
1892 if ( page_out_of_sync(&frame_table[l1mfn]) &&
1893 !snapshot_entry_matches(ed, l1mfn, l1_table_offset(va)) )
1894 return 1;
1896 return 0;
1899 #define GPFN_TO_GPTEPAGE(_gpfn) ((_gpfn) / (PAGE_SIZE / sizeof(l1_pgentry_t)))
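/*
 * Worked example for the macro above (assuming 4KB pages and 4-byte
 * l1_pgentry_t, i.e. non-PAE x86): PAGE_SIZE/sizeof(l1_pgentry_t) is
 * 1024, so GPFN_TO_GPTEPAGE(gpfn) == gpfn >> 10 -- groups of 1024
 * consecutive gpfns share one writable-PTE prediction slot.
 */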
1900 static inline unsigned long
1901 predict_writable_pte_page(struct domain *d, unsigned long gpfn)
1903 return __shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), PGT_writable_pred);
1906 static inline void
1907 increase_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
1909 unsigned long score = prediction & PGT_score_mask;
1910 int create = (score == 0);
1912 // saturating addition
1913 score = (score + (1u << PGT_score_shift)) & PGT_score_mask;
1914 score = score ? score : PGT_score_mask;
1916 prediction = (prediction & PGT_mfn_mask) | score;
1918 //printk("increase gpfn=%p pred=%p create=%d\n", gpfn, prediction, create);
1919 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
1921 if ( create )
1922 perfc_incr(writable_pte_predictions);
1925 static inline void
1926 decrease_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
1928 unsigned long score = prediction & PGT_score_mask;
1929 ASSERT(score);
1931 // divide score by 2... We don't like bad predictions.
1932 //
1933 score = (score >> 1) & PGT_score_mask;
1935 prediction = (prediction & PGT_mfn_mask) | score;
1937 //printk("decrease gpfn=%p pred=%p score=%p\n", gpfn, prediction, score);
1939 if ( score )
1940 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
1941 else
1943 delete_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, PGT_writable_pred);
1944 perfc_decr(writable_pte_predictions);
1948 static void
1949 free_writable_pte_predictions(struct domain *d)
1951 int i;
1952 struct shadow_status *x;
1954 for ( i = 0; i < shadow_ht_buckets; i++ )
1956 u32 count;
1957 unsigned long *gpfn_list;
1959 /* Skip empty buckets. */
1960 if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
1961 continue;
1963 count = 0;
1964 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
1965 if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
1966 count++;
1968 gpfn_list = xmalloc_array(unsigned long, count);
1969 count = 0;
1970 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
1971 if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
1972 gpfn_list[count++] = x->gpfn_and_flags & PGT_mfn_mask;
1974 while ( count )
1976 count--;
1977 delete_shadow_status(d, gpfn_list[count], 0, PGT_writable_pred);
1980 xfree(gpfn_list);
1984 static u32 remove_all_write_access_in_ptpage(
1985 struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn,
1986 unsigned long readonly_gpfn, unsigned long readonly_gmfn,
1987 u32 max_refs_to_find, unsigned long prediction)
1989 l1_pgentry_t *pt = map_domain_mem(pt_mfn << PAGE_SHIFT);
1990 l1_pgentry_t match;
1991 unsigned long flags = _PAGE_RW | _PAGE_PRESENT;
1992 int i;
1993 u32 found = 0;
1994 int is_l1_shadow =
1995 ((frame_table[pt_mfn].u.inuse.type_info & PGT_type_mask) ==
1996 PGT_l1_shadow);
1998 match = l1e_create_pfn(readonly_gmfn, flags);
2000 // returns true if all refs have been found and fixed.
2001 //
2002 int fix_entry(int i)
2004 l1_pgentry_t old = pt[i];
2005 l1_pgentry_t new = old;
2007 l1e_remove_flags(&new,_PAGE_RW);
2008 if ( is_l1_shadow && !shadow_get_page_from_l1e(new, d) )
2009 BUG();
2010 found++;
2011 pt[i] = new;
2012 if ( is_l1_shadow )
2013 put_page_from_l1e(old, d);
2015 #if 0
2016 printk("removed write access to pfn=%p mfn=%p in smfn=%p entry %x "
2017 "is_l1_shadow=%d\n",
2018 readonly_gpfn, readonly_gmfn, pt_mfn, i, is_l1_shadow);
2019 #endif
2021 return (found == max_refs_to_find);
2024 i = readonly_gpfn & (L1_PAGETABLE_ENTRIES - 1);
2025 if ( !l1e_has_changed(&pt[i], &match, flags) && fix_entry(i) )
2027 perfc_incrc(remove_write_fast_exit);
2028 increase_writable_pte_prediction(d, readonly_gpfn, prediction);
2029 unmap_domain_mem(pt);
2030 return found;
2033 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
2035 if ( unlikely(!l1e_has_changed(&pt[i], &match, flags)) && fix_entry(i) )
2036 break;
2039 unmap_domain_mem(pt);
2041 return found;
2042 #undef MATCH_ENTRY
2045 int shadow_remove_all_write_access(
2046 struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
2048 int i;
2049 struct shadow_status *a;
2050 u32 found = 0, fixups, write_refs;
2051 unsigned long prediction, predicted_gpfn, predicted_smfn;
2053 ASSERT(spin_is_locked(&d->arch.shadow_lock));
2054 ASSERT(VALID_MFN(readonly_gmfn));
2056 perfc_incrc(remove_write_access);
2058 // If it's not a writable page, then no writable refs can be outstanding.
2059 //
2060 if ( (frame_table[readonly_gmfn].u.inuse.type_info & PGT_type_mask) !=
2061 PGT_writable_page )
2063 perfc_incrc(remove_write_not_writable);
2064 return 1;
2067 // How many outstanding writable PTEs for this page are there?
2068 //
2069 write_refs =
2070 (frame_table[readonly_gmfn].u.inuse.type_info & PGT_count_mask);
2071 if ( write_refs &&
2072 (frame_table[readonly_gmfn].u.inuse.type_info & PGT_pinned) )
2074 write_refs--;
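// A pinned page contributes one type reference of its own, which does
// not correspond to any writable PTE, so discount it before searching.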
2077 if ( write_refs == 0 )
2079 perfc_incrc(remove_write_no_work);
2080 return 1;
2083 // Before searching all the L1 page tables, check the typical culprit first
2084 //
2085 if ( (prediction = predict_writable_pte_page(d, readonly_gpfn)) )
2087 predicted_gpfn = prediction & PGT_mfn_mask;
2088 if ( (predicted_smfn = __shadow_status(d, predicted_gpfn, PGT_l1_shadow)) &&
2089 (fixups = remove_all_write_access_in_ptpage(d, predicted_gpfn, predicted_smfn, readonly_gpfn, readonly_gmfn, write_refs, prediction)) )
2091 found += fixups;
2092 if ( found == write_refs )
2094 perfc_incrc(remove_write_predicted);
2095 return 1;
2098 else
2100 perfc_incrc(remove_write_bad_prediction);
2101 decrease_writable_pte_prediction(d, readonly_gpfn, prediction);
2105 // Search all the shadow L1 page tables...
2106 //
2107 for (i = 0; i < shadow_ht_buckets; i++)
2109 a = &d->arch.shadow_ht[i];
2110 while ( a && a->gpfn_and_flags )
2112 if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow )
2114 found += remove_all_write_access_in_ptpage(d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn, readonly_gpfn, readonly_gmfn, write_refs - found, a->gpfn_and_flags & PGT_mfn_mask);
2115 if ( found == write_refs )
2116 return 1;
2119 a = a->next;
2123 FSH_LOG("%s: looking for %d refs, found %d refs",
2124 __func__, write_refs, found);
2126 return 0;
2129 static u32 remove_all_access_in_page(
2130 struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn)
2132 l1_pgentry_t *pl1e = map_domain_mem(l1mfn << PAGE_SHIFT);
2133 l1_pgentry_t match;
2134 unsigned long flags = _PAGE_PRESENT;
2135 int i;
2136 u32 count = 0;
2137 int is_l1_shadow =
2138 ((frame_table[l1mfn].u.inuse.type_info & PGT_type_mask) ==
2139 PGT_l1_shadow);
2141 match = l1e_create_pfn(forbidden_gmfn, flags);
2143 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
2145 if ( unlikely(!l1e_has_changed(&pl1e[i], &match, flags)) )
2147 l1_pgentry_t ol1e = pl1e[i];
2148 pl1e[i] = l1e_empty();
2149 count++;
2151 if ( is_l1_shadow )
2152 put_page_from_l1e(ol1e, d);
2153 else /* must be an hl2 page */
2154 put_page(&frame_table[forbidden_gmfn]);
2158 unmap_domain_mem(pl1e);
2160 return count;
2163 u32 shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn)
2165 int i;
2166 struct shadow_status *a;
2167 u32 count = 0;
2169 ASSERT(spin_is_locked(&d->arch.shadow_lock));
2170 perfc_incrc(remove_all_access);
2172 for (i = 0; i < shadow_ht_buckets; i++)
2174 a = &d->arch.shadow_ht[i];
2175 while ( a && a->gpfn_and_flags )
2177 switch (a->gpfn_and_flags & PGT_type_mask)
2179 case PGT_l1_shadow:
2180 case PGT_l2_shadow:
2181 case PGT_l3_shadow:
2182 case PGT_l4_shadow:
2183 case PGT_hl2_shadow:
2184 count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn);
2185 break;
2186 case PGT_snapshot:
2187 case PGT_writable_pred:
2188 // these can't hold refs to the forbidden page
2189 break;
2190 default:
2191 BUG();
2194 a = a->next;
2198 return count;
2201 static int resync_all(struct domain *d, u32 stype)
2203 struct out_of_sync_entry *entry;
2204 unsigned i;
2205 unsigned long smfn;
2206 void *guest, *shadow, *snapshot;
2207 int need_flush = 0, external = shadow_mode_external(d);
2208 int unshadow;
2209 int changed;
2211 ASSERT(spin_is_locked(&d->arch.shadow_lock));
2213 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
2215 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
2216 continue;
2218 if ( !(smfn = __shadow_status(d, entry->gpfn, stype)) )
2219 continue;
2221 FSH_LOG("resyncing t=%p gpfn=%p gmfn=%p smfn=%p snapshot_mfn=%p",
2222 stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
2224 // Compare guest's new contents to its snapshot, validating
2225 // and updating its shadow as appropriate.
2226 //
2227 guest = map_domain_mem(entry->gmfn << PAGE_SHIFT);
2228 snapshot = map_domain_mem(entry->snapshot_mfn << PAGE_SHIFT);
2229 shadow = map_domain_mem(smfn << PAGE_SHIFT);
2230 unshadow = 0;
2232 switch ( stype ) {
2233 case PGT_l1_shadow:
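// The tlbflush_timestamp fields of the shadow and its snapshot are
// reused here as packed (min,max) ranges of L1 entries worth looking
// at: the scan below is bounded by the shadow's range, and anything
// outside the snapshot's range is treated as changed.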
2235 u32 min_max_shadow = pfn_to_page(smfn)->tlbflush_timestamp;
2236 int min_shadow = SHADOW_MIN(min_max_shadow);
2237 int max_shadow = SHADOW_MAX(min_max_shadow);
2239 u32 min_max_snapshot =
2240 pfn_to_page(entry->snapshot_mfn)->tlbflush_timestamp;
2241 int min_snapshot = SHADOW_MIN(min_max_snapshot);
2242 int max_snapshot = SHADOW_MAX(min_max_snapshot);
2244 l1_pgentry_t *guest1 = guest;
2245 l1_pgentry_t *shadow1 = shadow;
2246 l1_pgentry_t *snapshot1 = snapshot;
2248 changed = 0;
2250 for ( i = min_shadow; i <= max_shadow; i++ )
2252 if ( (i < min_snapshot) || (i > max_snapshot) ||
2253 l1e_has_changed(&guest1[i], &snapshot1[i], PAGE_FLAG_MASK) )
2255 need_flush |= validate_pte_change(d, guest1[i], &shadow1[i]);
2257 // can't update snapshots of linear page tables -- they
2258 // are used multiple times...
2259 //
2260 // snapshot[i] = new_pte;
2262 changed++;
2265 perfc_incrc(resync_l1);
2266 perfc_incr_histo(wpt_updates, changed, PT_UPDATES);
2267 perfc_incr_histo(l1_entries_checked, max_shadow - min_shadow + 1, PT_UPDATES);
2268 break;
2270 case PGT_l2_shadow:
2272 int max = -1;
2274 l2_pgentry_t *guest2 = guest;
2275 l2_pgentry_t *shadow2 = shadow;
2276 l2_pgentry_t *snapshot2 = snapshot;
2278 changed = 0;
2279 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
2281 if ( !is_guest_l2_slot(i) && !external )
2282 continue;
2284 l2_pgentry_t new_pde = guest2[i];
2285 if ( l2e_has_changed(&new_pde, &snapshot2[i], PAGE_FLAG_MASK))
2287 need_flush |= validate_pde_change(d, new_pde, &shadow2[i]);
2289 // can't update snapshots of linear page tables -- they
2290 // are used multiple times...
2291 //
2292 // snapshot[i] = new_pde;
2294 changed++;
2296 if ( l2e_get_value(new_pde) != 0 ) /* FIXME: check flags? */
2297 max = i;
2299 // XXX - This hack works for linux guests.
2300 // Need a better solution long term.
2301 if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) &&
2302 unlikely(l2e_get_value(new_pde) != 0) &&
2303 !unshadow &&
2304 (frame_table[smfn].u.inuse.type_info & PGT_pinned) )
2305 unshadow = 1;
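// If the guest L2 no longer contains any non-zero entries at all, the
// page has presumably been recycled for another use, so it is not worth
// keeping a shadow of it either.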
2307 if ( max == -1 )
2308 unshadow = 1;
2309 perfc_incrc(resync_l2);
2310 perfc_incr_histo(shm_l2_updates, changed, PT_UPDATES);
2311 break;
2313 case PGT_hl2_shadow:
2315 l2_pgentry_t *guest2 = guest;
2316 l2_pgentry_t *snapshot2 = snapshot;
2317 l1_pgentry_t *shadow2 = shadow;
2319 changed = 0;
2320 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
2322 if ( !is_guest_l2_slot(i) && !external )
2323 continue;
2325 l2_pgentry_t new_pde = guest2[i];
2326 if ( l2e_has_changed(&new_pde, &snapshot2[i], PAGE_FLAG_MASK) )
2328 need_flush |= validate_hl2e_change(d, new_pde, &shadow2[i]);
2330 // can't update snapshots of linear page tables -- they
2331 // are used multiple times...
2332 //
2333 // snapshot[i] = new_pde;
2335 changed++;
2338 perfc_incrc(resync_hl2);
2339 perfc_incr_histo(shm_hl2_updates, changed, PT_UPDATES);
2340 break;
2342 default:
2343 BUG();
2346 unmap_domain_mem(shadow);
2347 unmap_domain_mem(snapshot);
2348 unmap_domain_mem(guest);
2350 if ( unlikely(unshadow) )
2352 perfc_incrc(unshadow_l2_count);
2353 shadow_unpin(smfn);
2354 if ( unlikely(shadow_mode_external(d)) )
2356 unsigned long hl2mfn;
2358 if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) &&
2359 (frame_table[hl2mfn].u.inuse.type_info & PGT_pinned) )
2360 shadow_unpin(hl2mfn);
2365 return need_flush;
2368 void __shadow_sync_all(struct domain *d)
2370 struct out_of_sync_entry *entry;
2371 int need_flush = 0;
2373 perfc_incrc(shadow_sync_all);
2375 ASSERT(spin_is_locked(&d->arch.shadow_lock));
2377 // First, remove all write permissions to the page tables
2378 //
2379 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
2381 // Skip entries that have low bits set... Those aren't
2382 // real PTEs.
2383 //
2384 if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
2385 continue;
2387 l1_pgentry_t *ppte = map_domain_mem(entry->writable_pl1e);
2388 l1_pgentry_t opte = *ppte;
2389 l1_pgentry_t npte = opte;
2390 l1e_remove_flags(&npte, _PAGE_RW);
2392 if ( (l1e_get_flags(npte) & _PAGE_PRESENT) &&
2393 !shadow_get_page_from_l1e(npte, d) )
2394 BUG();
2395 *ppte = npte;
2396 put_page_from_l1e(opte, d);
2398 unmap_domain_mem(ppte);
2401 // XXX mafetter: SMP
2402 //
2403 // With the current algorithm, we've gotta flush all the TLBs
2404 // before we can safely continue. I don't think we want to
2405 // do it this way, so I think we should consider making
2406 // entirely private copies of the shadow for each vcpu, and/or
2407 // possibly having a mix of private and shared shadow state
2408 // (any path from a PTE that grants write access to an out-of-sync
2409 // page table page needs to be vcpu private).
2410 //
2411 #if 0 // this should be enabled for SMP guests...
2412 flush_tlb_mask(((1 << smp_num_cpus) - 1) & ~(1 << smp_processor_id()));
2413 #endif
2414 need_flush = 1;
2416 // Second, resync all L1 pages, then L2 pages, etc...
2417 //
2418 need_flush |= resync_all(d, PGT_l1_shadow);
2419 if ( shadow_mode_translate(d) )
2420 need_flush |= resync_all(d, PGT_hl2_shadow);
2421 need_flush |= resync_all(d, PGT_l2_shadow);
2423 if ( need_flush && !unlikely(shadow_mode_external(d)) )
2424 local_flush_tlb();
2426 free_out_of_sync_state(d);
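/*
 * shadow_fault() handles a page fault taken while running on shadow
 * page tables: it returns EXCRET_fault_fixed if the fault was resolved
 * by updating the shadow (and possibly the guest) tables, and 0 if the
 * fault should instead be passed on to the guest.
 */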
2429 int shadow_fault(unsigned long va, struct xen_regs *regs)
2431 l1_pgentry_t gpte, spte, orig_gpte;
2432 struct exec_domain *ed = current;
2433 struct domain *d = ed->domain;
2434 l2_pgentry_t gpde;
2436 spte = l1e_empty();
2438 SH_VVLOG("shadow_fault( va=%p, code=%lu )", va, regs->error_code );
2439 perfc_incrc(shadow_fault_calls);
2441 check_pagetable(ed, "pre-sf");
2443 /*
2444 * Don't let someone else take the guest's table pages out-of-sync.
2445 */
2446 shadow_lock(d);
2448 /*
2449 * STEP 1. If this fault may have touched an out-of-sync page table
2450 * page, bring the shadow state for this address back in sync before
2451 * deciding whether the fault is ours to fix or the guest's to handle.
2452 */
2453 __shadow_sync_va(ed, va);
2455 /*
2456 * STEP 2. Check the guest PTE.
2457 */
2458 __guest_get_l2e(ed, va, &gpde);
2459 if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
2461 SH_VVLOG("shadow_fault - EXIT: L1 not present" );
2462 perfc_incrc(shadow_fault_bail_pde_not_present);
2463 goto fail;
2466 // This can't fault because we hold the shadow lock and we've ensured that
2467 // the mapping is in-sync, so the check of the PDE's present bit, above,
2468 // covers this access.
2469 //
2470 orig_gpte = gpte = linear_pg_table[l1_linear_offset(va)];
2471 if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_PRESENT)) )
2473 SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
2474 perfc_incrc(shadow_fault_bail_pte_not_present);
2475 goto fail;
2478 /* Write fault? */
2479 if ( regs->error_code & 2 )
2481 if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) )
2483 /* Write fault on a read-only mapping. */
2484 SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%lx)", gpte);
2485 perfc_incrc(shadow_fault_bail_ro_mapping);
2486 goto fail;
2489 if ( !l1pte_write_fault(ed, &gpte, &spte, va) )
2491 SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
2492 perfc_incrc(write_fault_bail);
2493 shadow_unlock(d);
2494 return 0;
2497 else
2499 if ( !l1pte_read_fault(d, &gpte, &spte) )
2501 SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
2502 perfc_incrc(read_fault_bail);
2503 shadow_unlock(d);
2504 return 0;
2508 /*
2509 * STEP 3. Write the modified shadow PTE and guest PTE back to the tables.
2510 */
2512 /* XXX Watch out for read-only L2 entries! (not used in Linux). */
2513 if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
2514 &gpte, sizeof(gpte))) )
2516 printk("shadow_fault() failed, crashing domain %d "
2517 "due to a read-only L2 page table (gpde=%p), va=%p\n",
2518 d->id, l2e_get_value(gpde), va);
2519 domain_crash_synchronous();
2522 // if necessary, record the page table page as dirty
2523 if ( unlikely(shadow_mode_log_dirty(d)) &&
2524 l1e_has_changed(&orig_gpte, &gpte, PAGE_FLAG_MASK))
2525 mark_dirty(d, __gpfn_to_mfn(d, l2e_get_pfn(gpde)));
2527 shadow_set_l1e(va, spte, 1);
2529 perfc_incrc(shadow_fault_fixed);
2530 d->arch.shadow_fault_count++;
2532 shadow_unlock(d);
2534 check_pagetable(ed, "post-sf");
2535 return EXCRET_fault_fixed;
2537 fail:
2538 shadow_unlock(d);
2539 return 0;
2542 /*
2543 * What lives where in the 32-bit address space in the various shadow modes,
2544 * and what it uses to get/maintain that mapping.
2546 * SHADOW MODE:     none           enable           translate         external
2548 * 4KB things:
2549 * guest_vtable     lin_l2         mapped per gl2   lin_l2 via hl2    mapped per gl2
2550 * shadow_vtable    n/a            sh_lin_l2        sh_lin_l2         mapped per gl2
2551 * hl2_vtable       n/a            n/a              lin_hl2 via hl2   mapped per gl2
2552 * monitor_vtable   n/a            n/a              n/a               mapped once
2554 * 4MB things:
2555 * guest_linear     lin via gl2    lin via gl2      lin via hl2       lin via hl2
2556 * shadow_linear    n/a            sh_lin via sl2   sh_lin via sl2    sh_lin via sl2
2557 * monitor_linear   n/a            n/a              n/a               ???
2558 * perdomain        perdomain      perdomain        perdomain         perdomain
2559 * R/O M2P          R/O M2P        R/O M2P          n/a               n/a
2560 * R/W M2P          R/W M2P        R/W M2P          R/W M2P           R/W M2P
2561 * P2M              n/a            n/a              R/O M2P           R/O M2P
2563 * NB:
2564 * update_pagetables(), __update_pagetables(), shadow_mode_enable(),
2565 * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
2566 * all play a part in maintaining these mappings.
2567 */
2568 void __update_pagetables(struct exec_domain *ed)
2570 struct domain *d = ed->domain;
2571 unsigned long gmfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
2572 unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
2573 unsigned long smfn, hl2mfn, old_smfn;
2575 int max_mode = ( shadow_mode_external(d) ? SHM_external
2576 : shadow_mode_translate(d) ? SHM_translate
2577 : shadow_mode_enabled(d) ? SHM_enable
2578 : 0 );
2580 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
2581 ASSERT( max_mode );
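// max_mode is the strongest shadow mode enabled for this domain; the
// code below relies on external mode also implying translate mode (see
// the ASSERT where the monitor table is fixed up).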
2583 /*
2584 * arch.guest_vtable
2585 */
2586 if ( max_mode & (SHM_enable | SHM_external) )
2588 if ( likely(ed->arch.guest_vtable != NULL) )
2589 unmap_domain_mem(ed->arch.guest_vtable);
2590 ed->arch.guest_vtable = map_domain_mem(gmfn << PAGE_SHIFT);
2593 /*
2594 * arch.shadow_table
2595 */
2596 if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) )
2597 smfn = shadow_l2_table(d, gpfn, gmfn);
2598 if ( !get_shadow_ref(smfn) )
2599 BUG();
2600 old_smfn = pagetable_val(ed->arch.shadow_table) >> PAGE_SHIFT;
2601 ed->arch.shadow_table = mk_pagetable(smfn << PAGE_SHIFT);
2602 if ( old_smfn )
2603 put_shadow_ref(old_smfn);
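// Note the ordering: the reference on the new shadow table is taken
// before the reference on the old one is dropped, so nothing is freed
// prematurely even if the two happen to be the same table.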
2605 SH_VVLOG("__update_pagetables(gmfn=%p, smfn=%p)", gmfn, smfn);
2607 /*
2608 * arch.shadow_vtable
2609 */
2610 if ( max_mode == SHM_external )
2612 if ( ed->arch.shadow_vtable )
2613 unmap_domain_mem(ed->arch.shadow_vtable);
2614 ed->arch.shadow_vtable = map_domain_mem(smfn << PAGE_SHIFT);
2617 /*
2618 * arch.hl2_vtable
2619 */
2621 // if max_mode == SHM_translate, then the hl2 is already installed
2622 // correctly in its smfn, and there's nothing to do.
2623 //
2624 if ( max_mode == SHM_external )
2626 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
2627 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
2628 if ( ed->arch.hl2_vtable )
2629 unmap_domain_mem(ed->arch.hl2_vtable);
2630 ed->arch.hl2_vtable = map_domain_mem(hl2mfn << PAGE_SHIFT);
2633 /*
2634 * fixup pointers in monitor table, as necessary
2635 */
2636 if ( max_mode == SHM_external )
2638 l2_pgentry_t *mpl2e = ed->arch.monitor_vtable;
2639 l2_pgentry_t old_hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
2640 l2_pgentry_t old_sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
2642 ASSERT( shadow_mode_translate(d) );
2644 if ( !get_shadow_ref(hl2mfn) )
2645 BUG();
2646 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
2647 l2e_create_pfn(hl2mfn, __PAGE_HYPERVISOR);
2648 if ( l2e_get_flags(old_hl2e) & _PAGE_PRESENT )
2649 put_shadow_ref(l2e_get_pfn(old_hl2e));
2651 if ( !get_shadow_ref(smfn) )
2652 BUG();
2653 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
2654 l2e_create_pfn(smfn, __PAGE_HYPERVISOR);
2655 if ( l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
2656 put_shadow_ref(l2e_get_pfn(old_sl2e));
2658 // XXX - maybe this can be optimized somewhat??
2659 local_flush_tlb();
2664 /************************************************************************/
2665 /************************************************************************/
2666 /************************************************************************/
2668 #if SHADOW_DEBUG
2670 // BUG: these are not SMP safe...
2671 static int sh_l2_present;
2672 static int sh_l1_present;
2673 char * sh_check_name;
2674 int shadow_status_noswap;
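// v2m(): debug helper used by FAIL() below -- it walks the shadow
// linear mappings to translate a hypervisor virtual address into the
// machine address it currently maps to, yielding an all-ones value
// when no mapping is present.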
2676 #define v2m(_ed, _adr) ({ \
2677 unsigned long _a = (unsigned long)(_adr); \
2678 l2_pgentry_t _pde = shadow_linear_l2_table(_ed)[l2_table_offset(_a)]; \
2679 unsigned long _pa = -1; \
2680 if ( l2e_get_flags(_pde) & _PAGE_PRESENT ) \
2681 { \
2682 l1_pgentry_t _pte; \
2683 _pte = shadow_linear_pg_table[l1_linear_offset(_a)]; \
2684 if ( l1e_get_flags(_pte) & _PAGE_PRESENT ) \
2685 _pa = l1e_get_phys(_pte); \
2686 } \
2687 _pa | (_a & ~PAGE_MASK); \
2688 })
2690 #define FAIL(_f, _a...) \
2691 do { \
2692 printk("XXX %s-FAIL (%d,%d,%d)" _f " at %s(%d)\n", \
2693 sh_check_name, level, l2_idx, l1_idx, ## _a, \
2694 __FILE__, __LINE__); \
2695 printk("g=%08lx s=%08lx &g=%08lx &s=%08lx" \
2696 " v2m(&g)=%08lx v2m(&s)=%08lx ea=%08lx\n", \
2697 gpte, spte, pgpte, pspte, \
2698 v2m(ed, pgpte), v2m(ed, pspte), \
2699 (l2_idx << L2_PAGETABLE_SHIFT) | \
2700 (l1_idx << L1_PAGETABLE_SHIFT)); \
2701 errors++; \
2702 } while ( 0 )
2704 static int check_pte(
2705 struct exec_domain *ed, l1_pgentry_t *pgpte, l1_pgentry_t *pspte,
2706 int level, int l2_idx, int l1_idx, int oos_ptes)
2708 struct domain *d = ed->domain;
2709 l1_pgentry_t gpte = *pgpte;
2710 l1_pgentry_t spte = *pspte;
2711 unsigned long mask, gpfn, smfn, gmfn;
2712 int errors = 0;
2713 int page_table_page;
2715 if ( (l1e_get_value(spte) == 0) ||
2716 (l1e_get_value(spte) == 0xdeadface) ||
2717 (l1e_get_value(spte) == 0x00000E00) )
2718 return errors; /* always safe */
2720 if ( !(l1e_get_flags(spte) & _PAGE_PRESENT) )
2721 FAIL("Non zero not present spte");
2723 if ( level == 2 ) sh_l2_present++;
2724 if ( level == 1 ) sh_l1_present++;
2726 if ( !(l1e_get_flags(gpte) & _PAGE_PRESENT) )
2727 FAIL("Guest not present yet shadow is");
2729 mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|PAGE_MASK);
2731 if ( (l1e_get_value(spte) & mask) != (l1e_get_value(gpte) & mask) )
2732 FAIL("Corrupt?");
2734 if ( (level == 1) &&
2735 (l1e_get_flags(spte) & _PAGE_DIRTY) &&
2736 !(l1e_get_flags(gpte) & _PAGE_DIRTY) && !oos_ptes )
2737 FAIL("Dirty coherence");
2739 if ( (l1e_get_flags(spte) & _PAGE_ACCESSED) &&
2740 !(l1e_get_flags(gpte) & _PAGE_ACCESSED) && !oos_ptes )
2741 FAIL("Accessed coherence");
2743 if ( l1e_get_flags(spte) & _PAGE_GLOBAL )
2744 FAIL("global bit set in shadow");
2746 smfn = l1e_get_pfn(spte);
2747 gpfn = l1e_get_pfn(gpte);
2748 gmfn = __gpfn_to_mfn(d, gpfn);
2750 if ( !VALID_MFN(gmfn) )
2751 FAIL("invalid gpfn=%p gpte=%p\n", __func__, gpfn,
2752 l1e_get_value(gpte));
2754 page_table_page = mfn_is_page_table(gmfn);
2756 if ( (l1e_get_flags(spte) & _PAGE_RW ) &&
2757 !(l1e_get_flags(gpte) & _PAGE_RW) && !oos_ptes )
2759 printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d "
2760 "oos_ptes=%d\n",
2761 gpfn, gmfn, smfn,
2762 frame_table[gmfn].u.inuse.type_info,
2763 page_table_page, oos_ptes);
2764 FAIL("RW coherence");
2767 if ( (level == 1) &&
2768 (l1e_get_flags(spte) & _PAGE_RW ) &&
2769 !((l1e_get_flags(gpte) & _PAGE_RW) &&
2770 (l1e_get_flags(gpte) & _PAGE_DIRTY)) &&
2771 !oos_ptes )
2773 printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d "
2774 "oos_ptes=%d\n",
2775 gpfn, gmfn, smfn,
2776 frame_table[gmfn].u.inuse.type_info,
2777 page_table_page, oos_ptes);
2778 FAIL("RW2 coherence");
2781 if ( gmfn == smfn )
2783 if ( level > 1 )
2784 FAIL("Linear map ???"); /* XXX this will fail on BSD */
2786 else
2788 if ( level < 2 )
2789 FAIL("Shadow in L1 entry?");
2791 if ( level == 2 )
2793 if ( __shadow_status(d, gpfn, PGT_l1_shadow) != smfn )
2794 FAIL("smfn problem gpfn=%p smfn=%p", gpfn,
2795 __shadow_status(d, gpfn, PGT_l1_shadow));
2797 else
2798 BUG(); // XXX -- not handled yet.
2801 return errors;
2803 #undef FAIL
2804 #undef v2m
2806 static int check_l1_table(
2807 struct exec_domain *ed, unsigned long gpfn,
2808 unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
2810 struct domain *d = ed->domain;
2811 int i;
2812 l1_pgentry_t *gpl1e, *spl1e;
2813 int errors = 0, oos_ptes = 0;
2815 if ( page_out_of_sync(pfn_to_page(gmfn)) )
2817 gmfn = __shadow_status(d, gpfn, PGT_snapshot);
2818 oos_ptes = 1;
2819 ASSERT(gmfn);
2822 gpl1e = map_domain_mem(gmfn << PAGE_SHIFT);
2823 spl1e = map_domain_mem(smfn << PAGE_SHIFT);
2825 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2826 errors += check_pte(ed, &gpl1e[i], &spl1e[i], 1, l2_idx, i, oos_ptes);
2828 unmap_domain_mem(spl1e);
2829 unmap_domain_mem(gpl1e);
2831 return errors;
2834 #define FAILPT(_f, _a...) \
2835 do { \
2836 printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \
2837 errors++; \
2838 } while ( 0 )
2840 int check_l2_table(
2841 struct exec_domain *ed, unsigned long gmfn, unsigned long smfn, int oos_pdes)
2843 struct domain *d = ed->domain;
2844 l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_mem(gmfn << PAGE_SHIFT);
2845 l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
2846 l2_pgentry_t match;
2847 int i;
2848 int errors = 0;
2849 int limit;
2851 if ( !oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != d) )
2852 FAILPT("domain doesn't own page");
2853 if ( oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != NULL) )
2854 FAILPT("bogus owner for snapshot page");
2855 if ( page_get_owner(pfn_to_page(smfn)) != NULL )
2856 FAILPT("shadow page mfn=0x%08x is owned by someone, domid=%d",
2857 smfn, page_get_owner(pfn_to_page(smfn))->id);
2859 #if 0
2860 if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
2861 &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
2862 ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
2863 DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) )
2865 for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2866 i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT);
2867 i++ )
2868 printk("+++ (%d) %p %p\n",i,
2869 l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]));
2870 FAILPT("hypervisor entries inconsistent");
2873 if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
2874 l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
2875 FAILPT("hypervisor linear map inconsistent");
2876 #endif
2878 match = l2e_create_pfn(smfn, __PAGE_HYPERVISOR);
2879 if ( !shadow_mode_external(d) &&
2880 l2e_has_changed(&spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT],
2881 &match, PAGE_FLAG_MASK))
2883 FAILPT("hypervisor shadow linear map inconsistent %p %p",
2884 l2e_get_value(spl2e[SH_LINEAR_PT_VIRT_START >>
2885 L2_PAGETABLE_SHIFT]),
2886 l2e_get_value(match));
2889 match = l2e_create_phys(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
2890 if ( !shadow_mode_external(d) &&
2891 l2e_has_changed(&spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT],
2892 &match, PAGE_FLAG_MASK))
2894 FAILPT("hypervisor per-domain map inconsistent saw %p, expected (va=%p) %p",
2895 l2e_get_value(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]),
2896 d->arch.mm_perdomain_pt,
2897 l2e_get_value(match));
2900 #ifdef __i386__
2901 if ( shadow_mode_external(d) )
2902 limit = L2_PAGETABLE_ENTRIES;
2903 else
2904 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2905 #else
2906 limit = 0; /* XXX x86/64 XXX */
2907 #endif
2909 /* Check the whole L2. */
2910 for ( i = 0; i < limit; i++ )
2911 errors += check_pte(ed,
2912 (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */
2913 (l1_pgentry_t*)(&spl2e[i]),
2914 2, i, 0, 0);
2916 unmap_domain_mem(spl2e);
2917 unmap_domain_mem(gpl2e);
2919 #if 1
2920 if ( errors )
2921 printk("check_l2_table returning %d errors\n", errors);
2922 #endif
2924 return errors;
2926 #undef FAILPT
2928 int _check_pagetable(struct exec_domain *ed, char *s)
2930 struct domain *d = ed->domain;
2931 pagetable_t pt = ed->arch.guest_table;
2932 unsigned long gptbase = pagetable_val(pt);
2933 unsigned long ptbase_pfn, smfn;
2934 unsigned long i;
2935 l2_pgentry_t *gpl2e, *spl2e;
2936 unsigned long ptbase_mfn = 0;
2937 int errors = 0, limit, oos_pdes = 0;
2939 //_audit_domain(d, AUDIT_QUIET);
2940 shadow_lock(d);
2942 sh_check_name = s;
2943 //SH_VVLOG("%s-PT Audit", s);
2944 sh_l2_present = sh_l1_present = 0;
2945 perfc_incrc(check_pagetable);
2947 ptbase_mfn = gptbase >> PAGE_SHIFT;
2948 ptbase_pfn = __mfn_to_gpfn(d, ptbase_mfn);
2950 if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
2952 printk("%s-PT %p not shadowed\n", s, gptbase);
2953 goto out;
2955 if ( page_out_of_sync(pfn_to_page(ptbase_mfn)) )
2957 ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
2958 oos_pdes = 1;
2959 ASSERT(ptbase_mfn);
2962 errors += check_l2_table(ed, ptbase_mfn, smfn, oos_pdes);
2964 gpl2e = (l2_pgentry_t *) map_domain_mem( ptbase_mfn << PAGE_SHIFT );
2965 spl2e = (l2_pgentry_t *) map_domain_mem( smfn << PAGE_SHIFT );
2967 /* Go back and recurse. */
2968 #ifdef __i386__
2969 if ( shadow_mode_external(d) )
2970 limit = L2_PAGETABLE_ENTRIES;
2971 else
2972 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2973 #else
2974 limit = 0; /* XXX x86/64 XXX */
2975 #endif
2977 for ( i = 0; i < limit; i++ )
2979 unsigned long gl1pfn = l2e_get_pfn(gpl2e[i]);
2980 unsigned long gl1mfn = __gpfn_to_mfn(d, gl1pfn);
2981 unsigned long sl1mfn = l2e_get_pfn(spl2e[i]);
2983 if ( l2e_get_value(spl2e[i]) != 0 ) /* FIXME: check flags? */
2985 errors += check_l1_table(ed, gl1pfn, gl1mfn, sl1mfn, i);
2989 unmap_domain_mem(spl2e);
2990 unmap_domain_mem(gpl2e);
2992 #if 0
2993 SH_VVLOG("PT verified : l2_present = %d, l1_present = %d",
2994 sh_l2_present, sh_l1_present);
2995 #endif
2997 out:
2998 if ( errors )
2999 BUG();
3001 shadow_unlock(d);
3003 return errors;
3006 int _check_all_pagetables(struct exec_domain *ed, char *s)
3008 struct domain *d = ed->domain;
3009 int i;
3010 struct shadow_status *a;
3011 unsigned long gmfn;
3012 int errors = 0;
3014 shadow_status_noswap = 1;
3016 sh_check_name = s;
3017 SH_VVLOG("%s-PT Audit domid=%d", s, d->id);
3018 sh_l2_present = sh_l1_present = 0;
3019 perfc_incrc(check_all_pagetables);
3021 for (i = 0; i < shadow_ht_buckets; i++)
3023 a = &d->arch.shadow_ht[i];
3024 while ( a && a->gpfn_and_flags )
3026 gmfn = __gpfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
3028 switch ( a->gpfn_and_flags & PGT_type_mask )
3030 case PGT_l1_shadow:
3031 errors += check_l1_table(ed, a->gpfn_and_flags & PGT_mfn_mask,
3032 gmfn, a->smfn, 0);
3033 break;
3034 case PGT_l2_shadow:
3035 errors += check_l2_table(ed, gmfn, a->smfn,
3036 page_out_of_sync(pfn_to_page(gmfn)));
3037 break;
3038 case PGT_l3_shadow:
3039 case PGT_l4_shadow:
3040 case PGT_hl2_shadow:
3041 BUG(); // XXX - ought to fix this...
3042 break;
3043 case PGT_snapshot:
3044 case PGT_writable_pred:
3045 break;
3046 default:
3047 errors++;
3048 printk("unexpected shadow type %p, gpfn=%p, "
3049 "gmfn=%p smfn=%p\n",
3050 a->gpfn_and_flags & PGT_type_mask,
3051 a->gpfn_and_flags & PGT_mfn_mask,
3052 gmfn, a->smfn);
3053 BUG();
3055 a = a->next;
3059 shadow_status_noswap = 0;
3061 if ( errors )
3062 BUG();
3064 return errors;
3067 #endif // SHADOW_DEBUG
3069 /*
3070 * Local variables:
3071 * mode: C
3072 * c-set-style: "BSD"
3073 * c-basic-offset: 4
3074 * tab-width: 4
3075 * indent-tabs-mode: nil
3076 * End:
3077 */