# HG changeset patch
# User kaf24@freefall.cl.cam.ac.uk
# Date 1097677521 0
# Node ID 0174982516f66ad06b1b45d0f4fd1818277ae32f
# Parent 7565994e86cb7b3ada63f29aaa2a3b50df2fc85e
bitkeeper revision 1.1159.1.229 (416d3ad1BpCS1RVPjkX14HUpsanlGw)

Shadow pagetable walkthrough.

diff -r 7565994e86cb -r 0174982516f6 tools/libxc/xc_linux_save.c
--- a/tools/libxc/xc_linux_save.c Wed Oct 13 03:33:39 2004 +0000
+++ b/tools/libxc/xc_linux_save.c Wed Oct 13 14:25:21 2004 +0000
@@ -210,7 +210,7 @@ static int analysis_phase( int xc_handle
     int i;
 
     xc_shadow_control( xc_handle, domid, 
-                       DOM0_SHADOW_CONTROL_OP_CLEAN2,
+                       DOM0_SHADOW_CONTROL_OP_CLEAN,
                        arr, nr_pfns, NULL);
     printf("#Flush\n");
     for ( i = 0; i < 100; i++ )
@@ -829,7 +829,7 @@ int xc_linux_save(int xc_handle, XcIOCon
     }
 
     if ( xc_shadow_control( xc_handle, domid, 
-                            DOM0_SHADOW_CONTROL_OP_CLEAN2,
+                            DOM0_SHADOW_CONTROL_OP_CLEAN,
                             to_send, nr_pfns, &stats ) != nr_pfns )
     {
         xcio_error(ioctxt, "Error flushing shadow PT");
diff -r 7565994e86cb -r 0174982516f6 xen/arch/x86/memory.c
--- a/xen/arch/x86/memory.c Wed Oct 13 03:33:39 2004 +0000
+++ b/xen/arch/x86/memory.c Wed Oct 13 14:25:21 2004 +0000
@@ -1503,7 +1503,7 @@ int do_update_va_mapping(unsigned long p
         {
             unsigned long sval;
 
-            l1pte_no_fault(&d->mm, &val, &sval);
+            l1pte_propagate_from_guest(&d->mm, &val, &sval);
 
             if ( unlikely(__put_user(sval, ((unsigned long *)(
                 &shadow_linear_pg_table[page_nr])))) )
@@ -1521,9 +1521,9 @@ int do_update_va_mapping(unsigned long p
          * for this.
          */
         if ( d->mm.shadow_mode == SHM_logdirty )
-            mark_dirty( &current->mm, va_to_l1mfn(page_nr<<PAGE_SHIFT) );
+            mark_dirty(&d->mm, va_to_l1mfn(page_nr << PAGE_SHIFT));
 
-        check_pagetable(d, d->mm.pagetable, "va"); /* debug */
+        check_pagetable(&d->mm, d->mm.pagetable, "va"); /* debug */
     }
 
     deferred_ops = percpu_info[cpu].deferred_ops;
@@ -1613,7 +1613,7 @@ void ptwr_flush(const int which)
 
     if ( unlikely(d->mm.shadow_mode) )
     {
        /* Write-protect the p.t. page in the shadow page table. 
*/ - l1pte_no_fault(&d->mm, &pte, &spte); + l1pte_propagate_from_guest(&d->mm, &pte, &spte); __put_user( spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]); @@ -1657,7 +1657,7 @@ void ptwr_flush(const int which) if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) ) { if ( unlikely(sl1e != NULL) ) - l1pte_no_fault( + l1pte_propagate_from_guest( &d->mm, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i])); put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]); @@ -1672,7 +1672,7 @@ void ptwr_flush(const int which) } if ( unlikely(sl1e != NULL) ) - l1pte_no_fault( + l1pte_propagate_from_guest( &d->mm, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i])); if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) ) diff -r 7565994e86cb -r 0174982516f6 xen/arch/x86/shadow.c --- a/xen/arch/x86/shadow.c Wed Oct 13 03:33:39 2004 +0000 +++ b/xen/arch/x86/shadow.c Wed Oct 13 14:25:21 2004 +0000 @@ -9,7 +9,6 @@ #include #include - /******** To use these shadow page tables, guests must not rely on the ACCESSED @@ -28,166 +27,141 @@ hypercall lock anyhow (at least initiall ********/ - -static inline void free_shadow_page(struct mm_struct *m, - struct pfn_info *page) +static inline void free_shadow_page( + struct mm_struct *m, struct pfn_info *page) { - unsigned long type = page->u.inuse.type_info & PGT_type_mask; - m->shadow_page_count--; - if (type == PGT_l1_page_table) + switch ( page->u.inuse.type_info & PGT_type_mask ) + { + case PGT_l1_page_table: perfc_decr(shadow_l1_pages); - else if (type == PGT_l2_page_table) + break; + + case PGT_l2_page_table: perfc_decr(shadow_l2_pages); - else printk("Free shadow weird page type pfn=%08x type=%08x\n", - frame_table-page, page->u.inuse.type_info); - + break; + + default: + printk("Free shadow weird page type pfn=%08x type=%08x\n", + frame_table-page, page->u.inuse.type_info); + break; + } + free_domheap_page(page); } -static void __free_shadow_table( struct mm_struct *m ) +static void __free_shadow_table(struct mm_struct *m) { - int j, free=0; - struct shadow_status *a,*next; + int i, free = 0; + struct shadow_status *x, *n; - // the code assumes you're not using the page tables i.e. - // the domain is stopped and cr3 is something else!! + /* + * WARNING! The shadow page table must not currently be in use! + * e.g., You are expected to have paused the domain and synchronized CR3. + */ + + shadow_audit(m, 1); - // walk the hash table and call free_shadow_page on all pages + /* Free each hash chain in turn. */ + for ( i = 0; i < shadow_ht_buckets; i++ ) + { + /* Skip empty buckets. */ + x = &m->shadow_ht[i]; + if ( x->pfn == 0 ) + continue; + + /* Free the head page. */ + free_shadow_page( + m, &frame_table[x->spfn_and_flags & PSH_pfn_mask]); - shadow_audit(m,1); + /* Reinitialise the head node. */ + x->pfn = 0; + x->spfn_and_flags = 0; + n = x->next; + x->next = NULL; + + free++; - for(j=0;jshadow_ht[j]; - if (a->pfn) - { - free_shadow_page( m, - &frame_table[a->spfn_and_flags & PSH_pfn_mask] ); - a->pfn = 0; - a->spfn_and_flags = 0; + /* Iterate over non-head nodes. */ + for ( x = n; x != NULL; x = n ) + { + /* Free the shadow page. */ + free_shadow_page( + m, &frame_table[x->spfn_and_flags & PSH_pfn_mask]); + + /* Re-initialise the chain node. */ + x->pfn = 0; + x->spfn_and_flags = 0; + + /* Add to the free list. 
*/ + n = x->next; + x->next = m->shadow_ht_free; + m->shadow_ht_free = x; + free++; } - next=a->next; - a->next=NULL; - a=next; - while(a) - { - struct shadow_status *next = a->next; - free_shadow_page( m, - &frame_table[a->spfn_and_flags & PSH_pfn_mask] ); - a->pfn = 0; - a->spfn_and_flags = 0; - free++; - a->next = m->shadow_ht_free; - m->shadow_ht_free = a; - a=next; - } - shadow_audit(m,0); + shadow_audit(m, 0); } - SH_LOG("Free shadow table. Freed= %d",free); + + SH_LOG("Free shadow table. Freed=%d.", free); } - -#define TABLE_OP_ZERO_L2 1 -#define TABLE_OP_ZERO_L1 2 -#define TABLE_OP_FREE_L1 3 +static inline int __clear_shadow_page( + struct mm_struct *m, struct shadow_status *x) +{ + unsigned long *p; + int restart = 0; + struct pfn_info *spage = &frame_table[x->spfn_and_flags & PSH_pfn_mask]; -static inline int shadow_page_op( struct mm_struct *m, unsigned int op, - unsigned int gpfn, - struct pfn_info *spfn_info, int *work ) -{ - unsigned int spfn = spfn_info-frame_table; - int restart = 0; - - switch( op ) + switch ( spage->u.inuse.type_info & PGT_type_mask ) { - case TABLE_OP_ZERO_L2: - { - if ( (spfn_info->u.inuse.type_info & PGT_type_mask) == - PGT_l2_page_table ) - { - unsigned long * spl1e = map_domain_mem( spfn<u.inuse.type_info & PGT_type_mask) == - PGT_l1_page_table ) - { - unsigned long * spl1e = map_domain_mem( spfn<u.inuse.type_info & PGT_type_mask) == - PGT_l1_page_table ) - { - // lock is already held - delete_shadow_status( m, gpfn ); - free_shadow_page( m, spfn_info ); - restart = 1; // we need to go to start of list again - } + /* We clear L1 pages by freeing them: no benefit from zeroing them. */ + case PGT_l1_page_table: + delete_shadow_status(m, x->pfn); + free_shadow_page(m, spage); + restart = 1; /* We need to go to start of list again. */ + break; } - break; - - default: - BUG(); - - } return restart; } -static void __scan_shadow_table( struct mm_struct *m, unsigned int op ) +static void __clear_shadow_state(struct mm_struct *m) { - int j, work=0; - struct shadow_status *a, *next; + int i; + struct shadow_status *x; - // the code assumes you're not using the page tables i.e. - // the domain is stopped and cr3 is something else!! + shadow_audit(m, 1); - // walk the hash table and call free_shadow_page on all pages - - shadow_audit(m,1); - - for(j=0;jshadow_ht[j]; - next = a->next; - if (a->pfn) - { - if ( shadow_page_op( m, op, a->pfn, - &frame_table[a->spfn_and_flags & PSH_pfn_mask], - &work ) ) - goto retry; - } - a=next; - while(a) - { - next = a->next; - if ( shadow_page_op( m, op, a->pfn, - &frame_table[a->spfn_and_flags & PSH_pfn_mask], - &work ) ) - goto retry; - a=next; - } - shadow_audit(m,0); + retry: + /* Skip empty buckets. */ + x = &m->shadow_ht[i]; + if ( x->pfn == 0 ) + continue; + + if ( __clear_shadow_page(m, x) ) + goto retry; + + for ( x = x->next; x != NULL; x = x->next ) + if ( __clear_shadow_page(m, x) ) + goto retry; + + shadow_audit(m, 0); } - SH_VLOG("Scan shadow table. Work=%d l1=%d l2=%d", work, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages)); + + SH_VLOG("Scan shadow table. 
l1=%d l2=%d", + perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages)); } @@ -195,297 +169,215 @@ void shadow_mode_init(void) { } -int shadow_mode_enable( struct domain *p, unsigned int mode ) +int shadow_mode_enable(struct domain *p, unsigned int mode) { struct mm_struct *m = &p->mm; - struct shadow_status **fptr; - int i; - // allocate hashtable - m->shadow_ht = xmalloc(shadow_ht_buckets * - sizeof(struct shadow_status)); - if( m->shadow_ht == NULL ) - goto nomem; - - memset(m->shadow_ht, 0, shadow_ht_buckets * sizeof(struct shadow_status)); - - // allocate space for first lot of extra nodes - m->shadow_ht_extras = xmalloc(sizeof(void*) + - (shadow_ht_extra_size * - sizeof(struct shadow_status))); - if( m->shadow_ht_extras == NULL ) + m->shadow_ht = xmalloc( + shadow_ht_buckets * sizeof(struct shadow_status)); + if ( m->shadow_ht == NULL ) goto nomem; - - memset( m->shadow_ht_extras, 0, sizeof(void*) + (shadow_ht_extra_size * - sizeof(struct shadow_status)) ); - - m->shadow_extras_count++; - - // add extras to free list - fptr = &m->shadow_ht_free; - for ( i=0; ishadow_ht_extras[i]; - fptr = &(m->shadow_ht_extras[i].next); - } - *fptr = NULL; - *((struct shadow_status ** ) - &m->shadow_ht_extras[shadow_ht_extra_size]) = NULL; + memset(m->shadow_ht, 0, shadow_ht_buckets * sizeof(struct shadow_status)); if ( mode == SHM_logdirty ) { - m->shadow_dirty_bitmap_size = (p->max_pages+63)&(~63); + m->shadow_dirty_bitmap_size = (p->max_pages + 63) & ~63; m->shadow_dirty_bitmap = - xmalloc( m->shadow_dirty_bitmap_size/8); - if( m->shadow_dirty_bitmap == NULL ) + xmalloc(m->shadow_dirty_bitmap_size/8); + if ( m->shadow_dirty_bitmap == NULL ) { m->shadow_dirty_bitmap_size = 0; - BUG(); goto nomem; } - memset(m->shadow_dirty_bitmap,0,m->shadow_dirty_bitmap_size/8); + memset(m->shadow_dirty_bitmap, 0, m->shadow_dirty_bitmap_size/8); } m->shadow_mode = mode; - // call shadow_mk_pagetable - __shadow_mk_pagetable( m ); + __shadow_mk_pagetable(m); return 0; -nomem: - if( m->shadow_ht ) { - xfree( m->shadow_ht ); m->shadow_ht = NULL; }; - - if( m->shadow_ht_extras ) { - xfree( m->shadow_ht_extras ); m->shadow_ht_extras = NULL; }; - + nomem: + if ( m->shadow_ht != NULL ) + xfree( m->shadow_ht ); + m->shadow_ht = NULL; return -ENOMEM; } void __shadow_mode_disable(struct domain *d) { struct mm_struct *m = &d->mm; - struct shadow_status *next; + struct shadow_status *x, *n; __free_shadow_table(m); m->shadow_mode = 0; SH_VLOG("freed tables count=%d l1=%d l2=%d", - m->shadow_page_count, perfc_value(shadow_l1_pages), - perfc_value(shadow_l2_pages)); + m->shadow_page_count, perfc_value(shadow_l1_pages), + perfc_value(shadow_l2_pages)); - next = m->shadow_ht_extras; - while ( next ) + n = m->shadow_ht_extras; + while ( (x = n) != NULL ) { - struct shadow_status * this = next; m->shadow_extras_count--; - next = *((struct shadow_status **)(&next[shadow_ht_extra_size])); - xfree(this); + n = *((struct shadow_status **)(&x[shadow_ht_extra_size])); + xfree(x); } + m->shadow_ht_extras = NULL; + ASSERT(m->shadow_extras_count == 0); SH_LOG("freed extras, now %d", m->shadow_extras_count); - if ( m->shadow_dirty_bitmap ) + if ( m->shadow_dirty_bitmap != NULL ) { - xfree( m->shadow_dirty_bitmap ); + xfree(m->shadow_dirty_bitmap); m->shadow_dirty_bitmap = 0; m->shadow_dirty_bitmap_size = 0; } - // free the hashtable itself - xfree( m->shadow_ht ); - - m->shadow_ht = NULL; - m->shadow_ht_extras = NULL; + xfree(m->shadow_ht); + m->shadow_ht = NULL; } -static int shadow_mode_table_op(struct domain *d, - dom0_shadow_control_t 
*sc) +static int shadow_mode_table_op( + struct domain *d, dom0_shadow_control_t *sc) { - unsigned int op = sc->op; + unsigned int op = sc->op; struct mm_struct *m = &d->mm; - int rc = 0; - - // since Dom0 did the hypercall, we should be running with it's page - // tables right now. Calling flush on yourself would be really - // stupid. + int i, rc = 0; ASSERT(spin_is_locked(&d->mm.shadow_lock)); - if ( m == ¤t->mm ) - { - printk("Don't try and flush your own page tables!\n"); - return -EINVAL; - } - - SH_VLOG("shadow mode table op %08lx %08lx count %d",pagetable_val( m->pagetable),pagetable_val(m->shadow_table), m->shadow_page_count); + SH_VLOG("shadow mode table op %08lx %08lx count %d", + pagetable_val(m->pagetable), pagetable_val(m->shadow_table), + m->shadow_page_count); - shadow_audit(m,1); + shadow_audit(m, 1); - switch(op) + switch ( op ) { case DOM0_SHADOW_CONTROL_OP_FLUSH: __free_shadow_table( m ); - d->mm.shadow_fault_count = 0; - d->mm.shadow_dirty_count = 0; - d->mm.shadow_dirty_net_count = 0; - d->mm.shadow_dirty_block_count = 0; + d->mm.shadow_fault_count = 0; + d->mm.shadow_dirty_count = 0; + d->mm.shadow_dirty_net_count = 0; + d->mm.shadow_dirty_block_count = 0; break; - case DOM0_SHADOW_CONTROL_OP_CLEAN: // zero all-non hypervisor - { - __scan_shadow_table( m, TABLE_OP_ZERO_L2 ); - __scan_shadow_table( m, TABLE_OP_ZERO_L1 ); + case DOM0_SHADOW_CONTROL_OP_CLEAN: + __clear_shadow_state(m); - goto send_bitmap; - } - + sc->stats.fault_count = d->mm.shadow_fault_count; + sc->stats.dirty_count = d->mm.shadow_dirty_count; + sc->stats.dirty_net_count = d->mm.shadow_dirty_net_count; + sc->stats.dirty_block_count = d->mm.shadow_dirty_block_count; - case DOM0_SHADOW_CONTROL_OP_CLEAN2: // zero all L2, free L1s - { - int i,j,zero=1; - - __scan_shadow_table( m, TABLE_OP_ZERO_L2 ); - __scan_shadow_table( m, TABLE_OP_FREE_L1 ); - - send_bitmap: - sc->stats.fault_count = d->mm.shadow_fault_count; - sc->stats.dirty_count = d->mm.shadow_dirty_count; - sc->stats.dirty_net_count = d->mm.shadow_dirty_net_count; - sc->stats.dirty_block_count = d->mm.shadow_dirty_block_count; - - d->mm.shadow_fault_count = 0; - d->mm.shadow_dirty_count = 0; - d->mm.shadow_dirty_net_count = 0; - d->mm.shadow_dirty_block_count = 0; - - sc->pages = d->max_pages; + d->mm.shadow_fault_count = 0; + d->mm.shadow_dirty_count = 0; + d->mm.shadow_dirty_net_count = 0; + d->mm.shadow_dirty_block_count = 0; + + if ( (d->max_pages > sc->pages) || + (sc->dirty_bitmap == NULL) || + (d->mm.shadow_dirty_bitmap == NULL) ) + { + rc = -EINVAL; + goto out; + } + + sc->pages = d->max_pages; - if( d->max_pages > sc->pages || - !sc->dirty_bitmap || !d->mm.shadow_dirty_bitmap ) - { - rc = -EINVAL; - goto out; - } +#define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */ + for ( i = 0; i < d->max_pages; i += chunk ) + { + int bytes = ((((d->max_pages - i) > chunk) ? + chunk : (d->max_pages - i)) + 7) / 8; + + copy_to_user( + sc->dirty_bitmap + (i/(8*sizeof(unsigned long))), + d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))), + bytes); + + memset( + d->mm.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))), + 0, bytes); + } + + break; - -#define chunk (8*1024) // do this in 1KB chunks for L1 cache - - for(i=0;imax_pages;i+=chunk) - { - int bytes = (( ((d->max_pages-i) > (chunk))? 
- (chunk):(d->max_pages-i) ) + 7) / 8; - - copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))), - d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))), - bytes ); - - for(j=0; zero && jmm.shadow_dirty_bitmap[j] != 0 ) - zero = 0; - } + case DOM0_SHADOW_CONTROL_OP_PEEK: + sc->stats.fault_count = d->mm.shadow_fault_count; + sc->stats.dirty_count = d->mm.shadow_dirty_count; + sc->stats.dirty_net_count = d->mm.shadow_dirty_net_count; + sc->stats.dirty_block_count = d->mm.shadow_dirty_block_count; + + if ( (d->max_pages > sc->pages) || + (sc->dirty_bitmap == NULL) || + (d->mm.shadow_dirty_bitmap == NULL) ) + { + rc = -EINVAL; + goto out; + } + + sc->pages = d->max_pages; + copy_to_user( + sc->dirty_bitmap, d->mm.shadow_dirty_bitmap, (d->max_pages+7)/8); - memset( d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))), - 0, bytes); - } + break; -#if 0 /* This optimisation is dangerous for some uses of this function. - disable for the moment */ - /* Might as well stop the domain as an optimization. */ - if ( zero ) - domain_pause_by_systemcontroller(d); -#endif - - break; + default: + BUG(); } - case DOM0_SHADOW_CONTROL_OP_PEEK: - { - int i; - - sc->stats.fault_count = d->mm.shadow_fault_count; - sc->stats.dirty_count = d->mm.shadow_dirty_count; - sc->stats.dirty_net_count = d->mm.shadow_dirty_net_count; - sc->stats.dirty_block_count = d->mm.shadow_dirty_block_count; - - if( d->max_pages > sc->pages || - !sc->dirty_bitmap || !d->mm.shadow_dirty_bitmap ) - { - rc = -EINVAL; - goto out; - } - - sc->pages = d->max_pages; - -#define chunk (8*1024) // do this in 1KB chunks for L1 cache - - for(i=0;imax_pages;i+=chunk) - { - int bytes = (( ((d->max_pages-i) > (chunk))? - (chunk):(d->max_pages-i) ) + 7) / 8; - - copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))), - d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))), - bytes ); - } - - break; - } - - default: - BUG(); - - } - - -out: - + out: SH_VLOG("shadow mode table op : page count %d", m->shadow_page_count); - - shadow_audit(m,1); - - // call shadow_mk_pagetable - __shadow_mk_pagetable( m ); - + shadow_audit(m, 1); + __shadow_mk_pagetable(m); return rc; } int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc) { unsigned int cmd = sc->op; - int rc = 0; + int rc = 0; + + if ( unlikely(d == current) ) + { + DPRINTK("Don't try to do a shadow op on yourself!\n"); + return -EINVAL; + } domain_pause(d); synchronise_pagetables(~0UL); shadow_lock(&d->mm); - if ( cmd == DOM0_SHADOW_CONTROL_OP_OFF ) + switch ( cmd ) { + case DOM0_SHADOW_CONTROL_OP_OFF: shadow_mode_disable(d); - } - else if ( cmd == DOM0_SHADOW_CONTROL_OP_ENABLE_TEST ) - { + break; + + case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST: shadow_mode_disable(d); rc = shadow_mode_enable(d, SHM_test); - } - else if ( cmd == DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY ) - { + break; + + case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY: shadow_mode_disable(d); rc = shadow_mode_enable(d, SHM_logdirty); - } - else if ( shadow_mode(d) && - (cmd >= DOM0_SHADOW_CONTROL_OP_FLUSH) && - (cmd <= DOM0_SHADOW_CONTROL_OP_CLEAN2) ) - { - rc = shadow_mode_table_op(d, sc); - } - else - { - rc = -EINVAL; + break; + + default: + if ( shadow_mode(d) && + (cmd >= DOM0_SHADOW_CONTROL_OP_FLUSH) && + (cmd <= DOM0_SHADOW_CONTROL_OP_PEEK) ) + rc = shadow_mode_table_op(d, sc); + else + rc = -EINVAL; + break; } shadow_unlock(&d->mm); @@ -497,80 +389,74 @@ int shadow_mode_control(struct domain *d static inline struct pfn_info *alloc_shadow_page(struct mm_struct *m) { - struct pfn_info *page; + struct 
pfn_info *page = alloc_domheap_page(NULL); + m->shadow_page_count++; - page = alloc_domheap_page(NULL); - if( unlikely(page == NULL) ) - { - printk("Couldn't alloc shadow page! count=%d\n", - m->shadow_page_count); - SH_VLOG("Shadow tables l1=%d l2=%d", - perfc_value(shadow_l1_pages), - perfc_value(shadow_l2_pages)); - BUG(); // FIXME: try a shadow flush to free up some memory - } + if ( unlikely(page == NULL) ) + { + printk("Couldn't alloc shadow page! count=%d\n", + m->shadow_page_count); + SH_VLOG("Shadow tables l1=%d l2=%d", + perfc_value(shadow_l1_pages), + perfc_value(shadow_l2_pages)); + BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */ + } - return page; + return page; } -void unshadow_table( unsigned long gpfn, unsigned int type ) +void unshadow_table(unsigned long gpfn, unsigned int type) { - unsigned long spfn; - struct domain *d = frame_table[gpfn].u.inuse.domain; + unsigned long spfn; + struct domain *d = frame_table[gpfn].u.inuse.domain; - SH_VLOG("unshadow_table type=%08x gpfn=%08lx", - type, - gpfn ); + SH_VLOG("unshadow_table type=%08x gpfn=%08lx", type, gpfn); perfc_incrc(unshadow_table_count); - // this function is the same for both l1 and l2 tables - - // even in the SMP guest case, there won't be a race here as - // this CPU was the one that cmpxchg'ed the page to invalid - + /* + * This function is the same for all p.t. pages. Even for multi-processor + * guests there won't be a race here as this CPU was the one that + * cmpxchg'ed the page to invalid. + */ spfn = __shadow_status(&d->mm, gpfn) & PSH_pfn_mask; - delete_shadow_status(&d->mm, gpfn); - - free_shadow_page(&d->mm, &frame_table[spfn] ); - + free_shadow_page(&d->mm, &frame_table[spfn]); } - unsigned long shadow_l2_table( - struct mm_struct *m, unsigned long gpfn ) + struct mm_struct *m, unsigned long gpfn) { struct pfn_info *spfn_info; - unsigned long spfn; - l2_pgentry_t *spl2e, *gpl2e; - int i; + unsigned long spfn; + l2_pgentry_t *spl2e; - SH_VVLOG("shadow_l2_table( %08lx )",gpfn); + SH_VVLOG("shadow_l2_table( %08lx )", gpfn); perfc_incrc(shadow_l2_table_count); - // XXX in future, worry about racing in SMP guests - // -- use cmpxchg with PSH_pending flag to show progress (and spin) - - spfn_info = alloc_shadow_page(m); - - ASSERT( spfn_info ); // XXX deal with failure later e.g. blow cache + if ( (spfn_info = alloc_shadow_page(m)) != NULL ) + BUG(); /* XXX Deal gracefully with failure. */ spfn_info->u.inuse.type_info = PGT_l2_page_table; perfc_incr(shadow_l2_pages); - spfn = (unsigned long) (spfn_info - frame_table); + spfn = spfn_info - frame_table; - // mark pfn as being shadowed, update field to point at shadow + /* Mark pfn as being shadowed; update field to point at shadow. */ set_shadow_status(m, gpfn, spfn | PSH_shadowed); - // we need to do this before the linear map is set up - spl2e = (l2_pgentry_t *) map_domain_mem(spfn << PAGE_SHIFT); + spl2e = (l2_pgentry_t *)map_domain_mem(spfn << PAGE_SHIFT); + + /* + * We could proactively fill in PDEs for pages that are already shadowed. + * However, we tried it and it didn't help performance. This is simpler. + */ + memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); #ifdef __i386__ - // get hypervisor and 2x linear PT mapings installed + /* Install hypervisor and 2x linear p.t. mapings. 
*/ memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); @@ -579,218 +465,187 @@ unsigned long shadow_l2_table( spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = - mk_l2_pgentry(__pa(frame_table[gpfn].u.inuse.domain->mm.perdomain_pt) | + mk_l2_pgentry(__pa(frame_table[gpfn].u.inuse.domain->mm.perdomain_pt) | __PAGE_HYPERVISOR); #endif - // can't use the linear map as we may not be in the right PT - gpl2e = (l2_pgentry_t *) map_domain_mem(gpfn << PAGE_SHIFT); - - // proactively create entries for pages that are already shadowed - for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) - { - unsigned long spte = 0; - -#if 0 // Turns out this doesn't really help - unsigned long gpte; - - gpte = l2_pgentry_val(gpl2e[i]); - - if (gpte & _PAGE_PRESENT) - { - unsigned long s_sh = - __shadow_status(p, gpte>>PAGE_SHIFT); + unmap_domain_mem(spl2e); - l2pde_general( m, &gpte, &spte, s_sh ); - - } -#endif - - spl2e[i] = mk_l2_pgentry( spte ); - - } - - // its arguable we should 'preemptively shadow' a few active L1 pages - // to avoid taking a string of faults when 'jacking' a running domain - - unmap_domain_mem( gpl2e ); - unmap_domain_mem( spl2e ); - - SH_VLOG("shadow_l2_table( %08lx -> %08lx)",gpfn,spfn); - + SH_VLOG("shadow_l2_table( %08lx -> %08lx)", gpfn, spfn); return spfn; } +static void shadow_map_l1_into_current_l2(unsigned long va) +{ + struct mm_struct *m = ¤t->mm; + unsigned long *gpl1e, *spl1e, gpde, spde, gl1pfn, sl1pfn, sl1ss; + struct pfn_info *sl1pfn_info; + int i; -int shadow_fault( unsigned long va, long error_code ) + gpde = l2_pgentry_val(linear_l2_table[va >> L2_PAGETABLE_SHIFT]); + + gl1pfn = gpde >> PAGE_SHIFT; + + sl1ss = __shadow_status(m, gl1pfn); + if ( !(sl1ss & PSH_shadowed) ) + { + /* This L1 is NOT already shadowed so we need to shadow it. */ + SH_VVLOG("4a: l1 not shadowed ( %08lx )", sl1pfn); + + sl1pfn_info = alloc_shadow_page(m); + sl1pfn_info->u.inuse.type_info = PGT_l1_page_table; + + sl1pfn = sl1pfn_info - frame_table; + + perfc_incrc(shadow_l1_table_count); + perfc_incr(shadow_l1_pages); + + set_shadow_status(m, gl1pfn, PSH_shadowed | sl1pfn); + + l2pde_general(m, &gpde, &spde, sl1pfn); + + linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde); + shadow_linear_l2_table[va>>L2_PAGETABLE_SHIFT] = + mk_l2_pgentry(spde); + + gpl1e = (unsigned long *) &(linear_pg_table[ + (va>>PAGE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1)]); + + spl1e = (unsigned long *) &shadow_linear_pg_table[ + (va>>PAGE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1)]; + + for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) + l1pte_propagate_from_guest(m, &gpl1e[i], &spl1e[i]); + } + else + { + /* This L1 is shadowed already, but the L2 entry is missing. 
*/ + SH_VVLOG("4b: was shadowed, l2 missing ( %08lx )", sl1pfn); + + sl1pfn = sl1ss & PSH_pfn_mask; + l2pde_general(m, &gpde, &spde, sl1pfn); + + linear_l2_table[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde); + shadow_linear_l2_table[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(spde); + } +} + +int shadow_fault(unsigned long va, long error_code) { unsigned long gpte, spte; struct mm_struct *m = ¤t->mm; SH_VVLOG("shadow_fault( va=%08lx, code=%ld )", va, error_code ); - check_pagetable( current, current->mm.pagetable, "pre-sf" ); + check_pagetable(m, current->mm.pagetable, "pre-sf"); - if ( unlikely(__get_user(gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) ) + /* + * STEP 1. A fast-reject set of checks with no locking. + */ + + if ( unlikely(__get_user(gpte, (unsigned long *) + &linear_pg_table[va >> PAGE_SHIFT])) ) { SH_VVLOG("shadow_fault - EXIT: read gpte faulted" ); - return 0; // propagate to guest + return 0; } - if ( ! (gpte & _PAGE_PRESENT) ) + if ( !(gpte & _PAGE_PRESENT) ) { SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte ); - return 0; // we're not going to be able to help - } - - if ( (error_code & 2) && ! (gpte & _PAGE_RW) ) - { - // write fault on RO page return 0; } - // take the lock and reread gpte + if ( (error_code & 2) && !(gpte & _PAGE_RW) ) + { + /* Write fault on a read-only mapping. */ + return 0; + } + + /* + * STEP 2. Take the shadow lock and re-check the guest PTE. + */ shadow_lock(m); - - if ( unlikely(__get_user(gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) ) + + if ( unlikely(__get_user(gpte, (unsigned long *) + &linear_pg_table[va >> PAGE_SHIFT])) ) { SH_VVLOG("shadow_fault - EXIT: read gpte faulted" ); shadow_unlock(m); - return 0; // propagate to guest + return 0; } if ( unlikely(!(gpte & _PAGE_PRESENT)) ) { SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte ); shadow_unlock(m); - return 0; // we're not going to be able to help + return 0; } - if ( error_code & 2 ) - { // write fault - if ( likely(gpte & _PAGE_RW) ) + /* Write fault? */ + if ( error_code & 2 ) + { + if ( unlikely(!(gpte & _PAGE_RW)) ) { - l1pte_write_fault( m, &gpte, &spte ); + /* Write fault on a read-only mapping. */ + SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%lx)", gpte); + shadow_unlock(m); + return 0; } - else - { // write fault on RO page - SH_VVLOG("shadow_fault - EXIT: write fault on RO page (%lx)",gpte ); - shadow_unlock(m); - return 0; // propagate to guest - // not clear whether we should set accessed bit here... - } + + l1pte_write_fault(m, &gpte, &spte); } else { - l1pte_read_fault( m, &gpte, &spte ); + l1pte_read_fault(m, &gpte, &spte); } - SH_VVLOG("plan: gpte=%08lx spte=%08lx", gpte, spte ); - - // write back updated gpte - // XXX watch out for read-only L2 entries! (not used in Linux) - if ( unlikely( __put_user( gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) ) - domain_crash(); // fixme! - - if ( unlikely( __put_user( spte, (unsigned long*)&shadow_linear_pg_table[va>>PAGE_SHIFT])) ) - { - // failed: - // the L1 may not be shadowed, or the L2 entry may be insufficient - - unsigned long gpde, spde, gl1pfn, sl1pfn, sl1ss; - - SH_VVLOG("3: not shadowed or l2 insufficient gpte=%08lx spte=%08lx",gpte,spte ); - - gpde = l2_pgentry_val(linear_l2_table[va>>L2_PAGETABLE_SHIFT]); - - gl1pfn = gpde>>PAGE_SHIFT; + /* + * STEP 3. Write the modified shadow PTE and guest PTE back to the tables. + */ - sl1ss = __shadow_status(¤t->mm, gl1pfn); - if ( ! 
(sl1ss & PSH_shadowed) ) - { - // this L1 is NOT already shadowed so we need to shadow it - struct pfn_info *sl1pfn_info; - unsigned long *gpl1e, *spl1e; - int i; - sl1pfn_info = alloc_shadow_page( ¤t->mm ); - sl1pfn_info->u.inuse.type_info = PGT_l1_page_table; - - sl1pfn = sl1pfn_info - frame_table; - - SH_VVLOG("4a: l1 not shadowed ( %08lx )",sl1pfn); - perfc_incrc(shadow_l1_table_count); - perfc_incr(shadow_l1_pages); - - set_shadow_status(¤t->mm, gl1pfn, PSH_shadowed | sl1pfn); - - l2pde_general( m, &gpde, &spde, sl1pfn ); + /* XXX Watch out for read-only L2 entries! (not used in Linux). */ + if ( unlikely(__put_user(gpte, (unsigned long *) + &linear_pg_table[va >> PAGE_SHIFT])) ) + domain_crash(); - linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde); - shadow_linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(spde); - - gpl1e = (unsigned long *) &(linear_pg_table[ - (va>>PAGE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1) ]); - - spl1e = (unsigned long *) &shadow_linear_pg_table[ - (va>>PAGE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1) ]; - - - for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) - { - l1pte_no_fault( m, &gpl1e[i], &spl1e[i] ); - } - + /* + * Update of shadow PTE can fail because the L1 p.t. is not shadowed, + * or because the shadow isn't linked into this shadow L2 p.t. + */ + if ( unlikely(__put_user(spte, (unsigned long *) + &shadow_linear_pg_table[va >> PAGE_SHIFT])) ) + { + SH_VVLOG("3: not shadowed/mapped gpte=%08lx spte=%08lx", gpte, spte); + shadow_map_l1_into_current_l2(va); + shadow_linear_pg_table[va >> PAGE_SHIFT] = mk_l1_pgentry(spte); + } - } - else - { - // this L1 was shadowed (by another PT) but we didn't have an L2 - // entry for it - - SH_VVLOG("4b: was shadowed, l2 missing ( %08lx )",sl1pfn); - - sl1pfn = sl1ss & PSH_pfn_mask; - l2pde_general( m, &gpde, &spde, sl1pfn ); - - linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde); - shadow_linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(spde); - - } - - shadow_linear_pg_table[va>>PAGE_SHIFT] = mk_l1_pgentry(spte); - // (we need to do the above even if we've just made the shadow L1) - - } // end of fixup writing the shadow L1 directly failed - perfc_incrc(shadow_fixup_count); - - m->shadow_fault_count++; - - check_pagetable( current, current->mm.pagetable, "post-sf" ); + m->shadow_fault_count++; shadow_unlock(m); - return 1; // let's try the faulting instruction again... - + check_pagetable(m, current->mm.pagetable, "post-sf"); + return 1; } -void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte, - unsigned long *prev_spfn_ptr, - l1_pgentry_t **prev_spl1e_ptr ) +void shadow_l1_normal_pt_update( + unsigned long pa, unsigned long gpte, + unsigned long *prev_spfn_ptr, + l1_pgentry_t **prev_spl1e_ptr) { - unsigned long gpfn, spfn, spte, prev_spfn = *prev_spfn_ptr; - l1_pgentry_t * spl1e, * prev_spl1e = *prev_spl1e_ptr; - + unsigned long spfn, spte, prev_spfn = *prev_spfn_ptr; + l1_pgentry_t *spl1e, *prev_spl1e = *prev_spl1e_ptr; - SH_VVLOG("shadow_l1_normal_pt_update pa=%08lx, gpte=%08lx, prev_spfn=%08lx, prev_spl1e=%p\n", - pa,gpte,prev_spfn, prev_spl1e); + /* N.B. To get here, we know the l1 page *must* be shadowed. 
*/ + SH_VVLOG("shadow_l1_normal_pt_update pa=%08lx, gpte=%08lx, " + "prev_spfn=%08lx, prev_spl1e=%p\n", + pa, gpte, prev_spfn, prev_spl1e); - // to get here, we know the l1 page *must* be shadowed - - gpfn = pa >> PAGE_SHIFT; - spfn = __shadow_status(¤t->mm, gpfn) & PSH_pfn_mask; + spfn = __shadow_status(¤t->mm, pa >> PAGE_SHIFT) & PSH_pfn_mask; if ( spfn == prev_spfn ) { @@ -798,54 +653,44 @@ void shadow_l1_normal_pt_update( unsigne } else { - if( prev_spl1e ) unmap_domain_mem( prev_spl1e ); - spl1e = (l1_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT ); + if ( prev_spl1e != NULL ) + unmap_domain_mem( prev_spl1e ); + spl1e = (l1_pgentry_t *)map_domain_mem(spfn << PAGE_SHIFT); *prev_spfn_ptr = spfn; *prev_spl1e_ptr = spl1e; } - // XXX we assume only pagetables can be shadowed; - // this will have to change to allow arbitrary CoW etc. - - l1pte_no_fault( ¤t->mm, &gpte, &spte ); - - - spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t) ] = mk_l1_pgentry( spte ); - + l1pte_propagate_from_guest(¤t->mm, &gpte, &spte); + spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = mk_l1_pgentry(spte); } -void shadow_l2_normal_pt_update( unsigned long pa, unsigned long gpte ) +void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpte) { - unsigned long gpfn, spfn, spte; - l2_pgentry_t * sp2le; - unsigned long s_sh=0; + unsigned long spfn, spte; + l2_pgentry_t *spl2e; + unsigned long s_sh; + /* N.B. To get here, we know the l2 page *must* be shadowed. */ SH_VVLOG("shadow_l2_normal_pt_update pa=%08lx, gpte=%08lx",pa,gpte); - // to get here, we know the l2 page has a shadow + spfn = __shadow_status(¤t->mm, pa >> PAGE_SHIFT) & PSH_pfn_mask; + + s_sh = (gpte & _PAGE_PRESENT) ? + __shadow_status(¤t->mm, gpte >> PAGE_SHIFT) : 0; - gpfn = pa >> PAGE_SHIFT; - spfn = __shadow_status(¤t->mm, gpfn) & PSH_pfn_mask; + /* XXXX Should mark guest pte as DIRTY and ACCESSED too! */ + l2pde_general(¤t->mm, &gpte, &spte, s_sh); + spl2e = (l2_pgentry_t *)map_domain_mem(spfn << PAGE_SHIFT); + spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)] = mk_l2_pgentry(spte); + unmap_domain_mem(spl2e); +} - spte = 0; - if( gpte & _PAGE_PRESENT ) - s_sh = __shadow_status(¤t->mm, gpte >> PAGE_SHIFT); - - sp2le = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT ); - // no real need for a cache here - l2pde_general( ¤t->mm, &gpte, &spte, s_sh ); - - // XXXX Should mark guest pte as DIRTY and ACCESSED too!!!!! - - sp2le[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t) ] = - mk_l2_pgentry( spte ); - - unmap_domain_mem( (void *) sp2le ); -} - +/************************************************************************/ +/************************************************************************/ +/************************************************************************/ #if SHADOW_DEBUG @@ -853,29 +698,34 @@ static int sh_l2_present; static int sh_l1_present; char * sh_check_name; -#define FAIL(_f, _a...) \ -{printk("XXX %s-FAIL (%d,%d)" _f " g=%08lx s=%08lx\n", sh_check_name, level, i, ## _a , gpte, spte ); BUG();} +#define FAIL(_f, _a...) 
\ + do { \ + printk("XXX %s-FAIL (%d,%d)" _f " g=%08lx s=%08lx\n", \ + sh_check_name, level, i, ## _a , gpte, spte); \ + BUG(); \ + } while ( 0 ) -static int check_pte( struct mm_struct *m, - unsigned long gpte, unsigned long spte, int level, int i ) +static int check_pte( + struct mm_struct *m, unsigned long gpte, unsigned long spte, + int level, int i) { unsigned long mask, gpfn, spfn; - if ( spte == 0 || spte == 0xdeadface || spte == 0x00000E00) - return 1; // always safe + if ( (spte == 0) || (spte == 0xdeadface) || (spte == 0x00000E00) ) + return 1; /* always safe */ if ( !(spte & _PAGE_PRESENT) ) FAIL("Non zero not present spte"); - if( level == 2 ) sh_l2_present++; - if( level == 1 ) sh_l1_present++; + if ( level == 2 ) sh_l2_present++; + if ( level == 1 ) sh_l1_present++; if ( !(gpte & _PAGE_PRESENT) ) FAIL("Guest not present yet shadow is"); mask = ~(_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|0xFFFFF000); - if ( (spte & mask) != (gpte & mask ) ) + if ( (spte & mask) != (gpte & mask) ) FAIL("Corrupt?"); if ( (spte & _PAGE_DIRTY ) && !(gpte & _PAGE_DIRTY) ) @@ -887,108 +737,97 @@ static int check_pte( struct mm_struct * if ( (spte & _PAGE_RW ) && !(gpte & _PAGE_RW) ) FAIL("RW coherence"); - if ( (spte & _PAGE_RW ) && !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY) )) + if ( (spte & _PAGE_RW ) && !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY)) ) FAIL("RW2 coherence"); - spfn = spte>>PAGE_SHIFT; - gpfn = gpte>>PAGE_SHIFT; + spfn = spte >> PAGE_SHIFT; + gpfn = gpte >> PAGE_SHIFT; if ( gpfn == spfn ) { if ( level > 1 ) - FAIL("Linear map ???"); // XXX this will fail on BSD - - return 1; + FAIL("Linear map ???"); /* XXX this will fail on BSD */ } else { if ( level < 2 ) FAIL("Shadow in L1 entry?"); - if ( __shadow_status(p, gpfn) != (PSH_shadowed | spfn) ) - FAIL("spfn problem g.sf=%08lx", - __shadow_status(p, gpfn) ); + if ( __shadow_status(m, gpfn) != (PSH_shadowed | spfn) ) + FAIL("spfn problem g.sf=%08lx", __shadow_status(m, gpfn)); } return 1; } -static int check_l1_table( struct mm_struct *m, unsigned long va, - unsigned long g2, unsigned long s2 ) +static int check_l1_table( + struct mm_struct *m, unsigned long va, + unsigned long g2, unsigned long s2) { - int j; + int i; unsigned long *gpl1e, *spl1e; - //gpl1e = (unsigned long *) &(linear_pg_table[ va>>PAGE_SHIFT]); - //spl1e = (unsigned long *) &(shadow_linear_pg_table[ va>>PAGE_SHIFT]); - - gpl1e = map_domain_mem( g2<> PAGE_SHIFT; + gpfn = gptbase >> PAGE_SHIFT; - if ( ! (__shadow_status(p, gpfn) & PSH_shadowed) ) + if ( !(__shadow_status(m, gpfn) & PSH_shadowed) ) { printk("%s-PT %08lx not shadowed\n", s, gptbase); - - if( __shadow_status(p, gpfn) != 0 ) BUG(); - + if ( __shadow_status(m, gpfn) != 0 ) + BUG(); return 0; } - spfn = __shadow_status(p, gpfn) & PSH_pfn_mask; + spfn = __shadow_status(m, gpfn) & PSH_pfn_mask; - if ( ! 
__shadow_status(p, gpfn) == (PSH_shadowed | spfn) ) + if ( __shadow_status(m, gpfn) != (PSH_shadowed | spfn) ) FAILPT("ptbase shadow inconsistent1"); gpl2e = (l2_pgentry_t *) map_domain_mem( gpfn << PAGE_SHIFT ); spl2e = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT ); - //ipl2e = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT ); - - - if ( memcmp( &spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - ((SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT))-DOMAIN_ENTRIES_PER_L2_PAGETABLE) - * sizeof(l2_pgentry_t)) ) + if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) - + DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) ) { printk("gpfn=%08lx spfn=%08lx\n", gpfn, spfn); - for (i=DOMAIN_ENTRIES_PER_L2_PAGETABLE; - i<(SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT)); - i++ ) + for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE; + i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT); + i++ ) printk("+++ (%d) %08lx %08lx\n",i, - l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]) ); + l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i])); FAILPT("hypervisor entries inconsistent"); } @@ -996,49 +835,41 @@ int check_pagetable( struct mm_struct *m l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) ) FAILPT("hypervisor linear map inconsistent"); - if ( (l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != + if ( (l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> + L2_PAGETABLE_SHIFT]) != ((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR)) ) FAILPT("hypervisor shadow linear map inconsistent %08lx %08lx", - l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]), - (spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR - ); + l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> + L2_PAGETABLE_SHIFT]), + (spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) != - ((__pa(frame_table[gpfn].u.inuse.domain->mm.perdomain_pt) | __PAGE_HYPERVISOR))) ) + ((__pa(frame_table[gpfn].u.inuse.domain->mm.perdomain_pt) | + __PAGE_HYPERVISOR))) ) FAILPT("hypervisor per-domain map inconsistent"); - // check the whole L2 + /* Check the whole L2. */ for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) - { - unsigned long gpte = l2_pgentry_val(gpl2e[i]); - unsigned long spte = l2_pgentry_val(spl2e[i]); + check_pte(m, l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]), 2, i); - check_pte( p, gpte, spte, 2, i ); - } - - - // go back and recurse + /* Go back and recurse. 
*/ for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) { - unsigned long gpte = l2_pgentry_val(gpl2e[i]); - unsigned long spte = l2_pgentry_val(spl2e[i]); - - if ( spte ) - check_l1_table( p, - i<>PAGE_SHIFT, spte>>PAGE_SHIFT ); - + if ( l2_pgentry_val(spl2e[i]) != 0 ) + check_l1_table( + m, i << L2_PAGETABLE_SHIFT, + l2_pgentry_val(gpl2e[i]) >> PAGE_SHIFT, + l2_pgentry_val(spl2e[i]) >> PAGE_SHIFT); } - unmap_domain_mem( spl2e ); - unmap_domain_mem( gpl2e ); + unmap_domain_mem(spl2e); + unmap_domain_mem(gpl2e); SH_VVLOG("PT verified : l2_present = %d, l1_present = %d\n", - sh_l2_present, sh_l1_present ); + sh_l2_present, sh_l1_present); return 1; } - #endif diff -r 7565994e86cb -r 0174982516f6 xen/include/asm-x86/shadow.h --- a/xen/include/asm-x86/shadow.h Wed Oct 13 03:33:39 2004 +0000 +++ b/xen/include/asm-x86/shadow.h Wed Oct 13 14:25:21 2004 +0000 @@ -8,21 +8,19 @@ #include #include - /* Shadow PT flag bits in pfn_info */ #define PSH_shadowed (1<<31) /* page has a shadow. PFN points to shadow */ -#define PSH_pending (1<<29) /* page is in the process of being shadowed */ #define PSH_pfn_mask ((1<<21)-1) /* Shadow PT operation mode : shadowmode variable in mm_struct */ #define SHM_test (1) /* just run domain on shadow PTs */ #define SHM_logdirty (2) /* log pages that are dirtied */ #define SHM_translate (3) /* lookup machine pages in translation table */ -//#define SHM_cow (4) /* copy on write all dirtied pages */ - +#define SHM_cow (4) /* copy on write all dirtied pages */ #define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START) -#define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START+(SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT)))) +#define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \ + (SH_LINEAR_PT_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)))) #define shadow_mode(_d) ((_d)->mm.shadow_mode) #define shadow_lock_init(_d) spin_lock_init(&(_d)->mm.shadow_lock) @@ -32,9 +30,9 @@ extern void shadow_mode_init(void); extern int shadow_mode_control(struct domain *p, dom0_shadow_control_t *sc); extern int shadow_fault(unsigned long va, long error_code); -extern void shadow_l1_normal_pt_update(unsigned long pa, unsigned long gpte, - unsigned long *prev_spfn_ptr, - l1_pgentry_t **prev_spl1e_ptr); +extern void shadow_l1_normal_pt_update( + unsigned long pa, unsigned long gpte, + unsigned long *prev_spfn_ptr, l1_pgentry_t **prev_spl1e_ptr); extern void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpte); extern void unshadow_table(unsigned long gpfn, unsigned int type); extern int shadow_mode_enable(struct domain *p, unsigned int mode); @@ -47,20 +45,19 @@ static inline void shadow_mode_disable(s } extern unsigned long shadow_l2_table( - struct mm_struct *m, unsigned long gpfn ); + struct mm_struct *m, unsigned long gpfn); -#define SHADOW_DEBUG 0 +#define SHADOW_DEBUG 0 #define SHADOW_HASH_DEBUG 0 -#define SHADOW_OPTIMISE 1 struct shadow_status { - unsigned long pfn; // gpfn - unsigned long spfn_and_flags; // spfn plus flags - struct shadow_status *next; // use pull-to-front list. + unsigned long pfn; /* Guest pfn. */ + unsigned long spfn_and_flags; /* Shadow pfn plus flags. */ + struct shadow_status *next; /* Pull-to-front list. */ }; -#define shadow_ht_extra_size 128 /*128*/ -#define shadow_ht_buckets 256 /*256*/ +#define shadow_ht_extra_size 128 +#define shadow_ht_buckets 256 #ifdef VERBOSE #define SH_LOG(_f, _a...) 
\ @@ -89,63 +86,60 @@ printk("DOM%u: (file=shadow.c, line=%d) /************************************************************************/ -static inline int __mark_dirty( struct mm_struct *m, unsigned int mfn ) +static inline int __mark_dirty( struct mm_struct *m, unsigned int mfn) { - unsigned int pfn; - int rc = 0; + unsigned long pfn; + int rc = 0; ASSERT(spin_is_locked(&m->shadow_lock)); + ASSERT(m->shadow_dirty_bitmap != NULL); pfn = machine_to_phys_mapping[mfn]; - /* We use values with the top bit set to mark MFNs that aren't - really part of the domain's psuedo-physical memory map e.g. - the shared info frame. Nothing to do here... - */ - if ( unlikely(pfn & 0x80000000U) ) return rc; + /* + * Values with the MSB set denote MFNs that aren't really part of the + * domain's pseudo-physical memory map (e.g., the shared info frame). + * Nothing to do here... + */ + if ( unlikely(pfn & 0x80000000UL) ) + return rc; - ASSERT(m->shadow_dirty_bitmap); - if( likely(pfnshadow_dirty_bitmap_size) ) + if ( likely(pfn < m->shadow_dirty_bitmap_size) ) { - /* These updates occur with mm.shadow_lock held, so use - (__) version of test_and_set */ - if ( __test_and_set_bit( pfn, m->shadow_dirty_bitmap ) == 0 ) + /* N.B. Can use non-atomic TAS because protected by shadow_lock. */ + if ( !__test_and_set_bit(pfn, m->shadow_dirty_bitmap) ) { - // if we set it m->shadow_dirty_count++; rc = 1; } } - else +#ifndef NDEBUG + else if ( mfn < max_page ) { - if ( mfn < max_page ) - { - SH_LOG("mark_dirty OOR! mfn=%x pfn=%x max=%x (mm %p)", - mfn, pfn, m->shadow_dirty_bitmap_size, m ); - SH_LOG("dom=%p caf=%08x taf=%08x\n", - frame_table[mfn].u.inuse.domain, - frame_table[mfn].count_info, - frame_table[mfn].u.inuse.type_info ); - { - extern void show_trace(unsigned long *esp); - unsigned long *esp; - __asm__ __volatile__ ("movl %%esp,%0" : "=r" (esp) : ); - show_trace(esp); - } - } + SH_LOG("mark_dirty OOR! mfn=%x pfn=%x max=%x (mm %p)", + mfn, pfn, m->shadow_dirty_bitmap_size, m ); + SH_LOG("dom=%p caf=%08x taf=%08x\n", + frame_table[mfn].u.inuse.domain, + frame_table[mfn].count_info, + frame_table[mfn].u.inuse.type_info ); + { + extern void show_trace(unsigned long *esp); + unsigned long *esp; + __asm__ __volatile__ ("movl %%esp,%0" : "=r" (esp) : ); + show_trace(esp); + } } +#endif return rc; } -static inline int mark_dirty( struct mm_struct *m, unsigned int mfn ) +static inline int mark_dirty(struct mm_struct *m, unsigned int mfn) { int rc; - //ASSERT(local_irq_is_enabled()); - //if(spin_is_locked(&m->shadow_lock)) printk("+"); shadow_lock(m); - rc = __mark_dirty( m, mfn ); + rc = __mark_dirty(m, mfn); shadow_unlock(m); return rc; } @@ -159,19 +153,19 @@ static inline void l1pte_write_fault( unsigned long gpte = *gpte_p; unsigned long spte = *spte_p; + ASSERT(gpte & _PAGE_RW); + + gpte |= _PAGE_DIRTY | _PAGE_ACCESSED; + switch ( m->shadow_mode ) { case SHM_test: - spte = gpte; - gpte |= _PAGE_DIRTY | _PAGE_ACCESSED; - spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED; + spte = gpte | _PAGE_RW; break; case SHM_logdirty: - spte = gpte; - gpte |= _PAGE_DIRTY | _PAGE_ACCESSED; - spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED; - __mark_dirty( m, (gpte >> PAGE_SHIFT) ); + spte = gpte | _PAGE_RW; + __mark_dirty(m, gpte >> PAGE_SHIFT); break; } @@ -185,21 +179,16 @@ static inline void l1pte_read_fault( unsigned long gpte = *gpte_p; unsigned long spte = *spte_p; + gpte |= _PAGE_ACCESSED; + switch ( m->shadow_mode ) { case SHM_test: - spte = gpte; - gpte |= _PAGE_ACCESSED; - spte |= _PAGE_ACCESSED; - if ( ! 
(gpte & _PAGE_DIRTY ) ) - spte &= ~ _PAGE_RW; + spte = (gpte & _PAGE_DIRTY) ? gpte : (gpte & ~_PAGE_RW); break; case SHM_logdirty: - spte = gpte; - gpte |= _PAGE_ACCESSED; - spte |= _PAGE_ACCESSED; - spte &= ~ _PAGE_RW; + spte = gpte & ~_PAGE_RW; break; } @@ -207,7 +196,7 @@ static inline void l1pte_read_fault( *spte_p = spte; } -static inline void l1pte_no_fault( +static inline void l1pte_propagate_from_guest( struct mm_struct *m, unsigned long *gpte_p, unsigned long *spte_p) { unsigned long gpte = *gpte_p; @@ -219,22 +208,14 @@ static inline void l1pte_no_fault( spte = 0; if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == (_PAGE_PRESENT|_PAGE_ACCESSED) ) - { - spte = gpte; - if ( ! (gpte & _PAGE_DIRTY ) ) - spte &= ~ _PAGE_RW; - } + spte = (gpte & _PAGE_DIRTY) ? gpte : (gpte & ~_PAGE_RW); break; case SHM_logdirty: spte = 0; if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == (_PAGE_PRESENT|_PAGE_ACCESSED) ) - { - spte = gpte; - spte &= ~ _PAGE_RW; - } - + spte = gpte & ~_PAGE_RW; break; } @@ -243,7 +224,7 @@ static inline void l1pte_no_fault( } static inline void l2pde_general( - struct mm_struct *m, + struct mm_struct *m, unsigned long *gpde_p, unsigned long *spde_p, unsigned long sl1pfn) @@ -253,18 +234,16 @@ static inline void l2pde_general( spde = 0; - if ( sl1pfn ) + if ( sl1pfn != 0 ) { - spde = (gpde & ~PAGE_MASK) | (sl1pfn<shadow_ht[j]; - if(a->pfn){live++; ASSERT(a->spfn_and_flags&PSH_pfn_mask);} - ASSERT((a->pfn&0xf0000000)==0); - ASSERT(a->pfn<0x00100000); - a=a->next; - while(a && live<9999) + if ( a->pfn ) { live++; ASSERT(a->spfn_and_flags & PSH_pfn_mask); } + ASSERT(a->pfn < 0x00100000UL); + a = a->next; + while ( a && (live < 9999) ) { live++; - if(a->pfn == 0 || a->spfn_and_flags == 0) + if ( (a->pfn == 0) || (a->spfn_and_flags == 0) ) { printk("XXX live=%d pfn=%08lx sp=%08lx next=%p\n", live, a->pfn, a->spfn_and_flags, a->next); BUG(); } - ASSERT(a->pfn); - ASSERT((a->pfn&0xf0000000)==0); - ASSERT(a->pfn<0x00100000); - ASSERT(a->spfn_and_flags&PSH_pfn_mask); - a=a->next; + ASSERT(a->pfn < 0x00100000UL); + ASSERT(a->spfn_and_flags & PSH_pfn_mask); + a = a->next; } - ASSERT(live<9999); + ASSERT(live < 9999); } - a = m->shadow_ht_free; - while(a) { free++; a=a->next; } + for ( a = m->shadow_ht_free; a != NULL; a = a->next ) + free++; - if(print) printk("Xlive=%d free=%d\n",live,free); + if ( print) + printk("Xlive=%d free=%d\n",live,free); - abs=(perfc_value(shadow_l1_pages)+perfc_value(shadow_l2_pages))-live; - if( abs < -1 || abs > 1 ) + abs = (perfc_value(shadow_l1_pages) + perfc_value(shadow_l2_pages)) - live; + if ( (abs < -1) || (abs > 1) ) { printk("live=%d free=%d l1=%d l2=%d\n",live,free, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages) ); BUG(); } - } - #else -#define shadow_audit(p, print) +#define shadow_audit(p, print) ((void)0) #endif -static inline struct shadow_status* hash_bucket( struct mm_struct *m, - unsigned int gpfn ) +static inline struct shadow_status *hash_bucket( + struct mm_struct *m, unsigned int gpfn) { - return &(m->shadow_ht[gpfn % shadow_ht_buckets]); + return &m->shadow_ht[gpfn % shadow_ht_buckets]; } -static inline unsigned long __shadow_status( struct mm_struct *m, - unsigned int gpfn ) +static inline unsigned long __shadow_status( + struct mm_struct *m, unsigned int gpfn) { - struct shadow_status **ob, *b, *B = hash_bucket( m, gpfn ); + struct shadow_status *p, *x, *head; - b = B; - ob = NULL; + x = head = hash_bucket(m, gpfn); + p = NULL; - SH_VVLOG("lookup gpfn=%08x bucket=%p", gpfn, b ); - shadow_audit(m,0); // if in debug mode + 
SH_VVLOG("lookup gpfn=%08x bucket=%p", gpfn, x); + shadow_audit(m, 0); do { - if ( b->pfn == gpfn ) - { - unsigned long t; - struct shadow_status *x; + ASSERT(x->pfn || ((x == head) && (x->next == NULL))); - // swap with head - t=B->pfn; B->pfn=b->pfn; b->pfn=t; - t=B->spfn_and_flags; B->spfn_and_flags=b->spfn_and_flags; - b->spfn_and_flags=t; + if ( x->pfn == gpfn ) + { + /* Pull-to-front if 'x' isn't already the head item. */ + if ( unlikely(x != head) ) + { + /* Delete 'x' from list and reinsert immediately after head. */ + p->next = x->next; + x->next = head->next; + head->next = x; - if( ob ) - { // pull to front - *ob=b->next; - x=B->next; - B->next=b; - b->next=x; + /* Swap 'x' contents with head contents. */ + SWAP(head->pfn, x->pfn); + SWAP(head->spfn_and_flags, x->spfn_and_flags); } - return B->spfn_and_flags; + + return head->spfn_and_flags; } -#if SHADOW_HASH_DEBUG - else - { - if(b!=B)ASSERT(b->pfn); - } -#endif - ob=&b->next; - b=b->next; + + p = x; + x = x->next; } - while (b); + while ( x != NULL ); return 0; } -/* we can make this locking more fine grained e.g. per shadow page if it -ever becomes a problem, but since we need a spin lock on the hash table -anyway its probably not worth being too clever. */ - -static inline unsigned long get_shadow_status( struct mm_struct *m, - unsigned int gpfn ) +/* + * N.B. We can make this locking more fine grained (e.g., per shadow page) if + * it ever becomes a problem, but since we need a spin lock on the hash table + * anyway it's probably not worth being too clever. + */ +static inline unsigned long get_shadow_status( + struct mm_struct *m, unsigned int gpfn ) { unsigned long res; - /* If we get here, we know that this domain is running in shadow mode. - We also know that some sort of update has happened to the underlying - page table page: either a PTE has been updated, or the page has - changed type. If we're in log dirty mode, we should set the approrpiate - bit in the dirty bitmap. - NB: the VA update path doesn't use this so needs to be handled - independnetly. - */ + ASSERT(m->shadow_mode); - //ASSERT(local_irq_is_enabled()); - //if(spin_is_locked(&m->shadow_lock)) printk("*"); + /* + * If we get here we know that some sort of update has happened to the + * underlying page table page: either a PTE has been updated, or the page + * has changed type. If we're in log dirty mode, we should set the + * appropriate bit in the dirty bitmap. + * N.B. The VA update path doesn't use this and is handled independently. + */ + shadow_lock(m); - if( m->shadow_mode == SHM_logdirty ) + if ( m->shadow_mode == SHM_logdirty ) __mark_dirty( m, gpfn ); - res = __shadow_status( m, gpfn ); - if (!res) + if ( !(res = __shadow_status(m, gpfn)) ) shadow_unlock(m); + return res; } -static inline void put_shadow_status( struct mm_struct *m ) +static inline void put_shadow_status( + struct mm_struct *m) { shadow_unlock(m); } -static inline void delete_shadow_status( struct mm_struct *m, - unsigned int gpfn ) +static inline void delete_shadow_status( + struct mm_struct *m, unsigned int gpfn) { - struct shadow_status *b, *B, **ob; + struct shadow_status *p, *x, *n, *head; ASSERT(spin_is_locked(&m->shadow_lock)); + ASSERT(gpfn != 0); - B = b = hash_bucket( m, gpfn ); + head = hash_bucket(m, gpfn); - SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, b ); - shadow_audit(m,0); - ASSERT(gpfn); + SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, b); + shadow_audit(m, 0); - if( b->pfn == gpfn ) + /* Match on head item? 
*/ + if ( head->pfn == gpfn ) { - if (b->next) + if ( (n = head->next) != NULL ) { - struct shadow_status *D=b->next; - b->spfn_and_flags = b->next->spfn_and_flags; - b->pfn = b->next->pfn; + /* Overwrite head with contents of following node. */ + head->pfn = n->pfn; + head->spfn_and_flags = n->spfn_and_flags; - b->next = b->next->next; - D->next = m->shadow_ht_free; - D->pfn = 0; - D->spfn_and_flags = 0; - m->shadow_ht_free = D; + /* Delete following node. */ + head->next = n->next; + + /* Add deleted node to the free list. */ + n->pfn = 0; + n->spfn_and_flags = 0; + n->next = m->shadow_ht_free; + m->shadow_ht_free = n; } else { - b->pfn = 0; - b->spfn_and_flags = 0; + /* This bucket is now empty. Initialise the head node. */ + head->pfn = 0; + head->spfn_and_flags = 0; } -#if SHADOW_HASH_DEBUG - if( __shadow_status(m,gpfn) ) BUG(); - shadow_audit(m,0); -#endif - return; + goto found; } - ob = &b->next; - b=b->next; + p = head; + x = head->next; do { - if ( b->pfn == gpfn ) + if ( x->pfn == gpfn ) { - b->pfn = 0; - b->spfn_and_flags = 0; + /* Delete matching node. */ + p->next = x->next; - // b is in the list - *ob=b->next; - b->next = m->shadow_ht_free; - m->shadow_ht_free = b; + /* Add deleted node to the free list. */ + x->pfn = 0; + x->spfn_and_flags = 0; + x->next = m->shadow_ht_free; + m->shadow_ht_free = x; -#if SHADOW_HASH_DEBUG - if( __shadow_status(m,gpfn) ) BUG(); -#endif - shadow_audit(m,0); - return; + goto found; } - ob = &b->next; - b=b->next; + p = x; + x = x->next; } - while (b); + while ( x != NULL ); - // if we got here, it wasn't in the list + /* If we got here, it wasn't in the list! */ BUG(); + + found: + shadow_audit(m, 0); } -static inline void set_shadow_status( struct mm_struct *m, - unsigned int gpfn, unsigned long s ) +static inline void set_shadow_status( + struct mm_struct *m, unsigned int gpfn, unsigned long s) { - struct shadow_status *b, *B, *extra, **fptr; + struct shadow_status *x, *head, *extra; int i; ASSERT(spin_is_locked(&m->shadow_lock)); + ASSERT(gpfn != 0); + ASSERT(s & PSH_shadowed); - B = b = hash_bucket( m, gpfn ); + x = head = hash_bucket(m, gpfn); - ASSERT(gpfn); - SH_VVLOG("set gpfn=%08x s=%08lx bucket=%p(%p)", gpfn, s, b, b->next ); + SH_VVLOG("set gpfn=%08x s=%08lx bucket=%p(%p)", gpfn, s, b, b->next); + shadow_audit(m, 0); - shadow_audit(m,0); + /* + * STEP 1. If page is already in the table, update it in place. + */ do { - if ( b->pfn == gpfn ) + if ( x->pfn == gpfn ) { - b->spfn_and_flags = s; - shadow_audit(m,0); - return; + x->spfn_and_flags = s; + goto done; } - b=b->next; + x = x->next; } - while (b); + while ( x != NULL ); - // if we got here, this is an insert rather than update - - ASSERT( s ); // deletes must have succeeded by here + /* + * STEP 2. The page must be inserted into the table. + */ - if ( B->pfn == 0 ) + /* If the bucket is empty then insert the new page as the head item. */ + if ( head->pfn == 0 ) { - // we can use this head - ASSERT( B->next == 0 ); - B->pfn = gpfn; - B->spfn_and_flags = s; - shadow_audit(m,0); - return; + head->pfn = gpfn; + head->spfn_and_flags = s; + ASSERT(head->next == NULL); + goto done; } - if( unlikely(m->shadow_ht_free == NULL) ) + /* We need to allocate a new node. Ensure the quicklist is non-empty. 
*/ + if ( unlikely(m->shadow_ht_free == NULL) ) { - SH_LOG("allocate more shadow hashtable blocks"); + SH_LOG("Allocate more shadow hashtable blocks."); + + extra = xmalloc( + sizeof(void *) + (shadow_ht_extra_size * sizeof(*x))); - // we need to allocate more space - extra = xmalloc(sizeof(void*) + (shadow_ht_extra_size * - sizeof(struct shadow_status))); + /* XXX Should be more graceful here. */ + if ( extra == NULL ) + BUG(); - if( ! extra ) BUG(); // should be more graceful here.... + memset(extra, 0, sizeof(void *) + (shadow_ht_extra_size * sizeof(*x))); - memset(extra, 0, sizeof(void*) + (shadow_ht_extra_size * - sizeof(struct shadow_status))); - + /* Record the allocation block so it can be correctly freed later. */ m->shadow_extras_count++; + *((struct shadow_status **)&extra[shadow_ht_extra_size]) = + m->shadow_ht_extras; + m->shadow_ht_extras = &extra[0]; - // add extras to free list - fptr = &m->shadow_ht_free; - for ( i=0; ishadow_ht_extras; - m->shadow_ht_extras = extra; - + /* Add the new nodes to the free list. */ + m->shadow_ht_free = &extra[0]; } - // should really put this in B to go right to front - b = m->shadow_ht_free; - m->shadow_ht_free = b->next; - b->spfn_and_flags = s; - b->pfn = gpfn; - b->next = B->next; - B->next = b; + /* Allocate a new node from the quicklist. */ + x = m->shadow_ht_free; + m->shadow_ht_free = x->next; - shadow_audit(m,0); + /* Initialise the new node and insert directly after the head item. */ + x->pfn = gpfn; + x->spfn_and_flags = s; + x->next = head->next; + head->next = x; - return; + done: + shadow_audit(m, 0); } -static inline void __shadow_mk_pagetable( struct mm_struct *mm ) +static inline void __shadow_mk_pagetable(struct mm_struct *mm) { - unsigned long gpfn, spfn=0; - - gpfn = pagetable_val(mm->pagetable) >> PAGE_SHIFT; + unsigned long gpfn = pagetable_val(mm->pagetable) >> PAGE_SHIFT; + unsigned long spfn = __shadow_status(mm, gpfn); - if ( unlikely((spfn=__shadow_status(mm, gpfn)) == 0 ) ) - { - spfn = shadow_l2_table(mm, gpfn ); - } - mm->shadow_table = mk_pagetable(spfn<shadow_table = mk_pagetable(spfn << PAGE_SHIFT); } -static inline void shadow_mk_pagetable( struct mm_struct *mm ) +static inline void shadow_mk_pagetable(struct mm_struct *mm) { SH_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )", pagetable_val(mm->pagetable), mm->shadow_mode ); if ( unlikely(mm->shadow_mode) ) { - //ASSERT(local_irq_is_enabled()); shadow_lock(mm); __shadow_mk_pagetable(mm); shadow_unlock(mm); @@ -602,17 +573,12 @@ static inline void shadow_mk_pagetable( SH_VVLOG("leaving shadow_mk_pagetable( gptbase=%08lx, mode=%d ) sh=%08lx", pagetable_val(mm->pagetable), mm->shadow_mode, pagetable_val(mm->shadow_table) ); - } - #if SHADOW_DEBUG extern int check_pagetable(struct mm_struct *m, pagetable_t pt, char *s); #else #define check_pagetable(m, pt, s) ((void)0) #endif - #endif /* XEN_SHADOW_H */ - - diff -r 7565994e86cb -r 0174982516f6 xen/include/hypervisor-ifs/dom0_ops.h --- a/xen/include/hypervisor-ifs/dom0_ops.h Wed Oct 13 03:33:39 2004 +0000 +++ b/xen/include/hypervisor-ifs/dom0_ops.h Wed Oct 13 14:25:21 2004 +0000 @@ -19,7 +19,7 @@ * This makes sure that old versions of dom0 tools will stop working in a * well-defined way (rather than crashing the machine, for instance). 
 */
-#define DOM0_INTERFACE_VERSION   0xAAAA0015
+#define DOM0_INTERFACE_VERSION   0xAAAA0016
 
 #define MAX_DOMAIN_NAME    16
 
@@ -275,7 +275,6 @@ typedef struct {
 #define DOM0_SHADOW_CONTROL_OP_FLUSH       10     /* table ops */
 #define DOM0_SHADOW_CONTROL_OP_CLEAN       11
 #define DOM0_SHADOW_CONTROL_OP_PEEK        12
-#define DOM0_SHADOW_CONTROL_OP_CLEAN2      13
 
 typedef struct dom0_shadow_control
 {
diff -r 7565994e86cb -r 0174982516f6 xen/include/xen/lib.h
--- a/xen/include/xen/lib.h Wed Oct 13 03:33:39 2004 +0000
+++ b/xen/include/xen/lib.h Wed Oct 13 14:25:21 2004 +0000
@@ -11,6 +11,9 @@
 #define ASSERT(_p) ((void)0)
 #endif
 
+#define SWAP(_a, _b) \
+    do { typeof(_a) _t = (_a); (_a) = (_b); (_b) = _t; } while ( 0 )
+
 #define reserve_bootmem(_p,_l) \
 printk("Memory Reservation 0x%lx, %lu bytes\n", (_p), (_l))
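
Note: the SWAP() macro added to lib.h above supports the pull-to-front behaviour of the reworked __shadow_status() lookup in shadow.c: a hit in an overflow node is relinked immediately after the bucket head and its payload is swapped into the head slot, so hot entries are found on the first probe next time. Below is a minimal, self-contained userspace sketch of that scheme; the names and types (node, table, lookup, insert, NR_BUCKETS) are hypothetical simplifications, not the hypervisor's shadow_status structures, and keys are assumed non-zero just as guest pfns are in the patch.

/*
 * Minimal userspace sketch (hypothetical names/types): a bucketed hash with
 * inline head nodes, chained overflow nodes, and pull-to-front on lookup.
 * Build with gcc; typeof, as used by the SWAP macro in lib.h, is a GNU
 * extension.
 */
#include <stdio.h>
#include <stdlib.h>

#define SWAP(_a, _b) \
    do { typeof(_a) _t = (_a); (_a) = (_b); (_b) = _t; } while ( 0 )

#define NR_BUCKETS 256

struct node {
    unsigned long key;          /* 0 means 'empty', like pfn == 0 above. */
    unsigned long val;
    struct node  *next;
};

static struct node table[NR_BUCKETS];   /* inline bucket heads */

static unsigned long lookup(unsigned long key)
{
    struct node *head = &table[key % NR_BUCKETS];
    struct node *p = NULL, *x = head;

    do {
        if ( x->key == key )
        {
            /* Pull-to-front if 'x' isn't already the head node. */
            if ( x != head )
            {
                /* Unlink 'x' and reinsert it straight after the head... */
                p->next    = x->next;
                x->next    = head->next;
                head->next = x;
                /* ...then swap payloads so the hit sits in the head slot. */
                SWAP(head->key, x->key);
                SWAP(head->val, x->val);
            }
            return head->val;
        }
        p = x;
        x = x->next;
    } while ( x != NULL );

    return 0;                   /* not found */
}

static void insert(unsigned long key, unsigned long val)
{
    struct node *head = &table[key % NR_BUCKETS];
    struct node *x;

    if ( head->key == 0 )       /* empty bucket: use the inline head */
    {
        head->key = key;
        head->val = val;
        return;
    }

    /* The patch draws these from a preallocated free list; malloc here. */
    if ( (x = malloc(sizeof(*x))) == NULL )
        abort();
    x->key  = key;
    x->val  = val;
    x->next = head->next;       /* insert directly after the head */
    head->next = x;
}

int main(void)
{
    insert(0x1000, 0xaaaa);
    insert(0x1100, 0xbbbb);              /* 0x1000 and 0x1100 share bucket 0 */
    printf("%lx\n", lookup(0x1100));     /* prints bbbb; entry moves to head */
    printf("%lx\n", lookup(0x1100));     /* now hits on the first probe */
    return 0;
}

The payload swap (rather than relinking the hit node into the head position) is forced by the layout: the bucket heads in shadow_ht[] are embedded structures, not pointers, so the cheapest way to promote a hit is to copy pfn and spfn_and_flags into the head and push the old head contents one node down, which is exactly what the patch's __shadow_status() does with SWAP().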