debuggers.hg
changeset 16431:68c911f7733a
hvm: make dirty logging stop requiring physical pages of order > 0
This patch re-implements the (x86) hypervisor dirty page log with a
simple four-level radix tree whose nodes are all single pages, thus
making migration require only order-0 pages (where before it required
at least an order-5 page).
Unlike the p2m radix tree implementation, the interior nodes of this
tree are NOT page table nodes. I chose a lazy-allocation and -mapping
approach because most pages are not marked dirty while dirty-logging is
enabled. There are doubtless situations (the 'stream' benchmark, for
example) where a more complex p2m-like approach is faster, but I'm not
sure they're worth the effort.
Signed-off-by: Dave Lively <dlively@virtualiron.com>
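To make the structure concrete, here is a small self-contained userspace model of the tree described above (illustrative only, not the Xen code): every node is a single order-0, 4 KiB allocation, interior nodes hold 512 child slots (mirroring LOGDIRTY_NODE_ENTRIES), leaves are plain bitmaps, and nodes are allocated lazily the first time a pfn in their range is marked. The constants assume the usual x86 values (PAGE_SHIFT 12, PAGETABLE_ORDER 9); all helper and type names here are invented for the sketch.

```c
/* Standalone model of a 4-level log-dirty radix tree built only from
 * single 4 KiB "pages".  Hypothetical helpers; error handling kept minimal. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1u << PAGE_SHIFT)
#define PAGETABLE_ORDER 9
#define NODE_ENTRIES    (1u << PAGETABLE_ORDER)  /* 512 slots per interior node */
#define L1_BITS         (PAGE_SHIFT + 3)         /* bits tracked by one leaf    */

#define L1_IDX(pfn) ((pfn) & ((1ul << L1_BITS) - 1))
#define L2_IDX(pfn) (((pfn) >> L1_BITS) & (NODE_ENTRIES - 1))
#define L3_IDX(pfn) (((pfn) >> (L1_BITS + PAGETABLE_ORDER)) & (NODE_ENTRIES - 1))
#define L4_IDX(pfn) (((pfn) >> (L1_BITS + 2 * PAGETABLE_ORDER)) & (NODE_ENTRIES - 1))

/* Every node, interior or leaf, is one zeroed page (order-0 only). */
static void *new_node(void) { return calloc(1, PAGE_SIZE); }

struct log_dirty { void **top; };  /* L4 node: 512 child pointers */

static void mark_dirty(struct log_dirty *ld, unsigned long pfn)
{
    void **l3, **l2;
    uint8_t *l1;

    if (!ld->top) ld->top = new_node();               /* lazy, like the patch */
    if (!ld->top[L4_IDX(pfn)]) ld->top[L4_IDX(pfn)] = new_node();
    l3 = ld->top[L4_IDX(pfn)];
    if (!l3[L3_IDX(pfn)]) l3[L3_IDX(pfn)] = new_node();
    l2 = l3[L3_IDX(pfn)];
    if (!l2[L2_IDX(pfn)]) l2[L2_IDX(pfn)] = new_node();
    l1 = l2[L2_IDX(pfn)];
    l1[L1_IDX(pfn) >> 3] |= 1u << (L1_IDX(pfn) & 7);  /* set one bit in leaf */
}

static int is_dirty(const struct log_dirty *ld, unsigned long pfn)
{
    void **l3, **l2;
    uint8_t *l1;

    if (!ld->top || !(l3 = ld->top[L4_IDX(pfn)]) ||
        !(l2 = l3[L3_IDX(pfn)]) || !(l1 = l2[L2_IDX(pfn)]))
        return 0;                                     /* missing node: clean */
    return (l1[L1_IDX(pfn) >> 3] >> (L1_IDX(pfn) & 7)) & 1;
}

int main(void)
{
    struct log_dirty ld = { NULL };
    mark_dirty(&ld, 0x123456);
    printf("0x123456: %d, 0x123457: %d\n",
           is_dirty(&ld, 0x123456), is_dirty(&ld, 0x123457));
    return 0;
}
```

In the patch itself the child slots hold mfn_t values rather than pointers, and nodes are mapped with map_domain_page() on demand instead of staying mapped, but the index arithmetic (see the L?_LOGDIRTY_IDX macros in the paging.h hunk below) and the lazy allocation on the mark path are the same.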
| author | Keir Fraser <keir.fraser@citrix.com> |
|---|---|
| date | Fri Nov 16 18:33:24 2007 +0000 (2007-11-16) |
| parents | 2052364cb456 |
| children | 86e4b37a06cc |
| files | xen/arch/x86/mm/paging.c xen/arch/x86/mm/shadow/private.h xen/include/asm-x86/domain.h xen/include/asm-x86/paging.h |
line diff
```diff
--- a/xen/arch/x86/mm/paging.c	Fri Nov 16 17:59:34 2007 +0000
+++ b/xen/arch/x86/mm/paging.c	Fri Nov 16 18:33:24 2007 +0000
@@ -96,36 +96,97 @@
         spin_unlock(&(_d)->arch.paging.log_dirty.lock);                   \
     } while (0)
 
+static mfn_t paging_new_log_dirty_page(struct domain *d, void **mapping_p)
+{
+    mfn_t mfn;
+    struct page_info *page = alloc_domheap_page(NULL);
+
+    if ( unlikely(page == NULL) ) {
+        d->arch.paging.log_dirty.failed_allocs++;
+        return _mfn(INVALID_MFN);
+    }
+    d->arch.paging.log_dirty.allocs++;
+    mfn = page_to_mfn(page);
+    *mapping_p = map_domain_page(mfn_x(mfn));
+    return mfn;
+}
+
+
+static mfn_t paging_new_log_dirty_leaf(struct domain *d, uint8_t **leaf_p)
+{
+    mfn_t mfn = paging_new_log_dirty_page(d, (void **)leaf_p);
+    clear_page(*leaf_p);
+    return mfn;
+}
+
+
+static mfn_t paging_new_log_dirty_node(struct domain *d, mfn_t **node_p)
+{
+    int i;
+    mfn_t mfn = paging_new_log_dirty_page(d, (void **)node_p);
+    for (i = 0; i < LOGDIRTY_NODE_ENTRIES; i++)
+        (*node_p)[i] = _mfn(INVALID_MFN);
+    return mfn;
+}
+
+
 /* allocate bitmap resources for log dirty */
 int paging_alloc_log_dirty_bitmap(struct domain *d)
 {
-    if ( d->arch.paging.log_dirty.bitmap != NULL )
+    mfn_t *mapping;
+
+    if ( mfn_valid(d->arch.paging.log_dirty.top) )
         return 0;
 
-    d->arch.paging.log_dirty.bitmap_size =
-        (domain_get_maximum_gpfn(d) + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
-    d->arch.paging.log_dirty.bitmap =
-        xmalloc_array(unsigned long,
-                      d->arch.paging.log_dirty.bitmap_size / BITS_PER_LONG);
-    if ( d->arch.paging.log_dirty.bitmap == NULL )
-    {
-        d->arch.paging.log_dirty.bitmap_size = 0;
+    d->arch.paging.log_dirty.top = paging_new_log_dirty_node(d, &mapping);
+    if ( unlikely(!mfn_valid(d->arch.paging.log_dirty.top)) ) {
+        /* Clear error indicator since we're reporting this one */
+        d->arch.paging.log_dirty.failed_allocs = 0;
         return -ENOMEM;
     }
-    memset(d->arch.paging.log_dirty.bitmap, 0,
-           d->arch.paging.log_dirty.bitmap_size/8);
+    unmap_domain_page(mapping);
 
     return 0;
 }
 
+
+static void paging_free_log_dirty_page(struct domain *d, mfn_t mfn)
+{
+    d->arch.paging.log_dirty.allocs--;
+    free_domheap_page(mfn_to_page(mfn));
+}
+
 /* free bitmap resources */
 void paging_free_log_dirty_bitmap(struct domain *d)
 {
-    d->arch.paging.log_dirty.bitmap_size = 0;
-    if ( d->arch.paging.log_dirty.bitmap )
-    {
-        xfree(d->arch.paging.log_dirty.bitmap);
-        d->arch.paging.log_dirty.bitmap = NULL;
+    int i4, i3, i2;
+
+    if (mfn_valid(d->arch.paging.log_dirty.top)) {
+        mfn_t *l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+        printk("%s: used %d pages for domain %d dirty logging\n",
+               __FUNCTION__, d->arch.paging.log_dirty.allocs, d->domain_id);
+        for (i4 = 0; i4 < LOGDIRTY_NODE_ENTRIES; i4++) {
+            if (mfn_valid(l4[i4])) {
+                mfn_t *l3 = map_domain_page(mfn_x(l4[i4]));
+                for (i3 = 0; i3 < LOGDIRTY_NODE_ENTRIES; i3++) {
+                    if (mfn_valid(l3[i3])) {
+                        mfn_t *l2 = map_domain_page(mfn_x(l3[i3]));
+                        for (i2 = 0; i2 < LOGDIRTY_NODE_ENTRIES; i2++)
+                            if (mfn_valid(l2[i2]))
+                                paging_free_log_dirty_page(d, l2[i2]);
+                        unmap_domain_page(l2);
+                        paging_free_log_dirty_page(d, l3[i3]);
+                    }
+                }
+                unmap_domain_page(l3);
+                paging_free_log_dirty_page(d, l4[i4]);
+            }
+        }
+        unmap_domain_page(l4);
+        paging_free_log_dirty_page(d, d->arch.paging.log_dirty.top);
+        d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
+        ASSERT(d->arch.paging.log_dirty.allocs == 0);
+        d->arch.paging.log_dirty.failed_allocs = 0;
     }
 }
 
@@ -187,16 +248,20 @@ void paging_mark_dirty(struct domain *d,
 {
     unsigned long pfn;
     mfn_t gmfn;
+    int changed;
+    mfn_t mfn, *l4, *l3, *l2;
+    uint8_t *l1;
+    int i1, i2, i3, i4;
 
     gmfn = _mfn(guest_mfn);
 
+    ASSERT(mfn_valid(d->arch.paging.log_dirty.top));
+
     if ( !paging_mode_log_dirty(d) || !mfn_valid(gmfn) )
         return;
 
     log_dirty_lock(d);
 
-    ASSERT(d->arch.paging.log_dirty.bitmap != NULL);
-
     /* We /really/ mean PFN here, even for non-translated guests. */
     pfn = get_gpfn_from_mfn(mfn_x(gmfn));
 
@@ -206,37 +271,52 @@ void paging_mark_dirty(struct domain *d,
      * Nothing to do here...
      */
     if ( unlikely(!VALID_M2P(pfn)) )
+        goto out;
+
+    i1 = L1_LOGDIRTY_IDX(pfn);
+    i2 = L2_LOGDIRTY_IDX(pfn);
+    i3 = L3_LOGDIRTY_IDX(pfn);
+    i4 = L4_LOGDIRTY_IDX(pfn);
+
+    l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+    mfn = l4[i4];
+    if ( !mfn_valid(mfn) )
+        mfn = l4[i4] = paging_new_log_dirty_node(d, &l3);
+    else
+        l3 = map_domain_page(mfn_x(mfn));
+    unmap_domain_page(l4);
+    if ( unlikely(!mfn_valid(mfn)) )
+        goto out;
+
+    mfn = l3[i3];
+    if ( !mfn_valid(mfn) )
+        mfn = l3[i3] = paging_new_log_dirty_node(d, &l2);
+    else
+        l2 = map_domain_page(mfn_x(mfn));
+    unmap_domain_page(l3);
+    if ( unlikely(!mfn_valid(mfn)) )
+        goto out;
+
+    mfn = l2[i2];
+    if ( !mfn_valid(mfn) )
+        mfn = l2[i2] = paging_new_log_dirty_leaf(d, &l1);
+    else
+        l1 = map_domain_page(mfn_x(mfn));
+    unmap_domain_page(l2);
+    if ( unlikely(!mfn_valid(mfn)) )
+        goto out;
+
+    changed = !__test_and_set_bit(i1, l1);
+    unmap_domain_page(l1);
+    if ( changed )
     {
-        log_dirty_unlock(d);
-        return;
+        PAGING_DEBUG(LOGDIRTY,
+                     "marked mfn %" PRI_mfn " (pfn=%lx), dom %d\n",
+                     mfn_x(gmfn), pfn, d->domain_id);
+        d->arch.paging.log_dirty.dirty_count++;
     }
 
-    if ( likely(pfn < d->arch.paging.log_dirty.bitmap_size) )
-    {
-        if ( !__test_and_set_bit(pfn, d->arch.paging.log_dirty.bitmap) )
-        {
-            PAGING_DEBUG(LOGDIRTY,
-                         "marked mfn %" PRI_mfn " (pfn=%lx), dom %d\n",
-                         mfn_x(gmfn), pfn, d->domain_id);
-            d->arch.paging.log_dirty.dirty_count++;
-        }
-    }
-    else
-    {
-        PAGING_PRINTK("mark_dirty OOR! "
-                      "mfn=%" PRI_mfn " pfn=%lx max=%x (dom %d)\n"
-                      "owner=%d c=%08x t=%" PRtype_info "\n",
-                      mfn_x(gmfn),
-                      pfn,
-                      d->arch.paging.log_dirty.bitmap_size,
-                      d->domain_id,
-                      (page_get_owner(mfn_to_page(gmfn))
-                       ? page_get_owner(mfn_to_page(gmfn))->domain_id
-                       : -1),
-                      mfn_to_page(gmfn)->count_info,
-                      mfn_to_page(gmfn)->u.inuse.type_info);
-    }
-
+ out:
     log_dirty_unlock(d);
 }
 
@@ -244,7 +324,11 @@ void paging_mark_dirty(struct domain *d,
  * clear the bitmap and stats as well. */
 int paging_log_dirty_op(struct domain *d, struct xen_domctl_shadow_op *sc)
 {
-    int i, rv = 0, clean = 0, peek = 1;
+    int rv = 0, clean = 0, peek = 1;
+    unsigned long pages = 0;
+    mfn_t *l4, *l3, *l2;
+    uint8_t *l1;
+    int i4, i3, i2;
 
     domain_pause(d);
     log_dirty_lock(d);
@@ -270,37 +354,55 @@ int paging_log_dirty_op(struct domain *d
         /* caller may have wanted just to clean the state or access stats. */
         peek = 0;
 
-    if ( (peek || clean) && (d->arch.paging.log_dirty.bitmap == NULL) )
+    if ( (peek || clean) && !mfn_valid(d->arch.paging.log_dirty.top) )
     {
        rv = -EINVAL; /* perhaps should be ENOMEM? */
        goto out;
     }
 
-    if ( sc->pages > d->arch.paging.log_dirty.bitmap_size )
-        sc->pages = d->arch.paging.log_dirty.bitmap_size;
-
-#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
-    for ( i = 0; i < sc->pages; i += CHUNK )
-    {
-        int bytes = ((((sc->pages - i) > CHUNK)
-                      ? CHUNK
-                      : (sc->pages - i)) + 7) / 8;
+    if ( unlikely(d->arch.paging.log_dirty.failed_allocs) ) {
+        printk("%s: %d failed page allocs while logging dirty pages\n",
+               __FUNCTION__, d->arch.paging.log_dirty.failed_allocs);
+        rv = -ENOMEM;
+        goto out;
+    }
 
-        if ( likely(peek) )
-        {
-            if ( copy_to_guest_offset(
-                sc->dirty_bitmap, i/8,
-                (uint8_t *)d->arch.paging.log_dirty.bitmap + (i/8), bytes) )
-            {
-                rv = -EFAULT;
-                goto out;
+    pages = 0;
+    l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+    for ( i4 = 0; pages < sc->pages && i4 < LOGDIRTY_NODE_ENTRIES; i4++ ) {
+        l3 = mfn_valid(l4[i4]) ? map_domain_page(mfn_x(l4[i4])) : NULL;
+        for ( i3 = 0; pages < sc->pages && i3 < LOGDIRTY_NODE_ENTRIES; i3++ ) {
+            l2 = l3 && mfn_valid(l3[i3]) ? map_domain_page(mfn_x(l3[i3])) : NULL;
+            for ( i2 = 0; pages < sc->pages && i2 < LOGDIRTY_NODE_ENTRIES; i2++ ) {
+                static uint8_t zeroes[PAGE_SIZE];
+                unsigned int bytes = PAGE_SIZE;
+                l1 = l2 && mfn_valid(l2[i2]) ? map_domain_page(mfn_x(l2[i2])) : zeroes;
+                if ( unlikely(((sc->pages - pages + 7) >> 3) < bytes) )
+                    bytes = (unsigned int)((sc->pages - pages + 7) >> 3);
+                if ( likely(peek) ) {
+                    if ( copy_to_guest_offset(sc->dirty_bitmap, pages >> 3, l1, bytes) != 0 ) {
+                        rv = -EFAULT;
+                        goto out;
+                    }
+                }
+
+                if ( clean && l1 != zeroes )
+                    clear_page(l1);
+
+                pages += bytes << 3;
+                if (l1 != zeroes)
+                    unmap_domain_page(l1);
             }
+            if (l2)
+                unmap_domain_page(l2);
         }
+        if (l3)
+            unmap_domain_page(l3);
+    }
+    unmap_domain_page(l4);
 
-        if ( clean )
-            memset((uint8_t *)d->arch.paging.log_dirty.bitmap + (i/8), 0, bytes);
-    }
-#undef CHUNK
+    if (pages < sc->pages)
+        sc->pages = pages;
 
     log_dirty_unlock(d);
 
@@ -338,6 +440,7 @@ void paging_log_dirty_init(struct domain
     d->arch.paging.log_dirty.enable_log_dirty = enable_log_dirty;
    d->arch.paging.log_dirty.disable_log_dirty = disable_log_dirty;
    d->arch.paging.log_dirty.clean_dirty_bitmap = clean_dirty_bitmap;
+    d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
 }
 
 /* This function fress log dirty bitmap resources. */
```
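A point worth noting in the new paging_log_dirty_op() above: a leaf that was never allocated simply means no pfn in its range was ever marked, so the loop copies from a static page of zeroes rather than allocating anything, and the per-leaf byte accounting clamps the final partial leaf. A rough illustration of that arithmetic (assumed example values, not Xen code):

```c
/* Each inner-loop iteration reports one leaf's worth of the caller's bitmap.
 * A leaf covers PAGE_SIZE * 8 = 32768 pfns (with PAGE_SIZE == 4096), so for
 * a request of sc->pages == 100000 pfns:
 *
 *   iteration 1: bytes = min(4096, (100000 - 0     + 7) / 8) = 4096, pages -> 32768
 *   iteration 2: bytes = min(4096, (100000 - 32768 + 7) / 8) = 4096, pages -> 65536
 *   iteration 3: bytes = min(4096, (100000 - 65536 + 7) / 8) = 4096, pages -> 98304
 *   iteration 4: bytes = min(4096, (100000 - 98304 + 7) / 8) =  212, pages -> 100000
 *
 * i.e. 12500 bytes of bitmap copied in total, exactly ceil(100000 / 8).
 */
```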
```diff
--- a/xen/arch/x86/mm/shadow/private.h	Fri Nov 16 17:59:34 2007 +0000
+++ b/xen/arch/x86/mm/shadow/private.h	Fri Nov 16 18:33:24 2007 +0000
@@ -491,17 +491,50 @@ sh_mfn_is_dirty(struct domain *d, mfn_t
 /* Is this guest page dirty? Call only in log-dirty mode. */
 {
     unsigned long pfn;
+    mfn_t mfn, *l4, *l3, *l2;
+    uint8_t *l1;
+    int rv;
+
     ASSERT(shadow_mode_log_dirty(d));
-    ASSERT(d->arch.paging.log_dirty.bitmap != NULL);
+    ASSERT(mfn_valid(d->arch.paging.log_dirty.top));
 
     /* We /really/ mean PFN here, even for non-translated guests. */
     pfn = get_gpfn_from_mfn(mfn_x(gmfn));
-    if ( likely(VALID_M2P(pfn))
-         && likely(pfn < d->arch.paging.log_dirty.bitmap_size)
-         && test_bit(pfn, d->arch.paging.log_dirty.bitmap) )
+    if ( unlikely(!VALID_M2P(pfn)) )
+        return 0;
+
+    if (d->arch.paging.log_dirty.failed_allocs > 0)
+        /* If we have any failed allocations our dirty log is bogus.
+         * Since we can't signal an error here, be conservative and
+         * report "dirty" in this case. (The only current caller,
+         * _sh_propagate, leaves known-dirty pages writable, preventing
+         * subsequent dirty-logging faults from them.)
+         */
         return 1;
 
-    return 0;
+    l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+    mfn = l4[L4_LOGDIRTY_IDX(pfn)];
+    unmap_domain_page(l4);
+    if (!mfn_valid(mfn))
+        return 0;
+
+    l3 = map_domain_page(mfn_x(mfn));
+    mfn = l3[L3_LOGDIRTY_IDX(pfn)];
+    unmap_domain_page(l3);
+    if (!mfn_valid(mfn))
+        return 0;
+
+    l2 = map_domain_page(mfn_x(mfn));
+    mfn = l2[L2_LOGDIRTY_IDX(pfn)];
+    unmap_domain_page(l2);
+    if (!mfn_valid(mfn))
+        return 0;
+
+    l1 = map_domain_page(mfn_x(mfn));
+    rv = test_bit(L1_LOGDIRTY_IDX(pfn), l1);
+    unmap_domain_page(l1);
+
+    return rv;
 }
 
```
```diff
--- a/xen/include/asm-x86/domain.h	Fri Nov 16 17:59:34 2007 +0000
+++ b/xen/include/asm-x86/domain.h	Fri Nov 16 18:33:24 2007 +0000
@@ -158,9 +158,10 @@ struct log_dirty_domain {
     int            locker; /* processor that holds the lock */
     const char    *locker_function; /* func that took it */
 
-    /* log-dirty bitmap to record dirty pages */
-    unsigned long *bitmap;
-    unsigned int   bitmap_size;  /* in pages, bit per page */
+    /* log-dirty radix tree to record dirty pages */
+    mfn_t          top;
+    unsigned int   allocs;
+    unsigned int   failed_allocs;
 
     /* log-dirty mode stats */
     unsigned int   fault_count;
```
```diff
--- a/xen/include/asm-x86/paging.h	Fri Nov 16 17:59:34 2007 +0000
+++ b/xen/include/asm-x86/paging.h	Fri Nov 16 18:33:24 2007 +0000
@@ -152,6 +152,28 @@ void paging_log_dirty_init(struct domain
 /* mark a page as dirty */
 void paging_mark_dirty(struct domain *d, unsigned long guest_mfn);
 
+/*
+ * Log-dirty radix tree indexing:
+ *   All tree nodes are PAGE_SIZE bytes, mapped on-demand.
+ *   Leaf nodes are simple bitmaps; 1 bit per guest pfn.
+ *   Interior nodes are arrays of LOGDIRTY_NODE_ENTRIES mfns.
+ * TODO: Dynamic radix tree height. Most guests will only need 2 levels.
+ *       The fourth level is basically unusable on 32-bit Xen.
+ * TODO2: Abstract out the radix-tree mechanics?
+ */
+#define LOGDIRTY_NODE_ENTRIES (1 << PAGETABLE_ORDER)
+#define L1_LOGDIRTY_IDX(pfn) ((pfn) & ((1 << (PAGE_SHIFT+3)) - 1))
+#define L2_LOGDIRTY_IDX(pfn) (((pfn) >> (PAGE_SHIFT+3)) & \
+                              (LOGDIRTY_NODE_ENTRIES-1))
+#define L3_LOGDIRTY_IDX(pfn) (((pfn) >> (PAGE_SHIFT+3+PAGETABLE_ORDER)) & \
+                              (LOGDIRTY_NODE_ENTRIES-1))
+#if BITS_PER_LONG == 64
+#define L4_LOGDIRTY_IDX(pfn) (((pfn) >> (PAGE_SHIFT+3+PAGETABLE_ORDER*2)) & \
+                              (LOGDIRTY_NODE_ENTRIES-1))
+#else
+#define L4_LOGDIRTY_IDX(pfn) 0
+#endif
+
 /*****************************************************************************
  * Entry points into the paging-assistance code */
 
```
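As a worked example of the indexing above (assuming the usual x86 values PAGE_SHIFT == 12 and PAGETABLE_ORDER == 9, so LOGDIRTY_NODE_ENTRIES == 512): one leaf bitmap holds 4096 * 8 = 32768 bits and therefore tracks 128 MiB of guest memory, an L2 node's 512 leaves cover 64 GiB, an L3 node 32 TiB, and the L4 level only comes into play on 64-bit builds, matching the BITS_PER_LONG check and the TODO note in the comment.

```c
/* Index split for pfn 0x123456 (PAGE_SHIFT == 12, PAGETABLE_ORDER == 9):
 *
 *   L1_LOGDIRTY_IDX(0x123456) =  0x123456        & 0x7fff = 0x3456   bit within the leaf bitmap
 *   L2_LOGDIRTY_IDX(0x123456) = (0x123456 >> 15) & 0x1ff  = 0x24     leaf slot in the L2 node
 *   L3_LOGDIRTY_IDX(0x123456) = (0x123456 >> 24) & 0x1ff  = 0        L2 slot in the L3 node
 *   L4_LOGDIRTY_IDX(0x123456) = (0x123456 >> 33) & 0x1ff  = 0        L3 slot in the L4 (top) node
 */
```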