debuggers.hg
changeset 18992:f2ba08549466
PoD memory 3/9: PoD core
Core of populate-on-demand functionality:
* Introduce a populate-on-demand type
* Call p2m_demand_populate() when gfn_to_mfn() encounters PoD entries
* Return p2m memory to the domain list for freeing during domain destruction
* Make the p2m audit check our PoD-entry reference counting
* Add PoD information to the 'q' debug key
Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
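
The mechanism in a nutshell: the guest's physical address space is built largely out of place-holder (PoD) entries, and real frames are only installed, from a small pre-allocated cache, the first time each entry is used. Below is a minimal stand-alone model of that idea in plain C, with invented names (p2m[], cache[], demand_populate()); it is an illustration of the concept only, not Xen code.

/* Toy user-space model of populate-on-demand (illustration only, not Xen code).
 * A "p2m" array maps guest frame numbers (gfns) to host frames (mfns); entries
 * marked POD have no frame yet and are backed from a pre-filled cache on the
 * first lookup that needs them. */
#include <stdio.h>

#define NR_GFNS  8   /* frames the guest believes it has    */
#define CACHE_SZ 4   /* frames actually set aside up front  */

enum type { POD, RAM };

struct entry { enum type t; int mfn; };

static struct entry p2m[NR_GFNS];
static int cache[CACHE_SZ], cache_count, entry_count;

/* Roughly what p2m_pod_demand_populate() does: back a PoD entry with a real
 * frame taken from the cache and fix up the accounting. */
static int demand_populate(int gfn)
{
    if ( cache_count == 0 )
        return -1;                        /* out of populate-on-demand memory */
    p2m[gfn].mfn = cache[--cache_count];  /* take a frame from the cache      */
    p2m[gfn].t = RAM;
    entry_count--;                        /* one fewer outstanding PoD entry  */
    return 0;
}

/* Roughly what gfn_to_mfn() does: look up a frame, populating on demand. */
static int gfn_to_mfn(int gfn)
{
    if ( p2m[gfn].t == POD && demand_populate(gfn) < 0 )
        return -1;
    return (p2m[gfn].t == RAM) ? p2m[gfn].mfn : -1;
}

int main(void)
{
    int i;

    /* "Build" the domain: mark every gfn PoD, pre-fill a smaller cache. */
    for ( i = 0; i < NR_GFNS; i++ )
    {
        p2m[i].t = POD;
        p2m[i].mfn = -1;
        entry_count++;
    }
    for ( i = 0; i < CACHE_SZ; i++ )
        cache[cache_count++] = 100 + i;   /* arbitrary host frame numbers */

    /* Touch a few pages: only these consume real frames. */
    for ( i = 0; i < 3; i++ )
        printf("gfn %d -> mfn %d\n", i, gfn_to_mfn(i));
    printf(" PoD entries=%d cachesize=%d\n", entry_count, cache_count);
    return 0;
}

In the patch itself the cache lives on d->arch.p2m->pod.{super,single}, the place-holder is the p2m_populate_on_demand type, and the equivalent of demand_populate() is p2m_pod_demand_populate(), called when gfn_to_mfn() hits a PoD entry.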
author    Keir Fraser <keir.fraser@citrix.com>
date      Mon Jan 05 10:43:19 2009 +0000 (2009-01-05)
parents   629f028d22f9
children  bd33ff263e2c
files     xen/arch/x86/domain.c xen/arch/x86/mm/p2m.c xen/arch/x86/mm/paging.c xen/arch/x86/mm/shadow/multi.c xen/include/asm-x86/p2m.h
line diff
--- a/xen/arch/x86/domain.c Mon Jan 05 10:42:39 2009 +0000
+++ b/xen/arch/x86/domain.c Mon Jan 05 10:43:19 2009 +0000
@@ -149,6 +149,11 @@ void dump_pageframe_info(struct domain *
         }
     }
 
+    if ( is_hvm_domain(d) )
+    {
+        p2m_pod_dump_data(d);
+    }
+
     list_for_each_entry ( page, &d->xenpage_list, list )
     {
         printk("    XenPage %p: caf=%08x, taf=%" PRtype_info "\n",
2.1 --- a/xen/arch/x86/mm/p2m.c Mon Jan 05 10:42:39 2009 +0000 2.2 +++ b/xen/arch/x86/mm/p2m.c Mon Jan 05 10:43:19 2009 +0000 2.3 @@ -118,9 +118,16 @@ static unsigned long p2m_type_to_flags(p 2.4 return flags; 2.5 case p2m_mmio_direct: 2.6 return flags | P2M_BASE_FLAGS | _PAGE_RW | _PAGE_PCD; 2.7 + case p2m_populate_on_demand: 2.8 + return flags; 2.9 } 2.10 } 2.11 2.12 +#if P2M_AUDIT 2.13 +static void audit_p2m(struct domain *d); 2.14 +#else 2.15 +# define audit_p2m(_d) do { (void)(_d); } while(0) 2.16 +#endif /* P2M_AUDIT */ 2.17 2.18 // Find the next level's P2M entry, checking for out-of-range gfn's... 2.19 // Returns NULL on error. 2.20 @@ -162,7 +169,8 @@ p2m_next_level(struct domain *d, mfn_t * 2.21 shift, max)) ) 2.22 return 0; 2.23 2.24 - if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) ) 2.25 + /* PoD: Not present doesn't imply empty. */ 2.26 + if ( !l1e_get_flags(*p2m_entry) ) 2.27 { 2.28 struct page_info *pg = d->arch.p2m->alloc_page(d); 2.29 if ( pg == NULL ) 2.30 @@ -197,7 +205,7 @@ p2m_next_level(struct domain *d, mfn_t * 2.31 } 2.32 } 2.33 2.34 - ASSERT(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT); 2.35 + ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE)); 2.36 2.37 /* split single large page into 4KB page in P2M table */ 2.38 if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) ) 2.39 @@ -242,6 +250,236 @@ p2m_next_level(struct domain *d, mfn_t * 2.40 return 1; 2.41 } 2.42 2.43 +/* 2.44 + * Populate-on-demand functionality 2.45 + */ 2.46 +int 2.47 +p2m_pod_cache_add(struct domain *d, 2.48 + struct page_info *page, 2.49 + unsigned long order) 2.50 +{ 2.51 + int i; 2.52 + struct page_info *p; 2.53 + struct p2m_domain *p2md = d->arch.p2m; 2.54 + 2.55 +#ifndef NDEBUG 2.56 + mfn_t mfn; 2.57 + 2.58 + mfn = page_to_mfn(page); 2.59 + 2.60 + /* Check to make sure this is a contiguous region */ 2.61 + if( mfn_x(mfn) & ((1 << order) - 1) ) 2.62 + { 2.63 + printk("%s: mfn %lx not aligned order %lu! 
(mask %lx)\n", 2.64 + __func__, mfn_x(mfn), order, ((1UL << order) - 1)); 2.65 + return -1; 2.66 + } 2.67 + 2.68 + for(i=0; i < 1 << order ; i++) { 2.69 + struct domain * od; 2.70 + 2.71 + p = mfn_to_page(_mfn(mfn_x(mfn) + i)); 2.72 + od = page_get_owner(p); 2.73 + if(od != d) 2.74 + { 2.75 + printk("%s: mfn %lx expected owner d%d, got owner d%d!\n", 2.76 + __func__, mfn_x(mfn), d->domain_id, 2.77 + od?od->domain_id:-1); 2.78 + return -1; 2.79 + } 2.80 + } 2.81 +#endif 2.82 + 2.83 + spin_lock(&d->page_alloc_lock); 2.84 + 2.85 + /* First, take all pages off the domain list */ 2.86 + for(i=0; i < 1 << order ; i++) 2.87 + { 2.88 + p = page + i; 2.89 + list_del(&p->list); 2.90 + } 2.91 + 2.92 + /* Then add the first one to the appropriate populate-on-demand list */ 2.93 + switch(order) 2.94 + { 2.95 + case 9: 2.96 + list_add_tail(&page->list, &p2md->pod.super); /* lock: page_alloc */ 2.97 + p2md->pod.count += 1 << order; 2.98 + break; 2.99 + case 0: 2.100 + list_add_tail(&page->list, &p2md->pod.single); /* lock: page_alloc */ 2.101 + p2md->pod.count += 1 ; 2.102 + break; 2.103 + default: 2.104 + BUG(); 2.105 + } 2.106 + 2.107 + spin_unlock(&d->page_alloc_lock); 2.108 + 2.109 + return 0; 2.110 +} 2.111 + 2.112 +void 2.113 +p2m_pod_empty_cache(struct domain *d) 2.114 +{ 2.115 + struct p2m_domain *p2md = d->arch.p2m; 2.116 + struct list_head *q, *p; 2.117 + 2.118 + spin_lock(&d->page_alloc_lock); 2.119 + 2.120 + list_for_each_safe(p, q, &p2md->pod.super) /* lock: page_alloc */ 2.121 + { 2.122 + int i; 2.123 + struct page_info *page; 2.124 + 2.125 + list_del(p); 2.126 + 2.127 + page = list_entry(p, struct page_info, list); 2.128 + 2.129 + for ( i = 0 ; i < (1 << 9) ; i++ ) 2.130 + { 2.131 + BUG_ON(page_get_owner(page + i) != d); 2.132 + list_add_tail(&page[i].list, &d->page_list); 2.133 + } 2.134 + 2.135 + p2md->pod.count -= 1<<9; 2.136 + } 2.137 + 2.138 + list_for_each_safe(p, q, &p2md->pod.single) 2.139 + { 2.140 + struct page_info *page; 2.141 + 2.142 + list_del(p); 2.143 + 2.144 + page = list_entry(p, struct page_info, list); 2.145 + 2.146 + BUG_ON(page_get_owner(page) != d); 2.147 + list_add_tail(&page->list, &d->page_list); 2.148 + 2.149 + p2md->pod.count -= 1; 2.150 + } 2.151 + 2.152 + BUG_ON(p2md->pod.count != 0); 2.153 + 2.154 + spin_unlock(&d->page_alloc_lock); 2.155 +} 2.156 + 2.157 +void 2.158 +p2m_pod_dump_data(struct domain *d) 2.159 +{ 2.160 + struct p2m_domain *p2md = d->arch.p2m; 2.161 + 2.162 + printk(" PoD entries=%d cachesize=%d\n", 2.163 + p2md->pod.entry_count, p2md->pod.count); 2.164 +} 2.165 + 2.166 +static int 2.167 +p2m_pod_demand_populate(struct domain *d, unsigned long gfn, 2.168 + mfn_t table_mfn, 2.169 + l1_pgentry_t *p2m_entry, 2.170 + unsigned int order, 2.171 + p2m_query_t q) 2.172 +{ 2.173 + struct page_info *p = NULL; /* Compiler warnings */ 2.174 + unsigned long gfn_aligned; 2.175 + mfn_t mfn; 2.176 + l1_pgentry_t entry_content = l1e_empty(); 2.177 + struct p2m_domain *p2md = d->arch.p2m; 2.178 + int i; 2.179 + 2.180 + /* We need to grab the p2m lock here and re-check the entry to make 2.181 + * sure that someone else hasn't populated it for us, then hold it 2.182 + * until we're done. 
*/ 2.183 + p2m_lock(p2md); 2.184 + audit_p2m(d); 2.185 + 2.186 + /* Check to make sure this is still PoD */ 2.187 + if ( p2m_flags_to_type(l1e_get_flags(*p2m_entry)) != p2m_populate_on_demand ) 2.188 + { 2.189 + p2m_unlock(p2md); 2.190 + return 0; 2.191 + } 2.192 + 2.193 + spin_lock(&d->page_alloc_lock); 2.194 + 2.195 + if ( p2md->pod.count == 0 ) 2.196 + goto out_of_memory; 2.197 + 2.198 + /* FIXME -- use single pages / splinter superpages if need be */ 2.199 + switch ( order ) 2.200 + { 2.201 + case 9: 2.202 + BUG_ON( list_empty(&p2md->pod.super) ); 2.203 + p = list_entry(p2md->pod.super.next, struct page_info, list); 2.204 + p2md->pod.count -= 1 << order; /* Lock: page_alloc */ 2.205 + break; 2.206 + case 0: 2.207 + BUG_ON( list_empty(&p2md->pod.single) ); 2.208 + p = list_entry(p2md->pod.single.next, struct page_info, list); 2.209 + p2md->pod.count -= 1; 2.210 + break; 2.211 + default: 2.212 + BUG(); 2.213 + } 2.214 + 2.215 + list_del(&p->list); 2.216 + 2.217 + mfn = page_to_mfn(p); 2.218 + 2.219 + BUG_ON((mfn_x(mfn) & ((1 << order)-1)) != 0); 2.220 + 2.221 + /* Put the pages back on the domain page_list */ 2.222 + for ( i = 0 ; i < (1 << order) ; i++ ) 2.223 + { 2.224 + BUG_ON(page_get_owner(p + i) != d); 2.225 + list_add_tail(&p[i].list, &d->page_list); 2.226 + } 2.227 + 2.228 + spin_unlock(&d->page_alloc_lock); 2.229 + 2.230 + /* Fill in the entry in the p2m */ 2.231 + switch ( order ) 2.232 + { 2.233 + case 9: 2.234 + { 2.235 + l2_pgentry_t l2e_content; 2.236 + 2.237 + l2e_content = l2e_from_pfn(mfn_x(mfn), 2.238 + p2m_type_to_flags(p2m_ram_rw) | _PAGE_PSE); 2.239 + 2.240 + entry_content.l1 = l2e_content.l2; 2.241 + } 2.242 + break; 2.243 + case 0: 2.244 + entry_content = l1e_from_pfn(mfn_x(mfn), 2.245 + p2m_type_to_flags(p2m_ram_rw)); 2.246 + break; 2.247 + 2.248 + } 2.249 + 2.250 + gfn_aligned = (gfn >> order) << order; 2.251 + 2.252 + paging_write_p2m_entry(d, gfn_aligned, p2m_entry, table_mfn, 2.253 + entry_content, (order==9)?2:1); 2.254 + 2.255 + for( i = 0 ; i < (1UL << order) ; i++ ) 2.256 + set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_aligned + i); 2.257 + 2.258 + p2md->pod.entry_count -= (1 << order); /* Lock: p2m */ 2.259 + BUG_ON(p2md->pod.entry_count < 0); 2.260 + audit_p2m(d); 2.261 + p2m_unlock(p2md); 2.262 + 2.263 + return 0; 2.264 +out_of_memory: 2.265 + spin_unlock(&d->page_alloc_lock); 2.266 + audit_p2m(d); 2.267 + p2m_unlock(p2md); 2.268 + printk("%s: Out of populate-on-demand memory!\n", __func__); 2.269 + domain_crash(d); 2.270 + return -1; 2.271 +} 2.272 + 2.273 // Returns 0 on error (out of memory) 2.274 static int 2.275 p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 2.276 @@ -303,6 +541,7 @@ p2m_set_entry(struct domain *d, unsigned 2.277 L2_PAGETABLE_ENTRIES); 2.278 ASSERT(p2m_entry); 2.279 2.280 + /* FIXME: Deal with 4k replaced by 2meg pages */ 2.281 if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) && 2.282 !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) ) 2.283 { 2.284 @@ -311,7 +550,7 @@ p2m_set_entry(struct domain *d, unsigned 2.285 goto out; 2.286 } 2.287 2.288 - if ( mfn_valid(mfn) ) 2.289 + if ( mfn_valid(mfn) || p2m_is_magic(p2mt) ) 2.290 l2e_content = l2e_from_pfn(mfn_x(mfn), 2.291 p2m_type_to_flags(p2mt) | _PAGE_PSE); 2.292 else 2.293 @@ -403,8 +642,21 @@ p2m_gfn_to_mfn(struct domain *d, unsigne 2.294 2.295 l2e = map_domain_page(mfn_x(mfn)); 2.296 l2e += l2_table_offset(addr); 2.297 + 2.298 +pod_retry_l2: 2.299 if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 ) 2.300 { 2.301 + /* PoD: Try to populate a 2-meg chunk */ 2.302 + if ( 
p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand ) 2.303 + { 2.304 + if ( q != p2m_query ) { 2.305 + if( !p2m_pod_demand_populate(d, gfn, mfn, 2.306 + (l1_pgentry_t *)l2e, 9, q) ) 2.307 + goto pod_retry_l2; 2.308 + } else 2.309 + *t = p2m_populate_on_demand; 2.310 + } 2.311 + 2.312 unmap_domain_page(l2e); 2.313 return _mfn(INVALID_MFN); 2.314 } 2.315 @@ -423,8 +675,20 @@ p2m_gfn_to_mfn(struct domain *d, unsigne 2.316 2.317 l1e = map_domain_page(mfn_x(mfn)); 2.318 l1e += l1_table_offset(addr); 2.319 +pod_retry_l1: 2.320 if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 ) 2.321 { 2.322 + /* PoD: Try to populate */ 2.323 + if ( p2m_flags_to_type(l1e_get_flags(*l1e)) == p2m_populate_on_demand ) 2.324 + { 2.325 + if ( q != p2m_query ) { 2.326 + if( !p2m_pod_demand_populate(d, gfn, mfn, 2.327 + (l1_pgentry_t *)l1e, 0, q) ) 2.328 + goto pod_retry_l1; 2.329 + } else 2.330 + *t = p2m_populate_on_demand; 2.331 + } 2.332 + 2.333 unmap_domain_page(l1e); 2.334 return _mfn(INVALID_MFN); 2.335 } 2.336 @@ -450,48 +714,114 @@ static mfn_t p2m_gfn_to_mfn_current(unsi 2.337 2.338 if ( gfn <= current->domain->arch.p2m->max_mapped_pfn ) 2.339 { 2.340 - l1_pgentry_t l1e = l1e_empty(); 2.341 + l1_pgentry_t l1e = l1e_empty(), *p2m_entry; 2.342 l2_pgentry_t l2e = l2e_empty(); 2.343 int ret; 2.344 2.345 ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) 2.346 / sizeof(l1_pgentry_t)); 2.347 2.348 + /* 2.349 + * Read & process L2 2.350 + */ 2.351 + p2m_entry = &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START) 2.352 + + l2_linear_offset(addr)]; 2.353 + 2.354 + pod_retry_l2: 2.355 ret = __copy_from_user(&l2e, 2.356 - &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START) + l2_linear_offset(addr)], 2.357 + p2m_entry, 2.358 sizeof(l2e)); 2.359 + if ( ret != 0 2.360 + || !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) 2.361 + { 2.362 + if( (l2e_get_flags(l2e) & _PAGE_PSE) 2.363 + && ( p2m_flags_to_type(l2e_get_flags(l2e)) 2.364 + == p2m_populate_on_demand ) ) 2.365 + { 2.366 + /* The read has succeeded, so we know that the mapping 2.367 + * exits at this point. */ 2.368 + if ( q != p2m_query ) 2.369 + { 2.370 + if( !p2m_pod_demand_populate(current->domain, gfn, mfn, 2.371 + p2m_entry, 9, q) ) 2.372 + goto pod_retry_l2; 2.373 + 2.374 + /* Allocate failed. 
*/ 2.375 + p2mt = p2m_invalid; 2.376 + printk("%s: Allocate failed!\n", __func__); 2.377 + goto out; 2.378 + } 2.379 + else 2.380 + { 2.381 + p2mt = p2m_populate_on_demand; 2.382 + goto out; 2.383 + } 2.384 + } 2.385 + 2.386 + goto pod_retry_l1; 2.387 + } 2.388 2.389 - if ( (ret == 0) && (l2e_get_flags(l2e) & _PAGE_PRESENT) && 2.390 - (l2e_get_flags(l2e) & _PAGE_PSE) ) 2.391 + if (l2e_get_flags(l2e) & _PAGE_PSE) 2.392 { 2.393 p2mt = p2m_flags_to_type(l2e_get_flags(l2e)); 2.394 ASSERT(l2e_get_pfn(l2e) != INVALID_MFN || !p2m_is_ram(p2mt)); 2.395 + 2.396 if ( p2m_is_valid(p2mt) ) 2.397 mfn = _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr)); 2.398 else 2.399 p2mt = p2m_mmio_dm; 2.400 + 2.401 + goto out; 2.402 } 2.403 - else 2.404 - { 2.405 - 2.406 - /* Need to __copy_from_user because the p2m is sparse and this 2.407 - * part might not exist */ 2.408 - ret = __copy_from_user(&l1e, 2.409 - &phys_to_machine_mapping[gfn], 2.410 - sizeof(l1e)); 2.411 + 2.412 + /* 2.413 + * Read and process L1 2.414 + */ 2.415 + 2.416 + /* Need to __copy_from_user because the p2m is sparse and this 2.417 + * part might not exist */ 2.418 + pod_retry_l1: 2.419 + p2m_entry = &phys_to_machine_mapping[gfn]; 2.420 + 2.421 + ret = __copy_from_user(&l1e, 2.422 + p2m_entry, 2.423 + sizeof(l1e)); 2.424 2.425 - if ( ret == 0 ) { 2.426 - p2mt = p2m_flags_to_type(l1e_get_flags(l1e)); 2.427 - ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt)); 2.428 - if ( p2m_is_valid(p2mt) ) 2.429 - mfn = _mfn(l1e_get_pfn(l1e)); 2.430 - else 2.431 - /* XXX see above */ 2.432 - p2mt = p2m_mmio_dm; 2.433 + if ( ret == 0 ) { 2.434 + p2mt = p2m_flags_to_type(l1e_get_flags(l1e)); 2.435 + ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt)); 2.436 + 2.437 + if ( p2m_flags_to_type(l1e_get_flags(l1e)) 2.438 + == p2m_populate_on_demand ) 2.439 + { 2.440 + /* The read has succeeded, so we know that the mapping 2.441 + * exits at this point. */ 2.442 + if ( q != p2m_query ) 2.443 + { 2.444 + if( !p2m_pod_demand_populate(current->domain, gfn, mfn, 2.445 + (l1_pgentry_t *)p2m_entry, 0, 2.446 + q) ) 2.447 + goto pod_retry_l1; 2.448 + 2.449 + /* Allocate failed. 
*/ 2.450 + p2mt = p2m_invalid; 2.451 + goto out; 2.452 + } 2.453 + else 2.454 + { 2.455 + p2mt = p2m_populate_on_demand; 2.456 + goto out; 2.457 + } 2.458 } 2.459 + 2.460 + if ( p2m_is_valid(p2mt) ) 2.461 + mfn = _mfn(l1e_get_pfn(l1e)); 2.462 + else 2.463 + /* XXX see above */ 2.464 + p2mt = p2m_mmio_dm; 2.465 } 2.466 } 2.467 - 2.468 +out: 2.469 *t = p2mt; 2.470 return mfn; 2.471 } 2.472 @@ -510,6 +840,8 @@ int p2m_init(struct domain *d) 2.473 memset(p2m, 0, sizeof(*p2m)); 2.474 p2m_lock_init(p2m); 2.475 INIT_LIST_HEAD(&p2m->pages); 2.476 + INIT_LIST_HEAD(&p2m->pod.super); 2.477 + INIT_LIST_HEAD(&p2m->pod.single); 2.478 2.479 p2m->set_entry = p2m_set_entry; 2.480 p2m->get_entry = p2m_gfn_to_mfn; 2.481 @@ -680,6 +1012,7 @@ static void audit_p2m(struct domain *d) 2.482 struct page_info *page; 2.483 struct domain *od; 2.484 unsigned long mfn, gfn, m2pfn, lp2mfn = 0; 2.485 + int entry_count = 0; 2.486 mfn_t p2mfn; 2.487 unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0; 2.488 int test_linear; 2.489 @@ -805,6 +1138,10 @@ static void audit_p2m(struct domain *d) 2.490 { 2.491 if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) ) 2.492 { 2.493 + if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE) 2.494 + && ( p2m_flags_to_type(l2e_get_flags(l2e[i2])) 2.495 + == p2m_populate_on_demand ) ) 2.496 + entry_count+=(1<<9); 2.497 gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT); 2.498 continue; 2.499 } 2.500 @@ -835,13 +1172,20 @@ static void audit_p2m(struct domain *d) 2.501 for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ ) 2.502 { 2.503 if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) ) 2.504 + { 2.505 + if ( p2m_flags_to_type(l1e_get_flags(l1e[i1])) 2.506 + == p2m_populate_on_demand ) 2.507 + entry_count++; 2.508 continue; 2.509 + } 2.510 mfn = l1e_get_pfn(l1e[i1]); 2.511 ASSERT(mfn_valid(_mfn(mfn))); 2.512 m2pfn = get_gpfn_from_mfn(mfn); 2.513 if ( m2pfn != gfn ) 2.514 { 2.515 pmbad++; 2.516 + printk("mismatch: gfn %#lx -> mfn %#lx" 2.517 + " -> gfn %#lx\n", gfn, mfn, m2pfn); 2.518 P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx" 2.519 " -> gfn %#lx\n", gfn, mfn, m2pfn); 2.520 BUG(); 2.521 @@ -864,6 +1208,15 @@ static void audit_p2m(struct domain *d) 2.522 2.523 } 2.524 2.525 + if ( entry_count != d->arch.p2m->pod.entry_count ) 2.526 + { 2.527 + printk("%s: refcounted entry count %d, audit count %d!\n", 2.528 + __func__, 2.529 + d->arch.p2m->pod.entry_count, 2.530 + entry_count); 2.531 + BUG(); 2.532 + } 2.533 + 2.534 //P2M_PRINTK("p2m audit complete\n"); 2.535 //if ( orphans_i | orphans_d | mpbad | pmbad ) 2.536 // P2M_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n", 2.537 @@ -872,8 +1225,6 @@ static void audit_p2m(struct domain *d) 2.538 P2M_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n", 2.539 pmbad, mpbad); 2.540 } 2.541 -#else 2.542 -#define audit_p2m(_d) do { (void)(_d); } while(0) 2.543 #endif /* P2M_AUDIT */ 2.544 2.545 2.546 @@ -911,6 +1262,77 @@ guest_physmap_remove_page(struct domain 2.547 } 2.548 2.549 int 2.550 +guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn, 2.551 + unsigned int order) 2.552 +{ 2.553 + struct p2m_domain *p2md = d->arch.p2m; 2.554 + unsigned long i; 2.555 + p2m_type_t ot; 2.556 + mfn_t omfn; 2.557 + int pod_count = 0; 2.558 + int rc = 0; 2.559 + 2.560 + BUG_ON(!paging_mode_translate(d)); 2.561 + 2.562 +#if CONFIG_PAGING_LEVELS == 3 2.563 + /* 2.564 + * 32bit PAE nested paging does not support over 4GB guest due to 2.565 + * hardware translation limit. This limitation is checked by comparing 2.566 + * gfn with 0xfffffUL. 
2.567 + */ 2.568 + if ( paging_mode_hap(d) && (gfn > 0xfffffUL) ) 2.569 + { 2.570 + if ( !test_and_set_bool(d->arch.hvm_domain.svm.npt_4gb_warning) ) 2.571 + dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond" 2.572 + " 4GB: specify 'hap=0' domain config option.\n", 2.573 + d->domain_id); 2.574 + return -EINVAL; 2.575 + } 2.576 +#endif 2.577 + 2.578 + p2m_lock(p2md); 2.579 + audit_p2m(d); 2.580 + 2.581 + P2M_DEBUG("adding gfn=%#lx mfn=%#lx\n", gfn, mfn); 2.582 + 2.583 + /* Make sure all gpfns are unused */ 2.584 + for ( i = 0; i < (1UL << order); i++ ) 2.585 + { 2.586 + omfn = gfn_to_mfn_query(d, gfn + i, &ot); 2.587 + if ( p2m_is_ram(ot) ) 2.588 + { 2.589 + printk("%s: gfn_to_mfn returned type %d!\n", 2.590 + __func__, ot); 2.591 + rc = -EBUSY; 2.592 + goto out; 2.593 + } 2.594 + else if ( ot == p2m_populate_on_demand ) 2.595 + { 2.596 + /* Count how man PoD entries we'll be replacing if successful */ 2.597 + pod_count++; 2.598 + } 2.599 + } 2.600 + 2.601 + /* Now, actually do the two-way mapping */ 2.602 + if ( !set_p2m_entry(d, gfn, _mfn(POPULATE_ON_DEMAND_MFN), order, 2.603 + p2m_populate_on_demand) ) 2.604 + rc = -EINVAL; 2.605 + else 2.606 + { 2.607 + p2md->pod.entry_count += 1 << order; /* Lock: p2m */ 2.608 + p2md->pod.entry_count -= pod_count; 2.609 + BUG_ON(p2md->pod.entry_count < 0); 2.610 + } 2.611 + 2.612 + audit_p2m(d); 2.613 + p2m_unlock(p2md); 2.614 + 2.615 +out: 2.616 + return rc; 2.617 + 2.618 +} 2.619 + 2.620 +int 2.621 guest_physmap_add_entry(struct domain *d, unsigned long gfn, 2.622 unsigned long mfn, unsigned int page_order, 2.623 p2m_type_t t) 2.624 @@ -918,6 +1340,7 @@ guest_physmap_add_entry(struct domain *d 2.625 unsigned long i, ogfn; 2.626 p2m_type_t ot; 2.627 mfn_t omfn; 2.628 + int pod_count = 0; 2.629 int rc = 0; 2.630 2.631 if ( !paging_mode_translate(d) ) 2.632 @@ -966,6 +1389,11 @@ guest_physmap_add_entry(struct domain *d 2.633 ASSERT(mfn_valid(omfn)); 2.634 set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); 2.635 } 2.636 + else if ( ot == p2m_populate_on_demand ) 2.637 + { 2.638 + /* Count how man PoD entries we'll be replacing if successful */ 2.639 + pod_count++; 2.640 + } 2.641 } 2.642 2.643 /* Then, look for m->p mappings for this range and deal with them */ 2.644 @@ -1012,6 +1440,11 @@ guest_physmap_add_entry(struct domain *d 2.645 if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), page_order, 2.646 p2m_invalid) ) 2.647 rc = -EINVAL; 2.648 + else 2.649 + { 2.650 + d->arch.p2m->pod.entry_count -= pod_count; /* Lock: p2m */ 2.651 + BUG_ON(d->arch.p2m->pod.entry_count < 0); 2.652 + } 2.653 } 2.654 2.655 audit_p2m(d);
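
A note on the locking in p2m_pod_demand_populate() above: it takes the p2m lock and then re-checks that the entry is still of populate-on-demand type, so that two vcpus faulting on the same gfn at once populate it only once; the loser of the race simply returns and retries the lookup. Below is a minimal stand-alone sketch of that check-under-lock pattern, with a pthread mutex standing in for the p2m lock and invented names; illustration only, not Xen code (build with cc -pthread).

/* Stand-alone sketch of the "take the lock, re-check, then populate" pattern
 * used by p2m_pod_demand_populate().  Illustration only, not Xen code. */
#include <pthread.h>
#include <stdio.h>

enum type { POD, RAM };

static enum type entry = POD;     /* a single "p2m entry"                 */
static int populations;           /* how many times we actually populated */
static pthread_mutex_t p2m_lock = PTHREAD_MUTEX_INITIALIZER;

static void *demand_populate(void *arg)
{
    (void)arg;

    pthread_mutex_lock(&p2m_lock);

    /* Re-check: someone else may have populated this entry for us while
     * we were waiting for the lock. */
    if ( entry == POD )
    {
        entry = RAM;              /* ...take a page from the cache, etc.  */
        populations++;
    }

    pthread_mutex_unlock(&p2m_lock);
    return NULL;
}

int main(void)
{
    pthread_t t1, t2;

    /* Two "vcpus" fault on the same gfn at the same time. */
    pthread_create(&t1, NULL, demand_populate, NULL);
    pthread_create(&t2, NULL, demand_populate, NULL);
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);

    printf("entry populated %d time(s)\n", populations);   /* always 1 */
    return 0;
}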
--- a/xen/arch/x86/mm/paging.c Mon Jan 05 10:42:39 2009 +0000
+++ b/xen/arch/x86/mm/paging.c Mon Jan 05 10:43:19 2009 +0000
@@ -585,6 +585,9 @@ void paging_teardown(struct domain *d)
 
     /* clean up log dirty resources. */
     paging_log_dirty_teardown(d);
+
+    /* Move populate-on-demand cache back to domain_list for destruction */
+    p2m_pod_empty_cache(d);
 }
 
 /* Call once all of the references to the domain have gone away */
--- a/xen/arch/x86/mm/shadow/multi.c Mon Jan 05 10:42:39 2009 +0000
+++ b/xen/arch/x86/mm/shadow/multi.c Mon Jan 05 10:43:19 2009 +0000
@@ -2173,7 +2173,7 @@ static int validate_gl4e(struct vcpu *v,
         mfn_t gl3mfn = gfn_to_mfn_query(d, gl3gfn, &p2mt);
         if ( p2m_is_ram(p2mt) )
             sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
-        else
+        else if ( p2mt != p2m_populate_on_demand )
             result |= SHADOW_SET_ERROR;
 
 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
@@ -2230,7 +2230,7 @@ static int validate_gl3e(struct vcpu *v,
         mfn_t gl2mfn = gfn_to_mfn_query(v->domain, gl2gfn, &p2mt);
         if ( p2m_is_ram(p2mt) )
             sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
-        else
+        else if ( p2mt != p2m_populate_on_demand )
             result |= SHADOW_SET_ERROR;
 
 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
@@ -2278,8 +2278,8 @@ static int validate_gl2e(struct vcpu *v,
         {
             mfn_t gl1mfn = gfn_to_mfn_query(v->domain, gl1gfn, &p2mt);
             if ( p2m_is_ram(p2mt) )
-                sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
-            else
+                sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
+            else if ( p2mt != p2m_populate_on_demand )
                 result |= SHADOW_SET_ERROR;
         }
     }
--- a/xen/include/asm-x86/p2m.h Mon Jan 05 10:42:39 2009 +0000
+++ b/xen/include/asm-x86/p2m.h Mon Jan 05 10:43:19 2009 +0000
@@ -64,6 +64,7 @@ typedef enum {
     p2m_ram_ro = 3,             /* Read-only; writes are silently dropped */
     p2m_mmio_dm = 4,            /* Reads and write go to the device model */
     p2m_mmio_direct = 5,        /* Read/write mapping of genuine MMIO area */
+    p2m_populate_on_demand = 6, /* Place-holder for empty memory */
 } p2m_type_t;
 
 typedef enum {
@@ -88,12 +89,20 @@ typedef enum {
 #define P2M_RO_TYPES (p2m_to_mask(p2m_ram_logdirty)     \
                       | p2m_to_mask(p2m_ram_ro))
 
+#define P2M_MAGIC_TYPES (p2m_to_mask(p2m_populate_on_demand))
+
 /* Useful predicates */
 #define p2m_is_ram(_t) (p2m_to_mask(_t) & P2M_RAM_TYPES)
 #define p2m_is_mmio(_t) (p2m_to_mask(_t) & P2M_MMIO_TYPES)
 #define p2m_is_readonly(_t) (p2m_to_mask(_t) & P2M_RO_TYPES)
+#define p2m_is_magic(_t) (p2m_to_mask(_t) & P2M_MAGIC_TYPES)
 #define p2m_is_valid(_t) (p2m_to_mask(_t) & (P2M_RAM_TYPES | P2M_MMIO_TYPES))
 
+/* Populate-on-demand */
+#define POPULATE_ON_DEMAND_MFN (1<<9)
+#define POD_PAGE_ORDER 9
+
+
 struct p2m_domain {
     /* Lock that protects updates to the p2m */
     spinlock_t         lock;
@@ -122,6 +131,28 @@ struct p2m_domain {
 
     /* Highest guest frame that's ever been mapped in the p2m */
     unsigned long max_mapped_pfn;
+
+    /* Populate-on-demand variables
+     * NB on locking.  {super,single,count} are
+     * covered by d->page_alloc_lock, since they're almost always used in
+     * conjunction with that functionality.  {entry_count} is covered by
+     * the domain p2m lock, since it's almost always used in conjunction
+     * with changing the p2m tables.
+     *
+     * At this point, both locks are held in two places.  In both,
+     * the order is [p2m,page_alloc]:
+     * + p2m_pod_decrease_reservation() calls p2m_pod_cache_add(),
+     *   which grabs page_alloc
+     * + p2m_pod_demand_populate() grabs both; the p2m lock to avoid
+     *   double-demand-populating of pages, the page_alloc lock to
+     *   protect moving stuff from the PoD cache to the domain page list.
+     */
+    struct {
+        struct list_head super,        /* List of superpages           */
+                         single;       /* Non-super lists              */
+        int              count,        /* # of pages in cache lists    */
+                         entry_count;  /* # of pages in p2m marked pod */
+    } pod;
 };
 
 /* Extract the type from the PTE flags that store it */
@@ -220,11 +251,22 @@ int p2m_alloc_table(struct domain *d,
 void p2m_teardown(struct domain *d);
 void p2m_final_teardown(struct domain *d);
 
+/* Dump PoD information about the domain */
+void p2m_pod_dump_data(struct domain *d);
+
+/* Move all pages from the populate-on-demand cache to the domain page_list
+ * (usually in preparation for domain destruction) */
+void p2m_pod_empty_cache(struct domain *d);
+
 /* Add a page to a domain's p2m table */
 int guest_physmap_add_entry(struct domain *d, unsigned long gfn,
                             unsigned long mfn, unsigned int page_order,
                             p2m_type_t t);
 
+/* Set a p2m range as populate-on-demand */
+int guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
+                                          unsigned int order);
+
 /* Untyped version for RAM only, for compatibility
  *
  * Return 0 for success
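
The locking comment added to struct p2m_domain fixes a single acquisition order, [p2m, page_alloc], for the paths that hold both locks. Below is a minimal sketch of that discipline with pthread mutexes standing in for the Xen locks and invented function names; illustration only, not Xen code.

/* Stand-alone sketch of the [p2m, page_alloc] lock ordering documented in
 * struct p2m_domain above: every path that needs both locks takes them in
 * the same order, so no two paths can each hold the lock the other wants.
 * Illustration only, not Xen code. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t p2m_lock        = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t page_alloc_lock = PTHREAD_MUTEX_INITIALIZER;

/* Models p2m_pod_demand_populate(): p2m lock to serialise p2m updates,
 * page_alloc lock inside it to move a page out of the PoD cache. */
static void demand_populate(void)
{
    pthread_mutex_lock(&p2m_lock);
    pthread_mutex_lock(&page_alloc_lock);
    /* ...take a page from pod.super/pod.single, write the p2m entry... */
    pthread_mutex_unlock(&page_alloc_lock);
    pthread_mutex_unlock(&p2m_lock);
}

/* Models the decrease-reservation path feeding the cache: same order,
 * never the reverse. */
static void decrease_reservation(void)
{
    pthread_mutex_lock(&p2m_lock);
    pthread_mutex_lock(&page_alloc_lock);
    /* ...clear the p2m entry, add the freed page to the PoD cache... */
    pthread_mutex_unlock(&page_alloc_lock);
    pthread_mutex_unlock(&p2m_lock);
}

int main(void)
{
    demand_populate();
    decrease_reservation();
    puts("both paths took [p2m, page_alloc] in the same order");
    return 0;
}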