/root/src/xen/xen/arch/x86/mm/p2m-pod.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * arch/x86/mm/p2m-pod.c |
3 | | * |
4 | | * Populate-on-demand p2m entries. |
5 | | * |
6 | | * Copyright (c) 2009-2011 Citrix Systems, Inc. |
7 | | * |
8 | | * This program is free software; you can redistribute it and/or modify |
9 | | * it under the terms of the GNU General Public License as published by |
10 | | * the Free Software Foundation; either version 2 of the License, or |
11 | | * (at your option) any later version. |
12 | | * |
13 | | * This program is distributed in the hope that it will be useful, |
14 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | | * GNU General Public License for more details. |
17 | | * |
18 | | * You should have received a copy of the GNU General Public License |
19 | | * along with this program; If not, see <http://www.gnu.org/licenses/>. |
20 | | */ |
21 | | |
22 | | #include <xen/event.h> |
23 | | #include <xen/mm.h> |
24 | | #include <xen/sched.h> |
25 | | #include <xen/trace.h> |
26 | | #include <asm/page.h> |
27 | | #include <asm/paging.h> |
28 | | #include <asm/p2m.h> |
29 | | |
30 | | #include "mm-locks.h" |
31 | | |
32 | | /* Override macros from asm/page.h to make them work with mfn_t */ |
33 | | #undef mfn_to_page |
34 | 0 | #define mfn_to_page(_m) __mfn_to_page(mfn_x(_m)) |
35 | | #undef page_to_mfn |
36 | 0 | #define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg)) |
37 | | |
38 | 0 | #define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0) |
39 | | |
40 | | /* Enforce lock ordering when grabbing the "external" page_alloc lock */ |
41 | | static inline void lock_page_alloc(struct p2m_domain *p2m) |
42 | 0 | { |
43 | 0 | page_alloc_mm_pre_lock(); |
44 | 0 | spin_lock(&(p2m->domain->page_alloc_lock)); |
45 | 0 | page_alloc_mm_post_lock(p2m->domain->arch.page_alloc_unlock_level); |
46 | 0 | } |
47 | | |
48 | | static inline void unlock_page_alloc(struct p2m_domain *p2m) |
49 | 0 | { |
50 | 0 | page_alloc_mm_unlock(p2m->domain->arch.page_alloc_unlock_level); |
51 | 0 | spin_unlock(&(p2m->domain->page_alloc_lock)); |
52 | 0 | } |
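The two helpers above exist to keep the lock-ordering checker satisfied when PoD code takes the domain's page_alloc lock while already holding the pod lock. Below is a minimal stand-alone model of that kind of level-based ordering check; the level values and names are invented for the sketch and are not the mm-locks.h interface.

#include <assert.h>
#include <stdio.h>

#define POD_LEVEL        10   /* hypothetical ordering level for the pod lock */
#define PAGE_ALLOC_LEVEL 20   /* the page_alloc lock must be taken after it */

static int cur_level;         /* highest ordering level currently held */

static void ordered_lock(int level)
{
    assert(level > cur_level);   /* taking locks out of order is a bug */
    cur_level = level;
}

static void ordered_unlock(int prev_level)
{
    cur_level = prev_level;
}

int main(void)
{
    ordered_lock(POD_LEVEL);          /* cf. pod_lock() */
    ordered_lock(PAGE_ALLOC_LEVEL);   /* cf. lock_page_alloc() */
    ordered_unlock(POD_LEVEL);        /* cf. unlock_page_alloc() */
    ordered_unlock(0);                /* cf. pod_unlock() */
    puts("lock ordering respected");
    return 0;
}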
53 | | |
54 | | /* |
55 | | * Populate-on-demand functionality |
56 | | */ |
57 | | |
58 | | static int |
59 | | p2m_pod_cache_add(struct p2m_domain *p2m, |
60 | | struct page_info *page, |
61 | | unsigned int order) |
62 | 0 | { |
63 | 0 | unsigned long i; |
64 | 0 | struct page_info *p; |
65 | 0 | struct domain *d = p2m->domain; |
66 | 0 |
|
67 | 0 | #ifndef NDEBUG |
68 | 0 | mfn_t mfn; |
69 | 0 |
|
70 | 0 | mfn = page_to_mfn(page); |
71 | 0 |
|
72 | 0 | /* Check to make sure this is a contiguous region */ |
73 | 0 | if ( mfn_x(mfn) & ((1UL << order) - 1) ) |
74 | 0 | { |
75 | 0 | printk("%s: mfn %lx not aligned order %u! (mask %lx)\n", |
76 | 0 | __func__, mfn_x(mfn), order, ((1UL << order) - 1)); |
77 | 0 | return -1; |
78 | 0 | } |
79 | 0 |
|
80 | 0 | for ( i = 0; i < 1UL << order ; i++) |
81 | 0 | { |
82 | 0 | struct domain * od; |
83 | 0 |
|
84 | 0 | p = mfn_to_page(_mfn(mfn_x(mfn) + i)); |
85 | 0 | od = page_get_owner(p); |
86 | 0 | if ( od != d ) |
87 | 0 | { |
88 | 0 | printk("%s: mfn %lx expected owner d%d, got owner d%d!\n", |
89 | 0 | __func__, mfn_x(mfn), d->domain_id, |
90 | 0 | od ? od->domain_id : -1); |
91 | 0 | return -1; |
92 | 0 | } |
93 | 0 | } |
94 | 0 | #endif |
95 | 0 |
|
96 | 0 | ASSERT(pod_locked_by_me(p2m)); |
97 | 0 |
|
98 | 0 | /* |
99 | 0 | * Pages from domain_alloc and returned by the balloon driver aren't |
100 | 0 | * guaranteed to be zero; but by reclaiming zero pages, we implicitly |
101 | 0 | * promise to provide zero pages. So we scrub pages before use. |
102 | 0 | */ |
103 | 0 | for ( i = 0; i < (1UL << order); i++ ) |
104 | 0 | clear_domain_page(mfn_add(page_to_mfn(page), i)); |
105 | 0 |
|
106 | 0 | /* First, take all pages off the domain list */ |
107 | 0 | lock_page_alloc(p2m); |
108 | 0 | for ( i = 0; i < 1UL << order ; i++ ) |
109 | 0 | { |
110 | 0 | p = page + i; |
111 | 0 | page_list_del(p, &d->page_list); |
112 | 0 | } |
113 | 0 |
|
114 | 0 | unlock_page_alloc(p2m); |
115 | 0 |
|
116 | 0 | /* Then add to the appropriate populate-on-demand list. */ |
117 | 0 | switch ( order ) |
118 | 0 | { |
119 | 0 | case PAGE_ORDER_1G: |
120 | 0 | for ( i = 0; i < (1UL << PAGE_ORDER_1G); i += 1UL << PAGE_ORDER_2M ) |
121 | 0 | page_list_add_tail(page + i, &p2m->pod.super); |
122 | 0 | break; |
123 | 0 | case PAGE_ORDER_2M: |
124 | 0 | page_list_add_tail(page, &p2m->pod.super); |
125 | 0 | break; |
126 | 0 | case PAGE_ORDER_4K: |
127 | 0 | page_list_add_tail(page, &p2m->pod.single); |
128 | 0 | break; |
129 | 0 | default: |
130 | 0 | BUG(); |
131 | 0 | } |
132 | 0 | p2m->pod.count += 1UL << order; |
133 | 0 |
|
134 | 0 | return 0; |
135 | 0 | } |
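The alignment test at the top of p2m_pod_cache_add() is a plain mask check against the allocation order. A small self-contained demo of the same arithmetic, assuming the usual x86 order values (4K = 0, 2M = 9, 1G = 18):

#include <stdio.h>

#define PAGE_ORDER_4K 0
#define PAGE_ORDER_2M 9
#define PAGE_ORDER_1G 18

static int mfn_aligned(unsigned long mfn, unsigned int order)
{
    return (mfn & ((1UL << order) - 1)) == 0;
}

int main(void)
{
    unsigned long mfn = 0x40200;   /* example machine frame number */

    printf("4K aligned: %d\n", mfn_aligned(mfn, PAGE_ORDER_4K)); /* 1 */
    printf("2M aligned: %d\n", mfn_aligned(mfn, PAGE_ORDER_2M)); /* 1 */
    printf("1G aligned: %d\n", mfn_aligned(mfn, PAGE_ORDER_1G)); /* 0 */
    return 0;
}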
136 | | |
137 | | /* Get a page of the requested order from the populate-on-demand cache. |
138 | | * Breaks 2-meg pages down into singleton pages automatically. Returns |
139 | | * NULL if a superpage is requested and no superpages are available. */ |
140 | | static struct page_info * p2m_pod_cache_get(struct p2m_domain *p2m, |
141 | | unsigned int order) |
142 | 0 | { |
143 | 0 | struct page_info *p = NULL; |
144 | 0 | unsigned long i; |
145 | 0 |
|
146 | 0 | ASSERT(pod_locked_by_me(p2m)); |
147 | 0 |
|
148 | 0 | if ( order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) ) |
149 | 0 | { |
150 | 0 | return NULL; |
151 | 0 | } |
152 | 0 | else if ( order == PAGE_ORDER_4K && page_list_empty(&p2m->pod.single) ) |
153 | 0 | { |
154 | 0 | unsigned long mfn; |
155 | 0 | struct page_info *q; |
156 | 0 |
|
157 | 0 | BUG_ON( page_list_empty(&p2m->pod.super) ); |
158 | 0 |
|
159 | 0 | /* |
160 | 0 | * Break up a superpage to make single pages. NB count doesn't |
161 | 0 | * need to be adjusted. |
162 | 0 | */ |
163 | 0 | p = page_list_remove_head(&p2m->pod.super); |
164 | 0 | mfn = mfn_x(page_to_mfn(p)); |
165 | 0 |
|
166 | 0 | for ( i = 0; i < SUPERPAGE_PAGES; i++ ) |
167 | 0 | { |
168 | 0 | q = mfn_to_page(_mfn(mfn+i)); |
169 | 0 | page_list_add_tail(q, &p2m->pod.single); |
170 | 0 | } |
171 | 0 | } |
172 | 0 |
|
173 | 0 | switch ( order ) |
174 | 0 | { |
175 | 0 | case PAGE_ORDER_2M: |
176 | 0 | BUG_ON( page_list_empty(&p2m->pod.super) ); |
177 | 0 | p = page_list_remove_head(&p2m->pod.super); |
178 | 0 | p2m->pod.count -= 1UL << order; |
179 | 0 | break; |
180 | 0 | case PAGE_ORDER_4K: |
181 | 0 | BUG_ON( page_list_empty(&p2m->pod.single) ); |
182 | 0 | p = page_list_remove_head(&p2m->pod.single); |
183 | 0 | p2m->pod.count -= 1UL; |
184 | 0 | break; |
185 | 0 | default: |
186 | 0 | BUG(); |
187 | 0 | } |
188 | 0 |
|
189 | 0 | /* Put the pages back on the domain page_list */ |
190 | 0 | lock_page_alloc(p2m); |
191 | 0 | for ( i = 0 ; i < (1UL << order); i++ ) |
192 | 0 | { |
193 | 0 | BUG_ON(page_get_owner(p + i) != p2m->domain); |
194 | 0 | page_list_add_tail(p + i, &p2m->domain->page_list); |
195 | 0 | } |
196 | 0 | unlock_page_alloc(p2m); |
197 | 0 |
|
198 | 0 | return p; |
199 | 0 | } |
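A toy model of the splitting rule in p2m_pod_cache_get(): when only 2M chunks are cached and a 4K page is requested, one chunk is first converted into 512 singletons, so pod.count (the total number of cached 4K pages) is unchanged by the split itself. The struct and numbers below are invented for the sketch.

#include <assert.h>
#include <stdio.h>

#define SUPERPAGE_PAGES 512UL

struct pod_cache {
    unsigned long supers;    /* number of cached 2M chunks */
    unsigned long singles;   /* number of cached 4K pages */
};

static unsigned long pod_count(const struct pod_cache *c)
{
    return c->supers * SUPERPAGE_PAGES + c->singles;
}

static void get_4k(struct pod_cache *c)
{
    if ( c->singles == 0 )
    {
        assert(c->supers > 0);
        c->supers--;                     /* break one superpage up... */
        c->singles += SUPERPAGE_PAGES;   /* ...into 512 singletons */
    }
    c->singles--;                        /* hand one page out */
}

int main(void)
{
    struct pod_cache c = { .supers = 2, .singles = 0 };
    unsigned long before = pod_count(&c);

    get_4k(&c);
    printf("before=%lu after=%lu supers=%lu singles=%lu\n",
           before, pod_count(&c), c.supers, c.singles);
    return 0;
}

The count only drops by one because of the page handed out; the split itself is count-neutral, which is why the real function only adjusts pod.count in the final switch.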
200 | | |
201 | | /* Set the size of the cache, allocating or freeing as necessary. */ |
202 | | static int |
203 | | p2m_pod_set_cache_target(struct p2m_domain *p2m, unsigned long pod_target, int preemptible) |
204 | 0 | { |
205 | 0 | struct domain *d = p2m->domain; |
206 | 0 | int ret = 0; |
207 | 0 |
|
208 | 0 | ASSERT(pod_locked_by_me(p2m)); |
209 | 0 |
|
210 | 0 | /* Increasing the target */ |
211 | 0 | while ( pod_target > p2m->pod.count ) |
212 | 0 | { |
213 | 0 | struct page_info * page; |
214 | 0 | int order; |
215 | 0 |
|
216 | 0 | if ( (pod_target - p2m->pod.count) >= SUPERPAGE_PAGES ) |
217 | 0 | order = PAGE_ORDER_2M; |
218 | 0 | else |
219 | 0 | order = PAGE_ORDER_4K; |
220 | 0 | retry: |
221 | 0 | page = alloc_domheap_pages(d, order, 0); |
222 | 0 | if ( unlikely(page == NULL) ) |
223 | 0 | { |
224 | 0 | if ( order == PAGE_ORDER_2M ) |
225 | 0 | { |
226 | 0 | /* If we can't allocate a superpage, try singleton pages */ |
227 | 0 | order = PAGE_ORDER_4K; |
228 | 0 | goto retry; |
229 | 0 | } |
230 | 0 |
|
231 | 0 | printk("%s: Unable to allocate page for PoD cache (target=%lu cache=%ld)\n", |
232 | 0 | __func__, pod_target, p2m->pod.count); |
233 | 0 | ret = -ENOMEM; |
234 | 0 | goto out; |
235 | 0 | } |
236 | 0 |
|
237 | 0 | p2m_pod_cache_add(p2m, page, order); |
238 | 0 |
|
239 | 0 | if ( preemptible && pod_target != p2m->pod.count && |
240 | 0 | hypercall_preempt_check() ) |
241 | 0 | { |
242 | 0 | ret = -ERESTART; |
243 | 0 | goto out; |
244 | 0 | } |
245 | 0 | } |
246 | 0 |
|
247 | 0 | /* Decreasing the target */ |
248 | 0 | /* |
249 | 0 | * We hold the pod lock here, so we don't need to worry about |
250 | 0 | * cache disappearing under our feet. |
251 | 0 | */ |
252 | 0 | while ( pod_target < p2m->pod.count ) |
253 | 0 | { |
254 | 0 | struct page_info * page; |
255 | 0 | unsigned int order; |
256 | 0 | unsigned long i; |
257 | 0 |
|
258 | 0 | if ( (p2m->pod.count - pod_target) > SUPERPAGE_PAGES |
259 | 0 | && !page_list_empty(&p2m->pod.super) ) |
260 | 0 | order = PAGE_ORDER_2M; |
261 | 0 | else |
262 | 0 | order = PAGE_ORDER_4K; |
263 | 0 |
|
264 | 0 | page = p2m_pod_cache_get(p2m, order); |
265 | 0 |
|
266 | 0 | ASSERT(page != NULL); |
267 | 0 |
|
268 | 0 | /* Then free them */ |
269 | 0 | for ( i = 0 ; i < (1UL << order) ; i++ ) |
270 | 0 | { |
271 | 0 | /* Copied from common/memory.c:guest_remove_page() */ |
272 | 0 | if ( unlikely(!get_page(page + i, d)) ) |
273 | 0 | { |
274 | 0 | gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id); |
275 | 0 | ret = -EINVAL; |
276 | 0 | goto out; |
277 | 0 | } |
278 | 0 |
|
279 | 0 | if ( test_and_clear_bit(_PGT_pinned, &(page+i)->u.inuse.type_info) ) |
280 | 0 | put_page_and_type(page + i); |
281 | 0 |
|
282 | 0 | if ( test_and_clear_bit(_PGC_allocated, &(page+i)->count_info) ) |
283 | 0 | put_page(page + i); |
284 | 0 |
|
285 | 0 | put_page(page + i); |
286 | 0 |
|
287 | 0 | if ( preemptible && pod_target != p2m->pod.count && |
288 | 0 | hypercall_preempt_check() ) |
289 | 0 | { |
290 | 0 | ret = -ERESTART; |
291 | 0 | goto out; |
292 | 0 | } |
293 | 0 | } |
294 | 0 | } |
295 | 0 |
|
296 | 0 | out: |
297 | 0 | return ret; |
298 | 0 | } |
299 | | |
300 | | /* |
301 | | * The "right behavior" here requires some careful thought. First, some |
302 | | * definitions: |
303 | | * + M: static_max |
304 | | * + B: number of pages the balloon driver has ballooned down to. |
305 | | * + P: Number of populated pages. |
306 | | * + T: Old target |
307 | | * + T': New target |
308 | | * |
309 | | * The following equations should hold: |
310 | | * 0 <= P <= T <= B <= M |
311 | | * d->arch.p2m->pod.entry_count == B - P |
312 | | * d->tot_pages == P + d->arch.p2m->pod.count |
313 | | * |
314 | | * Now we have the following potential cases to cover: |
315 | | * B < T': Set the PoD cache size equal to the number of outstanding PoD |
316 | | * entries. The balloon driver will deflate the balloon to give the |
317 | | * remainder of the RAM back to the guest OS. |
318 | | * T < T' < B: Increase the PoD cache size. |
319 | | * T' < T <= B: Here we have a choice. We could decrease the size of the |
320 | | * cache and get the memory back right away. However, that means every |
321 | | * time we reduce the memory target we risk the guest attempting to |
322 | | * populate the memory before the balloon driver has reached its new |
323 | | * target. It is safer never to reduce the cache size here, and to do so |
324 | | * only when the balloon driver actually frees PoD ranges. |
325 | | * |
326 | | * If there are many zero pages, we could also reach the target by doing |
327 | | * zero sweeps and marking the ranges PoD; but the balloon driver will have |
328 | | * to free this memory eventually anyway, so we don't actually gain that much |
329 | | * by doing so. |
330 | | * |
331 | | * NB that the (B < T') case may require adjustment to the cache |
332 | | * size as PoD pages are freed as well; i.e., freeing a PoD-backed |
333 | | * entry when pod.entry_count == pod.count requires us to reduce both |
334 | | * pod.entry_count and pod.count. |
335 | | */ |
336 | | int |
337 | | p2m_pod_set_mem_target(struct domain *d, unsigned long target) |
338 | 0 | { |
339 | 0 | struct p2m_domain *p2m = p2m_get_hostp2m(d); |
340 | 0 | int ret = 0; |
341 | 0 | unsigned long populated, pod_target; |
342 | 0 |
|
343 | 0 | pod_lock(p2m); |
344 | 0 |
|
345 | 0 | /* P == B: Nothing to do (unless the guest is being created). */ |
346 | 0 | populated = d->tot_pages - p2m->pod.count; |
347 | 0 | if ( populated > 0 && p2m->pod.entry_count == 0 ) |
348 | 0 | goto out; |
349 | 0 |
|
350 | 0 | /* Don't do anything if the domain is being torn down */ |
351 | 0 | if ( d->is_dying ) |
352 | 0 | goto out; |
353 | 0 |
|
354 | 0 | /* |
355 | 0 | * T' < B: Don't reduce the cache size; let the balloon driver |
356 | 0 | * take care of it. |
357 | 0 | */ |
358 | 0 | if ( target < d->tot_pages ) |
359 | 0 | goto out; |
360 | 0 |
|
361 | 0 | pod_target = target - populated; |
362 | 0 |
|
363 | 0 | /* |
364 | 0 | * B < T': Set the cache size equal to # of outstanding entries, |
365 | 0 | * let the balloon driver fill in the rest. |
366 | 0 | */ |
367 | 0 | if ( populated > 0 && pod_target > p2m->pod.entry_count ) |
368 | 0 | pod_target = p2m->pod.entry_count; |
369 | 0 |
|
370 | 0 | ASSERT( pod_target >= p2m->pod.count ); |
371 | 0 |
|
372 | 0 | ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/); |
373 | 0 |
|
374 | 0 | out: |
375 | 0 | pod_unlock(p2m); |
376 | 0 |
|
377 | 0 | return ret; |
378 | 0 | } |
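To make the bookkeeping above concrete, here is the same derivation with made-up numbers, using the quantities from the comment (P = populated pages, B = what the balloon driver has ballooned down to, T' = new target). This only re-does the arithmetic that p2m_pod_set_mem_target() performs; none of it is the real interface.

#include <stdio.h>

int main(void)
{
    unsigned long tot_pages   = 600;  /* P + pod.count, i.e. d->tot_pages */
    unsigned long pod_count   = 100;  /* pages currently in the PoD cache */
    unsigned long entry_count = 300;  /* outstanding PoD entries, B - P */
    unsigned long target      = 900;  /* new target T' */

    unsigned long populated  = tot_pages - pod_count;   /* P = 500 */
    unsigned long pod_target = target - populated;      /* 400 */

    /* B < T' (here B = P + entry_count = 800): cap the cache at the number
     * of outstanding entries and let the balloon driver hand back the rest. */
    if ( populated > 0 && pod_target > entry_count )
        pod_target = entry_count;                        /* 300 */

    printf("populated=%lu pod_target=%lu\n", populated, pod_target);
    return 0;
}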
379 | | |
380 | | int p2m_pod_empty_cache(struct domain *d) |
381 | 0 | { |
382 | 0 | struct p2m_domain *p2m = p2m_get_hostp2m(d); |
383 | 0 | struct page_info *page; |
384 | 0 | unsigned int i; |
385 | 0 |
|
386 | 0 | /* After this barrier no new PoD activities can happen. */ |
387 | 0 | BUG_ON(!d->is_dying); |
388 | 0 | spin_barrier(&p2m->pod.lock.lock); |
389 | 0 |
|
390 | 0 | lock_page_alloc(p2m); |
391 | 0 |
|
392 | 0 | while ( (page = page_list_remove_head(&p2m->pod.super)) ) |
393 | 0 | { |
394 | 0 | for ( i = 0 ; i < SUPERPAGE_PAGES ; i++ ) |
395 | 0 | { |
396 | 0 | BUG_ON(page_get_owner(page + i) != d); |
397 | 0 | page_list_add_tail(page + i, &d->page_list); |
398 | 0 | } |
399 | 0 |
|
400 | 0 | p2m->pod.count -= SUPERPAGE_PAGES; |
401 | 0 |
|
402 | 0 | if ( hypercall_preempt_check() ) |
403 | 0 | goto out; |
404 | 0 | } |
405 | 0 |
|
406 | 0 | for ( i = 0; (page = page_list_remove_head(&p2m->pod.single)); ++i ) |
407 | 0 | { |
408 | 0 | BUG_ON(page_get_owner(page) != d); |
409 | 0 | page_list_add_tail(page, &d->page_list); |
410 | 0 |
|
411 | 0 | p2m->pod.count -= 1; |
412 | 0 |
|
413 | 0 | if ( i && !(i & 511) && hypercall_preempt_check() ) |
414 | 0 | goto out; |
415 | 0 | } |
416 | 0 |
|
417 | 0 | BUG_ON(p2m->pod.count != 0); |
418 | 0 |
|
419 | 0 | out: |
420 | 0 | unlock_page_alloc(p2m); |
421 | 0 | return p2m->pod.count ? -ERESTART : 0; |
422 | 0 | } |
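The singleton loop in p2m_pod_empty_cache() only polls for preemption every 512 pages (the "i && !(i & 511)" test), keeping the check off the common path. A trivial stand-alone count of how often that condition fires:

#include <stdio.h>

int main(void)
{
    unsigned long checks = 0;

    for ( unsigned long i = 0; i < 2048; ++i )
        if ( i && !(i & 511) )
            checks++;                /* fires at i = 512, 1024, 1536 */

    printf("%lu preemption checks over 2048 iterations\n", checks);
    return 0;
}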
423 | | |
424 | | int |
425 | | p2m_pod_offline_or_broken_hit(struct page_info *p) |
426 | 0 | { |
427 | 0 | struct domain *d; |
428 | 0 | struct p2m_domain *p2m; |
429 | 0 | struct page_info *q, *tmp; |
430 | 0 | unsigned long mfn, bmfn; |
431 | 0 |
|
432 | 0 | if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) ) |
433 | 0 | return 0; |
434 | 0 |
|
435 | 0 | pod_lock(p2m); |
436 | 0 | bmfn = mfn_x(page_to_mfn(p)); |
437 | 0 | page_list_for_each_safe(q, tmp, &p2m->pod.super) |
438 | 0 | { |
439 | 0 | mfn = mfn_x(page_to_mfn(q)); |
440 | 0 | if ( (bmfn >= mfn) && ((bmfn - mfn) < SUPERPAGE_PAGES) ) |
441 | 0 | { |
442 | 0 | unsigned long i; |
443 | 0 | page_list_del(q, &p2m->pod.super); |
444 | 0 | for ( i = 0; i < SUPERPAGE_PAGES; i++) |
445 | 0 | { |
446 | 0 | q = mfn_to_page(_mfn(mfn + i)); |
447 | 0 | page_list_add_tail(q, &p2m->pod.single); |
448 | 0 | } |
449 | 0 | page_list_del(p, &p2m->pod.single); |
450 | 0 | p2m->pod.count--; |
451 | 0 | goto pod_hit; |
452 | 0 | } |
453 | 0 | } |
454 | 0 |
|
455 | 0 | page_list_for_each_safe(q, tmp, &p2m->pod.single) |
456 | 0 | { |
457 | 0 | mfn = mfn_x(page_to_mfn(q)); |
458 | 0 | if ( mfn == bmfn ) |
459 | 0 | { |
460 | 0 | page_list_del(p, &p2m->pod.single); |
461 | 0 | p2m->pod.count--; |
462 | 0 | goto pod_hit; |
463 | 0 | } |
464 | 0 | } |
465 | 0 |
|
466 | 0 | pod_unlock(p2m); |
467 | 0 | return 0; |
468 | 0 |
|
469 | 0 | pod_hit: |
470 | 0 | lock_page_alloc(p2m); |
471 | 0 | /* Insertion must be at list head (see iommu_populate_page_table()). */ |
472 | 0 | page_list_add(p, &d->arch.relmem_list); |
473 | 0 | unlock_page_alloc(p2m); |
474 | 0 | pod_unlock(p2m); |
475 | 0 | return 1; |
476 | 0 | } |
477 | | |
478 | | void |
479 | | p2m_pod_offline_or_broken_replace(struct page_info *p) |
480 | 0 | { |
481 | 0 | struct domain *d; |
482 | 0 | struct p2m_domain *p2m; |
483 | 0 | nodeid_t node = phys_to_nid(page_to_maddr(p)); |
484 | 0 |
|
485 | 0 | if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) ) |
486 | 0 | return; |
487 | 0 |
|
488 | 0 | free_domheap_page(p); |
489 | 0 |
|
490 | 0 | p = alloc_domheap_page(d, MEMF_node(node)); |
491 | 0 | if ( unlikely(!p) ) |
492 | 0 | return; |
493 | 0 |
|
494 | 0 | pod_lock(p2m); |
495 | 0 | p2m_pod_cache_add(p2m, p, PAGE_ORDER_4K); |
496 | 0 | pod_unlock(p2m); |
497 | 0 | return; |
498 | 0 | } |
499 | | |
500 | | static int |
501 | | p2m_pod_zero_check_superpage(struct p2m_domain *p2m, gfn_t gfn); |
502 | | |
503 | | |
504 | | /* |
505 | | * This function is needed for two reasons: |
506 | | * + To properly handle clearing of PoD entries |
507 | | * + To "steal back" memory being freed for the PoD cache, rather than |
508 | | * releasing it. |
509 | | * |
510 | | * Once both of these tasks have been completed, we can return and |
511 | | * allow decrease_reservation() to handle everything else. |
512 | | */ |
513 | | int |
514 | | p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order) |
515 | 0 | { |
516 | 0 | int ret = 0; |
517 | 0 | unsigned long i, n; |
518 | 0 | struct p2m_domain *p2m = p2m_get_hostp2m(d); |
519 | 0 | bool_t steal_for_cache; |
520 | 0 | long pod, nonpod, ram; |
521 | 0 |
|
522 | 0 | gfn_lock(p2m, gfn, order); |
523 | 0 | pod_lock(p2m); |
524 | 0 |
|
525 | 0 | /* |
526 | 0 | * If we don't have any outstanding PoD entries, let things take their |
527 | 0 | * course. |
528 | 0 | */ |
529 | 0 | if ( p2m->pod.entry_count == 0 ) |
530 | 0 | goto out_unlock; |
531 | 0 |
|
532 | 0 | if ( unlikely(d->is_dying) ) |
533 | 0 | goto out_unlock; |
534 | 0 |
|
535 | 0 | pod = nonpod = ram = 0; |
536 | 0 |
|
537 | 0 | /* Figure out if we need to steal some freed memory for our cache */ |
538 | 0 | steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count ); |
539 | 0 |
|
540 | 0 | for ( i = 0; i < (1UL << order); i += n ) |
541 | 0 | { |
542 | 0 | p2m_access_t a; |
543 | 0 | p2m_type_t t; |
544 | 0 | unsigned int cur_order; |
545 | 0 |
|
546 | 0 | p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, &cur_order, NULL); |
547 | 0 | n = 1UL << min(order, cur_order); |
548 | 0 | if ( t == p2m_populate_on_demand ) |
549 | 0 | pod += n; |
550 | 0 | else |
551 | 0 | { |
552 | 0 | nonpod += n; |
553 | 0 | if ( p2m_is_ram(t) ) |
554 | 0 | ram += n; |
555 | 0 | } |
556 | 0 | } |
557 | 0 |
|
558 | 0 | /* No populate-on-demand? Don't need to steal anything? Then we're done! */ |
559 | 0 | if ( !pod && !steal_for_cache ) |
560 | 0 | goto out_unlock; |
561 | 0 |
|
562 | 0 | if ( !nonpod ) |
563 | 0 | { |
564 | 0 | /* |
565 | 0 | * All PoD: Mark the whole region invalid and tell caller |
566 | 0 | * we're done. |
567 | 0 | */ |
568 | 0 | p2m_set_entry(p2m, gfn, INVALID_MFN, order, p2m_invalid, |
569 | 0 | p2m->default_access); |
570 | 0 | p2m->pod.entry_count -= 1UL << order; |
571 | 0 | BUG_ON(p2m->pod.entry_count < 0); |
572 | 0 | ret = 1; |
573 | 0 | goto out_entry_check; |
574 | 0 | } |
575 | 0 |
|
576 | 0 | /* |
577 | 0 | * Try to grab entire superpages if possible. Since the common case is for |
578 | 0 | * drivers to pass back singleton pages, see if we can take the whole page |
579 | 0 | * back and mark the rest PoD. |
580 | 0 | * No need to do this though if |
581 | 0 | * - order >= SUPERPAGE_ORDER (the loop below will take care of this) |
582 | 0 | * - not all of the pages were RAM (now knowing order < SUPERPAGE_ORDER) |
583 | 0 | */ |
584 | 0 | if ( steal_for_cache && order < SUPERPAGE_ORDER && ram == (1UL << order) && |
585 | 0 | p2m_pod_zero_check_superpage(p2m, _gfn(gfn_x(gfn) & ~(SUPERPAGE_PAGES - 1))) ) |
586 | 0 | { |
587 | 0 | pod = 1UL << order; |
588 | 0 | ram = nonpod = 0; |
589 | 0 | ASSERT(steal_for_cache == (p2m->pod.entry_count > p2m->pod.count)); |
590 | 0 | } |
591 | 0 |
|
592 | 0 | /* |
593 | 0 | * Process as long as: |
594 | 0 | * + There are PoD entries to handle, or |
595 | 0 | * + There is ram left, and we want to steal it |
596 | 0 | */ |
597 | 0 | for ( i = 0; |
598 | 0 | i < (1UL << order) && (pod > 0 || (steal_for_cache && ram > 0)); |
599 | 0 | i += n ) |
600 | 0 | { |
601 | 0 | mfn_t mfn; |
602 | 0 | p2m_type_t t; |
603 | 0 | p2m_access_t a; |
604 | 0 | unsigned int cur_order; |
605 | 0 |
|
606 | 0 | mfn = p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, &cur_order, NULL); |
607 | 0 | if ( order < cur_order ) |
608 | 0 | cur_order = order; |
609 | 0 | n = 1UL << cur_order; |
610 | 0 | if ( t == p2m_populate_on_demand ) |
611 | 0 | { |
612 | 0 | p2m_set_entry(p2m, gfn_add(gfn, i), INVALID_MFN, cur_order, |
613 | 0 | p2m_invalid, p2m->default_access); |
614 | 0 | p2m->pod.entry_count -= n; |
615 | 0 | BUG_ON(p2m->pod.entry_count < 0); |
616 | 0 | pod -= n; |
617 | 0 | } |
618 | 0 | else if ( steal_for_cache && p2m_is_ram(t) ) |
619 | 0 | { |
620 | 0 | /* |
621 | 0 | * If we need less than 1 << cur_order, we may end up stealing |
622 | 0 | * more memory here than we actually need. This will be rectified |
623 | 0 | * below, however; and stealing too much and then freeing what we |
624 | 0 | * need may allow us to free smaller pages from the cache, and |
625 | 0 | * avoid breaking up superpages. |
626 | 0 | */ |
627 | 0 | struct page_info *page; |
628 | 0 | unsigned long j; |
629 | 0 |
|
630 | 0 | ASSERT(mfn_valid(mfn)); |
631 | 0 |
|
632 | 0 | page = mfn_to_page(mfn); |
633 | 0 |
|
634 | 0 | p2m_set_entry(p2m, gfn_add(gfn, i), INVALID_MFN, cur_order, |
635 | 0 | p2m_invalid, p2m->default_access); |
636 | 0 | p2m_tlb_flush_sync(p2m); |
637 | 0 | for ( j = 0; j < n; ++j ) |
638 | 0 | set_gpfn_from_mfn(mfn_x(mfn), INVALID_M2P_ENTRY); |
639 | 0 | p2m_pod_cache_add(p2m, page, cur_order); |
640 | 0 |
|
641 | 0 | steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count ); |
642 | 0 |
|
643 | 0 | nonpod -= n; |
644 | 0 | ram -= n; |
645 | 0 | } |
646 | 0 | } |
647 | 0 |
|
648 | 0 | /* |
649 | 0 | * If there are no more non-PoD entries, tell decrease_reservation() that |
650 | 0 | * there's nothing left to do. |
651 | 0 | */ |
652 | 0 | if ( nonpod == 0 ) |
653 | 0 | ret = 1; |
654 | 0 |
|
655 | 0 | out_entry_check: |
656 | 0 | /* If we've reduced our "liabilities" beyond our "assets", free some */ |
657 | 0 | if ( p2m->pod.entry_count < p2m->pod.count ) |
658 | 0 | { |
659 | 0 | p2m_pod_set_cache_target(p2m, p2m->pod.entry_count, 0/*can't preempt*/); |
660 | 0 | } |
661 | 0 |
|
662 | 0 | out_unlock: |
663 | 0 | pod_unlock(p2m); |
664 | 0 | gfn_unlock(p2m, gfn, order); |
665 | 0 | return ret; |
666 | 0 | } |
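Several loops in this file, including the two in p2m_pod_decrease_reservation() above, walk a 2^order gfn range in steps of min(order, cur_order), where cur_order is whatever entry size the p2m reports at that point. The sketch below reproduces only that stepping pattern, with a faked cur_order lookup standing in for p2m->get_entry().

#include <stdio.h>

/* Pretend the p2m backs the first 2M of the range with a single 2M entry
 * and the rest with 4K entries. */
static unsigned int fake_cur_order(unsigned long offset)
{
    return offset < 512 ? 9 : 0;
}

int main(void)
{
    unsigned int order = 10;              /* walk 1024 pages */
    unsigned long i, n, chunks_2m = 0, chunks_4k = 0;

    for ( i = 0; i < (1UL << order); i += n )
    {
        unsigned int cur_order = fake_cur_order(i);

        if ( cur_order > order )
            cur_order = order;            /* never step past the request */
        n = 1UL << cur_order;
        if ( cur_order == 9 )
            chunks_2m++;
        else
            chunks_4k++;
    }

    printf("2M chunks: %lu, 4K chunks: %lu\n", chunks_2m, chunks_4k);
    return 0;
}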
667 | | |
668 | | void p2m_pod_dump_data(struct domain *d) |
669 | 0 | { |
670 | 0 | struct p2m_domain *p2m = p2m_get_hostp2m(d); |
671 | 0 |
|
672 | 0 | printk(" PoD entries=%ld cachesize=%ld\n", |
673 | 0 | p2m->pod.entry_count, p2m->pod.count); |
674 | 0 | } |
675 | | |
676 | | |
677 | | /* |
678 | | * Search for all-zero superpages to be reclaimed as superpages for the |
679 | | * PoD cache. Must be called with the pod lock held; the function itself |
680 | | * takes the p2m lock on the superpage range. |
681 | | */ |
682 | | static int |
683 | | p2m_pod_zero_check_superpage(struct p2m_domain *p2m, gfn_t gfn) |
684 | 0 | { |
685 | 0 | mfn_t mfn, mfn0 = INVALID_MFN; |
686 | 0 | p2m_type_t type, type0 = 0; |
687 | 0 | unsigned long * map = NULL; |
688 | 0 | int ret=0, reset = 0; |
689 | 0 | unsigned long i, n; |
690 | 0 | unsigned int j; |
691 | 0 | int max_ref = 1; |
692 | 0 | struct domain *d = p2m->domain; |
693 | 0 |
|
694 | 0 | ASSERT(pod_locked_by_me(p2m)); |
695 | 0 |
|
696 | 0 | if ( !superpage_aligned(gfn_x(gfn)) ) |
697 | 0 | goto out; |
698 | 0 |
|
699 | 0 | /* Allow an extra refcount for one shadow pt mapping in shadowed domains */ |
700 | 0 | if ( paging_mode_shadow(d) ) |
701 | 0 | max_ref++; |
702 | 0 |
|
703 | 0 | /* |
704 | 0 | * NOTE: this is why we don't enforce deadlock constraints between p2m |
705 | 0 | * and pod locks. |
706 | 0 | */ |
707 | 0 | gfn_lock(p2m, gfn, SUPERPAGE_ORDER); |
708 | 0 |
|
709 | 0 | /* |
710 | 0 | * Look up the mfns, checking to make sure they're the same mfn |
711 | 0 | * and aligned, and mapping them. |
712 | 0 | */ |
713 | 0 | for ( i = 0; i < SUPERPAGE_PAGES; i += n ) |
714 | 0 | { |
715 | 0 | p2m_access_t a; |
716 | 0 | unsigned int cur_order; |
717 | 0 | unsigned long k; |
718 | 0 | const struct page_info *page; |
719 | 0 |
|
720 | 0 | mfn = p2m->get_entry(p2m, gfn_add(gfn, i), &type, &a, 0, |
721 | 0 | &cur_order, NULL); |
722 | 0 |
|
723 | 0 | /* |
724 | 0 | * Conditions that must be met for superpage-superpage: |
725 | 0 | * + All gfns are ram types |
726 | 0 | * + All gfns have the same type |
727 | 0 | * + All of the mfns are allocated to a domain |
728 | 0 | * + None of the mfns are used as pagetables, or allocated via xenheap |
729 | 0 | * + The first mfn is 2-meg aligned |
730 | 0 | * + All the other mfns are in sequence |
731 | 0 | * Adding for good measure: |
732 | 0 | * + None of the mfns are likely to be mapped elsewhere (refcount |
733 | 0 | * 2 or less for shadow, 1 for hap) |
734 | 0 | */ |
735 | 0 | if ( !p2m_is_ram(type) ) |
736 | 0 | goto out; |
737 | 0 |
|
738 | 0 | if ( i == 0 ) |
739 | 0 | { |
740 | 0 | if ( !superpage_aligned(mfn_x(mfn)) ) |
741 | 0 | goto out; |
742 | 0 | mfn0 = mfn; |
743 | 0 | type0 = type; |
744 | 0 | } |
745 | 0 | else if ( type != type0 || !mfn_eq(mfn, mfn_add(mfn0, i)) ) |
746 | 0 | goto out; |
747 | 0 |
|
748 | 0 | n = 1UL << min(cur_order, SUPERPAGE_ORDER + 0U); |
749 | 0 | for ( k = 0, page = mfn_to_page(mfn); k < n; ++k, ++page ) |
750 | 0 | if ( !(page->count_info & PGC_allocated) || |
751 | 0 | (page->count_info & (PGC_page_table | PGC_xen_heap)) || |
752 | 0 | (page->count_info & PGC_count_mask) > max_ref ) |
753 | 0 | goto out; |
754 | 0 | } |
755 | 0 |
|
756 | 0 | /* Now, do a quick check to see if it may be zero before unmapping. */ |
757 | 0 | for ( i = 0; i < SUPERPAGE_PAGES; i++ ) |
758 | 0 | { |
759 | 0 | /* Quick zero-check */ |
760 | 0 | map = map_domain_page(mfn_add(mfn0, i)); |
761 | 0 |
|
762 | 0 | for ( j = 0; j < 16; j++ ) |
763 | 0 | if ( *(map + j) != 0 ) |
764 | 0 | break; |
765 | 0 |
|
766 | 0 | unmap_domain_page(map); |
767 | 0 |
|
768 | 0 | if ( j < 16 ) |
769 | 0 | goto out; |
770 | 0 |
|
771 | 0 | } |
772 | 0 |
|
773 | 0 | /* Try to remove the page, restoring old mapping if it fails. */ |
774 | 0 | p2m_set_entry(p2m, gfn, INVALID_MFN, PAGE_ORDER_2M, |
775 | 0 | p2m_populate_on_demand, p2m->default_access); |
776 | 0 | p2m_tlb_flush_sync(p2m); |
777 | 0 |
|
778 | 0 | /* |
779 | 0 | * Make sure none of the MFNs are used elsewhere... for example, mapped |
780 | 0 | * via the grant table interface, or by qemu. Allow one refcount for |
781 | 0 | * being allocated to the domain. |
782 | 0 | */ |
783 | 0 | for ( i = 0; i < SUPERPAGE_PAGES; i++ ) |
784 | 0 | { |
785 | 0 | mfn = mfn_add(mfn0, i); |
786 | 0 | if ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > 1 ) |
787 | 0 | { |
788 | 0 | reset = 1; |
789 | 0 | goto out_reset; |
790 | 0 | } |
791 | 0 | } |
792 | 0 |
|
793 | 0 | /* Finally, do a full zero-check */ |
794 | 0 | for ( i = 0; i < SUPERPAGE_PAGES; i++ ) |
795 | 0 | { |
796 | 0 | map = map_domain_page(mfn_add(mfn0, i)); |
797 | 0 |
|
798 | 0 | for ( j = 0; j < (PAGE_SIZE / sizeof(*map)); j++ ) |
799 | 0 | if ( *(map+j) != 0 ) |
800 | 0 | { |
801 | 0 | reset = 1; |
802 | 0 | break; |
803 | 0 | } |
804 | 0 |
|
805 | 0 | unmap_domain_page(map); |
806 | 0 |
|
807 | 0 | if ( reset ) |
808 | 0 | goto out_reset; |
809 | 0 | } |
810 | 0 |
|
811 | 0 | if ( tb_init_done ) |
812 | 0 | { |
813 | 0 | struct { |
814 | 0 | u64 gfn, mfn; |
815 | 0 | int d:16,order:16; |
816 | 0 | } t; |
817 | 0 |
|
818 | 0 | t.gfn = gfn_x(gfn); |
819 | 0 | t.mfn = mfn_x(mfn); |
820 | 0 | t.d = d->domain_id; |
821 | 0 | t.order = 9; |
822 | 0 |
|
823 | 0 | __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t); |
824 | 0 | } |
825 | 0 |
|
826 | 0 | /* |
827 | 0 | * Finally! We've passed all the checks, and can add the mfn superpage |
828 | 0 | * back on the PoD cache, and account for the new p2m PoD entries. |
829 | 0 | */ |
830 | 0 | p2m_pod_cache_add(p2m, mfn_to_page(mfn0), PAGE_ORDER_2M); |
831 | 0 | p2m->pod.entry_count += SUPERPAGE_PAGES; |
832 | 0 |
|
833 | 0 | ret = SUPERPAGE_PAGES; |
834 | 0 |
|
835 | 0 | out_reset: |
836 | 0 | if ( reset ) |
837 | 0 | p2m_set_entry(p2m, gfn, mfn0, 9, type0, p2m->default_access); |
838 | 0 |
|
839 | 0 | out: |
840 | 0 | gfn_unlock(p2m, gfn, SUPERPAGE_ORDER); |
841 | 0 | return ret; |
842 | 0 | } |
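The zero check above is two-phase: a cheap probe of the first 16 machine words rejects most non-zero pages before the page is taken out of the p2m and fully scanned. The same idea on a plain buffer standing in for a mapped page, with no Xen interfaces involved:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

static bool quick_probe(const unsigned long *map)
{
    for ( unsigned int j = 0; j < 16; j++ )
        if ( map[j] != 0 )
            return false;
    return true;
}

static bool full_scan(const unsigned long *map)
{
    for ( unsigned int j = 0; j < PAGE_SIZE / sizeof(*map); j++ )
        if ( map[j] != 0 )
            return false;
    return true;
}

int main(void)
{
    unsigned long page[PAGE_SIZE / sizeof(unsigned long)];

    memset(page, 0, sizeof(page));
    printf("zero page: probe=%d scan=%d\n", quick_probe(page), full_scan(page));

    page[200] = 1;   /* dirty a word beyond the probe window */
    printf("dirty page: probe=%d scan=%d\n", quick_probe(page), full_scan(page));
    return 0;
}

The dirty page still passes the quick probe, which is why the full scan (done after the page has been made inaccessible to the guest) is the authoritative check.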
843 | | |
844 | | static void |
845 | | p2m_pod_zero_check(struct p2m_domain *p2m, const gfn_t *gfns, int count) |
846 | 0 | { |
847 | 0 | mfn_t mfns[count]; |
848 | 0 | p2m_type_t types[count]; |
849 | 0 | unsigned long *map[count]; |
850 | 0 | struct domain *d = p2m->domain; |
851 | 0 |
|
852 | 0 | int i, j; |
853 | 0 | int max_ref = 1; |
854 | 0 |
|
855 | 0 | /* Allow an extra refcount for one shadow pt mapping in shadowed domains */ |
856 | 0 | if ( paging_mode_shadow(d) ) |
857 | 0 | max_ref++; |
858 | 0 |
|
859 | 0 | /* First, get the gfn list, translate to mfns, and map the pages. */ |
860 | 0 | for ( i = 0; i < count; i++ ) |
861 | 0 | { |
862 | 0 | p2m_access_t a; |
863 | 0 | struct page_info *pg; |
864 | 0 |
|
865 | 0 | mfns[i] = p2m->get_entry(p2m, gfns[i], types + i, &a, |
866 | 0 | 0, NULL, NULL); |
867 | 0 | pg = mfn_to_page(mfns[i]); |
868 | 0 |
|
869 | 0 | /* |
870 | 0 | * If this is ram, and not a pagetable or from the xen heap, and |
871 | 0 | * probably not mapped elsewhere, map it; otherwise, skip. |
872 | 0 | */ |
873 | 0 | if ( p2m_is_ram(types[i]) && (pg->count_info & PGC_allocated) && |
874 | 0 | !(pg->count_info & (PGC_page_table | PGC_xen_heap)) && |
875 | 0 | ((pg->count_info & PGC_count_mask) <= max_ref) ) |
876 | 0 | map[i] = map_domain_page(mfns[i]); |
877 | 0 | else |
878 | 0 | map[i] = NULL; |
879 | 0 | } |
880 | 0 |
|
881 | 0 | /* |
882 | 0 | * Then, go through and check for zeroed pages, removing write permission |
883 | 0 | * for those with zeroes. |
884 | 0 | */ |
885 | 0 | for ( i = 0; i < count; i++ ) |
886 | 0 | { |
887 | 0 | if ( !map[i] ) |
888 | 0 | continue; |
889 | 0 |
|
890 | 0 | /* Quick zero-check */ |
891 | 0 | for ( j = 0; j < 16; j++ ) |
892 | 0 | if ( *(map[i] + j) != 0 ) |
893 | 0 | break; |
894 | 0 |
|
895 | 0 | if ( j < 16 ) |
896 | 0 | { |
897 | 0 | unmap_domain_page(map[i]); |
898 | 0 | map[i] = NULL; |
899 | 0 | continue; |
900 | 0 | } |
901 | 0 |
|
902 | 0 | /* Try to remove the page, restoring old mapping if it fails. */ |
903 | 0 | p2m_set_entry(p2m, gfns[i], INVALID_MFN, PAGE_ORDER_4K, |
904 | 0 | p2m_populate_on_demand, p2m->default_access); |
905 | 0 |
|
906 | 0 | /* |
907 | 0 | * See if the page was successfully unmapped. (Allow one refcount |
908 | 0 | * for being allocated to a domain.) |
909 | 0 | */ |
910 | 0 | if ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) > 1 ) |
911 | 0 | { |
912 | 0 | unmap_domain_page(map[i]); |
913 | 0 | map[i] = NULL; |
914 | 0 |
|
915 | 0 | p2m_set_entry(p2m, gfns[i], mfns[i], PAGE_ORDER_4K, |
916 | 0 | types[i], p2m->default_access); |
917 | 0 |
|
918 | 0 | continue; |
919 | 0 | } |
920 | 0 | } |
921 | 0 |
|
922 | 0 | p2m_tlb_flush_sync(p2m); |
923 | 0 |
|
924 | 0 | /* Now check each page for real */ |
925 | 0 | for ( i = 0; i < count; i++ ) |
926 | 0 | { |
927 | 0 | if ( !map[i] ) |
928 | 0 | continue; |
929 | 0 |
|
930 | 0 | for ( j = 0; j < (PAGE_SIZE / sizeof(*map[i])); j++ ) |
931 | 0 | if ( *(map[i] + j) != 0 ) |
932 | 0 | break; |
933 | 0 |
|
934 | 0 | unmap_domain_page(map[i]); |
935 | 0 |
|
936 | 0 | /* |
937 | 0 | * See comment in p2m_pod_zero_check_superpage() re gnttab |
938 | 0 | * check timing. |
939 | 0 | */ |
940 | 0 | if ( j < (PAGE_SIZE / sizeof(*map[i])) ) |
941 | 0 | { |
942 | 0 | p2m_set_entry(p2m, gfns[i], mfns[i], PAGE_ORDER_4K, |
943 | 0 | types[i], p2m->default_access); |
944 | 0 | } |
945 | 0 | else |
946 | 0 | { |
947 | 0 | if ( tb_init_done ) |
948 | 0 | { |
949 | 0 | struct { |
950 | 0 | u64 gfn, mfn; |
951 | 0 | int d:16,order:16; |
952 | 0 | } t; |
953 | 0 |
|
954 | 0 | t.gfn = gfn_x(gfns[i]); |
955 | 0 | t.mfn = mfn_x(mfns[i]); |
956 | 0 | t.d = d->domain_id; |
957 | 0 | t.order = 0; |
958 | 0 |
|
959 | 0 | __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t); |
960 | 0 | } |
961 | 0 |
|
962 | 0 | /* Add to cache, and account for the new p2m PoD entry */ |
963 | 0 | p2m_pod_cache_add(p2m, mfn_to_page(mfns[i]), PAGE_ORDER_4K); |
964 | 0 | p2m->pod.entry_count++; |
965 | 0 | } |
966 | 0 | } |
967 | 0 |
|
968 | 0 | } |
969 | | |
970 | 0 | #define POD_SWEEP_LIMIT 1024 |
971 | 0 | #define POD_SWEEP_STRIDE 16 |
972 | | static void |
973 | | p2m_pod_emergency_sweep(struct p2m_domain *p2m) |
974 | 0 | { |
975 | 0 | gfn_t gfns[POD_SWEEP_STRIDE]; |
976 | 0 | unsigned long i, j = 0, start, limit; |
977 | 0 | p2m_type_t t; |
978 | 0 |
|
979 | 0 |
|
980 | 0 | if ( gfn_eq(p2m->pod.reclaim_single, _gfn(0)) ) |
981 | 0 | p2m->pod.reclaim_single = p2m->pod.max_guest; |
982 | 0 |
|
983 | 0 | start = gfn_x(p2m->pod.reclaim_single); |
984 | 0 | limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0; |
985 | 0 |
|
986 | 0 | /* FIXME: Figure out how to avoid superpages */ |
987 | 0 | /* |
988 | 0 | * NOTE: Promote to globally locking the p2m. This will get complicated |
989 | 0 | * in a fine-grained scenario. If we lock each gfn individually we must be |
990 | 0 | * careful about spinlock recursion limits and POD_SWEEP_STRIDE. |
991 | 0 | */ |
992 | 0 | p2m_lock(p2m); |
993 | 0 | for ( i = gfn_x(p2m->pod.reclaim_single); i > 0 ; i-- ) |
994 | 0 | { |
995 | 0 | p2m_access_t a; |
996 | 0 | (void)p2m->get_entry(p2m, _gfn(i), &t, &a, 0, NULL, NULL); |
997 | 0 | if ( p2m_is_ram(t) ) |
998 | 0 | { |
999 | 0 | gfns[j] = _gfn(i); |
1000 | 0 | j++; |
1001 | 0 | BUG_ON(j > POD_SWEEP_STRIDE); |
1002 | 0 | if ( j == POD_SWEEP_STRIDE ) |
1003 | 0 | { |
1004 | 0 | p2m_pod_zero_check(p2m, gfns, j); |
1005 | 0 | j = 0; |
1006 | 0 | } |
1007 | 0 | } |
1008 | 0 | /* |
1009 | 0 | * Stop if we're past our limit and we have found *something*. |
1010 | 0 | * |
1011 | 0 | * NB that this is a zero-sum game; we're increasing our cache size |
1012 | 0 | * by re-increasing our 'debt'. Since we hold the pod lock, |
1013 | 0 | * (entry_count - count) must remain the same. |
1014 | 0 | */ |
1015 | 0 | if ( i < limit && (p2m->pod.count > 0 || hypercall_preempt_check()) ) |
1016 | 0 | break; |
1017 | 0 | } |
1018 | 0 |
|
1019 | 0 | if ( j ) |
1020 | 0 | p2m_pod_zero_check(p2m, gfns, j); |
1021 | 0 |
|
1022 | 0 | p2m_unlock(p2m); |
1023 | 0 | p2m->pod.reclaim_single = _gfn(i ? i - 1 : i); |
1024 | 0 |
|
1025 | 0 | } |
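The emergency sweep scans downward from pod.reclaim_single and, once it has found something, stops after at most POD_SWEEP_LIMIT gfns, remembering where it left off for the next invocation. The windowing arithmetic on its own, with a hypothetical resume point:

#include <stdio.h>

#define POD_SWEEP_LIMIT 1024

int main(void)
{
    unsigned long reclaim_single = 0x500;   /* hypothetical resume point */
    unsigned long max_guest = 0x40000;      /* highest PoD gfn seen so far */

    if ( reclaim_single == 0 )
        reclaim_single = max_guest;         /* wrap back to the top */

    unsigned long start = reclaim_single;
    unsigned long limit = start > POD_SWEEP_LIMIT ? start - POD_SWEEP_LIMIT : 0;

    printf("sweep gfns %#lx down toward %#lx\n", start, limit);
    return 0;
}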
1026 | | |
1027 | | static void pod_eager_reclaim(struct p2m_domain *p2m) |
1028 | 0 | { |
1029 | 0 | struct pod_mrp_list *mrp = &p2m->pod.mrp; |
1030 | 0 | unsigned int i = 0; |
1031 | 0 |
|
1032 | 0 | /* |
1033 | 0 | * Always check one page for reclamation. |
1034 | 0 | * |
1035 | 0 | * If the PoD pool is empty, keep checking until some space is found, or |
1036 | 0 | * all entries have been exhausted. |
1037 | 0 | */ |
1038 | 0 | do |
1039 | 0 | { |
1040 | 0 | unsigned int idx = (mrp->idx + i++) % ARRAY_SIZE(mrp->list); |
1041 | 0 | gfn_t gfn = _gfn(mrp->list[idx]); |
1042 | 0 |
|
1043 | 0 | if ( !gfn_eq(gfn, INVALID_GFN) ) |
1044 | 0 | { |
1045 | 0 | if ( gfn_x(gfn) & POD_LAST_SUPERPAGE ) |
1046 | 0 | { |
1047 | 0 | gfn = _gfn(gfn_x(gfn) & ~POD_LAST_SUPERPAGE); |
1048 | 0 |
|
1049 | 0 | if ( p2m_pod_zero_check_superpage(p2m, gfn) == 0 ) |
1050 | 0 | { |
1051 | 0 | unsigned int x; |
1052 | 0 |
|
1053 | 0 | for ( x = 0; x < SUPERPAGE_PAGES; ++x, gfn = gfn_add(gfn, 1) ) |
1054 | 0 | p2m_pod_zero_check(p2m, &gfn, 1); |
1055 | 0 | } |
1056 | 0 | } |
1057 | 0 | else |
1058 | 0 | p2m_pod_zero_check(p2m, &gfn, 1); |
1059 | 0 |
|
1060 | 0 | mrp->list[idx] = gfn_x(INVALID_GFN); |
1061 | 0 | } |
1062 | 0 |
|
1063 | 0 | } while ( (p2m->pod.count == 0) && (i < ARRAY_SIZE(mrp->list)) ); |
1064 | 0 | } |
1065 | | |
1066 | | static void pod_eager_record(struct p2m_domain *p2m, gfn_t gfn, |
1067 | | unsigned int order) |
1068 | 0 | { |
1069 | 0 | struct pod_mrp_list *mrp = &p2m->pod.mrp; |
1070 | 0 |
|
1071 | 0 | ASSERT(!gfn_eq(gfn, INVALID_GFN)); |
1072 | 0 |
|
1073 | 0 | mrp->list[mrp->idx++] = |
1074 | 0 | gfn_x(gfn) | (order == PAGE_ORDER_2M ? POD_LAST_SUPERPAGE : 0); |
1075 | 0 | mrp->idx %= ARRAY_SIZE(mrp->list); |
1076 | 0 | } |
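pod_eager_reclaim() and pod_eager_record() share a small ring of the most recently populated gfns, with POD_LAST_SUPERPAGE tagging entries that were populated as 2M pages. The sketch below models that ring with an invented size and tag value; it is not the real structure definition from the p2m headers.

#include <stdio.h>

#define MRP_SIZE        8             /* invented; the real list size differs */
#define INVALID_GFN     (~0UL)
#define LAST_SUPERPAGE  (1UL << 63)   /* tag bit, assumed unused in a gfn */

struct mrp {
    unsigned long list[MRP_SIZE];
    unsigned int idx;
};

static void record(struct mrp *m, unsigned long gfn, int superpage)
{
    m->list[m->idx++] = gfn | (superpage ? LAST_SUPERPAGE : 0);
    m->idx %= MRP_SIZE;
}

int main(void)
{
    struct mrp m = { .idx = 0 };

    for ( unsigned int i = 0; i < MRP_SIZE; i++ )
        m.list[i] = INVALID_GFN;

    record(&m, 0x1000, 0);
    record(&m, 0x2000, 1);

    for ( unsigned int i = 0; i < MRP_SIZE; i++ )
        if ( m.list[i] != INVALID_GFN )
            printf("slot %u: gfn %#lx%s\n", i, m.list[i] & ~LAST_SUPERPAGE,
                   (m.list[i] & LAST_SUPERPAGE) ? " (2M)" : "");
    return 0;
}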
1077 | | |
1078 | | bool |
1079 | | p2m_pod_demand_populate(struct p2m_domain *p2m, gfn_t gfn, |
1080 | | unsigned int order) |
1081 | 0 | { |
1082 | 0 | struct domain *d = p2m->domain; |
1083 | 0 | struct page_info *p = NULL; /* Compiler warnings */ |
1084 | 0 | gfn_t gfn_aligned = _gfn((gfn_x(gfn) >> order) << order); |
1085 | 0 | mfn_t mfn; |
1086 | 0 | unsigned long i; |
1087 | 0 |
|
1088 | 0 | ASSERT(gfn_locked_by_me(p2m, gfn)); |
1089 | 0 | pod_lock(p2m); |
1090 | 0 |
|
1091 | 0 | /* |
1092 | 0 | * This check is done with the pod lock held. This will make sure that |
1093 | 0 | * even if d->is_dying changes under our feet, p2m_pod_empty_cache() |
1094 | 0 | * won't start until we're done. |
1095 | 0 | */ |
1096 | 0 | if ( unlikely(d->is_dying) ) |
1097 | 0 | goto out_fail; |
1098 | 0 |
|
1099 | 0 |
|
1100 | 0 | /* |
1101 | 0 | * Because PoD does not have cache list for 1GB pages, it has to remap |
1102 | 0 | * 1GB region to 2MB chunks for a retry. |
1103 | 0 | */ |
1104 | 0 | if ( order == PAGE_ORDER_1G ) |
1105 | 0 | { |
1106 | 0 | pod_unlock(p2m); |
1107 | 0 | /* |
1108 | 0 | * Note that we are supposed to call p2m_set_entry() 512 times to |
1109 | 0 | * split 1GB into 512 2MB pages here. But we only do it once, because |
1110 | 0 | * p2m_set_entry() should automatically shatter the 1GB page into |
1111 | 0 | * 512 2MB pages; the remaining 511 calls are unnecessary. |
1112 | 0 | * |
1113 | 0 | * NOTE: In a fine-grained p2m locking scenario this operation |
1114 | 0 | * may need to promote its locking from gfn->1g superpage |
1115 | 0 | */ |
1116 | 0 | p2m_set_entry(p2m, gfn_aligned, INVALID_MFN, PAGE_ORDER_2M, |
1117 | 0 | p2m_populate_on_demand, p2m->default_access); |
1118 | 0 | return true; |
1119 | 0 | } |
1120 | 0 |
|
1121 | 0 | /* Only reclaim if we're in actual need of more cache. */ |
1122 | 0 | if ( p2m->pod.entry_count > p2m->pod.count ) |
1123 | 0 | pod_eager_reclaim(p2m); |
1124 | 0 |
|
1125 | 0 | /* |
1126 | 0 | * Only sweep if we're actually out of memory. Doing anything else |
1127 | 0 | * wastes time and needlessly fragments superpages in the p2m. |
1128 | 0 | */ |
1129 | 0 | if ( p2m->pod.count == 0 ) |
1130 | 0 | p2m_pod_emergency_sweep(p2m); |
1131 | 0 |
|
1132 | 0 | /* If the sweep failed, give up. */ |
1133 | 0 | if ( p2m->pod.count == 0 ) |
1134 | 0 | goto out_of_memory; |
1135 | 0 |
|
1136 | 0 | /* Keep track of the highest gfn demand-populated by a guest fault */ |
1137 | 0 | p2m->pod.max_guest = gfn_max(gfn, p2m->pod.max_guest); |
1138 | 0 |
|
1139 | 0 | /* |
1140 | 0 | * Get a page from the cache. A NULL return value indicates that the |
1141 | 0 | * 2-meg range should be marked singleton PoD, and retried. |
1142 | 0 | */ |
1143 | 0 | if ( (p = p2m_pod_cache_get(p2m, order)) == NULL ) |
1144 | 0 | goto remap_and_retry; |
1145 | 0 |
|
1146 | 0 | mfn = page_to_mfn(p); |
1147 | 0 |
|
1148 | 0 | BUG_ON((mfn_x(mfn) & ((1UL << order) - 1)) != 0); |
1149 | 0 |
|
1150 | 0 | p2m_set_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw, |
1151 | 0 | p2m->default_access); |
1152 | 0 |
|
1153 | 0 | for( i = 0; i < (1UL << order); i++ ) |
1154 | 0 | { |
1155 | 0 | set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_x(gfn_aligned) + i); |
1156 | 0 | paging_mark_dirty(d, mfn_add(mfn, i)); |
1157 | 0 | } |
1158 | 0 |
|
1159 | 0 | p2m->pod.entry_count -= (1UL << order); |
1160 | 0 | BUG_ON(p2m->pod.entry_count < 0); |
1161 | 0 |
|
1162 | 0 | pod_eager_record(p2m, gfn_aligned, order); |
1163 | 0 |
|
1164 | 0 | if ( tb_init_done ) |
1165 | 0 | { |
1166 | 0 | struct { |
1167 | 0 | u64 gfn, mfn; |
1168 | 0 | int d:16,order:16; |
1169 | 0 | } t; |
1170 | 0 |
|
1171 | 0 | t.gfn = gfn_x(gfn); |
1172 | 0 | t.mfn = mfn_x(mfn); |
1173 | 0 | t.d = d->domain_id; |
1174 | 0 | t.order = order; |
1175 | 0 |
|
1176 | 0 | __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t); |
1177 | 0 | } |
1178 | 0 |
|
1179 | 0 | pod_unlock(p2m); |
1180 | 0 | return true; |
1181 | 0 | out_of_memory: |
1182 | 0 | pod_unlock(p2m); |
1183 | 0 |
|
1184 | 0 | printk("%s: Dom%d out of PoD memory! (tot=%"PRIu32" ents=%ld dom%d)\n", |
1185 | 0 | __func__, d->domain_id, d->tot_pages, p2m->pod.entry_count, |
1186 | 0 | current->domain->domain_id); |
1187 | 0 | domain_crash(d); |
1188 | 0 | return false; |
1189 | 0 | out_fail: |
1190 | 0 | pod_unlock(p2m); |
1191 | 0 | return false; |
1192 | 0 | remap_and_retry: |
1193 | 0 | BUG_ON(order != PAGE_ORDER_2M); |
1194 | 0 | pod_unlock(p2m); |
1195 | 0 |
|
1196 | 0 | /* Remap this 2-meg region in singleton chunks */ |
1197 | 0 | /* |
1198 | 0 | * NOTE: In a p2m fine-grained lock scenario this might |
1199 | 0 | * need promoting the gfn lock from gfn->2M superpage. |
1200 | 0 | */ |
1201 | 0 | for ( i = 0; i < (1UL << order); i++ ) |
1202 | 0 | p2m_set_entry(p2m, gfn_add(gfn_aligned, i), INVALID_MFN, PAGE_ORDER_4K, |
1203 | 0 | p2m_populate_on_demand, p2m->default_access); |
1204 | 0 | if ( tb_init_done ) |
1205 | 0 | { |
1206 | 0 | struct { |
1207 | 0 | u64 gfn; |
1208 | 0 | int d:16; |
1209 | 0 | } t; |
1210 | 0 |
|
1211 | 0 | t.gfn = gfn_x(gfn); |
1212 | 0 | t.d = d->domain_id; |
1213 | 0 |
|
1214 | 0 | __trace_var(TRC_MEM_POD_SUPERPAGE_SPLINTER, 0, sizeof(t), &t); |
1215 | 0 | } |
1216 | 0 |
|
1217 | 0 | return true; |
1218 | 0 | } |
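A condensed model of the fallback ladder in p2m_pod_demand_populate(): eager reclaim only when outstanding entries exceed the cache, an emergency sweep only when the cache is empty, and failure only if that still found nothing. The counters are plain numbers and the reclaim/sweep stand-ins just pretend to recover pages; this is a sketch of the control flow, not the function itself.

#include <stdio.h>

struct pod { long entry_count, count; };

static void eager_reclaim(struct pod *p)   { p->count += 1; }  /* stand-in */
static void emergency_sweep(struct pod *p) { (void)p; }        /* may find nothing */

static const char *demand_populate(struct pod *p)
{
    if ( p->entry_count > p->count )
        eager_reclaim(p);
    if ( p->count == 0 )
        emergency_sweep(p);
    if ( p->count == 0 )
        return "out of PoD memory";
    p->count--;          /* hand a cached page to the guest */
    p->entry_count--;    /* one less outstanding PoD entry */
    return "populated";
}

int main(void)
{
    struct pod p = { .entry_count = 4, .count = 0 };

    printf("%s (entries=%ld cache=%ld)\n", demand_populate(&p),
           p.entry_count, p.count);
    return 0;
}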
1219 | | |
1220 | | |
1221 | | int |
1222 | | guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn_l, |
1223 | | unsigned int order) |
1224 | 0 | { |
1225 | 0 | struct p2m_domain *p2m = p2m_get_hostp2m(d); |
1226 | 0 | gfn_t gfn = _gfn(gfn_l); |
1227 | 0 | unsigned long i, n, pod_count = 0; |
1228 | 0 | int rc = 0; |
1229 | 0 |
|
1230 | 0 | if ( !paging_mode_translate(d) ) |
1231 | 0 | return -EINVAL; |
1232 | 0 |
|
1233 | 0 | gfn_lock(p2m, gfn, order); |
1234 | 0 |
|
1235 | 0 | P2M_DEBUG("mark pod gfn=%#lx\n", gfn_l); |
1236 | 0 |
|
1237 | 0 | /* Make sure all gpfns are unused */ |
1238 | 0 | for ( i = 0; i < (1UL << order); i += n ) |
1239 | 0 | { |
1240 | 0 | p2m_type_t ot; |
1241 | 0 | p2m_access_t a; |
1242 | 0 | unsigned int cur_order; |
1243 | 0 |
|
1244 | 0 | p2m->get_entry(p2m, gfn_add(gfn, i), &ot, &a, 0, &cur_order, NULL); |
1245 | 0 | n = 1UL << min(order, cur_order); |
1246 | 0 | if ( p2m_is_ram(ot) ) |
1247 | 0 | { |
1248 | 0 | P2M_DEBUG("gfn_to_mfn returned type %d!\n", ot); |
1249 | 0 | rc = -EBUSY; |
1250 | 0 | goto out; |
1251 | 0 | } |
1252 | 0 | else if ( ot == p2m_populate_on_demand ) |
1253 | 0 | { |
1254 | 0 | /* Count how many PoD entries we'll be replacing if successful */ |
1255 | 0 | pod_count += n; |
1256 | 0 | } |
1257 | 0 | } |
1258 | 0 |
|
1259 | 0 | /* Now, actually do the two-way mapping */ |
1260 | 0 | rc = p2m_set_entry(p2m, gfn, INVALID_MFN, order, |
1261 | 0 | p2m_populate_on_demand, p2m->default_access); |
1262 | 0 | if ( rc == 0 ) |
1263 | 0 | { |
1264 | 0 | pod_lock(p2m); |
1265 | 0 | p2m->pod.entry_count += 1UL << order; |
1266 | 0 | p2m->pod.entry_count -= pod_count; |
1267 | 0 | BUG_ON(p2m->pod.entry_count < 0); |
1268 | 0 | pod_unlock(p2m); |
1269 | 0 | } |
1270 | 0 |
|
1271 | 0 | out: |
1272 | 0 | gfn_unlock(p2m, gfn, order); |
1273 | 0 |
|
1274 | 0 | return rc; |
1275 | 0 | } |
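The accounting at the end of guest_physmap_mark_populate_on_demand() adds 2^order new PoD entries and subtracts those in the range that were already PoD, so re-marking an existing PoD range is a no-op for the totals. The arithmetic on its own, with invented numbers:

#include <stdio.h>

int main(void)
{
    long entry_count = 1000;           /* existing outstanding PoD entries */
    unsigned int order = 9;            /* marking a 2M range */
    unsigned long already_pod = 128;   /* entries in the range already PoD */

    entry_count += 1UL << order;
    entry_count -= already_pod;

    printf("entry_count=%ld\n", entry_count);   /* 1000 + 512 - 128 = 1384 */
    return 0;
}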
1276 | | |