
view xen/common/page_alloc.c @ 21959:581ebaa7e2da

numa: Attempt more efficient NUMA allocation in hypervisor by default.

1. Try to allocate from nodes containing CPUs which a guest can be
scheduled on.
2. Remember which node we allocated from last, and round-robin
allocations among above-mentioned nodes.

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Aug 04 15:35:28 2010 +0100 (2010-08-04)
parents 9d965ac1b0db
children 9c5f084135b8
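
The policy in this changeset combines the domain's node affinity (derived elsewhere from the CPUs its vCPUs may run on) with a per-domain last_alloc_node cursor, so successive allocations rotate through the affine nodes. As a rough illustration only (not code from this file), the following minimal, self-contained C sketch mimics that round-robin node choice; nodemask_t, next_node(), first_node() here are simplified stand-ins for Xen's nodemask API, and pick_node() is a hypothetical helper:

/* Sketch of the round-robin NUMA node choice described in the commit message. */
#include <stdio.h>

#define MAX_NUMNODES 8

typedef unsigned int nodemask_t;        /* bit i set => node i is allowed */

static int next_node(int n, nodemask_t mask)
{
    for ( n = n + 1; n < MAX_NUMNODES; n++ )
        if ( mask & (1u << n) )
            return n;
    return MAX_NUMNODES;                /* no further node in the mask */
}

static int first_node(nodemask_t mask)
{
    return next_node(-1, mask);
}

/* Start after the node used for the previous allocation, wrapping around. */
static int pick_node(nodemask_t affinity, int last_alloc_node)
{
    int node = next_node(last_alloc_node, affinity);

    if ( node >= MAX_NUMNODES )
        node = first_node(affinity);
    return node;                        /* MAX_NUMNODES if the mask is empty */
}

int main(void)
{
    nodemask_t affinity = 0x5;          /* guest may run on nodes 0 and 2 */
    int last = 0, i;

    for ( i = 0; i < 4; i++ )
    {
        last = pick_node(affinity, last);
        printf("allocation %d -> node %d\n", i, last);  /* 2, 0, 2, 0 */
    }
    return 0;
}

alloc_heap_pages() below additionally falls back to cpu_to_node(smp_processor_id()) when no affinity is known, and only widens the search to the remaining online nodes once the affine nodes are exhausted.
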
line source
1 /******************************************************************************
2 * page_alloc.c
3 *
4 * Simple buddy heap allocator for Xen.
5 *
6 * Copyright (c) 2002-2004 K A Fraser
7 * Copyright (c) 2006 IBM Ryan Harper <ryanh@us.ibm.com>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/init.h>
26 #include <xen/types.h>
27 #include <xen/lib.h>
28 #include <xen/sched.h>
29 #include <xen/spinlock.h>
30 #include <xen/mm.h>
31 #include <xen/irq.h>
32 #include <xen/softirq.h>
33 #include <xen/domain_page.h>
34 #include <xen/keyhandler.h>
35 #include <xen/perfc.h>
36 #include <xen/numa.h>
37 #include <xen/nodemask.h>
38 #include <xen/tmem.h>
39 #include <public/sysctl.h>
40 #include <asm/page.h>
41 #include <asm/numa.h>
42 #include <asm/flushtlb.h>
44 /*
45 * Comma-separated list of hexadecimal page numbers containing bad bytes.
46 * e.g. 'badpage=0x3f45,0x8a321'.
47 */
48 static char __initdata opt_badpage[100] = "";
49 string_param("badpage", opt_badpage);
51 /*
52 * no-bootscrub -> Free pages are not zeroed during boot.
53 */
54 static int opt_bootscrub __initdata = 1;
55 boolean_param("bootscrub", opt_bootscrub);
57 /*
58 * Bit width of the DMA heap -- used to override NUMA-node-first
59 * allocation strategy, which can otherwise exhaust low memory.
60 */
61 static unsigned int dma_bitsize;
62 integer_param("dma_bits", dma_bitsize);
64 #define round_pgdown(_p) ((_p)&PAGE_MASK)
65 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
67 /* Offlined page list, protected by heap_lock. */
68 PAGE_LIST_HEAD(page_offlined_list);
69 /* Broken page list, protected by heap_lock. */
70 PAGE_LIST_HEAD(page_broken_list);
72 /*************************
73 * BOOT-TIME ALLOCATOR
74 */
76 static unsigned long __initdata first_valid_mfn = ~0UL;
78 static struct bootmem_region {
79 unsigned long s, e; /* MFNs @s through @e-1 inclusive are free */
80 } *__initdata bootmem_region_list;
81 static unsigned int __initdata nr_bootmem_regions;
83 static void __init boot_bug(int line)
84 {
85 panic("Boot BUG at %s:%d\n", __FILE__, line);
86 }
87 #define BOOT_BUG_ON(p) if ( p ) boot_bug(__LINE__);
89 static void __init bootmem_region_add(unsigned long s, unsigned long e)
90 {
91 unsigned int i;
93 if ( (bootmem_region_list == NULL) && (s < e) )
94 bootmem_region_list = mfn_to_virt(s++);
96 if ( s >= e )
97 return;
99 for ( i = 0; i < nr_bootmem_regions; i++ )
100 if ( s < bootmem_region_list[i].e )
101 break;
103 BOOT_BUG_ON((i < nr_bootmem_regions) && (e > bootmem_region_list[i].s));
104 BOOT_BUG_ON(nr_bootmem_regions ==
105 (PAGE_SIZE / sizeof(struct bootmem_region)));
107 memmove(&bootmem_region_list[i+1], &bootmem_region_list[i],
108 (nr_bootmem_regions - i) * sizeof(*bootmem_region_list));
109 bootmem_region_list[i] = (struct bootmem_region) { s, e };
110 nr_bootmem_regions++;
111 }
113 static void __init bootmem_region_zap(unsigned long s, unsigned long e)
114 {
115 unsigned int i;
117 for ( i = 0; i < nr_bootmem_regions; i++ )
118 {
119 struct bootmem_region *r = &bootmem_region_list[i];
120 if ( e <= r->s )
121 break;
122 if ( s >= r->e )
123 continue;
124 if ( s <= r->s )
125 {
126 r->s = min(e, r->e);
127 }
128 else if ( e >= r->e )
129 {
130 r->e = s;
131 }
132 else
133 {
134 unsigned long _e = r->e;
135 r->e = s;
136 bootmem_region_add(e, _e);
137 }
138 }
139 }
141 void __init init_boot_pages(paddr_t ps, paddr_t pe)
142 {
143 unsigned long bad_spfn, bad_epfn;
144 const char *p;
146 ps = round_pgup(ps);
147 pe = round_pgdown(pe);
148 if ( pe <= ps )
149 return;
151 first_valid_mfn = min_t(unsigned long, ps >> PAGE_SHIFT, first_valid_mfn);
153 bootmem_region_add(ps >> PAGE_SHIFT, pe >> PAGE_SHIFT);
155 /* Check new pages against the bad-page list. */
156 p = opt_badpage;
157 while ( *p != '\0' )
158 {
159 bad_spfn = simple_strtoul(p, &p, 0);
160 bad_epfn = bad_spfn;
162 if ( *p == '-' )
163 {
164 p++;
165 bad_epfn = simple_strtoul(p, &p, 0);
166 if ( bad_epfn < bad_spfn )
167 bad_epfn = bad_spfn;
168 }
170 if ( *p == ',' )
171 p++;
172 else if ( *p != '\0' )
173 break;
175 if ( bad_epfn == bad_spfn )
176 printk("Marking page %lx as bad\n", bad_spfn);
177 else
178 printk("Marking pages %lx through %lx as bad\n",
179 bad_spfn, bad_epfn);
181 bootmem_region_zap(bad_spfn, bad_epfn+1);
182 }
183 }
185 unsigned long __init alloc_boot_pages(
186 unsigned long nr_pfns, unsigned long pfn_align)
187 {
188 unsigned long pg, _e;
189 int i;
191 for ( i = nr_bootmem_regions - 1; i >= 0; i-- )
192 {
193 struct bootmem_region *r = &bootmem_region_list[i];
194 pg = (r->e - nr_pfns) & ~(pfn_align - 1);
195 if ( pg < r->s )
196 continue;
197 _e = r->e;
198 r->e = pg;
199 bootmem_region_add(pg + nr_pfns, _e);
200 return pg;
201 }
203 BOOT_BUG_ON(1);
204 return 0;
205 }
209 /*************************
210 * BINARY BUDDY ALLOCATOR
211 */
213 #define MEMZONE_XEN 0
214 #define NR_ZONES (PADDR_BITS - PAGE_SHIFT)
216 #define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 0 : ((b) - PAGE_SHIFT - 1))
217 #define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN : \
218 (fls(page_to_mfn(pg)) - 1))
220 typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
221 static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
222 #define heap(node, zone, order) ((*_heap[node])[zone][order])
224 static unsigned long *avail[MAX_NUMNODES];
225 static long total_avail_pages;
227 /* TMEM: Reserve a fraction of memory for mid-size (0<order<9) allocations.*/
228 static long midsize_alloc_zone_pages;
229 #define MIDSIZE_ALLOC_FRAC 128
231 static DEFINE_SPINLOCK(heap_lock);
233 static unsigned long init_node_heap(int node, unsigned long mfn,
234 unsigned long nr, bool_t *use_tail)
235 {
236 /* First node to be discovered has its heap metadata statically alloced. */
237 static heap_by_zone_and_order_t _heap_static;
238 static unsigned long avail_static[NR_ZONES];
239 static int first_node_initialised;
240 unsigned long needed = (sizeof(**_heap) +
241 sizeof(**avail) * NR_ZONES +
242 PAGE_SIZE - 1) >> PAGE_SHIFT;
243 int i, j;
245 if ( !first_node_initialised )
246 {
247 _heap[node] = &_heap_static;
248 avail[node] = avail_static;
249 first_node_initialised = 1;
250 needed = 0;
251 }
252 #ifdef DIRECTMAP_VIRT_END
253 else if ( *use_tail && nr >= needed &&
254 (mfn + nr) <= (virt_to_mfn(DIRECTMAP_VIRT_END - 1) + 1) )
255 {
256 _heap[node] = mfn_to_virt(mfn + nr - needed);
257 avail[node] = mfn_to_virt(mfn + nr - 1) +
258 PAGE_SIZE - sizeof(**avail) * NR_ZONES;
259 }
260 else if ( nr >= needed &&
261 (mfn + needed) <= (virt_to_mfn(DIRECTMAP_VIRT_END - 1) + 1) )
262 {
263 _heap[node] = mfn_to_virt(mfn);
264 avail[node] = mfn_to_virt(mfn + needed - 1) +
265 PAGE_SIZE - sizeof(**avail) * NR_ZONES;
266 *use_tail = 0;
267 }
268 #endif
269 else if ( get_order_from_bytes(sizeof(**_heap)) ==
270 get_order_from_pages(needed) )
271 {
272 _heap[node] = alloc_xenheap_pages(get_order_from_pages(needed), 0);
273 BUG_ON(!_heap[node]);
274 avail[node] = (void *)_heap[node] + (needed << PAGE_SHIFT) -
275 sizeof(**avail) * NR_ZONES;
276 needed = 0;
277 }
278 else
279 {
280 _heap[node] = xmalloc(heap_by_zone_and_order_t);
281 avail[node] = xmalloc_array(unsigned long, NR_ZONES);
282 BUG_ON(!_heap[node] || !avail[node]);
283 needed = 0;
284 }
286 memset(avail[node], 0, NR_ZONES * sizeof(long));
288 for ( i = 0; i < NR_ZONES; i++ )
289 for ( j = 0; j <= MAX_ORDER; j++ )
290 INIT_PAGE_LIST_HEAD(&(*_heap[node])[i][j]);
292 return needed;
293 }
295 /* Allocate 2^@order contiguous pages. */
296 static struct page_info *alloc_heap_pages(
297 unsigned int zone_lo, unsigned int zone_hi,
298 unsigned int order, unsigned int memflags,
299 struct domain *d)
300 {
301 unsigned int first_node, i, j, zone = 0, nodemask_retry = 0;
302 unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1);
303 unsigned long request = 1UL << order;
304 cpumask_t extra_cpus_mask, mask;
305 struct page_info *pg;
306 nodemask_t nodemask = (d != NULL ) ? d->node_affinity : node_online_map;
308 if ( node == NUMA_NO_NODE )
309 {
310 memflags &= ~MEMF_exact_node;
311 if ( d != NULL )
312 {
313 node = next_node(d->last_alloc_node, nodemask);
314 if ( node >= MAX_NUMNODES )
315 node = first_node(nodemask);
316 }
317 if ( node >= MAX_NUMNODES )
318 node = cpu_to_node(smp_processor_id());
319 }
320 first_node = node;
322 ASSERT(node >= 0);
323 ASSERT(zone_lo <= zone_hi);
324 ASSERT(zone_hi < NR_ZONES);
326 if ( unlikely(order > MAX_ORDER) )
327 return NULL;
329 spin_lock(&heap_lock);
331 /*
332 * TMEM: When available memory is scarce due to tmem absorbing it, allow
333 * only mid-size allocations to avoid worst of fragmentation issues.
334 * Others try tmem pools then fail. This is a workaround until all
335 * post-dom0-creation-multi-page allocations can be eliminated.
336 */
337 if ( opt_tmem && ((order == 0) || (order >= 9)) &&
338 (total_avail_pages <= midsize_alloc_zone_pages) &&
339 tmem_freeable_pages() )
340 goto try_tmem;
342 /*
343 * Start with the requested node, but exhaust all node memory in the
344 * requested zone before failing. Only compute a new node value if we fail
345 * to find memory in the target node; this avoids needless work on the fast path.
346 */
347 for ( ; ; )
348 {
349 zone = zone_hi;
350 do {
351 /* Check if target node can support the allocation. */
352 if ( !avail[node] || (avail[node][zone] < request) )
353 continue;
355 /* Find smallest order which can satisfy the request. */
356 for ( j = order; j <= MAX_ORDER; j++ )
357 if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
358 goto found;
359 } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
361 if ( memflags & MEMF_exact_node )
362 goto not_found;
364 /* Pick next node. */
365 if ( !node_isset(node, nodemask) )
366 {
367 /* Very first node may be caller-specified and outside nodemask. */
368 ASSERT(!nodemask_retry);
369 first_node = node = first_node(nodemask);
370 if ( node < MAX_NUMNODES )
371 continue;
372 }
373 else if ( (node = next_node(node, nodemask)) >= MAX_NUMNODES )
374 node = first_node(nodemask);
375 if ( node == first_node )
376 {
377 /* When we have tried all in nodemask, we fall back to others. */
378 if ( nodemask_retry++ )
379 goto not_found;
380 nodes_andnot(nodemask, node_online_map, nodemask);
381 first_node = node = first_node(nodemask);
382 if ( node >= MAX_NUMNODES )
383 goto not_found;
384 }
385 }
387 try_tmem:
388 /* Try to free memory from tmem */
389 if ( (pg = tmem_relinquish_pages(order, memflags)) != NULL )
390 {
391 /* reassigning an already allocated anonymous heap page */
392 spin_unlock(&heap_lock);
393 return pg;
394 }
396 not_found:
397 /* No suitable memory blocks. Fail the request. */
398 spin_unlock(&heap_lock);
399 return NULL;
401 found:
402 /* We may have to halve the chunk a number of times. */
403 while ( j != order )
404 {
405 PFN_ORDER(pg) = --j;
406 page_list_add_tail(pg, &heap(node, zone, j));
407 pg += 1 << j;
408 }
410 ASSERT(avail[node][zone] >= request);
411 avail[node][zone] -= request;
412 total_avail_pages -= request;
413 ASSERT(total_avail_pages >= 0);
415 if ( d != NULL )
416 d->last_alloc_node = node;
418 spin_unlock(&heap_lock);
420 cpus_clear(mask);
422 for ( i = 0; i < (1 << order); i++ )
423 {
424 /* Reference count must continuously be zero for free pages. */
425 BUG_ON(pg[i].count_info != PGC_state_free);
426 pg[i].count_info = PGC_state_inuse;
428 if ( pg[i].u.free.need_tlbflush )
429 {
430 /* Add in extra CPUs that need flushing because of this page. */
431 cpus_andnot(extra_cpus_mask, cpu_online_map, mask);
432 tlbflush_filter(extra_cpus_mask, pg[i].tlbflush_timestamp);
433 cpus_or(mask, mask, extra_cpus_mask);
434 }
436 /* Initialise fields which have other uses for free pages. */
437 pg[i].u.inuse.type_info = 0;
438 page_set_owner(&pg[i], NULL);
439 }
441 if ( unlikely(!cpus_empty(mask)) )
442 {
443 perfc_incr(need_flush_tlb_flush);
444 flush_tlb_mask(&mask);
445 }
447 return pg;
448 }
450 /* Remove any offlined page in the buddy pointed to by head. */
451 static int reserve_offlined_page(struct page_info *head)
452 {
453 unsigned int node = phys_to_nid(page_to_maddr(head));
454 int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
455 struct page_info *cur_head;
456 int cur_order;
458 ASSERT(spin_is_locked(&heap_lock));
460 cur_head = head;
462 page_list_del(head, &heap(node, zone, head_order));
464 while ( cur_head < (head + (1 << head_order)) )
465 {
466 struct page_info *pg;
467 int next_order;
469 if ( page_state_is(cur_head, offlined) )
470 {
471 cur_head++;
472 continue;
473 }
475 next_order = cur_order = 0;
477 while ( cur_order < head_order )
478 {
479 next_order = cur_order + 1;
481 if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)) )
482 goto merge;
484 for ( i = (1 << cur_order), pg = cur_head + (1 << cur_order );
485 i < (1 << next_order);
486 i++, pg++ )
487 if ( page_state_is(pg, offlined) )
488 break;
489 if ( i == ( 1 << next_order) )
490 {
491 cur_order = next_order;
492 continue;
493 }
494 else
495 {
496 merge:
497 /* We don't consider merging outside the head_order. */
498 page_list_add_tail(cur_head, &heap(node, zone, cur_order));
499 PFN_ORDER(cur_head) = cur_order;
500 cur_head += (1 << cur_order);
501 break;
502 }
503 }
504 }
506 for ( cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++ )
507 {
508 if ( !page_state_is(cur_head, offlined) )
509 continue;
511 avail[node][zone]--;
512 total_avail_pages--;
513 ASSERT(total_avail_pages >= 0);
515 page_list_add_tail(cur_head,
516 test_bit(_PGC_broken, &cur_head->count_info) ?
517 &page_broken_list : &page_offlined_list);
519 count++;
520 }
522 return count;
523 }
525 /* Free 2^@order set of pages. */
526 static void free_heap_pages(
527 struct page_info *pg, unsigned int order)
528 {
529 unsigned long mask;
530 unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
531 unsigned int zone = page_to_zone(pg);
533 ASSERT(order <= MAX_ORDER);
534 ASSERT(node >= 0);
536 for ( i = 0; i < (1 << order); i++ )
537 {
538 /*
539 * Cannot assume that count_info == 0, as there are some corner cases
540 * where it isn't the case and yet it isn't a bug:
541 * 1. page_get_owner() is NULL
542 * 2. page_get_owner() is a domain that was never accessible by
543 * its domid (e.g., failed to fully construct the domain).
544 * 3. page was never addressable by the guest (e.g., it's an
545 * auto-translate-physmap guest and the page was never included
546 * in its pseudophysical address space).
547 * In all the above cases there can be no guest mappings of this page.
548 */
549 ASSERT(!page_state_is(&pg[i], offlined));
550 pg[i].count_info =
551 ((pg[i].count_info & PGC_broken) |
552 (page_state_is(&pg[i], offlining)
553 ? PGC_state_offlined : PGC_state_free));
554 if ( page_state_is(&pg[i], offlined) )
555 tainted = 1;
557 /* If a page has no owner it will need no safety TLB flush. */
558 pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
559 if ( pg[i].u.free.need_tlbflush )
560 pg[i].tlbflush_timestamp = tlbflush_current_time();
561 }
563 spin_lock(&heap_lock);
565 avail[node][zone] += 1 << order;
566 total_avail_pages += 1 << order;
568 if ( opt_tmem )
569 midsize_alloc_zone_pages = max(
570 midsize_alloc_zone_pages, total_avail_pages / MIDSIZE_ALLOC_FRAC);
572 /* Merge chunks as far as possible. */
573 while ( order < MAX_ORDER )
574 {
575 mask = 1UL << order;
577 if ( (page_to_mfn(pg) & mask) )
578 {
579 /* Merge with predecessor block? */
580 if ( !mfn_valid(page_to_mfn(pg-mask)) ||
581 !page_state_is(pg-mask, free) ||
582 (PFN_ORDER(pg-mask) != order) )
583 break;
584 pg -= mask;
585 page_list_del(pg, &heap(node, zone, order));
586 }
587 else
588 {
589 /* Merge with successor block? */
590 if ( !mfn_valid(page_to_mfn(pg+mask)) ||
591 !page_state_is(pg+mask, free) ||
592 (PFN_ORDER(pg+mask) != order) )
593 break;
594 page_list_del(pg + mask, &heap(node, zone, order));
595 }
597 order++;
599 /* After merging, pg should remain in the same node. */
600 ASSERT(phys_to_nid(page_to_maddr(pg)) == node);
601 }
603 PFN_ORDER(pg) = order;
604 page_list_add_tail(pg, &heap(node, zone, order));
606 if ( tainted )
607 reserve_offlined_page(pg);
609 spin_unlock(&heap_lock);
610 }
613 /*
614 * The possible states for a page are:
615 * free and online; free and offlined; free and offlined and broken;
616 * assigned and online; assigned and offlining; assigned and offlining and broken
617 *
618 * The following rules apply to page offlining:
619 * Once a page is broken, it can never be assigned again.
620 * A page will be offlined only if it is free.
621 * Returns the original count_info.
622 */
623 static unsigned long mark_page_offline(struct page_info *pg, int broken)
624 {
625 unsigned long nx, x, y = pg->count_info;
627 ASSERT(page_is_ram_type(page_to_mfn(pg), RAM_TYPE_CONVENTIONAL));
628 ASSERT(spin_is_locked(&heap_lock));
630 do {
631 nx = x = y;
633 if ( ((x & PGC_state) != PGC_state_offlined) &&
634 ((x & PGC_state) != PGC_state_offlining) )
635 {
636 nx &= ~PGC_state;
637 nx |= (((x & PGC_state) == PGC_state_free)
638 ? PGC_state_offlined : PGC_state_offlining);
639 }
641 if ( broken )
642 nx |= PGC_broken;
644 if ( x == nx )
645 break;
646 } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
648 return y;
649 }
651 static int reserve_heap_page(struct page_info *pg)
652 {
653 struct page_info *head = NULL;
654 unsigned int i, node = phys_to_nid(page_to_maddr(pg));
655 unsigned int zone = page_to_zone(pg);
657 for ( i = 0; i <= MAX_ORDER; i++ )
658 {
659 struct page_info *tmp;
661 if ( page_list_empty(&heap(node, zone, i)) )
662 continue;
664 page_list_for_each_safe ( head, tmp, &heap(node, zone, i) )
665 {
666 if ( (head <= pg) &&
667 (head + (1UL << i) > pg) )
668 return reserve_offlined_page(head);
669 }
670 }
672 return -EINVAL;
674 }
676 int offline_page(unsigned long mfn, int broken, uint32_t *status)
677 {
678 unsigned long old_info = 0;
679 struct domain *owner;
680 int ret = 0;
681 struct page_info *pg;
683 if ( !mfn_valid(mfn) )
684 {
685 dprintk(XENLOG_WARNING,
686 "try to offline page out of range %lx\n", mfn);
687 return -EINVAL;
688 }
690 *status = 0;
691 pg = mfn_to_page(mfn);
693 if ( is_xen_fixed_mfn(mfn) )
694 {
695 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED |
696 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
697 return -EPERM;
698 }
700 /*
701 * N.B. xen's txt in x86_64 is marked reserved and handled already.
702 * Also kexec range is reserved.
703 */
704 if ( !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
705 {
706 *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM;
707 return -EINVAL;
708 }
710 spin_lock(&heap_lock);
712 old_info = mark_page_offline(pg, broken);
714 if ( page_state_is(pg, free) )
715 {
716 /* Free pages are reserved directly. */
717 reserve_heap_page(pg);
718 *status = PG_OFFLINE_OFFLINED;
719 }
720 else if ( page_state_is(pg, offlined) )
721 {
722 *status = PG_OFFLINE_OFFLINED;
723 }
724 else if ( (owner = page_get_owner_and_reference(pg)) )
725 {
726 *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING |
727 (owner->domain_id << PG_OFFLINE_OWNER_SHIFT);
728 /* Release the reference since it will not be allocated anymore */
729 put_page(pg);
730 }
731 else if ( old_info & PGC_xen_heap )
732 {
733 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING |
734 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
735 }
736 else
737 {
738 /*
739 * assign_pages() does not hold heap_lock, so there is a small window in
740 * which the owner may be set later. Note that the owner can only change
741 * from NULL to non-NULL, not vice versa, since the page is offlining now.
742 * There is no window if called from the #MC handler, since all CPUs are in
743 * softirq context. If called from user space (e.g. CE handling), tools can
744 * wait some time before calling again.
745 */
746 *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED |
747 (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT );
748 }
750 if ( broken )
751 *status |= PG_OFFLINE_BROKEN;
753 spin_unlock(&heap_lock);
755 return ret;
756 }
758 /*
759 * Online the memory.
760 * The caller should make sure end_pfn <= max_page,
761 * if not, expand_pages() should be called prior to online_page().
762 */
763 unsigned int online_page(unsigned long mfn, uint32_t *status)
764 {
765 unsigned long x, nx, y;
766 struct page_info *pg;
767 int ret;
769 if ( !mfn_valid(mfn) )
770 {
771 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
772 return -EINVAL;
773 }
775 pg = mfn_to_page(mfn);
777 spin_lock(&heap_lock);
779 y = pg->count_info;
780 do {
781 ret = *status = 0;
783 if ( y & PGC_broken )
784 {
785 ret = -EINVAL;
786 *status = PG_ONLINE_FAILED |PG_ONLINE_BROKEN;
787 break;
788 }
790 if ( (y & PGC_state) == PGC_state_offlined )
791 {
792 page_list_del(pg, &page_offlined_list);
793 *status = PG_ONLINE_ONLINED;
794 }
795 else if ( (y & PGC_state) == PGC_state_offlining )
796 {
797 *status = PG_ONLINE_ONLINED;
798 }
799 else
800 {
801 break;
802 }
804 x = y;
805 nx = (x & ~PGC_state) | PGC_state_inuse;
806 } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
808 spin_unlock(&heap_lock);
810 if ( (y & PGC_state) == PGC_state_offlined )
811 free_heap_pages(pg, 0);
813 return ret;
814 }
816 int query_page_offline(unsigned long mfn, uint32_t *status)
817 {
818 struct page_info *pg;
820 if ( !mfn_valid(mfn) || !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
821 {
822 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
823 return -EINVAL;
824 }
826 *status = 0;
827 spin_lock(&heap_lock);
829 pg = mfn_to_page(mfn);
831 if ( page_state_is(pg, offlining) )
832 *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING;
833 if ( pg->count_info & PGC_broken )
834 *status |= PG_OFFLINE_STATUS_BROKEN;
835 if ( page_state_is(pg, offlined) )
836 *status |= PG_OFFLINE_STATUS_OFFLINED;
838 spin_unlock(&heap_lock);
840 return 0;
841 }
843 /*
844 * Hand the specified arbitrary page range to the specified heap zone
845 * checking the node_id of the previous page. If they differ and the
846 * latter is not on a MAX_ORDER boundary, then we reserve the page by
847 * not freeing it to the buddy allocator.
848 */
849 static void init_heap_pages(
850 struct page_info *pg, unsigned long nr_pages)
851 {
852 unsigned int nid_curr, nid_prev;
853 unsigned long i;
855 nid_prev = phys_to_nid(page_to_maddr(pg-1));
857 for ( i = 0; i < nr_pages; nid_prev = nid_curr, i++ )
858 {
859 nid_curr = phys_to_nid(page_to_maddr(pg+i));
861 if ( unlikely(!avail[nid_curr]) )
862 {
863 unsigned long s = page_to_mfn(pg + i);
864 unsigned long e = page_to_mfn(pg + nr_pages - 1) + 1;
865 bool_t use_tail = (nid_curr == phys_to_nid(pfn_to_paddr(e - 1))) &&
866 !(s & ((1UL << MAX_ORDER) - 1)) &&
867 (find_first_set_bit(e) <= find_first_set_bit(s));
868 unsigned long n;
870 n = init_node_heap(nid_curr, page_to_mfn(pg+i), nr_pages - i,
871 &use_tail);
872 BUG_ON(i + n > nr_pages);
873 if ( n && !use_tail )
874 {
875 i += n - 1;
876 continue;
877 }
878 if ( i + n == nr_pages )
879 break;
880 nr_pages -= n;
881 }
883 /*
884 * Free pages of the same node, or if they differ, but are on a
885 * MAX_ORDER alignment boundary (which already get reserved).
886 */
887 if ( (nid_curr == nid_prev) ||
888 !(page_to_mfn(pg+i) & ((1UL << MAX_ORDER) - 1)) )
889 free_heap_pages(pg+i, 0);
890 else
891 printk("Reserving non-aligned node boundary @ mfn %#lx\n",
892 page_to_mfn(pg+i));
893 }
894 }
896 static unsigned long avail_heap_pages(
897 unsigned int zone_lo, unsigned int zone_hi, unsigned int node)
898 {
899 unsigned int i, zone;
900 unsigned long free_pages = 0;
902 if ( zone_hi >= NR_ZONES )
903 zone_hi = NR_ZONES - 1;
905 for_each_online_node(i)
906 {
907 if ( !avail[i] )
908 continue;
909 for ( zone = zone_lo; zone <= zone_hi; zone++ )
910 if ( (node == -1) || (node == i) )
911 free_pages += avail[i][zone];
912 }
914 return free_pages;
915 }
917 unsigned long total_free_pages(void)
918 {
919 return total_avail_pages - midsize_alloc_zone_pages;
920 }
922 void __init end_boot_allocator(void)
923 {
924 unsigned int i;
926 /* Pages that are free now go to the domain sub-allocator. */
927 for ( i = 0; i < nr_bootmem_regions; i++ )
928 {
929 struct bootmem_region *r = &bootmem_region_list[i];
930 if ( (r->s < r->e) &&
931 (phys_to_nid(pfn_to_paddr(r->s)) == cpu_to_node(0)) )
932 {
933 init_heap_pages(mfn_to_page(r->s), r->e - r->s);
934 r->e = r->s;
935 break;
936 }
937 }
938 for ( i = nr_bootmem_regions; i-- > 0; )
939 {
940 struct bootmem_region *r = &bootmem_region_list[i];
941 if ( r->s < r->e )
942 init_heap_pages(mfn_to_page(r->s), r->e - r->s);
943 }
944 init_heap_pages(virt_to_page(bootmem_region_list), 1);
946 if ( !dma_bitsize && (num_online_nodes() > 1) )
947 {
948 #ifdef CONFIG_X86
949 dma_bitsize = min_t(unsigned int,
950 fls(NODE_DATA(0)->node_spanned_pages) - 1
951 + PAGE_SHIFT - 2,
952 32);
953 #else
954 dma_bitsize = 32;
955 #endif
956 }
958 printk("Domain heap initialised");
959 if ( dma_bitsize )
960 printk(" DMA width %u bits", dma_bitsize);
961 printk("\n");
962 }
964 /*
965 * Scrub all unallocated pages in all heap zones. This function is more
966 * convoluted than appears necessary because we do not want to continuously
967 * hold the lock while scrubbing very large memory areas.
968 */
969 void __init scrub_heap_pages(void)
970 {
971 unsigned long mfn;
972 struct page_info *pg;
974 if ( !opt_bootscrub )
975 return;
977 printk("Scrubbing Free RAM: ");
979 for ( mfn = first_valid_mfn; mfn < max_page; mfn++ )
980 {
981 process_pending_softirqs();
983 pg = mfn_to_page(mfn);
985 /* Quick lock-free check. */
986 if ( !mfn_valid(mfn) || !page_state_is(pg, free) )
987 continue;
989 /* Every 100MB, print a progress dot. */
990 if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
991 printk(".");
993 spin_lock(&heap_lock);
995 /* Re-check page status with lock held. */
996 if ( page_state_is(pg, free) )
997 scrub_one_page(pg);
999 spin_unlock(&heap_lock);
1000 }
1002 printk("done.\n");
1003 }
1007 /*************************
1008 * XEN-HEAP SUB-ALLOCATOR
1009 */
1011 #if !defined(__x86_64__) && !defined(__ia64__)
1013 void init_xenheap_pages(paddr_t ps, paddr_t pe)
1014 {
1015 ps = round_pgup(ps);
1016 pe = round_pgdown(pe);
1017 if ( pe <= ps )
1018 return;
1020 /*
1021 * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to
1022 * prevent merging of power-of-two blocks across the zone boundary.
1023 */
1024 if ( ps && !is_xen_heap_mfn(paddr_to_pfn(ps)-1) )
1025 ps += PAGE_SIZE;
1026 if ( !is_xen_heap_mfn(paddr_to_pfn(pe)) )
1027 pe -= PAGE_SIZE;
1029 memguard_guard_range(maddr_to_virt(ps), pe - ps);
1031 init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT);
1032 }
1035 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
1036 {
1037 struct page_info *pg;
1039 ASSERT(!in_irq());
1041 pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
1042 order, memflags, NULL);
1043 if ( unlikely(pg == NULL) )
1044 return NULL;
1046 memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));
1048 return page_to_virt(pg);
1049 }
1052 void free_xenheap_pages(void *v, unsigned int order)
1053 {
1054 ASSERT(!in_irq());
1056 if ( v == NULL )
1057 return;
1059 memguard_guard_range(v, 1 << (order + PAGE_SHIFT));
1061 free_heap_pages(virt_to_page(v), order);
1062 }
1064 #else
1066 void init_xenheap_pages(paddr_t ps, paddr_t pe)
1067 {
1068 init_domheap_pages(ps, pe);
1069 }
1071 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
1072 {
1073 struct page_info *pg;
1074 unsigned int i;
1076 ASSERT(!in_irq());
1078 pg = alloc_domheap_pages(NULL, order, memflags);
1079 if ( unlikely(pg == NULL) )
1080 return NULL;
1082 for ( i = 0; i < (1u << order); i++ )
1083 pg[i].count_info |= PGC_xen_heap;
1085 return page_to_virt(pg);
1086 }
1088 void free_xenheap_pages(void *v, unsigned int order)
1089 {
1090 struct page_info *pg;
1091 unsigned int i;
1093 ASSERT(!in_irq());
1095 if ( v == NULL )
1096 return;
1098 pg = virt_to_page(v);
1100 for ( i = 0; i < (1u << order); i++ )
1101 pg[i].count_info &= ~PGC_xen_heap;
1103 free_heap_pages(pg, order);
1104 }
1106 #endif
1110 /*************************
1111 * DOMAIN-HEAP SUB-ALLOCATOR
1112 */
1114 void init_domheap_pages(paddr_t ps, paddr_t pe)
1115 {
1116 unsigned long smfn, emfn;
1118 ASSERT(!in_irq());
1120 smfn = round_pgup(ps) >> PAGE_SHIFT;
1121 emfn = round_pgdown(pe) >> PAGE_SHIFT;
1123 init_heap_pages(mfn_to_page(smfn), emfn - smfn);
1124 }
1127 int assign_pages(
1128 struct domain *d,
1129 struct page_info *pg,
1130 unsigned int order,
1131 unsigned int memflags)
1132 {
1133 unsigned long i;
1135 spin_lock(&d->page_alloc_lock);
1137 if ( unlikely(d->is_dying) )
1138 {
1139 gdprintk(XENLOG_INFO, "Cannot assign page to domain%d -- dying.\n",
1140 d->domain_id);
1141 goto fail;
1142 }
1144 if ( !(memflags & MEMF_no_refcount) )
1145 {
1146 if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
1147 {
1148 if ( !opt_tmem || order != 0 || d->tot_pages != d->max_pages )
1149 gdprintk(XENLOG_INFO, "Over-allocation for domain %u: "
1150 "%u > %u\n", d->domain_id,
1151 d->tot_pages + (1 << order), d->max_pages);
1152 goto fail;
1153 }
1155 if ( unlikely(d->tot_pages == 0) )
1156 get_knownalive_domain(d);
1158 d->tot_pages += 1 << order;
1159 }
1161 for ( i = 0; i < (1 << order); i++ )
1162 {
1163 ASSERT(page_get_owner(&pg[i]) == NULL);
1164 ASSERT((pg[i].count_info & ~(PGC_allocated | 1)) == 0);
1165 page_set_owner(&pg[i], d);
1166 wmb(); /* Domain pointer must be visible before updating refcnt. */
1167 pg[i].count_info = PGC_allocated | 1;
1168 page_list_add_tail(&pg[i], &d->page_list);
1169 }
1171 spin_unlock(&d->page_alloc_lock);
1172 return 0;
1174 fail:
1175 spin_unlock(&d->page_alloc_lock);
1176 return -1;
1177 }
1180 struct page_info *alloc_domheap_pages(
1181 struct domain *d, unsigned int order, unsigned int memflags)
1182 {
1183 struct page_info *pg = NULL;
1184 unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
1185 unsigned int dma_zone;
1187 ASSERT(!in_irq());
1189 bits = domain_clamp_alloc_bitsize(d, bits ? : (BITS_PER_LONG+PAGE_SHIFT));
1190 if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
1191 return NULL;
1193 if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
1194 pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d);
1196 if ( (pg == NULL) &&
1197 ((memflags & MEMF_no_dma) ||
1198 ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, order,
1199 memflags, d)) == NULL)) )
1200 return NULL;
1202 if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
1203 {
1204 free_heap_pages(pg, order);
1205 return NULL;
1206 }
1208 return pg;
1209 }
1211 void free_domheap_pages(struct page_info *pg, unsigned int order)
1212 {
1213 int i, drop_dom_ref;
1214 struct domain *d = page_get_owner(pg);
1216 ASSERT(!in_irq());
1218 if ( unlikely(is_xen_heap_page(pg)) )
1219 {
1220 /* NB. May recursively lock from relinquish_memory(). */
1221 spin_lock_recursive(&d->page_alloc_lock);
1223 for ( i = 0; i < (1 << order); i++ )
1224 page_list_del2(&pg[i], &d->xenpage_list, &d->arch.relmem_list);
1226 d->xenheap_pages -= 1 << order;
1227 drop_dom_ref = (d->xenheap_pages == 0);
1229 spin_unlock_recursive(&d->page_alloc_lock);
1230 }
1231 else if ( likely(d != NULL) && likely(d != dom_cow) )
1232 {
1233 /* NB. May recursively lock from relinquish_memory(). */
1234 spin_lock_recursive(&d->page_alloc_lock);
1236 for ( i = 0; i < (1 << order); i++ )
1237 {
1238 BUG_ON((pg[i].u.inuse.type_info & PGT_count_mask) != 0);
1239 page_list_del2(&pg[i], &d->page_list, &d->arch.relmem_list);
1240 }
1242 d->tot_pages -= 1 << order;
1243 drop_dom_ref = (d->tot_pages == 0);
1245 spin_unlock_recursive(&d->page_alloc_lock);
1247 /*
1248 * Normally we expect a domain to clear pages before freeing them, if
1249 * it cares about the secrecy of their contents. However, after a
1250 * domain has died we assume responsibility for erasure.
1251 */
1252 if ( unlikely(d->is_dying) )
1253 for ( i = 0; i < (1 << order); i++ )
1254 scrub_one_page(&pg[i]);
1256 free_heap_pages(pg, order);
1257 }
1258 else if ( unlikely(d == dom_cow) )
1259 {
1260 ASSERT(order == 0);
1261 scrub_one_page(pg);
1262 free_heap_pages(pg, 0);
1263 drop_dom_ref = 0;
1264 }
1265 else
1266 {
1267 /* Freeing anonymous domain-heap pages. */
1268 free_heap_pages(pg, order);
1269 drop_dom_ref = 0;
1270 }
1272 if ( drop_dom_ref )
1273 put_domain(d);
1274 }
1276 unsigned long avail_domheap_pages_region(
1277 unsigned int node, unsigned int min_width, unsigned int max_width)
1278 {
1279 int zone_lo, zone_hi;
1281 zone_lo = min_width ? bits_to_zone(min_width) : (MEMZONE_XEN + 1);
1282 zone_lo = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_lo));
1284 zone_hi = max_width ? bits_to_zone(max_width) : (NR_ZONES - 1);
1285 zone_hi = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_hi));
1287 return avail_heap_pages(zone_lo, zone_hi, node);
1288 }
1290 unsigned long avail_domheap_pages(void)
1291 {
1292 return avail_heap_pages(MEMZONE_XEN + 1,
1293 NR_ZONES - 1,
1294 -1);
1295 }
1297 unsigned long avail_node_heap_pages(unsigned int nodeid)
1298 {
1299 return avail_heap_pages(MEMZONE_XEN, NR_ZONES -1, nodeid);
1300 }
1303 static void pagealloc_info(unsigned char key)
1304 {
1305 unsigned int zone = MEMZONE_XEN;
1306 unsigned long n, total = 0;
1308 printk("Physical memory information:\n");
1309 printk(" Xen heap: %lukB free\n",
1310 avail_heap_pages(zone, zone, -1) << (PAGE_SHIFT-10));
1312 while ( ++zone < NR_ZONES )
1313 {
1314 if ( (zone + PAGE_SHIFT) == dma_bitsize )
1315 {
1316 printk(" DMA heap: %lukB free\n", total << (PAGE_SHIFT-10));
1317 total = 0;
1318 }
1320 if ( (n = avail_heap_pages(zone, zone, -1)) != 0 )
1321 {
1322 total += n;
1323 printk(" heap[%02u]: %lukB free\n", zone, n << (PAGE_SHIFT-10));
1324 }
1325 }
1327 printk(" Dom heap: %lukB free\n", total << (PAGE_SHIFT-10));
1328 }
1330 static struct keyhandler pagealloc_info_keyhandler = {
1331 .diagnostic = 1,
1332 .u.fn = pagealloc_info,
1333 .desc = "memory info"
1334 };
1336 static __init int pagealloc_keyhandler_init(void)
1337 {
1338 register_keyhandler('m', &pagealloc_info_keyhandler);
1339 return 0;
1340 }
1341 __initcall(pagealloc_keyhandler_init);
1344 void scrub_one_page(struct page_info *pg)
1345 {
1346 void *p = __map_domain_page(pg);
1348 if ( unlikely(pg->count_info & PGC_broken) )
1349 return;
1351 #ifndef NDEBUG
1352 /* Avoid callers relying on allocations returning zeroed pages. */
1353 memset(p, 0xc2, PAGE_SIZE);
1354 #else
1355 /* For a production build, clear_page() is the fastest way to scrub. */
1356 clear_page(p);
1357 #endif
1359 unmap_domain_page(p);
1360 }
1362 static void dump_heap(unsigned char key)
1363 {
1364 s_time_t now = NOW();
1365 int i, j;
1367 printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key,
1368 (u32)(now>>32), (u32)now);
1370 for ( i = 0; i < MAX_NUMNODES; i++ )
1371 {
1372 if ( !avail[i] )
1373 continue;
1374 for ( j = 0; j < NR_ZONES; j++ )
1375 printk("heap[node=%d][zone=%d] -> %lu pages\n",
1376 i, j, avail[i][j]);
1377 }
1378 }
1380 static struct keyhandler dump_heap_keyhandler = {
1381 .diagnostic = 1,
1382 .u.fn = dump_heap,
1383 .desc = "dump heap info"
1384 };
1386 static __init int register_heap_trigger(void)
1387 {
1388 register_keyhandler('H', &dump_heap_keyhandler);
1389 return 0;
1390 }
1391 __initcall(register_heap_trigger);
1393 /*
1394 * Local variables:
1395 * mode: C
1396 * c-set-style: "BSD"
1397 * c-basic-offset: 4
1398 * tab-width: 4
1399 * indent-tabs-mode: nil
1400 * End:
1401 */