
xen/common/page_alloc.c @ 20991:3a0bd7ca6b11

When tmem is enabled, reserve a fraction of memory
for allocations of 0<order<9 to avoid fragmentation
issues.

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Feb 15 17:54:04 2010 +0000 (2010-02-15)
parents 391cb20b6ea9
children 077089e37ac9
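
Worked example of the reservation (illustrative, not part of the changeset description): the fraction used below is 1/128 (MIDSIZE_ALLOC_FRAC), so a host whose domain heap peaks at 256 GiB keeps 256 GiB / 128 = 2 GiB of free memory that, once free memory runs low, only allocations of order 1 through 8 may consume.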
1 /******************************************************************************
2 * page_alloc.c
3 *
4 * Simple buddy heap allocator for Xen.
5 *
6 * Copyright (c) 2002-2004 K A Fraser
7 * Copyright (c) 2006 IBM Ryan Harper <ryanh@us.ibm.com>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/init.h>
26 #include <xen/types.h>
27 #include <xen/lib.h>
28 #include <xen/sched.h>
29 #include <xen/spinlock.h>
30 #include <xen/mm.h>
31 #include <xen/irq.h>
32 #include <xen/softirq.h>
33 #include <xen/domain_page.h>
34 #include <xen/keyhandler.h>
35 #include <xen/perfc.h>
36 #include <xen/numa.h>
37 #include <xen/nodemask.h>
38 #include <xen/tmem.h>
39 #include <public/sysctl.h>
40 #include <asm/page.h>
41 #include <asm/numa.h>
42 #include <asm/flushtlb.h>
44 /*
45 * Comma-separated list of hexadecimal page numbers containing bad bytes.
46 * e.g. 'badpage=0x3f45,0x8a321'.
47 */
48 static char __initdata opt_badpage[100] = "";
49 string_param("badpage", opt_badpage);
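/*
 * Illustrative note (not in the original comment): the parsing loop in
 * init_boot_pages() below also accepts ranges, so
 * 'badpage=0x3f45,0x8a321-0x8a340' marks page 0x3f45 and pages 0x8a321
 * through 0x8a340 inclusive as bad.
 */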
51 /*
52 * no-bootscrub -> Free pages are not zeroed during boot.
53 */
54 static int opt_bootscrub __initdata = 1;
55 boolean_param("bootscrub", opt_bootscrub);
57 /*
58 * Bit width of the DMA heap -- used to override NUMA-node-first
59 * allocation strategy, which can otherwise exhaust low memory.
60 */
61 static unsigned int dma_bitsize;
62 integer_param("dma_bits", dma_bitsize);
64 #define round_pgdown(_p) ((_p)&PAGE_MASK)
65 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
67 /* Offlined page list, protected by heap_lock. */
68 PAGE_LIST_HEAD(page_offlined_list);
69 /* Broken page list, protected by heap_lock. */
70 PAGE_LIST_HEAD(page_broken_list);
72 /*************************
73 * BOOT-TIME ALLOCATOR
74 */
76 static unsigned long __initdata first_valid_mfn = ~0UL;
78 static struct bootmem_region {
79 unsigned long s, e; /* MFNs @s through @e-1 inclusive are free */
80 } *__initdata bootmem_region_list;
81 static unsigned int __initdata nr_bootmem_regions;
83 static void __init boot_bug(int line)
84 {
85 panic("Boot BUG at %s:%d\n", __FILE__, line);
86 }
87 #define BOOT_BUG_ON(p) if ( p ) boot_bug(__LINE__);
89 static void __init bootmem_region_add(unsigned long s, unsigned long e)
90 {
91 unsigned int i;
93 if ( (bootmem_region_list == NULL) && (s < e) )
94 bootmem_region_list = mfn_to_virt(s++);
96 if ( s >= e )
97 return;
99 for ( i = 0; i < nr_bootmem_regions; i++ )
100 if ( s < bootmem_region_list[i].e )
101 break;
103 BOOT_BUG_ON((i < nr_bootmem_regions) && (e > bootmem_region_list[i].s));
104 BOOT_BUG_ON(nr_bootmem_regions ==
105 (PAGE_SIZE / sizeof(struct bootmem_region)));
107 memmove(&bootmem_region_list[i+1], &bootmem_region_list[i],
108 (nr_bootmem_regions - i) * sizeof(*bootmem_region_list));
109 bootmem_region_list[i] = (struct bootmem_region) { s, e };
110 nr_bootmem_regions++;
111 }
113 static void __init bootmem_region_zap(unsigned long s, unsigned long e)
114 {
115 unsigned int i;
117 for ( i = 0; i < nr_bootmem_regions; i++ )
118 {
119 struct bootmem_region *r = &bootmem_region_list[i];
120 if ( e <= r->s )
121 break;
122 if ( s >= r->e )
123 continue;
124 if ( s <= r->s )
125 {
126 r->s = min(e, r->e);
127 }
128 else if ( e >= r->e )
129 {
130 r->e = s;
131 }
132 else
133 {
134 unsigned long _e = r->e;
135 r->e = s;
136 bootmem_region_add(e, _e);
137 }
138 }
139 }
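/*
 * Worked example (illustrative): zapping [0x140, 0x150) out of a single
 * region [0x100, 0x200) takes the final "else" branch above: the existing
 * region is trimmed to [0x100, 0x140) and bootmem_region_add() re-inserts
 * [0x150, 0x200) as a separate region.
 */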
141 void __init init_boot_pages(paddr_t ps, paddr_t pe)
142 {
143 unsigned long bad_spfn, bad_epfn;
144 const char *p;
146 ps = round_pgup(ps);
147 pe = round_pgdown(pe);
148 if ( pe <= ps )
149 return;
151 first_valid_mfn = min_t(unsigned long, ps >> PAGE_SHIFT, first_valid_mfn);
153 bootmem_region_add(ps >> PAGE_SHIFT, pe >> PAGE_SHIFT);
155 /* Check new pages against the bad-page list. */
156 p = opt_badpage;
157 while ( *p != '\0' )
158 {
159 bad_spfn = simple_strtoul(p, &p, 0);
160 bad_epfn = bad_spfn;
162 if ( *p == '-' )
163 {
164 p++;
165 bad_epfn = simple_strtoul(p, &p, 0);
166 if ( bad_epfn < bad_spfn )
167 bad_epfn = bad_spfn;
168 }
170 if ( *p == ',' )
171 p++;
172 else if ( *p != '\0' )
173 break;
175 if ( bad_epfn == bad_spfn )
176 printk("Marking page %lx as bad\n", bad_spfn);
177 else
178 printk("Marking pages %lx through %lx as bad\n",
179 bad_spfn, bad_epfn);
181 bootmem_region_zap(bad_spfn, bad_epfn+1);
182 }
183 }
185 unsigned long __init alloc_boot_pages(
186 unsigned long nr_pfns, unsigned long pfn_align)
187 {
188 unsigned long pg, _e;
189 int i;
191 for ( i = nr_bootmem_regions - 1; i >= 0; i-- )
192 {
193 struct bootmem_region *r = &bootmem_region_list[i];
194 pg = (r->e - nr_pfns) & ~(pfn_align - 1);
195 if ( pg < r->s )
196 continue;
197 _e = r->e;
198 r->e = pg;
199 bootmem_region_add(pg + nr_pfns, _e);
200 return pg;
201 }
203 BOOT_BUG_ON(1);
204 return 0;
205 }
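/*
 * Usage sketch (illustrative, not part of this file): both arguments are in
 * pages, e.g.
 *     mfn = alloc_boot_pages(1, 1);                  one page, no alignment
 *     mfn = alloc_boot_pages(1UL << 9, 1UL << 9);    2MiB, 2MiB-aligned (4KiB pages)
 * The block is carved from the end of the highest-addressed region that can
 * satisfy the request; BOOT_BUG_ON(1) fires if none can.
 */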
209 /*************************
210 * BINARY BUDDY ALLOCATOR
211 */
213 #define MEMZONE_XEN 0
214 #define NR_ZONES (PADDR_BITS - PAGE_SHIFT)
216 #define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 0 : ((b) - PAGE_SHIFT - 1))
217 #define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN : \
218 (fls(page_to_mfn(pg)) - 1))
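/*
 * Worked example (illustrative): with 4KiB pages (PAGE_SHIFT == 12), zone z
 * holds pages whose MFN lies in [2^z, 2^(z+1)), i.e. machine addresses below
 * 2^(z+1+PAGE_SHIFT). A page at MFN 0x12345 has fls() == 17 and so lands in
 * zone 16, while a 32-bit DMA width maps to bits_to_zone(32) == 19.
 */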
220 typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
221 static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
222 #define heap(node, zone, order) ((*_heap[node])[zone][order])
224 static unsigned long *avail[MAX_NUMNODES];
225 static long total_avail_pages;
227 /* TMEM: Reserve a fraction of memory for mid-size (0<order<9) allocations.*/
228 static long midsize_alloc_zone_pages;
229 #define MIDSIZE_ALLOC_FRAC 128
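/*
 * Illustrative arithmetic (not part of the original file): free_heap_pages()
 * below ratchets midsize_alloc_zone_pages up to total_avail_pages / 128, so a
 * domain heap that peaks at 67,108,864 pages (256GiB of 4KiB pages) reserves
 * 524,288 pages (2GiB). When tmem is enabled and free memory falls to that
 * level, alloc_heap_pages() rejects order-0 and order>=9 requests so that
 * orders 1..8 can still be satisfied.
 */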
231 static DEFINE_SPINLOCK(heap_lock);
233 static unsigned long init_node_heap(int node, unsigned long mfn,
234 unsigned long nr)
235 {
236 /* First node to be discovered has its heap metadata statically alloced. */
237 static heap_by_zone_and_order_t _heap_static;
238 static unsigned long avail_static[NR_ZONES];
239 static int first_node_initialised;
240 unsigned long needed = (sizeof(**_heap) +
241 sizeof(**avail) * NR_ZONES +
242 PAGE_SIZE - 1) >> PAGE_SHIFT;
243 int i, j;
245 if ( !first_node_initialised )
246 {
247 _heap[node] = &_heap_static;
248 avail[node] = avail_static;
249 first_node_initialised = 1;
250 needed = 0;
251 }
252 #ifdef DIRECTMAP_VIRT_END
253 else if ( nr >= needed &&
254 (mfn + needed) <= (virt_to_mfn(DIRECTMAP_VIRT_END - 1) + 1) )
255 {
256 _heap[node] = mfn_to_virt(mfn);
257 avail[node] = mfn_to_virt(mfn + needed - 1) +
258 PAGE_SIZE - sizeof(**avail) * NR_ZONES;
259 }
260 #endif
261 else if ( get_order_from_bytes(sizeof(**_heap)) ==
262 get_order_from_pages(needed) )
263 {
264 _heap[node] = alloc_xenheap_pages(get_order_from_pages(needed), 0);
265 BUG_ON(!_heap[node]);
266 avail[node] = (void *)_heap[node] + (needed << PAGE_SHIFT) -
267 sizeof(**avail) * NR_ZONES;
268 needed = 0;
269 }
270 else
271 {
272 _heap[node] = xmalloc(heap_by_zone_and_order_t);
273 avail[node] = xmalloc_array(unsigned long, NR_ZONES);
274 BUG_ON(!_heap[node] || !avail[node]);
275 needed = 0;
276 }
278 memset(avail[node], 0, NR_ZONES * sizeof(long));
280 for ( i = 0; i < NR_ZONES; i++ )
281 for ( j = 0; j <= MAX_ORDER; j++ )
282 INIT_PAGE_LIST_HEAD(&(*_heap[node])[i][j]);
284 return needed;
285 }
287 /* Allocate 2^@order contiguous pages. */
288 static struct page_info *alloc_heap_pages(
289 unsigned int zone_lo, unsigned int zone_hi,
290 unsigned int node, unsigned int order, unsigned int memflags)
291 {
292 unsigned int i, j, zone = 0;
293 unsigned int num_nodes = num_online_nodes();
294 unsigned long request = 1UL << order;
295 cpumask_t extra_cpus_mask, mask;
296 struct page_info *pg;
298 if ( node == NUMA_NO_NODE )
299 node = cpu_to_node(smp_processor_id());
301 ASSERT(node >= 0);
302 ASSERT(zone_lo <= zone_hi);
303 ASSERT(zone_hi < NR_ZONES);
305 if ( unlikely(order > MAX_ORDER) )
306 return NULL;
308 spin_lock(&heap_lock);
310 /*
311 * TMEM: When available memory is scarce, allow only mid-size allocations
312 * to avoid worst of fragmentation issues.
313 */
314 if ( opt_tmem && ((order == 0) || (order >= 9)) &&
315 (total_avail_pages <= midsize_alloc_zone_pages) )
316 goto fail;
318 /*
319 * Start with the requested node, but exhaust all node memory in the requested
320 * zone before failing. Only compute a new node value if we fail to find memory
321 * in the target node; this avoids needless computation on the fast path.
322 */
323 for ( i = 0; i < num_nodes; i++ )
324 {
325 zone = zone_hi;
326 do {
327 /* Check if target node can support the allocation. */
328 if ( !avail[node] || (avail[node][zone] < request) )
329 continue;
331 /* Find smallest order which can satisfy the request. */
332 for ( j = order; j <= MAX_ORDER; j++ )
333 if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
334 goto found;
335 } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
337 /* Pick next node, wrapping around if needed. */
338 node = next_node(node, node_online_map);
339 if (node == MAX_NUMNODES)
340 node = first_node(node_online_map);
341 }
343 /* Try to free memory from tmem */
344 if ( (pg = tmem_relinquish_pages(order,memflags)) != NULL )
345 {
346 /* reassigning an already allocated anonymous heap page */
347 spin_unlock(&heap_lock);
348 return pg;
349 }
351 fail:
352 /* No suitable memory blocks. Fail the request. */
353 spin_unlock(&heap_lock);
354 return NULL;
356 found:
357 /* We may have to halve the chunk a number of times. */
358 while ( j != order )
359 {
360 PFN_ORDER(pg) = --j;
361 page_list_add_tail(pg, &heap(node, zone, j));
362 pg += 1 << j;
363 }
365 ASSERT(avail[node][zone] >= request);
366 avail[node][zone] -= request;
367 total_avail_pages -= request;
368 ASSERT(total_avail_pages >= 0);
370 spin_unlock(&heap_lock);
372 cpus_clear(mask);
374 for ( i = 0; i < (1 << order); i++ )
375 {
376 /* Reference count must continuously be zero for free pages. */
377 BUG_ON(pg[i].count_info != PGC_state_free);
378 pg[i].count_info = PGC_state_inuse;
380 if ( pg[i].u.free.need_tlbflush )
381 {
382 /* Add in extra CPUs that need flushing because of this page. */
383 cpus_andnot(extra_cpus_mask, cpu_online_map, mask);
384 tlbflush_filter(extra_cpus_mask, pg[i].tlbflush_timestamp);
385 cpus_or(mask, mask, extra_cpus_mask);
386 }
388 /* Initialise fields which have other uses for free pages. */
389 pg[i].u.inuse.type_info = 0;
390 page_set_owner(&pg[i], NULL);
391 }
393 if ( unlikely(!cpus_empty(mask)) )
394 {
395 perfc_incr(need_flush_tlb_flush);
396 flush_tlb_mask(&mask);
397 }
399 return pg;
400 }
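/*
 * Worked example of the halving loop above (illustrative): a request for
 * order 2 (4 pages) satisfied from an order-5 chunk (32 pages) puts an
 * order-4, an order-3 and an order-2 buddy back on the free lists and hands
 * the caller the final 4 pages of the original chunk.
 */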
402 /* Remove any offlined page in the buddy pointed to by head. */
403 static int reserve_offlined_page(struct page_info *head)
404 {
405 unsigned int node = phys_to_nid(page_to_maddr(head));
406 int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
407 struct page_info *cur_head;
408 int cur_order;
410 ASSERT(spin_is_locked(&heap_lock));
412 cur_head = head;
414 page_list_del(head, &heap(node, zone, head_order));
416 while ( cur_head < (head + (1 << head_order)) )
417 {
418 struct page_info *pg;
419 int next_order;
421 if ( page_state_is(cur_head, offlined) )
422 {
423 cur_head++;
424 continue;
425 }
427 next_order = cur_order = 0;
429 while ( cur_order < head_order )
430 {
431 next_order = cur_order + 1;
433 if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)) )
434 goto merge;
436 for ( i = (1 << cur_order), pg = cur_head + (1 << cur_order );
437 i < (1 << next_order);
438 i++, pg++ )
439 if ( page_state_is(pg, offlined) )
440 break;
441 if ( i == ( 1 << next_order) )
442 {
443 cur_order = next_order;
444 continue;
445 }
446 else
447 {
448 merge:
449 /* We don't consider merging outside the head_order. */
450 page_list_add_tail(cur_head, &heap(node, zone, cur_order));
451 PFN_ORDER(cur_head) = cur_order;
452 cur_head += (1 << cur_order);
453 break;
454 }
455 }
456 }
458 for ( cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++ )
459 {
460 if ( !page_state_is(cur_head, offlined) )
461 continue;
463 avail[node][zone]--;
464 total_avail_pages--;
465 ASSERT(total_avail_pages >= 0);
467 page_list_add_tail(cur_head,
468 test_bit(_PGC_broken, &cur_head->count_info) ?
469 &page_broken_list : &page_offlined_list);
471 count++;
472 }
474 return count;
475 }
477 /* Free 2^@order set of pages. */
478 static void free_heap_pages(
479 struct page_info *pg, unsigned int order)
480 {
481 unsigned long mask;
482 unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
483 unsigned int zone = page_to_zone(pg);
485 ASSERT(order <= MAX_ORDER);
486 ASSERT(node >= 0);
488 for ( i = 0; i < (1 << order); i++ )
489 {
490 /*
491 * Cannot assume that count_info == 0, as there are some corner cases
492 * where it isn't the case and yet it isn't a bug:
493 * 1. page_get_owner() is NULL
494 * 2. page_get_owner() is a domain that was never accessible by
495 * its domid (e.g., failed to fully construct the domain).
496 * 3. page was never addressable by the guest (e.g., it's an
497 * auto-translate-physmap guest and the page was never included
498 * in its pseudophysical address space).
499 * In all the above cases there can be no guest mappings of this page.
500 */
501 ASSERT(!page_state_is(&pg[i], offlined));
502 pg[i].count_info =
503 ((pg[i].count_info & PGC_broken) |
504 (page_state_is(&pg[i], offlining)
505 ? PGC_state_offlined : PGC_state_free));
506 if ( page_state_is(&pg[i], offlined) )
507 tainted = 1;
509 /* If a page has no owner it will need no safety TLB flush. */
510 pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
511 if ( pg[i].u.free.need_tlbflush )
512 pg[i].tlbflush_timestamp = tlbflush_current_time();
513 }
515 spin_lock(&heap_lock);
517 avail[node][zone] += 1 << order;
518 total_avail_pages += 1 << order;
520 if ( opt_tmem )
521 midsize_alloc_zone_pages = max(
522 midsize_alloc_zone_pages, total_avail_pages / MIDSIZE_ALLOC_FRAC);
524 /* Merge chunks as far as possible. */
525 while ( order < MAX_ORDER )
526 {
527 mask = 1UL << order;
529 if ( (page_to_mfn(pg) & mask) )
530 {
531 /* Merge with predecessor block? */
532 if ( !mfn_valid(page_to_mfn(pg-mask)) ||
533 !page_state_is(pg-mask, free) ||
534 (PFN_ORDER(pg-mask) != order) )
535 break;
536 pg -= mask;
537 page_list_del(pg, &heap(node, zone, order));
538 }
539 else
540 {
541 /* Merge with successor block? */
542 if ( !mfn_valid(page_to_mfn(pg+mask)) ||
543 !page_state_is(pg+mask, free) ||
544 (PFN_ORDER(pg+mask) != order) )
545 break;
546 page_list_del(pg + mask, &heap(node, zone, order));
547 }
549 order++;
551 /* After merging, pg should remain in the same node. */
552 ASSERT(phys_to_nid(page_to_maddr(pg)) == node);
553 }
555 PFN_ORDER(pg) = order;
556 page_list_add_tail(pg, &heap(node, zone, order));
558 if ( tainted )
559 reserve_offlined_page(pg);
561 spin_unlock(&heap_lock);
562 }
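/*
 * Worked example of the merge loop above (illustrative): freeing an order-0
 * page at MFN 0x1001 finds bit 0 of the MFN set, so the candidate buddy is
 * the predecessor at 0x1000; if that page is free and of order 0, the pair
 * becomes an order-1 block at 0x1000. Bit 1 of 0x1000 is clear, so the next
 * candidate is the successor block at 0x1002, and so on up to MAX_ORDER.
 */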
565 /*
566 * Possible states for a page:
567 * free and online; free and offlined; free and offlined and broken;
568 * assigned and online; assigned and offlining; assigned and offlining and broken
569 *
570 * The following rules apply to page offlining:
571 * Once a page is broken, it can never be assigned again.
572 * A page is marked offlined only if it is free.
573 * Returns the original count_info.
574 */
575 static unsigned long mark_page_offline(struct page_info *pg, int broken)
576 {
577 unsigned long nx, x, y = pg->count_info;
579 ASSERT(page_is_ram_type(page_to_mfn(pg), RAM_TYPE_CONVENTIONAL));
580 ASSERT(spin_is_locked(&heap_lock));
582 do {
583 nx = x = y;
585 if ( ((x & PGC_state) != PGC_state_offlined) &&
586 ((x & PGC_state) != PGC_state_offlining) )
587 {
588 nx &= ~PGC_state;
589 nx |= (((x & PGC_state) == PGC_state_free)
590 ? PGC_state_offlined : PGC_state_offlining);
591 }
593 if ( broken )
594 nx |= PGC_broken;
596 if ( x == nx )
597 break;
598 } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
600 return y;
601 }
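/*
 * Summary of the transitions performed above (illustrative):
 *     PGC_state_free      -> PGC_state_offlined
 *     PGC_state_inuse     -> PGC_state_offlining
 *     PGC_state_offlining -> unchanged
 *     PGC_state_offlined  -> unchanged
 * PGC_broken is additionally set when @broken is true, and the previous
 * count_info is returned to the caller.
 */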
603 static int reserve_heap_page(struct page_info *pg)
604 {
605 struct page_info *head = NULL;
606 unsigned int i, node = phys_to_nid(page_to_maddr(pg));
607 unsigned int zone = page_to_zone(pg);
609 for ( i = 0; i <= MAX_ORDER; i++ )
610 {
611 struct page_info *tmp;
613 if ( page_list_empty(&heap(node, zone, i)) )
614 continue;
616 page_list_for_each_safe ( head, tmp, &heap(node, zone, i) )
617 {
618 if ( (head <= pg) &&
619 (head + (1UL << i) > pg) )
620 return reserve_offlined_page(head);
621 }
622 }
624 return -EINVAL;
626 }
628 int offline_page(unsigned long mfn, int broken, uint32_t *status)
629 {
630 unsigned long old_info = 0;
631 struct domain *owner;
632 int ret = 0;
633 struct page_info *pg;
635 if ( !mfn_valid(mfn) )
636 {
637 dprintk(XENLOG_WARNING,
638 "try to offline page out of range %lx\n", mfn);
639 return -EINVAL;
640 }
642 *status = 0;
643 pg = mfn_to_page(mfn);
645 if ( is_xen_fixed_mfn(mfn) )
646 {
647 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED |
648 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
649 return -EPERM;
650 }
652 /*
653 * N.B. xen's txt in x86_64 is marked reserved and handled already.
654 * Also kexec range is reserved.
655 */
656 if ( !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
657 {
658 *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM;
659 return -EINVAL;
660 }
662 spin_lock(&heap_lock);
664 old_info = mark_page_offline(pg, broken);
666 if ( page_state_is(pg, free) )
667 {
668 /* Free pages are reserved directly. */
669 reserve_heap_page(pg);
670 *status = PG_OFFLINE_OFFLINED;
671 }
672 else if ( page_state_is(pg, offlined) )
673 {
674 *status = PG_OFFLINE_OFFLINED;
675 }
676 else if ( (owner = page_get_owner_and_reference(pg)) )
677 {
678 *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING |
679 (owner->domain_id << PG_OFFLINE_OWNER_SHIFT);
680 /* Release the reference since it will not be allocated anymore */
681 put_page(pg);
682 }
683 else if ( old_info & PGC_xen_heap )
684 {
685 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING |
686 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
687 }
688 else
689 {
690 /*
691 * assign_pages() does not hold heap_lock, so there is a small window in
692 * which the owner may be set later; but note that the owner can only change
693 * from NULL to non-NULL, not the reverse, since the page is offlining now.
694 * There is no window if called from the #MC handler, since all CPUs are in
695 * softirq context. If called from user space (e.g. CE handling), tools can
696 * wait some time before calling again.
697 */
698 *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED |
699 (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT );
700 }
702 if ( broken )
703 *status |= PG_OFFLINE_BROKEN;
705 spin_unlock(&heap_lock);
707 return ret;
708 }
710 /*
711 * Online the memory.
712 * The caller should make sure end_pfn <= max_page,
713 * if not, expand_pages() should be called prior to online_page().
714 */
715 unsigned int online_page(unsigned long mfn, uint32_t *status)
716 {
717 unsigned long x, nx, y;
718 struct page_info *pg;
719 int ret;
721 if ( !mfn_valid(mfn) )
722 {
723 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
724 return -EINVAL;
725 }
727 pg = mfn_to_page(mfn);
729 spin_lock(&heap_lock);
731 y = pg->count_info;
732 do {
733 ret = *status = 0;
735 if ( y & PGC_broken )
736 {
737 ret = -EINVAL;
738 *status = PG_ONLINE_FAILED |PG_ONLINE_BROKEN;
739 break;
740 }
742 if ( (y & PGC_state) == PGC_state_offlined )
743 {
744 page_list_del(pg, &page_offlined_list);
745 *status = PG_ONLINE_ONLINED;
746 }
747 else if ( (y & PGC_state) == PGC_state_offlining )
748 {
749 *status = PG_ONLINE_ONLINED;
750 }
751 else
752 {
753 break;
754 }
756 x = y;
757 nx = (x & ~PGC_state) | PGC_state_inuse;
758 } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
760 spin_unlock(&heap_lock);
762 if ( (y & PGC_state) == PGC_state_offlined )
763 free_heap_pages(pg, 0);
765 return ret;
766 }
768 int query_page_offline(unsigned long mfn, uint32_t *status)
769 {
770 struct page_info *pg;
772 if ( !mfn_valid(mfn) || !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
773 {
774 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
775 return -EINVAL;
776 }
778 *status = 0;
779 spin_lock(&heap_lock);
781 pg = mfn_to_page(mfn);
783 if ( page_state_is(pg, offlining) )
784 *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING;
785 if ( pg->count_info & PGC_broken )
786 *status |= PG_OFFLINE_STATUS_BROKEN;
787 if ( page_state_is(pg, offlined) )
788 *status |= PG_OFFLINE_STATUS_OFFLINED;
790 spin_unlock(&heap_lock);
792 return 0;
793 }
795 /*
796 * Hand the specified arbitrary page range to the specified heap zone
797 * checking the node_id of the previous page. If they differ and the
798 * latter is not on a MAX_ORDER boundary, then we reserve the page by
799 * not freeing it to the buddy allocator.
800 */
801 static void init_heap_pages(
802 struct page_info *pg, unsigned long nr_pages)
803 {
804 unsigned int nid_curr, nid_prev;
805 unsigned long i;
807 nid_prev = phys_to_nid(page_to_maddr(pg-1));
809 for ( i = 0; i < nr_pages; nid_prev = nid_curr, i++ )
810 {
811 nid_curr = phys_to_nid(page_to_maddr(pg+i));
813 if ( unlikely(!avail[nid_curr]) )
814 {
815 unsigned long n;
817 n = init_node_heap(nid_curr, page_to_mfn(pg+i), nr_pages - i);
818 if ( n )
819 {
820 BUG_ON(i + n > nr_pages);
821 i += n - 1;
822 continue;
823 }
824 }
826 /*
827 * Free pages of the same node, or if they differ, but are on a
828 * MAX_ORDER alignment boundary (which already get reserved).
829 */
830 if ( (nid_curr == nid_prev) ||
831 !(page_to_mfn(pg+i) & ((1UL << MAX_ORDER) - 1)) )
832 free_heap_pages(pg+i, 0);
833 else
834 printk("Reserving non-aligned node boundary @ mfn %#lx\n",
835 page_to_mfn(pg+i));
836 }
837 }
839 static unsigned long avail_heap_pages(
840 unsigned int zone_lo, unsigned int zone_hi, unsigned int node)
841 {
842 unsigned int i, zone;
843 unsigned long free_pages = 0;
845 if ( zone_hi >= NR_ZONES )
846 zone_hi = NR_ZONES - 1;
848 for_each_online_node(i)
849 {
850 if ( !avail[i] )
851 continue;
852 for ( zone = zone_lo; zone <= zone_hi; zone++ )
853 if ( (node == -1) || (node == i) )
854 free_pages += avail[i][zone];
855 }
857 return free_pages;
858 }
860 unsigned long total_free_pages(void)
861 {
862 return total_avail_pages - midsize_alloc_zone_pages;
863 }
865 void __init end_boot_allocator(void)
866 {
867 unsigned int i;
869 /* Pages that are free now go to the domain sub-allocator. */
870 for ( i = 0; i < nr_bootmem_regions; i++ )
871 {
872 struct bootmem_region *r = &bootmem_region_list[i];
873 if ( r->s < r->e )
874 init_heap_pages(mfn_to_page(r->s), r->e - r->s);
875 }
876 init_heap_pages(virt_to_page(bootmem_region_list), 1);
878 if ( !dma_bitsize && (num_online_nodes() > 1) )
879 {
880 #ifdef CONFIG_X86
881 dma_bitsize = min_t(unsigned int,
882 fls(NODE_DATA(0)->node_spanned_pages) - 1
883 + PAGE_SHIFT - 2,
884 32);
885 #else
886 dma_bitsize = 32;
887 #endif
888 }
890 printk("Domain heap initialised");
891 if ( dma_bitsize )
892 printk(" DMA width %u bits", dma_bitsize);
893 printk("\n");
894 }
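/*
 * Worked example of the dma_bitsize default above (illustrative): on a
 * multi-node x86 host with 4KiB pages, a node 0 spanning 4GiB has
 * node_spanned_pages == 1 << 20, fls() returns 21, and
 * dma_bitsize = 21 - 1 + 12 - 2 = 30, i.e. the DMA heap covers the low 1GiB
 * (a quarter of node 0), capped at 32 bits for larger nodes.
 */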
896 /*
897 * Scrub all unallocated pages in all heap zones. This function is more
898 * convoluted than appears necessary because we do not want to continuously
899 * hold the lock while scrubbing very large memory areas.
900 */
901 void __init scrub_heap_pages(void)
902 {
903 unsigned long mfn;
904 struct page_info *pg;
906 if ( !opt_bootscrub )
907 return;
909 printk("Scrubbing Free RAM: ");
911 for ( mfn = first_valid_mfn; mfn < max_page; mfn++ )
912 {
913 process_pending_softirqs();
915 pg = mfn_to_page(mfn);
917 /* Quick lock-free check. */
918 if ( !mfn_valid(mfn) || !page_state_is(pg, free) )
919 continue;
921 /* Every 100MB, print a progress dot. */
922 if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
923 printk(".");
925 spin_lock(&heap_lock);
927 /* Re-check page status with lock held. */
928 if ( page_state_is(pg, free) )
929 scrub_one_page(pg);
931 spin_unlock(&heap_lock);
932 }
934 printk("done.\n");
935 }
939 /*************************
940 * XEN-HEAP SUB-ALLOCATOR
941 */
943 #if !defined(__x86_64__) && !defined(__ia64__)
945 void init_xenheap_pages(paddr_t ps, paddr_t pe)
946 {
947 ps = round_pgup(ps);
948 pe = round_pgdown(pe);
949 if ( pe <= ps )
950 return;
952 /*
953 * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to
954 * prevent merging of power-of-two blocks across the zone boundary.
955 */
956 if ( ps && !is_xen_heap_mfn(paddr_to_pfn(ps)-1) )
957 ps += PAGE_SIZE;
958 if ( !is_xen_heap_mfn(paddr_to_pfn(pe)) )
959 pe -= PAGE_SIZE;
961 memguard_guard_range(maddr_to_virt(ps), pe - ps);
963 init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT);
964 }
967 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
968 {
969 struct page_info *pg;
971 ASSERT(!in_irq());
973 pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
974 cpu_to_node(smp_processor_id()), order, memflags);
975 if ( unlikely(pg == NULL) )
976 return NULL;
978 memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));
980 return page_to_virt(pg);
981 }
984 void free_xenheap_pages(void *v, unsigned int order)
985 {
986 ASSERT(!in_irq());
988 if ( v == NULL )
989 return;
991 memguard_guard_range(v, 1 << (order + PAGE_SHIFT));
993 free_heap_pages(virt_to_page(v), order);
994 }
996 #else
998 void init_xenheap_pages(paddr_t ps, paddr_t pe)
999 {
1000 init_domheap_pages(ps, pe);
1001 }
1003 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
1004 {
1005 struct page_info *pg;
1006 unsigned int i;
1008 ASSERT(!in_irq());
1010 pg = alloc_domheap_pages(NULL, order, memflags);
1011 if ( unlikely(pg == NULL) )
1012 return NULL;
1014 for ( i = 0; i < (1u << order); i++ )
1015 pg[i].count_info |= PGC_xen_heap;
1017 return page_to_virt(pg);
1018 }
1020 void free_xenheap_pages(void *v, unsigned int order)
1021 {
1022 struct page_info *pg;
1023 unsigned int i;
1025 ASSERT(!in_irq());
1027 if ( v == NULL )
1028 return;
1030 pg = virt_to_page(v);
1032 for ( i = 0; i < (1u << order); i++ )
1033 pg[i].count_info &= ~PGC_xen_heap;
1035 free_heap_pages(pg, order);
1036 }
1038 #endif
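/*
 * Usage sketch (illustrative, not part of this file): Xen-heap allocations
 * return a virtual address mapped in Xen's own address space.
 *
 *     void *buf = alloc_xenheap_pages(1, 0);    two contiguous pages
 *     if ( buf != NULL )
 *     {
 *         ...
 *         free_xenheap_pages(buf, 1);           order must match the allocation
 *     }
 */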
1042 /*************************
1043 * DOMAIN-HEAP SUB-ALLOCATOR
1044 */
1046 void init_domheap_pages(paddr_t ps, paddr_t pe)
1047 {
1048 unsigned long smfn, emfn;
1050 ASSERT(!in_irq());
1052 smfn = round_pgup(ps) >> PAGE_SHIFT;
1053 emfn = round_pgdown(pe) >> PAGE_SHIFT;
1055 init_heap_pages(mfn_to_page(smfn), emfn - smfn);
1056 }
1059 int assign_pages(
1060 struct domain *d,
1061 struct page_info *pg,
1062 unsigned int order,
1063 unsigned int memflags)
1064 {
1065 unsigned long i;
1067 spin_lock(&d->page_alloc_lock);
1069 if ( unlikely(d->is_dying) )
1070 {
1071 gdprintk(XENLOG_INFO, "Cannot assign page to domain%d -- dying.\n",
1072 d->domain_id);
1073 goto fail;
1074 }
1076 if ( !(memflags & MEMF_no_refcount) )
1077 {
1078 if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
1079 {
1080 if ( !opt_tmem || order != 0 || d->tot_pages != d->max_pages )
1081 gdprintk(XENLOG_INFO, "Over-allocation for domain %u: "
1082 "%u > %u\n", d->domain_id,
1083 d->tot_pages + (1 << order), d->max_pages);
1084 goto fail;
1085 }
1087 if ( unlikely(d->tot_pages == 0) )
1088 get_knownalive_domain(d);
1090 d->tot_pages += 1 << order;
1091 }
1093 for ( i = 0; i < (1 << order); i++ )
1094 {
1095 ASSERT(page_get_owner(&pg[i]) == NULL);
1096 ASSERT((pg[i].count_info & ~(PGC_allocated | 1)) == 0);
1097 page_set_owner(&pg[i], d);
1098 wmb(); /* Domain pointer must be visible before updating refcnt. */
1099 pg[i].count_info = PGC_allocated | 1;
1100 page_list_add_tail(&pg[i], &d->page_list);
1101 }
1103 spin_unlock(&d->page_alloc_lock);
1104 return 0;
1106 fail:
1107 spin_unlock(&d->page_alloc_lock);
1108 return -1;
1109 }
1112 struct page_info *alloc_domheap_pages(
1113 struct domain *d, unsigned int order, unsigned int memflags)
1114 {
1115 struct page_info *pg = NULL;
1116 unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
1117 unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1), dma_zone;
1119 ASSERT(!in_irq());
1121 if ( (node == NUMA_NO_NODE) && (d != NULL) )
1122 node = domain_to_node(d);
1124 bits = domain_clamp_alloc_bitsize(d, bits ? : (BITS_PER_LONG+PAGE_SHIFT));
1125 if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
1126 return NULL;
1128 if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
1129 pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order, memflags);
1131 if ( (pg == NULL) &&
1132 ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi,
1133 node, order, memflags)) == NULL) )
1134 return NULL;
1136 if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
1137 {
1138 free_heap_pages(pg, order);
1139 return NULL;
1140 }
1142 return pg;
1143 }
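/*
 * Usage sketch (illustrative; MEMF_bits() is assumed to be the conventional
 * wrapper for the _MEMF_bits field decoded above):
 *
 *     struct page_info *pg = alloc_domheap_pages(d, 0, MEMF_bits(32));
 *     if ( pg == NULL )
 *         return -ENOMEM;
 *
 * This requests a single page below 4GiB, already assigned to d, so a later
 * free_domheap_pages(pg, 0) removes it from d->page_list again.
 */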
1145 void free_domheap_pages(struct page_info *pg, unsigned int order)
1146 {
1147 int i, drop_dom_ref;
1148 struct domain *d = page_get_owner(pg);
1150 ASSERT(!in_irq());
1152 if ( unlikely(is_xen_heap_page(pg)) )
1153 {
1154 /* NB. May recursively lock from relinquish_memory(). */
1155 spin_lock_recursive(&d->page_alloc_lock);
1157 for ( i = 0; i < (1 << order); i++ )
1158 page_list_del2(&pg[i], &d->xenpage_list, &d->arch.relmem_list);
1160 d->xenheap_pages -= 1 << order;
1161 drop_dom_ref = (d->xenheap_pages == 0);
1163 spin_unlock_recursive(&d->page_alloc_lock);
1164 }
1165 else if ( likely(d != NULL) && likely(d != dom_cow) )
1166 {
1167 /* NB. May recursively lock from relinquish_memory(). */
1168 spin_lock_recursive(&d->page_alloc_lock);
1170 for ( i = 0; i < (1 << order); i++ )
1171 {
1172 BUG_ON((pg[i].u.inuse.type_info & PGT_count_mask) != 0);
1173 page_list_del2(&pg[i], &d->page_list, &d->arch.relmem_list);
1174 }
1176 d->tot_pages -= 1 << order;
1177 drop_dom_ref = (d->tot_pages == 0);
1179 spin_unlock_recursive(&d->page_alloc_lock);
1181 /*
1182 * Normally we expect a domain to clear pages before freeing them, if
1183 * it cares about the secrecy of their contents. However, after a
1184 * domain has died we assume responsibility for erasure.
1185 */
1186 if ( unlikely(d->is_dying) )
1187 for ( i = 0; i < (1 << order); i++ )
1188 scrub_one_page(&pg[i]);
1190 free_heap_pages(pg, order);
1191 }
1192 else if ( unlikely(d == dom_cow) )
1193 {
1194 ASSERT(order == 0);
1195 scrub_one_page(pg);
1196 free_heap_pages(pg, 0);
1197 drop_dom_ref = 0;
1198 }
1199 else
1200 {
1201 /* Freeing anonymous domain-heap pages. */
1202 free_heap_pages(pg, order);
1203 drop_dom_ref = 0;
1204 }
1206 if ( drop_dom_ref )
1207 put_domain(d);
1208 }
1210 unsigned long avail_domheap_pages_region(
1211 unsigned int node, unsigned int min_width, unsigned int max_width)
1212 {
1213 int zone_lo, zone_hi;
1215 zone_lo = min_width ? bits_to_zone(min_width) : (MEMZONE_XEN + 1);
1216 zone_lo = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_lo));
1218 zone_hi = max_width ? bits_to_zone(max_width) : (NR_ZONES - 1);
1219 zone_hi = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_hi));
1221 return avail_heap_pages(zone_lo, zone_hi, node);
1222 }
1224 unsigned long avail_domheap_pages(void)
1225 {
1226 return avail_heap_pages(MEMZONE_XEN + 1,
1227 NR_ZONES - 1,
1228 -1);
1229 }
1231 static void pagealloc_info(unsigned char key)
1232 {
1233 unsigned int zone = MEMZONE_XEN;
1234 unsigned long n, total = 0;
1236 printk("Physical memory information:\n");
1237 printk(" Xen heap: %lukB free\n",
1238 avail_heap_pages(zone, zone, -1) << (PAGE_SHIFT-10));
1240 while ( ++zone < NR_ZONES )
1241 {
1242 if ( (zone + PAGE_SHIFT) == dma_bitsize )
1243 {
1244 printk(" DMA heap: %lukB free\n", total << (PAGE_SHIFT-10));
1245 total = 0;
1246 }
1248 if ( (n = avail_heap_pages(zone, zone, -1)) != 0 )
1249 {
1250 total += n;
1251 printk(" heap[%02u]: %lukB free\n", zone, n << (PAGE_SHIFT-10));
1252 }
1253 }
1255 printk(" Dom heap: %lukB free\n", total << (PAGE_SHIFT-10));
1256 }
1258 static struct keyhandler pagealloc_info_keyhandler = {
1259 .diagnostic = 1,
1260 .u.fn = pagealloc_info,
1261 .desc = "memory info"
1262 };
1264 static __init int pagealloc_keyhandler_init(void)
1265 {
1266 register_keyhandler('m', &pagealloc_info_keyhandler);
1267 return 0;
1268 }
1269 __initcall(pagealloc_keyhandler_init);
1272 void scrub_one_page(struct page_info *pg)
1273 {
1274 void *p = __map_domain_page(pg);
1276 if ( unlikely(pg->count_info & PGC_broken) )
1277 return;
1279 #ifndef NDEBUG
1280 /* Avoid callers relying on allocations returning zeroed pages. */
1281 memset(p, 0xc2, PAGE_SIZE);
1282 #else
1283 /* For a production build, clear_page() is the fastest way to scrub. */
1284 clear_page(p);
1285 #endif
1287 unmap_domain_page(p);
1288 }
1290 static void dump_heap(unsigned char key)
1291 {
1292 s_time_t now = NOW();
1293 int i, j;
1295 printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key,
1296 (u32)(now>>32), (u32)now);
1298 for ( i = 0; i < MAX_NUMNODES; i++ )
1299 {
1300 if ( !avail[i] )
1301 continue;
1302 for ( j = 0; j < NR_ZONES; j++ )
1303 printk("heap[node=%d][zone=%d] -> %lu pages\n",
1304 i, j, avail[i][j]);
1305 }
1306 }
1308 static struct keyhandler dump_heap_keyhandler = {
1309 .diagnostic = 1,
1310 .u.fn = dump_heap,
1311 .desc = "dump heap info"
1312 };
1314 static __init int register_heap_trigger(void)
1315 {
1316 register_keyhandler('H', &dump_heap_keyhandler);
1317 return 0;
1318 }
1319 __initcall(register_heap_trigger);
1321 /*
1322 * Local variables:
1323 * mode: C
1324 * c-set-style: "BSD"
1325 * c-basic-offset: 4
1326 * tab-width: 4
1327 * indent-tabs-mode: nil
1328 * End:
1329 */