
view xen/common/page_alloc.c @ 20931:39424ff0c91c

tboot: fix S3 issue for Intel Trusted Execution Technology.

Those unmapped pages cause a page fault when they are MACed, which in
turn causes the S3 failure.

Signed-off-by: Shane Wang <shane.wang@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Feb 03 09:44:12 2010 +0000 (2010-02-03)
parents a60f508548a8
children 391cb20b6ea9
line source
1 /******************************************************************************
2 * page_alloc.c
3 *
4 * Simple buddy heap allocator for Xen.
5 *
6 * Copyright (c) 2002-2004 K A Fraser
7 * Copyright (c) 2006 IBM Ryan Harper <ryanh@us.ibm.com>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/init.h>
26 #include <xen/types.h>
27 #include <xen/lib.h>
28 #include <xen/sched.h>
29 #include <xen/spinlock.h>
30 #include <xen/mm.h>
31 #include <xen/irq.h>
32 #include <xen/softirq.h>
33 #include <xen/domain_page.h>
34 #include <xen/keyhandler.h>
35 #include <xen/perfc.h>
36 #include <xen/numa.h>
37 #include <xen/nodemask.h>
38 #include <xen/tmem.h>
39 #include <public/sysctl.h>
40 #include <asm/page.h>
41 #include <asm/numa.h>
42 #include <asm/flushtlb.h>
44 /*
45 * Comma-separated list of hexadecimal page numbers containing bad bytes.
46 * e.g. 'badpage=0x3f45,0x8a321'.
47 */
48 static char __initdata opt_badpage[100] = "";
49 string_param("badpage", opt_badpage);
51 /*
52 * no-bootscrub -> Free pages are not zeroed during boot.
53 */
54 static int opt_bootscrub __initdata = 1;
55 boolean_param("bootscrub", opt_bootscrub);
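/* Editor's illustration (not part of this changeset): passing "no-bootscrub"
 * on the Xen command line clears opt_bootscrub, so scrub_heap_pages() later
 * in this file returns immediately and free RAM is left unscrubbed at boot. */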
57 /*
58 * Bit width of the DMA heap -- used to override the NUMA-node-first
59 * allocation strategy, which can otherwise exhaust low memory.
60 */
61 static unsigned int dma_bitsize;
62 integer_param("dma_bits", dma_bitsize);
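/* Editor's illustration (not part of this changeset): booting with e.g.
 * "dma_bits=30" treats every zone holding memory below 2^30 bytes as DMA
 * heap; alloc_domheap_pages() below skips those zones at first and only
 * falls back to them once the higher zones are exhausted. */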
64 #define round_pgdown(_p) ((_p)&PAGE_MASK)
65 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
67 /* Offlined page list, protected by heap_lock. */
68 PAGE_LIST_HEAD(page_offlined_list);
69 /* Broken page list, protected by heap_lock. */
70 PAGE_LIST_HEAD(page_broken_list);
72 /*************************
73 * BOOT-TIME ALLOCATOR
74 */
76 static unsigned long __initdata first_valid_mfn = ~0UL;
78 static struct bootmem_region {
79 unsigned long s, e; /* MFNs @s through @e-1 inclusive are free */
80 } *__initdata bootmem_region_list;
81 static unsigned int __initdata nr_bootmem_regions;
83 static void __init boot_bug(int line)
84 {
85 panic("Boot BUG at %s:%d\n", __FILE__, line);
86 }
87 #define BOOT_BUG_ON(p) if ( p ) boot_bug(__LINE__);
89 static void __init bootmem_region_add(unsigned long s, unsigned long e)
90 {
91 unsigned int i;
93 if ( (bootmem_region_list == NULL) && (s < e) )
94 bootmem_region_list = mfn_to_virt(s++);
96 if ( s >= e )
97 return;
99 for ( i = 0; i < nr_bootmem_regions; i++ )
100 if ( s < bootmem_region_list[i].e )
101 break;
103 BOOT_BUG_ON((i < nr_bootmem_regions) && (e > bootmem_region_list[i].s));
104 BOOT_BUG_ON(nr_bootmem_regions ==
105 (PAGE_SIZE / sizeof(struct bootmem_region)));
107 memmove(&bootmem_region_list[i+1], &bootmem_region_list[i],
108 (nr_bootmem_regions - i) * sizeof(*bootmem_region_list));
109 bootmem_region_list[i] = (struct bootmem_region) { s, e };
110 nr_bootmem_regions++;
111 }
113 static void __init bootmem_region_zap(unsigned long s, unsigned long e)
114 {
115 unsigned int i;
117 for ( i = 0; i < nr_bootmem_regions; i++ )
118 {
119 struct bootmem_region *r = &bootmem_region_list[i];
120 if ( e <= r->s )
121 break;
122 if ( s >= r->e )
123 continue;
124 if ( s <= r->s )
125 {
126 r->s = min(e, r->e);
127 }
128 else if ( e >= r->e )
129 {
130 r->e = s;
131 }
132 else
133 {
134 unsigned long _e = r->e;
135 r->e = s;
136 bootmem_region_add(e, _e);
137 }
138 }
139 }
141 void __init init_boot_pages(paddr_t ps, paddr_t pe)
142 {
143 unsigned long bad_spfn, bad_epfn;
144 const char *p;
146 ps = round_pgup(ps);
147 pe = round_pgdown(pe);
148 if ( pe <= ps )
149 return;
151 first_valid_mfn = min_t(unsigned long, ps >> PAGE_SHIFT, first_valid_mfn);
153 bootmem_region_add(ps >> PAGE_SHIFT, pe >> PAGE_SHIFT);
155 /* Check new pages against the bad-page list. */
156 p = opt_badpage;
157 while ( *p != '\0' )
158 {
159 bad_spfn = simple_strtoul(p, &p, 0);
160 bad_epfn = bad_spfn;
162 if ( *p == '-' )
163 {
164 p++;
165 bad_epfn = simple_strtoul(p, &p, 0);
166 if ( bad_epfn < bad_spfn )
167 bad_epfn = bad_spfn;
168 }
170 if ( *p == ',' )
171 p++;
172 else if ( *p != '\0' )
173 break;
175 if ( bad_epfn == bad_spfn )
176 printk("Marking page %lx as bad\n", bad_spfn);
177 else
178 printk("Marking pages %lx through %lx as bad\n",
179 bad_spfn, bad_epfn);
181 bootmem_region_zap(bad_spfn, bad_epfn+1);
182 }
183 }
185 unsigned long __init alloc_boot_pages(
186 unsigned long nr_pfns, unsigned long pfn_align)
187 {
188 unsigned long pg, _e;
189 int i;
191 for ( i = nr_bootmem_regions - 1; i >= 0; i-- )
192 {
193 struct bootmem_region *r = &bootmem_region_list[i];
194 pg = (r->e - nr_pfns) & ~(pfn_align - 1);
195 if ( pg < r->s )
196 continue;
197 _e = r->e;
198 r->e = pg;
199 bootmem_region_add(pg + nr_pfns, _e);
200 return pg;
201 }
203 BOOT_BUG_ON(1);
204 return 0;
205 }
209 /*************************
210 * BINARY BUDDY ALLOCATOR
211 */
213 #define MEMZONE_XEN 0
214 #define NR_ZONES (PADDR_BITS - PAGE_SHIFT)
216 #define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 0 : ((b) - PAGE_SHIFT - 1))
217 #define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN : \
218 (fls(page_to_mfn(pg)) - 1))
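/* Editor's illustration (not part of this changeset): zone z holds frames
 * whose MFN needs exactly z+1 bits, i.e. machine addresses below
 * 2^(z + PAGE_SHIFT + 1). With 4KiB pages, MFN 0x12345 has fls() == 17 and
 * therefore lands in zone 16, the zone for addresses below 2^29. */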
220 typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
221 static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
222 #define heap(node, zone, order) ((*_heap[node])[zone][order])
224 static unsigned long *avail[MAX_NUMNODES];
225 static long total_avail_pages;
227 static DEFINE_SPINLOCK(heap_lock);
229 static unsigned long init_node_heap(int node, unsigned long mfn,
230 unsigned long nr)
231 {
232 /* First node to be discovered has its heap metadata statically alloced. */
233 static heap_by_zone_and_order_t _heap_static;
234 static unsigned long avail_static[NR_ZONES];
235 static int first_node_initialised;
236 unsigned long needed = (sizeof(**_heap) +
237 sizeof(**avail) * NR_ZONES +
238 PAGE_SIZE - 1) >> PAGE_SHIFT;
239 int i, j;
241 if ( !first_node_initialised )
242 {
243 _heap[node] = &_heap_static;
244 avail[node] = avail_static;
245 first_node_initialised = 1;
246 needed = 0;
247 }
248 #ifdef DIRECTMAP_VIRT_END
249 else if ( nr >= needed &&
250 (mfn + needed) <= (virt_to_mfn(DIRECTMAP_VIRT_END - 1) + 1) )
251 {
252 _heap[node] = mfn_to_virt(mfn);
253 avail[node] = mfn_to_virt(mfn + needed - 1) +
254 PAGE_SIZE - sizeof(**avail) * NR_ZONES;
255 }
256 #endif
257 else if ( get_order_from_bytes(sizeof(**_heap)) ==
258 get_order_from_pages(needed) )
259 {
260 _heap[node] = alloc_xenheap_pages(get_order_from_pages(needed), 0);
261 BUG_ON(!_heap[node]);
262 avail[node] = (void *)_heap[node] + (needed << PAGE_SHIFT) -
263 sizeof(**avail) * NR_ZONES;
264 needed = 0;
265 }
266 else
267 {
268 _heap[node] = xmalloc(heap_by_zone_and_order_t);
269 avail[node] = xmalloc_array(unsigned long, NR_ZONES);
270 BUG_ON(!_heap[node] || !avail[node]);
271 needed = 0;
272 }
274 memset(avail[node], 0, NR_ZONES * sizeof(long));
276 for ( i = 0; i < NR_ZONES; i++ )
277 for ( j = 0; j <= MAX_ORDER; j++ )
278 INIT_PAGE_LIST_HEAD(&(*_heap[node])[i][j]);
280 return needed;
281 }
283 /* Allocate 2^@order contiguous pages. */
284 static struct page_info *alloc_heap_pages(
285 unsigned int zone_lo, unsigned int zone_hi,
286 unsigned int node, unsigned int order, unsigned int memflags)
287 {
288 unsigned int i, j, zone = 0;
289 unsigned int num_nodes = num_online_nodes();
290 unsigned long request = 1UL << order;
291 cpumask_t extra_cpus_mask, mask;
292 struct page_info *pg;
294 if ( node == NUMA_NO_NODE )
295 node = cpu_to_node(smp_processor_id());
297 ASSERT(node >= 0);
298 ASSERT(zone_lo <= zone_hi);
299 ASSERT(zone_hi < NR_ZONES);
301 if ( unlikely(order > MAX_ORDER) )
302 return NULL;
304 spin_lock(&heap_lock);
306 /*
307 * Start with the requested node, but exhaust all of its memory in the
308 * requested zones before failing. Only compute a new node value when the
309 * target node cannot satisfy the request, keeping the fast path cheap.
310 */
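/* Editor's illustration (not part of this changeset): for a request on node
 * n the loop below scans (n, zone_hi), (n, zone_hi-1), ..., (n, zone_lo)
 * before moving on to the same zones of the next online node, wrapping
 * around until every online node has been tried. */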
311 for ( i = 0; i < num_nodes; i++ )
312 {
313 zone = zone_hi;
314 do {
315 /* Check if target node can support the allocation. */
316 if ( !avail[node] || (avail[node][zone] < request) )
317 continue;
319 /* Find smallest order which can satisfy the request. */
320 for ( j = order; j <= MAX_ORDER; j++ )
321 if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
322 goto found;
323 } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
325 /* Pick next node, wrapping around if needed. */
326 node = next_node(node, node_online_map);
327 if (node == MAX_NUMNODES)
328 node = first_node(node_online_map);
329 }
331 /* Try to free memory from tmem */
332 if ( (pg = tmem_relinquish_pages(order,memflags)) != NULL )
333 {
334 /* reassigning an already allocated anonymous heap page */
335 spin_unlock(&heap_lock);
336 return pg;
337 }
339 /* No suitable memory blocks. Fail the request. */
340 spin_unlock(&heap_lock);
341 return NULL;
343 found:
344 /* We may have to halve the chunk a number of times. */
345 while ( j != order )
346 {
347 PFN_ORDER(pg) = --j;
348 page_list_add_tail(pg, &heap(node, zone, j));
349 pg += 1 << j;
350 }
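/* Editor's illustration (not part of this changeset): an order-0 request
 * served from an order-3 block puts the first four pages back as an order-2
 * chunk, then two pages as order-1, then one page as order-0, and hands the
 * caller the one page that remains. */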
352 ASSERT(avail[node][zone] >= request);
353 avail[node][zone] -= request;
354 total_avail_pages -= request;
355 ASSERT(total_avail_pages >= 0);
357 spin_unlock(&heap_lock);
359 cpus_clear(mask);
361 for ( i = 0; i < (1 << order); i++ )
362 {
363 /* Reference count must continuously be zero for free pages. */
364 BUG_ON(pg[i].count_info != PGC_state_free);
365 pg[i].count_info = PGC_state_inuse;
367 if ( pg[i].u.free.need_tlbflush )
368 {
369 /* Add in extra CPUs that need flushing because of this page. */
370 cpus_andnot(extra_cpus_mask, cpu_online_map, mask);
371 tlbflush_filter(extra_cpus_mask, pg[i].tlbflush_timestamp);
372 cpus_or(mask, mask, extra_cpus_mask);
373 }
375 /* Initialise fields which have other uses for free pages. */
376 pg[i].u.inuse.type_info = 0;
377 page_set_owner(&pg[i], NULL);
378 }
380 if ( unlikely(!cpus_empty(mask)) )
381 {
382 perfc_incr(need_flush_tlb_flush);
383 flush_tlb_mask(&mask);
384 }
386 return pg;
387 }
389 /* Remove any offlined page in the buddy pointed to by head. */
390 static int reserve_offlined_page(struct page_info *head)
391 {
392 unsigned int node = phys_to_nid(page_to_maddr(head));
393 int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
394 struct page_info *cur_head;
395 int cur_order;
397 ASSERT(spin_is_locked(&heap_lock));
399 cur_head = head;
401 page_list_del(head, &heap(node, zone, head_order));
403 while ( cur_head < (head + (1 << head_order)) )
404 {
405 struct page_info *pg;
406 int next_order;
408 if ( page_state_is(cur_head, offlined) )
409 {
410 cur_head++;
411 continue;
412 }
414 next_order = cur_order = 0;
416 while ( cur_order < head_order )
417 {
418 next_order = cur_order + 1;
420 if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)) )
421 goto merge;
423 for ( i = (1 << cur_order), pg = cur_head + (1 << cur_order );
424 i < (1 << next_order);
425 i++, pg++ )
426 if ( page_state_is(pg, offlined) )
427 break;
428 if ( i == ( 1 << next_order) )
429 {
430 cur_order = next_order;
431 continue;
432 }
433 else
434 {
435 merge:
436 /* We don't consider merging outside the head_order. */
437 page_list_add_tail(cur_head, &heap(node, zone, cur_order));
438 PFN_ORDER(cur_head) = cur_order;
439 cur_head += (1 << cur_order);
440 break;
441 }
442 }
443 }
445 for ( cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++ )
446 {
447 if ( !page_state_is(cur_head, offlined) )
448 continue;
450 avail[node][zone]--;
451 total_avail_pages--;
452 ASSERT(total_avail_pages >= 0);
454 page_list_add_tail(cur_head,
455 test_bit(_PGC_broken, &cur_head->count_info) ?
456 &page_broken_list : &page_offlined_list);
458 count++;
459 }
461 return count;
462 }
464 /* Free 2^@order set of pages. */
465 static void free_heap_pages(
466 struct page_info *pg, unsigned int order)
467 {
468 unsigned long mask;
469 unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
470 unsigned int zone = page_to_zone(pg);
472 ASSERT(order <= MAX_ORDER);
473 ASSERT(node >= 0);
475 for ( i = 0; i < (1 << order); i++ )
476 {
477 /*
478 * Cannot assume that count_info == 0, as there are some corner cases
479 * where it isn't the case and yet it isn't a bug:
480 * 1. page_get_owner() is NULL
481 * 2. page_get_owner() is a domain that was never accessible by
482 * its domid (e.g., failed to fully construct the domain).
483 * 3. page was never addressable by the guest (e.g., it's an
484 * auto-translate-physmap guest and the page was never included
485 * in its pseudophysical address space).
486 * In all the above cases there can be no guest mappings of this page.
487 */
488 ASSERT(!page_state_is(&pg[i], offlined));
489 pg[i].count_info =
490 ((pg[i].count_info & PGC_broken) |
491 (page_state_is(&pg[i], offlining)
492 ? PGC_state_offlined : PGC_state_free));
493 if ( page_state_is(&pg[i], offlined) )
494 tainted = 1;
496 /* If a page has no owner it will need no safety TLB flush. */
497 pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
498 if ( pg[i].u.free.need_tlbflush )
499 pg[i].tlbflush_timestamp = tlbflush_current_time();
500 }
502 spin_lock(&heap_lock);
504 avail[node][zone] += 1 << order;
505 total_avail_pages += 1 << order;
507 /* Merge chunks as far as possible. */
508 while ( order < MAX_ORDER )
509 {
510 mask = 1UL << order;
512 if ( (page_to_mfn(pg) & mask) )
513 {
514 /* Merge with predecessor block? */
515 if ( !mfn_valid(page_to_mfn(pg-mask)) ||
516 !page_state_is(pg-mask, free) ||
517 (PFN_ORDER(pg-mask) != order) )
518 break;
519 pg -= mask;
520 page_list_del(pg, &heap(node, zone, order));
521 }
522 else
523 {
524 /* Merge with successor block? */
525 if ( !mfn_valid(page_to_mfn(pg+mask)) ||
526 !page_state_is(pg+mask, free) ||
527 (PFN_ORDER(pg+mask) != order) )
528 break;
529 page_list_del(pg + mask, &heap(node, zone, order));
530 }
532 order++;
534 /* After merging, pg should remain in the same node. */
535 ASSERT(phys_to_nid(page_to_maddr(pg)) == node);
536 }
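/* Editor's illustration (not part of this changeset): freeing the order-0
 * page at MFN 0x105 first considers its buddy 0x104 (bit 0 of the MFN is
 * set, so the predecessor); if that page is free at order 0 the pair forms
 * an order-1 block at 0x104, whose own buddy 0x106 is examined next, and so
 * on until a buddy is missing, in use, or of a different order. */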
538 PFN_ORDER(pg) = order;
539 page_list_add_tail(pg, &heap(node, zone, order));
541 if ( tainted )
542 reserve_offlined_page(pg);
544 spin_unlock(&heap_lock);
545 }
548 /*
549 * Possible states for a page:
550 * free and online; free and offlined; free and offlined and broken;
551 * assigned and online; assigned and offlining; assigned and offlining and broken.
552 *
553 * Rules applied when offlining a page:
554 * once a page is broken it can never be assigned again;
555 * a page reaches the offlined state only once it is free.
556 * Returns the original count_info.
557 */
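/* Editor's note (not part of this changeset): in PGC_state terms, a page
 * that is still in use moves to offlining here and only becomes offlined
 * when it is eventually freed (see free_heap_pages() above), whereas a page
 * that is already free is marked offlined straight away. */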
558 static unsigned long mark_page_offline(struct page_info *pg, int broken)
559 {
560 unsigned long nx, x, y = pg->count_info;
562 ASSERT(page_is_ram_type(page_to_mfn(pg), RAM_TYPE_CONVENTIONAL));
563 ASSERT(spin_is_locked(&heap_lock));
565 do {
566 nx = x = y;
568 if ( ((x & PGC_state) != PGC_state_offlined) &&
569 ((x & PGC_state) != PGC_state_offlining) )
570 {
571 nx &= ~PGC_state;
572 nx |= (((x & PGC_state) == PGC_state_free)
573 ? PGC_state_offlined : PGC_state_offlining);
574 }
576 if ( broken )
577 nx |= PGC_broken;
579 if ( x == nx )
580 break;
581 } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
583 return y;
584 }
586 static int reserve_heap_page(struct page_info *pg)
587 {
588 struct page_info *head = NULL;
589 unsigned int i, node = phys_to_nid(page_to_maddr(pg));
590 unsigned int zone = page_to_zone(pg);
592 for ( i = 0; i <= MAX_ORDER; i++ )
593 {
594 struct page_info *tmp;
596 if ( page_list_empty(&heap(node, zone, i)) )
597 continue;
599 page_list_for_each_safe ( head, tmp, &heap(node, zone, i) )
600 {
601 if ( (head <= pg) &&
602 (head + (1UL << i) > pg) )
603 return reserve_offlined_page(head);
604 }
605 }
607 return -EINVAL;
609 }
611 int offline_page(unsigned long mfn, int broken, uint32_t *status)
612 {
613 unsigned long old_info = 0;
614 struct domain *owner;
615 int ret = 0;
616 struct page_info *pg;
618 if ( !mfn_valid(mfn) )
619 {
620 dprintk(XENLOG_WARNING,
621 "try to offline page out of range %lx\n", mfn);
622 return -EINVAL;
623 }
625 *status = 0;
626 pg = mfn_to_page(mfn);
628 if ( is_xen_fixed_mfn(mfn) )
629 {
630 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED |
631 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
632 return -EPERM;
633 }
635 /*
636 * N.B. Xen's TXT region on x86_64 is marked reserved and handled already.
637 * The kexec range is reserved as well.
638 */
639 if ( !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
640 {
641 *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM;
642 return -EINVAL;
643 }
645 spin_lock(&heap_lock);
647 old_info = mark_page_offline(pg, broken);
649 if ( page_state_is(pg, free) )
650 {
651 /* Free pages are reserved directly. */
652 reserve_heap_page(pg);
653 *status = PG_OFFLINE_OFFLINED;
654 }
655 else if ( page_state_is(pg, offlined) )
656 {
657 *status = PG_OFFLINE_OFFLINED;
658 }
659 else if ( (owner = page_get_owner_and_reference(pg)) )
660 {
661 *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING |
662 (owner->domain_id << PG_OFFLINE_OWNER_SHIFT);
663 /* Release the reference since it will not be allocated anymore */
664 put_page(pg);
665 }
666 else if ( old_info & PGC_xen_heap )
667 {
668 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING |
669 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
670 }
671 else
672 {
673 /*
674 * assign_pages() does not hold heap_lock, so there is a small window in
675 * which the owner may still become set. Note that the owner only changes
676 * from NULL to non-NULL, never the reverse, since the page is offlining.
677 * There is no such window when called from the #MC handler, as all CPUs
678 * are in softirq context. When called from user space (e.g. CE handling),
679 * the tools can simply wait a while and call again.
680 */
681 *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED |
682 (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT );
683 }
685 if ( broken )
686 *status |= PG_OFFLINE_BROKEN;
688 spin_unlock(&heap_lock);
690 return ret;
691 }
693 /*
694 * Online the memory.
695 * The caller should make sure end_pfn <= max_page; if not,
696 * expand_pages() should be called prior to online_page().
697 */
698 unsigned int online_page(unsigned long mfn, uint32_t *status)
699 {
700 unsigned long x, nx, y;
701 struct page_info *pg;
702 int ret;
704 if ( !mfn_valid(mfn) )
705 {
706 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
707 return -EINVAL;
708 }
710 pg = mfn_to_page(mfn);
712 spin_lock(&heap_lock);
714 y = pg->count_info;
715 do {
716 ret = *status = 0;
718 if ( y & PGC_broken )
719 {
720 ret = -EINVAL;
721 *status = PG_ONLINE_FAILED |PG_ONLINE_BROKEN;
722 break;
723 }
725 if ( (y & PGC_state) == PGC_state_offlined )
726 {
727 page_list_del(pg, &page_offlined_list);
728 *status = PG_ONLINE_ONLINED;
729 }
730 else if ( (y & PGC_state) == PGC_state_offlining )
731 {
732 *status = PG_ONLINE_ONLINED;
733 }
734 else
735 {
736 break;
737 }
739 x = y;
740 nx = (x & ~PGC_state) | PGC_state_inuse;
741 } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
743 spin_unlock(&heap_lock);
745 if ( (y & PGC_state) == PGC_state_offlined )
746 free_heap_pages(pg, 0);
748 return ret;
749 }
751 int query_page_offline(unsigned long mfn, uint32_t *status)
752 {
753 struct page_info *pg;
755 if ( !mfn_valid(mfn) || !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
756 {
757 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
758 return -EINVAL;
759 }
761 *status = 0;
762 spin_lock(&heap_lock);
764 pg = mfn_to_page(mfn);
766 if ( page_state_is(pg, offlining) )
767 *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING;
768 if ( pg->count_info & PGC_broken )
769 *status |= PG_OFFLINE_STATUS_BROKEN;
770 if ( page_state_is(pg, offlined) )
771 *status |= PG_OFFLINE_STATUS_OFFLINED;
773 spin_unlock(&heap_lock);
775 return 0;
776 }
778 /*
779 * Hand the specified arbitrary page range to the specified heap zone,
780 * checking the node id against that of the previous page. If they differ
781 * and the current page is not on a MAX_ORDER boundary, then we reserve
782 * the page by not freeing it to the buddy allocator.
783 */
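/* Editor's illustration (not part of this changeset): if node 1 starts at
 * an MFN that is not a multiple of (1UL << MAX_ORDER), that first page is
 * withheld from the heap, which keeps buddy merging in free_heap_pages()
 * from ever producing a block that straddles the two nodes. */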
784 static void init_heap_pages(
785 struct page_info *pg, unsigned long nr_pages)
786 {
787 unsigned int nid_curr, nid_prev;
788 unsigned long i;
790 nid_prev = phys_to_nid(page_to_maddr(pg-1));
792 for ( i = 0; i < nr_pages; nid_prev = nid_curr, i++ )
793 {
794 nid_curr = phys_to_nid(page_to_maddr(pg+i));
796 if ( unlikely(!avail[nid_curr]) )
797 {
798 unsigned long n;
800 n = init_node_heap(nid_curr, page_to_mfn(pg+i), nr_pages - i);
801 if ( n )
802 {
803 BUG_ON(i + n > nr_pages);
804 i += n - 1;
805 continue;
806 }
807 }
809 /*
810 * Free pages of the same node, or if they differ, but are on a
811 * MAX_ORDER alignment boundary (which already get reserved).
812 */
813 if ( (nid_curr == nid_prev) ||
814 !(page_to_mfn(pg+i) & ((1UL << MAX_ORDER) - 1)) )
815 free_heap_pages(pg+i, 0);
816 else
817 printk("Reserving non-aligned node boundary @ mfn %#lx\n",
818 page_to_mfn(pg+i));
819 }
820 }
822 static unsigned long avail_heap_pages(
823 unsigned int zone_lo, unsigned int zone_hi, unsigned int node)
824 {
825 unsigned int i, zone;
826 unsigned long free_pages = 0;
828 if ( zone_hi >= NR_ZONES )
829 zone_hi = NR_ZONES - 1;
831 for_each_online_node(i)
832 {
833 if ( !avail[i] )
834 continue;
835 for ( zone = zone_lo; zone <= zone_hi; zone++ )
836 if ( (node == -1) || (node == i) )
837 free_pages += avail[i][zone];
838 }
840 return free_pages;
841 }
843 unsigned long total_free_pages(void)
844 {
845 return total_avail_pages;
846 }
848 void __init end_boot_allocator(void)
849 {
850 unsigned int i;
852 /* Pages that are free now go to the domain sub-allocator. */
853 for ( i = 0; i < nr_bootmem_regions; i++ )
854 {
855 struct bootmem_region *r = &bootmem_region_list[i];
856 if ( r->s < r->e )
857 init_heap_pages(mfn_to_page(r->s), r->e - r->s);
858 }
859 init_heap_pages(virt_to_page(bootmem_region_list), 1);
861 if ( !dma_bitsize && (num_online_nodes() > 1) )
862 {
863 #ifdef CONFIG_X86
864 dma_bitsize = min_t(unsigned int,
865 fls(NODE_DATA(0)->node_spanned_pages) - 1
866 + PAGE_SHIFT - 2,
867 32);
868 #else
869 dma_bitsize = 32;
870 #endif
871 }
873 printk("Domain heap initialised");
874 if ( dma_bitsize )
875 printk(" DMA width %u bits", dma_bitsize);
876 printk("\n");
877 }
879 /*
880 * Scrub all unallocated pages in all heap zones. This function is more
881 * convoluted than appears necessary because we do not want to continuously
882 * hold the lock while scrubbing very large memory areas.
883 */
884 void __init scrub_heap_pages(void)
885 {
886 unsigned long mfn;
887 struct page_info *pg;
889 if ( !opt_bootscrub )
890 return;
892 printk("Scrubbing Free RAM: ");
894 for ( mfn = first_valid_mfn; mfn < max_page; mfn++ )
895 {
896 process_pending_softirqs();
898 pg = mfn_to_page(mfn);
900 /* Quick lock-free check. */
901 if ( !mfn_valid(mfn) || !page_state_is(pg, free) )
902 continue;
904 /* Every 100MB, print a progress dot. */
905 if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
906 printk(".");
908 spin_lock(&heap_lock);
910 /* Re-check page status with lock held. */
911 if ( page_state_is(pg, free) )
912 scrub_one_page(pg);
914 spin_unlock(&heap_lock);
915 }
917 printk("done.\n");
918 }
922 /*************************
923 * XEN-HEAP SUB-ALLOCATOR
924 */
926 #if !defined(__x86_64__) && !defined(__ia64__)
928 void init_xenheap_pages(paddr_t ps, paddr_t pe)
929 {
930 ps = round_pgup(ps);
931 pe = round_pgdown(pe);
932 if ( pe <= ps )
933 return;
935 /*
936 * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to
937 * prevent merging of power-of-two blocks across the zone boundary.
938 */
939 if ( ps && !is_xen_heap_mfn(paddr_to_pfn(ps)-1) )
940 ps += PAGE_SIZE;
941 if ( !is_xen_heap_mfn(paddr_to_pfn(pe)) )
942 pe -= PAGE_SIZE;
944 memguard_guard_range(maddr_to_virt(ps), pe - ps);
946 init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT);
947 }
950 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
951 {
952 struct page_info *pg;
954 ASSERT(!in_irq());
956 pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
957 cpu_to_node(smp_processor_id()), order, memflags);
958 if ( unlikely(pg == NULL) )
959 return NULL;
961 memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));
963 return page_to_virt(pg);
964 }
967 void free_xenheap_pages(void *v, unsigned int order)
968 {
969 ASSERT(!in_irq());
971 if ( v == NULL )
972 return;
974 memguard_guard_range(v, 1 << (order + PAGE_SHIFT));
976 free_heap_pages(virt_to_page(v), order);
977 }
979 #else
981 void init_xenheap_pages(paddr_t ps, paddr_t pe)
982 {
983 init_domheap_pages(ps, pe);
984 }
986 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
987 {
988 struct page_info *pg;
989 unsigned int i;
991 ASSERT(!in_irq());
993 pg = alloc_domheap_pages(NULL, order, memflags);
994 if ( unlikely(pg == NULL) )
995 return NULL;
997 for ( i = 0; i < (1u << order); i++ )
998 pg[i].count_info |= PGC_xen_heap;
1000 return page_to_virt(pg);
1001 }
1003 void free_xenheap_pages(void *v, unsigned int order)
1004 {
1005 struct page_info *pg;
1006 unsigned int i;
1008 ASSERT(!in_irq());
1010 if ( v == NULL )
1011 return;
1013 pg = virt_to_page(v);
1015 for ( i = 0; i < (1u << order); i++ )
1016 pg[i].count_info &= ~PGC_xen_heap;
1018 free_heap_pages(pg, order);
1019 }
1021 #endif
1025 /*************************
1026 * DOMAIN-HEAP SUB-ALLOCATOR
1027 */
1029 void init_domheap_pages(paddr_t ps, paddr_t pe)
1030 {
1031 unsigned long smfn, emfn;
1033 ASSERT(!in_irq());
1035 smfn = round_pgup(ps) >> PAGE_SHIFT;
1036 emfn = round_pgdown(pe) >> PAGE_SHIFT;
1038 init_heap_pages(mfn_to_page(smfn), emfn - smfn);
1039 }
1042 int assign_pages(
1043 struct domain *d,
1044 struct page_info *pg,
1045 unsigned int order,
1046 unsigned int memflags)
1047 {
1048 unsigned long i;
1050 spin_lock(&d->page_alloc_lock);
1052 if ( unlikely(d->is_dying) )
1053 {
1054 gdprintk(XENLOG_INFO, "Cannot assign page to domain%d -- dying.\n",
1055 d->domain_id);
1056 goto fail;
1057 }
1059 if ( !(memflags & MEMF_no_refcount) )
1060 {
1061 if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
1062 {
1063 if ( !opt_tmem || order != 0 || d->tot_pages != d->max_pages )
1064 gdprintk(XENLOG_INFO, "Over-allocation for domain %u: "
1065 "%u > %u\n", d->domain_id,
1066 d->tot_pages + (1 << order), d->max_pages);
1067 goto fail;
1068 }
1070 if ( unlikely(d->tot_pages == 0) )
1071 get_knownalive_domain(d);
1073 d->tot_pages += 1 << order;
1074 }
1076 for ( i = 0; i < (1 << order); i++ )
1077 {
1078 ASSERT(page_get_owner(&pg[i]) == NULL);
1079 ASSERT((pg[i].count_info & ~(PGC_allocated | 1)) == 0);
1080 page_set_owner(&pg[i], d);
1081 wmb(); /* Domain pointer must be visible before updating refcnt. */
1082 pg[i].count_info = PGC_allocated | 1;
1083 page_list_add_tail(&pg[i], &d->page_list);
1084 }
1086 spin_unlock(&d->page_alloc_lock);
1087 return 0;
1089 fail:
1090 spin_unlock(&d->page_alloc_lock);
1091 return -1;
1092 }
1095 struct page_info *alloc_domheap_pages(
1096 struct domain *d, unsigned int order, unsigned int memflags)
1097 {
1098 struct page_info *pg = NULL;
1099 unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
1100 unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1), dma_zone;
1102 ASSERT(!in_irq());
1104 if ( (node == NUMA_NO_NODE) && (d != NULL) )
1105 node = domain_to_node(d);
1107 bits = domain_clamp_alloc_bitsize(d, bits ? : (BITS_PER_LONG+PAGE_SHIFT));
1108 if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
1109 return NULL;
1111 if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
1112 pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order, memflags);
1114 if ( (pg == NULL) &&
1115 ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi,
1116 node, order, memflags)) == NULL) )
1117 return NULL;
1119 if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
1120 {
1121 free_heap_pages(pg, order);
1122 return NULL;
1123 }
1125 return pg;
1126 }
1128 void free_domheap_pages(struct page_info *pg, unsigned int order)
1129 {
1130 int i, drop_dom_ref;
1131 struct domain *d = page_get_owner(pg);
1133 ASSERT(!in_irq());
1135 if ( unlikely(is_xen_heap_page(pg)) )
1136 {
1137 /* NB. May recursively lock from relinquish_memory(). */
1138 spin_lock_recursive(&d->page_alloc_lock);
1140 for ( i = 0; i < (1 << order); i++ )
1141 page_list_del2(&pg[i], &d->xenpage_list, &d->arch.relmem_list);
1143 d->xenheap_pages -= 1 << order;
1144 drop_dom_ref = (d->xenheap_pages == 0);
1146 spin_unlock_recursive(&d->page_alloc_lock);
1147 }
1148 else if ( likely(d != NULL) && likely(d != dom_cow) )
1149 {
1150 /* NB. May recursively lock from relinquish_memory(). */
1151 spin_lock_recursive(&d->page_alloc_lock);
1153 for ( i = 0; i < (1 << order); i++ )
1154 {
1155 BUG_ON((pg[i].u.inuse.type_info & PGT_count_mask) != 0);
1156 page_list_del2(&pg[i], &d->page_list, &d->arch.relmem_list);
1157 }
1159 d->tot_pages -= 1 << order;
1160 drop_dom_ref = (d->tot_pages == 0);
1162 spin_unlock_recursive(&d->page_alloc_lock);
1164 /*
1165 * Normally we expect a domain to clear pages before freeing them, if
1166 * it cares about the secrecy of their contents. However, after a
1167 * domain has died we assume responsibility for erasure.
1168 */
1169 if ( unlikely(d->is_dying) )
1170 for ( i = 0; i < (1 << order); i++ )
1171 scrub_one_page(&pg[i]);
1173 free_heap_pages(pg, order);
1174 }
1175 else if ( unlikely(d == dom_cow) )
1176 {
1177 ASSERT(order == 0);
1178 scrub_one_page(pg);
1179 free_heap_pages(pg, 0);
1180 drop_dom_ref = 0;
1181 }
1182 else
1183 {
1184 /* Freeing anonymous domain-heap pages. */
1185 free_heap_pages(pg, order);
1186 drop_dom_ref = 0;
1187 }
1189 if ( drop_dom_ref )
1190 put_domain(d);
1191 }
1193 unsigned long avail_domheap_pages_region(
1194 unsigned int node, unsigned int min_width, unsigned int max_width)
1195 {
1196 int zone_lo, zone_hi;
1198 zone_lo = min_width ? bits_to_zone(min_width) : (MEMZONE_XEN + 1);
1199 zone_lo = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_lo));
1201 zone_hi = max_width ? bits_to_zone(max_width) : (NR_ZONES - 1);
1202 zone_hi = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_hi));
1204 return avail_heap_pages(zone_lo, zone_hi, node);
1205 }
1207 unsigned long avail_domheap_pages(void)
1208 {
1209 return avail_heap_pages(MEMZONE_XEN + 1,
1210 NR_ZONES - 1,
1211 -1);
1212 }
1214 static void pagealloc_info(unsigned char key)
1215 {
1216 unsigned int zone = MEMZONE_XEN;
1217 unsigned long n, total = 0;
1219 printk("Physical memory information:\n");
1220 printk(" Xen heap: %lukB free\n",
1221 avail_heap_pages(zone, zone, -1) << (PAGE_SHIFT-10));
1223 while ( ++zone < NR_ZONES )
1224 {
1225 if ( (zone + PAGE_SHIFT) == dma_bitsize )
1226 {
1227 printk(" DMA heap: %lukB free\n", total << (PAGE_SHIFT-10));
1228 total = 0;
1229 }
1231 if ( (n = avail_heap_pages(zone, zone, -1)) != 0 )
1232 {
1233 total += n;
1234 printk(" heap[%02u]: %lukB free\n", zone, n << (PAGE_SHIFT-10));
1235 }
1236 }
1238 printk(" Dom heap: %lukB free\n", total << (PAGE_SHIFT-10));
1239 }
1241 static struct keyhandler pagealloc_info_keyhandler = {
1242 .diagnostic = 1,
1243 .u.fn = pagealloc_info,
1244 .desc = "memory info"
1245 };
1247 static __init int pagealloc_keyhandler_init(void)
1248 {
1249 register_keyhandler('m', &pagealloc_info_keyhandler);
1250 return 0;
1251 }
1252 __initcall(pagealloc_keyhandler_init);
1255 void scrub_one_page(struct page_info *pg)
1256 {
1257 void *p = __map_domain_page(pg);
1259 #ifndef NDEBUG
1260 /* Avoid callers relying on allocations returning zeroed pages. */
1261 memset(p, 0xc2, PAGE_SIZE);
1262 #else
1263 /* For a production build, clear_page() is the fastest way to scrub. */
1264 clear_page(p);
1265 #endif
1267 unmap_domain_page(p);
1268 }
1270 static void dump_heap(unsigned char key)
1271 {
1272 s_time_t now = NOW();
1273 int i, j;
1275 printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key,
1276 (u32)(now>>32), (u32)now);
1278 for ( i = 0; i < MAX_NUMNODES; i++ )
1279 {
1280 if ( !avail[i] )
1281 continue;
1282 for ( j = 0; j < NR_ZONES; j++ )
1283 printk("heap[node=%d][zone=%d] -> %lu pages\n",
1284 i, j, avail[i][j]);
1285 }
1286 }
1288 static struct keyhandler dump_heap_keyhandler = {
1289 .diagnostic = 1,
1290 .u.fn = dump_heap,
1291 .desc = "dump heap info"
1292 };
1294 static __init int register_heap_trigger(void)
1295 {
1296 register_keyhandler('H', &dump_heap_keyhandler);
1297 return 0;
1298 }
1299 __initcall(register_heap_trigger);
1301 /*
1302 * Local variables:
1303 * mode: C
1304 * c-set-style: "BSD"
1305 * c-basic-offset: 4
1306 * tab-width: 4
1307 * indent-tabs-mode: nil
1308 * End:
1309 */