xen/common/page_alloc.c @ 22848:6341fe0f4e5a (debuggers.hg)

summary: Added tag 4.1.0-rc2 for changeset 9dca60d88c63
author:  Keir Fraser <keir@xen.org>
date:    Tue Jan 25 14:06:55 2011 +0000 (2011-01-25)
parent:  c3e478eafabc

1 /******************************************************************************
2 * page_alloc.c
3 *
4 * Simple buddy heap allocator for Xen.
5 *
6 * Copyright (c) 2002-2004 K A Fraser
7 * Copyright (c) 2006 IBM Ryan Harper <ryanh@us.ibm.com>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/init.h>
26 #include <xen/types.h>
27 #include <xen/lib.h>
28 #include <xen/sched.h>
29 #include <xen/spinlock.h>
30 #include <xen/mm.h>
31 #include <xen/irq.h>
32 #include <xen/softirq.h>
33 #include <xen/domain_page.h>
34 #include <xen/keyhandler.h>
35 #include <xen/perfc.h>
36 #include <xen/numa.h>
37 #include <xen/nodemask.h>
38 #include <xen/tmem.h>
39 #include <xen/tmem_xen.h>
40 #include <public/sysctl.h>
41 #include <asm/page.h>
42 #include <asm/numa.h>
43 #include <asm/flushtlb.h>
45 /*
46 * Comma-separated list of hexadecimal page numbers containing bad bytes.
47 * e.g. 'badpage=0x3f45,0x8a321'.
48 */
49 static char __initdata opt_badpage[100] = "";
50 string_param("badpage", opt_badpage);
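/*
 * Illustrative example (not in the original source): the parser in
 * init_boot_pages() below also accepts ranges written with '-', so booting
 * Xen with "badpage=0x3f45,0x8a321-0x8a340" removes page 0x3f45 and pages
 * 0x8a321 through 0x8a340 inclusive from the boot allocator.
 */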
52 /*
53 * no-bootscrub -> Free pages are not zeroed during boot.
54 */
55 static bool_t opt_bootscrub __initdata = 1;
56 boolean_param("bootscrub", opt_bootscrub);
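/*
 * Illustrative example (not in the original source): booting with
 * "no-bootscrub" (or an equivalent false value for "bootscrub") clears
 * opt_bootscrub, so scrub_heap_pages() below returns without zeroing free
 * memory, shortening boot at the cost of leaking stale RAM contents.
 */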
58 /*
59 * Bit width of the DMA heap -- used to override the NUMA-node-first
60 * allocation strategy, which can otherwise exhaust low memory.
61 */
62 static unsigned int dma_bitsize;
63 integer_param("dma_bits", dma_bitsize);
65 #define round_pgdown(_p) ((_p)&PAGE_MASK)
66 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
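/*
 * Worked example (illustrative), assuming 4KiB pages (PAGE_SIZE 0x1000,
 * PAGE_MASK ~0xfff):
 *   round_pgdown(0x12345) == 0x12000
 *   round_pgup(0x12345)   == 0x13000
 *   round_pgup(0x13000)   == 0x13000   (already aligned, unchanged)
 */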
68 /* Offlined page list, protected by heap_lock. */
69 PAGE_LIST_HEAD(page_offlined_list);
70 /* Broken page list, protected by heap_lock. */
71 PAGE_LIST_HEAD(page_broken_list);
73 /*************************
74 * BOOT-TIME ALLOCATOR
75 */
77 static unsigned long __initdata first_valid_mfn = ~0UL;
79 static struct bootmem_region {
80 unsigned long s, e; /* MFNs @s through @e-1 inclusive are free */
81 } *__initdata bootmem_region_list;
82 static unsigned int __initdata nr_bootmem_regions;
84 static void __init boot_bug(int line)
85 {
86 panic("Boot BUG at %s:%d\n", __FILE__, line);
87 }
88 #define BOOT_BUG_ON(p) if ( p ) boot_bug(__LINE__);
90 static void __init bootmem_region_add(unsigned long s, unsigned long e)
91 {
92 unsigned int i;
94 if ( (bootmem_region_list == NULL) && (s < e) )
95 bootmem_region_list = mfn_to_virt(s++);
97 if ( s >= e )
98 return;
100 for ( i = 0; i < nr_bootmem_regions; i++ )
101 if ( s < bootmem_region_list[i].e )
102 break;
104 BOOT_BUG_ON((i < nr_bootmem_regions) && (e > bootmem_region_list[i].s));
105 BOOT_BUG_ON(nr_bootmem_regions ==
106 (PAGE_SIZE / sizeof(struct bootmem_region)));
108 memmove(&bootmem_region_list[i+1], &bootmem_region_list[i],
109 (nr_bootmem_regions - i) * sizeof(*bootmem_region_list));
110 bootmem_region_list[i] = (struct bootmem_region) { s, e };
111 nr_bootmem_regions++;
112 }
114 static void __init bootmem_region_zap(unsigned long s, unsigned long e)
115 {
116 unsigned int i;
118 for ( i = 0; i < nr_bootmem_regions; i++ )
119 {
120 struct bootmem_region *r = &bootmem_region_list[i];
121 if ( e <= r->s )
122 break;
123 if ( s >= r->e )
124 continue;
125 if ( s <= r->s )
126 {
127 r->s = min(e, r->e);
128 }
129 else if ( e >= r->e )
130 {
131 r->e = s;
132 }
133 else
134 {
135 unsigned long _e = r->e;
136 r->e = s;
137 bootmem_region_add(e, _e);
138 }
139 }
140 }
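/*
 * Worked example (illustrative): given a single region {s=0x100, e=0x200},
 * bootmem_region_zap(0x180, 0x181) hits the final "split" case above: the
 * region is trimmed to {0x100, 0x180} and a new region {0x181, 0x200} is
 * inserted, leaving only page 0x180 withheld from the boot allocator.
 */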
142 void __init init_boot_pages(paddr_t ps, paddr_t pe)
143 {
144 unsigned long bad_spfn, bad_epfn;
145 const char *p;
147 ps = round_pgup(ps);
148 pe = round_pgdown(pe);
149 if ( pe <= ps )
150 return;
152 first_valid_mfn = min_t(unsigned long, ps >> PAGE_SHIFT, first_valid_mfn);
154 bootmem_region_add(ps >> PAGE_SHIFT, pe >> PAGE_SHIFT);
156 /* Check new pages against the bad-page list. */
157 p = opt_badpage;
158 while ( *p != '\0' )
159 {
160 bad_spfn = simple_strtoul(p, &p, 0);
161 bad_epfn = bad_spfn;
163 if ( *p == '-' )
164 {
165 p++;
166 bad_epfn = simple_strtoul(p, &p, 0);
167 if ( bad_epfn < bad_spfn )
168 bad_epfn = bad_spfn;
169 }
171 if ( *p == ',' )
172 p++;
173 else if ( *p != '\0' )
174 break;
176 if ( bad_epfn == bad_spfn )
177 printk("Marking page %lx as bad\n", bad_spfn);
178 else
179 printk("Marking pages %lx through %lx as bad\n",
180 bad_spfn, bad_epfn);
182 bootmem_region_zap(bad_spfn, bad_epfn+1);
183 }
184 }
186 unsigned long __init alloc_boot_pages(
187 unsigned long nr_pfns, unsigned long pfn_align)
188 {
189 unsigned long pg, _e;
190 int i;
192 for ( i = nr_bootmem_regions - 1; i >= 0; i-- )
193 {
194 struct bootmem_region *r = &bootmem_region_list[i];
195 pg = (r->e - nr_pfns) & ~(pfn_align - 1);
196 if ( pg < r->s )
197 continue;
198 _e = r->e;
199 r->e = pg;
200 bootmem_region_add(pg + nr_pfns, _e);
201 return pg;
202 }
204 BOOT_BUG_ON(1);
205 return 0;
206 }
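/*
 * Usage sketch (illustrative, not from the original source): a caller that
 * needs a 16-page, 16-page-aligned scratch area during boot might do
 *
 *   unsigned long mfn = alloc_boot_pages(16, 16);
 *   void *scratch = mfn_to_virt(mfn);   (assuming the direct map covers it)
 *
 * The range is carved from the highest suitable bootmem region; if no
 * region can satisfy the request the allocator panics via BOOT_BUG_ON(1).
 */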
210 /*************************
211 * BINARY BUDDY ALLOCATOR
212 */
214 #define MEMZONE_XEN 0
215 #define NR_ZONES (PADDR_BITS - PAGE_SHIFT)
217 #define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 0 : ((b) - PAGE_SHIFT - 1))
218 #define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN : \
219 (fls(page_to_mfn(pg)) - 1))
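/*
 * Worked example (illustrative), assuming PAGE_SHIFT == 12 (4KiB pages):
 * zone z (for z >= 1) holds non-Xen-heap pages whose MFN lies in
 * [2^z, 2^(z+1)), i.e. fls(mfn) - 1 == z. A page with MFN 0xfffff (just
 * below the 4GiB boundary) lands in zone fls(0xfffff) - 1 == 19, matching
 * bits_to_zone(32) == 32 - 12 - 1 == 19 used for 32-bit-addressable
 * allocations.
 */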
221 typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
222 static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
223 #define heap(node, zone, order) ((*_heap[node])[zone][order])
225 static unsigned long *avail[MAX_NUMNODES];
226 static long total_avail_pages;
228 /* TMEM: Reserve a fraction of memory for mid-size (0<order<9) allocations.*/
229 static long midsize_alloc_zone_pages;
230 #define MIDSIZE_ALLOC_FRAC 128
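/*
 * Worked example (illustrative): with MIDSIZE_ALLOC_FRAC == 128 and
 * 1,048,576 available 4KiB pages (4GiB), free_heap_pages() below ratchets
 * midsize_alloc_zone_pages up to 1,048,576 / 128 == 8192 pages (32MiB),
 * the slice held back for mid-size (0 < order < 9) allocations when tmem
 * is active.
 */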
232 static DEFINE_SPINLOCK(heap_lock);
234 static unsigned long init_node_heap(int node, unsigned long mfn,
235 unsigned long nr, bool_t *use_tail)
236 {
237 /* First node to be discovered has its heap metadata statically alloced. */
238 static heap_by_zone_and_order_t _heap_static;
239 static unsigned long avail_static[NR_ZONES];
240 static int first_node_initialised;
241 unsigned long needed = (sizeof(**_heap) +
242 sizeof(**avail) * NR_ZONES +
243 PAGE_SIZE - 1) >> PAGE_SHIFT;
244 int i, j;
246 if ( !first_node_initialised )
247 {
248 _heap[node] = &_heap_static;
249 avail[node] = avail_static;
250 first_node_initialised = 1;
251 needed = 0;
252 }
253 #ifdef DIRECTMAP_VIRT_END
254 else if ( *use_tail && nr >= needed &&
255 (mfn + nr) <= (virt_to_mfn(DIRECTMAP_VIRT_END - 1) + 1) )
256 {
257 _heap[node] = mfn_to_virt(mfn + nr - needed);
258 avail[node] = mfn_to_virt(mfn + nr - 1) +
259 PAGE_SIZE - sizeof(**avail) * NR_ZONES;
260 }
261 else if ( nr >= needed &&
262 (mfn + needed) <= (virt_to_mfn(DIRECTMAP_VIRT_END - 1) + 1) )
263 {
264 _heap[node] = mfn_to_virt(mfn);
265 avail[node] = mfn_to_virt(mfn + needed - 1) +
266 PAGE_SIZE - sizeof(**avail) * NR_ZONES;
267 *use_tail = 0;
268 }
269 #endif
270 else if ( get_order_from_bytes(sizeof(**_heap)) ==
271 get_order_from_pages(needed) )
272 {
273 _heap[node] = alloc_xenheap_pages(get_order_from_pages(needed), 0);
274 BUG_ON(!_heap[node]);
275 avail[node] = (void *)_heap[node] + (needed << PAGE_SHIFT) -
276 sizeof(**avail) * NR_ZONES;
277 needed = 0;
278 }
279 else
280 {
281 _heap[node] = xmalloc(heap_by_zone_and_order_t);
282 avail[node] = xmalloc_array(unsigned long, NR_ZONES);
283 BUG_ON(!_heap[node] || !avail[node]);
284 needed = 0;
285 }
287 memset(avail[node], 0, NR_ZONES * sizeof(long));
289 for ( i = 0; i < NR_ZONES; i++ )
290 for ( j = 0; j <= MAX_ORDER; j++ )
291 INIT_PAGE_LIST_HEAD(&(*_heap[node])[i][j]);
293 return needed;
294 }
296 /* Allocate 2^@order contiguous pages. */
297 static struct page_info *alloc_heap_pages(
298 unsigned int zone_lo, unsigned int zone_hi,
299 unsigned int order, unsigned int memflags,
300 struct domain *d)
301 {
302 unsigned int first_node, i, j, zone = 0, nodemask_retry = 0;
303 unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1);
304 unsigned long request = 1UL << order;
305 cpumask_t extra_cpus_mask, mask;
306 struct page_info *pg;
307 nodemask_t nodemask = (d != NULL ) ? d->node_affinity : node_online_map;
309 if ( node == NUMA_NO_NODE )
310 {
311 memflags &= ~MEMF_exact_node;
312 if ( d != NULL )
313 {
314 node = next_node(d->last_alloc_node, nodemask);
315 if ( node >= MAX_NUMNODES )
316 node = first_node(nodemask);
317 }
318 if ( node >= MAX_NUMNODES )
319 node = cpu_to_node(smp_processor_id());
320 }
321 first_node = node;
323 ASSERT(node >= 0);
324 ASSERT(zone_lo <= zone_hi);
325 ASSERT(zone_hi < NR_ZONES);
327 if ( unlikely(order > MAX_ORDER) )
328 return NULL;
330 spin_lock(&heap_lock);
332 /*
333 * TMEM: When available memory is scarce due to tmem absorbing it, allow
334 * only mid-size allocations to avoid worst of fragmentation issues.
335 * Others try tmem pools then fail. This is a workaround until all
336 * post-dom0-creation-multi-page allocations can be eliminated.
337 */
338 if ( opt_tmem && ((order == 0) || (order >= 9)) &&
339 (total_avail_pages <= midsize_alloc_zone_pages) &&
340 tmem_freeable_pages() )
341 goto try_tmem;
343 /*
344 * Start with the requested node, but exhaust all node memory in the requested
345 * zone before failing. Only calculate a new node value if we fail to find
346 * memory in the target node; this avoids needless computation on the fast path.
347 */
348 for ( ; ; )
349 {
350 zone = zone_hi;
351 do {
352 /* Check if target node can support the allocation. */
353 if ( !avail[node] || (avail[node][zone] < request) )
354 continue;
356 /* Find smallest order which can satisfy the request. */
357 for ( j = order; j <= MAX_ORDER; j++ )
358 if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
359 goto found;
360 } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
362 if ( memflags & MEMF_exact_node )
363 goto not_found;
365 /* Pick next node. */
366 if ( !node_isset(node, nodemask) )
367 {
368 /* Very first node may be caller-specified and outside nodemask. */
369 ASSERT(!nodemask_retry);
370 first_node = node = first_node(nodemask);
371 if ( node < MAX_NUMNODES )
372 continue;
373 }
374 else if ( (node = next_node(node, nodemask)) >= MAX_NUMNODES )
375 node = first_node(nodemask);
376 if ( node == first_node )
377 {
378 /* When we have tried all in nodemask, we fall back to others. */
379 if ( nodemask_retry++ )
380 goto not_found;
381 nodes_andnot(nodemask, node_online_map, nodemask);
382 first_node = node = first_node(nodemask);
383 if ( node >= MAX_NUMNODES )
384 goto not_found;
385 }
386 }
388 try_tmem:
389 /* Try to free memory from tmem */
390 if ( (pg = tmem_relinquish_pages(order, memflags)) != NULL )
391 {
392 /* reassigning an already allocated anonymous heap page */
393 spin_unlock(&heap_lock);
394 return pg;
395 }
397 not_found:
398 /* No suitable memory blocks. Fail the request. */
399 spin_unlock(&heap_lock);
400 return NULL;
402 found:
403 /* We may have to halve the chunk a number of times. */
404 while ( j != order )
405 {
406 PFN_ORDER(pg) = --j;
407 page_list_add_tail(pg, &heap(node, zone, j));
408 pg += 1 << j;
409 }
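/*
 * Worked example (illustrative): if an order-2 request is satisfied from
 * the order-5 free list, the loop above peels an order-4, then order-3,
 * then order-2 buddy off the front of the 32-page chunk and returns each
 * to its free list, so the caller receives the final 2^2 pages of the
 * original block.
 */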
411 ASSERT(avail[node][zone] >= request);
412 avail[node][zone] -= request;
413 total_avail_pages -= request;
414 ASSERT(total_avail_pages >= 0);
416 if ( d != NULL )
417 d->last_alloc_node = node;
419 cpus_clear(mask);
421 for ( i = 0; i < (1 << order); i++ )
422 {
423 /* Reference count must continuously be zero for free pages. */
424 BUG_ON(pg[i].count_info != PGC_state_free);
425 pg[i].count_info = PGC_state_inuse;
427 if ( pg[i].u.free.need_tlbflush )
428 {
429 /* Add in extra CPUs that need flushing because of this page. */
430 cpus_andnot(extra_cpus_mask, cpu_online_map, mask);
431 tlbflush_filter(extra_cpus_mask, pg[i].tlbflush_timestamp);
432 cpus_or(mask, mask, extra_cpus_mask);
433 }
435 /* Initialise fields which have other uses for free pages. */
436 pg[i].u.inuse.type_info = 0;
437 page_set_owner(&pg[i], NULL);
438 }
440 spin_unlock(&heap_lock);
442 if ( unlikely(!cpus_empty(mask)) )
443 {
444 perfc_incr(need_flush_tlb_flush);
445 flush_tlb_mask(&mask);
446 }
448 return pg;
449 }
451 /* Remove any offlined page in the buddy pointed to by head. */
452 static int reserve_offlined_page(struct page_info *head)
453 {
454 unsigned int node = phys_to_nid(page_to_maddr(head));
455 int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
456 struct page_info *cur_head;
457 int cur_order;
459 ASSERT(spin_is_locked(&heap_lock));
461 cur_head = head;
463 page_list_del(head, &heap(node, zone, head_order));
465 while ( cur_head < (head + (1 << head_order)) )
466 {
467 struct page_info *pg;
468 int next_order;
470 if ( page_state_is(cur_head, offlined) )
471 {
472 cur_head++;
473 continue;
474 }
476 next_order = cur_order = 0;
478 while ( cur_order < head_order )
479 {
480 next_order = cur_order + 1;
482 if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)) )
483 goto merge;
485 for ( i = (1 << cur_order), pg = cur_head + (1 << cur_order );
486 i < (1 << next_order);
487 i++, pg++ )
488 if ( page_state_is(pg, offlined) )
489 break;
490 if ( i == ( 1 << next_order) )
491 {
492 cur_order = next_order;
493 continue;
494 }
495 else
496 {
497 merge:
498 /* We don't consider merging outside the head_order. */
499 page_list_add_tail(cur_head, &heap(node, zone, cur_order));
500 PFN_ORDER(cur_head) = cur_order;
501 cur_head += (1 << cur_order);
502 break;
503 }
504 }
505 }
507 for ( cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++ )
508 {
509 if ( !page_state_is(cur_head, offlined) )
510 continue;
512 avail[node][zone]--;
513 total_avail_pages--;
514 ASSERT(total_avail_pages >= 0);
516 page_list_add_tail(cur_head,
517 test_bit(_PGC_broken, &cur_head->count_info) ?
518 &page_broken_list : &page_offlined_list);
520 count++;
521 }
523 return count;
524 }
526 /* Free 2^@order set of pages. */
527 static void free_heap_pages(
528 struct page_info *pg, unsigned int order)
529 {
530 unsigned long mask;
531 unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
532 unsigned int zone = page_to_zone(pg);
534 ASSERT(order <= MAX_ORDER);
535 ASSERT(node >= 0);
537 spin_lock(&heap_lock);
539 for ( i = 0; i < (1 << order); i++ )
540 {
541 /*
542 * Cannot assume that count_info == 0, as there are some corner cases
543 * where it isn't the case and yet it isn't a bug:
544 * 1. page_get_owner() is NULL
545 * 2. page_get_owner() is a domain that was never accessible by
546 * its domid (e.g., failed to fully construct the domain).
547 * 3. page was never addressable by the guest (e.g., it's an
548 * auto-translate-physmap guest and the page was never included
549 * in its pseudophysical address space).
550 * In all the above cases there can be no guest mappings of this page.
551 */
552 ASSERT(!page_state_is(&pg[i], offlined));
553 pg[i].count_info =
554 ((pg[i].count_info & PGC_broken) |
555 (page_state_is(&pg[i], offlining)
556 ? PGC_state_offlined : PGC_state_free));
557 if ( page_state_is(&pg[i], offlined) )
558 tainted = 1;
560 /* If a page has no owner it will need no safety TLB flush. */
561 pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
562 if ( pg[i].u.free.need_tlbflush )
563 pg[i].tlbflush_timestamp = tlbflush_current_time();
564 }
566 avail[node][zone] += 1 << order;
567 total_avail_pages += 1 << order;
569 if ( opt_tmem )
570 midsize_alloc_zone_pages = max(
571 midsize_alloc_zone_pages, total_avail_pages / MIDSIZE_ALLOC_FRAC);
573 /* Merge chunks as far as possible. */
574 while ( order < MAX_ORDER )
575 {
576 mask = 1UL << order;
578 if ( (page_to_mfn(pg) & mask) )
579 {
580 /* Merge with predecessor block? */
581 if ( !mfn_valid(page_to_mfn(pg-mask)) ||
582 !page_state_is(pg-mask, free) ||
583 (PFN_ORDER(pg-mask) != order) ||
584 (phys_to_nid(page_to_maddr(pg-mask)) != node) )
585 break;
586 pg -= mask;
587 page_list_del(pg, &heap(node, zone, order));
588 }
589 else
590 {
591 /* Merge with successor block? */
592 if ( !mfn_valid(page_to_mfn(pg+mask)) ||
593 !page_state_is(pg+mask, free) ||
594 (PFN_ORDER(pg+mask) != order) ||
595 (phys_to_nid(page_to_maddr(pg+mask)) != node) )
596 break;
597 page_list_del(pg + mask, &heap(node, zone, order));
598 }
600 order++;
601 }
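/*
 * Worked example (illustrative): freeing an order-0 page at MFN 0x101
 * (odd, so its buddy is the predecessor) merges with a free page at 0x100
 * into an order-1 block; if the order-1 block at 0x102 on the same node is
 * also free, the two merge into an order-2 block at 0x100, and so on up to
 * MAX_ORDER.
 */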
603 PFN_ORDER(pg) = order;
604 page_list_add_tail(pg, &heap(node, zone, order));
606 if ( tainted )
607 reserve_offlined_page(pg);
609 spin_unlock(&heap_lock);
610 }
613 /*
614 * Possible states for a page:
615 * free and online; free and offlined; free and offlined and broken;
616 * assigned and online; assigned and offlining; assigned and offlining and broken.
617 *
618 * Rules applied when offlining a page:
619 * Once a page is broken, it can never be assigned again.
620 * A page will be marked offlined only if it is free.
621 * Returns the original count_info.
622 */
623 static unsigned long mark_page_offline(struct page_info *pg, int broken)
624 {
625 unsigned long nx, x, y = pg->count_info;
627 ASSERT(page_is_ram_type(page_to_mfn(pg), RAM_TYPE_CONVENTIONAL));
628 ASSERT(spin_is_locked(&heap_lock));
630 do {
631 nx = x = y;
633 if ( ((x & PGC_state) != PGC_state_offlined) &&
634 ((x & PGC_state) != PGC_state_offlining) )
635 {
636 nx &= ~PGC_state;
637 nx |= (((x & PGC_state) == PGC_state_free)
638 ? PGC_state_offlined : PGC_state_offlining);
639 }
641 if ( broken )
642 nx |= PGC_broken;
644 if ( x == nx )
645 break;
646 } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
648 return y;
649 }
651 static int reserve_heap_page(struct page_info *pg)
652 {
653 struct page_info *head = NULL;
654 unsigned int i, node = phys_to_nid(page_to_maddr(pg));
655 unsigned int zone = page_to_zone(pg);
657 for ( i = 0; i <= MAX_ORDER; i++ )
658 {
659 struct page_info *tmp;
661 if ( page_list_empty(&heap(node, zone, i)) )
662 continue;
664 page_list_for_each_safe ( head, tmp, &heap(node, zone, i) )
665 {
666 if ( (head <= pg) &&
667 (head + (1UL << i) > pg) )
668 return reserve_offlined_page(head);
669 }
670 }
672 return -EINVAL;
674 }
676 int offline_page(unsigned long mfn, int broken, uint32_t *status)
677 {
678 unsigned long old_info = 0;
679 struct domain *owner;
680 int ret = 0;
681 struct page_info *pg;
683 if ( !mfn_valid(mfn) )
684 {
685 dprintk(XENLOG_WARNING,
686 "try to offline page out of range %lx\n", mfn);
687 return -EINVAL;
688 }
690 *status = 0;
691 pg = mfn_to_page(mfn);
693 if ( is_xen_fixed_mfn(mfn) )
694 {
695 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED |
696 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
697 return -EPERM;
698 }
700 /*
701 * N.B. Xen's TXT region on x86_64 is marked reserved and already handled.
702 * The kexec range is also reserved.
703 */
704 if ( !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
705 {
706 *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM;
707 return -EINVAL;
708 }
710 spin_lock(&heap_lock);
712 old_info = mark_page_offline(pg, broken);
714 if ( page_state_is(pg, free) )
715 {
716 /* Free pages are reserved directly. */
717 reserve_heap_page(pg);
718 *status = PG_OFFLINE_OFFLINED;
719 }
720 else if ( page_state_is(pg, offlined) )
721 {
722 *status = PG_OFFLINE_OFFLINED;
723 }
724 else if ( (owner = page_get_owner_and_reference(pg)) )
725 {
726 *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING |
727 (owner->domain_id << PG_OFFLINE_OWNER_SHIFT);
728 /* Release the reference since it will not be allocated anymore */
729 put_page(pg);
730 }
731 else if ( old_info & PGC_xen_heap )
732 {
733 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING |
734 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
735 }
736 else
737 {
738 /*
739 * assign_pages() does not hold heap_lock, so there is a small window in
740 * which the owner may be set later. Note that the owner can only change
741 * from NULL to non-NULL, not the reverse, since the page is offlining now.
742 * There is no window if called from the #MC handler, since all CPUs are in
743 * softirq context. If called from user space (e.g. CE handling), tools can
744 * wait some time before calling again.
745 */
746 *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED |
747 (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT );
748 }
750 if ( broken )
751 *status |= PG_OFFLINE_BROKEN;
753 spin_unlock(&heap_lock);
755 return ret;
756 }
758 /*
759 * Online the memory.
760 * The caller should make sure end_pfn <= max_page,
761 * if not, expand_pages() should be called prior to online_page().
762 */
763 unsigned int online_page(unsigned long mfn, uint32_t *status)
764 {
765 unsigned long x, nx, y;
766 struct page_info *pg;
767 int ret;
769 if ( !mfn_valid(mfn) )
770 {
771 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
772 return -EINVAL;
773 }
775 pg = mfn_to_page(mfn);
777 spin_lock(&heap_lock);
779 y = pg->count_info;
780 do {
781 ret = *status = 0;
783 if ( y & PGC_broken )
784 {
785 ret = -EINVAL;
786 *status = PG_ONLINE_FAILED |PG_ONLINE_BROKEN;
787 break;
788 }
790 if ( (y & PGC_state) == PGC_state_offlined )
791 {
792 page_list_del(pg, &page_offlined_list);
793 *status = PG_ONLINE_ONLINED;
794 }
795 else if ( (y & PGC_state) == PGC_state_offlining )
796 {
797 *status = PG_ONLINE_ONLINED;
798 }
799 else
800 {
801 break;
802 }
804 x = y;
805 nx = (x & ~PGC_state) | PGC_state_inuse;
806 } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
808 spin_unlock(&heap_lock);
810 if ( (y & PGC_state) == PGC_state_offlined )
811 free_heap_pages(pg, 0);
813 return ret;
814 }
816 int query_page_offline(unsigned long mfn, uint32_t *status)
817 {
818 struct page_info *pg;
820 if ( !mfn_valid(mfn) || !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
821 {
822 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
823 return -EINVAL;
824 }
826 *status = 0;
827 spin_lock(&heap_lock);
829 pg = mfn_to_page(mfn);
831 if ( page_state_is(pg, offlining) )
832 *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING;
833 if ( pg->count_info & PGC_broken )
834 *status |= PG_OFFLINE_STATUS_BROKEN;
835 if ( page_state_is(pg, offlined) )
836 *status |= PG_OFFLINE_STATUS_OFFLINED;
838 spin_unlock(&heap_lock);
840 return 0;
841 }
843 /*
844 * Hand the specified arbitrary page range to the specified heap zone
845 * checking the node_id of the previous page. If they differ and the
846 * latter is not on a MAX_ORDER boundary, then we reserve the page by
847 * not freeing it to the buddy allocator.
848 */
849 static void init_heap_pages(
850 struct page_info *pg, unsigned long nr_pages)
851 {
852 unsigned long i;
854 for ( i = 0; i < nr_pages; i++ )
855 {
856 unsigned int nid = phys_to_nid(page_to_maddr(pg+i));
858 if ( unlikely(!avail[nid]) )
859 {
860 unsigned long s = page_to_mfn(pg + i);
861 unsigned long e = page_to_mfn(pg + nr_pages - 1) + 1;
862 bool_t use_tail = (nid == phys_to_nid(pfn_to_paddr(e - 1))) &&
863 !(s & ((1UL << MAX_ORDER) - 1)) &&
864 (find_first_set_bit(e) <= find_first_set_bit(s));
865 unsigned long n;
867 n = init_node_heap(nid, page_to_mfn(pg+i), nr_pages - i,
868 &use_tail);
869 BUG_ON(i + n > nr_pages);
870 if ( n && !use_tail )
871 {
872 i += n - 1;
873 continue;
874 }
875 if ( i + n == nr_pages )
876 break;
877 nr_pages -= n;
878 }
880 free_heap_pages(pg+i, 0);
881 }
882 }
884 static unsigned long avail_heap_pages(
885 unsigned int zone_lo, unsigned int zone_hi, unsigned int node)
886 {
887 unsigned int i, zone;
888 unsigned long free_pages = 0;
890 if ( zone_hi >= NR_ZONES )
891 zone_hi = NR_ZONES - 1;
893 for_each_online_node(i)
894 {
895 if ( !avail[i] )
896 continue;
897 for ( zone = zone_lo; zone <= zone_hi; zone++ )
898 if ( (node == -1) || (node == i) )
899 free_pages += avail[i][zone];
900 }
902 return free_pages;
903 }
905 unsigned long total_free_pages(void)
906 {
907 return total_avail_pages - midsize_alloc_zone_pages;
908 }
910 void __init end_boot_allocator(void)
911 {
912 unsigned int i;
914 /* Pages that are free now go to the domain sub-allocator. */
915 for ( i = 0; i < nr_bootmem_regions; i++ )
916 {
917 struct bootmem_region *r = &bootmem_region_list[i];
918 if ( (r->s < r->e) &&
919 (phys_to_nid(pfn_to_paddr(r->s)) == cpu_to_node(0)) )
920 {
921 init_heap_pages(mfn_to_page(r->s), r->e - r->s);
922 r->e = r->s;
923 break;
924 }
925 }
926 for ( i = nr_bootmem_regions; i-- > 0; )
927 {
928 struct bootmem_region *r = &bootmem_region_list[i];
929 if ( r->s < r->e )
930 init_heap_pages(mfn_to_page(r->s), r->e - r->s);
931 }
932 init_heap_pages(virt_to_page(bootmem_region_list), 1);
934 if ( !dma_bitsize && (num_online_nodes() > 1) )
935 {
936 #ifdef CONFIG_X86
937 dma_bitsize = min_t(unsigned int,
938 fls(NODE_DATA(0)->node_spanned_pages) - 1
939 + PAGE_SHIFT - 2,
940 32);
941 #else
942 dma_bitsize = 32;
943 #endif
944 }
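/*
 * Worked example (illustrative), assuming 4KiB pages: if node 0 spans
 * 0x40000 pages (1GiB), fls(0x40000) - 1 == 18 and dma_bitsize becomes
 * 18 + 12 - 2 == 28, i.e. a 256MiB DMA zone; larger nodes are clamped to
 * 32 bits.
 */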
946 printk("Domain heap initialised");
947 if ( dma_bitsize )
948 printk(" DMA width %u bits", dma_bitsize);
949 printk("\n");
950 }
952 /*
953 * Scrub all unallocated pages in all heap zones. This function is more
954 * convoluted than appears necessary because we do not want to continuously
955 * hold the lock while scrubbing very large memory areas.
956 */
957 void __init scrub_heap_pages(void)
958 {
959 unsigned long mfn;
960 struct page_info *pg;
962 if ( !opt_bootscrub )
963 return;
965 printk("Scrubbing Free RAM: ");
967 for ( mfn = first_valid_mfn; mfn < max_page; mfn++ )
968 {
969 process_pending_softirqs();
971 pg = mfn_to_page(mfn);
973 /* Quick lock-free check. */
974 if ( !mfn_valid(mfn) || !page_state_is(pg, free) )
975 continue;
977 /* Every 100MB, print a progress dot. */
978 if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
979 printk(".");
981 spin_lock(&heap_lock);
983 /* Re-check page status with lock held. */
984 if ( page_state_is(pg, free) )
985 scrub_one_page(pg);
987 spin_unlock(&heap_lock);
988 }
990 printk("done.\n");
991 }
995 /*************************
996 * XEN-HEAP SUB-ALLOCATOR
997 */
999 #if !defined(__x86_64__) && !defined(__ia64__)
1001 void init_xenheap_pages(paddr_t ps, paddr_t pe)
1002 {
1003 ps = round_pgup(ps);
1004 pe = round_pgdown(pe);
1005 if ( pe <= ps )
1006 return;
1008 /*
1009 * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to
1010 * prevent merging of power-of-two blocks across the zone boundary.
1011 */
1012 if ( ps && !is_xen_heap_mfn(paddr_to_pfn(ps)-1) )
1013 ps += PAGE_SIZE;
1014 if ( !is_xen_heap_mfn(paddr_to_pfn(pe)) )
1015 pe -= PAGE_SIZE;
1017 memguard_guard_range(maddr_to_virt(ps), pe - ps);
1019 init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT);
1020 }
1023 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
1024 {
1025 struct page_info *pg;
1027 ASSERT(!in_irq());
1029 pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
1030 order, memflags, NULL);
1031 if ( unlikely(pg == NULL) )
1032 return NULL;
1034 memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));
1036 return page_to_virt(pg);
1037 }
1040 void free_xenheap_pages(void *v, unsigned int order)
1041 {
1042 ASSERT(!in_irq());
1044 if ( v == NULL )
1045 return;
1047 memguard_guard_range(v, 1 << (order + PAGE_SHIFT));
1049 free_heap_pages(virt_to_page(v), order);
1050 }
1052 #else
1054 void init_xenheap_pages(paddr_t ps, paddr_t pe)
1055 {
1056 init_domheap_pages(ps, pe);
1057 }
1059 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
1060 {
1061 struct page_info *pg;
1062 unsigned int i;
1064 ASSERT(!in_irq());
1066 pg = alloc_domheap_pages(NULL, order, memflags);
1067 if ( unlikely(pg == NULL) )
1068 return NULL;
1070 for ( i = 0; i < (1u << order); i++ )
1071 pg[i].count_info |= PGC_xen_heap;
1073 return page_to_virt(pg);
1074 }
1076 void free_xenheap_pages(void *v, unsigned int order)
1077 {
1078 struct page_info *pg;
1079 unsigned int i;
1081 ASSERT(!in_irq());
1083 if ( v == NULL )
1084 return;
1086 pg = virt_to_page(v);
1088 for ( i = 0; i < (1u << order); i++ )
1089 pg[i].count_info &= ~PGC_xen_heap;
1091 free_heap_pages(pg, order);
1092 }
1094 #endif
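/*
 * Usage sketch (illustrative, not part of the original file; guarded by
 * "#if 0" so it is never built): how a caller typically pairs the xenheap
 * routines above for a single page.
 */
#if 0
static void xenheap_usage_example(void)
{
    void *p = alloc_xenheap_pages(0, 0);    /* one page, no memflags */

    if ( p == NULL )
        return;
    clear_page(p);                          /* use the page... */
    free_xenheap_pages(p, 0);               /* ...then hand it back */
}
#endif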
1098 /*************************
1099 * DOMAIN-HEAP SUB-ALLOCATOR
1100 */
1102 void init_domheap_pages(paddr_t ps, paddr_t pe)
1103 {
1104 unsigned long smfn, emfn;
1106 ASSERT(!in_irq());
1108 smfn = round_pgup(ps) >> PAGE_SHIFT;
1109 emfn = round_pgdown(pe) >> PAGE_SHIFT;
1111 init_heap_pages(mfn_to_page(smfn), emfn - smfn);
1112 }
1115 int assign_pages(
1116 struct domain *d,
1117 struct page_info *pg,
1118 unsigned int order,
1119 unsigned int memflags)
1120 {
1121 unsigned long i;
1123 spin_lock(&d->page_alloc_lock);
1125 if ( unlikely(d->is_dying) )
1126 {
1127 gdprintk(XENLOG_INFO, "Cannot assign page to domain%d -- dying.\n",
1128 d->domain_id);
1129 goto fail;
1130 }
1132 if ( !(memflags & MEMF_no_refcount) )
1133 {
1134 if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
1135 {
1136 if ( !opt_tmem || order != 0 || d->tot_pages != d->max_pages )
1137 gdprintk(XENLOG_INFO, "Over-allocation for domain %u: "
1138 "%u > %u\n", d->domain_id,
1139 d->tot_pages + (1 << order), d->max_pages);
1140 goto fail;
1141 }
1143 if ( unlikely(d->tot_pages == 0) )
1144 get_knownalive_domain(d);
1146 d->tot_pages += 1 << order;
1147 }
1149 for ( i = 0; i < (1 << order); i++ )
1150 {
1151 ASSERT(page_get_owner(&pg[i]) == NULL);
1152 ASSERT((pg[i].count_info & ~(PGC_allocated | 1)) == 0);
1153 page_set_owner(&pg[i], d);
1154 wmb(); /* Domain pointer must be visible before updating refcnt. */
1155 pg[i].count_info = PGC_allocated | 1;
1156 page_list_add_tail(&pg[i], &d->page_list);
1157 }
1159 spin_unlock(&d->page_alloc_lock);
1160 return 0;
1162 fail:
1163 spin_unlock(&d->page_alloc_lock);
1164 return -1;
1165 }
1168 struct page_info *alloc_domheap_pages(
1169 struct domain *d, unsigned int order, unsigned int memflags)
1170 {
1171 struct page_info *pg = NULL;
1172 unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
1173 unsigned int dma_zone;
1175 ASSERT(!in_irq());
1177 bits = domain_clamp_alloc_bitsize(d, bits ? : (BITS_PER_LONG+PAGE_SHIFT));
1178 if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
1179 return NULL;
1181 if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
1182 pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d);
1184 if ( (pg == NULL) &&
1185 ((memflags & MEMF_no_dma) ||
1186 ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, order,
1187 memflags, d)) == NULL)) )
1188 return NULL;
1190 if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
1191 {
1192 free_heap_pages(pg, order);
1193 return NULL;
1194 }
1196 return pg;
1197 }
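/*
 * Usage sketch (illustrative, not part of the original file; guarded by
 * "#if 0" so it is never built): requesting an order-4 chunk (64KiB with
 * 4KiB pages) for domain d that must be addressable with 32-bit DMA. A
 * NULL return means no suitable zone could satisfy the request, or
 * assign_pages() refused it (e.g. over-allocation).
 */
#if 0
static struct page_info *domheap_usage_example(struct domain *d)
{
    struct page_info *pg = alloc_domheap_pages(d, 4, MEMF_bits(32));

    if ( pg == NULL )
        return NULL;
    /* ... use the pages; later freed with free_domheap_pages(pg, 4). */
    return pg;
}
#endif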
1199 void free_domheap_pages(struct page_info *pg, unsigned int order)
1200 {
1201 int i, drop_dom_ref;
1202 struct domain *d = page_get_owner(pg);
1204 ASSERT(!in_irq());
1206 if ( unlikely(is_xen_heap_page(pg)) )
1207 {
1208 /* NB. May recursively lock from relinquish_memory(). */
1209 spin_lock_recursive(&d->page_alloc_lock);
1211 for ( i = 0; i < (1 << order); i++ )
1212 page_list_del2(&pg[i], &d->xenpage_list, &d->arch.relmem_list);
1214 d->xenheap_pages -= 1 << order;
1215 drop_dom_ref = (d->xenheap_pages == 0);
1217 spin_unlock_recursive(&d->page_alloc_lock);
1218 }
1219 else if ( likely(d != NULL) && likely(d != dom_cow) )
1220 {
1221 /* NB. May recursively lock from relinquish_memory(). */
1222 spin_lock_recursive(&d->page_alloc_lock);
1224 for ( i = 0; i < (1 << order); i++ )
1225 {
1226 BUG_ON((pg[i].u.inuse.type_info & PGT_count_mask) != 0);
1227 page_list_del2(&pg[i], &d->page_list, &d->arch.relmem_list);
1228 }
1230 d->tot_pages -= 1 << order;
1231 drop_dom_ref = (d->tot_pages == 0);
1233 spin_unlock_recursive(&d->page_alloc_lock);
1235 /*
1236 * Normally we expect a domain to clear pages before freeing them, if
1237 * it cares about the secrecy of their contents. However, after a
1238 * domain has died we assume responsibility for erasure.
1239 */
1240 if ( unlikely(d->is_dying) )
1241 for ( i = 0; i < (1 << order); i++ )
1242 scrub_one_page(&pg[i]);
1244 free_heap_pages(pg, order);
1245 }
1246 else if ( unlikely(d == dom_cow) )
1247 {
1248 ASSERT(order == 0);
1249 scrub_one_page(pg);
1250 free_heap_pages(pg, 0);
1251 drop_dom_ref = 0;
1252 }
1253 else
1254 {
1255 /* Freeing anonymous domain-heap pages. */
1256 free_heap_pages(pg, order);
1257 drop_dom_ref = 0;
1258 }
1260 if ( drop_dom_ref )
1261 put_domain(d);
1262 }
1264 unsigned long avail_domheap_pages_region(
1265 unsigned int node, unsigned int min_width, unsigned int max_width)
1266 {
1267 int zone_lo, zone_hi;
1269 zone_lo = min_width ? bits_to_zone(min_width) : (MEMZONE_XEN + 1);
1270 zone_lo = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_lo));
1272 zone_hi = max_width ? bits_to_zone(max_width) : (NR_ZONES - 1);
1273 zone_hi = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_hi));
1275 return avail_heap_pages(zone_lo, zone_hi, node);
1276 }
1278 unsigned long avail_domheap_pages(void)
1279 {
1280 return avail_heap_pages(MEMZONE_XEN + 1,
1281 NR_ZONES - 1,
1282 -1);
1283 }
1285 unsigned long avail_node_heap_pages(unsigned int nodeid)
1286 {
1287 return avail_heap_pages(MEMZONE_XEN, NR_ZONES -1, nodeid);
1288 }
1291 static void pagealloc_info(unsigned char key)
1292 {
1293 unsigned int zone = MEMZONE_XEN;
1294 unsigned long n, total = 0;
1296 printk("Physical memory information:\n");
1297 printk(" Xen heap: %lukB free\n",
1298 avail_heap_pages(zone, zone, -1) << (PAGE_SHIFT-10));
1300 while ( ++zone < NR_ZONES )
1301 {
1302 if ( (zone + PAGE_SHIFT) == dma_bitsize )
1303 {
1304 printk(" DMA heap: %lukB free\n", total << (PAGE_SHIFT-10));
1305 total = 0;
1306 }
1308 if ( (n = avail_heap_pages(zone, zone, -1)) != 0 )
1309 {
1310 total += n;
1311 printk(" heap[%02u]: %lukB free\n", zone, n << (PAGE_SHIFT-10));
1315 printk(" Dom heap: %lukB free\n", total << (PAGE_SHIFT-10));
1318 static struct keyhandler pagealloc_info_keyhandler = {
1319 .diagnostic = 1,
1320 .u.fn = pagealloc_info,
1321 .desc = "memory info"
1322 };
1324 static __init int pagealloc_keyhandler_init(void)
1325 {
1326 register_keyhandler('m', &pagealloc_info_keyhandler);
1327 return 0;
1328 }
1329 __initcall(pagealloc_keyhandler_init);
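/*
 * Illustrative note (not in the original source): once registered,
 * pagealloc_info() can be triggered by pressing 'm' on the Xen console or,
 * from dom0, with something like "xm debug-keys m"; the output appears in
 * the hypervisor log.
 */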
1332 void scrub_one_page(struct page_info *pg)
1333 {
1334 void *p = __map_domain_page(pg);
1336 if ( unlikely(pg->count_info & PGC_broken) )
1337 return;
1339 #ifndef NDEBUG
1340 /* Avoid callers relying on allocations returning zeroed pages. */
1341 memset(p, 0xc2, PAGE_SIZE);
1342 #else
1343 /* For a production build, clear_page() is the fastest way to scrub. */
1344 clear_page(p);
1345 #endif
1347 unmap_domain_page(p);
1348 }
1350 static void dump_heap(unsigned char key)
1351 {
1352 s_time_t now = NOW();
1353 int i, j;
1355 printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key,
1356 (u32)(now>>32), (u32)now);
1358 for ( i = 0; i < MAX_NUMNODES; i++ )
1359 {
1360 if ( !avail[i] )
1361 continue;
1362 for ( j = 0; j < NR_ZONES; j++ )
1363 printk("heap[node=%d][zone=%d] -> %lu pages\n",
1364 i, j, avail[i][j]);
1365 }
1366 }
1368 static struct keyhandler dump_heap_keyhandler = {
1369 .diagnostic = 1,
1370 .u.fn = dump_heap,
1371 .desc = "dump heap info"
1372 };
1374 static __init int register_heap_trigger(void)
1375 {
1376 register_keyhandler('H', &dump_heap_keyhandler);
1377 return 0;
1378 }
1379 __initcall(register_heap_trigger);
1381 /*
1382 * Local variables:
1383 * mode: C
1384 * c-set-style: "BSD"
1385 * c-basic-offset: 4
1386 * tab-width: 4
1387 * indent-tabs-mode: nil
1388 * End:
1389 */