debuggers.hg: view of xen/common/page_alloc.c @ 22855:1d1eec7e1fb4

xl: Perform minimal validation of virtual disk file while parsing config file

This patch performs some very basic validation of the virtual disk
file passed through the config file. The validation ensures that we
do not go too far with initialization (e.g. spawning qemu) while the
configuration may have fundamental problems.

[ Patch fixed up to work with PHYSTYPE_EMPTY 22808:6ec61438713a -iwj ]

Signed-off-by: Kamala Narasimhan <kamala.narasimhan@citrix.com>
Acked-by: Ian Jackson <ian.jackson@eu.citrix.com>
Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
Committed-by: Ian Jackson <ian.jackson@eu.citrix.com>
Author:  Kamala Narasimhan <kamala.narasimhan@gmail.com>
Date:    Tue Jan 25 18:09:49 2011 +0000 (2011-01-25)
Parent:  c3e478eafabc
1 /******************************************************************************
2 * page_alloc.c
3 *
4 * Simple buddy heap allocator for Xen.
5 *
6 * Copyright (c) 2002-2004 K A Fraser
7 * Copyright (c) 2006 IBM Ryan Harper <ryanh@us.ibm.com>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/init.h>
26 #include <xen/types.h>
27 #include <xen/lib.h>
28 #include <xen/sched.h>
29 #include <xen/spinlock.h>
30 #include <xen/mm.h>
31 #include <xen/irq.h>
32 #include <xen/softirq.h>
33 #include <xen/domain_page.h>
34 #include <xen/keyhandler.h>
35 #include <xen/perfc.h>
36 #include <xen/numa.h>
37 #include <xen/nodemask.h>
38 #include <xen/tmem.h>
39 #include <xen/tmem_xen.h>
40 #include <public/sysctl.h>
41 #include <asm/page.h>
42 #include <asm/numa.h>
43 #include <asm/flushtlb.h>
45 /*
46 * Comma-separated list of hexadecimal page numbers containing bad bytes.
47 * e.g. 'badpage=0x3f45,0x8a321'.
48 */
49 static char __initdata opt_badpage[100] = "";
50 string_param("badpage", opt_badpage);
52 /*
53 * no-bootscrub -> Free pages are not zeroed during boot.
54 */
55 static bool_t opt_bootscrub __initdata = 1;
56 boolean_param("bootscrub", opt_bootscrub);
58 /*
59 * Bit width of the DMA heap -- used to override NUMA-node-first
60 * allocation strategy, which can otherwise exhaust low memory.
61 */
62 static unsigned int dma_bitsize;
63 integer_param("dma_bits", dma_bitsize);
65 #define round_pgdown(_p) ((_p)&PAGE_MASK)
66 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
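/* e.g. with 4kB pages, round_pgdown(0x1234) == 0x1000 and
 * round_pgup(0x1234) == 0x2000. */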
68 /* Offlined page list, protected by heap_lock. */
69 PAGE_LIST_HEAD(page_offlined_list);
70 /* Broken page list, protected by heap_lock. */
71 PAGE_LIST_HEAD(page_broken_list);
73 /*************************
74 * BOOT-TIME ALLOCATOR
75 */
77 static unsigned long __initdata first_valid_mfn = ~0UL;
79 static struct bootmem_region {
80 unsigned long s, e; /* MFNs @s through @e-1 inclusive are free */
81 } *__initdata bootmem_region_list;
82 static unsigned int __initdata nr_bootmem_regions;
84 static void __init boot_bug(int line)
85 {
86 panic("Boot BUG at %s:%d\n", __FILE__, line);
87 }
88 #define BOOT_BUG_ON(p) if ( p ) boot_bug(__LINE__);
90 static void __init bootmem_region_add(unsigned long s, unsigned long e)
91 {
92 unsigned int i;
94 if ( (bootmem_region_list == NULL) && (s < e) )
95 bootmem_region_list = mfn_to_virt(s++);
97 if ( s >= e )
98 return;
100 for ( i = 0; i < nr_bootmem_regions; i++ )
101 if ( s < bootmem_region_list[i].e )
102 break;
104 BOOT_BUG_ON((i < nr_bootmem_regions) && (e > bootmem_region_list[i].s));
105 BOOT_BUG_ON(nr_bootmem_regions ==
106 (PAGE_SIZE / sizeof(struct bootmem_region)));
108 memmove(&bootmem_region_list[i+1], &bootmem_region_list[i],
109 (nr_bootmem_regions - i) * sizeof(*bootmem_region_list));
110 bootmem_region_list[i] = (struct bootmem_region) { s, e };
111 nr_bootmem_regions++;
112 }
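/* Remove the range [s, e) from the boot allocator, trimming or splitting
 * any free region it overlaps. */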
114 static void __init bootmem_region_zap(unsigned long s, unsigned long e)
115 {
116 unsigned int i;
118 for ( i = 0; i < nr_bootmem_regions; i++ )
119 {
120 struct bootmem_region *r = &bootmem_region_list[i];
121 if ( e <= r->s )
122 break;
123 if ( s >= r->e )
124 continue;
125 if ( s <= r->s )
126 {
127 r->s = min(e, r->e);
128 }
129 else if ( e >= r->e )
130 {
131 r->e = s;
132 }
133 else
134 {
135 unsigned long _e = r->e;
136 r->e = s;
137 bootmem_region_add(e, _e);
138 }
139 }
140 }
142 void __init init_boot_pages(paddr_t ps, paddr_t pe)
143 {
144 unsigned long bad_spfn, bad_epfn;
145 const char *p;
147 ps = round_pgup(ps);
148 pe = round_pgdown(pe);
149 if ( pe <= ps )
150 return;
152 first_valid_mfn = min_t(unsigned long, ps >> PAGE_SHIFT, first_valid_mfn);
154 bootmem_region_add(ps >> PAGE_SHIFT, pe >> PAGE_SHIFT);
156 /* Check new pages against the bad-page list. */
157 p = opt_badpage;
158 while ( *p != '\0' )
159 {
160 bad_spfn = simple_strtoul(p, &p, 0);
161 bad_epfn = bad_spfn;
163 if ( *p == '-' )
164 {
165 p++;
166 bad_epfn = simple_strtoul(p, &p, 0);
167 if ( bad_epfn < bad_spfn )
168 bad_epfn = bad_spfn;
169 }
171 if ( *p == ',' )
172 p++;
173 else if ( *p != '\0' )
174 break;
176 if ( bad_epfn == bad_spfn )
177 printk("Marking page %lx as bad\n", bad_spfn);
178 else
179 printk("Marking pages %lx through %lx as bad\n",
180 bad_spfn, bad_epfn);
182 bootmem_region_zap(bad_spfn, bad_epfn+1);
183 }
184 }
186 unsigned long __init alloc_boot_pages(
187 unsigned long nr_pfns, unsigned long pfn_align)
188 {
189 unsigned long pg, _e;
190 int i;
192 for ( i = nr_bootmem_regions - 1; i >= 0; i-- )
193 {
194 struct bootmem_region *r = &bootmem_region_list[i];
195 pg = (r->e - nr_pfns) & ~(pfn_align - 1);
196 if ( pg < r->s )
197 continue;
198 _e = r->e;
199 r->e = pg;
200 bootmem_region_add(pg + nr_pfns, _e);
201 return pg;
202 }
204 BOOT_BUG_ON(1);
205 return 0;
206 }
210 /*************************
211 * BINARY BUDDY ALLOCATOR
212 */
214 #define MEMZONE_XEN 0
215 #define NR_ZONES (PADDR_BITS - PAGE_SHIFT)
217 #define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 0 : ((b) - PAGE_SHIFT - 1))
218 #define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN : \
219 (fls(page_to_mfn(pg)) - 1))
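/* Zone z (z > MEMZONE_XEN) holds pages whose MFN lies in [2^z, 2^(z+1)),
 * i.e. pages addressable with at most (z + PAGE_SHIFT + 1) physical address
 * bits; this is what lets callers bound allocations by DMA width. */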
221 typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
222 static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
223 #define heap(node, zone, order) ((*_heap[node])[zone][order])
225 static unsigned long *avail[MAX_NUMNODES];
226 static long total_avail_pages;
228 /* TMEM: Reserve a fraction of memory for mid-size (0<order<9) allocations.*/
229 static long midsize_alloc_zone_pages;
230 #define MIDSIZE_ALLOC_FRAC 128
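/* Roughly 1/128 of the heap is held back: when tmem has absorbed most free
 * memory, order-0 and order>=9 requests are diverted to tmem, while mid-size
 * (order 1..8) requests may still dip into the reserve. */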
232 static DEFINE_SPINLOCK(heap_lock);
234 static unsigned long init_node_heap(int node, unsigned long mfn,
235 unsigned long nr, bool_t *use_tail)
236 {
237 /* First node to be discovered has its heap metadata statically allocated. */
238 static heap_by_zone_and_order_t _heap_static;
239 static unsigned long avail_static[NR_ZONES];
240 static int first_node_initialised;
241 unsigned long needed = (sizeof(**_heap) +
242 sizeof(**avail) * NR_ZONES +
243 PAGE_SIZE - 1) >> PAGE_SHIFT;
244 int i, j;
246 if ( !first_node_initialised )
247 {
248 _heap[node] = &_heap_static;
249 avail[node] = avail_static;
250 first_node_initialised = 1;
251 needed = 0;
252 }
253 #ifdef DIRECTMAP_VIRT_END
254 else if ( *use_tail && nr >= needed &&
255 (mfn + nr) <= (virt_to_mfn(DIRECTMAP_VIRT_END - 1) + 1) )
256 {
257 _heap[node] = mfn_to_virt(mfn + nr - needed);
258 avail[node] = mfn_to_virt(mfn + nr - 1) +
259 PAGE_SIZE - sizeof(**avail) * NR_ZONES;
260 }
261 else if ( nr >= needed &&
262 (mfn + needed) <= (virt_to_mfn(DIRECTMAP_VIRT_END - 1) + 1) )
263 {
264 _heap[node] = mfn_to_virt(mfn);
265 avail[node] = mfn_to_virt(mfn + needed - 1) +
266 PAGE_SIZE - sizeof(**avail) * NR_ZONES;
267 *use_tail = 0;
268 }
269 #endif
270 else if ( get_order_from_bytes(sizeof(**_heap)) ==
271 get_order_from_pages(needed) )
272 {
273 _heap[node] = alloc_xenheap_pages(get_order_from_pages(needed), 0);
274 BUG_ON(!_heap[node]);
275 avail[node] = (void *)_heap[node] + (needed << PAGE_SHIFT) -
276 sizeof(**avail) * NR_ZONES;
277 needed = 0;
278 }
279 else
280 {
281 _heap[node] = xmalloc(heap_by_zone_and_order_t);
282 avail[node] = xmalloc_array(unsigned long, NR_ZONES);
283 BUG_ON(!_heap[node] || !avail[node]);
284 needed = 0;
285 }
287 memset(avail[node], 0, NR_ZONES * sizeof(long));
289 for ( i = 0; i < NR_ZONES; i++ )
290 for ( j = 0; j <= MAX_ORDER; j++ )
291 INIT_PAGE_LIST_HEAD(&(*_heap[node])[i][j]);
293 return needed;
294 }
296 /* Allocate 2^@order contiguous pages. */
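/* Search order: the preferred node first, scanning zones from zone_hi down
 * to zone_lo and orders from @order upwards; then the other nodes in the
 * domain's affinity mask, then any remaining online node (unless
 * MEMF_exact_node). A larger block, once found, is split in half repeatedly
 * until it matches the requested order. */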
297 static struct page_info *alloc_heap_pages(
298 unsigned int zone_lo, unsigned int zone_hi,
299 unsigned int order, unsigned int memflags,
300 struct domain *d)
301 {
302 unsigned int first_node, i, j, zone = 0, nodemask_retry = 0;
303 unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1);
304 unsigned long request = 1UL << order;
305 cpumask_t extra_cpus_mask, mask;
306 struct page_info *pg;
307 nodemask_t nodemask = (d != NULL ) ? d->node_affinity : node_online_map;
309 if ( node == NUMA_NO_NODE )
310 {
311 memflags &= ~MEMF_exact_node;
312 if ( d != NULL )
313 {
314 node = next_node(d->last_alloc_node, nodemask);
315 if ( node >= MAX_NUMNODES )
316 node = first_node(nodemask);
317 }
318 if ( node >= MAX_NUMNODES )
319 node = cpu_to_node(smp_processor_id());
320 }
321 first_node = node;
323 ASSERT(node >= 0);
324 ASSERT(zone_lo <= zone_hi);
325 ASSERT(zone_hi < NR_ZONES);
327 if ( unlikely(order > MAX_ORDER) )
328 return NULL;
330 spin_lock(&heap_lock);
332 /*
333 * TMEM: When available memory is scarce due to tmem absorbing it, allow
334 * only mid-size allocations to avoid worst of fragmentation issues.
335 * Others try tmem pools then fail. This is a workaround until all
336 * post-dom0-creation-multi-page allocations can be eliminated.
337 */
338 if ( opt_tmem && ((order == 0) || (order >= 9)) &&
339 (total_avail_pages <= midsize_alloc_zone_pages) &&
340 tmem_freeable_pages() )
341 goto try_tmem;
343 /*
344 * Start with the requested node, but exhaust all node memory in the
345 * requested zone before failing; only compute a new node if we fail to find
346 * memory in the target node, which avoids needless work on the fast path.
347 */
348 for ( ; ; )
349 {
350 zone = zone_hi;
351 do {
352 /* Check if target node can support the allocation. */
353 if ( !avail[node] || (avail[node][zone] < request) )
354 continue;
356 /* Find smallest order which can satisfy the request. */
357 for ( j = order; j <= MAX_ORDER; j++ )
358 if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
359 goto found;
360 } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
362 if ( memflags & MEMF_exact_node )
363 goto not_found;
365 /* Pick next node. */
366 if ( !node_isset(node, nodemask) )
367 {
368 /* Very first node may be caller-specified and outside nodemask. */
369 ASSERT(!nodemask_retry);
370 first_node = node = first_node(nodemask);
371 if ( node < MAX_NUMNODES )
372 continue;
373 }
374 else if ( (node = next_node(node, nodemask)) >= MAX_NUMNODES )
375 node = first_node(nodemask);
376 if ( node == first_node )
377 {
378 /* When we have tried all in nodemask, we fall back to others. */
379 if ( nodemask_retry++ )
380 goto not_found;
381 nodes_andnot(nodemask, node_online_map, nodemask);
382 first_node = node = first_node(nodemask);
383 if ( node >= MAX_NUMNODES )
384 goto not_found;
385 }
386 }
388 try_tmem:
389 /* Try to free memory from tmem */
390 if ( (pg = tmem_relinquish_pages(order, memflags)) != NULL )
391 {
392 /* reassigning an already allocated anonymous heap page */
393 spin_unlock(&heap_lock);
394 return pg;
395 }
397 not_found:
398 /* No suitable memory blocks. Fail the request. */
399 spin_unlock(&heap_lock);
400 return NULL;
402 found:
403 /* We may have to halve the chunk a number of times. */
404 while ( j != order )
405 {
406 PFN_ORDER(pg) = --j;
407 page_list_add_tail(pg, &heap(node, zone, j));
408 pg += 1 << j;
409 }
411 ASSERT(avail[node][zone] >= request);
412 avail[node][zone] -= request;
413 total_avail_pages -= request;
414 ASSERT(total_avail_pages >= 0);
416 if ( d != NULL )
417 d->last_alloc_node = node;
419 cpus_clear(mask);
421 for ( i = 0; i < (1 << order); i++ )
422 {
423 /* Reference count must continuously be zero for free pages. */
424 BUG_ON(pg[i].count_info != PGC_state_free);
425 pg[i].count_info = PGC_state_inuse;
427 if ( pg[i].u.free.need_tlbflush )
428 {
429 /* Add in extra CPUs that need flushing because of this page. */
430 cpus_andnot(extra_cpus_mask, cpu_online_map, mask);
431 tlbflush_filter(extra_cpus_mask, pg[i].tlbflush_timestamp);
432 cpus_or(mask, mask, extra_cpus_mask);
433 }
435 /* Initialise fields which have other uses for free pages. */
436 pg[i].u.inuse.type_info = 0;
437 page_set_owner(&pg[i], NULL);
438 }
440 spin_unlock(&heap_lock);
442 if ( unlikely(!cpus_empty(mask)) )
443 {
444 perfc_incr(need_flush_tlb_flush);
445 flush_tlb_mask(&mask);
446 }
448 return pg;
449 }
451 /* Remove any offlined page in the buddy pointed to by head. */
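/* The buddy is broken into the largest power-of-two chunks that contain no
 * offlined pages, and those go back on the free lists; the offlined pages
 * themselves are moved to page_offlined_list or page_broken_list. */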
452 static int reserve_offlined_page(struct page_info *head)
453 {
454 unsigned int node = phys_to_nid(page_to_maddr(head));
455 int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
456 struct page_info *cur_head;
457 int cur_order;
459 ASSERT(spin_is_locked(&heap_lock));
461 cur_head = head;
463 page_list_del(head, &heap(node, zone, head_order));
465 while ( cur_head < (head + (1 << head_order)) )
466 {
467 struct page_info *pg;
468 int next_order;
470 if ( page_state_is(cur_head, offlined) )
471 {
472 cur_head++;
473 continue;
474 }
476 next_order = cur_order = 0;
478 while ( cur_order < head_order )
479 {
480 next_order = cur_order + 1;
482 if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)) )
483 goto merge;
485 for ( i = (1 << cur_order), pg = cur_head + (1 << cur_order );
486 i < (1 << next_order);
487 i++, pg++ )
488 if ( page_state_is(pg, offlined) )
489 break;
490 if ( i == ( 1 << next_order) )
491 {
492 cur_order = next_order;
493 continue;
494 }
495 else
496 {
497 merge:
498 /* We don't consider merging outside the head_order. */
499 page_list_add_tail(cur_head, &heap(node, zone, cur_order));
500 PFN_ORDER(cur_head) = cur_order;
501 cur_head += (1 << cur_order);
502 break;
503 }
504 }
505 }
507 for ( cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++ )
508 {
509 if ( !page_state_is(cur_head, offlined) )
510 continue;
512 avail[node][zone]--;
513 total_avail_pages--;
514 ASSERT(total_avail_pages >= 0);
516 page_list_add_tail(cur_head,
517 test_bit(_PGC_broken, &cur_head->count_info) ?
518 &page_broken_list : &page_offlined_list);
520 count++;
521 }
523 return count;
524 }
526 /* Free 2^@order set of pages. */
527 static void free_heap_pages(
528 struct page_info *pg, unsigned int order)
529 {
530 unsigned long mask;
531 unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
532 unsigned int zone = page_to_zone(pg);
534 ASSERT(order <= MAX_ORDER);
535 ASSERT(node >= 0);
537 spin_lock(&heap_lock);
539 for ( i = 0; i < (1 << order); i++ )
540 {
541 /*
542 * Cannot assume that count_info == 0, as there are some corner cases
543 * where it isn't the case and yet it isn't a bug:
544 * 1. page_get_owner() is NULL
545 * 2. page_get_owner() is a domain that was never accessible by
546 * its domid (e.g., failed to fully construct the domain).
547 * 3. page was never addressable by the guest (e.g., it's an
548 * auto-translate-physmap guest and the page was never included
549 * in its pseudophysical address space).
550 * In all the above cases there can be no guest mappings of this page.
551 */
552 ASSERT(!page_state_is(&pg[i], offlined));
553 pg[i].count_info =
554 ((pg[i].count_info & PGC_broken) |
555 (page_state_is(&pg[i], offlining)
556 ? PGC_state_offlined : PGC_state_free));
557 if ( page_state_is(&pg[i], offlined) )
558 tainted = 1;
560 /* If a page has no owner it will need no safety TLB flush. */
561 pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
562 if ( pg[i].u.free.need_tlbflush )
563 pg[i].tlbflush_timestamp = tlbflush_current_time();
564 }
566 avail[node][zone] += 1 << order;
567 total_avail_pages += 1 << order;
569 if ( opt_tmem )
570 midsize_alloc_zone_pages = max(
571 midsize_alloc_zone_pages, total_avail_pages / MIDSIZE_ALLOC_FRAC);
573 /* Merge chunks as far as possible. */
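/* The buddy of the 2^order block at mfn is at mfn ^ (1 << order); the bit
 * test below decides whether that buddy precedes or follows this block. */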
574 while ( order < MAX_ORDER )
575 {
576 mask = 1UL << order;
578 if ( (page_to_mfn(pg) & mask) )
579 {
580 /* Merge with predecessor block? */
581 if ( !mfn_valid(page_to_mfn(pg-mask)) ||
582 !page_state_is(pg-mask, free) ||
583 (PFN_ORDER(pg-mask) != order) ||
584 (phys_to_nid(page_to_maddr(pg-mask)) != node) )
585 break;
586 pg -= mask;
587 page_list_del(pg, &heap(node, zone, order));
588 }
589 else
590 {
591 /* Merge with successor block? */
592 if ( !mfn_valid(page_to_mfn(pg+mask)) ||
593 !page_state_is(pg+mask, free) ||
594 (PFN_ORDER(pg+mask) != order) ||
595 (phys_to_nid(page_to_maddr(pg+mask)) != node) )
596 break;
597 page_list_del(pg + mask, &heap(node, zone, order));
598 }
600 order++;
601 }
603 PFN_ORDER(pg) = order;
604 page_list_add_tail(pg, &heap(node, zone, order));
606 if ( tainted )
607 reserve_offlined_page(pg);
609 spin_unlock(&heap_lock);
610 }
613 /*
614 * A page can be in one of the following states:
615 * free and online; free and offlined; free and offlined and broken;
616 * assigned and online; assigned and offlining; assigned and offlining and broken
617 *
618 * The following rules apply to page offlining:
619 * Once a page is broken, it cannot be assigned anymore.
620 * A page will be offlined only if it is free.
621 * Returns the original count_info.
622 */
623 static unsigned long mark_page_offline(struct page_info *pg, int broken)
624 {
625 unsigned long nx, x, y = pg->count_info;
627 ASSERT(page_is_ram_type(page_to_mfn(pg), RAM_TYPE_CONVENTIONAL));
628 ASSERT(spin_is_locked(&heap_lock));
630 do {
631 nx = x = y;
633 if ( ((x & PGC_state) != PGC_state_offlined) &&
634 ((x & PGC_state) != PGC_state_offlining) )
635 {
636 nx &= ~PGC_state;
637 nx |= (((x & PGC_state) == PGC_state_free)
638 ? PGC_state_offlined : PGC_state_offlining);
639 }
641 if ( broken )
642 nx |= PGC_broken;
644 if ( x == nx )
645 break;
646 } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
648 return y;
649 }
651 static int reserve_heap_page(struct page_info *pg)
652 {
653 struct page_info *head = NULL;
654 unsigned int i, node = phys_to_nid(page_to_maddr(pg));
655 unsigned int zone = page_to_zone(pg);
657 for ( i = 0; i <= MAX_ORDER; i++ )
658 {
659 struct page_info *tmp;
661 if ( page_list_empty(&heap(node, zone, i)) )
662 continue;
664 page_list_for_each_safe ( head, tmp, &heap(node, zone, i) )
665 {
666 if ( (head <= pg) &&
667 (head + (1UL << i) > pg) )
668 return reserve_offlined_page(head);
669 }
670 }
672 return -EINVAL;
674 }
676 int offline_page(unsigned long mfn, int broken, uint32_t *status)
677 {
678 unsigned long old_info = 0;
679 struct domain *owner;
680 int ret = 0;
681 struct page_info *pg;
683 if ( !mfn_valid(mfn) )
684 {
685 dprintk(XENLOG_WARNING,
686 "try to offline page out of range %lx\n", mfn);
687 return -EINVAL;
688 }
690 *status = 0;
691 pg = mfn_to_page(mfn);
693 if ( is_xen_fixed_mfn(mfn) )
694 {
695 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED |
696 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
697 return -EPERM;
698 }
700 /*
701 * N.B. xen's txt in x86_64 is marked reserved and handled already.
702 * Also kexec range is reserved.
703 */
704 if ( !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
705 {
706 *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM;
707 return -EINVAL;
708 }
710 spin_lock(&heap_lock);
712 old_info = mark_page_offline(pg, broken);
714 if ( page_state_is(pg, free) )
715 {
716 /* Free pages are reserved directly. */
717 reserve_heap_page(pg);
718 *status = PG_OFFLINE_OFFLINED;
719 }
720 else if ( page_state_is(pg, offlined) )
721 {
722 *status = PG_OFFLINE_OFFLINED;
723 }
724 else if ( (owner = page_get_owner_and_reference(pg)) )
725 {
726 *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING |
727 (owner->domain_id << PG_OFFLINE_OWNER_SHIFT);
728 /* Release the reference since it will not be allocated anymore */
729 put_page(pg);
730 }
731 else if ( old_info & PGC_xen_heap )
732 {
733 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING |
734 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
735 }
736 else
737 {
738 /*
739 * assign_pages does not hold heap_lock, so there is a small window in which
740 * an owner may still be set; note that the owner can only change from NULL
741 * to non-NULL, never the reverse, since the page is offlining now.
742 * There is no window if called from the #MC handler, since all CPUs are in
743 * softirq context. If called from user space (e.g. CE handling), tools can
744 * wait some time before calling again.
745 */
746 *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED |
747 (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT );
748 }
750 if ( broken )
751 *status |= PG_OFFLINE_BROKEN;
753 spin_unlock(&heap_lock);
755 return ret;
756 }
758 /*
759 * Online the memory.
760 * The caller should make sure end_pfn <= max_page,
761 * if not, expand_pages() should be called prior to online_page().
762 */
763 unsigned int online_page(unsigned long mfn, uint32_t *status)
764 {
765 unsigned long x, nx, y;
766 struct page_info *pg;
767 int ret;
769 if ( !mfn_valid(mfn) )
770 {
771 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
772 return -EINVAL;
773 }
775 pg = mfn_to_page(mfn);
777 spin_lock(&heap_lock);
779 y = pg->count_info;
780 do {
781 ret = *status = 0;
783 if ( y & PGC_broken )
784 {
785 ret = -EINVAL;
786 *status = PG_ONLINE_FAILED |PG_ONLINE_BROKEN;
787 break;
788 }
790 if ( (y & PGC_state) == PGC_state_offlined )
791 {
792 page_list_del(pg, &page_offlined_list);
793 *status = PG_ONLINE_ONLINED;
794 }
795 else if ( (y & PGC_state) == PGC_state_offlining )
796 {
797 *status = PG_ONLINE_ONLINED;
798 }
799 else
800 {
801 break;
802 }
804 x = y;
805 nx = (x & ~PGC_state) | PGC_state_inuse;
806 } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
808 spin_unlock(&heap_lock);
810 if ( (y & PGC_state) == PGC_state_offlined )
811 free_heap_pages(pg, 0);
813 return ret;
814 }
816 int query_page_offline(unsigned long mfn, uint32_t *status)
817 {
818 struct page_info *pg;
820 if ( !mfn_valid(mfn) || !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
821 {
822 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
823 return -EINVAL;
824 }
826 *status = 0;
827 spin_lock(&heap_lock);
829 pg = mfn_to_page(mfn);
831 if ( page_state_is(pg, offlining) )
832 *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING;
833 if ( pg->count_info & PGC_broken )
834 *status |= PG_OFFLINE_STATUS_BROKEN;
835 if ( page_state_is(pg, offlined) )
836 *status |= PG_OFFLINE_STATUS_OFFLINED;
838 spin_unlock(&heap_lock);
840 return 0;
841 }
843 /*
844 * Hand the specified arbitrary page range to the specified heap zone
845 * checking the node_id of the previous page. If they differ and the
846 * latter is not on a MAX_ORDER boundary, then we reserve the page by
847 * not freeing it to the buddy allocator.
848 */
849 static void init_heap_pages(
850 struct page_info *pg, unsigned long nr_pages)
851 {
852 unsigned long i;
854 for ( i = 0; i < nr_pages; i++ )
855 {
856 unsigned int nid = phys_to_nid(page_to_maddr(pg+i));
858 if ( unlikely(!avail[nid]) )
859 {
860 unsigned long s = page_to_mfn(pg + i);
861 unsigned long e = page_to_mfn(pg + nr_pages - 1) + 1;
862 bool_t use_tail = (nid == phys_to_nid(pfn_to_paddr(e - 1))) &&
863 !(s & ((1UL << MAX_ORDER) - 1)) &&
864 (find_first_set_bit(e) <= find_first_set_bit(s));
865 unsigned long n;
867 n = init_node_heap(nid, page_to_mfn(pg+i), nr_pages - i,
868 &use_tail);
869 BUG_ON(i + n > nr_pages);
870 if ( n && !use_tail )
871 {
872 i += n - 1;
873 continue;
874 }
875 if ( i + n == nr_pages )
876 break;
877 nr_pages -= n;
878 }
880 free_heap_pages(pg+i, 0);
881 }
882 }
884 static unsigned long avail_heap_pages(
885 unsigned int zone_lo, unsigned int zone_hi, unsigned int node)
886 {
887 unsigned int i, zone;
888 unsigned long free_pages = 0;
890 if ( zone_hi >= NR_ZONES )
891 zone_hi = NR_ZONES - 1;
893 for_each_online_node(i)
894 {
895 if ( !avail[i] )
896 continue;
897 for ( zone = zone_lo; zone <= zone_hi; zone++ )
898 if ( (node == -1) || (node == i) )
899 free_pages += avail[i][zone];
900 }
902 return free_pages;
903 }
905 unsigned long total_free_pages(void)
906 {
907 return total_avail_pages - midsize_alloc_zone_pages;
908 }
910 void __init end_boot_allocator(void)
911 {
912 unsigned int i;
914 /* Pages that are free now go to the domain sub-allocator. */
915 for ( i = 0; i < nr_bootmem_regions; i++ )
916 {
917 struct bootmem_region *r = &bootmem_region_list[i];
918 if ( (r->s < r->e) &&
919 (phys_to_nid(pfn_to_paddr(r->s)) == cpu_to_node(0)) )
920 {
921 init_heap_pages(mfn_to_page(r->s), r->e - r->s);
922 r->e = r->s;
923 break;
924 }
925 }
926 for ( i = nr_bootmem_regions; i-- > 0; )
927 {
928 struct bootmem_region *r = &bootmem_region_list[i];
929 if ( r->s < r->e )
930 init_heap_pages(mfn_to_page(r->s), r->e - r->s);
931 }
932 init_heap_pages(virt_to_page(bootmem_region_list), 1);
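/* With more than one node and no explicit dma_bits= option, pick a default:
 * on x86, roughly enough bits to cover a quarter of node 0's memory, capped
 * at 32 bits; otherwise 32 bits. */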
934 if ( !dma_bitsize && (num_online_nodes() > 1) )
935 {
936 #ifdef CONFIG_X86
937 dma_bitsize = min_t(unsigned int,
938 fls(NODE_DATA(0)->node_spanned_pages) - 1
939 + PAGE_SHIFT - 2,
940 32);
941 #else
942 dma_bitsize = 32;
943 #endif
944 }
946 printk("Domain heap initialised");
947 if ( dma_bitsize )
948 printk(" DMA width %u bits", dma_bitsize);
949 printk("\n");
950 }
952 /*
953 * Scrub all unallocated pages in all heap zones. This function is more
954 * convoluted than appears necessary because we do not want to continuously
955 * hold the lock while scrubbing very large memory areas.
956 */
957 void __init scrub_heap_pages(void)
958 {
959 unsigned long mfn;
960 struct page_info *pg;
962 if ( !opt_bootscrub )
963 return;
965 printk("Scrubbing Free RAM: ");
967 for ( mfn = first_valid_mfn; mfn < max_page; mfn++ )
968 {
969 process_pending_softirqs();
971 pg = mfn_to_page(mfn);
973 /* Quick lock-free check. */
974 if ( !mfn_valid(mfn) || !page_state_is(pg, free) )
975 continue;
977 /* Every 100MB, print a progress dot. */
978 if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
979 printk(".");
981 spin_lock(&heap_lock);
983 /* Re-check page status with lock held. */
984 if ( page_state_is(pg, free) )
985 scrub_one_page(pg);
987 spin_unlock(&heap_lock);
988 }
990 printk("done.\n");
991 }
995 /*************************
996 * XEN-HEAP SUB-ALLOCATOR
997 */
999 #if !defined(__x86_64__) && !defined(__ia64__)
1001 void init_xenheap_pages(paddr_t ps, paddr_t pe)
1002 {
1003 ps = round_pgup(ps);
1004 pe = round_pgdown(pe);
1005 if ( pe <= ps )
1006 return;
1008 /*
1009 * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to
1010 * prevent merging of power-of-two blocks across the zone boundary.
1011 */
1012 if ( ps && !is_xen_heap_mfn(paddr_to_pfn(ps)-1) )
1013 ps += PAGE_SIZE;
1014 if ( !is_xen_heap_mfn(paddr_to_pfn(pe)) )
1015 pe -= PAGE_SIZE;
1017 memguard_guard_range(maddr_to_virt(ps), pe - ps);
1019 init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT);
1020 }
1023 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
1024 {
1025 struct page_info *pg;
1027 ASSERT(!in_irq());
1029 pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
1030 order, memflags, NULL);
1031 if ( unlikely(pg == NULL) )
1032 return NULL;
1034 memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));
1036 return page_to_virt(pg);
1037 }
1040 void free_xenheap_pages(void *v, unsigned int order)
1041 {
1042 ASSERT(!in_irq());
1044 if ( v == NULL )
1045 return;
1047 memguard_guard_range(v, 1 << (order + PAGE_SHIFT));
1049 free_heap_pages(virt_to_page(v), order);
1050 }
1052 #else
1054 void init_xenheap_pages(paddr_t ps, paddr_t pe)
1055 {
1056 init_domheap_pages(ps, pe);
1057 }
1059 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
1060 {
1061 struct page_info *pg;
1062 unsigned int i;
1064 ASSERT(!in_irq());
1066 pg = alloc_domheap_pages(NULL, order, memflags);
1067 if ( unlikely(pg == NULL) )
1068 return NULL;
1070 for ( i = 0; i < (1u << order); i++ )
1071 pg[i].count_info |= PGC_xen_heap;
1073 return page_to_virt(pg);
1074 }
1076 void free_xenheap_pages(void *v, unsigned int order)
1077 {
1078 struct page_info *pg;
1079 unsigned int i;
1081 ASSERT(!in_irq());
1083 if ( v == NULL )
1084 return;
1086 pg = virt_to_page(v);
1088 for ( i = 0; i < (1u << order); i++ )
1089 pg[i].count_info &= ~PGC_xen_heap;
1091 free_heap_pages(pg, order);
1092 }
1094 #endif
1098 /*************************
1099 * DOMAIN-HEAP SUB-ALLOCATOR
1100 */
1102 void init_domheap_pages(paddr_t ps, paddr_t pe)
1103 {
1104 unsigned long smfn, emfn;
1106 ASSERT(!in_irq());
1108 smfn = round_pgup(ps) >> PAGE_SHIFT;
1109 emfn = round_pgdown(pe) >> PAGE_SHIFT;
1111 init_heap_pages(mfn_to_page(smfn), emfn - smfn);
1112 }
1115 int assign_pages(
1116 struct domain *d,
1117 struct page_info *pg,
1118 unsigned int order,
1119 unsigned int memflags)
1120 {
1121 unsigned long i;
1123 spin_lock(&d->page_alloc_lock);
1125 if ( unlikely(d->is_dying) )
1126 {
1127 gdprintk(XENLOG_INFO, "Cannot assign page to domain%d -- dying.\n",
1128 d->domain_id);
1129 goto fail;
1130 }
1132 if ( !(memflags & MEMF_no_refcount) )
1133 {
1134 if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
1135 {
1136 if ( !opt_tmem || order != 0 || d->tot_pages != d->max_pages )
1137 gdprintk(XENLOG_INFO, "Over-allocation for domain %u: "
1138 "%u > %u\n", d->domain_id,
1139 d->tot_pages + (1 << order), d->max_pages);
1140 goto fail;
1141 }
1143 if ( unlikely(d->tot_pages == 0) )
1144 get_knownalive_domain(d);
1146 d->tot_pages += 1 << order;
1147 }
1149 for ( i = 0; i < (1 << order); i++ )
1150 {
1151 ASSERT(page_get_owner(&pg[i]) == NULL);
1152 ASSERT((pg[i].count_info & ~(PGC_allocated | 1)) == 0);
1153 page_set_owner(&pg[i], d);
1154 wmb(); /* Domain pointer must be visible before updating refcnt. */
1155 pg[i].count_info = PGC_allocated | 1;
1156 page_list_add_tail(&pg[i], &d->page_list);
1157 }
1159 spin_unlock(&d->page_alloc_lock);
1160 return 0;
1162 fail:
1163 spin_unlock(&d->page_alloc_lock);
1164 return -1;
1165 }
1168 struct page_info *alloc_domheap_pages(
1169 struct domain *d, unsigned int order, unsigned int memflags)
1170 {
1171 struct page_info *pg = NULL;
1172 unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
1173 unsigned int dma_zone;
1175 ASSERT(!in_irq());
1177 bits = domain_clamp_alloc_bitsize(d, bits ? : (BITS_PER_LONG+PAGE_SHIFT));
1178 if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
1179 return NULL;
1181 if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
1182 pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d);
1184 if ( (pg == NULL) &&
1185 ((memflags & MEMF_no_dma) ||
1186 ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, order,
1187 memflags, d)) == NULL)) )
1188 return NULL;
1190 if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
1191 {
1192 free_heap_pages(pg, order);
1193 return NULL;
1194 }
1196 return pg;
1197 }
1199 void free_domheap_pages(struct page_info *pg, unsigned int order)
1200 {
1201 int i, drop_dom_ref;
1202 struct domain *d = page_get_owner(pg);
1204 ASSERT(!in_irq());
1206 if ( unlikely(is_xen_heap_page(pg)) )
1207 {
1208 /* NB. May recursively lock from relinquish_memory(). */
1209 spin_lock_recursive(&d->page_alloc_lock);
1211 for ( i = 0; i < (1 << order); i++ )
1212 page_list_del2(&pg[i], &d->xenpage_list, &d->arch.relmem_list);
1214 d->xenheap_pages -= 1 << order;
1215 drop_dom_ref = (d->xenheap_pages == 0);
1217 spin_unlock_recursive(&d->page_alloc_lock);
1218 }
1219 else if ( likely(d != NULL) && likely(d != dom_cow) )
1220 {
1221 /* NB. May recursively lock from relinquish_memory(). */
1222 spin_lock_recursive(&d->page_alloc_lock);
1224 for ( i = 0; i < (1 << order); i++ )
1225 {
1226 BUG_ON((pg[i].u.inuse.type_info & PGT_count_mask) != 0);
1227 page_list_del2(&pg[i], &d->page_list, &d->arch.relmem_list);
1228 }
1230 d->tot_pages -= 1 << order;
1231 drop_dom_ref = (d->tot_pages == 0);
1233 spin_unlock_recursive(&d->page_alloc_lock);
1235 /*
1236 * Normally we expect a domain to clear pages before freeing them, if
1237 * it cares about the secrecy of their contents. However, after a
1238 * domain has died we assume responsibility for erasure.
1239 */
1240 if ( unlikely(d->is_dying) )
1241 for ( i = 0; i < (1 << order); i++ )
1242 scrub_one_page(&pg[i]);
1244 free_heap_pages(pg, order);
1245 }
1246 else if ( unlikely(d == dom_cow) )
1247 {
1248 ASSERT(order == 0);
1249 scrub_one_page(pg);
1250 free_heap_pages(pg, 0);
1251 drop_dom_ref = 0;
1252 }
1253 else
1254 {
1255 /* Freeing anonymous domain-heap pages. */
1256 free_heap_pages(pg, order);
1257 drop_dom_ref = 0;
1258 }
1260 if ( drop_dom_ref )
1261 put_domain(d);
1262 }
1264 unsigned long avail_domheap_pages_region(
1265 unsigned int node, unsigned int min_width, unsigned int max_width)
1266 {
1267 int zone_lo, zone_hi;
1269 zone_lo = min_width ? bits_to_zone(min_width) : (MEMZONE_XEN + 1);
1270 zone_lo = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_lo));
1272 zone_hi = max_width ? bits_to_zone(max_width) : (NR_ZONES - 1);
1273 zone_hi = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_hi));
1275 return avail_heap_pages(zone_lo, zone_hi, node);
1276 }
1278 unsigned long avail_domheap_pages(void)
1279 {
1280 return avail_heap_pages(MEMZONE_XEN + 1,
1281 NR_ZONES - 1,
1282 -1);
1283 }
1285 unsigned long avail_node_heap_pages(unsigned int nodeid)
1286 {
1287 return avail_heap_pages(MEMZONE_XEN, NR_ZONES -1, nodeid);
1288 }
1291 static void pagealloc_info(unsigned char key)
1292 {
1293 unsigned int zone = MEMZONE_XEN;
1294 unsigned long n, total = 0;
1296 printk("Physical memory information:\n");
1297 printk(" Xen heap: %lukB free\n",
1298 avail_heap_pages(zone, zone, -1) << (PAGE_SHIFT-10));
1300 while ( ++zone < NR_ZONES )
1301 {
1302 if ( (zone + PAGE_SHIFT) == dma_bitsize )
1303 {
1304 printk(" DMA heap: %lukB free\n", total << (PAGE_SHIFT-10));
1305 total = 0;
1306 }
1308 if ( (n = avail_heap_pages(zone, zone, -1)) != 0 )
1309 {
1310 total += n;
1311 printk(" heap[%02u]: %lukB free\n", zone, n << (PAGE_SHIFT-10));
1312 }
1313 }
1315 printk(" Dom heap: %lukB free\n", total << (PAGE_SHIFT-10));
1316 }
1318 static struct keyhandler pagealloc_info_keyhandler = {
1319 .diagnostic = 1,
1320 .u.fn = pagealloc_info,
1321 .desc = "memory info"
1322 };
1324 static __init int pagealloc_keyhandler_init(void)
1325 {
1326 register_keyhandler('m', &pagealloc_info_keyhandler);
1327 return 0;
1328 }
1329 __initcall(pagealloc_keyhandler_init);
1332 void scrub_one_page(struct page_info *pg)
1333 {
1334 void *p = __map_domain_page(pg);
1336 if ( unlikely(pg->count_info & PGC_broken) )
1337 return;
1339 #ifndef NDEBUG
1340 /* Avoid callers relying on allocations returning zeroed pages. */
1341 memset(p, 0xc2, PAGE_SIZE);
1342 #else
1343 /* For a production build, clear_page() is the fastest way to scrub. */
1344 clear_page(p);
1345 #endif
1347 unmap_domain_page(p);
1348 }
1350 static void dump_heap(unsigned char key)
1351 {
1352 s_time_t now = NOW();
1353 int i, j;
1355 printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key,
1356 (u32)(now>>32), (u32)now);
1358 for ( i = 0; i < MAX_NUMNODES; i++ )
1359 {
1360 if ( !avail[i] )
1361 continue;
1362 for ( j = 0; j < NR_ZONES; j++ )
1363 printk("heap[node=%d][zone=%d] -> %lu pages\n",
1364 i, j, avail[i][j]);
1365 }
1366 }
1368 static struct keyhandler dump_heap_keyhandler = {
1369 .diagnostic = 1,
1370 .u.fn = dump_heap,
1371 .desc = "dump heap info"
1372 };
1374 static __init int register_heap_trigger(void)
1375 {
1376 register_keyhandler('H', &dump_heap_keyhandler);
1377 return 0;
1378 }
1379 __initcall(register_heap_trigger);
1381 /*
1382 * Local variables:
1383 * mode: C
1384 * c-set-style: "BSD"
1385 * c-basic-offset: 4
1386 * tab-width: 4
1387 * indent-tabs-mode: nil
1388 * End:
1389 */