debuggers.hg

annotate xen/common/page_alloc.c @ 22848:6341fe0f4e5a

Added tag 4.1.0-rc2 for changeset 9dca60d88c63
author Keir Fraser <keir@xen.org>
date Tue Jan 25 14:06:55 2011 +0000 (2011-01-25)
parents c3e478eafabc
children
rev   line source
iap10@274 1 /******************************************************************************
iap10@274 2 * page_alloc.c
iap10@274 3 *
kaf24@1249 4 * Simple buddy heap allocator for Xen.
iap10@274 5 *
kaf24@1971 6 * Copyright (c) 2002-2004 K A Fraser
kfraser@11932 7 * Copyright (c) 2006 IBM Ryan Harper <ryanh@us.ibm.com>
kaf24@804 8 *
kaf24@804 9 * This program is free software; you can redistribute it and/or modify
kaf24@804 10 * it under the terms of the GNU General Public License as published by
kaf24@804 11 * the Free Software Foundation; either version 2 of the License, or
kaf24@804 12 * (at your option) any later version.
kaf24@804 13 *
kaf24@804 14 * This program is distributed in the hope that it will be useful,
kaf24@804 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
kaf24@804 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
kaf24@804 17 * GNU General Public License for more details.
kaf24@804 18 *
kaf24@804 19 * You should have received a copy of the GNU General Public License
kaf24@804 20 * along with this program; if not, write to the Free Software
kaf24@804 21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
iap10@274 22 */
iap10@274 23
kaf24@1248 24 #include <xen/config.h>
kaf24@1248 25 #include <xen/init.h>
kaf24@1248 26 #include <xen/types.h>
kaf24@1248 27 #include <xen/lib.h>
kaf24@4254 28 #include <xen/sched.h>
kaf24@1248 29 #include <xen/spinlock.h>
kaf24@4877 30 #include <xen/mm.h>
kaf24@1979 31 #include <xen/irq.h>
kaf24@4305 32 #include <xen/softirq.h>
kaf24@5394 33 #include <xen/domain_page.h>
kaf24@9051 34 #include <xen/keyhandler.h>
awilliam@11068 35 #include <xen/perfc.h>
kfraser@11958 36 #include <xen/numa.h>
kfraser@11958 37 #include <xen/nodemask.h>
keir@19684 38 #include <xen/tmem.h>
keir@22502 39 #include <xen/tmem_xen.h>
keir@19324 40 #include <public/sysctl.h>
kaf24@4231 41 #include <asm/page.h>
keir@17421 42 #include <asm/numa.h>
Tim@15666 43 #include <asm/flushtlb.h>
iap10@274 44
kaf24@3372 45 /*
kaf24@3372 46 * Comma-separated list of hexadecimal page numbers containing bad bytes;
kaf24@3372 47 * a '-'-separated range is also accepted. e.g. 'badpage=0x3f45,0x8a321-0x8a340'.
kaf24@3372 48 */
keir@20173 49 static char __initdata opt_badpage[100] = "";
kaf24@3372 50 string_param("badpage", opt_badpage);
iap10@274 51
kaf24@9541 52 /*
kfraser@15549 53 * no-bootscrub -> Free pages are not zeroed during boot.
kfraser@15549 54 */
keir@22676 55 static bool_t opt_bootscrub __initdata = 1;
kfraser@15549 56 boolean_param("bootscrub", opt_bootscrub);
kfraser@15549 57
kfraser@15549 58 /*
keir@18195 59 * Bit width of the DMA heap -- used to override the NUMA-node-first
keir@18195 60 * allocation strategy, which can otherwise exhaust low memory.
kfraser@12638 61 */
keir@18195 62 static unsigned int dma_bitsize;
keir@18195 63 integer_param("dma_bits", dma_bitsize);
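/*
 * Illustrative arithmetic for this option (figures assumed, not from the
 * original source), taking x86 with PAGE_SHIFT == 12: booting with
 * "dma_bits=30" gives bits_to_zone(30) == 30 - 12 - 1 == 17, so
 * alloc_domheap_pages() below first tries zones 18 and up (frames at or
 * above 1 GiB) and only falls back to the low zones afterwards, keeping
 * DMA-addressable memory in reserve.
 */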
kaf24@9541 64
kaf24@3392 65 #define round_pgdown(_p) ((_p)&PAGE_MASK)
kaf24@3392 66 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
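/*
 * Worked example, assuming 4 KiB pages (PAGE_SHIFT == 12, so
 * PAGE_MASK == ~0xfffUL):
 *   round_pgdown(0x12345) == 0x12000
 *   round_pgup(0x12345)   == 0x13000
 *   round_pgup(0x12000)   == 0x12000   (already page-aligned)
 */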
kaf24@3392 67
keir@19381 68 /* Offlined page list, protected by heap_lock. */
keir@19324 69 PAGE_LIST_HEAD(page_offlined_list);
keir@19381 70 /* Broken page list, protected by heap_lock. */
keir@19381 71 PAGE_LIST_HEAD(page_broken_list);
keir@19324 72
iap10@274 73 /*************************
kaf24@3392 74 * BOOT-TIME ALLOCATOR
kaf24@3392 75 */
kaf24@3392 76
keir@19952 77 static unsigned long __initdata first_valid_mfn = ~0UL;
keir@19952 78
keir@19952 79 static struct bootmem_region {
keir@19952 80 unsigned long s, e; /* MFNs @s through @e-1 inclusive are free */
keir@19952 81 } *__initdata bootmem_region_list;
keir@19952 82 static unsigned int __initdata nr_bootmem_regions;
keir@19952 83
keir@19952 84 static void __init boot_bug(int line)
keir@19952 85 {
keir@19952 86 panic("Boot BUG at %s:%d\n", __FILE__, line);
keir@19952 87 }
keir@19952 88 #define BOOT_BUG_ON(p) if ( p ) boot_bug(__LINE__);
kfraser@13058 89
keir@19952 90 static void __init bootmem_region_add(unsigned long s, unsigned long e)
kaf24@3392 91 {
keir@19952 92 unsigned int i;
keir@19952 93
keir@19952 94 if ( (bootmem_region_list == NULL) && (s < e) )
keir@19952 95 bootmem_region_list = mfn_to_virt(s++);
keir@19952 96
keir@19952 97 if ( s >= e )
keir@19952 98 return;
kaf24@6108 99
keir@19952 100 for ( i = 0; i < nr_bootmem_regions; i++ )
keir@19952 101 if ( s < bootmem_region_list[i].e )
keir@19952 102 break;
keir@19952 103
keir@19952 104 BOOT_BUG_ON((i < nr_bootmem_regions) && (e > bootmem_region_list[i].s));
keir@19952 105 BOOT_BUG_ON(nr_bootmem_regions ==
keir@19952 106 (PAGE_SIZE / sizeof(struct bootmem_region)));
kaf24@3392 107
keir@19952 108 memmove(&bootmem_region_list[i+1], &bootmem_region_list[i],
keir@19952 109 (nr_bootmem_regions - i) * sizeof(*bootmem_region_list));
keir@19952 110 bootmem_region_list[i] = (struct bootmem_region) { s, e };
keir@19952 111 nr_bootmem_regions++;
keir@19952 112 }
keir@19952 113
keir@19952 114 static void __init bootmem_region_zap(unsigned long s, unsigned long e)
keir@19952 115 {
keir@19952 116 unsigned int i;
kaf24@3392 117
keir@19952 118 for ( i = 0; i < nr_bootmem_regions; i++ )
keir@19952 119 {
keir@19952 120 struct bootmem_region *r = &bootmem_region_list[i];
keir@19952 121 if ( e <= r->s )
keir@19952 122 break;
keir@19952 123 if ( s >= r->e )
keir@19952 124 continue;
keir@19952 125 if ( s <= r->s )
keir@19952 126 {
keir@19952 127 r->s = min(e, r->e);
keir@19952 128 }
keir@19952 129 else if ( e >= r->e )
keir@19952 130 {
keir@19952 131 r->e = s;
keir@19952 132 }
keir@19952 133 else
keir@19952 134 {
keir@19952 135 unsigned long _e = r->e;
keir@19952 136 r->e = s;
keir@19952 137 bootmem_region_add(e, _e);
keir@19952 138 }
keir@19952 139 }
kaf24@3392 140 }
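/*
 * A minimal sketch of how the region list evolves (values assumed for
 * illustration only): starting from a single region [0x100, 0x200),
 *   bootmem_region_zap(0x150, 0x160)
 * trims the region to [0x100, 0x150) and re-adds the tail [0x160, 0x200)
 * via bootmem_region_add(), so the zapped hole simply drops out of the
 * free list.
 */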
kaf24@3392 141
keir@15081 142 void __init init_boot_pages(paddr_t ps, paddr_t pe)
kaf24@3392 143 {
keir@19952 144 unsigned long bad_spfn, bad_epfn;
kfraser@13158 145 const char *p;
kaf24@3392 146
kaf24@3392 147 ps = round_pgup(ps);
kaf24@3392 148 pe = round_pgdown(pe);
kaf24@5036 149 if ( pe <= ps )
kaf24@5036 150 return;
kaf24@3392 151
kfraser@13058 152 first_valid_mfn = min_t(unsigned long, ps >> PAGE_SHIFT, first_valid_mfn);
kfraser@13058 153
keir@19952 154 bootmem_region_add(ps >> PAGE_SHIFT, pe >> PAGE_SHIFT);
kaf24@3392 155
kaf24@3392 156 /* Check new pages against the bad-page list. */
kaf24@3392 157 p = opt_badpage;
kaf24@3392 158 while ( *p != '\0' )
kaf24@3392 159 {
kaf24@9576 160 bad_spfn = simple_strtoul(p, &p, 0);
kaf24@9576 161 bad_epfn = bad_spfn;
kaf24@9576 162
kaf24@9576 163 if ( *p == '-' )
kaf24@9576 164 {
kaf24@9576 165 p++;
kaf24@9576 166 bad_epfn = simple_strtoul(p, &p, 0);
kaf24@9576 167 if ( bad_epfn < bad_spfn )
kaf24@9576 168 bad_epfn = bad_spfn;
kaf24@9576 169 }
kaf24@3392 170
kaf24@3392 171 if ( *p == ',' )
kaf24@3392 172 p++;
kaf24@3392 173 else if ( *p != '\0' )
kaf24@3392 174 break;
kaf24@3392 175
kaf24@9576 176 if ( bad_epfn == bad_spfn )
kaf24@9576 177 printk("Marking page %lx as bad\n", bad_spfn);
kaf24@9576 178 else
kaf24@9576 179 printk("Marking pages %lx through %lx as bad\n",
kaf24@9576 180 bad_spfn, bad_epfn);
kaf24@9576 181
keir@19952 182 bootmem_region_zap(bad_spfn, bad_epfn+1);
kaf24@3392 183 }
kaf24@3392 184 }
kaf24@3392 185
keir@15081 186 unsigned long __init alloc_boot_pages(
kfraser@14088 187 unsigned long nr_pfns, unsigned long pfn_align)
kaf24@3392 188 {
keir@19952 189 unsigned long pg, _e;
keir@19952 190 int i;
kaf24@3392 191
keir@19952 192 for ( i = nr_bootmem_regions - 1; i >= 0; i-- )
kaf24@3392 193 {
keir@19952 194 struct bootmem_region *r = &bootmem_region_list[i];
keir@19952 195 pg = (r->e - nr_pfns) & ~(pfn_align - 1);
keir@19952 196 if ( pg < r->s )
keir@19952 197 continue;
keir@19952 198 _e = r->e;
keir@19952 199 r->e = pg;
keir@19952 200 bootmem_region_add(pg + nr_pfns, _e);
keir@19952 201 return pg;
kaf24@3392 202 }
kaf24@3392 203
keir@19952 204 BOOT_BUG_ON(1);
kfraser@14088 205 return 0;
kaf24@3392 206 }
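/*
 * Worked example of the alignment logic above (illustrative numbers, not
 * from the source): with a region [0x100, 0x200), nr_pfns == 4 and
 * pfn_align == 8,
 *   pg = (0x200 - 4) & ~(8 - 1) = 0x1f8,
 * so MFNs 0x1f8..0x1fb are returned, the region shrinks to [0x100, 0x1f8),
 * and the leftover tail [0x1fc, 0x200) is re-added as a new region.
 */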
kaf24@3392 207
kaf24@3392 208
kaf24@3392 209
kaf24@3392 210 /*************************
iap10@274 211 * BINARY BUDDY ALLOCATOR
iap10@274 212 */
iap10@274 213
kaf24@1974 214 #define MEMZONE_XEN 0
kfraser@14130 215 #define NR_ZONES (PADDR_BITS - PAGE_SHIFT)
kaf24@5951 216
keir@19099 217 #define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 0 : ((b) - PAGE_SHIFT - 1))
keir@19099 218 #define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN : \
keir@19099 219 (fls(page_to_mfn(pg)) - 1))
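/*
 * A worked example of the zone mapping, assuming PAGE_SHIFT == 12:
 *   bits_to_zone(32) == 32 - 12 - 1 == 19, so an allocation restricted to
 *   32 address bits may use zones up to 19 (MFNs below 2^20, i.e. RAM
 *   below 4 GiB);
 *   page_to_zone() of MFN 0x12345 gives fls(0x12345) - 1 == 16, the zone
 *   holding MFNs in [2^16, 2^17).
 * MEMZONE_XEN (zone 0) is reserved for Xen-heap pages regardless of address.
 */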
kaf24@1974 220
keir@19170 221 typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
keir@14134 222 static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
keir@14134 223 #define heap(node, zone, order) ((*_heap[node])[zone][order])
kaf24@1974 224
keir@14134 225 static unsigned long *avail[MAX_NUMNODES];
keir@20641 226 static long total_avail_pages;
iap10@274 227
keir@20991 228 /* TMEM: Reserve a fraction of memory for mid-size (0<order<9) allocations. */
keir@20991 229 static long midsize_alloc_zone_pages;
keir@20991 230 #define MIDSIZE_ALLOC_FRAC 128
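/*
 * Rough arithmetic for illustration (numbers assumed, not from the source):
 * midsize_alloc_zone_pages tracks 1/128 of the largest total_avail_pages
 * ever observed in free_heap_pages(). On a host whose free-page count peaks
 * at 2^20 4 KiB pages (4 GiB), the reserve is about 8192 pages (32 MiB);
 * while total_avail_pages stays at or below that watermark and tmem has
 * freeable pages, order-0 and order>=9 requests in alloc_heap_pages() below
 * are satisfied from tmem instead of the buddy lists (or fail).
 */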
keir@20991 231
kaf24@10288 232 static DEFINE_SPINLOCK(heap_lock);
iap10@274 233
keir@19179 234 static unsigned long init_node_heap(int node, unsigned long mfn,
keir@21080 235 unsigned long nr, bool_t *use_tail)
keir@14134 236 {
kfraser@15214 237 /* First node to be discovered has its heap metadata statically alloced. */
kfraser@15214 238 static heap_by_zone_and_order_t _heap_static;
kfraser@15214 239 static unsigned long avail_static[NR_ZONES];
keir@16614 240 static int first_node_initialised;
keir@19179 241 unsigned long needed = (sizeof(**_heap) +
keir@19179 242 sizeof(**avail) * NR_ZONES +
keir@19179 243 PAGE_SIZE - 1) >> PAGE_SHIFT;
keir@14134 244 int i, j;
kfraser@15214 245
keir@16614 246 if ( !first_node_initialised )
kfraser@15214 247 {
kfraser@15214 248 _heap[node] = &_heap_static;
kfraser@15214 249 avail[node] = avail_static;
keir@16614 250 first_node_initialised = 1;
keir@19179 251 needed = 0;
keir@19179 252 }
keir@19179 253 #ifdef DIRECTMAP_VIRT_END
keir@21080 254 else if ( *use_tail && nr >= needed &&
keir@21080 255 (mfn + nr) <= (virt_to_mfn(DIRECTMAP_VIRT_END - 1) + 1) )
keir@21080 256 {
keir@21080 257 _heap[node] = mfn_to_virt(mfn + nr - needed);
keir@21080 258 avail[node] = mfn_to_virt(mfn + nr - 1) +
keir@21080 259 PAGE_SIZE - sizeof(**avail) * NR_ZONES;
keir@21080 260 }
keir@19179 261 else if ( nr >= needed &&
keir@19452 262 (mfn + needed) <= (virt_to_mfn(DIRECTMAP_VIRT_END - 1) + 1) )
keir@19179 263 {
keir@19179 264 _heap[node] = mfn_to_virt(mfn);
keir@19479 265 avail[node] = mfn_to_virt(mfn + needed - 1) +
keir@19479 266 PAGE_SIZE - sizeof(**avail) * NR_ZONES;
keir@21080 267 *use_tail = 0;
keir@19179 268 }
keir@19179 269 #endif
keir@19179 270 else if ( get_order_from_bytes(sizeof(**_heap)) ==
keir@19179 271 get_order_from_pages(needed) )
keir@19179 272 {
keir@19179 273 _heap[node] = alloc_xenheap_pages(get_order_from_pages(needed), 0);
keir@19179 274 BUG_ON(!_heap[node]);
keir@19179 275 avail[node] = (void *)_heap[node] + (needed << PAGE_SHIFT) -
keir@19179 276 sizeof(**avail) * NR_ZONES;
keir@19179 277 needed = 0;
kfraser@15214 278 }
kfraser@15214 279 else
kfraser@15214 280 {
kfraser@15214 281 _heap[node] = xmalloc(heap_by_zone_and_order_t);
kfraser@15214 282 avail[node] = xmalloc_array(unsigned long, NR_ZONES);
kfraser@15214 283 BUG_ON(!_heap[node] || !avail[node]);
keir@19179 284 needed = 0;
kfraser@15214 285 }
kfraser@15214 286
kfraser@15214 287 memset(avail[node], 0, NR_ZONES * sizeof(long));
kfraser@15214 288
keir@14134 289 for ( i = 0; i < NR_ZONES; i++ )
keir@14134 290 for ( j = 0; j <= MAX_ORDER; j++ )
keir@19170 291 INIT_PAGE_LIST_HEAD(&(*_heap[node])[i][j]);
keir@19179 292
keir@19179 293 return needed;
keir@14134 294 }
keir@14134 295
kfraser@14126 296 /* Allocate 2^@order contiguous pages. */
kfraser@14126 297 static struct page_info *alloc_heap_pages(
kfraser@14316 298 unsigned int zone_lo, unsigned int zone_hi,
keir@21959 299 unsigned int order, unsigned int memflags,
keir@21959 300 struct domain *d)
iap10@274 301 {
keir@21959 302 unsigned int first_node, i, j, zone = 0, nodemask_retry = 0;
keir@21959 303 unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1);
kfraser@14289 304 unsigned long request = 1UL << order;
kfraser@14322 305 cpumask_t extra_cpus_mask, mask;
kaf24@8764 306 struct page_info *pg;
keir@21959 307 nodemask_t nodemask = (d != NULL ) ? d->node_affinity : node_online_map;
iap10@274 308
keir@17421 309 if ( node == NUMA_NO_NODE )
keir@21765 310 {
keir@21959 311 memflags &= ~MEMF_exact_node;
keir@21959 312 if ( d != NULL )
keir@21959 313 {
keir@21959 314 node = next_node(d->last_alloc_node, nodemask);
keir@21959 315 if ( node >= MAX_NUMNODES )
keir@21959 316 node = first_node(nodemask);
keir@21959 317 }
keir@21959 318 if ( node >= MAX_NUMNODES )
keir@21959 319 node = cpu_to_node(smp_processor_id());
keir@21765 320 }
keir@21959 321 first_node = node;
keir@17421 322
kfraser@11932 323 ASSERT(node >= 0);
kfraser@14130 324 ASSERT(zone_lo <= zone_hi);
kfraser@14130 325 ASSERT(zone_hi < NR_ZONES);
kaf24@3499 326
kaf24@3499 327 if ( unlikely(order > MAX_ORDER) )
kaf24@1979 328 return NULL;
kaf24@1979 329
kaf24@2844 330 spin_lock(&heap_lock);
iap10@274 331
kfraser@14316 332 /*
keir@21704 333 * TMEM: When available memory is scarce due to tmem absorbing it, allow
keir@21704 334 * only mid-size allocations to avoid worst of fragmentation issues.
keir@21704 335 * Others try tmem pools then fail. This is a workaround until all
keir@21704 336 * post-dom0-creation-multi-page allocations can be eliminated.
keir@20991 337 */
keir@20991 338 if ( opt_tmem && ((order == 0) || (order >= 9)) &&
keir@21704 339 (total_avail_pages <= midsize_alloc_zone_pages) &&
keir@21704 340 tmem_freeable_pages() )
keir@21001 341 goto try_tmem;
keir@20991 342
keir@20991 343 /*
kfraser@14316 344 * Start with the requested node, but exhaust all node memory in the
kfraser@14316 345 * requested zone before failing. Only compute a new node value if we fail
kfraser@14316 346 * to find memory in the target node; this avoids needless work on the fast path.
kfraser@14316 347 */
keir@21959 348 for ( ; ; )
kfraser@11932 349 {
kfraser@14316 350 zone = zone_hi;
kfraser@14316 351 do {
kfraser@14316 352 /* Check if target node can support the allocation. */
kfraser@14316 353 if ( !avail[node] || (avail[node][zone] < request) )
kfraser@14316 354 continue;
kfraser@14316 355
kfraser@14316 356 /* Find smallest order which can satisfy the request. */
kfraser@14316 357 for ( j = order; j <= MAX_ORDER; j++ )
keir@19170 358 if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
kfraser@14316 359 goto found;
kfraser@14316 360 } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
keir@14134 361
keir@21959 362 if ( memflags & MEMF_exact_node )
keir@21765 363 goto not_found;
keir@21765 364
keir@21959 365 /* Pick next node. */
keir@21959 366 if ( !node_isset(node, nodemask) )
keir@21959 367 {
keir@21959 368 /* Very first node may be caller-specified and outside nodemask. */
keir@21959 369 ASSERT(!nodemask_retry);
keir@21959 370 first_node = node = first_node(nodemask);
keir@21959 371 if ( node < MAX_NUMNODES )
keir@21959 372 continue;
keir@21959 373 }
keir@21959 374 else if ( (node = next_node(node, nodemask)) >= MAX_NUMNODES )
keir@21959 375 node = first_node(nodemask);
keir@21959 376 if ( node == first_node )
keir@21959 377 {
keir@21959 378 /* When we have tried all in nodemask, we fall back to others. */
keir@21959 379 if ( nodemask_retry++ )
keir@21959 380 goto not_found;
keir@21959 381 nodes_andnot(nodemask, node_online_map, nodemask);
keir@21959 382 first_node = node = first_node(nodemask);
keir@21959 383 if ( node >= MAX_NUMNODES )
keir@21959 384 goto not_found;
keir@21959 385 }
kfraser@11932 386 }
iap10@274 387
keir@21001 388 try_tmem:
keir@19684 389 /* Try to free memory from tmem */
keir@21959 390 if ( (pg = tmem_relinquish_pages(order, memflags)) != NULL )
keir@19684 391 {
keir@19684 392 /* reassigning an already allocated anonymous heap page */
keir@19684 393 spin_unlock(&heap_lock);
keir@19684 394 return pg;
keir@19684 395 }
keir@19684 396
keir@21765 397 not_found:
kaf24@3499 398 /* No suitable memory blocks. Fail the request. */
kaf24@3499 399 spin_unlock(&heap_lock);
kaf24@3499 400 return NULL;
kaf24@3499 401
kaf24@3499 402 found:
kaf24@1971 403 /* We may have to halve the chunk a number of times. */
kfraser@11932 404 while ( j != order )
iap10@274 405 {
kfraser@11932 406 PFN_ORDER(pg) = --j;
keir@19170 407 page_list_add_tail(pg, &heap(node, zone, j));
kfraser@11932 408 pg += 1 << j;
iap10@274 409 }
keir@19951 410
keir@14134 411 ASSERT(avail[node][zone] >= request);
keir@14134 412 avail[node][zone] -= request;
keir@20641 413 total_avail_pages -= request;
keir@20641 414 ASSERT(total_avail_pages >= 0);
iap10@274 415
keir@21959 416 if ( d != NULL )
keir@21959 417 d->last_alloc_node = node;
keir@21959 418
kfraser@14322 419 cpus_clear(mask);
kfraser@14322 420
kfraser@14322 421 for ( i = 0; i < (1 << order); i++ )
kfraser@14322 422 {
kfraser@14322 423 /* Reference count must continuously be zero for free pages. */
keir@19951 424 BUG_ON(pg[i].count_info != PGC_state_free);
keir@19951 425 pg[i].count_info = PGC_state_inuse;
kfraser@14322 426
keir@19202 427 if ( pg[i].u.free.need_tlbflush )
keir@19202 428 {
keir@19202 429 /* Add in extra CPUs that need flushing because of this page. */
keir@19202 430 cpus_andnot(extra_cpus_mask, cpu_online_map, mask);
keir@19202 431 tlbflush_filter(extra_cpus_mask, pg[i].tlbflush_timestamp);
keir@19202 432 cpus_or(mask, mask, extra_cpus_mask);
keir@19202 433 }
kfraser@14322 434
kfraser@14322 435 /* Initialise fields which have other uses for free pages. */
kfraser@14322 436 pg[i].u.inuse.type_info = 0;
kfraser@14322 437 page_set_owner(&pg[i], NULL);
kfraser@14322 438 }
kfraser@14322 439
keir@22181 440 spin_unlock(&heap_lock);
keir@22181 441
kfraser@14322 442 if ( unlikely(!cpus_empty(mask)) )
kfraser@14322 443 {
kfraser@14625 444 perfc_incr(need_flush_tlb_flush);
keir@19689 445 flush_tlb_mask(&mask);
kfraser@14322 446 }
kfraser@14322 447
kaf24@1974 448 return pg;
iap10@274 449 }
iap10@274 450
keir@19381 451 /* Remove any offlined page in the buddy pointed to by head. */
keir@19324 452 static int reserve_offlined_page(struct page_info *head)
keir@19324 453 {
keir@19324 454 unsigned int node = phys_to_nid(page_to_maddr(head));
keir@19324 455 int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
keir@19324 456 struct page_info *cur_head;
keir@19324 457 int cur_order;
keir@19324 458
keir@19324 459 ASSERT(spin_is_locked(&heap_lock));
keir@19324 460
keir@19324 461 cur_head = head;
keir@19324 462
keir@19324 463 page_list_del(head, &heap(node, zone, head_order));
keir@19324 464
keir@19324 465 while ( cur_head < (head + (1 << head_order)) )
keir@19324 466 {
keir@19324 467 struct page_info *pg;
keir@19324 468 int next_order;
keir@19324 469
keir@19951 470 if ( page_state_is(cur_head, offlined) )
keir@19324 471 {
keir@19324 472 cur_head++;
keir@19324 473 continue;
keir@19324 474 }
keir@19324 475
keir@19324 476 next_order = cur_order = 0;
keir@19324 477
keir@19381 478 while ( cur_order < head_order )
keir@19324 479 {
keir@19324 480 next_order = cur_order + 1;
keir@19324 481
keir@19381 482 if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)) )
keir@19324 483 goto merge;
keir@19324 484
keir@19381 485 for ( i = (1 << cur_order), pg = cur_head + (1 << cur_order );
keir@19381 486 i < (1 << next_order);
keir@19381 487 i++, pg++ )
keir@19951 488 if ( page_state_is(pg, offlined) )
keir@19324 489 break;
keir@19381 490 if ( i == ( 1 << next_order) )
keir@19324 491 {
keir@19324 492 cur_order = next_order;
keir@19324 493 continue;
keir@19324 494 }
keir@19324 495 else
keir@19324 496 {
keir@19381 497 merge:
keir@19381 498 /* We don't consider merging outside the head_order. */
keir@19324 499 page_list_add_tail(cur_head, &heap(node, zone, cur_order));
keir@19324 500 PFN_ORDER(cur_head) = cur_order;
keir@19324 501 cur_head += (1 << cur_order);
keir@19324 502 break;
keir@19324 503 }
keir@19324 504 }
keir@19324 505 }
keir@19324 506
keir@19381 507 for ( cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++ )
keir@19324 508 {
keir@19951 509 if ( !page_state_is(cur_head, offlined) )
keir@19324 510 continue;
keir@19324 511
keir@19381 512 avail[node][zone]--;
keir@20641 513 total_avail_pages--;
keir@20641 514 ASSERT(total_avail_pages >= 0);
keir@19324 515
keir@19381 516 page_list_add_tail(cur_head,
keir@19381 517 test_bit(_PGC_broken, &cur_head->count_info) ?
keir@19381 518 &page_broken_list : &page_offlined_list);
keir@19324 519
keir@19381 520 count++;
keir@19324 521 }
keir@19324 522
keir@19324 523 return count;
keir@19324 524 }
keir@19324 525
kaf24@1974 526 /* Free 2^@order set of pages. */
kfraser@14126 527 static void free_heap_pages(
keir@19099 528 struct page_info *pg, unsigned int order)
iap10@274 529 {
keir@22783 530 unsigned long mask;
keir@19324 531 unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
keir@19099 532 unsigned int zone = page_to_zone(pg);
iap10@274 533
kaf24@3499 534 ASSERT(order <= MAX_ORDER);
kfraser@11932 535 ASSERT(node >= 0);
kaf24@3499 536
keir@22181 537 spin_lock(&heap_lock);
keir@22181 538
kfraser@14322 539 for ( i = 0; i < (1 << order); i++ )
kfraser@14322 540 {
keir@14797 541 /*
keir@14797 542 * Cannot assume that count_info == 0, as there are some corner cases
keir@14797 543 * where it isn't the case and yet it isn't a bug:
keir@14797 544 * 1. page_get_owner() is NULL
keir@14797 545 * 2. page_get_owner() is a domain that was never accessible by
keir@14797 546 * its domid (e.g., failed to fully construct the domain).
keir@14797 547 * 3. page was never addressable by the guest (e.g., it's an
keir@14797 548 * auto-translate-physmap guest and the page was never included
keir@14797 549 * in its pseudophysical address space).
keir@14797 550 * In all the above cases there can be no guest mappings of this page.
keir@14797 551 */
keir@19951 552 ASSERT(!page_state_is(&pg[i], offlined));
keir@19951 553 pg[i].count_info =
keir@19951 554 ((pg[i].count_info & PGC_broken) |
keir@19951 555 (page_state_is(&pg[i], offlining)
keir@19951 556 ? PGC_state_offlined : PGC_state_free));
keir@19951 557 if ( page_state_is(&pg[i], offlined) )
keir@19324 558 tainted = 1;
keir@14797 559
keir@19200 560 /* If a page has no owner it will need no safety TLB flush. */
keir@19202 561 pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
keir@19202 562 if ( pg[i].u.free.need_tlbflush )
keir@19202 563 pg[i].tlbflush_timestamp = tlbflush_current_time();
kfraser@14322 564 }
kfraser@14322 565
keir@14134 566 avail[node][zone] += 1 << order;
keir@20641 567 total_avail_pages += 1 << order;
keir@14134 568
keir@20991 569 if ( opt_tmem )
keir@20991 570 midsize_alloc_zone_pages = max(
keir@20991 571 midsize_alloc_zone_pages, total_avail_pages / MIDSIZE_ALLOC_FRAC);
keir@20991 572
iap10@274 573 /* Merge chunks as far as possible. */
kaf24@1974 574 while ( order < MAX_ORDER )
iap10@274 575 {
kfraser@14289 576 mask = 1UL << order;
kaf24@1971 577
kaf24@8764 578 if ( (page_to_mfn(pg) & mask) )
iap10@274 579 {
iap10@274 580 /* Merge with predecessor block? */
keir@20011 581 if ( !mfn_valid(page_to_mfn(pg-mask)) ||
keir@20011 582 !page_state_is(pg-mask, free) ||
keir@22179 583 (PFN_ORDER(pg-mask) != order) ||
keir@22179 584 (phys_to_nid(page_to_maddr(pg-mask)) != node) )
kaf24@1183 585 break;
kaf24@1971 586 pg -= mask;
keir@19170 587 page_list_del(pg, &heap(node, zone, order));
iap10@274 588 }
iap10@274 589 else
iap10@274 590 {
iap10@274 591 /* Merge with successor block? */
keir@20011 592 if ( !mfn_valid(page_to_mfn(pg+mask)) ||
keir@20011 593 !page_state_is(pg+mask, free) ||
keir@22179 594 (PFN_ORDER(pg+mask) != order) ||
keir@22179 595 (phys_to_nid(page_to_maddr(pg+mask)) != node) )
kaf24@1183 596 break;
keir@19170 597 page_list_del(pg + mask, &heap(node, zone, order));
iap10@274 598 }
keir@19324 599
iap10@274 600 order++;
iap10@274 601 }
iap10@274 602
kaf24@1971 603 PFN_ORDER(pg) = order;
keir@19170 604 page_list_add_tail(pg, &heap(node, zone, order));
kaf24@1974 605
keir@19381 606 if ( tainted )
keir@19324 607 reserve_offlined_page(pg);
keir@19324 608
kaf24@2844 609 spin_unlock(&heap_lock);
kaf24@1974 610 }
kaf24@1974 611
keir@19324 612
keir@19324 613 /*
keir@19324 614 * The possible states for a page are:
keir@19324 615 * free and online; free and offlined; free and offlined and broken;
keir@19324 616 * assigned and online; assigned and offlining; assigned and offlining and broken.
keir@19324 617 *
keir@19324 618 * The following rules apply to page offlining:
keir@19324 619 * Once a page is broken, it can never be assigned again.
keir@19324 620 * A page is marked offlined only if it is free.
keir@19324 621 * Returns the original count_info.
keir@19324 622 */
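/*
 * A rough summary of the state transitions implemented by the code below:
 *   free      -> inuse      alloc_heap_pages()
 *   inuse     -> free       free_heap_pages() on a normal page
 *   inuse     -> offlining  offline_page() on an allocated page
 *   offlining -> offlined   free_heap_pages() once the owner releases it
 *   free      -> offlined   offline_page() on a free page
 *   offlined  -> free       online_page() (which re-frees via free_heap_pages())
 *   offlining -> inuse      online_page() before the page is freed
 * PGC_broken is orthogonal: it is never cleared here, and online_page()
 * refuses to bring a broken page back.
 */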
keir@19324 623 static unsigned long mark_page_offline(struct page_info *pg, int broken)
keir@19324 624 {
keir@19324 625 unsigned long nx, x, y = pg->count_info;
keir@19324 626
keir@19324 627 ASSERT(page_is_ram_type(page_to_mfn(pg), RAM_TYPE_CONVENTIONAL));
keir@19324 628 ASSERT(spin_is_locked(&heap_lock));
keir@19324 629
keir@19324 630 do {
keir@19324 631 nx = x = y;
keir@19324 632
keir@19951 633 if ( ((x & PGC_state) != PGC_state_offlined) &&
keir@19951 634 ((x & PGC_state) != PGC_state_offlining) )
keir@19324 635 {
keir@19951 636 nx &= ~PGC_state;
keir@19951 637 nx |= (((x & PGC_state) == PGC_state_free)
keir@19951 638 ? PGC_state_offlined : PGC_state_offlining);
keir@19381 639 }
keir@19324 640
keir@19381 641 if ( broken )
keir@19324 642 nx |= PGC_broken;
keir@19951 643
keir@19951 644 if ( x == nx )
keir@19951 645 break;
keir@19324 646 } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
keir@19324 647
keir@19324 648 return y;
keir@19324 649 }
keir@19324 650
keir@19324 651 static int reserve_heap_page(struct page_info *pg)
keir@19324 652 {
keir@19324 653 struct page_info *head = NULL;
keir@19324 654 unsigned int i, node = phys_to_nid(page_to_maddr(pg));
keir@19324 655 unsigned int zone = page_to_zone(pg);
keir@19324 656
keir@19324 657 for ( i = 0; i <= MAX_ORDER; i++ )
keir@19324 658 {
keir@19324 659 struct page_info *tmp;
keir@19324 660
keir@19324 661 if ( page_list_empty(&heap(node, zone, i)) )
keir@19324 662 continue;
keir@19324 663
keir@19381 664 page_list_for_each_safe ( head, tmp, &heap(node, zone, i) )
keir@19324 665 {
keir@19324 666 if ( (head <= pg) &&
keir@19324 667 (head + (1UL << i) > pg) )
keir@19324 668 return reserve_offlined_page(head);
keir@19324 669 }
keir@19324 670 }
keir@19324 671
keir@19324 672 return -EINVAL;
keir@19324 673
keir@19324 674 }
keir@19324 675
keir@19324 676 int offline_page(unsigned long mfn, int broken, uint32_t *status)
keir@19324 677 {
keir@19324 678 unsigned long old_info = 0;
keir@19324 679 struct domain *owner;
keir@19324 680 int ret = 0;
keir@19324 681 struct page_info *pg;
keir@19324 682
keir@20226 683 if ( !mfn_valid(mfn) )
keir@19324 684 {
keir@19324 685 dprintk(XENLOG_WARNING,
keir@19324 686 "try to offline page out of range %lx\n", mfn);
keir@19324 687 return -EINVAL;
keir@19324 688 }
keir@19324 689
keir@19324 690 *status = 0;
keir@19324 691 pg = mfn_to_page(mfn);
keir@19324 692
keir@19324 693 if ( is_xen_fixed_mfn(mfn) )
keir@19324 694 {
keir@19324 695 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED |
keir@19324 696 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
keir@19324 697 return -EPERM;
keir@19324 698 }
keir@19324 699
keir@19324 700 /*
keir@19952 701 * N.B. Xen's txt on x86_64 is marked reserved and handled already.
keir@19952 702 * The kexec range is also reserved.
keir@19324 703 */
keir@19952 704 if ( !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
keir@19952 705 {
keir@19324 706 *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM;
keir@19324 707 return -EINVAL;
keir@19952 708 }
keir@19324 709
keir@19324 710 spin_lock(&heap_lock);
keir@19324 711
keir@19324 712 old_info = mark_page_offline(pg, broken);
keir@19324 713
keir@19951 714 if ( page_state_is(pg, free) )
keir@19324 715 {
keir@19324 716 /* Free pages are reserved directly. */
keir@19324 717 reserve_heap_page(pg);
keir@19324 718 *status = PG_OFFLINE_OFFLINED;
keir@19324 719 }
keir@19951 720 else if ( page_state_is(pg, offlined) )
keir@19324 721 {
keir@19324 722 *status = PG_OFFLINE_OFFLINED;
keir@19324 723 }
keir@19381 724 else if ( (owner = page_get_owner_and_reference(pg)) )
keir@19324 725 {
keir@19324 726 *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING |
keir@19324 727 (owner->domain_id << PG_OFFLINE_OWNER_SHIFT);
keir@19324 728 /* Release the reference since it will not be allocated anymore */
keir@19324 729 put_page(pg);
keir@19324 730 }
keir@19952 731 else if ( old_info & PGC_xen_heap )
keir@19324 732 {
keir@19324 733 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING |
keir@19324 734 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
keir@19324 735 }
keir@19324 736 else
keir@19324 737 {
keir@19324 738 /*
keir@19324 739 * assign_pages() does not hold heap_lock, so there is a small window in
keir@19324 740 * which an owner may be set later. Note that the owner can only change
keir@19324 741 * from NULL to non-NULL, not the reverse, since the page is offlining now.
keir@19324 742 * There is no window if called from the #MC handler, since all CPUs are in
keir@19324 743 * softirq context. If called from user space (e.g. CE handling), tools can
keir@19324 744 * wait some time before calling again.
keir@19324 745 */
keir@19324 746 *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED |
keir@19324 747 (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT );
keir@19324 748 }
keir@19324 749
keir@19381 750 if ( broken )
keir@19324 751 *status |= PG_OFFLINE_BROKEN;
keir@19324 752
keir@19324 753 spin_unlock(&heap_lock);
keir@19324 754
keir@19324 755 return ret;
keir@19324 756 }
keir@19324 757
keir@19324 758 /*
keir@19324 759 * Online the memory.
keir@19324 760 * The caller should make sure end_pfn <= max_page;
keir@19324 761 * if not, expand_pages() should be called prior to online_page().
keir@19324 762 */
keir@19324 763 unsigned int online_page(unsigned long mfn, uint32_t *status)
keir@19324 764 {
keir@19951 765 unsigned long x, nx, y;
keir@19324 766 struct page_info *pg;
keir@19951 767 int ret;
keir@19324 768
keir@20011 769 if ( !mfn_valid(mfn) )
keir@19324 770 {
keir@19324 771 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
keir@19324 772 return -EINVAL;
keir@19324 773 }
keir@19324 774
keir@19324 775 pg = mfn_to_page(mfn);
keir@19324 776
keir@19324 777 spin_lock(&heap_lock);
keir@19324 778
keir@19951 779 y = pg->count_info;
keir@19951 780 do {
keir@19951 781 ret = *status = 0;
keir@19951 782
keir@19951 783 if ( y & PGC_broken )
keir@19951 784 {
keir@19951 785 ret = -EINVAL;
keir@19951 786 *status = PG_ONLINE_FAILED | PG_ONLINE_BROKEN;
keir@19951 787 break;
keir@19951 788 }
keir@19951 789
keir@19951 790 if ( (y & PGC_state) == PGC_state_offlined )
keir@19951 791 {
keir@19951 792 page_list_del(pg, &page_offlined_list);
keir@19951 793 *status = PG_ONLINE_ONLINED;
keir@19951 794 }
keir@19951 795 else if ( (y & PGC_state) == PGC_state_offlining )
keir@19951 796 {
keir@19951 797 *status = PG_ONLINE_ONLINED;
keir@19951 798 }
keir@19951 799 else
keir@19951 800 {
keir@19951 801 break;
keir@19951 802 }
keir@19951 803
keir@19951 804 x = y;
keir@19951 805 nx = (x & ~PGC_state) | PGC_state_inuse;
keir@19951 806 } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
keir@19951 807
keir@19324 808 spin_unlock(&heap_lock);
keir@19324 809
keir@19951 810 if ( (y & PGC_state) == PGC_state_offlined )
keir@19324 811 free_heap_pages(pg, 0);
keir@19324 812
keir@19324 813 return ret;
keir@19324 814 }
keir@19324 815
keir@19324 816 int query_page_offline(unsigned long mfn, uint32_t *status)
keir@19324 817 {
keir@19324 818 struct page_info *pg;
keir@19324 819
keir@20011 820 if ( !mfn_valid(mfn) || !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
keir@19324 821 {
keir@19324 822 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
keir@19324 823 return -EINVAL;
keir@19324 824 }
keir@19324 825
keir@19324 826 *status = 0;
keir@19324 827 spin_lock(&heap_lock);
keir@19324 828
keir@19324 829 pg = mfn_to_page(mfn);
keir@19324 830
keir@19951 831 if ( page_state_is(pg, offlining) )
keir@19324 832 *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING;
keir@19951 833 if ( pg->count_info & PGC_broken )
keir@19324 834 *status |= PG_OFFLINE_STATUS_BROKEN;
keir@19951 835 if ( page_state_is(pg, offlined) )
keir@19324 836 *status |= PG_OFFLINE_STATUS_OFFLINED;
keir@19324 837
keir@19324 838 spin_unlock(&heap_lock);
keir@19324 839
keir@19324 840 return 0;
keir@19324 841 }
keir@19324 842
kfraser@14126 843 /*
kfraser@14126 844 * Hand the specified arbitrary page range to the specified heap zone
kfraser@14126 845 * checking the node_id of the previous page. If they differ and the
kfraser@14126 846 * latter is not on a MAX_ORDER boundary, then we reserve the page by
kfraser@14126 847 * not freeing it to the buddy allocator.
kfraser@14126 848 */
keir@16611 849 static void init_heap_pages(
keir@19099 850 struct page_info *pg, unsigned long nr_pages)
kfraser@14126 851 {
kfraser@14126 852 unsigned long i;
kfraser@14126 853
keir@22179 854 for ( i = 0; i < nr_pages; i++ )
kfraser@14126 855 {
keir@22179 856 unsigned int nid = phys_to_nid(page_to_maddr(pg+i));
kfraser@14126 857
keir@22179 858 if ( unlikely(!avail[nid]) )
keir@19179 859 {
keir@21080 860 unsigned long s = page_to_mfn(pg + i);
keir@21080 861 unsigned long e = page_to_mfn(pg + nr_pages - 1) + 1;
keir@22179 862 bool_t use_tail = (nid == phys_to_nid(pfn_to_paddr(e - 1))) &&
keir@21080 863 !(s & ((1UL << MAX_ORDER) - 1)) &&
keir@21080 864 (find_first_set_bit(e) <= find_first_set_bit(s));
keir@19179 865 unsigned long n;
keir@19179 866
keir@22179 867 n = init_node_heap(nid, page_to_mfn(pg+i), nr_pages - i,
keir@21080 868 &use_tail);
keir@21080 869 BUG_ON(i + n > nr_pages);
keir@21080 870 if ( n && !use_tail )
keir@19179 871 {
keir@19179 872 i += n - 1;
keir@19179 873 continue;
keir@19179 874 }
keir@21080 875 if ( i + n == nr_pages )
keir@21080 876 break;
keir@21080 877 nr_pages -= n;
keir@19179 878 }
keir@14134 879
keir@22179 880 free_heap_pages(pg+i, 0);
kfraser@14126 881 }
kfraser@14126 882 }
kfraser@14126 883
kfraser@14126 884 static unsigned long avail_heap_pages(
kfraser@14130 885 unsigned int zone_lo, unsigned int zone_hi, unsigned int node)
kfraser@14126 886 {
keir@20328 887 unsigned int i, zone;
kfraser@14126 888 unsigned long free_pages = 0;
kfraser@14126 889
kfraser@14130 890 if ( zone_hi >= NR_ZONES )
kfraser@14130 891 zone_hi = NR_ZONES - 1;
keir@14134 892
keir@20328 893 for_each_online_node(i)
keir@14134 894 {
keir@14134 895 if ( !avail[i] )
keir@14134 896 continue;
keir@14134 897 for ( zone = zone_lo; zone <= zone_hi; zone++ )
kfraser@14130 898 if ( (node == -1) || (node == i) )
keir@14134 899 free_pages += avail[i][zone];
keir@14134 900 }
kfraser@14126 901
kfraser@14126 902 return free_pages;
kfraser@14126 903 }
kfraser@14126 904
keir@20641 905 unsigned long total_free_pages(void)
keir@20641 906 {
keir@20991 907 return total_avail_pages - midsize_alloc_zone_pages;
keir@20641 908 }
keir@20641 909
keir@15081 910 void __init end_boot_allocator(void)
kfraser@14126 911 {
keir@19952 912 unsigned int i;
kfraser@14126 913
kfraser@14126 914 /* Pages that are free now go to the domain sub-allocator. */
keir@19952 915 for ( i = 0; i < nr_bootmem_regions; i++ )
kfraser@14126 916 {
keir@19952 917 struct bootmem_region *r = &bootmem_region_list[i];
keir@21080 918 if ( (r->s < r->e) &&
keir@21080 919 (phys_to_nid(pfn_to_paddr(r->s)) == cpu_to_node(0)) )
keir@21080 920 {
keir@21080 921 init_heap_pages(mfn_to_page(r->s), r->e - r->s);
keir@21080 922 r->e = r->s;
keir@21080 923 break;
keir@21080 924 }
keir@21080 925 }
keir@21080 926 for ( i = nr_bootmem_regions; i-- > 0; )
keir@21080 927 {
keir@21080 928 struct bootmem_region *r = &bootmem_region_list[i];
keir@19952 929 if ( r->s < r->e )
keir@19952 930 init_heap_pages(mfn_to_page(r->s), r->e - r->s);
kfraser@14126 931 }
keir@19952 932 init_heap_pages(virt_to_page(bootmem_region_list), 1);
kfraser@14126 933
keir@18195 934 if ( !dma_bitsize && (num_online_nodes() > 1) )
keir@18196 935 {
keir@18196 936 #ifdef CONFIG_X86
keir@18195 937 dma_bitsize = min_t(unsigned int,
keir@18195 938 fls(NODE_DATA(0)->node_spanned_pages) - 1
keir@18195 939 + PAGE_SHIFT - 2,
keir@18195 940 32);
keir@18196 941 #else
keir@18196 942 dma_bitsize = 32;
keir@18196 943 #endif
keir@18196 944 }
keir@18195 945
keir@18195 946 printk("Domain heap initialised");
keir@18195 947 if ( dma_bitsize )
keir@18195 948 printk(" DMA width %u bits", dma_bitsize);
keir@18195 949 printk("\n");
kfraser@14126 950 }
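/*
 * Worked example for the dma_bitsize default above (illustrative figures,
 * assuming x86 with PAGE_SHIFT == 12): if node 0 spans 4 GiB, i.e.
 * node_spanned_pages == 2^20, then
 *   fls(2^20) - 1 + PAGE_SHIFT - 2 == 21 - 1 + 12 - 2 == 30,
 * so the DMA width defaults to min(30, 32) == 30 bits and the low 1 GiB is
 * treated as the DMA heap on multi-node hosts.
 */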
kaf24@1974 951
kaf24@2810 952 /*
kaf24@2810 953 * Scrub all unallocated pages in all heap zones. This function is more
kaf24@2810 954 * convoluted than appears necessary because we do not want to continuously
kfraser@14322 955 * hold the lock while scrubbing very large memory areas.
kaf24@2810 956 */
keir@15081 957 void __init scrub_heap_pages(void)
kaf24@2810 958 {
kfraser@13058 959 unsigned long mfn;
keir@19951 960 struct page_info *pg;
kaf24@2810 961
kfraser@15549 962 if ( !opt_bootscrub )
kfraser@15549 963 return;
kfraser@15549 964
kaf24@2888 965 printk("Scrubbing Free RAM: ");
kaf24@2888 966
kfraser@13058 967 for ( mfn = first_valid_mfn; mfn < max_page; mfn++ )
kaf24@2810 968 {
keir@20760 969 process_pending_softirqs();
kaf24@5850 970
keir@19951 971 pg = mfn_to_page(mfn);
keir@19951 972
kaf24@2810 973 /* Quick lock-free check. */
keir@20011 974 if ( !mfn_valid(mfn) || !page_state_is(pg, free) )
kaf24@2810 975 continue;
kaf24@5850 976
kfraser@13117 977 /* Every 100MB, print a progress dot. */
kfraser@13117 978 if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
kfraser@13117 979 printk(".");
kfraser@13117 980
kfraser@14322 981 spin_lock(&heap_lock);
kaf24@5850 982
kaf24@2810 983 /* Re-check page status with lock held. */
keir@19951 984 if ( page_state_is(pg, free) )
keir@19951 985 scrub_one_page(pg);
kaf24@5850 986
kfraser@14322 987 spin_unlock(&heap_lock);
kaf24@2810 988 }
kaf24@2888 989
kaf24@2888 990 printk("done.\n");
kaf24@2810 991 }
kaf24@2810 992
kaf24@2810 993
kaf24@1974 994
kaf24@1974 995 /*************************
kaf24@1974 996 * XEN-HEAP SUB-ALLOCATOR
kaf24@1974 997 */
kaf24@1974 998
yamahata@19168 999 #if !defined(__x86_64__) && !defined(__ia64__)
keir@19092 1000
kaf24@8764 1001 void init_xenheap_pages(paddr_t ps, paddr_t pe)
kaf24@1974 1002 {
kaf24@1974 1003 ps = round_pgup(ps);
kaf24@1974 1004 pe = round_pgdown(pe);
kaf24@6134 1005 if ( pe <= ps )
kaf24@6134 1006 return;
kaf24@2844 1007
kaf24@4055 1008 /*
kaf24@4055 1009 * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to
kaf24@4055 1010 * prevent merging of power-of-two blocks across the zone boundary.
kaf24@4055 1011 */
keir@16376 1012 if ( ps && !is_xen_heap_mfn(paddr_to_pfn(ps)-1) )
kfraser@15073 1013 ps += PAGE_SIZE;
keir@16376 1014 if ( !is_xen_heap_mfn(paddr_to_pfn(pe)) )
kaf24@4055 1015 pe -= PAGE_SIZE;
kaf24@4055 1016
keir@20931 1017 memguard_guard_range(maddr_to_virt(ps), pe - ps);
keir@20931 1018
keir@19099 1019 init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT);
kaf24@1974 1020 }
kaf24@1974 1021
kaf24@2844 1022
keir@19143 1023 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
kaf24@1974 1024 {
kaf24@8764 1025 struct page_info *pg;
kaf24@1974 1026
kfraser@14322 1027 ASSERT(!in_irq());
kfraser@14322 1028
keir@19684 1029 pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
keir@21959 1030 order, memflags, NULL);
kaf24@2844 1031 if ( unlikely(pg == NULL) )
keir@19143 1032 return NULL;
kaf24@2382 1033
kaf24@1974 1034 memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));
kaf24@2382 1035
kaf24@5436 1036 return page_to_virt(pg);
kaf24@1974 1037 }
kaf24@1974 1038
kaf24@2844 1039
kaf24@5436 1040 void free_xenheap_pages(void *v, unsigned int order)
kaf24@1974 1041 {
kfraser@14322 1042 ASSERT(!in_irq());
kaf24@2844 1043
kaf24@7803 1044 if ( v == NULL )
kaf24@7803 1045 return;
kaf24@7803 1046
kfraser@14322 1047 memguard_guard_range(v, 1 << (order + PAGE_SHIFT));
kaf24@2844 1048
keir@19099 1049 free_heap_pages(virt_to_page(v), order);
iap10@274 1050 }
kaf24@1974 1051
keir@19092 1052 #else
keir@19092 1053
keir@19092 1054 void init_xenheap_pages(paddr_t ps, paddr_t pe)
keir@19092 1055 {
keir@19092 1056 init_domheap_pages(ps, pe);
keir@19092 1057 }
keir@19092 1058
keir@19143 1059 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
keir@19092 1060 {
keir@19092 1061 struct page_info *pg;
keir@19092 1062 unsigned int i;
keir@19092 1063
keir@19092 1064 ASSERT(!in_irq());
keir@19092 1065
keir@19143 1066 pg = alloc_domheap_pages(NULL, order, memflags);
keir@19092 1067 if ( unlikely(pg == NULL) )
keir@19143 1068 return NULL;
keir@19092 1069
keir@19092 1070 for ( i = 0; i < (1u << order); i++ )
keir@19092 1071 pg[i].count_info |= PGC_xen_heap;
keir@19092 1072
keir@19092 1073 return page_to_virt(pg);
keir@19092 1074 }
keir@19092 1075
keir@19092 1076 void free_xenheap_pages(void *v, unsigned int order)
keir@19092 1077 {
keir@19092 1078 struct page_info *pg;
keir@19092 1079 unsigned int i;
keir@19092 1080
keir@19092 1081 ASSERT(!in_irq());
keir@19092 1082
keir@19092 1083 if ( v == NULL )
keir@19092 1084 return;
keir@19092 1085
keir@19092 1086 pg = virt_to_page(v);
keir@19092 1087
keir@19092 1088 for ( i = 0; i < (1u << order); i++ )
keir@19092 1089 pg[i].count_info &= ~PGC_xen_heap;
keir@19092 1090
keir@19099 1091 free_heap_pages(pg, order);
keir@19092 1092 }
keir@19092 1093
keir@19092 1094 #endif
keir@19092 1095
kaf24@1974 1096
kaf24@1974 1097
kaf24@1974 1098 /*************************
kaf24@1974 1099 * DOMAIN-HEAP SUB-ALLOCATOR
kaf24@1974 1100 */
kaf24@1974 1101
kaf24@8764 1102 void init_domheap_pages(paddr_t ps, paddr_t pe)
kaf24@1974 1103 {
keir@19099 1104 unsigned long smfn, emfn;
kaf24@6721 1105
kaf24@2844 1106 ASSERT(!in_irq());
kaf24@2844 1107
keir@19099 1108 smfn = round_pgup(ps) >> PAGE_SHIFT;
keir@19099 1109 emfn = round_pgdown(pe) >> PAGE_SHIFT;
keir@19095 1110
keir@19099 1111 init_heap_pages(mfn_to_page(smfn), emfn - smfn);
kaf24@1974 1112 }
kaf24@1974 1113
kaf24@2844 1114
kfraser@10398 1115 int assign_pages(
kfraser@10398 1116 struct domain *d,
kfraser@10398 1117 struct page_info *pg,
kfraser@10398 1118 unsigned int order,
kfraser@10398 1119 unsigned int memflags)
kfraser@10398 1120 {
kfraser@10398 1121 unsigned long i;
kfraser@10398 1122
kfraser@10398 1123 spin_lock(&d->page_alloc_lock);
kfraser@10398 1124
kfraser@14677 1125 if ( unlikely(d->is_dying) )
kfraser@10398 1126 {
kaf24@12062 1127 gdprintk(XENLOG_INFO, "Cannot assign page to domain%d -- dying.\n",
kaf24@12056 1128 d->domain_id);
kfraser@10398 1129 goto fail;
kfraser@10398 1130 }
kfraser@10398 1131
kfraser@10398 1132 if ( !(memflags & MEMF_no_refcount) )
kfraser@10398 1133 {
kfraser@10398 1134 if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
kfraser@10398 1135 {
keir@20505 1136 if ( !opt_tmem || order != 0 || d->tot_pages != d->max_pages )
keir@20505 1137 gdprintk(XENLOG_INFO, "Over-allocation for domain %u: "
keir@20505 1138 "%u > %u\n", d->domain_id,
keir@20505 1139 d->tot_pages + (1 << order), d->max_pages);
kfraser@10398 1140 goto fail;
kfraser@10398 1141 }
kfraser@10398 1142
kfraser@10398 1143 if ( unlikely(d->tot_pages == 0) )
kfraser@10398 1144 get_knownalive_domain(d);
kfraser@10398 1145
kfraser@10398 1146 d->tot_pages += 1 << order;
kfraser@10398 1147 }
kfraser@10398 1148
kfraser@10398 1149 for ( i = 0; i < (1 << order); i++ )
kfraser@10398 1150 {
kfraser@10398 1151 ASSERT(page_get_owner(&pg[i]) == NULL);
kfraser@10398 1152 ASSERT((pg[i].count_info & ~(PGC_allocated | 1)) == 0);
kfraser@10398 1153 page_set_owner(&pg[i], d);
kfraser@10398 1154 wmb(); /* Domain pointer must be visible before updating refcnt. */
kfraser@10398 1155 pg[i].count_info = PGC_allocated | 1;
keir@19170 1156 page_list_add_tail(&pg[i], &d->page_list);
kfraser@10398 1157 }
kfraser@10398 1158
kfraser@10398 1159 spin_unlock(&d->page_alloc_lock);
kfraser@10398 1160 return 0;
kfraser@10398 1161
kfraser@10398 1162 fail:
kfraser@10398 1163 spin_unlock(&d->page_alloc_lock);
kfraser@10398 1164 return -1;
kfraser@10398 1165 }
kfraser@10398 1166
kfraser@10398 1167
keir@17421 1168 struct page_info *alloc_domheap_pages(
keir@17421 1169 struct domain *d, unsigned int order, unsigned int memflags)
kaf24@1974 1170 {
kaf24@8764 1171 struct page_info *pg = NULL;
keir@14135 1172 unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
keir@21959 1173 unsigned int dma_zone;
kaf24@1979 1174
kaf24@1979 1175 ASSERT(!in_irq());
kaf24@1979 1176
keir@16587 1177 bits = domain_clamp_alloc_bitsize(d, bits ? : (BITS_PER_LONG+PAGE_SHIFT));
keir@19095 1178 if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
keir@16586 1179 return NULL;
keir@16586 1180
keir@19095 1181 if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
keir@21959 1182 pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d);
kfraser@14131 1183
kfraser@14322 1184 if ( (pg == NULL) &&
keir@21318 1185 ((memflags & MEMF_no_dma) ||
keir@21959 1186 ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, order,
keir@21959 1187 memflags, d)) == NULL)) )
kfraser@14322 1188 return NULL;
kaf24@4340 1189
kfraser@10398 1190 if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
kaf24@1979 1191 {
keir@19099 1192 free_heap_pages(pg, order);
kaf24@1979 1193 return NULL;
kaf24@1979 1194 }
kaf24@1979 1195
kaf24@1974 1196 return pg;
kaf24@1974 1197 }
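/*
 * Worked example of the zone selection above, assuming PAGE_SHIFT == 12 and
 * dma_bitsize == 32: bits_to_zone(32) == 19, so the first attempt draws from
 * zones 20..zone_hi only (frames at or above 4 GiB). If that fails and
 * MEMF_no_dma is not set, the allocation retries with every zone above
 * MEMZONE_XEN, i.e. it may then dip into DMA-capable memory as well.
 */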
kaf24@1974 1198
kaf24@8764 1199 void free_domheap_pages(struct page_info *pg, unsigned int order)
kaf24@1974 1200 {
kaf24@1979 1201 int i, drop_dom_ref;
kaf24@3669 1202 struct domain *d = page_get_owner(pg);
kaf24@1979 1203
kaf24@2844 1204 ASSERT(!in_irq());
kaf24@2844 1205
keir@16376 1206 if ( unlikely(is_xen_heap_page(pg)) )
kaf24@1979 1207 {
kaf24@4493 1208 /* NB. May recursively lock from relinquish_memory(). */
kaf24@1979 1209 spin_lock_recursive(&d->page_alloc_lock);
kaf24@2382 1210
kaf24@2360 1211 for ( i = 0; i < (1 << order); i++ )
keir@19170 1212 page_list_del2(&pg[i], &d->xenpage_list, &d->arch.relmem_list);
kaf24@2382 1213
kaf24@1979 1214 d->xenheap_pages -= 1 << order;
kaf24@1979 1215 drop_dom_ref = (d->xenheap_pages == 0);
kaf24@2382 1216
kaf24@1979 1217 spin_unlock_recursive(&d->page_alloc_lock);
kaf24@1979 1218 }
keir@20723 1219 else if ( likely(d != NULL) && likely(d != dom_cow) )
kaf24@1979 1220 {
kaf24@4493 1221 /* NB. May recursively lock from relinquish_memory(). */
kaf24@1979 1222 spin_lock_recursive(&d->page_alloc_lock);
kaf24@1979 1223
kaf24@1979 1224 for ( i = 0; i < (1 << order); i++ )
kaf24@1979 1225 {
kfraser@14322 1226 BUG_ON((pg[i].u.inuse.type_info & PGT_count_mask) != 0);
keir@19170 1227 page_list_del2(&pg[i], &d->page_list, &d->arch.relmem_list);
kaf24@1979 1228 }
kaf24@1979 1229
kaf24@1979 1230 d->tot_pages -= 1 << order;
kaf24@1979 1231 drop_dom_ref = (d->tot_pages == 0);
kaf24@1979 1232
kaf24@1979 1233 spin_unlock_recursive(&d->page_alloc_lock);
kaf24@1979 1234
keir@19926 1235 /*
keir@19926 1236 * Normally we expect a domain to clear pages before freeing them, if
keir@19926 1237 * it cares about the secrecy of their contents. However, after a
keir@19926 1238 * domain has died we assume responsibility for erasure.
keir@19926 1239 */
keir@19926 1240 if ( unlikely(d->is_dying) )
kaf24@4305 1241 for ( i = 0; i < (1 << order); i++ )
keir@19924 1242 scrub_one_page(&pg[i]);
keir@19926 1243
keir@19926 1244 free_heap_pages(pg, order);
kaf24@1979 1245 }
keir@20723 1246 else if ( unlikely(d == dom_cow) )
keir@20723 1247 {
keir@20723 1248 ASSERT(order == 0);
keir@20723 1249 scrub_one_page(pg);
keir@20723 1250 free_heap_pages(pg, 0);
keir@20723 1251 drop_dom_ref = 0;
keir@20723 1252 }
kaf24@2329 1253 else
kaf24@2329 1254 {
kaf24@7989 1255 /* Freeing anonymous domain-heap pages. */
keir@19099 1256 free_heap_pages(pg, order);
kaf24@2329 1257 drop_dom_ref = 0;
kaf24@2329 1258 }
kaf24@1979 1259
kaf24@1979 1260 if ( drop_dom_ref )
kaf24@1979 1261 put_domain(d);
kaf24@1974 1262 }
kaf24@1974 1263
kfraser@15572 1264 unsigned long avail_domheap_pages_region(
kfraser@15572 1265 unsigned int node, unsigned int min_width, unsigned int max_width)
kfraser@15572 1266 {
kfraser@15572 1267 int zone_lo, zone_hi;
kfraser@15572 1268
keir@19095 1269 zone_lo = min_width ? bits_to_zone(min_width) : (MEMZONE_XEN + 1);
keir@19095 1270 zone_lo = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_lo));
kfraser@15572 1271
keir@19095 1272 zone_hi = max_width ? bits_to_zone(max_width) : (NR_ZONES - 1);
keir@19095 1273 zone_hi = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_hi));
kfraser@15572 1274
kfraser@15572 1275 return avail_heap_pages(zone_lo, zone_hi, node);
kfraser@15572 1276 }
kaf24@2844 1277
kaf24@1974 1278 unsigned long avail_domheap_pages(void)
kaf24@1974 1279 {
keir@18195 1280 return avail_heap_pages(MEMZONE_XEN + 1,
keir@18195 1281 NR_ZONES - 1,
keir@18195 1282 -1);
kaf24@1974 1283 }
kaf24@4305 1284
keir@21164 1285 unsigned long avail_node_heap_pages(unsigned int nodeid)
keir@21164 1286 {
keir@21164 1287 return avail_heap_pages(MEMZONE_XEN, NR_ZONES -1, nodeid);
keir@21164 1288 }
keir@21164 1289
keir@21164 1290
keir@20048 1291 static void pagealloc_info(unsigned char key)
kaf24@9051 1292 {
kfraser@14130 1293 unsigned int zone = MEMZONE_XEN;
keir@18194 1294 unsigned long n, total = 0;
kfraser@14130 1295
kaf24@9051 1296 printk("Physical memory information:\n");
kfraser@14130 1297 printk(" Xen heap: %lukB free\n",
kfraser@14130 1298 avail_heap_pages(zone, zone, -1) << (PAGE_SHIFT-10));
kfraser@14130 1299
kfraser@14130 1300 while ( ++zone < NR_ZONES )
kfraser@14130 1301 {
keir@18195 1302 if ( (zone + PAGE_SHIFT) == dma_bitsize )
kfraser@14130 1303 {
kfraser@14130 1304 printk(" DMA heap: %lukB free\n", total << (PAGE_SHIFT-10));
kfraser@14130 1305 total = 0;
kfraser@14130 1306 }
kfraser@14130 1307
kfraser@14130 1308 if ( (n = avail_heap_pages(zone, zone, -1)) != 0 )
kfraser@14130 1309 {
kfraser@14130 1310 total += n;
kfraser@14130 1311 printk(" heap[%02u]: %lukB free\n", zone, n << (PAGE_SHIFT-10));
kfraser@14130 1312 }
kfraser@14130 1313 }
kfraser@14130 1314
kfraser@14130 1315 printk(" Dom heap: %lukB free\n", total << (PAGE_SHIFT-10));
kaf24@9051 1316 }
kaf24@9051 1317
keir@20048 1318 static struct keyhandler pagealloc_info_keyhandler = {
keir@20048 1319 .diagnostic = 1,
keir@20048 1320 .u.fn = pagealloc_info,
keir@20048 1321 .desc = "memory info"
keir@20048 1322 };
kaf24@9051 1323
kaf24@9051 1324 static __init int pagealloc_keyhandler_init(void)
kaf24@9051 1325 {
keir@20048 1326 register_keyhandler('m', &pagealloc_info_keyhandler);
kaf24@9051 1327 return 0;
kaf24@9051 1328 }
kaf24@9051 1329 __initcall(pagealloc_keyhandler_init);
kaf24@9051 1330
kaf24@9051 1331
keir@19684 1332 void scrub_one_page(struct page_info *pg)
keir@19684 1333 {
keir@20277 1334 void *p = __map_domain_page(pg);
keir@19684 1335
keir@20958 1336 if ( unlikely(pg->count_info & PGC_broken) )
keir@20958 1337 return;
keir@20958 1338
keir@19924 1339 #ifndef NDEBUG
keir@19924 1340 /* Avoid callers relying on allocations returning zeroed pages. */
keir@19924 1341 memset(p, 0xc2, PAGE_SIZE);
keir@19924 1342 #else
keir@19924 1343 /* For a production build, clear_page() is the fastest way to scrub. */
keir@19924 1344 clear_page(p);
keir@19924 1345 #endif
keir@19684 1346
keir@19924 1347 unmap_domain_page(p);
kaf24@10539 1348 }
kaf24@10539 1349
kfraser@11932 1350 static void dump_heap(unsigned char key)
kfraser@11932 1351 {
keir@14134 1352 s_time_t now = NOW();
keir@14134 1353 int i, j;
kfraser@11932 1354
kfraser@11932 1355 printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key,
kfraser@11932 1356 (u32)(now>>32), (u32)now);
kfraser@11932 1357
keir@14134 1358 for ( i = 0; i < MAX_NUMNODES; i++ )
keir@14134 1359 {
keir@14134 1360 if ( !avail[i] )
keir@14134 1361 continue;
keir@14134 1362 for ( j = 0; j < NR_ZONES; j++ )
keir@14134 1363 printk("heap[node=%d][zone=%d] -> %lu pages\n",
keir@14134 1364 i, j, avail[i][j]);
keir@14134 1365 }
kfraser@11932 1366 }
kfraser@11932 1367
keir@20048 1368 static struct keyhandler dump_heap_keyhandler = {
keir@20048 1369 .diagnostic = 1,
keir@20048 1370 .u.fn = dump_heap,
keir@20048 1371 .desc = "dump heap info"
keir@20048 1372 };
keir@20048 1373
kfraser@11932 1374 static __init int register_heap_trigger(void)
kfraser@11932 1375 {
keir@20048 1376 register_keyhandler('H', &dump_heap_keyhandler);
kfraser@11932 1377 return 0;
kfraser@11932 1378 }
kfraser@11932 1379 __initcall(register_heap_trigger);
kfraser@11932 1380
kaf24@3952 1381 /*
kaf24@3952 1382 * Local variables:
kaf24@3952 1383 * mode: C
kaf24@3952 1384 * c-set-style: "BSD"
kaf24@3952 1385 * c-basic-offset: 4
kaf24@3952 1386 * tab-width: 4
kaf24@3952 1387 * indent-tabs-mode: nil
kaf24@4026 1388 * End:
kaf24@3952 1389 */