Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/common/page_alloc.c
Line
Count
Source
1
/******************************************************************************
2
 * page_alloc.c
3
 * 
4
 * Simple buddy heap allocator for Xen.
5
 * 
6
 * Copyright (c) 2002-2004 K A Fraser
7
 * Copyright (c) 2006 IBM Ryan Harper <ryanh@us.ibm.com>
8
 * 
9
 * This program is free software; you can redistribute it and/or modify
10
 * it under the terms of the GNU General Public License as published by
11
 * the Free Software Foundation; either version 2 of the License, or
12
 * (at your option) any later version.
13
 * 
14
 * This program is distributed in the hope that it will be useful,
15
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
 * GNU General Public License for more details.
18
 * 
19
 * You should have received a copy of the GNU General Public License
20
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
21
 */
22
23
/*
24
 * In general Xen maintains two pools of memory:
25
 *
26
 * - Xen heap: Memory which is always mapped (i.e. accessible by
27
 *             virtual address), via a permanent and contiguous
28
 *             "direct mapping". Macros like va() and pa() are valid
29
 *             for such memory and it is always permissible to stash
30
 *             pointers to Xen heap memory in data structures etc.
31
 *
32
 *             Xen heap pages are always anonymous (that is, not tied
33
 *             or accounted to any particular domain).
34
 *
35
 * - Dom heap: Memory which must be explicitly mapped, usually
36
 *             transiently with map_domain_page(), in order to be
37
 *             used. va() and pa() are not valid for such memory. Care
38
 *             should be taken when stashing pointers to dom heap
39
 *             pages that those mappings are permanent (e.g. vmap() or
40
 *             map_domain_page_global()); it is not safe to stash
41
 *             transient mappings such as those from map_domain_page()
42
 *
43
 *             Dom heap pages are often tied to a particular domain,
44
 *             but need not be (passing domain==NULL results in an
45
 *             anonymous dom heap allocation).
46
 *
47
 * The exact nature of this split is a (sub)arch decision which can
48
 * select one of three main variants:
49
 *
50
 * CONFIG_SEPARATE_XENHEAP=y
51
 *
52
 *   The xen heap is maintained as an entirely separate heap.
53
 *
54
 *   Arch code arranges for some (perhaps small) amount of physical
55
 *   memory to be covered by a direct mapping and registers that
56
 *   memory as the Xen heap (via init_xenheap_pages()) and the
57
 *   remainder as the dom heap.
58
 *
59
 *   This mode of operation is most commonly used by 32-bit arches
60
 *   where the virtual address space is insufficient to map all RAM.
61
 *
62
 * CONFIG_SEPARATE_XENHEAP=n W/ DIRECT MAP OF ALL RAM
63
 *
64
 *   All of RAM is covered by a permanent contiguous mapping and there
65
 *   is only a single heap.
66
 *
67
 *   Memory allocated from the Xen heap is flagged (in
68
 *   page_info.count_info) with PGC_xen_heap. Memory allocated from
69
 *   the Dom heap must still be explicitly mapped before use
70
 *   (e.g. with map_domain_page()), in particular in common code.
71
 *
72
 *   xenheap_max_mfn() should not be called by arch code.
73
 *
74
 *   This mode of operation is most commonly used by 64-bit arches
75
 *   which have sufficient free virtual address space to permanently
76
 *   map the largest practical amount of RAM currently expected on that
77
 *   arch.
78
 *
79
 * CONFIG_SEPARATE_XENHEAP=n W/ DIRECT MAP OF ONLY PARTIAL RAM
80
 *
81
 *   There is a single heap, but only the beginning (up to some
82
 *   threshold) is covered by a permanent contiguous mapping.
83
 *
84
 *   Memory allocated from the Xen heap is allocated from below the
85
 *   threshold and flagged with PGC_xen_heap. Memory allocated from
86
 *   the dom heap is allocated from anywhere in the heap (although it
87
 *   will prefer to allocate from as high as possible to try and keep
88
 *   Xen heap suitable memory available).
89
 *
90
 *   Arch code must call xenheap_max_mfn() to signal the limit of the
91
 *   direct mapping.
92
 *
93
 *   This mode of operation is most commonly used by 64-bit arches
94
 *   which have a restricted amount of virtual address space available
95
 *   for a direct map (due to e.g. reservations for other purposes)
96
 *   such that it is not possible to map all of RAM on systems with
97
 *   the largest practical amount of RAM currently expected on that
98
 *   arch.
99
 *
100
 * Boot Allocator
101
 *
102
 *   In addition to the two primary pools (xen heap and dom heap) a
103
 *   third "boot allocator" is used at start of day. This is a
104
 *   simplified allocator usable before the main heap allocators are ready.
105
 *
106
 *   Typically all memory which is destined to be dom heap memory
107
 *   (which is everything in the CONFIG_SEPARATE_XENHEAP=n
108
 *   configurations) is first allocated to the boot allocator (with
109
 *   init_boot_pages()) and is then handed over to the main dom heap in
110
 *   end_boot_allocator().
111
 *
112
 * "Contiguous" mappings
113
 *
114
 *   Note that although the above talks about "contiguous" mappings
115
 *   some architectures implement a scheme ("PDX compression") to
116
 *   compress unused portions of the machine address space (i.e. large
117
 *   gaps between distinct banks of memory) in order to avoid creating
118
 *   enormous frame tables and direct maps which mostly map
119
 *   nothing. Thus a contiguous mapping may still have distinct
120
 *   regions within it.
121
 */
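As a rough illustration of the split described above, the sketch below (illustrative hypervisor-side caller code, not part of page_alloc.c) contrasts the two pools: a Xen heap allocation is usable through the direct map immediately, while an anonymous dom heap allocation (domain == NULL) must be mapped transiently before it is touched. It assumes the standard Xen interfaces alloc_xenheap_pages(), alloc_domheap_pages(), __map_domain_page() and unmap_domain_page(); the function itself is hypothetical.

#include <xen/mm.h>            /* alloc_xenheap_pages(), alloc_domheap_pages() */
#include <xen/domain_page.h>   /* __map_domain_page(), unmap_domain_page() */
#include <xen/string.h>        /* memset() */

/* Illustrative only: contrast the two pools described above. */
static void heap_usage_sketch(void)
{
    /* Xen heap: the returned pointer is a usable virtual address right away. */
    void *xv = alloc_xenheap_pages(0, 0);
    /* Dom heap, anonymous (domain == NULL): must be mapped before use. */
    struct page_info *pg = alloc_domheap_pages(NULL, 0, 0);

    if ( xv )
    {
        memset(xv, 0, PAGE_SIZE);          /* direct map makes this legal */
        free_xenheap_pages(xv, 0);
    }

    if ( pg )
    {
        void *dv = __map_domain_page(pg);  /* transient mapping */

        memset(dv, 0, PAGE_SIZE);
        unmap_domain_page(dv);             /* do not stash 'dv' anywhere */
        free_domheap_pages(pg, 0);
    }
}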
122
123
#include <xen/init.h>
124
#include <xen/types.h>
125
#include <xen/lib.h>
126
#include <xen/sched.h>
127
#include <xen/spinlock.h>
128
#include <xen/mm.h>
129
#include <xen/irq.h>
130
#include <xen/softirq.h>
131
#include <xen/domain_page.h>
132
#include <xen/keyhandler.h>
133
#include <xen/perfc.h>
134
#include <xen/pfn.h>
135
#include <xen/numa.h>
136
#include <xen/nodemask.h>
137
#include <xen/event.h>
138
#include <xen/tmem.h>
139
#include <xen/tmem_xen.h>
140
#include <public/sysctl.h>
141
#include <public/sched.h>
142
#include <asm/page.h>
143
#include <asm/numa.h>
144
#include <asm/flushtlb.h>
145
#ifdef CONFIG_X86
146
#include <asm/p2m.h>
147
#include <asm/setup.h> /* for highmem_start only */
148
#else
149
#define p2m_pod_offline_or_broken_hit(pg) 0
150
#define p2m_pod_offline_or_broken_replace(pg) BUG_ON(pg != NULL)
151
#endif
152
153
/*
154
 * Comma-separated list of hexadecimal page numbers containing bad bytes.
155
 * e.g. 'badpage=0x3f45,0x8a321'.
156
 */
157
static char __initdata opt_badpage[100] = "";
158
string_param("badpage", opt_badpage);
159
160
/*
161
 * no-bootscrub -> Free pages are not zeroed during boot.
162
 */
163
static bool_t opt_bootscrub __initdata = 1;
164
boolean_param("bootscrub", opt_bootscrub);
165
166
/*
167
 * bootscrub_chunk -> Number of bytes to scrub in lockstep on non-SMT CPUs
168
 * on all NUMA nodes.
169
 */
170
static unsigned long __initdata opt_bootscrub_chunk = MB(128);
171
size_param("bootscrub_chunk", opt_bootscrub_chunk);
172
173
#ifdef CONFIG_SCRUB_DEBUG
174
static bool __read_mostly scrub_debug;
175
#else
176
#define scrub_debug    false
177
#endif
178
179
/*
180
 * Bit width of the DMA heap -- used to override the NUMA-node-first
181
 * allocation strategy, which can otherwise exhaust low memory.
182
 */
183
static unsigned int dma_bitsize;
184
integer_param("dma_bits", dma_bitsize);
185
186
/* Offlined page list, protected by heap_lock. */
187
PAGE_LIST_HEAD(page_offlined_list);
188
/* Broken page list, protected by heap_lock. */
189
PAGE_LIST_HEAD(page_broken_list);
190
191
/*************************
192
 * BOOT-TIME ALLOCATOR
193
 */
194
195
/*
196
 * first_valid_mfn is exported because it is used in ARM-specific NUMA
197
 * helpers. See comment in asm-arm/numa.h.
198
 */
199
unsigned long first_valid_mfn = ~0UL;
200
201
static struct bootmem_region {
202
    unsigned long s, e; /* MFNs @s through @e-1 inclusive are free */
203
} *__initdata bootmem_region_list;
204
static unsigned int __initdata nr_bootmem_regions;
205
206
struct scrub_region {
207
    unsigned long offset;
208
    unsigned long start;
209
    unsigned long per_cpu_sz;
210
    unsigned long rem;
211
    cpumask_t cpus;
212
};
213
static struct scrub_region __initdata region[MAX_NUMNODES];
214
static unsigned long __initdata chunk_size;
215
216
static void __init bootmem_region_add(unsigned long s, unsigned long e)
217
93
{
218
93
    unsigned int i;
219
93
220
93
    if ( (bootmem_region_list == NULL) && (s < e) )
221
1
        bootmem_region_list = mfn_to_virt(s++);
222
93
223
93
    if ( s >= e )
224
77
        return;
225
93
226
134
    for ( i = 0; i < nr_bootmem_regions; i++ )
227
120
        if ( s < bootmem_region_list[i].e )
228
2
            break;
229
16
230
16
    BUG_ON((i < nr_bootmem_regions) && (e > bootmem_region_list[i].s));
231
16
    BUG_ON(nr_bootmem_regions == (PAGE_SIZE / sizeof(struct bootmem_region)));
232
16
233
16
    memmove(&bootmem_region_list[i+1], &bootmem_region_list[i],
234
16
            (nr_bootmem_regions - i) * sizeof(*bootmem_region_list));
235
16
    bootmem_region_list[i] = (struct bootmem_region) { s, e };
236
16
    nr_bootmem_regions++;
237
16
}
238
239
static void __init bootmem_region_zap(unsigned long s, unsigned long e)
240
0
{
241
0
    unsigned int i;
242
0
243
0
    for ( i = 0; i < nr_bootmem_regions; i++ )
244
0
    {
245
0
        struct bootmem_region *r = &bootmem_region_list[i];
246
0
        if ( e <= r->s )
247
0
            break;
248
0
        if ( s >= r->e )
249
0
            continue;
250
0
        if ( s <= r->s )
251
0
        {
252
0
            r->s = min(e, r->e);
253
0
        }
254
0
        else if ( e >= r->e )
255
0
        {
256
0
            r->e = s;
257
0
        }
258
0
        else
259
0
        {
260
0
            unsigned long _e = r->e;
261
0
            r->e = s;
262
0
            bootmem_region_add(e, _e);
263
0
        }
264
0
    }
265
0
}
266
267
void __init init_boot_pages(paddr_t ps, paddr_t pe)
268
23
{
269
23
    unsigned long bad_spfn, bad_epfn;
270
23
    const char *p;
271
23
#ifdef CONFIG_X86
272
23
    const unsigned long *badpage = NULL;
273
23
    unsigned int i, array_size;
274
23
275
23
    BUILD_BUG_ON(8 * sizeof(frame_table->u.free.first_dirty) <
276
23
                 MAX_ORDER + 1);
277
23
#endif
278
23
    BUILD_BUG_ON(sizeof(frame_table->u) != sizeof(unsigned long));
279
23
280
23
    ps = round_pgup(ps);
281
23
    pe = round_pgdown(pe);
282
23
    if ( pe <= ps )
283
8
        return;
284
23
285
15
    first_valid_mfn = min_t(unsigned long, ps >> PAGE_SHIFT, first_valid_mfn);
286
15
287
15
    bootmem_region_add(ps >> PAGE_SHIFT, pe >> PAGE_SHIFT);
288
15
289
15
#ifdef CONFIG_X86
290
15
    /* 
291
15
     * Here we put platform-specific memory range workarounds, i.e.
292
15
     * memory known to be corrupt or otherwise in need of being reserved on
293
15
     * specific platforms.
294
15
     * We fetch these pages and remove them from the memory region list.
295
15
     */
296
15
    badpage = get_platform_badpages(&array_size);
297
15
    if ( badpage )
298
0
    {
299
0
        for ( i = 0; i < array_size; i++ )
300
0
        {
301
0
            bootmem_region_zap(*badpage >> PAGE_SHIFT,
302
0
                               (*badpage >> PAGE_SHIFT) + 1);
303
0
            badpage++;
304
0
        }
305
0
    }
306
15
#endif
307
15
308
15
    /* Check new pages against the bad-page list. */
309
15
    p = opt_badpage;
310
15
    while ( *p != '\0' )
311
0
    {
312
0
        bad_spfn = simple_strtoul(p, &p, 0);
313
0
        bad_epfn = bad_spfn;
314
0
315
0
        if ( *p == '-' )
316
0
        {
317
0
            p++;
318
0
            bad_epfn = simple_strtoul(p, &p, 0);
319
0
            if ( bad_epfn < bad_spfn )
320
0
                bad_epfn = bad_spfn;
321
0
        }
322
0
323
0
        if ( *p == ',' )
324
0
            p++;
325
0
        else if ( *p != '\0' )
326
0
            break;
327
0
328
0
        bootmem_region_zap(bad_spfn, bad_epfn+1);
329
0
    }
330
15
}
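The badpage parsing loop above accepts hexadecimal page numbers and ranges separated by commas, e.g. "badpage=0x3f45,0x8a321-0x8a330". A minimal standalone sketch of the same parsing, with the standard C strtoul() standing in for Xen's simple_strtoul() and a hypothetical zap_range() standing in for bootmem_region_zap():

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for the bootmem_region_zap() calls made above. */
static void zap_range(unsigned long spfn, unsigned long epfn)
{
    printf("zap pfns [%#lx, %#lx)\n", spfn, epfn);
}

static void parse_badpage(const char *p)
{
    while ( *p != '\0' )
    {
        char *q;
        unsigned long bad_spfn = strtoul(p, &q, 0);
        unsigned long bad_epfn = bad_spfn;

        p = q;
        if ( *p == '-' )
        {
            bad_epfn = strtoul(++p, &q, 0);
            p = q;
            if ( bad_epfn < bad_spfn )
                bad_epfn = bad_spfn;
        }

        if ( *p == ',' )
            p++;
        else if ( *p != '\0' )
            break;

        zap_range(bad_spfn, bad_epfn + 1);   /* end is exclusive, as above */
    }
}

int main(void)
{
    parse_badpage("0x3f45,0x8a321-0x8a330");
    return 0;
}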
331
332
mfn_t __init alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
333
78
{
334
78
    unsigned long pg, _e;
335
78
    unsigned int i = nr_bootmem_regions;
336
78
337
78
    BUG_ON(!nr_bootmem_regions);
338
78
339
141
    while ( i-- )
340
141
    {
341
141
        struct bootmem_region *r = &bootmem_region_list[i];
342
141
343
141
        pg = (r->e - nr_pfns) & ~(pfn_align - 1);
344
141
        if ( pg >= r->e || pg < r->s )
345
63
            continue;
346
141
347
141
#if defined(CONFIG_X86) && !defined(NDEBUG)
348
141
        /*
349
141
         * Filtering pfn_align == 1 since the only allocations using a bigger
350
141
         * alignment are the ones used for setting up the frame table chunks.
351
141
         * Those allocations get remapped anyway, i.e. it is not a problem that
352
141
         * their 1:1 mappings are not always accessible.
353
141
         */
354
78
        if ( highmem_start && pfn_align == 1 &&
355
0
             r->e > PFN_DOWN(highmem_start) )
356
0
        {
357
0
            pg = r->s;
358
0
            if ( pg + nr_pfns > PFN_DOWN(highmem_start) )
359
0
                continue;
360
0
            r->s = pg + nr_pfns;
361
0
            return _mfn(pg);
362
0
        }
363
78
#endif
364
78
365
78
        _e = r->e;
366
78
        r->e = pg;
367
78
        bootmem_region_add(pg + nr_pfns, _e);
368
78
        return _mfn(pg);
369
78
    }
370
78
371
0
    BUG();
372
0
}
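alloc_boot_pages() above carves an aligned block from the top of the highest-addressed region that can hold it and re-registers any leftover tail. A toy standalone model of that carve, using hypothetical names and the same alignment arithmetic:

#include <assert.h>
#include <stdio.h>

struct region { unsigned long s, e; };   /* free PFNs in [s, e) */

/* Carve 'nr' pages, aligned to 'align', from the top of region 'r'. */
static unsigned long carve_from_top(struct region *r, unsigned long nr,
                                    unsigned long align)
{
    unsigned long pg = (r->e - nr) & ~(align - 1);   /* align downwards */

    if ( pg >= r->e || pg < r->s )
        return ~0UL;                                 /* does not fit */

    /* [pg + nr, old e) would be re-registered via bootmem_region_add(). */
    r->e = pg;
    return pg;
}

int main(void)
{
    struct region r = { .s = 0x100, .e = 0x200 };
    unsigned long pg = carve_from_top(&r, 8, 4);

    assert(pg == 0x1f8 && r.e == 0x1f8);
    printf("allocated pfn %#lx, region now [%#lx, %#lx)\n", pg, r.s, r.e);
    return 0;
}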
373
374
375
376
/*************************
377
 * BINARY BUDDY ALLOCATOR
378
 */
379
380
43.2k
#define MEMZONE_XEN 0
381
18.4E
#define NR_ZONES    (PADDR_BITS - PAGE_SHIFT + 1)
382
383
0
#define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 1 : ((b) - PAGE_SHIFT))
384
4.18M
#define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN :  \
385
4.18M
                          (flsl(page_to_mfn(pg)) ? : 1))
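page_to_zone() above derives a dom heap page's zone purely from its MFN: flsl() yields the 1-based index of the most significant set bit, so zone z holds MFNs in [2^(z-1), 2^z), with zone 0 reserved for MEMZONE_XEN and MFN 0 forced into zone 1. A small standalone model, with a hypothetical fls_ulong() in place of flsl():

#include <assert.h>

/* Hypothetical stand-in for Xen's flsl(): index of highest set bit, 1-based. */
static unsigned int fls_ulong(unsigned long x)
{
    unsigned int r = 0;

    while ( x )
    {
        x >>= 1;
        r++;
    }
    return r;
}

/* Mirror of page_to_zone() for a non-Xen-heap page. */
static unsigned int mfn_to_zone(unsigned long mfn)
{
    unsigned int z = fls_ulong(mfn);

    return z ? z : 1;   /* MFN 0 still lands in zone 1, not MEMZONE_XEN */
}

int main(void)
{
    assert(mfn_to_zone(0) == 1);
    assert(mfn_to_zone(1) == 1);
    assert(mfn_to_zone(0x100) == 9);      /* 2^8 -> zone 9 */
    assert(mfn_to_zone(0x1ffff) == 17);
    return 0;
}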
386
387
typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
388
static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
389
8.42M
#define heap(node, zone, order) ((*_heap[node])[zone][order])
390
391
static unsigned long node_need_scrub[MAX_NUMNODES];
392
393
static unsigned long *avail[MAX_NUMNODES];
394
static long total_avail_pages;
395
396
/* TMEM: Reserve a fraction of memory for mid-size (0<order<9) allocations. */
397
static long midsize_alloc_zone_pages;
398
#define MIDSIZE_ALLOC_FRAC 128
399
400
static DEFINE_SPINLOCK(heap_lock);
401
static long outstanding_claims; /* total outstanding claims by all domains */
402
403
unsigned long domain_adjust_tot_pages(struct domain *d, long pages)
404
77
{
405
77
    long dom_before, dom_after, dom_claimed, sys_before, sys_after;
406
77
407
77
    ASSERT(spin_is_locked(&d->page_alloc_lock));
408
77
    d->tot_pages += pages;
409
77
410
77
    /*
411
77
     * can test d->outstanding_pages race-free because it can only change
412
77
     * if d->page_alloc_lock and heap_lock are both held, see also
413
77
     * domain_set_outstanding_pages below
414
77
     */
415
77
    if ( !d->outstanding_pages )
416
77
        goto out;
417
77
418
0
    spin_lock(&heap_lock);
419
0
    /* adjust domain outstanding pages; may not go negative */
420
0
    dom_before = d->outstanding_pages;
421
0
    dom_after = dom_before - pages;
422
0
    BUG_ON(dom_before < 0);
423
0
    dom_claimed = dom_after < 0 ? 0 : dom_after;
424
0
    d->outstanding_pages = dom_claimed;
425
0
    /* flag accounting bug if system outstanding_claims would go negative */
426
0
    sys_before = outstanding_claims;
427
0
    sys_after = sys_before - (dom_before - dom_claimed);
428
0
    BUG_ON(sys_after < 0);
429
0
    outstanding_claims = sys_after;
430
0
    spin_unlock(&heap_lock);
431
0
432
77
out:
433
77
    return d->tot_pages;
434
0
}
435
436
int domain_set_outstanding_pages(struct domain *d, unsigned long pages)
437
0
{
438
0
    int ret = -ENOMEM;
439
0
    unsigned long claim, avail_pages;
440
0
441
0
    /*
442
0
     * take the domain's page_alloc_lock, else all d->tot_pages adjustments
443
0
     * must always take the global heap_lock rather than only in the much
444
0
     * rarer case that d->outstanding_pages is non-zero
445
0
     */
446
0
    spin_lock(&d->page_alloc_lock);
447
0
    spin_lock(&heap_lock);
448
0
449
0
    /* pages==0 means "unset" the claim. */
450
0
    if ( pages == 0 )
451
0
    {
452
0
        outstanding_claims -= d->outstanding_pages;
453
0
        d->outstanding_pages = 0;
454
0
        ret = 0;
455
0
        goto out;
456
0
    }
457
0
458
0
    /* only one active claim per domain please */
459
0
    if ( d->outstanding_pages )
460
0
    {
461
0
        ret = -EINVAL;
462
0
        goto out;
463
0
    }
464
0
465
0
    /* disallow a claim not exceeding current tot_pages or above max_pages */
466
0
    if ( (pages <= d->tot_pages) || (pages > d->max_pages) )
467
0
    {
468
0
        ret = -EINVAL;
469
0
        goto out;
470
0
    }
471
0
472
0
    /* how much memory is available? */
473
0
    avail_pages = total_avail_pages;
474
0
475
0
    /* Note: The usage of claim means that allocation from a guest *might*
476
0
     * have to come from freeable memory. Using free memory is always better, if
477
0
     * it is available, than using freeable memory.
478
0
     *
479
0
     * But that is OK as once the claim has been made, it still can take minutes
480
0
     * before the claim is fully satisfied. Tmem can make use of the unclaimed
481
0
     * pages during this time (to store ephemeral/freeable pages only,
482
0
     * not persistent pages).
483
0
     */
484
0
    avail_pages += tmem_freeable_pages();
485
0
    avail_pages -= outstanding_claims;
486
0
487
0
    /*
488
0
     * Note, if domain has already allocated memory before making a claim
489
0
     * then the claim must take tot_pages into account
490
0
     */
491
0
    claim = pages - d->tot_pages;
492
0
    if ( claim > avail_pages )
493
0
        goto out;
494
0
495
0
    /* yay, claim fits in available memory, stake the claim, success! */
496
0
    d->outstanding_pages = claim;
497
0
    outstanding_claims += d->outstanding_pages;
498
0
    ret = 0;
499
0
500
0
out:
501
0
    spin_unlock(&heap_lock);
502
0
    spin_unlock(&d->page_alloc_lock);
503
0
    return ret;
504
0
}
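The admission check in domain_set_outstanding_pages() reduces to simple arithmetic: the new claim is the requested total minus what the domain already holds, and it must fit within free plus freeable memory not already promised to other claims. A standalone toy model of that check (hypothetical names, same comparisons):

#include <assert.h>
#include <stdbool.h>

/* Toy model of the claim admission check performed above. */
static bool claim_fits(unsigned long pages, unsigned long tot_pages,
                       unsigned long max_pages, unsigned long total_avail,
                       unsigned long freeable, unsigned long outstanding)
{
    unsigned long avail, claim;

    /* A claim must exceed current usage and respect the domain maximum. */
    if ( pages <= tot_pages || pages > max_pages )
        return false;

    avail = total_avail + freeable - outstanding;
    claim = pages - tot_pages;

    return claim <= avail;
}

int main(void)
{
    /* 1000 free pages, 200 already claimed elsewhere: 800 left to promise. */
    assert(claim_fits(900, 100, 2000, 1000, 0, 200));    /* claim 800: ok   */
    assert(!claim_fits(1000, 100, 2000, 1000, 0, 200));  /* claim 900: no   */
    assert(!claim_fits(50, 100, 2000, 1000, 0, 200));    /* below tot_pages */
    return 0;
}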
505
506
void get_outstanding_claims(uint64_t *free_pages, uint64_t *outstanding_pages)
507
0
{
508
0
    spin_lock(&heap_lock);
509
0
    *outstanding_pages = outstanding_claims;
510
0
    *free_pages =  avail_domheap_pages();
511
0
    spin_unlock(&heap_lock);
512
0
}
513
514
static bool_t __read_mostly first_node_initialised;
515
#ifndef CONFIG_SEPARATE_XENHEAP
516
static unsigned int __read_mostly xenheap_bits;
517
#else
518
#define xenheap_bits 0
519
#endif
520
521
static unsigned long init_node_heap(int node, unsigned long mfn,
522
                                    unsigned long nr, bool_t *use_tail)
523
1
{
524
1
    /* First node to be discovered has its heap metadata statically alloced. */
525
1
    static heap_by_zone_and_order_t _heap_static;
526
1
    static unsigned long avail_static[NR_ZONES];
527
1
    unsigned long needed = (sizeof(**_heap) +
528
1
                            sizeof(**avail) * NR_ZONES +
529
1
                            PAGE_SIZE - 1) >> PAGE_SHIFT;
530
1
    int i, j;
531
1
532
1
    if ( !first_node_initialised )
533
1
    {
534
1
        _heap[node] = &_heap_static;
535
1
        avail[node] = avail_static;
536
1
        first_node_initialised = 1;
537
1
        needed = 0;
538
1
    }
539
0
    else if ( *use_tail && nr >= needed &&
540
0
              arch_mfn_in_directmap(mfn + nr) &&
541
0
              (!xenheap_bits ||
542
0
               !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT))) )
543
0
    {
544
0
        _heap[node] = mfn_to_virt(mfn + nr - needed);
545
0
        avail[node] = mfn_to_virt(mfn + nr - 1) +
546
0
                      PAGE_SIZE - sizeof(**avail) * NR_ZONES;
547
0
    }
548
0
    else if ( nr >= needed &&
549
0
              arch_mfn_in_directmap(mfn + needed) &&
550
0
              (!xenheap_bits ||
551
0
               !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
552
0
    {
553
0
        _heap[node] = mfn_to_virt(mfn);
554
0
        avail[node] = mfn_to_virt(mfn + needed - 1) +
555
0
                      PAGE_SIZE - sizeof(**avail) * NR_ZONES;
556
0
        *use_tail = 0;
557
0
    }
558
0
    else if ( get_order_from_bytes(sizeof(**_heap)) ==
559
0
              get_order_from_pages(needed) )
560
0
    {
561
0
        _heap[node] = alloc_xenheap_pages(get_order_from_pages(needed), 0);
562
0
        BUG_ON(!_heap[node]);
563
0
        avail[node] = (void *)_heap[node] + (needed << PAGE_SHIFT) -
564
0
                      sizeof(**avail) * NR_ZONES;
565
0
        needed = 0;
566
0
    }
567
0
    else
568
0
    {
569
0
        _heap[node] = xmalloc(heap_by_zone_and_order_t);
570
0
        avail[node] = xmalloc_array(unsigned long, NR_ZONES);
571
0
        BUG_ON(!_heap[node] || !avail[node]);
572
0
        needed = 0;
573
0
    }
574
1
575
1
    memset(avail[node], 0, NR_ZONES * sizeof(long));
576
1
577
42
    for ( i = 0; i < NR_ZONES; i++ )
578
820
        for ( j = 0; j <= MAX_ORDER; j++ )
579
779
            INIT_PAGE_LIST_HEAD(&heap(node, i, j));
580
1
581
1
    return needed;
582
1
}
583
584
/* Default to 64 MiB */
585
1
#define DEFAULT_LOW_MEM_VIRQ    (((paddr_t) 64)   << 20)
586
#define MAX_LOW_MEM_VIRQ        (((paddr_t) 1024) << 20)
587
588
static paddr_t __read_mostly opt_low_mem_virq = ((paddr_t) -1);
589
size_param("low_mem_virq_limit", opt_low_mem_virq);
590
591
/* Thresholds to control hysteresis. In pages */
592
/* When memory grows above this threshold, reset hysteresis.
593
 * -1 initially so we do not reset until at least one virq has been issued. */
594
static unsigned long low_mem_virq_high      = -1UL;
595
/* Threshold at which we issue virq */
596
static unsigned long low_mem_virq_th        = 0;
597
/* Original threshold after all checks completed */
598
static unsigned long low_mem_virq_orig      = 0;
599
/* Order for current threshold */
600
static unsigned int  low_mem_virq_th_order  = 0;
601
602
/* Perform bootstrapping checks and set bounds */
603
static void __init setup_low_mem_virq(void)
604
1
{
605
1
    unsigned int order;
606
1
    paddr_t threshold;
607
1
    bool_t halve;
608
1
609
1
    /* If the user specifies zero, then he/she doesn't want this virq
610
1
     * to ever trigger. */
611
1
    if ( opt_low_mem_virq == 0 )
612
0
    {
613
0
        low_mem_virq_th = -1UL;
614
0
        return;
615
0
    }
616
1
617
1
    /* If the user did not specify a knob, remember that */
618
1
    halve = (opt_low_mem_virq == ((paddr_t) -1));
619
1
    threshold = halve ? DEFAULT_LOW_MEM_VIRQ : opt_low_mem_virq;
620
1
621
1
    /* Dom0 has already been allocated by now. So check we won't be
622
1
     * complaining immediately with whatever's left of the heap. */
623
1
    threshold = min(threshold,
624
1
                    ((paddr_t) total_avail_pages) << PAGE_SHIFT);
625
1
626
1
    /* Then, cap to some predefined maximum */
627
1
    threshold = min(threshold, MAX_LOW_MEM_VIRQ);
628
1
629
1
    /* If the user specified no knob, and we are at the current available
630
1
     * level, halve the threshold. */
631
1
    if ( halve &&
632
1
         (threshold == (((paddr_t) total_avail_pages) << PAGE_SHIFT)) )
633
0
        threshold >>= 1;
634
1
635
1
    /* Zero? Have to fire immediately */
636
1
    threshold = max(threshold, (paddr_t) PAGE_SIZE);
637
1
638
1
    /* Threshold bytes -> pages */
639
1
    low_mem_virq_th = threshold >> PAGE_SHIFT;
640
1
641
1
    /* Next, round the threshold down to the next order */
642
1
    order = get_order_from_pages(low_mem_virq_th);
643
1
    if ( (1UL << order) > low_mem_virq_th )
644
0
        order--;
645
1
646
1
    /* Set bounds, ready to go */
647
1
    low_mem_virq_th = low_mem_virq_orig = 1UL << order;
648
1
    low_mem_virq_th_order = order;
649
1
650
1
    printk("Initial low memory virq threshold set at %#lx pages.\n",
651
1
            low_mem_virq_th);
652
1
}
653
654
static void check_low_mem_virq(void)
655
43.2k
{
656
43.2k
    unsigned long avail_pages = total_avail_pages +
657
43.2k
        tmem_freeable_pages() - outstanding_claims;
658
43.2k
659
43.2k
    if ( unlikely(avail_pages <= low_mem_virq_th) )
660
0
    {
661
0
        send_global_virq(VIRQ_ENOMEM);
662
0
663
0
        /* Update thresholds. Next warning will be when we drop below
664
0
         * next order. However, we wait until we grow beyond one
665
0
         * order above us to complain again at the current order */
666
0
        low_mem_virq_high   = 1UL << (low_mem_virq_th_order + 1);
667
0
        if ( low_mem_virq_th_order > 0 )
668
0
            low_mem_virq_th_order--;
669
0
        low_mem_virq_th     = 1UL << low_mem_virq_th_order;
670
0
        return;
671
0
    }
672
43.2k
673
43.2k
    if ( unlikely(avail_pages >= low_mem_virq_high) )
674
0
    {
675
0
        /* Reset hysteresis. Bring threshold up one order.
676
0
         * If we are back where originally set, set high
677
0
         * threshold to -1 to avoid further growth of
678
0
         * virq threshold. */
679
0
        low_mem_virq_th_order++;
680
0
        low_mem_virq_th = 1UL << low_mem_virq_th_order;
681
0
        if ( low_mem_virq_th == low_mem_virq_orig )
682
0
            low_mem_virq_high = -1UL;
683
0
        else
684
0
            low_mem_virq_high = 1UL << (low_mem_virq_th_order + 2);
685
0
    }
686
43.2k
}
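setup_low_mem_virq()/check_low_mem_virq() above implement a power-of-two hysteresis: each time the virq fires the threshold drops one order, and a level is only re-armed once free memory has climbed well above it. A compact standalone model of that bookkeeping (hypothetical names, same arithmetic):

#include <stdio.h>

static unsigned long th, high = ~0UL, orig;
static unsigned int th_order;

/* Mirror of check_low_mem_virq()'s threshold bookkeeping. */
static void check_low_mem(unsigned long avail_pages)
{
    if ( avail_pages <= th )
    {
        printf("VIRQ_ENOMEM at %lu pages (threshold %lu)\n", avail_pages, th);
        high = 1UL << (th_order + 1);
        if ( th_order > 0 )
            th_order--;
        th = 1UL << th_order;
        return;
    }

    if ( avail_pages >= high )
    {
        /* Re-arm one order up; stop growing once back at the original level. */
        th_order++;
        th = 1UL << th_order;
        high = (th == orig) ? ~0UL : 1UL << (th_order + 2);
    }
}

int main(void)
{
    th = orig = 1UL << 14;      /* e.g. 64 MiB worth of 4 KiB pages */
    th_order = 14;

    check_low_mem(10000);       /* below threshold: fires, drops to 2^13 */
    check_low_mem(12000);       /* between new threshold and 'high': no-op */
    check_low_mem(40000);       /* above 'high' (2^15): re-arms 2^14 */
    return 0;
}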
687
688
/* Pages that need a scrub are added to tail, otherwise to head. */
689
static void page_list_add_scrub(struct page_info *pg, unsigned int node,
690
                                unsigned int zone, unsigned int order,
691
                                unsigned int first_dirty)
692
4.18M
{
693
4.18M
    PFN_ORDER(pg) = order;
694
4.18M
    pg->u.free.first_dirty = first_dirty;
695
4.18M
    pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
696
4.18M
697
4.18M
    if ( first_dirty != INVALID_DIRTY_IDX )
698
127
    {
699
127
        ASSERT(first_dirty < (1U << order));
700
127
        page_list_add_tail(pg, &heap(node, zone, order));
701
127
    }
702
4.18M
    else
703
4.18M
        page_list_add(pg, &heap(node, zone, order));
704
4.18M
}
705
706
/* SCRUB_PATTERN needs to be a repeating series of bytes. */
707
#ifndef NDEBUG
708
113
#define SCRUB_PATTERN        0xc2c2c2c2c2c2c2c2ULL
709
#else
710
#define SCRUB_PATTERN        0ULL
711
#endif
712
#define SCRUB_BYTE_PATTERN   (SCRUB_PATTERN & 0xff)
713
714
static void poison_one_page(struct page_info *pg)
715
228
{
716
228
#ifdef CONFIG_SCRUB_DEBUG
717
228
    mfn_t mfn = _mfn(page_to_mfn(pg));
718
228
    uint64_t *ptr;
719
228
720
228
    if ( !scrub_debug )
721
115
        return;
722
228
723
113
    ptr = map_domain_page(mfn);
724
113
    *ptr = ~SCRUB_PATTERN;
725
113
    unmap_domain_page(ptr);
726
113
#endif
727
113
}
728
729
static void check_one_page(struct page_info *pg)
730
0
{
731
0
#ifdef CONFIG_SCRUB_DEBUG
732
0
    mfn_t mfn = _mfn(page_to_mfn(pg));
733
0
    const uint64_t *ptr;
734
0
    unsigned int i;
735
0
736
0
    if ( !scrub_debug )
737
0
        return;
738
0
739
0
    ptr = map_domain_page(mfn);
740
0
    for ( i = 0; i < PAGE_SIZE / sizeof (*ptr); i++ )
741
0
        BUG_ON(ptr[i] != SCRUB_PATTERN);
742
0
    unmap_domain_page(ptr);
743
0
#endif
744
0
}
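poison_one_page() and check_one_page() above form a cheap use-of-dirty-page detector: a page freed dirty gets the inverse of SCRUB_PATTERN written into its first word, and a page handed out as clean is verified to contain only SCRUB_PATTERN (assuming, as the debug pattern suggests, that scrubbing fills the page with that pattern). A standalone sketch of the same idea on an ordinary buffer:

#include <assert.h>
#include <stdint.h>

#define SCRUB_PATTERN 0xc2c2c2c2c2c2c2c2ULL   /* debug-build value above */
#define PAGE_WORDS    (4096 / sizeof(uint64_t))

static void poison_page(uint64_t *page)
{
    page[0] = ~SCRUB_PATTERN;                 /* mark "dirty, not yet scrubbed" */
}

static void scrub_page(uint64_t *page)
{
    for ( unsigned int i = 0; i < PAGE_WORDS; i++ )
        page[i] = SCRUB_PATTERN;
}

static void check_page(const uint64_t *page)
{
    for ( unsigned int i = 0; i < PAGE_WORDS; i++ )
        assert(page[i] == SCRUB_PATTERN);     /* BUG_ON() in the real code */
}

int main(void)
{
    static uint64_t page[PAGE_WORDS];

    poison_page(page);      /* free path for a page that needs scrubbing */
    scrub_page(page);       /* background or synchronous scrub */
    check_page(page);       /* allocation path for a supposedly clean page */
    return 0;
}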
745
746
static void check_and_stop_scrub(struct page_info *head)
747
4.14M
{
748
4.14M
    if ( head->u.free.scrub_state == BUDDY_SCRUBBING )
749
7
    {
750
7
        typeof(head->u.free) pgfree;
751
7
752
7
        head->u.free.scrub_state = BUDDY_SCRUB_ABORT;
753
7
        spin_lock_kick();
754
7
        for ( ; ; )
755
234
        {
756
234
            /* Can't ACCESS_ONCE() a bitfield. */
757
234
            pgfree.val = ACCESS_ONCE(head->u.free.val);
758
234
            if ( pgfree.scrub_state != BUDDY_SCRUB_ABORT )
759
7
                break;
760
227
            cpu_relax();
761
227
        }
762
7
    }
763
4.14M
}
764
765
static struct page_info *get_free_buddy(unsigned int zone_lo,
766
                                        unsigned int zone_hi,
767
                                        unsigned int order, unsigned int memflags,
768
                                        const struct domain *d)
769
43.2k
{
770
43.2k
    nodeid_t first_node, node = MEMF_get_node(memflags), req_node = node;
771
34.9k
    nodemask_t nodemask = d ? d->node_affinity : node_online_map;
772
43.2k
    unsigned int j, zone, nodemask_retry = 0;
773
43.2k
    struct page_info *pg;
774
43.2k
    bool use_unscrubbed = (memflags & MEMF_no_scrub);
775
43.2k
776
43.2k
    if ( node == NUMA_NO_NODE )
777
43.2k
    {
778
43.2k
        if ( d != NULL )
779
34.9k
        {
780
34.9k
            node = next_node(d->last_alloc_node, nodemask);
781
34.9k
            if ( node >= MAX_NUMNODES )
782
34.6k
                node = first_node(nodemask);
783
34.9k
        }
784
43.2k
        if ( node >= MAX_NUMNODES )
785
8.27k
            node = cpu_to_node(smp_processor_id());
786
43.2k
    }
787
71
    else if ( unlikely(node >= MAX_NUMNODES) )
788
0
    {
789
0
        ASSERT_UNREACHABLE();
790
0
        return NULL;
791
0
    }
792
43.2k
    first_node = node;
793
43.2k
794
43.2k
    /*
795
43.2k
     * Start with the requested node, but exhaust all of that node's memory in the
796
43.2k
     * requested zone before failing. Only compute a new node value if we fail to
797
43.2k
     * find memory in the target node; this avoids needless computation on the fast path.
798
43.2k
     */
799
43.2k
    for ( ; ; )
800
59.5k
    {
801
59.5k
        zone = zone_hi;
802
1.42M
        do {
803
1.42M
            /* Check if target node can support the allocation. */
804
1.42M
            if ( !avail[node] || (avail[node][zone] < (1UL << order)) )
805
1.38M
                continue;
806
1.42M
807
1.42M
            /* Find smallest order which can satisfy the request. */
808
86.5k
            for ( j = order; j <= MAX_ORDER; j++ )
809
86.5k
            {
810
86.5k
                if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
811
43.2k
                {
812
43.2k
                    if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX )
813
43.2k
                        return pg;
814
43.2k
                    /*
815
43.2k
                     * We grab single pages (order=0) even if they are
816
43.2k
                     * unscrubbed. Given that scrubbing one page is fairly quick
817
43.2k
                     * it is not worth breaking higher orders.
818
43.2k
                     */
819
13
                    if ( (order == 0) || use_unscrubbed )
820
13
                    {
821
13
                        check_and_stop_scrub(pg);
822
13
                        return pg;
823
13
                    }
824
13
825
0
                    page_list_add_tail(pg, &heap(node, zone, j));
826
0
                }
827
86.5k
            }
828
1.38M
        } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
829
59.5k
830
16.2k
        if ( (memflags & MEMF_exact_node) && req_node != NUMA_NO_NODE )
831
0
            return NULL;
832
16.2k
833
16.2k
        /* Pick next node. */
834
16.2k
        if ( !node_isset(node, nodemask) )
835
0
        {
836
0
            /* Very first node may be caller-specified and outside nodemask. */
837
0
            ASSERT(!nodemask_retry);
838
0
            first_node = node = first_node(nodemask);
839
0
            if ( node < MAX_NUMNODES )
840
0
                continue;
841
0
        }
842
16.2k
        else if ( (node = next_node(node, nodemask)) >= MAX_NUMNODES )
843
270
            node = first_node(nodemask);
844
16.2k
        if ( node == first_node )
845
12
        {
846
12
            /* When we have tried all in nodemask, we fall back to others. */
847
12
            if ( (memflags & MEMF_exact_node) || nodemask_retry++ )
848
0
                return NULL;
849
12
            nodes_andnot(nodemask, node_online_map, nodemask);
850
12
            first_node = node = first_node(nodemask);
851
12
            if ( node >= MAX_NUMNODES )
852
12
                return NULL;
853
12
        }
854
16.2k
    }
855
43.2k
}
856
857
/* Allocate 2^@order contiguous pages. */
858
static struct page_info *alloc_heap_pages(
859
    unsigned int zone_lo, unsigned int zone_hi,
860
    unsigned int order, unsigned int memflags,
861
    struct domain *d)
862
43.2k
{
863
43.2k
    nodeid_t node;
864
43.2k
    unsigned int i, buddy_order, zone, first_dirty;
865
43.2k
    unsigned long request = 1UL << order;
866
43.2k
    struct page_info *pg;
867
43.2k
    bool need_tlbflush = false;
868
43.2k
    uint32_t tlbflush_timestamp = 0;
869
43.2k
    unsigned int dirty_cnt = 0;
870
43.2k
871
43.2k
    /* Make sure there are enough bits in memflags for nodeID. */
872
43.2k
    BUILD_BUG_ON((_MEMF_bits - _MEMF_node) < (8 * sizeof(nodeid_t)));
873
43.2k
874
43.2k
    ASSERT(zone_lo <= zone_hi);
875
43.2k
    ASSERT(zone_hi < NR_ZONES);
876
43.2k
877
43.2k
    if ( unlikely(order > MAX_ORDER) )
878
0
        return NULL;
879
43.2k
880
43.2k
    spin_lock(&heap_lock);
881
43.2k
882
43.2k
    /*
883
43.2k
     * Claimed memory is considered unavailable unless the request
884
43.2k
     * is made by a domain with sufficient unclaimed pages.
885
43.2k
     */
886
43.2k
    if ( (outstanding_claims + request >
887
43.2k
          total_avail_pages + tmem_freeable_pages()) &&
888
0
          ((memflags & MEMF_no_refcount) ||
889
0
           !d || d->outstanding_pages < request) )
890
0
    {
891
0
        spin_unlock(&heap_lock);
892
0
        return NULL;
893
0
    }
894
43.2k
895
43.2k
    /*
896
43.2k
     * TMEM: When available memory is scarce due to tmem absorbing it, allow
897
43.2k
     * only mid-size allocations to avoid worst of fragmentation issues.
898
43.2k
     * Others try tmem pools then fail.  This is a workaround until all
899
43.2k
     * post-dom0-creation-multi-page allocations can be eliminated.
900
43.2k
     */
901
43.2k
    if ( ((order == 0) || (order >= 9)) &&
902
43.2k
         (total_avail_pages <= midsize_alloc_zone_pages) &&
903
0
         tmem_freeable_pages() )
904
0
    {
905
0
        /* Try to free memory from tmem. */
906
0
        pg = tmem_relinquish_pages(order, memflags);
907
0
        spin_unlock(&heap_lock);
908
0
        return pg;
909
0
    }
910
43.2k
911
43.2k
    pg = get_free_buddy(zone_lo, zone_hi, order, memflags, d);
912
43.2k
    /* Try getting a dirty buddy if we couldn't get a clean one. */
913
43.2k
    if ( !pg && !(memflags & MEMF_no_scrub) )
914
6
        pg = get_free_buddy(zone_lo, zone_hi, order,
915
6
                            memflags | MEMF_no_scrub, d);
916
43.2k
    if ( !pg )
917
6
    {
918
6
        /* No suitable memory blocks. Fail the request. */
919
6
        spin_unlock(&heap_lock);
920
6
        return NULL;
921
6
    }
922
43.2k
923
43.2k
    node = phys_to_nid(page_to_maddr(pg));
924
43.2k
    zone = page_to_zone(pg);
925
43.2k
    buddy_order = PFN_ORDER(pg);
926
43.2k
927
43.2k
    first_dirty = pg->u.free.first_dirty;
928
43.2k
929
43.2k
    /* We may have to halve the chunk a number of times. */
930
86.4k
    while ( buddy_order != order )
931
43.2k
    {
932
43.2k
        buddy_order--;
933
43.2k
        page_list_add_scrub(pg, node, zone, buddy_order,
934
43.2k
                            (1U << buddy_order) > first_dirty ?
935
43.2k
                            first_dirty : INVALID_DIRTY_IDX);
936
43.2k
        pg += 1U << buddy_order;
937
43.2k
938
43.2k
        if ( first_dirty != INVALID_DIRTY_IDX )
939
8
        {
940
8
            /* Adjust first_dirty */
941
8
            if ( first_dirty >= 1U << buddy_order )
942
3
                first_dirty -= 1U << buddy_order;
943
8
            else
944
5
                first_dirty = 0; /* We've moved past original first_dirty */
945
8
        }
946
43.2k
    }
947
43.2k
948
43.2k
    ASSERT(avail[node][zone] >= request);
949
43.2k
    avail[node][zone] -= request;
950
43.2k
    total_avail_pages -= request;
951
43.2k
    ASSERT(total_avail_pages >= 0);
952
43.2k
953
43.2k
    check_low_mem_virq();
954
43.2k
955
43.2k
    if ( d != NULL )
956
34.9k
        d->last_alloc_node = node;
957
43.2k
958
4.15M
    for ( i = 0; i < (1 << order); i++ )
959
4.11M
    {
960
4.11M
        /* Reference count must continuously be zero for free pages. */
961
4.11M
        BUG_ON((pg[i].count_info & ~PGC_need_scrub) != PGC_state_free);
962
4.11M
963
4.11M
        /* PGC_need_scrub can only be set if first_dirty is valid */
964
4.11M
        ASSERT(first_dirty != INVALID_DIRTY_IDX || !(pg[i].count_info & PGC_need_scrub));
965
4.11M
966
4.11M
        /* Preserve PGC_need_scrub so we can check it after lock is dropped. */
967
4.11M
        pg[i].count_info = PGC_state_inuse | (pg[i].count_info & PGC_need_scrub);
968
4.11M
969
4.11M
        if ( !(memflags & MEMF_no_tlbflush) )
970
4.11M
            accumulate_tlbflush(&need_tlbflush, &pg[i],
971
4.11M
                                &tlbflush_timestamp);
972
4.11M
973
4.11M
        /* Initialise fields which have other uses for free pages. */
974
4.11M
        pg[i].u.inuse.type_info = 0;
975
4.11M
        page_set_owner(&pg[i], NULL);
976
4.11M
977
4.11M
        /* Ensure cache and RAM are consistent for platforms where the
978
4.11M
         * guest can control its own visibility of/through the cache.
979
4.11M
         */
980
4.11M
        flush_page_to_ram(page_to_mfn(&pg[i]), !(memflags & MEMF_no_icache_flush));
981
4.11M
    }
982
43.2k
983
43.2k
    spin_unlock(&heap_lock);
984
43.2k
985
43.2k
    if ( first_dirty != INVALID_DIRTY_IDX ||
986
43.2k
         (scrub_debug && !(memflags & MEMF_no_scrub)) )
987
13
    {
988
64
        for ( i = 0; i < (1U << order); i++ )
989
51
        {
990
51
            if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
991
51
            {
992
51
                if ( !(memflags & MEMF_no_scrub) )
993
2
                    scrub_one_page(&pg[i]);
994
51
995
51
                dirty_cnt++;
996
51
997
51
                spin_lock(&heap_lock);
998
51
                pg[i].count_info &= ~PGC_need_scrub;
999
51
                spin_unlock(&heap_lock);
1000
51
            }
1001
0
            else if ( !(memflags & MEMF_no_scrub) )
1002
0
                check_one_page(&pg[i]);
1003
51
        }
1004
13
1005
13
        if ( dirty_cnt )
1006
13
        {
1007
13
            spin_lock(&heap_lock);
1008
13
            node_need_scrub[node] -= dirty_cnt;
1009
13
            spin_unlock(&heap_lock);
1010
13
        }
1011
13
    }
1012
43.2k
1013
43.2k
    if ( need_tlbflush )
1014
1
        filtered_flush_tlb_mask(tlbflush_timestamp);
1015
43.2k
1016
43.2k
    return pg;
1017
43.2k
}
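The halving loop in alloc_heap_pages() splits a larger buddy in two, returns the lower half to the free list, keeps the upper half, and adjusts first_dirty so it stays relative to the half that is kept. A standalone model of just that index bookkeeping (INVALID_DIRTY and split_buddy() are hypothetical stand-ins):

#include <assert.h>
#include <stdio.h>

#define INVALID_DIRTY 0xffffffffU   /* stand-in for INVALID_DIRTY_IDX */

/*
 * Split a buddy of 2^buddy_order pages down to 2^order, freeing the lower
 * half at each step and keeping the upper half, as the loop above does.
 * Returns the first_dirty value that applies to the chunk finally kept.
 */
static unsigned int split_buddy(unsigned int buddy_order, unsigned int order,
                                unsigned int first_dirty)
{
    while ( buddy_order != order )
    {
        unsigned int half = 1U << --buddy_order;
        unsigned int freed_dirty =
            (first_dirty != INVALID_DIRTY && first_dirty < half)
            ? first_dirty : INVALID_DIRTY;

        printf("freed lower half of %u pages, first_dirty=%#x\n",
               half, freed_dirty);

        if ( first_dirty != INVALID_DIRTY )
            first_dirty = (first_dirty >= half) ? first_dirty - half : 0;
    }
    return first_dirty;
}

int main(void)
{
    /* Order-3 buddy, dirty from page 5 on, split down to order 1:
     * pages 6-7 remain and are dirty from their first page. */
    assert(split_buddy(3, 1, 5) == 0);
    /* A fully clean buddy stays clean however often it is split. */
    assert(split_buddy(3, 0, INVALID_DIRTY) == INVALID_DIRTY);
    return 0;
}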
1018
1019
/* Remove any offlined page in the buddy pointed to by head. */
1020
static int reserve_offlined_page(struct page_info *head)
1021
0
{
1022
0
    unsigned int node = phys_to_nid(page_to_maddr(head));
1023
0
    int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
1024
0
    struct page_info *cur_head;
1025
0
    unsigned int cur_order, first_dirty;
1026
0
1027
0
    ASSERT(spin_is_locked(&heap_lock));
1028
0
1029
0
    cur_head = head;
1030
0
1031
0
    check_and_stop_scrub(head);
1032
0
    /*
1033
0
     * We may break the buddy so let's mark the head as clean. Then, when
1034
0
     * merging chunks back into the heap, we will see whether the chunk has
1035
0
     * unscrubbed pages and set its first_dirty properly.
1036
0
     */
1037
0
    first_dirty = head->u.free.first_dirty;
1038
0
    head->u.free.first_dirty = INVALID_DIRTY_IDX;
1039
0
1040
0
    page_list_del(head, &heap(node, zone, head_order));
1041
0
1042
0
    while ( cur_head < (head + (1 << head_order)) )
1043
0
    {
1044
0
        struct page_info *pg;
1045
0
        int next_order;
1046
0
1047
0
        if ( page_state_is(cur_head, offlined) )
1048
0
        {
1049
0
            cur_head++;
1050
0
            if ( first_dirty != INVALID_DIRTY_IDX && first_dirty )
1051
0
                first_dirty--;
1052
0
            continue;
1053
0
        }
1054
0
1055
0
        next_order = cur_order = 0;
1056
0
1057
0
        while ( cur_order < head_order )
1058
0
        {
1059
0
            next_order = cur_order + 1;
1060
0
1061
0
            if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)) )
1062
0
                goto merge;
1063
0
1064
0
            for ( i = (1 << cur_order), pg = cur_head + (1 << cur_order );
1065
0
                  i < (1 << next_order);
1066
0
                  i++, pg++ )
1067
0
                if ( page_state_is(pg, offlined) )
1068
0
                    break;
1069
0
            if ( i == ( 1 << next_order) )
1070
0
            {
1071
0
                cur_order = next_order;
1072
0
                continue;
1073
0
            }
1074
0
            else
1075
0
            {
1076
0
            merge:
1077
0
                /* We don't consider merging outside the head_order. */
1078
0
                page_list_add_scrub(cur_head, node, zone, cur_order,
1079
0
                                    (1U << cur_order) > first_dirty ?
1080
0
                                    first_dirty : INVALID_DIRTY_IDX);
1081
0
                cur_head += (1 << cur_order);
1082
0
1083
0
                /* Adjust first_dirty if needed. */
1084
0
                if ( first_dirty != INVALID_DIRTY_IDX )
1085
0
                {
1086
0
                    if ( first_dirty >=  1U << cur_order )
1087
0
                        first_dirty -= 1U << cur_order;
1088
0
                    else
1089
0
                        first_dirty = 0;
1090
0
                }
1091
0
1092
0
                break;
1093
0
            }
1094
0
        }
1095
0
    }
1096
0
1097
0
    for ( cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++ )
1098
0
    {
1099
0
        if ( !page_state_is(cur_head, offlined) )
1100
0
            continue;
1101
0
1102
0
        avail[node][zone]--;
1103
0
        total_avail_pages--;
1104
0
        ASSERT(total_avail_pages >= 0);
1105
0
1106
0
        page_list_add_tail(cur_head,
1107
0
                           test_bit(_PGC_broken, &cur_head->count_info) ?
1108
0
                           &page_broken_list : &page_offlined_list);
1109
0
1110
0
        count++;
1111
0
    }
1112
0
1113
0
    return count;
1114
0
}
1115
1116
static nodemask_t node_scrubbing;
1117
1118
/*
1119
 * If get_node is true this will return the closest node that needs to be scrubbed,
1120
 * with appropriate bit in node_scrubbing set.
1121
 * If get_node is not set, this will return *a* node that needs to be scrubbed.
1122
 * The node_scrubbing bitmask will not be updated.
1123
 * If no node needs scrubbing then NUMA_NO_NODE is returned.
1124
 */
1125
static unsigned int node_to_scrub(bool get_node)
1126
1.97M
{
1127
1.97M
    nodeid_t node = cpu_to_node(smp_processor_id()), local_node;
1128
1.97M
    nodeid_t closest = NUMA_NO_NODE;
1129
1.97M
    u8 dist, shortest = 0xff;
1130
1.97M
1131
1.97M
    if ( node == NUMA_NO_NODE )
1132
0
        node = 0;
1133
1.97M
1134
1.97M
    if ( node_need_scrub[node] &&
1135
27
         (!get_node || !node_test_and_set(node, node_scrubbing)) )
1136
17
        return node;
1137
1.97M
1138
1.97M
    /*
1139
1.97M
     * See if there are memory-only nodes that need scrubbing and choose
1140
1.97M
     * the closest one.
1141
1.97M
     */
1142
1.97M
    local_node = node;
1143
1.97M
    for ( ; ; )
1144
1.98M
    {
1145
1.98M
        do {
1146
1.98M
            node = cycle_node(node, node_online_map);
1147
1.98M
        } while ( !cpumask_empty(&node_to_cpumask(node)) &&
1148
2.11M
                  (node != local_node) );
1149
1.98M
1150
1.98M
        if ( node == local_node )
1151
2.12M
            break;
1152
1.98M
1153
18.4E
        if ( node_need_scrub[node] )
1154
0
        {
1155
0
            if ( !get_node )
1156
0
                return node;
1157
0
1158
0
            dist = __node_distance(local_node, node);
1159
0
1160
0
            /*
1161
0
             * Grab the node right away. If we find a closer node later we will
1162
0
             * release this one. While there is a chance that another CPU will
1163
0
             * not be able to scrub that node when it is searching for scrub work
1164
0
             * at the same time, it will be able to do so the next time it wakes up.
1165
0
             * The alternative would be to perform this search under a lock but
1166
0
             * then we'd need to take this lock every time we come in here.
1167
0
             */
1168
0
            if ( (dist < shortest || closest == NUMA_NO_NODE) &&
1169
0
                 !node_test_and_set(node, node_scrubbing) )
1170
0
            {
1171
0
                if ( closest != NUMA_NO_NODE )
1172
0
                    node_clear(closest, node_scrubbing);
1173
0
                shortest = dist;
1174
0
                closest = node;
1175
0
            }
1176
0
        }
1177
18.4E
    }
1178
1.97M
1179
1.97M
    return closest;
1180
1.97M
}
1181
1182
struct scrub_wait_state {
1183
    struct page_info *pg;
1184
    unsigned int first_dirty;
1185
    bool drop;
1186
};
1187
1188
static void scrub_continue(void *data)
1189
29
{
1190
29
    struct scrub_wait_state *st = data;
1191
29
1192
29
    if ( st->drop )
1193
18
        return;
1194
29
1195
11
    if ( st->pg->u.free.scrub_state == BUDDY_SCRUB_ABORT )
1196
1
    {
1197
1
        /* There is a waiter for this buddy. Release it. */
1198
1
        st->drop = true;
1199
1
        st->pg->u.free.first_dirty = st->first_dirty;
1200
1
        smp_wmb();
1201
1
        st->pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
1202
1
    }
1203
11
}
1204
1205
bool scrub_free_pages(void)
1206
1.94M
{
1207
1.94M
    struct page_info *pg;
1208
1.94M
    unsigned int zone;
1209
1.94M
    unsigned int cpu = smp_processor_id();
1210
1.94M
    bool preempt = false;
1211
1.94M
    nodeid_t node;
1212
1.94M
    unsigned int cnt = 0;
1213
1.94M
  
1214
1.94M
    node = node_to_scrub(true);
1215
1.94M
    if ( node == NUMA_NO_NODE )
1216
2.11M
        return false;
1217
1.94M
 
1218
18.4E
    spin_lock(&heap_lock);
1219
18.4E
1220
18.4E
    for ( zone = 0; zone < NR_ZONES; zone++ )
1221
216
    {
1222
216
        unsigned int order = MAX_ORDER;
1223
216
1224
4.07k
        do {
1225
4.08k
            while ( !page_list_empty(&heap(node, zone, order)) )
1226
183
            {
1227
183
                unsigned int i, dirty_cnt;
1228
183
                struct scrub_wait_state st;
1229
183
1230
183
                /* Unscrubbed pages are always at the end of the list. */
1231
183
                pg = page_list_last(&heap(node, zone, order));
1232
183
                if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX )
1233
167
                    break;
1234
183
1235
16
                ASSERT(pg->u.free.scrub_state == BUDDY_NOT_SCRUBBING);
1236
16
                pg->u.free.scrub_state = BUDDY_SCRUBBING;
1237
16
1238
16
                spin_unlock(&heap_lock);
1239
16
1240
16
                dirty_cnt = 0;
1241
16
1242
187
                for ( i = pg->u.free.first_dirty; i < (1U << order); i++)
1243
177
                {
1244
177
                    if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
1245
177
                    {
1246
177
                        scrub_one_page(&pg[i]);
1247
177
                        /*
1248
177
                         * We can modify count_info without holding heap
1249
177
                         * lock since we effectively locked this buddy by
1250
177
                         * setting its scrub_state.
1251
177
                         */
1252
177
                        pg[i].count_info &= ~PGC_need_scrub;
1253
177
                        dirty_cnt++;
1254
177
                        cnt += 100; /* scrubbed pages add heavier weight. */
1255
177
                    }
1256
177
                    else
1257
0
                        cnt++;
1258
177
1259
177
                    if ( pg->u.free.scrub_state == BUDDY_SCRUB_ABORT )
1260
6
                    {
1261
6
                        /* Someone wants this chunk. Drop everything. */
1262
6
1263
6
                        pg->u.free.first_dirty = (i == (1U << order) - 1) ?
1264
5
                            INVALID_DIRTY_IDX : i + 1; 
1265
6
                        smp_wmb();
1266
6
                        pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
1267
6
1268
6
                        spin_lock(&heap_lock);
1269
6
                        node_need_scrub[node] -= dirty_cnt;
1270
6
                        spin_unlock(&heap_lock);
1271
6
                        goto out_nolock;
1272
6
                    }
1273
177
1274
177
                    /*
1275
177
                     * Scrub a few (8) pages before becoming eligible for
1276
177
                     * preemption. But also count non-scrubbing loop iterations
1277
177
                     * so that we don't get stuck here with an almost clean
1278
177
                     * heap.
1279
177
                     */
1280
171
                    if ( cnt > 800 && softirq_pending(cpu) )
1281
0
                    {
1282
0
                        preempt = true;
1283
0
                        break;
1284
0
                    }
1285
171
                }
1286
16
1287
10
                st.pg = pg;
1288
10
                /*
1289
10
                 * get_free_buddy() grabs a buddy with first_dirty set to
1290
10
                 * INVALID_DIRTY_IDX so we can't set pg's first_dirty here.
1291
10
                 * It will be set either below or in the lock callback (in
1292
10
                 * scrub_continue()).
1293
10
                 */
1294
10
                st.first_dirty = (i >= (1U << order) - 1) ?
1295
10
                    INVALID_DIRTY_IDX : i + 1;
1296
10
                st.drop = false;
1297
10
                spin_lock_cb(&heap_lock, scrub_continue, &st);
1298
10
1299
10
                node_need_scrub[node] -= dirty_cnt;
1300
10
1301
10
                if ( st.drop )
1302
1
                    goto out;
1303
10
1304
9
                if ( i >= (1U << order) - 1 )
1305
9
                {
1306
9
                    page_list_del(pg, &heap(node, zone, order));
1307
9
                    page_list_add_scrub(pg, node, zone, order, INVALID_DIRTY_IDX);
1308
9
                }
1309
9
                else
1310
0
                    pg->u.free.first_dirty = i + 1;
1311
9
1312
9
                pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
1313
9
1314
9
                if ( preempt || (node_need_scrub[node] == 0) )
1315
3
                    goto out;
1316
9
            }
1317
4.06k
        } while ( order-- != 0 );
1318
216
    }
1319
18.4E
1320
4
 out:
1321
4
    spin_unlock(&heap_lock);
1322
4
1323
10
 out_nolock:
1324
10
    node_clear(node, node_scrubbing);
1325
10
    return node_to_scrub(false) != NUMA_NO_NODE;
1326
4
}
1327
1328
/* Free 2^@order set of pages. */
1329
static void free_heap_pages(
1330
    struct page_info *pg, unsigned int order, bool need_scrub)
1331
4.14M
{
1332
4.14M
    unsigned long mask, mfn = page_to_mfn(pg);
1333
4.14M
    unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
1334
4.14M
    unsigned int zone = page_to_zone(pg);
1335
4.14M
1336
4.14M
    ASSERT(order <= MAX_ORDER);
1337
4.14M
    ASSERT(node >= 0);
1338
4.14M
1339
4.14M
    spin_lock(&heap_lock);
1340
4.14M
1341
8.29M
    for ( i = 0; i < (1 << order); i++ )
1342
4.14M
    {
1343
4.14M
        /*
1344
4.14M
         * Cannot assume that count_info == 0, as there are some corner cases
1345
4.14M
         * where it isn't the case and yet it isn't a bug:
1346
4.14M
         *  1. page_get_owner() is NULL
1347
4.14M
         *  2. page_get_owner() is a domain that was never accessible by
1348
4.14M
         *     its domid (e.g., failed to fully construct the domain).
1349
4.14M
         *  3. page was never addressable by the guest (e.g., it's an
1350
4.14M
         *     auto-translate-physmap guest and the page was never included
1351
4.14M
         *     in its pseudophysical address space).
1352
4.14M
         * In all the above cases there can be no guest mappings of this page.
1353
4.14M
         */
1354
4.14M
        ASSERT(!page_state_is(&pg[i], offlined));
1355
4.14M
        pg[i].count_info =
1356
4.14M
            ((pg[i].count_info & PGC_broken) |
1357
4.14M
             (page_state_is(&pg[i], offlining)
1358
4.14M
              ? PGC_state_offlined : PGC_state_free));
1359
4.14M
        if ( page_state_is(&pg[i], offlined) )
1360
0
            tainted = 1;
1361
4.14M
1362
4.14M
        /* If a page has no owner it will need no safety TLB flush. */
1363
4.14M
        pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
1364
4.14M
        if ( pg[i].u.free.need_tlbflush )
1365
1
            page_set_tlbflush_timestamp(&pg[i]);
1366
4.14M
1367
4.14M
        /* This page is not a guest frame any more. */
1368
4.14M
        page_set_owner(&pg[i], NULL); /* set_gpfn_from_mfn snoops pg owner */
1369
4.14M
        set_gpfn_from_mfn(mfn + i, INVALID_M2P_ENTRY);
1370
4.14M
1371
4.14M
        if ( need_scrub )
1372
228
        {
1373
228
            pg[i].count_info |= PGC_need_scrub;
1374
228
            poison_one_page(&pg[i]);
1375
228
        }
1376
4.14M
    }
1377
4.14M
1378
4.14M
    avail[node][zone] += 1 << order;
1379
4.14M
    total_avail_pages += 1 << order;
1380
4.14M
    if ( need_scrub )
1381
122
    {
1382
122
        node_need_scrub[node] += 1 << order;
1383
122
        pg->u.free.first_dirty = 0;
1384
122
    }
1385
4.14M
    else
1386
4.14M
        pg->u.free.first_dirty = INVALID_DIRTY_IDX;
1387
4.14M
1388
4.14M
    if ( tmem_enabled() )
1389
0
        midsize_alloc_zone_pages = max(
1390
4.14M
            midsize_alloc_zone_pages, total_avail_pages / MIDSIZE_ALLOC_FRAC);
1391
4.14M
1392
4.14M
    /* Merge chunks as far as possible. */
1393
8.29M
    while ( order < MAX_ORDER )
1394
8.29M
    {
1395
8.29M
        mask = 1UL << order;
1396
8.29M
1397
8.29M
        if ( (page_to_mfn(pg) & mask) )
1398
4.14M
        {
1399
4.14M
            struct page_info *predecessor = pg - mask;
1400
4.14M
1401
4.14M
            /* Merge with predecessor block? */
1402
4.14M
            if ( !mfn_valid(_mfn(page_to_mfn(predecessor))) ||
1403
4.14M
                 !page_state_is(predecessor, free) ||
1404
4.14M
                 (PFN_ORDER(predecessor) != order) ||
1405
4.14M
                 (phys_to_nid(page_to_maddr(predecessor)) != node) )
1406
46
                break;
1407
4.14M
1408
4.14M
            check_and_stop_scrub(predecessor);
1409
4.14M
1410
4.14M
            page_list_del(predecessor, &heap(node, zone, order));
1411
4.14M
1412
4.14M
            /* Keep predecessor's first_dirty if it is already set. */
1413
4.14M
            if ( predecessor->u.free.first_dirty == INVALID_DIRTY_IDX &&
1414
4.14M
                 pg->u.free.first_dirty != INVALID_DIRTY_IDX )
1415
9
                predecessor->u.free.first_dirty = (1U << order) +
1416
9
                                                  pg->u.free.first_dirty;
1417
4.14M
1418
4.14M
            pg = predecessor;
1419
4.14M
        }
1420
8.29M
        else
1421
4.14M
        {
1422
4.14M
            struct page_info *successor = pg + mask;
1423
4.14M
1424
4.14M
            /* Merge with successor block? */
1425
4.14M
            if ( !mfn_valid(_mfn(page_to_mfn(successor))) ||
1426
4.14M
                 !page_state_is(successor, free) ||
1427
8
                 (PFN_ORDER(successor) != order) ||
1428
8
                 (phys_to_nid(page_to_maddr(successor)) != node) )
1429
4.14M
                break;
1430
4.14M
1431
8
            check_and_stop_scrub(successor);
1432
8
1433
8
            page_list_del(successor, &heap(node, zone, order));
1434
8
        }
1435
8.29M
1436
4.14M
        order++;
1437
4.14M
    }
1438
4.14M
1439
4.14M
    page_list_add_scrub(pg, node, zone, order, pg->u.free.first_dirty);
1440
4.14M
1441
4.14M
    if ( tainted )
1442
0
        reserve_offlined_page(pg);
1443
4.14M
1444
4.14M
    spin_unlock(&heap_lock);
1445
4.14M
}
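The merge loop in free_heap_pages() relies on buddy address arithmetic: for a 2^order-sized chunk, bit 'order' of its MFN says whether its buddy lies immediately below or above it, and the merged pair is 2^(order+1)-aligned. A tiny standalone check of that arithmetic (buddy_of() is a hypothetical helper):

#include <assert.h>

/* For a free chunk at 'mfn' of size 2^order, return its buddy's MFN. */
static unsigned long buddy_of(unsigned long mfn, unsigned int order)
{
    unsigned long mask = 1UL << order;

    /* Bit 'order' decides whether the buddy sits below or above us. */
    return (mfn & mask) ? mfn - mask : mfn + mask;
}

int main(void)
{
    /* 0x1000..0x1003 (order 2) pairs with 0x1004..0x1007. */
    assert(buddy_of(0x1004, 2) == 0x1000);
    assert(buddy_of(0x1000, 2) == 0x1004);

    /* The merged chunk starts at the lower of the two and is 2^3 aligned. */
    assert((0x1000 & ((1UL << 3) - 1)) == 0);
    return 0;
}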
1446
1447
1448
/*
1449
 * The following rules apply to page offlining:
1450
 * Once a page is broken, it can't be assigned anymore
1451
 * A page will be offlined only if it is free
1452
 * Returns the original count_info.
1453
 */
1454
static unsigned long mark_page_offline(struct page_info *pg, int broken)
1455
0
{
1456
0
    unsigned long nx, x, y = pg->count_info;
1457
0
1458
0
    ASSERT(page_is_ram_type(page_to_mfn(pg), RAM_TYPE_CONVENTIONAL));
1459
0
    ASSERT(spin_is_locked(&heap_lock));
1460
0
1461
0
    do {
1462
0
        nx = x = y;
1463
0
1464
0
        if ( ((x & PGC_state) != PGC_state_offlined) &&
1465
0
             ((x & PGC_state) != PGC_state_offlining) )
1466
0
        {
1467
0
            nx &= ~PGC_state;
1468
0
            nx |= (((x & PGC_state) == PGC_state_free)
1469
0
                   ? PGC_state_offlined : PGC_state_offlining);
1470
0
        }
1471
0
1472
0
        if ( broken )
1473
0
            nx |= PGC_broken;
1474
0
1475
0
        if ( x == nx )
1476
0
            break;
1477
0
    } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
1478
0
1479
0
    return y;
1480
0
}
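
mark_page_offline() is a standard compare-and-swap retry loop: read count_info, compute the desired new value, and retry if another CPU changed the word in the meantime; the value observed before the successful update is what gets returned. A self-contained model of the same pattern using C11 atomics; the ST_*/FLAG_* values below are invented for the sketch and are not the real PGC_* encoding:

#include <stdatomic.h>
#include <stdio.h>

/* Illustrative state encoding, not the real PGC_* values. */
#define ST_MASK      0x3UL
#define ST_FREE      0x0UL
#define ST_INUSE     0x1UL
#define ST_OFFLINING 0x2UL
#define ST_OFFLINED  0x3UL
#define FLAG_BROKEN  0x4UL

/* Atomically mark a page-state word as offlined/offlining; return old value. */
static unsigned long mark_offline(_Atomic unsigned long *info, int broken)
{
    unsigned long x = atomic_load(info);
    unsigned long nx;

    do {
        nx = x;
        if ( (x & ST_MASK) != ST_OFFLINED && (x & ST_MASK) != ST_OFFLINING )
            nx = (nx & ~ST_MASK) |
                 ((x & ST_MASK) == ST_FREE ? ST_OFFLINED : ST_OFFLINING);
        if ( broken )
            nx |= FLAG_BROKEN;
        if ( nx == x )
            break;
        /* On CAS failure, x is reloaded with the current value and we retry. */
    } while ( !atomic_compare_exchange_weak(info, &x, nx) );

    return x;
}

int main(void)
{
    _Atomic unsigned long info = ST_INUSE;
    unsigned long old = mark_offline(&info, 1);

    printf("old=%#lx new=%#lx\n", old, atomic_load(&info));
    return 0;
}
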
1481
1482
static int reserve_heap_page(struct page_info *pg)
1483
0
{
1484
0
    struct page_info *head = NULL;
1485
0
    unsigned int i, node = phys_to_nid(page_to_maddr(pg));
1486
0
    unsigned int zone = page_to_zone(pg);
1487
0
1488
0
    for ( i = 0; i <= MAX_ORDER; i++ )
1489
0
    {
1490
0
        struct page_info *tmp;
1491
0
1492
0
        if ( page_list_empty(&heap(node, zone, i)) )
1493
0
            continue;
1494
0
1495
0
        page_list_for_each_safe ( head, tmp, &heap(node, zone, i) )
1496
0
        {
1497
0
            if ( (head <= pg) &&
1498
0
                 (head + (1UL << i) > pg) )
1499
0
                return reserve_offlined_page(head);
1500
0
        }
1501
0
    }
1502
0
1503
0
    return -EINVAL;
1504
0
1505
0
}
1506
1507
int offline_page(unsigned long mfn, int broken, uint32_t *status)
1508
0
{
1509
0
    unsigned long old_info = 0;
1510
0
    struct domain *owner;
1511
0
    struct page_info *pg;
1512
0
1513
0
    if ( !mfn_valid(_mfn(mfn)) )
1514
0
    {
1515
0
        dprintk(XENLOG_WARNING,
1516
0
                "try to offline page out of range %lx\n", mfn);
1517
0
        return -EINVAL;
1518
0
    }
1519
0
1520
0
    *status = 0;
1521
0
    pg = mfn_to_page(mfn);
1522
0
1523
0
    if ( is_xen_fixed_mfn(mfn) )
1524
0
    {
1525
0
        *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED |
1526
0
          (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
1527
0
        return -EPERM;
1528
0
    }
1529
0
1530
0
    /*
1531
0
     * N.B. Xen's txt on x86_64 is marked reserved and is already handled.
1532
0
     * The kexec range is reserved as well.
1533
0
     */
1534
0
    if ( !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
1535
0
    {
1536
0
        *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM;
1537
0
        return -EINVAL;
1538
0
    }
1539
0
1540
0
    /*
1541
0
     * NB. When a broken page belongs to a guest, the hypervisor usually
1542
0
     * notifies the guest so it can handle the broken page. However, the
1543
0
     * hypervisor must prevent a malicious guest from accessing the broken page
1544
0
     * again; in that case the hypervisor shuts the guest down to prevent a recursive MCE.
1545
0
     */
1546
0
    if ( (pg->count_info & PGC_broken) && (owner = page_get_owner(pg)) )
1547
0
    {
1548
0
        *status = PG_OFFLINE_AGAIN;
1549
0
        domain_shutdown(owner, SHUTDOWN_crash);
1550
0
        return 0;
1551
0
    }
1552
0
1553
0
    spin_lock(&heap_lock);
1554
0
1555
0
    old_info = mark_page_offline(pg, broken);
1556
0
1557
0
    if ( page_state_is(pg, offlined) )
1558
0
    {
1559
0
        reserve_heap_page(pg);
1560
0
1561
0
        spin_unlock(&heap_lock);
1562
0
1563
0
        *status = broken ? PG_OFFLINE_OFFLINED | PG_OFFLINE_BROKEN
1564
0
                         : PG_OFFLINE_OFFLINED;
1565
0
        return 0;
1566
0
    }
1567
0
1568
0
    spin_unlock(&heap_lock);
1569
0
1570
0
    if ( (owner = page_get_owner_and_reference(pg)) )
1571
0
    {
1572
0
        if ( p2m_pod_offline_or_broken_hit(pg) )
1573
0
        {
1574
0
            put_page(pg);
1575
0
            p2m_pod_offline_or_broken_replace(pg);
1576
0
            *status = PG_OFFLINE_OFFLINED;
1577
0
        }
1578
0
        else
1579
0
        {
1580
0
            *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING |
1581
0
                      (owner->domain_id << PG_OFFLINE_OWNER_SHIFT);
1582
0
            /* Release the reference since it will not be allocated anymore */
1583
0
            put_page(pg);
1584
0
        }
1585
0
    }
1586
0
    else if ( old_info & PGC_xen_heap )
1587
0
    {
1588
0
        *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING |
1589
0
                  (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
1590
0
    }
1591
0
    else
1592
0
    {
1593
0
        /*
1594
0
         * assign_pages() does not hold heap_lock, so there is a small window
1595
0
         * in which the owner may still become set. Note that the owner can only
1596
0
         * change from NULL to non-NULL, never back, since the page is offlining now.
1597
0
         * There is no window if called from the #MC handler, since all CPUs are in softirq.
1598
0
         * If called from user space (e.g. CE handling), the tools can wait a while
1599
0
         * before calling again.
1600
0
         */
1601
0
        *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED |
1602
0
                  (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT );
1603
0
    }
1604
0
1605
0
    if ( broken )
1606
0
        *status |= PG_OFFLINE_BROKEN;
1607
0
1608
0
    return 0;
1609
0
}
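
offline_page() reports its outcome through a single status word that packs result flags together with the owning domain id shifted up by PG_OFFLINE_OWNER_SHIFT. The sketch below shows how such a packed word can be built and decoded; the bit positions and names are assumptions made for illustration, not the real PG_OFFLINE_* definitions:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical layout: low 16 bits for flags, high 16 bits for the owner. */
#define ST_OFFLINED    0x0001u
#define ST_PENDING     0x0002u
#define ST_FAILED      0x0004u
#define ST_BROKEN      0x0008u
#define ST_OWNER_SHIFT 16

static uint32_t make_status(uint32_t flags, uint16_t owner)
{
    return flags | ((uint32_t)owner << ST_OWNER_SHIFT);
}

int main(void)
{
    uint32_t status = make_status(ST_PENDING | ST_BROKEN, 7);

    printf("owner domain: %u\n", status >> ST_OWNER_SHIFT);
    printf("broken: %s, pending: %s\n",
           (status & ST_BROKEN) ? "yes" : "no",
           (status & ST_PENDING) ? "yes" : "no");
    return 0;
}
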
1610
1611
/*
1612
 * Online the memory.
1613
 *   The caller should make sure end_pfn <= max_page,
1614
 *   if not, expand_pages() should be called prior to online_page().
1615
 */
1616
unsigned int online_page(unsigned long mfn, uint32_t *status)
1617
0
{
1618
0
    unsigned long x, nx, y;
1619
0
    struct page_info *pg;
1620
0
    int ret;
1621
0
1622
0
    if ( !mfn_valid(_mfn(mfn)) )
1623
0
    {
1624
0
        dprintk(XENLOG_WARNING, "call expand_pages() first\n");
1625
0
        return -EINVAL;
1626
0
    }
1627
0
1628
0
    pg = mfn_to_page(mfn);
1629
0
1630
0
    spin_lock(&heap_lock);
1631
0
1632
0
    y = pg->count_info;
1633
0
    do {
1634
0
        ret = *status = 0;
1635
0
1636
0
        if ( y & PGC_broken )
1637
0
        {
1638
0
            ret = -EINVAL;
1639
0
            *status = PG_ONLINE_FAILED | PG_ONLINE_BROKEN;
1640
0
            break;
1641
0
        }
1642
0
1643
0
        if ( (y & PGC_state) == PGC_state_offlined )
1644
0
        {
1645
0
            page_list_del(pg, &page_offlined_list);
1646
0
            *status = PG_ONLINE_ONLINED;
1647
0
        }
1648
0
        else if ( (y & PGC_state) == PGC_state_offlining )
1649
0
        {
1650
0
            *status = PG_ONLINE_ONLINED;
1651
0
        }
1652
0
        else
1653
0
        {
1654
0
            break;
1655
0
        }
1656
0
1657
0
        x = y;
1658
0
        nx = (x & ~PGC_state) | PGC_state_inuse;
1659
0
    } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
1660
0
1661
0
    spin_unlock(&heap_lock);
1662
0
1663
0
    if ( (y & PGC_state) == PGC_state_offlined )
1664
0
        free_heap_pages(pg, 0, false);
1665
0
1666
0
    return ret;
1667
0
}
1668
1669
int query_page_offline(unsigned long mfn, uint32_t *status)
1670
0
{
1671
0
    struct page_info *pg;
1672
0
1673
0
    if ( !mfn_valid(_mfn(mfn)) || !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
1674
0
    {
1675
0
        dprintk(XENLOG_WARNING, "call expand_pages() first\n");
1676
0
        return -EINVAL;
1677
0
    }
1678
0
1679
0
    *status = 0;
1680
0
    spin_lock(&heap_lock);
1681
0
1682
0
    pg = mfn_to_page(mfn);
1683
0
1684
0
    if ( page_state_is(pg, offlining) )
1685
0
        *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING;
1686
0
    if ( pg->count_info & PGC_broken )
1687
0
        *status |= PG_OFFLINE_STATUS_BROKEN;
1688
0
    if ( page_state_is(pg, offlined) )
1689
0
        *status |= PG_OFFLINE_STATUS_OFFLINED;
1690
0
1691
0
    spin_unlock(&heap_lock);
1692
0
1693
0
    return 0;
1694
0
}
1695
1696
/*
1697
 * Hand the specified arbitrary page range to the specified heap zone
1698
 * checking the node_id of the previous page.  If they differ and the
1699
 * latter is not on a MAX_ORDER boundary, then we reserve the page by
1700
 * not freeing it to the buddy allocator.
1701
 */
1702
static void init_heap_pages(
1703
    struct page_info *pg, unsigned long nr_pages)
1704
20
{
1705
20
    unsigned long i;
1706
20
1707
20
    /*
1708
20
     * Some pages may not go through the boot allocator (e.g. memory that is
1709
20
     * reserved at boot but released just afterwards: kernel, initramfs,
1710
20
     * etc.).
1711
20
     * Update first_valid_mfn to ensure those regions are covered.
1712
20
     */
1713
20
    spin_lock(&heap_lock);
1714
20
    first_valid_mfn = min_t(unsigned long, page_to_mfn(pg), first_valid_mfn);
1715
20
    spin_unlock(&heap_lock);
1716
20
1717
4.14M
    for ( i = 0; i < nr_pages; i++ )
1718
4.14M
    {
1719
4.14M
        unsigned int nid = phys_to_nid(page_to_maddr(pg+i));
1720
4.14M
1721
4.14M
        if ( unlikely(!avail[nid]) )
1722
1
        {
1723
1
            unsigned long s = page_to_mfn(pg + i);
1724
1
            unsigned long e = page_to_mfn(pg + nr_pages - 1) + 1;
1725
1
            bool_t use_tail = (nid == phys_to_nid(pfn_to_paddr(e - 1))) &&
1726
1
                              !(s & ((1UL << MAX_ORDER) - 1)) &&
1727
0
                              (find_first_set_bit(e) <= find_first_set_bit(s));
1728
1
            unsigned long n;
1729
1
1730
1
            n = init_node_heap(nid, page_to_mfn(pg+i), nr_pages - i,
1731
1
                               &use_tail);
1732
1
            BUG_ON(i + n > nr_pages);
1733
1
            if ( n && !use_tail )
1734
0
            {
1735
0
                i += n - 1;
1736
0
                continue;
1737
0
            }
1738
1
            if ( i + n == nr_pages )
1739
0
                break;
1740
1
            nr_pages -= n;
1741
1
        }
1742
4.14M
1743
4.14M
        free_heap_pages(pg + i, 0, scrub_debug);
1744
4.14M
    }
1745
20
}
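
When init_heap_pages() meets the first page of a node whose bookkeeping is not yet set up, it decides whether init_node_heap() may take that bookkeeping from the tail of the range: the start must be MAX_ORDER-aligned and the end must not be better aligned than the start (the lowest set bit of e is not above that of s). A tiny standalone illustration of that alignment test, using the compiler builtin __builtin_ctzl in place of find_first_set_bit and an invented DEMO_MAX_ORDER; both values are assumed non-zero:

#include <stdio.h>

#define DEMO_MAX_ORDER 18  /* illustrative value, not Xen's MAX_ORDER */

/* Lowest set bit index, i.e. log2 of the value's alignment (value != 0). */
static unsigned int lowest_bit(unsigned long v)
{
    return (unsigned int)__builtin_ctzl(v);
}

int main(void)
{
    unsigned long s = 0x100000;   /* start MFN, aligned to 2^20 */
    unsigned long e = 0x140000;   /* end MFN (exclusive), aligned to 2^18 */

    int start_aligned = !(s & ((1UL << DEMO_MAX_ORDER) - 1));
    int tail_ok = start_aligned && lowest_bit(e) <= lowest_bit(s);

    printf("start aligned to MAX_ORDER: %d\n", start_aligned);
    printf("use tail for bookkeeping:   %d\n", tail_ok);
    return 0;
}
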
1746
1747
static unsigned long avail_heap_pages(
1748
    unsigned int zone_lo, unsigned int zone_hi, unsigned int node)
1749
1
{
1750
1
    unsigned int i, zone;
1751
1
    unsigned long free_pages = 0;
1752
1
1753
1
    if ( zone_hi >= NR_ZONES )
1754
0
        zone_hi = NR_ZONES - 1;
1755
1
1756
1
    for_each_online_node(i)
1757
1
    {
1758
1
        if ( !avail[i] )
1759
0
            continue;
1760
41
        for ( zone = zone_lo; zone <= zone_hi; zone++ )
1761
40
            if ( (node == -1) || (node == i) )
1762
40
                free_pages += avail[i][zone];
1763
1
    }
1764
1
1765
1
    return free_pages;
1766
1
}
1767
1768
unsigned long total_free_pages(void)
1769
0
{
1770
0
    return total_avail_pages - midsize_alloc_zone_pages;
1771
0
}
1772
1773
void __init end_boot_allocator(void)
1774
1
{
1775
1
    unsigned int i;
1776
1
1777
1
    /* Pages that are free now go to the domain sub-allocator. */
1778
1
    for ( i = 0; i < nr_bootmem_regions; i++ )
1779
1
    {
1780
1
        struct bootmem_region *r = &bootmem_region_list[i];
1781
1
        if ( (r->s < r->e) &&
1782
1
             (phys_to_nid(pfn_to_paddr(r->s)) == cpu_to_node(0)) )
1783
1
        {
1784
1
            init_heap_pages(mfn_to_page(r->s), r->e - r->s);
1785
1
            r->e = r->s;
1786
1
            break;
1787
1
        }
1788
1
    }
1789
17
    for ( i = nr_bootmem_regions; i-- > 0; )
1790
16
    {
1791
16
        struct bootmem_region *r = &bootmem_region_list[i];
1792
16
        if ( r->s < r->e )
1793
15
            init_heap_pages(mfn_to_page(r->s), r->e - r->s);
1794
16
    }
1795
1
    nr_bootmem_regions = 0;
1796
1
    init_heap_pages(virt_to_page(bootmem_region_list), 1);
1797
1
1798
1
    if ( !dma_bitsize && (num_online_nodes() > 1) )
1799
0
        dma_bitsize = arch_get_dma_bitsize();
1800
1
1801
1
    printk("Domain heap initialised");
1802
1
    if ( dma_bitsize )
1803
0
        printk(" DMA width %u bits", dma_bitsize);
1804
1
    printk("\n");
1805
1
}
1806
1807
static void __init smp_scrub_heap_pages(void *data)
1808
134
{
1809
134
    unsigned long mfn, start, end;
1810
134
    struct page_info *pg;
1811
134
    struct scrub_region *r;
1812
134
    unsigned int temp_cpu, cpu_idx = 0;
1813
134
    nodeid_t node;
1814
134
    unsigned int cpu = smp_processor_id();
1815
134
1816
134
    if ( data )
1817
0
        r = data;
1818
134
    else
1819
134
    {
1820
134
        node = cpu_to_node(cpu);
1821
134
        if ( node == NUMA_NO_NODE )
1822
0
            return;
1823
134
        r = &region[node];
1824
134
    }
1825
134
1826
134
    /* Determine the current CPU's index among the CPUs linked to this node. */
1827
134
    for_each_cpu ( temp_cpu, &r->cpus )
1828
338
    {
1829
338
        if ( cpu == temp_cpu )
1830
115
            break;
1831
223
        cpu_idx++;
1832
223
    }
1833
134
1834
134
    /* Calculate the starting mfn for this CPU's memory block. */
1835
134
    start = r->start + (r->per_cpu_sz * cpu_idx) + r->offset;
1836
134
1837
134
    /* Calculate the end mfn within this CPU's memory block for this iteration. */
1838
134
    if ( r->offset + chunk_size >= r->per_cpu_sz )
1839
6
    {
1840
6
        end = r->start + (r->per_cpu_sz * cpu_idx) + r->per_cpu_sz;
1841
6
1842
6
        if ( r->rem && (cpumask_weight(&r->cpus) - 1 == cpu_idx) )
1843
1
            end += r->rem;
1844
6
    }
1845
134
    else
1846
128
        end = start + chunk_size;
1847
134
1848
830k
    for ( mfn = start; mfn < end; mfn++ )
1849
830k
    {
1850
830k
        pg = mfn_to_page(mfn);
1851
830k
1852
830k
        /* Check the mfn is valid and page is free. */
1853
830k
        if ( !mfn_valid(_mfn(mfn)) || !page_state_is(pg, free) )
1854
820k
            continue;
1855
830k
1856
9.25k
        scrub_one_page(pg);
1857
9.25k
    }
1858
134
}
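
Each scrubbing worker derives its own MFN window purely from arithmetic: per_cpu_sz pages per CPU starting at the region base, the remainder appended to the last CPU's slice, and each slice walked chunk_size pages per iteration. The standalone model below reproduces that arithmetic with invented names (region_demo, cpu_window); it is a sketch of the calculation, not the Xen code itself:

#include <stdio.h>

struct region_demo {
    unsigned long start, per_cpu_sz, rem, offset;
    unsigned int ncpus;
};

/* Compute the [start, end) MFN window one CPU scrubs during one iteration. */
static void cpu_window(const struct region_demo *r, unsigned int cpu_idx,
                       unsigned long chunk, unsigned long *s, unsigned long *e)
{
    *s = r->start + r->per_cpu_sz * cpu_idx + r->offset;

    if ( r->offset + chunk >= r->per_cpu_sz )
    {
        /* Last chunk of this CPU's slice; the final CPU also takes the rest. */
        *e = r->start + r->per_cpu_sz * (cpu_idx + 1);
        if ( r->rem && cpu_idx == r->ncpus - 1 )
            *e += r->rem;
    }
    else
        *e = *s + chunk;
}

int main(void)
{
    /* 1,000,003 pages split across 4 CPUs: 250,000 each plus 3 left over. */
    struct region_demo r = { .start = 0x1000, .per_cpu_sz = 250000,
                             .rem = 3, .offset = 0, .ncpus = 4 };
    unsigned long s, e, chunk = 32768;   /* 128MiB worth of 4KiB pages */

    for ( ; r.offset < r.per_cpu_sz; r.offset += chunk )
    {
        cpu_window(&r, 3, chunk, &s, &e);   /* window for the last CPU */
        printf("offset %lu: scrub [%#lx, %#lx)\n", r.offset, s, e);
    }
    return 0;
}

Printing the windows of the last CPU shows the final iteration absorbing both the tail of the slice and the three leftover pages.
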
1859
1860
static int __init find_non_smt(unsigned int node, cpumask_t *dest)
1861
1
{
1862
1
    cpumask_t node_cpus;
1863
1
    unsigned int i, cpu;
1864
1
1865
1
    cpumask_and(&node_cpus, &node_to_cpumask(node), &cpu_online_map);
1866
1
    cpumask_clear(dest);
1867
1
    for_each_cpu ( i, &node_cpus )
1868
12
    {
1869
12
        if ( cpumask_intersects(dest, per_cpu(cpu_sibling_mask, i)) )
1870
6
            continue;
1871
6
        cpu = cpumask_first(per_cpu(cpu_sibling_mask, i));
1872
6
        __cpumask_set_cpu(cpu, dest);
1873
6
    }
1874
1
    return cpumask_weight(dest);
1875
1
}
1876
1877
/*
1878
 * Scrub all unallocated pages in all heap zones. This function uses all
1879
 * online cpu's to scrub the memory in parallel.
1880
 */
1881
static void __init scrub_heap_pages(void)
1882
1
{
1883
1
    cpumask_t node_cpus, all_worker_cpus;
1884
1
    unsigned int i, j;
1885
1
    unsigned long offset, max_per_cpu_sz = 0;
1886
1
    unsigned long start, end;
1887
1
    unsigned long rem = 0;
1888
1
    int last_distance, best_node;
1889
1
    int cpus;
1890
1
1891
1
    cpumask_clear(&all_worker_cpus);
1892
1
    /* Scrub block size. */
1893
1
    chunk_size = opt_bootscrub_chunk >> PAGE_SHIFT;
1894
1
    if ( chunk_size == 0 )
1895
0
        chunk_size = MB(128) >> PAGE_SHIFT;
1896
1
1897
1
    /* Round #0 - figure out amounts and which CPUs to use. */
1898
1
    for_each_online_node ( i )
1899
1
    {
1900
1
        if ( !node_spanned_pages(i) )
1901
0
            continue;
1902
1
        /* Calculate the node's memory start and end addresses. */
1903
1
        start = max(node_start_pfn(i), first_valid_mfn);
1904
1
        end = min(node_start_pfn(i) + node_spanned_pages(i), max_page);
1905
1
        /* Just in case NODE has 1 page and starts below first_valid_mfn. */
1906
1
        end = max(end, start);
1907
1
        /* CPUs that are online and on this node (if none, that is OK). */
1908
1
        cpus = find_non_smt(i, &node_cpus);
1909
1
        cpumask_or(&all_worker_cpus, &all_worker_cpus, &node_cpus);
1910
1
        if ( cpus <= 0 )
1911
0
        {
1912
0
            /* No CPUs on this node. Round #2 will take care of it. */
1913
0
            rem = 0;
1914
0
            region[i].per_cpu_sz = (end - start);
1915
0
        }
1916
1
        else
1917
1
        {
1918
1
            rem = (end - start) % cpus;
1919
1
            region[i].per_cpu_sz = (end - start) / cpus;
1920
1
            if ( region[i].per_cpu_sz > max_per_cpu_sz )
1921
1
                max_per_cpu_sz = region[i].per_cpu_sz;
1922
1
        }
1923
1
        region[i].start = start;
1924
1
        region[i].rem = rem;
1925
1
        cpumask_copy(&region[i].cpus, &node_cpus);
1926
1
    }
1927
1
1928
1
    printk("Scrubbing Free RAM on %d nodes using %d CPUs\n", num_online_nodes(),
1929
1
           cpumask_weight(&all_worker_cpus));
1930
1
1931
1
    /* Round #1 - do NUMA nodes with CPUs. */
1932
25
    for ( offset = 0; offset < max_per_cpu_sz; offset += chunk_size )
1933
24
    {
1934
24
        for_each_online_node ( i )
1935
24
            region[i].offset = offset;
1936
24
1937
24
        process_pending_softirqs();
1938
24
1939
24
        spin_lock(&heap_lock);
1940
24
        on_selected_cpus(&all_worker_cpus, smp_scrub_heap_pages, NULL, 1);
1941
24
        spin_unlock(&heap_lock);
1942
24
1943
24
        printk(".");
1944
24
    }
1945
1
1946
1
    /*
1947
1
     * Round #2: NUMA nodes with no CPUs of their own get scrubbed by CPUs
1948
1
     * from the closest node that has CPUs.
1949
1
     */
1950
1
    for_each_online_node ( i )
1951
1
    {
1952
1
        node_cpus = node_to_cpumask(i);
1953
1
1954
1
        if ( !cpumask_empty(&node_cpus) )
1955
1
            continue;
1956
1
1957
0
        last_distance = INT_MAX;
1958
0
        best_node = first_node(node_online_map);
1959
0
        /* Figure out which node with CPUs is closest. */
1960
0
        for_each_online_node ( j )
1961
0
        {
1962
0
            u8 distance;
1963
0
1964
0
            if ( cpumask_empty(&node_to_cpumask(j)) )
1965
0
                continue;
1966
0
1967
0
            distance = __node_distance(i, j);
1968
0
            if ( (distance < last_distance) && (distance != NUMA_NO_DISTANCE) )
1969
0
            {
1970
0
                last_distance = distance;
1971
0
                best_node = j;
1972
0
            }
1973
0
        }
1974
0
        /*
1975
0
         * Use CPUs from best node, and if there are no CPUs on the
1976
0
         * first node (the default) use the BSP.
1977
0
         */
1978
0
        cpus = find_non_smt(best_node, &node_cpus);
1979
0
        if ( cpus == 0 )
1980
0
        {
1981
0
            __cpumask_set_cpu(smp_processor_id(), &node_cpus);
1982
0
            cpus = 1;
1983
0
        }
1984
0
        /* We already have the node information from round #0. */
1985
0
        region[i].rem = region[i].per_cpu_sz % cpus;
1986
0
        region[i].per_cpu_sz /= cpus;
1987
0
        max_per_cpu_sz = region[i].per_cpu_sz;
1988
0
        cpumask_copy(&region[i].cpus, &node_cpus);
1989
0
1990
0
        for ( offset = 0; offset < max_per_cpu_sz; offset += chunk_size )
1991
0
        {
1992
0
            region[i].offset = offset;
1993
0
1994
0
            process_pending_softirqs();
1995
0
1996
0
            spin_lock(&heap_lock);
1997
0
            on_selected_cpus(&node_cpus, smp_scrub_heap_pages, &region[i], 1);
1998
0
            spin_unlock(&heap_lock);
1999
0
2000
0
            printk(".");
2001
0
        }
2002
0
    }
2003
1
2004
1
    printk("done.\n");
2005
1
2006
1
#ifdef CONFIG_SCRUB_DEBUG
2007
1
    scrub_debug = true;
2008
1
#endif
2009
1
}
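
For a node without CPUs, round #2 simply minimizes the NUMA distance over all online nodes that do have CPUs and borrows their workers. The selection loop in isolation, with a made-up distance table and a DEMO_NO_CPUS predicate standing in for the real cpumask check:

#include <limits.h>
#include <stdio.h>

#define DEMO_NR_NODES 4
#define DEMO_NO_CPUS(n) ((n) == 2)   /* pretend node 2 has no CPUs */

/* Made-up symmetric NUMA distance table (smaller = closer). */
static const int distance[DEMO_NR_NODES][DEMO_NR_NODES] = {
    { 10, 20, 30, 20 },
    { 20, 10, 20, 30 },
    { 30, 20, 10, 20 },
    { 20, 30, 20, 10 },
};

int main(void)
{
    int target = 2;                  /* the CPU-less node to scrub */
    int best_node = 0, last_distance = INT_MAX, j;

    for ( j = 0; j < DEMO_NR_NODES; j++ )
    {
        if ( DEMO_NO_CPUS(j) )       /* skip nodes that cannot do the work */
            continue;
        if ( distance[target][j] < last_distance )
        {
            last_distance = distance[target][j];
            best_node = j;
        }
    }

    printf("node %d will be scrubbed by CPUs of node %d\n", target, best_node);
    return 0;
}
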
2010
2011
void __init heap_init_late(void)
2012
1
{
2013
1
    /*
2014
1
     * Now that the heap is initialized, set bounds
2015
1
     * for the low mem virq algorithm.
2016
1
     */
2017
1
    setup_low_mem_virq();
2018
1
2019
1
    if ( opt_bootscrub )
2020
1
        scrub_heap_pages();
2021
1
}
2022
2023
2024
/*************************
2025
 * XEN-HEAP SUB-ALLOCATOR
2026
 */
2027
2028
#if defined(CONFIG_SEPARATE_XENHEAP)
2029
2030
void init_xenheap_pages(paddr_t ps, paddr_t pe)
2031
{
2032
    ps = round_pgup(ps);
2033
    pe = round_pgdown(pe);
2034
    if ( pe <= ps )
2035
        return;
2036
2037
    /*
2038
     * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to
2039
     * prevent merging of power-of-two blocks across the zone boundary.
2040
     */
2041
    if ( ps && !is_xen_heap_mfn(paddr_to_pfn(ps)-1) )
2042
        ps += PAGE_SIZE;
2043
    if ( !is_xen_heap_mfn(paddr_to_pfn(pe)) )
2044
        pe -= PAGE_SIZE;
2045
2046
    memguard_guard_range(maddr_to_virt(ps), pe - ps);
2047
2048
    init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT);
2049
}
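
The page trimmed off either end of the range acts as a permanent gap between the Xen and Dom zones, so the buddy allocator never sees two adjacent free blocks straddling the boundary that it could merge. A standalone sketch of the rounding and trimming arithmetic, with round_pgup/round_pgdown re-implemented locally for the example; unlike the real function, the sketch trims unconditionally rather than checking whether the neighbouring page is already Xen heap:

#include <stdio.h>

#define DEMO_PAGE_SIZE  4096UL
#define DEMO_PAGE_MASK  (~(DEMO_PAGE_SIZE - 1))

static unsigned long round_pgup_demo(unsigned long p)
{
    return (p + DEMO_PAGE_SIZE - 1) & DEMO_PAGE_MASK;
}

static unsigned long round_pgdown_demo(unsigned long p)
{
    return p & DEMO_PAGE_MASK;
}

int main(void)
{
    unsigned long ps = 0x100123;   /* unaligned start of the range */
    unsigned long pe = 0x1fff00;   /* unaligned end of the range */

    ps = round_pgup_demo(ps);      /* -> 0x101000 */
    pe = round_pgdown_demo(pe);    /* -> 0x1ff000 */

    /* Leave a one-page buffer at each end that is never handed to the heap. */
    ps += DEMO_PAGE_SIZE;
    pe -= DEMO_PAGE_SIZE;

    printf("heap gets [%#lx, %#lx), %lu pages\n",
           ps, pe, (pe - ps) / DEMO_PAGE_SIZE);
    return 0;
}
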
2050
2051
2052
void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
2053
{
2054
    struct page_info *pg;
2055
2056
    ASSERT(!in_irq());
2057
2058
    pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
2059
                          order, memflags | MEMF_no_scrub, NULL);
2060
    if ( unlikely(pg == NULL) )
2061
        return NULL;
2062
2063
    memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));
2064
2065
    return page_to_virt(pg);
2066
}
2067
2068
2069
void free_xenheap_pages(void *v, unsigned int order)
2070
{
2071
    ASSERT(!in_irq());
2072
2073
    if ( v == NULL )
2074
        return;
2075
2076
    memguard_guard_range(v, 1 << (order + PAGE_SHIFT));
2077
2078
    free_heap_pages(virt_to_page(v), order, false);
2079
}
2080
2081
#else
2082
2083
void __init xenheap_max_mfn(unsigned long mfn)
2084
0
{
2085
0
    ASSERT(!first_node_initialised);
2086
0
    ASSERT(!xenheap_bits);
2087
0
    BUILD_BUG_ON(PADDR_BITS >= BITS_PER_LONG);
2088
0
    xenheap_bits = min(flsl(mfn + 1) - 1 + PAGE_SHIFT, PADDR_BITS);
2089
0
    printk(XENLOG_INFO "Xen heap: %u bits\n", xenheap_bits);
2090
0
}
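
The formula above takes the index of the highest set bit of (mfn + 1), adds PAGE_SHIFT to turn a frame count into an address width, and caps the result at PADDR_BITS. A quick numeric check of that arithmetic, with flsl() modelled via __builtin_clzl and DEMO_PADDR_BITS chosen arbitrarily for the example:

#include <stdio.h>

#define DEMO_PAGE_SHIFT 12
#define DEMO_PADDR_BITS 44   /* illustrative cap, not a real arch constant */

/* Highest set bit position, 1-based, like flsl(); v must be non-zero. */
static int flsl_demo(unsigned long v)
{
    return 8 * (int)sizeof(v) - __builtin_clzl(v);
}

int main(void)
{
    unsigned long max_mfn = 0x7fffff;              /* 8M pages => 32GB of RAM */
    int bits = flsl_demo(max_mfn + 1) - 1 + DEMO_PAGE_SHIFT;

    if ( bits > DEMO_PADDR_BITS )
        bits = DEMO_PADDR_BITS;

    printf("direct map covers addresses below 2^%d\n", bits);
    return 0;
}
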
2091
2092
void init_xenheap_pages(paddr_t ps, paddr_t pe)
2093
1
{
2094
1
    init_domheap_pages(ps, pe);
2095
1
}
2096
2097
void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
2098
197
{
2099
197
    struct page_info *pg;
2100
197
    unsigned int i;
2101
197
2102
197
    ASSERT(!in_irq());
2103
197
2104
197
    if ( xenheap_bits && (memflags >> _MEMF_bits) > xenheap_bits )
2105
0
        memflags &= ~MEMF_bits(~0U);
2106
197
    if ( !(memflags >> _MEMF_bits) )
2107
166
        memflags |= MEMF_bits(xenheap_bits);
2108
197
2109
197
    pg = alloc_domheap_pages(NULL, order, memflags | MEMF_no_scrub);
2110
197
    if ( unlikely(pg == NULL) )
2111
0
        return NULL;
2112
197
2113
817
    for ( i = 0; i < (1u << order); i++ )
2114
620
        pg[i].count_info |= PGC_xen_heap;
2115
197
2116
197
    return page_to_virt(pg);
2117
197
}
2118
2119
void free_xenheap_pages(void *v, unsigned int order)
2120
9
{
2121
9
    struct page_info *pg;
2122
9
    unsigned int i;
2123
9
2124
9
    ASSERT(!in_irq());
2125
9
2126
9
    if ( v == NULL )
2127
0
        return;
2128
9
2129
9
    pg = virt_to_page(v);
2130
9
2131
124
    for ( i = 0; i < (1u << order); i++ )
2132
115
        pg[i].count_info &= ~PGC_xen_heap;
2133
9
2134
9
    free_heap_pages(pg, order, true);
2135
9
}
2136
2137
#endif
2138
2139
2140
2141
/*************************
2142
 * DOMAIN-HEAP SUB-ALLOCATOR
2143
 */
2144
2145
void init_domheap_pages(paddr_t ps, paddr_t pe)
2146
3
{
2147
3
    unsigned long smfn, emfn;
2148
3
2149
3
    ASSERT(!in_irq());
2150
3
2151
3
    smfn = round_pgup(ps) >> PAGE_SHIFT;
2152
3
    emfn = round_pgdown(pe) >> PAGE_SHIFT;
2153
3
2154
3
    if ( emfn <= smfn )
2155
0
        return;
2156
3
2157
3
    init_heap_pages(mfn_to_page(smfn), emfn - smfn);
2158
3
}
2159
2160
2161
int assign_pages(
2162
    struct domain *d,
2163
    struct page_info *pg,
2164
    unsigned int order,
2165
    unsigned int memflags)
2166
76
{
2167
76
    int rc = 0;
2168
76
    unsigned long i;
2169
76
2170
76
    spin_lock(&d->page_alloc_lock);
2171
76
2172
76
    if ( unlikely(d->is_dying) )
2173
0
    {
2174
0
        gdprintk(XENLOG_INFO, "Cannot assign page to domain%d -- dying.\n",
2175
0
                d->domain_id);
2176
0
        rc = -EINVAL;
2177
0
        goto out;
2178
0
    }
2179
76
2180
76
    if ( !(memflags & MEMF_no_refcount) )
2181
76
    {
2182
76
        if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
2183
0
        {
2184
0
            if ( !tmem_enabled() || order != 0 || d->tot_pages != d->max_pages )
2185
0
                gprintk(XENLOG_INFO, "Over-allocation for domain %u: "
2186
0
                        "%u > %u\n", d->domain_id,
2187
0
                        d->tot_pages + (1 << order), d->max_pages);
2188
0
            rc = -E2BIG;
2189
0
            goto out;
2190
0
        }
2191
76
2192
76
        if ( unlikely(d->tot_pages == 0) )
2193
1
            get_knownalive_domain(d);
2194
76
2195
76
        domain_adjust_tot_pages(d, 1 << order);
2196
76
    }
2197
76
2198
4.05M
    for ( i = 0; i < (1 << order); i++ )
2199
4.05M
    {
2200
4.05M
        ASSERT(page_get_owner(&pg[i]) == NULL);
2201
4.05M
        ASSERT((pg[i].count_info & ~(PGC_allocated | 1)) == 0);
2202
4.05M
        page_set_owner(&pg[i], d);
2203
4.05M
        smp_wmb(); /* Domain pointer must be visible before updating refcnt. */
2204
4.05M
        pg[i].count_info = PGC_allocated | 1;
2205
4.05M
        page_list_add_tail(&pg[i], &d->page_list);
2206
4.05M
    }
2207
76
2208
76
 out:
2209
76
    spin_unlock(&d->page_alloc_lock);
2210
76
    return rc;
2211
76
}
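
The smp_wmb() between page_set_owner() and the count_info update enforces a publication order: the owner pointer must be visible to other CPUs before the reference count makes the page look allocated, so anyone who observes the count can safely follow the owner. The same idea expressed with portable C11 release/acquire atomics; demo_page, publish and consume are names invented for this sketch and are not Xen primitives:

#include <stdatomic.h>
#include <stddef.h>

struct demo_page {
    void *owner;                     /* plain field, written before publish */
    _Atomic unsigned long count;     /* acts as the publication flag */
};

static void publish(struct demo_page *pg, void *owner)
{
    pg->owner = owner;
    /* Release: the owner store cannot be reordered after the count store. */
    atomic_store_explicit(&pg->count, 1, memory_order_release);
}

static void *consume(struct demo_page *pg)
{
    /* Acquire pairs with the release above: if count is seen as non-zero,
     * the owner written before the release store is visible too. */
    if ( atomic_load_explicit(&pg->count, memory_order_acquire) )
        return pg->owner;
    return NULL;
}

int main(void)
{
    static struct demo_page pg;
    int dom0 = 0;

    publish(&pg, &dom0);
    return consume(&pg) == &dom0 ? 0 : 1;
}
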
2212
2213
2214
struct page_info *alloc_domheap_pages(
2215
    struct domain *d, unsigned int order, unsigned int memflags)
2216
43.2k
{
2217
43.2k
    struct page_info *pg = NULL;
2218
43.2k
    unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
2219
43.2k
    unsigned int dma_zone;
2220
43.2k
2221
43.2k
    ASSERT(!in_irq());
2222
43.2k
2223
43.2k
    bits = domain_clamp_alloc_bitsize(memflags & MEMF_no_owner ? NULL : d,
2224
43.2k
                                      bits ? : (BITS_PER_LONG+PAGE_SHIFT));
2225
43.2k
    if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
2226
0
        return NULL;
2227
43.2k
2228
43.2k
    if ( memflags & MEMF_no_owner )
2229
34.8k
        memflags |= MEMF_no_refcount;
2230
43.2k
2231
43.2k
    if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
2232
0
        pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d);
2233
43.2k
2234
43.2k
    if ( (pg == NULL) &&
2235
43.2k
         ((memflags & MEMF_no_dma) ||
2236
43.2k
          ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, order,
2237
43.2k
                                  memflags, d)) == NULL)) )
2238
25
         return NULL;
2239
43.2k
2240
43.2k
    if ( d && !(memflags & MEMF_no_owner) &&
2241
76
         assign_pages(d, pg, order, memflags) )
2242
0
    {
2243
0
        free_heap_pages(pg, order, memflags & MEMF_no_scrub);
2244
0
        return NULL;
2245
0
    }
2246
43.2k
    
2247
43.2k
    return pg;
2248
43.2k
}
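
alloc_domheap_pages() tries the zones above the DMA boundary first and only falls back to the full zone range, DMA zones included, when that fails and MEMF_no_dma is not set. A reduced model of that fallback policy with invented helpers (try_alloc, alloc_with_dma_fallback) and a fake allocator that only has low memory left:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in allocator: pretend only the lowest (DMA) zones have free memory. */
static void *try_alloc(unsigned int zone_lo, unsigned int zone_hi)
{
    static char low_memory[4096];

    return (zone_lo <= 1 && zone_hi >= 1) ? low_memory : NULL;
}

static void *alloc_with_dma_fallback(unsigned int dma_zone,
                                     unsigned int zone_hi, bool no_dma)
{
    void *p = NULL;

    /* First try the zones above the DMA boundary, keeping low memory free. */
    if ( dma_zone < zone_hi )
        p = try_alloc(dma_zone + 1, zone_hi);

    /* Fall back to the full range, including DMA zones, unless forbidden. */
    if ( p == NULL && !no_dma )
        p = try_alloc(1, zone_hi);

    return p;
}

int main(void)
{
    printf("with fallback: %p\n", alloc_with_dma_fallback(4, 20, false));
    printf("no_dma set:    %p\n", alloc_with_dma_fallback(4, 20, true));
    return 0;
}
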
2249
2250
void free_domheap_pages(struct page_info *pg, unsigned int order)
2251
150
{
2252
150
    struct domain *d = page_get_owner(pg);
2253
150
    unsigned int i;
2254
150
    bool_t drop_dom_ref;
2255
150
2256
150
    ASSERT(!in_irq());
2257
150
2258
150
    if ( unlikely(is_xen_heap_page(pg)) )
2259
149
    {
2260
149
        /* NB. May recursively lock from relinquish_memory(). */
2261
149
        spin_lock_recursive(&d->page_alloc_lock);
2262
149
2263
298
        for ( i = 0; i < (1 << order); i++ )
2264
149
            arch_free_heap_page(d, &pg[i]);
2265
149
2266
149
        d->xenheap_pages -= 1 << order;
2267
149
        drop_dom_ref = (d->xenheap_pages == 0);
2268
149
2269
149
        spin_unlock_recursive(&d->page_alloc_lock);
2270
149
    }
2271
150
    else
2272
1
    {
2273
1
        bool_t scrub;
2274
1
2275
1
        if ( likely(d) && likely(d != dom_cow) )
2276
1
        {
2277
1
            /* NB. May recursively lock from relinquish_memory(). */
2278
1
            spin_lock_recursive(&d->page_alloc_lock);
2279
1
2280
2
            for ( i = 0; i < (1 << order); i++ )
2281
1
            {
2282
1
                BUG_ON((pg[i].u.inuse.type_info & PGT_count_mask) != 0);
2283
1
                arch_free_heap_page(d, &pg[i]);
2284
1
            }
2285
1
2286
1
            drop_dom_ref = !domain_adjust_tot_pages(d, -(1 << order));
2287
1
2288
1
            spin_unlock_recursive(&d->page_alloc_lock);
2289
1
2290
1
            /*
2291
1
             * Normally we expect a domain to clear pages before freeing them,
2292
1
             * if it cares about the secrecy of their contents. However, after
2293
1
             * a domain has died we assume responsibility for erasure.
2294
1
             */
2295
1
            scrub = d->is_dying || scrub_debug;
2296
1
        }
2297
1
        else
2298
0
        {
2299
0
            /*
2300
0
             * All we need to check is that on dom_cow only order-0 chunks
2301
0
             * make it here. Due to the if() above, the only two possible
2302
0
             * cases right now are d == NULL and d == dom_cow. To protect
2303
0
             * against relaxation of that if() condition without updating the
2304
0
             * check here, don't check d != dom_cow for now.
2305
0
             */
2306
0
            ASSERT(!d || !order);
2307
0
            drop_dom_ref = 0;
2308
0
            scrub = 1;
2309
0
        }
2310
1
2311
1
        free_heap_pages(pg, order, scrub);
2312
1
    }
2313
150
2314
150
    if ( drop_dom_ref )
2315
0
        put_domain(d);
2316
150
}
2317
2318
unsigned long avail_domheap_pages_region(
2319
    unsigned int node, unsigned int min_width, unsigned int max_width)
2320
1
{
2321
1
    int zone_lo, zone_hi;
2322
1
2323
1
    zone_lo = min_width ? bits_to_zone(min_width) : (MEMZONE_XEN + 1);
2324
1
    zone_lo = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_lo));
2325
1
2326
1
    zone_hi = max_width ? bits_to_zone(max_width) : (NR_ZONES - 1);
2327
1
    zone_hi = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_hi));
2328
1
2329
1
    return avail_heap_pages(zone_lo, zone_hi, node);
2330
1
}
2331
2332
unsigned long avail_domheap_pages(void)
2333
0
{
2334
0
    return avail_heap_pages(MEMZONE_XEN + 1,
2335
0
                            NR_ZONES - 1,
2336
0
                            -1);
2337
0
}
2338
2339
unsigned long avail_node_heap_pages(unsigned int nodeid)
2340
0
{
2341
0
    return avail_heap_pages(MEMZONE_XEN, NR_ZONES - 1, nodeid);
2342
0
}
2343
2344
2345
static void pagealloc_info(unsigned char key)
2346
0
{
2347
0
    unsigned int zone = MEMZONE_XEN;
2348
0
    unsigned long n, total = 0;
2349
0
2350
0
    printk("Physical memory information:\n");
2351
0
    printk("    Xen heap: %lukB free\n",
2352
0
           avail_heap_pages(zone, zone, -1) << (PAGE_SHIFT-10));
2353
0
2354
0
    while ( ++zone < NR_ZONES )
2355
0
    {
2356
0
        if ( (zone + PAGE_SHIFT) == dma_bitsize )
2357
0
        {
2358
0
            printk("    DMA heap: %lukB free\n", total << (PAGE_SHIFT-10));
2359
0
            total = 0;
2360
0
        }
2361
0
2362
0
        if ( (n = avail_heap_pages(zone, zone, -1)) != 0 )
2363
0
        {
2364
0
            total += n;
2365
0
            printk("    heap[%02u]: %lukB free\n", zone, n << (PAGE_SHIFT-10));
2366
0
        }
2367
0
    }
2368
0
2369
0
    printk("    Dom heap: %lukB free\n", total << (PAGE_SHIFT-10));
2370
0
}
2371
2372
static __init int pagealloc_keyhandler_init(void)
2373
1
{
2374
1
    register_keyhandler('m', pagealloc_info, "memory info", 1);
2375
1
    return 0;
2376
1
}
2377
__initcall(pagealloc_keyhandler_init);
2378
2379
2380
void scrub_one_page(struct page_info *pg)
2381
34.7k
{
2382
34.7k
    if ( unlikely(pg->count_info & PGC_broken) )
2383
0
        return;
2384
34.7k
2385
34.7k
#ifndef NDEBUG
2386
34.7k
    /* Avoid callers relying on allocations returning zeroed pages. */
2387
34.7k
    unmap_domain_page(memset(__map_domain_page(pg),
2388
34.7k
                             SCRUB_BYTE_PATTERN, PAGE_SIZE));
2389
34.7k
#else
2390
    /* For a production build, clear_page() is the fastest way to scrub. */
2391
    clear_domain_page(_mfn(page_to_mfn(pg)));
2392
#endif
2393
34.7k
}
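
scrub_one_page() intentionally differs between builds: debug builds fill the page with SCRUB_BYTE_PATTERN so that callers wrongly assuming freshly allocated pages are zeroed trip over the poison, while production builds simply zero the page, which is also the cheapest way to erase it. A minimal model of the two strategies; DEMO_SCRUB_BYTE is an arbitrary poison value chosen for the sketch, not the real pattern:

#include <assert.h>
#include <string.h>

#define DEMO_PAGE_SIZE  4096
#define DEMO_SCRUB_BYTE 0xc2   /* arbitrary poison value for this sketch */

static void scrub_page_demo(unsigned char *page, int debug_build)
{
    if ( debug_build )
        /* Poison: surfaces callers that rely on allocations being zeroed. */
        memset(page, DEMO_SCRUB_BYTE, DEMO_PAGE_SIZE);
    else
        /* Production: zeroing is the cheapest way to erase the contents. */
        memset(page, 0, DEMO_PAGE_SIZE);
}

int main(void)
{
    static unsigned char page[DEMO_PAGE_SIZE];

    scrub_page_demo(page, 1);
    assert(page[0] == DEMO_SCRUB_BYTE && page[DEMO_PAGE_SIZE - 1] == DEMO_SCRUB_BYTE);

    scrub_page_demo(page, 0);
    assert(page[123] == 0);
    return 0;
}
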
2394
2395
static void dump_heap(unsigned char key)
2396
0
{
2397
0
    s_time_t      now = NOW();
2398
0
    int           i, j;
2399
0
2400
0
    printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key,
2401
0
           (u32)(now>>32), (u32)now);
2402
0
2403
0
    for ( i = 0; i < MAX_NUMNODES; i++ )
2404
0
    {
2405
0
        if ( !avail[i] )
2406
0
            continue;
2407
0
        for ( j = 0; j < NR_ZONES; j++ )
2408
0
            printk("heap[node=%d][zone=%d] -> %lu pages\n",
2409
0
                   i, j, avail[i][j]);
2410
0
    }
2411
0
2412
0
    for ( i = 0; i < MAX_NUMNODES; i++ )
2413
0
    {
2414
0
        if ( !node_need_scrub[i] )
2415
0
            continue;
2416
0
        printk("Node %d has %lu unscrubbed pages\n", i, node_need_scrub[i]);
2417
0
    }
2418
0
}
2419
2420
static __init int register_heap_trigger(void)
2421
1
{
2422
1
    register_keyhandler('H', dump_heap, "dump heap info", 1);
2423
1
    return 0;
2424
1
}
2425
__initcall(register_heap_trigger);
2426
2427
/*
2428
 * Local variables:
2429
 * mode: C
2430
 * c-file-style: "BSD"
2431
 * c-basic-offset: 4
2432
 * tab-width: 4
2433
 * indent-tabs-mode: nil
2434
 * End:
2435
 */