Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/arch/x86/numa.c

Line  Count  Source
   1         /*
   2          * Generic VM initialization for x86-64 NUMA setups.
   3          * Copyright 2002,2003 Andi Kleen, SuSE Labs.
   4          * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
   5          */
   6
   7         #include <xen/mm.h>
   8         #include <xen/string.h>
   9         #include <xen/init.h>
  10         #include <xen/ctype.h>
  11         #include <xen/nodemask.h>
  12         #include <xen/numa.h>
  13         #include <xen/keyhandler.h>
  14         #include <xen/time.h>
  15         #include <xen/smp.h>
  16         #include <xen/pfn.h>
  17         #include <asm/acpi.h>
  18         #include <xen/sched.h>
  19         #include <xen/softirq.h>
  20
  21         static int numa_setup(const char *s);
  22         custom_param("numa", numa_setup);
  23
  24         #ifndef Dprintk
  25         #define Dprintk(x...)
  26         #endif
  27
  28         /* from proto.h */
  29         #define round_up(x,y) ((((x)+(y))-1) & (~((y)-1)))
  30
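Note: round_up() only works when y is a power of two; it adds y-1 and then masks off the low bits, so for example round_up(0x1234, 0x1000) == 0x2000. A minimal function-form sketch of the same computation (hypothetical helper, not part of this file):

    /* Sketch only: equivalent of the round_up() macro, for power-of-two y. */
    static inline unsigned long round_up_sketch(unsigned long x, unsigned long y)
    {
        /* Adding y-1 carries x past the next boundary; the mask snaps it down. */
        return (x + y - 1) & ~(y - 1);
    }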
  31         struct node_data node_data[MAX_NUMNODES];
  32
  33         /* Mapping from pdx to node id */
  34         int memnode_shift;
  35         static typeof(*memnodemap) _memnodemap[64];
  36         unsigned long memnodemapsize;
  37         u8 *memnodemap;
  38
  39         nodeid_t cpu_to_node[NR_CPUS] __read_mostly = {
  40             [0 ... NR_CPUS-1] = NUMA_NO_NODE
  41         };
  42         /*
  43          * Keep BIOS's CPU2node information; it should not be used for memory allocation.
  44          */
  45         nodeid_t apicid_to_node[MAX_LOCAL_APIC] = {
  46             [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
  47         };
  48         cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
  49
  50         nodemask_t __read_mostly node_online_map = { { [0] = 1UL } };
  51
  52         bool numa_off;
  53         s8 acpi_numa = 0;
  54
  55         int srat_disabled(void)
  56      0  {
  57      0      return numa_off || acpi_numa < 0;
  58      0  }
  59
  60         /*
  61          * Given a shift value, try to populate memnodemap[].
  62          * Returns:
  63          * 1 if OK
  64          * 0 if memnodemap[] is too small (or the shift is too small)
  65          * -1 if nodes overlap or RAM is lost (shift too big)
  66          */
  67         static int __init populate_memnodemap(const struct node *nodes,
  68                                               int numnodes, int shift, nodeid_t *nodeids)
  69      0  {
  70      0      unsigned long spdx, epdx;
  71      0      int i, res = -1;
  72      0
  73      0      memset(memnodemap, NUMA_NO_NODE, memnodemapsize * sizeof(*memnodemap));
  74      0      for ( i = 0; i < numnodes; i++ )
  75      0      {
  76      0          spdx = paddr_to_pdx(nodes[i].start);
  77      0          epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
  78      0          if ( spdx >= epdx )
  79      0              continue;
  80      0          if ( (epdx >> shift) >= memnodemapsize )
  81      0              return 0;
  82      0          do {
  83      0              if ( memnodemap[spdx >> shift] != NUMA_NO_NODE )
  84      0                  return -1;
  85      0
  86      0              if ( !nodeids )
  87      0                  memnodemap[spdx >> shift] = i;
  88      0              else
  89      0                  memnodemap[spdx >> shift] = nodeids[i];
  90      0
  91      0              spdx += (1UL << shift);
  92      0          } while ( spdx < epdx );
  93      0          res = 1;
  94      0      }
  95      0
  96      0      return res;
  97      0  }
  98
  99         static int __init allocate_cachealigned_memnodemap(void)
 100      0  {
 101      0      unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap));
 102      0      unsigned long mfn = mfn_x(alloc_boot_pages(size, 1));
 103      0
 104      0      memnodemap = mfn_to_virt(mfn);
 105      0      mfn <<= PAGE_SHIFT;
 106      0      size <<= PAGE_SHIFT;
 107      0      printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
 108      0             mfn, mfn + size);
 109      0      memnodemapsize = size / sizeof(*memnodemap);
 110      0
 111      0      return 0;
 112      0  }
 113
 114         /*
 115          * The LSB of all start and end addresses in the node map is the value of the
 116          * maximum possible shift.
 117          */
 118         static int __init extract_lsb_from_nodes(const struct node *nodes,
 119                                                  int numnodes)
 120      0  {
 121      0      int i, nodes_used = 0;
 122      0      unsigned long spdx, epdx;
 123      0      unsigned long bitfield = 0, memtop = 0;
 124      0
 125      0      for ( i = 0; i < numnodes; i++ )
 126      0      {
 127      0          spdx = paddr_to_pdx(nodes[i].start);
 128      0          epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
 129      0          if ( spdx >= epdx )
 130      0              continue;
 131      0          bitfield |= spdx;
 132      0          nodes_used++;
 133      0          if ( epdx > memtop )
 134      0              memtop = epdx;
 135      0      }
 136      0      if ( nodes_used <= 1 )
 137      0          i = BITS_PER_LONG - 1;
 138      0      else
 139      0          i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
 140      0      memnodemapsize = (memtop >> i) + 1;
 141      0      return i;
 142      0  }
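A worked example of the shift extraction above (illustrative numbers, not taken from this report): with 4KiB pages and two 4GiB nodes starting at physical addresses 0 and 4GiB, the start pdx values are 0x0 and 0x100000, so bitfield == 0x100000 and find_first_bit() yields 20. Each memnodemap[] entry then covers 2^20 pdx units (4GiB of address space), and with memtop == 0x200000 the table needs (0x200000 >> 20) + 1 == 3 entries.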
 143
 144         int __init compute_hash_shift(struct node *nodes, int numnodes,
 145                                       nodeid_t *nodeids)
 146      0  {
 147      0      int shift;
 148      0
 149      0      shift = extract_lsb_from_nodes(nodes, numnodes);
 150      0      if ( memnodemapsize <= ARRAY_SIZE(_memnodemap) )
 151      0          memnodemap = _memnodemap;
 152      0      else if ( allocate_cachealigned_memnodemap() )
 153      0          return -1;
 154      0      printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift);
 155      0
 156      0      if ( populate_memnodemap(nodes, numnodes, shift, nodeids) != 1 )
 157      0      {
 158      0          printk(KERN_INFO "Your memory is not aligned; you need to "
 159      0                 "rebuild your hypervisor with a bigger NODEMAPSIZE "
 160      0                 "shift=%d\n", shift);
 161      0          return -1;
 162      0      }
 163      0
 164      0      return shift;
 165      0  }
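Taken together, extract_lsb_from_nodes(), populate_memnodemap() and compute_hash_shift() build a one-level radix table: once the shift is chosen, a physical address resolves to a node with one shift and one array lookup. A minimal sketch of that consumer (hypothetical name; in Xen the real lookup is the phys_to_nid() macro defined in a header, not code in this file):

    /* Sketch only: how the populated memnodemap[] is meant to be consumed. */
    static inline nodeid_t lookup_nid_sketch(paddr_t addr)
    {
        /* Each memnodemap[] entry covers 2^memnode_shift pdx units. */
        return memnodemap[paddr_to_pdx(addr) >> memnode_shift];
    }

dump_numa() further down sanity-checks exactly this relation for each online node.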
 166         /* initialize NODE_DATA given nodeid and start/end */
 167         void __init setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end)
 168      1  {
 169      1      unsigned long start_pfn, end_pfn;
 170      1
 171      1      start_pfn = start >> PAGE_SHIFT;
 172      1      end_pfn = end >> PAGE_SHIFT;
 173      1
 174      1      NODE_DATA(nodeid)->node_start_pfn = start_pfn;
 175      1      NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
 176      1
 177      1      node_set_online(nodeid);
 178      1  }
 179
 180         void __init numa_init_array(void)
 181      0  {
 182      0      int rr, i;
 183      0
 184      0      /* There are unfortunately some poorly designed mainboards around
 185      0         that only connect memory to a single CPU. This breaks the 1:1 cpu->node
 186      0         mapping. To avoid this fill in the mapping for all possible
 187      0         CPUs, as the number of CPUs is not known yet.
 188      0         We round robin the existing nodes. */
 189      0      rr = first_node(node_online_map);
 190      0      for ( i = 0; i < nr_cpu_ids; i++ )
 191      0      {
 192      0          if ( cpu_to_node[i] != NUMA_NO_NODE )
 193      0              continue;
 194      0          numa_set_node(i, rr);
 195      0          rr = next_node(rr, node_online_map);
 196      0          if ( rr == MAX_NUMNODES )
 197      0              rr = first_node(node_online_map);
 198      0      }
 199      0  }
 200
 201         #ifdef CONFIG_NUMA_EMU
 202         static int numa_fake __initdata = 0;
 203
 204         /* Numa emulation */
 205         static int __init numa_emulation(u64 start_pfn, u64 end_pfn)
 206      0  {
 207      0      int i;
 208      0      struct node nodes[MAX_NUMNODES];
 209      0      u64 sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
 210      0
 211      0      /* Kludge needed for the hash function */
 212      0      if ( hweight64(sz) > 1 )
 213      0      {
 214      0          u64 x = 1;
 215      0          while ( (x << 1) < sz )
 216      0              x <<= 1;
 217      0          if ( x < sz/2 )
 218      0              printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
 219      0          sz = x;
 220      0      }
 221      0
 222      0      memset(&nodes,0,sizeof(nodes));
 223      0      for ( i = 0; i < numa_fake; i++ )
 224      0      {
 225      0          nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
 226      0          if ( i == numa_fake - 1 )
 227      0              sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
 228      0          nodes[i].end = nodes[i].start + sz;
 229      0          printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
 230      0                 i,
 231      0                 nodes[i].start, nodes[i].end,
 232      0                 (nodes[i].end - nodes[i].start) >> 20);
 233      0          node_set_online(i);
 234      0      }
 235      0      memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
 236      0      if ( memnode_shift < 0 )
 237      0      {
 238      0          memnode_shift = 0;
 239      0          printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
 240      0          return -1;
 241      0      }
 242      0      for_each_online_node ( i )
 243      0          setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 244      0      numa_init_array();
 245      0
 246      0      return 0;
 247      0  }
 248         #endif
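A worked example of the emulation sizing above (illustrative, not from this report): booting with numa=fake=4 on a box whose memory spans pfn 0 to 0x100000 (4GiB) gives sz == 1GiB, already a power of two, so the hweight64() rounding kludge is skipped and four 1GiB fake nodes are created, with the last node stretched to absorb any remainder.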
 249
 250         void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 251      1  {
 252      1      int i;
 253      1
 254      1  #ifdef CONFIG_NUMA_EMU
 255      1      if ( numa_fake && !numa_emulation(start_pfn, end_pfn) )
 256      0          return;
 257      1  #endif
 258      1
 259      1  #ifdef CONFIG_ACPI_NUMA
 260      1      if ( !numa_off && !acpi_scan_nodes((u64)start_pfn << PAGE_SHIFT,
 261      1           (u64)end_pfn << PAGE_SHIFT) )
 262      0          return;
 263      1  #endif
 264      1
 265      1      printk(KERN_INFO "%s\n",
 266      1             numa_off ? "NUMA turned off" : "No NUMA configuration found");
 267      1
 268      1      printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n",
 269      1             (u64)start_pfn << PAGE_SHIFT,
 270      1             (u64)end_pfn << PAGE_SHIFT);
 271      1      /* setup dummy node covering all memory */
 272      1      memnode_shift = BITS_PER_LONG - 1;
 273      1      memnodemap = _memnodemap;
 274      1      nodes_clear(node_online_map);
 275      1      node_set_online(0);
 276    257      for ( i = 0; i < nr_cpu_ids; i++ )
 277    256          numa_set_node(i, 0);
 278      1      cpumask_copy(&node_to_cpumask[0], cpumask_of(0));
 279      1      setup_node_bootmem(0, (u64)start_pfn << PAGE_SHIFT,
 280      1                      (u64)end_pfn << PAGE_SHIFT);
 281      1  }
 282
 283         void numa_add_cpu(int cpu)
 284     12  {
 285     12      cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
 286     12  }
 287
 288         void numa_set_node(int cpu, nodeid_t node)
 289    280  {
 290    280      cpu_to_node[cpu] = node;
 291    280  }
 292
 293         /* [numa=off] */
 294         static __init int numa_setup(const char *opt)
 295      0  {
 296      0      if ( !strncmp(opt,"off",3) )
 297      0          numa_off = true;
 298      0      else if ( !strncmp(opt,"on",2) )
 299      0          numa_off = false;
 300      0  #ifdef CONFIG_NUMA_EMU
 301      0      else if ( !strncmp(opt, "fake=", 5) )
 302      0      {
 303      0          numa_off = false;
 304      0          numa_fake = simple_strtoul(opt+5,NULL,0);
 305      0          if ( numa_fake >= MAX_NUMNODES )
 306      0              numa_fake = MAX_NUMNODES;
 307      0      }
 308      0  #endif
 309      0  #ifdef CONFIG_ACPI_NUMA
 310      0      else if ( !strncmp(opt,"noacpi",6) )
 311      0      {
 312      0          numa_off = false;
 313      0          acpi_numa = -1;
 314      0      }
 315      0  #endif
 316      0      else
 317      0          return -EINVAL;
 318      0
 319      0      return 0;
 320      0  }
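The parser above accepts exactly the forms matched by its strncmp() cases on the hypervisor command line: numa=off, numa=on, numa=fake=<n> (when built with CONFIG_NUMA_EMU) and numa=noacpi (when built with CONFIG_ACPI_NUMA); anything else makes the handler return -EINVAL. For example, numa=fake=4 takes the numa_emulation() path shown earlier.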
 321
 322         /*
 323          * Set up early cpu_to_node.
 324          *
 325          * Populate cpu_to_node[] only when the x86_cpu_to_apicid[] and
 326          * apicid_to_node[] tables have valid entries for a CPU.
 327          * This means we skip cpu_to_node[] initialisation for NUMA
 328          * emulation and the faked-node case (when running a kernel compiled
 329          * for NUMA on a non-NUMA box), which is fine: cpu_to_node[]
 330          * was already initialised in a round-robin manner by numa_init_array()
 331          * before this call, and that initialisation is good enough
 332          * for the fake NUMA cases.
 333          */
 334         void __init init_cpu_to_node(void)
 335      1  {
 336      1      unsigned int i;
 337      1      nodeid_t node;
 338      1
 339     13      for ( i = 0; i < nr_cpu_ids; i++ )
 340     12      {
 341     12          u32 apicid = x86_cpu_to_apicid[i];
 342     12          if ( apicid == BAD_APICID )
 343      0              continue;
 344     12          node = apicid < MAX_LOCAL_APIC ? apicid_to_node[apicid] : NUMA_NO_NODE;
 345     12          if ( node == NUMA_NO_NODE || !node_online(node) )
 346     12              node = 0;
 347     12          numa_set_node(i, node);
 348     12      }
 349      1  }
 350
 351         unsigned int __init arch_get_dma_bitsize(void)
 352      0  {
 353      0      unsigned int node;
 354      0
 355      0      for_each_online_node(node)
 356      0          if ( node_spanned_pages(node) &&
 357      0               !(node_start_pfn(node) >> (32 - PAGE_SHIFT)) )
 358      0              break;
 359      0      if ( node >= MAX_NUMNODES )
 360      0          panic("No node with memory below 4Gb");
 361      0
 362      0      /*
 363      0       * Try not to reserve the whole node's memory for DMA; instead,
 364      0       * divide its spanned pages by (the arbitrarily chosen) 4.
 365      0       */
 366      0      return min_t(unsigned int,
 367      0                   flsl(node_start_pfn(node) + node_spanned_pages(node) / 4 - 1)
 368      0                   + PAGE_SHIFT, 32);
 369      0  }
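A worked example of the return expression (illustrative numbers): for a node starting at pfn 0 and spanning 0x100000 pages (4GiB of 4KiB pages), node_spanned_pages()/4 - 1 == 0x3ffff, flsl(0x3ffff) == 18, and the function returns min(18 + PAGE_SHIFT, 32) == 30, i.e. a 30-bit DMA width covering addresses below 1GiB.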
 370
 371         static void dump_numa(unsigned char key)
 372      0  {
 373      0      s_time_t now = NOW();
 374      0      unsigned int i, j, n;
 375      0      int err;
 376      0      struct domain *d;
 377      0      struct page_info *page;
 378      0      unsigned int page_num_node[MAX_NUMNODES];
 379      0      const struct vnuma_info *vnuma;
 380      0
 381      0      printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key,
 382      0             (u32)(now>>32), (u32)now);
 383      0
 384      0      for_each_online_node ( i )
 385      0      {
 386      0          paddr_t pa = pfn_to_paddr(node_start_pfn(i) + 1);
 387      0
 388      0          printk("NODE%u start->%lu size->%lu free->%lu\n",
 389      0                 i, node_start_pfn(i), node_spanned_pages(i),
 390      0                 avail_node_heap_pages(i));
 391      0          /* sanity check phys_to_nid() */
 392      0          if ( phys_to_nid(pa) != i )
 393      0              printk("phys_to_nid(%"PRIpaddr") -> %d should be %u\n",
 394      0                     pa, phys_to_nid(pa), i);
 395      0      }
 396      0
 397      0      j = cpumask_first(&cpu_online_map);
 398      0      n = 0;
 399      0      for_each_online_cpu ( i )
 400      0      {
 401      0          if ( i != j + n || cpu_to_node[j] != cpu_to_node[i] )
 402      0          {
 403      0              if ( n > 1 )
 404      0                  printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]);
 405      0              else
 406      0                  printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]);
 407      0              j = i;
 408      0              n = 1;
 409      0          }
 410      0          else
 411      0              ++n;
 412      0      }
 413      0      if ( n > 1 )
 414      0          printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]);
 415      0      else
 416      0          printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]);
 417      0
 418      0      rcu_read_lock(&domlist_read_lock);
 419      0
 420      0      printk("Memory location of each domain:\n");
 421      0      for_each_domain ( d )
 422      0      {
 423      0          process_pending_softirqs();
 424      0
 425      0          printk("Domain %u (total: %u):\n", d->domain_id, d->tot_pages);
 426      0
 427      0          for_each_online_node ( i )
 428      0              page_num_node[i] = 0;
 429      0
 430      0          spin_lock(&d->page_alloc_lock);
 431      0          page_list_for_each(page, &d->page_list)
 432      0          {
 433      0              i = phys_to_nid((paddr_t)page_to_mfn(page) << PAGE_SHIFT);
 434      0              page_num_node[i]++;
 435      0          }
 436      0          spin_unlock(&d->page_alloc_lock);
 437      0
 438      0          for_each_online_node ( i )
 439      0              printk("    Node %u: %u\n", i, page_num_node[i]);
 440      0
 441      0          if ( !read_trylock(&d->vnuma_rwlock) )
 442      0              continue;
 443      0
 444      0          if ( !d->vnuma )
 445      0          {
 446      0              read_unlock(&d->vnuma_rwlock);
 447      0              continue;
 448      0          }
 449      0
 450      0          vnuma = d->vnuma;
 451      0          printk("     %u vnodes, %u vcpus, guest physical layout:\n",
 452      0                 vnuma->nr_vnodes, d->max_vcpus);
 453      0          for ( i = 0; i < vnuma->nr_vnodes; i++ )
 454      0          {
 455      0              unsigned int start_cpu = ~0U;
 456      0
 457      0              err = snprintf(keyhandler_scratch, 12, "%3u",
 458      0                      vnuma->vnode_to_pnode[i]);
 459      0              if ( err < 0 || vnuma->vnode_to_pnode[i] == NUMA_NO_NODE )
 460      0                  strlcpy(keyhandler_scratch, "???", sizeof(keyhandler_scratch));
 461      0
 462      0              printk("       %3u: pnode %s,", i, keyhandler_scratch);
 463      0
 464      0              printk(" vcpus ");
 465      0
 466      0              for ( j = 0; j < d->max_vcpus; j++ )
 467      0              {
 468      0                  if ( !(j & 0x3f) )
 469      0                      process_pending_softirqs();
 470      0
 471      0                  if ( vnuma->vcpu_to_vnode[j] == i )
 472      0                  {
 473      0                      if ( start_cpu == ~0U )
 474      0                      {
 475      0                          printk("%d", j);
 476      0                          start_cpu = j;
 477      0                      }
 478      0                  }
 479      0                  else if ( start_cpu != ~0U )
 480      0                  {
 481      0                      if ( j - 1 != start_cpu )
 482      0                          printk("-%d ", j - 1);
 483      0                      else
 484      0                          printk(" ");
 485      0                      start_cpu = ~0U;
 486      0                  }
 487      0              }
 488      0
 489      0              if ( start_cpu != ~0U  && start_cpu != j - 1 )
 490      0                  printk("-%d", j - 1);
 491      0
 492      0              printk("\n");
 493      0
 494      0              for ( j = 0; j < vnuma->nr_vmemranges; j++ )
 495      0              {
 496      0                  if ( vnuma->vmemrange[j].nid == i )
 497      0                      printk("           %016"PRIx64" - %016"PRIx64"\n",
 498      0                             vnuma->vmemrange[j].start,
 499      0                             vnuma->vmemrange[j].end);
 500      0              }
 501      0          }
 502      0
 503      0          read_unlock(&d->vnuma_rwlock);
 504      0      }
 505      0
 506      0      rcu_read_unlock(&domlist_read_lock);
 507      0  }
 508
 509         static __init int register_numa_trigger(void)
 510      1  {
 511      1      register_keyhandler('u', dump_numa, "dump NUMA info", 1);
 512      1      return 0;
 513      1  }
 514         __initcall(register_numa_trigger);
 515