Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/arch/x86/smpboot.c
Line
Count
Source
1
/*
2
 * x86 SMP booting functions
3
 *
4
 * This inherits a great deal from Linux's SMP boot code:
5
 *  (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
6
 *  (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
7
 *
8
 * This program is free software; you can redistribute it and/or modify
9
 * it under the terms of the GNU General Public License as published by
10
 * the Free Software Foundation; either version 2 of the License, or
11
 * (at your option) any later version.
12
 *
13
 * This program is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
 * GNU General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU General Public License
19
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
20
 */
21
22
#include <xen/init.h>
23
#include <xen/kernel.h>
24
#include <xen/mm.h>
25
#include <xen/domain.h>
26
#include <xen/domain_page.h>
27
#include <xen/sched.h>
28
#include <xen/sched-if.h>
29
#include <xen/irq.h>
30
#include <xen/delay.h>
31
#include <xen/softirq.h>
32
#include <xen/tasklet.h>
33
#include <xen/serial.h>
34
#include <xen/numa.h>
35
#include <xen/cpu.h>
36
#include <asm/current.h>
37
#include <asm/mc146818rtc.h>
38
#include <asm/desc.h>
39
#include <asm/div64.h>
40
#include <asm/flushtlb.h>
41
#include <asm/msr.h>
42
#include <asm/mtrr.h>
43
#include <asm/time.h>
44
#include <asm/tboot.h>
45
#include <mach_apic.h>
46
#include <mach_wakecpu.h>
47
#include <smpboot_hooks.h>
48
49
/* Override macros from asm/page.h to make them work with mfn_t */
50
#undef mfn_to_page
51
11
#define mfn_to_page(mfn) __mfn_to_page(mfn_x(mfn))
52
#undef page_to_mfn
53
13
#define page_to_mfn(pg) _mfn(__page_to_mfn(pg))
54
55
11
#define setup_trampoline()    (bootsym_phys(trampoline_realmode_entry))
56
57
unsigned long __read_mostly trampoline_phys;
58
59
/* representing HT siblings of each logical CPU */
60
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_mask);
61
/* representing HT and core siblings of each logical CPU */
62
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_mask);
63
64
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, scratch_cpumask);
65
static cpumask_t scratch_cpu0mask;
66
67
cpumask_t cpu_online_map __read_mostly;
68
EXPORT_SYMBOL(cpu_online_map);
69
70
unsigned int __read_mostly nr_sockets;
71
cpumask_t **__read_mostly socket_cpumask;
72
static cpumask_t *secondary_socket_cpumask;
73
74
struct cpuinfo_x86 cpu_data[NR_CPUS];
75
76
u32 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
77
  { [0 ... NR_CPUS-1] = BAD_APICID };
78
79
static int cpu_error;
80
static enum cpu_state {
81
    CPU_STATE_DYING,    /* slave -> master: I am dying */
82
    CPU_STATE_DEAD,     /* slave -> master: I am completely dead */
83
    CPU_STATE_INIT,     /* master -> slave: Early bringup phase 1 */
84
    CPU_STATE_CALLOUT,  /* master -> slave: Early bringup phase 2 */
85
    CPU_STATE_CALLIN,   /* slave -> master: Completed phase 2 */
86
    CPU_STATE_ONLINE    /* master -> slave: Go fully online now. */
87
} cpu_state;
88
44
#define set_cpu_state(state) do { mb(); cpu_state = (state); } while (0)
89
90
void *stack_base[NR_CPUS];
91
92
void initialize_cpu_data(unsigned int cpu)
93
13
{
94
13
    cpu_data[cpu] = boot_cpu_data;
95
13
}
96
97
static void smp_store_cpu_info(int id)
98
11
{
99
11
    unsigned int socket;
100
11
101
11
    identify_cpu(&cpu_data[id]);
102
11
103
11
    socket = cpu_to_socket(id);
104
11
    if ( !socket_cpumask[socket] )
105
0
    {
106
0
        socket_cpumask[socket] = secondary_socket_cpumask;
107
0
        secondary_socket_cpumask = NULL;
108
0
    }
109
11
}
110
111
/*
112
 * TSC's upper 32 bits can't be written on earlier CPUs (before
113
 * Prescott), so there is no way to resync one AP against the BP.
114
 */
115
bool disable_tsc_sync;
116
117
static atomic_t tsc_count;
118
static uint64_t tsc_value;
119
static cpumask_t tsc_sync_cpu_mask;
120
121
static void synchronize_tsc_master(unsigned int slave)
122
11
{
123
11
    unsigned int i;
124
11
125
11
    if ( disable_tsc_sync )
126
0
        return;
127
11
128
11
    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) &&
129
11
         !cpumask_test_cpu(slave, &tsc_sync_cpu_mask) )
130
11
        return;
131
11
132
0
    for ( i = 1; i <= 5; i++ )
133
0
    {
134
0
        tsc_value = rdtsc_ordered();
135
0
        wmb();
136
0
        atomic_inc(&tsc_count);
137
0
        while ( atomic_read(&tsc_count) != (i<<1) )
138
0
            cpu_relax();
139
0
    }
140
0
141
0
    atomic_set(&tsc_count, 0);
142
0
    cpumask_clear_cpu(slave, &tsc_sync_cpu_mask);
143
0
}
144
145
static void synchronize_tsc_slave(unsigned int slave)
146
11
{
147
11
    unsigned int i;
148
11
149
11
    if ( disable_tsc_sync )
150
0
        return;
151
11
152
11
    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) &&
153
11
         !cpumask_test_cpu(slave, &tsc_sync_cpu_mask) )
154
11
        return;
155
11
156
0
    for ( i = 1; i <= 5; i++ )
157
0
    {
158
0
        while ( atomic_read(&tsc_count) != ((i<<1)-1) )
159
0
            cpu_relax();
160
0
        rmb();
161
0
        /*
162
0
         * If a CPU has been physically hotplugged, we may as well write
163
0
         * to its TSC in spite of X86_FEATURE_TSC_RELIABLE. The platform does
164
0
         * not sync up a new CPU's TSC for us.
165
0
         */
166
0
        __write_tsc(tsc_value);
167
0
        atomic_inc(&tsc_count);
168
0
    }
169
0
}
170
171
static void smp_callin(void)
172
11
{
173
11
    unsigned int cpu = smp_processor_id();
174
11
    int i, rc;
175
11
176
11
    /* Wait 2s total for startup. */
177
11
    Dprintk("Waiting for CALLOUT.\n");
178
11
    for ( i = 0; cpu_state != CPU_STATE_CALLOUT; i++ )
179
0
    {
180
0
        BUG_ON(i >= 200);
181
0
        cpu_relax();
182
0
        mdelay(10);
183
0
    }
184
11
185
11
    /*
186
11
     * The boot CPU has finished the init stage and is spinning on cpu_state
187
11
     * update until we finish. We are free to set up this CPU: first the APIC.
188
11
     */
189
11
    Dprintk("CALLIN, before setup_local_APIC().\n");
190
11
    x2apic_ap_setup();
191
11
    setup_local_APIC();
192
11
193
11
    /* Save our processor parameters. */
194
11
    smp_store_cpu_info(cpu);
195
11
196
11
    if ( (rc = hvm_cpu_up()) != 0 )
197
0
    {
198
0
        printk("CPU%d: Failed to initialise HVM. Not coming online.\n", cpu);
199
0
        cpu_error = rc;
200
0
        clear_local_APIC();
201
0
        spin_debug_enable();
202
0
        cpu_exit_clear(cpu);
203
0
        (*dead_idle)();
204
0
    }
205
11
206
11
    /* Allow the master to continue. */
207
11
    set_cpu_state(CPU_STATE_CALLIN);
208
11
209
11
    synchronize_tsc_slave(cpu);
210
11
211
11
    /* And wait for our final Ack. */
212
98.5k
    while ( cpu_state != CPU_STATE_ONLINE )
213
98.5k
        cpu_relax();
214
11
}
215
216
static int booting_cpu;
217
218
/* CPUs for which sibling maps can be computed. */
219
static cpumask_t cpu_sibling_setup_map;
220
221
static void link_thread_siblings(int cpu1, int cpu2)
222
18
{
223
18
    cpumask_set_cpu(cpu1, per_cpu(cpu_sibling_mask, cpu2));
224
18
    cpumask_set_cpu(cpu2, per_cpu(cpu_sibling_mask, cpu1));
225
18
    cpumask_set_cpu(cpu1, per_cpu(cpu_core_mask, cpu2));
226
18
    cpumask_set_cpu(cpu2, per_cpu(cpu_core_mask, cpu1));
227
18
}
228
229
static void set_cpu_sibling_map(int cpu)
230
12
{
231
12
    int i;
232
12
    struct cpuinfo_x86 *c = cpu_data;
233
12
234
12
    cpumask_set_cpu(cpu, &cpu_sibling_setup_map);
235
12
236
12
    cpumask_set_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]);
237
12
238
12
    if ( c[cpu].x86_num_siblings > 1 )
239
12
    {
240
12
        for_each_cpu ( i, &cpu_sibling_setup_map )
241
78
        {
242
78
            if ( cpu_has(c, X86_FEATURE_TOPOEXT) ) {
243
0
                if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) &&
244
0
                     (c[cpu].compute_unit_id == c[i].compute_unit_id) )
245
0
                    link_thread_siblings(cpu, i);
246
78
            } else if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) &&
247
78
                        (c[cpu].cpu_core_id == c[i].cpu_core_id) ) {
248
18
                link_thread_siblings(cpu, i);
249
18
            }
250
78
        }
251
12
    }
252
12
    else
253
0
    {
254
0
        cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu));
255
0
    }
256
12
257
12
    if ( c[cpu].x86_max_cores == 1 )
258
0
    {
259
0
        cpumask_copy(per_cpu(cpu_core_mask, cpu),
260
0
                     per_cpu(cpu_sibling_mask, cpu));
261
0
        c[cpu].booted_cores = 1;
262
0
        return;
263
0
    }
264
12
265
12
    for_each_cpu ( i, &cpu_sibling_setup_map )
266
78
    {
267
78
        if ( c[cpu].phys_proc_id == c[i].phys_proc_id )
268
78
        {
269
78
            cpumask_set_cpu(i, per_cpu(cpu_core_mask, cpu));
270
78
            cpumask_set_cpu(cpu, per_cpu(cpu_core_mask, i));
271
78
            /*
272
78
             *  Does this new cpu bring up a new core?
273
78
             */
274
78
            if ( cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) == 1 )
275
36
            {
276
36
                /*
277
36
                 * for each core in package, increment
278
36
                 * the booted_cores for this new cpu
279
36
                 */
280
36
                if ( cpumask_first(per_cpu(cpu_sibling_mask, i)) == i )
281
21
                    c[cpu].booted_cores++;
282
36
                /*
283
36
                 * increment the core count for all
284
36
                 * the other cpus in this package
285
36
                 */
286
36
                if ( i != cpu )
287
30
                    c[i].booted_cores++;
288
36
            }
289
42
            else if ( (i != cpu) && !c[cpu].booted_cores )
290
6
            {
291
6
                c[cpu].booted_cores = c[i].booted_cores;
292
6
            }
293
78
        }
294
78
    }
295
12
}
296
297
void start_secondary(void *unused)
298
11
{
299
11
    /*
300
11
     * Don't put anything before smp_callin(); SMP booting is so fragile that we
301
11
     * want to limit the things done here to the most necessary things.
302
11
     */
303
11
    unsigned int cpu = booting_cpu;
304
11
305
11
    /* Critical region without IDT or TSS.  Any fault is deadly! */
306
11
307
11
    set_processor_id(cpu);
308
11
    set_current(idle_vcpu[cpu]);
309
11
    this_cpu(curr_vcpu) = idle_vcpu[cpu];
310
11
    rdmsrl(MSR_EFER, this_cpu(efer));
311
11
312
11
    /*
313
11
     * Just as during early bootstrap, it is convenient here to disable
314
11
     * spinlock checking while we have IRQs disabled. This allows us to
315
11
     * acquire IRQ-unsafe locks when it would otherwise be disallowed.
316
11
     *
317
11
     * It is safe because the race we are usually trying to avoid involves
318
11
     * a group of CPUs rendezvousing in an IPI handler, where one cannot
319
11
     * join because it is spinning with IRQs disabled waiting to acquire a
320
11
     * lock held by another in the rendezvous group (the lock must be an
321
11
     * IRQ-unsafe lock since the CPU took the IPI after acquiring it, and
322
11
     * hence had IRQs enabled). This is a deadlock scenario.
323
11
     *
324
11
     * However, no CPU can be involved in rendezvous until it is online,
325
11
     * hence no such group can be waiting for this CPU until it is
326
11
     * visible in cpu_online_map. Hence such a deadlock is not possible.
327
11
     */
328
11
    spin_debug_disable();
329
11
330
11
    load_system_tables();
331
11
332
11
    /* Full exception support from here on in. */
333
11
334
11
    /* Safe to enable feature such as CR4.MCE with the IDT set up now. */
335
11
    write_cr4(mmu_cr4_features);
336
11
337
11
    percpu_traps_init();
338
11
339
11
    cpu_init();
340
11
341
11
    initialize_cpu_data(cpu);
342
11
343
11
    if ( system_state <= SYS_STATE_smp_boot )
344
11
        early_microcode_update_cpu(false);
345
11
    else
346
0
        microcode_resume_cpu(cpu);
347
11
348
11
    smp_callin();
349
11
350
11
    init_percpu_time();
351
11
352
11
    setup_secondary_APIC_clock();
353
11
354
11
    /*
355
11
     * Low-memory mappings have been cleared, so flush them from
356
11
     * the local TLBs too.
357
11
     */
358
11
    flush_tlb_local();
359
11
360
11
    /* This must be done before setting cpu_online_map */
361
11
    spin_debug_enable();
362
11
    set_cpu_sibling_map(cpu);
363
11
    notify_cpu_starting(cpu);
364
11
    wmb();
365
11
366
11
    /*
367
11
     * We need to hold vector_lock so that the set of online cpus
368
11
     * does not change while we are assigning vectors to cpus.  Holding
369
11
     * this lock ensures we don't half assign or remove an irq from a cpu.
370
11
     */
371
11
    lock_vector_lock();
372
11
    setup_vector_irq(cpu);
373
11
    cpumask_set_cpu(cpu, &cpu_online_map);
374
11
    unlock_vector_lock();
375
11
376
11
    /* We can take interrupts now: we're officially "up". */
377
11
    local_irq_enable();
378
11
    mtrr_ap_init();
379
11
380
11
    wmb();
381
11
    startup_cpu_idle_loop();
382
11
}
383
384
extern void *stack_start;
385
386
static int wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
387
11
{
388
11
    unsigned long send_status = 0, accept_status = 0;
389
11
    int maxlvt, timeout, i;
390
11
391
11
    /*
392
11
     * Be paranoid about clearing APIC errors.
393
11
     */
394
11
    apic_write(APIC_ESR, 0);
395
11
    apic_read(APIC_ESR);
396
11
397
11
    Dprintk("Asserting INIT.\n");
398
11
399
11
    /*
400
11
     * Turn INIT on target chip via IPI
401
11
     */
402
11
    apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
403
11
                   phys_apicid);
404
11
405
11
    if ( !x2apic_enabled )
406
0
    {
407
0
        Dprintk("Waiting for send to finish...\n");
408
0
        timeout = 0;
409
0
        do {
410
0
            Dprintk("+");
411
0
            udelay(100);
412
0
            send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
413
0
        } while ( send_status && (timeout++ < 1000) );
414
0
415
0
        mdelay(10);
416
0
417
0
        Dprintk("Deasserting INIT.\n");
418
0
419
0
        apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
420
0
421
0
        Dprintk("Waiting for send to finish...\n");
422
0
        timeout = 0;
423
0
        do {
424
0
            Dprintk("+");
425
0
            udelay(100);
426
0
            send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
427
0
        } while ( send_status && (timeout++ < 1000) );
428
0
    }
429
11
    else if ( tboot_in_measured_env() )
430
0
    {
431
0
        /*
432
0
         * With tboot the AP is actually spinning in a mini-guest before
433
0
         * receiving INIT. Upon receiving the INIT IPI, the AP needs time to
434
0
         * VMExit, update the VMCS to track SIPIs, and VMResume.
435
0
         *
436
0
         * While the AP is in root mode handling the INIT, the CPU will drop
437
0
         * any SIPIs.
438
0
         */
439
0
        udelay(10);
440
0
    }
441
11
442
11
    maxlvt = get_maxlvt();
443
11
444
33
    for ( i = 0; i < 2; i++ )
445
22
    {
446
22
        Dprintk("Sending STARTUP #%d.\n", i+1);
447
22
        apic_write(APIC_ESR, 0);
448
22
        apic_read(APIC_ESR);
449
22
        Dprintk("After apic_write.\n");
450
22
451
22
        /*
452
22
         * STARTUP IPI
453
22
         * Boot on the stack
454
22
         */
455
22
        apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), phys_apicid);
456
22
457
22
        if ( !x2apic_enabled )
458
0
        {
459
0
            /* Give the other CPU some time to accept the IPI. */
460
0
            udelay(300);
461
0
462
0
            Dprintk("Startup point 1.\n");
463
0
464
0
            Dprintk("Waiting for send to finish...\n");
465
0
            timeout = 0;
466
0
            do {
467
0
                Dprintk("+");
468
0
                udelay(100);
469
0
                send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
470
0
            } while ( send_status && (timeout++ < 1000) );
471
0
472
0
            /* Give the other CPU some time to accept the IPI. */
473
0
            udelay(200);
474
0
        }
475
22
476
22
        /* Due to the Pentium erratum 3AP. */
477
22
        if ( maxlvt > 3 )
478
22
        {
479
22
            apic_write(APIC_ESR, 0);
480
22
        }
481
22
        accept_status = (apic_read(APIC_ESR) & 0xEF);
482
22
        if ( send_status || accept_status )
483
0
            break;
484
22
    }
485
11
    Dprintk("After Startup.\n");
486
11
487
11
    if ( send_status )
488
0
        printk("APIC never delivered???\n");
489
11
    if ( accept_status )
490
0
        printk("APIC delivery error (%lx).\n", accept_status);
491
11
492
11
    return (send_status | accept_status);
493
11
}
494
495
int alloc_cpu_id(void)
496
11
{
497
11
    cpumask_t tmp_map;
498
11
    int cpu;
499
11
500
11
    cpumask_complement(&tmp_map, &cpu_present_map);
501
11
    cpu = cpumask_first(&tmp_map);
502
11
    return (cpu < nr_cpu_ids) ? cpu : -ENODEV;
503
11
}
504
505
static int do_boot_cpu(int apicid, int cpu)
506
11
{
507
11
    int timeout, boot_error = 0, rc = 0;
508
11
    unsigned long start_eip;
509
11
510
11
    /*
511
11
     * Save current MTRR state in case it was changed since early boot
512
11
     * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
513
11
     */
514
11
    mtrr_save_state();
515
11
516
11
    booting_cpu = cpu;
517
11
518
11
    /* start_eip had better be page-aligned! */
519
11
    start_eip = setup_trampoline();
520
11
521
11
    /* So we see what's up   */
522
11
    if ( opt_cpu_info )
523
0
        printk("Booting processor %d/%d eip %lx\n",
524
0
               cpu, apicid, start_eip);
525
11
526
11
    stack_start = stack_base[cpu];
527
11
528
11
    /* This grunge runs the startup process for the targeted processor. */
529
11
530
11
    set_cpu_state(CPU_STATE_INIT);
531
11
532
11
    Dprintk("Setting warm reset code and vector.\n");
533
11
534
11
    smpboot_setup_warm_reset_vector(start_eip);
535
11
536
11
    /* Starting actual IPI sequence... */
537
11
    if ( !tboot_in_measured_env() || tboot_wake_ap(apicid, start_eip) )
538
11
        boot_error = wakeup_secondary_cpu(apicid, start_eip);
539
11
540
11
    if ( !boot_error )
541
11
    {
542
11
        /* Allow AP to start initializing. */
543
11
        set_cpu_state(CPU_STATE_CALLOUT);
544
11
        Dprintk("After Callout %d.\n", cpu);
545
11
546
11
        /* Wait 5s total for a response. */
547
22
        for ( timeout = 0; timeout < 50000; timeout++ )
548
22
        {
549
22
            if ( cpu_state != CPU_STATE_CALLOUT )
550
11
                break;
551
11
            udelay(100);
552
11
        }
553
11
554
11
        if ( cpu_state == CPU_STATE_CALLIN )
555
11
        {
556
11
            /* number CPUs logically, starting from 1 (BSP is 0) */
557
11
            Dprintk("OK.\n");
558
11
            print_cpu_info(cpu);
559
11
            synchronize_tsc_master(cpu);
560
11
            Dprintk("CPU has booted.\n");
561
11
        }
562
0
        else if ( cpu_state == CPU_STATE_DEAD )
563
0
        {
564
0
            rmb();
565
0
            rc = cpu_error;
566
0
        }
567
0
        else
568
0
        {
569
0
            boot_error = 1;
570
0
            mb();
571
0
            if ( bootsym(trampoline_cpu_started) == 0xA5 )
572
0
                /* trampoline started but...? */
573
0
                printk("Stuck ??\n");
574
0
            else
575
0
                /* trampoline code not run */
576
0
                printk("Not responding.\n");
577
0
        }
578
11
    }
579
11
580
11
    if ( boot_error )
581
0
    {
582
0
        cpu_exit_clear(cpu);
583
0
        rc = -EIO;
584
0
    }
585
11
586
11
    /* mark "stuck" area as not stuck */
587
11
    bootsym(trampoline_cpu_started) = 0;
588
11
    mb();
589
11
590
11
    smpboot_restore_warm_reset_vector();
591
11
592
11
    return rc;
593
11
}
594
595
11
#define STUB_BUF_CPU_OFFS(cpu) (((cpu) & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)
596
597
unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
598
12
{
599
12
    unsigned long stub_va;
600
12
    struct page_info *pg;
601
12
602
12
    BUILD_BUG_ON(STUBS_PER_PAGE & (STUBS_PER_PAGE - 1));
603
12
604
12
    if ( *mfn )
605
11
        pg = mfn_to_page(_mfn(*mfn));
606
12
    else
607
1
    {
608
1
        nodeid_t node = cpu_to_node(cpu);
609
1
        unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
610
1
611
1
        pg = alloc_domheap_page(NULL, memflags);
612
1
        if ( !pg )
613
0
            return 0;
614
1
615
1
        unmap_domain_page(memset(__map_domain_page(pg), 0xcc, PAGE_SIZE));
616
1
    }
617
12
618
12
    stub_va = XEN_VIRT_END - (cpu + 1) * PAGE_SIZE;
619
12
    if ( map_pages_to_xen(stub_va, mfn_x(page_to_mfn(pg)), 1,
620
12
                          PAGE_HYPERVISOR_RX | MAP_SMALL_PAGES) )
621
0
    {
622
0
        if ( !*mfn )
623
0
            free_domheap_page(pg);
624
0
        stub_va = 0;
625
0
    }
626
12
    else if ( !*mfn )
627
1
        *mfn = mfn_x(page_to_mfn(pg));
628
12
629
12
    return stub_va;
630
12
}
631
632
void cpu_exit_clear(unsigned int cpu)
633
0
{
634
0
    cpu_uninit(cpu);
635
0
    set_cpu_state(CPU_STATE_DEAD);
636
0
}
637
638
static void cpu_smpboot_free(unsigned int cpu)
639
0
{
640
0
    unsigned int order, socket = cpu_to_socket(cpu);
641
0
    struct cpuinfo_x86 *c = cpu_data;
642
0
643
0
    if ( cpumask_empty(socket_cpumask[socket]) )
644
0
    {
645
0
        xfree(socket_cpumask[socket]);
646
0
        socket_cpumask[socket] = NULL;
647
0
    }
648
0
649
0
    c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
650
0
    c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
651
0
    c[cpu].compute_unit_id = INVALID_CUID;
652
0
    cpumask_clear_cpu(cpu, &cpu_sibling_setup_map);
653
0
654
0
    free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
655
0
    free_cpumask_var(per_cpu(cpu_core_mask, cpu));
656
0
    if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
657
0
        free_cpumask_var(per_cpu(scratch_cpumask, cpu));
658
0
659
0
    if ( per_cpu(stubs.addr, cpu) )
660
0
    {
661
0
        mfn_t mfn = _mfn(per_cpu(stubs.mfn, cpu));
662
0
        unsigned char *stub_page = map_domain_page(mfn);
663
0
        unsigned int i;
664
0
665
0
        memset(stub_page + STUB_BUF_CPU_OFFS(cpu), 0xcc, STUB_BUF_SIZE);
666
0
        for ( i = 0; i < STUBS_PER_PAGE; ++i )
667
0
            if ( stub_page[i * STUB_BUF_SIZE] != 0xcc )
668
0
                break;
669
0
        unmap_domain_page(stub_page);
670
0
        destroy_xen_mappings(per_cpu(stubs.addr, cpu) & PAGE_MASK,
671
0
                             (per_cpu(stubs.addr, cpu) | ~PAGE_MASK) + 1);
672
0
        if ( i == STUBS_PER_PAGE )
673
0
            free_domheap_page(mfn_to_page(mfn));
674
0
    }
675
0
676
0
    order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
677
0
    free_xenheap_pages(per_cpu(gdt_table, cpu), order);
678
0
679
0
    free_xenheap_pages(per_cpu(compat_gdt_table, cpu), order);
680
0
681
0
    order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
682
0
    free_xenheap_pages(idt_tables[cpu], order);
683
0
    idt_tables[cpu] = NULL;
684
0
685
0
    if ( stack_base[cpu] != NULL )
686
0
    {
687
0
        memguard_unguard_stack(stack_base[cpu]);
688
0
        free_xenheap_pages(stack_base[cpu], STACK_ORDER);
689
0
        stack_base[cpu] = NULL;
690
0
    }
691
0
}
692
693
static int cpu_smpboot_alloc(unsigned int cpu)
694
11
{
695
11
    unsigned int i, order, memflags = 0;
696
11
    nodeid_t node = cpu_to_node(cpu);
697
11
    struct desc_struct *gdt;
698
11
    unsigned long stub_page;
699
11
700
11
    if ( node != NUMA_NO_NODE )
701
11
        memflags = MEMF_node(node);
702
11
703
11
    stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
704
11
    if ( stack_base[cpu] == NULL )
705
0
        goto oom;
706
11
    memguard_guard_stack(stack_base[cpu]);
707
11
708
11
    order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
709
11
    per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags);
710
11
    if ( gdt == NULL )
711
0
        goto oom;
712
11
    memcpy(gdt, boot_cpu_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE);
713
11
    BUILD_BUG_ON(NR_CPUS > 0x10000);
714
11
    gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
715
11
716
11
    per_cpu(compat_gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags);
717
11
    if ( gdt == NULL )
718
0
        goto oom;
719
11
    memcpy(gdt, boot_cpu_compat_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE);
720
11
    gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
721
11
722
11
    order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
723
11
    idt_tables[cpu] = alloc_xenheap_pages(order, memflags);
724
11
    if ( idt_tables[cpu] == NULL )
725
0
        goto oom;
726
11
    memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
727
11
    set_ist(&idt_tables[cpu][TRAP_double_fault],  IST_NONE);
728
11
    set_ist(&idt_tables[cpu][TRAP_nmi],           IST_NONE);
729
11
    set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
730
11
731
11
    for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
732
11
          i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
733
11
        if ( cpu_online(i) && cpu_to_node(i) == node )
734
11
        {
735
11
            per_cpu(stubs.mfn, cpu) = per_cpu(stubs.mfn, i);
736
11
            break;
737
11
        }
738
11
    BUG_ON(i == cpu);
739
11
    stub_page = alloc_stub_page(cpu, &per_cpu(stubs.mfn, cpu));
740
11
    if ( !stub_page )
741
0
        goto oom;
742
11
    per_cpu(stubs.addr, cpu) = stub_page + STUB_BUF_CPU_OFFS(cpu);
743
11
744
11
    if ( secondary_socket_cpumask == NULL &&
745
1
         (secondary_socket_cpumask = xzalloc(cpumask_t)) == NULL )
746
0
        goto oom;
747
11
748
11
    if ( zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
749
11
         zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
750
11
         alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu)) )
751
11
        return 0;
752
11
753
0
 oom:
754
0
    cpu_smpboot_free(cpu);
755
0
    return -ENOMEM;
756
11
}
757
758
static int cpu_smpboot_callback(
759
    struct notifier_block *nfb, unsigned long action, void *hcpu)
760
33
{
761
33
    unsigned int cpu = (unsigned long)hcpu;
762
33
    int rc = 0;
763
33
764
33
    switch ( action )
765
33
    {
766
11
    case CPU_UP_PREPARE:
767
11
        rc = cpu_smpboot_alloc(cpu);
768
11
        break;
769
0
    case CPU_UP_CANCELED:
770
0
    case CPU_DEAD:
771
0
        cpu_smpboot_free(cpu);
772
0
        break;
773
22
    default:
774
22
        break;
775
33
    }
776
33
777
33
    return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
778
33
}
779
780
static struct notifier_block cpu_smpboot_nfb = {
781
    .notifier_call = cpu_smpboot_callback
782
};
783
784
void __init smp_prepare_cpus(unsigned int max_cpus)
785
1
{
786
1
    register_cpu_notifier(&cpu_smpboot_nfb);
787
1
788
1
    mtrr_aps_sync_begin();
789
1
790
1
    /* Setup boot CPU information */
791
1
    initialize_cpu_data(0); /* Final full version of the data */
792
1
    print_cpu_info(0);
793
1
794
1
    boot_cpu_physical_apicid = get_apic_id();
795
1
    x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
796
1
797
1
    stack_base[0] = stack_start;
798
1
799
1
    set_nr_sockets();
800
1
801
1
    socket_cpumask = xzalloc_array(cpumask_t *, nr_sockets);
802
1
    if ( socket_cpumask == NULL ||
803
1
         (socket_cpumask[cpu_to_socket(0)] = xzalloc(cpumask_t)) == NULL )
804
0
        panic("No memory for socket CPU siblings map");
805
1
806
1
    if ( !zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, 0)) ||
807
1
         !zalloc_cpumask_var(&per_cpu(cpu_core_mask, 0)) )
808
0
        panic("No memory for boot CPU sibling/core maps");
809
1
810
1
    set_cpu_sibling_map(0);
811
1
812
1
    /*
813
1
     * If we couldn't find an SMP configuration at boot time,
814
1
     * get out of here now!
815
1
     */
816
1
    if ( !smp_found_config && !acpi_lapic )
817
0
    {
818
0
        printk(KERN_NOTICE "SMP motherboard not detected.\n");
819
0
    init_uniprocessor:
820
0
        physids_clear(phys_cpu_present_map);
821
0
        physid_set(0, phys_cpu_present_map);
822
0
        if (APIC_init_uniprocessor())
823
0
            printk(KERN_NOTICE "Local APIC not detected."
824
0
                   " Using dummy APIC emulation.\n");
825
0
        return;
826
0
    }
827
1
828
1
    /*
829
1
     * Should not be necessary because the MP table should list the boot
830
1
     * CPU too, but we do it for the sake of robustness anyway.
831
1
     * It makes no sense to do this check in clustered apic mode, so skip it.
832
1
     */
833
1
    if ( !check_apicid_present(boot_cpu_physical_apicid) )
834
0
    {
835
0
        printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
836
0
               boot_cpu_physical_apicid);
837
0
        physid_set(get_apic_id(), phys_cpu_present_map);
838
0
    }
839
1
840
1
    /* If we couldn't find a local APIC, then get out of here now! */
841
1
    if ( !cpu_has_apic )
842
0
    {
843
0
        printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
844
0
               boot_cpu_physical_apicid);
845
0
        goto init_uniprocessor;
846
0
    }
847
1
848
1
    verify_local_APIC();
849
1
850
1
    connect_bsp_APIC();
851
1
    setup_local_APIC();
852
1
853
1
    smpboot_setup_io_apic();
854
1
855
1
    setup_boot_APIC_clock();
856
1
}
857
858
void __init smp_prepare_boot_cpu(void)
859
1
{
860
1
    unsigned int cpu = smp_processor_id();
861
1
862
1
    cpumask_set_cpu(cpu, &cpu_online_map);
863
1
    cpumask_set_cpu(cpu, &cpu_present_map);
864
1
#if NR_CPUS > 2 * BITS_PER_LONG
865
1
    per_cpu(scratch_cpumask, cpu) = &scratch_cpu0mask;
866
1
#endif
867
1
}
868
869
static void
870
remove_siblinginfo(int cpu)
871
0
{
872
0
    int sibling;
873
0
874
0
    cpumask_clear_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]);
875
0
876
0
    for_each_cpu ( sibling, per_cpu(cpu_core_mask, cpu) )
877
0
    {
878
0
        cpumask_clear_cpu(cpu, per_cpu(cpu_core_mask, sibling));
879
0
        /* Last thread sibling in this cpu core going down. */
880
0
        if ( cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) == 1 )
881
0
            cpu_data[sibling].booted_cores--;
882
0
    }
883
0
884
0
    for_each_cpu(sibling, per_cpu(cpu_sibling_mask, cpu))
885
0
        cpumask_clear_cpu(cpu, per_cpu(cpu_sibling_mask, sibling));
886
0
    cpumask_clear(per_cpu(cpu_sibling_mask, cpu));
887
0
    cpumask_clear(per_cpu(cpu_core_mask, cpu));
888
0
}
889
890
void __cpu_disable(void)
891
0
{
892
0
    int cpu = smp_processor_id();
893
0
894
0
    set_cpu_state(CPU_STATE_DYING);
895
0
896
0
    local_irq_disable();
897
0
    clear_local_APIC();
898
0
    /* Allow any queued timer interrupts to get serviced */
899
0
    local_irq_enable();
900
0
    mdelay(1);
901
0
    local_irq_disable();
902
0
903
0
    time_suspend();
904
0
905
0
    remove_siblinginfo(cpu);
906
0
907
0
    /* It's now safe to remove this processor from the online map */
908
0
    cpumask_clear_cpu(cpu, &cpu_online_map);
909
0
    fixup_irqs(&cpu_online_map, 1);
910
0
    fixup_eoi();
911
0
912
0
    if ( cpu_disable_scheduler(cpu) )
913
0
        BUG();
914
0
}
915
916
void __cpu_die(unsigned int cpu)
917
0
{
918
0
    /* We don't do anything here: idle task is faking death itself. */
919
0
    unsigned int i = 0;
920
0
    enum cpu_state seen_state;
921
0
922
0
    while ( (seen_state = cpu_state) != CPU_STATE_DEAD )
923
0
    {
924
0
        BUG_ON(seen_state != CPU_STATE_DYING);
925
0
        mdelay(100);
926
0
        cpu_relax();
927
0
        process_pending_softirqs();
928
0
        if ( (++i % 10) == 0 )
929
0
            printk(KERN_ERR "CPU %u still not dead...\n", cpu);
930
0
    }
931
0
}
932
933
int cpu_add(uint32_t apic_id, uint32_t acpi_id, uint32_t pxm)
934
0
{
935
0
    int cpu = -1;
936
0
937
0
    dprintk(XENLOG_DEBUG, "cpu_add apic_id %x acpi_id %x pxm %x\n",
938
0
            apic_id, acpi_id, pxm);
939
0
940
0
    if ( (acpi_id >= MAX_MADT_ENTRIES) ||
941
0
         (apic_id >= MAX_APICS) ||
942
0
         (pxm >= 256) )
943
0
        return -EINVAL;
944
0
945
0
    if ( !cpu_hotplug_begin() )
946
0
        return -EBUSY;
947
0
948
0
    /* Detect if the cpu has been added before */
949
0
    if ( x86_acpiid_to_apicid[acpi_id] != BAD_APICID )
950
0
    {
951
0
        cpu = (x86_acpiid_to_apicid[acpi_id] != apic_id)
952
0
            ? -EINVAL : -EEXIST;
953
0
        goto out;
954
0
    }
955
0
956
0
    if ( physid_isset(apic_id, phys_cpu_present_map) )
957
0
    {
958
0
        cpu = -EEXIST;
959
0
        goto out;
960
0
    }
961
0
962
0
    if ( (cpu = mp_register_lapic(apic_id, 1, 1)) < 0 )
963
0
        goto out;
964
0
965
0
    x86_acpiid_to_apicid[acpi_id] = apic_id;
966
0
967
0
    if ( !srat_disabled() )
968
0
    {
969
0
        nodeid_t node = setup_node(pxm);
970
0
971
0
        if ( node == NUMA_NO_NODE )
972
0
        {
973
0
            dprintk(XENLOG_WARNING,
974
0
                    "Setup node failed for pxm %x\n", pxm);
975
0
            x86_acpiid_to_apicid[acpi_id] = BAD_APICID;
976
0
            mp_unregister_lapic(apic_id, cpu);
977
0
            cpu = node;
978
0
            goto out;
979
0
        }
980
0
        if ( apic_id < MAX_LOCAL_APIC )
981
0
             apicid_to_node[apic_id] = node;
982
0
    }
983
0
984
0
    /* Physically added CPUs do not have synchronised TSC. */
985
0
    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
986
0
    {
987
0
        static bool once_only;
988
0
989
0
        if ( !test_and_set_bool(once_only) )
990
0
            printk(XENLOG_WARNING
991
0
                   " ** New physical CPU %u may have skewed TSC and hence "
992
0
                   "break assumed cross-CPU TSC coherency.\n"
993
0
                   " ** Consider using boot parameter \"tsc=skewed\" "
994
0
                   "which forces TSC emulation where appropriate.\n", cpu);
995
0
        cpumask_set_cpu(cpu, &tsc_sync_cpu_mask);
996
0
    }
997
0
998
0
    srat_detect_node(cpu);
999
0
    numa_add_cpu(cpu);
1000
0
    dprintk(XENLOG_INFO, "Add CPU %x with index %x\n", apic_id, cpu);
1001
0
 out:
1002
0
    cpu_hotplug_done();
1003
0
    return cpu;
1004
0
}
1005
1006
1007
int __cpu_up(unsigned int cpu)
1008
11
{
1009
11
    int apicid, ret;
1010
11
1011
11
    if ( (apicid = x86_cpu_to_apicid[cpu]) == BAD_APICID )
1012
0
        return -ENODEV;
1013
11
1014
11
    if ( (ret = do_boot_cpu(apicid, cpu)) != 0 )
1015
0
        return ret;
1016
11
1017
11
    time_latch_stamps();
1018
11
1019
11
    set_cpu_state(CPU_STATE_ONLINE);
1020
5.51k
    while ( !cpu_online(cpu) )
1021
5.50k
    {
1022
5.50k
        cpu_relax();
1023
5.50k
        process_pending_softirqs();
1024
5.50k
    }
1025
11
1026
11
    return 0;
1027
11
}
1028
1029
1030
void __init smp_cpus_done(void)
1031
1
{
1032
1
    if ( nmi_watchdog == NMI_LOCAL_APIC )
1033
0
        check_nmi_watchdog();
1034
1
1035
1
    setup_ioapic_dest();
1036
1
1037
1
    mtrr_save_state();
1038
1
    mtrr_aps_sync_end();
1039
1
}
1040
1041
void __init smp_intr_init(void)
1042
1
{
1043
1
    int irq, vector, seridx, cpu = smp_processor_id();
1044
1
1045
1
    /*
1046
1
     * IRQ0 must be given a fixed assignment and initialized,
1047
1
     * because it's used before the IO-APIC is set up.
1048
1
     */
1049
1
    irq_to_desc(0)->arch.vector = IRQ0_VECTOR;
1050
1
1051
1
    /*
1052
1
     * Also ensure serial interrupts are high priority. We do not
1053
1
     * want them to be blocked by unacknowledged guest-bound interrupts.
1054
1
     */
1055
5
    for ( seridx = 0; seridx <= SERHND_IDX; seridx++ )
1056
4
    {
1057
4
        if ( (irq = serial_irq(seridx)) < 0 )
1058
3
            continue;
1059
1
        vector = alloc_hipriority_vector();
1060
1
        per_cpu(vector_irq, cpu)[vector] = irq;
1061
1
        irq_to_desc(irq)->arch.vector = vector;
1062
1
        cpumask_copy(irq_to_desc(irq)->arch.cpu_mask, &cpu_online_map);
1063
1
    }
1064
1
1065
1
    /* Direct IPI vectors. */
1066
1
    set_direct_apic_vector(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
1067
1
    set_direct_apic_vector(EVENT_CHECK_VECTOR, event_check_interrupt);
1068
1
    set_direct_apic_vector(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
1069
1
    set_direct_apic_vector(CALL_FUNCTION_VECTOR, call_function_interrupt);
1070
1
}