Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/common/schedule.c
Line
Count
Source
1
/****************************************************************************
2
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
3
 * (C) 2002-2003 University of Cambridge
4
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
5
 ****************************************************************************
6
 *
7
 *        File: common/schedule.c
8
 *      Author: Rolf Neugebauer & Keir Fraser
9
 *              Updated for generic API by Mark Williamson
10
 * 
11
 * Description: Generic CPU scheduling code
12
 *              implements support functionality for the Xen scheduler API.
13
 *
14
 */
15
16
#ifndef COMPAT
17
#include <xen/init.h>
18
#include <xen/lib.h>
19
#include <xen/sched.h>
20
#include <xen/domain.h>
21
#include <xen/delay.h>
22
#include <xen/event.h>
23
#include <xen/time.h>
24
#include <xen/timer.h>
25
#include <xen/perfc.h>
26
#include <xen/sched-if.h>
27
#include <xen/softirq.h>
28
#include <xen/trace.h>
29
#include <xen/mm.h>
30
#include <xen/err.h>
31
#include <xen/guest_access.h>
32
#include <xen/hypercall.h>
33
#include <xen/multicall.h>
34
#include <xen/cpu.h>
35
#include <xen/preempt.h>
36
#include <xen/event.h>
37
#include <public/sched.h>
38
#include <xsm/xsm.h>
39
#include <xen/err.h>
40
41
/* opt_sched: scheduler - default to configured value */
42
static char __initdata opt_sched[10] = CONFIG_SCHED_DEFAULT;
43
string_param("sched", opt_sched);
44
45
/* if sched_smt_power_savings is set,
46
 * the scheduler will give preference to a partially idle package over a
47
 * fully idle package when picking a pCPU on which to schedule a vCPU.
48
 */
49
bool_t sched_smt_power_savings = 0;
50
boolean_param("sched_smt_power_savings", sched_smt_power_savings);
51
52
/* Default scheduling rate limit: 1ms.
53
 * The behavior is undefined if sched_ratelimit_us is greater than sched_credit_tslice_ms.
54
 */
55
int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
56
integer_param("sched_ratelimit_us", sched_ratelimit_us);
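The three knobs registered above are Xen command-line options. A hypothetical boot line exercising them (the scheduler name and the values are illustrative only):

    sched=credit2 sched_smt_power_savings=1 sched_ratelimit_us=1000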
57
/* Various timer handlers. */
58
static void s_timer_fn(void *unused);
59
static void vcpu_periodic_timer_fn(void *data);
60
static void vcpu_singleshot_timer_fn(void *data);
61
static void poll_timer_fn(void *data);
62
63
/* This is global for now so that private implementations can reach it */
64
DEFINE_PER_CPU(struct schedule_data, schedule_data);
65
DEFINE_PER_CPU(struct scheduler *, scheduler);
66
67
/* Scratch space for cpumasks. */
68
DEFINE_PER_CPU(cpumask_t, cpumask_scratch);
69
70
extern const struct scheduler *__start_schedulers_array[], *__end_schedulers_array[];
71
6
#define NUM_SCHEDULERS (__end_schedulers_array - __start_schedulers_array)
72
6
#define schedulers __start_schedulers_array
73
74
static struct scheduler __read_mostly ops;
75
76
#define SCHED_OP(opsptr, fn, ...)                                          \
77
8.70M
         (( (opsptr)->fn != NULL ) ? (opsptr)->fn(opsptr, ##__VA_ARGS__ )  \
78
18.4E
          : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0 )
79
80
static inline struct scheduler *dom_scheduler(const struct domain *d)
81
114
{
82
114
    if ( likely(d->cpupool != NULL) )
83
75
        return d->cpupool->sched;
84
114
85
114
    /*
86
114
     * If d->cpupool is NULL, this is the idle domain. This is special
87
114
     * because the idle domain does not really belong to any cpupool, and,
88
114
     * hence, does not really have a scheduler.
89
114
     *
90
114
     * This is (should be!) only called like this for allocating the idle
91
114
     * vCPUs for the first time, during boot, in which case what we want
92
114
     * is the default scheduler that has been chosen at boot.
93
114
     */
94
39
    ASSERT(is_idle_domain(d));
95
39
    return &ops;
96
114
}
97
98
static inline struct scheduler *vcpu_scheduler(const struct vcpu *v)
99
14.1M
{
100
14.1M
    struct domain *d = v->domain;
101
14.1M
102
14.1M
    if ( likely(d->cpupool != NULL) )
103
14.2M
        return d->cpupool->sched;
104
14.1M
105
14.1M
    /*
106
14.1M
     * If d->cpupool is NULL, this is a vCPU of the idle domain. And this
107
14.1M
     * case is special because the idle domain does not really belong to
108
14.1M
     * a cpupool (and, hence, doesn't really have a scheduler). In fact, its
109
14.1M
     * vCPUs (may) run on pCPUs which are in different pools, with different
110
14.1M
     * schedulers.
111
14.1M
     *
112
14.1M
     * What we want, in this case, is the scheduler of the pCPU where this
113
14.1M
     * particular idle vCPU is running. And, since v->processor never changes
114
14.1M
     * for idle vCPUs, it is safe to use it, with no locks, to figure that out.
115
14.1M
     */
116
18.4E
    ASSERT(is_idle_domain(d));
117
18.4E
    return per_cpu(scheduler, v->processor);
118
14.1M
}
119
0
#define VCPU2ONLINE(_v) cpupool_domain_cpumask((_v)->domain)
120
121
static inline void trace_runstate_change(struct vcpu *v, int new_state)
122
393k
{
123
393k
    struct { uint32_t vcpu:16, domain:16; } d;
124
393k
    uint32_t event;
125
393k
126
393k
    if ( likely(!tb_init_done) )
127
393k
        return;
128
393k
129
18.4E
    d.vcpu = v->vcpu_id;
130
18.4E
    d.domain = v->domain->domain_id;
131
18.4E
132
18.4E
    event = TRC_SCHED_RUNSTATE_CHANGE;
133
18.4E
    event |= ( v->runstate.state & 0x3 ) << 8;
134
18.4E
    event |= ( new_state & 0x3 ) << 4;
135
18.4E
136
18.4E
    __trace_var(event, 1/*tsc*/, sizeof(d), &d);
137
18.4E
}
138
139
static inline void trace_continue_running(struct vcpu *v)
140
4.49M
{
141
4.49M
    struct { uint32_t vcpu:16, domain:16; } d;
142
4.49M
143
4.49M
    if ( likely(!tb_init_done) )
144
4.49M
        return;
145
4.49M
146
8.19k
    d.vcpu = v->vcpu_id;
147
8.19k
    d.domain = v->domain->domain_id;
148
8.19k
149
8.19k
    __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d);
150
8.19k
}
151
152
static inline void vcpu_urgent_count_update(struct vcpu *v)
153
393k
{
154
393k
    if ( is_idle_vcpu(v) )
155
131k
        return;
156
393k
157
262k
    if ( unlikely(v->is_urgent) )
158
0
    {
159
0
        if ( !(v->pause_flags & VPF_blocked) ||
160
0
             !test_bit(v->vcpu_id, v->domain->poll_mask) )
161
0
        {
162
0
            v->is_urgent = 0;
163
0
            atomic_dec(&per_cpu(schedule_data,v->processor).urgent_count);
164
0
        }
165
0
    }
166
262k
    else
167
262k
    {
168
262k
        if ( unlikely(v->pause_flags & VPF_blocked) &&
169
64.8k
             unlikely(test_bit(v->vcpu_id, v->domain->poll_mask)) )
170
0
        {
171
0
            v->is_urgent = 1;
172
0
            atomic_inc(&per_cpu(schedule_data,v->processor).urgent_count);
173
0
        }
174
262k
    }
175
262k
}
176
177
static inline void vcpu_runstate_change(
178
    struct vcpu *v, int new_state, s_time_t new_entry_time)
179
393k
{
180
393k
    s_time_t delta;
181
393k
182
393k
    ASSERT(v->runstate.state != new_state);
183
393k
    ASSERT(spin_is_locked(per_cpu(schedule_data,v->processor).schedule_lock));
184
393k
185
393k
    vcpu_urgent_count_update(v);
186
393k
187
393k
    trace_runstate_change(v, new_state);
188
393k
189
393k
    delta = new_entry_time - v->runstate.state_entry_time;
190
393k
    if ( delta > 0 )
191
339k
    {
192
339k
        v->runstate.time[v->runstate.state] += delta;
193
339k
        v->runstate.state_entry_time = new_entry_time;
194
339k
    }
195
393k
196
393k
    v->runstate.state = new_state;
197
393k
}
198
199
void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
200
48
{
201
48
    spinlock_t *lock = likely(v == current) ? NULL : vcpu_schedule_lock_irq(v);
202
48
    s_time_t delta;
203
48
204
48
    memcpy(runstate, &v->runstate, sizeof(*runstate));
205
48
    delta = NOW() - runstate->state_entry_time;
206
48
    if ( delta > 0 )
207
48
        runstate->time[runstate->state] += delta;
208
48
209
48
    if ( unlikely(lock != NULL) )
210
44
        vcpu_schedule_unlock_irq(lock, v);
211
48
}
212
213
uint64_t get_cpu_idle_time(unsigned int cpu)
214
0
{
215
0
    struct vcpu_runstate_info state = { 0 };
216
0
    struct vcpu *v = idle_vcpu[cpu];
217
0
218
0
    if ( cpu_online(cpu) && v )
219
0
        vcpu_runstate_get(v, &state);
220
0
221
0
    return state.time[RUNSTATE_running];
222
0
}
223
224
/*
225
 * If locks are different, take the one with the lower address first.
226
 * This avoids dead- or live-locks when this code is running on both
227
 * cpus at the same time.
228
 */
229
static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2,
230
                                   unsigned long *flags)
231
96
{
232
96
    if ( lock1 == lock2 )
233
66
    {
234
66
        spin_lock_irqsave(lock1, *flags);
235
66
    }
236
30
    else if ( lock1 < lock2 )
237
10
    {
238
10
        spin_lock_irqsave(lock1, *flags);
239
10
        spin_lock(lock2);
240
10
    }
241
30
    else
242
20
    {
243
20
        spin_lock_irqsave(lock2, *flags);
244
20
        spin_lock(lock1);
245
20
    }
246
96
}
247
248
static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2,
249
                                     unsigned long flags)
250
96
{
251
96
    if ( lock1 != lock2 )
252
30
        spin_unlock(lock2);
253
96
    spin_unlock_irqrestore(lock1, flags);
254
96
}
255
256
int sched_init_vcpu(struct vcpu *v, unsigned int processor) 
257
24
{
258
24
    struct domain *d = v->domain;
259
24
260
24
    /*
261
24
     * Initialize processor and affinity settings. The idler, and potentially
262
24
     * domain-0 VCPUs, are pinned onto their respective physical CPUs.
263
24
     */
264
24
    v->processor = processor;
265
24
    if ( is_idle_domain(d) || d->is_pinned )
266
12
        cpumask_copy(v->cpu_hard_affinity, cpumask_of(processor));
267
24
    else
268
12
        cpumask_setall(v->cpu_hard_affinity);
269
24
270
24
    cpumask_setall(v->cpu_soft_affinity);
271
24
272
24
    /* Initialise the per-vcpu timers. */
273
24
    init_timer(&v->periodic_timer, vcpu_periodic_timer_fn,
274
24
               v, v->processor);
275
24
    init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn,
276
24
               v, v->processor);
277
24
    init_timer(&v->poll_timer, poll_timer_fn,
278
24
               v, v->processor);
279
24
280
24
    v->sched_priv = SCHED_OP(dom_scheduler(d), alloc_vdata, v,
281
24
                 d->sched_priv);
282
24
    if ( v->sched_priv == NULL )
283
0
        return 1;
284
24
285
24
    /* Idle VCPUs are scheduled immediately, so don't put them in runqueue. */
286
24
    if ( is_idle_domain(d) )
287
12
    {
288
12
        per_cpu(schedule_data, v->processor).curr = v;
289
12
        v->is_running = 1;
290
12
    }
291
24
    else
292
12
    {
293
12
        SCHED_OP(dom_scheduler(d), insert_vcpu, v);
294
12
    }
295
24
296
24
    return 0;
297
24
}
298
299
static void sched_move_irqs(struct vcpu *v)
300
536
{
301
536
    arch_move_irqs(v);
302
536
    evtchn_move_pirqs(v);
303
536
}
304
305
int sched_move_domain(struct domain *d, struct cpupool *c)
306
0
{
307
0
    struct vcpu *v;
308
0
    unsigned int new_p;
309
0
    void **vcpu_priv;
310
0
    void *domdata;
311
0
    void *vcpudata;
312
0
    struct scheduler *old_ops;
313
0
    void *old_domdata;
314
0
315
0
    for_each_vcpu ( d, v )
316
0
    {
317
0
        if ( v->affinity_broken )
318
0
            return -EBUSY;
319
0
    }
320
0
321
0
    domdata = SCHED_OP(c->sched, alloc_domdata, d);
322
0
    if ( domdata == NULL )
323
0
        return -ENOMEM;
324
0
325
0
    vcpu_priv = xzalloc_array(void *, d->max_vcpus);
326
0
    if ( vcpu_priv == NULL )
327
0
    {
328
0
        SCHED_OP(c->sched, free_domdata, domdata);
329
0
        return -ENOMEM;
330
0
    }
331
0
332
0
    for_each_vcpu ( d, v )
333
0
    {
334
0
        vcpu_priv[v->vcpu_id] = SCHED_OP(c->sched, alloc_vdata, v, domdata);
335
0
        if ( vcpu_priv[v->vcpu_id] == NULL )
336
0
        {
337
0
            for_each_vcpu ( d, v )
338
0
                xfree(vcpu_priv[v->vcpu_id]);
339
0
            xfree(vcpu_priv);
340
0
            SCHED_OP(c->sched, free_domdata, domdata);
341
0
            return -ENOMEM;
342
0
        }
343
0
    }
344
0
345
0
    domain_pause(d);
346
0
347
0
    old_ops = dom_scheduler(d);
348
0
    old_domdata = d->sched_priv;
349
0
350
0
    for_each_vcpu ( d, v )
351
0
    {
352
0
        SCHED_OP(old_ops, remove_vcpu, v);
353
0
    }
354
0
355
0
    d->cpupool = c;
356
0
    d->sched_priv = domdata;
357
0
358
0
    new_p = cpumask_first(c->cpu_valid);
359
0
    for_each_vcpu ( d, v )
360
0
    {
361
0
        spinlock_t *lock;
362
0
363
0
        vcpudata = v->sched_priv;
364
0
365
0
        migrate_timer(&v->periodic_timer, new_p);
366
0
        migrate_timer(&v->singleshot_timer, new_p);
367
0
        migrate_timer(&v->poll_timer, new_p);
368
0
369
0
        cpumask_setall(v->cpu_hard_affinity);
370
0
        cpumask_setall(v->cpu_soft_affinity);
371
0
372
0
        lock = vcpu_schedule_lock_irq(v);
373
0
        v->processor = new_p;
374
0
        /*
375
0
         * With v->processor modified we must not
376
0
         * - make any further changes assuming we hold the scheduler lock,
377
0
         * - use vcpu_schedule_unlock_irq().
378
0
         */
379
0
        spin_unlock_irq(lock);
380
0
381
0
        v->sched_priv = vcpu_priv[v->vcpu_id];
382
0
        if ( !d->is_dying )
383
0
            sched_move_irqs(v);
384
0
385
0
        new_p = cpumask_cycle(new_p, c->cpu_valid);
386
0
387
0
        SCHED_OP(c->sched, insert_vcpu, v);
388
0
389
0
        SCHED_OP(old_ops, free_vdata, vcpudata);
390
0
    }
391
0
392
0
    domain_update_node_affinity(d);
393
0
394
0
    domain_unpause(d);
395
0
396
0
    SCHED_OP(old_ops, free_domdata, old_domdata);
397
0
398
0
    xfree(vcpu_priv);
399
0
400
0
    return 0;
401
0
}
402
403
void sched_destroy_vcpu(struct vcpu *v)
404
0
{
405
0
    kill_timer(&v->periodic_timer);
406
0
    kill_timer(&v->singleshot_timer);
407
0
    kill_timer(&v->poll_timer);
408
0
    if ( test_and_clear_bool(v->is_urgent) )
409
0
        atomic_dec(&per_cpu(schedule_data, v->processor).urgent_count);
410
0
    SCHED_OP(vcpu_scheduler(v), remove_vcpu, v);
411
0
    SCHED_OP(vcpu_scheduler(v), free_vdata, v->sched_priv);
412
0
}
413
414
int sched_init_domain(struct domain *d, int poolid)
415
2
{
416
2
    int ret;
417
2
418
2
    ASSERT(d->cpupool == NULL);
419
2
420
2
    if ( (ret = cpupool_add_domain(d, poolid)) )
421
0
        return ret;
422
2
423
2
    SCHED_STAT_CRANK(dom_init);
424
2
    TRACE_1D(TRC_SCHED_DOM_ADD, d->domain_id);
425
2
    return SCHED_OP(dom_scheduler(d), init_domain, d);
426
2
}
427
428
void sched_destroy_domain(struct domain *d)
429
0
{
430
0
    ASSERT(d->cpupool != NULL || is_idle_domain(d));
431
0
432
0
    SCHED_STAT_CRANK(dom_destroy);
433
0
    TRACE_1D(TRC_SCHED_DOM_REM, d->domain_id);
434
0
    SCHED_OP(dom_scheduler(d), destroy_domain, d);
435
0
436
0
    cpupool_rm_domain(d);
437
0
}
438
439
void vcpu_sleep_nosync(struct vcpu *v)
440
360
{
441
360
    unsigned long flags;
442
360
    spinlock_t *lock;
443
360
444
360
    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
445
360
446
360
    lock = vcpu_schedule_lock_irqsave(v, &flags);
447
360
448
360
    if ( likely(!vcpu_runnable(v)) )
449
360
    {
450
360
        if ( v->runstate.state == RUNSTATE_runnable )
451
1
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
452
360
453
360
        SCHED_OP(vcpu_scheduler(v), sleep, v);
454
360
    }
455
360
456
360
    vcpu_schedule_unlock_irqrestore(lock, flags, v);
457
360
}
458
459
void vcpu_sleep_sync(struct vcpu *v)
460
315
{
461
315
    vcpu_sleep_nosync(v);
462
315
463
3.22k
    while ( !vcpu_runnable(v) && v->is_running )
464
2.90k
        cpu_relax();
465
315
466
315
    sync_vcpu_execstate(v);
467
315
}
468
469
void vcpu_wake(struct vcpu *v)
470
67.2k
{
471
67.2k
    unsigned long flags;
472
67.2k
    spinlock_t *lock;
473
67.2k
474
67.2k
    TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
475
67.2k
476
67.2k
    lock = vcpu_schedule_lock_irqsave(v, &flags);
477
67.2k
478
67.2k
    if ( likely(vcpu_runnable(v)) )
479
67.1k
    {
480
67.1k
        if ( v->runstate.state >= RUNSTATE_blocked )
481
66.3k
            vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
482
67.1k
        SCHED_OP(vcpu_scheduler(v), wake, v);
483
67.1k
    }
484
86
    else if ( !(v->pause_flags & VPF_blocked) )
485
117
    {
486
117
        if ( v->runstate.state == RUNSTATE_blocked )
487
0
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
488
117
    }
489
67.2k
490
67.2k
    vcpu_schedule_unlock_irqrestore(lock, flags, v);
491
67.2k
}
492
493
void vcpu_unblock(struct vcpu *v)
494
103k
{
495
103k
    if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
496
36.8k
        return;
497
103k
498
103k
    /* Polling period ends when a VCPU is unblocked. */
499
67.0k
    if ( unlikely(v->poll_evtchn != 0) )
500
0
    {
501
0
        v->poll_evtchn = 0;
502
0
        /*
503
0
         * We *must* re-clear _VPF_blocked to avoid racing other wakeups of
504
0
         * this VCPU (and it then going back to sleep on poll_mask).
505
0
         * Test-and-clear is idiomatic and ensures the clear_bit is not reordered.
506
0
         */
507
0
        if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
508
0
            clear_bit(_VPF_blocked, &v->pause_flags);
509
0
    }
510
67.0k
511
67.0k
    vcpu_wake(v);
512
67.0k
}
513
514
/*
515
 * Do the actual movement of a vcpu from old to new CPU. Locks for *both*
516
 * CPUs need to have been taken already when calling this!
517
 */
518
static void vcpu_move_locked(struct vcpu *v, unsigned int new_cpu)
519
65
{
520
65
    unsigned int old_cpu = v->processor;
521
65
522
65
    /*
523
65
     * Transfer urgency status to new CPU before switching CPUs, as
524
65
     * once the switch occurs, v->is_urgent is no longer protected by
525
65
     * the per-CPU scheduler lock we are holding.
526
65
     */
527
65
    if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) )
528
0
    {
529
0
        atomic_inc(&per_cpu(schedule_data, new_cpu).urgent_count);
530
0
        atomic_dec(&per_cpu(schedule_data, old_cpu).urgent_count);
531
0
    }
532
65
533
65
    /*
534
65
     * Actual CPU switch to new CPU.  This is safe because the lock
535
65
     * pointer can't change while the current lock is held.
536
65
     */
537
65
    if ( vcpu_scheduler(v)->migrate )
538
0
        SCHED_OP(vcpu_scheduler(v), migrate, v, new_cpu);
539
65
    else
540
65
        v->processor = new_cpu;
541
65
}
542
543
/*
544
 * Move a vcpu from its current processor to a target new processor,
545
 * without asking the scheduler to do any placement. This is intended
546
 * for being called from special contexts, where things are quiet
547
 * enough that no contention is supposed to happen (i.e., during
548
 * shutdown or software suspend, like ACPI S3).
549
 */
550
static void vcpu_move_nosched(struct vcpu *v, unsigned int new_cpu)
551
0
{
552
0
    unsigned long flags;
553
0
    spinlock_t *lock, *new_lock;
554
0
555
0
    ASSERT(system_state == SYS_STATE_suspend);
556
0
    ASSERT(!vcpu_runnable(v) && (atomic_read(&v->pause_count) ||
557
0
                                 atomic_read(&v->domain->pause_count)));
558
0
559
0
    lock = per_cpu(schedule_data, v->processor).schedule_lock;
560
0
    new_lock = per_cpu(schedule_data, new_cpu).schedule_lock;
561
0
562
0
    sched_spin_lock_double(lock, new_lock, &flags);
563
0
    ASSERT(new_cpu != v->processor);
564
0
    vcpu_move_locked(v, new_cpu);
565
0
    sched_spin_unlock_double(lock, new_lock, flags);
566
0
567
0
    sched_move_irqs(v);
568
0
}
569
570
static void vcpu_migrate(struct vcpu *v)
571
66
{
572
66
    unsigned long flags;
573
66
    unsigned int old_cpu, new_cpu;
574
66
    spinlock_t *old_lock, *new_lock;
575
66
    bool_t pick_called = 0;
576
66
577
66
    old_cpu = new_cpu = v->processor;
578
66
    for ( ; ; )
579
96
    {
580
96
        /*
581
96
         * We need another iteration if the pre-calculated lock addresses
582
96
         * are no longer correct after re-evaluating the old and new cpu while holding
583
96
         * the locks.
584
96
         */
585
96
        old_lock = per_cpu(schedule_data, old_cpu).schedule_lock;
586
96
        new_lock = per_cpu(schedule_data, new_cpu).schedule_lock;
587
96
588
96
        sched_spin_lock_double(old_lock, new_lock, &flags);
589
96
590
96
        old_cpu = v->processor;
591
96
        if ( old_lock == per_cpu(schedule_data, old_cpu).schedule_lock )
592
96
        {
593
96
            /*
594
96
             * If we selected a CPU on the previous iteration, check if it
595
96
             * remains suitable for running this vCPU.
596
96
             */
597
96
            if ( pick_called &&
598
30
                 (new_lock == per_cpu(schedule_data, new_cpu).schedule_lock) &&
599
96
                 cpumask_test_cpu(new_cpu, v->cpu_hard_affinity) &&
600
30
                 cpumask_test_cpu(new_cpu, v->domain->cpupool->cpu_valid) )
601
30
                break;
602
96
603
96
            /* Select a new CPU. */
604
66
            new_cpu = SCHED_OP(vcpu_scheduler(v), pick_cpu, v);
605
66
            if ( (new_lock == per_cpu(schedule_data, new_cpu).schedule_lock) &&
606
36
                 cpumask_test_cpu(new_cpu, v->domain->cpupool->cpu_valid) )
607
36
                break;
608
30
            pick_called = 1;
609
30
        }
610
96
        else
611
0
        {
612
0
            /*
613
0
             * We do not hold the scheduler lock appropriate for this vCPU.
614
0
             * Thus we cannot select a new CPU on this iteration. Try again.
615
0
             */
616
0
            pick_called = 0;
617
0
        }
618
96
619
30
        sched_spin_unlock_double(old_lock, new_lock, flags);
620
30
    }
621
66
622
66
    /*
623
66
     * NB. Check of v->running happens /after/ setting migration flag
624
66
     * because they both happen in (different) spinlock regions, and those
625
66
     * regions are strictly serialised.
626
66
     */
627
66
    if ( v->is_running ||
628
65
         !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
629
1
    {
630
1
        sched_spin_unlock_double(old_lock, new_lock, flags);
631
1
        return;
632
1
    }
633
66
634
65
    vcpu_move_locked(v, new_cpu);
635
65
636
65
    sched_spin_unlock_double(old_lock, new_lock, flags);
637
65
638
65
    if ( old_cpu != new_cpu )
639
30
        sched_move_irqs(v);
640
65
641
65
    /* Wake on new CPU. */
642
65
    vcpu_wake(v);
643
65
}
644
645
/*
646
 * Force a VCPU through a deschedule/reschedule path.
647
 * For example, using this when setting the periodic timer period means that
648
 * most periodic-timer state need only be touched from within the scheduler
649
 * which can thus be done without need for synchronisation.
650
 */
651
void vcpu_force_reschedule(struct vcpu *v)
652
14
{
653
14
    spinlock_t *lock = vcpu_schedule_lock_irq(v);
654
14
655
14
    if ( v->is_running )
656
1
        set_bit(_VPF_migrating, &v->pause_flags);
657
14
    vcpu_schedule_unlock_irq(lock, v);
658
14
659
14
    if ( v->pause_flags & VPF_migrating )
660
1
    {
661
1
        vcpu_sleep_nosync(v);
662
1
        vcpu_migrate(v);
663
1
    }
664
14
}
665
666
void restore_vcpu_affinity(struct domain *d)
667
0
{
668
0
    unsigned int cpu = smp_processor_id();
669
0
    struct vcpu *v;
670
0
671
0
    ASSERT(system_state == SYS_STATE_resume);
672
0
673
0
    for_each_vcpu ( d, v )
674
0
    {
675
0
        spinlock_t *lock;
676
0
677
0
        ASSERT(!vcpu_runnable(v));
678
0
679
0
        lock = vcpu_schedule_lock_irq(v);
680
0
681
0
        if ( v->affinity_broken )
682
0
        {
683
0
            cpumask_copy(v->cpu_hard_affinity, v->cpu_hard_affinity_saved);
684
0
            v->affinity_broken = 0;
685
0
686
0
        }
687
0
688
0
        /*
689
0
         * During suspend (in cpu_disable_scheduler()), we moved every vCPU
690
0
         * to BSP (which, as of now, is pCPU 0), as a temporary measure to
691
0
         * allow the non-boot processors to have their data structures freed
692
0
         * and go to sleep. But nothing guarantees that the BSP is a valid
693
0
         * pCPU for a particular domain.
694
0
         *
695
0
         * Therefore, here, before actually unpausing the domains, we should
696
0
         * set v->processor of each of their vCPUs to something that will
697
0
         * make sense for the scheduler of the cpupool in which they are in.
698
0
         */
699
0
        cpumask_and(cpumask_scratch_cpu(cpu), v->cpu_hard_affinity,
700
0
                    cpupool_domain_cpumask(v->domain));
701
0
        v->processor = cpumask_any(cpumask_scratch_cpu(cpu));
702
0
703
0
        spin_unlock_irq(lock);
704
0
705
0
        lock = vcpu_schedule_lock_irq(v);
706
0
        v->processor = SCHED_OP(vcpu_scheduler(v), pick_cpu, v);
707
0
        spin_unlock_irq(lock);
708
0
    }
709
0
710
0
    domain_update_node_affinity(d);
711
0
}
712
713
/*
714
 * This function is used by cpu_hotplug code from stop_machine context
715
 * and from cpupools to switch schedulers on a cpu.
716
 */
717
int cpu_disable_scheduler(unsigned int cpu)
718
0
{
719
0
    struct domain *d;
720
0
    struct vcpu *v;
721
0
    struct cpupool *c;
722
0
    cpumask_t online_affinity;
723
0
    unsigned int new_cpu;
724
0
    int ret = 0;
725
0
726
0
    c = per_cpu(cpupool, cpu);
727
0
    if ( c == NULL )
728
0
        return ret;
729
0
730
0
    /*
731
0
     * We'd need the domain RCU lock, but:
732
0
     *  - when we are called from cpupool code, it's acquired there already;
733
0
     *  - when we are called for CPU teardown, we're in stop-machine context,
734
0
     *    so that's not a problem.
735
0
     */
736
0
    for_each_domain_in_cpupool ( d, c )
737
0
    {
738
0
        for_each_vcpu ( d, v )
739
0
        {
740
0
            unsigned long flags;
741
0
            spinlock_t *lock = vcpu_schedule_lock_irqsave(v, &flags);
742
0
743
0
            cpumask_and(&online_affinity, v->cpu_hard_affinity, c->cpu_valid);
744
0
            if ( cpumask_empty(&online_affinity) &&
745
0
                 cpumask_test_cpu(cpu, v->cpu_hard_affinity) )
746
0
            {
747
0
                if ( v->affinity_broken )
748
0
                {
749
0
                    /* The vcpu is temporarily pinned, can't move it. */
750
0
                    vcpu_schedule_unlock_irqrestore(lock, flags, v);
751
0
                    ret = -EADDRINUSE;
752
0
                    break;
753
0
                }
754
0
755
0
                if (system_state == SYS_STATE_suspend)
756
0
                {
757
0
                    cpumask_copy(v->cpu_hard_affinity_saved,
758
0
                                 v->cpu_hard_affinity);
759
0
                    v->affinity_broken = 1;
760
0
                }
761
0
                else
762
0
                    printk(XENLOG_DEBUG "Breaking affinity for %pv\n", v);
763
0
764
0
                cpumask_setall(v->cpu_hard_affinity);
765
0
            }
766
0
767
0
            if ( v->processor != cpu )
768
0
            {
769
0
                /* The vcpu is not on this cpu, so we can move on. */
770
0
                vcpu_schedule_unlock_irqrestore(lock, flags, v);
771
0
                continue;
772
0
            }
773
0
774
0
            /* If it is on this cpu, we must send it away. */
775
0
            if ( unlikely(system_state == SYS_STATE_suspend) )
776
0
            {
777
0
                vcpu_schedule_unlock_irqrestore(lock, flags, v);
778
0
779
0
                /*
780
0
                 * If we are doing a shutdown/suspend, it is not necessary to
781
0
                 * ask the scheduler to chime in. In fact:
782
0
                 *  * there is no reason for it: the end result we are after
783
0
                 *    is just 'all the vcpus on the boot pcpu, and no vcpu
784
0
                 *    anywhere else', so let's just go for it;
785
0
                 *  * it's wrong, for cpupools with only non-boot pcpus, as
786
0
                 *    the scheduler would always fail to send the vcpus away
787
0
                 *    from the last online (non boot) pcpu!
788
0
                 *
789
0
                 * Therefore, in the shutdown/suspend case, we just pick up
790
0
                 * one (still) online pcpu. Note that, at this stage, all
791
0
                 * domains (including dom0) have been paused already, so we
792
0
                 * do not expect any vcpu activity at all.
793
0
                 */
794
0
                cpumask_andnot(&online_affinity, &cpu_online_map,
795
0
                               cpumask_of(cpu));
796
0
                BUG_ON(cpumask_empty(&online_affinity));
797
0
                /*
798
0
                 * As boot cpu is, usually, pcpu #0, using cpumask_first()
799
0
                 * will make us converge quicker.
800
0
                 */
801
0
                new_cpu = cpumask_first(&online_affinity);
802
0
                vcpu_move_nosched(v, new_cpu);
803
0
            }
804
0
            else
805
0
            {
806
0
                /*
807
0
                 * OTOH, if the system is still live, and we are here because
808
0
                 * we are doing some cpupool manipulations:
809
0
                 *  * we want to call the scheduler, and let it re-evaluate
810
0
                 *    the placement of the vcpu, taking into account the new
811
0
                 *    cpupool configuration;
812
0
                 *  * the scheduler will always find a suitable solution, or
813
0
                 *    things would have failed before getting in here.
814
0
                 */
815
0
                set_bit(_VPF_migrating, &v->pause_flags);
816
0
                vcpu_schedule_unlock_irqrestore(lock, flags, v);
817
0
                vcpu_sleep_nosync(v);
818
0
                vcpu_migrate(v);
819
0
820
0
                /*
821
0
                 * The only caveat, in this case, is that a vcpu active in
822
0
                 * the hypervisor may not be migratable. In that case, the caller
823
0
                 * should try again after releasing and reacquiring all locks.
824
0
                 */
825
0
                if ( v->processor == cpu )
826
0
                    ret = -EAGAIN;
827
0
            }
828
0
        }
829
0
    }
830
0
831
0
    return ret;
832
0
}
833
834
static int vcpu_set_affinity(
835
    struct vcpu *v, const cpumask_t *affinity, cpumask_t *which)
836
0
{
837
0
    spinlock_t *lock;
838
0
    int ret = 0;
839
0
840
0
    lock = vcpu_schedule_lock_irq(v);
841
0
842
0
    if ( v->affinity_broken )
843
0
        ret = -EBUSY;
844
0
    else
845
0
    {
846
0
        cpumask_copy(which, affinity);
847
0
848
0
        /*
849
0
         * Always ask the scheduler to re-evaluate placement
850
0
         * when changing the affinity.
851
0
         */
852
0
        set_bit(_VPF_migrating, &v->pause_flags);
853
0
    }
854
0
855
0
    vcpu_schedule_unlock_irq(lock, v);
856
0
857
0
    domain_update_node_affinity(v->domain);
858
0
859
0
    if ( v->pause_flags & VPF_migrating )
860
0
    {
861
0
        vcpu_sleep_nosync(v);
862
0
        vcpu_migrate(v);
863
0
    }
864
0
865
0
    return ret;
866
0
}
867
868
int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity)
869
0
{
870
0
    cpumask_t online_affinity;
871
0
    cpumask_t *online;
872
0
873
0
    if ( v->domain->is_pinned )
874
0
        return -EINVAL;
875
0
876
0
    online = VCPU2ONLINE(v);
877
0
    cpumask_and(&online_affinity, affinity, online);
878
0
    if ( cpumask_empty(&online_affinity) )
879
0
        return -EINVAL;
880
0
881
0
    return vcpu_set_affinity(v, affinity, v->cpu_hard_affinity);
882
0
}
883
884
int vcpu_set_soft_affinity(struct vcpu *v, const cpumask_t *affinity)
885
0
{
886
0
    return vcpu_set_affinity(v, affinity, v->cpu_soft_affinity);
887
0
}
888
889
/* Block the currently-executing domain until a pertinent event occurs. */
890
void vcpu_block(void)
891
65.3k
{
892
65.3k
    struct vcpu *v = current;
893
65.3k
894
65.3k
    set_bit(_VPF_blocked, &v->pause_flags);
895
65.3k
896
65.3k
    arch_vcpu_block(v);
897
65.3k
898
65.3k
    /* Check for events /after/ blocking: avoids wakeup waiting race. */
899
65.3k
    if ( local_events_need_delivery() )
900
171
    {
901
171
        clear_bit(_VPF_blocked, &v->pause_flags);
902
171
    }
903
65.3k
    else
904
65.2k
    {
905
65.2k
        TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
906
65.2k
        raise_softirq(SCHEDULE_SOFTIRQ);
907
65.2k
    }
908
65.3k
}
909
910
static void vcpu_block_enable_events(void)
911
65.4k
{
912
65.4k
    local_event_delivery_enable();
913
65.4k
    vcpu_block();
914
65.4k
}
915
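For context, a hedged guest-side sketch of issuing SCHEDOP_block (the HYPERVISOR_sched_op wrapper and the pending-event check are assumed to be provided by the guest OS). The hypervisor's own re-check of local_events_need_delivery() after setting _VPF_blocked, in vcpu_block() above, is what closes the wakeup-waiting race:

    /* Hypothetical guest idle path: block until an event arrives. */
    if ( !guest_events_pending() )                    /* assumed guest helper */
        HYPERVISOR_sched_op(SCHEDOP_block, NULL);     /* event delivery re-enabled by Xen */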
916
static long do_poll(struct sched_poll *sched_poll)
917
0
{
918
0
    struct vcpu   *v = current;
919
0
    struct domain *d = v->domain;
920
0
    evtchn_port_t  port;
921
0
    long           rc;
922
0
    unsigned int   i;
923
0
924
0
    /* Fairly arbitrary limit. */
925
0
    if ( sched_poll->nr_ports > 128 )
926
0
        return -EINVAL;
927
0
928
0
    if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
929
0
        return -EFAULT;
930
0
931
0
    set_bit(_VPF_blocked, &v->pause_flags);
932
0
    v->poll_evtchn = -1;
933
0
    set_bit(v->vcpu_id, d->poll_mask);
934
0
935
0
    arch_vcpu_block(v);
936
0
937
0
#ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
938
    /* Check for events /after/ setting flags: avoids wakeup waiting race. */
939
    smp_mb();
940
941
    /*
942
     * Someone may have seen we are blocked but not that we are polling, or
943
     * vice versa. We are certainly being woken, so clean up and bail. Beyond
944
     * this point others can be guaranteed to clean up for us if they wake us.
945
     */
946
    rc = 0;
947
    if ( (v->poll_evtchn == 0) ||
948
         !test_bit(_VPF_blocked, &v->pause_flags) ||
949
         !test_bit(v->vcpu_id, d->poll_mask) )
950
        goto out;
951
#endif
952
0
953
0
    rc = 0;
954
0
    if ( local_events_need_delivery() )
955
0
        goto out;
956
0
957
0
    for ( i = 0; i < sched_poll->nr_ports; i++ )
958
0
    {
959
0
        rc = -EFAULT;
960
0
        if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
961
0
            goto out;
962
0
963
0
        rc = -EINVAL;
964
0
        if ( port >= d->max_evtchns )
965
0
            goto out;
966
0
967
0
        rc = 0;
968
0
        if ( evtchn_port_is_pending(d, port) )
969
0
            goto out;
970
0
    }
971
0
972
0
    if ( sched_poll->nr_ports == 1 )
973
0
        v->poll_evtchn = port;
974
0
975
0
    if ( sched_poll->timeout != 0 )
976
0
        set_timer(&v->poll_timer, sched_poll->timeout);
977
0
978
0
    TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
979
0
    raise_softirq(SCHEDULE_SOFTIRQ);
980
0
981
0
    return 0;
982
0
983
0
 out:
984
0
    v->poll_evtchn = 0;
985
0
    clear_bit(v->vcpu_id, d->poll_mask);
986
0
    clear_bit(_VPF_blocked, &v->pause_flags);
987
0
    return rc;
988
0
}
989
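A hedged guest-side sketch of the corresponding SCHEDOP_poll request (struct sched_poll and set_xen_guest_handle() come from the public headers included at the top of this file; the HYPERVISOR_sched_op wrapper and the port value are assumptions of the example). Per do_poll() above, a zero timeout means no timer is armed, while a non-zero value is passed to set_timer() as an absolute deadline:

    evtchn_port_t port = example_port;    /* assumed event channel port */
    struct sched_poll poll = {
        .nr_ports = 1,
        .timeout  = 0,                    /* 0: no timeout */
    };

    set_xen_guest_handle(poll.ports, &port);
    HYPERVISOR_sched_op(SCHEDOP_poll, &poll);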
990
/* Voluntarily yield the processor for this allocation. */
991
long vcpu_yield(void)
992
4.53M
{
993
4.53M
    struct vcpu * v=current;
994
4.53M
    spinlock_t *lock = vcpu_schedule_lock_irq(v);
995
4.53M
996
4.53M
    SCHED_OP(vcpu_scheduler(v), yield, v);
997
4.53M
    vcpu_schedule_unlock_irq(lock, v);
998
4.53M
999
4.53M
    SCHED_STAT_CRANK(vcpu_yield);
1000
4.53M
1001
4.53M
    TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
1002
4.53M
    raise_softirq(SCHEDULE_SOFTIRQ);
1003
4.53M
    return 0;
1004
4.53M
}
1005
1006
static void domain_watchdog_timeout(void *data)
1007
0
{
1008
0
    struct domain *d = data;
1009
0
1010
0
    if ( d->is_shutting_down || d->is_dying )
1011
0
        return;
1012
0
1013
0
    printk("Watchdog timer fired for domain %u\n", d->domain_id);
1014
0
    domain_shutdown(d, SHUTDOWN_watchdog);
1015
0
}
1016
1017
static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout)
1018
0
{
1019
0
    if ( id > NR_DOMAIN_WATCHDOG_TIMERS )
1020
0
        return -EINVAL;
1021
0
1022
0
    spin_lock(&d->watchdog_lock);
1023
0
1024
0
    if ( id == 0 )
1025
0
    {
1026
0
        for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ )
1027
0
        {
1028
0
            if ( test_and_set_bit(id, &d->watchdog_inuse_map) )
1029
0
                continue;
1030
0
            set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
1031
0
            break;
1032
0
        }
1033
0
        spin_unlock(&d->watchdog_lock);
1034
0
        return id == NR_DOMAIN_WATCHDOG_TIMERS ? -ENOSPC : id + 1;
1035
0
    }
1036
0
1037
0
    id -= 1;
1038
0
    if ( !test_bit(id, &d->watchdog_inuse_map) )
1039
0
    {
1040
0
        spin_unlock(&d->watchdog_lock);
1041
0
        return -EINVAL;
1042
0
    }
1043
0
1044
0
    if ( timeout == 0 )
1045
0
    {
1046
0
        stop_timer(&d->watchdog_timer[id]);
1047
0
        clear_bit(id, &d->watchdog_inuse_map);
1048
0
    }
1049
0
    else
1050
0
    {
1051
0
        set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
1052
0
    }
1053
0
1054
0
    spin_unlock(&d->watchdog_lock);
1055
0
    return 0;
1056
0
}
1057
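A hedged guest-side sketch of driving domain_watchdog() above via SCHEDOP_watchdog (the HYPERVISOR_sched_op wrapper is assumed; struct sched_watchdog comes from public/sched.h). Per the code above, id 0 allocates a free slot and the hypercall returns its 1-based id, re-issuing with that id and a non-zero timeout re-arms it, and a zero timeout releases the slot:

    struct sched_watchdog wd = { .id = 0, .timeout = 30 };     /* seconds */
    long wd_id = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wd);   /* arm a new slot */

    wd.id = wd_id;                        /* keep-alive: must beat the 30s deadline */
    HYPERVISOR_sched_op(SCHEDOP_watchdog, &wd);

    wd.timeout = 0;                       /* disarm and free the slot */
    HYPERVISOR_sched_op(SCHEDOP_watchdog, &wd);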
1058
void watchdog_domain_init(struct domain *d)
1059
5
{
1060
5
    unsigned int i;
1061
5
1062
5
    spin_lock_init(&d->watchdog_lock);
1063
5
1064
5
    d->watchdog_inuse_map = 0;
1065
5
1066
15
    for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
1067
10
        init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0);
1068
5
}
1069
1070
void watchdog_domain_destroy(struct domain *d)
1071
0
{
1072
0
    unsigned int i;
1073
0
1074
0
    for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
1075
0
        kill_timer(&d->watchdog_timer[i]);
1076
0
}
1077
1078
int vcpu_pin_override(struct vcpu *v, int cpu)
1079
0
{
1080
0
    spinlock_t *lock;
1081
0
    int ret = -EINVAL;
1082
0
1083
0
    lock = vcpu_schedule_lock_irq(v);
1084
0
1085
0
    if ( cpu < 0 )
1086
0
    {
1087
0
        if ( v->affinity_broken )
1088
0
        {
1089
0
            cpumask_copy(v->cpu_hard_affinity, v->cpu_hard_affinity_saved);
1090
0
            v->affinity_broken = 0;
1091
0
            set_bit(_VPF_migrating, &v->pause_flags);
1092
0
            ret = 0;
1093
0
        }
1094
0
    }
1095
0
    else if ( cpu < nr_cpu_ids )
1096
0
    {
1097
0
        if ( v->affinity_broken )
1098
0
            ret = -EBUSY;
1099
0
        else if ( cpumask_test_cpu(cpu, VCPU2ONLINE(v)) )
1100
0
        {
1101
0
            cpumask_copy(v->cpu_hard_affinity_saved, v->cpu_hard_affinity);
1102
0
            v->affinity_broken = 1;
1103
0
            cpumask_copy(v->cpu_hard_affinity, cpumask_of(cpu));
1104
0
            set_bit(_VPF_migrating, &v->pause_flags);
1105
0
            ret = 0;
1106
0
        }
1107
0
    }
1108
0
1109
0
    vcpu_schedule_unlock_irq(lock, v);
1110
0
1111
0
    domain_update_node_affinity(v->domain);
1112
0
1113
0
    if ( v->pause_flags & VPF_migrating )
1114
0
    {
1115
0
        vcpu_sleep_nosync(v);
1116
0
        vcpu_migrate(v);
1117
0
    }
1118
0
1119
0
    return ret;
1120
0
}
1121
1122
typedef long ret_t;
1123
1124
#endif /* !COMPAT */
1125
1126
ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
1127
4.58M
{
1128
0
    ret_t ret = 0;
1129
4.58M
1130
4.58M
    switch ( cmd )
1131
4.58M
    {
1132
4.52M
    case SCHEDOP_yield:
1133
4.52M
    {
1134
4.52M
        ret = vcpu_yield();
1135
4.52M
        break;
1136
4.52M
    }
1137
4.52M
1138
65.6k
    case SCHEDOP_block:
1139
65.6k
    {
1140
65.6k
        vcpu_block_enable_events();
1141
65.6k
        break;
1142
4.52M
    }
1143
4.52M
1144
0
    case SCHEDOP_shutdown:
1145
0
    {
1146
0
        struct sched_shutdown sched_shutdown;
1147
0
1148
0
        ret = -EFAULT;
1149
0
        if ( copy_from_guest(&sched_shutdown, arg, 1) )
1150
0
            break;
1151
0
1152
0
        ret = 0;
1153
0
        TRACE_3D(TRC_SCHED_SHUTDOWN,
1154
0
                 current->domain->domain_id, current->vcpu_id,
1155
0
                 sched_shutdown.reason);
1156
0
        domain_shutdown(current->domain, (u8)sched_shutdown.reason);
1157
0
1158
0
        break;
1159
0
    }
1160
0
1161
0
    case SCHEDOP_shutdown_code:
1162
0
    {
1163
0
        struct sched_shutdown sched_shutdown;
1164
0
        struct domain *d = current->domain;
1165
0
1166
0
        ret = -EFAULT;
1167
0
        if ( copy_from_guest(&sched_shutdown, arg, 1) )
1168
0
            break;
1169
0
1170
0
        TRACE_3D(TRC_SCHED_SHUTDOWN_CODE,
1171
0
                 d->domain_id, current->vcpu_id, sched_shutdown.reason);
1172
0
1173
0
        spin_lock(&d->shutdown_lock);
1174
0
        if ( d->shutdown_code == SHUTDOWN_CODE_INVALID )
1175
0
            d->shutdown_code = (u8)sched_shutdown.reason;
1176
0
        spin_unlock(&d->shutdown_lock);
1177
0
1178
0
        ret = 0;
1179
0
        break;
1180
0
    }
1181
0
1182
0
    case SCHEDOP_poll:
1183
0
    {
1184
0
        struct sched_poll sched_poll;
1185
0
1186
0
        ret = -EFAULT;
1187
0
        if ( copy_from_guest(&sched_poll, arg, 1) )
1188
0
            break;
1189
0
1190
0
        ret = do_poll(&sched_poll);
1191
0
1192
0
        break;
1193
0
    }
1194
0
1195
0
    case SCHEDOP_remote_shutdown:
1196
0
    {
1197
0
        struct domain *d;
1198
0
        struct sched_remote_shutdown sched_remote_shutdown;
1199
0
1200
0
        ret = -EFAULT;
1201
0
        if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
1202
0
            break;
1203
0
1204
0
        ret = -ESRCH;
1205
0
        d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
1206
0
        if ( d == NULL )
1207
0
            break;
1208
0
1209
0
        ret = xsm_schedop_shutdown(XSM_DM_PRIV, current->domain, d);
1210
0
        if ( likely(!ret) )
1211
0
            domain_shutdown(d, sched_remote_shutdown.reason);
1212
0
1213
0
        rcu_unlock_domain(d);
1214
0
1215
0
        break;
1216
0
    }
1217
0
1218
0
    case SCHEDOP_watchdog:
1219
0
    {
1220
0
        struct sched_watchdog sched_watchdog;
1221
0
1222
0
        ret = -EFAULT;
1223
0
        if ( copy_from_guest(&sched_watchdog, arg, 1) )
1224
0
            break;
1225
0
1226
0
        ret = domain_watchdog(
1227
0
            current->domain, sched_watchdog.id, sched_watchdog.timeout);
1228
0
        break;
1229
0
    }
1230
0
1231
0
    case SCHEDOP_pin_override:
1232
0
    {
1233
0
        struct sched_pin_override sched_pin_override;
1234
0
1235
0
        ret = -EPERM;
1236
0
        if ( !is_hardware_domain(current->domain) )
1237
0
            break;
1238
0
1239
0
        ret = -EFAULT;
1240
0
        if ( copy_from_guest(&sched_pin_override, arg, 1) )
1241
0
            break;
1242
0
1243
0
        ret = vcpu_pin_override(current, sched_pin_override.pcpu);
1244
0
1245
0
        break;
1246
0
    }
1247
0
1248
0
    default:
1249
0
        ret = -ENOSYS;
1250
4.58M
    }
1251
4.58M
1252
5.02M
    return ret;
1253
4.58M
}
Unexecuted instantiation: compat_sched_op
1254
1255
#ifndef COMPAT
1256
1257
/* Per-vcpu oneshot-timer hypercall. */
1258
long do_set_timer_op(s_time_t timeout)
1259
0
{
1260
0
    struct vcpu *v = current;
1261
0
    s_time_t offset = timeout - NOW();
1262
0
1263
0
    if ( timeout == 0 )
1264
0
    {
1265
0
        stop_timer(&v->singleshot_timer);
1266
0
    }
1267
0
    else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
1268
0
              unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
1269
0
    {
1270
0
        /*
1271
0
         * Linux workaround: occasionally we will see timeouts a long way in 
1272
0
         * the future due to wrapping in Linux's jiffy time handling. We check 
1273
0
         * for timeouts wrapped negative, and for positive timeouts more than 
1274
0
         * about 13 days in the future (2^50ns). The correct fix is to trigger 
1275
0
         * an interrupt immediately (since Linux in fact has pending work to 
1276
0
         * do in this situation). However, older guests also set a long timeout
1277
0
         * when they have *no* pending timers at all: setting an immediate
1278
0
         * timeout in this case can burn a lot of CPU. We therefore go for a
1279
0
         * reasonable middleground of triggering a timer event in 100ms.
1280
0
         */
1281
0
        gdprintk(XENLOG_INFO, "Warning: huge timeout set: %"PRIx64"\n",
1282
0
                 timeout);
1283
0
        set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
1284
0
    }
1285
0
    else
1286
0
    {
1287
0
        migrate_timer(&v->singleshot_timer, smp_processor_id());
1288
0
        set_timer(&v->singleshot_timer, timeout);
1289
0
    }
1290
0
1291
0
    return 0;
1292
0
}
1293
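As a quick check of the bound used in the comment above: 2^50 ns ≈ 1.126 × 10^15 ns ≈ 1.126 × 10^6 s ≈ 13.0 days, matching the "about 13 days" figure.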
1294
/* sched_id - fetch ID of current scheduler */
1295
int sched_id(void)
1296
0
{
1297
0
    return ops.sched_id;
1298
0
}
1299
1300
/* Adjust scheduling parameter for a given domain. */
1301
long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
1302
0
{
1303
0
    long ret;
1304
0
1305
0
    ret = xsm_domctl_scheduler_op(XSM_HOOK, d, op->cmd);
1306
0
    if ( ret )
1307
0
        return ret;
1308
0
1309
0
    if ( op->sched_id != dom_scheduler(d)->sched_id )
1310
0
        return -EINVAL;
1311
0
1312
0
    switch ( op->cmd )
1313
0
    {
1314
0
    case XEN_DOMCTL_SCHEDOP_putinfo:
1315
0
    case XEN_DOMCTL_SCHEDOP_getinfo:
1316
0
    case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
1317
0
    case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
1318
0
        break;
1319
0
    default:
1320
0
        return -EINVAL;
1321
0
    }
1322
0
1323
0
    /* NB: the pluggable scheduler code needs to take care
1324
0
     * of locking by itself. */
1325
0
    if ( (ret = SCHED_OP(dom_scheduler(d), adjust, d, op)) == 0 )
1326
0
        TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
1327
0
1328
0
    return ret;
1329
0
}
1330
1331
long sched_adjust_global(struct xen_sysctl_scheduler_op *op)
1332
0
{
1333
0
    struct cpupool *pool;
1334
0
    int rc;
1335
0
1336
0
    rc = xsm_sysctl_scheduler_op(XSM_HOOK, op->cmd);
1337
0
    if ( rc )
1338
0
        return rc;
1339
0
1340
0
    if ( (op->cmd != XEN_SYSCTL_SCHEDOP_putinfo) &&
1341
0
         (op->cmd != XEN_SYSCTL_SCHEDOP_getinfo) )
1342
0
        return -EINVAL;
1343
0
1344
0
    pool = cpupool_get_by_id(op->cpupool_id);
1345
0
    if ( pool == NULL )
1346
0
        return -ESRCH;
1347
0
1348
0
    rc = ((op->sched_id == pool->sched->sched_id)
1349
0
          ? SCHED_OP(pool->sched, adjust_global, op) : -EINVAL);
1350
0
1351
0
    cpupool_put(pool);
1352
0
1353
0
    return rc;
1354
0
}
1355
1356
static void vcpu_periodic_timer_work(struct vcpu *v)
1357
165k
{
1358
165k
    s_time_t now = NOW();
1359
165k
    s_time_t periodic_next_event;
1360
165k
1361
165k
    if ( v->periodic_period == 0 )
1362
165k
        return;
1363
165k
1364
271
    periodic_next_event = v->periodic_last_event + v->periodic_period;
1365
271
1366
271
    if ( now >= periodic_next_event )
1367
0
    {
1368
0
        send_timer_event(v);
1369
0
        v->periodic_last_event = now;
1370
0
        periodic_next_event = now + v->periodic_period;
1371
0
    }
1372
271
1373
271
    migrate_timer(&v->periodic_timer, smp_processor_id());
1374
271
    set_timer(&v->periodic_timer, periodic_next_event);
1375
271
}
1376
1377
/* 
1378
 * The main function
1379
 * - deschedule the current domain (scheduler independent).
1380
 * - pick a new domain (scheduler dependent).
1381
 */
1382
static void schedule(void)
1383
4.92M
{
1384
4.92M
    struct vcpu          *prev = current, *next = NULL;
1385
4.92M
    s_time_t              now;
1386
4.92M
    struct scheduler     *sched;
1387
4.92M
    unsigned long        *tasklet_work = &this_cpu(tasklet_work_to_do);
1388
4.92M
    bool_t                tasklet_work_scheduled = 0;
1389
4.92M
    struct schedule_data *sd;
1390
4.92M
    spinlock_t           *lock;
1391
4.92M
    struct task_slice     next_slice;
1392
4.92M
    int cpu = smp_processor_id();
1393
4.92M
1394
4.92M
    ASSERT_NOT_IN_ATOMIC();
1395
4.92M
1396
4.92M
    SCHED_STAT_CRANK(sched_run);
1397
4.92M
1398
4.92M
    sd = &this_cpu(schedule_data);
1399
4.92M
1400
4.92M
    /* Update tasklet scheduling status. */
1401
4.92M
    switch ( *tasklet_work )
1402
4.92M
    {
1403
44
    case TASKLET_enqueued:
1404
44
        set_bit(_TASKLET_scheduled, tasklet_work);
1405
44
        /* fallthrough */
1406
44
    case TASKLET_enqueued|TASKLET_scheduled:
1407
44
        tasklet_work_scheduled = 1;
1408
44
        break;
1409
44
    case TASKLET_scheduled:
1410
44
        clear_bit(_TASKLET_scheduled, tasklet_work);
1411
4.88M
    case 0:
1412
4.88M
        /*tasklet_work_scheduled = 0;*/
1413
4.88M
        break;
1414
0
    default:
1415
0
        BUG();
1416
4.92M
    }
1417
4.92M
1418
4.88M
    lock = pcpu_schedule_lock_irq(cpu);
1419
4.88M
1420
4.88M
    now = NOW();
1421
4.88M
1422
4.88M
    stop_timer(&sd->s_timer);
1423
4.88M
    
1424
4.88M
    /* get policy-specific decision on scheduling... */
1425
4.88M
    sched = this_cpu(scheduler);
1426
4.88M
    next_slice = sched->do_schedule(sched, now, tasklet_work_scheduled);
1427
4.88M
1428
4.88M
    next = next_slice.task;
1429
4.88M
1430
4.88M
    sd->curr = next;
1431
4.88M
1432
4.88M
    if ( next_slice.time >= 0 ) /* -ve means no limit */
1433
4.92M
        set_timer(&sd->s_timer, now + next_slice.time);
1434
4.88M
1435
4.88M
    if ( unlikely(prev == next) )
1436
4.57M
    {
1437
4.57M
        pcpu_schedule_unlock_irq(lock, cpu);
1438
4.57M
        TRACE_4D(TRC_SCHED_SWITCH_INFCONT,
1439
4.57M
                 next->domain->domain_id, next->vcpu_id,
1440
4.57M
                 now - prev->runstate.state_entry_time,
1441
4.57M
                 next_slice.time);
1442
4.57M
        trace_continue_running(next);
1443
4.57M
        return continue_running(prev);
1444
4.57M
    }
1445
4.88M
1446
314k
    TRACE_3D(TRC_SCHED_SWITCH_INFPREV,
1447
314k
             prev->domain->domain_id, prev->vcpu_id,
1448
314k
             now - prev->runstate.state_entry_time);
1449
314k
    TRACE_4D(TRC_SCHED_SWITCH_INFNEXT,
1450
314k
             next->domain->domain_id, next->vcpu_id,
1451
314k
             (next->runstate.state == RUNSTATE_runnable) ?
1452
314k
             (now - next->runstate.state_entry_time) : 0,
1453
314k
             next_slice.time);
1454
314k
1455
314k
    ASSERT(prev->runstate.state == RUNSTATE_running);
1456
314k
1457
314k
    TRACE_4D(TRC_SCHED_SWITCH,
1458
314k
             prev->domain->domain_id, prev->vcpu_id,
1459
314k
             next->domain->domain_id, next->vcpu_id);
1460
314k
1461
314k
    vcpu_runstate_change(
1462
314k
        prev,
1463
314k
        ((prev->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
1464
249k
         (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
1465
314k
        now);
1466
314k
    prev->last_run_time = now;
1467
314k
1468
314k
    ASSERT(next->runstate.state != RUNSTATE_running);
1469
314k
    vcpu_runstate_change(next, RUNSTATE_running, now);
1470
314k
1471
314k
    /*
1472
314k
     * NB. Don't add any trace records from here until the actual context
1473
314k
     * switch, else lost_records resume will not work properly.
1474
314k
     */
1475
314k
1476
314k
    ASSERT(!next->is_running);
1477
314k
    next->is_running = 1;
1478
314k
1479
314k
    pcpu_schedule_unlock_irq(lock, cpu);
1480
314k
1481
314k
    SCHED_STAT_CRANK(sched_ctx);
1482
314k
1483
314k
    stop_timer(&prev->periodic_timer);
1484
314k
1485
314k
    if ( next_slice.migrated )
1486
506
        sched_move_irqs(next);
1487
314k
1488
314k
    vcpu_periodic_timer_work(next);
1489
314k
1490
314k
    context_switch(prev, next);
1491
314k
}
1492
1493
void context_saved(struct vcpu *prev)
1494
163k
{
1495
163k
    /* Clear running flag /after/ writing context to memory. */
1496
163k
    smp_wmb();
1497
163k
1498
163k
    prev->is_running = 0;
1499
163k
1500
163k
    /* Check for migration request /after/ clearing running flag. */
1501
163k
    smp_mb();
1502
163k
1503
163k
    SCHED_OP(vcpu_scheduler(prev), context_saved, prev);
1504
163k
1505
163k
    if ( unlikely(prev->pause_flags & VPF_migrating) )
1506
65
        vcpu_migrate(prev);
1507
163k
}
1508
1509
/* The scheduler timer: force a run through the scheduler */
1510
static void s_timer_fn(void *unused)
1511
2.10k
{
1512
2.10k
    raise_softirq(SCHEDULE_SOFTIRQ);
1513
2.10k
    SCHED_STAT_CRANK(sched_irq);
1514
2.10k
}
1515
1516
/* Per-VCPU periodic timer function: sends a virtual timer interrupt. */
1517
static void vcpu_periodic_timer_fn(void *data)
1518
0
{
1519
0
    struct vcpu *v = data;
1520
0
    vcpu_periodic_timer_work(v);
1521
0
}
1522
1523
/* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */
1524
static void vcpu_singleshot_timer_fn(void *data)
1525
5.74k
{
1526
5.74k
    struct vcpu *v = data;
1527
5.74k
    send_timer_event(v);
1528
5.74k
}
1529
1530
/* SCHEDOP_poll timeout callback. */
1531
static void poll_timer_fn(void *data)
1532
0
{
1533
0
    struct vcpu *v = data;
1534
0
1535
0
    if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
1536
0
        vcpu_unblock(v);
1537
0
}
1538
1539
static int cpu_schedule_up(unsigned int cpu)
1540
12
{
1541
12
    struct schedule_data *sd = &per_cpu(schedule_data, cpu);
1542
12
    void *sched_priv;
1543
12
1544
12
    per_cpu(scheduler, cpu) = &ops;
1545
12
    spin_lock_init(&sd->_lock);
1546
12
    sd->schedule_lock = &sd->_lock;
1547
12
    sd->curr = idle_vcpu[cpu];
1548
12
    init_timer(&sd->s_timer, s_timer_fn, NULL, cpu);
1549
12
    atomic_set(&sd->urgent_count, 0);
1550
12
1551
12
    /* Boot CPU is dealt with later in scheduler_init(). */
1552
12
    if ( cpu == 0 )
1553
1
        return 0;
1554
12
1555
11
    if ( idle_vcpu[cpu] == NULL )
1556
11
        alloc_vcpu(idle_vcpu[0]->domain, cpu, cpu);
1557
11
    else
1558
0
    {
1559
0
        struct vcpu *idle = idle_vcpu[cpu];
1560
0
1561
0
        /*
1562
0
         * During (ACPI?) suspend the idle vCPU for this pCPU is not freed,
1563
0
         * while its scheduler specific data (what sched_priv points to)
1564
0
         * is. Also, at this stage of the resume path, we attach the pCPU
1565
0
         * to the default scheduler, no matter in what cpupool it was before
1566
0
         * suspend. To avoid inconsistency, let's allocate default scheduler
1567
0
         * data for the idle vCPU here. If the pCPU was in a different pool
1568
0
         * with a different scheduler, it is schedule_cpu_switch(), invoked
1569
0
         * later, that will set things up as appropriate.
1570
0
         */
1571
0
        ASSERT(idle->sched_priv == NULL);
1572
0
1573
0
        idle->sched_priv = SCHED_OP(&ops, alloc_vdata, idle,
1574
0
                                    idle->domain->sched_priv);
1575
0
        if ( idle->sched_priv == NULL )
1576
0
            return -ENOMEM;
1577
0
    }
1578
11
    if ( idle_vcpu[cpu] == NULL )
1579
0
        return -ENOMEM;
1580
11
1581
11
    /*
1582
11
     * We don't want to risk calling xfree() on an sd->sched_priv
1583
11
     * (e.g., inside free_pdata, from cpu_schedule_down() called
1584
11
     * during CPU_UP_CANCELLED) that contains an IS_ERR value.
1585
11
     */
1586
11
    sched_priv = SCHED_OP(&ops, alloc_pdata, cpu);
1587
11
    if ( IS_ERR(sched_priv) )
1588
0
        return PTR_ERR(sched_priv);
1589
11
1590
11
    sd->sched_priv = sched_priv;
1591
11
1592
11
    return 0;
1593
11
}
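cpu_schedule_up() treats the value coming back from the alloc_pdata hook in three ways: a real pointer is stored in sd->sched_priv, NULL is stored as well (nothing to allocate), and only an encoded error, recognised by IS_ERR(), aborts the bring-up via PTR_ERR(). The self-contained sketch below re-creates that error-pointer convention with illustrative helpers and constants; the real ERR_PTR()/IS_ERR()/PTR_ERR() live in xen/err.h, and example_alloc_pdata is a made-up hook, not part of the scheduler interface.

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095   /* illustrative bound for the "error pointer" range */

static inline void *ERR_PTR(long error)      { return (void *)error; }
static inline long  PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int   IS_ERR(const void *ptr)
{
    return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Made-up alloc_pdata-style hook: failure is signalled with ERR_PTR(),
 * never with NULL, so NULL can keep meaning "no per-pCPU data at all". */
static void *example_alloc_pdata(int want_failure)
{
    if ( want_failure )
        return ERR_PTR(-ENOMEM);
    return NULL;
}

int main(void)
{
    void *priv = example_alloc_pdata(1);

    if ( IS_ERR(priv) )                          /* as in cpu_schedule_up() */
        printf("alloc_pdata failed: %ld\n", PTR_ERR(priv));
    else
        printf("alloc_pdata ok (priv=%p)\n", priv);
    return 0;
}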
1594
1595
static void cpu_schedule_down(unsigned int cpu)
1596
0
{
1597
0
    struct schedule_data *sd = &per_cpu(schedule_data, cpu);
1598
0
    struct scheduler *sched = per_cpu(scheduler, cpu);
1599
0
1600
0
    SCHED_OP(sched, free_pdata, sd->sched_priv, cpu);
1601
0
    SCHED_OP(sched, free_vdata, idle_vcpu[cpu]->sched_priv);
1602
0
1603
0
    idle_vcpu[cpu]->sched_priv = NULL;
1604
0
    sd->sched_priv = NULL;
1605
0
1606
0
    kill_timer(&sd->s_timer);
1607
0
}
1608
1609
static int cpu_schedule_callback(
1610
    struct notifier_block *nfb, unsigned long action, void *hcpu)
1611
33
{
1612
33
    unsigned int cpu = (unsigned long)hcpu;
1613
33
    struct scheduler *sched = per_cpu(scheduler, cpu);
1614
33
    struct schedule_data *sd = &per_cpu(schedule_data, cpu);
1615
33
    int rc = 0;
1616
33
1617
33
    /*
1618
33
     * From the scheduler perspective, bringing up a pCPU requires
1619
33
     * allocating and initializing the per-pCPU scheduler specific data,
1620
33
     * as well as "registering" this pCPU to the scheduler (which may
1621
33
     * involve modifying some scheduler-wide data structures).
1622
33
     * This happens by calling the alloc_pdata and init_pdata hooks, in
1623
33
     * this order. A scheduler that does not need to allocate any per-pCPU
1624
33
     * data can avoid implementing alloc_pdata. init_pdata may, however, be
1625
33
     * necessary/useful in this case too (e.g., it can contain the "register
1626
33
     * the pCPU to the scheduler" part). alloc_pdata (if present) is called
1627
33
     * during CPU_UP_PREPARE. init_pdata (if present) is called during
1628
33
     * CPU_STARTING.
1629
33
     *
1630
33
     * On the other hand, at teardown, we need to reverse what has been done
1631
33
     * during initialization, and then free the per-pCPU specific data. This
1632
33
     * happens by calling the deinit_pdata and free_pdata hooks, in this
1633
33
     * order. If no per-pCPU memory was allocated, there is no need to
1634
33
     * provide an implementation of free_pdata. deinit_pdata may, however,
1635
33
     * be necessary/useful in this case too (e.g., it can undo something done
1636
33
     * on a scheduler-wide data structure during init_pdata). Both deinit_pdata
1637
33
     * and free_pdata are called during CPU_DEAD.
1638
33
     *
1639
33
     * If something goes wrong during bringup, we go to CPU_UP_CANCELED
1640
33
     * *before* having called init_pdata. In this case, as there is no
1641
33
     * initialization needing undoing, only free_pdata should be called.
1642
33
     * This means it is possible to call free_pdata just after alloc_pdata,
1643
33
     * without an init_pdata/deinit_pdata "cycle" in between the two.
1644
33
     *
1645
33
     * So, in summary, the usage pattern should look either
1646
33
     *  - alloc_pdata-->init_pdata-->deinit_pdata-->free_pdata, or
1647
33
     *  - alloc_pdata-->free_pdata.
1648
33
     */
1649
33
    switch ( action )
1650
33
    {
1651
11
    case CPU_STARTING:
1652
11
        SCHED_OP(sched, init_pdata, sd->sched_priv, cpu);
1653
11
        break;
1654
11
    case CPU_UP_PREPARE:
1655
11
        rc = cpu_schedule_up(cpu);
1656
11
        break;
1657
0
    case CPU_DEAD:
1658
0
        SCHED_OP(sched, deinit_pdata, sd->sched_priv, cpu);
1659
0
        /* Fallthrough */
1660
0
    case CPU_UP_CANCELED:
1661
0
        cpu_schedule_down(cpu);
1662
0
        break;
1663
11
    default:
1664
11
        break;
1665
33
    }
1666
33
1667
33
    return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
1668
33
}
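The long comment in cpu_schedule_callback() allows exactly two hook sequences per pCPU. The stand-alone toy below (stub hooks and a made-up toy_sched struct, not Xen's struct scheduler) simply walks both of them, which makes the pairing rule easy to check: deinit_pdata is owed only if init_pdata ran, while free_pdata always closes whatever alloc_pdata opened.

#include <stdio.h>
#include <stdlib.h>

/* Made-up stand-in for the per-pCPU hooks; not Xen's struct scheduler. */
struct toy_sched {
    void *(*alloc_pdata)(unsigned int cpu);
    void  (*init_pdata)(void *pdata, unsigned int cpu);
    void  (*deinit_pdata)(void *pdata, unsigned int cpu);
    void  (*free_pdata)(void *pdata, unsigned int cpu);
};

static void *toy_alloc(unsigned int cpu)
{
    printf("alloc_pdata(cpu%u)\n", cpu);
    return malloc(16);
}
static void toy_init(void *pd, unsigned int cpu)   { printf("init_pdata(cpu%u, %p)\n", cpu, pd); }
static void toy_deinit(void *pd, unsigned int cpu) { printf("deinit_pdata(cpu%u, %p)\n", cpu, pd); }
static void toy_free(void *pd, unsigned int cpu)   { printf("free_pdata(cpu%u)\n", cpu); free(pd); }

static const struct toy_sched toy = {
    .alloc_pdata  = toy_alloc,
    .init_pdata   = toy_init,
    .deinit_pdata = toy_deinit,
    .free_pdata   = toy_free,
};

int main(void)
{
    unsigned int cpu = 1;
    void *pd;

    /* Full hotplug cycle: alloc -> init -> deinit -> free. */
    pd = toy.alloc_pdata(cpu);      /* CPU_UP_PREPARE */
    toy.init_pdata(pd, cpu);        /* CPU_STARTING   */
    toy.deinit_pdata(pd, cpu);      /* CPU_DEAD       */
    toy.free_pdata(pd, cpu);        /* CPU_DEAD       */

    /* Cancelled bring-up: init_pdata never ran, so only free_pdata. */
    pd = toy.alloc_pdata(cpu);      /* CPU_UP_PREPARE  */
    toy.free_pdata(pd, cpu);        /* CPU_UP_CANCELED */

    return 0;
}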
1669
1670
static struct notifier_block cpu_schedule_nfb = {
1671
    .notifier_call = cpu_schedule_callback
1672
};
1673
1674
/* Initialise the data structures. */
1675
void __init scheduler_init(void)
1676
1
{
1677
1
    struct domain *idle_domain;
1678
1
    int i;
1679
1
1680
1
    open_softirq(SCHEDULE_SOFTIRQ, schedule);
1681
1
1682
6
    for ( i = 0; i < NUM_SCHEDULERS; i++)
1683
5
    {
1684
5
        if ( schedulers[i]->global_init && schedulers[i]->global_init() < 0 )
1685
0
            schedulers[i] = NULL;
1686
5
        else if ( !ops.name && !strcmp(schedulers[i]->opt_name, opt_sched) )
1687
1
            ops = *schedulers[i];
1688
5
    }
1689
1
1690
1
    if ( !ops.name )
1691
0
    {
1692
0
        printk("Could not find scheduler: %s\n", opt_sched);
1693
0
        for ( i = 0; i < NUM_SCHEDULERS; i++ )
1694
0
            if ( schedulers[i] &&
1695
0
                 !strcmp(schedulers[i]->opt_name, CONFIG_SCHED_DEFAULT) )
1696
0
            {
1697
0
                ops = *schedulers[i];
1698
0
                break;
1699
0
            }
1700
0
        BUG_ON(!ops.name);
1701
0
        printk("Using '%s' (%s)\n", ops.name, ops.opt_name);
1702
0
    }
1703
1
1704
1
    if ( cpu_schedule_up(0) )
1705
0
        BUG();
1706
1
    register_cpu_notifier(&cpu_schedule_nfb);
1707
1
1708
1
    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
1709
1
    if ( SCHED_OP(&ops, init) )
1710
0
        panic("scheduler returned error on init");
1711
1
1712
1
    if ( sched_ratelimit_us &&
1713
1
         (sched_ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX
1714
1
          || sched_ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN) )
1715
0
    {
1716
0
        printk("WARNING: sched_ratelimit_us outside of valid range [%d,%d].\n"
1717
0
               " Resetting to default %u\n",
1718
0
               XEN_SYSCTL_SCHED_RATELIMIT_MIN,
1719
0
               XEN_SYSCTL_SCHED_RATELIMIT_MAX,
1720
0
               SCHED_DEFAULT_RATELIMIT_US);
1721
0
        sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
1722
0
    }
1723
1
1724
1
    idle_domain = domain_create(DOMID_IDLE, 0, 0, NULL);
1725
1
    BUG_ON(IS_ERR(idle_domain));
1726
1
    idle_domain->vcpu = idle_vcpu;
1727
1
    idle_domain->max_vcpus = nr_cpu_ids;
1728
1
    if ( alloc_vcpu(idle_domain, 0, 0) == NULL )
1729
0
        BUG();
1730
1
    this_cpu(schedule_data).sched_priv = SCHED_OP(&ops, alloc_pdata, 0);
1731
1
    BUG_ON(IS_ERR(this_cpu(schedule_data).sched_priv));
1732
1
    SCHED_OP(&ops, init_pdata, this_cpu(schedule_data).sched_priv, 0);
1733
1
}
1734
1735
/*
1736
 * Move a pCPU outside of the influence of the scheduler of its current
1737
 * cpupool, or subject it to the scheduler of a new cpupool.
1738
 *
1739
 * For the pCPUs that are removed from their cpupool, their scheduler becomes
1740
 * &ops (the default scheduler, selected at boot, which also services the
1741
 * default cpupool). However, as these pCPUs are not really part of any pool,
1742
 * there won't be any scheduling event on them, not even from the default
1743
 * scheduler. Basically, they will just sit idle until they are explicitly
1744
 * added back to a cpupool.
1745
 */
1746
int schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
1747
12
{
1748
12
    struct vcpu *idle;
1749
12
    void *ppriv, *ppriv_old, *vpriv, *vpriv_old;
1750
12
    struct scheduler *old_ops = per_cpu(scheduler, cpu);
1751
12
    struct scheduler *new_ops = (c == NULL) ? &ops : c->sched;
1752
12
    struct cpupool *old_pool = per_cpu(cpupool, cpu);
1753
12
    spinlock_t * old_lock;
1754
12
1755
12
    /*
1756
12
     * pCPUs only move from a valid cpupool to free (i.e., out of any pool),
1757
12
     * or from free to a valid cpupool. In the former case (which happens when
1758
12
     * c is NULL), we want the CPU to have been marked as free already, as
1759
12
     * well as to not be valid for the source pool any longer, when we get to
1760
12
     * here. In the latter case (which happens when c is a valid cpupool), we
1761
12
     * want the CPU to still be marked as free, as well as to not yet be valid
1762
12
     * for the destination pool.
1763
12
     */
1764
12
    ASSERT(c != old_pool && (c != NULL || old_pool != NULL));
1765
12
    ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus));
1766
12
    ASSERT((c == NULL && !cpumask_test_cpu(cpu, old_pool->cpu_valid)) ||
1767
12
           (c != NULL && !cpumask_test_cpu(cpu, c->cpu_valid)));
1768
12
1769
12
    if ( old_ops == new_ops )
1770
12
        goto out;
1771
12
1772
12
    /*
1773
12
     * To setup the cpu for the new scheduler we need:
1774
12
     *  - a valid instance of per-CPU scheduler specific data, as it is
1775
12
     *    allocated by SCHED_OP(alloc_pdata). Note that we do not want to
1776
12
     *    initialize it yet (i.e., we are not calling SCHED_OP(init_pdata)).
1777
12
     *    That will be done by the target scheduler, in SCHED_OP(switch_sched),
1778
12
     *    in proper ordering and with locking.
1779
12
     *  - a valid instance of per-vCPU scheduler specific data, for the idle
1780
12
     *    vCPU of cpu. That is what the target scheduler will use for the
1781
12
     *    sched_priv field of the per-vCPU info of the idle domain.
1782
12
     */
1783
0
    idle = idle_vcpu[cpu];
1784
0
    ppriv = SCHED_OP(new_ops, alloc_pdata, cpu);
1785
0
    if ( IS_ERR(ppriv) )
1786
0
        return PTR_ERR(ppriv);
1787
0
    vpriv = SCHED_OP(new_ops, alloc_vdata, idle, idle->domain->sched_priv);
1788
0
    if ( vpriv == NULL )
1789
0
    {
1790
0
        SCHED_OP(new_ops, free_pdata, ppriv, cpu);
1791
0
        return -ENOMEM;
1792
0
    }
1793
0
1794
0
    SCHED_OP(old_ops, tick_suspend, cpu);
1795
0
1796
0
    /*
1797
0
     * The actual switch, including (if necessary) the rerouting of the
1798
0
     * scheduler lock to whatever new_ops prefers, needs to happen in one
1799
0
     * critical section, protected by old_ops' lock, or races are possible.
1800
0
     * It is, in fact, the lock of another scheduler that we are taking (the
1801
0
     * scheduler of the cpupool that cpu still belongs to). But that is ok
1802
0
     * as any CPU trying to schedule on this cpu will spin until we
1803
0
     * release that lock (bottom of this function). Once it gets the lock
1804
0
     * --thanks to the loop inside the *_schedule_lock() functions-- it will notice
1805
0
     * that the lock itself changed, and retry acquiring the new one (which
1806
0
     * will be the correct, remapped one, at that point).
1807
0
     */
1808
0
    old_lock = pcpu_schedule_lock_irq(cpu);
1809
0
1810
0
    vpriv_old = idle->sched_priv;
1811
0
    ppriv_old = per_cpu(schedule_data, cpu).sched_priv;
1812
0
    SCHED_OP(new_ops, switch_sched, cpu, ppriv, vpriv);
1813
0
1814
0
    /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */
1815
0
    spin_unlock_irq(old_lock);
1816
0
1817
0
    SCHED_OP(new_ops, tick_resume, cpu);
1818
0
1819
0
    SCHED_OP(old_ops, deinit_pdata, ppriv_old, cpu);
1820
0
1821
0
    SCHED_OP(old_ops, free_vdata, vpriv_old);
1822
0
    SCHED_OP(old_ops, free_pdata, ppriv_old, cpu);
1823
0
1824
12
 out:
1825
12
    per_cpu(cpupool, cpu) = c;
1826
12
    /* When a cpu is added to a pool, trigger it to go pick up some work */
1827
12
    if ( c != NULL )
1828
12
        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
1829
12
1830
12
    return 0;
1831
0
}
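The "_Not_ pcpu_schedule_unlock()" remark and the loop mentioned in the comment above come down to an acquire-then-revalidate pattern: read the per-CPU lock pointer, take that lock, then re-check the pointer, since switch_sched() may have re-routed it while we were spinning; on success, remember which lock was actually taken so that exactly that one is dropped later. Below is a simplified, self-contained model of the pattern, with pthread mutexes standing in for Xen's spinlocks and made-up names (toy_sd, toy_schedule_lock); the real *_schedule_lock() helpers are Xen's own.

#include <pthread.h>
#include <stdatomic.h>

/* Made-up stand-in for the schedule_lock indirection. */
struct toy_sd {
    _Atomic(pthread_mutex_t *) schedule_lock;   /* may be re-pointed at any time */
    pthread_mutex_t _lock;
};

static pthread_mutex_t *toy_schedule_lock(struct toy_sd *sd)
{
    for ( ; ; )
    {
        pthread_mutex_t *lock = sd->schedule_lock;   /* snapshot the pointer */

        pthread_mutex_lock(lock);
        if ( lock == sd->schedule_lock )
            return lock;             /* still the current lock: we hold it */
        pthread_mutex_unlock(lock);  /* re-routed while we waited: retry   */
    }
}

int main(void)
{
    struct toy_sd sd = { ._lock = PTHREAD_MUTEX_INITIALIZER };
    pthread_mutex_t *lock;

    sd.schedule_lock = &sd._lock;

    lock = toy_schedule_lock(&sd);
    /* ... critical section ... */
    pthread_mutex_unlock(lock);      /* drop the lock that was actually taken */
    return 0;
}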
1832
1833
struct scheduler *scheduler_get_default(void)
1834
1
{
1835
1
    return &ops;
1836
1
}
1837
1838
struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr)
1839
0
{
1840
0
    int i;
1841
0
    struct scheduler *sched;
1842
0
1843
0
    for ( i = 0; i < NUM_SCHEDULERS; i++ )
1844
0
        if ( schedulers[i] && schedulers[i]->sched_id == sched_id )
1845
0
            goto found;
1846
0
    *perr = -ENOENT;
1847
0
    return NULL;
1848
0
1849
0
 found:
1850
0
    *perr = -ENOMEM;
1851
0
    if ( (sched = xmalloc(struct scheduler)) == NULL )
1852
0
        return NULL;
1853
0
    memcpy(sched, schedulers[i], sizeof(*sched));
1854
0
    if ( (*perr = SCHED_OP(sched, init)) != 0 )
1855
0
    {
1856
0
        xfree(sched);
1857
0
        sched = NULL;
1858
0
    }
1859
0
1860
0
    return sched;
1861
0
}
1862
1863
void scheduler_free(struct scheduler *sched)
1864
0
{
1865
0
    BUG_ON(sched == &ops);
1866
0
    SCHED_OP(sched, deinit);
1867
0
    xfree(sched);
1868
0
}
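scheduler_get_default(), scheduler_alloc() and scheduler_free() let each cpupool run its own scheduler instance: scheduler_alloc() finds the template by sched_id, copies it and runs its init hook; scheduler_free() runs deinit and releases the copy; and the boot scheduler returned by scheduler_get_default() must never be handed to scheduler_free() (hence the BUG_ON above). The fragment below is a hypothetical caller, not a self-contained program: example_pool_scheduler and its error handling are made up, and the real callers live in the cpupool code.

/* Hypothetical caller fragment; assumes the declarations above are in scope. */
static int example_pool_scheduler(unsigned int sched_id)
{
    int err;
    struct scheduler *sched = scheduler_alloc(sched_id, &err);

    if ( sched == NULL )
        return err;              /* -ENOENT, -ENOMEM, or the init hook's error */

    /* ... hand sched to a new cpupool and schedule_cpu_switch() its pCPUs ... */

    scheduler_free(sched);       /* never called on scheduler_get_default() */
    return 0;
}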
1869
1870
void schedule_dump(struct cpupool *c)
1871
0
{
1872
0
    unsigned int      i;
1873
0
    struct scheduler *sched;
1874
0
    cpumask_t        *cpus;
1875
0
1876
0
    /* Locking, if necessary, must be handled within each scheduler */
1877
0
1878
0
    if ( c != NULL )
1879
0
    {
1880
0
        sched = c->sched;
1881
0
        cpus = c->cpu_valid;
1882
0
        printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
1883
0
        SCHED_OP(sched, dump_settings);
1884
0
    }
1885
0
    else
1886
0
    {
1887
0
        sched = &ops;
1888
0
        cpus = &cpupool_free_cpus;
1889
0
    }
1890
0
1891
0
    if ( sched->dump_cpu_state != NULL )
1892
0
    {
1893
0
        printk("CPUs info:\n");
1894
0
        for_each_cpu (i, cpus)
1895
0
            SCHED_OP(sched, dump_cpu_state, i);
1896
0
    }
1897
0
}
1898
1899
void sched_tick_suspend(void)
1900
1.89M
{
1901
1.89M
    struct scheduler *sched;
1902
1.89M
    unsigned int cpu = smp_processor_id();
1903
1.89M
1904
1.89M
    sched = per_cpu(scheduler, cpu);
1905
1.89M
    SCHED_OP(sched, tick_suspend, cpu);
1906
1.89M
    rcu_idle_enter(cpu);
1907
1.89M
    rcu_idle_timer_start();
1908
1.89M
}
1909
1910
void sched_tick_resume(void)
1911
1.97M
{
1912
1.97M
    struct scheduler *sched;
1913
1.97M
    unsigned int cpu = smp_processor_id();
1914
1.97M
1915
1.97M
    rcu_idle_timer_stop();
1916
1.97M
    rcu_idle_exit(cpu);
1917
1.97M
    sched = per_cpu(scheduler, cpu);
1918
1.97M
    SCHED_OP(sched, tick_resume, cpu);
1919
1.97M
}
1920
1921
void wait(void)
1922
0
{
1923
0
    schedule();
1924
0
}
1925
1926
#ifdef CONFIG_COMPAT
1927
#include "compat/schedule.c"
1928
#endif
1929
1930
#endif /* !COMPAT */
1931
1932
/*
1933
 * Local variables:
1934
 * mode: C
1935
 * c-file-style: "BSD"
1936
 * c-basic-offset: 4
1937
 * tab-width: 4
1938
 * indent-tabs-mode: nil
1939
 * End:
1940
 */