Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/common/sched_credit.c
Line
Count
Source
1
/****************************************************************************
2
 * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
3
 ****************************************************************************
4
 *
5
 *        File: common/csched_credit.c
6
 *      Author: Emmanuel Ackaouy
7
 *
8
 * Description: Credit-based SMP CPU scheduler
9
 */
10
11
#include <xen/init.h>
12
#include <xen/lib.h>
13
#include <xen/sched.h>
14
#include <xen/domain.h>
15
#include <xen/delay.h>
16
#include <xen/event.h>
17
#include <xen/time.h>
18
#include <xen/sched-if.h>
19
#include <xen/softirq.h>
20
#include <asm/atomic.h>
21
#include <asm/div64.h>
22
#include <xen/errno.h>
23
#include <xen/keyhandler.h>
24
#include <xen/trace.h>
25
#include <xen/err.h>
26
27
28
/*
29
 * Locking:
30
 * - Scheduler-lock (a.k.a. runqueue lock):
31
 *  + is per-runqueue, and there is one runqueue per-cpu;
32
 *  + serializes all runqueue manipulation operations;
33
 * - Private data lock (a.k.a. private scheduler lock):
34
 *  + serializes accesses to the scheduler global state (weight,
35
 *    credit, balance_credit, etc);
36
 *  + serializes updates to the domains' scheduling parameters.
37
 *
38
 * Ordering is "private lock always comes first":
39
 *  + if we need both locks, we must acquire the private
40
 *    scheduler lock first;
41
 *  + if we already own a runqueue lock, we must never acquire
42
 *    the private scheduler lock.
43
 */
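
Illustrative only (not part of the covered source): a minimal sketch of the ordering rule above, using the lock fields that appear later in this listing (struct csched_private's lock and the per-CPU schedule_lock); lock_order_example is a hypothetical helper, not a function from the file.

static void lock_order_example(struct csched_private *prv, unsigned int cpu)
{
    unsigned long flags;

    /* 1. Take the private (global) scheduler lock first... */
    spin_lock_irqsave(&prv->lock, flags);

    /* 2. ...and only then the per-CPU runqueue lock. Never the reverse. */
    spin_lock(per_cpu(schedule_data, cpu).schedule_lock);

    /* Both the global state (weight, credit, ...) and cpu's runqueue
     * may be manipulated here. */

    spin_unlock(per_cpu(schedule_data, cpu).schedule_lock);
    spin_unlock_irqrestore(&prv->lock, flags);
}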
44
45
/*
46
 * Basic constants
47
 */
48
1
#define CSCHED_DEFAULT_WEIGHT       256
49
1
#define CSCHED_TICKS_PER_TSLICE     3
50
/* Default timeslice: 30ms */
51
0
#define CSCHED_DEFAULT_TSLICE_MS    30
52
3.07M
#define CSCHED_CREDITS_PER_MSEC     10
53
/* Never set a timer shorter than this value. */
54
57
#define CSCHED_MIN_TIMER            XEN_SYSCTL_SCHED_RATELIMIT_MIN
55
56
57
/*
58
 * Priorities
59
 */
60
12.9k
#define CSCHED_PRI_TS_BOOST      0      /* time-share waking up */
61
76.8k
#define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
62
5.02M
#define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
63
10.1M
#define CSCHED_PRI_IDLE         -64     /* idle */
64
65
66
/*
67
 * Flags
68
 *
69
 * Note that svc->flags (where these flags live) is protected by an
70
 * inconsistent set of locks. Therefore atomic-safe bit operations must
71
 * be used for accessing it.
72
 */
73
#define CSCHED_FLAG_VCPU_PARKED    0x0  /* VCPU over capped credits */
74
#define CSCHED_FLAG_VCPU_YIELD     0x1  /* VCPU yielding */
75
#define CSCHED_FLAG_VCPU_MIGRATING 0x2  /* VCPU may have moved to a new pcpu */
76
77
78
/*
79
 * Useful macros
80
 */
81
#define CSCHED_PRIV(_ops)   \
82
6.65M
    ((struct csched_private *)((_ops)->sched_data))
83
#define CSCHED_PCPU(_c)     \
84
19.5M
    ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
85
19.0M
#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
86
0
#define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
87
9.53M
#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))
88
89
90
/*
91
 * CSCHED_STATS
92
 *
93
 * Manage very basic per-vCPU counters and stats.
94
 *
95
 * Useful for debugging live systems. The stats are displayed
96
 * with runq dumps ('r' on the Xen console).
97
 */
98
#ifdef SCHED_STATS
99
100
#define CSCHED_STATS
101
102
#define SCHED_VCPU_STATS_RESET(_V)                      \
103
    do                                                  \
104
    {                                                   \
105
        memset(&(_V)->stats, 0, sizeof((_V)->stats));   \
106
    } while ( 0 )
107
108
#define SCHED_VCPU_STAT_CRANK(_V, _X)       (((_V)->stats._X)++)
109
110
#define SCHED_VCPU_STAT_SET(_V, _X, _Y)     (((_V)->stats._X) = (_Y))
111
112
#else /* !SCHED_STATS */
113
114
#undef CSCHED_STATS
115
116
24
#define SCHED_VCPU_STATS_RESET(_V)         do {} while ( 0 )
117
2.74k
#define SCHED_VCPU_STAT_CRANK(_V, _X)      do {} while ( 0 )
118
9.30k
#define SCHED_VCPU_STAT_SET(_V, _X, _Y)    do {} while ( 0 )
119
120
#endif /* SCHED_STATS */
121
122
123
/*
124
 * Credit tracing events ("only" 512 available!). Check
125
 * include/public/trace.h for more details.
126
 */
127
#define TRC_CSCHED_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED, 1)
128
#define TRC_CSCHED_ACCOUNT_START TRC_SCHED_CLASS_EVT(CSCHED, 2)
129
#define TRC_CSCHED_ACCOUNT_STOP  TRC_SCHED_CLASS_EVT(CSCHED, 3)
130
#define TRC_CSCHED_STOLEN_VCPU   TRC_SCHED_CLASS_EVT(CSCHED, 4)
131
#define TRC_CSCHED_PICKED_CPU    TRC_SCHED_CLASS_EVT(CSCHED, 5)
132
0
#define TRC_CSCHED_TICKLE        TRC_SCHED_CLASS_EVT(CSCHED, 6)
133
#define TRC_CSCHED_BOOST_START   TRC_SCHED_CLASS_EVT(CSCHED, 7)
134
#define TRC_CSCHED_BOOST_END     TRC_SCHED_CLASS_EVT(CSCHED, 8)
135
0
#define TRC_CSCHED_SCHEDULE      TRC_SCHED_CLASS_EVT(CSCHED, 9)
136
0
#define TRC_CSCHED_RATELIMIT     TRC_SCHED_CLASS_EVT(CSCHED, 10)
137
#define TRC_CSCHED_STEAL_CHECK   TRC_SCHED_CLASS_EVT(CSCHED, 11)
138
139
/*
140
 * Boot parameters
141
 */
142
static int __read_mostly sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS;
143
integer_param("sched_credit_tslice_ms", sched_credit_tslice_ms);
144
145
/*
146
 * Physical CPU
147
 */
148
struct csched_pcpu {
149
    struct list_head runq;
150
    uint32_t runq_sort_last;
151
152
    unsigned int idle_bias;
153
    unsigned int nr_runnable;
154
155
    unsigned int tick;
156
    struct timer ticker;
157
};
158
159
/*
160
 * Virtual CPU
161
 */
162
struct csched_vcpu {
163
    struct list_head runq_elem;
164
    struct list_head active_vcpu_elem;
165
166
    /* Up-pointers */
167
    struct csched_dom *sdom;
168
    struct vcpu *vcpu;
169
170
    s_time_t start_time;   /* When we were scheduled (used for credit) */
171
    unsigned flags;
172
    int pri;
173
174
    atomic_t credit;
175
    unsigned int residual;
176
177
#ifdef CSCHED_STATS
178
    struct {
179
        int credit_last;
180
        uint32_t credit_incr;
181
        uint32_t state_active;
182
        uint32_t state_idle;
183
        uint32_t migrate_q;
184
        uint32_t migrate_r;
185
        uint32_t kicked_away;
186
    } stats;
187
#endif
188
};
189
190
/*
191
 * Domain
192
 */
193
struct csched_dom {
194
    struct list_head active_vcpu;
195
    struct list_head active_sdom_elem;
196
    struct domain *dom;
197
    uint16_t active_vcpu_count;
198
    uint16_t weight;
199
    uint16_t cap;
200
};
201
202
/*
203
 * System-wide private data
204
 */
205
struct csched_private {
206
    /* lock for the whole pluggable scheduler, nests inside cpupool_lock */
207
    spinlock_t lock;
208
209
    cpumask_var_t idlers;
210
    cpumask_var_t cpus;
211
    uint32_t *balance_bias;
212
    uint32_t runq_sort;
213
    unsigned int ratelimit_us;
214
215
    /* Period of master and tick in milliseconds */
216
    unsigned int tslice_ms, tick_period_us, ticks_per_tslice;
217
    uint32_t ncpus;
218
219
    struct list_head active_sdom;
220
    uint32_t weight;
221
    uint32_t credit;
222
    int credit_balance;
223
    unsigned int credits_per_tslice;
224
225
    unsigned int master;
226
    struct timer master_ticker;
227
};
228
229
static void csched_tick(void *_cpu);
230
static void csched_acct(void *dummy);
231
232
static inline int
233
__vcpu_on_runq(struct csched_vcpu *svc)
234
9.60M
{
235
9.60M
    return !list_empty(&svc->runq_elem);
236
9.60M
}
237
238
static inline struct csched_vcpu *
239
__runq_elem(struct list_head *elem)
240
13.6M
{
241
13.6M
    return list_entry(elem, struct csched_vcpu, runq_elem);
242
13.6M
}
243
244
/* Is the first element of cpu's runq (if any) cpu's idle vcpu? */
245
static inline bool_t is_runq_idle(unsigned int cpu)
246
10.4k
{
247
10.4k
    /*
248
10.4k
     * We're peeking at cpu's runq, we must hold the proper lock.
249
10.4k
     */
250
10.4k
    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
251
10.4k
252
10.4k
    return list_empty(RUNQ(cpu)) ||
253
10.3k
           is_idle_vcpu(__runq_elem(RUNQ(cpu)->next)->vcpu);
254
10.4k
}
255
256
static inline void
257
inc_nr_runnable(unsigned int cpu)
258
67.3k
{
259
67.3k
    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
260
67.3k
    CSCHED_PCPU(cpu)->nr_runnable++;
261
67.3k
262
67.3k
}
263
264
static inline void
265
dec_nr_runnable(unsigned int cpu)
266
65.5k
{
267
65.5k
    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
268
65.5k
    ASSERT(CSCHED_PCPU(cpu)->nr_runnable >= 1);
269
65.5k
    CSCHED_PCPU(cpu)->nr_runnable--;
270
65.5k
}
271
272
static inline void
273
__runq_insert(struct csched_vcpu *svc)
274
4.81M
{
275
4.81M
    unsigned int cpu = svc->vcpu->processor;
276
4.81M
    const struct list_head * const runq = RUNQ(cpu);
277
4.81M
    struct list_head *iter;
278
4.81M
279
4.81M
    BUG_ON( __vcpu_on_runq(svc) );
280
4.81M
281
4.81M
    list_for_each( iter, runq )
282
4.66M
    {
283
4.66M
        const struct csched_vcpu * const iter_svc = __runq_elem(iter);
284
4.66M
        if ( svc->pri > iter_svc->pri )
285
4.56M
            break;
286
4.66M
    }
287
4.81M
288
4.81M
    /* If the vcpu yielded, try to put it behind one lower-priority
289
4.81M
     * runnable vcpu if we can.  The next runq_sort will bring it forward
290
4.81M
     * within 30ms if the queue is too long. */
291
4.81M
    if ( test_bit(CSCHED_FLAG_VCPU_YIELD, &svc->flags)
292
4.63M
         && __runq_elem(iter)->pri > CSCHED_PRI_IDLE )
293
600
    {
294
600
        iter=iter->next;
295
600
296
600
        /* Some sanity checks */
297
600
        BUG_ON(iter == runq);
298
600
    }
299
4.81M
300
4.81M
    list_add_tail(&svc->runq_elem, iter);
301
4.81M
}
302
303
static inline void
304
runq_insert(struct csched_vcpu *svc)
305
66.8k
{
306
66.8k
    __runq_insert(svc);
307
66.8k
    inc_nr_runnable(svc->vcpu->processor);
308
66.8k
}
309
310
static inline void
311
__runq_remove(struct csched_vcpu *svc)
312
4.85M
{
313
4.85M
    BUG_ON( !__vcpu_on_runq(svc) );
314
4.85M
    list_del_init(&svc->runq_elem);
315
4.85M
}
316
317
static inline void
318
runq_remove(struct csched_vcpu *svc)
319
507
{
320
507
    dec_nr_runnable(svc->vcpu->processor);
321
507
    __runq_remove(svc);
322
507
}
323
324
static void burn_credits(struct csched_vcpu *svc, s_time_t now)
325
4.64M
{
326
4.64M
    s_time_t delta;
327
4.64M
    uint64_t val;
328
4.64M
    unsigned int credits;
329
4.64M
330
4.64M
    /* Assert svc is current */
331
4.64M
    ASSERT( svc == CSCHED_VCPU(curr_on_cpu(svc->vcpu->processor)) );
332
4.64M
333
4.64M
    if ( (delta = now - svc->start_time) <= 0 )
334
3.10M
        return;
335
4.64M
336
1.53M
    val = delta * CSCHED_CREDITS_PER_MSEC + svc->residual;
337
1.53M
    svc->residual = do_div(val, MILLISECS(1));
338
1.53M
    credits = val;
339
1.53M
    ASSERT(credits == val); /* make sure we haven't truncated val */
340
1.53M
    atomic_sub(credits, &svc->credit);
341
1.53M
    svc->start_time += (credits * MILLISECS(1)) / CSCHED_CREDITS_PER_MSEC;
342
1.53M
}
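
As a standalone sketch of the arithmetic above (illustrative, not part of the covered file; plain stdint types replace Xen's s_time_t and do_div): with CSCHED_CREDITS_PER_MSEC == 10, a vcpu is charged 10 credits per millisecond it has run, and the sub-credit remainder is kept in residual so no runtime is lost to rounding.

#include <stdint.h>

/* credits_burned: hypothetical helper mirroring the charge computed above.
 * delta_ns is the runtime since start_time, in nanoseconds. */
static unsigned int credits_burned(uint64_t delta_ns, unsigned int *residual)
{
    uint64_t val = delta_ns * 10 + *residual;   /* CSCHED_CREDITS_PER_MSEC */

    *residual = (unsigned int)(val % 1000000);  /* leftover ns, < 1 credit */
    return (unsigned int)(val / 1000000);       /* whole credits to subtract */
}

/* Example: delta_ns = 2,550,000 (2.55ms), residual = 0
 *          -> returns 25 credits, residual becomes 500,000. */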
343
344
static bool_t __read_mostly opt_tickle_one_idle = 1;
345
boolean_param("tickle_one_idle_cpu", opt_tickle_one_idle);
346
347
DEFINE_PER_CPU(unsigned int, last_tickle_cpu);
348
349
static inline void __runq_tickle(struct csched_vcpu *new)
350
66.8k
{
351
66.8k
    unsigned int cpu = new->vcpu->processor;
352
66.8k
    struct csched_vcpu * const cur = CSCHED_VCPU(curr_on_cpu(cpu));
353
66.8k
    struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
354
66.8k
    cpumask_t mask, idle_mask, *online;
355
66.8k
    int balance_step, idlers_empty;
356
66.8k
357
66.8k
    ASSERT(cur);
358
66.8k
    cpumask_clear(&mask);
359
66.8k
360
66.8k
    online = cpupool_domain_cpumask(new->sdom->dom);
361
66.8k
    cpumask_and(&idle_mask, prv->idlers, online);
362
66.8k
    idlers_empty = cpumask_empty(&idle_mask);
363
66.8k
364
66.8k
    /*
365
66.8k
     * If the pcpu is idle, or there are no idlers and the new
366
66.8k
     * vcpu is a higher priority than the old vcpu, run it here.
367
66.8k
     *
368
66.8k
     * If there are idle cpus, first try to find one suitable to run
369
66.8k
     * new, so we can avoid preempting cur.  If we cannot find a
370
66.8k
     * suitable idler on which to run new, run it here, but try to
371
66.8k
     * find a suitable idler on which to run cur instead.
372
66.8k
     */
373
66.8k
    if ( cur->pri == CSCHED_PRI_IDLE
374
196
         || (idlers_empty && new->pri > cur->pri) )
375
66.6k
    {
376
66.6k
        if ( cur->pri != CSCHED_PRI_IDLE )
377
0
            SCHED_STAT_CRANK(tickled_busy_cpu);
378
66.6k
        else
379
66.6k
            SCHED_STAT_CRANK(tickled_idle_cpu);
380
66.6k
        __cpumask_set_cpu(cpu, &mask);
381
66.6k
    }
382
184
    else if ( !idlers_empty )
383
194
    {
384
194
        /*
385
194
         * Soft and hard affinity balancing loop. For vcpus without
386
194
         * a useful soft affinity, consider hard affinity only.
387
194
         */
388
194
        for_each_affinity_balance_step( balance_step )
389
388
        {
390
388
            int new_idlers_empty;
391
388
392
388
            if ( balance_step == BALANCE_SOFT_AFFINITY
393
194
                 && !has_soft_affinity(new->vcpu,
394
194
                                       new->vcpu->cpu_hard_affinity) )
395
194
                continue;
396
388
397
388
            /* Are there idlers suitable for new (for this balance step)? */
398
194
            affinity_balance_cpumask(new->vcpu, balance_step,
399
194
                                     cpumask_scratch_cpu(cpu));
400
194
            cpumask_and(cpumask_scratch_cpu(cpu),
401
194
                        cpumask_scratch_cpu(cpu), &idle_mask);
402
194
            new_idlers_empty = cpumask_empty(cpumask_scratch_cpu(cpu));
403
194
404
194
            /*
405
194
             * Let's not be too harsh! If there aren't idlers suitable
406
194
             * for new in its soft affinity mask, make sure we check its
407
194
             * hard affinity as well, before taking final decisions.
408
194
             */
409
194
            if ( new_idlers_empty
410
0
                 && balance_step == BALANCE_SOFT_AFFINITY )
411
0
                continue;
412
194
413
194
            /*
414
194
             * If there are no suitable idlers for new, and it's higher
415
194
             * priority than cur, check whether we can migrate cur away.
416
194
             * We have to do it indirectly, via _VPF_migrating (instead
417
194
             * of just tickling any idler suitable for cur) because cur
418
194
             * is running.
419
194
             *
420
194
             * If there are suitable idlers for new, no matter priorities,
421
194
             * leave cur alone (as it is running and is, likely, cache-hot)
422
194
             * and wake some of them (which is waking up and so is, likely,
423
194
             * cache cold anyway).
424
194
             */
425
194
            if ( new_idlers_empty && new->pri > cur->pri )
426
0
            {
427
0
                if ( cpumask_intersects(cur->vcpu->cpu_hard_affinity,
428
0
                                        &idle_mask) )
429
0
                {
430
0
                    SCHED_VCPU_STAT_CRANK(cur, kicked_away);
431
0
                    SCHED_VCPU_STAT_CRANK(cur, migrate_r);
432
0
                    SCHED_STAT_CRANK(migrate_kicked_away);
433
0
                    set_bit(_VPF_migrating, &cur->vcpu->pause_flags);
434
0
                }
435
0
                /* Tickle cpu anyway, to let new preempt cur. */
436
0
                SCHED_STAT_CRANK(tickled_busy_cpu);
437
0
                __cpumask_set_cpu(cpu, &mask);
438
0
            }
439
194
            else if ( !new_idlers_empty )
440
194
            {
441
194
                /* Which of the idlers suitable for new shall we wake up? */
442
194
                SCHED_STAT_CRANK(tickled_idle_cpu);
443
194
                if ( opt_tickle_one_idle )
444
194
                {
445
194
                    this_cpu(last_tickle_cpu) =
446
194
                        cpumask_cycle(this_cpu(last_tickle_cpu),
447
194
                                      cpumask_scratch_cpu(cpu));
448
194
                    __cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask);
449
194
                }
450
194
                else
451
0
                    cpumask_or(&mask, &mask, cpumask_scratch_cpu(cpu));
452
194
            }
453
194
454
194
            /* Did we find anyone? */
455
194
            if ( !cpumask_empty(&mask) )
456
194
                break;
457
194
        }
458
194
    }
459
66.8k
460
66.8k
    if ( !cpumask_empty(&mask) )
461
66.8k
    {
462
66.8k
        if ( unlikely(tb_init_done) )
463
0
        {
464
0
            /* Avoid TRACE_*: saves checking !tb_init_done each step */
465
0
            for_each_cpu(cpu, &mask)
466
0
                __trace_var(TRC_CSCHED_TICKLE, 1, sizeof(cpu), &cpu);
467
0
        }
468
66.8k
469
66.8k
        /*
470
66.8k
         * Mark the designated CPUs as busy and send them all the scheduler
471
66.8k
         * interrupt. We need the for_each_cpu for dealing with the
472
66.8k
         * !opt_tickle_one_idle case. We must use cpumask_clear_cpu() and
473
66.8k
         * can't use cpumask_andnot(), because prv->idlers needs atomic access.
474
66.8k
         *
475
66.8k
         * In the default (and most common) case, when opt_tickle_one_idle is
476
66.8k
         * true, the loop does only one step, and only one bit is cleared.
477
66.8k
         */
478
66.8k
        for_each_cpu(cpu, &mask)
479
66.8k
            cpumask_clear_cpu(cpu, prv->idlers);
480
66.8k
        cpumask_raise_softirq(&mask, SCHEDULE_SOFTIRQ);
481
66.8k
    }
482
66.8k
    else
483
18.4E
        SCHED_STAT_CRANK(tickled_no_cpu);
484
66.8k
}
485
486
static void
487
csched_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
488
0
{
489
0
    struct csched_private *prv = CSCHED_PRIV(ops);
490
0
491
0
    /*
492
0
     * pcpu either points to a valid struct csched_pcpu, or is NULL, if we're
493
0
     * being called from CPU_UP_CANCELLED, because bringing up a pCPU failed
494
0
     * very early. xfree() does not really mind, but we want to be sure that,
495
0
     * when we get here, either init_pdata has never been called, or
496
0
     * deinit_pdata has been called already.
497
0
     */
498
0
    ASSERT(!cpumask_test_cpu(cpu, prv->cpus));
499
0
500
0
    xfree(pcpu);
501
0
}
502
503
static void
504
csched_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
505
0
{
506
0
    struct csched_private *prv = CSCHED_PRIV(ops);
507
0
    struct csched_pcpu *spc = pcpu;
508
0
    unsigned int node = cpu_to_node(cpu);
509
0
    unsigned long flags;
510
0
511
0
    /*
512
0
     * Scheduler specific data for this pCPU must still be there and be
513
0
     * valid. In fact, if we are here:
514
0
     *  1. alloc_pdata must have been called for this cpu, and free_pdata
515
0
     *     must not have been called on it before us,
516
0
     *  2. init_pdata must have been called on this cpu, and deinit_pdata
517
0
     *     (us!) must not have been called on it already.
518
0
     */
519
0
    ASSERT(spc && cpumask_test_cpu(cpu, prv->cpus));
520
0
521
0
    spin_lock_irqsave(&prv->lock, flags);
522
0
523
0
    prv->credit -= prv->credits_per_tslice;
524
0
    prv->ncpus--;
525
0
    cpumask_clear_cpu(cpu, prv->idlers);
526
0
    cpumask_clear_cpu(cpu, prv->cpus);
527
0
    if ( (prv->master == cpu) && (prv->ncpus > 0) )
528
0
    {
529
0
        prv->master = cpumask_first(prv->cpus);
530
0
        migrate_timer(&prv->master_ticker, prv->master);
531
0
    }
532
0
    if ( prv->balance_bias[node] == cpu )
533
0
    {
534
0
        cpumask_and(cpumask_scratch, prv->cpus, &node_to_cpumask(node));
535
0
        if ( !cpumask_empty(cpumask_scratch) )
536
0
            prv->balance_bias[node] =  cpumask_first(cpumask_scratch);
537
0
    }
538
0
    kill_timer(&spc->ticker);
539
0
    if ( prv->ncpus == 0 )
540
0
        kill_timer(&prv->master_ticker);
541
0
542
0
    spin_unlock_irqrestore(&prv->lock, flags);
543
0
}
544
545
static void *
546
csched_alloc_pdata(const struct scheduler *ops, int cpu)
547
12
{
548
12
    struct csched_pcpu *spc;
549
12
550
12
    /* Allocate per-PCPU info */
551
12
    spc = xzalloc(struct csched_pcpu);
552
12
    if ( spc == NULL )
553
0
        return ERR_PTR(-ENOMEM);
554
12
555
12
    return spc;
556
12
}
557
558
static void
559
init_pdata(struct csched_private *prv, struct csched_pcpu *spc, int cpu)
560
12
{
561
12
    ASSERT(spin_is_locked(&prv->lock));
562
12
    /* cpu data needs to be allocated, but STILL uninitialized. */
563
12
    ASSERT(spc && spc->runq.next == NULL && spc->runq.prev == NULL);
564
12
565
12
    /* Initialize/update system-wide config */
566
12
    prv->credit += prv->credits_per_tslice;
567
12
    prv->ncpus++;
568
12
    cpumask_set_cpu(cpu, prv->cpus);
569
12
    if ( prv->ncpus == 1 )
570
1
    {
571
1
        prv->master = cpu;
572
1
        init_timer(&prv->master_ticker, csched_acct, prv, cpu);
573
1
        set_timer(&prv->master_ticker,
574
1
                  NOW() + MILLISECS(prv->tslice_ms));
575
1
    }
576
12
577
12
    cpumask_and(cpumask_scratch, prv->cpus, &node_to_cpumask(cpu_to_node(cpu)));
578
12
    if ( cpumask_weight(cpumask_scratch) == 1 )
579
1
        prv->balance_bias[cpu_to_node(cpu)] = cpu;
580
12
581
12
    init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu);
582
12
    set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) );
583
12
584
12
    INIT_LIST_HEAD(&spc->runq);
585
12
    spc->runq_sort_last = prv->runq_sort;
586
12
    spc->idle_bias = nr_cpu_ids - 1;
587
12
588
12
    /* Start off idling... */
589
12
    BUG_ON(!is_idle_vcpu(curr_on_cpu(cpu)));
590
12
    cpumask_set_cpu(cpu, prv->idlers);
591
12
    spc->nr_runnable = 0;
592
12
}
593
594
static void
595
csched_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
596
12
{
597
12
    unsigned long flags;
598
12
    struct csched_private *prv = CSCHED_PRIV(ops);
599
12
    struct schedule_data *sd = &per_cpu(schedule_data, cpu);
600
12
601
12
    /*
602
12
     * This is called either during boot, resume or hotplug, in
603
12
     * case Credit1 is the scheduler chosen at boot. In such cases, the
604
12
     * scheduler lock for cpu is already pointing to the default per-cpu
605
12
     * spinlock, as Credit1 needs it, so there is no remapping to be done.
606
12
     */
607
12
    ASSERT(sd->schedule_lock == &sd->_lock && !spin_is_locked(&sd->_lock));
608
12
609
12
    spin_lock_irqsave(&prv->lock, flags);
610
12
    init_pdata(prv, pdata, cpu);
611
12
    spin_unlock_irqrestore(&prv->lock, flags);
612
12
}
613
614
/* Change the scheduler of cpu to us (Credit). */
615
static void
616
csched_switch_sched(struct scheduler *new_ops, unsigned int cpu,
617
                    void *pdata, void *vdata)
618
0
{
619
0
    struct schedule_data *sd = &per_cpu(schedule_data, cpu);
620
0
    struct csched_private *prv = CSCHED_PRIV(new_ops);
621
0
    struct csched_vcpu *svc = vdata;
622
0
623
0
    ASSERT(svc && is_idle_vcpu(svc->vcpu));
624
0
625
0
    idle_vcpu[cpu]->sched_priv = vdata;
626
0
627
0
    /*
628
0
     * We are holding the runqueue lock already (it's been taken in
629
0
     * schedule_cpu_switch()). It actually may or may not be the 'right'
630
0
     * one for this cpu, but that is ok for preventing races.
631
0
     */
632
0
    ASSERT(!local_irq_is_enabled());
633
0
    spin_lock(&prv->lock);
634
0
    init_pdata(prv, pdata, cpu);
635
0
    spin_unlock(&prv->lock);
636
0
637
0
    per_cpu(scheduler, cpu) = new_ops;
638
0
    per_cpu(schedule_data, cpu).sched_priv = pdata;
639
0
640
0
    /*
641
0
     * (Re?)route the lock to the per pCPU lock as /last/ thing. In fact,
642
0
     * if it is free (and it can be) we want anyone who manages to take
643
0
     * it to find all the initializations we've done above in place.
644
0
     */
645
0
    smp_mb();
646
0
    sd->schedule_lock = &sd->_lock;
647
0
}
648
649
#ifndef NDEBUG
650
static inline void
651
__csched_vcpu_check(struct vcpu *vc)
652
9.51M
{
653
9.51M
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
654
9.51M
    struct csched_dom * const sdom = svc->sdom;
655
9.51M
656
9.51M
    BUG_ON( svc->vcpu != vc );
657
9.51M
    BUG_ON( sdom != CSCHED_DOM(vc->domain) );
658
9.51M
    if ( sdom )
659
9.40M
    {
660
9.40M
        BUG_ON( is_idle_vcpu(vc) );
661
9.40M
        BUG_ON( sdom->dom != vc->domain );
662
9.40M
    }
663
9.51M
    else
664
110k
    {
665
110k
        BUG_ON( !is_idle_vcpu(vc) );
666
110k
    }
667
9.51M
668
9.51M
    SCHED_STAT_CRANK(vcpu_check);
669
9.51M
}
670
9.68M
#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
671
#else
672
#define CSCHED_VCPU_CHECK(_vc)
673
#endif
674
675
/*
676
 * Delay, in microseconds, between migrations of a VCPU between PCPUs.
677
 * This prevents rapid fluttering of a VCPU between CPUs, and reduces the
678
 * implicit overheads such as cache-warming. 1ms (1000) has been measured
679
 * as a good value.
680
 */
681
static unsigned int vcpu_migration_delay;
682
integer_param("vcpu_migration_delay", vcpu_migration_delay);
683
684
void set_vcpu_migration_delay(unsigned int delay)
685
0
{
686
0
    vcpu_migration_delay = delay;
687
0
}
688
689
unsigned int get_vcpu_migration_delay(void)
690
0
{
691
0
    return vcpu_migration_delay;
692
0
}
693
694
static inline int
695
__csched_vcpu_is_cache_hot(struct vcpu *v)
696
506
{
697
506
    int hot = ((NOW() - v->last_run_time) <
698
506
               ((uint64_t)vcpu_migration_delay * 1000u));
699
506
700
506
    if ( hot )
701
0
        SCHED_STAT_CRANK(vcpu_hot);
702
506
703
506
    return hot;
704
506
}
705
706
static inline int
707
__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu, cpumask_t *mask)
708
506
{
709
506
    /*
710
506
     * Don't pick up work that's hot on peer PCPU, or that can't (or
711
506
     * would prefer not to) run on cpu.
712
506
     *
713
506
     * The caller is supposed to have already checked that vc is also
714
506
     * not running.
715
506
     */
716
506
    ASSERT(!vc->is_running);
717
506
718
506
    return !__csched_vcpu_is_cache_hot(vc) &&
719
506
           cpumask_test_cpu(dest_cpu, mask);
720
506
}
721
722
static int
723
_csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc, bool_t commit)
724
10.2k
{
725
10.2k
    cpumask_t cpus;
726
10.2k
    cpumask_t idlers;
727
10.2k
    cpumask_t *online;
728
10.2k
    struct csched_pcpu *spc = NULL;
729
10.2k
    int cpu = vc->processor;
730
10.2k
    int balance_step;
731
10.2k
732
10.2k
    /* Store in cpus the mask of online cpus on which the domain can run */
733
10.2k
    online = cpupool_domain_cpumask(vc->domain);
734
10.2k
    cpumask_and(&cpus, vc->cpu_hard_affinity, online);
735
10.2k
736
10.2k
    for_each_affinity_balance_step( balance_step )
737
20.1k
    {
738
20.1k
        /*
739
20.1k
         * We want to pick up a pcpu among the ones that are online and
740
20.1k
         * can accommodate vc, which is basically what we computed above
741
20.1k
         * and stored in cpus. As far as hard affinity is concerned,
742
20.1k
         * there always will be at least one of these pcpus, hence cpus
743
20.1k
         * is never empty and the calls to cpumask_cycle() and
744
20.1k
         * cpumask_test_cpu() below are ok.
745
20.1k
         *
746
20.1k
         * On the other hand, when considering soft affinity too, it
747
20.1k
         * is possible for the mask to become empty (for instance, if the
748
20.1k
         * domain has been put in a cpupool that does not contain any of the
749
20.1k
         * pcpus in its soft affinity), which would result in the ASSERT()-s
750
20.1k
         * inside cpumask_*() operations triggering (in debug builds).
751
20.1k
         *
752
20.1k
         * Therefore, in this case, we filter the soft affinity mask against
753
20.1k
         * cpus and, if the result is empty, we just skip the soft affinity
754
20.1k
         * balancing step all together.
755
20.1k
         */
756
20.1k
        if ( balance_step == BALANCE_SOFT_AFFINITY
757
10.2k
             && !has_soft_affinity(vc, &cpus) )
758
10.1k
            continue;
759
20.1k
760
20.1k
        /* Pick an online CPU from the proper affinity mask */
761
10.0k
        affinity_balance_cpumask(vc, balance_step, &cpus);
762
10.0k
        cpumask_and(&cpus, &cpus, online);
763
10.0k
764
10.0k
        /* If present, prefer vc's current processor */
765
10.0k
        cpu = cpumask_test_cpu(vc->processor, &cpus)
766
10.3k
                ? vc->processor
767
18.4E
                : cpumask_cycle(vc->processor, &cpus);
768
10.0k
        ASSERT(cpumask_test_cpu(cpu, &cpus));
769
10.0k
770
10.0k
        /*
771
10.0k
         * Try to find an idle processor within the above constraints.
772
10.0k
         *
773
10.0k
         * In multi-core and multi-threaded CPUs, not all idle execution
774
10.0k
         * vehicles are equal!
775
10.0k
         *
776
10.0k
         * We give preference to the idle execution vehicle with the most
777
10.0k
         * idling neighbours in its grouping. This distributes work across
778
10.0k
         * distinct cores first and guarantees we don't do something stupid
779
10.0k
         * like run two VCPUs on co-hyperthreads while there are idle cores
780
10.0k
         * or sockets.
781
10.0k
         *
782
10.0k
         * Notice that, when computing the "idleness" of cpu, we may want to
783
10.0k
         * discount vc. That is, iff vc is the currently running and the only
784
10.0k
         * runnable vcpu on cpu, we add cpu to the idlers.
785
10.0k
         */
786
10.0k
        cpumask_and(&idlers, &cpu_online_map, CSCHED_PRIV(ops)->idlers);
787
10.4k
        if ( vc->processor == cpu && is_runq_idle(cpu) )
788
9.52k
            __cpumask_set_cpu(cpu, &idlers);
789
10.0k
        cpumask_and(&cpus, &cpus, &idlers);
790
10.0k
791
10.0k
        /*
792
10.0k
         * It is important that cpu points to an idle processor, if a suitable
793
10.0k
         * one exists (and we can use cpus to check and, possibly, choose a new
794
10.0k
         * CPU, as we just &&-ed it with idlers). In fact, if we are on SMT, and
795
10.0k
         * cpu points to a busy thread with an idle sibling, both the threads
796
10.0k
         * will be considered the same, from the "idleness" calculation point
797
10.0k
         * of view, preventing vcpu from being moved to the thread that is
798
10.0k
         * actually idle.
799
10.0k
         *
800
10.0k
         * Notice that cpumask_test_cpu() is quicker than cpumask_empty(), so
801
10.0k
         * we check for it first.
802
10.0k
         */
803
10.0k
        if ( !cpumask_test_cpu(cpu, &cpus) && !cpumask_empty(&cpus) )
804
31
            cpu = cpumask_cycle(cpu, &cpus);
805
10.0k
        __cpumask_clear_cpu(cpu, &cpus);
806
10.0k
807
30.7k
        while ( !cpumask_empty(&cpus) )
808
20.7k
        {
809
20.7k
            cpumask_t cpu_idlers;
810
20.7k
            cpumask_t nxt_idlers;
811
20.7k
            int nxt, weight_cpu, weight_nxt;
812
20.7k
            int migrate_factor;
813
20.7k
814
20.7k
            nxt = cpumask_cycle(cpu, &cpus);
815
20.7k
816
20.7k
            if ( cpumask_test_cpu(cpu, per_cpu(cpu_core_mask, nxt)) )
817
20.7k
            {
818
20.7k
                /* We're on the same socket, so check the busy-ness of threads.
819
20.7k
                 * Migrate whenever there are fewer idlers here, even by one. */
820
20.7k
                ASSERT( cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) );
821
20.7k
                migrate_factor = 1;
822
20.7k
                cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_sibling_mask,
823
20.7k
                            cpu));
824
20.7k
                cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_sibling_mask,
825
20.7k
                            nxt));
826
20.7k
            }
827
20.7k
            else
828
18.4E
            {
829
18.4E
                /* We're on different sockets, so check the busy-ness of cores.
830
18.4E
                 * Migrate only if the other core is twice as idle */
831
18.4E
                ASSERT( !cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) );
832
18.4E
                migrate_factor = 2;
833
18.4E
                cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_core_mask, cpu));
834
18.4E
                cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_core_mask, nxt));
835
18.4E
            }
836
20.7k
837
20.7k
            weight_cpu = cpumask_weight(&cpu_idlers);
838
20.7k
            weight_nxt = cpumask_weight(&nxt_idlers);
839
20.7k
            /* smt_power_savings: consolidate work rather than spreading it */
840
20.7k
            if ( sched_smt_power_savings ?
841
0
                 weight_cpu > weight_nxt :
842
20.7k
                 weight_cpu * migrate_factor < weight_nxt )
843
64
            {
844
64
                cpumask_and(&nxt_idlers, &cpus, &nxt_idlers);
845
64
                spc = CSCHED_PCPU(nxt);
846
64
                cpu = cpumask_cycle(spc->idle_bias, &nxt_idlers);
847
64
                cpumask_andnot(&cpus, &cpus, per_cpu(cpu_sibling_mask, cpu));
848
64
            }
849
20.7k
            else
850
20.6k
            {
851
20.6k
                cpumask_andnot(&cpus, &cpus, &nxt_idlers);
852
20.6k
            }
853
20.7k
        }
854
10.0k
855
10.0k
        /* Stop if cpu is idle */
856
10.0k
        if ( cpumask_test_cpu(cpu, &idlers) )
857
9.78k
            break;
858
10.0k
    }
859
10.2k
860
10.2k
    if ( commit && spc )
861
30
       spc->idle_bias = cpu;
862
10.2k
863
10.2k
    TRACE_3D(TRC_CSCHED_PICKED_CPU, vc->domain->domain_id, vc->vcpu_id, cpu);
864
10.2k
865
10.2k
    return cpu;
866
10.2k
}
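
A worked example of the idler-weighting loop above (numbers are illustrative): when cpu and nxt are sibling threads on the same socket, migrate_factor is 1, so the vcpu moves as soon as nxt's sibling set has strictly more idle threads (e.g. 0 here vs 1 there). Across sockets, migrate_factor is 2, so with 1 idle core around cpu the vcpu only moves if the other socket offers at least 3 idle cores (1 * 2 < 3), which biases against cross-socket migration and its cache-warming cost.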
867
868
static int
869
csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc)
870
78
{
871
78
    struct csched_vcpu *svc = CSCHED_VCPU(vc);
872
78
873
78
    /*
874
78
     * We have been called by vcpu_migrate() (in schedule.c), as part
875
78
     * of the process of seeing if vc can be migrated to another pcpu.
876
78
     * We make a note about this in svc->flags so that later, in
877
78
     * csched_vcpu_wake() (still called from vcpu_migrate()) we won't
878
78
     * get boosted, which we don't deserve as we are "only" migrating.
879
78
     */
880
78
    set_bit(CSCHED_FLAG_VCPU_MIGRATING, &svc->flags);
881
78
    return _csched_cpu_pick(ops, vc, 1);
882
78
}
883
884
static inline void
885
__csched_vcpu_acct_start(struct csched_private *prv, struct csched_vcpu *svc)
886
1.08k
{
887
1.08k
    struct csched_dom * const sdom = svc->sdom;
888
1.08k
    unsigned long flags;
889
1.08k
890
1.08k
    spin_lock_irqsave(&prv->lock, flags);
891
1.08k
892
1.08k
    if ( list_empty(&svc->active_vcpu_elem) )
893
1.08k
    {
894
1.08k
        SCHED_VCPU_STAT_CRANK(svc, state_active);
895
1.08k
        SCHED_STAT_CRANK(acct_vcpu_active);
896
1.08k
897
1.08k
        sdom->active_vcpu_count++;
898
1.08k
        list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
899
1.08k
        /* Make weight per-vcpu */
900
1.08k
        prv->weight += sdom->weight;
901
1.08k
        if ( list_empty(&sdom->active_sdom_elem) )
902
171
        {
903
171
            list_add(&sdom->active_sdom_elem, &prv->active_sdom);
904
171
        }
905
1.08k
    }
906
1.08k
907
1.08k
    TRACE_3D(TRC_CSCHED_ACCOUNT_START, sdom->dom->domain_id,
908
1.08k
             svc->vcpu->vcpu_id, sdom->active_vcpu_count);
909
1.08k
910
1.08k
    spin_unlock_irqrestore(&prv->lock, flags);
911
1.08k
}
912
913
static inline void
914
__csched_vcpu_acct_stop_locked(struct csched_private *prv,
915
    struct csched_vcpu *svc)
916
1.08k
{
917
1.08k
    struct csched_dom * const sdom = svc->sdom;
918
1.08k
919
1.08k
    BUG_ON( list_empty(&svc->active_vcpu_elem) );
920
1.08k
921
1.08k
    SCHED_VCPU_STAT_CRANK(svc, state_idle);
922
1.08k
    SCHED_STAT_CRANK(acct_vcpu_idle);
923
1.08k
924
1.08k
    BUG_ON( prv->weight < sdom->weight );
925
1.08k
    sdom->active_vcpu_count--;
926
1.08k
    list_del_init(&svc->active_vcpu_elem);
927
1.08k
    prv->weight -= sdom->weight;
928
1.08k
    if ( list_empty(&sdom->active_vcpu) )
929
170
    {
930
170
        list_del_init(&sdom->active_sdom_elem);
931
170
    }
932
1.08k
933
1.08k
    TRACE_3D(TRC_CSCHED_ACCOUNT_STOP, sdom->dom->domain_id,
934
1.08k
             svc->vcpu->vcpu_id, sdom->active_vcpu_count);
935
1.08k
}
936
937
static void
938
csched_vcpu_acct(struct csched_private *prv, unsigned int cpu)
939
10.3k
{
940
10.3k
    struct csched_vcpu * const svc = CSCHED_VCPU(current);
941
10.3k
    const struct scheduler *ops = per_cpu(scheduler, cpu);
942
10.3k
943
10.3k
    ASSERT( current->processor == cpu );
944
10.3k
    ASSERT( svc->sdom != NULL );
945
10.3k
    ASSERT( !is_idle_vcpu(svc->vcpu) );
946
10.3k
947
10.3k
    /*
948
10.3k
     * If this VCPU's priority was boosted when it last awoke, reset it.
949
10.3k
     * If the VCPU is found here, then it's consuming a non-negligible
950
10.3k
     * amount of CPU resources and should no longer be boosted.
951
10.3k
     */
952
10.3k
    if ( svc->pri == CSCHED_PRI_TS_BOOST )
953
1.50k
    {
954
1.50k
        svc->pri = CSCHED_PRI_TS_UNDER;
955
1.50k
        TRACE_2D(TRC_CSCHED_BOOST_END, svc->sdom->dom->domain_id,
956
1.50k
                 svc->vcpu->vcpu_id);
957
1.50k
    }
958
10.3k
959
10.3k
    /*
960
10.3k
     * Update credits
961
10.3k
     */
962
10.3k
    burn_credits(svc, NOW());
963
10.3k
964
10.3k
    /*
965
10.3k
     * Put this VCPU and domain back on the active list if it was
966
10.3k
     * idling.
967
10.3k
     */
968
10.3k
    if ( list_empty(&svc->active_vcpu_elem) )
969
1.08k
    {
970
1.08k
        __csched_vcpu_acct_start(prv, svc);
971
1.08k
    }
972
10.3k
    else
973
9.26k
    {
974
9.26k
        unsigned int new_cpu;
975
9.26k
        unsigned long flags;
976
9.26k
        spinlock_t *lock = vcpu_schedule_lock_irqsave(current, &flags);
977
9.26k
978
9.26k
        /*
979
9.26k
         * If it's been active a while, check if we'd be better off
980
9.26k
         * migrating it to run elsewhere (see multi-core and multi-thread
981
9.26k
         * support in csched_cpu_pick()).
982
9.26k
         */
983
9.26k
        new_cpu = _csched_cpu_pick(ops, current, 0);
984
9.26k
985
9.26k
        vcpu_schedule_unlock_irqrestore(lock, flags, current);
986
9.26k
987
9.26k
        if ( new_cpu != cpu )
988
64
        {
989
64
            SCHED_VCPU_STAT_CRANK(svc, migrate_r);
990
64
            SCHED_STAT_CRANK(migrate_running);
991
64
            set_bit(_VPF_migrating, &current->pause_flags);
992
64
            /*
993
64
             * As we are about to tickle cpu, we should clear its bit in
994
64
             * idlers. But, if we are here, it means there is someone running
995
64
             * on it, and hence the bit must be zero already.
996
64
             */
997
64
            ASSERT(!cpumask_test_cpu(cpu,
998
64
                                     CSCHED_PRIV(per_cpu(scheduler, cpu))->idlers));
999
64
            cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
1000
64
        }
1001
9.26k
    }
1002
10.3k
}
1003
1004
static void *
1005
csched_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
1006
24
{
1007
24
    struct csched_vcpu *svc;
1008
24
1009
24
    /* Allocate per-VCPU info */
1010
24
    svc = xzalloc(struct csched_vcpu);
1011
24
    if ( svc == NULL )
1012
0
        return NULL;
1013
24
1014
24
    INIT_LIST_HEAD(&svc->runq_elem);
1015
24
    INIT_LIST_HEAD(&svc->active_vcpu_elem);
1016
24
    svc->sdom = dd;
1017
24
    svc->vcpu = vc;
1018
24
    svc->pri = is_idle_domain(vc->domain) ?
1019
12
        CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
1020
24
    SCHED_VCPU_STATS_RESET(svc);
1021
24
    SCHED_STAT_CRANK(vcpu_alloc);
1022
24
    return svc;
1023
24
}
1024
1025
static void
1026
csched_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
1027
12
{
1028
12
    struct csched_vcpu *svc = vc->sched_priv;
1029
12
    spinlock_t *lock;
1030
12
1031
12
    BUG_ON( is_idle_vcpu(vc) );
1032
12
1033
12
    /* csched_cpu_pick() looks in vc->processor's runq, so we need the lock. */
1034
12
    lock = vcpu_schedule_lock_irq(vc);
1035
12
1036
12
    vc->processor = csched_cpu_pick(ops, vc);
1037
12
1038
12
    spin_unlock_irq(lock);
1039
12
1040
12
    lock = vcpu_schedule_lock_irq(vc);
1041
12
1042
12
    if ( !__vcpu_on_runq(svc) && vcpu_runnable(vc) && !vc->is_running )
1043
0
        runq_insert(svc);
1044
12
1045
12
    vcpu_schedule_unlock_irq(lock, vc);
1046
12
1047
12
    SCHED_STAT_CRANK(vcpu_insert);
1048
12
}
1049
1050
static void
1051
csched_free_vdata(const struct scheduler *ops, void *priv)
1052
0
{
1053
0
    struct csched_vcpu *svc = priv;
1054
0
1055
0
    BUG_ON( !list_empty(&svc->runq_elem) );
1056
0
1057
0
    xfree(svc);
1058
0
}
1059
1060
static void
1061
csched_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
1062
0
{
1063
0
    struct csched_private *prv = CSCHED_PRIV(ops);
1064
0
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
1065
0
    struct csched_dom * const sdom = svc->sdom;
1066
0
1067
0
    SCHED_STAT_CRANK(vcpu_remove);
1068
0
1069
0
    ASSERT(!__vcpu_on_runq(svc));
1070
0
1071
0
    if ( test_and_clear_bit(CSCHED_FLAG_VCPU_PARKED, &svc->flags) )
1072
0
    {
1073
0
        SCHED_STAT_CRANK(vcpu_unpark);
1074
0
        vcpu_unpause(svc->vcpu);
1075
0
    }
1076
0
1077
0
    spin_lock_irq(&prv->lock);
1078
0
1079
0
    if ( !list_empty(&svc->active_vcpu_elem) )
1080
0
        __csched_vcpu_acct_stop_locked(prv, svc);
1081
0
1082
0
    spin_unlock_irq(&prv->lock);
1083
0
1084
0
    BUG_ON( sdom == NULL );
1085
0
}
1086
1087
static void
1088
csched_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
1089
360
{
1090
360
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
1091
360
    unsigned int cpu = vc->processor;
1092
360
1093
360
    SCHED_STAT_CRANK(vcpu_sleep);
1094
360
1095
360
    BUG_ON( is_idle_vcpu(vc) );
1096
360
1097
360
    if ( curr_on_cpu(cpu) == vc )
1098
54
    {
1099
54
        /*
1100
54
         * We are about to tickle cpu, so we should clear its bit in idlers.
1101
54
         * But, we are here because vc is going to sleep while running on cpu,
1102
54
         * so the bit must be zero already.
1103
54
         */
1104
54
        ASSERT(!cpumask_test_cpu(cpu, CSCHED_PRIV(per_cpu(scheduler, cpu))->idlers));
1105
54
        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
1106
54
    }
1107
306
    else if ( __vcpu_on_runq(svc) )
1108
1
        runq_remove(svc);
1109
360
}
1110
1111
static void
1112
csched_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
1113
67.1k
{
1114
67.1k
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
1115
67.1k
    bool_t migrating;
1116
67.1k
1117
67.1k
    BUG_ON( is_idle_vcpu(vc) );
1118
67.1k
1119
67.1k
    if ( unlikely(curr_on_cpu(vc->processor) == vc) )
1120
301
    {
1121
301
        SCHED_STAT_CRANK(vcpu_wake_running);
1122
301
        return;
1123
301
    }
1124
66.8k
    if ( unlikely(__vcpu_on_runq(svc)) )
1125
0
    {
1126
0
        SCHED_STAT_CRANK(vcpu_wake_onrunq);
1127
0
        return;
1128
0
    }
1129
66.8k
1130
66.8k
    if ( likely(vcpu_runnable(vc)) )
1131
66.8k
        SCHED_STAT_CRANK(vcpu_wake_runnable);
1132
66.8k
    else
1133
18.4E
        SCHED_STAT_CRANK(vcpu_wake_not_runnable);
1134
66.8k
1135
66.8k
    /*
1136
66.8k
     * We temporarily boost the priority of waking VCPUs!
1137
66.8k
     *
1138
66.8k
     * If this VCPU consumes a non-negligible amount of CPU, it
1139
66.8k
     * will eventually find itself in the credit accounting code
1140
66.8k
     * path where its priority will be reset to normal.
1141
66.8k
     *
1142
66.8k
     * If on the other hand the VCPU consumes little CPU and is
1143
66.8k
     * blocking and awoken a lot (doing I/O for example), its
1144
66.8k
     * priority will remain boosted, optimizing its wake-to-run
1145
66.8k
     * latencies.
1146
66.8k
     *
1147
66.8k
     * This allows wake-to-run latency sensitive VCPUs to preempt
1148
66.8k
     * more CPU resource intensive VCPUs without impacting overall 
1149
66.8k
     * system fairness.
1150
66.8k
     *
1151
66.8k
     * There are two cases when we don't want to boost:
1152
66.8k
     *  - VCPUs that are waking up after a migration, rather than
1153
66.8k
     *    after having blocked;
1154
66.8k
     *  - VCPUs of capped domains unpausing after earning credits
1155
66.8k
     *    they had overspent.
1156
66.8k
     */
1157
66.8k
    migrating = test_and_clear_bit(CSCHED_FLAG_VCPU_MIGRATING, &svc->flags);
1158
66.8k
1159
66.8k
    if ( !migrating && svc->pri == CSCHED_PRI_TS_UNDER &&
1160
2.57k
         !test_bit(CSCHED_FLAG_VCPU_PARKED, &svc->flags) )
1161
2.57k
    {
1162
2.57k
        TRACE_2D(TRC_CSCHED_BOOST_START, vc->domain->domain_id, vc->vcpu_id);
1163
2.57k
        SCHED_STAT_CRANK(vcpu_boost);
1164
2.57k
        svc->pri = CSCHED_PRI_TS_BOOST;
1165
2.57k
    }
1166
66.8k
1167
66.8k
    /* Put the VCPU on the runq and tickle CPUs */
1168
66.8k
    runq_insert(svc);
1169
66.8k
    __runq_tickle(svc);
1170
66.8k
}
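
To summarize the priority transitions implied by the wake path above together with csched_vcpu_acct() and csched_acct(): a genuine wake-up (not a post-migration wake, not an unpark) promotes an UNDER vcpu to BOOST; being caught running by the accounting tick demotes BOOST back to UNDER; and the periodic accounting moves active vcpus between UNDER and OVER as their credit balance crosses zero.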
1171
1172
static void
1173
csched_vcpu_yield(const struct scheduler *ops, struct vcpu *vc)
1174
4.69M
{
1175
4.69M
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
1176
4.69M
1177
4.69M
    /* Let the scheduler know that this vcpu is trying to yield */
1178
4.69M
    set_bit(CSCHED_FLAG_VCPU_YIELD, &svc->flags);
1179
4.69M
}
1180
1181
static int
1182
csched_dom_cntl(
1183
    const struct scheduler *ops,
1184
    struct domain *d,
1185
    struct xen_domctl_scheduler_op *op)
1186
0
{
1187
0
    struct csched_dom * const sdom = CSCHED_DOM(d);
1188
0
    struct csched_private *prv = CSCHED_PRIV(ops);
1189
0
    unsigned long flags;
1190
0
    int rc = 0;
1191
0
1192
0
    /* Protect both get and put branches with the pluggable scheduler
1193
0
     * lock. Runq lock not needed anywhere in here. */
1194
0
    spin_lock_irqsave(&prv->lock, flags);
1195
0
1196
0
    switch ( op->cmd )
1197
0
    {
1198
0
    case XEN_DOMCTL_SCHEDOP_getinfo:
1199
0
        op->u.credit.weight = sdom->weight;
1200
0
        op->u.credit.cap = sdom->cap;
1201
0
        break;
1202
0
    case XEN_DOMCTL_SCHEDOP_putinfo:
1203
0
        if ( op->u.credit.weight != 0 )
1204
0
        {
1205
0
            if ( !list_empty(&sdom->active_sdom_elem) )
1206
0
            {
1207
0
                prv->weight -= sdom->weight * sdom->active_vcpu_count;
1208
0
                prv->weight += op->u.credit.weight * sdom->active_vcpu_count;
1209
0
            }
1210
0
            sdom->weight = op->u.credit.weight;
1211
0
        }
1212
0
1213
0
        if ( op->u.credit.cap != (uint16_t)~0U )
1214
0
            sdom->cap = op->u.credit.cap;
1215
0
        break;
1216
0
    default:
1217
0
        rc = -EINVAL;
1218
0
        break;
1219
0
    }
1220
0
1221
0
    spin_unlock_irqrestore(&prv->lock, flags);
1222
0
1223
0
    return rc;
1224
0
}
1225
1226
static inline void
1227
__csched_set_tslice(struct csched_private *prv, unsigned timeslice)
1228
1
{
1229
1
    prv->tslice_ms = timeslice;
1230
1
    prv->ticks_per_tslice = CSCHED_TICKS_PER_TSLICE;
1231
1
    if ( prv->tslice_ms < prv->ticks_per_tslice )
1232
0
        prv->ticks_per_tslice = 1;
1233
1
    prv->tick_period_us = prv->tslice_ms * 1000 / prv->ticks_per_tslice;
1234
1
    prv->credits_per_tslice = CSCHED_CREDITS_PER_MSEC * prv->tslice_ms;
1235
1
    prv->credit = prv->credits_per_tslice * prv->ncpus;
1236
1
}
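
With the defaults earlier in this file, the computation above works out as: tslice_ms = 30 and ticks_per_tslice = CSCHED_TICKS_PER_TSLICE = 3, so tick_period_us = 30 * 1000 / 3 = 10000 (a 10ms tick), and credits_per_tslice = CSCHED_CREDITS_PER_MSEC * 30 = 300 credits handed out per CPU each accounting period.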
1237
1238
static int
1239
csched_sys_cntl(const struct scheduler *ops,
1240
                        struct xen_sysctl_scheduler_op *sc)
1241
0
{
1242
0
    int rc = -EINVAL;
1243
0
    struct xen_sysctl_credit_schedule *params = &sc->u.sched_credit;
1244
0
    struct csched_private *prv = CSCHED_PRIV(ops);
1245
0
    unsigned long flags;
1246
0
1247
0
    switch ( sc->cmd )
1248
0
    {
1249
0
    case XEN_SYSCTL_SCHEDOP_putinfo:
1250
0
        if ( params->tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX
1251
0
             || params->tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN
1252
0
             || (params->ratelimit_us
1253
0
                 && (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX
1254
0
                     || params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN))
1255
0
             || MICROSECS(params->ratelimit_us) > MILLISECS(params->tslice_ms) )
1256
0
                goto out;
1257
0
1258
0
        spin_lock_irqsave(&prv->lock, flags);
1259
0
        __csched_set_tslice(prv, params->tslice_ms);
1260
0
        if ( !prv->ratelimit_us && params->ratelimit_us )
1261
0
            printk(XENLOG_INFO "Enabling context switch rate limiting\n");
1262
0
        else if ( prv->ratelimit_us && !params->ratelimit_us )
1263
0
            printk(XENLOG_INFO "Disabling context switch rate limiting\n");
1264
0
        prv->ratelimit_us = params->ratelimit_us;
1265
0
        spin_unlock_irqrestore(&prv->lock, flags);
1266
0
1267
0
        /* FALLTHRU */
1268
0
    case XEN_SYSCTL_SCHEDOP_getinfo:
1269
0
        params->tslice_ms = prv->tslice_ms;
1270
0
        params->ratelimit_us = prv->ratelimit_us;
1271
0
        rc = 0;
1272
0
        break;
1273
0
    }
1274
0
    out:
1275
0
    return rc;
1276
0
}
1277
1278
static void *
1279
csched_alloc_domdata(const struct scheduler *ops, struct domain *dom)
1280
1
{
1281
1
    struct csched_dom *sdom;
1282
1
1283
1
    sdom = xzalloc(struct csched_dom);
1284
1
    if ( sdom == NULL )
1285
0
        return NULL;
1286
1
1287
1
    /* Initialize credit and weight */
1288
1
    INIT_LIST_HEAD(&sdom->active_vcpu);
1289
1
    INIT_LIST_HEAD(&sdom->active_sdom_elem);
1290
1
    sdom->dom = dom;
1291
1
    sdom->weight = CSCHED_DEFAULT_WEIGHT;
1292
1
1293
1
    return (void *)sdom;
1294
1
}
1295
1296
static int
1297
csched_dom_init(const struct scheduler *ops, struct domain *dom)
1298
2
{
1299
2
    struct csched_dom *sdom;
1300
2
1301
2
    if ( is_idle_domain(dom) )
1302
1
        return 0;
1303
2
1304
1
    sdom = csched_alloc_domdata(ops, dom);
1305
1
    if ( sdom == NULL )
1306
0
        return -ENOMEM;
1307
1
1308
1
    dom->sched_priv = sdom;
1309
1
1310
1
    return 0;
1311
1
}
1312
1313
static void
1314
csched_free_domdata(const struct scheduler *ops, void *data)
1315
0
{
1316
0
    xfree(data);
1317
0
}
1318
1319
static void
1320
csched_dom_destroy(const struct scheduler *ops, struct domain *dom)
1321
0
{
1322
0
    csched_free_domdata(ops, CSCHED_DOM(dom));
1323
0
}
1324
1325
/*
1326
 * This is an O(n) optimized sort of the runq.
1327
 *
1328
 * Time-share VCPUs can only be one of two priorities, UNDER or OVER. We walk
1329
 * through the runq and move up any UNDERs that are preceded by OVERS. We
1330
 * remember the last UNDER to make the move up operation O(1).
1331
 */
1332
static void
1333
csched_runq_sort(struct csched_private *prv, unsigned int cpu)
1334
12.0k
{
1335
12.0k
    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
1336
12.0k
    struct list_head *runq, *elem, *next, *last_under;
1337
12.0k
    struct csched_vcpu *svc_elem;
1338
12.0k
    spinlock_t *lock;
1339
12.0k
    unsigned long flags;
1340
12.0k
    int sort_epoch;
1341
12.0k
1342
12.0k
    sort_epoch = prv->runq_sort;
1343
12.0k
    if ( sort_epoch == spc->runq_sort_last )
1344
7.14k
        return;
1345
12.0k
1346
4.92k
    spc->runq_sort_last = sort_epoch;
1347
4.92k
1348
4.92k
    lock = pcpu_schedule_lock_irqsave(cpu, &flags);
1349
4.92k
1350
4.92k
    runq = &spc->runq;
1351
4.92k
    elem = runq->next;
1352
4.92k
    last_under = runq;
1353
4.92k
1354
9.30k
    while ( elem != runq )
1355
4.37k
    {
1356
4.37k
        next = elem->next;
1357
4.37k
        svc_elem = __runq_elem(elem);
1358
4.37k
1359
4.37k
        if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER )
1360
25
        {
1361
25
            /* does elem need to move up the runq? */
1362
25
            if ( elem->prev != last_under )
1363
0
            {
1364
0
                list_del(elem);
1365
0
                list_add(elem, last_under);
1366
0
            }
1367
25
            last_under = elem;
1368
25
        }
1369
4.37k
1370
4.37k
        elem = next;
1371
4.37k
    }
1372
4.92k
1373
4.92k
    pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
1374
4.92k
}
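
For example (illustrative), a runq ordered [OVER, UNDER, OVER, UNDER] becomes [UNDER, UNDER, OVER, OVER] when the sort epoch advances: each UNDER found behind an OVER is unlinked and re-inserted right after last_under, so a single left-to-right pass restores the UNDER-before-OVER ordering in O(n).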
1375
1376
static void
1377
csched_acct(void* dummy)
1378
2.89k
{
1379
2.89k
    struct csched_private *prv = dummy;
1380
2.89k
    unsigned long flags;
1381
2.89k
    struct list_head *iter_vcpu, *next_vcpu;
1382
2.89k
    struct list_head *iter_sdom, *next_sdom;
1383
2.89k
    struct csched_vcpu *svc;
1384
2.89k
    struct csched_dom *sdom;
1385
2.89k
    uint32_t credit_total;
1386
2.89k
    uint32_t weight_total;
1387
2.89k
    uint32_t weight_left;
1388
2.89k
    uint32_t credit_fair;
1389
2.89k
    uint32_t credit_peak;
1390
2.89k
    uint32_t credit_cap;
1391
2.89k
    int credit_balance;
1392
2.89k
    int credit_xtra;
1393
2.89k
    int credit;
1394
2.89k
1395
2.89k
1396
2.89k
    spin_lock_irqsave(&prv->lock, flags);
1397
2.89k
1398
2.89k
    weight_total = prv->weight;
1399
2.89k
    credit_total = prv->credit;
1400
2.89k
1401
2.89k
    /* Converge balance towards 0 when it drops negative */
1402
2.89k
    if ( prv->credit_balance < 0 )
1403
59
    {
1404
59
        credit_total -= prv->credit_balance;
1405
59
        SCHED_STAT_CRANK(acct_balance);
1406
59
    }
1407
2.89k
1408
2.89k
    if ( unlikely(weight_total == 0) )
1409
1.54k
    {
1410
1.54k
        prv->credit_balance = 0;
1411
1.54k
        spin_unlock_irqrestore(&prv->lock, flags);
1412
1.54k
        SCHED_STAT_CRANK(acct_no_work);
1413
1.54k
        goto out;
1414
1.54k
    }
1415
2.89k
1416
1.34k
    SCHED_STAT_CRANK(acct_run);
1417
1.34k
1418
1.34k
    weight_left = weight_total;
1419
1.34k
    credit_balance = 0;
1420
1.34k
    credit_xtra = 0;
1421
1.34k
    credit_cap = 0U;
1422
1.34k
1423
1.34k
    list_for_each_safe( iter_sdom, next_sdom, &prv->active_sdom )
1424
1.34k
    {
1425
1.34k
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
1426
1.34k
1427
1.34k
        BUG_ON( is_idle_domain(sdom->dom) );
1428
1.34k
        BUG_ON( sdom->active_vcpu_count == 0 );
1429
1.34k
        BUG_ON( sdom->weight == 0 );
1430
1.34k
        BUG_ON( (sdom->weight * sdom->active_vcpu_count) > weight_left );
1431
1.34k
1432
1.34k
        weight_left -= ( sdom->weight * sdom->active_vcpu_count );
1433
1.34k
1434
1.34k
        /*
1435
1.34k
         * A domain's fair share is computed using its weight in competition
1436
1.34k
         * with that of all other active domains.
1437
1.34k
         *
1438
1.34k
         * At most, a domain can use credits to run all its active VCPUs
1439
1.34k
         * for one full accounting period. We allow a domain to earn more
1440
1.34k
         * only when the system-wide credit balance is negative.
1441
1.34k
         */
1442
1.34k
        credit_peak = sdom->active_vcpu_count * prv->credits_per_tslice;
1443
1.34k
        if ( prv->credit_balance < 0 )
1444
59
        {
1445
59
            credit_peak += ( ( -prv->credit_balance
1446
59
                               * sdom->weight
1447
59
                               * sdom->active_vcpu_count) +
1448
59
                             (weight_total - 1)
1449
59
                           ) / weight_total;
1450
59
        }
1451
1.34k
1452
1.34k
        if ( sdom->cap != 0U )
1453
0
        {
1454
0
            credit_cap = ((sdom->cap * prv->credits_per_tslice) + 99) / 100;
1455
0
            if ( credit_cap < credit_peak )
1456
0
                credit_peak = credit_cap;
1457
0
1458
0
            /* FIXME -- set cap per-vcpu as well...? */
1459
0
            credit_cap = ( credit_cap + ( sdom->active_vcpu_count - 1 )
1460
0
                         ) / sdom->active_vcpu_count;
1461
0
        }
1462
1.34k
1463
1.34k
        credit_fair = ( ( credit_total
1464
1.34k
                          * sdom->weight
1465
1.34k
                          * sdom->active_vcpu_count )
1466
1.34k
                        + (weight_total - 1)
1467
1.34k
                      ) / weight_total;
1468
1.34k
1469
1.34k
        if ( credit_fair < credit_peak )
1470
0
        {
1471
0
            credit_xtra = 1;
1472
0
        }
1473
1.34k
        else
1474
1.34k
        {
1475
1.34k
            if ( weight_left != 0U )
1476
0
            {
1477
0
                /* Give other domains a chance at unused credits */
1478
0
                credit_total += ( ( ( credit_fair - credit_peak
1479
0
                                    ) * weight_total
1480
0
                                  ) + ( weight_left - 1 )
1481
0
                                ) / weight_left;
1482
0
            }
1483
1.34k
1484
1.34k
            if ( credit_xtra )
1485
0
            {
1486
0
                /*
1487
0
                 * Lazily keep domains with extra credits at the head of
1488
0
                 * the queue to give others a chance at them in future
1489
0
                 * accounting periods.
1490
0
                 */
1491
0
                SCHED_STAT_CRANK(acct_reorder);
1492
0
                list_del(&sdom->active_sdom_elem);
1493
0
                list_add(&sdom->active_sdom_elem, &prv->active_sdom);
1494
0
            }
1495
1.34k
1496
1.34k
            credit_fair = credit_peak;
1497
1.34k
        }
1498
1.34k
1499
1.34k
        /* Compute fair share per VCPU */
1500
1.34k
        credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
1501
1.34k
                      ) / sdom->active_vcpu_count;
1502
1.34k
1503
1.34k
1504
1.34k
        list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
1505
4.65k
        {
1506
4.65k
            svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
1507
4.65k
            BUG_ON( sdom != svc->sdom );
1508
4.65k
1509
4.65k
            /* Increment credit */
1510
4.65k
            atomic_add(credit_fair, &svc->credit);
1511
4.65k
            credit = atomic_read(&svc->credit);
1512
4.65k
1513
4.65k
            /*
1514
4.65k
             * Recompute priority or, if VCPU is idling, remove it from
1515
4.65k
             * the active list.
1516
4.65k
             */
1517
4.65k
            if ( credit < 0 )
1518
493
            {
1519
493
                svc->pri = CSCHED_PRI_TS_OVER;
1520
493
1521
493
                /* Park running VCPUs of capped-out domains */
1522
493
                if ( sdom->cap != 0U &&
1523
0
                     credit < -credit_cap &&
1524
0
                     !test_and_set_bit(CSCHED_FLAG_VCPU_PARKED, &svc->flags) )
1525
0
                {
1526
0
                    SCHED_STAT_CRANK(vcpu_park);
1527
0
                    vcpu_pause_nosync(svc->vcpu);
1528
0
                }
1529
493
1530
493
                /* Lower bound on credits */
1531
493
                if ( credit < -prv->credits_per_tslice )
1532
1
                {
1533
1
                    SCHED_STAT_CRANK(acct_min_credit);
1534
1
                    credit = -prv->credits_per_tslice;
1535
1
                    atomic_set(&svc->credit, credit);
1536
1
                }
1537
493
            }
1538
4.65k
            else
1539
4.15k
            {
1540
4.15k
                svc->pri = CSCHED_PRI_TS_UNDER;
1541
4.15k
1542
4.15k
                /* Unpark any capped domains whose credits go positive */
1543
4.15k
                if ( test_and_clear_bit(CSCHED_FLAG_VCPU_PARKED, &svc->flags) )
1544
0
                {
1545
0
                    /*
1546
0
                     * It's important to unset the flag AFTER the unpause()
1547
0
                     * call to make sure the VCPU's priority is not boosted
1548
0
                     * if it is woken up here.
1549
0
                     */
1550
0
                    SCHED_STAT_CRANK(vcpu_unpark);
1551
0
                    vcpu_unpause(svc->vcpu);
1552
0
                }
1553
4.15k
1554
4.15k
                /* Upper bound on credits means VCPU stops earning */
1555
4.15k
                if ( credit > prv->credits_per_tslice )
1556
1.08k
                {
1557
1.08k
                    __csched_vcpu_acct_stop_locked(prv, svc);
1558
1.08k
                    /* Divide credits in half, so that when it starts
1559
1.08k
                     * accounting again, it starts a little bit "ahead" */
1560
1.08k
                    credit /= 2;
1561
1.08k
                    atomic_set(&svc->credit, credit);
1562
1.08k
                }
1563
4.15k
            }
1564
4.65k
1565
4.65k
            SCHED_VCPU_STAT_SET(svc, credit_last, credit);
1566
4.65k
            SCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair);
1567
4.65k
            credit_balance += credit;
1568
4.65k
        }
1569
1.34k
    }
1570
1.34k
1571
1.34k
    prv->credit_balance = credit_balance;
1572
1.34k
1573
1.34k
    spin_unlock_irqrestore(&prv->lock, flags);
1574
1.34k
1575
1.34k
    /* Inform each CPU that its runq needs to be sorted */
1576
1.34k
    prv->runq_sort++;
1577
1.34k
1578
2.89k
out:
1579
2.89k
    set_timer( &prv->master_ticker,
1580
2.89k
               NOW() + MILLISECS(prv->tslice_ms));
1581
2.89k
}
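
A standalone sketch of the weighted fair-share and cap arithmetic used by the
accounting pass above may help follow the rounding: shares are computed with
ceiling divisions so no domain is short-changed by truncation, and a cap given
as a percentage of one pCPU is converted to credits and then split per vCPU
the same way. The helper names and the 300-credits-per-timeslice constant
below are illustrative assumptions, not identifiers from sched_credit.c.

    #include <stdio.h>

    #define CREDITS_PER_TSLICE 300       /* assumed: credits in one accounting period */

    /* Domain-level share of the credit pool, weighted and rounded up. */
    static int fair_share(int credit_total, int weight, int nvcpus, int weight_total)
    {
        return (credit_total * weight * nvcpus + (weight_total - 1)) / weight_total;
    }

    /* Cap (a percentage of one pCPU) as credits, rounded up, split per vCPU. */
    static int cap_credits(int cap_pct, int nvcpus)
    {
        int c = (cap_pct * CREDITS_PER_TSLICE + 99) / 100;
        return (c + (nvcpus - 1)) / nvcpus;
    }

    int main(void)
    {
        /* Two domains with one active vCPU each, weights 256 and 512. */
        int credit_total = 2 * CREDITS_PER_TSLICE;   /* 600 credits to distribute */
        int weight_total = 256 + 512;

        printf("weight 256: %d credits\n", fair_share(credit_total, 256, 1, weight_total)); /* 200 */
        printf("weight 512: %d credits\n", fair_share(credit_total, 512, 1, weight_total)); /* 400 */
        printf("40%% cap, 2 vCPUs: %d credits each\n", cap_credits(40, 2));                 /* 60 */
        return 0;
    }

In this example the two shares (200 and 400) add back up exactly to
credit_total.
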
1582
1583
static void
1584
csched_tick(void *_cpu)
1585
11.0k
{
1586
11.0k
    unsigned int cpu = (unsigned long)_cpu;
1587
11.0k
    struct csched_pcpu *spc = CSCHED_PCPU(cpu);
1588
11.0k
    struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
1589
11.0k
1590
11.0k
    spc->tick++;
1591
11.0k
1592
11.0k
    /*
1593
11.0k
     * Accounting for running VCPU
1594
11.0k
     */
1595
11.0k
    if ( !is_idle_vcpu(current) )
1596
10.3k
        csched_vcpu_acct(prv, cpu);
1597
11.0k
1598
11.0k
    /*
1599
11.0k
     * Check if runq needs to be sorted
1600
11.0k
     *
1601
11.0k
     * Every physical CPU resorts the runq after the accounting master has
1602
11.0k
     * modified priorities. This is a special O(n) sort and runs at most
1603
11.0k
     * once per accounting period (currently 30 milliseconds).
1604
11.0k
     */
1605
11.0k
    csched_runq_sort(prv, cpu);
1606
11.0k
1607
11.0k
    set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) );
1608
11.0k
}
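
The "at most once per accounting period" behaviour referred to above rests on
a generation counter: the accounting master increments prv->runq_sort (see the
end of the accounting function above), and each pCPU remembers the last value
it acted on (the runq_sort_last field printed by the dump code further down).
The toy types below are invented for illustration; only the counter handshake
mirrors the scheduler.

    #include <stdbool.h>
    #include <stdio.h>

    static unsigned int global_sort_gen;   /* stands in for prv->runq_sort */

    struct toy_pcpu {
        unsigned int sort_gen_last;        /* stands in for spc->runq_sort_last */
    };

    /* True exactly once per bump of the global generation, per pCPU. */
    static bool needs_resort(struct toy_pcpu *pc)
    {
        if ( pc->sort_gen_last == global_sort_gen )
            return false;
        pc->sort_gen_last = global_sort_gen;
        return true;
    }

    int main(void)
    {
        struct toy_pcpu pc = { 0 };

        printf("%d\n", needs_resort(&pc));   /* 0: no accounting pass yet */
        global_sort_gen++;                   /* master finished an accounting pass */
        printf("%d\n", needs_resort(&pc));   /* 1: this tick resorts the runq */
        printf("%d\n", needs_resort(&pc));   /* 0: already done for this period */
        return 0;
    }
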
1609
1610
static struct csched_vcpu *
1611
csched_runq_steal(int peer_cpu, int cpu, int pri, int balance_step)
1612
2.39k
{
1613
2.39k
    const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
1614
2.39k
    struct csched_vcpu *speer;
1615
2.39k
    struct list_head *iter;
1616
2.39k
    struct vcpu *vc;
1617
2.39k
1618
2.39k
    ASSERT(peer_pcpu != NULL);
1619
2.39k
1620
2.39k
    /*
1621
2.39k
     * Don't steal from an idle CPU's runq because it's about to
1622
2.39k
     * pick up work from it itself.
1623
2.39k
     */
1624
2.39k
    if ( unlikely(is_idle_vcpu(curr_on_cpu(peer_cpu))) )
1625
1
        goto out;
1626
2.39k
1627
2.39k
    list_for_each( iter, &peer_pcpu->runq )
1628
3.08k
    {
1629
3.08k
        speer = __runq_elem(iter);
1630
3.08k
1631
3.08k
        /*
1632
3.08k
         * If next available VCPU here is not of strictly higher
1633
3.08k
         * priority than ours, this PCPU is useless to us.
1634
3.08k
         */
1635
3.08k
        if ( speer->pri <= pri )
1636
1.88k
            break;
1637
3.08k
1638
3.08k
        /* Is this VCPU runnable on our PCPU? */
1639
1.19k
        vc = speer->vcpu;
1640
1.19k
        BUG_ON( is_idle_vcpu(vc) );
1641
1.19k
1642
1.19k
        /*
1643
1.19k
         * If the vcpu is still in peer_cpu's scheduling tail, or if it
1644
1.19k
         * has no useful soft affinity, skip it.
1645
1.19k
         *
1646
1.19k
         * In fact, what we want is to check if we have any "soft-affine
1647
1.19k
         * work" to steal, before starting to look at "hard-affine work".
1648
1.19k
         *
1649
1.19k
         * Notice that, if not even one vCPU on this runq has a useful
1650
1.19k
         * soft affinity, we could have avoided considering this runq for
1651
1.19k
         * a soft balancing step in the first place. This, for instance,
1652
1.19k
         * can be implemented by taking note of which runqs hold
1653
1.19k
         * vCPUs with useful soft affinities in some sort of bitmap
1654
1.19k
         * or counter.
1655
1.19k
         */
1656
1.19k
        if ( vc->is_running ||
1657
1.05k
             (balance_step == BALANCE_SOFT_AFFINITY
1658
546
              && !has_soft_affinity(vc, vc->cpu_hard_affinity)) )
1659
691
            continue;
1660
1.19k
1661
506
        affinity_balance_cpumask(vc, balance_step, cpumask_scratch);
1662
506
        if ( __csched_vcpu_is_migrateable(vc, cpu, cpumask_scratch) )
1663
506
        {
1664
506
            /* We got a candidate. Grab it! */
1665
506
            TRACE_3D(TRC_CSCHED_STOLEN_VCPU, peer_cpu,
1666
506
                     vc->domain->domain_id, vc->vcpu_id);
1667
506
            SCHED_VCPU_STAT_CRANK(speer, migrate_q);
1668
506
            SCHED_STAT_CRANK(migrate_queued);
1669
506
            WARN_ON(vc->is_urgent);
1670
506
            runq_remove(speer);
1671
506
            vc->processor = cpu;
1672
506
            /*
1673
506
             * speer will start executing directly on cpu, without having to
1674
506
             * go through runq_insert(). So we must update the runnable count
1675
506
             * for cpu here.
1676
506
             */
1677
506
            inc_nr_runnable(cpu);
1678
506
            return speer;
1679
506
        }
1680
506
    }
1681
1.88k
 out:
1682
1.88k
    SCHED_STAT_CRANK(steal_peer_idle);
1683
1.88k
    return NULL;
1684
2.39k
}
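
To restate the stealing rules above in isolation: within one balance step, a
vCPU still in the peer's scheduling tail is always skipped, and the vCPU must
be allowed to run on the stealing pCPU according to that step's affinity mask
(soft affinity in the first step, hard affinity in the second). The toy model
below uses invented types and boolean flags in place of the scheduler's
cpumask checks, purely to show the order of those tests.

    #include <stdbool.h>
    #include <stdio.h>

    enum { BALANCE_SOFT_AFFINITY, BALANCE_HARD_AFFINITY, BALANCE_STEPS };

    struct toy_vcpu {
        int id;
        bool running;        /* still in the peer's scheduling tail */
        bool soft_matches;   /* soft affinity would be satisfied here */
        bool hard_matches;   /* hard affinity allows running here */
    };

    /* One balance step over one peer runqueue, front to back. */
    static const struct toy_vcpu *steal_step(const struct toy_vcpu *rq, int n, int step)
    {
        for ( int i = 0; i < n; i++ )
        {
            /* Skip vCPUs still in the peer's scheduling tail. */
            if ( rq[i].running )
                continue;
            /* The step's affinity mask must allow running here. */
            if ( step == BALANCE_SOFT_AFFINITY ? !rq[i].soft_matches
                                               : !rq[i].hard_matches )
                continue;
            return &rq[i];
        }
        return NULL;
    }

    int main(void)
    {
        const struct toy_vcpu rq[] = {
            { 1, true,  true,  true  },   /* skipped: still running */
            { 2, false, false, true  },   /* skipped in the soft step */
            { 3, false, true,  true  },   /* stolen in the soft step */
        };

        for ( int step = 0; step < BALANCE_STEPS; step++ )
        {
            const struct toy_vcpu *v = steal_step(rq, 3, step);
            if ( v != NULL )
            {
                printf("step %d steals vcpu %d\n", step, v->id);
                break;   /* as above: balancing ends at the first hit */
            }
        }
        return 0;
    }
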
1685
1686
static struct csched_vcpu *
1687
csched_load_balance(struct csched_private *prv, int cpu,
1688
    struct csched_vcpu *snext, bool_t *stolen)
1689
385k
{
1690
385k
    struct cpupool *c = per_cpu(cpupool, cpu);
1691
385k
    struct csched_vcpu *speer;
1692
385k
    cpumask_t workers;
1693
385k
    cpumask_t *online;
1694
385k
    int peer_cpu, first_cpu, peer_node, bstep;
1695
385k
    int node = cpu_to_node(cpu);
1696
385k
1697
385k
    BUG_ON( cpu != snext->vcpu->processor );
1698
385k
    online = cpupool_online_cpumask(c);
1699
385k
1700
385k
    /*
1701
385k
     * If this CPU is going offline, or is not (yet) part of any cpupool
1702
385k
     * (as it happens, e.g., during cpu bringup), we shouldn't steal work.
1703
385k
     */
1704
385k
    if ( unlikely(!cpumask_test_cpu(cpu, online) || c == NULL) )
1705
0
        goto out;
1706
385k
1707
385k
    if ( snext->pri == CSCHED_PRI_IDLE )
1708
66.5k
        SCHED_STAT_CRANK(load_balance_idle);
1709
318k
    else if ( snext->pri == CSCHED_PRI_TS_OVER )
1710
318k
        SCHED_STAT_CRANK(load_balance_over);
1711
318k
    else
1712
516
        SCHED_STAT_CRANK(load_balance_other);
1713
385k
1714
385k
    /*
1715
385k
     * Let's look around for work to steal, taking both hard affinity
1716
385k
     * and soft affinity into account. More specifically, we check all
1717
385k
     * the non-idle CPUs' runq, looking for:
1718
385k
     *  1. any "soft-affine work" to steal first,
1719
385k
     *  2. if not finding anything, any "hard-affine work" to steal.
1720
385k
     */
1721
385k
    for_each_affinity_balance_step( bstep )
1722
751k
    {
1723
751k
        /*
1724
751k
         * We peek at the non-idling CPUs in a node-wise fashion. In fact,
1725
751k
         * it is more likely that we find some affine work on our same
1726
751k
         * node, not to mention that migrating vcpus within the same node
1727
751k
         * could well be expected to be cheaper than across-nodes (memory
1728
751k
         * stays local, there might be some node-wide cache[s], etc.).
1729
751k
         */
1730
751k
        peer_node = node;
1731
751k
        do
1732
751k
        {
1733
751k
            /* Select the pCPUs in this node that have work we can steal. */
1734
751k
            cpumask_andnot(&workers, online, prv->idlers);
1735
751k
            cpumask_and(&workers, &workers, &node_to_cpumask(peer_node));
1736
751k
            __cpumask_clear_cpu(cpu, &workers);
1737
751k
1738
751k
            first_cpu = cpumask_cycle(prv->balance_bias[peer_node], &workers);
1739
751k
            if ( first_cpu >= nr_cpu_ids )
1740
57.9k
                goto next_node;
1741
693k
            peer_cpu = first_cpu;
1742
693k
            do
1743
6.06M
            {
1744
6.06M
                spinlock_t *lock;
1745
6.06M
1746
6.06M
                /*
1747
6.06M
                 * If there is only one runnable vCPU on peer_cpu, it means
1748
6.06M
                 * there's no one to be stolen in its runqueue, so skip it.
1749
6.06M
                 *
1750
6.06M
                 * Checking this without holding the lock is racy... But that's
1751
6.06M
                 * the whole point of this optimization!
1752
6.06M
                 *
1753
6.06M
                 * In more details:
1754
6.06M
                 * - if we race with dec_nr_runnable(), we may try to take the
1755
6.06M
                 *   lock and call csched_runq_steal() for no reason. This is
1756
6.06M
                 *   not a functional issue, and should be infrequent enough.
1757
6.06M
                 *   And we can avoid that by re-checking nr_runnable after
1758
6.06M
                 *   having grabbed the lock, if we want;
1759
6.06M
                 * - if we race with inc_nr_runnable(), we skip a pCPU that may
1760
6.06M
                 *   have runnable vCPUs in its runqueue, but that's not a
1761
6.06M
                 *   problem because:
1762
6.06M
                 *   + if racing with csched_vcpu_insert() or csched_vcpu_wake(),
1763
6.06M
                 *     __runq_tickle() will be called afterwards, so the vCPU
1764
6.06M
                 *     won't get stuck in the runqueue for too long;
1765
6.06M
                 *   + if racing with csched_runq_steal(), it may be that a
1766
6.06M
                 *     vCPU that we could have picked up, stays in a runqueue
1767
6.06M
                 *     until someone else tries to steal it again. But this is
1768
6.06M
                 *     no worse than what can happen already (without this
1769
6.06M
                 *     optimization), if the pCPU would schedule right after we
1770
6.06M
                 *     have taken the lock, and hence block on it.
1771
6.06M
                 */
1772
6.06M
                if ( CSCHED_PCPU(peer_cpu)->nr_runnable <= 1 )
1773
6.04M
                {
1774
6.04M
                    TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skipp'n */ 0);
1775
6.04M
                    goto next_cpu;
1776
6.04M
                }
1777
6.06M
1778
6.06M
                /*
1779
6.06M
                 * Get ahold of the scheduler lock for this peer CPU.
1780
6.06M
                 *
1781
6.06M
                 * Note: We don't spin on this lock but simply try it. Spinning
1782
6.06M
                 * could cause a deadlock if the peer CPU is also load
1783
6.06M
                 * balancing and trying to lock this CPU.
1784
6.06M
                 */
1785
20.6k
                lock = pcpu_schedule_trylock(peer_cpu);
1786
20.6k
                SCHED_STAT_CRANK(steal_trylock);
1787
20.6k
                if ( !lock )
1788
1.03k
                {
1789
1.03k
                    SCHED_STAT_CRANK(steal_trylock_failed);
1790
1.03k
                    TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skip */ 0);
1791
1.03k
                    goto next_cpu;
1792
1.03k
                }
1793
20.6k
1794
19.5k
                TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* checked */ 1);
1795
19.5k
1796
19.5k
                /* Any work over there to steal? */
1797
19.5k
                speer = cpumask_test_cpu(peer_cpu, online) ?
1798
17.1k
                    csched_runq_steal(peer_cpu, cpu, snext->pri, bstep) : NULL;
1799
19.5k
                pcpu_schedule_unlock(lock, peer_cpu);
1800
19.5k
1801
19.5k
                /* As soon as one vcpu is found, balancing ends */
1802
19.5k
                if ( speer != NULL )
1803
506
                {
1804
506
                    *stolen = 1;
1805
506
                    /*
1806
506
                     * Next time we'll look for work to steal on this node, we
1807
506
                     * will start from the next pCPU, with respect to this one,
1808
506
                     * so we don't risk stealing always from the same ones.
1809
506
                     */
1810
506
                    prv->balance_bias[peer_node] = peer_cpu;
1811
506
                    return speer;
1812
506
                }
1813
19.5k
1814
6.01M
 next_cpu:
1815
6.01M
                peer_cpu = cpumask_cycle(peer_cpu, &workers);
1816
6.01M
1817
6.01M
            } while( peer_cpu != first_cpu );
1818
693k
1819
754k
 next_node:
1820
754k
            peer_node = cycle_node(peer_node, node_online_map);
1821
754k
        } while( peer_node != node );
1822
751k
    }
1823
385k
1824
376k
 out:
1825
376k
    /* Failed to find more important work elsewhere... */
1826
376k
    __runq_remove(snext);
1827
376k
    return snext;
1828
385k
}
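
The reason the loop above only ever try-locks a peer's runqueue lock is
spelled out in its comment: each pCPU doing load balancing already holds its
own lock, so two of them spinning on each other's would deadlock. The fragment
below illustrates the same rule with a POSIX mutex standing in for
pcpu_schedule_trylock(); it is an analogy, not Xen code.

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t peer_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Returns 1 if we got to look at the peer, 0 if we skipped it. */
    static int try_steal_from_peer(void)
    {
        if ( pthread_mutex_trylock(&peer_lock) != 0 )
        {
            /* Lock busy: skip this peer and move on, as the
             * steal_trylock_failed path above does, rather than spinning. */
            return 0;
        }

        /* ... scan the peer's runqueue here, as csched_runq_steal() does ... */

        pthread_mutex_unlock(&peer_lock);
        return 1;
    }

    int main(void)
    {
        printf("looked at peer: %s\n", try_steal_from_peer() ? "yes" : "no");
        return 0;
    }
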
1829
1830
/*
1831
 * This function is in the critical path. It is designed to be simple and
1832
 * fast for the common case.
1833
 */
1834
static struct task_slice
1835
csched_schedule(
1836
    const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled)
1837
4.70M
{
1838
4.70M
    const int cpu = smp_processor_id();
1839
4.70M
    struct list_head * const runq = RUNQ(cpu);
1840
4.70M
    struct csched_vcpu * const scurr = CSCHED_VCPU(current);
1841
4.70M
    struct csched_private *prv = CSCHED_PRIV(ops);
1842
4.70M
    struct csched_vcpu *snext;
1843
4.70M
    struct task_slice ret;
1844
4.70M
    s_time_t runtime, tslice;
1845
4.70M
1846
4.70M
    SCHED_STAT_CRANK(schedule);
1847
4.70M
    CSCHED_VCPU_CHECK(current);
1848
4.70M
1849
4.70M
    /*
1850
4.70M
     * Here in Credit1 code, we usually just call TRACE_nD() helpers, and
1851
4.70M
     * don't care about packing. But scheduling happens very often, so it
1852
4.70M
     * actually is important that the record is as small as possible.
1853
4.70M
     */
1854
4.70M
    if ( unlikely(tb_init_done) )
1855
0
    {
1856
0
        struct {
1857
0
            unsigned cpu:16, tasklet:8, idle:8;
1858
0
        } d;
1859
0
        d.cpu = cpu;
1860
0
        d.tasklet = tasklet_work_scheduled;
1861
0
        d.idle = is_idle_vcpu(current);
1862
0
        __trace_var(TRC_CSCHED_SCHEDULE, 1, sizeof(d),
1863
0
                    (unsigned char *)&d);
1864
0
    }
1865
4.70M
1866
4.70M
    runtime = now - current->runstate.state_entry_time;
1867
4.70M
    if ( runtime < 0 ) /* Does this ever happen? */
1868
62.4k
        runtime = 0;
1869
4.70M
1870
4.70M
    if ( !is_idle_vcpu(scurr->vcpu) )
1871
4.63M
    {
1872
4.63M
        /* Update credits of a non-idle VCPU. */
1873
4.63M
        burn_credits(scurr, now);
1874
4.63M
        scurr->start_time -= now;
1875
4.63M
    }
1876
4.70M
    else
1877
69.7k
    {
1878
69.7k
        /* Re-instate a boosted idle VCPU as normal-idle. */
1879
69.7k
        scurr->pri = CSCHED_PRI_IDLE;
1880
69.7k
    }
1881
4.70M
1882
4.70M
    /* Choices, choices:
1883
4.70M
     * - If we have a tasklet, we need to run the idle vcpu no matter what.
1884
4.70M
     * - If sched rate limiting is in effect, and the current vcpu has
1885
4.70M
     *   run for less than that amount of time, continue the current one,
1886
4.70M
     *   but with a shorter timeslice and return it immediately
1887
4.70M
     * - Otherwise, choose the one with the highest priority (which may
1888
4.70M
     *   be the one currently running)
1889
4.70M
     * - If the currently running one is TS_OVER, see if there
1890
4.70M
     *   is a higher priority one waiting on the runqueue of another
1891
4.70M
     *   cpu and steal it.
1892
4.70M
     */
1893
4.70M
1894
4.70M
    /*
1895
4.70M
     * If we have schedule rate limiting enabled, check to see
1896
4.70M
     * how long we've run for.
1897
4.70M
     *
1898
4.70M
     * If scurr is yielding, however, we don't let rate limiting kick in.
1899
4.70M
     * In fact, it may be the case that scurr is about to spin, and there's
1900
4.70M
     * no point forcing it to do so until rate limiting expires.
1901
4.70M
     */
1902
4.70M
    if ( !test_bit(CSCHED_FLAG_VCPU_YIELD, &scurr->flags)
1903
134k
         && !tasklet_work_scheduled
1904
134k
         && prv->ratelimit_us
1905
134k
         && vcpu_runnable(current)
1906
68.8k
         && !is_idle_vcpu(current)
1907
2.60k
         && runtime < MICROSECS(prv->ratelimit_us) )
1908
242
    {
1909
242
        snext = scurr;
1910
242
        snext->start_time += now;
1911
242
        perfc_incr(delay_ms);
1912
242
        /*
1913
242
         * Next timeslice must last just until we'll have executed for
1914
242
         * ratelimit_us. However, to avoid setting a really short timer, which
1915
242
         * will most likely be inaccurate and counterproductive, we never go
1916
242
         * below CSCHED_MIN_TIMER.
1917
242
         */
1918
242
        tslice = MICROSECS(prv->ratelimit_us) - runtime;
1919
242
        if ( unlikely(runtime < CSCHED_MIN_TIMER) )
1920
57
            tslice = CSCHED_MIN_TIMER;
1921
242
        if ( unlikely(tb_init_done) )
1922
0
        {
1923
0
            struct {
1924
0
                unsigned vcpu:16, dom:16;
1925
0
                unsigned runtime;
1926
0
            } d;
1927
0
            d.dom = scurr->vcpu->domain->domain_id;
1928
0
            d.vcpu = scurr->vcpu->vcpu_id;
1929
0
            d.runtime = runtime;
1930
0
            __trace_var(TRC_CSCHED_RATELIMIT, 1, sizeof(d),
1931
0
                        (unsigned char *)&d);
1932
0
        }
1933
242
1934
242
        ret.migrated = 0;
1935
242
        goto out;
1936
242
    }
1937
4.70M
    tslice = MILLISECS(prv->tslice_ms);
1938
4.70M
1939
4.70M
    /*
1940
4.70M
     * Select next runnable local VCPU (ie top of local runq)
1941
4.70M
     */
1942
4.70M
    if ( vcpu_runnable(current) )
1943
4.83M
        __runq_insert(scurr);
1944
4.70M
    else
1945
18.4E
    {
1946
18.4E
        BUG_ON( is_idle_vcpu(current) || list_empty(runq) );
1947
18.4E
        /* Current has blocked. Update the runnable counter for this cpu. */
1948
18.4E
        dec_nr_runnable(cpu);
1949
18.4E
    }
1950
4.70M
1951
4.70M
    snext = __runq_elem(runq->next);
1952
4.70M
    ret.migrated = 0;
1953
4.70M
1954
4.70M
    /* Tasklet work (which runs in idle VCPU context) overrides all else. */
1955
4.70M
    if ( tasklet_work_scheduled )
1956
44
    {
1957
44
        TRACE_0D(TRC_CSCHED_SCHED_TASKLET);
1958
44
        snext = CSCHED_VCPU(idle_vcpu[cpu]);
1959
44
        snext->pri = CSCHED_PRI_TS_BOOST;
1960
44
    }
1961
4.70M
1962
4.70M
    /*
1963
4.70M
     * Clear YIELD flag before scheduling out
1964
4.70M
     */
1965
4.70M
    clear_bit(CSCHED_FLAG_VCPU_YIELD, &scurr->flags);
1966
4.70M
1967
4.70M
    /*
1968
4.70M
     * SMP Load balance:
1969
4.70M
     *
1970
4.70M
     * If the next highest priority local runnable VCPU has already eaten
1971
4.70M
     * through its credits, look on other PCPUs to see if we have more
1972
4.70M
     * urgent work... If not, csched_load_balance() will return snext, but
1973
4.70M
     * already removed from the runq.
1974
4.70M
     */
1975
4.70M
    if ( snext->pri > CSCHED_PRI_TS_OVER )
1976
4.52M
        __runq_remove(snext);
1977
4.70M
    else
1978
184k
        snext = csched_load_balance(prv, cpu, snext, &ret.migrated);
1979
4.70M
1980
4.70M
    /*
1981
4.70M
     * Update idlers mask if necessary. When we're idling, other CPUs
1982
4.70M
     * will tickle us when they get extra work.
1983
4.70M
     */
1984
4.88M
    if ( !tasklet_work_scheduled && snext->pri == CSCHED_PRI_IDLE )
1985
65.4k
    {
1986
65.4k
        if ( !cpumask_test_cpu(cpu, prv->idlers) )
1987
65.5k
            cpumask_set_cpu(cpu, prv->idlers);
1988
65.4k
    }
1989
4.64M
    else if ( cpumask_test_cpu(cpu, prv->idlers) )
1990
0
    {
1991
0
        cpumask_clear_cpu(cpu, prv->idlers);
1992
0
    }
1993
4.70M
1994
4.70M
    if ( !is_idle_vcpu(snext->vcpu) )
1995
4.89M
        snext->start_time += now;
1996
4.70M
1997
4.97M
out:
1998
4.97M
    /*
1999
4.97M
     * Return task to run next...
2000
4.97M
     */
2001
4.97M
    ret.time = (is_idle_vcpu(snext->vcpu) ?
2002
4.90M
                -1 : tslice);
2003
4.97M
    ret.task = snext->vcpu;
2004
4.97M
2005
4.97M
    CSCHED_VCPU_CHECK(ret.task);
2006
4.97M
    return ret;
2007
4.70M
}
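
When rate limiting keeps scurr running, the next timeslice above is the unused
part of the ratelimit budget, with a floor so that a nearly exhausted budget
never arms an absurdly short timer. The sketch below reproduces that
computation with assumed example values: the 100 us floor and the 1000 us
ratelimit are placeholders for CSCHED_MIN_TIMER and prv->ratelimit_us, and the
clamp here is applied to the computed remainder, which is the effect the
comment above describes.

    #include <stdint.h>
    #include <stdio.h>

    #define MICROSECS(us)  ((int64_t)(us) * 1000)    /* to ns, like s_time_t */
    #define MIN_TIMER_NS   MICROSECS(100)            /* assumed floor */

    /* Remaining ratelimit budget, never below the minimum timer. */
    static int64_t ratelimit_tslice(int64_t ratelimit_us, int64_t runtime_ns)
    {
        int64_t tslice = MICROSECS(ratelimit_us) - runtime_ns;

        if ( tslice < MIN_TIMER_NS )
            tslice = MIN_TIMER_NS;
        return tslice;
    }

    int main(void)
    {
        /* Ran 800 of 1000 us: 200 us of budget left. */
        printf("%lld ns\n", (long long)ratelimit_tslice(1000, MICROSECS(800)));
        /* Ran 990 of 1000 us: the raw 10 us remainder is clamped to the floor. */
        printf("%lld ns\n", (long long)ratelimit_tslice(1000, MICROSECS(990)));
        return 0;
    }
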
2008
2009
static void
2010
csched_dump_vcpu(struct csched_vcpu *svc)
2011
0
{
2012
0
    struct csched_dom * const sdom = svc->sdom;
2013
0
2014
0
    printk("[%i.%i] pri=%i flags=%x cpu=%i",
2015
0
            svc->vcpu->domain->domain_id,
2016
0
            svc->vcpu->vcpu_id,
2017
0
            svc->pri,
2018
0
            svc->flags,
2019
0
            svc->vcpu->processor);
2020
0
2021
0
    if ( sdom )
2022
0
    {
2023
0
        printk(" credit=%i [w=%u,cap=%u]", atomic_read(&svc->credit),
2024
0
                sdom->weight, sdom->cap);
2025
0
#ifdef CSCHED_STATS
2026
        printk(" (%d+%u) {a/i=%u/%u m=%u+%u (k=%u)}",
2027
                svc->stats.credit_last,
2028
                svc->stats.credit_incr,
2029
                svc->stats.state_active,
2030
                svc->stats.state_idle,
2031
                svc->stats.migrate_q,
2032
                svc->stats.migrate_r,
2033
                svc->stats.kicked_away);
2034
#endif
2035
0
    }
2036
0
2037
0
    printk("\n");
2038
0
}
2039
2040
static void
2041
csched_dump_pcpu(const struct scheduler *ops, int cpu)
2042
0
{
2043
0
    struct list_head *runq, *iter;
2044
0
    struct csched_private *prv = CSCHED_PRIV(ops);
2045
0
    struct csched_pcpu *spc;
2046
0
    struct csched_vcpu *svc;
2047
0
    spinlock_t *lock;
2048
0
    unsigned long flags;
2049
0
    int loop;
2050
0
#define cpustr keyhandler_scratch
2051
0
2052
0
    /*
2053
0
     * We need both locks:
2054
0
     * - csched_dump_vcpu() wants to access domains' scheduling
2055
0
     *   parameters, which are protected by the private scheduler lock;
2056
0
     * - we scan through the runqueue, so we need the proper runqueue
2057
0
     *   lock (the one of the runqueue of this cpu).
2058
0
     */
2059
0
    spin_lock_irqsave(&prv->lock, flags);
2060
0
    lock = pcpu_schedule_lock(cpu);
2061
0
2062
0
    spc = CSCHED_PCPU(cpu);
2063
0
    runq = &spc->runq;
2064
0
2065
0
    cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cpu));
2066
0
    printk("CPU[%02d] nr_run=%d, sort=%d, sibling=%s, ",
2067
0
           cpu, spc->nr_runnable, spc->runq_sort_last, cpustr);
2068
0
    cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu));
2069
0
    printk("core=%s\n", cpustr);
2070
0
2071
0
    /* current VCPU (nothing to say if that's the idle vcpu). */
2072
0
    svc = CSCHED_VCPU(curr_on_cpu(cpu));
2073
0
    if ( svc && !is_idle_vcpu(svc->vcpu) )
2074
0
    {
2075
0
        printk("\trun: ");
2076
0
        csched_dump_vcpu(svc);
2077
0
    }
2078
0
2079
0
    loop = 0;
2080
0
    list_for_each( iter, runq )
2081
0
    {
2082
0
        svc = __runq_elem(iter);
2083
0
        if ( svc )
2084
0
        {
2085
0
            printk("\t%3d: ", ++loop);
2086
0
            csched_dump_vcpu(svc);
2087
0
        }
2088
0
    }
2089
0
2090
0
    pcpu_schedule_unlock(lock, cpu);
2091
0
    spin_unlock_irqrestore(&prv->lock, flags);
2092
0
#undef cpustr
2093
0
}
2094
2095
static void
2096
csched_dump(const struct scheduler *ops)
2097
0
{
2098
0
    struct list_head *iter_sdom, *iter_svc;
2099
0
    struct csched_private *prv = CSCHED_PRIV(ops);
2100
0
    int loop;
2101
0
    unsigned long flags;
2102
0
2103
0
    spin_lock_irqsave(&prv->lock, flags);
2104
0
2105
0
#define idlers_buf keyhandler_scratch
2106
0
2107
0
    printk("info:\n"
2108
0
           "\tncpus              = %u\n"
2109
0
           "\tmaster             = %u\n"
2110
0
           "\tcredit             = %u\n"
2111
0
           "\tcredit balance     = %d\n"
2112
0
           "\tweight             = %u\n"
2113
0
           "\trunq_sort          = %u\n"
2114
0
           "\tdefault-weight     = %d\n"
2115
0
           "\ttslice             = %dms\n"
2116
0
           "\tratelimit          = %dus\n"
2117
0
           "\tcredits per msec   = %d\n"
2118
0
           "\tticks per tslice   = %d\n"
2119
0
           "\tmigration delay    = %uus\n",
2120
0
           prv->ncpus,
2121
0
           prv->master,
2122
0
           prv->credit,
2123
0
           prv->credit_balance,
2124
0
           prv->weight,
2125
0
           prv->runq_sort,
2126
0
           CSCHED_DEFAULT_WEIGHT,
2127
0
           prv->tslice_ms,
2128
0
           prv->ratelimit_us,
2129
0
           CSCHED_CREDITS_PER_MSEC,
2130
0
           prv->ticks_per_tslice,
2131
0
           vcpu_migration_delay);
2132
0
2133
0
    cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), prv->idlers);
2134
0
    printk("idlers: %s\n", idlers_buf);
2135
0
2136
0
    printk("active vcpus:\n");
2137
0
    loop = 0;
2138
0
    list_for_each( iter_sdom, &prv->active_sdom )
2139
0
    {
2140
0
        struct csched_dom *sdom;
2141
0
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
2142
0
2143
0
        list_for_each( iter_svc, &sdom->active_vcpu )
2144
0
        {
2145
0
            struct csched_vcpu *svc;
2146
0
            spinlock_t *lock;
2147
0
2148
0
            svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem);
2149
0
            lock = vcpu_schedule_lock(svc->vcpu);
2150
0
2151
0
            printk("\t%3d: ", ++loop);
2152
0
            csched_dump_vcpu(svc);
2153
0
2154
0
            vcpu_schedule_unlock(lock, svc->vcpu);
2155
0
        }
2156
0
    }
2157
0
#undef idlers_buf
2158
0
2159
0
    spin_unlock_irqrestore(&prv->lock, flags);
2160
0
}
2161
2162
static int
2163
csched_init(struct scheduler *ops)
2164
1
{
2165
1
    struct csched_private *prv;
2166
1
2167
1
    prv = xzalloc(struct csched_private);
2168
1
    if ( prv == NULL )
2169
0
        return -ENOMEM;
2170
1
2171
1
    prv->balance_bias = xzalloc_array(uint32_t, MAX_NUMNODES);
2172
1
    if ( prv->balance_bias == NULL )
2173
0
    {
2174
0
        xfree(prv);
2175
0
        return -ENOMEM;
2176
0
    }
2177
1
2178
1
    if ( !zalloc_cpumask_var(&prv->cpus) ||
2179
1
         !zalloc_cpumask_var(&prv->idlers) )
2180
0
    {
2181
0
        free_cpumask_var(prv->cpus);
2182
0
        xfree(prv->balance_bias);
2183
0
        xfree(prv);
2184
0
        return -ENOMEM;
2185
0
    }
2186
1
2187
1
    ops->sched_data = prv;
2188
1
    spin_lock_init(&prv->lock);
2189
1
    INIT_LIST_HEAD(&prv->active_sdom);
2190
1
    prv->master = UINT_MAX;
2191
1
2192
1
    if ( sched_credit_tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX
2193
1
         || sched_credit_tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN )
2194
0
    {
2195
0
        printk("WARNING: sched_credit_tslice_ms outside of valid range [%d,%d].\n"
2196
0
               " Resetting to default %u\n",
2197
0
               XEN_SYSCTL_CSCHED_TSLICE_MIN,
2198
0
               XEN_SYSCTL_CSCHED_TSLICE_MAX,
2199
0
               CSCHED_DEFAULT_TSLICE_MS);
2200
0
        sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS;
2201
0
    }
2202
1
2203
1
    __csched_set_tslice(prv, sched_credit_tslice_ms);
2204
1
2205
1
    if ( MICROSECS(sched_ratelimit_us) > MILLISECS(sched_credit_tslice_ms) )
2206
0
    {
2207
0
        printk("WARNING: sched_ratelimit_us >" 
2208
0
               "sched_credit_tslice_ms is undefined\n"
2209
0
               "Setting ratelimit_us to 1000 * tslice_ms\n");
2210
0
        prv->ratelimit_us = 1000 * prv->tslice_ms;
2211
0
    }
2212
1
    else
2213
1
        prv->ratelimit_us = sched_ratelimit_us;
2214
1
    return 0;
2215
1
}
2216
2217
static void
2218
csched_deinit(struct scheduler *ops)
2219
0
{
2220
0
    struct csched_private *prv;
2221
0
2222
0
    prv = CSCHED_PRIV(ops);
2223
0
    if ( prv != NULL )
2224
0
    {
2225
0
        ops->sched_data = NULL;
2226
0
        free_cpumask_var(prv->cpus);
2227
0
        free_cpumask_var(prv->idlers);
2228
0
        xfree(prv->balance_bias);
2229
0
        xfree(prv);
2230
0
    }
2231
0
}
2232
2233
static void csched_tick_suspend(const struct scheduler *ops, unsigned int cpu)
2234
1.89M
{
2235
1.89M
    struct csched_pcpu *spc;
2236
1.89M
2237
1.89M
    spc = CSCHED_PCPU(cpu);
2238
1.89M
2239
1.89M
    stop_timer(&spc->ticker);
2240
1.89M
}
2241
2242
static void csched_tick_resume(const struct scheduler *ops, unsigned int cpu)
2243
1.86M
{
2244
1.86M
    struct csched_private *prv;
2245
1.86M
    struct csched_pcpu *spc;
2246
1.86M
    uint64_t now = NOW();
2247
1.86M
2248
1.86M
    spc = CSCHED_PCPU(cpu);
2249
1.86M
2250
1.86M
    prv = CSCHED_PRIV(ops);
2251
1.86M
2252
1.86M
    set_timer(&spc->ticker, now + MICROSECS(prv->tick_period_us)
2253
1.86M
            - now % MICROSECS(prv->tick_period_us) );
2254
1.86M
}
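
Note how the resume path above does not simply re-arm the ticker a full period
from NOW(): subtracting NOW() modulo the period rounds the expiry up to the
next period boundary, so ticks re-armed after a stop_timer() stay on the same
grid as before. A small worked example (the 10 ms period is just an
illustrative value, expressed in nanoseconds):

    #include <stdint.h>
    #include <stdio.h>

    /* Next expiry: now rounded up to the next multiple of the period. */
    static uint64_t next_tick(uint64_t now_ns, uint64_t period_ns)
    {
        return now_ns + period_ns - now_ns % period_ns;
    }

    int main(void)
    {
        uint64_t period = 10000000;    /* 10 ms, in ns */

        /* now = 123.4 ms: the tick fires at 130 ms, not at 133.4 ms. */
        printf("%llu\n", (unsigned long long)next_tick(123400000, period));
        /* now exactly on a boundary: a full period from now (130 ms). */
        printf("%llu\n", (unsigned long long)next_tick(120000000, period));
        return 0;
    }
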
2255
2256
static const struct scheduler sched_credit_def = {
2257
    .name           = "SMP Credit Scheduler",
2258
    .opt_name       = "credit",
2259
    .sched_id       = XEN_SCHEDULER_CREDIT,
2260
    .sched_data     = NULL,
2261
2262
    .init_domain    = csched_dom_init,
2263
    .destroy_domain = csched_dom_destroy,
2264
2265
    .insert_vcpu    = csched_vcpu_insert,
2266
    .remove_vcpu    = csched_vcpu_remove,
2267
2268
    .sleep          = csched_vcpu_sleep,
2269
    .wake           = csched_vcpu_wake,
2270
    .yield          = csched_vcpu_yield,
2271
2272
    .adjust         = csched_dom_cntl,
2273
    .adjust_global  = csched_sys_cntl,
2274
2275
    .pick_cpu       = csched_cpu_pick,
2276
    .do_schedule    = csched_schedule,
2277
2278
    .dump_cpu_state = csched_dump_pcpu,
2279
    .dump_settings  = csched_dump,
2280
    .init           = csched_init,
2281
    .deinit         = csched_deinit,
2282
    .alloc_vdata    = csched_alloc_vdata,
2283
    .free_vdata     = csched_free_vdata,
2284
    .alloc_pdata    = csched_alloc_pdata,
2285
    .init_pdata     = csched_init_pdata,
2286
    .deinit_pdata   = csched_deinit_pdata,
2287
    .free_pdata     = csched_free_pdata,
2288
    .switch_sched   = csched_switch_sched,
2289
    .alloc_domdata  = csched_alloc_domdata,
2290
    .free_domdata   = csched_free_domdata,
2291
2292
    .tick_suspend   = csched_tick_suspend,
2293
    .tick_resume    = csched_tick_resume,
2294
};
2295
2296
REGISTER_SCHEDULER(sched_credit_def);