Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/common/sched_credit2.c
Line
Count
Source (jump to first uncovered line)
1
2
/****************************************************************************
3
 * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd
4
 ****************************************************************************
5
 *
6
 *        File: common/sched_credit2.c
7
 *      Author: George Dunlap
8
 *
9
 * Description: Credit-based SMP CPU scheduler
10
 * Based on an earlier verson by Emmanuel Ackaouy.
11
 */
12
13
#include <xen/init.h>
14
#include <xen/lib.h>
15
#include <xen/sched.h>
16
#include <xen/domain.h>
17
#include <xen/delay.h>
18
#include <xen/event.h>
19
#include <xen/time.h>
20
#include <xen/perfc.h>
21
#include <xen/sched-if.h>
22
#include <xen/softirq.h>
23
#include <asm/div64.h>
24
#include <xen/errno.h>
25
#include <xen/trace.h>
26
#include <xen/cpu.h>
27
#include <xen/keyhandler.h>
28
29
/* Meant only for helping developers during debugging. */
30
/* #define d2printk printk */
31
#define d2printk(x...)
32
33
34
/*
35
 * Credit2 tracing events ("only" 512 available!). Check
36
 * include/public/trace.h for more details.
37
 */
38
#define TRC_CSCHED2_TICK             TRC_SCHED_CLASS_EVT(CSCHED2, 1)
39
0
#define TRC_CSCHED2_RUNQ_POS         TRC_SCHED_CLASS_EVT(CSCHED2, 2)
40
0
#define TRC_CSCHED2_CREDIT_BURN      TRC_SCHED_CLASS_EVT(CSCHED2, 3)
41
#define TRC_CSCHED2_CREDIT_ADD       TRC_SCHED_CLASS_EVT(CSCHED2, 4)
42
0
#define TRC_CSCHED2_TICKLE_CHECK     TRC_SCHED_CLASS_EVT(CSCHED2, 5)
43
0
#define TRC_CSCHED2_TICKLE           TRC_SCHED_CLASS_EVT(CSCHED2, 6)
44
0
#define TRC_CSCHED2_CREDIT_RESET     TRC_SCHED_CLASS_EVT(CSCHED2, 7)
45
0
#define TRC_CSCHED2_SCHED_TASKLET    TRC_SCHED_CLASS_EVT(CSCHED2, 8)
46
0
#define TRC_CSCHED2_UPDATE_LOAD      TRC_SCHED_CLASS_EVT(CSCHED2, 9)
47
0
#define TRC_CSCHED2_RUNQ_ASSIGN      TRC_SCHED_CLASS_EVT(CSCHED2, 10)
48
0
#define TRC_CSCHED2_UPDATE_VCPU_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 11)
49
0
#define TRC_CSCHED2_UPDATE_RUNQ_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 12)
50
0
#define TRC_CSCHED2_TICKLE_NEW       TRC_SCHED_CLASS_EVT(CSCHED2, 13)
51
0
#define TRC_CSCHED2_RUNQ_MAX_WEIGHT  TRC_SCHED_CLASS_EVT(CSCHED2, 14)
52
0
#define TRC_CSCHED2_MIGRATE          TRC_SCHED_CLASS_EVT(CSCHED2, 15)
53
0
#define TRC_CSCHED2_LOAD_CHECK       TRC_SCHED_CLASS_EVT(CSCHED2, 16)
54
0
#define TRC_CSCHED2_LOAD_BALANCE     TRC_SCHED_CLASS_EVT(CSCHED2, 17)
55
0
#define TRC_CSCHED2_PICKED_CPU       TRC_SCHED_CLASS_EVT(CSCHED2, 19)
56
0
#define TRC_CSCHED2_RUNQ_CANDIDATE   TRC_SCHED_CLASS_EVT(CSCHED2, 20)
57
0
#define TRC_CSCHED2_SCHEDULE         TRC_SCHED_CLASS_EVT(CSCHED2, 21)
58
0
#define TRC_CSCHED2_RATELIMIT        TRC_SCHED_CLASS_EVT(CSCHED2, 22)
59
0
#define TRC_CSCHED2_RUNQ_CAND_CHECK  TRC_SCHED_CLASS_EVT(CSCHED2, 23)
60
61
/*
62
 * WARNING: This is still in an experimental phase.  Status and work can be found at the
63
 * credit2 wiki page:
64
 *  http://wiki.xen.org/wiki/Credit2_Scheduler_Development
65
 *
66
 * TODO:
67
 * + Hyperthreading
68
 *  - Look for non-busy core if possible
69
 *  - "Discount" time run on a thread with busy siblings
70
 * + Algorithm:
71
 *  - "Mixed work" problem: if a VM is playing audio (5%) but also burning cpu (e.g.,
72
 *    a flash animation in the background) can we schedule it with low enough latency
73
 *    so that audio doesn't skip?
74
 *  - Cap and reservation: How to implement with the current system?
75
 * + Optimizing
76
 *  - Profiling, making new algorithms, making math more efficient (no long division)
77
 */
78
79
/*
80
 * Design:
81
 *
82
 * VMs "burn" credits based on their weight; higher weight means
83
 * credits burn more slowly.  The highest weight vcpu burns credits at
84
 * a rate of 1 credit per nanosecond.  Others burn proportionally
85
 * more.
86
 *
87
 * vcpus are inserted into the runqueue by credit order.
88
 *
89
 * Credits are "reset" when the next vcpu in the runqueue is less than
90
 * or equal to zero.  At that point, everyone's credits are "clipped"
91
 * to a small value, and a fixed credit is added to everyone.
92
 */
93
94
/*
95
 * Utilization cap:
96
 *
97
 * Setting an pCPU utilization cap for a domain means the following:
98
 *
99
 * - a domain can have a cap, expressed in terms of % of physical CPU time.
100
 *   A domain that must not use more than 1/4 of _one_ physical CPU, will
101
 *   be given a cap of 25%; a domain that must not use more than 1+1/2 of
102
 *   physical CPU time, will be given a cap of 150%;
103
 *
104
 * - caps are per-domain (not per-vCPU). If a domain has only 1 vCPU, and
105
 *   a 40% cap, that one vCPU will use 40% of one pCPU. If a somain has 4
106
 *   vCPUs, and a 200% cap, the equivalent of 100% time on 2 pCPUs will be
107
 *   split among the v vCPUs. How much each of the vCPUs will actually get,
108
 *   during any given interval of time, is unspecified (as it depends on
109
 *   various aspects: workload, system load, etc.). For instance, it is
110
 *   possible that, during a given time interval, 2 vCPUs use 100% each,
111
 *   and the other two use nothing; while during another time interval,
112
 *   two vCPUs use 80%, one uses 10% and the other 30%; or that each use
113
 *   50% (and so on and so forth).
114
 *
115
 * For implementing this, we use the following approach:
116
 *
117
 * - each domain is given a 'budget', an each domain has a timer, which
118
 *   replenishes the domain's budget periodically. The budget is the amount
119
 *   of time the vCPUs of the domain can use every 'period';
120
 *
121
 * - the period is CSCHED2_BDGT_REPL_PERIOD, and is the same for all domains
122
 *   (but each domain has its own timer; so the all are periodic by the same
123
 *   period, but replenishment of the budgets of the various domains, at
124
 *   periods boundaries, are not synchronous);
125
 *
126
 * - when vCPUs run, they consume budget. When they don't run, they don't
127
 *   consume budget. If there is no budget left for the domain, no vCPU of
128
 *   that domain can run. If a vCPU tries to run and finds that there is no
129
 *   budget, it blocks.
130
 *   At whatever time a vCPU wants to run, it must check the domain's budget,
131
 *   and if there is some, it can use it.
132
 *
133
 * - budget is replenished to the top of the capacity for the domain once
134
 *   per period. Even if there was some leftover budget from previous period,
135
 *   though, the budget after a replenishment will always be at most equal
136
 *   to the total capacify of the domain ('tot_budget');
137
 *
138
 * - when a budget replenishment occurs, if there are vCPUs that had been
139
 *   blocked because of lack of budget, they'll be unblocked, and they will
140
 *   (potentially) be able to run again.
141
 *
142
 * Finally, some even more implementation related detail:
143
 *
144
 * - budget is stored in a domain-wide pool. vCPUs of the domain that want
145
 *   to run go to such pool, and grub some. When they do so, the amount
146
 *   they grabbed is _immediately_ removed from the pool. This happens in
147
 *   vcpu_grab_budget();
148
 *
149
 * - when vCPUs stop running, if they've not consumed all the budget they
150
 *   took, the leftover is put back in the pool. This happens in
151
 *   vcpu_return_budget();
152
 *
153
 * - the above means that a vCPU can find out that there is no budget and
154
 *   block, not only if the cap has actually been reached (for this period),
155
 *   but also if some other vCPUs, in order to run, have grabbed a certain
156
 *   quota of budget, no matter whether they've already used it all or not.
157
 *   A vCPU blocking because (any form of) lack of budget is said to be
158
 *   "parked", and such blocking happens in park_vcpu();
159
 *
160
 * - when a vCPU stops running, and puts back some budget in the domain pool,
161
 *   we need to check whether there is someone which has been parked and that
162
 *   can be unparked. This happens in unpark_parked_vcpus(), called from
163
 *   csched2_context_saved();
164
 *
165
 * - of course, unparking happens also as a consequence of the domain's budget
166
 *   being replenished by the periodic timer. This also occurs by means of
167
 *   calling csched2_context_saved() (but from replenish_domain_budget());
168
 *
169
 * - parked vCPUs of a domain are kept in a (per-domain) list, called
170
 *   'parked_vcpus'). Manipulation of the list and of the domain-wide budget
171
 *   pool, must occur only when holding the 'budget_lock'.
172
 */
173
174
/*
175
 * Locking:
176
 *
177
 * - runqueue lock
178
 *  + it is per-runqueue, so:
179
 *   * cpus in a runqueue take the runqueue lock, when using
180
 *     pcpu_schedule_lock() / vcpu_schedule_lock() (and friends),
181
 *   * a cpu may (try to) take a "remote" runqueue lock, e.g., for
182
 *     load balancing;
183
 *  + serializes runqueue operations (removing and inserting vcpus);
184
 *  + protects runqueue-wide data in csched2_runqueue_data;
185
 *  + protects vcpu parameters in csched2_vcpu for the vcpu in the
186
 *    runqueue.
187
 *
188
 * - Private scheduler lock
189
 *  + protects scheduler-wide data in csched2_private, such as:
190
 *   * the list of domains active in this scheduler,
191
 *   * what cpus and what runqueues are active and in what
192
 *     runqueue each cpu is;
193
 *  + serializes the operation of changing the weights of domains;
194
 *
195
 * - Budget lock
196
 *  + it is per-domain;
197
 *  + protects, in domains that have an utilization cap;
198
 *   * manipulation of the total budget of the domain (as it is shared
199
 *     among all vCPUs of the domain),
200
 *   * manipulation of the list of vCPUs that are blocked waiting for
201
 *     some budget to be available.
202
 *
203
 * - Type:
204
 *  + runqueue locks are 'regular' spinlocks;
205
 *  + the private scheduler lock can be an rwlock. In fact, data
206
 *    it protects is modified only during initialization, cpupool
207
 *    manipulation and when changing weights, and read in all
208
 *    other cases (e.g., during load balancing);
209
 *  + budget locks are 'regular' spinlocks.
210
 *
211
 * Ordering:
212
 *  + tylock must be used when wanting to take a runqueue lock,
213
 *    if we already hold another one;
214
 *  + if taking both a runqueue lock and the private scheduler
215
 *    lock is, the latter must always be taken for first;
216
 *  + if taking both a runqueue lock and a budget lock, the former
217
 *    must always be taken for first.
218
 */
219
220
/*
221
 * Basic constants
222
 */
223
/* Default weight: How much a new domain starts with. */
224
0
#define CSCHED2_DEFAULT_WEIGHT       256
225
/*
226
 * Min timer: Minimum length a timer will be set, to
227
 * achieve efficiency.
228
 */
229
0
#define CSCHED2_MIN_TIMER            MICROSECS(500)
230
/*
231
 * Amount of credit VMs begin with, and are reset to.
232
 * ATM, set so that highest-weight VMs can only run for 10ms
233
 * before a reset event.
234
 */
235
0
#define CSCHED2_CREDIT_INIT          MILLISECS(10)
236
/*
237
 * Amount of credit the idle vcpus have. It never changes, as idle
238
 * vcpus does not consume credits, and it must be lower than whatever
239
 * amount of credit 'regular' vcpu would end up with.
240
 */
241
0
#define CSCHED2_IDLE_CREDIT          (-(1U<<30))
242
/*
243
 * Carryover: How much "extra" credit may be carried over after
244
 * a reset.
245
 */
246
0
#define CSCHED2_CARRYOVER_MAX        CSCHED2_MIN_TIMER
247
/*
248
 * Stickiness: Cross-L2 migration resistance.  Should be less than
249
 * MIN_TIMER.
250
 */
251
0
#define CSCHED2_MIGRATE_RESIST       ((opt_migrate_resist)*MICROSECS(1))
252
/* How much to "compensate" a vcpu for L2 migration. */
253
0
#define CSCHED2_MIGRATE_COMPENSATION MICROSECS(50)
254
/* How tolerant we should be when peeking at runtime of vcpus on other cpus */
255
0
#define CSCHED2_RATELIMIT_TICKLE_TOLERANCE MICROSECS(50)
256
/* Reset: Value below which credit will be reset. */
257
0
#define CSCHED2_CREDIT_RESET         0
258
/* Max timer: Maximum time a guest can be run for. */
259
0
#define CSCHED2_MAX_TIMER            CSCHED2_CREDIT_INIT
260
/* Period of the cap replenishment timer. */
261
0
#define CSCHED2_BDGT_REPL_PERIOD     ((opt_cap_period)*MILLISECS(1))
262
263
/*
264
 * Flags
265
 */
266
/*
267
 * CSFLAG_scheduled: Is this vcpu either running on, or context-switching off,
268
 * a physical cpu?
269
 * + Accessed only with runqueue lock held
270
 * + Set when chosen as next in csched2_schedule().
271
 * + Cleared after context switch has been saved in csched2_context_saved()
272
 * + Checked in vcpu_wake to see if we can add to the runqueue, or if we should
273
 *   set CSFLAG_delayed_runq_add
274
 * + Checked to be false in runq_insert.
275
 */
276
0
#define __CSFLAG_scheduled 1
277
0
#define CSFLAG_scheduled (1U<<__CSFLAG_scheduled)
278
/*
279
 * CSFLAG_delayed_runq_add: Do we need to add this to the runqueue once it'd done
280
 * being context switched out?
281
 * + Set when scheduling out in csched2_schedule() if prev is runnable
282
 * + Set in csched2_vcpu_wake if it finds CSFLAG_scheduled set
283
 * + Read in csched2_context_saved().  If set, it adds prev to the runqueue and
284
 *   clears the bit.
285
 */
286
0
#define __CSFLAG_delayed_runq_add 2
287
0
#define CSFLAG_delayed_runq_add (1U<<__CSFLAG_delayed_runq_add)
288
/*
289
 * CSFLAG_runq_migrate_request: This vcpu is being migrated as a result of a
290
 * credit2-initiated runq migrate request; migrate it to the runqueue indicated
291
 * in the svc struct. 
292
 */
293
0
#define __CSFLAG_runq_migrate_request 3
294
0
#define CSFLAG_runq_migrate_request (1U<<__CSFLAG_runq_migrate_request)
295
/*
296
 * CSFLAG_vcpu_yield: this vcpu was running, and has called vcpu_yield(). The
297
 * scheduler is invoked to see if we can give the cpu to someone else, and
298
 * get back to the yielding vcpu in a while.
299
 */
300
#define __CSFLAG_vcpu_yield 4
301
#define CSFLAG_vcpu_yield (1U<<__CSFLAG_vcpu_yield)
302
303
static unsigned int __read_mostly opt_migrate_resist = 500;
304
integer_param("sched_credit2_migrate_resist", opt_migrate_resist);
305
306
/*
307
 * Load tracking and load balancing
308
 *
309
 * Load history of runqueues and vcpus is accounted for by using an
310
 * exponential weighted moving average algorithm. However, instead of using
311
 * fractions,we shift everything to left by the number of bits we want to
312
 * use for representing the fractional part (Q-format).
313
 *
314
 * We may also want to reduce the precision of time accounting, to
315
 * accommodate 'longer  windows'. So, if that is the case, we just need to
316
 * shift all time samples to the right.
317
 *
318
 * The details of the formulas used for load tracking are explained close to
319
 * update_runq_load(). Let's just say here that, with full nanosecond time
320
 * granularity, a 30 bits wide 'decaying window' is ~1 second long.
321
 *
322
 * We want to consider the following equations:
323
 *
324
 *  avg[0] = load*P
325
 *  avg[i+1] = avg[i] + delta*load*P/W - delta*avg[i]/W,  0 <= delta <= W
326
 *
327
 * where W is the length of the window, P the multiplier for transitiong into
328
 * Q-format fixed point arithmetic and load is the instantaneous load of a
329
 * runqueue, which basically is the number of runnable vcpus there are on the
330
 * runqueue (for the meaning of the other terms, look at the doc comment to
331
 *  update_runq_load()).
332
 *
333
 *  So, again, with full nanosecond granularity, and 1 second window, we have:
334
 *
335
 *  W = 2^30
336
 *  P = 2^18
337
 *
338
 * The maximum possible value for the average load, which we want to store in
339
 * s_time_t type variables (i.e., we have 63 bits available) is load*P. This
340
 * means that, with P 18 bits wide, load can occupy 45 bits. This in turn
341
 * means we can have 2^45 vcpus in each runqueue, before overflow occurs!
342
 *
343
 * However, it can happen that, at step j+1, if:
344
 *
345
 *  avg[j] = load*P
346
 *  delta = W
347
 *
348
 * then:
349
 *
350
 *  avg[j+i] = avg[j] + W*load*P/W - W*load*P/W
351
 *
352
 * So we must be able to deal with W*load*P. This means load can't be higher
353
 * than:
354
 *
355
 *  2^(63 - 30 - 18) = 2^15 = 32768
356
 *
357
 * So 32768 is the maximum number of vcpus the we can have in a runqueue,
358
 * at any given time, and still not have problems with the load tracking
359
 * calculations... and this is more than fine.
360
 *
361
 * As a matter of fact, since we are using microseconds granularity, we have
362
 * W=2^20. So, still with 18 fractional bits and a 1 second long window, there
363
 * may be 2^25 = 33554432 vcpus in a runq before we have to start thinking
364
 * about overflow.
365
 */
366
367
/* If >0, decreases the granularity of time samples used for load tracking. */
368
0
#define LOADAVG_GRANULARITY_SHIFT   (10)
369
/* Time window during which we still give value to previous load history. */
370
0
#define LOADAVG_WINDOW_SHIFT        (30)
371
/* 18 bits by default (and not less than 4) for decimals. */
372
#define LOADAVG_PRECISION_SHIFT     (18)
373
0
#define LOADAVG_PRECISION_SHIFT_MIN (4)
374
375
/*
376
 * Both the length of the window and the number of fractional bits can be
377
 * decided with boot parameters.
378
 *
379
 * The length of the window is always expressed in nanoseconds. The actual
380
 * value used by default is LOADAVG_WINDOW_SHIFT - LOADAVG_GRANULARITY_SHIFT.
381
 */
382
static unsigned int __read_mostly opt_load_window_shift = LOADAVG_WINDOW_SHIFT;
383
integer_param("credit2_load_window_shift", opt_load_window_shift);
384
static unsigned int __read_mostly opt_load_precision_shift = LOADAVG_PRECISION_SHIFT;
385
integer_param("credit2_load_precision_shift", opt_load_precision_shift);
386
387
static int __read_mostly opt_underload_balance_tolerance = 0;
388
integer_param("credit2_balance_under", opt_underload_balance_tolerance);
389
static int __read_mostly opt_overload_balance_tolerance = -3;
390
integer_param("credit2_balance_over", opt_overload_balance_tolerance);
391
/*
392
 * Domains subject to a cap receive a replenishment of their runtime budget
393
 * once every opt_cap_period interval. Default is 10 ms. The amount of budget
394
 * they receive depends on their cap. For instance, a domain with a 50% cap
395
 * will receive 50% of 10 ms, so 5 ms.
396
 */
397
static unsigned int __read_mostly opt_cap_period = 10;    /* ms */
398
integer_param("credit2_cap_period_ms", opt_cap_period);
399
400
/*
401
 * Runqueue organization.
402
 *
403
 * The various cpus are to be assigned each one to a runqueue, and we
404
 * want that to happen basing on topology. At the moment, it is possible
405
 * to choose to arrange runqueues to be:
406
 *
407
 * - per-cpu: meaning that there will be one runqueue per logical cpu. This
408
 *            will happen when if the opt_runqueue parameter is set to 'cpu'.
409
 *
410
 * - per-core: meaning that there will be one runqueue per each physical
411
 *             core of the host. This will happen if the opt_runqueue
412
 *             parameter is set to 'core';
413
 *
414
 * - per-socket: meaning that there will be one runqueue per each physical
415
 *               socket (AKA package, which often, but not always, also
416
 *               matches a NUMA node) of the host; This will happen if
417
 *               the opt_runqueue parameter is set to 'socket';
418
 *
419
 * - per-node: meaning that there will be one runqueue per each physical
420
 *             NUMA node of the host. This will happen if the opt_runqueue
421
 *             parameter is set to 'node';
422
 *
423
 * - global: meaning that there will be only one runqueue to which all the
424
 *           (logical) processors of the host belong. This will happen if
425
 *           the opt_runqueue parameter is set to 'all'.
426
 *
427
 * Depending on the value of opt_runqueue, therefore, cpus that are part of
428
 * either the same physical core, the same physical socket, the same NUMA
429
 * node, or just all of them, will be put together to form runqueues.
430
 */
431
0
#define OPT_RUNQUEUE_CPU    0
432
0
#define OPT_RUNQUEUE_CORE   1
433
0
#define OPT_RUNQUEUE_SOCKET 2
434
0
#define OPT_RUNQUEUE_NODE   3
435
0
#define OPT_RUNQUEUE_ALL    4
436
static const char *const opt_runqueue_str[] = {
437
    [OPT_RUNQUEUE_CPU] = "cpu",
438
    [OPT_RUNQUEUE_CORE] = "core",
439
    [OPT_RUNQUEUE_SOCKET] = "socket",
440
    [OPT_RUNQUEUE_NODE] = "node",
441
    [OPT_RUNQUEUE_ALL] = "all"
442
};
443
static int __read_mostly opt_runqueue = OPT_RUNQUEUE_SOCKET;
444
445
static int parse_credit2_runqueue(const char *s)
446
0
{
447
0
    unsigned int i;
448
0
449
0
    for ( i = 0; i < ARRAY_SIZE(opt_runqueue_str); i++ )
450
0
    {
451
0
        if ( !strcmp(s, opt_runqueue_str[i]) )
452
0
        {
453
0
            opt_runqueue = i;
454
0
            return 0;
455
0
        }
456
0
    }
457
0
458
0
    return -EINVAL;
459
0
}
460
custom_param("credit2_runqueue", parse_credit2_runqueue);
461
462
/*
463
 * Per-runqueue data
464
 */
465
struct csched2_runqueue_data {
466
    spinlock_t lock;           /* Lock for this runqueue                     */
467
468
    struct list_head runq;     /* Ordered list of runnable vms               */
469
    int id;                    /* ID of this runqueue (-1 if invalid)        */
470
471
    int load;                  /* Instantaneous load (num of non-idle vcpus) */
472
    s_time_t load_last_update; /* Last time average was updated              */
473
    s_time_t avgload;          /* Decaying queue load                        */
474
    s_time_t b_avgload;        /* Decaying queue load modified by balancing  */
475
476
    cpumask_t active,          /* CPUs enabled for this runqueue             */
477
        smt_idle,              /* Fully idle-and-untickled cores (see below) */
478
        tickled,               /* Have been asked to go through schedule     */
479
        idle;                  /* Currently idle pcpus                       */
480
481
    struct list_head svc;      /* List of all vcpus assigned to the runqueue */
482
    unsigned int max_weight;   /* Max weight of the vcpus in this runqueue   */
483
    unsigned int pick_bias;    /* Last picked pcpu. Start from it next time  */
484
};
485
486
/*
487
 * System-wide private data
488
 */
489
struct csched2_private {
490
    rwlock_t lock;                     /* Private scheduler lock             */
491
492
    unsigned int load_precision_shift; /* Precision of load calculations     */
493
    unsigned int load_window_shift;    /* Lenght of load decaying window     */
494
    unsigned int ratelimit_us;         /* Rate limiting for this scheduler   */
495
496
    cpumask_t active_queues;           /* Runqueues with (maybe) active cpus */
497
    struct csched2_runqueue_data *rqd; /* Data of the various runqueues      */
498
499
    cpumask_t initialized;             /* CPUs part of this scheduler        */
500
    struct list_head sdom;             /* List of domains (for debug key)    */
501
};
502
503
/*
504
 * Physical CPU
505
 *
506
 * The only per-pCPU information we need to maintain is of which runqueue
507
 * each CPU is part of.
508
 */
509
static DEFINE_PER_CPU(int, runq_map);
510
511
/*
512
 * Virtual CPU
513
 */
514
struct csched2_vcpu {
515
    struct csched2_dom *sdom;          /* Up-pointer to domain                */
516
    struct vcpu *vcpu;                 /* Up-pointer, to vcpu                 */
517
    struct csched2_runqueue_data *rqd; /* Up-pointer to the runqueue          */
518
519
    int credit;                        /* Current amount of credit            */
520
    unsigned int weight;               /* Weight of this vcpu                 */
521
    unsigned int residual;             /* Reminder of div(max_weight/weight)  */
522
    unsigned flags;                    /* Status flags (16 bits would be ok,  */
523
    s_time_t budget;                   /* Current budget (if domains has cap) */
524
                                       /* but clear_bit() does not like that) */
525
    s_time_t budget_quota;             /* Budget to which vCPU is entitled    */
526
527
    s_time_t start_time;               /* Time we were scheduled (for credit) */
528
529
    /* Individual contribution to load                                        */
530
    s_time_t load_last_update;         /* Last time average was updated       */
531
    s_time_t avgload;                  /* Decaying queue load                 */
532
533
    struct list_head runq_elem;        /* On the runqueue (rqd->runq)         */
534
    struct list_head parked_elem;      /* On the parked_vcpus list            */
535
    struct list_head rqd_elem;         /* On csched2_runqueue_data's svc list */
536
    struct csched2_runqueue_data *migrate_rqd; /* Pre-determined migr. target */
537
    int tickled_cpu;                   /* Cpu that will pick us (-1 if none)  */
538
};
539
540
/*
541
 * Domain
542
 */
543
struct csched2_dom {
544
    struct domain *dom;         /* Up-pointer to domain                       */
545
546
    spinlock_t budget_lock;     /* Serialized budget calculations             */
547
    s_time_t tot_budget;        /* Total amount of budget                     */
548
    s_time_t budget;            /* Currently available budget                 */
549
550
    struct timer *repl_timer;   /* Timer for periodic replenishment of budget */
551
    s_time_t next_repl;         /* Time at which next replenishment occurs    */
552
    struct list_head parked_vcpus; /* List of CPUs waiting for budget         */
553
554
    struct list_head sdom_elem; /* On csched2_runqueue_data's sdom list       */
555
    uint16_t weight;            /* User specified weight                      */
556
    uint16_t cap;               /* User specified cap                         */
557
    uint16_t nr_vcpus;          /* Number of vcpus of this domain             */
558
};
559
560
/*
561
 * Accessor helpers functions.
562
 */
563
static inline struct csched2_private *csched2_priv(const struct scheduler *ops)
564
0
{
565
0
    return ops->sched_data;
566
0
}
567
568
static inline struct csched2_vcpu *csched2_vcpu(const struct vcpu *v)
569
0
{
570
0
    return v->sched_priv;
571
0
}
572
573
static inline struct csched2_dom *csched2_dom(const struct domain *d)
574
0
{
575
0
    return d->sched_priv;
576
0
}
577
578
/* CPU to runq_id macro */
579
static inline int c2r(unsigned int cpu)
580
0
{
581
0
    return per_cpu(runq_map, cpu);
582
0
}
583
584
/* CPU to runqueue struct macro */
585
static inline struct csched2_runqueue_data *c2rqd(const struct scheduler *ops,
586
                                                  unsigned int cpu)
587
0
{
588
0
    return &csched2_priv(ops)->rqd[c2r(cpu)];
589
0
}
590
591
/* Does the domain of this vCPU have a cap? */
592
static inline bool has_cap(const struct csched2_vcpu *svc)
593
0
{
594
0
    return svc->budget != STIME_MAX;
595
0
}
596
597
/*
598
 * Hyperthreading (SMT) support.
599
 *
600
 * We use a special per-runq mask (smt_idle) and update it according to the
601
 * following logic:
602
 *  - when _all_ the SMT sibling in a core are idle, all their corresponding
603
 *    bits are set in the smt_idle mask;
604
 *  - when even _just_one_ of the SMT siblings in a core is not idle, all the
605
 *    bits correspondings to it and to all its siblings are clear in the
606
 *    smt_idle mask.
607
 *
608
 * Once we have such a mask, it is easy to implement a policy that, either:
609
 *  - uses fully idle cores first: it is enough to try to schedule the vcpus
610
 *    on pcpus from smt_idle mask first. This is what happens if
611
 *    sched_smt_power_savings was not set at boot (default), and it maximizes
612
 *    true parallelism, and hence performance;
613
 *  - uses already busy cores first: it is enough to try to schedule the vcpus
614
 *    on pcpus that are idle, but are not in smt_idle. This is what happens if
615
 *    sched_smt_power_savings is set at boot, and it allows as more cores as
616
 *    possible to stay in low power states, minimizing power consumption.
617
 *
618
 * This logic is entirely implemented in runq_tickle(), and that is enough.
619
 * In fact, in this scheduler, placement of a vcpu on one of the pcpus of a
620
 * runq, _always_ happens by means of tickling:
621
 *  - when a vcpu wakes up, it calls csched2_vcpu_wake(), which calls
622
 *    runq_tickle();
623
 *  - when a migration is initiated in schedule.c, we call csched2_cpu_pick(),
624
 *    csched2_vcpu_migrate() (which calls migrate()) and csched2_vcpu_wake().
625
 *    csched2_cpu_pick() looks for the least loaded runq and return just any
626
 *    of its processors. Then, csched2_vcpu_migrate() just moves the vcpu to
627
 *    the chosen runq, and it is again runq_tickle(), called by
628
 *    csched2_vcpu_wake() that actually decides what pcpu to use within the
629
 *    chosen runq;
630
 *  - when a migration is initiated in sched_credit2.c, by calling  migrate()
631
 *    directly, that again temporarily use a random pcpu from the new runq,
632
 *    and then calls runq_tickle(), by itself.
633
 */
634
635
/*
636
 * If all the siblings of cpu (including cpu itself) are both idle and
637
 * untickled, set all their bits in mask.
638
 *
639
 * NB that rqd->smt_idle is different than rqd->idle.  rqd->idle
640
 * records pcpus that at are merely idle (i.e., at the moment do not
641
 * have a vcpu running on them).  But you have to manually filter out
642
 * which pcpus have been tickled in order to find cores that are not
643
 * going to be busy soon.  Filtering out tickled cpus pairwise is a
644
 * lot of extra pain; so for rqd->smt_idle, we explicitly make so that
645
 * the bits of a pcpu are set only if all the threads on its core are
646
 * both idle *and* untickled.
647
 *
648
 * This means changing the mask when either rqd->idle or rqd->tickled
649
 * changes.
650
 */
651
static inline
652
void smt_idle_mask_set(unsigned int cpu, const cpumask_t *idlers,
653
                       cpumask_t *mask)
654
0
{
655
0
    const cpumask_t *cpu_siblings = per_cpu(cpu_sibling_mask, cpu);
656
0
657
0
    if ( cpumask_subset(cpu_siblings, idlers) )
658
0
        cpumask_or(mask, mask, cpu_siblings);
659
0
}
660
661
/*
662
 * Clear the bits of all the siblings of cpu from mask (if necessary).
663
 */
664
static inline
665
void smt_idle_mask_clear(unsigned int cpu, cpumask_t *mask)
666
0
{
667
0
    const cpumask_t *cpu_siblings = per_cpu(cpu_sibling_mask, cpu);
668
0
669
0
    if ( cpumask_subset(cpu_siblings, mask) )
670
0
        cpumask_andnot(mask, mask, per_cpu(cpu_sibling_mask, cpu));
671
0
}
672
673
/*
674
 * In csched2_cpu_pick(), it may not be possible to actually look at remote
675
 * runqueues (the trylock-s on their spinlocks can fail!). If that happens,
676
 * we pick, in order of decreasing preference:
677
 *  1) svc's current pcpu, if it is part of svc's soft affinity;
678
 *  2) a pcpu in svc's current runqueue that is also in svc's soft affinity;
679
 *  3) svc's current pcpu, if it is part of svc's hard affinity;
680
 *  4) a pcpu in svc's current runqueue that is also in svc's hard affinity;
681
 *  5) just one valid pcpu from svc's hard affinity
682
 *
683
 * Of course, 1, 2 and 3 makes sense only if svc has a soft affinity. Also
684
 * note that at least 5 is guaranteed to _always_ return at least one pcpu.
685
 */
686
static int get_fallback_cpu(struct csched2_vcpu *svc)
687
0
{
688
0
    struct vcpu *v = svc->vcpu;
689
0
    unsigned int bs;
690
0
691
0
    SCHED_STAT_CRANK(need_fallback_cpu);
692
0
693
0
    for_each_affinity_balance_step( bs )
694
0
    {
695
0
        int cpu = v->processor;
696
0
697
0
        if ( bs == BALANCE_SOFT_AFFINITY &&
698
0
             !has_soft_affinity(v, v->cpu_hard_affinity) )
699
0
            continue;
700
0
701
0
        affinity_balance_cpumask(v, bs, cpumask_scratch_cpu(cpu));
702
0
        cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
703
0
                    cpupool_domain_cpumask(v->domain));
704
0
705
0
        /*
706
0
         * This is cases 1 or 3 (depending on bs): if v->processor is (still)
707
0
         * in our affinity, go for it, for cache betterness.
708
0
         */
709
0
        if ( likely(cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) )
710
0
            return cpu;
711
0
712
0
        /*
713
0
         * This is cases 2 or 4 (depending on bs): v->processor isn't there
714
0
         * any longer, check if we at least can stay in our current runq.
715
0
         */
716
0
        if ( likely(cpumask_intersects(cpumask_scratch_cpu(cpu),
717
0
                                       &svc->rqd->active)) )
718
0
        {
719
0
            cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
720
0
                        &svc->rqd->active);
721
0
            return cpumask_first(cpumask_scratch_cpu(cpu));
722
0
        }
723
0
724
0
        /*
725
0
         * We may well pick any valid pcpu from our soft-affinity, outside
726
0
         * of our current runqueue, but we decide not to. In fact, changing
727
0
         * runqueue is slow, affects load distribution, and is a source of
728
0
         * overhead for the vcpus running on the other runqueue (we need the
729
0
         * lock). So, better do that as a consequence of a well informed
730
0
         * decision (or if we really don't have any other chance, as we will,
731
0
         * at step 5, if we get to there).
732
0
         *
733
0
         * Also, being here, looking for a fallback, is an unfortunate and
734
0
         * infrequent event, while the decision of putting us in the runqueue
735
0
         * wehere we are was (likely) made taking all the relevant factors
736
0
         * into account. So let's not disrupt that, just for the sake of
737
0
         * soft-affinity, and let's wait here to be able to made (hopefully,
738
0
         * soon), another similar well informed decision.
739
0
         */
740
0
        if ( bs == BALANCE_SOFT_AFFINITY )
741
0
            continue;
742
0
743
0
        /*
744
0
         * This is cases 5: last stand, just one valid pcpu from our hard
745
0
         * affinity. It's guaranteed that there is at least one valid cpu,
746
0
         * and therefore we are sure that we return it, and never really
747
0
         * exit the loop.
748
0
         */
749
0
        ASSERT(bs == BALANCE_HARD_AFFINITY &&
750
0
               !cpumask_empty(cpumask_scratch_cpu(cpu)));
751
0
        cpu = cpumask_first(cpumask_scratch_cpu(cpu));
752
0
        if ( likely(cpu < nr_cpu_ids) )
753
0
            return cpu;
754
0
    }
755
0
    ASSERT_UNREACHABLE();
756
0
    /*
757
0
     * We can't be here.  But if that somehow happen (in non-debug builds),
758
0
     * at least return something which both online and in our hard-affinity.
759
0
     */
760
0
    return cpumask_any(cpumask_scratch_cpu(v->processor));
761
0
}
762
763
/*
764
 * Time-to-credit, credit-to-time.
765
 *
766
 * We keep track of the "residual" time to make sure that frequent short
767
 * schedules still get accounted for in the end.
768
 *
769
 * FIXME: Do pre-calculated division?
770
 */
771
static void t2c_update(struct csched2_runqueue_data *rqd, s_time_t time,
772
                          struct csched2_vcpu *svc)
773
0
{
774
0
    uint64_t val = time * rqd->max_weight + svc->residual;
775
0
776
0
    svc->residual = do_div(val, svc->weight);
777
0
    svc->credit -= val;
778
0
}
779
780
static s_time_t c2t(struct csched2_runqueue_data *rqd, s_time_t credit, struct csched2_vcpu *svc)
781
0
{
782
0
    return credit * svc->weight / rqd->max_weight;
783
0
}
784
785
/*
786
 * Runqueue related code.
787
 */
788
789
static inline int vcpu_on_runq(struct csched2_vcpu *svc)
790
0
{
791
0
    return !list_empty(&svc->runq_elem);
792
0
}
793
794
static inline struct csched2_vcpu * runq_elem(struct list_head *elem)
795
0
{
796
0
    return list_entry(elem, struct csched2_vcpu, runq_elem);
797
0
}
798
799
static void activate_runqueue(struct csched2_private *prv, int rqi)
800
0
{
801
0
    struct csched2_runqueue_data *rqd;
802
0
803
0
    rqd = prv->rqd + rqi;
804
0
805
0
    BUG_ON(!cpumask_empty(&rqd->active));
806
0
807
0
    rqd->max_weight = 1;
808
0
    rqd->id = rqi;
809
0
    INIT_LIST_HEAD(&rqd->svc);
810
0
    INIT_LIST_HEAD(&rqd->runq);
811
0
    spin_lock_init(&rqd->lock);
812
0
813
0
    __cpumask_set_cpu(rqi, &prv->active_queues);
814
0
}
815
816
static void deactivate_runqueue(struct csched2_private *prv, int rqi)
817
0
{
818
0
    struct csched2_runqueue_data *rqd;
819
0
820
0
    rqd = prv->rqd + rqi;
821
0
822
0
    BUG_ON(!cpumask_empty(&rqd->active));
823
0
824
0
    rqd->id = -1;
825
0
826
0
    __cpumask_clear_cpu(rqi, &prv->active_queues);
827
0
}
828
829
static inline bool same_node(unsigned int cpua, unsigned int cpub)
830
0
{
831
0
    return cpu_to_node(cpua) == cpu_to_node(cpub);
832
0
}
833
834
static inline bool same_socket(unsigned int cpua, unsigned int cpub)
835
0
{
836
0
    return cpu_to_socket(cpua) == cpu_to_socket(cpub);
837
0
}
838
839
static inline bool same_core(unsigned int cpua, unsigned int cpub)
840
0
{
841
0
    return same_socket(cpua, cpub) &&
842
0
           cpu_to_core(cpua) == cpu_to_core(cpub);
843
0
}
844
845
static unsigned int
846
cpu_to_runqueue(struct csched2_private *prv, unsigned int cpu)
847
0
{
848
0
    struct csched2_runqueue_data *rqd;
849
0
    unsigned int rqi;
850
0
851
0
    for ( rqi = 0; rqi < nr_cpu_ids; rqi++ )
852
0
    {
853
0
        unsigned int peer_cpu;
854
0
855
0
        /*
856
0
         * As soon as we come across an uninitialized runqueue, use it.
857
0
         * In fact, either:
858
0
         *  - we are initializing the first cpu, and we assign it to
859
0
         *    runqueue 0. This is handy, especially if we are dealing
860
0
         *    with the boot cpu (if credit2 is the default scheduler),
861
0
         *    as we would not be able to use cpu_to_socket() and similar
862
0
         *    helpers anyway (they're result of which is not reliable yet);
863
0
         *  - we have gone through all the active runqueues, and have not
864
0
         *    found anyone whose cpus' topology matches the one we are
865
0
         *    dealing with, so activating a new runqueue is what we want.
866
0
         */
867
0
        if ( prv->rqd[rqi].id == -1 )
868
0
            break;
869
0
870
0
        rqd = prv->rqd + rqi;
871
0
        BUG_ON(cpumask_empty(&rqd->active));
872
0
873
0
        peer_cpu = cpumask_first(&rqd->active);
874
0
        BUG_ON(cpu_to_socket(cpu) == XEN_INVALID_SOCKET_ID ||
875
0
               cpu_to_socket(peer_cpu) == XEN_INVALID_SOCKET_ID);
876
0
877
0
        if (opt_runqueue == OPT_RUNQUEUE_CPU)
878
0
            continue;
879
0
        if ( opt_runqueue == OPT_RUNQUEUE_ALL ||
880
0
             (opt_runqueue == OPT_RUNQUEUE_CORE && same_core(peer_cpu, cpu)) ||
881
0
             (opt_runqueue == OPT_RUNQUEUE_SOCKET && same_socket(peer_cpu, cpu)) ||
882
0
             (opt_runqueue == OPT_RUNQUEUE_NODE && same_node(peer_cpu, cpu)) )
883
0
            break;
884
0
    }
885
0
886
0
    /* We really expect to be able to assign each cpu to a runqueue. */
887
0
    BUG_ON(rqi >= nr_cpu_ids);
888
0
889
0
    return rqi;
890
0
}
891
892
/* Find the domain with the highest weight. */
893
static void update_max_weight(struct csched2_runqueue_data *rqd, int new_weight,
894
                              int old_weight)
895
0
{
896
0
    /* Try to avoid brute-force search:
897
0
     * - If new_weight is larger, max_weigth <- new_weight
898
0
     * - If old_weight != max_weight, someone else is still max_weight
899
0
     *   (No action required)
900
0
     * - If old_weight == max_weight, brute-force search for max weight
901
0
     */
902
0
    if ( new_weight > rqd->max_weight )
903
0
    {
904
0
        rqd->max_weight = new_weight;
905
0
        SCHED_STAT_CRANK(upd_max_weight_quick);
906
0
    }
907
0
    else if ( old_weight == rqd->max_weight )
908
0
    {
909
0
        struct list_head *iter;
910
0
        int max_weight = 1;
911
0
912
0
        list_for_each( iter, &rqd->svc )
913
0
        {
914
0
            struct csched2_vcpu * svc = list_entry(iter, struct csched2_vcpu, rqd_elem);
915
0
916
0
            if ( svc->weight > max_weight )
917
0
                max_weight = svc->weight;
918
0
        }
919
0
920
0
        rqd->max_weight = max_weight;
921
0
        SCHED_STAT_CRANK(upd_max_weight_full);
922
0
    }
923
0
924
0
    if ( unlikely(tb_init_done) )
925
0
    {
926
0
        struct {
927
0
            unsigned rqi:16, max_weight:16;
928
0
        } d;
929
0
        d.rqi = rqd->id;
930
0
        d.max_weight = rqd->max_weight;
931
0
        __trace_var(TRC_CSCHED2_RUNQ_MAX_WEIGHT, 1,
932
0
                    sizeof(d),
933
0
                    (unsigned char *)&d);
934
0
    }
935
0
}
936
937
/* Add and remove from runqueue assignment (not active run queue) */
938
static void
939
_runq_assign(struct csched2_vcpu *svc, struct csched2_runqueue_data *rqd)
940
0
{
941
0
942
0
    svc->rqd = rqd;
943
0
    list_add_tail(&svc->rqd_elem, &svc->rqd->svc);
944
0
945
0
    update_max_weight(svc->rqd, svc->weight, 0);
946
0
947
0
    /* Expected new load based on adding this vcpu */
948
0
    rqd->b_avgload += svc->avgload;
949
0
950
0
    if ( unlikely(tb_init_done) )
951
0
    {
952
0
        struct {
953
0
            unsigned vcpu:16, dom:16;
954
0
            unsigned rqi:16;
955
0
        } d;
956
0
        d.dom = svc->vcpu->domain->domain_id;
957
0
        d.vcpu = svc->vcpu->vcpu_id;
958
0
        d.rqi=rqd->id;
959
0
        __trace_var(TRC_CSCHED2_RUNQ_ASSIGN, 1,
960
0
                    sizeof(d),
961
0
                    (unsigned char *)&d);
962
0
    }
963
0
964
0
}
965
966
static void
967
runq_assign(const struct scheduler *ops, struct vcpu *vc)
968
0
{
969
0
    struct csched2_vcpu *svc = vc->sched_priv;
970
0
971
0
    ASSERT(svc->rqd == NULL);
972
0
973
0
    _runq_assign(svc, c2rqd(ops, vc->processor));
974
0
}
975
976
static void
977
_runq_deassign(struct csched2_vcpu *svc)
978
0
{
979
0
    struct csched2_runqueue_data *rqd = svc->rqd;
980
0
981
0
    ASSERT(!vcpu_on_runq(svc));
982
0
    ASSERT(!(svc->flags & CSFLAG_scheduled));
983
0
984
0
    list_del_init(&svc->rqd_elem);
985
0
    update_max_weight(rqd, 0, svc->weight);
986
0
987
0
    /* Expected new load based on removing this vcpu */
988
0
    rqd->b_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0);
989
0
990
0
    svc->rqd = NULL;
991
0
}
992
993
static void
994
runq_deassign(const struct scheduler *ops, struct vcpu *vc)
995
0
{
996
0
    struct csched2_vcpu *svc = vc->sched_priv;
997
0
998
0
    ASSERT(svc->rqd == c2rqd(ops, vc->processor));
999
0
1000
0
    _runq_deassign(svc);
1001
0
}
1002
1003
/*
1004
 * Track the runq load by gathering instantaneous load samples, and using
1005
 * exponentially weighted moving average (EWMA) for the 'decaying'.
1006
 *
1007
 * We consider a window of length W=2^(prv->load_window_shift) nsecs
1008
 * (which takes LOADAVG_GRANULARITY_SHIFT into account).
1009
 *
1010
 * If load is the instantaneous load, the formula for EWMA looks as follows,
1011
 * for the i-eth sample:
1012
 *
1013
 *  avg[i] = a*load + (1 - a)*avg[i-1]
1014
 *
1015
 * where avg[i] is the new value of the average load, avg[i-1] is the value
1016
 * of the average load calculated so far, and a is a coefficient less or
1017
 * equal to 1.
1018
 *
1019
 * So, for us, it becomes:
1020
 *
1021
 *  avgload = a*load + (1 - a)*avgload
1022
 *
1023
 * For determining a, we consider _when_ we are doing the load update, wrt
1024
 * the length of the window. We define delta as follows:
1025
 *
1026
 *  delta = t - load_last_update
1027
 *
1028
 * where t is current time (i.e., time at which we are both sampling and
1029
 * updating the load average) and load_last_update is the last time we did
1030
 * that.
1031
 *
1032
 * There are two possible situations:
1033
 *
1034
 * a) delta <= W
1035
 *    this means that, during the last window of length W, the runeuque load
1036
 *    was avgload for (W - detla) time, and load for delta time:
1037
 *
1038
 *                |----------- W ---------|
1039
 *                |                       |
1040
 *                |     load_last_update  t
1041
 *     -------------------------|---------|---
1042
 *                |             |         |
1043
 *                \__W - delta__/\_delta__/
1044
 *                |             |         |
1045
 *                |___avgload___|__load___|
1046
 *
1047
 *    So, what about using delta/W as our smoothing coefficient a. If we do,
1048
 *    here's what happens:
1049
 *
1050
 *     a = delta / W
1051
 *     1 - a = 1 - (delta / W) = (W - delta) / W
1052
 *
1053
 *    Which matches the above description of what happened in the last
1054
 *    window of length W.
1055
 *
1056
 *    Note that this also means that the weight that we assign to both the
1057
 *    latest load sample, and to previous history, varies at each update.
1058
 *    The longer the latest load sample has been in efect, within the last
1059
 *    window, the higher it weights (and the lesser the previous history
1060
 *    weights).
1061
 *
1062
 *    This is some sort of extension of plain EWMA to fit even better to our
1063
 *    use case.
1064
 *
1065
 * b) delta > W
1066
 *    this means more than a full window has passed since the last update:
1067
 *
1068
 *                |----------- W ---------|
1069
 *                |                       |
1070
 *       load_last_update                 t
1071
 *     ----|------------------------------|---
1072
 *         |                              |
1073
 *         \_________________delta________/
1074
 *
1075
 *    Basically, it means the last load sample has been in effect for more
1076
 *    than W time, and hence we should just use it, and forget everything
1077
 *    before that.
1078
 *
1079
 *    This can be seen as a 'reset condition', occurring when, for whatever
1080
 *    reason, load has not been updated for longer than we expected. (It is
1081
 *    also how avgload is assigned its first value.)
1082
 *
1083
 * The formula for avgload then becomes:
1084
 *
1085
 *  avgload = (delta/W)*load + (W - delta)*avgload/W
1086
 *  avgload = delta*load/W + W*avgload/W - delta*avgload/W
1087
 *  avgload = avgload + delta*load/W - delta*avgload/W
1088
 *
1089
 * So, final form is:
1090
 *
1091
 *  avgload_0 = load
1092
 *  avgload = avgload + delta*load/W - delta*avgload/W,  0<=delta<=W
1093
 *
1094
 * As a confirmation, let's look at the extremes, when delta is 0 (i.e.,
1095
 * what happens if we  update the load twice, at the same time instant?):
1096
 *
1097
 *  avgload = avgload + 0*load/W - 0*avgload/W
1098
 *  avgload = avgload
1099
 *
1100
 * and when delta is W (i.e., what happens if we update at the last
1101
 * possible instant before the window 'expires'?):
1102
 *
1103
 *  avgload = avgload + W*load/W - W*avgload/W
1104
 *  avgload = avgload + load - avgload
1105
 *  avgload = load
1106
 *
1107
 * Which, in both cases, is what we expect.
1108
 */
1109
static void
1110
update_runq_load(const struct scheduler *ops,
1111
                 struct csched2_runqueue_data *rqd, int change, s_time_t now)
1112
0
{
1113
0
    struct csched2_private *prv = csched2_priv(ops);
1114
0
    s_time_t delta, load = rqd->load;
1115
0
    unsigned int P, W;
1116
0
1117
0
    W = prv->load_window_shift;
1118
0
    P = prv->load_precision_shift;
1119
0
    now >>= LOADAVG_GRANULARITY_SHIFT;
1120
0
1121
0
    /*
1122
0
     * To avoid using fractions, we shift to left by load_precision_shift,
1123
0
     * and use the least last load_precision_shift bits as fractional part.
1124
0
     * Looking back at the formula we want to use, we now have:
1125
0
     *
1126
0
     *  P = 2^(load_precision_shift)
1127
0
     *  P*avgload = P*(avgload + delta*load/W - delta*avgload/W)
1128
0
     *  P*avgload = P*avgload + delta*load*P/W - delta*P*avgload/W
1129
0
     *
1130
0
     * And if we are ok storing and using P*avgload, we can rewrite this as:
1131
0
     *
1132
0
     *  P*avgload = avgload'
1133
0
     *  avgload' = avgload' + delta*P*load/W - delta*avgload'/W
1134
0
     *
1135
0
     * Coupled with, of course:
1136
0
     *
1137
0
     *  avgload_0' = P*load
1138
0
     */
1139
0
1140
0
    if ( rqd->load_last_update + (1ULL << W)  < now )
1141
0
    {
1142
0
        rqd->avgload = load << P;
1143
0
        rqd->b_avgload = load << P;
1144
0
    }
1145
0
    else
1146
0
    {
1147
0
        delta = now - rqd->load_last_update;
1148
0
        if ( unlikely(delta < 0) )
1149
0
        {
1150
0
            d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" llu %"PRI_stime"\n",
1151
0
                     __func__, now, rqd->load_last_update);
1152
0
            delta = 0;
1153
0
        }
1154
0
1155
0
        /*
1156
0
         * Note that, if we were to enforce (or check) some relationship
1157
0
         * between P and W, we may save one shift. E.g., if we are sure
1158
0
         * that P < W, we could write:
1159
0
         *
1160
0
         *  (delta * (load << P)) >> W
1161
0
         *
1162
0
         * as:
1163
0
         *
1164
0
         *  (delta * load) >> (W - P)
1165
0
         */
1166
0
        rqd->avgload = rqd->avgload +
1167
0
                       ((delta * (load << P)) >> W) -
1168
0
                       ((delta * rqd->avgload) >> W);
1169
0
        rqd->b_avgload = rqd->b_avgload +
1170
0
                         ((delta * (load << P)) >> W) -
1171
0
                         ((delta * rqd->b_avgload) >> W);
1172
0
    }
1173
0
    rqd->load += change;
1174
0
    rqd->load_last_update = now;
1175
0
1176
0
    /* Overflow, capable of making the load look negative, must not occur. */
1177
0
    ASSERT(rqd->avgload >= 0 && rqd->b_avgload >= 0);
1178
0
1179
0
    if ( unlikely(tb_init_done) )
1180
0
    {
1181
0
        struct {
1182
0
            uint64_t rq_avgload, b_avgload;
1183
0
            unsigned rq_load:16, rq_id:8, shift:8;
1184
0
        } d;
1185
0
        d.rq_id = rqd->id;
1186
0
        d.rq_load = rqd->load;
1187
0
        d.rq_avgload = rqd->avgload;
1188
0
        d.b_avgload = rqd->b_avgload;
1189
0
        d.shift = P;
1190
0
        __trace_var(TRC_CSCHED2_UPDATE_RUNQ_LOAD, 1,
1191
0
                    sizeof(d),
1192
0
                    (unsigned char *)&d);
1193
0
    }
1194
0
}
1195
1196
static void
1197
update_svc_load(const struct scheduler *ops,
1198
                struct csched2_vcpu *svc, int change, s_time_t now)
1199
0
{
1200
0
    struct csched2_private *prv = csched2_priv(ops);
1201
0
    s_time_t delta, vcpu_load;
1202
0
    unsigned int P, W;
1203
0
1204
0
    if ( change == -1 )
1205
0
        vcpu_load = 1;
1206
0
    else if ( change == 1 )
1207
0
        vcpu_load = 0;
1208
0
    else
1209
0
        vcpu_load = vcpu_runnable(svc->vcpu);
1210
0
1211
0
    W = prv->load_window_shift;
1212
0
    P = prv->load_precision_shift;
1213
0
    now >>= LOADAVG_GRANULARITY_SHIFT;
1214
0
1215
0
    if ( svc->load_last_update + (1ULL << W) < now )
1216
0
    {
1217
0
        svc->avgload = vcpu_load << P;
1218
0
    }
1219
0
    else
1220
0
    {
1221
0
        delta = now - svc->load_last_update;
1222
0
        if ( unlikely(delta < 0) )
1223
0
        {
1224
0
            d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" llu %"PRI_stime"\n",
1225
0
                     __func__, now, svc->load_last_update);
1226
0
            delta = 0;
1227
0
        }
1228
0
1229
0
        svc->avgload = svc->avgload +
1230
0
                       ((delta * (vcpu_load << P)) >> W) -
1231
0
                       ((delta * svc->avgload) >> W);
1232
0
    }
1233
0
    svc->load_last_update = now;
1234
0
1235
0
    /* Overflow, capable of making the load look negative, must not occur. */
1236
0
    ASSERT(svc->avgload >= 0);
1237
0
1238
0
    if ( unlikely(tb_init_done) )
1239
0
    {
1240
0
        struct {
1241
0
            uint64_t v_avgload;
1242
0
            unsigned vcpu:16, dom:16;
1243
0
            unsigned shift;
1244
0
        } d;
1245
0
        d.dom = svc->vcpu->domain->domain_id;
1246
0
        d.vcpu = svc->vcpu->vcpu_id;
1247
0
        d.v_avgload = svc->avgload;
1248
0
        d.shift = P;
1249
0
        __trace_var(TRC_CSCHED2_UPDATE_VCPU_LOAD, 1,
1250
0
                    sizeof(d),
1251
0
                    (unsigned char *)&d);
1252
0
    }
1253
0
}
1254
1255
static void
1256
update_load(const struct scheduler *ops,
1257
            struct csched2_runqueue_data *rqd,
1258
            struct csched2_vcpu *svc, int change, s_time_t now)
1259
0
{
1260
0
    trace_var(TRC_CSCHED2_UPDATE_LOAD, 1, 0,  NULL);
1261
0
1262
0
    update_runq_load(ops, rqd, change, now);
1263
0
    if ( svc )
1264
0
        update_svc_load(ops, svc, change, now);
1265
0
}
1266
1267
static void
1268
runq_insert(const struct scheduler *ops, struct csched2_vcpu *svc)
1269
0
{
1270
0
    struct list_head *iter;
1271
0
    unsigned int cpu = svc->vcpu->processor;
1272
0
    struct list_head * runq = &c2rqd(ops, cpu)->runq;
1273
0
    int pos = 0;
1274
0
1275
0
    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
1276
0
1277
0
    ASSERT(!vcpu_on_runq(svc));
1278
0
    ASSERT(c2r(cpu) == c2r(svc->vcpu->processor));
1279
0
1280
0
    ASSERT(&svc->rqd->runq == runq);
1281
0
    ASSERT(!is_idle_vcpu(svc->vcpu));
1282
0
    ASSERT(!svc->vcpu->is_running);
1283
0
    ASSERT(!(svc->flags & CSFLAG_scheduled));
1284
0
1285
0
    list_for_each( iter, runq )
1286
0
    {
1287
0
        struct csched2_vcpu * iter_svc = runq_elem(iter);
1288
0
1289
0
        if ( svc->credit > iter_svc->credit )
1290
0
            break;
1291
0
1292
0
        pos++;
1293
0
    }
1294
0
    list_add_tail(&svc->runq_elem, iter);
1295
0
1296
0
    if ( unlikely(tb_init_done) )
1297
0
    {
1298
0
        struct {
1299
0
            unsigned vcpu:16, dom:16;
1300
0
            unsigned pos;
1301
0
        } d;
1302
0
        d.dom = svc->vcpu->domain->domain_id;
1303
0
        d.vcpu = svc->vcpu->vcpu_id;
1304
0
        d.pos = pos;
1305
0
        __trace_var(TRC_CSCHED2_RUNQ_POS, 1,
1306
0
                    sizeof(d),
1307
0
                    (unsigned char *)&d);
1308
0
    }
1309
0
}
1310
1311
static inline void runq_remove(struct csched2_vcpu *svc)
1312
0
{
1313
0
    ASSERT(vcpu_on_runq(svc));
1314
0
    list_del_init(&svc->runq_elem);
1315
0
}
1316
1317
void burn_credits(struct csched2_runqueue_data *rqd, struct csched2_vcpu *, s_time_t);
1318
1319
static inline void
1320
tickle_cpu(unsigned int cpu, struct csched2_runqueue_data *rqd)
1321
0
{
1322
0
    __cpumask_set_cpu(cpu, &rqd->tickled);
1323
0
    smt_idle_mask_clear(cpu, &rqd->smt_idle);
1324
0
    cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
1325
0
}
1326
1327
/*
1328
 * What we want to know is whether svc, which we assume to be running on some
1329
 * pcpu, can be interrupted and preempted (which, so far, basically means
1330
 * whether or not it already run for more than the ratelimit, to which we
1331
 * apply some tolerance).
1332
 */
1333
static inline bool is_preemptable(const struct csched2_vcpu *svc,
1334
                                    s_time_t now, s_time_t ratelimit)
1335
0
{
1336
0
    if ( ratelimit <= CSCHED2_RATELIMIT_TICKLE_TOLERANCE )
1337
0
        return true;
1338
0
1339
0
    ASSERT(svc->vcpu->is_running);
1340
0
    return now - svc->vcpu->runstate.state_entry_time >
1341
0
           ratelimit - CSCHED2_RATELIMIT_TICKLE_TOLERANCE;
1342
0
}
1343
1344
/*
1345
 * Score to preempt the target cpu.  Return a negative number if the
1346
 * credit isn't high enough; if it is, favor a preemption on cpu in
1347
 * this order:
1348
 * - cpu is in new's soft-affinity, not in cur's soft-affinity
1349
 *   (2 x CSCHED2_CREDIT_INIT score bonus);
1350
 * - cpu is in new's soft-affinity and cur's soft-affinity, or
1351
 *   cpu is not in new's soft-affinity, nor in cur's soft-affinity
1352
 *   (1x CSCHED2_CREDIT_INIT score bonus);
1353
 * - cpu is not in new's soft-affinity, while it is in cur's soft-affinity
1354
 *   (no bonus).
1355
 *
1356
 * Within the same class, the highest difference of credit.
1357
 */
1358
static s_time_t tickle_score(const struct scheduler *ops, s_time_t now,
1359
                             struct csched2_vcpu *new, unsigned int cpu)
1360
0
{
1361
0
    struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
1362
0
    struct csched2_vcpu * cur = csched2_vcpu(curr_on_cpu(cpu));
1363
0
    struct csched2_private *prv = csched2_priv(ops);
1364
0
    s_time_t score;
1365
0
1366
0
    /*
1367
0
     * We are dealing with cpus that are marked non-idle (i.e., that are not
1368
0
     * in rqd->idle). However, some of them may be running their idle vcpu,
1369
0
     * if taking care of tasklets. In that case, we want to leave it alone.
1370
0
     */
1371
0
    if ( unlikely(is_idle_vcpu(cur->vcpu) ||
1372
0
         !is_preemptable(cur, now, MICROSECS(prv->ratelimit_us))) )
1373
0
        return -1;
1374
0
1375
0
    burn_credits(rqd, cur, now);
1376
0
1377
0
    score = new->credit - cur->credit;
1378
0
    if ( new->vcpu->processor != cpu )
1379
0
        score -= CSCHED2_MIGRATE_RESIST;
1380
0
1381
0
    /*
1382
0
     * If score is positive, it means new has enough credits (i.e.,
1383
0
     * new->credit > cur->credit+CSCHED2_MIGRATE_RESIST).
1384
0
     *
1385
0
     * Let's compute the bonuses for soft-affinities.
1386
0
     */
1387
0
    if ( score > 0 )
1388
0
    {
1389
0
        if ( cpumask_test_cpu(cpu, new->vcpu->cpu_soft_affinity) )
1390
0
            score += CSCHED2_CREDIT_INIT;
1391
0
1392
0
        if ( !cpumask_test_cpu(cpu, cur->vcpu->cpu_soft_affinity) )
1393
0
            score += CSCHED2_CREDIT_INIT;
1394
0
    }
1395
0
1396
0
    if ( unlikely(tb_init_done) )
1397
0
    {
1398
0
        struct {
1399
0
            unsigned vcpu:16, dom:16;
1400
0
            int credit, score;
1401
0
        } d;
1402
0
        d.dom = cur->vcpu->domain->domain_id;
1403
0
        d.vcpu = cur->vcpu->vcpu_id;
1404
0
        d.credit = cur->credit;
1405
0
        d.score = score;
1406
0
        __trace_var(TRC_CSCHED2_TICKLE_CHECK, 1,
1407
0
                    sizeof(d),
1408
0
                    (unsigned char *)&d);
1409
0
    }
1410
0
1411
0
    return score;
1412
0
}
1413
1414
/*
1415
 * Check what processor it is best to 'wake', for picking up a vcpu that has
1416
 * just been put (back) in the runqueue. Logic is as follows:
1417
 *  1. if there are idle processors in the runq, wake one of them;
1418
 *  2. if there aren't idle processor, check the one were the vcpu was
1419
 *     running before to see if we can preempt what's running there now
1420
 *     (and hence doing just one migration);
1421
 *  3. last stand: check all processors and see if the vcpu is in right
1422
 *     of preempting any of the other vcpus running on them (this requires
1423
 *     two migrations, and that's indeed why it is left as the last stand).
1424
 *
1425
 * Note that when we say 'idle processors' what we really mean is (pretty
1426
 * much always) both _idle_ and _not_already_tickled_. In fact, if a
1427
 * processor has been tickled, it will run csched2_schedule() shortly, and
1428
 * pick up some work, so it would be wrong to consider it idle.
1429
 */
1430
static void
1431
runq_tickle(const struct scheduler *ops, struct csched2_vcpu *new, s_time_t now)
1432
0
{
1433
0
    int i, ipid = -1;
1434
0
    s_time_t max = 0;
1435
0
    unsigned int bs, cpu = new->vcpu->processor;
1436
0
    struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
1437
0
    cpumask_t *online = cpupool_domain_cpumask(new->vcpu->domain);
1438
0
    cpumask_t mask;
1439
0
1440
0
    ASSERT(new->rqd == rqd);
1441
0
1442
0
    if ( unlikely(tb_init_done) )
1443
0
    {
1444
0
        struct {
1445
0
            unsigned vcpu:16, dom:16;
1446
0
            unsigned processor;
1447
0
            int credit;
1448
0
        } d;
1449
0
        d.dom = new->vcpu->domain->domain_id;
1450
0
        d.vcpu = new->vcpu->vcpu_id;
1451
0
        d.processor = new->vcpu->processor;
1452
0
        d.credit = new->credit;
1453
0
        __trace_var(TRC_CSCHED2_TICKLE_NEW, 1,
1454
0
                    sizeof(d),
1455
0
                    (unsigned char *)&d);
1456
0
    }
1457
0
1458
0
    for_each_affinity_balance_step( bs )
1459
0
    {
1460
0
        /* Just skip first step, if we don't have a soft affinity */
1461
0
        if ( bs == BALANCE_SOFT_AFFINITY &&
1462
0
             !has_soft_affinity(new->vcpu, new->vcpu->cpu_hard_affinity) )
1463
0
            continue;
1464
0
1465
0
        affinity_balance_cpumask(new->vcpu, bs, cpumask_scratch_cpu(cpu));
1466
0
1467
0
        /*
1468
0
         * First of all, consider idle cpus, checking if we can just
1469
0
         * re-use the pcpu where we were running before.
1470
0
         *
1471
0
         * If there are cores where all the siblings are idle, consider
1472
0
         * them first, honoring whatever the spreading-vs-consolidation
1473
0
         * SMT policy wants us to do.
1474
0
         */
1475
0
        if ( unlikely(sched_smt_power_savings) )
1476
0
        {
1477
0
            cpumask_andnot(&mask, &rqd->idle, &rqd->smt_idle);
1478
0
            cpumask_and(&mask, &mask, online);
1479
0
        }
1480
0
        else
1481
0
            cpumask_and(&mask, &rqd->smt_idle, online);
1482
0
        cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu));
1483
0
        i = cpumask_test_or_cycle(cpu, &mask);
1484
0
        if ( i < nr_cpu_ids )
1485
0
        {
1486
0
            SCHED_STAT_CRANK(tickled_idle_cpu);
1487
0
            ipid = i;
1488
0
            goto tickle;
1489
0
        }
1490
0
1491
0
        /*
1492
0
         * If there are no fully idle cores, check all idlers, after
1493
0
         * having filtered out pcpus that have been tickled but haven't
1494
0
         * gone through the scheduler yet.
1495
0
         */
1496
0
        cpumask_andnot(&mask, &rqd->idle, &rqd->tickled);
1497
0
        cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), online);
1498
0
        cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu));
1499
0
        i = cpumask_test_or_cycle(cpu, &mask);
1500
0
        if ( i < nr_cpu_ids )
1501
0
        {
1502
0
            SCHED_STAT_CRANK(tickled_idle_cpu);
1503
0
            ipid = i;
1504
0
            goto tickle;
1505
0
        }
1506
0
    }
1507
0
1508
0
    /*
1509
0
     * Note that, if we are here, it means we have done the hard-affinity
1510
0
     * balancing step of the loop, and hence what we have in cpumask_scratch
1511
0
     * is what we put there for last, i.e., new's vcpu_hard_affinity & online
1512
0
     * which is exactly what we need for the next part of the function.
1513
0
     */
1514
0
1515
0
    /*
1516
0
     * Otherwise, look for the non-idle (and non-tickled) processors with
1517
0
     * the lowest credit, among the ones new is allowed to run on. Again,
1518
0
     * the cpu were it was running on would be the best candidate.
1519
0
     *
1520
0
     * For deciding which cpu to tickle, we use tickle_score(), which will
1521
0
     * factor in both new's soft-affinity, and the soft-affinity of the
1522
0
     * vcpu running on each cpu that we consider.
1523
0
     */
1524
0
    cpumask_andnot(&mask, &rqd->active, &rqd->idle);
1525
0
    cpumask_andnot(&mask, &mask, &rqd->tickled);
1526
0
    cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu));
1527
0
    if ( __cpumask_test_and_clear_cpu(cpu, &mask) )
1528
0
    {
1529
0
        s_time_t score = tickle_score(ops, now, new, cpu);
1530
0
1531
0
        if ( score > max )
1532
0
        {
1533
0
            max = score;
1534
0
            ipid = cpu;
1535
0
1536
0
            /* If this is in new's soft affinity, just take it */
1537
0
            if ( cpumask_test_cpu(cpu, new->vcpu->cpu_soft_affinity) )
1538
0
            {
1539
0
                SCHED_STAT_CRANK(tickled_busy_cpu);
1540
0
                goto tickle;
1541
0
            }
1542
0
        }
1543
0
    }
1544
0
1545
0
    for_each_cpu(i, &mask)
1546
0
    {
1547
0
        s_time_t score;
1548
0
1549
0
        /* Already looked at this one above */
1550
0
        ASSERT(i != cpu);
1551
0
1552
0
        score = tickle_score(ops, now, new, i);
1553
0
1554
0
        if ( score > max )
1555
0
        {
1556
0
            max = score;
1557
0
            ipid = i;
1558
0
        }
1559
0
    }
1560
0
1561
0
    if ( ipid == -1 )
1562
0
    {
1563
0
        SCHED_STAT_CRANK(tickled_no_cpu);
1564
0
        return;
1565
0
    }
1566
0
1567
0
    ASSERT(!is_idle_vcpu(curr_on_cpu(ipid)));
1568
0
    SCHED_STAT_CRANK(tickled_busy_cpu);
1569
0
 tickle:
1570
0
    BUG_ON(ipid == -1);
1571
0
1572
0
    if ( unlikely(tb_init_done) )
1573
0
    {
1574
0
        struct {
1575
0
            unsigned cpu:16, pad:16;
1576
0
        } d;
1577
0
        d.cpu = ipid; d.pad = 0;
1578
0
        __trace_var(TRC_CSCHED2_TICKLE, 1,
1579
0
                    sizeof(d),
1580
0
                    (unsigned char *)&d);
1581
0
    }
1582
0
1583
0
    tickle_cpu(ipid, rqd);
1584
0
1585
0
    if ( unlikely(new->tickled_cpu != -1) )
1586
0
        SCHED_STAT_CRANK(tickled_cpu_overwritten);
1587
0
    new->tickled_cpu = ipid;
1588
0
}
1589
1590
/*
1591
 * Credit-related code
1592
 */
1593
static void reset_credit(const struct scheduler *ops, int cpu, s_time_t now,
1594
                         struct csched2_vcpu *snext)
1595
0
{
1596
0
    struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
1597
0
    struct list_head *iter;
1598
0
    int m;
1599
0
1600
0
    /*
1601
0
     * Under normal circumstances, snext->credit should never be less
1602
0
     * than -CSCHED2_MIN_TIMER.  However, under some circumstances, a
1603
0
     * vcpu with low credits may be allowed to run long enough that
1604
0
     * its credits are actually less than -CSCHED2_CREDIT_INIT.
1605
0
     * (Instances have been observed, for example, where a vcpu with
1606
0
     * 200us of credit was allowed to run for 11ms, giving it -10.8ms
1607
0
     * of credit.  Thus it was still negative even after the reset.)
1608
0
     *
1609
0
     * If this is the case for snext, we simply want to keep moving
1610
0
     * everyone up until it is in the black again.  This fair because
1611
0
     * none of the other vcpus want to run at the moment.
1612
0
     *
1613
0
     * Rather than looping, however, we just calculate a multiplier,
1614
0
     * avoiding an integer division and multiplication in the common
1615
0
     * case.
1616
0
     */
1617
0
    m = 1;
1618
0
    if ( snext->credit < -CSCHED2_CREDIT_INIT )
1619
0
        m += (-snext->credit) / CSCHED2_CREDIT_INIT;
1620
0
1621
0
    list_for_each( iter, &rqd->svc )
1622
0
    {
1623
0
        unsigned int svc_cpu;
1624
0
        struct csched2_vcpu * svc;
1625
0
        int start_credit;
1626
0
1627
0
        svc = list_entry(iter, struct csched2_vcpu, rqd_elem);
1628
0
        svc_cpu = svc->vcpu->processor;
1629
0
1630
0
        ASSERT(!is_idle_vcpu(svc->vcpu));
1631
0
        ASSERT(svc->rqd == rqd);
1632
0
1633
0
        /*
1634
0
         * If svc is running, it is our responsibility to make sure, here,
1635
0
         * that the credit it has spent so far get accounted.
1636
0
         */
1637
0
        if ( svc->vcpu == curr_on_cpu(svc_cpu) )
1638
0
        {
1639
0
            burn_credits(rqd, svc, now);
1640
0
            /*
1641
0
             * And, similarly, in case it has run out of budget, as a
1642
0
             * consequence of this round of accounting, we also must inform
1643
0
             * its pCPU that it's time to park it, and pick up someone else.
1644
0
             */
1645
0
            if ( unlikely(svc->budget <= 0) )
1646
0
                tickle_cpu(svc_cpu, rqd);
1647
0
        }
1648
0
1649
0
        start_credit = svc->credit;
1650
0
1651
0
        /*
1652
0
         * Add INIT * m, avoiding integer multiplication in the common case.
1653
0
         */
1654
0
        if ( likely(m==1) )
1655
0
            svc->credit += CSCHED2_CREDIT_INIT;
1656
0
        else
1657
0
            svc->credit += m * CSCHED2_CREDIT_INIT;
1658
0
1659
0
        /* "Clip" credits to max carryover */
1660
0
        if ( svc->credit > CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX )
1661
0
            svc->credit = CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX;
1662
0
1663
0
        svc->start_time = now;
1664
0
1665
0
        if ( unlikely(tb_init_done) )
1666
0
        {
1667
0
            struct {
1668
0
                unsigned vcpu:16, dom:16;
1669
0
                int credit_start, credit_end;
1670
0
                unsigned multiplier;
1671
0
            } d;
1672
0
            d.dom = svc->vcpu->domain->domain_id;
1673
0
            d.vcpu = svc->vcpu->vcpu_id;
1674
0
            d.credit_start = start_credit;
1675
0
            d.credit_end = svc->credit;
1676
0
            d.multiplier = m;
1677
0
            __trace_var(TRC_CSCHED2_CREDIT_RESET, 1,
1678
0
                        sizeof(d),
1679
0
                        (unsigned char *)&d);
1680
0
        }
1681
0
    }
1682
0
1683
0
    SCHED_STAT_CRANK(credit_reset);
1684
0
1685
0
    /* No need to resort runqueue, as everyone's order should be the same. */
1686
0
}
1687
1688
void burn_credits(struct csched2_runqueue_data *rqd,
1689
                  struct csched2_vcpu *svc, s_time_t now)
1690
0
{
1691
0
    s_time_t delta;
1692
0
1693
0
    ASSERT(svc == csched2_vcpu(curr_on_cpu(svc->vcpu->processor)));
1694
0
1695
0
    if ( unlikely(is_idle_vcpu(svc->vcpu)) )
1696
0
    {
1697
0
        ASSERT(svc->credit == CSCHED2_IDLE_CREDIT);
1698
0
        return;
1699
0
    }
1700
0
1701
0
    delta = now - svc->start_time;
1702
0
1703
0
    if ( unlikely(delta <= 0) )
1704
0
    {
1705
0
        if ( unlikely(delta < 0) )
1706
0
            d2printk("WARNING: %s: Time went backwards? now %"PRI_stime
1707
0
                     " start_time %"PRI_stime"\n", __func__, now,
1708
0
                     svc->start_time);
1709
0
        goto out;
1710
0
    }
1711
0
1712
0
    SCHED_STAT_CRANK(burn_credits_t2c);
1713
0
    t2c_update(rqd, delta, svc);
1714
0
1715
0
    if ( has_cap(svc) )
1716
0
        svc->budget -= delta;
1717
0
1718
0
    svc->start_time = now;
1719
0
1720
0
 out:
1721
0
    if ( unlikely(tb_init_done) )
1722
0
    {
1723
0
        struct {
1724
0
            unsigned vcpu:16, dom:16;
1725
0
            int credit, budget;
1726
0
            int delta;
1727
0
        } d;
1728
0
        d.dom = svc->vcpu->domain->domain_id;
1729
0
        d.vcpu = svc->vcpu->vcpu_id;
1730
0
        d.credit = svc->credit;
1731
0
        d.budget = has_cap(svc) ?  svc->budget : INT_MIN;
1732
0
        d.delta = delta;
1733
0
        __trace_var(TRC_CSCHED2_CREDIT_BURN, 1,
1734
0
                    sizeof(d),
1735
0
                    (unsigned char *)&d);
1736
0
    }
1737
0
}
1738
1739
/*
1740
 * Budget-related code.
1741
 */
1742
1743
static void park_vcpu(struct csched2_vcpu *svc)
1744
0
{
1745
0
    struct vcpu *v = svc->vcpu;
1746
0
1747
0
    ASSERT(spin_is_locked(&svc->sdom->budget_lock));
1748
0
1749
0
    /*
1750
0
     * It was impossible to find budget for this vCPU, so it has to be
1751
0
     * "parked". This implies it is not runnable, so we mark it as such in
1752
0
     * its pause_flags. If the vCPU is currently scheduled (which means we
1753
0
     * are here after being called from within csched_schedule()), flagging
1754
0
     * is enough, as we'll choose someone else, and then context_saved()
1755
0
     * will take care of updating the load properly.
1756
0
     *
1757
0
     * If, OTOH, the vCPU is sitting in the runqueue (which means we are here
1758
0
     * after being called from within runq_candidate()), we must go all the
1759
0
     * way down to taking it out of there, and updating the load accordingly.
1760
0
     *
1761
0
     * In both cases, we also add it to the list of parked vCPUs of the domain.
1762
0
     */
1763
0
    __set_bit(_VPF_parked, &v->pause_flags);
1764
0
    if ( vcpu_on_runq(svc) )
1765
0
    {
1766
0
        runq_remove(svc);
1767
0
        update_load(svc->sdom->dom->cpupool->sched, svc->rqd, svc, -1, NOW());
1768
0
    }
1769
0
    list_add(&svc->parked_elem, &svc->sdom->parked_vcpus);
1770
0
}
1771
1772
static bool vcpu_grab_budget(struct csched2_vcpu *svc)
1773
0
{
1774
0
    struct csched2_dom *sdom = svc->sdom;
1775
0
    unsigned int cpu = svc->vcpu->processor;
1776
0
1777
0
    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
1778
0
1779
0
    if ( svc->budget > 0 )
1780
0
        return true;
1781
0
1782
0
    /* budget_lock nests inside runqueue lock. */
1783
0
    spin_lock(&sdom->budget_lock);
1784
0
1785
0
    /*
1786
0
     * Here, svc->budget is <= 0 (as, if it was > 0, we'd have taken the if
1787
0
     * above!). That basically means the vCPU has overrun a bit --because of
1788
0
     * various reasons-- and we want to take that into account. With the +=,
1789
0
     * we are actually subtracting the amount of budget the vCPU has
1790
0
     * overconsumed, from the total domain budget.
1791
0
     */
1792
0
    sdom->budget += svc->budget;
1793
0
1794
0
    if ( sdom->budget > 0 )
1795
0
    {
1796
0
        s_time_t budget;
1797
0
1798
0
        /* Get our quota, if there's at least as much budget */
1799
0
        if ( likely(sdom->budget >= svc->budget_quota) )
1800
0
            budget = svc->budget_quota;
1801
0
        else
1802
0
            budget = sdom->budget;
1803
0
1804
0
        svc->budget = budget;
1805
0
        sdom->budget -= budget;
1806
0
    }
1807
0
    else
1808
0
    {
1809
0
        svc->budget = 0;
1810
0
        park_vcpu(svc);
1811
0
    }
1812
0
1813
0
    spin_unlock(&sdom->budget_lock);
1814
0
1815
0
    return svc->budget > 0;
1816
0
}
1817
1818
static void
1819
vcpu_return_budget(struct csched2_vcpu *svc, struct list_head *parked)
1820
0
{
1821
0
    struct csched2_dom *sdom = svc->sdom;
1822
0
    unsigned int cpu = svc->vcpu->processor;
1823
0
1824
0
    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
1825
0
    ASSERT(list_empty(parked));
1826
0
1827
0
    /* budget_lock nests inside runqueue lock. */
1828
0
    spin_lock(&sdom->budget_lock);
1829
0
1830
0
    /*
1831
0
     * The vCPU is stopping running (e.g., because it's blocking, or it has
1832
0
     * been preempted). If it hasn't consumed all the budget it got when,
1833
0
     * starting to run, put that remaining amount back in the domain's budget
1834
0
     * pool.
1835
0
     */
1836
0
    sdom->budget += svc->budget;
1837
0
    svc->budget = 0;
1838
0
1839
0
    /*
1840
0
     * Making budget available again to the domain means that parked vCPUs
1841
0
     * may be unparked and run. They are, if any, in the domain's parked_vcpus
1842
0
     * list, so we want to go through that and unpark them (so they can try
1843
0
     * to get some budget).
1844
0
     *
1845
0
     * Touching the list requires the budget_lock, which we hold. Let's
1846
0
     * therefore put everyone in that list in another, temporary list, which
1847
0
     * then the caller will traverse, unparking the vCPUs it finds there.
1848
0
     *
1849
0
     * In fact, we can't do the actual unparking here, because that requires
1850
0
     * taking the runqueue lock of the vCPUs being unparked, and we can't
1851
0
     * take any runqueue locks while we hold a budget_lock.
1852
0
     */
1853
0
    if ( sdom->budget > 0 )
1854
0
        list_splice_init(&sdom->parked_vcpus, parked);
1855
0
1856
0
    spin_unlock(&sdom->budget_lock);
1857
0
}
1858
1859
static void
1860
unpark_parked_vcpus(const struct scheduler *ops, struct list_head *vcpus)
1861
0
{
1862
0
    struct csched2_vcpu *svc, *tmp;
1863
0
    spinlock_t *lock;
1864
0
1865
0
    list_for_each_entry_safe(svc, tmp, vcpus, parked_elem)
1866
0
    {
1867
0
        unsigned long flags;
1868
0
        s_time_t now;
1869
0
1870
0
        lock = vcpu_schedule_lock_irqsave(svc->vcpu, &flags);
1871
0
1872
0
        __clear_bit(_VPF_parked, &svc->vcpu->pause_flags);
1873
0
        if ( unlikely(svc->flags & CSFLAG_scheduled) )
1874
0
        {
1875
0
            /*
1876
0
             * We end here if a budget replenishment arrived between
1877
0
             * csched2_schedule() (and, in particular, after a call to
1878
0
             * vcpu_grab_budget() that returned false), and
1879
0
             * context_saved(). By setting __CSFLAG_delayed_runq_add,
1880
0
             * we tell context_saved() to put the vCPU back in the
1881
0
             * runqueue, from where it will compete with the others
1882
0
             * for the newly replenished budget.
1883
0
             */
1884
0
            ASSERT( svc->rqd != NULL );
1885
0
            ASSERT( c2rqd(ops, svc->vcpu->processor) == svc->rqd );
1886
0
            __set_bit(__CSFLAG_delayed_runq_add, &svc->flags);
1887
0
        }
1888
0
        else if ( vcpu_runnable(svc->vcpu) )
1889
0
        {
1890
0
            /*
1891
0
             * The vCPU should go back to the runqueue, and compete for
1892
0
             * the newly replenished budget, but only if it is actually
1893
0
             * runnable (and was therefore offline only because of the
1894
0
             * lack of budget).
1895
0
             */
1896
0
            now = NOW();
1897
0
            update_load(ops, svc->rqd, svc, 1, now);
1898
0
            runq_insert(ops, svc);
1899
0
            runq_tickle(ops, svc, now);
1900
0
        }
1901
0
        list_del_init(&svc->parked_elem);
1902
0
1903
0
        vcpu_schedule_unlock_irqrestore(lock, flags, svc->vcpu);
1904
0
    }
1905
0
}
1906
1907
static inline void do_replenish(struct csched2_dom *sdom)
1908
0
{
1909
0
    sdom->next_repl += CSCHED2_BDGT_REPL_PERIOD;
1910
0
    sdom->budget += sdom->tot_budget;
1911
0
}
1912
1913
static void replenish_domain_budget(void* data)
1914
0
{
1915
0
    struct csched2_dom *sdom = data;
1916
0
    unsigned long flags;
1917
0
    s_time_t now;
1918
0
    LIST_HEAD(parked);
1919
0
1920
0
    spin_lock_irqsave(&sdom->budget_lock, flags);
1921
0
1922
0
    now = NOW();
1923
0
1924
0
    /*
1925
0
     * Let's do the replenishment. Note, though, that a domain may overrun,
1926
0
     * which means the budget would have gone below 0 (reasons may be system
1927
0
     * overbooking, accounting issues, etc.). It also may happen that we are
1928
0
     * handling the replenishment (much) later than we should (reasons may
1929
0
     * again be overbooking, or issues with timers).
1930
0
     *
1931
0
     * Even in cases of overrun or delay, however, we expect that in 99% of
1932
0
     * cases, doing just one replenishment will be good enough for being able
1933
0
     * to unpark the vCPUs that are waiting for some budget.
1934
0
     */
1935
0
    do_replenish(sdom);
1936
0
1937
0
    /*
1938
0
     * And now, the special cases:
1939
0
     * 1) if we are late enough to have skipped (at least) one full period,
1940
0
     * what we must do is doing more replenishments. Note that, however,
1941
0
     * every time we add tot_budget to the budget, we also move next_repl
1942
0
     * away by CSCHED2_BDGT_REPL_PERIOD, to make sure the cap is always
1943
0
     * respected.
1944
0
     */
1945
0
    if ( unlikely(sdom->next_repl <= now) )
1946
0
    {
1947
0
        do
1948
0
            do_replenish(sdom);
1949
0
        while ( sdom->next_repl <= now );
1950
0
    }
1951
0
    /*
1952
0
     * 2) if we overrun by more than tot_budget, then budget+tot_budget is
1953
0
     * still < 0, which means that we can't unpark the vCPUs. Let's bail,
1954
0
     * and wait for future replenishments.
1955
0
     */
1956
0
    if ( unlikely(sdom->budget <= 0) )
1957
0
    {
1958
0
        spin_unlock_irqrestore(&sdom->budget_lock, flags);
1959
0
        goto out;
1960
0
    }
1961
0
1962
0
    /* Since we do more replenishments, make sure we didn't overshot. */
1963
0
    sdom->budget = min(sdom->budget, sdom->tot_budget);
1964
0
1965
0
    /*
1966
0
     * As above, let's prepare the temporary list, out of the domain's
1967
0
     * parked_vcpus list, now that we hold the budget_lock. Then, drop such
1968
0
     * lock, and pass the list to the unparking function.
1969
0
     */
1970
0
    list_splice_init(&sdom->parked_vcpus, &parked);
1971
0
1972
0
    spin_unlock_irqrestore(&sdom->budget_lock, flags);
1973
0
1974
0
    unpark_parked_vcpus(sdom->dom->cpupool->sched, &parked);
1975
0
1976
0
 out:
1977
0
    set_timer(sdom->repl_timer, sdom->next_repl);
1978
0
}
1979
1980
#ifndef NDEBUG
1981
static inline void
1982
csched2_vcpu_check(struct vcpu *vc)
1983
0
{
1984
0
    struct csched2_vcpu * const svc = csched2_vcpu(vc);
1985
0
    struct csched2_dom * const sdom = svc->sdom;
1986
0
1987
0
    BUG_ON( svc->vcpu != vc );
1988
0
    BUG_ON( sdom != csched2_dom(vc->domain) );
1989
0
    if ( sdom )
1990
0
    {
1991
0
        BUG_ON( is_idle_vcpu(vc) );
1992
0
        BUG_ON( sdom->dom != vc->domain );
1993
0
    }
1994
0
    else
1995
0
    {
1996
0
        BUG_ON( !is_idle_vcpu(vc) );
1997
0
    }
1998
0
    SCHED_STAT_CRANK(vcpu_check);
1999
0
}
2000
0
#define CSCHED2_VCPU_CHECK(_vc)  (csched2_vcpu_check(_vc))
2001
#else
2002
#define CSCHED2_VCPU_CHECK(_vc)
2003
#endif
2004
2005
static void *
2006
csched2_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
2007
0
{
2008
0
    struct csched2_vcpu *svc;
2009
0
2010
0
    /* Allocate per-VCPU info */
2011
0
    svc = xzalloc(struct csched2_vcpu);
2012
0
    if ( svc == NULL )
2013
0
        return NULL;
2014
0
2015
0
    INIT_LIST_HEAD(&svc->rqd_elem);
2016
0
    INIT_LIST_HEAD(&svc->runq_elem);
2017
0
2018
0
    svc->sdom = dd;
2019
0
    svc->vcpu = vc;
2020
0
    svc->flags = 0U;
2021
0
2022
0
    if ( ! is_idle_vcpu(vc) )
2023
0
    {
2024
0
        ASSERT(svc->sdom != NULL);
2025
0
        svc->credit = CSCHED2_CREDIT_INIT;
2026
0
        svc->weight = svc->sdom->weight;
2027
0
        /* Starting load of 50% */
2028
0
        svc->avgload = 1ULL << (csched2_priv(ops)->load_precision_shift - 1);
2029
0
        svc->load_last_update = NOW() >> LOADAVG_GRANULARITY_SHIFT;
2030
0
    }
2031
0
    else
2032
0
    {
2033
0
        ASSERT(svc->sdom == NULL);
2034
0
        svc->credit = CSCHED2_IDLE_CREDIT;
2035
0
        svc->weight = 0;
2036
0
    }
2037
0
    svc->tickled_cpu = -1;
2038
0
2039
0
    svc->budget = STIME_MAX;
2040
0
    svc->budget_quota = 0;
2041
0
    INIT_LIST_HEAD(&svc->parked_elem);
2042
0
2043
0
    SCHED_STAT_CRANK(vcpu_alloc);
2044
0
2045
0
    return svc;
2046
0
}
2047
2048
static void
2049
csched2_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
2050
0
{
2051
0
    struct csched2_vcpu * const svc = csched2_vcpu(vc);
2052
0
2053
0
    ASSERT(!is_idle_vcpu(vc));
2054
0
    SCHED_STAT_CRANK(vcpu_sleep);
2055
0
2056
0
    if ( curr_on_cpu(vc->processor) == vc )
2057
0
    {
2058
0
        tickle_cpu(vc->processor, svc->rqd);
2059
0
    }
2060
0
    else if ( vcpu_on_runq(svc) )
2061
0
    {
2062
0
        ASSERT(svc->rqd == c2rqd(ops, vc->processor));
2063
0
        update_load(ops, svc->rqd, svc, -1, NOW());
2064
0
        runq_remove(svc);
2065
0
    }
2066
0
    else if ( svc->flags & CSFLAG_delayed_runq_add )
2067
0
        __clear_bit(__CSFLAG_delayed_runq_add, &svc->flags);
2068
0
}
2069
2070
static void
2071
csched2_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
2072
0
{
2073
0
    struct csched2_vcpu * const svc = csched2_vcpu(vc);
2074
0
    unsigned int cpu = vc->processor;
2075
0
    s_time_t now;
2076
0
2077
0
    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
2078
0
2079
0
    ASSERT(!is_idle_vcpu(vc));
2080
0
2081
0
    if ( unlikely(curr_on_cpu(cpu) == vc) )
2082
0
    {
2083
0
        SCHED_STAT_CRANK(vcpu_wake_running);
2084
0
        goto out;
2085
0
    }
2086
0
2087
0
    if ( unlikely(vcpu_on_runq(svc)) )
2088
0
    {
2089
0
        SCHED_STAT_CRANK(vcpu_wake_onrunq);
2090
0
        goto out;
2091
0
    }
2092
0
2093
0
    if ( likely(vcpu_runnable(vc)) )
2094
0
        SCHED_STAT_CRANK(vcpu_wake_runnable);
2095
0
    else
2096
0
        SCHED_STAT_CRANK(vcpu_wake_not_runnable);
2097
0
2098
0
    /* If the context hasn't been saved for this vcpu yet, we can't put it on
2099
0
     * another runqueue.  Instead, we set a flag so that it will be put on the runqueue
2100
0
     * after the context has been saved. */
2101
0
    if ( unlikely(svc->flags & CSFLAG_scheduled) )
2102
0
    {
2103
0
        __set_bit(__CSFLAG_delayed_runq_add, &svc->flags);
2104
0
        goto out;
2105
0
    }
2106
0
2107
0
    /* Add into the new runqueue if necessary */
2108
0
    if ( svc->rqd == NULL )
2109
0
        runq_assign(ops, vc);
2110
0
    else
2111
0
        ASSERT(c2rqd(ops, vc->processor) == svc->rqd );
2112
0
2113
0
    now = NOW();
2114
0
2115
0
    update_load(ops, svc->rqd, svc, 1, now);
2116
0
        
2117
0
    /* Put the VCPU on the runq */
2118
0
    runq_insert(ops, svc);
2119
0
    runq_tickle(ops, svc, now);
2120
0
2121
0
out:
2122
0
    return;
2123
0
}
2124
2125
static void
2126
csched2_vcpu_yield(const struct scheduler *ops, struct vcpu *v)
2127
0
{
2128
0
    struct csched2_vcpu * const svc = csched2_vcpu(v);
2129
0
2130
0
    __set_bit(__CSFLAG_vcpu_yield, &svc->flags);
2131
0
}
2132
2133
static void
2134
csched2_context_saved(const struct scheduler *ops, struct vcpu *vc)
2135
0
{
2136
0
    struct csched2_vcpu * const svc = csched2_vcpu(vc);
2137
0
    spinlock_t *lock = vcpu_schedule_lock_irq(vc);
2138
0
    s_time_t now = NOW();
2139
0
    LIST_HEAD(were_parked);
2140
0
2141
0
    BUG_ON( !is_idle_vcpu(vc) && svc->rqd != c2rqd(ops, vc->processor));
2142
0
    ASSERT(is_idle_vcpu(vc) || svc->rqd == c2rqd(ops, vc->processor));
2143
0
2144
0
    /* This vcpu is now eligible to be put on the runqueue again */
2145
0
    __clear_bit(__CSFLAG_scheduled, &svc->flags);
2146
0
2147
0
    if ( unlikely(has_cap(svc) && svc->budget > 0) )
2148
0
        vcpu_return_budget(svc, &were_parked);
2149
0
2150
0
    /* If someone wants it on the runqueue, put it there. */
2151
0
    /*
2152
0
     * NB: We can get rid of CSFLAG_scheduled by checking for
2153
0
     * vc->is_running and vcpu_on_runq(svc) here.  However,
2154
0
     * since we're accessing the flags cacheline anyway,
2155
0
     * it seems a bit pointless; especially as we have plenty of
2156
0
     * bits free.
2157
0
     */
2158
0
    if ( __test_and_clear_bit(__CSFLAG_delayed_runq_add, &svc->flags)
2159
0
         && likely(vcpu_runnable(vc)) )
2160
0
    {
2161
0
        ASSERT(!vcpu_on_runq(svc));
2162
0
2163
0
        runq_insert(ops, svc);
2164
0
        runq_tickle(ops, svc, now);
2165
0
    }
2166
0
    else if ( !is_idle_vcpu(vc) )
2167
0
        update_load(ops, svc->rqd, svc, -1, now);
2168
0
2169
0
    vcpu_schedule_unlock_irq(lock, vc);
2170
0
2171
0
    unpark_parked_vcpus(ops, &were_parked);
2172
0
}
2173
2174
0
#define MAX_LOAD (STIME_MAX)
2175
static int
2176
csched2_cpu_pick(const struct scheduler *ops, struct vcpu *vc)
2177
0
{
2178
0
    struct csched2_private *prv = csched2_priv(ops);
2179
0
    int i, min_rqi = -1, min_s_rqi = -1;
2180
0
    unsigned int new_cpu, cpu = vc->processor;
2181
0
    struct csched2_vcpu *svc = csched2_vcpu(vc);
2182
0
    s_time_t min_avgload = MAX_LOAD, min_s_avgload = MAX_LOAD;
2183
0
    bool has_soft;
2184
0
2185
0
    ASSERT(!cpumask_empty(&prv->active_queues));
2186
0
2187
0
    SCHED_STAT_CRANK(pick_cpu);
2188
0
2189
0
    /* Locking:
2190
0
     * - Runqueue lock of vc->processor is already locked
2191
0
     * - Need to grab prv lock to make sure active runqueues don't
2192
0
     *   change
2193
0
     * - Need to grab locks for other runqueues while checking
2194
0
     *   avgload
2195
0
     * Locking constraint is:
2196
0
     * - Lock prv before runqueue locks
2197
0
     * - Trylock between runqueue locks (no ordering)
2198
0
     *
2199
0
     * Since one of the runqueue locks is already held, we can't
2200
0
     * just grab the prv lock.  Instead, we'll have to trylock, and
2201
0
     * do something else reasonable if we fail.
2202
0
     */
2203
0
    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
2204
0
2205
0
    if ( !read_trylock(&prv->lock) )
2206
0
    {
2207
0
        /* We may be here because someone requested us to migrate. */
2208
0
        __clear_bit(__CSFLAG_runq_migrate_request, &svc->flags);
2209
0
        new_cpu = get_fallback_cpu(svc);
2210
0
        /*
2211
0
         * Tracing of runq and its load won't be accurate, since we could
2212
0
         * not get the lock, but at least we will output the chosen pcpu.
2213
0
         */
2214
0
        goto out;
2215
0
    }
2216
0
2217
0
    cpumask_and(cpumask_scratch_cpu(cpu), vc->cpu_hard_affinity,
2218
0
                cpupool_domain_cpumask(vc->domain));
2219
0
2220
0
    /*
2221
0
     * First check to see if we're here because someone else suggested a place
2222
0
     * for us to move.
2223
0
     */
2224
0
    if ( __test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) )
2225
0
    {
2226
0
        if ( unlikely(svc->migrate_rqd->id < 0) )
2227
0
        {
2228
0
            printk(XENLOG_WARNING "%s: target runqueue disappeared!\n",
2229
0
                   __func__);
2230
0
        }
2231
0
        else if ( cpumask_intersects(cpumask_scratch_cpu(cpu),
2232
0
                                     &svc->migrate_rqd->active) )
2233
0
        {
2234
0
            /*
2235
0
             * If we've been asked to move to migrate_rqd, we should just do
2236
0
             * that, which we actually do by returning one cpu from that runq.
2237
0
             * There is no need to take care of soft affinity, as that will
2238
0
             * happen in runq_tickle().
2239
0
             */
2240
0
            cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
2241
0
                        &svc->migrate_rqd->active);
2242
0
            new_cpu = cpumask_cycle(svc->migrate_rqd->pick_bias,
2243
0
                                    cpumask_scratch_cpu(cpu));
2244
0
2245
0
            svc->migrate_rqd->pick_bias = new_cpu;
2246
0
            goto out_up;
2247
0
        }
2248
0
        /* Fall-through to normal cpu pick */
2249
0
    }
2250
0
2251
0
    /*
2252
0
     * What we want is:
2253
0
     *  - if we have soft affinity, the runqueue with the lowest average
2254
0
     *    load, among the ones that contain cpus in our soft affinity; this
2255
0
     *    represents the best runq on which we would want to run.
2256
0
     *  - the runqueue with the lowest average load among the ones that
2257
0
     *    contains cpus in our hard affinity; this represent the best runq
2258
0
     *    on which we can run.
2259
0
     *
2260
0
     * Find both runqueues in one pass.
2261
0
     */
2262
0
    has_soft = has_soft_affinity(vc, vc->cpu_hard_affinity);
2263
0
    for_each_cpu(i, &prv->active_queues)
2264
0
    {
2265
0
        struct csched2_runqueue_data *rqd;
2266
0
        s_time_t rqd_avgload = MAX_LOAD;
2267
0
2268
0
        rqd = prv->rqd + i;
2269
0
2270
0
        /*
2271
0
         * If none of the cpus of this runqueue is in svc's hard-affinity,
2272
0
         * skip the runqueue.
2273
0
         *
2274
0
         * Note that, in case svc's hard-affinity has changed, this is the
2275
0
         * first time when we see such change, so it is indeed possible
2276
0
         * that we end up skipping svc's current runqueue.
2277
0
         */
2278
0
        if ( !cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active) )
2279
0
            continue;
2280
0
2281
0
        /*
2282
0
         * If checking a different runqueue, grab the lock, read the avg,
2283
0
         * and then release the lock.
2284
0
         *
2285
0
         * If on our own runqueue, don't grab or release the lock;
2286
0
         * but subtract our own load from the runqueue load to simulate
2287
0
         * impartiality.
2288
0
         */
2289
0
        if ( rqd == svc->rqd )
2290
0
        {
2291
0
            rqd_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0);
2292
0
        }
2293
0
        else if ( spin_trylock(&rqd->lock) )
2294
0
        {
2295
0
            rqd_avgload = rqd->b_avgload;
2296
0
            spin_unlock(&rqd->lock);
2297
0
        }
2298
0
2299
0
        /*
2300
0
         * if svc has a soft-affinity, and some cpus of rqd are part of it,
2301
0
         * see if we need to update the "soft-affinity minimum".
2302
0
         */
2303
0
        if ( has_soft &&
2304
0
             rqd_avgload < min_s_avgload )
2305
0
        {
2306
0
            cpumask_t mask;
2307
0
2308
0
            cpumask_and(&mask, cpumask_scratch_cpu(cpu), &rqd->active);
2309
0
            if ( cpumask_intersects(&mask, svc->vcpu->cpu_soft_affinity) )
2310
0
            {
2311
0
                min_s_avgload = rqd_avgload;
2312
0
                min_s_rqi = i;
2313
0
            }
2314
0
        }
2315
0
        /* In any case, keep the "hard-affinity minimum" updated too. */
2316
0
        if ( rqd_avgload < min_avgload )
2317
0
        {
2318
0
            min_avgload = rqd_avgload;
2319
0
            min_rqi = i;
2320
0
        }
2321
0
    }
2322
0
2323
0
    if ( has_soft && min_s_rqi != -1 )
2324
0
    {
2325
0
        /*
2326
0
         * We have soft affinity, and we have a candidate runq, so go for it.
2327
0
         *
2328
0
         * Note that, to obtain the soft-affinity mask, we "just" put what we
2329
0
         * have in cpumask_scratch in && with vc->cpu_soft_affinity. This is
2330
0
         * ok because:
2331
0
         * - we know that vc->cpu_hard_affinity and vc->cpu_soft_affinity have
2332
0
         *   a non-empty intersection (because has_soft is true);
2333
0
         * - we have vc->cpu_hard_affinity & cpupool_domain_cpumask() already
2334
0
         *   in cpumask_scratch, we do save a lot doing like this.
2335
0
         *
2336
0
         * It's kind of like open coding affinity_balance_cpumask() but, in
2337
0
         * this specific case, calling that would mean a lot of (unnecessary)
2338
0
         * cpumask operations.
2339
0
         */
2340
0
        cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
2341
0
                    vc->cpu_soft_affinity);
2342
0
        cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
2343
0
                    &prv->rqd[min_s_rqi].active);
2344
0
    }
2345
0
    else if ( min_rqi != -1 )
2346
0
    {
2347
0
        /*
2348
0
         * Either we don't have soft-affinity, or we do, but we did not find
2349
0
         * any suitable runq. But we did find one when considering hard
2350
0
         * affinity, so go for it.
2351
0
         *
2352
0
         * cpumask_scratch already has vc->cpu_hard_affinity &
2353
0
         * cpupool_domain_cpumask() in it, so it's enough that we filter
2354
0
         * with the cpus of the runq.
2355
0
         */
2356
0
        cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
2357
0
                    &prv->rqd[min_rqi].active);
2358
0
    }
2359
0
    else
2360
0
    {
2361
0
        /*
2362
0
         * We didn't find anyone at all (most likely because of spinlock
2363
0
         * contention).
2364
0
         */
2365
0
        new_cpu = get_fallback_cpu(svc);
2366
0
        min_rqi = c2r(new_cpu);
2367
0
        min_avgload = prv->rqd[min_rqi].b_avgload;
2368
0
        goto out_up;
2369
0
    }
2370
0
2371
0
    new_cpu = cpumask_cycle(prv->rqd[min_rqi].pick_bias,
2372
0
                            cpumask_scratch_cpu(cpu));
2373
0
    prv->rqd[min_rqi].pick_bias = new_cpu;
2374
0
    BUG_ON(new_cpu >= nr_cpu_ids);
2375
0
2376
0
 out_up:
2377
0
    read_unlock(&prv->lock);
2378
0
 out:
2379
0
    if ( unlikely(tb_init_done) )
2380
0
    {
2381
0
        struct {
2382
0
            uint64_t b_avgload;
2383
0
            unsigned vcpu:16, dom:16;
2384
0
            unsigned rq_id:16, new_cpu:16;
2385
0
        } d;
2386
0
        d.dom = vc->domain->domain_id;
2387
0
        d.vcpu = vc->vcpu_id;
2388
0
        d.rq_id = min_rqi;
2389
0
        d.b_avgload = min_avgload;
2390
0
        d.new_cpu = new_cpu;
2391
0
        __trace_var(TRC_CSCHED2_PICKED_CPU, 1,
2392
0
                    sizeof(d),
2393
0
                    (unsigned char *)&d);
2394
0
    }
2395
0
2396
0
    return new_cpu;
2397
0
}
2398
2399
/* Working state of the load-balancing algorithm */
2400
typedef struct {
2401
    /* NB: Modified by consider() */
2402
    s_time_t load_delta;
2403
    struct csched2_vcpu * best_push_svc, *best_pull_svc;
2404
    /* NB: Read by consider() */
2405
    struct csched2_runqueue_data *lrqd;
2406
    struct csched2_runqueue_data *orqd;                  
2407
} balance_state_t;
2408
2409
static void consider(balance_state_t *st, 
2410
                     struct csched2_vcpu *push_svc,
2411
                     struct csched2_vcpu *pull_svc)
2412
0
{
2413
0
    s_time_t l_load, o_load, delta;
2414
0
2415
0
    l_load = st->lrqd->b_avgload;
2416
0
    o_load = st->orqd->b_avgload;
2417
0
    if ( push_svc )
2418
0
    {
2419
0
        /* What happens to the load on both if we push? */
2420
0
        l_load -= push_svc->avgload;
2421
0
        o_load += push_svc->avgload;
2422
0
    }
2423
0
    if ( pull_svc )
2424
0
    {
2425
0
        /* What happens to the load on both if we pull? */
2426
0
        l_load += pull_svc->avgload;
2427
0
        o_load -= pull_svc->avgload;
2428
0
    }
2429
0
2430
0
    delta = l_load - o_load;
2431
0
    if ( delta < 0 )
2432
0
        delta = -delta;
2433
0
2434
0
    if ( delta < st->load_delta )
2435
0
    {
2436
0
        st->load_delta = delta;
2437
0
        st->best_push_svc=push_svc;
2438
0
        st->best_pull_svc=pull_svc;
2439
0
    }
2440
0
}
2441
2442
2443
static void migrate(const struct scheduler *ops,
2444
                    struct csched2_vcpu *svc, 
2445
                    struct csched2_runqueue_data *trqd, 
2446
                    s_time_t now)
2447
0
{
2448
0
    int cpu = svc->vcpu->processor;
2449
0
2450
0
    if ( unlikely(tb_init_done) )
2451
0
    {
2452
0
        struct {
2453
0
            unsigned vcpu:16, dom:16;
2454
0
            unsigned rqi:16, trqi:16;
2455
0
        } d;
2456
0
        d.dom = svc->vcpu->domain->domain_id;
2457
0
        d.vcpu = svc->vcpu->vcpu_id;
2458
0
        d.rqi = svc->rqd->id;
2459
0
        d.trqi = trqd->id;
2460
0
        __trace_var(TRC_CSCHED2_MIGRATE, 1,
2461
0
                    sizeof(d),
2462
0
                    (unsigned char *)&d);
2463
0
    }
2464
0
2465
0
    if ( svc->flags & CSFLAG_scheduled )
2466
0
    {
2467
0
        /* It's running; mark it to migrate. */
2468
0
        svc->migrate_rqd = trqd;
2469
0
        __set_bit(_VPF_migrating, &svc->vcpu->pause_flags);
2470
0
        __set_bit(__CSFLAG_runq_migrate_request, &svc->flags);
2471
0
        SCHED_STAT_CRANK(migrate_requested);
2472
0
        tickle_cpu(cpu, svc->rqd);
2473
0
    }
2474
0
    else
2475
0
    {
2476
0
        int on_runq = 0;
2477
0
        /* It's not running; just move it */
2478
0
        if ( vcpu_on_runq(svc) )
2479
0
        {
2480
0
            runq_remove(svc);
2481
0
            update_load(ops, svc->rqd, NULL, -1, now);
2482
0
            on_runq = 1;
2483
0
        }
2484
0
        _runq_deassign(svc);
2485
0
2486
0
        cpumask_and(cpumask_scratch_cpu(cpu), svc->vcpu->cpu_hard_affinity,
2487
0
                    cpupool_domain_cpumask(svc->vcpu->domain));
2488
0
        cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
2489
0
                    &trqd->active);
2490
0
        svc->vcpu->processor = cpumask_cycle(trqd->pick_bias,
2491
0
                                             cpumask_scratch_cpu(cpu));
2492
0
        trqd->pick_bias = svc->vcpu->processor;
2493
0
        ASSERT(svc->vcpu->processor < nr_cpu_ids);
2494
0
2495
0
        _runq_assign(svc, trqd);
2496
0
        if ( on_runq )
2497
0
        {
2498
0
            update_load(ops, svc->rqd, NULL, 1, now);
2499
0
            runq_insert(ops, svc);
2500
0
            runq_tickle(ops, svc, now);
2501
0
            SCHED_STAT_CRANK(migrate_on_runq);
2502
0
        }
2503
0
        else
2504
0
            SCHED_STAT_CRANK(migrate_no_runq);
2505
0
    }
2506
0
}
2507
2508
/*
2509
 * It makes sense considering migrating svc to rqd, if:
2510
 *  - svc is not already flagged to migrate,
2511
 *  - if svc is allowed to run on at least one of the pcpus of rqd.
2512
 */
2513
static bool vcpu_is_migrateable(struct csched2_vcpu *svc,
2514
                                  struct csched2_runqueue_data *rqd)
2515
0
{
2516
0
    struct vcpu *v = svc->vcpu;
2517
0
    int cpu = svc->vcpu->processor;
2518
0
2519
0
    cpumask_and(cpumask_scratch_cpu(cpu), v->cpu_hard_affinity,
2520
0
                cpupool_domain_cpumask(v->domain));
2521
0
2522
0
    return !(svc->flags & CSFLAG_runq_migrate_request) &&
2523
0
           cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active);
2524
0
}
2525
2526
static void balance_load(const struct scheduler *ops, int cpu, s_time_t now)
2527
0
{
2528
0
    struct csched2_private *prv = csched2_priv(ops);
2529
0
    int i, max_delta_rqi = -1;
2530
0
    struct list_head *push_iter, *pull_iter;
2531
0
    bool inner_load_updated = 0;
2532
0
2533
0
    balance_state_t st = { .best_push_svc = NULL, .best_pull_svc = NULL };
2534
0
2535
0
    /*
2536
0
     * Basic algorithm: Push, pull, or swap.
2537
0
     * - Find the runqueue with the furthest load distance
2538
0
     * - Find a pair that makes the difference the least (where one
2539
0
     * on either side may be empty).
2540
0
     */
2541
0
2542
0
    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
2543
0
    st.lrqd = c2rqd(ops, cpu);
2544
0
2545
0
    update_runq_load(ops, st.lrqd, 0, now);
2546
0
2547
0
retry:
2548
0
    if ( !read_trylock(&prv->lock) )
2549
0
        return;
2550
0
2551
0
    st.load_delta = 0;
2552
0
2553
0
    for_each_cpu(i, &prv->active_queues)
2554
0
    {
2555
0
        s_time_t delta;
2556
0
        
2557
0
        st.orqd = prv->rqd + i;
2558
0
2559
0
        if ( st.orqd == st.lrqd
2560
0
             || !spin_trylock(&st.orqd->lock) )
2561
0
            continue;
2562
0
2563
0
        update_runq_load(ops, st.orqd, 0, now);
2564
0
    
2565
0
        delta = st.lrqd->b_avgload - st.orqd->b_avgload;
2566
0
        if ( delta < 0 )
2567
0
            delta = -delta;
2568
0
2569
0
        if ( delta > st.load_delta )
2570
0
        {
2571
0
            st.load_delta = delta;
2572
0
            max_delta_rqi = i;
2573
0
        }
2574
0
2575
0
        spin_unlock(&st.orqd->lock);
2576
0
    }
2577
0
2578
0
    /* Minimize holding the private scheduler lock. */
2579
0
    read_unlock(&prv->lock);
2580
0
    if ( max_delta_rqi == -1 )
2581
0
        goto out;
2582
0
2583
0
    {
2584
0
        s_time_t load_max;
2585
0
        int cpus_max;
2586
0
2587
0
        
2588
0
        load_max = st.lrqd->b_avgload;
2589
0
        if ( st.orqd->b_avgload > load_max )
2590
0
            load_max = st.orqd->b_avgload;
2591
0
2592
0
        cpus_max = cpumask_weight(&st.lrqd->active);
2593
0
        i = cpumask_weight(&st.orqd->active);
2594
0
        if ( i > cpus_max )
2595
0
            cpus_max = i;
2596
0
2597
0
        if ( unlikely(tb_init_done) )
2598
0
        {
2599
0
            struct {
2600
0
                unsigned lrq_id:16, orq_id:16;
2601
0
                unsigned load_delta;
2602
0
            } d;
2603
0
            d.lrq_id = st.lrqd->id;
2604
0
            d.orq_id = st.orqd->id;
2605
0
            d.load_delta = st.load_delta;
2606
0
            __trace_var(TRC_CSCHED2_LOAD_CHECK, 1,
2607
0
                        sizeof(d),
2608
0
                        (unsigned char *)&d);
2609
0
        }
2610
0
2611
0
        /*
2612
0
         * If we're under 100% capacaty, only shift if load difference
2613
0
         * is > 1.  otherwise, shift if under 12.5%
2614
0
         */
2615
0
        if ( load_max < ((s_time_t)cpus_max << prv->load_precision_shift) )
2616
0
        {
2617
0
            if ( st.load_delta < (1ULL << (prv->load_precision_shift +
2618
0
                                           opt_underload_balance_tolerance)) )
2619
0
                 goto out;
2620
0
        }
2621
0
        else
2622
0
            if ( st.load_delta < (1ULL << (prv->load_precision_shift +
2623
0
                                           opt_overload_balance_tolerance)) )
2624
0
                goto out;
2625
0
    }
2626
0
             
2627
0
    /* Try to grab the other runqueue lock; if it's been taken in the
2628
0
     * meantime, try the process over again.  This can't deadlock
2629
0
     * because if it doesn't get any other rqd locks, it will simply
2630
0
     * give up and return. */
2631
0
    st.orqd = prv->rqd + max_delta_rqi;
2632
0
    if ( !spin_trylock(&st.orqd->lock) )
2633
0
        goto retry;
2634
0
2635
0
    /* Make sure the runqueue hasn't been deactivated since we released prv->lock */
2636
0
    if ( unlikely(st.orqd->id < 0) )
2637
0
        goto out_up;
2638
0
2639
0
    if ( unlikely(tb_init_done) )
2640
0
    {
2641
0
        struct {
2642
0
            uint64_t lb_avgload, ob_avgload;
2643
0
            unsigned lrq_id:16, orq_id:16;
2644
0
        } d;
2645
0
        d.lrq_id = st.lrqd->id;
2646
0
        d.lb_avgload = st.lrqd->b_avgload;
2647
0
        d.orq_id = st.orqd->id;
2648
0
        d.ob_avgload = st.orqd->b_avgload;
2649
0
        __trace_var(TRC_CSCHED2_LOAD_BALANCE, 1,
2650
0
                    sizeof(d),
2651
0
                    (unsigned char *)&d);
2652
0
    }
2653
0
2654
0
    SCHED_STAT_CRANK(acct_load_balance);
2655
0
2656
0
    /* Look for "swap" which gives the best load average
2657
0
     * FIXME: O(n^2)! */
2658
0
2659
0
    /* Reuse load delta (as we're trying to minimize it) */
2660
0
    list_for_each( push_iter, &st.lrqd->svc )
2661
0
    {
2662
0
        struct csched2_vcpu * push_svc = list_entry(push_iter, struct csched2_vcpu, rqd_elem);
2663
0
2664
0
        update_svc_load(ops, push_svc, 0, now);
2665
0
2666
0
        if ( !vcpu_is_migrateable(push_svc, st.orqd) )
2667
0
            continue;
2668
0
2669
0
        list_for_each( pull_iter, &st.orqd->svc )
2670
0
        {
2671
0
            struct csched2_vcpu * pull_svc = list_entry(pull_iter, struct csched2_vcpu, rqd_elem);
2672
0
            
2673
0
            if ( !inner_load_updated )
2674
0
                update_svc_load(ops, pull_svc, 0, now);
2675
0
        
2676
0
            if ( !vcpu_is_migrateable(pull_svc, st.lrqd) )
2677
0
                continue;
2678
0
2679
0
            consider(&st, push_svc, pull_svc);
2680
0
        }
2681
0
2682
0
        inner_load_updated = 1;
2683
0
2684
0
        /* Consider push only */
2685
0
        consider(&st, push_svc, NULL);
2686
0
    }
2687
0
2688
0
    list_for_each( pull_iter, &st.orqd->svc )
2689
0
    {
2690
0
        struct csched2_vcpu * pull_svc = list_entry(pull_iter, struct csched2_vcpu, rqd_elem);
2691
0
        
2692
0
        if ( !vcpu_is_migrateable(pull_svc, st.lrqd) )
2693
0
            continue;
2694
0
2695
0
        /* Consider pull only */
2696
0
        consider(&st, NULL, pull_svc);
2697
0
    }
2698
0
2699
0
    /* OK, now we have some candidates; do the moving */
2700
0
    if ( st.best_push_svc )
2701
0
        migrate(ops, st.best_push_svc, st.orqd, now);
2702
0
    if ( st.best_pull_svc )
2703
0
        migrate(ops, st.best_pull_svc, st.lrqd, now);
2704
0
2705
0
 out_up:
2706
0
    spin_unlock(&st.orqd->lock);
2707
0
 out:
2708
0
    return;
2709
0
}
2710
2711
static void
2712
csched2_vcpu_migrate(
2713
    const struct scheduler *ops, struct vcpu *vc, unsigned int new_cpu)
2714
0
{
2715
0
    struct domain *d = vc->domain;
2716
0
    struct csched2_vcpu * const svc = csched2_vcpu(vc);
2717
0
    struct csched2_runqueue_data *trqd;
2718
0
    s_time_t now = NOW();
2719
0
2720
0
    /*
2721
0
     * Being passed a target pCPU which is outside of our cpupool is only
2722
0
     * valid if we are shutting down (or doing ACPI suspend), and we are
2723
0
     * moving everyone to BSP, no matter whether or not BSP is inside our
2724
0
     * cpupool.
2725
0
     *
2726
0
     * And since there indeed is the chance that it is not part of it, all
2727
0
     * we must do is remove _and_ unassign the vCPU from any runqueue, as
2728
0
     * well as updating v->processor with the target, so that the suspend
2729
0
     * process can continue.
2730
0
     *
2731
0
     * It will then be during resume that a new, meaningful, value for
2732
0
     * v->processor will be chosen, and during actual domain unpause that
2733
0
     * the vCPU will be assigned to and added to the proper runqueue.
2734
0
     */
2735
0
    if ( unlikely(!cpumask_test_cpu(new_cpu, cpupool_domain_cpumask(d))) )
2736
0
    {
2737
0
        ASSERT(system_state == SYS_STATE_suspend);
2738
0
        if ( vcpu_on_runq(svc) )
2739
0
        {
2740
0
            runq_remove(svc);
2741
0
            update_load(ops, svc->rqd, NULL, -1, now);
2742
0
        }
2743
0
        _runq_deassign(svc);
2744
0
        vc->processor = new_cpu;
2745
0
        return;
2746
0
    }
2747
0
2748
0
    /* If here, new_cpu must be a valid Credit2 pCPU, and in our affinity. */
2749
0
    ASSERT(cpumask_test_cpu(new_cpu, &csched2_priv(ops)->initialized));
2750
0
    ASSERT(cpumask_test_cpu(new_cpu, vc->cpu_hard_affinity));
2751
0
2752
0
    trqd = c2rqd(ops, new_cpu);
2753
0
2754
0
    /*
2755
0
     * Do the actual movement toward new_cpu, and update vc->processor.
2756
0
     * If we are changing runqueue, migrate() takes care of everything.
2757
0
     * If we are not changing runqueue, we need to update vc->processor
2758
0
     * here. In fact, if, for instance, we are here because the vcpu's
2759
0
     * hard affinity changed, we don't want to risk leaving vc->processor
2760
0
     * pointing to a pcpu where we can't run any longer.
2761
0
     */
2762
0
    if ( trqd != svc->rqd )
2763
0
        migrate(ops, svc, trqd, now);
2764
0
    else
2765
0
        vc->processor = new_cpu;
2766
0
}
2767
2768
static int
2769
csched2_dom_cntl(
2770
    const struct scheduler *ops,
2771
    struct domain *d,
2772
    struct xen_domctl_scheduler_op *op)
2773
0
{
2774
0
    struct csched2_dom * const sdom = csched2_dom(d);
2775
0
    struct csched2_private *prv = csched2_priv(ops);
2776
0
    unsigned long flags;
2777
0
    struct vcpu *v;
2778
0
    int rc = 0;
2779
0
2780
0
    /*
2781
0
     * Locking:
2782
0
     *  - we must take the private lock for accessing the weights of the
2783
0
     *    vcpus of d, and/or the cap;
2784
0
     *  - in the putinfo case, we also need the runqueue lock(s), for
2785
0
     *    updating the max waight of the runqueue(s).
2786
0
     *    If changing the cap, we also need the budget_lock, for updating
2787
0
     *    the value of the domain budget pool (and the runqueue lock,
2788
0
     *    for adjusting the parameters and rescheduling any vCPU that is
2789
0
     *    running at the time of the change).
2790
0
     */
2791
0
    switch ( op->cmd )
2792
0
    {
2793
0
    case XEN_DOMCTL_SCHEDOP_getinfo:
2794
0
        read_lock_irqsave(&prv->lock, flags);
2795
0
        op->u.credit2.weight = sdom->weight;
2796
0
        op->u.credit2.cap = sdom->cap;
2797
0
        read_unlock_irqrestore(&prv->lock, flags);
2798
0
        break;
2799
0
    case XEN_DOMCTL_SCHEDOP_putinfo:
2800
0
        write_lock_irqsave(&prv->lock, flags);
2801
0
        /* Weight */
2802
0
        if ( op->u.credit2.weight != 0 )
2803
0
        {
2804
0
            int old_weight;
2805
0
2806
0
            old_weight = sdom->weight;
2807
0
2808
0
            sdom->weight = op->u.credit2.weight;
2809
0
2810
0
            /* Update weights for vcpus, and max_weight for runqueues on which they reside */
2811
0
            for_each_vcpu ( d, v )
2812
0
            {
2813
0
                struct csched2_vcpu *svc = csched2_vcpu(v);
2814
0
                spinlock_t *lock = vcpu_schedule_lock(svc->vcpu);
2815
0
2816
0
                ASSERT(svc->rqd == c2rqd(ops, svc->vcpu->processor));
2817
0
2818
0
                svc->weight = sdom->weight;
2819
0
                update_max_weight(svc->rqd, svc->weight, old_weight);
2820
0
2821
0
                vcpu_schedule_unlock(lock, svc->vcpu);
2822
0
            }
2823
0
        }
2824
0
        /* Cap */
2825
0
        if ( op->u.credit2.cap != 0 )
2826
0
        {
2827
0
            struct csched2_vcpu *svc;
2828
0
            spinlock_t *lock;
2829
0
2830
0
            /* Cap is only valid if it's below 100 * nr_of_vCPUS */
2831
0
            if ( op->u.credit2.cap > 100 * sdom->nr_vcpus )
2832
0
            {
2833
0
                rc = -EINVAL;
2834
0
                write_unlock_irqrestore(&prv->lock, flags);
2835
0
                break;
2836
0
            }
2837
0
2838
0
            spin_lock(&sdom->budget_lock);
2839
0
            sdom->tot_budget = (CSCHED2_BDGT_REPL_PERIOD * op->u.credit2.cap);
2840
0
            sdom->tot_budget /= 100;
2841
0
            spin_unlock(&sdom->budget_lock);
2842
0
2843
0
            /*
2844
0
             * When trying to get some budget and run, each vCPU will grab
2845
0
             * from the pool 1/N (with N = nr of vCPUs of the domain) of
2846
0
             * the total budget. Roughly speaking, this means each vCPU will
2847
0
             * have at least one chance to run during every period.
2848
0
             */
2849
0
            for_each_vcpu ( d, v )
2850
0
            {
2851
0
                svc = csched2_vcpu(v);
2852
0
                lock = vcpu_schedule_lock(svc->vcpu);
2853
0
                /*
2854
0
                 * Too small quotas would in theory cause a lot of overhead,
2855
0
                 * which then won't happen because, in csched2_runtime(),
2856
0
                 * CSCHED2_MIN_TIMER is what would be used anyway.
2857
0
                 */
2858
0
                svc->budget_quota = max(sdom->tot_budget / sdom->nr_vcpus,
2859
0
                                        CSCHED2_MIN_TIMER);
2860
0
                vcpu_schedule_unlock(lock, svc->vcpu);
2861
0
            }
2862
0
2863
0
            if ( sdom->cap == 0 )
2864
0
            {
2865
0
                /*
2866
0
                 * We give to the domain the budget to which it is entitled,
2867
0
                 * and queue its first replenishment event.
2868
0
                 *
2869
0
                 * Since cap is currently disabled for this domain, we
2870
0
                 * know no vCPU is messing with the domain's budget, and
2871
0
                 * the replenishment timer is still off.
2872
0
                 * For these reasons, it is safe to do the following without
2873
0
                 * taking the budget_lock.
2874
0
                 */
2875
0
                sdom->budget = sdom->tot_budget;
2876
0
                sdom->next_repl = NOW() + CSCHED2_BDGT_REPL_PERIOD;
2877
0
                set_timer(sdom->repl_timer, sdom->next_repl);
2878
0
2879
0
                /*
2880
0
                 * Now, let's enable budget accounting for all the vCPUs.
2881
0
                 * For making sure that they will start to honour the domain's
2882
0
                 * cap, we set their budget to 0.
2883
0
                 * This way, as soon as they will try to run, they will have
2884
0
                 * to get some budget.
2885
0
                 *
2886
0
                 * For the vCPUs that are already running, we trigger the
2887
0
                 * scheduler on their pCPU. When, as a consequence of this,
2888
0
                 * csched2_schedule() will run, it will figure out there is
2889
0
                 * no budget, and the vCPU will try to get some (and be parked,
2890
0
                 * if there's none, and we'll switch to someone else).
2891
0
                 */
2892
0
                for_each_vcpu ( d, v )
2893
0
                {
2894
0
                    svc = csched2_vcpu(v);
2895
0
                    lock = vcpu_schedule_lock(svc->vcpu);
2896
0
                    if ( v->is_running )
2897
0
                    {
2898
0
                        unsigned int cpu = v->processor;
2899
0
                        struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
2900
0
2901
0
                        ASSERT(curr_on_cpu(cpu) == v);
2902
0
2903
0
                        /*
2904
0
                         * We are triggering a reschedule on the vCPU's
2905
0
                         * pCPU. That will run burn_credits() and, since
2906
0
                         * the vCPU is capped now, it would charge all the
2907
0
                         * execution time of this last round as budget as
2908
0
                         * well. That will make the vCPU budget go negative,
2909
0
                         * potentially by a large amount, and it's unfair.
2910
0
                         *
2911
0
                         * To avoid that, call burn_credit() here, to do the
2912
0
                         * accounting of this current running instance now,
2913
0
                         * with budgetting still disabled. This does not
2914
0
                         * prevent some small amount of budget being charged
2915
0
                         * to the vCPU (i.e., the amount of time it runs from
2916
0
                         * now, to when scheduling happens). The budget will
2917
0
                         * also go below 0, but a lot less than how it would
2918
0
                         * if we don't do this.
2919
0
                         */
2920
0
                        burn_credits(rqd, svc, NOW());
2921
0
                        __cpumask_set_cpu(cpu, &rqd->tickled);
2922
0
                        ASSERT(!cpumask_test_cpu(cpu, &rqd->smt_idle));
2923
0
                        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
2924
0
                    }
2925
0
                    svc->budget = 0;
2926
0
                    vcpu_schedule_unlock(lock, svc->vcpu);
2927
0
                }
2928
0
            }
2929
0
2930
0
            sdom->cap = op->u.credit2.cap;
2931
0
        }
2932
0
        else if ( sdom->cap != 0 )
2933
0
        {
2934
0
            LIST_HEAD(parked);
2935
0
2936
0
            stop_timer(sdom->repl_timer);
2937
0
2938
0
            /* Disable budget accounting for all the vCPUs. */
2939
0
            for_each_vcpu ( d, v )
2940
0
            {
2941
0
                struct csched2_vcpu *svc = csched2_vcpu(v);
2942
0
                spinlock_t *lock = vcpu_schedule_lock(svc->vcpu);
2943
0
2944
0
                svc->budget = STIME_MAX;
2945
0
                svc->budget_quota = 0;
2946
0
2947
0
                vcpu_schedule_unlock(lock, svc->vcpu);
2948
0
            }
2949
0
            sdom->cap = 0;
2950
0
            /*
2951
0
             * We are disabling the cap for this domain, which may have
2952
0
             * vCPUs waiting for a replenishment, so we unpark them all.
2953
0
             * Note that, since we have already disabled budget accounting
2954
0
             * for all the vCPUs of the domain, no currently running vCPU
2955
0
             * will be added to the parked vCPUs list any longer.
2956
0
             */
2957
0
            spin_lock(&sdom->budget_lock);
2958
0
            list_splice_init(&sdom->parked_vcpus, &parked);
2959
0
            spin_unlock(&sdom->budget_lock);
2960
0
2961
0
            unpark_parked_vcpus(ops, &parked);
2962
0
        }
2963
0
        write_unlock_irqrestore(&prv->lock, flags);
2964
0
        break;
2965
0
    default:
2966
0
        rc = -EINVAL;
2967
0
        break;
2968
0
    }
2969
0
2970
0
2971
0
    return rc;
2972
0
}
2973
2974
static int csched2_sys_cntl(const struct scheduler *ops,
2975
                            struct xen_sysctl_scheduler_op *sc)
2976
0
{
2977
0
    struct xen_sysctl_credit2_schedule *params = &sc->u.sched_credit2;
2978
0
    struct csched2_private *prv = csched2_priv(ops);
2979
0
    unsigned long flags;
2980
0
2981
0
    switch (sc->cmd )
2982
0
    {
2983
0
    case XEN_SYSCTL_SCHEDOP_putinfo:
2984
0
        if ( params->ratelimit_us &&
2985
0
             (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX ||
2986
0
              params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN ))
2987
0
            return -EINVAL;
2988
0
2989
0
        write_lock_irqsave(&prv->lock, flags);
2990
0
        if ( !prv->ratelimit_us && params->ratelimit_us )
2991
0
            printk(XENLOG_INFO "Enabling context switch rate limiting\n");
2992
0
        else if ( prv->ratelimit_us && !params->ratelimit_us )
2993
0
            printk(XENLOG_INFO "Disabling context switch rate limiting\n");
2994
0
        prv->ratelimit_us = params->ratelimit_us;
2995
0
        write_unlock_irqrestore(&prv->lock, flags);
2996
0
2997
0
    /* FALLTHRU */
2998
0
    case XEN_SYSCTL_SCHEDOP_getinfo:
2999
0
        params->ratelimit_us = prv->ratelimit_us;
3000
0
        break;
3001
0
    }
3002
0
3003
0
    return 0;
3004
0
}
3005
3006
static void *
3007
csched2_alloc_domdata(const struct scheduler *ops, struct domain *dom)
3008
0
{
3009
0
    struct csched2_private *prv = csched2_priv(ops);
3010
0
    struct csched2_dom *sdom;
3011
0
    unsigned long flags;
3012
0
3013
0
    sdom = xzalloc(struct csched2_dom);
3014
0
    if ( sdom == NULL )
3015
0
        return NULL;
3016
0
3017
0
    sdom->repl_timer = xzalloc(struct timer);
3018
0
    if ( sdom->repl_timer == NULL )
3019
0
    {
3020
0
        xfree(sdom);
3021
0
        return NULL;
3022
0
    }
3023
0
3024
0
    /* Initialize credit, cap and weight */
3025
0
    INIT_LIST_HEAD(&sdom->sdom_elem);
3026
0
    sdom->dom = dom;
3027
0
    sdom->weight = CSCHED2_DEFAULT_WEIGHT;
3028
0
    sdom->cap = 0U;
3029
0
    sdom->nr_vcpus = 0;
3030
0
3031
0
    init_timer(sdom->repl_timer, replenish_domain_budget, sdom,
3032
0
               cpumask_any(cpupool_domain_cpumask(dom)));
3033
0
    spin_lock_init(&sdom->budget_lock);
3034
0
    INIT_LIST_HEAD(&sdom->parked_vcpus);
3035
0
3036
0
    write_lock_irqsave(&prv->lock, flags);
3037
0
3038
0
    list_add_tail(&sdom->sdom_elem, &csched2_priv(ops)->sdom);
3039
0
3040
0
    write_unlock_irqrestore(&prv->lock, flags);
3041
0
3042
0
    return (void *)sdom;
3043
0
}
3044
3045
static int
3046
csched2_dom_init(const struct scheduler *ops, struct domain *dom)
3047
0
{
3048
0
    struct csched2_dom *sdom;
3049
0
3050
0
    if ( is_idle_domain(dom) )
3051
0
        return 0;
3052
0
3053
0
    sdom = csched2_alloc_domdata(ops, dom);
3054
0
    if ( sdom == NULL )
3055
0
        return -ENOMEM;
3056
0
3057
0
    dom->sched_priv = sdom;
3058
0
3059
0
    return 0;
3060
0
}
3061
3062
static void
3063
csched2_free_domdata(const struct scheduler *ops, void *data)
3064
0
{
3065
0
    unsigned long flags;
3066
0
    struct csched2_dom *sdom = data;
3067
0
    struct csched2_private *prv = csched2_priv(ops);
3068
0
3069
0
    kill_timer(sdom->repl_timer);
3070
0
3071
0
    write_lock_irqsave(&prv->lock, flags);
3072
0
3073
0
    list_del_init(&sdom->sdom_elem);
3074
0
3075
0
    write_unlock_irqrestore(&prv->lock, flags);
3076
0
3077
0
    xfree(sdom->repl_timer);
3078
0
    xfree(data);
3079
0
}
3080
3081
static void
3082
csched2_dom_destroy(const struct scheduler *ops, struct domain *dom)
3083
0
{
3084
0
    ASSERT(csched2_dom(dom)->nr_vcpus == 0);
3085
0
3086
0
    csched2_free_domdata(ops, csched2_dom(dom));
3087
0
}
3088
3089
static void
3090
csched2_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
3091
0
{
3092
0
    struct csched2_vcpu *svc = vc->sched_priv;
3093
0
    struct csched2_dom * const sdom = svc->sdom;
3094
0
    spinlock_t *lock;
3095
0
3096
0
    ASSERT(!is_idle_vcpu(vc));
3097
0
    ASSERT(list_empty(&svc->runq_elem));
3098
0
3099
0
    /* csched2_cpu_pick() expects the pcpu lock to be held */
3100
0
    lock = vcpu_schedule_lock_irq(vc);
3101
0
3102
0
    vc->processor = csched2_cpu_pick(ops, vc);
3103
0
3104
0
    spin_unlock_irq(lock);
3105
0
3106
0
    lock = vcpu_schedule_lock_irq(vc);
3107
0
3108
0
    /* Add vcpu to runqueue of initial processor */
3109
0
    runq_assign(ops, vc);
3110
0
3111
0
    vcpu_schedule_unlock_irq(lock, vc);
3112
0
3113
0
    sdom->nr_vcpus++;
3114
0
3115
0
    SCHED_STAT_CRANK(vcpu_insert);
3116
0
3117
0
    CSCHED2_VCPU_CHECK(vc);
3118
0
}
3119
3120
static void
3121
csched2_free_vdata(const struct scheduler *ops, void *priv)
3122
0
{
3123
0
    struct csched2_vcpu *svc = priv;
3124
0
3125
0
    xfree(svc);
3126
0
}
3127
3128
static void
3129
csched2_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
3130
0
{
3131
0
    struct csched2_vcpu * const svc = csched2_vcpu(vc);
3132
0
    spinlock_t *lock;
3133
0
3134
0
    ASSERT(!is_idle_vcpu(vc));
3135
0
    ASSERT(list_empty(&svc->runq_elem));
3136
0
3137
0
    SCHED_STAT_CRANK(vcpu_remove);
3138
0
3139
0
    /* Remove from runqueue */
3140
0
    lock = vcpu_schedule_lock_irq(vc);
3141
0
3142
0
    runq_deassign(ops, vc);
3143
0
3144
0
    vcpu_schedule_unlock_irq(lock, vc);
3145
0
3146
0
    svc->sdom->nr_vcpus--;
3147
0
}
3148
3149
/* How long should we let this vcpu run for? */
3150
static s_time_t
3151
csched2_runtime(const struct scheduler *ops, int cpu,
3152
                struct csched2_vcpu *snext, s_time_t now)
3153
0
{
3154
0
    s_time_t time, min_time;
3155
0
    int rt_credit; /* Proposed runtime measured in credits */
3156
0
    struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
3157
0
    struct list_head *runq = &rqd->runq;
3158
0
    struct csched2_private *prv = csched2_priv(ops);
3159
0
3160
0
    /*
3161
0
     * If we're idle, just stay so. Others (or external events)
3162
0
     * will poke us when necessary.
3163
0
     */
3164
0
    if ( is_idle_vcpu(snext->vcpu) )
3165
0
        return -1;
3166
0
3167
0
    /* General algorithm:
3168
0
     * 1) Run until snext's credit will be 0.
3169
0
     * 2) But if someone is waiting, run until snext's credit is equal
3170
0
     *    to his.
3171
0
     * 3) But, if we are capped, never run more than our budget.
3172
0
     * 4) And never run longer than MAX_TIMER or shorter than MIN_TIMER or
3173
0
     *    the ratelimit time.
3174
0
     */
3175
0
3176
0
    /* Calculate mintime */
3177
0
    min_time = CSCHED2_MIN_TIMER;
3178
0
    if ( prv->ratelimit_us )
3179
0
    {
3180
0
        s_time_t ratelimit_min = MICROSECS(prv->ratelimit_us);
3181
0
        if ( snext->vcpu->is_running )
3182
0
            ratelimit_min = snext->vcpu->runstate.state_entry_time +
3183
0
                            MICROSECS(prv->ratelimit_us) - now;
3184
0
        if ( ratelimit_min > min_time )
3185
0
            min_time = ratelimit_min;
3186
0
    }
3187
0
3188
0
    /* 1) Run until snext's credit will be 0. */
3189
0
    rt_credit = snext->credit;
3190
0
3191
0
    /*
3192
0
     * 2) If there's someone waiting whose credit is positive,
3193
0
     *    run until your credit ~= his.
3194
0
     */
3195
0
    if ( ! list_empty(runq) )
3196
0
    {
3197
0
        struct csched2_vcpu *swait = runq_elem(runq->next);
3198
0
3199
0
        if ( ! is_idle_vcpu(swait->vcpu)
3200
0
             && swait->credit > 0 )
3201
0
        {
3202
0
            rt_credit = snext->credit - swait->credit;
3203
0
        }
3204
0
    }
3205
0
3206
0
    /*
3207
0
     * The next guy on the runqueue may actually have a higher credit,
3208
0
     * if we've tried to avoid migrating him from a different cpu.
3209
0
     * Setting time=0 will ensure the minimum timeslice is chosen.
3210
0
     *
3211
0
     * FIXME: See if we can eliminate this conversion if we know time
3212
0
     * will be outside (MIN,MAX).  Probably requires pre-calculating
3213
0
     * credit values of MIN,MAX per vcpu, since each vcpu burns credit
3214
0
     * at a different rate.
3215
0
     */
3216
0
    if ( rt_credit > 0 )
3217
0
        time = c2t(rqd, rt_credit, snext);
3218
0
    else
3219
0
        time = 0;
3220
0
3221
0
    /*
3222
0
     * 3) But, if capped, never run more than our budget.
3223
0
     */
3224
0
    if ( has_cap(snext) )
3225
0
        time = snext->budget < time ? snext->budget : time;
3226
0
3227
0
    /*
3228
0
     * 4) And never run longer than MAX_TIMER or less than MIN_TIMER or
3229
0
     *    the rate_limit time.
3230
0
     */
3231
0
    if ( time < min_time )
3232
0
    {
3233
0
        time = min_time;
3234
0
        SCHED_STAT_CRANK(runtime_min_timer);
3235
0
    }
3236
0
    else if (time > CSCHED2_MAX_TIMER)
3237
0
    {
3238
0
        time = CSCHED2_MAX_TIMER;
3239
0
        SCHED_STAT_CRANK(runtime_max_timer);
3240
0
    }
3241
0
3242
0
    return time;
3243
0
}
3244
3245
/*
3246
 * Find a candidate.
3247
 */
3248
static struct csched2_vcpu *
3249
runq_candidate(struct csched2_runqueue_data *rqd,
3250
               struct csched2_vcpu *scurr,
3251
               int cpu, s_time_t now,
3252
               unsigned int *skipped)
3253
0
{
3254
0
    struct list_head *iter, *temp;
3255
0
    struct csched2_vcpu *snext = NULL;
3256
0
    struct csched2_private *prv = csched2_priv(per_cpu(scheduler, cpu));
3257
0
    bool yield = false, soft_aff_preempt = false;
3258
0
3259
0
    *skipped = 0;
3260
0
3261
0
    if ( unlikely(is_idle_vcpu(scurr->vcpu)) )
3262
0
    {
3263
0
        snext = scurr;
3264
0
        goto check_runq;
3265
0
    }
3266
0
3267
0
    yield = __test_and_clear_bit(__CSFLAG_vcpu_yield, &scurr->flags);
3268
0
3269
0
    /*
3270
0
     * Return the current vcpu if it has executed for less than ratelimit.
3271
0
     * Adjuststment for the selected vcpu's credit and decision
3272
0
     * for how long it will run will be taken in csched2_runtime.
3273
0
     *
3274
0
     * Note that, if scurr is yielding, we don't let rate limiting kick in.
3275
0
     * In fact, it may be the case that scurr is about to spin, and there's
3276
0
     * no point forcing it to do so until rate limiting expires.
3277
0
     */
3278
0
    if ( !yield && prv->ratelimit_us && vcpu_runnable(scurr->vcpu) &&
3279
0
         (now - scurr->vcpu->runstate.state_entry_time) <
3280
0
          MICROSECS(prv->ratelimit_us) )
3281
0
    {
3282
0
        if ( unlikely(tb_init_done) )
3283
0
        {
3284
0
            struct {
3285
0
                unsigned vcpu:16, dom:16;
3286
0
                unsigned runtime;
3287
0
            } d;
3288
0
            d.dom = scurr->vcpu->domain->domain_id;
3289
0
            d.vcpu = scurr->vcpu->vcpu_id;
3290
0
            d.runtime = now - scurr->vcpu->runstate.state_entry_time;
3291
0
            __trace_var(TRC_CSCHED2_RATELIMIT, 1,
3292
0
                        sizeof(d),
3293
0
                        (unsigned char *)&d);
3294
0
        }
3295
0
        return scurr;
3296
0
    }
3297
0
3298
0
    /* If scurr has a soft-affinity, let's check whether cpu is part of it */
3299
0
    if ( has_soft_affinity(scurr->vcpu, scurr->vcpu->cpu_hard_affinity) )
3300
0
    {
3301
0
        affinity_balance_cpumask(scurr->vcpu, BALANCE_SOFT_AFFINITY,
3302
0
                                 cpumask_scratch);
3303
0
        if ( unlikely(!cpumask_test_cpu(cpu, cpumask_scratch)) )
3304
0
        {
3305
0
            cpumask_t *online = cpupool_domain_cpumask(scurr->vcpu->domain);
3306
0
3307
0
            /* Ok, is any of the pcpus in scurr soft-affinity idle? */
3308
0
            cpumask_and(cpumask_scratch, cpumask_scratch, &rqd->idle);
3309
0
            cpumask_andnot(cpumask_scratch, cpumask_scratch, &rqd->tickled);
3310
0
            soft_aff_preempt = cpumask_intersects(cpumask_scratch, online);
3311
0
        }
3312
0
    }
3313
0
3314
0
    /*
3315
0
     * If scurr is runnable, and this cpu is in its soft-affinity, default to
3316
0
     * it. We also default to it, even if cpu is not in its soft-affinity, if
3317
0
     * there aren't any idle and not tickled cpu in its soft-affinity. In
3318
0
     * fact, we don't want to risk leaving scurr in the runq and this cpu idle
3319
0
     * only because scurr is running outside of its soft-affinity.
3320
0
     *
3321
0
     * On the other hand, if cpu is not in scurr's soft-affinity, and there
3322
0
     * looks to be better options, go for them. That happens by defaulting to
3323
0
     * idle here, which means scurr will be preempted, put back in runq, and
3324
0
     * one of those idle and not tickled cpus from its soft-affinity will be
3325
0
     * tickled to pick it up.
3326
0
     *
3327
0
     * Finally, if scurr does not have a valid soft-affinity, we also let it
3328
0
     * continue to run here (in fact, soft_aff_preempt will still be false,
3329
0
     * in this case).
3330
0
     *
3331
0
     * Of course, we also default to idle also if scurr is not runnable.
3332
0
     */
3333
0
    if ( vcpu_runnable(scurr->vcpu) && !soft_aff_preempt )
3334
0
        snext = scurr;
3335
0
    else
3336
0
        snext = csched2_vcpu(idle_vcpu[cpu]);
3337
0
3338
0
 check_runq:
3339
0
    list_for_each_safe( iter, temp, &rqd->runq )
3340
0
    {
3341
0
        struct csched2_vcpu * svc = list_entry(iter, struct csched2_vcpu, runq_elem);
3342
0
3343
0
        if ( unlikely(tb_init_done) )
3344
0
        {
3345
0
            struct {
3346
0
                unsigned vcpu:16, dom:16;
3347
0
            } d;
3348
0
            d.dom = svc->vcpu->domain->domain_id;
3349
0
            d.vcpu = svc->vcpu->vcpu_id;
3350
0
            __trace_var(TRC_CSCHED2_RUNQ_CAND_CHECK, 1,
3351
0
                        sizeof(d),
3352
0
                        (unsigned char *)&d);
3353
0
        }
3354
0
3355
0
        /* Only consider vcpus that are allowed to run on this processor. */
3356
0
        if ( !cpumask_test_cpu(cpu, svc->vcpu->cpu_hard_affinity) )
3357
0
        {
3358
0
            (*skipped)++;
3359
0
            continue;
3360
0
        }
3361
0
3362
0
        /*
3363
0
         * If a vcpu is meant to be picked up by another processor, and such
3364
0
         * processor has not scheduled yet, leave it in the runqueue for him.
3365
0
         */
3366
0
        if ( svc->tickled_cpu != -1 && svc->tickled_cpu != cpu &&
3367
0
             cpumask_test_cpu(svc->tickled_cpu, &rqd->tickled) )
3368
0
        {
3369
0
            (*skipped)++;
3370
0
            SCHED_STAT_CRANK(deferred_to_tickled_cpu);
3371
0
            continue;
3372
0
        }
3373
0
3374
0
        /*
3375
0
         * If this is on a different processor, don't pull it unless
3376
0
         * its credit is at least CSCHED2_MIGRATE_RESIST higher.
3377
0
         */
3378
0
        if ( svc->vcpu->processor != cpu
3379
0
             && snext->credit + CSCHED2_MIGRATE_RESIST > svc->credit )
3380
0
        {
3381
0
            (*skipped)++;
3382
0
            SCHED_STAT_CRANK(migrate_resisted);
3383
0
            continue;
3384
0
        }
3385
0
3386
0
        /*
3387
0
         * If the one in the runqueue has more credit than current (or idle,
3388
0
         * if current is not runnable), or if current is yielding, and also
3389
0
         * if the one in runqueue either is not capped, or is capped but has
3390
0
         * some budget, then choose it.
3391
0
         */
3392
0
        if ( (yield || svc->credit > snext->credit) &&
3393
0
             (!has_cap(svc) || vcpu_grab_budget(svc)) )
3394
0
            snext = svc;
3395
0
3396
0
        /* In any case, if we got this far, break. */
3397
0
        break;
3398
0
    }
3399
0
3400
0
    if ( unlikely(tb_init_done) )
3401
0
    {
3402
0
        struct {
3403
0
            unsigned vcpu:16, dom:16;
3404
0
            unsigned tickled_cpu, skipped;
3405
0
            int credit;
3406
0
        } d;
3407
0
        d.dom = snext->vcpu->domain->domain_id;
3408
0
        d.vcpu = snext->vcpu->vcpu_id;
3409
0
        d.credit = snext->credit;
3410
0
        d.tickled_cpu = snext->tickled_cpu;
3411
0
        d.skipped = *skipped;
3412
0
        __trace_var(TRC_CSCHED2_RUNQ_CANDIDATE, 1,
3413
0
                    sizeof(d),
3414
0
                    (unsigned char *)&d);
3415
0
    }
3416
0
3417
0
    if ( unlikely(snext->tickled_cpu != -1 && snext->tickled_cpu != cpu) )
3418
0
        SCHED_STAT_CRANK(tickled_cpu_overridden);
3419
0
3420
0
    /*
3421
0
     * If snext is from a capped domain, it must have budget (or it
3422
0
     * wouldn't have been in the runq). If it is not, it'd be STIME_MAX,
3423
0
     * which still is >= 0.
3424
0
     */
3425
0
    ASSERT(snext->budget >= 0);
3426
0
3427
0
    return snext;
3428
0
}
3429
3430
/*
3431
 * This function is in the critical path. It is designed to be simple and
3432
 * fast for the common case.
3433
 */
3434
static struct task_slice
3435
csched2_schedule(
3436
    const struct scheduler *ops, s_time_t now, bool tasklet_work_scheduled)
3437
0
{
3438
0
    const int cpu = smp_processor_id();
3439
0
    struct csched2_runqueue_data *rqd;
3440
0
    struct csched2_vcpu * const scurr = csched2_vcpu(current);
3441
0
    struct csched2_vcpu *snext = NULL;
3442
0
    unsigned int skipped_vcpus = 0;
3443
0
    struct task_slice ret;
3444
0
    bool tickled;
3445
0
3446
0
    SCHED_STAT_CRANK(schedule);
3447
0
    CSCHED2_VCPU_CHECK(current);
3448
0
3449
0
    BUG_ON(!cpumask_test_cpu(cpu, &csched2_priv(ops)->initialized));
3450
0
3451
0
    rqd = c2rqd(ops, cpu);
3452
0
    BUG_ON(!cpumask_test_cpu(cpu, &rqd->active));
3453
0
3454
0
    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
3455
0
3456
0
    BUG_ON(!is_idle_vcpu(scurr->vcpu) && scurr->rqd != rqd);
3457
0
3458
0
    /* Clear "tickled" bit now that we've been scheduled */
3459
0
    tickled = cpumask_test_cpu(cpu, &rqd->tickled);
3460
0
    if ( tickled )
3461
0
    {
3462
0
        __cpumask_clear_cpu(cpu, &rqd->tickled);
3463
0
        cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled);
3464
0
        smt_idle_mask_set(cpu, cpumask_scratch, &rqd->smt_idle);
3465
0
    }
3466
0
3467
0
    if ( unlikely(tb_init_done) )
3468
0
    {
3469
0
        struct {
3470
0
            unsigned cpu:16, rq_id:16;
3471
0
            unsigned tasklet:8, idle:8, smt_idle:8, tickled:8;
3472
0
        } d;
3473
0
        d.cpu = cpu;
3474
0
        d.rq_id = c2r(cpu);
3475
0
        d.tasklet = tasklet_work_scheduled;
3476
0
        d.idle = is_idle_vcpu(current);
3477
0
        d.smt_idle = cpumask_test_cpu(cpu, &rqd->smt_idle);
3478
0
        d.tickled = tickled;
3479
0
        __trace_var(TRC_CSCHED2_SCHEDULE, 1,
3480
0
                    sizeof(d),
3481
0
                    (unsigned char *)&d);
3482
0
    }
3483
0
3484
0
    /* Update credits (and budget, if necessary). */
3485
0
    burn_credits(rqd, scurr, now);
3486
0
3487
0
    /*
3488
0
     *  Below 0, means that we are capped and we have overrun our  budget.
3489
0
     *  Let's try to get some more but, if we fail (e.g., because of the
3490
0
     *  other running vcpus), we will be parked.
3491
0
     */
3492
0
    if ( unlikely(scurr->budget <= 0) )
3493
0
        vcpu_grab_budget(scurr);
3494
0
3495
0
    /*
3496
0
     * Select next runnable local VCPU (ie top of local runq).
3497
0
     *
3498
0
     * If the current vcpu is runnable, and has higher credit than
3499
0
     * the next guy on the queue (or there is noone else), we want to
3500
0
     * run him again.
3501
0
     *
3502
0
     * If there's tasklet work to do, we want to chose the idle vcpu
3503
0
     * for this processor, and mark the current for delayed runqueue
3504
0
     * add.
3505
0
     *
3506
0
     * If the current vcpu is runnable, and there's another runnable
3507
0
     * candidate, we want to mark current for delayed runqueue add,
3508
0
     * and remove the next guy from the queue.
3509
0
     *
3510
0
     * If the current vcpu is not runnable, we want to chose the idle
3511
0
     * vcpu for this processor.
3512
0
     */
3513
0
    if ( tasklet_work_scheduled )
3514
0
    {
3515
0
        __clear_bit(__CSFLAG_vcpu_yield, &scurr->flags);
3516
0
        trace_var(TRC_CSCHED2_SCHED_TASKLET, 1, 0, NULL);
3517
0
        snext = csched2_vcpu(idle_vcpu[cpu]);
3518
0
    }
3519
0
    else
3520
0
        snext = runq_candidate(rqd, scurr, cpu, now, &skipped_vcpus);
3521
0
3522
0
    /* If switching from a non-idle runnable vcpu, put it
3523
0
     * back on the runqueue. */
3524
0
    if ( snext != scurr
3525
0
         && !is_idle_vcpu(scurr->vcpu)
3526
0
         && vcpu_runnable(current) )
3527
0
        __set_bit(__CSFLAG_delayed_runq_add, &scurr->flags);
3528
0
3529
0
    ret.migrated = 0;
3530
0
3531
0
    /* Accounting for non-idle tasks */
3532
0
    if ( !is_idle_vcpu(snext->vcpu) )
3533
0
    {
3534
0
        /* If switching, remove this from the runqueue and mark it scheduled */
3535
0
        if ( snext != scurr )
3536
0
        {
3537
0
            ASSERT(snext->rqd == rqd);
3538
0
            ASSERT(!snext->vcpu->is_running);
3539
0
3540
0
            runq_remove(snext);
3541
0
            __set_bit(__CSFLAG_scheduled, &snext->flags);
3542
0
        }
3543
0
3544
0
        /*
3545
0
         * The reset condition is "has a scheduler epoch come to an end?".
3546
0
         * The way this is enforced is checking whether the vcpu at the top
3547
0
         * of the runqueue has negative credits. This means the epochs have
3548
0
         * variable length, as in one epoch expores when:
3549
0
         *  1) the vcpu at the top of the runqueue has executed for
3550
0
         *     around 10 ms (with default parameters);
3551
0
         *  2) no other vcpu with higher credits wants to run.
3552
0
         *
3553
0
         * Here, where we want to check for reset, we need to make sure the
3554
0
         * proper vcpu is being used. In fact, runqueue_candidate() may have
3555
0
         * not returned the first vcpu in the runqueue, for various reasons
3556
0
         * (e.g., affinity). Only trigger a reset when it does.
3557
0
         */
3558
0
        if ( skipped_vcpus == 0 && snext->credit <= CSCHED2_CREDIT_RESET )
3559
0
        {
3560
0
            reset_credit(ops, cpu, now, snext);
3561
0
            balance_load(ops, cpu, now);
3562
0
        }
3563
0
3564
0
        /* Clear the idle mask if necessary */
3565
0
        if ( cpumask_test_cpu(cpu, &rqd->idle) )
3566
0
        {
3567
0
            __cpumask_clear_cpu(cpu, &rqd->idle);
3568
0
            smt_idle_mask_clear(cpu, &rqd->smt_idle);
3569
0
        }
3570
0
3571
0
        snext->start_time = now;
3572
0
        snext->tickled_cpu = -1;
3573
0
3574
0
        /* Safe because lock for old processor is held */
3575
0
        if ( snext->vcpu->processor != cpu )
3576
0
        {
3577
0
            snext->credit += CSCHED2_MIGRATE_COMPENSATION;
3578
0
            snext->vcpu->processor = cpu;
3579
0
            SCHED_STAT_CRANK(migrated);
3580
0
            ret.migrated = 1;
3581
0
        }
3582
0
    }
3583
0
    else
3584
0
    {
3585
0
        /*
3586
0
         * Update the idle mask if necessary. Note that, if we're scheduling
3587
0
         * idle in order to carry on some tasklet work, we want to play busy!
3588
0
         */
3589
0
        if ( tasklet_work_scheduled )
3590
0
        {
3591
0
            if ( cpumask_test_cpu(cpu, &rqd->idle) )
3592
0
            {
3593
0
                __cpumask_clear_cpu(cpu, &rqd->idle);
3594
0
                smt_idle_mask_clear(cpu, &rqd->smt_idle);
3595
0
            }
3596
0
        }
3597
0
        else if ( !cpumask_test_cpu(cpu, &rqd->idle) )
3598
0
        {
3599
0
            __cpumask_set_cpu(cpu, &rqd->idle);
3600
0
            cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled);
3601
0
            smt_idle_mask_set(cpu, cpumask_scratch, &rqd->smt_idle);
3602
0
        }
3603
0
        /* Make sure avgload gets updated periodically even
3604
0
         * if there's no activity */
3605
0
        update_load(ops, rqd, NULL, 0, now);
3606
0
    }
3607
0
3608
0
    /*
3609
0
     * Return task to run next...
3610
0
     */
3611
0
    ret.time = csched2_runtime(ops, cpu, snext, now);
3612
0
    ret.task = snext->vcpu;
3613
0
3614
0
    CSCHED2_VCPU_CHECK(ret.task);
3615
0
    return ret;
3616
0
}
3617
3618
static void
3619
csched2_dump_vcpu(struct csched2_private *prv, struct csched2_vcpu *svc)
3620
0
{
3621
0
    printk("[%i.%i] flags=%x cpu=%i",
3622
0
            svc->vcpu->domain->domain_id,
3623
0
            svc->vcpu->vcpu_id,
3624
0
            svc->flags,
3625
0
            svc->vcpu->processor);
3626
0
3627
0
    printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight);
3628
0
3629
0
    if ( has_cap(svc) )
3630
0
        printk(" budget=%"PRI_stime"(%"PRI_stime")",
3631
0
               svc->budget, svc->budget_quota);
3632
0
3633
0
    printk(" load=%"PRI_stime" (~%"PRI_stime"%%)", svc->avgload,
3634
0
           (svc->avgload * 100) >> prv->load_precision_shift);
3635
0
3636
0
    printk("\n");
3637
0
}
3638
3639
static inline void
3640
dump_pcpu(const struct scheduler *ops, int cpu)
3641
0
{
3642
0
    struct csched2_private *prv = csched2_priv(ops);
3643
0
    struct csched2_vcpu *svc;
3644
0
#define cpustr keyhandler_scratch
3645
0
3646
0
    cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cpu));
3647
0
    printk("CPU[%02d] runq=%d, sibling=%s, ", cpu, c2r(cpu), cpustr);
3648
0
    cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu));
3649
0
    printk("core=%s\n", cpustr);
3650
0
3651
0
    /* current VCPU (nothing to say if that's the idle vcpu) */
3652
0
    svc = csched2_vcpu(curr_on_cpu(cpu));
3653
0
    if ( svc && !is_idle_vcpu(svc->vcpu) )
3654
0
    {
3655
0
        printk("\trun: ");
3656
0
        csched2_dump_vcpu(prv, svc);
3657
0
    }
3658
0
#undef cpustr
3659
0
}
3660
3661
static void
3662
csched2_dump(const struct scheduler *ops)
3663
0
{
3664
0
    struct list_head *iter_sdom;
3665
0
    struct csched2_private *prv = csched2_priv(ops);
3666
0
    unsigned long flags;
3667
0
    unsigned int i, j, loop;
3668
0
#define cpustr keyhandler_scratch
3669
0
3670
0
    /*
3671
0
     * We need the private scheduler lock as we access global
3672
0
     * scheduler data and (below) the list of active domains.
3673
0
     */
3674
0
    read_lock_irqsave(&prv->lock, flags);
3675
0
3676
0
    printk("Active queues: %d\n"
3677
0
           "\tdefault-weight     = %d\n",
3678
0
           cpumask_weight(&prv->active_queues),
3679
0
           CSCHED2_DEFAULT_WEIGHT);
3680
0
    for_each_cpu(i, &prv->active_queues)
3681
0
    {
3682
0
        s_time_t fraction;
3683
0
3684
0
        fraction = (prv->rqd[i].avgload * 100) >> prv->load_precision_shift;
3685
0
3686
0
        cpulist_scnprintf(cpustr, sizeof(cpustr), &prv->rqd[i].active);
3687
0
        printk("Runqueue %d:\n"
3688
0
               "\tncpus              = %u\n"
3689
0
               "\tcpus               = %s\n"
3690
0
               "\tmax_weight         = %u\n"
3691
0
               "\tpick_bias          = %u\n"
3692
0
               "\tinstload           = %d\n"
3693
0
               "\taveload            = %"PRI_stime" (~%"PRI_stime"%%)\n",
3694
0
               i,
3695
0
               cpumask_weight(&prv->rqd[i].active),
3696
0
               cpustr,
3697
0
               prv->rqd[i].max_weight,
3698
0
               prv->rqd[i].pick_bias,
3699
0
               prv->rqd[i].load,
3700
0
               prv->rqd[i].avgload,
3701
0
               fraction);
3702
0
3703
0
        cpumask_scnprintf(cpustr, sizeof(cpustr), &prv->rqd[i].idle);
3704
0
        printk("\tidlers: %s\n", cpustr);
3705
0
        cpumask_scnprintf(cpustr, sizeof(cpustr), &prv->rqd[i].tickled);
3706
0
        printk("\ttickled: %s\n", cpustr);
3707
0
        cpumask_scnprintf(cpustr, sizeof(cpustr), &prv->rqd[i].smt_idle);
3708
0
        printk("\tfully idle cores: %s\n", cpustr);
3709
0
    }
3710
0
3711
0
    printk("Domain info:\n");
3712
0
    loop = 0;
3713
0
    list_for_each( iter_sdom, &prv->sdom )
3714
0
    {
3715
0
        struct csched2_dom *sdom;
3716
0
        struct vcpu *v;
3717
0
3718
0
        sdom = list_entry(iter_sdom, struct csched2_dom, sdom_elem);
3719
0
3720
0
        printk("\tDomain: %d w %d c %u v %d\n",
3721
0
               sdom->dom->domain_id,
3722
0
               sdom->weight,
3723
0
               sdom->cap,
3724
0
               sdom->nr_vcpus);
3725
0
3726
0
        for_each_vcpu( sdom->dom, v )
3727
0
        {
3728
0
            struct csched2_vcpu * const svc = csched2_vcpu(v);
3729
0
            spinlock_t *lock;
3730
0
3731
0
            lock = vcpu_schedule_lock(svc->vcpu);
3732
0
3733
0
            printk("\t%3d: ", ++loop);
3734
0
            csched2_dump_vcpu(prv, svc);
3735
0
3736
0
            vcpu_schedule_unlock(lock, svc->vcpu);
3737
0
        }
3738
0
    }
3739
0
3740
0
    for_each_cpu(i, &prv->active_queues)
3741
0
    {
3742
0
        struct csched2_runqueue_data *rqd = prv->rqd + i;
3743
0
        struct list_head *iter, *runq = &rqd->runq;
3744
0
        int loop = 0;
3745
0
3746
0
        /* We need the lock to scan the runqueue. */
3747
0
        spin_lock(&rqd->lock);
3748
0
3749
0
        printk("Runqueue %d:\n", i);
3750
0
3751
0
        for_each_cpu(j, &rqd->active)
3752
0
            dump_pcpu(ops, j);
3753
0
3754
0
        printk("RUNQ:\n");
3755
0
        list_for_each( iter, runq )
3756
0
        {
3757
0
            struct csched2_vcpu *svc = runq_elem(iter);
3758
0
3759
0
            if ( svc )
3760
0
            {
3761
0
                printk("\t%3d: ", loop++);
3762
0
                csched2_dump_vcpu(prv, svc);
3763
0
            }
3764
0
        }
3765
0
        spin_unlock(&rqd->lock);
3766
0
    }
3767
0
3768
0
    read_unlock_irqrestore(&prv->lock, flags);
3769
0
#undef cpustr
3770
0
}
3771
3772
/* Returns the ID of the runqueue the cpu is assigned to. */
3773
static unsigned
3774
init_pdata(struct csched2_private *prv, unsigned int cpu)
3775
0
{
3776
0
    unsigned rqi;
3777
0
    struct csched2_runqueue_data *rqd;
3778
0
3779
0
    ASSERT(rw_is_write_locked(&prv->lock));
3780
0
    ASSERT(!cpumask_test_cpu(cpu, &prv->initialized));
3781
0
3782
0
    /* Figure out which runqueue to put it in */
3783
0
    rqi = cpu_to_runqueue(prv, cpu);
3784
0
3785
0
    rqd = prv->rqd + rqi;
3786
0
3787
0
    printk(XENLOG_INFO "Adding cpu %d to runqueue %d\n", cpu, rqi);
3788
0
    if ( ! cpumask_test_cpu(rqi, &prv->active_queues) )
3789
0
    {
3790
0
        printk(XENLOG_INFO " First cpu on runqueue, activating\n");
3791
0
        activate_runqueue(prv, rqi);
3792
0
    }
3793
0
    
3794
0
    /* Set the runqueue map */
3795
0
    per_cpu(runq_map, cpu) = rqi;
3796
0
    
3797
0
    __cpumask_set_cpu(cpu, &rqd->idle);
3798
0
    __cpumask_set_cpu(cpu, &rqd->active);
3799
0
    __cpumask_set_cpu(cpu, &prv->initialized);
3800
0
    __cpumask_set_cpu(cpu, &rqd->smt_idle);
3801
0
3802
0
    if ( cpumask_weight(&rqd->active) == 1 )
3803
0
        rqd->pick_bias = cpu;
3804
0
3805
0
    return rqi;
3806
0
}
3807
3808
static void
3809
csched2_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
3810
0
{
3811
0
    struct csched2_private *prv = csched2_priv(ops);
3812
0
    spinlock_t *old_lock;
3813
0
    unsigned long flags;
3814
0
    unsigned rqi;
3815
0
3816
0
    /*
3817
0
     * pdata contains what alloc_pdata returned. But since we don't (need to)
3818
0
     * implement alloc_pdata, either that's NULL, or something is very wrong!
3819
0
     */
3820
0
    ASSERT(!pdata);
3821
0
3822
0
    write_lock_irqsave(&prv->lock, flags);
3823
0
    old_lock = pcpu_schedule_lock(cpu);
3824
0
3825
0
    rqi = init_pdata(prv, cpu);
3826
0
    /* Move the scheduler lock to the new runq lock. */
3827
0
    per_cpu(schedule_data, cpu).schedule_lock = &prv->rqd[rqi].lock;
3828
0
3829
0
    /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */
3830
0
    spin_unlock(old_lock);
3831
0
    write_unlock_irqrestore(&prv->lock, flags);
3832
0
}
3833
3834
/* Change the scheduler of cpu to us (Credit2). */
3835
static void
3836
csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu,
3837
                     void *pdata, void *vdata)
3838
0
{
3839
0
    struct csched2_private *prv = csched2_priv(new_ops);
3840
0
    struct csched2_vcpu *svc = vdata;
3841
0
    unsigned rqi;
3842
0
3843
0
    ASSERT(!pdata && svc && is_idle_vcpu(svc->vcpu));
3844
0
3845
0
    /*
3846
0
     * We own one runqueue lock already (from schedule_cpu_switch()). This
3847
0
     * looks like it violates this scheduler's locking rules, but it does
3848
0
     * not, as what we own is the lock of another scheduler, that hence has
3849
0
     * no particular (ordering) relationship with our private global lock.
3850
0
     * And owning exactly that one (the lock of the old scheduler of this
3851
0
     * cpu) is what is necessary to prevent races.
3852
0
     */
3853
0
    ASSERT(!local_irq_is_enabled());
3854
0
    write_lock(&prv->lock);
3855
0
3856
0
    idle_vcpu[cpu]->sched_priv = vdata;
3857
0
3858
0
    rqi = init_pdata(prv, cpu);
3859
0
3860
0
    /*
3861
0
     * Now that we know what runqueue we'll go in, double check what's said
3862
0
     * above: the lock we already hold is not the one of this runqueue of
3863
0
     * this scheduler, and so it's safe to have taken it /before/ our
3864
0
     * private global lock.
3865
0
     */
3866
0
    ASSERT(per_cpu(schedule_data, cpu).schedule_lock != &prv->rqd[rqi].lock);
3867
0
3868
0
    per_cpu(scheduler, cpu) = new_ops;
3869
0
    per_cpu(schedule_data, cpu).sched_priv = NULL; /* no pdata */
3870
0
3871
0
    /*
3872
0
     * (Re?)route the lock to the per pCPU lock as /last/ thing. In fact,
3873
0
     * if it is free (and it can be) we want that anyone that manages
3874
0
     * taking it, find all the initializations we've done above in place.
3875
0
     */
3876
0
    smp_mb();
3877
0
    per_cpu(schedule_data, cpu).schedule_lock = &prv->rqd[rqi].lock;
3878
0
3879
0
    write_unlock(&prv->lock);
3880
0
}
3881
3882
static void
3883
csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
3884
0
{
3885
0
    unsigned long flags;
3886
0
    struct csched2_private *prv = csched2_priv(ops);
3887
0
    struct csched2_runqueue_data *rqd;
3888
0
    int rqi;
3889
0
3890
0
    write_lock_irqsave(&prv->lock, flags);
3891
0
3892
0
    /*
3893
0
     * alloc_pdata is not implemented, so pcpu must be NULL. On the other
3894
0
     * hand, init_pdata must have been called for this pCPU.
3895
0
     */
3896
0
    ASSERT(!pcpu && cpumask_test_cpu(cpu, &prv->initialized));
3897
0
    
3898
0
    /* Find the old runqueue and remove this cpu from it */
3899
0
    rqi = per_cpu(runq_map, cpu);
3900
0
3901
0
    rqd = prv->rqd + rqi;
3902
0
3903
0
    /* No need to save IRQs here, they're already disabled */
3904
0
    spin_lock(&rqd->lock);
3905
0
3906
0
    printk(XENLOG_INFO "Removing cpu %d from runqueue %d\n", cpu, rqi);
3907
0
3908
0
    __cpumask_clear_cpu(cpu, &rqd->idle);
3909
0
    __cpumask_clear_cpu(cpu, &rqd->smt_idle);
3910
0
    __cpumask_clear_cpu(cpu, &rqd->active);
3911
0
3912
0
    if ( cpumask_empty(&rqd->active) )
3913
0
    {
3914
0
        printk(XENLOG_INFO " No cpus left on runqueue, disabling\n");
3915
0
        deactivate_runqueue(prv, rqi);
3916
0
    }
3917
0
    else if ( rqd->pick_bias == cpu )
3918
0
        rqd->pick_bias = cpumask_first(&rqd->active);
3919
0
3920
0
    per_cpu(runq_map, cpu) = -1;
3921
0
3922
0
    spin_unlock(&rqd->lock);
3923
0
3924
0
    __cpumask_clear_cpu(cpu, &prv->initialized);
3925
0
3926
0
    write_unlock_irqrestore(&prv->lock, flags);
3927
0
3928
0
    return;
3929
0
}
3930
3931
static int
3932
csched2_init(struct scheduler *ops)
3933
0
{
3934
0
    int i;
3935
0
    struct csched2_private *prv;
3936
0
3937
0
    printk("Initializing Credit2 scheduler\n");
3938
0
3939
0
    printk(XENLOG_INFO " load_precision_shift: %d\n"
3940
0
           XENLOG_INFO " load_window_shift: %d\n"
3941
0
           XENLOG_INFO " underload_balance_tolerance: %d\n"
3942
0
           XENLOG_INFO " overload_balance_tolerance: %d\n"
3943
0
           XENLOG_INFO " runqueues arrangement: %s\n"
3944
0
           XENLOG_INFO " cap enforcement granularity: %dms\n",
3945
0
           opt_load_precision_shift,
3946
0
           opt_load_window_shift,
3947
0
           opt_underload_balance_tolerance,
3948
0
           opt_overload_balance_tolerance,
3949
0
           opt_runqueue_str[opt_runqueue],
3950
0
           opt_cap_period);
3951
0
3952
0
    if ( opt_load_precision_shift < LOADAVG_PRECISION_SHIFT_MIN )
3953
0
    {
3954
0
        printk("WARNING: %s: opt_load_precision_shift %d below min %d, resetting\n",
3955
0
               __func__, opt_load_precision_shift, LOADAVG_PRECISION_SHIFT_MIN);
3956
0
        opt_load_precision_shift = LOADAVG_PRECISION_SHIFT_MIN;
3957
0
    }
3958
0
3959
0
    if ( opt_load_window_shift <= LOADAVG_GRANULARITY_SHIFT )
3960
0
    {
3961
0
        printk("WARNING: %s: opt_load_window_shift %d too short, resetting\n",
3962
0
               __func__, opt_load_window_shift);
3963
0
        opt_load_window_shift = LOADAVG_WINDOW_SHIFT;
3964
0
    }
3965
0
    printk(XENLOG_INFO "load tracking window length %llu ns\n",
3966
0
           1ULL << opt_load_window_shift);
3967
0
3968
0
    if ( CSCHED2_BDGT_REPL_PERIOD < CSCHED2_MIN_TIMER )
3969
0
    {
3970
0
        printk("WARNING: %s: opt_cap_period %d too small, resetting\n",
3971
0
               __func__, opt_cap_period);
3972
0
        opt_cap_period = 10; /* ms */
3973
0
    }
3974
0
3975
0
    /*
3976
0
     * Basically no CPU information is available at this point; just
3977
0
     * set up basic structures, and a callback when the CPU info is
3978
0
     * available.
3979
0
     */
3980
0
3981
0
    prv = xzalloc(struct csched2_private);
3982
0
    if ( prv == NULL )
3983
0
        return -ENOMEM;
3984
0
    ops->sched_data = prv;
3985
0
3986
0
    rwlock_init(&prv->lock);
3987
0
    INIT_LIST_HEAD(&prv->sdom);
3988
0
3989
0
    /* Allocate all runqueues and mark them as un-initialized */
3990
0
    prv->rqd = xzalloc_array(struct csched2_runqueue_data, nr_cpu_ids);
3991
0
    if ( !prv->rqd )
3992
0
    {
3993
0
        xfree(prv);
3994
0
        return -ENOMEM;
3995
0
    }
3996
0
    for ( i = 0; i < nr_cpu_ids; i++ )
3997
0
        prv->rqd[i].id = -1;
3998
0
3999
0
    /* initialize ratelimit */
4000
0
    prv->ratelimit_us = sched_ratelimit_us;
4001
0
4002
0
    prv->load_precision_shift = opt_load_precision_shift;
4003
0
    prv->load_window_shift = opt_load_window_shift - LOADAVG_GRANULARITY_SHIFT;
4004
0
    ASSERT(opt_load_window_shift > 0);
4005
0
4006
0
    return 0;
4007
0
}
4008
4009
static void
4010
csched2_deinit(struct scheduler *ops)
4011
0
{
4012
0
    struct csched2_private *prv;
4013
0
4014
0
    prv = csched2_priv(ops);
4015
0
    ops->sched_data = NULL;
4016
0
    xfree(prv);
4017
0
}
4018
4019
static const struct scheduler sched_credit2_def = {
4020
    .name           = "SMP Credit Scheduler rev2",
4021
    .opt_name       = "credit2",
4022
    .sched_id       = XEN_SCHEDULER_CREDIT2,
4023
    .sched_data     = NULL,
4024
4025
    .init_domain    = csched2_dom_init,
4026
    .destroy_domain = csched2_dom_destroy,
4027
4028
    .insert_vcpu    = csched2_vcpu_insert,
4029
    .remove_vcpu    = csched2_vcpu_remove,
4030
4031
    .sleep          = csched2_vcpu_sleep,
4032
    .wake           = csched2_vcpu_wake,
4033
    .yield          = csched2_vcpu_yield,
4034
4035
    .adjust         = csched2_dom_cntl,
4036
    .adjust_global  = csched2_sys_cntl,
4037
4038
    .pick_cpu       = csched2_cpu_pick,
4039
    .migrate        = csched2_vcpu_migrate,
4040
    .do_schedule    = csched2_schedule,
4041
    .context_saved  = csched2_context_saved,
4042
4043
    .dump_settings  = csched2_dump,
4044
    .init           = csched2_init,
4045
    .deinit         = csched2_deinit,
4046
    .alloc_vdata    = csched2_alloc_vdata,
4047
    .free_vdata     = csched2_free_vdata,
4048
    .init_pdata     = csched2_init_pdata,
4049
    .deinit_pdata   = csched2_deinit_pdata,
4050
    .switch_sched   = csched2_switch_sched,
4051
    .alloc_domdata  = csched2_alloc_domdata,
4052
    .free_domdata   = csched2_free_domdata,
4053
};
4054
4055
REGISTER_SCHEDULER(sched_credit2_def);