/root/src/xen/xen/common/sched_credit2.c
Line | Count | Source (jump to first uncovered line) |
1 | | |
2 | | /**************************************************************************** |
3 | | * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd |
4 | | **************************************************************************** |
5 | | * |
6 | | * File: common/sched_credit2.c |
7 | | * Author: George Dunlap |
8 | | * |
9 | | * Description: Credit-based SMP CPU scheduler |
10 | | * Based on an earlier version by Emmanuel Ackaouy. |
11 | | */ |
12 | | |
13 | | #include <xen/init.h> |
14 | | #include <xen/lib.h> |
15 | | #include <xen/sched.h> |
16 | | #include <xen/domain.h> |
17 | | #include <xen/delay.h> |
18 | | #include <xen/event.h> |
19 | | #include <xen/time.h> |
20 | | #include <xen/perfc.h> |
21 | | #include <xen/sched-if.h> |
22 | | #include <xen/softirq.h> |
23 | | #include <asm/div64.h> |
24 | | #include <xen/errno.h> |
25 | | #include <xen/trace.h> |
26 | | #include <xen/cpu.h> |
27 | | #include <xen/keyhandler.h> |
28 | | |
29 | | /* Meant only for helping developers during debugging. */ |
30 | | /* #define d2printk printk */ |
31 | | #define d2printk(x...) |
32 | | |
33 | | |
34 | | /* |
35 | | * Credit2 tracing events ("only" 512 available!). Check |
36 | | * include/public/trace.h for more details. |
37 | | */ |
38 | | #define TRC_CSCHED2_TICK TRC_SCHED_CLASS_EVT(CSCHED2, 1) |
39 | 0 | #define TRC_CSCHED2_RUNQ_POS TRC_SCHED_CLASS_EVT(CSCHED2, 2) |
40 | 0 | #define TRC_CSCHED2_CREDIT_BURN TRC_SCHED_CLASS_EVT(CSCHED2, 3) |
41 | | #define TRC_CSCHED2_CREDIT_ADD TRC_SCHED_CLASS_EVT(CSCHED2, 4) |
42 | 0 | #define TRC_CSCHED2_TICKLE_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 5) |
43 | 0 | #define TRC_CSCHED2_TICKLE TRC_SCHED_CLASS_EVT(CSCHED2, 6) |
44 | 0 | #define TRC_CSCHED2_CREDIT_RESET TRC_SCHED_CLASS_EVT(CSCHED2, 7) |
45 | 0 | #define TRC_CSCHED2_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED2, 8) |
46 | 0 | #define TRC_CSCHED2_UPDATE_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 9) |
47 | 0 | #define TRC_CSCHED2_RUNQ_ASSIGN TRC_SCHED_CLASS_EVT(CSCHED2, 10) |
48 | 0 | #define TRC_CSCHED2_UPDATE_VCPU_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 11) |
49 | 0 | #define TRC_CSCHED2_UPDATE_RUNQ_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 12) |
50 | 0 | #define TRC_CSCHED2_TICKLE_NEW TRC_SCHED_CLASS_EVT(CSCHED2, 13) |
51 | 0 | #define TRC_CSCHED2_RUNQ_MAX_WEIGHT TRC_SCHED_CLASS_EVT(CSCHED2, 14) |
52 | 0 | #define TRC_CSCHED2_MIGRATE TRC_SCHED_CLASS_EVT(CSCHED2, 15) |
53 | 0 | #define TRC_CSCHED2_LOAD_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 16) |
54 | 0 | #define TRC_CSCHED2_LOAD_BALANCE TRC_SCHED_CLASS_EVT(CSCHED2, 17) |
55 | 0 | #define TRC_CSCHED2_PICKED_CPU TRC_SCHED_CLASS_EVT(CSCHED2, 19) |
56 | 0 | #define TRC_CSCHED2_RUNQ_CANDIDATE TRC_SCHED_CLASS_EVT(CSCHED2, 20) |
57 | 0 | #define TRC_CSCHED2_SCHEDULE TRC_SCHED_CLASS_EVT(CSCHED2, 21) |
58 | 0 | #define TRC_CSCHED2_RATELIMIT TRC_SCHED_CLASS_EVT(CSCHED2, 22) |
59 | 0 | #define TRC_CSCHED2_RUNQ_CAND_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 23) |
60 | | |
61 | | /* |
62 | | * WARNING: This is still in an experimental phase. Status and work can be found at the |
63 | | * credit2 wiki page: |
64 | | * http://wiki.xen.org/wiki/Credit2_Scheduler_Development |
65 | | * |
66 | | * TODO: |
67 | | * + Hyperthreading |
68 | | * - Look for non-busy core if possible |
69 | | * - "Discount" time run on a thread with busy siblings |
70 | | * + Algorithm: |
71 | | * - "Mixed work" problem: if a VM is playing audio (5%) but also burning cpu (e.g., |
72 | | * a flash animation in the background), can we schedule it with low enough latency |
73 | | * so that audio doesn't skip? |
74 | | * - Cap and reservation: How to implement with the current system? |
75 | | * + Optimizing |
76 | | * - Profiling, making new algorithms, making math more efficient (no long division) |
77 | | */ |
78 | | |
79 | | /* |
80 | | * Design: |
81 | | * |
82 | | * VMs "burn" credits based on their weight; higher weight means |
83 | | * credits burn more slowly. The highest weight vcpu burns credits at |
84 | | * a rate of 1 credit per nanosecond. Others burn proportionally |
85 | | * more. |
86 | | * |
87 | | * vcpus are inserted into the runqueue by credit order. |
88 | | * |
89 | | * Credits are "reset" when the next vcpu in the runqueue is less than |
90 | | * or equal to zero. At that point, everyone's credits are "clipped" |
91 | | * to a small value, and a fixed credit is added to everyone. |
92 | | */ |
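To make the reset rule above concrete, here is a minimal sketch (not the actual reset routine of this file, which lives further down): credits are clipped to a small carryover and then topped up by a fixed amount, using the CSCHED2_CARRYOVER_MAX and CSCHED2_CREDIT_INIT constants defined below. The function name and its list parameter are hypothetical.

    /* Illustrative sketch only -- not the scheduler's real reset path. */
    static void sketch_reset_credits(struct list_head *vcpus)
    {
        struct csched2_vcpu *svc;

        list_for_each_entry ( svc, vcpus, rqd_elem )
        {
            s_time_t credit = svc->credit;

            /* Clip whatever is left to a small carryover... */
            if ( credit > CSCHED2_CARRYOVER_MAX )
                credit = CSCHED2_CARRYOVER_MAX;

            /* ...and add the fixed per-reset amount for everyone. */
            svc->credit = credit + CSCHED2_CREDIT_INIT;
        }
    }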
93 | | |
94 | | /* |
95 | | * Utilization cap: |
96 | | * |
97 | | * Setting a pCPU utilization cap for a domain means the following: |
98 | | * |
99 | | * - a domain can have a cap, expressed in terms of % of physical CPU time. |
100 | | * A domain that must not use more than 1/4 of _one_ physical CPU will |
101 | | * be given a cap of 25%; a domain that must not use more than 1+1/2 of |
102 | | * physical CPU time will be given a cap of 150%; |
103 | | * |
104 | | * - caps are per-domain (not per-vCPU). If a domain has only 1 vCPU, and |
105 | | * a 40% cap, that one vCPU will use 40% of one pCPU. If a domain has 4 |
106 | | * vCPUs, and a 200% cap, the equivalent of 100% time on 2 pCPUs will be |
107 | | * split among the 4 vCPUs. How much each of the vCPUs will actually get, |
108 | | * during any given interval of time, is unspecified (as it depends on |
109 | | * various aspects: workload, system load, etc.). For instance, it is |
110 | | * possible that, during a given time interval, 2 vCPUs use 100% each, |
111 | | * and the other two use nothing; while during another time interval, |
112 | | * two vCPUs use 80%, one uses 10% and the other 30%; or that each use |
113 | | * 50% (and so on and so forth). |
114 | | * |
115 | | * For implementing this, we use the following approach: |
116 | | * |
117 | | * - each domain is given a 'budget', and each domain has a timer, which |
118 | | * replenishes the domain's budget periodically. The budget is the amount |
119 | | * of time the vCPUs of the domain can use every 'period'; |
120 | | * |
121 | | * - the period is CSCHED2_BDGT_REPL_PERIOD, and is the same for all domains |
122 | | * (but each domain has its own timer; so they are all periodic with the same |
123 | | * period, but the replenishments of the budgets of the various domains, at |
124 | | * period boundaries, are not synchronous); |
125 | | * |
126 | | * - when vCPUs run, they consume budget. When they don't run, they don't |
127 | | * consume budget. If there is no budget left for the domain, no vCPU of |
128 | | * that domain can run. If a vCPU tries to run and finds that there is no |
129 | | * budget, it blocks. |
130 | | * At whatever time a vCPU wants to run, it must check the domain's budget, |
131 | | * and if there is some, it can use it. |
132 | | * |
133 | | * - budget is replenished to the top of the capacity for the domain once |
134 | | * per period. Even if there was some leftover budget from the previous period, |
135 | | * though, the budget after a replenishment will always be at most equal |
136 | | * to the total capacity of the domain ('tot_budget'); |
137 | | * |
138 | | * - when a budget replenishment occurs, if there are vCPUs that had been |
139 | | * blocked because of lack of budget, they'll be unblocked, and they will |
140 | | * (potentially) be able to run again. |
141 | | * |
142 | | * Finally, some even more implementation-related details: |
143 | | * |
144 | | * - budget is stored in a domain-wide pool. vCPUs of the domain that want |
145 | | * to run go to that pool, and grab some. When they do so, the amount |
146 | | * they grabbed is _immediately_ removed from the pool. This happens in |
147 | | * vcpu_grab_budget(); |
148 | | * |
149 | | * - when vCPUs stop running, if they've not consumed all the budget they |
150 | | * took, the leftover is put back in the pool. This happens in |
151 | | * vcpu_return_budget(); |
152 | | * |
153 | | * - the above means that a vCPU can find out that there is no budget and |
154 | | * block, not only if the cap has actually been reached (for this period), |
155 | | * but also if some other vCPUs, in order to run, have grabbed a certain |
156 | | * quota of budget, no matter whether they've already used it all or not. |
157 | | * A vCPU blocking because of (any form of) lack of budget is said to be |
158 | | * "parked", and such blocking happens in park_vcpu(); |
159 | | * |
160 | | * - when a vCPU stops running, and puts back some budget in the domain pool, |
161 | | * we need to check whether there is someone that has been parked and that |
162 | | * can be unparked. This happens in unpark_parked_vcpus(), called from |
163 | | * csched2_context_saved(); |
164 | | * |
165 | | * - of course, unparking happens also as a consequence of the domain's budget |
166 | | * being replenished by the periodic timer. This also occurs by means of |
167 | | * calling csched2_context_saved() (but from replenish_domain_budget()); |
168 | | * |
169 | | * - parked vCPUs of a domain are kept in a (per-domain) list, called |
170 | | * 'parked_vcpus'. Manipulation of the list and of the domain-wide budget |
171 | | * pool must occur only when holding the 'budget_lock'. |
172 | | */ |
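A minimal sketch of the pool accounting just described, assuming the csched2_dom and csched2_vcpu fields defined later in this file; the real vcpu_grab_budget()/vcpu_return_budget() are not part of this excerpt and handle more cases (quota sizing, parking), so the helpers below are hypothetical.

    /* Illustrative sketch of grabbing budget from the domain-wide pool. */
    static bool sketch_grab_budget(struct csched2_dom *sdom,
                                   struct csched2_vcpu *svc, s_time_t quota)
    {
        bool got_budget;

        spin_lock(&sdom->budget_lock);
        got_budget = sdom->budget > 0;
        if ( got_budget )
        {
            /* Take (at most) quota out of the pool, _immediately_. */
            svc->budget = min(quota, sdom->budget);
            sdom->budget -= svc->budget;
        }
        spin_unlock(&sdom->budget_lock);

        /* If nothing could be taken, the vCPU would be "parked" instead. */
        return got_budget;
    }

    /* Illustrative sketch of returning unused budget to the pool. */
    static void sketch_return_budget(struct csched2_dom *sdom,
                                     struct csched2_vcpu *svc)
    {
        spin_lock(&sdom->budget_lock);
        sdom->budget += svc->budget;
        svc->budget = 0;
        spin_unlock(&sdom->budget_lock);
    }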
173 | | |
174 | | /* |
175 | | * Locking: |
176 | | * |
177 | | * - runqueue lock |
178 | | * + it is per-runqueue, so: |
179 | | * * cpus in a runqueue take the runqueue lock, when using |
180 | | * pcpu_schedule_lock() / vcpu_schedule_lock() (and friends), |
181 | | * * a cpu may (try to) take a "remote" runqueue lock, e.g., for |
182 | | * load balancing; |
183 | | * + serializes runqueue operations (removing and inserting vcpus); |
184 | | * + protects runqueue-wide data in csched2_runqueue_data; |
185 | | * + protects vcpu parameters in csched2_vcpu for the vcpu in the |
186 | | * runqueue. |
187 | | * |
188 | | * - Private scheduler lock |
189 | | * + protects scheduler-wide data in csched2_private, such as: |
190 | | * * the list of domains active in this scheduler, |
191 | | * * what cpus and what runqueues are active and in what |
192 | | * runqueue each cpu is; |
193 | | * + serializes the operation of changing the weights of domains; |
194 | | * |
195 | | * - Budget lock |
196 | | * + it is per-domain; |
197 | | * + protects, in domains that have a utilization cap: |
198 | | * * manipulation of the total budget of the domain (as it is shared |
199 | | * among all vCPUs of the domain), |
200 | | * * manipulation of the list of vCPUs that are blocked waiting for |
201 | | * some budget to be available. |
202 | | * |
203 | | * - Type: |
204 | | * + runqueue locks are 'regular' spinlocks; |
205 | | * + the private scheduler lock can be an rwlock. In fact, the data |
206 | | * it protects is modified only during initialization, cpupool |
207 | | * manipulation and when changing weights, and read in all |
208 | | * other cases (e.g., during load balancing); |
209 | | * + budget locks are 'regular' spinlocks. |
210 | | * |
211 | | * Ordering: |
212 | | * + trylock must be used when wanting to take a runqueue lock, |
213 | | * if we already hold another one; |
214 | | * + if taking both a runqueue lock and the private scheduler |
215 | | * lock, the latter must always be taken first; |
216 | | * + if taking both a runqueue lock and a budget lock, the former |
217 | | * must always be taken first. |
218 | | */ |
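A sketch of the resulting nesting order, using the locks of the structures defined below; the function itself is hypothetical and exists only to show which lock is taken first.

    /* Illustrative sketch of the lock ordering described above. */
    static void sketch_lock_ordering(struct csched2_private *prv,
                                     struct csched2_runqueue_data *rqd,
                                     struct csched2_dom *sdom)
    {
        /* Private scheduler lock first (read mode suffices on most paths)... */
        read_lock(&prv->lock);
        /* ...then a runqueue lock... */
        spin_lock(&rqd->lock);
        /* ...and, if needed, a budget lock last. */
        spin_lock(&sdom->budget_lock);

        /* Release in the opposite order. */
        spin_unlock(&sdom->budget_lock);
        spin_unlock(&rqd->lock);
        read_unlock(&prv->lock);
    }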
219 | | |
220 | | /* |
221 | | * Basic constants |
222 | | */ |
223 | | /* Default weight: How much a new domain starts with. */ |
224 | 0 | #define CSCHED2_DEFAULT_WEIGHT 256 |
225 | | /* |
226 | | * Min timer: Minimum length a timer will be set, to |
227 | | * achieve efficiency. |
228 | | */ |
229 | 0 | #define CSCHED2_MIN_TIMER MICROSECS(500) |
230 | | /* |
231 | | * Amount of credit VMs begin with, and are reset to. |
232 | | * ATM, set so that highest-weight VMs can only run for 10ms |
233 | | * before a reset event. |
234 | | */ |
235 | 0 | #define CSCHED2_CREDIT_INIT MILLISECS(10) |
236 | | /* |
237 | | * Amount of credit the idle vcpus have. It never changes, as idle |
238 | | * vcpus do not consume credits, and it must be lower than whatever |
239 | | * amount of credit a 'regular' vcpu would end up with. |
240 | | */ |
241 | 0 | #define CSCHED2_IDLE_CREDIT (-(1U<<30)) |
242 | | /* |
243 | | * Carryover: How much "extra" credit may be carried over after |
244 | | * a reset. |
245 | | */ |
246 | 0 | #define CSCHED2_CARRYOVER_MAX CSCHED2_MIN_TIMER |
247 | | /* |
248 | | * Stickiness: Cross-L2 migration resistance. Should be less than |
249 | | * MIN_TIMER. |
250 | | */ |
251 | 0 | #define CSCHED2_MIGRATE_RESIST ((opt_migrate_resist)*MICROSECS(1)) |
252 | | /* How much to "compensate" a vcpu for L2 migration. */ |
253 | 0 | #define CSCHED2_MIGRATE_COMPENSATION MICROSECS(50) |
254 | | /* How tolerant we should be when peeking at runtime of vcpus on other cpus */ |
255 | 0 | #define CSCHED2_RATELIMIT_TICKLE_TOLERANCE MICROSECS(50) |
256 | | /* Reset: Value below which credit will be reset. */ |
257 | 0 | #define CSCHED2_CREDIT_RESET 0 |
258 | | /* Max timer: Maximum time a guest can be run for. */ |
259 | 0 | #define CSCHED2_MAX_TIMER CSCHED2_CREDIT_INIT |
260 | | /* Period of the cap replenishment timer. */ |
261 | 0 | #define CSCHED2_BDGT_REPL_PERIOD ((opt_cap_period)*MILLISECS(1)) |
262 | | |
263 | | /* |
264 | | * Flags |
265 | | */ |
266 | | /* |
267 | | * CSFLAG_scheduled: Is this vcpu either running on, or context-switching off, |
268 | | * a physical cpu? |
269 | | * + Accessed only with runqueue lock held |
270 | | * + Set when chosen as next in csched2_schedule(). |
271 | | * + Cleared after context switch has been saved in csched2_context_saved() |
272 | | * + Checked in vcpu_wake to see if we can add to the runqueue, or if we should |
273 | | * set CSFLAG_delayed_runq_add |
274 | | * + Checked to be false in runq_insert. |
275 | | */ |
276 | 0 | #define __CSFLAG_scheduled 1 |
277 | 0 | #define CSFLAG_scheduled (1U<<__CSFLAG_scheduled) |
278 | | /* |
279 | | * CSFLAG_delayed_runq_add: Do we need to add this to the runqueue once it's done |
280 | | * being context switched out? |
281 | | * + Set when scheduling out in csched2_schedule() if prev is runnable |
282 | | * + Set in csched2_vcpu_wake if it finds CSFLAG_scheduled set |
283 | | * + Read in csched2_context_saved(). If set, it adds prev to the runqueue and |
284 | | * clears the bit. |
285 | | */ |
286 | 0 | #define __CSFLAG_delayed_runq_add 2 |
287 | 0 | #define CSFLAG_delayed_runq_add (1U<<__CSFLAG_delayed_runq_add) |
288 | | /* |
289 | | * CSFLAG_runq_migrate_request: This vcpu is being migrated as a result of a |
290 | | * credit2-initiated runq migrate request; migrate it to the runqueue indicated |
291 | | * in the svc struct. |
292 | | */ |
293 | 0 | #define __CSFLAG_runq_migrate_request 3 |
294 | 0 | #define CSFLAG_runq_migrate_request (1U<<__CSFLAG_runq_migrate_request) |
295 | | /* |
296 | | * CSFLAG_vcpu_yield: this vcpu was running, and has called vcpu_yield(). The |
297 | | * scheduler is invoked to see if we can give the cpu to someone else, and |
298 | | * get back to the yielding vcpu in a while. |
299 | | */ |
300 | | #define __CSFLAG_vcpu_yield 4 |
301 | | #define CSFLAG_vcpu_yield (1U<<__CSFLAG_vcpu_yield) |
302 | | |
303 | | static unsigned int __read_mostly opt_migrate_resist = 500; |
304 | | integer_param("sched_credit2_migrate_resist", opt_migrate_resist); |
305 | | |
306 | | /* |
307 | | * Load tracking and load balancing |
308 | | * |
309 | | * Load history of runqueues and vcpus is accounted for by using an |
310 | | * exponentially weighted moving average algorithm. However, instead of using |
311 | | * fractions, we shift everything to the left by the number of bits we want to |
312 | | * use for representing the fractional part (Q-format). |
313 | | * |
314 | | * We may also want to reduce the precision of time accounting, to |
315 | | * accommodate 'longer windows'. So, if that is the case, we just need to |
316 | | * shift all time samples to the right. |
317 | | * |
318 | | * The details of the formulas used for load tracking are explained close to |
319 | | * update_runq_load(). Let's just say here that, with full nanosecond time |
320 | | * granularity, a 30 bits wide 'decaying window' is ~1 second long. |
321 | | * |
322 | | * We want to consider the following equations: |
323 | | * |
324 | | * avg[0] = load*P |
325 | | * avg[i+1] = avg[i] + delta*load*P/W - delta*avg[i]/W, 0 <= delta <= W |
326 | | * |
327 | | * where W is the length of the window, P the multiplier for transitioning into |
328 | | * Q-format fixed point arithmetic and load is the instantaneous load of a |
329 | | * runqueue, which basically is the number of runnable vcpus there are on the |
330 | | * runqueue (for the meaning of the other terms, look at the doc comment to |
331 | | * update_runq_load()). |
332 | | * |
333 | | * So, again, with full nanosecond granularity, and 1 second window, we have: |
334 | | * |
335 | | * W = 2^30 |
336 | | * P = 2^18 |
337 | | * |
338 | | * The maximum possible value for the average load, which we want to store in |
339 | | * s_time_t type variables (i.e., we have 63 bits available) is load*P. This |
340 | | * means that, with P 18 bits wide, load can occupy 45 bits. This in turn |
341 | | * means we can have 2^45 vcpus in each runqueue, before overflow occurs! |
342 | | * |
343 | | * However, it can happen that, at step j+1, if: |
344 | | * |
345 | | * avg[j] = load*P |
346 | | * delta = W |
347 | | * |
348 | | * then: |
349 | | * |
350 | | * avg[j+1] = avg[j] + W*load*P/W - W*load*P/W |
351 | | * |
352 | | * So we must be able to deal with W*load*P. This means load can't be higher |
353 | | * than: |
354 | | * |
355 | | * 2^(63 - 30 - 18) = 2^15 = 32768 |
356 | | * |
357 | | * So 32768 is the maximum number of vcpus that we can have in a runqueue, |
358 | | * at any given time, and still not have problems with the load tracking |
359 | | * calculations... and this is more than fine. |
360 | | * |
361 | | * As a matter of fact, since we are using microseconds granularity, we have |
362 | | * W=2^20. So, still with 18 fractional bits and a 1 second long window, there |
363 | | * may be 2^25 = 33554432 vcpus in a runq before we have to start thinking |
364 | | * about overflow. |
365 | | */ |
366 | | |
367 | | /* If >0, decreases the granularity of time samples used for load tracking. */ |
368 | 0 | #define LOADAVG_GRANULARITY_SHIFT (10) |
369 | | /* Time window during which we still give value to previous load history. */ |
370 | 0 | #define LOADAVG_WINDOW_SHIFT (30) |
371 | | /* 18 bits by default (and not less than 4) for decimals. */ |
372 | | #define LOADAVG_PRECISION_SHIFT (18) |
373 | 0 | #define LOADAVG_PRECISION_SHIFT_MIN (4) |
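As a sketch of the overflow bound worked out above, the number of bits left for the instantaneous load can be written directly in terms of these shifts; LOADAVG_LOAD_BITS is a hypothetical name, not used elsewhere in the file.

    /* Bits of an s_time_t left for the instantaneous load, after W and P. */
    #define LOADAVG_LOAD_BITS \
        (63 - LOADAVG_WINDOW_SHIFT - LOADAVG_PRECISION_SHIFT)
    /* With the defaults (W=30, P=18) this is 15, i.e. up to 2^15 = 32768 vcpus. */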
374 | | |
375 | | /* |
376 | | * Both the length of the window and the number of fractional bits can be |
377 | | * decided with boot parameters. |
378 | | * |
379 | | * The length of the window is always expressed in nanoseconds. The actual |
380 | | * value used by default is LOADAVG_WINDOW_SHIFT - LOADAVG_GRANULARITY_SHIFT. |
381 | | */ |
382 | | static unsigned int __read_mostly opt_load_window_shift = LOADAVG_WINDOW_SHIFT; |
383 | | integer_param("credit2_load_window_shift", opt_load_window_shift); |
384 | | static unsigned int __read_mostly opt_load_precision_shift = LOADAVG_PRECISION_SHIFT; |
385 | | integer_param("credit2_load_precision_shift", opt_load_precision_shift); |
386 | | |
387 | | static int __read_mostly opt_underload_balance_tolerance = 0; |
388 | | integer_param("credit2_balance_under", opt_underload_balance_tolerance); |
389 | | static int __read_mostly opt_overload_balance_tolerance = -3; |
390 | | integer_param("credit2_balance_over", opt_overload_balance_tolerance); |
391 | | /* |
392 | | * Domains subject to a cap receive a replenishment of their runtime budget |
393 | | * once every opt_cap_period interval. Default is 10 ms. The amount of budget |
394 | | * they receive depends on their cap. For instance, a domain with a 50% cap |
395 | | * will receive 50% of 10 ms, so 5 ms. |
396 | | */ |
397 | | static unsigned int __read_mostly opt_cap_period = 10; /* ms */ |
398 | | integer_param("credit2_cap_period_ms", opt_cap_period); |
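A sketch of the arithmetic described in the comment above; the helper is hypothetical (the actual per-domain budget computation happens in the domain control path, not shown in this excerpt).

    /* Illustrative only: budget granted to a domain per replenishment period. */
    static s_time_t sketch_budget_from_cap(unsigned int cap /* percent */)
    {
        /* E.g. cap = 50 with the default 10 ms period yields 5 ms of budget. */
        return (CSCHED2_BDGT_REPL_PERIOD * cap) / 100;
    }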
399 | | |
400 | | /* |
401 | | * Runqueue organization. |
402 | | * |
403 | | * The various cpus are to be assigned each one to a runqueue, and we |
404 | | * want that to happen based on topology. At the moment, it is possible |
405 | | * to choose to arrange runqueues to be: |
406 | | * |
407 | | * - per-cpu: meaning that there will be one runqueue per logical cpu. This |
408 | | * will happen if the opt_runqueue parameter is set to 'cpu'. |
409 | | * |
410 | | * - per-core: meaning that there will be one runqueue per each physical |
411 | | * core of the host. This will happen if the opt_runqueue |
412 | | * parameter is set to 'core'; |
413 | | * |
414 | | * - per-socket: meaning that there will be one runqueue per each physical |
415 | | * socket (AKA package, which often, but not always, also |
416 | | * matches a NUMA node) of the host. This will happen if |
417 | | * the opt_runqueue parameter is set to 'socket'; |
418 | | * |
419 | | * - per-node: meaning that there will be one runqueue per each physical |
420 | | * NUMA node of the host. This will happen if the opt_runqueue |
421 | | * parameter is set to 'node'; |
422 | | * |
423 | | * - global: meaning that there will be only one runqueue to which all the |
424 | | * (logical) processors of the host belong. This will happen if |
425 | | * the opt_runqueue parameter is set to 'all'. |
426 | | * |
427 | | * Depending on the value of opt_runqueue, therefore, cpus that are part of |
428 | | * either the same physical core, the same physical socket, the same NUMA |
429 | | * node, or just all of them, will be put together to form runqueues. |
430 | | */ |
431 | 0 | #define OPT_RUNQUEUE_CPU 0 |
432 | 0 | #define OPT_RUNQUEUE_CORE 1 |
433 | 0 | #define OPT_RUNQUEUE_SOCKET 2 |
434 | 0 | #define OPT_RUNQUEUE_NODE 3 |
435 | 0 | #define OPT_RUNQUEUE_ALL 4 |
436 | | static const char *const opt_runqueue_str[] = { |
437 | | [OPT_RUNQUEUE_CPU] = "cpu", |
438 | | [OPT_RUNQUEUE_CORE] = "core", |
439 | | [OPT_RUNQUEUE_SOCKET] = "socket", |
440 | | [OPT_RUNQUEUE_NODE] = "node", |
441 | | [OPT_RUNQUEUE_ALL] = "all" |
442 | | }; |
443 | | static int __read_mostly opt_runqueue = OPT_RUNQUEUE_SOCKET; |
444 | | |
445 | | static int parse_credit2_runqueue(const char *s) |
446 | 0 | { |
447 | 0 | unsigned int i; |
448 | 0 |
|
449 | 0 | for ( i = 0; i < ARRAY_SIZE(opt_runqueue_str); i++ ) |
450 | 0 | { |
451 | 0 | if ( !strcmp(s, opt_runqueue_str[i]) ) |
452 | 0 | { |
453 | 0 | opt_runqueue = i; |
454 | 0 | return 0; |
455 | 0 | } |
456 | 0 | } |
457 | 0 |
|
458 | 0 | return -EINVAL; |
459 | 0 | } |
460 | | custom_param("credit2_runqueue", parse_credit2_runqueue); |
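For reference, the knobs above are Xen boot parameters; a hypothetical command line exercising some of them (example values only) could look like:

    sched=credit2 credit2_runqueue=core credit2_cap_period_ms=20 sched_credit2_migrate_resist=1000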
461 | | |
462 | | /* |
463 | | * Per-runqueue data |
464 | | */ |
465 | | struct csched2_runqueue_data { |
466 | | spinlock_t lock; /* Lock for this runqueue */ |
467 | | |
468 | | struct list_head runq; /* Ordered list of runnable vms */ |
469 | | int id; /* ID of this runqueue (-1 if invalid) */ |
470 | | |
471 | | int load; /* Instantaneous load (num of non-idle vcpus) */ |
472 | | s_time_t load_last_update; /* Last time average was updated */ |
473 | | s_time_t avgload; /* Decaying queue load */ |
474 | | s_time_t b_avgload; /* Decaying queue load modified by balancing */ |
475 | | |
476 | | cpumask_t active, /* CPUs enabled for this runqueue */ |
477 | | smt_idle, /* Fully idle-and-untickled cores (see below) */ |
478 | | tickled, /* Have been asked to go through schedule */ |
479 | | idle; /* Currently idle pcpus */ |
480 | | |
481 | | struct list_head svc; /* List of all vcpus assigned to the runqueue */ |
482 | | unsigned int max_weight; /* Max weight of the vcpus in this runqueue */ |
483 | | unsigned int pick_bias; /* Last picked pcpu. Start from it next time */ |
484 | | }; |
485 | | |
486 | | /* |
487 | | * System-wide private data |
488 | | */ |
489 | | struct csched2_private { |
490 | | rwlock_t lock; /* Private scheduler lock */ |
491 | | |
492 | | unsigned int load_precision_shift; /* Precision of load calculations */ |
493 | | unsigned int load_window_shift; /* Length of load decaying window */ |
494 | | unsigned int ratelimit_us; /* Rate limiting for this scheduler */ |
495 | | |
496 | | cpumask_t active_queues; /* Runqueues with (maybe) active cpus */ |
497 | | struct csched2_runqueue_data *rqd; /* Data of the various runqueues */ |
498 | | |
499 | | cpumask_t initialized; /* CPUs part of this scheduler */ |
500 | | struct list_head sdom; /* List of domains (for debug key) */ |
501 | | }; |
502 | | |
503 | | /* |
504 | | * Physical CPU |
505 | | * |
506 | | * The only per-pCPU information we need to maintain is which runqueue |
507 | | * each CPU is part of. |
508 | | */ |
509 | | static DEFINE_PER_CPU(int, runq_map); |
510 | | |
511 | | /* |
512 | | * Virtual CPU |
513 | | */ |
514 | | struct csched2_vcpu { |
515 | | struct csched2_dom *sdom; /* Up-pointer to domain */ |
516 | | struct vcpu *vcpu; /* Up-pointer to vcpu */ |
517 | | struct csched2_runqueue_data *rqd; /* Up-pointer to the runqueue */ |
518 | | |
519 | | int credit; /* Current amount of credit */ |
520 | | unsigned int weight; /* Weight of this vcpu */ |
521 | | unsigned int residual; /* Remainder of div(max_weight/weight) */ |
522 | | unsigned flags; /* Status flags (16 bits would be ok, */ |
523 | | s_time_t budget; /* Current budget (if domain has a cap) */ |
524 | | /* but clear_bit() does not like that) */ |
525 | | s_time_t budget_quota; /* Budget to which vCPU is entitled */ |
526 | | |
527 | | s_time_t start_time; /* Time we were scheduled (for credit) */ |
528 | | |
529 | | /* Individual contribution to load */ |
530 | | s_time_t load_last_update; /* Last time average was updated */ |
531 | | s_time_t avgload; /* Decaying queue load */ |
532 | | |
533 | | struct list_head runq_elem; /* On the runqueue (rqd->runq) */ |
534 | | struct list_head parked_elem; /* On the parked_vcpus list */ |
535 | | struct list_head rqd_elem; /* On csched2_runqueue_data's svc list */ |
536 | | struct csched2_runqueue_data *migrate_rqd; /* Pre-determined migr. target */ |
537 | | int tickled_cpu; /* Cpu that will pick us (-1 if none) */ |
538 | | }; |
539 | | |
540 | | /* |
541 | | * Domain |
542 | | */ |
543 | | struct csched2_dom { |
544 | | struct domain *dom; /* Up-pointer to domain */ |
545 | | |
546 | | spinlock_t budget_lock; /* Serialized budget calculations */ |
547 | | s_time_t tot_budget; /* Total amount of budget */ |
548 | | s_time_t budget; /* Currently available budget */ |
549 | | |
550 | | struct timer *repl_timer; /* Timer for periodic replenishment of budget */ |
551 | | s_time_t next_repl; /* Time at which next replenishment occurs */ |
552 | | struct list_head parked_vcpus; /* List of vCPUs waiting for budget */ |
553 | | |
554 | | struct list_head sdom_elem; /* On csched2_private's sdom list */ |
555 | | uint16_t weight; /* User specified weight */ |
556 | | uint16_t cap; /* User specified cap */ |
557 | | uint16_t nr_vcpus; /* Number of vcpus of this domain */ |
558 | | }; |
559 | | |
560 | | /* |
561 | | * Accessor helper functions. |
562 | | */ |
563 | | static inline struct csched2_private *csched2_priv(const struct scheduler *ops) |
564 | 0 | { |
565 | 0 | return ops->sched_data; |
566 | 0 | } |
567 | | |
568 | | static inline struct csched2_vcpu *csched2_vcpu(const struct vcpu *v) |
569 | 0 | { |
570 | 0 | return v->sched_priv; |
571 | 0 | } |
572 | | |
573 | | static inline struct csched2_dom *csched2_dom(const struct domain *d) |
574 | 0 | { |
575 | 0 | return d->sched_priv; |
576 | 0 | } |
577 | | |
578 | | /* CPU to runq_id helper */ |
579 | | static inline int c2r(unsigned int cpu) |
580 | 0 | { |
581 | 0 | return per_cpu(runq_map, cpu); |
582 | 0 | } |
583 | | |
584 | | /* CPU to runqueue struct helper */ |
585 | | static inline struct csched2_runqueue_data *c2rqd(const struct scheduler *ops, |
586 | | unsigned int cpu) |
587 | 0 | { |
588 | 0 | return &csched2_priv(ops)->rqd[c2r(cpu)]; |
589 | 0 | } |
590 | | |
591 | | /* Does the domain of this vCPU have a cap? */ |
592 | | static inline bool has_cap(const struct csched2_vcpu *svc) |
593 | 0 | { |
594 | 0 | return svc->budget != STIME_MAX; |
595 | 0 | } |
596 | | |
597 | | /* |
598 | | * Hyperthreading (SMT) support. |
599 | | * |
600 | | * We use a special per-runq mask (smt_idle) and update it according to the |
601 | | * following logic: |
602 | | * - when _all_ the SMT siblings in a core are idle, all their corresponding |
603 | | * bits are set in the smt_idle mask; |
604 | | * - when even _just one_ of the SMT siblings in a core is not idle, all the |
605 | | * bits corresponding to it and to all its siblings are clear in the |
606 | | * smt_idle mask. |
607 | | * |
608 | | * Once we have such a mask, it is easy to implement a policy that, either: |
609 | | * - uses fully idle cores first: it is enough to try to schedule the vcpus |
610 | | * on pcpus from smt_idle mask first. This is what happens if |
611 | | * sched_smt_power_savings was not set at boot (default), and it maximizes |
612 | | * true parallelism, and hence performance; |
613 | | * - uses already busy cores first: it is enough to try to schedule the vcpus |
614 | | * on pcpus that are idle, but are not in smt_idle. This is what happens if |
615 | | * sched_smt_power_savings is set at boot, and it allows as many cores as |
616 | | * possible to stay in low power states, minimizing power consumption. |
617 | | * |
618 | | * This logic is entirely implemented in runq_tickle(), and that is enough. |
619 | | * In fact, in this scheduler, placement of a vcpu on one of the pcpus of a |
620 | | * runq, _always_ happens by means of tickling: |
621 | | * - when a vcpu wakes up, it calls csched2_vcpu_wake(), which calls |
622 | | * runq_tickle(); |
623 | | * - when a migration is initiated in schedule.c, we call csched2_cpu_pick(), |
624 | | * csched2_vcpu_migrate() (which calls migrate()) and csched2_vcpu_wake(). |
625 | | * csched2_cpu_pick() looks for the least loaded runq and returns just any |
626 | | * of its processors. Then, csched2_vcpu_migrate() just moves the vcpu to |
627 | | * the chosen runq, and it is again runq_tickle(), called by |
628 | | * csched2_vcpu_wake() that actually decides what pcpu to use within the |
629 | | * chosen runq; |
630 | | * - when a migration is initiated in sched_credit2.c, by calling migrate() |
631 | | * directly, that again temporarily use a random pcpu from the new runq, |
632 | | * directly, that again temporarily uses a random pcpu from the new runq, |
633 | | */ |
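As a sketch of how the two policies can use these masks when picking a pcpu (the real selection happens in runq_tickle(), which is not part of this excerpt; the helper below is hypothetical and assumes the rqd masks defined above):

    /* Illustrative sketch of the two SMT placement policies described above. */
    static int sketch_pick_smt_cpu(const struct csched2_runqueue_data *rqd,
                                   const cpumask_t *mask, cpumask_t *scratch)
    {
        unsigned int cpu;

        if ( sched_smt_power_savings )
            /* Power saving: prefer idle threads on cores that are already busy. */
            cpumask_andnot(scratch, &rqd->idle, &rqd->smt_idle);
        else
            /* Performance: prefer cpus whose whole core is idle and untickled. */
            cpumask_copy(scratch, &rqd->smt_idle);

        cpumask_and(scratch, scratch, mask);
        cpu = cpumask_first(scratch);

        /* Fall back to any idle, untickled cpu if the preferred set is empty. */
        if ( cpu >= nr_cpu_ids )
        {
            cpumask_andnot(scratch, &rqd->idle, &rqd->tickled);
            cpumask_and(scratch, scratch, mask);
            cpu = cpumask_first(scratch);
        }

        return cpu < nr_cpu_ids ? cpu : -1;
    }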
634 | | |
635 | | /* |
636 | | * If all the siblings of cpu (including cpu itself) are both idle and |
637 | | * untickled, set all their bits in mask. |
638 | | * |
639 | | * NB that rqd->smt_idle is different than rqd->idle. rqd->idle |
640 | | * records pcpus that are merely idle (i.e., at the moment do not |
641 | | * have a vcpu running on them). But you have to manually filter out |
642 | | * which pcpus have been tickled in order to find cores that are not |
643 | | * going to be busy soon. Filtering out tickled cpus pairwise is a |
644 | | * lot of extra pain; so for rqd->smt_idle, we explicitly make it so that |
645 | | * the bits of a pcpu are set only if all the threads on its core are |
646 | | * both idle *and* untickled. |
647 | | * |
648 | | * This means changing the mask when either rqd->idle or rqd->tickled |
649 | | * changes. |
650 | | */ |
651 | | static inline |
652 | | void smt_idle_mask_set(unsigned int cpu, const cpumask_t *idlers, |
653 | | cpumask_t *mask) |
654 | 0 | { |
655 | 0 | const cpumask_t *cpu_siblings = per_cpu(cpu_sibling_mask, cpu); |
656 | 0 |
|
657 | 0 | if ( cpumask_subset(cpu_siblings, idlers) ) |
658 | 0 | cpumask_or(mask, mask, cpu_siblings); |
659 | 0 | } |
660 | | |
661 | | /* |
662 | | * Clear the bits of all the siblings of cpu from mask (if necessary). |
663 | | */ |
664 | | static inline |
665 | | void smt_idle_mask_clear(unsigned int cpu, cpumask_t *mask) |
666 | 0 | { |
667 | 0 | const cpumask_t *cpu_siblings = per_cpu(cpu_sibling_mask, cpu); |
668 | 0 |
|
669 | 0 | if ( cpumask_subset(cpu_siblings, mask) ) |
670 | 0 | cpumask_andnot(mask, mask, per_cpu(cpu_sibling_mask, cpu)); |
671 | 0 | } |
672 | | |
673 | | /* |
674 | | * In csched2_cpu_pick(), it may not be possible to actually look at remote |
675 | | * runqueues (the trylock-s on their spinlocks can fail!). If that happens, |
676 | | * we pick, in order of decreasing preference: |
677 | | * 1) svc's current pcpu, if it is part of svc's soft affinity; |
678 | | * 2) a pcpu in svc's current runqueue that is also in svc's soft affinity; |
679 | | * 3) svc's current pcpu, if it is part of svc's hard affinity; |
680 | | * 4) a pcpu in svc's current runqueue that is also in svc's hard affinity; |
681 | | * 5) just one valid pcpu from svc's hard affinity |
682 | | * |
683 | | * Of course, 1 and 2 make sense only if svc has a soft affinity. Also |
684 | | * note that at least 5 is guaranteed to _always_ return at least one pcpu. |
685 | | */ |
686 | | static int get_fallback_cpu(struct csched2_vcpu *svc) |
687 | 0 | { |
688 | 0 | struct vcpu *v = svc->vcpu; |
689 | 0 | unsigned int bs; |
690 | 0 |
|
691 | 0 | SCHED_STAT_CRANK(need_fallback_cpu); |
692 | 0 |
|
693 | 0 | for_each_affinity_balance_step( bs ) |
694 | 0 | { |
695 | 0 | int cpu = v->processor; |
696 | 0 |
|
697 | 0 | if ( bs == BALANCE_SOFT_AFFINITY && |
698 | 0 | !has_soft_affinity(v, v->cpu_hard_affinity) ) |
699 | 0 | continue; |
700 | 0 |
|
701 | 0 | affinity_balance_cpumask(v, bs, cpumask_scratch_cpu(cpu)); |
702 | 0 | cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), |
703 | 0 | cpupool_domain_cpumask(v->domain)); |
704 | 0 |
|
705 | 0 | /* |
706 | 0 | * This is cases 1 or 3 (depending on bs): if v->processor is (still) |
707 | 0 | * in our affinity, go for it, for cache betterness. |
708 | 0 | */ |
709 | 0 | if ( likely(cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) ) |
710 | 0 | return cpu; |
711 | 0 |
|
712 | 0 | /* |
713 | 0 | * This is cases 2 or 4 (depending on bs): v->processor isn't there |
714 | 0 | * any longer, check if we at least can stay in our current runq. |
715 | 0 | */ |
716 | 0 | if ( likely(cpumask_intersects(cpumask_scratch_cpu(cpu), |
717 | 0 | &svc->rqd->active)) ) |
718 | 0 | { |
719 | 0 | cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), |
720 | 0 | &svc->rqd->active); |
721 | 0 | return cpumask_first(cpumask_scratch_cpu(cpu)); |
722 | 0 | } |
723 | 0 |
|
724 | 0 | /* |
725 | 0 | * We may well pick any valid pcpu from our soft-affinity, outside |
726 | 0 | * of our current runqueue, but we decide not to. In fact, changing |
727 | 0 | * runqueue is slow, affects load distribution, and is a source of |
728 | 0 | * overhead for the vcpus running on the other runqueue (we need the |
729 | 0 | * lock). So, better do that as a consequence of a well informed |
730 | 0 | * decision (or if we really don't have any other chance, as we will, |
731 | 0 | * at step 5, if we get to there). |
732 | 0 | * |
733 | 0 | * Also, being here, looking for a fallback, is an unfortunate and |
734 | 0 | * infrequent event, while the decision of putting us in the runqueue |
735 | 0 | * where we are was (likely) made taking all the relevant factors |
736 | 0 | * into account. So let's not disrupt that, just for the sake of |
737 | 0 | * soft-affinity, and let's wait here to be able to make (hopefully |
738 | 0 | * soon) another similarly well informed decision. |
739 | 0 | */ |
740 | 0 | if ( bs == BALANCE_SOFT_AFFINITY ) |
741 | 0 | continue; |
742 | 0 |
|
743 | 0 | /* |
744 | 0 | * This is case 5: last stand, just one valid pcpu from our hard |
745 | 0 | * affinity. It's guaranteed that there is at least one valid cpu, |
746 | 0 | * and therefore we are sure that we return it, and never really |
747 | 0 | * exit the loop. |
748 | 0 | */ |
749 | 0 | ASSERT(bs == BALANCE_HARD_AFFINITY && |
750 | 0 | !cpumask_empty(cpumask_scratch_cpu(cpu))); |
751 | 0 | cpu = cpumask_first(cpumask_scratch_cpu(cpu)); |
752 | 0 | if ( likely(cpu < nr_cpu_ids) ) |
753 | 0 | return cpu; |
754 | 0 | } |
755 | 0 | ASSERT_UNREACHABLE(); |
756 | 0 | /* |
757 | 0 | * We can't be here. But if that somehow happen (in non-debug builds), |
758 | 0 | * at least return something which is both online and in our hard-affinity. |
759 | 0 | */ |
760 | 0 | return cpumask_any(cpumask_scratch_cpu(v->processor)); |
761 | 0 | } |
762 | | |
763 | | /* |
764 | | * Time-to-credit, credit-to-time. |
765 | | * |
766 | | * We keep track of the "residual" time to make sure that frequent short |
767 | | * schedules still get accounted for in the end. |
768 | | * |
769 | | * FIXME: Do pre-calculated division? |
770 | | */ |
771 | | static void t2c_update(struct csched2_runqueue_data *rqd, s_time_t time, |
772 | | struct csched2_vcpu *svc) |
773 | 0 | { |
774 | 0 | uint64_t val = time * rqd->max_weight + svc->residual; |
775 | 0 |
|
776 | 0 | svc->residual = do_div(val, svc->weight); |
777 | 0 | svc->credit -= val; |
778 | 0 | } |
779 | | |
780 | | static s_time_t c2t(struct csched2_runqueue_data *rqd, s_time_t credit, struct csched2_vcpu *svc) |
781 | 0 | { |
782 | 0 | return credit * svc->weight / rqd->max_weight; |
783 | 0 | } |
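A worked example of the conversion above, with hypothetical numbers: on a runqueue whose max_weight is 512, a vcpu of weight 256 burns credits twice as fast as the heaviest vcpu, and the residual keeps the remainder of the division so that frequent short runs are not lost.

    /* Illustrative worked example; the numbers are made up, not from the file. */
    static void sketch_t2c_example(void)
    {
        uint64_t val = MILLISECS(1) * 512;   /* time * max_weight, residual 0 */
        uint64_t rem = do_div(val, 256);     /* val is now 2000000, rem is 0  */

        /* I.e. 1 ms of run time costs this vcpu 2 ms worth of credit. */
        (void)val;
        (void)rem;
    }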
784 | | |
785 | | /* |
786 | | * Runqueue related code. |
787 | | */ |
788 | | |
789 | | static inline int vcpu_on_runq(struct csched2_vcpu *svc) |
790 | 0 | { |
791 | 0 | return !list_empty(&svc->runq_elem); |
792 | 0 | } |
793 | | |
794 | | static inline struct csched2_vcpu * runq_elem(struct list_head *elem) |
795 | 0 | { |
796 | 0 | return list_entry(elem, struct csched2_vcpu, runq_elem); |
797 | 0 | } |
798 | | |
799 | | static void activate_runqueue(struct csched2_private *prv, int rqi) |
800 | 0 | { |
801 | 0 | struct csched2_runqueue_data *rqd; |
802 | 0 |
|
803 | 0 | rqd = prv->rqd + rqi; |
804 | 0 |
|
805 | 0 | BUG_ON(!cpumask_empty(&rqd->active)); |
806 | 0 |
|
807 | 0 | rqd->max_weight = 1; |
808 | 0 | rqd->id = rqi; |
809 | 0 | INIT_LIST_HEAD(&rqd->svc); |
810 | 0 | INIT_LIST_HEAD(&rqd->runq); |
811 | 0 | spin_lock_init(&rqd->lock); |
812 | 0 |
|
813 | 0 | __cpumask_set_cpu(rqi, &prv->active_queues); |
814 | 0 | } |
815 | | |
816 | | static void deactivate_runqueue(struct csched2_private *prv, int rqi) |
817 | 0 | { |
818 | 0 | struct csched2_runqueue_data *rqd; |
819 | 0 |
|
820 | 0 | rqd = prv->rqd + rqi; |
821 | 0 |
|
822 | 0 | BUG_ON(!cpumask_empty(&rqd->active)); |
823 | 0 |
|
824 | 0 | rqd->id = -1; |
825 | 0 |
|
826 | 0 | __cpumask_clear_cpu(rqi, &prv->active_queues); |
827 | 0 | } |
828 | | |
829 | | static inline bool same_node(unsigned int cpua, unsigned int cpub) |
830 | 0 | { |
831 | 0 | return cpu_to_node(cpua) == cpu_to_node(cpub); |
832 | 0 | } |
833 | | |
834 | | static inline bool same_socket(unsigned int cpua, unsigned int cpub) |
835 | 0 | { |
836 | 0 | return cpu_to_socket(cpua) == cpu_to_socket(cpub); |
837 | 0 | } |
838 | | |
839 | | static inline bool same_core(unsigned int cpua, unsigned int cpub) |
840 | 0 | { |
841 | 0 | return same_socket(cpua, cpub) && |
842 | 0 | cpu_to_core(cpua) == cpu_to_core(cpub); |
843 | 0 | } |
844 | | |
845 | | static unsigned int |
846 | | cpu_to_runqueue(struct csched2_private *prv, unsigned int cpu) |
847 | 0 | { |
848 | 0 | struct csched2_runqueue_data *rqd; |
849 | 0 | unsigned int rqi; |
850 | 0 |
|
851 | 0 | for ( rqi = 0; rqi < nr_cpu_ids; rqi++ ) |
852 | 0 | { |
853 | 0 | unsigned int peer_cpu; |
854 | 0 |
|
855 | 0 | /* |
856 | 0 | * As soon as we come across an uninitialized runqueue, use it. |
857 | 0 | * In fact, either: |
858 | 0 | * - we are initializing the first cpu, and we assign it to |
859 | 0 | * runqueue 0. This is handy, especially if we are dealing |
860 | 0 | * with the boot cpu (if credit2 is the default scheduler), |
861 | 0 | * as we would not be able to use cpu_to_socket() and similar |
862 | 0 | * helpers anyway (the result of which is not reliable yet); |
863 | 0 | * - we have gone through all the active runqueues, and have not |
864 | 0 | * found anyone whose cpus' topology matches the one we are |
865 | 0 | * dealing with, so activating a new runqueue is what we want. |
866 | 0 | */ |
867 | 0 | if ( prv->rqd[rqi].id == -1 ) |
868 | 0 | break; |
869 | 0 |
|
870 | 0 | rqd = prv->rqd + rqi; |
871 | 0 | BUG_ON(cpumask_empty(&rqd->active)); |
872 | 0 |
|
873 | 0 | peer_cpu = cpumask_first(&rqd->active); |
874 | 0 | BUG_ON(cpu_to_socket(cpu) == XEN_INVALID_SOCKET_ID || |
875 | 0 | cpu_to_socket(peer_cpu) == XEN_INVALID_SOCKET_ID); |
876 | 0 |
|
877 | 0 | if ( opt_runqueue == OPT_RUNQUEUE_CPU ) |
878 | 0 | continue; |
879 | 0 | if ( opt_runqueue == OPT_RUNQUEUE_ALL || |
880 | 0 | (opt_runqueue == OPT_RUNQUEUE_CORE && same_core(peer_cpu, cpu)) || |
881 | 0 | (opt_runqueue == OPT_RUNQUEUE_SOCKET && same_socket(peer_cpu, cpu)) || |
882 | 0 | (opt_runqueue == OPT_RUNQUEUE_NODE && same_node(peer_cpu, cpu)) ) |
883 | 0 | break; |
884 | 0 | } |
885 | 0 |
|
886 | 0 | /* We really expect to be able to assign each cpu to a runqueue. */ |
887 | 0 | BUG_ON(rqi >= nr_cpu_ids); |
888 | 0 |
|
889 | 0 | return rqi; |
890 | 0 | } |
891 | | |
892 | | /* Find the domain with the highest weight. */ |
893 | | static void update_max_weight(struct csched2_runqueue_data *rqd, int new_weight, |
894 | | int old_weight) |
895 | 0 | { |
896 | 0 | /* Try to avoid brute-force search: |
897 | 0 | * - If new_weight is larger, max_weight <- new_weight |
898 | 0 | * - If old_weight != max_weight, someone else is still max_weight |
899 | 0 | * (No action required) |
900 | 0 | * - If old_weight == max_weight, brute-force search for max weight |
901 | 0 | */ |
902 | 0 | if ( new_weight > rqd->max_weight ) |
903 | 0 | { |
904 | 0 | rqd->max_weight = new_weight; |
905 | 0 | SCHED_STAT_CRANK(upd_max_weight_quick); |
906 | 0 | } |
907 | 0 | else if ( old_weight == rqd->max_weight ) |
908 | 0 | { |
909 | 0 | struct list_head *iter; |
910 | 0 | int max_weight = 1; |
911 | 0 |
|
912 | 0 | list_for_each( iter, &rqd->svc ) |
913 | 0 | { |
914 | 0 | struct csched2_vcpu * svc = list_entry(iter, struct csched2_vcpu, rqd_elem); |
915 | 0 |
|
916 | 0 | if ( svc->weight > max_weight ) |
917 | 0 | max_weight = svc->weight; |
918 | 0 | } |
919 | 0 |
|
920 | 0 | rqd->max_weight = max_weight; |
921 | 0 | SCHED_STAT_CRANK(upd_max_weight_full); |
922 | 0 | } |
923 | 0 |
|
924 | 0 | if ( unlikely(tb_init_done) ) |
925 | 0 | { |
926 | 0 | struct { |
927 | 0 | unsigned rqi:16, max_weight:16; |
928 | 0 | } d; |
929 | 0 | d.rqi = rqd->id; |
930 | 0 | d.max_weight = rqd->max_weight; |
931 | 0 | __trace_var(TRC_CSCHED2_RUNQ_MAX_WEIGHT, 1, |
932 | 0 | sizeof(d), |
933 | 0 | (unsigned char *)&d); |
934 | 0 | } |
935 | 0 | } |
936 | | |
937 | | /* Add and remove from runqueue assignment (not active run queue) */ |
938 | | static void |
939 | | _runq_assign(struct csched2_vcpu *svc, struct csched2_runqueue_data *rqd) |
940 | 0 | { |
941 | 0 |
|
942 | 0 | svc->rqd = rqd; |
943 | 0 | list_add_tail(&svc->rqd_elem, &svc->rqd->svc); |
944 | 0 |
|
945 | 0 | update_max_weight(svc->rqd, svc->weight, 0); |
946 | 0 |
|
947 | 0 | /* Expected new load based on adding this vcpu */ |
948 | 0 | rqd->b_avgload += svc->avgload; |
949 | 0 |
|
950 | 0 | if ( unlikely(tb_init_done) ) |
951 | 0 | { |
952 | 0 | struct { |
953 | 0 | unsigned vcpu:16, dom:16; |
954 | 0 | unsigned rqi:16; |
955 | 0 | } d; |
956 | 0 | d.dom = svc->vcpu->domain->domain_id; |
957 | 0 | d.vcpu = svc->vcpu->vcpu_id; |
958 | 0 | d.rqi = rqd->id; |
959 | 0 | __trace_var(TRC_CSCHED2_RUNQ_ASSIGN, 1, |
960 | 0 | sizeof(d), |
961 | 0 | (unsigned char *)&d); |
962 | 0 | } |
963 | 0 |
|
964 | 0 | } |
965 | | |
966 | | static void |
967 | | runq_assign(const struct scheduler *ops, struct vcpu *vc) |
968 | 0 | { |
969 | 0 | struct csched2_vcpu *svc = vc->sched_priv; |
970 | 0 |
|
971 | 0 | ASSERT(svc->rqd == NULL); |
972 | 0 |
|
973 | 0 | _runq_assign(svc, c2rqd(ops, vc->processor)); |
974 | 0 | } |
975 | | |
976 | | static void |
977 | | _runq_deassign(struct csched2_vcpu *svc) |
978 | 0 | { |
979 | 0 | struct csched2_runqueue_data *rqd = svc->rqd; |
980 | 0 |
|
981 | 0 | ASSERT(!vcpu_on_runq(svc)); |
982 | 0 | ASSERT(!(svc->flags & CSFLAG_scheduled)); |
983 | 0 |
|
984 | 0 | list_del_init(&svc->rqd_elem); |
985 | 0 | update_max_weight(rqd, 0, svc->weight); |
986 | 0 |
|
987 | 0 | /* Expected new load based on removing this vcpu */ |
988 | 0 | rqd->b_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0); |
989 | 0 |
|
990 | 0 | svc->rqd = NULL; |
991 | 0 | } |
992 | | |
993 | | static void |
994 | | runq_deassign(const struct scheduler *ops, struct vcpu *vc) |
995 | 0 | { |
996 | 0 | struct csched2_vcpu *svc = vc->sched_priv; |
997 | 0 |
|
998 | 0 | ASSERT(svc->rqd == c2rqd(ops, vc->processor)); |
999 | 0 |
|
1000 | 0 | _runq_deassign(svc); |
1001 | 0 | } |
1002 | | |
1003 | | /* |
1004 | | * Track the runq load by gathering instantaneous load samples, and using |
1005 | | * exponentially weighted moving average (EWMA) for the 'decaying'. |
1006 | | * |
1007 | | * We consider a window of length W=2^(prv->load_window_shift) nsecs |
1008 | | * (which takes LOADAVG_GRANULARITY_SHIFT into account). |
1009 | | * |
1010 | | * If load is the instantaneous load, the formula for EWMA looks as follows, |
1011 | | * for the i-eth sample: |
1012 | | * |
1013 | | * avg[i] = a*load + (1 - a)*avg[i-1] |
1014 | | * |
1015 | | * where avg[i] is the new value of the average load, avg[i-1] is the value |
1016 | | * of the average load calculated so far, and a is a coefficient less than or |
1017 | | * equal to 1. |
1018 | | * |
1019 | | * So, for us, it becomes: |
1020 | | * |
1021 | | * avgload = a*load + (1 - a)*avgload |
1022 | | * |
1023 | | * For determining a, we consider _when_ we are doing the load update, wrt |
1024 | | * the length of the window. We define delta as follows: |
1025 | | * |
1026 | | * delta = t - load_last_update |
1027 | | * |
1028 | | * where t is current time (i.e., time at which we are both sampling and |
1029 | | * updating the load average) and load_last_update is the last time we did |
1030 | | * that. |
1031 | | * |
1032 | | * There are two possible situations: |
1033 | | * |
1034 | | * a) delta <= W |
1035 | | * this means that, during the last window of length W, the runqueue load |
1036 | | * was avgload for (W - delta) time, and load for delta time: |
1037 | | * |
1038 | | * |----------- W ---------| |
1039 | | * | | |
1040 | | * | load_last_update t |
1041 | | * -------------------------|---------|--- |
1042 | | * | | | |
1043 | | * \__W - delta__/\_delta__/ |
1044 | | * | | | |
1045 | | * |___avgload___|__load___| |
1046 | | * |
1047 | | * So, what about using delta/W as our smoothing coefficient a? If we do, |
1048 | | * here's what happens: |
1049 | | * |
1050 | | * a = delta / W |
1051 | | * 1 - a = 1 - (delta / W) = (W - delta) / W |
1052 | | * |
1053 | | * Which matches the above description of what happened in the last |
1054 | | * window of length W. |
1055 | | * |
1056 | | * Note that this also means that the weight that we assign to both the |
1057 | | * latest load sample, and to previous history, varies at each update. |
1058 | | * The longer the latest load sample has been in effect, within the last |
1059 | | * window, the more it weighs (and the less the previous history |
1060 | | * weighs). |
1061 | | * |
1062 | | * This is some sort of extension of plain EWMA to fit even better to our |
1063 | | * use case. |
1064 | | * |
1065 | | * b) delta > W |
1066 | | * this means more than a full window has passed since the last update: |
1067 | | * |
1068 | | * |----------- W ---------| |
1069 | | * | | |
1070 | | * load_last_update t |
1071 | | * ----|------------------------------|--- |
1072 | | * | | |
1073 | | * \_________________delta________/ |
1074 | | * |
1075 | | * Basically, it means the last load sample has been in effect for more |
1076 | | * than W time, and hence we should just use it, and forget everything |
1077 | | * before that. |
1078 | | * |
1079 | | * This can be seen as a 'reset condition', occurring when, for whatever |
1080 | | * reason, load has not been updated for longer than we expected. (It is |
1081 | | * also how avgload is assigned its first value.) |
1082 | | * |
1083 | | * The formula for avgload then becomes: |
1084 | | * |
1085 | | * avgload = (delta/W)*load + (W - delta)*avgload/W |
1086 | | * avgload = delta*load/W + W*avgload/W - delta*avgload/W |
1087 | | * avgload = avgload + delta*load/W - delta*avgload/W |
1088 | | * |
1089 | | * So, final form is: |
1090 | | * |
1091 | | * avgload_0 = load |
1092 | | * avgload = avgload + delta*load/W - delta*avgload/W, 0<=delta<=W |
1093 | | * |
1094 | | * As a confirmation, let's look at the extremes, when delta is 0 (i.e., |
1095 | | * what happens if we update the load twice, at the same time instant?): |
1096 | | * |
1097 | | * avgload = avgload + 0*load/W - 0*avgload/W |
1098 | | * avgload = avgload |
1099 | | * |
1100 | | * and when delta is W (i.e., what happens if we update at the last |
1101 | | * possible instant before the window 'expires'?): |
1102 | | * |
1103 | | * avgload = avgload + W*load/W - W*avgload/W |
1104 | | * avgload = avgload + load - avgload |
1105 | | * avgload = load |
1106 | | * |
1107 | | * Which, in both cases, is what we expect. |
1108 | | */ |
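Before reading the fixed-point version in update_runq_load() below, the final formula can be sketched with plain real arithmetic; floating point is used here purely for illustration (the hypervisor code obviously sticks to the shifted integer form derived above).

    /* Illustrative sketch: the EWMA update above, without Q-format shifts. */
    static double sketch_ewma(double avgload, double load, double delta, double W)
    {
        if ( delta >= W )       /* 'Reset condition': forget older history. */
            return load;

        return avgload + delta * load / W - delta * avgload / W;
    }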
1109 | | static void |
1110 | | update_runq_load(const struct scheduler *ops, |
1111 | | struct csched2_runqueue_data *rqd, int change, s_time_t now) |
1112 | 0 | { |
1113 | 0 | struct csched2_private *prv = csched2_priv(ops); |
1114 | 0 | s_time_t delta, load = rqd->load; |
1115 | 0 | unsigned int P, W; |
1116 | 0 |
|
1117 | 0 | W = prv->load_window_shift; |
1118 | 0 | P = prv->load_precision_shift; |
1119 | 0 | now >>= LOADAVG_GRANULARITY_SHIFT; |
1120 | 0 |
|
1121 | 0 | /* |
1122 | 0 | * To avoid using fractions, we shift to left by load_precision_shift, |
1123 | 0 | * and use the least significant load_precision_shift bits as the fractional part. |
1124 | 0 | * Looking back at the formula we want to use, we now have: |
1125 | 0 | * |
1126 | 0 | * P = 2^(load_precision_shift) |
1127 | 0 | * P*avgload = P*(avgload + delta*load/W - delta*avgload/W) |
1128 | 0 | * P*avgload = P*avgload + delta*load*P/W - delta*P*avgload/W |
1129 | 0 | * |
1130 | 0 | * And if we are ok storing and using P*avgload, we can rewrite this as: |
1131 | 0 | * |
1132 | 0 | * P*avgload = avgload' |
1133 | 0 | * avgload' = avgload' + delta*P*load/W - delta*avgload'/W |
1134 | 0 | * |
1135 | 0 | * Coupled with, of course: |
1136 | 0 | * |
1137 | 0 | * avgload_0' = P*load |
1138 | 0 | */ |
1139 | 0 |
|
1140 | 0 | if ( rqd->load_last_update + (1ULL << W) < now ) |
1141 | 0 | { |
1142 | 0 | rqd->avgload = load << P; |
1143 | 0 | rqd->b_avgload = load << P; |
1144 | 0 | } |
1145 | 0 | else |
1146 | 0 | { |
1147 | 0 | delta = now - rqd->load_last_update; |
1148 | 0 | if ( unlikely(delta < 0) ) |
1149 | 0 | { |
1150 | 0 | d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" llu %"PRI_stime"\n", |
1151 | 0 | __func__, now, rqd->load_last_update); |
1152 | 0 | delta = 0; |
1153 | 0 | } |
1154 | 0 |
|
1155 | 0 | /* |
1156 | 0 | * Note that, if we were to enforce (or check) some relationship |
1157 | 0 | * between P and W, we may save one shift. E.g., if we are sure |
1158 | 0 | * that P < W, we could write: |
1159 | 0 | * |
1160 | 0 | * (delta * (load << P)) >> W |
1161 | 0 | * |
1162 | 0 | * as: |
1163 | 0 | * |
1164 | 0 | * (delta * load) >> (W - P) |
1165 | 0 | */ |
1166 | 0 | rqd->avgload = rqd->avgload + |
1167 | 0 | ((delta * (load << P)) >> W) - |
1168 | 0 | ((delta * rqd->avgload) >> W); |
1169 | 0 | rqd->b_avgload = rqd->b_avgload + |
1170 | 0 | ((delta * (load << P)) >> W) - |
1171 | 0 | ((delta * rqd->b_avgload) >> W); |
1172 | 0 | } |
1173 | 0 | rqd->load += change; |
1174 | 0 | rqd->load_last_update = now; |
1175 | 0 |
|
1176 | 0 | /* Overflow, capable of making the load look negative, must not occur. */ |
1177 | 0 | ASSERT(rqd->avgload >= 0 && rqd->b_avgload >= 0); |
1178 | 0 |
|
1179 | 0 | if ( unlikely(tb_init_done) ) |
1180 | 0 | { |
1181 | 0 | struct { |
1182 | 0 | uint64_t rq_avgload, b_avgload; |
1183 | 0 | unsigned rq_load:16, rq_id:8, shift:8; |
1184 | 0 | } d; |
1185 | 0 | d.rq_id = rqd->id; |
1186 | 0 | d.rq_load = rqd->load; |
1187 | 0 | d.rq_avgload = rqd->avgload; |
1188 | 0 | d.b_avgload = rqd->b_avgload; |
1189 | 0 | d.shift = P; |
1190 | 0 | __trace_var(TRC_CSCHED2_UPDATE_RUNQ_LOAD, 1, |
1191 | 0 | sizeof(d), |
1192 | 0 | (unsigned char *)&d); |
1193 | 0 | } |
1194 | 0 | } |
1195 | | |
1196 | | static void |
1197 | | update_svc_load(const struct scheduler *ops, |
1198 | | struct csched2_vcpu *svc, int change, s_time_t now) |
1199 | 0 | { |
1200 | 0 | struct csched2_private *prv = csched2_priv(ops); |
1201 | 0 | s_time_t delta, vcpu_load; |
1202 | 0 | unsigned int P, W; |
1203 | 0 |
|
1204 | 0 | if ( change == -1 ) |
1205 | 0 | vcpu_load = 1; |
1206 | 0 | else if ( change == 1 ) |
1207 | 0 | vcpu_load = 0; |
1208 | 0 | else |
1209 | 0 | vcpu_load = vcpu_runnable(svc->vcpu); |
1210 | 0 |
|
1211 | 0 | W = prv->load_window_shift; |
1212 | 0 | P = prv->load_precision_shift; |
1213 | 0 | now >>= LOADAVG_GRANULARITY_SHIFT; |
1214 | 0 |
|
1215 | 0 | if ( svc->load_last_update + (1ULL << W) < now ) |
1216 | 0 | { |
1217 | 0 | svc->avgload = vcpu_load << P; |
1218 | 0 | } |
1219 | 0 | else |
1220 | 0 | { |
1221 | 0 | delta = now - svc->load_last_update; |
1222 | 0 | if ( unlikely(delta < 0) ) |
1223 | 0 | { |
1224 | 0 | d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" llu %"PRI_stime"\n", |
1225 | 0 | __func__, now, svc->load_last_update); |
1226 | 0 | delta = 0; |
1227 | 0 | } |
1228 | 0 |
|
1229 | 0 | svc->avgload = svc->avgload + |
1230 | 0 | ((delta * (vcpu_load << P)) >> W) - |
1231 | 0 | ((delta * svc->avgload) >> W); |
1232 | 0 | } |
1233 | 0 | svc->load_last_update = now; |
1234 | 0 |
|
1235 | 0 | /* Overflow, capable of making the load look negative, must not occur. */ |
1236 | 0 | ASSERT(svc->avgload >= 0); |
1237 | 0 |
|
1238 | 0 | if ( unlikely(tb_init_done) ) |
1239 | 0 | { |
1240 | 0 | struct { |
1241 | 0 | uint64_t v_avgload; |
1242 | 0 | unsigned vcpu:16, dom:16; |
1243 | 0 | unsigned shift; |
1244 | 0 | } d; |
1245 | 0 | d.dom = svc->vcpu->domain->domain_id; |
1246 | 0 | d.vcpu = svc->vcpu->vcpu_id; |
1247 | 0 | d.v_avgload = svc->avgload; |
1248 | 0 | d.shift = P; |
1249 | 0 | __trace_var(TRC_CSCHED2_UPDATE_VCPU_LOAD, 1, |
1250 | 0 | sizeof(d), |
1251 | 0 | (unsigned char *)&d); |
1252 | 0 | } |
1253 | 0 | } |
1254 | | |
1255 | | static void |
1256 | | update_load(const struct scheduler *ops, |
1257 | | struct csched2_runqueue_data *rqd, |
1258 | | struct csched2_vcpu *svc, int change, s_time_t now) |
1259 | 0 | { |
1260 | 0 | trace_var(TRC_CSCHED2_UPDATE_LOAD, 1, 0, NULL); |
1261 | 0 |
|
1262 | 0 | update_runq_load(ops, rqd, change, now); |
1263 | 0 | if ( svc ) |
1264 | 0 | update_svc_load(ops, svc, change, now); |
1265 | 0 | } |
1266 | | |
1267 | | static void |
1268 | | runq_insert(const struct scheduler *ops, struct csched2_vcpu *svc) |
1269 | 0 | { |
1270 | 0 | struct list_head *iter; |
1271 | 0 | unsigned int cpu = svc->vcpu->processor; |
1272 | 0 | struct list_head * runq = &c2rqd(ops, cpu)->runq; |
1273 | 0 | int pos = 0; |
1274 | 0 |
|
1275 | 0 | ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock)); |
1276 | 0 |
|
1277 | 0 | ASSERT(!vcpu_on_runq(svc)); |
1278 | 0 | ASSERT(c2r(cpu) == c2r(svc->vcpu->processor)); |
1279 | 0 |
|
1280 | 0 | ASSERT(&svc->rqd->runq == runq); |
1281 | 0 | ASSERT(!is_idle_vcpu(svc->vcpu)); |
1282 | 0 | ASSERT(!svc->vcpu->is_running); |
1283 | 0 | ASSERT(!(svc->flags & CSFLAG_scheduled)); |
1284 | 0 |
|
1285 | 0 | list_for_each( iter, runq ) |
1286 | 0 | { |
1287 | 0 | struct csched2_vcpu * iter_svc = runq_elem(iter); |
1288 | 0 |
|
1289 | 0 | if ( svc->credit > iter_svc->credit ) |
1290 | 0 | break; |
1291 | 0 |
|
1292 | 0 | pos++; |
1293 | 0 | } |
1294 | 0 | list_add_tail(&svc->runq_elem, iter); |
1295 | 0 |
|
1296 | 0 | if ( unlikely(tb_init_done) ) |
1297 | 0 | { |
1298 | 0 | struct { |
1299 | 0 | unsigned vcpu:16, dom:16; |
1300 | 0 | unsigned pos; |
1301 | 0 | } d; |
1302 | 0 | d.dom = svc->vcpu->domain->domain_id; |
1303 | 0 | d.vcpu = svc->vcpu->vcpu_id; |
1304 | 0 | d.pos = pos; |
1305 | 0 | __trace_var(TRC_CSCHED2_RUNQ_POS, 1, |
1306 | 0 | sizeof(d), |
1307 | 0 | (unsigned char *)&d); |
1308 | 0 | } |
1309 | 0 | } |
1310 | | |
1311 | | static inline void runq_remove(struct csched2_vcpu *svc) |
1312 | 0 | { |
1313 | 0 | ASSERT(vcpu_on_runq(svc)); |
1314 | 0 | list_del_init(&svc->runq_elem); |
1315 | 0 | } |
1316 | | |
1317 | | void burn_credits(struct csched2_runqueue_data *rqd, struct csched2_vcpu *, s_time_t); |
1318 | | |
1319 | | static inline void |
1320 | | tickle_cpu(unsigned int cpu, struct csched2_runqueue_data *rqd) |
1321 | 0 | { |
1322 | 0 | __cpumask_set_cpu(cpu, &rqd->tickled); |
1323 | 0 | smt_idle_mask_clear(cpu, &rqd->smt_idle); |
1324 | 0 | cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); |
1325 | 0 | } |
1326 | | |
1327 | | /* |
1328 | | * What we want to know is whether svc, which we assume to be running on some |
1329 | | * pcpu, can be interrupted and preempted (which, so far, basically means |
1330 | |  * whether or not it has already run for more than the ratelimit, to which we
1331 | | * apply some tolerance). |
1332 | | */ |
1333 | | static inline bool is_preemptable(const struct csched2_vcpu *svc, |
1334 | | s_time_t now, s_time_t ratelimit) |
1335 | 0 | { |
1336 | 0 | if ( ratelimit <= CSCHED2_RATELIMIT_TICKLE_TOLERANCE ) |
1337 | 0 | return true; |
1338 | 0 |
|
1339 | 0 | ASSERT(svc->vcpu->is_running); |
1340 | 0 | return now - svc->vcpu->runstate.state_entry_time > |
1341 | 0 | ratelimit - CSCHED2_RATELIMIT_TICKLE_TOLERANCE; |
1342 | 0 | } |
1343 | | |
1344 | | /* |
1345 | | * Score to preempt the target cpu. Return a negative number if the |
1346 | | * credit isn't high enough; if it is, favor a preemption on cpu in |
1347 | | * this order: |
1348 | | * - cpu is in new's soft-affinity, not in cur's soft-affinity |
1349 | | * (2 x CSCHED2_CREDIT_INIT score bonus); |
1350 | | * - cpu is in new's soft-affinity and cur's soft-affinity, or |
1351 | | * cpu is not in new's soft-affinity, nor in cur's soft-affinity |
1352 | | * (1x CSCHED2_CREDIT_INIT score bonus); |
1353 | | * - cpu is not in new's soft-affinity, while it is in cur's soft-affinity |
1354 | | * (no bonus). |
1355 | | * |
1356 | | * Within the same class, the highest difference of credit. |
1357 | | */ |
1358 | | static s_time_t tickle_score(const struct scheduler *ops, s_time_t now, |
1359 | | struct csched2_vcpu *new, unsigned int cpu) |
1360 | 0 | { |
1361 | 0 | struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); |
1362 | 0 | struct csched2_vcpu * cur = csched2_vcpu(curr_on_cpu(cpu)); |
1363 | 0 | struct csched2_private *prv = csched2_priv(ops); |
1364 | 0 | s_time_t score; |
1365 | 0 |
|
1366 | 0 | /* |
1367 | 0 | * We are dealing with cpus that are marked non-idle (i.e., that are not |
1368 | 0 | * in rqd->idle). However, some of them may be running their idle vcpu, |
1369 | 0 | * if taking care of tasklets. In that case, we want to leave it alone. |
1370 | 0 | */ |
1371 | 0 | if ( unlikely(is_idle_vcpu(cur->vcpu) || |
1372 | 0 | !is_preemptable(cur, now, MICROSECS(prv->ratelimit_us))) ) |
1373 | 0 | return -1; |
1374 | 0 |
|
1375 | 0 | burn_credits(rqd, cur, now); |
1376 | 0 |
|
1377 | 0 | score = new->credit - cur->credit; |
1378 | 0 | if ( new->vcpu->processor != cpu ) |
1379 | 0 | score -= CSCHED2_MIGRATE_RESIST; |
1380 | 0 |
|
1381 | 0 | /* |
1382 | 0 | * If score is positive, it means new has enough credits (i.e., |
1383 | 0 | * new->credit > cur->credit+CSCHED2_MIGRATE_RESIST). |
1384 | 0 | * |
1385 | 0 | * Let's compute the bonuses for soft-affinities. |
1386 | 0 | */ |
1387 | 0 | if ( score > 0 ) |
1388 | 0 | { |
1389 | 0 | if ( cpumask_test_cpu(cpu, new->vcpu->cpu_soft_affinity) ) |
1390 | 0 | score += CSCHED2_CREDIT_INIT; |
1391 | 0 |
|
1392 | 0 | if ( !cpumask_test_cpu(cpu, cur->vcpu->cpu_soft_affinity) ) |
1393 | 0 | score += CSCHED2_CREDIT_INIT; |
1394 | 0 | } |
1395 | 0 |
|
1396 | 0 | if ( unlikely(tb_init_done) ) |
1397 | 0 | { |
1398 | 0 | struct { |
1399 | 0 | unsigned vcpu:16, dom:16; |
1400 | 0 | int credit, score; |
1401 | 0 | } d; |
1402 | 0 | d.dom = cur->vcpu->domain->domain_id; |
1403 | 0 | d.vcpu = cur->vcpu->vcpu_id; |
1404 | 0 | d.credit = cur->credit; |
1405 | 0 | d.score = score; |
1406 | 0 | __trace_var(TRC_CSCHED2_TICKLE_CHECK, 1, |
1407 | 0 | sizeof(d), |
1408 | 0 | (unsigned char *)&d); |
1409 | 0 | } |
1410 | 0 |
|
1411 | 0 | return score; |
1412 | 0 | } |
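
A worked reading of the score computed above, with purely illustrative numbers (the real CSCHED2_* constants are whatever the build defines them to be):

/*
 * Suppose new->credit = 8,000,000 and cur->credit = 5,000,000, and new would
 * have to migrate to reach this cpu, with CSCHED2_MIGRATE_RESIST = 500,000:
 *
 *     score = 8,000,000 - 5,000,000 - 500,000 = 2,500,000
 *
 * score > 0, so the soft-affinity bonuses apply. If the cpu is in new's
 * soft-affinity but not in cur's, score gains 2 * CSCHED2_CREDIT_INIT, so
 * such a cpu wins over one that is merely ahead on the raw credit gap.
 */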
1413 | | |
1414 | | /* |
1415 | | * Check what processor it is best to 'wake', for picking up a vcpu that has |
1416 | | * just been put (back) in the runqueue. Logic is as follows: |
1417 | | * 1. if there are idle processors in the runq, wake one of them; |
1418 | |  *  2. if there aren't idle processors, check the one where the vcpu was
1419 | | * running before to see if we can preempt what's running there now |
1420 | | * (and hence doing just one migration); |
1421 | |  *  3. last stand: check all processors and see if the vcpu has the right
1422 | |  *     to preempt any of the other vcpus running on them (this requires
1423 | | * two migrations, and that's indeed why it is left as the last stand). |
1424 | | * |
1425 | | * Note that when we say 'idle processors' what we really mean is (pretty |
1426 | | * much always) both _idle_ and _not_already_tickled_. In fact, if a |
1427 | | * processor has been tickled, it will run csched2_schedule() shortly, and |
1428 | | * pick up some work, so it would be wrong to consider it idle. |
1429 | | */ |
1430 | | static void |
1431 | | runq_tickle(const struct scheduler *ops, struct csched2_vcpu *new, s_time_t now) |
1432 | 0 | { |
1433 | 0 | int i, ipid = -1; |
1434 | 0 | s_time_t max = 0; |
1435 | 0 | unsigned int bs, cpu = new->vcpu->processor; |
1436 | 0 | struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); |
1437 | 0 | cpumask_t *online = cpupool_domain_cpumask(new->vcpu->domain); |
1438 | 0 | cpumask_t mask; |
1439 | 0 |
|
1440 | 0 | ASSERT(new->rqd == rqd); |
1441 | 0 |
|
1442 | 0 | if ( unlikely(tb_init_done) ) |
1443 | 0 | { |
1444 | 0 | struct { |
1445 | 0 | unsigned vcpu:16, dom:16; |
1446 | 0 | unsigned processor; |
1447 | 0 | int credit; |
1448 | 0 | } d; |
1449 | 0 | d.dom = new->vcpu->domain->domain_id; |
1450 | 0 | d.vcpu = new->vcpu->vcpu_id; |
1451 | 0 | d.processor = new->vcpu->processor; |
1452 | 0 | d.credit = new->credit; |
1453 | 0 | __trace_var(TRC_CSCHED2_TICKLE_NEW, 1, |
1454 | 0 | sizeof(d), |
1455 | 0 | (unsigned char *)&d); |
1456 | 0 | } |
1457 | 0 |
|
1458 | 0 | for_each_affinity_balance_step( bs ) |
1459 | 0 | { |
1460 | 0 | /* Just skip first step, if we don't have a soft affinity */ |
1461 | 0 | if ( bs == BALANCE_SOFT_AFFINITY && |
1462 | 0 | !has_soft_affinity(new->vcpu, new->vcpu->cpu_hard_affinity) ) |
1463 | 0 | continue; |
1464 | 0 |
|
1465 | 0 | affinity_balance_cpumask(new->vcpu, bs, cpumask_scratch_cpu(cpu)); |
1466 | 0 |
|
1467 | 0 | /* |
1468 | 0 | * First of all, consider idle cpus, checking if we can just |
1469 | 0 | * re-use the pcpu where we were running before. |
1470 | 0 | * |
1471 | 0 | * If there are cores where all the siblings are idle, consider |
1472 | 0 | * them first, honoring whatever the spreading-vs-consolidation |
1473 | 0 | * SMT policy wants us to do. |
1474 | 0 | */ |
1475 | 0 | if ( unlikely(sched_smt_power_savings) ) |
1476 | 0 | { |
1477 | 0 | cpumask_andnot(&mask, &rqd->idle, &rqd->smt_idle); |
1478 | 0 | cpumask_and(&mask, &mask, online); |
1479 | 0 | } |
1480 | 0 | else |
1481 | 0 | cpumask_and(&mask, &rqd->smt_idle, online); |
1482 | 0 | cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu)); |
1483 | 0 | i = cpumask_test_or_cycle(cpu, &mask); |
1484 | 0 | if ( i < nr_cpu_ids ) |
1485 | 0 | { |
1486 | 0 | SCHED_STAT_CRANK(tickled_idle_cpu); |
1487 | 0 | ipid = i; |
1488 | 0 | goto tickle; |
1489 | 0 | } |
1490 | 0 |
|
1491 | 0 | /* |
1492 | 0 | * If there are no fully idle cores, check all idlers, after |
1493 | 0 | * having filtered out pcpus that have been tickled but haven't |
1494 | 0 | * gone through the scheduler yet. |
1495 | 0 | */ |
1496 | 0 | cpumask_andnot(&mask, &rqd->idle, &rqd->tickled); |
1497 | 0 | cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), online); |
1498 | 0 | cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu)); |
1499 | 0 | i = cpumask_test_or_cycle(cpu, &mask); |
1500 | 0 | if ( i < nr_cpu_ids ) |
1501 | 0 | { |
1502 | 0 | SCHED_STAT_CRANK(tickled_idle_cpu); |
1503 | 0 | ipid = i; |
1504 | 0 | goto tickle; |
1505 | 0 | } |
1506 | 0 | } |
1507 | 0 |
|
1508 | 0 | /* |
1509 | 0 | * Note that, if we are here, it means we have done the hard-affinity |
1510 | 0 | * balancing step of the loop, and hence what we have in cpumask_scratch |
1511 | 0 | * is what we put there for last, i.e., new's vcpu_hard_affinity & online |
1512 | 0 | * which is exactly what we need for the next part of the function. |
1513 | 0 | */ |
1514 | 0 |
|
1515 | 0 | /* |
1516 | 0 | * Otherwise, look for the non-idle (and non-tickled) processors with |
1517 | 0 | * the lowest credit, among the ones new is allowed to run on. Again, |
1518 | 0 |      * the cpu where it was running before would be the best candidate.
1519 | 0 | * |
1520 | 0 | * For deciding which cpu to tickle, we use tickle_score(), which will |
1521 | 0 | * factor in both new's soft-affinity, and the soft-affinity of the |
1522 | 0 | * vcpu running on each cpu that we consider. |
1523 | 0 | */ |
1524 | 0 | cpumask_andnot(&mask, &rqd->active, &rqd->idle); |
1525 | 0 | cpumask_andnot(&mask, &mask, &rqd->tickled); |
1526 | 0 | cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu)); |
1527 | 0 | if ( __cpumask_test_and_clear_cpu(cpu, &mask) ) |
1528 | 0 | { |
1529 | 0 | s_time_t score = tickle_score(ops, now, new, cpu); |
1530 | 0 |
|
1531 | 0 | if ( score > max ) |
1532 | 0 | { |
1533 | 0 | max = score; |
1534 | 0 | ipid = cpu; |
1535 | 0 |
|
1536 | 0 | /* If this is in new's soft affinity, just take it */ |
1537 | 0 | if ( cpumask_test_cpu(cpu, new->vcpu->cpu_soft_affinity) ) |
1538 | 0 | { |
1539 | 0 | SCHED_STAT_CRANK(tickled_busy_cpu); |
1540 | 0 | goto tickle; |
1541 | 0 | } |
1542 | 0 | } |
1543 | 0 | } |
1544 | 0 |
|
1545 | 0 | for_each_cpu(i, &mask) |
1546 | 0 | { |
1547 | 0 | s_time_t score; |
1548 | 0 |
|
1549 | 0 | /* Already looked at this one above */ |
1550 | 0 | ASSERT(i != cpu); |
1551 | 0 |
|
1552 | 0 | score = tickle_score(ops, now, new, i); |
1553 | 0 |
|
1554 | 0 | if ( score > max ) |
1555 | 0 | { |
1556 | 0 | max = score; |
1557 | 0 | ipid = i; |
1558 | 0 | } |
1559 | 0 | } |
1560 | 0 |
|
1561 | 0 | if ( ipid == -1 ) |
1562 | 0 | { |
1563 | 0 | SCHED_STAT_CRANK(tickled_no_cpu); |
1564 | 0 | return; |
1565 | 0 | } |
1566 | 0 |
|
1567 | 0 | ASSERT(!is_idle_vcpu(curr_on_cpu(ipid))); |
1568 | 0 | SCHED_STAT_CRANK(tickled_busy_cpu); |
1569 | 0 | tickle: |
1570 | 0 | BUG_ON(ipid == -1); |
1571 | 0 |
|
1572 | 0 | if ( unlikely(tb_init_done) ) |
1573 | 0 | { |
1574 | 0 | struct { |
1575 | 0 | unsigned cpu:16, pad:16; |
1576 | 0 | } d; |
1577 | 0 | d.cpu = ipid; d.pad = 0; |
1578 | 0 | __trace_var(TRC_CSCHED2_TICKLE, 1, |
1579 | 0 | sizeof(d), |
1580 | 0 | (unsigned char *)&d); |
1581 | 0 | } |
1582 | 0 |
|
1583 | 0 | tickle_cpu(ipid, rqd); |
1584 | 0 |
|
1585 | 0 | if ( unlikely(new->tickled_cpu != -1) ) |
1586 | 0 | SCHED_STAT_CRANK(tickled_cpu_overwritten); |
1587 | 0 | new->tickled_cpu = ipid; |
1588 | 0 | } |
1589 | | |
1590 | | /* |
1591 | | * Credit-related code |
1592 | | */ |
1593 | | static void reset_credit(const struct scheduler *ops, int cpu, s_time_t now, |
1594 | | struct csched2_vcpu *snext) |
1595 | 0 | { |
1596 | 0 | struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); |
1597 | 0 | struct list_head *iter; |
1598 | 0 | int m; |
1599 | 0 |
|
1600 | 0 | /* |
1601 | 0 | * Under normal circumstances, snext->credit should never be less |
1602 | 0 | * than -CSCHED2_MIN_TIMER. However, under some circumstances, a |
1603 | 0 | * vcpu with low credits may be allowed to run long enough that |
1604 | 0 | * its credits are actually less than -CSCHED2_CREDIT_INIT. |
1605 | 0 | * (Instances have been observed, for example, where a vcpu with |
1606 | 0 | * 200us of credit was allowed to run for 11ms, giving it -10.8ms |
1607 | 0 | * of credit. Thus it was still negative even after the reset.) |
1608 | 0 | * |
1609 | 0 | * If this is the case for snext, we simply want to keep moving |
1610 | 0 |      * everyone up until it is in the black again. This is fair because
1611 | 0 | * none of the other vcpus want to run at the moment. |
1612 | 0 | * |
1613 | 0 | * Rather than looping, however, we just calculate a multiplier, |
1614 | 0 | * avoiding an integer division and multiplication in the common |
1615 | 0 | * case. |
1616 | 0 | */ |
1617 | 0 | m = 1; |
1618 | 0 | if ( snext->credit < -CSCHED2_CREDIT_INIT ) |
1619 | 0 | m += (-snext->credit) / CSCHED2_CREDIT_INIT; |
1620 | 0 |
|
1621 | 0 | list_for_each( iter, &rqd->svc ) |
1622 | 0 | { |
1623 | 0 | unsigned int svc_cpu; |
1624 | 0 | struct csched2_vcpu * svc; |
1625 | 0 | int start_credit; |
1626 | 0 |
|
1627 | 0 | svc = list_entry(iter, struct csched2_vcpu, rqd_elem); |
1628 | 0 | svc_cpu = svc->vcpu->processor; |
1629 | 0 |
|
1630 | 0 | ASSERT(!is_idle_vcpu(svc->vcpu)); |
1631 | 0 | ASSERT(svc->rqd == rqd); |
1632 | 0 |
|
1633 | 0 | /* |
1634 | 0 | * If svc is running, it is our responsibility to make sure, here, |
1635 | 0 | * that the credit it has spent so far get accounted. |
1636 | 0 | */ |
1637 | 0 | if ( svc->vcpu == curr_on_cpu(svc_cpu) ) |
1638 | 0 | { |
1639 | 0 | burn_credits(rqd, svc, now); |
1640 | 0 | /* |
1641 | 0 | * And, similarly, in case it has run out of budget, as a |
1642 | 0 | * consequence of this round of accounting, we also must inform |
1643 | 0 | * its pCPU that it's time to park it, and pick up someone else. |
1644 | 0 | */ |
1645 | 0 | if ( unlikely(svc->budget <= 0) ) |
1646 | 0 | tickle_cpu(svc_cpu, rqd); |
1647 | 0 | } |
1648 | 0 |
|
1649 | 0 | start_credit = svc->credit; |
1650 | 0 |
|
1651 | 0 | /* |
1652 | 0 | * Add INIT * m, avoiding integer multiplication in the common case. |
1653 | 0 | */ |
1654 | 0 | if ( likely(m==1) ) |
1655 | 0 | svc->credit += CSCHED2_CREDIT_INIT; |
1656 | 0 | else |
1657 | 0 | svc->credit += m * CSCHED2_CREDIT_INIT; |
1658 | 0 |
|
1659 | 0 | /* "Clip" credits to max carryover */ |
1660 | 0 | if ( svc->credit > CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX ) |
1661 | 0 | svc->credit = CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX; |
1662 | 0 |
|
1663 | 0 | svc->start_time = now; |
1664 | 0 |
|
1665 | 0 | if ( unlikely(tb_init_done) ) |
1666 | 0 | { |
1667 | 0 | struct { |
1668 | 0 | unsigned vcpu:16, dom:16; |
1669 | 0 | int credit_start, credit_end; |
1670 | 0 | unsigned multiplier; |
1671 | 0 | } d; |
1672 | 0 | d.dom = svc->vcpu->domain->domain_id; |
1673 | 0 | d.vcpu = svc->vcpu->vcpu_id; |
1674 | 0 | d.credit_start = start_credit; |
1675 | 0 | d.credit_end = svc->credit; |
1676 | 0 | d.multiplier = m; |
1677 | 0 | __trace_var(TRC_CSCHED2_CREDIT_RESET, 1, |
1678 | 0 | sizeof(d), |
1679 | 0 | (unsigned char *)&d); |
1680 | 0 | } |
1681 | 0 | } |
1682 | 0 |
|
1683 | 0 | SCHED_STAT_CRANK(credit_reset); |
1684 | 0 |
|
1685 | 0 | /* No need to resort runqueue, as everyone's order should be the same. */ |
1686 | 0 | } |
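
A worked example of the multiplier above; the 10ms value used for CSCHED2_CREDIT_INIT is illustrative only:

/*
 * If CSCHED2_CREDIT_INIT corresponded to 10ms of credit and snext had been
 * driven down to -32ms:
 *
 *     m = 1 + 32 / 10 = 4        (integer division)
 *
 * so every vcpu on the runqueue receives 4 * CSCHED2_CREDIT_INIT in one go,
 * which is enough to bring snext back into the black (-32 + 40 = +8),
 * subject to the carryover clipping applied in the loop.
 */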
1687 | | |
1688 | | void burn_credits(struct csched2_runqueue_data *rqd, |
1689 | | struct csched2_vcpu *svc, s_time_t now) |
1690 | 0 | { |
1691 | 0 | s_time_t delta; |
1692 | 0 |
|
1693 | 0 | ASSERT(svc == csched2_vcpu(curr_on_cpu(svc->vcpu->processor))); |
1694 | 0 |
|
1695 | 0 | if ( unlikely(is_idle_vcpu(svc->vcpu)) ) |
1696 | 0 | { |
1697 | 0 | ASSERT(svc->credit == CSCHED2_IDLE_CREDIT); |
1698 | 0 | return; |
1699 | 0 | } |
1700 | 0 |
|
1701 | 0 | delta = now - svc->start_time; |
1702 | 0 |
|
1703 | 0 | if ( unlikely(delta <= 0) ) |
1704 | 0 | { |
1705 | 0 | if ( unlikely(delta < 0) ) |
1706 | 0 | d2printk("WARNING: %s: Time went backwards? now %"PRI_stime |
1707 | 0 | " start_time %"PRI_stime"\n", __func__, now, |
1708 | 0 | svc->start_time); |
1709 | 0 | goto out; |
1710 | 0 | } |
1711 | 0 |
|
1712 | 0 | SCHED_STAT_CRANK(burn_credits_t2c); |
1713 | 0 | t2c_update(rqd, delta, svc); |
1714 | 0 |
|
1715 | 0 | if ( has_cap(svc) ) |
1716 | 0 | svc->budget -= delta; |
1717 | 0 |
|
1718 | 0 | svc->start_time = now; |
1719 | 0 |
|
1720 | 0 | out: |
1721 | 0 | if ( unlikely(tb_init_done) ) |
1722 | 0 | { |
1723 | 0 | struct { |
1724 | 0 | unsigned vcpu:16, dom:16; |
1725 | 0 | int credit, budget; |
1726 | 0 | int delta; |
1727 | 0 | } d; |
1728 | 0 | d.dom = svc->vcpu->domain->domain_id; |
1729 | 0 | d.vcpu = svc->vcpu->vcpu_id; |
1730 | 0 | d.credit = svc->credit; |
1731 | 0 | d.budget = has_cap(svc) ? svc->budget : INT_MIN; |
1732 | 0 | d.delta = delta; |
1733 | 0 | __trace_var(TRC_CSCHED2_CREDIT_BURN, 1, |
1734 | 0 | sizeof(d), |
1735 | 0 | (unsigned char *)&d); |
1736 | 0 | } |
1737 | 0 | } |
1738 | | |
1739 | | /* |
1740 | | * Budget-related code. |
1741 | | */ |
1742 | | |
1743 | | static void park_vcpu(struct csched2_vcpu *svc) |
1744 | 0 | { |
1745 | 0 | struct vcpu *v = svc->vcpu; |
1746 | 0 |
|
1747 | 0 | ASSERT(spin_is_locked(&svc->sdom->budget_lock)); |
1748 | 0 |
|
1749 | 0 | /* |
1750 | 0 | * It was impossible to find budget for this vCPU, so it has to be |
1751 | 0 | * "parked". This implies it is not runnable, so we mark it as such in |
1752 | 0 | * its pause_flags. If the vCPU is currently scheduled (which means we |
1753 | 0 | * are here after being called from within csched_schedule()), flagging |
1754 | 0 | * is enough, as we'll choose someone else, and then context_saved() |
1755 | 0 | * will take care of updating the load properly. |
1756 | 0 | * |
1757 | 0 | * If, OTOH, the vCPU is sitting in the runqueue (which means we are here |
1758 | 0 | * after being called from within runq_candidate()), we must go all the |
1759 | 0 | * way down to taking it out of there, and updating the load accordingly. |
1760 | 0 | * |
1761 | 0 | * In both cases, we also add it to the list of parked vCPUs of the domain. |
1762 | 0 | */ |
1763 | 0 | __set_bit(_VPF_parked, &v->pause_flags); |
1764 | 0 | if ( vcpu_on_runq(svc) ) |
1765 | 0 | { |
1766 | 0 | runq_remove(svc); |
1767 | 0 | update_load(svc->sdom->dom->cpupool->sched, svc->rqd, svc, -1, NOW()); |
1768 | 0 | } |
1769 | 0 | list_add(&svc->parked_elem, &svc->sdom->parked_vcpus); |
1770 | 0 | } |
1771 | | |
1772 | | static bool vcpu_grab_budget(struct csched2_vcpu *svc) |
1773 | 0 | { |
1774 | 0 | struct csched2_dom *sdom = svc->sdom; |
1775 | 0 | unsigned int cpu = svc->vcpu->processor; |
1776 | 0 |
|
1777 | 0 | ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock)); |
1778 | 0 |
|
1779 | 0 | if ( svc->budget > 0 ) |
1780 | 0 | return true; |
1781 | 0 |
|
1782 | 0 | /* budget_lock nests inside runqueue lock. */ |
1783 | 0 | spin_lock(&sdom->budget_lock); |
1784 | 0 |
|
1785 | 0 | /* |
1786 | 0 | * Here, svc->budget is <= 0 (as, if it was > 0, we'd have taken the if |
1787 | 0 | * above!). That basically means the vCPU has overrun a bit --because of |
1788 | 0 | * various reasons-- and we want to take that into account. With the +=, |
1789 | 0 | * we are actually subtracting the amount of budget the vCPU has |
1790 | 0 | * overconsumed, from the total domain budget. |
1791 | 0 | */ |
1792 | 0 | sdom->budget += svc->budget; |
1793 | 0 |
|
1794 | 0 | if ( sdom->budget > 0 ) |
1795 | 0 | { |
1796 | 0 | s_time_t budget; |
1797 | 0 |
|
1798 | 0 | /* Get our quota, if there's at least as much budget */ |
1799 | 0 | if ( likely(sdom->budget >= svc->budget_quota) ) |
1800 | 0 | budget = svc->budget_quota; |
1801 | 0 | else |
1802 | 0 | budget = sdom->budget; |
1803 | 0 |
|
1804 | 0 | svc->budget = budget; |
1805 | 0 | sdom->budget -= budget; |
1806 | 0 | } |
1807 | 0 | else |
1808 | 0 | { |
1809 | 0 | svc->budget = 0; |
1810 | 0 | park_vcpu(svc); |
1811 | 0 | } |
1812 | 0 |
|
1813 | 0 | spin_unlock(&sdom->budget_lock); |
1814 | 0 |
|
1815 | 0 | return svc->budget > 0; |
1816 | 0 | } |
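
To illustrate the accounting above with made-up numbers (units are arbitrary microseconds):

/*
 * Say the domain's pool holds sdom->budget = 3000us, the vCPU's quota is
 * svc->budget_quota = 2000us, and the vCPU overran by 500us last time, so
 * svc->budget = -500us on entry:
 *
 *     sdom->budget += svc->budget;    pool: 3000 - 500 = 2500us (overrun repaid)
 *     budget = min(quota, pool)  = 2000us
 *     sdom->budget -= budget;         pool: 500us left for the other vCPUs
 *
 * Had the pool been non-positive after repaying the overrun, svc->budget
 * would have been zeroed and the vCPU parked until the next replenishment.
 */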
1817 | | |
1818 | | static void |
1819 | | vcpu_return_budget(struct csched2_vcpu *svc, struct list_head *parked) |
1820 | 0 | { |
1821 | 0 | struct csched2_dom *sdom = svc->sdom; |
1822 | 0 | unsigned int cpu = svc->vcpu->processor; |
1823 | 0 |
|
1824 | 0 | ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock)); |
1825 | 0 | ASSERT(list_empty(parked)); |
1826 | 0 |
|
1827 | 0 | /* budget_lock nests inside runqueue lock. */ |
1828 | 0 | spin_lock(&sdom->budget_lock); |
1829 | 0 |
|
1830 | 0 | /* |
1831 | 0 | * The vCPU is stopping running (e.g., because it's blocking, or it has |
1832 | 0 | * been preempted). If it hasn't consumed all the budget it got when, |
1833 | 0 | * starting to run, put that remaining amount back in the domain's budget |
1834 | 0 | * pool. |
1835 | 0 | */ |
1836 | 0 | sdom->budget += svc->budget; |
1837 | 0 | svc->budget = 0; |
1838 | 0 |
|
1839 | 0 | /* |
1840 | 0 | * Making budget available again to the domain means that parked vCPUs |
1841 | 0 | * may be unparked and run. They are, if any, in the domain's parked_vcpus |
1842 | 0 | * list, so we want to go through that and unpark them (so they can try |
1843 | 0 | * to get some budget). |
1844 | 0 | * |
1845 | 0 | * Touching the list requires the budget_lock, which we hold. Let's |
1846 | 0 | * therefore put everyone in that list in another, temporary list, which |
1847 | 0 | * then the caller will traverse, unparking the vCPUs it finds there. |
1848 | 0 | * |
1849 | 0 | * In fact, we can't do the actual unparking here, because that requires |
1850 | 0 | * taking the runqueue lock of the vCPUs being unparked, and we can't |
1851 | 0 | * take any runqueue locks while we hold a budget_lock. |
1852 | 0 | */ |
1853 | 0 | if ( sdom->budget > 0 ) |
1854 | 0 | list_splice_init(&sdom->parked_vcpus, parked); |
1855 | 0 |
|
1856 | 0 | spin_unlock(&sdom->budget_lock); |
1857 | 0 | } |
1858 | | |
1859 | | static void |
1860 | | unpark_parked_vcpus(const struct scheduler *ops, struct list_head *vcpus) |
1861 | 0 | { |
1862 | 0 | struct csched2_vcpu *svc, *tmp; |
1863 | 0 | spinlock_t *lock; |
1864 | 0 |
|
1865 | 0 | list_for_each_entry_safe(svc, tmp, vcpus, parked_elem) |
1866 | 0 | { |
1867 | 0 | unsigned long flags; |
1868 | 0 | s_time_t now; |
1869 | 0 |
|
1870 | 0 | lock = vcpu_schedule_lock_irqsave(svc->vcpu, &flags); |
1871 | 0 |
|
1872 | 0 | __clear_bit(_VPF_parked, &svc->vcpu->pause_flags); |
1873 | 0 | if ( unlikely(svc->flags & CSFLAG_scheduled) ) |
1874 | 0 | { |
1875 | 0 | /* |
1876 | 0 | * We end here if a budget replenishment arrived between |
1877 | 0 | * csched2_schedule() (and, in particular, after a call to |
1878 | 0 | * vcpu_grab_budget() that returned false), and |
1879 | 0 | * context_saved(). By setting __CSFLAG_delayed_runq_add, |
1880 | 0 | * we tell context_saved() to put the vCPU back in the |
1881 | 0 | * runqueue, from where it will compete with the others |
1882 | 0 | * for the newly replenished budget. |
1883 | 0 | */ |
1884 | 0 | ASSERT( svc->rqd != NULL ); |
1885 | 0 | ASSERT( c2rqd(ops, svc->vcpu->processor) == svc->rqd ); |
1886 | 0 | __set_bit(__CSFLAG_delayed_runq_add, &svc->flags); |
1887 | 0 | } |
1888 | 0 | else if ( vcpu_runnable(svc->vcpu) ) |
1889 | 0 | { |
1890 | 0 | /* |
1891 | 0 | * The vCPU should go back to the runqueue, and compete for |
1892 | 0 | * the newly replenished budget, but only if it is actually |
1893 | 0 | * runnable (and was therefore offline only because of the |
1894 | 0 | * lack of budget). |
1895 | 0 | */ |
1896 | 0 | now = NOW(); |
1897 | 0 | update_load(ops, svc->rqd, svc, 1, now); |
1898 | 0 | runq_insert(ops, svc); |
1899 | 0 | runq_tickle(ops, svc, now); |
1900 | 0 | } |
1901 | 0 | list_del_init(&svc->parked_elem); |
1902 | 0 |
|
1903 | 0 | vcpu_schedule_unlock_irqrestore(lock, flags, svc->vcpu); |
1904 | 0 | } |
1905 | 0 | } |
1906 | | |
1907 | | static inline void do_replenish(struct csched2_dom *sdom) |
1908 | 0 | { |
1909 | 0 | sdom->next_repl += CSCHED2_BDGT_REPL_PERIOD; |
1910 | 0 | sdom->budget += sdom->tot_budget; |
1911 | 0 | } |
1912 | | |
1913 | | static void replenish_domain_budget(void* data) |
1914 | 0 | { |
1915 | 0 | struct csched2_dom *sdom = data; |
1916 | 0 | unsigned long flags; |
1917 | 0 | s_time_t now; |
1918 | 0 | LIST_HEAD(parked); |
1919 | 0 |
|
1920 | 0 | spin_lock_irqsave(&sdom->budget_lock, flags); |
1921 | 0 |
|
1922 | 0 | now = NOW(); |
1923 | 0 |
|
1924 | 0 | /* |
1925 | 0 | * Let's do the replenishment. Note, though, that a domain may overrun, |
1926 | 0 | * which means the budget would have gone below 0 (reasons may be system |
1927 | 0 | * overbooking, accounting issues, etc.). It also may happen that we are |
1928 | 0 | * handling the replenishment (much) later than we should (reasons may |
1929 | 0 | * again be overbooking, or issues with timers). |
1930 | 0 | * |
1931 | 0 | * Even in cases of overrun or delay, however, we expect that in 99% of |
1932 | 0 | * cases, doing just one replenishment will be good enough for being able |
1933 | 0 | * to unpark the vCPUs that are waiting for some budget. |
1934 | 0 | */ |
1935 | 0 | do_replenish(sdom); |
1936 | 0 |
|
1937 | 0 | /* |
1938 | 0 | * And now, the special cases: |
1939 | 0 | * 1) if we are late enough to have skipped (at least) one full period, |
1940 | 0 | * what we must do is doing more replenishments. Note that, however, |
1941 | 0 | * every time we add tot_budget to the budget, we also move next_repl |
1942 | 0 | * away by CSCHED2_BDGT_REPL_PERIOD, to make sure the cap is always |
1943 | 0 | * respected. |
1944 | 0 | */ |
1945 | 0 | if ( unlikely(sdom->next_repl <= now) ) |
1946 | 0 | { |
1947 | 0 | do |
1948 | 0 | do_replenish(sdom); |
1949 | 0 | while ( sdom->next_repl <= now ); |
1950 | 0 | } |
1951 | 0 | /* |
1952 | 0 | * 2) if we overrun by more than tot_budget, then budget+tot_budget is |
1953 | 0 | * still < 0, which means that we can't unpark the vCPUs. Let's bail, |
1954 | 0 | * and wait for future replenishments. |
1955 | 0 | */ |
1956 | 0 | if ( unlikely(sdom->budget <= 0) ) |
1957 | 0 | { |
1958 | 0 | spin_unlock_irqrestore(&sdom->budget_lock, flags); |
1959 | 0 | goto out; |
1960 | 0 | } |
1961 | 0 |
|
1962 | 0 |     /* Since we may have done multiple replenishments, make sure we didn't overshoot. */
1963 | 0 | sdom->budget = min(sdom->budget, sdom->tot_budget); |
1964 | 0 |
|
1965 | 0 | /* |
1966 | 0 | * As above, let's prepare the temporary list, out of the domain's |
1967 | 0 | * parked_vcpus list, now that we hold the budget_lock. Then, drop such |
1968 | 0 | * lock, and pass the list to the unparking function. |
1969 | 0 | */ |
1970 | 0 | list_splice_init(&sdom->parked_vcpus, &parked); |
1971 | 0 |
|
1972 | 0 | spin_unlock_irqrestore(&sdom->budget_lock, flags); |
1973 | 0 |
|
1974 | 0 | unpark_parked_vcpus(sdom->dom->cpupool->sched, &parked); |
1975 | 0 |
|
1976 | 0 | out: |
1977 | 0 | set_timer(sdom->repl_timer, sdom->next_repl); |
1978 | 0 | } |
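
As an example of the catch-up logic above, with a purely illustrative 10ms replenishment period:

/*
 * Suppose the timer fires 27ms after next_repl was due (e.g. something
 * delayed its handling). The unconditional do_replenish() moves next_repl
 * forward by one period; it is still <= now, so the do/while loop runs twice
 * more. Three periods' worth of tot_budget have then been added, next_repl is
 * back in the future, and the min() against tot_budget caps the pool so the
 * domain cannot bank more than one full period of budget.
 */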
1979 | | |
1980 | | #ifndef NDEBUG |
1981 | | static inline void |
1982 | | csched2_vcpu_check(struct vcpu *vc) |
1983 | 0 | { |
1984 | 0 | struct csched2_vcpu * const svc = csched2_vcpu(vc); |
1985 | 0 | struct csched2_dom * const sdom = svc->sdom; |
1986 | 0 |
|
1987 | 0 | BUG_ON( svc->vcpu != vc ); |
1988 | 0 | BUG_ON( sdom != csched2_dom(vc->domain) ); |
1989 | 0 | if ( sdom ) |
1990 | 0 | { |
1991 | 0 | BUG_ON( is_idle_vcpu(vc) ); |
1992 | 0 | BUG_ON( sdom->dom != vc->domain ); |
1993 | 0 | } |
1994 | 0 | else |
1995 | 0 | { |
1996 | 0 | BUG_ON( !is_idle_vcpu(vc) ); |
1997 | 0 | } |
1998 | 0 | SCHED_STAT_CRANK(vcpu_check); |
1999 | 0 | } |
2000 | 0 | #define CSCHED2_VCPU_CHECK(_vc) (csched2_vcpu_check(_vc)) |
2001 | | #else |
2002 | | #define CSCHED2_VCPU_CHECK(_vc) |
2003 | | #endif |
2004 | | |
2005 | | static void * |
2006 | | csched2_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd) |
2007 | 0 | { |
2008 | 0 | struct csched2_vcpu *svc; |
2009 | 0 |
|
2010 | 0 | /* Allocate per-VCPU info */ |
2011 | 0 | svc = xzalloc(struct csched2_vcpu); |
2012 | 0 | if ( svc == NULL ) |
2013 | 0 | return NULL; |
2014 | 0 |
|
2015 | 0 | INIT_LIST_HEAD(&svc->rqd_elem); |
2016 | 0 | INIT_LIST_HEAD(&svc->runq_elem); |
2017 | 0 |
|
2018 | 0 | svc->sdom = dd; |
2019 | 0 | svc->vcpu = vc; |
2020 | 0 | svc->flags = 0U; |
2021 | 0 |
|
2022 | 0 | if ( ! is_idle_vcpu(vc) ) |
2023 | 0 | { |
2024 | 0 | ASSERT(svc->sdom != NULL); |
2025 | 0 | svc->credit = CSCHED2_CREDIT_INIT; |
2026 | 0 | svc->weight = svc->sdom->weight; |
2027 | 0 | /* Starting load of 50% */ |
2028 | 0 | svc->avgload = 1ULL << (csched2_priv(ops)->load_precision_shift - 1); |
2029 | 0 | svc->load_last_update = NOW() >> LOADAVG_GRANULARITY_SHIFT; |
2030 | 0 | } |
2031 | 0 | else |
2032 | 0 | { |
2033 | 0 | ASSERT(svc->sdom == NULL); |
2034 | 0 | svc->credit = CSCHED2_IDLE_CREDIT; |
2035 | 0 | svc->weight = 0; |
2036 | 0 | } |
2037 | 0 | svc->tickled_cpu = -1; |
2038 | 0 |
|
2039 | 0 | svc->budget = STIME_MAX; |
2040 | 0 | svc->budget_quota = 0; |
2041 | 0 | INIT_LIST_HEAD(&svc->parked_elem); |
2042 | 0 |
|
2043 | 0 | SCHED_STAT_CRANK(vcpu_alloc); |
2044 | 0 |
|
2045 | 0 | return svc; |
2046 | 0 | } |
2047 | | |
2048 | | static void |
2049 | | csched2_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc) |
2050 | 0 | { |
2051 | 0 | struct csched2_vcpu * const svc = csched2_vcpu(vc); |
2052 | 0 |
|
2053 | 0 | ASSERT(!is_idle_vcpu(vc)); |
2054 | 0 | SCHED_STAT_CRANK(vcpu_sleep); |
2055 | 0 |
|
2056 | 0 | if ( curr_on_cpu(vc->processor) == vc ) |
2057 | 0 | { |
2058 | 0 | tickle_cpu(vc->processor, svc->rqd); |
2059 | 0 | } |
2060 | 0 | else if ( vcpu_on_runq(svc) ) |
2061 | 0 | { |
2062 | 0 | ASSERT(svc->rqd == c2rqd(ops, vc->processor)); |
2063 | 0 | update_load(ops, svc->rqd, svc, -1, NOW()); |
2064 | 0 | runq_remove(svc); |
2065 | 0 | } |
2066 | 0 | else if ( svc->flags & CSFLAG_delayed_runq_add ) |
2067 | 0 | __clear_bit(__CSFLAG_delayed_runq_add, &svc->flags); |
2068 | 0 | } |
2069 | | |
2070 | | static void |
2071 | | csched2_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) |
2072 | 0 | { |
2073 | 0 | struct csched2_vcpu * const svc = csched2_vcpu(vc); |
2074 | 0 | unsigned int cpu = vc->processor; |
2075 | 0 | s_time_t now; |
2076 | 0 |
|
2077 | 0 | ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock)); |
2078 | 0 |
|
2079 | 0 | ASSERT(!is_idle_vcpu(vc)); |
2080 | 0 |
|
2081 | 0 | if ( unlikely(curr_on_cpu(cpu) == vc) ) |
2082 | 0 | { |
2083 | 0 | SCHED_STAT_CRANK(vcpu_wake_running); |
2084 | 0 | goto out; |
2085 | 0 | } |
2086 | 0 |
|
2087 | 0 | if ( unlikely(vcpu_on_runq(svc)) ) |
2088 | 0 | { |
2089 | 0 | SCHED_STAT_CRANK(vcpu_wake_onrunq); |
2090 | 0 | goto out; |
2091 | 0 | } |
2092 | 0 |
|
2093 | 0 | if ( likely(vcpu_runnable(vc)) ) |
2094 | 0 | SCHED_STAT_CRANK(vcpu_wake_runnable); |
2095 | 0 | else |
2096 | 0 | SCHED_STAT_CRANK(vcpu_wake_not_runnable); |
2097 | 0 |
|
2098 | 0 | /* If the context hasn't been saved for this vcpu yet, we can't put it on |
2099 | 0 | * another runqueue. Instead, we set a flag so that it will be put on the runqueue |
2100 | 0 | * after the context has been saved. */ |
2101 | 0 | if ( unlikely(svc->flags & CSFLAG_scheduled) ) |
2102 | 0 | { |
2103 | 0 | __set_bit(__CSFLAG_delayed_runq_add, &svc->flags); |
2104 | 0 | goto out; |
2105 | 0 | } |
2106 | 0 |
|
2107 | 0 | /* Add into the new runqueue if necessary */ |
2108 | 0 | if ( svc->rqd == NULL ) |
2109 | 0 | runq_assign(ops, vc); |
2110 | 0 | else |
2111 | 0 | ASSERT(c2rqd(ops, vc->processor) == svc->rqd ); |
2112 | 0 |
|
2113 | 0 | now = NOW(); |
2114 | 0 |
|
2115 | 0 | update_load(ops, svc->rqd, svc, 1, now); |
2116 | 0 | |
2117 | 0 | /* Put the VCPU on the runq */ |
2118 | 0 | runq_insert(ops, svc); |
2119 | 0 | runq_tickle(ops, svc, now); |
2120 | 0 |
|
2121 | 0 | out: |
2122 | 0 | return; |
2123 | 0 | } |
2124 | | |
2125 | | static void |
2126 | | csched2_vcpu_yield(const struct scheduler *ops, struct vcpu *v) |
2127 | 0 | { |
2128 | 0 | struct csched2_vcpu * const svc = csched2_vcpu(v); |
2129 | 0 |
|
2130 | 0 | __set_bit(__CSFLAG_vcpu_yield, &svc->flags); |
2131 | 0 | } |
2132 | | |
2133 | | static void |
2134 | | csched2_context_saved(const struct scheduler *ops, struct vcpu *vc) |
2135 | 0 | { |
2136 | 0 | struct csched2_vcpu * const svc = csched2_vcpu(vc); |
2137 | 0 | spinlock_t *lock = vcpu_schedule_lock_irq(vc); |
2138 | 0 | s_time_t now = NOW(); |
2139 | 0 | LIST_HEAD(were_parked); |
2140 | 0 |
|
2141 | 0 | BUG_ON( !is_idle_vcpu(vc) && svc->rqd != c2rqd(ops, vc->processor)); |
2142 | 0 | ASSERT(is_idle_vcpu(vc) || svc->rqd == c2rqd(ops, vc->processor)); |
2143 | 0 |
|
2144 | 0 | /* This vcpu is now eligible to be put on the runqueue again */ |
2145 | 0 | __clear_bit(__CSFLAG_scheduled, &svc->flags); |
2146 | 0 |
|
2147 | 0 | if ( unlikely(has_cap(svc) && svc->budget > 0) ) |
2148 | 0 | vcpu_return_budget(svc, &were_parked); |
2149 | 0 |
|
2150 | 0 | /* If someone wants it on the runqueue, put it there. */ |
2151 | 0 | /* |
2152 | 0 | * NB: We can get rid of CSFLAG_scheduled by checking for |
2153 | 0 | * vc->is_running and vcpu_on_runq(svc) here. However, |
2154 | 0 | * since we're accessing the flags cacheline anyway, |
2155 | 0 | * it seems a bit pointless; especially as we have plenty of |
2156 | 0 | * bits free. |
2157 | 0 | */ |
2158 | 0 | if ( __test_and_clear_bit(__CSFLAG_delayed_runq_add, &svc->flags) |
2159 | 0 | && likely(vcpu_runnable(vc)) ) |
2160 | 0 | { |
2161 | 0 | ASSERT(!vcpu_on_runq(svc)); |
2162 | 0 |
|
2163 | 0 | runq_insert(ops, svc); |
2164 | 0 | runq_tickle(ops, svc, now); |
2165 | 0 | } |
2166 | 0 | else if ( !is_idle_vcpu(vc) ) |
2167 | 0 | update_load(ops, svc->rqd, svc, -1, now); |
2168 | 0 |
|
2169 | 0 | vcpu_schedule_unlock_irq(lock, vc); |
2170 | 0 |
|
2171 | 0 | unpark_parked_vcpus(ops, &were_parked); |
2172 | 0 | } |
2173 | | |
2174 | 0 | #define MAX_LOAD (STIME_MAX) |
2175 | | static int |
2176 | | csched2_cpu_pick(const struct scheduler *ops, struct vcpu *vc) |
2177 | 0 | { |
2178 | 0 | struct csched2_private *prv = csched2_priv(ops); |
2179 | 0 | int i, min_rqi = -1, min_s_rqi = -1; |
2180 | 0 | unsigned int new_cpu, cpu = vc->processor; |
2181 | 0 | struct csched2_vcpu *svc = csched2_vcpu(vc); |
2182 | 0 | s_time_t min_avgload = MAX_LOAD, min_s_avgload = MAX_LOAD; |
2183 | 0 | bool has_soft; |
2184 | 0 |
|
2185 | 0 | ASSERT(!cpumask_empty(&prv->active_queues)); |
2186 | 0 |
|
2187 | 0 | SCHED_STAT_CRANK(pick_cpu); |
2188 | 0 |
|
2189 | 0 | /* Locking: |
2190 | 0 | * - Runqueue lock of vc->processor is already locked |
2191 | 0 | * - Need to grab prv lock to make sure active runqueues don't |
2192 | 0 | * change |
2193 | 0 | * - Need to grab locks for other runqueues while checking |
2194 | 0 | * avgload |
2195 | 0 | * Locking constraint is: |
2196 | 0 | * - Lock prv before runqueue locks |
2197 | 0 | * - Trylock between runqueue locks (no ordering) |
2198 | 0 | * |
2199 | 0 | * Since one of the runqueue locks is already held, we can't |
2200 | 0 | * just grab the prv lock. Instead, we'll have to trylock, and |
2201 | 0 | * do something else reasonable if we fail. |
2202 | 0 | */ |
2203 | 0 | ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock)); |
2204 | 0 |
|
2205 | 0 | if ( !read_trylock(&prv->lock) ) |
2206 | 0 | { |
2207 | 0 | /* We may be here because someone requested us to migrate. */ |
2208 | 0 | __clear_bit(__CSFLAG_runq_migrate_request, &svc->flags); |
2209 | 0 | new_cpu = get_fallback_cpu(svc); |
2210 | 0 | /* |
2211 | 0 | * Tracing of runq and its load won't be accurate, since we could |
2212 | 0 | * not get the lock, but at least we will output the chosen pcpu. |
2213 | 0 | */ |
2214 | 0 | goto out; |
2215 | 0 | } |
2216 | 0 |
|
2217 | 0 | cpumask_and(cpumask_scratch_cpu(cpu), vc->cpu_hard_affinity, |
2218 | 0 | cpupool_domain_cpumask(vc->domain)); |
2219 | 0 |
|
2220 | 0 | /* |
2221 | 0 | * First check to see if we're here because someone else suggested a place |
2222 | 0 | * for us to move. |
2223 | 0 | */ |
2224 | 0 | if ( __test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) ) |
2225 | 0 | { |
2226 | 0 | if ( unlikely(svc->migrate_rqd->id < 0) ) |
2227 | 0 | { |
2228 | 0 | printk(XENLOG_WARNING "%s: target runqueue disappeared!\n", |
2229 | 0 | __func__); |
2230 | 0 | } |
2231 | 0 | else if ( cpumask_intersects(cpumask_scratch_cpu(cpu), |
2232 | 0 | &svc->migrate_rqd->active) ) |
2233 | 0 | { |
2234 | 0 | /* |
2235 | 0 | * If we've been asked to move to migrate_rqd, we should just do |
2236 | 0 | * that, which we actually do by returning one cpu from that runq. |
2237 | 0 | * There is no need to take care of soft affinity, as that will |
2238 | 0 | * happen in runq_tickle(). |
2239 | 0 | */ |
2240 | 0 | cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), |
2241 | 0 | &svc->migrate_rqd->active); |
2242 | 0 | new_cpu = cpumask_cycle(svc->migrate_rqd->pick_bias, |
2243 | 0 | cpumask_scratch_cpu(cpu)); |
2244 | 0 |
|
2245 | 0 | svc->migrate_rqd->pick_bias = new_cpu; |
2246 | 0 | goto out_up; |
2247 | 0 | } |
2248 | 0 | /* Fall-through to normal cpu pick */ |
2249 | 0 | } |
2250 | 0 |
|
2251 | 0 | /* |
2252 | 0 | * What we want is: |
2253 | 0 | * - if we have soft affinity, the runqueue with the lowest average |
2254 | 0 | * load, among the ones that contain cpus in our soft affinity; this |
2255 | 0 | * represents the best runq on which we would want to run. |
2256 | 0 | * - the runqueue with the lowest average load among the ones that |
2257 | 0 | * contains cpus in our hard affinity; this represent the best runq |
2258 | 0 | * on which we can run. |
2259 | 0 | * |
2260 | 0 | * Find both runqueues in one pass. |
2261 | 0 | */ |
2262 | 0 | has_soft = has_soft_affinity(vc, vc->cpu_hard_affinity); |
2263 | 0 | for_each_cpu(i, &prv->active_queues) |
2264 | 0 | { |
2265 | 0 | struct csched2_runqueue_data *rqd; |
2266 | 0 | s_time_t rqd_avgload = MAX_LOAD; |
2267 | 0 |
|
2268 | 0 | rqd = prv->rqd + i; |
2269 | 0 |
|
2270 | 0 | /* |
2271 | 0 | * If none of the cpus of this runqueue is in svc's hard-affinity, |
2272 | 0 | * skip the runqueue. |
2273 | 0 | * |
2274 | 0 | * Note that, in case svc's hard-affinity has changed, this is the |
2275 | 0 | * first time when we see such change, so it is indeed possible |
2276 | 0 | * that we end up skipping svc's current runqueue. |
2277 | 0 | */ |
2278 | 0 | if ( !cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active) ) |
2279 | 0 | continue; |
2280 | 0 |
|
2281 | 0 | /* |
2282 | 0 | * If checking a different runqueue, grab the lock, read the avg, |
2283 | 0 | * and then release the lock. |
2284 | 0 | * |
2285 | 0 | * If on our own runqueue, don't grab or release the lock; |
2286 | 0 | * but subtract our own load from the runqueue load to simulate |
2287 | 0 | * impartiality. |
2288 | 0 | */ |
2289 | 0 | if ( rqd == svc->rqd ) |
2290 | 0 | { |
2291 | 0 | rqd_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0); |
2292 | 0 | } |
2293 | 0 | else if ( spin_trylock(&rqd->lock) ) |
2294 | 0 | { |
2295 | 0 | rqd_avgload = rqd->b_avgload; |
2296 | 0 | spin_unlock(&rqd->lock); |
2297 | 0 | } |
2298 | 0 |
|
2299 | 0 | /* |
2300 | 0 | * if svc has a soft-affinity, and some cpus of rqd are part of it, |
2301 | 0 | * see if we need to update the "soft-affinity minimum". |
2302 | 0 | */ |
2303 | 0 | if ( has_soft && |
2304 | 0 | rqd_avgload < min_s_avgload ) |
2305 | 0 | { |
2306 | 0 | cpumask_t mask; |
2307 | 0 |
|
2308 | 0 | cpumask_and(&mask, cpumask_scratch_cpu(cpu), &rqd->active); |
2309 | 0 | if ( cpumask_intersects(&mask, svc->vcpu->cpu_soft_affinity) ) |
2310 | 0 | { |
2311 | 0 | min_s_avgload = rqd_avgload; |
2312 | 0 | min_s_rqi = i; |
2313 | 0 | } |
2314 | 0 | } |
2315 | 0 | /* In any case, keep the "hard-affinity minimum" updated too. */ |
2316 | 0 | if ( rqd_avgload < min_avgload ) |
2317 | 0 | { |
2318 | 0 | min_avgload = rqd_avgload; |
2319 | 0 | min_rqi = i; |
2320 | 0 | } |
2321 | 0 | } |
2322 | 0 |
|
2323 | 0 | if ( has_soft && min_s_rqi != -1 ) |
2324 | 0 | { |
2325 | 0 | /* |
2326 | 0 | * We have soft affinity, and we have a candidate runq, so go for it. |
2327 | 0 | * |
2328 | 0 | * Note that, to obtain the soft-affinity mask, we "just" put what we |
2329 | 0 | * have in cpumask_scratch in && with vc->cpu_soft_affinity. This is |
2330 | 0 | * ok because: |
2331 | 0 | * - we know that vc->cpu_hard_affinity and vc->cpu_soft_affinity have |
2332 | 0 | * a non-empty intersection (because has_soft is true); |
2333 | 0 | * - we have vc->cpu_hard_affinity & cpupool_domain_cpumask() already |
2334 | 0 |      *    in cpumask_scratch, so we save a lot of work by doing it like this.
2335 | 0 | * |
2336 | 0 | * It's kind of like open coding affinity_balance_cpumask() but, in |
2337 | 0 | * this specific case, calling that would mean a lot of (unnecessary) |
2338 | 0 | * cpumask operations. |
2339 | 0 | */ |
2340 | 0 | cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), |
2341 | 0 | vc->cpu_soft_affinity); |
2342 | 0 | cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), |
2343 | 0 | &prv->rqd[min_s_rqi].active); |
2344 | 0 | } |
2345 | 0 | else if ( min_rqi != -1 ) |
2346 | 0 | { |
2347 | 0 | /* |
2348 | 0 | * Either we don't have soft-affinity, or we do, but we did not find |
2349 | 0 | * any suitable runq. But we did find one when considering hard |
2350 | 0 | * affinity, so go for it. |
2351 | 0 | * |
2352 | 0 | * cpumask_scratch already has vc->cpu_hard_affinity & |
2353 | 0 | * cpupool_domain_cpumask() in it, so it's enough that we filter |
2354 | 0 | * with the cpus of the runq. |
2355 | 0 | */ |
2356 | 0 | cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), |
2357 | 0 | &prv->rqd[min_rqi].active); |
2358 | 0 | } |
2359 | 0 | else |
2360 | 0 | { |
2361 | 0 | /* |
2362 | 0 | * We didn't find anyone at all (most likely because of spinlock |
2363 | 0 | * contention). |
2364 | 0 | */ |
2365 | 0 | new_cpu = get_fallback_cpu(svc); |
2366 | 0 | min_rqi = c2r(new_cpu); |
2367 | 0 | min_avgload = prv->rqd[min_rqi].b_avgload; |
2368 | 0 | goto out_up; |
2369 | 0 | } |
2370 | 0 |
|
2371 | 0 | new_cpu = cpumask_cycle(prv->rqd[min_rqi].pick_bias, |
2372 | 0 | cpumask_scratch_cpu(cpu)); |
2373 | 0 | prv->rqd[min_rqi].pick_bias = new_cpu; |
2374 | 0 | BUG_ON(new_cpu >= nr_cpu_ids); |
2375 | 0 |
|
2376 | 0 | out_up: |
2377 | 0 | read_unlock(&prv->lock); |
2378 | 0 | out: |
2379 | 0 | if ( unlikely(tb_init_done) ) |
2380 | 0 | { |
2381 | 0 | struct { |
2382 | 0 | uint64_t b_avgload; |
2383 | 0 | unsigned vcpu:16, dom:16; |
2384 | 0 | unsigned rq_id:16, new_cpu:16; |
2385 | 0 | } d; |
2386 | 0 | d.dom = vc->domain->domain_id; |
2387 | 0 | d.vcpu = vc->vcpu_id; |
2388 | 0 | d.rq_id = min_rqi; |
2389 | 0 | d.b_avgload = min_avgload; |
2390 | 0 | d.new_cpu = new_cpu; |
2391 | 0 | __trace_var(TRC_CSCHED2_PICKED_CPU, 1, |
2392 | 0 | sizeof(d), |
2393 | 0 | (unsigned char *)&d); |
2394 | 0 | } |
2395 | 0 |
|
2396 | 0 | return new_cpu; |
2397 | 0 | } |
2398 | | |
2399 | | /* Working state of the load-balancing algorithm */ |
2400 | | typedef struct { |
2401 | | /* NB: Modified by consider() */ |
2402 | | s_time_t load_delta; |
2403 | | struct csched2_vcpu * best_push_svc, *best_pull_svc; |
2404 | | /* NB: Read by consider() */ |
2405 | | struct csched2_runqueue_data *lrqd; |
2406 | | struct csched2_runqueue_data *orqd; |
2407 | | } balance_state_t; |
2408 | | |
2409 | | static void consider(balance_state_t *st, |
2410 | | struct csched2_vcpu *push_svc, |
2411 | | struct csched2_vcpu *pull_svc) |
2412 | 0 | { |
2413 | 0 | s_time_t l_load, o_load, delta; |
2414 | 0 |
|
2415 | 0 | l_load = st->lrqd->b_avgload; |
2416 | 0 | o_load = st->orqd->b_avgload; |
2417 | 0 | if ( push_svc ) |
2418 | 0 | { |
2419 | 0 | /* What happens to the load on both if we push? */ |
2420 | 0 | l_load -= push_svc->avgload; |
2421 | 0 | o_load += push_svc->avgload; |
2422 | 0 | } |
2423 | 0 | if ( pull_svc ) |
2424 | 0 | { |
2425 | 0 | /* What happens to the load on both if we pull? */ |
2426 | 0 | l_load += pull_svc->avgload; |
2427 | 0 | o_load -= pull_svc->avgload; |
2428 | 0 | } |
2429 | 0 |
|
2430 | 0 | delta = l_load - o_load; |
2431 | 0 | if ( delta < 0 ) |
2432 | 0 | delta = -delta; |
2433 | 0 |
|
2434 | 0 | if ( delta < st->load_delta ) |
2435 | 0 | { |
2436 | 0 | st->load_delta = delta; |
2437 | 0 | st->best_push_svc=push_svc; |
2438 | 0 | st->best_pull_svc=pull_svc; |
2439 | 0 | } |
2440 | 0 | } |
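
A small numeric illustration of what consider() optimizes (loads expressed in "cpus", values invented):

/*
 * If lrqd carries 2.3 cpus of load and orqd 1.1, the current imbalance is
 * 1.2. Pushing a vcpu with avgload 0.5 leaves 1.8 vs 1.6, |delta| = 0.2;
 * pulling one with avgload 0.3 instead gives 2.6 vs 0.8, |delta| = 1.8.
 * consider() remembers whichever push, pull, or push+pull pair yields the
 * smallest |delta| seen so far, and balance_load() then performs that move.
 */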
2441 | | |
2442 | | |
2443 | | static void migrate(const struct scheduler *ops, |
2444 | | struct csched2_vcpu *svc, |
2445 | | struct csched2_runqueue_data *trqd, |
2446 | | s_time_t now) |
2447 | 0 | { |
2448 | 0 | int cpu = svc->vcpu->processor; |
2449 | 0 |
|
2450 | 0 | if ( unlikely(tb_init_done) ) |
2451 | 0 | { |
2452 | 0 | struct { |
2453 | 0 | unsigned vcpu:16, dom:16; |
2454 | 0 | unsigned rqi:16, trqi:16; |
2455 | 0 | } d; |
2456 | 0 | d.dom = svc->vcpu->domain->domain_id; |
2457 | 0 | d.vcpu = svc->vcpu->vcpu_id; |
2458 | 0 | d.rqi = svc->rqd->id; |
2459 | 0 | d.trqi = trqd->id; |
2460 | 0 | __trace_var(TRC_CSCHED2_MIGRATE, 1, |
2461 | 0 | sizeof(d), |
2462 | 0 | (unsigned char *)&d); |
2463 | 0 | } |
2464 | 0 |
|
2465 | 0 | if ( svc->flags & CSFLAG_scheduled ) |
2466 | 0 | { |
2467 | 0 | /* It's running; mark it to migrate. */ |
2468 | 0 | svc->migrate_rqd = trqd; |
2469 | 0 | __set_bit(_VPF_migrating, &svc->vcpu->pause_flags); |
2470 | 0 | __set_bit(__CSFLAG_runq_migrate_request, &svc->flags); |
2471 | 0 | SCHED_STAT_CRANK(migrate_requested); |
2472 | 0 | tickle_cpu(cpu, svc->rqd); |
2473 | 0 | } |
2474 | 0 | else |
2475 | 0 | { |
2476 | 0 | int on_runq = 0; |
2477 | 0 | /* It's not running; just move it */ |
2478 | 0 | if ( vcpu_on_runq(svc) ) |
2479 | 0 | { |
2480 | 0 | runq_remove(svc); |
2481 | 0 | update_load(ops, svc->rqd, NULL, -1, now); |
2482 | 0 | on_runq = 1; |
2483 | 0 | } |
2484 | 0 | _runq_deassign(svc); |
2485 | 0 |
|
2486 | 0 | cpumask_and(cpumask_scratch_cpu(cpu), svc->vcpu->cpu_hard_affinity, |
2487 | 0 | cpupool_domain_cpumask(svc->vcpu->domain)); |
2488 | 0 | cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), |
2489 | 0 | &trqd->active); |
2490 | 0 | svc->vcpu->processor = cpumask_cycle(trqd->pick_bias, |
2491 | 0 | cpumask_scratch_cpu(cpu)); |
2492 | 0 | trqd->pick_bias = svc->vcpu->processor; |
2493 | 0 | ASSERT(svc->vcpu->processor < nr_cpu_ids); |
2494 | 0 |
|
2495 | 0 | _runq_assign(svc, trqd); |
2496 | 0 | if ( on_runq ) |
2497 | 0 | { |
2498 | 0 | update_load(ops, svc->rqd, NULL, 1, now); |
2499 | 0 | runq_insert(ops, svc); |
2500 | 0 | runq_tickle(ops, svc, now); |
2501 | 0 | SCHED_STAT_CRANK(migrate_on_runq); |
2502 | 0 | } |
2503 | 0 | else |
2504 | 0 | SCHED_STAT_CRANK(migrate_no_runq); |
2505 | 0 | } |
2506 | 0 | } |
2507 | | |
2508 | | /* |
2509 | | * It makes sense considering migrating svc to rqd, if: |
2510 | | * - svc is not already flagged to migrate, |
2511 | | * - if svc is allowed to run on at least one of the pcpus of rqd. |
2512 | | */ |
2513 | | static bool vcpu_is_migrateable(struct csched2_vcpu *svc, |
2514 | | struct csched2_runqueue_data *rqd) |
2515 | 0 | { |
2516 | 0 | struct vcpu *v = svc->vcpu; |
2517 | 0 | int cpu = svc->vcpu->processor; |
2518 | 0 |
|
2519 | 0 | cpumask_and(cpumask_scratch_cpu(cpu), v->cpu_hard_affinity, |
2520 | 0 | cpupool_domain_cpumask(v->domain)); |
2521 | 0 |
|
2522 | 0 | return !(svc->flags & CSFLAG_runq_migrate_request) && |
2523 | 0 | cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active); |
2524 | 0 | } |
2525 | | |
2526 | | static void balance_load(const struct scheduler *ops, int cpu, s_time_t now) |
2527 | 0 | { |
2528 | 0 | struct csched2_private *prv = csched2_priv(ops); |
2529 | 0 | int i, max_delta_rqi = -1; |
2530 | 0 | struct list_head *push_iter, *pull_iter; |
2531 | 0 | bool inner_load_updated = 0; |
2532 | 0 |
|
2533 | 0 | balance_state_t st = { .best_push_svc = NULL, .best_pull_svc = NULL }; |
2534 | 0 |
|
2535 | 0 | /* |
2536 | 0 | * Basic algorithm: Push, pull, or swap. |
2537 | 0 | * - Find the runqueue with the furthest load distance |
2538 | 0 | * - Find a pair that makes the difference the least (where one |
2539 | 0 | * on either side may be empty). |
2540 | 0 | */ |
2541 | 0 |
|
2542 | 0 | ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock)); |
2543 | 0 | st.lrqd = c2rqd(ops, cpu); |
2544 | 0 |
|
2545 | 0 | update_runq_load(ops, st.lrqd, 0, now); |
2546 | 0 |
|
2547 | 0 | retry: |
2548 | 0 | if ( !read_trylock(&prv->lock) ) |
2549 | 0 | return; |
2550 | 0 |
|
2551 | 0 | st.load_delta = 0; |
2552 | 0 |
|
2553 | 0 | for_each_cpu(i, &prv->active_queues) |
2554 | 0 | { |
2555 | 0 | s_time_t delta; |
2556 | 0 | |
2557 | 0 | st.orqd = prv->rqd + i; |
2558 | 0 |
|
2559 | 0 | if ( st.orqd == st.lrqd |
2560 | 0 | || !spin_trylock(&st.orqd->lock) ) |
2561 | 0 | continue; |
2562 | 0 |
|
2563 | 0 | update_runq_load(ops, st.orqd, 0, now); |
2564 | 0 | |
2565 | 0 | delta = st.lrqd->b_avgload - st.orqd->b_avgload; |
2566 | 0 | if ( delta < 0 ) |
2567 | 0 | delta = -delta; |
2568 | 0 |
|
2569 | 0 | if ( delta > st.load_delta ) |
2570 | 0 | { |
2571 | 0 | st.load_delta = delta; |
2572 | 0 | max_delta_rqi = i; |
2573 | 0 | } |
2574 | 0 |
|
2575 | 0 | spin_unlock(&st.orqd->lock); |
2576 | 0 | } |
2577 | 0 |
|
2578 | 0 | /* Minimize holding the private scheduler lock. */ |
2579 | 0 | read_unlock(&prv->lock); |
2580 | 0 | if ( max_delta_rqi == -1 ) |
2581 | 0 | goto out; |
2582 | 0 |
|
2583 | 0 | { |
2584 | 0 | s_time_t load_max; |
2585 | 0 | int cpus_max; |
2586 | 0 |
|
2587 | 0 | |
2588 | 0 | load_max = st.lrqd->b_avgload; |
2589 | 0 | if ( st.orqd->b_avgload > load_max ) |
2590 | 0 | load_max = st.orqd->b_avgload; |
2591 | 0 |
|
2592 | 0 | cpus_max = cpumask_weight(&st.lrqd->active); |
2593 | 0 | i = cpumask_weight(&st.orqd->active); |
2594 | 0 | if ( i > cpus_max ) |
2595 | 0 | cpus_max = i; |
2596 | 0 |
|
2597 | 0 | if ( unlikely(tb_init_done) ) |
2598 | 0 | { |
2599 | 0 | struct { |
2600 | 0 | unsigned lrq_id:16, orq_id:16; |
2601 | 0 | unsigned load_delta; |
2602 | 0 | } d; |
2603 | 0 | d.lrq_id = st.lrqd->id; |
2604 | 0 | d.orq_id = st.orqd->id; |
2605 | 0 | d.load_delta = st.load_delta; |
2606 | 0 | __trace_var(TRC_CSCHED2_LOAD_CHECK, 1, |
2607 | 0 | sizeof(d), |
2608 | 0 | (unsigned char *)&d); |
2609 | 0 | } |
2610 | 0 |
|
2611 | 0 | /* |
2612 | 0 |          * If we're under 100% capacity, only shift if the load difference
2613 | 0 |          * is > 1. Otherwise, shift if the difference is above 12.5%.
2614 | 0 | */ |
2615 | 0 | if ( load_max < ((s_time_t)cpus_max << prv->load_precision_shift) ) |
2616 | 0 | { |
2617 | 0 | if ( st.load_delta < (1ULL << (prv->load_precision_shift + |
2618 | 0 | opt_underload_balance_tolerance)) ) |
2619 | 0 | goto out; |
2620 | 0 | } |
2621 | 0 | else |
2622 | 0 | if ( st.load_delta < (1ULL << (prv->load_precision_shift + |
2623 | 0 | opt_overload_balance_tolerance)) ) |
2624 | 0 | goto out; |
2625 | 0 | } |
2626 | 0 | |
2627 | 0 | /* Try to grab the other runqueue lock; if it's been taken in the |
2628 | 0 | * meantime, try the process over again. This can't deadlock |
2629 | 0 | * because if it doesn't get any other rqd locks, it will simply |
2630 | 0 | * give up and return. */ |
2631 | 0 | st.orqd = prv->rqd + max_delta_rqi; |
2632 | 0 | if ( !spin_trylock(&st.orqd->lock) ) |
2633 | 0 | goto retry; |
2634 | 0 |
|
2635 | 0 | /* Make sure the runqueue hasn't been deactivated since we released prv->lock */ |
2636 | 0 | if ( unlikely(st.orqd->id < 0) ) |
2637 | 0 | goto out_up; |
2638 | 0 |
|
2639 | 0 | if ( unlikely(tb_init_done) ) |
2640 | 0 | { |
2641 | 0 | struct { |
2642 | 0 | uint64_t lb_avgload, ob_avgload; |
2643 | 0 | unsigned lrq_id:16, orq_id:16; |
2644 | 0 | } d; |
2645 | 0 | d.lrq_id = st.lrqd->id; |
2646 | 0 | d.lb_avgload = st.lrqd->b_avgload; |
2647 | 0 | d.orq_id = st.orqd->id; |
2648 | 0 | d.ob_avgload = st.orqd->b_avgload; |
2649 | 0 | __trace_var(TRC_CSCHED2_LOAD_BALANCE, 1, |
2650 | 0 | sizeof(d), |
2651 | 0 | (unsigned char *)&d); |
2652 | 0 | } |
2653 | 0 |
|
2654 | 0 | SCHED_STAT_CRANK(acct_load_balance); |
2655 | 0 |
|
2656 | 0 | /* Look for "swap" which gives the best load average |
2657 | 0 | * FIXME: O(n^2)! */ |
2658 | 0 |
|
2659 | 0 | /* Reuse load delta (as we're trying to minimize it) */ |
2660 | 0 | list_for_each( push_iter, &st.lrqd->svc ) |
2661 | 0 | { |
2662 | 0 | struct csched2_vcpu * push_svc = list_entry(push_iter, struct csched2_vcpu, rqd_elem); |
2663 | 0 |
|
2664 | 0 | update_svc_load(ops, push_svc, 0, now); |
2665 | 0 |
|
2666 | 0 | if ( !vcpu_is_migrateable(push_svc, st.orqd) ) |
2667 | 0 | continue; |
2668 | 0 |
|
2669 | 0 | list_for_each( pull_iter, &st.orqd->svc ) |
2670 | 0 | { |
2671 | 0 | struct csched2_vcpu * pull_svc = list_entry(pull_iter, struct csched2_vcpu, rqd_elem); |
2672 | 0 | |
2673 | 0 | if ( !inner_load_updated ) |
2674 | 0 | update_svc_load(ops, pull_svc, 0, now); |
2675 | 0 | |
2676 | 0 | if ( !vcpu_is_migrateable(pull_svc, st.lrqd) ) |
2677 | 0 | continue; |
2678 | 0 |
|
2679 | 0 | consider(&st, push_svc, pull_svc); |
2680 | 0 | } |
2681 | 0 |
|
2682 | 0 | inner_load_updated = 1; |
2683 | 0 |
|
2684 | 0 | /* Consider push only */ |
2685 | 0 | consider(&st, push_svc, NULL); |
2686 | 0 | } |
2687 | 0 |
|
2688 | 0 | list_for_each( pull_iter, &st.orqd->svc ) |
2689 | 0 | { |
2690 | 0 | struct csched2_vcpu * pull_svc = list_entry(pull_iter, struct csched2_vcpu, rqd_elem); |
2691 | 0 | |
2692 | 0 | if ( !vcpu_is_migrateable(pull_svc, st.lrqd) ) |
2693 | 0 | continue; |
2694 | 0 |
|
2695 | 0 | /* Consider pull only */ |
2696 | 0 | consider(&st, NULL, pull_svc); |
2697 | 0 | } |
2698 | 0 |
|
2699 | 0 | /* OK, now we have some candidates; do the moving */ |
2700 | 0 | if ( st.best_push_svc ) |
2701 | 0 | migrate(ops, st.best_push_svc, st.orqd, now); |
2702 | 0 | if ( st.best_pull_svc ) |
2703 | 0 | migrate(ops, st.best_pull_svc, st.lrqd, now); |
2704 | 0 |
|
2705 | 0 | out_up: |
2706 | 0 | spin_unlock(&st.orqd->lock); |
2707 | 0 | out: |
2708 | 0 | return; |
2709 | 0 | } |
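
To make the balancing thresholds used above concrete, assuming load_precision_shift = 18 and tolerances of 0 (underload) and -3 (overload); these values are assumptions for illustration, not necessarily what the hypervisor was booted with:

/*
 * While the busier runqueue is below 100% capacity, the threshold is
 * 1 << (18 + 0), i.e. a full cpu's worth of load difference, so balancing
 * only happens for gross imbalances. Once the system is saturated the
 * threshold becomes 1 << (18 - 3), i.e. 12.5% of a cpu, so even small
 * imbalances are corrected when every cpu's time actually matters.
 */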
2710 | | |
2711 | | static void |
2712 | | csched2_vcpu_migrate( |
2713 | | const struct scheduler *ops, struct vcpu *vc, unsigned int new_cpu) |
2714 | 0 | { |
2715 | 0 | struct domain *d = vc->domain; |
2716 | 0 | struct csched2_vcpu * const svc = csched2_vcpu(vc); |
2717 | 0 | struct csched2_runqueue_data *trqd; |
2718 | 0 | s_time_t now = NOW(); |
2719 | 0 |
|
2720 | 0 | /* |
2721 | 0 | * Being passed a target pCPU which is outside of our cpupool is only |
2722 | 0 | * valid if we are shutting down (or doing ACPI suspend), and we are |
2723 | 0 | * moving everyone to BSP, no matter whether or not BSP is inside our |
2724 | 0 | * cpupool. |
2725 | 0 | * |
2726 | 0 | * And since there indeed is the chance that it is not part of it, all |
2727 | 0 | * we must do is remove _and_ unassign the vCPU from any runqueue, as |
2728 | 0 | * well as updating v->processor with the target, so that the suspend |
2729 | 0 | * process can continue. |
2730 | 0 | * |
2731 | 0 | * It will then be during resume that a new, meaningful, value for |
2732 | 0 | * v->processor will be chosen, and during actual domain unpause that |
2733 | 0 | * the vCPU will be assigned to and added to the proper runqueue. |
2734 | 0 | */ |
2735 | 0 | if ( unlikely(!cpumask_test_cpu(new_cpu, cpupool_domain_cpumask(d))) ) |
2736 | 0 | { |
2737 | 0 | ASSERT(system_state == SYS_STATE_suspend); |
2738 | 0 | if ( vcpu_on_runq(svc) ) |
2739 | 0 | { |
2740 | 0 | runq_remove(svc); |
2741 | 0 | update_load(ops, svc->rqd, NULL, -1, now); |
2742 | 0 | } |
2743 | 0 | _runq_deassign(svc); |
2744 | 0 | vc->processor = new_cpu; |
2745 | 0 | return; |
2746 | 0 | } |
2747 | 0 |
|
2748 | 0 | /* If here, new_cpu must be a valid Credit2 pCPU, and in our affinity. */ |
2749 | 0 | ASSERT(cpumask_test_cpu(new_cpu, &csched2_priv(ops)->initialized)); |
2750 | 0 | ASSERT(cpumask_test_cpu(new_cpu, vc->cpu_hard_affinity)); |
2751 | 0 |
|
2752 | 0 | trqd = c2rqd(ops, new_cpu); |
2753 | 0 |
|
2754 | 0 | /* |
2755 | 0 | * Do the actual movement toward new_cpu, and update vc->processor. |
2756 | 0 | * If we are changing runqueue, migrate() takes care of everything. |
2757 | 0 | * If we are not changing runqueue, we need to update vc->processor |
2758 | 0 | * here. In fact, if, for instance, we are here because the vcpu's |
2759 | 0 | * hard affinity changed, we don't want to risk leaving vc->processor |
2760 | 0 | * pointing to a pcpu where we can't run any longer. |
2761 | 0 | */ |
2762 | 0 | if ( trqd != svc->rqd ) |
2763 | 0 | migrate(ops, svc, trqd, now); |
2764 | 0 | else |
2765 | 0 | vc->processor = new_cpu; |
2766 | 0 | } |
2767 | | |
2768 | | static int |
2769 | | csched2_dom_cntl( |
2770 | | const struct scheduler *ops, |
2771 | | struct domain *d, |
2772 | | struct xen_domctl_scheduler_op *op) |
2773 | 0 | { |
2774 | 0 | struct csched2_dom * const sdom = csched2_dom(d); |
2775 | 0 | struct csched2_private *prv = csched2_priv(ops); |
2776 | 0 | unsigned long flags; |
2777 | 0 | struct vcpu *v; |
2778 | 0 | int rc = 0; |
2779 | 0 |
|
2780 | 0 | /* |
2781 | 0 | * Locking: |
2782 | 0 | * - we must take the private lock for accessing the weights of the |
2783 | 0 | * vcpus of d, and/or the cap; |
2784 | 0 | * - in the putinfo case, we also need the runqueue lock(s), for |
2785 | 0 |      *   updating the max weight of the runqueue(s).
2786 | 0 | * If changing the cap, we also need the budget_lock, for updating |
2787 | 0 | * the value of the domain budget pool (and the runqueue lock, |
2788 | 0 | * for adjusting the parameters and rescheduling any vCPU that is |
2789 | 0 | * running at the time of the change). |
2790 | 0 | */ |
2791 | 0 | switch ( op->cmd ) |
2792 | 0 | { |
2793 | 0 | case XEN_DOMCTL_SCHEDOP_getinfo: |
2794 | 0 | read_lock_irqsave(&prv->lock, flags); |
2795 | 0 | op->u.credit2.weight = sdom->weight; |
2796 | 0 | op->u.credit2.cap = sdom->cap; |
2797 | 0 | read_unlock_irqrestore(&prv->lock, flags); |
2798 | 0 | break; |
2799 | 0 | case XEN_DOMCTL_SCHEDOP_putinfo: |
2800 | 0 | write_lock_irqsave(&prv->lock, flags); |
2801 | 0 | /* Weight */ |
2802 | 0 | if ( op->u.credit2.weight != 0 ) |
2803 | 0 | { |
2804 | 0 | int old_weight; |
2805 | 0 |
|
2806 | 0 | old_weight = sdom->weight; |
2807 | 0 |
|
2808 | 0 | sdom->weight = op->u.credit2.weight; |
2809 | 0 |
|
2810 | 0 | /* Update weights for vcpus, and max_weight for runqueues on which they reside */ |
2811 | 0 | for_each_vcpu ( d, v ) |
2812 | 0 | { |
2813 | 0 | struct csched2_vcpu *svc = csched2_vcpu(v); |
2814 | 0 | spinlock_t *lock = vcpu_schedule_lock(svc->vcpu); |
2815 | 0 |
|
2816 | 0 | ASSERT(svc->rqd == c2rqd(ops, svc->vcpu->processor)); |
2817 | 0 |
|
2818 | 0 | svc->weight = sdom->weight; |
2819 | 0 | update_max_weight(svc->rqd, svc->weight, old_weight); |
2820 | 0 |
|
2821 | 0 | vcpu_schedule_unlock(lock, svc->vcpu); |
2822 | 0 | } |
2823 | 0 | } |
2824 | 0 | /* Cap */ |
2825 | 0 | if ( op->u.credit2.cap != 0 ) |
2826 | 0 | { |
2827 | 0 | struct csched2_vcpu *svc; |
2828 | 0 | spinlock_t *lock; |
2829 | 0 |
|
2830 | 0 |             /* Cap is only valid if it's at most 100 * nr_of_vCPUs */
2831 | 0 | if ( op->u.credit2.cap > 100 * sdom->nr_vcpus ) |
2832 | 0 | { |
2833 | 0 | rc = -EINVAL; |
2834 | 0 | write_unlock_irqrestore(&prv->lock, flags); |
2835 | 0 | break; |
2836 | 0 | } |
2837 | 0 |
|
2838 | 0 | spin_lock(&sdom->budget_lock); |
2839 | 0 | sdom->tot_budget = (CSCHED2_BDGT_REPL_PERIOD * op->u.credit2.cap); |
2840 | 0 | sdom->tot_budget /= 100; |
2841 | 0 | spin_unlock(&sdom->budget_lock); |
2842 | 0 |
|
2843 | 0 | /* |
2844 | 0 | * When trying to get some budget and run, each vCPU will grab |
2845 | 0 | * from the pool 1/N (with N = nr of vCPUs of the domain) of |
2846 | 0 | * the total budget. Roughly speaking, this means each vCPU will |
2847 | 0 | * have at least one chance to run during every period. |
2848 | 0 | */ |
2849 | 0 | for_each_vcpu ( d, v ) |
2850 | 0 | { |
2851 | 0 | svc = csched2_vcpu(v); |
2852 | 0 | lock = vcpu_schedule_lock(svc->vcpu); |
2853 | 0 | /* |
2855 | 0 |              * Too small a quota would, in theory, cause a lot of overhead,
2855 | 0 |              * but in practice it won't, because csched2_runtime() never
2856 | 0 |              * uses anything smaller than CSCHED2_MIN_TIMER anyway.
2857 | 0 | */ |
2858 | 0 | svc->budget_quota = max(sdom->tot_budget / sdom->nr_vcpus, |
2859 | 0 | CSCHED2_MIN_TIMER); |
2860 | 0 | vcpu_schedule_unlock(lock, svc->vcpu); |
2861 | 0 | } |
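            /*
             * Worked example (illustrative, not from the original source):
             * assuming CSCHED2_BDGT_REPL_PERIOD is 10ms, a domain with
             * cap = 150 and nr_vcpus = 2 gets tot_budget = 10ms * 150 / 100
             * = 15ms per period, and each vCPU's budget_quota is
             * max(15ms / 2, CSCHED2_MIN_TIMER) = 7.5ms.
             */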
2862 | 0 |
|
2863 | 0 | if ( sdom->cap == 0 ) |
2864 | 0 | { |
2865 | 0 | /* |
2866 | 0 | * We give to the domain the budget to which it is entitled, |
2867 | 0 | * and queue its first replenishment event. |
2868 | 0 | * |
2869 | 0 | * Since cap is currently disabled for this domain, we |
2870 | 0 | * know no vCPU is messing with the domain's budget, and |
2871 | 0 | * the replenishment timer is still off. |
2872 | 0 | * For these reasons, it is safe to do the following without |
2873 | 0 | * taking the budget_lock. |
2874 | 0 | */ |
2875 | 0 | sdom->budget = sdom->tot_budget; |
2876 | 0 | sdom->next_repl = NOW() + CSCHED2_BDGT_REPL_PERIOD; |
2877 | 0 | set_timer(sdom->repl_timer, sdom->next_repl); |
2878 | 0 |
|
2879 | 0 | /* |
2880 | 0 | * Now, let's enable budget accounting for all the vCPUs. |
2881 | 0 |              * To make sure that they will start to honour the domain's
2882 | 0 |              * cap, we set their budget to 0.
2883 | 0 |              * This way, as soon as they try to run, they will have
2884 | 0 | * to get some budget. |
2885 | 0 | * |
2886 | 0 | * For the vCPUs that are already running, we trigger the |
2887 | 0 | * scheduler on their pCPU. When, as a consequence of this, |
2888 | 0 | * csched2_schedule() will run, it will figure out there is |
2889 | 0 | * no budget, and the vCPU will try to get some (and be parked, |
2890 | 0 | * if there's none, and we'll switch to someone else). |
2891 | 0 | */ |
2892 | 0 | for_each_vcpu ( d, v ) |
2893 | 0 | { |
2894 | 0 | svc = csched2_vcpu(v); |
2895 | 0 | lock = vcpu_schedule_lock(svc->vcpu); |
2896 | 0 | if ( v->is_running ) |
2897 | 0 | { |
2898 | 0 | unsigned int cpu = v->processor; |
2899 | 0 | struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); |
2900 | 0 |
|
2901 | 0 | ASSERT(curr_on_cpu(cpu) == v); |
2902 | 0 |
|
2903 | 0 | /* |
2904 | 0 | * We are triggering a reschedule on the vCPU's |
2905 | 0 | * pCPU. That will run burn_credits() and, since |
2906 | 0 | * the vCPU is capped now, it would charge all the |
2907 | 0 | * execution time of this last round as budget as |
2908 | 0 | * well. That will make the vCPU budget go negative, |
2909 | 0 | * potentially by a large amount, and it's unfair. |
2910 | 0 | * |
2911 | 0 |                      * To avoid that, call burn_credits() here, to do the
2912 | 0 |                      * accounting of this current running instance now,
2913 | 0 |                      * with budgeting still disabled. This does not
2914 | 0 | * prevent some small amount of budget being charged |
2915 | 0 | * to the vCPU (i.e., the amount of time it runs from |
2916 | 0 | * now, to when scheduling happens). The budget will |
2917 | 0 | * also go below 0, but a lot less than how it would |
2918 | 0 | * if we don't do this. |
2919 | 0 | */ |
2920 | 0 | burn_credits(rqd, svc, NOW()); |
2921 | 0 | __cpumask_set_cpu(cpu, &rqd->tickled); |
2922 | 0 | ASSERT(!cpumask_test_cpu(cpu, &rqd->smt_idle)); |
2923 | 0 | cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); |
2924 | 0 | } |
2925 | 0 | svc->budget = 0; |
2926 | 0 | vcpu_schedule_unlock(lock, svc->vcpu); |
2927 | 0 | } |
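            /*
             * Worked example (illustrative, following the comments above):
             * if a running vCPU has already consumed 2ms of its current
             * instance when the cap is enabled, the burn_credits() call
             * above charges those 2ms while budget accounting is still off;
             * only the time between now and the actual reschedule is charged
             * against the new, zero budget, so the vCPU ends up only
             * slightly below 0 instead of a full 2ms in debt.
             */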
2928 | 0 | } |
2929 | 0 |
|
2930 | 0 | sdom->cap = op->u.credit2.cap; |
2931 | 0 | } |
2932 | 0 | else if ( sdom->cap != 0 ) |
2933 | 0 | { |
2934 | 0 | LIST_HEAD(parked); |
2935 | 0 |
|
2936 | 0 | stop_timer(sdom->repl_timer); |
2937 | 0 |
|
2938 | 0 | /* Disable budget accounting for all the vCPUs. */ |
2939 | 0 | for_each_vcpu ( d, v ) |
2940 | 0 | { |
2941 | 0 | struct csched2_vcpu *svc = csched2_vcpu(v); |
2942 | 0 | spinlock_t *lock = vcpu_schedule_lock(svc->vcpu); |
2943 | 0 |
|
2944 | 0 | svc->budget = STIME_MAX; |
2945 | 0 | svc->budget_quota = 0; |
2946 | 0 |
|
2947 | 0 | vcpu_schedule_unlock(lock, svc->vcpu); |
2948 | 0 | } |
2949 | 0 | sdom->cap = 0; |
2950 | 0 | /* |
2951 | 0 | * We are disabling the cap for this domain, which may have |
2952 | 0 | * vCPUs waiting for a replenishment, so we unpark them all. |
2953 | 0 | * Note that, since we have already disabled budget accounting |
2954 | 0 | * for all the vCPUs of the domain, no currently running vCPU |
2955 | 0 | * will be added to the parked vCPUs list any longer. |
2956 | 0 | */ |
2957 | 0 | spin_lock(&sdom->budget_lock); |
2958 | 0 | list_splice_init(&sdom->parked_vcpus, &parked); |
2959 | 0 | spin_unlock(&sdom->budget_lock); |
2960 | 0 |
|
2961 | 0 | unpark_parked_vcpus(ops, &parked); |
2962 | 0 | } |
2963 | 0 | write_unlock_irqrestore(&prv->lock, flags); |
2964 | 0 | break; |
2965 | 0 | default: |
2966 | 0 | rc = -EINVAL; |
2967 | 0 | break; |
2968 | 0 | } |
2969 | 0 |
|
2970 | 0 |
|
2971 | 0 | return rc; |
2972 | 0 | } |
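/*
 * Illustrative sketch (not part of the original file): how a caller of the
 * ->adjust hook above might fill in the op structure to set a weight of 512
 * and a cap of 100 (i.e., the equivalent of one full pCPU of time per
 * replenishment period). Only the fields actually read by csched2_dom_cntl()
 * are shown; everything else about the hypercall plumbing is assumed.
 */
#if 0
static int example_set_weight_and_cap(const struct scheduler *ops,
                                      struct domain *d)
{
    struct xen_domctl_scheduler_op op = {
        .cmd = XEN_DOMCTL_SCHEDOP_putinfo,
    };

    op.u.credit2.weight = 512; /* 0 would mean "leave the weight alone" */
    op.u.credit2.cap = 100;    /* must not exceed 100 * nr_vcpus */

    return csched2_dom_cntl(ops, d, &op);
}
#endif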
2973 | | |
2974 | | static int csched2_sys_cntl(const struct scheduler *ops, |
2975 | | struct xen_sysctl_scheduler_op *sc) |
2976 | 0 | { |
2977 | 0 | struct xen_sysctl_credit2_schedule *params = &sc->u.sched_credit2; |
2978 | 0 | struct csched2_private *prv = csched2_priv(ops); |
2979 | 0 | unsigned long flags; |
2980 | 0 |
|
2981 | 0 |     switch ( sc->cmd )
2982 | 0 | { |
2983 | 0 | case XEN_SYSCTL_SCHEDOP_putinfo: |
2984 | 0 | if ( params->ratelimit_us && |
2985 | 0 | (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX || |
2986 | 0 | params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN )) |
2987 | 0 | return -EINVAL; |
2988 | 0 |
|
2989 | 0 | write_lock_irqsave(&prv->lock, flags); |
2990 | 0 | if ( !prv->ratelimit_us && params->ratelimit_us ) |
2991 | 0 | printk(XENLOG_INFO "Enabling context switch rate limiting\n"); |
2992 | 0 | else if ( prv->ratelimit_us && !params->ratelimit_us ) |
2993 | 0 | printk(XENLOG_INFO "Disabling context switch rate limiting\n"); |
2994 | 0 | prv->ratelimit_us = params->ratelimit_us; |
2995 | 0 | write_unlock_irqrestore(&prv->lock, flags); |
2996 | 0 |
|
2997 | 0 | /* FALLTHRU */ |
2998 | 0 | case XEN_SYSCTL_SCHEDOP_getinfo: |
2999 | 0 | params->ratelimit_us = prv->ratelimit_us; |
3000 | 0 | break; |
3001 | 0 | } |
3002 | 0 |
|
3003 | 0 | return 0; |
3004 | 0 | } |
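/*
 * Illustrative sketch (not part of the original file): setting the context
 * switch rate limit to 1000us through the ->adjust_global hook above. A
 * ratelimit_us of 0 disables rate limiting; non-zero values must lie within
 * [XEN_SYSCTL_SCHED_RATELIMIT_MIN, XEN_SYSCTL_SCHED_RATELIMIT_MAX]. Only the
 * fields read by csched2_sys_cntl() are shown.
 */
#if 0
static int example_set_ratelimit(const struct scheduler *ops)
{
    struct xen_sysctl_scheduler_op sc = {
        .cmd = XEN_SYSCTL_SCHEDOP_putinfo,
    };

    sc.u.sched_credit2.ratelimit_us = 1000;

    return csched2_sys_cntl(ops, &sc);
}
#endif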
3005 | | |
3006 | | static void * |
3007 | | csched2_alloc_domdata(const struct scheduler *ops, struct domain *dom) |
3008 | 0 | { |
3009 | 0 | struct csched2_private *prv = csched2_priv(ops); |
3010 | 0 | struct csched2_dom *sdom; |
3011 | 0 | unsigned long flags; |
3012 | 0 |
|
3013 | 0 | sdom = xzalloc(struct csched2_dom); |
3014 | 0 | if ( sdom == NULL ) |
3015 | 0 | return NULL; |
3016 | 0 |
|
3017 | 0 | sdom->repl_timer = xzalloc(struct timer); |
3018 | 0 | if ( sdom->repl_timer == NULL ) |
3019 | 0 | { |
3020 | 0 | xfree(sdom); |
3021 | 0 | return NULL; |
3022 | 0 | } |
3023 | 0 |
|
3024 | 0 | /* Initialize credit, cap and weight */ |
3025 | 0 | INIT_LIST_HEAD(&sdom->sdom_elem); |
3026 | 0 | sdom->dom = dom; |
3027 | 0 | sdom->weight = CSCHED2_DEFAULT_WEIGHT; |
3028 | 0 | sdom->cap = 0U; |
3029 | 0 | sdom->nr_vcpus = 0; |
3030 | 0 |
|
3031 | 0 | init_timer(sdom->repl_timer, replenish_domain_budget, sdom, |
3032 | 0 | cpumask_any(cpupool_domain_cpumask(dom))); |
3033 | 0 | spin_lock_init(&sdom->budget_lock); |
3034 | 0 | INIT_LIST_HEAD(&sdom->parked_vcpus); |
3035 | 0 |
|
3036 | 0 | write_lock_irqsave(&prv->lock, flags); |
3037 | 0 |
|
3038 | 0 | list_add_tail(&sdom->sdom_elem, &csched2_priv(ops)->sdom); |
3039 | 0 |
|
3040 | 0 | write_unlock_irqrestore(&prv->lock, flags); |
3041 | 0 |
|
3042 | 0 | return (void *)sdom; |
3043 | 0 | } |
3044 | | |
3045 | | static int |
3046 | | csched2_dom_init(const struct scheduler *ops, struct domain *dom) |
3047 | 0 | { |
3048 | 0 | struct csched2_dom *sdom; |
3049 | 0 |
|
3050 | 0 | if ( is_idle_domain(dom) ) |
3051 | 0 | return 0; |
3052 | 0 |
|
3053 | 0 | sdom = csched2_alloc_domdata(ops, dom); |
3054 | 0 | if ( sdom == NULL ) |
3055 | 0 | return -ENOMEM; |
3056 | 0 |
|
3057 | 0 | dom->sched_priv = sdom; |
3058 | 0 |
|
3059 | 0 | return 0; |
3060 | 0 | } |
3061 | | |
3062 | | static void |
3063 | | csched2_free_domdata(const struct scheduler *ops, void *data) |
3064 | 0 | { |
3065 | 0 | unsigned long flags; |
3066 | 0 | struct csched2_dom *sdom = data; |
3067 | 0 | struct csched2_private *prv = csched2_priv(ops); |
3068 | 0 |
|
3069 | 0 | kill_timer(sdom->repl_timer); |
3070 | 0 |
|
3071 | 0 | write_lock_irqsave(&prv->lock, flags); |
3072 | 0 |
|
3073 | 0 | list_del_init(&sdom->sdom_elem); |
3074 | 0 |
|
3075 | 0 | write_unlock_irqrestore(&prv->lock, flags); |
3076 | 0 |
|
3077 | 0 | xfree(sdom->repl_timer); |
3078 | 0 | xfree(data); |
3079 | 0 | } |
3080 | | |
3081 | | static void |
3082 | | csched2_dom_destroy(const struct scheduler *ops, struct domain *dom) |
3083 | 0 | { |
3084 | 0 | ASSERT(csched2_dom(dom)->nr_vcpus == 0); |
3085 | 0 |
|
3086 | 0 | csched2_free_domdata(ops, csched2_dom(dom)); |
3087 | 0 | } |
3088 | | |
3089 | | static void |
3090 | | csched2_vcpu_insert(const struct scheduler *ops, struct vcpu *vc) |
3091 | 0 | { |
3092 | 0 | struct csched2_vcpu *svc = vc->sched_priv; |
3093 | 0 | struct csched2_dom * const sdom = svc->sdom; |
3094 | 0 | spinlock_t *lock; |
3095 | 0 |
|
3096 | 0 | ASSERT(!is_idle_vcpu(vc)); |
3097 | 0 | ASSERT(list_empty(&svc->runq_elem)); |
3098 | 0 |
|
3099 | 0 | /* csched2_cpu_pick() expects the pcpu lock to be held */ |
3100 | 0 | lock = vcpu_schedule_lock_irq(vc); |
3101 | 0 |
|
3102 | 0 | vc->processor = csched2_cpu_pick(ops, vc); |
3103 | 0 |
|
3104 | 0 | spin_unlock_irq(lock); |
3105 | 0 |
|
3106 | 0 | lock = vcpu_schedule_lock_irq(vc); |
3107 | 0 |
|
3108 | 0 | /* Add vcpu to runqueue of initial processor */ |
3109 | 0 | runq_assign(ops, vc); |
3110 | 0 |
|
3111 | 0 | vcpu_schedule_unlock_irq(lock, vc); |
3112 | 0 |
|
3113 | 0 | sdom->nr_vcpus++; |
3114 | 0 |
|
3115 | 0 | SCHED_STAT_CRANK(vcpu_insert); |
3116 | 0 |
|
3117 | 0 | CSCHED2_VCPU_CHECK(vc); |
3118 | 0 | } |
3119 | | |
3120 | | static void |
3121 | | csched2_free_vdata(const struct scheduler *ops, void *priv) |
3122 | 0 | { |
3123 | 0 | struct csched2_vcpu *svc = priv; |
3124 | 0 |
|
3125 | 0 | xfree(svc); |
3126 | 0 | } |
3127 | | |
3128 | | static void |
3129 | | csched2_vcpu_remove(const struct scheduler *ops, struct vcpu *vc) |
3130 | 0 | { |
3131 | 0 | struct csched2_vcpu * const svc = csched2_vcpu(vc); |
3132 | 0 | spinlock_t *lock; |
3133 | 0 |
|
3134 | 0 | ASSERT(!is_idle_vcpu(vc)); |
3135 | 0 | ASSERT(list_empty(&svc->runq_elem)); |
3136 | 0 |
|
3137 | 0 | SCHED_STAT_CRANK(vcpu_remove); |
3138 | 0 |
|
3139 | 0 | /* Remove from runqueue */ |
3140 | 0 | lock = vcpu_schedule_lock_irq(vc); |
3141 | 0 |
|
3142 | 0 | runq_deassign(ops, vc); |
3143 | 0 |
|
3144 | 0 | vcpu_schedule_unlock_irq(lock, vc); |
3145 | 0 |
|
3146 | 0 | svc->sdom->nr_vcpus--; |
3147 | 0 | } |
3148 | | |
3149 | | /* How long should we let this vcpu run for? */ |
3150 | | static s_time_t |
3151 | | csched2_runtime(const struct scheduler *ops, int cpu, |
3152 | | struct csched2_vcpu *snext, s_time_t now) |
3153 | 0 | { |
3154 | 0 | s_time_t time, min_time; |
3155 | 0 | int rt_credit; /* Proposed runtime measured in credits */ |
3156 | 0 | struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); |
3157 | 0 | struct list_head *runq = &rqd->runq; |
3158 | 0 | struct csched2_private *prv = csched2_priv(ops); |
3159 | 0 |
|
3160 | 0 | /* |
3161 | 0 | * If we're idle, just stay so. Others (or external events) |
3162 | 0 | * will poke us when necessary. |
3163 | 0 | */ |
3164 | 0 | if ( is_idle_vcpu(snext->vcpu) ) |
3165 | 0 | return -1; |
3166 | 0 |
|
3167 | 0 | /* General algorithm: |
3168 | 0 | * 1) Run until snext's credit will be 0. |
3169 | 0 | * 2) But if someone is waiting, run until snext's credit is equal |
3170 | 0 | * to his. |
3171 | 0 | * 3) But, if we are capped, never run more than our budget. |
3172 | 0 | * 4) And never run longer than MAX_TIMER or shorter than MIN_TIMER or |
3173 | 0 | * the ratelimit time. |
3174 | 0 | */ |
3175 | 0 |
|
3176 | 0 | /* Calculate mintime */ |
3177 | 0 | min_time = CSCHED2_MIN_TIMER; |
3178 | 0 | if ( prv->ratelimit_us ) |
3179 | 0 | { |
3180 | 0 | s_time_t ratelimit_min = MICROSECS(prv->ratelimit_us); |
3181 | 0 | if ( snext->vcpu->is_running ) |
3182 | 0 | ratelimit_min = snext->vcpu->runstate.state_entry_time + |
3183 | 0 | MICROSECS(prv->ratelimit_us) - now; |
3184 | 0 | if ( ratelimit_min > min_time ) |
3185 | 0 | min_time = ratelimit_min; |
3186 | 0 | } |
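    /*
     * Worked example (illustrative): with ratelimit_us = 1000 and snext
     * already running for 300us (i.e., now - state_entry_time == 300us),
     * ratelimit_min is 700us; if that exceeds CSCHED2_MIN_TIMER it becomes
     * the lower bound for the timeslice computed below.
     */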
3187 | 0 |
|
3188 | 0 | /* 1) Run until snext's credit will be 0. */ |
3189 | 0 | rt_credit = snext->credit; |
3190 | 0 |
|
3191 | 0 | /* |
3192 | 0 | * 2) If there's someone waiting whose credit is positive, |
3193 | 0 | * run until your credit ~= his. |
3194 | 0 | */ |
3195 | 0 | if ( ! list_empty(runq) ) |
3196 | 0 | { |
3197 | 0 | struct csched2_vcpu *swait = runq_elem(runq->next); |
3198 | 0 |
|
3199 | 0 | if ( ! is_idle_vcpu(swait->vcpu) |
3200 | 0 | && swait->credit > 0 ) |
3201 | 0 | { |
3202 | 0 | rt_credit = snext->credit - swait->credit; |
3203 | 0 | } |
3204 | 0 | } |
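    /*
     * Worked example (illustrative): if snext has 9 units of credit and the
     * vcpu at the head of the runqueue has 4, rt_credit is 5; c2t() below
     * converts that credit difference into an actual amount of time, based
     * on the rate at which snext burns credit.
     */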
3205 | 0 |
|
3206 | 0 | /* |
3207 | 0 | * The next guy on the runqueue may actually have a higher credit, |
3208 | 0 | * if we've tried to avoid migrating him from a different cpu. |
3209 | 0 | * Setting time=0 will ensure the minimum timeslice is chosen. |
3210 | 0 | * |
3211 | 0 | * FIXME: See if we can eliminate this conversion if we know time |
3212 | 0 | * will be outside (MIN,MAX). Probably requires pre-calculating |
3213 | 0 | * credit values of MIN,MAX per vcpu, since each vcpu burns credit |
3214 | 0 | * at a different rate. |
3215 | 0 | */ |
3216 | 0 | if ( rt_credit > 0 ) |
3217 | 0 | time = c2t(rqd, rt_credit, snext); |
3218 | 0 | else |
3219 | 0 | time = 0; |
3220 | 0 |
|
3221 | 0 | /* |
3222 | 0 | * 3) But, if capped, never run more than our budget. |
3223 | 0 | */ |
3224 | 0 | if ( has_cap(snext) ) |
3225 | 0 | time = snext->budget < time ? snext->budget : time; |
3226 | 0 |
|
3227 | 0 | /* |
3228 | 0 | * 4) And never run longer than MAX_TIMER or less than MIN_TIMER or |
3229 | 0 | * the rate_limit time. |
3230 | 0 | */ |
3231 | 0 | if ( time < min_time ) |
3232 | 0 | { |
3233 | 0 | time = min_time; |
3234 | 0 | SCHED_STAT_CRANK(runtime_min_timer); |
3235 | 0 | } |
3236 | 0 |     else if ( time > CSCHED2_MAX_TIMER )
3237 | 0 | { |
3238 | 0 | time = CSCHED2_MAX_TIMER; |
3239 | 0 | SCHED_STAT_CRANK(runtime_max_timer); |
3240 | 0 | } |
3241 | 0 |
|
3242 | 0 | return time; |
3243 | 0 | } |
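/*
 * End-to-end example for csched2_runtime() above (illustrative figures):
 * suppose step 2 leaves rt_credit worth 7ms of runtime; if the vcpu is
 * capped and only has 4ms of budget left, step 3 trims the slice to 4ms;
 * step 4 then leaves it unchanged as long as it lies between min_time and
 * CSCHED2_MAX_TIMER.
 */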
3244 | | |
3245 | | /* |
3246 | | * Find a candidate. |
3247 | | */ |
3248 | | static struct csched2_vcpu * |
3249 | | runq_candidate(struct csched2_runqueue_data *rqd, |
3250 | | struct csched2_vcpu *scurr, |
3251 | | int cpu, s_time_t now, |
3252 | | unsigned int *skipped) |
3253 | 0 | { |
3254 | 0 | struct list_head *iter, *temp; |
3255 | 0 | struct csched2_vcpu *snext = NULL; |
3256 | 0 | struct csched2_private *prv = csched2_priv(per_cpu(scheduler, cpu)); |
3257 | 0 | bool yield = false, soft_aff_preempt = false; |
3258 | 0 |
|
3259 | 0 | *skipped = 0; |
3260 | 0 |
|
3261 | 0 | if ( unlikely(is_idle_vcpu(scurr->vcpu)) ) |
3262 | 0 | { |
3263 | 0 | snext = scurr; |
3264 | 0 | goto check_runq; |
3265 | 0 | } |
3266 | 0 |
|
3267 | 0 | yield = __test_and_clear_bit(__CSFLAG_vcpu_yield, &scurr->flags); |
3268 | 0 |
|
3269 | 0 | /* |
3270 | 0 |      * Return the current vcpu if it has executed for less than the ratelimit.
3271 | 0 |      * The adjustment of the selected vcpu's credit, and the decision on
3272 | 0 |      * how long it will run, are made in csched2_runtime().
3273 | 0 | * |
3274 | 0 | * Note that, if scurr is yielding, we don't let rate limiting kick in. |
3275 | 0 | * In fact, it may be the case that scurr is about to spin, and there's |
3276 | 0 | * no point forcing it to do so until rate limiting expires. |
3277 | 0 | */ |
3278 | 0 | if ( !yield && prv->ratelimit_us && vcpu_runnable(scurr->vcpu) && |
3279 | 0 | (now - scurr->vcpu->runstate.state_entry_time) < |
3280 | 0 | MICROSECS(prv->ratelimit_us) ) |
3281 | 0 | { |
3282 | 0 | if ( unlikely(tb_init_done) ) |
3283 | 0 | { |
3284 | 0 | struct { |
3285 | 0 | unsigned vcpu:16, dom:16; |
3286 | 0 | unsigned runtime; |
3287 | 0 | } d; |
3288 | 0 | d.dom = scurr->vcpu->domain->domain_id; |
3289 | 0 | d.vcpu = scurr->vcpu->vcpu_id; |
3290 | 0 | d.runtime = now - scurr->vcpu->runstate.state_entry_time; |
3291 | 0 | __trace_var(TRC_CSCHED2_RATELIMIT, 1, |
3292 | 0 | sizeof(d), |
3293 | 0 | (unsigned char *)&d); |
3294 | 0 | } |
3295 | 0 | return scurr; |
3296 | 0 | } |
3297 | 0 |
|
3298 | 0 | /* If scurr has a soft-affinity, let's check whether cpu is part of it */ |
3299 | 0 | if ( has_soft_affinity(scurr->vcpu, scurr->vcpu->cpu_hard_affinity) ) |
3300 | 0 | { |
3301 | 0 | affinity_balance_cpumask(scurr->vcpu, BALANCE_SOFT_AFFINITY, |
3302 | 0 | cpumask_scratch); |
3303 | 0 | if ( unlikely(!cpumask_test_cpu(cpu, cpumask_scratch)) ) |
3304 | 0 | { |
3305 | 0 | cpumask_t *online = cpupool_domain_cpumask(scurr->vcpu->domain); |
3306 | 0 |
|
3307 | 0 | /* Ok, is any of the pcpus in scurr soft-affinity idle? */ |
3308 | 0 | cpumask_and(cpumask_scratch, cpumask_scratch, &rqd->idle); |
3309 | 0 | cpumask_andnot(cpumask_scratch, cpumask_scratch, &rqd->tickled); |
3310 | 0 | soft_aff_preempt = cpumask_intersects(cpumask_scratch, online); |
3311 | 0 | } |
3312 | 0 | } |
3313 | 0 |
|
3314 | 0 | /* |
3315 | 0 | * If scurr is runnable, and this cpu is in its soft-affinity, default to |
3316 | 0 | * it. We also default to it, even if cpu is not in its soft-affinity, if |
3317 | 0 |      * there aren't any idle and not tickled cpus in its soft-affinity. In
3318 | 0 | * fact, we don't want to risk leaving scurr in the runq and this cpu idle |
3319 | 0 | * only because scurr is running outside of its soft-affinity. |
3320 | 0 | * |
3321 | 0 | * On the other hand, if cpu is not in scurr's soft-affinity, and there |
3322 | 0 | * looks to be better options, go for them. That happens by defaulting to |
3323 | 0 | * idle here, which means scurr will be preempted, put back in runq, and |
3324 | 0 | * one of those idle and not tickled cpus from its soft-affinity will be |
3325 | 0 | * tickled to pick it up. |
3326 | 0 | * |
3327 | 0 | * Finally, if scurr does not have a valid soft-affinity, we also let it |
3328 | 0 | * continue to run here (in fact, soft_aff_preempt will still be false, |
3329 | 0 | * in this case). |
3330 | 0 | * |
3331 | 0 |      * Of course, we also default to idle if scurr is not runnable.
3332 | 0 | */ |
3333 | 0 | if ( vcpu_runnable(scurr->vcpu) && !soft_aff_preempt ) |
3334 | 0 | snext = scurr; |
3335 | 0 | else |
3336 | 0 | snext = csched2_vcpu(idle_vcpu[cpu]); |
3337 | 0 |
|
3338 | 0 | check_runq: |
3339 | 0 | list_for_each_safe( iter, temp, &rqd->runq ) |
3340 | 0 | { |
3341 | 0 | struct csched2_vcpu * svc = list_entry(iter, struct csched2_vcpu, runq_elem); |
3342 | 0 |
|
3343 | 0 | if ( unlikely(tb_init_done) ) |
3344 | 0 | { |
3345 | 0 | struct { |
3346 | 0 | unsigned vcpu:16, dom:16; |
3347 | 0 | } d; |
3348 | 0 | d.dom = svc->vcpu->domain->domain_id; |
3349 | 0 | d.vcpu = svc->vcpu->vcpu_id; |
3350 | 0 | __trace_var(TRC_CSCHED2_RUNQ_CAND_CHECK, 1, |
3351 | 0 | sizeof(d), |
3352 | 0 | (unsigned char *)&d); |
3353 | 0 | } |
3354 | 0 |
|
3355 | 0 | /* Only consider vcpus that are allowed to run on this processor. */ |
3356 | 0 | if ( !cpumask_test_cpu(cpu, svc->vcpu->cpu_hard_affinity) ) |
3357 | 0 | { |
3358 | 0 | (*skipped)++; |
3359 | 0 | continue; |
3360 | 0 | } |
3361 | 0 |
|
3362 | 0 | /* |
3363 | 0 |          * If a vcpu is meant to be picked up by another processor, and that
3364 | 0 |          * processor has not scheduled yet, leave it in the runqueue for it.
3365 | 0 | */ |
3366 | 0 | if ( svc->tickled_cpu != -1 && svc->tickled_cpu != cpu && |
3367 | 0 | cpumask_test_cpu(svc->tickled_cpu, &rqd->tickled) ) |
3368 | 0 | { |
3369 | 0 | (*skipped)++; |
3370 | 0 | SCHED_STAT_CRANK(deferred_to_tickled_cpu); |
3371 | 0 | continue; |
3372 | 0 | } |
3373 | 0 |
|
3374 | 0 | /* |
3375 | 0 | * If this is on a different processor, don't pull it unless |
3376 | 0 | * its credit is at least CSCHED2_MIGRATE_RESIST higher. |
3377 | 0 | */ |
3378 | 0 | if ( svc->vcpu->processor != cpu |
3379 | 0 | && snext->credit + CSCHED2_MIGRATE_RESIST > svc->credit ) |
3380 | 0 | { |
3381 | 0 | (*skipped)++; |
3382 | 0 | SCHED_STAT_CRANK(migrate_resisted); |
3383 | 0 | continue; |
3384 | 0 | } |
3385 | 0 |
|
3386 | 0 | /* |
3387 | 0 | * If the one in the runqueue has more credit than current (or idle, |
3388 | 0 | * if current is not runnable), or if current is yielding, and also |
3389 | 0 | * if the one in runqueue either is not capped, or is capped but has |
3390 | 0 | * some budget, then choose it. |
3391 | 0 | */ |
3392 | 0 | if ( (yield || svc->credit > snext->credit) && |
3393 | 0 | (!has_cap(svc) || vcpu_grab_budget(svc)) ) |
3394 | 0 | snext = svc; |
3395 | 0 |
|
3396 | 0 | /* In any case, if we got this far, break. */ |
3397 | 0 | break; |
3398 | 0 | } |
3399 | 0 |
|
3400 | 0 | if ( unlikely(tb_init_done) ) |
3401 | 0 | { |
3402 | 0 | struct { |
3403 | 0 | unsigned vcpu:16, dom:16; |
3404 | 0 | unsigned tickled_cpu, skipped; |
3405 | 0 | int credit; |
3406 | 0 | } d; |
3407 | 0 | d.dom = snext->vcpu->domain->domain_id; |
3408 | 0 | d.vcpu = snext->vcpu->vcpu_id; |
3409 | 0 | d.credit = snext->credit; |
3410 | 0 | d.tickled_cpu = snext->tickled_cpu; |
3411 | 0 | d.skipped = *skipped; |
3412 | 0 | __trace_var(TRC_CSCHED2_RUNQ_CANDIDATE, 1, |
3413 | 0 | sizeof(d), |
3414 | 0 | (unsigned char *)&d); |
3415 | 0 | } |
3416 | 0 |
|
3417 | 0 | if ( unlikely(snext->tickled_cpu != -1 && snext->tickled_cpu != cpu) ) |
3418 | 0 | SCHED_STAT_CRANK(tickled_cpu_overridden); |
3419 | 0 |
|
3420 | 0 | /* |
3421 | 0 | * If snext is from a capped domain, it must have budget (or it |
3422 | 0 |      * wouldn't have been in the runq). If it is not capped, its budget
3423 | 0 |      * is STIME_MAX, which is still >= 0.
3424 | 0 | */ |
3425 | 0 | ASSERT(snext->budget >= 0); |
3426 | 0 |
|
3427 | 0 | return snext; |
3428 | 0 | } |
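/*
 * Worked example for the migration-resistance check in runq_candidate()
 * above (illustrative numbers): if snext->credit is 1000 and
 * CSCHED2_MIGRATE_RESIST is worth 500, a vcpu whose v->processor is a
 * different cpu is only picked up here if its credit is at least 1500;
 * otherwise it is skipped and left where it is.
 */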
3429 | | |
3430 | | /* |
3431 | | * This function is in the critical path. It is designed to be simple and |
3432 | | * fast for the common case. |
3433 | | */ |
3434 | | static struct task_slice |
3435 | | csched2_schedule( |
3436 | | const struct scheduler *ops, s_time_t now, bool tasklet_work_scheduled) |
3437 | 0 | { |
3438 | 0 | const int cpu = smp_processor_id(); |
3439 | 0 | struct csched2_runqueue_data *rqd; |
3440 | 0 | struct csched2_vcpu * const scurr = csched2_vcpu(current); |
3441 | 0 | struct csched2_vcpu *snext = NULL; |
3442 | 0 | unsigned int skipped_vcpus = 0; |
3443 | 0 | struct task_slice ret; |
3444 | 0 | bool tickled; |
3445 | 0 |
|
3446 | 0 | SCHED_STAT_CRANK(schedule); |
3447 | 0 | CSCHED2_VCPU_CHECK(current); |
3448 | 0 |
|
3449 | 0 | BUG_ON(!cpumask_test_cpu(cpu, &csched2_priv(ops)->initialized)); |
3450 | 0 |
|
3451 | 0 | rqd = c2rqd(ops, cpu); |
3452 | 0 | BUG_ON(!cpumask_test_cpu(cpu, &rqd->active)); |
3453 | 0 |
|
3454 | 0 | ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock)); |
3455 | 0 |
|
3456 | 0 | BUG_ON(!is_idle_vcpu(scurr->vcpu) && scurr->rqd != rqd); |
3457 | 0 |
|
3458 | 0 | /* Clear "tickled" bit now that we've been scheduled */ |
3459 | 0 | tickled = cpumask_test_cpu(cpu, &rqd->tickled); |
3460 | 0 | if ( tickled ) |
3461 | 0 | { |
3462 | 0 | __cpumask_clear_cpu(cpu, &rqd->tickled); |
3463 | 0 | cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled); |
3464 | 0 | smt_idle_mask_set(cpu, cpumask_scratch, &rqd->smt_idle); |
3465 | 0 | } |
3466 | 0 |
|
3467 | 0 | if ( unlikely(tb_init_done) ) |
3468 | 0 | { |
3469 | 0 | struct { |
3470 | 0 | unsigned cpu:16, rq_id:16; |
3471 | 0 | unsigned tasklet:8, idle:8, smt_idle:8, tickled:8; |
3472 | 0 | } d; |
3473 | 0 | d.cpu = cpu; |
3474 | 0 | d.rq_id = c2r(cpu); |
3475 | 0 | d.tasklet = tasklet_work_scheduled; |
3476 | 0 | d.idle = is_idle_vcpu(current); |
3477 | 0 | d.smt_idle = cpumask_test_cpu(cpu, &rqd->smt_idle); |
3478 | 0 | d.tickled = tickled; |
3479 | 0 | __trace_var(TRC_CSCHED2_SCHEDULE, 1, |
3480 | 0 | sizeof(d), |
3481 | 0 | (unsigned char *)&d); |
3482 | 0 | } |
3483 | 0 |
|
3484 | 0 | /* Update credits (and budget, if necessary). */ |
3485 | 0 | burn_credits(rqd, scurr, now); |
3486 | 0 |
|
3487 | 0 | /* |
3488 | 0 | * Below 0, means that we are capped and we have overrun our budget. |
3489 | 0 | * Let's try to get some more but, if we fail (e.g., because of the |
3490 | 0 | * other running vcpus), we will be parked. |
3491 | 0 | */ |
3492 | 0 | if ( unlikely(scurr->budget <= 0) ) |
3493 | 0 | vcpu_grab_budget(scurr); |
3494 | 0 |
|
3495 | 0 | /* |
3496 | 0 | * Select next runnable local VCPU (ie top of local runq). |
3497 | 0 | * |
3498 | 0 | * If the current vcpu is runnable, and has higher credit than |
3499 | 0 |      * the next guy on the queue (or there is no one else), we want to
3500 | 0 | * run him again. |
3501 | 0 | * |
3502 | 0 |      * If there's tasklet work to do, we want to choose the idle vcpu
3503 | 0 | * for this processor, and mark the current for delayed runqueue |
3504 | 0 | * add. |
3505 | 0 | * |
3506 | 0 | * If the current vcpu is runnable, and there's another runnable |
3507 | 0 | * candidate, we want to mark current for delayed runqueue add, |
3508 | 0 | * and remove the next guy from the queue. |
3509 | 0 | * |
3510 | 0 |      * If the current vcpu is not runnable, we want to choose the idle
3511 | 0 | * vcpu for this processor. |
3512 | 0 | */ |
3513 | 0 | if ( tasklet_work_scheduled ) |
3514 | 0 | { |
3515 | 0 | __clear_bit(__CSFLAG_vcpu_yield, &scurr->flags); |
3516 | 0 | trace_var(TRC_CSCHED2_SCHED_TASKLET, 1, 0, NULL); |
3517 | 0 | snext = csched2_vcpu(idle_vcpu[cpu]); |
3518 | 0 | } |
3519 | 0 | else |
3520 | 0 | snext = runq_candidate(rqd, scurr, cpu, now, &skipped_vcpus); |
3521 | 0 |
|
3522 | 0 | /* If switching from a non-idle runnable vcpu, put it |
3523 | 0 | * back on the runqueue. */ |
3524 | 0 | if ( snext != scurr |
3525 | 0 | && !is_idle_vcpu(scurr->vcpu) |
3526 | 0 | && vcpu_runnable(current) ) |
3527 | 0 | __set_bit(__CSFLAG_delayed_runq_add, &scurr->flags); |
3528 | 0 |
|
3529 | 0 | ret.migrated = 0; |
3530 | 0 |
|
3531 | 0 | /* Accounting for non-idle tasks */ |
3532 | 0 | if ( !is_idle_vcpu(snext->vcpu) ) |
3533 | 0 | { |
3534 | 0 | /* If switching, remove this from the runqueue and mark it scheduled */ |
3535 | 0 | if ( snext != scurr ) |
3536 | 0 | { |
3537 | 0 | ASSERT(snext->rqd == rqd); |
3538 | 0 | ASSERT(!snext->vcpu->is_running); |
3539 | 0 |
|
3540 | 0 | runq_remove(snext); |
3541 | 0 | __set_bit(__CSFLAG_scheduled, &snext->flags); |
3542 | 0 | } |
3543 | 0 |
|
3544 | 0 | /* |
3545 | 0 | * The reset condition is "has a scheduler epoch come to an end?". |
3546 | 0 | * The way this is enforced is checking whether the vcpu at the top |
3547 | 0 | * of the runqueue has negative credits. This means the epochs have |
3548 | 0 |          * variable length, as an epoch expires when:
3549 | 0 | * 1) the vcpu at the top of the runqueue has executed for |
3550 | 0 | * around 10 ms (with default parameters); |
3551 | 0 | * 2) no other vcpu with higher credits wants to run. |
3552 | 0 | * |
3553 | 0 | * Here, where we want to check for reset, we need to make sure the |
3554 | 0 |          * proper vcpu is being used. In fact, runq_candidate() may not have
3555 | 0 |          * returned the first vcpu in the runqueue, for various reasons
3556 | 0 | * (e.g., affinity). Only trigger a reset when it does. |
3557 | 0 | */ |
3558 | 0 | if ( skipped_vcpus == 0 && snext->credit <= CSCHED2_CREDIT_RESET ) |
3559 | 0 | { |
3560 | 0 | reset_credit(ops, cpu, now, snext); |
3561 | 0 | balance_load(ops, cpu, now); |
3562 | 0 | } |
3563 | 0 |
|
3564 | 0 | /* Clear the idle mask if necessary */ |
3565 | 0 | if ( cpumask_test_cpu(cpu, &rqd->idle) ) |
3566 | 0 | { |
3567 | 0 | __cpumask_clear_cpu(cpu, &rqd->idle); |
3568 | 0 | smt_idle_mask_clear(cpu, &rqd->smt_idle); |
3569 | 0 | } |
3570 | 0 |
|
3571 | 0 | snext->start_time = now; |
3572 | 0 | snext->tickled_cpu = -1; |
3573 | 0 |
|
3574 | 0 | /* Safe because lock for old processor is held */ |
3575 | 0 | if ( snext->vcpu->processor != cpu ) |
3576 | 0 | { |
3577 | 0 | snext->credit += CSCHED2_MIGRATE_COMPENSATION; |
3578 | 0 | snext->vcpu->processor = cpu; |
3579 | 0 | SCHED_STAT_CRANK(migrated); |
3580 | 0 | ret.migrated = 1; |
3581 | 0 | } |
3582 | 0 | } |
3583 | 0 | else |
3584 | 0 | { |
3585 | 0 | /* |
3586 | 0 | * Update the idle mask if necessary. Note that, if we're scheduling |
3587 | 0 | * idle in order to carry on some tasklet work, we want to play busy! |
3588 | 0 | */ |
3589 | 0 | if ( tasklet_work_scheduled ) |
3590 | 0 | { |
3591 | 0 | if ( cpumask_test_cpu(cpu, &rqd->idle) ) |
3592 | 0 | { |
3593 | 0 | __cpumask_clear_cpu(cpu, &rqd->idle); |
3594 | 0 | smt_idle_mask_clear(cpu, &rqd->smt_idle); |
3595 | 0 | } |
3596 | 0 | } |
3597 | 0 | else if ( !cpumask_test_cpu(cpu, &rqd->idle) ) |
3598 | 0 | { |
3599 | 0 | __cpumask_set_cpu(cpu, &rqd->idle); |
3600 | 0 | cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled); |
3601 | 0 | smt_idle_mask_set(cpu, cpumask_scratch, &rqd->smt_idle); |
3602 | 0 | } |
3603 | 0 | /* Make sure avgload gets updated periodically even |
3604 | 0 | * if there's no activity */ |
3605 | 0 | update_load(ops, rqd, NULL, 0, now); |
3606 | 0 | } |
3607 | 0 |
|
3608 | 0 | /* |
3609 | 0 | * Return task to run next... |
3610 | 0 | */ |
3611 | 0 | ret.time = csched2_runtime(ops, cpu, snext, now); |
3612 | 0 | ret.task = snext->vcpu; |
3613 | 0 |
|
3614 | 0 | CSCHED2_VCPU_CHECK(ret.task); |
3615 | 0 | return ret; |
3616 | 0 | } |
3617 | | |
3618 | | static void |
3619 | | csched2_dump_vcpu(struct csched2_private *prv, struct csched2_vcpu *svc) |
3620 | 0 | { |
3621 | 0 | printk("[%i.%i] flags=%x cpu=%i", |
3622 | 0 | svc->vcpu->domain->domain_id, |
3623 | 0 | svc->vcpu->vcpu_id, |
3624 | 0 | svc->flags, |
3625 | 0 | svc->vcpu->processor); |
3626 | 0 |
|
3627 | 0 | printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight); |
3628 | 0 |
|
3629 | 0 | if ( has_cap(svc) ) |
3630 | 0 | printk(" budget=%"PRI_stime"(%"PRI_stime")", |
3631 | 0 | svc->budget, svc->budget_quota); |
3632 | 0 |
|
3633 | 0 | printk(" load=%"PRI_stime" (~%"PRI_stime"%%)", svc->avgload, |
3634 | 0 | (svc->avgload * 100) >> prv->load_precision_shift); |
3635 | 0 |
|
3636 | 0 | printk("\n"); |
3637 | 0 | } |
3638 | | |
3639 | | static inline void |
3640 | | dump_pcpu(const struct scheduler *ops, int cpu) |
3641 | 0 | { |
3642 | 0 | struct csched2_private *prv = csched2_priv(ops); |
3643 | 0 | struct csched2_vcpu *svc; |
3644 | 0 | #define cpustr keyhandler_scratch |
3645 | 0 |
|
3646 | 0 | cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cpu)); |
3647 | 0 | printk("CPU[%02d] runq=%d, sibling=%s, ", cpu, c2r(cpu), cpustr); |
3648 | 0 | cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu)); |
3649 | 0 | printk("core=%s\n", cpustr); |
3650 | 0 |
|
3651 | 0 | /* current VCPU (nothing to say if that's the idle vcpu) */ |
3652 | 0 | svc = csched2_vcpu(curr_on_cpu(cpu)); |
3653 | 0 | if ( svc && !is_idle_vcpu(svc->vcpu) ) |
3654 | 0 | { |
3655 | 0 | printk("\trun: "); |
3656 | 0 | csched2_dump_vcpu(prv, svc); |
3657 | 0 | } |
3658 | 0 | #undef cpustr |
3659 | 0 | } |
3660 | | |
3661 | | static void |
3662 | | csched2_dump(const struct scheduler *ops) |
3663 | 0 | { |
3664 | 0 | struct list_head *iter_sdom; |
3665 | 0 | struct csched2_private *prv = csched2_priv(ops); |
3666 | 0 | unsigned long flags; |
3667 | 0 | unsigned int i, j, loop; |
3668 | 0 | #define cpustr keyhandler_scratch |
3669 | 0 |
|
3670 | 0 | /* |
3671 | 0 | * We need the private scheduler lock as we access global |
3672 | 0 | * scheduler data and (below) the list of active domains. |
3673 | 0 | */ |
3674 | 0 | read_lock_irqsave(&prv->lock, flags); |
3675 | 0 |
|
3676 | 0 | printk("Active queues: %d\n" |
3677 | 0 | "\tdefault-weight = %d\n", |
3678 | 0 | cpumask_weight(&prv->active_queues), |
3679 | 0 | CSCHED2_DEFAULT_WEIGHT); |
3680 | 0 | for_each_cpu(i, &prv->active_queues) |
3681 | 0 | { |
3682 | 0 | s_time_t fraction; |
3683 | 0 |
|
3684 | 0 | fraction = (prv->rqd[i].avgload * 100) >> prv->load_precision_shift; |
3685 | 0 |
|
3686 | 0 | cpulist_scnprintf(cpustr, sizeof(cpustr), &prv->rqd[i].active); |
3687 | 0 | printk("Runqueue %d:\n" |
3688 | 0 | "\tncpus = %u\n" |
3689 | 0 | "\tcpus = %s\n" |
3690 | 0 | "\tmax_weight = %u\n" |
3691 | 0 | "\tpick_bias = %u\n" |
3692 | 0 | "\tinstload = %d\n" |
3693 | 0 | "\taveload = %"PRI_stime" (~%"PRI_stime"%%)\n", |
3694 | 0 | i, |
3695 | 0 | cpumask_weight(&prv->rqd[i].active), |
3696 | 0 | cpustr, |
3697 | 0 | prv->rqd[i].max_weight, |
3698 | 0 | prv->rqd[i].pick_bias, |
3699 | 0 | prv->rqd[i].load, |
3700 | 0 | prv->rqd[i].avgload, |
3701 | 0 | fraction); |
3702 | 0 |
|
3703 | 0 | cpumask_scnprintf(cpustr, sizeof(cpustr), &prv->rqd[i].idle); |
3704 | 0 | printk("\tidlers: %s\n", cpustr); |
3705 | 0 | cpumask_scnprintf(cpustr, sizeof(cpustr), &prv->rqd[i].tickled); |
3706 | 0 | printk("\ttickled: %s\n", cpustr); |
3707 | 0 | cpumask_scnprintf(cpustr, sizeof(cpustr), &prv->rqd[i].smt_idle); |
3708 | 0 | printk("\tfully idle cores: %s\n", cpustr); |
3709 | 0 | } |
3710 | 0 |
|
3711 | 0 | printk("Domain info:\n"); |
3712 | 0 | loop = 0; |
3713 | 0 | list_for_each( iter_sdom, &prv->sdom ) |
3714 | 0 | { |
3715 | 0 | struct csched2_dom *sdom; |
3716 | 0 | struct vcpu *v; |
3717 | 0 |
|
3718 | 0 | sdom = list_entry(iter_sdom, struct csched2_dom, sdom_elem); |
3719 | 0 |
|
3720 | 0 | printk("\tDomain: %d w %d c %u v %d\n", |
3721 | 0 | sdom->dom->domain_id, |
3722 | 0 | sdom->weight, |
3723 | 0 | sdom->cap, |
3724 | 0 | sdom->nr_vcpus); |
3725 | 0 |
|
3726 | 0 | for_each_vcpu( sdom->dom, v ) |
3727 | 0 | { |
3728 | 0 | struct csched2_vcpu * const svc = csched2_vcpu(v); |
3729 | 0 | spinlock_t *lock; |
3730 | 0 |
|
3731 | 0 | lock = vcpu_schedule_lock(svc->vcpu); |
3732 | 0 |
|
3733 | 0 | printk("\t%3d: ", ++loop); |
3734 | 0 | csched2_dump_vcpu(prv, svc); |
3735 | 0 |
|
3736 | 0 | vcpu_schedule_unlock(lock, svc->vcpu); |
3737 | 0 | } |
3738 | 0 | } |
3739 | 0 |
|
3740 | 0 | for_each_cpu(i, &prv->active_queues) |
3741 | 0 | { |
3742 | 0 | struct csched2_runqueue_data *rqd = prv->rqd + i; |
3743 | 0 | struct list_head *iter, *runq = &rqd->runq; |
3744 | 0 | int loop = 0; |
3745 | 0 |
|
3746 | 0 | /* We need the lock to scan the runqueue. */ |
3747 | 0 | spin_lock(&rqd->lock); |
3748 | 0 |
|
3749 | 0 | printk("Runqueue %d:\n", i); |
3750 | 0 |
|
3751 | 0 | for_each_cpu(j, &rqd->active) |
3752 | 0 | dump_pcpu(ops, j); |
3753 | 0 |
|
3754 | 0 | printk("RUNQ:\n"); |
3755 | 0 | list_for_each( iter, runq ) |
3756 | 0 | { |
3757 | 0 | struct csched2_vcpu *svc = runq_elem(iter); |
3758 | 0 |
|
3759 | 0 | if ( svc ) |
3760 | 0 | { |
3761 | 0 | printk("\t%3d: ", loop++); |
3762 | 0 | csched2_dump_vcpu(prv, svc); |
3763 | 0 | } |
3764 | 0 | } |
3765 | 0 | spin_unlock(&rqd->lock); |
3766 | 0 | } |
3767 | 0 |
|
3768 | 0 | read_unlock_irqrestore(&prv->lock, flags); |
3769 | 0 | #undef cpustr |
3770 | 0 | } |
3771 | | |
3772 | | /* Returns the ID of the runqueue the cpu is assigned to. */ |
3773 | | static unsigned |
3774 | | init_pdata(struct csched2_private *prv, unsigned int cpu) |
3775 | 0 | { |
3776 | 0 | unsigned rqi; |
3777 | 0 | struct csched2_runqueue_data *rqd; |
3778 | 0 |
|
3779 | 0 | ASSERT(rw_is_write_locked(&prv->lock)); |
3780 | 0 | ASSERT(!cpumask_test_cpu(cpu, &prv->initialized)); |
3781 | 0 |
|
3782 | 0 | /* Figure out which runqueue to put it in */ |
3783 | 0 | rqi = cpu_to_runqueue(prv, cpu); |
3784 | 0 |
|
3785 | 0 | rqd = prv->rqd + rqi; |
3786 | 0 |
|
3787 | 0 | printk(XENLOG_INFO "Adding cpu %d to runqueue %d\n", cpu, rqi); |
3788 | 0 | if ( ! cpumask_test_cpu(rqi, &prv->active_queues) ) |
3789 | 0 | { |
3790 | 0 | printk(XENLOG_INFO " First cpu on runqueue, activating\n"); |
3791 | 0 | activate_runqueue(prv, rqi); |
3792 | 0 | } |
3793 | 0 | |
3794 | 0 | /* Set the runqueue map */ |
3795 | 0 | per_cpu(runq_map, cpu) = rqi; |
3796 | 0 | |
3797 | 0 | __cpumask_set_cpu(cpu, &rqd->idle); |
3798 | 0 | __cpumask_set_cpu(cpu, &rqd->active); |
3799 | 0 | __cpumask_set_cpu(cpu, &prv->initialized); |
3800 | 0 | __cpumask_set_cpu(cpu, &rqd->smt_idle); |
3801 | 0 |
|
3802 | 0 | if ( cpumask_weight(&rqd->active) == 1 ) |
3803 | 0 | rqd->pick_bias = cpu; |
3804 | 0 |
|
3805 | 0 | return rqi; |
3806 | 0 | } |
3807 | | |
3808 | | static void |
3809 | | csched2_init_pdata(const struct scheduler *ops, void *pdata, int cpu) |
3810 | 0 | { |
3811 | 0 | struct csched2_private *prv = csched2_priv(ops); |
3812 | 0 | spinlock_t *old_lock; |
3813 | 0 | unsigned long flags; |
3814 | 0 | unsigned rqi; |
3815 | 0 |
|
3816 | 0 | /* |
3817 | 0 | * pdata contains what alloc_pdata returned. But since we don't (need to) |
3818 | 0 | * implement alloc_pdata, either that's NULL, or something is very wrong! |
3819 | 0 | */ |
3820 | 0 | ASSERT(!pdata); |
3821 | 0 |
|
3822 | 0 | write_lock_irqsave(&prv->lock, flags); |
3823 | 0 | old_lock = pcpu_schedule_lock(cpu); |
3824 | 0 |
|
3825 | 0 | rqi = init_pdata(prv, cpu); |
3826 | 0 | /* Move the scheduler lock to the new runq lock. */ |
3827 | 0 | per_cpu(schedule_data, cpu).schedule_lock = &prv->rqd[rqi].lock; |
3828 | 0 |
|
3829 | 0 | /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */ |
3830 | 0 | spin_unlock(old_lock); |
3831 | 0 | write_unlock_irqrestore(&prv->lock, flags); |
3832 | 0 | } |
3833 | | |
3834 | | /* Change the scheduler of cpu to us (Credit2). */ |
3835 | | static void |
3836 | | csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu, |
3837 | | void *pdata, void *vdata) |
3838 | 0 | { |
3839 | 0 | struct csched2_private *prv = csched2_priv(new_ops); |
3840 | 0 | struct csched2_vcpu *svc = vdata; |
3841 | 0 | unsigned rqi; |
3842 | 0 |
|
3843 | 0 | ASSERT(!pdata && svc && is_idle_vcpu(svc->vcpu)); |
3844 | 0 |
|
3845 | 0 | /* |
3846 | 0 | * We own one runqueue lock already (from schedule_cpu_switch()). This |
3847 | 0 | * looks like it violates this scheduler's locking rules, but it does |
3848 | 0 | * not, as what we own is the lock of another scheduler, that hence has |
3849 | 0 | * no particular (ordering) relationship with our private global lock. |
3850 | 0 | * And owning exactly that one (the lock of the old scheduler of this |
3851 | 0 | * cpu) is what is necessary to prevent races. |
3852 | 0 | */ |
3853 | 0 | ASSERT(!local_irq_is_enabled()); |
3854 | 0 | write_lock(&prv->lock); |
3855 | 0 |
|
3856 | 0 | idle_vcpu[cpu]->sched_priv = vdata; |
3857 | 0 |
|
3858 | 0 | rqi = init_pdata(prv, cpu); |
3859 | 0 |
|
3860 | 0 | /* |
3861 | 0 | * Now that we know what runqueue we'll go in, double check what's said |
3862 | 0 | * above: the lock we already hold is not the one of this runqueue of |
3863 | 0 | * this scheduler, and so it's safe to have taken it /before/ our |
3864 | 0 | * private global lock. |
3865 | 0 | */ |
3866 | 0 | ASSERT(per_cpu(schedule_data, cpu).schedule_lock != &prv->rqd[rqi].lock); |
3867 | 0 |
|
3868 | 0 | per_cpu(scheduler, cpu) = new_ops; |
3869 | 0 | per_cpu(schedule_data, cpu).sched_priv = NULL; /* no pdata */ |
3870 | 0 |
|
3871 | 0 | /* |
3872 | 0 |      * (Re?)route the lock to the per-pCPU lock as the /last/ thing. In
3873 | 0 |      * fact, if it is free (and it can be), we want anyone who manages to
3874 | 0 |      * take it to find all the initializations we've done above in place.
3875 | 0 | */ |
3876 | 0 | smp_mb(); |
3877 | 0 | per_cpu(schedule_data, cpu).schedule_lock = &prv->rqd[rqi].lock; |
3878 | 0 |
|
3879 | 0 | write_unlock(&prv->lock); |
3880 | 0 | } |
3881 | | |
3882 | | static void |
3883 | | csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) |
3884 | 0 | { |
3885 | 0 | unsigned long flags; |
3886 | 0 | struct csched2_private *prv = csched2_priv(ops); |
3887 | 0 | struct csched2_runqueue_data *rqd; |
3888 | 0 | int rqi; |
3889 | 0 |
|
3890 | 0 | write_lock_irqsave(&prv->lock, flags); |
3891 | 0 |
|
3892 | 0 | /* |
3893 | 0 | * alloc_pdata is not implemented, so pcpu must be NULL. On the other |
3894 | 0 | * hand, init_pdata must have been called for this pCPU. |
3895 | 0 | */ |
3896 | 0 | ASSERT(!pcpu && cpumask_test_cpu(cpu, &prv->initialized)); |
3897 | 0 | |
3898 | 0 | /* Find the old runqueue and remove this cpu from it */ |
3899 | 0 | rqi = per_cpu(runq_map, cpu); |
3900 | 0 |
|
3901 | 0 | rqd = prv->rqd + rqi; |
3902 | 0 |
|
3903 | 0 | /* No need to save IRQs here, they're already disabled */ |
3904 | 0 | spin_lock(&rqd->lock); |
3905 | 0 |
|
3906 | 0 | printk(XENLOG_INFO "Removing cpu %d from runqueue %d\n", cpu, rqi); |
3907 | 0 |
|
3908 | 0 | __cpumask_clear_cpu(cpu, &rqd->idle); |
3909 | 0 | __cpumask_clear_cpu(cpu, &rqd->smt_idle); |
3910 | 0 | __cpumask_clear_cpu(cpu, &rqd->active); |
3911 | 0 |
|
3912 | 0 | if ( cpumask_empty(&rqd->active) ) |
3913 | 0 | { |
3914 | 0 | printk(XENLOG_INFO " No cpus left on runqueue, disabling\n"); |
3915 | 0 | deactivate_runqueue(prv, rqi); |
3916 | 0 | } |
3917 | 0 | else if ( rqd->pick_bias == cpu ) |
3918 | 0 | rqd->pick_bias = cpumask_first(&rqd->active); |
3919 | 0 |
|
3920 | 0 | per_cpu(runq_map, cpu) = -1; |
3921 | 0 |
|
3922 | 0 | spin_unlock(&rqd->lock); |
3923 | 0 |
|
3924 | 0 | __cpumask_clear_cpu(cpu, &prv->initialized); |
3925 | 0 |
|
3926 | 0 | write_unlock_irqrestore(&prv->lock, flags); |
3927 | 0 |
|
3928 | 0 | return; |
3929 | 0 | } |
3930 | | |
3931 | | static int |
3932 | | csched2_init(struct scheduler *ops) |
3933 | 0 | { |
3934 | 0 | int i; |
3935 | 0 | struct csched2_private *prv; |
3936 | 0 |
|
3937 | 0 | printk("Initializing Credit2 scheduler\n"); |
3938 | 0 |
|
3939 | 0 | printk(XENLOG_INFO " load_precision_shift: %d\n" |
3940 | 0 | XENLOG_INFO " load_window_shift: %d\n" |
3941 | 0 | XENLOG_INFO " underload_balance_tolerance: %d\n" |
3942 | 0 | XENLOG_INFO " overload_balance_tolerance: %d\n" |
3943 | 0 | XENLOG_INFO " runqueues arrangement: %s\n" |
3944 | 0 | XENLOG_INFO " cap enforcement granularity: %dms\n", |
3945 | 0 | opt_load_precision_shift, |
3946 | 0 | opt_load_window_shift, |
3947 | 0 | opt_underload_balance_tolerance, |
3948 | 0 | opt_overload_balance_tolerance, |
3949 | 0 | opt_runqueue_str[opt_runqueue], |
3950 | 0 | opt_cap_period); |
3951 | 0 |
|
3952 | 0 | if ( opt_load_precision_shift < LOADAVG_PRECISION_SHIFT_MIN ) |
3953 | 0 | { |
3954 | 0 | printk("WARNING: %s: opt_load_precision_shift %d below min %d, resetting\n", |
3955 | 0 | __func__, opt_load_precision_shift, LOADAVG_PRECISION_SHIFT_MIN); |
3956 | 0 | opt_load_precision_shift = LOADAVG_PRECISION_SHIFT_MIN; |
3957 | 0 | } |
3958 | 0 |
|
3959 | 0 | if ( opt_load_window_shift <= LOADAVG_GRANULARITY_SHIFT ) |
3960 | 0 | { |
3961 | 0 | printk("WARNING: %s: opt_load_window_shift %d too short, resetting\n", |
3962 | 0 | __func__, opt_load_window_shift); |
3963 | 0 | opt_load_window_shift = LOADAVG_WINDOW_SHIFT; |
3964 | 0 | } |
3965 | 0 | printk(XENLOG_INFO "load tracking window length %llu ns\n", |
3966 | 0 | 1ULL << opt_load_window_shift); |
3967 | 0 |
|
3968 | 0 | if ( CSCHED2_BDGT_REPL_PERIOD < CSCHED2_MIN_TIMER ) |
3969 | 0 | { |
3970 | 0 | printk("WARNING: %s: opt_cap_period %d too small, resetting\n", |
3971 | 0 | __func__, opt_cap_period); |
3972 | 0 | opt_cap_period = 10; /* ms */ |
3973 | 0 | } |
3974 | 0 |
|
3975 | 0 | /* |
3976 | 0 | * Basically no CPU information is available at this point; just |
3977 | 0 | * set up basic structures, and a callback when the CPU info is |
3978 | 0 | * available. |
3979 | 0 | */ |
3980 | 0 |
|
3981 | 0 | prv = xzalloc(struct csched2_private); |
3982 | 0 | if ( prv == NULL ) |
3983 | 0 | return -ENOMEM; |
3984 | 0 | ops->sched_data = prv; |
3985 | 0 |
|
3986 | 0 | rwlock_init(&prv->lock); |
3987 | 0 | INIT_LIST_HEAD(&prv->sdom); |
3988 | 0 |
|
3989 | 0 | /* Allocate all runqueues and mark them as un-initialized */ |
3990 | 0 | prv->rqd = xzalloc_array(struct csched2_runqueue_data, nr_cpu_ids); |
3991 | 0 | if ( !prv->rqd ) |
3992 | 0 | { |
3993 | 0 | xfree(prv); |
3994 | 0 | return -ENOMEM; |
3995 | 0 | } |
3996 | 0 | for ( i = 0; i < nr_cpu_ids; i++ ) |
3997 | 0 | prv->rqd[i].id = -1; |
3998 | 0 |
|
3999 | 0 | /* initialize ratelimit */ |
4000 | 0 | prv->ratelimit_us = sched_ratelimit_us; |
4001 | 0 |
|
4002 | 0 | prv->load_precision_shift = opt_load_precision_shift; |
4003 | 0 | prv->load_window_shift = opt_load_window_shift - LOADAVG_GRANULARITY_SHIFT; |
4004 | 0 | ASSERT(opt_load_window_shift > 0); |
4005 | 0 |
|
4006 | 0 | return 0; |
4007 | 0 | } |
4008 | | |
4009 | | static void |
4010 | | csched2_deinit(struct scheduler *ops) |
4011 | 0 | { |
4012 | 0 | struct csched2_private *prv; |
4013 | 0 |
|
4014 | 0 | prv = csched2_priv(ops); |
4015 | 0 | ops->sched_data = NULL; |
4016 | 0 | xfree(prv); |
4017 | 0 | } |
4018 | | |
4019 | | static const struct scheduler sched_credit2_def = { |
4020 | | .name = "SMP Credit Scheduler rev2", |
4021 | | .opt_name = "credit2", |
4022 | | .sched_id = XEN_SCHEDULER_CREDIT2, |
4023 | | .sched_data = NULL, |
4024 | | |
4025 | | .init_domain = csched2_dom_init, |
4026 | | .destroy_domain = csched2_dom_destroy, |
4027 | | |
4028 | | .insert_vcpu = csched2_vcpu_insert, |
4029 | | .remove_vcpu = csched2_vcpu_remove, |
4030 | | |
4031 | | .sleep = csched2_vcpu_sleep, |
4032 | | .wake = csched2_vcpu_wake, |
4033 | | .yield = csched2_vcpu_yield, |
4034 | | |
4035 | | .adjust = csched2_dom_cntl, |
4036 | | .adjust_global = csched2_sys_cntl, |
4037 | | |
4038 | | .pick_cpu = csched2_cpu_pick, |
4039 | | .migrate = csched2_vcpu_migrate, |
4040 | | .do_schedule = csched2_schedule, |
4041 | | .context_saved = csched2_context_saved, |
4042 | | |
4043 | | .dump_settings = csched2_dump, |
4044 | | .init = csched2_init, |
4045 | | .deinit = csched2_deinit, |
4046 | | .alloc_vdata = csched2_alloc_vdata, |
4047 | | .free_vdata = csched2_free_vdata, |
4048 | | .init_pdata = csched2_init_pdata, |
4049 | | .deinit_pdata = csched2_deinit_pdata, |
4050 | | .switch_sched = csched2_switch_sched, |
4051 | | .alloc_domdata = csched2_alloc_domdata, |
4052 | | .free_domdata = csched2_free_domdata, |
4053 | | }; |
4054 | | |
4055 | | REGISTER_SCHEDULER(sched_credit2_def); |