/root/src/xen/xen/common/sched_credit.c
Line | Count | Source |
1 | | /**************************************************************************** |
2 | | * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc. |
3 | | **************************************************************************** |
4 | | * |
5 | | * File: common/csched_credit.c |
6 | | * Author: Emmanuel Ackaouy |
7 | | * |
8 | | * Description: Credit-based SMP CPU scheduler |
9 | | */ |
10 | | |
11 | | #include <xen/init.h> |
12 | | #include <xen/lib.h> |
13 | | #include <xen/sched.h> |
14 | | #include <xen/domain.h> |
15 | | #include <xen/delay.h> |
16 | | #include <xen/event.h> |
17 | | #include <xen/time.h> |
18 | | #include <xen/sched-if.h> |
19 | | #include <xen/softirq.h> |
20 | | #include <asm/atomic.h> |
21 | | #include <asm/div64.h> |
22 | | #include <xen/errno.h> |
23 | | #include <xen/keyhandler.h> |
24 | | #include <xen/trace.h> |
25 | | #include <xen/err.h> |
26 | | |
27 | | |
28 | | /* |
29 | | * Locking: |
30 | | * - Scheduler-lock (a.k.a. runqueue lock): |
31 | | * + is per-runqueue, and there is one runqueue per-cpu; |
32 | | * + serializes all runqueue manipulation operations; |
33 | | * - Private data lock (a.k.a. private scheduler lock): |
34 | | * + serializes accesses to the scheduler global state (weight, |
35 | | * credit, balance_credit, etc); |
36 | | * + serializes updates to the domains' scheduling parameters. |
37 | | * |
38 | | * Ordering is "private lock always comes first": |
39 | | * + if we need both locks, we must acquire the private |
40 | | * scheduler lock first; |
41 | | * + if we already own a runqueue lock, we must never acquire |
42 | | * the private scheduler lock. |
43 | | */ |
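The ordering rule above can be condensed into a standalone sketch. This is
illustrative only (plain pthreads, hypothetical helper names); in Xen the two
locks are prv->lock and the per-runqueue schedule_lock:

    /* Illustration of the "private lock always comes first" rule.
     * Standalone, hypothetical code; not taken from sched_credit.c. */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t private_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t runq_lock[2] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
    };

    /* Needs both locks: take the private (global) lock first, then the
     * per-cpu runqueue lock. */
    static void set_weight_and_requeue(int cpu)
    {
        pthread_mutex_lock(&private_lock);
        pthread_mutex_lock(&runq_lock[cpu]);
        printf("global state and runqueue of cpu %d updated\n", cpu);
        pthread_mutex_unlock(&runq_lock[cpu]);
        pthread_mutex_unlock(&private_lock);
    }

    /* Runqueue-only path: while this lock is held we must never try to
     * take private_lock, or we would deadlock against the path above. */
    static void local_runq_work(int cpu)
    {
        pthread_mutex_lock(&runq_lock[cpu]);
        printf("runqueue of cpu %d re-sorted\n", cpu);
        pthread_mutex_unlock(&runq_lock[cpu]);
    }

    int main(void)
    {
        set_weight_and_requeue(0);
        local_runq_work(1);
        return 0;
    }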
44 | | |
45 | | /* |
46 | | * Basic constants |
47 | | */ |
48 | 1 | #define CSCHED_DEFAULT_WEIGHT 256 |
49 | 1 | #define CSCHED_TICKS_PER_TSLICE 3 |
50 | | /* Default timeslice: 30ms */ |
51 | 0 | #define CSCHED_DEFAULT_TSLICE_MS 30 |
52 | 3.07M | #define CSCHED_CREDITS_PER_MSEC 10 |
53 | | /* Never set a timer shorter than this value. */ |
54 | 57 | #define CSCHED_MIN_TIMER XEN_SYSCTL_SCHED_RATELIMIT_MIN |
55 | | |
56 | | |
57 | | /* |
58 | | * Priorities |
59 | | */ |
60 | 12.9k | #define CSCHED_PRI_TS_BOOST 0 /* time-share waking up */ |
61 | 76.8k | #define CSCHED_PRI_TS_UNDER -1 /* time-share w/ credits */ |
62 | 5.02M | #define CSCHED_PRI_TS_OVER -2 /* time-share w/o credits */ |
63 | 10.1M | #define CSCHED_PRI_IDLE -64 /* idle */ |
64 | | |
65 | | |
66 | | /* |
67 | | * Flags |
68 | | * |
69 | | * Note that svc->flags (where these flags live) is protected by an |
70 | | * inconsistent set of locks. Therefore atomic-safe bit operations must |
71 | | * be used for accessing it. |
72 | | */ |
73 | | #define CSCHED_FLAG_VCPU_PARKED 0x0 /* VCPU over capped credits */ |
74 | | #define CSCHED_FLAG_VCPU_YIELD 0x1 /* VCPU yielding */ |
75 | | #define CSCHED_FLAG_VCPU_MIGRATING 0x2 /* VCPU may have moved to a new pcpu */ |
76 | | |
77 | | |
78 | | /* |
79 | | * Useful macros |
80 | | */ |
81 | | #define CSCHED_PRIV(_ops) \ |
82 | 6.65M | ((struct csched_private *)((_ops)->sched_data)) |
83 | | #define CSCHED_PCPU(_c) \ |
84 | 19.5M | ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv) |
85 | 19.0M | #define CSCHED_VCPU(_vcpu) ((struct csched_vcpu *) (_vcpu)->sched_priv) |
86 | 0 | #define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv) |
87 | 9.53M | #define RUNQ(_cpu) (&(CSCHED_PCPU(_cpu)->runq)) |
88 | | |
89 | | |
90 | | /* |
91 | | * CSCHED_STATS |
92 | | * |
93 | | * Manage very basic per-vCPU counters and stats. |
94 | | * |
95 | | * Useful for debugging live systems. The stats are displayed |
96 | | * with runq dumps ('r' on the Xen console). |
97 | | */ |
98 | | #ifdef SCHED_STATS |
99 | | |
100 | | #define CSCHED_STATS |
101 | | |
102 | | #define SCHED_VCPU_STATS_RESET(_V) \ |
103 | | do \ |
104 | | { \ |
105 | | memset(&(_V)->stats, 0, sizeof((_V)->stats)); \ |
106 | | } while ( 0 ) |
107 | | |
108 | | #define SCHED_VCPU_STAT_CRANK(_V, _X) (((_V)->stats._X)++) |
109 | | |
110 | | #define SCHED_VCPU_STAT_SET(_V, _X, _Y) (((_V)->stats._X) = (_Y)) |
111 | | |
112 | | #else /* !SCHED_STATS */ |
113 | | |
114 | | #undef CSCHED_STATS |
115 | | |
116 | 24 | #define SCHED_VCPU_STATS_RESET(_V) do {} while ( 0 ) |
117 | 2.74k | #define SCHED_VCPU_STAT_CRANK(_V, _X) do {} while ( 0 ) |
118 | 9.30k | #define SCHED_VCPU_STAT_SET(_V, _X, _Y) do {} while ( 0 ) |
119 | | |
120 | | #endif /* SCHED_STATS */ |
121 | | |
122 | | |
123 | | /* |
124 | | * Credit tracing events ("only" 512 available!). Check |
125 | | * include/public/trace.h for more details. |
126 | | */ |
127 | | #define TRC_CSCHED_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED, 1) |
128 | | #define TRC_CSCHED_ACCOUNT_START TRC_SCHED_CLASS_EVT(CSCHED, 2) |
129 | | #define TRC_CSCHED_ACCOUNT_STOP TRC_SCHED_CLASS_EVT(CSCHED, 3) |
130 | | #define TRC_CSCHED_STOLEN_VCPU TRC_SCHED_CLASS_EVT(CSCHED, 4) |
131 | | #define TRC_CSCHED_PICKED_CPU TRC_SCHED_CLASS_EVT(CSCHED, 5) |
132 | 0 | #define TRC_CSCHED_TICKLE TRC_SCHED_CLASS_EVT(CSCHED, 6) |
133 | | #define TRC_CSCHED_BOOST_START TRC_SCHED_CLASS_EVT(CSCHED, 7) |
134 | | #define TRC_CSCHED_BOOST_END TRC_SCHED_CLASS_EVT(CSCHED, 8) |
135 | 0 | #define TRC_CSCHED_SCHEDULE TRC_SCHED_CLASS_EVT(CSCHED, 9) |
136 | 0 | #define TRC_CSCHED_RATELIMIT TRC_SCHED_CLASS_EVT(CSCHED, 10) |
137 | | #define TRC_CSCHED_STEAL_CHECK TRC_SCHED_CLASS_EVT(CSCHED, 11) |
138 | | |
139 | | /* |
140 | | * Boot parameters |
141 | | */ |
142 | | static int __read_mostly sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS; |
143 | | integer_param("sched_credit_tslice_ms", sched_credit_tslice_ms); |
144 | | |
145 | | /* |
146 | | * Physical CPU |
147 | | */ |
148 | | struct csched_pcpu { |
149 | | struct list_head runq; |
150 | | uint32_t runq_sort_last; |
151 | | |
152 | | unsigned int idle_bias; |
153 | | unsigned int nr_runnable; |
154 | | |
155 | | unsigned int tick; |
156 | | struct timer ticker; |
157 | | }; |
158 | | |
159 | | /* |
160 | | * Virtual CPU |
161 | | */ |
162 | | struct csched_vcpu { |
163 | | struct list_head runq_elem; |
164 | | struct list_head active_vcpu_elem; |
165 | | |
166 | | /* Up-pointers */ |
167 | | struct csched_dom *sdom; |
168 | | struct vcpu *vcpu; |
169 | | |
170 | | s_time_t start_time; /* When we were scheduled (used for credit) */ |
171 | | unsigned flags; |
172 | | int pri; |
173 | | |
174 | | atomic_t credit; |
175 | | unsigned int residual; |
176 | | |
177 | | #ifdef CSCHED_STATS |
178 | | struct { |
179 | | int credit_last; |
180 | | uint32_t credit_incr; |
181 | | uint32_t state_active; |
182 | | uint32_t state_idle; |
183 | | uint32_t migrate_q; |
184 | | uint32_t migrate_r; |
185 | | uint32_t kicked_away; |
186 | | } stats; |
187 | | #endif |
188 | | }; |
189 | | |
190 | | /* |
191 | | * Domain |
192 | | */ |
193 | | struct csched_dom { |
194 | | struct list_head active_vcpu; |
195 | | struct list_head active_sdom_elem; |
196 | | struct domain *dom; |
197 | | uint16_t active_vcpu_count; |
198 | | uint16_t weight; |
199 | | uint16_t cap; |
200 | | }; |
201 | | |
202 | | /* |
203 | | * System-wide private data |
204 | | */ |
205 | | struct csched_private { |
206 | | /* lock for the whole pluggable scheduler, nests inside cpupool_lock */ |
207 | | spinlock_t lock; |
208 | | |
209 | | cpumask_var_t idlers; |
210 | | cpumask_var_t cpus; |
211 | | uint32_t *balance_bias; |
212 | | uint32_t runq_sort; |
213 | | unsigned int ratelimit_us; |
214 | | |
215 | | /* Timeslice (in ms), tick period (in us), ticks per timeslice */ |
216 | | unsigned int tslice_ms, tick_period_us, ticks_per_tslice; |
217 | | uint32_t ncpus; |
218 | | |
219 | | struct list_head active_sdom; |
220 | | uint32_t weight; |
221 | | uint32_t credit; |
222 | | int credit_balance; |
223 | | unsigned int credits_per_tslice; |
224 | | |
225 | | unsigned int master; |
226 | | struct timer master_ticker; |
227 | | }; |
228 | | |
229 | | static void csched_tick(void *_cpu); |
230 | | static void csched_acct(void *dummy); |
231 | | |
232 | | static inline int |
233 | | __vcpu_on_runq(struct csched_vcpu *svc) |
234 | 9.60M | { |
235 | 9.60M | return !list_empty(&svc->runq_elem); |
236 | 9.60M | } |
237 | | |
238 | | static inline struct csched_vcpu * |
239 | | __runq_elem(struct list_head *elem) |
240 | 13.6M | { |
241 | 13.6M | return list_entry(elem, struct csched_vcpu, runq_elem); |
242 | 13.6M | } |
243 | | |
244 | | /* Is the first element of cpu's runq (if any) cpu's idle vcpu? */ |
245 | | static inline bool_t is_runq_idle(unsigned int cpu) |
246 | 10.4k | { |
247 | 10.4k | /* |
248 | 10.4k | * We're peeking at cpu's runq, we must hold the proper lock. |
249 | 10.4k | */ |
250 | 10.4k | ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock)); |
251 | 10.4k | |
252 | 10.4k | return list_empty(RUNQ(cpu)) || |
253 | 10.3k | is_idle_vcpu(__runq_elem(RUNQ(cpu)->next)->vcpu); |
254 | 10.4k | } |
255 | | |
256 | | static inline void |
257 | | inc_nr_runnable(unsigned int cpu) |
258 | 67.3k | { |
259 | 67.3k | ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock)); |
260 | 67.3k | CSCHED_PCPU(cpu)->nr_runnable++; |
261 | 67.3k | |
262 | 67.3k | } |
263 | | |
264 | | static inline void |
265 | | dec_nr_runnable(unsigned int cpu) |
266 | 65.5k | { |
267 | 65.5k | ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock)); |
268 | 65.5k | ASSERT(CSCHED_PCPU(cpu)->nr_runnable >= 1); |
269 | 65.5k | CSCHED_PCPU(cpu)->nr_runnable--; |
270 | 65.5k | } |
271 | | |
272 | | static inline void |
273 | | __runq_insert(struct csched_vcpu *svc) |
274 | 4.81M | { |
275 | 4.81M | unsigned int cpu = svc->vcpu->processor; |
276 | 4.81M | const struct list_head * const runq = RUNQ(cpu); |
277 | 4.81M | struct list_head *iter; |
278 | 4.81M | |
279 | 4.81M | BUG_ON( __vcpu_on_runq(svc) ); |
280 | 4.81M | |
281 | 4.81M | list_for_each( iter, runq ) |
282 | 4.66M | { |
283 | 4.66M | const struct csched_vcpu * const iter_svc = __runq_elem(iter); |
284 | 4.66M | if ( svc->pri > iter_svc->pri ) |
285 | 4.56M | break; |
286 | 4.66M | } |
287 | 4.81M | |
288 | 4.81M | /* If the vcpu yielded, try to put it behind one lower-priority |
289 | 4.81M | * runnable vcpu if we can. The next runq_sort will bring it forward |
290 | 4.81M | * within 30ms if the queue is too long. */ |
291 | 4.81M | if ( test_bit(CSCHED_FLAG_VCPU_YIELD, &svc->flags) |
292 | 4.63M | && __runq_elem(iter)->pri > CSCHED_PRI_IDLE ) |
293 | 600 | { |
294 | 600 | iter=iter->next; |
295 | 600 | |
296 | 600 | /* Some sanity checks */ |
297 | 600 | BUG_ON(iter == runq); |
298 | 600 | } |
299 | 4.81M | |
300 | 4.81M | list_add_tail(&svc->runq_elem, iter); |
301 | 4.81M | } |
302 | | |
303 | | static inline void |
304 | | runq_insert(struct csched_vcpu *svc) |
305 | 66.8k | { |
306 | 66.8k | __runq_insert(svc); |
307 | 66.8k | inc_nr_runnable(svc->vcpu->processor); |
308 | 66.8k | } |
309 | | |
310 | | static inline void |
311 | | __runq_remove(struct csched_vcpu *svc) |
312 | 4.85M | { |
313 | 4.85M | BUG_ON( !__vcpu_on_runq(svc) ); |
314 | 4.85M | list_del_init(&svc->runq_elem); |
315 | 4.85M | } |
316 | | |
317 | | static inline void |
318 | | runq_remove(struct csched_vcpu *svc) |
319 | 507 | { |
320 | 507 | dec_nr_runnable(svc->vcpu->processor); |
321 | 507 | __runq_remove(svc); |
322 | 507 | } |
323 | | |
324 | | static void burn_credits(struct csched_vcpu *svc, s_time_t now) |
325 | 4.64M | { |
326 | 4.64M | s_time_t delta; |
327 | 4.64M | uint64_t val; |
328 | 4.64M | unsigned int credits; |
329 | 4.64M | |
330 | 4.64M | /* Assert svc is current */ |
331 | 4.64M | ASSERT( svc == CSCHED_VCPU(curr_on_cpu(svc->vcpu->processor)) ); |
332 | 4.64M | |
333 | 4.64M | if ( (delta = now - svc->start_time) <= 0 ) |
334 | 3.10M | return; |
335 | 4.64M | |
336 | 1.53M | val = delta * CSCHED_CREDITS_PER_MSEC + svc->residual; |
337 | 1.53M | svc->residual = do_div(val, MILLISECS(1)); |
338 | 1.53M | credits = val; |
339 | 1.53M | ASSERT(credits == val); /* make sure we haven't truncated val */ |
340 | 1.53M | atomic_sub(credits, &svc->credit); |
341 | 1.53M | svc->start_time += (credits * MILLISECS(1)) / CSCHED_CREDITS_PER_MSEC; |
342 | 1.53M | } |
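The arithmetic in burn_credits() can be sanity-checked with a small standalone
example. The constants are the ones defined above (CSCHED_CREDITS_PER_MSEC ==
10, s_time_t values in nanoseconds); the scenario itself, a vcpu that has run
for 2.35ms since start_time, is made up for illustration:

    #include <stdint.h>
    #include <stdio.h>

    #define CREDITS_PER_MSEC 10
    #define NSEC_PER_MSEC    1000000ULL

    int main(void)
    {
        uint64_t delta = 2350000;  /* 2.35ms of run time since start_time */
        uint64_t residual = 0;     /* carried over from the previous burn */

        uint64_t val = delta * CREDITS_PER_MSEC + residual;
        uint64_t credits = val / NSEC_PER_MSEC;    /* do_div() quotient  */
        residual = val % NSEC_PER_MSEC;            /* do_div() remainder */

        /* start_time only advances by the time actually charged; the
         * 0.05ms not yet worth a whole credit stays in residual. */
        uint64_t charged_ns = credits * NSEC_PER_MSEC / CREDITS_PER_MSEC;

        printf("credits burned = %llu (expect 23)\n",
               (unsigned long long)credits);
        printf("residual = %llu ns, start_time advances by %llu ns\n",
               (unsigned long long)residual, (unsigned long long)charged_ns);
        return 0;
    }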
343 | | |
344 | | static bool_t __read_mostly opt_tickle_one_idle = 1; |
345 | | boolean_param("tickle_one_idle_cpu", opt_tickle_one_idle); |
346 | | |
347 | | DEFINE_PER_CPU(unsigned int, last_tickle_cpu); |
348 | | |
349 | | static inline void __runq_tickle(struct csched_vcpu *new) |
350 | 66.8k | { |
351 | 66.8k | unsigned int cpu = new->vcpu->processor; |
352 | 66.8k | struct csched_vcpu * const cur = CSCHED_VCPU(curr_on_cpu(cpu)); |
353 | 66.8k | struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu)); |
354 | 66.8k | cpumask_t mask, idle_mask, *online; |
355 | 66.8k | int balance_step, idlers_empty; |
356 | 66.8k | |
357 | 66.8k | ASSERT(cur); |
358 | 66.8k | cpumask_clear(&mask); |
359 | 66.8k | |
360 | 66.8k | online = cpupool_domain_cpumask(new->sdom->dom); |
361 | 66.8k | cpumask_and(&idle_mask, prv->idlers, online); |
362 | 66.8k | idlers_empty = cpumask_empty(&idle_mask); |
363 | 66.8k | |
364 | 66.8k | /* |
365 | 66.8k | * If the pcpu is idle, or there are no idlers and the new |
366 | 66.8k | * vcpu is a higher priority than the old vcpu, run it here. |
367 | 66.8k | * |
368 | 66.8k | * If there are idle cpus, first try to find one suitable to run |
369 | 66.8k | * new, so we can avoid preempting cur. If we cannot find a |
370 | 66.8k | * suitable idler on which to run new, run it here, but try to |
371 | 66.8k | * find a suitable idler on which to run cur instead. |
372 | 66.8k | */ |
373 | 66.8k | if ( cur->pri == CSCHED_PRI_IDLE |
374 | 196 | || (idlers_empty && new->pri > cur->pri) ) |
375 | 66.6k | { |
376 | 66.6k | if ( cur->pri != CSCHED_PRI_IDLE ) |
377 | 0 | SCHED_STAT_CRANK(tickled_busy_cpu); |
378 | 66.6k | else |
379 | 66.6k | SCHED_STAT_CRANK(tickled_idle_cpu); |
380 | 66.6k | __cpumask_set_cpu(cpu, &mask); |
381 | 66.6k | } |
382 | 184 | else if ( !idlers_empty ) |
383 | 194 | { |
384 | 194 | /* |
385 | 194 | * Soft and hard affinity balancing loop. For vcpus without |
386 | 194 | * a useful soft affinity, consider hard affinity only. |
387 | 194 | */ |
388 | 194 | for_each_affinity_balance_step( balance_step ) |
389 | 388 | { |
390 | 388 | int new_idlers_empty; |
391 | 388 | |
392 | 388 | if ( balance_step == BALANCE_SOFT_AFFINITY |
393 | 194 | && !has_soft_affinity(new->vcpu, |
394 | 194 | new->vcpu->cpu_hard_affinity) ) |
395 | 194 | continue; |
396 | 388 | |
397 | 388 | /* Are there idlers suitable for new (for this balance step)? */ |
398 | 194 | affinity_balance_cpumask(new->vcpu, balance_step, |
399 | 194 | cpumask_scratch_cpu(cpu)); |
400 | 194 | cpumask_and(cpumask_scratch_cpu(cpu), |
401 | 194 | cpumask_scratch_cpu(cpu), &idle_mask); |
402 | 194 | new_idlers_empty = cpumask_empty(cpumask_scratch_cpu(cpu)); |
403 | 194 | |
404 | 194 | /* |
405 | 194 | * Let's not be too harsh! If there aren't idlers suitable |
406 | 194 | * for new in its soft affinity mask, make sure we check its |
407 | 194 | * hard affinity as well, before taking final decisions. |
408 | 194 | */ |
409 | 194 | if ( new_idlers_empty |
410 | 0 | && balance_step == BALANCE_SOFT_AFFINITY ) |
411 | 0 | continue; |
412 | 194 | |
413 | 194 | /* |
414 | 194 | * If there are no suitable idlers for new, and it's higher |
415 | 194 | * priority than cur, check whether we can migrate cur away. |
416 | 194 | * We have to do it indirectly, via _VPF_migrating (instead |
417 | 194 | * of just tickling any idler suitable for cur) because cur |
418 | 194 | * is running. |
419 | 194 | * |
420 | 194 | * If there are suitable idlers for new, no matter priorities, |
421 | 194 | * leave cur alone (as it is running and is, likely, cache-hot) |
422 | 194 | * and wake some of them (which is waking up and so is, likely, |
423 | 194 | * cache cold anyway). |
424 | 194 | */ |
425 | 194 | if ( new_idlers_empty && new->pri > cur->pri ) |
426 | 0 | { |
427 | 0 | if ( cpumask_intersects(cur->vcpu->cpu_hard_affinity, |
428 | 0 | &idle_mask) ) |
429 | 0 | { |
430 | 0 | SCHED_VCPU_STAT_CRANK(cur, kicked_away); |
431 | 0 | SCHED_VCPU_STAT_CRANK(cur, migrate_r); |
432 | 0 | SCHED_STAT_CRANK(migrate_kicked_away); |
433 | 0 | set_bit(_VPF_migrating, &cur->vcpu->pause_flags); |
434 | 0 | } |
435 | 0 | /* Tickle cpu anyway, to let new preempt cur. */ |
436 | 0 | SCHED_STAT_CRANK(tickled_busy_cpu); |
437 | 0 | __cpumask_set_cpu(cpu, &mask); |
438 | 0 | } |
439 | 194 | else if ( !new_idlers_empty ) |
440 | 194 | { |
441 | 194 | /* Which of the idlers suitable for new shall we wake up? */ |
442 | 194 | SCHED_STAT_CRANK(tickled_idle_cpu); |
443 | 194 | if ( opt_tickle_one_idle ) |
444 | 194 | { |
445 | 194 | this_cpu(last_tickle_cpu) = |
446 | 194 | cpumask_cycle(this_cpu(last_tickle_cpu), |
447 | 194 | cpumask_scratch_cpu(cpu)); |
448 | 194 | __cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask); |
449 | 194 | } |
450 | 194 | else |
451 | 0 | cpumask_or(&mask, &mask, cpumask_scratch_cpu(cpu)); |
452 | 194 | } |
453 | 194 | |
454 | 194 | /* Did we find anyone? */ |
455 | 194 | if ( !cpumask_empty(&mask) ) |
456 | 194 | break; |
457 | 194 | } |
458 | 194 | } |
459 | 66.8k | |
460 | 66.8k | if ( !cpumask_empty(&mask) ) |
461 | 66.8k | { |
462 | 66.8k | if ( unlikely(tb_init_done) ) |
463 | 0 | { |
464 | 0 | /* Avoid TRACE_*: saves checking !tb_init_done each step */ |
465 | 0 | for_each_cpu(cpu, &mask) |
466 | 0 | __trace_var(TRC_CSCHED_TICKLE, 1, sizeof(cpu), &cpu); |
467 | 0 | } |
468 | 66.8k | |
469 | 66.8k | /* |
470 | 66.8k | * Mark the designated CPUs as busy and send them all the scheduler |
471 | 66.8k | * interrupt. We need the for_each_cpu for dealing with the |
472 | 66.8k | * !opt_tickle_one_idle case. We must use cpumask_clear_cpu() and |
473 | 66.8k | * can't use cpumask_andnot(), because prv->idlers needs atomic access. |
474 | 66.8k | * |
475 | 66.8k | * In the default (and most common) case, when opt_tickle_one_idle is |
476 | 66.8k | * true, the loop does only one step, and only one bit is cleared. |
477 | 66.8k | */ |
478 | 66.8k | for_each_cpu(cpu, &mask) |
479 | 66.8k | cpumask_clear_cpu(cpu, prv->idlers); |
480 | 66.8k | cpumask_raise_softirq(&mask, SCHEDULE_SOFTIRQ); |
481 | 66.8k | } |
482 | 66.8k | else |
483 | 18.4E | SCHED_STAT_CRANK(tickled_no_cpu); |
484 | 66.8k | } |
485 | | |
486 | | static void |
487 | | csched_free_pdata(const struct scheduler *ops, void *pcpu, int cpu) |
488 | 0 | { |
489 | 0 | struct csched_private *prv = CSCHED_PRIV(ops); |
490 | 0 | |
491 | 0 | /* |
492 | 0 | * pcpu either points to a valid struct csched_pcpu, or is NULL, if we're |
493 | 0 | * being called from CPU_UP_CANCELLED, because bringing up a pCPU failed |
494 | 0 | * very early. xfree() does not really mind, but we want to be sure that, |
495 | 0 | * when we get here, either init_pdata has never been called, or |
496 | 0 | * deinit_pdata has been called already. |
497 | 0 | */ |
498 | 0 | ASSERT(!cpumask_test_cpu(cpu, prv->cpus)); |
499 | 0 | |
500 | 0 | xfree(pcpu); |
501 | 0 | } |
502 | | |
503 | | static void |
504 | | csched_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) |
505 | 0 | { |
506 | 0 | struct csched_private *prv = CSCHED_PRIV(ops); |
507 | 0 | struct csched_pcpu *spc = pcpu; |
508 | 0 | unsigned int node = cpu_to_node(cpu); |
509 | 0 | unsigned long flags; |
510 | 0 | |
511 | 0 | /* |
512 | 0 | * Scheduler specific data for this pCPU must still be there and be |
513 | 0 | * valid. In fact, if we are here: |
514 | 0 | * 1. alloc_pdata must have been called for this cpu, and free_pdata |
515 | 0 | * must not have been called on it before us, |
516 | 0 | * 2. init_pdata must have been called on this cpu, and deinit_pdata |
517 | 0 | * (us!) must not have been called on it already. |
518 | 0 | */ |
519 | 0 | ASSERT(spc && cpumask_test_cpu(cpu, prv->cpus)); |
520 | 0 | |
521 | 0 | spin_lock_irqsave(&prv->lock, flags); |
522 | 0 | |
523 | 0 | prv->credit -= prv->credits_per_tslice; |
524 | 0 | prv->ncpus--; |
525 | 0 | cpumask_clear_cpu(cpu, prv->idlers); |
526 | 0 | cpumask_clear_cpu(cpu, prv->cpus); |
527 | 0 | if ( (prv->master == cpu) && (prv->ncpus > 0) ) |
528 | 0 | { |
529 | 0 | prv->master = cpumask_first(prv->cpus); |
530 | 0 | migrate_timer(&prv->master_ticker, prv->master); |
531 | 0 | } |
532 | 0 | if ( prv->balance_bias[node] == cpu ) |
533 | 0 | { |
534 | 0 | cpumask_and(cpumask_scratch, prv->cpus, &node_to_cpumask(node)); |
535 | 0 | if ( !cpumask_empty(cpumask_scratch) ) |
536 | 0 | prv->balance_bias[node] = cpumask_first(cpumask_scratch); |
537 | 0 | } |
538 | 0 | kill_timer(&spc->ticker); |
539 | 0 | if ( prv->ncpus == 0 ) |
540 | 0 | kill_timer(&prv->master_ticker); |
541 | 0 | |
542 | 0 | spin_unlock_irqrestore(&prv->lock, flags); |
543 | 0 | } |
544 | | |
545 | | static void * |
546 | | csched_alloc_pdata(const struct scheduler *ops, int cpu) |
547 | 12 | { |
548 | 12 | struct csched_pcpu *spc; |
549 | 12 | |
550 | 12 | /* Allocate per-PCPU info */ |
551 | 12 | spc = xzalloc(struct csched_pcpu); |
552 | 12 | if ( spc == NULL ) |
553 | 0 | return ERR_PTR(-ENOMEM); |
554 | 12 | |
555 | 12 | return spc; |
556 | 12 | } |
557 | | |
558 | | static void |
559 | | init_pdata(struct csched_private *prv, struct csched_pcpu *spc, int cpu) |
560 | 12 | { |
561 | 12 | ASSERT(spin_is_locked(&prv->lock)); |
562 | 12 | /* cpu data needs to be allocated, but STILL uninitialized. */ |
563 | 12 | ASSERT(spc && spc->runq.next == NULL && spc->runq.prev == NULL); |
564 | 12 | |
565 | 12 | /* Initialize/update system-wide config */ |
566 | 12 | prv->credit += prv->credits_per_tslice; |
567 | 12 | prv->ncpus++; |
568 | 12 | cpumask_set_cpu(cpu, prv->cpus); |
569 | 12 | if ( prv->ncpus == 1 ) |
570 | 1 | { |
571 | 1 | prv->master = cpu; |
572 | 1 | init_timer(&prv->master_ticker, csched_acct, prv, cpu); |
573 | 1 | set_timer(&prv->master_ticker, |
574 | 1 | NOW() + MILLISECS(prv->tslice_ms)); |
575 | 1 | } |
576 | 12 | |
577 | 12 | cpumask_and(cpumask_scratch, prv->cpus, &node_to_cpumask(cpu_to_node(cpu))); |
578 | 12 | if ( cpumask_weight(cpumask_scratch) == 1 ) |
579 | 1 | prv->balance_bias[cpu_to_node(cpu)] = cpu; |
580 | 12 | |
581 | 12 | init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu); |
582 | 12 | set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) ); |
583 | 12 | |
584 | 12 | INIT_LIST_HEAD(&spc->runq); |
585 | 12 | spc->runq_sort_last = prv->runq_sort; |
586 | 12 | spc->idle_bias = nr_cpu_ids - 1; |
587 | 12 | |
588 | 12 | /* Start off idling... */ |
589 | 12 | BUG_ON(!is_idle_vcpu(curr_on_cpu(cpu))); |
590 | 12 | cpumask_set_cpu(cpu, prv->idlers); |
591 | 12 | spc->nr_runnable = 0; |
592 | 12 | } |
593 | | |
594 | | static void |
595 | | csched_init_pdata(const struct scheduler *ops, void *pdata, int cpu) |
596 | 12 | { |
597 | 12 | unsigned long flags; |
598 | 12 | struct csched_private *prv = CSCHED_PRIV(ops); |
599 | 12 | struct schedule_data *sd = &per_cpu(schedule_data, cpu); |
600 | 12 | |
601 | 12 | /* |
602 | 12 | * This is called either during boot, resume or hotplug, in |
603 | 12 | * case Credit1 is the scheduler chosen at boot. In such cases, the |
604 | 12 | * scheduler lock for cpu is already pointing to the default per-cpu |
605 | 12 | * spinlock, as Credit1 needs it, so there is no remapping to be done. |
606 | 12 | */ |
607 | 12 | ASSERT(sd->schedule_lock == &sd->_lock && !spin_is_locked(&sd->_lock)); |
608 | 12 | |
609 | 12 | spin_lock_irqsave(&prv->lock, flags); |
610 | 12 | init_pdata(prv, pdata, cpu); |
611 | 12 | spin_unlock_irqrestore(&prv->lock, flags); |
612 | 12 | } |
613 | | |
614 | | /* Change the scheduler of cpu to us (Credit). */ |
615 | | static void |
616 | | csched_switch_sched(struct scheduler *new_ops, unsigned int cpu, |
617 | | void *pdata, void *vdata) |
618 | 0 | { |
619 | 0 | struct schedule_data *sd = &per_cpu(schedule_data, cpu); |
620 | 0 | struct csched_private *prv = CSCHED_PRIV(new_ops); |
621 | 0 | struct csched_vcpu *svc = vdata; |
622 | 0 | |
623 | 0 | ASSERT(svc && is_idle_vcpu(svc->vcpu)); |
624 | 0 | |
625 | 0 | idle_vcpu[cpu]->sched_priv = vdata; |
626 | 0 | |
627 | 0 | /* |
628 | 0 | * We are holding the runqueue lock already (it's been taken in |
629 | 0 | * schedule_cpu_switch()). It actually may or may not be the 'right' |
630 | 0 | * one for this cpu, but that is ok for preventing races. |
631 | 0 | */ |
632 | 0 | ASSERT(!local_irq_is_enabled()); |
633 | 0 | spin_lock(&prv->lock); |
634 | 0 | init_pdata(prv, pdata, cpu); |
635 | 0 | spin_unlock(&prv->lock); |
636 | 0 | |
637 | 0 | per_cpu(scheduler, cpu) = new_ops; |
638 | 0 | per_cpu(schedule_data, cpu).sched_priv = pdata; |
639 | 0 | |
640 | 0 | /* |
641 | 0 | * (Re?)route the lock to the per pCPU lock as the /last/ thing. In |
642 | 0 | * fact, if it is free (and it can be), we want anyone who manages to |
643 | 0 | * take it to find all the initializations we've done above in place. |
644 | 0 | */ |
645 | 0 | smp_mb(); |
646 | 0 | sd->schedule_lock = &sd->_lock; |
647 | 0 | } |
648 | | |
649 | | #ifndef NDEBUG |
650 | | static inline void |
651 | | __csched_vcpu_check(struct vcpu *vc) |
652 | 9.51M | { |
653 | 9.51M | struct csched_vcpu * const svc = CSCHED_VCPU(vc); |
654 | 9.51M | struct csched_dom * const sdom = svc->sdom; |
655 | 9.51M | |
656 | 9.51M | BUG_ON( svc->vcpu != vc ); |
657 | 9.51M | BUG_ON( sdom != CSCHED_DOM(vc->domain) ); |
658 | 9.51M | if ( sdom ) |
659 | 9.40M | { |
660 | 9.40M | BUG_ON( is_idle_vcpu(vc) ); |
661 | 9.40M | BUG_ON( sdom->dom != vc->domain ); |
662 | 9.40M | } |
663 | 9.51M | else |
664 | 110k | { |
665 | 110k | BUG_ON( !is_idle_vcpu(vc) ); |
666 | 110k | } |
667 | 9.51M | |
668 | 9.51M | SCHED_STAT_CRANK(vcpu_check); |
669 | 9.51M | } |
670 | 9.68M | #define CSCHED_VCPU_CHECK(_vc) (__csched_vcpu_check(_vc)) |
671 | | #else |
672 | | #define CSCHED_VCPU_CHECK(_vc) |
673 | | #endif |
674 | | |
675 | | /* |
676 | | * Delay, in microseconds, between migrations of a VCPU between PCPUs. |
677 | | * This prevents rapid fluttering of a VCPU between CPUs, and reduces the |
678 | | * implicit overheads such as cache-warming. 1ms (1000) has been measured |
679 | | * as a good value. |
680 | | */ |
681 | | static unsigned int vcpu_migration_delay; |
682 | | integer_param("vcpu_migration_delay", vcpu_migration_delay); |
683 | | |
684 | | void set_vcpu_migration_delay(unsigned int delay) |
685 | 0 | { |
686 | 0 | vcpu_migration_delay = delay; |
687 | 0 | } |
688 | | |
689 | | unsigned int get_vcpu_migration_delay(void) |
690 | 0 | { |
691 | 0 | return vcpu_migration_delay; |
692 | 0 | } |
693 | | |
694 | | static inline int |
695 | | __csched_vcpu_is_cache_hot(struct vcpu *v) |
696 | 506 | { |
697 | 506 | int hot = ((NOW() - v->last_run_time) < |
698 | 506 | ((uint64_t)vcpu_migration_delay * 1000u)); |
699 | 506 | |
700 | 506 | if ( hot ) |
701 | 0 | SCHED_STAT_CRANK(vcpu_hot); |
702 | 506 | |
703 | 506 | return hot; |
704 | 506 | } |
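For example, if the vcpu_migration_delay boot parameter is set to the suggested
1000 (i.e. 1ms), the threshold becomes 1,000,000ns: a vcpu that last ran 0.4ms
ago is reported as cache-hot and left alone by the load balancer, while one
that has been off-cpu for 3ms is fair game. With the default of 0, no vcpu is
ever considered hot.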
705 | | |
706 | | static inline int |
707 | | __csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu, cpumask_t *mask) |
708 | 506 | { |
709 | 506 | /* |
710 | 506 | * Don't pick up work that's hot on peer PCPU, or that can't (or |
711 | 506 | * would prefer not to) run on cpu. |
712 | 506 | * |
713 | 506 | * The caller is supposed to have already checked that vc is also |
714 | 506 | * not running. |
715 | 506 | */ |
716 | 506 | ASSERT(!vc->is_running); |
717 | 506 | |
718 | 506 | return !__csched_vcpu_is_cache_hot(vc) && |
719 | 506 | cpumask_test_cpu(dest_cpu, mask); |
720 | 506 | } |
721 | | |
722 | | static int |
723 | | _csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc, bool_t commit) |
724 | 10.2k | { |
725 | 10.2k | cpumask_t cpus; |
726 | 10.2k | cpumask_t idlers; |
727 | 10.2k | cpumask_t *online; |
728 | 10.2k | struct csched_pcpu *spc = NULL; |
729 | 10.2k | int cpu = vc->processor; |
730 | 10.2k | int balance_step; |
731 | 10.2k | |
732 | 10.2k | /* Store in cpus the mask of online cpus on which the domain can run */ |
733 | 10.2k | online = cpupool_domain_cpumask(vc->domain); |
734 | 10.2k | cpumask_and(&cpus, vc->cpu_hard_affinity, online); |
735 | 10.2k | |
736 | 10.2k | for_each_affinity_balance_step( balance_step ) |
737 | 20.1k | { |
738 | 20.1k | /* |
739 | 20.1k | * We want to pick up a pcpu among the ones that are online and |
740 | 20.1k | * can accommodate vc, which is basically what we computed above |
741 | 20.1k | * and stored in cpus. As far as hard affinity is concerned, |
742 | 20.1k | * there always will be at least one of these pcpus, hence cpus |
743 | 20.1k | * is never empty and the calls to cpumask_cycle() and |
744 | 20.1k | * cpumask_test_cpu() below are ok. |
745 | 20.1k | * |
746 | 20.1k | * On the other hand, when considering soft affinity too, it |
747 | 20.1k | * is possible for the mask to become empty (for instance, if the |
748 | 20.1k | * domain has been put in a cpupool that does not contain any of the |
749 | 20.1k | * pcpus in its soft affinity), which would result in the ASSERT()-s |
750 | 20.1k | * inside cpumask_*() operations triggering (in debug builds). |
751 | 20.1k | * |
752 | 20.1k | * Therefore, in this case, we filter the soft affinity mask against |
753 | 20.1k | * cpus and, if the result is empty, we just skip the soft affinity |
754 | 20.1k | * balancing step all together. |
755 | 20.1k | */ |
756 | 20.1k | if ( balance_step == BALANCE_SOFT_AFFINITY |
757 | 10.2k | && !has_soft_affinity(vc, &cpus) ) |
758 | 10.1k | continue; |
759 | 20.1k | |
760 | 20.1k | /* Pick an online CPU from the proper affinity mask */ |
761 | 10.0k | affinity_balance_cpumask(vc, balance_step, &cpus); |
762 | 10.0k | cpumask_and(&cpus, &cpus, online); |
763 | 10.0k | |
764 | 10.0k | /* If present, prefer vc's current processor */ |
765 | 10.0k | cpu = cpumask_test_cpu(vc->processor, &cpus) |
766 | 10.3k | ? vc->processor |
767 | 18.4E | : cpumask_cycle(vc->processor, &cpus); |
768 | 10.0k | ASSERT(cpumask_test_cpu(cpu, &cpus)); |
769 | 10.0k | |
770 | 10.0k | /* |
771 | 10.0k | * Try to find an idle processor within the above constraints. |
772 | 10.0k | * |
773 | 10.0k | * In multi-core and multi-threaded CPUs, not all idle execution |
774 | 10.0k | * vehicles are equal! |
775 | 10.0k | * |
776 | 10.0k | * We give preference to the idle execution vehicle with the most |
777 | 10.0k | * idling neighbours in its grouping. This distributes work across |
778 | 10.0k | * distinct cores first and guarantees we don't do something stupid |
779 | 10.0k | * like run two VCPUs on co-hyperthreads while there are idle cores |
780 | 10.0k | * or sockets. |
781 | 10.0k | * |
782 | 10.0k | * Notice that, when computing the "idleness" of cpu, we may want to |
783 | 10.0k | * discount vc. That is, iff vc is the currently running and the only |
784 | 10.0k | * runnable vcpu on cpu, we add cpu to the idlers. |
785 | 10.0k | */ |
786 | 10.0k | cpumask_and(&idlers, &cpu_online_map, CSCHED_PRIV(ops)->idlers); |
787 | 10.4k | if ( vc->processor == cpu && is_runq_idle(cpu) ) |
788 | 9.52k | __cpumask_set_cpu(cpu, &idlers); |
789 | 10.0k | cpumask_and(&cpus, &cpus, &idlers); |
790 | 10.0k | |
791 | 10.0k | /* |
792 | 10.0k | * It is important that cpu points to an idle processor, if a suitable |
793 | 10.0k | * one exists (and we can use cpus to check and, possibly, choose a new |
794 | 10.0k | * CPU, as we just &&-ed it with idlers). In fact, if we are on SMT, and |
795 | 10.0k | * cpu points to a busy thread with an idle sibling, both the threads |
796 | 10.0k | * will be considered the same, from the "idleness" calculation point |
797 | 10.0k | * of view, preventing vcpu from being moved to the thread that is |
798 | 10.0k | * actually idle. |
799 | 10.0k | * |
800 | 10.0k | * Notice that cpumask_test_cpu() is quicker than cpumask_empty(), so |
801 | 10.0k | * we check for it first. |
802 | 10.0k | */ |
803 | 10.0k | if ( !cpumask_test_cpu(cpu, &cpus) && !cpumask_empty(&cpus) ) |
804 | 31 | cpu = cpumask_cycle(cpu, &cpus); |
805 | 10.0k | __cpumask_clear_cpu(cpu, &cpus); |
806 | 10.0k | |
807 | 30.7k | while ( !cpumask_empty(&cpus) ) |
808 | 20.7k | { |
809 | 20.7k | cpumask_t cpu_idlers; |
810 | 20.7k | cpumask_t nxt_idlers; |
811 | 20.7k | int nxt, weight_cpu, weight_nxt; |
812 | 20.7k | int migrate_factor; |
813 | 20.7k | |
814 | 20.7k | nxt = cpumask_cycle(cpu, &cpus); |
815 | 20.7k | |
816 | 20.7k | if ( cpumask_test_cpu(cpu, per_cpu(cpu_core_mask, nxt)) ) |
817 | 20.7k | { |
818 | 20.7k | /* We're on the same socket, so check the busy-ness of threads. |
819 | 20.7k | * Migrate if the number of idlers here is lower at all. */ |
820 | 20.7k | ASSERT( cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); |
821 | 20.7k | migrate_factor = 1; |
822 | 20.7k | cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_sibling_mask, |
823 | 20.7k | cpu)); |
824 | 20.7k | cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_sibling_mask, |
825 | 20.7k | nxt)); |
826 | 20.7k | } |
827 | 20.7k | else |
828 | 18.4E | { |
829 | 18.4E | /* We're on different sockets, so check the busy-ness of cores. |
830 | 18.4E | * Migrate only if the other core is twice as idle */ |
831 | 18.4E | ASSERT( !cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); |
832 | 18.4E | migrate_factor = 2; |
833 | 18.4E | cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_core_mask, cpu)); |
834 | 18.4E | cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_core_mask, nxt)); |
835 | 18.4E | } |
836 | 20.7k | |
837 | 20.7k | weight_cpu = cpumask_weight(&cpu_idlers); |
838 | 20.7k | weight_nxt = cpumask_weight(&nxt_idlers); |
839 | 20.7k | /* smt_power_savings: consolidate work rather than spreading it */ |
840 | 20.7k | if ( sched_smt_power_savings ? |
841 | 0 | weight_cpu > weight_nxt : |
842 | 20.7k | weight_cpu * migrate_factor < weight_nxt ) |
843 | 64 | { |
844 | 64 | cpumask_and(&nxt_idlers, &cpus, &nxt_idlers); |
845 | 64 | spc = CSCHED_PCPU(nxt); |
846 | 64 | cpu = cpumask_cycle(spc->idle_bias, &nxt_idlers); |
847 | 64 | cpumask_andnot(&cpus, &cpus, per_cpu(cpu_sibling_mask, cpu)); |
848 | 64 | } |
849 | 20.7k | else |
850 | 20.6k | { |
851 | 20.6k | cpumask_andnot(&cpus, &cpus, &nxt_idlers); |
852 | 20.6k | } |
853 | 20.7k | } |
854 | 10.0k | |
855 | 10.0k | /* Stop if cpu is idle */ |
856 | 10.0k | if ( cpumask_test_cpu(cpu, &idlers) ) |
857 | 9.78k | break; |
858 | 10.0k | } |
859 | 10.2k | |
860 | 10.2k | if ( commit && spc ) |
861 | 30 | spc->idle_bias = cpu; |
862 | 10.2k | |
863 | 10.2k | TRACE_3D(TRC_CSCHED_PICKED_CPU, vc->domain->domain_id, vc->vcpu_id, cpu); |
864 | 10.2k | |
865 | 10.2k | return cpu; |
866 | 10.2k | } |
867 | | |
868 | | static int |
869 | | csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc) |
870 | 78 | { |
871 | 78 | struct csched_vcpu *svc = CSCHED_VCPU(vc); |
872 | 78 | |
873 | 78 | /* |
874 | 78 | * We have been called by vcpu_migrate() (in schedule.c), as part |
875 | 78 | * of the process of seeing if vc can be migrated to another pcpu. |
876 | 78 | * We make a note about this in svc->flags so that later, in |
877 | 78 | * csched_vcpu_wake() (still called from vcpu_migrate()) we won't |
878 | 78 | * get boosted, which we don't deserve as we are "only" migrating. |
879 | 78 | */ |
880 | 78 | set_bit(CSCHED_FLAG_VCPU_MIGRATING, &svc->flags); |
881 | 78 | return _csched_cpu_pick(ops, vc, 1); |
882 | 78 | } |
883 | | |
884 | | static inline void |
885 | | __csched_vcpu_acct_start(struct csched_private *prv, struct csched_vcpu *svc) |
886 | 1.08k | { |
887 | 1.08k | struct csched_dom * const sdom = svc->sdom; |
888 | 1.08k | unsigned long flags; |
889 | 1.08k | |
890 | 1.08k | spin_lock_irqsave(&prv->lock, flags); |
891 | 1.08k | |
892 | 1.08k | if ( list_empty(&svc->active_vcpu_elem) ) |
893 | 1.08k | { |
894 | 1.08k | SCHED_VCPU_STAT_CRANK(svc, state_active); |
895 | 1.08k | SCHED_STAT_CRANK(acct_vcpu_active); |
896 | 1.08k | |
897 | 1.08k | sdom->active_vcpu_count++; |
898 | 1.08k | list_add(&svc->active_vcpu_elem, &sdom->active_vcpu); |
899 | 1.08k | /* Make weight per-vcpu */ |
900 | 1.08k | prv->weight += sdom->weight; |
901 | 1.08k | if ( list_empty(&sdom->active_sdom_elem) ) |
902 | 171 | { |
903 | 171 | list_add(&sdom->active_sdom_elem, &prv->active_sdom); |
904 | 171 | } |
905 | 1.08k | } |
906 | 1.08k | |
907 | 1.08k | TRACE_3D(TRC_CSCHED_ACCOUNT_START, sdom->dom->domain_id, |
908 | 1.08k | svc->vcpu->vcpu_id, sdom->active_vcpu_count); |
909 | 1.08k | |
910 | 1.08k | spin_unlock_irqrestore(&prv->lock, flags); |
911 | 1.08k | } |
912 | | |
913 | | static inline void |
914 | | __csched_vcpu_acct_stop_locked(struct csched_private *prv, |
915 | | struct csched_vcpu *svc) |
916 | 1.08k | { |
917 | 1.08k | struct csched_dom * const sdom = svc->sdom; |
918 | 1.08k | |
919 | 1.08k | BUG_ON( list_empty(&svc->active_vcpu_elem) ); |
920 | 1.08k | |
921 | 1.08k | SCHED_VCPU_STAT_CRANK(svc, state_idle); |
922 | 1.08k | SCHED_STAT_CRANK(acct_vcpu_idle); |
923 | 1.08k | |
924 | 1.08k | BUG_ON( prv->weight < sdom->weight ); |
925 | 1.08k | sdom->active_vcpu_count--; |
926 | 1.08k | list_del_init(&svc->active_vcpu_elem); |
927 | 1.08k | prv->weight -= sdom->weight; |
928 | 1.08k | if ( list_empty(&sdom->active_vcpu) ) |
929 | 170 | { |
930 | 170 | list_del_init(&sdom->active_sdom_elem); |
931 | 170 | } |
932 | 1.08k | |
933 | 1.08k | TRACE_3D(TRC_CSCHED_ACCOUNT_STOP, sdom->dom->domain_id, |
934 | 1.08k | svc->vcpu->vcpu_id, sdom->active_vcpu_count); |
935 | 1.08k | } |
936 | | |
937 | | static void |
938 | | csched_vcpu_acct(struct csched_private *prv, unsigned int cpu) |
939 | 10.3k | { |
940 | 10.3k | struct csched_vcpu * const svc = CSCHED_VCPU(current); |
941 | 10.3k | const struct scheduler *ops = per_cpu(scheduler, cpu); |
942 | 10.3k | |
943 | 10.3k | ASSERT( current->processor == cpu ); |
944 | 10.3k | ASSERT( svc->sdom != NULL ); |
945 | 10.3k | ASSERT( !is_idle_vcpu(svc->vcpu) ); |
946 | 10.3k | |
947 | 10.3k | /* |
948 | 10.3k | * If this VCPU's priority was boosted when it last awoke, reset it. |
949 | 10.3k | * If the VCPU is found here, then it's consuming a non-negligible |
950 | 10.3k | * amount of CPU resources and should no longer be boosted. |
951 | 10.3k | */ |
952 | 10.3k | if ( svc->pri == CSCHED_PRI_TS_BOOST ) |
953 | 1.50k | { |
954 | 1.50k | svc->pri = CSCHED_PRI_TS_UNDER; |
955 | 1.50k | TRACE_2D(TRC_CSCHED_BOOST_END, svc->sdom->dom->domain_id, |
956 | 1.50k | svc->vcpu->vcpu_id); |
957 | 1.50k | } |
958 | 10.3k | |
959 | 10.3k | /* |
960 | 10.3k | * Update credits |
961 | 10.3k | */ |
962 | 10.3k | burn_credits(svc, NOW()); |
963 | 10.3k | |
964 | 10.3k | /* |
965 | 10.3k | * Put this VCPU and domain back on the active list if it was |
966 | 10.3k | * idling. |
967 | 10.3k | */ |
968 | 10.3k | if ( list_empty(&svc->active_vcpu_elem) ) |
969 | 1.08k | { |
970 | 1.08k | __csched_vcpu_acct_start(prv, svc); |
971 | 1.08k | } |
972 | 10.3k | else |
973 | 9.26k | { |
974 | 9.26k | unsigned int new_cpu; |
975 | 9.26k | unsigned long flags; |
976 | 9.26k | spinlock_t *lock = vcpu_schedule_lock_irqsave(current, &flags); |
977 | 9.26k | |
978 | 9.26k | /* |
979 | 9.26k | * If it's been active a while, check if we'd be better off |
980 | 9.26k | * migrating it to run elsewhere (see multi-core and multi-thread |
981 | 9.26k | * support in csched_cpu_pick()). |
982 | 9.26k | */ |
983 | 9.26k | new_cpu = _csched_cpu_pick(ops, current, 0); |
984 | 9.26k | |
985 | 9.26k | vcpu_schedule_unlock_irqrestore(lock, flags, current); |
986 | 9.26k | |
987 | 9.26k | if ( new_cpu != cpu ) |
988 | 64 | { |
989 | 64 | SCHED_VCPU_STAT_CRANK(svc, migrate_r); |
990 | 64 | SCHED_STAT_CRANK(migrate_running); |
991 | 64 | set_bit(_VPF_migrating, ¤t->pause_flags); |
992 | 64 | /* |
993 | 64 | * As we are about to tickle cpu, we should clear its bit in |
994 | 64 | * idlers. But, if we are here, it means there is someone running |
995 | 64 | * on it, and hence the bit must be zero already. |
996 | 64 | */ |
997 | 64 | ASSERT(!cpumask_test_cpu(cpu, |
998 | 64 | CSCHED_PRIV(per_cpu(scheduler, cpu))->idlers)); |
999 | 64 | cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); |
1000 | 64 | } |
1001 | 9.26k | } |
1002 | 10.3k | } |
1003 | | |
1004 | | static void * |
1005 | | csched_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd) |
1006 | 24 | { |
1007 | 24 | struct csched_vcpu *svc; |
1008 | 24 | |
1009 | 24 | /* Allocate per-VCPU info */ |
1010 | 24 | svc = xzalloc(struct csched_vcpu); |
1011 | 24 | if ( svc == NULL ) |
1012 | 0 | return NULL; |
1013 | 24 | |
1014 | 24 | INIT_LIST_HEAD(&svc->runq_elem); |
1015 | 24 | INIT_LIST_HEAD(&svc->active_vcpu_elem); |
1016 | 24 | svc->sdom = dd; |
1017 | 24 | svc->vcpu = vc; |
1018 | 24 | svc->pri = is_idle_domain(vc->domain) ? |
1019 | 12 | CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER; |
1020 | 24 | SCHED_VCPU_STATS_RESET(svc); |
1021 | 24 | SCHED_STAT_CRANK(vcpu_alloc); |
1022 | 24 | return svc; |
1023 | 24 | } |
1024 | | |
1025 | | static void |
1026 | | csched_vcpu_insert(const struct scheduler *ops, struct vcpu *vc) |
1027 | 12 | { |
1028 | 12 | struct csched_vcpu *svc = vc->sched_priv; |
1029 | 12 | spinlock_t *lock; |
1030 | 12 | |
1031 | 12 | BUG_ON( is_idle_vcpu(vc) ); |
1032 | 12 | |
1033 | 12 | /* csched_cpu_pick() looks in vc->processor's runq, so we need the lock. */ |
1034 | 12 | lock = vcpu_schedule_lock_irq(vc); |
1035 | 12 | |
1036 | 12 | vc->processor = csched_cpu_pick(ops, vc); |
1037 | 12 | |
1038 | 12 | spin_unlock_irq(lock); |
1039 | 12 | |
1040 | 12 | lock = vcpu_schedule_lock_irq(vc); |
1041 | 12 | |
1042 | 12 | if ( !__vcpu_on_runq(svc) && vcpu_runnable(vc) && !vc->is_running ) |
1043 | 0 | runq_insert(svc); |
1044 | 12 | |
1045 | 12 | vcpu_schedule_unlock_irq(lock, vc); |
1046 | 12 | |
1047 | 12 | SCHED_STAT_CRANK(vcpu_insert); |
1048 | 12 | } |
1049 | | |
1050 | | static void |
1051 | | csched_free_vdata(const struct scheduler *ops, void *priv) |
1052 | 0 | { |
1053 | 0 | struct csched_vcpu *svc = priv; |
1054 | 0 | |
1055 | 0 | BUG_ON( !list_empty(&svc->runq_elem) ); |
1056 | 0 | |
1057 | 0 | xfree(svc); |
1058 | 0 | } |
1059 | | |
1060 | | static void |
1061 | | csched_vcpu_remove(const struct scheduler *ops, struct vcpu *vc) |
1062 | 0 | { |
1063 | 0 | struct csched_private *prv = CSCHED_PRIV(ops); |
1064 | 0 | struct csched_vcpu * const svc = CSCHED_VCPU(vc); |
1065 | 0 | struct csched_dom * const sdom = svc->sdom; |
1066 | 0 | |
1067 | 0 | SCHED_STAT_CRANK(vcpu_remove); |
1068 | 0 | |
1069 | 0 | ASSERT(!__vcpu_on_runq(svc)); |
1070 | 0 | |
1071 | 0 | if ( test_and_clear_bit(CSCHED_FLAG_VCPU_PARKED, &svc->flags) ) |
1072 | 0 | { |
1073 | 0 | SCHED_STAT_CRANK(vcpu_unpark); |
1074 | 0 | vcpu_unpause(svc->vcpu); |
1075 | 0 | } |
1076 | 0 | |
1077 | 0 | spin_lock_irq(&prv->lock); |
1078 | 0 | |
1079 | 0 | if ( !list_empty(&svc->active_vcpu_elem) ) |
1080 | 0 | __csched_vcpu_acct_stop_locked(prv, svc); |
1081 | 0 | |
1082 | 0 | spin_unlock_irq(&prv->lock); |
1083 | 0 | |
1084 | 0 | BUG_ON( sdom == NULL ); |
1085 | 0 | } |
1086 | | |
1087 | | static void |
1088 | | csched_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc) |
1089 | 360 | { |
1090 | 360 | struct csched_vcpu * const svc = CSCHED_VCPU(vc); |
1091 | 360 | unsigned int cpu = vc->processor; |
1092 | 360 | |
1093 | 360 | SCHED_STAT_CRANK(vcpu_sleep); |
1094 | 360 | |
1095 | 360 | BUG_ON( is_idle_vcpu(vc) ); |
1096 | 360 | |
1097 | 360 | if ( curr_on_cpu(cpu) == vc ) |
1098 | 54 | { |
1099 | 54 | /* |
1100 | 54 | * We are about to tickle cpu, so we should clear its bit in idlers. |
1101 | 54 | * But, we are here because vc is going to sleep while running on cpu, |
1102 | 54 | * so the bit must be zero already. |
1103 | 54 | */ |
1104 | 54 | ASSERT(!cpumask_test_cpu(cpu, CSCHED_PRIV(per_cpu(scheduler, cpu))->idlers)); |
1105 | 54 | cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); |
1106 | 54 | } |
1107 | 306 | else if ( __vcpu_on_runq(svc) ) |
1108 | 1 | runq_remove(svc); |
1109 | 360 | } |
1110 | | |
1111 | | static void |
1112 | | csched_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) |
1113 | 67.1k | { |
1114 | 67.1k | struct csched_vcpu * const svc = CSCHED_VCPU(vc); |
1115 | 67.1k | bool_t migrating; |
1116 | 67.1k | |
1117 | 67.1k | BUG_ON( is_idle_vcpu(vc) ); |
1118 | 67.1k | |
1119 | 67.1k | if ( unlikely(curr_on_cpu(vc->processor) == vc) ) |
1120 | 301 | { |
1121 | 301 | SCHED_STAT_CRANK(vcpu_wake_running); |
1122 | 301 | return; |
1123 | 301 | } |
1124 | 66.8k | if ( unlikely(__vcpu_on_runq(svc)) ) |
1125 | 0 | { |
1126 | 0 | SCHED_STAT_CRANK(vcpu_wake_onrunq); |
1127 | 0 | return; |
1128 | 0 | } |
1129 | 66.8k | |
1130 | 66.8k | if ( likely(vcpu_runnable(vc)) ) |
1131 | 66.8k | SCHED_STAT_CRANK(vcpu_wake_runnable); |
1132 | 66.8k | else |
1133 | 18.4E | SCHED_STAT_CRANK(vcpu_wake_not_runnable); |
1134 | 66.8k | |
1135 | 66.8k | /* |
1136 | 66.8k | * We temporarily boost the priority of waking VCPUs! |
1137 | 66.8k | * |
1138 | 66.8k | * If this VCPU consumes a non-negligible amount of CPU, it |
1139 | 66.8k | * will eventually find itself in the credit accounting code |
1140 | 66.8k | * path where its priority will be reset to normal. |
1141 | 66.8k | * |
1142 | 66.8k | * If on the other hand the VCPU consumes little CPU and is |
1143 | 66.8k | * blocking and awoken a lot (doing I/O for example), its |
1144 | 66.8k | * priority will remain boosted, optimizing its wake-to-run |
1145 | 66.8k | * latencies. |
1146 | 66.8k | * |
1147 | 66.8k | * This allows wake-to-run latency sensitive VCPUs to preempt |
1148 | 66.8k | * more CPU resource intensive VCPUs without impacting overall |
1149 | 66.8k | * system fairness. |
1150 | 66.8k | * |
1151 | 66.8k | * There are two cases, when we don't want to boost: |
1152 | 66.8k | * - VCPUs that are waking up after a migration, rather than |
1153 | 66.8k | * after having blocked; |
1154 | 66.8k | * - VCPUs of capped domains unpausing after earning credits |
1155 | 66.8k | * they had overspent. |
1156 | 66.8k | */ |
1157 | 66.8k | migrating = test_and_clear_bit(CSCHED_FLAG_VCPU_MIGRATING, &svc->flags); |
1158 | 66.8k | |
1159 | 66.8k | if ( !migrating && svc->pri == CSCHED_PRI_TS_UNDER && |
1160 | 2.57k | !test_bit(CSCHED_FLAG_VCPU_PARKED, &svc->flags) ) |
1161 | 2.57k | { |
1162 | 2.57k | TRACE_2D(TRC_CSCHED_BOOST_START, vc->domain->domain_id, vc->vcpu_id); |
1163 | 2.57k | SCHED_STAT_CRANK(vcpu_boost); |
1164 | 2.57k | svc->pri = CSCHED_PRI_TS_BOOST; |
1165 | 2.57k | } |
1166 | 66.8k | |
1167 | 66.8k | /* Put the VCPU on the runq and tickle CPUs */ |
1168 | 66.8k | runq_insert(svc); |
1169 | 66.8k | __runq_tickle(svc); |
1170 | 66.8k | } |
1171 | | |
1172 | | static void |
1173 | | csched_vcpu_yield(const struct scheduler *ops, struct vcpu *vc) |
1174 | 4.69M | { |
1175 | 4.69M | struct csched_vcpu * const svc = CSCHED_VCPU(vc); |
1176 | 4.69M | |
1177 | 4.69M | /* Let the scheduler know that this vcpu is trying to yield */ |
1178 | 4.69M | set_bit(CSCHED_FLAG_VCPU_YIELD, &svc->flags); |
1179 | 4.69M | } |
1180 | | |
1181 | | static int |
1182 | | csched_dom_cntl( |
1183 | | const struct scheduler *ops, |
1184 | | struct domain *d, |
1185 | | struct xen_domctl_scheduler_op *op) |
1186 | 0 | { |
1187 | 0 | struct csched_dom * const sdom = CSCHED_DOM(d); |
1188 | 0 | struct csched_private *prv = CSCHED_PRIV(ops); |
1189 | 0 | unsigned long flags; |
1190 | 0 | int rc = 0; |
1191 | 0 | |
1192 | 0 | /* Protect both get and put branches with the pluggable scheduler |
1193 | 0 | * lock. Runq lock not needed anywhere in here. */ |
1194 | 0 | spin_lock_irqsave(&prv->lock, flags); |
1195 | 0 | |
1196 | 0 | switch ( op->cmd ) |
1197 | 0 | { |
1198 | 0 | case XEN_DOMCTL_SCHEDOP_getinfo: |
1199 | 0 | op->u.credit.weight = sdom->weight; |
1200 | 0 | op->u.credit.cap = sdom->cap; |
1201 | 0 | break; |
1202 | 0 | case XEN_DOMCTL_SCHEDOP_putinfo: |
1203 | 0 | if ( op->u.credit.weight != 0 ) |
1204 | 0 | { |
1205 | 0 | if ( !list_empty(&sdom->active_sdom_elem) ) |
1206 | 0 | { |
1207 | 0 | prv->weight -= sdom->weight * sdom->active_vcpu_count; |
1208 | 0 | prv->weight += op->u.credit.weight * sdom->active_vcpu_count; |
1209 | 0 | } |
1210 | 0 | sdom->weight = op->u.credit.weight; |
1211 | 0 | } |
1212 | 0 | |
1213 | 0 | if ( op->u.credit.cap != (uint16_t)~0U ) |
1214 | 0 | sdom->cap = op->u.credit.cap; |
1215 | 0 | break; |
1216 | 0 | default: |
1217 | 0 | rc = -EINVAL; |
1218 | 0 | break; |
1219 | 0 | } |
1220 | 0 | |
1221 | 0 | spin_unlock_irqrestore(&prv->lock, flags); |
1222 | 0 | |
1223 | 0 | return rc; |
1224 | 0 | } |
1225 | | |
1226 | | static inline void |
1227 | | __csched_set_tslice(struct csched_private *prv, unsigned timeslice) |
1228 | 1 | { |
1229 | 1 | prv->tslice_ms = timeslice; |
1230 | 1 | prv->ticks_per_tslice = CSCHED_TICKS_PER_TSLICE; |
1231 | 1 | if ( prv->tslice_ms < prv->ticks_per_tslice ) |
1232 | 0 | prv->ticks_per_tslice = 1; |
1233 | 1 | prv->tick_period_us = prv->tslice_ms * 1000 / prv->ticks_per_tslice; |
1234 | 1 | prv->credits_per_tslice = CSCHED_CREDITS_PER_MSEC * prv->tslice_ms; |
1235 | 1 | prv->credit = prv->credits_per_tslice * prv->ncpus; |
1236 | 1 | } |
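With the defaults shown above (CSCHED_DEFAULT_TSLICE_MS == 30,
CSCHED_TICKS_PER_TSLICE == 3, CSCHED_CREDITS_PER_MSEC == 10) this works out to
ticks_per_tslice = 3, tick_period_us = 30 * 1000 / 3 = 10000 (a 10ms tick),
credits_per_tslice = 10 * 30 = 300, and a system-wide pool of 300 * ncpus
credits per accounting period.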
1237 | | |
1238 | | static int |
1239 | | csched_sys_cntl(const struct scheduler *ops, |
1240 | | struct xen_sysctl_scheduler_op *sc) |
1241 | 0 | { |
1242 | 0 | int rc = -EINVAL; |
1243 | 0 | struct xen_sysctl_credit_schedule *params = &sc->u.sched_credit; |
1244 | 0 | struct csched_private *prv = CSCHED_PRIV(ops); |
1245 | 0 | unsigned long flags; |
1246 | 0 |
|
1247 | 0 | switch ( sc->cmd ) |
1248 | 0 | { |
1249 | 0 | case XEN_SYSCTL_SCHEDOP_putinfo: |
1250 | 0 | if ( params->tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX |
1251 | 0 | || params->tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN |
1252 | 0 | || (params->ratelimit_us |
1253 | 0 | && (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX |
1254 | 0 | || params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN)) |
1255 | 0 | || MICROSECS(params->ratelimit_us) > MILLISECS(params->tslice_ms) ) |
1256 | 0 | goto out; |
1257 | 0 | |
1258 | 0 | spin_lock_irqsave(&prv->lock, flags); |
1259 | 0 | __csched_set_tslice(prv, params->tslice_ms); |
1260 | 0 | if ( !prv->ratelimit_us && params->ratelimit_us ) |
1261 | 0 | printk(XENLOG_INFO "Enabling context switch rate limiting\n"); |
1262 | 0 | else if ( prv->ratelimit_us && !params->ratelimit_us ) |
1263 | 0 | printk(XENLOG_INFO "Disabling context switch rate limiting\n"); |
1264 | 0 | prv->ratelimit_us = params->ratelimit_us; |
1265 | 0 | spin_unlock_irqrestore(&prv->lock, flags); |
1266 | 0 | |
1267 | 0 | /* FALLTHRU */ |
1268 | 0 | case XEN_SYSCTL_SCHEDOP_getinfo: |
1269 | 0 | params->tslice_ms = prv->tslice_ms; |
1270 | 0 | params->ratelimit_us = prv->ratelimit_us; |
1271 | 0 | rc = 0; |
1272 | 0 | break; |
1273 | 0 | } |
1274 | 0 | out: |
1275 | 0 | return rc; |
1276 | 0 | } |
1277 | | |
1278 | | static void * |
1279 | | csched_alloc_domdata(const struct scheduler *ops, struct domain *dom) |
1280 | 1 | { |
1281 | 1 | struct csched_dom *sdom; |
1282 | 1 | |
1283 | 1 | sdom = xzalloc(struct csched_dom); |
1284 | 1 | if ( sdom == NULL ) |
1285 | 0 | return NULL; |
1286 | 1 | |
1287 | 1 | /* Initialize credit and weight */ |
1288 | 1 | INIT_LIST_HEAD(&sdom->active_vcpu); |
1289 | 1 | INIT_LIST_HEAD(&sdom->active_sdom_elem); |
1290 | 1 | sdom->dom = dom; |
1291 | 1 | sdom->weight = CSCHED_DEFAULT_WEIGHT; |
1292 | 1 | |
1293 | 1 | return (void *)sdom; |
1294 | 1 | } |
1295 | | |
1296 | | static int |
1297 | | csched_dom_init(const struct scheduler *ops, struct domain *dom) |
1298 | 2 | { |
1299 | 2 | struct csched_dom *sdom; |
1300 | 2 | |
1301 | 2 | if ( is_idle_domain(dom) ) |
1302 | 1 | return 0; |
1303 | 2 | |
1304 | 1 | sdom = csched_alloc_domdata(ops, dom); |
1305 | 1 | if ( sdom == NULL ) |
1306 | 0 | return -ENOMEM; |
1307 | 1 | |
1308 | 1 | dom->sched_priv = sdom; |
1309 | 1 | |
1310 | 1 | return 0; |
1311 | 1 | } |
1312 | | |
1313 | | static void |
1314 | | csched_free_domdata(const struct scheduler *ops, void *data) |
1315 | 0 | { |
1316 | 0 | xfree(data); |
1317 | 0 | } |
1318 | | |
1319 | | static void |
1320 | | csched_dom_destroy(const struct scheduler *ops, struct domain *dom) |
1321 | 0 | { |
1322 | 0 | csched_free_domdata(ops, CSCHED_DOM(dom)); |
1323 | 0 | } |
1324 | | |
1325 | | /* |
1326 | | * This is an O(n) optimized sort of the runq. |
1327 | | * |
1328 | | * Time-share VCPUs can only be one of two priorities, UNDER or OVER. We walk |
1329 | | * through the runq and move up any UNDERs that are preceded by OVERS. We |
1330 | | * remember the last UNDER to make the move up operation O(1). |
1331 | | */ |
1332 | | static void |
1333 | | csched_runq_sort(struct csched_private *prv, unsigned int cpu) |
1334 | 12.0k | { |
1335 | 12.0k | struct csched_pcpu * const spc = CSCHED_PCPU(cpu); |
1336 | 12.0k | struct list_head *runq, *elem, *next, *last_under; |
1337 | 12.0k | struct csched_vcpu *svc_elem; |
1338 | 12.0k | spinlock_t *lock; |
1339 | 12.0k | unsigned long flags; |
1340 | 12.0k | int sort_epoch; |
1341 | 12.0k | |
1342 | 12.0k | sort_epoch = prv->runq_sort; |
1343 | 12.0k | if ( sort_epoch == spc->runq_sort_last ) |
1344 | 7.14k | return; |
1345 | 12.0k | |
1346 | 4.92k | spc->runq_sort_last = sort_epoch; |
1347 | 4.92k | |
1348 | 4.92k | lock = pcpu_schedule_lock_irqsave(cpu, &flags); |
1349 | 4.92k | |
1350 | 4.92k | runq = &spc->runq; |
1351 | 4.92k | elem = runq->next; |
1352 | 4.92k | last_under = runq; |
1353 | 4.92k | |
1354 | 9.30k | while ( elem != runq ) |
1355 | 4.37k | { |
1356 | 4.37k | next = elem->next; |
1357 | 4.37k | svc_elem = __runq_elem(elem); |
1358 | 4.37k | |
1359 | 4.37k | if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER ) |
1360 | 25 | { |
1361 | 25 | /* does elem need to move up the runq? */ |
1362 | 25 | if ( elem->prev != last_under ) |
1363 | 0 | { |
1364 | 0 | list_del(elem); |
1365 | 0 | list_add(elem, last_under); |
1366 | 0 | } |
1367 | 25 | last_under = elem; |
1368 | 25 | } |
1369 | 4.37k | |
1370 | 4.37k | elem = next; |
1371 | 4.37k | } |
1372 | 4.92k | |
1373 | 4.92k | pcpu_schedule_unlock_irqrestore(lock, flags, cpu); |
1374 | 4.92k | } |
1375 | | |
1376 | | static void |
1377 | | csched_acct(void* dummy) |
1378 | 2.89k | { |
1379 | 2.89k | struct csched_private *prv = dummy; |
1380 | 2.89k | unsigned long flags; |
1381 | 2.89k | struct list_head *iter_vcpu, *next_vcpu; |
1382 | 2.89k | struct list_head *iter_sdom, *next_sdom; |
1383 | 2.89k | struct csched_vcpu *svc; |
1384 | 2.89k | struct csched_dom *sdom; |
1385 | 2.89k | uint32_t credit_total; |
1386 | 2.89k | uint32_t weight_total; |
1387 | 2.89k | uint32_t weight_left; |
1388 | 2.89k | uint32_t credit_fair; |
1389 | 2.89k | uint32_t credit_peak; |
1390 | 2.89k | uint32_t credit_cap; |
1391 | 2.89k | int credit_balance; |
1392 | 2.89k | int credit_xtra; |
1393 | 2.89k | int credit; |
1394 | 2.89k | |
1395 | 2.89k | |
1396 | 2.89k | spin_lock_irqsave(&prv->lock, flags); |
1397 | 2.89k | |
1398 | 2.89k | weight_total = prv->weight; |
1399 | 2.89k | credit_total = prv->credit; |
1400 | 2.89k | |
1401 | 2.89k | /* Converge balance towards 0 when it drops negative */ |
1402 | 2.89k | if ( prv->credit_balance < 0 ) |
1403 | 59 | { |
1404 | 59 | credit_total -= prv->credit_balance; |
1405 | 59 | SCHED_STAT_CRANK(acct_balance); |
1406 | 59 | } |
1407 | 2.89k | |
1408 | 2.89k | if ( unlikely(weight_total == 0) ) |
1409 | 1.54k | { |
1410 | 1.54k | prv->credit_balance = 0; |
1411 | 1.54k | spin_unlock_irqrestore(&prv->lock, flags); |
1412 | 1.54k | SCHED_STAT_CRANK(acct_no_work); |
1413 | 1.54k | goto out; |
1414 | 1.54k | } |
1415 | 2.89k | |
1416 | 1.34k | SCHED_STAT_CRANK(acct_run); |
1417 | 1.34k | |
1418 | 1.34k | weight_left = weight_total; |
1419 | 1.34k | credit_balance = 0; |
1420 | 1.34k | credit_xtra = 0; |
1421 | 1.34k | credit_cap = 0U; |
1422 | 1.34k | |
1423 | 1.34k | list_for_each_safe( iter_sdom, next_sdom, &prv->active_sdom ) |
1424 | 1.34k | { |
1425 | 1.34k | sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem); |
1426 | 1.34k | |
1427 | 1.34k | BUG_ON( is_idle_domain(sdom->dom) ); |
1428 | 1.34k | BUG_ON( sdom->active_vcpu_count == 0 ); |
1429 | 1.34k | BUG_ON( sdom->weight == 0 ); |
1430 | 1.34k | BUG_ON( (sdom->weight * sdom->active_vcpu_count) > weight_left ); |
1431 | 1.34k | |
1432 | 1.34k | weight_left -= ( sdom->weight * sdom->active_vcpu_count ); |
1433 | 1.34k | |
1434 | 1.34k | /* |
1435 | 1.34k | * A domain's fair share is computed using its weight in competition |
1436 | 1.34k | * with that of all other active domains. |
1437 | 1.34k | * |
1438 | 1.34k | * At most, a domain can use credits to run all its active VCPUs |
1439 | 1.34k | * for one full accounting period. We allow a domain to earn more |
1440 | 1.34k | * only when the system-wide credit balance is negative. |
1441 | 1.34k | */ |
1442 | 1.34k | credit_peak = sdom->active_vcpu_count * prv->credits_per_tslice; |
1443 | 1.34k | if ( prv->credit_balance < 0 ) |
1444 | 59 | { |
1445 | 59 | credit_peak += ( ( -prv->credit_balance |
1446 | 59 | * sdom->weight |
1447 | 59 | * sdom->active_vcpu_count) + |
1448 | 59 | (weight_total - 1) |
1449 | 59 | ) / weight_total; |
1450 | 59 | } |
1451 | 1.34k | |
1452 | 1.34k | if ( sdom->cap != 0U ) |
1453 | 0 | { |
1454 | 0 | credit_cap = ((sdom->cap * prv->credits_per_tslice) + 99) / 100; |
1455 | 0 | if ( credit_cap < credit_peak ) |
1456 | 0 | credit_peak = credit_cap; |
1457 | 0 |
1458 | 0 | /* FIXME -- set cap per-vcpu as well...? */ |
1459 | 0 | credit_cap = ( credit_cap + ( sdom->active_vcpu_count - 1 ) |
1460 | 0 | ) / sdom->active_vcpu_count; |
1461 | 0 | } |
1462 | 1.34k | |
1463 | 1.34k | credit_fair = ( ( credit_total |
1464 | 1.34k | * sdom->weight |
1465 | 1.34k | * sdom->active_vcpu_count ) |
1466 | 1.34k | + (weight_total - 1) |
1467 | 1.34k | ) / weight_total; |
1468 | 1.34k | |
1469 | 1.34k | if ( credit_fair < credit_peak ) |
1470 | 0 | { |
1471 | 0 | credit_xtra = 1; |
1472 | 0 | } |
1473 | 1.34k | else |
1474 | 1.34k | { |
1475 | 1.34k | if ( weight_left != 0U ) |
1476 | 0 | { |
1477 | 0 | /* Give other domains a chance at unused credits */ |
1478 | 0 | credit_total += ( ( ( credit_fair - credit_peak |
1479 | 0 | ) * weight_total |
1480 | 0 | ) + ( weight_left - 1 ) |
1481 | 0 | ) / weight_left; |
1482 | 0 | } |
1483 | 1.34k | |
1484 | 1.34k | if ( credit_xtra ) |
1485 | 0 | { |
1486 | 0 | /* |
1487 | 0 | * Lazily keep domains with extra credits at the head of |
1488 | 0 | * the queue to give others a chance at them in future |
1489 | 0 | * accounting periods. |
1490 | 0 | */ |
1491 | 0 | SCHED_STAT_CRANK(acct_reorder); |
1492 | 0 | list_del(&sdom->active_sdom_elem); |
1493 | 0 | list_add(&sdom->active_sdom_elem, &prv->active_sdom); |
1494 | 0 | } |
1495 | 1.34k | |
1496 | 1.34k | credit_fair = credit_peak; |
1497 | 1.34k | } |
1498 | 1.34k | |
1499 | 1.34k | /* Compute fair share per VCPU */ |
1500 | 1.34k | credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 ) |
1501 | 1.34k | ) / sdom->active_vcpu_count; |
1502 | 1.34k | |
1503 | 1.34k | |
1504 | 1.34k | list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu ) |
1505 | 4.65k | { |
1506 | 4.65k | svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem); |
1507 | 4.65k | BUG_ON( sdom != svc->sdom ); |
1508 | 4.65k | |
1509 | 4.65k | /* Increment credit */ |
1510 | 4.65k | atomic_add(credit_fair, &svc->credit); |
1511 | 4.65k | credit = atomic_read(&svc->credit); |
1512 | 4.65k | |
1513 | 4.65k | /* |
1514 | 4.65k | * Recompute priority or, if VCPU is idling, remove it from |
1515 | 4.65k | * the active list. |
1516 | 4.65k | */ |
1517 | 4.65k | if ( credit < 0 ) |
1518 | 493 | { |
1519 | 493 | svc->pri = CSCHED_PRI_TS_OVER; |
1520 | 493 | |
1521 | 493 | /* Park running VCPUs of capped-out domains */ |
1522 | 493 | if ( sdom->cap != 0U && |
1523 | 0 | credit < -credit_cap && |
1524 | 0 | !test_and_set_bit(CSCHED_FLAG_VCPU_PARKED, &svc->flags) ) |
1525 | 0 | { |
1526 | 0 | SCHED_STAT_CRANK(vcpu_park); |
1527 | 0 | vcpu_pause_nosync(svc->vcpu); |
1528 | 0 | } |
1529 | 493 | |
1530 | 493 | /* Lower bound on credits */ |
1531 | 493 | if ( credit < -prv->credits_per_tslice ) |
1532 | 1 | { |
1533 | 1 | SCHED_STAT_CRANK(acct_min_credit); |
1534 | 1 | credit = -prv->credits_per_tslice; |
1535 | 1 | atomic_set(&svc->credit, credit); |
1536 | 1 | } |
1537 | 493 | } |
1538 | 4.65k | else |
1539 | 4.15k | { |
1540 | 4.15k | svc->pri = CSCHED_PRI_TS_UNDER; |
1541 | 4.15k | |
1542 | 4.15k | /* Unpark any capped domains whose credits go positive */ |
1543 | 4.15k | if ( test_and_clear_bit(CSCHED_FLAG_VCPU_PARKED, &svc->flags) ) |
1544 | 0 | { |
1545 | 0 | /* |
1546 | 0 | * It's important to unset the flag AFTER the unpause() |
1547 | 0 | * call to make sure the VCPU's priority is not boosted |
1548 | 0 | * if it is woken up here. |
1549 | 0 | */ |
1550 | 0 | SCHED_STAT_CRANK(vcpu_unpark); |
1551 | 0 | vcpu_unpause(svc->vcpu); |
1552 | 0 | } |
1553 | 4.15k | |
1554 | 4.15k | /* Upper bound on credits means VCPU stops earning */ |
1555 | 4.15k | if ( credit > prv->credits_per_tslice ) |
1556 | 1.08k | { |
1557 | 1.08k | __csched_vcpu_acct_stop_locked(prv, svc); |
1558 | 1.08k | /* Divide credits in half, so that when it starts |
1559 | 1.08k | * accounting again, it starts a little bit "ahead" */ |
1560 | 1.08k | credit /= 2; |
1561 | 1.08k | atomic_set(&svc->credit, credit); |
1562 | 1.08k | } |
1563 | 4.15k | } |
1564 | 4.65k | |
1565 | 4.65k | SCHED_VCPU_STAT_SET(svc, credit_last, credit); |
1566 | 4.65k | SCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair); |
1567 | 4.65k | credit_balance += credit; |
1568 | 4.65k | } |
1569 | 1.34k | } |
1570 | 1.34k | |
1571 | 1.34k | prv->credit_balance = credit_balance; |
1572 | 1.34k | |
1573 | 1.34k | spin_unlock_irqrestore(&prv->lock, flags); |
1574 | 1.34k | |
1575 | 1.34k | /* Inform each CPU that its runq needs to be sorted */ |
1576 | 1.34k | prv->runq_sort++; |
1577 | 1.34k | |
1578 | 2.89k | out: |
1579 | 2.89k | set_timer( &prv->master_ticker, |
1580 | 2.89k | NOW() + MILLISECS(prv->tslice_ms)); |
1581 | 2.89k | } |
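For a concrete sense of the arithmetic in the accounting loop above, the sketch below reproduces the peak, cap, fair-share and per-vCPU rounding with made-up numbers; only the formulas come from the code above, all variable values here are hypothetical.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Hypothetical accounting-period inputs. */
    uint32_t credit_total       = 900;  /* credits to distribute          */
    uint32_t weight_total       = 768;  /* sum of weight * active vCPUs   */
    uint32_t credits_per_tslice = 300;  /* one vCPU's worth per period    */
    uint32_t weight             = 256;  /* this domain's weight           */
    uint32_t nr_vcpus           = 2;    /* this domain's active vCPUs     */
    uint32_t cap                = 50;   /* cap, in percent of one pCPU    */

    /* Peak: enough to run every active vCPU for the whole period. */
    uint32_t credit_peak = nr_vcpus * credits_per_tslice;

    /* Cap: percentage of a timeslice, rounded up. */
    uint32_t credit_cap = (cap * credits_per_tslice + 99) / 100;
    if ( credit_cap < credit_peak )
        credit_peak = credit_cap;

    /* Fair share: weighted slice of the total, rounded up. */
    uint32_t credit_fair = (credit_total * weight * nr_vcpus
                            + (weight_total - 1)) / weight_total;
    if ( credit_fair >= credit_peak )
        credit_fair = credit_peak;   /* clamp; the code above hands the excess back */

    /* Split the domain's share evenly across its vCPUs, rounded up. */
    uint32_t per_vcpu = (credit_fair + (nr_vcpus - 1)) / nr_vcpus;

    printf("peak=%u fair=%u per-vCPU=%u\n", credit_peak, credit_fair, per_vcpu);
    return 0;
}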
1582 | | |
1583 | | static void |
1584 | | csched_tick(void *_cpu) |
1585 | 11.0k | { |
1586 | 11.0k | unsigned int cpu = (unsigned long)_cpu; |
1587 | 11.0k | struct csched_pcpu *spc = CSCHED_PCPU(cpu); |
1588 | 11.0k | struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu)); |
1589 | 11.0k | |
1590 | 11.0k | spc->tick++; |
1591 | 11.0k | |
1592 | 11.0k | /* |
1593 | 11.0k | * Accounting for running VCPU |
1594 | 11.0k | */ |
1595 | 11.0k | if ( !is_idle_vcpu(current) ) |
1596 | 10.3k | csched_vcpu_acct(prv, cpu); |
1597 | 11.0k | |
1598 | 11.0k | /* |
1599 | 11.0k | * Check if runq needs to be sorted |
1600 | 11.0k | * |
1601 | 11.0k | * Every physical CPU resorts the runq after the accounting master has |
1602 | 11.0k | * modified priorities. This is a special O(n) sort and runs at most |
1603 | 11.0k | * once per accounting period (currently 30 milliseconds). |
1604 | 11.0k | */ |
1605 | 11.0k | csched_runq_sort(prv, cpu); |
1606 | 11.0k | |
1607 | 11.0k | set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) ); |
1608 | 11.0k | } |
1609 | | |
1610 | | static struct csched_vcpu * |
1611 | | csched_runq_steal(int peer_cpu, int cpu, int pri, int balance_step) |
1612 | 2.39k | { |
1613 | 2.39k | const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu); |
1614 | 2.39k | struct csched_vcpu *speer; |
1615 | 2.39k | struct list_head *iter; |
1616 | 2.39k | struct vcpu *vc; |
1617 | 2.39k | |
1618 | 2.39k | ASSERT(peer_pcpu != NULL); |
1619 | 2.39k | |
1620 | 2.39k | /* |
1621 | 2.39k | * Don't steal from an idle CPU's runq because it's about to |
1622 | 2.39k | * pick up work from it itself. |
1623 | 2.39k | */ |
1624 | 2.39k | if ( unlikely(is_idle_vcpu(curr_on_cpu(peer_cpu))) ) |
1625 | 1 | goto out; |
1626 | 2.39k | |
1627 | 2.39k | list_for_each( iter, &peer_pcpu->runq ) |
1628 | 3.08k | { |
1629 | 3.08k | speer = __runq_elem(iter); |
1630 | 3.08k | |
1631 | 3.08k | /* |
1632 | 3.08k | * If next available VCPU here is not of strictly higher |
1633 | 3.08k | * priority than ours, this PCPU is useless to us. |
1634 | 3.08k | */ |
1635 | 3.08k | if ( speer->pri <= pri ) |
1636 | 1.88k | break; |
1637 | 3.08k | |
1638 | 3.08k | /* Is this VCPU runnable on our PCPU? */ |
1639 | 1.19k | vc = speer->vcpu; |
1640 | 1.19k | BUG_ON( is_idle_vcpu(vc) ); |
1641 | 1.19k | |
1642 | 1.19k | /* |
1643 | 1.19k | * If the vcpu is still in peer_cpu's scheduling tail, or if it |
1644 | 1.19k | * has no useful soft affinity, skip it. |
1645 | 1.19k | * |
1646 | 1.19k | * In fact, what we want is to check if we have any "soft-affine |
1647 | 1.19k | * work" to steal, before starting to look at "hard-affine work". |
1648 | 1.19k | * |
1649 | 1.19k | * Notice that, if not even one vCPU on this runq has a useful |
1650 | 1.19k | * soft affinity, we could have avoided considering this runq for |
1651 | 1.19k | * a soft balancing step in the first place. This, for instance, |
1652 | 1.19k | * can be implemented by keeping track of which runqs have |
1653 | 1.19k | * vCPUs with useful soft affinities in some sort of bitmap |
1654 | 1.19k | * or counter. |
1655 | 1.19k | */ |
1656 | 1.19k | if ( vc->is_running || |
1657 | 1.05k | (balance_step == BALANCE_SOFT_AFFINITY |
1658 | 546 | && !has_soft_affinity(vc, vc->cpu_hard_affinity)) ) |
1659 | 691 | continue; |
1660 | 1.19k | |
1661 | 506 | affinity_balance_cpumask(vc, balance_step, cpumask_scratch); |
1662 | 506 | if ( __csched_vcpu_is_migrateable(vc, cpu, cpumask_scratch) ) |
1663 | 506 | { |
1664 | 506 | /* We got a candidate. Grab it! */ |
1665 | 506 | TRACE_3D(TRC_CSCHED_STOLEN_VCPU, peer_cpu, |
1666 | 506 | vc->domain->domain_id, vc->vcpu_id); |
1667 | 506 | SCHED_VCPU_STAT_CRANK(speer, migrate_q); |
1668 | 506 | SCHED_STAT_CRANK(migrate_queued); |
1669 | 506 | WARN_ON(vc->is_urgent); |
1670 | 506 | runq_remove(speer); |
1671 | 506 | vc->processor = cpu; |
1672 | 506 | /* |
1673 | 506 | * speer will start executing directly on cpu, without having to |
1674 | 506 | * go through runq_insert(). So we must update the runnable count |
1675 | 506 | * for cpu here. |
1676 | 506 | */ |
1677 | 506 | inc_nr_runnable(cpu); |
1678 | 506 | return speer; |
1679 | 506 | } |
1680 | 506 | } |
1681 | 1.88k | out: |
1682 | 1.88k | SCHED_STAT_CRANK(steal_peer_idle); |
1683 | 1.88k | return NULL; |
1684 | 2.39k | } |
1685 | | |
1686 | | static struct csched_vcpu * |
1687 | | csched_load_balance(struct csched_private *prv, int cpu, |
1688 | | struct csched_vcpu *snext, bool_t *stolen) |
1689 | 385k | { |
1690 | 385k | struct cpupool *c = per_cpu(cpupool, cpu); |
1691 | 385k | struct csched_vcpu *speer; |
1692 | 385k | cpumask_t workers; |
1693 | 385k | cpumask_t *online; |
1694 | 385k | int peer_cpu, first_cpu, peer_node, bstep; |
1695 | 385k | int node = cpu_to_node(cpu); |
1696 | 385k | |
1697 | 385k | BUG_ON( cpu != snext->vcpu->processor ); |
1698 | 385k | online = cpupool_online_cpumask(c); |
1699 | 385k | |
1700 | 385k | /* |
1701 | 385k | * If this CPU is going offline, or is not (yet) part of any cpupool |
1702 | 385k | * (as it happens, e.g., during cpu bringup), we shouldn't steal work. |
1703 | 385k | */ |
1704 | 385k | if ( unlikely(!cpumask_test_cpu(cpu, online) || c == NULL) ) |
1705 | 0 | goto out; |
1706 | 385k | |
1707 | 385k | if ( snext->pri == CSCHED_PRI_IDLE ) |
1708 | 66.5k | SCHED_STAT_CRANK(load_balance_idle); |
1709 | 318k | else if ( snext->pri == CSCHED_PRI_TS_OVER ) |
1710 | 318k | SCHED_STAT_CRANK(load_balance_over); |
1711 | 318k | else |
1712 | 516 | SCHED_STAT_CRANK(load_balance_other); |
1713 | 385k | |
1714 | 385k | /* |
1715 | 385k | * Let's look around for work to steal, taking both hard affinity |
1716 | 385k | * and soft affinity into account. More specifically, we check all |
1717 | 385k | * the non-idle CPUs' runq, looking for: |
1718 | 385k | * 1. any "soft-affine work" to steal first, |
1719 | 385k | * 2. if not finding anything, any "hard-affine work" to steal. |
1720 | 385k | */ |
1721 | 385k | for_each_affinity_balance_step( bstep ) |
1722 | 751k | { |
1723 | 751k | /* |
1724 | 751k | * We peek at the non-idling CPUs in a node-wise fashion. In fact, |
1725 | 751k | * it is more likely that we find some affine work on our same |
1726 | 751k | * node, not to mention that migrating vcpus within the same node |
1727 | 751k | * could well be expected to be cheaper than across nodes (memory |
1728 | 751k | * stays local, there might be some node-wide cache[s], etc.). |
1729 | 751k | */ |
1730 | 751k | peer_node = node; |
1731 | 751k | do |
1732 | 751k | { |
1733 | 751k | /* Select the pCPUs in this node that have work we can steal. */ |
1734 | 751k | cpumask_andnot(&workers, online, prv->idlers); |
1735 | 751k | cpumask_and(&workers, &workers, &node_to_cpumask(peer_node)); |
1736 | 751k | __cpumask_clear_cpu(cpu, &workers); |
1737 | 751k | |
1738 | 751k | first_cpu = cpumask_cycle(prv->balance_bias[peer_node], &workers); |
1739 | 751k | if ( first_cpu >= nr_cpu_ids ) |
1740 | 57.9k | goto next_node; |
1741 | 693k | peer_cpu = first_cpu; |
1742 | 693k | do |
1743 | 6.06M | { |
1744 | 6.06M | spinlock_t *lock; |
1745 | 6.06M | |
1746 | 6.06M | /* |
1747 | 6.06M | * If there is only one runnable vCPU on peer_cpu, it means |
1748 | 6.06M | * there's no one to be stolen in its runqueue, so skip it. |
1749 | 6.06M | * |
1750 | 6.06M | * Checking this without holding the lock is racy... But that's |
1751 | 6.06M | * the whole point of this optimization! |
1752 | 6.06M | * |
1753 | 6.06M | * In more details: |
1754 | 6.06M | * - if we race with dec_nr_runnable(), we may try to take the |
1755 | 6.06M | * lock and call csched_runq_steal() for no reason. This is |
1756 | 6.06M | * not a functional issue, and should be infrequent enough. |
1757 | 6.06M | * And we can avoid that by re-checking nr_runnable after |
1758 | 6.06M | * having grabbed the lock, if we want; |
1759 | 6.06M | * - if we race with inc_nr_runnable(), we skip a pCPU that may |
1760 | 6.06M | * have runnable vCPUs in its runqueue, but that's not a |
1761 | 6.06M | * problem because: |
1762 | 6.06M | * + if racing with csched_vcpu_insert() or csched_vcpu_wake(), |
1763 | 6.06M | * __runq_tickle() will be called afterwords, so the vCPU |
1764 | 6.06M | * __runq_tickle() will be called afterwards, so the vCPU |
1765 | 6.06M | * + if racing with csched_runq_steal(), it may be that a |
1766 | 6.06M | * vCPU that we could have picked up, stays in a runqueue |
1767 | 6.06M | * until someone else tries to steal it again. But this is |
1768 | 6.06M | * no worse than what can happen already (without this |
1769 | 6.06M | * optimization), if the pCPU would schedule right after we |
1770 | 6.06M | * have taken the lock, and hence block on it. |
1771 | 6.06M | */ |
1772 | 6.06M | if ( CSCHED_PCPU(peer_cpu)->nr_runnable <= 1 ) |
1773 | 6.04M | { |
1774 | 6.04M | TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skipp'n */ 0); |
1775 | 6.04M | goto next_cpu; |
1776 | 6.04M | } |
1777 | 6.06M | |
1778 | 6.06M | /* |
1779 | 6.06M | * Get ahold of the scheduler lock for this peer CPU. |
1780 | 6.06M | * |
1781 | 6.06M | * Note: We don't spin on this lock but simply try it. Spinning |
1782 | 6.06M | * could cause a deadlock if the peer CPU is also load |
1783 | 6.06M | * balancing and trying to lock this CPU. |
1784 | 6.06M | */ |
1785 | 20.6k | lock = pcpu_schedule_trylock(peer_cpu); |
1786 | 20.6k | SCHED_STAT_CRANK(steal_trylock); |
1787 | 20.6k | if ( !lock ) |
1788 | 1.03k | { |
1789 | 1.03k | SCHED_STAT_CRANK(steal_trylock_failed); |
1790 | 1.03k | TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skip */ 0); |
1791 | 1.03k | goto next_cpu; |
1792 | 1.03k | } |
1793 | 20.6k | |
1794 | 19.5k | TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* checked */ 1); |
1795 | 19.5k | |
1796 | 19.5k | /* Any work over there to steal? */ |
1797 | 19.5k | speer = cpumask_test_cpu(peer_cpu, online) ? |
1798 | 17.1k | csched_runq_steal(peer_cpu, cpu, snext->pri, bstep) : NULL; |
1799 | 19.5k | pcpu_schedule_unlock(lock, peer_cpu); |
1800 | 19.5k | |
1801 | 19.5k | /* As soon as one vcpu is found, balancing ends */ |
1802 | 19.5k | if ( speer != NULL ) |
1803 | 506 | { |
1804 | 506 | *stolen = 1; |
1805 | 506 | /* |
1806 | 506 | * Next time we'll look for work to steal on this node, we |
1807 | 506 | * will start from the next pCPU, with respect to this one, |
1808 | 506 | * so we don't risk stealing always from the same ones. |
1809 | 506 | */ |
1810 | 506 | prv->balance_bias[peer_node] = peer_cpu; |
1811 | 506 | return speer; |
1812 | 506 | } |
1813 | 19.5k | |
1814 | 6.01M | next_cpu: |
1815 | 6.01M | peer_cpu = cpumask_cycle(peer_cpu, &workers); |
1816 | 6.01M | |
1817 | 6.01M | } while( peer_cpu != first_cpu ); |
1818 | 693k | |
1819 | 754k | next_node: |
1820 | 754k | peer_node = cycle_node(peer_node, node_online_map); |
1821 | 754k | } while( peer_node != node ); |
1822 | 751k | } |
1823 | 385k | |
1824 | 376k | out: |
1825 | 376k | /* Failed to find more important work elsewhere... */ |
1826 | 376k | __runq_remove(snext); |
1827 | 376k | return snext; |
1828 | 385k | } |
1829 | | |
1830 | | /* |
1831 | | * This function is in the critical path. It is designed to be simple and |
1832 | | * fast for the common case. |
1833 | | */ |
1834 | | static struct task_slice |
1835 | | csched_schedule( |
1836 | | const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled) |
1837 | 4.70M | { |
1838 | 4.70M | const int cpu = smp_processor_id(); |
1839 | 4.70M | struct list_head * const runq = RUNQ(cpu); |
1840 | 4.70M | struct csched_vcpu * const scurr = CSCHED_VCPU(current); |
1841 | 4.70M | struct csched_private *prv = CSCHED_PRIV(ops); |
1842 | 4.70M | struct csched_vcpu *snext; |
1843 | 4.70M | struct task_slice ret; |
1844 | 4.70M | s_time_t runtime, tslice; |
1845 | 4.70M | |
1846 | 4.70M | SCHED_STAT_CRANK(schedule); |
1847 | 4.70M | CSCHED_VCPU_CHECK(current); |
1848 | 4.70M | |
1849 | 4.70M | /* |
1850 | 4.70M | * Here in Credit1 code, we usually just call TRACE_nD() helpers, and |
1851 | 4.70M | * don't care about packing. But scheduling happens very often, so it |
1852 | 4.70M | * actually is important that the record is as small as possible. |
1853 | 4.70M | */ |
1854 | 4.70M | if ( unlikely(tb_init_done) ) |
1855 | 0 | { |
1856 | 0 | struct { |
1857 | 0 | unsigned cpu:16, tasklet:8, idle:8; |
1858 | 0 | } d; |
1859 | 0 | d.cpu = cpu; |
1860 | 0 | d.tasklet = tasklet_work_scheduled; |
1861 | 0 | d.idle = is_idle_vcpu(current); |
1862 | 0 | __trace_var(TRC_CSCHED_SCHEDULE, 1, sizeof(d), |
1863 | 0 | (unsigned char *)&d); |
1864 | 0 | } |
1865 | 4.70M | |
1866 | 4.70M | runtime = now - current->runstate.state_entry_time; |
1867 | 4.70M | if ( runtime < 0 ) /* Does this ever happen? */ |
1868 | 62.4k | runtime = 0; |
1869 | 4.70M | |
1870 | 4.70M | if ( !is_idle_vcpu(scurr->vcpu) ) |
1871 | 4.63M | { |
1872 | 4.63M | /* Update credits of a non-idle VCPU. */ |
1873 | 4.63M | burn_credits(scurr, now); |
1874 | 4.63M | scurr->start_time -= now; |
1875 | 4.63M | } |
1876 | 4.70M | else |
1877 | 69.7k | { |
1878 | 69.7k | /* Re-instate a boosted idle VCPU as normal-idle. */ |
1879 | 69.7k | scurr->pri = CSCHED_PRI_IDLE; |
1880 | 69.7k | } |
1881 | 4.70M | |
1882 | 4.70M | /* Choices, choices: |
1883 | 4.70M | * - If we have a tasklet, we need to run the idle vcpu no matter what. |
1884 | 4.70M | * - If sched rate limiting is in effect, and the current vcpu has |
1885 | 4.70M | * run for less than that amount of time, continue the current one, |
1886 | 4.70M | * but with a shorter timeslice and return it immediately |
1887 | 4.70M | * - Otherwise, choose the one with the highest priority (which may |
1888 | 4.70M | * be the one currently running) |
1889 | 4.70M | * - If the currently running one is TS_OVER, see if there |
1890 | 4.70M | * is a higher priority one waiting on the runqueue of another |
1891 | 4.70M | * cpu and steal it. |
1892 | 4.70M | */ |
1893 | 4.70M | |
1894 | 4.70M | /* |
1895 | 4.70M | * If we have schedule rate limiting enabled, check to see |
1896 | 4.70M | * how long we've run for. |
1897 | 4.70M | * |
1898 | 4.70M | * If scurr is yielding, however, we don't let rate limiting kick in. |
1899 | 4.70M | * In fact, it may be the case that scurr is about to spin, and there's |
1900 | 4.70M | * no point forcing it to do so until rate limiting expires. |
1901 | 4.70M | */ |
1902 | 4.70M | if ( !test_bit(CSCHED_FLAG_VCPU_YIELD, &scurr->flags) |
1903 | 134k | && !tasklet_work_scheduled |
1904 | 134k | && prv->ratelimit_us |
1905 | 134k | && vcpu_runnable(current) |
1906 | 68.8k | && !is_idle_vcpu(current) |
1907 | 2.60k | && runtime < MICROSECS(prv->ratelimit_us) ) |
1908 | 242 | { |
1909 | 242 | snext = scurr; |
1910 | 242 | snext->start_time += now; |
1911 | 242 | perfc_incr(delay_ms); |
1912 | 242 | /* |
1913 | 242 | * Next timeslice must last just until we'll have executed for |
1914 | 242 | * ratelimit_us. However, to avoid setting a really short timer, which |
1915 | 242 | * will most likely be inaccurate and counterproductive, we never go |
1916 | 242 | * below CSCHED_MIN_TIMER. |
1917 | 242 | */ |
1918 | 242 | tslice = MICROSECS(prv->ratelimit_us) - runtime; |
1919 | 242 | if ( unlikely(runtime < CSCHED_MIN_TIMER) ) |
1920 | 57 | tslice = CSCHED_MIN_TIMER; |
1921 | 242 | if ( unlikely(tb_init_done) ) |
1922 | 0 | { |
1923 | 0 | struct { |
1924 | 0 | unsigned vcpu:16, dom:16; |
1925 | 0 | unsigned runtime; |
1926 | 0 | } d; |
1927 | 0 | d.dom = scurr->vcpu->domain->domain_id; |
1928 | 0 | d.vcpu = scurr->vcpu->vcpu_id; |
1929 | 0 | d.runtime = runtime; |
1930 | 0 | __trace_var(TRC_CSCHED_RATELIMIT, 1, sizeof(d), |
1931 | 0 | (unsigned char *)&d); |
1932 | 0 | } |
1933 | 242 | |
1934 | 242 | ret.migrated = 0; |
1935 | 242 | goto out; |
1936 | 242 | } |
1937 | 4.70M | tslice = MILLISECS(prv->tslice_ms); |
1938 | 4.70M | |
1939 | 4.70M | /* |
1940 | 4.70M | * Select next runnable local VCPU (ie top of local runq) |
1941 | 4.70M | */ |
1942 | 4.70M | if ( vcpu_runnable(current) ) |
1943 | 4.83M | __runq_insert(scurr); |
1944 | 4.70M | else |
1945 | 18.4E | { |
1946 | 18.4E | BUG_ON( is_idle_vcpu(current) || list_empty(runq) ); |
1947 | 18.4E | /* Current has blocked. Update the runnable counter for this cpu. */ |
1948 | 18.4E | dec_nr_runnable(cpu); |
1949 | 18.4E | } |
1950 | 4.70M | |
1951 | 4.70M | snext = __runq_elem(runq->next); |
1952 | 4.70M | ret.migrated = 0; |
1953 | 4.70M | |
1954 | 4.70M | /* Tasklet work (which runs in idle VCPU context) overrides all else. */ |
1955 | 4.70M | if ( tasklet_work_scheduled ) |
1956 | 44 | { |
1957 | 44 | TRACE_0D(TRC_CSCHED_SCHED_TASKLET); |
1958 | 44 | snext = CSCHED_VCPU(idle_vcpu[cpu]); |
1959 | 44 | snext->pri = CSCHED_PRI_TS_BOOST; |
1960 | 44 | } |
1961 | 4.70M | |
1962 | 4.70M | /* |
1963 | 4.70M | * Clear YIELD flag before scheduling out |
1964 | 4.70M | */ |
1965 | 4.70M | clear_bit(CSCHED_FLAG_VCPU_YIELD, &scurr->flags); |
1966 | 4.70M | |
1967 | 4.70M | /* |
1968 | 4.70M | * SMP Load balance: |
1969 | 4.70M | * |
1970 | 4.70M | * If the next highest priority local runnable VCPU has already eaten |
1971 | 4.70M | * through its credits, look on other PCPUs to see if we have more |
1972 | 4.70M | * urgent work... If not, csched_load_balance() will return snext, but |
1973 | 4.70M | * already removed from the runq. |
1974 | 4.70M | */ |
1975 | 4.70M | if ( snext->pri > CSCHED_PRI_TS_OVER ) |
1976 | 4.52M | __runq_remove(snext); |
1977 | 4.70M | else |
1978 | 184k | snext = csched_load_balance(prv, cpu, snext, &ret.migrated); |
1979 | 4.70M | |
1980 | 4.70M | /* |
1981 | 4.70M | * Update idlers mask if necessary. When we're idling, other CPUs |
1982 | 4.70M | * will tickle us when they get extra work. |
1983 | 4.70M | */ |
1984 | 4.88M | if ( !tasklet_work_scheduled && snext->pri == CSCHED_PRI_IDLE ) |
1985 | 65.4k | { |
1986 | 65.4k | if ( !cpumask_test_cpu(cpu, prv->idlers) ) |
1987 | 65.5k | cpumask_set_cpu(cpu, prv->idlers); |
1988 | 65.4k | } |
1989 | 4.64M | else if ( cpumask_test_cpu(cpu, prv->idlers) ) |
1990 | 0 | { |
1991 | 0 | cpumask_clear_cpu(cpu, prv->idlers); |
1992 | 0 | } |
1993 | 4.70M | |
1994 | 4.70M | if ( !is_idle_vcpu(snext->vcpu) ) |
1995 | 4.89M | snext->start_time += now; |
1996 | 4.70M | |
1997 | 4.97M | out: |
1998 | 4.97M | /* |
1999 | 4.97M | * Return task to run next... |
2000 | 4.97M | */ |
2001 | 4.97M | ret.time = (is_idle_vcpu(snext->vcpu) ? |
2002 | 4.90M | -1 : tslice); |
2003 | 4.97M | ret.task = snext->vcpu; |
2004 | 4.97M | |
2005 | 4.97M | CSCHED_VCPU_CHECK(ret.task); |
2006 | 4.97M | return ret; |
2007 | 4.70M | } |
2008 | | |
2009 | | static void |
2010 | | csched_dump_vcpu(struct csched_vcpu *svc) |
2011 | 0 | { |
2012 | 0 | struct csched_dom * const sdom = svc->sdom; |
2013 | 0 |
2014 | 0 | printk("[%i.%i] pri=%i flags=%x cpu=%i", |
2015 | 0 | svc->vcpu->domain->domain_id, |
2016 | 0 | svc->vcpu->vcpu_id, |
2017 | 0 | svc->pri, |
2018 | 0 | svc->flags, |
2019 | 0 | svc->vcpu->processor); |
2020 | 0 |
2021 | 0 | if ( sdom ) |
2022 | 0 | { |
2023 | 0 | printk(" credit=%i [w=%u,cap=%u]", atomic_read(&svc->credit), |
2024 | 0 | sdom->weight, sdom->cap); |
2025 | 0 | #ifdef CSCHED_STATS |
2026 | | printk(" (%d+%u) {a/i=%u/%u m=%u+%u (k=%u)}", |
2027 | | svc->stats.credit_last, |
2028 | | svc->stats.credit_incr, |
2029 | | svc->stats.state_active, |
2030 | | svc->stats.state_idle, |
2031 | | svc->stats.migrate_q, |
2032 | | svc->stats.migrate_r, |
2033 | | svc->stats.kicked_away); |
2034 | | #endif |
2035 | 0 | } |
2036 | 0 |
2037 | 0 | printk("\n"); |
2038 | 0 | } |
2039 | | |
2040 | | static void |
2041 | | csched_dump_pcpu(const struct scheduler *ops, int cpu) |
2042 | 0 | { |
2043 | 0 | struct list_head *runq, *iter; |
2044 | 0 | struct csched_private *prv = CSCHED_PRIV(ops); |
2045 | 0 | struct csched_pcpu *spc; |
2046 | 0 | struct csched_vcpu *svc; |
2047 | 0 | spinlock_t *lock; |
2048 | 0 | unsigned long flags; |
2049 | 0 | int loop; |
2050 | 0 | #define cpustr keyhandler_scratch |
2051 | 0 |
2052 | 0 | /* |
2053 | 0 | * We need both locks: |
2054 | 0 | * - csched_dump_vcpu() wants to access domains' scheduling |
2055 | 0 | * parameters, which are protected by the private scheduler lock; |
2056 | 0 | * - we scan through the runqueue, so we need the proper runqueue |
2057 | 0 | * lock (the one of the runqueue of this cpu). |
2058 | 0 | */ |
2059 | 0 | spin_lock_irqsave(&prv->lock, flags); |
2060 | 0 | lock = pcpu_schedule_lock(cpu); |
2061 | 0 |
2062 | 0 | spc = CSCHED_PCPU(cpu); |
2063 | 0 | runq = &spc->runq; |
2064 | 0 |
2065 | 0 | cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cpu)); |
2066 | 0 | printk("CPU[%02d] nr_run=%d, sort=%d, sibling=%s, ", |
2067 | 0 | cpu, spc->nr_runnable, spc->runq_sort_last, cpustr); |
2068 | 0 | cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu)); |
2069 | 0 | printk("core=%s\n", cpustr); |
2070 | 0 |
2071 | 0 | /* current VCPU (nothing to say if that's the idle vcpu). */ |
2072 | 0 | svc = CSCHED_VCPU(curr_on_cpu(cpu)); |
2073 | 0 | if ( svc && !is_idle_vcpu(svc->vcpu) ) |
2074 | 0 | { |
2075 | 0 | printk("\trun: "); |
2076 | 0 | csched_dump_vcpu(svc); |
2077 | 0 | } |
2078 | 0 |
2079 | 0 | loop = 0; |
2080 | 0 | list_for_each( iter, runq ) |
2081 | 0 | { |
2082 | 0 | svc = __runq_elem(iter); |
2083 | 0 | if ( svc ) |
2084 | 0 | { |
2085 | 0 | printk("\t%3d: ", ++loop); |
2086 | 0 | csched_dump_vcpu(svc); |
2087 | 0 | } |
2088 | 0 | } |
2089 | 0 |
2090 | 0 | pcpu_schedule_unlock(lock, cpu); |
2091 | 0 | spin_unlock_irqrestore(&prv->lock, flags); |
2092 | 0 | #undef cpustr |
2093 | 0 | } |
2094 | | |
2095 | | static void |
2096 | | csched_dump(const struct scheduler *ops) |
2097 | 0 | { |
2098 | 0 | struct list_head *iter_sdom, *iter_svc; |
2099 | 0 | struct csched_private *prv = CSCHED_PRIV(ops); |
2100 | 0 | int loop; |
2101 | 0 | unsigned long flags; |
2102 | 0 |
2103 | 0 | spin_lock_irqsave(&prv->lock, flags); |
2104 | 0 |
2105 | 0 | #define idlers_buf keyhandler_scratch |
2106 | 0 |
2107 | 0 | printk("info:\n" |
2108 | 0 | "\tncpus = %u\n" |
2109 | 0 | "\tmaster = %u\n" |
2110 | 0 | "\tcredit = %u\n" |
2111 | 0 | "\tcredit balance = %d\n" |
2112 | 0 | "\tweight = %u\n" |
2113 | 0 | "\trunq_sort = %u\n" |
2114 | 0 | "\tdefault-weight = %d\n" |
2115 | 0 | "\ttslice = %dms\n" |
2116 | 0 | "\tratelimit = %dus\n" |
2117 | 0 | "\tcredits per msec = %d\n" |
2118 | 0 | "\tticks per tslice = %d\n" |
2119 | 0 | "\tmigration delay = %uus\n", |
2120 | 0 | prv->ncpus, |
2121 | 0 | prv->master, |
2122 | 0 | prv->credit, |
2123 | 0 | prv->credit_balance, |
2124 | 0 | prv->weight, |
2125 | 0 | prv->runq_sort, |
2126 | 0 | CSCHED_DEFAULT_WEIGHT, |
2127 | 0 | prv->tslice_ms, |
2128 | 0 | prv->ratelimit_us, |
2129 | 0 | CSCHED_CREDITS_PER_MSEC, |
2130 | 0 | prv->ticks_per_tslice, |
2131 | 0 | vcpu_migration_delay); |
2132 | 0 |
2133 | 0 | cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), prv->idlers); |
2134 | 0 | printk("idlers: %s\n", idlers_buf); |
2135 | 0 |
2136 | 0 | printk("active vcpus:\n"); |
2137 | 0 | loop = 0; |
2138 | 0 | list_for_each( iter_sdom, &prv->active_sdom ) |
2139 | 0 | { |
2140 | 0 | struct csched_dom *sdom; |
2141 | 0 | sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem); |
2142 | 0 |
2143 | 0 | list_for_each( iter_svc, &sdom->active_vcpu ) |
2144 | 0 | { |
2145 | 0 | struct csched_vcpu *svc; |
2146 | 0 | spinlock_t *lock; |
2147 | 0 |
2148 | 0 | svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem); |
2149 | 0 | lock = vcpu_schedule_lock(svc->vcpu); |
2150 | 0 |
2151 | 0 | printk("\t%3d: ", ++loop); |
2152 | 0 | csched_dump_vcpu(svc); |
2153 | 0 |
2154 | 0 | vcpu_schedule_unlock(lock, svc->vcpu); |
2155 | 0 | } |
2156 | 0 | } |
2157 | 0 | #undef idlers_buf |
2158 | 0 |
2159 | 0 | spin_unlock_irqrestore(&prv->lock, flags); |
2160 | 0 | } |
2161 | | |
2162 | | static int |
2163 | | csched_init(struct scheduler *ops) |
2164 | 1 | { |
2165 | 1 | struct csched_private *prv; |
2166 | 1 | |
2167 | 1 | prv = xzalloc(struct csched_private); |
2168 | 1 | if ( prv == NULL ) |
2169 | 0 | return -ENOMEM; |
2170 | 1 | |
2171 | 1 | prv->balance_bias = xzalloc_array(uint32_t, MAX_NUMNODES); |
2172 | 1 | if ( prv->balance_bias == NULL ) |
2173 | 0 | { |
2174 | 0 | xfree(prv); |
2175 | 0 | return -ENOMEM; |
2176 | 0 | } |
2177 | 1 | |
2178 | 1 | if ( !zalloc_cpumask_var(&prv->cpus) || |
2179 | 1 | !zalloc_cpumask_var(&prv->idlers) ) |
2180 | 0 | { |
2181 | 0 | free_cpumask_var(prv->cpus); |
2182 | 0 | xfree(prv->balance_bias); |
2183 | 0 | xfree(prv); |
2184 | 0 | return -ENOMEM; |
2185 | 0 | } |
2186 | 1 | |
2187 | 1 | ops->sched_data = prv; |
2188 | 1 | spin_lock_init(&prv->lock); |
2189 | 1 | INIT_LIST_HEAD(&prv->active_sdom); |
2190 | 1 | prv->master = UINT_MAX; |
2191 | 1 | |
2192 | 1 | if ( sched_credit_tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX |
2193 | 1 | || sched_credit_tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN ) |
2194 | 0 | { |
2195 | 0 | printk("WARNING: sched_credit_tslice_ms outside of valid range [%d,%d].\n" |
2196 | 0 | " Resetting to default %u\n", |
2197 | 0 | XEN_SYSCTL_CSCHED_TSLICE_MIN, |
2198 | 0 | XEN_SYSCTL_CSCHED_TSLICE_MAX, |
2199 | 0 | CSCHED_DEFAULT_TSLICE_MS); |
2200 | 0 | sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS; |
2201 | 0 | } |
2202 | 1 | |
2203 | 1 | __csched_set_tslice(prv, sched_credit_tslice_ms); |
2204 | 1 | |
2205 | 1 | if ( MICROSECS(sched_ratelimit_us) > MILLISECS(sched_credit_tslice_ms) ) |
2206 | 0 | { |
2207 | 0 | printk("WARNING: sched_ratelimit_us >" |
2208 | 0 | "sched_credit_tslice_ms is undefined\n" |
2209 | 0 | "Setting ratelimit_us to 1000 * tslice_ms\n"); |
2210 | 0 | prv->ratelimit_us = 1000 * prv->tslice_ms; |
2211 | 0 | } |
2212 | 1 | else |
2213 | 1 | prv->ratelimit_us = sched_ratelimit_us; |
2214 | 1 | return 0; |
2215 | 1 | } |
2216 | | |
2217 | | static void |
2218 | | csched_deinit(struct scheduler *ops) |
2219 | 0 | { |
2220 | 0 | struct csched_private *prv; |
2221 | 0 |
2222 | 0 | prv = CSCHED_PRIV(ops); |
2223 | 0 | if ( prv != NULL ) |
2224 | 0 | { |
2225 | 0 | ops->sched_data = NULL; |
2226 | 0 | free_cpumask_var(prv->cpus); |
2227 | 0 | free_cpumask_var(prv->idlers); |
2228 | 0 | xfree(prv->balance_bias); |
2229 | 0 | xfree(prv); |
2230 | 0 | } |
2231 | 0 | } |
2232 | | |
2233 | | static void csched_tick_suspend(const struct scheduler *ops, unsigned int cpu) |
2234 | 1.89M | { |
2235 | 1.89M | struct csched_pcpu *spc; |
2236 | 1.89M | |
2237 | 1.89M | spc = CSCHED_PCPU(cpu); |
2238 | 1.89M | |
2239 | 1.89M | stop_timer(&spc->ticker); |
2240 | 1.89M | } |
2241 | | |
2242 | | static void csched_tick_resume(const struct scheduler *ops, unsigned int cpu) |
2243 | 1.86M | { |
2244 | 1.86M | struct csched_private *prv; |
2245 | 1.86M | struct csched_pcpu *spc; |
2246 | 1.86M | uint64_t now = NOW(); |
2247 | 1.86M | |
2248 | 1.86M | spc = CSCHED_PCPU(cpu); |
2249 | 1.86M | |
2250 | 1.86M | prv = CSCHED_PRIV(ops); |
2251 | 1.86M | |
2252 | 1.86M | set_timer(&spc->ticker, now + MICROSECS(prv->tick_period_us) |
2253 | 1.86M | - now % MICROSECS(prv->tick_period_us) ); |
2254 | 1.86M | } |
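csched_tick_resume() re-arms the per-CPU ticker on the next tick-period boundary rather than a full period from now. A minimal numeric sketch of that alignment is below; the period and timestamp are made up, only the expression matches the code above.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t period_ns = 10000000ULL;    /* hypothetical 10ms tick period */
    uint64_t now_ns    = 123456789ULL;   /* hypothetical NOW() value      */

    /* Same expression as above: snap to the next period boundary. */
    uint64_t expiry = now_ns + period_ns - now_ns % period_ns;

    printf("now=%llu next tick=%llu\n",   /* next tick=130000000 */
           (unsigned long long)now_ns, (unsigned long long)expiry);
    return 0;
}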
2255 | | |
2256 | | static const struct scheduler sched_credit_def = { |
2257 | | .name = "SMP Credit Scheduler", |
2258 | | .opt_name = "credit", |
2259 | | .sched_id = XEN_SCHEDULER_CREDIT, |
2260 | | .sched_data = NULL, |
2261 | | |
2262 | | .init_domain = csched_dom_init, |
2263 | | .destroy_domain = csched_dom_destroy, |
2264 | | |
2265 | | .insert_vcpu = csched_vcpu_insert, |
2266 | | .remove_vcpu = csched_vcpu_remove, |
2267 | | |
2268 | | .sleep = csched_vcpu_sleep, |
2269 | | .wake = csched_vcpu_wake, |
2270 | | .yield = csched_vcpu_yield, |
2271 | | |
2272 | | .adjust = csched_dom_cntl, |
2273 | | .adjust_global = csched_sys_cntl, |
2274 | | |
2275 | | .pick_cpu = csched_cpu_pick, |
2276 | | .do_schedule = csched_schedule, |
2277 | | |
2278 | | .dump_cpu_state = csched_dump_pcpu, |
2279 | | .dump_settings = csched_dump, |
2280 | | .init = csched_init, |
2281 | | .deinit = csched_deinit, |
2282 | | .alloc_vdata = csched_alloc_vdata, |
2283 | | .free_vdata = csched_free_vdata, |
2284 | | .alloc_pdata = csched_alloc_pdata, |
2285 | | .init_pdata = csched_init_pdata, |
2286 | | .deinit_pdata = csched_deinit_pdata, |
2287 | | .free_pdata = csched_free_pdata, |
2288 | | .switch_sched = csched_switch_sched, |
2289 | | .alloc_domdata = csched_alloc_domdata, |
2290 | | .free_domdata = csched_free_domdata, |
2291 | | |
2292 | | .tick_suspend = csched_tick_suspend, |
2293 | | .tick_resume = csched_tick_resume, |
2294 | | }; |
2295 | | |
2296 | | REGISTER_SCHEDULER(sched_credit_def); |