debuggers.hg: view xen/common/sched_credit.c @ 20975:bd0d6ec8caaa

changeset: keyhandler: global shared scratch space for temporary strings

Put one static definition in one place and we can make it as big as we
think reasonable.

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>

author Keir Fraser <keir.fraser@citrix.com>
date Thu Feb 11 21:08:06 2010 +0000 (2010-02-11)
parents cff23354d026
children ae2b7f1c89c8

line source
1 /****************************************************************************
2 * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
3 ****************************************************************************
4 *
5 * File: common/csched_credit.c
6 * Author: Emmanuel Ackaouy
7 *
8 * Description: Credit-based SMP CPU scheduler
9 */
11 #include <xen/config.h>
12 #include <xen/init.h>
13 #include <xen/lib.h>
14 #include <xen/sched.h>
15 #include <xen/domain.h>
16 #include <xen/delay.h>
17 #include <xen/event.h>
18 #include <xen/time.h>
19 #include <xen/perfc.h>
20 #include <xen/sched-if.h>
21 #include <xen/softirq.h>
22 #include <asm/atomic.h>
23 #include <xen/errno.h>
24 #include <xen/keyhandler.h>
26 /*
27 * CSCHED_STATS
28 *
29 * Manage very basic per-vCPU counters and stats.
30 *
31 * Useful for debugging live systems. The stats are displayed
32 * with runq dumps ('r' on the Xen console).
33 */
34 #ifdef PERF_COUNTERS
35 #define CSCHED_STATS
36 #endif
39 /*
40 * Basic constants
41 */
42 #define CSCHED_DEFAULT_WEIGHT 256
43 #define CSCHED_TICKS_PER_TSLICE 3
44 #define CSCHED_TICKS_PER_ACCT 3
45 #define CSCHED_MSECS_PER_TICK 10
46 #define CSCHED_MSECS_PER_TSLICE \
47 (CSCHED_MSECS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
48 #define CSCHED_CREDITS_PER_MSEC 10
49 #define CSCHED_CREDITS_PER_TSLICE \
50 (CSCHED_CREDITS_PER_MSEC * CSCHED_MSECS_PER_TSLICE)
51 #define CSCHED_CREDITS_PER_ACCT \
52 (CSCHED_CREDITS_PER_MSEC * CSCHED_MSECS_PER_TICK * CSCHED_TICKS_PER_ACCT)
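An illustrative aside from the editor (not part of the original file): substituting the definitions above into one another gives the effective quantities the scheduler works with. A minimal standalone sketch, with the constant values copied from the #defines:

    #include <stdio.h>

    /* Values copied from the #defines above. */
    #define CSCHED_TICKS_PER_TSLICE 3
    #define CSCHED_TICKS_PER_ACCT   3
    #define CSCHED_MSECS_PER_TICK   10
    #define CSCHED_CREDITS_PER_MSEC 10

    int main(void)
    {
        int msecs_per_tslice   = CSCHED_MSECS_PER_TICK * CSCHED_TICKS_PER_TSLICE;
        int credits_per_tslice = CSCHED_CREDITS_PER_MSEC * msecs_per_tslice;
        int credits_per_acct   = CSCHED_CREDITS_PER_MSEC * CSCHED_MSECS_PER_TICK
                                 * CSCHED_TICKS_PER_ACCT;

        /* 30 ms time slice, 300 credits per slice, and 300 credits handed out
         * per physical CPU in each accounting period. */
        printf("tslice=%dms credits/tslice=%d credits/acct=%d\n",
               msecs_per_tslice, credits_per_tslice, credits_per_acct);
        return 0;
    }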
55 /*
56 * Priorities
57 */
58 #define CSCHED_PRI_TS_BOOST 0 /* time-share waking up */
59 #define CSCHED_PRI_TS_UNDER -1 /* time-share w/ credits */
60 #define CSCHED_PRI_TS_OVER -2 /* time-share w/o credits */
61 #define CSCHED_PRI_IDLE -64 /* idle */
64 /*
65 * Flags
66 */
67 #define CSCHED_FLAG_VCPU_PARKED 0x0001 /* VCPU over capped credits */
70 /*
71 * Useful macros
72 */
73 #define CSCHED_PCPU(_c) \
74 ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
75 #define CSCHED_VCPU(_vcpu) ((struct csched_vcpu *) (_vcpu)->sched_priv)
76 #define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv)
77 #define RUNQ(_cpu) (&(CSCHED_PCPU(_cpu)->runq))
80 /*
81 * Stats
82 */
83 #define CSCHED_STAT_CRANK(_X) (perfc_incr(_X))
85 #ifdef CSCHED_STATS
87 #define CSCHED_VCPU_STATS_RESET(_V) \
88 do \
89 { \
90 memset(&(_V)->stats, 0, sizeof((_V)->stats)); \
91 } while ( 0 )
93 #define CSCHED_VCPU_STAT_CRANK(_V, _X) (((_V)->stats._X)++)
95 #define CSCHED_VCPU_STAT_SET(_V, _X, _Y) (((_V)->stats._X) = (_Y))
97 #else /* CSCHED_STATS */
99 #define CSCHED_VCPU_STATS_RESET(_V) do {} while ( 0 )
100 #define CSCHED_VCPU_STAT_CRANK(_V, _X) do {} while ( 0 )
101 #define CSCHED_VCPU_STAT_SET(_V, _X, _Y) do {} while ( 0 )
103 #endif /* CSCHED_STATS */
106 /*
107 * Physical CPU
108 */
109 struct csched_pcpu {
110 struct list_head runq;
111 uint32_t runq_sort_last;
112 struct timer ticker;
113 unsigned int tick;
114 unsigned int idle_bias;
115 };
117 /*
118 * Virtual CPU
119 */
120 struct csched_vcpu {
121 struct list_head runq_elem;
122 struct list_head active_vcpu_elem;
123 struct csched_dom *sdom;
124 struct vcpu *vcpu;
125 atomic_t credit;
126 s_time_t start_time; /* When we were scheduled (used for credit) */
127 uint16_t flags;
128 int16_t pri;
129 #ifdef CSCHED_STATS
130 struct {
131 int credit_last;
132 uint32_t credit_incr;
133 uint32_t state_active;
134 uint32_t state_idle;
135 uint32_t migrate_q;
136 uint32_t migrate_r;
137 } stats;
138 #endif
139 };
141 /*
142 * Domain
143 */
144 struct csched_dom {
145 struct list_head active_vcpu;
146 struct list_head active_sdom_elem;
147 struct domain *dom;
148 uint16_t active_vcpu_count;
149 uint16_t weight;
150 uint16_t cap;
151 };
153 /*
154 * System-wide private data
155 */
156 struct csched_private {
157 spinlock_t lock;
158 struct list_head active_sdom;
159 uint32_t ncpus;
160 struct timer master_ticker;
161 unsigned int master;
162 cpumask_t idlers;
163 uint32_t weight;
164 uint32_t credit;
165 int credit_balance;
166 uint32_t runq_sort;
167 };
170 /*
171 * Global variables
172 */
173 static struct csched_private csched_priv;
175 static void csched_tick(void *_cpu);
177 static inline int
178 __vcpu_on_runq(struct csched_vcpu *svc)
179 {
180 return !list_empty(&svc->runq_elem);
181 }
183 static inline struct csched_vcpu *
184 __runq_elem(struct list_head *elem)
185 {
186 return list_entry(elem, struct csched_vcpu, runq_elem);
187 }
189 static inline void
190 __runq_insert(unsigned int cpu, struct csched_vcpu *svc)
191 {
192 const struct list_head * const runq = RUNQ(cpu);
193 struct list_head *iter;
195 BUG_ON( __vcpu_on_runq(svc) );
196 BUG_ON( cpu != svc->vcpu->processor );
198 list_for_each( iter, runq )
199 {
200 const struct csched_vcpu * const iter_svc = __runq_elem(iter);
201 if ( svc->pri > iter_svc->pri )
202 break;
203 }
205 list_add_tail(&svc->runq_elem, iter);
206 }
208 static inline void
209 __runq_remove(struct csched_vcpu *svc)
210 {
211 BUG_ON( !__vcpu_on_runq(svc) );
212 list_del_init(&svc->runq_elem);
213 }
215 static void burn_credits(struct csched_vcpu *svc, s_time_t now)
216 {
217 s_time_t delta;
218 unsigned int credits;
220 /* Assert svc is current */
221 ASSERT(svc==CSCHED_VCPU(per_cpu(schedule_data, svc->vcpu->processor).curr));
223 if ( (delta = now - svc->start_time) <= 0 )
224 return;
226 credits = (delta*CSCHED_CREDITS_PER_MSEC + MILLISECS(1)/2) / MILLISECS(1);
227 atomic_sub(credits, &svc->credit);
228 svc->start_time += (credits * MILLISECS(1)) / CSCHED_CREDITS_PER_MSEC;
229 }
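An illustrative aside from the editor (not part of the original file): burn_credits() charges one credit per 100 microseconds of run time (CSCHED_CREDITS_PER_MSEC = 10), rounds to the nearest credit, and advances start_time only by the time actually charged, so the remainder carries into the next call. A minimal sketch of the same arithmetic with a hypothetical delta and MILLISECS() assumed to be nanoseconds, as for Xen's s_time_t:

    #include <stdio.h>
    #include <stdint.h>

    #define MILLISECS(ms)           ((int64_t)(ms) * 1000000LL)  /* assumed ns */
    #define CSCHED_CREDITS_PER_MSEC 10

    int main(void)
    {
        int64_t delta = 2340000;    /* hypothetical: 2.34 ms since scheduled */

        /* Same rounding as burn_credits(): nearest credit, 1 credit = 100 us. */
        int64_t credits = (delta * CSCHED_CREDITS_PER_MSEC + MILLISECS(1) / 2)
                          / MILLISECS(1);
        int64_t charged = credits * MILLISECS(1) / CSCHED_CREDITS_PER_MSEC;

        /* credits = 23, charged = 2.3 ms; the remaining 40 us stays in
         * (now - start_time) and is accounted on the next call. */
        printf("credits=%lld charged_ns=%lld\n",
               (long long)credits, (long long)charged);
        return 0;
    }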
231 static inline void
232 __runq_tickle(unsigned int cpu, struct csched_vcpu *new)
233 {
234 struct csched_vcpu * const cur =
235 CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
236 cpumask_t mask;
238 ASSERT(cur);
239 cpus_clear(mask);
241 /* If strictly higher priority than current VCPU, signal the CPU */
242 if ( new->pri > cur->pri )
243 {
244 if ( cur->pri == CSCHED_PRI_IDLE )
245 CSCHED_STAT_CRANK(tickle_local_idler);
246 else if ( cur->pri == CSCHED_PRI_TS_OVER )
247 CSCHED_STAT_CRANK(tickle_local_over);
248 else if ( cur->pri == CSCHED_PRI_TS_UNDER )
249 CSCHED_STAT_CRANK(tickle_local_under);
250 else
251 CSCHED_STAT_CRANK(tickle_local_other);
253 cpu_set(cpu, mask);
254 }
256 /*
257 * If this CPU has at least two runnable VCPUs, we tickle any idlers to
258 * let them know there is runnable work in the system...
259 */
260 if ( cur->pri > CSCHED_PRI_IDLE )
261 {
262 if ( cpus_empty(csched_priv.idlers) )
263 {
264 CSCHED_STAT_CRANK(tickle_idlers_none);
265 }
266 else
267 {
268 CSCHED_STAT_CRANK(tickle_idlers_some);
269 cpus_or(mask, mask, csched_priv.idlers);
270 cpus_and(mask, mask, new->vcpu->cpu_affinity);
271 }
272 }
274 /* Send scheduler interrupts to designated CPUs */
275 if ( !cpus_empty(mask) )
276 cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
277 }
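An illustrative aside from the editor (not part of the original file): the mask built above is the set of CPUs that receive a SCHEDULE_SOFTIRQ; the local CPU is added when the waking vCPU strictly beats the one running there, and idlers within the vCPU's affinity are added when the local CPU stays busy. A minimal sketch with plain bitmasks on a hypothetical 4-CPU host:

    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical host: one bit per CPU, cpu0 is the local CPU. */
        unsigned int idlers   = 0x0C;   /* cpu2 and cpu3 are idling          */
        unsigned int affinity = 0x09;   /* waking vCPU may run on cpu0, cpu3 */
        unsigned int mask     = 0;
        int new_pri  = 0;               /* BOOST                             */
        int cur_pri  = -1;              /* UNDER, running on cpu0            */
        int idle_pri = -64;

        if ( new_pri > cur_pri )
            mask |= 1u << 0;            /* signal the local CPU              */

        if ( cur_pri > idle_pri )       /* cpu0 stays busy, so also poke     */
        {                               /* idlers the vCPU is allowed on     */
            mask |= idlers;
            mask &= affinity;
        }

        printf("mask=0x%x\n", mask);    /* 0x09: softirq to cpu0 and cpu3    */
        return 0;
    }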
279 static int
280 csched_pcpu_init(int cpu)
281 {
282 struct csched_pcpu *spc;
283 unsigned long flags;
285 /* Allocate per-PCPU info */
286 spc = xmalloc(struct csched_pcpu);
287 if ( spc == NULL )
288 return -1;
289 memset(spc, 0, sizeof(*spc));
291 spin_lock_irqsave(&csched_priv.lock, flags);
293 /* Initialize/update system-wide config */
294 csched_priv.credit += CSCHED_CREDITS_PER_ACCT;
295 if ( csched_priv.ncpus <= cpu )
296 csched_priv.ncpus = cpu + 1;
297 if ( csched_priv.master >= csched_priv.ncpus )
298 csched_priv.master = cpu;
300 init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu);
301 INIT_LIST_HEAD(&spc->runq);
302 spc->runq_sort_last = csched_priv.runq_sort;
303 spc->idle_bias = NR_CPUS - 1;
304 per_cpu(schedule_data, cpu).sched_priv = spc;
306 /* Start off idling... */
307 BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
308 cpu_set(cpu, csched_priv.idlers);
310 spin_unlock_irqrestore(&csched_priv.lock, flags);
312 return 0;
313 }
315 #ifndef NDEBUG
316 static inline void
317 __csched_vcpu_check(struct vcpu *vc)
318 {
319 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
320 struct csched_dom * const sdom = svc->sdom;
322 BUG_ON( svc->vcpu != vc );
323 BUG_ON( sdom != CSCHED_DOM(vc->domain) );
324 if ( sdom )
325 {
326 BUG_ON( is_idle_vcpu(vc) );
327 BUG_ON( sdom->dom != vc->domain );
328 }
329 else
330 {
331 BUG_ON( !is_idle_vcpu(vc) );
332 }
334 CSCHED_STAT_CRANK(vcpu_check);
335 }
336 #define CSCHED_VCPU_CHECK(_vc) (__csched_vcpu_check(_vc))
337 #else
338 #define CSCHED_VCPU_CHECK(_vc)
339 #endif
341 /*
342 * Delay, in microseconds, between migrations of a VCPU between PCPUs.
343 * This prevents rapid fluttering of a VCPU between CPUs, and reduces the
344 * implicit overheads such as cache-warming. 1ms (1000) has been measured
345 * as a good value.
346 */
347 static unsigned int vcpu_migration_delay;
348 integer_param("vcpu_migration_delay", vcpu_migration_delay);
350 void set_vcpu_migration_delay(unsigned int delay)
351 {
352 vcpu_migration_delay = delay;
353 }
355 unsigned int get_vcpu_migration_delay(void)
356 {
357 return vcpu_migration_delay;
358 }
360 static inline int
361 __csched_vcpu_is_cache_hot(struct vcpu *v)
362 {
363 int hot = ((NOW() - v->last_run_time) <
364 ((uint64_t)vcpu_migration_delay * 1000u));
366 if ( hot )
367 CSCHED_STAT_CRANK(vcpu_hot);
369 return hot;
370 }
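An illustrative aside from the editor (not part of the original file): vcpu_migration_delay is given in microseconds and defaults to 0, so by default nothing is ever considered cache-hot; with a hypothetical "vcpu_migration_delay=1000" boot option, a vCPU that last ran less than 1 ms ago is skipped by the load balancer. A minimal sketch of the comparison, with NOW() and last_run_time assumed to be nanosecond timestamps and hypothetical values:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t now           = 5000000000ULL;   /* hypothetical NOW(), ns  */
        uint64_t last_run_time = 4999400000ULL;   /* ran 600 us ago          */
        unsigned int vcpu_migration_delay = 1000; /* boot option, in us      */

        /* Same test as __csched_vcpu_is_cache_hot(). */
        int hot = (now - last_run_time) <
                  ((uint64_t)vcpu_migration_delay * 1000u);

        printf("hot=%d\n", hot);                  /* 600 us < 1000 us => 1   */
        return 0;
    }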
372 static inline int
373 __csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
374 {
375 /*
376 * Don't pick up work that's in the peer's scheduling tail or hot on
377 * peer PCPU. Only pick up work that's allowed to run on our CPU.
378 */
379 return !vc->is_running &&
380 !__csched_vcpu_is_cache_hot(vc) &&
381 cpu_isset(dest_cpu, vc->cpu_affinity);
382 }
384 static int
385 _csched_cpu_pick(struct vcpu *vc, bool_t commit)
386 {
387 cpumask_t cpus;
388 cpumask_t idlers;
389 int cpu;
391 /*
392 * Pick from online CPUs in VCPU's affinity mask, giving a
393 * preference to its current processor if it's in there.
394 */
395 cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
396 cpu = cpu_isset(vc->processor, cpus)
397 ? vc->processor
398 : cycle_cpu(vc->processor, cpus);
399 ASSERT( !cpus_empty(cpus) && cpu_isset(cpu, cpus) );
401 /*
402 * Try to find an idle processor within the above constraints.
403 *
404 * In multi-core and multi-threaded CPUs, not all idle execution
405 * vehicles are equal!
406 *
407 * We give preference to the idle execution vehicle with the most
408 * idling neighbours in its grouping. This distributes work across
409 * distinct cores first and guarantees we don't do something stupid
410 * like run two VCPUs on co-hyperthreads while there are idle cores
411 * or sockets.
412 */
413 idlers = csched_priv.idlers;
414 cpu_set(cpu, idlers);
415 cpus_and(cpus, cpus, idlers);
416 cpu_clear(cpu, cpus);
418 while ( !cpus_empty(cpus) )
419 {
420 cpumask_t cpu_idlers;
421 cpumask_t nxt_idlers;
422 int nxt, weight_cpu, weight_nxt;
424 nxt = cycle_cpu(cpu, cpus);
426 if ( cpu_isset(cpu, per_cpu(cpu_core_map, nxt)) )
427 {
428 ASSERT( cpu_isset(nxt, per_cpu(cpu_core_map, cpu)) );
429 cpus_and(cpu_idlers, idlers, per_cpu(cpu_sibling_map, cpu));
430 cpus_and(nxt_idlers, idlers, per_cpu(cpu_sibling_map, nxt));
431 }
432 else
433 {
434 ASSERT( !cpu_isset(nxt, per_cpu(cpu_core_map, cpu)) );
435 cpus_and(cpu_idlers, idlers, per_cpu(cpu_core_map, cpu));
436 cpus_and(nxt_idlers, idlers, per_cpu(cpu_core_map, nxt));
437 }
439 weight_cpu = cpus_weight(cpu_idlers);
440 weight_nxt = cpus_weight(nxt_idlers);
441 if ( ( (weight_cpu < weight_nxt) ^ sched_smt_power_savings )
442 && (weight_cpu != weight_nxt) )
443 {
444 cpu = cycle_cpu(CSCHED_PCPU(nxt)->idle_bias, nxt_idlers);
445 if ( commit )
446 CSCHED_PCPU(nxt)->idle_bias = cpu;
447 cpus_andnot(cpus, cpus, per_cpu(cpu_sibling_map, cpu));
448 }
449 else
450 {
451 cpus_andnot(cpus, cpus, nxt_idlers);
452 }
453 }
455 return cpu;
456 }
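An illustrative aside from the editor (not part of the original file): the loop above compares how many idlers sit in the candidate CPU's sibling/core group against the next candidate's group and, unless sched_smt_power_savings is set, steers the vCPU towards the group with more idle neighbours, so work spreads across cores before hyperthreads are shared. A minimal sketch of that comparison using plain bitmasks in place of cpumask_t, on a hypothetical two-core, four-thread box:

    #include <stdio.h>

    static int weight(unsigned int mask)   /* stand-in for cpus_weight() */
    {
        int n = 0;
        for ( ; mask; mask >>= 1 )
            n += mask & 1;
        return n;
    }

    int main(void)
    {
        unsigned int idlers = 0x0D;   /* cpu0, cpu2, cpu3 idle; cpu1 busy    */
        unsigned int core0  = 0x03;   /* cpu0 and cpu1 are sibling threads   */
        unsigned int core1  = 0x0C;   /* cpu2 and cpu3 are sibling threads   */

        int weight_cpu = weight(idlers & core0);  /* 1: only cpu0 idles here */
        int weight_nxt = weight(idlers & core1);  /* 2: the whole core idles */

        /* With sched_smt_power_savings == 0, weight_cpu < weight_nxt means
         * the pick moves towards core1, so the new vCPU does not share a
         * core with the work already running on cpu1. */
        printf("prefer core1: %d\n", weight_cpu < weight_nxt);
        return 0;
    }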
458 static int
459 csched_cpu_pick(struct vcpu *vc)
460 {
461 return _csched_cpu_pick(vc, 1);
462 }
464 static inline void
465 __csched_vcpu_acct_start(struct csched_vcpu *svc)
466 {
467 struct csched_dom * const sdom = svc->sdom;
468 unsigned long flags;
470 spin_lock_irqsave(&csched_priv.lock, flags);
472 if ( list_empty(&svc->active_vcpu_elem) )
473 {
474 CSCHED_VCPU_STAT_CRANK(svc, state_active);
475 CSCHED_STAT_CRANK(acct_vcpu_active);
477 sdom->active_vcpu_count++;
478 list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
479 if ( list_empty(&sdom->active_sdom_elem) )
480 {
481 list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
482 csched_priv.weight += sdom->weight;
483 }
484 }
486 spin_unlock_irqrestore(&csched_priv.lock, flags);
487 }
489 static inline void
490 __csched_vcpu_acct_stop_locked(struct csched_vcpu *svc)
491 {
492 struct csched_dom * const sdom = svc->sdom;
494 BUG_ON( list_empty(&svc->active_vcpu_elem) );
496 CSCHED_VCPU_STAT_CRANK(svc, state_idle);
497 CSCHED_STAT_CRANK(acct_vcpu_idle);
499 sdom->active_vcpu_count--;
500 list_del_init(&svc->active_vcpu_elem);
501 if ( list_empty(&sdom->active_vcpu) )
502 {
503 BUG_ON( csched_priv.weight < sdom->weight );
504 list_del_init(&sdom->active_sdom_elem);
505 csched_priv.weight -= sdom->weight;
506 }
507 }
509 static void
510 csched_vcpu_acct(unsigned int cpu)
511 {
512 struct csched_vcpu * const svc = CSCHED_VCPU(current);
514 ASSERT( current->processor == cpu );
515 ASSERT( svc->sdom != NULL );
517 /*
518 * If this VCPU's priority was boosted when it last awoke, reset it.
519 If the VCPU is found here, then it's consuming a non-negligible
520 * amount of CPU resources and should no longer be boosted.
521 */
522 if ( svc->pri == CSCHED_PRI_TS_BOOST )
523 svc->pri = CSCHED_PRI_TS_UNDER;
525 /*
526 * Update credits
527 */
528 if ( !is_idle_vcpu(svc->vcpu) )
529 burn_credits(svc, NOW());
531 /*
532 * Put this VCPU and domain back on the active list if it was
533 * idling.
534 *
535 * If it's been active a while, check if we'd be better off
536 * migrating it to run elsewhere (see multi-core and multi-thread
537 * support in csched_cpu_pick()).
538 */
539 if ( list_empty(&svc->active_vcpu_elem) )
540 {
541 __csched_vcpu_acct_start(svc);
542 }
543 else if ( _csched_cpu_pick(current, 0) != cpu )
544 {
545 CSCHED_VCPU_STAT_CRANK(svc, migrate_r);
546 CSCHED_STAT_CRANK(migrate_running);
547 set_bit(_VPF_migrating, &current->pause_flags);
548 cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
549 }
550 }
552 static int
553 csched_vcpu_init(struct vcpu *vc)
554 {
555 struct domain * const dom = vc->domain;
556 struct csched_dom *sdom = CSCHED_DOM(dom);
557 struct csched_vcpu *svc;
559 CSCHED_STAT_CRANK(vcpu_init);
561 /* Allocate per-VCPU info */
562 svc = xmalloc(struct csched_vcpu);
563 if ( svc == NULL )
564 return -1;
565 memset(svc, 0, sizeof(*svc));
567 INIT_LIST_HEAD(&svc->runq_elem);
568 INIT_LIST_HEAD(&svc->active_vcpu_elem);
569 svc->sdom = sdom;
570 svc->vcpu = vc;
571 atomic_set(&svc->credit, 0);
572 svc->flags = 0U;
573 svc->pri = is_idle_domain(dom) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
574 CSCHED_VCPU_STATS_RESET(svc);
575 vc->sched_priv = svc;
577 /* Allocate per-PCPU info */
578 if ( unlikely(!CSCHED_PCPU(vc->processor)) )
579 {
580 if ( csched_pcpu_init(vc->processor) != 0 )
581 return -1;
582 }
584 CSCHED_VCPU_CHECK(vc);
585 return 0;
586 }
588 static void
589 csched_vcpu_destroy(struct vcpu *vc)
590 {
591 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
592 struct csched_dom * const sdom = svc->sdom;
593 unsigned long flags;
595 CSCHED_STAT_CRANK(vcpu_destroy);
597 BUG_ON( sdom == NULL );
598 BUG_ON( !list_empty(&svc->runq_elem) );
600 spin_lock_irqsave(&csched_priv.lock, flags);
602 if ( !list_empty(&svc->active_vcpu_elem) )
603 __csched_vcpu_acct_stop_locked(svc);
605 spin_unlock_irqrestore(&csched_priv.lock, flags);
607 xfree(svc);
608 }
610 static void
611 csched_vcpu_sleep(struct vcpu *vc)
612 {
613 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
615 CSCHED_STAT_CRANK(vcpu_sleep);
617 BUG_ON( is_idle_vcpu(vc) );
619 if ( per_cpu(schedule_data, vc->processor).curr == vc )
620 cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
621 else if ( __vcpu_on_runq(svc) )
622 __runq_remove(svc);
623 }
625 static void
626 csched_vcpu_wake(struct vcpu *vc)
627 {
628 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
629 const unsigned int cpu = vc->processor;
631 BUG_ON( is_idle_vcpu(vc) );
633 if ( unlikely(per_cpu(schedule_data, cpu).curr == vc) )
634 {
635 CSCHED_STAT_CRANK(vcpu_wake_running);
636 return;
637 }
638 if ( unlikely(__vcpu_on_runq(svc)) )
639 {
640 CSCHED_STAT_CRANK(vcpu_wake_onrunq);
641 return;
642 }
644 if ( likely(vcpu_runnable(vc)) )
645 CSCHED_STAT_CRANK(vcpu_wake_runnable);
646 else
647 CSCHED_STAT_CRANK(vcpu_wake_not_runnable);
649 /*
650 We temporarily boost the priority of waking VCPUs!
651 *
652 If this VCPU consumes a non-negligible amount of CPU, it
653 * will eventually find itself in the credit accounting code
654 * path where its priority will be reset to normal.
655 *
656 * If on the other hand the VCPU consumes little CPU and is
657 * blocking and awoken a lot (doing I/O for example), its
658 priority will remain boosted, optimizing its wake-to-run
659 * latencies.
660 *
661 * This allows wake-to-run latency sensitive VCPUs to preempt
662 * more CPU resource intensive VCPUs without impacting overall
663 * system fairness.
664 *
665 * The one exception is for VCPUs of capped domains unpausing
666 * after earning credits they had overspent. We don't boost
667 * those.
668 */
669 if ( svc->pri == CSCHED_PRI_TS_UNDER &&
670 !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
671 {
672 svc->pri = CSCHED_PRI_TS_BOOST;
673 }
675 /* Put the VCPU on the runq and tickle CPUs */
676 __runq_insert(cpu, svc);
677 __runq_tickle(cpu, svc);
678 }
680 static int
681 csched_dom_cntl(
682 struct domain *d,
683 struct xen_domctl_scheduler_op *op)
684 {
685 struct csched_dom * const sdom = CSCHED_DOM(d);
686 unsigned long flags;
688 if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
689 {
690 op->u.credit.weight = sdom->weight;
691 op->u.credit.cap = sdom->cap;
692 }
693 else
694 {
695 ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);
697 spin_lock_irqsave(&csched_priv.lock, flags);
699 if ( op->u.credit.weight != 0 )
700 {
701 if ( !list_empty(&sdom->active_sdom_elem) )
702 {
703 csched_priv.weight -= sdom->weight;
704 csched_priv.weight += op->u.credit.weight;
705 }
706 sdom->weight = op->u.credit.weight;
707 }
709 if ( op->u.credit.cap != (uint16_t)~0U )
710 sdom->cap = op->u.credit.cap;
712 spin_unlock_irqrestore(&csched_priv.lock, flags);
713 }
715 return 0;
716 }
718 static int
719 csched_dom_init(struct domain *dom)
720 {
721 struct csched_dom *sdom;
723 CSCHED_STAT_CRANK(dom_init);
725 if ( is_idle_domain(dom) )
726 return 0;
728 sdom = xmalloc(struct csched_dom);
729 if ( sdom == NULL )
730 return -ENOMEM;
731 memset(sdom, 0, sizeof(*sdom));
733 /* Initialize credit and weight */
734 INIT_LIST_HEAD(&sdom->active_vcpu);
735 sdom->active_vcpu_count = 0;
736 INIT_LIST_HEAD(&sdom->active_sdom_elem);
737 sdom->dom = dom;
738 sdom->weight = CSCHED_DEFAULT_WEIGHT;
739 sdom->cap = 0U;
740 dom->sched_priv = sdom;
742 return 0;
743 }
745 static void
746 csched_dom_destroy(struct domain *dom)
747 {
748 CSCHED_STAT_CRANK(dom_destroy);
749 xfree(CSCHED_DOM(dom));
750 }
752 /*
753 This is an O(n) optimized sort of the runq.
754 *
755 * Time-share VCPUs can only be one of two priorities, UNDER or OVER. We walk
756 through the runq and move up any UNDERs that are preceded by OVERs. We
757 * remember the last UNDER to make the move up operation O(1).
758 */
759 static void
760 csched_runq_sort(unsigned int cpu)
761 {
762 struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
763 struct list_head *runq, *elem, *next, *last_under;
764 struct csched_vcpu *svc_elem;
765 unsigned long flags;
766 int sort_epoch;
768 sort_epoch = csched_priv.runq_sort;
769 if ( sort_epoch == spc->runq_sort_last )
770 return;
772 spc->runq_sort_last = sort_epoch;
774 spin_lock_irqsave(&per_cpu(schedule_data, cpu).schedule_lock, flags);
776 runq = &spc->runq;
777 elem = runq->next;
778 last_under = runq;
780 while ( elem != runq )
781 {
782 next = elem->next;
783 svc_elem = __runq_elem(elem);
785 if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER )
786 {
787 /* does elem need to move up the runq? */
788 if ( elem->prev != last_under )
789 {
790 list_del(elem);
791 list_add(elem, last_under);
792 }
793 last_under = elem;
794 }
796 elem = next;
797 }
799 spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
800 }
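An illustrative aside from the editor (not part of the original file): the pass above is a stable partition; each UNDER (or BOOST) entry hops over the OVER entries in front of it, while the relative order inside each priority class is preserved. In the list version the hop is an O(1) list_del()/list_add() after last_under; the sketch below shows the same effect on a plain character array, 'U' for UNDER and 'O' for OVER, with a hypothetical runq:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char runq[] = "UOUOU";          /* hypothetical runq, head first */
        int last_under = -1;            /* index of the last UNDER seen  */
        int i, j;

        for ( i = 0; i < (int)strlen(runq); i++ )
        {
            if ( runq[i] != 'U' )
                continue;
            if ( i != last_under + 1 )  /* preceded by OVERs: move it up */
            {
                char u = runq[i];
                for ( j = i; j > last_under + 1; j-- )
                    runq[j] = runq[j - 1];   /* array shift; the list    */
                runq[last_under + 1] = u;    /* does this in O(1)        */
            }
            last_under++;
        }

        printf("%s\n", runq);           /* "UUUOO": UNDERs first, order kept */
        return 0;
    }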
802 static void
803 csched_acct(void* dummy)
804 {
805 unsigned long flags;
806 struct list_head *iter_vcpu, *next_vcpu;
807 struct list_head *iter_sdom, *next_sdom;
808 struct csched_vcpu *svc;
809 struct csched_dom *sdom;
810 uint32_t credit_total;
811 uint32_t weight_total;
812 uint32_t weight_left;
813 uint32_t credit_fair;
814 uint32_t credit_peak;
815 uint32_t credit_cap;
816 int credit_balance;
817 int credit_xtra;
818 int credit;
821 spin_lock_irqsave(&csched_priv.lock, flags);
823 weight_total = csched_priv.weight;
824 credit_total = csched_priv.credit;
826 /* Converge balance towards 0 when it drops negative */
827 if ( csched_priv.credit_balance < 0 )
828 {
829 credit_total -= csched_priv.credit_balance;
830 CSCHED_STAT_CRANK(acct_balance);
831 }
833 if ( unlikely(weight_total == 0) )
834 {
835 csched_priv.credit_balance = 0;
836 spin_unlock_irqrestore(&csched_priv.lock, flags);
837 CSCHED_STAT_CRANK(acct_no_work);
838 goto out;
839 }
841 CSCHED_STAT_CRANK(acct_run);
843 weight_left = weight_total;
844 credit_balance = 0;
845 credit_xtra = 0;
846 credit_cap = 0U;
848 list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
849 {
850 sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
852 BUG_ON( is_idle_domain(sdom->dom) );
853 BUG_ON( sdom->active_vcpu_count == 0 );
854 BUG_ON( sdom->weight == 0 );
855 BUG_ON( sdom->weight > weight_left );
857 weight_left -= sdom->weight;
859 /*
860 * A domain's fair share is computed using its weight in competition
861 * with that of all other active domains.
862 *
863 * At most, a domain can use credits to run all its active VCPUs
864 * for one full accounting period. We allow a domain to earn more
865 * only when the system-wide credit balance is negative.
866 */
867 credit_peak = sdom->active_vcpu_count * CSCHED_CREDITS_PER_ACCT;
868 if ( csched_priv.credit_balance < 0 )
869 {
870 credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
871 (weight_total - 1)
872 ) / weight_total;
873 }
875 if ( sdom->cap != 0U )
876 {
877 credit_cap = ((sdom->cap * CSCHED_CREDITS_PER_ACCT) + 99) / 100;
878 if ( credit_cap < credit_peak )
879 credit_peak = credit_cap;
881 credit_cap = ( credit_cap + ( sdom->active_vcpu_count - 1 )
882 ) / sdom->active_vcpu_count;
883 }
885 credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
886 ) / weight_total;
888 if ( credit_fair < credit_peak )
889 {
890 credit_xtra = 1;
891 }
892 else
893 {
894 if ( weight_left != 0U )
895 {
896 /* Give other domains a chance at unused credits */
897 credit_total += ( ( ( credit_fair - credit_peak
898 ) * weight_total
899 ) + ( weight_left - 1 )
900 ) / weight_left;
901 }
903 if ( credit_xtra )
904 {
905 /*
906 * Lazily keep domains with extra credits at the head of
907 * the queue to give others a chance at them in future
908 * accounting periods.
909 */
910 CSCHED_STAT_CRANK(acct_reorder);
911 list_del(&sdom->active_sdom_elem);
912 list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
913 }
915 credit_fair = credit_peak;
916 }
918 /* Compute fair share per VCPU */
919 credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
920 ) / sdom->active_vcpu_count;
923 list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
924 {
925 svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
926 BUG_ON( sdom != svc->sdom );
928 /* Increment credit */
929 atomic_add(credit_fair, &svc->credit);
930 credit = atomic_read(&svc->credit);
932 /*
933 * Recompute priority or, if VCPU is idling, remove it from
934 * the active list.
935 */
936 if ( credit < 0 )
937 {
938 svc->pri = CSCHED_PRI_TS_OVER;
940 /* Park running VCPUs of capped-out domains */
941 if ( sdom->cap != 0U &&
942 credit < -credit_cap &&
943 !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
944 {
945 CSCHED_STAT_CRANK(vcpu_park);
946 vcpu_pause_nosync(svc->vcpu);
947 svc->flags |= CSCHED_FLAG_VCPU_PARKED;
948 }
950 /* Lower bound on credits */
951 if ( credit < -CSCHED_CREDITS_PER_TSLICE )
952 {
953 CSCHED_STAT_CRANK(acct_min_credit);
954 credit = -CSCHED_CREDITS_PER_TSLICE;
955 atomic_set(&svc->credit, credit);
956 }
957 }
958 else
959 {
960 svc->pri = CSCHED_PRI_TS_UNDER;
962 /* Unpark any capped domains whose credits go positive */
963 if ( svc->flags & CSCHED_FLAG_VCPU_PARKED)
964 {
965 /*
966 * It's important to unset the flag AFTER the unpause()
967 * call to make sure the VCPU's priority is not boosted
968 * if it is woken up here.
969 */
970 CSCHED_STAT_CRANK(vcpu_unpark);
971 vcpu_unpause(svc->vcpu);
972 svc->flags &= ~CSCHED_FLAG_VCPU_PARKED;
973 }
975 /* Upper bound on credits means VCPU stops earning */
976 if ( credit > CSCHED_CREDITS_PER_TSLICE )
977 {
978 __csched_vcpu_acct_stop_locked(svc);
979 credit = 0;
980 atomic_set(&svc->credit, credit);
981 }
982 }
984 CSCHED_VCPU_STAT_SET(svc, credit_last, credit);
985 CSCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair);
986 credit_balance += credit;
987 }
988 }
990 csched_priv.credit_balance = credit_balance;
992 spin_unlock_irqrestore(&csched_priv.lock, flags);
994 /* Inform each CPU that its runq needs to be sorted */
995 csched_priv.runq_sort++;
997 out:
998 set_timer( &csched_priv.master_ticker, NOW() +
999 MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT );
1000 }
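An illustrative aside from the editor (not part of the original file): a worked example of the fair-share and cap arithmetic in csched_acct() above, with two hypothetical active domains on a two-CPU host (credit_total = csched_priv.credit = 2 * CSCHED_CREDITS_PER_ACCT = 600), weights 256 and 512, one active vCPU each, and a 50% cap on the first domain:

    #include <stdio.h>

    #define CSCHED_CREDITS_PER_ACCT 300   /* copied from the #defines above */

    /* Round-up division, matching the "(a + b - 1) / b" pattern above. */
    static unsigned int div_ceil(unsigned int a, unsigned int b)
    {
        return (a + b - 1) / b;
    }

    int main(void)
    {
        unsigned int credit_total = 2 * CSCHED_CREDITS_PER_ACCT;          /* 600 */
        unsigned int weight_total = 256 + 512;                            /* 768 */

        /* Fair share is proportional to weight. */
        unsigned int fair_a = div_ceil(credit_total * 256, weight_total); /* 200 */
        unsigned int fair_b = div_ceil(credit_total * 512, weight_total); /* 400 */

        /* A cap of 50 (percent of one CPU) bounds what domain A may earn. */
        unsigned int cap_a = div_ceil(50 * CSCHED_CREDITS_PER_ACCT, 100); /* 150 */

        printf("fair_a=%u fair_b=%u cap_a=%u\n", fair_a, fair_b, cap_a);
        return 0;
    }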
1002 static void
1003 csched_tick(void *_cpu)
1004 {
1005 unsigned int cpu = (unsigned long)_cpu;
1006 struct csched_pcpu *spc = CSCHED_PCPU(cpu);
1008 spc->tick++;
1010 /*
1011 * Accounting for running VCPU
1012 */
1013 if ( !is_idle_vcpu(current) )
1014 csched_vcpu_acct(cpu);
1016 /*
1017 * Check if runq needs to be sorted
1019 * Every physical CPU resorts the runq after the accounting master has
1020 * modified priorities. This is a special O(n) sort and runs at most
1021 * once per accounting period (currently 30 milliseconds).
1022 */
1023 csched_runq_sort(cpu);
1025 set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
1026 }
1028 static struct csched_vcpu *
1029 csched_runq_steal(int peer_cpu, int cpu, int pri)
1030 {
1031 const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
1032 const struct vcpu * const peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
1033 struct csched_vcpu *speer;
1034 struct list_head *iter;
1035 struct vcpu *vc;
1037 /*
1038 Don't steal from an idle CPU's runq because that CPU is about
1039 to pick up that work itself.
1040 */
1041 if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
1042 {
1043 list_for_each( iter, &peer_pcpu->runq )
1044 {
1045 speer = __runq_elem(iter);
1047 /*
1048 * If next available VCPU here is not of strictly higher
1049 * priority than ours, this PCPU is useless to us.
1050 */
1051 if ( speer->pri <= pri )
1052 break;
1054 /* Is this VCPU runnable on our PCPU? */
1055 vc = speer->vcpu;
1056 BUG_ON( is_idle_vcpu(vc) );
1058 if (__csched_vcpu_is_migrateable(vc, cpu))
1059 {
1060 /* We got a candidate. Grab it! */
1061 CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
1062 CSCHED_STAT_CRANK(migrate_queued);
1063 __runq_remove(speer);
1064 vc->processor = cpu;
1065 return speer;
1066 }
1067 }
1068 }
1070 CSCHED_STAT_CRANK(steal_peer_idle);
1071 return NULL;
1072 }
1074 static struct csched_vcpu *
1075 csched_load_balance(int cpu, struct csched_vcpu *snext)
1076 {
1077 struct csched_vcpu *speer;
1078 cpumask_t workers;
1079 int peer_cpu;
1081 BUG_ON( cpu != snext->vcpu->processor );
1083 /* If this CPU is going offline we shouldn't steal work. */
1084 if ( unlikely(!cpu_online(cpu)) )
1085 goto out;
1087 if ( snext->pri == CSCHED_PRI_IDLE )
1088 CSCHED_STAT_CRANK(load_balance_idle);
1089 else if ( snext->pri == CSCHED_PRI_TS_OVER )
1090 CSCHED_STAT_CRANK(load_balance_over);
1091 else
1092 CSCHED_STAT_CRANK(load_balance_other);
1094 /*
1095 * Peek at non-idling CPUs in the system, starting with our
1096 * immediate neighbour.
1097 */
1098 cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
1099 cpu_clear(cpu, workers);
1100 peer_cpu = cpu;
1102 while ( !cpus_empty(workers) )
1103 {
1104 peer_cpu = cycle_cpu(peer_cpu, workers);
1105 cpu_clear(peer_cpu, workers);
1107 /*
1108 * Get ahold of the scheduler lock for this peer CPU.
1110 * Note: We don't spin on this lock but simply try it. Spinning could
1111 * cause a deadlock if the peer CPU is also load balancing and trying
1112 * to lock this CPU.
1113 */
1114 if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
1115 {
1116 CSCHED_STAT_CRANK(steal_trylock_failed);
1117 continue;
1118 }
1120 /*
1121 * Any work over there to steal?
1122 */
1123 speer = csched_runq_steal(peer_cpu, cpu, snext->pri);
1124 spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
1125 if ( speer != NULL )
1126 return speer;
1127 }
1129 out:
1130 /* Failed to find more important work elsewhere... */
1131 __runq_remove(snext);
1132 return snext;
1133 }
1135 /*
1136 * This function is in the critical path. It is designed to be simple and
1137 * fast for the common case.
1138 */
1139 static struct task_slice
1140 csched_schedule(s_time_t now)
1141 {
1142 const int cpu = smp_processor_id();
1143 struct list_head * const runq = RUNQ(cpu);
1144 struct csched_vcpu * const scurr = CSCHED_VCPU(current);
1145 struct csched_vcpu *snext;
1146 struct task_slice ret;
1148 CSCHED_STAT_CRANK(schedule);
1149 CSCHED_VCPU_CHECK(current);
1151 /* Update credits */
1152 if ( !is_idle_vcpu(scurr->vcpu) )
1153 {
1154 burn_credits(scurr, now);
1155 scurr->start_time -= now;
1156 }
1158 /*
1159 * Select next runnable local VCPU (ie top of local runq)
1160 */
1161 if ( vcpu_runnable(current) )
1162 __runq_insert(cpu, scurr);
1163 else
1164 BUG_ON( is_idle_vcpu(current) || list_empty(runq) );
1166 snext = __runq_elem(runq->next);
1168 /*
1169 * SMP Load balance:
1171 * If the next highest priority local runnable VCPU has already eaten
1172 * through its credits, look on other PCPUs to see if we have more
1173 * urgent work... If not, csched_load_balance() will return snext, but
1174 * already removed from the runq.
1175 */
1176 if ( snext->pri > CSCHED_PRI_TS_OVER )
1177 __runq_remove(snext);
1178 else
1179 snext = csched_load_balance(cpu, snext);
1181 /*
1182 * Update idlers mask if necessary. When we're idling, other CPUs
1183 * will tickle us when they get extra work.
1184 */
1185 if ( snext->pri == CSCHED_PRI_IDLE )
1186 {
1187 if ( !cpu_isset(cpu, csched_priv.idlers) )
1188 cpu_set(cpu, csched_priv.idlers);
1189 }
1190 else if ( cpu_isset(cpu, csched_priv.idlers) )
1191 {
1192 cpu_clear(cpu, csched_priv.idlers);
1193 }
1195 if ( !is_idle_vcpu(snext->vcpu) )
1196 snext->start_time += now;
1198 /*
1199 * Return task to run next...
1200 */
1201 ret.time = (is_idle_vcpu(snext->vcpu) ?
1202 -1 : MILLISECS(CSCHED_MSECS_PER_TSLICE));
1203 ret.task = snext->vcpu;
1205 CSCHED_VCPU_CHECK(ret.task);
1206 return ret;
1207 }
1209 static void
1210 csched_dump_vcpu(struct csched_vcpu *svc)
1211 {
1212 struct csched_dom * const sdom = svc->sdom;
1214 printk("[%i.%i] pri=%i flags=%x cpu=%i",
1215 svc->vcpu->domain->domain_id,
1216 svc->vcpu->vcpu_id,
1217 svc->pri,
1218 svc->flags,
1219 svc->vcpu->processor);
1221 if ( sdom )
1222 {
1223 printk(" credit=%i [w=%u]", atomic_read(&svc->credit), sdom->weight);
1224 #ifdef CSCHED_STATS
1225 printk(" (%d+%u) {a/i=%u/%u m=%u+%u}",
1226 svc->stats.credit_last,
1227 svc->stats.credit_incr,
1228 svc->stats.state_active,
1229 svc->stats.state_idle,
1230 svc->stats.migrate_q,
1231 svc->stats.migrate_r);
1232 #endif
1233 }
1235 printk("\n");
1236 }
1238 static void
1239 csched_dump_pcpu(int cpu)
1240 {
1241 struct list_head *runq, *iter;
1242 struct csched_pcpu *spc;
1243 struct csched_vcpu *svc;
1244 int loop;
1245 #define cpustr keyhandler_scratch
1247 spc = CSCHED_PCPU(cpu);
1248 runq = &spc->runq;
1250 cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_map, cpu));
1251 printk(" sort=%d, sibling=%s, ", spc->runq_sort_last, cpustr);
1252 cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_map, cpu));
1253 printk("core=%s\n", cpustr);
1255 /* current VCPU */
1256 svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
1257 if ( svc )
1258 {
1259 printk("\trun: ");
1260 csched_dump_vcpu(svc);
1261 }
1263 loop = 0;
1264 list_for_each( iter, runq )
1265 {
1266 svc = __runq_elem(iter);
1267 if ( svc )
1268 {
1269 printk("\t%3d: ", ++loop);
1270 csched_dump_vcpu(svc);
1271 }
1272 }
1273 #undef cpustr
1274 }
1276 static void
1277 csched_dump(void)
1278 {
1279 struct list_head *iter_sdom, *iter_svc;
1280 int loop;
1281 #define idlers_buf keyhandler_scratch
1283 printk("info:\n"
1284 "\tncpus = %u\n"
1285 "\tmaster = %u\n"
1286 "\tcredit = %u\n"
1287 "\tcredit balance = %d\n"
1288 "\tweight = %u\n"
1289 "\trunq_sort = %u\n"
1290 "\tdefault-weight = %d\n"
1291 "\tmsecs per tick = %dms\n"
1292 "\tcredits per msec = %d\n"
1293 "\tticks per tslice = %d\n"
1294 "\tticks per acct = %d\n"
1295 "\tmigration delay = %uus\n",
1296 csched_priv.ncpus,
1297 csched_priv.master,
1298 csched_priv.credit,
1299 csched_priv.credit_balance,
1300 csched_priv.weight,
1301 csched_priv.runq_sort,
1302 CSCHED_DEFAULT_WEIGHT,
1303 CSCHED_MSECS_PER_TICK,
1304 CSCHED_CREDITS_PER_MSEC,
1305 CSCHED_TICKS_PER_TSLICE,
1306 CSCHED_TICKS_PER_ACCT,
1307 vcpu_migration_delay);
1309 cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), csched_priv.idlers);
1310 printk("idlers: %s\n", idlers_buf);
1312 printk("active vcpus:\n");
1313 loop = 0;
1314 list_for_each( iter_sdom, &csched_priv.active_sdom )
1315 {
1316 struct csched_dom *sdom;
1317 sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
1319 list_for_each( iter_svc, &sdom->active_vcpu )
1320 {
1321 struct csched_vcpu *svc;
1322 svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem);
1324 printk("\t%3d: ", ++loop);
1325 csched_dump_vcpu(svc);
1326 }
1327 }
1328 #undef idlers_buf
1329 }
1331 static void
1332 csched_init(void)
1333 {
1334 spin_lock_init(&csched_priv.lock);
1335 INIT_LIST_HEAD(&csched_priv.active_sdom);
1336 csched_priv.ncpus = 0;
1337 csched_priv.master = UINT_MAX;
1338 cpus_clear(csched_priv.idlers);
1339 csched_priv.weight = 0U;
1340 csched_priv.credit = 0U;
1341 csched_priv.credit_balance = 0;
1342 csched_priv.runq_sort = 0U;
1343 }
1345 /* Tickers cannot be kicked until SMP subsystem is alive. */
1346 static __init int csched_start_tickers(void)
1347 {
1348 struct csched_pcpu *spc;
1349 unsigned int cpu;
1351 /* Is the credit scheduler initialised? */
1352 if ( csched_priv.ncpus == 0 )
1353 return 0;
1355 for_each_online_cpu ( cpu )
1356 {
1357 spc = CSCHED_PCPU(cpu);
1358 set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
1359 }
1361 init_timer( &csched_priv.master_ticker, csched_acct, NULL,
1362 csched_priv.master);
1364 set_timer( &csched_priv.master_ticker, NOW() +
1365 MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT );
1367 return 0;
1368 }
1369 __initcall(csched_start_tickers);
1371 static void csched_tick_suspend(void)
1372 {
1373 struct csched_pcpu *spc;
1375 spc = CSCHED_PCPU(smp_processor_id());
1377 stop_timer(&spc->ticker);
1378 }
1380 static void csched_tick_resume(void)
1381 {
1382 struct csched_pcpu *spc;
1383 uint64_t now = NOW();
1385 spc = CSCHED_PCPU(smp_processor_id());
1387 set_timer(&spc->ticker, now + MILLISECS(CSCHED_MSECS_PER_TICK)
1388 - now % MILLISECS(CSCHED_MSECS_PER_TICK) );
1389 }
1391 const struct scheduler sched_credit_def = {
1392 .name = "SMP Credit Scheduler",
1393 .opt_name = "credit",
1394 .sched_id = XEN_SCHEDULER_CREDIT,
1396 .init_domain = csched_dom_init,
1397 .destroy_domain = csched_dom_destroy,
1399 .init_vcpu = csched_vcpu_init,
1400 .destroy_vcpu = csched_vcpu_destroy,
1402 .sleep = csched_vcpu_sleep,
1403 .wake = csched_vcpu_wake,
1405 .adjust = csched_dom_cntl,
1407 .pick_cpu = csched_cpu_pick,
1408 .do_schedule = csched_schedule,
1410 .dump_cpu_state = csched_dump_pcpu,
1411 .dump_settings = csched_dump,
1412 .init = csched_init,
1414 .tick_suspend = csched_tick_suspend,
1415 .tick_resume = csched_tick_resume,
1416 };