debuggers.hg

view xen/common/sched_credit2.c @ 22906:700ac6445812

Now add KDB to the non-kdb tree
author Mukesh Rathor
date Thu Feb 03 15:42:41 2011 -0800 (2011-02-03)
parents 0133cf2a72f5
2 /****************************************************************************
3 * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd
4 ****************************************************************************
5 *
6 * File: common/csched_credit2.c
7 * Author: George Dunlap
8 *
9 * Description: Credit-based SMP CPU scheduler
10 * Based on an earlier version by Emmanuel Ackaouy.
11 */
13 #include <xen/config.h>
14 #include <xen/init.h>
15 #include <xen/lib.h>
16 #include <xen/sched.h>
17 #include <xen/domain.h>
18 #include <xen/delay.h>
19 #include <xen/event.h>
20 #include <xen/time.h>
21 #include <xen/perfc.h>
22 #include <xen/sched-if.h>
23 #include <xen/softirq.h>
24 #include <asm/atomic.h>
25 #include <xen/errno.h>
26 #include <xen/trace.h>
27 #include <xen/cpu.h>
29 #if __i386__
30 #define PRI_stime "lld"
31 #else
32 #define PRI_stime "ld"
33 #endif
35 #define d2printk(x...)
36 //#define d2printk printk
38 #define TRC_CSCHED2_TICK (TRC_SCHED_CLASS + 1)
39 #define TRC_CSCHED2_RUNQ_POS (TRC_SCHED_CLASS + 2)
40 #define TRC_CSCHED2_CREDIT_BURN (TRC_SCHED_CLASS + 3)
41 #define TRC_CSCHED2_CREDIT_ADD (TRC_SCHED_CLASS + 4)
42 #define TRC_CSCHED2_TICKLE_CHECK (TRC_SCHED_CLASS + 5)
43 #define TRC_CSCHED2_TICKLE (TRC_SCHED_CLASS + 6)
44 #define TRC_CSCHED2_CREDIT_RESET (TRC_SCHED_CLASS + 7)
45 #define TRC_CSCHED2_SCHED_TASKLET (TRC_SCHED_CLASS + 8)
46 #define TRC_CSCHED2_UPDATE_LOAD (TRC_SCHED_CLASS + 9)
47 #define TRC_CSCHED2_RUNQ_ASSIGN (TRC_SCHED_CLASS + 10)
48 #define TRC_CSCHED2_UPDATE_VCPU_LOAD (TRC_SCHED_CLASS + 11)
49 #define TRC_CSCHED2_UPDATE_RUNQ_LOAD (TRC_SCHED_CLASS + 12)
51 /*
52 * WARNING: This is still in an experimental phase. Status and work can be found at the
53 * credit2 wiki page:
54 * http://wiki.xensource.com/xenwiki/Credit2_Scheduler_Development
55 * TODO:
56 * + Immediate bug-fixes
57 * - Do per-runqueue, grab proper lock for dump debugkey
58 * + Multiple sockets
59 * - Detect cpu layout and make runqueue map, one per L2 (make_runq_map())
60 * - Simple load balancer / runqueue assignment
61 * - Runqueue load measurement
62 * - Load-based load balancer
63 * + Hyperthreading
64 * - Look for non-busy core if possible
65 * - "Discount" time run on a thread with busy siblings
66 * + Algorithm:
67 * - "Mixed work" problem: if a VM is playing audio (5%) but also burning cpu (e.g.,
68 * a flash animation in the background) can we schedule it with low enough latency
69 * so that audio doesn't skip?
70 * - Cap and reservation: How to implement with the current system?
71 * + Optimizing
72 * - Profiling, making new algorithms, making math more efficient (no long division)
73 */
75 /*
76 * Design:
77 *
78 * VMs "burn" credits based on their weight; higher weight means
79 * credits burn more slowly. The highest weight vcpu burns credits at
80 * a rate of 1 credit per nanosecond. Others burn proportionally
81 * more.
82 *
83 * vcpus are inserted into the runqueue by credit order.
84 *
85 * Credits are "reset" when the next vcpu in the runqueue is less than
86 * or equal to zero. At that point, everyone's credits are "clipped"
87 * to a small value, and a fixed credit is added to everyone.
88 *
89 * The plan is for all cores that share an L2 to share the same
90 * runqueue. At the moment, there is one global runqueue for all
91 * cores.
92 */
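/*
 * Worked example (illustrative, not from the original source): with a
 * runqueue max weight of 256, a weight-256 vcpu burns 1 credit per
 * nanosecond of runtime while a weight-128 vcpu burns 2 credits per
 * nanosecond, so for the same credit spend the heavier vcpu gets twice
 * the cpu time.  Once the vcpu at the front of the runqueue reaches zero
 * or negative credit, everyone is clipped to at most CSCHED_CARRYOVER_MAX
 * and topped up with CSCHED_CREDIT_INIT (see reset_credit() below).
 */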
94 /*
95 * Locking:
96 * - Schedule-lock is per-runqueue
97 * + Protects runqueue data, runqueue insertion, &c
98 * + Also protects updates to private sched vcpu structure
99 * + Must be grabbed using vcpu_schedule_lock_irq() to make sure vcpu->processor
100 * doesn't change under our feet.
101 * - Private data lock
102 * + Protects access to global domain list
103 * + All other private data is written at init and only read afterwards.
104 * Ordering:
105 * - We grab the private lock before the schedule lock when updating domain weight; so we
106 * must never grab private if a schedule lock is held.
107 */
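/*
 * Condensed sketch of the ordering above (illustrative only; it mirrors the
 * weight-update path in csched_dom_cntl() further down this file, and the
 * local variable names here are just placeholders):
 */
#if 0
    spin_lock_irqsave(&prv->lock, flags);      /* private lock first... */
    list_for_each( iter, &sdom->vcpu )
    {
        struct csched_vcpu *svc = list_entry(iter, struct csched_vcpu, sdom_elem);

        vcpu_schedule_lock_irq(svc->vcpu);     /* ...then the per-runqueue lock */
        /* update svc->weight and rqd->max_weight here */
        vcpu_schedule_unlock_irq(svc->vcpu);
    }
    spin_unlock_irqrestore(&prv->lock, flags);
#endif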
109 /*
110 * Basic constants
111 */
112 /* Default weight: How much a new domain starts with */
113 #define CSCHED_DEFAULT_WEIGHT 256
114 /* Min timer: Minimum length a timer will be set, to
115 * achieve efficiency */
116 #define CSCHED_MIN_TIMER MICROSECS(500)
117 /* Amount of credit VMs begin with, and are reset to.
118 * ATM, set so that highest-weight VMs can only run for 10ms
119 * before a reset event. */
120 #define CSCHED_CREDIT_INIT MILLISECS(10)
121 /* Carryover: How much "extra" credit may be carried over after
122 * a reset. */
123 #define CSCHED_CARRYOVER_MAX CSCHED_MIN_TIMER
124 /* Stickiness: Cross-L2 migration resistance. Should be less than
125 * MIN_TIMER. */
126 #define CSCHED_MIGRATE_RESIST ((opt_migrate_resist)*MICROSECS(1))
127 /* How much to "compensate" a vcpu for L2 migration */
128 #define CSCHED_MIGRATE_COMPENSATION MICROSECS(50)
129 /* Reset: Value below which credit will be reset. */
130 #define CSCHED_CREDIT_RESET 0
131 /* Max timer: Maximum time a guest can be run for. */
132 #define CSCHED_MAX_TIMER MILLISECS(2)
135 #define CSCHED_IDLE_CREDIT (-(1<<30))
137 /*
138 * Flags
139 */
140 /* CSFLAG_scheduled: Is this vcpu either running on, or context-switching off,
141 * a physical cpu?
142 * + Accessed only with runqueue lock held
143 * + Set when chosen as next in csched_schedule().
144 * + Cleared after context switch has been saved in csched_context_saved()
145 * + Checked in vcpu_wake to see if we can add to the runqueue, or if we should
146 * set CSFLAG_delayed_runq_add
147 * + Checked to be false in runq_insert.
148 */
149 #define __CSFLAG_scheduled 1
150 #define CSFLAG_scheduled (1<<__CSFLAG_scheduled)
151 /* CSFLAG_delayed_runq_add: Do we need to add this to the runqueue once it's done
152 * being context switched out?
153 * + Set when scheduling out in csched_schedule() if prev is runnable
154 * + Set in csched_vcpu_wake if it finds CSFLAG_scheduled set
155 * + Read in csched_context_saved(). If set, it adds prev to the runqueue and
156 * clears the bit.
157 */
158 #define __CSFLAG_delayed_runq_add 2
159 #define CSFLAG_delayed_runq_add (1<<__CSFLAG_delayed_runq_add)
160 /* CSFLAG_runq_migrate_request: This vcpu is being migrated as a result of a
161 * credit2-initiated runq migrate request; migrate it to the runqueue indicated
162 * in the svc struct.
163 */
164 #define __CSFLAG_runq_migrate_request 3
165 #define CSFLAG_runq_migrate_request (1<<__CSFLAG_runq_migrate_request)
168 int opt_migrate_resist=500;
169 integer_param("sched_credit2_migrate_resist", opt_migrate_resist);
171 /*
172 * Useful macros
173 */
174 #define CSCHED_PRIV(_ops) \
175 ((struct csched_private *)((_ops)->sched_data))
176 #define CSCHED_VCPU(_vcpu) ((struct csched_vcpu *) (_vcpu)->sched_priv)
177 #define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv)
178 #define CSCHED_CPUONLINE(_pool) \
179 (((_pool) == NULL) ? &cpupool_free_cpus : &(_pool)->cpu_valid)
180 /* CPU to runq_id macro */
181 #define c2r(_ops, _cpu) (CSCHED_PRIV(_ops)->runq_map[(_cpu)])
182 /* CPU to runqueue struct macro */
183 #define RQD(_ops, _cpu) (&CSCHED_PRIV(_ops)->rqd[c2r(_ops, _cpu)])
185 /*
186 * Shifts for load average.
187 * - granularity: Reduce granularity of time by a factor of 2^10 (~1000), so we can use 32-bit maths
188 * - window shift: Given granularity shift, make the window about 1 second
189 * - scale shift: Shift up load by this amount rather than using fractions; 128 corresponds
190 * to a load of 1.
191 */
192 #define LOADAVG_GRANULARITY_SHIFT (10)
193 int opt_load_window_shift=18;
194 #define LOADAVG_WINDOW_SHIFT_MIN 4
195 integer_param("credit2_load_window_shift", opt_load_window_shift);
196 int opt_underload_balance_tolerance=0;
197 integer_param("credit2_balance_under", opt_underload_balance_tolerance);
198 int opt_overload_balance_tolerance=-3;
199 integer_param("credit2_balance_over", opt_overload_balance_tolerance);
201 /*
202 * Per-runqueue data
203 */
204 struct csched_runqueue_data {
205 int id;
207 spinlock_t lock; /* Lock for this runqueue. */
208 cpumask_t active; /* CPUs enabled for this runqueue */
210 struct list_head runq; /* Ordered list of runnable vms */
211 struct list_head svc; /* List of all vcpus assigned to this runqueue */
212 int max_weight;
214 cpumask_t idle, /* Currently idle */
215 tickled; /* Cpus which have been tickled (asked to reschedule) but have not yet rescheduled */
216 int load; /* Instantaneous load: Length of queue + num non-idle threads */
217 s_time_t load_last_update; /* Last time average was updated */
218 s_time_t avgload; /* Decaying queue load */
219 s_time_t b_avgload; /* Decaying queue load modified by balancing */
220 };
222 /*
223 * System-wide private data
224 */
225 struct csched_private {
226 spinlock_t lock;
227 cpumask_t initialized; /* CPU is initialized for this pool */
229 struct list_head sdom; /* Used mostly for dump keyhandler. */
231 int runq_map[NR_CPUS];
232 cpumask_t active_queues; /* Queues which may have active cpus */
233 struct csched_runqueue_data rqd[NR_CPUS];
235 int load_window_shift;
236 };
238 /*
239 * Virtual CPU
240 */
241 struct csched_vcpu {
242 struct list_head rqd_elem; /* On the runqueue data list */
243 struct list_head sdom_elem; /* On the domain vcpu list */
244 struct list_head runq_elem; /* On the runqueue */
245 struct csched_runqueue_data *rqd; /* Up-pointer to the runqueue */
247 /* Up-pointers */
248 struct csched_dom *sdom;
249 struct vcpu *vcpu;
251 int weight;
253 int credit;
254 s_time_t start_time; /* When we were scheduled (used for credit) */
255 unsigned flags; /* 16 bits doesn't seem to play well with clear_bit() */
257 /* Individual contribution to load */
258 s_time_t load_last_update; /* Last time average was updated */
259 s_time_t avgload; /* Decaying queue load */
261 struct csched_runqueue_data *migrate_rqd; /* Pre-determined rqd to which to migrate */
262 };
264 /*
265 * Domain
266 */
267 struct csched_dom {
268 struct list_head vcpu;
269 struct list_head sdom_elem;
270 struct domain *dom;
271 uint16_t weight;
272 uint16_t nr_vcpus;
273 };
276 /*
277 * Time-to-credit, credit-to-time.
278 * FIXME: Do pre-calculated division?
279 */
280 static s_time_t t2c(struct csched_runqueue_data *rqd, s_time_t time, struct csched_vcpu *svc)
281 {
282 return time * rqd->max_weight / svc->weight;
283 }
285 static s_time_t c2t(struct csched_runqueue_data *rqd, s_time_t credit, struct csched_vcpu *svc)
286 {
287 return credit * svc->weight / rqd->max_weight;
288 }
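/*
 * Worked example (illustrative): with rqd->max_weight == 256 and
 * svc->weight == 128,
 *     t2c(rqd, MILLISECS(1), svc) == 1ms * 256 / 128 == 2ms worth of credit,
 * and c2t() is the inverse, so 2ms of credit buys this vcpu 1ms of actual
 * runtime.  A vcpu whose weight equals max_weight burns exactly 1 credit
 * per nanosecond.
 */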
290 /*
291 * Runqueue related code
292 */
294 static /*inline*/ int
295 __vcpu_on_runq(struct csched_vcpu *svc)
296 {
297 return !list_empty(&svc->runq_elem);
298 }
300 static /*inline*/ struct csched_vcpu *
301 __runq_elem(struct list_head *elem)
302 {
303 return list_entry(elem, struct csched_vcpu, runq_elem);
304 }
306 static void
307 __update_runq_load(const struct scheduler *ops,
308 struct csched_runqueue_data *rqd, int change, s_time_t now)
309 {
310 struct csched_private *prv = CSCHED_PRIV(ops);
311 s_time_t delta=-1;
313 now >>= LOADAVG_GRANULARITY_SHIFT;
315 if ( rqd->load_last_update + (1ULL<<prv->load_window_shift) < now )
316 {
317 rqd->avgload = (unsigned long long)rqd->load << prv->load_window_shift;
318 rqd->b_avgload = (unsigned long long)rqd->load << prv->load_window_shift;
319 }
320 else
321 {
322 delta = now - rqd->load_last_update;
324 rqd->avgload =
325 ( ( delta * ( (unsigned long long)rqd->load << prv->load_window_shift ) )
326 + ( ((1ULL<<prv->load_window_shift) - delta) * rqd->avgload ) ) >> prv->load_window_shift;
328 rqd->b_avgload =
329 ( ( delta * ( (unsigned long long)rqd->load << prv->load_window_shift ) )
330 + ( ((1ULL<<prv->load_window_shift) - delta) * rqd->b_avgload ) ) >> prv->load_window_shift;
331 }
332 rqd->load += change;
333 rqd->load_last_update = now;
335 {
336 struct {
337 unsigned rq_load:4, rq_avgload:28;
338 unsigned rq_id:4, b_avgload:28;
339 } d;
340 d.rq_id=rqd->id;
341 d.rq_load = rqd->load;
342 d.rq_avgload = rqd->avgload;
343 d.b_avgload = rqd->b_avgload;
344 trace_var(TRC_CSCHED2_UPDATE_RUNQ_LOAD, 1,
345 sizeof(d),
346 (unsigned char *)&d);
347 }
348 }
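/*
 * Illustrative sketch (not part of the scheduler): the update above is a
 * fixed-point exponential moving average over a (1 << load_window_shift)
 * window of granularity units:
 *
 *     avg' = ( delta * (load << shift) + (2^shift - delta) * avg ) >> shift
 *
 * For example, with shift == 18, an old average of 0, an instantaneous load
 * of 2 and delta == 2^17 (half a window), the new average is 2^18, i.e. a
 * scaled load of 1.0.  The standalone helper below restates the same
 * arithmetic; the name is hypothetical.
 */
#if 0
static s_time_t example_decaying_avg(s_time_t old_avg, int load,
                                     s_time_t delta, int shift)
{
    return ( ( delta * ( (unsigned long long)load << shift ) )
             + ( ((1ULL << shift) - delta) * old_avg ) ) >> shift;
}
#endif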
350 static void
351 __update_svc_load(const struct scheduler *ops,
352 struct csched_vcpu *svc, int change, s_time_t now)
353 {
354 struct csched_private *prv = CSCHED_PRIV(ops);
355 s_time_t delta=-1;
356 int vcpu_load;
358 if ( change == -1 )
359 vcpu_load = 1;
360 else if ( change == 1 )
361 vcpu_load = 0;
362 else
363 vcpu_load = vcpu_runnable(svc->vcpu);
365 now >>= LOADAVG_GRANULARITY_SHIFT;
367 if ( svc->load_last_update + (1ULL<<prv->load_window_shift) < now )
368 {
369 svc->avgload = (unsigned long long)vcpu_load << prv->load_window_shift;
370 }
371 else
372 {
373 delta = now - svc->load_last_update;
375 svc->avgload =
376 ( ( delta * ( (unsigned long long)vcpu_load << prv->load_window_shift ) )
377 + ( ((1ULL<<prv->load_window_shift) - delta) * svc->avgload ) ) >> prv->load_window_shift;
378 }
379 svc->load_last_update = now;
381 {
382 struct {
383 unsigned dom:16,vcpu:16;
384 unsigned v_avgload:32;
385 } d;
386 d.dom = svc->vcpu->domain->domain_id;
387 d.vcpu = svc->vcpu->vcpu_id;
388 d.v_avgload = svc->avgload;
389 trace_var(TRC_CSCHED2_UPDATE_VCPU_LOAD, 1,
390 sizeof(d),
391 (unsigned char *)&d);
392 }
393 }
395 static void
396 update_load(const struct scheduler *ops,
397 struct csched_runqueue_data *rqd,
398 struct csched_vcpu *svc, int change, s_time_t now)
399 {
400 __update_runq_load(ops, rqd, change, now);
401 if ( svc )
402 __update_svc_load(ops, svc, change, now);
403 }
405 static int
406 __runq_insert(struct list_head *runq, struct csched_vcpu *svc)
407 {
408 struct list_head *iter;
409 int pos = 0;
411 d2printk("rqi d%dv%d\n",
412 svc->vcpu->domain->domain_id,
413 svc->vcpu->vcpu_id);
415 BUG_ON(&svc->rqd->runq != runq);
416 /* Idle vcpus not allowed on the runqueue anymore */
417 BUG_ON(is_idle_vcpu(svc->vcpu));
418 BUG_ON(svc->vcpu->is_running);
419 BUG_ON(test_bit(__CSFLAG_scheduled, &svc->flags));
421 list_for_each( iter, runq )
422 {
423 struct csched_vcpu * iter_svc = __runq_elem(iter);
425 if ( svc->credit > iter_svc->credit )
426 {
427 d2printk(" p%d d%dv%d\n",
428 pos,
429 iter_svc->vcpu->domain->domain_id,
430 iter_svc->vcpu->vcpu_id);
431 break;
432 }
433 pos++;
434 }
436 list_add_tail(&svc->runq_elem, iter);
438 return pos;
439 }
441 static void
442 runq_insert(const struct scheduler *ops, unsigned int cpu, struct csched_vcpu *svc)
443 {
444 struct list_head * runq = &RQD(ops, cpu)->runq;
445 int pos = 0;
447 ASSERT( spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock) );
449 BUG_ON( __vcpu_on_runq(svc) );
450 BUG_ON( c2r(ops, cpu) != c2r(ops, svc->vcpu->processor) );
452 pos = __runq_insert(runq, svc);
454 {
455 struct {
456 unsigned dom:16,vcpu:16;
457 unsigned pos;
458 } d;
459 d.dom = svc->vcpu->domain->domain_id;
460 d.vcpu = svc->vcpu->vcpu_id;
461 d.pos = pos;
462 trace_var(TRC_CSCHED2_RUNQ_POS, 0,
463 sizeof(d),
464 (unsigned char *)&d);
465 }
467 return;
468 }
470 static inline void
471 __runq_remove(struct csched_vcpu *svc)
472 {
473 BUG_ON( !__vcpu_on_runq(svc) );
474 list_del_init(&svc->runq_elem);
475 }
477 void burn_credits(struct csched_runqueue_data *rqd, struct csched_vcpu *, s_time_t);
479 /* Check to see if the item on the runqueue is higher priority than what's
480 * currently running; if so, wake up the processor */
481 static /*inline*/ void
482 runq_tickle(const struct scheduler *ops, unsigned int cpu, struct csched_vcpu *new, s_time_t now)
483 {
484 int i, ipid=-1;
485 s_time_t lowest=(1<<30);
486 struct csched_runqueue_data *rqd = RQD(ops, cpu);
487 cpumask_t mask;
488 struct csched_vcpu * cur;
490 d2printk("rqt d%dv%d cd%dv%d\n",
491 new->vcpu->domain->domain_id,
492 new->vcpu->vcpu_id,
493 current->domain->domain_id,
494 current->vcpu_id);
496 BUG_ON(new->vcpu->processor != cpu);
497 BUG_ON(new->rqd != rqd);
499 /* Look at the cpu it's running on first */
500 cur = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
501 burn_credits(rqd, cur, now);
503 if ( cur->credit < new->credit )
504 {
505 ipid = cpu;
506 goto tickle;
507 }
509 /* Get a mask of idle, but not tickled */
510 cpus_andnot(mask, rqd->idle, rqd->tickled);
512 /* If it's not empty, choose one */
513 if ( !cpus_empty(mask) )
514 {
515 ipid=first_cpu(mask);
516 goto tickle;
517 }
519 /* Otherwise, look for the non-idle cpu with the lowest credit,
520 * skipping cpus which have been tickled but not scheduled yet */
521 cpus_andnot(mask, rqd->active, rqd->idle);
522 cpus_andnot(mask, mask, rqd->tickled);
524 for_each_cpu_mask(i, mask)
525 {
526 struct csched_vcpu * cur;
528 /* Already looked at this one above */
529 if ( i == cpu )
530 continue;
532 cur = CSCHED_VCPU(per_cpu(schedule_data, i).curr);
534 BUG_ON(is_idle_vcpu(cur->vcpu));
536 /* Update credits for current to see if we want to preempt */
537 burn_credits(rqd, cur, now);
539 if ( cur->credit < lowest )
540 {
541 ipid = i;
542 lowest = cur->credit;
543 }
545 /* TRACE */ {
546 struct {
547 unsigned dom:16,vcpu:16;
548 unsigned credit;
549 } d;
550 d.dom = cur->vcpu->domain->domain_id;
551 d.vcpu = cur->vcpu->vcpu_id;
552 d.credit = cur->credit;
553 trace_var(TRC_CSCHED2_TICKLE_CHECK, 1,
554 sizeof(d),
555 (unsigned char *)&d);
556 }
557 }
559 /* Only switch to another processor if the credit difference is greater
560 * than the migrate resistance */
561 if ( ipid == -1 || lowest + CSCHED_MIGRATE_RESIST > new->credit )
562 goto no_tickle;
564 tickle:
565 BUG_ON(ipid == -1);
567 /* TRACE */ {
568 struct {
569 unsigned cpu:8;
570 } d;
571 d.cpu = ipid;
572 trace_var(TRC_CSCHED2_TICKLE, 0,
573 sizeof(d),
574 (unsigned char *)&d);
575 }
576 cpu_set(ipid, rqd->tickled);
577 cpu_raise_softirq(ipid, SCHEDULE_SOFTIRQ);
579 no_tickle:
580 return;
581 }
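/*
 * Summary of the choice made above (illustrative, in order of preference):
 *  1. Preempt this cpu if the waking vcpu has more credit than current.
 *  2. Otherwise tickle an idle cpu that has not already been tickled.
 *  3. Otherwise tickle the cpu running the lowest-credit vcpu, but only if
 *     new->credit exceeds that credit by more than CSCHED_MIGRATE_RESIST
 *     (500us worth of credit by default), so small differences do not cause
 *     needless preemption.
 */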
583 /*
584 * Credit-related code
585 */
586 static void reset_credit(const struct scheduler *ops, int cpu, s_time_t now)
587 {
588 struct csched_runqueue_data *rqd = RQD(ops, cpu);
589 struct list_head *iter;
591 list_for_each( iter, &rqd->svc )
592 {
593 struct csched_vcpu * svc = list_entry(iter, struct csched_vcpu, rqd_elem);
595 int start_credit;
597 BUG_ON( is_idle_vcpu(svc->vcpu) );
598 BUG_ON( svc->rqd != rqd );
600 start_credit = svc->credit;
602 /* "Clip" credits to max carryover */
603 if ( svc->credit > CSCHED_CARRYOVER_MAX )
604 svc->credit = CSCHED_CARRYOVER_MAX;
605 /* And add INIT */
606 svc->credit += CSCHED_CREDIT_INIT;
607 svc->start_time = now;
609 /* TRACE */ {
610 struct {
611 unsigned dom:16,vcpu:16;
612 unsigned credit_start, credit_end;
613 } d;
614 d.dom = svc->vcpu->domain->domain_id;
615 d.vcpu = svc->vcpu->vcpu_id;
616 d.credit_start = start_credit;
617 d.credit_end = svc->credit;
618 trace_var(TRC_CSCHED2_CREDIT_RESET, 1,
619 sizeof(d),
620 (unsigned char *)&d);
621 }
622 }
624 /* No need to re-sort the runqueue, as everyone's order should be the same. */
625 }
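/*
 * Worked example (illustrative): at reset time, a vcpu still holding 700us
 * worth of credit is clipped to CSCHED_CARRYOVER_MAX (500us) and then given
 * CSCHED_CREDIT_INIT (10ms), leaving it with 10.5ms; a vcpu at -2ms is not
 * clipped (it is already below the carryover cap) and ends up with 8ms.
 */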
627 void burn_credits(struct csched_runqueue_data *rqd, struct csched_vcpu *svc, s_time_t now)
628 {
629 s_time_t delta;
631 /* Assert svc is current */
632 ASSERT(svc==CSCHED_VCPU(per_cpu(schedule_data, svc->vcpu->processor).curr));
634 if ( is_idle_vcpu(svc->vcpu) )
635 {
636 BUG_ON(svc->credit != CSCHED_IDLE_CREDIT);
637 return;
638 }
640 delta = now - svc->start_time;
642 if ( delta > 0 ) {
643 /* This will round down; should we consider rounding up...? */
644 svc->credit -= t2c(rqd, delta, svc);
645 svc->start_time = now;
647 d2printk("b d%dv%d c%d\n",
648 svc->vcpu->domain->domain_id,
649 svc->vcpu->vcpu_id,
650 svc->credit);
651 } else {
652 d2printk("%s: Time went backwards? now %"PRI_stime" start %"PRI_stime"\n",
653 __func__, now, svc->start_time);
654 }
656 /* TRACE */
657 {
658 struct {
659 unsigned dom:16,vcpu:16;
660 unsigned credit;
661 int delta;
662 } d;
663 d.dom = svc->vcpu->domain->domain_id;
664 d.vcpu = svc->vcpu->vcpu_id;
665 d.credit = svc->credit;
666 d.delta = delta;
667 trace_var(TRC_CSCHED2_CREDIT_BURN, 1,
668 sizeof(d),
669 (unsigned char *)&d);
670 }
671 }
673 /* Find the domain with the highest weight. */
674 void update_max_weight(struct csched_runqueue_data *rqd, int new_weight, int old_weight)
675 {
676 /* Try to avoid brute-force search:
677 * - If new_weight is larger, max_weight <- new_weight
678 * - If old_weight != max_weight, someone else is still max_weight
679 * (No action required)
680 * - If old_weight == max_weight, brute-force search for max weight
681 */
682 if ( new_weight > rqd->max_weight )
683 {
684 rqd->max_weight = new_weight;
685 d2printk("%s: Runqueue id %d max weight %d\n", __func__, rqd->id, rqd->max_weight);
686 }
687 else if ( old_weight == rqd->max_weight )
688 {
689 struct list_head *iter;
690 int max_weight = 1;
692 list_for_each( iter, &rqd->svc )
693 {
694 struct csched_vcpu * svc = list_entry(iter, struct csched_vcpu, rqd_elem);
696 if ( svc->weight > max_weight )
697 max_weight = svc->weight;
698 }
700 rqd->max_weight = max_weight;
701 d2printk("%s: Runqueue %d max weight %d\n", __func__, rqd->id, rqd->max_weight);
702 }
703 }
705 #ifndef NDEBUG
706 static /*inline*/ void
707 __csched_vcpu_check(struct vcpu *vc)
708 {
709 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
710 struct csched_dom * const sdom = svc->sdom;
712 BUG_ON( svc->vcpu != vc );
713 BUG_ON( sdom != CSCHED_DOM(vc->domain) );
714 if ( sdom )
715 {
716 BUG_ON( is_idle_vcpu(vc) );
717 BUG_ON( sdom->dom != vc->domain );
718 }
719 else
720 {
721 BUG_ON( !is_idle_vcpu(vc) );
722 }
723 }
724 #define CSCHED_VCPU_CHECK(_vc) (__csched_vcpu_check(_vc))
725 #else
726 #define CSCHED_VCPU_CHECK(_vc)
727 #endif
729 static void *
730 csched_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
731 {
732 struct csched_vcpu *svc;
734 /* Allocate per-VCPU info */
735 svc = xmalloc(struct csched_vcpu);
736 if ( svc == NULL )
737 return NULL;
738 memset(svc, 0, sizeof(*svc));
740 INIT_LIST_HEAD(&svc->rqd_elem);
741 INIT_LIST_HEAD(&svc->sdom_elem);
742 INIT_LIST_HEAD(&svc->runq_elem);
744 svc->sdom = dd;
745 svc->vcpu = vc;
746 svc->flags = 0U;
748 if ( ! is_idle_vcpu(vc) )
749 {
750 BUG_ON( svc->sdom == NULL );
752 svc->credit = CSCHED_CREDIT_INIT;
753 svc->weight = svc->sdom->weight;
754 /* Starting load of 50% */
755 svc->avgload = 1ULL << (CSCHED_PRIV(ops)->load_window_shift - 1);
756 svc->load_last_update = NOW();
757 }
758 else
759 {
760 BUG_ON( svc->sdom != NULL );
761 svc->credit = CSCHED_IDLE_CREDIT;
762 svc->weight = 0;
763 }
765 return svc;
766 }
768 /* Add and remove from runqueue assignment (not active run queue) */
769 static void
770 __runq_assign(struct csched_vcpu *svc, struct csched_runqueue_data *rqd)
771 {
773 svc->rqd = rqd;
774 list_add_tail(&svc->rqd_elem, &svc->rqd->svc);
776 update_max_weight(svc->rqd, svc->weight, 0);
778 /* Expected new load based on adding this vcpu */
779 rqd->b_avgload += svc->avgload;
781 /* TRACE */
782 {
783 struct {
784 unsigned dom:16,vcpu:16;
785 unsigned rqi:16;
786 } d;
787 d.dom = svc->vcpu->domain->domain_id;
788 d.vcpu = svc->vcpu->vcpu_id;
789 d.rqi=rqd->id;
790 trace_var(TRC_CSCHED2_RUNQ_ASSIGN, 1,
791 sizeof(d),
792 (unsigned char *)&d);
793 }
795 }
797 static void
798 runq_assign(const struct scheduler *ops, struct vcpu *vc)
799 {
800 struct csched_vcpu *svc = vc->sched_priv;
802 BUG_ON(svc->rqd != NULL);
804 __runq_assign(svc, RQD(ops, vc->processor));
805 }
807 static void
808 __runq_deassign(struct csched_vcpu *svc)
809 {
810 BUG_ON(__vcpu_on_runq(svc));
812 list_del_init(&svc->rqd_elem);
813 update_max_weight(svc->rqd, 0, svc->weight);
815 /* Expected new load based on removing this vcpu */
816 svc->rqd->b_avgload -= svc->avgload;
818 svc->rqd = NULL;
819 }
821 static void
822 runq_deassign(const struct scheduler *ops, struct vcpu *vc)
823 {
824 struct csched_vcpu *svc = vc->sched_priv;
826 BUG_ON(svc->rqd != RQD(ops, vc->processor));
828 __runq_deassign(svc);
829 }
831 static void
832 csched_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
833 {
834 struct csched_vcpu *svc = vc->sched_priv;
835 struct domain * const dom = vc->domain;
836 struct csched_dom * const sdom = svc->sdom;
838 printk("%s: Inserting d%dv%d\n",
839 __func__, dom->domain_id, vc->vcpu_id);
841 /* NB: On boot, idle vcpus are inserted before alloc_pdata() has
842 * been called for that cpu.
843 */
844 if ( ! is_idle_vcpu(vc) )
845 {
846 /* FIXME: Do we need the private lock here? */
847 list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
849 /* Add vcpu to runqueue of initial processor */
850 vcpu_schedule_lock_irq(vc);
852 runq_assign(ops, vc);
854 vcpu_schedule_unlock_irq(vc);
856 sdom->nr_vcpus++;
857 }
859 CSCHED_VCPU_CHECK(vc);
860 }
862 static void
863 csched_free_vdata(const struct scheduler *ops, void *priv)
864 {
865 struct csched_vcpu *svc = priv;
867 xfree(svc);
868 }
870 static void
871 csched_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
872 {
873 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
874 struct csched_dom * const sdom = svc->sdom;
876 BUG_ON( sdom == NULL );
877 BUG_ON( !list_empty(&svc->runq_elem) );
879 if ( ! is_idle_vcpu(vc) )
880 {
881 /* Remove from runqueue */
882 vcpu_schedule_lock_irq(vc);
884 runq_deassign(ops, vc);
886 vcpu_schedule_unlock_irq(vc);
888 /* Remove from sdom list. Don't need a lock for this, as it's called
889 * synchronously when nothing else can happen. */
890 list_del_init(&svc->sdom_elem);
892 svc->sdom->nr_vcpus--;
893 }
894 }
896 static void
897 csched_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
898 {
899 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
901 BUG_ON( is_idle_vcpu(vc) );
903 if ( per_cpu(schedule_data, vc->processor).curr == vc )
904 cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
905 else if ( __vcpu_on_runq(svc) )
906 {
907 BUG_ON(svc->rqd != RQD(ops, vc->processor));
908 update_load(ops, svc->rqd, svc, -1, NOW());
909 __runq_remove(svc);
910 }
911 else if ( test_bit(__CSFLAG_delayed_runq_add, &svc->flags) )
912 clear_bit(__CSFLAG_delayed_runq_add, &svc->flags);
913 }
915 static void
916 csched_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
917 {
918 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
919 s_time_t now = 0;
921 /* Schedule lock should be held at this point. */
923 d2printk("w d%dv%d\n", vc->domain->domain_id, vc->vcpu_id);
925 BUG_ON( is_idle_vcpu(vc) );
927 /* Make sure svc priority mod happens before runq check */
928 if ( unlikely(per_cpu(schedule_data, vc->processor).curr == vc) )
929 {
930 goto out;
931 }
933 if ( unlikely(__vcpu_on_runq(svc)) )
934 {
935 /* If we've boosted someone that's already on a runqueue, prioritize
936 * it and inform the cpu in question. */
937 goto out;
938 }
940 /* If the context hasn't been saved for this vcpu yet, we can't put it on
941 * another runqueue. Instead, we set a flag so that it will be put on the runqueue
942 * after the context has been saved. */
943 if ( unlikely (test_bit(__CSFLAG_scheduled, &svc->flags) ) )
944 {
945 set_bit(__CSFLAG_delayed_runq_add, &svc->flags);
946 goto out;
947 }
949 /* Add into the new runqueue if necessary */
950 if ( svc->rqd == NULL )
951 runq_assign(ops, vc);
952 else
953 BUG_ON(RQD(ops, vc->processor) != svc->rqd );
955 now = NOW();
957 update_load(ops, svc->rqd, svc, 1, now);
959 /* Put the VCPU on the runq */
960 runq_insert(ops, vc->processor, svc);
961 runq_tickle(ops, vc->processor, svc, now);
963 out:
964 d2printk("w-\n");
965 return;
966 }
968 static void
969 csched_context_saved(const struct scheduler *ops, struct vcpu *vc)
970 {
971 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
972 s_time_t now = NOW();
974 vcpu_schedule_lock_irq(vc);
976 BUG_ON( !is_idle_vcpu(vc) && svc->rqd != RQD(ops, vc->processor));
978 /* This vcpu is now eligible to be put on the runqueue again */
979 clear_bit(__CSFLAG_scheduled, &svc->flags);
981 /* If someone wants it on the runqueue, put it there. */
982 /*
983 * NB: We can get rid of CSFLAG_scheduled by checking for
984 * vc->is_running and __vcpu_on_runq(svc) here. However,
985 * since we're accessing the flags cacheline anyway,
986 * it seems a bit pointless; especially as we have plenty of
987 * bits free.
988 */
989 if ( test_and_clear_bit(__CSFLAG_delayed_runq_add, &svc->flags)
990 && likely(vcpu_runnable(vc)) )
991 {
992 BUG_ON(__vcpu_on_runq(svc));
994 runq_insert(ops, vc->processor, svc);
995 runq_tickle(ops, vc->processor, svc, now);
996 }
997 else if ( !is_idle_vcpu(vc) )
998 update_load(ops, svc->rqd, svc, -1, now);
1000 vcpu_schedule_unlock_irq(vc);
1003 #define MAX_LOAD (1ULL<<60)
1004 static int
1005 choose_cpu(const struct scheduler *ops, struct vcpu *vc)
1007 struct csched_private *prv = CSCHED_PRIV(ops);
1008 int i, min_rqi = -1, new_cpu;
1009 struct csched_vcpu *svc = CSCHED_VCPU(vc);
1010 s_time_t min_avgload;
1012 BUG_ON(cpus_empty(prv->active_queues));
1014 /* Locking:
1015 * - vc->processor is already locked
1016 * - Need to grab prv lock to make sure active runqueues don't
1017 * change
1018 * - Need to grab locks for other runqueues while checking
1019 * avgload
1020 * Locking constraint is:
1021 * - Lock prv before runqueue locks
1022 * - Trylock between runqueue locks (no ordering)
1024 * Since one of the runqueue locks is already held, we can't
1025 * just grab the prv lock. Instead, we'll have to trylock, and
1026 * do something else reasonable if we fail.
1027 */
1029 if ( !spin_trylock(&prv->lock) )
1031 if ( test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) )
1033 d2printk("d%dv%d -\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id);
1034 clear_bit(__CSFLAG_runq_migrate_request, &svc->flags);
1036 /* Leave it where it is for now. When we actually pay attention
1037 * to affinity we'll have to figure something out... */
1038 return vc->processor;
1041 /* First check to see if we're here because someone else suggested a place
1042 * for us to move. */
1043 if ( test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) )
1045 if ( unlikely(svc->migrate_rqd->id < 0) )
1047 printk("%s: Runqueue migrate aborted because target runqueue disappeared!\n",
1048 __func__);
1049 /* Fall-through to normal cpu pick */
1051 else
1053 d2printk("d%dv%d +\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id);
1054 new_cpu = first_cpu(svc->migrate_rqd->active);
1055 goto out_up;
1059 /* FIXME: Pay attention to cpu affinity */
1061 min_avgload = MAX_LOAD;
1063 /* Find the runqueue with the lowest instantaneous load */
1064 for_each_cpu_mask(i, prv->active_queues)
1066 struct csched_runqueue_data *rqd;
1067 s_time_t rqd_avgload;
1069 rqd = prv->rqd + i;
1071 /* If checking a different runqueue, grab the lock,
1072 * read the avg, and then release the lock.
1074 * If on our own runqueue, don't grab or release the lock;
1075 * but subtract our own load from the runqueue load to simulate
1076 * impartiality */
1077 if ( rqd == svc->rqd )
1079 rqd_avgload = rqd->b_avgload - svc->avgload;
1081 else if ( spin_trylock(&rqd->lock) )
1083 rqd_avgload = rqd->b_avgload;
1084 spin_unlock(&rqd->lock);
1086 else
1087 continue;
1089 if ( rqd_avgload < min_avgload )
1091 min_avgload = rqd_avgload;
1092 min_rqi=i;
1096 /* We didn't find anyone (most likely because of spinlock contention); leave it where it is */
1097 if ( min_rqi == -1 )
1098 new_cpu = vc->processor;
1099 else
1101 BUG_ON(cpus_empty(prv->rqd[min_rqi].active));
1102 new_cpu = first_cpu(prv->rqd[min_rqi].active);
1105 out_up:
1106 spin_unlock(&prv->lock);
1108 return new_cpu;
1111 static void balance_load(const struct scheduler *ops, int cpu, s_time_t now)
1113 struct csched_private *prv = CSCHED_PRIV(ops);
1114 int i, max_delta_rqi = -1;
1115 struct list_head *push_iter, *pull_iter;
1117 /* NB: Modified by consider() */
1118 s_time_t load_delta;
1119 struct csched_vcpu * best_push_svc=NULL, *best_pull_svc=NULL;
1120 /* NB: Read by consider() */
1121 struct csched_runqueue_data *lrqd;
1122 struct csched_runqueue_data *orqd;
1124 void consider(struct csched_vcpu *push_svc,
1125 struct csched_vcpu *pull_svc)
1127 s_time_t l_load, o_load, delta;
1129 l_load = lrqd->b_avgload;
1130 o_load = orqd->b_avgload;
1131 if ( push_svc )
1133 /* What happens to the load on both if we push? */
1134 l_load -= push_svc->avgload;
1135 o_load += push_svc->avgload;
1137 if ( pull_svc )
1139 /* What happens to the load on both if we pull? */
1140 l_load += pull_svc->avgload;
1141 o_load -= pull_svc->avgload;
1144 delta = l_load - o_load;
1145 if ( delta < 0 )
1146 delta = -delta;
1148 if ( delta < load_delta )
1150 load_delta = delta;
1151 best_push_svc=push_svc;
1152 best_pull_svc=pull_svc;
1156 void migrate(struct csched_vcpu *svc, struct csched_runqueue_data *trqd)
1158 if ( test_bit(__CSFLAG_scheduled, &svc->flags) )
1160 d2printk("d%dv%d %d-%d a\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id,
1161 svc->rqd->id, trqd->id);
1162 /* It's running; mark it to migrate. */
1163 svc->migrate_rqd = trqd;
1164 set_bit(_VPF_migrating, &svc->vcpu->pause_flags);
1165 set_bit(__CSFLAG_runq_migrate_request, &svc->flags);
1167 else
1169 int on_runq=0;
1170 /* It's not running; just move it */
1171 d2printk("d%dv%d %d-%d i\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id,
1172 svc->rqd->id, trqd->id);
1173 if ( __vcpu_on_runq(svc) )
1175 __runq_remove(svc);
1176 update_load(ops, svc->rqd, svc, -1, now);
1177 on_runq=1;
1179 __runq_deassign(svc);
1180 svc->vcpu->processor = first_cpu(trqd->active);
1181 __runq_assign(svc, trqd);
1182 if ( on_runq )
1184 update_load(ops, svc->rqd, svc, 1, now);
1185 runq_insert(ops, svc->vcpu->processor, svc);
1186 runq_tickle(ops, svc->vcpu->processor, svc, now);
1192 /*
1193 * Basic algorithm: Push, pull, or swap.
1194 * - Find the runqueue with the furthest load distance
1195 * - Find a pair that makes the difference the least (where one
1196 * on either side may be empty).
1197 */
1199 /* Locking:
1200 * - pcpu schedule lock should be already locked
1201 */
1202 lrqd = RQD(ops, cpu);
1204 __update_runq_load(ops, lrqd, 0, now);
1206 retry:
1207 if ( !spin_trylock(&prv->lock) )
1208 return;
1210 load_delta = 0;
1212 for_each_cpu_mask(i, prv->active_queues)
1214 s_time_t delta;
1216 orqd = prv->rqd + i;
1218 if ( orqd == lrqd
1219 || !spin_trylock(&orqd->lock) )
1220 continue;
1222 __update_runq_load(ops, orqd, 0, now);
1224 delta = lrqd->b_avgload - orqd->b_avgload;
1225 if ( delta < 0 )
1226 delta = -delta;
1228 if ( delta > load_delta )
1230 load_delta = delta;
1231 max_delta_rqi = i;
1234 spin_unlock(&orqd->lock);
1237 /* Minimize holding the big lock */
1238 spin_unlock(&prv->lock);
1239 if ( max_delta_rqi == -1 )
1240 goto out;
1243 s_time_t load_max;
1244 int cpus_max;
1247 load_max = lrqd->b_avgload;
1248 if ( orqd->b_avgload > load_max )
1249 load_max = orqd->b_avgload;
1251 cpus_max=cpus_weight(lrqd->active);
1252 if ( cpus_weight(orqd->active) > cpus_max )
1253 cpus_max = cpus_weight(orqd->active);
1255 /* If we're under 100% capacity, only shift if the load difference
1256 * is greater than 1; otherwise, shift if it is greater than 12.5% (worked example below). */
1257 if ( load_max < (1ULL<<(prv->load_window_shift))*cpus_max )
1259 if ( load_delta < (1ULL<<(prv->load_window_shift+opt_underload_balance_tolerance) ) )
1260 goto out;
1262 else
1263 if ( load_delta < (1ULL<<(prv->load_window_shift+opt_overload_balance_tolerance)) )
1264 goto out;
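/* Worked example (illustrative): with the default load_window_shift of 18,
 * credit2_balance_under=0 and credit2_balance_over=-3, the two thresholds
 * above come out to 1<<18 (a full cpu's worth of load) while under 100%
 * capacity, and 1<<15 (1/8 of a cpu, i.e. 12.5%) once overloaded. */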
1267 /* Try to grab the other runqueue lock; if it's been taken in the
1268 * meantime, try the process over again. This can't deadlock
1269 * because if it doesn't get any other rqd locks, it will simply
1270 * give up and return. */
1271 orqd = prv->rqd + max_delta_rqi;
1272 if ( !spin_trylock(&orqd->lock) )
1273 goto retry;
1275 /* Make sure the runqueue hasn't been deactivated since we released prv->lock */
1276 if ( unlikely(orqd->id < 0) )
1277 goto out_up;
1279 /* Look for "swap" which gives the best load average
1280 * FIXME: O(n^2)! */
1282 /* Reuse load delta (as we're trying to minimize it) */
1283 list_for_each( push_iter, &lrqd->svc )
1285 int inner_load_updated = 0;
1286 struct csched_vcpu * push_svc = list_entry(push_iter, struct csched_vcpu, rqd_elem);
1288 __update_svc_load(ops, push_svc, 0, now);
1290 /* Skip this one if it's already been flagged to migrate */
1291 if ( test_bit(__CSFLAG_runq_migrate_request, &push_svc->flags) )
1292 continue;
1294 list_for_each( pull_iter, &orqd->svc )
1296 struct csched_vcpu * pull_svc = list_entry(pull_iter, struct csched_vcpu, rqd_elem);
1298 if ( ! inner_load_updated )
1300 __update_svc_load(ops, pull_svc, 0, now);
1303 /* Skip this one if it's already been flagged to migrate */
1304 if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
1305 continue;
1307 consider(push_svc, pull_svc);
1310 inner_load_updated = 1;
1312 /* Consider push only */
1313 consider(push_svc, NULL);
1316 list_for_each( pull_iter, &orqd->svc )
1318 struct csched_vcpu * pull_svc = list_entry(pull_iter, struct csched_vcpu, rqd_elem);
1320 /* Skip this one if it's already been flagged to migrate */
1321 if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
1322 continue;
1324 /* Consider pull only */
1325 consider(NULL, pull_svc);
1328 /* OK, now we have some candidates; do the moving */
1329 if ( best_push_svc )
1330 migrate(best_push_svc, orqd);
1331 if ( best_pull_svc )
1332 migrate(best_pull_svc, lrqd);
1334 out_up:
1335 spin_unlock(&orqd->lock);
1337 out:
1338 return;
1341 static int
1342 csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc)
1344 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
1345 int new_cpu;
1347 /* The scheduler interface doesn't have an explicit mechanism to
1348 * involve the choosable scheduler in the migrate process, so we
1349 * infer that a change may happen by the call to cpu_pick, and
1350 * remove it from the old runqueue while the lock for the old
1351 * runqueue is held. It can't be actively waiting to run. It
1352 * will be added to the new runqueue when it next wakes.
1354 * If we want to be able to call pick() separately, we need to add
1355 * a mechanism to remove a vcpu from an old processor / runqueue
1356 * before releasing the lock. */
1357 BUG_ON(__vcpu_on_runq(svc));
1359 new_cpu = choose_cpu(ops, vc);
1360 /* If we're suggesting moving to a different runqueue, remove it
1361 * from the old runqueue while we have the lock. It will be added
1362 * to the new one when it wakes. */
1363 if ( svc->rqd != NULL
1364 && RQD(ops, new_cpu) != svc->rqd )
1365 runq_deassign(ops, vc);
1367 return new_cpu;
1370 static int
1371 csched_dom_cntl(
1372 const struct scheduler *ops,
1373 struct domain *d,
1374 struct xen_domctl_scheduler_op *op)
1376 struct csched_dom * const sdom = CSCHED_DOM(d);
1377 struct csched_private *prv = CSCHED_PRIV(ops);
1378 unsigned long flags;
1380 if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
1382 op->u.credit2.weight = sdom->weight;
1384 else
1386 ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);
1388 if ( op->u.credit2.weight != 0 )
1390 struct list_head *iter;
1391 int old_weight;
1393 /* Must hold csched_priv lock to update sdom, runq lock to
1394 * update csvcs. */
1395 spin_lock_irqsave(&prv->lock, flags);
1397 old_weight = sdom->weight;
1399 sdom->weight = op->u.credit2.weight;
1401 /* Update weights for vcpus, and max_weight for runqueues on which they reside */
1402 list_for_each ( iter, &sdom->vcpu )
1404 struct csched_vcpu *svc = list_entry(iter, struct csched_vcpu, sdom_elem);
1406 /* NB: Locking order is important here. Because we grab this lock here, we
1407 * must never lock csched_priv.lock if we're holding a runqueue
1408 * lock. */
1409 vcpu_schedule_lock_irq(svc->vcpu);
1411 BUG_ON(svc->rqd != RQD(ops, svc->vcpu->processor));
1413 svc->weight = sdom->weight;
1414 update_max_weight(svc->rqd, svc->weight, old_weight);
1416 vcpu_schedule_unlock_irq(svc->vcpu);
1419 spin_unlock_irqrestore(&prv->lock, flags);
1423 return 0;
1426 static void *
1427 csched_alloc_domdata(const struct scheduler *ops, struct domain *dom)
1429 struct csched_dom *sdom;
1430 unsigned long flags;
1432 sdom = xmalloc(struct csched_dom);
1433 if ( sdom == NULL )
1434 return NULL;
1435 memset(sdom, 0, sizeof(*sdom));
1437 /* Initialize credit and weight */
1438 INIT_LIST_HEAD(&sdom->vcpu);
1439 INIT_LIST_HEAD(&sdom->sdom_elem);
1440 sdom->dom = dom;
1441 sdom->weight = CSCHED_DEFAULT_WEIGHT;
1442 sdom->nr_vcpus = 0;
1444 spin_lock_irqsave(&CSCHED_PRIV(ops)->lock, flags);
1446 list_add_tail(&sdom->sdom_elem, &CSCHED_PRIV(ops)->sdom);
1448 spin_unlock_irqrestore(&CSCHED_PRIV(ops)->lock, flags);
1450 return (void *)sdom;
1453 static int
1454 csched_dom_init(const struct scheduler *ops, struct domain *dom)
1456 struct csched_dom *sdom;
1458 printk("%s: Initializing domain %d\n", __func__, dom->domain_id);
1460 if ( is_idle_domain(dom) )
1461 return 0;
1463 sdom = csched_alloc_domdata(ops, dom);
1464 if ( sdom == NULL )
1465 return -ENOMEM;
1467 dom->sched_priv = sdom;
1469 return 0;
1472 static void
1473 csched_free_domdata(const struct scheduler *ops, void *data)
1475 unsigned long flags;
1476 struct csched_dom *sdom = data;
1478 spin_lock_irqsave(&CSCHED_PRIV(ops)->lock, flags);
1480 list_del_init(&sdom->sdom_elem);
1482 spin_unlock_irqrestore(&CSCHED_PRIV(ops)->lock, flags);
1484 xfree(data);
1487 static void
1488 csched_dom_destroy(const struct scheduler *ops, struct domain *dom)
1490 struct csched_dom *sdom = CSCHED_DOM(dom);
1492 BUG_ON(!list_empty(&sdom->vcpu));
1494 csched_free_domdata(ops, CSCHED_DOM(dom));
1497 /* How long should we let this vcpu run for? */
1498 static s_time_t
1499 csched_runtime(const struct scheduler *ops, int cpu, struct csched_vcpu *snext)
1501 s_time_t time = CSCHED_MAX_TIMER;
1502 struct csched_runqueue_data *rqd = RQD(ops, cpu);
1503 struct list_head *runq = &rqd->runq;
1505 if ( is_idle_vcpu(snext->vcpu) )
1506 return CSCHED_MAX_TIMER;
1508 /* Basic time */
1509 time = c2t(rqd, snext->credit, snext);
1511 /* Next guy on runqueue */
1512 if ( ! list_empty(runq) )
1514 struct csched_vcpu *svc = __runq_elem(runq->next);
1515 s_time_t ntime;
1517 if ( ! is_idle_vcpu(svc->vcpu) )
1519 ntime = c2t(rqd, snext->credit - svc->credit, snext);
1521 if ( time > ntime )
1522 time = ntime;
1526 /* Check limits */
1527 if ( time < CSCHED_MIN_TIMER )
1528 time = CSCHED_MIN_TIMER;
1529 else if ( time > CSCHED_MAX_TIMER )
1530 time = CSCHED_MAX_TIMER;
1532 return time;
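/*
 * Worked example (illustrative, assuming snext's weight equals max_weight
 * so credit and time convert 1:1): if snext holds 5ms worth of credit and
 * the next vcpu on the runqueue trails it by 1ms worth, the slice is set to
 * the 1ms credit gap rather than the full 5ms, then clamped to the
 * [CSCHED_MIN_TIMER, CSCHED_MAX_TIMER] range of [500us, 2ms]; an idle vcpu
 * simply gets CSCHED_MAX_TIMER.
 */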
1535 void __dump_execstate(void *unused);
1537 /*
1538 * Find a candidate.
1539 */
1540 static struct csched_vcpu *
1541 runq_candidate(struct csched_runqueue_data *rqd,
1542 struct csched_vcpu *scurr,
1543 int cpu, s_time_t now)
1545 struct list_head *iter;
1546 struct csched_vcpu *snext = NULL;
1548 /* Default to current if runnable, idle otherwise */
1549 if ( vcpu_runnable(scurr->vcpu) )
1550 snext = scurr;
1551 else
1552 snext = CSCHED_VCPU(idle_vcpu[cpu]);
1554 list_for_each( iter, &rqd->runq )
1556 struct csched_vcpu * svc = list_entry(iter, struct csched_vcpu, runq_elem);
1558 /* If this is on a different processor, don't pull it unless
1559 * its credit is at least CSCHED_MIGRATE_RESIST higher. */
1560 if ( svc->vcpu->processor != cpu
1561 && snext->credit + CSCHED_MIGRATE_RESIST > svc->credit )
1562 continue;
1564 /* If the next one on the list has more credit than current
1565 * (or idle, if current is not runnable), choose it. */
1566 if ( svc->credit > snext->credit )
1567 snext = svc;
1569 /* In any case, if we got this far, break. */
1570 break;
1574 return snext;
1577 /*
1578 * This function is in the critical path. It is designed to be simple and
1579 * fast for the common case.
1580 */
1581 static struct task_slice
1582 csched_schedule(
1583 const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled)
1585 const int cpu = smp_processor_id();
1586 struct csched_runqueue_data *rqd;
1587 struct csched_vcpu * const scurr = CSCHED_VCPU(current);
1588 struct csched_vcpu *snext = NULL;
1589 struct task_slice ret;
1591 CSCHED_VCPU_CHECK(current);
1593 d2printk("sc p%d c d%dv%d now %"PRI_stime"\n",
1594 cpu,
1595 scurr->vcpu->domain->domain_id,
1596 scurr->vcpu->vcpu_id,
1597 now);
1599 BUG_ON(!cpu_isset(cpu, CSCHED_PRIV(ops)->initialized));
1601 rqd = RQD(ops, cpu);
1602 BUG_ON(!cpu_isset(cpu, rqd->active));
1604 /* Protected by runqueue lock */
1606 BUG_ON(!is_idle_vcpu(scurr->vcpu) && scurr->rqd != rqd);
1608 /* Clear "tickled" bit now that we've been scheduled */
1609 if ( cpu_isset(cpu, rqd->tickled) )
1610 cpu_clear(cpu, rqd->tickled);
1612 /* Update credits */
1613 burn_credits(rqd, scurr, now);
1615 /*
1616 * Select next runnable local VCPU (ie top of local runq).
1618 * If the current vcpu is runnable, and has higher credit than
1619 * the next guy on the queue (or there is no one else), we want to
1620 * run him again.
1622 * If there's tasklet work to do, we want to choose the idle vcpu
1623 * for this processor, and mark the current for delayed runqueue
1624 * add.
1626 * If the current vcpu is runnable, and there's another runnable
1627 * candidate, we want to mark current for delayed runqueue add,
1628 * and remove the next guy from the queue.
1630 * If the current vcpu is not runnable, we want to choose the idle
1631 * vcpu for this processor.
1632 */
1633 if ( tasklet_work_scheduled )
1635 trace_var(TRC_CSCHED2_SCHED_TASKLET, 0, 0, NULL);
1636 snext = CSCHED_VCPU(idle_vcpu[cpu]);
1638 else
1639 snext=runq_candidate(rqd, scurr, cpu, now);
1641 /* If switching from a non-idle runnable vcpu, put it
1642 * back on the runqueue. */
1643 if ( snext != scurr
1644 && !is_idle_vcpu(scurr->vcpu)
1645 && vcpu_runnable(current) )
1646 set_bit(__CSFLAG_delayed_runq_add, &scurr->flags);
1648 ret.migrated = 0;
1650 /* Accounting for non-idle tasks */
1651 if ( !is_idle_vcpu(snext->vcpu) )
1653 /* If switching, remove this from the runqueue and mark it scheduled */
1654 if ( snext != scurr )
1656 BUG_ON(snext->rqd != rqd);
1658 __runq_remove(snext);
1659 if ( snext->vcpu->is_running )
1661 printk("p%d: snext d%dv%d running on p%d! scurr d%dv%d\n",
1662 cpu,
1663 snext->vcpu->domain->domain_id, snext->vcpu->vcpu_id,
1664 snext->vcpu->processor,
1665 scurr->vcpu->domain->domain_id,
1666 scurr->vcpu->vcpu_id);
1667 BUG();
1669 set_bit(__CSFLAG_scheduled, &snext->flags);
1672 /* Check for the reset condition */
1673 if ( snext->credit <= CSCHED_CREDIT_RESET )
1675 reset_credit(ops, cpu, now);
1676 balance_load(ops, cpu, now);
1679 /* Clear the idle mask if necessary */
1680 if ( cpu_isset(cpu, rqd->idle) )
1681 cpu_clear(cpu, rqd->idle);
1683 snext->start_time = now;
1685 /* Safe because lock for old processor is held */
1686 if ( snext->vcpu->processor != cpu )
1688 snext->credit += CSCHED_MIGRATE_COMPENSATION;
1689 snext->vcpu->processor = cpu;
1690 ret.migrated = 1;
1693 else
1695 /* Update the idle mask if necessary */
1696 if ( !cpu_isset(cpu, rqd->idle) )
1697 cpu_set(cpu, rqd->idle);
1698 /* Make sure avgload gets updated periodically even
1699 * if there's no activity */
1700 update_load(ops, rqd, NULL, 0, now);
1703 /*
1704 * Return task to run next...
1705 */
1706 ret.time = csched_runtime(ops, cpu, snext);
1707 ret.task = snext->vcpu;
1709 CSCHED_VCPU_CHECK(ret.task);
1710 return ret;
1713 static void
1714 csched_dump_vcpu(struct csched_vcpu *svc)
1716 printk("[%i.%i] flags=%x cpu=%i",
1717 svc->vcpu->domain->domain_id,
1718 svc->vcpu->vcpu_id,
1719 svc->flags,
1720 svc->vcpu->processor);
1722 printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight);
1724 printk("\n");
1727 static void
1728 csched_dump_pcpu(const struct scheduler *ops, int cpu)
1730 struct list_head *runq, *iter;
1731 struct csched_vcpu *svc;
1732 int loop;
1733 char cpustr[100];
1735 /* FIXME: Do locking properly for access to runqueue structures */
1737 runq = &RQD(ops, cpu)->runq;
1739 cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_map,cpu));
1740 printk(" sibling=%s, ", cpustr);
1741 cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_map,cpu));
1742 printk("core=%s\n", cpustr);
1744 /* current VCPU */
1745 svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
1746 if ( svc )
1748 printk("\trun: ");
1749 csched_dump_vcpu(svc);
1752 loop = 0;
1753 list_for_each( iter, runq )
1755 svc = __runq_elem(iter);
1756 if ( svc )
1758 printk("\t%3d: ", ++loop);
1759 csched_dump_vcpu(svc);
1764 static void
1765 csched_dump(const struct scheduler *ops)
1767 struct list_head *iter_sdom, *iter_svc;
1768 struct csched_private *prv = CSCHED_PRIV(ops);
1769 int i, loop;
1771 printk("Active queues: %d\n"
1772 "\tdefault-weight = %d\n",
1773 cpus_weight(prv->active_queues),
1774 CSCHED_DEFAULT_WEIGHT);
1775 for_each_cpu_mask(i, prv->active_queues)
1777 s_time_t fraction;
1779 fraction = prv->rqd[i].avgload * 100 / (1ULL<<prv->load_window_shift);
1781 printk("Runqueue %d:\n"
1782 "\tncpus = %u\n"
1783 "\tmax_weight = %d\n"
1784 "\tinstload = %d\n"
1785 "\taveload = %3"PRI_stime"\n",
1786 i,
1787 cpus_weight(prv->rqd[i].active),
1788 prv->rqd[i].max_weight,
1789 prv->rqd[i].load,
1790 fraction);
1793 /* FIXME: Locking! */
1795 printk("Domain info:\n");
1796 loop = 0;
1797 list_for_each( iter_sdom, &prv->sdom )
1799 struct csched_dom *sdom;
1800 sdom = list_entry(iter_sdom, struct csched_dom, sdom_elem);
1802 printk("\tDomain: %d w %d v %d\n\t",
1803 sdom->dom->domain_id,
1804 sdom->weight,
1805 sdom->nr_vcpus);
1807 list_for_each( iter_svc, &sdom->vcpu )
1809 struct csched_vcpu *svc;
1810 svc = list_entry(iter_svc, struct csched_vcpu, sdom_elem);
1812 printk("\t%3d: ", ++loop);
1813 csched_dump_vcpu(svc);
1818 static void activate_runqueue(struct csched_private *prv, int rqi)
1820 struct csched_runqueue_data *rqd;
1822 rqd = prv->rqd + rqi;
1824 BUG_ON(!cpus_empty(rqd->active));
1826 rqd->max_weight = 1;
1827 rqd->id = rqi;
1828 INIT_LIST_HEAD(&rqd->svc);
1829 INIT_LIST_HEAD(&rqd->runq);
1830 spin_lock_init(&rqd->lock);
1832 cpu_set(rqi, prv->active_queues);
1835 static void deactivate_runqueue(struct csched_private *prv, int rqi)
1837 struct csched_runqueue_data *rqd;
1839 rqd = prv->rqd + rqi;
1841 BUG_ON(!cpus_empty(rqd->active));
1843 rqd->id = -1;
1845 cpu_clear(rqi, prv->active_queues);
1848 static void init_pcpu(const struct scheduler *ops, int cpu)
1850 int rqi, old_rqi; unsigned long flags;
1851 struct csched_private *prv = CSCHED_PRIV(ops);
1852 struct csched_runqueue_data *rqd;
1853 spinlock_t *old_lock;
1855 spin_lock_irqsave(&prv->lock, flags);
1857 if ( cpu_isset(cpu, prv->initialized) )
1859 printk("%s: Strange, cpu %d already initialized!\n", __func__, cpu);
1860 spin_unlock_irqrestore(&prv->lock, flags);
1861 return;
1864 old_rqi = prv->runq_map[cpu];
1866 /* Figure out which runqueue to put it in */
1867 rqi = 0;
1869 /* Figure out which runqueue to put it in */
1870 /* NB: cpu 0 doesn't get a STARTING callback, so we hard-code it to runqueue 0. */
1871 if ( cpu == 0 )
1872 rqi = 0;
1873 else
1874 rqi = cpu_to_socket(cpu);
1876 if ( rqi < 0 )
1878 printk("%s: cpu_to_socket(%d) returned %d!\n",
1879 __func__, cpu, rqi);
1880 BUG();
1883 rqd=prv->rqd + rqi;
1885 printk("Adding cpu %d to runqueue %d\n", cpu, rqi);
1886 if ( ! cpu_isset(rqi, prv->active_queues) )
1888 printk(" First cpu on runqueue, activating\n");
1889 activate_runqueue(prv, rqi);
1892 /* IRQs already disabled */
1893 old_lock=pcpu_schedule_lock(cpu);
1895 /* Move spinlock to new runq lock. */
1896 per_cpu(schedule_data, cpu).schedule_lock = &rqd->lock;
1898 /* Set the runqueue map */
1899 prv->runq_map[cpu]=rqi;
1901 cpu_set(cpu, rqd->idle);
1902 cpu_set(cpu, rqd->active);
1904 spin_unlock(old_lock);
1906 cpu_set(cpu, prv->initialized);
1908 spin_unlock_irqrestore(&prv->lock, flags);
1910 return;
1913 static void *
1914 csched_alloc_pdata(const struct scheduler *ops, int cpu)
1916 /* Check to see if the cpu is online yet */
1917 /* Note: cpu 0 doesn't get a STARTING callback */
1918 if ( cpu == 0 || cpu_to_socket(cpu) >= 0 )
1919 init_pcpu(ops, cpu);
1920 else
1921 printk("%s: cpu %d not online yet, deferring initializatgion\n",
1922 __func__, cpu);
1924 return (void *)1;
1927 static void
1928 csched_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
1930 unsigned long flags;
1931 struct csched_private *prv = CSCHED_PRIV(ops);
1932 struct csched_runqueue_data *rqd;
1933 int rqi;
1935 spin_lock_irqsave(&prv->lock, flags);
1937 BUG_ON( !cpu_isset(cpu, prv->initialized));
1939 /* Find the old runqueue and remove this cpu from it */
1940 rqi = prv->runq_map[cpu];
1942 rqd = prv->rqd + rqi;
1944 /* No need to save IRQs here, they're already disabled */
1945 spin_lock(&rqd->lock);
1947 BUG_ON(!cpu_isset(cpu, rqd->idle));
1949 printk("Removing cpu %d from runqueue %d\n", cpu, rqi);
1951 cpu_clear(cpu, rqd->idle);
1952 cpu_clear(cpu, rqd->active);
1954 if ( cpus_empty(rqd->active) )
1956 printk(" No cpus left on runqueue, disabling\n");
1957 deactivate_runqueue(prv, rqi);
1960 spin_unlock(&rqd->lock);
1962 cpu_clear(cpu, prv->initialized);
1964 spin_unlock_irqrestore(&prv->lock, flags);
1966 return;
1969 static int
1970 csched_cpu_starting(int cpu)
1972 struct scheduler *ops;
1974 /* Hope this is safe from cpupools switching things around. :-) */
1975 ops = per_cpu(scheduler, cpu);
1977 init_pcpu(ops, cpu);
1979 return NOTIFY_DONE;
1982 static int cpu_credit2_callback(
1983 struct notifier_block *nfb, unsigned long action, void *hcpu)
1985 unsigned int cpu = (unsigned long)hcpu;
1986 int rc = 0;
1988 switch ( action )
1990 case CPU_STARTING:
1991 csched_cpu_starting(cpu);
1992 break;
1993 default:
1994 break;
1997 return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
2000 static struct notifier_block cpu_credit2_nfb = {
2001 .notifier_call = cpu_credit2_callback
2002 };
2004 static int
2005 csched_init(struct scheduler *ops)
2007 int i;
2008 struct csched_private *prv;
2010 printk("Initializing Credit2 scheduler\n" \
2011 " WARNING: This is experimental software in development.\n" \
2012 " Use at your own risk.\n");
2014 printk(" load_window_shift: %d\n", opt_load_window_shift);
2015 printk(" underload_balance_tolerance: %d\n", opt_underload_balance_tolerance);
2016 printk(" overload_balance_tolerance: %d\n", opt_overload_balance_tolerance);
2018 if ( opt_load_window_shift < LOADAVG_WINDOW_SHIFT_MIN )
2020 printk("%s: opt_load_window_shift %d below min %d, resetting\n",
2021 __func__, opt_load_window_shift, LOADAVG_WINDOW_SHIFT_MIN);
2022 opt_load_window_shift = LOADAVG_WINDOW_SHIFT_MIN;
2025 /* Basically no CPU information is available at this point; just
2026 * set up basic structures, and a callback when the CPU info is
2027 * available. */
2029 prv = xmalloc(struct csched_private);
2030 if ( prv == NULL )
2031 return -ENOMEM;
2032 memset(prv, 0, sizeof(*prv));
2033 ops->sched_data = prv;
2034 spin_lock_init(&prv->lock);
2035 INIT_LIST_HEAD(&prv->sdom);
2037 register_cpu_notifier(&cpu_credit2_nfb);
2039 /* But un-initialize all runqueues */
2040 for ( i=0; i<NR_CPUS; i++)
2042 prv->runq_map[i] = -1;
2043 prv->rqd[i].id = -1;
2046 prv->load_window_shift = opt_load_window_shift;
2048 return 0;
2051 static void
2052 csched_deinit(const struct scheduler *ops)
2054 struct csched_private *prv;
2056 prv = CSCHED_PRIV(ops);
2057 if ( prv != NULL )
2058 xfree(prv);
2062 static struct csched_private _csched_priv;
2064 const struct scheduler sched_credit2_def = {
2065 .name = "SMP Credit Scheduler rev2",
2066 .opt_name = "credit2",
2067 .sched_id = XEN_SCHEDULER_CREDIT2,
2068 .sched_data = &_csched_priv,
2070 .init_domain = csched_dom_init,
2071 .destroy_domain = csched_dom_destroy,
2073 .insert_vcpu = csched_vcpu_insert,
2074 .remove_vcpu = csched_vcpu_remove,
2076 .sleep = csched_vcpu_sleep,
2077 .wake = csched_vcpu_wake,
2079 .adjust = csched_dom_cntl,
2081 .pick_cpu = csched_cpu_pick,
2082 .do_schedule = csched_schedule,
2083 .context_saved = csched_context_saved,
2085 .dump_cpu_state = csched_dump_pcpu,
2086 .dump_settings = csched_dump,
2087 .init = csched_init,
2088 .deinit = csched_deinit,
2089 .alloc_vdata = csched_alloc_vdata,
2090 .free_vdata = csched_free_vdata,
2091 .alloc_pdata = csched_alloc_pdata,
2092 .free_pdata = csched_free_pdata,
2093 .alloc_domdata = csched_alloc_domdata,
2094 .free_domdata = csched_free_domdata,
2095 };