view xen/common/sched_credit2.c @ 22855:1d1eec7e1fb4

2 /****************************************************************************
3 * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd
4 ****************************************************************************
5 *
6 * File: common/sched_credit2.c
7 * Author: George Dunlap
8 *
9 * Description: Credit-based SMP CPU scheduler
10 * Based on an earlier version by Emmanuel Ackaouy.
11 */
13 #include <xen/config.h>
14 #include <xen/init.h>
15 #include <xen/lib.h>
16 #include <xen/sched.h>
17 #include <xen/domain.h>
18 #include <xen/delay.h>
19 #include <xen/event.h>
20 #include <xen/time.h>
21 #include <xen/perfc.h>
22 #include <xen/sched-if.h>
23 #include <xen/softirq.h>
24 #include <asm/atomic.h>
25 #include <xen/errno.h>
26 #include <xen/trace.h>
27 #include <xen/cpu.h>
29 #if __i386__
30 #define PRI_stime "lld"
31 #else
32 #define PRI_stime "ld"
33 #endif
35 #define d2printk(x...)
36 //#define d2printk printk
38 #define TRC_CSCHED2_TICK TRC_SCHED_CLASS + 1
39 #define TRC_CSCHED2_RUNQ_POS TRC_SCHED_CLASS + 2
40 #define TRC_CSCHED2_CREDIT_BURN TRC_SCHED_CLASS + 3
41 #define TRC_CSCHED2_CREDIT_ADD TRC_SCHED_CLASS + 4
42 #define TRC_CSCHED2_TICKLE_CHECK TRC_SCHED_CLASS + 5
43 #define TRC_CSCHED2_TICKLE TRC_SCHED_CLASS + 6
44 #define TRC_CSCHED2_CREDIT_RESET TRC_SCHED_CLASS + 7
45 #define TRC_CSCHED2_SCHED_TASKLET TRC_SCHED_CLASS + 8
46 #define TRC_CSCHED2_UPDATE_LOAD TRC_SCHED_CLASS + 9
47 #define TRC_CSCHED2_RUNQ_ASSIGN TRC_SCHED_CLASS + 10
48 #define TRC_CSCHED2_UPDATE_VCPU_LOAD TRC_SCHED_CLASS + 11
49 #define TRC_CSCHED2_UPDATE_RUNQ_LOAD TRC_SCHED_CLASS + 12
51 /*
52 * WARNING: This is still in an experimental phase. Status and work can be found at the
53 * credit2 wiki page:
54 * http://wiki.xensource.com/xenwiki/Credit2_Scheduler_Development
55 * TODO:
56 * + Immediate bug-fixes
57 * - Do per-runqueue, grab proper lock for dump debugkey
58 * + Multiple sockets
59 * - Detect cpu layout and make runqueue map, one per L2 (make_runq_map())
60 * - Simple load balancer / runqueue assignment
61 * - Runqueue load measurement
62 * - Load-based load balancer
63 * + Hyperthreading
64 * - Look for non-busy core if possible
65 * - "Discount" time run on a thread with busy siblings
66 * + Algorithm:
67 * - "Mixed work" problem: if a VM is playing audio (5%) but also burning cpu (e.g.,
68 * a flash animation in the background), can we schedule it with low enough latency
69 * so that audio doesn't skip?
70 * - Cap and reservation: How to implement with the current system?
71 * + Optimizing
72 * - Profiling, making new algorithms, making math more efficient (no long division)
73 */
75 /*
76 * Design:
77 *
78 * VMs "burn" credits based on their weight; higher weight means
79 * credits burn more slowly. The highest weight vcpu burns credits at
80 * a rate of 1 credit per nanosecond. Others burn proportionally
81 * more.
82 *
83 * vcpus are inserted into the runqueue by credit order.
84 *
85 * Credits are "reset" when the next vcpu in the runqueue has credit
86 * less than or equal to zero. At that point, everyone's credits are "clipped"
87 * to a small value, and a fixed credit is added to everyone.
88 *
89 * The plan is for all cores that share an L2 to share the same
90 * runqueue. At the moment, there is one global runqueue for all
91 * cores.
92 */
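/*
 * Illustrative example of the above (numbers made up): two vcpus of weight
 * 256 and 128 share a runqueue, so max_weight is 256.  The weight-256 vcpu
 * burns roughly 1 credit per nanosecond of runtime while the weight-128
 * vcpu burns about 2 (see t2c() below); both are refilled to
 * CSCHED_CREDIT_INIT at a reset, so the heavier vcpu can run roughly twice
 * as long between resets.
 */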
94 /*
95 * Locking:
96 * - Schedule-lock is per-runqueue
97 * + Protects runqueue data, runqueue insertion, &c
98 * + Also protects updates to private sched vcpu structure
99 * + Must be grabbed using vcpu_schedule_lock_irq() to make sure vcpu->processor
100 * doesn't change under our feet.
101 * - Private data lock
102 * + Protects access to global domain list
103 * + All other private data is written at init and only read afterwards.
104 * Ordering:
105 * - We grab the private lock before a schedule lock when updating domain
106 * weight; so we must never grab the private lock if a schedule lock is held.
107 */
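/*
 * A minimal sketch of the resulting lock ordering, using helpers this file
 * already relies on (csched_dom_cntl() below follows this pattern):
 *
 *   spin_lock_irqsave(&prv->lock, flags);      <- private lock first
 *   vcpu_schedule_lock_irq(svc->vcpu);         <- then the runqueue lock
 *   ... update weights ...
 *   vcpu_schedule_unlock_irq(svc->vcpu);
 *   spin_unlock_irqrestore(&prv->lock, flags);
 *
 * Taking prv->lock while already holding a runqueue lock would invert this
 * order and risk deadlock.
 */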
109 /*
110 * Basic constants
111 */
112 /* Default weight: How much a new domain starts with */
113 #define CSCHED_DEFAULT_WEIGHT 256
114 /* Min timer: Minimum length a timer will be set, to
115 * achieve efficiency */
116 #define CSCHED_MIN_TIMER MICROSECS(500)
117 /* Amount of credit VMs begin with, and are reset to.
118 * ATM, set so that highest-weight VMs can only run for 10ms
119 * before a reset event. */
120 #define CSCHED_CREDIT_INIT MILLISECS(10)
121 /* Carryover: How much "extra" credit may be carried over after
122 * a reset. */
123 #define CSCHED_CARRYOVER_MAX CSCHED_MIN_TIMER
124 /* Stickiness: Cross-L2 migration resistance. Should be less than
125 * MIN_TIMER. */
126 #define CSCHED_MIGRATE_RESIST ((opt_migrate_resist)*MICROSECS(1))
127 /* How much to "compensate" a vcpu for L2 migration */
128 #define CSCHED_MIGRATE_COMPENSATION MICROSECS(50)
129 /* Reset: Value below which credit will be reset. */
130 #define CSCHED_CREDIT_RESET 0
131 /* Max timer: Maximum time a guest can be run for. */
132 #define CSCHED_MAX_TIMER MILLISECS(2)
135 #define CSCHED_IDLE_CREDIT (-(1<<30))
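/*
 * Rough sketch of how the constants above interact: a vcpu is (re)filled to
 * CSCHED_CREDIT_INIT (10ms worth of credit at the highest weight), is never
 * given the cpu for more than CSCHED_MAX_TIMER (2ms) per slice, and the
 * timer is never set shorter than CSCHED_MIN_TIMER (500us).  When the vcpu
 * chosen to run next has credit at or below CSCHED_CREDIT_RESET (0),
 * everyone is clipped to at most CSCHED_CARRYOVER_MAX and topped up again.
 * Idle vcpus sit at the CSCHED_IDLE_CREDIT sentinel and never take part in
 * this cycle.
 */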
137 /*
138 * Flags
139 */
140 /* CSFLAG_scheduled: Is this vcpu either running on, or context-switching off,
141 * a physical cpu?
142 * + Accessed only with runqueue lock held
143 * + Set when chosen as next in csched_schedule().
144 * + Cleared after context switch has been saved in csched_context_saved()
145 * + Checked in vcpu_wake to see if we can add to the runqueue, or if we should
146 * set CSFLAG_delayed_runq_add
147 * + Checked to be false in runq_insert.
148 */
149 #define __CSFLAG_scheduled 1
150 #define CSFLAG_scheduled (1<<__CSFLAG_scheduled)
151 /* CSFLAG_delayed_runq_add: Do we need to add this to the runqueue once it's done
152 * being context switched out?
153 * + Set when scheduling out in csched_schedule() if prev is runnable
154 * + Set in csched_vcpu_wake if it finds CSFLAG_scheduled set
155 * + Read in csched_context_saved(). If set, it adds prev to the runqueue and
156 * clears the bit.
157 */
158 #define __CSFLAG_delayed_runq_add 2
159 #define CSFLAG_delayed_runq_add (1<<__CSFLAG_delayed_runq_add)
160 /* CSFLAG_runq_migrate_request: This vcpu is being migrated as a result of a
161 * credit2-initiated runq migrate request; migrate it to the runqueue indicated
162 * in the svc struct.
163 */
164 #define __CSFLAG_runq_migrate_request 3
165 #define CSFLAG_runq_migrate_request (1<<__CSFLAG_runq_migrate_request)
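/*
 * Usage note: the __CSFLAG_* values are bit numbers, intended for the atomic
 * bit operations used throughout this file, e.g.
 *   set_bit(__CSFLAG_delayed_runq_add, &svc->flags);
 *   if ( test_bit(__CSFLAG_scheduled, &svc->flags) ) ...
 * while the CSFLAG_* constants are the corresponding (1<<bit) masks.
 */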
168 int opt_migrate_resist=500;
169 integer_param("sched_credit2_migrate_resist", opt_migrate_resist);
171 /*
172 * Useful macros
173 */
174 #define CSCHED_PRIV(_ops) \
175 ((struct csched_private *)((_ops)->sched_data))
176 #define CSCHED_VCPU(_vcpu) ((struct csched_vcpu *) (_vcpu)->sched_priv)
177 #define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv)
178 #define CSCHED_CPUONLINE(_pool) \
179 (((_pool) == NULL) ? &cpupool_free_cpus : &(_pool)->cpu_valid)
180 /* CPU to runq_id macro */
181 #define c2r(_ops, _cpu) (CSCHED_PRIV(_ops)->runq_map[(_cpu)])
182 /* CPU to runqueue struct macro */
183 #define RQD(_ops, _cpu) (&CSCHED_PRIV(_ops)->rqd[c2r(_ops, _cpu)])
185 /*
186 * Shifts for load average.
187 * - granularity: Reduce granularity of time by a factor of 1000, so we can use 32-bit maths
188 * - window shift: Given granularity shift, make the window about 1 second
189 * - scale shift: Shift up load by this amount rather than using fractions; 128 corresponds
190 * to a load of 1.
191 */
192 #define LOADAVG_GRANULARITY_SHIFT (10)
193 int opt_load_window_shift=18;
194 #define LOADAVG_WINDOW_SHIFT_MIN 4
195 integer_param("credit2_load_window_shift", opt_load_window_shift);
196 int opt_underload_balance_tolerance=0;
197 integer_param("credit2_balance_under", opt_underload_balance_tolerance);
198 int opt_overload_balance_tolerance=-3;
199 integer_param("credit2_balance_over", opt_overload_balance_tolerance);
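/*
 * The integer_param() declarations above, together with
 * sched_credit2_migrate_resist earlier, expose these knobs on the Xen boot
 * command line.  A hypothetical example (values purely for illustration):
 *
 *   sched=credit2 sched_credit2_migrate_resist=1000 \
 *       credit2_load_window_shift=20 credit2_balance_under=0 \
 *       credit2_balance_over=-3
 */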
201 /*
202 * Per-runqueue data
203 */
204 struct csched_runqueue_data {
205 int id;
207 spinlock_t lock; /* Lock for this runqueue. */
208 cpumask_t active; /* CPUs enabled for this runqueue */
210 struct list_head runq; /* Ordered list of runnable vms */
211 struct list_head svc; /* List of all vcpus assigned to this runqueue */
212 int max_weight;
214 cpumask_t idle, /* Currently idle */
215 tickled; /* Cpus which have been tickled to reschedule but have not yet done so */
216 int load; /* Instantaneous load: Length of queue + num non-idle threads */
217 s_time_t load_last_update; /* Last time average was updated */
218 s_time_t avgload; /* Decaying queue load */
219 s_time_t b_avgload; /* Decaying queue load modified by balancing */
220 };
222 /*
223 * System-wide private data
224 */
225 struct csched_private {
226 spinlock_t lock;
227 cpumask_t initialized; /* CPU is initialized for this pool */
229 struct list_head sdom; /* Used mostly for dump keyhandler. */
231 int runq_map[NR_CPUS];
232 cpumask_t active_queues; /* Queues which may have active cpus */
233 struct csched_runqueue_data rqd[NR_CPUS];
235 int load_window_shift;
236 };
238 /*
239 * Virtual CPU
240 */
241 struct csched_vcpu {
242 struct list_head rqd_elem; /* On the runqueue data list */
243 struct list_head sdom_elem; /* On the domain vcpu list */
244 struct list_head runq_elem; /* On the runqueue */
245 struct csched_runqueue_data *rqd; /* Up-pointer to the runqueue */
247 /* Up-pointers */
248 struct csched_dom *sdom;
249 struct vcpu *vcpu;
251 int weight;
253 int credit;
254 s_time_t start_time; /* When we were scheduled (used for credit) */
255 unsigned flags; /* 16 bits doesn't seem to play well with clear_bit() */
257 /* Individual contribution to load */
258 s_time_t load_last_update; /* Last time average was updated */
259 s_time_t avgload; /* Decaying queue load */
261 struct csched_runqueue_data *migrate_rqd; /* Pre-determined rqd to which to migrate */
262 };
264 /*
265 * Domain
266 */
267 struct csched_dom {
268 struct list_head vcpu;
269 struct list_head sdom_elem;
270 struct domain *dom;
271 uint16_t weight;
272 uint16_t nr_vcpus;
273 };
276 /*
277 * Time-to-credit, credit-to-time.
278 * FIXME: Do pre-calculated division?
279 */
280 static s_time_t t2c(struct csched_runqueue_data *rqd, s_time_t time, struct csched_vcpu *svc)
281 {
282 return time * rqd->max_weight / svc->weight;
283 }
285 static s_time_t c2t(struct csched_runqueue_data *rqd, s_time_t credit, struct csched_vcpu *svc)
286 {
287 return credit * svc->weight / rqd->max_weight;
288 }
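/*
 * Worked example (numbers purely illustrative): with rqd->max_weight == 256
 * and svc->weight == 128, t2c() turns 1ms of runtime into
 * 1ms * 256 / 128 = 2ms worth of credit burned, and c2t() is the inverse;
 * a vcpu with half the maximum weight therefore exhausts a given amount of
 * credit in half the runtime.
 */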
290 /*
291 * Runqueue related code
292 */
294 static /*inline*/ int
295 __vcpu_on_runq(struct csched_vcpu *svc)
296 {
297 return !list_empty(&svc->runq_elem);
298 }
300 static /*inline*/ struct csched_vcpu *
301 __runq_elem(struct list_head *elem)
302 {
303 return list_entry(elem, struct csched_vcpu, runq_elem);
304 }
306 static void
307 __update_runq_load(const struct scheduler *ops,
308 struct csched_runqueue_data *rqd, int change, s_time_t now)
309 {
310 struct csched_private *prv = CSCHED_PRIV(ops);
311 s_time_t delta=-1;
313 now >>= LOADAVG_GRANULARITY_SHIFT;
315 if ( rqd->load_last_update + (1ULL<<prv->load_window_shift) < now )
316 {
317 rqd->avgload = (unsigned long long)rqd->load << prv->load_window_shift;
318 rqd->b_avgload = (unsigned long long)rqd->load << prv->load_window_shift;
319 }
320 else
321 {
322 delta = now - rqd->load_last_update;
324 rqd->avgload =
325 ( ( delta * ( (unsigned long long)rqd->load << prv->load_window_shift ) )
326 + ( ((1ULL<<prv->load_window_shift) - delta) * rqd->avgload ) ) >> prv->load_window_shift;
328 rqd->b_avgload =
329 ( ( delta * ( (unsigned long long)rqd->load << prv->load_window_shift ) )
330 + ( ((1ULL<<prv->load_window_shift) - delta) * rqd->b_avgload ) ) >> prv->load_window_shift;
331 }
332 rqd->load += change;
333 rqd->load_last_update = now;
335 {
336 struct {
337 unsigned rq_load:4, rq_avgload:28;
338 unsigned rq_id:4, b_avgload:28;
339 } d;
340 d.rq_id=rqd->id;
341 d.rq_load = rqd->load;
342 d.rq_avgload = rqd->avgload;
343 d.b_avgload = rqd->b_avgload;
344 trace_var(TRC_CSCHED2_UPDATE_RUNQ_LOAD, 1,
345 sizeof(d),
346 (unsigned char *)&d);
347 }
348 }
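/*
 * The update above is a linear interpolation over the load window: with
 * W = prv->load_window_shift and delta the (granularity-shifted) time since
 * the last update,
 *
 *   avgload' = ( delta * (load << W) + ((1 << W) - delta) * avgload ) >> W
 *
 * so the instantaneous load fully replaces the old average once delta
 * reaches a whole window (1 << W granularity units), which is what the
 * early-out branch above does for even longer gaps.
 */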
350 static void
351 __update_svc_load(const struct scheduler *ops,
352 struct csched_vcpu *svc, int change, s_time_t now)
353 {
354 struct csched_private *prv = CSCHED_PRIV(ops);
355 s_time_t delta=-1;
356 int vcpu_load;
358 if ( change == -1 )
359 vcpu_load = 1;
360 else if ( change == 1 )
361 vcpu_load = 0;
362 else
363 vcpu_load = vcpu_runnable(svc->vcpu);
365 now >>= LOADAVG_GRANULARITY_SHIFT;
367 if ( svc->load_last_update + (1ULL<<prv->load_window_shift) < now )
368 {
369 svc->avgload = (unsigned long long)vcpu_load << prv->load_window_shift;
370 }
371 else
372 {
373 delta = now - svc->load_last_update;
375 svc->avgload =
376 ( ( delta * ( (unsigned long long)vcpu_load << prv->load_window_shift ) )
377 + ( ((1ULL<<prv->load_window_shift) - delta) * svc->avgload ) ) >> prv->load_window_shift;
378 }
379 svc->load_last_update = now;
381 {
382 struct {
383 unsigned dom:16,vcpu:16;
384 unsigned v_avgload:32;
385 } d;
386 d.dom = svc->vcpu->domain->domain_id;
387 d.vcpu = svc->vcpu->vcpu_id;
388 d.v_avgload = svc->avgload;
389 trace_var(TRC_CSCHED2_UPDATE_VCPU_LOAD, 1,
390 sizeof(d),
391 (unsigned char *)&d);
392 }
393 }
395 static void
396 update_load(const struct scheduler *ops,
397 struct csched_runqueue_data *rqd,
398 struct csched_vcpu *svc, int change, s_time_t now)
399 {
400 __update_runq_load(ops, rqd, change, now);
401 if ( svc )
402 __update_svc_load(ops, svc, change, now);
403 }
405 static int
406 __runq_insert(struct list_head *runq, struct csched_vcpu *svc)
407 {
408 struct list_head *iter;
409 int pos = 0;
411 d2printk("rqi d%dv%d\n",
412 svc->vcpu->domain->domain_id,
413 svc->vcpu->vcpu_id);
415 BUG_ON(&svc->rqd->runq != runq);
416 /* Idle vcpus not allowed on the runqueue anymore */
417 BUG_ON(is_idle_vcpu(svc->vcpu));
418 BUG_ON(svc->vcpu->is_running);
419 BUG_ON(test_bit(__CSFLAG_scheduled, &svc->flags));
421 list_for_each( iter, runq )
422 {
423 struct csched_vcpu * iter_svc = __runq_elem(iter);
425 if ( svc->credit > iter_svc->credit )
426 {
427 d2printk(" p%d d%dv%d\n",
428 pos,
429 iter_svc->vcpu->domain->domain_id,
430 iter_svc->vcpu->vcpu_id);
431 break;
432 }
433 pos++;
434 }
436 list_add_tail(&svc->runq_elem, iter);
438 return pos;
439 }
441 static void
442 runq_insert(const struct scheduler *ops, unsigned int cpu, struct csched_vcpu *svc)
443 {
444 struct list_head * runq = &RQD(ops, cpu)->runq;
445 int pos = 0;
447 ASSERT( spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock) );
449 BUG_ON( __vcpu_on_runq(svc) );
450 BUG_ON( c2r(ops, cpu) != c2r(ops, svc->vcpu->processor) );
452 pos = __runq_insert(runq, svc);
454 {
455 struct {
456 unsigned dom:16,vcpu:16;
457 unsigned pos;
458 } d;
459 d.dom = svc->vcpu->domain->domain_id;
460 d.vcpu = svc->vcpu->vcpu_id;
461 d.pos = pos;
462 trace_var(TRC_CSCHED2_RUNQ_POS, 0,
463 sizeof(d),
464 (unsigned char *)&d);
465 }
467 return;
468 }
470 static inline void
471 __runq_remove(struct csched_vcpu *svc)
472 {
473 BUG_ON( !__vcpu_on_runq(svc) );
474 list_del_init(&svc->runq_elem);
475 }
477 void burn_credits(struct csched_runqueue_data *rqd, struct csched_vcpu *, s_time_t);
479 /* Check to see if the item on the runqueue is higher priority than what's
480 * currently running; if so, wake up the processor */
481 static /*inline*/ void
482 runq_tickle(const struct scheduler *ops, unsigned int cpu, struct csched_vcpu *new, s_time_t now)
483 {
484 int i, ipid=-1;
485 s_time_t lowest=(1<<30);
486 struct csched_runqueue_data *rqd = RQD(ops, cpu);
487 cpumask_t mask;
488 struct csched_vcpu * cur;
490 d2printk("rqt d%dv%d cd%dv%d\n",
491 new->vcpu->domain->domain_id,
492 new->vcpu->vcpu_id,
493 current->domain->domain_id,
494 current->vcpu_id);
496 BUG_ON(new->vcpu->processor != cpu);
497 BUG_ON(new->rqd != rqd);
499 /* Look at the cpu it's running on first */
500 cur = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
501 burn_credits(rqd, cur, now);
503 if ( cur->credit < new->credit )
504 {
505 ipid = cpu;
506 goto tickle;
507 }
509 /* Get a mask of idle, but not tickled */
510 cpus_andnot(mask, rqd->idle, rqd->tickled);
512 /* If it's not empty, choose one */
513 if ( !cpus_empty(mask) )
514 {
515 ipid=first_cpu(mask);
516 goto tickle;
517 }
519 /* Otherwise, look for the non-idle cpu with the lowest credit,
520 * skipping cpus which have been tickled but not scheduled yet */
521 cpus_andnot(mask, rqd->active, rqd->idle);
522 cpus_andnot(mask, mask, rqd->tickled);
524 for_each_cpu_mask(i, mask)
525 {
526 struct csched_vcpu * cur;
528 /* Already looked at this one above */
529 if ( i == cpu )
530 continue;
532 cur = CSCHED_VCPU(per_cpu(schedule_data, i).curr);
534 BUG_ON(is_idle_vcpu(cur->vcpu));
536 /* Update credits for current to see if we want to preempt */
537 burn_credits(rqd, cur, now);
539 if ( cur->credit < lowest )
540 {
541 ipid = i;
542 lowest = cur->credit;
543 }
545 /* TRACE */ {
546 struct {
547 unsigned dom:16,vcpu:16;
548 unsigned credit;
549 } d;
550 d.dom = cur->vcpu->domain->domain_id;
551 d.vcpu = cur->vcpu->vcpu_id;
552 d.credit = cur->credit;
553 trace_var(TRC_CSCHED2_TICKLE_CHECK, 1,
554 sizeof(d),
555 (unsigned char *)&d);
556 }
557 }
559 /* Only switch to another processor if the credit difference is greater
560 * than the migrate resistance */
561 if ( ipid == -1 || lowest + CSCHED_MIGRATE_RESIST > new->credit )
562 goto no_tickle;
564 tickle:
565 BUG_ON(ipid == -1);
567 /* TRACE */ {
568 struct {
569 unsigned cpu:8;
570 } d;
571 d.cpu = ipid;
572 trace_var(TRC_CSCHED2_TICKLE, 0,
573 sizeof(d),
574 (unsigned char *)&d);
575 }
576 cpu_set(ipid, rqd->tickled);
577 cpu_raise_softirq(ipid, SCHEDULE_SOFTIRQ);
579 no_tickle:
580 return;
581 }
583 /*
584 * Credit-related code
585 */
586 static void reset_credit(const struct scheduler *ops, int cpu, s_time_t now)
587 {
588 struct csched_runqueue_data *rqd = RQD(ops, cpu);
589 struct list_head *iter;
591 list_for_each( iter, &rqd->svc )
592 {
593 struct csched_vcpu * svc = list_entry(iter, struct csched_vcpu, rqd_elem);
595 int start_credit;
597 BUG_ON( is_idle_vcpu(svc->vcpu) );
598 BUG_ON( svc->rqd != rqd );
600 start_credit = svc->credit;
602 /* "Clip" credits to max carryover */
603 if ( svc->credit > CSCHED_CARRYOVER_MAX )
604 svc->credit = CSCHED_CARRYOVER_MAX;
605 /* And add INIT */
606 svc->credit += CSCHED_CREDIT_INIT;
607 svc->start_time = now;
609 /* TRACE */ {
610 struct {
611 unsigned dom:16,vcpu:16;
612 unsigned credit_start, credit_end;
613 } d;
614 d.dom = svc->vcpu->domain->domain_id;
615 d.vcpu = svc->vcpu->vcpu_id;
616 d.credit_start = start_credit;
617 d.credit_end = svc->credit;
618 trace_var(TRC_CSCHED2_CREDIT_RESET, 1,
619 sizeof(d),
620 (unsigned char *)&d);
621 }
622 }
624 /* No need to resort runqueue, as everyone's order should be the same. */
625 }
627 void burn_credits(struct csched_runqueue_data *rqd, struct csched_vcpu *svc, s_time_t now)
628 {
629 s_time_t delta;
631 /* Assert svc is current */
632 ASSERT(svc==CSCHED_VCPU(per_cpu(schedule_data, svc->vcpu->processor).curr));
634 if ( is_idle_vcpu(svc->vcpu) )
635 {
636 BUG_ON(svc->credit != CSCHED_IDLE_CREDIT);
637 return;
638 }
640 delta = now - svc->start_time;
642 if ( delta > 0 ) {
643 /* This will round down; should we consider rounding up...? */
644 svc->credit -= t2c(rqd, delta, svc);
645 svc->start_time = now;
647 d2printk("b d%dv%d c%d\n",
648 svc->vcpu->domain->domain_id,
649 svc->vcpu->vcpu_id,
650 svc->credit);
651 } else {
652 d2printk("%s: Time went backwards? now %"PRI_stime" start %"PRI_stime"\n",
653 __func__, now, svc->start_time);
654 }
656 /* TRACE */
657 {
658 struct {
659 unsigned dom:16,vcpu:16;
660 unsigned credit;
661 int delta;
662 } d;
663 d.dom = svc->vcpu->domain->domain_id;
664 d.vcpu = svc->vcpu->vcpu_id;
665 d.credit = svc->credit;
666 d.delta = delta;
667 trace_var(TRC_CSCHED2_CREDIT_BURN, 1,
668 sizeof(d),
669 (unsigned char *)&d);
670 }
671 }
673 /* Update the runqueue's cached max_weight after a vcpu's weight changes. */
674 void update_max_weight(struct csched_runqueue_data *rqd, int new_weight, int old_weight)
675 {
676 /* Try to avoid brute-force search:
677 * - If new_weight is larger, max_weight <- new_weight
678 * - If old_weight != max_weight, someone else is still max_weight
679 * (No action required)
680 * - If old_weight == max_weight, brute-force search for max weight
681 */
682 if ( new_weight > rqd->max_weight )
683 {
684 rqd->max_weight = new_weight;
685 d2printk("%s: Runqueue id %d max weight %d\n", __func__, rqd->id, rqd->max_weight);
686 }
687 else if ( old_weight == rqd->max_weight )
688 {
689 struct list_head *iter;
690 int max_weight = 1;
692 list_for_each( iter, &rqd->svc )
693 {
694 struct csched_vcpu * svc = list_entry(iter, struct csched_vcpu, rqd_elem);
696 if ( svc->weight > max_weight )
697 max_weight = svc->weight;
698 }
700 rqd->max_weight = max_weight;
701 d2printk("%s: Runqueue %d max weight %d\n", __func__, rqd->id, rqd->max_weight);
702 }
703 }
705 #ifndef NDEBUG
706 static /*inline*/ void
707 __csched_vcpu_check(struct vcpu *vc)
708 {
709 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
710 struct csched_dom * const sdom = svc->sdom;
712 BUG_ON( svc->vcpu != vc );
713 BUG_ON( sdom != CSCHED_DOM(vc->domain) );
714 if ( sdom )
715 {
716 BUG_ON( is_idle_vcpu(vc) );
717 BUG_ON( sdom->dom != vc->domain );
718 }
719 else
720 {
721 BUG_ON( !is_idle_vcpu(vc) );
722 }
723 }
724 #define CSCHED_VCPU_CHECK(_vc) (__csched_vcpu_check(_vc))
725 #else
726 #define CSCHED_VCPU_CHECK(_vc)
727 #endif
729 static void *
730 csched_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
731 {
732 struct csched_vcpu *svc;
734 /* Allocate per-VCPU info */
735 svc = xmalloc(struct csched_vcpu);
736 if ( svc == NULL )
737 return NULL;
738 memset(svc, 0, sizeof(*svc));
740 INIT_LIST_HEAD(&svc->rqd_elem);
741 INIT_LIST_HEAD(&svc->sdom_elem);
742 INIT_LIST_HEAD(&svc->runq_elem);
744 svc->sdom = dd;
745 svc->vcpu = vc;
746 svc->flags = 0U;
748 if ( ! is_idle_vcpu(vc) )
749 {
750 BUG_ON( svc->sdom == NULL );
752 svc->credit = CSCHED_CREDIT_INIT;
753 svc->weight = svc->sdom->weight;
754 /* Starting load of 50% */
755 svc->avgload = 1ULL << (CSCHED_PRIV(ops)->load_window_shift - 1);
756 svc->load_last_update = NOW();
757 }
758 else
759 {
760 BUG_ON( svc->sdom != NULL );
761 svc->credit = CSCHED_IDLE_CREDIT;
762 svc->weight = 0;
763 }
765 return svc;
766 }
768 /* Add and remove from runqueue assignment (not active run queue) */
769 static void
770 __runq_assign(struct csched_vcpu *svc, struct csched_runqueue_data *rqd)
771 {
773 svc->rqd = rqd;
774 list_add_tail(&svc->rqd_elem, &svc->rqd->svc);
776 update_max_weight(svc->rqd, svc->weight, 0);
778 /* Expected new load based on adding this vcpu */
779 rqd->b_avgload += svc->avgload;
781 /* TRACE */
782 {
783 struct {
784 unsigned dom:16,vcpu:16;
785 unsigned rqi:16;
786 } d;
787 d.dom = svc->vcpu->domain->domain_id;
788 d.vcpu = svc->vcpu->vcpu_id;
789 d.rqi=rqd->id;
790 trace_var(TRC_CSCHED2_RUNQ_ASSIGN, 1,
791 sizeof(d),
792 (unsigned char *)&d);
793 }
795 }
797 static void
798 runq_assign(const struct scheduler *ops, struct vcpu *vc)
799 {
800 struct csched_vcpu *svc = vc->sched_priv;
802 BUG_ON(svc->rqd != NULL);
804 __runq_assign(svc, RQD(ops, vc->processor));
805 }
807 static void
808 __runq_deassign(struct csched_vcpu *svc)
809 {
810 BUG_ON(__vcpu_on_runq(svc));
812 list_del_init(&svc->rqd_elem);
813 update_max_weight(svc->rqd, 0, svc->weight);
815 /* Expected new load based on removing this vcpu */
816 svc->rqd->b_avgload -= svc->avgload;
818 svc->rqd = NULL;
819 }
821 static void
822 runq_deassign(const struct scheduler *ops, struct vcpu *vc)
823 {
824 struct csched_vcpu *svc = vc->sched_priv;
826 BUG_ON(svc->rqd != RQD(ops, vc->processor));
828 __runq_deassign(svc);
829 }
831 static void
832 csched_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
833 {
834 struct csched_vcpu *svc = vc->sched_priv;
835 struct domain * const dom = vc->domain;
836 struct csched_dom * const sdom = svc->sdom;
838 printk("%s: Inserting d%dv%d\n",
839 __func__, dom->domain_id, vc->vcpu_id);
841 /* NB: On boot, idle vcpus are inserted before alloc_pdata() has
842 * been called for that cpu.
843 */
844 if ( ! is_idle_vcpu(vc) )
845 {
846 /* FIXME: Do we need the private lock here? */
847 list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
849 /* Add vcpu to runqueue of initial processor */
850 vcpu_schedule_lock_irq(vc);
852 runq_assign(ops, vc);
854 vcpu_schedule_unlock_irq(vc);
856 sdom->nr_vcpus++;
857 }
859 CSCHED_VCPU_CHECK(vc);
860 }
862 static void
863 csched_free_vdata(const struct scheduler *ops, void *priv)
864 {
865 struct csched_vcpu *svc = priv;
867 xfree(svc);
868 }
870 static void
871 csched_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
872 {
873 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
874 struct csched_dom * const sdom = svc->sdom;
876 BUG_ON( sdom == NULL );
877 BUG_ON( !list_empty(&svc->runq_elem) );
879 if ( ! is_idle_vcpu(vc) )
880 {
881 /* Remove from runqueue */
882 vcpu_schedule_lock_irq(vc);
884 runq_deassign(ops, vc);
886 vcpu_schedule_unlock_irq(vc);
888 /* Remove from sdom list. Don't need a lock for this, as it's called
889 * synchronously when nothing else can happen. */
890 list_del_init(&svc->sdom_elem);
892 svc->sdom->nr_vcpus--;
893 }
894 }
896 static void
897 csched_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
898 {
899 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
901 BUG_ON( is_idle_vcpu(vc) );
903 if ( per_cpu(schedule_data, vc->processor).curr == vc )
904 cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
905 else if ( __vcpu_on_runq(svc) )
906 {
907 BUG_ON(svc->rqd != RQD(ops, vc->processor));
908 update_load(ops, svc->rqd, svc, -1, NOW());
909 __runq_remove(svc);
910 }
911 else if ( test_bit(__CSFLAG_delayed_runq_add, &svc->flags) )
912 clear_bit(__CSFLAG_delayed_runq_add, &svc->flags);
913 }
915 static void
916 csched_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
917 {
918 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
919 s_time_t now = 0;
921 /* Schedule lock should be held at this point. */
923 d2printk("w d%dv%d\n", vc->domain->domain_id, vc->vcpu_id);
925 BUG_ON( is_idle_vcpu(vc) );
927 /* Make sure svc priority mod happens before runq check */
928 if ( unlikely(per_cpu(schedule_data, vc->processor).curr == vc) )
929 {
930 goto out;
931 }
933 if ( unlikely(__vcpu_on_runq(svc)) )
934 {
935 /* If we've boosted someone that's already on a runqueue, prioritize
936 * it and inform the cpu in question. */
937 goto out;
938 }
940 /* If the context hasn't been saved for this vcpu yet, we can't put it on
941 * another runqueue. Instead, we set a flag so that it will be put on the runqueue
942 * after the context has been saved. */
943 if ( unlikely (test_bit(__CSFLAG_scheduled, &svc->flags) ) )
944 {
945 set_bit(__CSFLAG_delayed_runq_add, &svc->flags);
946 goto out;
947 }
949 /* Add into the new runqueue if necessary */
950 if ( svc->rqd == NULL )
951 runq_assign(ops, vc);
952 else
953 BUG_ON(RQD(ops, vc->processor) != svc->rqd );
955 now = NOW();
957 update_load(ops, svc->rqd, svc, 1, now);
959 /* Put the VCPU on the runq */
960 runq_insert(ops, vc->processor, svc);
961 runq_tickle(ops, vc->processor, svc, now);
963 out:
964 d2printk("w-\n");
965 return;
966 }
968 static void
969 csched_context_saved(const struct scheduler *ops, struct vcpu *vc)
970 {
971 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
972 s_time_t now = NOW();
974 vcpu_schedule_lock_irq(vc);
976 BUG_ON( !is_idle_vcpu(vc) && svc->rqd != RQD(ops, vc->processor));
978 /* This vcpu is now eligible to be put on the runqueue again */
979 clear_bit(__CSFLAG_scheduled, &svc->flags);
981 /* If someone wants it on the runqueue, put it there. */
982 /*
983 * NB: We can get rid of CSFLAG_scheduled by checking for
984 * vc->is_running and __vcpu_on_runq(svc) here. However,
985 * since we're accessing the flags cacheline anyway,
986 * it seems a bit pointless; especially as we have plenty of
987 * bits free.
988 */
989 if ( test_and_clear_bit(__CSFLAG_delayed_runq_add, &svc->flags)
990 && likely(vcpu_runnable(vc)) )
991 {
992 BUG_ON(__vcpu_on_runq(svc));
994 runq_insert(ops, vc->processor, svc);
995 runq_tickle(ops, vc->processor, svc, now);
996 }
997 else if ( !is_idle_vcpu(vc) )
998 update_load(ops, svc->rqd, svc, -1, now);
1000 vcpu_schedule_unlock_irq(vc);
1003 #define MAX_LOAD (1ULL<<60)
1004 static int
1005 choose_cpu(const struct scheduler *ops, struct vcpu *vc)
1007 struct csched_private *prv = CSCHED_PRIV(ops);
1008 int i, min_rqi = -1, new_cpu;
1009 struct csched_vcpu *svc = CSCHED_VCPU(vc);
1010 s_time_t min_avgload;
1012 BUG_ON(cpus_empty(prv->active_queues));
1014 /* Locking:
1015 * - vc->processor is already locked
1016 * - Need to grab prv lock to make sure active runqueues don't
1017 * change
1018 * - Need to grab locks for other runqueues while checking
1019 * avgload
1020 * Locking constraint is:
1021 * - Lock prv before runqueue locks
1022 * - Trylock between runqueue locks (no ordering)
1024 * Since one of the runqueue locks is already held, we can't
1025 * just grab the prv lock. Instead, we'll have to trylock, and
1026 * do something else reasonable if we fail.
1027 */
1029 if ( !spin_trylock(&prv->lock) )
1031 if ( test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) )
1033 d2printk("d%dv%d -\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id);
1034 clear_bit(__CSFLAG_runq_migrate_request, &svc->flags);
1036 /* Leave it where it is for now. When we actually pay attention
1037 * to affinity we'll have to figure something out... */
1038 return vc->processor;
1041 /* First check to see if we're here because someone else suggested a place
1042 * for us to move. */
1043 if ( test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) )
1045 if ( unlikely(svc->migrate_rqd->id < 0) )
1047 printk("%s: Runqueue migrate aborted because target runqueue disappeared!\n",
1048 __func__);
1049 /* Fall-through to normal cpu pick */
1051 else
1053 d2printk("d%dv%d +\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id);
1054 new_cpu = first_cpu(svc->migrate_rqd->active);
1055 goto out_up;
1059 /* FIXME: Pay attention to cpu affinity */
1061 min_avgload = MAX_LOAD;
1063 /* Find the runqueue with the lowest instantaneous load */
1064 for_each_cpu_mask(i, prv->active_queues)
1066 struct csched_runqueue_data *rqd;
1067 s_time_t rqd_avgload;
1069 rqd = prv->rqd + i;
1071 /* If checking a different runqueue, grab the lock,
1072 * read the avg, and then release the lock.
1074 * If on our own runqueue, don't grab or release the lock;
1075 * but subtract our own load from the runqueue load to simulate
1076 * impartiality */
1077 if ( rqd == svc->rqd )
1079 rqd_avgload = rqd->b_avgload - svc->avgload;
1081 else if ( spin_trylock(&rqd->lock) )
1083 rqd_avgload = rqd->b_avgload;
1084 spin_unlock(&rqd->lock);
1086 else
1087 continue;
1089 if ( rqd_avgload < min_avgload )
1091 min_avgload = rqd_avgload;
1092 min_rqi=i;
1096 /* We didn't find anyone (most likely because of spinlock contention); leave it where it is */
1097 if ( min_rqi == -1 )
1098 new_cpu = vc->processor;
1099 else
1101 BUG_ON(cpus_empty(prv->rqd[min_rqi].active));
1102 new_cpu = first_cpu(prv->rqd[min_rqi].active);
1105 out_up:
1106 spin_unlock(&prv->lock);
1108 return new_cpu;
1111 static void balance_load(const struct scheduler *ops, int cpu, s_time_t now)
1113 struct csched_private *prv = CSCHED_PRIV(ops);
1114 int i, max_delta_rqi = -1;
1115 struct list_head *push_iter, *pull_iter;
1117 /* NB: Modified by consider() */
1118 s_time_t load_delta;
1119 struct csched_vcpu * best_push_svc=NULL, *best_pull_svc=NULL;
1120 /* NB: Read by consider() */
1121 struct csched_runqueue_data *lrqd;
1122 struct csched_runqueue_data *orqd;
1124 void consider(struct csched_vcpu *push_svc,
1125 struct csched_vcpu *pull_svc)
1127 s_time_t l_load, o_load, delta;
1129 l_load = lrqd->b_avgload;
1130 o_load = orqd->b_avgload;
1131 if ( push_svc )
1133 /* What happens to the load on both if we push? */
1134 l_load -= push_svc->avgload;
1135 o_load += push_svc->avgload;
1137 if ( pull_svc )
1139 /* What happens to the load on both if we pull? */
1140 l_load += pull_svc->avgload;
1141 o_load -= pull_svc->avgload;
1144 delta = l_load - o_load;
1145 if ( delta < 0 )
1146 delta = -delta;
1148 if ( delta < load_delta )
1150 load_delta = delta;
1151 best_push_svc=push_svc;
1152 best_pull_svc=pull_svc;
1156 void migrate(struct csched_vcpu *svc, struct csched_runqueue_data *trqd)
1158 if ( test_bit(__CSFLAG_scheduled, &svc->flags) )
1160 d2printk("d%dv%d %d-%d a\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id,
1161 svc->rqd->id, trqd->id);
1162 /* It's running; mark it to migrate. */
1163 svc->migrate_rqd = trqd;
1164 set_bit(_VPF_migrating, &svc->vcpu->pause_flags);
1165 set_bit(__CSFLAG_runq_migrate_request, &svc->flags);
1167 else
1169 int on_runq=0;
1170 /* It's not running; just move it */
1171 d2printk("d%dv%d %d-%d i\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id,
1172 svc->rqd->id, trqd->id);
1173 if ( __vcpu_on_runq(svc) )
1175 __runq_remove(svc);
1176 update_load(ops, svc->rqd, svc, -1, now);
1177 on_runq=1;
1179 __runq_deassign(svc);
1180 svc->vcpu->processor = first_cpu(trqd->active);
1181 __runq_assign(svc, trqd);
1182 if ( on_runq )
1184 update_load(ops, svc->rqd, svc, 1, now);
1185 runq_insert(ops, svc->vcpu->processor, svc);
1186 runq_tickle(ops, svc->vcpu->processor, svc, now);
1192 /*
1193 * Basic algorithm: Push, pull, or swap.
1194 * - Find the runqueue with the furthest load distance
1195 * - Find a pair that makes the difference the least (where one
1196 * on either side may be empty).
1197 */
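/*
 * Concretely, the nested loops below try every vcpu on the local runqueue
 * as a "push" candidate and every vcpu on the remote runqueue as a "pull"
 * candidate: consider() is called for each push/pull pair, for each push
 * alone, and for each pull alone, and remembers whichever combination
 * leaves the smallest absolute difference in b_avgload between the two
 * runqueues -- hence the O(n^2) FIXME further down.
 */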
1199 /* Locking:
1200 * - pcpu schedule lock should be already locked
1201 */
1202 lrqd = RQD(ops, cpu);
1204 __update_runq_load(ops, lrqd, 0, now);
1206 retry:
1207 if ( !spin_trylock(&prv->lock) )
1208 return;
1210 load_delta = 0;
1212 for_each_cpu_mask(i, prv->active_queues)
1214 s_time_t delta;
1216 orqd = prv->rqd + i;
1218 if ( orqd == lrqd
1219 || !spin_trylock(&orqd->lock) )
1220 continue;
1222 __update_runq_load(ops, orqd, 0, now);
1224 delta = lrqd->b_avgload - orqd->b_avgload;
1225 if ( delta < 0 )
1226 delta = -delta;
1228 if ( delta > load_delta )
1230 load_delta = delta;
1231 max_delta_rqi = i;
1234 spin_unlock(&orqd->lock);
1237 /* Minimize holding the big lock */
1238 spin_unlock(&prv->lock);
1239 if ( max_delta_rqi == -1 )
1240 goto out;
1243 s_time_t load_max;
1244 int cpus_max;
1247 load_max = lrqd->b_avgload;
1248 if ( orqd->b_avgload > load_max )
1249 load_max = orqd->b_avgload;
1251 cpus_max=cpus_weight(lrqd->active);
1252 if ( cpus_weight(orqd->active) > cpus_max )
1253 cpus_max = cpus_weight(orqd->active);
1255 /* If we're under 100% capacity, only shift if the load difference
1256 * is greater than 1. Otherwise, shift only if it is greater than 12.5%. */
1257 if ( load_max < (1ULL<<(prv->load_window_shift))*cpus_max )
1259 if ( load_delta < (1ULL<<(prv->load_window_shift+opt_underload_balance_tolerance) ) )
1260 goto out;
1262 else
1263 if ( load_delta < (1ULL<<(prv->load_window_shift+opt_overload_balance_tolerance)) )
1264 goto out;
1267 /* Try to grab the other runqueue lock; if it's been taken in the
1268 * meantime, try the process over again. This can't deadlock
1269 * because if it doesn't get any other rqd locks, it will simply
1270 * give up and return. */
1271 orqd = prv->rqd + max_delta_rqi;
1272 if ( !spin_trylock(&orqd->lock) )
1273 goto retry;
1275 /* Make sure the runqueue hasn't been deactivated since we released prv->lock */
1276 if ( unlikely(orqd->id < 0) )
1277 goto out_up;
1279 /* Look for "swap" which gives the best load average
1280 * FIXME: O(n^2)! */
1282 /* Reuse load delta (as we're trying to minimize it) */
1283 list_for_each( push_iter, &lrqd->svc )
1285 int inner_load_updated = 0;
1286 struct csched_vcpu * push_svc = list_entry(push_iter, struct csched_vcpu, rqd_elem);
1288 __update_svc_load(ops, push_svc, 0, now);
1290 /* Skip this one if it's already been flagged to migrate */
1291 if ( test_bit(__CSFLAG_runq_migrate_request, &push_svc->flags) )
1292 continue;
1294 list_for_each( pull_iter, &orqd->svc )
1296 struct csched_vcpu * pull_svc = list_entry(pull_iter, struct csched_vcpu, rqd_elem);
1298 if ( ! inner_load_updated )
1300 __update_svc_load(ops, pull_svc, 0, now);
1303 /* Skip this one if it's already been flagged to migrate */
1304 if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
1305 continue;
1307 consider(push_svc, pull_svc);
1310 inner_load_updated = 1;
1312 /* Consider push only */
1313 consider(push_svc, NULL);
1316 list_for_each( pull_iter, &orqd->svc )
1318 struct csched_vcpu * pull_svc = list_entry(pull_iter, struct csched_vcpu, rqd_elem);
1320 /* Skip this one if it's already been flagged to migrate */
1321 if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
1322 continue;
1324 /* Consider pull only */
1325 consider(NULL, pull_svc);
1328 /* OK, now we have some candidates; do the moving */
1329 if ( best_push_svc )
1330 migrate(best_push_svc, orqd);
1331 if ( best_pull_svc )
1332 migrate(best_pull_svc, lrqd);
1334 out_up:
1335 spin_unlock(&orqd->lock);
1337 out:
1338 return;
1341 static int
1342 csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc)
1344 struct csched_vcpu * const svc = CSCHED_VCPU(vc);
1345 int new_cpu;
1347 /* The scheduler interface doesn't have an explicit mechanism to
1348 * involve the choosable scheduler in the migrate process, so we
1349 * infer that a change may happen by the call to cpu_pick, and
1350 * remove it from the old runqueue while the lock for the old
1351 * runqueue is held. It can't be actively waiting to run. It
1352 * will be added to the new runqueue when it next wakes.
1354 * If we want to be able to call pick() separately, we need to add
1355 * a mechanism to remove a vcpu from an old processor / runqueue
1356 * before releasing the lock. */
1357 BUG_ON(__vcpu_on_runq(svc));
1359 new_cpu = choose_cpu(ops, vc);
1360 /* If we're suggesting moving to a different runqueue, remove it
1361 * from the old runqueue while we have the lock. It will be added
1362 * to the new one when it wakes. */
1363 if ( svc->rqd != NULL
1364 && RQD(ops, new_cpu) != svc->rqd )
1365 runq_deassign(ops, vc);
1367 return new_cpu;
1370 static int
1371 csched_dom_cntl(
1372 const struct scheduler *ops,
1373 struct domain *d,
1374 struct xen_domctl_scheduler_op *op)
1376 struct csched_dom * const sdom = CSCHED_DOM(d);
1377 struct csched_private *prv = CSCHED_PRIV(ops);
1378 unsigned long flags;
1380 if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
1382 op->u.credit2.weight = sdom->weight;
1384 else
1386 ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);
1388 if ( op->u.credit2.weight != 0 )
1390 struct list_head *iter;
1391 int old_weight;
1393 /* Must hold csched_priv lock to update sdom, runq lock to
1394 * update csvcs. */
1395 spin_lock_irqsave(&prv->lock, flags);
1397 old_weight = sdom->weight;
1399 sdom->weight = op->u.credit2.weight;
1401 /* Update weights for vcpus, and max_weight for runqueues on which they reside */
1402 list_for_each ( iter, &sdom->vcpu )
1404 struct csched_vcpu *svc = list_entry(iter, struct csched_vcpu, sdom_elem);
1406 /* NB: Locking order is important here. Because we grab this lock here, we
1407 * must never lock csched_priv.lock if we're holding a runqueue
1408 * lock. */
1409 vcpu_schedule_lock_irq(svc->vcpu);
1411 BUG_ON(svc->rqd != RQD(ops, svc->vcpu->processor));
1413 svc->weight = sdom->weight;
1414 update_max_weight(svc->rqd, svc->weight, old_weight);
1416 vcpu_schedule_unlock_irq(svc->vcpu);
1419 spin_unlock_irqrestore(&prv->lock, flags);
1423 return 0;
1426 static void *
1427 csched_alloc_domdata(const struct scheduler *ops, struct domain *dom)
1429 struct csched_dom *sdom;
1430 unsigned long flags;
1432 sdom = xmalloc(struct csched_dom);
1433 if ( sdom == NULL )
1434 return NULL;
1435 memset(sdom, 0, sizeof(*sdom));
1437 /* Initialize credit and weight */
1438 INIT_LIST_HEAD(&sdom->vcpu);
1439 INIT_LIST_HEAD(&sdom->sdom_elem);
1440 sdom->dom = dom;
1441 sdom->weight = CSCHED_DEFAULT_WEIGHT;
1442 sdom->nr_vcpus = 0;
1444 spin_lock_irqsave(&CSCHED_PRIV(ops)->lock, flags);
1446 list_add_tail(&sdom->sdom_elem, &CSCHED_PRIV(ops)->sdom);
1448 spin_unlock_irqrestore(&CSCHED_PRIV(ops)->lock, flags);
1450 return (void *)sdom;
1453 static int
1454 csched_dom_init(const struct scheduler *ops, struct domain *dom)
1456 struct csched_dom *sdom;
1458 printk("%s: Initializing domain %d\n", __func__, dom->domain_id);
1460 if ( is_idle_domain(dom) )
1461 return 0;
1463 sdom = csched_alloc_domdata(ops, dom);
1464 if ( sdom == NULL )
1465 return -ENOMEM;
1467 dom->sched_priv = sdom;
1469 return 0;
1472 static void
1473 csched_free_domdata(const struct scheduler *ops, void *data)
1475 unsigned long flags;
1476 struct csched_dom *sdom = data;
1478 spin_lock_irqsave(&CSCHED_PRIV(ops)->lock, flags);
1480 list_del_init(&sdom->sdom_elem);
1482 spin_unlock_irqrestore(&CSCHED_PRIV(ops)->lock, flags);
1484 xfree(data);
1487 static void
1488 csched_dom_destroy(const struct scheduler *ops, struct domain *dom)
1490 struct csched_dom *sdom = CSCHED_DOM(dom);
1492 BUG_ON(!list_empty(&sdom->vcpu));
1494 csched_free_domdata(ops, CSCHED_DOM(dom));
1497 /* How long should we let this vcpu run for? */
1498 static s_time_t
1499 csched_runtime(const struct scheduler *ops, int cpu, struct csched_vcpu *snext)
1501 s_time_t time = CSCHED_MAX_TIMER;
1502 struct csched_runqueue_data *rqd = RQD(ops, cpu);
1503 struct list_head *runq = &rqd->runq;
1505 if ( is_idle_vcpu(snext->vcpu) )
1506 return CSCHED_MAX_TIMER;
1508 /* Basic time */
1509 time = c2t(rqd, snext->credit, snext);
1511 /* Next guy on runqueue */
1512 if ( ! list_empty(runq) )
1514 struct csched_vcpu *svc = __runq_elem(runq->next);
1515 s_time_t ntime;
1517 if ( ! is_idle_vcpu(svc->vcpu) )
1519 ntime = c2t(rqd, snext->credit - svc->credit, snext);
1521 if ( time > ntime )
1522 time = ntime;
1526 /* Check limits */
1527 if ( time < CSCHED_MIN_TIMER )
1528 time = CSCHED_MIN_TIMER;
1529 else if ( time > CSCHED_MAX_TIMER )
1530 time = CSCHED_MAX_TIMER;
1532 return time;
1535 void __dump_execstate(void *unused);
1537 /*
1538 * Find a candidate.
1539 */
1540 static struct csched_vcpu *
1541 runq_candidate(struct csched_runqueue_data *rqd,
1542 struct csched_vcpu *scurr,
1543 int cpu, s_time_t now)
1545 struct list_head *iter;
1546 struct csched_vcpu *snext = NULL;
1548 /* Default to current if runnable, idle otherwise */
1549 if ( vcpu_runnable(scurr->vcpu) )
1550 snext = scurr;
1551 else
1552 snext = CSCHED_VCPU(idle_vcpu[cpu]);
1554 list_for_each( iter, &rqd->runq )
1556 struct csched_vcpu * svc = list_entry(iter, struct csched_vcpu, runq_elem);
1558 /* If this is on a different processor, don't pull it unless
1559 * its credit is at least CSCHED_MIGRATE_RESIST higher. */
1560 if ( svc->vcpu->processor != cpu
1561 && snext->credit + CSCHED_MIGRATE_RESIST > svc->credit )
1562 continue;
1564 /* If the next one on the list has more credit than current
1565 * (or idle, if current is not runnable), choose it. */
1566 if ( svc->credit > snext->credit )
1567 snext = svc;
1569 /* In any case, if we got this far, break. */
1570 break;
1574 return snext;
1577 /*
1578 * This function is in the critical path. It is designed to be simple and
1579 * fast for the common case.
1580 */
1581 static struct task_slice
1582 csched_schedule(
1583 const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled)
1585 const int cpu = smp_processor_id();
1586 struct csched_runqueue_data *rqd;
1587 struct csched_vcpu * const scurr = CSCHED_VCPU(current);
1588 struct csched_vcpu *snext = NULL;
1589 struct task_slice ret;
1591 CSCHED_VCPU_CHECK(current);
1593 d2printk("sc p%d c d%dv%d now %"PRI_stime"\n",
1594 cpu,
1595 scurr->vcpu->domain->domain_id,
1596 scurr->vcpu->vcpu_id,
1597 now);
1599 BUG_ON(!cpu_isset(cpu, CSCHED_PRIV(ops)->initialized));
1601 rqd = RQD(ops, cpu);
1602 BUG_ON(!cpu_isset(cpu, rqd->active));
1604 /* Protected by runqueue lock */
1606 BUG_ON(!is_idle_vcpu(scurr->vcpu) && scurr->rqd != rqd);
1608 /* Clear "tickled" bit now that we've been scheduled */
1609 if ( cpu_isset(cpu, rqd->tickled) )
1610 cpu_clear(cpu, rqd->tickled);
1612 /* Update credits */
1613 burn_credits(rqd, scurr, now);
1615 /*
1616 * Select next runnable local VCPU (ie top of local runq).
1618 * If the current vcpu is runnable, and has higher credit than
1619 * the next guy on the queue (or there is no one else), we want to
1620 * run him again.
1622 * If there's tasklet work to do, we want to choose the idle vcpu
1623 * for this processor, and mark the current for delayed runqueue
1624 * add.
1626 * If the current vcpu is runnable, and there's another runnable
1627 * candidate, we want to mark current for delayed runqueue add,
1628 * and remove the next guy from the queue.
1630 * If the current vcpu is not runnable, we want to choose the idle
1631 * vcpu for this processor.
1632 */
1633 if ( tasklet_work_scheduled )
1635 trace_var(TRC_CSCHED2_SCHED_TASKLET, 0, 0, NULL);
1636 snext = CSCHED_VCPU(idle_vcpu[cpu]);
1638 else
1639 snext=runq_candidate(rqd, scurr, cpu, now);
1641 /* If switching from a non-idle runnable vcpu, put it
1642 * back on the runqueue. */
1643 if ( snext != scurr
1644 && !is_idle_vcpu(scurr->vcpu)
1645 && vcpu_runnable(current) )
1646 set_bit(__CSFLAG_delayed_runq_add, &scurr->flags);
1648 ret.migrated = 0;
1650 /* Accounting for non-idle tasks */
1651 if ( !is_idle_vcpu(snext->vcpu) )
1653 /* If switching, remove this from the runqueue and mark it scheduled */
1654 if ( snext != scurr )
1656 BUG_ON(snext->rqd != rqd);
1658 __runq_remove(snext);
1659 if ( snext->vcpu->is_running )
1661 printk("p%d: snext d%dv%d running on p%d! scurr d%dv%d\n",
1662 cpu,
1663 snext->vcpu->domain->domain_id, snext->vcpu->vcpu_id,
1664 snext->vcpu->processor,
1665 scurr->vcpu->domain->domain_id,
1666 scurr->vcpu->vcpu_id);
1667 BUG();
1669 set_bit(__CSFLAG_scheduled, &snext->flags);
1672 /* Check for the reset condition */
1673 if ( snext->credit <= CSCHED_CREDIT_RESET )
1675 reset_credit(ops, cpu, now);
1676 balance_load(ops, cpu, now);
1679 /* Clear the idle mask if necessary */
1680 if ( cpu_isset(cpu, rqd->idle) )
1681 cpu_clear(cpu, rqd->idle);
1683 snext->start_time = now;
1685 /* Safe because lock for old processor is held */
1686 if ( snext->vcpu->processor != cpu )
1688 snext->credit += CSCHED_MIGRATE_COMPENSATION;
1689 snext->vcpu->processor = cpu;
1690 ret.migrated = 1;
1693 else
1695 /* Update the idle mask if necessary */
1696 if ( !cpu_isset(cpu, rqd->idle) )
1697 cpu_set(cpu, rqd->idle);
1698 /* Make sure avgload gets updated periodically even
1699 * if there's no activity */
1700 update_load(ops, rqd, NULL, 0, now);
1703 /*
1704 * Return task to run next...
1705 */
1706 ret.time = csched_runtime(ops, cpu, snext);
1707 ret.task = snext->vcpu;
1709 CSCHED_VCPU_CHECK(ret.task);
1710 return ret;
1713 static void
1714 csched_dump_vcpu(struct csched_vcpu *svc)
1716 printk("[%i.%i] flags=%x cpu=%i",
1717 svc->vcpu->domain->domain_id,
1718 svc->vcpu->vcpu_id,
1719 svc->flags,
1720 svc->vcpu->processor);
1722 printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight);
1724 printk("\n");
1727 static void
1728 csched_dump_pcpu(const struct scheduler *ops, int cpu)
1730 struct list_head *runq, *iter;
1731 struct csched_vcpu *svc;
1732 int loop;
1733 char cpustr[100];
1735 /* FIXME: Do locking properly for access to runqueue structures */
1737 runq = &RQD(ops, cpu)->runq;
1739 cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_map,cpu));
1740 printk(" sibling=%s, ", cpustr);
1741 cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_map,cpu));
1742 printk("core=%s\n", cpustr);
1744 /* current VCPU */
1745 svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
1746 if ( svc )
1748 printk("\trun: ");
1749 csched_dump_vcpu(svc);
1752 loop = 0;
1753 list_for_each( iter, runq )
1755 svc = __runq_elem(iter);
1756 if ( svc )
1758 printk("\t%3d: ", ++loop);
1759 csched_dump_vcpu(svc);
1764 static void
1765 csched_dump(const struct scheduler *ops)
1767 struct list_head *iter_sdom, *iter_svc;
1768 struct csched_private *prv = CSCHED_PRIV(ops);
1769 int i, loop;
1771 printk("Active queues: %d\n"
1772 "\tdefault-weight = %d\n",
1773 cpus_weight(prv->active_queues),
1774 CSCHED_DEFAULT_WEIGHT);
1775 for_each_cpu_mask(i, prv->active_queues)
1777 s_time_t fraction;
1779 fraction = prv->rqd[i].avgload * 100 / (1ULL<<prv->load_window_shift);
1781 printk("Runqueue %d:\n"
1782 "\tncpus = %u\n"
1783 "\tmax_weight = %d\n"
1784 "\tinstload = %d\n"
1785 "\taveload = %3"PRI_stime"\n",
1786 i,
1787 cpus_weight(prv->rqd[i].active),
1788 prv->rqd[i].max_weight,
1789 prv->rqd[i].load,
1790 fraction);
1793 /* FIXME: Locking! */
1795 printk("Domain info:\n");
1796 loop = 0;
1797 list_for_each( iter_sdom, &prv->sdom )
1799 struct csched_dom *sdom;
1800 sdom = list_entry(iter_sdom, struct csched_dom, sdom_elem);
1802 printk("\tDomain: %d w %d v %d\n\t",
1803 sdom->dom->domain_id,
1804 sdom->weight,
1805 sdom->nr_vcpus);
1807 list_for_each( iter_svc, &sdom->vcpu )
1809 struct csched_vcpu *svc;
1810 svc = list_entry(iter_svc, struct csched_vcpu, sdom_elem);
1812 printk("\t%3d: ", ++loop);
1813 csched_dump_vcpu(svc);
1818 static void activate_runqueue(struct csched_private *prv, int rqi)
1820 struct csched_runqueue_data *rqd;
1822 rqd = prv->rqd + rqi;
1824 BUG_ON(!cpus_empty(rqd->active));
1826 rqd->max_weight = 1;
1827 rqd->id = rqi;
1828 INIT_LIST_HEAD(&rqd->svc);
1829 INIT_LIST_HEAD(&rqd->runq);
1830 spin_lock_init(&rqd->lock);
1832 cpu_set(rqi, prv->active_queues);
1835 static void deactivate_runqueue(struct csched_private *prv, int rqi)
1837 struct csched_runqueue_data *rqd;
1839 rqd = prv->rqd + rqi;
1841 BUG_ON(!cpus_empty(rqd->active));
1843 rqd->id = -1;
1845 cpu_clear(rqi, prv->active_queues);
1848 static void init_pcpu(const struct scheduler *ops, int cpu)
1850 int rqi, old_rqi, flags;
1851 struct csched_private *prv = CSCHED_PRIV(ops);
1852 struct csched_runqueue_data *rqd;
1853 spinlock_t *old_lock;
1855 spin_lock_irqsave(&prv->lock, flags);
1857 if ( cpu_isset(cpu, prv->initialized) )
1859 printk("%s: Strange, cpu %d already initialized!\n", __func__, cpu);
1860 spin_unlock_irqrestore(&prv->lock, flags);
1861 return;
1864 old_rqi = prv->runq_map[cpu];
1866 /* Figure out which runqueue to put it in */
1870 /* NB: cpu 0 doesn't get a STARTING callback, so we hard-code it to runqueue 0. */
1871 if ( cpu == 0 )
1872 rqi = 0;
1873 else
1874 rqi = cpu_to_socket(cpu);
1876 if ( rqi < 0 )
1878 printk("%s: cpu_to_socket(%d) returned %d!\n",
1879 __func__, cpu, rqi);
1880 BUG();
1883 rqd=prv->rqd + rqi;
1885 printk("Adding cpu %d to runqueue %d\n", cpu, rqi);
1886 if ( ! cpu_isset(rqi, prv->active_queues) )
1888 printk(" First cpu on runqueue, activating\n");
1889 activate_runqueue(prv, rqi);
1892 /* IRQs already disabled */
1893 old_lock=pcpu_schedule_lock(cpu);
1895 /* Move spinlock to new runq lock. */
1896 per_cpu(schedule_data, cpu).schedule_lock = &rqd->lock;
1898 /* Set the runqueue map */
1899 prv->runq_map[cpu]=rqi;
1901 cpu_set(cpu, rqd->idle);
1902 cpu_set(cpu, rqd->active);
1904 spin_unlock(old_lock);
1906 cpu_set(cpu, prv->initialized);
1908 spin_unlock_irqrestore(&prv->lock, flags);
1910 return;
1913 static void *
1914 csched_alloc_pdata(const struct scheduler *ops, int cpu)
1916 /* Check to see if the cpu is online yet */
1917 /* Note: cpu 0 doesn't get a STARTING callback */
1918 if ( cpu == 0 || cpu_to_socket(cpu) >= 0 )
1919 init_pcpu(ops, cpu);
1920 else
1921 printk("%s: cpu %d not online yet, deferring initialization\n",
1922 __func__, cpu);
1924 return (void *)1;
1927 static void
1928 csched_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
1930 unsigned long flags;
1931 struct csched_private *prv = CSCHED_PRIV(ops);
1932 struct csched_runqueue_data *rqd;
1933 int rqi;
1935 spin_lock_irqsave(&prv->lock, flags);
1937 BUG_ON( !cpu_isset(cpu, prv->initialized));
1939 /* Find the old runqueue and remove this cpu from it */
1940 rqi = prv->runq_map[cpu];
1942 rqd = prv->rqd + rqi;
1944 /* No need to save IRQs here, they're already disabled */
1945 spin_lock(&rqd->lock);
1947 BUG_ON(!cpu_isset(cpu, rqd->idle));
1949 printk("Removing cpu %d from runqueue %d\n", cpu, rqi);
1951 cpu_clear(cpu, rqd->idle);
1952 cpu_clear(cpu, rqd->active);
1954 if ( cpus_empty(rqd->active) )
1956 printk(" No cpus left on runqueue, disabling\n");
1957 deactivate_runqueue(prv, rqi);
1960 spin_unlock(&rqd->lock);
1962 cpu_clear(cpu, prv->initialized);
1964 spin_unlock_irqrestore(&prv->lock, flags);
1966 return;
1969 static int
1970 csched_cpu_starting(int cpu)
1972 struct scheduler *ops;
1974 /* Hope this is safe from cpupools switching things around. :-) */
1975 ops = per_cpu(scheduler, cpu);
1977 init_pcpu(ops, cpu);
1979 return NOTIFY_DONE;
1982 static int cpu_credit2_callback(
1983 struct notifier_block *nfb, unsigned long action, void *hcpu)
1985 unsigned int cpu = (unsigned long)hcpu;
1986 int rc = 0;
1988 switch ( action )
1990 case CPU_STARTING:
1991 csched_cpu_starting(cpu);
1992 break;
1993 default:
1994 break;
1997 return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
2000 static struct notifier_block cpu_credit2_nfb = {
2001 .notifier_call = cpu_credit2_callback
2002 };
2004 static int
2005 csched_init(struct scheduler *ops)
2007 int i;
2008 struct csched_private *prv;
2010 printk("Initializing Credit2 scheduler\n" \
2011 " WARNING: This is experimental software in development.\n" \
2012 " Use at your own risk.\n");
2014 printk(" load_window_shift: %d\n", opt_load_window_shift);
2015 printk(" underload_balance_tolerance: %d\n", opt_underload_balance_tolerance);
2016 printk(" overload_balance_tolerance: %d\n", opt_overload_balance_tolerance);
2018 if ( opt_load_window_shift < LOADAVG_WINDOW_SHIFT_MIN )
2020 printk("%s: opt_load_window_shift %d below min %d, resetting\n",
2021 __func__, opt_load_window_shift, LOADAVG_WINDOW_SHIFT_MIN);
2022 opt_load_window_shift = LOADAVG_WINDOW_SHIFT_MIN;
2025 /* Basically no CPU information is available at this point; just
2026 * set up basic structures, and a callback when the CPU info is
2027 * available. */
2029 prv = xmalloc(struct csched_private);
2030 if ( prv == NULL )
2031 return -ENOMEM;
2032 memset(prv, 0, sizeof(*prv));
2033 ops->sched_data = prv;
2034 spin_lock_init(&prv->lock);
2035 INIT_LIST_HEAD(&prv->sdom);
2037 register_cpu_notifier(&cpu_credit2_nfb);
2039 /* But un-initialize all runqueues */
2040 for ( i=0; i<NR_CPUS; i++)
2042 prv->runq_map[i] = -1;
2043 prv->rqd[i].id = -1;
2046 prv->load_window_shift = opt_load_window_shift;
2048 return 0;
2051 static void
2052 csched_deinit(const struct scheduler *ops)
2054 struct csched_private *prv;
2056 prv = CSCHED_PRIV(ops);
2057 if ( prv != NULL )
2058 xfree(prv);
2062 static struct csched_private _csched_priv;
2064 const struct scheduler sched_credit2_def = {
2065 .name = "SMP Credit Scheduler rev2",
2066 .opt_name = "credit2",
2067 .sched_id = XEN_SCHEDULER_CREDIT2,
2068 .sched_data = &_csched_priv,
2070 .init_domain = csched_dom_init,
2071 .destroy_domain = csched_dom_destroy,
2073 .insert_vcpu = csched_vcpu_insert,
2074 .remove_vcpu = csched_vcpu_remove,
2076 .sleep = csched_vcpu_sleep,
2077 .wake = csched_vcpu_wake,
2079 .adjust = csched_dom_cntl,
2081 .pick_cpu = csched_cpu_pick,
2082 .do_schedule = csched_schedule,
2083 .context_saved = csched_context_saved,
2085 .dump_cpu_state = csched_dump_pcpu,
2086 .dump_settings = csched_dump,
2087 .init = csched_init,
2088 .deinit = csched_deinit,
2089 .alloc_vdata = csched_alloc_vdata,
2090 .free_vdata = csched_free_vdata,
2091 .alloc_pdata = csched_alloc_pdata,
2092 .free_pdata = csched_free_pdata,
2093 .alloc_domdata = csched_alloc_domdata,
2094 .free_domdata = csched_free_domdata,
2095 };