
xen/common/sched_bvt.c @ 3650:beb0887c54bc

bitkeeper revision 1.1159.238.1 (4200c8d8KsGlaM3w6o3y4GHhK1jKjg)

A typesafe allocator submitted by Rusty Russel with trivial renames by me.
Signed-off-by: Rusty Russel <rusty@rustcorp.com.au> (authored)
Signed-off-by: ian.pratt@cl.cam.ac.uk

author:   iap10@labyrinth.cl.cam.ac.uk
date:     Wed Feb 02 12:34:32 2005 +0000 (2005-02-02)
parents:  c6f1bab39d4f
children: f98fa170a9f4
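The "typesafe allocator" referred to in the description is the xmalloc() macro family used later in this file: bvt_init_scheduler() allocates its per-CPU state with xmalloc(struct bvt_cpu_info), i.e. by naming a type rather than a byte count. As a rough sketch of the idea only (the names _sketch_alloc/sketch_alloc below are illustrative, not the actual Xen definitions, and __alignof__ is a GCC extension), such a wrapper can be layered over any byte-oriented allocator:

    #include <stdlib.h>

    /* Underlying byte allocator; alignment handling is omitted in this sketch. */
    static void *_sketch_alloc(size_t size, size_t align)
    {
        (void)align;
        return malloc(size);
    }

    /* Type-safe front end: the caller names a type and gets back a correctly
     * typed pointer, with size and alignment derived automatically. */
    #define sketch_alloc(_type) \
        ((_type *)_sketch_alloc(sizeof(_type), __alignof__(_type)))

    /* Usage, mirroring the call in bvt_init_scheduler():
     *     struct bvt_cpu_info *info = sketch_alloc(struct bvt_cpu_info);
     */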
line source
/* -*-  Mode:C; c-basic-offset:4; tab-width:4 -*-
 ****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/sched_bvt.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: CPU scheduling
 *              Implements the Borrowed Virtual Time (BVT) scheduler
 *              (see Duda & Cheriton, SOSP'99).
 */
#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/ac_timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/slab.h>
#include <xen/softirq.h>
/* All per-domain BVT-specific scheduling info is stored here. */
struct bvt_edom_info
{
    struct list_head    run_list;  /* runqueue list pointers */
    u32                 avt;       /* actual virtual time */
    u32                 evt;       /* effective virtual time */
    struct exec_domain  *exec_domain;
    struct bvt_dom_info *inf;
};

struct bvt_dom_info
{
    struct domain       *domain;      /* domain this info belongs to */
    u32                 mcu_advance;  /* inverse of weight */
    int                 warpback;     /* warp? */
    int                 warp;         /* warp set and within the warp limits */
    s32                 warp_value;   /* virtual time warp */
    s_time_t            warpl;        /* warp limit */
    struct ac_timer     warp_timer;   /* deals with warpl */
    s_time_t            warpu;        /* unwarp time requirement */
    struct ac_timer     unwarp_timer; /* deals with warpu */

    struct bvt_edom_info ed_inf[MAX_VIRT_CPUS];
};

struct bvt_cpu_info
{
    struct list_head runqueue;
    unsigned long    svt;
};

#define BVT_INFO(p)   ((struct bvt_dom_info *)(p)->sched_priv)
#define EBVT_INFO(p)  ((struct bvt_edom_info *)(p)->ed_sched_priv)
#define CPU_INFO(cpu) ((struct bvt_cpu_info *)(schedule_data[cpu]).sched_priv)
#define RUNLIST(p)    ((struct list_head *)&(EBVT_INFO(p)->run_list))
#define RUNQUEUE(cpu) ((struct list_head *)&(CPU_INFO(cpu)->runqueue))
#define CPU_SVT(cpu)  (CPU_INFO(cpu)->svt)

#define MCU            (s32)MICROSECS(100)  /* Minimum unit */
#define MCU_ADVANCE    10                   /* default weight */
#define TIME_SLOP      (s32)MICROSECS(50)   /* allow time to slip a bit */
static s32 ctx_allow = (s32)MILLISECS(5);   /* context switch allowance */

static xmem_cache_t *dom_info_cache;
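/*
 * Summary of the knobs above: mcu_advance is the inverse of a domain's
 * weight, so a larger value makes the domain's virtual time advance faster
 * and gives it a smaller CPU share.  "Warping" temporarily subtracts
 * warp_value from a domain's virtual time so it sorts earlier in the
 * runqueue; this is BVT's low-latency mechanism.  warpl bounds how long a
 * domain may stay warped, and warpu enforces a minimum unwarped interval
 * before it may warp again.  ctx_allow is the context-switch allowance:
 * the extra real time the currently running domain is granted before a
 * competing domain with a lower evt will preempt it.
 */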
static inline void __add_to_runqueue_head(struct exec_domain *d)
{
    list_add(RUNLIST(d), RUNQUEUE(d->processor));
}

static inline void __add_to_runqueue_tail(struct exec_domain *d)
{
    list_add_tail(RUNLIST(d), RUNQUEUE(d->processor));
}

static inline void __del_from_runqueue(struct exec_domain *d)
{
    struct list_head *runlist = RUNLIST(d);
    list_del(runlist);
    runlist->next = NULL;
}

static inline int __task_on_runqueue(struct exec_domain *d)
{
    return (RUNLIST(d))->next != NULL;
}
/* Warp/unwarp timer functions. */
static void warp_timer_fn(unsigned long pointer)
{
    struct bvt_dom_info *inf = (struct bvt_dom_info *)pointer;
    unsigned int cpu = inf->domain->exec_domain[0]->processor;

    spin_lock_irq(&schedule_data[cpu].schedule_lock);

    inf->warp = 0;

    /* unwarp equal to zero => stop warping */
    if ( inf->warpu == 0 )
    {
        inf->warpback = 0;
        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
    }

    /* set unwarp timer */
    inf->unwarp_timer.expires = NOW() + inf->warpu;
    add_ac_timer(&inf->unwarp_timer);

    spin_unlock_irq(&schedule_data[cpu].schedule_lock);
}

static void unwarp_timer_fn(unsigned long pointer)
{
    struct bvt_dom_info *inf = (struct bvt_dom_info *)pointer;
    unsigned int cpu = inf->domain->exec_domain[0]->processor;

    spin_lock_irq(&schedule_data[cpu].schedule_lock);

    if ( inf->warpback )
    {
        inf->warp = 1;
        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
    }

    spin_unlock_irq(&schedule_data[cpu].schedule_lock);
}
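/*
 * Together these two callbacks implement the warp limits: once a domain
 * has run warped for warpl, warp_timer_fn() clears inf->warp and arms the
 * unwarp timer; after a further warpu of real time, unwarp_timer_fn()
 * re-enables warping (provided the domain still has warpback set) and
 * kicks the scheduler so the change takes effect.  A warpu of zero means
 * warping is simply switched off once the limit is hit.
 */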
static inline u32 calc_avt(struct exec_domain *d, s_time_t now)
{
    u32 ranfor, mcus;
    struct bvt_dom_info *inf = BVT_INFO(d->domain);
    struct bvt_edom_info *einf = EBVT_INFO(d);

    ranfor = (u32)(now - d->lastschd);
    mcus = (ranfor + MCU - 1)/MCU;

    return einf->avt + mcus * inf->mcu_advance;
}

/*
 * Calculate the effective virtual time for a domain. Take into account
 * warping limits.
 */
static inline u32 calc_evt(struct exec_domain *d, u32 avt)
{
    struct bvt_dom_info *inf = BVT_INFO(d->domain);
    /* TODO: the warp routines need to be rewritten. GM */

    if ( inf->warp )
        return avt - inf->warp_value;
    else
        return avt;
}
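/*
 * Worked example of the accounting above, using the defaults defined
 * earlier in this file: with MCU = 100us and mcu_advance = MCU_ADVANCE
 * = 10, a domain that has just run for 350us is charged
 * mcus = ceil(350us / 100us) = 4 MCUs, so calc_avt() advances its actual
 * virtual time by 4 * 10 = 40.  calc_evt() then subtracts warp_value from
 * that avt while the domain is warped; otherwise evt == avt.  All runqueue
 * ordering and timeslice decisions below are made on evt.
 */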
/**
 * bvt_alloc_task - allocate BVT private structures for a task
 * @ed: exec_domain to allocate private structures for
 *
 * Returns non-zero on failure.
 */
int bvt_alloc_task(struct exec_domain *ed)
{
    struct domain *d = ed->domain;
    if ( (d->sched_priv == NULL) ) {
        if ( (d->sched_priv = xmem_cache_alloc(dom_info_cache)) == NULL )
            return -1;
        memset(d->sched_priv, 0, sizeof(struct bvt_dom_info));
    }
    ed->ed_sched_priv = &BVT_INFO(d)->ed_inf[ed->eid];
    BVT_INFO(d)->ed_inf[ed->eid].inf = BVT_INFO(d);
    BVT_INFO(d)->ed_inf[ed->eid].exec_domain = ed;
    return 0;
}
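/*
 * Note on the layout: the per-domain bvt_dom_info is allocated once from
 * dom_info_cache (on the domain's first exec_domain) and is shared by all
 * of that domain's exec_domains; each exec_domain's private bvt_edom_info
 * is simply the ed_inf[] slot indexed by its eid inside that shared
 * structure.
 */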
/*
 * Add and remove a domain
 */
void bvt_add_task(struct exec_domain *d)
{
    struct bvt_dom_info *inf = BVT_INFO(d->domain);
    struct bvt_edom_info *einf = EBVT_INFO(d);
    ASSERT(inf != NULL);
    ASSERT(d != NULL);

    if (d->eid == 0) {
        inf->mcu_advance = MCU_ADVANCE;
        inf->domain      = d->domain;
        inf->warpback    = 0;
        /* Set some default values here. */
        inf->warp        = 0;
        inf->warp_value  = 0;
        inf->warpl       = MILLISECS(2000);
        inf->warpu       = MILLISECS(1000);
        /* Initialise the timers. */
        init_ac_timer(&inf->warp_timer);
        inf->warp_timer.cpu      = d->processor;
        inf->warp_timer.data     = (unsigned long)inf;
        inf->warp_timer.function = &warp_timer_fn;
        init_ac_timer(&inf->unwarp_timer);
        inf->unwarp_timer.cpu      = d->processor;
        inf->unwarp_timer.data     = (unsigned long)inf;
        inf->unwarp_timer.function = &unwarp_timer_fn;
    }

    einf->exec_domain = d;

    if ( d->domain->id == IDLE_DOMAIN_ID )
    {
        einf->avt = einf->evt = ~0U;
    }
    else
    {
        /* Set avt and evt to system virtual time. */
        einf->avt = CPU_SVT(d->processor);
        einf->evt = CPU_SVT(d->processor);
    }
}

int bvt_init_idle_task(struct exec_domain *p)
{
    if ( bvt_alloc_task(p) < 0 )
        return -1;

    bvt_add_task(p);

    set_bit(EDF_RUNNING, &p->ed_flags);
    if ( !__task_on_runqueue(p) )
        __add_to_runqueue_head(p);

    return 0;
}
void bvt_wake(struct exec_domain *d)
{
    struct bvt_edom_info *einf = EBVT_INFO(d);
    struct exec_domain *curr;
    s_time_t now, r_time;
    int cpu = d->processor;
    u32 curr_evt;

    if ( unlikely(__task_on_runqueue(d)) )
        return;

    __add_to_runqueue_head(d);

    now = NOW();

    /* Set the BVT parameters. AVT should always be updated
       if CPU migration occurred. */
    if ( einf->avt < CPU_SVT(cpu) ||
         unlikely(test_bit(EDF_MIGRATED, &d->ed_flags)) )
        einf->avt = CPU_SVT(cpu);

    /* Deal with warping here. */
    einf->evt = calc_evt(d, einf->avt);

    curr = schedule_data[cpu].curr;
    curr_evt = calc_evt(curr, calc_avt(curr, now));
    /* Calculate the time the current domain would run assuming
       the second smallest evt is that of the newly woken domain. */
    r_time = curr->lastschd +
        ((einf->evt - curr_evt) / BVT_INFO(curr->domain)->mcu_advance) +
        ctx_allow;

    if ( is_idle_task(curr->domain) || (einf->evt <= curr_evt) )
        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
    else if ( schedule_data[cpu].s_timer.expires > r_time )
        mod_ac_timer(&schedule_data[cpu].s_timer, r_time);
}
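/*
 * In summary, on wake-up the newly runnable domain either preempts the
 * running domain straight away (if the CPU is idle, or if the woken
 * domain's effective virtual time is no greater than the running domain's
 * projected evt), or, if the per-CPU scheduler timer would otherwise fire
 * later, the timer is pulled forward to r_time: the point at which the
 * running domain's evt is expected to overtake the woken domain's evt,
 * plus the context-switch allowance ctx_allow.
 */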
static void bvt_sleep(struct exec_domain *d)
{
    if ( test_bit(EDF_RUNNING, &d->ed_flags) )
        cpu_raise_softirq(d->processor, SCHEDULE_SOFTIRQ);
    else if ( __task_on_runqueue(d) )
        __del_from_runqueue(d);
}

/**
 * bvt_free_task - free BVT private structures for a task
 * @d: task
 */
void bvt_free_task(struct domain *d)
{
    ASSERT(d->sched_priv != NULL);
    xmem_cache_free(dom_info_cache, d->sched_priv);
}

/* Control the scheduler. */
int bvt_ctl(struct sched_ctl_cmd *cmd)
{
    struct bvt_ctl *params = &cmd->u.bvt;

    if ( cmd->direction == SCHED_INFO_PUT )
        ctx_allow = params->ctx_allow;
    else
        params->ctx_allow = ctx_allow;

    return 0;
}
/* Adjust scheduling parameters for a given domain. */
int bvt_adjdom(
    struct domain *d, struct sched_adjdom_cmd *cmd)
{
    struct bvt_adjdom *params = &cmd->u.bvt;

    if ( cmd->direction == SCHED_INFO_PUT )
    {
        u32 mcu_adv = params->mcu_adv;
        u32 warpback = params->warpback;
        s32 warpvalue = params->warpvalue;
        s_time_t warpl = params->warpl;
        s_time_t warpu = params->warpu;

        struct bvt_dom_info *inf = BVT_INFO(d);

        /* Sanity check -- this avoids a divide-by-zero below. */
        if ( (mcu_adv == 0) || (warpl < 0) || (warpu < 0) )
            return -EINVAL;

        inf->mcu_advance = mcu_adv;
        inf->warpback = warpback;
        /* The warp state should mirror warpback. */
        inf->warp = warpback;
        inf->warp_value = warpvalue;
        inf->warpl = MILLISECS(warpl);
        inf->warpu = MILLISECS(warpu);

        /* If the unwarp timer is already set up it needs to be removed. */
        rem_ac_timer(&inf->unwarp_timer);
        /* If we stop warping, the warp timer needs to be removed as well. */
        if ( !warpback )
            rem_ac_timer(&inf->warp_timer);
    }
    else if ( cmd->direction == SCHED_INFO_GET )
    {
        struct bvt_dom_info *inf = BVT_INFO(d);
        params->mcu_adv = inf->mcu_advance;
        params->warpvalue = inf->warp_value;
        params->warpback = inf->warpback;
        params->warpl = inf->warpl;
        params->warpu = inf->warpu;
    }

    return 0;
}
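/*
 * For illustration only (the control tools normally reach this function
 * through the generic scheduling layer rather than calling it directly),
 * setting a domain's BVT parameters amounts to filling in the union used
 * above:
 *
 *     struct sched_adjdom_cmd cmd;
 *     cmd.direction       = SCHED_INFO_PUT;
 *     cmd.u.bvt.mcu_adv   = MCU_ADVANCE;   /- default weight -/
 *     cmd.u.bvt.warpback  = 1;             /- allow warping -/
 *     cmd.u.bvt.warpvalue = 100;           /- virtual-time boost -/
 *     cmd.u.bvt.warpl     = 2000;          /- warp limit, in ms -/
 *     cmd.u.bvt.warpu     = 1000;          /- unwarp requirement, in ms -/
 *     bvt_adjdom(d, &cmd);
 */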
/*
 * The main function:
 * - deschedule the current domain.
 * - pick a new domain, i.e. the domain with the lowest EVT.
 *   The runqueue should be ordered by EVT so that this is easy.
 */
static task_slice_t bvt_do_schedule(s_time_t now)
{
    struct domain *d;
    struct exec_domain *prev = current, *next = NULL, *next_prime, *ed;
    int cpu = prev->processor;
    s32 r_time;     /* time for new dom to run */
    u32 next_evt, next_prime_evt, min_avt;
    struct bvt_dom_info  *prev_inf         = BVT_INFO(prev->domain);
    struct bvt_edom_info *prev_einf        = EBVT_INFO(prev);
    struct bvt_edom_info *p_einf           = NULL;
    struct bvt_edom_info *next_einf        = NULL;
    struct bvt_edom_info *next_prime_einf  = NULL;
    task_slice_t ret;

    ASSERT(prev->ed_sched_priv != NULL);
    ASSERT(prev_einf != NULL);
    ASSERT(__task_on_runqueue(prev));

    if ( likely(!is_idle_task(prev->domain)) )
    {
        prev_einf->avt = calc_avt(prev, now);
        prev_einf->evt = calc_evt(prev, prev_einf->avt);

        if ( prev_inf->warpback && (prev_inf->warpl > 0) )
            rem_ac_timer(&prev_inf->warp_timer);

        __del_from_runqueue(prev);

        if ( domain_runnable(prev) )
            __add_to_runqueue_tail(prev);
    }

    /* We should at least have the idle task. */
    ASSERT(!list_empty(RUNQUEUE(cpu)));

    /*
     * Scan through the run queue and pick the task with the lowest evt
     * *and* the task with the second lowest evt.
     * This code is O(n), but we expect n to be small.
     */
    next_einf       = EBVT_INFO(schedule_data[cpu].idle);
    next_prime_einf = NULL;

    next_evt        = ~0U;
    next_prime_evt  = ~0U;
    min_avt         = ~0U;

    list_for_each_entry ( p_einf, RUNQUEUE(cpu), run_list )
    {
        if ( p_einf->evt < next_evt )
        {
            next_prime_einf = next_einf;
            next_prime_evt  = next_evt;
            next_einf       = p_einf;
            next_evt        = p_einf->evt;
        }
        else if ( next_prime_evt == ~0U )
        {
            next_prime_evt  = p_einf->evt;
            next_prime_einf = p_einf;
        }
        else if ( p_einf->evt < next_prime_evt )
        {
            next_prime_evt  = p_einf->evt;
            next_prime_einf = p_einf;
        }

        /* Determine system virtual time. */
        if ( p_einf->avt < min_avt )
            min_avt = p_einf->avt;
    }

    if ( next_einf->inf->warp && (next_einf->inf->warpl > 0) )
    {
        /* Set the timer up. */
        next_einf->inf->warp_timer.expires = now + next_einf->inf->warpl;
        /* Add it to the heap. */
        add_ac_timer(&next_einf->inf->warp_timer);
    }

    /* Extract the domain pointers from the dom infos. */
    next       = next_einf->exec_domain;
    next_prime = next_prime_einf->exec_domain;

    /* Update system virtual time. */
    if ( min_avt != ~0U )
        CPU_SVT(cpu) = min_avt;

    /* Check for virtual time overrun on this cpu. */
    if ( CPU_SVT(cpu) >= 0xf0000000 )
    {
        ASSERT(!local_irq_is_enabled());

        write_lock(&domlist_lock);

        for_each_domain ( d )
        {
            for_each_exec_domain ( d, ed ) {
                if ( ed->processor == cpu )
                {
                    p_einf = EBVT_INFO(ed);
                    p_einf->evt -= 0xe0000000;
                    p_einf->avt -= 0xe0000000;
                }
            }
        }

        write_unlock(&domlist_lock);

        CPU_SVT(cpu) -= 0xe0000000;
    }

    /* Work out the time for the next run through the scheduler. */
    if ( is_idle_task(next->domain) )
    {
        r_time = ctx_allow;
        goto sched_done;
    }

    if ( (next_prime == NULL) || is_idle_task(next_prime->domain) )
    {
        /* We have only one runnable task besides the idle task. */
        r_time = 10 * ctx_allow;    /* RN: random constant */
        goto sched_done;
    }

    /*
     * If we are here then we have two runnable tasks.
     * Work out how long 'next' can run till its evt is greater than
     * 'next_prime's evt. Take the context switch allowance into account.
     */
    ASSERT(next_prime_einf->evt >= next_einf->evt);

    r_time = ((next_prime_einf->evt - next_einf->evt) /
              next_einf->inf->mcu_advance) + ctx_allow;

    ASSERT(r_time >= ctx_allow);

 sched_done:
    ret.task = next;
    ret.time = r_time;
    return ret;
}
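/*
 * To summarise the slice computation above: an idle 'next' gets only
 * ctx_allow; a lone runnable task gets 10 * ctx_allow; otherwise 'next'
 * runs until its evt is expected to pass next_prime's evt, plus the
 * context-switch allowance.  The 0xe0000000 adjustment earlier simply
 * rebases every avt/evt on this CPU (and the CPU's svt) before the u32
 * virtual-time values can wrap around.
 */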
static void bvt_dump_runq_el(struct exec_domain *p)
{
    struct bvt_edom_info *inf = EBVT_INFO(p);

    printk("mcua=%d ev=0x%08X av=0x%08X ",
           inf->inf->mcu_advance, inf->evt, inf->avt);
}

static void bvt_dump_settings(void)
{
    printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns ", (u32)MCU, (s32)ctx_allow);
}

static void bvt_dump_cpu_state(int i)
{
    struct list_head *queue;
    int loop = 0;
    struct bvt_edom_info *d_inf;
    struct exec_domain *d;

    printk("svt=0x%08lX ", CPU_SVT(i));

    queue = RUNQUEUE(i);
    printk("QUEUE rq %lx n: %lx, p: %lx\n", (unsigned long)queue,
           (unsigned long)queue->next, (unsigned long)queue->prev);

    list_for_each_entry ( d_inf, queue, run_list )
    {
        d = d_inf->exec_domain;
        printk("%3d: %u has=%c ", loop++, d->domain->id,
               test_bit(EDF_RUNNING, &d->ed_flags) ? 'T':'F');
        bvt_dump_runq_el(d);
        printk("c=0x%X%08X\n", (u32)(d->cpu_time>>32), (u32)d->cpu_time);
        printk("         l: %p n: %p p: %p\n",
               &d_inf->run_list, d_inf->run_list.next, d_inf->run_list.prev);
    }
}
/* Initialise the data structures. */
int bvt_init_scheduler()
{
    int i;

    for ( i = 0; i < NR_CPUS; i++ )
    {
        schedule_data[i].sched_priv = xmalloc(struct bvt_cpu_info);

        if ( schedule_data[i].sched_priv == NULL )
        {
            printk("Failed to allocate BVT scheduler per-CPU memory!\n");
            return -1;
        }

        INIT_LIST_HEAD(RUNQUEUE(i));

        CPU_SVT(i) = 0; /* XXX do I really need to do this? */
    }

    dom_info_cache = xmem_cache_create(
        "BVT dom info", sizeof(struct bvt_dom_info), 0, 0, NULL, NULL);
    if ( dom_info_cache == NULL )
    {
        printk("BVT: Failed to allocate domain info SLAB cache");
        return -1;
    }

    return 0;
}
struct scheduler sched_bvt_def = {
    .name           = "Borrowed Virtual Time",
    .opt_name       = "bvt",
    .sched_id       = SCHED_BVT,

    .init_scheduler = bvt_init_scheduler,
    .init_idle_task = bvt_init_idle_task,
    .alloc_task     = bvt_alloc_task,
    .add_task       = bvt_add_task,
    .free_task      = bvt_free_task,
    .do_schedule    = bvt_do_schedule,
    .control        = bvt_ctl,
    .adjdom         = bvt_adjdom,
    .dump_settings  = bvt_dump_settings,
    .dump_cpu_state = bvt_dump_cpu_state,
    .sleep          = bvt_sleep,
    .wake           = bvt_wake,
};
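The sched_bvt_def structure above is how this implementation plugs into Xen's generic scheduling layer (common/schedule.c), which dispatches through these hooks rather than calling the BVT functions directly. In the Xen trees of this era the generic code selects a scheduler by matching opt_name against the hypervisor's sched= boot parameter, so, assuming that mechanism, this scheduler would typically be chosen with a boot line containing:

    sched=bvt

The per-domain knobs exposed through bvt_adjdom() (mcu_advance, warpback, warp_value, warpl, warpu) and the global ctx_allow exposed through bvt_ctl() are then driven from the control tools via the generic scheduler-control commands whose structures (sched_ctl_cmd, sched_adjdom_cmd) appear in the function signatures above.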