debuggers.hg

view xen/common/sched_bvt.c @ 2633:fe2f4bbcf869

bitkeeper revision 1.1159.99.4 (41626f06VquclgVVpIeHy9z2K3jW-A)

Rationalise scheduler locking. A bit more conservative now, but much
simpler! I only applied this to the basic BVT scheduler -- the others
are still unsafe and have been removed from the basic build.
author kaf24@freefall.cl.cam.ac.uk
date Tue Oct 05 09:53:10 2004 +0000 (2004-10-05)
parents 49739c6ac967
children 92fff25bf21e
line source
1 /* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
2 ****************************************************************************
3 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
4 * (C) 2002-2003 University of Cambridge
5 * (C) 2004 - Mark Williamson - Intel Research Cambridge
6 ****************************************************************************
7 *
8 * File: common/schedule.c
9 * Author: Rolf Neugebauer & Keir Fraser
10 * Updated for generic API by Mark Williamson
11 *
12 * Description: CPU scheduling
13 * implements A Borrowed Virtual Time scheduler.
14 * (see Duda & Cheriton SOSP'99)
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/sched.h>
21 #include <xen/delay.h>
22 #include <xen/event.h>
23 #include <xen/time.h>
24 #include <xen/ac_timer.h>
25 #include <xen/perfc.h>
26 #include <xen/sched-if.h>
27 #include <xen/slab.h>
28 #include <xen/softirq.h>
/*
 * All per-domain BVT-specific scheduling info is stored here.
 * One instance is allocated per domain from dom_info_cache and hung off
 * domain->sched_priv (see BVT_INFO()).
 */
struct bvt_dom_info
{
    struct domain       *domain;          /* domain this info belongs to */
    struct list_head    run_list;         /* runqueue list pointers */
    u32                 mcu_advance;      /* inverse of weight: AVT units
                                             charged per MCU of CPU time */
    u32                 avt;              /* actual virtual time */
    u32                 evt;              /* effective virtual time
                                             (avt minus warp, if warping) */
    int                 warpback;         /* warping enabled for domain? */
    int                 warp;             /* warp set and within the warp
                                             limits */
    s32                 warp_value;       /* virtual time warp amount */
    s_time_t            warpl;            /* warp limit: max continuous
                                             warped time */
    struct ac_timer     warp_timer;      /* deals with warpl expiry */
    s_time_t            warpu;            /* unwarp time requirement: min
                                             time between warp periods */
    struct ac_timer     unwarp_timer;    /* deals with warpu expiry */
};
/*
 * Per-CPU BVT state, hung off schedule_data[cpu].sched_priv
 * (see CPU_INFO()).
 */
struct bvt_cpu_info
{
    struct list_head runqueue;  /* list of runnable bvt_dom_info.run_list */
    unsigned long    svt;       /* system virtual time: min AVT seen on
                                   this CPU's runqueue */
};
/* Accessors for the scheduler-private state declared above. */
#define BVT_INFO(p)   ((struct bvt_dom_info *)(p)->sched_priv)
#define CPU_INFO(cpu) ((struct bvt_cpu_info *)(schedule_data[cpu]).sched_priv)
#define RUNLIST(p)    ((struct list_head *)&(BVT_INFO(p)->run_list))
#define RUNQUEUE(cpu) ((struct list_head *)&(CPU_INFO(cpu)->runqueue))
#define CPU_SVT(cpu)  (CPU_INFO(cpu)->svt)
#define MCU         (s32)MICROSECS(100)     /* Minimum unit of accounting */
#define MCU_ADVANCE 10                      /* default weight (inverse) */
#define TIME_SLOP   (s32)MICROSECS(50)      /* allow time to slip a bit */
/* Context switch allowance: tunable at run time via bvt_ctl(). */
static s32 ctx_allow = (s32)MILLISECS(5);

/* SLAB cache from which struct bvt_dom_info instances are allocated. */
static xmem_cache_t *dom_info_cache;
67 static inline void __add_to_runqueue_head(struct domain *d)
68 {
69 list_add(RUNLIST(d), RUNQUEUE(d->processor));
70 }
72 static inline void __add_to_runqueue_tail(struct domain *d)
73 {
74 list_add_tail(RUNLIST(d), RUNQUEUE(d->processor));
75 }
77 static inline void __del_from_runqueue(struct domain *d)
78 {
79 struct list_head *runlist = RUNLIST(d);
80 list_del(runlist);
81 runlist->next = NULL;
82 }
84 static inline int __task_on_runqueue(struct domain *d)
85 {
86 return (RUNLIST(d))->next != NULL;
87 }
90 /* Warp/unwarp timer functions */
91 static void warp_timer_fn(unsigned long pointer)
92 {
93 struct bvt_dom_info *inf = (struct bvt_dom_info *)pointer;
94 unsigned int cpu = inf->domain->processor;
96 spin_lock_irq(&schedule_data[cpu].schedule_lock);
98 inf->warp = 0;
100 /* unwarp equal to zero => stop warping */
101 if ( inf->warpu == 0 )
102 {
103 inf->warpback = 0;
104 cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
105 }
107 /* set unwarp timer */
108 inf->unwarp_timer.expires = NOW() + inf->warpu;
109 add_ac_timer(&inf->unwarp_timer);
111 spin_unlock_irq(&schedule_data[cpu].schedule_lock);
112 }
114 static void unwarp_timer_fn(unsigned long pointer)
115 {
116 struct bvt_dom_info *inf = (struct bvt_dom_info *)pointer;
117 unsigned int cpu = inf->domain->processor;
119 spin_lock_irq(&schedule_data[cpu].schedule_lock);
121 if ( inf->warpback )
122 {
123 inf->warp = 1;
124 cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
125 }
127 spin_unlock_irq(&schedule_data[cpu].schedule_lock);
128 }
130 static inline u32 calc_avt(struct domain *d, s_time_t now)
131 {
132 u32 ranfor, mcus;
133 struct bvt_dom_info *inf = BVT_INFO(d);
135 ranfor = (u32)(now - d->lastschd);
136 mcus = (ranfor + MCU - 1)/MCU;
138 return inf->avt + mcus * inf->mcu_advance;
139 }
141 /*
142 * Calculate the effective virtual time for a domain. Take into account
143 * warping limits
144 */
145 static inline u32 calc_evt(struct domain *d, u32 avt)
146 {
147 struct bvt_dom_info *inf = BVT_INFO(d);
148 /* TODO The warp routines need to be rewritten GM */
150 if ( inf->warp )
151 return avt - inf->warp_value;
152 else
153 return avt;
154 }
156 /**
157 * bvt_alloc_task - allocate BVT private structures for a task
158 * @p: task to allocate private structures for
159 *
160 * Returns non-zero on failure.
161 */
162 int bvt_alloc_task(struct domain *d)
163 {
164 if ( (d->sched_priv = xmem_cache_alloc(dom_info_cache)) == NULL )
165 return -1;
166 memset(d->sched_priv, 0, sizeof(struct bvt_dom_info));
167 return 0;
168 }
170 /*
171 * Add and remove a domain
172 */
173 void bvt_add_task(struct domain *d)
174 {
175 struct bvt_dom_info *inf = BVT_INFO(d);
176 ASSERT(inf != NULL);
177 ASSERT(d != NULL);
179 inf->mcu_advance = MCU_ADVANCE;
180 inf->domain = d;
181 inf->warpback = 0;
182 /* Set some default values here. */
183 inf->warp = 0;
184 inf->warp_value = 0;
185 inf->warpl = MILLISECS(2000);
186 inf->warpu = MILLISECS(1000);
187 /* initialise the timers */
188 init_ac_timer(&inf->warp_timer);
189 inf->warp_timer.cpu = d->processor;
190 inf->warp_timer.data = (unsigned long)inf;
191 inf->warp_timer.function = &warp_timer_fn;
192 init_ac_timer(&inf->unwarp_timer);
193 inf->unwarp_timer.cpu = d->processor;
194 inf->unwarp_timer.data = (unsigned long)inf;
195 inf->unwarp_timer.function = &unwarp_timer_fn;
197 if ( d->domain == IDLE_DOMAIN_ID )
198 {
199 inf->avt = inf->evt = ~0U;
200 }
201 else
202 {
203 /* Set avt and evt to system virtual time. */
204 inf->avt = CPU_SVT(d->processor);
205 inf->evt = CPU_SVT(d->processor);
206 }
207 }
209 int bvt_init_idle_task(struct domain *p)
210 {
211 if ( bvt_alloc_task(p) < 0 )
212 return -1;
214 bvt_add_task(p);
216 set_bit(DF_RUNNING, &p->flags);
217 if ( !__task_on_runqueue(p) )
218 __add_to_runqueue_head(p);
220 return 0;
221 }
/*
 * Wake a domain: place it on the runqueue, bring its virtual times up to
 * date, and either preempt the current domain (if the waker now has the
 * lowest EVT) or shorten the current time slice accordingly.
 */
void bvt_wake(struct domain *d)
{
    struct bvt_dom_info *inf = BVT_INFO(d);
    struct domain       *curr;
    s_time_t            now, r_time;
    int                 cpu = d->processor;
    u32                 curr_evt;

    /* Already queued to run: nothing to do. */
    if ( unlikely(__task_on_runqueue(d)) )
        return;

    __add_to_runqueue_head(d);

    now = NOW();

    /* Set the BVT parameters. AVT should always be updated
       if CPU migration occurred. */
    if ( inf->avt < CPU_SVT(cpu) ||
         unlikely(test_bit(DF_MIGRATED, &d->flags)) )
        inf->avt = CPU_SVT(cpu);

    /* Deal with warping here. */
    inf->evt = calc_evt(d, inf->avt);

    curr = schedule_data[cpu].curr;
    curr_evt = calc_evt(curr, calc_avt(curr, now));
    /* Calculate the time the current domain would run assuming
       the second smallest evt is of the newly woken domain */
    r_time = curr->lastschd +
        ((inf->evt - curr_evt) / BVT_INFO(curr)->mcu_advance) +
        ctx_allow;

    /* Preempt immediately if the woken domain out-ranks the running one;
       otherwise pull the scheduler timer forward if it would fire too late. */
    if ( is_idle_task(curr) || (inf->evt <= curr_evt) )
        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
    else if ( schedule_data[cpu].s_timer.expires > r_time )
        mod_ac_timer(&schedule_data[cpu].s_timer, r_time);
}
262 static void bvt_sleep(struct domain *d)
263 {
264 if ( test_bit(DF_RUNNING, &d->flags) )
265 cpu_raise_softirq(d->processor, SCHEDULE_SOFTIRQ);
266 else if ( __task_on_runqueue(d) )
267 __del_from_runqueue(d);
268 }
270 /**
271 * bvt_free_task - free BVT private structures for a task
272 * @d: task
273 */
274 void bvt_free_task(struct domain *d)
275 {
276 ASSERT(d->sched_priv != NULL);
277 xmem_cache_free(dom_info_cache, d->sched_priv);
278 }
280 /* Control the scheduler. */
281 int bvt_ctl(struct sched_ctl_cmd *cmd)
282 {
283 struct bvt_ctl *params = &cmd->u.bvt;
285 if ( cmd->direction == SCHED_INFO_PUT )
286 ctx_allow = params->ctx_allow;
287 else
288 params->ctx_allow = ctx_allow;
290 return 0;
291 }
293 /* Adjust scheduling parameter for a given domain. */
294 int bvt_adjdom(
295 struct domain *d, struct sched_adjdom_cmd *cmd)
296 {
297 struct bvt_adjdom *params = &cmd->u.bvt;
299 if ( cmd->direction == SCHED_INFO_PUT )
300 {
301 u32 mcu_adv = params->mcu_adv;
302 u32 warpback = params->warpback;
303 s32 warpvalue = params->warpvalue;
304 s_time_t warpl = params->warpl;
305 s_time_t warpu = params->warpu;
307 struct bvt_dom_info *inf = BVT_INFO(d);
309 DPRINTK("Get domain %u bvt mcu_adv=%u, warpback=%d, warpvalue=%d, "
310 "warpl=%lld, warpu=%lld\n",
311 d->domain, inf->mcu_advance, inf->warpback, inf->warp_value,
312 inf->warpl, inf->warpu);
314 /* Sanity -- this can avoid divide-by-zero. */
315 if ( (mcu_adv == 0) || (warpl < 0) || (warpu < 0) )
316 return -EINVAL;
318 inf->mcu_advance = mcu_adv;
319 inf->warpback = warpback;
320 /* The warp should be the same as warpback */
321 inf->warp = warpback;
322 inf->warp_value = warpvalue;
323 inf->warpl = MILLISECS(warpl);
324 inf->warpu = MILLISECS(warpu);
326 /* If the unwarp timer set up it needs to be removed */
327 rem_ac_timer(&inf->unwarp_timer);
328 /* If we stop warping the warp timer needs to be removed */
329 if ( !warpback )
330 rem_ac_timer(&inf->warp_timer);
332 DPRINTK("Get domain %u bvt mcu_adv=%u, warpback=%d, warpvalue=%d, "
333 "warpl=%lld, warpu=%lld\n",
334 d->domain, inf->mcu_advance, inf->warpback, inf->warp_value,
335 inf->warpl, inf->warpu);
337 }
338 else if ( cmd->direction == SCHED_INFO_GET )
339 {
340 struct bvt_dom_info *inf = BVT_INFO(d);
341 params->mcu_adv = inf->mcu_advance;
342 params->warpvalue = inf->warp_value;
343 params->warpback = inf->warpback;
344 params->warpl = inf->warpl;
345 params->warpu = inf->warpu;
346 }
348 return 0;
349 }
/*
 * The main function
 * - deschedule the current domain.
 * - pick a new domain.
 *   i.e., the domain with lowest EVT.
 *   The runqueue should be ordered by EVT so that is easy.
 *
 * Returns the chosen task plus the wall-clock time it should run before
 * the scheduler is re-entered.
 */
static task_slice_t bvt_do_schedule(s_time_t now)
{
    struct domain *prev = current, *next = NULL, *next_prime, *p;
    struct list_head   *tmp;
    int                 cpu = prev->processor;
    s32                 r_time;     /* time for new dom to run */
    u32                 next_evt, next_prime_evt, min_avt;
    struct bvt_dom_info *prev_inf       = BVT_INFO(prev);
    struct bvt_dom_info *p_inf          = NULL;
    struct bvt_dom_info *next_inf       = NULL;
    struct bvt_dom_info *next_prime_inf = NULL;
    task_slice_t        ret;

    ASSERT(prev->sched_priv != NULL);
    ASSERT(prev_inf != NULL);
    ASSERT(__task_on_runqueue(prev));

    if ( likely(!is_idle_task(prev)) )
    {
        /* Charge 'prev' for the time it has just run, then requeue it
           at the tail (if still runnable) so equal-EVT peers rotate. */
        prev_inf->avt = calc_avt(prev, now);
        prev_inf->evt = calc_evt(prev, prev_inf->avt);

        /* 'prev' is being descheduled, so its warp-limit timer (armed
           below when a warped task is picked) is no longer needed. */
        if(prev_inf->warpback && prev_inf->warpl > 0)
            rem_ac_timer(&prev_inf->warp_timer);

        __del_from_runqueue(prev);

        if ( domain_runnable(prev) )
            __add_to_runqueue_tail(prev);
    }

    /* We should at least have the idle task */
    ASSERT(!list_empty(RUNQUEUE(cpu)));

    /*
     * scan through the run queue and pick the task with the lowest evt
     * *and* the task the second lowest evt.
     * this code is O(n) but we expect n to be small.
     */
    next_inf       = BVT_INFO(schedule_data[cpu].idle);
    next_prime_inf = NULL;

    next_evt       = ~0U;
    next_prime_evt = ~0U;
    min_avt        = ~0U;

    list_for_each ( tmp, RUNQUEUE(cpu) )
    {
        p_inf = list_entry(tmp, struct bvt_dom_info, run_list);

        if ( p_inf->evt < next_evt )
        {
            /* New best: old best becomes the runner-up. */
            next_prime_inf = next_inf;
            next_prime_evt = next_evt;
            next_inf       = p_inf;
            next_evt       = p_inf->evt;
        }
        else if ( next_prime_evt == ~0U )
        {
            /* First non-best entry seen: seed the runner-up slot. */
            next_prime_evt = p_inf->evt;
            next_prime_inf = p_inf;
        }
        else if ( p_inf->evt < next_prime_evt )
        {
            next_prime_evt = p_inf->evt;
            next_prime_inf = p_inf;
        }

        /* Determine system virtual time. */
        if ( p_inf->avt < min_avt )
            min_avt = p_inf->avt;
    }

    /* The chosen task is warped: bound its warped run with warp_timer. */
    if(next_inf->warp && next_inf->warpl > 0)
    {
        /* Set the timer up */
        next_inf->warp_timer.expires = now + next_inf->warpl;
        /* Add it to the heap */
        add_ac_timer(&next_inf->warp_timer);
    }

    /* Extract the domain pointers from the dom infos */
    next       = next_inf->domain;
    next_prime = next_prime_inf->domain;

    /* Update system virtual time. */
    if ( min_avt != ~0U )
        CPU_SVT(cpu) = min_avt;

    /* check for virtual time overrun on this cpu */
    if ( CPU_SVT(cpu) >= 0xf0000000 )
    {
        ASSERT(!local_irq_is_enabled());

        write_lock(&tasklist_lock);

        /* Rebase every local domain's virtual times downwards so the
           relative ordering (and hence scheduling) is unaffected. */
        for_each_domain ( p )
        {
            if ( p->processor == cpu )
            {
                p_inf = BVT_INFO(p);
                p_inf->evt -= 0xe0000000;
                p_inf->avt -= 0xe0000000;
            }
        }

        write_unlock(&tasklist_lock);

        CPU_SVT(cpu) -= 0xe0000000;
    }

    /* work out time for next run through scheduler */
    if ( is_idle_task(next) )
    {
        r_time = ctx_allow;
        goto sched_done;
    }

    if ( (next_prime == NULL) || is_idle_task(next_prime) )
    {
        /* We have only one runnable task besides the idle task. */
        r_time = 10 * ctx_allow; /* RN: random constant */
        goto sched_done;
    }

    /*
     * If we are here then we have two runnable tasks.
     * Work out how long 'next' can run till its evt is greater than
     * 'next_prime's evt. Take context switch allowance into account.
     */
    ASSERT(next_prime_inf->evt >= next_inf->evt);

    r_time = ((next_prime_inf->evt - next_inf->evt)/next_inf->mcu_advance)
        + ctx_allow;

    ASSERT(r_time >= ctx_allow);

 sched_done:
    ret.task = next;
    ret.time = r_time;
    return ret;
}
504 static void bvt_dump_runq_el(struct domain *p)
505 {
506 struct bvt_dom_info *inf = BVT_INFO(p);
508 printk("mcua=%d ev=0x%08X av=0x%08X ",
509 inf->mcu_advance, inf->evt, inf->avt);
510 }
/* Dump the global BVT tunables (MCU size and context-switch allowance). */
static void bvt_dump_settings(void)
{
    printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns ", (u32)MCU, (s32)ctx_allow );
}
/*
 * Dump CPU i's BVT state: its system virtual time, the runqueue head,
 * and every queued domain with its BVT parameters and list linkage.
 * For debugging only; output format is relied on by nothing.
 */
static void bvt_dump_cpu_state(int i)
{
    struct list_head *list, *queue;
    int loop = 0;
    struct bvt_dom_info *d_inf;
    struct domain *d;

    printk("svt=0x%08lX ", CPU_SVT(i));

    queue = RUNQUEUE(i);
    printk("QUEUE rq %lx   n: %lx, p: %lx\n",  (unsigned long)queue,
           (unsigned long) queue->next, (unsigned long) queue->prev);

    list_for_each ( list, queue )
    {
        d_inf = list_entry(list, struct bvt_dom_info, run_list);
        d = d_inf->domain;
        /* has=T means the domain is currently running on some CPU. */
        printk("%3d: %u has=%c ", loop++, d->domain,
               test_bit(DF_RUNNING, &d->flags) ? 'T':'F');
        bvt_dump_runq_el(d);
        printk("c=0x%X%08X\n", (u32)(d->cpu_time>>32), (u32)d->cpu_time);
        printk("         l: %lx n: %lx  p: %lx\n",
               (unsigned long)list, (unsigned long)list->next,
               (unsigned long)list->prev);
    }
}
544 /* Initialise the data structures. */
545 int bvt_init_scheduler()
546 {
547 int i;
549 for ( i = 0; i < NR_CPUS; i++ )
550 {
551 schedule_data[i].sched_priv = xmalloc(sizeof(struct bvt_cpu_info));
553 if ( schedule_data[i].sched_priv == NULL )
554 {
555 printk("Failed to allocate BVT scheduler per-CPU memory!\n");
556 return -1;
557 }
559 INIT_LIST_HEAD(RUNQUEUE(i));
561 CPU_SVT(i) = 0; /* XXX do I really need to do this? */
562 }
564 dom_info_cache = xmem_cache_create(
565 "BVT dom info", sizeof(struct bvt_dom_info), 0, 0, NULL, NULL);
566 if ( dom_info_cache == NULL )
567 {
568 printk("BVT: Failed to allocate domain info SLAB cache");
569 return -1;
570 }
572 return 0;
573 }
/* BVT scheduler ops table, registered with the generic scheduler layer. */
struct scheduler sched_bvt_def = {
    .name     = "Borrowed Virtual Time",
    .opt_name = "bvt",
    .sched_id = SCHED_BVT,

    .init_scheduler = bvt_init_scheduler,
    .init_idle_task = bvt_init_idle_task,
    .alloc_task     = bvt_alloc_task,
    .add_task       = bvt_add_task,
    .free_task      = bvt_free_task,
    .do_schedule    = bvt_do_schedule,
    .control        = bvt_ctl,
    .adjdom         = bvt_adjdom,
    .dump_settings  = bvt_dump_settings,
    .dump_cpu_state = bvt_dump_cpu_state,
    .sleep          = bvt_sleep,
    .wake           = bvt_wake,
};