debuggers.hg

view xen/common/schedule.c @ 2633:fe2f4bbcf869

bitkeeper revision 1.1159.99.4 (41626f06VquclgVVpIeHy9z2K3jW-A)

Rationalise scheduler locking. A bit more conservative now, but much
simpler! I only applied this to the basic BVT scheduler -- the others
are still unsafe and have been removed from the basic build.
author kaf24@freefall.cl.cam.ac.uk
date Tue Oct 05 09:53:10 2004 +0000 (2004-10-05)
parents aed97013f9fe
children ebe6012dace7
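
The locking rationalisation described in this changeset comes down to one rule in the code below: any access to a CPU's scheduler state is made under that CPU's schedule_data[cpu].schedule_lock, taken with interrupts disabled (see domain_sleep, domain_wake, sched_adjdom and __enter_scheduler). As a rough illustration of the shape of that rule -- using plain pthread mutexes and invented toy_* names rather than Xen's spinlocks and structures -- a user-space sketch might look like this:

    #include <pthread.h>
    #include <stdio.h>

    #define TOY_NR_CPUS 4

    struct toy_domain {
        int id;
        int cpu;        /* index of the CPU whose scheduler state owns this domain */
        int runnable;
    };

    struct toy_sched_data {
        pthread_mutex_t schedule_lock;   /* guards this CPU's scheduler state */
        struct toy_domain *curr;
    };

    static struct toy_sched_data toy_sched[TOY_NR_CPUS];

    /* Wake a domain: take the lock of the CPU that owns it, mutate state,
     * release.  This mirrors the shape of domain_wake() in the listing below. */
    static void toy_wake(struct toy_domain *d)
    {
        pthread_mutex_lock(&toy_sched[d->cpu].schedule_lock);
        if ( !d->runnable )
        {
            d->runnable = 1;
            printf("domain %d woken on cpu %d\n", d->id, d->cpu);
        }
        pthread_mutex_unlock(&toy_sched[d->cpu].schedule_lock);
    }

    int main(void)
    {
        struct toy_domain d = { .id = 1, .cpu = 2, .runnable = 0 };
        int i;

        for ( i = 0; i < TOY_NR_CPUS; i++ )
            pthread_mutex_init(&toy_sched[i].schedule_lock, NULL);

        toy_wake(&d);
        return 0;
    }

The real code additionally disables interrupts while the lock is held (spin_lock_irqsave / spin_lock_irq), which has no user-space analogue in this sketch.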
line source
/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
 ****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/ac_timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <hypervisor-ifs/sched_ctl.h>

/*#define WAKE_HISTO*/
/*#define BLOCKTIME_HISTO*/

#if defined(WAKE_HISTO)
#define BUCKETS 31
#elif defined(BLOCKTIME_HISTO)
#define BUCKETS 200
#endif

#define TIME_SLOP (s32)MICROSECS(50)     /* allow time to slip a bit */

/*
 * TODO MAW pull trace-related #defines out of here and into an auto-generated
 * header file later on!
 */
#define TRC_SCHED_DOM_ADD             0x00010000
#define TRC_SCHED_DOM_REM             0x00010001
#define TRC_SCHED_WAKE                0x00010002
#define TRC_SCHED_BLOCK               0x00010003
#define TRC_SCHED_YIELD               0x00010004
#define TRC_SCHED_SET_TIMER           0x00010005
#define TRC_SCHED_CTL                 0x00010006
#define TRC_SCHED_ADJDOM              0x00010007
#define TRC_SCHED_RESCHED             0x00010008
#define TRC_SCHED_SWITCH              0x00010009
#define TRC_SCHED_S_TIMER_FN          0x0001000A
#define TRC_SCHED_T_TIMER_FN          0x0001000B
#define TRC_SCHED_DOM_TIMER_FN        0x0001000C
#define TRC_SCHED_FALLBACK_TIMER_FN   0x0001000D

/* Various timer handlers. */
static void s_timer_fn(unsigned long unused);
static void t_timer_fn(unsigned long unused);
static void dom_timer_fn(unsigned long data);
static void fallback_timer_fn(unsigned long unused);

/* This is global for now so that private implementations can reach it. */
schedule_data_t schedule_data[NR_CPUS];

extern struct scheduler sched_bvt_def;
extern struct scheduler sched_fbvt_def;
extern struct scheduler sched_rrobin_def;
extern struct scheduler sched_atropos_def;
static struct scheduler *schedulers[] = {
    &sched_bvt_def,
#ifdef BROKEN_SCHEDULERS
    &sched_fbvt_def,
    &sched_rrobin_def,
    &sched_atropos_def,
#endif
    NULL
};

/* Operations for the current scheduler. */
static struct scheduler ops;

#define SCHED_OP(fn, ...)                                 \
        (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )       \
         : (typeof(ops.fn(__VA_ARGS__)))0 )
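/*
 * Illustrative note (not part of the original source): for example,
 * SCHED_OP(wake, d) expands to
 *     (( ops.wake != NULL ) ? ops.wake( d ) : (typeof(ops.wake(d)))0 ),
 * so a scheduler that leaves a hook unset simply yields a zero of the
 * hook's return type instead of a call through a NULL pointer.
 */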
/* Per-CPU periodic timer sends an event to the currently-executing domain. */
static struct ac_timer t_timer[NR_CPUS];

/*
 * Per-CPU timer which ensures that even guests with very long quanta get
 * their time-of-day state updated often enough to avoid wrapping.
 */
static struct ac_timer fallback_timer[NR_CPUS];

extern xmem_cache_t *domain_struct_cachep;

void free_domain_struct(struct domain *d)
{
    SCHED_OP(free_task, d);
    xmem_cache_free(domain_struct_cachep, d);
}

struct domain *alloc_domain_struct(void)
{
    struct domain *d;

    if ( (d = xmem_cache_alloc(domain_struct_cachep)) == NULL )
        return NULL;

    memset(d, 0, sizeof(*d));

    if ( SCHED_OP(alloc_task, d) < 0 )
    {
        xmem_cache_free(domain_struct_cachep, d);
        return NULL;
    }

    return d;
}

/*
 * Add and remove a domain.
 */
void sched_add_domain(struct domain *d)
{
    /* Must be unpaused by control software to start execution. */
    set_bit(DF_CTRLPAUSE, &d->flags);

    if ( d->domain != IDLE_DOMAIN_ID )
    {
        /* Initialise the per-domain timer. */
        init_ac_timer(&d->timer);
        d->timer.cpu      = d->processor;
        d->timer.data     = (unsigned long)d;
        d->timer.function = &dom_timer_fn;
    }
    else
    {
        schedule_data[d->processor].idle = d;
    }

    SCHED_OP(add_task, d);

    TRACE_2D(TRC_SCHED_DOM_ADD, d->domain, d);
}

void sched_rem_domain(struct domain *d)
{
    rem_ac_timer(&d->timer);
    SCHED_OP(rem_task, d);
    TRACE_2D(TRC_SCHED_DOM_REM, d->domain, d);
}

void init_idle_task(void)
{
    if ( SCHED_OP(init_idle_task, current) < 0 )
        BUG();
}

void domain_sleep(struct domain *d)
{
    unsigned long flags;

    spin_lock_irqsave(&schedule_data[d->processor].schedule_lock, flags);

    if ( likely(!domain_runnable(d)) )
        SCHED_OP(sleep, d);

    spin_unlock_irqrestore(&schedule_data[d->processor].schedule_lock, flags);

    /* Synchronous. */
    while ( test_bit(DF_RUNNING, &d->flags) && !domain_runnable(d) )
    {
        smp_mb();
        cpu_relax();
    }
}

void domain_wake(struct domain *d)
{
    unsigned long flags;

    spin_lock_irqsave(&schedule_data[d->processor].schedule_lock, flags);

    if ( likely(domain_runnable(d)) )
    {
        TRACE_2D(TRC_SCHED_WAKE, d->domain, d);
        SCHED_OP(wake, d);
#ifdef WAKE_HISTO
        d->wokenup = NOW();
#endif
    }

    clear_bit(DF_MIGRATED, &d->flags);

    spin_unlock_irqrestore(&schedule_data[d->processor].schedule_lock, flags);
}

/* Block the currently-executing domain until a pertinent event occurs. */
long do_block(void)
{
    ASSERT(current->domain != IDLE_DOMAIN_ID);
    current->shared_info->vcpu_data[0].evtchn_upcall_mask = 0;
    set_bit(DF_BLOCKED, &current->flags);
    TRACE_2D(TRC_SCHED_BLOCK, current->domain, current);
    __enter_scheduler();
    return 0;
}

/* Voluntarily yield the processor for this allocation. */
static long do_yield(void)
{
    TRACE_2D(TRC_SCHED_YIELD, current->domain, current);
    __enter_scheduler();
    return 0;
}

/*
 * Demultiplex scheduler-related hypercalls.
 */
long do_sched_op(unsigned long op)
{
    long ret = 0;

    switch ( op & SCHEDOP_cmdmask )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        domain_shutdown((u8)(op >> SCHEDOP_reasonshift));
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

/* Per-domain one-shot-timer hypercall. */
long do_set_timer_op(unsigned long timeout_hi, unsigned long timeout_lo)
{
    struct domain *p = current;

    rem_ac_timer(&p->timer);

    if ( (timeout_hi != 0) || (timeout_lo != 0) )
    {
        p->timer.expires = ((s_time_t)timeout_hi<<32) | ((s_time_t)timeout_lo);
        add_ac_timer(&p->timer);
    }

    TRACE_4D(TRC_SCHED_SET_TIMER, p->domain, p, timeout_hi, timeout_lo);

    return 0;
}
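/*
 * Illustrative note (not part of the original source): a caller holding a
 * 64-bit deadline t (nanoseconds of system time) would split it as
 *     timeout_hi = (unsigned long)(t >> 32);
 *     timeout_lo = (unsigned long)(t & 0xFFFFFFFFUL);
 * and do_set_timer_op() recombines the two halves into a single s_time_t
 * as shown above.
 */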
/** sched_id - fetch ID of current scheduler */
int sched_id()
{
    return ops.sched_id;
}

long sched_ctl(struct sched_ctl_cmd *cmd)
{
    TRACE_0D(TRC_SCHED_CTL);

    if ( cmd->sched_id != ops.sched_id )
        return -EINVAL;

    return SCHED_OP(control, cmd);
}


/* Adjust scheduling parameter for a given domain. */
long sched_adjdom(struct sched_adjdom_cmd *cmd)
{
    struct domain *d;

    if ( cmd->sched_id != ops.sched_id )
        return -EINVAL;

    if ( cmd->direction != SCHED_INFO_PUT && cmd->direction != SCHED_INFO_GET )
        return -EINVAL;

    d = find_domain_by_id(cmd->domain);
    if ( d == NULL )
        return -ESRCH;

    TRACE_1D(TRC_SCHED_ADJDOM, d->domain);

    spin_lock_irq(&schedule_data[d->processor].schedule_lock);
    SCHED_OP(adjdom, d, cmd);
    spin_unlock_irq(&schedule_data[d->processor].schedule_lock);

    put_domain(d);
    return 0;
}

/*
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 */
void __enter_scheduler(void)
{
    struct domain *prev = current, *next = NULL;
    int                 cpu = prev->processor;
    s_time_t            now;
    task_slice_t        next_slice;
    s32                 r_time;     /* time for new dom to run */

    perfc_incrc(sched_run);

    spin_lock_irq(&schedule_data[cpu].schedule_lock);

    now = NOW();

    rem_ac_timer(&schedule_data[cpu].s_timer);

    ASSERT(!in_irq());

    if ( test_bit(DF_BLOCKED, &prev->flags) )
    {
        /* This check is needed to avoid a race condition. */
        if ( event_pending(prev) )
            clear_bit(DF_BLOCKED, &prev->flags);
        else
            SCHED_OP(do_block, prev);
    }

    prev->cpu_time += now - prev->lastschd;

    /* Get the policy-specific decision on scheduling... */
    next_slice = ops.do_schedule(now);

    r_time = next_slice.time;
    next   = next_slice.task;

    schedule_data[cpu].curr = next;

    next->lastschd = now;

    /* Reprogram the timer. */
    schedule_data[cpu].s_timer.expires = now + r_time;
    add_ac_timer(&schedule_data[cpu].s_timer);

    /* Must be protected by the schedule_lock! */
    set_bit(DF_RUNNING, &next->flags);

    spin_unlock_irq(&schedule_data[cpu].schedule_lock);

    /* Ensure that the domain has an up-to-date time base. */
    if ( !is_idle_task(next) )
        update_dom_time(next->shared_info);

    if ( unlikely(prev == next) )
        return;

    cleanup_writable_pagetable(
        prev, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);

    perfc_incrc(sched_ctx);

#if defined(WAKE_HISTO)
    if ( !is_idle_task(next) && next->wokenup )
    {
        ulong diff = (ulong)(now - next->wokenup);
        diff /= (ulong)MILLISECS(1);
        if ( diff <= BUCKETS-2 )  schedule_data[cpu].hist[diff]++;
        else                      schedule_data[cpu].hist[BUCKETS-1]++;
    }
    next->wokenup = (s_time_t)0;
#elif defined(BLOCKTIME_HISTO)
    prev->lastdeschd = now;
    if ( !is_idle_task(next) )
    {
        ulong diff = (ulong)((now - next->lastdeschd) / MILLISECS(10));
        if ( diff <= BUCKETS-2 )  schedule_data[cpu].hist[diff]++;
        else                      schedule_data[cpu].hist[BUCKETS-1]++;
    }
#endif

    TRACE_2D(TRC_SCHED_SWITCH, next->domain, next);

    switch_to(prev, next);

    /*
     * We do this late on because it doesn't need to be protected by the
     * schedule_lock, and because we want this to be the very last use of
     * 'prev' (after this point, a dying domain's info structure may be freed
     * without warning).
     */
    clear_bit(DF_RUNNING, &prev->flags);

    /* Mark a timer event for the newly-scheduled domain. */
    if ( !is_idle_task(next) )
        send_guest_virq(next, VIRQ_TIMER);

    schedule_tail(next);

    BUG();
}

/* No locking needed -- pointer comparison is safe :-) */
int idle_cpu(int cpu)
{
    struct domain *p = schedule_data[cpu].curr;
    return p == idle_task[cpu];
}


/****************************************************************************
 * Timers: the scheduler utilises a number of timers
 *  - s_timer: per-CPU timer for preemption and scheduling decisions
 *  - t_timer: per-CPU periodic timer to send a timer interrupt to current dom
 *  - dom_timer: per-domain timer to specify timeout values
 *  - fallback_timer: safeguard to ensure time is up to date
 ****************************************************************************/

/* The scheduler timer: force a run through the scheduler. */
static void s_timer_fn(unsigned long unused)
{
    TRACE_0D(TRC_SCHED_S_TIMER_FN);
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incrc(sched_irq);
}

/* Periodic tick timer: send a timer event to the current domain. */
static void t_timer_fn(unsigned long unused)
{
    struct domain *p = current;

    TRACE_0D(TRC_SCHED_T_TIMER_FN);

    if ( !is_idle_task(p) )
    {
        update_dom_time(p->shared_info);
        send_guest_virq(p, VIRQ_TIMER);
    }

    t_timer[p->processor].expires = NOW() + MILLISECS(10);
    add_ac_timer(&t_timer[p->processor]);
}

/* Domain timer function: sends a virtual timer interrupt to the domain. */
static void dom_timer_fn(unsigned long data)
{
    struct domain *p = (struct domain *)data;
    TRACE_0D(TRC_SCHED_DOM_TIMER_FN);
    update_dom_time(p->shared_info);
    send_guest_virq(p, VIRQ_TIMER);
}


/* Fallback timer to ensure guests get time updated 'often enough'. */
static void fallback_timer_fn(unsigned long unused)
{
    struct domain *p = current;

    TRACE_0D(TRC_SCHED_FALLBACK_TIMER_FN);

    if ( !is_idle_task(p) )
        update_dom_time(p->shared_info);

    fallback_timer[p->processor].expires = NOW() + MILLISECS(500);
    add_ac_timer(&fallback_timer[p->processor]);
}

/* Initialise the data structures. */
void __init scheduler_init(void)
{
    int i;

    open_softirq(SCHEDULE_SOFTIRQ, __enter_scheduler);

    for ( i = 0; i < NR_CPUS; i++ )
    {
        spin_lock_init(&schedule_data[i].schedule_lock);
        schedule_data[i].curr = &idle0_task;

        init_ac_timer(&schedule_data[i].s_timer);
        schedule_data[i].s_timer.cpu      = i;
        schedule_data[i].s_timer.data     = 2;
        schedule_data[i].s_timer.function = &s_timer_fn;

        init_ac_timer(&t_timer[i]);
        t_timer[i].cpu      = i;
        t_timer[i].data     = 3;
        t_timer[i].function = &t_timer_fn;

        init_ac_timer(&fallback_timer[i]);
        fallback_timer[i].cpu      = i;
        fallback_timer[i].data     = 4;
        fallback_timer[i].function = &fallback_timer_fn;
    }

    schedule_data[0].idle = &idle0_task;

    extern char opt_sched[];

    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    if ( schedulers[i] == NULL )
        printk("Could not find scheduler: %s\n", opt_sched);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);

    if ( SCHED_OP(init_scheduler) < 0 )
        panic("Initialising scheduler failed!");
}

/*
 * Start a scheduler for each CPU.
 * This has to be done *after* the timers, e.g., APICs, have been initialised.
 */
void schedulers_start(void)
{
    s_timer_fn(0);
    smp_call_function((void *)s_timer_fn, NULL, 1, 1);

    t_timer_fn(0);
    smp_call_function((void *)t_timer_fn, NULL, 1, 1);

    fallback_timer_fn(0);
    smp_call_function((void *)fallback_timer_fn, NULL, 1, 1);
}


void dump_runq(u_char key, void *dev_id, struct pt_regs *regs)
{
    s_time_t      now = NOW();
    int           i;
    unsigned long flags;

    local_irq_save(flags);

    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(dump_settings);
    printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);

    for ( i = 0; i < smp_num_cpus; i++ )
    {
        spin_lock(&schedule_data[i].schedule_lock);
        printk("CPU[%02d] ", i);
        SCHED_OP(dump_cpu_state, i);
        spin_unlock(&schedule_data[i].schedule_lock);
    }

    local_irq_restore(flags);
}

#if defined(WAKE_HISTO) || defined(BLOCKTIME_HISTO)
void print_sched_histo(u_char key, void *dev_id, struct pt_regs *regs)
{
    int i, j, k;
    for ( k = 0; k < smp_num_cpus; k++ )
    {
        j = 0;
        printf("CPU[%02d]: scheduler latency histogram (ms:[count])\n", k);
        for ( i = 0; i < BUCKETS; i++ )
        {
            if ( schedule_data[k].hist[i] != 0 )
            {
                if ( i < BUCKETS-1 )
                    printk("%2d:[%7u]    ", i, schedule_data[k].hist[i]);
                else
                    printk(" >:[%7u]    ", schedule_data[k].hist[i]);
                if ( !(++j % 5) )
                    printk("\n");
            }
        }
        printk("\n");
    }
}

void reset_sched_histo(u_char key, void *dev_id, struct pt_regs *regs)
{
    int i, j;
    for ( j = 0; j < smp_num_cpus; j++ )
        for ( i = 0; i < BUCKETS; i++ )
            schedule_data[j].hist[i] = 0;
}
#else
void print_sched_histo(u_char key, void *dev_id, struct pt_regs *regs)
{
}
void reset_sched_histo(u_char key, void *dev_id, struct pt_regs *regs)
{
}
#endif
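
do_sched_op() above treats its single argument as a command in the low bits (op & SCHEDOP_cmdmask) plus, for SCHEDOP_shutdown, a reason code recovered by op >> SCHEDOP_reasonshift. A rough sketch of how a caller might pack such an argument follows; the constant values below are illustrative placeholders, not the real definitions from the public Xen interface headers.

    #include <stdio.h>

    /* Placeholder values for illustration only; the real constants are
     * defined in the public Xen interface headers, not in schedule.c. */
    #define SCHEDOP_yield        0
    #define SCHEDOP_block        1
    #define SCHEDOP_shutdown     2
    #define SCHEDOP_cmdmask      0xff
    #define SCHEDOP_reasonshift  8

    /* Pack a shutdown request with a reason byte into the single 'op'
     * word that do_sched_op() would decode. */
    static unsigned long make_shutdown_op(unsigned char reason)
    {
        return SCHEDOP_shutdown |
               ((unsigned long)reason << SCHEDOP_reasonshift);
    }

    int main(void)
    {
        unsigned long op = make_shutdown_op(1);   /* hypothetical reason code */

        printf("cmd=%lu reason=%lu\n",
               op & SCHEDOP_cmdmask, op >> SCHEDOP_reasonshift);
        return 0;
    }

With these placeholder values the program prints "cmd=2 reason=1", mirroring the decode path taken by the SCHEDOP_shutdown case in do_sched_op().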