debuggers.hg

view xen/common/schedule.c @ 3349:c754bd0be650

bitkeeper revision 1.1159.1.496 (41c85faeMBUejFtICiJueb_Xdh8yJA)

Priv-op emulation in Xen, for RDMSR/WRMSR/WBINVD. Cleaned up Linux a bit as a result.

author   kaf24@scramble.cl.cam.ac.uk
date     Tue Dec 21 17:38:54 2004 +0000 (2004-12-21)
parents  a778ae82fcb3
children b2fa96909734
/* -*-  Mode:C; c-basic-offset:4; tab-width:4 -*-
 ****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/ac_timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <public/sched_ctl.h>

/*#define WAKE_HISTO*/
/*#define BLOCKTIME_HISTO*/
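
/*
 * Optional instrumentation: WAKE_HISTO records the latency from wake-up to
 * first run (in 1ms buckets), BLOCKTIME_HISTO records how long a domain
 * stays descheduled (in 10ms buckets). BUCKETS sizes the per-CPU histogram
 * printed by print_sched_histo() at the bottom of this file.
 */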
#if defined(WAKE_HISTO)
#define BUCKETS 31
#elif defined(BLOCKTIME_HISTO)
#define BUCKETS 200
#endif
#define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */

/*
 * TODO MAW pull trace-related #defines out of here and into an auto-generated
 * header file later on!
 */
#define TRC_SCHED_DOM_ADD             0x00010000
#define TRC_SCHED_DOM_REM             0x00010001
#define TRC_SCHED_WAKE                0x00010002
#define TRC_SCHED_BLOCK               0x00010003
#define TRC_SCHED_YIELD               0x00010004
#define TRC_SCHED_SET_TIMER           0x00010005
#define TRC_SCHED_CTL                 0x00010006
#define TRC_SCHED_ADJDOM              0x00010007
#define TRC_SCHED_RESCHED             0x00010008
#define TRC_SCHED_SWITCH              0x00010009
#define TRC_SCHED_S_TIMER_FN          0x0001000A
#define TRC_SCHED_T_TIMER_FN          0x0001000B
#define TRC_SCHED_DOM_TIMER_FN        0x0001000C

/* Various timer handlers. */
static void s_timer_fn(unsigned long unused);
static void t_timer_fn(unsigned long unused);
static void dom_timer_fn(unsigned long data);

/* This is global for now so that private implementations can reach it */
schedule_data_t schedule_data[NR_CPUS];

extern struct scheduler sched_bvt_def;
// extern struct scheduler sched_rrobin_def;
// extern struct scheduler sched_atropos_def;
static struct scheduler *schedulers[] = {
    &sched_bvt_def,
//  &sched_rrobin_def,
//  &sched_atropos_def,
    NULL
};
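
/* scheduler_init() scans this NULL-terminated list for the entry whose
 * opt_name matches the opt_sched boot parameter; BVT is currently the only
 * scheduler compiled in. */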
/* Operations for the current scheduler. */
static struct scheduler ops;

#define SCHED_OP(fn, ...)                                 \
         (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )      \
          : (typeof(ops.fn(__VA_ARGS__)))0 )
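
/* For example, SCHED_OP(wake, ed) calls ops.wake(ed) if the active scheduler
 * provides a wake hook, and otherwise evaluates to a zero of the hook's
 * return type, so optional hooks can be invoked without explicit NULL checks. */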
/* Per-CPU periodic timer sends an event to the currently-executing domain. */
static struct ac_timer t_timer[NR_CPUS];

extern xmem_cache_t *domain_struct_cachep;
extern xmem_cache_t *exec_domain_struct_cachep;
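
/* Free a domain: let the scheduler release its private per-task state, then
 * return each execution context and the domain structure to their caches. */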
void free_domain_struct(struct domain *d)
{
    struct exec_domain *ed;

    SCHED_OP(free_task, d);
    for_each_exec_domain(d, ed)
        xmem_cache_free(exec_domain_struct_cachep, ed);
    xmem_cache_free(domain_struct_cachep, d);
}
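
/* Allocate and initialise execution context (VCPU) @vcpu of domain @d,
 * linking it into the domain's eid-ordered list of execution contexts. */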
struct exec_domain *alloc_exec_domain_struct(struct domain *d,
                                             unsigned long vcpu)
{
    struct exec_domain *ed, *edc;

    ASSERT( d->exec_domain[vcpu] == NULL );

    if ( (ed = xmem_cache_alloc(exec_domain_struct_cachep)) == NULL )
        return NULL;

    memset(ed, 0, sizeof(*ed));

    d->exec_domain[vcpu] = ed;
    ed->domain = d;
    ed->eid = vcpu;

    if ( SCHED_OP(alloc_task, ed) < 0 )
        goto out;

    if (vcpu != 0) {
        ed->vcpu_info = &d->shared_info->vcpu_data[ed->eid];

        for_each_exec_domain(d, edc) {
            if (edc->ed_next_list == NULL || edc->ed_next_list->eid > vcpu)
                break;
        }
        ed->ed_next_list = edc->ed_next_list;
        edc->ed_next_list = ed;

        if (test_bit(EDF_CPUPINNED, &edc->ed_flags)) {
            ed->processor = (edc->processor + 1) % smp_num_cpus;
            set_bit(EDF_CPUPINNED, &ed->ed_flags);
        } else {
            ed->processor = (edc->processor + 1) % smp_num_cpus;  /* XXX */
        }
    }

    return ed;

 out:
    d->exec_domain[vcpu] = NULL;
    xmem_cache_free(exec_domain_struct_cachep, ed);

    return NULL;
}
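
/* Allocate a zeroed domain structure together with its initial execution
 * context (VCPU 0). */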
struct domain *alloc_domain_struct(void)
{
    struct domain *d;

    if ( (d = xmem_cache_alloc(domain_struct_cachep)) == NULL )
        return NULL;

    memset(d, 0, sizeof(*d));

    if ( alloc_exec_domain_struct(d, 0) == NULL )
        goto out;

    return d;

 out:
    xmem_cache_free(domain_struct_cachep, d);
    return NULL;
}

/*
 * Add and remove a domain
 */
void sched_add_domain(struct exec_domain *ed)
{
    struct domain *d = ed->domain;

    /* Must be unpaused by control software to start execution. */
    set_bit(EDF_CTRLPAUSE, &ed->ed_flags);

    if ( d->id != IDLE_DOMAIN_ID )
    {
        /* Initialise the per-domain timer. */
        init_ac_timer(&ed->timer);
        ed->timer.cpu      = ed->processor;
        ed->timer.data     = (unsigned long)ed;
        ed->timer.function = &dom_timer_fn;
    }
    else
    {
        schedule_data[ed->processor].idle = ed;
    }

    SCHED_OP(add_task, ed);

    TRACE_2D(TRC_SCHED_DOM_ADD, d->id, ed);
}

void sched_rem_domain(struct exec_domain *ed)
{
    rem_ac_timer(&ed->timer);
    SCHED_OP(rem_task, ed);
    TRACE_3D(TRC_SCHED_DOM_REM, ed->domain->id, ed->eid, ed);
}
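
/* Register the calling CPU's idle task (current) with the scheduler;
 * failure is fatal. */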
void init_idle_task(void)
{
    if ( SCHED_OP(init_idle_task, current) < 0 )
        BUG();
}
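
/* Deschedule an execution context and wait until it is no longer running on
 * any CPU (the wait ends early if the context becomes runnable again). */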
void domain_sleep(struct exec_domain *d)
{
    unsigned long flags;

    spin_lock_irqsave(&schedule_data[d->processor].schedule_lock, flags);

    if ( likely(!domain_runnable(d)) )
        SCHED_OP(sleep, d);

    spin_unlock_irqrestore(&schedule_data[d->processor].schedule_lock, flags);

    /* Synchronous. */
    while ( test_bit(EDF_RUNNING, &d->ed_flags) && !domain_runnable(d) )
    {
        smp_mb();
        cpu_relax();
    }
}
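
/* Put a newly-runnable execution context back on the scheduler's run queues. */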
void domain_wake(struct exec_domain *ed)
{
    unsigned long flags;

    spin_lock_irqsave(&schedule_data[ed->processor].schedule_lock, flags);

    if ( likely(domain_runnable(ed)) )
    {
        TRACE_2D(TRC_SCHED_WAKE, ed->domain->id, ed);
        SCHED_OP(wake, ed);
#ifdef WAKE_HISTO
        ed->wokenup = NOW();
#endif
    }

    clear_bit(EDF_MIGRATED, &ed->ed_flags);

    spin_unlock_irqrestore(&schedule_data[ed->processor].schedule_lock, flags);
}

/* Block the currently-executing domain until a pertinent event occurs. */
long do_block(void)
{
    ASSERT(current->domain->id != IDLE_DOMAIN_ID);
    current->vcpu_info->evtchn_upcall_mask = 0;
    set_bit(EDF_BLOCKED, &current->ed_flags);
    TRACE_2D(TRC_SCHED_BLOCK, current->domain->id, current);
    __enter_scheduler();
    return 0;
}

/* Voluntarily yield the processor for this allocation. */
static long do_yield(void)
{
    TRACE_2D(TRC_SCHED_YIELD, current->domain->id, current);
    __enter_scheduler();
    return 0;
}

/*
 * Demultiplex scheduler-related hypercalls.
 */
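/* The low bits of @op select the sub-command (SCHEDOP_cmdmask); for
 * SCHEDOP_shutdown the shutdown reason code is carried in the bits above
 * SCHEDOP_reasonshift. */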
long do_sched_op(unsigned long op)
{
    long ret = 0;

    switch ( op & SCHEDOP_cmdmask )
    {

    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        domain_shutdown((u8)(op >> SCHEDOP_reasonshift));
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

/* Per-domain one-shot-timer hypercall. */
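/* The 64-bit expiry (Xen system time, kept in nanoseconds) is passed as two
 * 32-bit halves; a timeout of zero simply cancels any pending domain timer. */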
long do_set_timer_op(unsigned long timeout_hi, unsigned long timeout_lo)
{
    struct exec_domain *p = current;

    rem_ac_timer(&p->timer);

    if ( (timeout_hi != 0) || (timeout_lo != 0) )
    {
        p->timer.expires = ((s_time_t)timeout_hi<<32) | ((s_time_t)timeout_lo);
        add_ac_timer(&p->timer);
    }

    TRACE_5D(TRC_SCHED_SET_TIMER, p->domain->id, p->eid, p, timeout_hi,
             timeout_lo);

    return 0;
}

/** sched_id - fetch ID of current scheduler */
int sched_id()
{
    return ops.sched_id;
}
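
/* Global scheduler control: forward a control command to the active
 * scheduler, provided the caller named the scheduler currently in use. */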
long sched_ctl(struct sched_ctl_cmd *cmd)
{
    TRACE_0D(TRC_SCHED_CTL);

    if ( cmd->sched_id != ops.sched_id )
        return -EINVAL;

    return SCHED_OP(control, cmd);
}


/* Adjust scheduling parameter for a given domain. */
long sched_adjdom(struct sched_adjdom_cmd *cmd)
{
    struct domain *d;

    if ( cmd->sched_id != ops.sched_id )
        return -EINVAL;

    if ( cmd->direction != SCHED_INFO_PUT && cmd->direction != SCHED_INFO_GET )
        return -EINVAL;

    d = find_domain_by_id(cmd->domain);
    if ( d == NULL )
        return -ESRCH;

    TRACE_1D(TRC_SCHED_ADJDOM, d->id);

    spin_lock_irq(&schedule_data[d->exec_domain[0]->processor].schedule_lock);
    SCHED_OP(adjdom, d, cmd);
    spin_unlock_irq(&schedule_data[d->exec_domain[0]->processor].schedule_lock);

    put_domain(d);
    return 0;
}

/*
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 */
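/* Runs as the SCHEDULE_SOFTIRQ handler (see scheduler_init()) and is also
 * entered directly from do_block() and do_yield() above. */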
void __enter_scheduler(void)
{
    struct exec_domain *prev = current, *next = NULL;
    int                 cpu = prev->processor;
    s_time_t            now;
    task_slice_t        next_slice;
    s32                 r_time;     /* time for new dom to run */

    if ( !is_idle_task(current->domain) )
    {
        LOCK_BIGLOCK(current->domain);
        cleanup_writable_pagetable(
            prev->domain, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
        UNLOCK_BIGLOCK(current->domain);
    }

    perfc_incrc(sched_run);

    spin_lock_irq(&schedule_data[cpu].schedule_lock);

    now = NOW();

    rem_ac_timer(&schedule_data[cpu].s_timer);

    ASSERT(!in_irq());

    if ( test_bit(EDF_BLOCKED, &prev->ed_flags) )
    {
        /* This check is needed to avoid a race condition. */
        if ( event_pending(prev) )
            clear_bit(EDF_BLOCKED, &prev->ed_flags);
        else
            SCHED_OP(do_block, prev);
    }

    prev->cpu_time += now - prev->lastschd;

    /* get policy-specific decision on scheduling... */
    next_slice = ops.do_schedule(now);

    r_time = next_slice.time;
    next = next_slice.task;

    schedule_data[cpu].curr = next;

    next->lastschd = now;

    /* reprogram the timer */
    schedule_data[cpu].s_timer.expires  = now + r_time;
    add_ac_timer(&schedule_data[cpu].s_timer);

    /* Must be protected by the schedule_lock! */
    set_bit(EDF_RUNNING, &next->ed_flags);

    spin_unlock_irq(&schedule_data[cpu].schedule_lock);

    /* Ensure that the domain has an up-to-date time base. */
    if ( !is_idle_task(next->domain) )
        update_dom_time(next->domain);

    if ( unlikely(prev == next) )
        return;

    perfc_incrc(sched_ctx);

#if defined(WAKE_HISTO)
    if ( !is_idle_task(next) && next->wokenup ) {
        ulong diff = (ulong)(now - next->wokenup);
        diff /= (ulong)MILLISECS(1);
        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
        else                    schedule_data[cpu].hist[BUCKETS-1]++;
    }
    next->wokenup = (s_time_t)0;
#elif defined(BLOCKTIME_HISTO)
    prev->lastdeschd = now;
    if ( !is_idle_task(next) )
    {
        ulong diff = (ulong)((now - next->lastdeschd) / MILLISECS(10));
        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
        else                    schedule_data[cpu].hist[BUCKETS-1]++;
    }
#endif

    TRACE_2D(TRC_SCHED_SWITCH, next->domain->id, next);

    switch_to(prev, next);

    /*
     * We do this late on because it doesn't need to be protected by the
     * schedule_lock, and because we want this to be the very last use of
     * 'prev' (after this point, a dying domain's info structure may be freed
     * without warning).
     */
    clear_bit(EDF_RUNNING, &prev->ed_flags);

    /* Mark a timer event for the newly-scheduled domain. */
    if ( !is_idle_task(next->domain) )
        send_guest_virq(next, VIRQ_TIMER);

    schedule_tail(next);

    BUG();
}
/* No locking needed -- pointer comparison is safe :-) */
int idle_cpu(int cpu)
{
    struct exec_domain *p = schedule_data[cpu].curr;
    return p == idle_task[cpu];
}


/****************************************************************************
 * Timers: the scheduler utilises a number of timers
 * - s_timer: per CPU timer for preemption and scheduling decisions
 * - t_timer: per CPU periodic timer to send timer interrupt to current dom
 * - dom_timer: per domain timer to specify timeout values
 ****************************************************************************/
/* The scheduler timer: force a run through the scheduler */
static void s_timer_fn(unsigned long unused)
{
    TRACE_0D(TRC_SCHED_S_TIMER_FN);
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incrc(sched_irq);
}

/* Periodic tick timer: send timer event to current domain */
static void t_timer_fn(unsigned long unused)
{
    struct exec_domain *ed = current;

    TRACE_0D(TRC_SCHED_T_TIMER_FN);

    if ( !is_idle_task(ed->domain) )
    {
        update_dom_time(ed->domain);
        send_guest_virq(ed, VIRQ_TIMER);
    }

    t_timer[ed->processor].expires = NOW() + MILLISECS(10);
    add_ac_timer(&t_timer[ed->processor]);
}

/* Domain timer function, sends a virtual timer interrupt to domain */
static void dom_timer_fn(unsigned long data)
{
    struct exec_domain *ed = (struct exec_domain *)data;

    TRACE_0D(TRC_SCHED_DOM_TIMER_FN);
    update_dom_time(ed->domain);
    send_guest_virq(ed, VIRQ_TIMER);
}
/* Initialise the data structures. */
void __init scheduler_init(void)
{
    int i;

    open_softirq(SCHEDULE_SOFTIRQ, __enter_scheduler);

    for ( i = 0; i < NR_CPUS; i++ )
    {
        spin_lock_init(&schedule_data[i].schedule_lock);
        schedule_data[i].curr = &idle0_exec_domain;

        init_ac_timer(&schedule_data[i].s_timer);
        schedule_data[i].s_timer.cpu      = i;
        schedule_data[i].s_timer.data     = 2;
        schedule_data[i].s_timer.function = &s_timer_fn;

        init_ac_timer(&t_timer[i]);
        t_timer[i].cpu      = i;
        t_timer[i].data     = 3;
        t_timer[i].function = &t_timer_fn;
    }

    schedule_data[0].idle = &idle0_exec_domain;

    extern char opt_sched[];

    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    if ( schedulers[i] == NULL )
        printk("Could not find scheduler: %s\n", opt_sched);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);

    if ( SCHED_OP(init_scheduler) < 0 )
        panic("Initialising scheduler failed!");
}
/*
 * Start a scheduler for each CPU
 * This has to be done *after* the timers, e.g., APICs, have been initialised
 */
void schedulers_start(void)
{
    s_timer_fn(0);
    smp_call_function((void *)s_timer_fn, NULL, 1, 1);

    t_timer_fn(0);
    smp_call_function((void *)t_timer_fn, NULL, 1, 1);
}
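
/* Dump the active scheduler's settings and each CPU's run-queue state to the
 * console; @key is the console debug key that triggered the dump. */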
void dump_runq(unsigned char key)
{
    s_time_t      now = NOW();
    int           i;
    unsigned long flags;

    local_irq_save(flags);

    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(dump_settings);
    printk("NOW=0x%08X%08X\n",  (u32)(now>>32), (u32)now);

    for ( i = 0; i < smp_num_cpus; i++ )
    {
        spin_lock(&schedule_data[i].schedule_lock);
        printk("CPU[%02d] ", i);
        SCHED_OP(dump_cpu_state,i);
        spin_unlock(&schedule_data[i].schedule_lock);
    }

    local_irq_restore(flags);
}
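
/* Console debug-key handlers: print and reset the per-CPU scheduling-latency
 * histograms. These compile to empty stubs unless WAKE_HISTO or
 * BLOCKTIME_HISTO is defined at the top of this file. */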
#if defined(WAKE_HISTO) || defined(BLOCKTIME_HISTO)
void print_sched_histo(unsigned char key)
{
    int i, j, k;
    for ( k = 0; k < smp_num_cpus; k++ )
    {
        j = 0;
        printf ("CPU[%02d]: scheduler latency histogram (ms:[count])\n", k);
        for ( i = 0; i < BUCKETS; i++ )
        {
            if ( schedule_data[k].hist[i] != 0 )
            {
                if ( i < BUCKETS-1 )
                    printk("%2d:[%7u]    ", i, schedule_data[k].hist[i]);
                else
                    printk(" >:[%7u]    ", schedule_data[k].hist[i]);
                if ( !(++j % 5) )
                    printk("\n");
            }
        }
        printk("\n");
    }

}
void reset_sched_histo(unsigned char key)
{
    int i, j;
    for ( j = 0; j < smp_num_cpus; j++ )
        for ( i=0; i < BUCKETS; i++ )
            schedule_data[j].hist[i] = 0;
}
#else
void print_sched_histo(unsigned char key) { }
void reset_sched_histo(unsigned char key) { }
#endif