debuggers.hg: view of xen/common/schedule.c @ 21018:1bc860c790d9

cpuidle: Small fix to urgent_count update logic.

From: Ke Yu <ke.yu@intel.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>

author    Keir Fraser <keir.fraser@citrix.com>
date      Wed Feb 24 10:57:24 2010 +0000 (2010-02-24)
parents   ae2b7f1c89c8
children  9471200daee4

line source
1 /****************************************************************************
2 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
3 * (C) 2002-2003 University of Cambridge
4 * (C) 2004 - Mark Williamson - Intel Research Cambridge
5 ****************************************************************************
6 *
7 * File: common/schedule.c
8 * Author: Rolf Neugebauer & Keir Fraser
9 * Updated for generic API by Mark Williamson
10 *
11 * Description: Generic CPU scheduling code
12 * implements support functionality for the Xen scheduler API.
13 *
14 */
16 #ifndef COMPAT
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/sched.h>
21 #include <xen/domain.h>
22 #include <xen/delay.h>
23 #include <xen/event.h>
24 #include <xen/time.h>
25 #include <xen/timer.h>
26 #include <xen/perfc.h>
27 #include <xen/sched-if.h>
28 #include <xen/softirq.h>
29 #include <xen/trace.h>
30 #include <xen/mm.h>
31 #include <xen/errno.h>
32 #include <xen/guest_access.h>
33 #include <xen/multicall.h>
34 #include <public/sched.h>
35 #include <xsm/xsm.h>
37 /* opt_sched: scheduler - default to credit */
38 static char __initdata opt_sched[10] = "credit";
39 string_param("sched", opt_sched);
41 /* If sched_smt_power_savings is set, the scheduler will prefer a
42 * partially idle package over a fully idle one when picking a pCPU
43 * on which to schedule a vCPU.
44 */
45 int sched_smt_power_savings = 0;
46 boolean_param("sched_smt_power_savings", sched_smt_power_savings);
48 /* Various timer handlers. */
49 static void s_timer_fn(void *unused);
50 static void vcpu_periodic_timer_fn(void *data);
51 static void vcpu_singleshot_timer_fn(void *data);
52 static void poll_timer_fn(void *data);
54 /* This is global for now so that private implementations can reach it */
55 DEFINE_PER_CPU(struct schedule_data, schedule_data);
57 extern const struct scheduler sched_sedf_def;
58 extern const struct scheduler sched_credit_def;
59 static const struct scheduler *__initdata schedulers[] = {
60 &sched_sedf_def,
61 &sched_credit_def,
62 NULL
63 };
65 static struct scheduler __read_mostly ops;
67 #define SCHED_OP(fn, ...) \
68 (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ ) \
69 : (typeof(ops.fn(__VA_ARGS__)))0 )
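
A note on the dispatch macro: SCHED_OP() calls into whichever scheduler was selected at boot and evaluates to a typed zero when the hook is left NULL, so optional hooks need no guard at the call site. A minimal sketch of what one call expands to (the surrounding variable is invented for illustration):

    /* SCHED_OP(init_vcpu, v) behaves roughly like: */
    int rc = ( ops.init_vcpu != NULL ) ? ops.init_vcpu(v)
                                       : (typeof(ops.init_vcpu(v)))0;
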
71 static inline void trace_runstate_change(struct vcpu *v, int new_state)
72 {
73 struct { uint32_t vcpu:16, domain:16; } d;
74 uint32_t event;
76 if ( likely(!tb_init_done) )
77 return;
79 d.vcpu = v->vcpu_id;
80 d.domain = v->domain->domain_id;
82 event = TRC_SCHED_RUNSTATE_CHANGE;
83 event |= ( v->runstate.state & 0x3 ) << 8;
84 event |= ( new_state & 0x3 ) << 4;
86 __trace_var(event, 1/*tsc*/, sizeof(d), (unsigned char *)&d);
87 }
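
trace_runstate_change() packs both runstates into the event word itself rather than the payload: bits 8-9 carry the old state and bits 4-5 the new one. A hypothetical decoder for a trace consumer (not part of this file) simply inverts that packing:

    static void decode_runstate_event(uint32_t event,
                                      int *old_state, int *new_state)
    {
        *old_state = (event >> 8) & 0x3;   /* previous runstate */
        *new_state = (event >> 4) & 0x3;   /* runstate being entered */
    }
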
89 static inline void trace_continue_running(struct vcpu *v)
90 {
91 struct { uint32_t vcpu:16, domain:16; } d;
93 if ( likely(!tb_init_done) )
94 return;
96 d.vcpu = v->vcpu_id;
97 d.domain = v->domain->domain_id;
99 __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d),
100 (unsigned char *)&d);
101 }
103 static inline void vcpu_urgent_count_update(struct vcpu *v)
104 {
105 if ( is_idle_vcpu(v) )
106 return;
108 if ( unlikely(v->is_urgent) )
109 {
110 if ( !test_bit(_VPF_blocked, &v->pause_flags) ||
111 !test_bit(v->vcpu_id, v->domain->poll_mask) )
112 {
113 v->is_urgent = 0;
114 atomic_dec(&per_cpu(schedule_data,v->processor).urgent_count);
115 }
116 }
117 else
118 {
119 if ( unlikely(test_bit(_VPF_blocked, &v->pause_flags) &&
120 test_bit(v->vcpu_id, v->domain->poll_mask)) )
121 {
122 v->is_urgent = 1;
123 atomic_inc(&per_cpu(schedule_data,v->processor).urgent_count);
124 }
125 }
126 }
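
This is the urgent_count accounting the changeset in the header adjusts: a vCPU now counts as urgent only while it is both blocked and polling an event channel, and urgent_count tracks how many such vCPUs are assigned to each physical CPU. A hedged sketch of how an idle governor could consume the counter (the helper name and policy are assumptions, not Xen's actual cpuidle code):

    static int choose_cstate(unsigned int cpu, int deepest)
    {
        /* A blocked-and-polling vCPU wants a low-latency wakeup, so stay
         * shallow while any such vCPU is accounted to this CPU. */
        if ( atomic_read(&per_cpu(schedule_data, cpu).urgent_count) )
            return 1;
        return deepest;
    }
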
128 static inline void vcpu_runstate_change(
129 struct vcpu *v, int new_state, s_time_t new_entry_time)
130 {
131 s_time_t delta;
133 ASSERT(v->runstate.state != new_state);
134 ASSERT(spin_is_locked(&per_cpu(schedule_data,v->processor).schedule_lock));
136 vcpu_urgent_count_update(v);
138 trace_runstate_change(v, new_state);
140 delta = new_entry_time - v->runstate.state_entry_time;
141 if ( delta > 0 )
142 {
143 v->runstate.time[v->runstate.state] += delta;
144 v->runstate.state_entry_time = new_entry_time;
145 }
147 v->runstate.state = new_state;
148 }
150 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
151 {
152 s_time_t delta;
154 if ( unlikely(v != current) )
155 vcpu_schedule_lock_irq(v);
157 memcpy(runstate, &v->runstate, sizeof(*runstate));
158 delta = NOW() - runstate->state_entry_time;
159 if ( delta > 0 )
160 runstate->time[runstate->state] += delta;
162 if ( unlikely(v != current) )
163 vcpu_schedule_unlock_irq(v);
164 }
166 uint64_t get_cpu_idle_time(unsigned int cpu)
167 {
168 struct vcpu_runstate_info state;
169 struct vcpu *v;
171 if ( (v = idle_vcpu[cpu]) == NULL )
172 return 0;
174 vcpu_runstate_get(v, &state);
175 return state.time[RUNSTATE_running];
176 }
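
get_cpu_idle_time() returns the cumulative nanoseconds the CPU's idle vCPU has spent in RUNSTATE_running, so a consumer must sample it twice to obtain a utilisation figure. A small hypothetical helper (names invented):

    static unsigned int cpu_idle_percent(uint64_t idle_prev, s_time_t t_prev,
                                         uint64_t idle_now, s_time_t t_now)
    {
        s_time_t span = t_now - t_prev;
        return (span > 0) ?
            (unsigned int)((idle_now - idle_prev) * 100 / span) : 0;
    }
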
178 int sched_init_vcpu(struct vcpu *v, unsigned int processor)
179 {
180 struct domain *d = v->domain;
182 /*
183 * Initialize processor and affinity settings. The idler, and potentially
184 * domain-0 VCPUs, are pinned onto their respective physical CPUs.
185 */
186 v->processor = processor;
187 if ( is_idle_domain(d) || d->is_pinned )
188 v->cpu_affinity = cpumask_of_cpu(processor);
189 else
190 cpus_setall(v->cpu_affinity);
192 /* Initialise the per-vcpu timers. */
193 init_timer(&v->periodic_timer, vcpu_periodic_timer_fn,
194 v, v->processor);
195 init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn,
196 v, v->processor);
197 init_timer(&v->poll_timer, poll_timer_fn,
198 v, v->processor);
200 /* Idle VCPUs are scheduled immediately. */
201 if ( is_idle_domain(d) )
202 {
203 per_cpu(schedule_data, v->processor).curr = v;
204 per_cpu(schedule_data, v->processor).idle = v;
205 v->is_running = 1;
206 }
208 TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);
210 return SCHED_OP(init_vcpu, v);
211 }
213 void sched_destroy_vcpu(struct vcpu *v)
214 {
215 kill_timer(&v->periodic_timer);
216 kill_timer(&v->singleshot_timer);
217 kill_timer(&v->poll_timer);
218 if ( test_and_clear_bool(v->is_urgent) )
219 atomic_dec(&per_cpu(schedule_data, v->processor).urgent_count);
220 SCHED_OP(destroy_vcpu, v);
221 }
223 int sched_init_domain(struct domain *d)
224 {
225 return SCHED_OP(init_domain, d);
226 }
228 void sched_destroy_domain(struct domain *d)
229 {
230 SCHED_OP(destroy_domain, d);
231 }
233 void vcpu_sleep_nosync(struct vcpu *v)
234 {
235 unsigned long flags;
237 vcpu_schedule_lock_irqsave(v, flags);
239 if ( likely(!vcpu_runnable(v)) )
240 {
241 if ( v->runstate.state == RUNSTATE_runnable )
242 vcpu_runstate_change(v, RUNSTATE_offline, NOW());
244 SCHED_OP(sleep, v);
245 }
247 vcpu_schedule_unlock_irqrestore(v, flags);
249 TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
250 }
252 void vcpu_sleep_sync(struct vcpu *v)
253 {
254 vcpu_sleep_nosync(v);
256 while ( !vcpu_runnable(v) && v->is_running )
257 cpu_relax();
259 sync_vcpu_execstate(v);
260 }
262 void vcpu_wake(struct vcpu *v)
263 {
264 unsigned long flags;
266 vcpu_schedule_lock_irqsave(v, flags);
268 if ( likely(vcpu_runnable(v)) )
269 {
270 if ( v->runstate.state >= RUNSTATE_blocked )
271 vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
272 SCHED_OP(wake, v);
273 }
274 else if ( !test_bit(_VPF_blocked, &v->pause_flags) )
275 {
276 if ( v->runstate.state == RUNSTATE_blocked )
277 vcpu_runstate_change(v, RUNSTATE_offline, NOW());
278 }
280 vcpu_schedule_unlock_irqrestore(v, flags);
282 TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
283 }
285 void vcpu_unblock(struct vcpu *v)
286 {
287 if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
288 return;
290 /* Polling period ends when a VCPU is unblocked. */
291 if ( unlikely(v->poll_evtchn != 0) )
292 {
293 v->poll_evtchn = 0;
294 /*
295 * We *must* re-clear _VPF_blocked to avoid racing other wakeups of
296 * this VCPU (and it then going back to sleep on poll_mask).
297 * Test-and-clear is idiomatic and ensures clear_bit not reordered.
298 */
299 if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
300 clear_bit(_VPF_blocked, &v->pause_flags);
301 }
303 vcpu_wake(v);
304 }
306 static void vcpu_migrate(struct vcpu *v)
307 {
308 unsigned long flags;
309 int old_cpu, new_cpu;
311 vcpu_schedule_lock_irqsave(v, flags);
313 /*
314 * NB. Check of v->running happens /after/ setting migration flag
315 * because they both happen in (different) spinlock regions, and those
316 * regions are strictly serialised.
317 */
318 if ( v->is_running ||
319 !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
320 {
321 vcpu_schedule_unlock_irqrestore(v, flags);
322 return;
323 }
325 /* Select new CPU. */
326 old_cpu = v->processor;
327 new_cpu = SCHED_OP(pick_cpu, v);
329 /*
330 * Transfer urgency status to new CPU before switching CPUs, as once
331 * the switch occurs, v->is_urgent is no longer protected by the per-CPU
332 * scheduler lock we are holding.
333 */
334 if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) )
335 {
336 atomic_inc(&per_cpu(schedule_data, new_cpu).urgent_count);
337 atomic_dec(&per_cpu(schedule_data, old_cpu).urgent_count);
338 }
340 /* Switch to new CPU, then unlock old CPU. */
341 v->processor = new_cpu;
342 spin_unlock_irqrestore(
343 &per_cpu(schedule_data, old_cpu).schedule_lock, flags);
345 /* Wake on new CPU. */
346 vcpu_wake(v);
347 }
349 /*
350 * Force a VCPU through a deschedule/reschedule path.
351 * For example, using this when setting the periodic timer period means that
352 * most periodic-timer state need only be touched from within the scheduler
353 * which can thus be done without need for synchronisation.
354 */
355 void vcpu_force_reschedule(struct vcpu *v)
356 {
357 vcpu_schedule_lock_irq(v);
358 if ( v->is_running )
359 set_bit(_VPF_migrating, &v->pause_flags);
360 vcpu_schedule_unlock_irq(v);
362 if ( test_bit(_VPF_migrating, &v->pause_flags) )
363 {
364 vcpu_sleep_nosync(v);
365 vcpu_migrate(v);
366 }
367 }
369 /*
370 * This function is used by cpu_hotplug code from stop_machine context.
371 * Hence we can avoid needing to take the schedule_lock.
372 */
373 void cpu_disable_scheduler(void)
374 {
375 struct domain *d;
376 struct vcpu *v;
377 unsigned int cpu = smp_processor_id();
379 for_each_domain ( d )
380 {
381 for_each_vcpu ( d, v )
382 {
383 if ( is_idle_vcpu(v) )
384 continue;
386 if ( (cpus_weight(v->cpu_affinity) == 1) &&
387 cpu_isset(cpu, v->cpu_affinity) )
388 {
389 printk("Breaking vcpu affinity for domain %d vcpu %d\n",
390 v->domain->domain_id, v->vcpu_id);
391 cpus_setall(v->cpu_affinity);
392 }
394 /*
395 * Migrate single-shot timers to CPU0. A new cpu will automatically
396 * be chosen when the timer is next re-set.
397 */
398 if ( v->singleshot_timer.cpu == cpu )
399 migrate_timer(&v->singleshot_timer, 0);
401 if ( v->processor == cpu )
402 {
403 set_bit(_VPF_migrating, &v->pause_flags);
404 vcpu_sleep_nosync(v);
405 vcpu_migrate(v);
406 }
407 }
408 }
409 }
411 static int __vcpu_set_affinity(
412 struct vcpu *v, cpumask_t *affinity,
413 bool_t old_lock_status, bool_t new_lock_status)
414 {
415 cpumask_t online_affinity, old_affinity;
417 cpus_and(online_affinity, *affinity, cpu_online_map);
418 if ( cpus_empty(online_affinity) )
419 return -EINVAL;
421 vcpu_schedule_lock_irq(v);
423 if ( v->affinity_locked != old_lock_status )
424 {
425 BUG_ON(!v->affinity_locked);
426 vcpu_schedule_unlock_irq(v);
427 return -EBUSY;
428 }
430 v->affinity_locked = new_lock_status;
432 old_affinity = v->cpu_affinity;
433 v->cpu_affinity = *affinity;
434 *affinity = old_affinity;
435 if ( !cpu_isset(v->processor, v->cpu_affinity) )
436 set_bit(_VPF_migrating, &v->pause_flags);
438 vcpu_schedule_unlock_irq(v);
440 if ( test_bit(_VPF_migrating, &v->pause_flags) )
441 {
442 vcpu_sleep_nosync(v);
443 vcpu_migrate(v);
444 }
446 return 0;
447 }
449 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
450 {
451 if ( v->domain->is_pinned )
452 return -EINVAL;
453 return __vcpu_set_affinity(v, affinity, 0, 0);
454 }
456 int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity)
457 {
458 return __vcpu_set_affinity(v, affinity, 0, 1);
459 }
461 int vcpu_locked_change_affinity(struct vcpu *v, cpumask_t *affinity)
462 {
463 return __vcpu_set_affinity(v, affinity, 1, 1);
464 }
466 void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity)
467 {
468 cpumask_t online_affinity;
470 /* Do not fail if no CPU in old affinity mask is online. */
471 cpus_and(online_affinity, *affinity, cpu_online_map);
472 if ( cpus_empty(online_affinity) )
473 *affinity = cpu_online_map;
475 if ( __vcpu_set_affinity(v, affinity, 1, 0) != 0 )
476 BUG();
477 }
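
All four public wrappers funnel into __vcpu_set_affinity(), differing only in how they treat affinity_locked; note that the caller's mask is swapped, so the previous affinity comes back through the same pointer. A hedged usage sketch (the calling context is invented):

    /* Temporarily restrict v to CPU 2, then restore its old affinity. */
    cpumask_t mask;

    cpus_clear(mask);
    cpu_set(2, mask);
    if ( vcpu_set_affinity(v, &mask) == 0 )
    {
        /* 'mask' now holds the previous affinity. */
        /* ... work that required CPU 2 ... */
        vcpu_set_affinity(v, &mask);
    }
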
479 /* Block the currently-executing domain until a pertinent event occurs. */
480 static long do_block(void)
481 {
482 struct vcpu *v = current;
484 local_event_delivery_enable();
485 set_bit(_VPF_blocked, &v->pause_flags);
487 /* Check for events /after/ blocking: avoids wakeup waiting race. */
488 if ( local_events_need_delivery() )
489 {
490 clear_bit(_VPF_blocked, &v->pause_flags);
491 }
492 else
493 {
494 TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
495 raise_softirq(SCHEDULE_SOFTIRQ);
496 }
498 return 0;
499 }
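
do_block() enables event delivery before setting _VPF_blocked and only then re-checks for pending events, which closes the classic lost-wakeup window. From the guest it is simply the tail of an idle loop; a minimal sketch, assuming the usual paravirtual hypercall wrapper:

    static void idle_block(void)
    {
        /* Xen re-enables event delivery and re-checks pending events
         * inside do_block(), so no wakeup can be lost here. */
        (void)HYPERVISOR_sched_op(SCHEDOP_block, NULL);
    }
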
501 static long do_poll(struct sched_poll *sched_poll)
502 {
503 struct vcpu *v = current;
504 struct domain *d = v->domain;
505 evtchn_port_t port;
506 long rc;
507 unsigned int i;
509 /* Fairly arbitrary limit. */
510 if ( sched_poll->nr_ports > 128 )
511 return -EINVAL;
513 if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
514 return -EFAULT;
516 set_bit(_VPF_blocked, &v->pause_flags);
517 v->poll_evtchn = -1;
518 set_bit(v->vcpu_id, d->poll_mask);
520 #ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
521 /* Check for events /after/ setting flags: avoids wakeup waiting race. */
522 smp_mb();
524 /*
525 * Someone may have seen we are blocked but not that we are polling, or
526 * vice versa. We are certainly being woken, so clean up and bail. Beyond
527 * this point others can be guaranteed to clean up for us if they wake us.
528 */
529 rc = 0;
530 if ( (v->poll_evtchn == 0) ||
531 !test_bit(_VPF_blocked, &v->pause_flags) ||
532 !test_bit(v->vcpu_id, d->poll_mask) )
533 goto out;
534 #endif
536 rc = 0;
537 if ( local_events_need_delivery() )
538 goto out;
540 for ( i = 0; i < sched_poll->nr_ports; i++ )
541 {
542 rc = -EFAULT;
543 if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
544 goto out;
546 rc = -EINVAL;
547 if ( port >= MAX_EVTCHNS(d) )
548 goto out;
550 rc = 0;
551 if ( test_bit(port, &shared_info(d, evtchn_pending)) )
552 goto out;
553 }
555 if ( sched_poll->nr_ports == 1 )
556 v->poll_evtchn = port;
558 if ( sched_poll->timeout != 0 )
559 set_timer(&v->poll_timer, sched_poll->timeout);
561 TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
562 raise_softirq(SCHEDULE_SOFTIRQ);
564 return 0;
566 out:
567 v->poll_evtchn = 0;
568 clear_bit(v->vcpu_id, d->poll_mask);
569 clear_bit(_VPF_blocked, &v->pause_flags);
570 return rc;
571 }
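
SCHEDOP_poll lets a blocked vCPU wait on up to 128 event-channel ports at once, optionally with a timeout; with exactly one port the hypervisor also records poll_evtchn so a directed wakeup can skip the scan. A hedged guest-side sketch (the port number and the source of now_ns are assumptions):

    evtchn_port_t port = 7;
    struct sched_poll poll = { .nr_ports = 1 };

    set_xen_guest_handle(poll.ports, &port);
    poll.timeout = now_ns + 1000000ULL;   /* absolute Xen time, 1ms ahead */
    (void)HYPERVISOR_sched_op(SCHEDOP_poll, &poll);
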
573 /* Voluntarily yield the processor for this allocation. */
574 static long do_yield(void)
575 {
576 TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
577 raise_softirq(SCHEDULE_SOFTIRQ);
578 return 0;
579 }
581 long do_sched_op_compat(int cmd, unsigned long arg)
582 {
583 long ret = 0;
585 switch ( cmd )
586 {
587 case SCHEDOP_yield:
588 {
589 ret = do_yield();
590 break;
591 }
593 case SCHEDOP_block:
594 {
595 ret = do_block();
596 break;
597 }
599 case SCHEDOP_shutdown:
600 {
601 TRACE_3D(TRC_SCHED_SHUTDOWN,
602 current->domain->domain_id, current->vcpu_id, arg);
603 domain_shutdown(current->domain, (u8)arg);
604 break;
605 }
607 default:
608 ret = -ENOSYS;
609 }
611 return ret;
612 }
614 typedef long ret_t;
616 #endif /* !COMPAT */
618 ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE(void) arg)
619 {
620 ret_t ret = 0;
622 switch ( cmd )
623 {
624 case SCHEDOP_yield:
625 {
626 ret = do_yield();
627 break;
628 }
630 case SCHEDOP_block:
631 {
632 ret = do_block();
633 break;
634 }
636 case SCHEDOP_shutdown:
637 {
638 struct sched_shutdown sched_shutdown;
640 ret = -EFAULT;
641 if ( copy_from_guest(&sched_shutdown, arg, 1) )
642 break;
644 ret = 0;
645 TRACE_3D(TRC_SCHED_SHUTDOWN,
646 current->domain->domain_id, current->vcpu_id,
647 sched_shutdown.reason);
648 domain_shutdown(current->domain, (u8)sched_shutdown.reason);
650 break;
651 }
653 case SCHEDOP_poll:
654 {
655 struct sched_poll sched_poll;
657 ret = -EFAULT;
658 if ( copy_from_guest(&sched_poll, arg, 1) )
659 break;
661 ret = do_poll(&sched_poll);
663 break;
664 }
666 case SCHEDOP_remote_shutdown:
667 {
668 struct domain *d;
669 struct sched_remote_shutdown sched_remote_shutdown;
671 ret = -EFAULT;
672 if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
673 break;
675 ret = -ESRCH;
676 d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
677 if ( d == NULL )
678 break;
680 if ( !IS_PRIV_FOR(current->domain, d) )
681 {
682 rcu_unlock_domain(d);
683 return -EPERM;
684 }
686 ret = xsm_schedop_shutdown(current->domain, d);
687 if ( ret )
688 {
689 rcu_unlock_domain(d);
690 return ret;
691 }
693 domain_shutdown(d, (u8)sched_remote_shutdown.reason);
695 rcu_unlock_domain(d);
696 ret = 0;
698 break;
699 }
701 default:
702 ret = -ENOSYS;
703 }
705 return ret;
706 }
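
do_sched_op() is the handle-based flavour of the hypercall: each sub-command copies its small argument structure in from guest memory before acting on it. A minimal guest-side sketch of the SCHEDOP_shutdown case, assuming the usual hypercall wrapper:

    struct sched_shutdown shutdown = { .reason = SHUTDOWN_poweroff };

    (void)HYPERVISOR_sched_op(SCHEDOP_shutdown, &shutdown);
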
708 #ifndef COMPAT
710 /* Per-vcpu oneshot-timer hypercall. */
711 long do_set_timer_op(s_time_t timeout)
712 {
713 struct vcpu *v = current;
714 s_time_t offset = timeout - NOW();
716 if ( timeout == 0 )
717 {
718 stop_timer(&v->singleshot_timer);
719 }
720 else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
721 unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
722 {
723 /*
724 * Linux workaround: occasionally we will see timeouts a long way in
725 * the future due to wrapping in Linux's jiffy time handling. We check
726 * for timeouts wrapped negative, and for positive timeouts more than
727 * about 13 days in the future (2^50ns). The correct fix is to trigger
728 * an interrupt immediately (since Linux in fact has pending work to
729 * do in this situation). However, older guests also set a long timeout
730 * when they have *no* pending timers at all: setting an immediate
731 * timeout in this case can burn a lot of CPU. We therefore go for a
732 * reasonable middle ground of triggering a timer event in 100ms.
733 */
734 gdprintk(XENLOG_INFO,
735 "Warning: huge timeout set by vcpu %d: %"PRIx64"\n",
736 v->vcpu_id, (uint64_t)timeout);
737 set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
738 }
739 else
740 {
741 if ( v->singleshot_timer.cpu != smp_processor_id() )
742 {
743 stop_timer(&v->singleshot_timer);
744 v->singleshot_timer.cpu = smp_processor_id();
745 }
747 set_timer(&v->singleshot_timer, timeout);
748 }
750 return 0;
751 }
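
The guard above rejects timeouts that have wrapped negative or that sit more than 2^50 ns ahead; 2^50 ns divided by 10^9 ns/s and 86400 s/day is roughly 13 days, which is where the figure in the comment comes from. A guest-side sketch of a sane call, assuming the usual wrapper and that now_ns is the guest's view of Xen system time:

    /* Arm the per-vCPU one-shot timer with an absolute deadline 10ms out. */
    (void)HYPERVISOR_set_timer_op(now_ns + 10000000ULL);
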
753 /* sched_id - fetch ID of current scheduler */
754 int sched_id(void)
755 {
756 return ops.sched_id;
757 }
759 /* Adjust scheduling parameter for a given domain. */
760 long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
761 {
762 struct vcpu *v;
763 long ret;
765 if ( (op->sched_id != ops.sched_id) ||
766 ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
767 (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
768 return -EINVAL;
770 /*
771 * Most VCPUs we can simply pause. If we are adjusting this VCPU then
772 * we acquire the local schedule_lock to guard against concurrent updates.
773 *
774 * We only acquire the local schedule lock after we have paused all other
775 * VCPUs in this domain. There are two reasons for this:
776 * 1- We don't want to hold up interrupts as pausing a VCPU can
777 * trigger a tlb shootdown.
778 * 2- Pausing other VCPUs involves briefly locking the schedule
779 * lock of the CPU they are running on. This CPU could be the
780 * same as ours.
781 */
783 for_each_vcpu ( d, v )
784 {
785 if ( v != current )
786 vcpu_pause(v);
787 }
789 if ( d == current->domain )
790 vcpu_schedule_lock_irq(current);
792 if ( (ret = SCHED_OP(adjust, d, op)) == 0 )
793 TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
795 if ( d == current->domain )
796 vcpu_schedule_unlock_irq(current);
798 for_each_vcpu ( d, v )
799 {
800 if ( v != current )
801 vcpu_unpause(v);
802 }
804 return ret;
805 }
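
sched_adjust() pauses every other vCPU of the domain before taking the local schedule lock, for exactly the two reasons spelled out in the comment above. For concreteness, a hedged sketch of what a credit-scheduler putinfo request could look like by the time it reaches this function (field values invented; 256 is the credit scheduler's default weight):

    struct xen_domctl_scheduler_op op = {
        .sched_id = XEN_SCHEDULER_CREDIT,
        .cmd      = XEN_DOMCTL_SCHEDOP_putinfo,
    };
    long rc;

    op.u.credit.weight = 512;   /* twice the default share of CPU */
    op.u.credit.cap    = 0;     /* no hard cap */
    rc = sched_adjust(d, &op);  /* 'd' is the target domain */
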
807 static void vcpu_periodic_timer_work(struct vcpu *v)
808 {
809 s_time_t now = NOW();
810 uint64_t periodic_next_event;
812 ASSERT(!active_timer(&v->periodic_timer));
814 if ( v->periodic_period == 0 )
815 return;
817 periodic_next_event = v->periodic_last_event + v->periodic_period;
819 if ( now >= periodic_next_event )
820 {
821 send_timer_event(v);
822 v->periodic_last_event = now;
823 periodic_next_event = now + v->periodic_period;
824 }
826 v->periodic_timer.cpu = smp_processor_id();
827 set_timer(&v->periodic_timer, periodic_next_event);
828 }
830 /*
831 * The main function
832 * - deschedule the current domain (scheduler independent).
833 * - pick a new domain (scheduler dependent).
834 */
835 static void schedule(void)
836 {
837 struct vcpu *prev = current, *next = NULL;
838 s_time_t now = NOW();
839 struct schedule_data *sd;
840 struct task_slice next_slice;
842 ASSERT(!in_irq());
843 ASSERT(this_cpu(mc_state).flags == 0);
845 perfc_incr(sched_run);
847 sd = &this_cpu(schedule_data);
849 spin_lock_irq(&sd->schedule_lock);
851 stop_timer(&sd->s_timer);
853 /* get policy-specific decision on scheduling... */
854 next_slice = ops.do_schedule(now);
856 next = next_slice.task;
858 sd->curr = next;
860 if ( next_slice.time >= 0 ) /* -ve means no limit */
861 set_timer(&sd->s_timer, now + next_slice.time);
863 if ( unlikely(prev == next) )
864 {
865 spin_unlock_irq(&sd->schedule_lock);
866 trace_continue_running(next);
867 return continue_running(prev);
868 }
870 TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
871 prev->domain->domain_id,
872 now - prev->runstate.state_entry_time);
873 TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
874 next->domain->domain_id,
875 (next->runstate.state == RUNSTATE_runnable) ?
876 (now - next->runstate.state_entry_time) : 0,
877 next_slice.time);
879 ASSERT(prev->runstate.state == RUNSTATE_running);
880 vcpu_runstate_change(
881 prev,
882 (test_bit(_VPF_blocked, &prev->pause_flags) ? RUNSTATE_blocked :
883 (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
884 now);
885 prev->last_run_time = now;
887 ASSERT(next->runstate.state != RUNSTATE_running);
888 vcpu_runstate_change(next, RUNSTATE_running, now);
890 ASSERT(!next->is_running);
891 next->is_running = 1;
893 spin_unlock_irq(&sd->schedule_lock);
895 perfc_incr(sched_ctx);
897 stop_timer(&prev->periodic_timer);
899 /* Ensure that the domain has an up-to-date time base. */
900 update_vcpu_system_time(next);
901 vcpu_periodic_timer_work(next);
903 TRACE_4D(TRC_SCHED_SWITCH,
904 prev->domain->domain_id, prev->vcpu_id,
905 next->domain->domain_id, next->vcpu_id);
907 context_switch(prev, next);
908 }
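
schedule() leaves the actual pick to ops.do_schedule(), which hands back a task_slice: the vCPU to run next and for how long, with a negative time meaning no enforced limit (see the check above). A sketch of the returned shape, with a hypothetical picker:

    static struct task_slice example_do_schedule(s_time_t now)
    {
        struct task_slice ret;

        ret.task = pick_next_vcpu();   /* hypothetical helper */
        ret.time = MILLISECS(30);      /* preempt after 30ms */
        return ret;
    }
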
910 void context_saved(struct vcpu *prev)
911 {
912 /* Clear running flag /after/ writing context to memory. */
913 smp_wmb();
915 prev->is_running = 0;
917 /* Check for migration request /after/ clearing running flag. */
918 smp_mb();
920 if ( unlikely(test_bit(_VPF_migrating, &prev->pause_flags)) )
921 vcpu_migrate(prev);
922 }
924 /* The scheduler timer: force a run through the scheduler */
925 static void s_timer_fn(void *unused)
926 {
927 raise_softirq(SCHEDULE_SOFTIRQ);
928 perfc_incr(sched_irq);
929 }
931 /* Per-VCPU periodic timer function: sends a virtual timer interrupt. */
932 static void vcpu_periodic_timer_fn(void *data)
933 {
934 struct vcpu *v = data;
935 vcpu_periodic_timer_work(v);
936 }
938 /* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */
939 static void vcpu_singleshot_timer_fn(void *data)
940 {
941 struct vcpu *v = data;
942 send_timer_event(v);
943 }
945 /* SCHEDOP_poll timeout callback. */
946 static void poll_timer_fn(void *data)
947 {
948 struct vcpu *v = data;
950 if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
951 vcpu_unblock(v);
952 }
954 /* Initialise the data structures. */
955 void __init scheduler_init(void)
956 {
957 int i;
959 open_softirq(SCHEDULE_SOFTIRQ, schedule);
961 for_each_possible_cpu ( i )
962 {
963 spin_lock_init(&per_cpu(schedule_data, i).schedule_lock);
964 init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
965 }
967 for ( i = 0; schedulers[i] != NULL; i++ )
968 {
969 ops = *schedulers[i];
970 if ( strcmp(ops.opt_name, opt_sched) == 0 )
971 break;
972 }
974 if ( schedulers[i] == NULL )
975 {
976 printk("Could not find scheduler: %s\n", opt_sched);
977 ops = *schedulers[0];
978 }
980 printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
981 SCHED_OP(init);
982 }
984 void dump_runq(unsigned char key)
985 {
986 s_time_t now = NOW();
987 int i;
988 unsigned long flags;
990 local_irq_save(flags);
992 printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
993 SCHED_OP(dump_settings);
994 printk("sched_smt_power_savings: %s\n",
995 sched_smt_power_savings? "enabled":"disabled");
996 printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);
998 for_each_online_cpu ( i )
999 {
1000 spin_lock(&per_cpu(schedule_data, i).schedule_lock);
1001 printk("CPU[%02d] ", i);
1002 SCHED_OP(dump_cpu_state, i);
1003 spin_unlock(&per_cpu(schedule_data, i).schedule_lock);
1004 }
1006 local_irq_restore(flags);
1007 }
1009 void sched_tick_suspend(void)
1010 {
1011 SCHED_OP(tick_suspend);
1012 }
1014 void sched_tick_resume(void)
1015 {
1016 SCHED_OP(tick_resume);
1017 }
1019 #ifdef CONFIG_COMPAT
1020 #include "compat/schedule.c"
1021 #endif
1023 #endif /* !COMPAT */
1025 /*
1026 * Local variables:
1027 * mode: C
1028 * c-set-style: "BSD"
1029 * c-basic-offset: 4
1030 * tab-width: 4
1031 * indent-tabs-mode: nil
1032 * End:
1033 */