view xen/common/schedule.c @ 22676:e8acb9753ff1

Use bool_t for various boolean variables

... decreasing cache footprint. As a prerequisite this requires making
cmdline_parse() a little more flexible.

Also remove a few variables altogether, and adjust section
annotations for several others.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Keir Fraser <keir@xen.org>
Author:   Keir Fraser <keir@xen.org>
Date:     Fri Dec 24 10:10:45 2010 +0000 (2010-12-24)
Parents:  05377a796952
Children: 700ac6445812
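The conversion this changeset describes follows the pattern already visible in this file (see sched_smt_power_savings at lines 47-48): a variable that only ever holds 0 or 1 is declared as bool_t rather than a wider integer type, which is what shrinks the cache footprint mentioned above. A minimal sketch of the pattern, using a made-up flag name rather than any variable actually touched by this changeset:

    /* Before: a full int spent on a yes/no command-line flag. */
    static int opt_example_flag = 0;
    boolean_param("example-flag", opt_example_flag);

    /* After: bool_t (a single-byte type in Xen) carries the same information. */
    static bool_t opt_example_flag = 0;
    boolean_param("example-flag", opt_example_flag);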
1 /****************************************************************************
2 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
3 * (C) 2002-2003 University of Cambridge
4 * (C) 2004 - Mark Williamson - Intel Research Cambridge
5 ****************************************************************************
6 *
7 * File: common/schedule.c
8 * Author: Rolf Neugebauer & Keir Fraser
9 * Updated for generic API by Mark Williamson
10 *
11 * Description: Generic CPU scheduling code
12 * implements support functionality for the Xen scheduler API.
13 *
14 */
16 #ifndef COMPAT
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/sched.h>
21 #include <xen/domain.h>
22 #include <xen/delay.h>
23 #include <xen/event.h>
24 #include <xen/time.h>
25 #include <xen/timer.h>
26 #include <xen/perfc.h>
27 #include <xen/sched-if.h>
28 #include <xen/softirq.h>
29 #include <xen/trace.h>
30 #include <xen/mm.h>
31 #include <xen/errno.h>
32 #include <xen/guest_access.h>
33 #include <xen/multicall.h>
34 #include <xen/cpu.h>
35 #include <xen/preempt.h>
36 #include <public/sched.h>
37 #include <xsm/xsm.h>
39 /* opt_sched: scheduler - default to credit */
40 static char __initdata opt_sched[10] = "credit";
41 string_param("sched", opt_sched);
43 /* If sched_smt_power_savings is set, the scheduler will give preference
44 * to a partially idle package over a fully idle package when picking a
45 * pCPU on which to schedule a vCPU.
46 */
47 bool_t sched_smt_power_savings = 0;
48 boolean_param("sched_smt_power_savings", sched_smt_power_savings);
50 /* Various timer handlers. */
51 static void s_timer_fn(void *unused);
52 static void vcpu_periodic_timer_fn(void *data);
53 static void vcpu_singleshot_timer_fn(void *data);
54 static void poll_timer_fn(void *data);
56 /* This is global for now so that private implementations can reach it */
57 DEFINE_PER_CPU(struct schedule_data, schedule_data);
58 DEFINE_PER_CPU(struct scheduler *, scheduler);
60 extern const struct scheduler sched_sedf_def;
61 extern const struct scheduler sched_credit_def;
62 extern const struct scheduler sched_credit2_def;
63 extern const struct scheduler sched_arinc653_def;
64 static const struct scheduler *schedulers[] = {
65 &sched_sedf_def,
66 &sched_credit_def,
67 &sched_credit2_def,
68 &sched_arinc653_def,
69 NULL
70 };
72 static struct scheduler __read_mostly ops;
74 #define SCHED_OP(opsptr, fn, ...) \
75 (( (opsptr)->fn != NULL ) ? (opsptr)->fn(opsptr, ##__VA_ARGS__ ) \
76 : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0 )
78 #define DOM2OP(_d) (((_d)->cpupool == NULL) ? &ops : ((_d)->cpupool->sched))
79 #define VCPU2OP(_v) (DOM2OP((_v)->domain))
80 #define VCPU2ONLINE(_v) \
81 (((_v)->domain->cpupool == NULL) ? &cpu_online_map \
82 : &(_v)->domain->cpupool->cpu_valid)
84 static inline void trace_runstate_change(struct vcpu *v, int new_state)
85 {
86 struct { uint32_t vcpu:16, domain:16; } d;
87 uint32_t event;
89 if ( likely(!tb_init_done) )
90 return;
92 d.vcpu = v->vcpu_id;
93 d.domain = v->domain->domain_id;
95 event = TRC_SCHED_RUNSTATE_CHANGE;
96 event |= ( v->runstate.state & 0x3 ) << 8;
97 event |= ( new_state & 0x3 ) << 4;
99 __trace_var(event, 1/*tsc*/, sizeof(d), &d);
100 }
102 static inline void trace_continue_running(struct vcpu *v)
103 {
104 struct { uint32_t vcpu:16, domain:16; } d;
106 if ( likely(!tb_init_done) )
107 return;
109 d.vcpu = v->vcpu_id;
110 d.domain = v->domain->domain_id;
112 __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d);
113 }
115 static inline void vcpu_urgent_count_update(struct vcpu *v)
116 {
117 if ( is_idle_vcpu(v) )
118 return;
120 if ( unlikely(v->is_urgent) )
121 {
122 if ( !test_bit(_VPF_blocked, &v->pause_flags) ||
123 !test_bit(v->vcpu_id, v->domain->poll_mask) )
124 {
125 v->is_urgent = 0;
126 atomic_dec(&per_cpu(schedule_data,v->processor).urgent_count);
127 }
128 }
129 else
130 {
131 if ( unlikely(test_bit(_VPF_blocked, &v->pause_flags) &&
132 test_bit(v->vcpu_id, v->domain->poll_mask)) )
133 {
134 v->is_urgent = 1;
135 atomic_inc(&per_cpu(schedule_data,v->processor).urgent_count);
136 }
137 }
138 }
140 static inline void vcpu_runstate_change(
141 struct vcpu *v, int new_state, s_time_t new_entry_time)
142 {
143 s_time_t delta;
145 ASSERT(v->runstate.state != new_state);
146 ASSERT(spin_is_locked(per_cpu(schedule_data,v->processor).schedule_lock));
148 vcpu_urgent_count_update(v);
150 trace_runstate_change(v, new_state);
152 delta = new_entry_time - v->runstate.state_entry_time;
153 if ( delta > 0 )
154 {
155 v->runstate.time[v->runstate.state] += delta;
156 v->runstate.state_entry_time = new_entry_time;
157 }
159 v->runstate.state = new_state;
160 }
162 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
163 {
164 s_time_t delta;
166 if ( unlikely(v != current) )
167 vcpu_schedule_lock_irq(v);
169 memcpy(runstate, &v->runstate, sizeof(*runstate));
170 delta = NOW() - runstate->state_entry_time;
171 if ( delta > 0 )
172 runstate->time[runstate->state] += delta;
174 if ( unlikely(v != current) )
175 vcpu_schedule_unlock_irq(v);
176 }
178 uint64_t get_cpu_idle_time(unsigned int cpu)
179 {
180 struct vcpu_runstate_info state;
181 struct vcpu *v;
183 if ( (v = idle_vcpu[cpu]) == NULL )
184 return 0;
186 vcpu_runstate_get(v, &state);
187 return state.time[RUNSTATE_running];
188 }
190 int sched_init_vcpu(struct vcpu *v, unsigned int processor)
191 {
192 struct domain *d = v->domain;
194 /*
195 * Initialize processor and affinity settings. The idler, and potentially
196 * domain-0 VCPUs, are pinned onto their respective physical CPUs.
197 */
198 v->processor = processor;
199 if ( is_idle_domain(d) || d->is_pinned )
200 v->cpu_affinity = cpumask_of_cpu(processor);
201 else
202 cpus_setall(v->cpu_affinity);
204 /* Initialise the per-vcpu timers. */
205 init_timer(&v->periodic_timer, vcpu_periodic_timer_fn,
206 v, v->processor);
207 init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn,
208 v, v->processor);
209 init_timer(&v->poll_timer, poll_timer_fn,
210 v, v->processor);
212 /* Idle VCPUs are scheduled immediately. */
213 if ( is_idle_domain(d) )
214 {
215 per_cpu(schedule_data, v->processor).curr = v;
216 v->is_running = 1;
217 }
219 TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);
221 v->sched_priv = SCHED_OP(DOM2OP(d), alloc_vdata, v, d->sched_priv);
222 if ( v->sched_priv == NULL )
223 return 1;
225 SCHED_OP(VCPU2OP(v), insert_vcpu, v);
227 return 0;
228 }
230 int sched_move_domain(struct domain *d, struct cpupool *c)
231 {
232 struct vcpu *v;
233 unsigned int new_p;
234 void **vcpu_priv;
235 void *domdata;
237 domdata = SCHED_OP(c->sched, alloc_domdata, d);
238 if ( domdata == NULL )
239 return -ENOMEM;
241 vcpu_priv = xmalloc_array(void *, d->max_vcpus);
242 if ( vcpu_priv == NULL )
243 {
244 SCHED_OP(c->sched, free_domdata, domdata);
245 return -ENOMEM;
246 }
248 memset(vcpu_priv, 0, d->max_vcpus * sizeof(void *));
249 for_each_vcpu ( d, v )
250 {
251 vcpu_priv[v->vcpu_id] = SCHED_OP(c->sched, alloc_vdata, v, domdata);
252 if ( vcpu_priv[v->vcpu_id] == NULL )
253 {
254 for_each_vcpu ( d, v )
255 {
256 if ( vcpu_priv[v->vcpu_id] != NULL )
257 xfree(vcpu_priv[v->vcpu_id]);
258 }
259 xfree(vcpu_priv);
260 SCHED_OP(c->sched, free_domdata, domdata);
261 return -ENOMEM;
262 }
263 }
265 domain_pause(d);
267 new_p = first_cpu(c->cpu_valid);
268 for_each_vcpu ( d, v )
269 {
270 migrate_timer(&v->periodic_timer, new_p);
271 migrate_timer(&v->singleshot_timer, new_p);
272 migrate_timer(&v->poll_timer, new_p);
274 SCHED_OP(VCPU2OP(v), remove_vcpu, v);
275 SCHED_OP(VCPU2OP(v), free_vdata, v->sched_priv);
277 cpus_setall(v->cpu_affinity);
278 v->processor = new_p;
279 v->sched_priv = vcpu_priv[v->vcpu_id];
280 evtchn_move_pirqs(v);
282 new_p = cycle_cpu(new_p, c->cpu_valid);
284 SCHED_OP(VCPU2OP(v), insert_vcpu, v);
285 }
286 domain_update_node_affinity(d);
288 d->cpupool = c;
289 SCHED_OP(DOM2OP(d), free_domdata, d->sched_priv);
290 d->sched_priv = domdata;
292 domain_unpause(d);
294 xfree(vcpu_priv);
296 return 0;
297 }
299 void sched_destroy_vcpu(struct vcpu *v)
300 {
301 kill_timer(&v->periodic_timer);
302 kill_timer(&v->singleshot_timer);
303 kill_timer(&v->poll_timer);
304 if ( test_and_clear_bool(v->is_urgent) )
305 atomic_dec(&per_cpu(schedule_data, v->processor).urgent_count);
306 SCHED_OP(VCPU2OP(v), remove_vcpu, v);
307 SCHED_OP(VCPU2OP(v), free_vdata, v->sched_priv);
308 }
310 int sched_init_domain(struct domain *d)
311 {
312 return SCHED_OP(DOM2OP(d), init_domain, d);
313 }
315 void sched_destroy_domain(struct domain *d)
316 {
317 SCHED_OP(DOM2OP(d), destroy_domain, d);
318 }
320 void vcpu_sleep_nosync(struct vcpu *v)
321 {
322 unsigned long flags;
324 vcpu_schedule_lock_irqsave(v, flags);
326 if ( likely(!vcpu_runnable(v)) )
327 {
328 if ( v->runstate.state == RUNSTATE_runnable )
329 vcpu_runstate_change(v, RUNSTATE_offline, NOW());
331 SCHED_OP(VCPU2OP(v), sleep, v);
332 }
334 vcpu_schedule_unlock_irqrestore(v, flags);
336 TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
337 }
339 void vcpu_sleep_sync(struct vcpu *v)
340 {
341 vcpu_sleep_nosync(v);
343 while ( !vcpu_runnable(v) && v->is_running )
344 cpu_relax();
346 sync_vcpu_execstate(v);
347 }
349 void vcpu_wake(struct vcpu *v)
350 {
351 unsigned long flags;
353 vcpu_schedule_lock_irqsave(v, flags);
355 if ( likely(vcpu_runnable(v)) )
356 {
357 if ( v->runstate.state >= RUNSTATE_blocked )
358 vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
359 SCHED_OP(VCPU2OP(v), wake, v);
360 }
361 else if ( !test_bit(_VPF_blocked, &v->pause_flags) )
362 {
363 if ( v->runstate.state == RUNSTATE_blocked )
364 vcpu_runstate_change(v, RUNSTATE_offline, NOW());
365 }
367 vcpu_schedule_unlock_irqrestore(v, flags);
369 TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
370 }
372 void vcpu_unblock(struct vcpu *v)
373 {
374 if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
375 return;
377 /* Polling period ends when a VCPU is unblocked. */
378 if ( unlikely(v->poll_evtchn != 0) )
379 {
380 v->poll_evtchn = 0;
381 /*
382 * We *must* re-clear _VPF_blocked to avoid racing other wakeups of
383 * this VCPU (and it then going back to sleep on poll_mask).
384 * Test-and-clear is idiomatic and ensures clear_bit is not reordered.
385 */
386 if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
387 clear_bit(_VPF_blocked, &v->pause_flags);
388 }
390 vcpu_wake(v);
391 }
393 static void vcpu_migrate(struct vcpu *v)
394 {
395 unsigned long flags;
396 int old_cpu, new_cpu;
398 vcpu_schedule_lock_irqsave(v, flags);
400 /*
401 * NB. The check of v->is_running happens /after/ setting the migration flag
402 * because they both happen in (different) spinlock regions, and those
403 * regions are strictly serialised.
404 */
405 if ( v->is_running ||
406 !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
407 {
408 vcpu_schedule_unlock_irqrestore(v, flags);
409 return;
410 }
412 /* Select new CPU. */
413 old_cpu = v->processor;
414 new_cpu = SCHED_OP(VCPU2OP(v), pick_cpu, v);
416 /*
417 * Transfer urgency status to new CPU before switching CPUs, as once
418 * the switch occurs, v->is_urgent is no longer protected by the per-CPU
419 * scheduler lock we are holding.
420 */
421 if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) )
422 {
423 atomic_inc(&per_cpu(schedule_data, new_cpu).urgent_count);
424 atomic_dec(&per_cpu(schedule_data, old_cpu).urgent_count);
425 }
427 /* Switch to new CPU, then unlock old CPU. This is safe because
428 * the lock pointer can't change while the current lock is held. */
429 v->processor = new_cpu;
430 spin_unlock_irqrestore(
431 per_cpu(schedule_data, old_cpu).schedule_lock, flags);
433 if ( old_cpu != new_cpu )
434 evtchn_move_pirqs(v);
436 /* Wake on new CPU. */
437 vcpu_wake(v);
438 }
440 /*
441 * Force a VCPU through a deschedule/reschedule path.
442 * For example, using this when setting the periodic timer period means that
443 * most periodic-timer state need only be touched from within the scheduler
444 * which can thus be done without need for synchronisation.
445 */
446 void vcpu_force_reschedule(struct vcpu *v)
447 {
448 vcpu_schedule_lock_irq(v);
449 if ( v->is_running )
450 set_bit(_VPF_migrating, &v->pause_flags);
451 vcpu_schedule_unlock_irq(v);
453 if ( test_bit(_VPF_migrating, &v->pause_flags) )
454 {
455 vcpu_sleep_nosync(v);
456 vcpu_migrate(v);
457 }
458 }
460 /*
461 * This function is used by cpu_hotplug code from stop_machine context
462 * and from cpupools to switch schedulers on a cpu.
463 */
464 int cpu_disable_scheduler(unsigned int cpu)
465 {
466 struct domain *d;
467 struct vcpu *v;
468 struct cpupool *c;
469 int ret = 0;
470 bool_t affinity_broken;
472 c = per_cpu(cpupool, cpu);
473 if ( c == NULL )
474 return ret;
476 for_each_domain ( d )
477 {
478 if ( d->cpupool != c )
479 continue;
481 affinity_broken = 0;
483 for_each_vcpu ( d, v )
484 {
485 vcpu_schedule_lock_irq(v);
487 if ( (cpus_weight(v->cpu_affinity) == 1) &&
488 cpu_isset(cpu, v->cpu_affinity) )
489 {
490 printk("Breaking vcpu affinity for domain %d vcpu %d\n",
491 v->domain->domain_id, v->vcpu_id);
492 cpus_setall(v->cpu_affinity);
493 affinity_broken = 1;
494 }
496 if ( v->processor == cpu )
497 {
498 set_bit(_VPF_migrating, &v->pause_flags);
499 vcpu_schedule_unlock_irq(v);
500 vcpu_sleep_nosync(v);
501 vcpu_migrate(v);
502 }
503 else
504 {
505 vcpu_schedule_unlock_irq(v);
506 }
508 /*
509 * A vcpu active in the hypervisor will not be migratable.
510 * The caller should try again after releasing and reacquiring
511 * all locks.
512 */
513 if ( v->processor == cpu )
514 ret = -EAGAIN;
515 }
517 if ( affinity_broken )
518 domain_update_node_affinity(d);
519 }
521 return ret;
522 }
524 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
525 {
526 cpumask_t online_affinity, old_affinity;
527 cpumask_t *online;
529 if ( v->domain->is_pinned )
530 return -EINVAL;
531 online = VCPU2ONLINE(v);
532 cpus_and(online_affinity, *affinity, *online);
533 if ( cpus_empty(online_affinity) )
534 return -EINVAL;
536 vcpu_schedule_lock_irq(v);
538 old_affinity = v->cpu_affinity;
539 v->cpu_affinity = *affinity;
540 *affinity = old_affinity;
541 if ( !cpu_isset(v->processor, v->cpu_affinity) )
542 set_bit(_VPF_migrating, &v->pause_flags);
544 vcpu_schedule_unlock_irq(v);
546 domain_update_node_affinity(v->domain);
548 if ( test_bit(_VPF_migrating, &v->pause_flags) )
549 {
550 vcpu_sleep_nosync(v);
551 vcpu_migrate(v);
552 }
554 return 0;
555 }
557 /* Block the currently-executing domain until a pertinent event occurs. */
558 static long do_block(void)
559 {
560 struct vcpu *v = current;
562 local_event_delivery_enable();
563 set_bit(_VPF_blocked, &v->pause_flags);
565 /* Check for events /after/ blocking: avoids wakeup waiting race. */
566 if ( local_events_need_delivery() )
567 {
568 clear_bit(_VPF_blocked, &v->pause_flags);
569 }
570 else
571 {
572 TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
573 raise_softirq(SCHEDULE_SOFTIRQ);
574 }
576 return 0;
577 }
579 static long do_poll(struct sched_poll *sched_poll)
580 {
581 struct vcpu *v = current;
582 struct domain *d = v->domain;
583 evtchn_port_t port;
584 long rc;
585 unsigned int i;
587 /* Fairly arbitrary limit. */
588 if ( sched_poll->nr_ports > 128 )
589 return -EINVAL;
591 if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
592 return -EFAULT;
594 set_bit(_VPF_blocked, &v->pause_flags);
595 v->poll_evtchn = -1;
596 set_bit(v->vcpu_id, d->poll_mask);
598 #ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
599 /* Check for events /after/ setting flags: avoids wakeup waiting race. */
600 smp_mb();
602 /*
603 * Someone may have seen we are blocked but not that we are polling, or
604 * vice versa. We are certainly being woken, so clean up and bail. Beyond
605 * this point others can be guaranteed to clean up for us if they wake us.
606 */
607 rc = 0;
608 if ( (v->poll_evtchn == 0) ||
609 !test_bit(_VPF_blocked, &v->pause_flags) ||
610 !test_bit(v->vcpu_id, d->poll_mask) )
611 goto out;
612 #endif
614 rc = 0;
615 if ( local_events_need_delivery() )
616 goto out;
618 for ( i = 0; i < sched_poll->nr_ports; i++ )
619 {
620 rc = -EFAULT;
621 if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
622 goto out;
624 rc = -EINVAL;
625 if ( port >= MAX_EVTCHNS(d) )
626 goto out;
628 rc = 0;
629 if ( test_bit(port, &shared_info(d, evtchn_pending)) )
630 goto out;
631 }
633 if ( sched_poll->nr_ports == 1 )
634 v->poll_evtchn = port;
636 if ( sched_poll->timeout != 0 )
637 set_timer(&v->poll_timer, sched_poll->timeout);
639 TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
640 raise_softirq(SCHEDULE_SOFTIRQ);
642 return 0;
644 out:
645 v->poll_evtchn = 0;
646 clear_bit(v->vcpu_id, d->poll_mask);
647 clear_bit(_VPF_blocked, &v->pause_flags);
648 return rc;
649 }
651 /* Voluntarily yield the processor for this allocation. */
652 static long do_yield(void)
653 {
654 struct vcpu * v=current;
656 vcpu_schedule_lock_irq(v);
657 SCHED_OP(VCPU2OP(v), yield, v);
658 vcpu_schedule_unlock_irq(v);
660 TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
661 raise_softirq(SCHEDULE_SOFTIRQ);
662 return 0;
663 }
665 static void domain_watchdog_timeout(void *data)
666 {
667 struct domain *d = data;
669 if ( d->is_shutting_down || d->is_dying )
670 return;
672 printk("Watchdog timer fired for domain %u\n", d->domain_id);
673 domain_shutdown(d, SHUTDOWN_watchdog);
674 }
676 static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout)
677 {
678 if ( id > NR_DOMAIN_WATCHDOG_TIMERS )
679 return -EINVAL;
681 spin_lock(&d->watchdog_lock);
683 if ( id == 0 )
684 {
685 for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ )
686 {
687 if ( test_and_set_bit(id, &d->watchdog_inuse_map) )
688 continue;
689 set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
690 break;
691 }
692 spin_unlock(&d->watchdog_lock);
693 return id == NR_DOMAIN_WATCHDOG_TIMERS ? -EEXIST : id + 1;
694 }
696 id -= 1;
697 if ( !test_bit(id, &d->watchdog_inuse_map) )
698 {
699 spin_unlock(&d->watchdog_lock);
700 return -EEXIST;
701 }
703 if ( timeout == 0 )
704 {
705 stop_timer(&d->watchdog_timer[id]);
706 clear_bit(id, &d->watchdog_inuse_map);
707 }
708 else
709 {
710 set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
711 }
713 spin_unlock(&d->watchdog_lock);
714 return 0;
715 }
717 void watchdog_domain_init(struct domain *d)
718 {
719 unsigned int i;
721 spin_lock_init(&d->watchdog_lock);
723 d->watchdog_inuse_map = 0;
725 for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
726 init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0);
727 }
729 void watchdog_domain_destroy(struct domain *d)
730 {
731 unsigned int i;
733 for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
734 kill_timer(&d->watchdog_timer[i]);
735 }
737 long do_sched_op_compat(int cmd, unsigned long arg)
738 {
739 long ret = 0;
741 switch ( cmd )
742 {
743 case SCHEDOP_yield:
744 {
745 ret = do_yield();
746 break;
747 }
749 case SCHEDOP_block:
750 {
751 ret = do_block();
752 break;
753 }
755 case SCHEDOP_shutdown:
756 {
757 TRACE_3D(TRC_SCHED_SHUTDOWN,
758 current->domain->domain_id, current->vcpu_id, arg);
759 domain_shutdown(current->domain, (u8)arg);
760 break;
761 }
763 default:
764 ret = -ENOSYS;
765 }
767 return ret;
768 }
770 typedef long ret_t;
772 #endif /* !COMPAT */
774 ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE(void) arg)
775 {
776 ret_t ret = 0;
778 switch ( cmd )
779 {
780 case SCHEDOP_yield:
781 {
782 ret = do_yield();
783 break;
784 }
786 case SCHEDOP_block:
787 {
788 ret = do_block();
789 break;
790 }
792 case SCHEDOP_shutdown:
793 {
794 struct sched_shutdown sched_shutdown;
796 ret = -EFAULT;
797 if ( copy_from_guest(&sched_shutdown, arg, 1) )
798 break;
800 ret = 0;
801 TRACE_3D(TRC_SCHED_SHUTDOWN,
802 current->domain->domain_id, current->vcpu_id,
803 sched_shutdown.reason);
804 domain_shutdown(current->domain, (u8)sched_shutdown.reason);
806 break;
807 }
809 case SCHEDOP_shutdown_code:
810 {
811 struct sched_shutdown sched_shutdown;
812 struct domain *d = current->domain;
814 ret = -EFAULT;
815 if ( copy_from_guest(&sched_shutdown, arg, 1) )
816 break;
818 TRACE_3D(TRC_SCHED_SHUTDOWN_CODE,
819 d->domain_id, current->vcpu_id, sched_shutdown.reason);
821 spin_lock(&d->shutdown_lock);
822 if ( d->shutdown_code == -1 )
823 d->shutdown_code = (u8)sched_shutdown.reason;
824 spin_unlock(&d->shutdown_lock);
826 ret = 0;
827 break;
828 }
830 case SCHEDOP_poll:
831 {
832 struct sched_poll sched_poll;
834 ret = -EFAULT;
835 if ( copy_from_guest(&sched_poll, arg, 1) )
836 break;
838 ret = do_poll(&sched_poll);
840 break;
841 }
843 case SCHEDOP_remote_shutdown:
844 {
845 struct domain *d;
846 struct sched_remote_shutdown sched_remote_shutdown;
848 ret = -EFAULT;
849 if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
850 break;
852 ret = -ESRCH;
853 d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
854 if ( d == NULL )
855 break;
857 if ( !IS_PRIV_FOR(current->domain, d) )
858 {
859 rcu_unlock_domain(d);
860 return -EPERM;
861 }
863 ret = xsm_schedop_shutdown(current->domain, d);
864 if ( ret )
865 {
866 rcu_unlock_domain(d);
867 return ret;
868 }
870 domain_shutdown(d, (u8)sched_remote_shutdown.reason);
872 rcu_unlock_domain(d);
873 ret = 0;
875 break;
876 }
878 case SCHEDOP_watchdog:
879 {
880 struct sched_watchdog sched_watchdog;
882 ret = -EFAULT;
883 if ( copy_from_guest(&sched_watchdog, arg, 1) )
884 break;
886 ret = domain_watchdog(
887 current->domain, sched_watchdog.id, sched_watchdog.timeout);
888 break;
889 }
891 default:
892 ret = -ENOSYS;
893 }
895 return ret;
896 }
898 #ifndef COMPAT
900 /* Per-vcpu oneshot-timer hypercall. */
901 long do_set_timer_op(s_time_t timeout)
902 {
903 struct vcpu *v = current;
904 s_time_t offset = timeout - NOW();
906 if ( timeout == 0 )
907 {
908 stop_timer(&v->singleshot_timer);
909 }
910 else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
911 unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
912 {
913 /*
914 * Linux workaround: occasionally we will see timeouts a long way in
915 * the future due to wrapping in Linux's jiffy time handling. We check
916 * for timeouts wrapped negative, and for positive timeouts more than
917 * about 13 days in the future (2^50ns). The correct fix is to trigger
918 * an interrupt immediately (since Linux in fact has pending work to
919 * do in this situation). However, older guests also set a long timeout
920 * when they have *no* pending timers at all: setting an immediate
921 * timeout in this case can burn a lot of CPU. We therefore go for a
922 * reasonable middle ground of triggering a timer event in 100ms.
923 */
924 gdprintk(XENLOG_INFO,
925 "Warning: huge timeout set by vcpu %d: %"PRIx64"\n",
926 v->vcpu_id, (uint64_t)timeout);
927 set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
928 }
929 else
930 {
931 migrate_timer(&v->singleshot_timer, smp_processor_id());
932 set_timer(&v->singleshot_timer, timeout);
933 }
935 return 0;
936 }
938 /* sched_id - fetch ID of current scheduler */
939 int sched_id(void)
940 {
941 return ops.sched_id;
942 }
944 /* Adjust scheduling parameter for a given domain. */
945 long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
946 {
947 struct vcpu *v;
948 long ret;
950 if ( (op->sched_id != DOM2OP(d)->sched_id) ||
951 ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
952 (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
953 return -EINVAL;
955 /*
956 * Most VCPUs we can simply pause. If we are adjusting this VCPU then
957 * we acquire the local schedule_lock to guard against concurrent updates.
958 *
959 * We only acquire the local schedule lock after we have paused all other
960 * VCPUs in this domain. There are two reasons for this:
961 * 1- We don't want to hold up interrupts as pausing a VCPU can
962 * trigger a tlb shootdown.
963 * 2- Pausing other VCPUs involves briefly locking the schedule
964 * lock of the CPU they are running on. This CPU could be the
965 * same as ours.
966 */
968 for_each_vcpu ( d, v )
969 {
970 if ( v != current )
971 vcpu_pause(v);
972 }
974 if ( d == current->domain )
975 vcpu_schedule_lock_irq(current);
977 if ( (ret = SCHED_OP(DOM2OP(d), adjust, d, op)) == 0 )
978 TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
980 if ( d == current->domain )
981 vcpu_schedule_unlock_irq(current);
983 for_each_vcpu ( d, v )
984 {
985 if ( v != current )
986 vcpu_unpause(v);
987 }
989 return ret;
990 }
992 long sched_adjust_global(struct xen_sysctl_scheduler_op *op)
993 {
994 struct cpupool *pool;
995 int rc;
997 if ( (op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
998 (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo) )
999 return -EINVAL;
1001 pool = cpupool_get_by_id(op->cpupool_id);
1002 if ( pool == NULL )
1003 return -ESRCH;
1005 rc = ((op->sched_id == pool->sched->sched_id)
1006 ? SCHED_OP(pool->sched, adjust_global, op) : -EINVAL);
1008 cpupool_put(pool);
1010 return rc;
1011 }
1013 static void vcpu_periodic_timer_work(struct vcpu *v)
1014 {
1015 s_time_t now = NOW();
1016 s_time_t periodic_next_event;
1018 if ( v->periodic_period == 0 )
1019 return;
1021 periodic_next_event = v->periodic_last_event + v->periodic_period;
1023 if ( now >= periodic_next_event )
1024 {
1025 send_timer_event(v);
1026 v->periodic_last_event = now;
1027 periodic_next_event = now + v->periodic_period;
1028 }
1030 migrate_timer(&v->periodic_timer, smp_processor_id());
1031 set_timer(&v->periodic_timer, periodic_next_event);
1032 }
1034 /*
1035 * The main function
1036 * - deschedule the current domain (scheduler independent).
1037 * - pick a new domain (scheduler dependent).
1038 */
1039 static void schedule(void)
1040 {
1041 struct vcpu *prev = current, *next = NULL;
1042 s_time_t now = NOW();
1043 struct scheduler *sched = this_cpu(scheduler);
1044 unsigned long *tasklet_work = &this_cpu(tasklet_work_to_do);
1045 bool_t tasklet_work_scheduled = 0;
1046 struct schedule_data *sd;
1047 struct task_slice next_slice;
1049 ASSERT(!in_atomic());
1051 perfc_incr(sched_run);
1053 sd = &this_cpu(schedule_data);
1055 /* Update tasklet scheduling status. */
1056 switch ( *tasklet_work )
1057 {
1058 case TASKLET_enqueued:
1059 set_bit(_TASKLET_scheduled, tasklet_work);
1060 case TASKLET_enqueued|TASKLET_scheduled:
1061 tasklet_work_scheduled = 1;
1062 break;
1063 case TASKLET_scheduled:
1064 clear_bit(_TASKLET_scheduled, tasklet_work);
1065 case 0:
1066 /*tasklet_work_scheduled = 0;*/
1067 break;
1068 default:
1069 BUG();
1070 }
1072 spin_lock_irq(sd->schedule_lock);
1074 stop_timer(&sd->s_timer);
1076 /* get policy-specific decision on scheduling... */
1077 next_slice = sched->do_schedule(sched, now, tasklet_work_scheduled);
1079 next = next_slice.task;
1081 sd->curr = next;
1083 if ( next_slice.time >= 0 ) /* -ve means no limit */
1084 set_timer(&sd->s_timer, now + next_slice.time);
1086 if ( unlikely(prev == next) )
1087 {
1088 spin_unlock_irq(sd->schedule_lock);
1089 trace_continue_running(next);
1090 return continue_running(prev);
1091 }
1093 TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
1094 prev->domain->domain_id,
1095 now - prev->runstate.state_entry_time);
1096 TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
1097 next->domain->domain_id,
1098 (next->runstate.state == RUNSTATE_runnable) ?
1099 (now - next->runstate.state_entry_time) : 0,
1100 next_slice.time);
1102 ASSERT(prev->runstate.state == RUNSTATE_running);
1104 TRACE_4D(TRC_SCHED_SWITCH,
1105 prev->domain->domain_id, prev->vcpu_id,
1106 next->domain->domain_id, next->vcpu_id);
1108 vcpu_runstate_change(
1109 prev,
1110 (test_bit(_VPF_blocked, &prev->pause_flags) ? RUNSTATE_blocked :
1111 (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
1112 now);
1113 prev->last_run_time = now;
1115 ASSERT(next->runstate.state != RUNSTATE_running);
1116 vcpu_runstate_change(next, RUNSTATE_running, now);
1118 /*
1119 * NB. Don't add any trace records from here until the actual context
1120 * switch, else lost_records resume will not work properly.
1121 */
1123 ASSERT(!next->is_running);
1124 next->is_running = 1;
1126 spin_unlock_irq(sd->schedule_lock);
1128 perfc_incr(sched_ctx);
1130 stop_timer(&prev->periodic_timer);
1132 if ( next_slice.migrated )
1133 evtchn_move_pirqs(next);
1135 /* Ensure that the domain has an up-to-date time base. */
1136 update_vcpu_system_time(next);
1137 vcpu_periodic_timer_work(next);
1139 context_switch(prev, next);
1140 }
1142 void context_saved(struct vcpu *prev)
1143 {
1144 /* Clear running flag /after/ writing context to memory. */
1145 smp_wmb();
1147 prev->is_running = 0;
1149 /* Check for migration request /after/ clearing running flag. */
1150 smp_mb();
1152 SCHED_OP(VCPU2OP(prev), context_saved, prev);
1154 if ( unlikely(test_bit(_VPF_migrating, &prev->pause_flags)) )
1155 vcpu_migrate(prev);
1156 }
1158 /* The scheduler timer: force a run through the scheduler */
1159 static void s_timer_fn(void *unused)
1160 {
1161 raise_softirq(SCHEDULE_SOFTIRQ);
1162 perfc_incr(sched_irq);
1163 }
1165 /* Per-VCPU periodic timer function: sends a virtual timer interrupt. */
1166 static void vcpu_periodic_timer_fn(void *data)
1167 {
1168 struct vcpu *v = data;
1169 vcpu_periodic_timer_work(v);
1170 }
1172 /* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */
1173 static void vcpu_singleshot_timer_fn(void *data)
1174 {
1175 struct vcpu *v = data;
1176 send_timer_event(v);
1177 }
1179 /* SCHEDOP_poll timeout callback. */
1180 static void poll_timer_fn(void *data)
1181 {
1182 struct vcpu *v = data;
1184 if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
1185 vcpu_unblock(v);
1186 }
1188 static int cpu_schedule_up(unsigned int cpu)
1189 {
1190 struct schedule_data *sd = &per_cpu(schedule_data, cpu);
1192 per_cpu(scheduler, cpu) = &ops;
1193 spin_lock_init(&sd->_lock);
1194 sd->schedule_lock = &sd->_lock;
1195 sd->curr = idle_vcpu[cpu];
1196 init_timer(&sd->s_timer, s_timer_fn, NULL, cpu);
1197 atomic_set(&sd->urgent_count, 0);
1199 /* Boot CPU is dealt with later in schedule_init(). */
1200 if ( cpu == 0 )
1201 return 0;
1203 if ( idle_vcpu[cpu] == NULL )
1204 alloc_vcpu(idle_vcpu[0]->domain, cpu, cpu);
1205 if ( idle_vcpu[cpu] == NULL )
1206 return -ENOMEM;
1208 if ( (ops.alloc_pdata != NULL) &&
1209 ((sd->sched_priv = ops.alloc_pdata(&ops, cpu)) == NULL) )
1210 return -ENOMEM;
1212 return 0;
1213 }
1215 static void cpu_schedule_down(unsigned int cpu)
1216 {
1217 struct schedule_data *sd = &per_cpu(schedule_data, cpu);
1219 if ( sd->sched_priv != NULL )
1220 SCHED_OP(&ops, free_pdata, sd->sched_priv, cpu);
1222 kill_timer(&sd->s_timer);
1223 }
1225 static int cpu_schedule_callback(
1226 struct notifier_block *nfb, unsigned long action, void *hcpu)
1227 {
1228 unsigned int cpu = (unsigned long)hcpu;
1229 int rc = 0;
1231 switch ( action )
1232 {
1233 case CPU_UP_PREPARE:
1234 rc = cpu_schedule_up(cpu);
1235 break;
1236 case CPU_UP_CANCELED:
1237 case CPU_DEAD:
1238 cpu_schedule_down(cpu);
1239 break;
1240 default:
1241 break;
1242 }
1244 return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
1245 }
1247 static struct notifier_block cpu_schedule_nfb = {
1248 .notifier_call = cpu_schedule_callback
1249 };
1251 /* Initialise the data structures. */
1252 void __init scheduler_init(void)
1253 {
1254 struct domain *idle_domain;
1255 int i;
1257 open_softirq(SCHEDULE_SOFTIRQ, schedule);
1259 for ( i = 0; schedulers[i] != NULL; i++ )
1260 {
1261 ops = *schedulers[i];
1262 if ( strcmp(ops.opt_name, opt_sched) == 0 )
1263 break;
1264 }
1266 if ( schedulers[i] == NULL )
1267 {
1268 printk("Could not find scheduler: %s\n", opt_sched);
1269 ops = *schedulers[0];
1270 }
1272 if ( cpu_schedule_up(0) )
1273 BUG();
1274 register_cpu_notifier(&cpu_schedule_nfb);
1276 printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
1277 if ( SCHED_OP(&ops, init) )
1278 panic("scheduler returned error on init\n");
1280 idle_domain = domain_create(DOMID_IDLE, 0, 0);
1281 BUG_ON(idle_domain == NULL);
1282 idle_domain->vcpu = idle_vcpu;
1283 idle_domain->max_vcpus = NR_CPUS;
1284 if ( alloc_vcpu(idle_domain, 0, 0) == NULL )
1285 BUG();
1286 if ( ops.alloc_pdata &&
1287 !(this_cpu(schedule_data).sched_priv = ops.alloc_pdata(&ops, 0)) )
1288 BUG();
1289 }
1291 void schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
1292 {
1293 unsigned long flags;
1294 struct vcpu *idle;
1295 void *ppriv, *ppriv_old, *vpriv, *vpriv_old;
1296 struct scheduler *old_ops = per_cpu(scheduler, cpu);
1297 struct scheduler *new_ops = (c == NULL) ? &ops : c->sched;
1299 if ( old_ops == new_ops )
1300 return;
1302 idle = idle_vcpu[cpu];
1303 ppriv = SCHED_OP(new_ops, alloc_pdata, cpu);
1304 vpriv = SCHED_OP(new_ops, alloc_vdata, idle, idle->domain->sched_priv);
1306 pcpu_schedule_lock_irqsave(cpu, flags);
1308 SCHED_OP(old_ops, tick_suspend, cpu);
1309 vpriv_old = idle->sched_priv;
1310 idle->sched_priv = vpriv;
1311 per_cpu(scheduler, cpu) = new_ops;
1312 ppriv_old = per_cpu(schedule_data, cpu).sched_priv;
1313 per_cpu(schedule_data, cpu).sched_priv = ppriv;
1314 SCHED_OP(new_ops, tick_resume, cpu);
1315 SCHED_OP(new_ops, insert_vcpu, idle);
1317 pcpu_schedule_unlock_irqrestore(cpu, flags);
1319 SCHED_OP(old_ops, free_vdata, vpriv_old);
1320 SCHED_OP(old_ops, free_pdata, ppriv_old, cpu);
1321 }
1323 struct scheduler *scheduler_get_default(void)
1324 {
1325 return &ops;
1326 }
1328 struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr)
1329 {
1330 int i;
1331 struct scheduler *sched;
1333 for ( i = 0; schedulers[i] != NULL; i++ )
1334 if ( schedulers[i]->sched_id == sched_id )
1335 goto found;
1336 *perr = -ENOENT;
1337 return NULL;
1339 found:
1340 *perr = -ENOMEM;
1341 if ( (sched = xmalloc(struct scheduler)) == NULL )
1342 return NULL;
1343 memcpy(sched, schedulers[i], sizeof(*sched));
1344 if ( (*perr = SCHED_OP(sched, init)) != 0 )
1345 {
1346 xfree(sched);
1347 sched = NULL;
1348 }
1350 return sched;
1351 }
1353 void scheduler_free(struct scheduler *sched)
1354 {
1355 BUG_ON(sched == &ops);
1356 SCHED_OP(sched, deinit);
1357 xfree(sched);
1358 }
1360 void schedule_dump(struct cpupool *c)
1361 {
1362 int i;
1363 struct scheduler *sched;
1364 cpumask_t *cpus;
1366 sched = (c == NULL) ? &ops : c->sched;
1367 cpus = (c == NULL) ? &cpupool_free_cpus : &c->cpu_valid;
1368 printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
1369 SCHED_OP(sched, dump_settings);
1371 for_each_cpu_mask (i, *cpus)
1372 {
1373 pcpu_schedule_lock(i);
1374 printk("CPU[%02d] ", i);
1375 SCHED_OP(sched, dump_cpu_state, i);
1376 pcpu_schedule_unlock(i);
1377 }
1378 }
1380 void sched_tick_suspend(void)
1381 {
1382 struct scheduler *sched;
1383 unsigned int cpu = smp_processor_id();
1385 sched = per_cpu(scheduler, cpu);
1386 SCHED_OP(sched, tick_suspend, cpu);
1387 }
1389 void sched_tick_resume(void)
1390 {
1391 struct scheduler *sched;
1392 unsigned int cpu = smp_processor_id();
1394 sched = per_cpu(scheduler, cpu);
1395 SCHED_OP(sched, tick_resume, cpu);
1396 }
1398 void wait(void)
1399 {
1400 schedule();
1401 }
1403 #ifdef CONFIG_COMPAT
1404 #include "compat/schedule.c"
1405 #endif
1407 #endif /* !COMPAT */
1409 /*
1410 * Local variables:
1411 * mode: C
1412 * c-set-style: "BSD"
1413 * c-basic-offset: 4
1414 * tab-width: 4
1415 * indent-tabs-mode: nil
1416 * End:
1417 */