
view xen/common/schedule.c @ 21984:07ea977397e0

cpupool: correct removing cpu from cpupool

Corrects an error introduced with cs 21422.
Without the patch my machine crashed when removing a cpu from a
cpupool other than Pool-0.

Signed-off-by: Juergen Gross <juergen.gross@ts.fujitsu.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Aug 09 16:39:09 2010 +0100 (2010-08-09)
parents 93074767205e
children 62a44418d8a0
1 /****************************************************************************
2 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
3 * (C) 2002-2003 University of Cambridge
4 * (C) 2004 - Mark Williamson - Intel Research Cambridge
5 ****************************************************************************
6 *
7 * File: common/schedule.c
8 * Author: Rolf Neugebauer & Keir Fraser
9 * Updated for generic API by Mark Williamson
10 *
11 * Description: Generic CPU scheduling code
12 * implements support functionality for the Xen scheduler API.
13 *
14 */
16 #ifndef COMPAT
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/sched.h>
21 #include <xen/domain.h>
22 #include <xen/delay.h>
23 #include <xen/event.h>
24 #include <xen/time.h>
25 #include <xen/timer.h>
26 #include <xen/perfc.h>
27 #include <xen/sched-if.h>
28 #include <xen/softirq.h>
29 #include <xen/trace.h>
30 #include <xen/mm.h>
31 #include <xen/errno.h>
32 #include <xen/guest_access.h>
33 #include <xen/multicall.h>
34 #include <xen/cpu.h>
35 #include <public/sched.h>
36 #include <xsm/xsm.h>
38 /* opt_sched: scheduler - default to credit */
39 static char __initdata opt_sched[10] = "credit";
40 string_param("sched", opt_sched);
42 /* If sched_smt_power_savings is set, the
43 * scheduler will give preference to a partially idle package over
44 * a fully idle package when picking a pCPU to schedule a vCPU.
45 */
46 int sched_smt_power_savings = 0;
47 boolean_param("sched_smt_power_savings", sched_smt_power_savings);
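/*
 * Usage sketch: both options above are parsed from the hypervisor command
 * line at boot; for example a (hypothetical) grub entry along the lines of
 *     multiboot /boot/xen.gz sched=credit2 sched_smt_power_savings=1
 * would select the credit2 scheduler registered below and bias vCPU
 * placement towards partially idle packages. Exact bootloader syntax varies.
 */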
49 /* Various timer handlers. */
50 static void s_timer_fn(void *unused);
51 static void vcpu_periodic_timer_fn(void *data);
52 static void vcpu_singleshot_timer_fn(void *data);
53 static void poll_timer_fn(void *data);
55 /* This is global for now so that private implementations can reach it */
56 DEFINE_PER_CPU(struct schedule_data, schedule_data);
57 DEFINE_PER_CPU(struct scheduler *, scheduler);
59 extern const struct scheduler sched_sedf_def;
60 extern const struct scheduler sched_credit_def;
61 extern const struct scheduler sched_credit2_def;
62 static const struct scheduler *schedulers[] = {
63 &sched_sedf_def,
64 &sched_credit_def,
65 &sched_credit2_def,
66 NULL
67 };
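/*
 * Sketch of how an additional scheduler would be hooked in (the name below
 * is hypothetical):
 *     extern const struct scheduler sched_myplugin_def;
 * plus an &sched_myplugin_def entry in the array above; "sched=<opt_name>"
 * on the command line then selects it in scheduler_init(), and
 * scheduler_alloc() can instantiate it for a cpupool by sched_id.
 */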
69 static struct scheduler __read_mostly ops;
71 #define SCHED_OP(opsptr, fn, ...) \
72 (( (opsptr)->fn != NULL ) ? (opsptr)->fn(opsptr, ##__VA_ARGS__ ) \
73 : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0 )
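/*
 * For illustration, a call such as
 *     SCHED_OP(VCPU2OP(v), wake, v)
 * expands roughly to (with ops standing for the evaluated pointer)
 *     ( ops->wake != NULL ) ? ops->wake(ops, v)
 *                           : (typeof(ops->wake(ops, v)))0
 * so hooks a scheduler chooses not to implement simply yield zero.
 */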
75 #define DOM2OP(_d) (((_d)->cpupool == NULL) ? &ops : ((_d)->cpupool->sched))
76 #define VCPU2OP(_v) (DOM2OP((_v)->domain))
77 #define VCPU2ONLINE(_v) \
78 (((_v)->domain->cpupool == NULL) ? &cpu_online_map \
79 : &(_v)->domain->cpupool->cpu_valid)
81 static inline void trace_runstate_change(struct vcpu *v, int new_state)
82 {
83 struct { uint32_t vcpu:16, domain:16; } d;
84 uint32_t event;
86 if ( likely(!tb_init_done) )
87 return;
89 d.vcpu = v->vcpu_id;
90 d.domain = v->domain->domain_id;
92 event = TRC_SCHED_RUNSTATE_CHANGE;
93 event |= ( v->runstate.state & 0x3 ) << 8;
94 event |= ( new_state & 0x3 ) << 4;
96 __trace_var(event, 1/*tsc*/, sizeof(d), (unsigned char *)&d);
97 }
99 static inline void trace_continue_running(struct vcpu *v)
100 {
101 struct { uint32_t vcpu:16, domain:16; } d;
103 if ( likely(!tb_init_done) )
104 return;
106 d.vcpu = v->vcpu_id;
107 d.domain = v->domain->domain_id;
109 __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d),
110 (unsigned char *)&d);
111 }
113 static inline void vcpu_urgent_count_update(struct vcpu *v)
114 {
115 if ( is_idle_vcpu(v) )
116 return;
118 if ( unlikely(v->is_urgent) )
119 {
120 if ( !test_bit(_VPF_blocked, &v->pause_flags) ||
121 !test_bit(v->vcpu_id, v->domain->poll_mask) )
122 {
123 v->is_urgent = 0;
124 atomic_dec(&per_cpu(schedule_data,v->processor).urgent_count);
125 }
126 }
127 else
128 {
129 if ( unlikely(test_bit(_VPF_blocked, &v->pause_flags) &&
130 test_bit(v->vcpu_id, v->domain->poll_mask)) )
131 {
132 v->is_urgent = 1;
133 atomic_inc(&per_cpu(schedule_data,v->processor).urgent_count);
134 }
135 }
136 }
138 static inline void vcpu_runstate_change(
139 struct vcpu *v, int new_state, s_time_t new_entry_time)
140 {
141 s_time_t delta;
143 ASSERT(v->runstate.state != new_state);
144 ASSERT(spin_is_locked(per_cpu(schedule_data,v->processor).schedule_lock));
146 vcpu_urgent_count_update(v);
148 trace_runstate_change(v, new_state);
150 delta = new_entry_time - v->runstate.state_entry_time;
151 if ( delta > 0 )
152 {
153 v->runstate.time[v->runstate.state] += delta;
154 v->runstate.state_entry_time = new_entry_time;
155 }
157 v->runstate.state = new_state;
158 }
160 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
161 {
162 s_time_t delta;
164 if ( unlikely(v != current) )
165 vcpu_schedule_lock_irq(v);
167 memcpy(runstate, &v->runstate, sizeof(*runstate));
168 delta = NOW() - runstate->state_entry_time;
169 if ( delta > 0 )
170 runstate->time[runstate->state] += delta;
172 if ( unlikely(v != current) )
173 vcpu_schedule_unlock_irq(v);
174 }
176 uint64_t get_cpu_idle_time(unsigned int cpu)
177 {
178 struct vcpu_runstate_info state;
179 struct vcpu *v;
181 if ( (v = idle_vcpu[cpu]) == NULL )
182 return 0;
184 vcpu_runstate_get(v, &state);
185 return state.time[RUNSTATE_running];
186 }
188 int sched_init_vcpu(struct vcpu *v, unsigned int processor)
189 {
190 struct domain *d = v->domain;
192 /*
193 * Initialize processor and affinity settings. The idler, and potentially
194 * domain-0 VCPUs, are pinned onto their respective physical CPUs.
195 */
196 v->processor = processor;
197 if ( is_idle_domain(d) || d->is_pinned )
198 v->cpu_affinity = cpumask_of_cpu(processor);
199 else
200 cpus_setall(v->cpu_affinity);
202 /* Initialise the per-vcpu timers. */
203 init_timer(&v->periodic_timer, vcpu_periodic_timer_fn,
204 v, v->processor);
205 init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn,
206 v, v->processor);
207 init_timer(&v->poll_timer, poll_timer_fn,
208 v, v->processor);
210 /* Idle VCPUs are scheduled immediately. */
211 if ( is_idle_domain(d) )
212 {
213 per_cpu(schedule_data, v->processor).curr = v;
214 v->is_running = 1;
215 }
217 TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);
219 v->sched_priv = SCHED_OP(DOM2OP(d), alloc_vdata, v, d->sched_priv);
220 if ( v->sched_priv == NULL )
221 return 1;
223 return 0;
224 }
226 int sched_move_domain(struct domain *d, struct cpupool *c)
227 {
228 struct vcpu *v;
229 unsigned int new_p;
230 void **vcpu_priv;
231 void *domdata;
233 domdata = SCHED_OP(c->sched, alloc_domdata, d);
234 if ( domdata == NULL )
235 return -ENOMEM;
237 vcpu_priv = xmalloc_array(void *, d->max_vcpus);
238 if ( vcpu_priv == NULL )
239 {
240 SCHED_OP(c->sched, free_domdata, domdata);
241 return -ENOMEM;
242 }
244 memset(vcpu_priv, 0, d->max_vcpus * sizeof(void *));
245 for_each_vcpu ( d, v )
246 {
247 vcpu_priv[v->vcpu_id] = SCHED_OP(c->sched, alloc_vdata, v, domdata);
248 if ( vcpu_priv[v->vcpu_id] == NULL )
249 {
250 for_each_vcpu ( d, v )
251 {
252 if ( vcpu_priv[v->vcpu_id] != NULL )
253 xfree(vcpu_priv[v->vcpu_id]);
254 }
255 xfree(vcpu_priv);
256 SCHED_OP(c->sched, free_domdata, domdata);
257 return -ENOMEM;
258 }
259 }
261 domain_pause(d);
263 new_p = first_cpu(c->cpu_valid);
264 for_each_vcpu ( d, v )
265 {
266 migrate_timer(&v->periodic_timer, new_p);
267 migrate_timer(&v->singleshot_timer, new_p);
268 migrate_timer(&v->poll_timer, new_p);
270 SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
272 cpus_setall(v->cpu_affinity);
273 v->processor = new_p;
274 v->sched_priv = vcpu_priv[v->vcpu_id];
275 evtchn_move_pirqs(v);
277 new_p = cycle_cpu(new_p, c->cpu_valid);
278 }
279 domain_update_node_affinity(d);
281 d->cpupool = c;
282 SCHED_OP(DOM2OP(d), free_domdata, d->sched_priv);
283 d->sched_priv = domdata;
285 domain_unpause(d);
287 xfree(vcpu_priv);
289 return 0;
290 }
292 void sched_destroy_vcpu(struct vcpu *v)
293 {
294 kill_timer(&v->periodic_timer);
295 kill_timer(&v->singleshot_timer);
296 kill_timer(&v->poll_timer);
297 if ( test_and_clear_bool(v->is_urgent) )
298 atomic_dec(&per_cpu(schedule_data, v->processor).urgent_count);
299 SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
300 }
302 int sched_init_domain(struct domain *d)
303 {
304 return SCHED_OP(DOM2OP(d), init_domain, d);
305 }
307 void sched_destroy_domain(struct domain *d)
308 {
309 SCHED_OP(DOM2OP(d), destroy_domain, d);
310 }
312 void vcpu_sleep_nosync(struct vcpu *v)
313 {
314 unsigned long flags;
316 vcpu_schedule_lock_irqsave(v, flags);
318 if ( likely(!vcpu_runnable(v)) )
319 {
320 if ( v->runstate.state == RUNSTATE_runnable )
321 vcpu_runstate_change(v, RUNSTATE_offline, NOW());
323 SCHED_OP(VCPU2OP(v), sleep, v);
324 }
326 vcpu_schedule_unlock_irqrestore(v, flags);
328 TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
329 }
331 void vcpu_sleep_sync(struct vcpu *v)
332 {
333 vcpu_sleep_nosync(v);
335 while ( !vcpu_runnable(v) && v->is_running )
336 cpu_relax();
338 sync_vcpu_execstate(v);
339 }
341 void vcpu_wake(struct vcpu *v)
342 {
343 unsigned long flags;
345 vcpu_schedule_lock_irqsave(v, flags);
347 if ( likely(vcpu_runnable(v)) )
348 {
349 if ( v->runstate.state >= RUNSTATE_blocked )
350 vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
351 SCHED_OP(VCPU2OP(v), wake, v);
352 }
353 else if ( !test_bit(_VPF_blocked, &v->pause_flags) )
354 {
355 if ( v->runstate.state == RUNSTATE_blocked )
356 vcpu_runstate_change(v, RUNSTATE_offline, NOW());
357 }
359 vcpu_schedule_unlock_irqrestore(v, flags);
361 TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
362 }
364 void vcpu_unblock(struct vcpu *v)
365 {
366 if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
367 return;
369 /* Polling period ends when a VCPU is unblocked. */
370 if ( unlikely(v->poll_evtchn != 0) )
371 {
372 v->poll_evtchn = 0;
373 /*
374 * We *must* re-clear _VPF_blocked to avoid racing other wakeups of
375 * this VCPU (and it then going back to sleep on poll_mask).
376 * Test-and-clear is idiomatic and ensures clear_bit is not reordered.
377 */
378 if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
379 clear_bit(_VPF_blocked, &v->pause_flags);
380 }
382 vcpu_wake(v);
383 }
385 static void vcpu_migrate(struct vcpu *v)
386 {
387 unsigned long flags;
388 int old_cpu, new_cpu;
390 vcpu_schedule_lock_irqsave(v, flags);
392 /*
393 * NB. Check of v->running happens /after/ setting migration flag
394 * because they both happen in (different) spinlock regions, and those
395 * regions are strictly serialised.
396 */
397 if ( v->is_running ||
398 !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
399 {
400 vcpu_schedule_unlock_irqrestore(v, flags);
401 return;
402 }
404 /* Select new CPU. */
405 old_cpu = v->processor;
406 new_cpu = SCHED_OP(VCPU2OP(v), pick_cpu, v);
408 /*
409 * Transfer urgency status to new CPU before switching CPUs, as once
410 * the switch occurs, v->is_urgent is no longer protected by the per-CPU
411 * scheduler lock we are holding.
412 */
413 if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) )
414 {
415 atomic_inc(&per_cpu(schedule_data, new_cpu).urgent_count);
416 atomic_dec(&per_cpu(schedule_data, old_cpu).urgent_count);
417 }
419 /* Switch to new CPU, then unlock old CPU. */
420 v->processor = new_cpu;
421 spin_unlock_irqrestore(
422 per_cpu(schedule_data, old_cpu).schedule_lock, flags);
424 if ( old_cpu != new_cpu )
425 evtchn_move_pirqs(v);
427 /* Wake on new CPU. */
428 vcpu_wake(v);
429 }
431 /*
432 * Force a VCPU through a deschedule/reschedule path.
433 * For example, using this when setting the periodic timer period means that
434 * most periodic-timer state need only be touched from within the scheduler
435 * which can thus be done without need for synchronisation.
436 */
437 void vcpu_force_reschedule(struct vcpu *v)
438 {
439 vcpu_schedule_lock_irq(v);
440 if ( v->is_running )
441 set_bit(_VPF_migrating, &v->pause_flags);
442 vcpu_schedule_unlock_irq(v);
444 if ( test_bit(_VPF_migrating, &v->pause_flags) )
445 {
446 vcpu_sleep_nosync(v);
447 vcpu_migrate(v);
448 }
449 }
451 /*
452 * This function is used by cpu_hotplug code from stop_machine context
453 * and from cpupools to switch schedulers on a cpu.
454 */
455 int cpu_disable_scheduler(unsigned int cpu)
456 {
457 struct domain *d;
458 struct vcpu *v;
459 struct cpupool *c;
460 int ret = 0;
461 bool_t affinity_broken;
463 c = per_cpu(cpupool, cpu);
464 if ( c == NULL )
465 return ret;
467 for_each_domain ( d )
468 {
469 if ( d->cpupool != c )
470 continue;
472 affinity_broken = 0;
474 for_each_vcpu ( d, v )
475 {
476 vcpu_schedule_lock_irq(v);
478 if ( (cpus_weight(v->cpu_affinity) == 1) &&
479 cpu_isset(cpu, v->cpu_affinity) )
480 {
481 printk("Breaking vcpu affinity for domain %d vcpu %d\n",
482 v->domain->domain_id, v->vcpu_id);
483 cpus_setall(v->cpu_affinity);
484 affinity_broken = 1;
485 }
487 if ( v->processor == cpu )
488 {
489 set_bit(_VPF_migrating, &v->pause_flags);
490 vcpu_schedule_unlock_irq(v);
491 vcpu_sleep_nosync(v);
492 vcpu_migrate(v);
493 }
494 else
495 {
496 vcpu_schedule_unlock_irq(v);
497 }
499 /*
500 * A vcpu active in the hypervisor will not be migratable.
501 * The caller should try again after releasing and reacquiring
502 * all locks.
503 */
504 if ( v->processor == cpu )
505 ret = -EAGAIN;
506 }
508 if ( affinity_broken )
509 domain_update_node_affinity(d);
510 }
512 return ret;
513 }
515 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
516 {
517 cpumask_t online_affinity, old_affinity;
518 cpumask_t *online;
520 if ( v->domain->is_pinned )
521 return -EINVAL;
522 online = VCPU2ONLINE(v);
523 cpus_and(online_affinity, *affinity, *online);
524 if ( cpus_empty(online_affinity) )
525 return -EINVAL;
527 vcpu_schedule_lock_irq(v);
529 old_affinity = v->cpu_affinity;
530 v->cpu_affinity = *affinity;
531 *affinity = old_affinity;
532 if ( !cpu_isset(v->processor, v->cpu_affinity) )
533 set_bit(_VPF_migrating, &v->pause_flags);
535 vcpu_schedule_unlock_irq(v);
537 domain_update_node_affinity(v->domain);
539 if ( test_bit(_VPF_migrating, &v->pause_flags) )
540 {
541 vcpu_sleep_nosync(v);
542 vcpu_migrate(v);
543 }
545 return 0;
546 }
548 /* Block the currently-executing domain until a pertinent event occurs. */
549 static long do_block(void)
550 {
551 struct vcpu *v = current;
553 local_event_delivery_enable();
554 set_bit(_VPF_blocked, &v->pause_flags);
556 /* Check for events /after/ blocking: avoids wakeup waiting race. */
557 if ( local_events_need_delivery() )
558 {
559 clear_bit(_VPF_blocked, &v->pause_flags);
560 }
561 else
562 {
563 TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
564 raise_softirq(SCHEDULE_SOFTIRQ);
565 }
567 return 0;
568 }
570 static long do_poll(struct sched_poll *sched_poll)
571 {
572 struct vcpu *v = current;
573 struct domain *d = v->domain;
574 evtchn_port_t port;
575 long rc;
576 unsigned int i;
578 /* Fairly arbitrary limit. */
579 if ( sched_poll->nr_ports > 128 )
580 return -EINVAL;
582 if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
583 return -EFAULT;
585 set_bit(_VPF_blocked, &v->pause_flags);
586 v->poll_evtchn = -1;
587 set_bit(v->vcpu_id, d->poll_mask);
589 #ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
590 /* Check for events /after/ setting flags: avoids wakeup waiting race. */
591 smp_mb();
593 /*
594 * Someone may have seen we are blocked but not that we are polling, or
595 * vice versa. We are certainly being woken, so clean up and bail. Beyond
596 * this point others can be guaranteed to clean up for us if they wake us.
597 */
598 rc = 0;
599 if ( (v->poll_evtchn == 0) ||
600 !test_bit(_VPF_blocked, &v->pause_flags) ||
601 !test_bit(v->vcpu_id, d->poll_mask) )
602 goto out;
603 #endif
605 rc = 0;
606 if ( local_events_need_delivery() )
607 goto out;
609 for ( i = 0; i < sched_poll->nr_ports; i++ )
610 {
611 rc = -EFAULT;
612 if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
613 goto out;
615 rc = -EINVAL;
616 if ( port >= MAX_EVTCHNS(d) )
617 goto out;
619 rc = 0;
620 if ( test_bit(port, &shared_info(d, evtchn_pending)) )
621 goto out;
622 }
624 if ( sched_poll->nr_ports == 1 )
625 v->poll_evtchn = port;
627 if ( sched_poll->timeout != 0 )
628 set_timer(&v->poll_timer, sched_poll->timeout);
630 TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
631 raise_softirq(SCHEDULE_SOFTIRQ);
633 return 0;
635 out:
636 v->poll_evtchn = 0;
637 clear_bit(v->vcpu_id, d->poll_mask);
638 clear_bit(_VPF_blocked, &v->pause_flags);
639 return rc;
640 }
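/*
 * Guest-side sketch of the interface handled above (wrapper and helper names
 * are assumptions; the structure layout comes from public/sched.h):
 *
 *     evtchn_port_t port = ...;                     // port being waited on
 *     struct sched_poll poll = { .nr_ports = 1, .timeout = 0 };
 *     set_xen_guest_handle(poll.ports, &port);
 *     (void)HYPERVISOR_sched_op(SCHEDOP_poll, &poll);
 *
 * A zero timeout means "no timeout"; otherwise it is an absolute system
 * time, as it is passed straight to set_timer() above.
 */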
642 /* Voluntarily yield the processor for this allocation. */
643 static long do_yield(void)
644 {
645 struct vcpu * v=current;
647 vcpu_schedule_lock_irq(v);
648 SCHED_OP(VCPU2OP(v), yield, v);
649 vcpu_schedule_unlock_irq(v);
651 TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
652 raise_softirq(SCHEDULE_SOFTIRQ);
653 return 0;
654 }
656 static void domain_watchdog_timeout(void *data)
657 {
658 struct domain *d = data;
660 if ( d->is_shutting_down || d->is_dying )
661 return;
663 printk("Watchdog timer fired for domain %u\n", d->domain_id);
664 domain_shutdown(d, SHUTDOWN_watchdog);
665 }
667 static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout)
668 {
669 if ( id > NR_DOMAIN_WATCHDOG_TIMERS )
670 return -EINVAL;
672 spin_lock(&d->watchdog_lock);
674 if ( id == 0 )
675 {
676 for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ )
677 {
678 if ( test_and_set_bit(id, &d->watchdog_inuse_map) )
679 continue;
680 set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
681 break;
682 }
683 spin_unlock(&d->watchdog_lock);
684 return id == NR_DOMAIN_WATCHDOG_TIMERS ? -EEXIST : id + 1;
685 }
687 id -= 1;
688 if ( !test_bit(id, &d->watchdog_inuse_map) )
689 {
690 spin_unlock(&d->watchdog_lock);
691 return -EEXIST;
692 }
694 if ( timeout == 0 )
695 {
696 stop_timer(&d->watchdog_timer[id]);
697 clear_bit(id, &d->watchdog_inuse_map);
698 }
699 else
700 {
701 set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
702 }
704 spin_unlock(&d->watchdog_lock);
705 return 0;
706 }
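/*
 * Illustrative guest usage of the watchdog interface above (the hypercall
 * wrapper name is an assumption; struct sched_watchdog is from
 * public/sched.h):
 *
 *     struct sched_watchdog wd = { .id = 0, .timeout = 30 };
 *     int id = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wd);  // arm, id >= 1
 *     wd.id = id;
 *     ... re-issue the call before 30s elapse to kick the watchdog ...
 *     wd.timeout = 0;
 *     HYPERVISOR_sched_op(SCHEDOP_watchdog, &wd);           // disarm
 *
 * If the timer fires first, domain_watchdog_timeout() shuts the domain
 * down with SHUTDOWN_watchdog.
 */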
708 void watchdog_domain_init(struct domain *d)
709 {
710 unsigned int i;
712 spin_lock_init(&d->watchdog_lock);
714 d->watchdog_inuse_map = 0;
716 for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
717 init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0);
718 }
720 void watchdog_domain_destroy(struct domain *d)
721 {
722 unsigned int i;
724 for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
725 kill_timer(&d->watchdog_timer[i]);
726 }
728 long do_sched_op_compat(int cmd, unsigned long arg)
729 {
730 long ret = 0;
732 switch ( cmd )
733 {
734 case SCHEDOP_yield:
735 {
736 ret = do_yield();
737 break;
738 }
740 case SCHEDOP_block:
741 {
742 ret = do_block();
743 break;
744 }
746 case SCHEDOP_shutdown:
747 {
748 TRACE_3D(TRC_SCHED_SHUTDOWN,
749 current->domain->domain_id, current->vcpu_id, arg);
750 domain_shutdown(current->domain, (u8)arg);
751 break;
752 }
754 default:
755 ret = -ENOSYS;
756 }
758 return ret;
759 }
761 typedef long ret_t;
763 #endif /* !COMPAT */
765 ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE(void) arg)
766 {
767 ret_t ret = 0;
769 switch ( cmd )
770 {
771 case SCHEDOP_yield:
772 {
773 ret = do_yield();
774 break;
775 }
777 case SCHEDOP_block:
778 {
779 ret = do_block();
780 break;
781 }
783 case SCHEDOP_shutdown:
784 {
785 struct sched_shutdown sched_shutdown;
787 ret = -EFAULT;
788 if ( copy_from_guest(&sched_shutdown, arg, 1) )
789 break;
791 ret = 0;
792 TRACE_3D(TRC_SCHED_SHUTDOWN,
793 current->domain->domain_id, current->vcpu_id,
794 sched_shutdown.reason);
795 domain_shutdown(current->domain, (u8)sched_shutdown.reason);
797 break;
798 }
800 case SCHEDOP_shutdown_code:
801 {
802 struct sched_shutdown sched_shutdown;
803 struct domain *d = current->domain;
805 ret = -EFAULT;
806 if ( copy_from_guest(&sched_shutdown, arg, 1) )
807 break;
809 TRACE_3D(TRC_SCHED_SHUTDOWN_CODE,
810 d->domain_id, current->vcpu_id, sched_shutdown.reason);
812 spin_lock(&d->shutdown_lock);
813 if ( d->shutdown_code == -1 )
814 d->shutdown_code = (u8)sched_shutdown.reason;
815 spin_unlock(&d->shutdown_lock);
817 ret = 0;
818 break;
819 }
821 case SCHEDOP_poll:
822 {
823 struct sched_poll sched_poll;
825 ret = -EFAULT;
826 if ( copy_from_guest(&sched_poll, arg, 1) )
827 break;
829 ret = do_poll(&sched_poll);
831 break;
832 }
834 case SCHEDOP_remote_shutdown:
835 {
836 struct domain *d;
837 struct sched_remote_shutdown sched_remote_shutdown;
839 ret = -EFAULT;
840 if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
841 break;
843 ret = -ESRCH;
844 d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
845 if ( d == NULL )
846 break;
848 if ( !IS_PRIV_FOR(current->domain, d) )
849 {
850 rcu_unlock_domain(d);
851 return -EPERM;
852 }
854 ret = xsm_schedop_shutdown(current->domain, d);
855 if ( ret )
856 {
857 rcu_unlock_domain(d);
858 return ret;
859 }
861 domain_shutdown(d, (u8)sched_remote_shutdown.reason);
863 rcu_unlock_domain(d);
864 ret = 0;
866 break;
867 }
869 case SCHEDOP_watchdog:
870 {
871 struct sched_watchdog sched_watchdog;
873 ret = -EFAULT;
874 if ( copy_from_guest(&sched_watchdog, arg, 1) )
875 break;
877 ret = domain_watchdog(
878 current->domain, sched_watchdog.id, sched_watchdog.timeout);
879 break;
880 }
882 default:
883 ret = -ENOSYS;
884 }
886 return ret;
887 }
889 #ifndef COMPAT
891 /* Per-vcpu oneshot-timer hypercall. */
892 long do_set_timer_op(s_time_t timeout)
893 {
894 struct vcpu *v = current;
895 s_time_t offset = timeout - NOW();
897 if ( timeout == 0 )
898 {
899 stop_timer(&v->singleshot_timer);
900 }
901 else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
902 unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
903 {
904 /*
905 * Linux workaround: occasionally we will see timeouts a long way in
906 * the future due to wrapping in Linux's jiffy time handling. We check
907 * for timeouts wrapped negative, and for positive timeouts more than
908 * about 13 days in the future (2^50ns). The correct fix is to trigger
909 * an interrupt immediately (since Linux in fact has pending work to
910 * do in this situation). However, older guests also set a long timeout
911 * when they have *no* pending timers at all: setting an immediate
912 * timeout in this case can burn a lot of CPU. We therefore go for a
913 * reasonable middle ground of triggering a timer event in 100ms.
914 */
915 gdprintk(XENLOG_INFO,
916 "Warning: huge timeout set by vcpu %d: %"PRIx64"\n",
917 v->vcpu_id, (uint64_t)timeout);
918 set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
919 }
920 else
921 {
922 migrate_timer(&v->singleshot_timer, smp_processor_id());
923 set_timer(&v->singleshot_timer, timeout);
924 }
926 return 0;
927 }
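/*
 * Arithmetic behind the threshold used above: offsets of 2^50 ns or more
 * are treated as bogus; 2^50 ns is about 1.13e15 ns, i.e. roughly 1.13e6
 * seconds, which is a little over 13 days and matches the "about 13 days"
 * figure in the comment.
 */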
929 /* sched_id - fetch ID of current scheduler */
930 int sched_id(void)
931 {
932 return ops.sched_id;
933 }
935 /* Adjust scheduling parameter for a given domain. */
936 long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
937 {
938 struct vcpu *v;
939 long ret;
941 if ( (op->sched_id != DOM2OP(d)->sched_id) ||
942 ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
943 (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
944 return -EINVAL;
946 /*
947 * Most VCPUs we can simply pause. If we are adjusting this VCPU then
948 * we acquire the local schedule_lock to guard against concurrent updates.
949 *
950 * We only acquire the local schedule lock after we have paused all other
951 * VCPUs in this domain. There are two reasons for this:
952 * 1- We don't want to hold up interrupts as pausing a VCPU can
953 * trigger a tlb shootdown.
954 * 2- Pausing other VCPUs involves briefly locking the schedule
955 * lock of the CPU they are running on. This CPU could be the
956 * same as ours.
957 */
959 for_each_vcpu ( d, v )
960 {
961 if ( v != current )
962 vcpu_pause(v);
963 }
965 if ( d == current->domain )
966 vcpu_schedule_lock_irq(current);
968 if ( (ret = SCHED_OP(DOM2OP(d), adjust, d, op)) == 0 )
969 TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
971 if ( d == current->domain )
972 vcpu_schedule_unlock_irq(current);
974 for_each_vcpu ( d, v )
975 {
976 if ( v != current )
977 vcpu_unpause(v);
978 }
980 return ret;
981 }
983 long sched_adjust_global(struct xen_sysctl_scheduler_op *op)
984 {
985 struct cpupool *pool;
986 int rc;
988 if ( (op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
989 (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo) )
990 return -EINVAL;
992 pool = cpupool_get_by_id(op->cpupool_id);
993 if ( pool == NULL )
994 return -ESRCH;
996 if ( op->sched_id != pool->sched->sched_id )
997 {
998 cpupool_put(pool);
999 return -EINVAL;
1000 }
1002 rc = SCHED_OP(pool->sched, adjust_global, op);
1004 cpupool_put(pool);
1006 return rc;
1007 }
1009 static void vcpu_periodic_timer_work(struct vcpu *v)
1010 {
1011 s_time_t now = NOW();
1012 s_time_t periodic_next_event;
1014 if ( v->periodic_period == 0 )
1015 return;
1017 periodic_next_event = v->periodic_last_event + v->periodic_period;
1019 if ( now >= periodic_next_event )
1020 {
1021 send_timer_event(v);
1022 v->periodic_last_event = now;
1023 periodic_next_event = now + v->periodic_period;
1024 }
1026 migrate_timer(&v->periodic_timer, smp_processor_id());
1027 set_timer(&v->periodic_timer, periodic_next_event);
1028 }
1030 /*
1031 * The main function
1032 * - deschedule the current domain (scheduler independent).
1033 * - pick a new domain (scheduler dependent).
1034 */
1035 static void schedule(void)
1036 {
1037 struct vcpu *prev = current, *next = NULL;
1038 s_time_t now = NOW();
1039 struct scheduler *sched = this_cpu(scheduler);
1040 unsigned long *tasklet_work = &this_cpu(tasklet_work_to_do);
1041 bool_t tasklet_work_scheduled = 0;
1042 struct schedule_data *sd;
1043 struct task_slice next_slice;
1045 ASSERT(!in_irq());
1046 ASSERT(this_cpu(mc_state).flags == 0);
1048 perfc_incr(sched_run);
1050 sd = &this_cpu(schedule_data);
1052 /* Update tasklet scheduling status. */
1053 switch ( *tasklet_work )
1054 {
1055 case TASKLET_enqueued:
1056 set_bit(_TASKLET_scheduled, tasklet_work);
1057 case TASKLET_enqueued|TASKLET_scheduled:
1058 tasklet_work_scheduled = 1;
1059 break;
1060 case TASKLET_scheduled:
1061 clear_bit(_TASKLET_scheduled, tasklet_work);
1062 case 0:
1063 /*tasklet_work_scheduled = 0;*/
1064 break;
1065 default:
1066 BUG();
1067 }
1069 spin_lock_irq(sd->schedule_lock);
1071 stop_timer(&sd->s_timer);
1073 /* get policy-specific decision on scheduling... */
1074 next_slice = sched->do_schedule(sched, now, tasklet_work_scheduled);
1076 next = next_slice.task;
1078 sd->curr = next;
1080 if ( next_slice.time >= 0 ) /* -ve means no limit */
1081 set_timer(&sd->s_timer, now + next_slice.time);
1083 if ( unlikely(prev == next) )
1084 {
1085 spin_unlock_irq(sd->schedule_lock);
1086 trace_continue_running(next);
1087 return continue_running(prev);
1088 }
1090 TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
1091 prev->domain->domain_id,
1092 now - prev->runstate.state_entry_time);
1093 TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
1094 next->domain->domain_id,
1095 (next->runstate.state == RUNSTATE_runnable) ?
1096 (now - next->runstate.state_entry_time) : 0,
1097 next_slice.time);
1099 ASSERT(prev->runstate.state == RUNSTATE_running);
1101 TRACE_4D(TRC_SCHED_SWITCH,
1102 prev->domain->domain_id, prev->vcpu_id,
1103 next->domain->domain_id, next->vcpu_id);
1105 vcpu_runstate_change(
1106 prev,
1107 (test_bit(_VPF_blocked, &prev->pause_flags) ? RUNSTATE_blocked :
1108 (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
1109 now);
1110 prev->last_run_time = now;
1112 ASSERT(next->runstate.state != RUNSTATE_running);
1113 vcpu_runstate_change(next, RUNSTATE_running, now);
1115 /*
1116 * NB. Don't add any trace records from here until the actual context
1117 * switch, else lost_records resume will not work properly.
1118 */
1120 ASSERT(!next->is_running);
1121 next->is_running = 1;
1123 spin_unlock_irq(sd->schedule_lock);
1125 perfc_incr(sched_ctx);
1127 stop_timer(&prev->periodic_timer);
1129 if ( next_slice.migrated )
1130 evtchn_move_pirqs(next);
1132 /* Ensure that the domain has an up-to-date time base. */
1133 update_vcpu_system_time(next);
1134 vcpu_periodic_timer_work(next);
1136 context_switch(prev, next);
1137 }
1139 void context_saved(struct vcpu *prev)
1140 {
1141 /* Clear running flag /after/ writing context to memory. */
1142 smp_wmb();
1144 prev->is_running = 0;
1146 /* Check for migration request /after/ clearing running flag. */
1147 smp_mb();
1149 SCHED_OP(VCPU2OP(prev), context_saved, prev);
1151 if ( unlikely(test_bit(_VPF_migrating, &prev->pause_flags)) )
1152 vcpu_migrate(prev);
1153 }
1155 /* The scheduler timer: force a run through the scheduler */
1156 static void s_timer_fn(void *unused)
1157 {
1158 raise_softirq(SCHEDULE_SOFTIRQ);
1159 perfc_incr(sched_irq);
1160 }
1162 /* Per-VCPU periodic timer function: sends a virtual timer interrupt. */
1163 static void vcpu_periodic_timer_fn(void *data)
1164 {
1165 struct vcpu *v = data;
1166 vcpu_periodic_timer_work(v);
1167 }
1169 /* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */
1170 static void vcpu_singleshot_timer_fn(void *data)
1171 {
1172 struct vcpu *v = data;
1173 send_timer_event(v);
1174 }
1176 /* SCHEDOP_poll timeout callback. */
1177 static void poll_timer_fn(void *data)
1178 {
1179 struct vcpu *v = data;
1181 if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
1182 vcpu_unblock(v);
1183 }
1185 static int cpu_schedule_up(unsigned int cpu)
1186 {
1187 struct schedule_data *sd = &per_cpu(schedule_data, cpu);
1189 per_cpu(scheduler, cpu) = &ops;
1190 spin_lock_init(&sd->_lock);
1191 sd->schedule_lock = &sd->_lock;
1192 sd->curr = idle_vcpu[cpu];
1193 init_timer(&sd->s_timer, s_timer_fn, NULL, cpu);
1194 atomic_set(&sd->urgent_count, 0);
1196 /* Boot CPU is dealt with later in schedule_init(). */
1197 if ( cpu == 0 )
1198 return 0;
1200 if ( idle_vcpu[cpu] == NULL )
1201 alloc_vcpu(idle_vcpu[0]->domain, cpu, cpu);
1202 if ( idle_vcpu[cpu] == NULL )
1203 return -ENOMEM;
1205 if ( (ops.alloc_pdata != NULL) &&
1206 ((sd->sched_priv = ops.alloc_pdata(&ops, cpu)) == NULL) )
1207 return -ENOMEM;
1209 return 0;
1210 }
1212 static void cpu_schedule_down(unsigned int cpu)
1213 {
1214 struct schedule_data *sd = &per_cpu(schedule_data, cpu);
1216 if ( sd->sched_priv != NULL )
1217 SCHED_OP(&ops, free_pdata, sd->sched_priv, cpu);
1219 kill_timer(&sd->s_timer);
1220 }
1222 static int cpu_schedule_callback(
1223 struct notifier_block *nfb, unsigned long action, void *hcpu)
1224 {
1225 unsigned int cpu = (unsigned long)hcpu;
1226 int rc = 0;
1228 switch ( action )
1229 {
1230 case CPU_UP_PREPARE:
1231 rc = cpu_schedule_up(cpu);
1232 break;
1233 case CPU_UP_CANCELED:
1234 case CPU_DEAD:
1235 cpu_schedule_down(cpu);
1236 break;
1237 default:
1238 break;
1239 }
1241 return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
1242 }
1244 static struct notifier_block cpu_schedule_nfb = {
1245 .notifier_call = cpu_schedule_callback
1246 };
1248 /* Initialise the data structures. */
1249 void __init scheduler_init(void)
1250 {
1251 struct domain *idle_domain;
1252 int i;
1254 open_softirq(SCHEDULE_SOFTIRQ, schedule);
1256 for ( i = 0; schedulers[i] != NULL; i++ )
1257 {
1258 ops = *schedulers[i];
1259 if ( strcmp(ops.opt_name, opt_sched) == 0 )
1260 break;
1261 }
1263 if ( schedulers[i] == NULL )
1264 {
1265 printk("Could not find scheduler: %s\n", opt_sched);
1266 ops = *schedulers[0];
1267 }
1269 if ( cpu_schedule_up(0) )
1270 BUG();
1271 register_cpu_notifier(&cpu_schedule_nfb);
1273 printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
1274 if ( SCHED_OP(&ops, init) )
1275 panic("scheduler returned error on init\n");
1277 idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0);
1278 BUG_ON(idle_domain == NULL);
1279 idle_domain->vcpu = idle_vcpu;
1280 idle_domain->max_vcpus = NR_CPUS;
1281 if ( alloc_vcpu(idle_domain, 0, 0) == NULL )
1282 BUG();
1283 if ( ops.alloc_pdata &&
1284 !(this_cpu(schedule_data).sched_priv = ops.alloc_pdata(&ops, 0)) )
1285 BUG();
1286 }
1288 void schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
1289 {
1290 unsigned long flags;
1291 struct vcpu *idle;
1292 void *ppriv, *ppriv_old, *vpriv, *vpriv_old;
1293 struct scheduler *old_ops = per_cpu(scheduler, cpu);
1294 struct scheduler *new_ops = (c == NULL) ? &ops : c->sched;
1296 if ( old_ops == new_ops )
1297 return;
1299 idle = idle_vcpu[cpu];
1300 ppriv = SCHED_OP(new_ops, alloc_pdata, cpu);
1301 vpriv = SCHED_OP(new_ops, alloc_vdata, idle, idle->domain->sched_priv);
1303 spin_lock_irqsave(per_cpu(schedule_data, cpu).schedule_lock, flags);
1305 SCHED_OP(old_ops, tick_suspend, cpu);
1306 vpriv_old = idle->sched_priv;
1307 idle->sched_priv = vpriv;
1308 per_cpu(scheduler, cpu) = new_ops;
1309 ppriv_old = per_cpu(schedule_data, cpu).sched_priv;
1310 per_cpu(schedule_data, cpu).sched_priv = ppriv;
1311 SCHED_OP(new_ops, tick_resume, cpu);
1312 SCHED_OP(new_ops, insert_vcpu, idle);
1314 spin_unlock_irqrestore(per_cpu(schedule_data, cpu).schedule_lock, flags);
1316 SCHED_OP(old_ops, free_vdata, vpriv_old);
1317 SCHED_OP(old_ops, free_pdata, ppriv_old, cpu);
1318 }
1320 struct scheduler *scheduler_get_default(void)
1321 {
1322 return &ops;
1323 }
1325 struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr)
1326 {
1327 int i;
1328 struct scheduler *sched;
1330 for ( i = 0; schedulers[i] != NULL; i++ )
1331 if ( schedulers[i]->sched_id == sched_id )
1332 goto found;
1333 *perr = -ENOENT;
1334 return NULL;
1336 found:
1337 *perr = -ENOMEM;
1338 if ( (sched = xmalloc(struct scheduler)) == NULL )
1339 return NULL;
1340 memcpy(sched, schedulers[i], sizeof(*sched));
1341 if ( (*perr = SCHED_OP(sched, init)) != 0 )
1342 {
1343 xfree(sched);
1344 sched = NULL;
1345 }
1347 return sched;
1348 }
1350 void scheduler_free(struct scheduler *sched)
1351 {
1352 BUG_ON(sched == &ops);
1353 SCHED_OP(sched, deinit);
1354 xfree(sched);
1355 }
1357 void schedule_dump(struct cpupool *c)
1358 {
1359 int i;
1360 struct scheduler *sched;
1361 cpumask_t *cpus;
1363 sched = (c == NULL) ? &ops : c->sched;
1364 cpus = (c == NULL) ? &cpupool_free_cpus : &c->cpu_valid;
1365 printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
1366 SCHED_OP(sched, dump_settings);
1368 for_each_cpu_mask (i, *cpus)
1369 {
1370 spin_lock(per_cpu(schedule_data, i).schedule_lock);
1371 printk("CPU[%02d] ", i);
1372 SCHED_OP(sched, dump_cpu_state, i);
1373 spin_unlock(per_cpu(schedule_data, i).schedule_lock);
1374 }
1375 }
1377 void sched_tick_suspend(void)
1378 {
1379 struct scheduler *sched;
1380 unsigned int cpu = smp_processor_id();
1382 sched = per_cpu(scheduler, cpu);
1383 SCHED_OP(sched, tick_suspend, cpu);
1384 }
1386 void sched_tick_resume(void)
1387 {
1388 struct scheduler *sched;
1389 unsigned int cpu = smp_processor_id();
1391 sched = per_cpu(scheduler, cpu);
1392 SCHED_OP(sched, tick_resume, cpu);
1393 }
1395 #ifdef CONFIG_COMPAT
1396 #include "compat/schedule.c"
1397 #endif
1399 #endif /* !COMPAT */
1401 /*
1402 * Local variables:
1403 * mode: C
1404 * c-set-style: "BSD"
1405 * c-basic-offset: 4
1406 * tab-width: 4
1407 * indent-tabs-mode: nil
1408 * End:
1409 */