debuggers.hg

view xen/common/schedule.c @ 21959:581ebaa7e2da

numa: Attempt more efficient NUMA allocation in hypervisor by default.

1. Try to allocate from nodes containing CPUs which a guest can be
scheduled on.
2. Remember which node we allocated from last, and round-robin
allocations among above-mentioned nodes.

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Aug 04 15:35:28 2010 +0100 (2010-08-04)
parents db35740574a5
children 49254cab8465
line source
1 /****************************************************************************
2 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
3 * (C) 2002-2003 University of Cambridge
4 * (C) 2004 - Mark Williamson - Intel Research Cambridge
5 ****************************************************************************
6 *
7 * File: common/schedule.c
8 * Author: Rolf Neugebauer & Keir Fraser
9 * Updated for generic API by Mark Williamson
10 *
11 * Description: Generic CPU scheduling code
12 * implements support functionality for the Xen scheduler API.
13 *
14 */
16 #ifndef COMPAT
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/sched.h>
21 #include <xen/domain.h>
22 #include <xen/delay.h>
23 #include <xen/event.h>
24 #include <xen/time.h>
25 #include <xen/timer.h>
26 #include <xen/perfc.h>
27 #include <xen/sched-if.h>
28 #include <xen/softirq.h>
29 #include <xen/trace.h>
30 #include <xen/mm.h>
31 #include <xen/errno.h>
32 #include <xen/guest_access.h>
33 #include <xen/multicall.h>
34 #include <xen/cpu.h>
35 #include <public/sched.h>
36 #include <xsm/xsm.h>
38 /* opt_sched: scheduler - default to credit */
39 static char __initdata opt_sched[10] = "credit";
40 string_param("sched", opt_sched);
42 /* If sched_smt_power_savings is set, the scheduler will give preference
43 * to a partially idle package over a fully idle package when picking a
44 * pCPU on which to schedule a vCPU.
45 */
46 int sched_smt_power_savings = 0;
47 boolean_param("sched_smt_power_savings", sched_smt_power_savings);
49 /* Various timer handlers. */
50 static void s_timer_fn(void *unused);
51 static void vcpu_periodic_timer_fn(void *data);
52 static void vcpu_singleshot_timer_fn(void *data);
53 static void poll_timer_fn(void *data);
55 /* This is global for now so that private implementations can reach it */
56 DEFINE_PER_CPU(struct schedule_data, schedule_data);
57 DEFINE_PER_CPU(struct scheduler *, scheduler);
59 extern const struct scheduler sched_sedf_def;
60 extern const struct scheduler sched_credit_def;
61 extern const struct scheduler sched_credit2_def;
62 static const struct scheduler *schedulers[] = {
63 &sched_sedf_def,
64 &sched_credit_def,
65 &sched_credit2_def,
66 NULL
67 };
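/*
 * Which scheduler is active is chosen at boot via the "sched=" command-line
 * parameter (opt_sched above); scheduler_init() falls back to the first
 * entry in this array if the requested name is not recognised.
 */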
69 static struct scheduler __read_mostly ops;
71 #define SCHED_OP(opsptr, fn, ...) \
72 (( (opsptr)->fn != NULL ) ? (opsptr)->fn(opsptr, ##__VA_ARGS__ ) \
73 : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0 )
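/*
 * For example, SCHED_OP(VCPU2OP(v), wake, v) invokes ops->wake(ops, v) when
 * the selected scheduler implements a wake hook, and otherwise evaluates to
 * a zero of the hook's return type, so callers need no per-hook NULL checks.
 */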
75 #define DOM2OP(_d) (((_d)->cpupool == NULL) ? &ops : ((_d)->cpupool->sched))
76 #define VCPU2OP(_v) (DOM2OP((_v)->domain))
77 #define VCPU2ONLINE(_v) \
78 (((_v)->domain->cpupool == NULL) ? &cpu_online_map \
79 : &(_v)->domain->cpupool->cpu_valid)
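/*
 * Domains not assigned to a cpupool (cpupool == NULL) fall back to the
 * default scheduler instance 'ops' and to the global cpu_online_map;
 * otherwise the pool's own scheduler and its cpu_valid mask are used.
 */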
81 static inline void trace_runstate_change(struct vcpu *v, int new_state)
82 {
83 struct { uint32_t vcpu:16, domain:16; } d;
84 uint32_t event;
86 if ( likely(!tb_init_done) )
87 return;
89 d.vcpu = v->vcpu_id;
90 d.domain = v->domain->domain_id;
92 event = TRC_SCHED_RUNSTATE_CHANGE;
93 event |= ( v->runstate.state & 0x3 ) << 8;
94 event |= ( new_state & 0x3 ) << 4;
96 __trace_var(event, 1/*tsc*/, sizeof(d), (unsigned char *)&d);
97 }
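/*
 * The trace record packs the previous runstate into bits 8-9 and the new
 * runstate into bits 4-5 of the event word, on top of the base
 * TRC_SCHED_RUNSTATE_CHANGE event number.
 */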
99 static inline void trace_continue_running(struct vcpu *v)
100 {
101 struct { uint32_t vcpu:16, domain:16; } d;
103 if ( likely(!tb_init_done) )
104 return;
106 d.vcpu = v->vcpu_id;
107 d.domain = v->domain->domain_id;
109 __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d),
110 (unsigned char *)&d);
111 }
113 static inline void vcpu_urgent_count_update(struct vcpu *v)
114 {
115 if ( is_idle_vcpu(v) )
116 return;
118 if ( unlikely(v->is_urgent) )
119 {
120 if ( !test_bit(_VPF_blocked, &v->pause_flags) ||
121 !test_bit(v->vcpu_id, v->domain->poll_mask) )
122 {
123 v->is_urgent = 0;
124 atomic_dec(&per_cpu(schedule_data,v->processor).urgent_count);
125 }
126 }
127 else
128 {
129 if ( unlikely(test_bit(_VPF_blocked, &v->pause_flags) &&
130 test_bit(v->vcpu_id, v->domain->poll_mask)) )
131 {
132 v->is_urgent = 1;
133 atomic_inc(&per_cpu(schedule_data,v->processor).urgent_count);
134 }
135 }
136 }
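/*
 * A vCPU counts as "urgent" while it is blocked in an event-channel poll
 * (see do_poll() below), i.e. it is likely to want waking with low latency.
 * The per-CPU urgent_count lets other subsystems, such as the idle-state
 * selection logic, take that into account.
 */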
138 static inline void vcpu_runstate_change(
139 struct vcpu *v, int new_state, s_time_t new_entry_time)
140 {
141 s_time_t delta;
143 ASSERT(v->runstate.state != new_state);
144 ASSERT(spin_is_locked(per_cpu(schedule_data,v->processor).schedule_lock));
146 vcpu_urgent_count_update(v);
148 trace_runstate_change(v, new_state);
150 delta = new_entry_time - v->runstate.state_entry_time;
151 if ( delta > 0 )
152 {
153 v->runstate.time[v->runstate.state] += delta;
154 v->runstate.state_entry_time = new_entry_time;
155 }
157 v->runstate.state = new_state;
158 }
160 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
161 {
162 s_time_t delta;
164 if ( unlikely(v != current) )
165 vcpu_schedule_lock_irq(v);
167 memcpy(runstate, &v->runstate, sizeof(*runstate));
168 delta = NOW() - runstate->state_entry_time;
169 if ( delta > 0 )
170 runstate->time[runstate->state] += delta;
172 if ( unlikely(v != current) )
173 vcpu_schedule_unlock_irq(v);
174 }
176 uint64_t get_cpu_idle_time(unsigned int cpu)
177 {
178 struct vcpu_runstate_info state;
179 struct vcpu *v;
181 if ( (v = idle_vcpu[cpu]) == NULL )
182 return 0;
184 vcpu_runstate_get(v, &state);
185 return state.time[RUNSTATE_running];
186 }
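/* A CPU's idle time is the accumulated RUNSTATE_running time of its idle
 * vCPU. */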
188 int sched_init_vcpu(struct vcpu *v, unsigned int processor)
189 {
190 struct domain *d = v->domain;
192 /*
193 * Initialize processor and affinity settings. The idler, and potentially
194 * domain-0 VCPUs, are pinned onto their respective physical CPUs.
195 */
196 v->processor = processor;
197 if ( is_idle_domain(d) || d->is_pinned )
198 v->cpu_affinity = cpumask_of_cpu(processor);
199 else
200 cpus_setall(v->cpu_affinity);
202 /* Initialise the per-vcpu timers. */
203 init_timer(&v->periodic_timer, vcpu_periodic_timer_fn,
204 v, v->processor);
205 init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn,
206 v, v->processor);
207 init_timer(&v->poll_timer, poll_timer_fn,
208 v, v->processor);
210 /* Idle VCPUs are scheduled immediately. */
211 if ( is_idle_domain(d) )
212 {
213 per_cpu(schedule_data, v->processor).curr = v;
214 v->is_running = 1;
215 }
217 TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);
219 v->sched_priv = SCHED_OP(DOM2OP(d), alloc_vdata, v, d->sched_priv);
220 if ( v->sched_priv == NULL )
221 return 1;
223 return 0;
224 }
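/* Note that sched_init_vcpu() signals failure by returning 1 rather than a
 * -errno value; callers are expected to treat any non-zero return as
 * failure. */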
226 int sched_move_domain(struct domain *d, struct cpupool *c)
227 {
228 struct vcpu *v;
229 unsigned int new_p;
230 void **vcpu_priv;
231 void *domdata;
233 domdata = SCHED_OP(c->sched, alloc_domdata, d);
234 if ( domdata == NULL )
235 return -ENOMEM;
237 vcpu_priv = xmalloc_array(void *, d->max_vcpus);
238 if ( vcpu_priv == NULL )
239 {
240 SCHED_OP(c->sched, free_domdata, domdata);
241 return -ENOMEM;
242 }
244 memset(vcpu_priv, 0, d->max_vcpus * sizeof(void *));
245 for_each_vcpu ( d, v )
246 {
247 vcpu_priv[v->vcpu_id] = SCHED_OP(c->sched, alloc_vdata, v, domdata);
248 if ( vcpu_priv[v->vcpu_id] == NULL )
249 {
250 for_each_vcpu ( d, v )
251 {
252 if ( vcpu_priv[v->vcpu_id] != NULL )
253 xfree(vcpu_priv[v->vcpu_id]);
254 }
255 xfree(vcpu_priv);
256 SCHED_OP(c->sched, free_domdata, domdata);
257 return -ENOMEM;
258 }
259 }
261 domain_pause(d);
263 new_p = first_cpu(c->cpu_valid);
264 for_each_vcpu ( d, v )
265 {
266 migrate_timer(&v->periodic_timer, new_p);
267 migrate_timer(&v->singleshot_timer, new_p);
268 migrate_timer(&v->poll_timer, new_p);
270 SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
272 cpus_setall(v->cpu_affinity);
273 domain_update_node_affinity(d);
274 v->processor = new_p;
275 v->sched_priv = vcpu_priv[v->vcpu_id];
276 evtchn_move_pirqs(v);
278 new_p = cycle_cpu(new_p, c->cpu_valid);
279 }
281 d->cpupool = c;
282 SCHED_OP(DOM2OP(d), free_domdata, d->sched_priv);
283 d->sched_priv = domdata;
285 domain_unpause(d);
287 xfree(vcpu_priv);
289 return 0;
290 }
292 void sched_destroy_vcpu(struct vcpu *v)
293 {
294 kill_timer(&v->periodic_timer);
295 kill_timer(&v->singleshot_timer);
296 kill_timer(&v->poll_timer);
297 if ( test_and_clear_bool(v->is_urgent) )
298 atomic_dec(&per_cpu(schedule_data, v->processor).urgent_count);
299 SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
300 }
302 int sched_init_domain(struct domain *d)
303 {
304 return SCHED_OP(DOM2OP(d), init_domain, d);
305 }
307 void sched_destroy_domain(struct domain *d)
308 {
309 SCHED_OP(DOM2OP(d), destroy_domain, d);
310 }
312 void vcpu_sleep_nosync(struct vcpu *v)
313 {
314 unsigned long flags;
316 vcpu_schedule_lock_irqsave(v, flags);
318 if ( likely(!vcpu_runnable(v)) )
319 {
320 if ( v->runstate.state == RUNSTATE_runnable )
321 vcpu_runstate_change(v, RUNSTATE_offline, NOW());
323 SCHED_OP(VCPU2OP(v), sleep, v);
324 }
326 vcpu_schedule_unlock_irqrestore(v, flags);
328 TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
329 }
331 void vcpu_sleep_sync(struct vcpu *v)
332 {
333 vcpu_sleep_nosync(v);
335 while ( !vcpu_runnable(v) && v->is_running )
336 cpu_relax();
338 sync_vcpu_execstate(v);
339 }
341 void vcpu_wake(struct vcpu *v)
342 {
343 unsigned long flags;
345 vcpu_schedule_lock_irqsave(v, flags);
347 if ( likely(vcpu_runnable(v)) )
348 {
349 if ( v->runstate.state >= RUNSTATE_blocked )
350 vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
351 SCHED_OP(VCPU2OP(v), wake, v);
352 }
353 else if ( !test_bit(_VPF_blocked, &v->pause_flags) )
354 {
355 if ( v->runstate.state == RUNSTATE_blocked )
356 vcpu_runstate_change(v, RUNSTATE_offline, NOW());
357 }
359 vcpu_schedule_unlock_irqrestore(v, flags);
361 TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
362 }
364 void vcpu_unblock(struct vcpu *v)
365 {
366 if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
367 return;
369 /* Polling period ends when a VCPU is unblocked. */
370 if ( unlikely(v->poll_evtchn != 0) )
371 {
372 v->poll_evtchn = 0;
373 /*
374 * We *must* re-clear _VPF_blocked to avoid racing other wakeups of
375 * this VCPU (and it then going back to sleep on poll_mask).
376 * Test-and-clear is idiomatic and ensures the clear_bit() cannot be
377 * reordered ahead of the poll_mask test.
377 */
378 if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
379 clear_bit(_VPF_blocked, &v->pause_flags);
380 }
382 vcpu_wake(v);
383 }
385 static void vcpu_migrate(struct vcpu *v)
386 {
387 unsigned long flags;
388 int old_cpu, new_cpu;
390 vcpu_schedule_lock_irqsave(v, flags);
392 /*
393 * NB. The check of v->is_running happens /after/ setting the migration flag
394 * because they both happen in (different) spinlock regions, and those
395 * regions are strictly serialised.
396 */
397 if ( v->is_running ||
398 !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
399 {
400 vcpu_schedule_unlock_irqrestore(v, flags);
401 return;
402 }
404 /* Select new CPU. */
405 old_cpu = v->processor;
406 new_cpu = SCHED_OP(VCPU2OP(v), pick_cpu, v);
408 /*
409 * Transfer urgency status to new CPU before switching CPUs, as once
410 * the switch occurs, v->is_urgent is no longer protected by the per-CPU
411 * scheduler lock we are holding.
412 */
413 if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) )
414 {
415 atomic_inc(&per_cpu(schedule_data, new_cpu).urgent_count);
416 atomic_dec(&per_cpu(schedule_data, old_cpu).urgent_count);
417 }
419 /* Switch to new CPU, then unlock old CPU. */
420 v->processor = new_cpu;
421 spin_unlock_irqrestore(
422 per_cpu(schedule_data, old_cpu).schedule_lock, flags);
424 if ( old_cpu != new_cpu )
425 evtchn_move_pirqs(v);
427 /* Wake on new CPU. */
428 vcpu_wake(v);
429 }
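/*
 * Callers of vcpu_migrate() are expected to have set _VPF_migrating and put
 * the vCPU to sleep (vcpu_sleep_nosync()) beforehand; the migration is
 * abandoned above if the vCPU is still running or the flag has already been
 * cleared by someone else.
 */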
431 /*
432 * Force a VCPU through a deschedule/reschedule path.
433 * For example, using this when setting the periodic timer period means that
434 * most periodic-timer state need only be touched from within the scheduler,
435 * and can therefore be updated without any need for synchronisation.
436 */
437 void vcpu_force_reschedule(struct vcpu *v)
438 {
439 vcpu_schedule_lock_irq(v);
440 if ( v->is_running )
441 set_bit(_VPF_migrating, &v->pause_flags);
442 vcpu_schedule_unlock_irq(v);
444 if ( test_bit(_VPF_migrating, &v->pause_flags) )
445 {
446 vcpu_sleep_nosync(v);
447 vcpu_migrate(v);
448 }
449 }
451 /*
452 * This function is used by cpu_hotplug code from stop_machine context
453 * and from cpupools to switch schedulers on a cpu.
454 */
455 int cpu_disable_scheduler(unsigned int cpu)
456 {
457 struct domain *d;
458 struct vcpu *v;
459 struct cpupool *c;
460 int ret = 0;
462 c = per_cpu(cpupool, cpu);
463 if ( c == NULL )
464 return ret;
466 for_each_domain ( d )
467 {
468 if ( d->cpupool != c )
469 continue;
471 for_each_vcpu ( d, v )
472 {
473 vcpu_schedule_lock_irq(v);
475 if ( (cpus_weight(v->cpu_affinity) == 1) &&
476 cpu_isset(cpu, v->cpu_affinity) )
477 {
478 printk("Breaking vcpu affinity for domain %d vcpu %d\n",
479 v->domain->domain_id, v->vcpu_id);
480 cpus_setall(v->cpu_affinity);
481 domain_update_node_affinity(d);
482 }
484 if ( v->processor == cpu )
485 {
486 set_bit(_VPF_migrating, &v->pause_flags);
487 vcpu_schedule_unlock_irq(v);
488 vcpu_sleep_nosync(v);
489 vcpu_migrate(v);
490 }
491 else
492 {
493 vcpu_schedule_unlock_irq(v);
494 }
496 /*
497 * A vcpu active in the hypervisor will not be migratable.
498 * The caller should try again after releasing and reacquiring
499 * all locks.
500 */
501 if ( v->processor == cpu )
502 ret = -EAGAIN;
503 }
504 }
505 return ret;
506 }
508 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
509 {
510 cpumask_t online_affinity, old_affinity;
511 cpumask_t *online;
513 if ( v->domain->is_pinned )
514 return -EINVAL;
515 online = VCPU2ONLINE(v);
516 cpus_and(online_affinity, *affinity, *online);
517 if ( cpus_empty(online_affinity) )
518 return -EINVAL;
520 vcpu_schedule_lock_irq(v);
522 old_affinity = v->cpu_affinity;
523 v->cpu_affinity = *affinity;
524 domain_update_node_affinity(v->domain);
525 *affinity = old_affinity;
526 if ( !cpu_isset(v->processor, v->cpu_affinity) )
527 set_bit(_VPF_migrating, &v->pause_flags);
529 vcpu_schedule_unlock_irq(v);
531 if ( test_bit(_VPF_migrating, &v->pause_flags) )
532 {
533 vcpu_sleep_nosync(v);
534 vcpu_migrate(v);
535 }
537 return 0;
538 }
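/*
 * Note that vcpu_set_affinity() hands the previous affinity mask back to
 * the caller through *affinity, and forces a migration if the vCPU's
 * current processor is not in the new mask.
 */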
540 /* Block the currently-executing domain until a pertinent event occurs. */
541 static long do_block(void)
542 {
543 struct vcpu *v = current;
545 local_event_delivery_enable();
546 set_bit(_VPF_blocked, &v->pause_flags);
548 /* Check for events /after/ blocking: avoids wakeup waiting race. */
549 if ( local_events_need_delivery() )
550 {
551 clear_bit(_VPF_blocked, &v->pause_flags);
552 }
553 else
554 {
555 TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
556 raise_softirq(SCHEDULE_SOFTIRQ);
557 }
559 return 0;
560 }
562 static long do_poll(struct sched_poll *sched_poll)
563 {
564 struct vcpu *v = current;
565 struct domain *d = v->domain;
566 evtchn_port_t port;
567 long rc;
568 unsigned int i;
570 /* Fairly arbitrary limit. */
571 if ( sched_poll->nr_ports > 128 )
572 return -EINVAL;
574 if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
575 return -EFAULT;
577 set_bit(_VPF_blocked, &v->pause_flags);
578 v->poll_evtchn = -1;
579 set_bit(v->vcpu_id, d->poll_mask);
581 #ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
582 /* Check for events /after/ setting flags: avoids wakeup waiting race. */
583 smp_mb();
585 /*
586 * Someone may have seen we are blocked but not that we are polling, or
587 * vice versa. We are certainly being woken, so clean up and bail. Beyond
588 * this point others can be guaranteed to clean up for us if they wake us.
589 */
590 rc = 0;
591 if ( (v->poll_evtchn == 0) ||
592 !test_bit(_VPF_blocked, &v->pause_flags) ||
593 !test_bit(v->vcpu_id, d->poll_mask) )
594 goto out;
595 #endif
597 rc = 0;
598 if ( local_events_need_delivery() )
599 goto out;
601 for ( i = 0; i < sched_poll->nr_ports; i++ )
602 {
603 rc = -EFAULT;
604 if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
605 goto out;
607 rc = -EINVAL;
608 if ( port >= MAX_EVTCHNS(d) )
609 goto out;
611 rc = 0;
612 if ( test_bit(port, &shared_info(d, evtchn_pending)) )
613 goto out;
614 }
616 if ( sched_poll->nr_ports == 1 )
617 v->poll_evtchn = port;
619 if ( sched_poll->timeout != 0 )
620 set_timer(&v->poll_timer, sched_poll->timeout);
622 TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
623 raise_softirq(SCHEDULE_SOFTIRQ);
625 return 0;
627 out:
628 v->poll_evtchn = 0;
629 clear_bit(v->vcpu_id, d->poll_mask);
630 clear_bit(_VPF_blocked, &v->pause_flags);
631 return rc;
632 }
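/*
 * Summary of the SCHEDOP_poll protocol above: the vCPU marks itself blocked
 * and sets its bit in the domain's poll_mask before scanning the supplied
 * ports, so a concurrently delivered event is either seen by the scan or
 * wakes the vCPU via poll_mask. A single polled port is remembered in
 * v->poll_evtchn; polling ends when an event arrives, the optional timeout
 * fires (poll_timer_fn), or vcpu_unblock() tears the state down.
 */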
634 /* Voluntarily yield the processor for this allocation. */
635 static long do_yield(void)
636 {
637 TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
638 raise_softirq(SCHEDULE_SOFTIRQ);
639 return 0;
640 }
642 static void domain_watchdog_timeout(void *data)
643 {
644 struct domain *d = data;
646 if ( d->is_shutting_down || d->is_dying )
647 return;
649 printk("Watchdog timer fired for domain %u\n", d->domain_id);
650 domain_shutdown(d, SHUTDOWN_watchdog);
651 }
653 static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout)
654 {
655 if ( id > NR_DOMAIN_WATCHDOG_TIMERS )
656 return -EINVAL;
658 spin_lock(&d->watchdog_lock);
660 if ( id == 0 )
661 {
662 for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ )
663 {
664 if ( test_and_set_bit(id, &d->watchdog_inuse_map) )
665 continue;
666 set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
667 break;
668 }
669 spin_unlock(&d->watchdog_lock);
670 return id == NR_DOMAIN_WATCHDOG_TIMERS ? -EEXIST : id + 1;
671 }
673 id -= 1;
674 if ( !test_bit(id, &d->watchdog_inuse_map) )
675 {
676 spin_unlock(&d->watchdog_lock);
677 return -EEXIST;
678 }
680 if ( timeout == 0 )
681 {
682 stop_timer(&d->watchdog_timer[id]);
683 clear_bit(id, &d->watchdog_inuse_map);
684 }
685 else
686 {
687 set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
688 }
690 spin_unlock(&d->watchdog_lock);
691 return 0;
692 }
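/*
 * Watchdog interface summary: id == 0 allocates a free slot, arms it, and
 * returns the slot number plus one (or -EEXIST if none is free); a non-zero
 * id names an existing slot, which is re-armed with the new timeout or, for
 * timeout == 0, stopped and released. A guest keeps its domain alive by
 * re-issuing the hypercall with the returned id before the timeout expires.
 */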
694 void watchdog_domain_init(struct domain *d)
695 {
696 unsigned int i;
698 spin_lock_init(&d->watchdog_lock);
700 d->watchdog_inuse_map = 0;
702 for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
703 init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0);
704 }
706 void watchdog_domain_destroy(struct domain *d)
707 {
708 unsigned int i;
710 for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
711 kill_timer(&d->watchdog_timer[i]);
712 }
714 long do_sched_op_compat(int cmd, unsigned long arg)
715 {
716 long ret = 0;
718 switch ( cmd )
719 {
720 case SCHEDOP_yield:
721 {
722 ret = do_yield();
723 break;
724 }
726 case SCHEDOP_block:
727 {
728 ret = do_block();
729 break;
730 }
732 case SCHEDOP_shutdown:
733 {
734 TRACE_3D(TRC_SCHED_SHUTDOWN,
735 current->domain->domain_id, current->vcpu_id, arg);
736 domain_shutdown(current->domain, (u8)arg);
737 break;
738 }
740 default:
741 ret = -ENOSYS;
742 }
744 return ret;
745 }
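/*
 * do_sched_op_compat() services the legacy form of the sched_op hypercall,
 * in which 'arg' is a plain integer; the current interface, do_sched_op()
 * below, instead takes a guest handle to a per-command argument structure.
 */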
747 typedef long ret_t;
749 #endif /* !COMPAT */
751 ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE(void) arg)
752 {
753 ret_t ret = 0;
755 switch ( cmd )
756 {
757 case SCHEDOP_yield:
758 {
759 ret = do_yield();
760 break;
761 }
763 case SCHEDOP_block:
764 {
765 ret = do_block();
766 break;
767 }
769 case SCHEDOP_shutdown:
770 {
771 struct sched_shutdown sched_shutdown;
773 ret = -EFAULT;
774 if ( copy_from_guest(&sched_shutdown, arg, 1) )
775 break;
777 ret = 0;
778 TRACE_3D(TRC_SCHED_SHUTDOWN,
779 current->domain->domain_id, current->vcpu_id,
780 sched_shutdown.reason);
781 domain_shutdown(current->domain, (u8)sched_shutdown.reason);
783 break;
784 }
786 case SCHEDOP_shutdown_code:
787 {
788 struct sched_shutdown sched_shutdown;
789 struct domain *d = current->domain;
791 ret = -EFAULT;
792 if ( copy_from_guest(&sched_shutdown, arg, 1) )
793 break;
795 TRACE_3D(TRC_SCHED_SHUTDOWN_CODE,
796 d->domain_id, current->vcpu_id, sched_shutdown.reason);
798 spin_lock(&d->shutdown_lock);
799 if ( d->shutdown_code == -1 )
800 d->shutdown_code = (u8)sched_shutdown.reason;
801 spin_unlock(&d->shutdown_lock);
803 ret = 0;
804 break;
805 }
807 case SCHEDOP_poll:
808 {
809 struct sched_poll sched_poll;
811 ret = -EFAULT;
812 if ( copy_from_guest(&sched_poll, arg, 1) )
813 break;
815 ret = do_poll(&sched_poll);
817 break;
818 }
820 case SCHEDOP_remote_shutdown:
821 {
822 struct domain *d;
823 struct sched_remote_shutdown sched_remote_shutdown;
825 ret = -EFAULT;
826 if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
827 break;
829 ret = -ESRCH;
830 d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
831 if ( d == NULL )
832 break;
834 if ( !IS_PRIV_FOR(current->domain, d) )
835 {
836 rcu_unlock_domain(d);
837 return -EPERM;
838 }
840 ret = xsm_schedop_shutdown(current->domain, d);
841 if ( ret )
842 {
843 rcu_unlock_domain(d);
844 return ret;
845 }
847 domain_shutdown(d, (u8)sched_remote_shutdown.reason);
849 rcu_unlock_domain(d);
850 ret = 0;
852 break;
853 }
855 case SCHEDOP_watchdog:
856 {
857 struct sched_watchdog sched_watchdog;
859 ret = -EFAULT;
860 if ( copy_from_guest(&sched_watchdog, arg, 1) )
861 break;
863 ret = domain_watchdog(
864 current->domain, sched_watchdog.id, sched_watchdog.timeout);
865 break;
866 }
868 default:
869 ret = -ENOSYS;
870 }
872 return ret;
873 }
875 #ifndef COMPAT
877 /* Per-vcpu oneshot-timer hypercall. */
878 long do_set_timer_op(s_time_t timeout)
879 {
880 struct vcpu *v = current;
881 s_time_t offset = timeout - NOW();
883 if ( timeout == 0 )
884 {
885 stop_timer(&v->singleshot_timer);
886 }
887 else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
888 unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
889 {
890 /*
891 * Linux workaround: occasionally we will see timeouts a long way in
892 * the future due to wrapping in Linux's jiffy time handling. We check
893 * for timeouts wrapped negative, and for positive timeouts more than
894 * about 13 days in the future (2^50ns). The correct fix is to trigger
895 * an interrupt immediately (since Linux in fact has pending work to
896 * do in this situation). However, older guests also set a long timeout
897 * when they have *no* pending timers at all: setting an immediate
898 * timeout in this case can burn a lot of CPU. We therefore go for a
899 * reasonable middle ground of triggering a timer event in 100ms.
900 */
901 gdprintk(XENLOG_INFO,
902 "Warning: huge timeout set by vcpu %d: %"PRIx64"\n",
903 v->vcpu_id, (uint64_t)timeout);
904 set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
905 }
906 else
907 {
908 migrate_timer(&v->singleshot_timer, smp_processor_id());
909 set_timer(&v->singleshot_timer, timeout);
910 }
912 return 0;
913 }
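/*
 * The cut-off tested above, (offset >> 50) != 0, corresponds to timeouts of
 * 2^50 ns or more, i.e. roughly 1.13e6 seconds or about 13 days, matching
 * the figure quoted in the comment.
 */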
915 /* sched_id - fetch ID of current scheduler */
916 int sched_id(void)
917 {
918 return ops.sched_id;
919 }
921 /* Adjust scheduling parameter for a given domain. */
922 long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
923 {
924 struct vcpu *v;
925 long ret;
927 if ( (op->sched_id != DOM2OP(d)->sched_id) ||
928 ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
929 (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
930 return -EINVAL;
932 /*
933 * Most VCPUs we can simply pause. If we are adjusting this VCPU then
934 * we acquire the local schedule_lock to guard against concurrent updates.
935 *
936 * We only acquire the local schedule lock after we have paused all other
937 * VCPUs in this domain. There are two reasons for this:
938 * 1- We don't want to hold up interrupts as pausing a VCPU can
939 * trigger a TLB shootdown.
940 * 2- Pausing other VCPUs involves briefly locking the schedule
941 * lock of the CPU they are running on. This CPU could be the
942 * same as ours.
943 */
945 for_each_vcpu ( d, v )
946 {
947 if ( v != current )
948 vcpu_pause(v);
949 }
951 if ( d == current->domain )
952 vcpu_schedule_lock_irq(current);
954 if ( (ret = SCHED_OP(DOM2OP(d), adjust, d, op)) == 0 )
955 TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
957 if ( d == current->domain )
958 vcpu_schedule_unlock_irq(current);
960 for_each_vcpu ( d, v )
961 {
962 if ( v != current )
963 vcpu_unpause(v);
964 }
966 return ret;
967 }
969 long sched_adjust_global(struct xen_sysctl_scheduler_op *op)
970 {
971 struct cpupool *pool;
972 int rc;
974 if ( (op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
975 (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo) )
976 return -EINVAL;
978 pool = cpupool_get_by_id(op->cpupool_id);
979 if ( pool == NULL )
980 return -ESRCH;
982 if ( op->sched_id != pool->sched->sched_id )
983 {
984 cpupool_put(pool);
985 return -EINVAL;
986 }
988 rc = SCHED_OP(pool->sched, adjust_global, op);
990 cpupool_put(pool);
992 return rc;
993 }
995 static void vcpu_periodic_timer_work(struct vcpu *v)
996 {
997 s_time_t now = NOW();
998 s_time_t periodic_next_event;
1000 if ( v->periodic_period == 0 )
1001 return;
1003 periodic_next_event = v->periodic_last_event + v->periodic_period;
1005 if ( now >= periodic_next_event )
1006 {
1007 send_timer_event(v);
1008 v->periodic_last_event = now;
1009 periodic_next_event = now + v->periodic_period;
1010 }
1012 migrate_timer(&v->periodic_timer, smp_processor_id());
1013 set_timer(&v->periodic_timer, periodic_next_event);
1014 }
1016 /*
1017 * The main function
1018 * - deschedule the current domain (scheduler independent).
1019 * - pick a new domain (scheduler dependent).
1020 */
1021 static void schedule(void)
1022 {
1023 struct vcpu *prev = current, *next = NULL;
1024 s_time_t now = NOW();
1025 struct scheduler *sched = this_cpu(scheduler);
1026 unsigned long *tasklet_work = &this_cpu(tasklet_work_to_do);
1027 bool_t tasklet_work_scheduled = 0;
1028 struct schedule_data *sd;
1029 struct task_slice next_slice;
1031 ASSERT(!in_irq());
1032 ASSERT(this_cpu(mc_state).flags == 0);
1034 perfc_incr(sched_run);
1036 sd = &this_cpu(schedule_data);
1038 /* Update tasklet scheduling status. */
1039 switch ( *tasklet_work )
1040 {
1041 case TASKLET_enqueued:
1042 set_bit(_TASKLET_scheduled, tasklet_work);
1043 case TASKLET_enqueued|TASKLET_scheduled:
1044 tasklet_work_scheduled = 1;
1045 break;
1046 case TASKLET_scheduled:
1047 clear_bit(_TASKLET_scheduled, tasklet_work);
1048 case 0:
1049 /*tasklet_work_scheduled = 0;*/
1050 break;
1051 default:
1052 BUG();
1053 }
1055 spin_lock_irq(sd->schedule_lock);
1057 stop_timer(&sd->s_timer);
1059 /* get policy-specific decision on scheduling... */
1060 next_slice = sched->do_schedule(sched, now, tasklet_work_scheduled);
1062 next = next_slice.task;
1064 sd->curr = next;
1066 if ( next_slice.time >= 0 ) /* -ve means no limit */
1067 set_timer(&sd->s_timer, now + next_slice.time);
1069 if ( unlikely(prev == next) )
1070 {
1071 spin_unlock_irq(sd->schedule_lock);
1072 trace_continue_running(next);
1073 return continue_running(prev);
1074 }
1076 TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
1077 prev->domain->domain_id,
1078 now - prev->runstate.state_entry_time);
1079 TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
1080 next->domain->domain_id,
1081 (next->runstate.state == RUNSTATE_runnable) ?
1082 (now - next->runstate.state_entry_time) : 0,
1083 next_slice.time);
1085 ASSERT(prev->runstate.state == RUNSTATE_running);
1087 TRACE_4D(TRC_SCHED_SWITCH,
1088 prev->domain->domain_id, prev->vcpu_id,
1089 next->domain->domain_id, next->vcpu_id);
1091 vcpu_runstate_change(
1092 prev,
1093 (test_bit(_VPF_blocked, &prev->pause_flags) ? RUNSTATE_blocked :
1094 (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
1095 now);
1096 prev->last_run_time = now;
1098 ASSERT(next->runstate.state != RUNSTATE_running);
1099 vcpu_runstate_change(next, RUNSTATE_running, now);
1101 /*
1102 * NB. Don't add any trace records from here until the actual context
1103 * switch, else lost_records resume will not work properly.
1104 */
1106 ASSERT(!next->is_running);
1107 next->is_running = 1;
1109 spin_unlock_irq(sd->schedule_lock);
1111 perfc_incr(sched_ctx);
1113 stop_timer(&prev->periodic_timer);
1115 if ( next_slice.migrated )
1116 evtchn_move_pirqs(next);
1118 /* Ensure that the domain has an up-to-date time base. */
1119 update_vcpu_system_time(next);
1120 vcpu_periodic_timer_work(next);
1122 context_switch(prev, next);
1123 }
1125 void context_saved(struct vcpu *prev)
1126 {
1127 /* Clear running flag /after/ writing context to memory. */
1128 smp_wmb();
1130 prev->is_running = 0;
1132 /* Check for migration request /after/ clearing running flag. */
1133 smp_mb();
1135 SCHED_OP(VCPU2OP(prev), context_saved, prev);
1137 if ( unlikely(test_bit(_VPF_migrating, &prev->pause_flags)) )
1138 vcpu_migrate(prev);
1139 }
1141 /* The scheduler timer: force a run through the scheduler */
1142 static void s_timer_fn(void *unused)
1143 {
1144 raise_softirq(SCHEDULE_SOFTIRQ);
1145 perfc_incr(sched_irq);
1146 }
1148 /* Per-VCPU periodic timer function: sends a virtual timer interrupt. */
1149 static void vcpu_periodic_timer_fn(void *data)
1150 {
1151 struct vcpu *v = data;
1152 vcpu_periodic_timer_work(v);
1153 }
1155 /* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */
1156 static void vcpu_singleshot_timer_fn(void *data)
1157 {
1158 struct vcpu *v = data;
1159 send_timer_event(v);
1160 }
1162 /* SCHEDOP_poll timeout callback. */
1163 static void poll_timer_fn(void *data)
1164 {
1165 struct vcpu *v = data;
1167 if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
1168 vcpu_unblock(v);
1169 }
1171 static int cpu_schedule_up(unsigned int cpu)
1172 {
1173 struct schedule_data *sd = &per_cpu(schedule_data, cpu);
1175 per_cpu(scheduler, cpu) = &ops;
1176 spin_lock_init(&sd->_lock);
1177 sd->schedule_lock = &sd->_lock;
1178 sd->curr = idle_vcpu[cpu];
1179 init_timer(&sd->s_timer, s_timer_fn, NULL, cpu);
1180 atomic_set(&sd->urgent_count, 0);
1182 /* Boot CPU is dealt with later in scheduler_init(). */
1183 if ( cpu == 0 )
1184 return 0;
1186 if ( idle_vcpu[cpu] == NULL )
1187 alloc_vcpu(idle_vcpu[0]->domain, cpu, cpu);
1188 if ( idle_vcpu[cpu] == NULL )
1189 return -ENOMEM;
1191 if ( (ops.alloc_pdata != NULL) &&
1192 ((sd->sched_priv = ops.alloc_pdata(&ops, cpu)) == NULL) )
1193 return -ENOMEM;
1195 return 0;
1196 }
1198 static void cpu_schedule_down(unsigned int cpu)
1199 {
1200 struct schedule_data *sd = &per_cpu(schedule_data, cpu);
1202 if ( sd->sched_priv != NULL )
1203 SCHED_OP(&ops, free_pdata, sd->sched_priv, cpu);
1205 kill_timer(&sd->s_timer);
1206 }
1208 static int cpu_schedule_callback(
1209 struct notifier_block *nfb, unsigned long action, void *hcpu)
1210 {
1211 unsigned int cpu = (unsigned long)hcpu;
1212 int rc = 0;
1214 switch ( action )
1215 {
1216 case CPU_UP_PREPARE:
1217 rc = cpu_schedule_up(cpu);
1218 break;
1219 case CPU_UP_CANCELED:
1220 case CPU_DEAD:
1221 cpu_schedule_down(cpu);
1222 break;
1223 default:
1224 break;
1225 }
1227 return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
1228 }
1230 static struct notifier_block cpu_schedule_nfb = {
1231 .notifier_call = cpu_schedule_callback
1232 };
1234 /* Initialise the data structures. */
1235 void __init scheduler_init(void)
1236 {
1237 struct domain *idle_domain;
1238 int i;
1240 open_softirq(SCHEDULE_SOFTIRQ, schedule);
1242 for ( i = 0; schedulers[i] != NULL; i++ )
1243 {
1244 ops = *schedulers[i];
1245 if ( strcmp(ops.opt_name, opt_sched) == 0 )
1246 break;
1247 }
1249 if ( schedulers[i] == NULL )
1250 {
1251 printk("Could not find scheduler: %s\n", opt_sched);
1252 ops = *schedulers[0];
1253 }
1255 if ( cpu_schedule_up(0) )
1256 BUG();
1257 register_cpu_notifier(&cpu_schedule_nfb);
1259 printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
1260 if ( SCHED_OP(&ops, init) )
1261 panic("scheduler returned error on init\n");
1263 idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0);
1264 BUG_ON(idle_domain == NULL);
1265 idle_domain->vcpu = idle_vcpu;
1266 idle_domain->max_vcpus = NR_CPUS;
1267 if ( alloc_vcpu(idle_domain, 0, 0) == NULL )
1268 BUG();
1269 if ( ops.alloc_pdata &&
1270 !(this_cpu(schedule_data).sched_priv = ops.alloc_pdata(&ops, 0)) )
1271 BUG();
1272 }
1274 void schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
1275 {
1276 unsigned long flags;
1277 struct vcpu *idle;
1278 void *ppriv, *ppriv_old, *vpriv, *vpriv_old;
1279 struct scheduler *old_ops = per_cpu(scheduler, cpu);
1280 struct scheduler *new_ops = (c == NULL) ? &ops : c->sched;
1282 if ( old_ops == new_ops )
1283 return;
1285 idle = idle_vcpu[cpu];
1286 ppriv = SCHED_OP(new_ops, alloc_pdata, cpu);
1287 vpriv = SCHED_OP(new_ops, alloc_vdata, idle, idle->domain->sched_priv);
1289 spin_lock_irqsave(per_cpu(schedule_data, cpu).schedule_lock, flags);
1291 SCHED_OP(old_ops, tick_suspend, cpu);
1292 vpriv_old = idle->sched_priv;
1293 idle->sched_priv = vpriv;
1294 per_cpu(scheduler, cpu) = new_ops;
1295 ppriv_old = per_cpu(schedule_data, cpu).sched_priv;
1296 per_cpu(schedule_data, cpu).sched_priv = ppriv;
1297 SCHED_OP(new_ops, tick_resume, cpu);
1298 SCHED_OP(new_ops, insert_vcpu, idle);
1300 spin_unlock_irqrestore(per_cpu(schedule_data, cpu).schedule_lock, flags);
1302 SCHED_OP(old_ops, free_vdata, vpriv);
1303 SCHED_OP(old_ops, free_pdata, ppriv_old, cpu);
1304 }
1306 struct scheduler *scheduler_get_default(void)
1307 {
1308 return &ops;
1309 }
1311 struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr)
1312 {
1313 int i;
1314 struct scheduler *sched;
1316 for ( i = 0; schedulers[i] != NULL; i++ )
1317 if ( schedulers[i]->sched_id == sched_id )
1318 goto found;
1319 *perr = -ENOENT;
1320 return NULL;
1322 found:
1323 *perr = -ENOMEM;
1324 if ( (sched = xmalloc(struct scheduler)) == NULL )
1325 return NULL;
1326 memcpy(sched, schedulers[i], sizeof(*sched));
1327 if ( (*perr = SCHED_OP(sched, init)) != 0 )
1328 {
1329 xfree(sched);
1330 sched = NULL;
1331 }
1333 return sched;
1334 }
1336 void scheduler_free(struct scheduler *sched)
1337 {
1338 BUG_ON(sched == &ops);
1339 SCHED_OP(sched, deinit);
1340 xfree(sched);
1341 }
1343 void schedule_dump(struct cpupool *c)
1344 {
1345 int i;
1346 struct scheduler *sched;
1347 cpumask_t *cpus;
1349 sched = (c == NULL) ? &ops : c->sched;
1350 cpus = (c == NULL) ? &cpupool_free_cpus : &c->cpu_valid;
1351 printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
1352 SCHED_OP(sched, dump_settings);
1354 for_each_cpu_mask (i, *cpus)
1355 {
1356 spin_lock(per_cpu(schedule_data, i).schedule_lock);
1357 printk("CPU[%02d] ", i);
1358 SCHED_OP(sched, dump_cpu_state, i);
1359 spin_unlock(per_cpu(schedule_data, i).schedule_lock);
1360 }
1361 }
1363 void sched_tick_suspend(void)
1364 {
1365 struct scheduler *sched;
1366 unsigned int cpu = smp_processor_id();
1368 sched = per_cpu(scheduler, cpu);
1369 SCHED_OP(sched, tick_suspend, cpu);
1370 }
1372 void sched_tick_resume(void)
1373 {
1374 struct scheduler *sched;
1375 unsigned int cpu = smp_processor_id();
1377 sched = per_cpu(scheduler, cpu);
1378 SCHED_OP(sched, tick_resume, cpu);
1379 }
1381 #ifdef CONFIG_COMPAT
1382 #include "compat/schedule.c"
1383 #endif
1385 #endif /* !COMPAT */
1387 /*
1388 * Local variables:
1389 * mode: C
1390 * c-set-style: "BSD"
1391 * c-basic-offset: 4
1392 * tab-width: 4
1393 * indent-tabs-mode: nil
1394 * End:
1395 */