debuggers.hg: xen/common/schedule.c @ changeset 21960:49254cab8465

    numa: Small tweaks to domain_update_node_affinity() and its callers.

    From: Andrew Jones <drjones@redhat.com>
    Signed-off-by: Keir Fraser <keir.fraser@citrix.com>

Author:   Keir Fraser <keir.fraser@citrix.com>
Date:     Wed Aug 04 17:10:46 2010 +0100
Parents:  581ebaa7e2da
Children: cd606ea8f963

/****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */

#ifndef COMPAT
#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <xen/mm.h>
#include <xen/errno.h>
#include <xen/guest_access.h>
#include <xen/multicall.h>
#include <xen/cpu.h>
#include <public/sched.h>
#include <xsm/xsm.h>

/* opt_sched: scheduler - default to credit */
static char __initdata opt_sched[10] = "credit";
string_param("sched", opt_sched);

/* If sched_smt_power_savings is set, the scheduler will give preference to
 * a partially idle package over a fully idle one when picking a pCPU on
 * which to schedule a vCPU.
 */
int sched_smt_power_savings = 0;
boolean_param("sched_smt_power_savings", sched_smt_power_savings);

/* Various timer handlers. */
static void s_timer_fn(void *unused);
static void vcpu_periodic_timer_fn(void *data);
static void vcpu_singleshot_timer_fn(void *data);
static void poll_timer_fn(void *data);

/* This is global for now so that private implementations can reach it */
DEFINE_PER_CPU(struct schedule_data, schedule_data);
DEFINE_PER_CPU(struct scheduler *, scheduler);

extern const struct scheduler sched_sedf_def;
extern const struct scheduler sched_credit_def;
extern const struct scheduler sched_credit2_def;
static const struct scheduler *schedulers[] = {
    &sched_sedf_def,
    &sched_credit_def,
    &sched_credit2_def,
    NULL
};

static struct scheduler __read_mostly ops;
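
/*
 * Invoke a hook of the given scheduler if it implements one; otherwise the
 * expression evaluates to a zero of the hook's return type, so callers need
 * no NULL checks for optional hooks.
 */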
#define SCHED_OP(opsptr, fn, ...)                                          \
         (( (opsptr)->fn != NULL ) ? (opsptr)->fn(opsptr, ##__VA_ARGS__ )  \
          : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0 )
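
/*
 * Map a domain or vCPU to the scheduler of its cpupool (falling back to the
 * default scheduler when it is in no pool), and to the set of online pCPUs
 * it may run on.
 */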
#define DOM2OP(_d)    (((_d)->cpupool == NULL) ? &ops : ((_d)->cpupool->sched))
#define VCPU2OP(_v)   (DOM2OP((_v)->domain))
#define VCPU2ONLINE(_v)                                                    \
         (((_v)->domain->cpupool == NULL) ? &cpu_online_map               \
         : &(_v)->domain->cpupool->cpu_valid)

static inline void trace_runstate_change(struct vcpu *v, int new_state)
{
    struct { uint32_t vcpu:16, domain:16; } d;
    uint32_t event;

    if ( likely(!tb_init_done) )
        return;

    d.vcpu = v->vcpu_id;
    d.domain = v->domain->domain_id;

    event = TRC_SCHED_RUNSTATE_CHANGE;
    event |= ( v->runstate.state & 0x3 ) << 8;
    event |= ( new_state & 0x3 ) << 4;

    __trace_var(event, 1/*tsc*/, sizeof(d), (unsigned char *)&d);
}

static inline void trace_continue_running(struct vcpu *v)
{
    struct { uint32_t vcpu:16, domain:16; } d;

    if ( likely(!tb_init_done) )
        return;

    d.vcpu = v->vcpu_id;
    d.domain = v->domain->domain_id;

    __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d),
                (unsigned char *)&d);
}
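
/*
 * Keep v->is_urgent and the per-CPU urgent_count in sync: a vCPU counts as
 * urgent while it is blocked in an event-channel poll, i.e. _VPF_blocked is
 * set and its bit is set in the domain's poll_mask.
 */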
static inline void vcpu_urgent_count_update(struct vcpu *v)
{
    if ( is_idle_vcpu(v) )
        return;

    if ( unlikely(v->is_urgent) )
    {
        if ( !test_bit(_VPF_blocked, &v->pause_flags) ||
             !test_bit(v->vcpu_id, v->domain->poll_mask) )
        {
            v->is_urgent = 0;
            atomic_dec(&per_cpu(schedule_data,v->processor).urgent_count);
        }
    }
    else
    {
        if ( unlikely(test_bit(_VPF_blocked, &v->pause_flags) &&
                      test_bit(v->vcpu_id, v->domain->poll_mask)) )
        {
            v->is_urgent = 1;
            atomic_inc(&per_cpu(schedule_data,v->processor).urgent_count);
        }
    }
}
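
/*
 * Account the time spent in the outgoing runstate and switch to the new one.
 * The caller must hold the schedule lock of v's processor, and new_state
 * must differ from the current state (both are ASSERTed below).
 */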
static inline void vcpu_runstate_change(
    struct vcpu *v, int new_state, s_time_t new_entry_time)
{
    s_time_t delta;

    ASSERT(v->runstate.state != new_state);
    ASSERT(spin_is_locked(per_cpu(schedule_data,v->processor).schedule_lock));

    vcpu_urgent_count_update(v);

    trace_runstate_change(v, new_state);

    delta = new_entry_time - v->runstate.state_entry_time;
    if ( delta > 0 )
    {
        v->runstate.time[v->runstate.state] += delta;
        v->runstate.state_entry_time = new_entry_time;
    }

    v->runstate.state = new_state;
}

void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
{
    s_time_t delta;

    if ( unlikely(v != current) )
        vcpu_schedule_lock_irq(v);

    memcpy(runstate, &v->runstate, sizeof(*runstate));
    delta = NOW() - runstate->state_entry_time;
    if ( delta > 0 )
        runstate->time[runstate->state] += delta;

    if ( unlikely(v != current) )
        vcpu_schedule_unlock_irq(v);
}

uint64_t get_cpu_idle_time(unsigned int cpu)
{
    struct vcpu_runstate_info state;
    struct vcpu *v;

    if ( (v = idle_vcpu[cpu]) == NULL )
        return 0;

    vcpu_runstate_get(v, &state);
    return state.time[RUNSTATE_running];
}

int sched_init_vcpu(struct vcpu *v, unsigned int processor)
{
    struct domain *d = v->domain;

    /*
     * Initialize processor and affinity settings. The idler, and potentially
     * domain-0 VCPUs, are pinned onto their respective physical CPUs.
     */
    v->processor = processor;
    if ( is_idle_domain(d) || d->is_pinned )
        v->cpu_affinity = cpumask_of_cpu(processor);
    else
        cpus_setall(v->cpu_affinity);

    /* Initialise the per-vcpu timers. */
    init_timer(&v->periodic_timer, vcpu_periodic_timer_fn,
               v, v->processor);
    init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn,
               v, v->processor);
    init_timer(&v->poll_timer, poll_timer_fn,
               v, v->processor);

    /* Idle VCPUs are scheduled immediately. */
    if ( is_idle_domain(d) )
    {
        per_cpu(schedule_data, v->processor).curr = v;
        v->is_running = 1;
    }

    TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);

    v->sched_priv = SCHED_OP(DOM2OP(d), alloc_vdata, v, d->sched_priv);
    if ( v->sched_priv == NULL )
        return 1;

    return 0;
}
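
/*
 * Move a domain to cpupool c. All scheduler-private data for the new pool is
 * allocated up front so an allocation failure leaves the domain untouched;
 * only then is the domain paused, its vCPUs and timers moved onto CPUs of
 * the new pool, and the old scheduler data freed.
 */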
int sched_move_domain(struct domain *d, struct cpupool *c)
{
    struct vcpu *v;
    unsigned int new_p;
    void **vcpu_priv;
    void *domdata;

    domdata = SCHED_OP(c->sched, alloc_domdata, d);
    if ( domdata == NULL )
        return -ENOMEM;

    vcpu_priv = xmalloc_array(void *, d->max_vcpus);
    if ( vcpu_priv == NULL )
    {
        SCHED_OP(c->sched, free_domdata, domdata);
        return -ENOMEM;
    }

    memset(vcpu_priv, 0, d->max_vcpus * sizeof(void *));
    for_each_vcpu ( d, v )
    {
        vcpu_priv[v->vcpu_id] = SCHED_OP(c->sched, alloc_vdata, v, domdata);
        if ( vcpu_priv[v->vcpu_id] == NULL )
        {
            for_each_vcpu ( d, v )
            {
                if ( vcpu_priv[v->vcpu_id] != NULL )
                    xfree(vcpu_priv[v->vcpu_id]);
            }
            xfree(vcpu_priv);
            SCHED_OP(c->sched, free_domdata, domdata);
            return -ENOMEM;
        }
    }

    domain_pause(d);

    new_p = first_cpu(c->cpu_valid);
    for_each_vcpu ( d, v )
    {
        migrate_timer(&v->periodic_timer, new_p);
        migrate_timer(&v->singleshot_timer, new_p);
        migrate_timer(&v->poll_timer, new_p);

        SCHED_OP(VCPU2OP(v), destroy_vcpu, v);

        cpus_setall(v->cpu_affinity);
        v->processor = new_p;
        v->sched_priv = vcpu_priv[v->vcpu_id];
        evtchn_move_pirqs(v);

        new_p = cycle_cpu(new_p, c->cpu_valid);
    }
    domain_update_node_affinity(d);

    d->cpupool = c;
    SCHED_OP(DOM2OP(d), free_domdata, d->sched_priv);
    d->sched_priv = domdata;

    domain_unpause(d);

    xfree(vcpu_priv);

    return 0;
}

void sched_destroy_vcpu(struct vcpu *v)
{
    kill_timer(&v->periodic_timer);
    kill_timer(&v->singleshot_timer);
    kill_timer(&v->poll_timer);
    if ( test_and_clear_bool(v->is_urgent) )
        atomic_dec(&per_cpu(schedule_data, v->processor).urgent_count);
    SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
}

int sched_init_domain(struct domain *d)
{
    return SCHED_OP(DOM2OP(d), init_domain, d);
}

void sched_destroy_domain(struct domain *d)
{
    SCHED_OP(DOM2OP(d), destroy_domain, d);
}

void vcpu_sleep_nosync(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(!vcpu_runnable(v)) )
    {
        if ( v->runstate.state == RUNSTATE_runnable )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());

        SCHED_OP(VCPU2OP(v), sleep, v);
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
}

void vcpu_sleep_sync(struct vcpu *v)
{
    vcpu_sleep_nosync(v);

    while ( !vcpu_runnable(v) && v->is_running )
        cpu_relax();

    sync_vcpu_execstate(v);
}
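
/*
 * Wake a vCPU: if it is runnable again, account it as runnable and hand it
 * to the scheduler's wake hook; if it is merely no longer blocked (e.g.
 * still paused), only the runstate accounting is updated.
 */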
void vcpu_wake(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(vcpu_runnable(v)) )
    {
        if ( v->runstate.state >= RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
        SCHED_OP(VCPU2OP(v), wake, v);
    }
    else if ( !test_bit(_VPF_blocked, &v->pause_flags) )
    {
        if ( v->runstate.state == RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
}

void vcpu_unblock(struct vcpu *v)
{
    if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
        return;

    /* Polling period ends when a VCPU is unblocked. */
    if ( unlikely(v->poll_evtchn != 0) )
    {
        v->poll_evtchn = 0;
        /*
         * We *must* re-clear _VPF_blocked to avoid racing other wakeups of
         * this VCPU (and it then going back to sleep on poll_mask).
         * The test-and-clear is idiomatic and ensures the clear_bit is not
         * reordered before it.
         */
        if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
            clear_bit(_VPF_blocked, &v->pause_flags);
    }

    vcpu_wake(v);
}
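
/*
 * Complete a requested migration: ask the scheduler's pick_cpu hook for a
 * new pCPU and move the (descheduled) vCPU there. Bails out early if the
 * vCPU is still running or no _VPF_migrating request is pending.
 */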
static void vcpu_migrate(struct vcpu *v)
{
    unsigned long flags;
    int old_cpu, new_cpu;

    vcpu_schedule_lock_irqsave(v, flags);

    /*
     * NB. Check of v->running happens /after/ setting migration flag
     * because they both happen in (different) spinlock regions, and those
     * regions are strictly serialised.
     */
    if ( v->is_running ||
         !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
    {
        vcpu_schedule_unlock_irqrestore(v, flags);
        return;
    }

    /* Select new CPU. */
    old_cpu = v->processor;
    new_cpu = SCHED_OP(VCPU2OP(v), pick_cpu, v);

    /*
     * Transfer urgency status to new CPU before switching CPUs, as once
     * the switch occurs, v->is_urgent is no longer protected by the per-CPU
     * scheduler lock we are holding.
     */
    if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) )
    {
        atomic_inc(&per_cpu(schedule_data, new_cpu).urgent_count);
        atomic_dec(&per_cpu(schedule_data, old_cpu).urgent_count);
    }

    /* Switch to new CPU, then unlock old CPU. */
    v->processor = new_cpu;
    spin_unlock_irqrestore(
        per_cpu(schedule_data, old_cpu).schedule_lock, flags);

    if ( old_cpu != new_cpu )
        evtchn_move_pirqs(v);

    /* Wake on new CPU. */
    vcpu_wake(v);
}

/*
 * Force a VCPU through a deschedule/reschedule path.
 * For example, using this when setting the periodic timer period means that
 * most periodic-timer state need only be touched from within the scheduler
 * which can thus be done without need for synchronisation.
 */
void vcpu_force_reschedule(struct vcpu *v)
{
    vcpu_schedule_lock_irq(v);
    if ( v->is_running )
        set_bit(_VPF_migrating, &v->pause_flags);
    vcpu_schedule_unlock_irq(v);

    if ( test_bit(_VPF_migrating, &v->pause_flags) )
    {
        vcpu_sleep_nosync(v);
        vcpu_migrate(v);
    }
}

/*
 * This function is used by cpu_hotplug code from stop_machine context
 * and from cpupools to switch schedulers on a cpu.
 */
int cpu_disable_scheduler(unsigned int cpu)
{
    struct domain *d;
    struct vcpu *v;
    struct cpupool *c;
    int    ret = 0;
    bool_t affinity_broken;

    c = per_cpu(cpupool, cpu);
    if ( c == NULL )
        return ret;

    for_each_domain ( d )
    {
        if ( d->cpupool != c )
            continue;

        affinity_broken = 0;

        for_each_vcpu ( d, v )
        {
            vcpu_schedule_lock_irq(v);

            if ( (cpus_weight(v->cpu_affinity) == 1) &&
                 cpu_isset(cpu, v->cpu_affinity) )
            {
                printk("Breaking vcpu affinity for domain %d vcpu %d\n",
                       v->domain->domain_id, v->vcpu_id);
                cpus_setall(v->cpu_affinity);
                affinity_broken = 1;
            }

            if ( v->processor == cpu )
            {
                set_bit(_VPF_migrating, &v->pause_flags);
                vcpu_schedule_unlock_irq(v);
                vcpu_sleep_nosync(v);
                vcpu_migrate(v);
            }
            else
            {
                vcpu_schedule_unlock_irq(v);
            }

            /*
             * A vcpu active in the hypervisor will not be migratable.
             * The caller should try again after releasing and reacquiring
             * all locks.
             */
            if ( v->processor == cpu )
                ret = -EAGAIN;
        }

        if ( affinity_broken )
            domain_update_node_affinity(d);
    }

    return ret;
}
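
/*
 * Set a vCPU's affinity. The new mask must intersect the online CPUs of the
 * domain's cpupool; the previous affinity is handed back in *affinity. If
 * the current pCPU is no longer allowed, the vCPU is migrated away.
 */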
int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
{
    cpumask_t online_affinity, old_affinity;
    cpumask_t *online;

    if ( v->domain->is_pinned )
        return -EINVAL;
    online = VCPU2ONLINE(v);
    cpus_and(online_affinity, *affinity, *online);
    if ( cpus_empty(online_affinity) )
        return -EINVAL;

    vcpu_schedule_lock_irq(v);

    old_affinity = v->cpu_affinity;
    v->cpu_affinity = *affinity;
    domain_update_node_affinity(v->domain);
    *affinity = old_affinity;
    if ( !cpu_isset(v->processor, v->cpu_affinity) )
        set_bit(_VPF_migrating, &v->pause_flags);

    vcpu_schedule_unlock_irq(v);

    if ( test_bit(_VPF_migrating, &v->pause_flags) )
    {
        vcpu_sleep_nosync(v);
        vcpu_migrate(v);
    }

    return 0;
}

/* Block the currently-executing domain until a pertinent event occurs. */
static long do_block(void)
{
    struct vcpu *v = current;

    local_event_delivery_enable();
    set_bit(_VPF_blocked, &v->pause_flags);

    /* Check for events /after/ blocking: avoids wakeup waiting race. */
    if ( local_events_need_delivery() )
    {
        clear_bit(_VPF_blocked, &v->pause_flags);
    }
    else
    {
        TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
        raise_softirq(SCHEDULE_SOFTIRQ);
    }

    return 0;
}

static long do_poll(struct sched_poll *sched_poll)
{
    struct vcpu   *v = current;
    struct domain *d = v->domain;
    evtchn_port_t  port;
    long           rc;
    unsigned int   i;

    /* Fairly arbitrary limit. */
    if ( sched_poll->nr_ports > 128 )
        return -EINVAL;

    if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
        return -EFAULT;

    set_bit(_VPF_blocked, &v->pause_flags);
    v->poll_evtchn = -1;
    set_bit(v->vcpu_id, d->poll_mask);

#ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
    /* Check for events /after/ setting flags: avoids wakeup waiting race. */
    smp_mb();

    /*
     * Someone may have seen we are blocked but not that we are polling, or
     * vice versa. We are certainly being woken, so clean up and bail. Beyond
     * this point others can be guaranteed to clean up for us if they wake us.
     */
    rc = 0;
    if ( (v->poll_evtchn == 0) ||
         !test_bit(_VPF_blocked, &v->pause_flags) ||
         !test_bit(v->vcpu_id, d->poll_mask) )
        goto out;
#endif

    rc = 0;
    if ( local_events_need_delivery() )
        goto out;

    for ( i = 0; i < sched_poll->nr_ports; i++ )
    {
        rc = -EFAULT;
        if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
            goto out;

        rc = -EINVAL;
        if ( port >= MAX_EVTCHNS(d) )
            goto out;

        rc = 0;
        if ( test_bit(port, &shared_info(d, evtchn_pending)) )
            goto out;
    }

    if ( sched_poll->nr_ports == 1 )
        v->poll_evtchn = port;

    if ( sched_poll->timeout != 0 )
        set_timer(&v->poll_timer, sched_poll->timeout);

    TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
    raise_softirq(SCHEDULE_SOFTIRQ);

    return 0;

 out:
    v->poll_evtchn = 0;
    clear_bit(v->vcpu_id, d->poll_mask);
    clear_bit(_VPF_blocked, &v->pause_flags);
    return rc;
}

/* Voluntarily yield the processor for this allocation. */
static long do_yield(void)
{
    TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
    raise_softirq(SCHEDULE_SOFTIRQ);
    return 0;
}
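
/*
 * Per-domain watchdogs: a guest arms up to NR_DOMAIN_WATCHDOG_TIMERS timers
 * through SCHEDOP_watchdog. If a timer fires before being re-armed or
 * disarmed, the domain is shut down with reason SHUTDOWN_watchdog.
 */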
static void domain_watchdog_timeout(void *data)
{
    struct domain *d = data;

    if ( d->is_shutting_down || d->is_dying )
        return;

    printk("Watchdog timer fired for domain %u\n", d->domain_id);
    domain_shutdown(d, SHUTDOWN_watchdog);
}

static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout)
{
    if ( id > NR_DOMAIN_WATCHDOG_TIMERS )
        return -EINVAL;

    spin_lock(&d->watchdog_lock);

    if ( id == 0 )
    {
        for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ )
        {
            if ( test_and_set_bit(id, &d->watchdog_inuse_map) )
                continue;
            set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
            break;
        }
        spin_unlock(&d->watchdog_lock);
        return id == NR_DOMAIN_WATCHDOG_TIMERS ? -EEXIST : id + 1;
    }

    id -= 1;
    if ( !test_bit(id, &d->watchdog_inuse_map) )
    {
        spin_unlock(&d->watchdog_lock);
        return -EEXIST;
    }

    if ( timeout == 0 )
    {
        stop_timer(&d->watchdog_timer[id]);
        clear_bit(id, &d->watchdog_inuse_map);
    }
    else
    {
        set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
    }

    spin_unlock(&d->watchdog_lock);
    return 0;
}

void watchdog_domain_init(struct domain *d)
{
    unsigned int i;

    spin_lock_init(&d->watchdog_lock);

    d->watchdog_inuse_map = 0;

    for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
        init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0);
}

void watchdog_domain_destroy(struct domain *d)
{
    unsigned int i;

    for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
        kill_timer(&d->watchdog_timer[i]);
}

long do_sched_op_compat(int cmd, unsigned long arg)
{
    long ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id, arg);
        domain_shutdown(current->domain, (u8)arg);
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

typedef long ret_t;

#endif /* !COMPAT */

ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE(void) arg)
{
    ret_t ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        struct sched_shutdown sched_shutdown;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_shutdown, arg, 1) )
            break;

        ret = 0;
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id,
                 sched_shutdown.reason);
        domain_shutdown(current->domain, (u8)sched_shutdown.reason);

        break;
    }

    case SCHEDOP_shutdown_code:
    {
        struct sched_shutdown sched_shutdown;
        struct domain *d = current->domain;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_shutdown, arg, 1) )
            break;

        TRACE_3D(TRC_SCHED_SHUTDOWN_CODE,
                 d->domain_id, current->vcpu_id, sched_shutdown.reason);

        spin_lock(&d->shutdown_lock);
        if ( d->shutdown_code == -1 )
            d->shutdown_code = (u8)sched_shutdown.reason;
        spin_unlock(&d->shutdown_lock);

        ret = 0;
        break;
    }

    case SCHEDOP_poll:
    {
        struct sched_poll sched_poll;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_poll, arg, 1) )
            break;

        ret = do_poll(&sched_poll);

        break;
    }

    case SCHEDOP_remote_shutdown:
    {
        struct domain *d;
        struct sched_remote_shutdown sched_remote_shutdown;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
            break;

        ret = -ESRCH;
        d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
        if ( d == NULL )
            break;

        if ( !IS_PRIV_FOR(current->domain, d) )
        {
            rcu_unlock_domain(d);
            return -EPERM;
        }

        ret = xsm_schedop_shutdown(current->domain, d);
        if ( ret )
        {
            rcu_unlock_domain(d);
            return ret;
        }

        domain_shutdown(d, (u8)sched_remote_shutdown.reason);

        rcu_unlock_domain(d);
        ret = 0;

        break;
    }

    case SCHEDOP_watchdog:
    {
        struct sched_watchdog sched_watchdog;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_watchdog, arg, 1) )
            break;

        ret = domain_watchdog(
            current->domain, sched_watchdog.id, sched_watchdog.timeout);
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

#ifndef COMPAT

/* Per-vcpu oneshot-timer hypercall. */
long do_set_timer_op(s_time_t timeout)
{
    struct vcpu *v = current;
    s_time_t offset = timeout - NOW();

    if ( timeout == 0 )
    {
        stop_timer(&v->singleshot_timer);
    }
    else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
              unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
    {
        /*
         * Linux workaround: occasionally we will see timeouts a long way in
         * the future due to wrapping in Linux's jiffy time handling. We check
         * for timeouts wrapped negative, and for positive timeouts more than
         * about 13 days in the future (2^50ns). The correct fix is to trigger
         * an interrupt immediately (since Linux in fact has pending work to
         * do in this situation). However, older guests also set a long timeout
         * when they have *no* pending timers at all: setting an immediate
         * timeout in this case can burn a lot of CPU. We therefore go for a
         * reasonable middle ground of triggering a timer event in 100ms.
         */
        gdprintk(XENLOG_INFO,
                 "Warning: huge timeout set by vcpu %d: %"PRIx64"\n",
                 v->vcpu_id, (uint64_t)timeout);
        set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
    }
    else
    {
        migrate_timer(&v->singleshot_timer, smp_processor_id());
        set_timer(&v->singleshot_timer, timeout);
    }

    return 0;
}

/* sched_id - fetch ID of current scheduler */
int sched_id(void)
{
    return ops.sched_id;
}

/* Adjust scheduling parameter for a given domain. */
long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
{
    struct vcpu *v;
    long ret;

    if ( (op->sched_id != DOM2OP(d)->sched_id) ||
         ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
          (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
        return -EINVAL;

    /*
     * Most VCPUs we can simply pause. If we are adjusting this VCPU then
     * we acquire the local schedule_lock to guard against concurrent updates.
     *
     * We only acquire the local schedule lock after we have paused all other
     * VCPUs in this domain. There are two reasons for this:
     * 1- We don't want to hold up interrupts as pausing a VCPU can
     *    trigger a tlb shootdown.
     * 2- Pausing other VCPUs involves briefly locking the schedule
     *    lock of the CPU they are running on. This CPU could be the
     *    same as ours.
     */

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_pause(v);
    }

    if ( d == current->domain )
        vcpu_schedule_lock_irq(current);

    if ( (ret = SCHED_OP(DOM2OP(d), adjust, d, op)) == 0 )
        TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);

    if ( d == current->domain )
        vcpu_schedule_unlock_irq(current);

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_unpause(v);
    }

    return ret;
}
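
/* Adjust global scheduling parameters of a cpupool's scheduler. */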
long sched_adjust_global(struct xen_sysctl_scheduler_op *op)
{
    struct cpupool *pool;
    int rc;

    if ( (op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
         (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo) )
        return -EINVAL;

    pool = cpupool_get_by_id(op->cpupool_id);
    if ( pool == NULL )
        return -ESRCH;

    if ( op->sched_id != pool->sched->sched_id )
    {
        cpupool_put(pool);
        return -EINVAL;
    }

    rc = SCHED_OP(pool->sched, adjust_global, op);

    cpupool_put(pool);

    return rc;
}
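
/*
 * Deliver a periodic timer event to the vCPU if one is due, then re-arm the
 * periodic timer on the current pCPU for the next period.
 */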
static void vcpu_periodic_timer_work(struct vcpu *v)
{
    s_time_t now = NOW();
    s_time_t periodic_next_event;

    if ( v->periodic_period == 0 )
        return;

    periodic_next_event = v->periodic_last_event + v->periodic_period;

    if ( now >= periodic_next_event )
    {
        send_timer_event(v);
        v->periodic_last_event = now;
        periodic_next_event = now + v->periodic_period;
    }

    migrate_timer(&v->periodic_timer, smp_processor_id());
    set_timer(&v->periodic_timer, periodic_next_event);
}

/*
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 */
static void schedule(void)
{
    struct vcpu          *prev = current, *next = NULL;
    s_time_t              now = NOW();
    struct scheduler     *sched = this_cpu(scheduler);
    unsigned long        *tasklet_work = &this_cpu(tasklet_work_to_do);
    bool_t                tasklet_work_scheduled = 0;
    struct schedule_data *sd;
    struct task_slice     next_slice;

    ASSERT(!in_irq());
    ASSERT(this_cpu(mc_state).flags == 0);

    perfc_incr(sched_run);

    sd = &this_cpu(schedule_data);

    /* Update tasklet scheduling status. */
    switch ( *tasklet_work )
    {
    case TASKLET_enqueued:
        set_bit(_TASKLET_scheduled, tasklet_work);
    case TASKLET_enqueued|TASKLET_scheduled:
        tasklet_work_scheduled = 1;
        break;
    case TASKLET_scheduled:
        clear_bit(_TASKLET_scheduled, tasklet_work);
    case 0:
        /*tasklet_work_scheduled = 0;*/
        break;
    default:
        BUG();
    }

    spin_lock_irq(sd->schedule_lock);

    stop_timer(&sd->s_timer);

    /* get policy-specific decision on scheduling... */
    next_slice = sched->do_schedule(sched, now, tasklet_work_scheduled);

    next = next_slice.task;

    sd->curr = next;

    if ( next_slice.time >= 0 ) /* -ve means no limit */
        set_timer(&sd->s_timer, now + next_slice.time);

    if ( unlikely(prev == next) )
    {
        spin_unlock_irq(sd->schedule_lock);
        trace_continue_running(next);
        return continue_running(prev);
    }

    TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
             prev->domain->domain_id,
             now - prev->runstate.state_entry_time);
    TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
             next->domain->domain_id,
             (next->runstate.state == RUNSTATE_runnable) ?
             (now - next->runstate.state_entry_time) : 0,
             next_slice.time);

    ASSERT(prev->runstate.state == RUNSTATE_running);

    TRACE_4D(TRC_SCHED_SWITCH,
             prev->domain->domain_id, prev->vcpu_id,
             next->domain->domain_id, next->vcpu_id);

    vcpu_runstate_change(
        prev,
        (test_bit(_VPF_blocked, &prev->pause_flags) ? RUNSTATE_blocked :
         (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
        now);
    prev->last_run_time = now;

    ASSERT(next->runstate.state != RUNSTATE_running);
    vcpu_runstate_change(next, RUNSTATE_running, now);

    /*
     * NB. Don't add any trace records from here until the actual context
     * switch, else lost_records resume will not work properly.
     */

    ASSERT(!next->is_running);
    next->is_running = 1;

    spin_unlock_irq(sd->schedule_lock);

    perfc_incr(sched_ctx);

    stop_timer(&prev->periodic_timer);

    if ( next_slice.migrated )
        evtchn_move_pirqs(next);

    /* Ensure that the domain has an up-to-date time base. */
    update_vcpu_system_time(next);
    vcpu_periodic_timer_work(next);

    context_switch(prev, next);
}
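
/*
 * Called once the outgoing vCPU's state has been saved: clear is_running,
 * notify the scheduler, and finish any migration that was requested while
 * the vCPU was still running.
 */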
void context_saved(struct vcpu *prev)
{
    /* Clear running flag /after/ writing context to memory. */
    smp_wmb();

    prev->is_running = 0;

    /* Check for migration request /after/ clearing running flag. */
    smp_mb();

    SCHED_OP(VCPU2OP(prev), context_saved, prev);

    if ( unlikely(test_bit(_VPF_migrating, &prev->pause_flags)) )
        vcpu_migrate(prev);
}

/* The scheduler timer: force a run through the scheduler */
static void s_timer_fn(void *unused)
{
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incr(sched_irq);
}

/* Per-VCPU periodic timer function: sends a virtual timer interrupt. */
static void vcpu_periodic_timer_fn(void *data)
{
    struct vcpu *v = data;
    vcpu_periodic_timer_work(v);
}

/* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */
static void vcpu_singleshot_timer_fn(void *data)
{
    struct vcpu *v = data;
    send_timer_event(v);
}

/* SCHEDOP_poll timeout callback. */
static void poll_timer_fn(void *data)
{
    struct vcpu *v = data;

    if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
        vcpu_unblock(v);
}
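
/*
 * Bring up per-CPU scheduler state: default scheduler, schedule lock,
 * scheduler timer, idle vCPU and, if the scheduler requires it, per-CPU
 * private data. Undone by cpu_schedule_down() below.
 */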
static int cpu_schedule_up(unsigned int cpu)
{
    struct schedule_data *sd = &per_cpu(schedule_data, cpu);

    per_cpu(scheduler, cpu) = &ops;
    spin_lock_init(&sd->_lock);
    sd->schedule_lock = &sd->_lock;
    sd->curr = idle_vcpu[cpu];
    init_timer(&sd->s_timer, s_timer_fn, NULL, cpu);
    atomic_set(&sd->urgent_count, 0);

    /* Boot CPU is dealt with later in scheduler_init(). */
    if ( cpu == 0 )
        return 0;

    if ( idle_vcpu[cpu] == NULL )
        alloc_vcpu(idle_vcpu[0]->domain, cpu, cpu);
    if ( idle_vcpu[cpu] == NULL )
        return -ENOMEM;

    if ( (ops.alloc_pdata != NULL) &&
         ((sd->sched_priv = ops.alloc_pdata(&ops, cpu)) == NULL) )
        return -ENOMEM;

    return 0;
}

static void cpu_schedule_down(unsigned int cpu)
{
    struct schedule_data *sd = &per_cpu(schedule_data, cpu);

    if ( sd->sched_priv != NULL )
        SCHED_OP(&ops, free_pdata, sd->sched_priv, cpu);

    kill_timer(&sd->s_timer);
}

static int cpu_schedule_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;
    int rc = 0;

    switch ( action )
    {
    case CPU_UP_PREPARE:
        rc = cpu_schedule_up(cpu);
        break;
    case CPU_UP_CANCELED:
    case CPU_DEAD:
        cpu_schedule_down(cpu);
        break;
    default:
        break;
    }

    return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
}

static struct notifier_block cpu_schedule_nfb = {
    .notifier_call = cpu_schedule_callback
};

/* Initialise the data structures. */
void __init scheduler_init(void)
{
    struct domain *idle_domain;
    int i;

    open_softirq(SCHEDULE_SOFTIRQ, schedule);

    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    if ( schedulers[i] == NULL )
    {
        printk("Could not find scheduler: %s\n", opt_sched);
        ops = *schedulers[0];
    }

    if ( cpu_schedule_up(0) )
        BUG();
    register_cpu_notifier(&cpu_schedule_nfb);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
    if ( SCHED_OP(&ops, init) )
        panic("scheduler returned error on init\n");

    idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0);
    BUG_ON(idle_domain == NULL);
    idle_domain->vcpu = idle_vcpu;
    idle_domain->max_vcpus = NR_CPUS;
    if ( alloc_vcpu(idle_domain, 0, 0) == NULL )
        BUG();
    if ( ops.alloc_pdata &&
         !(this_cpu(schedule_data).sched_priv = ops.alloc_pdata(&ops, 0)) )
        BUG();
}
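
/*
 * Move a pCPU to the scheduler of cpupool c (or back to the default
 * scheduler if c is NULL): new per-CPU and idle-vCPU data are allocated
 * first, swapped in under the schedule lock, and the old data freed
 * afterwards.
 */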
void schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
{
    unsigned long flags;
    struct vcpu *idle;
    void *ppriv, *ppriv_old, *vpriv, *vpriv_old;
    struct scheduler *old_ops = per_cpu(scheduler, cpu);
    struct scheduler *new_ops = (c == NULL) ? &ops : c->sched;

    if ( old_ops == new_ops )
        return;

    idle = idle_vcpu[cpu];
    ppriv = SCHED_OP(new_ops, alloc_pdata, cpu);
    vpriv = SCHED_OP(new_ops, alloc_vdata, idle, idle->domain->sched_priv);

    spin_lock_irqsave(per_cpu(schedule_data, cpu).schedule_lock, flags);

    SCHED_OP(old_ops, tick_suspend, cpu);
    vpriv_old = idle->sched_priv;
    idle->sched_priv = vpriv;
    per_cpu(scheduler, cpu) = new_ops;
    ppriv_old = per_cpu(schedule_data, cpu).sched_priv;
    per_cpu(schedule_data, cpu).sched_priv = ppriv;
    SCHED_OP(new_ops, tick_resume, cpu);
    SCHED_OP(new_ops, insert_vcpu, idle);

    spin_unlock_irqrestore(per_cpu(schedule_data, cpu).schedule_lock, flags);

    /* Free the *old* private data; vpriv is now live as idle->sched_priv. */
    SCHED_OP(old_ops, free_vdata, vpriv_old);
    SCHED_OP(old_ops, free_pdata, ppriv_old, cpu);
}

struct scheduler *scheduler_get_default(void)
{
    return &ops;
}

struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr)
{
    int i;
    struct scheduler *sched;

    for ( i = 0; schedulers[i] != NULL; i++ )
        if ( schedulers[i]->sched_id == sched_id )
            goto found;
    *perr = -ENOENT;
    return NULL;

 found:
    *perr = -ENOMEM;
    if ( (sched = xmalloc(struct scheduler)) == NULL )
        return NULL;
    memcpy(sched, schedulers[i], sizeof(*sched));
    if ( (*perr = SCHED_OP(sched, init)) != 0 )
    {
        xfree(sched);
        sched = NULL;
    }

    return sched;
}

void scheduler_free(struct scheduler *sched)
{
    BUG_ON(sched == &ops);
    SCHED_OP(sched, deinit);
    xfree(sched);
}

void schedule_dump(struct cpupool *c)
{
    int               i;
    struct scheduler *sched;
    cpumask_t        *cpus;

    sched = (c == NULL) ? &ops : c->sched;
    cpus = (c == NULL) ? &cpupool_free_cpus : &c->cpu_valid;
    printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
    SCHED_OP(sched, dump_settings);

    for_each_cpu_mask (i, *cpus)
    {
        spin_lock(per_cpu(schedule_data, i).schedule_lock);
        printk("CPU[%02d] ", i);
        SCHED_OP(sched, dump_cpu_state, i);
        spin_unlock(per_cpu(schedule_data, i).schedule_lock);
    }
}

void sched_tick_suspend(void)
{
    struct scheduler *sched;
    unsigned int cpu = smp_processor_id();

    sched = per_cpu(scheduler, cpu);
    SCHED_OP(sched, tick_suspend, cpu);
}

void sched_tick_resume(void)
{
    struct scheduler *sched;
    unsigned int cpu = smp_processor_id();

    sched = per_cpu(scheduler, cpu);
    SCHED_OP(sched, tick_resume, cpu);
}

#ifdef CONFIG_COMPAT
#include "compat/schedule.c"
#endif

#endif /* !COMPAT */

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */