xen/common/schedule.c @ 22855:1d1eec7e1fb4

/****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */

#ifndef COMPAT
#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <xen/mm.h>
#include <xen/errno.h>
#include <xen/guest_access.h>
#include <xen/multicall.h>
#include <xen/cpu.h>
#include <xen/preempt.h>
#include <public/sched.h>
#include <xsm/xsm.h>

/* opt_sched: scheduler - default to credit */
static char __initdata opt_sched[10] = "credit";
string_param("sched", opt_sched);
/*
 * If sched_smt_power_savings is set, the scheduler will give preference to a
 * partially idle package over a fully idle package when picking a pCPU on
 * which to schedule a vCPU.
 */
bool_t sched_smt_power_savings = 0;
boolean_param("sched_smt_power_savings", sched_smt_power_savings);
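
/*
 * Illustrative note (not part of the original file): these two parameters are
 * consumed from the hypervisor command line.  A sketch of how they are
 * typically set, assuming the per-scheduler opt_name strings ("credit",
 * "credit2", "sedf", "arinc653") match the entries in schedulers[] below:
 *
 *   xen.gz sched=credit2 sched_smt_power_savings=1
 *
 * If the name does not match any scheduler, scheduler_init() below prints a
 * warning and falls back to schedulers[0].
 */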
/* Various timer handlers. */
static void s_timer_fn(void *unused);
static void vcpu_periodic_timer_fn(void *data);
static void vcpu_singleshot_timer_fn(void *data);
static void poll_timer_fn(void *data);

/* This is global for now so that private implementations can reach it */
DEFINE_PER_CPU(struct schedule_data, schedule_data);
DEFINE_PER_CPU(struct scheduler *, scheduler);

extern const struct scheduler sched_sedf_def;
extern const struct scheduler sched_credit_def;
extern const struct scheduler sched_credit2_def;
extern const struct scheduler sched_arinc653_def;
static const struct scheduler *schedulers[] = {
    &sched_sedf_def,
    &sched_credit_def,
    &sched_credit2_def,
    &sched_arinc653_def,
    NULL
};

static struct scheduler __read_mostly ops;

#define SCHED_OP(opsptr, fn, ...)                                          \
         (( (opsptr)->fn != NULL ) ? (opsptr)->fn(opsptr, ##__VA_ARGS__ )  \
          : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0 )

#define DOM2OP(_d)    (((_d)->cpupool == NULL) ? &ops : ((_d)->cpupool->sched))
#define VCPU2OP(_v)   (DOM2OP((_v)->domain))
#define VCPU2ONLINE(_v)                                                    \
         (((_v)->domain->cpupool == NULL) ? &cpu_online_map                \
         : &(_v)->domain->cpupool->cpu_valid)
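
/*
 * Illustrative note (not part of the original file): SCHED_OP() guards every
 * per-scheduler hook against a NULL pointer, so schedulers may leave optional
 * hooks unset.  For example, a call such as
 *
 *   SCHED_OP(VCPU2OP(v), wake, v);
 *
 * expands roughly to
 *
 *   ( (VCPU2OP(v)->wake != NULL)
 *     ? VCPU2OP(v)->wake(VCPU2OP(v), v)
 *     : (typeof(VCPU2OP(v)->wake(VCPU2OP(v), v)))0 );
 *
 * i.e. a missing hook behaves as a no-op returning zero of the right type.
 */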
static inline void trace_runstate_change(struct vcpu *v, int new_state)
{
    struct { uint32_t vcpu:16, domain:16; } d;
    uint32_t event;

    if ( likely(!tb_init_done) )
        return;

    d.vcpu = v->vcpu_id;
    d.domain = v->domain->domain_id;

    event = TRC_SCHED_RUNSTATE_CHANGE;
    event |= ( v->runstate.state & 0x3 ) << 8;
    event |= ( new_state & 0x3 ) << 4;

    __trace_var(event, 1/*tsc*/, sizeof(d), &d);
}

static inline void trace_continue_running(struct vcpu *v)
{
    struct { uint32_t vcpu:16, domain:16; } d;

    if ( likely(!tb_init_done) )
        return;

    d.vcpu = v->vcpu_id;
    d.domain = v->domain->domain_id;

    __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d);
}

static inline void vcpu_urgent_count_update(struct vcpu *v)
{
    if ( is_idle_vcpu(v) )
        return;

    if ( unlikely(v->is_urgent) )
    {
        if ( !test_bit(_VPF_blocked, &v->pause_flags) ||
             !test_bit(v->vcpu_id, v->domain->poll_mask) )
        {
            v->is_urgent = 0;
            atomic_dec(&per_cpu(schedule_data,v->processor).urgent_count);
        }
    }
    else
    {
        if ( unlikely(test_bit(_VPF_blocked, &v->pause_flags) &&
                      test_bit(v->vcpu_id, v->domain->poll_mask)) )
        {
            v->is_urgent = 1;
            atomic_inc(&per_cpu(schedule_data,v->processor).urgent_count);
        }
    }
}

static inline void vcpu_runstate_change(
    struct vcpu *v, int new_state, s_time_t new_entry_time)
{
    s_time_t delta;

    ASSERT(v->runstate.state != new_state);
    ASSERT(spin_is_locked(per_cpu(schedule_data,v->processor).schedule_lock));

    vcpu_urgent_count_update(v);

    trace_runstate_change(v, new_state);

    delta = new_entry_time - v->runstate.state_entry_time;
    if ( delta > 0 )
    {
        v->runstate.time[v->runstate.state] += delta;
        v->runstate.state_entry_time = new_entry_time;
    }

    v->runstate.state = new_state;
}

void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
{
    s_time_t delta;

    if ( unlikely(v != current) )
        vcpu_schedule_lock_irq(v);

    memcpy(runstate, &v->runstate, sizeof(*runstate));
    delta = NOW() - runstate->state_entry_time;
    if ( delta > 0 )
        runstate->time[runstate->state] += delta;

    if ( unlikely(v != current) )
        vcpu_schedule_unlock_irq(v);
}

uint64_t get_cpu_idle_time(unsigned int cpu)
{
    struct vcpu_runstate_info state;
    struct vcpu *v;

    if ( (v = idle_vcpu[cpu]) == NULL )
        return 0;

    vcpu_runstate_get(v, &state);
    return state.time[RUNSTATE_running];
}
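
/*
 * Illustrative note (not part of the original file): get_cpu_idle_time()
 * returns a cumulative nanosecond count (the running time of the idle vcpu),
 * so a caller that wants a utilisation figure samples it twice.  A
 * hypothetical sketch:
 *
 *   uint64_t idle0 = get_cpu_idle_time(cpu);
 *   s_time_t t0    = NOW();
 *   ...some interval later...
 *   uint64_t busy_ns = (NOW() - t0) - (get_cpu_idle_time(cpu) - idle0);
 *
 * which is, in spirit, how per-CPU load figures reported to the toolstack
 * are derived.
 */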
int sched_init_vcpu(struct vcpu *v, unsigned int processor)
{
    struct domain *d = v->domain;

    /*
     * Initialize processor and affinity settings. The idler, and potentially
     * domain-0 VCPUs, are pinned onto their respective physical CPUs.
     */
    v->processor = processor;
    if ( is_idle_domain(d) || d->is_pinned )
        v->cpu_affinity = cpumask_of_cpu(processor);
    else
        cpus_setall(v->cpu_affinity);

    /* Initialise the per-vcpu timers. */
    init_timer(&v->periodic_timer, vcpu_periodic_timer_fn,
               v, v->processor);
    init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn,
               v, v->processor);
    init_timer(&v->poll_timer, poll_timer_fn,
               v, v->processor);

    /* Idle VCPUs are scheduled immediately. */
    if ( is_idle_domain(d) )
    {
        per_cpu(schedule_data, v->processor).curr = v;
        v->is_running = 1;
    }

    TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);

    v->sched_priv = SCHED_OP(DOM2OP(d), alloc_vdata, v, d->sched_priv);
    if ( v->sched_priv == NULL )
        return 1;

    SCHED_OP(VCPU2OP(v), insert_vcpu, v);

    return 0;
}

int sched_move_domain(struct domain *d, struct cpupool *c)
{
    struct vcpu *v;
    unsigned int new_p;
    void **vcpu_priv;
    void *domdata;

    domdata = SCHED_OP(c->sched, alloc_domdata, d);
    if ( domdata == NULL )
        return -ENOMEM;

    vcpu_priv = xmalloc_array(void *, d->max_vcpus);
    if ( vcpu_priv == NULL )
    {
        SCHED_OP(c->sched, free_domdata, domdata);
        return -ENOMEM;
    }

    memset(vcpu_priv, 0, d->max_vcpus * sizeof(void *));
    for_each_vcpu ( d, v )
    {
        vcpu_priv[v->vcpu_id] = SCHED_OP(c->sched, alloc_vdata, v, domdata);
        if ( vcpu_priv[v->vcpu_id] == NULL )
        {
            for_each_vcpu ( d, v )
            {
                if ( vcpu_priv[v->vcpu_id] != NULL )
                    xfree(vcpu_priv[v->vcpu_id]);
            }
            xfree(vcpu_priv);
            SCHED_OP(c->sched, free_domdata, domdata);
            return -ENOMEM;
        }
    }

    domain_pause(d);

    new_p = first_cpu(c->cpu_valid);
    for_each_vcpu ( d, v )
    {
        migrate_timer(&v->periodic_timer, new_p);
        migrate_timer(&v->singleshot_timer, new_p);
        migrate_timer(&v->poll_timer, new_p);

        SCHED_OP(VCPU2OP(v), remove_vcpu, v);
        SCHED_OP(VCPU2OP(v), free_vdata, v->sched_priv);

        cpus_setall(v->cpu_affinity);
        v->processor = new_p;
        v->sched_priv = vcpu_priv[v->vcpu_id];
        evtchn_move_pirqs(v);

        new_p = cycle_cpu(new_p, c->cpu_valid);

        SCHED_OP(VCPU2OP(v), insert_vcpu, v);
    }
    domain_update_node_affinity(d);

    d->cpupool = c;
    SCHED_OP(DOM2OP(d), free_domdata, d->sched_priv);
    d->sched_priv = domdata;

    domain_unpause(d);

    xfree(vcpu_priv);

    return 0;
}

void sched_destroy_vcpu(struct vcpu *v)
{
    kill_timer(&v->periodic_timer);
    kill_timer(&v->singleshot_timer);
    kill_timer(&v->poll_timer);
    if ( test_and_clear_bool(v->is_urgent) )
        atomic_dec(&per_cpu(schedule_data, v->processor).urgent_count);
    SCHED_OP(VCPU2OP(v), remove_vcpu, v);
    SCHED_OP(VCPU2OP(v), free_vdata, v->sched_priv);
}

int sched_init_domain(struct domain *d)
{
    return SCHED_OP(DOM2OP(d), init_domain, d);
}

void sched_destroy_domain(struct domain *d)
{
    SCHED_OP(DOM2OP(d), destroy_domain, d);
}

void vcpu_sleep_nosync(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(!vcpu_runnable(v)) )
    {
        if ( v->runstate.state == RUNSTATE_runnable )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());

        SCHED_OP(VCPU2OP(v), sleep, v);
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
}

void vcpu_sleep_sync(struct vcpu *v)
{
    vcpu_sleep_nosync(v);

    while ( !vcpu_runnable(v) && v->is_running )
        cpu_relax();

    sync_vcpu_execstate(v);
}

void vcpu_wake(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(vcpu_runnable(v)) )
    {
        if ( v->runstate.state >= RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
        SCHED_OP(VCPU2OP(v), wake, v);
    }
    else if ( !test_bit(_VPF_blocked, &v->pause_flags) )
    {
        if ( v->runstate.state == RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
}

void vcpu_unblock(struct vcpu *v)
{
    if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
        return;

    /* Polling period ends when a VCPU is unblocked. */
    if ( unlikely(v->poll_evtchn != 0) )
    {
        v->poll_evtchn = 0;
        /*
         * We *must* re-clear _VPF_blocked to avoid racing other wakeups of
         * this VCPU (and it then going back to sleep on poll_mask).
         * Test-and-clear is idiomatic and ensures clear_bit not reordered.
         */
        if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
            clear_bit(_VPF_blocked, &v->pause_flags);
    }

    vcpu_wake(v);
}

static void vcpu_migrate(struct vcpu *v)
{
    unsigned long flags;
    int old_cpu, new_cpu;

    vcpu_schedule_lock_irqsave(v, flags);

    /*
     * NB. Check of v->running happens /after/ setting migration flag
     * because they both happen in (different) spinlock regions, and those
     * regions are strictly serialised.
     */
    if ( v->is_running ||
         !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
    {
        vcpu_schedule_unlock_irqrestore(v, flags);
        return;
    }

    /* Select new CPU. */
    old_cpu = v->processor;
    new_cpu = SCHED_OP(VCPU2OP(v), pick_cpu, v);

    /*
     * Transfer urgency status to new CPU before switching CPUs, as once
     * the switch occurs, v->is_urgent is no longer protected by the per-CPU
     * scheduler lock we are holding.
     */
    if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) )
    {
        atomic_inc(&per_cpu(schedule_data, new_cpu).urgent_count);
        atomic_dec(&per_cpu(schedule_data, old_cpu).urgent_count);
    }
    /*
     * Switch to new CPU, then unlock old CPU.  This is safe because the lock
     * pointer can't change while the current lock is held.
     */
    v->processor = new_cpu;
    spin_unlock_irqrestore(
        per_cpu(schedule_data, old_cpu).schedule_lock, flags);

    if ( old_cpu != new_cpu )
        evtchn_move_pirqs(v);

    /* Wake on new CPU. */
    vcpu_wake(v);
}

/*
 * Force a VCPU through a deschedule/reschedule path.
 * For example, using this when setting the periodic timer period means that
 * most periodic-timer state need only be touched from within the scheduler
 * which can thus be done without need for synchronisation.
 */
void vcpu_force_reschedule(struct vcpu *v)
{
    vcpu_schedule_lock_irq(v);
    if ( v->is_running )
        set_bit(_VPF_migrating, &v->pause_flags);
    vcpu_schedule_unlock_irq(v);

    if ( test_bit(_VPF_migrating, &v->pause_flags) )
    {
        vcpu_sleep_nosync(v);
        vcpu_migrate(v);
    }
}

/*
 * This function is used by cpu_hotplug code from stop_machine context
 * and from cpupools to switch schedulers on a cpu.
 */
int cpu_disable_scheduler(unsigned int cpu)
{
    struct domain *d;
    struct vcpu *v;
    struct cpupool *c;
    int    ret = 0;
    bool_t affinity_broken;

    c = per_cpu(cpupool, cpu);
    if ( c == NULL )
        return ret;

    for_each_domain ( d )
    {
        if ( d->cpupool != c )
            continue;

        affinity_broken = 0;

        for_each_vcpu ( d, v )
        {
            vcpu_schedule_lock_irq(v);

            if ( (cpus_weight(v->cpu_affinity) == 1) &&
                 cpu_isset(cpu, v->cpu_affinity) )
            {
                printk("Breaking vcpu affinity for domain %d vcpu %d\n",
                       v->domain->domain_id, v->vcpu_id);
                cpus_setall(v->cpu_affinity);
                affinity_broken = 1;
            }

            if ( v->processor == cpu )
            {
                set_bit(_VPF_migrating, &v->pause_flags);
                vcpu_schedule_unlock_irq(v);
                vcpu_sleep_nosync(v);
                vcpu_migrate(v);
            }
            else
            {
                vcpu_schedule_unlock_irq(v);
            }
            /*
             * A vcpu active in the hypervisor will not be migratable.
             * The caller should try again after releasing and reacquiring
             * all locks.
             */
            if ( v->processor == cpu )
                ret = -EAGAIN;
        }

        if ( affinity_broken )
            domain_update_node_affinity(d);
    }

    return ret;
}

int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
{
    cpumask_t online_affinity, old_affinity;
    cpumask_t *online;

    if ( v->domain->is_pinned )
        return -EINVAL;
    online = VCPU2ONLINE(v);
    cpus_and(online_affinity, *affinity, *online);
    if ( cpus_empty(online_affinity) )
        return -EINVAL;

    vcpu_schedule_lock_irq(v);

    old_affinity = v->cpu_affinity;
    v->cpu_affinity = *affinity;
    *affinity = old_affinity;
    if ( !cpu_isset(v->processor, v->cpu_affinity) )
        set_bit(_VPF_migrating, &v->pause_flags);

    vcpu_schedule_unlock_irq(v);

    domain_update_node_affinity(v->domain);

    if ( test_bit(_VPF_migrating, &v->pause_flags) )
    {
        vcpu_sleep_nosync(v);
        vcpu_migrate(v);
    }

    return 0;
}
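
/*
 * Illustrative note (not part of the original file): vcpu_set_affinity()
 * hands the previous mask back through *affinity (the swap is visible above),
 * so a caller that wants to restore it later can do, roughly:
 *
 *   cpumask_t mask = new_mask;                // hypothetical local names
 *   if ( vcpu_set_affinity(v, &mask) == 0 )
 *       saved_mask = mask;                    // mask now holds the old affinity
 */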
/* Block the currently-executing domain until a pertinent event occurs. */
static long do_block(void)
{
    struct vcpu *v = current;

    local_event_delivery_enable();
    set_bit(_VPF_blocked, &v->pause_flags);

    /* Check for events /after/ blocking: avoids wakeup waiting race. */
    if ( local_events_need_delivery() )
    {
        clear_bit(_VPF_blocked, &v->pause_flags);
    }
    else
    {
        TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
        raise_softirq(SCHEDULE_SOFTIRQ);
    }

    return 0;
}

static long do_poll(struct sched_poll *sched_poll)
{
    struct vcpu   *v = current;
    struct domain *d = v->domain;
    evtchn_port_t  port;
    long           rc;
    unsigned int   i;

    /* Fairly arbitrary limit. */
    if ( sched_poll->nr_ports > 128 )
        return -EINVAL;

    if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
        return -EFAULT;

    set_bit(_VPF_blocked, &v->pause_flags);
    v->poll_evtchn = -1;
    set_bit(v->vcpu_id, d->poll_mask);

#ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
    /* Check for events /after/ setting flags: avoids wakeup waiting race. */
    smp_mb();

    /*
     * Someone may have seen we are blocked but not that we are polling, or
     * vice versa. We are certainly being woken, so clean up and bail. Beyond
     * this point others can be guaranteed to clean up for us if they wake us.
     */
    rc = 0;
    if ( (v->poll_evtchn == 0) ||
         !test_bit(_VPF_blocked, &v->pause_flags) ||
         !test_bit(v->vcpu_id, d->poll_mask) )
        goto out;
#endif

    rc = 0;
    if ( local_events_need_delivery() )
        goto out;

    for ( i = 0; i < sched_poll->nr_ports; i++ )
    {
        rc = -EFAULT;
        if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
            goto out;

        rc = -EINVAL;
        if ( port >= MAX_EVTCHNS(d) )
            goto out;

        rc = 0;
        if ( test_bit(port, &shared_info(d, evtchn_pending)) )
            goto out;
    }

    if ( sched_poll->nr_ports == 1 )
        v->poll_evtchn = port;

    if ( sched_poll->timeout != 0 )
        set_timer(&v->poll_timer, sched_poll->timeout);

    TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
    raise_softirq(SCHEDULE_SOFTIRQ);

    return 0;

 out:
    v->poll_evtchn = 0;
    clear_bit(v->vcpu_id, d->poll_mask);
    clear_bit(_VPF_blocked, &v->pause_flags);
    return rc;
}
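
/*
 * Illustrative note (not part of the original file): from the guest side,
 * SCHEDOP_poll is reached via the sched_op hypercall with a struct sched_poll
 * whose fields match the accesses above (ports, nr_ports, timeout).  A sketch
 * of a guest-kernel caller, assuming the usual HYPERVISOR_sched_op() wrapper:
 *
 *   evtchn_port_t port = ...;                      // port to wait on
 *   struct sched_poll poll = { .nr_ports = 1, .timeout = 0 };
 *   set_xen_guest_handle(poll.ports, &port);
 *   HYPERVISOR_sched_op(SCHEDOP_poll, &poll);      // returns when the port
 *                                                  // is pending or on timeout
 *
 * With nr_ports == 1 the code above also latches the port in v->poll_evtchn,
 * so later activity on that event channel can wake the poller directly.
 */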
/* Voluntarily yield the processor for this allocation. */
static long do_yield(void)
{
    struct vcpu * v=current;

    vcpu_schedule_lock_irq(v);
    SCHED_OP(VCPU2OP(v), yield, v);
    vcpu_schedule_unlock_irq(v);

    TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
    raise_softirq(SCHEDULE_SOFTIRQ);
    return 0;
}
static void domain_watchdog_timeout(void *data)
{
    struct domain *d = data;

    if ( d->is_shutting_down || d->is_dying )
        return;

    printk("Watchdog timer fired for domain %u\n", d->domain_id);
    domain_shutdown(d, SHUTDOWN_watchdog);
}

static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout)
{
    if ( id > NR_DOMAIN_WATCHDOG_TIMERS )
        return -EINVAL;

    spin_lock(&d->watchdog_lock);

    if ( id == 0 )
    {
        for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ )
        {
            if ( test_and_set_bit(id, &d->watchdog_inuse_map) )
                continue;
            set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
            break;
        }
        spin_unlock(&d->watchdog_lock);
        return id == NR_DOMAIN_WATCHDOG_TIMERS ? -EEXIST : id + 1;
    }

    id -= 1;
    if ( !test_bit(id, &d->watchdog_inuse_map) )
    {
        spin_unlock(&d->watchdog_lock);
        return -EEXIST;
    }

    if ( timeout == 0 )
    {
        stop_timer(&d->watchdog_timer[id]);
        clear_bit(id, &d->watchdog_inuse_map);
    }
    else
    {
        set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
    }

    spin_unlock(&d->watchdog_lock);
    return 0;
}
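
/*
 * Illustrative note (not part of the original file): the id argument encodes
 * a small protocol, visible above.  id == 0 allocates a free watchdog slot,
 * arms it, and returns the slot number plus one; a non-zero id re-arms (or,
 * with timeout == 0, disarms) the previously returned slot.  A sketch of a
 * guest using SCHEDOP_watchdog, assuming a HYPERVISOR_sched_op() wrapper
 * (timeouts are in seconds, per SECONDS() above):
 *
 *   struct sched_watchdog wd = { .id = 0, .timeout = 30 };
 *   int id = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wd);   // new watchdog
 *   ...
 *   wd.id = id; wd.timeout = 30;
 *   HYPERVISOR_sched_op(SCHEDOP_watchdog, &wd);            // kick it
 *
 * Failing to re-arm in time shuts the domain down with SHUTDOWN_watchdog
 * (see domain_watchdog_timeout() above).
 */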
void watchdog_domain_init(struct domain *d)
{
    unsigned int i;

    spin_lock_init(&d->watchdog_lock);

    d->watchdog_inuse_map = 0;

    for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
        init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0);
}

void watchdog_domain_destroy(struct domain *d)
{
    unsigned int i;

    for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
        kill_timer(&d->watchdog_timer[i]);
}

long do_sched_op_compat(int cmd, unsigned long arg)
{
    long ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id, arg);
        domain_shutdown(current->domain, (u8)arg);
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}
typedef long ret_t;

#endif /* !COMPAT */

ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE(void) arg)
{
    ret_t ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        struct sched_shutdown sched_shutdown;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_shutdown, arg, 1) )
            break;

        ret = 0;
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id,
                 sched_shutdown.reason);
        domain_shutdown(current->domain, (u8)sched_shutdown.reason);

        break;
    }

    case SCHEDOP_shutdown_code:
    {
        struct sched_shutdown sched_shutdown;
        struct domain *d = current->domain;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_shutdown, arg, 1) )
            break;

        TRACE_3D(TRC_SCHED_SHUTDOWN_CODE,
                 d->domain_id, current->vcpu_id, sched_shutdown.reason);

        spin_lock(&d->shutdown_lock);
        if ( d->shutdown_code == -1 )
            d->shutdown_code = (u8)sched_shutdown.reason;
        spin_unlock(&d->shutdown_lock);

        ret = 0;
        break;
    }

    case SCHEDOP_poll:
    {
        struct sched_poll sched_poll;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_poll, arg, 1) )
            break;

        ret = do_poll(&sched_poll);

        break;
    }

    case SCHEDOP_remote_shutdown:
    {
        struct domain *d;
        struct sched_remote_shutdown sched_remote_shutdown;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
            break;

        ret = -ESRCH;
        d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
        if ( d == NULL )
            break;

        if ( !IS_PRIV_FOR(current->domain, d) )
        {
            rcu_unlock_domain(d);
            return -EPERM;
        }

        ret = xsm_schedop_shutdown(current->domain, d);
        if ( ret )
        {
            rcu_unlock_domain(d);
            return ret;
        }

        domain_shutdown(d, (u8)sched_remote_shutdown.reason);

        rcu_unlock_domain(d);
        ret = 0;

        break;
    }

    case SCHEDOP_watchdog:
    {
        struct sched_watchdog sched_watchdog;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_watchdog, arg, 1) )
            break;

        ret = domain_watchdog(
            current->domain, sched_watchdog.id, sched_watchdog.timeout);
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}
#ifndef COMPAT

/* Per-vcpu oneshot-timer hypercall. */
long do_set_timer_op(s_time_t timeout)
{
    struct vcpu *v = current;
    s_time_t offset = timeout - NOW();

    if ( timeout == 0 )
    {
        stop_timer(&v->singleshot_timer);
    }
    else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
              unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
    {
        /*
         * Linux workaround: occasionally we will see timeouts a long way in
         * the future due to wrapping in Linux's jiffy time handling. We check
         * for timeouts wrapped negative, and for positive timeouts more than
         * about 13 days in the future (2^50ns). The correct fix is to trigger
         * an interrupt immediately (since Linux in fact has pending work to
         * do in this situation). However, older guests also set a long timeout
         * when they have *no* pending timers at all: setting an immediate
         * timeout in this case can burn a lot of CPU. We therefore go for a
         * reasonable middleground of triggering a timer event in 100ms.
         */
        gdprintk(XENLOG_INFO,
                 "Warning: huge timeout set by vcpu %d: %"PRIx64"\n",
                 v->vcpu_id, (uint64_t)timeout);
        set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
    }
    else
    {
        migrate_timer(&v->singleshot_timer, smp_processor_id());
        set_timer(&v->singleshot_timer, timeout);
    }

    return 0;
}
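
/*
 * Illustrative note (not part of the original file): the "about 13 days"
 * figure in the comment above is simply 2^50 nanoseconds:
 *
 *   2^50 ns ~= 1.126e15 ns ~= 1.126e6 s ~= 13.03 days
 *
 * so the (offset >> 50) != 0 test fires for any future timeout beyond roughly
 * that horizon.
 */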
/* sched_id - fetch ID of current scheduler */
int sched_id(void)
{
    return ops.sched_id;
}

/* Adjust scheduling parameter for a given domain. */
long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
{
    struct vcpu *v;
    long ret;

    if ( (op->sched_id != DOM2OP(d)->sched_id) ||
         ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
          (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
        return -EINVAL;

    /*
     * Most VCPUs we can simply pause. If we are adjusting this VCPU then
     * we acquire the local schedule_lock to guard against concurrent updates.
     *
     * We only acquire the local schedule lock after we have paused all other
     * VCPUs in this domain. There are two reasons for this:
     * 1- We don't want to hold up interrupts as pausing a VCPU can
     *    trigger a tlb shootdown.
     * 2- Pausing other VCPUs involves briefly locking the schedule
     *    lock of the CPU they are running on. This CPU could be the
     *    same as ours.
     */

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_pause(v);
    }

    if ( d == current->domain )
        vcpu_schedule_lock_irq(current);

    if ( (ret = SCHED_OP(DOM2OP(d), adjust, d, op)) == 0 )
        TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);

    if ( d == current->domain )
        vcpu_schedule_unlock_irq(current);

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_unpause(v);
    }

    return ret;
}
long sched_adjust_global(struct xen_sysctl_scheduler_op *op)
{
    struct cpupool *pool;
    int rc;

    if ( (op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
         (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo) )
        return -EINVAL;

    pool = cpupool_get_by_id(op->cpupool_id);
    if ( pool == NULL )
        return -ESRCH;

    rc = ((op->sched_id == pool->sched->sched_id)
          ? SCHED_OP(pool->sched, adjust_global, op) : -EINVAL);

    cpupool_put(pool);

    return rc;
}

static void vcpu_periodic_timer_work(struct vcpu *v)
{
    s_time_t now = NOW();
    s_time_t periodic_next_event;

    if ( v->periodic_period == 0 )
        return;

    periodic_next_event = v->periodic_last_event + v->periodic_period;

    if ( now >= periodic_next_event )
    {
        send_timer_event(v);
        v->periodic_last_event = now;
        periodic_next_event = now + v->periodic_period;
    }

    migrate_timer(&v->periodic_timer, smp_processor_id());
    set_timer(&v->periodic_timer, periodic_next_event);
}
/*
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 */
static void schedule(void)
{
    struct vcpu          *prev = current, *next = NULL;
    s_time_t              now = NOW();
    struct scheduler     *sched = this_cpu(scheduler);
    unsigned long        *tasklet_work = &this_cpu(tasklet_work_to_do);
    bool_t                tasklet_work_scheduled = 0;
    struct schedule_data *sd;
    struct task_slice     next_slice;

    ASSERT(!in_atomic());

    perfc_incr(sched_run);

    sd = &this_cpu(schedule_data);

    /* Update tasklet scheduling status. */
    switch ( *tasklet_work )
    {
    case TASKLET_enqueued:
        set_bit(_TASKLET_scheduled, tasklet_work);
    case TASKLET_enqueued|TASKLET_scheduled:
        tasklet_work_scheduled = 1;
        break;
    case TASKLET_scheduled:
        clear_bit(_TASKLET_scheduled, tasklet_work);
    case 0:
        /*tasklet_work_scheduled = 0;*/
        break;
    default:
        BUG();
    }

    spin_lock_irq(sd->schedule_lock);

    stop_timer(&sd->s_timer);

    /* get policy-specific decision on scheduling... */
    next_slice = sched->do_schedule(sched, now, tasklet_work_scheduled);

    next = next_slice.task;

    sd->curr = next;

    if ( next_slice.time >= 0 ) /* -ve means no limit */
        set_timer(&sd->s_timer, now + next_slice.time);

    if ( unlikely(prev == next) )
    {
        spin_unlock_irq(sd->schedule_lock);
        trace_continue_running(next);
        return continue_running(prev);
    }

    TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
             prev->domain->domain_id,
             now - prev->runstate.state_entry_time);
    TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
             next->domain->domain_id,
             (next->runstate.state == RUNSTATE_runnable) ?
             (now - next->runstate.state_entry_time) : 0,
             next_slice.time);

    ASSERT(prev->runstate.state == RUNSTATE_running);

    TRACE_4D(TRC_SCHED_SWITCH,
             prev->domain->domain_id, prev->vcpu_id,
             next->domain->domain_id, next->vcpu_id);

    vcpu_runstate_change(
        prev,
        (test_bit(_VPF_blocked, &prev->pause_flags) ? RUNSTATE_blocked :
         (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
        now);
    prev->last_run_time = now;

    ASSERT(next->runstate.state != RUNSTATE_running);
    vcpu_runstate_change(next, RUNSTATE_running, now);

    /*
     * NB. Don't add any trace records from here until the actual context
     * switch, else lost_records resume will not work properly.
     */

    ASSERT(!next->is_running);
    next->is_running = 1;

    spin_unlock_irq(sd->schedule_lock);

    perfc_incr(sched_ctx);

    stop_timer(&prev->periodic_timer);

    if ( next_slice.migrated )
        evtchn_move_pirqs(next);

    /* Ensure that the domain has an up-to-date time base. */
    update_vcpu_system_time(next);
    vcpu_periodic_timer_work(next);

    context_switch(prev, next);
}
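
/*
 * Illustrative note (not part of the original file): the tasklet_work switch
 * in schedule() implements a small state machine over the two bits in
 * this_cpu(tasklet_work_to_do):
 *
 *   enqueued            -> enqueued|scheduled   (flag = 1)
 *   enqueued|scheduled  -> unchanged            (flag = 1, work still pending)
 *   scheduled           -> 0                    (flag = 0, work drained)
 *   0                   -> 0                    (flag = 0)
 *
 * The resulting tasklet_work_scheduled flag is passed to the scheduler's
 * do_schedule() hook so it can pick the idle vcpu, which is where outstanding
 * tasklet work gets processed.
 */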
void context_saved(struct vcpu *prev)
{
    /* Clear running flag /after/ writing context to memory. */
    smp_wmb();

    prev->is_running = 0;

    /* Check for migration request /after/ clearing running flag. */
    smp_mb();

    SCHED_OP(VCPU2OP(prev), context_saved, prev);

    if ( unlikely(test_bit(_VPF_migrating, &prev->pause_flags)) )
        vcpu_migrate(prev);
}

/* The scheduler timer: force a run through the scheduler */
static void s_timer_fn(void *unused)
{
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incr(sched_irq);
}

/* Per-VCPU periodic timer function: sends a virtual timer interrupt. */
static void vcpu_periodic_timer_fn(void *data)
{
    struct vcpu *v = data;
    vcpu_periodic_timer_work(v);
}

/* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */
static void vcpu_singleshot_timer_fn(void *data)
{
    struct vcpu *v = data;
    send_timer_event(v);
}

/* SCHEDOP_poll timeout callback. */
static void poll_timer_fn(void *data)
{
    struct vcpu *v = data;

    if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
        vcpu_unblock(v);
}

static int cpu_schedule_up(unsigned int cpu)
{
    struct schedule_data *sd = &per_cpu(schedule_data, cpu);

    per_cpu(scheduler, cpu) = &ops;
    spin_lock_init(&sd->_lock);
    sd->schedule_lock = &sd->_lock;
    sd->curr = idle_vcpu[cpu];
    init_timer(&sd->s_timer, s_timer_fn, NULL, cpu);
    atomic_set(&sd->urgent_count, 0);

    /* Boot CPU is dealt with later in schedule_init(). */
    if ( cpu == 0 )
        return 0;

    if ( idle_vcpu[cpu] == NULL )
        alloc_vcpu(idle_vcpu[0]->domain, cpu, cpu);
    if ( idle_vcpu[cpu] == NULL )
        return -ENOMEM;

    if ( (ops.alloc_pdata != NULL) &&
         ((sd->sched_priv = ops.alloc_pdata(&ops, cpu)) == NULL) )
        return -ENOMEM;

    return 0;
}

static void cpu_schedule_down(unsigned int cpu)
{
    struct schedule_data *sd = &per_cpu(schedule_data, cpu);

    if ( sd->sched_priv != NULL )
        SCHED_OP(&ops, free_pdata, sd->sched_priv, cpu);

    kill_timer(&sd->s_timer);
}

static int cpu_schedule_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;
    int rc = 0;

    switch ( action )
    {
    case CPU_UP_PREPARE:
        rc = cpu_schedule_up(cpu);
        break;
    case CPU_UP_CANCELED:
    case CPU_DEAD:
        cpu_schedule_down(cpu);
        break;
    default:
        break;
    }

    return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
}

static struct notifier_block cpu_schedule_nfb = {
    .notifier_call = cpu_schedule_callback
};
/* Initialise the data structures. */
void __init scheduler_init(void)
{
    struct domain *idle_domain;
    int i;

    open_softirq(SCHEDULE_SOFTIRQ, schedule);

    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    if ( schedulers[i] == NULL )
    {
        printk("Could not find scheduler: %s\n", opt_sched);
        ops = *schedulers[0];
    }

    if ( cpu_schedule_up(0) )
        BUG();
    register_cpu_notifier(&cpu_schedule_nfb);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
    if ( SCHED_OP(&ops, init) )
        panic("scheduler returned error on init\n");

    idle_domain = domain_create(DOMID_IDLE, 0, 0);
    BUG_ON(idle_domain == NULL);
    idle_domain->vcpu = idle_vcpu;
    idle_domain->max_vcpus = NR_CPUS;
    if ( alloc_vcpu(idle_domain, 0, 0) == NULL )
        BUG();
    if ( ops.alloc_pdata &&
         !(this_cpu(schedule_data).sched_priv = ops.alloc_pdata(&ops, 0)) )
        BUG();
}

void schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
{
    unsigned long flags;
    struct vcpu *idle;
    void *ppriv, *ppriv_old, *vpriv, *vpriv_old;
    struct scheduler *old_ops = per_cpu(scheduler, cpu);
    struct scheduler *new_ops = (c == NULL) ? &ops : c->sched;

    if ( old_ops == new_ops )
        return;

    idle = idle_vcpu[cpu];
    ppriv = SCHED_OP(new_ops, alloc_pdata, cpu);
    vpriv = SCHED_OP(new_ops, alloc_vdata, idle, idle->domain->sched_priv);

    pcpu_schedule_lock_irqsave(cpu, flags);

    SCHED_OP(old_ops, tick_suspend, cpu);
    vpriv_old = idle->sched_priv;
    idle->sched_priv = vpriv;
    per_cpu(scheduler, cpu) = new_ops;
    ppriv_old = per_cpu(schedule_data, cpu).sched_priv;
    per_cpu(schedule_data, cpu).sched_priv = ppriv;
    SCHED_OP(new_ops, tick_resume, cpu);
    SCHED_OP(new_ops, insert_vcpu, idle);

    pcpu_schedule_unlock_irqrestore(cpu, flags);

    SCHED_OP(old_ops, free_vdata, vpriv_old);
    SCHED_OP(old_ops, free_pdata, ppriv_old, cpu);
}

struct scheduler *scheduler_get_default(void)
{
    return &ops;
}

struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr)
{
    int i;
    struct scheduler *sched;

    for ( i = 0; schedulers[i] != NULL; i++ )
        if ( schedulers[i]->sched_id == sched_id )
            goto found;
    *perr = -ENOENT;
    return NULL;

 found:
    *perr = -ENOMEM;
    if ( (sched = xmalloc(struct scheduler)) == NULL )
        return NULL;
    memcpy(sched, schedulers[i], sizeof(*sched));
    if ( (*perr = SCHED_OP(sched, init)) != 0 )
    {
        xfree(sched);
        sched = NULL;
    }

    return sched;
}

void scheduler_free(struct scheduler *sched)
{
    BUG_ON(sched == &ops);
    SCHED_OP(sched, deinit);
    xfree(sched);
}

void schedule_dump(struct cpupool *c)
{
    int               i;
    struct scheduler *sched;
    cpumask_t        *cpus;

    sched = (c == NULL) ? &ops : c->sched;
    cpus = (c == NULL) ? &cpupool_free_cpus : &c->cpu_valid;
    printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
    SCHED_OP(sched, dump_settings);

    for_each_cpu_mask (i, *cpus)
    {
        pcpu_schedule_lock(i);
        printk("CPU[%02d] ", i);
        SCHED_OP(sched, dump_cpu_state, i);
        pcpu_schedule_unlock(i);
    }
}

void sched_tick_suspend(void)
{
    struct scheduler *sched;
    unsigned int cpu = smp_processor_id();

    sched = per_cpu(scheduler, cpu);
    SCHED_OP(sched, tick_suspend, cpu);
}

void sched_tick_resume(void)
{
    struct scheduler *sched;
    unsigned int cpu = smp_processor_id();

    sched = per_cpu(scheduler, cpu);
    SCHED_OP(sched, tick_resume, cpu);
}

void wait(void)
{
    schedule();
}

#ifdef CONFIG_COMPAT
#include "compat/schedule.c"
#endif

#endif /* !COMPAT */
/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */