
annotate xen/common/schedule.c @ 22855:1d1eec7e1fb4

xl: Perform minimal validation of virtual disk file while parsing config file

This patch performs some very basic validation of the virtual disk
file passed through the config file. This validation ensures that we
don't go too far with initialization, such as spawning qemu, while
there are potentially fundamental issues with the configuration.

[ Patch fixed up to work with PHYSTYPE_EMPTY 22808:6ec61438713a -iwj ]

Signed-off-by: Kamala Narasimhan <kamala.narasimhan@citrix.com>
Acked-by: Ian Jackson <ian.jackson@eu.citrix.com>
Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
Committed-by: Ian Jackson <ian.jackson@eu.citrix.com>
author Kamala Narasimhan <kamala.narasimhan@gmail.com>
date Tue Jan 25 18:09:49 2011 +0000 (2011-01-25)
parents e8acb9753ff1
children 700ac6445812
rev   line source
kaf24@3952 1 /****************************************************************************
kaf24@756 2 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
kaf24@756 3 * (C) 2002-2003 University of Cambridge
mwilli2@1232 4 * (C) 2004 - Mark Williamson - Intel Research Cambridge
iap10@274 5 ****************************************************************************
iap10@274 6 *
kaf24@756 7 * File: common/schedule.c
kaf24@1098 8 * Author: Rolf Neugebauer & Keir Fraser
mwilli2@1232 9 * Updated for generic API by Mark Williamson
iap10@274 10 *
mwilli2@1232 11 * Description: Generic CPU scheduling code
mwilli2@1232 12 * implements support functionality for the Xen scheduler API.
mwilli2@1232 13 *
iap10@274 14 */
iap10@274 15
ack@13304 16 #ifndef COMPAT
kaf24@1248 17 #include <xen/config.h>
kaf24@1248 18 #include <xen/init.h>
kaf24@1248 19 #include <xen/lib.h>
kaf24@1248 20 #include <xen/sched.h>
cl349@5285 21 #include <xen/domain.h>
kaf24@1248 22 #include <xen/delay.h>
kaf24@1248 23 #include <xen/event.h>
kaf24@1248 24 #include <xen/time.h>
kaf24@8616 25 #include <xen/timer.h>
kaf24@1248 26 #include <xen/perfc.h>
kaf24@1248 27 #include <xen/sched-if.h>
kaf24@1544 28 #include <xen/softirq.h>
kaf24@1248 29 #include <xen/trace.h>
kaf24@4877 30 #include <xen/mm.h>
kaf24@11236 31 #include <xen/errno.h>
kaf24@9276 32 #include <xen/guest_access.h>
kfraser@12510 33 #include <xen/multicall.h>
keir@21436 34 #include <xen/cpu.h>
keir@22446 35 #include <xen/preempt.h>
kaf24@7234 36 #include <public/sched.h>
kfraser@15846 37 #include <xsm/xsm.h>
rn@316 38
shand@10497 39 /* opt_sched: scheduler - default to credit */
keir@20173 40 static char __initdata opt_sched[10] = "credit";
kaf24@3372 41 string_param("sched", opt_sched);
kaf24@3372 42
keir@19450 43 /* If sched_smt_power_savings is set,
keir@19450 44 * the scheduler will give preference to a partially idle package over
keir@19450 45 * a fully idle package when picking a pCPU on which to schedule a vCPU.
keir@19450 46 */
keir@22676 47 bool_t sched_smt_power_savings = 0;
keir@19450 48 boolean_param("sched_smt_power_savings", sched_smt_power_savings);
keir@19450 49
mwilli2@1232 50 /* Various timer handlers. */
kaf24@5190 51 static void s_timer_fn(void *unused);
kfraser@14358 52 static void vcpu_periodic_timer_fn(void *data);
kfraser@14358 53 static void vcpu_singleshot_timer_fn(void *data);
kaf24@9276 54 static void poll_timer_fn(void *data);
mwilli2@1232 55
mwilli2@1284 56 /* This is global for now so that private implementations can reach it */
kaf24@11017 57 DEFINE_PER_CPU(struct schedule_data, schedule_data);
keir@21258 58 DEFINE_PER_CPU(struct scheduler *, scheduler);
mwilli2@1232 59
keir@20420 60 extern const struct scheduler sched_sedf_def;
keir@20420 61 extern const struct scheduler sched_credit_def;
keir@21217 62 extern const struct scheduler sched_credit2_def;
keir@22501 63 extern const struct scheduler sched_arinc653_def;
keir@21258 64 static const struct scheduler *schedulers[] = {
sd386@3487 65 &sched_sedf_def,
ack@10206 66 &sched_credit_def,
keir@21217 67 &sched_credit2_def,
keir@22501 68 &sched_arinc653_def,
kaf24@2633 69 NULL
kaf24@2633 70 };
mwilli2@1232 71
keir@20420 72 static struct scheduler __read_mostly ops;
mwilli2@1232 73
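/*
 * Invoke a scheduler hook if the scheduler implements it; if the hook is
 * NULL, evaluate to a zero of the hook's return type instead.
 */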
keir@21258 74 #define SCHED_OP(opsptr, fn, ...) \
keir@21258 75 (( (opsptr)->fn != NULL ) ? (opsptr)->fn(opsptr, ##__VA_ARGS__ ) \
keir@21258 76 : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0 )
keir@21258 77
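/*
 * Map a domain or VCPU to the scheduler and online-CPU mask of its
 * cpupool, falling back to the default scheduler and cpu_online_map for
 * domains that are not in any cpupool.
 */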
keir@21453 78 #define DOM2OP(_d) (((_d)->cpupool == NULL) ? &ops : ((_d)->cpupool->sched))
keir@21258 79 #define VCPU2OP(_v) (DOM2OP((_v)->domain))
keir@21258 80 #define VCPU2ONLINE(_v) \
keir@21258 81 (((_v)->domain->cpupool == NULL) ? &cpu_online_map \
keir@21258 82 : &(_v)->domain->cpupool->cpu_valid)
kaf24@974 83
keir@18475 84 static inline void trace_runstate_change(struct vcpu *v, int new_state)
keir@18475 85 {
keir@18475 86 struct { uint32_t vcpu:16, domain:16; } d;
keir@18475 87 uint32_t event;
keir@18475 88
keir@18475 89 if ( likely(!tb_init_done) )
keir@18475 90 return;
keir@18475 91
keir@18475 92 d.vcpu = v->vcpu_id;
keir@18475 93 d.domain = v->domain->domain_id;
keir@18475 94
keir@18475 95 event = TRC_SCHED_RUNSTATE_CHANGE;
keir@18475 96 event |= ( v->runstate.state & 0x3 ) << 8;
keir@18475 97 event |= ( new_state & 0x3 ) << 4;
keir@18475 98
keir@22230 99 __trace_var(event, 1/*tsc*/, sizeof(d), &d);
keir@18475 100 }
keir@18475 101
keir@19339 102 static inline void trace_continue_running(struct vcpu *v)
keir@19339 103 {
keir@19339 104 struct { uint32_t vcpu:16, domain:16; } d;
keir@19339 105
keir@19339 106 if ( likely(!tb_init_done) )
keir@19339 107 return;
keir@19339 108
keir@19339 109 d.vcpu = v->vcpu_id;
keir@19339 110 d.domain = v->domain->domain_id;
keir@19339 111
keir@22230 112 __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d);
keir@19339 113 }
keir@19339 114
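/*
 * A VCPU counts as "urgent" while it is blocked awaiting a poll (i.e. both
 * _VPF_blocked and its bit in the domain's poll_mask are set).  The
 * per-CPU urgent_count tracks how many such VCPUs live on each CPU.
 */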
keir@20992 115 static inline void vcpu_urgent_count_update(struct vcpu *v)
keir@20992 116 {
keir@20992 117 if ( is_idle_vcpu(v) )
keir@20992 118 return;
keir@20992 119
keir@20992 120 if ( unlikely(v->is_urgent) )
keir@20992 121 {
keir@21018 122 if ( !test_bit(_VPF_blocked, &v->pause_flags) ||
keir@21018 123 !test_bit(v->vcpu_id, v->domain->poll_mask) )
keir@20992 124 {
keir@20992 125 v->is_urgent = 0;
keir@20992 126 atomic_dec(&per_cpu(schedule_data,v->processor).urgent_count);
keir@20992 127 }
keir@20992 128 }
keir@20992 129 else
keir@20992 130 {
keir@21018 131 if ( unlikely(test_bit(_VPF_blocked, &v->pause_flags) &&
keir@21018 132 test_bit(v->vcpu_id, v->domain->poll_mask)) )
keir@20992 133 {
keir@20992 134 v->is_urgent = 1;
keir@20992 135 atomic_inc(&per_cpu(schedule_data,v->processor).urgent_count);
keir@20992 136 }
keir@20992 137 }
keir@20992 138 }
keir@20992 139
kaf24@9008 140 static inline void vcpu_runstate_change(
kaf24@9008 141 struct vcpu *v, int new_state, s_time_t new_entry_time)
kaf24@9008 142 {
keir@18936 143 s_time_t delta;
keir@18936 144
kaf24@9008 145 ASSERT(v->runstate.state != new_state);
keir@21215 146 ASSERT(spin_is_locked(per_cpu(schedule_data,v->processor).schedule_lock));
kaf24@9008 147
keir@20992 148 vcpu_urgent_count_update(v);
keir@20992 149
keir@18475 150 trace_runstate_change(v, new_state);
keir@18475 151
keir@18936 152 delta = new_entry_time - v->runstate.state_entry_time;
keir@18936 153 if ( delta > 0 )
keir@18936 154 {
keir@18936 155 v->runstate.time[v->runstate.state] += delta;
keir@18936 156 v->runstate.state_entry_time = new_entry_time;
keir@18936 157 }
keir@18940 158
keir@18940 159 v->runstate.state = new_state;
kaf24@9008 160 }
kaf24@9008 161
kaf24@9008 162 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
kaf24@9008 163 {
keir@18936 164 s_time_t delta;
keir@18936 165
keir@18936 166 if ( unlikely(v != current) )
kaf24@9008 167 vcpu_schedule_lock_irq(v);
keir@18936 168
keir@18936 169 memcpy(runstate, &v->runstate, sizeof(*runstate));
keir@18936 170 delta = NOW() - runstate->state_entry_time;
keir@18936 171 if ( delta > 0 )
keir@18936 172 runstate->time[runstate->state] += delta;
keir@18936 173
keir@18936 174 if ( unlikely(v != current) )
kaf24@9008 175 vcpu_schedule_unlock_irq(v);
keir@18936 176 }
keir@18936 177
keir@18936 178 uint64_t get_cpu_idle_time(unsigned int cpu)
keir@18936 179 {
keir@19004 180 struct vcpu_runstate_info state;
keir@18936 181 struct vcpu *v;
keir@18936 182
keir@18936 183 if ( (v = idle_vcpu[cpu]) == NULL )
keir@18936 184 return 0;
keir@18936 185
keir@18936 186 vcpu_runstate_get(v, &state);
keir@18936 187 return state.time[RUNSTATE_running];
kaf24@9008 188 }
kaf24@9008 189
ack@11650 190 int sched_init_vcpu(struct vcpu *v, unsigned int processor)
iap10@274 191 {
kfraser@11652 192 struct domain *d = v->domain;
ack@11650 193
kfraser@11652 194 /*
kfraser@11652 195 * Initialize processor and affinity settings. The idler, and potentially
kfraser@11652 196 * domain-0 VCPUs, are pinned onto their respective physical CPUs.
kfraser@11652 197 */
kfraser@11652 198 v->processor = processor;
keir@17225 199 if ( is_idle_domain(d) || d->is_pinned )
kfraser@11652 200 v->cpu_affinity = cpumask_of_cpu(processor);
ack@11650 201 else
kfraser@11837 202 cpus_setall(v->cpu_affinity);
ack@11650 203
kfraser@14365 204 /* Initialise the per-vcpu timers. */
kfraser@14358 205 init_timer(&v->periodic_timer, vcpu_periodic_timer_fn,
kfraser@14358 206 v, v->processor);
kfraser@14358 207 init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn,
kfraser@14358 208 v, v->processor);
kfraser@14358 209 init_timer(&v->poll_timer, poll_timer_fn,
kfraser@14358 210 v, v->processor);
rn@340 211
kfraser@11652 212 /* Idle VCPUs are scheduled immediately. */
kfraser@11652 213 if ( is_idle_domain(d) )
kfraser@11652 214 {
kfraser@11652 215 per_cpu(schedule_data, v->processor).curr = v;
kfraser@14692 216 v->is_running = 1;
kfraser@11652 217 }
kfraser@11652 218
kaf24@8563 219 TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);
kaf24@10281 220
keir@21258 221 v->sched_priv = SCHED_OP(DOM2OP(d), alloc_vdata, v, d->sched_priv);
keir@21258 222 if ( v->sched_priv == NULL )
keir@21258 223 return 1;
keir@21258 224
keir@22324 225 SCHED_OP(VCPU2OP(v), insert_vcpu, v);
keir@22324 226
keir@21258 227 return 0;
keir@21258 228 }
keir@21258 229
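/*
 * Move a domain into cpupool c: allocate the new per-domain and per-VCPU
 * scheduler data up front, then pause the domain, migrate its timers,
 * reassign every VCPU to a CPU in the new pool, and free the old data.
 */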
keir@21258 230 int sched_move_domain(struct domain *d, struct cpupool *c)
keir@21258 231 {
keir@21258 232 struct vcpu *v;
keir@21258 233 unsigned int new_p;
keir@21258 234 void **vcpu_priv;
keir@21258 235 void *domdata;
keir@21258 236
keir@21453 237 domdata = SCHED_OP(c->sched, alloc_domdata, d);
keir@21258 238 if ( domdata == NULL )
keir@21258 239 return -ENOMEM;
keir@21258 240
keir@21258 241 vcpu_priv = xmalloc_array(void *, d->max_vcpus);
keir@21258 242 if ( vcpu_priv == NULL )
keir@21258 243 {
keir@21453 244 SCHED_OP(c->sched, free_domdata, domdata);
keir@21258 245 return -ENOMEM;
keir@21258 246 }
keir@21258 247
keir@21258 248 memset(vcpu_priv, 0, d->max_vcpus * sizeof(void *));
keir@21258 249 for_each_vcpu ( d, v )
keir@21258 250 {
keir@21453 251 vcpu_priv[v->vcpu_id] = SCHED_OP(c->sched, alloc_vdata, v, domdata);
keir@21258 252 if ( vcpu_priv[v->vcpu_id] == NULL )
keir@21258 253 {
keir@21258 254 for_each_vcpu ( d, v )
keir@21258 255 {
keir@21258 256 if ( vcpu_priv[v->vcpu_id] != NULL )
keir@21258 257 xfree(vcpu_priv[v->vcpu_id]);
keir@21258 258 }
keir@21258 259 xfree(vcpu_priv);
keir@21453 260 SCHED_OP(c->sched, free_domdata, domdata);
keir@21258 261 return -ENOMEM;
keir@21258 262 }
keir@21258 263 }
keir@21258 264
keir@21258 265 domain_pause(d);
keir@21258 266
keir@21258 267 new_p = first_cpu(c->cpu_valid);
keir@21258 268 for_each_vcpu ( d, v )
keir@21258 269 {
keir@21258 270 migrate_timer(&v->periodic_timer, new_p);
keir@21258 271 migrate_timer(&v->singleshot_timer, new_p);
keir@21258 272 migrate_timer(&v->poll_timer, new_p);
keir@21258 273
keir@22324 274 SCHED_OP(VCPU2OP(v), remove_vcpu, v);
keir@22324 275 SCHED_OP(VCPU2OP(v), free_vdata, v->sched_priv);
keir@21258 276
keir@21258 277 cpus_setall(v->cpu_affinity);
keir@21258 278 v->processor = new_p;
keir@21258 279 v->sched_priv = vcpu_priv[v->vcpu_id];
keir@21671 280 evtchn_move_pirqs(v);
keir@21258 281
keir@21258 282 new_p = cycle_cpu(new_p, c->cpu_valid);
keir@22324 283
keir@22324 284 SCHED_OP(VCPU2OP(v), insert_vcpu, v);
keir@21258 285 }
keir@21960 286 domain_update_node_affinity(d);
keir@21258 287
keir@21258 288 d->cpupool = c;
keir@21258 289 SCHED_OP(DOM2OP(d), free_domdata, d->sched_priv);
keir@21258 290 d->sched_priv = domdata;
keir@21258 291
keir@21258 292 domain_unpause(d);
keir@21258 293
keir@21258 294 xfree(vcpu_priv);
keir@21258 295
keir@21258 296 return 0;
iap10@274 297 }
iap10@274 298
kfraser@12284 299 void sched_destroy_vcpu(struct vcpu *v)
kfraser@12284 300 {
kfraser@14358 301 kill_timer(&v->periodic_timer);
kfraser@14358 302 kill_timer(&v->singleshot_timer);
kfraser@12284 303 kill_timer(&v->poll_timer);
keir@20992 304 if ( test_and_clear_bool(v->is_urgent) )
keir@20992 305 atomic_dec(&per_cpu(schedule_data, v->processor).urgent_count);
keir@22324 306 SCHED_OP(VCPU2OP(v), remove_vcpu, v);
keir@22324 307 SCHED_OP(VCPU2OP(v), free_vdata, v->sched_priv);
kfraser@12284 308 }
kfraser@12284 309
kfraser@12284 310 int sched_init_domain(struct domain *d)
kfraser@12284 311 {
keir@21258 312 return SCHED_OP(DOM2OP(d), init_domain, d);
kfraser@12284 313 }
kfraser@12284 314
kaf24@10281 315 void sched_destroy_domain(struct domain *d)
iap10@274 316 {
keir@21258 317 SCHED_OP(DOM2OP(d), destroy_domain, d);
iap10@274 318 }
iap10@274 319
kaf24@6483 320 void vcpu_sleep_nosync(struct vcpu *v)
kaf24@1544 321 {
kaf24@1544 322 unsigned long flags;
kaf24@1544 323
kaf24@8626 324 vcpu_schedule_lock_irqsave(v, flags);
kaf24@9008 325
kaf24@8547 326 if ( likely(!vcpu_runnable(v)) )
kaf24@9008 327 {
kaf24@9008 328 if ( v->runstate.state == RUNSTATE_runnable )
kaf24@9008 329 vcpu_runstate_change(v, RUNSTATE_offline, NOW());
kaf24@9008 330
keir@21258 331 SCHED_OP(VCPU2OP(v), sleep, v);
kaf24@9008 332 }
kaf24@9008 333
kaf24@8626 334 vcpu_schedule_unlock_irqrestore(v, flags);
kaf24@2633 335
kaf24@5327 336 TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
kaf24@8615 337 }
kaf24@5294 338
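/*
 * As vcpu_sleep_nosync(), but additionally wait until the VCPU is no
 * longer running anywhere and its execution state has been synchronised.
 */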
kaf24@6483 339 void vcpu_sleep_sync(struct vcpu *v)
kaf24@5294 340 {
kaf24@6483 341 vcpu_sleep_nosync(v);
kaf24@5294 342
kfraser@14692 343 while ( !vcpu_runnable(v) && v->is_running )
kaf24@1544 344 cpu_relax();
kaf24@5294 345
kaf24@6491 346 sync_vcpu_execstate(v);
kaf24@1544 347 }
kaf24@1544 348
kaf24@6483 349 void vcpu_wake(struct vcpu *v)
iap10@274 350 {
kaf24@2633 351 unsigned long flags;
gm281@2007 352
kaf24@8626 353 vcpu_schedule_lock_irqsave(v, flags);
kaf24@9008 354
kaf24@8547 355 if ( likely(vcpu_runnable(v)) )
kaf24@1543 356 {
kaf24@9008 357 if ( v->runstate.state >= RUNSTATE_blocked )
kaf24@9008 358 vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
keir@21258 359 SCHED_OP(VCPU2OP(v), wake, v);
kaf24@1543 360 }
kfraser@14698 361 else if ( !test_bit(_VPF_blocked, &v->pause_flags) )
kaf24@9008 362 {
kaf24@9008 363 if ( v->runstate.state == RUNSTATE_blocked )
kaf24@9008 364 vcpu_runstate_change(v, RUNSTATE_offline, NOW());
kaf24@9008 365 }
kaf24@9008 366
kaf24@8626 367 vcpu_schedule_unlock_irqrestore(v, flags);
bren@4523 368
kaf24@5327 369 TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
iap10@274 370 }
iap10@274 371
keir@18466 372 void vcpu_unblock(struct vcpu *v)
keir@18466 373 {
keir@18466 374 if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
keir@18466 375 return;
keir@18466 376
keir@18466 377 /* Polling period ends when a VCPU is unblocked. */
keir@18466 378 if ( unlikely(v->poll_evtchn != 0) )
keir@18466 379 {
keir@18466 380 v->poll_evtchn = 0;
keir@18466 381 /*
keir@18466 382 * We *must* re-clear _VPF_blocked to avoid racing other wakeups of
keir@18466 383 * this VCPU (and it then going back to sleep on poll_mask).
keir@18466 384 * Test-and-clear is idiomatic and ensures that the clear_bit is not reordered.
keir@18466 385 */
keir@18466 386 if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
keir@18466 387 clear_bit(_VPF_blocked, &v->pause_flags);
keir@18466 388 }
keir@18466 389
keir@18466 390 vcpu_wake(v);
keir@18466 391 }
keir@18466 392
kfraser@11601 393 static void vcpu_migrate(struct vcpu *v)
kfraser@11601 394 {
kfraser@11601 395 unsigned long flags;
keir@20992 396 int old_cpu, new_cpu;
kfraser@11601 397
kfraser@11601 398 vcpu_schedule_lock_irqsave(v, flags);
kfraser@11601 399
kfraser@14692 400 /*
kfraser@14692 401 * NB. Check of v->is_running happens /after/ setting the migration flag
kfraser@14692 402 * because they both happen in (different) spinlock regions, and those
kfraser@14692 403 * regions are strictly serialised.
kfraser@14692 404 */
kfraser@14692 405 if ( v->is_running ||
kfraser@14698 406 !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
kfraser@11601 407 {
kfraser@11601 408 vcpu_schedule_unlock_irqrestore(v, flags);
kfraser@11601 409 return;
kfraser@11601 410 }
kfraser@11601 411
keir@20992 412 /* Select new CPU. */
kfraser@11601 413 old_cpu = v->processor;
keir@21258 414 new_cpu = SCHED_OP(VCPU2OP(v), pick_cpu, v);
keir@20992 415
keir@20992 416 /*
keir@20992 417 * Transfer urgency status to new CPU before switching CPUs, as once
keir@20992 418 * the switch occurs, v->is_urgent is no longer protected by the per-CPU
keir@20992 419 * scheduler lock we are holding.
keir@20992 420 */
keir@20992 421 if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) )
keir@20992 422 {
keir@20992 423 atomic_inc(&per_cpu(schedule_data, new_cpu).urgent_count);
keir@20992 424 atomic_dec(&per_cpu(schedule_data, old_cpu).urgent_count);
keir@20992 425 }
keir@20992 426
keir@22655 427 /* Switch to new CPU, then unlock old CPU. This is safe because
keir@22655 428 * the lock pointer can't change while the current lock is held. */
keir@20992 429 v->processor = new_cpu;
kfraser@11601 430 spin_unlock_irqrestore(
keir@21215 431 per_cpu(schedule_data, old_cpu).schedule_lock, flags);
kfraser@11601 432
keir@21671 433 if ( old_cpu != new_cpu )
keir@21671 434 evtchn_move_pirqs(v);
keir@21671 435
kfraser@11601 436 /* Wake on new CPU. */
kfraser@11601 437 vcpu_wake(v);
kfraser@11601 438 }
kfraser@11601 439
kfraser@14358 440 /*
kfraser@14358 441 * Force a VCPU through a deschedule/reschedule path.
kfraser@14358 442 * For example, using this when setting the periodic timer period means that
kfraser@14358 443 * most periodic-timer state need only be touched from within the scheduler,
kfraser@14358 444 * and so can be updated without any explicit synchronisation.
kfraser@14358 445 */
kfraser@14358 446 void vcpu_force_reschedule(struct vcpu *v)
kfraser@14358 447 {
kfraser@14358 448 vcpu_schedule_lock_irq(v);
kfraser@14692 449 if ( v->is_running )
kfraser@14698 450 set_bit(_VPF_migrating, &v->pause_flags);
kfraser@14358 451 vcpu_schedule_unlock_irq(v);
kfraser@14358 452
kfraser@14698 453 if ( test_bit(_VPF_migrating, &v->pause_flags) )
kfraser@14358 454 {
kfraser@14358 455 vcpu_sleep_nosync(v);
kfraser@14358 456 vcpu_migrate(v);
kfraser@14358 457 }
kfraser@14358 458 }
kfraser@14358 459
keir@18502 460 /*
keir@21258 461 * This function is used by the CPU hotplug code from stop_machine context
keir@21258 462 * and by the cpupool code when switching schedulers on a CPU.
keir@18502 463 */
keir@21258 464 int cpu_disable_scheduler(unsigned int cpu)
keir@18502 465 {
keir@18502 466 struct domain *d;
keir@18502 467 struct vcpu *v;
keir@21258 468 struct cpupool *c;
keir@21258 469 int ret = 0;
keir@21960 470 bool_t affinity_broken;
keir@21258 471
keir@21258 472 c = per_cpu(cpupool, cpu);
keir@21258 473 if ( c == NULL )
keir@21258 474 return ret;
keir@18502 475
keir@18502 476 for_each_domain ( d )
keir@18502 477 {
keir@21258 478 if ( d->cpupool != c )
keir@21258 479 continue;
keir@21258 480
keir@21960 481 affinity_broken = 0;
keir@21960 482
keir@18502 483 for_each_vcpu ( d, v )
keir@18502 484 {
keir@21258 485 vcpu_schedule_lock_irq(v);
keir@21258 486
keir@18502 487 if ( (cpus_weight(v->cpu_affinity) == 1) &&
keir@18502 488 cpu_isset(cpu, v->cpu_affinity) )
keir@18502 489 {
keir@18502 490 printk("Breaking vcpu affinity for domain %d vcpu %d\n",
keir@18502 491 v->domain->domain_id, v->vcpu_id);
keir@18502 492 cpus_setall(v->cpu_affinity);
keir@21960 493 affinity_broken = 1;
keir@18502 494 }
keir@18502 495
keir@18502 496 if ( v->processor == cpu )
keir@18502 497 {
keir@18502 498 set_bit(_VPF_migrating, &v->pause_flags);
keir@21258 499 vcpu_schedule_unlock_irq(v);
keir@18502 500 vcpu_sleep_nosync(v);
keir@18502 501 vcpu_migrate(v);
keir@18502 502 }
keir@21258 503 else
keir@21258 504 {
keir@21258 505 vcpu_schedule_unlock_irq(v);
keir@21258 506 }
keir@21258 507
keir@21258 508 /*
keir@21258 509 * A vcpu active in the hypervisor will not be migratable.
keir@21258 510 * The caller should try again after releasing and reacquiring
keir@21258 511 * all locks.
keir@21258 512 */
keir@21258 513 if ( v->processor == cpu )
keir@21258 514 ret = -EAGAIN;
keir@18502 515 }
keir@21960 516
keir@21960 517 if ( affinity_broken )
keir@21960 518 domain_update_node_affinity(d);
keir@18502 519 }
keir@21960 520
keir@21258 521 return ret;
keir@18502 522 }
keir@18502 523
keir@21212 524 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
kaf24@8541 525 {
keir@16934 526 cpumask_t online_affinity, old_affinity;
keir@21258 527 cpumask_t *online;
ack@11650 528
keir@21212 529 if ( v->domain->is_pinned )
keir@21212 530 return -EINVAL;
keir@21258 531 online = VCPU2ONLINE(v);
keir@21258 532 cpus_and(online_affinity, *affinity, *online);
kaf24@8605 533 if ( cpus_empty(online_affinity) )
kaf24@8541 534 return -EINVAL;
kaf24@8541 535
kfraser@14358 536 vcpu_schedule_lock_irq(v);
kfraser@11601 537
keir@16934 538 old_affinity = v->cpu_affinity;
kfraser@11601 539 v->cpu_affinity = *affinity;
keir@16934 540 *affinity = old_affinity;
kfraser@11601 541 if ( !cpu_isset(v->processor, v->cpu_affinity) )
kfraser@14698 542 set_bit(_VPF_migrating, &v->pause_flags);
kfraser@11601 543
kfraser@14358 544 vcpu_schedule_unlock_irq(v);
kfraser@11601 545
keir@21975 546 domain_update_node_affinity(v->domain);
keir@21975 547
kfraser@14698 548 if ( test_bit(_VPF_migrating, &v->pause_flags) )
kfraser@11601 549 {
kfraser@11601 550 vcpu_sleep_nosync(v);
kfraser@11601 551 vcpu_migrate(v);
kfraser@11601 552 }
kfraser@11601 553
kfraser@11601 554 return 0;
kaf24@8541 555 }
kaf24@8541 556
kaf24@1543 557 /* Block the currently-executing domain until a pertinent event occurs. */
kaf24@8742 558 static long do_block(void)
rn@316 559 {
kaf24@5327 560 struct vcpu *v = current;
kaf24@4495 561
kaf24@10354 562 local_event_delivery_enable();
kfraser@14698 563 set_bit(_VPF_blocked, &v->pause_flags);
kaf24@4495 564
kaf24@4495 565 /* Check for events /after/ blocking: avoids wakeup waiting race. */
kaf24@10354 566 if ( local_events_need_delivery() )
kaf24@4734 567 {
kfraser@14698 568 clear_bit(_VPF_blocked, &v->pause_flags);
kaf24@4734 569 }
kaf24@4495 570 else
bren@4523 571 {
kaf24@5327 572 TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
kfraser@12509 573 raise_softirq(SCHEDULE_SOFTIRQ);
bren@4523 574 }
kaf24@4495 575
rn@316 576 return 0;
rn@316 577 }
rn@316 578
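/*
 * SCHEDOP_poll: block until one of the given event-channel ports is
 * pending, or until the (optional) timeout expires.
 */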
kaf24@9276 579 static long do_poll(struct sched_poll *sched_poll)
kaf24@9276 580 {
ack@13294 581 struct vcpu *v = current;
ack@13294 582 struct domain *d = v->domain;
ack@13294 583 evtchn_port_t port;
keir@18466 584 long rc;
ack@13294 585 unsigned int i;
kaf24@9276 586
kaf24@9276 587 /* Fairly arbitrary limit. */
kaf24@9276 588 if ( sched_poll->nr_ports > 128 )
kaf24@9276 589 return -EINVAL;
kaf24@9276 590
kaf24@9276 591 if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
kaf24@9276 592 return -EFAULT;
kaf24@9276 593
kfraser@14698 594 set_bit(_VPF_blocked, &v->pause_flags);
keir@18466 595 v->poll_evtchn = -1;
keir@18466 596 set_bit(v->vcpu_id, d->poll_mask);
keir@18466 597
keir@18466 598 #ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
keir@18466 599 /* Check for events /after/ setting flags: avoids wakeup waiting race. */
keir@18466 600 smp_mb();
kfraser@14692 601
keir@18466 602 /*
keir@18466 603 * Someone may have seen we are blocked but not that we are polling, or
keir@18466 604 * vice versa. We are certainly being woken, so clean up and bail. Beyond
keir@18466 605 * this point others can be guaranteed to clean up for us if they wake us.
keir@18466 606 */
keir@18466 607 rc = 0;
keir@18466 608 if ( (v->poll_evtchn == 0) ||
keir@18466 609 !test_bit(_VPF_blocked, &v->pause_flags) ||
keir@18466 610 !test_bit(v->vcpu_id, d->poll_mask) )
keir@18466 611 goto out;
keir@18466 612 #endif
kaf24@9276 613
keir@18678 614 rc = 0;
keir@18678 615 if ( local_events_need_delivery() )
keir@18678 616 goto out;
keir@18678 617
kaf24@9276 618 for ( i = 0; i < sched_poll->nr_ports; i++ )
kaf24@9276 619 {
kaf24@9276 620 rc = -EFAULT;
kaf24@9276 621 if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
kaf24@9276 622 goto out;
kaf24@9276 623
kaf24@9276 624 rc = -EINVAL;
ack@13294 625 if ( port >= MAX_EVTCHNS(d) )
kaf24@9276 626 goto out;
kaf24@9276 627
kaf24@9276 628 rc = 0;
keir@17232 629 if ( test_bit(port, &shared_info(d, evtchn_pending)) )
kaf24@9276 630 goto out;
kaf24@9276 631 }
kaf24@9276 632
keir@18466 633 if ( sched_poll->nr_ports == 1 )
keir@18466 634 v->poll_evtchn = port;
keir@18466 635
kaf24@9276 636 if ( sched_poll->timeout != 0 )
kaf24@9276 637 set_timer(&v->poll_timer, sched_poll->timeout);
kaf24@9276 638
ack@13294 639 TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
kfraser@12509 640 raise_softirq(SCHEDULE_SOFTIRQ);
kaf24@9276 641
kfraser@12509 642 return 0;
kaf24@9276 643
kaf24@9276 644 out:
keir@18466 645 v->poll_evtchn = 0;
keir@18466 646 clear_bit(v->vcpu_id, d->poll_mask);
kfraser@14698 647 clear_bit(_VPF_blocked, &v->pause_flags);
kaf24@9276 648 return rc;
kaf24@9276 649 }
kaf24@9276 650
kaf24@1544 651 /* Voluntarily yield the processor for this allocation. */
kaf24@1098 652 static long do_yield(void)
kaf24@1098 653 {
keir@21981 654 struct vcpu * v=current;
keir@21981 655
keir@21981 656 vcpu_schedule_lock_irq(v);
keir@21981 657 SCHED_OP(VCPU2OP(v), yield, v);
keir@21981 658 vcpu_schedule_unlock_irq(v);
keir@21981 659
kaf24@4915 660 TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
kfraser@12509 661 raise_softirq(SCHEDULE_SOFTIRQ);
kaf24@1098 662 return 0;
kaf24@1098 663 }
kaf24@1098 664
keir@21575 665 static void domain_watchdog_timeout(void *data)
keir@21575 666 {
keir@21575 667 struct domain *d = data;
keir@21575 668
keir@21575 669 if ( d->is_shutting_down || d->is_dying )
keir@21575 670 return;
keir@21575 671
keir@21575 672 printk("Watchdog timer fired for domain %u\n", d->domain_id);
keir@21575 673 domain_shutdown(d, SHUTDOWN_watchdog);
keir@21575 674 }
keir@21575 675
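/*
 * Set up or re-arm a domain watchdog timer.  An id of 0 allocates a free
 * slot and returns its 1-based id; a non-zero id re-arms that slot, or
 * releases it when timeout is 0.
 */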
keir@21575 676 static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout)
keir@21575 677 {
keir@21575 678 if ( id > NR_DOMAIN_WATCHDOG_TIMERS )
keir@21575 679 return -EINVAL;
keir@21575 680
keir@21575 681 spin_lock(&d->watchdog_lock);
keir@21575 682
keir@21575 683 if ( id == 0 )
keir@21575 684 {
keir@21575 685 for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ )
keir@21575 686 {
keir@21575 687 if ( test_and_set_bit(id, &d->watchdog_inuse_map) )
keir@21575 688 continue;
keir@21575 689 set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
keir@21575 690 break;
keir@21575 691 }
keir@21575 692 spin_unlock(&d->watchdog_lock);
keir@21575 693 return id == NR_DOMAIN_WATCHDOG_TIMERS ? -EEXIST : id + 1;
keir@21575 694 }
keir@21575 695
keir@21575 696 id -= 1;
keir@21575 697 if ( !test_bit(id, &d->watchdog_inuse_map) )
keir@21575 698 {
keir@21575 699 spin_unlock(&d->watchdog_lock);
keir@21575 700 return -EEXIST;
keir@21575 701 }
keir@21575 702
keir@21575 703 if ( timeout == 0 )
keir@21575 704 {
keir@21575 705 stop_timer(&d->watchdog_timer[id]);
keir@21575 706 clear_bit(id, &d->watchdog_inuse_map);
keir@21575 707 }
keir@21575 708 else
keir@21575 709 {
keir@21575 710 set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
keir@21575 711 }
keir@21575 712
keir@21575 713 spin_unlock(&d->watchdog_lock);
keir@21575 714 return 0;
keir@21575 715 }
keir@21575 716
keir@21575 717 void watchdog_domain_init(struct domain *d)
keir@21575 718 {
keir@21575 719 unsigned int i;
keir@21575 720
keir@21575 721 spin_lock_init(&d->watchdog_lock);
keir@21575 722
keir@21575 723 d->watchdog_inuse_map = 0;
keir@21575 724
keir@21575 725 for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
keir@21575 726 init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0);
keir@21575 727 }
keir@21575 728
keir@21575 729 void watchdog_domain_destroy(struct domain *d)
keir@21575 730 {
keir@21575 731 unsigned int i;
keir@21575 732
keir@21575 733 for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
keir@21575 734 kill_timer(&d->watchdog_timer[i]);
keir@21575 735 }
keir@21575 736
kaf24@9548 737 long do_sched_op_compat(int cmd, unsigned long arg)
akw27@915 738 {
akw27@915 739 long ret = 0;
akw27@915 740
kaf24@7234 741 switch ( cmd )
akw27@915 742 {
akw27@915 743 case SCHEDOP_yield:
akw27@915 744 {
akw27@915 745 ret = do_yield();
akw27@915 746 break;
akw27@915 747 }
akw27@915 748
kaf24@1098 749 case SCHEDOP_block:
kaf24@1098 750 {
kaf24@1098 751 ret = do_block();
kaf24@1098 752 break;
kaf24@1098 753 }
kaf24@1098 754
kaf24@1574 755 case SCHEDOP_shutdown:
kaf24@920 756 {
kaf24@4961 757 TRACE_3D(TRC_SCHED_SHUTDOWN,
kaf24@7234 758 current->domain->domain_id, current->vcpu_id, arg);
kaf24@7830 759 domain_shutdown(current->domain, (u8)arg);
kaf24@920 760 break;
kaf24@920 761 }
kaf24@920 762
akw27@915 763 default:
akw27@915 764 ret = -ENOSYS;
akw27@915 765 }
akw27@915 766
akw27@915 767 return ret;
akw27@915 768 }
akw27@915 769
ack@13304 770 typedef long ret_t;
ack@13304 771
ack@13304 772 #endif /* !COMPAT */
ack@13304 773
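/*
 * do_sched_op() is compiled twice: once natively and once more via
 * compat/schedule.c (included at the bottom of this file) with COMPAT
 * defined, so that ret_t and the guest handle take the 32-bit compat
 * types.
 */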
ack@13304 774 ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE(void) arg)
kaf24@9276 775 {
ack@13304 776 ret_t ret = 0;
kaf24@9276 777
kaf24@9276 778 switch ( cmd )
kaf24@9276 779 {
kaf24@9276 780 case SCHEDOP_yield:
kaf24@9276 781 {
kaf24@9276 782 ret = do_yield();
kaf24@9276 783 break;
kaf24@9276 784 }
kaf24@9276 785
kaf24@9276 786 case SCHEDOP_block:
kaf24@9276 787 {
kaf24@9276 788 ret = do_block();
kaf24@9276 789 break;
kaf24@9276 790 }
kaf24@9276 791
kaf24@9276 792 case SCHEDOP_shutdown:
kaf24@9276 793 {
kaf24@9276 794 struct sched_shutdown sched_shutdown;
kaf24@9276 795
kaf24@9276 796 ret = -EFAULT;
kaf24@9276 797 if ( copy_from_guest(&sched_shutdown, arg, 1) )
kaf24@9276 798 break;
kaf24@9276 799
kaf24@9276 800 ret = 0;
kaf24@9276 801 TRACE_3D(TRC_SCHED_SHUTDOWN,
kaf24@9276 802 current->domain->domain_id, current->vcpu_id,
kaf24@9276 803 sched_shutdown.reason);
kaf24@9276 804 domain_shutdown(current->domain, (u8)sched_shutdown.reason);
kaf24@9276 805
kaf24@9276 806 break;
kaf24@9276 807 }
kaf24@9276 808
keir@21556 809 case SCHEDOP_shutdown_code:
keir@21556 810 {
keir@21556 811 struct sched_shutdown sched_shutdown;
keir@21556 812 struct domain *d = current->domain;
keir@21556 813
keir@21556 814 ret = -EFAULT;
keir@21556 815 if ( copy_from_guest(&sched_shutdown, arg, 1) )
keir@21556 816 break;
keir@21556 817
keir@21556 818 TRACE_3D(TRC_SCHED_SHUTDOWN_CODE,
keir@21556 819 d->domain_id, current->vcpu_id, sched_shutdown.reason);
keir@21556 820
keir@21556 821 spin_lock(&d->shutdown_lock);
keir@21556 822 if ( d->shutdown_code == -1 )
keir@21556 823 d->shutdown_code = (u8)sched_shutdown.reason;
keir@21556 824 spin_unlock(&d->shutdown_lock);
keir@21556 825
keir@21556 826 ret = 0;
keir@21556 827 break;
keir@21556 828 }
keir@21556 829
kaf24@9276 830 case SCHEDOP_poll:
kaf24@9276 831 {
kaf24@9276 832 struct sched_poll sched_poll;
kaf24@9276 833
kaf24@9276 834 ret = -EFAULT;
kaf24@9276 835 if ( copy_from_guest(&sched_poll, arg, 1) )
kaf24@9276 836 break;
kaf24@9276 837
kaf24@9276 838 ret = do_poll(&sched_poll);
kaf24@9276 839
kaf24@9276 840 break;
kaf24@9276 841 }
kaf24@9276 842
kaf24@9598 843 case SCHEDOP_remote_shutdown:
kaf24@9598 844 {
kaf24@9598 845 struct domain *d;
kaf24@9598 846 struct sched_remote_shutdown sched_remote_shutdown;
kaf24@9598 847
kaf24@9598 848 ret = -EFAULT;
kaf24@9598 849 if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
kaf24@9598 850 break;
kaf24@9598 851
kaf24@9598 852 ret = -ESRCH;
kfraser@14224 853 d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
kaf24@9598 854 if ( d == NULL )
kaf24@9598 855 break;
kaf24@9598 856
keir@16894 857 if ( !IS_PRIV_FOR(current->domain, d) )
keir@16894 858 {
keir@16894 859 rcu_unlock_domain(d);
keir@16894 860 return -EPERM;
keir@16894 861 }
keir@16894 862
kfraser@15846 863 ret = xsm_schedop_shutdown(current->domain, d);
kfraser@15846 864 if ( ret )
kfraser@15846 865 {
kfraser@15846 866 rcu_unlock_domain(d);
kfraser@15846 867 return ret;
kfraser@15846 868 }
kfraser@15846 869
kaf24@9598 870 domain_shutdown(d, (u8)sched_remote_shutdown.reason);
kfraser@14742 871
kfraser@14224 872 rcu_unlock_domain(d);
kaf24@9598 873 ret = 0;
kaf24@9598 874
kaf24@9598 875 break;
kaf24@9598 876 }
kaf24@9598 877
keir@21575 878 case SCHEDOP_watchdog:
keir@21575 879 {
keir@21575 880 struct sched_watchdog sched_watchdog;
keir@21575 881
keir@21575 882 ret = -EFAULT;
keir@21575 883 if ( copy_from_guest(&sched_watchdog, arg, 1) )
keir@21575 884 break;
keir@21575 885
keir@21575 886 ret = domain_watchdog(
keir@21575 887 current->domain, sched_watchdog.id, sched_watchdog.timeout);
keir@21575 888 break;
keir@21575 889 }
keir@21575 890
kaf24@9276 891 default:
kaf24@9276 892 ret = -ENOSYS;
kaf24@9276 893 }
kaf24@9276 894
kaf24@9276 895 return ret;
kaf24@9276 896 }
kaf24@9276 897
ack@13304 898 #ifndef COMPAT
ack@13304 899
kfraser@14365 900 /* Per-vcpu oneshot-timer hypercall. */
kaf24@4488 901 long do_set_timer_op(s_time_t timeout)
kaf24@1098 902 {
kaf24@5327 903 struct vcpu *v = current;
kfraser@10585 904 s_time_t offset = timeout - NOW();
kaf24@1098 905
kaf24@5189 906 if ( timeout == 0 )
kaf24@10576 907 {
kfraser@14358 908 stop_timer(&v->singleshot_timer);
kaf24@10576 909 }
kfraser@10585 910 else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
kfraser@10585 911 unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
kfraser@10585 912 {
kfraser@10585 913 /*
kfraser@10585 914 * Linux workaround: occasionally we will see timeouts a long way in
kfraser@10585 915 * the future due to wrapping in Linux's jiffy time handling. We check
kfraser@10585 916 * for timeouts wrapped negative, and for positive timeouts more than
kfraser@10585 917 * about 13 days in the future (2^50ns). The correct fix is to trigger
kfraser@10585 918 * an interrupt immediately (since Linux in fact has pending work to
kfraser@10767 919 * do in this situation). However, older guests also set a long timeout
kfraser@10767 920 * when they have *no* pending timers at all: setting an immediate
kfraser@10767 921 * timeout in this case can burn a lot of CPU. We therefore go for a
kfraser@10767 922 * reasonable middle ground of triggering a timer event in 100ms.
kfraser@10585 923 */
keir@16612 924 gdprintk(XENLOG_INFO,
keir@16612 925 "Warning: huge timeout set by vcpu %d: %"PRIx64"\n",
keir@16612 926 v->vcpu_id, (uint64_t)timeout);
kfraser@14358 927 set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
kfraser@10585 928 }
kaf24@5189 929 else
kaf24@10576 930 {
keir@21554 931 migrate_timer(&v->singleshot_timer, smp_processor_id());
kfraser@14358 932 set_timer(&v->singleshot_timer, timeout);
kaf24@10576 933 }
mwilli2@1232 934
kaf24@1098 935 return 0;
kaf24@1098 936 }
kaf24@1098 937
kaf24@7199 938 /* sched_id - fetch ID of current scheduler */
kaf24@7199 939 int sched_id(void)
mwilli2@1284 940 {
mwilli2@1284 941 return ops.sched_id;
mwilli2@1284 942 }
kaf24@1098 943
kfraser@11295 944 /* Adjust scheduling parameter for a given domain. */
kfraser@11295 945 long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
rn@340 946 {
kaf24@8752 947 struct vcpu *v;
ewan@14570 948 long ret;
sd386@4767 949
keir@21258 950 if ( (op->sched_id != DOM2OP(d)->sched_id) ||
kfraser@11295 951 ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
kfraser@11295 952 (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
mwilli2@1284 953 return -EINVAL;
mwilli2@1284 954
kaf24@8615 955 /*
kaf24@8615 956 * Most VCPUs we can simply pause. If we are adjusting this VCPU then
kaf24@8615 957 * we acquire the local schedule_lock to guard against concurrent updates.
ack@8751 958 *
ack@8751 959 * We only acquire the local schedule lock after we have paused all other
ack@8751 960 * VCPUs in this domain. There are two reasons for this:
ack@8751 961 * 1- We don't want to hold up interrupts as pausing a VCPU can
ack@8751 962 * trigger a tlb shootdown.
ack@8751 963 * 2- Pausing other VCPUs involves briefly locking the schedule
ack@8751 964 * lock of the CPU they are running on. This CPU could be the
ack@8751 965 * same as ours.
kaf24@8615 966 */
ack@8751 967
kaf24@8615 968 for_each_vcpu ( d, v )
kaf24@8615 969 {
kaf24@8752 970 if ( v != current )
kaf24@8615 971 vcpu_pause(v);
kaf24@8615 972 }
kaf24@6483 973
kaf24@8752 974 if ( d == current->domain )
kaf24@8752 975 vcpu_schedule_lock_irq(current);
kaf24@6483 976
keir@21258 977 if ( (ret = SCHED_OP(DOM2OP(d), adjust, d, op)) == 0 )
ewan@14570 978 TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
rn@340 979
kaf24@8752 980 if ( d == current->domain )
kaf24@8752 981 vcpu_schedule_unlock_irq(current);
ack@8751 982
kaf24@8615 983 for_each_vcpu ( d, v )
kaf24@8615 984 {
kaf24@8752 985 if ( v != current )
kaf24@8615 986 vcpu_unpause(v);
kaf24@8615 987 }
kaf24@8615 988
ewan@14570 989 return ret;
rn@316 990 }
rn@316 991
keir@21328 992 long sched_adjust_global(struct xen_sysctl_scheduler_op *op)
keir@21328 993 {
keir@21672 994 struct cpupool *pool;
keir@21672 995 int rc;
keir@21328 996
keir@21328 997 if ( (op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
keir@21328 998 (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo) )
keir@21328 999 return -EINVAL;
keir@21328 1000
keir@21672 1001 pool = cpupool_get_by_id(op->cpupool_id);
keir@21672 1002 if ( pool == NULL )
keir@21672 1003 return -ESRCH;
keir@21672 1004
keir@22478 1005 rc = ((op->sched_id == pool->sched->sched_id)
keir@22478 1006 ? SCHED_OP(pool->sched, adjust_global, op) : -EINVAL);
keir@21672 1007
keir@21672 1008 cpupool_put(pool);
keir@21672 1009
keir@21672 1010 return rc;
keir@21328 1011 }
keir@21328 1012
kfraser@14358 1013 static void vcpu_periodic_timer_work(struct vcpu *v)
kfraser@14358 1014 {
kfraser@14358 1015 s_time_t now = NOW();
keir@21795 1016 s_time_t periodic_next_event;
kfraser@14358 1017
kfraser@14358 1018 if ( v->periodic_period == 0 )
kfraser@14358 1019 return;
kfraser@14358 1020
kfraser@14358 1021 periodic_next_event = v->periodic_last_event + v->periodic_period;
keir@18454 1022
keir@20967 1023 if ( now >= periodic_next_event )
kfraser@14358 1024 {
kfraser@14358 1025 send_timer_event(v);
kfraser@14358 1026 v->periodic_last_event = now;
kfraser@14358 1027 periodic_next_event = now + v->periodic_period;
kfraser@14358 1028 }
kfraser@14358 1029
keir@21554 1030 migrate_timer(&v->periodic_timer, smp_processor_id());
kfraser@14358 1031 set_timer(&v->periodic_timer, periodic_next_event);
kfraser@14358 1032 }
kfraser@14358 1033
akw27@915 1034 /*
rn@316 1035 * The main scheduling function:
mwilli2@1232 1036 * - deschedule the currently running vcpu (scheduler independent).
mwilli2@1232 1037 * - pick a new vcpu to run (scheduler dependent).
akw27@915 1038 */
kfraser@12509 1039 static void schedule(void)
rn@316 1040 {
kaf24@11017 1041 struct vcpu *prev = current, *next = NULL;
kaf24@11017 1042 s_time_t now = NOW();
keir@21258 1043 struct scheduler *sched = this_cpu(scheduler);
keir@21390 1044 unsigned long *tasklet_work = &this_cpu(tasklet_work_to_do);
keir@21390 1045 bool_t tasklet_work_scheduled = 0;
kaf24@11017 1046 struct schedule_data *sd;
kaf24@11017 1047 struct task_slice next_slice;
rn@316 1048
keir@22441 1049 ASSERT(!in_atomic());
kaf24@8547 1050
kfraser@14625 1051 perfc_incr(sched_run);
kaf24@8547 1052
kaf24@11017 1053 sd = &this_cpu(schedule_data);
sd386@3888 1054
keir@21390 1055 /* Update tasklet scheduling status. */
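/*
 * Transition the per-CPU tasklet state: work that has just been enqueued
 * is marked as scheduled here, while a stale "scheduled" flag with no
 * work left enqueued is cleared.  The resulting tasklet_work_scheduled
 * value is passed to the scheduler's do_schedule hook.
 */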
keir@21390 1056 switch ( *tasklet_work )
keir@21390 1057 {
keir@21390 1058 case TASKLET_enqueued:
keir@21390 1059 set_bit(_TASKLET_scheduled, tasklet_work);
keir@21390 1060 case TASKLET_enqueued|TASKLET_scheduled:
keir@21390 1061 tasklet_work_scheduled = 1;
keir@21390 1062 break;
keir@21390 1063 case TASKLET_scheduled:
keir@21390 1064 clear_bit(_TASKLET_scheduled, tasklet_work);
keir@21390 1065 case 0:
keir@21390 1066 /*tasklet_work_scheduled = 0;*/
keir@21390 1067 break;
keir@21390 1068 default:
keir@21390 1069 BUG();
keir@21390 1070 }
keir@21390 1071
keir@21215 1072 spin_lock_irq(sd->schedule_lock);
kaf24@11017 1073
kaf24@11017 1074 stop_timer(&sd->s_timer);
mwilli2@1232 1075
mwilli2@1232 1076 /* get policy-specific decision on scheduling... */
keir@21390 1077 next_slice = sched->do_schedule(sched, now, tasklet_work_scheduled);
rn@316 1078
mwilli2@1284 1079 next = next_slice.task;
kaf24@8547 1080
kaf24@11017 1081 sd->curr = next;
keir@19530 1082
keir@19538 1083 if ( next_slice.time >= 0 ) /* -ve means no limit */
keir@19538 1084 set_timer(&sd->s_timer, now + next_slice.time);
rn@316 1085
kaf24@6237 1086 if ( unlikely(prev == next) )
kaf24@6237 1087 {
keir@21215 1088 spin_unlock_irq(sd->schedule_lock);
keir@19339 1089 trace_continue_running(next);
kaf24@6237 1090 return continue_running(prev);
kaf24@6237 1091 }
kaf24@2633 1092
kaf24@7839 1093 TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
kaf24@9008 1094 prev->domain->domain_id,
kaf24@9008 1095 now - prev->runstate.state_entry_time);
kaf24@7839 1096 TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
kaf24@9008 1097 next->domain->domain_id,
kaf24@9008 1098 (next->runstate.state == RUNSTATE_runnable) ?
kaf24@9008 1099 (now - next->runstate.state_entry_time) : 0,
keir@19538 1100 next_slice.time);
kaf24@7839 1101
kaf24@9008 1102 ASSERT(prev->runstate.state == RUNSTATE_running);
keir@21198 1103
keir@21198 1104 TRACE_4D(TRC_SCHED_SWITCH,
keir@21198 1105 prev->domain->domain_id, prev->vcpu_id,
keir@21198 1106 next->domain->domain_id, next->vcpu_id);
keir@21198 1107
kaf24@9008 1108 vcpu_runstate_change(
kaf24@9008 1109 prev,
kfraser@14698 1110 (test_bit(_VPF_blocked, &prev->pause_flags) ? RUNSTATE_blocked :
kaf24@9008 1111 (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
kaf24@9008 1112 now);
keir@19346 1113 prev->last_run_time = now;
kaf24@9008 1114
kaf24@9008 1115 ASSERT(next->runstate.state != RUNSTATE_running);
kaf24@9008 1116 vcpu_runstate_change(next, RUNSTATE_running, now);
rn@369 1117
keir@21198 1118 /*
keir@21198 1119 * NB. Don't add any trace records from here until the actual context
keir@21198 1120 * switch, else lost_records resume will not work properly.
keir@21198 1121 */
keir@21198 1122
kfraser@14692 1123 ASSERT(!next->is_running);
kfraser@14692 1124 next->is_running = 1;
kaf24@8547 1125
keir@21215 1126 spin_unlock_irq(sd->schedule_lock);
kaf24@8547 1127
kfraser@14625 1128 perfc_incr(sched_ctx);
kaf24@8547 1129
kfraser@14358 1130 stop_timer(&prev->periodic_timer);
kaf24@1544 1131
keir@21671 1132 if ( next_slice.migrated )
keir@21671 1133 evtchn_move_pirqs(next);
keir@21671 1134
kaf24@4042 1135 /* Ensure that the domain has an up-to-date time base. */
kfraser@14611 1136 update_vcpu_system_time(next);
kfraser@14611 1137 vcpu_periodic_timer_work(next);
kaf24@4042 1138
kaf24@4072 1139 context_switch(prev, next);
rn@344 1140 }
rn@344 1141
kfraser@11601 1142 void context_saved(struct vcpu *prev)
kfraser@11601 1143 {
kfraser@14692 1144 /* Clear running flag /after/ writing context to memory. */
kfraser@14692 1145 smp_wmb();
kfraser@14692 1146
kfraser@14692 1147 prev->is_running = 0;
kfraser@14692 1148
kfraser@14692 1149 /* Check for migration request /after/ clearing running flag. */
kfraser@14692 1150 smp_mb();
kfraser@11601 1151
keir@21258 1152 SCHED_OP(VCPU2OP(prev), context_saved, prev);
keir@21214 1153
kfraser@14698 1154 if ( unlikely(test_bit(_VPF_migrating, &prev->pause_flags)) )
kfraser@11601 1155 vcpu_migrate(prev);
kfraser@11601 1156 }
rn@344 1157
kaf24@4332 1158 /* The scheduler timer: force a run through the scheduler */
kaf24@5190 1159 static void s_timer_fn(void *unused)
rn@316 1160 {
kaf24@1543 1161 raise_softirq(SCHEDULE_SOFTIRQ);
kfraser@14625 1162 perfc_incr(sched_irq);
rn@316 1163 }
rn@316 1164
kfraser@14358 1165 /* Per-VCPU periodic timer function: sends a virtual timer interrupt. */
kfraser@14358 1166 static void vcpu_periodic_timer_fn(void *data)
rn@316 1167 {
kfraser@14358 1168 struct vcpu *v = data;
kfraser@14358 1169 vcpu_periodic_timer_work(v);
kaf24@1098 1170 }
kaf24@1098 1171
kfraser@14358 1172 /* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */
kfraser@14358 1173 static void vcpu_singleshot_timer_fn(void *data)
kaf24@1098 1174 {
kaf24@5327 1175 struct vcpu *v = data;
kaf24@9278 1176 send_timer_event(v);
kaf24@974 1177 }
kaf24@974 1178
kaf24@9276 1179 /* SCHEDOP_poll timeout callback. */
kaf24@9276 1180 static void poll_timer_fn(void *data)
kaf24@9276 1181 {
kaf24@9276 1182 struct vcpu *v = data;
kfraser@14692 1183
keir@18466 1184 if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
keir@18466 1185 vcpu_unblock(v);
kaf24@9276 1186 }
kaf24@9276 1187
keir@21468 1188 static int cpu_schedule_up(unsigned int cpu)
keir@21468 1189 {
keir@21468 1190 struct schedule_data *sd = &per_cpu(schedule_data, cpu);
keir@21468 1191
keir@21468 1192 per_cpu(scheduler, cpu) = &ops;
keir@21468 1193 spin_lock_init(&sd->_lock);
keir@21468 1194 sd->schedule_lock = &sd->_lock;
keir@21468 1195 sd->curr = idle_vcpu[cpu];
keir@21468 1196 init_timer(&sd->s_timer, s_timer_fn, NULL, cpu);
keir@21468 1197 atomic_set(&sd->urgent_count, 0);
keir@21468 1198
keir@21468 1199 /* Boot CPU is dealt with later in scheduler_init(). */
keir@21468 1200 if ( cpu == 0 )
keir@21468 1201 return 0;
keir@21468 1202
keir@21468 1203 if ( idle_vcpu[cpu] == NULL )
keir@21468 1204 alloc_vcpu(idle_vcpu[0]->domain, cpu, cpu);
keir@21468 1205 if ( idle_vcpu[cpu] == NULL )
keir@21468 1206 return -ENOMEM;
keir@21468 1207
keir@21468 1208 if ( (ops.alloc_pdata != NULL) &&
keir@21468 1209 ((sd->sched_priv = ops.alloc_pdata(&ops, cpu)) == NULL) )
keir@21468 1210 return -ENOMEM;
keir@21468 1211
keir@21468 1212 return 0;
keir@21468 1213 }
keir@21468 1214
keir@21468 1215 static void cpu_schedule_down(unsigned int cpu)
keir@21468 1216 {
keir@21468 1217 struct schedule_data *sd = &per_cpu(schedule_data, cpu);
keir@21468 1218
keir@21468 1219 if ( sd->sched_priv != NULL )
keir@21468 1220 SCHED_OP(&ops, free_pdata, sd->sched_priv, cpu);
keir@21468 1221
keir@21468 1222 kill_timer(&sd->s_timer);
keir@21468 1223 }
keir@21468 1224
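/* CPU hotplug notifier: set up or tear down per-CPU scheduler state. */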
keir@21468 1225 static int cpu_schedule_callback(
keir@21436 1226 struct notifier_block *nfb, unsigned long action, void *hcpu)
keir@21436 1227 {
keir@21436 1228 unsigned int cpu = (unsigned long)hcpu;
keir@21468 1229 int rc = 0;
keir@21436 1230
keir@21436 1231 switch ( action )
keir@21436 1232 {
keir@21436 1233 case CPU_UP_PREPARE:
keir@21468 1234 rc = cpu_schedule_up(cpu);
keir@21436 1235 break;
keir@21468 1236 case CPU_UP_CANCELED:
keir@21436 1237 case CPU_DEAD:
keir@21468 1238 cpu_schedule_down(cpu);
keir@21436 1239 break;
keir@21436 1240 default:
keir@21436 1241 break;
keir@21436 1242 }
keir@21436 1243
keir@21468 1244 return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
keir@21436 1245 }
keir@21436 1246
keir@21468 1247 static struct notifier_block cpu_schedule_nfb = {
keir@21468 1248 .notifier_call = cpu_schedule_callback
keir@21436 1249 };
keir@21436 1250
kaf24@1098 1251 /* Initialise the data structures. */
rn@316 1252 void __init scheduler_init(void)
rn@316 1253 {
keir@21468 1254 struct domain *idle_domain;
kaf24@10281 1255 int i;
rn@316 1256
kfraser@12509 1257 open_softirq(SCHEDULE_SOFTIRQ, schedule);
kaf24@1543 1258
mwilli2@1232 1259 for ( i = 0; schedulers[i] != NULL; i++ )
mwilli2@1232 1260 {
kaf24@1524 1261 ops = *schedulers[i];
kaf24@1524 1262 if ( strcmp(ops.opt_name, opt_sched) == 0 )
mwilli2@1232 1263 break;
mwilli2@1232 1264 }
keir@21453 1265
mwilli2@1232 1266 if ( schedulers[i] == NULL )
keir@20990 1267 {
mwilli2@1232 1268 printk("Could not find scheduler: %s\n", opt_sched);
keir@20990 1269 ops = *schedulers[0];
keir@20990 1270 }
mwilli2@1232 1271
keir@21468 1272 if ( cpu_schedule_up(0) )
keir@21468 1273 BUG();
keir@21468 1274 register_cpu_notifier(&cpu_schedule_nfb);
keir@21258 1275
mwilli2@1232 1276 printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
keir@21453 1277 if ( SCHED_OP(&ops, init) )
keir@21258 1278 panic("scheduler returned error on init\n");
keir@21468 1279
keir@22518 1280 idle_domain = domain_create(DOMID_IDLE, 0, 0);
keir@21468 1281 BUG_ON(idle_domain == NULL);
keir@21468 1282 idle_domain->vcpu = idle_vcpu;
keir@21468 1283 idle_domain->max_vcpus = NR_CPUS;
keir@21468 1284 if ( alloc_vcpu(idle_domain, 0, 0) == NULL )
keir@21468 1285 BUG();
keir@21468 1286 if ( ops.alloc_pdata &&
keir@21468 1287 !(this_cpu(schedule_data).sched_priv = ops.alloc_pdata(&ops, 0)) )
keir@21468 1288 BUG();
rn@316 1289 }
rn@316 1290
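/*
 * Switch cpu to the scheduler of cpupool c (or back to the default
 * scheduler when c is NULL).  New per-CPU and idle-VCPU private data are
 * allocated before the schedule lock is taken, swapped in under the lock,
 * and the old data freed afterwards.
 */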
keir@21258 1291 void schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
iap10@274 1292 {
kaf24@2633 1293 unsigned long flags;
keir@21468 1294 struct vcpu *idle;
keir@21468 1295 void *ppriv, *ppriv_old, *vpriv, *vpriv_old;
keir@21453 1296 struct scheduler *old_ops = per_cpu(scheduler, cpu);
keir@21453 1297 struct scheduler *new_ops = (c == NULL) ? &ops : c->sched;
kaf24@2633 1298
keir@21453 1299 if ( old_ops == new_ops )
keir@21453 1300 return;
keir@21453 1301
keir@21468 1302 idle = idle_vcpu[cpu];
keir@21258 1303 ppriv = SCHED_OP(new_ops, alloc_pdata, cpu);
keir@21468 1304 vpriv = SCHED_OP(new_ops, alloc_vdata, idle, idle->domain->sched_priv);
keir@21258 1305
keir@22655 1306 pcpu_schedule_lock_irqsave(cpu, flags);
keir@21258 1307
keir@21258 1308 SCHED_OP(old_ops, tick_suspend, cpu);
keir@21468 1309 vpriv_old = idle->sched_priv;
keir@21468 1310 idle->sched_priv = vpriv;
keir@21258 1311 per_cpu(scheduler, cpu) = new_ops;
keir@21258 1312 ppriv_old = per_cpu(schedule_data, cpu).sched_priv;
keir@21258 1313 per_cpu(schedule_data, cpu).sched_priv = ppriv;
keir@21258 1314 SCHED_OP(new_ops, tick_resume, cpu);
keir@21468 1315 SCHED_OP(new_ops, insert_vcpu, idle);
keir@21258 1316
keir@22655 1317 pcpu_schedule_unlock_irqrestore(cpu, flags);
iap10@274 1318
keir@21984 1319 SCHED_OP(old_ops, free_vdata, vpriv_old);
keir@21258 1320 SCHED_OP(old_ops, free_pdata, ppriv_old, cpu);
keir@21258 1321 }
keir@21258 1322
keir@21672 1323 struct scheduler *scheduler_get_default(void)
keir@21672 1324 {
keir@21672 1325 return &ops;
keir@21672 1326 }
keir@21672 1327
keir@21672 1328 struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr)
keir@21258 1329 {
keir@21258 1330 int i;
keir@21453 1331 struct scheduler *sched;
keir@21453 1332
keir@21672 1333 for ( i = 0; schedulers[i] != NULL; i++ )
keir@21672 1334 if ( schedulers[i]->sched_id == sched_id )
keir@21672 1335 goto found;
keir@21672 1336 *perr = -ENOENT;
keir@21672 1337 return NULL;
kaf24@2633 1338
keir@21672 1339 found:
keir@21672 1340 *perr = -ENOMEM;
keir@21453 1341 if ( (sched = xmalloc(struct scheduler)) == NULL )
keir@21453 1342 return NULL;
keir@21672 1343 memcpy(sched, schedulers[i], sizeof(*sched));
keir@21672 1344 if ( (*perr = SCHED_OP(sched, init)) != 0 )
keir@21453 1345 {
keir@21453 1346 xfree(sched);
keir@21453 1347 sched = NULL;
keir@21453 1348 }
keir@21453 1349
keir@21453 1350 return sched;
keir@21258 1351 }
keir@21258 1352
keir@21453 1353 void scheduler_free(struct scheduler *sched)
keir@21258 1354 {
keir@21453 1355 BUG_ON(sched == &ops);
keir@21258 1356 SCHED_OP(sched, deinit);
keir@21453 1357 xfree(sched);
keir@21258 1358 }
keir@21258 1359
keir@21258 1360 void schedule_dump(struct cpupool *c)
keir@21258 1361 {
keir@21258 1362 int i;
keir@21258 1363 struct scheduler *sched;
keir@21258 1364 cpumask_t *cpus;
keir@21258 1365
keir@21453 1366 sched = (c == NULL) ? &ops : c->sched;
keir@21258 1367 cpus = (c == NULL) ? &cpupool_free_cpus : &c->cpu_valid;
keir@21258 1368 printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
keir@21258 1369 SCHED_OP(sched, dump_settings);
keir@21258 1370
keir@21258 1371 for_each_cpu_mask (i, *cpus)
kaf24@1544 1372 {
keir@22655 1373 pcpu_schedule_lock(i);
mwilli2@1232 1374 printk("CPU[%02d] ", i);
keir@21258 1375 SCHED_OP(sched, dump_cpu_state, i);
keir@22655 1376 pcpu_schedule_unlock(i);
iap10@274 1377 }
kaf24@337 1378 }
kaf24@337 1379
keir@19498 1380 void sched_tick_suspend(void)
keir@19498 1381 {
keir@21258 1382 struct scheduler *sched;
keir@21258 1383 unsigned int cpu = smp_processor_id();
keir@21258 1384
keir@21258 1385 sched = per_cpu(scheduler, cpu);
keir@21258 1386 SCHED_OP(sched, tick_suspend, cpu);
keir@19498 1387 }
keir@19498 1388
keir@19498 1389 void sched_tick_resume(void)
keir@19498 1390 {
keir@21258 1391 struct scheduler *sched;
keir@21258 1392 unsigned int cpu = smp_processor_id();
keir@21258 1393
keir@21258 1394 sched = per_cpu(scheduler, cpu);
keir@21258 1395 SCHED_OP(sched, tick_resume, cpu);
keir@19498 1396 }
keir@19498 1397
keir@22442 1398 void wait(void)
keir@22442 1399 {
keir@22442 1400 schedule();
keir@22442 1401 }
keir@22442 1402
ack@13304 1403 #ifdef CONFIG_COMPAT
ack@13304 1404 #include "compat/schedule.c"
ack@13304 1405 #endif
ack@13304 1406
ack@13304 1407 #endif /* !COMPAT */
ack@13304 1408
kaf24@3952 1409 /*
kaf24@3952 1410 * Local variables:
kaf24@3952 1411 * mode: C
kaf24@3952 1412 * c-set-style: "BSD"
kaf24@3952 1413 * c-basic-offset: 4
kaf24@3952 1414 * tab-width: 4
kaf24@3952 1415 * indent-tabs-mode: nil
kaf24@4026 1416 * End:
kaf24@3952 1417 */