/root/src/xen/xen/common/schedule.c
Line | Count | Source |
1 | | /**************************************************************************** |
2 | | * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge |
3 | | * (C) 2002-2003 University of Cambridge |
4 | | * (C) 2004 - Mark Williamson - Intel Research Cambridge |
5 | | **************************************************************************** |
6 | | * |
7 | | * File: common/schedule.c |
8 | | * Author: Rolf Neugebauer & Keir Fraser |
9 | | * Updated for generic API by Mark Williamson |
10 | | * |
11 | | * Description: Generic CPU scheduling code |
12 | | * implements support functionality for the Xen scheduler API. |
13 | | * |
14 | | */ |
15 | | |
16 | | #ifndef COMPAT |
17 | | #include <xen/init.h> |
18 | | #include <xen/lib.h> |
19 | | #include <xen/sched.h> |
20 | | #include <xen/domain.h> |
21 | | #include <xen/delay.h> |
22 | | #include <xen/event.h> |
23 | | #include <xen/time.h> |
24 | | #include <xen/timer.h> |
25 | | #include <xen/perfc.h> |
26 | | #include <xen/sched-if.h> |
27 | | #include <xen/softirq.h> |
28 | | #include <xen/trace.h> |
29 | | #include <xen/mm.h> |
30 | | #include <xen/err.h> |
31 | | #include <xen/guest_access.h> |
32 | | #include <xen/hypercall.h> |
33 | | #include <xen/multicall.h> |
34 | | #include <xen/cpu.h> |
35 | | #include <xen/preempt.h> |
36 | | #include <xen/event.h> |
37 | | #include <public/sched.h> |
38 | | #include <xsm/xsm.h> |
39 | | #include <xen/err.h> |
40 | | |
41 | | /* opt_sched: scheduler - default to configured value */ |
42 | | static char __initdata opt_sched[10] = CONFIG_SCHED_DEFAULT; |
43 | | string_param("sched", opt_sched); |
44 | | |
45 | | /* If sched_smt_power_savings is set, |
46 | | * the scheduler will give preference to a partially idle package over |
47 | | * a fully idle package when picking a pCPU on which to schedule a vCPU. |
48 | | */ |
49 | | bool_t sched_smt_power_savings = 0; |
50 | | boolean_param("sched_smt_power_savings", sched_smt_power_savings); |
51 | | |
52 | | /* Default scheduling rate limit: 1ms. |
53 | | * The behavior when sched_ratelimit_us is greater than sched_credit_tslice_ms is undefined. |
54 | | */ |
55 | | int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US; |
56 | | integer_param("sched_ratelimit_us", sched_ratelimit_us); |
57 | | /* Various timer handlers. */ |
58 | | static void s_timer_fn(void *unused); |
59 | | static void vcpu_periodic_timer_fn(void *data); |
60 | | static void vcpu_singleshot_timer_fn(void *data); |
61 | | static void poll_timer_fn(void *data); |
62 | | |
63 | | /* This is global for now so that private implementations can reach it */ |
64 | | DEFINE_PER_CPU(struct schedule_data, schedule_data); |
65 | | DEFINE_PER_CPU(struct scheduler *, scheduler); |
66 | | |
67 | | /* Scratch space for cpumasks. */ |
68 | | DEFINE_PER_CPU(cpumask_t, cpumask_scratch); |
69 | | |
70 | | extern const struct scheduler *__start_schedulers_array[], *__end_schedulers_array[]; |
71 | 6 | #define NUM_SCHEDULERS (__end_schedulers_array - __start_schedulers_array) |
72 | 6 | #define schedulers __start_schedulers_array |
73 | | |
74 | | static struct scheduler __read_mostly ops; |
75 | | |
76 | | #define SCHED_OP(opsptr, fn, ...) \ |
77 | 8.70M | (( (opsptr)->fn != NULL ) ? (opsptr)->fn(opsptr, ##__VA_ARGS__ ) \ |
78 | 18.4E | : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0 ) |
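
SCHED_OP() only dispatches into a per-scheduler hook when that hook is actually implemented; a missing hook evaluates to a zero of the hook's return type instead of being called through a NULL pointer. A minimal sketch of the same guarded-dispatch pattern, using made-up toy_* names rather than the real Xen scheduler ops:

    /* Guarded dispatch: call a hook only if the scheduler provides it.
     * All toy_* names below are illustrative, not part of Xen. */
    struct toy_sched_ops {
        int (*wake)(struct toy_sched_ops *ops, int vcpu_id);  /* may be NULL */
    };

    #define TOY_SCHED_OP(opsptr, fn, ...)                                   \
        (((opsptr)->fn != NULL) ? (opsptr)->fn(opsptr, ##__VA_ARGS__)       \
                                : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0)

    static int toy_wake(struct toy_sched_ops *ops, int vcpu_id)
    {
        return vcpu_id;                      /* stand-in for real work */
    }

    int toy_demo(void)
    {
        struct toy_sched_ops with_hook = { .wake = toy_wake };
        struct toy_sched_ops without_hook = { .wake = NULL };

        /* The first call dispatches to toy_wake(); the second safely yields 0,
         * because the typeof() operand is never evaluated. */
        return TOY_SCHED_OP(&with_hook, wake, 5) +
               TOY_SCHED_OP(&without_hook, wake, 5);
    }
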
79 | | |
80 | | static inline struct scheduler *dom_scheduler(const struct domain *d) |
81 | 114 | { |
82 | 114 | if ( likely(d->cpupool != NULL) ) |
83 | 75 | return d->cpupool->sched; |
84 | 114 | |
85 | 114 | /* |
86 | 114 | * If d->cpupool is NULL, this is the idle domain. This is special |
87 | 114 | * because the idle domain does not really belong to any cpupool, and, |
88 | 114 | * hence, does not really have a scheduler. |
89 | 114 | * |
90 | 114 | * This is (should be!) only called like this for allocating the idle |
91 | 114 | * vCPUs for the first time, during boot, in which case what we want |
92 | 114 | * is the default scheduler that has been chosen at boot. |
93 | 114 | */ |
94 | 39 | ASSERT(is_idle_domain(d)); |
95 | 39 | return &ops; |
96 | 114 | } |
97 | | |
98 | | static inline struct scheduler *vcpu_scheduler(const struct vcpu *v) |
99 | 14.1M | { |
100 | 14.1M | struct domain *d = v->domain; |
101 | 14.1M | |
102 | 14.1M | if ( likely(d->cpupool != NULL) ) |
103 | 14.2M | return d->cpupool->sched; |
104 | 14.1M | |
105 | 14.1M | /* |
106 | 14.1M | * If d->cpupool is NULL, this is a vCPU of the idle domain. And this |
107 | 14.1M | * case is special because the idle domain does not really belong to |
108 | 14.1M | * a cpupool and, hence, doesn't really have a scheduler. In fact, its |
109 | 14.1M | * vCPUs (may) run on pCPUs which are in different pools, with different |
110 | 14.1M | * schedulers. |
111 | 14.1M | * |
112 | 14.1M | * What we want, in this case, is the scheduler of the pCPU where this |
113 | 14.1M | * particular idle vCPU is running. And, since v->processor never changes |
114 | 14.1M | * for idle vCPUs, it is safe to use it, with no locks, to figure that out. |
115 | 14.1M | */ |
116 | 18.4E | ASSERT(is_idle_domain(d)); |
117 | 18.4E | return per_cpu(scheduler, v->processor); |
118 | 14.1M | } |
119 | 0 | #define VCPU2ONLINE(_v) cpupool_domain_cpumask((_v)->domain) |
120 | | |
121 | | static inline void trace_runstate_change(struct vcpu *v, int new_state) |
122 | 393k | { |
123 | 393k | struct { uint32_t vcpu:16, domain:16; } d; |
124 | 393k | uint32_t event; |
125 | 393k | |
126 | 393k | if ( likely(!tb_init_done) ) |
127 | 393k | return; |
128 | 393k | |
129 | 18.4E | d.vcpu = v->vcpu_id; |
130 | 18.4E | d.domain = v->domain->domain_id; |
131 | 18.4E | |
132 | 18.4E | event = TRC_SCHED_RUNSTATE_CHANGE; |
133 | 18.4E | event |= ( v->runstate.state & 0x3 ) << 8; |
134 | 18.4E | event |= ( new_state & 0x3 ) << 4; |
135 | 18.4E | |
136 | 18.4E | __trace_var(event, 1/*tsc*/, sizeof(d), &d); |
137 | 18.4E | } |
138 | | |
139 | | static inline void trace_continue_running(struct vcpu *v) |
140 | 4.49M | { |
141 | 4.49M | struct { uint32_t vcpu:16, domain:16; } d; |
142 | 4.49M | |
143 | 4.49M | if ( likely(!tb_init_done) ) |
144 | 4.49M | return; |
145 | 4.49M | |
146 | 8.19k | d.vcpu = v->vcpu_id; |
147 | 8.19k | d.domain = v->domain->domain_id; |
148 | 8.19k | |
149 | 8.19k | __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d); |
150 | 8.19k | } |
151 | | |
152 | | static inline void vcpu_urgent_count_update(struct vcpu *v) |
153 | 393k | { |
154 | 393k | if ( is_idle_vcpu(v) ) |
155 | 131k | return; |
156 | 393k | |
157 | 262k | if ( unlikely(v->is_urgent) ) |
158 | 0 | { |
159 | 0 | if ( !(v->pause_flags & VPF_blocked) || |
160 | 0 | !test_bit(v->vcpu_id, v->domain->poll_mask) ) |
161 | 0 | { |
162 | 0 | v->is_urgent = 0; |
163 | 0 | atomic_dec(&per_cpu(schedule_data,v->processor).urgent_count); |
164 | 0 | } |
165 | 0 | } |
166 | 262k | else |
167 | 262k | { |
168 | 262k | if ( unlikely(v->pause_flags & VPF_blocked) && |
169 | 64.8k | unlikely(test_bit(v->vcpu_id, v->domain->poll_mask)) ) |
170 | 0 | { |
171 | 0 | v->is_urgent = 1; |
172 | 0 | atomic_inc(&per_cpu(schedule_data,v->processor).urgent_count); |
173 | 0 | } |
174 | 262k | } |
175 | 262k | } |
176 | | |
177 | | static inline void vcpu_runstate_change( |
178 | | struct vcpu *v, int new_state, s_time_t new_entry_time) |
179 | 393k | { |
180 | 393k | s_time_t delta; |
181 | 393k | |
182 | 393k | ASSERT(v->runstate.state != new_state); |
183 | 393k | ASSERT(spin_is_locked(per_cpu(schedule_data,v->processor).schedule_lock)); |
184 | 393k | |
185 | 393k | vcpu_urgent_count_update(v); |
186 | 393k | |
187 | 393k | trace_runstate_change(v, new_state); |
188 | 393k | |
189 | 393k | delta = new_entry_time - v->runstate.state_entry_time; |
190 | 393k | if ( delta > 0 ) |
191 | 339k | { |
192 | 339k | v->runstate.time[v->runstate.state] += delta; |
193 | 339k | v->runstate.state_entry_time = new_entry_time; |
194 | 339k | } |
195 | 393k | |
196 | 393k | v->runstate.state = new_state; |
197 | 393k | } |
198 | | |
199 | | void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate) |
200 | 48 | { |
201 | 48 | spinlock_t *lock = likely(v == current) ? NULL : vcpu_schedule_lock_irq(v); |
202 | 48 | s_time_t delta; |
203 | 48 | |
204 | 48 | memcpy(runstate, &v->runstate, sizeof(*runstate)); |
205 | 48 | delta = NOW() - runstate->state_entry_time; |
206 | 48 | if ( delta > 0 ) |
207 | 48 | runstate->time[runstate->state] += delta; |
208 | 48 | |
209 | 48 | if ( unlikely(lock != NULL) ) |
210 | 44 | vcpu_schedule_unlock_irq(lock, v); |
211 | 48 | } |
212 | | |
213 | | uint64_t get_cpu_idle_time(unsigned int cpu) |
214 | 0 | { |
215 | 0 | struct vcpu_runstate_info state = { 0 }; |
216 | 0 | struct vcpu *v = idle_vcpu[cpu]; |
217 | 0 |
|
218 | 0 | if ( cpu_online(cpu) && v ) |
219 | 0 | vcpu_runstate_get(v, &state); |
220 | 0 |
|
221 | 0 | return state.time[RUNSTATE_running]; |
222 | 0 | } |
223 | | |
224 | | /* |
225 | | * If locks are different, take the one with the lower address first. |
226 | | * This avoids dead- or live-locks when this code is running on both |
227 | | * cpus at the same time. |
228 | | */ |
229 | | static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2, |
230 | | unsigned long *flags) |
231 | 96 | { |
232 | 96 | if ( lock1 == lock2 ) |
233 | 66 | { |
234 | 66 | spin_lock_irqsave(lock1, *flags); |
235 | 66 | } |
236 | 30 | else if ( lock1 < lock2 ) |
237 | 10 | { |
238 | 10 | spin_lock_irqsave(lock1, *flags); |
239 | 10 | spin_lock(lock2); |
240 | 10 | } |
241 | 30 | else |
242 | 20 | { |
243 | 20 | spin_lock_irqsave(lock2, *flags); |
244 | 20 | spin_lock(lock1); |
245 | 20 | } |
246 | 96 | } |
247 | | |
248 | | static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2, |
249 | | unsigned long flags) |
250 | 96 | { |
251 | 96 | if ( lock1 != lock2 ) |
252 | 30 | spin_unlock(lock2); |
253 | 96 | spin_unlock_irqrestore(lock1, flags); |
254 | 96 | } |
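
The pair above encodes the usual deadlock-avoidance rule for taking two runqueue locks at once: whenever the locks differ, every caller acquires the lower-addressed one first, so two CPUs migrating vCPUs between the same pair of runqueues can never each hold one lock while spinning on the other. A standalone sketch of that ordering discipline with plain pthread mutexes (illustrative only; the Xen code above uses its own spinlock/irqsave primitives):

    #include <pthread.h>

    /* Take two locks in a globally consistent order (lower address first),
     * so concurrent callers locking the same pair cannot deadlock. */
    static void lock_double(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        if ( a == b )
            pthread_mutex_lock(a);
        else if ( a < b )
        {
            pthread_mutex_lock(a);
            pthread_mutex_lock(b);
        }
        else
        {
            pthread_mutex_lock(b);
            pthread_mutex_lock(a);
        }
    }

    static void unlock_double(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        if ( a != b )
            pthread_mutex_unlock(b);
        pthread_mutex_unlock(a);
    }
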
255 | | |
256 | | int sched_init_vcpu(struct vcpu *v, unsigned int processor) |
257 | 24 | { |
258 | 24 | struct domain *d = v->domain; |
259 | 24 | |
260 | 24 | /* |
261 | 24 | * Initialize processor and affinity settings. The idler, and potentially |
262 | 24 | * domain-0 VCPUs, are pinned onto their respective physical CPUs. |
263 | 24 | */ |
264 | 24 | v->processor = processor; |
265 | 24 | if ( is_idle_domain(d) || d->is_pinned ) |
266 | 12 | cpumask_copy(v->cpu_hard_affinity, cpumask_of(processor)); |
267 | 24 | else |
268 | 12 | cpumask_setall(v->cpu_hard_affinity); |
269 | 24 | |
270 | 24 | cpumask_setall(v->cpu_soft_affinity); |
271 | 24 | |
272 | 24 | /* Initialise the per-vcpu timers. */ |
273 | 24 | init_timer(&v->periodic_timer, vcpu_periodic_timer_fn, |
274 | 24 | v, v->processor); |
275 | 24 | init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn, |
276 | 24 | v, v->processor); |
277 | 24 | init_timer(&v->poll_timer, poll_timer_fn, |
278 | 24 | v, v->processor); |
279 | 24 | |
280 | 24 | v->sched_priv = SCHED_OP(dom_scheduler(d), alloc_vdata, v, |
281 | 24 | d->sched_priv); |
282 | 24 | if ( v->sched_priv == NULL ) |
283 | 0 | return 1; |
284 | 24 | |
285 | 24 | /* Idle VCPUs are scheduled immediately, so don't put them in runqueue. */ |
286 | 24 | if ( is_idle_domain(d) ) |
287 | 12 | { |
288 | 12 | per_cpu(schedule_data, v->processor).curr = v; |
289 | 12 | v->is_running = 1; |
290 | 12 | } |
291 | 24 | else |
292 | 12 | { |
293 | 12 | SCHED_OP(dom_scheduler(d), insert_vcpu, v); |
294 | 12 | } |
295 | 24 | |
296 | 24 | return 0; |
297 | 24 | } |
298 | | |
299 | | static void sched_move_irqs(struct vcpu *v) |
300 | 536 | { |
301 | 536 | arch_move_irqs(v); |
302 | 536 | evtchn_move_pirqs(v); |
303 | 536 | } |
304 | | |
305 | | int sched_move_domain(struct domain *d, struct cpupool *c) |
306 | 0 | { |
307 | 0 | struct vcpu *v; |
308 | 0 | unsigned int new_p; |
309 | 0 | void **vcpu_priv; |
310 | 0 | void *domdata; |
311 | 0 | void *vcpudata; |
312 | 0 | struct scheduler *old_ops; |
313 | 0 | void *old_domdata; |
314 | 0 |
|
315 | 0 | for_each_vcpu ( d, v ) |
316 | 0 | { |
317 | 0 | if ( v->affinity_broken ) |
318 | 0 | return -EBUSY; |
319 | 0 | } |
320 | 0 |
|
321 | 0 | domdata = SCHED_OP(c->sched, alloc_domdata, d); |
322 | 0 | if ( domdata == NULL ) |
323 | 0 | return -ENOMEM; |
324 | 0 |
|
325 | 0 | vcpu_priv = xzalloc_array(void *, d->max_vcpus); |
326 | 0 | if ( vcpu_priv == NULL ) |
327 | 0 | { |
328 | 0 | SCHED_OP(c->sched, free_domdata, domdata); |
329 | 0 | return -ENOMEM; |
330 | 0 | } |
331 | 0 |
|
332 | 0 | for_each_vcpu ( d, v ) |
333 | 0 | { |
334 | 0 | vcpu_priv[v->vcpu_id] = SCHED_OP(c->sched, alloc_vdata, v, domdata); |
335 | 0 | if ( vcpu_priv[v->vcpu_id] == NULL ) |
336 | 0 | { |
337 | 0 | for_each_vcpu ( d, v ) |
338 | 0 | xfree(vcpu_priv[v->vcpu_id]); |
339 | 0 | xfree(vcpu_priv); |
340 | 0 | SCHED_OP(c->sched, free_domdata, domdata); |
341 | 0 | return -ENOMEM; |
342 | 0 | } |
343 | 0 | } |
344 | 0 |
|
345 | 0 | domain_pause(d); |
346 | 0 |
|
347 | 0 | old_ops = dom_scheduler(d); |
348 | 0 | old_domdata = d->sched_priv; |
349 | 0 |
|
350 | 0 | for_each_vcpu ( d, v ) |
351 | 0 | { |
352 | 0 | SCHED_OP(old_ops, remove_vcpu, v); |
353 | 0 | } |
354 | 0 |
|
355 | 0 | d->cpupool = c; |
356 | 0 | d->sched_priv = domdata; |
357 | 0 |
|
358 | 0 | new_p = cpumask_first(c->cpu_valid); |
359 | 0 | for_each_vcpu ( d, v ) |
360 | 0 | { |
361 | 0 | spinlock_t *lock; |
362 | 0 |
|
363 | 0 | vcpudata = v->sched_priv; |
364 | 0 |
|
365 | 0 | migrate_timer(&v->periodic_timer, new_p); |
366 | 0 | migrate_timer(&v->singleshot_timer, new_p); |
367 | 0 | migrate_timer(&v->poll_timer, new_p); |
368 | 0 |
|
369 | 0 | cpumask_setall(v->cpu_hard_affinity); |
370 | 0 | cpumask_setall(v->cpu_soft_affinity); |
371 | 0 |
|
372 | 0 | lock = vcpu_schedule_lock_irq(v); |
373 | 0 | v->processor = new_p; |
374 | 0 | /* |
375 | 0 | * With v->processor modified we must not |
376 | 0 | * - make any further changes assuming we hold the scheduler lock, |
377 | 0 | * - use vcpu_schedule_unlock_irq(). |
378 | 0 | */ |
379 | 0 | spin_unlock_irq(lock); |
380 | 0 |
|
381 | 0 | v->sched_priv = vcpu_priv[v->vcpu_id]; |
382 | 0 | if ( !d->is_dying ) |
383 | 0 | sched_move_irqs(v); |
384 | 0 |
|
385 | 0 | new_p = cpumask_cycle(new_p, c->cpu_valid); |
386 | 0 |
|
387 | 0 | SCHED_OP(c->sched, insert_vcpu, v); |
388 | 0 |
|
389 | 0 | SCHED_OP(old_ops, free_vdata, vcpudata); |
390 | 0 | } |
391 | 0 |
|
392 | 0 | domain_update_node_affinity(d); |
393 | 0 |
|
394 | 0 | domain_unpause(d); |
395 | 0 |
|
396 | 0 | SCHED_OP(old_ops, free_domdata, old_domdata); |
397 | 0 |
|
398 | 0 | xfree(vcpu_priv); |
399 | 0 |
|
400 | 0 | return 0; |
401 | 0 | } |
402 | | |
403 | | void sched_destroy_vcpu(struct vcpu *v) |
404 | 0 | { |
405 | 0 | kill_timer(&v->periodic_timer); |
406 | 0 | kill_timer(&v->singleshot_timer); |
407 | 0 | kill_timer(&v->poll_timer); |
408 | 0 | if ( test_and_clear_bool(v->is_urgent) ) |
409 | 0 | atomic_dec(&per_cpu(schedule_data, v->processor).urgent_count); |
410 | 0 | SCHED_OP(vcpu_scheduler(v), remove_vcpu, v); |
411 | 0 | SCHED_OP(vcpu_scheduler(v), free_vdata, v->sched_priv); |
412 | 0 | } |
413 | | |
414 | | int sched_init_domain(struct domain *d, int poolid) |
415 | 2 | { |
416 | 2 | int ret; |
417 | 2 | |
418 | 2 | ASSERT(d->cpupool == NULL); |
419 | 2 | |
420 | 2 | if ( (ret = cpupool_add_domain(d, poolid)) ) |
421 | 0 | return ret; |
422 | 2 | |
423 | 2 | SCHED_STAT_CRANK(dom_init); |
424 | 2 | TRACE_1D(TRC_SCHED_DOM_ADD, d->domain_id); |
425 | 2 | return SCHED_OP(dom_scheduler(d), init_domain, d); |
426 | 2 | } |
427 | | |
428 | | void sched_destroy_domain(struct domain *d) |
429 | 0 | { |
430 | 0 | ASSERT(d->cpupool != NULL || is_idle_domain(d)); |
431 | 0 |
|
432 | 0 | SCHED_STAT_CRANK(dom_destroy); |
433 | 0 | TRACE_1D(TRC_SCHED_DOM_REM, d->domain_id); |
434 | 0 | SCHED_OP(dom_scheduler(d), destroy_domain, d); |
435 | 0 |
|
436 | 0 | cpupool_rm_domain(d); |
437 | 0 | } |
438 | | |
439 | | void vcpu_sleep_nosync(struct vcpu *v) |
440 | 360 | { |
441 | 360 | unsigned long flags; |
442 | 360 | spinlock_t *lock; |
443 | 360 | |
444 | 360 | TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id); |
445 | 360 | |
446 | 360 | lock = vcpu_schedule_lock_irqsave(v, &flags); |
447 | 360 | |
448 | 360 | if ( likely(!vcpu_runnable(v)) ) |
449 | 360 | { |
450 | 360 | if ( v->runstate.state == RUNSTATE_runnable ) |
451 | 1 | vcpu_runstate_change(v, RUNSTATE_offline, NOW()); |
452 | 360 | |
453 | 360 | SCHED_OP(vcpu_scheduler(v), sleep, v); |
454 | 360 | } |
455 | 360 | |
456 | 360 | vcpu_schedule_unlock_irqrestore(lock, flags, v); |
457 | 360 | } |
458 | | |
459 | | void vcpu_sleep_sync(struct vcpu *v) |
460 | 315 | { |
461 | 315 | vcpu_sleep_nosync(v); |
462 | 315 | |
463 | 3.22k | while ( !vcpu_runnable(v) && v->is_running ) |
464 | 2.90k | cpu_relax(); |
465 | 315 | |
466 | 315 | sync_vcpu_execstate(v); |
467 | 315 | } |
468 | | |
469 | | void vcpu_wake(struct vcpu *v) |
470 | 67.2k | { |
471 | 67.2k | unsigned long flags; |
472 | 67.2k | spinlock_t *lock; |
473 | 67.2k | |
474 | 67.2k | TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id); |
475 | 67.2k | |
476 | 67.2k | lock = vcpu_schedule_lock_irqsave(v, &flags); |
477 | 67.2k | |
478 | 67.2k | if ( likely(vcpu_runnable(v)) ) |
479 | 67.1k | { |
480 | 67.1k | if ( v->runstate.state >= RUNSTATE_blocked ) |
481 | 66.3k | vcpu_runstate_change(v, RUNSTATE_runnable, NOW()); |
482 | 67.1k | SCHED_OP(vcpu_scheduler(v), wake, v); |
483 | 67.1k | } |
484 | 86 | else if ( !(v->pause_flags & VPF_blocked) ) |
485 | 117 | { |
486 | 117 | if ( v->runstate.state == RUNSTATE_blocked ) |
487 | 0 | vcpu_runstate_change(v, RUNSTATE_offline, NOW()); |
488 | 117 | } |
489 | 67.2k | |
490 | 67.2k | vcpu_schedule_unlock_irqrestore(lock, flags, v); |
491 | 67.2k | } |
492 | | |
493 | | void vcpu_unblock(struct vcpu *v) |
494 | 103k | { |
495 | 103k | if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) ) |
496 | 36.8k | return; |
497 | 103k | |
498 | 103k | /* Polling period ends when a VCPU is unblocked. */ |
499 | 67.0k | if ( unlikely(v->poll_evtchn != 0) ) |
500 | 0 | { |
501 | 0 | v->poll_evtchn = 0; |
502 | 0 | /* |
503 | 0 | * We *must* re-clear _VPF_blocked to avoid racing other wakeups of |
504 | 0 | * this VCPU (and it then going back to sleep on poll_mask). |
505 | 0 | * Test-and-clear is idiomatic and ensures clear_bit not reordered. |
506 | 0 | */ |
507 | 0 | if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) ) |
508 | 0 | clear_bit(_VPF_blocked, &v->pause_flags); |
509 | 0 | } |
510 | 67.0k | |
511 | 67.0k | vcpu_wake(v); |
512 | 67.0k | } |
513 | | |
514 | | /* |
515 | | * Do the actual movement of a vcpu from old to new CPU. Locks for *both* |
516 | | * CPUs need to have been taken already when calling this! |
517 | | */ |
518 | | static void vcpu_move_locked(struct vcpu *v, unsigned int new_cpu) |
519 | 65 | { |
520 | 65 | unsigned int old_cpu = v->processor; |
521 | 65 | |
522 | 65 | /* |
523 | 65 | * Transfer urgency status to new CPU before switching CPUs, as |
524 | 65 | * once the switch occurs, v->is_urgent is no longer protected by |
525 | 65 | * the per-CPU scheduler lock we are holding. |
526 | 65 | */ |
527 | 65 | if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) ) |
528 | 0 | { |
529 | 0 | atomic_inc(&per_cpu(schedule_data, new_cpu).urgent_count); |
530 | 0 | atomic_dec(&per_cpu(schedule_data, old_cpu).urgent_count); |
531 | 0 | } |
532 | 65 | |
533 | 65 | /* |
534 | 65 | * Actual CPU switch to new CPU. This is safe because the lock |
535 | 65 | * pointer can't change while the current lock is held. |
536 | 65 | */ |
537 | 65 | if ( vcpu_scheduler(v)->migrate ) |
538 | 0 | SCHED_OP(vcpu_scheduler(v), migrate, v, new_cpu); |
539 | 65 | else |
540 | 65 | v->processor = new_cpu; |
541 | 65 | } |
542 | | |
543 | | /* |
544 | | * Move a vcpu from its current processor to a target new processor, |
545 | | * without asking the scheduler to do any placement. This is intended |
546 | | * for being called from special contexts, where things are quiet |
547 | | * enough that no contention is supposed to happen (i.e., during |
548 | | * shutdown or software suspend, like ACPI S3). |
549 | | */ |
550 | | static void vcpu_move_nosched(struct vcpu *v, unsigned int new_cpu) |
551 | 0 | { |
552 | 0 | unsigned long flags; |
553 | 0 | spinlock_t *lock, *new_lock; |
554 | 0 |
|
555 | 0 | ASSERT(system_state == SYS_STATE_suspend); |
556 | 0 | ASSERT(!vcpu_runnable(v) && (atomic_read(&v->pause_count) || |
557 | 0 | atomic_read(&v->domain->pause_count))); |
558 | 0 |
|
559 | 0 | lock = per_cpu(schedule_data, v->processor).schedule_lock; |
560 | 0 | new_lock = per_cpu(schedule_data, new_cpu).schedule_lock; |
561 | 0 |
|
562 | 0 | sched_spin_lock_double(lock, new_lock, &flags); |
563 | 0 | ASSERT(new_cpu != v->processor); |
564 | 0 | vcpu_move_locked(v, new_cpu); |
565 | 0 | sched_spin_unlock_double(lock, new_lock, flags); |
566 | 0 |
|
567 | 0 | sched_move_irqs(v); |
568 | 0 | } |
569 | | |
570 | | static void vcpu_migrate(struct vcpu *v) |
571 | 66 | { |
572 | 66 | unsigned long flags; |
573 | 66 | unsigned int old_cpu, new_cpu; |
574 | 66 | spinlock_t *old_lock, *new_lock; |
575 | 66 | bool_t pick_called = 0; |
576 | 66 | |
577 | 66 | old_cpu = new_cpu = v->processor; |
578 | 66 | for ( ; ; ) |
579 | 96 | { |
580 | 96 | /* |
581 | 96 | * We need another iteration if the pre-calculated lock addresses |
582 | 96 | * are no longer correct after re-evaluating the old and new cpu while |
583 | 96 | * holding the locks. |
584 | 96 | */ |
585 | 96 | old_lock = per_cpu(schedule_data, old_cpu).schedule_lock; |
586 | 96 | new_lock = per_cpu(schedule_data, new_cpu).schedule_lock; |
587 | 96 | |
588 | 96 | sched_spin_lock_double(old_lock, new_lock, &flags); |
589 | 96 | |
590 | 96 | old_cpu = v->processor; |
591 | 96 | if ( old_lock == per_cpu(schedule_data, old_cpu).schedule_lock ) |
592 | 96 | { |
593 | 96 | /* |
594 | 96 | * If we selected a CPU on the previous iteration, check if it |
595 | 96 | * remains suitable for running this vCPU. |
596 | 96 | */ |
597 | 96 | if ( pick_called && |
598 | 30 | (new_lock == per_cpu(schedule_data, new_cpu).schedule_lock) && |
599 | 96 | cpumask_test_cpu(new_cpu, v->cpu_hard_affinity) && |
600 | 30 | cpumask_test_cpu(new_cpu, v->domain->cpupool->cpu_valid) ) |
601 | 30 | break; |
602 | 96 | |
603 | 96 | /* Select a new CPU. */ |
604 | 66 | new_cpu = SCHED_OP(vcpu_scheduler(v), pick_cpu, v); |
605 | 66 | if ( (new_lock == per_cpu(schedule_data, new_cpu).schedule_lock) && |
606 | 36 | cpumask_test_cpu(new_cpu, v->domain->cpupool->cpu_valid) ) |
607 | 36 | break; |
608 | 30 | pick_called = 1; |
609 | 30 | } |
610 | 96 | else |
611 | 0 | { |
612 | 0 | /* |
613 | 0 | * We do not hold the scheduler lock appropriate for this vCPU. |
614 | 0 | * Thus we cannot select a new CPU on this iteration. Try again. |
615 | 0 | */ |
616 | 0 | pick_called = 0; |
617 | 0 | } |
618 | 96 | |
619 | 30 | sched_spin_unlock_double(old_lock, new_lock, flags); |
620 | 30 | } |
621 | 66 | |
622 | 66 | /* |
623 | 66 | * NB. Check of v->is_running happens /after/ setting migration flag |
624 | 66 | * because they both happen in (different) spinlock regions, and those |
625 | 66 | * regions are strictly serialised. |
626 | 66 | */ |
627 | 66 | if ( v->is_running || |
628 | 65 | !test_and_clear_bit(_VPF_migrating, &v->pause_flags) ) |
629 | 1 | { |
630 | 1 | sched_spin_unlock_double(old_lock, new_lock, flags); |
631 | 1 | return; |
632 | 1 | } |
633 | 66 | |
634 | 65 | vcpu_move_locked(v, new_cpu); |
635 | 65 | |
636 | 65 | sched_spin_unlock_double(old_lock, new_lock, flags); |
637 | 65 | |
638 | 65 | if ( old_cpu != new_cpu ) |
639 | 30 | sched_move_irqs(v); |
640 | 65 | |
641 | 65 | /* Wake on new CPU. */ |
642 | 65 | vcpu_wake(v); |
643 | 65 | } |
644 | | |
645 | | /* |
646 | | * Force a VCPU through a deschedule/reschedule path. |
647 | | * For example, using this when setting the periodic timer period means that |
648 | | * most periodic-timer state need only be touched from within the scheduler |
649 | | * which can thus be done without need for synchronisation. |
650 | | */ |
651 | | void vcpu_force_reschedule(struct vcpu *v) |
652 | 14 | { |
653 | 14 | spinlock_t *lock = vcpu_schedule_lock_irq(v); |
654 | 14 | |
655 | 14 | if ( v->is_running ) |
656 | 1 | set_bit(_VPF_migrating, &v->pause_flags); |
657 | 14 | vcpu_schedule_unlock_irq(lock, v); |
658 | 14 | |
659 | 14 | if ( v->pause_flags & VPF_migrating ) |
660 | 1 | { |
661 | 1 | vcpu_sleep_nosync(v); |
662 | 1 | vcpu_migrate(v); |
663 | 1 | } |
664 | 14 | } |
665 | | |
666 | | void restore_vcpu_affinity(struct domain *d) |
667 | 0 | { |
668 | 0 | unsigned int cpu = smp_processor_id(); |
669 | 0 | struct vcpu *v; |
670 | 0 |
|
671 | 0 | ASSERT(system_state == SYS_STATE_resume); |
672 | 0 |
|
673 | 0 | for_each_vcpu ( d, v ) |
674 | 0 | { |
675 | 0 | spinlock_t *lock; |
676 | 0 |
|
677 | 0 | ASSERT(!vcpu_runnable(v)); |
678 | 0 |
|
679 | 0 | lock = vcpu_schedule_lock_irq(v); |
680 | 0 |
|
681 | 0 | if ( v->affinity_broken ) |
682 | 0 | { |
683 | 0 | cpumask_copy(v->cpu_hard_affinity, v->cpu_hard_affinity_saved); |
684 | 0 | v->affinity_broken = 0; |
685 | 0 |
|
686 | 0 | } |
687 | 0 |
|
688 | 0 | /* |
689 | 0 | * During suspend (in cpu_disable_scheduler()), we moved every vCPU |
690 | 0 | * to BSP (which, as of now, is pCPU 0), as a temporary measure to |
691 | 0 | * allow the non-boot processors to have their data structures freed |
692 | 0 | * and go to sleep. But nothing guarantees that the BSP is a valid |
693 | 0 | * pCPU for a particular domain. |
694 | 0 | * |
695 | 0 | * Therefore, here, before actually unpausing the domains, we should |
696 | 0 | * set v->processor of each of their vCPUs to something that will |
697 | 0 | * make sense for the scheduler of the cpupool they are in. |
698 | 0 | */ |
699 | 0 | cpumask_and(cpumask_scratch_cpu(cpu), v->cpu_hard_affinity, |
700 | 0 | cpupool_domain_cpumask(v->domain)); |
701 | 0 | v->processor = cpumask_any(cpumask_scratch_cpu(cpu)); |
702 | 0 |
|
703 | 0 | spin_unlock_irq(lock); |
704 | 0 |
|
705 | 0 | lock = vcpu_schedule_lock_irq(v); |
706 | 0 | v->processor = SCHED_OP(vcpu_scheduler(v), pick_cpu, v); |
707 | 0 | spin_unlock_irq(lock); |
708 | 0 | } |
709 | 0 |
|
710 | 0 | domain_update_node_affinity(d); |
711 | 0 | } |
712 | | |
713 | | /* |
714 | | * This function is used by cpu_hotplug code from stop_machine context |
715 | | * and from cpupools to switch schedulers on a cpu. |
716 | | */ |
717 | | int cpu_disable_scheduler(unsigned int cpu) |
718 | 0 | { |
719 | 0 | struct domain *d; |
720 | 0 | struct vcpu *v; |
721 | 0 | struct cpupool *c; |
722 | 0 | cpumask_t online_affinity; |
723 | 0 | unsigned int new_cpu; |
724 | 0 | int ret = 0; |
725 | 0 |
|
726 | 0 | c = per_cpu(cpupool, cpu); |
727 | 0 | if ( c == NULL ) |
728 | 0 | return ret; |
729 | 0 |
|
730 | 0 | /* |
731 | 0 | * We'd need the domain RCU lock, but: |
732 | 0 | * - when we are called from cpupool code, it's acquired there already; |
733 | 0 | * - when we are called for CPU teardown, we're in stop-machine context, |
734 | 0 | * so that's not a problem. |
735 | 0 | */ |
736 | 0 | for_each_domain_in_cpupool ( d, c ) |
737 | 0 | { |
738 | 0 | for_each_vcpu ( d, v ) |
739 | 0 | { |
740 | 0 | unsigned long flags; |
741 | 0 | spinlock_t *lock = vcpu_schedule_lock_irqsave(v, &flags); |
742 | 0 |
|
743 | 0 | cpumask_and(&online_affinity, v->cpu_hard_affinity, c->cpu_valid); |
744 | 0 | if ( cpumask_empty(&online_affinity) && |
745 | 0 | cpumask_test_cpu(cpu, v->cpu_hard_affinity) ) |
746 | 0 | { |
747 | 0 | if ( v->affinity_broken ) |
748 | 0 | { |
749 | 0 | /* The vcpu is temporarily pinned, can't move it. */ |
750 | 0 | vcpu_schedule_unlock_irqrestore(lock, flags, v); |
751 | 0 | ret = -EADDRINUSE; |
752 | 0 | break; |
753 | 0 | } |
754 | 0 |
|
755 | 0 | if (system_state == SYS_STATE_suspend) |
756 | 0 | { |
757 | 0 | cpumask_copy(v->cpu_hard_affinity_saved, |
758 | 0 | v->cpu_hard_affinity); |
759 | 0 | v->affinity_broken = 1; |
760 | 0 | } |
761 | 0 | else |
762 | 0 | printk(XENLOG_DEBUG "Breaking affinity for %pv\n", v); |
763 | 0 |
|
764 | 0 | cpumask_setall(v->cpu_hard_affinity); |
765 | 0 | } |
766 | 0 |
|
767 | 0 | if ( v->processor != cpu ) |
768 | 0 | { |
769 | 0 | /* The vcpu is not on this cpu, so we can move on. */ |
770 | 0 | vcpu_schedule_unlock_irqrestore(lock, flags, v); |
771 | 0 | continue; |
772 | 0 | } |
773 | 0 |
|
774 | 0 | /* If it is on this cpu, we must send it away. */ |
775 | 0 | if ( unlikely(system_state == SYS_STATE_suspend) ) |
776 | 0 | { |
777 | 0 | vcpu_schedule_unlock_irqrestore(lock, flags, v); |
778 | 0 |
|
779 | 0 | /* |
780 | 0 | * If we are doing a shutdown/suspend, it is not necessary to |
781 | 0 | * ask the scheduler to chime in. In fact: |
782 | 0 | * * there is no reason for it: the end result we are after |
783 | 0 | * is just 'all the vcpus on the boot pcpu, and no vcpu |
784 | 0 | * anywhere else', so let's just go for it; |
785 | 0 | * * it's wrong, for cpupools with only non-boot pcpus, as |
786 | 0 | * the scheduler would always fail to send the vcpus away |
787 | 0 | * from the last online (non boot) pcpu! |
788 | 0 | * |
789 | 0 | * Therefore, in the shutdown/suspend case, we just pick up |
790 | 0 | * one (still) online pcpu. Note that, at this stage, all |
791 | 0 | * domains (including dom0) have been paused already, so we |
792 | 0 | * do not expect any vcpu activity at all. |
793 | 0 | */ |
794 | 0 | cpumask_andnot(&online_affinity, &cpu_online_map, |
795 | 0 | cpumask_of(cpu)); |
796 | 0 | BUG_ON(cpumask_empty(&online_affinity)); |
797 | 0 | /* |
798 | 0 | * As boot cpu is, usually, pcpu #0, using cpumask_first() |
799 | 0 | * will make us converge quicker. |
800 | 0 | */ |
801 | 0 | new_cpu = cpumask_first(&online_affinity); |
802 | 0 | vcpu_move_nosched(v, new_cpu); |
803 | 0 | } |
804 | 0 | else |
805 | 0 | { |
806 | 0 | /* |
807 | 0 | * OTOH, if the system is still live, and we are here because |
808 | 0 | * we are doing some cpupool manipulations: |
809 | 0 | * * we want to call the scheduler, and let it re-evaluate |
810 | 0 | * the placement of the vcpu, taking into account the new |
811 | 0 | * cpupool configuration; |
812 | 0 | * * the scheduler will always find a suitable solution, or |
813 | 0 | * things would have failed before getting in here. |
814 | 0 | */ |
815 | 0 | set_bit(_VPF_migrating, &v->pause_flags); |
816 | 0 | vcpu_schedule_unlock_irqrestore(lock, flags, v); |
817 | 0 | vcpu_sleep_nosync(v); |
818 | 0 | vcpu_migrate(v); |
819 | 0 |
|
820 | 0 | /* |
821 | 0 | * The only caveat, in this case, is a vcpu that is active in |
822 | 0 | * the hypervisor but isn't migratable. If so, the caller |
823 | 0 | * should try again after releasing and reacquiring all locks. |
824 | 0 | */ |
825 | 0 | if ( v->processor == cpu ) |
826 | 0 | ret = -EAGAIN; |
827 | 0 | } |
828 | 0 | } |
829 | 0 | } |
830 | 0 |
|
831 | 0 | return ret; |
832 | 0 | } |
833 | | |
834 | | static int vcpu_set_affinity( |
835 | | struct vcpu *v, const cpumask_t *affinity, cpumask_t *which) |
836 | 0 | { |
837 | 0 | spinlock_t *lock; |
838 | 0 | int ret = 0; |
839 | 0 |
|
840 | 0 | lock = vcpu_schedule_lock_irq(v); |
841 | 0 |
|
842 | 0 | if ( v->affinity_broken ) |
843 | 0 | ret = -EBUSY; |
844 | 0 | else |
845 | 0 | { |
846 | 0 | cpumask_copy(which, affinity); |
847 | 0 |
|
848 | 0 | /* |
849 | 0 | * Always ask the scheduler to re-evaluate placement |
850 | 0 | * when changing the affinity. |
851 | 0 | */ |
852 | 0 | set_bit(_VPF_migrating, &v->pause_flags); |
853 | 0 | } |
854 | 0 |
|
855 | 0 | vcpu_schedule_unlock_irq(lock, v); |
856 | 0 |
|
857 | 0 | domain_update_node_affinity(v->domain); |
858 | 0 |
|
859 | 0 | if ( v->pause_flags & VPF_migrating ) |
860 | 0 | { |
861 | 0 | vcpu_sleep_nosync(v); |
862 | 0 | vcpu_migrate(v); |
863 | 0 | } |
864 | 0 |
|
865 | 0 | return ret; |
866 | 0 | } |
867 | | |
868 | | int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity) |
869 | 0 | { |
870 | 0 | cpumask_t online_affinity; |
871 | 0 | cpumask_t *online; |
872 | 0 |
|
873 | 0 | if ( v->domain->is_pinned ) |
874 | 0 | return -EINVAL; |
875 | 0 |
|
876 | 0 | online = VCPU2ONLINE(v); |
877 | 0 | cpumask_and(&online_affinity, affinity, online); |
878 | 0 | if ( cpumask_empty(&online_affinity) ) |
879 | 0 | return -EINVAL; |
880 | 0 |
|
881 | 0 | return vcpu_set_affinity(v, affinity, v->cpu_hard_affinity); |
882 | 0 | } |
883 | | |
884 | | int vcpu_set_soft_affinity(struct vcpu *v, const cpumask_t *affinity) |
885 | 0 | { |
886 | 0 | return vcpu_set_affinity(v, affinity, v->cpu_soft_affinity); |
887 | 0 | } |
888 | | |
889 | | /* Block the currently-executing domain until a pertinent event occurs. */ |
890 | | void vcpu_block(void) |
891 | 65.3k | { |
892 | 65.3k | struct vcpu *v = current; |
893 | 65.3k | |
894 | 65.3k | set_bit(_VPF_blocked, &v->pause_flags); |
895 | 65.3k | |
896 | 65.3k | arch_vcpu_block(v); |
897 | 65.3k | |
898 | 65.3k | /* Check for events /after/ blocking: avoids wakeup waiting race. */ |
899 | 65.3k | if ( local_events_need_delivery() ) |
900 | 171 | { |
901 | 171 | clear_bit(_VPF_blocked, &v->pause_flags); |
902 | 171 | } |
903 | 65.3k | else |
904 | 65.2k | { |
905 | 65.2k | TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id); |
906 | 65.2k | raise_softirq(SCHEDULE_SOFTIRQ); |
907 | 65.2k | } |
908 | 65.3k | } |
909 | | |
910 | | static void vcpu_block_enable_events(void) |
911 | 65.4k | { |
912 | 65.4k | local_event_delivery_enable(); |
913 | 65.4k | vcpu_block(); |
914 | 65.4k | } |
915 | | |
916 | | static long do_poll(struct sched_poll *sched_poll) |
917 | 0 | { |
918 | 0 | struct vcpu *v = current; |
919 | 0 | struct domain *d = v->domain; |
920 | 0 | evtchn_port_t port; |
921 | 0 | long rc; |
922 | 0 | unsigned int i; |
923 | 0 |
|
924 | 0 | /* Fairly arbitrary limit. */ |
925 | 0 | if ( sched_poll->nr_ports > 128 ) |
926 | 0 | return -EINVAL; |
927 | 0 |
|
928 | 0 | if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) ) |
929 | 0 | return -EFAULT; |
930 | 0 |
|
931 | 0 | set_bit(_VPF_blocked, &v->pause_flags); |
932 | 0 | v->poll_evtchn = -1; |
933 | 0 | set_bit(v->vcpu_id, d->poll_mask); |
934 | 0 |
|
935 | 0 | arch_vcpu_block(v); |
936 | 0 |
|
937 | 0 | #ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */ |
938 | | /* Check for events /after/ setting flags: avoids wakeup waiting race. */ |
939 | | smp_mb(); |
940 | | |
941 | | /* |
942 | | * Someone may have seen we are blocked but not that we are polling, or |
943 | | * vice versa. We are certainly being woken, so clean up and bail. Beyond |
944 | | * this point others can be guaranteed to clean up for us if they wake us. |
945 | | */ |
946 | | rc = 0; |
947 | | if ( (v->poll_evtchn == 0) || |
948 | | !test_bit(_VPF_blocked, &v->pause_flags) || |
949 | | !test_bit(v->vcpu_id, d->poll_mask) ) |
950 | | goto out; |
951 | | #endif |
952 | 0 |
|
953 | 0 | rc = 0; |
954 | 0 | if ( local_events_need_delivery() ) |
955 | 0 | goto out; |
956 | 0 |
|
957 | 0 | for ( i = 0; i < sched_poll->nr_ports; i++ ) |
958 | 0 | { |
959 | 0 | rc = -EFAULT; |
960 | 0 | if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) ) |
961 | 0 | goto out; |
962 | 0 |
|
963 | 0 | rc = -EINVAL; |
964 | 0 | if ( port >= d->max_evtchns ) |
965 | 0 | goto out; |
966 | 0 |
|
967 | 0 | rc = 0; |
968 | 0 | if ( evtchn_port_is_pending(d, port) ) |
969 | 0 | goto out; |
970 | 0 | } |
971 | 0 |
|
972 | 0 | if ( sched_poll->nr_ports == 1 ) |
973 | 0 | v->poll_evtchn = port; |
974 | 0 |
|
975 | 0 | if ( sched_poll->timeout != 0 ) |
976 | 0 | set_timer(&v->poll_timer, sched_poll->timeout); |
977 | 0 |
|
978 | 0 | TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id); |
979 | 0 | raise_softirq(SCHEDULE_SOFTIRQ); |
980 | 0 |
|
981 | 0 | return 0; |
982 | 0 |
|
983 | 0 | out: |
984 | 0 | v->poll_evtchn = 0; |
985 | 0 | clear_bit(v->vcpu_id, d->poll_mask); |
986 | 0 | clear_bit(_VPF_blocked, &v->pause_flags); |
987 | 0 | return rc; |
988 | 0 | } |
989 | | |
990 | | /* Voluntarily yield the processor for this allocation. */ |
991 | | long vcpu_yield(void) |
992 | 4.53M | { |
993 | 4.53M | struct vcpu * v=current; |
994 | 4.53M | spinlock_t *lock = vcpu_schedule_lock_irq(v); |
995 | 4.53M | |
996 | 4.53M | SCHED_OP(vcpu_scheduler(v), yield, v); |
997 | 4.53M | vcpu_schedule_unlock_irq(lock, v); |
998 | 4.53M | |
999 | 4.53M | SCHED_STAT_CRANK(vcpu_yield); |
1000 | 4.53M | |
1001 | 4.53M | TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id); |
1002 | 4.53M | raise_softirq(SCHEDULE_SOFTIRQ); |
1003 | 4.53M | return 0; |
1004 | 4.53M | } |
1005 | | |
1006 | | static void domain_watchdog_timeout(void *data) |
1007 | 0 | { |
1008 | 0 | struct domain *d = data; |
1009 | 0 |
|
1010 | 0 | if ( d->is_shutting_down || d->is_dying ) |
1011 | 0 | return; |
1012 | 0 |
|
1013 | 0 | printk("Watchdog timer fired for domain %u\n", d->domain_id); |
1014 | 0 | domain_shutdown(d, SHUTDOWN_watchdog); |
1015 | 0 | } |
1016 | | |
1017 | | static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout) |
1018 | 0 | { |
1019 | 0 | if ( id > NR_DOMAIN_WATCHDOG_TIMERS ) |
1020 | 0 | return -EINVAL; |
1021 | 0 |
|
1022 | 0 | spin_lock(&d->watchdog_lock); |
1023 | 0 |
|
1024 | 0 | if ( id == 0 ) |
1025 | 0 | { |
1026 | 0 | for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ ) |
1027 | 0 | { |
1028 | 0 | if ( test_and_set_bit(id, &d->watchdog_inuse_map) ) |
1029 | 0 | continue; |
1030 | 0 | set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout)); |
1031 | 0 | break; |
1032 | 0 | } |
1033 | 0 | spin_unlock(&d->watchdog_lock); |
1034 | 0 | return id == NR_DOMAIN_WATCHDOG_TIMERS ? -ENOSPC : id + 1; |
1035 | 0 | } |
1036 | 0 |
|
1037 | 0 | id -= 1; |
1038 | 0 | if ( !test_bit(id, &d->watchdog_inuse_map) ) |
1039 | 0 | { |
1040 | 0 | spin_unlock(&d->watchdog_lock); |
1041 | 0 | return -EINVAL; |
1042 | 0 | } |
1043 | 0 |
|
1044 | 0 | if ( timeout == 0 ) |
1045 | 0 | { |
1046 | 0 | stop_timer(&d->watchdog_timer[id]); |
1047 | 0 | clear_bit(id, &d->watchdog_inuse_map); |
1048 | 0 | } |
1049 | 0 | else |
1050 | 0 | { |
1051 | 0 | set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout)); |
1052 | 0 | } |
1053 | 0 |
|
1054 | 0 | spin_unlock(&d->watchdog_lock); |
1055 | 0 | return 0; |
1056 | 0 | } |
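
domain_watchdog() implements a small slot-allocation protocol: id 0 means "allocate and arm a fresh watchdog slot" and the call returns slot + 1; a non-zero id addresses the existing slot id - 1, re-arming it, or stopping and freeing it when timeout is 0. A guest-side usage sketch, assuming the usual guest hypercall wrapper HYPERVISOR_sched_op() and the public struct sched_watchdog from the guest's Xen headers (not shown here):

    #include <stdint.h>

    /* Hedged sketch: thin wrapper over SCHEDOP_watchdog as a guest would use it. */
    static long xen_watchdog(uint32_t id, uint32_t timeout_secs)
    {
        struct sched_watchdog sw = { .id = id, .timeout = timeout_secs };

        return HYPERVISOR_sched_op(SCHEDOP_watchdog, &sw);
    }

    void watchdog_example(void)
    {
        /* id == 0: allocate a slot armed for 30 seconds; returns slot + 1. */
        long wd = xen_watchdog(0, 30);

        if ( wd <= 0 )
            return;                       /* e.g. -ENOSPC: no free slot */

        /* Kick the same slot before its 30 second deadline expires. */
        xen_watchdog(wd, 30);

        /* timeout == 0: stop the timer and release the slot. */
        xen_watchdog(wd, 0);
    }
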
1057 | | |
1058 | | void watchdog_domain_init(struct domain *d) |
1059 | 5 | { |
1060 | 5 | unsigned int i; |
1061 | 5 | |
1062 | 5 | spin_lock_init(&d->watchdog_lock); |
1063 | 5 | |
1064 | 5 | d->watchdog_inuse_map = 0; |
1065 | 5 | |
1066 | 15 | for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ ) |
1067 | 10 | init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0); |
1068 | 5 | } |
1069 | | |
1070 | | void watchdog_domain_destroy(struct domain *d) |
1071 | 0 | { |
1072 | 0 | unsigned int i; |
1073 | 0 |
|
1074 | 0 | for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ ) |
1075 | 0 | kill_timer(&d->watchdog_timer[i]); |
1076 | 0 | } |
1077 | | |
1078 | | int vcpu_pin_override(struct vcpu *v, int cpu) |
1079 | 0 | { |
1080 | 0 | spinlock_t *lock; |
1081 | 0 | int ret = -EINVAL; |
1082 | 0 |
|
1083 | 0 | lock = vcpu_schedule_lock_irq(v); |
1084 | 0 |
|
1085 | 0 | if ( cpu < 0 ) |
1086 | 0 | { |
1087 | 0 | if ( v->affinity_broken ) |
1088 | 0 | { |
1089 | 0 | cpumask_copy(v->cpu_hard_affinity, v->cpu_hard_affinity_saved); |
1090 | 0 | v->affinity_broken = 0; |
1091 | 0 | set_bit(_VPF_migrating, &v->pause_flags); |
1092 | 0 | ret = 0; |
1093 | 0 | } |
1094 | 0 | } |
1095 | 0 | else if ( cpu < nr_cpu_ids ) |
1096 | 0 | { |
1097 | 0 | if ( v->affinity_broken ) |
1098 | 0 | ret = -EBUSY; |
1099 | 0 | else if ( cpumask_test_cpu(cpu, VCPU2ONLINE(v)) ) |
1100 | 0 | { |
1101 | 0 | cpumask_copy(v->cpu_hard_affinity_saved, v->cpu_hard_affinity); |
1102 | 0 | v->affinity_broken = 1; |
1103 | 0 | cpumask_copy(v->cpu_hard_affinity, cpumask_of(cpu)); |
1104 | 0 | set_bit(_VPF_migrating, &v->pause_flags); |
1105 | 0 | ret = 0; |
1106 | 0 | } |
1107 | 0 | } |
1108 | 0 |
|
1109 | 0 | vcpu_schedule_unlock_irq(lock, v); |
1110 | 0 |
|
1111 | 0 | domain_update_node_affinity(v->domain); |
1112 | 0 |
|
1113 | 0 | if ( v->pause_flags & VPF_migrating ) |
1114 | 0 | { |
1115 | 0 | vcpu_sleep_nosync(v); |
1116 | 0 | vcpu_migrate(v); |
1117 | 0 | } |
1118 | 0 |
|
1119 | 0 | return ret; |
1120 | 0 | } |
1121 | | |
1122 | | typedef long ret_t; |
1123 | | |
1124 | | #endif /* !COMPAT */ |
1125 | | |
1126 | | ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) |
1127 | 4.58M | { |
1128 | 0 | ret_t ret = 0; |
1129 | 4.58M | |
1130 | 4.58M | switch ( cmd ) |
1131 | 4.58M | { |
1132 | 4.52M | case SCHEDOP_yield: |
1133 | 4.52M | { |
1134 | 4.52M | ret = vcpu_yield(); |
1135 | 4.52M | break; |
1136 | 4.52M | } |
1137 | 4.52M | |
1138 | 65.6k | case SCHEDOP_block: |
1139 | 65.6k | { |
1140 | 65.6k | vcpu_block_enable_events(); |
1141 | 65.6k | break; |
1142 | 4.52M | } |
1143 | 4.52M | |
1144 | 0 | case SCHEDOP_shutdown: |
1145 | 0 | { |
1146 | 0 | struct sched_shutdown sched_shutdown; |
1147 | 0 |
|
1148 | 0 | ret = -EFAULT; |
1149 | 0 | if ( copy_from_guest(&sched_shutdown, arg, 1) ) |
1150 | 0 | break; |
1151 | 0 |
|
1152 | 0 | ret = 0; |
1153 | 0 | TRACE_3D(TRC_SCHED_SHUTDOWN, |
1154 | 0 | current->domain->domain_id, current->vcpu_id, |
1155 | 0 | sched_shutdown.reason); |
1156 | 0 | domain_shutdown(current->domain, (u8)sched_shutdown.reason); |
1157 | 0 |
|
1158 | 0 | break; |
1159 | 0 | } |
1160 | 0 |
|
1161 | 0 | case SCHEDOP_shutdown_code: |
1162 | 0 | { |
1163 | 0 | struct sched_shutdown sched_shutdown; |
1164 | 0 | struct domain *d = current->domain; |
1165 | 0 |
|
1166 | 0 | ret = -EFAULT; |
1167 | 0 | if ( copy_from_guest(&sched_shutdown, arg, 1) ) |
1168 | 0 | break; |
1169 | 0 |
|
1170 | 0 | TRACE_3D(TRC_SCHED_SHUTDOWN_CODE, |
1171 | 0 | d->domain_id, current->vcpu_id, sched_shutdown.reason); |
1172 | 0 |
|
1173 | 0 | spin_lock(&d->shutdown_lock); |
1174 | 0 | if ( d->shutdown_code == SHUTDOWN_CODE_INVALID ) |
1175 | 0 | d->shutdown_code = (u8)sched_shutdown.reason; |
1176 | 0 | spin_unlock(&d->shutdown_lock); |
1177 | 0 |
|
1178 | 0 | ret = 0; |
1179 | 0 | break; |
1180 | 0 | } |
1181 | 0 |
|
1182 | 0 | case SCHEDOP_poll: |
1183 | 0 | { |
1184 | 0 | struct sched_poll sched_poll; |
1185 | 0 |
|
1186 | 0 | ret = -EFAULT; |
1187 | 0 | if ( copy_from_guest(&sched_poll, arg, 1) ) |
1188 | 0 | break; |
1189 | 0 |
|
1190 | 0 | ret = do_poll(&sched_poll); |
1191 | 0 |
|
1192 | 0 | break; |
1193 | 0 | } |
1194 | 0 |
|
1195 | 0 | case SCHEDOP_remote_shutdown: |
1196 | 0 | { |
1197 | 0 | struct domain *d; |
1198 | 0 | struct sched_remote_shutdown sched_remote_shutdown; |
1199 | 0 |
|
1200 | 0 | ret = -EFAULT; |
1201 | 0 | if ( copy_from_guest(&sched_remote_shutdown, arg, 1) ) |
1202 | 0 | break; |
1203 | 0 |
|
1204 | 0 | ret = -ESRCH; |
1205 | 0 | d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id); |
1206 | 0 | if ( d == NULL ) |
1207 | 0 | break; |
1208 | 0 |
|
1209 | 0 | ret = xsm_schedop_shutdown(XSM_DM_PRIV, current->domain, d); |
1210 | 0 | if ( likely(!ret) ) |
1211 | 0 | domain_shutdown(d, sched_remote_shutdown.reason); |
1212 | 0 |
|
1213 | 0 | rcu_unlock_domain(d); |
1214 | 0 |
|
1215 | 0 | break; |
1216 | 0 | } |
1217 | 0 |
|
1218 | 0 | case SCHEDOP_watchdog: |
1219 | 0 | { |
1220 | 0 | struct sched_watchdog sched_watchdog; |
1221 | 0 |
|
1222 | 0 | ret = -EFAULT; |
1223 | 0 | if ( copy_from_guest(&sched_watchdog, arg, 1) ) |
1224 | 0 | break; |
1225 | 0 |
|
1226 | 0 | ret = domain_watchdog( |
1227 | 0 | current->domain, sched_watchdog.id, sched_watchdog.timeout); |
1228 | 0 | break; |
1229 | 0 | } |
1230 | 0 |
|
1231 | 0 | case SCHEDOP_pin_override: |
1232 | 0 | { |
1233 | 0 | struct sched_pin_override sched_pin_override; |
1234 | 0 |
|
1235 | 0 | ret = -EPERM; |
1236 | 0 | if ( !is_hardware_domain(current->domain) ) |
1237 | 0 | break; |
1238 | 0 |
|
1239 | 0 | ret = -EFAULT; |
1240 | 0 | if ( copy_from_guest(&sched_pin_override, arg, 1) ) |
1241 | 0 | break; |
1242 | 0 |
|
1243 | 0 | ret = vcpu_pin_override(current, sched_pin_override.pcpu); |
1244 | 0 |
|
1245 | 0 | break; |
1246 | 0 | } |
1247 | 0 |
|
1248 | 0 | default: |
1249 | 0 | ret = -ENOSYS; |
1250 | 4.58M | } |
1251 | 4.58M | |
1252 | 5.02M | return ret; |
1253 | 4.58M | } |
Unexecuted instantiation: compat_sched_op
1254 | | |
1255 | | #ifndef COMPAT |
1256 | | |
1257 | | /* Per-vcpu oneshot-timer hypercall. */ |
1258 | | long do_set_timer_op(s_time_t timeout) |
1259 | 0 | { |
1260 | 0 | struct vcpu *v = current; |
1261 | 0 | s_time_t offset = timeout - NOW(); |
1262 | 0 |
|
1263 | 0 | if ( timeout == 0 ) |
1264 | 0 | { |
1265 | 0 | stop_timer(&v->singleshot_timer); |
1266 | 0 | } |
1267 | 0 | else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */ |
1268 | 0 | unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) ) |
1269 | 0 | { |
1270 | 0 | /* |
1271 | 0 | * Linux workaround: occasionally we will see timeouts a long way in |
1272 | 0 | * the future due to wrapping in Linux's jiffy time handling. We check |
1273 | 0 | * for timeouts wrapped negative, and for positive timeouts more than |
1274 | 0 | * about 13 days in the future (2^50ns). The correct fix is to trigger |
1275 | 0 | * an interrupt immediately (since Linux in fact has pending work to |
1276 | 0 | * do in this situation). However, older guests also set a long timeout |
1277 | 0 | * when they have *no* pending timers at all: setting an immediate |
1278 | 0 | * timeout in this case can burn a lot of CPU. We therefore go for a |
1279 | 0 | * reasonable middle ground of triggering a timer event in 100ms. |
1280 | 0 | */ |
1281 | 0 | gdprintk(XENLOG_INFO, "Warning: huge timeout set: %"PRIx64"\n", |
1282 | 0 | timeout); |
1283 | 0 | set_timer(&v->singleshot_timer, NOW() + MILLISECS(100)); |
1284 | 0 | } |
1285 | 0 | else |
1286 | 0 | { |
1287 | 0 | migrate_timer(&v->singleshot_timer, smp_processor_id()); |
1288 | 0 | set_timer(&v->singleshot_timer, timeout); |
1289 | 0 | } |
1290 | 0 |
|
1291 | 0 | return 0; |
1292 | 0 | } |
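
The 2^50 ns cut-off used above is where the "about 13 days" in the comment comes from: 2^50 ns ≈ 1.13e15 ns ≈ 1.13e6 s ≈ 13.03 days. A throwaway check of that arithmetic (ordinary userspace C, nothing to do with the hypervisor itself):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t ns = UINT64_C(1) << 50;      /* the offset threshold above */
        double secs = ns / 1e9;               /* ~1.126e6 seconds */
        double days = secs / 86400.0;         /* ~13.03 days */

        printf("2^50 ns = %.0f s = %.2f days\n", secs, days);
        return 0;
    }
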
1293 | | |
1294 | | /* sched_id - fetch ID of current scheduler */ |
1295 | | int sched_id(void) |
1296 | 0 | { |
1297 | 0 | return ops.sched_id; |
1298 | 0 | } |
1299 | | |
1300 | | /* Adjust scheduling parameter for a given domain. */ |
1301 | | long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op) |
1302 | 0 | { |
1303 | 0 | long ret; |
1304 | 0 |
|
1305 | 0 | ret = xsm_domctl_scheduler_op(XSM_HOOK, d, op->cmd); |
1306 | 0 | if ( ret ) |
1307 | 0 | return ret; |
1308 | 0 |
|
1309 | 0 | if ( op->sched_id != dom_scheduler(d)->sched_id ) |
1310 | 0 | return -EINVAL; |
1311 | 0 |
|
1312 | 0 | switch ( op->cmd ) |
1313 | 0 | { |
1314 | 0 | case XEN_DOMCTL_SCHEDOP_putinfo: |
1315 | 0 | case XEN_DOMCTL_SCHEDOP_getinfo: |
1316 | 0 | case XEN_DOMCTL_SCHEDOP_putvcpuinfo: |
1317 | 0 | case XEN_DOMCTL_SCHEDOP_getvcpuinfo: |
1318 | 0 | break; |
1319 | 0 | default: |
1320 | 0 | return -EINVAL; |
1321 | 0 | } |
1322 | 0 |
|
1323 | 0 | /* NB: the pluggable scheduler code needs to take care |
1324 | 0 | * of locking by itself. */ |
1325 | 0 | if ( (ret = SCHED_OP(dom_scheduler(d), adjust, d, op)) == 0 ) |
1326 | 0 | TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id); |
1327 | 0 |
|
1328 | 0 | return ret; |
1329 | 0 | } |
1330 | | |
1331 | | long sched_adjust_global(struct xen_sysctl_scheduler_op *op) |
1332 | 0 | { |
1333 | 0 | struct cpupool *pool; |
1334 | 0 | int rc; |
1335 | 0 |
|
1336 | 0 | rc = xsm_sysctl_scheduler_op(XSM_HOOK, op->cmd); |
1337 | 0 | if ( rc ) |
1338 | 0 | return rc; |
1339 | 0 |
|
1340 | 0 | if ( (op->cmd != XEN_SYSCTL_SCHEDOP_putinfo) && |
1341 | 0 | (op->cmd != XEN_SYSCTL_SCHEDOP_getinfo) ) |
1342 | 0 | return -EINVAL; |
1343 | 0 |
|
1344 | 0 | pool = cpupool_get_by_id(op->cpupool_id); |
1345 | 0 | if ( pool == NULL ) |
1346 | 0 | return -ESRCH; |
1347 | 0 |
|
1348 | 0 | rc = ((op->sched_id == pool->sched->sched_id) |
1349 | 0 | ? SCHED_OP(pool->sched, adjust_global, op) : -EINVAL); |
1350 | 0 |
|
1351 | 0 | cpupool_put(pool); |
1352 | 0 |
|
1353 | 0 | return rc; |
1354 | 0 | } |
1355 | | |
1356 | | static void vcpu_periodic_timer_work(struct vcpu *v) |
1357 | 165k | { |
1358 | 165k | s_time_t now = NOW(); |
1359 | 165k | s_time_t periodic_next_event; |
1360 | 165k | |
1361 | 165k | if ( v->periodic_period == 0 ) |
1362 | 165k | return; |
1363 | 165k | |
1364 | 271 | periodic_next_event = v->periodic_last_event + v->periodic_period; |
1365 | 271 | |
1366 | 271 | if ( now >= periodic_next_event ) |
1367 | 0 | { |
1368 | 0 | send_timer_event(v); |
1369 | 0 | v->periodic_last_event = now; |
1370 | 0 | periodic_next_event = now + v->periodic_period; |
1371 | 0 | } |
1372 | 271 | |
1373 | 271 | migrate_timer(&v->periodic_timer, smp_processor_id()); |
1374 | 271 | set_timer(&v->periodic_timer, periodic_next_event); |
1375 | 271 | } |
1376 | | |
1377 | | /* |
1378 | | * The main function |
1379 | | * - deschedule the current domain (scheduler independent). |
1380 | | * - pick a new domain (scheduler dependent). |
1381 | | */ |
1382 | | static void schedule(void) |
1383 | 4.92M | { |
1384 | 4.92M | struct vcpu *prev = current, *next = NULL; |
1385 | 4.92M | s_time_t now; |
1386 | 4.92M | struct scheduler *sched; |
1387 | 4.92M | unsigned long *tasklet_work = &this_cpu(tasklet_work_to_do); |
1388 | 4.92M | bool_t tasklet_work_scheduled = 0; |
1389 | 4.92M | struct schedule_data *sd; |
1390 | 4.92M | spinlock_t *lock; |
1391 | 4.92M | struct task_slice next_slice; |
1392 | 4.92M | int cpu = smp_processor_id(); |
1393 | 4.92M | |
1394 | 4.92M | ASSERT_NOT_IN_ATOMIC(); |
1395 | 4.92M | |
1396 | 4.92M | SCHED_STAT_CRANK(sched_run); |
1397 | 4.92M | |
1398 | 4.92M | sd = &this_cpu(schedule_data); |
1399 | 4.92M | |
1400 | 4.92M | /* Update tasklet scheduling status. */ |
1401 | 4.92M | switch ( *tasklet_work ) |
1402 | 4.92M | { |
1403 | 44 | case TASKLET_enqueued: |
1404 | 44 | set_bit(_TASKLET_scheduled, tasklet_work); |
1405 | 44 | /* fallthrough */ |
1406 | 44 | case TASKLET_enqueued|TASKLET_scheduled: |
1407 | 44 | tasklet_work_scheduled = 1; |
1408 | 44 | break; |
1409 | 44 | case TASKLET_scheduled: |
1410 | 44 | clear_bit(_TASKLET_scheduled, tasklet_work); |
1411 | 4.88M | case 0: |
1412 | 4.88M | /*tasklet_work_scheduled = 0;*/ |
1413 | 4.88M | break; |
1414 | 0 | default: |
1415 | 0 | BUG(); |
1416 | 4.92M | } |
1417 | 4.92M | |
1418 | 4.88M | lock = pcpu_schedule_lock_irq(cpu); |
1419 | 4.88M | |
1420 | 4.88M | now = NOW(); |
1421 | 4.88M | |
1422 | 4.88M | stop_timer(&sd->s_timer); |
1423 | 4.88M | |
1424 | 4.88M | /* get policy-specific decision on scheduling... */ |
1425 | 4.88M | sched = this_cpu(scheduler); |
1426 | 4.88M | next_slice = sched->do_schedule(sched, now, tasklet_work_scheduled); |
1427 | 4.88M | |
1428 | 4.88M | next = next_slice.task; |
1429 | 4.88M | |
1430 | 4.88M | sd->curr = next; |
1431 | 4.88M | |
1432 | 4.88M | if ( next_slice.time >= 0 ) /* -ve means no limit */ |
1433 | 4.92M | set_timer(&sd->s_timer, now + next_slice.time); |
1434 | 4.88M | |
1435 | 4.88M | if ( unlikely(prev == next) ) |
1436 | 4.57M | { |
1437 | 4.57M | pcpu_schedule_unlock_irq(lock, cpu); |
1438 | 4.57M | TRACE_4D(TRC_SCHED_SWITCH_INFCONT, |
1439 | 4.57M | next->domain->domain_id, next->vcpu_id, |
1440 | 4.57M | now - prev->runstate.state_entry_time, |
1441 | 4.57M | next_slice.time); |
1442 | 4.57M | trace_continue_running(next); |
1443 | 4.57M | return continue_running(prev); |
1444 | 4.57M | } |
1445 | 4.88M | |
1446 | 314k | TRACE_3D(TRC_SCHED_SWITCH_INFPREV, |
1447 | 314k | prev->domain->domain_id, prev->vcpu_id, |
1448 | 314k | now - prev->runstate.state_entry_time); |
1449 | 314k | TRACE_4D(TRC_SCHED_SWITCH_INFNEXT, |
1450 | 314k | next->domain->domain_id, next->vcpu_id, |
1451 | 314k | (next->runstate.state == RUNSTATE_runnable) ? |
1452 | 314k | (now - next->runstate.state_entry_time) : 0, |
1453 | 314k | next_slice.time); |
1454 | 314k | |
1455 | 314k | ASSERT(prev->runstate.state == RUNSTATE_running); |
1456 | 314k | |
1457 | 314k | TRACE_4D(TRC_SCHED_SWITCH, |
1458 | 314k | prev->domain->domain_id, prev->vcpu_id, |
1459 | 314k | next->domain->domain_id, next->vcpu_id); |
1460 | 314k | |
1461 | 314k | vcpu_runstate_change( |
1462 | 314k | prev, |
1463 | 314k | ((prev->pause_flags & VPF_blocked) ? RUNSTATE_blocked : |
1464 | 249k | (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)), |
1465 | 314k | now); |
1466 | 314k | prev->last_run_time = now; |
1467 | 314k | |
1468 | 314k | ASSERT(next->runstate.state != RUNSTATE_running); |
1469 | 314k | vcpu_runstate_change(next, RUNSTATE_running, now); |
1470 | 314k | |
1471 | 314k | /* |
1472 | 314k | * NB. Don't add any trace records from here until the actual context |
1473 | 314k | * switch, else lost_records resume will not work properly. |
1474 | 314k | */ |
1475 | 314k | |
1476 | 314k | ASSERT(!next->is_running); |
1477 | 314k | next->is_running = 1; |
1478 | 314k | |
1479 | 314k | pcpu_schedule_unlock_irq(lock, cpu); |
1480 | 314k | |
1481 | 314k | SCHED_STAT_CRANK(sched_ctx); |
1482 | 314k | |
1483 | 314k | stop_timer(&prev->periodic_timer); |
1484 | 314k | |
1485 | 314k | if ( next_slice.migrated ) |
1486 | 506 | sched_move_irqs(next); |
1487 | 314k | |
1488 | 314k | vcpu_periodic_timer_work(next); |
1489 | 314k | |
1490 | 314k | context_switch(prev, next); |
1491 | 314k | } |
1492 | | |
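The policy decision above comes from the per-scheduler do_schedule hook, which hands back a struct task_slice that schedule() then acts on: .task is the vCPU to run, .time is the slice length used to arm s_timer (negative meaning no limit), and .migrated tells schedule() whether to reroute interrupts. A minimal sketch of such a hook, assuming the hook signature and task_slice layout declared in xen/sched-if.h; the scheduler name and its always-run-idle policy are purely illustrative:

    /* Illustrative only: a trivial do_schedule hook. */
    static struct task_slice
    example_do_schedule(const struct scheduler *ops, s_time_t now,
                        bool_t tasklet_work_scheduled)
    {
        struct task_slice ret;

        /*
         * Always pick the idle vCPU of this pCPU; a real policy would pick a
         * runnable vCPU, but should still return the idle vCPU whenever
         * tasklet_work_scheduled is set, so that per-cpu tasklets get to run.
         */
        ret.task = idle_vcpu[smp_processor_id()];
        ret.time = -1;       /* negative: no limit, so s_timer is not re-armed */
        ret.migrated = 0;    /* the chosen vCPU has not changed pCPU */

        return ret;
    }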
1493 | | void context_saved(struct vcpu *prev) |
1494 | 163k | { |
1495 | 163k | /* Clear running flag /after/ writing context to memory. */ |
1496 | 163k | smp_wmb(); |
1497 | 163k | |
1498 | 163k | prev->is_running = 0; |
1499 | 163k | |
1500 | 163k | /* Check for migration request /after/ clearing running flag. */ |
1501 | 163k | smp_mb(); |
1502 | 163k | |
1503 | 163k | SCHED_OP(vcpu_scheduler(prev), context_saved, prev); |
1504 | 163k | |
1505 | 163k | if ( unlikely(prev->pause_flags & VPF_migrating) ) |
1506 | 65 | vcpu_migrate(prev); |
1507 | 163k | } |
1508 | | |
1509 | | /* The scheduler timer: force a run through the scheduler */ |
1510 | | static void s_timer_fn(void *unused) |
1511 | 2.10k | { |
1512 | 2.10k | raise_softirq(SCHEDULE_SOFTIRQ); |
1513 | 2.10k | SCHED_STAT_CRANK(sched_irq); |
1514 | 2.10k | } |
1515 | | |
1516 | | /* Per-VCPU periodic timer function: sends a virtual timer interrupt. */ |
1517 | | static void vcpu_periodic_timer_fn(void *data) |
1518 | 0 | { |
1519 | 0 | struct vcpu *v = data; |
1520 | 0 | vcpu_periodic_timer_work(v); |
1521 | 0 | } |
1522 | | |
1523 | | /* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */ |
1524 | | static void vcpu_singleshot_timer_fn(void *data) |
1525 | 5.74k | { |
1526 | 5.74k | struct vcpu *v = data; |
1527 | 5.74k | send_timer_event(v); |
1528 | 5.74k | } |
1529 | | |
1530 | | /* SCHEDOP_poll timeout callback. */ |
1531 | | static void poll_timer_fn(void *data) |
1532 | 0 | { |
1533 | 0 | struct vcpu *v = data; |
1534 | 0 | |
1535 | 0 | if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) ) |
1536 | 0 | vcpu_unblock(v); |
1537 | 0 | } |
1538 | | |
1539 | | static int cpu_schedule_up(unsigned int cpu) |
1540 | 12 | { |
1541 | 12 | struct schedule_data *sd = &per_cpu(schedule_data, cpu); |
1542 | 12 | void *sched_priv; |
1543 | 12 | |
1544 | 12 | per_cpu(scheduler, cpu) = &ops; |
1545 | 12 | spin_lock_init(&sd->_lock); |
1546 | 12 | sd->schedule_lock = &sd->_lock; |
1547 | 12 | sd->curr = idle_vcpu[cpu]; |
1548 | 12 | init_timer(&sd->s_timer, s_timer_fn, NULL, cpu); |
1549 | 12 | atomic_set(&sd->urgent_count, 0); |
1550 | 12 | |
1551 | 12 | /* Boot CPU is dealt with later in schedule_init(). */ |
1552 | 12 | if ( cpu == 0 ) |
1553 | 1 | return 0; |
1554 | 12 | |
1555 | 11 | if ( idle_vcpu[cpu] == NULL ) |
1556 | 11 | alloc_vcpu(idle_vcpu[0]->domain, cpu, cpu); |
1557 | 11 | else |
1558 | 0 | { |
1559 | 0 | struct vcpu *idle = idle_vcpu[cpu]; |
1560 | 0 | |
1561 | 0 | /* |
1562 | 0 | * During (ACPI?) suspend the idle vCPU for this pCPU is not freed, |
1563 | 0 | * while its scheduler-specific data (what sched_priv points to) is. |
1564 | 0 | * Also, at this stage of the resume path, we attach the pCPU |
1565 | 0 | * to the default scheduler, no matter in what cpupool it was before |
1566 | 0 | * suspend. To avoid inconsistency, let's allocate default scheduler |
1567 | 0 | * data for the idle vCPU here. If the pCPU was in a different pool |
1568 | 0 | * with a different scheduler, it is schedule_cpu_switch(), invoked |
1569 | 0 | * later, that will set things up as appropriate. |
1570 | 0 | */ |
1571 | 0 | ASSERT(idle->sched_priv == NULL); |
1572 | 0 | |
1573 | 0 | idle->sched_priv = SCHED_OP(&ops, alloc_vdata, idle, |
1574 | 0 | idle->domain->sched_priv); |
1575 | 0 | if ( idle->sched_priv == NULL ) |
1576 | 0 | return -ENOMEM; |
1577 | 0 | } |
1578 | 11 | if ( idle_vcpu[cpu] == NULL ) |
1579 | 0 | return -ENOMEM; |
1580 | 11 | |
1581 | 11 | /* |
1582 | 11 | * We don't want to risk calling xfree() on an sd->sched_priv |
1583 | 11 | * (e.g., inside free_pdata, from cpu_schedule_down() called |
1584 | 11 | * during CPU_UP_CANCELLED) that contains an IS_ERR value. |
1585 | 11 | */ |
1586 | 11 | sched_priv = SCHED_OP(&ops, alloc_pdata, cpu); |
1587 | 11 | if ( IS_ERR(sched_priv) ) |
1588 | 0 | return PTR_ERR(sched_priv); |
1589 | 11 | |
1590 | 11 | sd->sched_priv = sched_priv; |
1591 | 11 | |
1592 | 11 | return 0; |
1593 | 11 | } |
1594 | | |
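As the comment inside cpu_schedule_up() notes, the alloc_pdata hook is expected to follow the ERR_PTR convention: a usable pointer (or NULL, if the scheduler keeps no per-pCPU state) on success, and an errno encoded with ERR_PTR() on failure, which is why the result is filtered through IS_ERR()/PTR_ERR() before it is ever stored in sd->sched_priv. A minimal sketch of a conforming hook, assuming the hook signature from xen/sched-if.h; the structure and names below are made up for illustration:

    /* Hypothetical per-pCPU state for an illustrative scheduler. */
    struct example_pcpu {
        unsigned int cpu;
        /* ... run queue, accounting, ... */
    };

    static void *example_alloc_pdata(const struct scheduler *ops, int cpu)
    {
        struct example_pcpu *epc = xzalloc(struct example_pcpu);

        if ( epc == NULL )
            return ERR_PTR(-ENOMEM);  /* recovered via PTR_ERR() by the caller */

        epc->cpu = cpu;
        return epc;
    }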
1595 | | static void cpu_schedule_down(unsigned int cpu) |
1596 | 0 | { |
1597 | 0 | struct schedule_data *sd = &per_cpu(schedule_data, cpu); |
1598 | 0 | struct scheduler *sched = per_cpu(scheduler, cpu); |
1599 | 0 | |
1600 | 0 | SCHED_OP(sched, free_pdata, sd->sched_priv, cpu); |
1601 | 0 | SCHED_OP(sched, free_vdata, idle_vcpu[cpu]->sched_priv); |
1602 | 0 | |
1603 | 0 | idle_vcpu[cpu]->sched_priv = NULL; |
1604 | 0 | sd->sched_priv = NULL; |
1605 | 0 | |
1606 | 0 | kill_timer(&sd->s_timer); |
1607 | 0 | } |
1608 | | |
1609 | | static int cpu_schedule_callback( |
1610 | | struct notifier_block *nfb, unsigned long action, void *hcpu) |
1611 | 33 | { |
1612 | 33 | unsigned int cpu = (unsigned long)hcpu; |
1613 | 33 | struct scheduler *sched = per_cpu(scheduler, cpu); |
1614 | 33 | struct schedule_data *sd = &per_cpu(schedule_data, cpu); |
1615 | 33 | int rc = 0; |
1616 | 33 | |
1617 | 33 | /* |
1618 | 33 | * From the scheduler perspective, bringing up a pCPU requires |
1619 | 33 | * allocating and initializing the per-pCPU scheduler specific data, |
1620 | 33 | * as well as "registering" this pCPU to the scheduler (which may |
1621 | 33 | * involve modifying some scheduler wide data structures). |
1622 | 33 | * This happens by calling the alloc_pdata and init_pdata hooks, in |
1623 | 33 | * this order. A scheduler that does not need to allocate any per-pCPU |
1624 | 33 | * data can avoid implementing alloc_pdata. init_pdata may, however, be |
1625 | 33 | * necessary/useful in this case too (e.g., it can contain the "register |
1626 | 33 | * the pCPU to the scheduler" part). alloc_pdata (if present) is called |
1627 | 33 | * during CPU_UP_PREPARE. init_pdata (if present) is called during |
1628 | 33 | * CPU_STARTING. |
1629 | 33 | * |
1630 | 33 | * On the other hand, at teardown, we need to reverse what has been done |
1631 | 33 | * during initialization, and then free the per-pCPU specific data. This |
1632 | 33 | * happens by calling the deinit_pdata and free_pdata hooks, in this |
1633 | 33 | * order. If no per-pCPU memory was allocated, there is no need to |
1634 | 33 | * provide an implementation of free_pdata. deinit_pdata may, however, |
1635 | 33 | * be necessary/useful in this case too (e.g., it can undo something done |
1636 | 33 | * on a scheduler-wide data structure during init_pdata). Both deinit_pdata |
1637 | 33 | * and free_pdata are called during CPU_DEAD. |
1638 | 33 | * |
1639 | 33 | * If something goes wrong during bringup, we go to CPU_UP_CANCELLED |
1640 | 33 | * *before* having called init_pdata. In this case, as there is no |
1641 | 33 | * initialization needing undoing, only free_pdata should be called. |
1642 | 33 | * This means it is possible to call free_pdata just after alloc_pdata, |
1643 | 33 | * without an init_pdata/deinit_pdata "cycle" in between the two. |
1644 | 33 | * |
1645 | 33 | * So, in summary, the usage pattern should be either |
1646 | 33 | * - alloc_pdata-->init_pdata-->deinit_pdata-->free_pdata, or |
1647 | 33 | * - alloc_pdata-->free_pdata. |
1648 | 33 | */ |
1649 | 33 | switch ( action ) |
1650 | 33 | { |
1651 | 11 | case CPU_STARTING: |
1652 | 11 | SCHED_OP(sched, init_pdata, sd->sched_priv, cpu); |
1653 | 11 | break; |
1654 | 11 | case CPU_UP_PREPARE: |
1655 | 11 | rc = cpu_schedule_up(cpu); |
1656 | 11 | break; |
1657 | 0 | case CPU_DEAD: |
1658 | 0 | SCHED_OP(sched, deinit_pdata, sd->sched_priv, cpu); |
1659 | 0 | /* Fallthrough */ |
1660 | 0 | case CPU_UP_CANCELED: |
1661 | 0 | cpu_schedule_down(cpu); |
1662 | 0 | break; |
1663 | 11 | default: |
1664 | 11 | break; |
1665 | 33 | } |
1666 | 33 | |
1667 | 33 | return !rc ? NOTIFY_DONE : notifier_from_errno(rc); |
1668 | 33 | } |
1669 | | |
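To make the hook pairing described in the comment above concrete, here is a compressed sketch of the four per-pCPU hooks of an illustrative scheduler, written so that both legal sequences (alloc_pdata -> init_pdata -> deinit_pdata -> free_pdata, and alloc_pdata -> free_pdata after a cancelled bring-up) are safe; every name is hypothetical and the signatures are assumed from xen/sched-if.h:

    struct ex_pcpu { unsigned int cpu; /* ... */ };

    static void *ex_alloc_pdata(const struct scheduler *ops, int cpu)
    {
        /* CPU_UP_PREPARE: memory only; no scheduler-wide registration yet. */
        struct ex_pcpu *p = xzalloc(struct ex_pcpu);

        return p ? p : ERR_PTR(-ENOMEM);
    }

    static void ex_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
    {
        /* CPU_STARTING: hook the pCPU into scheduler-wide structures. */
    }

    static void ex_deinit_pdata(const struct scheduler *ops, void *pdata, int cpu)
    {
        /* CPU_DEAD: undo init_pdata; never reached on a cancelled bring-up. */
    }

    static void ex_free_pdata(const struct scheduler *ops, void *pdata, int cpu)
    {
        /* CPU_DEAD or CPU_UP_CANCELED: release what alloc_pdata handed out. */
        xfree(pdata);
    }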
1670 | | static struct notifier_block cpu_schedule_nfb = { |
1671 | | .notifier_call = cpu_schedule_callback |
1672 | | }; |
1673 | | |
1674 | | /* Initialise the data structures. */ |
1675 | | void __init scheduler_init(void) |
1676 | 1 | { |
1677 | 1 | struct domain *idle_domain; |
1678 | 1 | int i; |
1679 | 1 | |
1680 | 1 | open_softirq(SCHEDULE_SOFTIRQ, schedule); |
1681 | 1 | |
1682 | 6 | for ( i = 0; i < NUM_SCHEDULERS; i++) |
1683 | 5 | { |
1684 | 5 | if ( schedulers[i]->global_init && schedulers[i]->global_init() < 0 ) |
1685 | 0 | schedulers[i] = NULL; |
1686 | 5 | else if ( !ops.name && !strcmp(schedulers[i]->opt_name, opt_sched) ) |
1687 | 1 | ops = *schedulers[i]; |
1688 | 5 | } |
1689 | 1 | |
1690 | 1 | if ( !ops.name ) |
1691 | 0 | { |
1692 | 0 | printk("Could not find scheduler: %s\n", opt_sched); |
1693 | 0 | for ( i = 0; i < NUM_SCHEDULERS; i++ ) |
1694 | 0 | if ( schedulers[i] && |
1695 | 0 | !strcmp(schedulers[i]->opt_name, CONFIG_SCHED_DEFAULT) ) |
1696 | 0 | { |
1697 | 0 | ops = *schedulers[i]; |
1698 | 0 | break; |
1699 | 0 | } |
1700 | 0 | BUG_ON(!ops.name); |
1701 | 0 | printk("Using '%s' (%s)\n", ops.name, ops.opt_name); |
1702 | 0 | } |
1703 | 1 | |
1704 | 1 | if ( cpu_schedule_up(0) ) |
1705 | 0 | BUG(); |
1706 | 1 | register_cpu_notifier(&cpu_schedule_nfb); |
1707 | 1 | |
1708 | 1 | printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name); |
1709 | 1 | if ( SCHED_OP(&ops, init) ) |
1710 | 0 | panic("scheduler returned error on init"); |
1711 | 1 | |
1712 | 1 | if ( sched_ratelimit_us && |
1713 | 1 | (sched_ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX |
1714 | 1 | || sched_ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN) ) |
1715 | 0 | { |
1716 | 0 | printk("WARNING: sched_ratelimit_us outside of valid range [%d,%d].\n" |
1717 | 0 | " Resetting to default %u\n", |
1718 | 0 | XEN_SYSCTL_SCHED_RATELIMIT_MIN, |
1719 | 0 | XEN_SYSCTL_SCHED_RATELIMIT_MAX, |
1720 | 0 | SCHED_DEFAULT_RATELIMIT_US); |
1721 | 0 | sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US; |
1722 | 0 | } |
1723 | 1 | |
1724 | 1 | idle_domain = domain_create(DOMID_IDLE, 0, 0, NULL); |
1725 | 1 | BUG_ON(IS_ERR(idle_domain)); |
1726 | 1 | idle_domain->vcpu = idle_vcpu; |
1727 | 1 | idle_domain->max_vcpus = nr_cpu_ids; |
1728 | 1 | if ( alloc_vcpu(idle_domain, 0, 0) == NULL ) |
1729 | 0 | BUG(); |
1730 | 1 | this_cpu(schedule_data).sched_priv = SCHED_OP(&ops, alloc_pdata, 0); |
1731 | 1 | BUG_ON(IS_ERR(this_cpu(schedule_data).sched_priv)); |
1732 | 1 | SCHED_OP(&ops, init_pdata, this_cpu(schedule_data).sched_priv, 0); |
1733 | 1 | } |
1734 | | |
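As a usage note (assuming the standard Xen command-line spellings for these options, which scheduler_init() itself does not show): booting with, say,

    sched=credit2 sched_ratelimit_us=1000

makes the loop above pick the registered scheduler whose opt_name is "credit2", and feeds 1000 into the sched_ratelimit_us range check further down; an out-of-range value is reset to SCHED_DEFAULT_RATELIMIT_US with the warning printed there.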
1735 | | /* |
1736 | | * Move a pCPU outside of the influence of the scheduler of its current |
1737 | | * cpupool, or subject it to the scheduler of a new cpupool. |
1738 | | * |
1739 | | * For the pCPUs that are removed from their cpupool, their scheduler becomes |
1740 | | * &ops (the default scheduler, selected at boot, which also services the |
1741 | | * default cpupool). However, as these pCPUs are not really part of any pool, |
1742 | | * there won't be any scheduling event on them, not even from the default |
1743 | | * scheduler. Basically, they will just sit idle until they are explicitly |
1744 | | * added back to a cpupool. |
1745 | | */ |
1746 | | int schedule_cpu_switch(unsigned int cpu, struct cpupool *c) |
1747 | 12 | { |
1748 | 12 | struct vcpu *idle; |
1749 | 12 | void *ppriv, *ppriv_old, *vpriv, *vpriv_old; |
1750 | 12 | struct scheduler *old_ops = per_cpu(scheduler, cpu); |
1751 | 12 | struct scheduler *new_ops = (c == NULL) ? &ops : c->sched; |
1752 | 12 | struct cpupool *old_pool = per_cpu(cpupool, cpu); |
1753 | 12 | spinlock_t * old_lock; |
1754 | 12 | |
1755 | 12 | /* |
1756 | 12 | * pCPUs only move from a valid cpupool to free (i.e., out of any pool), |
1757 | 12 | * or from free to a valid cpupool. In the former case (which happens when |
1758 | 12 | * c is NULL), we want the CPU to have been marked as free already, as |
1759 | 12 | * well as to not be valid for the source pool any longer, when we get to |
1760 | 12 | * here. In the latter case (which happens when c is a valid cpupool), we |
1761 | 12 | * want the CPU to still be marked as free, as well as to not yet be valid |
1762 | 12 | * for the destination pool. |
1763 | 12 | */ |
1764 | 12 | ASSERT(c != old_pool && (c != NULL || old_pool != NULL)); |
1765 | 12 | ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus)); |
1766 | 12 | ASSERT((c == NULL && !cpumask_test_cpu(cpu, old_pool->cpu_valid)) || |
1767 | 12 | (c != NULL && !cpumask_test_cpu(cpu, c->cpu_valid))); |
1768 | 12 | |
1769 | 12 | if ( old_ops == new_ops ) |
1770 | 12 | goto out; |
1771 | 12 | |
1772 | 12 | /* |
1773 | 12 | * To set up the cpu for the new scheduler we need: |
1774 | 12 | * - a valid instance of per-CPU scheduler specific data, as it is |
1775 | 12 | * allocated by SCHED_OP(alloc_pdata). Note that we do not want to |
1776 | 12 | * initialize it yet (i.e., we are not calling SCHED_OP(init_pdata)). |
1777 | 12 | * That will be done by the target scheduler, in SCHED_OP(switch_sched), |
1778 | 12 | * in proper ordering and with locking. |
1779 | 12 | * - a valid instance of per-vCPU scheduler specific data, for the idle |
1780 | 12 | * vCPU of cpu. That is what the target scheduler will use for the |
1781 | 12 | * sched_priv field of the per-vCPU info of the idle domain. |
1782 | 12 | */ |
1783 | 0 | idle = idle_vcpu[cpu]; |
1784 | 0 | ppriv = SCHED_OP(new_ops, alloc_pdata, cpu); |
1785 | 0 | if ( IS_ERR(ppriv) ) |
1786 | 0 | return PTR_ERR(ppriv); |
1787 | 0 | vpriv = SCHED_OP(new_ops, alloc_vdata, idle, idle->domain->sched_priv); |
1788 | 0 | if ( vpriv == NULL ) |
1789 | 0 | { |
1790 | 0 | SCHED_OP(new_ops, free_pdata, ppriv, cpu); |
1791 | 0 | return -ENOMEM; |
1792 | 0 | } |
1793 | 0 | |
1794 | 0 | SCHED_OP(old_ops, tick_suspend, cpu); |
1795 | 0 | |
1796 | 0 | /* |
1797 | 0 | * The actual switch, including (if necessary) the rerouting of the |
1798 | 0 | * scheduler lock to whatever new_ops prefers, needs to happen in one |
1799 | 0 | * critical section, protected by old_ops' lock, or races are possible. |
1800 | 0 | * It is, in fact, the lock of another scheduler that we are taking (the |
1801 | 0 | * scheduler of the cpupool that cpu still belongs to). But that is OK: |
1802 | 0 | * anyone trying to schedule on this cpu will spin until we release that |
1803 | 0 | * lock (at the bottom of this function). When they finally get the lock |
1804 | 0 | * (thanks to the loop inside the *_schedule_lock() functions), they will |
1805 | 0 | * notice that the lock itself has changed, and retry acquiring the new |
1806 | 0 | * one (which will be the correct, remapped one, at that point). |
1807 | 0 | */ |
1808 | 0 | old_lock = pcpu_schedule_lock_irq(cpu); |
1809 | 0 | |
1810 | 0 | vpriv_old = idle->sched_priv; |
1811 | 0 | ppriv_old = per_cpu(schedule_data, cpu).sched_priv; |
1812 | 0 | SCHED_OP(new_ops, switch_sched, cpu, ppriv, vpriv); |
1813 | 0 | |
1814 | 0 | /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */ |
1815 | 0 | spin_unlock_irq(old_lock); |
1816 | 0 | |
1817 | 0 | SCHED_OP(new_ops, tick_resume, cpu); |
1818 | 0 | |
1819 | 0 | SCHED_OP(old_ops, deinit_pdata, ppriv_old, cpu); |
1820 | 0 | |
1821 | 0 | SCHED_OP(old_ops, free_vdata, vpriv_old); |
1822 | 0 | SCHED_OP(old_ops, free_pdata, ppriv_old, cpu); |
1823 | 0 | |
1824 | 12 | out: |
1825 | 12 | per_cpu(cpupool, cpu) = c; |
1826 | 12 | /* When a cpu is added to a pool, trigger it to go pick up some work */ |
1827 | 12 | if ( c != NULL ) |
1828 | 12 | cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); |
1829 | 12 | |
1830 | 12 | return 0; |
1831 | 0 | } |
1832 | | |
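The comment inside schedule_cpu_switch() relies on the per-pCPU lock acquisition helpers re-checking the lock pointer after taking it. A hedged sketch of the shape of that retry loop, assuming only the schedule_lock field initialised in cpu_schedule_up(); the real helpers live in xen/sched-if.h and also provide the _irq/_irqsave variants used in this file:

    /* Illustrative shape of the lock/re-check/retry loop referred to above. */
    static spinlock_t *example_pcpu_schedule_lock(unsigned int cpu)
    {
        for ( ; ; )
        {
            spinlock_t *lock = per_cpu(schedule_data, cpu).schedule_lock;

            spin_lock(lock);
            if ( lock == per_cpu(schedule_data, cpu).schedule_lock )
                return lock;      /* still the current lock: we hold the right one */
            spin_unlock(lock);    /* remapped while we were spinning: try again */
        }
    }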
1833 | | struct scheduler *scheduler_get_default(void) |
1834 | 1 | { |
1835 | 1 | return &ops; |
1836 | 1 | } |
1837 | | |
1838 | | struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr) |
1839 | 0 | { |
1840 | 0 | int i; |
1841 | 0 | struct scheduler *sched; |
1842 | 0 | |
1843 | 0 | for ( i = 0; i < NUM_SCHEDULERS; i++ ) |
1844 | 0 | if ( schedulers[i] && schedulers[i]->sched_id == sched_id ) |
1845 | 0 | goto found; |
1846 | 0 | *perr = -ENOENT; |
1847 | 0 | return NULL; |
1848 | 0 |
|
1849 | 0 | found: |
1850 | 0 | *perr = -ENOMEM; |
1851 | 0 | if ( (sched = xmalloc(struct scheduler)) == NULL ) |
1852 | 0 | return NULL; |
1853 | 0 | memcpy(sched, schedulers[i], sizeof(*sched)); |
1854 | 0 | if ( (*perr = SCHED_OP(sched, init)) != 0 ) |
1855 | 0 | { |
1856 | 0 | xfree(sched); |
1857 | 0 | sched = NULL; |
1858 | 0 | } |
1859 | 0 | |
1860 | 0 | return sched; |
1861 | 0 | } |
1862 | | |
1863 | | void scheduler_free(struct scheduler *sched) |
1864 | 0 | { |
1865 | 0 | BUG_ON(sched == &ops); |
1866 | 0 | SCHED_OP(sched, deinit); |
1867 | 0 | xfree(sched); |
1868 | 0 | } |
1869 | | |
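scheduler_alloc() and scheduler_free() are the pair a cpupool would use when it wants its own scheduler instance instead of the boot-selected &ops. A minimal usage sketch; the two example_pool_* functions are hypothetical stand-ins for the real cpupool code:

    /* Illustrative only: give a new pool a private scheduler instance. */
    static int example_pool_create(struct cpupool *c, unsigned int sched_id)
    {
        int err;

        c->sched = scheduler_alloc(sched_id, &err);
        if ( c->sched == NULL )
            return err;            /* -ENOENT, -ENOMEM, or the init hook's error */

        return 0;
    }

    static void example_pool_destroy(struct cpupool *c)
    {
        scheduler_free(c->sched); /* runs the deinit hook, then frees the copy */
        c->sched = NULL;
    }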
1870 | | void schedule_dump(struct cpupool *c) |
1871 | 0 | { |
1872 | 0 | unsigned int i; |
1873 | 0 | struct scheduler *sched; |
1874 | 0 | cpumask_t *cpus; |
1875 | 0 | |
1876 | 0 | /* Locking, if necessary, must be handled within each scheduler */ |
1877 | 0 | |
1878 | 0 | if ( c != NULL ) |
1879 | 0 | { |
1880 | 0 | sched = c->sched; |
1881 | 0 | cpus = c->cpu_valid; |
1882 | 0 | printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name); |
1883 | 0 | SCHED_OP(sched, dump_settings); |
1884 | 0 | } |
1885 | 0 | else |
1886 | 0 | { |
1887 | 0 | sched = &ops; |
1888 | 0 | cpus = &cpupool_free_cpus; |
1889 | 0 | } |
1890 | 0 | |
1891 | 0 | if ( sched->dump_cpu_state != NULL ) |
1892 | 0 | { |
1893 | 0 | printk("CPUs info:\n"); |
1894 | 0 | for_each_cpu (i, cpus) |
1895 | 0 | SCHED_OP(sched, dump_cpu_state, i); |
1896 | 0 | } |
1897 | 0 | } |
1898 | | |
1899 | | void sched_tick_suspend(void) |
1900 | 1.89M | { |
1901 | 1.89M | struct scheduler *sched; |
1902 | 1.89M | unsigned int cpu = smp_processor_id(); |
1903 | 1.89M | |
1904 | 1.89M | sched = per_cpu(scheduler, cpu); |
1905 | 1.89M | SCHED_OP(sched, tick_suspend, cpu); |
1906 | 1.89M | rcu_idle_enter(cpu); |
1907 | 1.89M | rcu_idle_timer_start(); |
1908 | 1.89M | } |
1909 | | |
1910 | | void sched_tick_resume(void) |
1911 | 1.97M | { |
1912 | 1.97M | struct scheduler *sched; |
1913 | 1.97M | unsigned int cpu = smp_processor_id(); |
1914 | 1.97M | |
1915 | 1.97M | rcu_idle_timer_stop(); |
1916 | 1.97M | rcu_idle_exit(cpu); |
1917 | 1.97M | sched = per_cpu(scheduler, cpu); |
1918 | 1.97M | SCHED_OP(sched, tick_resume, cpu); |
1919 | 1.97M | } |
1920 | | |
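sched_tick_suspend() and sched_tick_resume() are meant to bracket the time a pCPU spends sleeping in its idle loop, so the scheduler's accounting tick and the RCU idle tracking are stopped and restarted together. A hedged sketch of the intended call order; pm_idle_sleep() is a made-up placeholder for whatever actually halts the CPU:

    static void pm_idle_sleep(void);  /* hypothetical sleep primitive */

    /* Illustrative idle-loop iteration showing the intended bracketing. */
    static void example_idle_iteration(void)
    {
        sched_tick_suspend();   /* stop the accounting tick, enter RCU idle */

        if ( !softirq_pending(smp_processor_id()) )
            pm_idle_sleep();    /* hypothetical: halt until the next interrupt */

        sched_tick_resume();    /* leave RCU idle, restart the accounting tick */
    }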
1921 | | void wait(void) |
1922 | 0 | { |
1923 | 0 | schedule(); |
1924 | 0 | } |
1925 | | |
1926 | | #ifdef CONFIG_COMPAT |
1927 | | #include "compat/schedule.c" |
1928 | | #endif |
1929 | | |
1930 | | #endif /* !COMPAT */ |
1931 | | |
1932 | | /* |
1933 | | * Local variables: |
1934 | | * mode: C |
1935 | | * c-file-style: "BSD" |
1936 | | * c-basic-offset: 4 |
1937 | | * tab-width: 4 |
1938 | | * indent-tabs-mode: nil |
1939 | | * End: |
1940 | | */ |