/root/src/xen/xen/common/sched_null.c
Line | Count | Source |
1 | | /* |
2 | | * xen/common/sched_null.c |
3 | | * |
4 | | * Copyright (c) 2017, Dario Faggioli, Citrix Ltd |
5 | | * |
6 | | * This program is free software; you can redistribute it and/or |
7 | | * modify it under the terms of the GNU General Public |
8 | | * License v2 as published by the Free Software Foundation. |
9 | | * |
10 | | * This program is distributed in the hope that it will be useful, |
11 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | | * General Public License for more details. |
14 | | * |
15 | | * You should have received a copy of the GNU General Public |
16 | | * License along with this program; If not, see <http://www.gnu.org/licenses/>. |
17 | | */ |
18 | | |
19 | | /* |
20 | | * The 'null' scheduler always chooses to run, on each pCPU, either nothing |
21 | | * (i.e., the pCPU stays idle) or always the same vCPU. |
22 | | * |
23 | | * It is aimed at supporting static scenarios, where there are always |
24 | | * fewer vCPUs than pCPUs (and the vCPUs don't need to move among pCPUs |
25 | | * for any reason) with the least possible overhead. |
26 | | * |
27 | | * Typical use cases are embedded applications, but also HPC, especially |
28 | | * if the scheduler is used inside a cpupool. |
29 | | */ |
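
A typical way to obtain the static, 1:1 vCPU-to-pCPU layout described above is to dedicate a cpupool to this scheduler. The following is only an illustrative sketch (the pool name, CPU list and domain name are made up; check the xl / xlcpupool.cfg documentation of your Xen version for the exact syntax):

    # /etc/xen/null-pool.cfg (hypothetical example)
    name  = "null-pool"
    sched = "null"
    cpus  = ["4", "5", "6", "7"]

    # Then, roughly:
    #   xl cpupool-create /etc/xen/null-pool.cfg
    #   xl cpupool-migrate mydomain null-pool
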
30 | | |
31 | | #include <xen/sched.h> |
32 | | #include <xen/sched-if.h> |
33 | | #include <xen/softirq.h> |
34 | | #include <xen/keyhandler.h> |
35 | | #include <xen/trace.h> |
36 | | |
37 | | /* |
38 | | * null tracing events. Check include/public/trace.h for more details. |
39 | | */ |
40 | 0 | #define TRC_SNULL_PICKED_CPU TRC_SCHED_CLASS_EVT(SNULL, 1) |
41 | 0 | #define TRC_SNULL_VCPU_ASSIGN TRC_SCHED_CLASS_EVT(SNULL, 2) |
42 | 0 | #define TRC_SNULL_VCPU_DEASSIGN TRC_SCHED_CLASS_EVT(SNULL, 3) |
43 | 0 | #define TRC_SNULL_MIGRATE TRC_SCHED_CLASS_EVT(SNULL, 4) |
44 | 0 | #define TRC_SNULL_SCHEDULE TRC_SCHED_CLASS_EVT(SNULL, 5) |
45 | 0 | #define TRC_SNULL_TASKLET TRC_SCHED_CLASS_EVT(SNULL, 6) |
46 | | |
47 | | /* |
48 | | * Locking: |
49 | | * - Scheduler-lock (a.k.a. runqueue lock): |
50 | | * + is per-pCPU; |
51 | | * + serializes assignment and deassignment of vCPUs to a pCPU. |
52 | | * - Private data lock (a.k.a. private scheduler lock): |
53 | | * + is scheduler-wide; |
54 | | * + serializes accesses to the list of domains in this scheduler. |
55 | | * - Waitqueue lock: |
56 | | * + is scheduler-wide; |
57 | | * + serializes accesses to the list of vCPUs waiting to be assigned |
58 | | * to pCPUs. |
59 | | * |
60 | | * Ordering is: private lock, runqueue lock, waitqueue lock. In other |
61 | | * words, the waitqueue lock nests inside the runqueue lock, which nests |
62 | | * inside the private lock. More specifically: |
63 | | * + if we need both the runqueue and the private locks, we must acquire |
64 | | * the private lock first; |
65 | | * + if we need both the runqueue and the waitqueue locks, we must acquire |
66 | | * the runqueue lock first; |
67 | | * + if we need both the private and the waitqueue locks, we must acquire |
68 | | * the private lock first; |
69 | | * + if we already own a runqueue lock, we must never acquire |
70 | | * the private lock; |
71 | | * + if we already own the waitqueue lock, we must never acquire |
72 | | * the runqueue lock or the private lock. |
73 | | */ |
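
No path in this file actually needs all three locks at once, but, as a purely illustrative sketch of the ordering rules above (it assumes this file's context and only uses locking helpers already used elsewhere in it), a hypothetical path that did need them would nest them like this:

    /* Illustrative only: take the locks in the documented order. */
    unsigned long flags, rq_flags;
    spinlock_t *lock;

    spin_lock_irqsave(&prv->lock, flags);               /* private lock (outermost)   */
    lock = pcpu_schedule_lock_irqsave(cpu, &rq_flags);  /* runqueue lock of 'cpu'     */
    spin_lock(&prv->waitq_lock);                        /* waitqueue lock (innermost) */

    /* ... work that touches the domain list, the pCPU and the waitqueue ... */

    spin_unlock(&prv->waitq_lock);
    pcpu_schedule_unlock_irqrestore(lock, rq_flags, cpu);
    spin_unlock_irqrestore(&prv->lock, flags);
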
74 | | |
75 | | /* |
76 | | * System-wide private data |
77 | | */ |
78 | | struct null_private { |
79 | | spinlock_t lock; /* scheduler lock; nests inside cpupool_lock */ |
80 | | struct list_head ndom; /* Domains of this scheduler */ |
81 | | struct list_head waitq; /* vCPUs not assigned to any pCPU */ |
82 | | spinlock_t waitq_lock; /* serializes waitq; nests inside runq locks */ |
83 | | cpumask_t cpus_free; /* CPUs without a vCPU associated to them */ |
84 | | }; |
85 | | |
86 | | /* |
87 | | * Physical CPU |
88 | | */ |
89 | | struct null_pcpu { |
90 | | struct vcpu *vcpu; |
91 | | }; |
92 | | DEFINE_PER_CPU(struct null_pcpu, npc); |
93 | | |
94 | | /* |
95 | | * Virtual CPU |
96 | | */ |
97 | | struct null_vcpu { |
98 | | struct list_head waitq_elem; |
99 | | struct vcpu *vcpu; |
100 | | }; |
101 | | |
102 | | /* |
103 | | * Domain |
104 | | */ |
105 | | struct null_dom { |
106 | | struct list_head ndom_elem; |
107 | | struct domain *dom; |
108 | | }; |
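
The relationship between these structures is deliberately simple: each pCPU's npc slot points to the (at most one) vCPU assigned to it, and prv->cpus_free tracks which slots are NULL. A hypothetical helper (not part of this file) that reads that state could look like:

    /*
     * Hypothetical, for illustration only: the vCPU currently assigned to
     * 'cpu', or NULL if the pCPU is free (in which case it should also be
     * set in prv->cpus_free).
     */
    static inline struct vcpu *null_assigned_vcpu(unsigned int cpu)
    {
        return per_cpu(npc, cpu).vcpu;
    }
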
109 | | |
110 | | /* |
111 | | * Accessor helper functions |
112 | | */ |
113 | | static inline struct null_private *null_priv(const struct scheduler *ops) |
114 | 0 | { |
115 | 0 | return ops->sched_data; |
116 | 0 | } |
117 | | |
118 | | static inline struct null_vcpu *null_vcpu(const struct vcpu *v) |
119 | 0 | { |
120 | 0 | return v->sched_priv; |
121 | 0 | } |
122 | | |
123 | | static inline struct null_dom *null_dom(const struct domain *d) |
124 | 0 | { |
125 | 0 | return d->sched_priv; |
126 | 0 | } |
127 | | |
128 | | static inline bool vcpu_check_affinity(struct vcpu *v, unsigned int cpu, |
129 | | unsigned int balance_step) |
130 | 0 | { |
131 | 0 | affinity_balance_cpumask(v, balance_step, cpumask_scratch_cpu(cpu)); |
132 | 0 | cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), |
133 | 0 | cpupool_domain_cpumask(v->domain)); |
134 | 0 |
135 | 0 | return cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu)); |
136 | 0 | } |
137 | | |
138 | | static int null_init(struct scheduler *ops) |
139 | 0 | { |
140 | 0 | struct null_private *prv; |
141 | 0 |
142 | 0 | printk("Initializing null scheduler\n" |
143 | 0 | "WARNING: This is experimental software in development.\n" |
144 | 0 | "Use at your own risk.\n"); |
145 | 0 |
146 | 0 | prv = xzalloc(struct null_private); |
147 | 0 | if ( prv == NULL ) |
148 | 0 | return -ENOMEM; |
149 | 0 |
150 | 0 | spin_lock_init(&prv->lock); |
151 | 0 | spin_lock_init(&prv->waitq_lock); |
152 | 0 | INIT_LIST_HEAD(&prv->ndom); |
153 | 0 | INIT_LIST_HEAD(&prv->waitq); |
154 | 0 |
155 | 0 | ops->sched_data = prv; |
156 | 0 |
157 | 0 | return 0; |
158 | 0 | } |
159 | | |
160 | | static void null_deinit(struct scheduler *ops) |
161 | 0 | { |
162 | 0 | xfree(ops->sched_data); |
163 | 0 | ops->sched_data = NULL; |
164 | 0 | } |
165 | | |
166 | | static void init_pdata(struct null_private *prv, unsigned int cpu) |
167 | 0 | { |
168 | 0 | /* Mark the pCPU as free, and with no vCPU assigned */ |
169 | 0 | cpumask_set_cpu(cpu, &prv->cpus_free); |
170 | 0 | per_cpu(npc, cpu).vcpu = NULL; |
171 | 0 | } |
172 | | |
173 | | static void null_init_pdata(const struct scheduler *ops, void *pdata, int cpu) |
174 | 0 | { |
175 | 0 | struct null_private *prv = null_priv(ops); |
176 | 0 | struct schedule_data *sd = &per_cpu(schedule_data, cpu); |
177 | 0 |
178 | 0 | /* alloc_pdata is not implemented, so we want this to be NULL. */ |
179 | 0 | ASSERT(!pdata); |
180 | 0 |
181 | 0 | /* |
182 | 0 | * The scheduler lock already points to the default per-cpu spinlock, |
183 | 0 | * so there is no remapping to be done. |
184 | 0 | */ |
185 | 0 | ASSERT(sd->schedule_lock == &sd->_lock && !spin_is_locked(&sd->_lock)); |
186 | 0 |
187 | 0 | init_pdata(prv, cpu); |
188 | 0 | } |
189 | | |
190 | | static void null_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) |
191 | 0 | { |
192 | 0 | struct null_private *prv = null_priv(ops); |
193 | 0 |
194 | 0 | /* alloc_pdata not implemented, so this must have stayed NULL */ |
195 | 0 | ASSERT(!pcpu); |
196 | 0 |
197 | 0 | cpumask_clear_cpu(cpu, &prv->cpus_free); |
198 | 0 | per_cpu(npc, cpu).vcpu = NULL; |
199 | 0 | } |
200 | | |
201 | | static void *null_alloc_vdata(const struct scheduler *ops, |
202 | | struct vcpu *v, void *dd) |
203 | 0 | { |
204 | 0 | struct null_vcpu *nvc; |
205 | 0 |
206 | 0 | nvc = xzalloc(struct null_vcpu); |
207 | 0 | if ( nvc == NULL ) |
208 | 0 | return NULL; |
209 | 0 |
210 | 0 | INIT_LIST_HEAD(&nvc->waitq_elem); |
211 | 0 | nvc->vcpu = v; |
212 | 0 |
213 | 0 | SCHED_STAT_CRANK(vcpu_alloc); |
214 | 0 |
215 | 0 | return nvc; |
216 | 0 | } |
217 | | |
218 | | static void null_free_vdata(const struct scheduler *ops, void *priv) |
219 | 0 | { |
220 | 0 | struct null_vcpu *nvc = priv; |
221 | 0 |
222 | 0 | xfree(nvc); |
223 | 0 | } |
224 | | |
225 | | static void * null_alloc_domdata(const struct scheduler *ops, |
226 | | struct domain *d) |
227 | 0 | { |
228 | 0 | struct null_private *prv = null_priv(ops); |
229 | 0 | struct null_dom *ndom; |
230 | 0 | unsigned long flags; |
231 | 0 |
232 | 0 | ndom = xzalloc(struct null_dom); |
233 | 0 | if ( ndom == NULL ) |
234 | 0 | return NULL; |
235 | 0 |
236 | 0 | ndom->dom = d; |
237 | 0 |
238 | 0 | spin_lock_irqsave(&prv->lock, flags); |
239 | 0 | list_add_tail(&ndom->ndom_elem, &null_priv(ops)->ndom); |
240 | 0 | spin_unlock_irqrestore(&prv->lock, flags); |
241 | 0 |
242 | 0 | return (void*)ndom; |
243 | 0 | } |
244 | | |
245 | | static void null_free_domdata(const struct scheduler *ops, void *data) |
246 | 0 | { |
247 | 0 | unsigned long flags; |
248 | 0 | struct null_dom *ndom = data; |
249 | 0 | struct null_private *prv = null_priv(ops); |
250 | 0 |
251 | 0 | spin_lock_irqsave(&prv->lock, flags); |
252 | 0 | list_del_init(&ndom->ndom_elem); |
253 | 0 | spin_unlock_irqrestore(&prv->lock, flags); |
254 | 0 |
255 | 0 | xfree(data); |
256 | 0 | } |
257 | | |
258 | | static int null_dom_init(const struct scheduler *ops, struct domain *d) |
259 | 0 | { |
260 | 0 | struct null_dom *ndom; |
261 | 0 |
262 | 0 | if ( is_idle_domain(d) ) |
263 | 0 | return 0; |
264 | 0 |
265 | 0 | ndom = null_alloc_domdata(ops, d); |
266 | 0 | if ( ndom == NULL ) |
267 | 0 | return -ENOMEM; |
268 | 0 |
269 | 0 | d->sched_priv = ndom; |
270 | 0 |
271 | 0 | return 0; |
272 | 0 | } |
273 | | static void null_dom_destroy(const struct scheduler *ops, struct domain *d) |
274 | 0 | { |
275 | 0 | null_free_domdata(ops, null_dom(d)); |
276 | 0 | } |
277 | | |
278 | | /* |
279 | | * vCPU to pCPU assignment and placement. This _only_ happens: |
280 | | * - on insert, |
281 | | * - on migrate. |
282 | | * |
283 | | * Insert occurs when a vCPU joins this scheduler for the first time |
284 | | * (e.g., when the domain it's part of is moved to the scheduler's |
285 | | * cpupool). |
286 | | * |
287 | | * Migration may be necessary if a pCPU (with a vCPU assigned to it) |
288 | | * is removed from the scheduler's cpupool. |
289 | | * |
290 | | * So this is not part of any hot path. |
291 | | */ |
292 | | static unsigned int pick_cpu(struct null_private *prv, struct vcpu *v) |
293 | 0 | { |
294 | 0 | unsigned int bs; |
295 | 0 | unsigned int cpu = v->processor, new_cpu; |
296 | 0 | cpumask_t *cpus = cpupool_domain_cpumask(v->domain); |
297 | 0 |
298 | 0 | ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock)); |
299 | 0 |
300 | 0 | for_each_affinity_balance_step( bs ) |
301 | 0 | { |
302 | 0 | if ( bs == BALANCE_SOFT_AFFINITY && |
303 | 0 | !has_soft_affinity(v, v->cpu_hard_affinity) ) |
304 | 0 | continue; |
305 | 0 |
306 | 0 | affinity_balance_cpumask(v, bs, cpumask_scratch_cpu(cpu)); |
307 | 0 | cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), cpus); |
308 | 0 |
309 | 0 | /* |
310 | 0 | * If our processor is free, or we are assigned to it, and it is also |
311 | 0 | * still valid and part of our affinity, just go for it. |
312 | 0 | * (Note that we could call vcpu_check_affinity(), but we deliberately |
313 | 0 | * don't, so that we get to keep in the scratch cpumask what we have just |
314 | 0 | * put in it.) |
315 | 0 | */ |
316 | 0 | if ( likely((per_cpu(npc, cpu).vcpu == NULL || per_cpu(npc, cpu).vcpu == v) |
317 | 0 | && cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) ) |
318 | 0 | { |
319 | 0 | new_cpu = cpu; |
320 | 0 | goto out; |
321 | 0 | } |
322 | 0 |
323 | 0 | /* If not, just go for a free pCPU, within our affinity, if any */ |
324 | 0 | cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), |
325 | 0 | &prv->cpus_free); |
326 | 0 | new_cpu = cpumask_first(cpumask_scratch_cpu(cpu)); |
327 | 0 |
328 | 0 | if ( likely(new_cpu != nr_cpu_ids) ) |
329 | 0 | goto out; |
330 | 0 | } |
331 | 0 |
332 | 0 | /* |
333 | 0 | * If we didn't find any free pCPU, just pick any valid pcpu, even if |
334 | 0 | * it has another vCPU assigned. This will happen during shutdown and |
335 | 0 | * suspend/resume, but it may also happen during "normal operation", if |
336 | 0 | * all the pCPUs are busy. |
337 | 0 | * |
338 | 0 | * In fact, there must always be something sane in v->processor, or |
339 | 0 | * vcpu_schedule_lock() and friends won't work. This is not a problem, |
340 | 0 | * as we will actually assign the vCPU to the pCPU we return from here, |
341 | 0 | * only if the pCPU is free. |
342 | 0 | */ |
343 | 0 | cpumask_and(cpumask_scratch_cpu(cpu), cpus, v->cpu_hard_affinity); |
344 | 0 | new_cpu = cpumask_any(cpumask_scratch_cpu(cpu)); |
345 | 0 |
346 | 0 | out: |
347 | 0 | if ( unlikely(tb_init_done) ) |
348 | 0 | { |
349 | 0 | struct { |
350 | 0 | uint16_t vcpu, dom; |
351 | 0 | uint32_t new_cpu; |
352 | 0 | } d; |
353 | 0 | d.dom = v->domain->domain_id; |
354 | 0 | d.vcpu = v->vcpu_id; |
355 | 0 | d.new_cpu = new_cpu; |
356 | 0 | __trace_var(TRC_SNULL_PICKED_CPU, 1, sizeof(d), &d); |
357 | 0 | } |
358 | 0 |
359 | 0 | return new_cpu; |
360 | 0 | } |
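
To make the fallback order of pick_cpu() concrete, here is a worked example (the numbers are made up, not from the source):

    /*
     * Worked example: cpupool = {2,3,4,5}, v->processor = 3, pCPU 3 already
     * has another vCPU assigned, v's hard affinity = {3,4}, no soft affinity.
     *  - The BALANCE_SOFT_AFFINITY step is skipped (no soft affinity).
     *  - In the BALANCE_HARD_AFFINITY step, pCPU 3 is neither free nor ours,
     *    so the scratch mask {3,4} is intersected with prv->cpus_free; if
     *    pCPU 4 is free, 4 is returned.
     *  - Only if pCPU 4 is busy too do we fall through to the bottom and
     *    return some valid pCPU from the hard affinity (3 or 4), even though
     *    it is not free.
     */
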
361 | | |
362 | | static void vcpu_assign(struct null_private *prv, struct vcpu *v, |
363 | | unsigned int cpu) |
364 | 0 | { |
365 | 0 | per_cpu(npc, cpu).vcpu = v; |
366 | 0 | v->processor = cpu; |
367 | 0 | cpumask_clear_cpu(cpu, &prv->cpus_free); |
368 | 0 |
369 | 0 | dprintk(XENLOG_G_INFO, "%d <-- d%dv%d\n", cpu, v->domain->domain_id, v->vcpu_id); |
370 | 0 |
371 | 0 | if ( unlikely(tb_init_done) ) |
372 | 0 | { |
373 | 0 | struct { |
374 | 0 | uint16_t vcpu, dom; |
375 | 0 | uint32_t cpu; |
376 | 0 | } d; |
377 | 0 | d.dom = v->domain->domain_id; |
378 | 0 | d.vcpu = v->vcpu_id; |
379 | 0 | d.cpu = cpu; |
380 | 0 | __trace_var(TRC_SNULL_VCPU_ASSIGN, 1, sizeof(d), &d); |
381 | 0 | } |
382 | 0 | } |
383 | | |
384 | | static void vcpu_deassign(struct null_private *prv, struct vcpu *v, |
385 | | unsigned int cpu) |
386 | 0 | { |
387 | 0 | per_cpu(npc, cpu).vcpu = NULL; |
388 | 0 | cpumask_set_cpu(cpu, &prv->cpus_free); |
389 | 0 |
390 | 0 | dprintk(XENLOG_G_INFO, "%d <-- NULL (d%dv%d)\n", cpu, v->domain->domain_id, v->vcpu_id); |
391 | 0 |
392 | 0 | if ( unlikely(tb_init_done) ) |
393 | 0 | { |
394 | 0 | struct { |
395 | 0 | uint16_t vcpu, dom; |
396 | 0 | uint32_t cpu; |
397 | 0 | } d; |
398 | 0 | d.dom = v->domain->domain_id; |
399 | 0 | d.vcpu = v->vcpu_id; |
400 | 0 | d.cpu = cpu; |
401 | 0 | __trace_var(TRC_SNULL_VCPU_DEASSIGN, 1, sizeof(d), &d); |
402 | 0 | } |
403 | 0 | } |
404 | | |
405 | | /* Change the scheduler of cpu to us (null). */ |
406 | | static void null_switch_sched(struct scheduler *new_ops, unsigned int cpu, |
407 | | void *pdata, void *vdata) |
408 | 0 | { |
409 | 0 | struct schedule_data *sd = &per_cpu(schedule_data, cpu); |
410 | 0 | struct null_private *prv = null_priv(new_ops); |
411 | 0 | struct null_vcpu *nvc = vdata; |
412 | 0 |
413 | 0 | ASSERT(nvc && is_idle_vcpu(nvc->vcpu)); |
414 | 0 |
415 | 0 | idle_vcpu[cpu]->sched_priv = vdata; |
416 | 0 |
417 | 0 | /* |
418 | 0 | * We are holding the runqueue lock already (it's been taken in |
419 | 0 | * schedule_cpu_switch()). It actually may or may not be the 'right' |
420 | 0 | * one for this cpu, but that is ok for preventing races. |
421 | 0 | */ |
422 | 0 | ASSERT(!local_irq_is_enabled()); |
423 | 0 |
424 | 0 | init_pdata(prv, cpu); |
425 | 0 |
426 | 0 | per_cpu(scheduler, cpu) = new_ops; |
427 | 0 | per_cpu(schedule_data, cpu).sched_priv = pdata; |
428 | 0 |
429 | 0 | /* |
430 | 0 | * (Re?)route the lock to the per-pCPU lock as the /last/ thing. In fact, |
431 | 0 | * if it is free (and it can be), we want anyone who manages to take it |
432 | 0 | * to find all the initializations we've done above already in place. |
433 | 0 | */ |
434 | 0 | smp_mb(); |
435 | 0 | sd->schedule_lock = &sd->_lock; |
436 | 0 | } |
437 | | |
438 | | static void null_vcpu_insert(const struct scheduler *ops, struct vcpu *v) |
439 | 0 | { |
440 | 0 | struct null_private *prv = null_priv(ops); |
441 | 0 | struct null_vcpu *nvc = null_vcpu(v); |
442 | 0 | unsigned int cpu; |
443 | 0 | spinlock_t *lock; |
444 | 0 |
445 | 0 | ASSERT(!is_idle_vcpu(v)); |
446 | 0 |
447 | 0 | lock = vcpu_schedule_lock_irq(v); |
448 | 0 | retry: |
449 | 0 |
|
450 | 0 | cpu = v->processor = pick_cpu(prv, v); |
451 | 0 |
452 | 0 | spin_unlock(lock); |
453 | 0 |
454 | 0 | lock = vcpu_schedule_lock(v); |
455 | 0 |
456 | 0 | cpumask_and(cpumask_scratch_cpu(cpu), v->cpu_hard_affinity, |
457 | 0 | cpupool_domain_cpumask(v->domain)); |
458 | 0 |
459 | 0 | /* If the pCPU is free, we assign v to it */ |
460 | 0 | if ( likely(per_cpu(npc, cpu).vcpu == NULL) ) |
461 | 0 | { |
462 | 0 | /* |
463 | 0 | * Insert is followed by vcpu_wake(), so there's no need to poke |
464 | 0 | * the pcpu with the SCHEDULE_SOFTIRQ, as wake will do that. |
465 | 0 | */ |
466 | 0 | vcpu_assign(prv, v, cpu); |
467 | 0 | } |
468 | 0 | else if ( cpumask_intersects(&prv->cpus_free, cpumask_scratch_cpu(cpu)) ) |
469 | 0 | { |
470 | 0 | /* |
471 | 0 | * If the pCPU is not free (e.g., because we raced with another |
472 | 0 | * insert or a migrate), but there are other free pCPUs, we can |
473 | 0 | * try to pick again. |
474 | 0 | */ |
475 | 0 | goto retry; |
476 | 0 | } |
477 | 0 | else |
478 | 0 | { |
479 | 0 | /* |
480 | 0 | * If the pCPU is not free, and there aren't any (valid) others, |
481 | 0 | * we have no alternative but to go into the waitqueue. |
482 | 0 | */ |
483 | 0 | spin_lock(&prv->waitq_lock); |
484 | 0 | list_add_tail(&nvc->waitq_elem, &prv->waitq); |
485 | 0 | dprintk(XENLOG_G_WARNING, "WARNING: d%dv%d not assigned to any CPU!\n", |
486 | 0 | v->domain->domain_id, v->vcpu_id); |
487 | 0 | spin_unlock(&prv->waitq_lock); |
488 | 0 | } |
489 | 0 | spin_unlock_irq(lock); |
490 | 0 |
491 | 0 | SCHED_STAT_CRANK(vcpu_insert); |
492 | 0 | } |
493 | | |
494 | | static void _vcpu_remove(struct null_private *prv, struct vcpu *v) |
495 | 0 | { |
496 | 0 | unsigned int bs; |
497 | 0 | unsigned int cpu = v->processor; |
498 | 0 | struct null_vcpu *wvc; |
499 | 0 |
500 | 0 | ASSERT(list_empty(&null_vcpu(v)->waitq_elem)); |
501 | 0 |
502 | 0 | vcpu_deassign(prv, v, cpu); |
503 | 0 |
504 | 0 | spin_lock(&prv->waitq_lock); |
505 | 0 |
506 | 0 | /* |
507 | 0 | * If v is assigned to a pCPU, let's see if there is someone waiting |
508 | 0 | * who is suitable to be assigned to it (prioritizing vcpus that have |
509 | 0 | * soft-affinity with cpu). |
510 | 0 | */ |
511 | 0 | for_each_affinity_balance_step( bs ) |
512 | 0 | { |
513 | 0 | list_for_each_entry( wvc, &prv->waitq, waitq_elem ) |
514 | 0 | { |
515 | 0 | if ( bs == BALANCE_SOFT_AFFINITY && |
516 | 0 | !has_soft_affinity(wvc->vcpu, wvc->vcpu->cpu_hard_affinity) ) |
517 | 0 | continue; |
518 | 0 |
519 | 0 | if ( vcpu_check_affinity(wvc->vcpu, cpu, bs) ) |
520 | 0 | { |
521 | 0 | list_del_init(&wvc->waitq_elem); |
522 | 0 | vcpu_assign(prv, wvc->vcpu, cpu); |
523 | 0 | cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); |
524 | 0 | spin_unlock(&prv->waitq_lock); |
525 | 0 | return; |
526 | 0 | } |
527 | 0 | } |
528 | 0 | } |
529 | 0 | spin_unlock(&prv->waitq_lock); |
530 | 0 | } |
531 | | |
532 | | static void null_vcpu_remove(const struct scheduler *ops, struct vcpu *v) |
533 | 0 | { |
534 | 0 | struct null_private *prv = null_priv(ops); |
535 | 0 | struct null_vcpu *nvc = null_vcpu(v); |
536 | 0 | spinlock_t *lock; |
537 | 0 |
538 | 0 | ASSERT(!is_idle_vcpu(v)); |
539 | 0 |
540 | 0 | lock = vcpu_schedule_lock_irq(v); |
541 | 0 |
542 | 0 | /* If v is in waitqueue, just get it out of there and bail */ |
543 | 0 | if ( unlikely(!list_empty(&nvc->waitq_elem)) ) |
544 | 0 | { |
545 | 0 | spin_lock(&prv->waitq_lock); |
546 | 0 | list_del_init(&nvc->waitq_elem); |
547 | 0 | spin_unlock(&prv->waitq_lock); |
548 | 0 |
549 | 0 | goto out; |
550 | 0 | } |
551 | 0 |
552 | 0 | ASSERT(per_cpu(npc, v->processor).vcpu == v); |
553 | 0 | ASSERT(!cpumask_test_cpu(v->processor, &prv->cpus_free)); |
554 | 0 |
555 | 0 | _vcpu_remove(prv, v); |
556 | 0 |
557 | 0 | out: |
558 | 0 | vcpu_schedule_unlock_irq(lock, v); |
559 | 0 |
560 | 0 | SCHED_STAT_CRANK(vcpu_remove); |
561 | 0 | } |
562 | | |
563 | | static void null_vcpu_wake(const struct scheduler *ops, struct vcpu *v) |
564 | 0 | { |
565 | 0 | ASSERT(!is_idle_vcpu(v)); |
566 | 0 |
567 | 0 | if ( unlikely(curr_on_cpu(v->processor) == v) ) |
568 | 0 | { |
569 | 0 | SCHED_STAT_CRANK(vcpu_wake_running); |
570 | 0 | return; |
571 | 0 | } |
572 | 0 |
573 | 0 | if ( unlikely(!list_empty(&null_vcpu(v)->waitq_elem)) ) |
574 | 0 | { |
575 | 0 | /* Not exactly "on runq", but close enough for reusing the counter */ |
576 | 0 | SCHED_STAT_CRANK(vcpu_wake_onrunq); |
577 | 0 | return; |
578 | 0 | } |
579 | 0 |
580 | 0 | if ( likely(vcpu_runnable(v)) ) |
581 | 0 | SCHED_STAT_CRANK(vcpu_wake_runnable); |
582 | 0 | else |
583 | 0 | SCHED_STAT_CRANK(vcpu_wake_not_runnable); |
584 | 0 |
585 | 0 | /* Note that we get here only for vCPUs assigned to a pCPU */ |
586 | 0 | cpu_raise_softirq(v->processor, SCHEDULE_SOFTIRQ); |
587 | 0 | } |
588 | | |
589 | | static void null_vcpu_sleep(const struct scheduler *ops, struct vcpu *v) |
590 | 0 | { |
591 | 0 | ASSERT(!is_idle_vcpu(v)); |
592 | 0 |
593 | 0 | /* If v is not assigned to a pCPU, or is not running, no need to bother */ |
594 | 0 | if ( curr_on_cpu(v->processor) == v ) |
595 | 0 | cpu_raise_softirq(v->processor, SCHEDULE_SOFTIRQ); |
596 | 0 |
597 | 0 | SCHED_STAT_CRANK(vcpu_sleep); |
598 | 0 | } |
599 | | |
600 | | static int null_cpu_pick(const struct scheduler *ops, struct vcpu *v) |
601 | 0 | { |
602 | 0 | ASSERT(!is_idle_vcpu(v)); |
603 | 0 | return pick_cpu(null_priv(ops), v); |
604 | 0 | } |
605 | | |
606 | | static void null_vcpu_migrate(const struct scheduler *ops, struct vcpu *v, |
607 | | unsigned int new_cpu) |
608 | 0 | { |
609 | 0 | struct null_private *prv = null_priv(ops); |
610 | 0 | struct null_vcpu *nvc = null_vcpu(v); |
611 | 0 |
612 | 0 | ASSERT(!is_idle_vcpu(v)); |
613 | 0 |
614 | 0 | if ( v->processor == new_cpu ) |
615 | 0 | return; |
616 | 0 |
617 | 0 | if ( unlikely(tb_init_done) ) |
618 | 0 | { |
619 | 0 | struct { |
620 | 0 | uint16_t vcpu, dom; |
621 | 0 | uint16_t cpu, new_cpu; |
622 | 0 | } d; |
623 | 0 | d.dom = v->domain->domain_id; |
624 | 0 | d.vcpu = v->vcpu_id; |
625 | 0 | d.cpu = v->processor; |
626 | 0 | d.new_cpu = new_cpu; |
627 | 0 | __trace_var(TRC_SNULL_MIGRATE, 1, sizeof(d), &d); |
628 | 0 | } |
629 | 0 |
630 | 0 | /* |
631 | 0 | * v is either assigned to a pCPU, or in the waitqueue. |
632 | 0 | * |
633 | 0 | * In the former case, the pCPU to which it was assigned would |
634 | 0 | * become free, and we should therefore check whether there is |
635 | 0 | * anyone in the waitqueue that can be assigned to it. |
636 | 0 | * |
637 | 0 | * In the latter, there is just nothing to do. |
638 | 0 | */ |
639 | 0 | if ( likely(list_empty(&nvc->waitq_elem)) ) |
640 | 0 | { |
641 | 0 | _vcpu_remove(prv, v); |
642 | 0 | SCHED_STAT_CRANK(migrate_running); |
643 | 0 | } |
644 | 0 | else |
645 | 0 | SCHED_STAT_CRANK(migrate_on_runq); |
646 | 0 |
647 | 0 | SCHED_STAT_CRANK(migrated); |
648 | 0 |
649 | 0 | /* |
650 | 0 | * Let's now consider new_cpu, which is where v is being sent. It can be |
651 | 0 | * either free, or have a vCPU already assigned to it. |
652 | 0 | * |
653 | 0 | * In the former case, we should assign v to it, and try to get it to run, |
654 | 0 | * if possible, according to affinity. |
655 | 0 | * |
656 | 0 | * In the latter, all we can do is park v in the waitqueue. |
657 | 0 | */ |
658 | 0 | if ( per_cpu(npc, new_cpu).vcpu == NULL && |
659 | 0 | vcpu_check_affinity(v, new_cpu, BALANCE_HARD_AFFINITY) ) |
660 | 0 | { |
661 | 0 | /* v might have been in the waitqueue, so remove it */ |
662 | 0 | spin_lock(&prv->waitq_lock); |
663 | 0 | list_del_init(&nvc->waitq_elem); |
664 | 0 | spin_unlock(&prv->waitq_lock); |
665 | 0 |
666 | 0 | vcpu_assign(prv, v, new_cpu); |
667 | 0 | } |
668 | 0 | else |
669 | 0 | { |
670 | 0 | /* Put v in the waitqueue, if it wasn't there already */ |
671 | 0 | spin_lock(&prv->waitq_lock); |
672 | 0 | if ( list_empty(&nvc->waitq_elem) ) |
673 | 0 | { |
674 | 0 | list_add_tail(&nvc->waitq_elem, &prv->waitq); |
675 | 0 | dprintk(XENLOG_G_WARNING, "WARNING: d%dv%d not assigned to any CPU!\n", |
676 | 0 | v->domain->domain_id, v->vcpu_id); |
677 | 0 | } |
678 | 0 | spin_unlock(&prv->waitq_lock); |
679 | 0 | } |
680 | 0 |
681 | 0 | /* |
682 | 0 | * Whatever the outcome of all the above, we always at least override |
683 | 0 | * v->processor. This is especially important for the shutdown and |
684 | 0 | * suspend/resume paths, where it is important to let our caller |
685 | 0 | * (cpu_disable_scheduler()) know that the migration did happen, to the |
686 | 0 | * best of our ability. In case of suspend, any temporary inconsistency |
687 | 0 | * caused by this will be fixed up during resume. |
688 | 0 | */ |
689 | 0 | v->processor = new_cpu; |
690 | 0 | } |
691 | | |
692 | | #ifndef NDEBUG |
693 | | static inline void null_vcpu_check(struct vcpu *v) |
694 | 0 | { |
695 | 0 | struct null_vcpu * const nvc = null_vcpu(v); |
696 | 0 | struct null_dom * const ndom = null_dom(v->domain); |
697 | 0 |
698 | 0 | BUG_ON(nvc->vcpu != v); |
699 | 0 |
700 | 0 | if ( ndom ) |
701 | 0 | BUG_ON(is_idle_vcpu(v)); |
702 | 0 | else |
703 | 0 | BUG_ON(!is_idle_vcpu(v)); |
704 | 0 |
705 | 0 | SCHED_STAT_CRANK(vcpu_check); |
706 | 0 | } |
707 | 0 | #define NULL_VCPU_CHECK(v) (null_vcpu_check(v)) |
708 | | #else |
709 | | #define NULL_VCPU_CHECK(v) |
710 | | #endif |
711 | | |
712 | | |
713 | | /* |
714 | | * The simplest scheduling function of all time! We either return: |
715 | | * - the vCPU assigned to the pCPU, if there's one and it can run; |
716 | | * - the idle vCPU, otherwise. |
717 | | */ |
718 | | static struct task_slice null_schedule(const struct scheduler *ops, |
719 | | s_time_t now, |
720 | | bool_t tasklet_work_scheduled) |
721 | 0 | { |
722 | 0 | unsigned int bs; |
723 | 0 | const unsigned int cpu = smp_processor_id(); |
724 | 0 | struct null_private *prv = null_priv(ops); |
725 | 0 | struct null_vcpu *wvc; |
726 | 0 | struct task_slice ret; |
727 | 0 |
728 | 0 | SCHED_STAT_CRANK(schedule); |
729 | 0 | NULL_VCPU_CHECK(current); |
730 | 0 |
731 | 0 | if ( unlikely(tb_init_done) ) |
732 | 0 | { |
733 | 0 | struct { |
734 | 0 | uint16_t tasklet, cpu; |
735 | 0 | int16_t vcpu, dom; |
736 | 0 | } d; |
737 | 0 | d.cpu = cpu; |
738 | 0 | d.tasklet = tasklet_work_scheduled; |
739 | 0 | if ( per_cpu(npc, cpu).vcpu == NULL ) |
740 | 0 | { |
741 | 0 | d.vcpu = d.dom = -1; |
742 | 0 | } |
743 | 0 | else |
744 | 0 | { |
745 | 0 | d.vcpu = per_cpu(npc, cpu).vcpu->vcpu_id; |
746 | 0 | d.dom = per_cpu(npc, cpu).vcpu->domain->domain_id; |
747 | 0 | } |
748 | 0 | __trace_var(TRC_SNULL_SCHEDULE, 1, sizeof(d), &d); |
749 | 0 | } |
750 | 0 |
751 | 0 | if ( tasklet_work_scheduled ) |
752 | 0 | { |
753 | 0 | trace_var(TRC_SNULL_TASKLET, 1, 0, NULL); |
754 | 0 | ret.task = idle_vcpu[cpu]; |
755 | 0 | } |
756 | 0 | else |
757 | 0 | ret.task = per_cpu(npc, cpu).vcpu; |
758 | 0 | ret.migrated = 0; |
759 | 0 | ret.time = -1; |
760 | 0 |
761 | 0 | /* |
762 | 0 | * We may be new in the cpupool, or just coming back online, in which |
763 | 0 | * case there may be vCPUs in the waitqueue that we can assign to this |
764 | 0 | * pCPU and run. |
765 | 0 | */ |
766 | 0 | if ( unlikely(ret.task == NULL) ) |
767 | 0 | { |
768 | 0 | spin_lock(&prv->waitq_lock); |
769 | 0 |
770 | 0 | if ( list_empty(&prv->waitq) ) |
771 | 0 | goto unlock; |
772 | 0 |
|
773 | 0 | /* |
774 | 0 | * We scan the waitqueue twice, for prioritizing vcpus that have |
775 | 0 | * soft-affinity with cpu. This may look like something expensive to |
776 | 0 | * do here in null_schedule(), but it's actually fine, beceuse we do |
777 | 0 | * it only in cases where a pcpu has no vcpu associated (e.g., as |
778 | 0 | * said above, the cpu has just joined a cpupool). |
779 | 0 | */ |
780 | 0 | for_each_affinity_balance_step( bs ) |
781 | 0 | { |
782 | 0 | list_for_each_entry( wvc, &prv->waitq, waitq_elem ) |
783 | 0 | { |
784 | 0 | if ( bs == BALANCE_SOFT_AFFINITY && |
785 | 0 | !has_soft_affinity(wvc->vcpu, wvc->vcpu->cpu_hard_affinity) ) |
786 | 0 | continue; |
787 | 0 |
788 | 0 | if ( vcpu_check_affinity(wvc->vcpu, cpu, bs) ) |
789 | 0 | { |
790 | 0 | vcpu_assign(prv, wvc->vcpu, cpu); |
791 | 0 | list_del_init(&wvc->waitq_elem); |
792 | 0 | ret.task = wvc->vcpu; |
793 | 0 | goto unlock; |
794 | 0 | } |
795 | 0 | } |
796 | 0 | } |
797 | 0 | unlock: |
798 | 0 | spin_unlock(&prv->waitq_lock); |
799 | 0 | } |
800 | 0 |
801 | 0 | if ( unlikely(ret.task == NULL || !vcpu_runnable(ret.task)) ) |
802 | 0 | ret.task = idle_vcpu[cpu]; |
803 | 0 |
804 | 0 | NULL_VCPU_CHECK(ret.task); |
805 | 0 | return ret; |
806 | 0 | } |
807 | | |
808 | | static inline void dump_vcpu(struct null_private *prv, struct null_vcpu *nvc) |
809 | 0 | { |
810 | 0 | printk("[%i.%i] pcpu=%d", nvc->vcpu->domain->domain_id, |
811 | 0 | nvc->vcpu->vcpu_id, list_empty(&nvc->waitq_elem) ? |
812 | 0 | nvc->vcpu->processor : -1); |
813 | 0 | } |
814 | | |
815 | | static void null_dump_pcpu(const struct scheduler *ops, int cpu) |
816 | 0 | { |
817 | 0 | struct null_private *prv = null_priv(ops); |
818 | 0 | struct null_vcpu *nvc; |
819 | 0 | spinlock_t *lock; |
820 | 0 | unsigned long flags; |
821 | 0 | #define cpustr keyhandler_scratch |
822 | 0 |
823 | 0 | lock = pcpu_schedule_lock_irqsave(cpu, &flags); |
824 | 0 |
825 | 0 | cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cpu)); |
826 | 0 | printk("CPU[%02d] sibling=%s, ", cpu, cpustr); |
827 | 0 | cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu)); |
828 | 0 | printk("core=%s", cpustr); |
829 | 0 | if ( per_cpu(npc, cpu).vcpu != NULL ) |
830 | 0 | printk(", vcpu=d%dv%d", per_cpu(npc, cpu).vcpu->domain->domain_id, |
831 | 0 | per_cpu(npc, cpu).vcpu->vcpu_id); |
832 | 0 | printk("\n"); |
833 | 0 |
834 | 0 | /* current VCPU (nothing to say if that's the idle vcpu) */ |
835 | 0 | nvc = null_vcpu(curr_on_cpu(cpu)); |
836 | 0 | if ( nvc && !is_idle_vcpu(nvc->vcpu) ) |
837 | 0 | { |
838 | 0 | printk("\trun: "); |
839 | 0 | dump_vcpu(prv, nvc); |
840 | 0 | printk("\n"); |
841 | 0 | } |
842 | 0 |
843 | 0 | pcpu_schedule_unlock_irqrestore(lock, flags, cpu); |
844 | 0 | #undef cpustr |
845 | 0 | } |
846 | | |
847 | | static void null_dump(const struct scheduler *ops) |
848 | 0 | { |
849 | 0 | struct null_private *prv = null_priv(ops); |
850 | 0 | struct list_head *iter; |
851 | 0 | unsigned long flags; |
852 | 0 | unsigned int loop; |
853 | 0 | #define cpustr keyhandler_scratch |
854 | 0 |
855 | 0 | spin_lock_irqsave(&prv->lock, flags); |
856 | 0 |
857 | 0 | cpulist_scnprintf(cpustr, sizeof(cpustr), &prv->cpus_free); |
858 | 0 | printk("\tcpus_free = %s\n", cpustr); |
859 | 0 |
860 | 0 | printk("Domain info:\n"); |
861 | 0 | loop = 0; |
862 | 0 | list_for_each( iter, &prv->ndom ) |
863 | 0 | { |
864 | 0 | struct null_dom *ndom; |
865 | 0 | struct vcpu *v; |
866 | 0 |
867 | 0 | ndom = list_entry(iter, struct null_dom, ndom_elem); |
868 | 0 |
869 | 0 | printk("\tDomain: %d\n", ndom->dom->domain_id); |
870 | 0 | for_each_vcpu( ndom->dom, v ) |
871 | 0 | { |
872 | 0 | struct null_vcpu * const nvc = null_vcpu(v); |
873 | 0 | spinlock_t *lock; |
874 | 0 |
875 | 0 | lock = vcpu_schedule_lock(nvc->vcpu); |
876 | 0 |
877 | 0 | printk("\t%3d: ", ++loop); |
878 | 0 | dump_vcpu(prv, nvc); |
879 | 0 | printk("\n"); |
880 | 0 |
881 | 0 | vcpu_schedule_unlock(lock, nvc->vcpu); |
882 | 0 | } |
883 | 0 | } |
884 | 0 |
885 | 0 | printk("Waitqueue: "); |
886 | 0 | loop = 0; |
887 | 0 | spin_lock(&prv->waitq_lock); |
888 | 0 | list_for_each( iter, &prv->waitq ) |
889 | 0 | { |
890 | 0 | struct null_vcpu *nvc = list_entry(iter, struct null_vcpu, waitq_elem); |
891 | 0 |
892 | 0 | if ( loop++ != 0 ) |
893 | 0 | printk(", "); |
894 | 0 | if ( loop % 24 == 0 ) |
895 | 0 | printk("\n\t"); |
896 | 0 | printk("d%dv%d", nvc->vcpu->domain->domain_id, nvc->vcpu->vcpu_id); |
897 | 0 | } |
898 | 0 | printk("\n"); |
899 | 0 | spin_unlock(&prv->waitq_lock); |
900 | 0 |
901 | 0 | spin_unlock_irqrestore(&prv->lock, flags); |
902 | 0 | #undef cpustr |
903 | 0 | } |
904 | | |
905 | | const struct scheduler sched_null_def = { |
906 | | .name = "null Scheduler", |
907 | | .opt_name = "null", |
908 | | .sched_id = XEN_SCHEDULER_NULL, |
909 | | .sched_data = NULL, |
910 | | |
911 | | .init = null_init, |
912 | | .deinit = null_deinit, |
913 | | .init_pdata = null_init_pdata, |
914 | | .switch_sched = null_switch_sched, |
915 | | .deinit_pdata = null_deinit_pdata, |
916 | | |
917 | | .alloc_vdata = null_alloc_vdata, |
918 | | .free_vdata = null_free_vdata, |
919 | | .alloc_domdata = null_alloc_domdata, |
920 | | .free_domdata = null_free_domdata, |
921 | | |
922 | | .init_domain = null_dom_init, |
923 | | .destroy_domain = null_dom_destroy, |
924 | | |
925 | | .insert_vcpu = null_vcpu_insert, |
926 | | .remove_vcpu = null_vcpu_remove, |
927 | | |
928 | | .wake = null_vcpu_wake, |
929 | | .sleep = null_vcpu_sleep, |
930 | | .pick_cpu = null_cpu_pick, |
931 | | .migrate = null_vcpu_migrate, |
932 | | .do_schedule = null_schedule, |
933 | | |
934 | | .dump_cpu_state = null_dump_pcpu, |
935 | | .dump_settings = null_dump, |
936 | | }; |
937 | | |
938 | | REGISTER_SCHEDULER(sched_null_def); |
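
Besides being usable in a cpupool, the opt_name above ("null") is what makes this scheduler selectable system-wide through the sched= Xen boot parameter. Illustrative only (bootloader syntax varies; the elided options are whatever the system already passes to the hypervisor):

    # In the bootloader's Xen entry, append sched=null to the hypervisor line:
    #   multiboot2 /boot/xen.gz ... sched=null
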