/****************************************************************************
 * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
 ****************************************************************************
 *
 *        File: common/csched_credit.c
 *      Author: Emmanuel Ackaouy
 *
 * Description: Credit-based SMP CPU scheduler
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <asm/atomic.h>
#include <xen/errno.h>
#include <xen/keyhandler.h>

/*
 * CSCHED_STATS
 *
 * Manage very basic per-vCPU counters and stats.
 *
 * Useful for debugging live systems. The stats are displayed
 * with runq dumps ('r' on the Xen console).
 */
#ifdef PERF_COUNTERS
#define CSCHED_STATS
#endif


/*
 * Basic constants
 */
#define CSCHED_DEFAULT_WEIGHT       256
#define CSCHED_TICKS_PER_TSLICE     3
#define CSCHED_TICKS_PER_ACCT       3
#define CSCHED_MSECS_PER_TICK       10
#define CSCHED_MSECS_PER_TSLICE     \
    (CSCHED_MSECS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
#define CSCHED_CREDITS_PER_MSEC     10
#define CSCHED_CREDITS_PER_TSLICE   \
    (CSCHED_CREDITS_PER_MSEC * CSCHED_MSECS_PER_TSLICE)
#define CSCHED_CREDITS_PER_ACCT     \
    (CSCHED_CREDITS_PER_MSEC * CSCHED_MSECS_PER_TICK * CSCHED_TICKS_PER_ACCT)
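/*
 * With the constants above: a tick is 10ms, a timeslice is 3 ticks (30ms),
 * and accounting also runs every 3 ticks (30ms).  Credits are burned at 10
 * per millisecond of CPU time, so a full timeslice costs 300 credits and
 * CSCHED_CREDITS_PER_ACCT likewise works out to 300.
 */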


/*
 * Priorities
 */
#define CSCHED_PRI_TS_BOOST      0      /* time-share waking up */
#define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
#define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
#define CSCHED_PRI_IDLE        -64      /* idle */


/*
 * Flags
 */
#define CSCHED_FLAG_VCPU_PARKED 0x0001  /* VCPU over capped credits */
#define CSCHED_FLAG_VCPU_YIELD  0x0002  /* VCPU yielding */


/*
 * Useful macros
 */
#define CSCHED_PRIV(_ops)   \
    ((struct csched_private *)((_ops)->sched_data))
#define CSCHED_PCPU(_c)     \
    ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
#define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))
#define CSCHED_CPUONLINE(_pool)    \
    (((_pool) == NULL) ? &cpupool_free_cpus : &(_pool)->cpu_valid)


/*
 * Stats
 */
#define CSCHED_STAT_CRANK(_X)               (perfc_incr(_X))

#ifdef CSCHED_STATS

#define CSCHED_VCPU_STATS_RESET(_V)                     \
    do                                                  \
    {                                                   \
        memset(&(_V)->stats, 0, sizeof((_V)->stats));   \
    } while ( 0 )

#define CSCHED_VCPU_STAT_CRANK(_V, _X)      (((_V)->stats._X)++)

#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    (((_V)->stats._X) = (_Y))

#else /* CSCHED_STATS */

#define CSCHED_VCPU_STATS_RESET(_V)         do {} while ( 0 )
#define CSCHED_VCPU_STAT_CRANK(_V, _X)      do {} while ( 0 )
#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    do {} while ( 0 )

#endif /* CSCHED_STATS */


/*
 * Boot parameters
 */
static bool_t __read_mostly sched_credit_default_yield;
boolean_param("sched_credit_default_yield", sched_credit_default_yield);

/*
 * Physical CPU
 */
struct csched_pcpu {
    struct list_head runq;
    uint32_t runq_sort_last;
    struct timer ticker;
    unsigned int tick;
    unsigned int idle_bias;
};

/*
 * Virtual CPU
 */
struct csched_vcpu {
    struct list_head runq_elem;
    struct list_head active_vcpu_elem;
    struct csched_dom *sdom;
    struct vcpu *vcpu;
    atomic_t credit;
    s_time_t start_time;   /* When we were scheduled (used for credit) */
    uint16_t flags;
    int16_t pri;
#ifdef CSCHED_STATS
    struct {
        int credit_last;
        uint32_t credit_incr;
        uint32_t state_active;
        uint32_t state_idle;
        uint32_t migrate_q;
        uint32_t migrate_r;
    } stats;
#endif
};

/*
 * Domain
 */
struct csched_dom {
    struct list_head active_vcpu;
    struct list_head active_sdom_elem;
    struct domain *dom;
    uint16_t active_vcpu_count;
    uint16_t weight;
    uint16_t cap;
};

/*
 * System-wide private data
 */
struct csched_private {
    spinlock_t lock;
    struct list_head active_sdom;
    uint32_t ncpus;
    struct timer master_ticker;
    unsigned int master;
    cpumask_t idlers;
    cpumask_t cpus;
    uint32_t weight;
    uint32_t credit;
    int credit_balance;
    uint32_t runq_sort;
};

static void csched_tick(void *_cpu);
static void csched_acct(void *dummy);

static inline int
__vcpu_on_runq(struct csched_vcpu *svc)
{
    return !list_empty(&svc->runq_elem);
}

static inline struct csched_vcpu *
__runq_elem(struct list_head *elem)
{
    return list_entry(elem, struct csched_vcpu, runq_elem);
}

static inline void
__runq_insert(unsigned int cpu, struct csched_vcpu *svc)
{
    const struct list_head * const runq = RUNQ(cpu);
    struct list_head *iter;

    BUG_ON( __vcpu_on_runq(svc) );
    BUG_ON( cpu != svc->vcpu->processor );

    list_for_each( iter, runq )
    {
        const struct csched_vcpu * const iter_svc = __runq_elem(iter);
        if ( svc->pri > iter_svc->pri )
            break;
    }

    /* If the vcpu yielded, try to put it behind one lower-priority
     * runnable vcpu if we can. The next runq_sort will bring it forward
     * within 30ms if the queue is too long. */
    if ( svc->flags & CSCHED_FLAG_VCPU_YIELD
         && __runq_elem(iter)->pri > CSCHED_PRI_IDLE )
    {
        iter = iter->next;

        /* Some sanity checks */
        BUG_ON(iter == runq);
    }

    list_add_tail(&svc->runq_elem, iter);
}
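/*
 * Ordering note for the insert above: the loop stops at the first entry with
 * a strictly lower priority and list_add_tail() then places the new VCPU in
 * front of it, so a newly queued VCPU lands behind all VCPUs of equal
 * priority (FIFO within a priority level).  The yield case deliberately
 * slips it one runnable VCPU further back still.
 */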

static inline void
__runq_remove(struct csched_vcpu *svc)
{
    BUG_ON( !__vcpu_on_runq(svc) );
    list_del_init(&svc->runq_elem);
}

static void burn_credits(struct csched_vcpu *svc, s_time_t now)
{
    s_time_t delta;
    unsigned int credits;

    /* Assert svc is current */
    ASSERT(svc==CSCHED_VCPU(per_cpu(schedule_data, svc->vcpu->processor).curr));

    if ( (delta = now - svc->start_time) <= 0 )
        return;

    credits = (delta*CSCHED_CREDITS_PER_MSEC + MILLISECS(1)/2) / MILLISECS(1);
    atomic_sub(credits, &svc->credit);
    svc->start_time += (credits * MILLISECS(1)) / CSCHED_CREDITS_PER_MSEC;
}
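/*
 * Worked example of the arithmetic above: a VCPU that has run for 2.34ms
 * since svc->start_time is charged (2.34ms * 10 + 0.5ms) / 1ms = 23 credits
 * (rounded to the nearest credit), and start_time is advanced by 2.3ms --
 * exactly the time those 23 credits correspond to -- so the uncharged 0.04ms
 * carries over to the next call instead of being lost.
 */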

static bool_t __read_mostly opt_tickle_one_idle = 1;
boolean_param("tickle_one_idle_cpu", opt_tickle_one_idle);

DEFINE_PER_CPU(unsigned int, last_tickle_cpu);

static inline void
__runq_tickle(unsigned int cpu, struct csched_vcpu *new)
{
    struct csched_vcpu * const cur =
        CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
    struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
    cpumask_t mask;

    ASSERT(cur);
    cpus_clear(mask);

    /* If strictly higher priority than current VCPU, signal the CPU */
    if ( new->pri > cur->pri )
    {
        if ( cur->pri == CSCHED_PRI_IDLE )
            CSCHED_STAT_CRANK(tickle_local_idler);
        else if ( cur->pri == CSCHED_PRI_TS_OVER )
            CSCHED_STAT_CRANK(tickle_local_over);
        else if ( cur->pri == CSCHED_PRI_TS_UNDER )
            CSCHED_STAT_CRANK(tickle_local_under);
        else
            CSCHED_STAT_CRANK(tickle_local_other);

        cpu_set(cpu, mask);
    }

    /*
     * If this CPU has at least two runnable VCPUs, we tickle any idlers to
     * let them know there is runnable work in the system...
     */
    if ( cur->pri > CSCHED_PRI_IDLE )
    {
        if ( cpus_empty(prv->idlers) )
        {
            CSCHED_STAT_CRANK(tickle_idlers_none);
        }
        else
        {
            cpumask_t idle_mask;

            cpus_and(idle_mask, prv->idlers, new->vcpu->cpu_affinity);
            if ( !cpus_empty(idle_mask) )
            {
                CSCHED_STAT_CRANK(tickle_idlers_some);
                if ( opt_tickle_one_idle )
                {
                    this_cpu(last_tickle_cpu) =
                        cycle_cpu(this_cpu(last_tickle_cpu), idle_mask);
                    cpu_set(this_cpu(last_tickle_cpu), mask);
                }
                else
                    cpus_or(mask, mask, idle_mask);
            }
            cpus_and(mask, mask, new->vcpu->cpu_affinity);
        }
    }

    /* Send scheduler interrupts to designated CPUs */
    if ( !cpus_empty(mask) )
        cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
}
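/*
 * The "mask" built above is the set of physical CPUs that get a
 * SCHEDULE_SOFTIRQ.  With opt_tickle_one_idle (the default), only one idle
 * CPU is poked per wakeup, chosen round-robin via last_tickle_cpu so that
 * repeated wakeups do not always land on the same idler; booting with
 * tickle_one_idle_cpu=0 reverts to waking every idler in the new VCPU's
 * affinity mask.
 */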

static void
csched_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
{
    struct csched_private *prv = CSCHED_PRIV(ops);
    struct csched_pcpu *spc = pcpu;
    unsigned long flags;

    if ( spc == NULL )
        return;

    spin_lock_irqsave(&prv->lock, flags);

    prv->credit -= CSCHED_CREDITS_PER_ACCT;
    prv->ncpus--;
    cpu_clear(cpu, prv->idlers);
    cpu_clear(cpu, prv->cpus);
    if ( (prv->master == cpu) && (prv->ncpus > 0) )
    {
        prv->master = first_cpu(prv->cpus);
        migrate_timer(&prv->master_ticker, prv->master);
    }
    kill_timer(&spc->ticker);
    if ( prv->ncpus == 0 )
        kill_timer(&prv->master_ticker);

    spin_unlock_irqrestore(&prv->lock, flags);

    xfree(spc);
}

static void *
csched_alloc_pdata(const struct scheduler *ops, int cpu)
{
    struct csched_pcpu *spc;
    struct csched_private *prv = CSCHED_PRIV(ops);
    unsigned long flags;

    /* Allocate per-PCPU info */
    spc = xmalloc(struct csched_pcpu);
    if ( spc == NULL )
        return NULL;
    memset(spc, 0, sizeof(*spc));

    spin_lock_irqsave(&prv->lock, flags);

    /* Initialize/update system-wide config */
    prv->credit += CSCHED_CREDITS_PER_ACCT;
    prv->ncpus++;
    cpu_set(cpu, prv->cpus);
    if ( prv->ncpus == 1 )
    {
        prv->master = cpu;
        init_timer(&prv->master_ticker, csched_acct, prv, cpu);
        set_timer(&prv->master_ticker, NOW() +
                  MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT);
    }

    init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu);
    set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));

    INIT_LIST_HEAD(&spc->runq);
    spc->runq_sort_last = prv->runq_sort;
    spc->idle_bias = NR_CPUS - 1;
    if ( per_cpu(schedule_data, cpu).sched_priv == NULL )
        per_cpu(schedule_data, cpu).sched_priv = spc;

    /* Start off idling... */
    BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
    cpu_set(cpu, prv->idlers);

    spin_unlock_irqrestore(&prv->lock, flags);

    return spc;
}

#ifndef NDEBUG
static inline void
__csched_vcpu_check(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( svc->vcpu != vc );
    BUG_ON( sdom != CSCHED_DOM(vc->domain) );
    if ( sdom )
    {
        BUG_ON( is_idle_vcpu(vc) );
        BUG_ON( sdom->dom != vc->domain );
    }
    else
    {
        BUG_ON( !is_idle_vcpu(vc) );
    }

    CSCHED_STAT_CRANK(vcpu_check);
}
#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
#else
#define CSCHED_VCPU_CHECK(_vc)
#endif

/*
 * Delay, in microseconds, between migrations of a VCPU between PCPUs.
 * This prevents rapid fluttering of a VCPU between CPUs, and reduces the
 * implicit overheads such as cache-warming. 1ms (1000) has been measured
 * as a good value.
 */
static unsigned int vcpu_migration_delay;
integer_param("vcpu_migration_delay", vcpu_migration_delay);

void set_vcpu_migration_delay(unsigned int delay)
{
    vcpu_migration_delay = delay;
}

unsigned int get_vcpu_migration_delay(void)
{
    return vcpu_migration_delay;
}

static inline int
__csched_vcpu_is_cache_hot(struct vcpu *v)
{
    int hot = ((NOW() - v->last_run_time) <
               ((uint64_t)vcpu_migration_delay * 1000u));

    if ( hot )
        CSCHED_STAT_CRANK(vcpu_hot);

    return hot;
}

static inline int
__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
{
    /*
     * Don't pick up work that's in the peer's scheduling tail or hot on
     * peer PCPU. Only pick up work that's allowed to run on our CPU.
     */
    return !vc->is_running &&
           !__csched_vcpu_is_cache_hot(vc) &&
           cpu_isset(dest_cpu, vc->cpu_affinity);
}

static int
_csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc, bool_t commit)
{
    cpumask_t cpus;
    cpumask_t idlers;
    cpumask_t *online;
    int cpu;

    /*
     * Pick from online CPUs in VCPU's affinity mask, giving a
     * preference to its current processor if it's in there.
     */
    online = CSCHED_CPUONLINE(vc->domain->cpupool);
    cpus_and(cpus, *online, vc->cpu_affinity);
    cpu = cpu_isset(vc->processor, cpus)
            ? vc->processor
            : cycle_cpu(vc->processor, cpus);
    ASSERT( !cpus_empty(cpus) && cpu_isset(cpu, cpus) );

    /*
     * Try to find an idle processor within the above constraints.
     *
     * In multi-core and multi-threaded CPUs, not all idle execution
     * vehicles are equal!
     *
     * We give preference to the idle execution vehicle with the most
     * idling neighbours in its grouping. This distributes work across
     * distinct cores first and guarantees we don't do something stupid
     * like run two VCPUs on co-hyperthreads while there are idle cores
     * or sockets.
     */
    cpus_and(idlers, cpu_online_map, CSCHED_PRIV(ops)->idlers);
    cpu_set(cpu, idlers);
    cpus_and(cpus, cpus, idlers);
    cpu_clear(cpu, cpus);

    while ( !cpus_empty(cpus) )
    {
        cpumask_t cpu_idlers;
        cpumask_t nxt_idlers;
        int nxt, weight_cpu, weight_nxt;
        int migrate_factor;

        nxt = cycle_cpu(cpu, cpus);

        if ( cpu_isset(cpu, per_cpu(cpu_core_map, nxt)) )
        {
            /* We're on the same socket, so check the busy-ness of threads.
             * Migrate if # of idlers is less at all */
            ASSERT( cpu_isset(nxt, per_cpu(cpu_core_map, cpu)) );
            migrate_factor = 1;
            cpus_and(cpu_idlers, idlers, per_cpu(cpu_sibling_map, cpu));
            cpus_and(nxt_idlers, idlers, per_cpu(cpu_sibling_map, nxt));
        }
        else
        {
            /* We're on different sockets, so check the busy-ness of cores.
             * Migrate only if the other core is twice as idle */
            ASSERT( !cpu_isset(nxt, per_cpu(cpu_core_map, cpu)) );
            migrate_factor = 2;
            cpus_and(cpu_idlers, idlers, per_cpu(cpu_core_map, cpu));
            cpus_and(nxt_idlers, idlers, per_cpu(cpu_core_map, nxt));
        }

        weight_cpu = cpus_weight(cpu_idlers);
        weight_nxt = cpus_weight(nxt_idlers);
        /* smt_power_savings: consolidate work rather than spreading it */
        if ( ( sched_smt_power_savings
               && (weight_cpu > weight_nxt) )
             || ( !sched_smt_power_savings
                  && (weight_cpu * migrate_factor < weight_nxt) ) )
        {
            cpu = cycle_cpu(CSCHED_PCPU(nxt)->idle_bias, nxt_idlers);
            if ( commit )
                CSCHED_PCPU(nxt)->idle_bias = cpu;
            cpus_andnot(cpus, cpus, per_cpu(cpu_sibling_map, cpu));
        }
        else
        {
            cpus_andnot(cpus, cpus, nxt_idlers);
        }
    }

    return cpu;
}

static int
csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc)
{
    return _csched_cpu_pick(ops, vc, 1);
}
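/*
 * Illustration of the migrate_factor rule in _csched_cpu_pick(): when the
 * candidate "nxt" shares a socket with "cpu", the VCPU moves as soon as the
 * other thread group has even one more idle unit (factor 1); across sockets
 * it only moves when the other core group has more than twice as many idle
 * units (factor 2), so cross-socket migrations need a clearly better target.
 * With sched_smt_power_savings the comparison flips, packing work onto
 * groups that are already busier instead of spreading it.
 */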

static inline void
__csched_vcpu_acct_start(struct csched_private *prv, struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    spin_lock_irqsave(&prv->lock, flags);

    if ( list_empty(&svc->active_vcpu_elem) )
    {
        CSCHED_VCPU_STAT_CRANK(svc, state_active);
        CSCHED_STAT_CRANK(acct_vcpu_active);

        sdom->active_vcpu_count++;
        list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
        /* Make weight per-vcpu */
        prv->weight += sdom->weight;
        if ( list_empty(&sdom->active_sdom_elem) )
        {
            list_add(&sdom->active_sdom_elem, &prv->active_sdom);
        }
    }

    spin_unlock_irqrestore(&prv->lock, flags);
}

static inline void
__csched_vcpu_acct_stop_locked(struct csched_private *prv,
                               struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( list_empty(&svc->active_vcpu_elem) );

    CSCHED_VCPU_STAT_CRANK(svc, state_idle);
    CSCHED_STAT_CRANK(acct_vcpu_idle);

    BUG_ON( prv->weight < sdom->weight );
    sdom->active_vcpu_count--;
    list_del_init(&svc->active_vcpu_elem);
    prv->weight -= sdom->weight;
    if ( list_empty(&sdom->active_vcpu) )
    {
        list_del_init(&sdom->active_sdom_elem);
    }
}

static void
csched_vcpu_acct(struct csched_private *prv, unsigned int cpu)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(current);
    const struct scheduler *ops = per_cpu(scheduler, cpu);

    ASSERT( current->processor == cpu );
    ASSERT( svc->sdom != NULL );

    /*
     * If this VCPU's priority was boosted when it last awoke, reset it.
     * If the VCPU is found here, then it's consuming a non-negligible
     * amount of CPU resources and should no longer be boosted.
     */
    if ( svc->pri == CSCHED_PRI_TS_BOOST )
        svc->pri = CSCHED_PRI_TS_UNDER;

    /*
     * Update credits
     */
    if ( !is_idle_vcpu(svc->vcpu) )
        burn_credits(svc, NOW());

    /*
     * Put this VCPU and domain back on the active list if it was
     * idling.
     *
     * If it's been active a while, check if we'd be better off
     * migrating it to run elsewhere (see multi-core and multi-thread
     * support in csched_cpu_pick()).
     */
    if ( list_empty(&svc->active_vcpu_elem) )
    {
        __csched_vcpu_acct_start(prv, svc);
    }
    else if ( _csched_cpu_pick(ops, current, 0) != cpu )
    {
        CSCHED_VCPU_STAT_CRANK(svc, migrate_r);
        CSCHED_STAT_CRANK(migrate_running);
        set_bit(_VPF_migrating, &current->pause_flags);
        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
    }
}

static void *
csched_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
{
    struct csched_vcpu *svc;

    /* Allocate per-VCPU info */
    svc = xmalloc(struct csched_vcpu);
    if ( svc == NULL )
        return NULL;
    memset(svc, 0, sizeof(*svc));

    INIT_LIST_HEAD(&svc->runq_elem);
    INIT_LIST_HEAD(&svc->active_vcpu_elem);
    svc->sdom = dd;
    svc->vcpu = vc;
    atomic_set(&svc->credit, 0);
    svc->flags = 0U;
    svc->pri = is_idle_domain(vc->domain) ?
        CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
    CSCHED_VCPU_STATS_RESET(svc);
    CSCHED_STAT_CRANK(vcpu_init);
    return svc;
}

static void
csched_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched_vcpu *svc = vc->sched_priv;

    if ( !__vcpu_on_runq(svc) && vcpu_runnable(vc) && !vc->is_running )
        __runq_insert(vc->processor, svc);
}

static void
csched_free_vdata(const struct scheduler *ops, void *priv)
{
    struct csched_vcpu *svc = priv;

    BUG_ON( !list_empty(&svc->runq_elem) );

    xfree(svc);
}

static void
csched_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched_private *prv = CSCHED_PRIV(ops);
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    CSCHED_STAT_CRANK(vcpu_destroy);

    if ( __vcpu_on_runq(svc) )
        __runq_remove(svc);

    spin_lock_irqsave(&(prv->lock), flags);

    if ( !list_empty(&svc->active_vcpu_elem) )
        __csched_vcpu_acct_stop_locked(prv, svc);

    spin_unlock_irqrestore(&(prv->lock), flags);

    BUG_ON( sdom == NULL );
    BUG_ON( !list_empty(&svc->runq_elem) );
}

static void
csched_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);

    CSCHED_STAT_CRANK(vcpu_sleep);

    BUG_ON( is_idle_vcpu(vc) );

    if ( per_cpu(schedule_data, vc->processor).curr == vc )
        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
    else if ( __vcpu_on_runq(svc) )
        __runq_remove(svc);
}

static void
csched_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    const unsigned int cpu = vc->processor;

    BUG_ON( is_idle_vcpu(vc) );

    if ( unlikely(per_cpu(schedule_data, cpu).curr == vc) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_running);
        return;
    }
    if ( unlikely(__vcpu_on_runq(svc)) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_onrunq);
        return;
    }

    if ( likely(vcpu_runnable(vc)) )
        CSCHED_STAT_CRANK(vcpu_wake_runnable);
    else
        CSCHED_STAT_CRANK(vcpu_wake_not_runnable);

    /*
     * We temporarily boost the priority of awaking VCPUs!
     *
     * If this VCPU consumes a non-negligible amount of CPU, it
     * will eventually find itself in the credit accounting code
     * path where its priority will be reset to normal.
     *
     * If on the other hand the VCPU consumes little CPU and is
     * blocking and awoken a lot (doing I/O for example), its
     * priority will remain boosted, optimizing its wake-to-run
     * latencies.
     *
     * This allows wake-to-run latency sensitive VCPUs to preempt
     * more CPU resource intensive VCPUs without impacting overall
     * system fairness.
     *
     * The one exception is for VCPUs of capped domains unpausing
     * after earning credits they had overspent. We don't boost
     * those.
     */
    if ( svc->pri == CSCHED_PRI_TS_UNDER &&
         !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
    {
        svc->pri = CSCHED_PRI_TS_BOOST;
    }

    /* Put the VCPU on the runq and tickle CPUs */
    __runq_insert(cpu, svc);
    __runq_tickle(cpu, svc);
}

static void
csched_vcpu_yield(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched_vcpu * const sv = CSCHED_VCPU(vc);

    if ( !sched_credit_default_yield )
    {
        /* Let the scheduler know that this vcpu is trying to yield */
        sv->flags |= CSCHED_FLAG_VCPU_YIELD;
    }
}

static int
csched_dom_cntl(
    const struct scheduler *ops,
    struct domain *d,
    struct xen_domctl_scheduler_op *op)
{
    struct csched_dom * const sdom = CSCHED_DOM(d);
    struct csched_private *prv = CSCHED_PRIV(ops);
    unsigned long flags;

    if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
    {
        op->u.credit.weight = sdom->weight;
        op->u.credit.cap = sdom->cap;
    }
    else
    {
        ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);

        spin_lock_irqsave(&prv->lock, flags);

        if ( op->u.credit.weight != 0 )
        {
            if ( !list_empty(&sdom->active_sdom_elem) )
            {
                prv->weight -= sdom->weight * sdom->active_vcpu_count;
                prv->weight += op->u.credit.weight * sdom->active_vcpu_count;
            }
            sdom->weight = op->u.credit.weight;
        }

        if ( op->u.credit.cap != (uint16_t)~0U )
            sdom->cap = op->u.credit.cap;

        spin_unlock_irqrestore(&prv->lock, flags);
    }

    return 0;
}

static void *
csched_alloc_domdata(const struct scheduler *ops, struct domain *dom)
{
    struct csched_dom *sdom;

    sdom = xmalloc(struct csched_dom);
    if ( sdom == NULL )
        return NULL;
    memset(sdom, 0, sizeof(*sdom));

    /* Initialize credit and weight */
    INIT_LIST_HEAD(&sdom->active_vcpu);
    sdom->active_vcpu_count = 0;
    INIT_LIST_HEAD(&sdom->active_sdom_elem);
    sdom->dom = dom;
    sdom->weight = CSCHED_DEFAULT_WEIGHT;
    sdom->cap = 0U;

    return (void *)sdom;
}

static int
csched_dom_init(const struct scheduler *ops, struct domain *dom)
{
    struct csched_dom *sdom;

    CSCHED_STAT_CRANK(dom_init);

    if ( is_idle_domain(dom) )
        return 0;

    sdom = csched_alloc_domdata(ops, dom);
    if ( sdom == NULL )
        return -ENOMEM;

    dom->sched_priv = sdom;

    return 0;
}

static void
csched_free_domdata(const struct scheduler *ops, void *data)
{
    xfree(data);
}

static void
csched_dom_destroy(const struct scheduler *ops, struct domain *dom)
{
    CSCHED_STAT_CRANK(dom_destroy);
    csched_free_domdata(ops, CSCHED_DOM(dom));
}

/*
 * This is an O(n) optimized sort of the runq.
 *
 * Time-share VCPUs can only be one of two priorities, UNDER or OVER. We walk
 * through the runq and move up any UNDERs that are preceded by OVERS. We
 * remember the last UNDER to make the move up operation O(1).
 */
static void
csched_runq_sort(struct csched_private *prv, unsigned int cpu)
{
    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
    struct list_head *runq, *elem, *next, *last_under;
    struct csched_vcpu *svc_elem;
    unsigned long flags;
    int sort_epoch;

    sort_epoch = prv->runq_sort;
    if ( sort_epoch == spc->runq_sort_last )
        return;

    spc->runq_sort_last = sort_epoch;

    pcpu_schedule_lock_irqsave(cpu, flags);

    runq = &spc->runq;
    elem = runq->next;
    last_under = runq;

    while ( elem != runq )
    {
        next = elem->next;
        svc_elem = __runq_elem(elem);

        if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER )
        {
            /* does elem need to move up the runq? */
            if ( elem->prev != last_under )
            {
                list_del(elem);
                list_add(elem, last_under);
            }
            last_under = elem;
        }

        elem = next;
    }

    pcpu_schedule_unlock_irqrestore(cpu, flags);
}
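/*
 * How the single pass above works: each entry at UNDER priority or better is
 * spliced in immediately after the previous such entry (tracked by
 * last_under), so one walk leaves every UNDER/BOOST entry ahead of every
 * OVER entry while preserving the relative order within each group.
 */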

static void
csched_acct(void *dummy)
{
    struct csched_private *prv = dummy;
    unsigned long flags;
    struct list_head *iter_vcpu, *next_vcpu;
    struct list_head *iter_sdom, *next_sdom;
    struct csched_vcpu *svc;
    struct csched_dom *sdom;
    uint32_t credit_total;
    uint32_t weight_total;
    uint32_t weight_left;
    uint32_t credit_fair;
    uint32_t credit_peak;
    uint32_t credit_cap;
    int credit_balance;
    int credit_xtra;
    int credit;


    spin_lock_irqsave(&prv->lock, flags);

    weight_total = prv->weight;
    credit_total = prv->credit;

    /* Converge balance towards 0 when it drops negative */
    if ( prv->credit_balance < 0 )
    {
        credit_total -= prv->credit_balance;
        CSCHED_STAT_CRANK(acct_balance);
    }

    if ( unlikely(weight_total == 0) )
    {
        prv->credit_balance = 0;
        spin_unlock_irqrestore(&prv->lock, flags);
        CSCHED_STAT_CRANK(acct_no_work);
        goto out;
    }

    CSCHED_STAT_CRANK(acct_run);

    weight_left = weight_total;
    credit_balance = 0;
    credit_xtra = 0;
    credit_cap = 0U;

    list_for_each_safe( iter_sdom, next_sdom, &prv->active_sdom )
    {
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);

        BUG_ON( is_idle_domain(sdom->dom) );
        BUG_ON( sdom->active_vcpu_count == 0 );
        BUG_ON( sdom->weight == 0 );
        BUG_ON( (sdom->weight * sdom->active_vcpu_count) > weight_left );

        weight_left -= ( sdom->weight * sdom->active_vcpu_count );

        /*
         * A domain's fair share is computed using its weight in competition
         * with that of all other active domains.
         *
         * At most, a domain can use credits to run all its active VCPUs
         * for one full accounting period. We allow a domain to earn more
         * only when the system-wide credit balance is negative.
         */
        credit_peak = sdom->active_vcpu_count * CSCHED_CREDITS_PER_ACCT;
        if ( prv->credit_balance < 0 )
        {
            credit_peak += ( ( -prv->credit_balance
                               * sdom->weight
                               * sdom->active_vcpu_count) +
                             (weight_total - 1)
                           ) / weight_total;
        }

        if ( sdom->cap != 0U )
        {
            credit_cap = ((sdom->cap * CSCHED_CREDITS_PER_ACCT) + 99) / 100;
            if ( credit_cap < credit_peak )
                credit_peak = credit_cap;

            /* FIXME -- set cap per-vcpu as well...? */
            credit_cap = ( credit_cap + ( sdom->active_vcpu_count - 1 )
                         ) / sdom->active_vcpu_count;
        }

        credit_fair = ( ( credit_total
                          * sdom->weight
                          * sdom->active_vcpu_count )
                        + (weight_total - 1)
                      ) / weight_total;

        if ( credit_fair < credit_peak )
        {
            credit_xtra = 1;
        }
        else
        {
            if ( weight_left != 0U )
            {
                /* Give other domains a chance at unused credits */
                credit_total += ( ( ( credit_fair - credit_peak
                                    ) * weight_total
                                  ) + ( weight_left - 1 )
                                ) / weight_left;
            }

            if ( credit_xtra )
            {
                /*
                 * Lazily keep domains with extra credits at the head of
                 * the queue to give others a chance at them in future
                 * accounting periods.
                 */
                CSCHED_STAT_CRANK(acct_reorder);
                list_del(&sdom->active_sdom_elem);
                list_add(&sdom->active_sdom_elem, &prv->active_sdom);
            }

            credit_fair = credit_peak;
        }

        /* Compute fair share per VCPU */
        credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
                      ) / sdom->active_vcpu_count;


        list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
        {
            svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
            BUG_ON( sdom != svc->sdom );

            /* Increment credit */
            atomic_add(credit_fair, &svc->credit);
            credit = atomic_read(&svc->credit);

            /*
             * Recompute priority or, if VCPU is idling, remove it from
             * the active list.
             */
            if ( credit < 0 )
            {
                svc->pri = CSCHED_PRI_TS_OVER;

                /* Park running VCPUs of capped-out domains */
                if ( sdom->cap != 0U &&
                     credit < -credit_cap &&
                     !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
                {
                    CSCHED_STAT_CRANK(vcpu_park);
                    vcpu_pause_nosync(svc->vcpu);
                    svc->flags |= CSCHED_FLAG_VCPU_PARKED;
                }

                /* Lower bound on credits */
                if ( credit < -CSCHED_CREDITS_PER_TSLICE )
                {
                    CSCHED_STAT_CRANK(acct_min_credit);
                    credit = -CSCHED_CREDITS_PER_TSLICE;
                    atomic_set(&svc->credit, credit);
                }
            }
            else
            {
                svc->pri = CSCHED_PRI_TS_UNDER;

                /* Unpark any capped domains whose credits go positive */
                if ( svc->flags & CSCHED_FLAG_VCPU_PARKED )
                {
                    /*
                     * It's important to unset the flag AFTER the unpause()
                     * call to make sure the VCPU's priority is not boosted
                     * if it is woken up here.
                     */
                    CSCHED_STAT_CRANK(vcpu_unpark);
                    vcpu_unpause(svc->vcpu);
                    svc->flags &= ~CSCHED_FLAG_VCPU_PARKED;
                }

                /* Upper bound on credits means VCPU stops earning */
                if ( credit > CSCHED_CREDITS_PER_TSLICE )
                {
                    __csched_vcpu_acct_stop_locked(prv, svc);
                    /* Divide credits in half, so that when it starts
                     * accounting again, it starts a little bit "ahead" */
                    credit /= 2;
                    atomic_set(&svc->credit, credit);
                }
            }

            CSCHED_VCPU_STAT_SET(svc, credit_last, credit);
            CSCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair);
            credit_balance += credit;
        }
    }

    prv->credit_balance = credit_balance;

    spin_unlock_irqrestore(&prv->lock, flags);

    /* Inform each CPU that its runq needs to be sorted */
    prv->runq_sort++;

 out:
    set_timer( &prv->master_ticker, NOW() +
               MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT );
}
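/*
 * Accounting arithmetic, for reference: the pool divided up above is
 * CSCHED_CREDITS_PER_ACCT (300) credits per online physical CPU per 30ms
 * period, split between domains in proportion to weight * active_vcpu_count.
 * A domain's cap is interpreted as a percentage of one physical CPU (note
 * the division by 100): for example, cap=50 limits the domain to roughly
 * 150 credits per period -- half a CPU's worth of time -- no matter how
 * many credits its weight would otherwise earn it.
 */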

static void
csched_tick(void *_cpu)
{
    unsigned int cpu = (unsigned long)_cpu;
    struct csched_pcpu *spc = CSCHED_PCPU(cpu);
    struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu));

    spc->tick++;

    /*
     * Accounting for running VCPU
     */
    if ( !is_idle_vcpu(current) )
        csched_vcpu_acct(prv, cpu);

    /*
     * Check if runq needs to be sorted
     *
     * Every physical CPU resorts the runq after the accounting master has
     * modified priorities. This is a special O(n) sort and runs at most
     * once per accounting period (currently 30 milliseconds).
     */
    csched_runq_sort(prv, cpu);

    set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
}

static struct csched_vcpu *
csched_runq_steal(int peer_cpu, int cpu, int pri)
{
    const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
    const struct vcpu * const peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
    struct csched_vcpu *speer;
    struct list_head *iter;
    struct vcpu *vc;

    /*
     * Don't steal from an idle CPU's runq because it's about to
     * pick up work from it itself.
     */
    if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
    {
        list_for_each( iter, &peer_pcpu->runq )
        {
            speer = __runq_elem(iter);

            /*
             * If next available VCPU here is not of strictly higher
             * priority than ours, this PCPU is useless to us.
             */
            if ( speer->pri <= pri )
                break;

            /* Is this VCPU runnable on our PCPU? */
            vc = speer->vcpu;
            BUG_ON( is_idle_vcpu(vc) );

            if (__csched_vcpu_is_migrateable(vc, cpu))
            {
                /* We got a candidate. Grab it! */
                CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
                CSCHED_STAT_CRANK(migrate_queued);
                WARN_ON(vc->is_urgent);
                __runq_remove(speer);
                vc->processor = cpu;
                return speer;
            }
        }
    }

    CSCHED_STAT_CRANK(steal_peer_idle);
    return NULL;
}

static struct csched_vcpu *
csched_load_balance(struct csched_private *prv, int cpu,
                    struct csched_vcpu *snext, bool_t *stolen)
{
    struct csched_vcpu *speer;
    cpumask_t workers;
    cpumask_t *online;
    int peer_cpu;

    BUG_ON( cpu != snext->vcpu->processor );
    online = CSCHED_CPUONLINE(per_cpu(cpupool, cpu));

    /* If this CPU is going offline we shouldn't steal work. */
    if ( unlikely(!cpu_isset(cpu, *online)) )
        goto out;

    if ( snext->pri == CSCHED_PRI_IDLE )
        CSCHED_STAT_CRANK(load_balance_idle);
    else if ( snext->pri == CSCHED_PRI_TS_OVER )
        CSCHED_STAT_CRANK(load_balance_over);
    else
        CSCHED_STAT_CRANK(load_balance_other);

    /*
     * Peek at non-idling CPUs in the system, starting with our
     * immediate neighbour.
     */
    cpus_andnot(workers, *online, prv->idlers);
    cpu_clear(cpu, workers);
    peer_cpu = cpu;

    while ( !cpus_empty(workers) )
    {
        peer_cpu = cycle_cpu(peer_cpu, workers);
        cpu_clear(peer_cpu, workers);

        /*
         * Get ahold of the scheduler lock for this peer CPU.
         *
         * Note: We don't spin on this lock but simply try it. Spinning could
         * cause a deadlock if the peer CPU is also load balancing and trying
         * to lock this CPU.
         */
        if ( !pcpu_schedule_trylock(peer_cpu) )
        {
            CSCHED_STAT_CRANK(steal_trylock_failed);
            continue;
        }

        /*
         * Any work over there to steal?
         */
        speer = csched_runq_steal(peer_cpu, cpu, snext->pri);
        pcpu_schedule_unlock(peer_cpu);
        if ( speer != NULL )
        {
            *stolen = 1;
            return speer;
        }
    }

 out:
    /* Failed to find more important work elsewhere... */
    __runq_remove(snext);
    return snext;
}
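/*
 * Two details worth noting above: the peer runqueue lock is only try-locked,
 * never spun on, since two CPUs balancing towards each other could otherwise
 * deadlock; and *stolen feeds straight into ret.migrated in
 * csched_schedule(), recording for the generic scheduler layer that the VCPU
 * it is being handed was just pulled over from another CPU.
 */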
|
ack@10206
|
1285
|
ack@10206
|
1286 /*
|
ack@10206
|
1287 * This function is in the critical path. It is designed to be simple and
|
ack@10206
|
1288 * fast for the common case.
|
ack@10206
|
1289 */
|
ack@10206
|
1290 static struct task_slice
|
keir@21390
|
1291 csched_schedule(
|
keir@21390
|
1292 const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled)
|
ack@10206
|
1293 {
|
ack@10206
|
1294 const int cpu = smp_processor_id();
|
ack@10206
|
1295 struct list_head * const runq = RUNQ(cpu);
|
ack@10206
|
1296 struct csched_vcpu * const scurr = CSCHED_VCPU(current);
|
keir@21258
|
1297 struct csched_private *prv = CSCHED_PRIV(ops);
|
ack@10206
|
1298 struct csched_vcpu *snext;
|
ack@10206
|
1299 struct task_slice ret;
|
ack@10206
|
1300
|
ack@10206
|
1301 CSCHED_STAT_CRANK(schedule);
|
ack@10206
|
1302 CSCHED_VCPU_CHECK(current);
|
ack@10206
|
1303
|
keir@20308
|
1304 if ( !is_idle_vcpu(scurr->vcpu) )
|
keir@20308
|
1305 {
|
keir@21243
|
1306 /* Update credits of a non-idle VCPU. */
|
keir@20308
|
1307 burn_credits(scurr, now);
|
keir@20308
|
1308 scurr->start_time -= now;
|
keir@20308
|
1309 }
|
keir@21243
|
1310 else
|
keir@21243
|
1311 {
|
keir@21243
|
1312 /* Re-instate a boosted idle VCPU as normal-idle. */
|
keir@21243
|
1313 scurr->pri = CSCHED_PRI_IDLE;
|
keir@21243
|
1314 }
|
keir@20160
|
1315
|
ack@10206
|
1316 /*
|
ack@10206
|
1317 * Select next runnable local VCPU (ie top of local runq)
|
ack@10206
|
1318 */
|
ack@10206
|
1319 if ( vcpu_runnable(current) )
|
ack@10206
|
1320 __runq_insert(cpu, scurr);
|
ack@10206
|
1321 else
|
ack@10206
|
1322 BUG_ON( is_idle_vcpu(current) || list_empty(runq) );
|
ack@10206
|
1323
|
ack@10206
|
1324 snext = __runq_elem(runq->next);
|
keir@21671
|
1325 ret.migrated = 0;
|
ack@10206
|
1326
|
keir@21243
|
1327 /* Tasklet work (which runs in idle VCPU context) overrides all else. */
|
keir@21390
|
1328 if ( tasklet_work_scheduled )
|
keir@21243
|
1329 {
|
keir@21243
|
1330 snext = CSCHED_VCPU(idle_vcpu[cpu]);
|
keir@21243
|
1331 snext->pri = CSCHED_PRI_TS_BOOST;
|
keir@21243
|
1332 }
|
keir@21243
|
1333
|
ack@10206
|
1334 /*
|
keir@21982
|
1335 * Clear YIELD flag before scheduling out
|
keir@21982
|
1336 */
|
keir@21982
|
1337 if ( scurr->flags & CSCHED_FLAG_VCPU_YIELD )
|
keir@21982
|
1338 scurr->flags &= ~(CSCHED_FLAG_VCPU_YIELD);
|
keir@21982
|
1339
|
keir@21982
|
1340 /*
|
ack@10206
|
1341 * SMP Load balance:
|
ack@10206
|
1342 *
|
ack@10206
|
1343 * If the next highest priority local runnable VCPU has already eaten
|
ack@10206
|
1344 * through its credits, look on other PCPUs to see if we have more
|
ack@10206
|
1345 * urgent work... If not, csched_load_balance() will return snext, but
|
ack@10206
|
1346 * already removed from the runq.
|
ack@10206
|
1347 */
|
ack@10206
|
1348 if ( snext->pri > CSCHED_PRI_TS_OVER )
|
ack@10206
|
1349 __runq_remove(snext);
|
ack@10206
|
1350 else
|
keir@21671
|
1351 snext = csched_load_balance(prv, cpu, snext, &ret.migrated);
|
ack@10206
|
1352
|
ack@10206
|
1353 /*
|
ack@10206
|
1354 * Update idlers mask if necessary. When we're idling, other CPUs
|
ack@10206
|
1355 * will tickle us when they get extra work.
|
ack@10206
|
1356 */
|
ack@10206
|
1357 if ( snext->pri == CSCHED_PRI_IDLE )
|
ack@10206
|
1358 {
|
keir@21258
|
1359 if ( !cpu_isset(cpu, prv->idlers) )
|
keir@21258
|
1360 cpu_set(cpu, prv->idlers);
|
ack@10206
|
1361 }
|
keir@21258
|
1362 else if ( cpu_isset(cpu, prv->idlers) )
|
ack@10206
|
1363 {
|
keir@21258
|
1364 cpu_clear(cpu, prv->idlers);
|
ack@10206
|
1365 }
|
ack@10206
|
1366
|
keir@20160
|
1367 if ( !is_idle_vcpu(snext->vcpu) )
|
keir@20308
|
1368 snext->start_time += now;
|
keir@20160
|
1369
|
ack@10206
|
1370 /*
|
ack@10206
|
1371 * Return task to run next...
|
ack@10206
|
1372 */
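    /*
     * A guest vCPU gets a full timeslice; the idle vCPU gets -1, which
     * presumably tells the generic scheduler layer not to arm a slice
     * timer at all.
     */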
|
keir@19538
|
1373 ret.time = (is_idle_vcpu(snext->vcpu) ?
|
keir@19538
|
1374 -1 : MILLISECS(CSCHED_MSECS_PER_TSLICE));
|
ack@10206
|
1375 ret.task = snext->vcpu;
|
ack@10206
|
1376
|
ack@10206
|
1377 CSCHED_VCPU_CHECK(ret.task);
|
ack@10206
|
1378 return ret;
|
ack@10206
|
1379 }
|
ack@10206
|
1380
|
ack@10206
|
1381 static void
|
ack@10206
|
1382 csched_dump_vcpu(struct csched_vcpu *svc)
|
ack@10206
|
1383 {
|
ack@10206
|
1384 struct csched_dom * const sdom = svc->sdom;
|
ack@10206
|
1385
|
ack@13046
|
1386 printk("[%i.%i] pri=%i flags=%x cpu=%i",
|
ack@10206
|
1387 svc->vcpu->domain->domain_id,
|
ack@10206
|
1388 svc->vcpu->vcpu_id,
|
ack@10206
|
1389 svc->pri,
|
ack@13046
|
1390 svc->flags,
|
ack@10206
|
1391 svc->vcpu->processor);
|
ack@10206
|
1392
|
ack@10206
|
1393 if ( sdom )
|
ack@10206
|
1394 {
|
ack@12941
|
1395 printk(" credit=%i [w=%u]", atomic_read(&svc->credit), sdom->weight);
|
ack@12941
|
1396 #ifdef CSCHED_STATS
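        /*
         * Format: (credit_last+credit_incr) {a/i=state_active/state_idle
         * m=migrate_q+migrate_r}.
         */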
|
ack@12941
|
1397 printk(" (%d+%u) {a/i=%u/%u m=%u+%u}",
|
ack@12941
|
1398 svc->stats.credit_last,
|
ack@12941
|
1399 svc->stats.credit_incr,
|
ack@12941
|
1400 svc->stats.state_active,
|
ack@12941
|
1401 svc->stats.state_idle,
|
ack@12941
|
1402 svc->stats.migrate_q,
|
ack@12941
|
1403 svc->stats.migrate_r);
|
ack@12941
|
1404 #endif
|
ack@10206
|
1405 }
|
ack@10206
|
1406
|
ack@10206
|
1407 printk("\n");
|
ack@10206
|
1408 }
|
ack@10206
|
1409
|
ack@10206
|
1410 static void
|
keir@21327
|
1411 csched_dump_pcpu(const struct scheduler *ops, int cpu)
|
ack@10206
|
1412 {
|
ack@10206
|
1413 struct list_head *runq, *iter;
|
ack@10206
|
1414 struct csched_pcpu *spc;
|
ack@10206
|
1415 struct csched_vcpu *svc;
|
ack@10206
|
1416 int loop;
|
keir@20975
|
1417 #define cpustr keyhandler_scratch
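    /*
     * Reuse the global keyhandler scratch buffer for cpumask strings,
     * presumably to keep a large formatting buffer off the stack.
     */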
|
ack@10206
|
1418
|
ack@10206
|
1419 spc = CSCHED_PCPU(cpu);
|
ack@10206
|
1420 runq = &spc->runq;
|
ack@10206
|
1421
|
keir@19965
|
1422 cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_map, cpu));
|
keir@18561
|
1423 printk(" sort=%d, sibling=%s, ", spc->runq_sort_last, cpustr);
|
keir@19965
|
1424 cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_map, cpu));
|
keir@18561
|
1425 printk("core=%s\n", cpustr);
|
ack@10206
|
1426
|
ack@10206
|
1427 /* current VCPU */
|
kaf24@11017
|
1428 svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
|
ack@10206
|
1429 if ( svc )
|
ack@10206
|
1430 {
|
ack@10206
|
1431 printk("\trun: ");
|
ack@10206
|
1432 csched_dump_vcpu(svc);
|
ack@10206
|
1433 }
|
ack@10206
|
1434
|
ack@10206
|
1435 loop = 0;
|
ack@10206
|
1436 list_for_each( iter, runq )
|
ack@10206
|
1437 {
|
ack@10206
|
1438 svc = __runq_elem(iter);
|
ack@10206
|
1439 if ( svc )
|
ack@10206
|
1440 {
|
ack@10206
|
1441 printk("\t%3d: ", ++loop);
|
ack@10206
|
1442 csched_dump_vcpu(svc);
|
ack@10206
|
1443 }
|
ack@10206
|
1444 }
|
keir@20975
|
1445 #undef cpustr
|
ack@10206
|
1446 }
|
ack@10206
|
1447
|
ack@10206
|
1448 static void
|
keir@21327
|
1449 csched_dump(const struct scheduler *ops)
|
ack@10206
|
1450 {
|
ack@10206
|
1451 struct list_head *iter_sdom, *iter_svc;
|
keir@21258
|
1452 struct csched_private *prv = CSCHED_PRIV(ops);
|
ack@10206
|
1453 int loop;
|
keir@20975
|
1454 #define idlers_buf keyhandler_scratch
|
ack@10206
|
1455
|
ack@10206
|
1456 printk("info:\n"
|
ack@10206
|
1457 "\tncpus = %u\n"
|
ack@10206
|
1458 "\tmaster = %u\n"
|
ack@10206
|
1459 "\tcredit = %u\n"
|
ack@10206
|
1460 "\tcredit balance = %d\n"
|
ack@10206
|
1461 "\tweight = %u\n"
|
ack@10206
|
1462 "\trunq_sort = %u\n"
|
ack@12071
|
1463 "\tdefault-weight = %d\n"
|
ack@12071
|
1464 "\tmsecs per tick = %dms\n"
|
keir@20160
|
1465 "\tcredits per msec = %d\n"
|
ack@12071
|
1466 "\tticks per tslice = %d\n"
|
keir@19331
|
1467 "\tticks per acct = %d\n"
|
keir@19331
|
1468 "\tmigration delay = %uus\n",
|
keir@21258
|
1469 prv->ncpus,
|
keir@21258
|
1470 prv->master,
|
keir@21258
|
1471 prv->credit,
|
keir@21258
|
1472 prv->credit_balance,
|
keir@21258
|
1473 prv->weight,
|
keir@21258
|
1474 prv->runq_sort,
|
ack@12071
|
1475 CSCHED_DEFAULT_WEIGHT,
|
ack@12071
|
1476 CSCHED_MSECS_PER_TICK,
|
keir@20160
|
1477 CSCHED_CREDITS_PER_MSEC,
|
ack@12071
|
1478 CSCHED_TICKS_PER_TSLICE,
|
keir@19331
|
1479 CSCHED_TICKS_PER_ACCT,
|
keir@19331
|
1480 vcpu_migration_delay);
|
ack@10206
|
1481
|
keir@21258
|
1482 cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), prv->idlers);
|
keir@18561
|
1483 printk("idlers: %s\n", idlers_buf);
|
ack@10206
|
1484
|
ack@10206
|
1485 printk("active vcpus:\n");
|
ack@10206
|
1486 loop = 0;
|
keir@21258
|
1487 list_for_each( iter_sdom, &prv->active_sdom )
|
ack@10206
|
1488 {
|
ack@10206
|
1489 struct csched_dom *sdom;
|
ack@10206
|
1490 sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
|
ack@10206
|
1491
|
ack@10206
|
1492 list_for_each( iter_svc, &sdom->active_vcpu )
|
ack@10206
|
1493 {
|
ack@10206
|
1494 struct csched_vcpu *svc;
|
ack@10206
|
1495 svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem);
|
ack@10206
|
1496
|
ack@10206
|
1497 printk("\t%3d: ", ++loop);
|
ack@10206
|
1498 csched_dump_vcpu(svc);
|
ack@10206
|
1499 }
|
ack@10206
|
1500 }
|
keir@20975
|
1501 #undef idlers_buf
|
ack@10206
|
1502 }
|
ack@10206
|
1503
|
keir@21258
|
1504 static int
|
keir@21453
|
1505 csched_init(struct scheduler *ops)
|
ack@10206
|
1506 {
|
keir@21258
|
1507 struct csched_private *prv;
|
keir@21258
|
1508
|
keir@21258
|
1509 prv = xmalloc(struct csched_private);
|
keir@21258
|
1510 if ( prv == NULL )
|
keir@21453
|
1511 return -ENOMEM;
|
keir@21453
|
1512
|
keir@21258
|
1513 memset(prv, 0, sizeof(*prv));
|
keir@21258
|
1514 ops->sched_data = prv;
|
keir@21258
|
1515 spin_lock_init(&prv->lock);
|
keir@21258
|
1516 INIT_LIST_HEAD(&prv->active_sdom);
|
keir@21258
|
1517 prv->master = UINT_MAX;
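    /*
     * UINT_MAX marks "no master CPU chosen yet"; presumably the first
     * pCPU brought under this scheduler takes over that role.
     */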
|
keir@21258
|
1518
|
keir@21258
|
1519 return 0;
|
ack@10206
|
1520 }
|
ack@10206
|
1521
|
keir@21258
|
1522 static void
|
keir@21327
|
1523 csched_deinit(const struct scheduler *ops)
|
keir@21258
|
1524 {
|
keir@21258
|
1525 struct csched_private *prv;
|
keir@21258
|
1526
|
keir@21258
|
1527 prv = CSCHED_PRIV(ops);
|
keir@21258
|
1528 if ( prv != NULL )
|
keir@21258
|
1529 xfree(prv);
|
keir@21258
|
1530 }
|
keir@21258
|
1531
|
keir@21327
|
1532 static void csched_tick_suspend(const struct scheduler *ops, unsigned int cpu)
|
keir@19498
|
1533 {
|
keir@19498
|
1534 struct csched_pcpu *spc;
|
keir@19498
|
1535
|
keir@21258
|
1536 spc = CSCHED_PCPU(cpu);
|
keir@19498
|
1537
|
keir@19498
|
1538 stop_timer(&spc->ticker);
|
keir@19498
|
1539 }
|
keir@19498
|
1540
|
keir@21327
|
1541 static void csched_tick_resume(const struct scheduler *ops, unsigned int cpu)
|
keir@19498
|
1542 {
|
keir@19498
|
1543 struct csched_pcpu *spc;
|
keir@19498
|
1544 uint64_t now = NOW();
|
keir@21258
|
1545
|
keir@21258
|
1546 spc = CSCHED_PCPU(cpu);
|
keir@19498
|
1547
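    /*
     * Align the next tick to the next CSCHED_MSECS_PER_TICK boundary:
     * now + TICK - (now % TICK).
     */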
|
keir@19498
|
1548 set_timer(&spc->ticker, now + MILLISECS(CSCHED_MSECS_PER_TICK)
|
keir@19498
|
1549 - now % MILLISECS(CSCHED_MSECS_PER_TICK) );
|
keir@19498
|
1550 }
|
ack@10206
|
1551
|
keir@21258
|
1552 static struct csched_private _csched_priv;
|
keir@21258
|
1553
|
keir@21327
|
1554 const struct scheduler sched_credit_def = {
|
ack@10206
|
1555 .name = "SMP Credit Scheduler",
|
ack@10206
|
1556 .opt_name = "credit",
|
kfraser@11295
|
1557 .sched_id = XEN_SCHEDULER_CREDIT,
|
keir@21258
|
1558 .sched_data = &_csched_priv,
|
ack@10206
|
1559
|
kfraser@12284
|
1560 .init_domain = csched_dom_init,
|
kfraser@12284
|
1561 .destroy_domain = csched_dom_destroy,
|
kfraser@12284
|
1562
|
keir@21258
|
1563 .insert_vcpu = csched_vcpu_insert,
|
keir@22324
|
1564 .remove_vcpu = csched_vcpu_remove,
|
kaf24@10281
|
1565
|
ack@10206
|
1566 .sleep = csched_vcpu_sleep,
|
ack@10206
|
1567 .wake = csched_vcpu_wake,
|
keir@21982
|
1568 .yield = csched_vcpu_yield,
|
kaf24@10281
|
1569
|
kfraser@11295
|
1570 .adjust = csched_dom_cntl,
|
ack@10206
|
1571
|
ack@12291
|
1572 .pick_cpu = csched_cpu_pick,
|
ack@10206
|
1573 .do_schedule = csched_schedule,
|
ack@10206
|
1574
|
ack@10206
|
1575 .dump_cpu_state = csched_dump_pcpu,
|
ack@10206
|
1576 .dump_settings = csched_dump,
|
ack@10206
|
1577 .init = csched_init,
|
keir@21258
|
1578 .deinit = csched_deinit,
|
keir@21258
|
1579 .alloc_vdata = csched_alloc_vdata,
|
keir@21258
|
1580 .free_vdata = csched_free_vdata,
|
keir@21258
|
1581 .alloc_pdata = csched_alloc_pdata,
|
keir@21258
|
1582 .free_pdata = csched_free_pdata,
|
keir@21258
|
1583 .alloc_domdata = csched_alloc_domdata,
|
keir@21258
|
1584 .free_domdata = csched_free_domdata,
|
keir@19498
|
1585
|
keir@19498
|
1586 .tick_suspend = csched_tick_suspend,
|
keir@19498
|
1587 .tick_resume = csched_tick_resume,
|
ack@10206
|
1588 };
|