Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/common/sched_rt.c
Line    Count    Source
1
/*****************************************************************************
2
 * Preemptive Global Earliest Deadline First  (EDF) scheduler for Xen
3
 * EDF scheduling is a real-time scheduling algorithm used in the embedded field.
4
 *
5
 * by Sisu Xi, 2013, Washington University in Saint Louis
6
 * Meng Xu, 2014-2016, University of Pennsylvania
7
 *
8
 * Conversion toward event driven model by Tianyang Chen
9
 * and Dagaen Golomb, 2016, University of Pennsylvania
10
 *
11
 * based on the code of the Credit scheduler
12
 */
13
14
#include <xen/init.h>
15
#include <xen/lib.h>
16
#include <xen/sched.h>
17
#include <xen/domain.h>
18
#include <xen/delay.h>
19
#include <xen/event.h>
20
#include <xen/time.h>
21
#include <xen/timer.h>
22
#include <xen/perfc.h>
23
#include <xen/sched-if.h>
24
#include <xen/softirq.h>
25
#include <asm/atomic.h>
26
#include <xen/errno.h>
27
#include <xen/trace.h>
28
#include <xen/cpu.h>
29
#include <xen/keyhandler.h>
30
#include <xen/trace.h>
31
#include <xen/err.h>
32
#include <xen/guest_access.h>
33
34
/*
35
 * TODO:
36
 *
37
 * Migration compensation and resistance, like credit2, to make better use of the cache;
38
 * Lock Holder Problem, using yield?
39
 * Self-switch problem: VCPUs of the same domain may preempt each other;
40
 */
41
42
/*
43
 * Design:
44
 *
45
 * This scheduler follows the Preemptive Global Earliest Deadline First (EDF)
46
 * theory in real-time field.
47
 * At any scheduling point, the VCPU with the earlier deadline has higher priority.
48
 * The scheduler always picks the highest-priority VCPU to run on a feasible PCPU.
49
 * A PCPU is feasible if the VCPU can run on this PCPU and (the PCPU is idle or
50
 * has a lower-priority VCPU running on it).
51
 *
52
 * Each VCPU has a dedicated period, a budget and an extratime flag.
53
 * The deadline of a VCPU is at the end of each period;
54
 * A VCPU has its budget replenished at the beginning of each period;
55
 * While scheduled, a VCPU burns its budget.
56
 * The VCPU needs to finish its budget before its deadline in each period;
57
 * The VCPU discards its unused budget at the end of each period.
58
 * When a VCPU runs out of budget in a period, if its extratime flag is set,
59
 * the VCPU increases its priority_level by 1 and refills its budget; otherwise,
60
 * it has to wait until next period.
61
 *
62
 * Each VCPU is implemented as a deferrable server.
63
 * When a VCPU has a task running on it, its budget is continuously burned;
64
 * When a VCPU has no task but still has budget left, its budget is preserved.
65
 *
66
 * Queue scheme:
67
 * A global runqueue and a global depletedqueue for each CPU pool.
68
 * The runqueue holds all runnable VCPUs with budget,
69
 * sorted by priority_level and deadline;
70
 * The depletedqueue holds all VCPUs without budget, unsorted;
71
 *
72
 * Note: cpumask and cpupool are supported.
73
 */
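
(Illustration, not part of sched_rt.c: the priority rule described above as a minimal stand-alone C sketch. The names toy_vcpu and toy_compare are hypothetical; a lower priority_level always wins, and within the same level the earlier deadline wins. With the default parameters further down, period 10 ms and budget 4 ms, each VCPU is reserved at most 40% of a PCPU per period unless its extratime flag lets it run in otherwise-idle time.)

#include <stdint.h>
#include <stdio.h>

struct toy_vcpu {
    unsigned int priority_level;  /* 0 = within reservation, >0 = extratime */
    int64_t cur_deadline;         /* absolute deadline, in nanoseconds */
};

/* > 0 if v1 should run before v2 (mirrors compare_vcpu_priority() below) */
static int64_t toy_compare(const struct toy_vcpu *v1, const struct toy_vcpu *v2)
{
    int prio = (int)v2->priority_level - (int)v1->priority_level;

    if ( prio == 0 )
        return v2->cur_deadline - v1->cur_deadline;

    return prio;
}

int main(void)
{
    struct toy_vcpu a = { .priority_level = 0, .cur_deadline = 10000000 };
    struct toy_vcpu b = { .priority_level = 0, .cur_deadline =  7000000 };

    /* b has the earlier deadline, hence the higher priority */
    printf("run a before b? %s\n", toy_compare(&a, &b) > 0 ? "yes" : "no");

    return 0;
}
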
74
75
/*
76
 * Locking:
77
 * A global system lock is used to protect the RunQ and DepletedQ.
78
 * The global lock is referenced by schedule_data.schedule_lock
79
 * from all physical cpus.
80
 *
81
 * The lock is already grabbed when calling the wake/sleep/schedule functions
82
 * in schedule.c
83
 *
84
 * The functions that involve the RunQ and need to grab the lock are:
85
 *    vcpu_insert, vcpu_remove, context_saved, runq_insert
86
 */
87
88
89
/*
90
 * Default parameters:
91
 * The default period and budget are 10 ms and 4 ms, respectively
92
 */
93
0
#define RTDS_DEFAULT_PERIOD     (MICROSECS(10000))
94
0
#define RTDS_DEFAULT_BUDGET     (MICROSECS(4000))
95
96
/*
97
 * Max period: max delta of time type, because period is added to the time
98
 * a vcpu activates, so this must not overflow.
99
 * Min period: 10 us, considering the scheduling overhead (when period is
100
 * too low, scheduling is invoked too frequently, causing high overhead).
101
 */
102
0
#define RTDS_MAX_PERIOD     (STIME_DELTA_MAX)
103
0
#define RTDS_MIN_PERIOD     (MICROSECS(10))
104
105
/*
106
 * Min budget: 10 us, considering the scheduling overhead (when budget is
107
 * consumed too fast, scheduling is invoked too frequently, causing
108
 * high overhead).
109
 */
110
0
#define RTDS_MIN_BUDGET     (MICROSECS(10))
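
(Illustration, not part of sched_rt.c: a stand-alone sketch of the validity rule these limits imply and that rt_dom_cntl() enforces further down: the period must lie in [RTDS_MIN_PERIOD, RTDS_MAX_PERIOD] and the budget in [RTDS_MIN_BUDGET, period], all in nanoseconds. toy_params_ok is a hypothetical name, and STIME_DELTA_MAX is assumed here to equal INT64_MAX.)

#include <stdbool.h>
#include <stdint.h>

/* Stand-alone restatement of the parameter check in rt_dom_cntl(). */
static bool toy_params_ok(int64_t period, int64_t budget)
{
    const int64_t min_period = 10000;      /* RTDS_MIN_PERIOD: 10 us */
    const int64_t min_budget = 10000;      /* RTDS_MIN_BUDGET: 10 us */
    const int64_t max_period = INT64_MAX;  /* RTDS_MAX_PERIOD (STIME_DELTA_MAX), assumed */

    return period >= min_period && period <= max_period &&
           budget >= min_budget && budget <= period;
}
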
111
112
/*
113
 * UPDATE_LIMIT_SHIFT: a constant used in rt_update_deadline(). When finding
114
 * the next deadline, performing addition could be faster if the difference
115
 * between cur_deadline and now is small. If the difference is bigger than
116
 * 1024 * period, use one division and a multiplication instead.
117
 */
118
0
#define UPDATE_LIMIT_SHIFT      10
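
(Illustration, not part of sched_rt.c: with the default 10 ms period, 1 << UPDATE_LIMIT_SHIFT = 1024 periods is roughly 10.24 s, so a gap smaller than that is closed by repeated additions and only a larger gap pays for a 64-bit division. Below is a stand-alone sketch of that arithmetic; toy_update_deadline is a hypothetical name, the real code is in rt_update_deadline() further down.)

#include <stdint.h>

static int64_t toy_update_deadline(int64_t cur_deadline, int64_t period,
                                   int64_t now)
{
    if ( cur_deadline + (period << 10) > now )   /* 10 == UPDATE_LIMIT_SHIFT */
    {
        /* Small gap: a few additions beat a 64-bit division. */
        do
            cur_deadline += period;
        while ( cur_deadline <= now );
    }
    else
    {
        /* Large gap: one division plus one multiplication. */
        int64_t count = ((now - cur_deadline) / period) + 1;

        cur_deadline += count * period;
    }

    return cur_deadline;
}
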
119
120
/*
121
 * Flags
122
 */
123
/*
124
 * RTDS_scheduled: Is this vcpu either running on, or context-switching off,
125
 * a physical cpu?
126
 * + Accessed only with global lock held.
127
 * + Set when chosen as next in rt_schedule().
128
 * + Cleared after context switch has been saved in rt_context_saved()
129
 * + Checked in vcpu_wake to see if we can add to the Runqueue, or if we should
130
 *   set RTDS_delayed_runq_add
131
 * + Checked to be false in runq_insert.
132
 */
133
#define __RTDS_scheduled            1
134
#define RTDS_scheduled (1<<__RTDS_scheduled)
135
/*
136
 * RTDS_delayed_runq_add: Do we need to add this to the RunQ/DepletedQ
137
 * once it is done being context switched out?
138
 * + Set when scheduling out in rt_schedule() if prev is runnable
139
 * + Set in rt_vcpu_wake if it finds RTDS_scheduled set
140
 * + Read in rt_context_saved(). If set, it adds prev to the Runqueue/DepletedQ
141
 *   and clears the bit.
142
 */
143
0
#define __RTDS_delayed_runq_add     2
144
0
#define RTDS_delayed_runq_add (1<<__RTDS_delayed_runq_add)
145
146
/*
147
 * RTDS_depleted: Has this vcpu run out of budget?
148
 * This flag is
149
 * + set in burn_budget() if a vcpu has zero budget left;
150
 * + cleared and checked in the replenishment handler,
151
 *   for the vcpus that are being replenished.
152
 */
153
#define __RTDS_depleted     3
154
#define RTDS_depleted (1<<__RTDS_depleted)
155
156
/*
157
 * RTDS_extratime: Can the vcpu run in the time that is
158
 * not part of any real-time reservation, and would therefore
159
 * be otherwise left idle?
160
 */
161
0
#define __RTDS_extratime    4
162
0
#define RTDS_extratime (1<<__RTDS_extratime)
163
164
/*
165
 * rt tracing events ("only" 512 available!). Check
166
 * include/public/trace.h for more details.
167
 */
168
0
#define TRC_RTDS_TICKLE           TRC_SCHED_CLASS_EVT(RTDS, 1)
169
0
#define TRC_RTDS_RUNQ_PICK        TRC_SCHED_CLASS_EVT(RTDS, 2)
170
0
#define TRC_RTDS_BUDGET_BURN      TRC_SCHED_CLASS_EVT(RTDS, 3)
171
0
#define TRC_RTDS_BUDGET_REPLENISH TRC_SCHED_CLASS_EVT(RTDS, 4)
172
0
#define TRC_RTDS_SCHED_TASKLET    TRC_SCHED_CLASS_EVT(RTDS, 5)
173
0
#define TRC_RTDS_SCHEDULE         TRC_SCHED_CLASS_EVT(RTDS, 6)
174
175
static void repl_timer_handler(void *data);
176
177
/*
178
 * System-wide private data, including the global RunQueue/DepletedQ.
179
 * Global lock is referenced by schedule_data.schedule_lock from all
180
 * physical cpus. It can be grabbed via vcpu_schedule_lock_irq()
181
 */
182
struct rt_private {
183
    spinlock_t lock;            /* the global coarse-grained lock */
184
    struct list_head sdom;      /* list of available domains, used for dump */
185
186
    struct list_head runq;      /* ordered list of runnable vcpus */
187
    struct list_head depletedq; /* unordered list of depleted vcpus */
188
189
    struct timer *repl_timer;   /* replenishment timer */
190
    struct list_head replq;     /* ordered list of vcpus that need replenishment */
191
192
    cpumask_t tickled;          /* cpus that have been tickled */
193
};
194
195
/*
196
 * Virtual CPU
197
 */
198
struct rt_vcpu {
199
    struct list_head q_elem;     /* on the runq/depletedq list */
200
    struct list_head replq_elem; /* on the replenishment events list */
201
202
    /* VCPU parameters, in nanoseconds */
203
    s_time_t period;
204
    s_time_t budget;
205
206
    /* VCPU's current information, in nanoseconds */
207
    s_time_t cur_budget;         /* current budget */
208
    s_time_t last_start;         /* last start time */
209
    s_time_t cur_deadline;       /* current deadline for EDF */
210
211
    /* Up-pointers */
212
    struct rt_dom *sdom;
213
    struct vcpu *vcpu;
214
215
    unsigned priority_level;
216
217
    unsigned flags;              /* mark __RTDS_scheduled, etc.. */
218
};
219
220
/*
221
 * Domain
222
 */
223
struct rt_dom {
224
    struct list_head sdom_elem; /* link on rt_private's sdom list */
225
    struct domain *dom;         /* pointer to upper domain */
226
};
227
228
/*
229
 * Useful inline functions
230
 */
231
static inline struct rt_private *rt_priv(const struct scheduler *ops)
232
0
{
233
0
    return ops->sched_data;
234
0
}
235
236
static inline struct rt_vcpu *rt_vcpu(const struct vcpu *vcpu)
237
0
{
238
0
    return vcpu->sched_priv;
239
0
}
240
241
static inline struct rt_dom *rt_dom(const struct domain *dom)
242
0
{
243
0
    return dom->sched_priv;
244
0
}
245
246
static inline struct list_head *rt_runq(const struct scheduler *ops)
247
0
{
248
0
    return &rt_priv(ops)->runq;
249
0
}
250
251
static inline struct list_head *rt_depletedq(const struct scheduler *ops)
252
0
{
253
0
    return &rt_priv(ops)->depletedq;
254
0
}
255
256
static inline struct list_head *rt_replq(const struct scheduler *ops)
257
0
{
258
0
    return &rt_priv(ops)->replq;
259
0
}
260
261
static inline bool has_extratime(const struct rt_vcpu *svc)
262
0
{
263
0
    return svc->flags & RTDS_extratime;
264
0
}
265
266
/*
267
 * Helper functions for manipulating the runqueue, the depleted queue,
268
 * and the replenishment events queue.
269
 */
270
static int
271
vcpu_on_q(const struct rt_vcpu *svc)
272
0
{
273
0
   return !list_empty(&svc->q_elem);
274
0
}
275
276
static struct rt_vcpu *
277
q_elem(struct list_head *elem)
278
0
{
279
0
    return list_entry(elem, struct rt_vcpu, q_elem);
280
0
}
281
282
static struct rt_vcpu *
283
replq_elem(struct list_head *elem)
284
0
{
285
0
    return list_entry(elem, struct rt_vcpu, replq_elem);
286
0
}
287
288
static int
289
vcpu_on_replq(const struct rt_vcpu *svc)
290
0
{
291
0
    return !list_empty(&svc->replq_elem);
292
0
}
293
294
/*
295
 * If v1's priority is higher than v2's, the return value is > 0;
296
 * if they are equal, it is 0; otherwise it is < 0.
297
 */
298
static s_time_t
299
compare_vcpu_priority(const struct rt_vcpu *v1, const struct rt_vcpu *v2)
300
0
{
301
0
    int prio = v2->priority_level - v1->priority_level;
302
0
303
0
    if ( prio == 0 )
304
0
        return v2->cur_deadline - v1->cur_deadline;
305
0
306
0
    return prio;
307
0
}
308
309
/*
310
 * Debug related code, dump vcpu/cpu information
311
 */
312
static void
313
rt_dump_vcpu(const struct scheduler *ops, const struct rt_vcpu *svc)
314
0
{
315
0
    cpumask_t *cpupool_mask, *mask;
316
0
317
0
    ASSERT(svc != NULL);
318
0
    /* idle vcpu */
319
0
    if( svc->sdom == NULL )
320
0
    {
321
0
        printk("\n");
322
0
        return;
323
0
    }
324
0
325
0
    /*
326
0
     * We can't just use 'cpumask_scratch' because the dumping can
327
0
     * happen from a pCPU outside of this scheduler's cpupool, and
328
0
     * hence it's not right to use its pCPU's scratch mask.
329
0
     * On the other hand, it is safe to use svc->vcpu->processor's
330
0
     * own scratch space, since we hold the runqueue lock.
331
0
     */
332
0
    mask = cpumask_scratch_cpu(svc->vcpu->processor);
333
0
334
0
    cpupool_mask = cpupool_domain_cpumask(svc->vcpu->domain);
335
0
    cpumask_and(mask, cpupool_mask, svc->vcpu->cpu_hard_affinity);
336
0
    cpulist_scnprintf(keyhandler_scratch, sizeof(keyhandler_scratch), mask);
337
0
    printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime"),"
338
0
           " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime"\n"
339
0
           " \t\t priority_level=%d has_extratime=%d\n"
340
0
           " \t\t onQ=%d runnable=%d flags=%x effective hard_affinity=%s\n",
341
0
            svc->vcpu->domain->domain_id,
342
0
            svc->vcpu->vcpu_id,
343
0
            svc->vcpu->processor,
344
0
            svc->period,
345
0
            svc->budget,
346
0
            svc->cur_budget,
347
0
            svc->cur_deadline,
348
0
            svc->last_start,
349
0
            svc->priority_level,
350
0
            has_extratime(svc),
351
0
            vcpu_on_q(svc),
352
0
            vcpu_runnable(svc->vcpu),
353
0
            svc->flags,
354
0
            keyhandler_scratch);
355
0
}
356
357
static void
358
rt_dump_pcpu(const struct scheduler *ops, int cpu)
359
0
{
360
0
    struct rt_private *prv = rt_priv(ops);
361
0
    struct rt_vcpu *svc;
362
0
    unsigned long flags;
363
0
364
0
    spin_lock_irqsave(&prv->lock, flags);
365
0
    printk("CPU[%02d]\n", cpu);
366
0
    /* current VCPU (nothing to say if that's the idle vcpu). */
367
0
    svc = rt_vcpu(curr_on_cpu(cpu));
368
0
    if ( svc && !is_idle_vcpu(svc->vcpu) )
369
0
    {
370
0
        rt_dump_vcpu(ops, svc);
371
0
    }
372
0
    spin_unlock_irqrestore(&prv->lock, flags);
373
0
}
374
375
static void
376
rt_dump(const struct scheduler *ops)
377
0
{
378
0
    struct list_head *runq, *depletedq, *replq, *iter;
379
0
    struct rt_private *prv = rt_priv(ops);
380
0
    struct rt_vcpu *svc;
381
0
    struct rt_dom *sdom;
382
0
    unsigned long flags;
383
0
384
0
    spin_lock_irqsave(&prv->lock, flags);
385
0
386
0
    if ( list_empty(&prv->sdom) )
387
0
        goto out;
388
0
389
0
    runq = rt_runq(ops);
390
0
    depletedq = rt_depletedq(ops);
391
0
    replq = rt_replq(ops);
392
0
393
0
    printk("Global RunQueue info:\n");
394
0
    list_for_each ( iter, runq )
395
0
    {
396
0
        svc = q_elem(iter);
397
0
        rt_dump_vcpu(ops, svc);
398
0
    }
399
0
400
0
    printk("Global DepletedQueue info:\n");
401
0
    list_for_each ( iter, depletedq )
402
0
    {
403
0
        svc = q_elem(iter);
404
0
        rt_dump_vcpu(ops, svc);
405
0
    }
406
0
407
0
    printk("Global Replenishment Events info:\n");
408
0
    list_for_each ( iter, replq )
409
0
    {
410
0
        svc = replq_elem(iter);
411
0
        rt_dump_vcpu(ops, svc);
412
0
    }
413
0
414
0
    printk("Domain info:\n");
415
0
    list_for_each ( iter, &prv->sdom )
416
0
    {
417
0
        struct vcpu *v;
418
0
419
0
        sdom = list_entry(iter, struct rt_dom, sdom_elem);
420
0
        printk("\tdomain: %d\n", sdom->dom->domain_id);
421
0
422
0
        for_each_vcpu ( sdom->dom, v )
423
0
        {
424
0
            svc = rt_vcpu(v);
425
0
            rt_dump_vcpu(ops, svc);
426
0
        }
427
0
    }
428
0
429
0
 out:
430
0
    spin_unlock_irqrestore(&prv->lock, flags);
431
0
}
432
433
/*
434
 * Update the deadline and budget when now >= cur_deadline;
435
 * the deadline needs to be moved forward to that of the current period.
436
 */
437
static void
438
rt_update_deadline(s_time_t now, struct rt_vcpu *svc)
439
0
{
440
0
    ASSERT(now >= svc->cur_deadline);
441
0
    ASSERT(svc->period != 0);
442
0
443
0
    if ( svc->cur_deadline + (svc->period << UPDATE_LIMIT_SHIFT) > now )
444
0
    {
445
0
        do
446
0
            svc->cur_deadline += svc->period;
447
0
        while ( svc->cur_deadline <= now );
448
0
    }
449
0
    else
450
0
    {
451
0
        long count = ((now - svc->cur_deadline) / svc->period) + 1;
452
0
        svc->cur_deadline += count * svc->period;
453
0
    }
454
0
455
0
    /*
456
0
     * svc may be scheduled to run immediately after it misses its deadline.
457
0
     * Then rt_update_deadline is called before rt_schedule, which
458
0
     * should only deduct the time spent in the current period from the budget.
459
0
     */
460
0
    svc->last_start = now;
461
0
    svc->cur_budget = svc->budget;
462
0
    svc->priority_level = 0;
463
0
464
0
    /* TRACE */
465
0
    {
466
0
        struct __packed {
467
0
            unsigned vcpu:16, dom:16;
468
0
            unsigned priority_level;
469
0
            uint64_t cur_deadline, cur_budget;
470
0
        } d;
471
0
        d.dom = svc->vcpu->domain->domain_id;
472
0
        d.vcpu = svc->vcpu->vcpu_id;
473
0
        d.priority_level = svc->priority_level;
474
0
        d.cur_deadline = (uint64_t) svc->cur_deadline;
475
0
        d.cur_budget = (uint64_t) svc->cur_budget;
476
0
        trace_var(TRC_RTDS_BUDGET_REPLENISH, 1,
477
0
                  sizeof(d),
478
0
                  (unsigned char *) &d);
479
0
    }
480
0
481
0
    return;
482
0
}
483
484
/*
485
 * Helpers for removing and inserting a vcpu in a queue
486
 * that is being kept ordered by the vcpus' deadlines (as EDF
487
 * mandates).
488
 *
489
 * For callers' convenience, the vcpu removing helper returns
490
 * true if the vcpu removed was the one at the front of the
491
 * queue; similarly, the inserting helper returns true if the
492
 * inserted vcpu ended up at the front of the queue (i.e., in both
493
 * cases, if the vcpu with the earliest deadline is what we
494
 * are dealing with).
495
 */
496
static inline bool
497
deadline_queue_remove(struct list_head *queue, struct list_head *elem)
498
0
{
499
0
    int pos = 0;
500
0
501
0
    if ( queue->next != elem )
502
0
        pos = 1;
503
0
504
0
    list_del_init(elem);
505
0
    return !pos;
506
0
}
507
508
static inline bool
509
deadline_queue_insert(struct rt_vcpu * (*qelem)(struct list_head *),
510
                      struct rt_vcpu *svc, struct list_head *elem,
511
                      struct list_head *queue)
512
0
{
513
0
    struct list_head *iter;
514
0
    int pos = 0;
515
0
516
0
    list_for_each ( iter, queue )
517
0
    {
518
0
        struct rt_vcpu * iter_svc = (*qelem)(iter);
519
0
        if ( compare_vcpu_priority(svc, iter_svc) > 0 )
520
0
            break;
521
0
        pos++;
522
0
    }
523
0
    list_add_tail(elem, iter);
524
0
    return !pos;
525
0
}
526
#define deadline_runq_insert(...) \
527
0
  deadline_queue_insert(&q_elem, ##__VA_ARGS__)
528
#define deadline_replq_insert(...) \
529
0
  deadline_queue_insert(&replq_elem, ##__VA_ARGS__)
530
531
static inline void
532
q_remove(struct rt_vcpu *svc)
533
0
{
534
0
    ASSERT( vcpu_on_q(svc) );
535
0
    list_del_init(&svc->q_elem);
536
0
}
537
538
static inline void
539
replq_remove(const struct scheduler *ops, struct rt_vcpu *svc)
540
0
{
541
0
    struct rt_private *prv = rt_priv(ops);
542
0
    struct list_head *replq = rt_replq(ops);
543
0
544
0
    ASSERT( vcpu_on_replq(svc) );
545
0
546
0
    if ( deadline_queue_remove(replq, &svc->replq_elem) )
547
0
    {
548
0
        /*
549
0
         * The replenishment timer needs to be set to fire when a
550
0
         * replenishment for the vcpu at the front of the replenishment
551
0
         * queue is due. If the vcpu we just removed was that one, we may
552
0
         * need to reprogram the timer.
553
0
         */
554
0
        if ( !list_empty(replq) )
555
0
        {
556
0
            struct rt_vcpu *svc_next = replq_elem(replq->next);
557
0
            set_timer(prv->repl_timer, svc_next->cur_deadline);
558
0
        }
559
0
        else
560
0
            stop_timer(prv->repl_timer);
561
0
    }
562
0
}
563
564
/*
565
 * Insert svc with budget in RunQ according to EDF:
566
 * vcpus with smaller deadlines go first.
567
 * Insert svc without budget in DepletedQ unsorted;
568
 */
569
static void
570
runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
571
0
{
572
0
    struct rt_private *prv = rt_priv(ops);
573
0
    struct list_head *runq = rt_runq(ops);
574
0
575
0
    ASSERT( spin_is_locked(&prv->lock) );
576
0
    ASSERT( !vcpu_on_q(svc) );
577
0
    ASSERT( vcpu_on_replq(svc) );
578
0
579
0
    /* add svc to runq if svc still has budget or its extratime is set */
580
0
    if ( svc->cur_budget > 0 ||
581
0
         has_extratime(svc) )
582
0
        deadline_runq_insert(svc, &svc->q_elem, runq);
583
0
    else
584
0
        list_add(&svc->q_elem, &prv->depletedq);
585
0
}
586
587
static void
588
replq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
589
0
{
590
0
    struct list_head *replq = rt_replq(ops);
591
0
    struct rt_private *prv = rt_priv(ops);
592
0
593
0
    ASSERT( !vcpu_on_replq(svc) );
594
0
595
0
    /*
596
0
     * The timer may be re-programmed if svc is inserted
597
0
     * at the front of the event list.
598
0
     */
599
0
    if ( deadline_replq_insert(svc, &svc->replq_elem, replq) )
600
0
        set_timer(prv->repl_timer, svc->cur_deadline);
601
0
}
602
603
/*
604
 * Removes and re-inserts an event to the replenishment queue.
605
 * The aim is to update its position inside the queue, as its
606
 * deadline (and hence its replenishment time) could have
607
 * changed.
608
 */
609
static void
610
replq_reinsert(const struct scheduler *ops, struct rt_vcpu *svc)
611
0
{
612
0
    struct list_head *replq = rt_replq(ops);
613
0
    struct rt_vcpu *rearm_svc = svc;
614
0
    bool_t rearm = 0;
615
0
616
0
    ASSERT( vcpu_on_replq(svc) );
617
0
618
0
    /*
619
0
     * If svc was at the front of the replenishment queue, we certainly
620
0
     * need to re-program the timer, and we want to use the deadline of
621
0
     * the vcpu which is now at the front of the queue (which may still
622
0
     * be svc or not).
623
0
     *
624
0
     * We may also need to re-program, if svc has been put at the front
625
0
     * of the replenishment queue when being re-inserted.
626
0
     */
627
0
    if ( deadline_queue_remove(replq, &svc->replq_elem) )
628
0
    {
629
0
        deadline_replq_insert(svc, &svc->replq_elem, replq);
630
0
        rearm_svc = replq_elem(replq->next);
631
0
        rearm = 1;
632
0
    }
633
0
    else
634
0
        rearm = deadline_replq_insert(svc, &svc->replq_elem, replq);
635
0
636
0
    if ( rearm )
637
0
        set_timer(rt_priv(ops)->repl_timer, rearm_svc->cur_deadline);
638
0
}
639
640
/*
641
 * Pick a valid CPU for the vcpu vc
642
 * The valid CPUs of a vcpu are the intersection of the vcpu's affinity
643
 * and the available cpus.
644
 */
645
static int
646
rt_cpu_pick(const struct scheduler *ops, struct vcpu *vc)
647
0
{
648
0
    cpumask_t cpus;
649
0
    cpumask_t *online;
650
0
    int cpu;
651
0
652
0
    online = cpupool_domain_cpumask(vc->domain);
653
0
    cpumask_and(&cpus, online, vc->cpu_hard_affinity);
654
0
655
0
    cpu = cpumask_test_cpu(vc->processor, &cpus)
656
0
            ? vc->processor
657
0
            : cpumask_cycle(vc->processor, &cpus);
658
0
    ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) );
659
0
660
0
    return cpu;
661
0
}
662
663
/*
664
 * Init/Free related code
665
 */
666
static int
667
rt_init(struct scheduler *ops)
668
0
{
669
0
    int rc = -ENOMEM;
670
0
    struct rt_private *prv = xzalloc(struct rt_private);
671
0
672
0
    printk("Initializing RTDS scheduler\n"
673
0
           "WARNING: This is experimental software in development.\n"
674
0
           "Use at your own risk.\n");
675
0
676
0
    if ( prv == NULL )
677
0
        goto err;
678
0
679
0
    prv->repl_timer = xzalloc(struct timer);
680
0
    if ( prv->repl_timer == NULL )
681
0
        goto err;
682
0
683
0
    spin_lock_init(&prv->lock);
684
0
    INIT_LIST_HEAD(&prv->sdom);
685
0
    INIT_LIST_HEAD(&prv->runq);
686
0
    INIT_LIST_HEAD(&prv->depletedq);
687
0
    INIT_LIST_HEAD(&prv->replq);
688
0
689
0
    cpumask_clear(&prv->tickled);
690
0
691
0
    ops->sched_data = prv;
692
0
    rc = 0;
693
0
694
0
 err:
695
0
    if ( rc && prv )
696
0
    {
697
0
        xfree(prv->repl_timer);
698
0
        xfree(prv);
699
0
    }
700
0
701
0
    return rc;
702
0
}
703
704
static void
705
rt_deinit(struct scheduler *ops)
706
0
{
707
0
    struct rt_private *prv = rt_priv(ops);
708
0
709
0
    ASSERT(prv->repl_timer->status == TIMER_STATUS_invalid ||
710
0
           prv->repl_timer->status == TIMER_STATUS_killed);
711
0
    xfree(prv->repl_timer);
712
0
713
0
    ops->sched_data = NULL;
714
0
    xfree(prv);
715
0
}
716
717
/*
718
 * Point the per_cpu spinlock to the global system lock;
719
 * all cpus share the same global system lock.
720
 */
721
static void
722
rt_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
723
0
{
724
0
    struct rt_private *prv = rt_priv(ops);
725
0
    spinlock_t *old_lock;
726
0
    unsigned long flags;
727
0
728
0
    old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
729
0
730
0
    /*
731
0
     * TIMER_STATUS_invalid means we are the first cpu that sees the timer
732
0
     * allocated but not initialized, and so it's up to us to initialize it.
733
0
     */
734
0
    if ( prv->repl_timer->status == TIMER_STATUS_invalid )
735
0
    {
736
0
        init_timer(prv->repl_timer, repl_timer_handler, (void*) ops, cpu);
737
0
        dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu);
738
0
    }
739
0
740
0
    /* Move the scheduler lock to our global runqueue lock.  */
741
0
    per_cpu(schedule_data, cpu).schedule_lock = &prv->lock;
742
0
743
0
    /* _Not_ pcpu_schedule_unlock(): per_cpu().schedule_lock changed! */
744
0
    spin_unlock_irqrestore(old_lock, flags);
745
0
}
746
747
/* Change the scheduler of cpu to us (RTDS). */
748
static void
749
rt_switch_sched(struct scheduler *new_ops, unsigned int cpu,
750
                void *pdata, void *vdata)
751
0
{
752
0
    struct rt_private *prv = rt_priv(new_ops);
753
0
    struct rt_vcpu *svc = vdata;
754
0
755
0
    ASSERT(!pdata && svc && is_idle_vcpu(svc->vcpu));
756
0
757
0
    /*
758
0
     * We are holding the runqueue lock already (it's been taken in
759
0
     * schedule_cpu_switch()). It's actually the runqueue lock of
760
0
     * another scheduler, but that is how things need to be, for
761
0
     * preventing races.
762
0
     */
763
0
    ASSERT(per_cpu(schedule_data, cpu).schedule_lock != &prv->lock);
764
0
765
0
    /*
766
0
     * If we are the absolute first cpu being switched toward this
767
0
     * scheduler (in which case we'll see TIMER_STATUS_invalid), or the
768
0
     * first one that is added back to the cpupool that had all its cpus
769
0
     * removed (in which case we'll see TIMER_STATUS_killed), it's our
770
0
     * job to (re)initialize the timer.
771
0
     */
772
0
    if ( prv->repl_timer->status == TIMER_STATUS_invalid ||
773
0
         prv->repl_timer->status == TIMER_STATUS_killed )
774
0
    {
775
0
        init_timer(prv->repl_timer, repl_timer_handler, (void*) new_ops, cpu);
776
0
        dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu);
777
0
    }
778
0
779
0
    idle_vcpu[cpu]->sched_priv = vdata;
780
0
    per_cpu(scheduler, cpu) = new_ops;
781
0
    per_cpu(schedule_data, cpu).sched_priv = NULL; /* no pdata */
782
0
783
0
    /*
784
0
     * (Re?)route the per-pCPU lock to our global lock as the /last/ thing. In fact,
785
0
     * if it is free (and it can be), we want that anyone who manages
786
0
     * to take it finds all the initializations we've done above in place.
787
0
     */
788
0
    smp_mb();
789
0
    per_cpu(schedule_data, cpu).schedule_lock = &prv->lock;
790
0
}
791
792
static void
793
rt_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
794
0
{
795
0
    unsigned long flags;
796
0
    struct rt_private *prv = rt_priv(ops);
797
0
798
0
    spin_lock_irqsave(&prv->lock, flags);
799
0
800
0
    if ( prv->repl_timer->cpu == cpu )
801
0
    {
802
0
        struct cpupool *c = per_cpu(cpupool, cpu);
803
0
        unsigned int new_cpu = cpumask_cycle(cpu, cpupool_online_cpumask(c));
804
0
805
0
        /*
806
0
         * Make sure the timer runs on one of the cpus that are still available
807
0
         * to this scheduler. If there aren't any left, it means it's time
808
0
         * to just kill it.
809
0
         */
810
0
        if ( new_cpu >= nr_cpu_ids )
811
0
        {
812
0
            kill_timer(prv->repl_timer);
813
0
            dprintk(XENLOG_DEBUG, "RTDS: timer killed on cpu %d\n", cpu);
814
0
        }
815
0
        else
816
0
        {
817
0
            migrate_timer(prv->repl_timer, new_cpu);
818
0
        }
819
0
    }
820
0
821
0
    spin_unlock_irqrestore(&prv->lock, flags);
822
0
}
823
824
static void *
825
rt_alloc_domdata(const struct scheduler *ops, struct domain *dom)
826
0
{
827
0
    unsigned long flags;
828
0
    struct rt_dom *sdom;
829
0
    struct rt_private * prv = rt_priv(ops);
830
0
831
0
    sdom = xzalloc(struct rt_dom);
832
0
    if ( sdom == NULL )
833
0
        return NULL;
834
0
835
0
    INIT_LIST_HEAD(&sdom->sdom_elem);
836
0
    sdom->dom = dom;
837
0
838
0
    /* spinlock here to insert the dom */
839
0
    spin_lock_irqsave(&prv->lock, flags);
840
0
    list_add_tail(&sdom->sdom_elem, &(prv->sdom));
841
0
    spin_unlock_irqrestore(&prv->lock, flags);
842
0
843
0
    return sdom;
844
0
}
845
846
static void
847
rt_free_domdata(const struct scheduler *ops, void *data)
848
0
{
849
0
    unsigned long flags;
850
0
    struct rt_dom *sdom = data;
851
0
    struct rt_private *prv = rt_priv(ops);
852
0
853
0
    spin_lock_irqsave(&prv->lock, flags);
854
0
    list_del_init(&sdom->sdom_elem);
855
0
    spin_unlock_irqrestore(&prv->lock, flags);
856
0
    xfree(data);
857
0
}
858
859
static int
860
rt_dom_init(const struct scheduler *ops, struct domain *dom)
861
0
{
862
0
    struct rt_dom *sdom;
863
0
864
0
    /* The idle domain is not linked on rt_private */
865
0
    if ( is_idle_domain(dom) )
866
0
        return 0;
867
0
868
0
    sdom = rt_alloc_domdata(ops, dom);
869
0
    if ( sdom == NULL )
870
0
        return -ENOMEM;
871
0
872
0
    dom->sched_priv = sdom;
873
0
874
0
    return 0;
875
0
}
876
877
static void
878
rt_dom_destroy(const struct scheduler *ops, struct domain *dom)
879
0
{
880
0
    rt_free_domdata(ops, rt_dom(dom));
881
0
}
882
883
static void *
884
rt_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
885
0
{
886
0
    struct rt_vcpu *svc;
887
0
888
0
    /* Allocate per-VCPU info */
889
0
    svc = xzalloc(struct rt_vcpu);
890
0
    if ( svc == NULL )
891
0
        return NULL;
892
0
893
0
    INIT_LIST_HEAD(&svc->q_elem);
894
0
    INIT_LIST_HEAD(&svc->replq_elem);
895
0
    svc->flags = 0U;
896
0
    svc->sdom = dd;
897
0
    svc->vcpu = vc;
898
0
    svc->last_start = 0;
899
0
900
0
    __set_bit(__RTDS_extratime, &svc->flags);
901
0
    svc->priority_level = 0;
902
0
    svc->period = RTDS_DEFAULT_PERIOD;
903
0
    if ( !is_idle_vcpu(vc) )
904
0
        svc->budget = RTDS_DEFAULT_BUDGET;
905
0
906
0
    SCHED_STAT_CRANK(vcpu_alloc);
907
0
908
0
    return svc;
909
0
}
910
911
static void
912
rt_free_vdata(const struct scheduler *ops, void *priv)
913
0
{
914
0
    struct rt_vcpu *svc = priv;
915
0
916
0
    xfree(svc);
917
0
}
918
919
/*
920
 * This is called by sched_move_domain() and sched_init_vcpu()
921
 * in schedule.c.
922
 * When moving a domain to a new cpupool,
923
 * it inserts the vcpus of the moving domain into the scheduler's RunQ in
924
 * the destination cpupool.
925
 */
926
static void
927
rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
928
0
{
929
0
    struct rt_vcpu *svc = rt_vcpu(vc);
930
0
    s_time_t now;
931
0
    spinlock_t *lock;
932
0
933
0
    BUG_ON( is_idle_vcpu(vc) );
934
0
935
0
    /* This is safe because vc isn't yet being scheduled */
936
0
    vc->processor = rt_cpu_pick(ops, vc);
937
0
938
0
    lock = vcpu_schedule_lock_irq(vc);
939
0
940
0
    now = NOW();
941
0
    if ( now >= svc->cur_deadline )
942
0
        rt_update_deadline(now, svc);
943
0
944
0
    if ( !vcpu_on_q(svc) && vcpu_runnable(vc) )
945
0
    {
946
0
        replq_insert(ops, svc);
947
0
948
0
        if ( !vc->is_running )
949
0
            runq_insert(ops, svc);
950
0
    }
951
0
    vcpu_schedule_unlock_irq(lock, vc);
952
0
953
0
    SCHED_STAT_CRANK(vcpu_insert);
954
0
}
955
956
/*
957
 * Remove rt_vcpu svc from the old scheduler in source cpupool.
958
 */
959
static void
960
rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
961
0
{
962
0
    struct rt_vcpu * const svc = rt_vcpu(vc);
963
0
    struct rt_dom * const sdom = svc->sdom;
964
0
    spinlock_t *lock;
965
0
966
0
    SCHED_STAT_CRANK(vcpu_remove);
967
0
968
0
    BUG_ON( sdom == NULL );
969
0
970
0
    lock = vcpu_schedule_lock_irq(vc);
971
0
    if ( vcpu_on_q(svc) )
972
0
        q_remove(svc);
973
0
974
0
    if ( vcpu_on_replq(svc) )
975
0
        replq_remove(ops,svc);
976
0
977
0
    vcpu_schedule_unlock_irq(lock, vc);
978
0
}
979
980
/*
981
 * Burn budget in nanosecond granularity
982
 */
983
static void
984
burn_budget(const struct scheduler *ops, struct rt_vcpu *svc, s_time_t now)
985
0
{
986
0
    s_time_t delta;
987
0
988
0
    /* don't burn budget for idle VCPU */
989
0
    if ( is_idle_vcpu(svc->vcpu) )
990
0
        return;
991
0
992
0
    /* burn at nanoseconds level */
993
0
    delta = now - svc->last_start;
994
0
    /*
995
0
     * delta < 0 only happens in nested virtualization;
996
0
     * TODO: how should we handle delta < 0 in a better way?
997
0
     */
998
0
    if ( delta < 0 )
999
0
    {
1000
0
        printk("%s, ATTENTION: now is behind last_start! delta=%"PRI_stime"\n",
1001
0
                __func__, delta);
1002
0
        svc->last_start = now;
1003
0
        return;
1004
0
    }
1005
0
1006
0
    svc->cur_budget -= delta;
1007
0
    svc->last_start = now;
1008
0
1009
0
    if ( svc->cur_budget <= 0 )
1010
0
    {
1011
0
        if ( has_extratime(svc) )
1012
0
        {
1013
0
            svc->priority_level++;
1014
0
            svc->cur_budget = svc->budget;
1015
0
        }
1016
0
        else
1017
0
        {
1018
0
            svc->cur_budget = 0;
1019
0
            __set_bit(__RTDS_depleted, &svc->flags);
1020
0
        }
1021
0
    }
1022
0
1023
0
    /* TRACE */
1024
0
    {
1025
0
        struct __packed {
1026
0
            unsigned vcpu:16, dom:16;
1027
0
            uint64_t cur_budget;
1028
0
            int delta;
1029
0
            unsigned priority_level;
1030
0
            bool has_extratime;
1031
0
        } d;
1032
0
        d.dom = svc->vcpu->domain->domain_id;
1033
0
        d.vcpu = svc->vcpu->vcpu_id;
1034
0
        d.cur_budget = (uint64_t) svc->cur_budget;
1035
0
        d.delta = delta;
1036
0
        d.priority_level = svc->priority_level;
1037
0
        d.has_extratime = svc->flags & RTDS_extratime;
1038
0
        trace_var(TRC_RTDS_BUDGET_BURN, 1,
1039
0
                  sizeof(d),
1040
0
                  (unsigned char *) &d);
1041
0
    }
1042
0
}
1043
1044
/*
1045
 * The RunQ is sorted. Pick the first vcpu within the cpumask. If there is none, return NULL.
1046
 * The lock is grabbed before calling this function.
1047
 */
1048
static struct rt_vcpu *
1049
runq_pick(const struct scheduler *ops, const cpumask_t *mask)
1050
0
{
1051
0
    struct list_head *runq = rt_runq(ops);
1052
0
    struct list_head *iter;
1053
0
    struct rt_vcpu *svc = NULL;
1054
0
    struct rt_vcpu *iter_svc = NULL;
1055
0
    cpumask_t cpu_common;
1056
0
    cpumask_t *online;
1057
0
1058
0
    list_for_each ( iter, runq )
1059
0
    {
1060
0
        iter_svc = q_elem(iter);
1061
0
1062
0
        /* mask cpu_hard_affinity & cpupool & mask */
1063
0
        online = cpupool_domain_cpumask(iter_svc->vcpu->domain);
1064
0
        cpumask_and(&cpu_common, online, iter_svc->vcpu->cpu_hard_affinity);
1065
0
        cpumask_and(&cpu_common, mask, &cpu_common);
1066
0
        if ( cpumask_empty(&cpu_common) )
1067
0
            continue;
1068
0
1069
0
        ASSERT( iter_svc->cur_budget > 0 );
1070
0
1071
0
        svc = iter_svc;
1072
0
        break;
1073
0
    }
1074
0
1075
0
    /* TRACE */
1076
0
    {
1077
0
        if( svc != NULL )
1078
0
        {
1079
0
            struct __packed {
1080
0
                unsigned vcpu:16, dom:16;
1081
0
                uint64_t cur_deadline, cur_budget;
1082
0
            } d;
1083
0
            d.dom = svc->vcpu->domain->domain_id;
1084
0
            d.vcpu = svc->vcpu->vcpu_id;
1085
0
            d.cur_deadline = (uint64_t) svc->cur_deadline;
1086
0
            d.cur_budget = (uint64_t) svc->cur_budget;
1087
0
            trace_var(TRC_RTDS_RUNQ_PICK, 1,
1088
0
                      sizeof(d),
1089
0
                      (unsigned char *) &d);
1090
0
        }
1091
0
    }
1092
0
1093
0
    return svc;
1094
0
}
1095
1096
/*
1097
 * schedule function for rt scheduler.
1098
 * The lock is already grabbed in schedule.c, no need to lock here
1099
 */
1100
static struct task_slice
1101
rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled)
1102
0
{
1103
0
    const int cpu = smp_processor_id();
1104
0
    struct rt_private *prv = rt_priv(ops);
1105
0
    struct rt_vcpu *const scurr = rt_vcpu(current);
1106
0
    struct rt_vcpu *snext = NULL;
1107
0
    struct task_slice ret = { .migrated = 0 };
1108
0
1109
0
    /* TRACE */
1110
0
    {
1111
0
        struct __packed {
1112
0
            unsigned cpu:16, tasklet:8, tickled:4, idle:4;
1113
0
        } d;
1114
0
        d.cpu = cpu;
1115
0
        d.tasklet = tasklet_work_scheduled;
1116
0
        d.tickled = cpumask_test_cpu(cpu, &prv->tickled);
1117
0
        d.idle = is_idle_vcpu(current);
1118
0
        trace_var(TRC_RTDS_SCHEDULE, 1,
1119
0
                  sizeof(d),
1120
0
                  (unsigned char *)&d);
1121
0
    }
1122
0
1123
0
    /* clear the tickled bit now that we've been scheduled */
1124
0
    cpumask_clear_cpu(cpu, &prv->tickled);
1125
0
1126
0
    /* burn_budget() returns immediately for the idle VCPU */
1127
0
    burn_budget(ops, scurr, now);
1128
0
1129
0
    if ( tasklet_work_scheduled )
1130
0
    {
1131
0
        trace_var(TRC_RTDS_SCHED_TASKLET, 1, 0,  NULL);
1132
0
        snext = rt_vcpu(idle_vcpu[cpu]);
1133
0
    }
1134
0
    else
1135
0
    {
1136
0
        snext = runq_pick(ops, cpumask_of(cpu));
1137
0
        if ( snext == NULL )
1138
0
            snext = rt_vcpu(idle_vcpu[cpu]);
1139
0
1140
0
        /* if scurr has higher priority and budget left, still pick scurr */
1141
0
        if ( !is_idle_vcpu(current) &&
1142
0
             vcpu_runnable(current) &&
1143
0
             scurr->cur_budget > 0 &&
1144
0
             ( is_idle_vcpu(snext->vcpu) ||
1145
0
               compare_vcpu_priority(scurr, snext) > 0 ) )
1146
0
            snext = scurr;
1147
0
    }
1148
0
1149
0
    if ( snext != scurr &&
1150
0
         !is_idle_vcpu(current) &&
1151
0
         vcpu_runnable(current) )
1152
0
        __set_bit(__RTDS_delayed_runq_add, &scurr->flags);
1153
0
1154
0
    snext->last_start = now;
1155
0
    ret.time =  -1; /* if an idle vcpu is picked */
1156
0
    if ( !is_idle_vcpu(snext->vcpu) )
1157
0
    {
1158
0
        if ( snext != scurr )
1159
0
        {
1160
0
            q_remove(snext);
1161
0
            __set_bit(__RTDS_scheduled, &snext->flags);
1162
0
        }
1163
0
        if ( snext->vcpu->processor != cpu )
1164
0
        {
1165
0
            snext->vcpu->processor = cpu;
1166
0
            ret.migrated = 1;
1167
0
        }
1168
0
        ret.time = snext->cur_budget; /* invoke the scheduler next time */
1169
0
    }
1170
0
    ret.task = snext->vcpu;
1171
0
1172
0
    return ret;
1173
0
}
1174
1175
/*
1176
 * Remove VCPU from RunQ
1177
 * The lock is already grabbed in schedule.c, no need to lock here
1178
 */
1179
static void
1180
rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
1181
0
{
1182
0
    struct rt_vcpu * const svc = rt_vcpu(vc);
1183
0
1184
0
    BUG_ON( is_idle_vcpu(vc) );
1185
0
    SCHED_STAT_CRANK(vcpu_sleep);
1186
0
1187
0
    if ( curr_on_cpu(vc->processor) == vc )
1188
0
        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
1189
0
    else if ( vcpu_on_q(svc) )
1190
0
    {
1191
0
        q_remove(svc);
1192
0
        replq_remove(ops, svc);
1193
0
    }
1194
0
    else if ( svc->flags & RTDS_delayed_runq_add )
1195
0
        __clear_bit(__RTDS_delayed_runq_add, &svc->flags);
1196
0
}
1197
1198
/*
1199
 * Pick a cpu where to run a vcpu,
1200
 * possibly kicking out the vcpu running there
1201
 * Called by wake() and context_saved()
1202
 * We have a running candidate here, the kick logic is:
1203
 * Among all the cpus that are within the vcpu's cpu affinity:
1204
 * 1) if there are any idle CPUs, kick one.
1205
 *    For cache benefit, we check new->cpu first.
1206
 * 2) now all pcpus are busy;
1207
 *    among all the running vcpus, pick the lowest-priority one;
1208
 *    if new has higher priority than that one, kick its cpu.
1209
 *
1210
 * TODO:
1211
 * 1) what if these two vcpus belong to the same domain?
1212
 *    replacing a vcpu belonging to the same domain introduces more overhead
1213
 *
1214
 * The lock is grabbed before calling this function.
1215
 */
1216
static void
1217
runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
1218
0
{
1219
0
    struct rt_private *prv = rt_priv(ops);
1220
0
    struct rt_vcpu *latest_deadline_vcpu = NULL; /* lowest priority */
1221
0
    struct rt_vcpu *iter_svc;
1222
0
    struct vcpu *iter_vc;
1223
0
    int cpu = 0, cpu_to_tickle = 0;
1224
0
    cpumask_t not_tickled;
1225
0
    cpumask_t *online;
1226
0
1227
0
    if ( new == NULL || is_idle_vcpu(new->vcpu) )
1228
0
        return;
1229
0
1230
0
    online = cpupool_domain_cpumask(new->vcpu->domain);
1231
0
    cpumask_and(&not_tickled, online, new->vcpu->cpu_hard_affinity);
1232
0
    cpumask_andnot(&not_tickled, &not_tickled, &prv->tickled);
1233
0
1234
0
    /*
1235
0
     * 1) If there are any idle CPUs, kick one.
1236
0
     *    For cache benefit, we first search new->cpu.
1237
0
     *    The same loop also finds the one with the lowest priority.
1238
0
     */
1239
0
    cpu = cpumask_test_or_cycle(new->vcpu->processor, &not_tickled);
1240
0
    while ( cpu != nr_cpu_ids )
1241
0
    {
1242
0
        iter_vc = curr_on_cpu(cpu);
1243
0
        if ( is_idle_vcpu(iter_vc) )
1244
0
        {
1245
0
            SCHED_STAT_CRANK(tickled_idle_cpu);
1246
0
            cpu_to_tickle = cpu;
1247
0
            goto out;
1248
0
        }
1249
0
        iter_svc = rt_vcpu(iter_vc);
1250
0
        if ( latest_deadline_vcpu == NULL ||
1251
0
             compare_vcpu_priority(iter_svc, latest_deadline_vcpu) < 0 )
1252
0
            latest_deadline_vcpu = iter_svc;
1253
0
1254
0
        cpumask_clear_cpu(cpu, &not_tickled);
1255
0
        cpu = cpumask_cycle(cpu, &not_tickled);
1256
0
    }
1257
0
1258
0
    /* 2) candidate has higher priority, kick out the lowest-priority vcpu */
1259
0
    if ( latest_deadline_vcpu != NULL &&
1260
0
         compare_vcpu_priority(latest_deadline_vcpu, new) < 0 )
1261
0
    {
1262
0
        SCHED_STAT_CRANK(tickled_busy_cpu);
1263
0
        cpu_to_tickle = latest_deadline_vcpu->vcpu->processor;
1264
0
        goto out;
1265
0
    }
1266
0
1267
0
    /* didn't tickle any cpu */
1268
0
    SCHED_STAT_CRANK(tickled_no_cpu);
1269
0
    return;
1270
0
 out:
1271
0
    /* TRACE */
1272
0
    {
1273
0
        struct {
1274
0
            unsigned cpu:16, pad:16;
1275
0
        } d;
1276
0
        d.cpu = cpu_to_tickle;
1277
0
        d.pad = 0;
1278
0
        trace_var(TRC_RTDS_TICKLE, 1,
1279
0
                  sizeof(d),
1280
0
                  (unsigned char *)&d);
1281
0
    }
1282
0
1283
0
    cpumask_set_cpu(cpu_to_tickle, &prv->tickled);
1284
0
    cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ);
1285
0
    return;
1286
0
}
1287
1288
/*
1289
 * Should always wake up a runnable vcpu and put it back on the RunQ.
1290
 * Check priority to decide whether to raise an interrupt.
1291
 * The lock is already grabbed in schedule.c, no need to lock here
1292
 * TODO: what if these two vcpus belong to the same domain?
1293
 */
1294
static void
1295
rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
1296
0
{
1297
0
    struct rt_vcpu * const svc = rt_vcpu(vc);
1298
0
    s_time_t now;
1299
0
    bool_t missed;
1300
0
1301
0
    BUG_ON( is_idle_vcpu(vc) );
1302
0
1303
0
    if ( unlikely(curr_on_cpu(vc->processor) == vc) )
1304
0
    {
1305
0
        SCHED_STAT_CRANK(vcpu_wake_running);
1306
0
        return;
1307
0
    }
1308
0
1309
0
    /* already on the RunQ/DepletedQ; just updating the info is ok */
1310
0
    if ( unlikely(vcpu_on_q(svc)) )
1311
0
    {
1312
0
        SCHED_STAT_CRANK(vcpu_wake_onrunq);
1313
0
        return;
1314
0
    }
1315
0
1316
0
    if ( likely(vcpu_runnable(vc)) )
1317
0
        SCHED_STAT_CRANK(vcpu_wake_runnable);
1318
0
    else
1319
0
        SCHED_STAT_CRANK(vcpu_wake_not_runnable);
1320
0
1321
0
    /*
1322
0
     * If a deadline passed while svc was asleep/blocked, we need new
1323
0
     * scheduling parameters (a new deadline and full budget).
1324
0
     */
1325
0
    now = NOW();
1326
0
1327
0
    missed = ( now >= svc->cur_deadline );
1328
0
    if ( missed )
1329
0
        rt_update_deadline(now, svc);
1330
0
1331
0
    /*
1332
0
     * If context hasn't been saved for this vcpu yet, we can't put it on
1333
0
     * the run-queue/depleted-queue. Instead, we set the appropriate flag,
1334
0
     * the vcpu will be put back on queue after the context has been saved
1335
0
     * (in rt_context_saved()).
1336
0
     */
1337
0
    if ( unlikely(svc->flags & RTDS_scheduled) )
1338
0
    {
1339
0
        __set_bit(__RTDS_delayed_runq_add, &svc->flags);
1340
0
        /*
1341
0
         * The vcpu is waking up already, and we didn't even have the time to
1342
0
         * remove its next replenishment event from the replenishment queue
1343
0
         * when it blocked! No big deal. If we did not miss the deadline in
1344
0
         * the meantime, let's just leave it there. If we did, let's remove it
1345
0
         * and queue a new one (to occur at our new deadline).
1346
0
         */
1347
0
        if ( missed )
1348
0
           replq_reinsert(ops, svc);
1349
0
        return;
1350
0
    }
1351
0
1352
0
    /* Replenishment event got cancelled when we blocked. Add it back. */
1353
0
    replq_insert(ops, svc);
1354
0
    /* insert svc into the runq/depletedq because svc is not in a queue now */
1355
0
    runq_insert(ops, svc);
1356
0
1357
0
    runq_tickle(ops, svc);
1358
0
}
1359
1360
/*
1361
 * scurr has finished its context switch; insert it back into the RunQ,
1362
 * and then pick the highest priority vcpu from runq to run
1363
 */
1364
static void
1365
rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
1366
0
{
1367
0
    struct rt_vcpu *svc = rt_vcpu(vc);
1368
0
    spinlock_t *lock = vcpu_schedule_lock_irq(vc);
1369
0
1370
0
    __clear_bit(__RTDS_scheduled, &svc->flags);
1371
0
    /* do not insert the idle vcpu into the runq */
1372
0
    if ( is_idle_vcpu(vc) )
1373
0
        goto out;
1374
0
1375
0
    if ( __test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
1376
0
         likely(vcpu_runnable(vc)) )
1377
0
    {
1378
0
        runq_insert(ops, svc);
1379
0
        runq_tickle(ops, svc);
1380
0
    }
1381
0
    else
1382
0
        replq_remove(ops, svc);
1383
0
1384
0
out:
1385
0
    vcpu_schedule_unlock_irq(lock, vc);
1386
0
}
1387
1388
/*
1389
 * Set/get the scheduling parameters of each vcpu of a domain.
1390
 */
1391
static int
1392
rt_dom_cntl(
1393
    const struct scheduler *ops,
1394
    struct domain *d,
1395
    struct xen_domctl_scheduler_op *op)
1396
0
{
1397
0
    struct rt_private *prv = rt_priv(ops);
1398
0
    struct rt_vcpu *svc;
1399
0
    struct vcpu *v;
1400
0
    unsigned long flags;
1401
0
    int rc = 0;
1402
0
    struct xen_domctl_schedparam_vcpu local_sched;
1403
0
    s_time_t period, budget;
1404
0
    uint32_t index = 0;
1405
0
1406
0
    switch ( op->cmd )
1407
0
    {
1408
0
    case XEN_DOMCTL_SCHEDOP_getinfo:
1409
0
        /* Return the default parameters. */
1410
0
        op->u.rtds.period = RTDS_DEFAULT_PERIOD / MICROSECS(1);
1411
0
        op->u.rtds.budget = RTDS_DEFAULT_BUDGET / MICROSECS(1);
1412
0
        break;
1413
0
    case XEN_DOMCTL_SCHEDOP_putinfo:
1414
0
        if ( op->u.rtds.period == 0 || op->u.rtds.budget == 0 )
1415
0
        {
1416
0
            rc = -EINVAL;
1417
0
            break;
1418
0
        }
1419
0
        spin_lock_irqsave(&prv->lock, flags);
1420
0
        for_each_vcpu ( d, v )
1421
0
        {
1422
0
            svc = rt_vcpu(v);
1423
0
            svc->period = MICROSECS(op->u.rtds.period); /* convert to nanoseconds */
1424
0
            svc->budget = MICROSECS(op->u.rtds.budget);
1425
0
        }
1426
0
        spin_unlock_irqrestore(&prv->lock, flags);
1427
0
        break;
1428
0
    case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
1429
0
    case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
1430
0
        while ( index < op->u.v.nr_vcpus )
1431
0
        {
1432
0
            if ( copy_from_guest_offset(&local_sched,
1433
0
                                        op->u.v.vcpus, index, 1) )
1434
0
            {
1435
0
                rc = -EFAULT;
1436
0
                break;
1437
0
            }
1438
0
            if ( local_sched.vcpuid >= d->max_vcpus ||
1439
0
                 d->vcpu[local_sched.vcpuid] == NULL )
1440
0
            {
1441
0
                rc = -EINVAL;
1442
0
                break;
1443
0
            }
1444
0
1445
0
            if ( op->cmd == XEN_DOMCTL_SCHEDOP_getvcpuinfo )
1446
0
            {
1447
0
                spin_lock_irqsave(&prv->lock, flags);
1448
0
                svc = rt_vcpu(d->vcpu[local_sched.vcpuid]);
1449
0
                local_sched.u.rtds.budget = svc->budget / MICROSECS(1);
1450
0
                local_sched.u.rtds.period = svc->period / MICROSECS(1);
1451
0
                if ( has_extratime(svc) )
1452
0
                    local_sched.u.rtds.flags |= XEN_DOMCTL_SCHEDRT_extra;
1453
0
                else
1454
0
                    local_sched.u.rtds.flags &= ~XEN_DOMCTL_SCHEDRT_extra;
1455
0
                spin_unlock_irqrestore(&prv->lock, flags);
1456
0
1457
0
                if ( copy_to_guest_offset(op->u.v.vcpus, index,
1458
0
                                          &local_sched, 1) )
1459
0
                {
1460
0
                    rc = -EFAULT;
1461
0
                    break;
1462
0
                }
1463
0
            }
1464
0
            else
1465
0
            {
1466
0
                period = MICROSECS(local_sched.u.rtds.period);
1467
0
                budget = MICROSECS(local_sched.u.rtds.budget);
1468
0
                if ( period > RTDS_MAX_PERIOD || budget < RTDS_MIN_BUDGET ||
1469
0
                     budget > period || period < RTDS_MIN_PERIOD )
1470
0
                {
1471
0
                    rc = -EINVAL;
1472
0
                    break;
1473
0
                }
1474
0
1475
0
                spin_lock_irqsave(&prv->lock, flags);
1476
0
                svc = rt_vcpu(d->vcpu[local_sched.vcpuid]);
1477
0
                svc->period = period;
1478
0
                svc->budget = budget;
1479
0
                if ( local_sched.u.rtds.flags & XEN_DOMCTL_SCHEDRT_extra )
1480
0
                    __set_bit(__RTDS_extratime, &svc->flags);
1481
0
                else
1482
0
                    __clear_bit(__RTDS_extratime, &svc->flags);
1483
0
                spin_unlock_irqrestore(&prv->lock, flags);
1484
0
            }
1485
0
            /* Process at most 64 vCPUs without checking for preemption. */
1486
0
            if ( (++index > 63) && hypercall_preempt_check() )
1487
0
                break;
1488
0
        }
1489
0
        if ( !rc )
1490
0
            /* notify upper caller how many vcpus have been processed. */
1491
0
            op->u.v.nr_vcpus = index;
1492
0
        break;
1493
0
    }
1494
0
1495
0
    return rc;
1496
0
}
1497
1498
/*
1499
 * The replenishment timer handler picks vcpus
1500
 * from the replq and does the actual replenishment.
1501
 */
1502
0
static void repl_timer_handler(void *data){
1503
0
    s_time_t now;
1504
0
    struct scheduler *ops = data;
1505
0
    struct rt_private *prv = rt_priv(ops);
1506
0
    struct list_head *replq = rt_replq(ops);
1507
0
    struct list_head *runq = rt_runq(ops);
1508
0
    struct timer *repl_timer = prv->repl_timer;
1509
0
    struct list_head *iter, *tmp;
1510
0
    struct rt_vcpu *svc;
1511
0
    LIST_HEAD(tmp_replq);
1512
0
1513
0
    spin_lock_irq(&prv->lock);
1514
0
1515
0
    now = NOW();
1516
0
1517
0
    /*
1518
0
     * Do the replenishment and move replenished vcpus
1519
0
     * to the temporary list to tickle.
1520
0
     * If svc is on run queue, we need to put it at
1521
0
     * the correct place since its deadline changes.
1522
0
     */
1523
0
    list_for_each_safe ( iter, tmp, replq )
1524
0
    {
1525
0
        svc = replq_elem(iter);
1526
0
1527
0
        if ( now < svc->cur_deadline )
1528
0
            break;
1529
0
1530
0
        list_del(&svc->replq_elem);
1531
0
        rt_update_deadline(now, svc);
1532
0
        list_add(&svc->replq_elem, &tmp_replq);
1533
0
1534
0
        if ( vcpu_on_q(svc) )
1535
0
        {
1536
0
            q_remove(svc);
1537
0
            runq_insert(ops, svc);
1538
0
        }
1539
0
    }
1540
0
1541
0
    /*
1542
0
     * Iterate through the list of updated vcpus.
1543
0
     * If an updated vcpu is running, tickle the head of the
1544
0
     * runqueue if it has a higher priority.
1545
0
     * If an updated vcpu was depleted and on the runqueue, tickle it.
1546
0
     * Finally, reinsert the vcpus back into the replenishment events list.
1547
0
     */
1548
0
    list_for_each_safe ( iter, tmp, &tmp_replq )
1549
0
    {
1550
0
        svc = replq_elem(iter);
1551
0
1552
0
        if ( curr_on_cpu(svc->vcpu->processor) == svc->vcpu &&
1553
0
             !list_empty(runq) )
1554
0
        {
1555
0
            struct rt_vcpu *next_on_runq = q_elem(runq->next);
1556
0
1557
0
            if ( compare_vcpu_priority(svc, next_on_runq) < 0 )
1558
0
                runq_tickle(ops, next_on_runq);
1559
0
        }
1560
0
        else if ( __test_and_clear_bit(__RTDS_depleted, &svc->flags) &&
1561
0
                  vcpu_on_q(svc) )
1562
0
            runq_tickle(ops, svc);
1563
0
1564
0
        list_del(&svc->replq_elem);
1565
0
        deadline_replq_insert(svc, &svc->replq_elem, replq);
1566
0
    }
1567
0
1568
0
    /*
1569
0
     * If there are vcpus left in the replenishment event list,
1570
0
     * set the next replenishment to happen at the deadline of
1571
0
     * the one in the front.
1572
0
     */
1573
0
    if ( !list_empty(replq) )
1574
0
        set_timer(repl_timer, replq_elem(replq->next)->cur_deadline);
1575
0
1576
0
    spin_unlock_irq(&prv->lock);
1577
0
}
1578
1579
static const struct scheduler sched_rtds_def = {
1580
    .name           = "SMP RTDS Scheduler",
1581
    .opt_name       = "rtds",
1582
    .sched_id       = XEN_SCHEDULER_RTDS,
1583
    .sched_data     = NULL,
1584
1585
    .dump_cpu_state = rt_dump_pcpu,
1586
    .dump_settings  = rt_dump,
1587
    .init           = rt_init,
1588
    .deinit         = rt_deinit,
1589
    .init_pdata     = rt_init_pdata,
1590
    .switch_sched   = rt_switch_sched,
1591
    .deinit_pdata   = rt_deinit_pdata,
1592
    .alloc_domdata  = rt_alloc_domdata,
1593
    .free_domdata   = rt_free_domdata,
1594
    .init_domain    = rt_dom_init,
1595
    .destroy_domain = rt_dom_destroy,
1596
    .alloc_vdata    = rt_alloc_vdata,
1597
    .free_vdata     = rt_free_vdata,
1598
    .insert_vcpu    = rt_vcpu_insert,
1599
    .remove_vcpu    = rt_vcpu_remove,
1600
1601
    .adjust         = rt_dom_cntl,
1602
1603
    .pick_cpu       = rt_cpu_pick,
1604
    .do_schedule    = rt_schedule,
1605
    .sleep          = rt_vcpu_sleep,
1606
    .wake           = rt_vcpu_wake,
1607
    .context_saved  = rt_context_saved,
1608
};
1609
1610
REGISTER_SCHEDULER(sched_rtds_def);
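
(Usage note: the opt_name field above, "rtds", is what the hypervisor matches when a scheduler is selected. For example, adding the following fragment to Xen's boot command line selects this scheduler; shown for illustration, see the Xen command-line documentation for the authoritative syntax.)

    sched=rtds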