/root/src/xen/xen/common/sched_credit2.c

Source (jump to first uncovered line)

/****************************************************************************
 * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd
 ****************************************************************************
 *
 *        File: common/sched_credit2.c
 *      Author: George Dunlap
 *
 * Description: Credit-based SMP CPU scheduler
 * Based on an earlier verson by Emmanuel Ackaouy.
 */

#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <asm/div64.h>
#include <xen/errno.h>
#include <xen/trace.h>
#include <xen/cpu.h>
#include <xen/keyhandler.h>

/* Meant only for helping developers during debugging. */
/* #define d2printk printk */
#define d2printk(x...)


/*
 * Credit2 tracing events ("only" 512 available!). Check
 * include/public/trace.h for more details.
 */
#define TRC_CSCHED2_TICK             TRC_SCHED_CLASS_EVT(CSCHED2, 1)
#define TRC_CSCHED2_RUNQ_POS         TRC_SCHED_CLASS_EVT(CSCHED2, 2)
#define TRC_CSCHED2_CREDIT_BURN      TRC_SCHED_CLASS_EVT(CSCHED2, 3)
#define TRC_CSCHED2_CREDIT_ADD       TRC_SCHED_CLASS_EVT(CSCHED2, 4)
#define TRC_CSCHED2_TICKLE_CHECK     TRC_SCHED_CLASS_EVT(CSCHED2, 5)
#define TRC_CSCHED2_TICKLE           TRC_SCHED_CLASS_EVT(CSCHED2, 6)
#define TRC_CSCHED2_CREDIT_RESET     TRC_SCHED_CLASS_EVT(CSCHED2, 7)
#define TRC_CSCHED2_SCHED_TASKLET    TRC_SCHED_CLASS_EVT(CSCHED2, 8)
#define TRC_CSCHED2_UPDATE_LOAD      TRC_SCHED_CLASS_EVT(CSCHED2, 9)
#define TRC_CSCHED2_RUNQ_ASSIGN      TRC_SCHED_CLASS_EVT(CSCHED2, 10)
#define TRC_CSCHED2_UPDATE_VCPU_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 11)
#define TRC_CSCHED2_UPDATE_RUNQ_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 12)
#define TRC_CSCHED2_TICKLE_NEW       TRC_SCHED_CLASS_EVT(CSCHED2, 13)
#define TRC_CSCHED2_RUNQ_MAX_WEIGHT  TRC_SCHED_CLASS_EVT(CSCHED2, 14)
#define TRC_CSCHED2_MIGRATE          TRC_SCHED_CLASS_EVT(CSCHED2, 15)
#define TRC_CSCHED2_LOAD_CHECK       TRC_SCHED_CLASS_EVT(CSCHED2, 16)
#define TRC_CSCHED2_LOAD_BALANCE     TRC_SCHED_CLASS_EVT(CSCHED2, 17)
#define TRC_CSCHED2_PICKED_CPU       TRC_SCHED_CLASS_EVT(CSCHED2, 19)
#define TRC_CSCHED2_RUNQ_CANDIDATE   TRC_SCHED_CLASS_EVT(CSCHED2, 20)
#define TRC_CSCHED2_SCHEDULE         TRC_SCHED_CLASS_EVT(CSCHED2, 21)
#define TRC_CSCHED2_RATELIMIT        TRC_SCHED_CLASS_EVT(CSCHED2, 22)
#define TRC_CSCHED2_RUNQ_CAND_CHECK  TRC_SCHED_CLASS_EVT(CSCHED2, 23)

/*
 * WARNING: This is still in an experimental phase.  Status and work can be found at the
 * credit2 wiki page:
 *  http://wiki.xen.org/wiki/Credit2_Scheduler_Development
 *
 * TODO:
 * + Hyperthreading
 *  - Look for non-busy core if possible
 *  - "Discount" time run on a thread with busy siblings
 * + Algorithm:
 *  - "Mixed work" problem: if a VM is playing audio (5%) but also burning cpu (e.g.,
 *    a flash animation in the background) can we schedule it with low enough latency
 *    so that audio doesn't skip?
 *  - Cap and reservation: How to implement with the current system?
 * + Optimizing
 *  - Profiling, making new algorithms, making math more efficient (no long division)
 */

/*
 * Design:
 *
 * VMs "burn" credits based on their weight; higher weight means
 * credits burn more slowly.  The highest weight vcpu burns credits at
 * a rate of 1 credit per nanosecond.  Others burn proportionally
 * more.
 *
 * vcpus are inserted into the runqueue by credit order.
 *
 * Credits are "reset" when the next vcpu in the runqueue is less than
 * or equal to zero.  At that point, everyone's credits are "clipped"
 * to a small value, and a fixed credit is added to everyone.
 */

/*
 * Utilization cap:
 *
 * Setting an pCPU utilization cap for a domain means the following:
 *
 * - a domain can have a cap, expressed in terms of % of physical CPU time.
 *   A domain that must not use more than 1/4 of _one_ physical CPU, will
 *   be given a cap of 25%; a domain that must not use more than 1+1/2 of
 *   physical CPU time, will be given a cap of 150%;
 *
 * - caps are per-domain (not per-vCPU). If a domain has only 1 vCPU, and
 *   a 40% cap, that one vCPU will use 40% of one pCPU. If a somain has 4
 *   vCPUs, and a 200% cap, the equivalent of 100% time on 2 pCPUs will be
 *   split among the v vCPUs. How much each of the vCPUs will actually get,
 *   during any given interval of time, is unspecified (as it depends on
 *   various aspects: workload, system load, etc.). For instance, it is
 *   possible that, during a given time interval, 2 vCPUs use 100% each,
 *   and the other two use nothing; while during another time interval,
 *   two vCPUs use 80%, one uses 10% and the other 30%; or that each use
 *   50% (and so on and so forth).
 *
 * For implementing this, we use the following approach:
 *
 * - each domain is given a 'budget', an each domain has a timer, which
 *   replenishes the domain's budget periodically. The budget is the amount
 *   of time the vCPUs of the domain can use every 'period';
 *
 * - the period is CSCHED2_BDGT_REPL_PERIOD, and is the same for all domains
 *   (but each domain has its own timer; so the all are periodic by the same
 *   period, but replenishment of the budgets of the various domains, at
 *   periods boundaries, are not synchronous);
 *
 * - when vCPUs run, they consume budget. When they don't run, they don't
 *   consume budget. If there is no budget left for the domain, no vCPU of
 *   that domain can run. If a vCPU tries to run and finds that there is no
 *   budget, it blocks.
 *   At whatever time a vCPU wants to run, it must check the domain's budget,
 *   and if there is some, it can use it.
 *
 * - budget is replenished to the top of the capacity for the domain once
 *   per period. Even if there was some leftover budget from previous period,
 *   though, the budget after a replenishment will always be at most equal
 *   to the total capacify of the domain ('tot_budget');
 *
 * - when a budget replenishment occurs, if there are vCPUs that had been
 *   blocked because of lack of budget, they'll be unblocked, and they will
 *   (potentially) be able to run again.
 *
 * Finally, some even more implementation related detail:
 *
 * - budget is stored in a domain-wide pool. vCPUs of the domain that want
 *   to run go to such pool, and grub some. When they do so, the amount
 *   they grabbed is _immediately_ removed from the pool. This happens in
 *   vcpu_grab_budget();
 *
 * - when vCPUs stop running, if they've not consumed all the budget they
 *   took, the leftover is put back in the pool. This happens in
 *   vcpu_return_budget();
 *
 * - the above means that a vCPU can find out that there is no budget and
 *   block, not only if the cap has actually been reached (for this period),
 *   but also if some other vCPUs, in order to run, have grabbed a certain
 *   quota of budget, no matter whether they've already used it all or not.
 *   A vCPU blocking because (any form of) lack of budget is said to be
 *   "parked", and such blocking happens in park_vcpu();
 *
 * - when a vCPU stops running, and puts back some budget in the domain pool,
 *   we need to check whether there is someone which has been parked and that
 *   can be unparked. This happens in unpark_parked_vcpus(), called from
 *   csched2_context_saved();
 *
 * - of course, unparking happens also as a consequence of the domain's budget
 *   being replenished by the periodic timer. This also occurs by means of
 *   calling csched2_context_saved() (but from replenish_domain_budget());
 *
 * - parked vCPUs of a domain are kept in a (per-domain) list, called
 *   'parked_vcpus'). Manipulation of the list and of the domain-wide budget
 *   pool, must occur only when holding the 'budget_lock'.
 */

/*
 * Locking:
 *
 * - runqueue lock
 *  + it is per-runqueue, so:
 *   * cpus in a runqueue take the runqueue lock, when using
 *     pcpu_schedule_lock() / vcpu_schedule_lock() (and friends),
 *   * a cpu may (try to) take a "remote" runqueue lock, e.g., for
 *     load balancing;
 *  + serializes runqueue operations (removing and inserting vcpus);
 *  + protects runqueue-wide data in csched2_runqueue_data;
 *  + protects vcpu parameters in csched2_vcpu for the vcpu in the
 *    runqueue.
 *
 * - Private scheduler lock
 *  + protects scheduler-wide data in csched2_private, such as:
 *   * the list of domains active in this scheduler,
 *   * what cpus and what runqueues are active and in what
 *     runqueue each cpu is;
 *  + serializes the operation of changing the weights of domains;
 *
 * - Budget lock
 *  + it is per-domain;
 *  + protects, in domains that have an utilization cap;
 *   * manipulation of the total budget of the domain (as it is shared
 *     among all vCPUs of the domain),
 *   * manipulation of the list of vCPUs that are blocked waiting for
 *     some budget to be available.
 *
 * - Type:
 *  + runqueue locks are 'regular' spinlocks;
 *  + the private scheduler lock can be an rwlock. In fact, data
 *    it protects is modified only during initialization, cpupool
 *    manipulation and when changing weights, and read in all
 *    other cases (e.g., during load balancing);
 *  + budget locks are 'regular' spinlocks.
 *
 * Ordering:
 *  + tylock must be used when wanting to take a runqueue lock,
 *    if we already hold another one;
 *  + if taking both a runqueue lock and the private scheduler
 *    lock is, the latter must always be taken for first;
 *  + if taking both a runqueue lock and a budget lock, the former
 *    must always be taken for first.
 */

/*
 * Basic constants
 */
/* Default weight: How much a new domain starts with. */
#define CSCHED2_DEFAULT_WEIGHT       256
/*
 * Min timer: Minimum length a timer will be set, to
 * achieve efficiency.
 */
#define CSCHED2_MIN_TIMER            MICROSECS(500)
/*
 * Amount of credit VMs begin with, and are reset to.
 * ATM, set so that highest-weight VMs can only run for 10ms
 * before a reset event.
 */
#define CSCHED2_CREDIT_INIT          MILLISECS(10)
/*
 * Amount of credit the idle vcpus have. It never changes, as idle
 * vcpus does not consume credits, and it must be lower than whatever
 * amount of credit 'regular' vcpu would end up with.
 */
#define CSCHED2_IDLE_CREDIT          (-(1U<<30))
/*
 * Carryover: How much "extra" credit may be carried over after
 * a reset.
 */
#define CSCHED2_CARRYOVER_MAX        CSCHED2_MIN_TIMER
/*
 * Stickiness: Cross-L2 migration resistance.  Should be less than
 * MIN_TIMER.
 */
#define CSCHED2_MIGRATE_RESIST       ((opt_migrate_resist)*MICROSECS(1))
/* How much to "compensate" a vcpu for L2 migration. */
#define CSCHED2_MIGRATE_COMPENSATION MICROSECS(50)
/* How tolerant we should be when peeking at runtime of vcpus on other cpus */
#define CSCHED2_RATELIMIT_TICKLE_TOLERANCE MICROSECS(50)
/* Reset: Value below which credit will be reset. */
#define CSCHED2_CREDIT_RESET         0
/* Max timer: Maximum time a guest can be run for. */
#define CSCHED2_MAX_TIMER            CSCHED2_CREDIT_INIT
/* Period of the cap replenishment timer. */
#define CSCHED2_BDGT_REPL_PERIOD     ((opt_cap_period)*MILLISECS(1))

/*
 * Flags
 */
/*
 * CSFLAG_scheduled: Is this vcpu either running on, or context-switching off,
 * a physical cpu?
 * + Accessed only with runqueue lock held
 * + Set when chosen as next in csched2_schedule().
 * + Cleared after context switch has been saved in csched2_context_saved()
 * + Checked in vcpu_wake to see if we can add to the runqueue, or if we should
 *   set CSFLAG_delayed_runq_add
 * + Checked to be false in runq_insert.
 */
#define __CSFLAG_scheduled 1
#define CSFLAG_scheduled (1U<<__CSFLAG_scheduled)
/*
 * CSFLAG_delayed_runq_add: Do we need to add this to the runqueue once it'd done
 * being context switched out?
 * + Set when scheduling out in csched2_schedule() if prev is runnable
 * + Set in csched2_vcpu_wake if it finds CSFLAG_scheduled set
 * + Read in csched2_context_saved().  If set, it adds prev to the runqueue and
 *   clears the bit.
 */
#define __CSFLAG_delayed_runq_add 2
#define CSFLAG_delayed_runq_add (1U<<__CSFLAG_delayed_runq_add)
/*
 * CSFLAG_runq_migrate_request: This vcpu is being migrated as a result of a
 * credit2-initiated runq migrate request; migrate it to the runqueue indicated
 * in the svc struct. 
 */
#define __CSFLAG_runq_migrate_request 3
#define CSFLAG_runq_migrate_request (1U<<__CSFLAG_runq_migrate_request)
/*
 * CSFLAG_vcpu_yield: this vcpu was running, and has called vcpu_yield(). The
 * scheduler is invoked to see if we can give the cpu to someone else, and
 * get back to the yielding vcpu in a while.
 */
#define __CSFLAG_vcpu_yield 4
#define CSFLAG_vcpu_yield (1U<<__CSFLAG_vcpu_yield)

static unsigned int __read_mostly opt_migrate_resist = 500;
integer_param("sched_credit2_migrate_resist", opt_migrate_resist);

/*
 * Load tracking and load balancing
 *
 * Load history of runqueues and vcpus is accounted for by using an
 * exponential weighted moving average algorithm. However, instead of using
 * fractions,we shift everything to left by the number of bits we want to
 * use for representing the fractional part (Q-format).
 *
 * We may also want to reduce the precision of time accounting, to
 * accommodate 'longer  windows'. So, if that is the case, we just need to
 * shift all time samples to the right.
 *
 * The details of the formulas used for load tracking are explained close to
 * update_runq_load(). Let's just say here that, with full nanosecond time
 * granularity, a 30 bits wide 'decaying window' is ~1 second long.
 *
 * We want to consider the following equations:
 *
 *  avg[0] = load*P
 *  avg[i+1] = avg[i] + delta*load*P/W - delta*avg[i]/W,  0 <= delta <= W
 *
 * where W is the length of the window, P the multiplier for transitiong into
 * Q-format fixed point arithmetic and load is the instantaneous load of a
 * runqueue, which basically is the number of runnable vcpus there are on the
 * runqueue (for the meaning of the other terms, look at the doc comment to
 *  update_runq_load()).
 *
 *  So, again, with full nanosecond granularity, and 1 second window, we have:
 *
 *  W = 2^30
 *  P = 2^18
 *
 * The maximum possible value for the average load, which we want to store in
 * s_time_t type variables (i.e., we have 63 bits available) is load*P. This
 * means that, with P 18 bits wide, load can occupy 45 bits. This in turn
 * means we can have 2^45 vcpus in each runqueue, before overflow occurs!
 *
 * However, it can happen that, at step j+1, if:
 *
 *  avg[j] = load*P
 *  delta = W
 *
 * then:
 *
 *  avg[j+i] = avg[j] + W*load*P/W - W*load*P/W
 *
 * So we must be able to deal with W*load*P. This means load can't be higher
 * than:
 *
 *  2^(63 - 30 - 18) = 2^15 = 32768
 *
 * So 32768 is the maximum number of vcpus the we can have in a runqueue,
 * at any given time, and still not have problems with the load tracking
 * calculations... and this is more than fine.
 *
 * As a matter of fact, since we are using microseconds granularity, we have
 * W=2^20. So, still with 18 fractional bits and a 1 second long window, there
 * may be 2^25 = 33554432 vcpus in a runq before we have to start thinking
 * about overflow.
 */

/* If >0, decreases the granularity of time samples used for load tracking. */
#define LOADAVG_GRANULARITY_SHIFT   (10)
/* Time window during which we still give value to previous load history. */
#define LOADAVG_WINDOW_SHIFT        (30)
/* 18 bits by default (and not less than 4) for decimals. */
#define LOADAVG_PRECISION_SHIFT     (18)
#define LOADAVG_PRECISION_SHIFT_MIN (4)

/*
 * Both the length of the window and the number of fractional bits can be
 * decided with boot parameters.
 *
 * The length of the window is always expressed in nanoseconds. The actual
 * value used by default is LOADAVG_WINDOW_SHIFT - LOADAVG_GRANULARITY_SHIFT.
 */
static unsigned int __read_mostly opt_load_window_shift = LOADAVG_WINDOW_SHIFT;
integer_param("credit2_load_window_shift", opt_load_window_shift);
static unsigned int __read_mostly opt_load_precision_shift = LOADAVG_PRECISION_SHIFT;
integer_param("credit2_load_precision_shift", opt_load_precision_shift);

static int __read_mostly opt_underload_balance_tolerance = 0;
integer_param("credit2_balance_under", opt_underload_balance_tolerance);
static int __read_mostly opt_overload_balance_tolerance = -3;
integer_param("credit2_balance_over", opt_overload_balance_tolerance);
/*
 * Domains subject to a cap receive a replenishment of their runtime budget
 * once every opt_cap_period interval. Default is 10 ms. The amount of budget
 * they receive depends on their cap. For instance, a domain with a 50% cap
 * will receive 50% of 10 ms, so 5 ms.
 */
static unsigned int __read_mostly opt_cap_period = 10;    /* ms */
integer_param("credit2_cap_period_ms", opt_cap_period);

/*
 * Runqueue organization.
 *
 * The various cpus are to be assigned each one to a runqueue, and we
 * want that to happen basing on topology. At the moment, it is possible
 * to choose to arrange runqueues to be:
 *
 * - per-cpu: meaning that there will be one runqueue per logical cpu. This
 *            will happen when if the opt_runqueue parameter is set to 'cpu'.
 *
 * - per-core: meaning that there will be one runqueue per each physical
 *             core of the host. This will happen if the opt_runqueue
 *             parameter is set to 'core';
 *
 * - per-socket: meaning that there will be one runqueue per each physical
 *               socket (AKA package, which often, but not always, also
 *               matches a NUMA node) of the host; This will happen if
 *               the opt_runqueue parameter is set to 'socket';
 *
 * - per-node: meaning that there will be one runqueue per each physical
 *             NUMA node of the host. This will happen if the opt_runqueue
 *             parameter is set to 'node';
 *
 * - global: meaning that there will be only one runqueue to which all the
 *           (logical) processors of the host belong. This will happen if
 *           the opt_runqueue parameter is set to 'all'.
 *
 * Depending on the value of opt_runqueue, therefore, cpus that are part of
 * either the same physical core, the same physical socket, the same NUMA
 * node, or just all of them, will be put together to form runqueues.
 */
#define OPT_RUNQUEUE_CPU    0
#define OPT_RUNQUEUE_CORE   1
#define OPT_RUNQUEUE_SOCKET 2
#define OPT_RUNQUEUE_NODE   3
#define OPT_RUNQUEUE_ALL    4
static const char *const opt_runqueue_str[] = {
    [OPT_RUNQUEUE_CPU] = "cpu",
    [OPT_RUNQUEUE_CORE] = "core",
    [OPT_RUNQUEUE_SOCKET] = "socket",
    [OPT_RUNQUEUE_NODE] = "node",
    [OPT_RUNQUEUE_ALL] = "all"
};
static int __read_mostly opt_runqueue = OPT_RUNQUEUE_SOCKET;

static int parse_credit2_runqueue(const char *s)
{
    unsigned int i;

    for ( i = 0; i < ARRAY_SIZE(opt_runqueue_str); i++ )
    {
        if ( !strcmp(s, opt_runqueue_str[i]) )
        {
            opt_runqueue = i;
            return 0;
        }
    }

    return -EINVAL;
}
custom_param("credit2_runqueue", parse_credit2_runqueue);

/*
 * Per-runqueue data
 */
struct csched2_runqueue_data {
    spinlock_t lock;           /* Lock for this runqueue                     */

    struct list_head runq;     /* Ordered list of runnable vms               */
    int id;                    /* ID of this runqueue (-1 if invalid)        */

    int load;                  /* Instantaneous load (num of non-idle vcpus) */
    s_time_t load_last_update; /* Last time average was updated              */
    s_time_t avgload;          /* Decaying queue load                        */
    s_time_t b_avgload;        /* Decaying queue load modified by balancing  */

    cpumask_t active,          /* CPUs enabled for this runqueue             */
        smt_idle,              /* Fully idle-and-untickled cores (see below) */
        tickled,               /* Have been asked to go through schedule     */
        idle;                  /* Currently idle pcpus                       */

    struct list_head svc;      /* List of all vcpus assigned to the runqueue */
    unsigned int max_weight;   /* Max weight of the vcpus in this runqueue   */
    unsigned int pick_bias;    /* Last picked pcpu. Start from it next time  */
};

/*
 * System-wide private data
 */
struct csched2_private {
    rwlock_t lock;                     /* Private scheduler lock             */

    unsigned int load_precision_shift; /* Precision of load calculations     */
    unsigned int load_window_shift;    /* Lenght of load decaying window     */
    unsigned int ratelimit_us;         /* Rate limiting for this scheduler   */

    cpumask_t active_queues;           /* Runqueues with (maybe) active cpus */
    struct csched2_runqueue_data *rqd; /* Data of the various runqueues      */

    cpumask_t initialized;             /* CPUs part of this scheduler        */
    struct list_head sdom;             /* List of domains (for debug key)    */
};

/*
 * Physical CPU
 *
 * The only per-pCPU information we need to maintain is of which runqueue
 * each CPU is part of.
 */
static DEFINE_PER_CPU(int, runq_map);

/*
 * Virtual CPU
 */
struct csched2_vcpu {
    struct csched2_dom *sdom;          /* Up-pointer to domain                */
    struct vcpu *vcpu;                 /* Up-pointer, to vcpu                 */
    struct csched2_runqueue_data *rqd; /* Up-pointer to the runqueue          */

    int credit;                        /* Current amount of credit            */
    unsigned int weight;               /* Weight of this vcpu                 */
    unsigned int residual;             /* Reminder of div(max_weight/weight)  */
    unsigned flags;                    /* Status flags (16 bits would be ok,  */
    s_time_t budget;                   /* Current budget (if domains has cap) */
                                       /* but clear_bit() does not like that) */
    s_time_t budget_quota;             /* Budget to which vCPU is entitled    */

    s_time_t start_time;               /* Time we were scheduled (for credit) */

    /* Individual contribution to load                                        */
    s_time_t load_last_update;         /* Last time average was updated       */
    s_time_t avgload;                  /* Decaying queue load                 */

    struct list_head runq_elem;        /* On the runqueue (rqd->runq)         */
    struct list_head parked_elem;      /* On the parked_vcpus list            */
    struct list_head rqd_elem;         /* On csched2_runqueue_data's svc list */
    struct csched2_runqueue_data *migrate_rqd; /* Pre-determined migr. target */
    int tickled_cpu;                   /* Cpu that will pick us (-1 if none)  */
};

/*
 * Domain
 */
struct csched2_dom {
    struct domain *dom;         /* Up-pointer to domain                       */

    spinlock_t budget_lock;     /* Serialized budget calculations             */
    s_time_t tot_budget;        /* Total amount of budget                     */
    s_time_t budget;            /* Currently available budget                 */

    struct timer *repl_timer;   /* Timer for periodic replenishment of budget */
    s_time_t next_repl;         /* Time at which next replenishment occurs    */
    struct list_head parked_vcpus; /* List of CPUs waiting for budget         */

    struct list_head sdom_elem; /* On csched2_runqueue_data's sdom list       */
    uint16_t weight;            /* User specified weight                      */
    uint16_t cap;               /* User specified cap                         */
    uint16_t nr_vcpus;          /* Number of vcpus of this domain             */
};

/*
 * Accessor helpers functions.
 */
static inline struct csched2_private *csched2_priv(const struct scheduler *ops)
{
    return ops->sched_data;
}

static inline struct csched2_vcpu *csched2_vcpu(const struct vcpu *v)
{
    return v->sched_priv;
}

static inline struct csched2_dom *csched2_dom(const struct domain *d)
{
    return d->sched_priv;
}

/* CPU to runq_id macro */
static inline int c2r(unsigned int cpu)
{
    return per_cpu(runq_map, cpu);
}

/* CPU to runqueue struct macro */
static inline struct csched2_runqueue_data *c2rqd(const struct scheduler *ops,
                                                  unsigned int cpu)
{
    return &csched2_priv(ops)->rqd[c2r(cpu)];
}

/* Does the domain of this vCPU have a cap? */
static inline bool has_cap(const struct csched2_vcpu *svc)
{
    return svc->budget != STIME_MAX;
}

/*
 * Hyperthreading (SMT) support.
 *
 * We use a special per-runq mask (smt_idle) and update it according to the
 * following logic:
 *  - when _all_ the SMT sibling in a core are idle, all their corresponding
 *    bits are set in the smt_idle mask;
 *  - when even _just_one_ of the SMT siblings in a core is not idle, all the
 *    bits correspondings to it and to all its siblings are clear in the
 *    smt_idle mask.
 *
 * Once we have such a mask, it is easy to implement a policy that, either:
 *  - uses fully idle cores first: it is enough to try to schedule the vcpus
 *    on pcpus from smt_idle mask first. This is what happens if
 *    sched_smt_power_savings was not set at boot (default), and it maximizes
 *    true parallelism, and hence performance;
 *  - uses already busy cores first: it is enough to try to schedule the vcpus
 *    on pcpus that are idle, but are not in smt_idle. This is what happens if
 *    sched_smt_power_savings is set at boot, and it allows as more cores as
 *    possible to stay in low power states, minimizing power consumption.
 *
 * This logic is entirely implemented in runq_tickle(), and that is enough.
 * In fact, in this scheduler, placement of a vcpu on one of the pcpus of a
 * runq, _always_ happens by means of tickling:
 *  - when a vcpu wakes up, it calls csched2_vcpu_wake(), which calls
 *    runq_tickle();
 *  - when a migration is initiated in schedule.c, we call csched2_cpu_pick(),
 *    csched2_vcpu_migrate() (which calls migrate()) and csched2_vcpu_wake().
 *    csched2_cpu_pick() looks for the least loaded runq and return just any
 *    of its processors. Then, csched2_vcpu_migrate() just moves the vcpu to
 *    the chosen runq, and it is again runq_tickle(), called by
 *    csched2_vcpu_wake() that actually decides what pcpu to use within the
 *    chosen runq;
 *  - when a migration is initiated in sched_credit2.c, by calling  migrate()
 *    directly, that again temporarily use a random pcpu from the new runq,
 *    and then calls runq_tickle(), by itself.
 */

/*
 * If all the siblings of cpu (including cpu itself) are both idle and
 * untickled, set all their bits in mask.
 *
 * NB that rqd->smt_idle is different than rqd->idle.  rqd->idle
 * records pcpus that at are merely idle (i.e., at the moment do not
 * have a vcpu running on them).  But you have to manually filter out
 * which pcpus have been tickled in order to find cores that are not
 * going to be busy soon.  Filtering out tickled cpus pairwise is a
 * lot of extra pain; so for rqd->smt_idle, we explicitly make so that
 * the bits of a pcpu are set only if all the threads on its core are
 * both idle *and* untickled.
 *
 * This means changing the mask when either rqd->idle or rqd->tickled
 * changes.
 */
static inline
void smt_idle_mask_set(unsigned int cpu, const cpumask_t *idlers,
                       cpumask_t *mask)
{
    const cpumask_t *cpu_siblings = per_cpu(cpu_sibling_mask, cpu);

    if ( cpumask_subset(cpu_siblings, idlers) )
        cpumask_or(mask, mask, cpu_siblings);
}

/*
 * Clear the bits of all the siblings of cpu from mask (if necessary).
 */
static inline
void smt_idle_mask_clear(unsigned int cpu, cpumask_t *mask)
{
    const cpumask_t *cpu_siblings = per_cpu(cpu_sibling_mask, cpu);

    if ( cpumask_subset(cpu_siblings, mask) )
        cpumask_andnot(mask, mask, per_cpu(cpu_sibling_mask, cpu));
}

/*
 * In csched2_cpu_pick(), it may not be possible to actually look at remote
 * runqueues (the trylock-s on their spinlocks can fail!). If that happens,
 * we pick, in order of decreasing preference:
 *  1) svc's current pcpu, if it is part of svc's soft affinity;
 *  2) a pcpu in svc's current runqueue that is also in svc's soft affinity;
 *  3) svc's current pcpu, if it is part of svc's hard affinity;
 *  4) a pcpu in svc's current runqueue that is also in svc's hard affinity;
 *  5) just one valid pcpu from svc's hard affinity
 *
 * Of course, 1, 2 and 3 makes sense only if svc has a soft affinity. Also
 * note that at least 5 is guaranteed to _always_ return at least one pcpu.
 */
static int get_fallback_cpu(struct csched2_vcpu *svc)
{
    struct vcpu *v = svc->vcpu;
    unsigned int bs;

    SCHED_STAT_CRANK(need_fallback_cpu);

    for_each_affinity_balance_step( bs )
    {
        int cpu = v->processor;

        if ( bs == BALANCE_SOFT_AFFINITY &&
             !has_soft_affinity(v, v->cpu_hard_affinity) )
            continue;

        affinity_balance_cpumask(v, bs, cpumask_scratch_cpu(cpu));
        cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
                    cpupool_domain_cpumask(v->domain));

        /*
         * This is cases 1 or 3 (depending on bs): if v->processor is (still)
         * in our affinity, go for it, for cache betterness.
         */
        if ( likely(cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) )
            return cpu;

        /*
         * This is cases 2 or 4 (depending on bs): v->processor isn't there
         * any longer, check if we at least can stay in our current runq.
         */
        if ( likely(cpumask_intersects(cpumask_scratch_cpu(cpu),
                                       &svc->rqd->active)) )
        {
            cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
                        &svc->rqd->active);
            return cpumask_first(cpumask_scratch_cpu(cpu));
        }

        /*
         * We may well pick any valid pcpu from our soft-affinity, outside
         * of our current runqueue, but we decide not to. In fact, changing
         * runqueue is slow, affects load distribution, and is a source of
         * overhead for the vcpus running on the other runqueue (we need the
         * lock). So, better do that as a consequence of a well informed
         * decision (or if we really don't have any other chance, as we will,
         * at step 5, if we get to there).
         *
         * Also, being here, looking for a fallback, is an unfortunate and
         * infrequent event, while the decision of putting us in the runqueue
         * wehere we are was (likely) made taking all the relevant factors
         * into account. So let's not disrupt that, just for the sake of
         * soft-affinity, and let's wait here to be able to made (hopefully,
         * soon), another similar well informed decision.
         */
        if ( bs == BALANCE_SOFT_AFFINITY )
            continue;

        /*
         * This is cases 5: last stand, just one valid pcpu from our hard
         * affinity. It's guaranteed that there is at least one valid cpu,
         * and therefore we are sure that we return it, and never really
         * exit the loop.
         */
        ASSERT(bs == BALANCE_HARD_AFFINITY &&
               !cpumask_empty(cpumask_scratch_cpu(cpu)));
        cpu = cpumask_first(cpumask_scratch_cpu(cpu));
        if ( likely(cpu < nr_cpu_ids) )
            return cpu;
    }
    ASSERT_UNREACHABLE();
    /*
     * We can't be here.  But if that somehow happen (in non-debug builds),
     * at least return something which both online and in our hard-affinity.
     */
    return cpumask_any(cpumask_scratch_cpu(v->processor));
}

/*
 * Time-to-credit, credit-to-time.
 *
 * We keep track of the "residual" time to make sure that frequent short
 * schedules still get accounted for in the end.
 *
 * FIXME: Do pre-calculated division?
 */
static void t2c_update(struct csched2_runqueue_data *rqd, s_time_t time,
                          struct csched2_vcpu *svc)
{
    uint64_t val = time * rqd->max_weight + svc->residual;

    svc->residual = do_div(val, svc->weight);
    svc->credit -= val;
}

static s_time_t c2t(struct csched2_runqueue_data *rqd, s_time_t credit, struct csched2_vcpu *svc)
{
    return credit * svc->weight / rqd->max_weight;
}

/*
 * Runqueue related code.
 */

static inline int vcpu_on_runq(struct csched2_vcpu *svc)
{
    return !list_empty(&svc->runq_elem);
}

static inline struct csched2_vcpu * runq_elem(struct list_head *elem)
{
    return list_entry(elem, struct csched2_vcpu, runq_elem);
}

static void activate_runqueue(struct csched2_private *prv, int rqi)
{
    struct csched2_runqueue_data *rqd;

    rqd = prv->rqd + rqi;

    BUG_ON(!cpumask_empty(&rqd->active));

    rqd->max_weight = 1;
    rqd->id = rqi;
    INIT_LIST_HEAD(&rqd->svc);
    INIT_LIST_HEAD(&rqd->runq);
    spin_lock_init(&rqd->lock);

    __cpumask_set_cpu(rqi, &prv->active_queues);
}

static void deactivate_runqueue(struct csched2_private *prv, int rqi)
{
    struct csched2_runqueue_data *rqd;

    rqd = prv->rqd + rqi;

    BUG_ON(!cpumask_empty(&rqd->active));

    rqd->id = -1;

    __cpumask_clear_cpu(rqi, &prv->active_queues);
}

static inline bool same_node(unsigned int cpua, unsigned int cpub)
{
    return cpu_to_node(cpua) == cpu_to_node(cpub);
}

static inline bool same_socket(unsigned int cpua, unsigned int cpub)
{
    return cpu_to_socket(cpua) == cpu_to_socket(cpub);
}

static inline bool same_core(unsigned int cpua, unsigned int cpub)
{
    return same_socket(cpua, cpub) &&
           cpu_to_core(cpua) == cpu_to_core(cpub);
}

static unsigned int
cpu_to_runqueue(struct csched2_private *prv, unsigned int cpu)
{
    struct csched2_runqueue_data *rqd;
    unsigned int rqi;

    for ( rqi = 0; rqi < nr_cpu_ids; rqi++ )
    {
        unsigned int peer_cpu;

        /*
         * As soon as we come across an uninitialized runqueue, use it.
         * In fact, either:
         *  - we are initializing the first cpu, and we assign it to
         *    runqueue 0. This is handy, especially if we are dealing
         *    with the boot cpu (if credit2 is the default scheduler),
         *    as we would not be able to use cpu_to_socket() and similar
         *    helpers anyway (they're result of which is not reliable yet);
         *  - we have gone through all the active runqueues, and have not
         *    found anyone whose cpus' topology matches the one we are
         *    dealing with, so activating a new runqueue is what we want.
         */
        if ( prv->rqd[rqi].id == -1 )
            break;

        rqd = prv->rqd + rqi;
        BUG_ON(cpumask_empty(&rqd->active));

        peer_cpu = cpumask_first(&rqd->active);
        BUG_ON(cpu_to_socket(cpu) == XEN_INVALID_SOCKET_ID ||
               cpu_to_socket(peer_cpu) == XEN_INVALID_SOCKET_ID);

        if (opt_runqueue == OPT_RUNQUEUE_CPU)
            continue;
        if ( opt_runqueue == OPT_RUNQUEUE_ALL ||
             (opt_runqueue == OPT_RUNQUEUE_CORE && same_core(peer_cpu, cpu)) ||
             (opt_runqueue == OPT_RUNQUEUE_SOCKET && same_socket(peer_cpu, cpu)) ||
             (opt_runqueue == OPT_RUNQUEUE_NODE && same_node(peer_cpu, cpu)) )
            break;
    }

    /* We really expect to be able to assign each cpu to a runqueue. */
    BUG_ON(rqi >= nr_cpu_ids);

    return rqi;
}

/* Find the domain with the highest weight. */
static void update_max_weight(struct csched2_runqueue_data *rqd, int new_weight,
                              int old_weight)
{
    /* Try to avoid brute-force search:
     * - If new_weight is larger, max_weigth <- new_weight
     * - If old_weight != max_weight, someone else is still max_weight
     *   (No action required)
     * - If old_weight == max_weight, brute-force search for max weight
     */
    if ( new_weight > rqd->max_weight )
    {
        rqd->max_weight = new_weight;
        SCHED_STAT_CRANK(upd_max_weight_quick);
    }
    else if ( old_weight == rqd->max_weight )
    {
        struct list_head *iter;
        int max_weight = 1;

        list_for_each( iter, &rqd->svc )
        {
            struct csched2_vcpu * svc = list_entry(iter, struct csched2_vcpu, rqd_elem);

            if ( svc->weight > max_weight )
                max_weight = svc->weight;
        }

        rqd->max_weight = max_weight;
        SCHED_STAT_CRANK(upd_max_weight_full);
    }

    if ( unlikely(tb_init_done) )
    {
        struct {
            unsigned rqi:16, max_weight:16;
        } d;
        d.rqi = rqd->id;
        d.max_weight = rqd->max_weight;
        __trace_var(TRC_CSCHED2_RUNQ_MAX_WEIGHT, 1,
                    sizeof(d),
                    (unsigned char *)&d);
    }
}

/* Add and remove from runqueue assignment (not active run queue) */
static void
_runq_assign(struct csched2_vcpu *svc, struct csched2_runqueue_data *rqd)
{

    svc->rqd = rqd;
    list_add_tail(&svc->rqd_elem, &svc->rqd->svc);

    update_max_weight(svc->rqd, svc->weight, 0);

    /* Expected new load based on adding this vcpu */
    rqd->b_avgload += svc->avgload;

    if ( unlikely(tb_init_done) )
    {
        struct {
            unsigned vcpu:16, dom:16;
            unsigned rqi:16;
        } d;
        d.dom = svc->vcpu->domain->domain_id;
        d.vcpu = svc->vcpu->vcpu_id;
        d.rqi=rqd->id;
        __trace_var(TRC_CSCHED2_RUNQ_ASSIGN, 1,
                    sizeof(d),
                    (unsigned char *)&d);
    }

}

static void
runq_assign(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched2_vcpu *svc = vc->sched_priv;

    ASSERT(svc->rqd == NULL);

    _runq_assign(svc, c2rqd(ops, vc->processor));
}

static void
_runq_deassign(struct csched2_vcpu *svc)
{
    struct csched2_runqueue_data *rqd = svc->rqd;

    ASSERT(!vcpu_on_runq(svc));
    ASSERT(!(svc->flags & CSFLAG_scheduled));

    list_del_init(&svc->rqd_elem);
    update_max_weight(rqd, 0, svc->weight);

    /* Expected new load based on removing this vcpu */
    rqd->b_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0);

    svc->rqd = NULL;
}

static void
runq_deassign(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched2_vcpu *svc = vc->sched_priv;

    ASSERT(svc->rqd == c2rqd(ops, vc->processor));

    _runq_deassign(svc);
}

/*
 * Track the runq load by gathering instantaneous load samples, and using
 * exponentially weighted moving average (EWMA) for the 'decaying'.
 *
 * We consider a window of length W=2^(prv->load_window_shift) nsecs
 * (which takes LOADAVG_GRANULARITY_SHIFT into account).
 *
 * If load is the instantaneous load, the formula for EWMA looks as follows,
 * for the i-eth sample:
 *
 *  avg[i] = a*load + (1 - a)*avg[i-1]
 *
 * where avg[i] is the new value of the average load, avg[i-1] is the value
 * of the average load calculated so far, and a is a coefficient less or
 * equal to 1.
 *
 * So, for us, it becomes:
 *
 *  avgload = a*load + (1 - a)*avgload
 *
 * For determining a, we consider _when_ we are doing the load update, wrt
 * the length of the window. We define delta as follows:
 *
 *  delta = t - load_last_update
 *
 * where t is current time (i.e., time at which we are both sampling and
 * updating the load average) and load_last_update is the last time we did
 * that.
 *
 * There are two possible situations:
 *
 * a) delta <= W
 *    this means that, during the last window of length W, the runeuque load
 *    was avgload for (W - detla) time, and load for delta time:
 *
 *                |----------- W ---------|
 *                |                       |
 *                |     load_last_update  t
 *     -------------------------|---------|---
 *                |             |         |
 *                \__W - delta__/\_delta__/
 *                |             |         |
 *                |___avgload___|__load___|
 *
 *    So, what about using delta/W as our smoothing coefficient a. If we do,
 *    here's what happens:
 *
 *     a = delta / W
 *     1 - a = 1 - (delta / W) = (W - delta) / W
 *
 *    Which matches the above description of what happened in the last
 *    window of length W.
 *
 *    Note that this also means that the weight that we assign to both the
 *    latest load sample, and to previous history, varies at each update.
 *    The longer the latest load sample has been in efect, within the last
 *    window, the higher it weights (and the lesser the previous history
 *    weights).
 *
 *    This is some sort of extension of plain EWMA to fit even better to our
 *    use case.
 *
 * b) delta > W
 *    this means more than a full window has passed since the last update:
 *
 *                |----------- W ---------|
 *                |                       |
 *       load_last_update                 t
 *     ----|------------------------------|---
 *         |                              |
 *         \_________________delta________/
 *
 *    Basically, it means the last load sample has been in effect for more
 *    than W time, and hence we should just use it, and forget everything
 *    before that.
 *
 *    This can be seen as a 'reset condition', occurring when, for whatever
 *    reason, load has not been updated for longer than we expected. (It is
 *    also how avgload is assigned its first value.)
 *
 * The formula for avgload then becomes:
 *
 *  avgload = (delta/W)*load + (W - delta)*avgload/W
 *  avgload = delta*load/W + W*avgload/W - delta*avgload/W
 *  avgload = avgload + delta*load/W - delta*avgload/W
 *
 * So, final form is:
 *
 *  avgload_0 = load
 *  avgload = avgload + delta*load/W - delta*avgload/W,  0<=delta<=W
 *
 * As a confirmation, let's look at the extremes, when delta is 0 (i.e.,
 * what happens if we  update the load twice, at the same time instant?):
 *
 *  avgload = avgload + 0*load/W - 0*avgload/W
 *  avgload = avgload
 *
 * and when delta is W (i.e., what happens if we update at the last
 * possible instant before the window 'expires'?):
 *
 *  avgload = avgload + W*load/W - W*avgload/W
 *  avgload = avgload + load - avgload
 *  avgload = load
 *
 * Which, in both cases, is what we expect.
 */
static void
update_runq_load(const struct scheduler *ops,
                 struct csched2_runqueue_data *rqd, int change, s_time_t now)
{
    struct csched2_private *prv = csched2_priv(ops);
    s_time_t delta, load = rqd->load;
    unsigned int P, W;

    W = prv->load_window_shift;
    P = prv->load_precision_shift;
    now >>= LOADAVG_GRANULARITY_SHIFT;

    /*
     * To avoid using fractions, we shift to left by load_precision_shift,
     * and use the least last load_precision_shift bits as fractional part.
     * Looking back at the formula we want to use, we now have:
     *
     *  P = 2^(load_precision_shift)
     *  P*avgload = P*(avgload + delta*load/W - delta*avgload/W)
     *  P*avgload = P*avgload + delta*load*P/W - delta*P*avgload/W
     *
     * And if we are ok storing and using P*avgload, we can rewrite this as:
     *
     *  P*avgload = avgload'
     *  avgload' = avgload' + delta*P*load/W - delta*avgload'/W
     *
     * Coupled with, of course:
     *
     *  avgload_0' = P*load
     */

    if ( rqd->load_last_update + (1ULL << W)  < now )
    {
        rqd->avgload = load << P;
        rqd->b_avgload = load << P;
    }
    else
    {
        delta = now - rqd->load_last_update;
        if ( unlikely(delta < 0) )
        {
            d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" llu %"PRI_stime"\n",
                     __func__, now, rqd->load_last_update);
            delta = 0;
        }

        /*
         * Note that, if we were to enforce (or check) some relationship
         * between P and W, we may save one shift. E.g., if we are sure
         * that P < W, we could write:
         *
         *  (delta * (load << P)) >> W
         *
         * as:
         *
         *  (delta * load) >> (W - P)
         */
        rqd->avgload = rqd->avgload +
                       ((delta * (load << P)) >> W) -
                       ((delta * rqd->avgload) >> W);
        rqd->b_avgload = rqd->b_avgload +
                         ((delta * (load << P)) >> W) -
                         ((delta * rqd->b_avgload) >> W);
    }
    rqd->load += change;
    rqd->load_last_update = now;

    /* Overflow, capable of making the load look negative, must not occur. */
    ASSERT(rqd->avgload >= 0 && rqd->b_avgload >= 0);

    if ( unlikely(tb_init_done) )
    {
        struct {
            uint64_t rq_avgload, b_avgload;
            unsigned rq_load:16, rq_id:8, shift:8;
        } d;
        d.rq_id = rqd->id;
        d.rq_load = rqd->load;
        d.rq_avgload = rqd->avgload;
        d.b_avgload = rqd->b_avgload;
        d.shift = P;
        __trace_var(TRC_CSCHED2_UPDATE_RUNQ_LOAD, 1,
                    sizeof(d),
                    (unsigned char *)&d);
    }
}

static void
update_svc_load(const struct scheduler *ops,
                struct csched2_vcpu *svc, int change, s_time_t now)
{
    struct csched2_private *prv = csched2_priv(ops);
    s_time_t delta, vcpu_load;
    unsigned int P, W;

    if ( change == -1 )
        vcpu_load = 1;
    else if ( change == 1 )
        vcpu_load = 0;
    else
        vcpu_load = vcpu_runnable(svc->vcpu);

    W = prv->load_window_shift;
    P = prv->load_precision_shift;
    now >>= LOADAVG_GRANULARITY_SHIFT;

    if ( svc->load_last_update + (1ULL << W) < now )
    {
        svc->avgload = vcpu_load << P;
    }
    else
    {
        delta = now - svc->load_last_update;
        if ( unlikely(delta < 0) )
        {
            d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" llu %"PRI_stime"\n",
                     __func__, now, svc->load_last_update);
            delta = 0;
        }

        svc->avgload = svc->avgload +
                       ((delta * (vcpu_load << P)) >> W) -
                       ((delta * svc->avgload) >> W);
    }
    svc->load_last_update = now;

    /* Overflow, capable of making the load look negative, must not occur. */
    ASSERT(svc->avgload >= 0);

    if ( unlikely(tb_init_done) )
    {
        struct {
            uint64_t v_avgload;
            unsigned vcpu:16, dom:16;
            unsigned shift;
        } d;
        d.dom = svc->vcpu->domain->domain_id;
        d.vcpu = svc->vcpu->vcpu_id;
        d.v_avgload = svc->avgload;
        d.shift = P;
        __trace_var(TRC_CSCHED2_UPDATE_VCPU_LOAD, 1,
                    sizeof(d),
                    (unsigned char *)&d);
    }
}

static void
update_load(const struct scheduler *ops,
            struct csched2_runqueue_data *rqd,
            struct csched2_vcpu *svc, int change, s_time_t now)
{
    trace_var(TRC_CSCHED2_UPDATE_LOAD, 1, 0,  NULL);

    update_runq_load(ops, rqd, change, now);
    if ( svc )
        update_svc_load(ops, svc, change, now);
}

static void
runq_insert(const struct scheduler *ops, struct csched2_vcpu *svc)
{
    struct list_head *iter;
    unsigned int cpu = svc->vcpu->processor;
    struct list_head * runq = &c2rqd(ops, cpu)->runq;
    int pos = 0;

    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));

    ASSERT(!vcpu_on_runq(svc));
    ASSERT(c2r(cpu) == c2r(svc->vcpu->processor));

    ASSERT(&svc->rqd->runq == runq);
    ASSERT(!is_idle_vcpu(svc->vcpu));
    ASSERT(!svc->vcpu->is_running);
    ASSERT(!(svc->flags & CSFLAG_scheduled));

    list_for_each( iter, runq )
    {
        struct csched2_vcpu * iter_svc = runq_elem(iter);

        if ( svc->credit > iter_svc->credit )
            break;

        pos++;
    }
    list_add_tail(&svc->runq_elem, iter);

    if ( unlikely(tb_init_done) )
    {
        struct {
            unsigned vcpu:16, dom:16;
            unsigned pos;
        } d;
        d.dom = svc->vcpu->domain->domain_id;
        d.vcpu = svc->vcpu->vcpu_id;
        d.pos = pos;
        __trace_var(TRC_CSCHED2_RUNQ_POS, 1,
                    sizeof(d),
                    (unsigned char *)&d);
    }
}

static inline void runq_remove(struct csched2_vcpu *svc)
{
    ASSERT(vcpu_on_runq(svc));
    list_del_init(&svc->runq_elem);
}

void burn_credits(struct csched2_runqueue_data *rqd, struct csched2_vcpu *, s_time_t);

static inline void
tickle_cpu(unsigned int cpu, struct csched2_runqueue_data *rqd)
{
    __cpumask_set_cpu(cpu, &rqd->tickled);
    smt_idle_mask_clear(cpu, &rqd->smt_idle);
    cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
}

/*
 * What we want to know is whether svc, which we assume to be running on some
 * pcpu, can be interrupted and preempted (which, so far, basically means
 * whether or not it already run for more than the ratelimit, to which we
 * apply some tolerance).
 */
static inline bool is_preemptable(const struct csched2_vcpu *svc,
                                    s_time_t now, s_time_t ratelimit)
{
    if ( ratelimit <= CSCHED2_RATELIMIT_TICKLE_TOLERANCE )
        return true;

    ASSERT(svc->vcpu->is_running);
    return now - svc->vcpu->runstate.state_entry_time >
           ratelimit - CSCHED2_RATELIMIT_TICKLE_TOLERANCE;
}

/*
 * Score to preempt the target cpu.  Return a negative number if the
 * credit isn't high enough; if it is, favor a preemption on cpu in
 * this order:
 * - cpu is in new's soft-affinity, not in cur's soft-affinity
 *   (2 x CSCHED2_CREDIT_INIT score bonus);
 * - cpu is in new's soft-affinity and cur's soft-affinity, or
 *   cpu is not in new's soft-affinity, nor in cur's soft-affinity
 *   (1x CSCHED2_CREDIT_INIT score bonus);
 * - cpu is not in new's soft-affinity, while it is in cur's soft-affinity
 *   (no bonus).
 *
 * Within the same class, the highest difference of credit.
 */
static s_time_t tickle_score(const struct scheduler *ops, s_time_t now,
                             struct csched2_vcpu *new, unsigned int cpu)
{
    struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
    struct csched2_vcpu * cur = csched2_vcpu(curr_on_cpu(cpu));
    struct csched2_private *prv = csched2_priv(ops);
    s_time_t score;

    /*
     * We are dealing with cpus that are marked non-idle (i.e., that are not
     * in rqd->idle). However, some of them may be running their idle vcpu,
     * if taking care of tasklets. In that case, we want to leave it alone.
     */
    if ( unlikely(is_idle_vcpu(cur->vcpu) ||
         !is_preemptable(cur, now, MICROSECS(prv->ratelimit_us))) )
        return -1;

    burn_credits(rqd, cur, now);

    score = new->credit - cur->credit;
    if ( new->vcpu->processor != cpu )
        score -= CSCHED2_MIGRATE_RESIST;

    /*
     * If score is positive, it means new has enough credits (i.e.,
     * new->credit > cur->credit+CSCHED2_MIGRATE_RESIST).
     *
     * Let's compute the bonuses for soft-affinities.
     */
    if ( score > 0 )
    {
        if ( cpumask_test_cpu(cpu, new->vcpu->cpu_soft_affinity) )
            score += CSCHED2_CREDIT_INIT;

        if ( !cpumask_test_cpu(cpu, cur->vcpu->cpu_soft_affinity) )
            score += CSCHED2_CREDIT_INIT;
    }

    if ( unlikely(tb_init_done) )
    {
        struct {
            unsigned vcpu:16, dom:16;
            int credit, score;
        } d;
        d.dom = cur->vcpu->domain->domain_id;
        d.vcpu = cur->vcpu->vcpu_id;
        d.credit = cur->credit;
        d.score = score;
        __trace_var(TRC_CSCHED2_TICKLE_CHECK, 1,
                    sizeof(d),
                    (unsigned char *)&d);
    }

    return score;
}

/*
 * Check what processor it is best to 'wake', for picking up a vcpu that has
 * just been put (back) in the runqueue. Logic is as follows:
 *  1. if there are idle processors in the runq, wake one of them;
 *  2. if there aren't idle processor, check the one were the vcpu was
 *     running before to see if we can preempt what's running there now
 *     (and hence doing just one migration);
 *  3. last stand: check all processors and see if the vcpu is in right
 *     of preempting any of the other vcpus running on them (this requires
 *     two migrations, and that's indeed why it is left as the last stand).
 *
 * Note that when we say 'idle processors' what we really mean is (pretty
 * much always) both _idle_ and _not_already_tickled_. In fact, if a
 * processor has been tickled, it will run csched2_schedule() shortly, and
 * pick up some work, so it would be wrong to consider it idle.
 */
static void
runq_tickle(const struct scheduler *ops, struct csched2_vcpu *new, s_time_t now)
{
    int i, ipid = -1;
    s_time_t max = 0;
    unsigned int bs, cpu = new->vcpu->processor;
    struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
    cpumask_t *online = cpupool_domain_cpumask(new->vcpu->domain);
    cpumask_t mask;

    ASSERT(new->rqd == rqd);

    if ( unlikely(tb_init_done) )
    {
        struct {
            unsigned vcpu:16, dom:16;
            unsigned processor;
            int credit;
        } d;
        d.dom = new->vcpu->domain->domain_id;
        d.vcpu = new->vcpu->vcpu_id;
        d.processor = new->vcpu->processor;
        d.credit = new->credit;
        __trace_var(TRC_CSCHED2_TICKLE_NEW, 1,
                    sizeof(d),
                    (unsigned char *)&d);
    }

    for_each_affinity_balance_step( bs )
    {
        /* Just skip first step, if we don't have a soft affinity */
        if ( bs == BALANCE_SOFT_AFFINITY &&
             !has_soft_affinity(new->vcpu, new->vcpu->cpu_hard_affinity) )
            continue;

        affinity_balance_cpumask(new->vcpu, bs, cpumask_scratch_cpu(cpu));

        /*
         * First of all, consider idle cpus, checking if we can just
         * re-use the pcpu where we were running before.
         *
         * If there are cores where all the siblings are idle, consider
         * them first, honoring whatever the spreading-vs-consolidation
         * SMT policy wants us to do.
         */
        if ( unlikely(sched_smt_power_savings) )
        {
            cpumask_andnot(&mask, &rqd->idle, &rqd->smt_idle);
            cpumask_and(&mask, &mask, online);
        }
        else
            cpumask_and(&mask, &rqd->smt_idle, online);
        cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu));
        i = cpumask_test_or_cycle(cpu, &mask);
        if ( i < nr_cpu_ids )
        {
            SCHED_STAT_CRANK(tickled_idle_cpu);
            ipid = i;
            goto tickle;
        }

        /*
         * If there are no fully idle cores, check all idlers, after
         * having filtered out pcpus that have been tickled but haven't
         * gone through the scheduler yet.
         */
        cpumask_andnot(&mask, &rqd->idle, &rqd->tickled);
        cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), online);
        cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu));
        i = cpumask_test_or_cycle(cpu, &mask);
        if ( i < nr_cpu_ids )
        {
            SCHED_STAT_CRANK(tickled_idle_cpu);
            ipid = i;
            goto tickle;
        }
    }

    /*
     * Note that, if we are here, it means we have done the hard-affinity
     * balancing step of the loop, and hence what we have in cpumask_scratch
     * is what we put there for last, i.e., new's vcpu_hard_affinity & online
     * which is exactly what we need for the next part of the function.
     */

    /*
     * Otherwise, look for the non-idle (and non-tickled) processors with
     * the lowest credit, among the ones new is allowed to run on. Again,
     * the cpu were it was running on would be the best candidate.
     *
     * For deciding which cpu to tickle, we use tickle_score(), which will
     * factor in both new's soft-affinity, and the soft-affinity of the
     * vcpu running on each cpu that we consider.
     */
    cpumask_andnot(&mask, &rqd->active, &rqd->idle);
    cpumask_andnot(&mask, &mask, &rqd->tickled);
    cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu));
    if ( __cpumask_test_and_clear_cpu(cpu, &mask) )
    {
        s_time_t score = tickle_score(ops, now, new, cpu);

        if ( score > max )
        {
            max = score;
            ipid = cpu;

            /* If this is in new's soft affinity, just take it */
            if ( cpumask_test_cpu(cpu, new->vcpu->cpu_soft_affinity) )
            {
                SCHED_STAT_CRANK(tickled_busy_cpu);
                goto tickle;
            }
        }
    }

    for_each_cpu(i, &mask)
    {
        s_time_t score;

        /* Already looked at this one above */
        ASSERT(i != cpu);

        score = tickle_score(ops, now, new, i);

        if ( score > max )
        {
            max = score;
            ipid = i;
        }
    }

    if ( ipid == -1 )
    {
        SCHED_STAT_CRANK(tickled_no_cpu);
        return;
    }

    ASSERT(!is_idle_vcpu(curr_on_cpu(ipid)));
    SCHED_STAT_CRANK(tickled_busy_cpu);
 tickle:
    BUG_ON(ipid == -1);

    if ( unlikely(tb_init_done) )
    {
        struct {
            unsigned cpu:16, pad:16;
        } d;
        d.cpu = ipid; d.pad = 0;
        __trace_var(TRC_CSCHED2_TICKLE, 1,
                    sizeof(d),
                    (unsigned char *)&d);
    }

    tickle_cpu(ipid, rqd);

    if ( unlikely(new->tickled_cpu != -1) )
        SCHED_STAT_CRANK(tickled_cpu_overwritten);
    new->tickled_cpu = ipid;
}

/*
 * Credit-related code
 */
static void reset_credit(const struct scheduler *ops, int cpu, s_time_t now,
                         struct csched2_vcpu *snext)
{
    struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
    struct list_head *iter;
    int m;

    /*
     * Under normal circumstances, snext->credit should never be less
     * than -CSCHED2_MIN_TIMER.  However, under some circumstances, a
     * vcpu with low credits may be allowed to run long enough that
     * its credits are actually less than -CSCHED2_CREDIT_INIT.
     * (Instances have been observed, for example, where a vcpu with
     * 200us of credit was allowed to run for 11ms, giving it -10.8ms
     * of credit.  Thus it was still negative even after the reset.)
     *
     * If this is the case for snext, we simply want to keep moving
     * everyone up until it is in the black again.  This fair because
     * none of the other vcpus want to run at the moment.
     *
     * Rather than looping, however, we just calculate a multiplier,
     * avoiding an integer division and multiplication in the common
     * case.
     */
    m = 1;
    if ( snext->credit < -CSCHED2_CREDIT_INIT )
        m += (-snext->credit) / CSCHED2_CREDIT_INIT;

    list_for_each( iter, &rqd->svc )
    {
        unsigned int svc_cpu;
        struct csched2_vcpu * svc;
        int start_credit;

        svc = list_entry(iter, struct csched2_vcpu, rqd_elem);
        svc_cpu = svc->vcpu->processor;

        ASSERT(!is_idle_vcpu(svc->vcpu));
        ASSERT(svc->rqd == rqd);

        /*
         * If svc is running, it is our responsibility to make sure, here,
         * that the credit it has spent so far get accounted.
         */
        if ( svc->vcpu == curr_on_cpu(svc_cpu) )
        {
            burn_credits(rqd, svc, now);
            /*
             * And, similarly, in case it has run out of budget, as a
             * consequence of this round of accounting, we also must inform
             * its pCPU that it's time to park it, and pick up someone else.
             */
            if ( unlikely(svc->budget <= 0) )
                tickle_cpu(svc_cpu, rqd);
        }

        start_credit = svc->credit;

        /*
         * Add INIT * m, avoiding integer multiplication in the common case.
         */
        if ( likely(m==1) )
            svc->credit += CSCHED2_CREDIT_INIT;
        else
            svc->credit += m * CSCHED2_CREDIT_INIT;

        /* "Clip" credits to max carryover */
        if ( svc->credit > CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX )
            svc->credit = CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX;

        svc->start_time = now;

        if ( unlikely(tb_init_done) )
        {
            struct {
                unsigned vcpu:16, dom:16;
                int credit_start, credit_end;
                unsigned multiplier;
            } d;
            d.dom = svc->vcpu->domain->domain_id;
            d.vcpu = svc->vcpu->vcpu_id;
            d.credit_start = start_credit;
            d.credit_end = svc->credit;
            d.multiplier = m;
            __trace_var(TRC_CSCHED2_CREDIT_RESET, 1,
                        sizeof(d),
                        (unsigned char *)&d);
        }
    }

    SCHED_STAT_CRANK(credit_reset);

    /* No need to resort runqueue, as everyone's order should be the same. */
}

void burn_credits(struct csched2_runqueue_data *rqd,
                  struct csched2_vcpu *svc, s_time_t now)
{
    s_time_t delta;

    ASSERT(svc == csched2_vcpu(curr_on_cpu(svc->vcpu->processor)));

    if ( unlikely(is_idle_vcpu(svc->vcpu)) )
    {
        ASSERT(svc->credit == CSCHED2_IDLE_CREDIT);
        return;
    }

    delta = now - svc->start_time;

    if ( unlikely(delta <= 0) )
    {
        if ( unlikely(delta < 0) )
            d2printk("WARNING: %s: Time went backwards? now %"PRI_stime
                     " start_time %"PRI_stime"\n", __func__, now,
                     svc->start_time);
        goto out;
    }

    SCHED_STAT_CRANK(burn_credits_t2c);
    t2c_update(rqd, delta, svc);

    if ( has_cap(svc) )
        svc->budget -= delta;

    svc->start_time = now;

 out:
    if ( unlikely(tb_init_done) )
    {
        struct {
            unsigned vcpu:16, dom:16;
            int credit, budget;
            int delta;
        } d;
        d.dom = svc->vcpu->domain->domain_id;
        d.vcpu = svc->vcpu->vcpu_id;
        d.credit = svc->credit;
        d.budget = has_cap(svc) ?  svc->budget : INT_MIN;
        d.delta = delta;
        __trace_var(TRC_CSCHED2_CREDIT_BURN, 1,
                    sizeof(d),
                    (unsigned char *)&d);
    }
}

/*
 * Budget-related code.
 */

static void park_vcpu(struct csched2_vcpu *svc)
{
    struct vcpu *v = svc->vcpu;

    ASSERT(spin_is_locked(&svc->sdom->budget_lock));

    /*
     * It was impossible to find budget for this vCPU, so it has to be
     * "parked". This implies it is not runnable, so we mark it as such in
     * its pause_flags. If the vCPU is currently scheduled (which means we
     * are here after being called from within csched_schedule()), flagging
     * is enough, as we'll choose someone else, and then context_saved()
     * will take care of updating the load properly.
     *
     * If, OTOH, the vCPU is sitting in the runqueue (which means we are here
     * after being called from within runq_candidate()), we must go all the
     * way down to taking it out of there, and updating the load accordingly.
     *
     * In both cases, we also add it to the list of parked vCPUs of the domain.
     */
    __set_bit(_VPF_parked, &v->pause_flags);
    if ( vcpu_on_runq(svc) )
    {
        runq_remove(svc);
        update_load(svc->sdom->dom->cpupool->sched, svc->rqd, svc, -1, NOW());
    }
    list_add(&svc->parked_elem, &svc->sdom->parked_vcpus);
}

static bool vcpu_grab_budget(struct csched2_vcpu *svc)
{
    struct csched2_dom *sdom = svc->sdom;
    unsigned int cpu = svc->vcpu->processor;

    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));

    if ( svc->budget > 0 )
        return true;

    /* budget_lock nests inside runqueue lock. */
    spin_lock(&sdom->budget_lock);

    /*
     * Here, svc->budget is <= 0 (as, if it was > 0, we'd have taken the if
     * above!). That basically means the vCPU has overrun a bit --because of
     * various reasons-- and we want to take that into account. With the +=,
     * we are actually subtracting the amount of budget the vCPU has
     * overconsumed, from the total domain budget.
     */
    sdom->budget += svc->budget;

    if ( sdom->budget > 0 )
    {
        s_time_t budget;

        /* Get our quota, if there's at least as much budget */
        if ( likely(sdom->budget >= svc->budget_quota) )
            budget = svc->budget_quota;
        else
            budget = sdom->budget;

        svc->budget = budget;
        sdom->budget -= budget;
    }
    else
    {
        svc->budget = 0;
        park_vcpu(svc);
    }

    spin_unlock(&sdom->budget_lock);

    return svc->budget > 0;
}

static void
vcpu_return_budget(struct csched2_vcpu *svc, struct list_head *parked)
{
    struct csched2_dom *sdom = svc->sdom;
    unsigned int cpu = svc->vcpu->processor;

    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
    ASSERT(list_empty(parked));

    /* budget_lock nests inside runqueue lock. */
    spin_lock(&sdom->budget_lock);

    /*
     * The vCPU is stopping running (e.g., because it's blocking, or it has
     * been preempted). If it hasn't consumed all the budget it got when,
     * starting to run, put that remaining amount back in the domain's budget
     * pool.
     */
    sdom->budget += svc->budget;
    svc->budget = 0;

    /*
     * Making budget available again to the domain means that parked vCPUs
     * may be unparked and run. They are, if any, in the domain's parked_vcpus
     * list, so we want to go through that and unpark them (so they can try
     * to get some budget).
     *
     * Touching the list requires the budget_lock, which we hold. Let's
     * therefore put everyone in that list in another, temporary list, which
     * then the caller will traverse, unparking the vCPUs it finds there.
     *
     * In fact, we can't do the actual unparking here, because that requires
     * taking the runqueue lock of the vCPUs being unparked, and we can't
     * take any runqueue locks while we hold a budget_lock.
     */
    if ( sdom->budget > 0 )
        list_splice_init(&sdom->parked_vcpus, parked);

    spin_unlock(&sdom->budget_lock);
}

static void
unpark_parked_vcpus(const struct scheduler *ops, struct list_head *vcpus)
{
    struct csched2_vcpu *svc, *tmp;
    spinlock_t *lock;

    list_for_each_entry_safe(svc, tmp, vcpus, parked_elem)
    {
        unsigned long flags;
        s_time_t now;

        lock = vcpu_schedule_lock_irqsave(svc->vcpu, &flags);

        __clear_bit(_VPF_parked, &svc->vcpu->pause_flags);
        if ( unlikely(svc->flags & CSFLAG_scheduled) )
        {
            /*
             * We end here if a budget replenishment arrived between
             * csched2_schedule() (and, in particular, after a call to
             * vcpu_grab_budget() that returned false), and
             * context_saved(). By setting __CSFLAG_delayed_runq_add,
             * we tell context_saved() to put the vCPU back in the
             * runqueue, from where it will compete with the others
             * for the newly replenished budget.
             */
            ASSERT( svc->rqd != NULL );
            ASSERT( c2rqd(ops, svc->vcpu->processor) == svc->rqd );
            __set_bit(__CSFLAG_delayed_runq_add, &svc->flags);
        }
        else if ( vcpu_runnable(svc->vcpu) )
        {
            /*
             * The vCPU should go back to the runqueue, and compete for
             * the newly replenished budget, but only if it is actually
             * runnable (and was therefore offline only because of the
             * lack of budget).
             */
            now = NOW();
            update_load(ops, svc->rqd, svc, 1, now);
            runq_insert(ops, svc);
            runq_tickle(ops, svc, now);
        }
        list_del_init(&svc->parked_elem);

        vcpu_schedule_unlock_irqrestore(lock, flags, svc->vcpu);
    }
}

static inline void do_replenish(struct csched2_dom *sdom)
{
    sdom->next_repl += CSCHED2_BDGT_REPL_PERIOD;
    sdom->budget += sdom->tot_budget;
}

static void replenish_domain_budget(void* data)
{
    struct csched2_dom *sdom = data;
    unsigned long flags;
    s_time_t now;
    LIST_HEAD(parked);

    spin_lock_irqsave(&sdom->budget_lock, flags);

    now = NOW();

    /*
     * Let's do the replenishment. Note, though, that a domain may overrun,
     * which means the budget would have gone below 0 (reasons may be system
     * overbooking, accounting issues, etc.). It also may happen that we are
     * handling the replenishment (much) later than we should (reasons may
     * again be overbooking, or issues with timers).
     *
     * Even in cases of overrun or delay, however, we expect that in 99% of
     * cases, doing just one replenishment will be good enough for being able
     * to unpark the vCPUs that are waiting for some budget.
     */
    do_replenish(sdom);

    /*
     * And now, the special cases:
     * 1) if we are late enough to have skipped (at least) one full period,
     * what we must do is doing more replenishments. Note that, however,
     * every time we add tot_budget to the budget, we also move next_repl
     * away by CSCHED2_BDGT_REPL_PERIOD, to make sure the cap is always
     * respected.
     */
    if ( unlikely(sdom->next_repl <= now) )
    {
        do
            do_replenish(sdom);
        while ( sdom->next_repl <= now );
    }
    /*
     * 2) if we overrun by more than tot_budget, then budget+tot_budget is
     * still < 0, which means that we can't unpark the vCPUs. Let's bail,
     * and wait for future replenishments.
     */
    if ( unlikely(sdom->budget <= 0) )
    {
        spin_unlock_irqrestore(&sdom->budget_lock, flags);
        goto out;
    }

    /* Since we do more replenishments, make sure we didn't overshot. */
    sdom->budget = min(sdom->budget, sdom->tot_budget);

    /*
     * As above, let's prepare the temporary list, out of the domain's
     * parked_vcpus list, now that we hold the budget_lock. Then, drop such
     * lock, and pass the list to the unparking function.
     */
    list_splice_init(&sdom->parked_vcpus, &parked);

    spin_unlock_irqrestore(&sdom->budget_lock, flags);

    unpark_parked_vcpus(sdom->dom->cpupool->sched, &parked);

 out:
    set_timer(sdom->repl_timer, sdom->next_repl);
}

#ifndef NDEBUG
static inline void
csched2_vcpu_check(struct vcpu *vc)
{
    struct csched2_vcpu * const svc = csched2_vcpu(vc);
    struct csched2_dom * const sdom = svc->sdom;

    BUG_ON( svc->vcpu != vc );
    BUG_ON( sdom != csched2_dom(vc->domain) );
    if ( sdom )
    {
        BUG_ON( is_idle_vcpu(vc) );
        BUG_ON( sdom->dom != vc->domain );
    }
    else
    {
        BUG_ON( !is_idle_vcpu(vc) );
    }
    SCHED_STAT_CRANK(vcpu_check);
}
#define CSCHED2_VCPU_CHECK(_vc)  (csched2_vcpu_check(_vc))
#else
#define CSCHED2_VCPU_CHECK(_vc)
#endif

static void *
csched2_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
{
    struct csched2_vcpu *svc;

    /* Allocate per-VCPU info */
    svc = xzalloc(struct csched2_vcpu);
    if ( svc == NULL )
        return NULL;

    INIT_LIST_HEAD(&svc->rqd_elem);
    INIT_LIST_HEAD(&svc->runq_elem);

    svc->sdom = dd;
    svc->vcpu = vc;
    svc->flags = 0U;

    if ( ! is_idle_vcpu(vc) )
    {
        ASSERT(svc->sdom != NULL);
        svc->credit = CSCHED2_CREDIT_INIT;
        svc->weight = svc->sdom->weight;
        /* Starting load of 50% */
        svc->avgload = 1ULL << (csched2_priv(ops)->load_precision_shift - 1);
        svc->load_last_update = NOW() >> LOADAVG_GRANULARITY_SHIFT;
    }
    else
    {
        ASSERT(svc->sdom == NULL);
        svc->credit = CSCHED2_IDLE_CREDIT;
        svc->weight = 0;
    }
    svc->tickled_cpu = -1;

    svc->budget = STIME_MAX;
    svc->budget_quota = 0;
    INIT_LIST_HEAD(&svc->parked_elem);

    SCHED_STAT_CRANK(vcpu_alloc);

    return svc;
}

static void
csched2_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched2_vcpu * const svc = csched2_vcpu(vc);

    ASSERT(!is_idle_vcpu(vc));
    SCHED_STAT_CRANK(vcpu_sleep);

    if ( curr_on_cpu(vc->processor) == vc )
    {
        tickle_cpu(vc->processor, svc->rqd);
    }
    else if ( vcpu_on_runq(svc) )
    {
        ASSERT(svc->rqd == c2rqd(ops, vc->processor));
        update_load(ops, svc->rqd, svc, -1, NOW());
        runq_remove(svc);
    }
    else if ( svc->flags & CSFLAG_delayed_runq_add )
        __clear_bit(__CSFLAG_delayed_runq_add, &svc->flags);
}

static void
csched2_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched2_vcpu * const svc = csched2_vcpu(vc);
    unsigned int cpu = vc->processor;
    s_time_t now;

    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));

    ASSERT(!is_idle_vcpu(vc));

    if ( unlikely(curr_on_cpu(cpu) == vc) )
    {
        SCHED_STAT_CRANK(vcpu_wake_running);
        goto out;
    }

    if ( unlikely(vcpu_on_runq(svc)) )
    {
        SCHED_STAT_CRANK(vcpu_wake_onrunq);
        goto out;
    }

    if ( likely(vcpu_runnable(vc)) )
        SCHED_STAT_CRANK(vcpu_wake_runnable);
    else
        SCHED_STAT_CRANK(vcpu_wake_not_runnable);

    /* If the context hasn't been saved for this vcpu yet, we can't put it on
     * another runqueue.  Instead, we set a flag so that it will be put on the runqueue
     * after the context has been saved. */
    if ( unlikely(svc->flags & CSFLAG_scheduled) )
    {
        __set_bit(__CSFLAG_delayed_runq_add, &svc->flags);
        goto out;
    }

    /* Add into the new runqueue if necessary */
    if ( svc->rqd == NULL )
        runq_assign(ops, vc);
    else
        ASSERT(c2rqd(ops, vc->processor) == svc->rqd );

    now = NOW();

    update_load(ops, svc->rqd, svc, 1, now);
        
    /* Put the VCPU on the runq */
    runq_insert(ops, svc);
    runq_tickle(ops, svc, now);

out:
    return;
}

static void
csched2_vcpu_yield(const struct scheduler *ops, struct vcpu *v)
{
    struct csched2_vcpu * const svc = csched2_vcpu(v);

    __set_bit(__CSFLAG_vcpu_yield, &svc->flags);
}

static void
csched2_context_saved(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched2_vcpu * const svc = csched2_vcpu(vc);
    spinlock_t *lock = vcpu_schedule_lock_irq(vc);
    s_time_t now = NOW();
    LIST_HEAD(were_parked);

    BUG_ON( !is_idle_vcpu(vc) && svc->rqd != c2rqd(ops, vc->processor));
    ASSERT(is_idle_vcpu(vc) || svc->rqd == c2rqd(ops, vc->processor));

    /* This vcpu is now eligible to be put on the runqueue again */
    __clear_bit(__CSFLAG_scheduled, &svc->flags);

    if ( unlikely(has_cap(svc) && svc->budget > 0) )
        vcpu_return_budget(svc, &were_parked);

    /* If someone wants it on the runqueue, put it there. */
    /*
     * NB: We can get rid of CSFLAG_scheduled by checking for
     * vc->is_running and vcpu_on_runq(svc) here.  However,
     * since we're accessing the flags cacheline anyway,
     * it seems a bit pointless; especially as we have plenty of
     * bits free.
     */
    if ( __test_and_clear_bit(__CSFLAG_delayed_runq_add, &svc->flags)
         && likely(vcpu_runnable(vc)) )
    {
        ASSERT(!vcpu_on_runq(svc));

        runq_insert(ops, svc);
        runq_tickle(ops, svc, now);
    }
    else if ( !is_idle_vcpu(vc) )
        update_load(ops, svc->rqd, svc, -1, now);

    vcpu_schedule_unlock_irq(lock, vc);

    unpark_parked_vcpus(ops, &were_parked);
}

#define MAX_LOAD (STIME_MAX)
static int
csched2_cpu_pick(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched2_private *prv = csched2_priv(ops);
    int i, min_rqi = -1, min_s_rqi = -1;
    unsigned int new_cpu, cpu = vc->processor;
    struct csched2_vcpu *svc = csched2_vcpu(vc);
    s_time_t min_avgload = MAX_LOAD, min_s_avgload = MAX_LOAD;
    bool has_soft;

    ASSERT(!cpumask_empty(&prv->active_queues));

    SCHED_STAT_CRANK(pick_cpu);

    /* Locking:
     * - Runqueue lock of vc->processor is already locked
     * - Need to grab prv lock to make sure active runqueues don't
     *   change
     * - Need to grab locks for other runqueues while checking
     *   avgload
     * Locking constraint is:
     * - Lock prv before runqueue locks
     * - Trylock between runqueue locks (no ordering)
     *
     * Since one of the runqueue locks is already held, we can't
     * just grab the prv lock.  Instead, we'll have to trylock, and
     * do something else reasonable if we fail.
     */
    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));

    if ( !read_trylock(&prv->lock) )
    {
        /* We may be here because someone requested us to migrate. */
        __clear_bit(__CSFLAG_runq_migrate_request, &svc->flags);
        new_cpu = get_fallback_cpu(svc);
        /*
         * Tracing of runq and its load won't be accurate, since we could
         * not get the lock, but at least we will output the chosen pcpu.
         */
        goto out;
    }

    cpumask_and(cpumask_scratch_cpu(cpu), vc->cpu_hard_affinity,
                cpupool_domain_cpumask(vc->domain));

    /*
     * First check to see if we're here because someone else suggested a place
     * for us to move.
     */
    if ( __test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) )
    {
        if ( unlikely(svc->migrate_rqd->id < 0) )
        {
            printk(XENLOG_WARNING "%s: target runqueue disappeared!\n",
                   __func__);
        }
        else if ( cpumask_intersects(cpumask_scratch_cpu(cpu),
                                     &svc->migrate_rqd->active) )
        {
            /*
             * If we've been asked to move to migrate_rqd, we should just do
             * that, which we actually do by returning one cpu from that runq.
             * There is no need to take care of soft affinity, as that will
             * happen in runq_tickle().
             */
            cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
                        &svc->migrate_rqd->active);
            new_cpu = cpumask_cycle(svc->migrate_rqd->pick_bias,
                                    cpumask_scratch_cpu(cpu));

            svc->migrate_rqd->pick_bias = new_cpu;
            goto out_up;
        }
        /* Fall-through to normal cpu pick */
    }

    /*
     * What we want is:
     *  - if we have soft affinity, the runqueue with the lowest average
     *    load, among the ones that contain cpus in our soft affinity; this
     *    represents the best runq on which we would want to run.
     *  - the runqueue with the lowest average load among the ones that
     *    contains cpus in our hard affinity; this represent the best runq
     *    on which we can run.
     *
     * Find both runqueues in one pass.
     */
    has_soft = has_soft_affinity(vc, vc->cpu_hard_affinity);
    for_each_cpu(i, &prv->active_queues)
    {
        struct csched2_runqueue_data *rqd;
        s_time_t rqd_avgload = MAX_LOAD;

        rqd = prv->rqd + i;

        /*
         * If none of the cpus of this runqueue is in svc's hard-affinity,
         * skip the runqueue.
         *
         * Note that, in case svc's hard-affinity has changed, this is the
         * first time when we see such change, so it is indeed possible
         * that we end up skipping svc's current runqueue.
         */
        if ( !cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active) )
            continue;

        /*
         * If checking a different runqueue, grab the lock, read the avg,
         * and then release the lock.
         *
         * If on our own runqueue, don't grab or release the lock;
         * but subtract our own load from the runqueue load to simulate
         * impartiality.
         */
        if ( rqd == svc->rqd )
        {
            rqd_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0);
        }
        else if ( spin_trylock(&rqd->lock) )
        {
            rqd_avgload = rqd->b_avgload;
            spin_unlock(&rqd->lock);
        }

        /*
         * if svc has a soft-affinity, and some cpus of rqd are part of it,
         * see if we need to update the "soft-affinity minimum".
         */
        if ( has_soft &&
             rqd_avgload < min_s_avgload )
        {
            cpumask_t mask;

            cpumask_and(&mask, cpumask_scratch_cpu(cpu), &rqd->active);
            if ( cpumask_intersects(&mask, svc->vcpu->cpu_soft_affinity) )
            {
                min_s_avgload = rqd_avgload;
                min_s_rqi = i;
            }
        }
        /* In any case, keep the "hard-affinity minimum" updated too. */
        if ( rqd_avgload < min_avgload )
        {
            min_avgload = rqd_avgload;
            min_rqi = i;
        }
    }

    if ( has_soft && min_s_rqi != -1 )
    {
        /*
         * We have soft affinity, and we have a candidate runq, so go for it.
         *
         * Note that, to obtain the soft-affinity mask, we "just" put what we
         * have in cpumask_scratch in && with vc->cpu_soft_affinity. This is
         * ok because:
         * - we know that vc->cpu_hard_affinity and vc->cpu_soft_affinity have
         *   a non-empty intersection (because has_soft is true);
         * - we have vc->cpu_hard_affinity & cpupool_domain_cpumask() already
         *   in cpumask_scratch, we do save a lot doing like this.
         *
         * It's kind of like open coding affinity_balance_cpumask() but, in
         * this specific case, calling that would mean a lot of (unnecessary)
         * cpumask operations.
         */
        cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
                    vc->cpu_soft_affinity);
        cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
                    &prv->rqd[min_s_rqi].active);
    }
    else if ( min_rqi != -1 )
    {
        /*
         * Either we don't have soft-affinity, or we do, but we did not find
         * any suitable runq. But we did find one when considering hard
         * affinity, so go for it.
         *
         * cpumask_scratch already has vc->cpu_hard_affinity &
         * cpupool_domain_cpumask() in it, so it's enough that we filter
         * with the cpus of the runq.
         */
        cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
                    &prv->rqd[min_rqi].active);
    }
    else
    {
        /*
         * We didn't find anyone at all (most likely because of spinlock
         * contention).
         */
        new_cpu = get_fallback_cpu(svc);
        min_rqi = c2r(new_cpu);
        min_avgload = prv->rqd[min_rqi].b_avgload;
        goto out_up;
    }

    new_cpu = cpumask_cycle(prv->rqd[min_rqi].pick_bias,
                            cpumask_scratch_cpu(cpu));
    prv->rqd[min_rqi].pick_bias = new_cpu;
    BUG_ON(new_cpu >= nr_cpu_ids);

 out_up:
    read_unlock(&prv->lock);
 out:
    if ( unlikely(tb_init_done) )
    {
        struct {
            uint64_t b_avgload;
            unsigned vcpu:16, dom:16;
            unsigned rq_id:16, new_cpu:16;
        } d;
        d.dom = vc->domain->domain_id;
        d.vcpu = vc->vcpu_id;
        d.rq_id = min_rqi;
        d.b_avgload = min_avgload;
        d.new_cpu = new_cpu;
        __trace_var(TRC_CSCHED2_PICKED_CPU, 1,
                    sizeof(d),
                    (unsigned char *)&d);
    }

    return new_cpu;
}

/* Working state of the load-balancing algorithm */
typedef struct {
    /* NB: Modified by consider() */
    s_time_t load_delta;
    struct csched2_vcpu * best_push_svc, *best_pull_svc;
    /* NB: Read by consider() */
    struct csched2_runqueue_data *lrqd;
    struct csched2_runqueue_data *orqd;                  
} balance_state_t;

static void consider(balance_state_t *st, 
                     struct csched2_vcpu *push_svc,
                     struct csched2_vcpu *pull_svc)
{
    s_time_t l_load, o_load, delta;

    l_load = st->lrqd->b_avgload;
    o_load = st->orqd->b_avgload;
    if ( push_svc )
    {
        /* What happens to the load on both if we push? */
        l_load -= push_svc->avgload;
        o_load += push_svc->avgload;
    }
    if ( pull_svc )
    {
        /* What happens to the load on both if we pull? */
        l_load += pull_svc->avgload;
        o_load -= pull_svc->avgload;
    }

    delta = l_load - o_load;
    if ( delta < 0 )
        delta = -delta;

    if ( delta < st->load_delta )
    {
        st->load_delta = delta;
        st->best_push_svc=push_svc;
        st->best_pull_svc=pull_svc;
    }
}


static void migrate(const struct scheduler *ops,
                    struct csched2_vcpu *svc, 
                    struct csched2_runqueue_data *trqd, 
                    s_time_t now)
{
    int cpu = svc->vcpu->processor;

    if ( unlikely(tb_init_done) )
    {
        struct {
            unsigned vcpu:16, dom:16;
            unsigned rqi:16, trqi:16;
        } d;
        d.dom = svc->vcpu->domain->domain_id;
        d.vcpu = svc->vcpu->vcpu_id;
        d.rqi = svc->rqd->id;
        d.trqi = trqd->id;
        __trace_var(TRC_CSCHED2_MIGRATE, 1,
                    sizeof(d),
                    (unsigned char *)&d);
    }

    if ( svc->flags & CSFLAG_scheduled )
    {
        /* It's running; mark it to migrate. */
        svc->migrate_rqd = trqd;
        __set_bit(_VPF_migrating, &svc->vcpu->pause_flags);
        __set_bit(__CSFLAG_runq_migrate_request, &svc->flags);
        SCHED_STAT_CRANK(migrate_requested);
        tickle_cpu(cpu, svc->rqd);
    }
    else
    {
        int on_runq = 0;
        /* It's not running; just move it */
        if ( vcpu_on_runq(svc) )
        {
            runq_remove(svc);
            update_load(ops, svc->rqd, NULL, -1, now);
            on_runq = 1;
        }
        _runq_deassign(svc);

        cpumask_and(cpumask_scratch_cpu(cpu), svc->vcpu->cpu_hard_affinity,
                    cpupool_domain_cpumask(svc->vcpu->domain));
        cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
                    &trqd->active);
        svc->vcpu->processor = cpumask_cycle(trqd->pick_bias,
                                             cpumask_scratch_cpu(cpu));
        trqd->pick_bias = svc->vcpu->processor;
        ASSERT(svc->vcpu->processor < nr_cpu_ids);

        _runq_assign(svc, trqd);
        if ( on_runq )
        {
            update_load(ops, svc->rqd, NULL, 1, now);
            runq_insert(ops, svc);
            runq_tickle(ops, svc, now);
            SCHED_STAT_CRANK(migrate_on_runq);
        }
        else
            SCHED_STAT_CRANK(migrate_no_runq);
    }
}

/*
 * It makes sense considering migrating svc to rqd, if:
 *  - svc is not already flagged to migrate,
 *  - if svc is allowed to run on at least one of the pcpus of rqd.
 */
static bool vcpu_is_migrateable(struct csched2_vcpu *svc,
                                  struct csched2_runqueue_data *rqd)
{
    struct vcpu *v = svc->vcpu;
    int cpu = svc->vcpu->processor;

    cpumask_and(cpumask_scratch_cpu(cpu), v->cpu_hard_affinity,
                cpupool_domain_cpumask(v->domain));

    return !(svc->flags & CSFLAG_runq_migrate_request) &&
           cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active);
}

static void balance_load(const struct scheduler *ops, int cpu, s_time_t now)
{
    struct csched2_private *prv = csched2_priv(ops);
    int i, max_delta_rqi = -1;
    struct list_head *push_iter, *pull_iter;
    bool inner_load_updated = 0;

    balance_state_t st = { .best_push_svc = NULL, .best_pull_svc = NULL };

    /*
     * Basic algorithm: Push, pull, or swap.
     * - Find the runqueue with the furthest load distance
     * - Find a pair that makes the difference the least (where one
     * on either side may be empty).
     */

    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
    st.lrqd = c2rqd(ops, cpu);

    update_runq_load(ops, st.lrqd, 0, now);

retry:
    if ( !read_trylock(&prv->lock) )
        return;

    st.load_delta = 0;

    for_each_cpu(i, &prv->active_queues)
    {
        s_time_t delta;
        
        st.orqd = prv->rqd + i;

        if ( st.orqd == st.lrqd
             || !spin_trylock(&st.orqd->lock) )
            continue;

        update_runq_load(ops, st.orqd, 0, now);
    
        delta = st.lrqd->b_avgload - st.orqd->b_avgload;
        if ( delta < 0 )
            delta = -delta;

        if ( delta > st.load_delta )
        {
            st.load_delta = delta;
            max_delta_rqi = i;
        }

        spin_unlock(&st.orqd->lock);
    }

    /* Minimize holding the private scheduler lock. */
    read_unlock(&prv->lock);
    if ( max_delta_rqi == -1 )
        goto out;

    {
        s_time_t load_max;
        int cpus_max;

        
        load_max = st.lrqd->b_avgload;
        if ( st.orqd->b_avgload > load_max )
            load_max = st.orqd->b_avgload;

        cpus_max = cpumask_weight(&st.lrqd->active);
        i = cpumask_weight(&st.orqd->active);
        if ( i > cpus_max )
            cpus_max = i;

        if ( unlikely(tb_init_done) )
        {
            struct {
                unsigned lrq_id:16, orq_id:16;
                unsigned load_delta;
            } d;
            d.lrq_id = st.lrqd->id;
            d.orq_id = st.orqd->id;
            d.load_delta = st.load_delta;
            __trace_var(TRC_CSCHED2_LOAD_CHECK, 1,
                        sizeof(d),
                        (unsigned char *)&d);
        }

        /*
         * If we're under 100% capacaty, only shift if load difference
         * is > 1.  otherwise, shift if under 12.5%
         */
        if ( load_max < ((s_time_t)cpus_max << prv->load_precision_shift) )
        {
            if ( st.load_delta < (1ULL << (prv->load_precision_shift +
                                           opt_underload_balance_tolerance)) )
                 goto out;
        }
        else
            if ( st.load_delta < (1ULL << (prv->load_precision_shift +
                                           opt_overload_balance_tolerance)) )
                goto out;
    }
             
    /* Try to grab the other runqueue lock; if it's been taken in the
     * meantime, try the process over again.  This can't deadlock
     * because if it doesn't get any other rqd locks, it will simply
     * give up and return. */
    st.orqd = prv->rqd + max_delta_rqi;
    if ( !spin_trylock(&st.orqd->lock) )
        goto retry;

    /* Make sure the runqueue hasn't been deactivated since we released prv->lock */
    if ( unlikely(st.orqd->id < 0) )
        goto out_up;

    if ( unlikely(tb_init_done) )
    {
        struct {
            uint64_t lb_avgload, ob_avgload;
            unsigned lrq_id:16, orq_id:16;
        } d;
        d.lrq_id = st.lrqd->id;
        d.lb_avgload = st.lrqd->b_avgload;
        d.orq_id = st.orqd->id;
        d.ob_avgload = st.orqd->b_avgload;
        __trace_var(TRC_CSCHED2_LOAD_BALANCE, 1,
                    sizeof(d),
                    (unsigned char *)&d);
    }

    SCHED_STAT_CRANK(acct_load_balance);

    /* Look for "swap" which gives the best load average
     * FIXME: O(n^2)! */

    /* Reuse load delta (as we're trying to minimize it) */
    list_for_each( push_iter, &st.lrqd->svc )
    {
        struct csched2_vcpu * push_svc = list_entry(push_iter, struct csched2_vcpu, rqd_elem);

        update_svc_load(ops, push_svc, 0, now);

        if ( !vcpu_is_migrateable(push_svc, st.orqd) )
            continue;

        list_for_each( pull_iter, &st.orqd->svc )
        {
            struct csched2_vcpu * pull_svc = list_entry(pull_iter, struct csched2_vcpu, rqd_elem);
            
            if ( !inner_load_updated )
                update_svc_load(ops, pull_svc, 0, now);
        
            if ( !vcpu_is_migrateable(pull_svc, st.lrqd) )
                continue;

            consider(&st, push_svc, pull_svc);
        }

        inner_load_updated = 1;

        /* Consider push only */
        consider(&st, push_svc, NULL);
    }

    list_for_each( pull_iter, &st.orqd->svc )
    {
        struct csched2_vcpu * pull_svc = list_entry(pull_iter, struct csched2_vcpu, rqd_elem);
        
        if ( !vcpu_is_migrateable(pull_svc, st.lrqd) )
            continue;

        /* Consider pull only */
        consider(&st, NULL, pull_svc);
    }

    /* OK, now we have some candidates; do the moving */
    if ( st.best_push_svc )
        migrate(ops, st.best_push_svc, st.orqd, now);
    if ( st.best_pull_svc )
        migrate(ops, st.best_pull_svc, st.lrqd, now);

 out_up:
    spin_unlock(&st.orqd->lock);
 out:
    return;
}

static void
csched2_vcpu_migrate(
    const struct scheduler *ops, struct vcpu *vc, unsigned int new_cpu)
{
    struct domain *d = vc->domain;
    struct csched2_vcpu * const svc = csched2_vcpu(vc);
    struct csched2_runqueue_data *trqd;
    s_time_t now = NOW();

    /*
     * Being passed a target pCPU which is outside of our cpupool is only
     * valid if we are shutting down (or doing ACPI suspend), and we are
     * moving everyone to BSP, no matter whether or not BSP is inside our
     * cpupool.
     *
     * And since there indeed is the chance that it is not part of it, all
     * we must do is remove _and_ unassign the vCPU from any runqueue, as
     * well as updating v->processor with the target, so that the suspend
     * process can continue.
     *
     * It will then be during resume that a new, meaningful, value for
     * v->processor will be chosen, and during actual domain unpause that
     * the vCPU will be assigned to and added to the proper runqueue.
     */
    if ( unlikely(!cpumask_test_cpu(new_cpu, cpupool_domain_cpumask(d))) )
    {
        ASSERT(system_state == SYS_STATE_suspend);
        if ( vcpu_on_runq(svc) )
        {
            runq_remove(svc);
            update_load(ops, svc->rqd, NULL, -1, now);
        }
        _runq_deassign(svc);
        vc->processor = new_cpu;
        return;
    }

    /* If here, new_cpu must be a valid Credit2 pCPU, and in our affinity. */
    ASSERT(cpumask_test_cpu(new_cpu, &csched2_priv(ops)->initialized));
    ASSERT(cpumask_test_cpu(new_cpu, vc->cpu_hard_affinity));

    trqd = c2rqd(ops, new_cpu);

    /*
     * Do the actual movement toward new_cpu, and update vc->processor.
     * If we are changing runqueue, migrate() takes care of everything.
     * If we are not changing runqueue, we need to update vc->processor
     * here. In fact, if, for instance, we are here because the vcpu's
     * hard affinity changed, we don't want to risk leaving vc->processor
     * pointing to a pcpu where we can't run any longer.
     */
    if ( trqd != svc->rqd )
        migrate(ops, svc, trqd, now);
    else
        vc->processor = new_cpu;
}

static int
csched2_dom_cntl(
    const struct scheduler *ops,
    struct domain *d,
    struct xen_domctl_scheduler_op *op)
{
    struct csched2_dom * const sdom = csched2_dom(d);
    struct csched2_private *prv = csched2_priv(ops);
    unsigned long flags;
    struct vcpu *v;
    int rc = 0;

    /*
     * Locking:
     *  - we must take the private lock for accessing the weights of the
     *    vcpus of d, and/or the cap;
     *  - in the putinfo case, we also need the runqueue lock(s), for
     *    updating the max waight of the runqueue(s).
     *    If changing the cap, we also need the budget_lock, for updating
     *    the value of the domain budget pool (and the runqueue lock,
     *    for adjusting the parameters and rescheduling any vCPU that is
     *    running at the time of the change).
     */
    switch ( op->cmd )
    {
    case XEN_DOMCTL_SCHEDOP_getinfo:
        read_lock_irqsave(&prv->lock, flags);
        op->u.credit2.weight = sdom->weight;
        op->u.credit2.cap = sdom->cap;
        read_unlock_irqrestore(&prv->lock, flags);
        break;
    case XEN_DOMCTL_SCHEDOP_putinfo:
        write_lock_irqsave(&prv->lock, flags);
        /* Weight */
        if ( op->u.credit2.weight != 0 )
        {
            int old_weight;

            old_weight = sdom->weight;

            sdom->weight = op->u.credit2.weight;

            /* Update weights for vcpus, and max_weight for runqueues on which they reside */
            for_each_vcpu ( d, v )
            {
                struct csched2_vcpu *svc = csched2_vcpu(v);
                spinlock_t *lock = vcpu_schedule_lock(svc->vcpu);

                ASSERT(svc->rqd == c2rqd(ops, svc->vcpu->processor));

                svc->weight = sdom->weight;
                update_max_weight(svc->rqd, svc->weight, old_weight);

                vcpu_schedule_unlock(lock, svc->vcpu);
            }
        }
        /* Cap */
        if ( op->u.credit2.cap != 0 )
        {
            struct csched2_vcpu *svc;
            spinlock_t *lock;

            /* Cap is only valid if it's below 100 * nr_of_vCPUS */
            if ( op->u.credit2.cap > 100 * sdom->nr_vcpus )
            {
                rc = -EINVAL;
                write_unlock_irqrestore(&prv->lock, flags);
                break;
            }

            spin_lock(&sdom->budget_lock);
            sdom->tot_budget = (CSCHED2_BDGT_REPL_PERIOD * op->u.credit2.cap);
            sdom->tot_budget /= 100;
            spin_unlock(&sdom->budget_lock);

            /*
             * When trying to get some budget and run, each vCPU will grab
             * from the pool 1/N (with N = nr of vCPUs of the domain) of
             * the total budget. Roughly speaking, this means each vCPU will
             * have at least one chance to run during every period.
             */
            for_each_vcpu ( d, v )
            {
                svc = csched2_vcpu(v);
                lock = vcpu_schedule_lock(svc->vcpu);
                /*
                 * Too small quotas would in theory cause a lot of overhead,
                 * which then won't happen because, in csched2_runtime(),
                 * CSCHED2_MIN_TIMER is what would be used anyway.
                 */
                svc->budget_quota = max(sdom->tot_budget / sdom->nr_vcpus,
                                        CSCHED2_MIN_TIMER);
                vcpu_schedule_unlock(lock, svc->vcpu);
            }

            if ( sdom->cap == 0 )
            {
                /*
                 * We give to the domain the budget to which it is entitled,
                 * and queue its first replenishment event.
                 *
                 * Since cap is currently disabled for this domain, we
                 * know no vCPU is messing with the domain's budget, and
                 * the replenishment timer is still off.
                 * For these reasons, it is safe to do the following without
                 * taking the budget_lock.
                 */
                sdom->budget = sdom->tot_budget;
                sdom->next_repl = NOW() + CSCHED2_BDGT_REPL_PERIOD;
                set_timer(sdom->repl_timer, sdom->next_repl);

                /*
                 * Now, let's enable budget accounting for all the vCPUs.
                 * For making sure that they will start to honour the domain's
                 * cap, we set their budget to 0.
                 * This way, as soon as they will try to run, they will have
                 * to get some budget.
                 *
                 * For the vCPUs that are already running, we trigger the
                 * scheduler on their pCPU. When, as a consequence of this,
                 * csched2_schedule() will run, it will figure out there is
                 * no budget, and the vCPU will try to get some (and be parked,
                 * if there's none, and we'll switch to someone else).
                 */
                for_each_vcpu ( d, v )
                {
                    svc = csched2_vcpu(v);
                    lock = vcpu_schedule_lock(svc->vcpu);
                    if ( v->is_running )
                    {
                        unsigned int cpu = v->processor;
                        struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);

                        ASSERT(curr_on_cpu(cpu) == v);

                        /*
                         * We are triggering a reschedule on the vCPU's
                         * pCPU. That will run burn_credits() and, since
                         * the vCPU is capped now, it would charge all the
                         * execution time of this last round as budget as
                         * well. That will make the vCPU budget go negative,
                         * potentially by a large amount, and it's unfair.
                         *
                         * To avoid that, call burn_credit() here, to do the
                         * accounting of this current running instance now,
                         * with budgetting still disabled. This does not
                         * prevent some small amount of budget being charged
                         * to the vCPU (i.e., the amount of time it runs from
                         * now, to when scheduling happens). The budget will
                         * also go below 0, but a lot less than how it would
                         * if we don't do this.
                         */
                        burn_credits(rqd, svc, NOW());
                        __cpumask_set_cpu(cpu, &rqd->tickled);
                        ASSERT(!cpumask_test_cpu(cpu, &rqd->smt_idle));
                        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
                    }
                    svc->budget = 0;
                    vcpu_schedule_unlock(lock, svc->vcpu);
                }
            }

            sdom->cap = op->u.credit2.cap;
        }
        else if ( sdom->cap != 0 )
        {
            LIST_HEAD(parked);

            stop_timer(sdom->repl_timer);

            /* Disable budget accounting for all the vCPUs. */
            for_each_vcpu ( d, v )
            {
                struct csched2_vcpu *svc = csched2_vcpu(v);
                spinlock_t *lock = vcpu_schedule_lock(svc->vcpu);

                svc->budget = STIME_MAX;
                svc->budget_quota = 0;

                vcpu_schedule_unlock(lock, svc->vcpu);
            }
            sdom->cap = 0;
            /*
             * We are disabling the cap for this domain, which may have
             * vCPUs waiting for a replenishment, so we unpark them all.
             * Note that, since we have already disabled budget accounting
             * for all the vCPUs of the domain, no currently running vCPU
             * will be added to the parked vCPUs list any longer.
             */
            spin_lock(&sdom->budget_lock);
            list_splice_init(&sdom->parked_vcpus, &parked);
            spin_unlock(&sdom->budget_lock);

            unpark_parked_vcpus(ops, &parked);
        }
        write_unlock_irqrestore(&prv->lock, flags);
        break;
    default:
        rc = -EINVAL;
        break;
    }


    return rc;
}

static int csched2_sys_cntl(const struct scheduler *ops,
                            struct xen_sysctl_scheduler_op *sc)
{
    struct xen_sysctl_credit2_schedule *params = &sc->u.sched_credit2;
    struct csched2_private *prv = csched2_priv(ops);
    unsigned long flags;

    switch (sc->cmd )
    {
    case XEN_SYSCTL_SCHEDOP_putinfo:
        if ( params->ratelimit_us &&
             (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX ||
              params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN ))
            return -EINVAL;

        write_lock_irqsave(&prv->lock, flags);
        if ( !prv->ratelimit_us && params->ratelimit_us )
            printk(XENLOG_INFO "Enabling context switch rate limiting\n");
        else if ( prv->ratelimit_us && !params->ratelimit_us )
            printk(XENLOG_INFO "Disabling context switch rate limiting\n");
        prv->ratelimit_us = params->ratelimit_us;
        write_unlock_irqrestore(&prv->lock, flags);

    /* FALLTHRU */
    case XEN_SYSCTL_SCHEDOP_getinfo:
        params->ratelimit_us = prv->ratelimit_us;
        break;
    }

    return 0;
}

static void *
csched2_alloc_domdata(const struct scheduler *ops, struct domain *dom)
{
    struct csched2_private *prv = csched2_priv(ops);
    struct csched2_dom *sdom;
    unsigned long flags;

    sdom = xzalloc(struct csched2_dom);
    if ( sdom == NULL )
        return NULL;

    sdom->repl_timer = xzalloc(struct timer);
    if ( sdom->repl_timer == NULL )
    {
        xfree(sdom);
        return NULL;
    }

    /* Initialize credit, cap and weight */
    INIT_LIST_HEAD(&sdom->sdom_elem);
    sdom->dom = dom;
    sdom->weight = CSCHED2_DEFAULT_WEIGHT;
    sdom->cap = 0U;
    sdom->nr_vcpus = 0;

    init_timer(sdom->repl_timer, replenish_domain_budget, sdom,
               cpumask_any(cpupool_domain_cpumask(dom)));
    spin_lock_init(&sdom->budget_lock);
    INIT_LIST_HEAD(&sdom->parked_vcpus);

    write_lock_irqsave(&prv->lock, flags);

    list_add_tail(&sdom->sdom_elem, &csched2_priv(ops)->sdom);

    write_unlock_irqrestore(&prv->lock, flags);

    return (void *)sdom;
}

static int
csched2_dom_init(const struct scheduler *ops, struct domain *dom)
{
    struct csched2_dom *sdom;

    if ( is_idle_domain(dom) )
        return 0;

    sdom = csched2_alloc_domdata(ops, dom);
    if ( sdom == NULL )
        return -ENOMEM;

    dom->sched_priv = sdom;

    return 0;
}

static void
csched2_free_domdata(const struct scheduler *ops, void *data)
{
    unsigned long flags;
    struct csched2_dom *sdom = data;
    struct csched2_private *prv = csched2_priv(ops);

    kill_timer(sdom->repl_timer);

    write_lock_irqsave(&prv->lock, flags);

    list_del_init(&sdom->sdom_elem);

    write_unlock_irqrestore(&prv->lock, flags);

    xfree(sdom->repl_timer);
    xfree(data);
}

static void
csched2_dom_destroy(const struct scheduler *ops, struct domain *dom)
{
    ASSERT(csched2_dom(dom)->nr_vcpus == 0);

    csched2_free_domdata(ops, csched2_dom(dom));
}

static void
csched2_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched2_vcpu *svc = vc->sched_priv;
    struct csched2_dom * const sdom = svc->sdom;
    spinlock_t *lock;

    ASSERT(!is_idle_vcpu(vc));
    ASSERT(list_empty(&svc->runq_elem));

    /* csched2_cpu_pick() expects the pcpu lock to be held */
    lock = vcpu_schedule_lock_irq(vc);

    vc->processor = csched2_cpu_pick(ops, vc);

    spin_unlock_irq(lock);

    lock = vcpu_schedule_lock_irq(vc);

    /* Add vcpu to runqueue of initial processor */
    runq_assign(ops, vc);

    vcpu_schedule_unlock_irq(lock, vc);

    sdom->nr_vcpus++;

    SCHED_STAT_CRANK(vcpu_insert);

    CSCHED2_VCPU_CHECK(vc);
}

static void
csched2_free_vdata(const struct scheduler *ops, void *priv)
{
    struct csched2_vcpu *svc = priv;

    xfree(svc);
}

static void
csched2_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched2_vcpu * const svc = csched2_vcpu(vc);
    spinlock_t *lock;

    ASSERT(!is_idle_vcpu(vc));
    ASSERT(list_empty(&svc->runq_elem));

    SCHED_STAT_CRANK(vcpu_remove);

    /* Remove from runqueue */
    lock = vcpu_schedule_lock_irq(vc);

    runq_deassign(ops, vc);

    vcpu_schedule_unlock_irq(lock, vc);

    svc->sdom->nr_vcpus--;
}

/* How long should we let this vcpu run for? */
static s_time_t
csched2_runtime(const struct scheduler *ops, int cpu,
                struct csched2_vcpu *snext, s_time_t now)
{
    s_time_t time, min_time;
    int rt_credit; /* Proposed runtime measured in credits */
    struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
    struct list_head *runq = &rqd->runq;
    struct csched2_private *prv = csched2_priv(ops);

    /*
     * If we're idle, just stay so. Others (or external events)
     * will poke us when necessary.
     */
    if ( is_idle_vcpu(snext->vcpu) )
        return -1;

    /* General algorithm:
     * 1) Run until snext's credit will be 0.
     * 2) But if someone is waiting, run until snext's credit is equal
     *    to his.
     * 3) But, if we are capped, never run more than our budget.
     * 4) And never run longer than MAX_TIMER or shorter than MIN_TIMER or
     *    the ratelimit time.
     */

    /* Calculate mintime */
    min_time = CSCHED2_MIN_TIMER;
    if ( prv->ratelimit_us )
    {
        s_time_t ratelimit_min = MICROSECS(prv->ratelimit_us);
        if ( snext->vcpu->is_running )
            ratelimit_min = snext->vcpu->runstate.state_entry_time +
                            MICROSECS(prv->ratelimit_us) - now;
        if ( ratelimit_min > min_time )
            min_time = ratelimit_min;
    }

    /* 1) Run until snext's credit will be 0. */
    rt_credit = snext->credit;

    /*
     * 2) If there's someone waiting whose credit is positive,
     *    run until your credit ~= his.
     */
    if ( ! list_empty(runq) )
    {
        struct csched2_vcpu *swait = runq_elem(runq->next);

        if ( ! is_idle_vcpu(swait->vcpu)
             && swait->credit > 0 )
        {
            rt_credit = snext->credit - swait->credit;
        }
    }

    /*
     * The next guy on the runqueue may actually have a higher credit,
     * if we've tried to avoid migrating him from a different cpu.
     * Setting time=0 will ensure the minimum timeslice is chosen.
     *
     * FIXME: See if we can eliminate this conversion if we know time
     * will be outside (MIN,MAX).  Probably requires pre-calculating
     * credit values of MIN,MAX per vcpu, since each vcpu burns credit
     * at a different rate.
     */
    if ( rt_credit > 0 )
        time = c2t(rqd, rt_credit, snext);
    else
        time = 0;

    /*
     * 3) But, if capped, never run more than our budget.
     */
    if ( has_cap(snext) )
        time = snext->budget < time ? snext->budget : time;

    /*
     * 4) And never run longer than MAX_TIMER or less than MIN_TIMER or
     *    the rate_limit time.
     */
    if ( time < min_time )
    {
        time = min_time;
        SCHED_STAT_CRANK(runtime_min_timer);
    }
    else if (time > CSCHED2_MAX_TIMER)
    {
        time = CSCHED2_MAX_TIMER;
        SCHED_STAT_CRANK(runtime_max_timer);
    }

    return time;
}

/*
 * Find a candidate.
 */
static struct csched2_vcpu *
runq_candidate(struct csched2_runqueue_data *rqd,
               struct csched2_vcpu *scurr,
               int cpu, s_time_t now,
               unsigned int *skipped)
{
    struct list_head *iter, *temp;
    struct csched2_vcpu *snext = NULL;
    struct csched2_private *prv = csched2_priv(per_cpu(scheduler, cpu));
    bool yield = false, soft_aff_preempt = false;

    *skipped = 0;

    if ( unlikely(is_idle_vcpu(scurr->vcpu)) )
    {
        snext = scurr;
        goto check_runq;
    }

    yield = __test_and_clear_bit(__CSFLAG_vcpu_yield, &scurr->flags);

    /*
     * Return the current vcpu if it has executed for less than ratelimit.
     * Adjuststment for the selected vcpu's credit and decision
     * for how long it will run will be taken in csched2_runtime.
     *
     * Note that, if scurr is yielding, we don't let rate limiting kick in.
     * In fact, it may be the case that scurr is about to spin, and there's
     * no point forcing it to do so until rate limiting expires.
     */
    if ( !yield && prv->ratelimit_us && vcpu_runnable(scurr->vcpu) &&
         (now - scurr->vcpu->runstate.state_entry_time) <
          MICROSECS(prv->ratelimit_us) )
    {
        if ( unlikely(tb_init_done) )
        {
            struct {
                unsigned vcpu:16, dom:16;
                unsigned runtime;
            } d;
            d.dom = scurr->vcpu->domain->domain_id;
            d.vcpu = scurr->vcpu->vcpu_id;
            d.runtime = now - scurr->vcpu->runstate.state_entry_time;
            __trace_var(TRC_CSCHED2_RATELIMIT, 1,
                        sizeof(d),
                        (unsigned char *)&d);
        }
        return scurr;
    }

    /* If scurr has a soft-affinity, let's check whether cpu is part of it */
    if ( has_soft_affinity(scurr->vcpu, scurr->vcpu->cpu_hard_affinity) )
    {
        affinity_balance_cpumask(scurr->vcpu, BALANCE_SOFT_AFFINITY,
                                 cpumask_scratch);
        if ( unlikely(!cpumask_test_cpu(cpu, cpumask_scratch)) )
        {
            cpumask_t *online = cpupool_domain_cpumask(scurr->vcpu->domain);

            /* Ok, is any of the pcpus in scurr soft-affinity idle? */
            cpumask_and(cpumask_scratch, cpumask_scratch, &rqd->idle);
            cpumask_andnot(cpumask_scratch, cpumask_scratch, &rqd->tickled);
            soft_aff_preempt = cpumask_intersects(cpumask_scratch, online);
        }
    }

    /*
     * If scurr is runnable, and this cpu is in its soft-affinity, default to
     * it. We also default to it, even if cpu is not in its soft-affinity, if
     * there aren't any idle and not tickled cpu in its soft-affinity. In
     * fact, we don't want to risk leaving scurr in the runq and this cpu idle
     * only because scurr is running outside of its soft-affinity.
     *
     * On the other hand, if cpu is not in scurr's soft-affinity, and there
     * looks to be better options, go for them. That happens by defaulting to
     * idle here, which means scurr will be preempted, put back in runq, and
     * one of those idle and not tickled cpus from its soft-affinity will be
     * tickled to pick it up.
     *
     * Finally, if scurr does not have a valid soft-affinity, we also let it
     * continue to run here (in fact, soft_aff_preempt will still be false,
     * in this case).
     *
     * Of course, we also default to idle also if scurr is not runnable.
     */
    if ( vcpu_runnable(scurr->vcpu) && !soft_aff_preempt )
        snext = scurr;
    else
        snext = csched2_vcpu(idle_vcpu[cpu]);

 check_runq:
    list_for_each_safe( iter, temp, &rqd->runq )
    {
        struct csched2_vcpu * svc = list_entry(iter, struct csched2_vcpu, runq_elem);

        if ( unlikely(tb_init_done) )
        {
            struct {
                unsigned vcpu:16, dom:16;
            } d;
            d.dom = svc->vcpu->domain->domain_id;
            d.vcpu = svc->vcpu->vcpu_id;
            __trace_var(TRC_CSCHED2_RUNQ_CAND_CHECK, 1,
                        sizeof(d),
                        (unsigned char *)&d);
        }

        /* Only consider vcpus that are allowed to run on this processor. */
        if ( !cpumask_test_cpu(cpu, svc->vcpu->cpu_hard_affinity) )
        {
            (*skipped)++;
            continue;
        }

        /*
         * If a vcpu is meant to be picked up by another processor, and such
         * processor has not scheduled yet, leave it in the runqueue for him.
         */
        if ( svc->tickled_cpu != -1 && svc->tickled_cpu != cpu &&
             cpumask_test_cpu(svc->tickled_cpu, &rqd->tickled) )
        {
            (*skipped)++;
            SCHED_STAT_CRANK(deferred_to_tickled_cpu);
            continue;
        }

        /*
         * If this is on a different processor, don't pull it unless
         * its credit is at least CSCHED2_MIGRATE_RESIST higher.
         */
        if ( svc->vcpu->processor != cpu
             && snext->credit + CSCHED2_MIGRATE_RESIST > svc->credit )
        {
            (*skipped)++;
            SCHED_STAT_CRANK(migrate_resisted);
            continue;
        }

        /*
         * If the one in the runqueue has more credit than current (or idle,
         * if current is not runnable), or if current is yielding, and also
         * if the one in runqueue either is not capped, or is capped but has
         * some budget, then choose it.
         */
        if ( (yield || svc->credit > snext->credit) &&
             (!has_cap(svc) || vcpu_grab_budget(svc)) )
            snext = svc;

        /* In any case, if we got this far, break. */
        break;
    }

    if ( unlikely(tb_init_done) )
    {
        struct {
            unsigned vcpu:16, dom:16;
            unsigned tickled_cpu, skipped;
            int credit;
        } d;
        d.dom = snext->vcpu->domain->domain_id;
        d.vcpu = snext->vcpu->vcpu_id;
        d.credit = snext->credit;
        d.tickled_cpu = snext->tickled_cpu;
        d.skipped = *skipped;
        __trace_var(TRC_CSCHED2_RUNQ_CANDIDATE, 1,
                    sizeof(d),
                    (unsigned char *)&d);
    }

    if ( unlikely(snext->tickled_cpu != -1 && snext->tickled_cpu != cpu) )
        SCHED_STAT_CRANK(tickled_cpu_overridden);

    /*
     * If snext is from a capped domain, it must have budget (or it
     * wouldn't have been in the runq). If it is not, it'd be STIME_MAX,
     * which still is >= 0.
     */
    ASSERT(snext->budget >= 0);

    return snext;
}

/*
 * This function is in the critical path. It is designed to be simple and
 * fast for the common case.
 */
static struct task_slice
csched2_schedule(
    const struct scheduler *ops, s_time_t now, bool tasklet_work_scheduled)
{
    const int cpu = smp_processor_id();
    struct csched2_runqueue_data *rqd;
    struct csched2_vcpu * const scurr = csched2_vcpu(current);
    struct csched2_vcpu *snext = NULL;
    unsigned int skipped_vcpus = 0;
    struct task_slice ret;
    bool tickled;

    SCHED_STAT_CRANK(schedule);
    CSCHED2_VCPU_CHECK(current);

    BUG_ON(!cpumask_test_cpu(cpu, &csched2_priv(ops)->initialized));

    rqd = c2rqd(ops, cpu);
    BUG_ON(!cpumask_test_cpu(cpu, &rqd->active));

    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));

    BUG_ON(!is_idle_vcpu(scurr->vcpu) && scurr->rqd != rqd);

    /* Clear "tickled" bit now that we've been scheduled */
    tickled = cpumask_test_cpu(cpu, &rqd->tickled);
    if ( tickled )
    {
        __cpumask_clear_cpu(cpu, &rqd->tickled);
        cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled);
        smt_idle_mask_set(cpu, cpumask_scratch, &rqd->smt_idle);
    }

    if ( unlikely(tb_init_done) )
    {
        struct {
            unsigned cpu:16, rq_id:16;
            unsigned tasklet:8, idle:8, smt_idle:8, tickled:8;
        } d;
        d.cpu = cpu;
        d.rq_id = c2r(cpu);
        d.tasklet = tasklet_work_scheduled;
        d.idle = is_idle_vcpu(current);
        d.smt_idle = cpumask_test_cpu(cpu, &rqd->smt_idle);
        d.tickled = tickled;
        __trace_var(TRC_CSCHED2_SCHEDULE, 1,
                    sizeof(d),
                    (unsigned char *)&d);
    }

    /* Update credits (and budget, if necessary). */
    burn_credits(rqd, scurr, now);

    /*
     *  Below 0, means that we are capped and we have overrun our  budget.
     *  Let's try to get some more but, if we fail (e.g., because of the
     *  other running vcpus), we will be parked.
     */
    if ( unlikely(scurr->budget <= 0) )
        vcpu_grab_budget(scurr);

    /*
     * Select next runnable local VCPU (ie top of local runq).
     *
     * If the current vcpu is runnable, and has higher credit than
     * the next guy on the queue (or there is noone else), we want to
     * run him again.
     *
     * If there's tasklet work to do, we want to chose the idle vcpu
     * for this processor, and mark the current for delayed runqueue
     * add.
     *
     * If the current vcpu is runnable, and there's another runnable
     * candidate, we want to mark current for delayed runqueue add,
     * and remove the next guy from the queue.
     *
     * If the current vcpu is not runnable, we want to chose the idle
     * vcpu for this processor.
     */
    if ( tasklet_work_scheduled )
    {
        __clear_bit(__CSFLAG_vcpu_yield, &scurr->flags);
        trace_var(TRC_CSCHED2_SCHED_TASKLET, 1, 0, NULL);
        snext = csched2_vcpu(idle_vcpu[cpu]);
    }
    else
        snext = runq_candidate(rqd, scurr, cpu, now, &skipped_vcpus);

    /* If switching from a non-idle runnable vcpu, put it
     * back on the runqueue. */
    if ( snext != scurr
         && !is_idle_vcpu(scurr->vcpu)
         && vcpu_runnable(current) )
        __set_bit(__CSFLAG_delayed_runq_add, &scurr->flags);

    ret.migrated = 0;

    /* Accounting for non-idle tasks */
    if ( !is_idle_vcpu(snext->vcpu) )
    {
        /* If switching, remove this from the runqueue and mark it scheduled */
        if ( snext != scurr )
        {
            ASSERT(snext->rqd == rqd);
            ASSERT(!snext->vcpu->is_running);

            runq_remove(snext);
            __set_bit(__CSFLAG_scheduled, &snext->flags);
        }

        /*
         * The reset condition is "has a scheduler epoch come to an end?".
         * The way this is enforced is checking whether the vcpu at the top
         * of the runqueue has negative credits. This means the epochs have
         * variable length, as in one epoch expores when:
         *  1) the vcpu at the top of the runqueue has executed for
         *     around 10 ms (with default parameters);
         *  2) no other vcpu with higher credits wants to run.
         *
         * Here, where we want to check for reset, we need to make sure the
         * proper vcpu is being used. In fact, runqueue_candidate() may have
         * not returned the first vcpu in the runqueue, for various reasons
         * (e.g., affinity). Only trigger a reset when it does.
         */
        if ( skipped_vcpus == 0 && snext->credit <= CSCHED2_CREDIT_RESET )
        {
            reset_credit(ops, cpu, now, snext);
            balance_load(ops, cpu, now);
        }

        /* Clear the idle mask if necessary */
        if ( cpumask_test_cpu(cpu, &rqd->idle) )
        {
            __cpumask_clear_cpu(cpu, &rqd->idle);
            smt_idle_mask_clear(cpu, &rqd->smt_idle);
        }

        snext->start_time = now;
        snext->tickled_cpu = -1;

        /* Safe because lock for old processor is held */
        if ( snext->vcpu->processor != cpu )
        {
            snext->credit += CSCHED2_MIGRATE_COMPENSATION;
            snext->vcpu->processor = cpu;
            SCHED_STAT_CRANK(migrated);
            ret.migrated = 1;
        }
    }
    else
    {
        /*
         * Update the idle mask if necessary. Note that, if we're scheduling
         * idle in order to carry on some tasklet work, we want to play busy!
         */
        if ( tasklet_work_scheduled )
        {
            if ( cpumask_test_cpu(cpu, &rqd->idle) )
            {
                __cpumask_clear_cpu(cpu, &rqd->idle);
                smt_idle_mask_clear(cpu, &rqd->smt_idle);
            }
        }
        else if ( !cpumask_test_cpu(cpu, &rqd->idle) )
        {
            __cpumask_set_cpu(cpu, &rqd->idle);
            cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled);
            smt_idle_mask_set(cpu, cpumask_scratch, &rqd->smt_idle);
        }
        /* Make sure avgload gets updated periodically even
         * if there's no activity */
        update_load(ops, rqd, NULL, 0, now);
    }

    /*
     * Return task to run next...
     */
    ret.time = csched2_runtime(ops, cpu, snext, now);
    ret.task = snext->vcpu;

    CSCHED2_VCPU_CHECK(ret.task);
    return ret;
}

static void
csched2_dump_vcpu(struct csched2_private *prv, struct csched2_vcpu *svc)
{
    printk("[%i.%i] flags=%x cpu=%i",
            svc->vcpu->domain->domain_id,
            svc->vcpu->vcpu_id,
            svc->flags,
            svc->vcpu->processor);

    printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight);

    if ( has_cap(svc) )
        printk(" budget=%"PRI_stime"(%"PRI_stime")",
               svc->budget, svc->budget_quota);

    printk(" load=%"PRI_stime" (~%"PRI_stime"%%)", svc->avgload,
           (svc->avgload * 100) >> prv->load_precision_shift);

    printk("\n");
}

static inline void
dump_pcpu(const struct scheduler *ops, int cpu)
{
    struct csched2_private *prv = csched2_priv(ops);
    struct csched2_vcpu *svc;
#define cpustr keyhandler_scratch

    cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cpu));
    printk("CPU[%02d] runq=%d, sibling=%s, ", cpu, c2r(cpu), cpustr);
    cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu));
    printk("core=%s\n", cpustr);

    /* current VCPU (nothing to say if that's the idle vcpu) */
    svc = csched2_vcpu(curr_on_cpu(cpu));
    if ( svc && !is_idle_vcpu(svc->vcpu) )
    {
        printk("\trun: ");
        csched2_dump_vcpu(prv, svc);
    }
#undef cpustr
}

static void
csched2_dump(const struct scheduler *ops)
{
    struct list_head *iter_sdom;
    struct csched2_private *prv = csched2_priv(ops);
    unsigned long flags;
    unsigned int i, j, loop;
#define cpustr keyhandler_scratch

    /*
     * We need the private scheduler lock as we access global
     * scheduler data and (below) the list of active domains.
     */
    read_lock_irqsave(&prv->lock, flags);

    printk("Active queues: %d\n"
           "\tdefault-weight     = %d\n",
           cpumask_weight(&prv->active_queues),
           CSCHED2_DEFAULT_WEIGHT);
    for_each_cpu(i, &prv->active_queues)
    {
        s_time_t fraction;

        fraction = (prv->rqd[i].avgload * 100) >> prv->load_precision_shift;

        cpulist_scnprintf(cpustr, sizeof(cpustr), &prv->rqd[i].active);
        printk("Runqueue %d:\n"
               "\tncpus              = %u\n"
               "\tcpus               = %s\n"
               "\tmax_weight         = %u\n"
               "\tpick_bias          = %u\n"
               "\tinstload           = %d\n"
               "\taveload            = %"PRI_stime" (~%"PRI_stime"%%)\n",
               i,
               cpumask_weight(&prv->rqd[i].active),
               cpustr,
               prv->rqd[i].max_weight,
               prv->rqd[i].pick_bias,
               prv->rqd[i].load,
               prv->rqd[i].avgload,
               fraction);

        cpumask_scnprintf(cpustr, sizeof(cpustr), &prv->rqd[i].idle);
        printk("\tidlers: %s\n", cpustr);
        cpumask_scnprintf(cpustr, sizeof(cpustr), &prv->rqd[i].tickled);
        printk("\ttickled: %s\n", cpustr);
        cpumask_scnprintf(cpustr, sizeof(cpustr), &prv->rqd[i].smt_idle);
        printk("\tfully idle cores: %s\n", cpustr);
    }

    printk("Domain info:\n");
    loop = 0;
    list_for_each( iter_sdom, &prv->sdom )
    {
        struct csched2_dom *sdom;
        struct vcpu *v;

        sdom = list_entry(iter_sdom, struct csched2_dom, sdom_elem);

        printk("\tDomain: %d w %d c %u v %d\n",
               sdom->dom->domain_id,
               sdom->weight,
               sdom->cap,
               sdom->nr_vcpus);

        for_each_vcpu( sdom->dom, v )
        {
            struct csched2_vcpu * const svc = csched2_vcpu(v);
            spinlock_t *lock;

            lock = vcpu_schedule_lock(svc->vcpu);

            printk("\t%3d: ", ++loop);
            csched2_dump_vcpu(prv, svc);

            vcpu_schedule_unlock(lock, svc->vcpu);
        }
    }

    for_each_cpu(i, &prv->active_queues)
    {
        struct csched2_runqueue_data *rqd = prv->rqd + i;
        struct list_head *iter, *runq = &rqd->runq;
        int loop = 0;

        /* We need the lock to scan the runqueue. */
        spin_lock(&rqd->lock);

        printk("Runqueue %d:\n", i);

        for_each_cpu(j, &rqd->active)
            dump_pcpu(ops, j);

        printk("RUNQ:\n");
        list_for_each( iter, runq )
        {
            struct csched2_vcpu *svc = runq_elem(iter);

            if ( svc )
            {
                printk("\t%3d: ", loop++);
                csched2_dump_vcpu(prv, svc);
            }
        }
        spin_unlock(&rqd->lock);
    }

    read_unlock_irqrestore(&prv->lock, flags);
#undef cpustr
}

/* Returns the ID of the runqueue the cpu is assigned to. */
static unsigned
init_pdata(struct csched2_private *prv, unsigned int cpu)
{
    unsigned rqi;
    struct csched2_runqueue_data *rqd;

    ASSERT(rw_is_write_locked(&prv->lock));
    ASSERT(!cpumask_test_cpu(cpu, &prv->initialized));

    /* Figure out which runqueue to put it in */
    rqi = cpu_to_runqueue(prv, cpu);

    rqd = prv->rqd + rqi;

    printk(XENLOG_INFO "Adding cpu %d to runqueue %d\n", cpu, rqi);
    if ( ! cpumask_test_cpu(rqi, &prv->active_queues) )
    {
        printk(XENLOG_INFO " First cpu on runqueue, activating\n");
        activate_runqueue(prv, rqi);
    }
    
    /* Set the runqueue map */
    per_cpu(runq_map, cpu) = rqi;
    
    __cpumask_set_cpu(cpu, &rqd->idle);
    __cpumask_set_cpu(cpu, &rqd->active);
    __cpumask_set_cpu(cpu, &prv->initialized);
    __cpumask_set_cpu(cpu, &rqd->smt_idle);

    if ( cpumask_weight(&rqd->active) == 1 )
        rqd->pick_bias = cpu;

    return rqi;
}

static void
csched2_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
{
    struct csched2_private *prv = csched2_priv(ops);
    spinlock_t *old_lock;
    unsigned long flags;
    unsigned rqi;

    /*
     * pdata contains what alloc_pdata returned. But since we don't (need to)
     * implement alloc_pdata, either that's NULL, or something is very wrong!
     */
    ASSERT(!pdata);

    write_lock_irqsave(&prv->lock, flags);
    old_lock = pcpu_schedule_lock(cpu);

    rqi = init_pdata(prv, cpu);
    /* Move the scheduler lock to the new runq lock. */
    per_cpu(schedule_data, cpu).schedule_lock = &prv->rqd[rqi].lock;

    /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */
    spin_unlock(old_lock);
    write_unlock_irqrestore(&prv->lock, flags);
}

/* Change the scheduler of cpu to us (Credit2). */
static void
csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu,
                     void *pdata, void *vdata)
{
    struct csched2_private *prv = csched2_priv(new_ops);
    struct csched2_vcpu *svc = vdata;
    unsigned rqi;

    ASSERT(!pdata && svc && is_idle_vcpu(svc->vcpu));

    /*
     * We own one runqueue lock already (from schedule_cpu_switch()). This
     * looks like it violates this scheduler's locking rules, but it does
     * not, as what we own is the lock of another scheduler, that hence has
     * no particular (ordering) relationship with our private global lock.
     * And owning exactly that one (the lock of the old scheduler of this
     * cpu) is what is necessary to prevent races.
     */
    ASSERT(!local_irq_is_enabled());
    write_lock(&prv->lock);

    idle_vcpu[cpu]->sched_priv = vdata;

    rqi = init_pdata(prv, cpu);

    /*
     * Now that we know what runqueue we'll go in, double check what's said
     * above: the lock we already hold is not the one of this runqueue of
     * this scheduler, and so it's safe to have taken it /before/ our
     * private global lock.
     */
    ASSERT(per_cpu(schedule_data, cpu).schedule_lock != &prv->rqd[rqi].lock);

    per_cpu(scheduler, cpu) = new_ops;
    per_cpu(schedule_data, cpu).sched_priv = NULL; /* no pdata */

    /*
     * (Re?)route the lock to the per pCPU lock as /last/ thing. In fact,
     * if it is free (and it can be) we want that anyone that manages
     * taking it, find all the initializations we've done above in place.
     */
    smp_mb();
    per_cpu(schedule_data, cpu).schedule_lock = &prv->rqd[rqi].lock;

    write_unlock(&prv->lock);
}

static void
csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
{
    unsigned long flags;
    struct csched2_private *prv = csched2_priv(ops);
    struct csched2_runqueue_data *rqd;
    int rqi;

    write_lock_irqsave(&prv->lock, flags);

    /*
     * alloc_pdata is not implemented, so pcpu must be NULL. On the other
     * hand, init_pdata must have been called for this pCPU.
     */
    ASSERT(!pcpu && cpumask_test_cpu(cpu, &prv->initialized));
    
    /* Find the old runqueue and remove this cpu from it */
    rqi = per_cpu(runq_map, cpu);

    rqd = prv->rqd + rqi;

    /* No need to save IRQs here, they're already disabled */
    spin_lock(&rqd->lock);

    printk(XENLOG_INFO "Removing cpu %d from runqueue %d\n", cpu, rqi);

    __cpumask_clear_cpu(cpu, &rqd->idle);
    __cpumask_clear_cpu(cpu, &rqd->smt_idle);
    __cpumask_clear_cpu(cpu, &rqd->active);

    if ( cpumask_empty(&rqd->active) )
    {
        printk(XENLOG_INFO " No cpus left on runqueue, disabling\n");
        deactivate_runqueue(prv, rqi);
    }
    else if ( rqd->pick_bias == cpu )
        rqd->pick_bias = cpumask_first(&rqd->active);

    per_cpu(runq_map, cpu) = -1;

    spin_unlock(&rqd->lock);

    __cpumask_clear_cpu(cpu, &prv->initialized);

    write_unlock_irqrestore(&prv->lock, flags);

    return;
}

static int
csched2_init(struct scheduler *ops)
{
    int i;
    struct csched2_private *prv;

    printk("Initializing Credit2 scheduler\n");

    printk(XENLOG_INFO " load_precision_shift: %d\n"
           XENLOG_INFO " load_window_shift: %d\n"
           XENLOG_INFO " underload_balance_tolerance: %d\n"
           XENLOG_INFO " overload_balance_tolerance: %d\n"
           XENLOG_INFO " runqueues arrangement: %s\n"
           XENLOG_INFO " cap enforcement granularity: %dms\n",
           opt_load_precision_shift,
           opt_load_window_shift,
           opt_underload_balance_tolerance,
           opt_overload_balance_tolerance,
           opt_runqueue_str[opt_runqueue],
           opt_cap_period);

    if ( opt_load_precision_shift < LOADAVG_PRECISION_SHIFT_MIN )
    {
        printk("WARNING: %s: opt_load_precision_shift %d below min %d, resetting\n",
               __func__, opt_load_precision_shift, LOADAVG_PRECISION_SHIFT_MIN);
        opt_load_precision_shift = LOADAVG_PRECISION_SHIFT_MIN;
    }

    if ( opt_load_window_shift <= LOADAVG_GRANULARITY_SHIFT )
    {
        printk("WARNING: %s: opt_load_window_shift %d too short, resetting\n",
               __func__, opt_load_window_shift);
        opt_load_window_shift = LOADAVG_WINDOW_SHIFT;
    }
    printk(XENLOG_INFO "load tracking window length %llu ns\n",
           1ULL << opt_load_window_shift);

    if ( CSCHED2_BDGT_REPL_PERIOD < CSCHED2_MIN_TIMER )
    {
        printk("WARNING: %s: opt_cap_period %d too small, resetting\n",
               __func__, opt_cap_period);
        opt_cap_period = 10; /* ms */
    }

    /*
     * Basically no CPU information is available at this point; just
     * set up basic structures, and a callback when the CPU info is
     * available.
     */

    prv = xzalloc(struct csched2_private);
    if ( prv == NULL )
        return -ENOMEM;
    ops->sched_data = prv;

    rwlock_init(&prv->lock);
    INIT_LIST_HEAD(&prv->sdom);

    /* Allocate all runqueues and mark them as un-initialized */
    prv->rqd = xzalloc_array(struct csched2_runqueue_data, nr_cpu_ids);
    if ( !prv->rqd )
    {
        xfree(prv);
        return -ENOMEM;
    }
    for ( i = 0; i < nr_cpu_ids; i++ )
        prv->rqd[i].id = -1;

    /* initialize ratelimit */
    prv->ratelimit_us = sched_ratelimit_us;

    prv->load_precision_shift = opt_load_precision_shift;
    prv->load_window_shift = opt_load_window_shift - LOADAVG_GRANULARITY_SHIFT;
    ASSERT(opt_load_window_shift > 0);

    return 0;
}

static void
csched2_deinit(struct scheduler *ops)
{
    struct csched2_private *prv;

    prv = csched2_priv(ops);
    ops->sched_data = NULL;
    xfree(prv);
}

static const struct scheduler sched_credit2_def = {
    .name           = "SMP Credit Scheduler rev2",
    .opt_name       = "credit2",
    .sched_id       = XEN_SCHEDULER_CREDIT2,
    .sched_data     = NULL,

    .init_domain    = csched2_dom_init,
    .destroy_domain = csched2_dom_destroy,

    .insert_vcpu    = csched2_vcpu_insert,
    .remove_vcpu    = csched2_vcpu_remove,

    .sleep          = csched2_vcpu_sleep,
    .wake           = csched2_vcpu_wake,
    .yield          = csched2_vcpu_yield,

    .adjust         = csched2_dom_cntl,
    .adjust_global  = csched2_sys_cntl,

    .pick_cpu       = csched2_cpu_pick,
    .migrate        = csched2_vcpu_migrate,
    .do_schedule    = csched2_schedule,
    .context_saved  = csched2_context_saved,

    .dump_settings  = csched2_dump,
    .init           = csched2_init,
    .deinit         = csched2_deinit,
    .alloc_vdata    = csched2_alloc_vdata,
    .free_vdata     = csched2_free_vdata,
    .init_pdata     = csched2_init_pdata,
    .deinit_pdata   = csched2_deinit_pdata,
    .switch_sched   = csched2_switch_sched,
    .alloc_domdata  = csched2_alloc_domdata,
    .free_domdata   = csched2_free_domdata,
};

REGISTER_SCHEDULER(sched_credit2_def);

Coverage Report

Created: 2017-10-25 09:10