changeset 22666:65b63f5af281

credit2: Introduce a loadavg-based load balancer

This is a first cut at load balancing. The focus for now is on getting
the behavior I want correct; once I know what kind of behavior works
well, I'll work on making it efficient.

The general idea is: when balancing runqueues, look for the runqueue
whose loadavg differs the most from ours (whether higher or lower).
Then look for a transaction that will bring the two loads closest
together: pushing a vcpu, pulling a vcpu, or swapping a pair. The
per-vcpu load is used to calculate the expected load on each runqueue
after the exchange.
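
To make the "expected load after the exchange" arithmetic concrete,
here is a minimal, self-contained sketch. It is an illustration only,
not code from the patch below; the names delta_after, push and pull
are simplified stand-ins for the consider() logic in the diff:

    /* Sketch of how the balancer predicts the post-exchange load
     * difference from the two runqueue loads and the per-vcpu loads
     * of a push and/or pull candidate (0 means "no candidate"). */
    #include <stdint.h>
    #include <stdio.h>

    typedef int64_t s_time_t;

    static s_time_t delta_after(s_time_t l_load, s_time_t o_load,
                                s_time_t push, s_time_t pull)
    {
        l_load -= push;  o_load += push;   /* effect of pushing a vcpu away */
        l_load += pull;  o_load -= pull;   /* effect of pulling a vcpu in   */
        return (l_load > o_load) ? l_load - o_load : o_load - l_load;
    }

    int main(void)
    {
        /* Toy numbers: local runqueue load 30, other runqueue load 10,
         * one candidate vcpu on each side (loads 15 and 5). */
        s_time_t l = 30, o = 10, push_vcpu = 15, pull_vcpu = 5;

        printf("swap: %lld\n", (long long)delta_after(l, o, push_vcpu, pull_vcpu)); /* 0  */
        printf("push: %lld\n", (long long)delta_after(l, o, push_vcpu, 0));         /* 10 */
        printf("pull: %lld\n", (long long)delta_after(l, o, 0, pull_vcpu));         /* 30 */

        /* The balancer keeps whichever candidate transaction yields the
         * smallest delta; here the swap wins. */
        return 0;
    }

In the patch itself, consider() only records a candidate transaction
if it brings the difference strictly below the best (initially the
current) value.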

The current algorithm looks at every push/pull combination, which is
O(N^2). That won't be suitable for workloads with large numbers of
vcpus (such as highly consolidated VDI deployments). I'll write a more
efficient algorithm once I've experimented and determined what the
best load-balancing behavior is.

For now, we balance from a runqueue every time its credit resets.

Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
author Keir Fraser <keir@xen.org>
date Fri Dec 24 08:31:54 2010 +0000 (2010-12-24)
parents 41d1affef596
children 94d47b8b723f
files xen/common/sched_credit2.c
line diff
     1.1 --- a/xen/common/sched_credit2.c	Fri Dec 24 08:31:24 2010 +0000
     1.2 +++ b/xen/common/sched_credit2.c	Fri Dec 24 08:31:54 2010 +0000
     1.3 @@ -1104,6 +1104,216 @@ out_up:
     1.4      return new_cpu;
     1.5  }
     1.6  
     1.7 +static void balance_load(const struct scheduler *ops, int cpu, s_time_t now)
     1.8 +{
     1.9 +    struct csched_private *prv = CSCHED_PRIV(ops);
    1.10 +    int i, max_delta_rqi = -1;
    1.11 +    struct list_head *push_iter, *pull_iter;
    1.12 +
    1.13 +    /* NB: Modified by consider() */
    1.14 +    s_time_t load_delta;
    1.15 +    struct csched_vcpu * best_push_svc=NULL, *best_pull_svc=NULL;
    1.16 +    /* NB: Read by consider() */
    1.17 +    struct csched_runqueue_data *lrqd;
    1.18 +    struct csched_runqueue_data *orqd;
    1.19 +    
    1.20 +    void consider(struct csched_vcpu *push_svc,
    1.21 +                  struct csched_vcpu *pull_svc)
    1.22 +    {
    1.23 +        s_time_t l_load, o_load, delta;
    1.24 +
    1.25 +        l_load = lrqd->b_avgload;
    1.26 +        o_load = orqd->b_avgload;
    1.27 +        if ( push_svc )
    1.28 +        {
    1.29 +            /* What happens to the load on both if we push? */
    1.30 +            l_load -= push_svc->avgload;
    1.31 +            o_load += push_svc->avgload;
    1.32 +        }
    1.33 +        if ( pull_svc )
    1.34 +        {
    1.35 +            /* What happens to the load on both if we pull? */
    1.36 +            l_load += pull_svc->avgload;
    1.37 +            o_load -= pull_svc->avgload;
    1.38 +        }
    1.39 +        
    1.40 +        delta = l_load - o_load;
    1.41 +        if ( delta < 0 )
    1.42 +            delta = -delta;
    1.43 +
    1.44 +        if ( delta < load_delta )
    1.45 +        {
    1.46 +            load_delta = delta;
    1.47 +            best_push_svc=push_svc;
    1.48 +            best_pull_svc=pull_svc;
    1.49 +        }
    1.50 +    }
    1.51 +
    1.52 +    void migrate(struct csched_vcpu *svc, struct csched_runqueue_data *trqd)
    1.53 +    {
    1.54 +        if ( test_bit(__CSFLAG_scheduled, &svc->flags) )
    1.55 +        {
    1.56 +            d2printk("d%dv%d %d-%d a\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id,
    1.57 +                     svc->rqd->id, trqd->id);
    1.58 +            /* It's running; mark it to migrate. */
    1.59 +            svc->migrate_rqd = trqd;
    1.60 +            set_bit(_VPF_migrating, &svc->vcpu->pause_flags);
    1.61 +            set_bit(__CSFLAG_runq_migrate_request, &svc->flags);
    1.62 +        }
    1.63 +        else
    1.64 +        {
    1.65 +            int on_runq=0;
    1.66 +            /* It's not running; just move it */
    1.67 +            d2printk("d%dv%d %d-%d i\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id,
    1.68 +                     svc->rqd->id, trqd->id);
    1.69 +            if ( __vcpu_on_runq(svc) )
    1.70 +            {
    1.71 +                __runq_remove(svc);
    1.72 +                update_load(ops, svc->rqd, svc, -1, now);
    1.73 +                on_runq=1;
    1.74 +            }
    1.75 +            __runq_deassign(svc);
    1.76 +            svc->vcpu->processor = first_cpu(trqd->active);
    1.77 +            __runq_assign(svc, trqd);
    1.78 +            if ( on_runq )
    1.79 +            {
    1.80 +                update_load(ops, svc->rqd, svc, 1, now);
    1.81 +                runq_insert(ops, svc->vcpu->processor, svc);
    1.82 +                runq_tickle(ops, svc->vcpu->processor, svc, now);
    1.83 +            }
    1.84 +        }
    1.85 +    }
    1.86 +                  
    1.87 +    
    1.88 +    /*
    1.89 +     * Basic algorithm: Push, pull, or swap.
    1.90 +     * - Find the runqueue with the furthest load distance
    1.91 +     * - Find a pair that makes the difference the least (where one
    1.92 +     * on either side may be empty).
    1.93 +     */
    1.94 +
    1.95 +    /* Locking:
    1.96 +     * - pcpu schedule lock should be already locked
    1.97 +     */
    1.98 +    lrqd = RQD(ops, cpu);
    1.99 +
   1.100 +    __update_runq_load(ops, lrqd, 0, now);
   1.101 +
   1.102 +retry:
   1.103 +    if ( !spin_trylock(&prv->lock) )
   1.104 +        return;
   1.105 +
   1.106 +    load_delta = 0;
   1.107 +
   1.108 +    for_each_cpu_mask(i, prv->active_queues)
   1.109 +    {
   1.110 +        s_time_t delta;
   1.111 +        
   1.112 +        orqd = prv->rqd + i;
   1.113 +
   1.114 +        if ( orqd == lrqd
   1.115 +             || !spin_trylock(&orqd->lock) )
   1.116 +            continue;
   1.117 +
   1.118 +        __update_runq_load(ops, orqd, 0, now);
   1.119 +    
   1.120 +        delta = lrqd->b_avgload - orqd->b_avgload;
   1.121 +        if ( delta < 0 )
   1.122 +            delta = -delta;
   1.123 +
   1.124 +        if ( delta > load_delta )
   1.125 +        {
   1.126 +            load_delta = delta;
   1.127 +            max_delta_rqi = i;
   1.128 +        }
   1.129 +
   1.130 +        spin_unlock(&orqd->lock);
   1.131 +    }
   1.132 +
   1.133 +    /* Minimize holding the big lock */
   1.134 +    spin_unlock(&prv->lock);
   1.135 +
   1.136 +    if ( max_delta_rqi == -1 )
   1.137 +        goto out;
   1.138 +
   1.139 +    /* Don't bother with load differences less than 25%. */
   1.140 +    if ( load_delta < (1ULL<<(prv->load_window_shift - 2)) )
   1.141 +        goto out;
   1.142 +
   1.143 +    /* Try to grab the other runqueue lock; if it's been taken in the
   1.144 +     * meantime, try the process over again.  This can't deadlock
   1.145 +     * because if it doesn't get any other rqd locks, it will simply
   1.146 +     * give up and return. */
   1.147 +    orqd = prv->rqd + max_delta_rqi;
   1.148 +    if ( !spin_trylock(&orqd->lock) )
   1.149 +        goto retry;
   1.150 +
   1.151 +    /* Make sure the runqueue hasn't been deactivated since we released prv->lock */
   1.152 +    if ( unlikely(orqd->id < 0) )
   1.153 +        goto out_up;
   1.154 +
   1.155 +    /* Look for "swap" which gives the best load average
   1.156 +     * FIXME: O(n^2)! */
   1.157 +
   1.158 +    /* Reuse load delta (as we're trying to minimize it) */
   1.159 +    list_for_each( push_iter, &lrqd->svc )
   1.160 +    {
   1.161 +        int inner_load_updated = 0;
   1.162 +        struct csched_vcpu * push_svc = list_entry(push_iter, struct csched_vcpu, rqd_elem);
   1.163 +
   1.164 +        __update_svc_load(ops, push_svc, 0, now);
   1.165 +
   1.166 +        /* Skip this one if it's already been flagged to migrate */
   1.167 +        if ( test_bit(__CSFLAG_runq_migrate_request, &push_svc->flags) )
   1.168 +            continue;
   1.169 +
   1.170 +        list_for_each( pull_iter, &orqd->svc )
   1.171 +        {
   1.172 +            struct csched_vcpu * pull_svc = list_entry(pull_iter, struct csched_vcpu, rqd_elem);
   1.173 +            
   1.174 +            if ( ! inner_load_updated )
   1.175 +            {
   1.176 +                __update_svc_load(ops, pull_svc, 0, now);
   1.177 +            }
   1.178 +        
   1.179 +            /* Skip this one if it's already been flagged to migrate */
   1.180 +            if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
   1.181 +                continue;
   1.182 +
   1.183 +            consider(push_svc, pull_svc);
   1.184 +        }
   1.185 +
   1.186 +        inner_load_updated = 1;
   1.187 +
   1.188 +        /* Consider push only */
   1.189 +        consider(push_svc, NULL);
   1.190 +    }
   1.191 +
   1.192 +    list_for_each( pull_iter, &orqd->svc )
   1.193 +    {
   1.194 +        struct csched_vcpu * pull_svc = list_entry(pull_iter, struct csched_vcpu, rqd_elem);
   1.195 +        
   1.196 +        /* Skip this one if it's already been flagged to migrate */
   1.197 +        if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
   1.198 +            continue;
   1.199 +
   1.200 +        /* Consider pull only */
   1.201 +        consider(NULL, pull_svc);
   1.202 +    }
   1.203 +
   1.204 +    /* OK, now we have some candidates; do the moving */
   1.205 +    if ( best_push_svc )
   1.206 +        migrate(best_push_svc, orqd);
   1.207 +    if ( best_pull_svc )
   1.208 +        migrate(best_pull_svc, lrqd);
   1.209 +
   1.210 +out_up:
   1.211 +    spin_unlock(&orqd->lock);
   1.212 +
   1.213 +out:
   1.214 +    return;
   1.215 +}
   1.216 +
   1.217  static int
   1.218  csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc)
   1.219  {
   1.220 @@ -1437,7 +1647,10 @@ csched_schedule(
   1.221  
   1.222          /* Check for the reset condition */
   1.223          if ( snext->credit <= CSCHED_CREDIT_RESET )
   1.224 +        {
   1.225              reset_credit(ops, cpu, now);
   1.226 +            balance_load(ops, cpu, now);
   1.227 +        }
   1.228  
   1.229          /* Clear the idle mask if necessary */
   1.230          if ( cpu_isset(cpu, rqd->idle) )
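
A side note on the threshold check above,
load_delta < (1ULL<<(prv->load_window_shift - 2)): the load averages
are kept in fixed point, and the in-code comment suggests that
(1ULL << load_window_shift) corresponds to a load of 1, so shifting by
two less gives a quarter of that unit, i.e. the "25%" mentioned in the
comment. A tiny sketch of the arithmetic, assuming a hypothetical
window shift of 18 (the real value is a scheduler parameter):

    /* Illustration only: how the "25% load difference" threshold falls
     * out of the fixed-point scale.  load_window_shift = 18 here is a
     * hypothetical example value, not necessarily what the scheduler
     * is actually configured with. */
    #include <stdio.h>

    int main(void)
    {
        unsigned int load_window_shift = 18;                              /* hypothetical */
        unsigned long long one_unit  = 1ULL << load_window_shift;         /* a load of 1.00 */
        unsigned long long threshold = 1ULL << (load_window_shift - 2);   /* a load of 0.25 */

        printf("one unit of load = %llu\n", one_unit);    /* 262144 */
        printf("threshold        = %llu (%.2f of a unit)\n",
               threshold, (double)threshold / one_unit);  /* 65536 (0.25) */
        return 0;
    }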