.dump_settings = csched_dump,
.init = csched_init,
};
+
+
+/*
+ * Boost Credit Scheduler (bcredit)
+ * Alternative credit scheduler optimized for client hypervisors
+ */
+
+/*
+ * Basic constants
+ */
+#define BCSCHED_DEFAULT_WEIGHT CSCHED_DEFAULT_WEIGHT
+#define BCSCHED_TICKS_PER_TSLICE CSCHED_TICKS_PER_TSLICE
+#define BCSCHED_TICKS_PER_ACCT CSCHED_TICKS_PER_ACCT
+#define BCSCHED_MSECS_PER_TICK CSCHED_MSECS_PER_TICK
+#define BCSCHED_MSECS_PER_TSLICE \
+ (BCSCHED_MSECS_PER_TICK * BCSCHED_TICKS_PER_TSLICE)
+#define BCSCHED_CREDITS_PER_TICK 10000
+#define BCSCHED_CREDITS_PER_TSLICE \
+ (BCSCHED_CREDITS_PER_TICK * BCSCHED_TICKS_PER_TSLICE)
+#define BCSCHED_CREDITS_PER_ACCT \
+ (BCSCHED_CREDITS_PER_TICK * BCSCHED_TICKS_PER_ACCT)
+#define BCSCHED_MSECS_BOOSTTSLICE_PER_CPU 2
+#define BCSCHED_NSECS_MIN_BOOST_TSLICE 500000
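+/*
+ * With the csched defaults these constants presumably inherit (10ms ticks,
+ * 3 ticks per tslice and per accounting period), a tslice is 30ms and worth
+ * 30000 credits, as is one accounting period.  The BOOST slice budget is
+ * 2ms per physical CPU, with a 0.5ms floor on the resulting slice.
+ */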
+
+/*
+ * Macros: recover the bcsched wrappers from their embedded csched structures
+ */
+#define svc_sbvc(_v) (container_of((_v), struct bcsched_vcpu, svc))
+#define sdom_sbdom(_d) (container_of((_d), struct bcsched_dom, sdom))
+
+/*
+ * Virtual CPU
+ */
+struct bcsched_vcpu {
+    struct csched_vcpu svc;              /* embedded credit-scheduler VCPU */
+    struct list_head inactive_vcpu_elem; /* link on bcsched_priv.inactive_vcpu */
+    s_time_t start_time;                 /* when this VCPU was last scheduled in */
+    atomic_t boost_credit;               /* credits consumable at BOOST priority */
+};
+
+/*
+ * Domain
+ */
+struct bcsched_dom {
+    struct csched_dom sdom;    /* embedded credit-scheduler domain */
+    uint16_t boost_ratio;      /* percentage of a physical CPU reserved as boost credits */
+    uint16_t max_boost_period; /* cap on accumulated boost credit, in ms (0 = no boost) */
+};
+
+/*
+ * System-wide private data
+ */
+struct bcsched_private {
+    struct list_head inactive_vcpu; /* VCPUs currently excluded from accounting */
+    uint32_t nvcpus;                /* non-idle VCPUs managed by this scheduler */
+    s_time_t boost_tslice;          /* time slice granted at BOOST priority */
+    uint32_t boost_credit;          /* boost credits reserved by active domains */
+    uint16_t total_boost_ratio;     /* sum of boost_ratio over all domains */
+};
+
+/*
+ * Global variables
+ */
+static struct bcsched_private bcsched_priv;
+
+/* opt_bcsched_tslice: per-CPU time slice budget (in ms) for BOOST priority */
+static unsigned int opt_bcsched_tslice = BCSCHED_MSECS_BOOSTTSLICE_PER_CPU;
+integer_param("bcsched_tslice", opt_bcsched_tslice);
+
+static void bcsched_tick(void *_cpu);
+
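+/*
+ * Per-PCPU initialisation.  Same shape as csched_pcpu_init(), but the
+ * periodic ticker is pointed at bcsched_tick() and the system-wide credit
+ * pool is grown by BCSCHED_CREDITS_PER_ACCT.
+ */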
+static int
+bcsched_pcpu_init(int cpu)
+{
+ struct csched_pcpu *spc;
+ unsigned long flags;
+
+ /* Allocate per-PCPU info */
+ spc = xmalloc(struct csched_pcpu);
+ if ( spc == NULL )
+ return -1;
+
+ spin_lock_irqsave(&csched_priv.lock, flags);
+
+ /* Initialize/update system-wide config */
+ csched_priv.credit += BCSCHED_CREDITS_PER_ACCT;
+ if ( csched_priv.ncpus <= cpu )
+ csched_priv.ncpus = cpu + 1;
+ if ( csched_priv.master >= csched_priv.ncpus )
+ csched_priv.master = cpu;
+
+ init_timer(&spc->ticker, bcsched_tick, (void *)(unsigned long)cpu, cpu);
+ INIT_LIST_HEAD(&spc->runq);
+ spc->runq_sort_last = csched_priv.runq_sort;
+ per_cpu(schedule_data, cpu).sched_priv = spc;
+
+ /* Start off idling... */
+ BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
+ cpu_set(cpu, csched_priv.idlers);
+
+ spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+ return 0;
+}
+
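+/*
+ * Move a VCPU from the inactive list onto its domain's active list.  When
+ * this activates the domain, its weight and its share of boost credits are
+ * added to the system-wide totals.  Caller must hold csched_priv.lock.
+ */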
+static inline void
+__bcsched_vcpu_acct_start_locked(struct csched_vcpu *svc)
+{
+ struct csched_dom * const sdom = svc->sdom;
+ struct bcsched_vcpu * const sbvc = svc_sbvc(svc);
+ struct bcsched_dom * const sbdom = sdom_sbdom(sdom);
+
+ CSCHED_VCPU_STAT_CRANK(svc, state_active);
+ CSCHED_STAT_CRANK(acct_vcpu_active);
+
+ sdom->active_vcpu_count++;
+ list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
+ list_del_init(&sbvc->inactive_vcpu_elem);
+ if ( list_empty(&sdom->active_sdom_elem) )
+ {
+ list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
+ csched_priv.weight += sdom->weight;
+ bcsched_priv.boost_credit += (sbdom->boost_ratio *
+ BCSCHED_CREDITS_PER_TSLICE) / 100;
+ }
+}
+
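+/*
+ * Inverse of __bcsched_vcpu_acct_start_locked(): park the VCPU on the
+ * inactive list and, if the domain has no active VCPUs left, retire its
+ * weight and boost credit share.  Caller must hold csched_priv.lock.
+ */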
+static inline void
+__bcsched_vcpu_acct_stop_locked(struct csched_vcpu *svc)
+{
+ struct csched_dom * const sdom = svc->sdom;
+ struct bcsched_vcpu * const sbvc = svc_sbvc(svc);
+ struct bcsched_dom * const sbdom = sdom_sbdom(sdom);
+
+ BUG_ON( list_empty(&svc->active_vcpu_elem) );
+
+ CSCHED_VCPU_STAT_CRANK(svc, state_idle);
+ CSCHED_STAT_CRANK(acct_vcpu_idle);
+
+ sdom->active_vcpu_count--;
+ list_del_init(&svc->active_vcpu_elem);
+ list_add(&sbvc->inactive_vcpu_elem, &bcsched_priv.inactive_vcpu);
+ if ( list_empty(&sdom->active_vcpu) )
+ {
+ BUG_ON( csched_priv.weight < sdom->weight );
+ list_del_init(&sdom->active_sdom_elem);
+ csched_priv.weight -= sdom->weight;
+ bcsched_priv.boost_credit -= (sbdom->boost_ratio *
+ BCSCHED_CREDITS_PER_TSLICE) / 100;
+ }
+}
+
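+/*
+ * Per-tick accounting hook for the running VCPU: it only checks whether the
+ * VCPU would be better off on another CPU; activation of inactive VCPUs is
+ * deferred to the inactive-list scan in bcsched_acct().
+ */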
+static void
+bcsched_vcpu_acct(unsigned int cpu)
+{
+ ASSERT( current->processor == cpu );
+ ASSERT( CSCHED_VCPU(current)->sdom != NULL );
+
+ /*
+ * If it's been active a while, check if we'd be better off
+ * migrating it to run elsewhere (see multi-core and multi-thread
+ * support in csched_cpu_pick()).
+ */
+ if ( csched_cpu_pick(current) != cpu )
+ {
+ CSCHED_VCPU_STAT_CRANK(CSCHED_VCPU(current), migrate_r);
+ CSCHED_STAT_CRANK(migrate_running);
+        set_bit(_VPF_migrating, &current->pause_flags);
+ cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
+ }
+}
+
+static int
+bcsched_vcpu_init(struct vcpu *vc)
+{
+ struct domain * const dom = vc->domain;
+ struct csched_dom *sdom = CSCHED_DOM(dom);
+ struct bcsched_vcpu *sbvc;
+ struct csched_vcpu *svc;
+ unsigned long flags;
+
+ CSCHED_STAT_CRANK(vcpu_init);
+
+ /* Allocate per-VCPU info */
+ sbvc = xmalloc(struct bcsched_vcpu);
+ if ( sbvc == NULL )
+ return -1;
+ svc = &(sbvc->svc);
+
+ INIT_LIST_HEAD(&svc->runq_elem);
+ INIT_LIST_HEAD(&svc->active_vcpu_elem);
+ INIT_LIST_HEAD(&sbvc->inactive_vcpu_elem);
+ svc->sdom = sdom;
+ svc->vcpu = vc;
+ atomic_set(&svc->credit, 0);
+ svc->flags = 0U;
+ svc->pri = is_idle_domain(dom) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
+ CSCHED_VCPU_STATS_RESET(svc);
+ vc->sched_priv = svc;
+ atomic_set(&sbvc->boost_credit, 0);
+
+ /* Allocate per-PCPU info */
+ if ( unlikely(!CSCHED_PCPU(vc->processor)) )
+ {
+ if ( bcsched_pcpu_init(vc->processor) != 0 )
+ return -1;
+ }
+
+    /* Add to the inactive queue so that accounting can pick this VCPU up */
+ if ( !is_idle_vcpu(vc) )
+ {
+ uint32_t vcpus_per_cpu;
+
+ spin_lock_irqsave(&csched_priv.lock, flags);
+
+ list_add(&sbvc->inactive_vcpu_elem, &bcsched_priv.inactive_vcpu);
+
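+        /*
+         * Recompute the BOOST time slice: vcpus_per_cpu is the number of
+         * VCPUs per physical CPU beyond the first (rounded up).  With no
+         * overcommit a full tslice is used; otherwise the per-CPU budget
+         * (opt_bcsched_tslice) is split among the extra VCPUs, bounded
+         * below by BCSCHED_NSECS_MIN_BOOST_TSLICE.
+         */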
+ bcsched_priv.nvcpus++;
+ vcpus_per_cpu = ( (bcsched_priv.nvcpus + (csched_priv.ncpus-1)) /
+ csched_priv.ncpus
+ ) - 1;
+ if ( vcpus_per_cpu == 0 )
+ bcsched_priv.boost_tslice = MILLISECS(BCSCHED_MSECS_PER_TSLICE);
+ else
+ {
+ bcsched_priv.boost_tslice = MILLISECS(opt_bcsched_tslice) /
+ vcpus_per_cpu;
+ if ( bcsched_priv.boost_tslice < BCSCHED_NSECS_MIN_BOOST_TSLICE )
+ bcsched_priv.boost_tslice = BCSCHED_NSECS_MIN_BOOST_TSLICE;
+ }
+
+ spin_unlock_irqrestore(&csched_priv.lock, flags);
+ }
+
+ CSCHED_VCPU_CHECK(vc);
+ return 0;
+}
+
+static void
+bcsched_vcpu_destroy(struct vcpu *vc)
+{
+ struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+ struct bcsched_vcpu * const sbvc = svc_sbvc(svc);
+ struct csched_dom * const sdom = svc->sdom;
+ unsigned long flags;
+
+ CSCHED_STAT_CRANK(vcpu_destroy);
+
+ BUG_ON( sdom == NULL );
+ BUG_ON( !list_empty(&svc->runq_elem) );
+
+ spin_lock_irqsave(&csched_priv.lock, flags);
+
+ if ( !list_empty(&svc->active_vcpu_elem) )
+ __bcsched_vcpu_acct_stop_locked(svc);
+
+ if ( !list_empty(&sbvc->inactive_vcpu_elem) )
+ list_del_init(&sbvc->inactive_vcpu_elem);
+
+ if ( !is_idle_vcpu(vc) )
+ {
+ uint32_t vcpus_per_cpu;
+
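+        /*
+         * Recompute the BOOST time slice for the reduced VCPU count
+         * (same formula as in bcsched_vcpu_init()).
+         */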
+ bcsched_priv.nvcpus--;
+ vcpus_per_cpu = ( (bcsched_priv.nvcpus + (csched_priv.ncpus-1)) /
+ csched_priv.ncpus
+ ) - 1;
+ if ( vcpus_per_cpu == 0 )
+ bcsched_priv.boost_tslice = MILLISECS(BCSCHED_MSECS_PER_TSLICE);
+ else
+ {
+ bcsched_priv.boost_tslice = MILLISECS(opt_bcsched_tslice) /
+ vcpus_per_cpu;
+ if ( bcsched_priv.boost_tslice < BCSCHED_NSECS_MIN_BOOST_TSLICE )
+ bcsched_priv.boost_tslice = BCSCHED_NSECS_MIN_BOOST_TSLICE;
+ }
+ }
+
+ spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+ xfree(sbvc);
+}
+
+static int
+bcsched_dom_cntl(
+ struct domain *d,
+ struct xen_domctl_scheduler_op *op)
+{
+ struct csched_dom * const sdom = CSCHED_DOM(d);
+ struct bcsched_dom * const sbdom = sdom_sbdom(sdom);
+ unsigned long flags;
+
+ if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
+ {
+ op->u.bcredit.weight = sdom->weight;
+ op->u.bcredit.cap = sdom->cap;
+ op->u.bcredit.max_boost_period = sbdom->max_boost_period;
+ op->u.bcredit.boost_ratio = sbdom->boost_ratio;
+ }
+ else
+ {
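+        /*
+         * A field of ~0U means "leave unchanged".  Weight and boost_ratio
+         * are mutually exclusive: a domain with a non-zero boost_ratio runs
+         * with weight 0, and clearing boost_ratio restores the default
+         * weight if none was set.
+         */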
+ uint16_t weight = (uint16_t)~0U;
+
+ ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);
+
+ spin_lock_irqsave(&csched_priv.lock, flags);
+
+ if ( (op->u.bcredit.weight != 0) &&
+ (sbdom->boost_ratio == 0 || op->u.bcredit.boost_ratio == 0) )
+ {
+ weight = op->u.bcredit.weight;
+ }
+
+ if ( op->u.bcredit.cap != (uint16_t)~0U )
+ sdom->cap = op->u.bcredit.cap;
+
+ if ( (op->u.bcredit.max_boost_period != (uint16_t)~0U) &&
+ (op->u.bcredit.max_boost_period >= BCSCHED_MSECS_PER_TSLICE ||
+ op->u.bcredit.max_boost_period == 0) )
+ {
+ sbdom->max_boost_period = op->u.bcredit.max_boost_period;
+ }
+
+ if ( (op->u.bcredit.boost_ratio != (uint16_t)~0U) &&
+ ((bcsched_priv.total_boost_ratio - sbdom->boost_ratio +
+ op->u.bcredit.boost_ratio) <= 100 * csched_priv.ncpus) &&
+ (sbdom->max_boost_period || op->u.bcredit.boost_ratio == 0) )
+ {
+            uint32_t new_bc, old_bc;    /* wide enough for ratio * credits / 100 */
+
+ new_bc = ( op->u.bcredit.boost_ratio *
+ BCSCHED_CREDITS_PER_TSLICE ) / 100;
+ old_bc = ( sbdom->boost_ratio *
+ BCSCHED_CREDITS_PER_TSLICE ) / 100;
+
+ bcsched_priv.total_boost_ratio -= sbdom->boost_ratio;
+ bcsched_priv.total_boost_ratio += op->u.bcredit.boost_ratio;
+
+ sbdom->boost_ratio = op->u.bcredit.boost_ratio;
+
+ if ( !list_empty(&sdom->active_sdom_elem) )
+ {
+ bcsched_priv.boost_credit -= old_bc;
+ bcsched_priv.boost_credit += new_bc;
+ }
+ if ( new_bc == 0 )
+ {
+ if ( sdom->weight == 0 )
+ weight = BCSCHED_DEFAULT_WEIGHT;
+ }
+ else
+ weight = 0;
+ }
+
+ if ( weight != (uint16_t)~0U )
+ {
+ if ( !list_empty(&sdom->active_sdom_elem) )
+ {
+ csched_priv.weight -= sdom->weight;
+ csched_priv.weight += weight;
+ }
+ sdom->weight = weight;
+ }
+
+ spin_unlock_irqrestore(&csched_priv.lock, flags);
+ }
+
+ return 0;
+}
+
+static int
+bcsched_dom_init(struct domain *dom)
+{
+ struct csched_dom *sdom;
+ struct bcsched_dom *sbdom;
+
+ CSCHED_STAT_CRANK(dom_init);
+
+ if ( is_idle_domain(dom) )
+ return 0;
+
+ sbdom = xmalloc(struct bcsched_dom);
+ if ( sbdom == NULL )
+ return -ENOMEM;
+ sdom = &(sbdom->sdom);
+
+    /* Initialize credit and weight */
+ INIT_LIST_HEAD(&sdom->active_vcpu);
+ sdom->active_vcpu_count = 0;
+ INIT_LIST_HEAD(&sdom->active_sdom_elem);
+ sdom->dom = dom;
+ sdom->weight = BCSCHED_DEFAULT_WEIGHT;
+ sdom->cap = 0U;
+ sbdom->boost_ratio = 0U;
+ sbdom->max_boost_period = 0;
+ dom->sched_priv = sdom;
+
+ return 0;
+}
+
+static void
+bcsched_dom_destroy(struct domain *dom)
+{
+ CSCHED_STAT_CRANK(dom_destroy);
+ xfree(sdom_sbdom(CSCHED_DOM(dom)));
+}
+
+/*
+ * This is an O(n) optimized sort of the runq.
+ *
+ * Time-share VCPUs can only be one of three priorities, BOOST, UNDER or OVER.
+ * We walk through the runq and move up any BOOSTs that are preceded by UNDERs
+ * or OVERs, and any UNDERs that are preceded by OVERs. We remember the last
+ * BOOST and UNDER to make the move up operation O(1).
+ */
+static void
+bcsched_runq_sort(unsigned int cpu)
+{
+ struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
+ struct list_head *runq, *elem, *next, *last_boost, *last_under;
+ struct csched_vcpu *svc_elem;
+ unsigned long flags;
+ int sort_epoch;
+
+ sort_epoch = csched_priv.runq_sort;
+ if ( sort_epoch == spc->runq_sort_last )
+ return;
+
+ spc->runq_sort_last = sort_epoch;
+
+ spin_lock_irqsave(&per_cpu(schedule_data, cpu).schedule_lock, flags);
+
+ runq = &spc->runq;
+ elem = runq->next;
+ last_boost = last_under = runq;
+ while ( elem != runq )
+ {
+ next = elem->next;
+ svc_elem = __runq_elem(elem);
+
+ if ( svc_elem->pri == CSCHED_PRI_TS_BOOST )
+ {
+ /* does elem need to move up the runq? */
+ if ( elem->prev != last_boost )
+ {
+ list_del(elem);
+ list_add(elem, last_boost);
+ }
+ if ( last_boost == last_under )
+ last_under = elem;
+ last_boost = elem;
+ }
+ else if ( svc_elem->pri == CSCHED_PRI_TS_UNDER )
+ {
+ /* does elem need to move up the runq? */
+ if ( elem->prev != last_under )
+ {
+ list_del(elem);
+ list_add(elem, last_under);
+ }
+ last_under = elem;
+ }
+
+ elem = next;
+ }
+
+ spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
+}
+
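+/*
+ * System-wide accounting, run by the master CPU every BCSCHED_TICKS_PER_ACCT
+ * ticks.  Activates inactive VCPUs that have started consuming credit, hands
+ * out fair-share and boost credits, and recomputes priorities.
+ */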
+static void
+bcsched_acct(void)
+{
+ unsigned long flags;
+ struct list_head *iter_vcpu, *next_vcpu;
+ struct list_head *iter_sdom, *next_sdom;
+ struct bcsched_vcpu *sbvc;
+ struct bcsched_dom *sbdom;
+ struct csched_vcpu *svc;
+ struct csched_dom *sdom;
+ uint32_t credit_total;
+ uint32_t weight_total;
+ uint32_t bc_total;
+ uint32_t weight_left;
+ uint32_t credit_fair;
+ uint32_t credit_peak;
+ uint32_t credit_cap;
+ uint32_t bc_fair;
+ int credit_balance;
+ int credit_xtra;
+ int credit;
+ int boost_credit;
+ int max_boost_credit;
+ int64_t c_sum, bc_sum;
+ int c_average, bc_average;
+
+ spin_lock_irqsave(&csched_priv.lock, flags);
+
+    /* Activate a VCPU once it has consumed at least one tick's worth of boost credit or credit */
+ list_for_each_safe( iter_vcpu, next_vcpu, &bcsched_priv.inactive_vcpu )
+ {
+ sbvc = list_entry(iter_vcpu, struct bcsched_vcpu, inactive_vcpu_elem);
+ svc = &(sbvc->svc);
+ sbdom = sdom_sbdom(svc->sdom);
+
+ max_boost_credit = sbdom->max_boost_period *
+ (BCSCHED_CREDITS_PER_TSLICE/BCSCHED_MSECS_PER_TSLICE);
+ if ( (atomic_read(&sbvc->boost_credit)
+ <= (max_boost_credit-BCSCHED_CREDITS_PER_TICK)) ||
+ (atomic_read(&svc->credit)
+ <= BCSCHED_CREDITS_PER_TICK*(BCSCHED_TICKS_PER_ACCT-1)) )
+ {
+ __bcsched_vcpu_acct_start_locked(svc);
+ }
+ }
+
+ weight_total = csched_priv.weight;
+ credit_total = csched_priv.credit;
+ bc_total = bcsched_priv.boost_credit;
+
+ /* Converge balance towards 0 when it drops negative */
+ if ( csched_priv.credit_balance < 0 )
+ {
+ credit_total -= csched_priv.credit_balance;
+ CSCHED_STAT_CRANK(acct_balance);
+ }
+
+ if ( unlikely(weight_total == 0 && bc_total == 0) )
+ {
+ csched_priv.credit_balance = 0;
+ spin_unlock_irqrestore(&csched_priv.lock, flags);
+ CSCHED_STAT_CRANK(acct_no_work);
+ return;
+ }
+
+ CSCHED_STAT_CRANK(acct_run);
+
+ weight_left = weight_total;
+ credit_balance = 0;
+ credit_xtra = 0;
+ credit_cap = 0U;
+
+    /* First, reserve the boost credits: take them out of the weight-shared pool and count them into the balance. */
+ if ( bc_total != 0 )
+ {
+ credit_total -= bc_total;
+ credit_balance += bc_total;
+ }
+
+    /* Avoid a divide-by-zero error */
+ if ( weight_total == 0 )
+ weight_total = 1;
+
+ list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
+ {
+ sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
+ sbdom = sdom_sbdom(sdom);
+
+ BUG_ON( is_idle_domain(sdom->dom) );
+ BUG_ON( sdom->active_vcpu_count == 0 );
+ BUG_ON( sdom->weight > weight_left );
+
+ max_boost_credit = sbdom->max_boost_period *
+ (BCSCHED_CREDITS_PER_TSLICE/BCSCHED_MSECS_PER_TSLICE);
+ c_sum = bc_sum = 0;
+ list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
+ {
+ svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
+ sbvc = svc_sbvc(svc);
+
+ BUG_ON( sdom != svc->sdom );
+
+ c_sum += atomic_read(&svc->credit);
+ bc_sum += atomic_read(&sbvc->boost_credit);
+ }
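+        /*
+         * Average (rounding up) credit and boost credit over the domain's
+         * active VCPUs; the averages are applied to each VCPU below so that
+         * sibling VCPUs stay balanced.
+         */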
+ c_average = ( c_sum + ( sdom->active_vcpu_count - 1 )
+ ) / sdom->active_vcpu_count;
+ bc_average = ( bc_sum + ( sdom->active_vcpu_count - 1 )
+ ) / sdom->active_vcpu_count;
+
+ weight_left -= sdom->weight;
+
+ /*
+ * A domain's fair share is computed using its weight in competition
+ * with that of all other active domains.
+ *
+ * At most, a domain can use credits to run all its active VCPUs
+ * for one full accounting period. We allow a domain to earn more
+ * only when the system-wide credit balance is negative.
+ */
+ credit_peak = sdom->active_vcpu_count * BCSCHED_CREDITS_PER_ACCT;
+ if ( csched_priv.credit_balance < 0 )
+ {
+ credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
+ (weight_total - 1)
+ ) / weight_total;
+ }
+
+ if ( sdom->cap != 0U )
+ {
+ credit_cap = ((sdom->cap * BCSCHED_CREDITS_PER_ACCT) + 99) / 100;
+ if ( credit_cap < credit_peak )
+ credit_peak = credit_cap;
+
+ credit_cap = ( credit_cap + ( sdom->active_vcpu_count - 1 )
+ ) / sdom->active_vcpu_count;
+ }
+
+ credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
+ ) / weight_total;
+
+ if ( credit_fair < credit_peak )
+ {
+ /* credit_fair is 0 if weight is 0. */
+ if ( sdom->weight != 0 )
+ credit_xtra = 1;
+ }
+ else
+ {
+ if ( weight_left != 0U )
+ {
+ /* Give other domains a chance at unused credits */
+ credit_total += ( ( ( credit_fair - credit_peak
+ ) * weight_total
+ ) + ( weight_left - 1 )
+ ) / weight_left;
+ }
+
+ if ( credit_xtra )
+ {
+ /*
+ * Lazily keep domains with extra credits at the head of
+ * the queue to give others a chance at them in future
+ * accounting periods.
+ */
+ CSCHED_STAT_CRANK(acct_reorder);
+ list_del(&sdom->active_sdom_elem);
+ list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
+ }
+
+ credit_fair = credit_peak;
+ }
+
+ /* Compute fair share per VCPU */
+ credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
+ ) / sdom->active_vcpu_count;
+
+ /* Compute fair share of boost_credit per VCPU */
+ bc_fair = ( ((sbdom->boost_ratio * BCSCHED_CREDITS_PER_ACCT)/100) +
+ (sdom->active_vcpu_count - 1)
+ ) / sdom->active_vcpu_count;
+
+ list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
+ {
+ svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
+ sbvc = svc_sbvc(svc);
+
+ BUG_ON( sdom != svc->sdom );
+
+            /* Even out credit and boost credit across the domain's active VCPUs */
+ credit = atomic_read(&svc->credit);
+ atomic_add(c_average - credit, &svc->credit);
+ boost_credit = atomic_read(&sbvc->boost_credit);
+ atomic_add(bc_average - boost_credit, &sbvc->boost_credit);
+ boost_credit = atomic_read(&sbvc->boost_credit);
+ if ( sbdom->boost_ratio != 0 )
+ {
+ /* Increment boost credit */
+ atomic_add(bc_fair, &sbvc->boost_credit);
+ boost_credit = atomic_read(&sbvc->boost_credit);
+
+ /*
+ * Upper bound on boost credits.
+ * Add excess to credit.
+ */
+ if ( boost_credit > max_boost_credit )
+ {
+ atomic_add(boost_credit - max_boost_credit, &svc->credit);
+ atomic_set(&sbvc->boost_credit, max_boost_credit);
+ boost_credit = atomic_read(&sbvc->boost_credit);
+ }
+ /*
+ * If credit is negative,
+ * boost credits compensate credit.
+ */
+ credit = atomic_read(&svc->credit);
+ if ( credit < 0 && boost_credit > 0 )
+ {
+ if ( boost_credit > -credit )
+ {
+ atomic_sub(-credit, &sbvc->boost_credit);
+ atomic_add(-credit, &svc->credit);
+ }
+ else
+ {
+ atomic_sub(boost_credit, &sbvc->boost_credit);
+ atomic_add(boost_credit, &svc->credit);
+ }
+ boost_credit = atomic_read(&sbvc->boost_credit);
+ }
+ }
+
+ /* Increment credit */
+ atomic_add(credit_fair, &svc->credit);
+ credit = atomic_read(&svc->credit);
+
+ /*
+ * Recompute priority or, if VCPU is idling, remove it from
+ * the active list.
+ */
+ if ( credit < 0 )
+ {
+ svc->pri = CSCHED_PRI_TS_OVER;
+
+ /* Park running VCPUs of capped-out domains */
+ if ( sdom->cap != 0U &&
+ credit < -credit_cap &&
+ !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
+ {
+ CSCHED_STAT_CRANK(vcpu_park);
+ vcpu_pause_nosync(svc->vcpu);
+ svc->flags |= CSCHED_FLAG_VCPU_PARKED;
+ }
+
+ /* Lower bound on credits */
+ if ( credit < -BCSCHED_CREDITS_PER_TSLICE )
+ {
+ CSCHED_STAT_CRANK(acct_min_credit);
+ credit = -BCSCHED_CREDITS_PER_TSLICE;
+ atomic_set(&svc->credit, credit);
+ }
+ }
+ else
+ {
+ if ( boost_credit <= 0 )
+ svc->pri = CSCHED_PRI_TS_UNDER;
+ else
+ svc->pri = CSCHED_PRI_TS_BOOST;
+
+ /* Unpark any capped domains whose credits go positive */
+ if ( svc->flags & CSCHED_FLAG_VCPU_PARKED)
+ {
+ /*
+ * It's important to unset the flag AFTER the unpause()
+ * call to make sure the VCPU's priority is not boosted
+ * if it is woken up here.
+ */
+ CSCHED_STAT_CRANK(vcpu_unpark);
+ vcpu_unpause(svc->vcpu);
+ svc->flags &= ~CSCHED_FLAG_VCPU_PARKED;
+ }
+
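+                /*
+                 * Spill credit above one tslice into boost credit; once the
+                 * boost cap is also full this VCPU has clearly been idling,
+                 * so stop accounting for it.
+                 */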
+ if ( credit > BCSCHED_CREDITS_PER_TSLICE )
+ {
+ atomic_add(credit - BCSCHED_CREDITS_PER_TSLICE,
+ &sbvc->boost_credit);
+ boost_credit = atomic_read(&sbvc->boost_credit);
+ credit = BCSCHED_CREDITS_PER_TSLICE;
+ atomic_set(&svc->credit, credit);
+
+ if ( boost_credit > max_boost_credit )
+ {
+ atomic_set(&sbvc->boost_credit, max_boost_credit);
+ __bcsched_vcpu_acct_stop_locked(svc);
+ }
+ }
+ }
+
+ if ( sbdom->boost_ratio == 0 )
+ {
+ CSCHED_VCPU_STAT_SET(svc, credit_last, credit);
+ CSCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair);
+ credit_balance += credit;
+ }
+ else
+ {
+ CSCHED_VCPU_STAT_SET(svc, credit_last, boost_credit);
+ CSCHED_VCPU_STAT_SET(svc, credit_incr, bc_fair);
+ }
+ }
+ }
+
+ csched_priv.credit_balance = credit_balance;
+
+ spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+ /* Inform each CPU that its runq needs to be sorted */
+ csched_priv.runq_sort++;
+}
+
+static void
+bcsched_tick(void *_cpu)
+{
+ unsigned int cpu = (unsigned long)_cpu;
+ struct csched_pcpu *spc = CSCHED_PCPU(cpu);
+
+ spc->tick++;
+
+ /*
+ * Accounting for running VCPU
+ */
+ if ( !is_idle_vcpu(current) )
+ bcsched_vcpu_acct(cpu);
+
+ /*
+ * Host-wide accounting duty
+ *
+ * Note: Currently, this is always done by the master boot CPU. Eventually,
+ * we could distribute or at the very least cycle the duty.
+ */
+ if ( (csched_priv.master == cpu) &&
+ (spc->tick % BCSCHED_TICKS_PER_ACCT) == 0 )
+ {
+ bcsched_acct();
+ }
+
+ /*
+ * Check if runq needs to be sorted
+ *
+ * Every physical CPU resorts the runq after the accounting master has
+ * modified priorities. This is a special O(n) sort and runs at most
+ * once per accounting period (currently 30 milliseconds).
+ */
+ bcsched_runq_sort(cpu);
+
+ set_timer(&spc->ticker, NOW() + MILLISECS(BCSCHED_MSECS_PER_TICK));
+}
+
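+/*
+ * Main scheduling decision: charge the VCPU that has just run for its CPU
+ * time, then hand over to the credit scheduler to pick the next VCPU.
+ */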
+static struct task_slice
+bcsched_schedule(s_time_t now)
+{
+ struct csched_vcpu *svc = CSCHED_VCPU(current);
+ struct bcsched_vcpu *sbvc = svc_sbvc(svc);
+ s_time_t passed = now - sbvc->start_time;
+ int consumed;
+ int boost_credit;
+ struct task_slice ret;
+
+    /*
+     * Update credit: one credit corresponds to
+     * MILLISECS(BCSCHED_MSECS_PER_TSLICE) / BCSCHED_CREDITS_PER_TSLICE
+     * nanoseconds of run time (rounded up).
+     */
+ consumed = ( passed +
+ (MILLISECS(BCSCHED_MSECS_PER_TSLICE) /
+ BCSCHED_CREDITS_PER_TSLICE - 1)
+ ) / (MILLISECS(BCSCHED_MSECS_PER_TSLICE) /
+ BCSCHED_CREDITS_PER_TSLICE);
+ if ( svc->pri == CSCHED_PRI_TS_BOOST )
+ {
+ boost_credit = atomic_read(&sbvc->boost_credit);
+ if ( boost_credit > consumed )
+ {
+ atomic_sub(consumed, &sbvc->boost_credit);
+ consumed = 0;
+ }
+ else
+ {
+ atomic_sub(boost_credit, &sbvc->boost_credit);
+ consumed -= boost_credit;
+ svc->pri = CSCHED_PRI_TS_UNDER;
+ }
+ }
+ if ( consumed > 0 && !is_idle_vcpu(current) )
+ atomic_sub(consumed, &svc->credit);
+
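+    /*
+     * csched_schedule() makes the actual pick; if the chosen VCPU is at
+     * BOOST priority it only gets the (shorter) boost time slice.
+     */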
+ ret = csched_schedule(now);
+
+ svc = CSCHED_VCPU(ret.task);
+ if ( svc->pri == CSCHED_PRI_TS_BOOST )
+ ret.time = bcsched_priv.boost_tslice;
+
+ sbvc = svc_sbvc(svc);
+ sbvc->start_time = now;
+
+ return ret;
+}
+
+static void
+bcsched_dump_vcpu(struct csched_vcpu *svc)
+{
+ struct bcsched_vcpu * const sbvc = svc_sbvc(svc);
+
+ csched_dump_vcpu(svc);
+
+ if ( svc->sdom )
+ {
+ struct bcsched_dom * const sbdom = sdom_sbdom(svc->sdom);
+
+ printk("\t bc=%i [bc=%i]\n",
+ atomic_read(&sbvc->boost_credit),
+ sbdom->boost_ratio * BCSCHED_CREDITS_PER_TSLICE / 100);
+ }
+}
+
+static void
+bcsched_dump(void)
+{
+ struct list_head *iter_sdom, *iter_svc;
+ int loop;
+ char idlers_buf[100];
+
+ printk("info:\n"
+ "\tncpus = %u\n"
+ "\tmaster = %u\n"
+ "\tcredit = %u\n"
+ "\tcredit balance = %d\n"
+ "\tweight = %u\n"
+ "\trunq_sort = %u\n"
+ "\tboost_tslice = %"PRId64"\n"
+ "\tboost_credit = %u\n"
+ "\ttotal_boost_ratio = %u\n"
+ "\tdefault-weight = %d\n"
+ "\tmsecs per tick = %dms\n"
+ "\tcredits per tick = %d\n"
+ "\tticks per tslice = %d\n"
+ "\tticks per acct = %d\n",
+ csched_priv.ncpus,
+ csched_priv.master,
+ csched_priv.credit,
+ csched_priv.credit_balance,
+ csched_priv.weight,
+ csched_priv.runq_sort,
+ bcsched_priv.boost_tslice,
+ bcsched_priv.boost_credit,
+ bcsched_priv.total_boost_ratio,
+ CSCHED_DEFAULT_WEIGHT,
+ BCSCHED_MSECS_PER_TICK,
+ BCSCHED_CREDITS_PER_TICK,
+ BCSCHED_TICKS_PER_TSLICE,
+ BCSCHED_TICKS_PER_ACCT);
+
+ cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), csched_priv.idlers);
+ printk("idlers: %s\n", idlers_buf);
+
+ CSCHED_STATS_PRINTK();
+
+ printk("active vcpus:\n");
+ loop = 0;
+ list_for_each( iter_sdom, &csched_priv.active_sdom )
+ {
+ struct csched_dom *sdom;
+ sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
+
+ list_for_each( iter_svc, &sdom->active_vcpu )
+ {
+ struct csched_vcpu *svc;
+ svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem);
+
+ printk("\t%3d: ", ++loop);
+ bcsched_dump_vcpu(svc);
+ }
+ }
+
+ printk("inactive vcpus:\n");
+ loop = 0;
+ list_for_each( iter_svc, &bcsched_priv.inactive_vcpu )
+ {
+ struct bcsched_vcpu *sbvc;
+ sbvc = list_entry(iter_svc, struct bcsched_vcpu, inactive_vcpu_elem);
+
+ printk("\t%3d: ", ++loop);
+ bcsched_dump_vcpu(&sbvc->svc);
+ }
+}
+
+static void
+bcsched_init(void)
+{
+ csched_init();
+
+ INIT_LIST_HEAD(&bcsched_priv.inactive_vcpu);
+ bcsched_priv.boost_tslice = MILLISECS(BCSCHED_MSECS_PER_TSLICE);
+ bcsched_priv.boost_credit = 0;
+ bcsched_priv.total_boost_ratio = 0;
+}
+
+
+struct scheduler sched_bcredit_def = {
+ .name = "SMP Credit Scheduler for client side",
+ .opt_name = "bcredit",
+ .sched_id = XEN_SCHEDULER_BCREDIT,
+
+ .init_domain = bcsched_dom_init,
+ .destroy_domain = bcsched_dom_destroy,
+
+ .init_vcpu = bcsched_vcpu_init,
+ .destroy_vcpu = bcsched_vcpu_destroy,
+
+ .sleep = csched_vcpu_sleep,
+ .wake = csched_vcpu_wake,
+
+ .adjust = bcsched_dom_cntl,
+
+ .pick_cpu = csched_cpu_pick,
+ .do_schedule = bcsched_schedule,
+
+ .dump_cpu_state = csched_dump_pcpu,
+ .dump_settings = bcsched_dump,
+ .init = bcsched_init,
+};
+