debuggers.hg: xen/include/xen/sched.h @ 20992:ae2b7f1c89c8

cpuidle: do not enter deep C state if there is urgent VCPU

When a VCPU is polling on an event channel it usually has an urgent task
running, e.g. it is spinning on a lock; in that case it is better for the
cpuidle driver not to enter a deep C state.

This patch fixes an issue where a SLES 11 SP1 domain0 hangs on boxes with a
large number of CPUs (>= 64).

Signed-off-by: Yu Ke <ke.yu@intel.com>
Signed-off-by: Tian Kevin <kevin.tian@intel.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Feb 16 09:27:45 2010 +0000 (2010-02-16)
parents f03bb5277f04
children 07befd9cf6d3
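
The fix relies on the per-VCPU is_urgent flag declared in struct vcpu below
("VCPU should wake fast (do not deep sleep the CPU)."). A minimal sketch of the
idea follows, assuming a hypothetical per-CPU counter and helper name -- the
actual patch wires this into the scheduler and the ACPI cpuidle driver:

    /* Count of urgent (event-channel-polling) VCPUs runnable on each CPU. */
    DEFINE_PER_CPU(atomic_t, urgent_vcpu_count);            /* hypothetical name */

    static inline int cpu_has_urgent_vcpu(unsigned int cpu) /* hypothetical name */
    {
        return atomic_read(&per_cpu(urgent_vcpu_count, cpu)) > 0;
    }

    /* In the cpuidle driver's C-state selection path (illustrative only):
     *
     *   if ( cpu_has_urgent_vcpu(smp_processor_id()) )
     *       target_cstate = ACPI_STATE_C1;   // stay shallow: cheap to wake
     *   else
     *       target_cstate = deepest_allowed_cstate;
     */

When the local CPU has an urgent VCPU (one polling an event channel, typically
spinning on a lock), staying in a shallow C state keeps its wake-up latency low
and avoids the domain0 hang described above.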
#ifndef __SCHED_H__
#define __SCHED_H__

#include <xen/config.h>
#include <xen/types.h>
#include <xen/spinlock.h>
#include <xen/smp.h>
#include <xen/shared.h>
#include <public/xen.h>
#include <public/domctl.h>
#include <public/vcpu.h>
#include <public/xsm/acm.h>
#include <xen/time.h>
#include <xen/timer.h>
#include <xen/grant_table.h>
#include <xen/rangeset.h>
#include <asm/domain.h>
#include <xen/xenoprof.h>
#include <xen/rcupdate.h>
#include <xen/irq.h>
#include <xen/mm.h>
#include <public/mem_event.h>

#ifdef CONFIG_COMPAT
#include <compat/vcpu.h>
DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t);
#endif

/* A global pointer to the initial domain (DOM0). */
extern struct domain *dom0;

#ifndef CONFIG_COMPAT
#define BITS_PER_EVTCHN_WORD(d) BITS_PER_LONG
#else
#define BITS_PER_EVTCHN_WORD(d) (has_32bit_shinfo(d) ? 32 : BITS_PER_LONG)
#endif
#define MAX_EVTCHNS(d) (BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d))
#define EVTCHNS_PER_BUCKET 128
#define NR_EVTCHN_BUCKETS (NR_EVENT_CHANNELS / EVTCHNS_PER_BUCKET)
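
/*
 * Editor's note (illustrative, not part of this header): event-channel ports
 * are looked up through a two-level table -- d->evtchn[] holds
 * NR_EVTCHN_BUCKETS pointers, each to a bucket of EVTCHNS_PER_BUCKET entries.
 * A port maps to its entry roughly as sketched below; the helper name is
 * hypothetical (the real lookup macros live in xen/include/xen/event.h).
 *
 *   static inline struct evtchn *example_evtchn_from_port(struct domain *d,
 *                                                         unsigned int port)
 *   {
 *       struct evtchn *bucket = d->evtchn[port / EVTCHNS_PER_BUCKET];
 *       return &bucket[port % EVTCHNS_PER_BUCKET];
 *   }
 */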

struct evtchn
{
#define ECS_FREE 0 /* Channel is available for use. */
#define ECS_RESERVED 1 /* Channel is reserved. */
#define ECS_UNBOUND 2 /* Channel is waiting to bind to a remote domain. */
#define ECS_INTERDOMAIN 3 /* Channel is bound to another domain. */
#define ECS_PIRQ 4 /* Channel is bound to a physical IRQ line. */
#define ECS_VIRQ 5 /* Channel is bound to a virtual IRQ line. */
#define ECS_IPI 6 /* Channel is bound to a virtual IPI line. */
    u8 state; /* ECS_* */
    u8 consumer_is_xen; /* Consumed by Xen or by guest? */
    u16 notify_vcpu_id; /* VCPU for local delivery notification */
    union {
        struct {
            domid_t remote_domid;
        } unbound; /* state == ECS_UNBOUND */
        struct {
            u16 remote_port;
            struct domain *remote_dom;
        } interdomain; /* state == ECS_INTERDOMAIN */
        u16 pirq; /* state == ECS_PIRQ */
        u16 virq; /* state == ECS_VIRQ */
    } u;
#ifdef FLASK_ENABLE
    void *ssid;
#endif
};

int evtchn_init(struct domain *d); /* from domain_create */
void evtchn_destroy(struct domain *d); /* from domain_kill */
void evtchn_destroy_final(struct domain *d); /* from complete_domain_destroy */

struct vcpu
{
    int vcpu_id;

    int processor;

    vcpu_info_t *vcpu_info;

    struct domain *domain;

    struct vcpu *next_in_list;

    uint64_t periodic_period;
    uint64_t periodic_last_event;
    struct timer periodic_timer;
    struct timer singleshot_timer;

    struct timer poll_timer; /* timeout for SCHEDOP_poll */

    void *sched_priv; /* scheduler-specific data */

    struct vcpu_runstate_info runstate;
#ifndef CONFIG_COMPAT
# define runstate_guest(v) ((v)->runstate_guest)
    XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */
#else
# define runstate_guest(v) ((v)->runstate_guest.native)
    union {
        XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
        XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
    } runstate_guest; /* guest address */
#endif

    /* last time when vCPU is scheduled out */
    uint64_t last_run_time;

    /* Has the FPU been initialised? */
    bool_t fpu_initialised;
    /* Has the FPU been used since it was last saved? */
    bool_t fpu_dirtied;
    /* Initialization completed for this VCPU? */
    bool_t is_initialised;
    /* Currently running on a CPU? */
    bool_t is_running;
    /* VCPU should wake fast (do not deep sleep the CPU). */
    bool_t is_urgent;

#ifdef VCPU_TRAP_LAST
#define VCPU_TRAP_NONE 0
    struct {
        bool_t pending;
        uint8_t old_mask;
    } async_exception_state[VCPU_TRAP_LAST];
#define async_exception_state(t) async_exception_state[(t)-1]
    uint8_t async_exception_mask;
#endif

    /* Require shutdown to be deferred for some asynchronous operation? */
    bool_t defer_shutdown;
    /* VCPU is paused following shutdown request (d->is_shutting_down)? */
    bool_t paused_for_shutdown;
    /* VCPU affinity is temporarily locked from controller changes? */
    bool_t affinity_locked;

    /*
     * > 0: a single port is being polled;
     * = 0: nothing is being polled (vcpu should be clear in d->poll_mask);
     * < 0: multiple ports may be being polled.
     */
    int poll_evtchn;

    unsigned long pause_flags;
    atomic_t pause_count;

    /* IRQ-safe virq_lock protects against delivering VIRQ to stale evtchn. */
    u16 virq_to_evtchn[NR_VIRQS];
    spinlock_t virq_lock;

    /* Bitmask of CPUs on which this VCPU may run. */
    cpumask_t cpu_affinity;
    /* Used to change affinity temporarily. */
    cpumask_t cpu_affinity_tmp;

    /* Bitmask of CPUs which are holding onto this VCPU's state. */
    cpumask_t vcpu_dirty_cpumask;

    struct arch_vcpu arch;
};

/* Per-domain lock can be recursively acquired in fault handlers. */
#define domain_lock(d) spin_lock_recursive(&(d)->domain_lock)
#define domain_unlock(d) spin_unlock_recursive(&(d)->domain_lock)
#define domain_is_locked(d) spin_is_locked(&(d)->domain_lock)

/* Memory event */
struct mem_event_domain
{
    /* ring lock */
    spinlock_t ring_lock;
    /* shared page */
    mem_event_shared_page_t *shared_page;
    /* shared ring page */
    void *ring_page;
    /* front-end ring */
    mem_event_front_ring_t front_ring;
    /* if domain has been paused due to ring contention */
    bool_t paused;
    int paused_vcpus[MAX_VIRT_CPUS];
    /* the memory event mode */
    unsigned long mode;
    /* domain to receive memory events */
    struct domain *domain;
    /* enabled? */
    bool_t enabled;
    /* event channel port (vcpu0 only) */
    int xen_port;
    /* tasklet */
    struct tasklet tasklet;
};

struct domain
{
    domid_t domain_id;

    shared_info_t *shared_info; /* shared data area */

    spinlock_t domain_lock;

    spinlock_t page_alloc_lock; /* protects all the following fields */
    struct page_list_head page_list; /* linked list, of size tot_pages */
    struct page_list_head xenpage_list; /* linked list (size xenheap_pages) */
    unsigned int tot_pages; /* number of pages currently possesed */
    unsigned int max_pages; /* maximum value for tot_pages */
    atomic_t shr_pages; /* number of shared pages */
    unsigned int xenheap_pages; /* # pages allocated from Xen heap */

    unsigned int max_vcpus;

    /* Scheduling. */
    void *sched_priv; /* scheduler-specific data */

    struct domain *next_in_list;
    struct domain *next_in_hashbucket;

    struct list_head rangesets;
    spinlock_t rangesets_lock;

    /* Event channel information. */
    struct evtchn *evtchn[NR_EVTCHN_BUCKETS];
    spinlock_t event_lock;

    struct grant_table *grant_table;

    /*
     * Interrupt to event-channel mappings. Updates should be protected by the
     * domain's event-channel spinlock. Read accesses can also synchronise on
     * the lock, but races don't usually matter.
     */
    unsigned int nr_pirqs;
    u16 *pirq_to_evtchn;
    unsigned long *pirq_mask;

    /* I/O capabilities (access to IRQs and memory-mapped I/O). */
    struct rangeset *iomem_caps;
    struct rangeset *irq_caps;

    /* Is this an HVM guest? */
    bool_t is_hvm;
    /* Does this guest need iommu mappings? */
    bool_t need_iommu;
    /* Is this guest fully privileged (aka dom0)? */
    bool_t is_privileged;
    /* Which guest this guest has privileges on */
    struct domain *target;
    /* Is this guest being debugged by dom0? */
    bool_t debugger_attached;
    /* Is this guest dying (i.e., a zombie)? */
    enum { DOMDYING_alive, DOMDYING_dying, DOMDYING_dead } is_dying;
    /* Domain is paused by controller software? */
    bool_t is_paused_by_controller;
    /* Domain's VCPUs are pinned 1:1 to physical CPUs? */
    bool_t is_pinned;

    /* Are any VCPUs polling event channels (SCHEDOP_poll)? */
#if MAX_VIRT_CPUS <= BITS_PER_LONG
    DECLARE_BITMAP(poll_mask, MAX_VIRT_CPUS);
#else
    unsigned long *poll_mask;
#endif

    /* Guest has shut down (inc. reason code)? */
    spinlock_t shutdown_lock;
    bool_t is_shutting_down; /* in process of shutting down? */
    bool_t is_shut_down; /* fully shut down? */
    int shutdown_code;

    /* If this is not 0, send suspend notification here instead of
     * raising DOM_EXC */
    int suspend_evtchn;

    atomic_t pause_count;

    unsigned long vm_assist;

    atomic_t refcnt;

    struct vcpu **vcpu;

    /* Bitmask of CPUs which are holding onto this domain's state. */
    cpumask_t domain_dirty_cpumask;

    struct arch_domain arch;

    void *ssid; /* sHype security subject identifier */

    /* Control-plane tools handle for this domain. */
    xen_domain_handle_t handle;

    /* OProfile support. */
    struct xenoprof *xenoprof;
    int32_t time_offset_seconds;

    struct rcu_head rcu;

    /*
     * Hypercall deadlock avoidance lock. Used if a hypercall might
     * cause a deadlock. Acquirers don't spin waiting; they preempt.
     */
    spinlock_t hypercall_deadlock_mutex;

    /* transcendent memory, auto-allocated on first tmem op by each domain */
    void *tmem;

    struct lock_profile_qhead profile_head;

    /* Non-migratable and non-restoreable? */
    bool_t disable_migrate;

    /* Memory paging support */
    struct mem_event_domain mem_event;
};

struct domain_setup_info
{
    /* Initialised by caller. */
    unsigned long image_addr;
    unsigned long image_len;
    /* Initialised by loader: Public. */
    unsigned long v_start;
    unsigned long v_end;
    unsigned long v_kernstart;
    unsigned long v_kernend;
    unsigned long v_kernentry;
#define PAEKERN_no 0
#define PAEKERN_yes 1
#define PAEKERN_extended_cr3 2
#define PAEKERN_bimodal 3
    unsigned int pae_kernel;
    /* Initialised by loader: Private. */
    unsigned long elf_paddr_offset;
    unsigned int load_symtab;
    unsigned long symtab_addr;
    unsigned long symtab_len;
};

extern struct vcpu *idle_vcpu[NR_CPUS];
#define IDLE_DOMAIN_ID (0x7FFFU)
#define is_idle_domain(d) ((d)->domain_id == IDLE_DOMAIN_ID)
#define is_idle_vcpu(v) (is_idle_domain((v)->domain))

#define DOMAIN_DESTROYED (1<<31) /* assumes atomic_t is >= 32 bits */
#define put_domain(_d) \
  if ( atomic_dec_and_test(&(_d)->refcnt) ) domain_destroy(_d)

/*
 * Use this when you don't have an existing reference to @d. It returns
 * FALSE if @d is being destroyed.
 */
static always_inline int get_domain(struct domain *d)
{
    atomic_t old, new, seen = d->refcnt;
    do
    {
        old = seen;
        if ( unlikely(_atomic_read(old) & DOMAIN_DESTROYED) )
            return 0;
        _atomic_set(new, _atomic_read(old) + 1);
        seen = atomic_compareandswap(old, new, &d->refcnt);
    }
    while ( unlikely(_atomic_read(seen) != _atomic_read(old)) );
    return 1;
}

/*
 * Use this when you already have, or are borrowing, a reference to @d.
 * In this case we know that @d cannot be destroyed under our feet.
 */
static inline void get_knownalive_domain(struct domain *d)
{
    atomic_inc(&d->refcnt);
    ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTROYED));
}

/* Obtain a reference to the currently-running domain. */
static inline struct domain *get_current_domain(void)
{
    struct domain *d = current->domain;
    get_knownalive_domain(d);
    return d;
}
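
/*
 * Editor's note -- illustrative use of the refcounting helpers above
 * (hypothetical caller, not part of this header):
 *
 *     struct domain *d = get_domain_by_id(domid);   // takes a reference
 *     if ( d != NULL )
 *     {
 *         ... d cannot be destroyed while the reference is held ...
 *         put_domain(d);
 *     }
 *
 * get_domain() itself is for callers holding only a borrowed pointer: it
 * fails (returns 0) if the domain is already being destroyed.
 */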

struct domain *domain_create(
    domid_t domid, unsigned int domcr_flags, ssidref_t ssidref);
 /* DOMCRF_hvm: Create an HVM domain, as opposed to a PV domain. */
#define _DOMCRF_hvm 0
#define DOMCRF_hvm (1U<<_DOMCRF_hvm)
 /* DOMCRF_hap: Create a domain with hardware-assisted paging. */
#define _DOMCRF_hap 1
#define DOMCRF_hap (1U<<_DOMCRF_hap)
 /* DOMCRF_s3_integrity: Create a domain with tboot memory integrity protection
    by tboot */
#define _DOMCRF_s3_integrity 2
#define DOMCRF_s3_integrity (1U<<_DOMCRF_s3_integrity)
 /* DOMCRF_dummy: Create a dummy domain (not scheduled; not on domain list) */
#define _DOMCRF_dummy 3
#define DOMCRF_dummy (1U<<_DOMCRF_dummy)
 /* DOMCRF_oos_off: dont use out-of-sync optimization for shadow page tables */
#define _DOMCRF_oos_off 4
#define DOMCRF_oos_off (1U<<_DOMCRF_oos_off)

/*
 * rcu_lock_domain_by_id() is more efficient than get_domain_by_id().
 * This is the preferred function if the returned domain reference
 * is short lived, but it cannot be used if the domain reference needs
 * to be kept beyond the current scope (e.g., across a softirq).
 * The returned domain reference must be discarded using rcu_unlock_domain().
 */
struct domain *rcu_lock_domain_by_id(domid_t dom);

/*
 * As above function, but accounts for current domain context:
 * - Translates target DOMID_SELF into caller's domain id; and
 * - Checks that caller has permission to act on the target domain.
 */
int rcu_lock_target_domain_by_id(domid_t dom, struct domain **d);

/* Finish a RCU critical region started by rcu_lock_domain_by_id(). */
static inline void rcu_unlock_domain(struct domain *d)
{
    rcu_read_unlock(&domlist_read_lock);
}

static inline struct domain *rcu_lock_domain(struct domain *d)
{
    rcu_read_lock(d);
    return d;
}

static inline struct domain *rcu_lock_current_domain(void)
{
    return rcu_lock_domain(current->domain);
}
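
/*
 * Editor's note -- illustrative short-lived lookup (hypothetical caller):
 *
 *     struct domain *d = rcu_lock_domain_by_id(domid);
 *     if ( d != NULL )
 *     {
 *         ... short-lived use of d ...
 *         rcu_unlock_domain(d);
 *     }
 *
 * Unlike get_domain_by_id(), no reference count is taken, so the pointer
 * must not be kept beyond the RCU critical region.
 */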

struct domain *get_domain_by_id(domid_t dom);
void domain_destroy(struct domain *d);
int domain_kill(struct domain *d);
void domain_shutdown(struct domain *d, u8 reason);
void domain_resume(struct domain *d);
void domain_pause_for_debugger(void);

int vcpu_start_shutdown_deferral(struct vcpu *v);
void vcpu_end_shutdown_deferral(struct vcpu *v);

/*
 * Mark specified domain as crashed. This function always returns, even if the
 * caller is the specified domain. The domain is not synchronously descheduled
 * from any processor.
 */
void __domain_crash(struct domain *d);
#define domain_crash(d) do { \
    printk("domain_crash called from %s:%d\n", __FILE__, __LINE__); \
    __domain_crash(d); \
} while (0)

/*
 * Mark current domain as crashed and synchronously deschedule from the local
 * processor. This function never returns.
 */
void __domain_crash_synchronous(void) __attribute__((noreturn));
#define domain_crash_synchronous() do { \
    printk("domain_crash_sync called from %s:%d\n", __FILE__, __LINE__); \
    __domain_crash_synchronous(); \
} while (0)
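
/*
 * Editor's note -- typical (hypothetical) use: a handler that detects an
 * unrecoverable guest error crashes the offending domain and then returns:
 *
 *     if ( unlikely(bad_guest_state) )
 *     {
 *         domain_crash(v->domain);   // logs file:line, marks domain crashed
 *         return -EINVAL;
 *     }
 *
 * domain_crash_synchronous() is used instead when the caller cannot safely
 * return, since it never returns.
 */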

#define set_current_state(_s) do { current->state = (_s); } while (0)
void scheduler_init(void);
int sched_init_vcpu(struct vcpu *v, unsigned int processor);
void sched_destroy_vcpu(struct vcpu *v);
int sched_init_domain(struct domain *d);
void sched_destroy_domain(struct domain *d);
long sched_adjust(struct domain *, struct xen_domctl_scheduler_op *);
int sched_id(void);
void sched_tick_suspend(void);
void sched_tick_resume(void);
void vcpu_wake(struct vcpu *d);
void vcpu_sleep_nosync(struct vcpu *d);
void vcpu_sleep_sync(struct vcpu *d);

/*
 * Force synchronisation of given VCPU's state. If it is currently descheduled,
 * this call will ensure that all its state is committed to memory and that
 * no CPU is using critical state (e.g., page tables) belonging to the VCPU.
 */
void sync_vcpu_execstate(struct vcpu *v);

/*
 * Called by the scheduler to switch to another VCPU. This function must
 * call context_saved(@prev) when the local CPU is no longer running in
 * @prev's context, and that context is saved to memory. Alternatively, if
 * implementing lazy context switching, it suffices to ensure that invoking
 * sync_vcpu_execstate() will switch and commit @prev's state.
 */
void context_switch(
    struct vcpu *prev,
    struct vcpu *next);

/*
 * As described above, context_switch() must call this function when the
 * local CPU is no longer running in @prev's context, and @prev's context is
 * saved to memory. Alternatively, if implementing lazy context switching,
 * ensure that invoking sync_vcpu_execstate() will switch and commit @prev.
 */
void context_saved(struct vcpu *prev);

/* Called by the scheduler to continue running the current VCPU. */
void continue_running(
    struct vcpu *same);

void startup_cpu_idle_loop(void);

/*
 * Creates a continuation to resume the current hypercall. The caller should
 * return immediately, propagating the value returned from this invocation.
 * The format string specifies the types and number of hypercall arguments.
 * It contains one character per argument as follows:
 *  'i' [unsigned] {char, int}
 *  'l' [unsigned] long
 *  'h' guest handle (XEN_GUEST_HANDLE(foo))
 */
unsigned long hypercall_create_continuation(
    unsigned int op, const char *format, ...);

#define hypercall_preempt_check() (unlikely( \
        softirq_pending(smp_processor_id()) | \
        local_events_need_delivery() \
    ))
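
/*
 * Editor's note -- the usual (hypothetical) preemption pattern for a long
 * hypercall loop, built from the two declarations above:
 *
 *     for ( i = start; i < count; i++ )
 *     {
 *         process_one_item(i);                    // hypothetical work item
 *         if ( hypercall_preempt_check() )
 *             return hypercall_create_continuation(
 *                 __HYPERVISOR_some_op, "ih", i + 1, arg);
 *     }
 *
 * The "ih" format string says the continuation carries an int and a guest
 * handle, matching the per-argument codes documented above; __HYPERVISOR_some_op
 * and process_one_item() are placeholders.
 */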

/* Protect updates/reads (resp.) of domain_list and domain_hash. */
extern spinlock_t domlist_update_lock;
extern rcu_read_lock_t domlist_read_lock;

extern struct domain *domain_list;

/* Caller must hold the domlist_read_lock or domlist_update_lock. */
#define for_each_domain(_d) \
 for ( (_d) = rcu_dereference(domain_list); \
       (_d) != NULL; \
       (_d) = rcu_dereference((_d)->next_in_list )) \

#define for_each_vcpu(_d,_v) \
 for ( (_v) = (_d)->vcpu ? (_d)->vcpu[0] : NULL; \
       (_v) != NULL; \
       (_v) = (_v)->next_in_list )
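
/*
 * Editor's note -- illustrative iteration (hypothetical caller).  Per the
 * comment above, readers must hold the domlist_read_lock:
 *
 *     struct domain *d;
 *     struct vcpu *v;
 *
 *     rcu_read_lock(&domlist_read_lock);
 *     for_each_domain ( d )
 *         for_each_vcpu ( d, v )
 *             ... inspect v ...
 *     rcu_read_unlock(&domlist_read_lock);
 */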

/*
 * Per-VCPU pause flags.
 */
 /* Domain is blocked waiting for an event. */
#define _VPF_blocked 0
#define VPF_blocked (1UL<<_VPF_blocked)
 /* VCPU is offline. */
#define _VPF_down 1
#define VPF_down (1UL<<_VPF_down)
 /* VCPU is blocked awaiting an event to be consumed by Xen. */
#define _VPF_blocked_in_xen 2
#define VPF_blocked_in_xen (1UL<<_VPF_blocked_in_xen)
 /* VCPU affinity has changed: migrating to a new CPU. */
#define _VPF_migrating 3
#define VPF_migrating (1UL<<_VPF_migrating)

static inline int vcpu_runnable(struct vcpu *v)
{
    return !(v->pause_flags |
             atomic_read(&v->pause_count) |
             atomic_read(&v->domain->pause_count));
}

void vcpu_unblock(struct vcpu *v);
void vcpu_pause(struct vcpu *v);
void vcpu_pause_nosync(struct vcpu *v);
void domain_pause(struct domain *d);
void vcpu_unpause(struct vcpu *v);
void domain_unpause(struct domain *d);
void domain_pause_by_systemcontroller(struct domain *d);
void domain_unpause_by_systemcontroller(struct domain *d);
void cpu_init(void);

void vcpu_force_reschedule(struct vcpu *v);
void cpu_disable_scheduler(void);
int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity);
int vcpu_locked_change_affinity(struct vcpu *v, cpumask_t *affinity);
void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity);

void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
uint64_t get_cpu_idle_time(unsigned int cpu);

#define IS_PRIV(_d) ((_d)->is_privileged)
#define IS_PRIV_FOR(_d, _t) (IS_PRIV(_d) || ((_d)->target && (_d)->target == (_t)))

#define VM_ASSIST(_d,_t) (test_bit((_t), &(_d)->vm_assist))

#define is_hvm_domain(d) ((d)->is_hvm)
#define is_hvm_vcpu(v) (is_hvm_domain(v->domain))
#define need_iommu(d) ((d)->need_iommu)

void set_vcpu_migration_delay(unsigned int delay);
unsigned int get_vcpu_migration_delay(void);

extern int sched_smt_power_savings;

extern enum cpufreq_controller {
    FREQCTL_none, FREQCTL_dom0_kernel, FREQCTL_xen
} cpufreq_controller;

#endif /* __SCHED_H__ */

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */