debuggers.hg

view xen/arch/ia64/xen/domain.c @ 19826:2f9e1348aa98

x86_64: allow more vCPU-s per guest

Since the shared info layout is fixed, guests are required to use
VCPUOP_register_vcpu_info prior to booting any vCPU beyond the
traditional limit of 32.

MAX_VIRT_CPUS, being an implementation detail of the hypervisor, is no
longer exposed in the public headers.

The tools changes are clearly incomplete (and done only so things would
build again), and the current state of the tools (using scalar
variables all over the place to represent vCPU bitmaps) very likely
doesn't permit booting DomU-s with more than the traditional number of
vCPU-s. Testing of the extended functionality was done with Dom0 (96
vCPU-s, as well as 128 vCPU-s out of which the kernel elected - by way
of a simple kernel side patch - to use only some, resulting in a sparse
bitmap).

ia64 changes only to make things build, and build-tested only (and the
tools part only as far as the build would go without encountering
unrelated problems in the blktap code).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Jun 18 10:14:16 2009 +0100 (2009-06-18)
parents ac3ecce4502d
children 1c01814f9a25
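
The VCPUOP_register_vcpu_info requirement described in the commit message
boils down to one hypercall per vCPU, issued by the guest before that vCPU is
brought up. A minimal guest-side sketch follows; it is not part of this
changeset and assumes Linux-style helpers (HYPERVISOR_vcpu_op(),
virt_to_mfn(), offset_in_page(), DEFINE_PER_CPU) plus a hypothetical per-CPU
vcpu_info object chosen by the guest.

/* Hedged sketch: register a vcpu_info area for "cpu" so it can be brought up
 * even when cpu >= 32, i.e. beyond the fixed slots of the legacy shared_info
 * layout. Header paths are the usual Linux ones and are assumptions here. */
#include <linux/percpu.h>           /* DEFINE_PER_CPU / per_cpu */
#include <xen/interface/xen.h>      /* struct vcpu_info */
#include <xen/interface/vcpu.h>     /* VCPUOP_register_vcpu_info */
#include <asm/xen/hypercall.h>      /* HYPERVISOR_vcpu_op (assumed path) */
#include <asm/xen/page.h>           /* virt_to_mfn (assumed path) */

/* Per-CPU vcpu_info; it must not cross a page boundary. */
static DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);

static int register_vcpu_info(unsigned int cpu)
{
    struct vcpu_info *vi = &per_cpu(xen_vcpu_info, cpu);
    struct vcpu_register_vcpu_info info = {
        .mfn    = virt_to_mfn(vi),
        .offset = offset_in_page(vi),
    };

    /* Must be done before the vCPU is brought online (VCPUOP_up). */
    return HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
}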
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <xen/libelf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vmx_vcpu_save.h>
45 #include <asm/vhpt.h>
46 #include <asm/vcpu.h>
47 #include <asm/tlbflush.h>
48 #include <asm/regionreg.h>
49 #include <asm/dom_fw.h>
50 #include <asm/shadow.h>
51 #include <xen/guest_access.h>
52 #include <asm/tlb_track.h>
53 #include <asm/perfmon.h>
54 #include <asm/sal.h>
55 #include <public/vcpu.h>
56 #include <linux/cpu.h>
57 #include <linux/notifier.h>
58 #include <asm/debugger.h>
60 /* dom0_size: default memory allocation for dom0 (~4GB) */
61 static unsigned long __initdata dom0_size = 4096UL*1024UL*1024UL;
63 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
64 static unsigned int __initdata dom0_max_vcpus = 4;
65 integer_param("dom0_max_vcpus", dom0_max_vcpus);
67 extern char dom0_command_line[];
69 /* forward declaration */
70 static void init_switch_stack(struct vcpu *v);
72 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
73 This is a Xen virtual address. */
74 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
75 DEFINE_PER_CPU(int *, current_psr_ic_addr);
77 DEFINE_PER_CPU(struct vcpu *, fp_owner);
79 #include <xen/sched-if.h>
81 static void flush_vtlb_for_context_switch(struct vcpu* prev, struct vcpu* next)
82 {
83 int cpu = smp_processor_id();
84 int last_vcpu_id, last_processor;
86 if (!is_idle_domain(prev->domain))
87 tlbflush_update_time
88 (&prev->domain->arch.last_vcpu[cpu].tlbflush_timestamp,
89 tlbflush_current_time());
91 if (is_idle_domain(next->domain))
92 return;
94 last_vcpu_id = next->domain->arch.last_vcpu[cpu].vcpu_id;
95 last_processor = next->arch.last_processor;
97 next->domain->arch.last_vcpu[cpu].vcpu_id = next->vcpu_id;
98 next->arch.last_processor = cpu;
100 if ((last_vcpu_id != next->vcpu_id &&
101 last_vcpu_id != INVALID_VCPU_ID) ||
102 (last_vcpu_id == next->vcpu_id &&
103 last_processor != cpu &&
104 last_processor != INVALID_PROCESSOR)) {
105 #ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
106 u32 last_tlbflush_timestamp =
107 next->domain->arch.last_vcpu[cpu].tlbflush_timestamp;
108 #endif
109 int vhpt_is_flushed = 0;
111 // if the vTLB implementation is changed,
112 // the following must be updated as well.
113 if (VMX_DOMAIN(next)) {
114 // currently the vTLB for a VT-i domain is per vcpu,
115 // so no flushing is needed.
116 } else if (HAS_PERVCPU_VHPT(next->domain)) {
117 // nothing to do
118 } else {
119 if (NEED_FLUSH(__get_cpu_var(vhpt_tlbflush_timestamp),
120 last_tlbflush_timestamp)) {
121 local_vhpt_flush();
122 vhpt_is_flushed = 1;
123 }
124 }
125 if (vhpt_is_flushed || NEED_FLUSH(__get_cpu_var(tlbflush_time),
126 last_tlbflush_timestamp)) {
127 local_flush_tlb_all();
128 perfc_incr(tlbflush_clock_cswitch_purge);
129 } else {
130 perfc_incr(tlbflush_clock_cswitch_skip);
131 }
132 perfc_incr(flush_vtlb_for_context_switch);
133 }
134 }
136 static void flush_cache_for_context_switch(struct vcpu *next)
137 {
138 extern cpumask_t cpu_cache_coherent_map;
139 int cpu = smp_processor_id();
141 if (is_idle_vcpu(next) ||
142 __test_and_clear_bit(cpu, &next->arch.cache_coherent_map)) {
143 if (cpu_test_and_clear(cpu, cpu_cache_coherent_map)) {
144 unsigned long flags;
145 u64 progress = 0;
146 s64 status;
148 local_irq_save(flags);
149 status = ia64_pal_cache_flush(4, 0, &progress, NULL);
150 local_irq_restore(flags);
151 if (status != 0)
152 panic_domain(NULL, "PAL_CACHE_FLUSH ERROR, "
153 "cache_type=4 status %lx", status);
154 }
155 }
156 }
158 static void set_current_psr_i_addr(struct vcpu* v)
159 {
160 __ia64_per_cpu_var(current_psr_i_addr) =
161 (uint8_t*)(v->domain->arch.shared_info_va +
162 INT_ENABLE_OFFSET(v));
163 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
164 (v->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
165 }
167 static void clear_current_psr_i_addr(void)
168 {
169 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
170 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
171 }
173 static void lazy_fp_switch(struct vcpu *prev, struct vcpu *next)
174 {
175 /*
176 * Implement eager save, lazy restore
177 */
178 if (!is_idle_vcpu(prev)) {
179 if (VMX_DOMAIN(prev)) {
180 if (FP_PSR(prev) & IA64_PSR_MFH) {
181 __ia64_save_fpu(prev->arch._thread.fph);
182 __ia64_per_cpu_var(fp_owner) = prev;
183 }
184 } else {
185 if (PSCB(prev, hpsr_mfh)) {
186 __ia64_save_fpu(prev->arch._thread.fph);
187 __ia64_per_cpu_var(fp_owner) = prev;
188 }
189 }
190 }
192 if (!is_idle_vcpu(next)) {
193 if (VMX_DOMAIN(next)) {
194 FP_PSR(next) = IA64_PSR_DFH;
195 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
196 } else {
197 PSCB(next, hpsr_dfh) = 1;
198 PSCB(next, hpsr_mfh) = 0;
199 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
200 }
201 }
202 }
204 static void load_state(struct vcpu *v)
205 {
206 load_region_regs(v);
207 ia64_set_pta(vcpu_pta(v));
208 vcpu_load_kernel_regs(v);
209 if (vcpu_pkr_in_use(v))
210 vcpu_pkr_load_regs(v);
211 set_current_psr_i_addr(v);
212 }
214 void schedule_tail(struct vcpu *prev)
215 {
216 extern char ia64_ivt;
218 context_saved(prev);
220 if (VMX_DOMAIN(current))
221 vmx_do_resume(current);
222 else {
223 if (VMX_DOMAIN(prev))
224 ia64_set_iva(&ia64_ivt);
225 load_state(current);
226 migrate_timer(&current->arch.hlt_timer, current->processor);
227 }
228 flush_vtlb_for_context_switch(prev, current);
229 }
231 void context_switch(struct vcpu *prev, struct vcpu *next)
232 {
233 uint64_t spsr;
235 local_irq_save(spsr);
237 if (VMX_DOMAIN(prev)) {
238 vmx_save_state(prev);
239 if (!VMX_DOMAIN(next)) {
240 /* VMX domains can change the physical cr.dcr.
241 * Restore default to prevent leakage. */
242 uint64_t dcr = ia64_getreg(_IA64_REG_CR_DCR);
243 /* xenoprof:
244 * don't change psr.pp.
245 * It is manipulated by xenoprof.
246 */
247 dcr = (IA64_DEFAULT_DCR_BITS & ~IA64_DCR_PP) | (dcr & IA64_DCR_PP);
248 ia64_setreg(_IA64_REG_CR_DCR, dcr);
249 }
250 }
252 lazy_fp_switch(prev, current);
254 if (prev->arch.dbg_used || next->arch.dbg_used) {
255 /*
256 * Load debug registers either because they are valid or to clear
257 * the previous one.
258 */
259 ia64_load_debug_regs(next->arch.dbr);
260 }
262 /*
263 * disable VHPT walker.
264 * ia64_switch_to() might cause VHPT fault because it flushes
265 * dtr[IA64_TR_VHPT] and reinserts the mapping with dtr[IA64_TR_STACK].
266 * (VHPT_SIZE_LOG2 << 2) is just for avoiding
267 * Reserved Register/Field fault.
268 */
269 ia64_set_pta(VHPT_SIZE_LOG2 << 2);
270 prev = ia64_switch_to(next);
272 /* Note: ia64_switch_to does not return here at vcpu initialization. */
274 if (VMX_DOMAIN(current)) {
275 vmx_load_state(current);
276 } else {
277 extern char ia64_ivt;
279 if (VMX_DOMAIN(prev))
280 ia64_set_iva(&ia64_ivt);
282 if (!is_idle_vcpu(current)) {
283 load_state(current);
284 vcpu_set_next_timer(current);
285 if (vcpu_timer_expired(current))
286 vcpu_pend_timer(current);
287 /* steal time accounting */
288 if (!guest_handle_is_null(runstate_guest(current)))
289 __copy_to_guest(runstate_guest(current), &current->runstate, 1);
290 } else {
291 /* When switching to the idle domain, we only need to disable the vhpt
292 * walker. All accesses that happen within the idle context will then
293 * be handled by TR mapping and identity mapping.
294 */
295 clear_current_psr_i_addr();
296 }
297 }
298 local_irq_restore(spsr);
300 /* lazy fp */
301 if (current->processor != current->arch.last_processor) {
302 unsigned long *addr;
303 addr = (unsigned long *)per_cpu_addr(fp_owner,
304 current->arch.last_processor);
305 ia64_cmpxchg(acq, addr, current, 0, 8);
306 }
308 flush_vtlb_for_context_switch(prev, current);
309 flush_cache_for_context_switch(current);
310 context_saved(prev);
311 }
313 void continue_running(struct vcpu *same)
314 {
315 /* nothing to do */
316 }
318 #ifdef CONFIG_PERFMON
319 static int pal_halt = 1;
320 static int can_do_pal_halt = 1;
322 static int __init nohalt_setup(char * str)
323 {
324 pal_halt = can_do_pal_halt = 0;
325 return 1;
326 }
327 __setup("nohalt", nohalt_setup);
329 void
330 update_pal_halt_status(int status)
331 {
332 can_do_pal_halt = pal_halt && status;
333 }
334 #else
335 #define can_do_pal_halt (1)
336 #endif
338 static void default_idle(void)
339 {
340 local_irq_disable();
341 if ( !softirq_pending(smp_processor_id()) ) {
342 if (can_do_pal_halt)
343 safe_halt();
344 else
345 cpu_relax();
346 }
347 local_irq_enable();
348 }
350 extern void play_dead(void);
352 static void continue_cpu_idle_loop(void)
353 {
354 int cpu = smp_processor_id();
356 for ( ; ; )
357 {
358 #ifdef IA64
359 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
360 #else
361 irq_stat[cpu].idle_timestamp = jiffies;
362 #endif
363 page_scrub_schedule_work();
364 while ( !softirq_pending(cpu) )
365 default_idle();
366 raise_softirq(SCHEDULE_SOFTIRQ);
367 do_softirq();
368 if (!cpu_online(cpu))
369 play_dead();
370 }
371 }
373 void startup_cpu_idle_loop(void)
374 {
375 /* Just some sanity to ensure that the scheduler is set up okay. */
376 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
377 raise_softirq(SCHEDULE_SOFTIRQ);
379 continue_cpu_idle_loop();
380 }
382 /* compile time test for get_order(sizeof(mapped_regs_t)) !=
383 * get_order_from_shift(XMAPPEDREGS_SHIFT)
384 */
385 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
386 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
387 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
388 #endif
390 void hlt_timer_fn(void *data)
391 {
392 struct vcpu *v = data;
393 vcpu_unblock(v);
394 }
396 void relinquish_vcpu_resources(struct vcpu *v)
397 {
398 if (HAS_PERVCPU_VHPT(v->domain))
399 pervcpu_vhpt_free(v);
400 if (v->arch.privregs != NULL) {
401 free_xenheap_pages(v->arch.privregs,
402 get_order_from_shift(XMAPPEDREGS_SHIFT));
403 v->arch.privregs = NULL;
404 }
405 kill_timer(&v->arch.hlt_timer);
406 }
408 struct domain *alloc_domain_struct(void)
409 {
410 #ifdef CONFIG_IA64_PICKLE_DOMAIN
411 struct domain *d;
412 /*
413 * We pack the MFN of the domain structure into a 32-bit field within
414 * the page_info structure. Hence the MEMF_bits() restriction.
415 */
416 d = alloc_xenheap_pages(get_order_from_bytes(sizeof(*d)),
417 MEMF_bits(32 + PAGE_SHIFT));
418 if ( d != NULL )
419 memset(d, 0, sizeof(*d));
420 return d;
421 #else
422 return xmalloc(struct domain);
423 #endif
424 }
426 void free_domain_struct(struct domain *d)
427 {
428 #ifdef CONFIG_IA64_PICKLE_DOMAIN
429 free_xenheap_pages(d, get_order_from_bytes(sizeof(*d)));
430 #else
431 xfree(d);
432 #endif
433 }
435 struct vcpu *alloc_vcpu_struct(void)
436 {
437 struct page_info *page;
438 struct vcpu *v;
439 struct thread_info *ti;
440 static int first_allocation = 1;
442 if (first_allocation) {
443 first_allocation = 0;
444 /* Still keep idle vcpu0 statically allocated at compile time, because
445 * some code inherited from Linux still requires it in the early phase.
446 */
447 return idle_vcpu[0];
448 }
450 page = alloc_domheap_pages(NULL, KERNEL_STACK_SIZE_ORDER, 0);
451 if (page == NULL)
452 return NULL;
453 v = page_to_virt(page);
454 memset(v, 0, sizeof(*v));
456 ti = alloc_thread_info(v);
457 /* Clear thread_info to clear some important fields, like
458 * preempt_count
459 */
460 memset(ti, 0, sizeof(struct thread_info));
461 init_switch_stack(v);
463 return v;
464 }
466 void free_vcpu_struct(struct vcpu *v)
467 {
468 free_domheap_pages(virt_to_page(v), KERNEL_STACK_SIZE_ORDER);
469 }
471 int vcpu_initialise(struct vcpu *v)
472 {
473 struct domain *d = v->domain;
475 if (!is_idle_domain(d)) {
476 v->arch.metaphysical_rid_dt = d->arch.metaphysical_rid_dt;
477 v->arch.metaphysical_rid_d = d->arch.metaphysical_rid_d;
478 /* Set default values to saved_rr. */
479 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rid_dt;
480 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rid_dt;
482 /* Is it correct ?
483 It depends on the domain rid usage.
485 A domain may share rids among its processors (eg having a
486 global VHPT). In this case, we should also share rid
487 among vcpus and the rid range should be the same.
489 However a domain may have per cpu rid allocation. In
490 this case we don't want to share rid among vcpus, but we may
491 do it if two vcpus are on the same cpu... */
493 v->arch.starting_rid = d->arch.starting_rid;
494 v->arch.ending_rid = d->arch.ending_rid;
495 v->arch.rid_bits = d->arch.rid_bits;
496 v->arch.breakimm = d->arch.breakimm;
497 v->arch.last_processor = INVALID_PROCESSOR;
498 v->arch.vhpt_pg_shift = PAGE_SHIFT;
499 }
501 if (!VMX_DOMAIN(v))
502 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
503 first_cpu(cpu_online_map));
505 return 0;
506 }
508 static void vcpu_share_privregs_with_guest(struct vcpu *v)
509 {
510 struct domain *d = v->domain;
511 int i, order = get_order_from_shift(XMAPPEDREGS_SHIFT);
513 for (i = 0; i < (1 << order); i++)
514 share_xen_page_with_guest(virt_to_page(v->arch.privregs) + i,
515 d, XENSHARE_writable);
516 /*
517 * XXX IA64_XMAPPEDREGS_PADDR
518 * assign these pages into guest pseudo physical address
519 * space for dom0 to map this page by gmfn.
520 * this is necessary for domain save, restore and dump-core.
521 */
522 for (i = 0; i < XMAPPEDREGS_SIZE; i += PAGE_SIZE)
523 assign_domain_page(d, IA64_XMAPPEDREGS_PADDR(v->vcpu_id) + i,
524 virt_to_maddr(v->arch.privregs + i));
525 }
527 int vcpu_late_initialise(struct vcpu *v)
528 {
529 int rc, order;
531 if (HAS_PERVCPU_VHPT(v->domain)) {
532 rc = pervcpu_vhpt_alloc(v);
533 if (rc != 0)
534 return rc;
535 }
537 /* Create privregs page. */
538 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
539 v->arch.privregs = alloc_xenheap_pages(order, 0);
540 if (v->arch.privregs == NULL)
541 return -ENOMEM;
542 BUG_ON(v->arch.privregs == NULL);
543 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
544 vcpu_share_privregs_with_guest(v);
546 return 0;
547 }
549 void vcpu_destroy(struct vcpu *v)
550 {
551 if (is_hvm_vcpu(v))
552 vmx_relinquish_vcpu_resources(v);
553 else
554 relinquish_vcpu_resources(v);
555 }
557 static unsigned long*
558 vcpu_to_rbs_bottom(struct vcpu *v)
559 {
560 return (unsigned long*)((char *)v + IA64_RBS_OFFSET);
561 }
563 static void init_switch_stack(struct vcpu *v)
564 {
565 struct pt_regs *regs = vcpu_regs (v);
566 struct switch_stack *sw = (struct switch_stack *) regs - 1;
567 extern void ia64_ret_from_clone;
569 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
570 sw->ar_bspstore = (unsigned long)vcpu_to_rbs_bottom(v);
571 sw->b0 = (unsigned long) &ia64_ret_from_clone;
572 sw->ar_fpsr = FPSR_DEFAULT;
573 v->arch._thread.ksp = (unsigned long) sw - 16;
574 // stay on the kernel stack because we may get interrupts!
575 // ia64_ret_from_clone switches to user stack
576 v->arch._thread.on_ustack = 0;
577 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
578 }
580 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
581 static int opt_pervcpu_vhpt = 1;
582 integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
583 #endif
585 int arch_domain_create(struct domain *d, unsigned int domcr_flags)
586 {
587 int i;
589 // the following will eventually need to be negotiated dynamically
590 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
591 d->arch.breakimm = __IA64_XEN_HYPERCALL_DEFAULT;
592 for (i = 0; i < NR_CPUS; i++) {
593 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
594 }
596 if (is_idle_domain(d))
597 return 0;
599 INIT_LIST_HEAD(&d->arch.pdev_list);
600 foreign_p2m_init(d);
601 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
602 d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
603 dprintk(XENLOG_INFO, "%s:%d domain %d pervcpu_vhpt %d\n",
604 __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
605 #endif
606 if (tlb_track_create(d) < 0)
607 goto fail_nomem1;
608 d->shared_info = alloc_xenheap_pages(
609 get_order_from_shift(XSI_SHIFT), 0);
610 if (d->shared_info == NULL)
611 goto fail_nomem;
612 BUG_ON(d->shared_info == NULL);
613 memset(d->shared_info, 0, XSI_SIZE);
614 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
615 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
616 d, XENSHARE_writable);
618 /* We may also need an emulation rid for region4, though it's unlikely
619 * to see a guest issue uncacheable accesses in metaphysical mode. But
620 * keeping such info here may be more sane.
621 */
622 if (!allocate_rid_range(d,0))
623 goto fail_nomem;
625 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
626 d->arch.relres = RELRES_not_started;
627 d->arch.mm_teardown_offset = 0;
628 INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
630 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
631 goto fail_nomem;
633 if(iommu_domain_init(d) != 0)
634 goto fail_iommu;
636 /*
637 * grant_table_create() can't fully initialize grant table for domain
638 * because it is called before arch_domain_create().
639 * Here we complete the initialization which requires p2m table.
640 */
641 spin_lock(&d->grant_table->lock);
642 for (i = 0; i < nr_grant_frames(d->grant_table); i++)
643 ia64_gnttab_create_shared_page(d, d->grant_table, i);
644 spin_unlock(&d->grant_table->lock);
646 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
647 RANGESETF_prettyprint_hex);
649 dprintk(XENLOG_DEBUG, "arch_domain_create: domain=%p\n", d);
650 return 0;
652 fail_iommu:
653 iommu_domain_destroy(d);
654 fail_nomem:
655 tlb_track_destroy(d);
656 fail_nomem1:
657 if (d->arch.mm.pgd != NULL)
658 pgd_free(d->arch.mm.pgd);
659 if (d->shared_info != NULL)
660 free_xenheap_pages(d->shared_info,
661 get_order_from_shift(XSI_SHIFT));
662 return -ENOMEM;
663 }
665 void arch_domain_destroy(struct domain *d)
666 {
667 mm_final_teardown(d);
669 if (d->shared_info != NULL)
670 free_xenheap_pages(d->shared_info,
671 get_order_from_shift(XSI_SHIFT));
673 if ( iommu_enabled && (is_hvm_domain(d) || need_iommu(d)) ) {
674 pci_release_devices(d);
675 iommu_domain_destroy(d);
676 }
678 tlb_track_destroy(d);
680 /* Clear vTLB for the next domain. */
681 domain_flush_tlb_vhpt(d);
683 deallocate_rid_range(d);
684 }
686 void arch_vcpu_reset(struct vcpu *v)
687 {
688 /* FIXME: Stub for now */
689 }
691 /* Here it is assumed that all of the CPUs have the same RSE.N_STACKED_PHYS */
692 static unsigned long num_phys_stacked;
693 static int __init
694 init_num_phys_stacked(void)
695 {
696 switch (ia64_pal_rse_info(&num_phys_stacked, NULL)) {
697 case 0L:
698 printk("the number of physical stacked general registers"
699 "(RSE.N_STACKED_PHYS) = %ld\n", num_phys_stacked);
700 return 0;
701 case -2L:
702 case -3L:
703 default:
704 break;
705 }
706 printk("WARNING: PAL_RSE_INFO call failed. "
707 "domain save/restore may NOT work!\n");
708 return -EINVAL;
709 }
710 __initcall(init_num_phys_stacked);
712 #define COPY_FPREG(dst, src) memcpy(dst, src, sizeof(struct ia64_fpreg))
714 #define AR_PFS_PEC_SHIFT 51
715 #define AR_PFS_REC_SIZE 6
716 #define AR_PFS_PEC_MASK (((1UL << 6) - 1) << 51)
718 /*
719 * See init_switch_stack() and ptrace.h
720 */
721 static struct switch_stack*
722 vcpu_to_switch_stack(struct vcpu* v)
723 {
724 return (struct switch_stack *)(v->arch._thread.ksp + 16);
725 }
727 static int
728 vcpu_has_not_run(struct vcpu* v)
729 {
730 extern void ia64_ret_from_clone;
731 struct switch_stack *sw = vcpu_to_switch_stack(v);
733 return (sw == (struct switch_stack *)(vcpu_regs(v)) - 1) &&
734 (sw->b0 == (unsigned long)&ia64_ret_from_clone);
735 }
737 static void
738 nats_update(unsigned int* nats, unsigned int reg, char nat)
739 {
740 BUG_ON(reg > 31);
742 if (nat)
743 *nats |= (1UL << reg);
744 else
745 *nats &= ~(1UL << reg);
746 }
748 static unsigned long
749 __vcpu_get_itc(struct vcpu *v)
750 {
751 unsigned long itc_last;
752 unsigned long itc_offset;
753 unsigned long itc;
755 if (unlikely(v->arch.privregs == NULL))
756 return ia64_get_itc();
758 itc_last = v->arch.privregs->itc_last;
759 itc_offset = v->arch.privregs->itc_offset;
760 itc = ia64_get_itc();
761 itc += itc_offset;
762 if (itc_last >= itc)
763 itc = itc_last;
764 return itc;
765 }
767 static void
768 __vcpu_set_itc(struct vcpu *v, u64 val)
769 {
770 unsigned long itc;
771 unsigned long itc_offset;
772 unsigned long itc_last;
774 BUG_ON(v->arch.privregs == NULL);
776 if (v != current)
777 vcpu_pause(v);
779 itc = ia64_get_itc();
780 itc_offset = val - itc;
781 itc_last = val;
783 v->arch.privregs->itc_offset = itc_offset;
784 v->arch.privregs->itc_last = itc_last;
786 if (v != current)
787 vcpu_unpause(v);
788 }
790 void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
791 {
792 int i;
793 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
794 struct cpu_user_regs *uregs = vcpu_regs(v);
795 struct switch_stack *sw = vcpu_to_switch_stack(v);
796 struct unw_frame_info info;
797 int is_hvm = VMX_DOMAIN(v);
798 unsigned int rbs_size;
799 unsigned long *const rbs_bottom = vcpu_to_rbs_bottom(v);
800 unsigned long *rbs_top;
801 unsigned long *rbs_rnat_addr;
802 unsigned int top_slot;
803 unsigned int num_regs;
805 memset(c.nat, 0, sizeof(*c.nat));
806 c.nat->regs.b[6] = uregs->b6;
807 c.nat->regs.b[7] = uregs->b7;
809 memset(&info, 0, sizeof(info));
810 unw_init_from_blocked_task(&info, v);
811 if (vcpu_has_not_run(v)) {
812 c.nat->regs.ar.lc = sw->ar_lc;
813 c.nat->regs.ar.ec =
814 (sw->ar_pfs & AR_PFS_PEC_MASK) >> AR_PFS_PEC_SHIFT;
815 } else if (unw_unwind_to_user(&info) < 0) {
816 /* warn: should panic? */
817 gdprintk(XENLOG_ERR, "vcpu=%d unw_unwind_to_user() failed.\n",
818 v->vcpu_id);
819 show_stack(v, NULL);
821 /* can't return error */
822 c.nat->regs.ar.lc = 0;
823 c.nat->regs.ar.ec = 0;
824 } else {
825 unw_get_ar(&info, UNW_AR_LC, &c.nat->regs.ar.lc);
826 unw_get_ar(&info, UNW_AR_EC, &c.nat->regs.ar.ec);
827 }
829 if (!is_hvm)
830 c.nat->regs.ar.itc = __vcpu_get_itc(v);
832 c.nat->regs.ar.csd = uregs->ar_csd;
833 c.nat->regs.ar.ssd = uregs->ar_ssd;
835 c.nat->regs.r[8] = uregs->r8;
836 c.nat->regs.r[9] = uregs->r9;
837 c.nat->regs.r[10] = uregs->r10;
838 c.nat->regs.r[11] = uregs->r11;
840 if (is_hvm)
841 c.nat->regs.psr = vmx_vcpu_get_psr(v);
842 else
843 c.nat->regs.psr = vcpu_get_psr(v);
845 c.nat->regs.ip = uregs->cr_iip;
846 c.nat->regs.cfm = uregs->cr_ifs;
848 c.nat->regs.ar.unat = uregs->ar_unat;
849 c.nat->regs.ar.pfs = uregs->ar_pfs;
850 c.nat->regs.ar.rsc = uregs->ar_rsc;
851 c.nat->regs.ar.rnat = uregs->ar_rnat;
852 c.nat->regs.ar.bspstore = uregs->ar_bspstore;
854 c.nat->regs.pr = uregs->pr;
855 c.nat->regs.b[0] = uregs->b0;
856 rbs_size = uregs->loadrs >> 16;
857 num_regs = ia64_rse_num_regs(rbs_bottom,
858 (unsigned long*)((char*)rbs_bottom + rbs_size));
859 c.nat->regs.ar.bsp = (unsigned long)ia64_rse_skip_regs(
860 (unsigned long*)c.nat->regs.ar.bspstore, num_regs);
861 BUG_ON(num_regs > num_phys_stacked);
863 c.nat->regs.r[1] = uregs->r1;
864 c.nat->regs.r[12] = uregs->r12;
865 c.nat->regs.r[13] = uregs->r13;
866 c.nat->regs.ar.fpsr = uregs->ar_fpsr;
867 c.nat->regs.r[15] = uregs->r15;
869 c.nat->regs.r[14] = uregs->r14;
870 c.nat->regs.r[2] = uregs->r2;
871 c.nat->regs.r[3] = uregs->r3;
872 c.nat->regs.r[16] = uregs->r16;
873 c.nat->regs.r[17] = uregs->r17;
874 c.nat->regs.r[18] = uregs->r18;
875 c.nat->regs.r[19] = uregs->r19;
876 c.nat->regs.r[20] = uregs->r20;
877 c.nat->regs.r[21] = uregs->r21;
878 c.nat->regs.r[22] = uregs->r22;
879 c.nat->regs.r[23] = uregs->r23;
880 c.nat->regs.r[24] = uregs->r24;
881 c.nat->regs.r[25] = uregs->r25;
882 c.nat->regs.r[26] = uregs->r26;
883 c.nat->regs.r[27] = uregs->r27;
884 c.nat->regs.r[28] = uregs->r28;
885 c.nat->regs.r[29] = uregs->r29;
886 c.nat->regs.r[30] = uregs->r30;
887 c.nat->regs.r[31] = uregs->r31;
889 c.nat->regs.ar.ccv = uregs->ar_ccv;
891 COPY_FPREG(&c.nat->regs.f[2], &sw->f2);
892 COPY_FPREG(&c.nat->regs.f[3], &sw->f3);
893 COPY_FPREG(&c.nat->regs.f[4], &sw->f4);
894 COPY_FPREG(&c.nat->regs.f[5], &sw->f5);
896 COPY_FPREG(&c.nat->regs.f[6], &uregs->f6);
897 COPY_FPREG(&c.nat->regs.f[7], &uregs->f7);
898 COPY_FPREG(&c.nat->regs.f[8], &uregs->f8);
899 COPY_FPREG(&c.nat->regs.f[9], &uregs->f9);
900 COPY_FPREG(&c.nat->regs.f[10], &uregs->f10);
901 COPY_FPREG(&c.nat->regs.f[11], &uregs->f11);
903 COPY_FPREG(&c.nat->regs.f[12], &sw->f12);
904 COPY_FPREG(&c.nat->regs.f[13], &sw->f13);
905 COPY_FPREG(&c.nat->regs.f[14], &sw->f14);
906 COPY_FPREG(&c.nat->regs.f[15], &sw->f15);
907 COPY_FPREG(&c.nat->regs.f[16], &sw->f16);
908 COPY_FPREG(&c.nat->regs.f[17], &sw->f17);
909 COPY_FPREG(&c.nat->regs.f[18], &sw->f18);
910 COPY_FPREG(&c.nat->regs.f[19], &sw->f19);
911 COPY_FPREG(&c.nat->regs.f[20], &sw->f20);
912 COPY_FPREG(&c.nat->regs.f[21], &sw->f21);
913 COPY_FPREG(&c.nat->regs.f[22], &sw->f22);
914 COPY_FPREG(&c.nat->regs.f[23], &sw->f23);
915 COPY_FPREG(&c.nat->regs.f[24], &sw->f24);
916 COPY_FPREG(&c.nat->regs.f[25], &sw->f25);
917 COPY_FPREG(&c.nat->regs.f[26], &sw->f26);
918 COPY_FPREG(&c.nat->regs.f[27], &sw->f27);
919 COPY_FPREG(&c.nat->regs.f[28], &sw->f28);
920 COPY_FPREG(&c.nat->regs.f[29], &sw->f29);
921 COPY_FPREG(&c.nat->regs.f[30], &sw->f30);
922 COPY_FPREG(&c.nat->regs.f[31], &sw->f31);
924 // f32 - f127
925 memcpy(&c.nat->regs.f[32], &v->arch._thread.fph[0],
926 sizeof(v->arch._thread.fph));
928 #define NATS_UPDATE(reg) \
929 nats_update(&c.nat->regs.nats, (reg), \
930 !!(uregs->eml_unat & \
931 (1UL << ia64_unat_pos(&uregs->r ## reg))))
933 // corresponding bit in ar.unat is determined by
934 // (&uregs->rN){8:3}.
935 // r8: the lowest gr member of struct cpu_user_regs.
936 // r7: the highest gr member of struct cpu_user_regs.
937 BUILD_BUG_ON(offsetof(struct cpu_user_regs, r7) -
938 offsetof(struct cpu_user_regs, r8) >
939 64 * sizeof(unsigned long));
941 NATS_UPDATE(1);
942 NATS_UPDATE(2);
943 NATS_UPDATE(3);
945 NATS_UPDATE(8);
946 NATS_UPDATE(9);
947 NATS_UPDATE(10);
948 NATS_UPDATE(11);
949 NATS_UPDATE(12);
950 NATS_UPDATE(13);
951 NATS_UPDATE(14);
952 NATS_UPDATE(15);
953 NATS_UPDATE(16);
954 NATS_UPDATE(17);
955 NATS_UPDATE(18);
956 NATS_UPDATE(19);
957 NATS_UPDATE(20);
958 NATS_UPDATE(21);
959 NATS_UPDATE(22);
960 NATS_UPDATE(23);
961 NATS_UPDATE(24);
962 NATS_UPDATE(25);
963 NATS_UPDATE(26);
964 NATS_UPDATE(27);
965 NATS_UPDATE(28);
966 NATS_UPDATE(29);
967 NATS_UPDATE(30);
968 NATS_UPDATE(31);
970 if (!is_hvm) {
971 c.nat->regs.r[4] = uregs->r4;
972 c.nat->regs.r[5] = uregs->r5;
973 c.nat->regs.r[6] = uregs->r6;
974 c.nat->regs.r[7] = uregs->r7;
976 NATS_UPDATE(4);
977 NATS_UPDATE(5);
978 NATS_UPDATE(6);
979 NATS_UPDATE(7);
980 #undef NATS_UPDATE
981 } else {
982 /*
983 * for a VTi domain, r[4-7] are sometimes saved both in
984 * uregs->r[4-7] and the memory stack, and sometimes only in the
985 * memory stack. So it is ok to get them from the memory stack.
986 */
987 if (vcpu_has_not_run(v)) {
988 c.nat->regs.r[4] = sw->r4;
989 c.nat->regs.r[5] = sw->r5;
990 c.nat->regs.r[6] = sw->r6;
991 c.nat->regs.r[7] = sw->r7;
993 nats_update(&c.nat->regs.nats, 4,
994 !!(sw->ar_unat &
995 (1UL << ia64_unat_pos(&sw->r4))));
996 nats_update(&c.nat->regs.nats, 5,
997 !!(sw->ar_unat &
998 (1UL << ia64_unat_pos(&sw->r5))));
999 nats_update(&c.nat->regs.nats, 6,
1000 !!(sw->ar_unat &
1001 (1UL << ia64_unat_pos(&sw->r6))));
1002 nats_update(&c.nat->regs.nats, 7,
1003 !!(sw->ar_unat &
1004 (1UL << ia64_unat_pos(&sw->r7))));
1005 } else {
1006 char nat;
1008 unw_get_gr(&info, 4, &c.nat->regs.r[4], &nat);
1009 nats_update(&c.nat->regs.nats, 4, nat);
1010 unw_get_gr(&info, 5, &c.nat->regs.r[5], &nat);
1011 nats_update(&c.nat->regs.nats, 5, nat);
1012 unw_get_gr(&info, 6, &c.nat->regs.r[6], &nat);
1013 nats_update(&c.nat->regs.nats, 6, nat);
1014 unw_get_gr(&info, 7, &c.nat->regs.r[7], &nat);
1015 nats_update(&c.nat->regs.nats, 7, nat);
1019 c.nat->regs.rbs_voff = (IA64_RBS_OFFSET / 8) % 64;
1020 if (unlikely(rbs_size > sizeof(c.nat->regs.rbs)))
1021 gdprintk(XENLOG_INFO,
1022 "rbs_size is too large 0x%x > 0x%lx\n",
1023 rbs_size, sizeof(c.nat->regs.rbs));
1024 else
1025 memcpy(c.nat->regs.rbs, rbs_bottom, rbs_size);
1027 rbs_top = (unsigned long*)((char *)rbs_bottom + rbs_size) - 1;
1028 rbs_rnat_addr = ia64_rse_rnat_addr(rbs_top);
1029 if ((unsigned long)rbs_rnat_addr >= sw->ar_bspstore)
1030 rbs_rnat_addr = &sw->ar_rnat;
1032 top_slot = ia64_rse_slot_num(rbs_top);
1034 c.nat->regs.rbs_rnat = (*rbs_rnat_addr) & ((1UL << top_slot) - 1);
1035 if (ia64_rse_rnat_addr(rbs_bottom) == ia64_rse_rnat_addr(rbs_top)) {
1036 unsigned int bottom_slot = ia64_rse_slot_num(rbs_bottom);
1037 c.nat->regs.rbs_rnat &= ~((1UL << bottom_slot) - 1);
1040 c.nat->regs.num_phys_stacked = num_phys_stacked;
1042 if (VMX_DOMAIN(v))
1043 c.nat->privregs_pfn = VGC_PRIVREGS_HVM;
1044 else
1045 c.nat->privregs_pfn = get_gpfn_from_mfn(
1046 virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT);
1048 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
1049 if (VMX_DOMAIN(v)) {
1050 vmx_vcpu_get_dbr(v, i, &c.nat->regs.dbr[i]);
1051 vmx_vcpu_get_ibr(v, i, &c.nat->regs.ibr[i]);
1052 } else {
1053 vcpu_get_dbr(v, i, &c.nat->regs.dbr[i]);
1054 vcpu_get_ibr(v, i, &c.nat->regs.ibr[i]);
1058 for (i = 0; i < 8; i++)
1059 vcpu_get_rr(v, (unsigned long)i << 61, &c.nat->regs.rr[i]);
1061 /* Fill extra regs. */
1062 for (i = 0;
1063 (i < sizeof(tr->itrs) / sizeof(tr->itrs[0])) && i < NITRS;
1064 i++) {
1065 tr->itrs[i].pte = v->arch.itrs[i].pte.val;
1066 tr->itrs[i].itir = v->arch.itrs[i].itir;
1067 tr->itrs[i].vadr = v->arch.itrs[i].vadr;
1068 tr->itrs[i].rid = v->arch.itrs[i].rid;
1070 for (i = 0;
1071 (i < sizeof(tr->dtrs) / sizeof(tr->dtrs[0])) && i < NDTRS;
1072 i++) {
1073 tr->dtrs[i].pte = v->arch.dtrs[i].pte.val;
1074 tr->dtrs[i].itir = v->arch.dtrs[i].itir;
1075 tr->dtrs[i].vadr = v->arch.dtrs[i].vadr;
1076 tr->dtrs[i].rid = v->arch.dtrs[i].rid;
1078 c.nat->event_callback_ip = v->arch.event_callback_ip;
1080 /* If PV and privregs is not set, we can't read mapped registers. */
1081 if (!is_hvm_vcpu(v) && v->arch.privregs == NULL)
1082 return;
1084 vcpu_get_dcr(v, &c.nat->regs.cr.dcr);
1086 c.nat->regs.cr.itm = is_hvm_vcpu(v) ?
1087 vmx_vcpu_get_itm(v) : PSCBX(v, domain_itm);
1088 vcpu_get_iva(v, &c.nat->regs.cr.iva);
1089 vcpu_get_pta(v, &c.nat->regs.cr.pta);
1091 vcpu_get_ipsr(v, &c.nat->regs.cr.ipsr);
1092 vcpu_get_isr(v, &c.nat->regs.cr.isr);
1093 vcpu_get_iip(v, &c.nat->regs.cr.iip);
1094 vcpu_get_ifa(v, &c.nat->regs.cr.ifa);
1095 vcpu_get_itir(v, &c.nat->regs.cr.itir);
1096 vcpu_get_iha(v, &c.nat->regs.cr.iha);
1098 //XXX change irr[] and arch.insvc[]
1099 if (is_hvm_vcpu(v))
1100 /* c.nat->regs.cr.ivr = vmx_vcpu_get_ivr(v)*/;//XXXnot SMP-safe
1101 else
1102 vcpu_get_ivr (v, &c.nat->regs.cr.ivr);
1103 vcpu_get_iim(v, &c.nat->regs.cr.iim);
1105 vcpu_get_tpr(v, &c.nat->regs.cr.tpr);
1106 vcpu_get_irr0(v, &c.nat->regs.cr.irr[0]);
1107 vcpu_get_irr1(v, &c.nat->regs.cr.irr[1]);
1108 vcpu_get_irr2(v, &c.nat->regs.cr.irr[2]);
1109 vcpu_get_irr3(v, &c.nat->regs.cr.irr[3]);
1110 vcpu_get_itv(v, &c.nat->regs.cr.itv);//XXX vlsapic
1111 vcpu_get_pmv(v, &c.nat->regs.cr.pmv);
1112 vcpu_get_cmcv(v, &c.nat->regs.cr.cmcv);
1114 if (is_hvm)
1115 vmx_arch_get_info_guest(v, c);
1118 #if 0
1119 // for debug
1120 static void
1121 __rbs_print(const char* func, int line, const char* name,
1122 const unsigned long* rbs, unsigned int rbs_size)
1124 unsigned int i;
1125 printk("%s:%d %s rbs %p\n", func, line, name, rbs);
1126 printk(" rbs_size 0x%016x no 0x%lx\n",
1127 rbs_size, rbs_size / sizeof(unsigned long));
1129 for (i = 0; i < rbs_size / sizeof(unsigned long); i++) {
1130 const char* zero_or_n = "0x";
1131 if (ia64_rse_is_rnat_slot((unsigned long*)&rbs[i]))
1132 zero_or_n = "Nx";
1134 if ((i % 3) == 0)
1135 printk("0x%02x:", i);
1136 printk(" %s%016lx", zero_or_n, rbs[i]);
1137 if ((i % 3) == 2)
1138 printk("\n");
1140 printk("\n");
1143 #define rbs_print(rbs, rbs_size) \
1144 __rbs_print(__func__, __LINE__, (#rbs), (rbs), (rbs_size))
1145 #endif
1147 static int
1148 copy_rbs(struct vcpu* v, unsigned long* dst_rbs_size,
1149 const unsigned long* rbs, unsigned long rbs_size,
1150 unsigned long src_rnat, unsigned long rbs_voff)
1152 int rc = -EINVAL;
1153 struct page_info* page;
1154 unsigned char* vaddr;
1155 unsigned long* src_bsp;
1156 unsigned long* src_bspstore;
1158 struct switch_stack* sw = vcpu_to_switch_stack(v);
1159 unsigned long num_regs;
1160 unsigned long* dst_bsp;
1161 unsigned long* dst_bspstore;
1162 unsigned long* dst_rnat;
1163 unsigned long dst_rnat_tmp;
1164 unsigned long dst_rnat_mask;
1165 unsigned long flags;
1166 extern void ia64_copy_rbs(unsigned long* dst_bspstore,
1167 unsigned long* dst_rbs_size,
1168 unsigned long* dst_rnat_p,
1169 unsigned long* src_bsp,
1170 unsigned long src_rbs_size,
1171 unsigned long src_rnat);
1173 dst_bspstore = vcpu_to_rbs_bottom(v);
1174 *dst_rbs_size = rbs_size;
1175 if (rbs_size == 0)
1176 return 0;
1178 // the rbs offset depends on sizeof(struct vcpu), so it is
1179 // too unstable for the hypercall ABI.
1180 // we need to take the rbs offset into account.
1181 //memcpy(dst_bspstore, c.nat->regs.rbs, rbs_size);
1183 // It is assumed that rbs_size is small enough compared
1184 // to KERNEL_STACK_SIZE.
1185 page = alloc_domheap_pages(NULL, KERNEL_STACK_SIZE_ORDER, 0);
1186 if (page == NULL)
1187 return -ENOMEM;
1188 vaddr = page_to_virt(page);
1190 src_bspstore = (unsigned long*)(vaddr + rbs_voff * 8);
1191 src_bsp = (unsigned long*)((unsigned char*)src_bspstore + rbs_size);
1192 if ((unsigned long)src_bsp >= (unsigned long)vaddr + PAGE_SIZE)
1193 goto out;
1194 memcpy(src_bspstore, rbs, rbs_size);
1196 num_regs = ia64_rse_num_regs(src_bspstore, src_bsp);
1197 dst_bsp = ia64_rse_skip_regs(dst_bspstore, num_regs);
1198 *dst_rbs_size = (unsigned long)dst_bsp - (unsigned long)dst_bspstore;
1200 // rough check.
1201 if (((unsigned long)dst_bsp & ~PAGE_MASK) > KERNEL_STACK_SIZE / 2)
1202 goto out;
1204 // ia64_copy_rbs() uses the real cpu's register stack.
1205 // So it may fault with an Illegal Operation fault, resulting
1206 // in a panic, if rbs_size is too large to load compared to
1207 // the number of physical stacked registers, RSE.N_STACKED_PHYS,
1208 // which is cpu implementation specific.
1209 // See SDM vol. 2 Register Stack Engine 6, especially 6.5.5.
1210 //
1211 // For safe operation and cpu model independence,
1212 // we would need to copy them by hand without loadrs and flushrs.
1213 // However, even if we implemented that, a similar issue still occurs
1214 // when running the guest: the CPU context restore routine issues loadrs,
1215 // resulting in an Illegal Operation fault. And what if the vRSE is in
1216 // enforced lazy mode? We can't store any dirty stacked registers
1217 // into RBS without cover or br.call.
1218 if (num_regs > num_phys_stacked) {
1219 rc = -ENOSYS;
1220 gdprintk(XENLOG_WARNING,
1221 "%s:%d domain %d: can't load stacked registres\n"
1222 "requested size 0x%lx => 0x%lx, num regs %ld"
1223 "RSE.N_STACKED_PHYS %ld\n",
1224 __func__, __LINE__, v->domain->domain_id,
1225 rbs_size, *dst_rbs_size, num_regs,
1226 num_phys_stacked);
1227 goto out;
1230 // we mask interrupts to avoid using register backing store.
1231 local_irq_save(flags);
1232 ia64_copy_rbs(dst_bspstore, dst_rbs_size, &dst_rnat_tmp,
1233 src_bsp, rbs_size, src_rnat);
1234 local_irq_restore(flags);
1236 dst_rnat_mask = (1UL << ia64_rse_slot_num(dst_bsp)) - 1;
1237 dst_rnat = ia64_rse_rnat_addr(dst_bsp);
1238 if ((unsigned long)dst_rnat > sw->ar_bspstore)
1239 dst_rnat = &sw->ar_rnat;
1240 // if ia64_rse_rnat_addr(dst_bsp) ==
1241 // ia64_rse_rnat_addr(vcpu_to_rbs_bottom(v)), the lsb bit of rnat
1242 // is just ignored. so we don't have to mask it out.
1243 *dst_rnat =
1244 (*dst_rnat & ~dst_rnat_mask) | (dst_rnat_tmp & dst_rnat_mask);
1246 rc = 0;
1247 out:
1248 free_domheap_pages(page, KERNEL_STACK_SIZE_ORDER);
1249 return rc;
1252 static void
1253 unat_update(unsigned long *unat_eml, unsigned long *spill_addr, char nat)
1255 unsigned int pos = ia64_unat_pos(spill_addr);
1256 if (nat)
1257 *unat_eml |= (1UL << pos);
1258 else
1259 *unat_eml &= ~(1UL << pos);
1262 int arch_set_info_guest(struct vcpu *v, vcpu_guest_context_u c)
1264 struct cpu_user_regs *uregs = vcpu_regs(v);
1265 struct domain *d = v->domain;
1266 struct switch_stack *sw = vcpu_to_switch_stack(v);
1267 int was_initialised = v->is_initialised;
1268 struct unw_frame_info info;
1269 unsigned int rbs_size;
1270 unsigned int num_regs;
1271 unsigned long * const rbs_bottom = vcpu_to_rbs_bottom(v);
1272 int rc = 0;
1273 int i;
1275 /* Finish vcpu initialization. */
1276 if (!was_initialised) {
1277 if (is_hvm_domain(d))
1278 rc = vmx_final_setup_guest(v);
1279 else
1280 rc = vcpu_late_initialise(v);
1281 if (rc != 0)
1282 return rc;
1284 vcpu_init_regs(v);
1286 v->is_initialised = 1;
1287 /* Auto-online VCPU0 when it is initialised. */
1288 if (v->vcpu_id == 0 || (c.nat != NULL &&
1289 c.nat->flags & VGCF_online))
1290 clear_bit(_VPF_down, &v->pause_flags);
1293 if (c.nat == NULL)
1294 return 0;
1296 uregs->b6 = c.nat->regs.b[6];
1297 uregs->b7 = c.nat->regs.b[7];
1299 memset(&info, 0, sizeof(info));
1300 unw_init_from_blocked_task(&info, v);
1301 if (vcpu_has_not_run(v)) {
1302 sw->ar_lc = c.nat->regs.ar.lc;
1303 sw->ar_pfs =
1304 (sw->ar_pfs & ~AR_PFS_PEC_MASK) |
1305 ((c.nat->regs.ar.ec << AR_PFS_PEC_SHIFT) &
1306 AR_PFS_PEC_MASK);
1307 } else if (unw_unwind_to_user(&info) < 0) {
1308 /* warn: should panic? */
1309 gdprintk(XENLOG_ERR,
1310 "vcpu=%d unw_unwind_to_user() failed.\n",
1311 v->vcpu_id);
1312 show_stack(v, NULL);
1314 //return -ENOSYS;
1315 } else {
1316 unw_set_ar(&info, UNW_AR_LC, c.nat->regs.ar.lc);
1317 unw_set_ar(&info, UNW_AR_EC, c.nat->regs.ar.ec);
1320 if (!is_hvm_domain(d) && (c.nat->flags & VGCF_SET_AR_ITC))
1321 __vcpu_set_itc(v, c.nat->regs.ar.itc);
1323 uregs->ar_csd = c.nat->regs.ar.csd;
1324 uregs->ar_ssd = c.nat->regs.ar.ssd;
1326 uregs->r8 = c.nat->regs.r[8];
1327 uregs->r9 = c.nat->regs.r[9];
1328 uregs->r10 = c.nat->regs.r[10];
1329 uregs->r11 = c.nat->regs.r[11];
1331 if (!is_hvm_domain(d))
1332 vcpu_set_psr(v, c.nat->regs.psr);
1333 else
1334 vmx_vcpu_set_psr(v, c.nat->regs.psr);
1335 uregs->cr_iip = c.nat->regs.ip;
1336 uregs->cr_ifs = c.nat->regs.cfm;
1338 uregs->ar_unat = c.nat->regs.ar.unat;
1339 uregs->ar_pfs = c.nat->regs.ar.pfs;
1340 uregs->ar_rsc = c.nat->regs.ar.rsc;
1341 uregs->ar_rnat = c.nat->regs.ar.rnat;
1342 uregs->ar_bspstore = c.nat->regs.ar.bspstore;
1344 uregs->pr = c.nat->regs.pr;
1345 uregs->b0 = c.nat->regs.b[0];
1346 num_regs = ia64_rse_num_regs((unsigned long*)c.nat->regs.ar.bspstore,
1347 (unsigned long*)c.nat->regs.ar.bsp);
1348 rbs_size = (unsigned long)ia64_rse_skip_regs(rbs_bottom, num_regs) -
1349 (unsigned long)rbs_bottom;
1350 if (rbs_size > sizeof (c.nat->regs.rbs)) {
1351 gdprintk(XENLOG_INFO,
1352 "rbs size is too large %x > %lx\n",
1353 rbs_size, sizeof (c.nat->regs.rbs));
1354 return -EINVAL;
1356 if (rbs_size > 0 &&
1357 ((IA64_RBS_OFFSET / 8) % 64) != c.nat->regs.rbs_voff)
1358 gdprintk(XENLOG_INFO,
1359 "rbs stack offset is different! xen 0x%x given 0x%x",
1360 (IA64_RBS_OFFSET / 8) % 64, c.nat->regs.rbs_voff);
1362 /* Protection against crazy user code. */
1363 if (!was_initialised)
1364 uregs->loadrs = (rbs_size << 16);
1365 if (rbs_size == (uregs->loadrs >> 16)) {
1366 unsigned long dst_rbs_size = 0;
1367 if (vcpu_has_not_run(v))
1368 sw->ar_bspstore = (unsigned long)rbs_bottom;
1370 rc = copy_rbs(v, &dst_rbs_size,
1371 c.nat->regs.rbs, rbs_size,
1372 c.nat->regs.rbs_rnat,
1373 c.nat->regs.rbs_voff);
1374 if (rc < 0)
1375 return rc;
1377 /* In case of newly created vcpu, ar_bspstore points to
1378 * the bottom of register stack. Move it up.
1379 * See also init_switch_stack().
1380 */
1381 if (vcpu_has_not_run(v)) {
1382 uregs->loadrs = (dst_rbs_size << 16);
1383 sw->ar_bspstore = (unsigned long)((char*)rbs_bottom +
1384 dst_rbs_size);
1388 // inhibit save/restore between cpus of different RSE.N_STACKED_PHYS.
1389 // to avoid nasty issues.
1390 //
1391 // The number of physical stacked general registers (RSE.N_STACKED_PHYS)
1392 // isn't virtualized. A guest OS utilizes it via the PAL_RSE_INFO call and
1393 // the value might be exported to users/user processes.
1394 // (Linux does so via /proc/cpuinfo)
1395 // The SDM says only that the number is cpu implementation specific.
1396 //
1397 // If the number on the restoring cpu is different from that of the saving cpu,
1398 // the following, or something worse, might happen.
1399 // - Xen VMM itself may panic when issuing loadrs to run guest with
1400 // illegal operation fault
1401 // When RSE.N_STACKED_PHYS of saving CPU > RSE.N_STACKED_PHYS of
1402 // restoring CPU
1403 // This case is detected to refuse restore by rbs_copy()
1404 // - guest kernel may panic with illegal operation fault
1405 // When RSE.N_STACKED_PHYS of saving CPU > RSE.N_STACKED_PHYS of
1406 // restoring CPU
1407 // - information leak from the guest kernel to a user process
1408 // When RSE.N_STACKED_PHYS of saving CPU < RSE.N_STACKED_PHYS of
1409 // restoring CPU
1410 // Before returning to a user process, the kernel should zero out all
1411 // physical stacked registers to prevent leaking kernel bits.
1412 // It would do so based on RSE.N_STACKED_PHYS (Linux does.).
1413 // On the restored environment the kernel clears only a part
1414 // of the physical stacked registers.
1415 // - user processes or human operators would be confused.
1416 // RSE.N_STACKED_PHYS might be exported to user process or human
1417 // operators. Actually on linux it is exported via /proc/cpuinfo.
1418 // user processes might use it.
1419 // I don't know any concrete example, but it's possible in theory.
1420 // e.g. a thread library may allocate the RBS area based on the value.
1421 // (Fortunately glibc nptl doesn't)
1422 if (c.nat->regs.num_phys_stacked != 0 && /* COMPAT */
1423 c.nat->regs.num_phys_stacked != num_phys_stacked) {
1424 gdprintk(XENLOG_WARNING,
1425 "num phys stacked is different! "
1426 "xen 0x%lx given 0x%lx",
1427 num_phys_stacked, c.nat->regs.num_phys_stacked);
1428 return -EINVAL;
1431 uregs->r1 = c.nat->regs.r[1];
1432 uregs->r12 = c.nat->regs.r[12];
1433 uregs->r13 = c.nat->regs.r[13];
1434 uregs->ar_fpsr = c.nat->regs.ar.fpsr;
1435 uregs->r15 = c.nat->regs.r[15];
1437 uregs->r14 = c.nat->regs.r[14];
1438 uregs->r2 = c.nat->regs.r[2];
1439 uregs->r3 = c.nat->regs.r[3];
1440 uregs->r16 = c.nat->regs.r[16];
1441 uregs->r17 = c.nat->regs.r[17];
1442 uregs->r18 = c.nat->regs.r[18];
1443 uregs->r19 = c.nat->regs.r[19];
1444 uregs->r20 = c.nat->regs.r[20];
1445 uregs->r21 = c.nat->regs.r[21];
1446 uregs->r22 = c.nat->regs.r[22];
1447 uregs->r23 = c.nat->regs.r[23];
1448 uregs->r24 = c.nat->regs.r[24];
1449 uregs->r25 = c.nat->regs.r[25];
1450 uregs->r26 = c.nat->regs.r[26];
1451 uregs->r27 = c.nat->regs.r[27];
1452 uregs->r28 = c.nat->regs.r[28];
1453 uregs->r29 = c.nat->regs.r[29];
1454 uregs->r30 = c.nat->regs.r[30];
1455 uregs->r31 = c.nat->regs.r[31];
1457 uregs->ar_ccv = c.nat->regs.ar.ccv;
1459 COPY_FPREG(&sw->f2, &c.nat->regs.f[2]);
1460 COPY_FPREG(&sw->f3, &c.nat->regs.f[3]);
1461 COPY_FPREG(&sw->f4, &c.nat->regs.f[4]);
1462 COPY_FPREG(&sw->f5, &c.nat->regs.f[5]);
1464 COPY_FPREG(&uregs->f6, &c.nat->regs.f[6]);
1465 COPY_FPREG(&uregs->f7, &c.nat->regs.f[7]);
1466 COPY_FPREG(&uregs->f8, &c.nat->regs.f[8]);
1467 COPY_FPREG(&uregs->f9, &c.nat->regs.f[9]);
1468 COPY_FPREG(&uregs->f10, &c.nat->regs.f[10]);
1469 COPY_FPREG(&uregs->f11, &c.nat->regs.f[11]);
1471 COPY_FPREG(&sw->f12, &c.nat->regs.f[12]);
1472 COPY_FPREG(&sw->f13, &c.nat->regs.f[13]);
1473 COPY_FPREG(&sw->f14, &c.nat->regs.f[14]);
1474 COPY_FPREG(&sw->f15, &c.nat->regs.f[15]);
1475 COPY_FPREG(&sw->f16, &c.nat->regs.f[16]);
1476 COPY_FPREG(&sw->f17, &c.nat->regs.f[17]);
1477 COPY_FPREG(&sw->f18, &c.nat->regs.f[18]);
1478 COPY_FPREG(&sw->f19, &c.nat->regs.f[19]);
1479 COPY_FPREG(&sw->f20, &c.nat->regs.f[20]);
1480 COPY_FPREG(&sw->f21, &c.nat->regs.f[21]);
1481 COPY_FPREG(&sw->f22, &c.nat->regs.f[22]);
1482 COPY_FPREG(&sw->f23, &c.nat->regs.f[23]);
1483 COPY_FPREG(&sw->f24, &c.nat->regs.f[24]);
1484 COPY_FPREG(&sw->f25, &c.nat->regs.f[25]);
1485 COPY_FPREG(&sw->f26, &c.nat->regs.f[26]);
1486 COPY_FPREG(&sw->f27, &c.nat->regs.f[27]);
1487 COPY_FPREG(&sw->f28, &c.nat->regs.f[28]);
1488 COPY_FPREG(&sw->f29, &c.nat->regs.f[29]);
1489 COPY_FPREG(&sw->f30, &c.nat->regs.f[30]);
1490 COPY_FPREG(&sw->f31, &c.nat->regs.f[31]);
1492 // f32 - f127
1493 memcpy(&v->arch._thread.fph[0], &c.nat->regs.f[32],
1494 sizeof(v->arch._thread.fph));
1496 #define UNAT_UPDATE(reg) \
1497 unat_update(&uregs->eml_unat, &uregs->r ## reg, \
1498 !!(c.nat->regs.nats & (1UL << (reg))));
1500 uregs->eml_unat = 0;
1501 UNAT_UPDATE(1);
1502 UNAT_UPDATE(2);
1503 UNAT_UPDATE(3);
1505 UNAT_UPDATE(8);
1506 UNAT_UPDATE(9);
1507 UNAT_UPDATE(10);
1508 UNAT_UPDATE(11);
1509 UNAT_UPDATE(12);
1510 UNAT_UPDATE(13);
1511 UNAT_UPDATE(14);
1512 UNAT_UPDATE(15);
1513 UNAT_UPDATE(16);
1514 UNAT_UPDATE(17);
1515 UNAT_UPDATE(18);
1516 UNAT_UPDATE(19);
1517 UNAT_UPDATE(20);
1518 UNAT_UPDATE(21);
1519 UNAT_UPDATE(22);
1520 UNAT_UPDATE(23);
1521 UNAT_UPDATE(24);
1522 UNAT_UPDATE(25);
1523 UNAT_UPDATE(26);
1524 UNAT_UPDATE(27);
1525 UNAT_UPDATE(28);
1526 UNAT_UPDATE(29);
1527 UNAT_UPDATE(30);
1528 UNAT_UPDATE(31);
1530 /*
1531 * r4-r7 are sometimes saved both in pt_regs->r[4-7] and the memory stack,
1532 * and sometimes only in the memory stack.
1533 * In both cases, both the memory stack and pt_regs->r[4-7] are updated.
1534 */
1535 uregs->r4 = c.nat->regs.r[4];
1536 uregs->r5 = c.nat->regs.r[5];
1537 uregs->r6 = c.nat->regs.r[6];
1538 uregs->r7 = c.nat->regs.r[7];
1540 UNAT_UPDATE(4);
1541 UNAT_UPDATE(5);
1542 UNAT_UPDATE(6);
1543 UNAT_UPDATE(7);
1544 #undef UNAT_UPDATE
1545 if (vcpu_has_not_run(v)) {
1546 sw->r4 = c.nat->regs.r[4];
1547 sw->r5 = c.nat->regs.r[5];
1548 sw->r6 = c.nat->regs.r[6];
1549 sw->r7 = c.nat->regs.r[7];
1551 unat_update(&sw->ar_unat, &sw->r4,
1552 !!(c.nat->regs.nats & (1UL << 4)));
1553 unat_update(&sw->ar_unat, &sw->r5,
1554 !!(c.nat->regs.nats & (1UL << 5)));
1555 unat_update(&sw->ar_unat, &sw->r6,
1556 !!(c.nat->regs.nats & (1UL << 6)));
1557 unat_update(&sw->ar_unat, &sw->r7,
1558 !!(c.nat->regs.nats & (1UL << 7)));
1559 } else {
1560 unw_set_gr(&info, 4, c.nat->regs.r[4],
1561 !!(c.nat->regs.nats & (1UL << 4)));
1562 unw_set_gr(&info, 5, c.nat->regs.r[5],
1563 !!(c.nat->regs.nats & (1UL << 5)));
1564 unw_set_gr(&info, 6, c.nat->regs.r[6],
1565 !!(c.nat->regs.nats & (1UL << 6)));
1566 unw_set_gr(&info, 7, c.nat->regs.r[7],
1567 !!(c.nat->regs.nats & (1UL << 7)));
1570 if (!is_hvm_domain(d)) {
1571 /* domain runs at PL2/3 */
1572 uregs->cr_ipsr = vcpu_pl_adjust(uregs->cr_ipsr,
1573 IA64_PSR_CPL0_BIT);
1574 uregs->ar_rsc = vcpu_pl_adjust(uregs->ar_rsc, 2);
1577 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
1578 if (is_hvm_domain(d)) {
1579 vmx_vcpu_set_dbr(v, i, c.nat->regs.dbr[i]);
1580 vmx_vcpu_set_ibr(v, i, c.nat->regs.ibr[i]);
1581 } else {
1582 vcpu_set_dbr(v, i, c.nat->regs.dbr[i]);
1583 vcpu_set_ibr(v, i, c.nat->regs.ibr[i]);
1587 /* rr[] must be set before setting itrs[] dtrs[] */
1588 for (i = 0; i < 8; i++) {
1589 unsigned long rrval = c.nat->regs.rr[i];
1590 unsigned long reg = (unsigned long)i << 61;
1591 IA64FAULT fault = IA64_NO_FAULT;
1593 if (rrval == 0)
1594 continue;
1595 if (is_hvm_domain(d)) {
1596 //without VGCF_EXTRA_REGS check,
1597 //VTi domain doesn't boot.
1598 if (c.nat->flags & VGCF_EXTRA_REGS)
1599 fault = vmx_vcpu_set_rr(v, reg, rrval);
1600 } else
1601 fault = vcpu_set_rr(v, reg, rrval);
1602 if (fault != IA64_NO_FAULT)
1603 return -EINVAL;
1606 if (c.nat->flags & VGCF_EXTRA_REGS) {
1607 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
1609 for (i = 0;
1610 (i < sizeof(tr->itrs) / sizeof(tr->itrs[0])) && i < NITRS;
1611 i++) {
1612 if (is_hvm_domain(d))
1613 vmx_vcpu_itr_i(v, i, tr->itrs[i].pte,
1614 tr->itrs[i].itir,
1615 tr->itrs[i].vadr);
1616 else
1617 vcpu_set_itr(v, i, tr->itrs[i].pte,
1618 tr->itrs[i].itir,
1619 tr->itrs[i].vadr,
1620 tr->itrs[i].rid);
1622 for (i = 0;
1623 (i < sizeof(tr->dtrs) / sizeof(tr->dtrs[0])) && i < NDTRS;
1624 i++) {
1625 if (is_hvm_domain(d))
1626 vmx_vcpu_itr_d(v, i, tr->dtrs[i].pte,
1627 tr->dtrs[i].itir,
1628 tr->dtrs[i].vadr);
1629 else
1630 vcpu_set_dtr(v, i,
1631 tr->dtrs[i].pte,
1632 tr->dtrs[i].itir,
1633 tr->dtrs[i].vadr,
1634 tr->dtrs[i].rid);
1636 v->arch.event_callback_ip = c.nat->event_callback_ip;
1637 vcpu_set_iva(v, c.nat->regs.cr.iva);
1640 if (is_hvm_domain(d))
1641 rc = vmx_arch_set_info_guest(v, c);
1643 return rc;
1646 static int relinquish_memory(struct domain *d, struct page_list_head *list)
1648 struct page_info *page;
1649 #ifndef __ia64__
1650 unsigned long x, y;
1651 #endif
1652 int ret = 0;
1654 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1655 spin_lock_recursive(&d->page_alloc_lock);
1657 while ( (page = page_list_remove_head(list)) )
1659 /* Grab a reference to the page so it won't disappear from under us. */
1660 if ( unlikely(!get_page(page, d)) )
1662 /* Couldn't get a reference -- someone is freeing this page. */
1663 page_list_add_tail(page, &d->arch.relmem_list);
1664 continue;
1667 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1668 put_page_and_type(page);
1670 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1671 put_page(page);
1673 #ifndef __ia64__
1674 /*
1675 * Forcibly invalidate base page tables at this point to break circular
1676 * 'linear page table' references. This is okay because MMU structures
1677 * are not shared across domains and this domain is now dead. Thus base
1678 * tables are not in use so a non-zero count means circular reference.
1679 */
1680 y = page->u.inuse.type_info;
1681 for ( ; ; )
1683 x = y;
1684 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
1685 (PGT_base_page_table|PGT_validated)) )
1686 break;
1688 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
1689 if ( likely(y == x) )
1691 free_page_type(page, PGT_base_page_table);
1692 break;
1695 #endif
1697 /* Follow the list chain and /then/ potentially free the page. */
1698 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
1699 page_list_add_tail(page, &d->arch.relmem_list);
1700 put_page(page);
1702 if (hypercall_preempt_check()) {
1703 ret = -EAGAIN;
1704 goto out;
1708 page_list_splice_init(&d->arch.relmem_list, list);
1710 out:
1711 spin_unlock_recursive(&d->page_alloc_lock);
1712 return ret;
1715 int domain_relinquish_resources(struct domain *d)
1717 int ret = 0;
1719 switch (d->arch.relres) {
1720 case RELRES_not_started:
1721 /* Relinquish guest resources for VT-i domain. */
1722 if (is_hvm_domain(d))
1723 vmx_relinquish_guest_resources(d);
1724 d->arch.relres = RELRES_mm_teardown;
1725 /*fallthrough*/
1727 case RELRES_mm_teardown:
1728 if (d->arch.pirq_eoi_map != NULL) {
1729 put_page(virt_to_page(d->arch.pirq_eoi_map));
1730 d->arch.pirq_eoi_map = NULL;
1733 /* Tear down shadow mode stuff. */
1734 ret = mm_teardown(d);
1735 if (ret != 0)
1736 return ret;
1737 d->arch.relres = RELRES_xen;
1738 /* fallthrough */
1740 case RELRES_xen:
1741 /* Relinquish every xen page of memory. */
1742 ret = relinquish_memory(d, &d->xenpage_list);
1743 if (ret != 0)
1744 return ret;
1745 d->arch.relres = RELRES_dom;
1746 /* fallthrough */
1748 case RELRES_dom:
1749 /* Relinquish every domain page of memory. */
1750 ret = relinquish_memory(d, &d->page_list);
1751 if (ret != 0)
1752 return ret;
1753 d->arch.relres = RELRES_done;
1754 /* fallthrough */
1756 case RELRES_done:
1757 break;
1759 default:
1760 BUG();
1763 if (is_hvm_domain(d) && d->arch.sal_data)
1764 xfree(d->arch.sal_data);
1766 return 0;
1769 unsigned long
1770 domain_set_shared_info_va (unsigned long va)
1772 struct vcpu *v = current;
1773 struct domain *d = v->domain;
1774 int rc;
1776 /* Check virtual address:
1777 must belong to region 7,
1778 must be 64Kb aligned,
1779 must not be within Xen virtual space. */
1780 if ((va >> 61) != 7
1781 || (va & 0xffffUL) != 0
1782 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
1783 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
1785 /* Note: this doesn't work well if other cpus are already running.
1786 However this is part of the spec :-) */
1787 gdprintk(XENLOG_DEBUG, "Domain set shared_info_va to 0x%016lx\n", va);
1788 d->arch.shared_info_va = va;
1790 VCPU(v, interrupt_mask_addr) = (unsigned char *)va +
1791 INT_ENABLE_OFFSET(v);
1792 set_current_psr_i_addr(v);
1794 /* Remap the shared pages. */
1795 BUG_ON(VMX_DOMAIN(v));
1796 rc = !set_one_rr(7UL << 61, PSCB(v,rrs[7]));
1797 BUG_ON(rc);
1799 return rc;
1802 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
1803 #define SHADOW_COPY_CHUNK 1024
1805 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
1807 unsigned int op = sc->op;
1808 int rc = 0;
1809 int i;
1810 //struct vcpu *v;
1812 if (unlikely(d == current->domain)) {
1813 gdprintk(XENLOG_INFO,
1814 "Don't try to do a shadow op on yourself!\n");
1815 return -EINVAL;
1818 domain_pause(d);
1820 switch (op)
1822 case XEN_DOMCTL_SHADOW_OP_OFF:
1823 if (shadow_mode_enabled (d)) {
1824 u64 *bm = d->arch.shadow_bitmap;
1825 struct vcpu *v;
1827 for_each_vcpu(d, v)
1828 v->arch.shadow_bitmap = NULL;
1830 /* Flush vhpt and tlb to restore dirty bit usage. */
1831 flush_tlb_for_log_dirty(d);
1833 /* Free bitmap. */
1834 d->arch.shadow_bitmap_size = 0;
1835 d->arch.shadow_bitmap = NULL;
1836 xfree(bm);
1838 break;
1840 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
1841 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
1842 rc = -EINVAL;
1843 break;
1845 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
1846 if (shadow_mode_enabled(d)) {
1847 rc = -EINVAL;
1848 break;
1851 atomic64_set(&d->arch.shadow_fault_count, 0);
1852 atomic64_set(&d->arch.shadow_dirty_count, 0);
1854 d->arch.shadow_bitmap_size =
1855 (domain_get_maximum_gpfn(d) + BITS_PER_LONG) &
1856 ~(BITS_PER_LONG - 1);
1857 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
1858 d->arch.shadow_bitmap_size / BITS_PER_LONG);
1859 if (d->arch.shadow_bitmap == NULL) {
1860 d->arch.shadow_bitmap_size = 0;
1861 rc = -ENOMEM;
1863 else {
1864 struct vcpu *v;
1865 memset(d->arch.shadow_bitmap, 0,
1866 d->arch.shadow_bitmap_size / 8);
1868 for_each_vcpu(d, v)
1869 v->arch.shadow_bitmap = d->arch.shadow_bitmap;
1870 /* Flush vhpt and tlb to enable dirty bit
1871 virtualization. */
1872 flush_tlb_for_log_dirty(d);
1874 break;
1876 case XEN_DOMCTL_SHADOW_OP_CLEAN:
1878 int nbr_bytes;
1880 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1881 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1883 atomic64_set(&d->arch.shadow_fault_count, 0);
1884 atomic64_set(&d->arch.shadow_dirty_count, 0);
1886 if (guest_handle_is_null(sc->dirty_bitmap) ||
1887 (d->arch.shadow_bitmap == NULL)) {
1888 rc = -EINVAL;
1889 break;
1892 if (sc->pages > d->arch.shadow_bitmap_size)
1893 sc->pages = d->arch.shadow_bitmap_size;
1895 nbr_bytes = (sc->pages + 7) / 8;
1897 for (i = 0; i < nbr_bytes; i += SHADOW_COPY_CHUNK) {
1898 int size = (nbr_bytes - i) > SHADOW_COPY_CHUNK ?
1899 SHADOW_COPY_CHUNK : nbr_bytes - i;
1901 if (copy_to_guest_offset(
1902 sc->dirty_bitmap, i,
1903 (uint8_t *)d->arch.shadow_bitmap + i,
1904 size)) {
1905 rc = -EFAULT;
1906 break;
1909 memset((uint8_t *)d->arch.shadow_bitmap + i, 0, size);
1911 flush_tlb_for_log_dirty(d);
1913 break;
1916 case XEN_DOMCTL_SHADOW_OP_PEEK:
1918 unsigned long size;
1920 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1921 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1923 if (guest_handle_is_null(sc->dirty_bitmap) ||
1924 (d->arch.shadow_bitmap == NULL)) {
1925 rc = -EINVAL;
1926 break;
1929 if (sc->pages > d->arch.shadow_bitmap_size)
1930 sc->pages = d->arch.shadow_bitmap_size;
1932 size = (sc->pages + 7) / 8;
1933 if (copy_to_guest(sc->dirty_bitmap,
1934 (uint8_t *)d->arch.shadow_bitmap, size)) {
1935 rc = -EFAULT;
1936 break;
1938 break;
1940 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
1941 sc->mb = 0;
1942 break;
1943 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
1944 if (sc->mb > 0) {
1945 BUG();
1946 rc = -ENOMEM;
1948 break;
1949 default:
1950 rc = -EINVAL;
1951 break;
1954 domain_unpause(d);
1956 return rc;
1959 // uncomment the following line if privifying in memory
1960 //#define HAVE_PRIVIFY_MEMORY
1961 #ifndef HAVE_PRIVIFY_MEMORY
1962 #define privify_memory(x,y) do {} while(0)
1963 #endif
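/*
 * loaddomainelfimage: copy each loadable ELF segment of the dom0 kernel
 * into freshly assigned domain pages, one page at a time, zero-filling
 * anything beyond p_filesz (the BSS) and flushing the icache for
 * executable pages.
 */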
1965 static void __init loaddomainelfimage(struct domain *d, struct elf_binary *elf,
1966 unsigned long phys_load_offset)
1968 const elf_phdr *phdr;
1969 int phnum, h, filesz, memsz;
1970 unsigned long elfaddr, dom_mpaddr, dom_imva;
1971 struct page_info *p;
1973 phnum = elf_uval(elf, elf->ehdr, e_phnum);
1974 for (h = 0; h < phnum; h++) {
1975 phdr = elf_phdr_by_index(elf, h);
1976 if (!elf_phdr_is_loadable(elf, phdr))
1977 continue;
1979 filesz = elf_uval(elf, phdr, p_filesz);
1980 memsz = elf_uval(elf, phdr, p_memsz);
1981 elfaddr = (unsigned long) elf->image + elf_uval(elf, phdr, p_offset);
1982 dom_mpaddr = elf_uval(elf, phdr, p_paddr);
1983 dom_mpaddr += phys_load_offset;
1985 while (memsz > 0) {
1986 p = assign_new_domain_page(d,dom_mpaddr);
1987 BUG_ON (unlikely(p == NULL));
1988 dom_imva = __va_ul(page_to_maddr(p));
1989 if (filesz > 0) {
1990 if (filesz >= PAGE_SIZE)
1991 copy_page((void *) dom_imva,
1992 (void *) elfaddr);
1993 else {
1994 // copy partial page
1995 memcpy((void *) dom_imva,
1996 (void *) elfaddr, filesz);
1997 // zero the rest of page
1998 memset((void *) dom_imva+filesz, 0,
1999 PAGE_SIZE-filesz);
2001 //FIXME: This test for code seems to find a lot more than objdump -x does
2002 if (elf_uval(elf, phdr, p_flags) & PF_X) {
2003 privify_memory(dom_imva,PAGE_SIZE);
2004 flush_icache_range(dom_imva,
2005 dom_imva+PAGE_SIZE);
2008 else if (memsz > 0) {
2009 /* always zero out entire page */
2010 clear_page((void *) dom_imva);
2012 memsz -= PAGE_SIZE;
2013 filesz -= PAGE_SIZE;
2014 elfaddr += PAGE_SIZE;
2015 dom_mpaddr += PAGE_SIZE;
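/*
 * calc_dom0_size estimates the largest safe dom0 allocation by taking the
 * available domheap pages and subtracting the p2m table, a spare chunk for
 * hypervisor/DMA/PCI use and, if the IOMMU is enabled, the VT-d page
 * tables.  A rough worked example (ignoring the IOMMU term, and assuming
 * the usual ia64 16KB page size and PTRS_PER_PTE == 2048) with 8GB of
 * domheap, i.e. 524288 pages:
 *   p2m_pages      = 524288 / 2048        = 256 pages   (~4MB)
 *   spare_hv_pages = 8192 + 524288 / 4096 = 8320 pages  (~130MB,
 *                    i.e. 128MB + 1MB per 4GB)
 *   max_dom0_size ~= (524288 - 256 - 8320) * 16KB ~= 7.9GB
 */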
2020 static void __init calc_dom0_size(void)
2022 unsigned long domheap_pages;
2023 unsigned long p2m_pages;
2024 unsigned long spare_hv_pages;
2025 unsigned long max_dom0_size;
2026 unsigned long iommu_pg_table_pages = 0;
2028 /* Estimate maximum memory we can safely allocate for dom0
2029 * by subtracting the p2m table allocation and a chunk of memory
2030 * for DMA and PCI mapping from the available domheap pages. The
2031 * chunk for DMA, PCI, etc., is a guesstimate, as Xen doesn't know
2032 * those requirements ahead of time; it is calculated as
2033 * 128MB + 1MB per 4GB of system memory. */
2034 domheap_pages = avail_domheap_pages();
2035 p2m_pages = domheap_pages / PTRS_PER_PTE;
2036 spare_hv_pages = 8192 + (domheap_pages / 4096);
2038 if (iommu_enabled)
2039 iommu_pg_table_pages = domheap_pages * 4 / 512;
2040 /* There are 512 ptes in one 4K vtd page. */
2042 max_dom0_size = (domheap_pages - (p2m_pages + spare_hv_pages) -
2043 iommu_pg_table_pages) * PAGE_SIZE;
2044 printk("Maximum permitted dom0 size: %luMB\n",
2045 max_dom0_size / (1024*1024));
2047 /* validate proposed dom0_size, fix up as needed */
2048 if (dom0_size > max_dom0_size) {
2049 printk("Reducing dom0 memory allocation from %luK to %luK "
2050 "to fit available memory\n",
2051 dom0_size / 1024, max_dom0_size / 1024);
2052 dom0_size = max_dom0_size;
2055 /* dom0_mem=0 can be passed in to give all available mem to dom0 */
2056 if (dom0_size == 0) {
2057 printk("Allocating all available memory to dom0\n");
2058 dom0_size = max_dom0_size;
2061 /* Check dom0 size. */
2062 if (dom0_size < 4 * 1024 * 1024) {
2063 panic("dom0_mem is too small, boot aborted"
2064 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
2067 if (running_on_sim) {
2068 dom0_size = 128*1024*1024; //FIXME: Should be configurable
2071 /* No need to allocate pages for now;
2072 * pages are allocated by assign_new_domain_page() via loaddomainelfimage().
2073 */
2077 /*
2078 * Domain 0 has direct access to all devices. The major point of
2079 * this stub, however, is to allow alloc_dom_mem to handle
2080 * requests with order > 0; dom0 needs that capability to
2081 * allocate memory for other domains.
2082 */
2083 static void __init physdev_init_dom0(struct domain *d)
2085 if (iomem_permit_access(d, 0UL, ~0UL))
2086 BUG();
2087 if (irqs_permit_access(d, 0, NR_IRQS-1))
2088 BUG();
2089 if (ioports_permit_access(d, 0, 0, 0xffff))
2090 BUG();
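/*
 * construct_dom0: build the initial domain.  The steps below are: size
 * dom0 (calc_dom0_size), parse and relocate the kernel ELF image, copy in
 * the initrd, allocate the remaining vcpus, load the kernel image, set up
 * the start_info page, boot parameters and console info, build the guest
 * firmware tables, initialise vcpu0's registers with the entry point and
 * boot-param address, and grant dom0 access to the physical I/O resources.
 */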
2093 int __init construct_dom0(struct domain *d,
2094 unsigned long image_start, unsigned long image_len,
2095 unsigned long initrd_start, unsigned long initrd_len,
2096 char *cmdline)
2098 int i, rc;
2099 start_info_t *si;
2100 dom0_vga_console_info_t *ci;
2101 struct vcpu *v = d->vcpu[0];
2102 unsigned long max_pages;
2104 struct elf_binary elf;
2105 struct elf_dom_parms parms;
2106 unsigned long p_start;
2107 unsigned long pkern_start;
2108 unsigned long pkern_entry;
2109 unsigned long pkern_end;
2110 unsigned long pinitrd_start = 0;
2111 unsigned long pstart_info;
2112 unsigned long phys_load_offset;
2113 struct page_info *start_info_page;
2114 unsigned long bp_mpa;
2115 struct ia64_boot_param *bp;
2117 //printk("construct_dom0: starting\n");
2119 /* Sanity! */
2120 BUG_ON(d != dom0);
2121 BUG_ON(d->vcpu[0] == NULL);
2122 BUG_ON(v->is_initialised);
2124 printk("*** LOADING DOMAIN 0 ***\n");
2126 calc_dom0_size();
2128 max_pages = dom0_size / PAGE_SIZE;
2129 d->max_pages = max_pages;
2130 d->tot_pages = 0;
2132 rc = elf_init(&elf, (void*)image_start, image_len);
2133 if ( rc != 0 )
2134 return rc;
2135 #ifdef VERBOSE
2136 elf_set_verbose(&elf);
2137 #endif
2138 elf_parse_binary(&elf);
2139 if (0 != (elf_xen_parse(&elf, &parms)))
2140 return rc;
2142 /*
2143 * We cannot rely on the load address in the ELF headers to
2144 * determine the meta physical address at which the image
2145 * is loaded. Patch the address to match the real one, based
2146 * on xen_pstart
2147 */
2148 phys_load_offset = xen_pstart - elf.pstart;
2149 elf.pstart += phys_load_offset;
2150 elf.pend += phys_load_offset;
2151 parms.virt_kstart += phys_load_offset;
2152 parms.virt_kend += phys_load_offset;
2153 parms.virt_entry += phys_load_offset;
2155 printk(" Dom0 kernel: %s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
2156 elf_64bit(&elf) ? "64-bit" : "32-bit",
2157 elf_msb(&elf) ? "msb" : "lsb",
2158 elf.pstart, elf.pend);
2159 if (!elf_64bit(&elf) ||
2160 elf_uval(&elf, elf.ehdr, e_machine) != EM_IA_64) {
2161 printk("Incompatible kernel binary\n");
2162 return -1;
2165 p_start = parms.virt_base;
2166 pkern_start = parms.virt_kstart;
2167 pkern_end = parms.virt_kend;
2168 pkern_entry = parms.virt_entry;
2170 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
2172 if ( (p_start & (PAGE_SIZE-1)) != 0 )
2174 printk("Initial guest OS must load to a page boundary.\n");
2175 return -EINVAL;
2178 pstart_info = PAGE_ALIGN(pkern_end);
2179 if(initrd_start && initrd_len){
2180 unsigned long offset;
2182 /* The next page-aligned boundary after the start info.
2183 Note: EFI_PAGE_SHIFT = 12 <= PAGE_SHIFT */
2184 pinitrd_start = pstart_info + PAGE_SIZE;
2186 if ((pinitrd_start + initrd_len - phys_load_offset) >= dom0_size)
2187 panic("%s: not enough memory assigned to dom0", __func__);
2189 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
2190 struct page_info *p;
2191 p = assign_new_domain_page(d, pinitrd_start + offset);
2192 if (p == NULL)
2193 panic("%s: can't allocate page for initrd image", __func__);
2194 if (initrd_len < offset + PAGE_SIZE)
2195 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
2196 initrd_len - offset);
2197 else
2198 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
2202 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
2203 " Kernel image: %lx->%lx\n"
2204 " Entry address: %lx\n"
2205 " Init. ramdisk: %lx len %lx\n"
2206 " Start info.: %lx->%lx\n",
2207 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
2208 pstart_info, pstart_info + PAGE_SIZE);
2210 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
2212 printk("Initial guest OS requires too much space\n"
2213 "(%luMB is greater than %luMB limit)\n",
2214 (pkern_end-pkern_start)>>20,
2215 (max_pages <<PAGE_SHIFT)>>20);
2216 return -ENOMEM;
2219 // if high 3 bits of pkern start are non-zero, error
2221 // if pkern end is after end of metaphysical memory, error
2222 // (we should be able to deal with this... later)
2224 /* Mask all upcalls... */
2225 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
2226 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
2228 printk ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
2229 for ( i = 1; i < dom0_max_vcpus; i++ )
2230 if (alloc_vcpu(d, i, i) == NULL)
2231 panic("Cannot allocate dom0 vcpu %d\n", i);
2233 /* Copy the OS image. */
2234 loaddomainelfimage(d, &elf, phys_load_offset);
2236 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
2237 sizeof(struct ia64_boot_param) > PAGE_SIZE);
2239 /* Set up start info area. */
2240 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
2241 start_info_page = assign_new_domain_page(d, pstart_info);
2242 if (start_info_page == NULL)
2243 panic("can't allocate start info page");
2244 si = page_to_virt(start_info_page);
2245 clear_page(si);
2246 snprintf(si->magic, sizeof(si->magic), "xen-3.0-ia64");
2247 si->nr_pages = max_pages;
2248 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
2249 si->flags |= (xen_processor_pmbits << 8) & SIF_PM_MASK;
2251 printk("Dom0: 0x%lx\n", (u64)dom0);
2253 v->is_initialised = 1;
2254 clear_bit(_VPF_down, &v->pause_flags);
2256 /* Build firmware.
2257 Note: the Linux kernel reserves the memory used by start_info, so there
2258 is no need to remove it from the MDT. */
2259 bp_mpa = pstart_info + sizeof(struct start_info);
2260 rc = dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
2261 if (rc != 0)
2262 return rc;
2264 /* Fill boot param. */
2265 strlcpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
2267 bp = (struct ia64_boot_param *)((unsigned char *)si +
2268 sizeof(start_info_t));
2269 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
2271 /* We assume console has reached the last line! */
2272 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
2273 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
2274 bp->console_info.orig_x = 0;
2275 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
2276 0 : bp->console_info.num_rows - 1;
2278 bp->initrd_start = pinitrd_start;
2279 bp->initrd_size = ia64_boot_param->initrd_size;
2281 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
2282 sizeof(start_info_t) +
2283 sizeof(struct ia64_boot_param));
2285 if (fill_console_start_info(ci)) {
2286 si->console.dom0.info_off = sizeof(start_info_t) +
2287 sizeof(struct ia64_boot_param);
2288 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
2291 vcpu_init_regs (v);
2293 vcpu_regs(v)->r28 = bp_mpa;
2295 vcpu_regs (v)->cr_iip = pkern_entry;
2297 physdev_init_dom0(d);
2299 return 0;
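/*
 * alloc_dom0_vcpu0: clamp dom0_max_vcpus to the number of online CPUs and
 * to MAX_VIRT_CPUS, allocate dom0's vcpu pointer array, then create vcpu 0
 * itself.
 */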
2302 struct vcpu *__init alloc_dom0_vcpu0(void)
2304 if (dom0_max_vcpus == 0)
2305 dom0_max_vcpus = MAX_VIRT_CPUS;
2306 if (dom0_max_vcpus > num_online_cpus())
2307 dom0_max_vcpus = num_online_cpus();
2308 if (dom0_max_vcpus > MAX_VIRT_CPUS)
2309 dom0_max_vcpus = MAX_VIRT_CPUS;
2311 dom0->vcpu = xmalloc_array(struct vcpu *, dom0_max_vcpus);
2312 if ( !dom0->vcpu )
2313 return NULL;
2314 memset(dom0->vcpu, 0, dom0_max_vcpus * sizeof(*dom0->vcpu));
2315 dom0->max_vcpus = dom0_max_vcpus;
2317 return alloc_vcpu(dom0, 0, 0);
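/*
 * machine_restart syncs the console and either spins forever (on the
 * simulator) or issues an EFI warm reset; machine_halt syncs the console,
 * stops the other CPUs and spins.
 */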
2320 void machine_restart(unsigned int delay_millisecs)
2322 mdelay(delay_millisecs);
2323 console_start_sync();
2324 if (running_on_sim)
2325 printk ("machine_restart called. spinning...\n");
2326 else
2327 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
2328 while(1);
2331 extern void cpu_halt(void);
2333 void machine_halt(void)
2335 console_start_sync();
2337 #ifdef CONFIG_SMP
2338 smp_send_stop();
2339 #endif
2341 printk ("machine_halt called. spinning...\n");
2342 while(1);
2345 void sync_vcpu_execstate(struct vcpu *v)
2347 // __ia64_save_fpu(v->arch._thread.fph);
2348 // FIXME SMP: Anything else needed here for SMP?
2351 /* This function is taken from xen/arch/x86/domain.c */
2352 long
2353 arch_do_vcpu_op(int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
2355 long rc = 0;
2357 switch (cmd) {
2358 case VCPUOP_register_runstate_memory_area:
2360 struct vcpu_register_runstate_memory_area area;
2361 struct vcpu_runstate_info runstate;
2363 rc = -EFAULT;
2364 if (copy_from_guest(&area, arg, 1))
2365 break;
2367 if (!guest_handle_okay(area.addr.h, 1))
2368 break;
2370 rc = 0;
2371 runstate_guest(v) = area.addr.h;
2373 if (v == current) {
2374 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
2375 } else {
2376 vcpu_runstate_get(v, &runstate);
2377 __copy_to_guest(runstate_guest(v), &runstate, 1);
2380 break;
2382 default:
2383 rc = -ENOSYS;
2384 break;
2387 return rc;
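/*
 * Guest-side usage of the VCPUOP_register_runstate_memory_area handler
 * above goes through the vcpu_op hypercall; a rough sketch (guest code,
 * not part of this file) might look like:
 *
 *   struct vcpu_register_runstate_memory_area area;
 *   set_xen_guest_handle(area.addr.h, &my_runstate);  // my_runstate is a
 *                                                     // hypothetical guest
 *                                                     // per-cpu variable
 *   HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
 *
 * Once registered, Xen keeps the area up to date (and, as above, copies
 * the current runstate into it immediately).
 */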
2390 static void __init parse_dom0_mem(char *s)
2392 dom0_size = parse_size_and_unit(s, NULL);
2394 custom_param("dom0_mem", parse_dom0_mem);
2396 /*
2397 * Helper function for the optimization stuff handling the identity mapping
2398 * feature.
2399 */
2400 static inline unsigned long
2401 optf_identity_mapping_cmd_to_flg(unsigned long cmd)
2403 switch(cmd) {
2404 case XEN_IA64_OPTF_IDENT_MAP_REG7:
2405 return XEN_IA64_OPTF_IDENT_MAP_REG7_FLG;
2406 case XEN_IA64_OPTF_IDENT_MAP_REG4:
2407 return XEN_IA64_OPTF_IDENT_MAP_REG4_FLG;
2408 case XEN_IA64_OPTF_IDENT_MAP_REG5:
2409 return XEN_IA64_OPTF_IDENT_MAP_REG5_FLG;
2410 default:
2411 BUG();
2412 return 0;
2415 /* NOTREACHED */
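/*
 * optf_set_identity_mapping: set or clear the flag for one identity-mapping
 * feature in the domain's opt_feature mask, recording (or resetting) the
 * page protection and protection key to use for that region.
 */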
2418 static inline void
2419 optf_set_identity_mapping(unsigned long* mask, struct identity_mapping* im,
2420 struct xen_ia64_opt_feature* f)
2422 unsigned long flag = optf_identity_mapping_cmd_to_flg(f->cmd);
2424 if (f->on) {
2425 *mask |= flag;
2426 im->pgprot = f->pgprot;
2427 im->key = f->key;
2428 } else {
2429 *mask &= ~flag;
2430 im->pgprot = 0;
2431 im->key = 0;
2435 /*
2436 * Switch an optimization feature on/off.
2437 * The vcpu must be paused to avoid racy access to opt_feature.
2438 */
2439 int
2440 domain_opt_feature(struct domain *d, struct xen_ia64_opt_feature* f)
2442 struct opt_feature* optf = &d->arch.opt_feature;
2443 struct vcpu *v;
2444 long rc = 0;
2446 for_each_vcpu(d, v) {
2447 if (v != current)
2448 vcpu_pause(v);
2451 switch (f->cmd) {
2452 case XEN_IA64_OPTF_IDENT_MAP_REG4:
2453 optf_set_identity_mapping(&optf->mask, &optf->im_reg4, f);
2454 break;
2455 case XEN_IA64_OPTF_IDENT_MAP_REG5:
2456 optf_set_identity_mapping(&optf->mask, &optf->im_reg5, f);
2457 break;
2458 case XEN_IA64_OPTF_IDENT_MAP_REG7:
2459 optf_set_identity_mapping(&optf->mask, &optf->im_reg7, f);
2460 break;
2461 default:
2462 printk("%s: unknown opt_feature: %ld\n", __func__, f->cmd);
2463 rc = -ENOSYS;
2464 break;
2467 for_each_vcpu(d, v) {
2468 if (v != current)
2469 vcpu_unpause(v);
2472 return rc;