debuggers.hg: view xen/arch/ia64/xen/domain.c @ 13702:d2784d93e760

ia64 and ppc: Remove uses of strcpy and strncpy.
Signed-off-by: Christoph Egger <Christoph.Egger@amd.com>

author   kfraser@localhost.localdomain
date     Mon Jan 29 15:01:33 2007 +0000 (2007-01-29)
parents  3c9926aadec5
children bd69e83b65ea
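
This changeset replaces strcpy()/strncpy() with length-bounded, always-terminated copies. A minimal sketch of the pattern, using the si->cmd_line / dom0_command_line copy visible in construct_dom0() below (the "before" line is illustrative, not the exact prior code):

    /* Before: strncpy() does not NUL-terminate if the source fills the buffer. */
    strncpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
    /* After: strlcpy() truncates as needed and always NUL-terminates. */
    strlcpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
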
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <public/libelf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <asm/tlbflush.h>
46 #include <asm/regionreg.h>
47 #include <asm/dom_fw.h>
48 #include <asm/shadow.h>
49 #include <xen/guest_access.h>
50 #include <asm/tlb_track.h>
51 #include <asm/perfmon.h>
53 unsigned long dom0_size = 512*1024*1024;
55 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
56 static unsigned int dom0_max_vcpus = 1;
57 integer_param("dom0_max_vcpus", dom0_max_vcpus);
59 extern unsigned long running_on_sim;
61 extern char dom0_command_line[];
63 /* forward declaration */
64 static void init_switch_stack(struct vcpu *v);
66 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
67 This is a Xen virtual address. */
68 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
69 DEFINE_PER_CPU(int *, current_psr_ic_addr);
71 #include <xen/sched-if.h>
73 static void
74 ia64_disable_vhpt_walker(void)
75 {
76 // Disable the VHPT walker. ia64_new_rr7() might cause a VHPT
77 // fault without this because it flushes dtr[IA64_TR_VHPT].
78 // (VHPT_SIZE_LOG2 << 2) is just to avoid a
79 // Reserved Register/Field fault.
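// Note: pta.ve (bit 0) is the VHPT-walker enable bit and bits 7:2 hold the
// VHPT size, so writing only the size field leaves the walker disabled
// while keeping the register value architecturally valid.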
80 ia64_set_pta(VHPT_SIZE_LOG2 << 2);
81 }
83 static void flush_vtlb_for_context_switch(struct vcpu* prev, struct vcpu* next)
84 {
85 int cpu = smp_processor_id();
86 int last_vcpu_id, last_processor;
88 if (!is_idle_domain(prev->domain))
89 tlbflush_update_time
90 (&prev->domain->arch.last_vcpu[cpu].tlbflush_timestamp,
91 tlbflush_current_time());
93 if (is_idle_domain(next->domain))
94 return;
96 last_vcpu_id = next->domain->arch.last_vcpu[cpu].vcpu_id;
97 last_processor = next->arch.last_processor;
99 next->domain->arch.last_vcpu[cpu].vcpu_id = next->vcpu_id;
100 next->arch.last_processor = cpu;
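// A flush may be needed if this physical cpu last ran a different vcpu of
// the incoming domain, or if it last ran this same vcpu but the vcpu has
// since run on another physical cpu; stale vTLB/VHPT entries could
// otherwise be reused.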
102 if ((last_vcpu_id != next->vcpu_id &&
103 last_vcpu_id != INVALID_VCPU_ID) ||
104 (last_vcpu_id == next->vcpu_id &&
105 last_processor != cpu &&
106 last_processor != INVALID_PROCESSOR)) {
107 #ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
108 u32 last_tlbflush_timestamp =
109 next->domain->arch.last_vcpu[cpu].tlbflush_timestamp;
110 #endif
111 int vhpt_is_flushed = 0;
113 // If the vTLB implementation is changed,
114 // the following must be updated as well.
115 if (VMX_DOMAIN(next)) {
116 // Currently the vTLB for a VT-i domain is per vcpu,
117 // so no flushing is needed.
118 } else if (HAS_PERVCPU_VHPT(next->domain)) {
119 // nothing to do
120 } else {
121 if (NEED_FLUSH(__get_cpu_var(vhpt_tlbflush_timestamp),
122 last_tlbflush_timestamp)) {
123 local_vhpt_flush();
124 vhpt_is_flushed = 1;
125 }
126 }
127 if (vhpt_is_flushed || NEED_FLUSH(__get_cpu_var(tlbflush_time),
128 last_tlbflush_timestamp)) {
129 local_flush_tlb_all();
130 perfc_incrc(tlbflush_clock_cswitch_purge);
131 } else {
132 perfc_incrc(tlbflush_clock_cswitch_skip);
133 }
134 perfc_incrc(flush_vtlb_for_context_switch);
135 }
136 }
138 void schedule_tail(struct vcpu *prev)
139 {
140 extern char ia64_ivt;
141 context_saved(prev);
143 ia64_disable_vhpt_walker();
144 if (VMX_DOMAIN(current)) {
145 vmx_do_launch(current);
146 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
147 current->processor);
148 } else {
149 ia64_set_iva(&ia64_ivt);
150 load_region_regs(current);
151 ia64_set_pta(vcpu_pta(current));
152 vcpu_load_kernel_regs(current);
153 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
154 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
155 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
156 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
157 migrate_timer(&current->arch.hlt_timer, current->processor);
158 }
159 flush_vtlb_for_context_switch(prev, current);
160 }
162 void context_switch(struct vcpu *prev, struct vcpu *next)
163 {
164 uint64_t spsr;
166 local_irq_save(spsr);
168 if (!is_idle_domain(prev->domain))
169 __ia64_save_fpu(prev->arch._thread.fph);
170 if (!is_idle_domain(next->domain))
171 __ia64_load_fpu(next->arch._thread.fph);
173 if (VMX_DOMAIN(prev)) {
174 vmx_save_state(prev);
175 if (!VMX_DOMAIN(next)) {
176 /* VMX domains can change the physical cr.dcr.
177 * Restore default to prevent leakage. */
178 ia64_setreg(_IA64_REG_CR_DCR, (IA64_DCR_DP | IA64_DCR_DK
179 | IA64_DCR_DX | IA64_DCR_DR | IA64_DCR_PP
180 | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC));
181 }
182 }
183 if (VMX_DOMAIN(next))
184 vmx_load_state(next);
186 ia64_disable_vhpt_walker();
187 /*ia64_psr(ia64_task_regs(next))->dfh = !ia64_is_local_fpu_owner(next);*/
188 prev = ia64_switch_to(next);
190 /* Note: ia64_switch_to does not return here at vcpu initialization. */
192 //cpu_set(smp_processor_id(), current->domain->domain_dirty_cpumask);
194 if (VMX_DOMAIN(current)){
195 vmx_load_all_rr(current);
196 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
197 current->processor);
198 } else {
199 struct domain *nd;
200 extern char ia64_ivt;
202 ia64_set_iva(&ia64_ivt);
204 nd = current->domain;
205 if (!is_idle_domain(nd)) {
206 load_region_regs(current);
207 ia64_set_pta(vcpu_pta(current));
208 vcpu_load_kernel_regs(current);
209 vcpu_set_next_timer(current);
210 if (vcpu_timer_expired(current))
211 vcpu_pend_timer(current);
212 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
213 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
214 __ia64_per_cpu_var(current_psr_ic_addr) =
215 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
216 } else {
217 /* When switching to the idle domain, we only need to disable the vhpt
218 * walker. All accesses that happen within the idle context will then
219 * be handled by TR mappings and the identity mapping.
220 */
221 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
222 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
223 }
224 }
225 local_irq_restore(spsr);
226 flush_vtlb_for_context_switch(prev, current);
227 context_saved(prev);
228 }
230 void continue_running(struct vcpu *same)
231 {
232 /* nothing to do */
233 }
235 #ifdef CONFIG_PERFMON
236 static int pal_halt = 1;
237 static int can_do_pal_halt = 1;
239 static int __init nohalt_setup(char * str)
240 {
241 pal_halt = can_do_pal_halt = 0;
242 return 1;
243 }
244 __setup("nohalt", nohalt_setup);
246 void
247 update_pal_halt_status(int status)
248 {
249 can_do_pal_halt = pal_halt && status;
250 }
251 #else
252 #define can_do_pal_halt (1)
253 #endif
255 static void default_idle(void)
256 {
257 local_irq_disable();
258 if ( !softirq_pending(smp_processor_id()) ) {
259 if (can_do_pal_halt)
260 safe_halt();
261 else
262 cpu_relax();
263 }
264 local_irq_enable();
265 }
267 static void continue_cpu_idle_loop(void)
268 {
269 for ( ; ; )
270 {
271 #ifdef IA64
272 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
273 #else
274 irq_stat[cpu].idle_timestamp = jiffies;
275 #endif
276 while ( !softirq_pending(smp_processor_id()) )
277 default_idle();
278 raise_softirq(SCHEDULE_SOFTIRQ);
279 do_softirq();
280 }
281 }
283 void startup_cpu_idle_loop(void)
284 {
285 /* Just some sanity to ensure that the scheduler is set up okay. */
286 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
287 raise_softirq(SCHEDULE_SOFTIRQ);
289 continue_cpu_idle_loop();
290 }
292 /* Compile-time check that get_order(sizeof(mapped_regs_t)) ==
293 * get_order_from_shift(XMAPPEDREGS_SHIFT).
294 */
295 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
296 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
297 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
298 #endif
300 void hlt_timer_fn(void *data)
301 {
302 struct vcpu *v = data;
303 vcpu_unblock(v);
304 }
306 void relinquish_vcpu_resources(struct vcpu *v)
307 {
308 if (HAS_PERVCPU_VHPT(v->domain))
309 pervcpu_vhpt_free(v);
310 if (v->arch.privregs != NULL) {
311 free_xenheap_pages(v->arch.privregs,
312 get_order_from_shift(XMAPPEDREGS_SHIFT));
313 v->arch.privregs = NULL;
314 }
315 kill_timer(&v->arch.hlt_timer);
316 }
318 struct vcpu *alloc_vcpu_struct(void)
319 {
320 struct vcpu *v;
321 struct thread_info *ti;
322 static int first_allocation = 1;
324 if (first_allocation) {
325 first_allocation = 0;
326 /* Keep idle vcpu0 statically allocated at compile time, because
327 * some code from Linux still requires it in the early phase.
328 */
329 return idle_vcpu[0];
330 }
332 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
333 return NULL;
334 memset(v, 0, sizeof(*v));
336 ti = alloc_thread_info(v);
337 /* Clear thread_info to clear some important fields, like
338 * preempt_count
339 */
340 memset(ti, 0, sizeof(struct thread_info));
341 init_switch_stack(v);
343 return v;
344 }
346 void free_vcpu_struct(struct vcpu *v)
347 {
348 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
349 }
351 int vcpu_initialise(struct vcpu *v)
352 {
353 struct domain *d = v->domain;
355 if (!is_idle_domain(d)) {
356 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
357 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
358 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
359 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
361 /* Is this correct?
362 It depends on the domain's rid usage.
364 A domain may share rids among its processors (e.g. when it has a
365 global VHPT). In this case we should also share rids
366 among vcpus, and the rid ranges should be the same.
368 However, a domain may use per-cpu rid allocation. In
369 that case we don't want to share rids among vcpus, but we may
370 do so if two vcpus are on the same cpu... */
372 v->arch.starting_rid = d->arch.starting_rid;
373 v->arch.ending_rid = d->arch.ending_rid;
374 v->arch.breakimm = d->arch.breakimm;
375 v->arch.last_processor = INVALID_PROCESSOR;
376 }
378 if (!VMX_DOMAIN(v))
379 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
380 first_cpu(cpu_online_map));
382 return 0;
383 }
385 int vcpu_late_initialise(struct vcpu *v)
386 {
387 struct domain *d = v->domain;
388 int rc, order, i;
390 if (HAS_PERVCPU_VHPT(d)) {
391 rc = pervcpu_vhpt_alloc(v);
392 if (rc != 0)
393 return rc;
394 }
396 /* Create privregs page. */
397 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
398 v->arch.privregs = alloc_xenheap_pages(order);
399 BUG_ON(v->arch.privregs == NULL);
400 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
401 for (i = 0; i < (1 << order); i++)
402 share_xen_page_with_guest(virt_to_page(v->arch.privregs) + i,
403 d, XENSHARE_writable);
404 /*
405 * XXX IA64_XMAPPEDREGS_PADDR
406 * Assign these pages into the guest pseudo-physical address
407 * space so that dom0 can map them by gmfn.
408 * This is necessary for domain save, restore and dump-core.
409 */
410 for (i = 0; i < XMAPPEDREGS_SIZE; i += PAGE_SIZE)
411 assign_domain_page(d, IA64_XMAPPEDREGS_PADDR(v->vcpu_id) + i,
412 virt_to_maddr(v->arch.privregs + i));
414 tlbflush_update_time(&v->arch.tlbflush_timestamp,
415 tlbflush_current_time());
417 return 0;
418 }
420 void vcpu_destroy(struct vcpu *v)
421 {
422 if (v->domain->arch.is_vti)
423 vmx_relinquish_vcpu_resources(v);
424 else
425 relinquish_vcpu_resources(v);
426 }
428 static void init_switch_stack(struct vcpu *v)
429 {
430 struct pt_regs *regs = vcpu_regs (v);
431 struct switch_stack *sw = (struct switch_stack *) regs - 1;
432 extern void ia64_ret_from_clone;
434 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
435 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
436 sw->b0 = (unsigned long) &ia64_ret_from_clone;
437 sw->ar_fpsr = FPSR_DEFAULT;
438 v->arch._thread.ksp = (unsigned long) sw - 16;
439 // Stay on the kernel stack, because we may get interrupts!
440 // ia64_ret_from_clone switches to the user stack.
441 v->arch._thread.on_ustack = 0;
442 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
443 }
445 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
446 static int opt_pervcpu_vhpt = 1;
447 integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
448 #endif
450 int arch_domain_create(struct domain *d)
451 {
452 int i;
454 // the following will eventually need to be negotiated dynamically
455 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
456 d->arch.breakimm = 0x1000;
457 for (i = 0; i < NR_CPUS; i++) {
458 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
459 }
461 if (is_idle_domain(d))
462 return 0;
464 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
465 d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
466 dprintk(XENLOG_WARNING, "%s:%d domain %d pervcpu_vhpt %d\n",
467 __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
468 #endif
469 if (tlb_track_create(d) < 0)
470 goto fail_nomem1;
471 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
472 if (d->shared_info == NULL)
473 goto fail_nomem;
474 memset(d->shared_info, 0, XSI_SIZE);
475 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
476 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
477 d, XENSHARE_writable);
479 /* We may also need an emulation rid for region4, though it's unlikely
480 * that a guest will issue uncacheable accesses in metaphysical mode. But
481 * keeping such info here may be saner.
482 */
483 if (!allocate_rid_range(d,0))
484 goto fail_nomem;
486 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
488 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
489 goto fail_nomem;
491 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
492 RANGESETF_prettyprint_hex);
494 printk ("arch_domain_create: domain=%p\n", d);
495 return 0;
497 fail_nomem:
498 tlb_track_destroy(d);
499 fail_nomem1:
500 if (d->arch.mm.pgd != NULL)
501 pgd_free(d->arch.mm.pgd);
502 if (d->shared_info != NULL)
503 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
504 return -ENOMEM;
505 }
507 void arch_domain_destroy(struct domain *d)
508 {
509 mm_final_teardown(d);
511 if (d->shared_info != NULL)
512 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
514 tlb_track_destroy(d);
516 /* Clear vTLB for the next domain. */
517 domain_flush_tlb_vhpt(d);
519 deallocate_rid_range(d);
520 }
522 void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
523 {
524 int i;
525 struct vcpu_extra_regs *er = &c.nat->extra_regs;
527 c.nat->user_regs = *vcpu_regs(v);
528 c.nat->privregs_pfn = get_gpfn_from_mfn(virt_to_maddr(v->arch.privregs) >>
529 PAGE_SHIFT);
531 /* Fill extra regs. */
532 for (i = 0; i < 8; i++) {
533 er->itrs[i].pte = v->arch.itrs[i].pte.val;
534 er->itrs[i].itir = v->arch.itrs[i].itir;
535 er->itrs[i].vadr = v->arch.itrs[i].vadr;
536 er->itrs[i].rid = v->arch.itrs[i].rid;
537 }
538 for (i = 0; i < 8; i++) {
539 er->dtrs[i].pte = v->arch.dtrs[i].pte.val;
540 er->dtrs[i].itir = v->arch.dtrs[i].itir;
541 er->dtrs[i].vadr = v->arch.dtrs[i].vadr;
542 er->dtrs[i].rid = v->arch.dtrs[i].rid;
543 }
544 er->event_callback_ip = v->arch.event_callback_ip;
545 er->dcr = v->arch.dcr;
546 er->iva = v->arch.iva;
547 }
549 int arch_set_info_guest(struct vcpu *v, vcpu_guest_context_u c)
550 {
551 struct pt_regs *regs = vcpu_regs (v);
552 struct domain *d = v->domain;
553 int rc;
555 *regs = c.nat->user_regs;
557 if (!d->arch.is_vti) {
558 /* domain runs at PL2/3 */
559 regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT;
560 regs->ar_rsc |= (2 << 2); /* force PL2/3 */
561 }
563 if (c.nat->flags & VGCF_EXTRA_REGS) {
564 int i;
565 struct vcpu_extra_regs *er = &c.nat->extra_regs;
567 for (i = 0; i < 8; i++) {
568 vcpu_set_itr(v, i, er->itrs[i].pte,
569 er->itrs[i].itir,
570 er->itrs[i].vadr,
571 er->itrs[i].rid);
572 }
573 for (i = 0; i < 8; i++) {
574 vcpu_set_dtr(v, i,
575 er->dtrs[i].pte,
576 er->dtrs[i].itir,
577 er->dtrs[i].vadr,
578 er->dtrs[i].rid);
579 }
580 v->arch.event_callback_ip = er->event_callback_ip;
581 v->arch.dcr = er->dcr;
582 v->arch.iva = er->iva;
583 }
585 if (test_bit(_VCPUF_initialised, &v->vcpu_flags))
586 return 0;
588 if (d->arch.is_vti)
589 vmx_final_setup_guest(v);
590 else {
591 rc = vcpu_late_initialise(v);
592 if (rc != 0)
593 return rc;
594 VCPU(v, interrupt_mask_addr) =
595 (unsigned char *) d->arch.shared_info_va +
596 INT_ENABLE_OFFSET(v);
597 }
599 /* This overrides some registers. */
600 vcpu_init_regs(v);
602 /* Don't redo final setup */
603 set_bit(_VCPUF_initialised, &v->vcpu_flags);
604 return 0;
605 }
607 static void relinquish_memory(struct domain *d, struct list_head *list)
608 {
609 struct list_head *ent;
610 struct page_info *page;
611 #ifndef __ia64__
612 unsigned long x, y;
613 #endif
615 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
616 spin_lock_recursive(&d->page_alloc_lock);
617 ent = list->next;
618 while ( ent != list )
619 {
620 page = list_entry(ent, struct page_info, list);
621 /* Grab a reference to the page so it won't disappear from under us. */
622 if ( unlikely(!get_page(page, d)) )
623 {
624 /* Couldn't get a reference -- someone is freeing this page. */
625 ent = ent->next;
626 continue;
627 }
629 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
630 put_page_and_type(page);
632 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
633 put_page(page);
635 #ifndef __ia64__
636 /*
637 * Forcibly invalidate base page tables at this point to break circular
638 * 'linear page table' references. This is okay because MMU structures
639 * are not shared across domains and this domain is now dead. Thus base
640 * tables are not in use so a non-zero count means circular reference.
641 */
642 y = page->u.inuse.type_info;
643 for ( ; ; )
644 {
645 x = y;
646 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
647 (PGT_base_page_table|PGT_validated)) )
648 break;
650 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
651 if ( likely(y == x) )
652 {
653 free_page_type(page, PGT_base_page_table);
654 break;
655 }
656 }
657 #endif
659 /* Follow the list chain and /then/ potentially free the page. */
660 ent = ent->next;
661 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
662 put_page(page);
663 }
665 spin_unlock_recursive(&d->page_alloc_lock);
666 }
668 void domain_relinquish_resources(struct domain *d)
669 {
670 /* Relinquish guest resources for VT-i domain. */
671 if (d->vcpu[0] && VMX_DOMAIN(d->vcpu[0]))
672 vmx_relinquish_guest_resources(d);
674 /* Tear down shadow mode stuff. */
675 mm_teardown(d);
677 /* Relinquish every page of memory. */
678 relinquish_memory(d, &d->xenpage_list);
679 relinquish_memory(d, &d->page_list);
681 if (d->arch.is_vti && d->arch.sal_data)
682 xfree(d->arch.sal_data);
684 /* Free page used by xen oprofile buffer */
685 free_xenoprof_pages(d);
686 }
688 unsigned long
689 domain_set_shared_info_va (unsigned long va)
690 {
691 struct vcpu *v = current;
692 struct domain *d = v->domain;
694 /* Check virtual address:
695 must belong to region 7,
696 must be 64Kb aligned,
697 must not be within Xen virtual space. */
698 if ((va >> 61) != 7
699 || (va & 0xffffUL) != 0
700 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
701 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
703 /* Note: this doesn't work well if other cpus are already running.
704 However this is part of the spec :-) */
705 printk ("Domain set shared_info_va to 0x%016lx\n", va);
706 d->arch.shared_info_va = va;
708 VCPU(v, interrupt_mask_addr) = (unsigned char *)va +
709 INT_ENABLE_OFFSET(v);
711 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
713 /* Remap the shared pages. */
714 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
716 return 0;
717 }
719 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
720 #define SHADOW_COPY_CHUNK 1024
722 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
723 {
724 unsigned int op = sc->op;
725 int rc = 0;
726 int i;
727 //struct vcpu *v;
729 if (unlikely(d == current->domain)) {
730 gdprintk(XENLOG_INFO,
731 "Don't try to do a shadow op on yourself!\n");
732 return -EINVAL;
733 }
735 domain_pause(d);
737 switch (op)
738 {
739 case XEN_DOMCTL_SHADOW_OP_OFF:
740 if (shadow_mode_enabled (d)) {
741 u64 *bm = d->arch.shadow_bitmap;
743 /* Flush vhpt and tlb to restore dirty bit usage. */
744 domain_flush_tlb_vhpt(d);
746 /* Free bitmap. */
747 d->arch.shadow_bitmap_size = 0;
748 d->arch.shadow_bitmap = NULL;
749 xfree(bm);
750 }
751 break;
753 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
754 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
755 rc = -EINVAL;
756 break;
758 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
759 if (shadow_mode_enabled(d)) {
760 rc = -EINVAL;
761 break;
762 }
764 atomic64_set(&d->arch.shadow_fault_count, 0);
765 atomic64_set(&d->arch.shadow_dirty_count, 0);
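// The bitmap holds one bit per guest page frame up to convmem_end, rounded
// up to a multiple of BITS_PER_LONG so it can be allocated and cleared as
// whole unsigned longs.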
767 d->arch.shadow_bitmap_size =
768 ((d->arch.convmem_end >> PAGE_SHIFT) +
769 BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
770 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
771 d->arch.shadow_bitmap_size / BITS_PER_LONG);
772 if (d->arch.shadow_bitmap == NULL) {
773 d->arch.shadow_bitmap_size = 0;
774 rc = -ENOMEM;
775 }
776 else {
777 memset(d->arch.shadow_bitmap, 0,
778 d->arch.shadow_bitmap_size / 8);
780 /* Flush vhpt and tlb to enable dirty bit
781 virtualization. */
782 domain_flush_tlb_vhpt(d);
783 }
784 break;
786 case XEN_DOMCTL_SHADOW_OP_CLEAN:
787 {
788 int nbr_bytes;
790 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
791 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
793 atomic64_set(&d->arch.shadow_fault_count, 0);
794 atomic64_set(&d->arch.shadow_dirty_count, 0);
796 if (guest_handle_is_null(sc->dirty_bitmap) ||
797 (d->arch.shadow_bitmap == NULL)) {
798 rc = -EINVAL;
799 break;
800 }
802 if (sc->pages > d->arch.shadow_bitmap_size)
803 sc->pages = d->arch.shadow_bitmap_size;
805 nbr_bytes = (sc->pages + 7) / 8;
807 for (i = 0; i < nbr_bytes; i += SHADOW_COPY_CHUNK) {
808 int size = (nbr_bytes - i) > SHADOW_COPY_CHUNK ?
809 SHADOW_COPY_CHUNK : nbr_bytes - i;
811 if (copy_to_guest_offset(
812 sc->dirty_bitmap, i,
813 (uint8_t *)d->arch.shadow_bitmap + i,
814 size)) {
815 rc = -EFAULT;
816 break;
817 }
819 memset((uint8_t *)d->arch.shadow_bitmap + i, 0, size);
820 }
822 break;
823 }
825 case XEN_DOMCTL_SHADOW_OP_PEEK:
826 {
827 unsigned long size;
829 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
830 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
832 if (guest_handle_is_null(sc->dirty_bitmap) ||
833 (d->arch.shadow_bitmap == NULL)) {
834 rc = -EINVAL;
835 break;
836 }
838 if (sc->pages > d->arch.shadow_bitmap_size)
839 sc->pages = d->arch.shadow_bitmap_size;
841 size = (sc->pages + 7) / 8;
842 if (copy_to_guest(sc->dirty_bitmap,
843 (uint8_t *)d->arch.shadow_bitmap, size)) {
844 rc = -EFAULT;
845 break;
846 }
847 break;
848 }
849 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
850 sc->mb = 0;
851 break;
852 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
853 if (sc->mb > 0) {
854 BUG();
855 rc = -ENOMEM;
856 }
857 break;
858 default:
859 rc = -EINVAL;
860 break;
861 }
863 domain_unpause(d);
865 return rc;
866 }
868 // remove following line if not privifying in memory
869 //#define HAVE_PRIVIFY_MEMORY
870 #ifndef HAVE_PRIVIFY_MEMORY
871 #define privify_memory(x,y) do {} while(0)
872 #endif
874 static void loaddomainelfimage(struct domain *d, struct elf_binary *elf)
875 {
876 const elf_phdr *phdr;
877 int phnum, h, filesz, memsz;
878 unsigned long elfaddr, dom_mpaddr, dom_imva;
879 struct page_info *p;
881 phnum = elf_uval(elf, elf->ehdr, e_phnum);
882 for (h = 0; h < phnum; h++) {
883 phdr = elf_phdr_by_index(elf, h);
884 if (!elf_phdr_is_loadable(elf, phdr))
885 continue;
887 filesz = elf_uval(elf, phdr, p_filesz);
888 memsz = elf_uval(elf, phdr, p_memsz);
889 elfaddr = (unsigned long) elf->image + elf_uval(elf, phdr, p_offset);
890 dom_mpaddr = elf_uval(elf, phdr, p_paddr);
892 while (memsz > 0) {
893 p = assign_new_domain_page(d,dom_mpaddr);
894 BUG_ON (unlikely(p == NULL));
895 dom_imva = __va_ul(page_to_maddr(p));
896 if (filesz > 0) {
897 if (filesz >= PAGE_SIZE)
898 memcpy((void *) dom_imva,
899 (void *) elfaddr,
900 PAGE_SIZE);
901 else {
902 // copy partial page
903 memcpy((void *) dom_imva,
904 (void *) elfaddr, filesz);
905 // zero the rest of page
906 memset((void *) dom_imva+filesz, 0,
907 PAGE_SIZE-filesz);
908 }
909 //FIXME: This test for code seems to find a lot more than objdump -x does
910 if (elf_uval(elf, phdr, p_flags) & PF_X) {
911 privify_memory(dom_imva,PAGE_SIZE);
912 flush_icache_range(dom_imva,
913 dom_imva+PAGE_SIZE);
914 }
915 }
916 else if (memsz > 0) {
917 /* always zero out entire page */
918 memset((void *) dom_imva, 0, PAGE_SIZE);
919 }
920 memsz -= PAGE_SIZE;
921 filesz -= PAGE_SIZE;
922 elfaddr += PAGE_SIZE;
923 dom_mpaddr += PAGE_SIZE;
924 }
925 }
926 }
928 void alloc_dom0(void)
929 {
930 /* Check dom0 size. */
931 if (dom0_size < 4 * 1024 * 1024) {
932 panic("dom0_mem is too small, boot aborted"
933 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
934 }
936 if (running_on_sim) {
937 dom0_size = 128*1024*1024; //FIXME: Should be configurable
938 }
940 /* No need to allocate pages for now.
941 * Pages are allocated by assign_new_domain_page() via loaddomainelfimage().
942 */
943 }
946 /*
947 * Domain 0 has direct access to all devices. However, the
948 * major point of this stub is to allow alloc_dom_mem to be
949 * handled with order > 0 requests. Dom0 requires that to
950 * allocate memory for other domains.
951 */
952 static void physdev_init_dom0(struct domain *d)
953 {
954 if (iomem_permit_access(d, 0UL, ~0UL))
955 BUG();
956 if (irqs_permit_access(d, 0, NR_IRQS-1))
957 BUG();
958 if (ioports_permit_access(d, 0, 0xffff))
959 BUG();
960 }
962 int construct_dom0(struct domain *d,
963 unsigned long image_start, unsigned long image_len,
964 unsigned long initrd_start, unsigned long initrd_len,
965 char *cmdline)
966 {
967 int i, rc;
968 start_info_t *si;
969 dom0_vga_console_info_t *ci;
970 struct vcpu *v = d->vcpu[0];
971 unsigned long max_pages;
973 struct elf_binary elf;
974 struct elf_dom_parms parms;
975 unsigned long p_start;
976 unsigned long pkern_start;
977 unsigned long pkern_entry;
978 unsigned long pkern_end;
979 unsigned long pinitrd_start = 0;
980 unsigned long pstart_info;
981 struct page_info *start_info_page;
982 unsigned long bp_mpa;
983 struct ia64_boot_param *bp;
985 #ifdef VALIDATE_VT
986 unsigned int vmx_dom0 = 0;
987 unsigned long mfn;
988 struct page_info *page = NULL;
989 #endif
991 //printk("construct_dom0: starting\n");
993 /* Sanity! */
994 BUG_ON(d != dom0);
995 BUG_ON(d->vcpu[0] == NULL);
996 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
998 printk("*** LOADING DOMAIN 0 ***\n");
1000 max_pages = dom0_size / PAGE_SIZE;
1001 d->max_pages = max_pages;
1002 d->tot_pages = 0;
1004 rc = elf_init(&elf, (void*)image_start, image_len);
1005 if ( rc != 0 )
1006 return rc;
1007 #ifdef VERBOSE
1008 elf_set_verbose(&elf);
1009 #endif
1010 elf_parse_binary(&elf);
1011 if ((rc = elf_xen_parse(&elf, &parms)) != 0)
1012 return rc;
1014 printk(" Dom0 kernel: %s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
1015 elf_64bit(&elf) ? "64-bit" : "32-bit",
1016 elf_msb(&elf) ? "msb" : "lsb",
1017 elf.pstart, elf.pend);
1018 if (!elf_64bit(&elf) ||
1019 elf_uval(&elf, elf.ehdr, e_machine) != EM_IA_64) {
1020 printk("Incompatible kernel binary\n");
1021 return -1;
1022 }
1024 #ifdef VALIDATE_VT
1025 /* Temp workaround */
1026 if (running_on_sim)
1027 dsi.xen_section_string = (char *)1;
1029 /* Check whether dom0 is vti domain */
1030 if ((!vmx_enabled) && !dsi.xen_section_string) {
1031 printk("Lack of hardware support for unmodified vmx dom0\n");
1032 panic("");
1033 }
1035 if (vmx_enabled && !dsi.xen_section_string) {
1036 printk("Dom0 is vmx domain!\n");
1037 vmx_dom0 = 1;
1038 }
1039 #endif
1041 p_start = parms.virt_base;
1042 pkern_start = parms.virt_kstart;
1043 pkern_end = parms.virt_kend;
1044 pkern_entry = parms.virt_entry;
1046 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
1048 if ( (p_start & (PAGE_SIZE-1)) != 0 )
1049 {
1050 printk("Initial guest OS must load to a page boundary.\n");
1051 return -EINVAL;
1052 }
1054 pstart_info = PAGE_ALIGN(pkern_end);
1055 if(initrd_start && initrd_len){
1056 unsigned long offset;
1058 /* The next page aligned boundary after the start info.
1059 Note: EFI_PAGE_SHIFT = 12 <= PAGE_SHIFT */
1060 pinitrd_start = pstart_info + PAGE_SIZE;
1061 if (pinitrd_start + initrd_len >= dom0_size)
1062 panic("%s: not enough memory assigned to dom0", __func__);
1063 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
1064 struct page_info *p;
1065 p = assign_new_domain_page(d, pinitrd_start + offset);
1066 if (p == NULL)
1067 panic("%s: can't allocate page for initrd image", __func__);
1068 if (initrd_len < offset + PAGE_SIZE)
1069 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
1070 initrd_len - offset);
1071 else
1072 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
1073 }
1074 }
1076 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
1077 " Kernel image: %lx->%lx\n"
1078 " Entry address: %lx\n"
1079 " Init. ramdisk: %lx len %lx\n"
1080 " Start info.: %lx->%lx\n",
1081 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1082 pstart_info, pstart_info + PAGE_SIZE);
1084 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1085 {
1086 printk("Initial guest OS requires too much space\n"
1087 "(%luMB is greater than %luMB limit)\n",
1088 (pkern_end-pkern_start)>>20,
1089 (max_pages <<PAGE_SHIFT)>>20);
1090 return -ENOMEM;
1091 }
1093 // if high 3 bits of pkern start are non-zero, error
1095 // if pkern end is after end of metaphysical memory, error
1096 // (we should be able to deal with this... later)
1098 /* Mask all upcalls... */
1099 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1100 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1102 if (dom0_max_vcpus == 0)
1103 dom0_max_vcpus = MAX_VIRT_CPUS;
1104 if (dom0_max_vcpus > num_online_cpus())
1105 dom0_max_vcpus = num_online_cpus();
1106 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1107 dom0_max_vcpus = MAX_VIRT_CPUS;
1109 printk ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1110 for ( i = 1; i < dom0_max_vcpus; i++ )
1111 if (alloc_vcpu(d, i, i) == NULL)
1112 panic("Cannot allocate dom0 vcpu %d\n", i);
1114 /* Copy the OS image. */
1115 loaddomainelfimage(d,&elf);
1117 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
1118 sizeof(struct ia64_boot_param) > PAGE_SIZE);
1120 /* Set up start info area. */
1121 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1122 start_info_page = assign_new_domain_page(d, pstart_info);
1123 if (start_info_page == NULL)
1124 panic("can't allocate start info page");
1125 si = page_to_virt(start_info_page);
1126 memset(si, 0, PAGE_SIZE);
1127 sprintf(si->magic, "xen-%i.%i-ia64",
1128 xen_major_version(), xen_minor_version());
1129 si->nr_pages = max_pages;
1130 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1132 printk("Dom0: 0x%lx\n", (u64)dom0);
1134 #ifdef VALIDATE_VT
1135 /* VMX specific construction for Dom0, if hardware supports VMX
1136 * and Dom0 is an unmodified image
1137 */
1138 if (vmx_dom0)
1139 vmx_final_setup_guest(v);
1140 #endif
1142 set_bit(_VCPUF_initialised, &v->vcpu_flags);
1144 /* Build firmware.
1145 Note: the Linux kernel reserves the memory used by start_info, so there
1146 is no need to remove it from the MDT. */
1147 bp_mpa = pstart_info + sizeof(struct start_info);
1148 dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1150 /* Fill boot param. */
1151 strlcpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1153 bp = (struct ia64_boot_param *)((unsigned char *)si +
1154 sizeof(start_info_t));
1155 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1157 /* We assume console has reached the last line! */
1158 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1159 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1160 bp->console_info.orig_x = 0;
1161 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1162 0 : bp->console_info.num_rows - 1;
1164 bp->initrd_start = pinitrd_start;
1165 bp->initrd_size = ia64_boot_param->initrd_size;
1167 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
1168 sizeof(start_info_t) +
1169 sizeof(struct ia64_boot_param));
1171 if (fill_console_start_info(ci)) {
1172 si->console.dom0.info_off = sizeof(start_info_t) +
1173 sizeof(struct ia64_boot_param);
1174 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
1175 }
1177 vcpu_init_regs (v);
1179 vcpu_regs(v)->r28 = bp_mpa;
1181 vcpu_regs (v)->cr_iip = pkern_entry;
1183 physdev_init_dom0(d);
1185 return 0;
1186 }
1188 void machine_restart(char * __unused)
1189 {
1190 console_start_sync();
1191 if (running_on_sim)
1192 printk ("machine_restart called. spinning...\n");
1193 else
1194 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1195 while(1);
1196 }
1198 extern void cpu_halt(void);
1200 void machine_halt(void)
1201 {
1202 console_start_sync();
1203 if (running_on_sim)
1204 printk ("machine_halt called. spinning...\n");
1205 else
1206 cpu_halt();
1207 while(1);
1208 }
1210 void sync_vcpu_execstate(struct vcpu *v)
1211 {
1212 // __ia64_save_fpu(v->arch._thread.fph);
1213 // if (VMX_DOMAIN(v))
1214 // vmx_save_state(v);
1215 // FIXME SMP: Anything else needed here for SMP?
1216 }
1218 static void parse_dom0_mem(char *s)
1219 {
1220 dom0_size = parse_size_and_unit(s, NULL);
1221 }
1222 custom_param("dom0_mem", parse_dom0_mem);
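
The boot-time parameters defined in this file (dom0_mem via custom_param(), dom0_max_vcpus and pervcpu_vhpt via integer_param(), nohalt via __setup()) are given on the Xen command line. An illustrative combination, not taken from the changeset:

    dom0_mem=256M dom0_max_vcpus=2 pervcpu_vhpt=0 nohalt

pervcpu_vhpt and nohalt are only parsed when CONFIG_XEN_IA64_PERVCPU_VHPT and CONFIG_PERFMON, respectively, are compiled in.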