
view xen/arch/x86/hvm/hvm.c @ 22807:47d67a64a2d2

x86 hvm: Do not check-and-fail on in_atomic() in hvm_copy().

Stub this out for 4.1, as PV-on-HVM drivers hit this case when
performing grant-table hypercalls. Grant-table code currently accesses
guest memory under the big per-domain lock. The test in hvm_copy() is not
necessary until the xenpaging implementation is more complete, which
will not now be until after 4.1.0.

Signed-off-by: Keir Fraser <keir@xen.org>
author Keir Fraser <keir@xen.org>
date Fri Jan 14 15:18:02 2011 +0000 (2011-01-14)
parents 14ee2ec6ad5a
children
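For context, the guard that this changeset disables is the #if 0 block inside
__hvm_copy() further down in this listing; the compiled-out check is simply:

    #if 0
        /*
         * If the required guest memory is paged out, this function may sleep.
         * Hence we bail immediately if called from atomic context.
         */
        if ( in_atomic() )
            return HVMCOPY_unhandleable;
    #endif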
line source
1 /*
2 * hvm.c: Common hardware virtual machine abstractions.
3 *
4 * Copyright (c) 2004, Intel Corporation.
5 * Copyright (c) 2005, International Business Machines Corporation.
6 * Copyright (c) 2008, Citrix Systems, Inc.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 * Place - Suite 330, Boston, MA 02111-1307 USA.
20 */
22 #include <xen/config.h>
23 #include <xen/ctype.h>
24 #include <xen/init.h>
25 #include <xen/lib.h>
26 #include <xen/trace.h>
27 #include <xen/sched.h>
28 #include <xen/irq.h>
29 #include <xen/softirq.h>
30 #include <xen/domain.h>
31 #include <xen/domain_page.h>
32 #include <xen/hypercall.h>
33 #include <xen/guest_access.h>
34 #include <xen/event.h>
35 #include <xen/paging.h>
36 #include <xen/cpu.h>
37 #include <xen/wait.h>
38 #include <asm/shadow.h>
39 #include <asm/hap.h>
40 #include <asm/current.h>
41 #include <asm/e820.h>
42 #include <asm/io.h>
43 #include <asm/regs.h>
44 #include <asm/cpufeature.h>
45 #include <asm/processor.h>
46 #include <asm/types.h>
47 #include <asm/msr.h>
48 #include <asm/i387.h>
49 #include <asm/traps.h>
50 #include <asm/mc146818rtc.h>
51 #include <asm/spinlock.h>
52 #include <asm/mce.h>
53 #include <asm/hvm/hvm.h>
54 #include <asm/hvm/vpt.h>
55 #include <asm/hvm/support.h>
56 #include <asm/hvm/cacheattr.h>
57 #include <asm/hvm/trace.h>
58 #include <asm/mtrr.h>
59 #include <asm/apic.h>
60 #include <public/sched.h>
61 #include <public/hvm/ioreq.h>
62 #include <public/version.h>
63 #include <public/memory.h>
64 #include <asm/mem_event.h>
65 #include <public/mem_event.h>
67 bool_t __read_mostly hvm_enabled;
69 unsigned int opt_hvm_debug_level __read_mostly;
70 integer_param("hvm_debug", opt_hvm_debug_level);
72 struct hvm_function_table hvm_funcs __read_mostly;
74 /* I/O permission bitmap is globally shared by all HVM guests. */
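/* Three pages suffice for either vendor: SVM's I/O permission map is 12kB,
 * while VMX uses only the first two pages (I/O bitmaps A and B). */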
75 unsigned long __attribute__ ((__section__ (".bss.page_aligned")))
76 hvm_io_bitmap[3*PAGE_SIZE/BYTES_PER_LONG];
78 static int cpu_callback(
79 struct notifier_block *nfb, unsigned long action, void *hcpu)
80 {
81 unsigned int cpu = (unsigned long)hcpu;
82 int rc = 0;
84 switch ( action )
85 {
86 case CPU_UP_PREPARE:
87 rc = hvm_funcs.cpu_up_prepare(cpu);
88 break;
89 case CPU_DYING:
90 hvm_cpu_down();
91 break;
92 case CPU_UP_CANCELED:
93 case CPU_DEAD:
94 hvm_funcs.cpu_dead(cpu);
95 break;
96 default:
97 break;
98 }
100 return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
101 }
103 static struct notifier_block cpu_nfb = {
104 .notifier_call = cpu_callback
105 };
107 static int __init hvm_enable(void)
108 {
109 extern struct hvm_function_table *start_svm(void);
110 extern struct hvm_function_table *start_vmx(void);
111 extern int hvm_port80_allowed;
113 struct hvm_function_table *fns = NULL;
115 switch ( boot_cpu_data.x86_vendor )
116 {
117 case X86_VENDOR_INTEL:
118 fns = start_vmx();
119 break;
120 case X86_VENDOR_AMD:
121 fns = start_svm();
122 break;
123 default:
124 break;
125 }
127 if ( fns == NULL )
128 return 0;
130 hvm_funcs = *fns;
131 hvm_enabled = 1;
133 printk("HVM: %s enabled\n", hvm_funcs.name);
134 if ( hvm_funcs.hap_supported )
135 printk("HVM: Hardware Assisted Paging detected.\n");
137 /*
138 * Allow direct access to the PC debug ports 0x80 and 0xed (they are
139 * often used for I/O delays, but the vmexits simply slow things down).
140 */
141 memset(hvm_io_bitmap, ~0, sizeof(hvm_io_bitmap));
142 if ( hvm_port80_allowed )
143 __clear_bit(0x80, hvm_io_bitmap);
144 __clear_bit(0xed, hvm_io_bitmap);
146 register_cpu_notifier(&cpu_nfb);
148 return 0;
149 }
150 presmp_initcall(hvm_enable);
152 /*
153 * Need to re-inject a given event? We avoid re-injecting software exceptions
154 * and interrupts because the faulting/trapping instruction can simply be
155 * re-executed (neither VMX nor SVM update RIP when they VMEXIT during
156 * INT3/INTO/INTn).
157 */
158 int hvm_event_needs_reinjection(uint8_t type, uint8_t vector)
159 {
160 switch ( type )
161 {
162 case X86_EVENTTYPE_EXT_INTR:
163 case X86_EVENTTYPE_NMI:
164 return 1;
165 case X86_EVENTTYPE_HW_EXCEPTION:
166 /*
167 * SVM uses type 3 ("HW Exception") for #OF and #BP. We explicitly
168 * check for these vectors, as they are really SW Exceptions. SVM has
169 * not updated RIP to point after the trapping instruction (INT3/INTO).
170 */
171 return (vector != 3) && (vector != 4);
172 default:
173 /* Software exceptions/interrupts can be re-executed (e.g., INT n). */
174 break;
175 }
176 return 0;
177 }
179 /*
180 * Combine two hardware exceptions: @vec2 was raised during delivery of @vec1.
181 * This means we can assume that @vec2 is contributory or a page fault.
182 */
183 uint8_t hvm_combine_hw_exceptions(uint8_t vec1, uint8_t vec2)
184 {
185 /* Exception during double-fault delivery always causes a triple fault. */
186 if ( vec1 == TRAP_double_fault )
187 {
188 hvm_triple_fault();
189 return TRAP_double_fault; /* dummy return */
190 }
192 /* Exception during page-fault delivery always causes a double fault. */
193 if ( vec1 == TRAP_page_fault )
194 return TRAP_double_fault;
196 /* Discard the first exception if it's benign or if we now have a #PF. */
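/* (0x7c01u covers #DE, #TS, #NP, #SS, #GP and #PF; anything else is benign.) */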
197 if ( !((1u << vec1) & 0x7c01u) || (vec2 == TRAP_page_fault) )
198 return vec2;
200 /* Cannot combine the exceptions: double fault. */
201 return TRAP_double_fault;
202 }
204 void hvm_set_rdtsc_exiting(struct domain *d, bool_t enable)
205 {
206 struct vcpu *v;
208 for_each_vcpu ( d, v )
209 hvm_funcs.set_rdtsc_exiting(v, enable);
210 }
212 void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc)
213 {
214 uint64_t tsc;
216 if ( v->domain->arch.vtsc )
217 {
218 tsc = hvm_get_guest_time(v);
219 tsc = gtime_to_gtsc(v->domain, tsc);
220 }
221 else
222 {
223 rdtscll(tsc);
224 }
226 v->arch.hvm_vcpu.cache_tsc_offset = guest_tsc - tsc;
227 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
228 }
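/* The offset cached above is added back by hvm_get_guest_tsc() below and
 * programmed into hardware via set_tsc_offset(), so guest reads observe
 * the requested guest_tsc value. */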
230 u64 hvm_get_guest_tsc(struct vcpu *v)
231 {
232 uint64_t tsc;
234 if ( v->domain->arch.vtsc )
235 {
236 tsc = hvm_get_guest_time(v);
237 tsc = gtime_to_gtsc(v->domain, tsc);
238 v->domain->arch.vtsc_kerncount++;
239 }
240 else
241 {
242 rdtscll(tsc);
243 }
245 return tsc + v->arch.hvm_vcpu.cache_tsc_offset;
246 }
248 void hvm_migrate_timers(struct vcpu *v)
249 {
250 rtc_migrate_timers(v);
251 pt_migrate(v);
252 }
254 void hvm_migrate_pirqs(struct vcpu *v)
255 {
256 int pirq, irq;
257 struct irq_desc *desc;
258 struct domain *d = v->domain;
259 struct hvm_irq_dpci *hvm_irq_dpci = d->arch.hvm_domain.irq.dpci;
261 if ( !iommu_enabled || (hvm_irq_dpci == NULL) )
262 return;
264 spin_lock(&d->event_lock);
265 for ( pirq = find_first_bit(hvm_irq_dpci->mapping, d->nr_pirqs);
266 pirq < d->nr_pirqs;
267 pirq = find_next_bit(hvm_irq_dpci->mapping, d->nr_pirqs, pirq + 1) )
268 {
269 if ( !(hvm_irq_dpci->mirq[pirq].flags & HVM_IRQ_DPCI_MACH_MSI) ||
270 (hvm_irq_dpci->mirq[pirq].gmsi.dest_vcpu_id != v->vcpu_id) )
271 continue;
272 desc = domain_spin_lock_irq_desc(v->domain, pirq, NULL);
273 if (!desc)
274 continue;
275 irq = desc - irq_desc;
276 ASSERT(MSI_IRQ(irq));
277 irq_set_affinity(desc, cpumask_of(v->processor));
278 spin_unlock_irq(&desc->lock);
279 }
280 spin_unlock(&d->event_lock);
281 }
283 void hvm_do_resume(struct vcpu *v)
284 {
285 ioreq_t *p;
287 pt_restore_timer(v);
289 check_wakeup_from_wait();
291 /* NB. Optimised for common case (p->state == STATE_IOREQ_NONE). */
292 p = get_ioreq(v);
293 while ( p->state != STATE_IOREQ_NONE )
294 {
295 switch ( p->state )
296 {
297 case STATE_IORESP_READY: /* IORESP_READY -> NONE */
298 hvm_io_assist();
299 break;
300 case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
301 case STATE_IOREQ_INPROCESS:
302 wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port,
303 (p->state != STATE_IOREQ_READY) &&
304 (p->state != STATE_IOREQ_INPROCESS));
305 break;
306 default:
307 gdprintk(XENLOG_ERR, "Weird HVM iorequest state %d.\n", p->state);
308 domain_crash(v->domain);
309 return; /* bail */
310 }
311 }
313 /* Inject pending hw/sw trap */
314 if (v->arch.hvm_vcpu.inject_trap != -1)
315 {
316 hvm_inject_exception(v->arch.hvm_vcpu.inject_trap,
317 v->arch.hvm_vcpu.inject_error_code,
318 v->arch.hvm_vcpu.inject_cr2);
319 v->arch.hvm_vcpu.inject_trap = -1;
320 }
321 }
323 static void hvm_init_ioreq_page(
324 struct domain *d, struct hvm_ioreq_page *iorp)
325 {
326 memset(iorp, 0, sizeof(*iorp));
327 spin_lock_init(&iorp->lock);
328 domain_pause(d);
329 }
331 static void hvm_destroy_ioreq_page(
332 struct domain *d, struct hvm_ioreq_page *iorp)
333 {
334 spin_lock(&iorp->lock);
336 ASSERT(d->is_dying);
338 if ( iorp->va != NULL )
339 {
340 unmap_domain_page_global(iorp->va);
341 put_page_and_type(iorp->page);
342 iorp->va = NULL;
343 }
345 spin_unlock(&iorp->lock);
346 }
348 static int hvm_set_ioreq_page(
349 struct domain *d, struct hvm_ioreq_page *iorp, unsigned long gmfn)
350 {
351 struct page_info *page;
352 struct p2m_domain *p2m = p2m_get_hostp2m(d);
353 p2m_type_t p2mt;
354 unsigned long mfn;
355 void *va;
357 mfn = mfn_x(gfn_to_mfn_unshare(p2m, gmfn, &p2mt, 0));
358 if ( !p2m_is_ram(p2mt) )
359 return -EINVAL;
360 if ( p2m_is_paging(p2mt) )
361 {
362 p2m_mem_paging_populate(p2m, gmfn);
363 return -ENOENT;
364 }
365 if ( p2m_is_shared(p2mt) )
366 return -ENOENT;
367 ASSERT(mfn_valid(mfn));
369 page = mfn_to_page(mfn);
370 if ( !get_page_and_type(page, d, PGT_writable_page) )
371 return -EINVAL;
373 va = map_domain_page_global(mfn);
374 if ( va == NULL )
375 {
376 put_page_and_type(page);
377 return -ENOMEM;
378 }
380 spin_lock(&iorp->lock);
382 if ( (iorp->va != NULL) || d->is_dying )
383 {
384 spin_unlock(&iorp->lock);
385 unmap_domain_page_global(va);
386 put_page_and_type(mfn_to_page(mfn));
387 return -EINVAL;
388 }
390 iorp->va = va;
391 iorp->page = page;
393 spin_unlock(&iorp->lock);
395 domain_unpause(d);
397 return 0;
398 }
400 static int hvm_print_line(
401 int dir, uint32_t port, uint32_t bytes, uint32_t *val)
402 {
403 struct vcpu *curr = current;
404 struct hvm_domain *hd = &curr->domain->arch.hvm_domain;
405 char c = *val;
407 BUG_ON(bytes != 1);
409 /* Accept only printable characters, newline, and horizontal tab. */
410 if ( !isprint(c) && (c != '\n') && (c != '\t') )
411 return X86EMUL_OKAY;
413 spin_lock(&hd->pbuf_lock);
414 hd->pbuf[hd->pbuf_idx++] = c;
415 if ( (hd->pbuf_idx == (sizeof(hd->pbuf) - 2)) || (c == '\n') )
416 {
417 if ( c != '\n' )
418 hd->pbuf[hd->pbuf_idx++] = '\n';
419 hd->pbuf[hd->pbuf_idx] = '\0';
420 printk(XENLOG_G_DEBUG "HVM%u: %s", curr->domain->domain_id, hd->pbuf);
421 hd->pbuf_idx = 0;
422 }
423 spin_unlock(&hd->pbuf_lock);
425 return X86EMUL_OKAY;
426 }
428 int hvm_domain_initialise(struct domain *d)
429 {
430 int rc;
432 if ( !hvm_enabled )
433 {
434 gdprintk(XENLOG_WARNING, "Attempt to create an HVM guest "
435 "on a non-VT/AMDV platform.\n");
436 return -EINVAL;
437 }
439 spin_lock_init(&d->arch.hvm_domain.pbuf_lock);
440 spin_lock_init(&d->arch.hvm_domain.irq_lock);
441 spin_lock_init(&d->arch.hvm_domain.uc_lock);
443 INIT_LIST_HEAD(&d->arch.hvm_domain.msixtbl_list);
444 spin_lock_init(&d->arch.hvm_domain.msixtbl_list_lock);
446 hvm_init_guest_time(d);
448 d->arch.hvm_domain.params[HVM_PARAM_HPET_ENABLED] = 1;
450 hvm_init_cacheattr_region_list(d);
452 rc = paging_enable(d, PG_refcounts|PG_translate|PG_external);
453 if ( rc != 0 )
454 goto fail1;
456 vpic_init(d);
458 rc = vioapic_init(d);
459 if ( rc != 0 )
460 goto fail1;
462 stdvga_init(d);
464 rtc_init(d);
466 hvm_init_ioreq_page(d, &d->arch.hvm_domain.ioreq);
467 hvm_init_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
469 register_portio_handler(d, 0xe9, 1, hvm_print_line);
471 rc = hvm_funcs.domain_initialise(d);
472 if ( rc != 0 )
473 goto fail2;
475 return 0;
477 fail2:
478 rtc_deinit(d);
479 stdvga_deinit(d);
480 vioapic_deinit(d);
481 fail1:
482 hvm_destroy_cacheattr_region_list(d);
483 return rc;
484 }
486 extern void msixtbl_pt_cleanup(struct domain *d);
488 void hvm_domain_relinquish_resources(struct domain *d)
489 {
490 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.ioreq);
491 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
493 msixtbl_pt_cleanup(d);
495 /* Stop all asynchronous timer actions. */
496 rtc_deinit(d);
497 if ( d->vcpu != NULL && d->vcpu[0] != NULL )
498 {
499 pit_deinit(d);
500 pmtimer_deinit(d);
501 hpet_deinit(d);
502 }
503 }
505 void hvm_domain_destroy(struct domain *d)
506 {
507 hvm_funcs.domain_destroy(d);
508 rtc_deinit(d);
509 stdvga_deinit(d);
510 vioapic_deinit(d);
511 hvm_destroy_cacheattr_region_list(d);
512 }
514 static int hvm_save_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
515 {
516 struct vcpu *v;
517 struct hvm_hw_cpu ctxt;
518 struct segment_register seg;
519 struct vcpu_guest_context *vc;
521 for_each_vcpu ( d, v )
522 {
523 /* We don't need to save state for a vcpu that is down; the restore
524 * code will leave it down if there is nothing saved. */
525 if ( test_bit(_VPF_down, &v->pause_flags) )
526 continue;
528 /* Architecture-specific vmcs/vmcb bits */
529 hvm_funcs.save_cpu_ctxt(v, &ctxt);
531 ctxt.msr_tsc_aux = hvm_msr_tsc_aux(v);
533 hvm_get_segment_register(v, x86_seg_idtr, &seg);
534 ctxt.idtr_limit = seg.limit;
535 ctxt.idtr_base = seg.base;
537 hvm_get_segment_register(v, x86_seg_gdtr, &seg);
538 ctxt.gdtr_limit = seg.limit;
539 ctxt.gdtr_base = seg.base;
541 hvm_get_segment_register(v, x86_seg_cs, &seg);
542 ctxt.cs_sel = seg.sel;
543 ctxt.cs_limit = seg.limit;
544 ctxt.cs_base = seg.base;
545 ctxt.cs_arbytes = seg.attr.bytes;
547 hvm_get_segment_register(v, x86_seg_ds, &seg);
548 ctxt.ds_sel = seg.sel;
549 ctxt.ds_limit = seg.limit;
550 ctxt.ds_base = seg.base;
551 ctxt.ds_arbytes = seg.attr.bytes;
553 hvm_get_segment_register(v, x86_seg_es, &seg);
554 ctxt.es_sel = seg.sel;
555 ctxt.es_limit = seg.limit;
556 ctxt.es_base = seg.base;
557 ctxt.es_arbytes = seg.attr.bytes;
559 hvm_get_segment_register(v, x86_seg_ss, &seg);
560 ctxt.ss_sel = seg.sel;
561 ctxt.ss_limit = seg.limit;
562 ctxt.ss_base = seg.base;
563 ctxt.ss_arbytes = seg.attr.bytes;
565 hvm_get_segment_register(v, x86_seg_fs, &seg);
566 ctxt.fs_sel = seg.sel;
567 ctxt.fs_limit = seg.limit;
568 ctxt.fs_base = seg.base;
569 ctxt.fs_arbytes = seg.attr.bytes;
571 hvm_get_segment_register(v, x86_seg_gs, &seg);
572 ctxt.gs_sel = seg.sel;
573 ctxt.gs_limit = seg.limit;
574 ctxt.gs_base = seg.base;
575 ctxt.gs_arbytes = seg.attr.bytes;
577 hvm_get_segment_register(v, x86_seg_tr, &seg);
578 ctxt.tr_sel = seg.sel;
579 ctxt.tr_limit = seg.limit;
580 ctxt.tr_base = seg.base;
581 ctxt.tr_arbytes = seg.attr.bytes;
583 hvm_get_segment_register(v, x86_seg_ldtr, &seg);
584 ctxt.ldtr_sel = seg.sel;
585 ctxt.ldtr_limit = seg.limit;
586 ctxt.ldtr_base = seg.base;
587 ctxt.ldtr_arbytes = seg.attr.bytes;
589 vc = &v->arch.guest_context;
591 if ( v->fpu_initialised )
592 memcpy(ctxt.fpu_regs, &vc->fpu_ctxt, sizeof(ctxt.fpu_regs));
593 else
594 memset(ctxt.fpu_regs, 0, sizeof(ctxt.fpu_regs));
596 ctxt.rax = vc->user_regs.eax;
597 ctxt.rbx = vc->user_regs.ebx;
598 ctxt.rcx = vc->user_regs.ecx;
599 ctxt.rdx = vc->user_regs.edx;
600 ctxt.rbp = vc->user_regs.ebp;
601 ctxt.rsi = vc->user_regs.esi;
602 ctxt.rdi = vc->user_regs.edi;
603 ctxt.rsp = vc->user_regs.esp;
604 ctxt.rip = vc->user_regs.eip;
605 ctxt.rflags = vc->user_regs.eflags;
606 #ifdef __x86_64__
607 ctxt.r8 = vc->user_regs.r8;
608 ctxt.r9 = vc->user_regs.r9;
609 ctxt.r10 = vc->user_regs.r10;
610 ctxt.r11 = vc->user_regs.r11;
611 ctxt.r12 = vc->user_regs.r12;
612 ctxt.r13 = vc->user_regs.r13;
613 ctxt.r14 = vc->user_regs.r14;
614 ctxt.r15 = vc->user_regs.r15;
615 #endif
616 ctxt.dr0 = vc->debugreg[0];
617 ctxt.dr1 = vc->debugreg[1];
618 ctxt.dr2 = vc->debugreg[2];
619 ctxt.dr3 = vc->debugreg[3];
620 ctxt.dr6 = vc->debugreg[6];
621 ctxt.dr7 = vc->debugreg[7];
623 if ( hvm_save_entry(CPU, v->vcpu_id, h, &ctxt) != 0 )
624 return 1;
625 }
626 return 0;
627 }
629 static bool_t hvm_efer_valid(uint64_t value, uint64_t efer_validbits)
630 {
631 return !((value & ~efer_validbits) ||
632 ((sizeof(long) != 8) && (value & EFER_LME)) ||
633 (!cpu_has_nx && (value & EFER_NX)) ||
634 (!cpu_has_syscall && (value & EFER_SCE)) ||
635 (!cpu_has_lmsl && (value & EFER_LMSLE)) ||
636 (!cpu_has_ffxsr && (value & EFER_FFXSE)) ||
637 ((value & (EFER_LME|EFER_LMA)) == EFER_LMA));
638 }
640 static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
641 {
642 int vcpuid, rc;
643 struct vcpu *v;
644 struct hvm_hw_cpu ctxt;
645 struct segment_register seg;
646 struct vcpu_guest_context *vc;
648 /* Which vcpu is this? */
649 vcpuid = hvm_load_instance(h);
650 if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
651 {
652 gdprintk(XENLOG_ERR, "HVM restore: domain has no vcpu %u\n", vcpuid);
653 return -EINVAL;
654 }
655 vc = &v->arch.guest_context;
657 /* Need to init this vcpu before loading its contents */
658 rc = 0;
659 domain_lock(d);
660 if ( !v->is_initialised )
661 rc = boot_vcpu(d, vcpuid, vc);
662 domain_unlock(d);
663 if ( rc != 0 )
664 return rc;
666 if ( hvm_load_entry(CPU, h, &ctxt) != 0 )
667 return -EINVAL;
669 /* Sanity check some control registers. */
670 if ( (ctxt.cr0 & HVM_CR0_GUEST_RESERVED_BITS) ||
671 !(ctxt.cr0 & X86_CR0_ET) ||
672 ((ctxt.cr0 & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG) )
673 {
674 gdprintk(XENLOG_ERR, "HVM restore: bad CR0 0x%"PRIx64"\n",
675 ctxt.cr0);
676 return -EINVAL;
677 }
679 if ( ctxt.cr4 & HVM_CR4_GUEST_RESERVED_BITS )
680 {
681 gdprintk(XENLOG_ERR, "HVM restore: bad CR4 0x%"PRIx64"\n",
682 ctxt.cr4);
683 return -EINVAL;
684 }
686 if ( !hvm_efer_valid(
687 ctxt.msr_efer,
688 EFER_FFXSE | EFER_LMSLE | EFER_LME | EFER_LMA | EFER_NX | EFER_SCE) )
689 {
690 gdprintk(XENLOG_ERR, "HVM restore: bad EFER 0x%"PRIx64"\n",
691 ctxt.msr_efer);
692 return -EINVAL;
693 }
695 /* Older Xen versions used to save the segment arbytes directly
696 * from the VMCS on Intel hosts. Detect this and rearrange them
697 * into the struct segment_register format. */
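/* For example, a VMCS-style value 0xc09b (present 32-bit code segment with
 * G and D/B set) is rewritten as 0x0c9b. */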
698 #define UNFOLD_ARBYTES(_r) \
699 if ( (_r & 0xf000) && !(_r & 0x0f00) ) \
700 _r = ((_r & 0xff) | ((_r >> 4) & 0xf00))
701 UNFOLD_ARBYTES(ctxt.cs_arbytes);
702 UNFOLD_ARBYTES(ctxt.ds_arbytes);
703 UNFOLD_ARBYTES(ctxt.es_arbytes);
704 UNFOLD_ARBYTES(ctxt.fs_arbytes);
705 UNFOLD_ARBYTES(ctxt.gs_arbytes);
706 UNFOLD_ARBYTES(ctxt.ss_arbytes);
707 UNFOLD_ARBYTES(ctxt.tr_arbytes);
708 UNFOLD_ARBYTES(ctxt.ldtr_arbytes);
709 #undef UNFOLD_ARBYTES
711 /* Architecture-specific vmcs/vmcb bits */
712 if ( hvm_funcs.load_cpu_ctxt(v, &ctxt) < 0 )
713 return -EINVAL;
715 v->arch.hvm_vcpu.msr_tsc_aux = ctxt.msr_tsc_aux;
717 seg.limit = ctxt.idtr_limit;
718 seg.base = ctxt.idtr_base;
719 hvm_set_segment_register(v, x86_seg_idtr, &seg);
721 seg.limit = ctxt.gdtr_limit;
722 seg.base = ctxt.gdtr_base;
723 hvm_set_segment_register(v, x86_seg_gdtr, &seg);
725 seg.sel = ctxt.cs_sel;
726 seg.limit = ctxt.cs_limit;
727 seg.base = ctxt.cs_base;
728 seg.attr.bytes = ctxt.cs_arbytes;
729 hvm_set_segment_register(v, x86_seg_cs, &seg);
731 seg.sel = ctxt.ds_sel;
732 seg.limit = ctxt.ds_limit;
733 seg.base = ctxt.ds_base;
734 seg.attr.bytes = ctxt.ds_arbytes;
735 hvm_set_segment_register(v, x86_seg_ds, &seg);
737 seg.sel = ctxt.es_sel;
738 seg.limit = ctxt.es_limit;
739 seg.base = ctxt.es_base;
740 seg.attr.bytes = ctxt.es_arbytes;
741 hvm_set_segment_register(v, x86_seg_es, &seg);
743 seg.sel = ctxt.ss_sel;
744 seg.limit = ctxt.ss_limit;
745 seg.base = ctxt.ss_base;
746 seg.attr.bytes = ctxt.ss_arbytes;
747 hvm_set_segment_register(v, x86_seg_ss, &seg);
749 seg.sel = ctxt.fs_sel;
750 seg.limit = ctxt.fs_limit;
751 seg.base = ctxt.fs_base;
752 seg.attr.bytes = ctxt.fs_arbytes;
753 hvm_set_segment_register(v, x86_seg_fs, &seg);
755 seg.sel = ctxt.gs_sel;
756 seg.limit = ctxt.gs_limit;
757 seg.base = ctxt.gs_base;
758 seg.attr.bytes = ctxt.gs_arbytes;
759 hvm_set_segment_register(v, x86_seg_gs, &seg);
761 seg.sel = ctxt.tr_sel;
762 seg.limit = ctxt.tr_limit;
763 seg.base = ctxt.tr_base;
764 seg.attr.bytes = ctxt.tr_arbytes;
765 hvm_set_segment_register(v, x86_seg_tr, &seg);
767 seg.sel = ctxt.ldtr_sel;
768 seg.limit = ctxt.ldtr_limit;
769 seg.base = ctxt.ldtr_base;
770 seg.attr.bytes = ctxt.ldtr_arbytes;
771 hvm_set_segment_register(v, x86_seg_ldtr, &seg);
773 memcpy(&vc->fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs));
775 /* In case an xsave-absent save file is restored on an xsave-capable host */
776 if ( cpu_has_xsave )
777 {
778 struct xsave_struct *xsave_area = v->arch.xsave_area;
780 memcpy(v->arch.xsave_area, ctxt.fpu_regs, sizeof(ctxt.fpu_regs));
781 xsave_area->xsave_hdr.xstate_bv = XSTATE_FP_SSE;
782 v->arch.xcr0_accum = XSTATE_FP_SSE;
783 v->arch.xcr0 = XSTATE_FP_SSE;
784 }
786 vc->user_regs.eax = ctxt.rax;
787 vc->user_regs.ebx = ctxt.rbx;
788 vc->user_regs.ecx = ctxt.rcx;
789 vc->user_regs.edx = ctxt.rdx;
790 vc->user_regs.ebp = ctxt.rbp;
791 vc->user_regs.esi = ctxt.rsi;
792 vc->user_regs.edi = ctxt.rdi;
793 vc->user_regs.esp = ctxt.rsp;
794 vc->user_regs.eip = ctxt.rip;
795 vc->user_regs.eflags = ctxt.rflags | 2;
796 #ifdef __x86_64__
797 vc->user_regs.r8 = ctxt.r8;
798 vc->user_regs.r9 = ctxt.r9;
799 vc->user_regs.r10 = ctxt.r10;
800 vc->user_regs.r11 = ctxt.r11;
801 vc->user_regs.r12 = ctxt.r12;
802 vc->user_regs.r13 = ctxt.r13;
803 vc->user_regs.r14 = ctxt.r14;
804 vc->user_regs.r15 = ctxt.r15;
805 #endif
806 vc->debugreg[0] = ctxt.dr0;
807 vc->debugreg[1] = ctxt.dr1;
808 vc->debugreg[2] = ctxt.dr2;
809 vc->debugreg[3] = ctxt.dr3;
810 vc->debugreg[6] = ctxt.dr6;
811 vc->debugreg[7] = ctxt.dr7;
813 vc->flags = VGCF_online;
814 v->fpu_initialised = 1;
816 /* Auxiliary processors should be woken immediately. */
817 v->is_initialised = 1;
818 clear_bit(_VPF_down, &v->pause_flags);
819 vcpu_wake(v);
821 return 0;
822 }
824 HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt,
825 1, HVMSR_PER_VCPU);
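/* The three uint64_t fields are xfeature_mask, xcr0 and xcr0_accum; the rest
 * of the chunk is the variable-size xsave area itself. */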
827 #define HVM_CPU_XSAVE_SIZE (3 * sizeof(uint64_t) + xsave_cntxt_size)
829 static int hvm_save_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h)
830 {
831 struct vcpu *v;
832 struct hvm_hw_cpu_xsave *ctxt;
834 if ( !cpu_has_xsave )
835 return 0; /* do nothing */
837 for_each_vcpu ( d, v )
838 {
839 if ( _hvm_init_entry(h, CPU_XSAVE_CODE, v->vcpu_id, HVM_CPU_XSAVE_SIZE) )
840 return 1;
841 ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur];
842 h->cur += HVM_CPU_XSAVE_SIZE;
843 memset(ctxt, 0, HVM_CPU_XSAVE_SIZE);
845 ctxt->xfeature_mask = xfeature_mask;
846 ctxt->xcr0 = v->arch.xcr0;
847 ctxt->xcr0_accum = v->arch.xcr0_accum;
848 if ( v->fpu_initialised )
849 memcpy(&ctxt->save_area,
850 v->arch.xsave_area, xsave_cntxt_size);
851 }
853 return 0;
854 }
856 static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h)
857 {
858 int vcpuid;
859 struct vcpu *v;
860 struct hvm_hw_cpu_xsave *ctxt;
861 struct hvm_save_descriptor *desc;
862 uint64_t _xfeature_mask;
864 /* XXX: fail here, since we cannot restore an image saved on an
865 * xsave-capable host onto a host without xsave support. */
866 if ( !cpu_has_xsave )
867 return -EINVAL;
869 /* Which vcpu is this? */
870 vcpuid = hvm_load_instance(h);
871 if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
872 {
873 gdprintk(XENLOG_ERR, "HVM restore: domain has no vcpu %u\n", vcpuid);
874 return -EINVAL;
875 }
877 /* Customized check for this entry, since it is of variable length */
878 desc = (struct hvm_save_descriptor *)&h->data[h->cur];
879 if ( sizeof (*desc) > h->size - h->cur)
880 {
881 gdprintk(XENLOG_WARNING,
882 "HVM restore: not enough data left to read descriptpr"
883 "for type %u\n", CPU_XSAVE_CODE);
884 return -1;
885 }
886 if ( desc->length + sizeof (*desc) > h->size - h->cur)
887 {
888 gdprintk(XENLOG_WARNING,
889 "HVM restore: not enough data left to read %u bytes "
890 "for type %u\n", desc->length, CPU_XSAVE_CODE);
891 return -1;
892 }
893 if ( CPU_XSAVE_CODE != desc->typecode || (desc->length > HVM_CPU_XSAVE_SIZE) )
894 {
895 gdprintk(XENLOG_WARNING,
896 "HVM restore mismatch: expected type %u with max length %u, "
897 "saw type %u length %u\n", CPU_XSAVE_CODE,
898 (uint32_t)HVM_CPU_XSAVE_SIZE,
899 desc->typecode, desc->length);
900 return -1;
901 }
902 h->cur += sizeof (*desc);
903 /* Checking finished */
905 ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur];
906 h->cur += desc->length;
908 _xfeature_mask = ctxt->xfeature_mask;
909 if ( (_xfeature_mask & xfeature_mask) != _xfeature_mask )
910 return -EINVAL;
912 v->arch.xcr0 = ctxt->xcr0;
913 v->arch.xcr0_accum = ctxt->xcr0_accum;
914 memcpy(v->arch.xsave_area, &ctxt->save_area, xsave_cntxt_size);
916 return 0;
917 }
919 /* We need a variable-length data chunk for the xsave area, hence a
920 * customized declaration rather than HVM_REGISTER_SAVE_RESTORE.
921 */
922 static int __hvm_register_CPU_XSAVE_save_and_restore(void)
923 {
924 hvm_register_savevm(CPU_XSAVE_CODE,
925 "CPU_XSAVE",
926 hvm_save_cpu_xsave_states,
927 hvm_load_cpu_xsave_states,
928 HVM_CPU_XSAVE_SIZE + sizeof (struct hvm_save_descriptor),
929 HVMSR_PER_VCPU);
930 return 0;
931 }
932 __initcall(__hvm_register_CPU_XSAVE_save_and_restore);
934 int hvm_vcpu_initialise(struct vcpu *v)
935 {
936 int rc;
938 hvm_asid_flush_vcpu(v);
940 if ( (rc = vlapic_init(v)) != 0 )
941 goto fail1;
943 if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 )
944 goto fail2;
946 /* Create ioreq event channel. */
947 rc = alloc_unbound_xen_event_channel(v, 0);
948 if ( rc < 0 )
949 goto fail3;
951 /* Register ioreq event channel. */
952 v->arch.hvm_vcpu.xen_port = rc;
953 spin_lock(&v->domain->arch.hvm_domain.ioreq.lock);
954 if ( v->domain->arch.hvm_domain.ioreq.va != NULL )
955 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
956 spin_unlock(&v->domain->arch.hvm_domain.ioreq.lock);
958 spin_lock_init(&v->arch.hvm_vcpu.tm_lock);
959 INIT_LIST_HEAD(&v->arch.hvm_vcpu.tm_list);
961 v->arch.hvm_vcpu.inject_trap = -1;
963 #ifdef CONFIG_COMPAT
964 rc = setup_compat_arg_xlat(v);
965 if ( rc != 0 )
966 goto fail3;
967 #endif
969 rc = hvm_vcpu_cacheattr_init(v);
970 if ( rc != 0 )
971 goto fail4;
973 tasklet_init(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet,
974 (void(*)(unsigned long))hvm_assert_evtchn_irq,
975 (unsigned long)v);
977 v->arch.guest_context.user_regs.eflags = 2;
979 if ( v->vcpu_id == 0 )
980 {
981 /* NB. All these really belong in hvm_domain_initialise(). */
982 pit_init(v, cpu_khz);
983 pmtimer_init(v);
984 hpet_init(v);
986 /* Init guest TSC to start from zero. */
987 hvm_set_guest_tsc(v, 0);
989 /* Can start up without SIPI-SIPI or setvcpucontext domctl. */
990 v->is_initialised = 1;
991 clear_bit(_VPF_down, &v->pause_flags);
992 }
994 return 0;
996 fail4:
997 #ifdef CONFIG_COMPAT
998 free_compat_arg_xlat(v);
999 #endif
1000 fail3:
1001 hvm_funcs.vcpu_destroy(v);
1002 fail2:
1003 vlapic_destroy(v);
1004 fail1:
1005 return rc;
1008 void hvm_vcpu_destroy(struct vcpu *v)
1010 #ifdef CONFIG_COMPAT
1011 free_compat_arg_xlat(v);
1012 #endif
1013 tasklet_kill(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet);
1014 hvm_vcpu_cacheattr_destroy(v);
1015 vlapic_destroy(v);
1016 hvm_funcs.vcpu_destroy(v);
1018 /* Event channel is already freed by evtchn_destroy(). */
1019 /*free_xen_event_channel(v, v->arch.hvm_vcpu.xen_port);*/
1022 void hvm_vcpu_down(struct vcpu *v)
1024 struct domain *d = v->domain;
1025 int online_count = 0;
1027 /* Doesn't halt us immediately, but we'll never return to guest context. */
1028 set_bit(_VPF_down, &v->pause_flags);
1029 vcpu_sleep_nosync(v);
1031 /* Any other VCPUs online? ... */
1032 domain_lock(d);
1033 for_each_vcpu ( d, v )
1034 if ( !test_bit(_VPF_down, &v->pause_flags) )
1035 online_count++;
1036 domain_unlock(d);
1038 /* ... Shut down the domain if not. */
1039 if ( online_count == 0 )
1041 gdprintk(XENLOG_INFO, "All CPUs offline -- powering off.\n");
1042 domain_shutdown(d, SHUTDOWN_poweroff);
1046 bool_t hvm_send_assist_req(struct vcpu *v)
1048 ioreq_t *p;
1050 if ( unlikely(!vcpu_start_shutdown_deferral(v)) )
1051 return 0; /* implicitly bins the i/o operation */
1053 p = get_ioreq(v);
1054 if ( unlikely(p->state != STATE_IOREQ_NONE) )
1056 /* This indicates a bug in the device model. Crash the domain. */
1057 gdprintk(XENLOG_ERR, "Device model set bad IO state %d.\n", p->state);
1058 domain_crash(v->domain);
1059 return 0;
1062 prepare_wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port);
1064 /*
1065 * Following happens /after/ blocking and setting up ioreq contents.
1066 * prepare_wait_on_xen_event_channel() is an implicit barrier.
1067 */
1068 p->state = STATE_IOREQ_READY;
1069 notify_via_xen_event_channel(v->domain, v->arch.hvm_vcpu.xen_port);
1071 return 1;
1074 void hvm_hlt(unsigned long rflags)
1076 struct vcpu *curr = current;
1078 if ( hvm_event_pending(curr) )
1079 return;
1081 /*
1082 * If we halt with interrupts disabled, that's a pretty sure sign that we
1083 * want to shut down. In a real processor, NMIs are the only way to break
1084 * out of this.
1085 */
1086 if ( unlikely(!(rflags & X86_EFLAGS_IF)) )
1087 return hvm_vcpu_down(curr);
1089 do_sched_op_compat(SCHEDOP_block, 0);
1091 HVMTRACE_1D(HLT, /* pending = */ vcpu_runnable(curr));
1094 void hvm_triple_fault(void)
1096 struct vcpu *v = current;
1097 gdprintk(XENLOG_INFO, "Triple fault on VCPU%d - "
1098 "invoking HVM system reset.\n", v->vcpu_id);
1099 domain_shutdown(v->domain, SHUTDOWN_reboot);
1102 bool_t hvm_hap_nested_page_fault(unsigned long gpa,
1103 bool_t gla_valid,
1104 unsigned long gla,
1105 bool_t access_valid,
1106 bool_t access_r,
1107 bool_t access_w,
1108 bool_t access_x)
1110 unsigned long gfn = gpa >> PAGE_SHIFT;
1111 p2m_type_t p2mt;
1112 p2m_access_t p2ma;
1113 mfn_t mfn;
1114 struct vcpu *v = current;
1115 struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
1117 mfn = gfn_to_mfn_type_current(p2m, gfn, &p2mt, &p2ma, p2m_guest);
1119 /* Check access permissions first, then handle faults */
1120 if ( access_valid && (mfn_x(mfn) != INVALID_MFN) )
1122 int violation = 0;
1123 /* If the access violates the permissions, forward it to mem_event */
1124 switch (p2ma)
1126 case p2m_access_n:
1127 default:
1128 violation = access_r || access_w || access_x;
1129 break;
1130 case p2m_access_r:
1131 violation = access_w || access_x;
1132 break;
1133 case p2m_access_w:
1134 violation = access_r || access_x;
1135 break;
1136 case p2m_access_x:
1137 violation = access_r || access_w;
1138 break;
1139 case p2m_access_rx:
1140 case p2m_access_rx2rw:
1141 violation = access_w;
1142 break;
1143 case p2m_access_wx:
1144 violation = access_r;
1145 break;
1146 case p2m_access_rw:
1147 violation = access_x;
1148 break;
1149 case p2m_access_rwx:
1150 break;
1153 if ( violation )
1155 p2m_mem_access_check(gpa, gla_valid, gla, access_r, access_w, access_x);
1157 return 1;
1161 /*
1162 * If this GFN is emulated MMIO or marked as read-only, pass the fault
1163 * to the mmio handler.
1164 */
1165 if ( (p2mt == p2m_mmio_dm) || (p2mt == p2m_ram_ro) )
1167 if ( !handle_mmio() )
1168 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1169 return 1;
1172 #ifdef __x86_64__
1173 /* Check if the page has been paged out */
1174 if ( p2m_is_paged(p2mt) || (p2mt == p2m_ram_paging_out) )
1175 p2m_mem_paging_populate(p2m, gfn);
1177 /* Mem sharing: unshare the page and try again */
1178 if ( p2mt == p2m_ram_shared )
1180 mem_sharing_unshare_page(p2m, gfn, 0);
1181 return 1;
1183 #endif
1185 /* Spurious fault? PoD and log-dirty also take this path. */
1186 if ( p2m_is_ram(p2mt) )
1188 /*
1189 * Page log dirty is always done with order 0. If this mfn resides in
1190 * a large page, we do not change other pages type within that large
1191 * page.
1192 */
1193 paging_mark_dirty(v->domain, mfn_x(mfn));
1194 p2m_change_type(p2m, gfn, p2m_ram_logdirty, p2m_ram_rw);
1195 return 1;
1198 /* Shouldn't happen: Maybe the guest was writing to a r/o grant mapping? */
1199 if ( p2mt == p2m_grant_map_ro )
1201 gdprintk(XENLOG_WARNING,
1202 "trying to write to read-only grant mapping\n");
1203 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1204 return 1;
1207 return 0;
1210 int hvm_handle_xsetbv(u64 new_bv)
1212 struct vcpu *v = current;
1213 struct segment_register sreg;
1215 hvm_get_segment_register(v, x86_seg_ss, &sreg);
1216 if ( sreg.attr.fields.dpl != 0 )
1217 goto err;
1219 if ( ((new_bv ^ xfeature_mask) & ~xfeature_mask) || !(new_bv & 1) )
1220 goto err;
1222 if ( (xfeature_mask & XSTATE_YMM & new_bv) && !(new_bv & XSTATE_SSE) )
1223 goto err;
1225 v->arch.xcr0 = new_bv;
1226 v->arch.xcr0_accum |= new_bv;
1227 set_xcr0(new_bv);
1229 return 0;
1230 err:
1231 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1232 return -1;
1235 int hvm_set_efer(uint64_t value)
1237 struct vcpu *v = current;
1239 value &= ~EFER_LMA;
1241 if ( !hvm_efer_valid(value,
1242 EFER_FFXSE | EFER_LMSLE | EFER_LME | EFER_NX | EFER_SCE) )
1244 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
1245 "EFER: %"PRIx64"\n", value);
1246 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1247 return X86EMUL_EXCEPTION;
1250 if ( ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_LME) &&
1251 hvm_paging_enabled(v) )
1253 gdprintk(XENLOG_WARNING,
1254 "Trying to change EFER.LME with paging enabled\n");
1255 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1256 return X86EMUL_EXCEPTION;
1259 value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA;
1260 v->arch.hvm_vcpu.guest_efer = value;
1261 hvm_update_guest_efer(v);
1263 return X86EMUL_OKAY;
1266 extern void shadow_blow_tables_per_domain(struct domain *d);
1268 /* Exit UC mode only if all VCPUs agree on MTRR/PAT and are not in no_fill. */
1269 static bool_t domain_exit_uc_mode(struct vcpu *v)
1271 struct domain *d = v->domain;
1272 struct vcpu *vs;
1274 for_each_vcpu ( d, vs )
1276 if ( (vs == v) || !vs->is_initialised )
1277 continue;
1278 if ( (vs->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) ||
1279 mtrr_pat_not_equal(vs, v) )
1280 return 0;
1283 return 1;
1286 static void local_flush_cache(void *info)
1288 wbinvd();
1291 static void hvm_set_uc_mode(struct vcpu *v, bool_t is_in_uc_mode)
1293 v->domain->arch.hvm_domain.is_in_uc_mode = is_in_uc_mode;
1294 shadow_blow_tables_per_domain(v->domain);
1295 if ( hvm_funcs.set_uc_mode )
1296 return hvm_funcs.set_uc_mode(v);
1299 int hvm_set_cr0(unsigned long value)
1301 struct vcpu *v = current;
1302 p2m_type_t p2mt;
1303 struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
1304 unsigned long gfn, mfn, old_value = v->arch.hvm_vcpu.guest_cr[0];
1306 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);
1308 if ( (u32)value != value )
1310 HVM_DBG_LOG(DBG_LEVEL_1,
1311 "Guest attempts to set upper 32 bits in CR0: %lx",
1312 value);
1313 goto gpf;
1316 value &= ~HVM_CR0_GUEST_RESERVED_BITS;
1318 /* ET is reserved and should always be 1. */
1319 value |= X86_CR0_ET;
1321 if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PG )
1322 goto gpf;
1324 if ( (value & X86_CR0_PG) && !(old_value & X86_CR0_PG) )
1326 if ( v->arch.hvm_vcpu.guest_efer & EFER_LME )
1328 if ( !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PAE) )
1330 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable");
1331 goto gpf;
1333 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode");
1334 v->arch.hvm_vcpu.guest_efer |= EFER_LMA;
1335 hvm_update_guest_efer(v);
1338 if ( !paging_mode_hap(v->domain) )
1340 /* The guest CR3 must point to guest physical memory. */
1341 gfn = v->arch.hvm_vcpu.guest_cr[3]>>PAGE_SHIFT;
1342 mfn = mfn_x(gfn_to_mfn(p2m, gfn, &p2mt));
1343 if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
1344 !get_page(mfn_to_page(mfn), v->domain))
1346 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
1347 v->arch.hvm_vcpu.guest_cr[3], mfn);
1348 domain_crash(v->domain);
1349 return X86EMUL_UNHANDLEABLE;
1352 /* Now arch.guest_table points to machine physical. */
1353 v->arch.guest_table = pagetable_from_pfn(mfn);
1355 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1356 v->arch.hvm_vcpu.guest_cr[3], mfn);
1359 else if ( !(value & X86_CR0_PG) && (old_value & X86_CR0_PG) )
1361 /* When CR0.PG is cleared, LMA is cleared immediately. */
1362 if ( hvm_long_mode_enabled(v) )
1364 v->arch.hvm_vcpu.guest_efer &= ~EFER_LMA;
1365 hvm_update_guest_efer(v);
1368 if ( !paging_mode_hap(v->domain) )
1370 put_page(pagetable_get_page(v->arch.guest_table));
1371 v->arch.guest_table = pagetable_null();
1375 if ( has_arch_mmios(v->domain) )
1377 if ( (value & X86_CR0_CD) && !(value & X86_CR0_NW) )
1379 /* Entering no fill cache mode. */
1380 spin_lock(&v->domain->arch.hvm_domain.uc_lock);
1381 v->arch.hvm_vcpu.cache_mode = NO_FILL_CACHE_MODE;
1383 if ( !v->domain->arch.hvm_domain.is_in_uc_mode )
1385 /* Flush physical caches. */
1386 on_each_cpu(local_flush_cache, NULL, 1);
1387 hvm_set_uc_mode(v, 1);
1389 spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
1391 else if ( !(value & (X86_CR0_CD | X86_CR0_NW)) &&
1392 (v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) )
1394 /* Exit from no fill cache mode. */
1395 spin_lock(&v->domain->arch.hvm_domain.uc_lock);
1396 v->arch.hvm_vcpu.cache_mode = NORMAL_CACHE_MODE;
1398 if ( domain_exit_uc_mode(v) )
1399 hvm_set_uc_mode(v, 0);
1401 spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
1405 v->arch.hvm_vcpu.guest_cr[0] = value;
1406 hvm_update_guest_cr(v, 0);
1408 if ( (value ^ old_value) & X86_CR0_PG )
1409 paging_update_paging_modes(v);
1411 return X86EMUL_OKAY;
1413 gpf:
1414 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1415 return X86EMUL_EXCEPTION;
1418 int hvm_set_cr3(unsigned long value)
1420 unsigned long mfn;
1421 p2m_type_t p2mt;
1422 struct vcpu *v = current;
1424 if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
1425 (value != v->arch.hvm_vcpu.guest_cr[3]) )
1427 /* Shadow-mode CR3 change. Check PDBR and update refcounts. */
1428 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
1429 mfn = mfn_x(gfn_to_mfn(p2m_get_hostp2m(v->domain),
1430 value >> PAGE_SHIFT, &p2mt));
1431 if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
1432 !get_page(mfn_to_page(mfn), v->domain) )
1433 goto bad_cr3;
1435 put_page(pagetable_get_page(v->arch.guest_table));
1436 v->arch.guest_table = pagetable_from_pfn(mfn);
1438 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
1441 v->arch.hvm_vcpu.guest_cr[3] = value;
1442 paging_update_cr3(v);
1443 return X86EMUL_OKAY;
1445 bad_cr3:
1446 gdprintk(XENLOG_ERR, "Invalid CR3\n");
1447 domain_crash(v->domain);
1448 return X86EMUL_UNHANDLEABLE;
1451 int hvm_set_cr4(unsigned long value)
1453 struct vcpu *v = current;
1454 unsigned long old_cr;
1456 if ( value & HVM_CR4_GUEST_RESERVED_BITS )
1458 HVM_DBG_LOG(DBG_LEVEL_1,
1459 "Guest attempts to set reserved bit in CR4: %lx",
1460 value);
1461 goto gpf;
1464 if ( !(value & X86_CR4_PAE) && hvm_long_mode_enabled(v) )
1466 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
1467 "EFER.LMA is set");
1468 goto gpf;
1471 old_cr = v->arch.hvm_vcpu.guest_cr[4];
1472 v->arch.hvm_vcpu.guest_cr[4] = value;
1473 hvm_update_guest_cr(v, 4);
1475 /* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. */
1476 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
1477 paging_update_paging_modes(v);
1479 return X86EMUL_OKAY;
1481 gpf:
1482 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1483 return X86EMUL_EXCEPTION;
1486 int hvm_virtual_to_linear_addr(
1487 enum x86_segment seg,
1488 struct segment_register *reg,
1489 unsigned long offset,
1490 unsigned int bytes,
1491 enum hvm_access_type access_type,
1492 unsigned int addr_size,
1493 unsigned long *linear_addr)
1495 unsigned long addr = offset;
1496 uint32_t last_byte;
1498 if ( !(current->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) )
1500 /*
1501 * REAL MODE: Don't bother with segment access checks.
1502 * Certain of them are not done in native real mode anyway.
1503 */
1504 addr = (uint32_t)(addr + reg->base);
1506 else if ( addr_size != 64 )
1508 /*
1509 * COMPATIBILITY MODE: Apply segment checks and add base.
1510 */
1512 switch ( access_type )
1514 case hvm_access_read:
1515 if ( (reg->attr.fields.type & 0xa) == 0x8 )
1516 goto gpf; /* execute-only code segment */
1517 break;
1518 case hvm_access_write:
1519 if ( (reg->attr.fields.type & 0xa) != 0x2 )
1520 goto gpf; /* not a writable data segment */
1521 break;
1522 default:
1523 break;
1526 last_byte = offset + bytes - 1;
1528 /* Is this a grows-down data segment? Special limit check if so. */
1529 if ( (reg->attr.fields.type & 0xc) == 0x4 )
1531 /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
1532 if ( !reg->attr.fields.db )
1533 last_byte = (uint16_t)last_byte;
1535 /* Check first byte and last byte against respective bounds. */
1536 if ( (offset <= reg->limit) || (last_byte < offset) )
1537 goto gpf;
1539 else if ( (last_byte > reg->limit) || (last_byte < offset) )
1540 goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */
1542 /*
1543 * Hardware truncates to 32 bits in compatibility mode.
1544 * It does not truncate to 16 bits in 16-bit address-size mode.
1545 */
1546 addr = (uint32_t)(addr + reg->base);
1548 else
1550 /*
1551 * LONG MODE: FS and GS add segment base. Addresses must be canonical.
1552 */
1554 if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
1555 addr += reg->base;
1557 if ( !is_canonical_address(addr) )
1558 goto gpf;
1561 *linear_addr = addr;
1562 return 1;
1564 gpf:
1565 return 0;
1568 static void *__hvm_map_guest_frame(unsigned long gfn, bool_t writable)
1570 unsigned long mfn;
1571 p2m_type_t p2mt;
1572 struct p2m_domain *p2m = p2m_get_hostp2m(current->domain);
1574 mfn = mfn_x(writable
1575 ? gfn_to_mfn_unshare(p2m, gfn, &p2mt, 0)
1576 : gfn_to_mfn(p2m, gfn, &p2mt));
1577 if ( (p2m_is_shared(p2mt) && writable) || !p2m_is_ram(p2mt) )
1578 return NULL;
1579 if ( p2m_is_paging(p2mt) )
1581 p2m_mem_paging_populate(p2m, gfn);
1582 return NULL;
1585 ASSERT(mfn_valid(mfn));
1587 if ( writable )
1588 paging_mark_dirty(current->domain, mfn);
1590 return map_domain_page(mfn);
1593 void *hvm_map_guest_frame_rw(unsigned long gfn)
1595 return __hvm_map_guest_frame(gfn, 1);
1598 void *hvm_map_guest_frame_ro(unsigned long gfn)
1600 return __hvm_map_guest_frame(gfn, 0);
1603 void hvm_unmap_guest_frame(void *p)
1605 if ( p )
1606 unmap_domain_page(p);
1609 static void *hvm_map_entry(unsigned long va)
1611 unsigned long gfn;
1612 uint32_t pfec;
1613 char *v;
1615 if ( ((va & ~PAGE_MASK) + 8) > PAGE_SIZE )
1617 gdprintk(XENLOG_ERR, "Descriptor table entry "
1618 "straddles page boundary\n");
1619 goto fail;
1622 /*
1623 * We're mapping on behalf of the segment-load logic, which might write
1624 * the accessed flags in the descriptors (in 32-bit mode), but we still
1625 * treat it as a kernel-mode read (i.e. no access checks).
1626 */
1627 pfec = PFEC_page_present;
1628 gfn = paging_gva_to_gfn(current, va, &pfec);
1629 if ( (pfec == PFEC_page_paged) || (pfec == PFEC_page_shared) )
1630 goto fail;
1632 v = hvm_map_guest_frame_rw(gfn);
1633 if ( v == NULL )
1634 goto fail;
1636 return v + (va & ~PAGE_MASK);
1638 fail:
1639 domain_crash(current->domain);
1640 return NULL;
1643 static void hvm_unmap_entry(void *p)
1645 hvm_unmap_guest_frame(p);
1648 static int hvm_load_segment_selector(
1649 enum x86_segment seg, uint16_t sel)
1651 struct segment_register desctab, cs, segr;
1652 struct desc_struct *pdesc, desc;
1653 u8 dpl, rpl, cpl;
1654 int fault_type = TRAP_invalid_tss;
1655 struct cpu_user_regs *regs = guest_cpu_user_regs();
1656 struct vcpu *v = current;
1658 if ( regs->eflags & X86_EFLAGS_VM )
1660 segr.sel = sel;
1661 segr.base = (uint32_t)sel << 4;
1662 segr.limit = 0xffffu;
1663 segr.attr.bytes = 0xf3;
1664 hvm_set_segment_register(v, seg, &segr);
1665 return 0;
1668 /* NULL selector? */
1669 if ( (sel & 0xfffc) == 0 )
1671 if ( (seg == x86_seg_cs) || (seg == x86_seg_ss) )
1672 goto fail;
1673 memset(&segr, 0, sizeof(segr));
1674 hvm_set_segment_register(v, seg, &segr);
1675 return 0;
1678 /* LDT descriptor must be in the GDT. */
1679 if ( (seg == x86_seg_ldtr) && (sel & 4) )
1680 goto fail;
1682 hvm_get_segment_register(v, x86_seg_cs, &cs);
1683 hvm_get_segment_register(
1684 v, (sel & 4) ? x86_seg_ldtr : x86_seg_gdtr, &desctab);
1686 /* Check against descriptor table limit. */
1687 if ( ((sel & 0xfff8) + 7) > desctab.limit )
1688 goto fail;
1690 pdesc = hvm_map_entry(desctab.base + (sel & 0xfff8));
1691 if ( pdesc == NULL )
1692 goto hvm_map_fail;
1694 do {
1695 desc = *pdesc;
1697 /* Segment present in memory? */
1698 if ( !(desc.b & (1u<<15)) )
1700 fault_type = TRAP_no_segment;
1701 goto unmap_and_fail;
1704 /* LDT descriptor is a system segment. All others are code/data. */
1705 if ( (desc.b & (1u<<12)) == ((seg == x86_seg_ldtr) << 12) )
1706 goto unmap_and_fail;
1708 dpl = (desc.b >> 13) & 3;
1709 rpl = sel & 3;
1710 cpl = cs.sel & 3;
1712 switch ( seg )
1714 case x86_seg_cs:
1715 /* Code segment? */
1716 if ( !(desc.b & (1u<<11)) )
1717 goto unmap_and_fail;
1718 /* Non-conforming segment: check DPL against RPL. */
1719 if ( ((desc.b & (6u<<9)) != 6) && (dpl != rpl) )
1720 goto unmap_and_fail;
1721 break;
1722 case x86_seg_ss:
1723 /* Writable data segment? */
1724 if ( (desc.b & (5u<<9)) != (1u<<9) )
1725 goto unmap_and_fail;
1726 if ( (dpl != cpl) || (dpl != rpl) )
1727 goto unmap_and_fail;
1728 break;
1729 case x86_seg_ldtr:
1730 /* LDT system segment? */
1731 if ( (desc.b & (15u<<8)) != (2u<<8) )
1732 goto unmap_and_fail;
1733 goto skip_accessed_flag;
1734 default:
1735 /* Readable code or data segment? */
1736 if ( (desc.b & (5u<<9)) == (4u<<9) )
1737 goto unmap_and_fail;
1738 /* Non-conforming segment: check DPL against RPL and CPL. */
1739 if ( ((desc.b & (6u<<9)) != 6) && ((dpl < cpl) || (dpl < rpl)) )
1740 goto unmap_and_fail;
1741 break;
1743 } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
1744 (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );
1746 /* Force the Accessed flag in our local copy. */
1747 desc.b |= 0x100;
1749 skip_accessed_flag:
1750 hvm_unmap_entry(pdesc);
1752 segr.base = (((desc.b << 0) & 0xff000000u) |
1753 ((desc.b << 16) & 0x00ff0000u) |
1754 ((desc.a >> 16) & 0x0000ffffu));
1755 segr.attr.bytes = (((desc.b >> 8) & 0x00ffu) |
1756 ((desc.b >> 12) & 0x0f00u));
1757 segr.limit = (desc.b & 0x000f0000u) | (desc.a & 0x0000ffffu);
1758 if ( segr.attr.fields.g )
1759 segr.limit = (segr.limit << 12) | 0xfffu;
1760 segr.sel = sel;
1761 hvm_set_segment_register(v, seg, &segr);
1763 return 0;
1765 unmap_and_fail:
1766 hvm_unmap_entry(pdesc);
1767 fail:
1768 hvm_inject_exception(fault_type, sel & 0xfffc, 0);
1769 hvm_map_fail:
1770 return 1;
1773 void hvm_task_switch(
1774 uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason,
1775 int32_t errcode)
1777 struct vcpu *v = current;
1778 struct cpu_user_regs *regs = guest_cpu_user_regs();
1779 struct segment_register gdt, tr, prev_tr, segr;
1780 struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc;
1781 unsigned long eflags;
1782 int exn_raised, rc;
1783 struct {
1784 u16 back_link,__blh;
1785 u32 esp0;
1786 u16 ss0, _0;
1787 u32 esp1;
1788 u16 ss1, _1;
1789 u32 esp2;
1790 u16 ss2, _2;
1791 u32 cr3, eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi;
1792 u16 es, _3, cs, _4, ss, _5, ds, _6, fs, _7, gs, _8, ldt, _9;
1793 u16 trace, iomap;
1794 } tss = { 0 };
1796 hvm_get_segment_register(v, x86_seg_gdtr, &gdt);
1797 hvm_get_segment_register(v, x86_seg_tr, &prev_tr);
1799 if ( ((tss_sel & 0xfff8) + 7) > gdt.limit )
1801 hvm_inject_exception((taskswitch_reason == TSW_iret) ?
1802 TRAP_invalid_tss : TRAP_gp_fault,
1803 tss_sel & 0xfff8, 0);
1804 goto out;
1807 optss_desc = hvm_map_entry(gdt.base + (prev_tr.sel & 0xfff8));
1808 if ( optss_desc == NULL )
1809 goto out;
1811 nptss_desc = hvm_map_entry(gdt.base + (tss_sel & 0xfff8));
1812 if ( nptss_desc == NULL )
1813 goto out;
1815 tss_desc = *nptss_desc;
1816 tr.sel = tss_sel;
1817 tr.base = (((tss_desc.b << 0) & 0xff000000u) |
1818 ((tss_desc.b << 16) & 0x00ff0000u) |
1819 ((tss_desc.a >> 16) & 0x0000ffffu));
1820 tr.attr.bytes = (((tss_desc.b >> 8) & 0x00ffu) |
1821 ((tss_desc.b >> 12) & 0x0f00u));
1822 tr.limit = (tss_desc.b & 0x000f0000u) | (tss_desc.a & 0x0000ffffu);
1823 if ( tr.attr.fields.g )
1824 tr.limit = (tr.limit << 12) | 0xfffu;
1826 if ( !tr.attr.fields.p )
1828 hvm_inject_exception(TRAP_no_segment, tss_sel & 0xfff8, 0);
1829 goto out;
1832 if ( tr.attr.fields.type != ((taskswitch_reason == TSW_iret) ? 0xb : 0x9) )
1834 hvm_inject_exception(
1835 (taskswitch_reason == TSW_iret) ? TRAP_invalid_tss : TRAP_gp_fault,
1836 tss_sel & 0xfff8, 0);
1837 goto out;
1840 if ( tr.limit < (sizeof(tss)-1) )
1842 hvm_inject_exception(TRAP_invalid_tss, tss_sel & 0xfff8, 0);
1843 goto out;
1846 rc = hvm_copy_from_guest_virt(
1847 &tss, prev_tr.base, sizeof(tss), PFEC_page_present);
1848 if ( rc == HVMCOPY_bad_gva_to_gfn )
1849 goto out;
1850 if ( rc == HVMCOPY_gfn_paged_out )
1851 goto out;
1852 if ( rc == HVMCOPY_gfn_shared )
1853 goto out;
1855 eflags = regs->eflags;
1856 if ( taskswitch_reason == TSW_iret )
1857 eflags &= ~X86_EFLAGS_NT;
1859 tss.cr3 = v->arch.hvm_vcpu.guest_cr[3];
1860 tss.eip = regs->eip;
1861 tss.eflags = eflags;
1862 tss.eax = regs->eax;
1863 tss.ecx = regs->ecx;
1864 tss.edx = regs->edx;
1865 tss.ebx = regs->ebx;
1866 tss.esp = regs->esp;
1867 tss.ebp = regs->ebp;
1868 tss.esi = regs->esi;
1869 tss.edi = regs->edi;
1871 hvm_get_segment_register(v, x86_seg_es, &segr);
1872 tss.es = segr.sel;
1873 hvm_get_segment_register(v, x86_seg_cs, &segr);
1874 tss.cs = segr.sel;
1875 hvm_get_segment_register(v, x86_seg_ss, &segr);
1876 tss.ss = segr.sel;
1877 hvm_get_segment_register(v, x86_seg_ds, &segr);
1878 tss.ds = segr.sel;
1879 hvm_get_segment_register(v, x86_seg_fs, &segr);
1880 tss.fs = segr.sel;
1881 hvm_get_segment_register(v, x86_seg_gs, &segr);
1882 tss.gs = segr.sel;
1883 hvm_get_segment_register(v, x86_seg_ldtr, &segr);
1884 tss.ldt = segr.sel;
1886 rc = hvm_copy_to_guest_virt(
1887 prev_tr.base, &tss, sizeof(tss), PFEC_page_present);
1888 if ( rc == HVMCOPY_bad_gva_to_gfn )
1889 goto out;
1890 if ( rc == HVMCOPY_gfn_paged_out )
1891 goto out;
1892 if ( rc == HVMCOPY_gfn_shared )
1893 goto out;
1895 rc = hvm_copy_from_guest_virt(
1896 &tss, tr.base, sizeof(tss), PFEC_page_present);
1897 if ( rc == HVMCOPY_bad_gva_to_gfn )
1898 goto out;
1899 if ( rc == HVMCOPY_gfn_paged_out )
1900 goto out;
1901 /* Note: this could be optimised if the callee functions knew we only
1902 * want read-only access. */
1903 if ( rc == HVMCOPY_gfn_shared )
1904 goto out;
1907 if ( hvm_set_cr3(tss.cr3) )
1908 goto out;
1910 regs->eip = tss.eip;
1911 regs->eflags = tss.eflags | 2;
1912 regs->eax = tss.eax;
1913 regs->ecx = tss.ecx;
1914 regs->edx = tss.edx;
1915 regs->ebx = tss.ebx;
1916 regs->esp = tss.esp;
1917 regs->ebp = tss.ebp;
1918 regs->esi = tss.esi;
1919 regs->edi = tss.edi;
1921 if ( (taskswitch_reason == TSW_call_or_int) )
1923 regs->eflags |= X86_EFLAGS_NT;
1924 tss.back_link = prev_tr.sel;
1927 exn_raised = 0;
1928 if ( hvm_load_segment_selector(x86_seg_ldtr, tss.ldt) ||
1929 hvm_load_segment_selector(x86_seg_es, tss.es) ||
1930 hvm_load_segment_selector(x86_seg_cs, tss.cs) ||
1931 hvm_load_segment_selector(x86_seg_ss, tss.ss) ||
1932 hvm_load_segment_selector(x86_seg_ds, tss.ds) ||
1933 hvm_load_segment_selector(x86_seg_fs, tss.fs) ||
1934 hvm_load_segment_selector(x86_seg_gs, tss.gs) )
1935 exn_raised = 1;
1937 rc = hvm_copy_to_guest_virt(
1938 tr.base, &tss, sizeof(tss), PFEC_page_present);
1939 if ( rc == HVMCOPY_bad_gva_to_gfn )
1940 exn_raised = 1;
1941 if ( rc == HVMCOPY_gfn_paged_out )
1942 goto out;
1943 if ( rc == HVMCOPY_gfn_shared )
1944 goto out;
1946 if ( (tss.trace & 1) && !exn_raised )
1947 hvm_inject_exception(TRAP_debug, tss_sel & 0xfff8, 0);
1949 tr.attr.fields.type = 0xb; /* busy 32-bit tss */
1950 hvm_set_segment_register(v, x86_seg_tr, &tr);
1952 v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_TS;
1953 hvm_update_guest_cr(v, 0);
1955 if ( (taskswitch_reason == TSW_iret) ||
1956 (taskswitch_reason == TSW_jmp) )
1957 clear_bit(41, optss_desc); /* clear B flag of old task */
1959 if ( taskswitch_reason != TSW_iret )
1960 set_bit(41, nptss_desc); /* set B flag of new task */
1962 if ( errcode >= 0 )
1964 struct segment_register reg;
1965 unsigned long linear_addr;
1966 regs->esp -= 4;
1967 hvm_get_segment_register(current, x86_seg_ss, &reg);
1968 /* Todo: do not ignore access faults here. */
1969 if ( hvm_virtual_to_linear_addr(x86_seg_ss, &reg, regs->esp,
1970 4, hvm_access_write, 32,
1971 &linear_addr) )
1972 hvm_copy_to_guest_virt_nofault(linear_addr, &errcode, 4, 0);
1975 out:
1976 hvm_unmap_entry(optss_desc);
1977 hvm_unmap_entry(nptss_desc);
1980 #define HVMCOPY_from_guest (0u<<0)
1981 #define HVMCOPY_to_guest (1u<<0)
1982 #define HVMCOPY_no_fault (0u<<1)
1983 #define HVMCOPY_fault (1u<<1)
1984 #define HVMCOPY_phys (0u<<2)
1985 #define HVMCOPY_virt (1u<<2)
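/* __hvm_copy() backs all of the hvm_copy_to/from_guest_{phys,virt}* and
 * hvm_fetch_from_guest_virt* wrappers below: 'flags' selects the direction
 * of the copy, whether a failed virtual-address translation injects #PF
 * (HVMCOPY_fault) or silently returns HVMCOPY_bad_gva_to_gfn, and whether
 * 'addr' is a guest-virtual or guest-physical address. */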
1986 static enum hvm_copy_result __hvm_copy(
1987 void *buf, paddr_t addr, int size, unsigned int flags, uint32_t pfec)
1989 struct vcpu *curr = current;
1990 struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
1991 unsigned long gfn, mfn;
1992 p2m_type_t p2mt;
1993 char *p;
1994 int count, todo = size;
1996 /*
1997 * XXX Disable for 4.1.0: PV-on-HVM drivers will do grant-table ops
1998 * such as query_size. Grant-table code currently does copy_to/from_guest
1999 * accesses under the big per-domain lock, which this test would disallow.
2000 * The test is not needed until we implement sleeping-on-waitqueue when
2001 * we access a paged-out frame, and that's post 4.1.0 now.
2002 */
2003 #if 0
2004 /*
2005 * If the required guest memory is paged out, this function may sleep.
2006 * Hence we bail immediately if called from atomic context.
2007 */
2008 if ( in_atomic() )
2009 return HVMCOPY_unhandleable;
2010 #endif
2012 while ( todo > 0 )
2014 count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo);
2016 if ( flags & HVMCOPY_virt )
2018 gfn = paging_gva_to_gfn(curr, addr, &pfec);
2019 if ( gfn == INVALID_GFN )
2021 if ( pfec == PFEC_page_paged )
2022 return HVMCOPY_gfn_paged_out;
2023 if ( pfec == PFEC_page_shared )
2024 return HVMCOPY_gfn_shared;
2025 if ( flags & HVMCOPY_fault )
2026 hvm_inject_exception(TRAP_page_fault, pfec, addr);
2027 return HVMCOPY_bad_gva_to_gfn;
2030 else
2032 gfn = addr >> PAGE_SHIFT;
2035 mfn = mfn_x(gfn_to_mfn_unshare(p2m, gfn, &p2mt, 0));
2037 if ( p2m_is_paging(p2mt) )
2039 p2m_mem_paging_populate(p2m, gfn);
2040 return HVMCOPY_gfn_paged_out;
2042 if ( p2m_is_shared(p2mt) )
2043 return HVMCOPY_gfn_shared;
2044 if ( p2m_is_grant(p2mt) )
2045 return HVMCOPY_unhandleable;
2046 if ( !p2m_is_ram(p2mt) )
2047 return HVMCOPY_bad_gfn_to_mfn;
2048 ASSERT(mfn_valid(mfn));
2050 p = (char *)map_domain_page(mfn) + (addr & ~PAGE_MASK);
2052 if ( flags & HVMCOPY_to_guest )
2054 if ( p2mt == p2m_ram_ro )
2056 static unsigned long lastpage;
2057 if ( xchg(&lastpage, gfn) != gfn )
2058 gdprintk(XENLOG_DEBUG, "guest attempted write to read-only"
2059 " memory page. gfn=%#lx, mfn=%#lx\n",
2060 gfn, mfn);
2062 else
2064 memcpy(p, buf, count);
2065 paging_mark_dirty(curr->domain, mfn);
2068 else
2070 memcpy(buf, p, count);
2073 unmap_domain_page(p);
2075 addr += count;
2076 buf += count;
2077 todo -= count;
2080 return HVMCOPY_okay;
2083 enum hvm_copy_result hvm_copy_to_guest_phys(
2084 paddr_t paddr, void *buf, int size)
2086 return __hvm_copy(buf, paddr, size,
2087 HVMCOPY_to_guest | HVMCOPY_fault | HVMCOPY_phys,
2088 0);
2091 enum hvm_copy_result hvm_copy_from_guest_phys(
2092 void *buf, paddr_t paddr, int size)
2094 return __hvm_copy(buf, paddr, size,
2095 HVMCOPY_from_guest | HVMCOPY_fault | HVMCOPY_phys,
2096 0);
2099 enum hvm_copy_result hvm_copy_to_guest_virt(
2100 unsigned long vaddr, void *buf, int size, uint32_t pfec)
2102 return __hvm_copy(buf, vaddr, size,
2103 HVMCOPY_to_guest | HVMCOPY_fault | HVMCOPY_virt,
2104 PFEC_page_present | PFEC_write_access | pfec);
2107 enum hvm_copy_result hvm_copy_from_guest_virt(
2108 void *buf, unsigned long vaddr, int size, uint32_t pfec)
2110 return __hvm_copy(buf, vaddr, size,
2111 HVMCOPY_from_guest | HVMCOPY_fault | HVMCOPY_virt,
2112 PFEC_page_present | pfec);
2115 enum hvm_copy_result hvm_fetch_from_guest_virt(
2116 void *buf, unsigned long vaddr, int size, uint32_t pfec)
2118 if ( hvm_nx_enabled(current) )
2119 pfec |= PFEC_insn_fetch;
2120 return __hvm_copy(buf, vaddr, size,
2121 HVMCOPY_from_guest | HVMCOPY_fault | HVMCOPY_virt,
2122 PFEC_page_present | pfec);
2125 enum hvm_copy_result hvm_copy_to_guest_virt_nofault(
2126 unsigned long vaddr, void *buf, int size, uint32_t pfec)
2128 return __hvm_copy(buf, vaddr, size,
2129 HVMCOPY_to_guest | HVMCOPY_no_fault | HVMCOPY_virt,
2130 PFEC_page_present | PFEC_write_access | pfec);
2133 enum hvm_copy_result hvm_copy_from_guest_virt_nofault(
2134 void *buf, unsigned long vaddr, int size, uint32_t pfec)
2136 return __hvm_copy(buf, vaddr, size,
2137 HVMCOPY_from_guest | HVMCOPY_no_fault | HVMCOPY_virt,
2138 PFEC_page_present | pfec);
2141 enum hvm_copy_result hvm_fetch_from_guest_virt_nofault(
2142 void *buf, unsigned long vaddr, int size, uint32_t pfec)
2144 if ( hvm_nx_enabled(current) )
2145 pfec |= PFEC_insn_fetch;
2146 return __hvm_copy(buf, vaddr, size,
2147 HVMCOPY_from_guest | HVMCOPY_no_fault | HVMCOPY_virt,
2148 PFEC_page_present | pfec);
2151 unsigned long copy_to_user_hvm(void *to, const void *from, unsigned int len)
2153 int rc;
2155 #ifdef __x86_64__
2156 if ( !current->arch.hvm_vcpu.hcall_64bit &&
2157 is_compat_arg_xlat_range(to, len) )
2159 memcpy(to, from, len);
2160 return 0;
2162 #endif
2164 rc = hvm_copy_to_guest_virt_nofault((unsigned long)to, (void *)from,
2165 len, 0);
2166 return rc ? len : 0; /* fake a copy_to_user() return code */
2169 unsigned long copy_from_user_hvm(void *to, const void *from, unsigned len)
2171 int rc;
2173 #ifdef __x86_64__
2174 if ( !current->arch.hvm_vcpu.hcall_64bit &&
2175 is_compat_arg_xlat_range(from, len) )
2177 memcpy(to, from, len);
2178 return 0;
2180 #endif
2182 rc = hvm_copy_from_guest_virt_nofault(to, (unsigned long)from, len, 0);
2183 return rc ? len : 0; /* fake a copy_from_user() return code */
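/*
 * copy_to_user_hvm()/copy_from_user_hvm() let common hypercall code reuse
 * the copy_to/from_user interface for HVM guests: the nofault copies above
 * return an hvm_copy_result, which is collapsed here into the usual
 * "0 on success, length on failure" convention. The __x86_64__ fast path
 * handles 32-bit hypercall arguments that have already been bounced into
 * the compat translation area, which is hypervisor memory and can be
 * copied with a plain memcpy().
 */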
2186 #define bitmaskof(idx) (1U << ((idx) & 31))
2187 void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
2188 unsigned int *ecx, unsigned int *edx)
2190 struct vcpu *v = current;
2191 unsigned int count = *ecx;
2193 if ( cpuid_viridian_leaves(input, eax, ebx, ecx, edx) )
2194 return;
2196 if ( cpuid_hypervisor_leaves(input, count, eax, ebx, ecx, edx) )
2197 return;
2199 domain_cpuid(v->domain, input, *ecx, eax, ebx, ecx, edx);
2201 switch ( input )
2203 case 0x1:
2204 /* Fix up VLAPIC details. */
2205 *ebx &= 0x00FFFFFFu;
2206 *ebx |= (v->vcpu_id * 2) << 24;
2207 if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
2208 __clear_bit(X86_FEATURE_APIC & 31, edx);
2210 /* Fix up OSXSAVE. */
2211 if ( cpu_has_xsave )
2212 *ecx |= (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE) ?
2213 bitmaskof(X86_FEATURE_OSXSAVE) : 0;
2214 break;
2215 case 0xb:
2216 /* Fix the x2APIC identifier. */
2217 *edx = v->vcpu_id * 2;
2218 break;
2219 case 0xd:
2221 unsigned int sub_leaf, _eax, _ebx, _ecx, _edx;
2222 /* EBX value of main leaf 0 depends on enabled xsave features */
2223 if ( count == 0 && v->arch.xcr0 )
2225 for ( sub_leaf = 2;
2226 (sub_leaf < 64) && (v->arch.xcr0 & (1ULL << sub_leaf));
2227 sub_leaf++ )
2229 domain_cpuid(v->domain, input, sub_leaf, &_eax, &_ebx, &_ecx,
2230 &_edx);
2231 if ( (_eax + _ebx) > *ebx )
2232 *ebx = _eax + _ebx;
2235 break;
2237 case 0x80000001:
2238 /* We expose the RDTSCP feature to the guest only when
2239 tsc_mode == TSC_MODE_DEFAULT and host_tsc_is_safe() returns 1. */
2240 if ( v->domain->arch.tsc_mode != TSC_MODE_DEFAULT ||
2241 !host_tsc_is_safe() )
2242 *edx &= ~bitmaskof(X86_FEATURE_RDTSCP);
2243 break;
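/*
 * Worked example of the leaf fix-ups above, assuming vcpu_id == 3:
 * leaf 1 sets EBX[31:24] (the initial APIC ID) to 3 * 2 = 6, matching the
 * x2APIC ID reported in EDX for leaf 0xb (also vcpu_id * 2).
 */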
2247 void hvm_rdtsc_intercept(struct cpu_user_regs *regs)
2249 uint64_t tsc;
2250 struct vcpu *v = current;
2252 tsc = hvm_get_guest_tsc(v);
2253 regs->eax = (uint32_t)tsc;
2254 regs->edx = (uint32_t)(tsc >> 32);
2256 HVMTRACE_2D(RDTSC, regs->eax, regs->edx);
2259 int hvm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
2261 struct vcpu *v = current;
2262 uint64_t *var_range_base, *fixed_range_base;
2263 int index, mtrr;
2264 uint32_t cpuid[4];
2265 int ret = X86EMUL_OKAY;
2267 var_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.var_ranges;
2268 fixed_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.fixed_ranges;
2270 hvm_cpuid(1, &cpuid[0], &cpuid[1], &cpuid[2], &cpuid[3]);
2271 mtrr = !!(cpuid[3] & bitmaskof(X86_FEATURE_MTRR));
2273 switch ( msr )
2275 case MSR_EFER:
2276 *msr_content = v->arch.hvm_vcpu.guest_efer;
2277 break;
2279 case MSR_IA32_TSC:
2280 *msr_content = hvm_get_guest_tsc(v);
2281 break;
2283 case MSR_TSC_AUX:
2284 *msr_content = hvm_msr_tsc_aux(v);
2285 break;
2287 case MSR_IA32_APICBASE:
2288 *msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2289 break;
2291 case MSR_IA32_APICBASE_MSR ... MSR_IA32_APICBASE_MSR + 0x3ff:
2292 if ( hvm_x2apic_msr_read(v, msr, msr_content) )
2293 goto gp_fault;
2294 break;
2296 case MSR_IA32_TSC_DEADLINE:
2297 *msr_content = vlapic_tdt_msr_get(vcpu_vlapic(v));
2298 break;
2300 case MSR_IA32_CR_PAT:
2301 *msr_content = v->arch.hvm_vcpu.pat_cr;
2302 break;
2304 case MSR_MTRRcap:
2305 if ( !mtrr )
2306 goto gp_fault;
2307 *msr_content = v->arch.hvm_vcpu.mtrr.mtrr_cap;
2308 break;
2309 case MSR_MTRRdefType:
2310 if ( !mtrr )
2311 goto gp_fault;
2312 *msr_content = v->arch.hvm_vcpu.mtrr.def_type
2313 | (v->arch.hvm_vcpu.mtrr.enabled << 10);
2314 break;
2315 case MSR_MTRRfix64K_00000:
2316 if ( !mtrr )
2317 goto gp_fault;
2318 *msr_content = fixed_range_base[0];
2319 break;
2320 case MSR_MTRRfix16K_80000:
2321 case MSR_MTRRfix16K_A0000:
2322 if ( !mtrr )
2323 goto gp_fault;
2324 index = msr - MSR_MTRRfix16K_80000;
2325 *msr_content = fixed_range_base[index + 1];
2326 break;
2327 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
2328 if ( !mtrr )
2329 goto gp_fault;
2330 index = msr - MSR_MTRRfix4K_C0000;
2331 *msr_content = fixed_range_base[index + 3];
2332 break;
2333 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
2334 if ( !mtrr )
2335 goto gp_fault;
2336 index = msr - MSR_IA32_MTRR_PHYSBASE0;
2337 *msr_content = var_range_base[index];
2338 break;
2340 case MSR_K8_ENABLE_C1E:
2341 case MSR_AMD64_NB_CFG:
2342 /*
2343 * These AMD-only registers may be accessed if this HVM guest
2344 * has been migrated to an Intel host. This fixes a guest crash
2345 * in this case.
2346 */
2347 *msr_content = 0;
2348 break;
2350 default:
2351 if ( (ret = vmce_rdmsr(msr, msr_content)) < 0 )
2352 goto gp_fault;
2353 /* If ret == 0 then this is not an MCE MSR, see other MSRs. */
2354 ret = ((ret == 0)
2355 ? hvm_funcs.msr_read_intercept(msr, msr_content)
2356 : X86EMUL_OKAY);
2357 break;
2360 out:
2361 HVMTRACE_3D(MSR_READ, msr,
2362 (uint32_t)*msr_content, (uint32_t)(*msr_content >> 32));
2363 return ret;
2365 gp_fault:
2366 hvm_inject_exception(TRAP_gp_fault, 0, 0);
2367 ret = X86EMUL_EXCEPTION;
2368 *msr_content = -1ull;
2369 goto out;
2372 int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content)
2374 struct vcpu *v = current;
2375 int index, mtrr;
2376 uint32_t cpuid[4];
2377 int ret = X86EMUL_OKAY;
2379 HVMTRACE_3D(MSR_WRITE, msr,
2380 (uint32_t)msr_content, (uint32_t)(msr_content >> 32));
2382 hvm_cpuid(1, &cpuid[0], &cpuid[1], &cpuid[2], &cpuid[3]);
2383 mtrr = !!(cpuid[3] & bitmaskof(X86_FEATURE_MTRR));
2385 switch ( msr )
2387 case MSR_EFER:
2388 if ( hvm_set_efer(msr_content) )
2389 return X86EMUL_EXCEPTION;
2390 break;
2392 case MSR_IA32_TSC:
2393 hvm_set_guest_tsc(v, msr_content);
2394 break;
2396 case MSR_TSC_AUX:
2397 v->arch.hvm_vcpu.msr_tsc_aux = (uint32_t)msr_content;
2398 if ( cpu_has_rdtscp
2399 && (v->domain->arch.tsc_mode != TSC_MODE_PVRDTSCP) )
2400 wrmsrl(MSR_TSC_AUX, (uint32_t)msr_content);
2401 break;
2403 case MSR_IA32_APICBASE:
2404 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2405 break;
2407 case MSR_IA32_TSC_DEADLINE:
2408 vlapic_tdt_msr_set(vcpu_vlapic(v), msr_content);
2409 break;
2411 case MSR_IA32_APICBASE_MSR ... MSR_IA32_APICBASE_MSR + 0x3ff:
2412 if ( hvm_x2apic_msr_write(v, msr, msr_content) )
2413 goto gp_fault;
2414 break;
2416 case MSR_IA32_CR_PAT:
2417 if ( !pat_msr_set(&v->arch.hvm_vcpu.pat_cr, msr_content) )
2418 goto gp_fault;
2419 break;
2421 case MSR_MTRRcap:
2422 if ( !mtrr )
2423 goto gp_fault;
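/* MSR_MTRRcap is read-only: a write faults even with MTRRs present. */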
2424 goto gp_fault;
2425 case MSR_MTRRdefType:
2426 if ( !mtrr )
2427 goto gp_fault;
2428 if ( !mtrr_def_type_msr_set(&v->arch.hvm_vcpu.mtrr, msr_content) )
2429 goto gp_fault;
2430 break;
2431 case MSR_MTRRfix64K_00000:
2432 if ( !mtrr )
2433 goto gp_fault;
2434 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr, 0, msr_content) )
2435 goto gp_fault;
2436 break;
2437 case MSR_MTRRfix16K_80000:
2438 case MSR_MTRRfix16K_A0000:
2439 if ( !mtrr )
2440 goto gp_fault;
2441 index = msr - MSR_MTRRfix16K_80000 + 1;
2442 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2443 index, msr_content) )
2444 goto gp_fault;
2445 break;
2446 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
2447 if ( !mtrr )
2448 goto gp_fault;
2449 index = msr - MSR_MTRRfix4K_C0000 + 3;
2450 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2451 index, msr_content) )
2452 goto gp_fault;
2453 break;
2454 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
2455 if ( !mtrr )
2456 goto gp_fault;
2457 if ( !mtrr_var_range_msr_set(v->domain, &v->arch.hvm_vcpu.mtrr,
2458 msr, msr_content) )
2459 goto gp_fault;
2460 break;
2462 case MSR_AMD64_NB_CFG:
2463 /* ignore the write */
2464 break;
2466 default:
2467 if ( (ret = vmce_wrmsr(msr, msr_content)) < 0 )
2468 goto gp_fault;
2469 /* If ret == 0 then this is not an MCE MSR, see other MSRs. */
2470 ret = ((ret == 0)
2471 ? hvm_funcs.msr_write_intercept(msr, msr_content)
2472 : X86EMUL_OKAY);
2473 break;
2476 return ret;
2478 gp_fault:
2479 hvm_inject_exception(TRAP_gp_fault, 0, 0);
2480 return X86EMUL_EXCEPTION;
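/*
 * Both MSR intercepts follow the same dispatch pattern: handle the
 * architectural and paravirtual MSRs here, give vmce_rdmsr()/vmce_wrmsr()
 * first refusal on the machine-check MSRs, and only then fall back to the
 * vendor hook hvm_funcs.msr_read_intercept()/msr_write_intercept().
 * Returning X86EMUL_EXCEPTION after hvm_inject_exception() tells the
 * caller that a #GP has already been queued for the guest.
 */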
2483 enum hvm_intblk hvm_interrupt_blocked(struct vcpu *v, struct hvm_intack intack)
2485 unsigned long intr_shadow;
2487 ASSERT(v == current);
2489 if ( (intack.source != hvm_intsrc_nmi) &&
2490 !(guest_cpu_user_regs()->eflags & X86_EFLAGS_IF) )
2491 return hvm_intblk_rflags_ie;
2493 intr_shadow = hvm_funcs.get_interrupt_shadow(v);
2495 if ( intr_shadow & (HVM_INTR_SHADOW_STI|HVM_INTR_SHADOW_MOV_SS) )
2496 return hvm_intblk_shadow;
2498 if ( intack.source == hvm_intsrc_nmi )
2499 return ((intr_shadow & HVM_INTR_SHADOW_NMI) ?
2500 hvm_intblk_nmi_iret : hvm_intblk_none);
2502 if ( intack.source == hvm_intsrc_lapic )
2504 uint32_t tpr = vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xF0;
2505 if ( (tpr >> 4) >= (intack.vector >> 4) )
2506 return hvm_intblk_tpr;
2509 return hvm_intblk_none;
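/*
 * The TPR check above compares priority classes (vector bits 7:4). For
 * example, with TASKPRI = 0x50 a pending LAPIC vector 0x45 (class 4 <= 5)
 * is reported as hvm_intblk_tpr, whereas vector 0x61 (class 6 > 5) is
 * deliverable.
 */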
2512 static int grant_table_op_is_allowed(unsigned int cmd)
2514 switch (cmd) {
2515 case GNTTABOP_query_size:
2516 case GNTTABOP_setup_table:
2517 case GNTTABOP_set_version:
2518 case GNTTABOP_copy:
2519 case GNTTABOP_map_grant_ref:
2520 case GNTTABOP_unmap_grant_ref:
2521 return 1;
2522 default:
2523 /* all other commands need auditing */
2524 return 0;
2528 static long hvm_grant_table_op(
2529 unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int count)
2531 if ( !grant_table_op_is_allowed(cmd) )
2532 return -ENOSYS; /* all other commands need auditing */
2533 return do_grant_table_op(cmd, uop, count);
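/*
 * Grant-table commands are whitelisted rather than passed through
 * wholesale: anything not listed in grant_table_op_is_allowed(), e.g.
 * GNTTABOP_transfer, fails with -ENOSYS until it has been audited for
 * use by HVM guests.
 */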
2536 static long hvm_memory_op(int cmd, XEN_GUEST_HANDLE(void) arg)
2538 long rc = do_memory_op(cmd, arg);
2539 if ( (cmd & MEMOP_CMD_MASK) == XENMEM_decrease_reservation )
2540 current->domain->arch.hvm_domain.qemu_mapcache_invalidate = 1;
2541 return rc;
2544 static long hvm_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg)
2546 switch ( cmd )
2548 case PHYSDEVOP_map_pirq:
2549 case PHYSDEVOP_unmap_pirq:
2550 case PHYSDEVOP_eoi:
2551 case PHYSDEVOP_irq_status_query:
2552 case PHYSDEVOP_get_free_pirq:
2553 return do_physdev_op(cmd, arg);
2554 default:
2555 return -ENOSYS;
2559 static long hvm_vcpu_op(
2560 int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
2562 long rc;
2564 switch ( cmd )
2566 case VCPUOP_register_runstate_memory_area:
2567 case VCPUOP_get_runstate_info:
2568 case VCPUOP_set_periodic_timer:
2569 case VCPUOP_stop_periodic_timer:
2570 case VCPUOP_set_singleshot_timer:
2571 case VCPUOP_stop_singleshot_timer:
2572 rc = do_vcpu_op(cmd, vcpuid, arg);
2573 break;
2574 default:
2575 rc = -ENOSYS;
2576 break;
2579 return rc;
2582 typedef unsigned long hvm_hypercall_t(
2583 unsigned long, unsigned long, unsigned long, unsigned long, unsigned long,
2584 unsigned long);
2586 #define HYPERCALL(x) \
2587 [ __HYPERVISOR_ ## x ] = (hvm_hypercall_t *) do_ ## x
2589 #if defined(__i386__)
2591 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
2592 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op,
2593 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
2594 [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op,
2595 [ __HYPERVISOR_physdev_op ] = (hvm_hypercall_t *)hvm_physdev_op,
2596 HYPERCALL(xen_version),
2597 HYPERCALL(event_channel_op),
2598 HYPERCALL(sched_op),
2599 HYPERCALL(set_timer_op),
2600 HYPERCALL(hvm_op),
2601 HYPERCALL(sysctl),
2602 HYPERCALL(tmem_op)
2603 };
2605 #else /* defined(__x86_64__) */
2607 static long hvm_grant_table_op_compat32(unsigned int cmd,
2608 XEN_GUEST_HANDLE(void) uop,
2609 unsigned int count)
2611 if ( !grant_table_op_is_allowed(cmd) )
2612 return -ENOSYS;
2613 return compat_grant_table_op(cmd, uop, count);
2616 static long hvm_memory_op_compat32(int cmd, XEN_GUEST_HANDLE(void) arg)
2618 long rc = compat_memory_op(cmd, arg);
2619 if ( (cmd & MEMOP_CMD_MASK) == XENMEM_decrease_reservation )
2620 current->domain->arch.hvm_domain.qemu_mapcache_invalidate = 1;
2621 return rc;
2624 static long hvm_vcpu_op_compat32(
2625 int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
2627 long rc;
2629 switch ( cmd )
2631 case VCPUOP_register_runstate_memory_area:
2632 case VCPUOP_get_runstate_info:
2633 case VCPUOP_set_periodic_timer:
2634 case VCPUOP_stop_periodic_timer:
2635 case VCPUOP_set_singleshot_timer:
2636 case VCPUOP_stop_singleshot_timer:
2637 rc = compat_vcpu_op(cmd, vcpuid, arg);
2638 break;
2639 default:
2640 rc = -ENOSYS;
2641 break;
2644 return rc;
2647 static long hvm_physdev_op_compat32(
2648 int cmd, XEN_GUEST_HANDLE(void) arg)
2650 switch ( cmd )
2652 case PHYSDEVOP_map_pirq:
2653 case PHYSDEVOP_unmap_pirq:
2654 case PHYSDEVOP_eoi:
2655 case PHYSDEVOP_irq_status_query:
2656 case PHYSDEVOP_get_free_pirq:
2657 return compat_physdev_op(cmd, arg);
2658 break;
2659 default:
2660 return -ENOSYS;
2661 break;
2665 static hvm_hypercall_t *hvm_hypercall64_table[NR_hypercalls] = {
2666 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op,
2667 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
2668 [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op,
2669 [ __HYPERVISOR_physdev_op ] = (hvm_hypercall_t *)hvm_physdev_op,
2670 HYPERCALL(xen_version),
2671 HYPERCALL(event_channel_op),
2672 HYPERCALL(sched_op),
2673 HYPERCALL(set_timer_op),
2674 HYPERCALL(hvm_op),
2675 HYPERCALL(sysctl),
2676 HYPERCALL(tmem_op)
2677 };
2679 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
2680 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op_compat32,
2681 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op_compat32,
2682 [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op_compat32,
2683 [ __HYPERVISOR_physdev_op ] = (hvm_hypercall_t *)hvm_physdev_op_compat32,
2684 HYPERCALL(xen_version),
2685 HYPERCALL(event_channel_op),
2686 HYPERCALL(sched_op),
2687 HYPERCALL(set_timer_op),
2688 HYPERCALL(hvm_op),
2689 HYPERCALL(sysctl),
2690 HYPERCALL(tmem_op)
2691 };
2693 #endif /* defined(__x86_64__) */
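/*
 * hvm_do_hypercall() below indexes these tables directly by the hypercall
 * number in EAX/RAX. A rough sketch of the guest side, assuming
 * hypercall_page is a byte pointer to the page set up by
 * hvm_hypercall_page_initialise() (each stub occupies a 32-byte slot):
 *
 *   rc = ((long (*)(unsigned int, void *))
 *         (hypercall_page + __HYPERVISOR_xen_version * 32))
 *            (XENVER_version, NULL);
 */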
2695 int hvm_do_hypercall(struct cpu_user_regs *regs)
2697 struct vcpu *curr = current;
2698 struct segment_register sreg;
2699 int mode = hvm_guest_x86_mode(curr);
2700 uint32_t eax = regs->eax;
2702 switch ( mode )
2704 #ifdef __x86_64__
2705 case 8:
2706 #endif
2707 case 4:
2708 case 2:
2709 hvm_get_segment_register(curr, x86_seg_ss, &sreg);
2710 if ( unlikely(sreg.attr.fields.dpl == 3) )
2712 default:
2713 regs->eax = -EPERM;
2714 return HVM_HCALL_completed;
2716 case 0:
2717 break;
2720 if ( (eax & 0x80000000) && is_viridian_domain(curr->domain) )
2721 return viridian_hypercall(regs);
2723 if ( (eax >= NR_hypercalls) || !hvm_hypercall32_table[eax] )
2725 regs->eax = -ENOSYS;
2726 return HVM_HCALL_completed;
2729 curr->arch.hvm_vcpu.hcall_preempted = 0;
2731 #ifdef __x86_64__
2732 if ( mode == 8 )
2734 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%lx, %lx, %lx, %lx, %lx, %lx)",
2735 eax, regs->rdi, regs->rsi, regs->rdx,
2736 regs->r10, regs->r8, regs->r9);
2738 curr->arch.hvm_vcpu.hcall_64bit = 1;
2739 regs->rax = hvm_hypercall64_table[eax](regs->rdi,
2740 regs->rsi,
2741 regs->rdx,
2742 regs->r10,
2743 regs->r8,
2744 regs->r9);
2745 curr->arch.hvm_vcpu.hcall_64bit = 0;
2747 else
2748 #endif
2750 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%x, %x, %x, %x, %x, %x)", eax,
2751 (uint32_t)regs->ebx, (uint32_t)regs->ecx,
2752 (uint32_t)regs->edx, (uint32_t)regs->esi,
2753 (uint32_t)regs->edi, (uint32_t)regs->ebp);
2755 regs->eax = hvm_hypercall32_table[eax]((uint32_t)regs->ebx,
2756 (uint32_t)regs->ecx,
2757 (uint32_t)regs->edx,
2758 (uint32_t)regs->esi,
2759 (uint32_t)regs->edi,
2760 (uint32_t)regs->ebp);
2763 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u -> %lx",
2764 eax, (unsigned long)regs->eax);
2766 if ( curr->arch.hvm_vcpu.hcall_preempted )
2767 return HVM_HCALL_preempted;
2769 if ( unlikely(curr->domain->arch.hvm_domain.qemu_mapcache_invalidate) &&
2770 test_and_clear_bool(curr->domain->arch.hvm_domain.
2771 qemu_mapcache_invalidate) )
2772 return HVM_HCALL_invalidate;
2774 return HVM_HCALL_completed;
2777 static void hvm_latch_shinfo_size(struct domain *d)
2779 bool_t new_has_32bit;
2781 /*
2782 * Called from operations which are among the very first executed by
2783 * PV drivers on initialisation or after save/restore. These are sensible
2784 * points at which to sample the execution mode of the guest and latch
2785 * 32- or 64-bit format for shared state.
2786 */
2787 if ( current->domain == d ) {
2788 new_has_32bit = (hvm_guest_x86_mode(current) != 8);
2789 if (new_has_32bit != d->arch.has_32bit_shinfo) {
2790 d->arch.has_32bit_shinfo = new_has_32bit;
2791 /*
2792 * Make sure that the timebase in the shared info
2793 * structure is correct for its new bit-ness. We should
2794 * arguably try to convert the other fields as well, but
2795 * that's much more problematic (e.g. what do you do if
2796 * you're going from 64 bit to 32 bit and there's an event
2797 * channel pending which doesn't exist in the 32 bit
2798 * version?). Just setting the wallclock time seems to be
2799 * sufficient for everything we do, even if it is a bit of
2800 * a hack.
2801 */
2802 update_domain_wallclock_time(d);
2807 /* Initialise a hypercall transfer page for an HVM domain using
2808 paravirtualised drivers. */
2809 void hvm_hypercall_page_initialise(struct domain *d,
2810 void *hypercall_page)
2812 hvm_latch_shinfo_size(d);
2813 hvm_funcs.init_hypercall_page(d, hypercall_page);
2816 static int hvmop_set_pci_intx_level(
2817 XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t) uop)
2819 struct xen_hvm_set_pci_intx_level op;
2820 struct domain *d;
2821 int rc;
2823 if ( copy_from_guest(&op, uop, 1) )
2824 return -EFAULT;
2826 if ( (op.domain > 0) || (op.bus > 0) || (op.device > 31) || (op.intx > 3) )
2827 return -EINVAL;
2829 d = rcu_lock_domain_by_id(op.domid);
2830 if ( d == NULL )
2831 return -ESRCH;
2833 rc = -EPERM;
2834 if ( !IS_PRIV_FOR(current->domain, d) )
2835 goto out;
2837 rc = -EINVAL;
2838 if ( !is_hvm_domain(d) )
2839 goto out;
2841 rc = xsm_hvm_set_pci_intx_level(d);
2842 if ( rc )
2843 goto out;
2845 rc = 0;
2846 switch ( op.level )
2848 case 0:
2849 hvm_pci_intx_deassert(d, op.device, op.intx);
2850 break;
2851 case 1:
2852 hvm_pci_intx_assert(d, op.device, op.intx);
2853 break;
2854 default:
2855 rc = -EINVAL;
2856 break;
2859 out:
2860 rcu_unlock_domain(d);
2861 return rc;
2864 void hvm_vcpu_reset_state(struct vcpu *v, uint16_t cs, uint16_t ip)
2866 struct domain *d = v->domain;
2867 struct vcpu_guest_context *ctxt;
2868 struct segment_register reg;
2870 BUG_ON(vcpu_runnable(v));
2872 domain_lock(d);
2874 if ( v->is_initialised )
2875 goto out;
2877 if ( !paging_mode_hap(d) )
2879 if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
2880 put_page(pagetable_get_page(v->arch.guest_table));
2881 v->arch.guest_table = pagetable_null();
2884 ctxt = &v->arch.guest_context;
2885 memset(ctxt, 0, sizeof(*ctxt));
2886 ctxt->flags = VGCF_online;
2887 ctxt->user_regs.eflags = 2;
2888 ctxt->user_regs.edx = 0x00000f00;
2889 ctxt->user_regs.eip = ip;
2891 v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_ET;
2892 hvm_update_guest_cr(v, 0);
2894 v->arch.hvm_vcpu.guest_cr[2] = 0;
2895 hvm_update_guest_cr(v, 2);
2897 v->arch.hvm_vcpu.guest_cr[3] = 0;
2898 hvm_update_guest_cr(v, 3);
2900 v->arch.hvm_vcpu.guest_cr[4] = 0;
2901 hvm_update_guest_cr(v, 4);
2903 v->arch.hvm_vcpu.guest_efer = 0;
2904 hvm_update_guest_efer(v);
2906 reg.sel = cs;
2907 reg.base = (uint32_t)reg.sel << 4;
2908 reg.limit = 0xffff;
2909 reg.attr.bytes = 0x09b;
2910 hvm_set_segment_register(v, x86_seg_cs, &reg);
2912 reg.sel = reg.base = 0;
2913 reg.limit = 0xffff;
2914 reg.attr.bytes = 0x093;
2915 hvm_set_segment_register(v, x86_seg_ds, &reg);
2916 hvm_set_segment_register(v, x86_seg_es, &reg);
2917 hvm_set_segment_register(v, x86_seg_fs, &reg);
2918 hvm_set_segment_register(v, x86_seg_gs, &reg);
2919 hvm_set_segment_register(v, x86_seg_ss, &reg);
2921 reg.attr.bytes = 0x82; /* LDT */
2922 hvm_set_segment_register(v, x86_seg_ldtr, &reg);
2924 reg.attr.bytes = 0x8b; /* 32-bit TSS (busy) */
2925 hvm_set_segment_register(v, x86_seg_tr, &reg);
2927 reg.attr.bytes = 0;
2928 hvm_set_segment_register(v, x86_seg_gdtr, &reg);
2929 hvm_set_segment_register(v, x86_seg_idtr, &reg);
2931 /* Sync AP's TSC with BSP's. */
2932 v->arch.hvm_vcpu.cache_tsc_offset =
2933 v->domain->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset;
2934 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
2936 paging_update_paging_modes(v);
2938 v->arch.flags |= TF_kernel_mode;
2939 v->is_initialised = 1;
2940 clear_bit(_VPF_down, &v->pause_flags);
2942 out:
2943 domain_unlock(d);
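/*
 * The state built above approximates the architectural INIT/reset state:
 * flat 64KiB real-mode segments with base = selector << 4, EFLAGS = 2,
 * EDX holding a fixed CPU signature (0x00000f00), and CS:IP chosen by the
 * caller. hvm_s3_suspend() below uses the classic reset vector,
 * hvm_vcpu_reset_state(v, 0xf000, 0xfff0).
 */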
2946 static void hvm_s3_suspend(struct domain *d)
2948 struct vcpu *v;
2950 domain_pause(d);
2951 domain_lock(d);
2953 if ( d->is_dying || (d->vcpu == NULL) || (d->vcpu[0] == NULL) ||
2954 test_and_set_bool(d->arch.hvm_domain.is_s3_suspended) )
2956 domain_unlock(d);
2957 domain_unpause(d);
2958 return;
2961 for_each_vcpu ( d, v )
2963 vlapic_reset(vcpu_vlapic(v));
2964 vcpu_reset(v);
2967 vpic_reset(d);
2968 vioapic_reset(d);
2969 pit_reset(d);
2970 rtc_reset(d);
2971 pmtimer_reset(d);
2972 hpet_reset(d);
2974 hvm_vcpu_reset_state(d->vcpu[0], 0xf000, 0xfff0);
2976 domain_unlock(d);
2979 static void hvm_s3_resume(struct domain *d)
2981 if ( test_and_clear_bool(d->arch.hvm_domain.is_s3_suspended) )
2982 domain_unpause(d);
2985 static int hvmop_set_isa_irq_level(
2986 XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t) uop)
2988 struct xen_hvm_set_isa_irq_level op;
2989 struct domain *d;
2990 int rc;
2992 if ( copy_from_guest(&op, uop, 1) )
2993 return -EFAULT;
2995 if ( op.isa_irq > 15 )
2996 return -EINVAL;
2998 d = rcu_lock_domain_by_id(op.domid);
2999 if ( d == NULL )
3000 return -ESRCH;
3002 rc = -EPERM;
3003 if ( !IS_PRIV_FOR(current->domain, d) )
3004 goto out;
3006 rc = -EINVAL;
3007 if ( !is_hvm_domain(d) )
3008 goto out;
3010 rc = xsm_hvm_set_isa_irq_level(d);
3011 if ( rc )
3012 goto out;
3014 rc = 0;
3015 switch ( op.level )
3017 case 0:
3018 hvm_isa_irq_deassert(d, op.isa_irq);
3019 break;
3020 case 1:
3021 hvm_isa_irq_assert(d, op.isa_irq);
3022 break;
3023 default:
3024 rc = -EINVAL;
3025 break;
3028 out:
3029 rcu_unlock_domain(d);
3030 return rc;
3033 static int hvmop_set_pci_link_route(
3034 XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t) uop)
3036 struct xen_hvm_set_pci_link_route op;
3037 struct domain *d;
3038 int rc;
3040 if ( copy_from_guest(&op, uop, 1) )
3041 return -EFAULT;
3043 if ( (op.link > 3) || (op.isa_irq > 15) )
3044 return -EINVAL;
3046 d = rcu_lock_domain_by_id(op.domid);
3047 if ( d == NULL )
3048 return -ESRCH;
3050 rc = -EPERM;
3051 if ( !IS_PRIV_FOR(current->domain, d) )
3052 goto out;
3054 rc = -EINVAL;
3055 if ( !is_hvm_domain(d) )
3056 goto out;
3058 rc = xsm_hvm_set_pci_link_route(d);
3059 if ( rc )
3060 goto out;
3062 rc = 0;
3063 hvm_set_pci_link_route(d, op.link, op.isa_irq);
3065 out:
3066 rcu_unlock_domain(d);
3067 return rc;
3070 static int hvmop_flush_tlb_all(void)
3072 struct domain *d = current->domain;
3073 struct vcpu *v;
3075 if ( !is_hvm_domain(d) )
3076 return -EINVAL;
3078 /* Avoid deadlock if more than one vcpu tries this at the same time. */
3079 if ( !spin_trylock(&d->hypercall_deadlock_mutex) )
3080 return -EAGAIN;
3082 /* Pause all other vcpus. */
3083 for_each_vcpu ( d, v )
3084 if ( v != current )
3085 vcpu_pause_nosync(v);
3087 /* Now that all VCPUs are signalled to deschedule, we wait... */
3088 for_each_vcpu ( d, v )
3089 if ( v != current )
3090 while ( !vcpu_runnable(v) && v->is_running )
3091 cpu_relax();
3093 /* All other vcpus are paused, safe to unlock now. */
3094 spin_unlock(&d->hypercall_deadlock_mutex);
3096 /* Flush paging-mode soft state (e.g., va->gfn cache; PAE PDPE cache). */
3097 for_each_vcpu ( d, v )
3098 paging_update_cr3(v);
3100 /* Flush all dirty TLBs. */
3101 flush_tlb_mask(&d->domain_dirty_cpumask);
3103 /* Done. */
3104 for_each_vcpu ( d, v )
3105 if ( v != current )
3106 vcpu_unpause(v);
3108 return 0;
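/*
 * The trylock on hypercall_deadlock_mutex is what keeps this safe when
 * several vCPUs issue HVMOP_flush_tlbs concurrently: a loser returns
 * -EAGAIN, which the tail of do_hvm_op() converts into a hypercall
 * continuation, so the guest retries instead of deadlocking inside the
 * pairwise vcpu_pause_nosync() handshake.
 */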
3111 long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
3114 struct domain *curr_d = current->domain;
3115 long rc = 0;
3117 switch ( op )
3119 case HVMOP_set_param:
3120 case HVMOP_get_param:
3122 struct xen_hvm_param a;
3123 struct hvm_ioreq_page *iorp;
3124 struct domain *d;
3125 struct vcpu *v;
3127 if ( copy_from_guest(&a, arg, 1) )
3128 return -EFAULT;
3130 if ( a.index >= HVM_NR_PARAMS )
3131 return -EINVAL;
3133 rc = rcu_lock_target_domain_by_id(a.domid, &d);
3134 if ( rc != 0 )
3135 return rc;
3137 rc = -EINVAL;
3138 if ( !is_hvm_domain(d) )
3139 goto param_fail;
3141 rc = xsm_hvm_param(d, op);
3142 if ( rc )
3143 goto param_fail;
3145 if ( op == HVMOP_set_param )
3147 rc = 0;
3149 switch ( a.index )
3151 case HVM_PARAM_IOREQ_PFN:
3152 iorp = &d->arch.hvm_domain.ioreq;
3153 if ( (rc = hvm_set_ioreq_page(d, iorp, a.value)) != 0 )
3154 break;
3155 spin_lock(&iorp->lock);
3156 if ( iorp->va != NULL )
3157 /* Initialise evtchn port info if VCPUs already created. */
3158 for_each_vcpu ( d, v )
3159 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
3160 spin_unlock(&iorp->lock);
3161 break;
3162 case HVM_PARAM_BUFIOREQ_PFN:
3163 iorp = &d->arch.hvm_domain.buf_ioreq;
3164 rc = hvm_set_ioreq_page(d, iorp, a.value);
3165 break;
3166 case HVM_PARAM_CALLBACK_IRQ:
3167 hvm_set_callback_via(d, a.value);
3168 hvm_latch_shinfo_size(d);
3169 break;
3170 case HVM_PARAM_TIMER_MODE:
3171 if ( a.value > HVMPTM_one_missed_tick_pending )
3172 rc = -EINVAL;
3173 break;
3174 case HVM_PARAM_VIRIDIAN:
3175 if ( a.value > 1 )
3176 rc = -EINVAL;
3177 break;
3178 case HVM_PARAM_IDENT_PT:
3179 /* Not reflexive, as we must domain_pause(). */
3180 rc = -EPERM;
3181 if ( curr_d == d )
3182 break;
3184 rc = -EINVAL;
3185 if ( d->arch.hvm_domain.params[a.index] != 0 )
3186 break;
3188 rc = 0;
3189 if ( !paging_mode_hap(d) )
3190 break;
3192 /*
3193 * Update GUEST_CR3 in each VMCS to point at identity map.
3194 * All foreign updates to guest state must synchronise on
3195 * the domctl_lock.
3196 */
3197 rc = -EAGAIN;
3198 if ( !domctl_lock_acquire() )
3199 break;
3201 rc = 0;
3202 domain_pause(d);
3203 d->arch.hvm_domain.params[a.index] = a.value;
3204 for_each_vcpu ( d, v )
3205 paging_update_cr3(v);
3206 domain_unpause(d);
3208 domctl_lock_release();
3209 break;
3210 case HVM_PARAM_DM_DOMAIN:
3211 /* Not reflexive, as we must domain_pause(). */
3212 rc = -EPERM;
3213 if ( curr_d == d )
3214 break;
3216 if ( a.value == DOMID_SELF )
3217 a.value = curr_d->domain_id;
3219 rc = 0;
3220 domain_pause(d); /* safe to change per-vcpu xen_port */
3221 iorp = &d->arch.hvm_domain.ioreq;
3222 for_each_vcpu ( d, v )
3224 int old_port, new_port;
3225 new_port = alloc_unbound_xen_event_channel(v, a.value);
3226 if ( new_port < 0 )
3228 rc = new_port;
3229 break;
3231 /* xchg() ensures that only we free_xen_event_channel() */
3232 old_port = xchg(&v->arch.hvm_vcpu.xen_port, new_port);
3233 free_xen_event_channel(v, old_port);
3234 spin_lock(&iorp->lock);
3235 if ( iorp->va != NULL )
3236 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
3237 spin_unlock(&iorp->lock);
3239 domain_unpause(d);
3240 break;
3241 case HVM_PARAM_ACPI_S_STATE:
3242 /* Not reflexive, as we must domain_pause(). */
3243 rc = -EPERM;
3244 if ( curr_d == d )
3245 break;
3247 rc = 0;
3248 if ( a.value == 3 )
3249 hvm_s3_suspend(d);
3250 else if ( a.value == 0 )
3251 hvm_s3_resume(d);
3252 else
3253 rc = -EINVAL;
3255 break;
3256 case HVM_PARAM_ACPI_IOPORTS_LOCATION:
3257 rc = pmtimer_change_ioport(d, a.value);
3258 break;
3259 case HVM_PARAM_MEMORY_EVENT_CR0:
3260 case HVM_PARAM_MEMORY_EVENT_CR3:
3261 case HVM_PARAM_MEMORY_EVENT_CR4:
3262 if ( d->domain_id == current->domain->domain_id )
3263 rc = -EPERM;
3264 break;
3265 case HVM_PARAM_MEMORY_EVENT_INT3:
3266 if ( d->domain_id == current->domain->domain_id )
3268 rc = -EPERM;
3269 break;
3271 if ( a.value & HVMPME_onchangeonly )
3272 rc = -EINVAL;
3273 break;
3276 if ( rc == 0 )
3278 d->arch.hvm_domain.params[a.index] = a.value;
3280 switch( a.index )
3282 case HVM_PARAM_MEMORY_EVENT_INT3:
3284 domain_pause(d);
3285 domain_unpause(d); /* Causes guest to latch new status */
3286 break;
3288 case HVM_PARAM_MEMORY_EVENT_CR3:
3290 for_each_vcpu ( d, v )
3291 hvm_funcs.update_guest_cr(v, 0); /* Latches new CR3 mask through CR0 code */
3292 break;
3299 else
3301 switch ( a.index )
3303 case HVM_PARAM_ACPI_S_STATE:
3304 a.value = d->arch.hvm_domain.is_s3_suspended ? 3 : 0;
3305 break;
3306 default:
3307 a.value = d->arch.hvm_domain.params[a.index];
3308 break;
3310 rc = copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
3313 HVM_DBG_LOG(DBG_LEVEL_HCALL, "%s param %u = %"PRIx64,
3314 op == HVMOP_set_param ? "set" : "get",
3315 a.index, a.value);
3317 param_fail:
3318 rcu_unlock_domain(d);
3319 break;
3322 case HVMOP_set_pci_intx_level:
3323 rc = hvmop_set_pci_intx_level(
3324 guest_handle_cast(arg, xen_hvm_set_pci_intx_level_t));
3325 break;
3327 case HVMOP_set_isa_irq_level:
3328 rc = hvmop_set_isa_irq_level(
3329 guest_handle_cast(arg, xen_hvm_set_isa_irq_level_t));
3330 break;
3332 case HVMOP_set_pci_link_route:
3333 rc = hvmop_set_pci_link_route(
3334 guest_handle_cast(arg, xen_hvm_set_pci_link_route_t));
3335 break;
3337 case HVMOP_flush_tlbs:
3338 rc = guest_handle_is_null(arg) ? hvmop_flush_tlb_all() : -ENOSYS;
3339 break;
3341 case HVMOP_track_dirty_vram:
3343 struct xen_hvm_track_dirty_vram a;
3344 struct domain *d;
3346 if ( copy_from_guest(&a, arg, 1) )
3347 return -EFAULT;
3349 rc = rcu_lock_target_domain_by_id(a.domid, &d);
3350 if ( rc != 0 )
3351 return rc;
3353 rc = -EINVAL;
3354 if ( !is_hvm_domain(d) )
3355 goto param_fail2;
3357 rc = xsm_hvm_param(d, op);
3358 if ( rc )
3359 goto param_fail2;
3361 rc = -ESRCH;
3362 if ( d->is_dying )
3363 goto param_fail2;
3365 rc = -EINVAL;
3366 if ( d->vcpu == NULL || d->vcpu[0] == NULL )
3367 goto param_fail2;
3369 if ( shadow_mode_enabled(d) )
3370 rc = shadow_track_dirty_vram(d, a.first_pfn, a.nr, a.dirty_bitmap);
3371 else
3372 rc = hap_track_dirty_vram(d, a.first_pfn, a.nr, a.dirty_bitmap);
3374 param_fail2:
3375 rcu_unlock_domain(d);
3376 break;
3379 case HVMOP_modified_memory:
3381 struct xen_hvm_modified_memory a;
3382 struct domain *d;
3383 struct p2m_domain *p2m;
3384 unsigned long pfn;
3386 if ( copy_from_guest(&a, arg, 1) )
3387 return -EFAULT;
3389 rc = rcu_lock_target_domain_by_id(a.domid, &d);
3390 if ( rc != 0 )
3391 return rc;
3393 rc = -EINVAL;
3394 if ( !is_hvm_domain(d) )
3395 goto param_fail3;
3397 rc = xsm_hvm_param(d, op);
3398 if ( rc )
3399 goto param_fail3;
3401 rc = -EINVAL;
3402 if ( (a.first_pfn > domain_get_maximum_gpfn(d)) ||
3403 ((a.first_pfn + a.nr - 1) < a.first_pfn) ||
3404 ((a.first_pfn + a.nr - 1) > domain_get_maximum_gpfn(d)) )
3405 goto param_fail3;
3407 rc = 0;
3408 if ( !paging_mode_log_dirty(d) )
3409 goto param_fail3;
3411 p2m = p2m_get_hostp2m(d);
3412 for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ )
3414 p2m_type_t t;
3415 mfn_t mfn = gfn_to_mfn(p2m, pfn, &t);
3416 if ( p2m_is_paging(t) )
3418 p2m_mem_paging_populate(p2m, pfn);
3420 rc = -EINVAL;
3421 goto param_fail3;
3423 if( p2m_is_shared(t) )
3424 gdprintk(XENLOG_WARNING,
3425 "shared pfn 0x%lx modified?\n", pfn);
3427 if ( mfn_x(mfn) != INVALID_MFN )
3429 paging_mark_dirty(d, mfn_x(mfn));
3430 /* These are most probably not page tables any more, */
3431 /* so don't take a long time and don't die either. */
3432 sh_remove_shadows(d->vcpu[0], mfn, 1, 0);
3436 param_fail3:
3437 rcu_unlock_domain(d);
3438 break;
3441 case HVMOP_set_mem_type:
3443 struct xen_hvm_set_mem_type a;
3444 struct domain *d;
3445 struct p2m_domain *p2m;
3446 unsigned long pfn;
3448 /* Map interface (HVMMEM) types to internal p2m types. */
3449 p2m_type_t memtype[] = {
3450 p2m_ram_rw, /* HVMMEM_ram_rw */
3451 p2m_ram_ro, /* HVMMEM_ram_ro */
3452 p2m_mmio_dm /* HVMMEM_mmio_dm */
3453 };
3455 if ( copy_from_guest(&a, arg, 1) )
3456 return -EFAULT;
3458 rc = rcu_lock_target_domain_by_id(a.domid, &d);
3459 if ( rc != 0 )
3460 return rc;
3462 rc = -EINVAL;
3463 if ( !is_hvm_domain(d) )
3464 goto param_fail4;
3466 rc = -EINVAL;
3467 if ( (a.first_pfn > domain_get_maximum_gpfn(d)) ||
3468 ((a.first_pfn + a.nr - 1) < a.first_pfn) ||
3469 ((a.first_pfn + a.nr - 1) > domain_get_maximum_gpfn(d)) )
3470 goto param_fail4;
3472 if ( a.hvmmem_type >= ARRAY_SIZE(memtype) )
3473 goto param_fail4;
3475 p2m = p2m_get_hostp2m(d);
3476 for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ )
3478 p2m_type_t t;
3479 p2m_type_t nt;
3480 mfn_t mfn;
3481 mfn = gfn_to_mfn_unshare(p2m, pfn, &t, 0);
3482 if ( p2m_is_paging(t) )
3484 p2m_mem_paging_populate(p2m, pfn);
3486 rc = -EINVAL;
3487 goto param_fail4;
3489 if ( p2m_is_shared(t) )
3491 rc = -EINVAL;
3492 goto param_fail4;
3494 if ( p2m_is_grant(t) )
3496 gdprintk(XENLOG_WARNING,
3497 "type for pfn 0x%lx changed to grant while "
3498 "we were working?\n", pfn);
3499 goto param_fail4;
3501 else
3503 nt = p2m_change_type(p2m, pfn, t, memtype[a.hvmmem_type]);
3504 if ( nt != t )
3506 gdprintk(XENLOG_WARNING,
3507 "type of pfn 0x%lx changed from %d to %d while "
3508 "we were trying to change it to %d\n",
3509 pfn, t, nt, memtype[a.hvmmem_type]);
3510 goto param_fail4;
3515 rc = 0;
3517 param_fail4:
3518 rcu_unlock_domain(d);
3519 break;
3522 case HVMOP_set_mem_access:
3524 struct xen_hvm_set_mem_access a;
3525 struct domain *d;
3526 struct p2m_domain *p2m;
3527 unsigned long pfn;
3529 p2m_access_t memaccess[] = {
3530 p2m_access_n,
3531 p2m_access_r,
3532 p2m_access_w,
3533 p2m_access_rw,
3534 p2m_access_x,
3535 p2m_access_rx,
3536 p2m_access_wx,
3537 p2m_access_rwx,
3538 p2m_access_rx2rw,
3539 0, /* HVMMEM_access_default -- will get set below */
3540 };
3542 if ( copy_from_guest(&a, arg, 1) )
3543 return -EFAULT;
3545 if ( current->domain->domain_id == a.domid )
3546 return -EPERM;
3548 rc = rcu_lock_target_domain_by_id(a.domid, &d);
3549 if ( rc != 0 )
3550 return rc;
3552 rc = -EINVAL;
3553 if ( !is_hvm_domain(d) )
3554 goto param_fail5;
3556 p2m = p2m_get_hostp2m(d);
3557 memaccess[HVMMEM_access_default] = p2m->default_access;
3559 /* A first_pfn of ~0 is a request to set the default access. */
3560 if ( a.first_pfn == ~0ull )
3562 rc = 0;
3563 p2m->default_access = memaccess[a.hvmmem_access];
3564 goto param_fail5;
3567 rc = -EINVAL;
3568 if ( (a.first_pfn > domain_get_maximum_gpfn(d)) ||
3569 ((a.first_pfn + a.nr - 1) < a.first_pfn) ||
3570 ((a.first_pfn + a.nr - 1) > domain_get_maximum_gpfn(d)) )
3571 goto param_fail5;
3573 if ( a.hvmmem_access >= ARRAY_SIZE(memaccess) )
3574 goto param_fail5;
3576 for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ )
3578 p2m_type_t t;
3579 mfn_t mfn;
3580 int success;
3582 mfn = gfn_to_mfn_unshare(p2m, pfn, &t, 0);
3584 p2m_lock(p2m);
3585 success = p2m->set_entry(p2m, pfn, mfn, 0, t, memaccess[a.hvmmem_access]);
3586 p2m_unlock(p2m);
3587 if ( !success )
3588 goto param_fail5;
3591 rc = 0;
3593 param_fail5:
3594 rcu_unlock_domain(d);
3595 break;
3598 case HVMOP_get_mem_access:
3600 struct xen_hvm_get_mem_access a;
3601 struct domain *d;
3602 struct p2m_domain *p2m;
3603 p2m_type_t t;
3604 p2m_access_t ac;
3605 mfn_t mfn;
3607 /* Map internal p2m access types to interface (HVMMEM) access values. */
3608 hvmmem_access_t memaccess[] = {
3609 HVMMEM_access_n,
3610 HVMMEM_access_r,
3611 HVMMEM_access_w,
3612 HVMMEM_access_rw,
3613 HVMMEM_access_x,
3614 HVMMEM_access_rx,
3615 HVMMEM_access_wx,
3616 HVMMEM_access_rwx,
3617 HVMMEM_access_rx2rw
3618 };
3620 if ( copy_from_guest(&a, arg, 1) )
3621 return -EFAULT;
3623 if ( current->domain->domain_id == a.domid )
3624 return -EPERM;
3626 rc = rcu_lock_target_domain_by_id(a.domid, &d);
3627 if ( rc != 0 )
3628 return rc;
3630 rc = -EINVAL;
3631 if ( !is_hvm_domain(d) )
3632 goto param_fail6;
3634 p2m = p2m_get_hostp2m(d);
3636 if ( a.pfn == ~0ull )
3638 a.hvmmem_access = memaccess[p2m->default_access];
3640 else {
3641 rc = -EINVAL;
3642 if ( (a.pfn > domain_get_maximum_gpfn(d)) )
3643 goto param_fail6;
3645 rc = -ESRCH;
3646 mfn = p2m->get_entry(p2m, a.pfn, &t, &ac, p2m_query);
3648 if ( mfn_x(mfn) == INVALID_MFN )
3649 goto param_fail6;
3651 rc = -ERANGE;
3652 if ( ac >= ARRAY_SIZE(memaccess) )
3653 goto param_fail6;
3655 a.hvmmem_access = memaccess[ac];
3658 rc = copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
3660 param_fail6:
3661 rcu_unlock_domain(d);
3662 break;
3665 case HVMOP_pagetable_dying:
3667 struct xen_hvm_pagetable_dying a;
3668 struct domain *d;
3670 if ( copy_from_guest(&a, arg, 1) )
3671 return -EFAULT;
3673 rc = rcu_lock_target_domain_by_id(a.domid, &d);
3674 if ( rc != 0 )
3675 return rc;
3677 rc = -EINVAL;
3678 if ( !is_hvm_domain(d) || !paging_mode_shadow(d) )
3679 goto param_fail7;
3681 rc = 0;
3682 pagetable_dying(d, a.gpa);
3684 param_fail7:
3685 rcu_unlock_domain(d);
3686 break;
3689 case HVMOP_get_time: {
3690 xen_hvm_get_time_t gxt;
3692 gxt.now = NOW();
3693 if ( copy_to_guest(arg, &gxt, 1) )
3694 rc = -EFAULT;
3695 break;
3698 case HVMOP_xentrace: {
3699 xen_hvm_xentrace_t tr;
3701 if ( copy_from_guest(&tr, arg, 1 ) )
3702 return -EFAULT;
3704 if ( tr.extra_bytes > sizeof(tr.extra)
3705 || (tr.event & ~((1u<<TRC_SUBCLS_SHIFT)-1)) )
3706 return -EINVAL;
3708 /* Cycles will be taken at the vmexit and vmenter */
3709 trace_var(tr.event | TRC_GUEST, 0 /*!cycles*/,
3710 tr.extra_bytes,
3711 (unsigned char *)tr.extra);
3712 break;
3715 case HVMOP_inject_trap:
3717 xen_hvm_inject_trap_t tr;
3718 struct domain *d;
3719 struct vcpu *v;
3721 if ( copy_from_guest(&tr, arg, 1 ) )
3722 return -EFAULT;
3724 if ( current->domain->domain_id == tr.domid )
3725 return -EPERM;
3727 rc = rcu_lock_target_domain_by_id(tr.domid, &d);
3728 if ( rc != 0 )
3729 return rc;
3731 rc = -EINVAL;
3732 if ( !is_hvm_domain(d) )
3733 goto param_fail8;
3735 rc = -ENOENT;
3736 if ( tr.vcpuid >= d->max_vcpus || (v = d->vcpu[tr.vcpuid]) == NULL )
3737 goto param_fail8;
3739 if ( v->arch.hvm_vcpu.inject_trap != -1 )
3740 rc = -EBUSY;
3741 else
3743 v->arch.hvm_vcpu.inject_trap = tr.trap;
3744 v->arch.hvm_vcpu.inject_error_code = tr.error_code;
3745 v->arch.hvm_vcpu.inject_cr2 = tr.cr2;
3748 param_fail8:
3749 rcu_unlock_domain(d);
3750 break;
3753 default:
3755 gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
3756 rc = -ENOSYS;
3757 break;
3761 if ( rc == -EAGAIN )
3762 rc = hypercall_create_continuation(
3763 __HYPERVISOR_hvm_op, "lh", op, arg);
3765 return rc;
3768 int hvm_debug_op(struct vcpu *v, int32_t op)
3770 int rc;
3772 switch ( op )
3774 case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON:
3775 case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF:
3776 rc = -ENOSYS;
3777 if ( !cpu_has_monitor_trap_flag )
3778 break;
3779 rc = 0;
3780 vcpu_pause(v);
3781 v->arch.hvm_vcpu.single_step =
3782 (op == XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON);
3783 vcpu_unpause(v); /* guest will latch new state */
3784 break;
3785 default:
3786 rc = -ENOSYS;
3787 break;
3790 return rc;
3793 #ifdef __x86_64__
3794 static int hvm_memory_event_traps(long p, uint32_t reason,
3795 unsigned long value, unsigned long old,
3796 bool_t gla_valid, unsigned long gla)
3798 struct vcpu* v = current;
3799 struct domain *d = v->domain;
3800 mem_event_request_t req;
3801 int rc;
3803 if ( !(p & HVMPME_MODE_MASK) )
3804 return 0;
3806 if ( (p & HVMPME_onchangeonly) && (value == old) )
3807 return 1;
3809 rc = mem_event_check_ring(d);
3810 if ( rc )
3811 return rc;
3813 memset(&req, 0, sizeof(req));
3814 req.type = MEM_EVENT_TYPE_ACCESS;
3815 req.reason = reason;
3817 if ( (p & HVMPME_MODE_MASK) == HVMPME_mode_sync )
3819 req.flags |= MEM_EVENT_FLAG_VCPU_PAUSED;
3820 vcpu_pause_nosync(v);
3823 req.gfn = value;
3824 req.vcpu_id = v->vcpu_id;
3825 if ( gla_valid )
3827 req.offset = gla & ((1 << PAGE_SHIFT) - 1);
3828 req.gla = gla;
3829 req.gla_valid = 1;
3832 mem_event_put_request(d, &req);
3834 return 1;
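/*
 * The HVMPME mode chosen via the HVM_PARAM_MEMORY_EVENT_* parameter
 * decides how intrusive the notification is: with HVMPME_mode_sync the
 * faulting vCPU is paused (and flagged MEM_EVENT_FLAG_VCPU_PAUSED) until
 * the listener handles the request; otherwise the event is posted
 * asynchronously and the vCPU keeps running.
 */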
3837 void hvm_memory_event_cr0(unsigned long value, unsigned long old)
3839 hvm_memory_event_traps(current->domain->arch.hvm_domain
3840 .params[HVM_PARAM_MEMORY_EVENT_CR0],
3841 MEM_EVENT_REASON_CR0,
3842 value, old, 0, 0);
3845 void hvm_memory_event_cr3(unsigned long value, unsigned long old)
3847 hvm_memory_event_traps(current->domain->arch.hvm_domain
3848 .params[HVM_PARAM_MEMORY_EVENT_CR3],
3849 MEM_EVENT_REASON_CR3,
3850 value, old, 0, 0);
3853 void hvm_memory_event_cr4(unsigned long value, unsigned long old)
3855 hvm_memory_event_traps(current->domain->arch.hvm_domain
3856 .params[HVM_PARAM_MEMORY_EVENT_CR4],
3857 MEM_EVENT_REASON_CR4,
3858 value, old, 0, 0);
3861 int hvm_memory_event_int3(unsigned long gla)
3863 uint32_t pfec = PFEC_page_present;
3864 unsigned long gfn;
3865 gfn = paging_gva_to_gfn(current, gla, &pfec);
3867 return hvm_memory_event_traps(current->domain->arch.hvm_domain
3868 .params[HVM_PARAM_MEMORY_EVENT_INT3],
3869 MEM_EVENT_REASON_INT3,
3870 gfn, 0, 1, gla);
3872 #endif /* __x86_64__ */
3874 /*
3875 * Local variables:
3876 * mode: C
3877 * c-set-style: "BSD"
3878 * c-basic-offset: 4
3879 * tab-width: 4
3880 * indent-tabs-mode: nil
3881 * End:
3882 */