
view xen/arch/x86/hvm/vmx/vmx.c @ 20953:4f2d9156424d

vmx: Don't enable irq for machine check vmexit handling

We should not enable IRQs during machine check VMExit handling.

In changeset 18658:824892134573, IRQs are enabled during VMExit handling
except for external interrupts. That exception should also apply to
machine check, because:
a) The mce_logout_lock must be held in an irq-disabled context.
b) The machine check event should be handled as quickly as possible;
enabling IRQs would lengthen the handling window considerably.
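
The changed hunk itself is in vmx_vmexit_handler(), further down the file
than this excerpt shows. Purely as an illustration of the gating described
above (not the literal patch), here is a minimal sketch that reuses
identifiers found elsewhere in this file (EXIT_REASON_EXCEPTION_NMI,
VM_EXIT_INTR_INFO, INTR_INFO_VECTOR_MASK, TRAP_machine_check;
EXIT_REASON_EXTERNAL_INTERRUPT is the pre-existing check from
18658:824892134573); the exit_reason and vector locals are assumed names:

    /* Illustrative sketch only, not the committed hunk. */
    unsigned int vector = 0;

    if ( exit_reason == EXIT_REASON_EXCEPTION_NMI )
        vector = __vmread(VM_EXIT_INTR_INFO) & INTR_INFO_VECTOR_MASK;

    /*
     * Keep IRQs disabled for external-interrupt exits and for machine
     * check exceptions: mce_logout_lock is taken with IRQs off, and the
     * MCE should be handled without delay.
     */
    if ( (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT) &&
         !((exit_reason == EXIT_REASON_EXCEPTION_NMI) &&
           (vector == TRAP_machine_check)) )
        local_irq_enable();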

Signed-off-by: Jiang, Yunhong <yunhong.jiang@intel.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Feb 08 08:43:25 2010 +0000 (2010-02-08)
parents 6ade83cb21ca
children 8ab60a883fd5
line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 */
19 #include <xen/config.h>
20 #include <xen/init.h>
21 #include <xen/lib.h>
22 #include <xen/trace.h>
23 #include <xen/sched.h>
24 #include <xen/irq.h>
25 #include <xen/softirq.h>
26 #include <xen/domain_page.h>
27 #include <xen/hypercall.h>
28 #include <xen/perfc.h>
29 #include <asm/current.h>
30 #include <asm/io.h>
31 #include <asm/regs.h>
32 #include <asm/cpufeature.h>
33 #include <asm/processor.h>
34 #include <asm/types.h>
35 #include <asm/debugreg.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/paging.h>
39 #include <asm/p2m.h>
40 #include <asm/mem_sharing.h>
41 #include <asm/hvm/emulate.h>
42 #include <asm/hvm/hvm.h>
43 #include <asm/hvm/support.h>
44 #include <asm/hvm/vmx/vmx.h>
45 #include <asm/hvm/vmx/vmcs.h>
46 #include <public/sched.h>
47 #include <public/hvm/ioreq.h>
48 #include <asm/hvm/vpic.h>
49 #include <asm/hvm/vlapic.h>
50 #include <asm/x86_emulate.h>
51 #include <asm/hvm/vpt.h>
52 #include <public/hvm/save.h>
53 #include <asm/hvm/trace.h>
54 #include <asm/xenoprof.h>
55 #include <asm/debugger.h>
57 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
59 static void vmx_ctxt_switch_from(struct vcpu *v);
60 static void vmx_ctxt_switch_to(struct vcpu *v);
62 static int vmx_alloc_vlapic_mapping(struct domain *d);
63 static void vmx_free_vlapic_mapping(struct domain *d);
64 static void vmx_install_vlapic_mapping(struct vcpu *v);
65 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr);
66 static void vmx_update_guest_efer(struct vcpu *v);
67 static void vmx_cpuid_intercept(
68 unsigned int *eax, unsigned int *ebx,
69 unsigned int *ecx, unsigned int *edx);
70 static void vmx_wbinvd_intercept(void);
71 static void vmx_fpu_dirty_intercept(void);
72 static int vmx_msr_read_intercept(struct cpu_user_regs *regs);
73 static int vmx_msr_write_intercept(struct cpu_user_regs *regs);
74 static void vmx_invlpg_intercept(unsigned long vaddr);
75 static void __ept_sync_domain(void *info);
77 static int vmx_domain_initialise(struct domain *d)
78 {
79 int rc;
81 d->arch.hvm_domain.vmx.ept_control.etmt = EPT_DEFAULT_MT;
82 d->arch.hvm_domain.vmx.ept_control.gaw = EPT_DEFAULT_GAW;
83 d->arch.hvm_domain.vmx.ept_control.asr =
84 pagetable_get_pfn(d->arch.phys_table);
87 if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 )
88 return rc;
90 return 0;
91 }
93 static void vmx_domain_destroy(struct domain *d)
94 {
95 if ( d->arch.hvm_domain.hap_enabled )
96 on_each_cpu(__ept_sync_domain, d, 1);
97 vmx_free_vlapic_mapping(d);
98 }
100 static int vmx_vcpu_initialise(struct vcpu *v)
101 {
102 int rc;
104 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
106 v->arch.schedule_tail = vmx_do_resume;
107 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
108 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
110 if ( (rc = vmx_create_vmcs(v)) != 0 )
111 {
112 dprintk(XENLOG_WARNING,
113 "Failed to create VMCS for vcpu %d: err=%d.\n",
114 v->vcpu_id, rc);
115 return rc;
116 }
118 vpmu_initialise(v);
120 vmx_install_vlapic_mapping(v);
122 /* %eax == 1 signals full real-mode support to the guest loader. */
123 if ( v->vcpu_id == 0 )
124 v->arch.guest_context.user_regs.eax = 1;
126 return 0;
127 }
129 static void vmx_vcpu_destroy(struct vcpu *v)
130 {
131 vmx_destroy_vmcs(v);
132 vpmu_destroy(v);
133 passive_domain_destroy(v);
134 }
136 #ifdef __x86_64__
138 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
140 static u32 msr_index[] =
141 {
142 MSR_LSTAR, MSR_STAR, MSR_SYSCALL_MASK
143 };
145 #define MSR_INDEX_SIZE (ARRAY_SIZE(msr_index))
147 static void vmx_save_host_msrs(void)
148 {
149 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
150 int i;
152 for ( i = 0; i < MSR_INDEX_SIZE; i++ )
153 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
154 }
156 #define WRITE_MSR(address) \
157 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
158 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
159 wrmsrl(MSR_ ## address, msr_content); \
160 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
161 break
163 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
164 {
165 u64 msr_content = 0;
166 u32 ecx = regs->ecx;
167 struct vcpu *v = current;
168 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
170 switch ( ecx )
171 {
172 case MSR_EFER:
173 msr_content = v->arch.hvm_vcpu.guest_efer;
174 break;
176 case MSR_FS_BASE:
177 msr_content = __vmread(GUEST_FS_BASE);
178 break;
180 case MSR_GS_BASE:
181 msr_content = __vmread(GUEST_GS_BASE);
182 break;
184 case MSR_SHADOW_GS_BASE:
185 rdmsrl(MSR_SHADOW_GS_BASE, msr_content);
186 break;
188 case MSR_STAR:
189 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
190 break;
192 case MSR_LSTAR:
193 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
194 break;
196 case MSR_CSTAR:
197 msr_content = v->arch.hvm_vmx.cstar;
198 break;
200 case MSR_SYSCALL_MASK:
201 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
202 break;
204 default:
205 return HNDL_unhandled;
206 }
208 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
210 regs->eax = (u32)(msr_content >> 0);
211 regs->edx = (u32)(msr_content >> 32);
213 return HNDL_done;
214 }
216 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
217 {
218 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
219 u32 ecx = regs->ecx;
220 struct vcpu *v = current;
221 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
222 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
224 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
226 switch ( ecx )
227 {
228 case MSR_EFER:
229 if ( hvm_set_efer(msr_content) )
230 goto exception_raised;
231 break;
233 case MSR_FS_BASE:
234 case MSR_GS_BASE:
235 case MSR_SHADOW_GS_BASE:
236 if ( !is_canonical_address(msr_content) )
237 goto uncanonical_address;
239 if ( ecx == MSR_FS_BASE )
240 __vmwrite(GUEST_FS_BASE, msr_content);
241 else if ( ecx == MSR_GS_BASE )
242 __vmwrite(GUEST_GS_BASE, msr_content);
243 else
244 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
246 break;
248 case MSR_STAR:
249 WRITE_MSR(STAR);
251 case MSR_LSTAR:
252 if ( !is_canonical_address(msr_content) )
253 goto uncanonical_address;
254 WRITE_MSR(LSTAR);
256 case MSR_CSTAR:
257 if ( !is_canonical_address(msr_content) )
258 goto uncanonical_address;
259 v->arch.hvm_vmx.cstar = msr_content;
260 break;
262 case MSR_SYSCALL_MASK:
263 WRITE_MSR(SYSCALL_MASK);
265 default:
266 return HNDL_unhandled;
267 }
269 return HNDL_done;
271 uncanonical_address:
272 HVM_DBG_LOG(DBG_LEVEL_0, "Non-canonical address of msr write %x", ecx);
273 vmx_inject_hw_exception(TRAP_gp_fault, 0);
274 exception_raised:
275 return HNDL_exception_raised;
276 }
278 /*
279 * To avoid MSR save/restore at every VM exit/entry time, we restore
280 * the x86_64 specific MSRs at domain switch time. Since these MSRs
281 * are not modified once set for para domains, we don't save them,
282 * but simply reset them to values set in percpu_traps_init().
283 */
284 static void vmx_restore_host_msrs(void)
285 {
286 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
287 int i;
289 while ( host_msr_state->flags )
290 {
291 i = find_first_set_bit(host_msr_state->flags);
292 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
293 clear_bit(i, &host_msr_state->flags);
294 }
295 }
297 static void vmx_save_guest_msrs(struct vcpu *v)
298 {
299 /*
300 * We cannot cache SHADOW_GS_BASE while the VCPU runs, as it can
301 * be updated at any time via SWAPGS, which we cannot trap.
302 */
303 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
304 }
306 static void vmx_restore_guest_msrs(struct vcpu *v)
307 {
308 struct vmx_msr_state *guest_msr_state, *host_msr_state;
309 unsigned long guest_flags;
310 int i;
312 guest_msr_state = &v->arch.hvm_vmx.msr_state;
313 host_msr_state = &this_cpu(host_msr_state);
315 wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
317 guest_flags = guest_msr_state->flags;
319 while ( guest_flags )
320 {
321 i = find_first_set_bit(guest_flags);
323 HVM_DBG_LOG(DBG_LEVEL_2,
324 "restore guest's index %d msr %x with value %lx",
325 i, msr_index[i], guest_msr_state->msrs[i]);
326 set_bit(i, &host_msr_state->flags);
327 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
328 clear_bit(i, &guest_flags);
329 }
331 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & EFER_SCE )
332 {
333 HVM_DBG_LOG(DBG_LEVEL_2,
334 "restore guest's EFER with value %lx",
335 v->arch.hvm_vcpu.guest_efer);
336 write_efer((read_efer() & ~EFER_SCE) |
337 (v->arch.hvm_vcpu.guest_efer & EFER_SCE));
338 }
340 if ( cpu_has_rdtscp )
341 wrmsrl(MSR_TSC_AUX, hvm_msr_tsc_aux(v));
342 }
344 #else /* __i386__ */
346 #define vmx_save_host_msrs() ((void)0)
347 #define vmx_restore_host_msrs() ((void)0)
349 #define vmx_save_guest_msrs(v) ((void)0)
350 #define vmx_restore_guest_msrs(v) ((void)0)
352 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
353 {
354 u64 msr_content = 0;
355 struct vcpu *v = current;
357 switch ( regs->ecx )
358 {
359 case MSR_EFER:
360 msr_content = v->arch.hvm_vcpu.guest_efer;
361 break;
363 default:
364 return HNDL_unhandled;
365 }
367 regs->eax = msr_content >> 0;
368 regs->edx = msr_content >> 32;
370 return HNDL_done;
371 }
373 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
374 {
375 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
377 switch ( regs->ecx )
378 {
379 case MSR_EFER:
380 if ( hvm_set_efer(msr_content) )
381 return HNDL_exception_raised;
382 break;
384 default:
385 return HNDL_unhandled;
386 }
388 return HNDL_done;
389 }
391 #endif /* __i386__ */
393 static int vmx_guest_x86_mode(struct vcpu *v)
394 {
395 unsigned int cs_ar_bytes;
397 if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
398 return 0;
399 if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
400 return 1;
401 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
402 if ( hvm_long_mode_enabled(v) &&
403 likely(cs_ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
404 return 8;
405 return (likely(cs_ar_bytes & X86_SEG_AR_DEF_OP_SIZE) ? 4 : 2);
406 }
408 static void vmx_save_dr(struct vcpu *v)
409 {
410 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
411 return;
413 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
414 v->arch.hvm_vcpu.flag_dr_dirty = 0;
415 v->arch.hvm_vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
416 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
418 v->arch.guest_context.debugreg[0] = read_debugreg(0);
419 v->arch.guest_context.debugreg[1] = read_debugreg(1);
420 v->arch.guest_context.debugreg[2] = read_debugreg(2);
421 v->arch.guest_context.debugreg[3] = read_debugreg(3);
422 v->arch.guest_context.debugreg[6] = read_debugreg(6);
423 /* DR7 must be saved as it is used by vmx_restore_dr(). */
424 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
425 }
427 static void __restore_debug_registers(struct vcpu *v)
428 {
429 if ( v->arch.hvm_vcpu.flag_dr_dirty )
430 return;
432 v->arch.hvm_vcpu.flag_dr_dirty = 1;
434 write_debugreg(0, v->arch.guest_context.debugreg[0]);
435 write_debugreg(1, v->arch.guest_context.debugreg[1]);
436 write_debugreg(2, v->arch.guest_context.debugreg[2]);
437 write_debugreg(3, v->arch.guest_context.debugreg[3]);
438 write_debugreg(6, v->arch.guest_context.debugreg[6]);
439 /* DR7 is loaded from the VMCS. */
440 }
442 /*
443 * DR7 is saved and restored on every vmexit. Other debug registers only
444 * need to be restored if their value is going to affect execution -- i.e.,
445 * if one of the breakpoints is enabled. So mask out all bits that don't
446 * enable some breakpoint functionality.
447 */
448 static void vmx_restore_dr(struct vcpu *v)
449 {
450 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
451 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
452 __restore_debug_registers(v);
453 }
455 static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
456 {
457 uint32_t ev;
459 vmx_vmcs_enter(v);
461 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
462 c->cr2 = v->arch.hvm_vcpu.guest_cr[2];
463 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
464 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
466 c->msr_efer = v->arch.hvm_vcpu.guest_efer;
468 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
469 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
470 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
472 c->pending_event = 0;
473 c->error_code = 0;
474 if ( ((ev = __vmread(VM_ENTRY_INTR_INFO)) & INTR_INFO_VALID_MASK) &&
475 hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
476 {
477 c->pending_event = ev;
478 c->error_code = __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE);
479 }
481 vmx_vmcs_exit(v);
482 }
484 static int vmx_restore_cr0_cr3(
485 struct vcpu *v, unsigned long cr0, unsigned long cr3)
486 {
487 unsigned long mfn = 0;
488 p2m_type_t p2mt;
490 if ( paging_mode_shadow(v->domain) )
491 {
492 if ( cr0 & X86_CR0_PG )
493 {
494 mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
495 if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
496 {
497 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3);
498 return -EINVAL;
499 }
500 }
502 if ( hvm_paging_enabled(v) )
503 put_page(pagetable_get_page(v->arch.guest_table));
505 v->arch.guest_table = pagetable_from_pfn(mfn);
506 }
508 v->arch.hvm_vcpu.guest_cr[0] = cr0 | X86_CR0_ET;
509 v->arch.hvm_vcpu.guest_cr[3] = cr3;
511 return 0;
512 }
514 static int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
515 {
516 int rc;
518 if ( c->pending_valid &&
519 ((c->pending_type == 1) || (c->pending_type > 6) ||
520 (c->pending_reserved != 0)) )
521 {
522 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32".\n",
523 c->pending_event);
524 return -EINVAL;
525 }
527 rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3);
528 if ( rc )
529 return rc;
531 vmx_vmcs_enter(v);
533 v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
534 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
535 vmx_update_guest_cr(v, 0);
536 vmx_update_guest_cr(v, 2);
537 vmx_update_guest_cr(v, 4);
539 v->arch.hvm_vcpu.guest_efer = c->msr_efer;
540 vmx_update_guest_efer(v);
542 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
543 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
544 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
546 __vmwrite(GUEST_DR7, c->dr7);
548 vmx_vmcs_exit(v);
550 paging_update_paging_modes(v);
552 if ( c->pending_valid )
553 {
554 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
555 c->pending_event, c->error_code);
557 if ( hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
558 {
559 vmx_vmcs_enter(v);
560 __vmwrite(VM_ENTRY_INTR_INFO, c->pending_event);
561 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
562 vmx_vmcs_exit(v);
563 }
564 }
566 return 0;
567 }
569 static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
570 {
571 #ifdef __x86_64__
572 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
573 unsigned long guest_flags = guest_state->flags;
575 data->shadow_gs = v->arch.hvm_vmx.shadow_gs;
576 data->msr_cstar = v->arch.hvm_vmx.cstar;
578 /* save msrs */
579 data->msr_flags = guest_flags;
580 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
581 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
582 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
583 #endif
585 data->tsc = hvm_get_guest_tsc(v);
586 }
588 static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
589 {
590 #ifdef __x86_64__
591 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
593 /* restore msrs */
594 guest_state->flags = data->msr_flags & 7;
595 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
596 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
597 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
599 v->arch.hvm_vmx.cstar = data->msr_cstar;
600 v->arch.hvm_vmx.shadow_gs = data->shadow_gs;
601 #endif
603 hvm_set_guest_tsc(v, data->tsc);
604 }
607 static void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
608 {
609 vmx_save_cpu_state(v, ctxt);
610 vmx_vmcs_save(v, ctxt);
611 }
613 static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
614 {
615 vmx_load_cpu_state(v, ctxt);
617 if ( vmx_vmcs_restore(v, ctxt) )
618 {
619 gdprintk(XENLOG_ERR, "vmx_vmcs restore failed!\n");
620 domain_crash(v->domain);
621 return -EINVAL;
622 }
624 return 0;
625 }
627 static void vmx_fpu_enter(struct vcpu *v)
628 {
629 setup_fpu(v);
630 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
631 v->arch.hvm_vmx.host_cr0 &= ~X86_CR0_TS;
632 __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
633 }
635 static void vmx_fpu_leave(struct vcpu *v)
636 {
637 ASSERT(!v->fpu_dirtied);
638 ASSERT(read_cr0() & X86_CR0_TS);
640 if ( !(v->arch.hvm_vmx.host_cr0 & X86_CR0_TS) )
641 {
642 v->arch.hvm_vmx.host_cr0 |= X86_CR0_TS;
643 __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
644 }
646 /*
647 * If the guest does not have TS enabled then we must cause and handle an
648 * exception on first use of the FPU. If the guest *does* have TS enabled
649 * then this is not necessary: no FPU activity can occur until the guest
650 * clears CR0.TS, and we will initialise the FPU when that happens.
651 */
652 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
653 {
654 v->arch.hvm_vcpu.hw_cr[0] |= X86_CR0_TS;
655 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
656 __vm_set_bit(EXCEPTION_BITMAP, TRAP_no_device);
657 }
658 }
660 static void vmx_ctxt_switch_from(struct vcpu *v)
661 {
662 vmx_fpu_leave(v);
663 vmx_save_guest_msrs(v);
664 vmx_restore_host_msrs();
665 vmx_save_dr(v);
666 vpmu_save(v);
667 }
669 static void vmx_ctxt_switch_to(struct vcpu *v)
670 {
671 struct domain *d = v->domain;
672 unsigned long old_cr4 = read_cr4(), new_cr4 = mmu_cr4_features;
674 /* HOST_CR4 in VMCS is always mmu_cr4_features and
675 * X86_CR4_OSXSAVE (if supported). Sync CR4 now. */
676 if ( cpu_has_xsave )
677 new_cr4 |= X86_CR4_OSXSAVE;
678 if ( old_cr4 != new_cr4 )
679 write_cr4(new_cr4);
681 if ( d->arch.hvm_domain.hap_enabled )
682 {
683 unsigned int cpu = smp_processor_id();
684 /* Test-and-test-and-set this CPU in the EPT-is-synced mask. */
685 if ( !cpu_isset(cpu, d->arch.hvm_domain.vmx.ept_synced) &&
686 !cpu_test_and_set(cpu, d->arch.hvm_domain.vmx.ept_synced) )
687 __invept(1, d->arch.hvm_domain.vmx.ept_control.eptp, 0);
688 }
690 vmx_restore_guest_msrs(v);
691 vmx_restore_dr(v);
692 vpmu_load(v);
693 }
696 /* SDM volume 3b section 22.3.1.2: we can only enter virtual 8086 mode
697 * if all of CS, SS, DS, ES, FS and GS are 16bit ring-3 data segments.
698 * The guest thinks it's got ring-0 segments, so we need to fudge
699 * things. We store the ring-3 version in the VMCS to avoid lots of
700 * shuffling on vmenter and vmexit, and translate in these accessors. */
702 #define rm_cs_attr (((union segment_attributes) { \
703 .fields = { .type = 0xb, .s = 1, .dpl = 0, .p = 1, .avl = 0, \
704 .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
705 #define rm_ds_attr (((union segment_attributes) { \
706 .fields = { .type = 0x3, .s = 1, .dpl = 0, .p = 1, .avl = 0, \
707 .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
708 #define vm86_ds_attr (((union segment_attributes) { \
709 .fields = { .type = 0x3, .s = 1, .dpl = 3, .p = 1, .avl = 0, \
710 .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
711 #define vm86_tr_attr (((union segment_attributes) { \
712 .fields = { .type = 0xb, .s = 0, .dpl = 0, .p = 1, .avl = 0, \
713 .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
715 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
716 struct segment_register *reg)
717 {
718 uint32_t attr = 0;
720 vmx_vmcs_enter(v);
722 switch ( seg )
723 {
724 case x86_seg_cs:
725 reg->sel = __vmread(GUEST_CS_SELECTOR);
726 reg->limit = __vmread(GUEST_CS_LIMIT);
727 reg->base = __vmread(GUEST_CS_BASE);
728 attr = __vmread(GUEST_CS_AR_BYTES);
729 break;
730 case x86_seg_ds:
731 reg->sel = __vmread(GUEST_DS_SELECTOR);
732 reg->limit = __vmread(GUEST_DS_LIMIT);
733 reg->base = __vmread(GUEST_DS_BASE);
734 attr = __vmread(GUEST_DS_AR_BYTES);
735 break;
736 case x86_seg_es:
737 reg->sel = __vmread(GUEST_ES_SELECTOR);
738 reg->limit = __vmread(GUEST_ES_LIMIT);
739 reg->base = __vmread(GUEST_ES_BASE);
740 attr = __vmread(GUEST_ES_AR_BYTES);
741 break;
742 case x86_seg_fs:
743 reg->sel = __vmread(GUEST_FS_SELECTOR);
744 reg->limit = __vmread(GUEST_FS_LIMIT);
745 reg->base = __vmread(GUEST_FS_BASE);
746 attr = __vmread(GUEST_FS_AR_BYTES);
747 break;
748 case x86_seg_gs:
749 reg->sel = __vmread(GUEST_GS_SELECTOR);
750 reg->limit = __vmread(GUEST_GS_LIMIT);
751 reg->base = __vmread(GUEST_GS_BASE);
752 attr = __vmread(GUEST_GS_AR_BYTES);
753 break;
754 case x86_seg_ss:
755 reg->sel = __vmread(GUEST_SS_SELECTOR);
756 reg->limit = __vmread(GUEST_SS_LIMIT);
757 reg->base = __vmread(GUEST_SS_BASE);
758 attr = __vmread(GUEST_SS_AR_BYTES);
759 break;
760 case x86_seg_tr:
761 reg->sel = __vmread(GUEST_TR_SELECTOR);
762 reg->limit = __vmread(GUEST_TR_LIMIT);
763 reg->base = __vmread(GUEST_TR_BASE);
764 attr = __vmread(GUEST_TR_AR_BYTES);
765 break;
766 case x86_seg_gdtr:
767 reg->limit = __vmread(GUEST_GDTR_LIMIT);
768 reg->base = __vmread(GUEST_GDTR_BASE);
769 break;
770 case x86_seg_idtr:
771 reg->limit = __vmread(GUEST_IDTR_LIMIT);
772 reg->base = __vmread(GUEST_IDTR_BASE);
773 break;
774 case x86_seg_ldtr:
775 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
776 reg->limit = __vmread(GUEST_LDTR_LIMIT);
777 reg->base = __vmread(GUEST_LDTR_BASE);
778 attr = __vmread(GUEST_LDTR_AR_BYTES);
779 break;
780 default:
781 BUG();
782 }
784 vmx_vmcs_exit(v);
786 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
787 /* Unusable flag is folded into Present flag. */
788 if ( attr & (1u<<16) )
789 reg->attr.fields.p = 0;
791 /* Adjust for virtual 8086 mode */
792 if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr
793 && !(v->arch.hvm_vmx.vm86_segment_mask & (1u << seg)) )
794 {
795 struct segment_register *sreg = &v->arch.hvm_vmx.vm86_saved_seg[seg];
796 if ( seg == x86_seg_tr )
797 *reg = *sreg;
798 else if ( reg->base != sreg->base || seg == x86_seg_ss )
799 {
800 /* If the guest's reloaded the segment, remember the new version.
801 * We can't tell if the guest reloaded the segment with another
802 * one that has the same base. By default we assume it hasn't,
803 * since we don't want to lose big-real-mode segment attributes,
804 * but for SS we assume it has: the Ubuntu graphical bootloader
805 * does this and gets badly confused if we leave the old SS in
806 * place. */
807 reg->attr.bytes = (seg == x86_seg_cs ? rm_cs_attr : rm_ds_attr);
808 *sreg = *reg;
809 }
810 else
811 {
812 /* Always give realmode guests a selector that matches the base
813 * but keep the attr and limit from before */
814 *reg = *sreg;
815 reg->sel = reg->base >> 4;
816 }
817 }
818 }
820 static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
821 struct segment_register *reg)
822 {
823 uint32_t attr, sel, limit;
824 uint64_t base;
826 sel = reg->sel;
827 attr = reg->attr.bytes;
828 limit = reg->limit;
829 base = reg->base;
831 /* Adjust CS/SS/DS/ES/FS/GS/TR for virtual 8086 mode */
832 if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr )
833 {
834 /* Remember the proper contents */
835 v->arch.hvm_vmx.vm86_saved_seg[seg] = *reg;
837 if ( seg == x86_seg_tr )
838 {
839 if ( v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS] )
840 {
841 sel = 0;
842 attr = vm86_tr_attr;
843 limit = 0xff;
844 base = v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS];
845 v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
846 }
847 else
848 v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
849 }
850 else
851 {
852 /* Try to fake it out as a 16bit data segment. This could
853 * cause confusion for the guest if it reads the selector,
854 * but otherwise we have to emulate if *any* segment hasn't
855 * been reloaded. */
856 if ( base < 0x100000 && !(base & 0xf) && limit >= 0xffff
857 && reg->attr.fields.p )
858 {
859 sel = base >> 4;
860 attr = vm86_ds_attr;
861 limit = 0xffff;
862 v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
863 }
864 else
865 v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
866 }
867 }
869 attr = ((attr & 0xf00) << 4) | (attr & 0xff);
871 /* Not-present must mean unusable. */
872 if ( !reg->attr.fields.p )
873 attr |= (1u << 16);
875 /* VMX has strict consistency requirement for flag G. */
876 attr |= !!(limit >> 20) << 15;
878 vmx_vmcs_enter(v);
880 switch ( seg )
881 {
882 case x86_seg_cs:
883 __vmwrite(GUEST_CS_SELECTOR, sel);
884 __vmwrite(GUEST_CS_LIMIT, limit);
885 __vmwrite(GUEST_CS_BASE, base);
886 __vmwrite(GUEST_CS_AR_BYTES, attr);
887 break;
888 case x86_seg_ds:
889 __vmwrite(GUEST_DS_SELECTOR, sel);
890 __vmwrite(GUEST_DS_LIMIT, limit);
891 __vmwrite(GUEST_DS_BASE, base);
892 __vmwrite(GUEST_DS_AR_BYTES, attr);
893 break;
894 case x86_seg_es:
895 __vmwrite(GUEST_ES_SELECTOR, sel);
896 __vmwrite(GUEST_ES_LIMIT, limit);
897 __vmwrite(GUEST_ES_BASE, base);
898 __vmwrite(GUEST_ES_AR_BYTES, attr);
899 break;
900 case x86_seg_fs:
901 __vmwrite(GUEST_FS_SELECTOR, sel);
902 __vmwrite(GUEST_FS_LIMIT, limit);
903 __vmwrite(GUEST_FS_BASE, base);
904 __vmwrite(GUEST_FS_AR_BYTES, attr);
905 break;
906 case x86_seg_gs:
907 __vmwrite(GUEST_GS_SELECTOR, sel);
908 __vmwrite(GUEST_GS_LIMIT, limit);
909 __vmwrite(GUEST_GS_BASE, base);
910 __vmwrite(GUEST_GS_AR_BYTES, attr);
911 break;
912 case x86_seg_ss:
913 __vmwrite(GUEST_SS_SELECTOR, sel);
914 __vmwrite(GUEST_SS_LIMIT, limit);
915 __vmwrite(GUEST_SS_BASE, base);
916 __vmwrite(GUEST_SS_AR_BYTES, attr);
917 break;
918 case x86_seg_tr:
919 __vmwrite(GUEST_TR_SELECTOR, sel);
920 __vmwrite(GUEST_TR_LIMIT, limit);
921 __vmwrite(GUEST_TR_BASE, base);
922 /* VMX checks that the busy flag (bit 1) is set. */
923 __vmwrite(GUEST_TR_AR_BYTES, attr | 2);
924 break;
925 case x86_seg_gdtr:
926 __vmwrite(GUEST_GDTR_LIMIT, limit);
927 __vmwrite(GUEST_GDTR_BASE, base);
928 break;
929 case x86_seg_idtr:
930 __vmwrite(GUEST_IDTR_LIMIT, limit);
931 __vmwrite(GUEST_IDTR_BASE, base);
932 break;
933 case x86_seg_ldtr:
934 __vmwrite(GUEST_LDTR_SELECTOR, sel);
935 __vmwrite(GUEST_LDTR_LIMIT, limit);
936 __vmwrite(GUEST_LDTR_BASE, base);
937 __vmwrite(GUEST_LDTR_AR_BYTES, attr);
938 break;
939 default:
940 BUG();
941 }
943 vmx_vmcs_exit(v);
944 }
946 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
947 {
948 vmx_vmcs_enter(v);
949 __vmwrite(TSC_OFFSET, offset);
950 #if defined (__i386__)
951 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
952 #endif
953 vmx_vmcs_exit(v);
954 }
956 static void vmx_set_rdtsc_exiting(struct vcpu *v, bool_t enable)
957 {
958 vmx_vmcs_enter(v);
959 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_RDTSC_EXITING;
960 if ( enable )
961 v->arch.hvm_vmx.exec_control |= CPU_BASED_RDTSC_EXITING;
962 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
963 vmx_vmcs_exit(v);
964 }
966 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
967 {
968 char *p;
969 int i;
971 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
972 {
973 p = (char *)(hypercall_page + (i * 32));
974 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
975 *(u32 *)(p + 1) = i;
976 *(u8 *)(p + 5) = 0x0f; /* vmcall */
977 *(u8 *)(p + 6) = 0x01;
978 *(u8 *)(p + 7) = 0xc1;
979 *(u8 *)(p + 8) = 0xc3; /* ret */
980 }
982 /* Don't support HYPERVISOR_iret at the moment */
983 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
984 }
986 static unsigned int vmx_get_interrupt_shadow(struct vcpu *v)
987 {
988 return __vmread(GUEST_INTERRUPTIBILITY_INFO);
989 }
991 static void vmx_set_interrupt_shadow(struct vcpu *v, unsigned int intr_shadow)
992 {
993 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow);
994 }
996 static void vmx_load_pdptrs(struct vcpu *v)
997 {
998 unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3], mfn;
999 uint64_t *guest_pdptrs;
1000 p2m_type_t p2mt;
1001 char *p;
1003 /* EPT needs to load PDPTRS into VMCS for PAE. */
1004 if ( !hvm_pae_enabled(v) || (v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
1005 return;
1007 if ( cr3 & 0x1fUL )
1008 goto crash;
1010 mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
1011 if ( !p2m_is_ram(p2mt) )
1012 goto crash;
1014 p = map_domain_page(mfn);
1016 guest_pdptrs = (uint64_t *)(p + (cr3 & ~PAGE_MASK));
1018 /*
1019 * We do not check the PDPTRs for validity. The CPU will do this during
1020 * vm entry, and we can handle the failure there and crash the guest.
1021 * The only thing we could do better here is #GP instead.
1022 */
1024 vmx_vmcs_enter(v);
1026 __vmwrite(GUEST_PDPTR0, guest_pdptrs[0]);
1027 __vmwrite(GUEST_PDPTR1, guest_pdptrs[1]);
1028 __vmwrite(GUEST_PDPTR2, guest_pdptrs[2]);
1029 __vmwrite(GUEST_PDPTR3, guest_pdptrs[3]);
1030 #ifdef __i386__
1031 __vmwrite(GUEST_PDPTR0_HIGH, guest_pdptrs[0] >> 32);
1032 __vmwrite(GUEST_PDPTR1_HIGH, guest_pdptrs[1] >> 32);
1033 __vmwrite(GUEST_PDPTR2_HIGH, guest_pdptrs[2] >> 32);
1034 __vmwrite(GUEST_PDPTR3_HIGH, guest_pdptrs[3] >> 32);
1035 #endif
1037 vmx_vmcs_exit(v);
1039 unmap_domain_page(p);
1040 return;
1042 crash:
1043 domain_crash(v->domain);
1046 static void vmx_update_host_cr3(struct vcpu *v)
1048 vmx_vmcs_enter(v);
1049 __vmwrite(HOST_CR3, v->arch.cr3);
1050 vmx_vmcs_exit(v);
1053 void vmx_update_debug_state(struct vcpu *v)
1055 unsigned long intercepts, mask;
1057 ASSERT(v == current);
1059 mask = 1u << TRAP_int3;
1060 if ( !cpu_has_monitor_trap_flag )
1061 mask |= 1u << TRAP_debug;
1063 intercepts = __vmread(EXCEPTION_BITMAP);
1064 if ( v->arch.hvm_vcpu.debug_state_latch )
1065 intercepts |= mask;
1066 else
1067 intercepts &= ~mask;
1068 __vmwrite(EXCEPTION_BITMAP, intercepts);
1071 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
1073 vmx_vmcs_enter(v);
1075 switch ( cr )
1077 case 0: {
1078 int realmode;
1079 unsigned long hw_cr0_mask = X86_CR0_NE;
1081 if ( !vmx_unrestricted_guest(v) )
1082 hw_cr0_mask |= X86_CR0_PG | X86_CR0_PE;
1084 if ( paging_mode_shadow(v->domain) )
1085 hw_cr0_mask |= X86_CR0_WP;
1087 if ( paging_mode_hap(v->domain) )
1089 /* We manage GUEST_CR3 when guest CR0.PE is zero. */
1090 uint32_t cr3_ctls = (CPU_BASED_CR3_LOAD_EXITING |
1091 CPU_BASED_CR3_STORE_EXITING);
1092 v->arch.hvm_vmx.exec_control &= ~cr3_ctls;
1093 if ( !hvm_paging_enabled(v) )
1094 v->arch.hvm_vmx.exec_control |= cr3_ctls;
1095 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
1097 /* Changing CR0.PE can change some bits in real CR4. */
1098 vmx_update_guest_cr(v, 4);
1101 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1103 if ( v != current )
1104 hw_cr0_mask |= X86_CR0_TS;
1105 else if ( v->arch.hvm_vcpu.hw_cr[0] & X86_CR0_TS )
1106 vmx_fpu_enter(v);
1109 realmode = !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE);
1111 if ( (!vmx_unrestricted_guest(v)) &&
1112 (realmode != v->arch.hvm_vmx.vmx_realmode) )
1114 enum x86_segment s;
1115 struct segment_register reg[x86_seg_tr + 1];
1117 /* Entering or leaving real mode: adjust the segment registers.
1118 * Need to read them all either way, as realmode reads can update
1119 * the saved values we'll use when returning to prot mode. */
1120 for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
1121 vmx_get_segment_register(v, s, &reg[s]);
1122 v->arch.hvm_vmx.vmx_realmode = realmode;
1124 if ( realmode )
1126 for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
1127 vmx_set_segment_register(v, s, &reg[s]);
1128 v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
1129 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1130 __vmwrite(EXCEPTION_BITMAP, 0xffffffff);
1132 else
1134 for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
1135 if ( !(v->arch.hvm_vmx.vm86_segment_mask & (1<<s)) )
1136 vmx_set_segment_register(
1137 v, s, &v->arch.hvm_vmx.vm86_saved_seg[s]);
1138 v->arch.hvm_vcpu.hw_cr[4] =
1139 ((v->arch.hvm_vcpu.hw_cr[4] & ~X86_CR4_VME)
1140 |(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VME));
1141 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1142 __vmwrite(EXCEPTION_BITMAP,
1143 HVM_TRAP_MASK
1144 | (paging_mode_hap(v->domain) ?
1145 0 : (1U << TRAP_page_fault))
1146 | (1U << TRAP_no_device));
1147 vmx_update_debug_state(v);
1151 v->arch.hvm_vcpu.hw_cr[0] =
1152 v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask;
1153 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1154 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
1155 break;
1157 case 2:
1158 /* CR2 is updated in exit stub. */
1159 break;
1160 case 3:
1161 if ( paging_mode_hap(v->domain) )
1163 if ( !hvm_paging_enabled(v) )
1164 v->arch.hvm_vcpu.hw_cr[3] =
1165 v->domain->arch.hvm_domain.params[HVM_PARAM_IDENT_PT];
1166 vmx_load_pdptrs(v);
1169 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]);
1170 hvm_asid_flush_vcpu(v);
1171 break;
1172 case 4:
1173 v->arch.hvm_vcpu.hw_cr[4] = HVM_CR4_HOST_MASK;
1174 if ( paging_mode_hap(v->domain) )
1175 v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
1176 v->arch.hvm_vcpu.hw_cr[4] |= v->arch.hvm_vcpu.guest_cr[4];
1177 if ( v->arch.hvm_vmx.vmx_realmode )
1178 v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
1179 if ( paging_mode_hap(v->domain) && !hvm_paging_enabled(v) )
1181 v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE;
1182 v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
1184 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1185 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]);
1186 break;
1187 default:
1188 BUG();
1191 vmx_vmcs_exit(v);
1194 static void vmx_update_guest_efer(struct vcpu *v)
1196 #ifdef __x86_64__
1197 unsigned long vm_entry_value;
1199 vmx_vmcs_enter(v);
1201 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1202 if ( v->arch.hvm_vcpu.guest_efer & EFER_LMA )
1203 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1204 else
1205 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1206 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1208 vmx_vmcs_exit(v);
1209 #endif
1211 if ( v == current )
1212 write_efer((read_efer() & ~EFER_SCE) |
1213 (v->arch.hvm_vcpu.guest_efer & EFER_SCE));
1216 static void __ept_sync_domain(void *info)
1218 struct domain *d = info;
1219 __invept(1, d->arch.hvm_domain.vmx.ept_control.eptp, 0);
1222 void ept_sync_domain(struct domain *d)
1224 /* Only if using EPT and this domain has some VCPUs to dirty. */
1225 if ( !d->arch.hvm_domain.hap_enabled || !d->vcpu || !d->vcpu[0] )
1226 return;
1228 ASSERT(local_irq_is_enabled());
1229 ASSERT(p2m_locked_by_me(d->arch.p2m));
1231 /*
1232 * Flush active cpus synchronously. Flush others the next time this domain
1233 * is scheduled onto them. We accept the race of other CPUs adding to
1234 * the ept_synced mask before on_selected_cpus() reads it, resulting in
1235 * unnecessary extra flushes, to avoid allocating a cpumask_t on the stack.
1236 */
1237 d->arch.hvm_domain.vmx.ept_synced = d->domain_dirty_cpumask;
1238 on_selected_cpus(&d->arch.hvm_domain.vmx.ept_synced,
1239 __ept_sync_domain, d, 1);
1242 static void __vmx_inject_exception(int trap, int type, int error_code)
1244 unsigned long intr_fields;
1245 struct vcpu *curr = current;
1247 /*
1248 * NB. Callers do not need to worry about clearing STI/MOV-SS blocking:
1249 * "If the VM entry is injecting, there is no blocking by STI or by
1250 * MOV SS following the VM entry, regardless of the contents of the
1251 * interruptibility-state field [in the guest-state area before the
1252 * VM entry]", PRM Vol. 3, 22.6.1 (Interruptibility State).
1253 */
1255 intr_fields = (INTR_INFO_VALID_MASK | (type<<8) | trap);
1256 if ( error_code != HVM_DELIVER_NO_ERROR_CODE ) {
1257 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1258 intr_fields |= INTR_INFO_DELIVER_CODE_MASK;
1261 __vmwrite(VM_ENTRY_INTR_INFO, intr_fields);
1263 /* Can't inject exceptions in virtual 8086 mode because they would
1264 * use the protected-mode IDT. Emulate at the next vmenter instead. */
1265 if ( curr->arch.hvm_vmx.vmx_realmode )
1266 curr->arch.hvm_vmx.vmx_emulate = 1;
1269 void vmx_inject_hw_exception(int trap, int error_code)
1271 unsigned long intr_info = __vmread(VM_ENTRY_INTR_INFO);
1272 struct vcpu *curr = current;
1274 switch ( trap )
1276 case TRAP_debug:
1277 if ( guest_cpu_user_regs()->eflags & X86_EFLAGS_TF )
1279 __restore_debug_registers(curr);
1280 write_debugreg(6, read_debugreg(6) | 0x4000);
1282 if ( cpu_has_monitor_trap_flag )
1283 break;
1284 case TRAP_int3:
1285 if ( curr->domain->debugger_attached )
1287 /* Debug/Int3: Trap to debugger. */
1288 domain_pause_for_debugger();
1289 return;
1293 if ( unlikely(intr_info & INTR_INFO_VALID_MASK) &&
1294 (((intr_info >> 8) & 7) == X86_EVENTTYPE_HW_EXCEPTION) )
1296 trap = hvm_combine_hw_exceptions((uint8_t)intr_info, trap);
1297 if ( trap == TRAP_double_fault )
1298 error_code = 0;
1301 __vmx_inject_exception(trap, X86_EVENTTYPE_HW_EXCEPTION, error_code);
1303 if ( trap == TRAP_page_fault )
1304 HVMTRACE_LONG_2D(PF_INJECT, error_code,
1305 TRC_PAR_LONG(current->arch.hvm_vcpu.guest_cr[2]));
1306 else
1307 HVMTRACE_2D(INJ_EXC, trap, error_code);
1310 void vmx_inject_extint(int trap)
1312 __vmx_inject_exception(trap, X86_EVENTTYPE_EXT_INTR,
1313 HVM_DELIVER_NO_ERROR_CODE);
1316 void vmx_inject_nmi(void)
1318 __vmx_inject_exception(2, X86_EVENTTYPE_NMI,
1319 HVM_DELIVER_NO_ERROR_CODE);
1322 static void vmx_inject_exception(
1323 unsigned int trapnr, int errcode, unsigned long cr2)
1325 if ( trapnr == TRAP_page_fault )
1326 current->arch.hvm_vcpu.guest_cr[2] = cr2;
1328 vmx_inject_hw_exception(trapnr, errcode);
1331 static int vmx_event_pending(struct vcpu *v)
1333 ASSERT(v == current);
1334 return (__vmread(VM_ENTRY_INTR_INFO) & INTR_INFO_VALID_MASK);
1337 static int vmx_do_pmu_interrupt(struct cpu_user_regs *regs)
1339 return vpmu_do_interrupt(regs);
1342 static void vmx_set_uc_mode(struct vcpu *v)
1344 if ( paging_mode_hap(v->domain) )
1345 ept_change_entry_emt_with_range(
1346 v->domain, 0, v->domain->arch.p2m->max_mapped_pfn);
1347 hvm_asid_flush_vcpu(v);
1350 static void vmx_set_info_guest(struct vcpu *v)
1352 unsigned long intr_shadow;
1354 vmx_vmcs_enter(v);
1356 __vmwrite(GUEST_DR7, v->arch.guest_context.debugreg[7]);
1358 /*
1359 * If the interruptibility-state field indicates blocking by STI,
1360 * setting the TF flag in the EFLAGS may cause VM entry to fail
1361 * and crash the guest. See SDM 3B 22.3.1.5.
1362 * Resetting the VMX_INTR_SHADOW_STI flag looks hackish, but
1363 * setting GUEST_PENDING_DBG_EXCEPTIONS.BS here would incur an
1364 * immediate vmexit and hence make no progress.
1365 */
1366 intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1367 if ( v->domain->debugger_attached &&
1368 (v->arch.guest_context.user_regs.eflags & X86_EFLAGS_TF) &&
1369 (intr_shadow & VMX_INTR_SHADOW_STI) )
1371 intr_shadow &= ~VMX_INTR_SHADOW_STI;
1372 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow);
1375 vmx_vmcs_exit(v);
1378 static struct hvm_function_table __read_mostly vmx_function_table = {
1379 .name = "VMX",
1380 .cpu_prepare = vmx_cpu_prepare,
1381 .domain_initialise = vmx_domain_initialise,
1382 .domain_destroy = vmx_domain_destroy,
1383 .vcpu_initialise = vmx_vcpu_initialise,
1384 .vcpu_destroy = vmx_vcpu_destroy,
1385 .save_cpu_ctxt = vmx_save_vmcs_ctxt,
1386 .load_cpu_ctxt = vmx_load_vmcs_ctxt,
1387 .get_interrupt_shadow = vmx_get_interrupt_shadow,
1388 .set_interrupt_shadow = vmx_set_interrupt_shadow,
1389 .guest_x86_mode = vmx_guest_x86_mode,
1390 .get_segment_register = vmx_get_segment_register,
1391 .set_segment_register = vmx_set_segment_register,
1392 .update_host_cr3 = vmx_update_host_cr3,
1393 .update_guest_cr = vmx_update_guest_cr,
1394 .update_guest_efer = vmx_update_guest_efer,
1395 .set_tsc_offset = vmx_set_tsc_offset,
1396 .inject_exception = vmx_inject_exception,
1397 .init_hypercall_page = vmx_init_hypercall_page,
1398 .event_pending = vmx_event_pending,
1399 .do_pmu_interrupt = vmx_do_pmu_interrupt,
1400 .cpu_up = vmx_cpu_up,
1401 .cpu_down = vmx_cpu_down,
1402 .cpuid_intercept = vmx_cpuid_intercept,
1403 .wbinvd_intercept = vmx_wbinvd_intercept,
1404 .fpu_dirty_intercept = vmx_fpu_dirty_intercept,
1405 .msr_read_intercept = vmx_msr_read_intercept,
1406 .msr_write_intercept = vmx_msr_write_intercept,
1407 .invlpg_intercept = vmx_invlpg_intercept,
1408 .set_uc_mode = vmx_set_uc_mode,
1409 .set_info_guest = vmx_set_info_guest,
1410 .set_rdtsc_exiting = vmx_set_rdtsc_exiting
1411 };
1413 void start_vmx(void)
1415 static bool_t bootstrapped;
1417 vmx_save_host_msrs();
1419 if ( test_and_set_bool(bootstrapped) )
1421 if ( hvm_enabled && !vmx_cpu_up() )
1423 printk("VMX: FATAL: failed to initialise CPU%d!\n",
1424 smp_processor_id());
1425 BUG();
1427 return;
1430 /* Xen does not fill x86_capability words except 0. */
1431 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1433 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1434 return;
1436 set_in_cr4(X86_CR4_VMXE);
1438 if ( !vmx_cpu_up() )
1440 printk("VMX: failed to initialise.\n");
1441 return;
1444 if ( cpu_has_vmx_ept )
1445 vmx_function_table.hap_supported = 1;
1447 setup_vmcs_dump();
1449 hvm_enable(&vmx_function_table);
1452 /*
1453 * Not all cases receive a valid value in the VM-exit instruction length field.
1454 * Callers must know what they're doing!
1455 */
1456 static int __get_instruction_length(void)
1458 int len;
1459 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1460 BUG_ON((len < 1) || (len > 15));
1461 return len;
1464 static void __update_guest_eip(unsigned long inst_len)
1466 struct cpu_user_regs *regs = guest_cpu_user_regs();
1467 unsigned long x;
1469 regs->eip += inst_len;
1470 regs->eflags &= ~X86_EFLAGS_RF;
1472 x = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1473 if ( x & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) )
1475 x &= ~(VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS);
1476 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, x);
1479 if ( regs->eflags & X86_EFLAGS_TF )
1480 vmx_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);
1483 static void vmx_fpu_dirty_intercept(void)
1485 struct vcpu *curr = current;
1487 vmx_fpu_enter(curr);
1489 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1490 if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1492 curr->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS;
1493 __vmwrite(GUEST_CR0, curr->arch.hvm_vcpu.hw_cr[0]);
1497 #define bitmaskof(idx) (1U << ((idx) & 31))
1498 static void vmx_cpuid_intercept(
1499 unsigned int *eax, unsigned int *ebx,
1500 unsigned int *ecx, unsigned int *edx)
1502 unsigned int input = *eax;
1503 struct segment_register cs;
1504 struct vcpu *v = current;
1506 hvm_cpuid(input, eax, ebx, ecx, edx);
1508 switch ( input )
1510 case 0x80000001:
1511 /* SYSCALL is visible iff running in long mode. */
1512 hvm_get_segment_register(v, x86_seg_cs, &cs);
1513 if ( cs.attr.fields.l )
1514 *edx |= bitmaskof(X86_FEATURE_SYSCALL);
1515 else
1516 *edx &= ~(bitmaskof(X86_FEATURE_SYSCALL));
1518 break;
1521 HVMTRACE_5D (CPUID, input, *eax, *ebx, *ecx, *edx);
1524 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1526 unsigned int eax, ebx, ecx, edx;
1528 eax = regs->eax;
1529 ebx = regs->ebx;
1530 ecx = regs->ecx;
1531 edx = regs->edx;
1533 vmx_cpuid_intercept(&eax, &ebx, &ecx, &edx);
1535 regs->eax = eax;
1536 regs->ebx = ebx;
1537 regs->ecx = ecx;
1538 regs->edx = edx;
1541 static void vmx_dr_access(unsigned long exit_qualification,
1542 struct cpu_user_regs *regs)
1544 struct vcpu *v = current;
1546 HVMTRACE_0D(DR_WRITE);
1548 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
1549 __restore_debug_registers(v);
1551 /* Allow guest direct access to DR registers */
1552 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1553 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
1556 static void vmx_invlpg_intercept(unsigned long vaddr)
1558 struct vcpu *curr = current;
1559 HVMTRACE_LONG_2D(INVLPG, /*invlpga=*/ 0, TRC_PAR_LONG(vaddr));
1560 if ( paging_invlpg(curr, vaddr) && cpu_has_vmx_vpid )
1561 vpid_sync_vcpu_gva(curr, vaddr);
1564 #define CASE_SET_REG(REG, reg) \
1565 case VMX_CONTROL_REG_ACCESS_GPR_ ## REG: regs->reg = value; break
1566 #define CASE_GET_REG(REG, reg) \
1567 case VMX_CONTROL_REG_ACCESS_GPR_ ## REG: value = regs->reg; break
1569 #define CASE_EXTEND_SET_REG \
1570 CASE_EXTEND_REG(S)
1571 #define CASE_EXTEND_GET_REG \
1572 CASE_EXTEND_REG(G)
1574 #ifdef __i386__
1575 #define CASE_EXTEND_REG(T)
1576 #else
1577 #define CASE_EXTEND_REG(T) \
1578 CASE_ ## T ## ET_REG(R8, r8); \
1579 CASE_ ## T ## ET_REG(R9, r9); \
1580 CASE_ ## T ## ET_REG(R10, r10); \
1581 CASE_ ## T ## ET_REG(R11, r11); \
1582 CASE_ ## T ## ET_REG(R12, r12); \
1583 CASE_ ## T ## ET_REG(R13, r13); \
1584 CASE_ ## T ## ET_REG(R14, r14); \
1585 CASE_ ## T ## ET_REG(R15, r15)
1586 #endif
1588 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
1590 unsigned long value;
1591 struct vcpu *v = current;
1592 struct vlapic *vlapic = vcpu_vlapic(v);
1594 switch ( gp )
1596 CASE_GET_REG(EAX, eax);
1597 CASE_GET_REG(ECX, ecx);
1598 CASE_GET_REG(EDX, edx);
1599 CASE_GET_REG(EBX, ebx);
1600 CASE_GET_REG(EBP, ebp);
1601 CASE_GET_REG(ESI, esi);
1602 CASE_GET_REG(EDI, edi);
1603 CASE_GET_REG(ESP, esp);
1604 CASE_EXTEND_GET_REG;
1605 default:
1606 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
1607 goto exit_and_crash;
1610 HVMTRACE_LONG_2D(CR_WRITE, cr, TRC_PAR_LONG(value));
1612 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
1614 switch ( cr )
1616 case 0:
1617 return !hvm_set_cr0(value);
1619 case 3:
1620 return !hvm_set_cr3(value);
1622 case 4:
1623 return !hvm_set_cr4(value);
1625 case 8:
1626 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
1627 break;
1629 default:
1630 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
1631 goto exit_and_crash;
1634 return 1;
1636 exit_and_crash:
1637 domain_crash(v->domain);
1638 return 0;
1641 /*
1642 * Read from control registers. CR0 and CR4 are read from the shadow.
1643 */
1644 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1646 unsigned long value = 0;
1647 struct vcpu *v = current;
1648 struct vlapic *vlapic = vcpu_vlapic(v);
1650 switch ( cr )
1652 case 3:
1653 value = (unsigned long)v->arch.hvm_vcpu.guest_cr[3];
1654 break;
1655 case 8:
1656 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
1657 value = (value & 0xF0) >> 4;
1658 break;
1659 default:
1660 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
1661 domain_crash(v->domain);
1662 break;
1665 switch ( gp ) {
1666 CASE_SET_REG(EAX, eax);
1667 CASE_SET_REG(ECX, ecx);
1668 CASE_SET_REG(EDX, edx);
1669 CASE_SET_REG(EBX, ebx);
1670 CASE_SET_REG(EBP, ebp);
1671 CASE_SET_REG(ESI, esi);
1672 CASE_SET_REG(EDI, edi);
1673 CASE_SET_REG(ESP, esp);
1674 CASE_EXTEND_SET_REG;
1675 default:
1676 printk("invalid gp: %d\n", gp);
1677 domain_crash(v->domain);
1678 break;
1681 HVMTRACE_LONG_2D(CR_READ, cr, TRC_PAR_LONG(value));
1683 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
1686 static int vmx_cr_access(unsigned long exit_qualification,
1687 struct cpu_user_regs *regs)
1689 unsigned int gp, cr;
1690 unsigned long value;
1691 struct vcpu *v = current;
1693 switch ( exit_qualification & VMX_CONTROL_REG_ACCESS_TYPE )
1695 case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
1696 gp = exit_qualification & VMX_CONTROL_REG_ACCESS_GPR;
1697 cr = exit_qualification & VMX_CONTROL_REG_ACCESS_NUM;
1698 return mov_to_cr(gp, cr, regs);
1699 case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
1700 gp = exit_qualification & VMX_CONTROL_REG_ACCESS_GPR;
1701 cr = exit_qualification & VMX_CONTROL_REG_ACCESS_NUM;
1702 mov_from_cr(cr, gp, regs);
1703 break;
1704 case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
1705 v->arch.hvm_vcpu.guest_cr[0] &= ~X86_CR0_TS;
1706 vmx_update_guest_cr(v, 0);
1707 HVMTRACE_0D(CLTS);
1708 break;
1709 case VMX_CONTROL_REG_ACCESS_TYPE_LMSW:
1710 value = v->arch.hvm_vcpu.guest_cr[0];
1711 /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */
1712 value = (value & ~0xe) | ((exit_qualification >> 16) & 0xf);
1713 HVMTRACE_LONG_1D(LMSW, value);
1714 return !hvm_set_cr0(value);
1715 default:
1716 BUG();
1719 return 1;
1722 static const struct lbr_info {
1723 u32 base, count;
1724 } p4_lbr[] = {
1725 { MSR_P4_LER_FROM_LIP, 1 },
1726 { MSR_P4_LER_TO_LIP, 1 },
1727 { MSR_P4_LASTBRANCH_TOS, 1 },
1728 { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
1729 { MSR_P4_LASTBRANCH_0_TO_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
1730 { 0, 0 }
1731 }, c2_lbr[] = {
1732 { MSR_IA32_LASTINTFROMIP, 1 },
1733 { MSR_IA32_LASTINTTOIP, 1 },
1734 { MSR_C2_LASTBRANCH_TOS, 1 },
1735 { MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
1736 { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
1737 { 0, 0 }
1738 #ifdef __i386__
1739 }, pm_lbr[] = {
1740 { MSR_IA32_LASTINTFROMIP, 1 },
1741 { MSR_IA32_LASTINTTOIP, 1 },
1742 { MSR_PM_LASTBRANCH_TOS, 1 },
1743 { MSR_PM_LASTBRANCH_0, NUM_MSR_PM_LASTBRANCH },
1744 { 0, 0 }
1745 #endif
1746 };
1748 static const struct lbr_info *last_branch_msr_get(void)
1750 switch ( boot_cpu_data.x86 )
1752 case 6:
1753 switch ( boot_cpu_data.x86_model )
1755 #ifdef __i386__
1756 /* PentiumM */
1757 case 9: case 13:
1758 /* Core Solo/Duo */
1759 case 14:
1760 return pm_lbr;
1761 break;
1762 #endif
1763 /* Core2 Duo */
1764 case 15:
1765 return c2_lbr;
1766 break;
1768 break;
1770 case 15:
1771 switch ( boot_cpu_data.x86_model )
1773 /* Pentium4/Xeon with em64t */
1774 case 3: case 4: case 6:
1775 return p4_lbr;
1776 break;
1778 break;
1781 return NULL;
1784 static int is_last_branch_msr(u32 ecx)
1786 const struct lbr_info *lbr = last_branch_msr_get();
1788 if ( lbr == NULL )
1789 return 0;
1791 for ( ; lbr->count; lbr++ )
1792 if ( (ecx >= lbr->base) && (ecx < (lbr->base + lbr->count)) )
1793 return 1;
1795 return 0;
1798 static int vmx_msr_read_intercept(struct cpu_user_regs *regs)
1800 u64 msr_content = 0;
1801 u32 ecx = regs->ecx, eax, edx;
1803 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);
1805 switch ( ecx )
1807 case MSR_IA32_SYSENTER_CS:
1808 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
1809 break;
1810 case MSR_IA32_SYSENTER_ESP:
1811 msr_content = __vmread(GUEST_SYSENTER_ESP);
1812 break;
1813 case MSR_IA32_SYSENTER_EIP:
1814 msr_content = __vmread(GUEST_SYSENTER_EIP);
1815 break;
1816 case MSR_IA32_DEBUGCTLMSR:
1817 msr_content = __vmread(GUEST_IA32_DEBUGCTL);
1818 #ifdef __i386__
1819 msr_content |= (u64)__vmread(GUEST_IA32_DEBUGCTL_HIGH) << 32;
1820 #endif
1821 break;
1822 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
1823 goto gp_fault;
1824 case MSR_IA32_MISC_ENABLE:
1825 rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
1826 /* Debug Trace Store is not supported. */
1827 msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
1828 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
1829 break;
1830 default:
1831 if ( vpmu_do_rdmsr(regs) )
1832 goto done;
1833 if ( passive_domain_do_rdmsr(regs) )
1834 goto done;
1835 switch ( long_mode_do_msr_read(regs) )
1837 case HNDL_unhandled:
1838 break;
1839 case HNDL_exception_raised:
1840 return X86EMUL_EXCEPTION;
1841 case HNDL_done:
1842 goto done;
1845 if ( vmx_read_guest_msr(ecx, &msr_content) == 0 )
1846 break;
1848 if ( is_last_branch_msr(ecx) )
1850 msr_content = 0;
1851 break;
1854 if ( rdmsr_viridian_regs(ecx, &msr_content) ||
1855 rdmsr_hypervisor_regs(ecx, &msr_content) )
1856 break;
1858 if ( rdmsr_safe(ecx, eax, edx) == 0 )
1860 msr_content = ((uint64_t)edx << 32) | eax;
1861 break;
1864 goto gp_fault;
1867 regs->eax = (uint32_t)msr_content;
1868 regs->edx = (uint32_t)(msr_content >> 32);
1870 done:
1871 HVMTRACE_3D (MSR_READ, ecx, regs->eax, regs->edx);
1872 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
1873 ecx, (unsigned long)regs->eax,
1874 (unsigned long)regs->edx);
1875 return X86EMUL_OKAY;
1877 gp_fault:
1878 vmx_inject_hw_exception(TRAP_gp_fault, 0);
1879 return X86EMUL_EXCEPTION;
1882 static int vmx_alloc_vlapic_mapping(struct domain *d)
1884 void *apic_va;
1886 if ( !cpu_has_vmx_virtualize_apic_accesses )
1887 return 0;
1889 apic_va = alloc_xenheap_page();
1890 if ( apic_va == NULL )
1891 return -ENOMEM;
1892 share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable);
1893 set_mmio_p2m_entry(
1894 d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(virt_to_mfn(apic_va)));
1895 d->arch.hvm_domain.vmx.apic_access_mfn = virt_to_mfn(apic_va);
1897 return 0;
1900 static void vmx_free_vlapic_mapping(struct domain *d)
1902 unsigned long mfn = d->arch.hvm_domain.vmx.apic_access_mfn;
1903 if ( mfn != 0 )
1904 free_xenheap_page(mfn_to_virt(mfn));
1907 static void vmx_install_vlapic_mapping(struct vcpu *v)
1909 paddr_t virt_page_ma, apic_page_ma;
1911 if ( !cpu_has_vmx_virtualize_apic_accesses )
1912 return;
1914 virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
1915 apic_page_ma = v->domain->arch.hvm_domain.vmx.apic_access_mfn;
1916 apic_page_ma <<= PAGE_SHIFT;
1918 vmx_vmcs_enter(v);
1919 __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
1920 __vmwrite(APIC_ACCESS_ADDR, apic_page_ma);
1921 vmx_vmcs_exit(v);
1924 void vmx_vlapic_msr_changed(struct vcpu *v)
1926 struct vlapic *vlapic = vcpu_vlapic(v);
1927 uint32_t ctl;
1929 if ( !cpu_has_vmx_virtualize_apic_accesses )
1930 return;
1932 vmx_vmcs_enter(v);
1933 ctl = __vmread(SECONDARY_VM_EXEC_CONTROL);
1934 ctl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1935 if ( !vlapic_hw_disabled(vlapic) &&
1936 (vlapic_base_address(vlapic) == APIC_DEFAULT_PHYS_BASE) )
1937 ctl |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1938 __vmwrite(SECONDARY_VM_EXEC_CONTROL, ctl);
1939 vmx_vmcs_exit(v);
1942 static int vmx_msr_write_intercept(struct cpu_user_regs *regs)
1944 u32 ecx = regs->ecx;
1945 u64 msr_content;
1946 struct vcpu *v = current;
1948 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
1949 ecx, (u32)regs->eax, (u32)regs->edx);
1951 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
1953 HVMTRACE_3D (MSR_WRITE, ecx, regs->eax, regs->edx);
1955 switch ( ecx )
1957 case MSR_IA32_SYSENTER_CS:
1958 __vmwrite(GUEST_SYSENTER_CS, msr_content);
1959 break;
1960 case MSR_IA32_SYSENTER_ESP:
1961 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
1962 break;
1963 case MSR_IA32_SYSENTER_EIP:
1964 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
1965 break;
1966 case MSR_IA32_DEBUGCTLMSR: {
1967 int i, rc = 0;
1969 if ( !msr_content || (msr_content & ~3) )
1970 break;
1972 if ( msr_content & 1 )
1974 const struct lbr_info *lbr = last_branch_msr_get();
1975 if ( lbr == NULL )
1976 break;
1978 for ( ; (rc == 0) && lbr->count; lbr++ )
1979 for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
1980 if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 )
1981 vmx_disable_intercept_for_msr(v, lbr->base + i);
1984 if ( (rc < 0) ||
1985 (vmx_add_host_load_msr(ecx) < 0) )
1986 vmx_inject_hw_exception(TRAP_machine_check, 0);
1987 else
1989 __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
1990 #ifdef __i386__
1991 __vmwrite(GUEST_IA32_DEBUGCTL_HIGH, msr_content >> 32);
1992 #endif
1995 break;
1997 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
1998 goto gp_fault;
1999 default:
2000 if ( vpmu_do_wrmsr(regs) )
2001 return X86EMUL_OKAY;
2002 if ( passive_domain_do_wrmsr(regs) )
2003 return X86EMUL_OKAY;
2005 if ( wrmsr_viridian_regs(ecx, msr_content) )
2006 break;
2008 switch ( long_mode_do_msr_write(regs) )
2010 case HNDL_unhandled:
2011 if ( (vmx_write_guest_msr(ecx, msr_content) != 0) &&
2012 !is_last_branch_msr(ecx) )
2013 wrmsr_hypervisor_regs(ecx, msr_content);
2014 break;
2015 case HNDL_exception_raised:
2016 return X86EMUL_EXCEPTION;
2017 case HNDL_done:
2018 break;
2020 break;
2023 return X86EMUL_OKAY;
2025 gp_fault:
2026 vmx_inject_hw_exception(TRAP_gp_fault, 0);
2027 return X86EMUL_EXCEPTION;
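/*
 * A host interrupt arrived while the guest was running. Decode the
 * vector from the exit interruption information and dispatch it to the
 * matching Xen handler; vectors not listed here go through do_IRQ().
 */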
2030 static void vmx_do_extint(struct cpu_user_regs *regs)
2032 unsigned int vector;
2034 vector = __vmread(VM_EXIT_INTR_INFO);
2035 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
2037 vector &= INTR_INFO_VECTOR_MASK;
2038 HVMTRACE_1D(INTR, vector);
2040 switch ( vector )
2042 case IRQ_MOVE_CLEANUP_VECTOR:
2043 smp_irq_move_cleanup_interrupt(regs);
2044 break;
2045 case LOCAL_TIMER_VECTOR:
2046 smp_apic_timer_interrupt(regs);
2047 break;
2048 case EVENT_CHECK_VECTOR:
2049 smp_event_check_interrupt(regs);
2050 break;
2051 case INVALIDATE_TLB_VECTOR:
2052 smp_invalidate_interrupt();
2053 break;
2054 case CALL_FUNCTION_VECTOR:
2055 smp_call_function_interrupt(regs);
2056 break;
2057 case SPURIOUS_APIC_VECTOR:
2058 smp_spurious_interrupt(regs);
2059 break;
2060 case ERROR_APIC_VECTOR:
2061 smp_error_interrupt(regs);
2062 break;
2063 case CMCI_APIC_VECTOR:
2064 smp_cmci_interrupt(regs);
2065 break;
2066 case PMU_APIC_VECTOR:
2067 smp_pmu_apic_interrupt(regs);
2068 break;
2069 #ifdef CONFIG_X86_MCE_THERMAL
2070 case THERMAL_APIC_VECTOR:
2071 smp_thermal_interrupt(regs);
2072 break;
2073 #endif
2074 default:
2075 regs->entry_vector = vector;
2076 do_IRQ(regs);
2077 break;
2081 static void wbinvd_ipi(void *info)
2083 wbinvd();
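/*
 * Guest cache flushes only need real work when the domain has physical
 * devices assigned; otherwise the intercept is ignored. With WBINVD
 * exiting available the flush is broadcast to every CPU by IPI,
 * otherwise only the local CPU's caches are written back.
 */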
2086 static void vmx_wbinvd_intercept(void)
2088 if ( !has_arch_pdevs(current->domain) )
2089 return;
2091 if ( cpu_has_wbinvd_exiting )
2092 on_each_cpu(wbinvd_ipi, NULL, 1);
2093 else
2094 wbinvd();
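/*
 * EPT violation handler. The exit qualification describes the attempted
 * access and the effective permissions of the p2m entry, and flags
 * whether a guest-linear address was recorded. The generic HAP nested
 * page fault handler gets the first chance to fix up the p2m entry;
 * anything it declines is logged (including an EPT walk) and the domain
 * is crashed.
 */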
2097 static void ept_handle_violation(unsigned long qualification, paddr_t gpa)
2099 unsigned long gla, gfn = gpa >> PAGE_SHIFT;
2100 mfn_t mfn;
2101 p2m_type_t p2mt;
2103 if ( tb_init_done )
2105 struct {
2106 uint64_t gpa;
2107 uint64_t mfn;
2108 u32 qualification;
2109 u32 p2mt;
2110 } _d;
2112 _d.gpa = gpa;
2113 _d.qualification = qualification;
2114 _d.mfn = mfn_x(gfn_to_mfn_query(current->domain, gfn, &_d.p2mt));
2116 __trace_var(TRC_HVM_NPF, 0, sizeof(_d), (unsigned char *)&_d);
2119 if ( (qualification & EPT_GLA_VALID) &&
2120 hvm_hap_nested_page_fault(gfn) )
2121 return;
2123 /* Everything else is an error. */
2124 mfn = gfn_to_mfn_type_current(gfn, &p2mt, p2m_guest);
2125 gdprintk(XENLOG_ERR, "EPT violation %#lx (%c%c%c/%c%c%c), "
2126 "gpa %#"PRIpaddr", mfn %#lx, type %i.\n",
2127 qualification,
2128 (qualification & EPT_READ_VIOLATION) ? 'r' : '-',
2129 (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-',
2130 (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-',
2131 (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-',
2132 (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
2133 (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-',
2134 gpa, mfn_x(mfn), p2mt);
2136 ept_walk_table(current->domain, gfn);
2138 if ( qualification & EPT_GLA_VALID )
2140 gla = __vmread(GUEST_LINEAR_ADDRESS);
2141 gdprintk(XENLOG_ERR, " --- GLA %#lx\n", gla);
2144 if ( qualification & EPT_GAW_VIOLATION )
2145 gdprintk(XENLOG_ERR, " --- GPA too wide (max %u bits)\n",
2146 9 * (unsigned int)current->domain->arch.hvm_domain.
2147 vmx.ept_control.gaw + 21);
2149 domain_crash(current->domain);
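/*
 * VM entry itself failed. Decode the basic exit reason, print the exit
 * qualification and a full VMCS dump for diagnosis, then crash the
 * domain; there is nothing to recover at this point.
 */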
2152 static void vmx_failed_vmentry(unsigned int exit_reason,
2153 struct cpu_user_regs *regs)
2155 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2156 unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
2157 struct vcpu *curr = current;
2159 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2160 switch ( failed_vmentry_reason )
2162 case EXIT_REASON_INVALID_GUEST_STATE:
2163 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2164 break;
2165 case EXIT_REASON_MSR_LOADING:
2166 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2167 break;
2168 case EXIT_REASON_MCE_DURING_VMENTRY:
2169 printk("caused by machine check.\n");
2170 HVMTRACE_0D(MCE);
2171 /* Already handled. */
2172 break;
2173 default:
2174 printk("reason not known yet!");
2175 break;
2178 printk("************* VMCS Area **************\n");
2179 vmcs_dump_vcpu(curr);
2180 printk("**************************************\n");
2182 domain_crash(curr->domain);
2185 asmlinkage void vmx_enter_realmode(struct cpu_user_regs *regs)
2187 struct vcpu *v = current;
2189 /* Adjust RFLAGS to enter virtual 8086 mode with IOPL == 3. Since
2190 * we have CR4.VME == 1 and our own TSS with an empty interrupt
2191 * redirection bitmap, all software INTs will be handled by vm86. */
2192 v->arch.hvm_vmx.vm86_saved_eflags = regs->eflags;
2193 regs->eflags |= (X86_EFLAGS_VM | X86_EFLAGS_IOPL);
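/*
 * An intercepted #UD is handed to the instruction emulator. If the
 * emulator cannot handle the instruction, #UD is reflected back to the
 * guest; if the emulation itself raised an exception, that exception is
 * injected instead.
 */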
2196 static void vmx_vmexit_ud_intercept(struct cpu_user_regs *regs)
2198 struct hvm_emulate_ctxt ctxt;
2199 int rc;
2201 hvm_emulate_prepare(&ctxt, regs);
2203 rc = hvm_emulate_one(&ctxt);
2205 switch ( rc )
2207 case X86EMUL_UNHANDLEABLE:
2208 vmx_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
2209 break;
2210 case X86EMUL_EXCEPTION:
2211 if ( ctxt.exn_pending )
2212 hvm_inject_exception(ctxt.exn_vector, ctxt.exn_error_code, 0);
2213 /* fall through */
2214 default:
2215 hvm_emulate_writeback(&ctxt);
2216 break;
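/*
 * Fast path for APIC-access exits: a linear-mode data write to the
 * vlapic EOI register is completed here (advance RIP, perform the EOI)
 * without going through full MMIO emulation. Returns 1 if the access
 * was handled.
 */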
2220 static int vmx_handle_eoi_write(void)
2222 unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
2224 /*
2225 * 1. Must be a linear access data write.
2226 * 2. Data write must be to the EOI register.
2227 */
2228 if ( (((exit_qualification >> 12) & 0xf) == 1) &&
2229 ((exit_qualification & 0xfff) == APIC_EOI) )
2231 int inst_len = __get_instruction_length(); /* Safe: APIC data write */
2232 __update_guest_eip(inst_len);
2233 vlapic_EOI_set(vcpu_vlapic(current));
2234 return 1;
2237 return 0;
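/*
 * Validate a guest XSETBV: it must come from CPL 0, may only set
 * feature bits the host exposes, must keep x87 state (bit 0) enabled,
 * and may not enable YMM state without SSE state. Any violation is
 * answered with #GP(0).
 */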
2240 static int vmx_handle_xsetbv(u64 new_bv)
2242 struct vcpu *v = current;
2243 u64 xfeature = (((u64)xfeature_high) << 32) | xfeature_low;
2244 struct segment_register sreg;
2246 hvm_get_segment_register(v, x86_seg_ss, &sreg);
2247 if ( sreg.attr.fields.dpl != 0 )
2248 goto err;
2250 if ( ((new_bv ^ xfeature) & ~xfeature) || !(new_bv & 1) )
2251 goto err;
2253 if ( (xfeature & XSTATE_YMM & new_bv) && !(new_bv & XSTATE_SSE) )
2254 goto err;
2256 v->arch.hvm_vcpu.xfeature_mask = new_bv;
2257 set_xcr0(new_bv);
2258 return 0;
2259 err:
2260 vmx_inject_hw_exception(TRAP_gp_fault, 0);
2261 return -1;
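/*
 * Top-level VM exit dispatcher. External interrupts and machine checks
 * are serviced first, while host interrupts are still disabled; only
 * then are interrupts re-enabled, failed VM entries and real-mode
 * emulation cases filtered out, and the remaining exit reasons
 * dispatched by the switch below.
 */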
2264 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2266 unsigned int exit_reason, idtv_info, intr_info = 0, vector = 0;
2267 unsigned long exit_qualification, inst_len = 0;
2268 struct vcpu *v = current;
2270 if ( paging_mode_hap(v->domain) && hvm_paging_enabled(v) )
2271 v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3] =
2272 __vmread(GUEST_CR3);
2274 exit_reason = __vmread(VM_EXIT_REASON);
2276 if ( hvm_long_mode_enabled(v) )
2277 HVMTRACE_ND(VMEXIT64, 1/*cycles*/, 3, exit_reason,
2278 (uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32),
2279 0, 0, 0);
2280 else
2281 HVMTRACE_ND(VMEXIT, 1/*cycles*/, 2, exit_reason,
2282 (uint32_t)regs->eip,
2283 0, 0, 0, 0);
2285 perfc_incra(vmexits, exit_reason);
2287 /* Handle the interrupt we missed before allowing any more in. */
2288 switch ( (uint16_t)exit_reason )
2290 case EXIT_REASON_EXTERNAL_INTERRUPT:
2291 vmx_do_extint(regs);
2292 break;
2293 case EXIT_REASON_EXCEPTION_NMI:
2294 intr_info = __vmread(VM_EXIT_INTR_INFO);
2295 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2296 vector = intr_info & INTR_INFO_VECTOR_MASK;
2297 if ( vector == TRAP_machine_check )
2298 do_machine_check(regs);
2299 break;
2300 case EXIT_REASON_MCE_DURING_VMENTRY:
2301 do_machine_check(regs);
2302 break;
2305 /* Now enable interrupts so it's safe to take locks. */
2306 local_irq_enable();
2308 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2309 return vmx_failed_vmentry(exit_reason, regs);
2311 if ( v->arch.hvm_vmx.vmx_realmode )
2313 /* Put RFLAGS back the way the guest wants it */
2314 regs->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IOPL);
2315 regs->eflags |= (v->arch.hvm_vmx.vm86_saved_eflags & X86_EFLAGS_IOPL);
2317 /* Unless this exit was for an interrupt, we've hit something
2318 * vm86 can't handle. Try again, using the emulator. */
2319 switch ( exit_reason )
2321 case EXIT_REASON_EXCEPTION_NMI:
2322 if ( vector != TRAP_page_fault
2323 && vector != TRAP_nmi
2324 && vector != TRAP_machine_check )
2326 perfc_incr(realmode_exits);
2327 v->arch.hvm_vmx.vmx_emulate = 1;
2328 return;
2330 case EXIT_REASON_EXTERNAL_INTERRUPT:
2331 case EXIT_REASON_INIT:
2332 case EXIT_REASON_SIPI:
2333 case EXIT_REASON_PENDING_VIRT_INTR:
2334 case EXIT_REASON_PENDING_VIRT_NMI:
2335 case EXIT_REASON_MCE_DURING_VMENTRY:
2336 break;
2337 default:
2338 v->arch.hvm_vmx.vmx_emulate = 1;
2339 perfc_incr(realmode_exits);
2340 return;
2344 hvm_maybe_deassert_evtchn_irq();
2346 /* Event delivery caused this intercept? Queue for redelivery. */
2347 idtv_info = __vmread(IDT_VECTORING_INFO);
2348 if ( unlikely(idtv_info & INTR_INFO_VALID_MASK) &&
2349 (exit_reason != EXIT_REASON_TASK_SWITCH) )
2351 if ( hvm_event_needs_reinjection((idtv_info>>8)&7, idtv_info&0xff) )
2353 /* See SDM 3B 25.7.1.1 and .2 for info about masking resvd bits. */
2354 __vmwrite(VM_ENTRY_INTR_INFO,
2355 idtv_info & ~INTR_INFO_RESVD_BITS_MASK);
2356 if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK )
2357 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE,
2358 __vmread(IDT_VECTORING_ERROR_CODE));
2361 /*
2362 * Clear NMI-blocking interruptibility info if an NMI delivery faulted.
2363 * Re-delivery will re-set it (see SDM 3B 25.7.1.2).
2364 */
2365 if ( (idtv_info & INTR_INFO_INTR_TYPE_MASK) == (X86_EVENTTYPE_NMI<<8) )
2366 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2367 __vmread(GUEST_INTERRUPTIBILITY_INFO) &
2368 ~VMX_INTR_SHADOW_NMI);
2371 switch ( exit_reason )
2373 case EXIT_REASON_EXCEPTION_NMI:
2375 /*
2376 * Software-interrupt (INT n) exiting is not enabled, so an exit here
2377 * is caused either by (1) a hardware exception (e.g. #PF) raised in
2378 * the guest, or (2) an NMI.
2379 */
2381 /*
2382 * Re-set the NMI shadow if vmexit caused by a guest IRET fault (see 3B
2383 * 25.7.1.2, "Resuming Guest Software after Handling an Exception").
2384 * (NB. If we emulate this IRET for any reason, we should re-clear!)
2385 */
2386 if ( unlikely(intr_info & INTR_INFO_NMI_UNBLOCKED_BY_IRET) &&
2387 !(idtv_info & INTR_INFO_VALID_MASK) &&
2388 (vector != TRAP_double_fault) )
2389 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2390 __vmread(GUEST_INTERRUPTIBILITY_INFO)
2391 | VMX_INTR_SHADOW_NMI);
2393 perfc_incra(cause_vector, vector);
2395 switch ( vector )
2397 case TRAP_debug:
2398 /*
2399 * Update DR6 so the debugger can peek at it (see SDM 3B 23.2.1,
2400 * Table 23-1, "Exit Qualification for Debug Exceptions").
2401 */
2402 exit_qualification = __vmread(EXIT_QUALIFICATION);
2403 write_debugreg(6, exit_qualification | 0xffff0ff0);
2404 if ( !v->domain->debugger_attached || cpu_has_monitor_trap_flag )
2405 goto exit_and_crash;
2406 domain_pause_for_debugger();
2407 break;
2408 case TRAP_int3:
2409 if ( !v->domain->debugger_attached )
2410 goto exit_and_crash;
2411 inst_len = __get_instruction_length(); /* Safe: INT3 */
2412 __update_guest_eip(inst_len);
2413 #ifdef XEN_GDBSX_CONFIG
2414 current->arch.gdbsx_vcpu_event = TRAP_int3;
2415 #endif
2416 domain_pause_for_debugger();
2417 break;
2418 case TRAP_no_device:
2419 vmx_fpu_dirty_intercept();
2420 break;
2421 case TRAP_page_fault:
2422 exit_qualification = __vmread(EXIT_QUALIFICATION);
2423 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2425 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2426 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2427 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2428 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2429 (unsigned long)regs->esi, (unsigned long)regs->edi);
2431 if ( paging_fault(exit_qualification, regs) )
2433 if ( trace_will_trace_event(TRC_SHADOW) )
2434 break;
2435 if ( hvm_long_mode_enabled(v) )
2436 HVMTRACE_LONG_2D(PF_XEN, regs->error_code,
2437 TRC_PAR_LONG(exit_qualification) );
2438 else
2439 HVMTRACE_2D(PF_XEN,
2440 regs->error_code, exit_qualification );
2441 break;
2444 v->arch.hvm_vcpu.guest_cr[2] = exit_qualification;
2445 vmx_inject_hw_exception(TRAP_page_fault, regs->error_code);
2446 break;
2447 case TRAP_nmi:
2448 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) !=
2449 (X86_EVENTTYPE_NMI << 8) )
2450 goto exit_and_crash;
2451 HVMTRACE_0D(NMI);
2452 self_nmi(); /* Real NMI, vector 2: normal processing. */
2453 break;
2454 case TRAP_machine_check:
2455 HVMTRACE_0D(MCE);
2456 /* Already handled above. */
2457 break;
2458 case TRAP_invalid_op:
2459 vmx_vmexit_ud_intercept(regs);
2460 break;
2461 default:
2462 goto exit_and_crash;
2464 break;
2466 case EXIT_REASON_EXTERNAL_INTERRUPT:
2467 /* Already handled above. */
2468 break;
2469 case EXIT_REASON_TRIPLE_FAULT:
2470 hvm_triple_fault();
2471 break;
2472 case EXIT_REASON_PENDING_VIRT_INTR:
2473 /* Disable the interrupt window. */
2474 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2475 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2476 v->arch.hvm_vmx.exec_control);
2477 break;
2478 case EXIT_REASON_PENDING_VIRT_NMI:
2479 /* Disable the NMI window. */
2480 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2481 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2482 v->arch.hvm_vmx.exec_control);
2483 break;
2484 case EXIT_REASON_TASK_SWITCH: {
2485 const enum hvm_task_switch_reason reasons[] = {
2486 TSW_call_or_int, TSW_iret, TSW_jmp, TSW_call_or_int };
2487 int32_t ecode = -1, source;
2488 exit_qualification = __vmread(EXIT_QUALIFICATION);
2489 source = (exit_qualification >> 30) & 3;
2490 /* Vectored event should fill in interrupt information. */
2491 WARN_ON((source == 3) && !(idtv_info & INTR_INFO_VALID_MASK));
2492 /*
2493 * In the following cases there is an instruction to skip over:
2494 * - TSW is due to a CALL, IRET or JMP instruction.
2495 * - TSW is a vectored event due to a SW exception or SW interrupt.
2496 */
2497 inst_len = ((source != 3) || /* CALL, IRET, or JMP? */
2498 (idtv_info & (1u<<10))) /* IntrType > 3? */
2499 ? __get_instruction_length() /* Safe: SDM 3B 23.2.4 */ : 0;
2500 if ( (source == 3) && (idtv_info & INTR_INFO_DELIVER_CODE_MASK) )
2501 ecode = __vmread(IDT_VECTORING_ERROR_CODE);
2502 regs->eip += inst_len;
2503 hvm_task_switch((uint16_t)exit_qualification, reasons[source], ecode);
2504 break;
2506 case EXIT_REASON_CPUID:
2507 inst_len = __get_instruction_length(); /* Safe: CPUID */
2508 __update_guest_eip(inst_len);
2509 vmx_do_cpuid(regs);
2510 break;
2511 case EXIT_REASON_HLT:
2512 inst_len = __get_instruction_length(); /* Safe: HLT */
2513 __update_guest_eip(inst_len);
2514 hvm_hlt(regs->eflags);
2515 break;
2516 case EXIT_REASON_INVLPG:
2517 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2518 __update_guest_eip(inst_len);
2519 exit_qualification = __vmread(EXIT_QUALIFICATION);
2520 vmx_invlpg_intercept(exit_qualification);
2521 break;
2522 case EXIT_REASON_RDTSCP:
2523 regs->ecx = hvm_msr_tsc_aux(v);
2524 /* fall through */
2525 case EXIT_REASON_RDTSC:
2526 inst_len = __get_instruction_length();
2527 __update_guest_eip(inst_len);
2528 hvm_rdtsc_intercept(regs);
2529 break;
2530 case EXIT_REASON_VMCALL:
2532 int rc;
2533 HVMTRACE_1D(VMMCALL, regs->eax);
2534 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2535 rc = hvm_do_hypercall(regs);
2536 if ( rc != HVM_HCALL_preempted )
2538 __update_guest_eip(inst_len);
2539 if ( rc == HVM_HCALL_invalidate )
2540 send_invalidate_req();
2542 break;
2544 case EXIT_REASON_CR_ACCESS:
2546 exit_qualification = __vmread(EXIT_QUALIFICATION);
2547 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2548 if ( vmx_cr_access(exit_qualification, regs) )
2549 __update_guest_eip(inst_len);
2550 break;
2552 case EXIT_REASON_DR_ACCESS:
2553 exit_qualification = __vmread(EXIT_QUALIFICATION);
2554 vmx_dr_access(exit_qualification, regs);
2555 break;
2556 case EXIT_REASON_MSR_READ:
2557 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2558 if ( hvm_msr_read_intercept(regs) == X86EMUL_OKAY )
2559 __update_guest_eip(inst_len);
2560 break;
2561 case EXIT_REASON_MSR_WRITE:
2562 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2563 if ( hvm_msr_write_intercept(regs) == X86EMUL_OKAY )
2564 __update_guest_eip(inst_len);
2565 break;
2567 case EXIT_REASON_MWAIT_INSTRUCTION:
2568 case EXIT_REASON_MONITOR_INSTRUCTION:
2569 case EXIT_REASON_VMCLEAR:
2570 case EXIT_REASON_VMLAUNCH:
2571 case EXIT_REASON_VMPTRLD:
2572 case EXIT_REASON_VMPTRST:
2573 case EXIT_REASON_VMREAD:
2574 case EXIT_REASON_VMRESUME:
2575 case EXIT_REASON_VMWRITE:
2576 case EXIT_REASON_VMXOFF:
2577 case EXIT_REASON_VMXON:
2578 vmx_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
2579 break;
2581 case EXIT_REASON_TPR_BELOW_THRESHOLD:
2582 break;
2584 case EXIT_REASON_APIC_ACCESS:
2585 if ( !vmx_handle_eoi_write() && !handle_mmio() )
2586 vmx_inject_hw_exception(TRAP_gp_fault, 0);
2587 break;
2589 case EXIT_REASON_IO_INSTRUCTION:
2590 if ( !handle_mmio() )
2591 vmx_inject_hw_exception(TRAP_gp_fault, 0);
2592 break;
2594 case EXIT_REASON_INVD:
2595 case EXIT_REASON_WBINVD:
2597 inst_len = __get_instruction_length(); /* Safe: INVD, WBINVD */
2598 __update_guest_eip(inst_len);
2599 vmx_wbinvd_intercept();
2600 break;
2603 case EXIT_REASON_EPT_VIOLATION:
2605 paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
2606 #ifdef __i386__
2607 gpa |= (paddr_t)__vmread(GUEST_PHYSICAL_ADDRESS_HIGH) << 32;
2608 #endif
2609 exit_qualification = __vmread(EXIT_QUALIFICATION);
2610 ept_handle_violation(exit_qualification, gpa);
2611 break;
2614 case EXIT_REASON_MONITOR_TRAP_FLAG:
2615 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
2616 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
2617 if ( v->domain->debugger_attached && v->arch.hvm_vcpu.single_step )
2618 domain_pause_for_debugger();
2619 break;
2621 case EXIT_REASON_PAUSE_INSTRUCTION:
2622 perfc_incr(pauseloop_exits);
2623 do_sched_op_compat(SCHEDOP_yield, 0);
2624 break;
2626 case EXIT_REASON_XSETBV:
2628 u64 new_bv = (((u64)regs->edx) << 32) | regs->eax;
2629 if ( vmx_handle_xsetbv(new_bv) == 0 )
2631 inst_len = __get_instruction_length();
2632 __update_guest_eip(inst_len);
2634 break;
2637 default:
2638 exit_and_crash:
2639 gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
2640 domain_crash(v->domain);
2641 break;
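/*
 * Runs on the VM entry path. With VPID support, refresh this vcpu's
 * ASID, mirror it into VIRTUAL_PROCESSOR_ID, enable or disable the VPID
 * execution control as the ASID moves between zero and non-zero, and do
 * a global VPID flush when a new ASID generation requires one.
 */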
2645 asmlinkage void vmx_vmenter_helper(void)
2647 struct vcpu *curr = current;
2648 u32 new_asid, old_asid;
2649 bool_t need_flush;
2651 if ( !cpu_has_vmx_vpid )
2652 goto out;
2654 old_asid = curr->arch.hvm_vcpu.asid;
2655 need_flush = hvm_asid_handle_vmenter();
2656 new_asid = curr->arch.hvm_vcpu.asid;
2658 if ( unlikely(new_asid != old_asid) )
2660 __vmwrite(VIRTUAL_PROCESSOR_ID, new_asid);
2661 if ( !old_asid && new_asid )
2663 /* VPID was disabled: now enabled. */
2664 curr->arch.hvm_vmx.secondary_exec_control |=
2665 SECONDARY_EXEC_ENABLE_VPID;
2666 __vmwrite(SECONDARY_VM_EXEC_CONTROL,
2667 curr->arch.hvm_vmx.secondary_exec_control);
2669 else if ( old_asid && !new_asid )
2671 /* VPID was enabled: now disabled. */
2672 curr->arch.hvm_vmx.secondary_exec_control &=
2673 ~SECONDARY_EXEC_ENABLE_VPID;
2674 __vmwrite(SECONDARY_VM_EXEC_CONTROL,
2675 curr->arch.hvm_vmx.secondary_exec_control);
2679 if ( unlikely(need_flush) )
2680 vpid_sync_all();
2682 out:
2683 HVMTRACE_ND (VMENTRY, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
2686 /*
2687 * Local variables:
2688 * mode: C
2689 * c-set-style: "BSD"
2690 * c-basic-offset: 4
2691 * tab-width: 4
2692 * indent-tabs-mode: nil
2693 * End:
2694 */