view xen/arch/x86/hvm/vmx/vmx.c @ 20688:0cb22e32795c (debuggers.hg)

changeset:     x86_32: Fix build after RDTSCP and memory hotplug changes.
author:        Keir Fraser <keir.fraser@citrix.com>
date:          Mon Dec 14 09:48:47 2009 +0000 (2009-12-14)
signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
parents:       c61953922215
children:      976d679b04fb
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 */
19 #include <xen/config.h>
20 #include <xen/init.h>
21 #include <xen/lib.h>
22 #include <xen/trace.h>
23 #include <xen/sched.h>
24 #include <xen/irq.h>
25 #include <xen/softirq.h>
26 #include <xen/domain_page.h>
27 #include <xen/hypercall.h>
28 #include <xen/perfc.h>
29 #include <asm/current.h>
30 #include <asm/io.h>
31 #include <asm/regs.h>
32 #include <asm/cpufeature.h>
33 #include <asm/processor.h>
34 #include <asm/types.h>
35 #include <asm/debugreg.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/paging.h>
39 #include <asm/p2m.h>
40 #include <asm/hvm/emulate.h>
41 #include <asm/hvm/hvm.h>
42 #include <asm/hvm/support.h>
43 #include <asm/hvm/vmx/vmx.h>
44 #include <asm/hvm/vmx/vmcs.h>
45 #include <public/sched.h>
46 #include <public/hvm/ioreq.h>
47 #include <asm/hvm/vpic.h>
48 #include <asm/hvm/vlapic.h>
49 #include <asm/x86_emulate.h>
50 #include <asm/hvm/vpt.h>
51 #include <public/hvm/save.h>
52 #include <asm/hvm/trace.h>
53 #include <asm/xenoprof.h>
54 #include <asm/debugger.h>
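/*
 * Return codes used by the MSR access helpers below: HNDL_done means the
 * access was fully handled here, HNDL_unhandled means the caller should fall
 * back to its generic MSR handling, and HNDL_exception_raised means a fault
 * has already been injected into the guest.
 */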
56 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
58 static void vmx_ctxt_switch_from(struct vcpu *v);
59 static void vmx_ctxt_switch_to(struct vcpu *v);
61 static int vmx_alloc_vlapic_mapping(struct domain *d);
62 static void vmx_free_vlapic_mapping(struct domain *d);
63 static void vmx_install_vlapic_mapping(struct vcpu *v);
64 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr);
65 static void vmx_update_guest_efer(struct vcpu *v);
66 static void vmx_cpuid_intercept(
67 unsigned int *eax, unsigned int *ebx,
68 unsigned int *ecx, unsigned int *edx);
69 static void vmx_wbinvd_intercept(void);
70 static void vmx_fpu_dirty_intercept(void);
71 static int vmx_msr_read_intercept(struct cpu_user_regs *regs);
72 static int vmx_msr_write_intercept(struct cpu_user_regs *regs);
73 static void vmx_invlpg_intercept(unsigned long vaddr);
74 static void __ept_sync_domain(void *info);
76 static int vmx_domain_initialise(struct domain *d)
77 {
78 int rc;
80 d->arch.hvm_domain.vmx.ept_control.etmt = EPT_DEFAULT_MT;
81 d->arch.hvm_domain.vmx.ept_control.gaw = EPT_DEFAULT_GAW;
82 d->arch.hvm_domain.vmx.ept_control.asr =
83 pagetable_get_pfn(d->arch.phys_table);
86 if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 )
87 return rc;
89 return 0;
90 }
92 static void vmx_domain_destroy(struct domain *d)
93 {
94 if ( d->arch.hvm_domain.hap_enabled )
95 on_each_cpu(__ept_sync_domain, d, 1);
96 vmx_free_vlapic_mapping(d);
97 }
99 static int vmx_vcpu_initialise(struct vcpu *v)
100 {
101 int rc;
103 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
105 v->arch.schedule_tail = vmx_do_resume;
106 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
107 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
109 if ( (rc = vmx_create_vmcs(v)) != 0 )
110 {
111 dprintk(XENLOG_WARNING,
112 "Failed to create VMCS for vcpu %d: err=%d.\n",
113 v->vcpu_id, rc);
114 return rc;
115 }
117 vpmu_initialise(v);
119 vmx_install_vlapic_mapping(v);
121 /* %eax == 1 signals full real-mode support to the guest loader. */
122 if ( v->vcpu_id == 0 )
123 v->arch.guest_context.user_regs.eax = 1;
125 return 0;
126 }
128 static void vmx_vcpu_destroy(struct vcpu *v)
129 {
130 vmx_destroy_vmcs(v);
131 vpmu_destroy(v);
132 passive_domain_destroy(v);
133 }
135 #ifdef __x86_64__
137 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
139 static u32 msr_index[] =
140 {
141 MSR_LSTAR, MSR_STAR, MSR_SYSCALL_MASK
142 };
144 #define MSR_INDEX_SIZE (ARRAY_SIZE(msr_index))
146 static void vmx_save_host_msrs(void)
147 {
148 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
149 int i;
151 /*
152 * If a new MSR needs to be added to msr_index[] and the VMX_INDEX_MSR_***
153 * enum, note that the elements of msr_index[] and the VMX_INDEX_MSR_***
154 * enum are not the same. Currently we only save three MSRs (MSR_LSTAR,
155 * MSR_STAR, and MSR_SYSCALL_MASK) into host state.
156 */
157 BUILD_BUG_ON(MSR_INDEX_SIZE != VMX_INDEX_MSR_TSC_AUX ||
158 VMX_INDEX_MSR_TSC_AUX != VMX_MSR_COUNT - 1);
159 for ( i = 0; i < MSR_INDEX_SIZE; i++ )
160 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
162 if ( cpu_has_rdtscp )
163 rdmsrl(MSR_TSC_AUX, host_msr_state->msrs[VMX_INDEX_MSR_TSC_AUX]);
164 }
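/*
 * WRITE_MSR(address), used by long_mode_do_msr_write() below, caches the new
 * value in the guest's vmx_msr_state, marks the MSR dirty in both the guest
 * and host flag masks (so vmx_restore_guest_msrs()/vmx_restore_host_msrs()
 * know what to reload on context switch), writes the hardware MSR, and ends
 * the enclosing switch case with a break.
 */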
166 #define WRITE_MSR(address) \
167 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
168 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
169 wrmsrl(MSR_ ## address, msr_content); \
170 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
171 break
173 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
174 {
175 u64 msr_content = 0;
176 u32 ecx = regs->ecx;
177 struct vcpu *v = current;
178 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
180 switch ( ecx )
181 {
182 case MSR_EFER:
183 msr_content = v->arch.hvm_vcpu.guest_efer;
184 break;
186 case MSR_FS_BASE:
187 msr_content = __vmread(GUEST_FS_BASE);
188 break;
190 case MSR_GS_BASE:
191 msr_content = __vmread(GUEST_GS_BASE);
192 break;
194 case MSR_SHADOW_GS_BASE:
195 rdmsrl(MSR_SHADOW_GS_BASE, msr_content);
196 break;
198 case MSR_STAR:
199 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
200 break;
202 case MSR_LSTAR:
203 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
204 break;
206 case MSR_CSTAR:
207 msr_content = v->arch.hvm_vmx.cstar;
208 break;
210 case MSR_SYSCALL_MASK:
211 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
212 break;
214 case MSR_TSC_AUX:
215 if ( cpu_has_rdtscp )
216 {
217 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_TSC_AUX];
218 break;
219 }
220 else
221 {
222 HVM_DBG_LOG(DBG_LEVEL_0, "Reading from nonexistent MSR 0x%x\n",
223 ecx);
224 vmx_inject_hw_exception(TRAP_gp_fault, 0);
225 return HNDL_exception_raised;
226 }
229 default:
230 return HNDL_unhandled;
231 }
233 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
235 regs->eax = (u32)(msr_content >> 0);
236 regs->edx = (u32)(msr_content >> 32);
238 return HNDL_done;
239 }
241 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
242 {
243 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
244 u32 ecx = regs->ecx;
245 struct vcpu *v = current;
246 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
247 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
249 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
251 switch ( ecx )
252 {
253 case MSR_EFER:
254 if ( hvm_set_efer(msr_content) )
255 goto exception_raised;
256 break;
258 case MSR_FS_BASE:
259 case MSR_GS_BASE:
260 case MSR_SHADOW_GS_BASE:
261 if ( !is_canonical_address(msr_content) )
262 goto uncanonical_address;
264 if ( ecx == MSR_FS_BASE )
265 __vmwrite(GUEST_FS_BASE, msr_content);
266 else if ( ecx == MSR_GS_BASE )
267 __vmwrite(GUEST_GS_BASE, msr_content);
268 else
269 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
271 break;
273 case MSR_STAR:
274 WRITE_MSR(STAR);
276 case MSR_LSTAR:
277 if ( !is_canonical_address(msr_content) )
278 goto uncanonical_address;
279 WRITE_MSR(LSTAR);
281 case MSR_CSTAR:
282 if ( !is_canonical_address(msr_content) )
283 goto uncanonical_address;
284 v->arch.hvm_vmx.cstar = msr_content;
285 break;
287 case MSR_SYSCALL_MASK:
288 WRITE_MSR(SYSCALL_MASK);
290 case MSR_TSC_AUX:
291 if ( cpu_has_rdtscp )
292 {
293 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
294 guest_state->msrs[VMX_INDEX_MSR_TSC_AUX] = msr_content;
295 wrmsrl(MSR_TSC_AUX, msr_content);
296 }
297 else
298 {
299 HVM_DBG_LOG(DBG_LEVEL_0, "Writing to nonexistent MSR 0x%x\n", ecx);
300 vmx_inject_hw_exception(TRAP_gp_fault, 0);
301 return HNDL_exception_raised;
302 }
304 default:
305 return HNDL_unhandled;
306 }
308 return HNDL_done;
310 uncanonical_address:
311 HVM_DBG_LOG(DBG_LEVEL_0, "Non-canonical address for MSR write 0x%x", ecx);
312 vmx_inject_hw_exception(TRAP_gp_fault, 0);
313 exception_raised:
314 return HNDL_exception_raised;
315 }
317 /*
318 * To avoid MSR save/restore at every VM exit/entry time, we restore
319 * the x86_64 specific MSRs at domain switch time. Since these MSRs
320 * are not modified once set for para domains, we don't save them,
321 * but simply reset them to values set in percpu_traps_init().
322 */
323 static void vmx_restore_host_msrs(void)
324 {
325 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
326 int i;
328 while ( host_msr_state->flags )
329 {
330 i = find_first_set_bit(host_msr_state->flags);
331 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
332 clear_bit(i, &host_msr_state->flags);
333 }
335 if ( cpu_has_rdtscp )
336 wrmsrl(MSR_TSC_AUX, host_msr_state->msrs[VMX_INDEX_MSR_TSC_AUX]);
337 }
339 static void vmx_save_guest_msrs(struct vcpu *v)
340 {
341 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
342 /*
343 * We cannot cache SHADOW_GS_BASE while the VCPU runs, as it can
344 * be updated at any time via SWAPGS, which we cannot trap.
345 */
346 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
347 if ( cpu_has_rdtscp )
348 rdmsrl(MSR_TSC_AUX, guest_msr_state->msrs[VMX_INDEX_MSR_TSC_AUX]);
349 }
351 static void vmx_restore_guest_msrs(struct vcpu *v)
352 {
353 struct vmx_msr_state *guest_msr_state, *host_msr_state;
354 unsigned long guest_flags;
355 int i;
357 guest_msr_state = &v->arch.hvm_vmx.msr_state;
358 host_msr_state = &this_cpu(host_msr_state);
360 wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
362 guest_flags = guest_msr_state->flags;
364 while ( guest_flags )
365 {
366 i = find_first_set_bit(guest_flags);
368 HVM_DBG_LOG(DBG_LEVEL_2,
369 "restore guest's index %d msr %x with value %lx",
370 i, msr_index[i], guest_msr_state->msrs[i]);
371 set_bit(i, &host_msr_state->flags);
372 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
373 clear_bit(i, &guest_flags);
374 }
376 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & EFER_SCE )
377 {
378 HVM_DBG_LOG(DBG_LEVEL_2,
379 "restore guest's EFER with value %lx",
380 v->arch.hvm_vcpu.guest_efer);
381 write_efer((read_efer() & ~EFER_SCE) |
382 (v->arch.hvm_vcpu.guest_efer & EFER_SCE));
383 }
385 if ( cpu_has_rdtscp )
386 wrmsrl(MSR_TSC_AUX, guest_msr_state->msrs[VMX_INDEX_MSR_TSC_AUX]);
387 }
389 #else /* __i386__ */
391 #define vmx_save_host_msrs() ((void)0)
392 #define vmx_restore_host_msrs() ((void)0)
394 #define vmx_save_guest_msrs(v) ((void)0)
395 #define vmx_restore_guest_msrs(v) ((void)0)
397 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
398 {
399 u64 msr_content = 0;
400 struct vcpu *v = current;
402 switch ( regs->ecx )
403 {
404 case MSR_EFER:
405 msr_content = v->arch.hvm_vcpu.guest_efer;
406 break;
408 default:
409 return HNDL_unhandled;
410 }
412 regs->eax = msr_content >> 0;
413 regs->edx = msr_content >> 32;
415 return HNDL_done;
416 }
418 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
419 {
420 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
422 switch ( regs->ecx )
423 {
424 case MSR_EFER:
425 if ( hvm_set_efer(msr_content) )
426 return HNDL_exception_raised;
427 break;
429 default:
430 return HNDL_unhandled;
431 }
433 return HNDL_done;
434 }
436 #endif /* __i386__ */
438 static int vmx_guest_x86_mode(struct vcpu *v)
439 {
440 unsigned int cs_ar_bytes;
442 if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
443 return 0;
444 if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
445 return 1;
446 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
447 if ( hvm_long_mode_enabled(v) &&
448 likely(cs_ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
449 return 8;
450 return (likely(cs_ar_bytes & X86_SEG_AR_DEF_OP_SIZE) ? 4 : 2);
451 }
453 static void vmx_save_dr(struct vcpu *v)
454 {
455 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
456 return;
458 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
459 v->arch.hvm_vcpu.flag_dr_dirty = 0;
460 v->arch.hvm_vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
461 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
463 v->arch.guest_context.debugreg[0] = read_debugreg(0);
464 v->arch.guest_context.debugreg[1] = read_debugreg(1);
465 v->arch.guest_context.debugreg[2] = read_debugreg(2);
466 v->arch.guest_context.debugreg[3] = read_debugreg(3);
467 v->arch.guest_context.debugreg[6] = read_debugreg(6);
468 /* DR7 must be saved as it is used by vmx_restore_dr(). */
469 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
470 }
472 static void __restore_debug_registers(struct vcpu *v)
473 {
474 if ( v->arch.hvm_vcpu.flag_dr_dirty )
475 return;
477 v->arch.hvm_vcpu.flag_dr_dirty = 1;
479 write_debugreg(0, v->arch.guest_context.debugreg[0]);
480 write_debugreg(1, v->arch.guest_context.debugreg[1]);
481 write_debugreg(2, v->arch.guest_context.debugreg[2]);
482 write_debugreg(3, v->arch.guest_context.debugreg[3]);
483 write_debugreg(6, v->arch.guest_context.debugreg[6]);
484 /* DR7 is loaded from the VMCS. */
485 }
487 /*
488 * DR7 is saved and restored on every vmexit. Other debug registers only
489 * need to be restored if their value is going to affect execution -- i.e.,
490 * if one of the breakpoints is enabled. So mask out all bits that don't
491 * enable some breakpoint functionality.
492 */
493 static void vmx_restore_dr(struct vcpu *v)
494 {
495 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
496 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
497 __restore_debug_registers(v);
498 }
500 static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
501 {
502 uint32_t ev;
504 vmx_vmcs_enter(v);
506 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
507 c->cr2 = v->arch.hvm_vcpu.guest_cr[2];
508 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
509 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
511 c->msr_efer = v->arch.hvm_vcpu.guest_efer;
513 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
514 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
515 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
517 c->pending_event = 0;
518 c->error_code = 0;
519 if ( ((ev = __vmread(VM_ENTRY_INTR_INFO)) & INTR_INFO_VALID_MASK) &&
520 hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
521 {
522 c->pending_event = ev;
523 c->error_code = __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE);
524 }
526 vmx_vmcs_exit(v);
527 }
529 static int vmx_restore_cr0_cr3(
530 struct vcpu *v, unsigned long cr0, unsigned long cr3)
531 {
532 unsigned long mfn = 0;
533 p2m_type_t p2mt;
535 if ( paging_mode_shadow(v->domain) )
536 {
537 if ( cr0 & X86_CR0_PG )
538 {
539 mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
540 if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
541 {
542 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3);
543 return -EINVAL;
544 }
545 }
547 if ( hvm_paging_enabled(v) )
548 put_page(pagetable_get_page(v->arch.guest_table));
550 v->arch.guest_table = pagetable_from_pfn(mfn);
551 }
553 v->arch.hvm_vcpu.guest_cr[0] = cr0 | X86_CR0_ET;
554 v->arch.hvm_vcpu.guest_cr[3] = cr3;
556 return 0;
557 }
559 static int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
560 {
561 int rc;
563 if ( c->pending_valid &&
564 ((c->pending_type == 1) || (c->pending_type > 6) ||
565 (c->pending_reserved != 0)) )
566 {
567 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32".\n",
568 c->pending_event);
569 return -EINVAL;
570 }
572 rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3);
573 if ( rc )
574 return rc;
576 vmx_vmcs_enter(v);
578 v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
579 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
580 vmx_update_guest_cr(v, 0);
581 vmx_update_guest_cr(v, 2);
582 vmx_update_guest_cr(v, 4);
584 v->arch.hvm_vcpu.guest_efer = c->msr_efer;
585 vmx_update_guest_efer(v);
587 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
588 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
589 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
591 __vmwrite(GUEST_DR7, c->dr7);
593 vmx_vmcs_exit(v);
595 paging_update_paging_modes(v);
597 if ( c->pending_valid )
598 {
599 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
600 c->pending_event, c->error_code);
602 if ( hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
603 {
604 vmx_vmcs_enter(v);
605 __vmwrite(VM_ENTRY_INTR_INFO, c->pending_event);
606 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
607 vmx_vmcs_exit(v);
608 }
609 }
611 return 0;
612 }
614 static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
615 {
616 #ifdef __x86_64__
617 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
618 unsigned long guest_flags = guest_state->flags;
620 data->shadow_gs = v->arch.hvm_vmx.shadow_gs;
621 data->msr_cstar = v->arch.hvm_vmx.cstar;
623 /* save msrs */
624 data->msr_flags = guest_flags;
625 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
626 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
627 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
628 if ( cpu_has_rdtscp )
629 data->msr_tsc_aux = guest_state->msrs[VMX_INDEX_MSR_TSC_AUX];
630 #endif
632 data->tsc = hvm_get_guest_tsc(v);
633 }
635 static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
636 {
637 #ifdef __x86_64__
638 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
640 /* restore msrs */
641 guest_state->flags = data->msr_flags & 7;
642 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
643 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
644 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
646 v->arch.hvm_vmx.cstar = data->msr_cstar;
647 v->arch.hvm_vmx.shadow_gs = data->shadow_gs;
648 if ( cpu_has_rdtscp )
649 guest_state->msrs[VMX_INDEX_MSR_TSC_AUX] = data->msr_tsc_aux;
650 #endif
652 hvm_set_guest_tsc(v, data->tsc);
653 }
656 static void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
657 {
658 vmx_save_cpu_state(v, ctxt);
659 vmx_vmcs_save(v, ctxt);
660 }
662 static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
663 {
664 vmx_load_cpu_state(v, ctxt);
666 if ( vmx_vmcs_restore(v, ctxt) )
667 {
668 gdprintk(XENLOG_ERR, "vmx_vmcs restore failed!\n");
669 domain_crash(v->domain);
670 return -EINVAL;
671 }
673 return 0;
674 }
676 static void vmx_fpu_enter(struct vcpu *v)
677 {
678 setup_fpu(v);
679 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
680 v->arch.hvm_vmx.host_cr0 &= ~X86_CR0_TS;
681 __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
682 }
684 static void vmx_fpu_leave(struct vcpu *v)
685 {
686 ASSERT(!v->fpu_dirtied);
687 ASSERT(read_cr0() & X86_CR0_TS);
689 if ( !(v->arch.hvm_vmx.host_cr0 & X86_CR0_TS) )
690 {
691 v->arch.hvm_vmx.host_cr0 |= X86_CR0_TS;
692 __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
693 }
695 /*
696 * If the guest does not have TS enabled then we must cause and handle an
697 * exception on first use of the FPU. If the guest *does* have TS enabled
698 * then this is not necessary: no FPU activity can occur until the guest
699 * clears CR0.TS, and we will initialise the FPU when that happens.
700 */
701 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
702 {
703 v->arch.hvm_vcpu.hw_cr[0] |= X86_CR0_TS;
704 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
705 __vm_set_bit(EXCEPTION_BITMAP, TRAP_no_device);
706 }
707 }
709 static void vmx_ctxt_switch_from(struct vcpu *v)
710 {
711 vmx_fpu_leave(v);
712 vmx_save_guest_msrs(v);
713 vmx_restore_host_msrs();
714 vmx_save_dr(v);
715 vpmu_save(v);
716 }
718 static void vmx_ctxt_switch_to(struct vcpu *v)
719 {
720 struct domain *d = v->domain;
721 unsigned long old_cr4 = read_cr4(), new_cr4 = mmu_cr4_features;
723 /* HOST_CR4 in the VMCS is always mmu_cr4_features plus
724 * CR4_OSXSAVE (if supported). Sync CR4 now. */
725 if ( cpu_has_xsave )
726 new_cr4 |= X86_CR4_OSXSAVE;
727 if ( old_cr4 != new_cr4 )
728 write_cr4(new_cr4);
730 if ( d->arch.hvm_domain.hap_enabled )
731 {
732 unsigned int cpu = smp_processor_id();
733 /* Test-and-test-and-set this CPU in the EPT-is-synced mask. */
734 if ( !cpu_isset(cpu, d->arch.hvm_domain.vmx.ept_synced) &&
735 !cpu_test_and_set(cpu, d->arch.hvm_domain.vmx.ept_synced) )
736 __invept(1, d->arch.hvm_domain.vmx.ept_control.eptp, 0);
737 }
739 vmx_restore_guest_msrs(v);
740 vmx_restore_dr(v);
741 vpmu_load(v);
742 }
745 /* SDM volume 3b section 22.3.1.2: we can only enter virtual 8086 mode
746 * if all of CS, SS, DS, ES, FS and GS are 16bit ring-3 data segments.
747 * The guest thinks it's got ring-0 segments, so we need to fudge
748 * things. We store the ring-3 version in the VMCS to avoid lots of
749 * shuffling on vmenter and vmexit, and translate in these accessors. */
751 #define rm_cs_attr (((union segment_attributes) { \
752 .fields = { .type = 0xb, .s = 1, .dpl = 0, .p = 1, .avl = 0, \
753 .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
754 #define rm_ds_attr (((union segment_attributes) { \
755 .fields = { .type = 0x3, .s = 1, .dpl = 0, .p = 1, .avl = 0, \
756 .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
757 #define vm86_ds_attr (((union segment_attributes) { \
758 .fields = { .type = 0x3, .s = 1, .dpl = 3, .p = 1, .avl = 0, \
759 .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
760 #define vm86_tr_attr (((union segment_attributes) { \
761 .fields = { .type = 0xb, .s = 0, .dpl = 0, .p = 1, .avl = 0, \
762 .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
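/*
 * For reference, the packed attribute words above evaluate to the familiar
 * AR bytes: rm_cs_attr = 0x9b, rm_ds_attr = 0x93, vm86_ds_attr = 0xf3 and
 * vm86_tr_attr = 0x8b.
 */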
764 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
765 struct segment_register *reg)
766 {
767 uint32_t attr = 0;
769 vmx_vmcs_enter(v);
771 switch ( seg )
772 {
773 case x86_seg_cs:
774 reg->sel = __vmread(GUEST_CS_SELECTOR);
775 reg->limit = __vmread(GUEST_CS_LIMIT);
776 reg->base = __vmread(GUEST_CS_BASE);
777 attr = __vmread(GUEST_CS_AR_BYTES);
778 break;
779 case x86_seg_ds:
780 reg->sel = __vmread(GUEST_DS_SELECTOR);
781 reg->limit = __vmread(GUEST_DS_LIMIT);
782 reg->base = __vmread(GUEST_DS_BASE);
783 attr = __vmread(GUEST_DS_AR_BYTES);
784 break;
785 case x86_seg_es:
786 reg->sel = __vmread(GUEST_ES_SELECTOR);
787 reg->limit = __vmread(GUEST_ES_LIMIT);
788 reg->base = __vmread(GUEST_ES_BASE);
789 attr = __vmread(GUEST_ES_AR_BYTES);
790 break;
791 case x86_seg_fs:
792 reg->sel = __vmread(GUEST_FS_SELECTOR);
793 reg->limit = __vmread(GUEST_FS_LIMIT);
794 reg->base = __vmread(GUEST_FS_BASE);
795 attr = __vmread(GUEST_FS_AR_BYTES);
796 break;
797 case x86_seg_gs:
798 reg->sel = __vmread(GUEST_GS_SELECTOR);
799 reg->limit = __vmread(GUEST_GS_LIMIT);
800 reg->base = __vmread(GUEST_GS_BASE);
801 attr = __vmread(GUEST_GS_AR_BYTES);
802 break;
803 case x86_seg_ss:
804 reg->sel = __vmread(GUEST_SS_SELECTOR);
805 reg->limit = __vmread(GUEST_SS_LIMIT);
806 reg->base = __vmread(GUEST_SS_BASE);
807 attr = __vmread(GUEST_SS_AR_BYTES);
808 break;
809 case x86_seg_tr:
810 reg->sel = __vmread(GUEST_TR_SELECTOR);
811 reg->limit = __vmread(GUEST_TR_LIMIT);
812 reg->base = __vmread(GUEST_TR_BASE);
813 attr = __vmread(GUEST_TR_AR_BYTES);
814 break;
815 case x86_seg_gdtr:
816 reg->limit = __vmread(GUEST_GDTR_LIMIT);
817 reg->base = __vmread(GUEST_GDTR_BASE);
818 break;
819 case x86_seg_idtr:
820 reg->limit = __vmread(GUEST_IDTR_LIMIT);
821 reg->base = __vmread(GUEST_IDTR_BASE);
822 break;
823 case x86_seg_ldtr:
824 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
825 reg->limit = __vmread(GUEST_LDTR_LIMIT);
826 reg->base = __vmread(GUEST_LDTR_BASE);
827 attr = __vmread(GUEST_LDTR_AR_BYTES);
828 break;
829 default:
830 BUG();
831 }
833 vmx_vmcs_exit(v);
835 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
836 /* Unusable flag is folded into Present flag. */
837 if ( attr & (1u<<16) )
838 reg->attr.fields.p = 0;
840 /* Adjust for virtual 8086 mode */
841 if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr
842 && !(v->arch.hvm_vmx.vm86_segment_mask & (1u << seg)) )
843 {
844 struct segment_register *sreg = &v->arch.hvm_vmx.vm86_saved_seg[seg];
845 if ( seg == x86_seg_tr )
846 *reg = *sreg;
847 else if ( reg->base != sreg->base || seg == x86_seg_ss )
848 {
849 /* If the guest has reloaded the segment, remember the new version.
850 * We can't tell if the guest reloaded the segment with another
851 * one that has the same base. By default we assume it hasn't,
852 * since we don't want to lose big-real-mode segment attributes,
853 * but for SS we assume it has: the Ubuntu graphical bootloader
854 * does this and gets badly confused if we leave the old SS in
855 * place. */
856 reg->attr.bytes = (seg == x86_seg_cs ? rm_cs_attr : rm_ds_attr);
857 *sreg = *reg;
858 }
859 else
860 {
861 /* Always give realmode guests a selector that matches the base
862 * but keep the attr and limit from before */
863 *reg = *sreg;
864 reg->sel = reg->base >> 4;
865 }
866 }
867 }
869 static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
870 struct segment_register *reg)
871 {
872 uint32_t attr, sel, limit;
873 uint64_t base;
875 sel = reg->sel;
876 attr = reg->attr.bytes;
877 limit = reg->limit;
878 base = reg->base;
880 /* Adjust CS/SS/DS/ES/FS/GS/TR for virtual 8086 mode */
881 if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr )
882 {
883 /* Remember the proper contents */
884 v->arch.hvm_vmx.vm86_saved_seg[seg] = *reg;
886 if ( seg == x86_seg_tr )
887 {
888 if ( v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS] )
889 {
890 sel = 0;
891 attr = vm86_tr_attr;
892 limit = 0xff;
893 base = v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS];
894 v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
895 }
896 else
897 v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
898 }
899 else
900 {
901 /* Try to fake it out as a 16bit data segment. This could
902 * cause confusion for the guest if it reads the selector,
903 * but otherwise we have to emulate if *any* segment hasn't
904 * been reloaded. */
905 if ( base < 0x100000 && !(base & 0xf) && limit >= 0xffff
906 && reg->attr.fields.p )
907 {
908 sel = base >> 4;
909 attr = vm86_ds_attr;
910 limit = 0xffff;
911 v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
912 }
913 else
914 v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
915 }
916 }
918 attr = ((attr & 0xf00) << 4) | (attr & 0xff);
920 /* Not-present must mean unusable. */
921 if ( !reg->attr.fields.p )
922 attr |= (1u << 16);
924 /* VMX has a strict consistency requirement for flag G. */
925 attr |= !!(limit >> 20) << 15;
927 vmx_vmcs_enter(v);
929 switch ( seg )
930 {
931 case x86_seg_cs:
932 __vmwrite(GUEST_CS_SELECTOR, sel);
933 __vmwrite(GUEST_CS_LIMIT, limit);
934 __vmwrite(GUEST_CS_BASE, base);
935 __vmwrite(GUEST_CS_AR_BYTES, attr);
936 break;
937 case x86_seg_ds:
938 __vmwrite(GUEST_DS_SELECTOR, sel);
939 __vmwrite(GUEST_DS_LIMIT, limit);
940 __vmwrite(GUEST_DS_BASE, base);
941 __vmwrite(GUEST_DS_AR_BYTES, attr);
942 break;
943 case x86_seg_es:
944 __vmwrite(GUEST_ES_SELECTOR, sel);
945 __vmwrite(GUEST_ES_LIMIT, limit);
946 __vmwrite(GUEST_ES_BASE, base);
947 __vmwrite(GUEST_ES_AR_BYTES, attr);
948 break;
949 case x86_seg_fs:
950 __vmwrite(GUEST_FS_SELECTOR, sel);
951 __vmwrite(GUEST_FS_LIMIT, limit);
952 __vmwrite(GUEST_FS_BASE, base);
953 __vmwrite(GUEST_FS_AR_BYTES, attr);
954 break;
955 case x86_seg_gs:
956 __vmwrite(GUEST_GS_SELECTOR, sel);
957 __vmwrite(GUEST_GS_LIMIT, limit);
958 __vmwrite(GUEST_GS_BASE, base);
959 __vmwrite(GUEST_GS_AR_BYTES, attr);
960 break;
961 case x86_seg_ss:
962 __vmwrite(GUEST_SS_SELECTOR, sel);
963 __vmwrite(GUEST_SS_LIMIT, limit);
964 __vmwrite(GUEST_SS_BASE, base);
965 __vmwrite(GUEST_SS_AR_BYTES, attr);
966 break;
967 case x86_seg_tr:
968 __vmwrite(GUEST_TR_SELECTOR, sel);
969 __vmwrite(GUEST_TR_LIMIT, limit);
970 __vmwrite(GUEST_TR_BASE, base);
971 /* VMX checks that the busy flag (bit 1) is set. */
972 __vmwrite(GUEST_TR_AR_BYTES, attr | 2);
973 break;
974 case x86_seg_gdtr:
975 __vmwrite(GUEST_GDTR_LIMIT, limit);
976 __vmwrite(GUEST_GDTR_BASE, base);
977 break;
978 case x86_seg_idtr:
979 __vmwrite(GUEST_IDTR_LIMIT, limit);
980 __vmwrite(GUEST_IDTR_BASE, base);
981 break;
982 case x86_seg_ldtr:
983 __vmwrite(GUEST_LDTR_SELECTOR, sel);
984 __vmwrite(GUEST_LDTR_LIMIT, limit);
985 __vmwrite(GUEST_LDTR_BASE, base);
986 __vmwrite(GUEST_LDTR_AR_BYTES, attr);
987 break;
988 default:
989 BUG();
990 }
992 vmx_vmcs_exit(v);
993 }
995 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
996 {
997 vmx_vmcs_enter(v);
998 __vmwrite(TSC_OFFSET, offset);
999 #if defined (__i386__)
1000 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
1001 #endif
1002 vmx_vmcs_exit(v);
1005 static void vmx_set_rdtsc_exiting(struct vcpu *v, bool_t enable)
1007 vmx_vmcs_enter(v);
1008 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_RDTSC_EXITING;
1009 if ( enable )
1010 v->arch.hvm_vmx.exec_control |= CPU_BASED_RDTSC_EXITING;
1011 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
1012 vmx_vmcs_exit(v);
1015 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
1017 char *p;
1018 int i;
1020 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
1022 p = (char *)(hypercall_page + (i * 32));
1023 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
1024 *(u32 *)(p + 1) = i;
1025 *(u8 *)(p + 5) = 0x0f; /* vmcall */
1026 *(u8 *)(p + 6) = 0x01;
1027 *(u8 *)(p + 7) = 0xc1;
1028 *(u8 *)(p + 8) = 0xc3; /* ret */
1031 /* Don't support HYPERVISOR_iret at the moment */
1032 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
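/*
 * The resulting page gives each hypercall a 32-byte stub; a guest invokes
 * hypercall NR roughly as "call hypercall_page + NR * 32", which executes
 * "mov $NR, %eax; vmcall; ret" as assembled from the bytes written above.
 */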
1035 static unsigned int vmx_get_interrupt_shadow(struct vcpu *v)
1037 return __vmread(GUEST_INTERRUPTIBILITY_INFO);
1040 static void vmx_set_interrupt_shadow(struct vcpu *v, unsigned int intr_shadow)
1042 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow);
1045 static void vmx_load_pdptrs(struct vcpu *v)
1047 unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3], mfn;
1048 uint64_t *guest_pdptrs;
1049 p2m_type_t p2mt;
1050 char *p;
1052 /* EPT needs to load PDPTRS into VMCS for PAE. */
1053 if ( !hvm_pae_enabled(v) || (v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
1054 return;
1056 if ( cr3 & 0x1fUL )
1057 goto crash;
1059 mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
1060 if ( !p2m_is_ram(p2mt) )
1061 goto crash;
1063 p = map_domain_page(mfn);
1065 guest_pdptrs = (uint64_t *)(p + (cr3 & ~PAGE_MASK));
1067 /*
1068 * We do not check the PDPTRs for validity. The CPU will do this during
1069 * vm entry, and we can handle the failure there and crash the guest.
1070 * The only thing we could do better here is #GP instead.
1071 */
1073 vmx_vmcs_enter(v);
1075 __vmwrite(GUEST_PDPTR0, guest_pdptrs[0]);
1076 __vmwrite(GUEST_PDPTR1, guest_pdptrs[1]);
1077 __vmwrite(GUEST_PDPTR2, guest_pdptrs[2]);
1078 __vmwrite(GUEST_PDPTR3, guest_pdptrs[3]);
1079 #ifdef __i386__
1080 __vmwrite(GUEST_PDPTR0_HIGH, guest_pdptrs[0] >> 32);
1081 __vmwrite(GUEST_PDPTR1_HIGH, guest_pdptrs[1] >> 32);
1082 __vmwrite(GUEST_PDPTR2_HIGH, guest_pdptrs[2] >> 32);
1083 __vmwrite(GUEST_PDPTR3_HIGH, guest_pdptrs[3] >> 32);
1084 #endif
1086 vmx_vmcs_exit(v);
1088 unmap_domain_page(p);
1089 return;
1091 crash:
1092 domain_crash(v->domain);
1095 static void vmx_update_host_cr3(struct vcpu *v)
1097 vmx_vmcs_enter(v);
1098 __vmwrite(HOST_CR3, v->arch.cr3);
1099 vmx_vmcs_exit(v);
1102 void vmx_update_debug_state(struct vcpu *v)
1104 unsigned long intercepts, mask;
1106 ASSERT(v == current);
1108 mask = 1u << TRAP_int3;
1109 if ( !cpu_has_monitor_trap_flag )
1110 mask |= 1u << TRAP_debug;
1112 intercepts = __vmread(EXCEPTION_BITMAP);
1113 if ( v->arch.hvm_vcpu.debug_state_latch )
1114 intercepts |= mask;
1115 else
1116 intercepts &= ~mask;
1117 __vmwrite(EXCEPTION_BITMAP, intercepts);
1120 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
1122 vmx_vmcs_enter(v);
1124 switch ( cr )
1126 case 0: {
1127 int realmode;
1128 unsigned long hw_cr0_mask = X86_CR0_NE;
1130 if ( !vmx_unrestricted_guest(v) )
1131 hw_cr0_mask |= X86_CR0_PG | X86_CR0_PE;
1133 if ( paging_mode_shadow(v->domain) )
1134 hw_cr0_mask |= X86_CR0_WP;
1136 if ( paging_mode_hap(v->domain) )
1138 /* We manage GUEST_CR3 when guest CR0.PE is zero. */
1139 uint32_t cr3_ctls = (CPU_BASED_CR3_LOAD_EXITING |
1140 CPU_BASED_CR3_STORE_EXITING);
1141 v->arch.hvm_vmx.exec_control &= ~cr3_ctls;
1142 if ( !hvm_paging_enabled(v) )
1143 v->arch.hvm_vmx.exec_control |= cr3_ctls;
1144 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
1146 /* Changing CR0.PE can change some bits in real CR4. */
1147 vmx_update_guest_cr(v, 4);
1150 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1152 if ( v != current )
1153 hw_cr0_mask |= X86_CR0_TS;
1154 else if ( v->arch.hvm_vcpu.hw_cr[0] & X86_CR0_TS )
1155 vmx_fpu_enter(v);
1158 realmode = !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE);
1160 if ( (!vmx_unrestricted_guest(v)) &&
1161 (realmode != v->arch.hvm_vmx.vmx_realmode) )
1163 enum x86_segment s;
1164 struct segment_register reg[x86_seg_tr + 1];
1166 /* Entering or leaving real mode: adjust the segment registers.
1167 * Need to read them all either way, as realmode reads can update
1168 * the saved values we'll use when returning to prot mode. */
1169 for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
1170 vmx_get_segment_register(v, s, &reg[s]);
1171 v->arch.hvm_vmx.vmx_realmode = realmode;
1173 if ( realmode )
1175 for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
1176 vmx_set_segment_register(v, s, &reg[s]);
1177 v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
1178 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1179 __vmwrite(EXCEPTION_BITMAP, 0xffffffff);
1181 else
1183 for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
1184 if ( !(v->arch.hvm_vmx.vm86_segment_mask & (1<<s)) )
1185 vmx_set_segment_register(
1186 v, s, &v->arch.hvm_vmx.vm86_saved_seg[s]);
1187 v->arch.hvm_vcpu.hw_cr[4] =
1188 ((v->arch.hvm_vcpu.hw_cr[4] & ~X86_CR4_VME)
1189 |(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VME));
1190 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1191 __vmwrite(EXCEPTION_BITMAP,
1192 HVM_TRAP_MASK
1193 | (paging_mode_hap(v->domain) ?
1194 0 : (1U << TRAP_page_fault))
1195 | (1U << TRAP_no_device));
1196 vmx_update_debug_state(v);
1200 v->arch.hvm_vcpu.hw_cr[0] =
1201 v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask;
1202 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1203 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
1204 break;
1206 case 2:
1207 /* CR2 is updated in exit stub. */
1208 break;
1209 case 3:
1210 if ( paging_mode_hap(v->domain) )
1212 if ( !hvm_paging_enabled(v) )
1213 v->arch.hvm_vcpu.hw_cr[3] =
1214 v->domain->arch.hvm_domain.params[HVM_PARAM_IDENT_PT];
1215 vmx_load_pdptrs(v);
1218 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]);
1219 hvm_asid_flush_vcpu(v);
1220 break;
1221 case 4:
1222 v->arch.hvm_vcpu.hw_cr[4] = HVM_CR4_HOST_MASK;
1223 if ( paging_mode_hap(v->domain) )
1224 v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
1225 v->arch.hvm_vcpu.hw_cr[4] |= v->arch.hvm_vcpu.guest_cr[4];
1226 if ( v->arch.hvm_vmx.vmx_realmode )
1227 v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
1228 if ( paging_mode_hap(v->domain) && !hvm_paging_enabled(v) )
1230 v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE;
1231 v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
1233 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1234 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]);
1235 break;
1236 default:
1237 BUG();
1240 vmx_vmcs_exit(v);
1243 static void vmx_update_guest_efer(struct vcpu *v)
1245 #ifdef __x86_64__
1246 unsigned long vm_entry_value;
1248 vmx_vmcs_enter(v);
1250 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1251 if ( v->arch.hvm_vcpu.guest_efer & EFER_LMA )
1252 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1253 else
1254 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1255 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1257 vmx_vmcs_exit(v);
1258 #endif
1260 if ( v == current )
1261 write_efer((read_efer() & ~EFER_SCE) |
1262 (v->arch.hvm_vcpu.guest_efer & EFER_SCE));
1265 static void __ept_sync_domain(void *info)
1267 struct domain *d = info;
1268 __invept(1, d->arch.hvm_domain.vmx.ept_control.eptp, 0);
1271 void ept_sync_domain(struct domain *d)
1273 /* Only if using EPT and this domain has some VCPUs to dirty. */
1274 if ( !d->arch.hvm_domain.hap_enabled || !d->vcpu || !d->vcpu[0] )
1275 return;
1277 ASSERT(local_irq_is_enabled());
1278 ASSERT(p2m_locked_by_me(d->arch.p2m));
1280 /*
1281 * Flush active cpus synchronously. Flush others the next time this domain
1282 * is scheduled onto them. We accept the race of other CPUs adding to
1283 * the ept_synced mask before on_selected_cpus() reads it, resulting in
1284 * unnecessary extra flushes, to avoid allocating a cpumask_t on the stack.
1285 */
1286 d->arch.hvm_domain.vmx.ept_synced = d->domain_dirty_cpumask;
1287 on_selected_cpus(&d->arch.hvm_domain.vmx.ept_synced,
1288 __ept_sync_domain, d, 1);
1291 static void __vmx_inject_exception(int trap, int type, int error_code)
1293 unsigned long intr_fields;
1294 struct vcpu *curr = current;
1296 /*
1297 * NB. Callers do not need to worry about clearing STI/MOV-SS blocking:
1298 * "If the VM entry is injecting, there is no blocking by STI or by
1299 * MOV SS following the VM entry, regardless of the contents of the
1300 * interruptibility-state field [in the guest-state area before the
1301 * VM entry]", PRM Vol. 3, 22.6.1 (Interruptibility State).
1302 */
1304 intr_fields = (INTR_INFO_VALID_MASK | (type<<8) | trap);
1305 if ( error_code != HVM_DELIVER_NO_ERROR_CODE ) {
1306 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1307 intr_fields |= INTR_INFO_DELIVER_CODE_MASK;
1310 __vmwrite(VM_ENTRY_INTR_INFO, intr_fields);
1312 /* Can't inject exceptions in virtual 8086 mode because they would
1313 * use the protected-mode IDT. Emulate at the next vmenter instead. */
1314 if ( curr->arch.hvm_vmx.vmx_realmode )
1315 curr->arch.hvm_vmx.vmx_emulate = 1;
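/*
 * Layout of the intr_fields word written above, matching the VMCS VM-entry
 * interruption-information field: bits 7:0 vector, bits 10:8 event type,
 * bit 11 deliver-error-code, bit 31 valid.
 */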
1318 void vmx_inject_hw_exception(int trap, int error_code)
1320 unsigned long intr_info = __vmread(VM_ENTRY_INTR_INFO);
1321 struct vcpu *curr = current;
1323 switch ( trap )
1325 case TRAP_debug:
1326 if ( guest_cpu_user_regs()->eflags & X86_EFLAGS_TF )
1328 __restore_debug_registers(curr);
1329 write_debugreg(6, read_debugreg(6) | 0x4000);
1331 if ( cpu_has_monitor_trap_flag )
1332 break;
1333 case TRAP_int3:
1334 if ( curr->domain->debugger_attached )
1336 /* Debug/Int3: Trap to debugger. */
1337 domain_pause_for_debugger();
1338 return;
1342 if ( unlikely(intr_info & INTR_INFO_VALID_MASK) &&
1343 (((intr_info >> 8) & 7) == X86_EVENTTYPE_HW_EXCEPTION) )
1345 trap = hvm_combine_hw_exceptions((uint8_t)intr_info, trap);
1346 if ( trap == TRAP_double_fault )
1347 error_code = 0;
1350 __vmx_inject_exception(trap, X86_EVENTTYPE_HW_EXCEPTION, error_code);
1352 if ( trap == TRAP_page_fault )
1353 HVMTRACE_LONG_2D(PF_INJECT, error_code,
1354 TRC_PAR_LONG(current->arch.hvm_vcpu.guest_cr[2]));
1355 else
1356 HVMTRACE_2D(INJ_EXC, trap, error_code);
1359 void vmx_inject_extint(int trap)
1361 __vmx_inject_exception(trap, X86_EVENTTYPE_EXT_INTR,
1362 HVM_DELIVER_NO_ERROR_CODE);
1365 void vmx_inject_nmi(void)
1367 __vmx_inject_exception(2, X86_EVENTTYPE_NMI,
1368 HVM_DELIVER_NO_ERROR_CODE);
1371 static void vmx_inject_exception(
1372 unsigned int trapnr, int errcode, unsigned long cr2)
1374 if ( trapnr == TRAP_page_fault )
1375 current->arch.hvm_vcpu.guest_cr[2] = cr2;
1377 vmx_inject_hw_exception(trapnr, errcode);
1380 static int vmx_event_pending(struct vcpu *v)
1382 ASSERT(v == current);
1383 return (__vmread(VM_ENTRY_INTR_INFO) & INTR_INFO_VALID_MASK);
1386 static int vmx_do_pmu_interrupt(struct cpu_user_regs *regs)
1388 return vpmu_do_interrupt(regs);
1391 static void vmx_set_uc_mode(struct vcpu *v)
1393 if ( paging_mode_hap(v->domain) )
1394 ept_change_entry_emt_with_range(
1395 v->domain, 0, v->domain->arch.p2m->max_mapped_pfn);
1396 hvm_asid_flush_vcpu(v);
1399 static void vmx_set_info_guest(struct vcpu *v)
1401 unsigned long intr_shadow;
1403 vmx_vmcs_enter(v);
1405 __vmwrite(GUEST_DR7, v->arch.guest_context.debugreg[7]);
1407 /*
1408 * If the interruptibility-state field indicates blocking by STI,
1409 * setting the TF flag in the EFLAGS may cause VM entry to fail
1410 * and crash the guest. See SDM 3B 22.3.1.5.
1411 * Resetting the VMX_INTR_SHADOW_STI flag looks hackish, but
1412 * setting GUEST_PENDING_DBG_EXCEPTIONS.BS here would incur an
1413 * immediate vmexit and hence make no progress.
1414 */
1415 intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1416 if ( v->domain->debugger_attached &&
1417 (v->arch.guest_context.user_regs.eflags & X86_EFLAGS_TF) &&
1418 (intr_shadow & VMX_INTR_SHADOW_STI) )
1420 intr_shadow &= ~VMX_INTR_SHADOW_STI;
1421 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow);
1424 vmx_vmcs_exit(v);
1427 static struct hvm_function_table __read_mostly vmx_function_table = {
1428 .name = "VMX",
1429 .domain_initialise = vmx_domain_initialise,
1430 .domain_destroy = vmx_domain_destroy,
1431 .vcpu_initialise = vmx_vcpu_initialise,
1432 .vcpu_destroy = vmx_vcpu_destroy,
1433 .save_cpu_ctxt = vmx_save_vmcs_ctxt,
1434 .load_cpu_ctxt = vmx_load_vmcs_ctxt,
1435 .get_interrupt_shadow = vmx_get_interrupt_shadow,
1436 .set_interrupt_shadow = vmx_set_interrupt_shadow,
1437 .guest_x86_mode = vmx_guest_x86_mode,
1438 .get_segment_register = vmx_get_segment_register,
1439 .set_segment_register = vmx_set_segment_register,
1440 .update_host_cr3 = vmx_update_host_cr3,
1441 .update_guest_cr = vmx_update_guest_cr,
1442 .update_guest_efer = vmx_update_guest_efer,
1443 .set_tsc_offset = vmx_set_tsc_offset,
1444 .inject_exception = vmx_inject_exception,
1445 .init_hypercall_page = vmx_init_hypercall_page,
1446 .event_pending = vmx_event_pending,
1447 .do_pmu_interrupt = vmx_do_pmu_interrupt,
1448 .cpu_up = vmx_cpu_up,
1449 .cpu_down = vmx_cpu_down,
1450 .cpuid_intercept = vmx_cpuid_intercept,
1451 .wbinvd_intercept = vmx_wbinvd_intercept,
1452 .fpu_dirty_intercept = vmx_fpu_dirty_intercept,
1453 .msr_read_intercept = vmx_msr_read_intercept,
1454 .msr_write_intercept = vmx_msr_write_intercept,
1455 .invlpg_intercept = vmx_invlpg_intercept,
1456 .set_uc_mode = vmx_set_uc_mode,
1457 .set_info_guest = vmx_set_info_guest,
1458 .set_rdtsc_exiting = vmx_set_rdtsc_exiting
1459 };
1461 void start_vmx(void)
1463 static bool_t bootstrapped;
1465 vmx_save_host_msrs();
1467 if ( test_and_set_bool(bootstrapped) )
1469 if ( hvm_enabled && !vmx_cpu_up() )
1471 printk("VMX: FATAL: failed to initialise CPU%d!\n",
1472 smp_processor_id());
1473 BUG();
1475 return;
1478 /* Xen does not fill x86_capability words except 0. */
1479 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1481 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1482 return;
1484 set_in_cr4(X86_CR4_VMXE);
1486 if ( !vmx_cpu_up() )
1488 printk("VMX: failed to initialise.\n");
1489 return;
1492 if ( cpu_has_vmx_ept )
1493 vmx_function_table.hap_supported = 1;
1495 setup_vmcs_dump();
1497 hvm_enable(&vmx_function_table);
1500 /*
1501 * Not all cases receive a valid value in the VM-exit instruction length field.
1502 * Callers must know what they're doing!
1503 */
1504 static int __get_instruction_length(void)
1506 int len;
1507 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1508 BUG_ON((len < 1) || (len > 15));
1509 return len;
1512 static void __update_guest_eip(unsigned long inst_len)
1514 struct cpu_user_regs *regs = guest_cpu_user_regs();
1515 unsigned long x;
1517 regs->eip += inst_len;
1518 regs->eflags &= ~X86_EFLAGS_RF;
1520 x = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1521 if ( x & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) )
1523 x &= ~(VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS);
1524 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, x);
1527 if ( regs->eflags & X86_EFLAGS_TF )
1528 vmx_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);
1531 static void vmx_fpu_dirty_intercept(void)
1533 struct vcpu *curr = current;
1535 vmx_fpu_enter(curr);
1537 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1538 if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1540 curr->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS;
1541 __vmwrite(GUEST_CR0, curr->arch.hvm_vcpu.hw_cr[0]);
1545 #define bitmaskof(idx) (1U << ((idx) & 31))
1546 static void vmx_cpuid_intercept(
1547 unsigned int *eax, unsigned int *ebx,
1548 unsigned int *ecx, unsigned int *edx)
1550 unsigned int input = *eax;
1551 struct segment_register cs;
1552 struct vcpu *v = current;
1554 hvm_cpuid(input, eax, ebx, ecx, edx);
1556 switch ( input )
1558 case 0x80000001:
1559 /* SYSCALL is visible iff running in long mode. */
1560 hvm_get_segment_register(v, x86_seg_cs, &cs);
1561 if ( cs.attr.fields.l )
1562 *edx |= bitmaskof(X86_FEATURE_SYSCALL);
1563 else
1564 *edx &= ~(bitmaskof(X86_FEATURE_SYSCALL));
1566 #ifdef __x86_64__
1567 if ( cpu_has_rdtscp )
1568 *edx |= bitmaskof(X86_FEATURE_RDTSCP);
1569 else
1570 *edx &= ~(bitmaskof(X86_FEATURE_RDTSCP));
1571 #endif
1573 break;
1576 HVMTRACE_5D (CPUID, input, *eax, *ebx, *ecx, *edx);
1579 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1581 unsigned int eax, ebx, ecx, edx;
1583 eax = regs->eax;
1584 ebx = regs->ebx;
1585 ecx = regs->ecx;
1586 edx = regs->edx;
1588 vmx_cpuid_intercept(&eax, &ebx, &ecx, &edx);
1590 regs->eax = eax;
1591 regs->ebx = ebx;
1592 regs->ecx = ecx;
1593 regs->edx = edx;
1596 static void vmx_dr_access(unsigned long exit_qualification,
1597 struct cpu_user_regs *regs)
1599 struct vcpu *v = current;
1601 HVMTRACE_0D(DR_WRITE);
1603 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
1604 __restore_debug_registers(v);
1606 /* Allow guest direct access to DR registers */
1607 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1608 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
1611 static void vmx_invlpg_intercept(unsigned long vaddr)
1613 struct vcpu *curr = current;
1614 HVMTRACE_LONG_2D(INVLPG, /*invlpga=*/ 0, TRC_PAR_LONG(vaddr));
1615 if ( paging_invlpg(curr, vaddr) && cpu_has_vmx_vpid )
1616 vpid_sync_vcpu_gva(curr, vaddr);
1619 #define CASE_SET_REG(REG, reg) \
1620 case VMX_CONTROL_REG_ACCESS_GPR_ ## REG: regs->reg = value; break
1621 #define CASE_GET_REG(REG, reg) \
1622 case VMX_CONTROL_REG_ACCESS_GPR_ ## REG: value = regs->reg; break
1624 #define CASE_EXTEND_SET_REG \
1625 CASE_EXTEND_REG(S)
1626 #define CASE_EXTEND_GET_REG \
1627 CASE_EXTEND_REG(G)
1629 #ifdef __i386__
1630 #define CASE_EXTEND_REG(T)
1631 #else
1632 #define CASE_EXTEND_REG(T) \
1633 CASE_ ## T ## ET_REG(R8, r8); \
1634 CASE_ ## T ## ET_REG(R9, r9); \
1635 CASE_ ## T ## ET_REG(R10, r10); \
1636 CASE_ ## T ## ET_REG(R11, r11); \
1637 CASE_ ## T ## ET_REG(R12, r12); \
1638 CASE_ ## T ## ET_REG(R13, r13); \
1639 CASE_ ## T ## ET_REG(R14, r14); \
1640 CASE_ ## T ## ET_REG(R15, r15)
1641 #endif
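/*
 * In effect, CASE_GET_REG(EAX, eax) expands to
 *     case VMX_CONTROL_REG_ACCESS_GPR_EAX: value = regs->eax; break;
 * and CASE_SET_REG similarly stores into regs->reg, so mov_to_cr() and
 * mov_from_cr() below can dispatch on the GPR encoded in the exit
 * qualification.
 */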
1643 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
1645 unsigned long value;
1646 struct vcpu *v = current;
1647 struct vlapic *vlapic = vcpu_vlapic(v);
1649 switch ( gp )
1651 CASE_GET_REG(EAX, eax);
1652 CASE_GET_REG(ECX, ecx);
1653 CASE_GET_REG(EDX, edx);
1654 CASE_GET_REG(EBX, ebx);
1655 CASE_GET_REG(EBP, ebp);
1656 CASE_GET_REG(ESI, esi);
1657 CASE_GET_REG(EDI, edi);
1658 CASE_GET_REG(ESP, esp);
1659 CASE_EXTEND_GET_REG;
1660 default:
1661 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
1662 goto exit_and_crash;
1665 HVMTRACE_LONG_2D(CR_WRITE, cr, TRC_PAR_LONG(value));
1667 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
1669 switch ( cr )
1671 case 0:
1672 return !hvm_set_cr0(value);
1674 case 3:
1675 return !hvm_set_cr3(value);
1677 case 4:
1678 return !hvm_set_cr4(value);
1680 case 8:
1681 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
1682 break;
1684 default:
1685 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
1686 goto exit_and_crash;
1689 return 1;
1691 exit_and_crash:
1692 domain_crash(v->domain);
1693 return 0;
1696 /*
1697 * Read from control registers. CR0 and CR4 are read from the shadow.
1698 */
1699 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1701 unsigned long value = 0;
1702 struct vcpu *v = current;
1703 struct vlapic *vlapic = vcpu_vlapic(v);
1705 switch ( cr )
1707 case 3:
1708 value = (unsigned long)v->arch.hvm_vcpu.guest_cr[3];
1709 break;
1710 case 8:
1711 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
1712 value = (value & 0xF0) >> 4;
1713 break;
1714 default:
1715 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
1716 domain_crash(v->domain);
1717 break;
1720 switch ( gp ) {
1721 CASE_SET_REG(EAX, eax);
1722 CASE_SET_REG(ECX, ecx);
1723 CASE_SET_REG(EDX, edx);
1724 CASE_SET_REG(EBX, ebx);
1725 CASE_SET_REG(EBP, ebp);
1726 CASE_SET_REG(ESI, esi);
1727 CASE_SET_REG(EDI, edi);
1728 CASE_SET_REG(ESP, esp);
1729 CASE_EXTEND_SET_REG;
1730 default:
1731 printk("invalid gp: %d\n", gp);
1732 domain_crash(v->domain);
1733 break;
1736 HVMTRACE_LONG_2D(CR_READ, cr, TRC_PAR_LONG(value));
1738 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
1741 static int vmx_cr_access(unsigned long exit_qualification,
1742 struct cpu_user_regs *regs)
1744 unsigned int gp, cr;
1745 unsigned long value;
1746 struct vcpu *v = current;
1748 switch ( exit_qualification & VMX_CONTROL_REG_ACCESS_TYPE )
1750 case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
1751 gp = exit_qualification & VMX_CONTROL_REG_ACCESS_GPR;
1752 cr = exit_qualification & VMX_CONTROL_REG_ACCESS_NUM;
1753 return mov_to_cr(gp, cr, regs);
1754 case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
1755 gp = exit_qualification & VMX_CONTROL_REG_ACCESS_GPR;
1756 cr = exit_qualification & VMX_CONTROL_REG_ACCESS_NUM;
1757 mov_from_cr(cr, gp, regs);
1758 break;
1759 case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
1760 v->arch.hvm_vcpu.guest_cr[0] &= ~X86_CR0_TS;
1761 vmx_update_guest_cr(v, 0);
1762 HVMTRACE_0D(CLTS);
1763 break;
1764 case VMX_CONTROL_REG_ACCESS_TYPE_LMSW:
1765 value = v->arch.hvm_vcpu.guest_cr[0];
1766 /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */
1767 value = (value & ~0xe) | ((exit_qualification >> 16) & 0xf);
1768 HVMTRACE_LONG_1D(LMSW, value);
1769 return !hvm_set_cr0(value);
1770 default:
1771 BUG();
1774 return 1;
1777 static const struct lbr_info {
1778 u32 base, count;
1779 } p4_lbr[] = {
1780 { MSR_P4_LER_FROM_LIP, 1 },
1781 { MSR_P4_LER_TO_LIP, 1 },
1782 { MSR_P4_LASTBRANCH_TOS, 1 },
1783 { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
1784 { MSR_P4_LASTBRANCH_0_TO_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
1785 { 0, 0 }
1786 }, c2_lbr[] = {
1787 { MSR_IA32_LASTINTFROMIP, 1 },
1788 { MSR_IA32_LASTINTTOIP, 1 },
1789 { MSR_C2_LASTBRANCH_TOS, 1 },
1790 { MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
1791 { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
1792 { 0, 0 }
1793 #ifdef __i386__
1794 }, pm_lbr[] = {
1795 { MSR_IA32_LASTINTFROMIP, 1 },
1796 { MSR_IA32_LASTINTTOIP, 1 },
1797 { MSR_PM_LASTBRANCH_TOS, 1 },
1798 { MSR_PM_LASTBRANCH_0, NUM_MSR_PM_LASTBRANCH },
1799 { 0, 0 }
1800 #endif
1801 };
1803 static const struct lbr_info *last_branch_msr_get(void)
1805 switch ( boot_cpu_data.x86 )
1807 case 6:
1808 switch ( boot_cpu_data.x86_model )
1810 #ifdef __i386__
1811 /* PentiumM */
1812 case 9: case 13:
1813 /* Core Solo/Duo */
1814 case 14:
1815 return pm_lbr;
1816 break;
1817 #endif
1818 /* Core2 Duo */
1819 case 15:
1820 return c2_lbr;
1821 break;
1823 break;
1825 case 15:
1826 switch ( boot_cpu_data.x86_model )
1828 /* Pentium4/Xeon with em64t */
1829 case 3: case 4: case 6:
1830 return p4_lbr;
1831 break;
1833 break;
1836 return NULL;
1839 static int is_last_branch_msr(u32 ecx)
1841 const struct lbr_info *lbr = last_branch_msr_get();
1843 if ( lbr == NULL )
1844 return 0;
1846 for ( ; lbr->count; lbr++ )
1847 if ( (ecx >= lbr->base) && (ecx < (lbr->base + lbr->count)) )
1848 return 1;
1850 return 0;
1853 static int vmx_msr_read_intercept(struct cpu_user_regs *regs)
1855 u64 msr_content = 0;
1856 u32 ecx = regs->ecx, eax, edx;
1858 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);
1860 switch ( ecx )
1862 case MSR_IA32_SYSENTER_CS:
1863 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
1864 break;
1865 case MSR_IA32_SYSENTER_ESP:
1866 msr_content = __vmread(GUEST_SYSENTER_ESP);
1867 break;
1868 case MSR_IA32_SYSENTER_EIP:
1869 msr_content = __vmread(GUEST_SYSENTER_EIP);
1870 break;
1871 case MSR_IA32_DEBUGCTLMSR:
1872 msr_content = __vmread(GUEST_IA32_DEBUGCTL);
1873 #ifdef __i386__
1874 msr_content |= (u64)__vmread(GUEST_IA32_DEBUGCTL_HIGH) << 32;
1875 #endif
1876 break;
1877 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
1878 goto gp_fault;
1879 case MSR_IA32_MISC_ENABLE:
1880 rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
1881 /* Debug Trace Store is not supported. */
1882 msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
1883 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
1884 break;
1885 default:
1886 if ( vpmu_do_rdmsr(regs) )
1887 goto done;
1888 if ( passive_domain_do_rdmsr(regs) )
1889 goto done;
1890 switch ( long_mode_do_msr_read(regs) )
1892 case HNDL_unhandled:
1893 break;
1894 case HNDL_exception_raised:
1895 return X86EMUL_EXCEPTION;
1896 case HNDL_done:
1897 goto done;
1900 if ( vmx_read_guest_msr(ecx, &msr_content) == 0 )
1901 break;
1903 if ( is_last_branch_msr(ecx) )
1905 msr_content = 0;
1906 break;
1909 if ( rdmsr_viridian_regs(ecx, &msr_content) ||
1910 rdmsr_hypervisor_regs(ecx, &msr_content) )
1911 break;
1913 if ( rdmsr_safe(ecx, eax, edx) == 0 )
1915 msr_content = ((uint64_t)edx << 32) | eax;
1916 break;
1919 goto gp_fault;
1922 regs->eax = (uint32_t)msr_content;
1923 regs->edx = (uint32_t)(msr_content >> 32);
1925 done:
1926 HVMTRACE_3D (MSR_READ, ecx, regs->eax, regs->edx);
1927 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
1928 ecx, (unsigned long)regs->eax,
1929 (unsigned long)regs->edx);
1930 return X86EMUL_OKAY;
1932 gp_fault:
1933 vmx_inject_hw_exception(TRAP_gp_fault, 0);
1934 return X86EMUL_EXCEPTION;
1937 static int vmx_alloc_vlapic_mapping(struct domain *d)
1939 void *apic_va;
1941 if ( !cpu_has_vmx_virtualize_apic_accesses )
1942 return 0;
1944 apic_va = alloc_xenheap_page();
1945 if ( apic_va == NULL )
1946 return -ENOMEM;
1947 share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable);
1948 set_mmio_p2m_entry(
1949 d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(virt_to_mfn(apic_va)));
1950 d->arch.hvm_domain.vmx.apic_access_mfn = virt_to_mfn(apic_va);
1952 return 0;
1955 static void vmx_free_vlapic_mapping(struct domain *d)
1957 unsigned long mfn = d->arch.hvm_domain.vmx.apic_access_mfn;
1958 if ( mfn != 0 )
1959 free_xenheap_page(mfn_to_virt(mfn));
1962 static void vmx_install_vlapic_mapping(struct vcpu *v)
1964 paddr_t virt_page_ma, apic_page_ma;
1966 if ( !cpu_has_vmx_virtualize_apic_accesses )
1967 return;
1969 virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
1970 apic_page_ma = v->domain->arch.hvm_domain.vmx.apic_access_mfn;
1971 apic_page_ma <<= PAGE_SHIFT;
1973 vmx_vmcs_enter(v);
1974 __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
1975 __vmwrite(APIC_ACCESS_ADDR, apic_page_ma);
1976 vmx_vmcs_exit(v);
1979 void vmx_vlapic_msr_changed(struct vcpu *v)
1981 struct vlapic *vlapic = vcpu_vlapic(v);
1982 uint32_t ctl;
1984 if ( !cpu_has_vmx_virtualize_apic_accesses )
1985 return;
1987 vmx_vmcs_enter(v);
1988 ctl = __vmread(SECONDARY_VM_EXEC_CONTROL);
1989 ctl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1990 if ( !vlapic_hw_disabled(vlapic) &&
1991 (vlapic_base_address(vlapic) == APIC_DEFAULT_PHYS_BASE) )
1992 ctl |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1993 __vmwrite(SECONDARY_VM_EXEC_CONTROL, ctl);
1994 vmx_vmcs_exit(v);
1997 static int vmx_msr_write_intercept(struct cpu_user_regs *regs)
1999 u32 ecx = regs->ecx;
2000 u64 msr_content;
2001 struct vcpu *v = current;
2003 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2004 ecx, (u32)regs->eax, (u32)regs->edx);
2006 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2008 HVMTRACE_3D (MSR_WRITE, ecx, regs->eax, regs->edx);
2010 switch ( ecx )
2012 case MSR_IA32_SYSENTER_CS:
2013 __vmwrite(GUEST_SYSENTER_CS, msr_content);
2014 break;
2015 case MSR_IA32_SYSENTER_ESP:
2016 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
2017 break;
2018 case MSR_IA32_SYSENTER_EIP:
2019 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
2020 break;
2021 case MSR_IA32_DEBUGCTLMSR: {
2022 int i, rc = 0;
2024 if ( !msr_content || (msr_content & ~3) )
2025 break;
2027 if ( msr_content & 1 )
2028 {
2029 const struct lbr_info *lbr = last_branch_msr_get();
2030 if ( lbr == NULL )
2031 break;
2033 for ( ; (rc == 0) && lbr->count; lbr++ )
2034 for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
2035 if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 )
2036 vmx_disable_intercept_for_msr(v, lbr->base + i);
2037 }
2039 if ( (rc < 0) ||
2040 (vmx_add_host_load_msr(ecx) < 0) )
2041 vmx_inject_hw_exception(TRAP_machine_check, 0);
2042 else
2043 {
2044 __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
2045 #ifdef __i386__
2046 __vmwrite(GUEST_IA32_DEBUGCTL_HIGH, msr_content >> 32);
2047 #endif
2048 }
2050 break;
2051 }
2052 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
2053 goto gp_fault;
2054 default:
2055 if ( vpmu_do_wrmsr(regs) )
2056 return X86EMUL_OKAY;
2057 if ( passive_domain_do_wrmsr(regs) )
2058 return X86EMUL_OKAY;
2060 if ( wrmsr_viridian_regs(ecx, msr_content) )
2061 break;
2063 switch ( long_mode_do_msr_write(regs) )
2064 {
2065 case HNDL_unhandled:
2066 if ( (vmx_write_guest_msr(ecx, msr_content) != 0) &&
2067 !is_last_branch_msr(ecx) )
2068 wrmsr_hypervisor_regs(ecx, msr_content);
2069 break;
2070 case HNDL_exception_raised:
2071 return X86EMUL_EXCEPTION;
2072 case HNDL_done:
2073 break;
2074 }
2075 break;
2076 }
2078 return X86EMUL_OKAY;
2080 gp_fault:
2081 vmx_inject_hw_exception(TRAP_gp_fault, 0);
2082 return X86EMUL_EXCEPTION;
2083 }
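/* A physical interrupt caused this exit: pull the vector out of the exit
 * interruption field and dispatch it to the matching Xen handler (IPIs,
 * local APIC timer, PMU, thermal, ...), falling back to do_IRQ() for
 * anything else. */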
2085 static void vmx_do_extint(struct cpu_user_regs *regs)
2086 {
2087 unsigned int vector;
2089 vector = __vmread(VM_EXIT_INTR_INFO);
2090 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
2092 vector &= INTR_INFO_VECTOR_MASK;
2093 HVMTRACE_1D(INTR, vector);
2095 switch ( vector )
2096 {
2097 case IRQ_MOVE_CLEANUP_VECTOR:
2098 smp_irq_move_cleanup_interrupt(regs);
2099 break;
2100 case LOCAL_TIMER_VECTOR:
2101 smp_apic_timer_interrupt(regs);
2102 break;
2103 case EVENT_CHECK_VECTOR:
2104 smp_event_check_interrupt(regs);
2105 break;
2106 case INVALIDATE_TLB_VECTOR:
2107 smp_invalidate_interrupt();
2108 break;
2109 case CALL_FUNCTION_VECTOR:
2110 smp_call_function_interrupt(regs);
2111 break;
2112 case SPURIOUS_APIC_VECTOR:
2113 smp_spurious_interrupt(regs);
2114 break;
2115 case ERROR_APIC_VECTOR:
2116 smp_error_interrupt(regs);
2117 break;
2118 case CMCI_APIC_VECTOR:
2119 smp_cmci_interrupt(regs);
2120 break;
2121 case PMU_APIC_VECTOR:
2122 smp_pmu_apic_interrupt(regs);
2123 break;
2124 #ifdef CONFIG_X86_MCE_THERMAL
2125 case THERMAL_APIC_VECTOR:
2126 smp_thermal_interrupt(regs);
2127 break;
2128 #endif
2129 default:
2130 regs->entry_vector = vector;
2131 do_IRQ(regs);
2132 break;
2133 }
2134 }
2136 static void wbinvd_ipi(void *info)
2137 {
2138 wbinvd();
2139 }
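/* Guest WBINVD/INVD intercept: cache flushes only matter when the domain
 * has directly assigned devices. If the CPU supports WBINVD exiting the
 * flush is broadcast to every CPU via IPI, otherwise it is executed
 * locally. */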
2141 static void vmx_wbinvd_intercept(void)
2142 {
2143 if ( !has_arch_pdevs(current->domain) )
2144 return;
2146 if ( cpu_has_wbinvd_exiting )
2147 on_each_cpu(wbinvd_ipi, NULL, 1);
2148 else
2149 wbinvd();
2150 }
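/* EPT violations that hvm_hap_nested_page_fault() cannot resolve are
 * treated as fatal: log the attempted and effective access rights from the
 * exit qualification, plus the guest linear address when valid, and crash
 * the offending domain. */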
2152 static void ept_handle_violation(unsigned long qualification, paddr_t gpa)
2153 {
2154 unsigned long gla, gfn = gpa >> PAGE_SHIFT;
2155 mfn_t mfn;
2156 p2m_type_t p2mt;
2158 if ( (qualification & EPT_GLA_VALID) &&
2159 hvm_hap_nested_page_fault(gfn) )
2160 return;
2162 /* Everything else is an error. */
2163 mfn = gfn_to_mfn_type_current(gfn, &p2mt, p2m_guest);
2164 gdprintk(XENLOG_ERR, "EPT violation %#lx (%c%c%c/%c%c%c), "
2165 "gpa %#"PRIpaddr", mfn %#lx, type %i.\n",
2166 qualification,
2167 (qualification & EPT_READ_VIOLATION) ? 'r' : '-',
2168 (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-',
2169 (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-',
2170 (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-',
2171 (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
2172 (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-',
2173 gpa, mfn_x(mfn), p2mt);
2175 if ( qualification & EPT_GLA_VALID )
2176 {
2177 gla = __vmread(GUEST_LINEAR_ADDRESS);
2178 gdprintk(XENLOG_ERR, " --- GLA %#lx\n", gla);
2179 }
2181 if ( qualification & EPT_GAW_VIOLATION )
2182 gdprintk(XENLOG_ERR, " --- GPA too wide (max %u bits)\n",
2183 9 * (unsigned int)current->domain->arch.hvm_domain.
2184 vmx.ept_control.gaw + 21);
2186 domain_crash(current->domain);
2187 }
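/* VM entry itself failed: report whether the cause was invalid guest
 * state, an MSR-load failure or a machine check during entry, dump the
 * VMCS for post-mortem, and crash the domain. */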
2189 static void vmx_failed_vmentry(unsigned int exit_reason,
2190 struct cpu_user_regs *regs)
2191 {
2192 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2193 unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
2194 struct vcpu *curr = current;
2196 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2197 switch ( failed_vmentry_reason )
2198 {
2199 case EXIT_REASON_INVALID_GUEST_STATE:
2200 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2201 break;
2202 case EXIT_REASON_MSR_LOADING:
2203 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2204 break;
2205 case EXIT_REASON_MCE_DURING_VMENTRY:
2206 printk("caused by machine check.\n");
2207 HVMTRACE_0D(MCE);
2208 do_machine_check(regs);
2209 break;
2210 default:
2211 printk("reason not known yet!");
2212 break;
2213 }
2215 printk("************* VMCS Area **************\n");
2216 vmcs_dump_vcpu(curr);
2217 printk("**************************************\n");
2219 domain_crash(curr->domain);
2220 }
2222 asmlinkage void vmx_enter_realmode(struct cpu_user_regs *regs)
2223 {
2224 struct vcpu *v = current;
2226 /* Adjust RFLAGS to enter virtual 8086 mode with IOPL == 3. Since
2227 * we have CR4.VME == 1 and our own TSS with an empty interrupt
2228 * redirection bitmap, all software INTs will be handled by vm86 */
2229 v->arch.hvm_vmx.vm86_saved_eflags = regs->eflags;
2230 regs->eflags |= (X86_EFLAGS_VM | X86_EFLAGS_IOPL);
2231 }
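/* #UD intercept: hand the faulting instruction to the HVM emulator; if the
 * emulator cannot handle it either, reflect #UD back into the guest, and
 * propagate any exception the emulation itself raised. */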
2233 static void vmx_vmexit_ud_intercept(struct cpu_user_regs *regs)
2234 {
2235 struct hvm_emulate_ctxt ctxt;
2236 int rc;
2238 hvm_emulate_prepare(&ctxt, regs);
2240 rc = hvm_emulate_one(&ctxt);
2242 switch ( rc )
2243 {
2244 case X86EMUL_UNHANDLEABLE:
2245 vmx_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
2246 break;
2247 case X86EMUL_EXCEPTION:
2248 if ( ctxt.exn_pending )
2249 hvm_inject_exception(ctxt.exn_vector, ctxt.exn_error_code, 0);
2250 /* fall through */
2251 default:
2252 hvm_emulate_writeback(&ctxt);
2253 break;
2254 }
2255 }
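/* Fast path for trapped APIC accesses: if the exit qualification shows a
 * linear-access data write to the EOI register, complete the EOI here and
 * skip full MMIO emulation. Returns 1 when handled, 0 otherwise. */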
2257 static int vmx_handle_eoi_write(void)
2258 {
2259 unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
2261 /*
2262 * 1. Must be a linear access data write.
2263 * 2. Data write must be to the EOI register.
2264 */
2265 if ( (((exit_qualification >> 12) & 0xf) == 1) &&
2266 ((exit_qualification & 0xfff) == APIC_EOI) )
2267 {
2268 int inst_len = __get_instruction_length(); /* Safe: APIC data write */
2269 __update_guest_eip(inst_len);
2270 vlapic_EOI_set(vcpu_vlapic(current));
2271 return 1;
2272 }
2274 return 0;
2275 }
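/* XSETBV intercept: only CPL0 may load XCR0, the requested feature mask
 * must be a subset of what the host exposes with the x87 bit set, and
 * enabling YMM state requires SSE state; anything else injects #GP. */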
2277 static int vmx_handle_xsetbv(u64 new_bv)
2278 {
2279 struct vcpu *v = current;
2280 u64 xfeature = (((u64)xfeature_high) << 32) | xfeature_low;
2281 struct segment_register sreg;
2283 hvm_get_segment_register(v, x86_seg_ss, &sreg);
2284 if ( sreg.attr.fields.dpl != 0 )
2285 goto err;
2287 if ( ((new_bv ^ xfeature) & ~xfeature) || !(new_bv & 1) )
2288 goto err;
2290 if ( (xfeature & XSTATE_YMM & new_bv) && !(new_bv & XSTATE_SSE) )
2291 goto err;
2293 v->arch.hvm_vcpu.xfeature_mask = new_bv;
2294 set_xcr0(new_bv);
2295 return 0;
2296 err:
2297 vmx_inject_hw_exception(TRAP_gp_fault, 0);
2298 return -1;
2299 }
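/* Main VM-exit dispatcher: runs with interrupts disabled until a pending
 * external interrupt (if that is what caused the exit) has been serviced,
 * unwinds the real-mode RFLAGS fixup, re-queues any event whose delivery
 * was cut short, and then switches on the exit reason. */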
2301 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2302 {
2303 unsigned int exit_reason, idtv_info;
2304 unsigned long exit_qualification, inst_len = 0;
2305 struct vcpu *v = current;
2307 if ( paging_mode_hap(v->domain) && hvm_paging_enabled(v) )
2308 v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3] =
2309 __vmread(GUEST_CR3);
2311 exit_reason = __vmread(VM_EXIT_REASON);
2313 if ( hvm_long_mode_enabled(v) )
2314 HVMTRACE_ND(VMEXIT64, 1/*cycles*/, 3, exit_reason,
2315 (uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32),
2316 0, 0, 0);
2317 else
2318 HVMTRACE_ND(VMEXIT, 1/*cycles*/, 2, exit_reason,
2319 (uint32_t)regs->eip,
2320 0, 0, 0, 0);
2322 perfc_incra(vmexits, exit_reason);
2324 /* Handle the interrupt we missed before allowing any more in. */
2325 if ( exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT )
2326 vmx_do_extint(regs);
2328 /* Now enable interrupts so it's safe to take locks. */
2329 local_irq_enable();
2331 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2332 return vmx_failed_vmentry(exit_reason, regs);
2334 if ( v->arch.hvm_vmx.vmx_realmode )
2335 {
2336 unsigned int vector;
2338 /* Put RFLAGS back the way the guest wants it */
2339 regs->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IOPL);
2340 regs->eflags |= (v->arch.hvm_vmx.vm86_saved_eflags & X86_EFLAGS_IOPL);
2342 /* Unless this exit was for an interrupt, we've hit something
2343 * vm86 can't handle. Try again, using the emulator. */
2344 switch ( exit_reason )
2345 {
2346 case EXIT_REASON_EXCEPTION_NMI:
2347 vector = __vmread(VM_EXIT_INTR_INFO) & INTR_INFO_VECTOR_MASK;
2348 if ( vector != TRAP_page_fault
2349 && vector != TRAP_nmi
2350 && vector != TRAP_machine_check )
2351 {
2352 perfc_incr(realmode_exits);
2353 v->arch.hvm_vmx.vmx_emulate = 1;
2354 return;
2355 }
2356 case EXIT_REASON_EXTERNAL_INTERRUPT:
2357 case EXIT_REASON_INIT:
2358 case EXIT_REASON_SIPI:
2359 case EXIT_REASON_PENDING_VIRT_INTR:
2360 case EXIT_REASON_PENDING_VIRT_NMI:
2361 case EXIT_REASON_MCE_DURING_VMENTRY:
2362 break;
2363 default:
2364 v->arch.hvm_vmx.vmx_emulate = 1;
2365 perfc_incr(realmode_exits);
2366 return;
2367 }
2368 }
2370 hvm_maybe_deassert_evtchn_irq();
2372 /* Event delivery caused this intercept? Queue for redelivery. */
2373 idtv_info = __vmread(IDT_VECTORING_INFO);
2374 if ( unlikely(idtv_info & INTR_INFO_VALID_MASK) &&
2375 (exit_reason != EXIT_REASON_TASK_SWITCH) )
2376 {
2377 if ( hvm_event_needs_reinjection((idtv_info>>8)&7, idtv_info&0xff) )
2378 {
2379 /* See SDM 3B 25.7.1.1 and .2 for info about masking resvd bits. */
2380 __vmwrite(VM_ENTRY_INTR_INFO,
2381 idtv_info & ~INTR_INFO_RESVD_BITS_MASK);
2382 if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK )
2383 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE,
2384 __vmread(IDT_VECTORING_ERROR_CODE));
2385 }
2387 /*
2388 * Clear NMI-blocking interruptibility info if an NMI delivery faulted.
2389 * Re-delivery will re-set it (see SDM 3B 25.7.1.2).
2390 */
2391 if ( (idtv_info & INTR_INFO_INTR_TYPE_MASK) == (X86_EVENTTYPE_NMI<<8) )
2392 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2393 __vmread(GUEST_INTERRUPTIBILITY_INFO) &
2394 ~VMX_INTR_SHADOW_NMI);
2395 }
2397 switch ( exit_reason )
2398 {
2399 case EXIT_REASON_EXCEPTION_NMI:
2400 {
2401 /*
2402 * We don't set the software-interrupt exiting (INT n).
2403 * (1) We can get an exception (e.g. #PG) in the guest, or
2404 * (2) NMI
2405 */
2406 unsigned int intr_info, vector;
2408 intr_info = __vmread(VM_EXIT_INTR_INFO);
2409 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2411 vector = intr_info & INTR_INFO_VECTOR_MASK;
2413 /*
2414 * Re-set the NMI shadow if vmexit caused by a guest IRET fault (see 3B
2415 * 25.7.1.2, "Resuming Guest Software after Handling an Exception").
2416 * (NB. If we emulate this IRET for any reason, we should re-clear!)
2417 */
2418 if ( unlikely(intr_info & INTR_INFO_NMI_UNBLOCKED_BY_IRET) &&
2419 !(idtv_info & INTR_INFO_VALID_MASK) &&
2420 (vector != TRAP_double_fault) )
2421 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2422 __vmread(GUEST_INTERRUPTIBILITY_INFO)
2423 | VMX_INTR_SHADOW_NMI);
2425 perfc_incra(cause_vector, vector);
2427 switch ( vector )
2428 {
2429 case TRAP_debug:
2430 /*
2431 * Updates DR6 where debugger can peek (See 3B 23.2.1,
2432 * Table 23-1, "Exit Qualification for Debug Exceptions").
2433 */
2434 exit_qualification = __vmread(EXIT_QUALIFICATION);
2435 write_debugreg(6, exit_qualification | 0xffff0ff0);
2436 if ( !v->domain->debugger_attached || cpu_has_monitor_trap_flag )
2437 goto exit_and_crash;
2438 domain_pause_for_debugger();
2439 break;
2440 case TRAP_int3:
2441 if ( !v->domain->debugger_attached )
2442 goto exit_and_crash;
2443 inst_len = __get_instruction_length(); /* Safe: INT3 */
2444 __update_guest_eip(inst_len);
2445 #ifdef XEN_GDBSX_CONFIG
2446 current->arch.gdbsx_vcpu_event = TRAP_int3;
2447 #endif
2448 domain_pause_for_debugger();
2449 break;
2450 case TRAP_no_device:
2451 vmx_fpu_dirty_intercept();
2452 break;
2453 case TRAP_page_fault:
2454 exit_qualification = __vmread(EXIT_QUALIFICATION);
2455 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2457 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2458 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2459 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2460 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2461 (unsigned long)regs->esi, (unsigned long)regs->edi);
2463 if ( paging_fault(exit_qualification, regs) )
2464 {
2465 if ( trace_will_trace_event(TRC_SHADOW) )
2466 break;
2467 if ( hvm_long_mode_enabled(v) )
2468 HVMTRACE_LONG_2D(PF_XEN, regs->error_code,
2469 TRC_PAR_LONG(exit_qualification) );
2470 else
2471 HVMTRACE_2D(PF_XEN,
2472 regs->error_code, exit_qualification );
2473 break;
2474 }
2476 v->arch.hvm_vcpu.guest_cr[2] = exit_qualification;
2477 vmx_inject_hw_exception(TRAP_page_fault, regs->error_code);
2478 break;
2479 case TRAP_nmi:
2480 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) !=
2481 (X86_EVENTTYPE_NMI << 8) )
2482 goto exit_and_crash;
2483 HVMTRACE_0D(NMI);
2484 self_nmi(); /* Real NMI, vector 2: normal processing. */
2485 break;
2486 case TRAP_machine_check:
2487 HVMTRACE_0D(MCE);
2488 do_machine_check(regs);
2489 break;
2490 case TRAP_invalid_op:
2491 vmx_vmexit_ud_intercept(regs);
2492 break;
2493 default:
2494 goto exit_and_crash;
2495 }
2496 break;
2497 }
2498 case EXIT_REASON_EXTERNAL_INTERRUPT:
2499 /* Already handled above. */
2500 break;
2501 case EXIT_REASON_TRIPLE_FAULT:
2502 hvm_triple_fault();
2503 break;
2504 case EXIT_REASON_PENDING_VIRT_INTR:
2505 /* Disable the interrupt window. */
2506 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2507 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2508 v->arch.hvm_vmx.exec_control);
2509 break;
2510 case EXIT_REASON_PENDING_VIRT_NMI:
2511 /* Disable the NMI window. */
2512 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2513 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2514 v->arch.hvm_vmx.exec_control);
2515 break;
2516 case EXIT_REASON_TASK_SWITCH: {
2517 const enum hvm_task_switch_reason reasons[] = {
2518 TSW_call_or_int, TSW_iret, TSW_jmp, TSW_call_or_int };
2519 int32_t ecode = -1, source;
2520 exit_qualification = __vmread(EXIT_QUALIFICATION);
2521 source = (exit_qualification >> 30) & 3;
2522 /* Vectored event should fill in interrupt information. */
2523 WARN_ON((source == 3) && !(idtv_info & INTR_INFO_VALID_MASK));
2524 /*
2525 * In the following cases there is an instruction to skip over:
2526 * - TSW is due to a CALL, IRET or JMP instruction.
2527 * - TSW is a vectored event due to a SW exception or SW interrupt.
2528 */
2529 inst_len = ((source != 3) || /* CALL, IRET, or JMP? */
2530 (idtv_info & (1u<<10))) /* IntrType > 3? */
2531 ? __get_instruction_length() /* Safe: SDM 3B 23.2.4 */ : 0;
2532 if ( (source == 3) && (idtv_info & INTR_INFO_DELIVER_CODE_MASK) )
2533 ecode = __vmread(IDT_VECTORING_ERROR_CODE);
2534 regs->eip += inst_len;
2535 hvm_task_switch((uint16_t)exit_qualification, reasons[source], ecode);
2536 break;
2537 }
2538 case EXIT_REASON_CPUID:
2539 inst_len = __get_instruction_length(); /* Safe: CPUID */
2540 __update_guest_eip(inst_len);
2541 vmx_do_cpuid(regs);
2542 break;
2543 case EXIT_REASON_HLT:
2544 inst_len = __get_instruction_length(); /* Safe: HLT */
2545 __update_guest_eip(inst_len);
2546 hvm_hlt(regs->eflags);
2547 break;
2548 case EXIT_REASON_INVLPG:
2549 {
2550 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2551 __update_guest_eip(inst_len);
2552 exit_qualification = __vmread(EXIT_QUALIFICATION);
2553 vmx_invlpg_intercept(exit_qualification);
2554 break;
2555 }
2556 case EXIT_REASON_RDTSC:
2557 inst_len = __get_instruction_length();
2558 __update_guest_eip(inst_len);
2559 hvm_rdtsc_intercept(regs);
2560 break;
2561 #ifdef __x86_64__
2562 case EXIT_REASON_RDTSCP:
2563 {
2564 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
2565 inst_len = __get_instruction_length();
2566 __update_guest_eip(inst_len);
2567 hvm_rdtsc_intercept(regs);
2568 regs->ecx = (uint32_t)(guest_state->msrs[VMX_INDEX_MSR_TSC_AUX]);
2569 break;
2570 }
2571 #endif
2572 case EXIT_REASON_VMCALL:
2573 {
2574 int rc;
2575 HVMTRACE_1D(VMMCALL, regs->eax);
2576 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2577 rc = hvm_do_hypercall(regs);
2578 if ( rc != HVM_HCALL_preempted )
2579 {
2580 __update_guest_eip(inst_len);
2581 if ( rc == HVM_HCALL_invalidate )
2582 send_invalidate_req();
2583 }
2584 break;
2585 }
2586 case EXIT_REASON_CR_ACCESS:
2587 {
2588 exit_qualification = __vmread(EXIT_QUALIFICATION);
2589 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2590 if ( vmx_cr_access(exit_qualification, regs) )
2591 __update_guest_eip(inst_len);
2592 break;
2593 }
2594 case EXIT_REASON_DR_ACCESS:
2595 exit_qualification = __vmread(EXIT_QUALIFICATION);
2596 vmx_dr_access(exit_qualification, regs);
2597 break;
2598 case EXIT_REASON_MSR_READ:
2599 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2600 if ( hvm_msr_read_intercept(regs) == X86EMUL_OKAY )
2601 __update_guest_eip(inst_len);
2602 break;
2603 case EXIT_REASON_MSR_WRITE:
2604 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2605 if ( hvm_msr_write_intercept(regs) == X86EMUL_OKAY )
2606 __update_guest_eip(inst_len);
2607 break;
2609 case EXIT_REASON_MWAIT_INSTRUCTION:
2610 case EXIT_REASON_MONITOR_INSTRUCTION:
2611 case EXIT_REASON_VMCLEAR:
2612 case EXIT_REASON_VMLAUNCH:
2613 case EXIT_REASON_VMPTRLD:
2614 case EXIT_REASON_VMPTRST:
2615 case EXIT_REASON_VMREAD:
2616 case EXIT_REASON_VMRESUME:
2617 case EXIT_REASON_VMWRITE:
2618 case EXIT_REASON_VMXOFF:
2619 case EXIT_REASON_VMXON:
2620 vmx_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
2621 break;
2623 case EXIT_REASON_TPR_BELOW_THRESHOLD:
2624 break;
2626 case EXIT_REASON_APIC_ACCESS:
2627 if ( !vmx_handle_eoi_write() && !handle_mmio() )
2628 vmx_inject_hw_exception(TRAP_gp_fault, 0);
2629 break;
2631 case EXIT_REASON_IO_INSTRUCTION:
2632 if ( !handle_mmio() )
2633 vmx_inject_hw_exception(TRAP_gp_fault, 0);
2634 break;
2636 case EXIT_REASON_INVD:
2637 case EXIT_REASON_WBINVD:
2638 {
2639 inst_len = __get_instruction_length(); /* Safe: INVD, WBINVD */
2640 __update_guest_eip(inst_len);
2641 vmx_wbinvd_intercept();
2642 break;
2643 }
2645 case EXIT_REASON_EPT_VIOLATION:
2646 {
2647 paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
2648 #ifdef __i386__
2649 gpa |= (paddr_t)__vmread(GUEST_PHYSICAL_ADDRESS_HIGH) << 32;
2650 #endif
2651 exit_qualification = __vmread(EXIT_QUALIFICATION);
2652 ept_handle_violation(exit_qualification, gpa);
2653 break;
2654 }
2656 case EXIT_REASON_MONITOR_TRAP_FLAG:
2657 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
2658 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
2659 if ( v->domain->debugger_attached && v->arch.hvm_vcpu.single_step )
2660 domain_pause_for_debugger();
2661 break;
2663 case EXIT_REASON_PAUSE_INSTRUCTION:
2664 perfc_incr(pauseloop_exits);
2665 do_sched_op_compat(SCHEDOP_yield, 0);
2666 break;
2668 case EXIT_REASON_XSETBV:
2669 {
2670 u64 new_bv = (((u64)regs->edx) << 32) | regs->eax;
2671 if ( vmx_handle_xsetbv(new_bv) == 0 )
2672 {
2673 inst_len = __get_instruction_length();
2674 __update_guest_eip(inst_len);
2675 }
2676 break;
2677 }
2679 default:
2680 exit_and_crash:
2681 gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
2682 domain_crash(v->domain);
2683 break;
2684 }
2685 }
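/* Runs just before resuming the guest: pick up a fresh ASID/VPID if the
 * current one was retired, toggle the VPID execution control when the ASID
 * transitions between zero and non-zero, and flush VPID-tagged TLB entries
 * when a flush was requested. */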
2687 asmlinkage void vmx_vmenter_helper(void)
2688 {
2689 struct vcpu *curr = current;
2690 u32 new_asid, old_asid;
2691 bool_t need_flush;
2693 if ( !cpu_has_vmx_vpid )
2694 goto out;
2696 old_asid = curr->arch.hvm_vcpu.asid;
2697 need_flush = hvm_asid_handle_vmenter();
2698 new_asid = curr->arch.hvm_vcpu.asid;
2700 if ( unlikely(new_asid != old_asid) )
2701 {
2702 __vmwrite(VIRTUAL_PROCESSOR_ID, new_asid);
2703 if ( !old_asid && new_asid )
2704 {
2705 /* VPID was disabled: now enabled. */
2706 curr->arch.hvm_vmx.secondary_exec_control |=
2707 SECONDARY_EXEC_ENABLE_VPID;
2708 __vmwrite(SECONDARY_VM_EXEC_CONTROL,
2709 curr->arch.hvm_vmx.secondary_exec_control);
2710 }
2711 else if ( old_asid && !new_asid )
2712 {
2713 /* VPID was enabled: now disabled. */
2714 curr->arch.hvm_vmx.secondary_exec_control &=
2715 ~SECONDARY_EXEC_ENABLE_VPID;
2716 __vmwrite(SECONDARY_VM_EXEC_CONTROL,
2717 curr->arch.hvm_vmx.secondary_exec_control);
2718 }
2719 }
2721 if ( unlikely(need_flush) )
2722 vpid_sync_all();
2724 out:
2725 HVMTRACE_ND (VMENTRY, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
2726 }
2728 /*
2729 * Local variables:
2730 * mode: C
2731 * c-set-style: "BSD"
2732 * c-basic-offset: 4
2733 * tab-width: 4
2734 * indent-tabs-mode: nil
2735 * End:
2736 */