
xen/arch/x86/hvm/vmx/vmx.c @ 19826:2f9e1348aa98

x86_64: allow more vCPU-s per guest

Since the shared info layout is fixed, guests are required to use
VCPUOP_register_vcpu_info prior to booting any vCPU beyond the
traditional limit of 32.
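
As a rough guest-side illustration of that requirement (a minimal sketch
against the public vcpu.h interface; the helper name, the page allocation
and the error handling are assumptions, not part of this change):

    /* Point vcpu_id's vcpu_info at a guest-allocated page before the
     * vCPU is brought online -- mandatory for vcpu_id >= 32. */
    static int register_vcpu_info(unsigned int vcpu_id,
                                  unsigned long page_mfn,
                                  unsigned int offset)
    {
        struct vcpu_register_vcpu_info info = {
            .mfn    = page_mfn,   /* machine frame holding the vcpu_info */
            .offset = offset,     /* byte offset of vcpu_info in that page */
        };

        return HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, vcpu_id, &info);
    }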

MAX_VIRT_CPUS, being an implementation detail of the hypervisor, is no
longer exposed in the public headers.

The tools changes are clearly incomplete (and done only so things would
build again), and the current state of the tools (using scalar variables
all over the place to represent vCPU bitmaps) very likely doesn't permit
booting DomU-s with more than the traditional number of vCPU-s. Testing
of the extended functionality was done with Dom0 (96 vCPU-s, as well as
128 vCPU-s out of which the kernel elected - by way of a simple kernel
side patch - to use only some, resulting in a sparse bitmap).

The ia64 changes exist only to make things build, and were build-tested
only (the tools part only as far as the build would go without
encountering unrelated problems in the blktap code).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Jun 18 10:14:16 2009 +0100 (2009-06-18)
parents 44fe7ad6fee8
children af06333d4c5d
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 */
19 #include <xen/config.h>
20 #include <xen/init.h>
21 #include <xen/lib.h>
22 #include <xen/trace.h>
23 #include <xen/sched.h>
24 #include <xen/irq.h>
25 #include <xen/softirq.h>
26 #include <xen/domain_page.h>
27 #include <xen/hypercall.h>
28 #include <xen/perfc.h>
29 #include <asm/current.h>
30 #include <asm/io.h>
31 #include <asm/regs.h>
32 #include <asm/cpufeature.h>
33 #include <asm/processor.h>
34 #include <asm/types.h>
35 #include <asm/debugreg.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/paging.h>
39 #include <asm/p2m.h>
40 #include <asm/hvm/emulate.h>
41 #include <asm/hvm/hvm.h>
42 #include <asm/hvm/support.h>
43 #include <asm/hvm/vmx/vmx.h>
44 #include <asm/hvm/vmx/vmcs.h>
45 #include <public/sched.h>
46 #include <public/hvm/ioreq.h>
47 #include <asm/hvm/vpic.h>
48 #include <asm/hvm/vlapic.h>
49 #include <asm/x86_emulate.h>
50 #include <asm/hvm/vpt.h>
51 #include <public/hvm/save.h>
52 #include <asm/hvm/trace.h>
53 #include <asm/xenoprof.h>
55 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
57 static void vmx_ctxt_switch_from(struct vcpu *v);
58 static void vmx_ctxt_switch_to(struct vcpu *v);
60 static int vmx_alloc_vlapic_mapping(struct domain *d);
61 static void vmx_free_vlapic_mapping(struct domain *d);
62 static int vmx_alloc_vpid(struct domain *d);
63 static void vmx_free_vpid(struct domain *d);
64 static void vmx_install_vlapic_mapping(struct vcpu *v);
65 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr);
66 static void vmx_update_guest_efer(struct vcpu *v);
67 static void vmx_cpuid_intercept(
68 unsigned int *eax, unsigned int *ebx,
69 unsigned int *ecx, unsigned int *edx);
70 static void vmx_wbinvd_intercept(void);
71 static void vmx_fpu_dirty_intercept(void);
72 static int vmx_msr_read_intercept(struct cpu_user_regs *regs);
73 static int vmx_msr_write_intercept(struct cpu_user_regs *regs);
74 static void vmx_invlpg_intercept(unsigned long vaddr);
76 static int vmx_domain_initialise(struct domain *d)
77 {
78 int rc;
80 d->arch.hvm_domain.vmx.ept_control.etmt = EPT_DEFAULT_MT;
81 d->arch.hvm_domain.vmx.ept_control.gaw = EPT_DEFAULT_GAW;
82 d->arch.hvm_domain.vmx.ept_control.asr =
83 pagetable_get_pfn(d->arch.phys_table);
85 if ( (rc = vmx_alloc_vpid(d)) != 0 )
86 return rc;
88 if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 )
89 {
90 vmx_free_vpid(d);
91 return rc;
92 }
94 return 0;
95 }
97 static void vmx_domain_destroy(struct domain *d)
98 {
99 ept_sync_domain(d);
100 vmx_free_vlapic_mapping(d);
101 vmx_free_vpid(d);
102 }
104 static int vmx_vcpu_initialise(struct vcpu *v)
105 {
106 int rc;
108 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
110 v->arch.schedule_tail = vmx_do_resume;
111 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
112 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
114 if ( (rc = vmx_create_vmcs(v)) != 0 )
115 {
116 dprintk(XENLOG_WARNING,
117 "Failed to create VMCS for vcpu %d: err=%d.\n",
118 v->vcpu_id, rc);
119 return rc;
120 }
122 vpmu_initialise(v);
124 vmx_install_vlapic_mapping(v);
126 /* %eax == 1 signals full real-mode support to the guest loader. */
127 if ( v->vcpu_id == 0 )
128 v->arch.guest_context.user_regs.eax = 1;
130 return 0;
131 }
133 static void vmx_vcpu_destroy(struct vcpu *v)
134 {
135 vmx_destroy_vmcs(v);
136 vpmu_destroy(v);
137 passive_domain_destroy(v);
138 }
140 #ifdef __x86_64__
142 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
144 static u32 msr_index[VMX_MSR_COUNT] =
145 {
146 MSR_LSTAR, MSR_STAR, MSR_SYSCALL_MASK
147 };
149 static void vmx_save_host_msrs(void)
150 {
151 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
152 int i;
154 for ( i = 0; i < VMX_MSR_COUNT; i++ )
155 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
156 }
158 #define WRITE_MSR(address) \
159 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
160 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
161 wrmsrl(MSR_ ## address, msr_content); \
162 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
163 break
165 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
166 {
167 u64 msr_content = 0;
168 u32 ecx = regs->ecx;
169 struct vcpu *v = current;
170 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
172 switch ( ecx )
173 {
174 case MSR_EFER:
175 msr_content = v->arch.hvm_vcpu.guest_efer;
176 break;
178 case MSR_FS_BASE:
179 msr_content = __vmread(GUEST_FS_BASE);
180 goto check_long_mode;
182 case MSR_GS_BASE:
183 msr_content = __vmread(GUEST_GS_BASE);
184 goto check_long_mode;
186 case MSR_SHADOW_GS_BASE:
187 msr_content = v->arch.hvm_vmx.shadow_gs;
188 check_long_mode:
189 if ( !(hvm_long_mode_enabled(v)) )
190 {
191 vmx_inject_hw_exception(TRAP_gp_fault, 0);
192 return HNDL_exception_raised;
193 }
194 break;
196 case MSR_STAR:
197 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
198 break;
200 case MSR_LSTAR:
201 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
202 break;
204 case MSR_CSTAR:
205 msr_content = v->arch.hvm_vmx.cstar;
206 break;
208 case MSR_SYSCALL_MASK:
209 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
210 break;
212 default:
213 return HNDL_unhandled;
214 }
216 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
218 regs->eax = (u32)(msr_content >> 0);
219 regs->edx = (u32)(msr_content >> 32);
221 return HNDL_done;
222 }
224 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
225 {
226 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
227 u32 ecx = regs->ecx;
228 struct vcpu *v = current;
229 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
230 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
232 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
234 switch ( ecx )
235 {
236 case MSR_EFER:
237 if ( hvm_set_efer(msr_content) )
238 goto exception_raised;
239 break;
241 case MSR_FS_BASE:
242 case MSR_GS_BASE:
243 case MSR_SHADOW_GS_BASE:
244 if ( !hvm_long_mode_enabled(v) )
245 goto gp_fault;
247 if ( !is_canonical_address(msr_content) )
248 goto uncanonical_address;
250 if ( ecx == MSR_FS_BASE )
251 __vmwrite(GUEST_FS_BASE, msr_content);
252 else if ( ecx == MSR_GS_BASE )
253 __vmwrite(GUEST_GS_BASE, msr_content);
254 else
255 {
256 v->arch.hvm_vmx.shadow_gs = msr_content;
257 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
258 }
260 break;
262 case MSR_STAR:
263 WRITE_MSR(STAR);
265 case MSR_LSTAR:
266 if ( !is_canonical_address(msr_content) )
267 goto uncanonical_address;
268 WRITE_MSR(LSTAR);
270 case MSR_CSTAR:
271 if ( !is_canonical_address(msr_content) )
272 goto uncanonical_address;
273 v->arch.hvm_vmx.cstar = msr_content;
274 break;
276 case MSR_SYSCALL_MASK:
277 WRITE_MSR(SYSCALL_MASK);
279 default:
280 return HNDL_unhandled;
281 }
283 return HNDL_done;
285 uncanonical_address:
286 HVM_DBG_LOG(DBG_LEVEL_0, "Not canonical address of msr write %x", ecx);
287 gp_fault:
288 vmx_inject_hw_exception(TRAP_gp_fault, 0);
289 exception_raised:
290 return HNDL_exception_raised;
291 }
293 /*
294 * To avoid MSR save/restore at every VM exit/entry time, we restore
295 * the x86_64 specific MSRs at domain switch time. Since these MSRs
296 * are not modified once set for para domains, we don't save them,
297 * but simply reset them to values set in percpu_traps_init().
298 */
299 static void vmx_restore_host_msrs(void)
300 {
301 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
302 int i;
304 while ( host_msr_state->flags )
305 {
306 i = find_first_set_bit(host_msr_state->flags);
307 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
308 clear_bit(i, &host_msr_state->flags);
309 }
310 }
312 static void vmx_save_guest_msrs(struct vcpu *v)
313 {
314 /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
315 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
316 }
318 static void vmx_restore_guest_msrs(struct vcpu *v)
319 {
320 struct vmx_msr_state *guest_msr_state, *host_msr_state;
321 unsigned long guest_flags;
322 int i;
324 guest_msr_state = &v->arch.hvm_vmx.msr_state;
325 host_msr_state = &this_cpu(host_msr_state);
327 wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
329 guest_flags = guest_msr_state->flags;
331 while ( guest_flags )
332 {
333 i = find_first_set_bit(guest_flags);
335 HVM_DBG_LOG(DBG_LEVEL_2,
336 "restore guest's index %d msr %x with value %lx",
337 i, msr_index[i], guest_msr_state->msrs[i]);
338 set_bit(i, &host_msr_state->flags);
339 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
340 clear_bit(i, &guest_flags);
341 }
343 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & EFER_SCE )
344 {
345 HVM_DBG_LOG(DBG_LEVEL_2,
346 "restore guest's EFER with value %lx",
347 v->arch.hvm_vcpu.guest_efer);
348 write_efer((read_efer() & ~EFER_SCE) |
349 (v->arch.hvm_vcpu.guest_efer & EFER_SCE));
350 }
351 }
353 #else /* __i386__ */
355 #define vmx_save_host_msrs() ((void)0)
356 #define vmx_restore_host_msrs() ((void)0)
358 #define vmx_save_guest_msrs(v) ((void)0)
359 #define vmx_restore_guest_msrs(v) ((void)0)
361 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
362 {
363 u64 msr_content = 0;
364 struct vcpu *v = current;
366 switch ( regs->ecx )
367 {
368 case MSR_EFER:
369 msr_content = v->arch.hvm_vcpu.guest_efer;
370 break;
372 default:
373 return HNDL_unhandled;
374 }
376 regs->eax = msr_content >> 0;
377 regs->edx = msr_content >> 32;
379 return HNDL_done;
380 }
382 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
383 {
384 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
386 switch ( regs->ecx )
387 {
388 case MSR_EFER:
389 if ( hvm_set_efer(msr_content) )
390 return HNDL_exception_raised;
391 break;
393 default:
394 return HNDL_unhandled;
395 }
397 return HNDL_done;
398 }
400 #endif /* __i386__ */
402 static int vmx_guest_x86_mode(struct vcpu *v)
403 {
404 unsigned int cs_ar_bytes;
406 if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
407 return 0;
408 if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
409 return 1;
410 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
411 if ( hvm_long_mode_enabled(v) &&
412 likely(cs_ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
413 return 8;
414 return (likely(cs_ar_bytes & X86_SEG_AR_DEF_OP_SIZE) ? 4 : 2);
415 }
417 static void vmx_save_dr(struct vcpu *v)
418 {
419 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
420 return;
422 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
423 v->arch.hvm_vcpu.flag_dr_dirty = 0;
424 v->arch.hvm_vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
425 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
427 v->arch.guest_context.debugreg[0] = read_debugreg(0);
428 v->arch.guest_context.debugreg[1] = read_debugreg(1);
429 v->arch.guest_context.debugreg[2] = read_debugreg(2);
430 v->arch.guest_context.debugreg[3] = read_debugreg(3);
431 v->arch.guest_context.debugreg[6] = read_debugreg(6);
432 /* DR7 must be saved as it is used by vmx_restore_dr(). */
433 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
434 }
436 static void __restore_debug_registers(struct vcpu *v)
437 {
438 if ( v->arch.hvm_vcpu.flag_dr_dirty )
439 return;
441 v->arch.hvm_vcpu.flag_dr_dirty = 1;
443 write_debugreg(0, v->arch.guest_context.debugreg[0]);
444 write_debugreg(1, v->arch.guest_context.debugreg[1]);
445 write_debugreg(2, v->arch.guest_context.debugreg[2]);
446 write_debugreg(3, v->arch.guest_context.debugreg[3]);
447 write_debugreg(6, v->arch.guest_context.debugreg[6]);
448 /* DR7 is loaded from the VMCS. */
449 }
451 /*
452 * DR7 is saved and restored on every vmexit. Other debug registers only
453 * need to be restored if their value is going to affect execution -- i.e.,
454 * if one of the breakpoints is enabled. So mask out all bits that don't
455 * enable some breakpoint functionality.
456 */
457 static void vmx_restore_dr(struct vcpu *v)
458 {
459 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
460 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
461 __restore_debug_registers(v);
462 }
464 static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
465 {
466 uint32_t ev;
468 vmx_vmcs_enter(v);
470 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
471 c->cr2 = v->arch.hvm_vcpu.guest_cr[2];
472 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
473 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
475 c->msr_efer = v->arch.hvm_vcpu.guest_efer;
477 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
478 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
479 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
481 c->pending_event = 0;
482 c->error_code = 0;
483 if ( ((ev = __vmread(VM_ENTRY_INTR_INFO)) & INTR_INFO_VALID_MASK) &&
484 hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
485 {
486 c->pending_event = ev;
487 c->error_code = __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE);
488 }
490 vmx_vmcs_exit(v);
491 }
493 static int vmx_restore_cr0_cr3(
494 struct vcpu *v, unsigned long cr0, unsigned long cr3)
495 {
496 unsigned long mfn = 0;
497 p2m_type_t p2mt;
499 if ( paging_mode_shadow(v->domain) )
500 {
501 if ( cr0 & X86_CR0_PG )
502 {
503 mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
504 if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
505 {
506 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3);
507 return -EINVAL;
508 }
509 }
511 if ( hvm_paging_enabled(v) )
512 put_page(pagetable_get_page(v->arch.guest_table));
514 v->arch.guest_table = pagetable_from_pfn(mfn);
515 }
517 v->arch.hvm_vcpu.guest_cr[0] = cr0 | X86_CR0_ET;
518 v->arch.hvm_vcpu.guest_cr[3] = cr3;
520 return 0;
521 }
523 static int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
524 {
525 int rc;
527 if ( c->pending_valid &&
528 ((c->pending_type == 1) || (c->pending_type > 6) ||
529 (c->pending_reserved != 0)) )
530 {
531 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32".\n",
532 c->pending_event);
533 return -EINVAL;
534 }
536 rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3);
537 if ( rc )
538 return rc;
540 vmx_vmcs_enter(v);
542 v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
543 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
544 vmx_update_guest_cr(v, 0);
545 vmx_update_guest_cr(v, 2);
546 vmx_update_guest_cr(v, 4);
548 v->arch.hvm_vcpu.guest_efer = c->msr_efer;
549 vmx_update_guest_efer(v);
551 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
552 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
553 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
555 __vmwrite(GUEST_DR7, c->dr7);
557 vmx_vmcs_exit(v);
559 paging_update_paging_modes(v);
561 if ( c->pending_valid )
562 {
563 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
564 c->pending_event, c->error_code);
566 if ( hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
567 {
568 vmx_vmcs_enter(v);
569 __vmwrite(VM_ENTRY_INTR_INFO, c->pending_event);
570 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
571 vmx_vmcs_exit(v);
572 }
573 }
575 return 0;
576 }
578 static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
579 {
580 #ifdef __x86_64__
581 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
582 unsigned long guest_flags = guest_state->flags;
584 data->shadow_gs = v->arch.hvm_vmx.shadow_gs;
585 data->msr_cstar = v->arch.hvm_vmx.cstar;
587 /* save msrs */
588 data->msr_flags = guest_flags;
589 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
590 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
591 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
592 #endif
594 data->tsc = hvm_get_guest_tsc(v);
595 }
597 static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
598 {
599 #ifdef __x86_64__
600 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
602 /* restore msrs */
603 guest_state->flags = data->msr_flags & 7;
604 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
605 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
606 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
608 v->arch.hvm_vmx.cstar = data->msr_cstar;
609 v->arch.hvm_vmx.shadow_gs = data->shadow_gs;
610 #endif
612 hvm_set_guest_tsc(v, data->tsc);
613 }
616 static void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
617 {
618 vmx_save_cpu_state(v, ctxt);
619 vmx_vmcs_save(v, ctxt);
620 }
622 static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
623 {
624 vmx_load_cpu_state(v, ctxt);
626 if ( vmx_vmcs_restore(v, ctxt) )
627 {
628 gdprintk(XENLOG_ERR, "vmx_vmcs restore failed!\n");
629 domain_crash(v->domain);
630 return -EINVAL;
631 }
633 return 0;
634 }
636 static void vmx_fpu_enter(struct vcpu *v)
637 {
638 setup_fpu(v);
639 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
640 v->arch.hvm_vmx.host_cr0 &= ~X86_CR0_TS;
641 __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
642 }
644 static void vmx_fpu_leave(struct vcpu *v)
645 {
646 ASSERT(!v->fpu_dirtied);
647 ASSERT(read_cr0() & X86_CR0_TS);
649 if ( !(v->arch.hvm_vmx.host_cr0 & X86_CR0_TS) )
650 {
651 v->arch.hvm_vmx.host_cr0 |= X86_CR0_TS;
652 __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
653 }
655 /*
656 * If the guest does not have TS enabled then we must cause and handle an
657 * exception on first use of the FPU. If the guest *does* have TS enabled
658 * then this is not necessary: no FPU activity can occur until the guest
659 * clears CR0.TS, and we will initialise the FPU when that happens.
660 */
661 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
662 {
663 v->arch.hvm_vcpu.hw_cr[0] |= X86_CR0_TS;
664 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
665 __vm_set_bit(EXCEPTION_BITMAP, TRAP_no_device);
666 }
667 }
669 static void vmx_ctxt_switch_from(struct vcpu *v)
670 {
671 vmx_fpu_leave(v);
672 vmx_save_guest_msrs(v);
673 vmx_restore_host_msrs();
674 vmx_save_dr(v);
675 vpmu_save(v);
676 }
678 static void vmx_ctxt_switch_to(struct vcpu *v)
679 {
680 /* HOST_CR4 in VMCS is always mmu_cr4_features. Sync CR4 now. */
681 if ( unlikely(read_cr4() != mmu_cr4_features) )
682 write_cr4(mmu_cr4_features);
684 vmx_restore_guest_msrs(v);
685 vmx_restore_dr(v);
686 vpmu_load(v);
687 }
690 /* SDM volume 3b section 22.3.1.2: we can only enter virtual 8086 mode
691 * if all of CS, SS, DS, ES, FS and GS are 16bit ring-3 data segments.
692 * The guest thinks it's got ring-0 segments, so we need to fudge
693 * things. We store the ring-3 version in the VMCS to avoid lots of
694 * shuffling on vmenter and vmexit, and translate in these accessors. */
696 #define rm_cs_attr (((union segment_attributes) { \
697 .fields = { .type = 0xb, .s = 1, .dpl = 0, .p = 1, .avl = 0, \
698 .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
699 #define rm_ds_attr (((union segment_attributes) { \
700 .fields = { .type = 0x3, .s = 1, .dpl = 0, .p = 1, .avl = 0, \
701 .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
702 #define vm86_ds_attr (((union segment_attributes) { \
703 .fields = { .type = 0x3, .s = 1, .dpl = 3, .p = 1, .avl = 0, \
704 .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
705 #define vm86_tr_attr (((union segment_attributes) { \
706 .fields = { .type = 0xb, .s = 0, .dpl = 0, .p = 1, .avl = 0, \
707 .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
709 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
710 struct segment_register *reg)
711 {
712 uint32_t attr = 0;
714 vmx_vmcs_enter(v);
716 switch ( seg )
717 {
718 case x86_seg_cs:
719 reg->sel = __vmread(GUEST_CS_SELECTOR);
720 reg->limit = __vmread(GUEST_CS_LIMIT);
721 reg->base = __vmread(GUEST_CS_BASE);
722 attr = __vmread(GUEST_CS_AR_BYTES);
723 break;
724 case x86_seg_ds:
725 reg->sel = __vmread(GUEST_DS_SELECTOR);
726 reg->limit = __vmread(GUEST_DS_LIMIT);
727 reg->base = __vmread(GUEST_DS_BASE);
728 attr = __vmread(GUEST_DS_AR_BYTES);
729 break;
730 case x86_seg_es:
731 reg->sel = __vmread(GUEST_ES_SELECTOR);
732 reg->limit = __vmread(GUEST_ES_LIMIT);
733 reg->base = __vmread(GUEST_ES_BASE);
734 attr = __vmread(GUEST_ES_AR_BYTES);
735 break;
736 case x86_seg_fs:
737 reg->sel = __vmread(GUEST_FS_SELECTOR);
738 reg->limit = __vmread(GUEST_FS_LIMIT);
739 reg->base = __vmread(GUEST_FS_BASE);
740 attr = __vmread(GUEST_FS_AR_BYTES);
741 break;
742 case x86_seg_gs:
743 reg->sel = __vmread(GUEST_GS_SELECTOR);
744 reg->limit = __vmread(GUEST_GS_LIMIT);
745 reg->base = __vmread(GUEST_GS_BASE);
746 attr = __vmread(GUEST_GS_AR_BYTES);
747 break;
748 case x86_seg_ss:
749 reg->sel = __vmread(GUEST_SS_SELECTOR);
750 reg->limit = __vmread(GUEST_SS_LIMIT);
751 reg->base = __vmread(GUEST_SS_BASE);
752 attr = __vmread(GUEST_SS_AR_BYTES);
753 break;
754 case x86_seg_tr:
755 reg->sel = __vmread(GUEST_TR_SELECTOR);
756 reg->limit = __vmread(GUEST_TR_LIMIT);
757 reg->base = __vmread(GUEST_TR_BASE);
758 attr = __vmread(GUEST_TR_AR_BYTES);
759 break;
760 case x86_seg_gdtr:
761 reg->limit = __vmread(GUEST_GDTR_LIMIT);
762 reg->base = __vmread(GUEST_GDTR_BASE);
763 break;
764 case x86_seg_idtr:
765 reg->limit = __vmread(GUEST_IDTR_LIMIT);
766 reg->base = __vmread(GUEST_IDTR_BASE);
767 break;
768 case x86_seg_ldtr:
769 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
770 reg->limit = __vmread(GUEST_LDTR_LIMIT);
771 reg->base = __vmread(GUEST_LDTR_BASE);
772 attr = __vmread(GUEST_LDTR_AR_BYTES);
773 break;
774 default:
775 BUG();
776 }
778 vmx_vmcs_exit(v);
780 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
781 /* Unusable flag is folded into Present flag. */
782 if ( attr & (1u<<16) )
783 reg->attr.fields.p = 0;
785 /* Adjust for virtual 8086 mode */
786 if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr
787 && !(v->arch.hvm_vmx.vm86_segment_mask & (1u << seg)) )
788 {
789 struct segment_register *sreg = &v->arch.hvm_vmx.vm86_saved_seg[seg];
790 if ( seg == x86_seg_tr )
791 *reg = *sreg;
792 else if ( reg->base != sreg->base || seg == x86_seg_ss )
793 {
794 /* If the guest's reloaded the segment, remember the new version.
795 * We can't tell if the guest reloaded the segment with another
796 * one that has the same base. By default we assume it hasn't,
797 * since we don't want to lose big-real-mode segment attributes,
798 * but for SS we assume it has: the Ubuntu graphical bootloader
799 * does this and gets badly confused if we leave the old SS in
800 * place. */
801 reg->attr.bytes = (seg == x86_seg_cs ? rm_cs_attr : rm_ds_attr);
802 *sreg = *reg;
803 }
804 else
805 {
806 /* Always give realmode guests a selector that matches the base
807 * but keep the attr and limit from before */
808 *reg = *sreg;
809 reg->sel = reg->base >> 4;
810 }
811 }
812 }
814 static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
815 struct segment_register *reg)
816 {
817 uint32_t attr, sel, limit;
818 uint64_t base;
820 sel = reg->sel;
821 attr = reg->attr.bytes;
822 limit = reg->limit;
823 base = reg->base;
825 /* Adjust CS/SS/DS/ES/FS/GS/TR for virtual 8086 mode */
826 if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr )
827 {
828 /* Remember the proper contents */
829 v->arch.hvm_vmx.vm86_saved_seg[seg] = *reg;
831 if ( seg == x86_seg_tr )
832 {
833 if ( v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS] )
834 {
835 sel = 0;
836 attr = vm86_tr_attr;
837 limit = 0xff;
838 base = v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS];
839 v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
840 }
841 else
842 v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
843 }
844 else
845 {
846 /* Try to fake it out as a 16bit data segment. This could
847 * cause confusion for the guest if it reads the selector,
848 * but otherwise we have to emulate if *any* segment hasn't
849 * been reloaded. */
850 if ( base < 0x100000 && !(base & 0xf) && limit >= 0xffff
851 && reg->attr.fields.p )
852 {
853 sel = base >> 4;
854 attr = vm86_ds_attr;
855 limit = 0xffff;
856 v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
857 }
858 else
859 v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
860 }
861 }
863 attr = ((attr & 0xf00) << 4) | (attr & 0xff);
865 /* Not-present must mean unusable. */
866 if ( !reg->attr.fields.p )
867 attr |= (1u << 16);
869 /* VMX has strict consistency requirement for flag G. */
870 attr |= !!(limit >> 20) << 15;
872 vmx_vmcs_enter(v);
874 switch ( seg )
875 {
876 case x86_seg_cs:
877 __vmwrite(GUEST_CS_SELECTOR, sel);
878 __vmwrite(GUEST_CS_LIMIT, limit);
879 __vmwrite(GUEST_CS_BASE, base);
880 __vmwrite(GUEST_CS_AR_BYTES, attr);
881 break;
882 case x86_seg_ds:
883 __vmwrite(GUEST_DS_SELECTOR, sel);
884 __vmwrite(GUEST_DS_LIMIT, limit);
885 __vmwrite(GUEST_DS_BASE, base);
886 __vmwrite(GUEST_DS_AR_BYTES, attr);
887 break;
888 case x86_seg_es:
889 __vmwrite(GUEST_ES_SELECTOR, sel);
890 __vmwrite(GUEST_ES_LIMIT, limit);
891 __vmwrite(GUEST_ES_BASE, base);
892 __vmwrite(GUEST_ES_AR_BYTES, attr);
893 break;
894 case x86_seg_fs:
895 __vmwrite(GUEST_FS_SELECTOR, sel);
896 __vmwrite(GUEST_FS_LIMIT, limit);
897 __vmwrite(GUEST_FS_BASE, base);
898 __vmwrite(GUEST_FS_AR_BYTES, attr);
899 break;
900 case x86_seg_gs:
901 __vmwrite(GUEST_GS_SELECTOR, sel);
902 __vmwrite(GUEST_GS_LIMIT, limit);
903 __vmwrite(GUEST_GS_BASE, base);
904 __vmwrite(GUEST_GS_AR_BYTES, attr);
905 break;
906 case x86_seg_ss:
907 __vmwrite(GUEST_SS_SELECTOR, sel);
908 __vmwrite(GUEST_SS_LIMIT, limit);
909 __vmwrite(GUEST_SS_BASE, base);
910 __vmwrite(GUEST_SS_AR_BYTES, attr);
911 break;
912 case x86_seg_tr:
913 __vmwrite(GUEST_TR_SELECTOR, sel);
914 __vmwrite(GUEST_TR_LIMIT, limit);
915 __vmwrite(GUEST_TR_BASE, base);
916 /* VMX checks that the busy flag (bit 1) is set. */
917 __vmwrite(GUEST_TR_AR_BYTES, attr | 2);
918 break;
919 case x86_seg_gdtr:
920 __vmwrite(GUEST_GDTR_LIMIT, limit);
921 __vmwrite(GUEST_GDTR_BASE, base);
922 break;
923 case x86_seg_idtr:
924 __vmwrite(GUEST_IDTR_LIMIT, limit);
925 __vmwrite(GUEST_IDTR_BASE, base);
926 break;
927 case x86_seg_ldtr:
928 __vmwrite(GUEST_LDTR_SELECTOR, sel);
929 __vmwrite(GUEST_LDTR_LIMIT, limit);
930 __vmwrite(GUEST_LDTR_BASE, base);
931 __vmwrite(GUEST_LDTR_AR_BYTES, attr);
932 break;
933 default:
934 BUG();
935 }
937 vmx_vmcs_exit(v);
938 }
940 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
941 {
942 vmx_vmcs_enter(v);
943 __vmwrite(TSC_OFFSET, offset);
944 #if defined (__i386__)
945 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
946 #endif
947 vmx_vmcs_exit(v);
948 }
950 void do_nmi(struct cpu_user_regs *);
952 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
953 {
954 char *p;
955 int i;
957 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
958 {
959 p = (char *)(hypercall_page + (i * 32));
960 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
961 *(u32 *)(p + 1) = i;
962 *(u8 *)(p + 5) = 0x0f; /* vmcall */
963 *(u8 *)(p + 6) = 0x01;
964 *(u8 *)(p + 7) = 0xc1;
965 *(u8 *)(p + 8) = 0xc3; /* ret */
966 }
968 /* Don't support HYPERVISOR_iret at the moment */
969 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
970 }
972 static unsigned int vmx_get_interrupt_shadow(struct vcpu *v)
973 {
974 return __vmread(GUEST_INTERRUPTIBILITY_INFO);
975 }
977 static void vmx_set_interrupt_shadow(struct vcpu *v, unsigned int intr_shadow)
978 {
979 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow);
980 }
982 static void vmx_load_pdptrs(struct vcpu *v)
983 {
984 unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3], mfn;
985 uint64_t *guest_pdptrs;
986 p2m_type_t p2mt;
987 char *p;
989 /* EPT needs to load PDPTRS into VMCS for PAE. */
990 if ( !hvm_pae_enabled(v) || (v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
991 return;
993 if ( cr3 & 0x1fUL )
994 goto crash;
996 mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
997 if ( !p2m_is_ram(p2mt) )
998 goto crash;
1000 p = map_domain_page(mfn);
1002 guest_pdptrs = (uint64_t *)(p + (cr3 & ~PAGE_MASK));
1004 /*
1005 * We do not check the PDPTRs for validity. The CPU will do this during
1006 * vm entry, and we can handle the failure there and crash the guest.
1007 * The only thing we could do better here is #GP instead.
1008 */
1010 vmx_vmcs_enter(v);
1012 __vmwrite(GUEST_PDPTR0, guest_pdptrs[0]);
1013 __vmwrite(GUEST_PDPTR1, guest_pdptrs[1]);
1014 __vmwrite(GUEST_PDPTR2, guest_pdptrs[2]);
1015 __vmwrite(GUEST_PDPTR3, guest_pdptrs[3]);
1016 #ifdef __i386__
1017 __vmwrite(GUEST_PDPTR0_HIGH, guest_pdptrs[0] >> 32);
1018 __vmwrite(GUEST_PDPTR1_HIGH, guest_pdptrs[1] >> 32);
1019 __vmwrite(GUEST_PDPTR2_HIGH, guest_pdptrs[2] >> 32);
1020 __vmwrite(GUEST_PDPTR3_HIGH, guest_pdptrs[3] >> 32);
1021 #endif
1023 vmx_vmcs_exit(v);
1025 unmap_domain_page(p);
1026 return;
1028 crash:
1029 domain_crash(v->domain);
1032 static void vmx_update_host_cr3(struct vcpu *v)
1034 vmx_vmcs_enter(v);
1035 __vmwrite(HOST_CR3, v->arch.cr3);
1036 vmx_vmcs_exit(v);
1039 void vmx_update_debug_state(struct vcpu *v)
1041 unsigned long intercepts, mask;
1043 ASSERT(v == current);
1045 mask = 1u << TRAP_int3;
1046 if ( !cpu_has_monitor_trap_flag )
1047 mask |= 1u << TRAP_debug;
1049 intercepts = __vmread(EXCEPTION_BITMAP);
1050 if ( v->arch.hvm_vcpu.debug_state_latch )
1051 intercepts |= mask;
1052 else
1053 intercepts &= ~mask;
1054 __vmwrite(EXCEPTION_BITMAP, intercepts);
1057 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
1059 vmx_vmcs_enter(v);
1061 switch ( cr )
1063 case 0: {
1064 int realmode;
1065 unsigned long hw_cr0_mask = X86_CR0_NE;
1067 if ( !vmx_unrestricted_guest(v) )
1068 hw_cr0_mask |= X86_CR0_PG | X86_CR0_PE;
1070 if ( paging_mode_shadow(v->domain) )
1071 hw_cr0_mask |= X86_CR0_WP;
1073 if ( paging_mode_hap(v->domain) )
1075 /* We manage GUEST_CR3 when guest CR0.PE is zero. */
1076 uint32_t cr3_ctls = (CPU_BASED_CR3_LOAD_EXITING |
1077 CPU_BASED_CR3_STORE_EXITING);
1078 v->arch.hvm_vmx.exec_control &= ~cr3_ctls;
1079 if ( !hvm_paging_enabled(v) )
1080 v->arch.hvm_vmx.exec_control |= cr3_ctls;
1081 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
1083 /* Changing CR0.PE can change some bits in real CR4. */
1084 vmx_update_guest_cr(v, 4);
1087 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1089 if ( v != current )
1090 hw_cr0_mask |= X86_CR0_TS;
1091 else if ( v->arch.hvm_vcpu.hw_cr[0] & X86_CR0_TS )
1092 vmx_fpu_enter(v);
1095 realmode = !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE);
1097 if ( (!vmx_unrestricted_guest(v)) &&
1098 (realmode != v->arch.hvm_vmx.vmx_realmode) )
1100 enum x86_segment s;
1101 struct segment_register reg[x86_seg_tr + 1];
1103 /* Entering or leaving real mode: adjust the segment registers.
1104 * Need to read them all either way, as realmode reads can update
1105 * the saved values we'll use when returning to prot mode. */
1106 for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
1107 vmx_get_segment_register(v, s, &reg[s]);
1108 v->arch.hvm_vmx.vmx_realmode = realmode;
1110 if ( realmode )
1112 for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
1113 vmx_set_segment_register(v, s, &reg[s]);
1114 v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
1115 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1116 __vmwrite(EXCEPTION_BITMAP, 0xffffffff);
1118 else
1120 for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
1121 if ( !(v->arch.hvm_vmx.vm86_segment_mask & (1<<s)) )
1122 vmx_set_segment_register(
1123 v, s, &v->arch.hvm_vmx.vm86_saved_seg[s]);
1124 v->arch.hvm_vcpu.hw_cr[4] =
1125 ((v->arch.hvm_vcpu.hw_cr[4] & ~X86_CR4_VME)
1126 |(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VME));
1127 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1128 __vmwrite(EXCEPTION_BITMAP,
1129 HVM_TRAP_MASK
1130 | (paging_mode_hap(v->domain) ?
1131 0 : (1U << TRAP_page_fault))
1132 | (1U << TRAP_no_device));
1133 vmx_update_debug_state(v);
1137 v->arch.hvm_vcpu.hw_cr[0] =
1138 v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask;
1139 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1140 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
1141 break;
1143 case 2:
1144 /* CR2 is updated in exit stub. */
1145 break;
1146 case 3:
1147 if ( paging_mode_hap(v->domain) )
1149 if ( !hvm_paging_enabled(v) )
1150 v->arch.hvm_vcpu.hw_cr[3] =
1151 v->domain->arch.hvm_domain.params[HVM_PARAM_IDENT_PT];
1152 vmx_load_pdptrs(v);
1155 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]);
1156 vpid_sync_vcpu_all(v);
1157 break;
1158 case 4:
1159 v->arch.hvm_vcpu.hw_cr[4] = HVM_CR4_HOST_MASK;
1160 if ( paging_mode_hap(v->domain) )
1161 v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
1162 v->arch.hvm_vcpu.hw_cr[4] |= v->arch.hvm_vcpu.guest_cr[4];
1163 if ( v->arch.hvm_vmx.vmx_realmode )
1164 v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
1165 if ( paging_mode_hap(v->domain) && !hvm_paging_enabled(v) )
1167 v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE;
1168 v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
1170 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1171 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]);
1172 break;
1173 default:
1174 BUG();
1177 vmx_vmcs_exit(v);
1180 static void vmx_update_guest_efer(struct vcpu *v)
1182 #ifdef __x86_64__
1183 unsigned long vm_entry_value;
1185 vmx_vmcs_enter(v);
1187 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1188 if ( v->arch.hvm_vcpu.guest_efer & EFER_LMA )
1189 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1190 else
1191 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1192 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1194 vmx_vmcs_exit(v);
1195 #endif
1197 if ( v == current )
1198 write_efer((read_efer() & ~EFER_SCE) |
1199 (v->arch.hvm_vcpu.guest_efer & EFER_SCE));
1202 static void vmx_flush_guest_tlbs(void)
1204 /*
1205 * If VPID (i.e. tagged TLB support) is not enabled, the fact that
1206 * we're in Xen at all means any guest will have a clean TLB when
1207 * it's next run, because VMRESUME will flush it for us.
1209 * If enabled, we invalidate all translations associated with all
1210 * VPID values.
1211 */
1212 vpid_sync_all();
1215 static void __ept_sync_domain(void *info)
1217 struct domain *d = info;
1218 __invept(1, d->arch.hvm_domain.vmx.ept_control.eptp, 0);
1221 void ept_sync_domain(struct domain *d)
1223 /* Only if using EPT and this domain has some VCPUs to dirty. */
1224 if ( d->arch.hvm_domain.hap_enabled && d->vcpu && d->vcpu[0] )
1226 ASSERT(local_irq_is_enabled());
1227 on_each_cpu(__ept_sync_domain, d, 1);
1231 static void __vmx_inject_exception(int trap, int type, int error_code)
1233 unsigned long intr_fields;
1234 struct vcpu *curr = current;
1236 /*
1237 * NB. Callers do not need to worry about clearing STI/MOV-SS blocking:
1238 * "If the VM entry is injecting, there is no blocking by STI or by
1239 * MOV SS following the VM entry, regardless of the contents of the
1240 * interruptibility-state field [in the guest-state area before the
1241 * VM entry]", PRM Vol. 3, 22.6.1 (Interruptibility State).
1242 */
1244 intr_fields = (INTR_INFO_VALID_MASK | (type<<8) | trap);
1245 if ( error_code != HVM_DELIVER_NO_ERROR_CODE ) {
1246 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1247 intr_fields |= INTR_INFO_DELIVER_CODE_MASK;
1250 __vmwrite(VM_ENTRY_INTR_INFO, intr_fields);
1252 /* Can't inject exceptions in virtual 8086 mode because they would
1253 * use the protected-mode IDT. Emulate at the next vmenter instead. */
1254 if ( curr->arch.hvm_vmx.vmx_realmode )
1255 curr->arch.hvm_vmx.vmx_emulate = 1;
1258 void vmx_inject_hw_exception(int trap, int error_code)
1260 unsigned long intr_info = __vmread(VM_ENTRY_INTR_INFO);
1261 struct vcpu *curr = current;
1263 switch ( trap )
1265 case TRAP_debug:
1266 if ( guest_cpu_user_regs()->eflags & X86_EFLAGS_TF )
1268 __restore_debug_registers(curr);
1269 write_debugreg(6, read_debugreg(6) | 0x4000);
1271 if ( cpu_has_monitor_trap_flag )
1272 break;
1273 case TRAP_int3:
1274 if ( curr->domain->debugger_attached )
1276 /* Debug/Int3: Trap to debugger. */
1277 domain_pause_for_debugger();
1278 return;
1282 if ( unlikely(intr_info & INTR_INFO_VALID_MASK) &&
1283 (((intr_info >> 8) & 7) == X86_EVENTTYPE_HW_EXCEPTION) )
1285 trap = hvm_combine_hw_exceptions((uint8_t)intr_info, trap);
1286 if ( trap == TRAP_double_fault )
1287 error_code = 0;
1290 __vmx_inject_exception(trap, X86_EVENTTYPE_HW_EXCEPTION, error_code);
1292 if ( trap == TRAP_page_fault )
1293 HVMTRACE_LONG_2D(PF_INJECT, error_code,
1294 TRC_PAR_LONG(current->arch.hvm_vcpu.guest_cr[2]));
1295 else
1296 HVMTRACE_2D(INJ_EXC, trap, error_code);
1299 void vmx_inject_extint(int trap)
1301 __vmx_inject_exception(trap, X86_EVENTTYPE_EXT_INTR,
1302 HVM_DELIVER_NO_ERROR_CODE);
1305 void vmx_inject_nmi(void)
1307 __vmx_inject_exception(2, X86_EVENTTYPE_NMI,
1308 HVM_DELIVER_NO_ERROR_CODE);
1311 static void vmx_inject_exception(
1312 unsigned int trapnr, int errcode, unsigned long cr2)
1314 if ( trapnr == TRAP_page_fault )
1315 current->arch.hvm_vcpu.guest_cr[2] = cr2;
1317 vmx_inject_hw_exception(trapnr, errcode);
1320 static int vmx_event_pending(struct vcpu *v)
1322 ASSERT(v == current);
1323 return (__vmread(VM_ENTRY_INTR_INFO) & INTR_INFO_VALID_MASK);
1326 static int vmx_do_pmu_interrupt(struct cpu_user_regs *regs)
1328 return vpmu_do_interrupt(regs);
1331 static void vmx_set_uc_mode(struct vcpu *v)
1333 if ( paging_mode_hap(v->domain) )
1334 ept_change_entry_emt_with_range(
1335 v->domain, 0, v->domain->arch.p2m->max_mapped_pfn);
1336 vpid_sync_all();
1339 static void vmx_set_info_guest(struct vcpu *v)
1341 unsigned long intr_shadow;
1343 vmx_vmcs_enter(v);
1345 __vmwrite(GUEST_DR7, v->arch.guest_context.debugreg[7]);
1347 /*
1348 * If the interruptibility-state field indicates blocking by STI,
1349 * setting the TF flag in the EFLAGS may cause VM entry to fail
1350 * and crash the guest. See SDM 3B 22.3.1.5.
1351 * Clearing the VMX_INTR_SHADOW_STI flag looks hackish, but setting
1352 * GUEST_PENDING_DBG_EXCEPTIONS.BS here would incur an immediate
1353 * vmexit and hence make no progress.
1354 */
1355 intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1356 if ( v->domain->debugger_attached &&
1357 (v->arch.guest_context.user_regs.eflags & X86_EFLAGS_TF) &&
1358 (intr_shadow & VMX_INTR_SHADOW_STI) )
1360 intr_shadow &= ~VMX_INTR_SHADOW_STI;
1361 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow);
1364 vmx_vmcs_exit(v);
1367 static struct hvm_function_table vmx_function_table = {
1368 .name = "VMX",
1369 .domain_initialise = vmx_domain_initialise,
1370 .domain_destroy = vmx_domain_destroy,
1371 .vcpu_initialise = vmx_vcpu_initialise,
1372 .vcpu_destroy = vmx_vcpu_destroy,
1373 .save_cpu_ctxt = vmx_save_vmcs_ctxt,
1374 .load_cpu_ctxt = vmx_load_vmcs_ctxt,
1375 .get_interrupt_shadow = vmx_get_interrupt_shadow,
1376 .set_interrupt_shadow = vmx_set_interrupt_shadow,
1377 .guest_x86_mode = vmx_guest_x86_mode,
1378 .get_segment_register = vmx_get_segment_register,
1379 .set_segment_register = vmx_set_segment_register,
1380 .update_host_cr3 = vmx_update_host_cr3,
1381 .update_guest_cr = vmx_update_guest_cr,
1382 .update_guest_efer = vmx_update_guest_efer,
1383 .flush_guest_tlbs = vmx_flush_guest_tlbs,
1384 .set_tsc_offset = vmx_set_tsc_offset,
1385 .inject_exception = vmx_inject_exception,
1386 .init_hypercall_page = vmx_init_hypercall_page,
1387 .event_pending = vmx_event_pending,
1388 .do_pmu_interrupt = vmx_do_pmu_interrupt,
1389 .cpu_up = vmx_cpu_up,
1390 .cpu_down = vmx_cpu_down,
1391 .cpuid_intercept = vmx_cpuid_intercept,
1392 .wbinvd_intercept = vmx_wbinvd_intercept,
1393 .fpu_dirty_intercept = vmx_fpu_dirty_intercept,
1394 .msr_read_intercept = vmx_msr_read_intercept,
1395 .msr_write_intercept = vmx_msr_write_intercept,
1396 .invlpg_intercept = vmx_invlpg_intercept,
1397 .set_uc_mode = vmx_set_uc_mode,
1398 .set_info_guest = vmx_set_info_guest
1399 };
1401 static unsigned long *vpid_bitmap;
1402 #define VPID_BITMAP_SIZE ((1u << VMCS_VPID_WIDTH) / XEN_LEGACY_MAX_VCPUS)
1404 void start_vmx(void)
1406 static bool_t bootstrapped;
1408 vmx_save_host_msrs();
1410 if ( !test_and_set_bool(bootstrapped) )
1412 if ( hvm_enabled && !vmx_cpu_up() )
1414 printk("VMX: FATAL: failed to initialise CPU%d!\n",
1415 smp_processor_id());
1416 BUG();
1418 return;
1421 /* Xen does not fill x86_capability words except 0. */
1422 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1424 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1425 return;
1427 set_in_cr4(X86_CR4_VMXE);
1429 if ( !vmx_cpu_up() )
1431 printk("VMX: failed to initialise.\n");
1432 return;
1435 if ( cpu_has_vmx_ept )
1436 vmx_function_table.hap_supported = 1;
1438 if ( cpu_has_vmx_vpid )
1440 vpid_bitmap = xmalloc_array(
1441 unsigned long, BITS_TO_LONGS(VPID_BITMAP_SIZE));
1442 BUG_ON(vpid_bitmap == NULL);
1443 memset(vpid_bitmap, 0, BITS_TO_LONGS(VPID_BITMAP_SIZE) * sizeof(long));
1445 /* VPID 0 is used by VMX root mode (the hypervisor). */
1446 __set_bit(0, vpid_bitmap);
1449 setup_vmcs_dump();
1451 hvm_enable(&vmx_function_table);
1454 /*
1455 * Not all cases receive a valid value in the VM-exit instruction length field.
1456 * Callers must know what they're doing!
1457 */
1458 static int __get_instruction_length(void)
1460 int len;
1461 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1462 BUG_ON((len < 1) || (len > 15));
1463 return len;
1466 static void __update_guest_eip(unsigned long inst_len)
1468 struct cpu_user_regs *regs = guest_cpu_user_regs();
1469 unsigned long x;
1471 regs->eip += inst_len;
1472 regs->eflags &= ~X86_EFLAGS_RF;
1474 x = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1475 if ( x & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) )
1477 x &= ~(VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS);
1478 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, x);
1481 if ( regs->eflags & X86_EFLAGS_TF )
1482 vmx_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);
1485 static void vmx_fpu_dirty_intercept(void)
1487 struct vcpu *curr = current;
1489 vmx_fpu_enter(curr);
1491 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1492 if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1494 curr->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS;
1495 __vmwrite(GUEST_CR0, curr->arch.hvm_vcpu.hw_cr[0]);
1499 #define bitmaskof(idx) (1U << ((idx) & 31))
1500 static void vmx_cpuid_intercept(
1501 unsigned int *eax, unsigned int *ebx,
1502 unsigned int *ecx, unsigned int *edx)
1504 unsigned int input = *eax;
1505 struct segment_register cs;
1506 struct vcpu *v = current;
1508 hvm_cpuid(input, eax, ebx, ecx, edx);
1510 switch ( input )
1512 case 0x80000001:
1513 /* SYSCALL is visible iff running in long mode. */
1514 hvm_get_segment_register(v, x86_seg_cs, &cs);
1515 if ( cs.attr.fields.l )
1516 *edx |= bitmaskof(X86_FEATURE_SYSCALL);
1517 else
1518 *edx &= ~(bitmaskof(X86_FEATURE_SYSCALL));
1519 break;
1522 HVMTRACE_5D (CPUID, input, *eax, *ebx, *ecx, *edx);
1525 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1527 unsigned int eax, ebx, ecx, edx;
1529 eax = regs->eax;
1530 ebx = regs->ebx;
1531 ecx = regs->ecx;
1532 edx = regs->edx;
1534 vmx_cpuid_intercept(&eax, &ebx, &ecx, &edx);
1536 regs->eax = eax;
1537 regs->ebx = ebx;
1538 regs->ecx = ecx;
1539 regs->edx = edx;
1542 static void vmx_dr_access(unsigned long exit_qualification,
1543 struct cpu_user_regs *regs)
1545 struct vcpu *v = current;
1547 HVMTRACE_0D(DR_WRITE);
1549 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
1550 __restore_debug_registers(v);
1552 /* Allow guest direct access to DR registers */
1553 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1554 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
1557 static void vmx_invlpg_intercept(unsigned long vaddr)
1559 struct vcpu *curr = current;
1560 HVMTRACE_LONG_2D(INVLPG, /*invlpga=*/ 0, TRC_PAR_LONG(vaddr));
1561 if ( paging_invlpg(curr, vaddr) )
1562 vpid_sync_vcpu_gva(curr, vaddr);
1565 #define CASE_SET_REG(REG, reg) \
1566 case VMX_CONTROL_REG_ACCESS_GPR_ ## REG: regs->reg = value; break
1567 #define CASE_GET_REG(REG, reg) \
1568 case VMX_CONTROL_REG_ACCESS_GPR_ ## REG: value = regs->reg; break
1570 #define CASE_EXTEND_SET_REG \
1571 CASE_EXTEND_REG(S)
1572 #define CASE_EXTEND_GET_REG \
1573 CASE_EXTEND_REG(G)
1575 #ifdef __i386__
1576 #define CASE_EXTEND_REG(T)
1577 #else
1578 #define CASE_EXTEND_REG(T) \
1579 CASE_ ## T ## ET_REG(R8, r8); \
1580 CASE_ ## T ## ET_REG(R9, r9); \
1581 CASE_ ## T ## ET_REG(R10, r10); \
1582 CASE_ ## T ## ET_REG(R11, r11); \
1583 CASE_ ## T ## ET_REG(R12, r12); \
1584 CASE_ ## T ## ET_REG(R13, r13); \
1585 CASE_ ## T ## ET_REG(R14, r14); \
1586 CASE_ ## T ## ET_REG(R15, r15)
1587 #endif
1589 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
1591 unsigned long value;
1592 struct vcpu *v = current;
1593 struct vlapic *vlapic = vcpu_vlapic(v);
1595 switch ( gp )
1597 CASE_GET_REG(EAX, eax);
1598 CASE_GET_REG(ECX, ecx);
1599 CASE_GET_REG(EDX, edx);
1600 CASE_GET_REG(EBX, ebx);
1601 CASE_GET_REG(EBP, ebp);
1602 CASE_GET_REG(ESI, esi);
1603 CASE_GET_REG(EDI, edi);
1604 CASE_GET_REG(ESP, esp);
1605 CASE_EXTEND_GET_REG;
1606 default:
1607 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
1608 goto exit_and_crash;
1611 HVMTRACE_LONG_2D(CR_WRITE, cr, TRC_PAR_LONG(value));
1613 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
1615 switch ( cr )
1617 case 0:
1618 return !hvm_set_cr0(value);
1620 case 3:
1621 return !hvm_set_cr3(value);
1623 case 4:
1624 return !hvm_set_cr4(value);
1626 case 8:
1627 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
1628 break;
1630 default:
1631 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
1632 goto exit_and_crash;
1635 return 1;
1637 exit_and_crash:
1638 domain_crash(v->domain);
1639 return 0;
1642 /*
1643 * Read from control registers. CR0 and CR4 are read from the shadow.
1644 */
1645 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1647 unsigned long value = 0;
1648 struct vcpu *v = current;
1649 struct vlapic *vlapic = vcpu_vlapic(v);
1651 switch ( cr )
1653 case 3:
1654 value = (unsigned long)v->arch.hvm_vcpu.guest_cr[3];
1655 break;
1656 case 8:
1657 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
1658 value = (value & 0xF0) >> 4;
1659 break;
1660 default:
1661 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
1662 domain_crash(v->domain);
1663 break;
1666 switch ( gp ) {
1667 CASE_SET_REG(EAX, eax);
1668 CASE_SET_REG(ECX, ecx);
1669 CASE_SET_REG(EDX, edx);
1670 CASE_SET_REG(EBX, ebx);
1671 CASE_SET_REG(EBP, ebp);
1672 CASE_SET_REG(ESI, esi);
1673 CASE_SET_REG(EDI, edi);
1674 CASE_SET_REG(ESP, esp);
1675 CASE_EXTEND_SET_REG;
1676 default:
1677 printk("invalid gp: %d\n", gp);
1678 domain_crash(v->domain);
1679 break;
1682 HVMTRACE_LONG_2D(CR_READ, cr, TRC_PAR_LONG(value));
1684 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
1687 static int vmx_cr_access(unsigned long exit_qualification,
1688 struct cpu_user_regs *regs)
1690 unsigned int gp, cr;
1691 unsigned long value;
1692 struct vcpu *v = current;
1694 switch ( exit_qualification & VMX_CONTROL_REG_ACCESS_TYPE )
1696 case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
1697 gp = exit_qualification & VMX_CONTROL_REG_ACCESS_GPR;
1698 cr = exit_qualification & VMX_CONTROL_REG_ACCESS_NUM;
1699 return mov_to_cr(gp, cr, regs);
1700 case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
1701 gp = exit_qualification & VMX_CONTROL_REG_ACCESS_GPR;
1702 cr = exit_qualification & VMX_CONTROL_REG_ACCESS_NUM;
1703 mov_from_cr(cr, gp, regs);
1704 break;
1705 case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
1706 v->arch.hvm_vcpu.guest_cr[0] &= ~X86_CR0_TS;
1707 vmx_update_guest_cr(v, 0);
1708 HVMTRACE_0D(CLTS);
1709 break;
1710 case VMX_CONTROL_REG_ACCESS_TYPE_LMSW:
1711 value = v->arch.hvm_vcpu.guest_cr[0];
1712 /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */
1713 value = (value & ~0xe) | ((exit_qualification >> 16) & 0xf);
1714 HVMTRACE_LONG_1D(LMSW, value);
1715 return !hvm_set_cr0(value);
1716 default:
1717 BUG();
1720 return 1;
1723 static const struct lbr_info {
1724 u32 base, count;
1725 } p4_lbr[] = {
1726 { MSR_P4_LER_FROM_LIP, 1 },
1727 { MSR_P4_LER_TO_LIP, 1 },
1728 { MSR_P4_LASTBRANCH_TOS, 1 },
1729 { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
1730 { MSR_P4_LASTBRANCH_0_TO_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
1731 { 0, 0 }
1732 }, c2_lbr[] = {
1733 { MSR_IA32_LASTINTFROMIP, 1 },
1734 { MSR_IA32_LASTINTTOIP, 1 },
1735 { MSR_C2_LASTBRANCH_TOS, 1 },
1736 { MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
1737 { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
1738 { 0, 0 }
1739 #ifdef __i386__
1740 }, pm_lbr[] = {
1741 { MSR_IA32_LASTINTFROMIP, 1 },
1742 { MSR_IA32_LASTINTTOIP, 1 },
1743 { MSR_PM_LASTBRANCH_TOS, 1 },
1744 { MSR_PM_LASTBRANCH_0, NUM_MSR_PM_LASTBRANCH },
1745 { 0, 0 }
1746 #endif
1747 };
1749 static const struct lbr_info *last_branch_msr_get(void)
1751 switch ( boot_cpu_data.x86 )
1753 case 6:
1754 switch ( boot_cpu_data.x86_model )
1756 #ifdef __i386__
1757 /* PentiumM */
1758 case 9: case 13:
1759 /* Core Solo/Duo */
1760 case 14:
1761 return pm_lbr;
1762 break;
1763 #endif
1764 /* Core2 Duo */
1765 case 15:
1766 return c2_lbr;
1767 break;
1769 break;
1771 case 15:
1772 switch ( boot_cpu_data.x86_model )
1774 /* Pentium4/Xeon with em64t */
1775 case 3: case 4: case 6:
1776 return p4_lbr;
1777 break;
1779 break;
1782 return NULL;
1785 static int is_last_branch_msr(u32 ecx)
1787 const struct lbr_info *lbr = last_branch_msr_get();
1789 if ( lbr == NULL )
1790 return 0;
1792 for ( ; lbr->count; lbr++ )
1793 if ( (ecx >= lbr->base) && (ecx < (lbr->base + lbr->count)) )
1794 return 1;
1796 return 0;
1799 static int vmx_msr_read_intercept(struct cpu_user_regs *regs)
1801 u64 msr_content = 0;
1802 u32 ecx = regs->ecx, eax, edx;
1804 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);
1806 switch ( ecx )
1808 case MSR_IA32_SYSENTER_CS:
1809 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
1810 break;
1811 case MSR_IA32_SYSENTER_ESP:
1812 msr_content = __vmread(GUEST_SYSENTER_ESP);
1813 break;
1814 case MSR_IA32_SYSENTER_EIP:
1815 msr_content = __vmread(GUEST_SYSENTER_EIP);
1816 break;
1817 case MSR_IA32_DEBUGCTLMSR:
1818 msr_content = __vmread(GUEST_IA32_DEBUGCTL);
1819 #ifdef __i386__
1820 msr_content |= (u64)__vmread(GUEST_IA32_DEBUGCTL_HIGH) << 32;
1821 #endif
1822 break;
1823 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
1824 goto gp_fault;
1825 case MSR_IA32_MISC_ENABLE:
1826 rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
1827 /* Debug Trace Store is not supported. */
1828 msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
1829 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
1830 break;
1831 default:
1832 if ( vpmu_do_rdmsr(regs) )
1833 goto done;
1834 if ( passive_domain_do_rdmsr(regs) )
1835 goto done;
1836 switch ( long_mode_do_msr_read(regs) )
1838 case HNDL_unhandled:
1839 break;
1840 case HNDL_exception_raised:
1841 return X86EMUL_EXCEPTION;
1842 case HNDL_done:
1843 goto done;
1846 if ( vmx_read_guest_msr(ecx, &msr_content) == 0 )
1847 break;
1849 if ( is_last_branch_msr(ecx) )
1851 msr_content = 0;
1852 break;
1855 if ( rdmsr_viridian_regs(ecx, &eax, &edx) ||
1856 rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
1857 rdmsr_safe(ecx, eax, edx) == 0 )
1859 regs->eax = eax;
1860 regs->edx = edx;
1861 goto done;
1864 goto gp_fault;
1867 regs->eax = (uint32_t)msr_content;
1868 regs->edx = (uint32_t)(msr_content >> 32);
1870 done:
1871 HVMTRACE_3D (MSR_READ, ecx, regs->eax, regs->edx);
1872 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
1873 ecx, (unsigned long)regs->eax,
1874 (unsigned long)regs->edx);
1875 return X86EMUL_OKAY;
1877 gp_fault:
1878 vmx_inject_hw_exception(TRAP_gp_fault, 0);
1879 return X86EMUL_EXCEPTION;
1882 static int vmx_alloc_vlapic_mapping(struct domain *d)
1884 void *apic_va;
1886 if ( !cpu_has_vmx_virtualize_apic_accesses )
1887 return 0;
1889 apic_va = alloc_xenheap_page();
1890 if ( apic_va == NULL )
1891 return -ENOMEM;
1892 share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable);
1893 set_mmio_p2m_entry(
1894 d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(virt_to_mfn(apic_va)));
1895 d->arch.hvm_domain.vmx.apic_access_mfn = virt_to_mfn(apic_va);
1897 return 0;
1900 static void vmx_free_vlapic_mapping(struct domain *d)
1902 unsigned long mfn = d->arch.hvm_domain.vmx.apic_access_mfn;
1903 if ( mfn != 0 )
1904 free_xenheap_page(mfn_to_virt(mfn));
1907 static int vmx_alloc_vpid(struct domain *d)
1909 int idx;
1911 if ( !cpu_has_vmx_vpid )
1912 return 0;
1914 do {
1915 idx = find_first_zero_bit(vpid_bitmap, VPID_BITMAP_SIZE);
1916 if ( idx >= VPID_BITMAP_SIZE )
1918 dprintk(XENLOG_WARNING, "VMX VPID space exhausted.\n");
1919 return -EBUSY;
1922 while ( test_and_set_bit(idx, vpid_bitmap) );
1924 d->arch.hvm_domain.vmx.vpid_base = idx * XEN_LEGACY_MAX_VCPUS;
1925 return 0;
1928 static void vmx_free_vpid(struct domain *d)
1930 if ( !cpu_has_vmx_vpid )
1931 return;
1933 clear_bit(d->arch.hvm_domain.vmx.vpid_base / XEN_LEGACY_MAX_VCPUS,
1934 vpid_bitmap);
1937 static void vmx_install_vlapic_mapping(struct vcpu *v)
1939 paddr_t virt_page_ma, apic_page_ma;
1941 if ( !cpu_has_vmx_virtualize_apic_accesses )
1942 return;
1944 virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
1945 apic_page_ma = v->domain->arch.hvm_domain.vmx.apic_access_mfn;
1946 apic_page_ma <<= PAGE_SHIFT;
1948 vmx_vmcs_enter(v);
1949 __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
1950 __vmwrite(APIC_ACCESS_ADDR, apic_page_ma);
1951 vmx_vmcs_exit(v);
1954 void vmx_vlapic_msr_changed(struct vcpu *v)
1956 struct vlapic *vlapic = vcpu_vlapic(v);
1957 uint32_t ctl;
1959 if ( !cpu_has_vmx_virtualize_apic_accesses )
1960 return;
1962 vmx_vmcs_enter(v);
1963 ctl = __vmread(SECONDARY_VM_EXEC_CONTROL);
1964 ctl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1965 if ( !vlapic_hw_disabled(vlapic) &&
1966 (vlapic_base_address(vlapic) == APIC_DEFAULT_PHYS_BASE) )
1967 ctl |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1968 __vmwrite(SECONDARY_VM_EXEC_CONTROL, ctl);
1969 vmx_vmcs_exit(v);
static int vmx_msr_write_intercept(struct cpu_user_regs *regs)
{
    u32 ecx = regs->ecx;
    u64 msr_content;
    struct vcpu *v = current;

    HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
                ecx, (u32)regs->eax, (u32)regs->edx);

    msr_content = (u32)regs->eax | ((u64)regs->edx << 32);

    HVMTRACE_3D (MSR_WRITE, ecx, regs->eax, regs->edx);

    switch ( ecx )
    {
    case MSR_IA32_SYSENTER_CS:
        __vmwrite(GUEST_SYSENTER_CS, msr_content);
        break;
    case MSR_IA32_SYSENTER_ESP:
        __vmwrite(GUEST_SYSENTER_ESP, msr_content);
        break;
    case MSR_IA32_SYSENTER_EIP:
        __vmwrite(GUEST_SYSENTER_EIP, msr_content);
        break;
    case MSR_IA32_DEBUGCTLMSR: {
        int i, rc = 0;

        if ( !msr_content || (msr_content & ~3) )
            break;

        if ( msr_content & 1 )
        {
            const struct lbr_info *lbr = last_branch_msr_get();
            if ( lbr == NULL )
                break;

            for ( ; (rc == 0) && lbr->count; lbr++ )
                for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
                    if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 )
                        vmx_disable_intercept_for_msr(v, lbr->base + i);
        }

        if ( (rc < 0) ||
             (vmx_add_host_load_msr(ecx) < 0) )
            vmx_inject_hw_exception(TRAP_machine_check, 0);
        else
        {
            __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
#ifdef __i386__
            __vmwrite(GUEST_IA32_DEBUGCTL_HIGH, msr_content >> 32);
#endif
        }

        break;
    }
    case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
        goto gp_fault;
    default:
        if ( vpmu_do_wrmsr(regs) )
            return X86EMUL_OKAY;
        if ( passive_domain_do_wrmsr(regs) )
            return X86EMUL_OKAY;

        if ( wrmsr_viridian_regs(ecx, regs->eax, regs->edx) )
            break;

        switch ( long_mode_do_msr_write(regs) )
        {
        case HNDL_unhandled:
            if ( (vmx_write_guest_msr(ecx, msr_content) != 0) &&
                 !is_last_branch_msr(ecx) )
                wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
            break;
        case HNDL_exception_raised:
            return X86EMUL_EXCEPTION;
        case HNDL_done:
            break;
        }
        break;
    }

    return X86EMUL_OKAY;

 gp_fault:
    vmx_inject_hw_exception(TRAP_gp_fault, 0);
    return X86EMUL_EXCEPTION;
}
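/*
 * A physical interrupt was the cause of this exit.  Decode the vector from
 * the exit information and dispatch to the handler Xen would have used had
 * the interrupt arrived outside guest context.
 */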
static void vmx_do_extint(struct cpu_user_regs *regs)
{
    unsigned int vector;

    asmlinkage void do_IRQ(struct cpu_user_regs *);
    fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
    fastcall void smp_event_check_interrupt(void);
    fastcall void smp_invalidate_interrupt(void);
    fastcall void smp_call_function_interrupt(void);
    fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
    fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
    fastcall void smp_pmu_apic_interrupt(struct cpu_user_regs *regs);
    fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs);
#ifdef CONFIG_X86_MCE_THERMAL
    fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
#endif

    vector = __vmread(VM_EXIT_INTR_INFO);
    BUG_ON(!(vector & INTR_INFO_VALID_MASK));

    vector &= INTR_INFO_VECTOR_MASK;
    HVMTRACE_1D(INTR, vector);

    switch ( vector )
    {
    case LOCAL_TIMER_VECTOR:
        smp_apic_timer_interrupt(regs);
        break;
    case EVENT_CHECK_VECTOR:
        smp_event_check_interrupt();
        break;
    case INVALIDATE_TLB_VECTOR:
        smp_invalidate_interrupt();
        break;
    case CALL_FUNCTION_VECTOR:
        smp_call_function_interrupt();
        break;
    case SPURIOUS_APIC_VECTOR:
        smp_spurious_interrupt(regs);
        break;
    case ERROR_APIC_VECTOR:
        smp_error_interrupt(regs);
        break;
    case CMCI_APIC_VECTOR:
        smp_cmci_interrupt(regs);
        break;
    case PMU_APIC_VECTOR:
        smp_pmu_apic_interrupt(regs);
        break;
#ifdef CONFIG_X86_MCE_THERMAL
    case THERMAL_APIC_VECTOR:
        smp_thermal_interrupt(regs);
        break;
#endif
    default:
        regs->entry_vector = vector;
        do_IRQ(regs);
        break;
    }
}
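/*
 * WBINVD/INVD handling: cache flushes only matter for domains with
 * passed-through devices.  With WBINVD exiting available, the flush is
 * broadcast to all CPUs; otherwise it is performed locally.
 */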
static void wbinvd_ipi(void *info)
{
    wbinvd();
}
static void vmx_wbinvd_intercept(void)
{
    if ( !has_arch_pdevs(current->domain) )
        return;

    if ( cpu_has_wbinvd_exiting )
        on_each_cpu(wbinvd_ipi, NULL, 1);
    else
        wbinvd();
}
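/*
 * EPT violation handler.  Legitimate causes (MMIO emulation, log-dirty
 * write faults, populate-on-demand) are serviced and the guest resumed;
 * anything else is logged in detail and crashes the domain.
 */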
static void ept_handle_violation(unsigned long qualification, paddr_t gpa)
{
    unsigned long gla_validity = qualification & EPT_GLA_VALIDITY_MASK;
    struct domain *d = current->domain;
    unsigned long gla, gfn = gpa >> PAGE_SHIFT;
    mfn_t mfn;
    p2m_type_t t;

    mfn = gfn_to_mfn_guest(d, gfn, &t);

    /* There are three legitimate reasons for taking an EPT violation.
     * One is a guest access to MMIO space. */
    if ( gla_validity == EPT_GLA_VALIDITY_MATCH && p2m_is_mmio(t) )
    {
        handle_mmio();
        return;
    }

    /* The second is log-dirty mode, writing to a read-only page;
     * The third is populating a populate-on-demand page. */
    if ( (gla_validity == EPT_GLA_VALIDITY_MATCH
          || gla_validity == EPT_GLA_VALIDITY_GPT_WALK)
         && p2m_is_ram(t) && (t != p2m_ram_ro) )
    {
        if ( paging_mode_log_dirty(d) )
        {
            paging_mark_dirty(d, mfn_x(mfn));
            p2m_change_type(d, gfn, p2m_ram_logdirty, p2m_ram_rw);
            flush_tlb_mask(&d->domain_dirty_cpumask);
        }
        return;
    }

    /* Everything else is an error. */
    gla = __vmread(GUEST_LINEAR_ADDRESS);
    gdprintk(XENLOG_ERR, "EPT violation %#lx (%c%c%c/%c%c%c), "
             "gpa %#"PRIpaddr", mfn %#lx, type %i.\n",
             qualification,
             (qualification & EPT_READ_VIOLATION) ? 'r' : '-',
             (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-',
             (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-',
             (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-',
             (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
             (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-',
             gpa, mfn_x(mfn), t);

    if ( qualification & EPT_GAW_VIOLATION )
        gdprintk(XENLOG_ERR, " --- GPA too wide (max %u bits)\n",
                 9 * (unsigned) d->arch.hvm_domain.vmx.ept_control.gaw + 21);

    switch ( gla_validity )
    {
    case EPT_GLA_VALIDITY_PDPTR_LOAD:
        gdprintk(XENLOG_ERR, " --- PDPTR load failed\n");
        break;
    case EPT_GLA_VALIDITY_GPT_WALK:
        gdprintk(XENLOG_ERR, " --- guest PT walk to %#lx failed\n", gla);
        break;
    case EPT_GLA_VALIDITY_RSVD:
        gdprintk(XENLOG_ERR, " --- GLA_validity 2 (reserved)\n");
        break;
    case EPT_GLA_VALIDITY_MATCH:
        gdprintk(XENLOG_ERR, " --- guest access to %#lx failed\n", gla);
        break;
    }

    domain_crash(d);
}
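/*
 * VM entry itself failed: report the failure reason, dump the VMCS for
 * post-mortem analysis and crash the offending domain.
 */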
static void vmx_failed_vmentry(unsigned int exit_reason,
                               struct cpu_user_regs *regs)
{
    unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
    unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
    struct vcpu *curr = current;

    printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
    switch ( failed_vmentry_reason )
    {
    case EXIT_REASON_INVALID_GUEST_STATE:
        printk("caused by invalid guest state (%ld).\n", exit_qualification);
        break;
    case EXIT_REASON_MSR_LOADING:
        printk("caused by MSR entry %ld loading.\n", exit_qualification);
        break;
    case EXIT_REASON_MCE_DURING_VMENTRY:
        printk("caused by machine check.\n");
        HVMTRACE_0D(MCE);
        do_machine_check(regs);
        break;
    default:
        printk("reason not known yet!\n");
        break;
    }

    printk("************* VMCS Area **************\n");
    vmcs_dump_vcpu(curr);
    printk("**************************************\n");

    domain_crash(curr->domain);
}
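/*
 * Switch the guest into virtual-8086 mode so real-mode code can run
 * natively; the saved IOPL/VM flags are put back by the vmexit handler.
 */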
asmlinkage void vmx_enter_realmode(struct cpu_user_regs *regs)
{
    struct vcpu *v = current;

    /* Adjust RFLAGS to enter virtual 8086 mode with IOPL == 3. Since
     * we have CR4.VME == 1 and our own TSS with an empty interrupt
     * redirection bitmap, all software INTs will be handled by vm86 */
    v->arch.hvm_vmx.vm86_saved_eflags = regs->eflags;
    regs->eflags |= (X86_EFLAGS_VM | X86_EFLAGS_IOPL);
}
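/*
 * #UD interception: hand the faulting instruction to the HVM emulator.
 * Unhandleable instructions are logged with their opcode bytes, and any
 * exception raised by the emulator is reflected back into the guest.
 */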
static void vmx_vmexit_ud_intercept(struct cpu_user_regs *regs)
{
    struct hvm_emulate_ctxt ctxt;
    int rc;

    hvm_emulate_prepare(&ctxt, regs);

    rc = hvm_emulate_one(&ctxt);

    switch ( rc )
    {
    case X86EMUL_UNHANDLEABLE:
        gdprintk(XENLOG_WARNING,
                 "instruction emulation failed @ %04x:%lx: "
                 "%02x %02x %02x %02x %02x %02x\n",
                 hvmemul_get_seg_reg(x86_seg_cs, &ctxt)->sel,
                 ctxt.insn_buf_eip,
                 ctxt.insn_buf[0], ctxt.insn_buf[1],
                 ctxt.insn_buf[2], ctxt.insn_buf[3],
                 ctxt.insn_buf[4], ctxt.insn_buf[5]);
        return;
    case X86EMUL_EXCEPTION:
        if ( ctxt.exn_pending )
            hvm_inject_exception(ctxt.exn_vector, ctxt.exn_error_code, 0);
        break;
    default:
        break;
    }

    hvm_emulate_writeback(&ctxt);
}
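/*
 * Main VM exit dispatcher, entered from the low-level VMX exit path.  It
 * records trace/perf data, re-enables interrupts, undoes the real-mode
 * RFLAGS fixup, requeues any event whose delivery was cut short by the
 * exit, and then switches on the exit reason.
 */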
asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
{
    unsigned int exit_reason, idtv_info;
    unsigned long exit_qualification, inst_len = 0;
    struct vcpu *v = current;

    if ( paging_mode_hap(v->domain) && hvm_paging_enabled(v) )
        v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3] =
            __vmread(GUEST_CR3);

    exit_reason = __vmread(VM_EXIT_REASON);

    if ( hvm_long_mode_enabled(v) )
        HVMTRACE_ND(VMEXIT64, 1/*cycles*/, 3, exit_reason,
                    (uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32),
                    0, 0, 0);
    else
        HVMTRACE_ND(VMEXIT, 1/*cycles*/, 2, exit_reason,
                    (uint32_t)regs->eip,
                    0, 0, 0, 0);

    perfc_incra(vmexits, exit_reason);

    /* Handle the interrupt we missed before allowing any more in. */
    if ( exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT )
        vmx_do_extint(regs);

    /* Now enable interrupts so it's safe to take locks. */
    local_irq_enable();

    if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
        return vmx_failed_vmentry(exit_reason, regs);

    if ( v->arch.hvm_vmx.vmx_realmode )
    {
        unsigned int vector;

        /* Put RFLAGS back the way the guest wants it */
        regs->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IOPL);
        regs->eflags |= (v->arch.hvm_vmx.vm86_saved_eflags & X86_EFLAGS_IOPL);

        /* Unless this exit was for an interrupt, we've hit something
         * vm86 can't handle. Try again, using the emulator. */
        switch ( exit_reason )
        {
        case EXIT_REASON_EXCEPTION_NMI:
            vector = __vmread(VM_EXIT_INTR_INFO) & INTR_INFO_VECTOR_MASK;
            if ( vector != TRAP_page_fault
                 && vector != TRAP_nmi
                 && vector != TRAP_machine_check )
            {
                perfc_incr(realmode_exits);
                v->arch.hvm_vmx.vmx_emulate = 1;
                return;
            }
        case EXIT_REASON_EXTERNAL_INTERRUPT:
        case EXIT_REASON_INIT:
        case EXIT_REASON_SIPI:
        case EXIT_REASON_PENDING_VIRT_INTR:
        case EXIT_REASON_PENDING_VIRT_NMI:
        case EXIT_REASON_MCE_DURING_VMENTRY:
            break;
        default:
            v->arch.hvm_vmx.vmx_emulate = 1;
            perfc_incr(realmode_exits);
            return;
        }
    }
    hvm_maybe_deassert_evtchn_irq();

    /* Event delivery caused this intercept? Queue for redelivery. */
    idtv_info = __vmread(IDT_VECTORING_INFO);
    if ( unlikely(idtv_info & INTR_INFO_VALID_MASK) &&
         (exit_reason != EXIT_REASON_TASK_SWITCH) )
    {
        if ( hvm_event_needs_reinjection((idtv_info>>8)&7, idtv_info&0xff) )
        {
            /* See SDM 3B 25.7.1.1 and .2 for info about masking resvd bits. */
            __vmwrite(VM_ENTRY_INTR_INFO,
                      idtv_info & ~INTR_INFO_RESVD_BITS_MASK);
            if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK )
                __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE,
                          __vmread(IDT_VECTORING_ERROR_CODE));
        }

        /*
         * Clear NMI-blocking interruptibility info if an NMI delivery faulted.
         * Re-delivery will re-set it (see SDM 3B 25.7.1.2).
         */
        if ( (idtv_info & INTR_INFO_INTR_TYPE_MASK) == (X86_EVENTTYPE_NMI<<8) )
            __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
                      __vmread(GUEST_INTERRUPTIBILITY_INFO) &
                      ~VMX_INTR_SHADOW_NMI);
    }
    switch ( exit_reason )
    {
    case EXIT_REASON_EXCEPTION_NMI:
    {
        /*
         * We don't set the software-interrupt exiting (INT n).
         * (1) We can get an exception (e.g. #PG) in the guest, or
         * (2) NMI
         */
        unsigned int intr_info, vector;

        intr_info = __vmread(VM_EXIT_INTR_INFO);
        BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));

        vector = intr_info & INTR_INFO_VECTOR_MASK;

        /*
         * Re-set the NMI shadow if vmexit caused by a guest IRET fault (see 3B
         * 25.7.1.2, "Resuming Guest Software after Handling an Exception").
         * (NB. If we emulate this IRET for any reason, we should re-clear!)
         */
        if ( unlikely(intr_info & INTR_INFO_NMI_UNBLOCKED_BY_IRET) &&
             !(idtv_info & INTR_INFO_VALID_MASK) &&
             (vector != TRAP_double_fault) )
            __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
                      __vmread(GUEST_INTERRUPTIBILITY_INFO)
                      | VMX_INTR_SHADOW_NMI);

        perfc_incra(cause_vector, vector);

        switch ( vector )
        {
        case TRAP_debug:
            /*
             * Updates DR6 where debugger can peek (See 3B 23.2.1,
             * Table 23-1, "Exit Qualification for Debug Exceptions").
             */
            exit_qualification = __vmread(EXIT_QUALIFICATION);
            write_debugreg(6, exit_qualification | 0xffff0ff0);
            if ( !v->domain->debugger_attached || cpu_has_monitor_trap_flag )
                goto exit_and_crash;
            domain_pause_for_debugger();
            break;
        case TRAP_int3:
            if ( !v->domain->debugger_attached )
                goto exit_and_crash;
            inst_len = __get_instruction_length(); /* Safe: INT3 */
            __update_guest_eip(inst_len);
            domain_pause_for_debugger();
            break;
        case TRAP_no_device:
            vmx_fpu_dirty_intercept();
            break;
        case TRAP_page_fault:
            exit_qualification = __vmread(EXIT_QUALIFICATION);
            regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);

            HVM_DBG_LOG(DBG_LEVEL_VMMU,
                        "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
                        (unsigned long)regs->eax, (unsigned long)regs->ebx,
                        (unsigned long)regs->ecx, (unsigned long)regs->edx,
                        (unsigned long)regs->esi, (unsigned long)regs->edi);

            if ( paging_fault(exit_qualification, regs) )
            {
                if ( trace_will_trace_event(TRC_SHADOW) )
                    break;
                if ( hvm_long_mode_enabled(v) )
                    HVMTRACE_LONG_2D(PF_XEN, regs->error_code,
                                     TRC_PAR_LONG(exit_qualification) );
                else
                    HVMTRACE_2D(PF_XEN,
                                regs->error_code, exit_qualification );
                break;
            }

            v->arch.hvm_vcpu.guest_cr[2] = exit_qualification;
            vmx_inject_hw_exception(TRAP_page_fault, regs->error_code);
            break;
        case TRAP_nmi:
            if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) !=
                 (X86_EVENTTYPE_NMI << 8) )
                goto exit_and_crash;
            HVMTRACE_0D(NMI);
            do_nmi(regs); /* Real NMI, vector 2: normal processing. */
            break;
        case TRAP_machine_check:
            HVMTRACE_0D(MCE);
            do_machine_check(regs);
            break;
        case TRAP_invalid_op:
            vmx_vmexit_ud_intercept(regs);
            break;
        default:
            goto exit_and_crash;
        }
        break;
    }
    case EXIT_REASON_EXTERNAL_INTERRUPT:
        /* Already handled above. */
        break;
    case EXIT_REASON_TRIPLE_FAULT:
        hvm_triple_fault();
        break;
    case EXIT_REASON_PENDING_VIRT_INTR:
        /* Disable the interrupt window. */
        v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
        __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
                  v->arch.hvm_vmx.exec_control);
        break;
    case EXIT_REASON_PENDING_VIRT_NMI:
        /* Disable the NMI window. */
        v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
        __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
                  v->arch.hvm_vmx.exec_control);
        break;
    case EXIT_REASON_TASK_SWITCH: {
        const enum hvm_task_switch_reason reasons[] = {
            TSW_call_or_int, TSW_iret, TSW_jmp, TSW_call_or_int };
        int32_t errcode = -1;
        exit_qualification = __vmread(EXIT_QUALIFICATION);
        if ( (idtv_info & INTR_INFO_VALID_MASK) &&
             (idtv_info & INTR_INFO_DELIVER_CODE_MASK) )
            errcode = __vmread(IDT_VECTORING_ERROR_CODE);
        hvm_task_switch((uint16_t)exit_qualification,
                        reasons[(exit_qualification >> 30) & 3],
                        errcode);
        break;
    }
    case EXIT_REASON_CPUID:
        inst_len = __get_instruction_length(); /* Safe: CPUID */
        __update_guest_eip(inst_len);
        vmx_do_cpuid(regs);
        break;
    case EXIT_REASON_HLT:
        inst_len = __get_instruction_length(); /* Safe: HLT */
        __update_guest_eip(inst_len);
        hvm_hlt(regs->eflags);
        break;
    case EXIT_REASON_INVLPG:
    {
        inst_len = __get_instruction_length(); /* Safe: INVLPG */
        __update_guest_eip(inst_len);
        exit_qualification = __vmread(EXIT_QUALIFICATION);
        vmx_invlpg_intercept(exit_qualification);
        break;
    }
    case EXIT_REASON_RDTSC:
        inst_len = __get_instruction_length();
        __update_guest_eip(inst_len);
        hvm_rdtsc_intercept(regs);
        break;
    case EXIT_REASON_VMCALL:
    {
        int rc;
        HVMTRACE_1D(VMMCALL, regs->eax);
        inst_len = __get_instruction_length(); /* Safe: VMCALL */
        rc = hvm_do_hypercall(regs);
        if ( rc != HVM_HCALL_preempted )
        {
            __update_guest_eip(inst_len);
            if ( rc == HVM_HCALL_invalidate )
                send_invalidate_req();
        }
        break;
    }
    case EXIT_REASON_CR_ACCESS:
    {
        exit_qualification = __vmread(EXIT_QUALIFICATION);
        inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
        if ( vmx_cr_access(exit_qualification, regs) )
            __update_guest_eip(inst_len);
        break;
    }
    case EXIT_REASON_DR_ACCESS:
        exit_qualification = __vmread(EXIT_QUALIFICATION);
        vmx_dr_access(exit_qualification, regs);
        break;
    case EXIT_REASON_MSR_READ:
        inst_len = __get_instruction_length(); /* Safe: RDMSR */
        if ( hvm_msr_read_intercept(regs) == X86EMUL_OKAY )
            __update_guest_eip(inst_len);
        break;
    case EXIT_REASON_MSR_WRITE:
        inst_len = __get_instruction_length(); /* Safe: WRMSR */
        if ( hvm_msr_write_intercept(regs) == X86EMUL_OKAY )
            __update_guest_eip(inst_len);
        break;

    case EXIT_REASON_MWAIT_INSTRUCTION:
    case EXIT_REASON_MONITOR_INSTRUCTION:
    case EXIT_REASON_VMCLEAR:
    case EXIT_REASON_VMLAUNCH:
    case EXIT_REASON_VMPTRLD:
    case EXIT_REASON_VMPTRST:
    case EXIT_REASON_VMREAD:
    case EXIT_REASON_VMRESUME:
    case EXIT_REASON_VMWRITE:
    case EXIT_REASON_VMXOFF:
    case EXIT_REASON_VMXON:
        vmx_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
        break;

    case EXIT_REASON_TPR_BELOW_THRESHOLD:
        break;
    case EXIT_REASON_IO_INSTRUCTION:
    case EXIT_REASON_APIC_ACCESS:
        if ( !handle_mmio() )
            vmx_inject_hw_exception(TRAP_gp_fault, 0);
        break;

    case EXIT_REASON_INVD:
    case EXIT_REASON_WBINVD:
    {
        inst_len = __get_instruction_length(); /* Safe: INVD, WBINVD */
        __update_guest_eip(inst_len);
        vmx_wbinvd_intercept();
        break;
    }

    case EXIT_REASON_EPT_VIOLATION:
    {
        paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
#ifdef __i386__
        gpa |= (paddr_t)__vmread(GUEST_PHYSICAL_ADDRESS_HIGH) << 32;
#endif
        exit_qualification = __vmread(EXIT_QUALIFICATION);
        ept_handle_violation(exit_qualification, gpa);
        break;
    }

    case EXIT_REASON_MONITOR_TRAP_FLAG:
    {
        v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
        __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
        if ( v->domain->debugger_attached && v->arch.hvm_vcpu.single_step )
            domain_pause_for_debugger();
        break;
    }

    default:
    exit_and_crash:
        gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
        domain_crash(v->domain);
        break;
    }
}
asmlinkage void vmx_trace_vmentry(void)
{
    HVMTRACE_ND (VMENTRY, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
}
/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */