view xen/arch/x86/hvm/vmx/vmx.c @ 16714:43aab98aef60

vmx: Fix the 2nd argument of cpuid_count() in vmx_cpuid_intercept()

For input 0x00000004, the value of "*ecx" has already been overwritten by the
cpuid() in hvm_cpuid(), causing a bad value to be passed to cpuid_count().

Signed-off-by: Dexuan Cui <dexuan.cui@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Sat Dec 29 17:55:47 2007 +0000 (2007-12-29)
parents d5f0afb58589
children e4fd457a3dd5
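
The change described above amounts to latching the CPUID sub-leaf index before hvm_cpuid() clobbers the guest's %ecx. A minimal sketch of that pattern, stripped down from the vmx_cpuid_intercept() body later in this listing (the real function also handles the VMXASSIST leaf and feature masking):

    void vmx_cpuid_intercept(unsigned int *eax, unsigned int *ebx,
                             unsigned int *ecx, unsigned int *edx)
    {
        unsigned int input = *eax;
        unsigned int count = *ecx;  /* latch the sub-leaf before it is clobbered */

        hvm_cpuid(input, eax, ebx, ecx, edx);  /* overwrites *ecx for leaf 4 */

        switch ( input )
        {
        case 0x00000004:
            /* Re-query leaf 4 with the saved sub-leaf, not the clobbered *ecx. */
            cpuid_count(input, count, eax, ebx, ecx, edx);
            break;
        }
    }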
line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 */
19 #include <xen/config.h>
20 #include <xen/init.h>
21 #include <xen/lib.h>
22 #include <xen/trace.h>
23 #include <xen/sched.h>
24 #include <xen/irq.h>
25 #include <xen/softirq.h>
26 #include <xen/domain_page.h>
27 #include <xen/hypercall.h>
28 #include <xen/perfc.h>
29 #include <asm/current.h>
30 #include <asm/io.h>
31 #include <asm/regs.h>
32 #include <asm/cpufeature.h>
33 #include <asm/processor.h>
34 #include <asm/types.h>
35 #include <asm/debugreg.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/paging.h>
39 #include <asm/p2m.h>
40 #include <asm/hvm/hvm.h>
41 #include <asm/hvm/support.h>
42 #include <asm/hvm/vmx/vmx.h>
43 #include <asm/hvm/vmx/vmcs.h>
44 #include <asm/hvm/vmx/cpu.h>
45 #include <public/sched.h>
46 #include <public/hvm/ioreq.h>
47 #include <asm/hvm/vpic.h>
48 #include <asm/hvm/vlapic.h>
49 #include <asm/x86_emulate.h>
50 #include <asm/hvm/vpt.h>
51 #include <public/hvm/save.h>
52 #include <asm/hvm/trace.h>
54 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
56 static void vmx_ctxt_switch_from(struct vcpu *v);
57 static void vmx_ctxt_switch_to(struct vcpu *v);
59 static int vmx_alloc_vlapic_mapping(struct domain *d);
60 static void vmx_free_vlapic_mapping(struct domain *d);
61 static void vmx_install_vlapic_mapping(struct vcpu *v);
62 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr);
63 static void vmx_update_guest_efer(struct vcpu *v);
65 static int vmx_domain_initialise(struct domain *d)
66 {
67 return vmx_alloc_vlapic_mapping(d);
68 }
70 static void vmx_domain_destroy(struct domain *d)
71 {
72 vmx_free_vlapic_mapping(d);
73 }
75 static int vmx_vcpu_initialise(struct vcpu *v)
76 {
77 int rc;
79 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
81 v->arch.schedule_tail = vmx_do_resume;
82 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
83 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
85 if ( (rc = vmx_create_vmcs(v)) != 0 )
86 {
87 dprintk(XENLOG_WARNING,
88 "Failed to create VMCS for vcpu %d: err=%d.\n",
89 v->vcpu_id, rc);
90 return rc;
91 }
93 vmx_install_vlapic_mapping(v);
95 #ifndef VMXASSIST
96 if ( v->vcpu_id == 0 )
97 v->arch.guest_context.user_regs.eax = 1;
98 v->arch.hvm_vcpu.io_complete = vmx_realmode_io_complete;
99 #endif
101 return 0;
102 }
104 static void vmx_vcpu_destroy(struct vcpu *v)
105 {
106 vmx_destroy_vmcs(v);
107 }
109 #ifdef __x86_64__
111 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
113 static u32 msr_index[VMX_MSR_COUNT] =
114 {
115 MSR_LSTAR, MSR_STAR, MSR_SYSCALL_MASK
116 };
118 static void vmx_save_host_msrs(void)
119 {
120 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
121 int i;
123 for ( i = 0; i < VMX_MSR_COUNT; i++ )
124 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
125 }
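/*
 * Stash the value in the guest's MSR shadow, mark the MSR dirty in both
 * the guest and host flag masks so it is swapped at context switch, and
 * write it to hardware.
 */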
127 #define WRITE_MSR(address) \
128 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
129 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
130 wrmsrl(MSR_ ## address, msr_content); \
131 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
132 break
134 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
135 {
136 u64 msr_content = 0;
137 u32 ecx = regs->ecx;
138 struct vcpu *v = current;
139 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
141 switch ( ecx )
142 {
143 case MSR_EFER:
144 msr_content = v->arch.hvm_vcpu.guest_efer;
145 break;
147 case MSR_FS_BASE:
148 msr_content = __vmread(GUEST_FS_BASE);
149 goto check_long_mode;
151 case MSR_GS_BASE:
152 msr_content = __vmread(GUEST_GS_BASE);
153 goto check_long_mode;
155 case MSR_SHADOW_GS_BASE:
156 msr_content = v->arch.hvm_vmx.shadow_gs;
157 check_long_mode:
158 if ( !(hvm_long_mode_enabled(v)) )
159 {
160 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
161 return HNDL_exception_raised;
162 }
163 break;
165 case MSR_STAR:
166 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
167 break;
169 case MSR_LSTAR:
170 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
171 break;
173 case MSR_CSTAR:
174 msr_content = v->arch.hvm_vmx.cstar;
175 break;
177 case MSR_SYSCALL_MASK:
178 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
179 break;
181 default:
182 return HNDL_unhandled;
183 }
185 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
187 regs->eax = (u32)(msr_content >> 0);
188 regs->edx = (u32)(msr_content >> 32);
190 return HNDL_done;
191 }
193 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
194 {
195 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
196 u32 ecx = regs->ecx;
197 struct vcpu *v = current;
198 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
199 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
201 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
203 switch ( ecx )
204 {
205 case MSR_EFER:
206 if ( !hvm_set_efer(msr_content) )
207 goto exception_raised;
208 break;
210 case MSR_FS_BASE:
211 case MSR_GS_BASE:
212 case MSR_SHADOW_GS_BASE:
213 if ( !hvm_long_mode_enabled(v) )
214 goto gp_fault;
216 if ( !is_canonical_address(msr_content) )
217 goto uncanonical_address;
219 if ( ecx == MSR_FS_BASE )
220 __vmwrite(GUEST_FS_BASE, msr_content);
221 else if ( ecx == MSR_GS_BASE )
222 __vmwrite(GUEST_GS_BASE, msr_content);
223 else
224 {
225 v->arch.hvm_vmx.shadow_gs = msr_content;
226 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
227 }
229 break;
231 case MSR_STAR:
232 WRITE_MSR(STAR);
234 case MSR_LSTAR:
235 if ( !is_canonical_address(msr_content) )
236 goto uncanonical_address;
237 WRITE_MSR(LSTAR);
239 case MSR_CSTAR:
240 if ( !is_canonical_address(msr_content) )
241 goto uncanonical_address;
242 v->arch.hvm_vmx.cstar = msr_content;
243 break;
245 case MSR_SYSCALL_MASK:
246 WRITE_MSR(SYSCALL_MASK);
248 default:
249 return HNDL_unhandled;
250 }
252 return HNDL_done;
254 uncanonical_address:
255 HVM_DBG_LOG(DBG_LEVEL_0, "Non-canonical address for MSR write %x", ecx);
256 gp_fault:
257 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
258 exception_raised:
259 return HNDL_exception_raised;
260 }
262 /*
263 * To avoid MSR save/restore at every VM exit/entry time, we restore
264 * the x86_64 specific MSRs at domain switch time. Since these MSRs
265 * are not modified once set for para domains, we don't save them,
266 * but simply reset them to values set in percpu_traps_init().
267 */
268 static void vmx_restore_host_msrs(void)
269 {
270 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
271 int i;
273 while ( host_msr_state->flags )
274 {
275 i = find_first_set_bit(host_msr_state->flags);
276 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
277 clear_bit(i, &host_msr_state->flags);
278 }
280 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
281 write_efer(read_efer() | EFER_NX);
282 }
284 static void vmx_save_guest_msrs(struct vcpu *v)
285 {
286 /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
287 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
288 }
290 static void vmx_restore_guest_msrs(struct vcpu *v)
291 {
292 struct vmx_msr_state *guest_msr_state, *host_msr_state;
293 unsigned long guest_flags;
294 int i;
296 guest_msr_state = &v->arch.hvm_vmx.msr_state;
297 host_msr_state = &this_cpu(host_msr_state);
299 wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
301 guest_flags = guest_msr_state->flags;
303 while ( guest_flags )
304 {
305 i = find_first_set_bit(guest_flags);
307 HVM_DBG_LOG(DBG_LEVEL_2,
308 "restore guest's index %d msr %x with value %lx",
309 i, msr_index[i], guest_msr_state->msrs[i]);
310 set_bit(i, &host_msr_state->flags);
311 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
312 clear_bit(i, &guest_flags);
313 }
315 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & (EFER_NX | EFER_SCE) )
316 {
317 HVM_DBG_LOG(DBG_LEVEL_2,
318 "restore guest's EFER with value %lx",
319 v->arch.hvm_vcpu.guest_efer);
320 write_efer((read_efer() & ~(EFER_NX | EFER_SCE)) |
321 (v->arch.hvm_vcpu.guest_efer & (EFER_NX | EFER_SCE)));
322 }
323 }
325 #else /* __i386__ */
327 #define vmx_save_host_msrs() ((void)0)
329 static void vmx_restore_host_msrs(void)
330 {
331 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
332 write_efer(read_efer() | EFER_NX);
333 }
335 #define vmx_save_guest_msrs(v) ((void)0)
337 static void vmx_restore_guest_msrs(struct vcpu *v)
338 {
339 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & EFER_NX )
340 {
341 HVM_DBG_LOG(DBG_LEVEL_2,
342 "restore guest's EFER with value %lx",
343 v->arch.hvm_vcpu.guest_efer);
344 write_efer((read_efer() & ~EFER_NX) |
345 (v->arch.hvm_vcpu.guest_efer & EFER_NX));
346 }
347 }
349 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
350 {
351 u64 msr_content = 0;
352 struct vcpu *v = current;
354 switch ( regs->ecx )
355 {
356 case MSR_EFER:
357 msr_content = v->arch.hvm_vcpu.guest_efer;
358 break;
360 default:
361 return HNDL_unhandled;
362 }
364 regs->eax = msr_content >> 0;
365 regs->edx = msr_content >> 32;
367 return HNDL_done;
368 }
370 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
371 {
372 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
374 switch ( regs->ecx )
375 {
376 case MSR_EFER:
377 if ( !hvm_set_efer(msr_content) )
378 return HNDL_exception_raised;
379 break;
381 default:
382 return HNDL_unhandled;
383 }
385 return HNDL_done;
386 }
388 #endif /* __i386__ */
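/*
 * Return the guest's current execution mode: 0 for real mode, 1 for
 * virtual-8086 mode, 8 for 64-bit mode, otherwise the default operand
 * size of the code segment (4 or 2 bytes).
 */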
390 static int vmx_guest_x86_mode(struct vcpu *v)
391 {
392 unsigned int cs_ar_bytes;
394 if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
395 return 0;
396 if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
397 return 1;
398 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
399 if ( hvm_long_mode_enabled(v) &&
400 likely(cs_ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
401 return 8;
402 return (likely(cs_ar_bytes & X86_SEG_AR_DEF_OP_SIZE) ? 4 : 2);
403 }
405 static void vmx_save_dr(struct vcpu *v)
406 {
407 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
408 return;
410 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
411 v->arch.hvm_vcpu.flag_dr_dirty = 0;
412 v->arch.hvm_vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
413 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
415 v->arch.guest_context.debugreg[0] = read_debugreg(0);
416 v->arch.guest_context.debugreg[1] = read_debugreg(1);
417 v->arch.guest_context.debugreg[2] = read_debugreg(2);
418 v->arch.guest_context.debugreg[3] = read_debugreg(3);
419 v->arch.guest_context.debugreg[6] = read_debugreg(6);
420 /* DR7 must be saved as it is used by vmx_restore_dr(). */
421 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
422 }
424 static void __restore_debug_registers(struct vcpu *v)
425 {
426 if ( v->arch.hvm_vcpu.flag_dr_dirty )
427 return;
429 v->arch.hvm_vcpu.flag_dr_dirty = 1;
431 write_debugreg(0, v->arch.guest_context.debugreg[0]);
432 write_debugreg(1, v->arch.guest_context.debugreg[1]);
433 write_debugreg(2, v->arch.guest_context.debugreg[2]);
434 write_debugreg(3, v->arch.guest_context.debugreg[3]);
435 write_debugreg(6, v->arch.guest_context.debugreg[6]);
436 /* DR7 is loaded from the VMCS. */
437 }
439 /*
440 * DR7 is saved and restored on every vmexit. Other debug registers only
441 * need to be restored if their value is going to affect execution -- i.e.,
442 * if one of the breakpoints is enabled. So mask out all bits that don't
443 * enable some breakpoint functionality.
444 */
445 static void vmx_restore_dr(struct vcpu *v)
446 {
447 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
448 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
449 __restore_debug_registers(v);
450 }
452 void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
453 {
454 uint32_t ev;
456 vmx_vmcs_enter(v);
458 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
459 c->cr2 = v->arch.hvm_vcpu.guest_cr[2];
460 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
461 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
463 c->msr_efer = v->arch.hvm_vcpu.guest_efer;
465 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
466 c->idtr_base = __vmread(GUEST_IDTR_BASE);
468 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
469 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
471 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
472 c->cs_limit = __vmread(GUEST_CS_LIMIT);
473 c->cs_base = __vmread(GUEST_CS_BASE);
474 c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES);
476 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
477 c->ds_limit = __vmread(GUEST_DS_LIMIT);
478 c->ds_base = __vmread(GUEST_DS_BASE);
479 c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES);
481 c->es_sel = __vmread(GUEST_ES_SELECTOR);
482 c->es_limit = __vmread(GUEST_ES_LIMIT);
483 c->es_base = __vmread(GUEST_ES_BASE);
484 c->es_arbytes = __vmread(GUEST_ES_AR_BYTES);
486 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
487 c->ss_limit = __vmread(GUEST_SS_LIMIT);
488 c->ss_base = __vmread(GUEST_SS_BASE);
489 c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES);
491 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
492 c->fs_limit = __vmread(GUEST_FS_LIMIT);
493 c->fs_base = __vmread(GUEST_FS_BASE);
494 c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES);
496 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
497 c->gs_limit = __vmread(GUEST_GS_LIMIT);
498 c->gs_base = __vmread(GUEST_GS_BASE);
499 c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES);
501 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
502 c->tr_limit = __vmread(GUEST_TR_LIMIT);
503 c->tr_base = __vmread(GUEST_TR_BASE);
504 c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES);
506 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
507 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
508 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
509 c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES);
511 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
512 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
513 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
515 c->pending_event = 0;
516 c->error_code = 0;
517 if ( ((ev = __vmread(VM_ENTRY_INTR_INFO)) & INTR_INFO_VALID_MASK) &&
518 hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
519 {
520 c->pending_event = ev;
521 c->error_code = __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE);
522 }
524 vmx_vmcs_exit(v);
525 }
527 static int vmx_restore_cr0_cr3(
528 struct vcpu *v, unsigned long cr0, unsigned long cr3)
529 {
530 unsigned long mfn = 0;
531 p2m_type_t p2mt;
533 if ( cr0 & X86_CR0_PG )
534 {
535 mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
536 if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
537 {
538 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3);
539 return -EINVAL;
540 }
541 }
543 if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
544 put_page(pagetable_get_page(v->arch.guest_table));
546 v->arch.guest_table = pagetable_from_pfn(mfn);
548 v->arch.hvm_vcpu.guest_cr[0] = cr0 | X86_CR0_ET;
549 v->arch.hvm_vcpu.guest_cr[3] = cr3;
551 return 0;
552 }
554 int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
555 {
556 int rc;
558 if ( c->pending_valid &&
559 ((c->pending_type == 1) || (c->pending_type > 6) ||
560 (c->pending_reserved != 0)) )
561 {
562 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32".\n",
563 c->pending_event);
564 return -EINVAL;
565 }
567 rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3);
568 if ( rc )
569 return rc;
571 vmx_vmcs_enter(v);
573 v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
574 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
575 vmx_update_guest_cr(v, 0);
576 vmx_update_guest_cr(v, 2);
577 vmx_update_guest_cr(v, 4);
579 #ifdef HVM_DEBUG_SUSPEND
580 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
581 __func__, c->cr3, c->cr0, c->cr4);
582 #endif
584 v->arch.hvm_vcpu.guest_efer = c->msr_efer;
585 vmx_update_guest_efer(v);
587 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
588 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
590 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
591 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
593 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
594 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
595 __vmwrite(GUEST_CS_BASE, c->cs_base);
596 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
598 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
599 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
600 __vmwrite(GUEST_DS_BASE, c->ds_base);
601 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
603 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
604 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
605 __vmwrite(GUEST_ES_BASE, c->es_base);
606 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
608 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
609 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
610 __vmwrite(GUEST_SS_BASE, c->ss_base);
611 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
613 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
614 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
615 __vmwrite(GUEST_FS_BASE, c->fs_base);
616 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
618 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
619 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
620 __vmwrite(GUEST_GS_BASE, c->gs_base);
621 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
623 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
624 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
625 __vmwrite(GUEST_TR_BASE, c->tr_base);
626 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
628 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
629 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
630 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
631 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
633 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
634 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
635 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
637 __vmwrite(GUEST_DR7, c->dr7);
639 vmx_vmcs_exit(v);
641 paging_update_paging_modes(v);
643 if ( c->pending_valid )
644 {
645 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
646 c->pending_event, c->error_code);
648 if ( hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
649 {
650 vmx_vmcs_enter(v);
651 __vmwrite(VM_ENTRY_INTR_INFO, c->pending_event);
652 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
653 vmx_vmcs_exit(v);
654 }
655 }
657 return 0;
658 }
660 #if defined(__x86_64__) && defined(HVM_DEBUG_SUSPEND)
661 static void dump_msr_state(struct vmx_msr_state *m)
662 {
663 int i = 0;
664 printk("**** msr state ****\n");
665 printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
666 for ( i = 0; i < VMX_MSR_COUNT; i++ )
667 printk("0x%lx,", m->msrs[i]);
668 printk("\n");
669 }
670 #else
671 #define dump_msr_state(m) ((void)0)
672 #endif
674 static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
675 {
676 #ifdef __x86_64__
677 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
678 unsigned long guest_flags = guest_state->flags;
680 data->shadow_gs = v->arch.hvm_vmx.shadow_gs;
681 data->msr_cstar = v->arch.hvm_vmx.cstar;
683 /* save msrs */
684 data->msr_flags = guest_flags;
685 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
686 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
687 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
688 #endif
690 data->tsc = hvm_get_guest_time(v);
692 dump_msr_state(guest_state);
693 }
695 static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
696 {
697 #ifdef __x86_64__
698 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
700 /* restore msrs */
701 guest_state->flags = data->msr_flags;
702 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
703 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
704 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
706 v->arch.hvm_vmx.cstar = data->msr_cstar;
707 v->arch.hvm_vmx.shadow_gs = data->shadow_gs;
708 #endif
710 #ifdef VMXASSIST
711 v->arch.hvm_vmx.vmxassist_enabled = !(data->cr0 & X86_CR0_PE);
712 #endif
714 hvm_set_guest_time(v, data->tsc);
716 dump_msr_state(guest_state);
717 }
720 static void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
721 {
722 vmx_save_cpu_state(v, ctxt);
723 vmx_vmcs_save(v, ctxt);
724 }
726 static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
727 {
728 vmx_load_cpu_state(v, ctxt);
730 if ( vmx_vmcs_restore(v, ctxt) )
731 {
732 gdprintk(XENLOG_ERR, "vmx_vmcs restore failed!\n");
733 domain_crash(v->domain);
734 return -EINVAL;
735 }
737 return 0;
738 }
740 static void vmx_ctxt_switch_from(struct vcpu *v)
741 {
742 vmx_save_guest_msrs(v);
743 vmx_restore_host_msrs();
744 vmx_save_dr(v);
745 }
747 static void vmx_ctxt_switch_to(struct vcpu *v)
748 {
749 /* HOST_CR4 in VMCS is always mmu_cr4_features. Sync CR4 now. */
750 if ( unlikely(read_cr4() != mmu_cr4_features) )
751 write_cr4(mmu_cr4_features);
753 vmx_restore_guest_msrs(v);
754 vmx_restore_dr(v);
755 }
757 static unsigned long vmx_get_segment_base(struct vcpu *v, enum x86_segment seg)
758 {
759 unsigned long base = 0;
760 int long_mode = 0;
762 ASSERT(v == current);
764 if ( hvm_long_mode_enabled(v) &&
765 (__vmread(GUEST_CS_AR_BYTES) & X86_SEG_AR_CS_LM_ACTIVE) )
766 long_mode = 1;
768 switch ( seg )
769 {
770 case x86_seg_cs: if ( !long_mode ) base = __vmread(GUEST_CS_BASE); break;
771 case x86_seg_ds: if ( !long_mode ) base = __vmread(GUEST_DS_BASE); break;
772 case x86_seg_es: if ( !long_mode ) base = __vmread(GUEST_ES_BASE); break;
773 case x86_seg_fs: base = __vmread(GUEST_FS_BASE); break;
774 case x86_seg_gs: base = __vmread(GUEST_GS_BASE); break;
775 case x86_seg_ss: if ( !long_mode ) base = __vmread(GUEST_SS_BASE); break;
776 case x86_seg_tr: base = __vmread(GUEST_TR_BASE); break;
777 case x86_seg_gdtr: base = __vmread(GUEST_GDTR_BASE); break;
778 case x86_seg_idtr: base = __vmread(GUEST_IDTR_BASE); break;
779 case x86_seg_ldtr: base = __vmread(GUEST_LDTR_BASE); break;
780 default: BUG(); break;
781 }
783 return base;
784 }
786 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
787 struct segment_register *reg)
788 {
789 uint32_t attr = 0;
791 ASSERT(v == current);
793 switch ( seg )
794 {
795 case x86_seg_cs:
796 reg->sel = __vmread(GUEST_CS_SELECTOR);
797 reg->limit = __vmread(GUEST_CS_LIMIT);
798 reg->base = __vmread(GUEST_CS_BASE);
799 attr = __vmread(GUEST_CS_AR_BYTES);
800 break;
801 case x86_seg_ds:
802 reg->sel = __vmread(GUEST_DS_SELECTOR);
803 reg->limit = __vmread(GUEST_DS_LIMIT);
804 reg->base = __vmread(GUEST_DS_BASE);
805 attr = __vmread(GUEST_DS_AR_BYTES);
806 break;
807 case x86_seg_es:
808 reg->sel = __vmread(GUEST_ES_SELECTOR);
809 reg->limit = __vmread(GUEST_ES_LIMIT);
810 reg->base = __vmread(GUEST_ES_BASE);
811 attr = __vmread(GUEST_ES_AR_BYTES);
812 break;
813 case x86_seg_fs:
814 reg->sel = __vmread(GUEST_FS_SELECTOR);
815 reg->limit = __vmread(GUEST_FS_LIMIT);
816 reg->base = __vmread(GUEST_FS_BASE);
817 attr = __vmread(GUEST_FS_AR_BYTES);
818 break;
819 case x86_seg_gs:
820 reg->sel = __vmread(GUEST_GS_SELECTOR);
821 reg->limit = __vmread(GUEST_GS_LIMIT);
822 reg->base = __vmread(GUEST_GS_BASE);
823 attr = __vmread(GUEST_GS_AR_BYTES);
824 break;
825 case x86_seg_ss:
826 reg->sel = __vmread(GUEST_SS_SELECTOR);
827 reg->limit = __vmread(GUEST_SS_LIMIT);
828 reg->base = __vmread(GUEST_SS_BASE);
829 attr = __vmread(GUEST_SS_AR_BYTES);
830 break;
831 case x86_seg_tr:
832 reg->sel = __vmread(GUEST_TR_SELECTOR);
833 reg->limit = __vmread(GUEST_TR_LIMIT);
834 reg->base = __vmread(GUEST_TR_BASE);
835 attr = __vmread(GUEST_TR_AR_BYTES);
836 break;
837 case x86_seg_gdtr:
838 reg->limit = __vmread(GUEST_GDTR_LIMIT);
839 reg->base = __vmread(GUEST_GDTR_BASE);
840 break;
841 case x86_seg_idtr:
842 reg->limit = __vmread(GUEST_IDTR_LIMIT);
843 reg->base = __vmread(GUEST_IDTR_BASE);
844 break;
845 case x86_seg_ldtr:
846 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
847 reg->limit = __vmread(GUEST_LDTR_LIMIT);
848 reg->base = __vmread(GUEST_LDTR_BASE);
849 attr = __vmread(GUEST_LDTR_AR_BYTES);
850 break;
851 default:
852 BUG();
853 }
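/* Fold the 32-bit VMCS access-rights format into the 12-bit descriptor
 * attribute layout: the AVL/L/D-B/G flags move from bits 12-15 down to
 * bits 8-11. */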
855 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
856 /* Unusable flag is folded into Present flag. */
857 if ( attr & (1u<<16) )
858 reg->attr.fields.p = 0;
859 }
861 static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
862 struct segment_register *reg)
863 {
864 uint32_t attr;
866 ASSERT((v == current) || !vcpu_runnable(v));
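/* Expand the 12-bit descriptor attribute layout back into the VMCS
 * access-rights format: the AVL/L/D-B/G flags move from bits 8-11 up to
 * bits 12-15. */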
868 attr = reg->attr.bytes;
869 attr = ((attr & 0xf00) << 4) | (attr & 0xff);
871 /* Not-present must mean unusable. */
872 if ( !reg->attr.fields.p )
873 attr |= (1u << 16);
875 vmx_vmcs_enter(v);
877 switch ( seg )
878 {
879 case x86_seg_cs:
880 __vmwrite(GUEST_CS_SELECTOR, reg->sel);
881 __vmwrite(GUEST_CS_LIMIT, reg->limit);
882 __vmwrite(GUEST_CS_BASE, reg->base);
883 __vmwrite(GUEST_CS_AR_BYTES, attr);
884 break;
885 case x86_seg_ds:
886 __vmwrite(GUEST_DS_SELECTOR, reg->sel);
887 __vmwrite(GUEST_DS_LIMIT, reg->limit);
888 __vmwrite(GUEST_DS_BASE, reg->base);
889 __vmwrite(GUEST_DS_AR_BYTES, attr);
890 break;
891 case x86_seg_es:
892 __vmwrite(GUEST_ES_SELECTOR, reg->sel);
893 __vmwrite(GUEST_ES_LIMIT, reg->limit);
894 __vmwrite(GUEST_ES_BASE, reg->base);
895 __vmwrite(GUEST_ES_AR_BYTES, attr);
896 break;
897 case x86_seg_fs:
898 __vmwrite(GUEST_FS_SELECTOR, reg->sel);
899 __vmwrite(GUEST_FS_LIMIT, reg->limit);
900 __vmwrite(GUEST_FS_BASE, reg->base);
901 __vmwrite(GUEST_FS_AR_BYTES, attr);
902 break;
903 case x86_seg_gs:
904 __vmwrite(GUEST_GS_SELECTOR, reg->sel);
905 __vmwrite(GUEST_GS_LIMIT, reg->limit);
906 __vmwrite(GUEST_GS_BASE, reg->base);
907 __vmwrite(GUEST_GS_AR_BYTES, attr);
908 break;
909 case x86_seg_ss:
910 __vmwrite(GUEST_SS_SELECTOR, reg->sel);
911 __vmwrite(GUEST_SS_LIMIT, reg->limit);
912 __vmwrite(GUEST_SS_BASE, reg->base);
913 __vmwrite(GUEST_SS_AR_BYTES, attr);
914 break;
915 case x86_seg_tr:
916 __vmwrite(GUEST_TR_SELECTOR, reg->sel);
917 __vmwrite(GUEST_TR_LIMIT, reg->limit);
918 __vmwrite(GUEST_TR_BASE, reg->base);
919 __vmwrite(GUEST_TR_AR_BYTES, attr);
920 break;
921 case x86_seg_gdtr:
922 __vmwrite(GUEST_GDTR_LIMIT, reg->limit);
923 __vmwrite(GUEST_GDTR_BASE, reg->base);
924 break;
925 case x86_seg_idtr:
926 __vmwrite(GUEST_IDTR_LIMIT, reg->limit);
927 __vmwrite(GUEST_IDTR_BASE, reg->base);
928 break;
929 case x86_seg_ldtr:
930 __vmwrite(GUEST_LDTR_SELECTOR, reg->sel);
931 __vmwrite(GUEST_LDTR_LIMIT, reg->limit);
932 __vmwrite(GUEST_LDTR_BASE, reg->base);
933 __vmwrite(GUEST_LDTR_AR_BYTES, attr);
934 break;
935 default:
936 BUG();
937 }
939 vmx_vmcs_exit(v);
940 }
942 /* Make sure that Xen intercepts any FP accesses from the current vcpu. */
943 static void vmx_stts(struct vcpu *v)
944 {
945 /* VMX depends on operating on the current vcpu */
946 ASSERT(v == current);
948 /*
949 * If the guest does not have TS enabled then we must cause and handle an
950 * exception on first use of the FPU. If the guest *does* have TS enabled
951 * then this is not necessary: no FPU activity can occur until the guest
952 * clears CR0.TS, and we will initialise the FPU when that happens.
953 */
954 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
955 {
956 v->arch.hvm_vcpu.hw_cr[0] |= X86_CR0_TS;
957 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
958 __vm_set_bit(EXCEPTION_BITMAP, TRAP_no_device);
959 }
960 }
962 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
963 {
964 vmx_vmcs_enter(v);
965 __vmwrite(TSC_OFFSET, offset);
966 #if defined (__i386__)
967 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
968 #endif
969 vmx_vmcs_exit(v);
970 }
972 void do_nmi(struct cpu_user_regs *);
974 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
975 {
976 char *p;
977 int i;
979 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
980 {
981 p = (char *)(hypercall_page + (i * 32));
982 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
983 *(u32 *)(p + 1) = i;
984 *(u8 *)(p + 5) = 0x0f; /* vmcall */
985 *(u8 *)(p + 6) = 0x01;
986 *(u8 *)(p + 7) = 0xc1;
987 *(u8 *)(p + 8) = 0xc3; /* ret */
988 }
990 /* Don't support HYPERVISOR_iret at the moment */
991 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
992 }
994 static enum hvm_intblk vmx_interrupt_blocked(
995 struct vcpu *v, struct hvm_intack intack)
996 {
997 unsigned long intr_shadow;
999 intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1001 if ( intr_shadow & (VMX_INTR_SHADOW_STI|VMX_INTR_SHADOW_MOV_SS) )
1002 return hvm_intblk_shadow;
1004 if ( intack.source == hvm_intsrc_nmi )
1005 return ((intr_shadow & VMX_INTR_SHADOW_NMI) ?
1006 hvm_intblk_nmi_iret : hvm_intblk_none);
1008 ASSERT((intack.source == hvm_intsrc_pic) ||
1009 (intack.source == hvm_intsrc_lapic));
1011 if ( !(guest_cpu_user_regs()->eflags & X86_EFLAGS_IF) )
1012 return hvm_intblk_rflags_ie;
1014 return hvm_intblk_none;
1017 static void vmx_update_host_cr3(struct vcpu *v)
1019 ASSERT((v == current) || !vcpu_runnable(v));
1020 vmx_vmcs_enter(v);
1021 __vmwrite(HOST_CR3, v->arch.cr3);
1022 vmx_vmcs_exit(v);
1025 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
1027 ASSERT((v == current) || !vcpu_runnable(v));
1029 vmx_vmcs_enter(v);
1031 switch ( cr )
1033 case 0:
1034 /* TS cleared? Then initialise FPU now. */
1035 if ( (v == current) && !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) &&
1036 (v->arch.hvm_vcpu.hw_cr[0] & X86_CR0_TS) )
1038 setup_fpu(v);
1039 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
1042 v->arch.hvm_vcpu.hw_cr[0] =
1043 v->arch.hvm_vcpu.guest_cr[0] |
1044 X86_CR0_NE | X86_CR0_PG | X86_CR0_WP | X86_CR0_PE;
1045 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1046 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
1047 break;
1048 case 2:
1049 /* CR2 is updated in exit stub. */
1050 break;
1051 case 3:
1052 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]);
1053 break;
1054 case 4:
1055 v->arch.hvm_vcpu.hw_cr[4] =
1056 v->arch.hvm_vcpu.guest_cr[4] | HVM_CR4_HOST_MASK;
1057 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1058 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]);
1059 break;
1060 default:
1061 BUG();
1064 vmx_vmcs_exit(v);
1067 static void vmx_update_guest_efer(struct vcpu *v)
1069 #ifdef __x86_64__
1070 unsigned long vm_entry_value;
1072 ASSERT((v == current) || !vcpu_runnable(v));
1074 vmx_vmcs_enter(v);
1076 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1077 if ( v->arch.hvm_vcpu.guest_efer & EFER_LMA )
1078 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1079 else
1080 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1081 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1083 vmx_vmcs_exit(v);
1084 #endif
1086 if ( v == current )
1087 write_efer((read_efer() & ~(EFER_NX|EFER_SCE)) |
1088 (v->arch.hvm_vcpu.guest_efer & (EFER_NX|EFER_SCE)));
1091 static void vmx_flush_guest_tlbs(void)
1093 /* No tagged TLB support on VMX yet. The fact that we're in Xen
1094 * at all means any guest will have a clean TLB when it's next run,
1095 * because VMRESUME will flush it for us. */
1098 static void vmx_inject_exception(
1099 unsigned int trapnr, int errcode, unsigned long cr2)
1101 struct vcpu *curr = current;
1103 vmx_inject_hw_exception(curr, trapnr, errcode);
1105 if ( trapnr == TRAP_page_fault )
1106 curr->arch.hvm_vcpu.guest_cr[2] = cr2;
1108 if ( (trapnr == TRAP_debug) &&
1109 (guest_cpu_user_regs()->eflags & X86_EFLAGS_TF) )
1111 __restore_debug_registers(curr);
1112 write_debugreg(6, read_debugreg(6) | 0x4000);
1116 static int vmx_event_pending(struct vcpu *v)
1118 ASSERT(v == current);
1119 return (__vmread(VM_ENTRY_INTR_INFO) & INTR_INFO_VALID_MASK);
1122 static struct hvm_function_table vmx_function_table = {
1123 .name = "VMX",
1124 .domain_initialise = vmx_domain_initialise,
1125 .domain_destroy = vmx_domain_destroy,
1126 .vcpu_initialise = vmx_vcpu_initialise,
1127 .vcpu_destroy = vmx_vcpu_destroy,
1128 .save_cpu_ctxt = vmx_save_vmcs_ctxt,
1129 .load_cpu_ctxt = vmx_load_vmcs_ctxt,
1130 .interrupt_blocked = vmx_interrupt_blocked,
1131 .guest_x86_mode = vmx_guest_x86_mode,
1132 .get_segment_base = vmx_get_segment_base,
1133 .get_segment_register = vmx_get_segment_register,
1134 .set_segment_register = vmx_set_segment_register,
1135 .update_host_cr3 = vmx_update_host_cr3,
1136 .update_guest_cr = vmx_update_guest_cr,
1137 .update_guest_efer = vmx_update_guest_efer,
1138 .flush_guest_tlbs = vmx_flush_guest_tlbs,
1139 .stts = vmx_stts,
1140 .set_tsc_offset = vmx_set_tsc_offset,
1141 .inject_exception = vmx_inject_exception,
1142 .init_hypercall_page = vmx_init_hypercall_page,
1143 .event_pending = vmx_event_pending,
1144 .cpu_up = vmx_cpu_up,
1145 .cpu_down = vmx_cpu_down,
1146 };
1148 void start_vmx(void)
1150 static int bootstrapped;
1152 vmx_save_host_msrs();
1154 if ( bootstrapped )
1156 if ( hvm_enabled && !vmx_cpu_up() )
1158 printk("VMX: FATAL: failed to initialise CPU%d!\n",
1159 smp_processor_id());
1160 BUG();
1162 return;
1165 bootstrapped = 1;
1167 /* Xen does not fill x86_capability words except 0. */
1168 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1170 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1171 return;
1173 set_in_cr4(X86_CR4_VMXE);
1175 if ( !vmx_cpu_up() )
1177 printk("VMX: failed to initialise.\n");
1178 return;
1181 setup_vmcs_dump();
1183 hvm_enable(&vmx_function_table);
1186 /*
1187 * Not all cases receive valid value in the VM-exit instruction length field.
1188 * Callers must know what they're doing!
1189 */
1190 static int __get_instruction_length(void)
1192 int len;
1193 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1194 BUG_ON((len < 1) || (len > 15));
1195 return len;
1198 static void __update_guest_eip(unsigned long inst_len)
1200 struct cpu_user_regs *regs = guest_cpu_user_regs();
1201 unsigned long x;
1203 regs->eip += inst_len;
1204 regs->eflags &= ~X86_EFLAGS_RF;
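/* Completing an instruction clears any STI/MOV-SS interrupt shadow. */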
1206 x = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1207 if ( x & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) )
1209 x &= ~(VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS);
1210 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, x);
1213 if ( regs->eflags & X86_EFLAGS_TF )
1214 vmx_inject_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE, 0);
1217 static void vmx_do_no_device_fault(void)
1219 struct vcpu *v = current;
1221 setup_fpu(current);
1222 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
1224 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1225 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1227 v->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS;
1228 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1232 #define bitmaskof(idx) (1U << ((idx) & 31))
1233 void vmx_cpuid_intercept(
1234 unsigned int *eax, unsigned int *ebx,
1235 unsigned int *ecx, unsigned int *edx)
1237 unsigned int input = *eax;
1238 unsigned int count = *ecx;
1240 #ifdef VMXASSIST
1241 if ( input == 0x40000003 )
1243 /*
1244 * NB. Unsupported interface for private use of VMXASSIST only.
1245 * Note that this leaf lives at <max-hypervisor-leaf> + 1.
1246 */
1247 u64 value = ((u64)*edx << 32) | (u32)*ecx;
1248 p2m_type_t p2mt;
1249 unsigned long mfn;
1250 struct vcpu *v = current;
1251 char *p;
1253 mfn = mfn_x(gfn_to_mfn_current(value >> PAGE_SHIFT, &p2mt));
1255 gdprintk(XENLOG_INFO, "Input address is 0x%"PRIx64".\n", value);
1257 /* 8-byte aligned valid pseudophys address from vmxassist, please. */
1258 if ( (value & 7) || !p2m_is_ram(p2mt) ||
1259 !v->arch.hvm_vmx.vmxassist_enabled )
1261 domain_crash(v->domain);
1262 return;
1264 ASSERT(mfn_valid(mfn));
1266 p = map_domain_page(mfn);
1267 value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
1268 unmap_domain_page(p);
1270 gdprintk(XENLOG_INFO, "Output value is 0x%"PRIx64".\n", value);
1271 *ecx = (u32)value;
1272 *edx = (u32)(value >> 32);
1273 return;
1275 #endif
1277 hvm_cpuid(input, eax, ebx, ecx, edx);
1279 switch ( input )
1281 case 0x00000001:
1282 *ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
1283 *ebx &= NUM_THREADS_RESET_MASK;
1284 *ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
1285 bitmaskof(X86_FEATURE_EST) |
1286 bitmaskof(X86_FEATURE_TM2) |
1287 bitmaskof(X86_FEATURE_CID) |
1288 bitmaskof(X86_FEATURE_PDCM) |
1289 bitmaskof(X86_FEATURE_DSCPL));
1290 *edx &= ~(bitmaskof(X86_FEATURE_HT) |
1291 bitmaskof(X86_FEATURE_ACPI) |
1292 bitmaskof(X86_FEATURE_ACC) |
1293 bitmaskof(X86_FEATURE_DS));
1294 break;
1296 case 0x00000004:
1297 cpuid_count(input, count, eax, ebx, ecx, edx);
1298 *eax &= NUM_CORES_RESET_MASK;
1299 break;
1301 case 0x00000006:
1302 case 0x00000009:
1303 case 0x0000000A:
1304 *eax = *ebx = *ecx = *edx = 0;
1305 break;
1307 case 0x80000001:
1308 /* Only a few features are advertised in Intel's 0x80000001. */
1309 *ecx &= (bitmaskof(X86_FEATURE_LAHF_LM));
1310 *edx &= (bitmaskof(X86_FEATURE_NX) |
1311 bitmaskof(X86_FEATURE_LM) |
1312 bitmaskof(X86_FEATURE_SYSCALL));
1313 break;
1316 HVMTRACE_3D(CPUID, current, input,
1317 ((uint64_t)*eax << 32) | *ebx, ((uint64_t)*ecx << 32) | *edx);
1320 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1322 unsigned int eax, ebx, ecx, edx;
1324 eax = regs->eax;
1325 ebx = regs->ebx;
1326 ecx = regs->ecx;
1327 edx = regs->edx;
1329 vmx_cpuid_intercept(&eax, &ebx, &ecx, &edx);
1331 regs->eax = eax;
1332 regs->ebx = ebx;
1333 regs->ecx = ecx;
1334 regs->edx = edx;
1337 #define CASE_GET_REG_P(REG, reg) \
1338 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
1340 #ifdef __i386__
1341 #define CASE_EXTEND_GET_REG_P
1342 #else
1343 #define CASE_EXTEND_GET_REG_P \
1344 CASE_GET_REG_P(R8, r8); \
1345 CASE_GET_REG_P(R9, r9); \
1346 CASE_GET_REG_P(R10, r10); \
1347 CASE_GET_REG_P(R11, r11); \
1348 CASE_GET_REG_P(R12, r12); \
1349 CASE_GET_REG_P(R13, r13); \
1350 CASE_GET_REG_P(R14, r14); \
1351 CASE_GET_REG_P(R15, r15)
1352 #endif
1354 static void vmx_dr_access(unsigned long exit_qualification,
1355 struct cpu_user_regs *regs)
1357 struct vcpu *v = current;
1359 HVMTRACE_0D(DR_WRITE, v);
1361 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
1362 __restore_debug_registers(v);
1364 /* Allow guest direct access to DR registers */
1365 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1366 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
1369 /*
1370 * Invalidate the TLB entry for va, and invalidate the shadow page
1371 * corresponding to the address va.
1372 */
1373 static void vmx_do_invlpg(unsigned long va)
1375 struct vcpu *v = current;
1377 HVMTRACE_2D(INVLPG, v, /*invlpga=*/ 0, va);
1379 /*
1380 * We do the safest thing first, then try to update the shadow by
1381 * copying from the guest.
1382 */
1383 paging_invlpg(v, va);
1386 /* Get segment for OUTS according to guest instruction. */
1387 static enum x86_segment vmx_outs_get_segment(
1388 int long_mode, unsigned long eip, int inst_len)
1390 unsigned char inst[MAX_INST_LEN];
1391 enum x86_segment seg = x86_seg_ds;
1392 int i;
1393 extern int inst_copy_from_guest(unsigned char *, unsigned long, int);
1395 if ( likely(cpu_has_vmx_ins_outs_instr_info) )
1397 unsigned int instr_info = __vmread(VMX_INSTRUCTION_INFO);
1399 /* Get segment register according to bits 17:15. */
1400 switch ( (instr_info >> 15) & 7 )
1402 case 0: seg = x86_seg_es; break;
1403 case 1: seg = x86_seg_cs; break;
1404 case 2: seg = x86_seg_ss; break;
1405 case 3: seg = x86_seg_ds; break;
1406 case 4: seg = x86_seg_fs; break;
1407 case 5: seg = x86_seg_gs; break;
1408 default: BUG();
1411 goto out;
1414 if ( !long_mode )
1415 eip += __vmread(GUEST_CS_BASE);
1417 memset(inst, 0, MAX_INST_LEN);
1418 if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len )
1420 gdprintk(XENLOG_ERR, "Get guest instruction failed\n");
1421 domain_crash(current->domain);
1422 goto out;
1425 for ( i = 0; i < inst_len; i++ )
1427 switch ( inst[i] )
1429 case 0xf3: /* REPZ */
1430 case 0xf2: /* REPNZ */
1431 case 0xf0: /* LOCK */
1432 case 0x66: /* data32 */
1433 case 0x67: /* addr32 */
1434 #ifdef __x86_64__
1435 case 0x40 ... 0x4f: /* REX */
1436 #endif
1437 continue;
1438 case 0x2e: /* CS */
1439 seg = x86_seg_cs;
1440 continue;
1441 case 0x36: /* SS */
1442 seg = x86_seg_ss;
1443 continue;
1444 case 0x26: /* ES */
1445 seg = x86_seg_es;
1446 continue;
1447 case 0x64: /* FS */
1448 seg = x86_seg_fs;
1449 continue;
1450 case 0x65: /* GS */
1451 seg = x86_seg_gs;
1452 continue;
1453 case 0x3e: /* DS */
1454 seg = x86_seg_ds;
1455 continue;
1459 out:
1460 return seg;
1463 static int vmx_str_pio_check_descriptor(int long_mode, unsigned long eip,
1464 int inst_len, enum x86_segment seg,
1465 unsigned long *base, u32 *limit,
1466 u32 *ar_bytes)
1468 enum vmcs_field ar_field, base_field, limit_field;
1470 *base = 0;
1471 *limit = 0;
1472 if ( seg != x86_seg_es )
1473 seg = vmx_outs_get_segment(long_mode, eip, inst_len);
1475 switch ( seg )
1477 case x86_seg_cs:
1478 ar_field = GUEST_CS_AR_BYTES;
1479 base_field = GUEST_CS_BASE;
1480 limit_field = GUEST_CS_LIMIT;
1481 break;
1482 case x86_seg_ds:
1483 ar_field = GUEST_DS_AR_BYTES;
1484 base_field = GUEST_DS_BASE;
1485 limit_field = GUEST_DS_LIMIT;
1486 break;
1487 case x86_seg_es:
1488 ar_field = GUEST_ES_AR_BYTES;
1489 base_field = GUEST_ES_BASE;
1490 limit_field = GUEST_ES_LIMIT;
1491 break;
1492 case x86_seg_fs:
1493 ar_field = GUEST_FS_AR_BYTES;
1494 base_field = GUEST_FS_BASE;
1495 limit_field = GUEST_FS_LIMIT;
1496 break;
1497 case x86_seg_gs:
1498 ar_field = GUEST_GS_AR_BYTES;
1499 base_field = GUEST_GS_BASE;
1500 limit_field = GUEST_GS_LIMIT;
1501 break;
1502 case x86_seg_ss:
1503 ar_field = GUEST_SS_AR_BYTES;
1504 base_field = GUEST_SS_BASE;
1505 limit_field = GUEST_SS_LIMIT;
1506 break;
1507 default:
1508 BUG();
1509 return 0;
1512 if ( !long_mode || seg == x86_seg_fs || seg == x86_seg_gs )
1514 *base = __vmread(base_field);
1515 *limit = __vmread(limit_field);
1517 *ar_bytes = __vmread(ar_field);
1519 return !(*ar_bytes & X86_SEG_AR_SEG_UNUSABLE);
1523 static int vmx_str_pio_check_limit(u32 limit, unsigned int size,
1524 u32 ar_bytes, unsigned long addr,
1525 unsigned long base, int df,
1526 unsigned long *count)
1528 unsigned long ea = addr - base;
1530 /* Offset must be within limits. */
1531 ASSERT(ea == (u32)ea);
1532 if ( (u32)(ea + size - 1) < (u32)ea ||
1533 (ar_bytes & 0xc) != 0x4 ? ea + size - 1 > limit
1534 : ea <= limit )
1535 return 0;
1537 /* Check the limit for repeated instructions, as above we checked
1538 only the first instance. Truncate the count if a limit violation
1539 would occur. Note that the checking is not necessary for page
1540 granular segments as transfers crossing page boundaries will be
1541 broken up anyway. */
1542 if ( !(ar_bytes & X86_SEG_AR_GRANULARITY) && *count > 1 )
1544 if ( (ar_bytes & 0xc) != 0x4 )
1546 /* expand-up */
1547 if ( !df )
1549 if ( ea + *count * size - 1 < ea ||
1550 ea + *count * size - 1 > limit )
1551 *count = (limit + 1UL - ea) / size;
1553 else
1555 if ( *count - 1 > ea / size )
1556 *count = ea / size + 1;
1559 else
1561 /* expand-down */
1562 if ( !df )
1564 if ( *count - 1 > -(s32)ea / size )
1565 *count = -(s32)ea / size + 1UL;
1567 else
1569 if ( ea < (*count - 1) * size ||
1570 ea - (*count - 1) * size <= limit )
1571 *count = (ea - limit - 1) / size + 1;
1574 ASSERT(*count);
1577 return 1;
1580 #ifdef __x86_64__
1581 static int vmx_str_pio_lm_check_limit(struct cpu_user_regs *regs,
1582 unsigned int size,
1583 unsigned long addr,
1584 unsigned long *count)
1586 if ( !is_canonical_address(addr) ||
1587 !is_canonical_address(addr + size - 1) )
1588 return 0;
1590 if ( *count > (1UL << 48) / size )
1591 *count = (1UL << 48) / size;
1593 if ( !(regs->eflags & EF_DF) )
1595 if ( addr + *count * size - 1 < addr ||
1596 !is_canonical_address(addr + *count * size - 1) )
1597 *count = (addr & ~((1UL << 48) - 1)) / size;
1599 else
1601 if ( (*count - 1) * size > addr ||
1602 !is_canonical_address(addr + (*count - 1) * size) )
1603 *count = (addr & ~((1UL << 48) - 1)) / size + 1;
1606 ASSERT(*count);
1608 return 1;
1610 #endif
1612 static void vmx_send_str_pio(struct cpu_user_regs *regs,
1613 struct hvm_io_op *pio_opp,
1614 unsigned long inst_len, unsigned int port,
1615 int sign, unsigned int size, int dir,
1616 int df, unsigned long addr,
1617 paddr_t paddr, unsigned long count)
1619 /*
1620 * Handle string pio instructions that cross pages or that
1621 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1622 */
1623 if ( (addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK) ) {
1624 unsigned long value = 0;
1626 pio_opp->flags |= OVERLAP;
1628 if ( dir == IOREQ_WRITE ) /* OUTS */
1630 if ( hvm_paging_enabled(current) )
1632 int rv = hvm_copy_from_guest_virt(&value, addr, size);
1633 if ( rv == HVMCOPY_bad_gva_to_gfn )
1634 return; /* exception already injected */
1636 else
1637 (void)hvm_copy_from_guest_phys(&value, addr, size);
1639 else /* dir != IOREQ_WRITE */
1640 /* Remember where to write the result, as a *VA*.
1641 * Must be a VA so we can handle the page overlap
1642 * correctly in hvm_pio_assist() */
1643 pio_opp->addr = addr;
1645 if ( count == 1 )
1646 regs->eip += inst_len;
1648 send_pio_req(port, 1, size, value, dir, df, 0);
1649 } else {
1650 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1651 : addr - (count - 1) * size;
1653 if ( (addr & PAGE_MASK) != (last_addr & PAGE_MASK) )
1655 if ( sign > 0 )
1656 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1657 else
1658 count = (addr & ~PAGE_MASK) / size + 1;
1659 } else
1660 regs->eip += inst_len;
1662 send_pio_req(port, count, size, paddr, dir, df, 1);
1666 static void vmx_do_str_pio(unsigned long exit_qualification,
1667 unsigned long inst_len,
1668 struct cpu_user_regs *regs,
1669 struct hvm_io_op *pio_opp)
1671 unsigned int port, size;
1672 int dir, df, vm86;
1673 unsigned long addr, count = 1, base;
1674 paddr_t paddr;
1675 unsigned long gfn;
1676 u32 ar_bytes, limit, pfec;
1677 int sign;
1678 int long_mode = 0;
1680 vm86 = regs->eflags & X86_EFLAGS_VM ? 1 : 0;
1681 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
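/*
 * Decode the I/O-instruction exit qualification: bits 0-2 give the access
 * size minus one, bit 3 the direction (1 = IN), bit 5 a REP prefix, bit 6
 * whether the port was an immediate (else it is taken from %dx), and
 * bits 16-31 the immediate port number.
 */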
1683 if ( test_bit(6, &exit_qualification) )
1684 port = (exit_qualification >> 16) & 0xFFFF;
1685 else
1686 port = regs->edx & 0xffff;
1688 size = (exit_qualification & 7) + 1;
1689 dir = test_bit(3, &exit_qualification); /* direction */
1691 if ( dir == IOREQ_READ )
1692 HVMTRACE_2D(IO_READ, current, port, size);
1693 else
1694 HVMTRACE_2D(IO_WRITE, current, port, size);
1696 sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1697 ar_bytes = __vmread(GUEST_CS_AR_BYTES);
1698 if ( hvm_long_mode_enabled(current) &&
1699 (ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
1700 long_mode = 1;
1701 addr = __vmread(GUEST_LINEAR_ADDRESS);
1703 if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */
1704 pio_opp->flags |= REPZ;
1705 count = regs->ecx;
1706 if ( !long_mode &&
1707 (vm86 || !(ar_bytes & X86_SEG_AR_DEF_OP_SIZE)) )
1708 count &= 0xFFFF;
1711 /*
1712 * In protected mode, guest linear address is invalid if the
1713 * selector is null.
1714 */
1715 if ( !vmx_str_pio_check_descriptor(long_mode, regs->eip, inst_len,
1716 dir==IOREQ_WRITE ? x86_seg_ds :
1717 x86_seg_es, &base, &limit,
1718 &ar_bytes) ) {
1719 if ( !long_mode ) {
1720 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1721 return;
1723 addr = dir == IOREQ_WRITE ? base + regs->esi : regs->edi;
1726 if ( !long_mode )
1728 /* Segment must be readable for outs and writeable for ins. */
1729 if ( ((dir == IOREQ_WRITE)
1730 ? ((ar_bytes & 0xa) == 0x8)
1731 : ((ar_bytes & 0xa) != 0x2)) ||
1732 !vmx_str_pio_check_limit(limit, size, ar_bytes,
1733 addr, base, df, &count) )
1735 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1736 return;
1739 #ifdef __x86_64__
1740 else if ( !vmx_str_pio_lm_check_limit(regs, size, addr, &count) )
1742 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1743 return;
1745 #endif
1747 /* Translate the address to a physical address */
1748 pfec = PFEC_page_present;
1749 if ( dir == IOREQ_READ ) /* Read from PIO --> write to RAM */
1750 pfec |= PFEC_write_access;
1751 if ( ((__vmread(GUEST_SS_AR_BYTES) >> 5) & 3) == 3 )
1752 pfec |= PFEC_user_mode;
1753 gfn = paging_gva_to_gfn(current, addr, &pfec);
1754 if ( gfn == INVALID_GFN )
1756 /* The guest does not have the RAM address mapped.
1757 * Need to send in a page fault */
1758 vmx_inject_exception(TRAP_page_fault, pfec, addr);
1759 return;
1761 paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);
1763 vmx_send_str_pio(regs, pio_opp, inst_len, port, sign,
1764 size, dir, df, addr, paddr, count);
1767 static void vmx_io_instruction(unsigned long exit_qualification,
1768 unsigned long inst_len)
1770 struct cpu_user_regs *regs;
1771 struct hvm_io_op *pio_opp;
1773 pio_opp = &current->arch.hvm_vcpu.io_op;
1774 pio_opp->instr = INSTR_PIO;
1775 pio_opp->flags = 0;
1777 regs = &pio_opp->io_context;
1779 /* Copy current guest state into io instruction state structure. */
1780 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1782 HVM_DBG_LOG(DBG_LEVEL_IO, "vm86 %d, eip=%x:%lx, "
1783 "exit_qualification = %lx",
1784 regs->eflags & X86_EFLAGS_VM ? 1 : 0,
1785 regs->cs, (unsigned long)regs->eip, exit_qualification);
1787 if ( test_bit(4, &exit_qualification) ) /* string instruction */
1788 vmx_do_str_pio(exit_qualification, inst_len, regs, pio_opp);
1789 else
1791 unsigned int port, size;
1792 int dir, df;
1794 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1796 if ( test_bit(6, &exit_qualification) )
1797 port = (exit_qualification >> 16) & 0xFFFF;
1798 else
1799 port = regs->edx & 0xffff;
1801 size = (exit_qualification & 7) + 1;
1802 dir = test_bit(3, &exit_qualification); /* direction */
1804 if ( dir == IOREQ_READ )
1805 HVMTRACE_2D(IO_READ, current, port, size);
1806 else
1807 HVMTRACE_3D(IO_WRITE, current, port, size, regs->eax);
1809 if ( port == 0xe9 && dir == IOREQ_WRITE && size == 1 )
1810 hvm_print_line(current, regs->eax); /* guest debug output */
1812 regs->eip += inst_len;
1813 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1817 #ifdef VMXASSIST
1819 static void vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1821 struct cpu_user_regs *regs = guest_cpu_user_regs();
1823 c->eip = regs->eip;
1824 c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
1825 c->esp = regs->esp;
1826 c->eflags = regs->eflags & ~X86_EFLAGS_RF;
1828 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
1829 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
1830 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
1832 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
1833 c->idtr_base = __vmread(GUEST_IDTR_BASE);
1835 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
1836 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
1838 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
1839 c->cs_limit = __vmread(GUEST_CS_LIMIT);
1840 c->cs_base = __vmread(GUEST_CS_BASE);
1841 c->cs_arbytes.bytes = __vmread(GUEST_CS_AR_BYTES);
1843 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
1844 c->ds_limit = __vmread(GUEST_DS_LIMIT);
1845 c->ds_base = __vmread(GUEST_DS_BASE);
1846 c->ds_arbytes.bytes = __vmread(GUEST_DS_AR_BYTES);
1848 c->es_sel = __vmread(GUEST_ES_SELECTOR);
1849 c->es_limit = __vmread(GUEST_ES_LIMIT);
1850 c->es_base = __vmread(GUEST_ES_BASE);
1851 c->es_arbytes.bytes = __vmread(GUEST_ES_AR_BYTES);
1853 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
1854 c->ss_limit = __vmread(GUEST_SS_LIMIT);
1855 c->ss_base = __vmread(GUEST_SS_BASE);
1856 c->ss_arbytes.bytes = __vmread(GUEST_SS_AR_BYTES);
1858 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
1859 c->fs_limit = __vmread(GUEST_FS_LIMIT);
1860 c->fs_base = __vmread(GUEST_FS_BASE);
1861 c->fs_arbytes.bytes = __vmread(GUEST_FS_AR_BYTES);
1863 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
1864 c->gs_limit = __vmread(GUEST_GS_LIMIT);
1865 c->gs_base = __vmread(GUEST_GS_BASE);
1866 c->gs_arbytes.bytes = __vmread(GUEST_GS_AR_BYTES);
1868 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
1869 c->tr_limit = __vmread(GUEST_TR_LIMIT);
1870 c->tr_base = __vmread(GUEST_TR_BASE);
1871 c->tr_arbytes.bytes = __vmread(GUEST_TR_AR_BYTES);
1873 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
1874 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
1875 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
1876 c->ldtr_arbytes.bytes = __vmread(GUEST_LDTR_AR_BYTES);
1879 static int vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1881 struct cpu_user_regs *regs = guest_cpu_user_regs();
1882 int rc;
1884 rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3);
1885 if ( rc )
1886 return rc;
1888 regs->eip = c->eip;
1889 regs->esp = c->esp;
1890 regs->eflags = c->eflags | 2;
1892 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
1893 vmx_update_guest_cr(v, 0);
1894 vmx_update_guest_cr(v, 4);
1896 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1897 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1899 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1900 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1902 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1903 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1904 __vmwrite(GUEST_CS_BASE, c->cs_base);
1905 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1907 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1908 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1909 __vmwrite(GUEST_DS_BASE, c->ds_base);
1910 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1912 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1913 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1914 __vmwrite(GUEST_ES_BASE, c->es_base);
1915 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1917 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1918 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1919 __vmwrite(GUEST_SS_BASE, c->ss_base);
1920 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1922 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1923 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1924 __vmwrite(GUEST_FS_BASE, c->fs_base);
1925 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1927 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1928 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1929 __vmwrite(GUEST_GS_BASE, c->gs_base);
1930 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1932 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1933 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1934 __vmwrite(GUEST_TR_BASE, c->tr_base);
1935 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1937 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1938 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1939 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1940 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1942 paging_update_paging_modes(v);
1943 return 0;
1946 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1948 static int vmx_assist(struct vcpu *v, int mode)
1950 struct vmx_assist_context c;
1951 struct hvm_hw_vpic *vpic = v->domain->arch.hvm_domain.vpic;
1952 u32 magic, cp;
1954 if ( hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
1955 sizeof(magic)) )
1957 gdprintk(XENLOG_ERR, "No vmxassist: can't execute real mode code\n");
1958 domain_crash(v->domain);
1959 return 0;
1962 if ( magic != VMXASSIST_MAGIC )
1964 gdprintk(XENLOG_ERR, "vmxassist magic number does not match\n");
1965 domain_crash(v->domain);
1966 return 0;
1969 switch ( mode ) {
1970 /*
1971 * Transfer control to vmxassist.
1972 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1973 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1974 * by vmxassist and will transfer control to it.
1975 */
1976 case VMX_ASSIST_INVOKE:
1977 /* save the old context */
1978 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)) )
1979 goto error;
1980 if ( cp != 0 ) {
1981 vmx_world_save(v, &c);
1982 if ( hvm_copy_to_guest_phys(cp, &c, sizeof(c)) )
1983 goto error;
1986 /* restore the new context, this should activate vmxassist */
1987 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)) )
1988 goto error;
1989 if ( cp != 0 ) {
1990 if ( hvm_copy_from_guest_phys(&c, cp, sizeof(c)) )
1991 goto error;
1992 if ( vmx_world_restore(v, &c) != 0 )
1993 goto error;
1994 v->arch.hvm_vmx.pm_irqbase[0] = vpic[0].irq_base;
1995 v->arch.hvm_vmx.pm_irqbase[1] = vpic[1].irq_base;
1996 vpic[0].irq_base = NR_EXCEPTION_HANDLER;
1997 vpic[1].irq_base = NR_EXCEPTION_HANDLER + 8;
1998 v->arch.hvm_vmx.vmxassist_enabled = 1;
1999 return 1;
2001 break;
2003 /*
2004 * Restore the VMXASSIST_OLD_CONTEXT that was saved by
2005 * VMX_ASSIST_INVOKE above.
2006 */
2007 case VMX_ASSIST_RESTORE:
2008 /* reload the previously saved context */
2009 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)) )
2010 goto error;
2011 if ( cp != 0 ) {
2012 if ( hvm_copy_from_guest_phys(&c, cp, sizeof(c)) )
2013 goto error;
2014 if ( vmx_world_restore(v, &c) != 0 )
2015 goto error;
2016 if ( v->arch.hvm_vmx.irqbase_mode ) {
2017 vpic[0].irq_base = c.rm_irqbase[0] & 0xf8;
2018 vpic[1].irq_base = c.rm_irqbase[1] & 0xf8;
2019 } else {
2020 vpic[0].irq_base = v->arch.hvm_vmx.pm_irqbase[0];
2021 vpic[1].irq_base = v->arch.hvm_vmx.pm_irqbase[1];
2023 v->arch.hvm_vmx.vmxassist_enabled = 0;
2024 return 1;
2026 break;
2029 error:
2030 gdprintk(XENLOG_ERR, "Failed to transfer to vmxassist\n");
2031 domain_crash(v->domain);
2032 return 0;
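/*
 * Note on vmx_assist() above: real-mode emulation is delegated to the
 * vmxassist code resident in guest memory. The magic value and the two
 * context pointers are fetched from fixed guest-physical locations
 * (VMXASSIST_MAGIC_OFFSET, VMXASSIST_OLD_CONTEXT, VMXASSIST_NEW_CONTEXT).
 * VMX_ASSIST_INVOKE saves the current world state through the old-context
 * pointer and loads the context published by vmxassist; VMX_ASSIST_RESTORE
 * reloads that saved world state. The vpic irq_base shuffling keeps the
 * virtual PIC's vector bases consistent across the switch.
 */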
2035 static int vmx_set_cr0(unsigned long value)
2037 struct vcpu *v = current;
2039 if ( hvm_set_cr0(value) == 0 )
2040 return 0;
2042 /*
2043 * VMX does not implement real-mode virtualization. We emulate
2044 * real-mode by performing a world switch to VMXAssist whenever
2045 * a partition disables the CR0.PE bit.
2046 */
2047 if ( !(value & X86_CR0_PE) )
2049 if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
2050 return 0; /* do not update eip! */
2052 else if ( v->arch.hvm_vmx.vmxassist_enabled )
2054 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
2055 return 0; /* do not update eip! */
2058 return 1;
2061 #else /* !defined(VMXASSIST) */
2063 #define vmx_set_cr0(v) hvm_set_cr0(v)
2065 #endif
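/*
 * Note: with VMXASSIST configured, vmx_set_cr0() above drives the world
 * switch: clearing CR0.PE hands control to vmxassist (VMX_ASSIST_INVOKE),
 * and setting it again while vmxassist is active returns to the saved
 * protected-mode context (VMX_ASSIST_RESTORE). In both cases EIP is left
 * untouched, since the restored context already carries the instruction
 * pointer to resume at. Without VMXASSIST, CR0 writes go straight to the
 * generic hvm_set_cr0().
 */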
2067 #define CASE_SET_REG(REG, reg) \
2068 case REG_ ## REG: regs->reg = value; break
2069 #define CASE_GET_REG(REG, reg) \
2070 case REG_ ## REG: value = regs->reg; break
2072 #define CASE_EXTEND_SET_REG \
2073 CASE_EXTEND_REG(S)
2074 #define CASE_EXTEND_GET_REG \
2075 CASE_EXTEND_REG(G)
2077 #ifdef __i386__
2078 #define CASE_EXTEND_REG(T)
2079 #else
2080 #define CASE_EXTEND_REG(T) \
2081 CASE_ ## T ## ET_REG(R8, r8); \
2082 CASE_ ## T ## ET_REG(R9, r9); \
2083 CASE_ ## T ## ET_REG(R10, r10); \
2084 CASE_ ## T ## ET_REG(R11, r11); \
2085 CASE_ ## T ## ET_REG(R12, r12); \
2086 CASE_ ## T ## ET_REG(R13, r13); \
2087 CASE_ ## T ## ET_REG(R14, r14); \
2088 CASE_ ## T ## ET_REG(R15, r15)
2089 #endif
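/*
 * Illustration (not from the original source): CASE_GET_REG(EAX, eax) in
 * mov_to_cr() below expands to
 *
 *     case REG_EAX: value = regs->eax; break;
 *
 * and CASE_EXTEND_GET_REG pulls in the same pattern for r8..r15 on x86-64,
 * while expanding to nothing on i386.
 */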
2091 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
2093 unsigned long value;
2094 struct vcpu *v = current;
2095 struct vlapic *vlapic = vcpu_vlapic(v);
2097 switch ( gp )
2099 CASE_GET_REG(EAX, eax);
2100 CASE_GET_REG(ECX, ecx);
2101 CASE_GET_REG(EDX, edx);
2102 CASE_GET_REG(EBX, ebx);
2103 CASE_GET_REG(EBP, ebp);
2104 CASE_GET_REG(ESI, esi);
2105 CASE_GET_REG(EDI, edi);
2106 CASE_GET_REG(ESP, esp);
2107 CASE_EXTEND_GET_REG;
2108 default:
2109 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2110 goto exit_and_crash;
2113 HVMTRACE_2D(CR_WRITE, v, cr, value);
2115 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
2117 switch ( cr )
2119 case 0:
2120 return vmx_set_cr0(value);
2122 case 3:
2123 return hvm_set_cr3(value);
2125 case 4:
2126 return hvm_set_cr4(value);
2128 case 8:
2129 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
2130 break;
2132 default:
2133 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2134 goto exit_and_crash;
2137 return 1;
2139 exit_and_crash:
2140 domain_crash(v->domain);
2141 return 0;
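/*
 * Note on the CR8 case above: CR8 holds the task-priority class in its low
 * four bits, while the local APIC TPR register keeps that class in bits 7:4.
 * Hence the (value & 0x0F) << 4 when forwarding a CR8 write to APIC_TASKPRI;
 * e.g. (illustrative value) CR8 = 5 becomes TASKPRI = 0x50. mov_from_cr()
 * below applies the inverse shift.
 */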
2144 /* Read from control registers. Hardware satisfies CR0 and CR4 reads
2145 * from the read shadows without a vmexit, so only CR3 and CR8 reads
2146 * reach this handler. */
2147 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
2149 unsigned long value = 0;
2150 struct vcpu *v = current;
2151 struct vlapic *vlapic = vcpu_vlapic(v);
2153 switch ( cr )
2155 case 3:
2156 value = (unsigned long)v->arch.hvm_vcpu.guest_cr[3];
2157 break;
2158 case 8:
2159 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
2160 value = (value & 0xF0) >> 4;
2161 break;
2162 default:
2163 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2164 domain_crash(v->domain);
2165 break;
2168 switch ( gp ) {
2169 CASE_SET_REG(EAX, eax);
2170 CASE_SET_REG(ECX, ecx);
2171 CASE_SET_REG(EDX, edx);
2172 CASE_SET_REG(EBX, ebx);
2173 CASE_SET_REG(EBP, ebp);
2174 CASE_SET_REG(ESI, esi);
2175 CASE_SET_REG(EDI, edi);
2176 CASE_SET_REG(ESP, esp);
2177 CASE_EXTEND_SET_REG;
2178 default:
2179 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2180 domain_crash(v->domain);
2181 break;
2184 HVMTRACE_2D(CR_READ, v, cr, value);
2186 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
2189 static int vmx_cr_access(unsigned long exit_qualification,
2190 struct cpu_user_regs *regs)
2192 unsigned int gp, cr;
2193 unsigned long value;
2194 struct vcpu *v = current;
2196 switch ( exit_qualification & CONTROL_REG_ACCESS_TYPE )
2198 case TYPE_MOV_TO_CR:
2199 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2200 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2201 return mov_to_cr(gp, cr, regs);
2202 case TYPE_MOV_FROM_CR:
2203 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2204 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2205 mov_from_cr(cr, gp, regs);
2206 break;
2207 case TYPE_CLTS:
2208 /* We initialise the FPU now, to avoid needing another vmexit. */
2209 setup_fpu(v);
2210 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
2212 v->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS; /* clear TS */
2213 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
2215 v->arch.hvm_vcpu.guest_cr[0] &= ~X86_CR0_TS; /* clear TS */
2216 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
2217 HVMTRACE_0D(CLTS, current);
2218 break;
2219 case TYPE_LMSW:
2220 value = v->arch.hvm_vcpu.guest_cr[0];
2221 value = (value & ~0xF) |
2222 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
2223 HVMTRACE_1D(LMSW, current, value);
2224 return vmx_set_cr0(value);
2225 default:
2226 BUG();
2229 return 1;
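/*
 * Note on vmx_cr_access() above: the VM-exit qualification packs the CR
 * number, the general-purpose register involved and the access type
 * (MOV to CR, MOV from CR, CLTS or LMSW); the CONTROL_REG_ACCESS_* masks
 * extract those fields. For LMSW the 16-bit source operand sits in the
 * upper half of the qualification, which is why the handler shifts the
 * LMSW_SOURCE_DATA field right by 16 and merges only its low four bits
 * into CR0.
 */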
2232 static const struct lbr_info {
2233 u32 base, count;
2234 } p4_lbr[] = {
2235 { MSR_P4_LER_FROM_LIP, 1 },
2236 { MSR_P4_LER_TO_LIP, 1 },
2237 { MSR_P4_LASTBRANCH_TOS, 1 },
2238 { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
2239 { MSR_P4_LASTBRANCH_0_TO_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
2240 { 0, 0 }
2241 }, c2_lbr[] = {
2242 { MSR_IA32_LASTINTFROMIP, 1 },
2243 { MSR_IA32_LASTINTTOIP, 1 },
2244 { MSR_C2_LASTBRANCH_TOS, 1 },
2245 { MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
2246 { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
2247 { 0, 0 }
2248 #ifdef __i386__
2249 }, pm_lbr[] = {
2250 { MSR_IA32_LASTINTFROMIP, 1 },
2251 { MSR_IA32_LASTINTTOIP, 1 },
2252 { MSR_PM_LASTBRANCH_TOS, 1 },
2253 { MSR_PM_LASTBRANCH_0, NUM_MSR_PM_LASTBRANCH },
2254 { 0, 0 }
2255 #endif
2256 };
2258 static const struct lbr_info *last_branch_msr_get(void)
2260 switch ( boot_cpu_data.x86 )
2262 case 6:
2263 switch ( boot_cpu_data.x86_model )
2265 #ifdef __i386__
2266 /* PentiumM */
2267 case 9: case 13:
2268 /* Core Solo/Duo */
2269 case 14:
2270 return pm_lbr;
2271 break;
2272 #endif
2273 /* Core2 Duo */
2274 case 15:
2275 return c2_lbr;
2276 break;
2278 break;
2280 case 15:
2281 switch ( boot_cpu_data.x86_model )
2283 /* Pentium4/Xeon with em64t */
2284 case 3: case 4: case 6:
2285 return p4_lbr;
2286 break;
2288 break;
2291 return NULL;
2294 static int is_last_branch_msr(u32 ecx)
2296 const struct lbr_info *lbr = last_branch_msr_get();
2298 if ( lbr == NULL )
2299 return 0;
2301 for ( ; lbr->count; lbr++ )
2302 if ( (ecx >= lbr->base) && (ecx < (lbr->base + lbr->count)) )
2303 return 1;
2305 return 0;
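/*
 * Note: the lbr_info tables above describe, per CPU family/model, the MSR
 * ranges that make up the last-branch-record stack (a base MSR plus a run
 * of "count" consecutive MSRs). last_branch_msr_get() picks the table
 * matching the host CPU, and is_last_branch_msr() answers whether a given
 * MSR index falls inside one of those ranges; both feed the DEBUGCTL
 * handling in vmx_do_msr_write() below.
 */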
2308 static int vmx_do_msr_read(struct cpu_user_regs *regs)
2310 u64 msr_content = 0;
2311 u32 ecx = regs->ecx, eax, edx;
2312 struct vcpu *v = current;
2313 int index;
2314 u64 *var_range_base = (u64*)v->arch.hvm_vcpu.mtrr.var_ranges;
2315 u64 *fixed_range_base = (u64*)v->arch.hvm_vcpu.mtrr.fixed_ranges;
2317 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);
2319 switch ( ecx )
2321 case MSR_IA32_TSC:
2322 msr_content = hvm_get_guest_time(v);
2323 break;
2324 case MSR_IA32_SYSENTER_CS:
2325 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
2326 break;
2327 case MSR_IA32_SYSENTER_ESP:
2328 msr_content = __vmread(GUEST_SYSENTER_ESP);
2329 break;
2330 case MSR_IA32_SYSENTER_EIP:
2331 msr_content = __vmread(GUEST_SYSENTER_EIP);
2332 break;
2333 case MSR_IA32_APICBASE:
2334 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2335 break;
2336 case MSR_IA32_CR_PAT:
2337 msr_content = v->arch.hvm_vcpu.pat_cr;
2338 break;
2339 case MSR_MTRRcap:
2340 msr_content = v->arch.hvm_vcpu.mtrr.mtrr_cap;
2341 break;
2342 case MSR_MTRRdefType:
2343 msr_content = v->arch.hvm_vcpu.mtrr.def_type
2344 | (v->arch.hvm_vcpu.mtrr.enabled << 10);
2345 break;
2346 case MSR_MTRRfix64K_00000:
2347 msr_content = fixed_range_base[0];
2348 break;
2349 case MSR_MTRRfix16K_80000:
2350 case MSR_MTRRfix16K_A0000:
2351 index = regs->ecx - MSR_MTRRfix16K_80000;
2352 msr_content = fixed_range_base[index + 1];
2353 break;
2354 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
2355 index = regs->ecx - MSR_MTRRfix4K_C0000;
2356 msr_content = fixed_range_base[index + 3];
2357 break;
2358 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
2359 index = regs->ecx - MSR_IA32_MTRR_PHYSBASE0;
2360 msr_content = var_range_base[index];
2361 break;
2362 case MSR_IA32_DEBUGCTLMSR:
2363 if ( vmx_read_guest_msr(v, ecx, &msr_content) != 0 )
2364 msr_content = 0;
2365 break;
2366 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
2367 goto gp_fault;
2368 case MSR_IA32_MCG_CAP:
2369 case MSR_IA32_MCG_STATUS:
2370 case MSR_IA32_MC0_STATUS:
2371 case MSR_IA32_MC1_STATUS:
2372 case MSR_IA32_MC2_STATUS:
2373 case MSR_IA32_MC3_STATUS:
2374 case MSR_IA32_MC4_STATUS:
2375 case MSR_IA32_MC5_STATUS:
2376 /* No point in letting the guest see real MCEs */
2377 msr_content = 0;
2378 break;
2379 default:
2380 switch ( long_mode_do_msr_read(regs) )
2382 case HNDL_unhandled:
2383 break;
2384 case HNDL_exception_raised:
2385 return 0;
2386 case HNDL_done:
2387 goto done;
2390 if ( vmx_read_guest_msr(v, ecx, &msr_content) == 0 )
2391 break;
2393 if ( is_last_branch_msr(ecx) )
2395 msr_content = 0;
2396 break;
2399 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2400 rdmsr_safe(ecx, eax, edx) == 0 )
2402 regs->eax = eax;
2403 regs->edx = edx;
2404 goto done;
2407 goto gp_fault;
2410 regs->eax = msr_content & 0xFFFFFFFF;
2411 regs->edx = msr_content >> 32;
2413 done:
2414 hvmtrace_msr_read(v, ecx, msr_content);
2415 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2416 ecx, (unsigned long)regs->eax,
2417 (unsigned long)regs->edx);
2418 return 1;
2420 gp_fault:
2421 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2422 return 0;
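/*
 * Note on vmx_do_msr_read() above: RDMSR returns a 64-bit value split
 * across EDX:EAX, so the handler finishes with
 *
 *     regs->eax = msr_content & 0xFFFFFFFF;   // low 32 bits
 *     regs->edx = msr_content >> 32;          // high 32 bits
 *
 * For example (illustrative value), msr_content = 0x0000001200000034
 * yields eax = 0x00000034 and edx = 0x00000012.
 */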
2425 static int vmx_alloc_vlapic_mapping(struct domain *d)
2427 void *apic_va;
2429 if ( !cpu_has_vmx_virtualize_apic_accesses )
2430 return 0;
2432 apic_va = alloc_xenheap_page();
2433 if ( apic_va == NULL )
2434 return -ENOMEM;
2435 share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable);
2436 set_mmio_p2m_entry(
2437 d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(virt_to_mfn(apic_va)));
2438 d->arch.hvm_domain.vmx_apic_access_mfn = virt_to_mfn(apic_va);
2440 return 0;
2443 static void vmx_free_vlapic_mapping(struct domain *d)
2445 unsigned long mfn = d->arch.hvm_domain.vmx_apic_access_mfn;
2446 if ( mfn != 0 )
2447 free_xenheap_page(mfn_to_virt(mfn));
2450 static void vmx_install_vlapic_mapping(struct vcpu *v)
2452 paddr_t virt_page_ma, apic_page_ma;
2454 if ( !cpu_has_vmx_virtualize_apic_accesses )
2455 return;
2457 virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
2458 apic_page_ma = v->domain->arch.hvm_domain.vmx_apic_access_mfn;
2459 apic_page_ma <<= PAGE_SHIFT;
2461 vmx_vmcs_enter(v);
2462 __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
2463 __vmwrite(APIC_ACCESS_ADDR, apic_page_ma);
2464 vmx_vmcs_exit(v);
2467 void vmx_vlapic_msr_changed(struct vcpu *v)
2469 struct vlapic *vlapic = vcpu_vlapic(v);
2470 uint32_t ctl;
2472 if ( !cpu_has_vmx_virtualize_apic_accesses )
2473 return;
2475 vmx_vmcs_enter(v);
2476 ctl = __vmread(SECONDARY_VM_EXEC_CONTROL);
2477 ctl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2478 if ( !vlapic_hw_disabled(vlapic) &&
2479 (vlapic_base_address(vlapic) == APIC_DEFAULT_PHYS_BASE) )
2480 ctl |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2481 __vmwrite(SECONDARY_VM_EXEC_CONTROL, ctl);
2482 vmx_vmcs_exit(v);
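/*
 * Note on the vlapic mapping helpers above: when the CPU supports
 * "virtualize APIC accesses", a single shared xenheap page is installed in
 * the p2m at APIC_DEFAULT_PHYS_BASE and its machine address is written to
 * APIC_ACCESS_ADDR, so guest accesses to the default APIC page trap with
 * EXIT_REASON_APIC_ACCESS (handled via handle_mmio() in the exit handler
 * below). vmx_vlapic_msr_changed() turns the secondary exec control off
 * whenever the guest disables its APIC or relocates it away from the
 * default base.
 */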
2485 extern bool_t mtrr_var_range_msr_set(struct mtrr_state *v,
2486 u32 msr, u64 msr_content);
2487 extern bool_t mtrr_fix_range_msr_set(struct mtrr_state *v,
2488 int row, u64 msr_content);
2489 extern bool_t mtrr_def_type_msr_set(struct mtrr_state *v, u64 msr_content);
2490 extern bool_t pat_msr_set(u64 *pat, u64 msr);
2492 static int vmx_do_msr_write(struct cpu_user_regs *regs)
2494 u32 ecx = regs->ecx;
2495 u64 msr_content;
2496 struct vcpu *v = current;
2497 int index;
2499 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2500 ecx, (u32)regs->eax, (u32)regs->edx);
2502 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2504 hvmtrace_msr_write(v, ecx, msr_content);
2506 switch ( ecx )
2508 case MSR_IA32_TSC:
2509 hvm_set_guest_time(v, msr_content);
2510 pt_reset(v);
2511 break;
2512 case MSR_IA32_SYSENTER_CS:
2513 __vmwrite(GUEST_SYSENTER_CS, msr_content);
2514 break;
2515 case MSR_IA32_SYSENTER_ESP:
2516 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
2517 break;
2518 case MSR_IA32_SYSENTER_EIP:
2519 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
2520 break;
2521 case MSR_IA32_APICBASE:
2522 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2523 break;
2524 case MSR_IA32_CR_PAT:
2525 if ( !pat_msr_set(&v->arch.hvm_vcpu.pat_cr, msr_content) )
2526 goto gp_fault;
2527 break;
2528 case MSR_MTRRdefType:
2529 if ( !mtrr_def_type_msr_set(&v->arch.hvm_vcpu.mtrr, msr_content) )
2530 goto gp_fault;
2531 break;
2532 case MSR_MTRRfix64K_00000:
2533 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr, 0, msr_content) )
2534 goto gp_fault;
2535 break;
2536 case MSR_MTRRfix16K_80000:
2537 case MSR_MTRRfix16K_A0000:
2538 index = regs->ecx - MSR_MTRRfix16K_80000 + 1;
2539 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2540 index, msr_content) )
2541 goto gp_fault;
2542 break;
2543 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
2544 index = regs->ecx - MSR_MTRRfix4K_C0000 + 3;
2545 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2546 index, msr_content) )
2547 goto gp_fault;
2548 break;
2549 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
2550 if ( !mtrr_var_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2551 regs->ecx, msr_content) )
2552 goto gp_fault;
2553 break;
2554 case MSR_MTRRcap:
2555 goto gp_fault;
2556 case MSR_IA32_DEBUGCTLMSR: {
2557 int i, rc = 0;
2559 if ( !msr_content || (msr_content & ~3) )
2560 break;
2562 if ( msr_content & 1 )
2564 const struct lbr_info *lbr = last_branch_msr_get();
2565 if ( lbr == NULL )
2566 break;
2568 for ( ; (rc == 0) && lbr->count; lbr++ )
2569 for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
2570 if ( (rc = vmx_add_guest_msr(v, lbr->base + i)) == 0 )
2571 vmx_disable_intercept_for_msr(v, lbr->base + i);
2574 if ( (rc < 0) ||
2575 (vmx_add_guest_msr(v, ecx) < 0) ||
2576 (vmx_add_host_load_msr(v, ecx) < 0) )
2577 vmx_inject_hw_exception(v, TRAP_machine_check, 0);
2578 else
2579 vmx_write_guest_msr(v, ecx, msr_content);
2581 break;
2583 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
2584 goto gp_fault;
2585 default:
2586 switch ( long_mode_do_msr_write(regs) )
2588 case HNDL_unhandled:
2589 if ( (vmx_write_guest_msr(v, ecx, msr_content) != 0) &&
2590 !is_last_branch_msr(ecx) )
2591 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2592 break;
2593 case HNDL_exception_raised:
2594 return 0;
2595 case HNDL_done:
2596 break;
2598 break;
2601 return 1;
2603 gp_fault:
2604 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2605 return 0;
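/*
 * Note on vmx_do_msr_write() above: WRMSR supplies the 64-bit payload in
 * EDX:EAX, reassembled as (u32)regs->eax | ((u64)regs->edx << 32). The
 * MSR_IA32_DEBUGCTLMSR case is the interesting one: when the guest enables
 * last-branch recording (bit 0), every LBR MSR reported by
 * last_branch_msr_get() is added to the guest MSR save/restore area and its
 * intercept is dropped, so subsequent LBR accesses no longer vmexit.
 * Unknown MSRs fall through to the long-mode handler and, failing that, to
 * the hypervisor-register path.
 */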
2608 static void vmx_do_hlt(struct cpu_user_regs *regs)
2610 unsigned long intr_info = __vmread(VM_ENTRY_INTR_INFO);
2611 struct vcpu *curr = current;
2613 /* Check for pending exception. */
2614 if ( intr_info & INTR_INFO_VALID_MASK )
2616 HVMTRACE_1D(HLT, curr, /*int pending=*/ 1);
2617 return;
2620 HVMTRACE_1D(HLT, curr, /*int pending=*/ 0);
2621 hvm_hlt(regs->eflags);
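/*
 * Note on vmx_do_hlt() above: if an event is already queued for injection
 * (VM_ENTRY_INTR_INFO valid), HLT must not put the vcpu to sleep -- the
 * pending event would wake it immediately anyway -- so the handler only
 * traces and returns. Otherwise hvm_hlt() blocks the vcpu, consulting the
 * guest's EFLAGS to decide whether an interrupt can ever wake it.
 */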
2624 static void vmx_do_extint(struct cpu_user_regs *regs)
2626 unsigned int vector;
2628 asmlinkage void do_IRQ(struct cpu_user_regs *);
2629 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
2630 fastcall void smp_event_check_interrupt(void);
2631 fastcall void smp_invalidate_interrupt(void);
2632 fastcall void smp_call_function_interrupt(void);
2633 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
2634 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
2635 #ifdef CONFIG_X86_MCE_P4THERMAL
2636 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
2637 #endif
2639 vector = __vmread(VM_EXIT_INTR_INFO);
2640 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
2642 vector &= INTR_INFO_VECTOR_MASK;
2643 HVMTRACE_1D(INTR, current, vector);
2645 switch ( vector )
2647 case LOCAL_TIMER_VECTOR:
2648 smp_apic_timer_interrupt(regs);
2649 break;
2650 case EVENT_CHECK_VECTOR:
2651 smp_event_check_interrupt();
2652 break;
2653 case INVALIDATE_TLB_VECTOR:
2654 smp_invalidate_interrupt();
2655 break;
2656 case CALL_FUNCTION_VECTOR:
2657 smp_call_function_interrupt();
2658 break;
2659 case SPURIOUS_APIC_VECTOR:
2660 smp_spurious_interrupt(regs);
2661 break;
2662 case ERROR_APIC_VECTOR:
2663 smp_error_interrupt(regs);
2664 break;
2665 #ifdef CONFIG_X86_MCE_P4THERMAL
2666 case THERMAL_APIC_VECTOR:
2667 smp_thermal_interrupt(regs);
2668 break;
2669 #endif
2670 default:
2671 regs->entry_vector = vector;
2672 do_IRQ(regs);
2673 break;
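/*
 * Note on vmx_do_extint() above: a host interrupt arriving while the guest
 * runs causes EXIT_REASON_EXTERNAL_INTERRUPT, and the vector is recovered
 * from VM_EXIT_INTR_INFO. Xen's own handlers are then invoked directly,
 * just as on the native interrupt path; vectors not claimed by an IPI or
 * APIC handler are routed through do_IRQ().
 */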
2677 static void wbinvd_ipi(void *info)
2679 wbinvd();
2682 void vmx_wbinvd_intercept(void)
2684 if ( list_empty(&(domain_hvm_iommu(current->domain)->pdev_list)) )
2685 return;
2687 if ( cpu_has_wbinvd_exiting )
2688 on_each_cpu(wbinvd_ipi, NULL, 1, 1);
2689 else
2690 wbinvd();
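/*
 * Note on vmx_wbinvd_intercept() above: the flush is only performed when
 * the domain has passthrough devices (non-empty IOMMU pdev_list),
 * presumably because DMA could otherwise observe stale cache lines. With
 * WBINVD exiting available, the flush is broadcast to every CPU via IPI;
 * without it, only the local CPU is flushed.
 */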
2693 static void vmx_failed_vmentry(unsigned int exit_reason,
2694 struct cpu_user_regs *regs)
2696 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2697 unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
2698 struct vcpu *curr = current;
2700 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2701 switch ( failed_vmentry_reason )
2703 case EXIT_REASON_INVALID_GUEST_STATE:
2704 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2705 break;
2706 case EXIT_REASON_MSR_LOADING:
2707 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2708 break;
2709 case EXIT_REASON_MACHINE_CHECK:
2710 printk("caused by machine check.\n");
2711 HVMTRACE_0D(MCE, curr);
2712 do_machine_check(regs);
2713 break;
2714 default:
2715 printk("reason not known yet!\n");
2716 break;
2719 printk("************* VMCS Area **************\n");
2720 vmcs_dump_vcpu(curr);
2721 printk("**************************************\n");
2723 domain_crash(curr->domain);
2726 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2728 unsigned int exit_reason, idtv_info;
2729 unsigned long exit_qualification, inst_len = 0;
2730 struct vcpu *v = current;
2732 exit_reason = __vmread(VM_EXIT_REASON);
2734 hvmtrace_vmexit(v, regs->eip, exit_reason);
2736 perfc_incra(vmexits, exit_reason);
2738 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2739 local_irq_enable();
2741 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2742 return vmx_failed_vmentry(exit_reason, regs);
2744 /* Event delivery caused this intercept? Queue for redelivery. */
2745 idtv_info = __vmread(IDT_VECTORING_INFO);
2746 if ( unlikely(idtv_info & INTR_INFO_VALID_MASK) &&
2747 (exit_reason != EXIT_REASON_TASK_SWITCH) )
2749 if ( hvm_event_needs_reinjection((idtv_info>>8)&7, idtv_info&0xff) )
2751 /* See SDM 3B 25.7.1.1 and .2 for info about masking resvd bits. */
2752 __vmwrite(VM_ENTRY_INTR_INFO,
2753 idtv_info & ~INTR_INFO_RESVD_BITS_MASK);
2754 if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK )
2755 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE,
2756 __vmread(IDT_VECTORING_ERROR_CODE));
2759 /*
2760 * Clear NMI-blocking interruptibility info if an NMI delivery faulted.
2761 * Re-delivery will re-set it (see SDM 3B 25.7.1.2).
2762 */
2763 if ( (idtv_info & INTR_INFO_INTR_TYPE_MASK) == (X86_EVENTTYPE_NMI<<8) )
2764 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2765 __vmread(GUEST_INTERRUPTIBILITY_INFO) &
2766 ~VMX_INTR_SHADOW_NMI);
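/*
 * Note on the re-injection logic above: if delivery of an event was itself
 * interrupted by this vmexit (IDT_VECTORING_INFO valid), the event is
 * re-queued through VM_ENTRY_INTR_INFO so it is delivered on the next
 * vmentry, copying the error code across when one was pushed. The
 * NMI-blocking bit in the interruptibility state is cleared because
 * hardware set it when NMI delivery started; re-delivery of the NMI will
 * set it again, per the SDM reference in the comment above.
 */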
2769 switch ( exit_reason )
2771 case EXIT_REASON_EXCEPTION_NMI:
2773 /*
2774 * We do not enable software-interrupt (INT n) exiting, so this exit
2775 * was caused by either (1) an exception (e.g. #PF) in the guest, or
2776 * (2) an NMI.
2777 */
2778 unsigned int intr_info, vector;
2780 intr_info = __vmread(VM_EXIT_INTR_INFO);
2781 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2783 vector = intr_info & INTR_INFO_VECTOR_MASK;
2785 /*
2786 * Re-set the NMI shadow if vmexit caused by a guest IRET fault (see 3B
2787 * 25.7.1.2, "Resuming Guest Software after Handling an Exception").
2788 * (NB. If we emulate this IRET for any reason, we should re-clear!)
2789 */
2790 if ( unlikely(intr_info & INTR_INFO_NMI_UNBLOCKED_BY_IRET) &&
2791 !(__vmread(IDT_VECTORING_INFO) & INTR_INFO_VALID_MASK) &&
2792 (vector != TRAP_double_fault) )
2793 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2794 __vmread(GUEST_INTERRUPTIBILITY_INFO)|VMX_INTR_SHADOW_NMI);
2796 perfc_incra(cause_vector, vector);
2798 switch ( vector )
2800 case TRAP_debug:
2801 case TRAP_int3:
2802 if ( !v->domain->debugger_attached )
2803 goto exit_and_crash;
2804 domain_pause_for_debugger();
2805 break;
2806 case TRAP_no_device:
2807 vmx_do_no_device_fault();
2808 break;
2809 case TRAP_page_fault:
2810 exit_qualification = __vmread(EXIT_QUALIFICATION);
2811 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2813 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2814 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2815 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2816 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2817 (unsigned long)regs->esi, (unsigned long)regs->edi);
2819 if ( paging_fault(exit_qualification, regs) )
2821 hvmtrace_pf_xen(v, exit_qualification, regs->error_code);
2822 break;
2825 v->arch.hvm_vcpu.guest_cr[2] = exit_qualification;
2826 vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
2827 break;
2828 case TRAP_nmi:
2829 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) !=
2830 (X86_EVENTTYPE_NMI << 8) )
2831 goto exit_and_crash;
2832 HVMTRACE_0D(NMI, v);
2833 do_nmi(regs); /* Real NMI, vector 2: normal processing. */
2834 break;
2835 case TRAP_machine_check:
2836 HVMTRACE_0D(MCE, v);
2837 do_machine_check(regs);
2838 break;
2839 default:
2840 goto exit_and_crash;
2842 break;
2844 case EXIT_REASON_EXTERNAL_INTERRUPT:
2845 vmx_do_extint(regs);
2846 break;
2847 case EXIT_REASON_TRIPLE_FAULT:
2848 hvm_triple_fault();
2849 break;
2850 case EXIT_REASON_PENDING_VIRT_INTR:
2851 /* Disable the interrupt window. */
2852 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2853 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2854 v->arch.hvm_vmx.exec_control);
2855 break;
2856 case EXIT_REASON_PENDING_VIRT_NMI:
2857 /* Disable the NMI window. */
2858 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2859 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2860 v->arch.hvm_vmx.exec_control);
2861 break;
2862 case EXIT_REASON_TASK_SWITCH: {
2863 const enum hvm_task_switch_reason reasons[] = {
2864 TSW_call_or_int, TSW_iret, TSW_jmp, TSW_call_or_int };
2865 int32_t errcode = -1;
2866 exit_qualification = __vmread(EXIT_QUALIFICATION);
2867 if ( (idtv_info & INTR_INFO_VALID_MASK) &&
2868 (idtv_info & INTR_INFO_DELIVER_CODE_MASK) )
2869 errcode = __vmread(IDT_VECTORING_ERROR_CODE);
2870 hvm_task_switch((uint16_t)exit_qualification,
2871 reasons[(exit_qualification >> 30) & 3],
2872 errcode);
2873 break;
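/*
 * Note on the task-switch case above: bits 31:30 of the exit qualification
 * encode how the switch was initiated (CALL/INT, IRET, JMP, or an IDT task
 * gate), mapped through the reasons[] table, and the low 16 bits carry the
 * destination TSS selector. If the interrupted event delivery pushed an
 * error code, it is propagated to hvm_task_switch() so the emulated switch
 * can push it as well.
 */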
2875 case EXIT_REASON_CPUID:
2876 inst_len = __get_instruction_length(); /* Safe: CPUID */
2877 __update_guest_eip(inst_len);
2878 vmx_do_cpuid(regs);
2879 break;
2880 case EXIT_REASON_HLT:
2881 inst_len = __get_instruction_length(); /* Safe: HLT */
2882 __update_guest_eip(inst_len);
2883 vmx_do_hlt(regs);
2884 break;
2885 case EXIT_REASON_INVLPG:
2887 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2888 __update_guest_eip(inst_len);
2889 exit_qualification = __vmread(EXIT_QUALIFICATION);
2890 vmx_do_invlpg(exit_qualification);
2891 break;
2893 case EXIT_REASON_VMCALL:
2895 int rc;
2896 HVMTRACE_1D(VMMCALL, v, regs->eax);
2897 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2898 rc = hvm_do_hypercall(regs);
2899 if ( rc != HVM_HCALL_preempted )
2901 __update_guest_eip(inst_len);
2902 if ( rc == HVM_HCALL_invalidate )
2903 send_invalidate_req();
2905 break;
2907 case EXIT_REASON_CR_ACCESS:
2909 exit_qualification = __vmread(EXIT_QUALIFICATION);
2910 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2911 if ( vmx_cr_access(exit_qualification, regs) )
2912 __update_guest_eip(inst_len);
2913 break;
2915 case EXIT_REASON_DR_ACCESS:
2916 exit_qualification = __vmread(EXIT_QUALIFICATION);
2917 vmx_dr_access(exit_qualification, regs);
2918 break;
2919 case EXIT_REASON_IO_INSTRUCTION:
2920 exit_qualification = __vmread(EXIT_QUALIFICATION);
2921 inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
2922 vmx_io_instruction(exit_qualification, inst_len);
2923 break;
2924 case EXIT_REASON_MSR_READ:
2925 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2926 if ( vmx_do_msr_read(regs) )
2927 __update_guest_eip(inst_len);
2928 break;
2929 case EXIT_REASON_MSR_WRITE:
2930 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2931 if ( vmx_do_msr_write(regs) )
2932 __update_guest_eip(inst_len);
2933 break;
2935 case EXIT_REASON_MWAIT_INSTRUCTION:
2936 case EXIT_REASON_MONITOR_INSTRUCTION:
2937 case EXIT_REASON_VMCLEAR:
2938 case EXIT_REASON_VMLAUNCH:
2939 case EXIT_REASON_VMPTRLD:
2940 case EXIT_REASON_VMPTRST:
2941 case EXIT_REASON_VMREAD:
2942 case EXIT_REASON_VMRESUME:
2943 case EXIT_REASON_VMWRITE:
2944 case EXIT_REASON_VMXOFF:
2945 case EXIT_REASON_VMXON:
2946 vmx_inject_hw_exception(v, TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
2947 break;
2949 case EXIT_REASON_TPR_BELOW_THRESHOLD:
2950 break;
2952 case EXIT_REASON_APIC_ACCESS:
2954 unsigned long offset;
2955 exit_qualification = __vmread(EXIT_QUALIFICATION);
2956 offset = exit_qualification & 0x0fffUL;
2957 handle_mmio(APIC_DEFAULT_PHYS_BASE | offset);
2958 break;
2961 case EXIT_REASON_INVD:
2962 case EXIT_REASON_WBINVD:
2964 inst_len = __get_instruction_length(); /* Safe: INVD, WBINVD */
2965 __update_guest_eip(inst_len);
2966 vmx_wbinvd_intercept();
2967 break;
2970 default:
2971 exit_and_crash:
2972 gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
2973 domain_crash(v->domain);
2974 break;
2978 asmlinkage void vmx_trace_vmentry(void)
2980 struct vcpu *v = current;
2982 hvmtrace_vmentry(v);
2985 /*
2986 * Local variables:
2987 * mode: C
2988 * c-set-style: "BSD"
2989 * c-basic-offset: 4
2990 * tab-width: 4
2991 * indent-tabs-mode: nil
2992 * End:
2993 */