
xen/arch/x86/hvm/vmx/vmx.c @ 16987:0d70e01c0012

vmx realmode: Emulate MSR accesses.

Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>

author   Keir Fraser <keir.fraser@citrix.com>
date     Thu Jan 31 09:33:26 2008 +0000 (2008-01-31)
parents  6ea3db7ae24d
children aecbf98aa709
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 */
19 #include <xen/config.h>
20 #include <xen/init.h>
21 #include <xen/lib.h>
22 #include <xen/trace.h>
23 #include <xen/sched.h>
24 #include <xen/irq.h>
25 #include <xen/softirq.h>
26 #include <xen/domain_page.h>
27 #include <xen/hypercall.h>
28 #include <xen/perfc.h>
29 #include <asm/current.h>
30 #include <asm/io.h>
31 #include <asm/regs.h>
32 #include <asm/cpufeature.h>
33 #include <asm/processor.h>
34 #include <asm/types.h>
35 #include <asm/debugreg.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/paging.h>
39 #include <asm/p2m.h>
40 #include <asm/hvm/hvm.h>
41 #include <asm/hvm/support.h>
42 #include <asm/hvm/vmx/vmx.h>
43 #include <asm/hvm/vmx/vmcs.h>
44 #include <asm/hvm/vmx/cpu.h>
45 #include <public/sched.h>
46 #include <public/hvm/ioreq.h>
47 #include <asm/hvm/vpic.h>
48 #include <asm/hvm/vlapic.h>
49 #include <asm/x86_emulate.h>
50 #include <asm/hvm/vpt.h>
51 #include <public/hvm/save.h>
52 #include <asm/hvm/trace.h>
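/*
 * Return values for the MSR access helpers below: HNDL_done means the
 * access was handled here, HNDL_unhandled defers to the generic HVM MSR
 * path, and HNDL_exception_raised means a fault has already been injected
 * into the guest (so the instruction pointer must not be advanced).
 */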
54 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
56 static void vmx_ctxt_switch_from(struct vcpu *v);
57 static void vmx_ctxt_switch_to(struct vcpu *v);
59 static int vmx_alloc_vlapic_mapping(struct domain *d);
60 static void vmx_free_vlapic_mapping(struct domain *d);
61 static void vmx_install_vlapic_mapping(struct vcpu *v);
62 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr);
63 static void vmx_update_guest_efer(struct vcpu *v);
65 static int vmx_domain_initialise(struct domain *d)
66 {
67 return vmx_alloc_vlapic_mapping(d);
68 }
70 static void vmx_domain_destroy(struct domain *d)
71 {
72 vmx_free_vlapic_mapping(d);
73 }
75 static int vmx_vcpu_initialise(struct vcpu *v)
76 {
77 int rc;
79 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
81 v->arch.schedule_tail = vmx_do_resume;
82 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
83 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
85 if ( (rc = vmx_create_vmcs(v)) != 0 )
86 {
87 dprintk(XENLOG_WARNING,
88 "Failed to create VMCS for vcpu %d: err=%d.\n",
89 v->vcpu_id, rc);
90 return rc;
91 }
93 vpmu_initialise(v);
95 vmx_install_vlapic_mapping(v);
97 #ifndef VMXASSIST
98 if ( v->vcpu_id == 0 )
99 v->arch.guest_context.user_regs.eax = 1;
100 v->arch.hvm_vcpu.io_complete = vmx_realmode_io_complete;
101 #endif
103 return 0;
104 }
106 static void vmx_vcpu_destroy(struct vcpu *v)
107 {
108 vmx_destroy_vmcs(v);
109 vpmu_destroy(v);
110 }
112 #ifdef __x86_64__
114 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
116 static u32 msr_index[VMX_MSR_COUNT] =
117 {
118 MSR_LSTAR, MSR_STAR, MSR_SYSCALL_MASK
119 };
121 static void vmx_save_host_msrs(void)
122 {
123 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
124 int i;
126 for ( i = 0; i < VMX_MSR_COUNT; i++ )
127 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
128 }
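/*
 * WRITE_MSR(): record the new value in the guest's saved MSR state, mark
 * it dirty in both the guest and host flag masks so the context-switch
 * paths reload/restore it, and write the value straight to hardware.
 */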
130 #define WRITE_MSR(address) \
131 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
132 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
133 wrmsrl(MSR_ ## address, msr_content); \
134 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
135 break
137 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
138 {
139 u64 msr_content = 0;
140 u32 ecx = regs->ecx;
141 struct vcpu *v = current;
142 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
144 switch ( ecx )
145 {
146 case MSR_EFER:
147 msr_content = v->arch.hvm_vcpu.guest_efer;
148 break;
150 case MSR_FS_BASE:
151 msr_content = __vmread(GUEST_FS_BASE);
152 goto check_long_mode;
154 case MSR_GS_BASE:
155 msr_content = __vmread(GUEST_GS_BASE);
156 goto check_long_mode;
158 case MSR_SHADOW_GS_BASE:
159 msr_content = v->arch.hvm_vmx.shadow_gs;
160 check_long_mode:
161 if ( !(hvm_long_mode_enabled(v)) )
162 {
163 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
164 return HNDL_exception_raised;
165 }
166 break;
168 case MSR_STAR:
169 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
170 break;
172 case MSR_LSTAR:
173 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
174 break;
176 case MSR_CSTAR:
177 msr_content = v->arch.hvm_vmx.cstar;
178 break;
180 case MSR_SYSCALL_MASK:
181 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
182 break;
184 default:
185 return HNDL_unhandled;
186 }
188 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
190 regs->eax = (u32)(msr_content >> 0);
191 regs->edx = (u32)(msr_content >> 32);
193 return HNDL_done;
194 }
196 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
197 {
198 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
199 u32 ecx = regs->ecx;
200 struct vcpu *v = current;
201 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
202 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
204 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
206 switch ( ecx )
207 {
208 case MSR_EFER:
209 if ( !hvm_set_efer(msr_content) )
210 goto exception_raised;
211 break;
213 case MSR_FS_BASE:
214 case MSR_GS_BASE:
215 case MSR_SHADOW_GS_BASE:
216 if ( !hvm_long_mode_enabled(v) )
217 goto gp_fault;
219 if ( !is_canonical_address(msr_content) )
220 goto uncanonical_address;
222 if ( ecx == MSR_FS_BASE )
223 __vmwrite(GUEST_FS_BASE, msr_content);
224 else if ( ecx == MSR_GS_BASE )
225 __vmwrite(GUEST_GS_BASE, msr_content);
226 else
227 {
228 v->arch.hvm_vmx.shadow_gs = msr_content;
229 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
230 }
232 break;
234 case MSR_STAR:
235 WRITE_MSR(STAR);
237 case MSR_LSTAR:
238 if ( !is_canonical_address(msr_content) )
239 goto uncanonical_address;
240 WRITE_MSR(LSTAR);
242 case MSR_CSTAR:
243 if ( !is_canonical_address(msr_content) )
244 goto uncanonical_address;
245 v->arch.hvm_vmx.cstar = msr_content;
246 break;
248 case MSR_SYSCALL_MASK:
249 WRITE_MSR(SYSCALL_MASK);
251 default:
252 return HNDL_unhandled;
253 }
255 return HNDL_done;
257 uncanonical_address:
258 HVM_DBG_LOG(DBG_LEVEL_0, "Not cano address of msr write %x", ecx);
259 gp_fault:
260 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
261 exception_raised:
262 return HNDL_exception_raised;
263 }
265 /*
266 * To avoid MSR save/restore at every VM exit/entry time, we restore
267 * the x86_64 specific MSRs at domain switch time. Since these MSRs
268 * are not modified once set for para domains, we don't save them,
269 * but simply reset them to values set in percpu_traps_init().
270 */
271 static void vmx_restore_host_msrs(void)
272 {
273 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
274 int i;
276 while ( host_msr_state->flags )
277 {
278 i = find_first_set_bit(host_msr_state->flags);
279 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
280 clear_bit(i, &host_msr_state->flags);
281 }
283 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
284 write_efer(read_efer() | EFER_NX);
285 }
287 static void vmx_save_guest_msrs(struct vcpu *v)
288 {
289 /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
290 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
291 }
293 static void vmx_restore_guest_msrs(struct vcpu *v)
294 {
295 struct vmx_msr_state *guest_msr_state, *host_msr_state;
296 unsigned long guest_flags;
297 int i;
299 guest_msr_state = &v->arch.hvm_vmx.msr_state;
300 host_msr_state = &this_cpu(host_msr_state);
302 wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
304 guest_flags = guest_msr_state->flags;
306 while ( guest_flags )
307 {
308 i = find_first_set_bit(guest_flags);
310 HVM_DBG_LOG(DBG_LEVEL_2,
311 "restore guest's index %d msr %x with value %lx",
312 i, msr_index[i], guest_msr_state->msrs[i]);
313 set_bit(i, &host_msr_state->flags);
314 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
315 clear_bit(i, &guest_flags);
316 }
318 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & (EFER_NX | EFER_SCE) )
319 {
320 HVM_DBG_LOG(DBG_LEVEL_2,
321 "restore guest's EFER with value %lx",
322 v->arch.hvm_vcpu.guest_efer);
323 write_efer((read_efer() & ~(EFER_NX | EFER_SCE)) |
324 (v->arch.hvm_vcpu.guest_efer & (EFER_NX | EFER_SCE)));
325 }
326 }
328 #else /* __i386__ */
330 #define vmx_save_host_msrs() ((void)0)
332 static void vmx_restore_host_msrs(void)
333 {
334 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
335 write_efer(read_efer() | EFER_NX);
336 }
338 #define vmx_save_guest_msrs(v) ((void)0)
340 static void vmx_restore_guest_msrs(struct vcpu *v)
341 {
342 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & EFER_NX )
343 {
344 HVM_DBG_LOG(DBG_LEVEL_2,
345 "restore guest's EFER with value %lx",
346 v->arch.hvm_vcpu.guest_efer);
347 write_efer((read_efer() & ~EFER_NX) |
348 (v->arch.hvm_vcpu.guest_efer & EFER_NX));
349 }
350 }
352 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
353 {
354 u64 msr_content = 0;
355 struct vcpu *v = current;
357 switch ( regs->ecx )
358 {
359 case MSR_EFER:
360 msr_content = v->arch.hvm_vcpu.guest_efer;
361 break;
363 default:
364 return HNDL_unhandled;
365 }
367 regs->eax = msr_content >> 0;
368 regs->edx = msr_content >> 32;
370 return HNDL_done;
371 }
373 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
374 {
375 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
377 switch ( regs->ecx )
378 {
379 case MSR_EFER:
380 if ( !hvm_set_efer(msr_content) )
381 return HNDL_exception_raised;
382 break;
384 default:
385 return HNDL_unhandled;
386 }
388 return HNDL_done;
389 }
391 #endif /* __i386__ */
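/*
 * Returns the guest's execution mode: 0 for real mode, 1 for virtual-8086,
 * 8 for 64-bit code, and 4 or 2 depending on the code segment's default
 * operand size in protected mode.
 */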
393 static int vmx_guest_x86_mode(struct vcpu *v)
394 {
395 unsigned int cs_ar_bytes;
397 if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
398 return 0;
399 if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
400 return 1;
401 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
402 if ( hvm_long_mode_enabled(v) &&
403 likely(cs_ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
404 return 8;
405 return (likely(cs_ar_bytes & X86_SEG_AR_DEF_OP_SIZE) ? 4 : 2);
406 }
408 static void vmx_save_dr(struct vcpu *v)
409 {
410 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
411 return;
413 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
414 v->arch.hvm_vcpu.flag_dr_dirty = 0;
415 v->arch.hvm_vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
416 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
418 v->arch.guest_context.debugreg[0] = read_debugreg(0);
419 v->arch.guest_context.debugreg[1] = read_debugreg(1);
420 v->arch.guest_context.debugreg[2] = read_debugreg(2);
421 v->arch.guest_context.debugreg[3] = read_debugreg(3);
422 v->arch.guest_context.debugreg[6] = read_debugreg(6);
423 /* DR7 must be saved as it is used by vmx_restore_dr(). */
424 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
425 }
427 static void __restore_debug_registers(struct vcpu *v)
428 {
429 if ( v->arch.hvm_vcpu.flag_dr_dirty )
430 return;
432 v->arch.hvm_vcpu.flag_dr_dirty = 1;
434 write_debugreg(0, v->arch.guest_context.debugreg[0]);
435 write_debugreg(1, v->arch.guest_context.debugreg[1]);
436 write_debugreg(2, v->arch.guest_context.debugreg[2]);
437 write_debugreg(3, v->arch.guest_context.debugreg[3]);
438 write_debugreg(6, v->arch.guest_context.debugreg[6]);
439 /* DR7 is loaded from the VMCS. */
440 }
442 /*
443 * DR7 is saved and restored on every vmexit. Other debug registers only
444 * need to be restored if their value is going to affect execution -- i.e.,
445 * if one of the breakpoints is enabled. So mask out all bits that don't
446 * enable some breakpoint functionality.
447 */
448 static void vmx_restore_dr(struct vcpu *v)
449 {
450 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
451 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
452 __restore_debug_registers(v);
453 }
455 void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
456 {
457 uint32_t ev;
459 vmx_vmcs_enter(v);
461 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
462 c->cr2 = v->arch.hvm_vcpu.guest_cr[2];
463 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
464 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
466 c->msr_efer = v->arch.hvm_vcpu.guest_efer;
468 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
469 c->idtr_base = __vmread(GUEST_IDTR_BASE);
471 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
472 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
474 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
475 c->cs_limit = __vmread(GUEST_CS_LIMIT);
476 c->cs_base = __vmread(GUEST_CS_BASE);
477 c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES);
479 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
480 c->ds_limit = __vmread(GUEST_DS_LIMIT);
481 c->ds_base = __vmread(GUEST_DS_BASE);
482 c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES);
484 c->es_sel = __vmread(GUEST_ES_SELECTOR);
485 c->es_limit = __vmread(GUEST_ES_LIMIT);
486 c->es_base = __vmread(GUEST_ES_BASE);
487 c->es_arbytes = __vmread(GUEST_ES_AR_BYTES);
489 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
490 c->ss_limit = __vmread(GUEST_SS_LIMIT);
491 c->ss_base = __vmread(GUEST_SS_BASE);
492 c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES);
494 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
495 c->fs_limit = __vmread(GUEST_FS_LIMIT);
496 c->fs_base = __vmread(GUEST_FS_BASE);
497 c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES);
499 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
500 c->gs_limit = __vmread(GUEST_GS_LIMIT);
501 c->gs_base = __vmread(GUEST_GS_BASE);
502 c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES);
504 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
505 c->tr_limit = __vmread(GUEST_TR_LIMIT);
506 c->tr_base = __vmread(GUEST_TR_BASE);
507 c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES);
509 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
510 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
511 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
512 c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES);
514 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
515 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
516 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
518 c->pending_event = 0;
519 c->error_code = 0;
520 if ( ((ev = __vmread(VM_ENTRY_INTR_INFO)) & INTR_INFO_VALID_MASK) &&
521 hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
522 {
523 c->pending_event = ev;
524 c->error_code = __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE);
525 }
527 vmx_vmcs_exit(v);
528 }
530 static int vmx_restore_cr0_cr3(
531 struct vcpu *v, unsigned long cr0, unsigned long cr3)
532 {
533 unsigned long mfn = 0;
534 p2m_type_t p2mt;
536 if ( cr0 & X86_CR0_PG )
537 {
538 mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
539 if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
540 {
541 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3);
542 return -EINVAL;
543 }
544 }
546 if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
547 put_page(pagetable_get_page(v->arch.guest_table));
549 v->arch.guest_table = pagetable_from_pfn(mfn);
551 v->arch.hvm_vcpu.guest_cr[0] = cr0 | X86_CR0_ET;
552 v->arch.hvm_vcpu.guest_cr[3] = cr3;
554 return 0;
555 }
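/*
 * Reload a saved hvm_hw_cpu snapshot into the VMCS. The check at the top
 * rejects snapshots whose pending event has a reserved type (type 1, or
 * anything above 6) before any guest state is modified.
 */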
557 int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
558 {
559 int rc;
561 if ( c->pending_valid &&
562 ((c->pending_type == 1) || (c->pending_type > 6) ||
563 (c->pending_reserved != 0)) )
564 {
565 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32".\n",
566 c->pending_event);
567 return -EINVAL;
568 }
570 rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3);
571 if ( rc )
572 return rc;
574 vmx_vmcs_enter(v);
576 v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
577 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
578 vmx_update_guest_cr(v, 0);
579 vmx_update_guest_cr(v, 2);
580 vmx_update_guest_cr(v, 4);
582 #ifdef HVM_DEBUG_SUSPEND
583 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
584 __func__, c->cr3, c->cr0, c->cr4);
585 #endif
587 v->arch.hvm_vcpu.guest_efer = c->msr_efer;
588 vmx_update_guest_efer(v);
590 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
591 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
593 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
594 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
596 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
597 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
598 __vmwrite(GUEST_CS_BASE, c->cs_base);
599 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
601 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
602 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
603 __vmwrite(GUEST_DS_BASE, c->ds_base);
604 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
606 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
607 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
608 __vmwrite(GUEST_ES_BASE, c->es_base);
609 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
611 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
612 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
613 __vmwrite(GUEST_SS_BASE, c->ss_base);
614 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
616 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
617 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
618 __vmwrite(GUEST_FS_BASE, c->fs_base);
619 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
621 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
622 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
623 __vmwrite(GUEST_GS_BASE, c->gs_base);
624 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
626 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
627 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
628 __vmwrite(GUEST_TR_BASE, c->tr_base);
629 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
631 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
632 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
633 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
634 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
636 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
637 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
638 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
640 __vmwrite(GUEST_DR7, c->dr7);
642 vmx_vmcs_exit(v);
644 paging_update_paging_modes(v);
646 if ( c->pending_valid )
647 {
648 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
649 c->pending_event, c->error_code);
651 if ( hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
652 {
653 vmx_vmcs_enter(v);
654 __vmwrite(VM_ENTRY_INTR_INFO, c->pending_event);
655 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
656 vmx_vmcs_exit(v);
657 }
658 }
660 return 0;
661 }
663 #if defined(__x86_64__) && defined(HVM_DEBUG_SUSPEND)
664 static void dump_msr_state(struct vmx_msr_state *m)
665 {
666 int i = 0;
667 printk("**** msr state ****\n");
668 printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
669 for ( i = 0; i < VMX_MSR_COUNT; i++ )
670 printk("0x%lx,", m->msrs[i]);
671 printk("\n");
672 }
673 #else
674 #define dump_msr_state(m) ((void)0)
675 #endif
677 static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
678 {
679 #ifdef __x86_64__
680 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
681 unsigned long guest_flags = guest_state->flags;
683 data->shadow_gs = v->arch.hvm_vmx.shadow_gs;
684 data->msr_cstar = v->arch.hvm_vmx.cstar;
686 /* save msrs */
687 data->msr_flags = guest_flags;
688 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
689 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
690 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
691 #endif
693 data->tsc = hvm_get_guest_time(v);
695 dump_msr_state(guest_state);
696 }
698 static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
699 {
700 #ifdef __x86_64__
701 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
703 /* restore msrs */
704 guest_state->flags = data->msr_flags;
705 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
706 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
707 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
709 v->arch.hvm_vmx.cstar = data->msr_cstar;
710 v->arch.hvm_vmx.shadow_gs = data->shadow_gs;
711 #endif
713 #ifdef VMXASSIST
714 v->arch.hvm_vmx.vmxassist_enabled = !(data->cr0 & X86_CR0_PE);
715 #endif
717 hvm_set_guest_time(v, data->tsc);
719 dump_msr_state(guest_state);
720 }
723 static void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
724 {
725 vmx_save_cpu_state(v, ctxt);
726 vmx_vmcs_save(v, ctxt);
727 }
729 static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
730 {
731 vmx_load_cpu_state(v, ctxt);
733 if ( vmx_vmcs_restore(v, ctxt) )
734 {
735 gdprintk(XENLOG_ERR, "vmx_vmcs restore failed!\n");
736 domain_crash(v->domain);
737 return -EINVAL;
738 }
740 return 0;
741 }
743 static void vmx_ctxt_switch_from(struct vcpu *v)
744 {
745 vmx_save_guest_msrs(v);
746 vmx_restore_host_msrs();
747 vmx_save_dr(v);
748 vpmu_save(v);
749 }
751 static void vmx_ctxt_switch_to(struct vcpu *v)
752 {
753 /* HOST_CR4 in VMCS is always mmu_cr4_features. Sync CR4 now. */
754 if ( unlikely(read_cr4() != mmu_cr4_features) )
755 write_cr4(mmu_cr4_features);
757 vmx_restore_guest_msrs(v);
758 vmx_restore_dr(v);
759 vpmu_load(v);
760 }
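/*
 * In long mode the CS/DS/ES/SS bases are architecturally zero, so they are
 * only read from the VMCS when the guest is not running 64-bit code;
 * FS/GS/TR and the descriptor tables always use their VMCS base.
 */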
762 static unsigned long vmx_get_segment_base(struct vcpu *v, enum x86_segment seg)
763 {
764 unsigned long base = 0;
765 int long_mode = 0;
767 ASSERT(v == current);
769 if ( hvm_long_mode_enabled(v) &&
770 (__vmread(GUEST_CS_AR_BYTES) & X86_SEG_AR_CS_LM_ACTIVE) )
771 long_mode = 1;
773 switch ( seg )
774 {
775 case x86_seg_cs: if ( !long_mode ) base = __vmread(GUEST_CS_BASE); break;
776 case x86_seg_ds: if ( !long_mode ) base = __vmread(GUEST_DS_BASE); break;
777 case x86_seg_es: if ( !long_mode ) base = __vmread(GUEST_ES_BASE); break;
778 case x86_seg_fs: base = __vmread(GUEST_FS_BASE); break;
779 case x86_seg_gs: base = __vmread(GUEST_GS_BASE); break;
780 case x86_seg_ss: if ( !long_mode ) base = __vmread(GUEST_SS_BASE); break;
781 case x86_seg_tr: base = __vmread(GUEST_TR_BASE); break;
782 case x86_seg_gdtr: base = __vmread(GUEST_GDTR_BASE); break;
783 case x86_seg_idtr: base = __vmread(GUEST_IDTR_BASE); break;
784 case x86_seg_ldtr: base = __vmread(GUEST_LDTR_BASE); break;
785 default: BUG(); break;
786 }
788 return base;
789 }
791 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
792 struct segment_register *reg)
793 {
794 uint32_t attr = 0;
796 ASSERT(v == current);
798 switch ( seg )
799 {
800 case x86_seg_cs:
801 reg->sel = __vmread(GUEST_CS_SELECTOR);
802 reg->limit = __vmread(GUEST_CS_LIMIT);
803 reg->base = __vmread(GUEST_CS_BASE);
804 attr = __vmread(GUEST_CS_AR_BYTES);
805 break;
806 case x86_seg_ds:
807 reg->sel = __vmread(GUEST_DS_SELECTOR);
808 reg->limit = __vmread(GUEST_DS_LIMIT);
809 reg->base = __vmread(GUEST_DS_BASE);
810 attr = __vmread(GUEST_DS_AR_BYTES);
811 break;
812 case x86_seg_es:
813 reg->sel = __vmread(GUEST_ES_SELECTOR);
814 reg->limit = __vmread(GUEST_ES_LIMIT);
815 reg->base = __vmread(GUEST_ES_BASE);
816 attr = __vmread(GUEST_ES_AR_BYTES);
817 break;
818 case x86_seg_fs:
819 reg->sel = __vmread(GUEST_FS_SELECTOR);
820 reg->limit = __vmread(GUEST_FS_LIMIT);
821 reg->base = __vmread(GUEST_FS_BASE);
822 attr = __vmread(GUEST_FS_AR_BYTES);
823 break;
824 case x86_seg_gs:
825 reg->sel = __vmread(GUEST_GS_SELECTOR);
826 reg->limit = __vmread(GUEST_GS_LIMIT);
827 reg->base = __vmread(GUEST_GS_BASE);
828 attr = __vmread(GUEST_GS_AR_BYTES);
829 break;
830 case x86_seg_ss:
831 reg->sel = __vmread(GUEST_SS_SELECTOR);
832 reg->limit = __vmread(GUEST_SS_LIMIT);
833 reg->base = __vmread(GUEST_SS_BASE);
834 attr = __vmread(GUEST_SS_AR_BYTES);
835 break;
836 case x86_seg_tr:
837 reg->sel = __vmread(GUEST_TR_SELECTOR);
838 reg->limit = __vmread(GUEST_TR_LIMIT);
839 reg->base = __vmread(GUEST_TR_BASE);
840 attr = __vmread(GUEST_TR_AR_BYTES);
841 break;
842 case x86_seg_gdtr:
843 reg->limit = __vmread(GUEST_GDTR_LIMIT);
844 reg->base = __vmread(GUEST_GDTR_BASE);
845 break;
846 case x86_seg_idtr:
847 reg->limit = __vmread(GUEST_IDTR_LIMIT);
848 reg->base = __vmread(GUEST_IDTR_BASE);
849 break;
850 case x86_seg_ldtr:
851 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
852 reg->limit = __vmread(GUEST_LDTR_LIMIT);
853 reg->base = __vmread(GUEST_LDTR_BASE);
854 attr = __vmread(GUEST_LDTR_AR_BYTES);
855 break;
856 default:
857 BUG();
858 }
860 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
861 /* Unusable flag is folded into Present flag. */
862 if ( attr & (1u<<16) )
863 reg->attr.fields.p = 0;
864 }
866 static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
867 struct segment_register *reg)
868 {
869 uint32_t attr;
871 ASSERT((v == current) || !vcpu_runnable(v));
873 attr = reg->attr.bytes;
874 attr = ((attr & 0xf00) << 4) | (attr & 0xff);
876 /* Not-present must mean unusable. */
877 if ( !reg->attr.fields.p )
878 attr |= (1u << 16);
880 vmx_vmcs_enter(v);
882 switch ( seg )
883 {
884 case x86_seg_cs:
885 __vmwrite(GUEST_CS_SELECTOR, reg->sel);
886 __vmwrite(GUEST_CS_LIMIT, reg->limit);
887 __vmwrite(GUEST_CS_BASE, reg->base);
888 __vmwrite(GUEST_CS_AR_BYTES, attr);
889 break;
890 case x86_seg_ds:
891 __vmwrite(GUEST_DS_SELECTOR, reg->sel);
892 __vmwrite(GUEST_DS_LIMIT, reg->limit);
893 __vmwrite(GUEST_DS_BASE, reg->base);
894 __vmwrite(GUEST_DS_AR_BYTES, attr);
895 break;
896 case x86_seg_es:
897 __vmwrite(GUEST_ES_SELECTOR, reg->sel);
898 __vmwrite(GUEST_ES_LIMIT, reg->limit);
899 __vmwrite(GUEST_ES_BASE, reg->base);
900 __vmwrite(GUEST_ES_AR_BYTES, attr);
901 break;
902 case x86_seg_fs:
903 __vmwrite(GUEST_FS_SELECTOR, reg->sel);
904 __vmwrite(GUEST_FS_LIMIT, reg->limit);
905 __vmwrite(GUEST_FS_BASE, reg->base);
906 __vmwrite(GUEST_FS_AR_BYTES, attr);
907 break;
908 case x86_seg_gs:
909 __vmwrite(GUEST_GS_SELECTOR, reg->sel);
910 __vmwrite(GUEST_GS_LIMIT, reg->limit);
911 __vmwrite(GUEST_GS_BASE, reg->base);
912 __vmwrite(GUEST_GS_AR_BYTES, attr);
913 break;
914 case x86_seg_ss:
915 __vmwrite(GUEST_SS_SELECTOR, reg->sel);
916 __vmwrite(GUEST_SS_LIMIT, reg->limit);
917 __vmwrite(GUEST_SS_BASE, reg->base);
918 __vmwrite(GUEST_SS_AR_BYTES, attr);
919 break;
920 case x86_seg_tr:
921 __vmwrite(GUEST_TR_SELECTOR, reg->sel);
922 __vmwrite(GUEST_TR_LIMIT, reg->limit);
923 __vmwrite(GUEST_TR_BASE, reg->base);
924 __vmwrite(GUEST_TR_AR_BYTES, attr);
925 break;
926 case x86_seg_gdtr:
927 __vmwrite(GUEST_GDTR_LIMIT, reg->limit);
928 __vmwrite(GUEST_GDTR_BASE, reg->base);
929 break;
930 case x86_seg_idtr:
931 __vmwrite(GUEST_IDTR_LIMIT, reg->limit);
932 __vmwrite(GUEST_IDTR_BASE, reg->base);
933 break;
934 case x86_seg_ldtr:
935 __vmwrite(GUEST_LDTR_SELECTOR, reg->sel);
936 __vmwrite(GUEST_LDTR_LIMIT, reg->limit);
937 __vmwrite(GUEST_LDTR_BASE, reg->base);
938 __vmwrite(GUEST_LDTR_AR_BYTES, attr);
939 break;
940 default:
941 BUG();
942 }
944 vmx_vmcs_exit(v);
945 }
947 /* Make sure that xen intercepts any FP accesses from current */
948 static void vmx_stts(struct vcpu *v)
949 {
950 /* VMX depends on operating on the current vcpu */
951 ASSERT(v == current);
953 /*
954 * If the guest does not have TS enabled then we must cause and handle an
955 * exception on first use of the FPU. If the guest *does* have TS enabled
956 * then this is not necessary: no FPU activity can occur until the guest
957 * clears CR0.TS, and we will initialise the FPU when that happens.
958 */
959 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
960 {
961 v->arch.hvm_vcpu.hw_cr[0] |= X86_CR0_TS;
962 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
963 __vm_set_bit(EXCEPTION_BITMAP, TRAP_no_device);
964 }
965 }
967 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
968 {
969 vmx_vmcs_enter(v);
970 __vmwrite(TSC_OFFSET, offset);
971 #if defined (__i386__)
972 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
973 #endif
974 vmx_vmcs_exit(v);
975 }
977 void do_nmi(struct cpu_user_regs *);
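/*
 * Each hypercall is given a 32-byte stub: mov $nr,%eax; vmcall; ret.
 * The HYPERVISOR_iret slot is filled with ud2 as it is not supported
 * for HVM guests.
 */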
979 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
980 {
981 char *p;
982 int i;
984 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
985 {
986 p = (char *)(hypercall_page + (i * 32));
987 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
988 *(u32 *)(p + 1) = i;
989 *(u8 *)(p + 5) = 0x0f; /* vmcall */
990 *(u8 *)(p + 6) = 0x01;
991 *(u8 *)(p + 7) = 0xc1;
992 *(u8 *)(p + 8) = 0xc3; /* ret */
993 }
995 /* Don't support HYPERVISOR_iret at the moment */
996 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
997 }
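/*
 * Report why a pending interrupt cannot be delivered right now: an
 * STI/MOV-SS interruptibility shadow, NMI blocking until the next IRET,
 * or (for PIC/LAPIC interrupts) RFLAGS.IF being clear.
 */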
999 static enum hvm_intblk vmx_interrupt_blocked(
1000 struct vcpu *v, struct hvm_intack intack)
1001 {
1002 unsigned long intr_shadow;
1004 intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1006 if ( intr_shadow & (VMX_INTR_SHADOW_STI|VMX_INTR_SHADOW_MOV_SS) )
1007 return hvm_intblk_shadow;
1009 if ( intack.source == hvm_intsrc_nmi )
1010 return ((intr_shadow & VMX_INTR_SHADOW_NMI) ?
1011 hvm_intblk_nmi_iret : hvm_intblk_none);
1013 ASSERT((intack.source == hvm_intsrc_pic) ||
1014 (intack.source == hvm_intsrc_lapic));
1016 if ( !(guest_cpu_user_regs()->eflags & X86_EFLAGS_IF) )
1017 return hvm_intblk_rflags_ie;
1019 return hvm_intblk_none;
1020 }
1022 static void vmx_update_host_cr3(struct vcpu *v)
1023 {
1024 ASSERT((v == current) || !vcpu_runnable(v));
1025 vmx_vmcs_enter(v);
1026 __vmwrite(HOST_CR3, v->arch.cr3);
1027 vmx_vmcs_exit(v);
1028 }
1030 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
1031 {
1032 ASSERT((v == current) || !vcpu_runnable(v));
1034 vmx_vmcs_enter(v);
1036 switch ( cr )
1037 {
1038 case 0:
1039 /* TS cleared? Then initialise FPU now. */
1040 if ( (v == current) && !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) &&
1041 (v->arch.hvm_vcpu.hw_cr[0] & X86_CR0_TS) )
1042 {
1043 setup_fpu(v);
1044 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
1045 }
1047 v->arch.hvm_vcpu.hw_cr[0] =
1048 v->arch.hvm_vcpu.guest_cr[0] |
1049 X86_CR0_NE | X86_CR0_PG | X86_CR0_WP | X86_CR0_PE;
1050 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1051 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
1052 break;
1053 case 2:
1054 /* CR2 is updated in exit stub. */
1055 break;
1056 case 3:
1057 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]);
1058 break;
1059 case 4:
1060 v->arch.hvm_vcpu.hw_cr[4] =
1061 v->arch.hvm_vcpu.guest_cr[4] | HVM_CR4_HOST_MASK;
1062 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1063 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]);
1064 break;
1065 default:
1066 BUG();
1067 }
1069 vmx_vmcs_exit(v);
1070 }
1072 static void vmx_update_guest_efer(struct vcpu *v)
1073 {
1074 #ifdef __x86_64__
1075 unsigned long vm_entry_value;
1077 ASSERT((v == current) || !vcpu_runnable(v));
1079 vmx_vmcs_enter(v);
1081 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1082 if ( v->arch.hvm_vcpu.guest_efer & EFER_LMA )
1083 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1084 else
1085 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1086 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1088 vmx_vmcs_exit(v);
1089 #endif
1091 if ( v == current )
1092 write_efer((read_efer() & ~(EFER_NX|EFER_SCE)) |
1093 (v->arch.hvm_vcpu.guest_efer & (EFER_NX|EFER_SCE)));
1094 }
1096 static void vmx_flush_guest_tlbs(void)
1097 {
1098 /* No tagged TLB support on VMX yet. The fact that we're in Xen
1099 * at all means any guest will have a clean TLB when it's next run,
1100 * because VMRESUME will flush it for us. */
1101 }
1103 static void vmx_inject_exception(
1104 unsigned int trapnr, int errcode, unsigned long cr2)
1105 {
1106 struct vcpu *curr = current;
1108 vmx_inject_hw_exception(curr, trapnr, errcode);
1110 if ( trapnr == TRAP_page_fault )
1111 curr->arch.hvm_vcpu.guest_cr[2] = cr2;
1113 if ( (trapnr == TRAP_debug) &&
1114 (guest_cpu_user_regs()->eflags & X86_EFLAGS_TF) )
1115 {
1116 __restore_debug_registers(curr);
1117 write_debugreg(6, read_debugreg(6) | 0x4000);
1118 }
1119 }
1121 static int vmx_event_pending(struct vcpu *v)
1122 {
1123 ASSERT(v == current);
1124 return (__vmread(VM_ENTRY_INTR_INFO) & INTR_INFO_VALID_MASK);
1125 }
1127 static int vmx_do_pmu_interrupt(struct cpu_user_regs *regs)
1128 {
1129 return vpmu_do_interrupt(regs);
1130 }
1132 static struct hvm_function_table vmx_function_table = {
1133 .name = "VMX",
1134 .domain_initialise = vmx_domain_initialise,
1135 .domain_destroy = vmx_domain_destroy,
1136 .vcpu_initialise = vmx_vcpu_initialise,
1137 .vcpu_destroy = vmx_vcpu_destroy,
1138 .save_cpu_ctxt = vmx_save_vmcs_ctxt,
1139 .load_cpu_ctxt = vmx_load_vmcs_ctxt,
1140 .interrupt_blocked = vmx_interrupt_blocked,
1141 .guest_x86_mode = vmx_guest_x86_mode,
1142 .get_segment_base = vmx_get_segment_base,
1143 .get_segment_register = vmx_get_segment_register,
1144 .set_segment_register = vmx_set_segment_register,
1145 .update_host_cr3 = vmx_update_host_cr3,
1146 .update_guest_cr = vmx_update_guest_cr,
1147 .update_guest_efer = vmx_update_guest_efer,
1148 .flush_guest_tlbs = vmx_flush_guest_tlbs,
1149 .stts = vmx_stts,
1150 .set_tsc_offset = vmx_set_tsc_offset,
1151 .inject_exception = vmx_inject_exception,
1152 .init_hypercall_page = vmx_init_hypercall_page,
1153 .event_pending = vmx_event_pending,
1154 .do_pmu_interrupt = vmx_do_pmu_interrupt,
1155 .cpu_up = vmx_cpu_up,
1156 .cpu_down = vmx_cpu_down,
1157 };
1159 void start_vmx(void)
1160 {
1161 static int bootstrapped;
1163 vmx_save_host_msrs();
1165 if ( bootstrapped )
1166 {
1167 if ( hvm_enabled && !vmx_cpu_up() )
1168 {
1169 printk("VMX: FATAL: failed to initialise CPU%d!\n",
1170 smp_processor_id());
1171 BUG();
1172 }
1173 return;
1174 }
1176 bootstrapped = 1;
1178 /* Xen does not fill x86_capability words except 0. */
1179 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1181 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1182 return;
1184 set_in_cr4(X86_CR4_VMXE);
1186 if ( !vmx_cpu_up() )
1187 {
1188 printk("VMX: failed to initialise.\n");
1189 return;
1190 }
1192 setup_vmcs_dump();
1194 hvm_enable(&vmx_function_table);
1195 }
1197 /*
1198 * Not all cases receive valid value in the VM-exit instruction length field.
1199 * Callers must know what they're doing!
1200 */
1201 static int __get_instruction_length(void)
1202 {
1203 int len;
1204 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1205 BUG_ON((len < 1) || (len > 15));
1206 return len;
1207 }
1209 static void __update_guest_eip(unsigned long inst_len)
1210 {
1211 struct cpu_user_regs *regs = guest_cpu_user_regs();
1212 unsigned long x;
1214 regs->eip += inst_len;
1215 regs->eflags &= ~X86_EFLAGS_RF;
1217 x = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1218 if ( x & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) )
1219 {
1220 x &= ~(VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS);
1221 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, x);
1222 }
1224 if ( regs->eflags & X86_EFLAGS_TF )
1225 vmx_inject_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE, 0);
1226 }
1228 void vmx_do_no_device_fault(void)
1229 {
1230 struct vcpu *v = current;
1232 setup_fpu(current);
1233 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
1235 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1236 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1237 {
1238 v->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS;
1239 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1240 }
1241 }
1243 #define bitmaskof(idx) (1U << ((idx) & 31))
1244 void vmx_cpuid_intercept(
1245 unsigned int *eax, unsigned int *ebx,
1246 unsigned int *ecx, unsigned int *edx)
1248 unsigned int input = *eax;
1249 unsigned int count = *ecx;
1251 #ifdef VMXASSIST
1252 if ( input == 0x40000003 )
1254 /*
1255 * NB. Unsupported interface for private use of VMXASSIST only.
1256 * Note that this leaf lives at <max-hypervisor-leaf> + 1.
1257 */
1258 u64 value = ((u64)*edx << 32) | (u32)*ecx;
1259 p2m_type_t p2mt;
1260 unsigned long mfn;
1261 struct vcpu *v = current;
1262 char *p;
1264 mfn = mfn_x(gfn_to_mfn_current(value >> PAGE_SHIFT, &p2mt));
1266 gdprintk(XENLOG_INFO, "Input address is 0x%"PRIx64".\n", value);
1268 /* 8-byte aligned valid pseudophys address from vmxassist, please. */
1269 if ( (value & 7) || !p2m_is_ram(p2mt) ||
1270 !v->arch.hvm_vmx.vmxassist_enabled )
1272 domain_crash(v->domain);
1273 return;
1275 ASSERT(mfn_valid(mfn));
1277 p = map_domain_page(mfn);
1278 value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
1279 unmap_domain_page(p);
1281 gdprintk(XENLOG_INFO, "Output value is 0x%"PRIx64".\n", value);
1282 *ecx = (u32)value;
1283 *edx = (u32)(value >> 32);
1284 return;
1286 #endif
1288 hvm_cpuid(input, eax, ebx, ecx, edx);
1290 switch ( input )
1292 case 0x00000001:
1293 *ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
1294 *ebx &= NUM_THREADS_RESET_MASK;
1295 *ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
1296 bitmaskof(X86_FEATURE_EST) |
1297 bitmaskof(X86_FEATURE_TM2) |
1298 bitmaskof(X86_FEATURE_CID) |
1299 bitmaskof(X86_FEATURE_PDCM) |
1300 bitmaskof(X86_FEATURE_DSCPL));
1301 *edx &= ~(bitmaskof(X86_FEATURE_HT) |
1302 bitmaskof(X86_FEATURE_ACPI) |
1303 bitmaskof(X86_FEATURE_ACC) |
1304 bitmaskof(X86_FEATURE_DS));
1305 break;
1307 case 0x00000004:
1308 cpuid_count(input, count, eax, ebx, ecx, edx);
1309 *eax &= NUM_CORES_RESET_MASK;
1310 break;
1312 case 0x00000006:
1313 case 0x00000009:
1314 *eax = *ebx = *ecx = *edx = 0;
1315 break;
1317 case 0x80000001:
1318 /* Only a few features are advertised in Intel's 0x80000001. */
1319 *ecx &= (bitmaskof(X86_FEATURE_LAHF_LM));
1320 *edx &= (bitmaskof(X86_FEATURE_NX) |
1321 bitmaskof(X86_FEATURE_LM) |
1322 bitmaskof(X86_FEATURE_SYSCALL));
1323 break;
1326 HVMTRACE_3D(CPUID, current, input,
1327 ((uint64_t)*eax << 32) | *ebx, ((uint64_t)*ecx << 32) | *edx);
1330 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1331 {
1332 unsigned int eax, ebx, ecx, edx;
1334 eax = regs->eax;
1335 ebx = regs->ebx;
1336 ecx = regs->ecx;
1337 edx = regs->edx;
1339 vmx_cpuid_intercept(&eax, &ebx, &ecx, &edx);
1341 regs->eax = eax;
1342 regs->ebx = ebx;
1343 regs->ecx = ecx;
1344 regs->edx = edx;
1345 }
1347 #define CASE_GET_REG_P(REG, reg) \
1348 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
1350 #ifdef __i386__
1351 #define CASE_EXTEND_GET_REG_P
1352 #else
1353 #define CASE_EXTEND_GET_REG_P \
1354 CASE_GET_REG_P(R8, r8); \
1355 CASE_GET_REG_P(R9, r9); \
1356 CASE_GET_REG_P(R10, r10); \
1357 CASE_GET_REG_P(R11, r11); \
1358 CASE_GET_REG_P(R12, r12); \
1359 CASE_GET_REG_P(R13, r13); \
1360 CASE_GET_REG_P(R14, r14); \
1361 CASE_GET_REG_P(R15, r15)
1362 #endif
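/*
 * First guest access to a debug register: load the guest's saved values
 * into the real debug registers and turn off further MOV-DR intercepts
 * so subsequent accesses go straight to hardware.
 */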
1364 static void vmx_dr_access(unsigned long exit_qualification,
1365 struct cpu_user_regs *regs)
1366 {
1367 struct vcpu *v = current;
1369 HVMTRACE_0D(DR_WRITE, v);
1371 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
1372 __restore_debug_registers(v);
1374 /* Allow guest direct access to DR registers */
1375 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1376 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
1377 }
1379 /*
1380 * Invalidate the TLB for va. Invalidate the shadow page corresponding
1381 * the address va.
1382 */
1383 static void vmx_do_invlpg(unsigned long va)
1384 {
1385 struct vcpu *v = current;
1387 HVMTRACE_2D(INVLPG, v, /*invlpga=*/ 0, va);
1389 /*
1390 * We do the safest things first, then try to update the shadow
1391 * copying from guest
1392 */
1393 paging_invlpg(v, va);
1394 }
1396 /* Get segment for OUTS according to guest instruction. */
1397 static enum x86_segment vmx_outs_get_segment(
1398 int long_mode, unsigned long eip, int inst_len)
1400 unsigned char inst[MAX_INST_LEN];
1401 enum x86_segment seg = x86_seg_ds;
1402 int i;
1403 extern int inst_copy_from_guest(unsigned char *, unsigned long, int);
1405 if ( likely(cpu_has_vmx_ins_outs_instr_info) )
1407 unsigned int instr_info = __vmread(VMX_INSTRUCTION_INFO);
1409 /* Get segment register according to bits 17:15. */
1410 switch ( (instr_info >> 15) & 7 )
1412 case 0: seg = x86_seg_es; break;
1413 case 1: seg = x86_seg_cs; break;
1414 case 2: seg = x86_seg_ss; break;
1415 case 3: seg = x86_seg_ds; break;
1416 case 4: seg = x86_seg_fs; break;
1417 case 5: seg = x86_seg_gs; break;
1418 default: BUG();
1421 goto out;
1424 if ( !long_mode )
1425 eip += __vmread(GUEST_CS_BASE);
1427 memset(inst, 0, MAX_INST_LEN);
1428 if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len )
1430 gdprintk(XENLOG_ERR, "Get guest instruction failed\n");
1431 domain_crash(current->domain);
1432 goto out;
1435 for ( i = 0; i < inst_len; i++ )
1437 switch ( inst[i] )
1439 case 0xf3: /* REPZ */
1440 case 0xf2: /* REPNZ */
1441 case 0xf0: /* LOCK */
1442 case 0x66: /* data32 */
1443 case 0x67: /* addr32 */
1444 #ifdef __x86_64__
1445 case 0x40 ... 0x4f: /* REX */
1446 #endif
1447 continue;
1448 case 0x2e: /* CS */
1449 seg = x86_seg_cs;
1450 continue;
1451 case 0x36: /* SS */
1452 seg = x86_seg_ss;
1453 continue;
1454 case 0x26: /* ES */
1455 seg = x86_seg_es;
1456 continue;
1457 case 0x64: /* FS */
1458 seg = x86_seg_fs;
1459 continue;
1460 case 0x65: /* GS */
1461 seg = x86_seg_gs;
1462 continue;
1463 case 0x3e: /* DS */
1464 seg = x86_seg_ds;
1465 continue;
1469 out:
1470 return seg;
1473 static int vmx_str_pio_check_descriptor(int long_mode, unsigned long eip,
1474 int inst_len, enum x86_segment seg,
1475 unsigned long *base, u32 *limit,
1476 u32 *ar_bytes)
1478 enum vmcs_field ar_field, base_field, limit_field;
1480 *base = 0;
1481 *limit = 0;
1482 if ( seg != x86_seg_es )
1483 seg = vmx_outs_get_segment(long_mode, eip, inst_len);
1485 switch ( seg )
1487 case x86_seg_cs:
1488 ar_field = GUEST_CS_AR_BYTES;
1489 base_field = GUEST_CS_BASE;
1490 limit_field = GUEST_CS_LIMIT;
1491 break;
1492 case x86_seg_ds:
1493 ar_field = GUEST_DS_AR_BYTES;
1494 base_field = GUEST_DS_BASE;
1495 limit_field = GUEST_DS_LIMIT;
1496 break;
1497 case x86_seg_es:
1498 ar_field = GUEST_ES_AR_BYTES;
1499 base_field = GUEST_ES_BASE;
1500 limit_field = GUEST_ES_LIMIT;
1501 break;
1502 case x86_seg_fs:
1503 ar_field = GUEST_FS_AR_BYTES;
1504 base_field = GUEST_FS_BASE;
1505 limit_field = GUEST_FS_LIMIT;
1506 break;
1507 case x86_seg_gs:
1508 ar_field = GUEST_GS_AR_BYTES;
1509 base_field = GUEST_GS_BASE;
1510 limit_field = GUEST_GS_LIMIT;
1511 break;
1512 case x86_seg_ss:
1513 ar_field = GUEST_SS_AR_BYTES;
1514 base_field = GUEST_SS_BASE;
1515 limit_field = GUEST_SS_LIMIT;
1516 break;
1517 default:
1518 BUG();
1519 return 0;
1522 if ( !long_mode || seg == x86_seg_fs || seg == x86_seg_gs )
1524 *base = __vmread(base_field);
1525 *limit = __vmread(limit_field);
1527 *ar_bytes = __vmread(ar_field);
1529 return !(*ar_bytes & X86_SEG_AR_SEG_UNUSABLE);
1533 static int vmx_str_pio_check_limit(u32 limit, unsigned int size,
1534 u32 ar_bytes, unsigned long addr,
1535 unsigned long base, int df,
1536 unsigned long *count)
1538 unsigned long ea = addr - base;
1540 /* Offset must be within limits. */
1541 ASSERT(ea == (u32)ea);
1542 if ( (u32)(ea + size - 1) < (u32)ea ||
1543 (ar_bytes & 0xc) != 0x4 ? ea + size - 1 > limit
1544 : ea <= limit )
1545 return 0;
1547 /* Check the limit for repeated instructions, as above we checked
1548 only the first instance. Truncate the count if a limit violation
1549 would occur. Note that the checking is not necessary for page
1550 granular segments as transfers crossing page boundaries will be
1551 broken up anyway. */
1552 if ( !(ar_bytes & X86_SEG_AR_GRANULARITY) && *count > 1 )
1554 if ( (ar_bytes & 0xc) != 0x4 )
1556 /* expand-up */
1557 if ( !df )
1559 if ( ea + *count * size - 1 < ea ||
1560 ea + *count * size - 1 > limit )
1561 *count = (limit + 1UL - ea) / size;
1563 else
1565 if ( *count - 1 > ea / size )
1566 *count = ea / size + 1;
1569 else
1571 /* expand-down */
1572 if ( !df )
1574 if ( *count - 1 > -(s32)ea / size )
1575 *count = -(s32)ea / size + 1UL;
1577 else
1579 if ( ea < (*count - 1) * size ||
1580 ea - (*count - 1) * size <= limit )
1581 *count = (ea - limit - 1) / size + 1;
1584 ASSERT(*count);
1587 return 1;
1590 #ifdef __x86_64__
1591 static int vmx_str_pio_lm_check_limit(struct cpu_user_regs *regs,
1592 unsigned int size,
1593 unsigned long addr,
1594 unsigned long *count)
1596 if ( !is_canonical_address(addr) ||
1597 !is_canonical_address(addr + size - 1) )
1598 return 0;
1600 if ( *count > (1UL << 48) / size )
1601 *count = (1UL << 48) / size;
1603 if ( !(regs->eflags & EF_DF) )
1605 if ( addr + *count * size - 1 < addr ||
1606 !is_canonical_address(addr + *count * size - 1) )
1607 *count = (addr & ~((1UL << 48) - 1)) / size;
1609 else
1611 if ( (*count - 1) * size > addr ||
1612 !is_canonical_address(addr + (*count - 1) * size) )
1613 *count = (addr & ~((1UL << 48) - 1)) / size + 1;
1616 ASSERT(*count);
1618 return 1;
1620 #endif
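/*
 * Hand a string-PIO operation to the device model. Accesses that straddle
 * a page boundary are flagged OVERLAP and issued one element at a time;
 * otherwise the repeat count is clipped at the page boundary and sent as
 * a single request.
 */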
1622 static void vmx_send_str_pio(struct cpu_user_regs *regs,
1623 struct hvm_io_op *pio_opp,
1624 unsigned long inst_len, unsigned int port,
1625 int sign, unsigned int size, int dir,
1626 int df, unsigned long addr,
1627 paddr_t paddr, unsigned long count)
1629 /*
1630 * Handle string pio instructions that cross pages or that
1631 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1632 */
1633 if ( (addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK) ) {
1634 unsigned long value = 0;
1636 pio_opp->flags |= OVERLAP;
1638 if ( dir == IOREQ_WRITE ) /* OUTS */
1640 if ( hvm_paging_enabled(current) )
1642 int rv = hvm_copy_from_guest_virt(&value, addr, size);
1643 if ( rv == HVMCOPY_bad_gva_to_gfn )
1644 return; /* exception already injected */
1646 else
1647 (void)hvm_copy_from_guest_phys(&value, addr, size);
1649 else /* dir != IOREQ_WRITE */
1650 /* Remember where to write the result, as a *VA*.
1651 * Must be a VA so we can handle the page overlap
1652 * correctly in hvm_pio_assist() */
1653 pio_opp->addr = addr;
1655 if ( count == 1 )
1656 regs->eip += inst_len;
1658 send_pio_req(port, 1, size, value, dir, df, 0);
1659 } else {
1660 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1661 : addr - (count - 1) * size;
1663 if ( (addr & PAGE_MASK) != (last_addr & PAGE_MASK) )
1665 if ( sign > 0 )
1666 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1667 else
1668 count = (addr & ~PAGE_MASK) / size + 1;
1669 } else
1670 regs->eip += inst_len;
1672 send_pio_req(port, count, size, paddr, dir, df, 1);
1676 static void vmx_do_str_pio(unsigned long exit_qualification,
1677 unsigned long inst_len,
1678 struct cpu_user_regs *regs,
1679 struct hvm_io_op *pio_opp)
1681 unsigned int port, size;
1682 int dir, df, vm86;
1683 unsigned long addr, count = 1, base;
1684 paddr_t paddr;
1685 unsigned long gfn;
1686 u32 ar_bytes, limit, pfec;
1687 int sign;
1688 int long_mode = 0;
1690 vm86 = regs->eflags & X86_EFLAGS_VM ? 1 : 0;
1691 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1693 if ( test_bit(6, &exit_qualification) )
1694 port = (exit_qualification >> 16) & 0xFFFF;
1695 else
1696 port = regs->edx & 0xffff;
1698 size = (exit_qualification & 7) + 1;
1699 dir = test_bit(3, &exit_qualification); /* direction */
1701 if ( dir == IOREQ_READ )
1702 HVMTRACE_2D(IO_READ, current, port, size);
1703 else
1704 HVMTRACE_2D(IO_WRITE, current, port, size);
1706 sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1707 ar_bytes = __vmread(GUEST_CS_AR_BYTES);
1708 if ( hvm_long_mode_enabled(current) &&
1709 (ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
1710 long_mode = 1;
1711 addr = __vmread(GUEST_LINEAR_ADDRESS);
1713 if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */
1714 pio_opp->flags |= REPZ;
1715 count = regs->ecx;
1716 if ( !long_mode &&
1717 (vm86 || !(ar_bytes & X86_SEG_AR_DEF_OP_SIZE)) )
1718 count &= 0xFFFF;
1721 /*
1722 * In protected mode, guest linear address is invalid if the
1723 * selector is null.
1724 */
1725 if ( !vmx_str_pio_check_descriptor(long_mode, regs->eip, inst_len,
1726 dir==IOREQ_WRITE ? x86_seg_ds :
1727 x86_seg_es, &base, &limit,
1728 &ar_bytes) ) {
1729 if ( !long_mode ) {
1730 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1731 return;
1733 addr = dir == IOREQ_WRITE ? base + regs->esi : regs->edi;
1736 if ( !long_mode )
1738 /* Segment must be readable for outs and writeable for ins. */
1739 if ( ((dir == IOREQ_WRITE)
1740 ? ((ar_bytes & 0xa) == 0x8)
1741 : ((ar_bytes & 0xa) != 0x2)) ||
1742 !vmx_str_pio_check_limit(limit, size, ar_bytes,
1743 addr, base, df, &count) )
1745 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1746 return;
1749 #ifdef __x86_64__
1750 else if ( !vmx_str_pio_lm_check_limit(regs, size, addr, &count) )
1752 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1753 return;
1755 #endif
1757 /* Translate the address to a physical address */
1758 pfec = PFEC_page_present;
1759 if ( dir == IOREQ_READ ) /* Read from PIO --> write to RAM */
1760 pfec |= PFEC_write_access;
1761 if ( ((__vmread(GUEST_SS_AR_BYTES) >> 5) & 3) == 3 )
1762 pfec |= PFEC_user_mode;
1763 gfn = paging_gva_to_gfn(current, addr, &pfec);
1764 if ( gfn == INVALID_GFN )
1766 /* The guest does not have the RAM address mapped.
1767 * Need to send in a page fault */
1768 vmx_inject_exception(TRAP_page_fault, pfec, addr);
1769 return;
1771 paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);
1773 vmx_send_str_pio(regs, pio_opp, inst_len, port, sign,
1774 size, dir, df, addr, paddr, count);
1777 static void vmx_io_instruction(unsigned long exit_qualification,
1778 unsigned long inst_len)
1779 {
1780 struct cpu_user_regs *regs;
1781 struct hvm_io_op *pio_opp;
1783 pio_opp = &current->arch.hvm_vcpu.io_op;
1784 pio_opp->instr = INSTR_PIO;
1785 pio_opp->flags = 0;
1787 regs = &pio_opp->io_context;
1789 /* Copy current guest state into io instruction state structure. */
1790 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1792 HVM_DBG_LOG(DBG_LEVEL_IO, "vm86 %d, eip=%x:%lx, "
1793 "exit_qualification = %lx",
1794 regs->eflags & X86_EFLAGS_VM ? 1 : 0,
1795 regs->cs, (unsigned long)regs->eip, exit_qualification);
1797 if ( test_bit(4, &exit_qualification) ) /* string instrucation */
1798 vmx_do_str_pio(exit_qualification, inst_len, regs, pio_opp);
1799 else
1800 {
1801 unsigned int port, size;
1802 int dir, df;
1804 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1806 if ( test_bit(6, &exit_qualification) )
1807 port = (exit_qualification >> 16) & 0xFFFF;
1808 else
1809 port = regs->edx & 0xffff;
1811 size = (exit_qualification & 7) + 1;
1812 dir = test_bit(3, &exit_qualification); /* direction */
1814 if ( dir == IOREQ_READ )
1815 HVMTRACE_2D(IO_READ, current, port, size);
1816 else
1817 HVMTRACE_3D(IO_WRITE, current, port, size, regs->eax);
1819 if ( port == 0xe9 && dir == IOREQ_WRITE && size == 1 )
1820 hvm_print_line(current, regs->eax); /* guest debug output */
1822 regs->eip += inst_len;
1823 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1824 }
1825 }
1827 #ifdef VMXASSIST
1829 static void vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1831 struct cpu_user_regs *regs = guest_cpu_user_regs();
1833 c->eip = regs->eip;
1834 c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
1835 c->esp = regs->esp;
1836 c->eflags = regs->eflags & ~X86_EFLAGS_RF;
1838 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
1839 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
1840 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
1842 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
1843 c->idtr_base = __vmread(GUEST_IDTR_BASE);
1845 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
1846 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
1848 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
1849 c->cs_limit = __vmread(GUEST_CS_LIMIT);
1850 c->cs_base = __vmread(GUEST_CS_BASE);
1851 c->cs_arbytes.bytes = __vmread(GUEST_CS_AR_BYTES);
1853 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
1854 c->ds_limit = __vmread(GUEST_DS_LIMIT);
1855 c->ds_base = __vmread(GUEST_DS_BASE);
1856 c->ds_arbytes.bytes = __vmread(GUEST_DS_AR_BYTES);
1858 c->es_sel = __vmread(GUEST_ES_SELECTOR);
1859 c->es_limit = __vmread(GUEST_ES_LIMIT);
1860 c->es_base = __vmread(GUEST_ES_BASE);
1861 c->es_arbytes.bytes = __vmread(GUEST_ES_AR_BYTES);
1863 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
1864 c->ss_limit = __vmread(GUEST_SS_LIMIT);
1865 c->ss_base = __vmread(GUEST_SS_BASE);
1866 c->ss_arbytes.bytes = __vmread(GUEST_SS_AR_BYTES);
1868 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
1869 c->fs_limit = __vmread(GUEST_FS_LIMIT);
1870 c->fs_base = __vmread(GUEST_FS_BASE);
1871 c->fs_arbytes.bytes = __vmread(GUEST_FS_AR_BYTES);
1873 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
1874 c->gs_limit = __vmread(GUEST_GS_LIMIT);
1875 c->gs_base = __vmread(GUEST_GS_BASE);
1876 c->gs_arbytes.bytes = __vmread(GUEST_GS_AR_BYTES);
1878 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
1879 c->tr_limit = __vmread(GUEST_TR_LIMIT);
1880 c->tr_base = __vmread(GUEST_TR_BASE);
1881 c->tr_arbytes.bytes = __vmread(GUEST_TR_AR_BYTES);
1883 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
1884 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
1885 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
1886 c->ldtr_arbytes.bytes = __vmread(GUEST_LDTR_AR_BYTES);
1889 static int vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1891 struct cpu_user_regs *regs = guest_cpu_user_regs();
1892 int rc;
1894 rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3);
1895 if ( rc )
1896 return rc;
1898 regs->eip = c->eip;
1899 regs->esp = c->esp;
1900 regs->eflags = c->eflags | 2;
1902 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
1903 vmx_update_guest_cr(v, 0);
1904 vmx_update_guest_cr(v, 4);
1906 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1907 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1909 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1910 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1912 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1913 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1914 __vmwrite(GUEST_CS_BASE, c->cs_base);
1915 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1917 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1918 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1919 __vmwrite(GUEST_DS_BASE, c->ds_base);
1920 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1922 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1923 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1924 __vmwrite(GUEST_ES_BASE, c->es_base);
1925 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1927 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1928 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1929 __vmwrite(GUEST_SS_BASE, c->ss_base);
1930 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1932 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1933 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1934 __vmwrite(GUEST_FS_BASE, c->fs_base);
1935 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1937 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1938 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1939 __vmwrite(GUEST_GS_BASE, c->gs_base);
1940 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1942 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1943 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1944 __vmwrite(GUEST_TR_BASE, c->tr_base);
1945 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1947 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1948 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1949 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1950 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1952 paging_update_paging_modes(v);
1953 return 0;
1956 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1958 static int vmx_assist(struct vcpu *v, int mode)
1960 struct vmx_assist_context c;
1961 struct hvm_hw_vpic *vpic = v->domain->arch.hvm_domain.vpic;
1962 u32 magic, cp;
1964 if ( hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
1965 sizeof(magic)) )
1967 gdprintk(XENLOG_ERR, "No vmxassist: can't execute real mode code\n");
1968 domain_crash(v->domain);
1969 return 0;
1972 if ( magic != VMXASSIST_MAGIC )
1974 gdprintk(XENLOG_ERR, "vmxassist magic number not match\n");
1975 domain_crash(v->domain);
1976 return 0;
1979 switch ( mode ) {
1980 /*
1981 * Transfer control to vmxassist.
1982 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1983 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1984 * by vmxassist and will transfer control to it.
1985 */
1986 case VMX_ASSIST_INVOKE:
1987 /* save the old context */
1988 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)) )
1989 goto error;
1990 if ( cp != 0 ) {
1991 vmx_world_save(v, &c);
1992 if ( hvm_copy_to_guest_phys(cp, &c, sizeof(c)) )
1993 goto error;
1996 /* restore the new context, this should activate vmxassist */
1997 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)) )
1998 goto error;
1999 if ( cp != 0 ) {
2000 if ( hvm_copy_from_guest_phys(&c, cp, sizeof(c)) )
2001 goto error;
2002 if ( vmx_world_restore(v, &c) != 0 )
2003 goto error;
2004 v->arch.hvm_vmx.pm_irqbase[0] = vpic[0].irq_base;
2005 v->arch.hvm_vmx.pm_irqbase[1] = vpic[1].irq_base;
2006 vpic[0].irq_base = NR_EXCEPTION_HANDLER;
2007 vpic[1].irq_base = NR_EXCEPTION_HANDLER + 8;
2008 v->arch.hvm_vmx.vmxassist_enabled = 1;
2009 return 1;
2011 break;
2013 /*
2014 * Restore the VMXASSIST_OLD_CONTEXT that was saved by
2015 * VMX_ASSIST_INVOKE above.
2016 */
2017 case VMX_ASSIST_RESTORE:
2018 /* fetch the old context saved by VMX_ASSIST_INVOKE */
2019 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)) )
2020 goto error;
2021 if ( cp != 0 ) {
2022 if ( hvm_copy_from_guest_phys(&c, cp, sizeof(c)) )
2023 goto error;
2024 if ( vmx_world_restore(v, &c) != 0 )
2025 goto error;
2026 if ( v->arch.hvm_vmx.irqbase_mode ) {
2027 vpic[0].irq_base = c.rm_irqbase[0] & 0xf8;
2028 vpic[1].irq_base = c.rm_irqbase[1] & 0xf8;
2029 } else {
2030 vpic[0].irq_base = v->arch.hvm_vmx.pm_irqbase[0];
2031 vpic[1].irq_base = v->arch.hvm_vmx.pm_irqbase[1];
2033 v->arch.hvm_vmx.vmxassist_enabled = 0;
2034 return 1;
2036 break;
2039 error:
2040 gdprintk(XENLOG_ERR, "Failed to transfer to vmxassist\n");
2041 domain_crash(v->domain);
2042 return 0;
2045 static int vmx_set_cr0(unsigned long value)
2047 struct vcpu *v = current;
2049 if ( hvm_set_cr0(value) == 0 )
2050 return 0;
2052 /*
2053 * VMX does not implement real-mode virtualization. We emulate
2054 * real-mode by performing a world switch to VMXAssist whenever
2055 * the guest clears the CR0.PE bit.
2056 */
2057 if ( !(value & X86_CR0_PE) )
2059 if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
2060 return 0; /* do not update eip! */
2062 else if ( v->arch.hvm_vmx.vmxassist_enabled )
2064 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
2065 return 0; /* do not update eip! */
2068 return 1;
2071 #else /* !defined(VMXASSIST) */
2073 #define vmx_set_cr0(v) hvm_set_cr0(v)
2075 #endif
2077 #define CASE_SET_REG(REG, reg) \
2078 case REG_ ## REG: regs->reg = value; break
2079 #define CASE_GET_REG(REG, reg) \
2080 case REG_ ## REG: value = regs->reg; break
2082 #define CASE_EXTEND_SET_REG \
2083 CASE_EXTEND_REG(S)
2084 #define CASE_EXTEND_GET_REG \
2085 CASE_EXTEND_REG(G)
2087 #ifdef __i386__
2088 #define CASE_EXTEND_REG(T)
2089 #else
2090 #define CASE_EXTEND_REG(T) \
2091 CASE_ ## T ## ET_REG(R8, r8); \
2092 CASE_ ## T ## ET_REG(R9, r9); \
2093 CASE_ ## T ## ET_REG(R10, r10); \
2094 CASE_ ## T ## ET_REG(R11, r11); \
2095 CASE_ ## T ## ET_REG(R12, r12); \
2096 CASE_ ## T ## ET_REG(R13, r13); \
2097 CASE_ ## T ## ET_REG(R14, r14); \
2098 CASE_ ## T ## ET_REG(R15, r15)
2099 #endif
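/*
 * Example expansion: CASE_GET_REG(ECX, ecx) becomes
 *     case REG_ECX: value = regs->ecx; break;
 * and on x86_64 CASE_EXTEND_GET_REG adds the same pattern for r8..r15.
 */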
2101 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
2103 unsigned long value;
2104 struct vcpu *v = current;
2105 struct vlapic *vlapic = vcpu_vlapic(v);
2107 switch ( gp )
2109 CASE_GET_REG(EAX, eax);
2110 CASE_GET_REG(ECX, ecx);
2111 CASE_GET_REG(EDX, edx);
2112 CASE_GET_REG(EBX, ebx);
2113 CASE_GET_REG(EBP, ebp);
2114 CASE_GET_REG(ESI, esi);
2115 CASE_GET_REG(EDI, edi);
2116 CASE_GET_REG(ESP, esp);
2117 CASE_EXTEND_GET_REG;
2118 default:
2119 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2120 goto exit_and_crash;
2123 HVMTRACE_2D(CR_WRITE, v, cr, value);
2125 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
2127 switch ( cr )
2129 case 0:
2130 return vmx_set_cr0(value);
2132 case 3:
2133 return hvm_set_cr3(value);
2135 case 4:
2136 return hvm_set_cr4(value);
2138 case 8:
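/* CR8 is the architectural alias of the TPR: CR8 bits 3:0 map to bits 7:4
 * of the local APIC TASKPRI register, hence the shift below. */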
2139 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
2140 break;
2142 default:
2143 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2144 goto exit_and_crash;
2147 return 1;
2149 exit_and_crash:
2150 domain_crash(v->domain);
2151 return 0;
2154 /*
2155 * Read from control registers. CR0 and CR4 reads never vmexit (shadowed bits come from the CR read shadows), so only CR3 and CR8 are handled here.
2156 */
2157 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
2159 unsigned long value = 0;
2160 struct vcpu *v = current;
2161 struct vlapic *vlapic = vcpu_vlapic(v);
2163 switch ( cr )
2165 case 3:
2166 value = (unsigned long)v->arch.hvm_vcpu.guest_cr[3];
2167 break;
2168 case 8:
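/* Reverse of the CR8 write path: return TASKPRI bits 7:4 as CR8 bits 3:0. */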
2169 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
2170 value = (value & 0xF0) >> 4;
2171 break;
2172 default:
2173 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2174 domain_crash(v->domain);
2175 break;
2178 switch ( gp ) {
2179 CASE_SET_REG(EAX, eax);
2180 CASE_SET_REG(ECX, ecx);
2181 CASE_SET_REG(EDX, edx);
2182 CASE_SET_REG(EBX, ebx);
2183 CASE_SET_REG(EBP, ebp);
2184 CASE_SET_REG(ESI, esi);
2185 CASE_SET_REG(EDI, edi);
2186 CASE_SET_REG(ESP, esp);
2187 CASE_EXTEND_SET_REG;
2188 default:
2189 printk("invalid gp: %d\n", gp);
2190 domain_crash(v->domain);
2191 break;
2194 HVMTRACE_2D(CR_READ, v, cr, value);
2196 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
2199 static int vmx_cr_access(unsigned long exit_qualification,
2200 struct cpu_user_regs *regs)
2202 unsigned int gp, cr;
2203 unsigned long value;
2204 struct vcpu *v = current;
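/*
 * Exit-qualification layout for CR accesses (per the Intel SDM, matching the
 * CONTROL_REG_ACCESS_* masks below): bits 3:0 give the CR number, bits 5:4
 * the access type (MOV-to-CR, MOV-from-CR, CLTS, LMSW), bits 11:8 the GP
 * register operand, and bits 31:16 the LMSW source operand.
 */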
2206 switch ( exit_qualification & CONTROL_REG_ACCESS_TYPE )
2208 case TYPE_MOV_TO_CR:
2209 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2210 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2211 return mov_to_cr(gp, cr, regs);
2212 case TYPE_MOV_FROM_CR:
2213 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2214 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2215 mov_from_cr(cr, gp, regs);
2216 break;
2217 case TYPE_CLTS:
2218 /* We initialise the FPU now, to avoid needing another vmexit. */
2219 setup_fpu(v);
2220 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
2222 v->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS; /* clear TS */
2223 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
2225 v->arch.hvm_vcpu.guest_cr[0] &= ~X86_CR0_TS; /* clear TS */
2226 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
2227 HVMTRACE_0D(CLTS, current);
2228 break;
2229 case TYPE_LMSW:
2230 value = v->arch.hvm_vcpu.guest_cr[0];
2231 value = (value & ~0xF) |
2232 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
2233 HVMTRACE_1D(LMSW, current, value);
2234 return vmx_set_cr0(value);
2235 default:
2236 BUG();
2239 return 1;
2242 static const struct lbr_info {
2243 u32 base, count;
2244 } p4_lbr[] = {
2245 { MSR_P4_LER_FROM_LIP, 1 },
2246 { MSR_P4_LER_TO_LIP, 1 },
2247 { MSR_P4_LASTBRANCH_TOS, 1 },
2248 { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
2249 { MSR_P4_LASTBRANCH_0_TO_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
2250 { 0, 0 }
2251 }, c2_lbr[] = {
2252 { MSR_IA32_LASTINTFROMIP, 1 },
2253 { MSR_IA32_LASTINTTOIP, 1 },
2254 { MSR_C2_LASTBRANCH_TOS, 1 },
2255 { MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
2256 { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
2257 { 0, 0 }
2258 #ifdef __i386__
2259 }, pm_lbr[] = {
2260 { MSR_IA32_LASTINTFROMIP, 1 },
2261 { MSR_IA32_LASTINTTOIP, 1 },
2262 { MSR_PM_LASTBRANCH_TOS, 1 },
2263 { MSR_PM_LASTBRANCH_0, NUM_MSR_PM_LASTBRANCH },
2264 { 0, 0 }
2265 #endif
2266 };
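/* Each lbr_info entry names a base MSR and a count of consecutive MSRs;
 * a { 0, 0 } entry terminates the table. */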
2268 static const struct lbr_info *last_branch_msr_get(void)
2270 switch ( boot_cpu_data.x86 )
2272 case 6:
2273 switch ( boot_cpu_data.x86_model )
2275 #ifdef __i386__
2276 /* PentiumM */
2277 case 9: case 13:
2278 /* Core Solo/Duo */
2279 case 14:
2280 return pm_lbr;
2281 break;
2282 #endif
2283 /* Core2 Duo */
2284 case 15:
2285 return c2_lbr;
2286 break;
2288 break;
2290 case 15:
2291 switch ( boot_cpu_data.x86_model )
2293 /* Pentium4/Xeon with em64t */
2294 case 3: case 4: case 6:
2295 return p4_lbr;
2296 break;
2298 break;
2301 return NULL;
2304 static int is_last_branch_msr(u32 ecx)
2306 const struct lbr_info *lbr = last_branch_msr_get();
2308 if ( lbr == NULL )
2309 return 0;
2311 for ( ; lbr->count; lbr++ )
2312 if ( (ecx >= lbr->base) && (ecx < (lbr->base + lbr->count)) )
2313 return 1;
2315 return 0;
2318 int vmx_msr_read_intercept(struct cpu_user_regs *regs)
2320 u64 msr_content = 0;
2321 u32 ecx = regs->ecx, eax, edx;
2322 struct vcpu *v = current;
2323 int index;
2324 u64 *var_range_base = (u64*)v->arch.hvm_vcpu.mtrr.var_ranges;
2325 u64 *fixed_range_base = (u64*)v->arch.hvm_vcpu.mtrr.fixed_ranges;
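/* fixed_range_base[] layout: entry 0 is MTRRfix64K_00000, entries 1-2 the two
 * fix16K MSRs, entries 3-10 the eight fix4K MSRs; hence the "+ 1" and "+ 3"
 * offsets in the cases below. */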
2327 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);
2329 switch ( ecx )
2331 case MSR_IA32_TSC:
2332 msr_content = hvm_get_guest_time(v);
2333 break;
2334 case MSR_IA32_SYSENTER_CS:
2335 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
2336 break;
2337 case MSR_IA32_SYSENTER_ESP:
2338 msr_content = __vmread(GUEST_SYSENTER_ESP);
2339 break;
2340 case MSR_IA32_SYSENTER_EIP:
2341 msr_content = __vmread(GUEST_SYSENTER_EIP);
2342 break;
2343 case MSR_IA32_APICBASE:
2344 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2345 break;
2346 case MSR_IA32_CR_PAT:
2347 msr_content = v->arch.hvm_vcpu.pat_cr;
2348 break;
2349 case MSR_MTRRcap:
2350 msr_content = v->arch.hvm_vcpu.mtrr.mtrr_cap;
2351 break;
2352 case MSR_MTRRdefType:
2353 msr_content = v->arch.hvm_vcpu.mtrr.def_type
2354 | (v->arch.hvm_vcpu.mtrr.enabled << 10);
2355 break;
2356 case MSR_MTRRfix64K_00000:
2357 msr_content = fixed_range_base[0];
2358 break;
2359 case MSR_MTRRfix16K_80000:
2360 case MSR_MTRRfix16K_A0000:
2361 index = regs->ecx - MSR_MTRRfix16K_80000;
2362 msr_content = fixed_range_base[index + 1];
2363 break;
2364 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
2365 index = regs->ecx - MSR_MTRRfix4K_C0000;
2366 msr_content = fixed_range_base[index + 3];
2367 break;
2368 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
2369 index = regs->ecx - MSR_IA32_MTRR_PHYSBASE0;
2370 msr_content = var_range_base[index];
2371 break;
2372 case MSR_IA32_DEBUGCTLMSR:
2373 if ( vmx_read_guest_msr(v, ecx, &msr_content) != 0 )
2374 msr_content = 0;
2375 break;
2376 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
2377 goto gp_fault;
2378 case MSR_IA32_MCG_CAP:
2379 case MSR_IA32_MCG_STATUS:
2380 case MSR_IA32_MC0_STATUS:
2381 case MSR_IA32_MC1_STATUS:
2382 case MSR_IA32_MC2_STATUS:
2383 case MSR_IA32_MC3_STATUS:
2384 case MSR_IA32_MC4_STATUS:
2385 case MSR_IA32_MC5_STATUS:
2386 /* No point in letting the guest see real MCEs */
2387 msr_content = 0;
2388 break;
2389 case MSR_IA32_MISC_ENABLE:
2390 rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
2391 /* Debug Trace Store is not supported. */
2392 msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
2393 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
2394 break;
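/*
 * For MSRs with no explicit case above, try in turn: the vPMU, the long-mode
 * MSR handler, the per-vCPU guest MSR area, the last-branch MSRs (which read
 * as zero until enabled), Xen's hypervisor MSRs, and finally a real
 * rdmsr_safe(); anything still unhandled raises #GP.
 */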
2395 default:
2396 if ( vpmu_do_rdmsr(regs) )
2397 goto done;
2398 switch ( long_mode_do_msr_read(regs) )
2400 case HNDL_unhandled:
2401 break;
2402 case HNDL_exception_raised:
2403 return 0;
2404 case HNDL_done:
2405 goto done;
2408 if ( vmx_read_guest_msr(v, ecx, &msr_content) == 0 )
2409 break;
2411 if ( is_last_branch_msr(ecx) )
2413 msr_content = 0;
2414 break;
2417 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2418 rdmsr_safe(ecx, eax, edx) == 0 )
2420 regs->eax = eax;
2421 regs->edx = edx;
2422 goto done;
2425 goto gp_fault;
2428 regs->eax = msr_content & 0xFFFFFFFF;
2429 regs->edx = msr_content >> 32;
2431 done:
2432 hvmtrace_msr_read(v, ecx, msr_content);
2433 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2434 ecx, (unsigned long)regs->eax,
2435 (unsigned long)regs->edx);
2436 return 1;
2438 gp_fault:
2439 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2440 return 0;
2443 static int vmx_alloc_vlapic_mapping(struct domain *d)
2445 void *apic_va;
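/*
 * A single Xen-owned page is shared with the guest and mapped at the default
 * APIC base, so that (with the "virtualize APIC accesses" control enabled)
 * guest accesses to that address range are reported as APIC-access vmexits.
 */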
2447 if ( !cpu_has_vmx_virtualize_apic_accesses )
2448 return 0;
2450 apic_va = alloc_xenheap_page();
2451 if ( apic_va == NULL )
2452 return -ENOMEM;
2453 share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable);
2454 set_mmio_p2m_entry(
2455 d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(virt_to_mfn(apic_va)));
2456 d->arch.hvm_domain.vmx_apic_access_mfn = virt_to_mfn(apic_va);
2458 return 0;
2461 static void vmx_free_vlapic_mapping(struct domain *d)
2463 unsigned long mfn = d->arch.hvm_domain.vmx_apic_access_mfn;
2464 if ( mfn != 0 )
2465 free_xenheap_page(mfn_to_virt(mfn));
2468 static void vmx_install_vlapic_mapping(struct vcpu *v)
2470 paddr_t virt_page_ma, apic_page_ma;
2472 if ( !cpu_has_vmx_virtualize_apic_accesses )
2473 return;
2475 virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
2476 apic_page_ma = v->domain->arch.hvm_domain.vmx_apic_access_mfn;
2477 apic_page_ma <<= PAGE_SHIFT;
2479 vmx_vmcs_enter(v);
2480 __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
2481 __vmwrite(APIC_ACCESS_ADDR, apic_page_ma);
2482 vmx_vmcs_exit(v);
2485 void vmx_vlapic_msr_changed(struct vcpu *v)
2487 struct vlapic *vlapic = vcpu_vlapic(v);
2488 uint32_t ctl;
2490 if ( !cpu_has_vmx_virtualize_apic_accesses )
2491 return;
2493 vmx_vmcs_enter(v);
2494 ctl = __vmread(SECONDARY_VM_EXEC_CONTROL);
2495 ctl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2496 if ( !vlapic_hw_disabled(vlapic) &&
2497 (vlapic_base_address(vlapic) == APIC_DEFAULT_PHYS_BASE) )
2498 ctl |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2499 __vmwrite(SECONDARY_VM_EXEC_CONTROL, ctl);
2500 vmx_vmcs_exit(v);
2503 extern bool_t mtrr_var_range_msr_set(struct mtrr_state *v,
2504 u32 msr, u64 msr_content);
2505 extern bool_t mtrr_fix_range_msr_set(struct mtrr_state *v,
2506 int row, u64 msr_content);
2507 extern bool_t mtrr_def_type_msr_set(struct mtrr_state *v, u64 msr_content);
2508 extern bool_t pat_msr_set(u64 *pat, u64 msr);
2510 int vmx_msr_write_intercept(struct cpu_user_regs *regs)
2512 u32 ecx = regs->ecx;
2513 u64 msr_content;
2514 struct vcpu *v = current;
2515 int index;
2517 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2518 ecx, (u32)regs->eax, (u32)regs->edx);
2520 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2522 hvmtrace_msr_write(v, ecx, msr_content);
2524 switch ( ecx )
2526 case MSR_IA32_TSC:
2527 hvm_set_guest_time(v, msr_content);
2528 pt_reset(v);
2529 break;
2530 case MSR_IA32_SYSENTER_CS:
2531 __vmwrite(GUEST_SYSENTER_CS, msr_content);
2532 break;
2533 case MSR_IA32_SYSENTER_ESP:
2534 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
2535 break;
2536 case MSR_IA32_SYSENTER_EIP:
2537 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
2538 break;
2539 case MSR_IA32_APICBASE:
2540 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2541 break;
2542 case MSR_IA32_CR_PAT:
2543 if ( !pat_msr_set(&v->arch.hvm_vcpu.pat_cr, msr_content) )
2544 goto gp_fault;
2545 break;
2546 case MSR_MTRRdefType:
2547 if ( !mtrr_def_type_msr_set(&v->arch.hvm_vcpu.mtrr, msr_content) )
2548 goto gp_fault;
2549 break;
2550 case MSR_MTRRfix64K_00000:
2551 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr, 0, msr_content) )
2552 goto gp_fault;
2553 break;
2554 case MSR_MTRRfix16K_80000:
2555 case MSR_MTRRfix16K_A0000:
2556 index = regs->ecx - MSR_MTRRfix16K_80000 + 1;
2557 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2558 index, msr_content) )
2559 goto gp_fault;
2560 break;
2561 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
2562 index = regs->ecx - MSR_MTRRfix4K_C0000 + 3;
2563 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2564 index, msr_content) )
2565 goto gp_fault;
2566 break;
2567 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
2568 if ( !mtrr_var_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2569 regs->ecx, msr_content) )
2570 goto gp_fault;
2571 break;
2572 case MSR_MTRRcap:
2573 goto gp_fault;
2574 case MSR_IA32_DEBUGCTLMSR: {
2575 int i, rc = 0;
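/*
 * Only DEBUGCTL bits 0 (LBR) and 1 (BTF) are supported, hence the "& ~3"
 * check below. Enabling LBR adds this CPU model's last-branch MSRs to the
 * guest MSR area and disables interception for them.
 */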
2577 if ( !msr_content || (msr_content & ~3) )
2578 break;
2580 if ( msr_content & 1 )
2582 const struct lbr_info *lbr = last_branch_msr_get();
2583 if ( lbr == NULL )
2584 break;
2586 for ( ; (rc == 0) && lbr->count; lbr++ )
2587 for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
2588 if ( (rc = vmx_add_guest_msr(v, lbr->base + i)) == 0 )
2589 vmx_disable_intercept_for_msr(v, lbr->base + i);
2592 if ( (rc < 0) ||
2593 (vmx_add_guest_msr(v, ecx) < 0) ||
2594 (vmx_add_host_load_msr(v, ecx) < 0) )
2595 vmx_inject_hw_exception(v, TRAP_machine_check, 0);
2596 else
2597 vmx_write_guest_msr(v, ecx, msr_content);
2599 break;
2601 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
2602 goto gp_fault;
2603 default:
2604 if ( vpmu_do_wrmsr(regs) )
2605 return 1;
2606 switch ( long_mode_do_msr_write(regs) )
2608 case HNDL_unhandled:
2609 if ( (vmx_write_guest_msr(v, ecx, msr_content) != 0) &&
2610 !is_last_branch_msr(ecx) )
2611 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2612 break;
2613 case HNDL_exception_raised:
2614 return 0;
2615 case HNDL_done:
2616 break;
2618 break;
2621 return 1;
2623 gp_fault:
2624 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2625 return 0;
2628 static void vmx_do_hlt(struct cpu_user_regs *regs)
2630 unsigned long intr_info = __vmread(VM_ENTRY_INTR_INFO);
2631 struct vcpu *curr = current;
2633 /* Check for pending exception. */
2634 if ( intr_info & INTR_INFO_VALID_MASK )
2636 HVMTRACE_1D(HLT, curr, /*int pending=*/ 1);
2637 return;
2640 HVMTRACE_1D(HLT, curr, /*int pending=*/ 0);
2641 hvm_hlt(regs->eflags);
2644 static void vmx_do_extint(struct cpu_user_regs *regs)
2646 unsigned int vector;
2648 asmlinkage void do_IRQ(struct cpu_user_regs *);
2649 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
2650 fastcall void smp_event_check_interrupt(void);
2651 fastcall void smp_invalidate_interrupt(void);
2652 fastcall void smp_call_function_interrupt(void);
2653 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
2654 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
2655 fastcall void smp_pmu_apic_interrupt(struct cpu_user_regs *regs);
2656 #ifdef CONFIG_X86_MCE_P4THERMAL
2657 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
2658 #endif
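/*
 * The vector of the external interrupt that caused this exit is recorded in
 * VM_EXIT_INTR_INFO; dispatch the known Xen APIC/IPI vectors directly to
 * their handlers and route anything else through do_IRQ().
 */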
2660 vector = __vmread(VM_EXIT_INTR_INFO);
2661 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
2663 vector &= INTR_INFO_VECTOR_MASK;
2664 HVMTRACE_1D(INTR, current, vector);
2666 switch ( vector )
2668 case LOCAL_TIMER_VECTOR:
2669 smp_apic_timer_interrupt(regs);
2670 break;
2671 case EVENT_CHECK_VECTOR:
2672 smp_event_check_interrupt();
2673 break;
2674 case INVALIDATE_TLB_VECTOR:
2675 smp_invalidate_interrupt();
2676 break;
2677 case CALL_FUNCTION_VECTOR:
2678 smp_call_function_interrupt();
2679 break;
2680 case SPURIOUS_APIC_VECTOR:
2681 smp_spurious_interrupt(regs);
2682 break;
2683 case ERROR_APIC_VECTOR:
2684 smp_error_interrupt(regs);
2685 break;
2686 case PMU_APIC_VECTOR:
2687 smp_pmu_apic_interrupt(regs);
2688 break;
2689 #ifdef CONFIG_X86_MCE_P4THERMAL
2690 case THERMAL_APIC_VECTOR:
2691 smp_thermal_interrupt(regs);
2692 break;
2693 #endif
2694 default:
2695 regs->entry_vector = vector;
2696 do_IRQ(regs);
2697 break;
2701 static void wbinvd_ipi(void *info)
2703 wbinvd();
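/*
 * Cache flushes only need to reach real hardware when the domain has
 * passthrough devices assigned (non-empty pdev_list). With WBINVD exiting
 * available the flush is broadcast to all CPUs, presumably because the vCPU
 * may have run, and dirtied caches, elsewhere; otherwise flush locally.
 */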
2706 void vmx_wbinvd_intercept(void)
2708 if ( list_empty(&(domain_hvm_iommu(current->domain)->pdev_list)) )
2709 return;
2711 if ( cpu_has_wbinvd_exiting )
2712 on_each_cpu(wbinvd_ipi, NULL, 1, 1);
2713 else
2714 wbinvd();
2717 static void vmx_failed_vmentry(unsigned int exit_reason,
2718 struct cpu_user_regs *regs)
2720 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2721 unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
2722 struct vcpu *curr = current;
2724 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2725 switch ( failed_vmentry_reason )
2727 case EXIT_REASON_INVALID_GUEST_STATE:
2728 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2729 break;
2730 case EXIT_REASON_MSR_LOADING:
2731 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2732 break;
2733 case EXIT_REASON_MACHINE_CHECK:
2734 printk("caused by machine check.\n");
2735 HVMTRACE_0D(MCE, curr);
2736 do_machine_check(regs);
2737 break;
2738 default:
2739 printk("reason not known yet!");
2740 break;
2743 printk("************* VMCS Area **************\n");
2744 vmcs_dump_vcpu(curr);
2745 printk("**************************************\n");
2747 domain_crash(curr->domain);
2750 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2752 unsigned int exit_reason, idtv_info;
2753 unsigned long exit_qualification, inst_len = 0;
2754 struct vcpu *v = current;
2756 exit_reason = __vmread(VM_EXIT_REASON);
2758 hvmtrace_vmexit(v, regs->eip, exit_reason);
2760 perfc_incra(vmexits, exit_reason);
2762 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2763 local_irq_enable();
2765 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2766 return vmx_failed_vmentry(exit_reason, regs);
2768 hvm_maybe_deassert_evtchn_irq();
2770 /* Event delivery caused this intercept? Queue for redelivery. */
2771 idtv_info = __vmread(IDT_VECTORING_INFO);
2772 if ( unlikely(idtv_info & INTR_INFO_VALID_MASK) &&
2773 (exit_reason != EXIT_REASON_TASK_SWITCH) )
2775 if ( hvm_event_needs_reinjection((idtv_info>>8)&7, idtv_info&0xff) )
2777 /* See SDM 3B 25.7.1.1 and .2 for info about masking resvd bits. */
2778 __vmwrite(VM_ENTRY_INTR_INFO,
2779 idtv_info & ~INTR_INFO_RESVD_BITS_MASK);
2780 if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK )
2781 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE,
2782 __vmread(IDT_VECTORING_ERROR_CODE));
2785 /*
2786 * Clear NMI-blocking interruptibility info if an NMI delivery faulted.
2787 * Re-delivery will re-set it (see SDM 3B 25.7.1.2).
2788 */
2789 if ( (idtv_info & INTR_INFO_INTR_TYPE_MASK) == (X86_EVENTTYPE_NMI<<8) )
2790 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2791 __vmread(GUEST_INTERRUPTIBILITY_INFO) &
2792 ~VMX_INTR_SHADOW_NMI);
2795 switch ( exit_reason )
2797 case EXIT_REASON_EXCEPTION_NMI:
2799 /*
2800 * Software-interrupt exiting (INT n) is not enabled, so this exit is
2801 * caused by either (1) an exception (e.g. #PF) raised in the guest, or
2802 * (2) an NMI.
2803 */
2804 unsigned int intr_info, vector;
2806 intr_info = __vmread(VM_EXIT_INTR_INFO);
2807 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2809 vector = intr_info & INTR_INFO_VECTOR_MASK;
2811 /*
2812 * Re-set the NMI shadow if vmexit caused by a guest IRET fault (see 3B
2813 * 25.7.1.2, "Resuming Guest Software after Handling an Exception").
2814 * (NB. If we emulate this IRET for any reason, we should re-clear!)
2815 */
2816 if ( unlikely(intr_info & INTR_INFO_NMI_UNBLOCKED_BY_IRET) &&
2817 !(__vmread(IDT_VECTORING_INFO) & INTR_INFO_VALID_MASK) &&
2818 (vector != TRAP_double_fault) )
2819 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2820 __vmread(GUEST_INTERRUPTIBILITY_INFO)|VMX_INTR_SHADOW_NMI);
2822 perfc_incra(cause_vector, vector);
2824 switch ( vector )
2826 case TRAP_debug:
2827 case TRAP_int3:
2828 if ( !v->domain->debugger_attached )
2829 goto exit_and_crash;
2830 domain_pause_for_debugger();
2831 break;
2832 case TRAP_no_device:
2833 vmx_do_no_device_fault();
2834 break;
2835 case TRAP_page_fault:
2836 exit_qualification = __vmread(EXIT_QUALIFICATION);
2837 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2839 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2840 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2841 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2842 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2843 (unsigned long)regs->esi, (unsigned long)regs->edi);
2845 if ( paging_fault(exit_qualification, regs) )
2847 hvmtrace_pf_xen(v, exit_qualification, regs->error_code);
2848 break;
2851 v->arch.hvm_vcpu.guest_cr[2] = exit_qualification;
2852 vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
2853 break;
2854 case TRAP_nmi:
2855 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) !=
2856 (X86_EVENTTYPE_NMI << 8) )
2857 goto exit_and_crash;
2858 HVMTRACE_0D(NMI, v);
2859 do_nmi(regs); /* Real NMI, vector 2: normal processing. */
2860 break;
2861 case TRAP_machine_check:
2862 HVMTRACE_0D(MCE, v);
2863 do_machine_check(regs);
2864 break;
2865 default:
2866 goto exit_and_crash;
2868 break;
2870 case EXIT_REASON_EXTERNAL_INTERRUPT:
2871 vmx_do_extint(regs);
2872 break;
2873 case EXIT_REASON_TRIPLE_FAULT:
2874 hvm_triple_fault();
2875 break;
2876 case EXIT_REASON_PENDING_VIRT_INTR:
2877 /* Disable the interrupt window. */
2878 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2879 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2880 v->arch.hvm_vmx.exec_control);
2881 break;
2882 case EXIT_REASON_PENDING_VIRT_NMI:
2883 /* Disable the NMI window. */
2884 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2885 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2886 v->arch.hvm_vmx.exec_control);
2887 break;
2888 case EXIT_REASON_TASK_SWITCH: {
2889 const enum hvm_task_switch_reason reasons[] = {
2890 TSW_call_or_int, TSW_iret, TSW_jmp, TSW_call_or_int };
2891 int32_t errcode = -1;
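/* Per the SDM, exit-qualification bits 15:0 hold the target TSS selector and
 * bits 31:30 the switch source (CALL/INT, IRET, JMP, or IDT task gate),
 * which reasons[] maps onto hvm_task_switch_reason. */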
2892 exit_qualification = __vmread(EXIT_QUALIFICATION);
2893 if ( (idtv_info & INTR_INFO_VALID_MASK) &&
2894 (idtv_info & INTR_INFO_DELIVER_CODE_MASK) )
2895 errcode = __vmread(IDT_VECTORING_ERROR_CODE);
2896 hvm_task_switch((uint16_t)exit_qualification,
2897 reasons[(exit_qualification >> 30) & 3],
2898 errcode);
2899 break;
2901 case EXIT_REASON_CPUID:
2902 inst_len = __get_instruction_length(); /* Safe: CPUID */
2903 __update_guest_eip(inst_len);
2904 vmx_do_cpuid(regs);
2905 break;
2906 case EXIT_REASON_HLT:
2907 inst_len = __get_instruction_length(); /* Safe: HLT */
2908 __update_guest_eip(inst_len);
2909 vmx_do_hlt(regs);
2910 break;
2911 case EXIT_REASON_INVLPG:
2913 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2914 __update_guest_eip(inst_len);
2915 exit_qualification = __vmread(EXIT_QUALIFICATION);
2916 vmx_do_invlpg(exit_qualification);
2917 break;
2919 case EXIT_REASON_VMCALL:
2921 int rc;
2922 HVMTRACE_1D(VMMCALL, v, regs->eax);
2923 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2924 rc = hvm_do_hypercall(regs);
2925 if ( rc != HVM_HCALL_preempted )
2927 __update_guest_eip(inst_len);
2928 if ( rc == HVM_HCALL_invalidate )
2929 send_invalidate_req();
2931 break;
2933 case EXIT_REASON_CR_ACCESS:
2935 exit_qualification = __vmread(EXIT_QUALIFICATION);
2936 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2937 if ( vmx_cr_access(exit_qualification, regs) )
2938 __update_guest_eip(inst_len);
2939 break;
2941 case EXIT_REASON_DR_ACCESS:
2942 exit_qualification = __vmread(EXIT_QUALIFICATION);
2943 vmx_dr_access(exit_qualification, regs);
2944 break;
2945 case EXIT_REASON_IO_INSTRUCTION:
2946 exit_qualification = __vmread(EXIT_QUALIFICATION);
2947 inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
2948 vmx_io_instruction(exit_qualification, inst_len);
2949 break;
2950 case EXIT_REASON_MSR_READ:
2951 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2952 if ( vmx_msr_read_intercept(regs) )
2953 __update_guest_eip(inst_len);
2954 break;
2955 case EXIT_REASON_MSR_WRITE:
2956 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2957 if ( vmx_msr_write_intercept(regs) )
2958 __update_guest_eip(inst_len);
2959 break;
2961 case EXIT_REASON_MWAIT_INSTRUCTION:
2962 case EXIT_REASON_MONITOR_INSTRUCTION:
2963 case EXIT_REASON_VMCLEAR:
2964 case EXIT_REASON_VMLAUNCH:
2965 case EXIT_REASON_VMPTRLD:
2966 case EXIT_REASON_VMPTRST:
2967 case EXIT_REASON_VMREAD:
2968 case EXIT_REASON_VMRESUME:
2969 case EXIT_REASON_VMWRITE:
2970 case EXIT_REASON_VMXOFF:
2971 case EXIT_REASON_VMXON:
2972 vmx_inject_hw_exception(v, TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
2973 break;
2975 case EXIT_REASON_TPR_BELOW_THRESHOLD:
2976 break;
2978 case EXIT_REASON_APIC_ACCESS:
2980 unsigned long offset;
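/* Bits 11:0 of the exit qualification give the offset of the access within
 * the APIC page; hand it to the MMIO emulator at the default APIC base. */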
2981 exit_qualification = __vmread(EXIT_QUALIFICATION);
2982 offset = exit_qualification & 0x0fffUL;
2983 handle_mmio(APIC_DEFAULT_PHYS_BASE | offset);
2984 break;
2987 case EXIT_REASON_INVD:
2988 case EXIT_REASON_WBINVD:
2990 inst_len = __get_instruction_length(); /* Safe: INVD, WBINVD */
2991 __update_guest_eip(inst_len);
2992 vmx_wbinvd_intercept();
2993 break;
2996 default:
2997 exit_and_crash:
2998 gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
2999 domain_crash(v->domain);
3000 break;
3004 asmlinkage void vmx_trace_vmentry(void)
3006 struct vcpu *v = current;
3008 hvmtrace_vmentry(v);
3011 /*
3012 * Local variables:
3013 * mode: C
3014 * c-set-style: "BSD"
3015 * c-basic-offset: 4
3016 * tab-width: 4
3017 * indent-tabs-mode: nil
3018 * End:
3019 */