debuggers.hg

view xen/arch/x86/hvm/vmx/vmx.c @ 13711:6e3decbd3e56

[HVM] When guest explicitly modifies the TSC, update platform timers
_after_ updating guest time (since otherwise pt_reset() will get the
old hvm_get_guest_time() and subsequently pt_intr_post() will warp
the TSC back again).

This is the smallest fix for the HVM SMP TSC issues seen on recent
(viz. 2.6.18+) Linux HVM kernels, but probably not the best. A cleaner
fix needs more thought.
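
A minimal sketch of the ordering described above, in the guest's TSC-write
(WRMSR) path. The case label, the msr_content name and the surrounding switch
are illustrative only, not taken from this changeset; hvm_set_guest_time() and
pt_reset() are the interfaces the message refers to:

    case MSR_IA32_TIME_STAMP_COUNTER:
        /* Update guest time from the newly written TSC value first... */
        hvm_set_guest_time(v, msr_content);
        /* ...so the platform timers resync against the new guest time,
         * rather than the stale hvm_get_guest_time() value. */
        pt_reset(v);
        break;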

Signed-off-by: Steven Hand <steven@xensource.com>
author shand@cosworth.eng.hq.xensource.com
date Tue Jan 30 00:52:51 2007 +0000 (2007-01-30)
parents fde9e1d474b7
children ffcd586dbaae
line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <xen/hypercall.h>
29 #include <xen/perfc.h>
30 #include <asm/current.h>
31 #include <asm/io.h>
32 #include <asm/regs.h>
33 #include <asm/cpufeature.h>
34 #include <asm/processor.h>
35 #include <asm/types.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/hvm/hvm.h>
39 #include <asm/hvm/support.h>
40 #include <asm/hvm/vmx/vmx.h>
41 #include <asm/hvm/vmx/vmcs.h>
42 #include <asm/hvm/vmx/cpu.h>
43 #include <asm/shadow.h>
44 #include <public/sched.h>
45 #include <public/hvm/ioreq.h>
46 #include <asm/hvm/vpic.h>
47 #include <asm/hvm/vlapic.h>
48 #include <asm/x86_emulate.h>
49 #include <asm/hvm/vpt.h>
50 #include <public/hvm/save.h>
52 static void vmx_ctxt_switch_from(struct vcpu *v);
53 static void vmx_ctxt_switch_to(struct vcpu *v);
55 static int vmx_vcpu_initialise(struct vcpu *v)
56 {
57 int rc;
59 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
61 v->arch.schedule_tail = arch_vmx_do_resume;
62 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
63 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
65 if ( (rc = vmx_create_vmcs(v)) != 0 )
66 {
67 dprintk(XENLOG_WARNING,
68 "Failed to create VMCS for vcpu %d: err=%d.\n",
69 v->vcpu_id, rc);
70 return rc;
71 }
73 return 0;
74 }
76 static void vmx_vcpu_destroy(struct vcpu *v)
77 {
78 vmx_destroy_vmcs(v);
79 }
81 #ifdef __x86_64__
83 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
85 static u32 msr_index[VMX_MSR_COUNT] =
86 {
87 MSR_LSTAR, MSR_STAR, MSR_CSTAR,
88 MSR_SYSCALL_MASK, MSR_EFER,
89 };
91 static void vmx_save_host_msrs(void)
92 {
93 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
94 int i;
96 for ( i = 0; i < VMX_MSR_COUNT; i++ )
97 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
98 }
100 #define WRITE_MSR(address) \
101 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
102 if ( !test_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags) )\
103 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
104 wrmsrl(MSR_ ## address, msr_content); \
105 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
106 break
108 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
109 {
110 u64 msr_content = 0;
111 struct vcpu *v = current;
112 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
114 switch ( (u32)regs->ecx ) {
115 case MSR_EFER:
116 HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content 0x%"PRIx64, msr_content);
117 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_EFER];
118 break;
120 case MSR_FS_BASE:
121 msr_content = __vmread(GUEST_FS_BASE);
122 goto check_long_mode;
124 case MSR_GS_BASE:
125 msr_content = __vmread(GUEST_GS_BASE);
126 goto check_long_mode;
128 case MSR_SHADOW_GS_BASE:
129 msr_content = guest_msr_state->shadow_gs;
130 check_long_mode:
131 if ( !(vmx_long_mode_enabled(v)) )
132 {
133 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
134 return 0;
135 }
136 break;
138 case MSR_STAR:
139 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
140 break;
142 case MSR_LSTAR:
143 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
144 break;
146 case MSR_CSTAR:
147 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_CSTAR];
148 break;
150 case MSR_SYSCALL_MASK:
151 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
152 break;
154 default:
155 return 0;
156 }
158 HVM_DBG_LOG(DBG_LEVEL_2, "msr_content: 0x%"PRIx64, msr_content);
160 regs->eax = (u32)(msr_content >> 0);
161 regs->edx = (u32)(msr_content >> 32);
163 return 1;
164 }
166 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
167 {
168 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
169 u32 ecx = regs->ecx;
170 struct vcpu *v = current;
171 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
172 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
174 HVM_DBG_LOG(DBG_LEVEL_1, "msr 0x%x msr_content 0x%"PRIx64"\n",
175 ecx, msr_content);
177 switch ( ecx )
178 {
179 case MSR_EFER:
180 /* offending reserved bit will cause #GP */
181 if ( msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE) )
182 {
183 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
184 "EFER: %"PRIx64"\n", msr_content);
185 goto gp_fault;
186 }
188 if ( (msr_content & EFER_LME)
189 && !(guest_msr_state->msrs[VMX_INDEX_MSR_EFER] & EFER_LME) )
190 {
191 if ( unlikely(vmx_paging_enabled(v)) )
192 {
193 gdprintk(XENLOG_WARNING,
194 "Trying to set EFER.LME with paging enabled\n");
195 goto gp_fault;
196 }
197 }
198 else if ( !(msr_content & EFER_LME)
199 && (guest_msr_state->msrs[VMX_INDEX_MSR_EFER] & EFER_LME) )
200 {
201 if ( unlikely(vmx_paging_enabled(v)) )
202 {
203 gdprintk(XENLOG_WARNING,
204 "Trying to clear EFER.LME with paging enabled\n");
205 goto gp_fault;
206 }
207 }
209 guest_msr_state->msrs[VMX_INDEX_MSR_EFER] = msr_content;
210 break;
212 case MSR_FS_BASE:
213 case MSR_GS_BASE:
214 case MSR_SHADOW_GS_BASE:
215 if ( !vmx_long_mode_enabled(v) )
216 goto gp_fault;
218 if ( !is_canonical_address(msr_content) )
219 goto uncanonical_address;
221 if ( ecx == MSR_FS_BASE )
222 __vmwrite(GUEST_FS_BASE, msr_content);
223 else if ( ecx == MSR_GS_BASE )
224 __vmwrite(GUEST_GS_BASE, msr_content);
225 else
226 {
227 v->arch.hvm_vmx.msr_state.shadow_gs = msr_content;
228 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
229 }
231 break;
233 case MSR_STAR:
234 WRITE_MSR(STAR);
236 case MSR_LSTAR:
237 if ( !is_canonical_address(msr_content) )
238 goto uncanonical_address;
239 WRITE_MSR(LSTAR);
241 case MSR_CSTAR:
242 if ( !is_canonical_address(msr_content) )
243 goto uncanonical_address;
244 WRITE_MSR(CSTAR);
246 case MSR_SYSCALL_MASK:
247 WRITE_MSR(SYSCALL_MASK);
249 default:
250 return 0;
251 }
253 return 1;
255 uncanonical_address:
256 HVM_DBG_LOG(DBG_LEVEL_1, "Not cano address of msr write %x\n", ecx);
257 gp_fault:
258 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
259 return 0;
260 }
262 /*
263 * To avoid MSR save/restore at every VM exit/entry time, we restore
264 * the x86_64 specific MSRs at domain switch time. Since these MSRs
265 * are not modified once set for para domains, we don't save them,
266 * but simply reset them to values set in percpu_traps_init().
267 */
268 static void vmx_restore_host_msrs(void)
269 {
270 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
271 int i;
273 while ( host_msr_state->flags )
274 {
275 i = find_first_set_bit(host_msr_state->flags);
276 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
277 clear_bit(i, &host_msr_state->flags);
278 }
279 }
281 static void vmx_save_guest_msrs(struct vcpu *v)
282 {
283 /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
284 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.msr_state.shadow_gs);
285 }
287 static void vmx_restore_guest_msrs(struct vcpu *v)
288 {
289 struct vmx_msr_state *guest_msr_state, *host_msr_state;
290 unsigned long guest_flags;
291 int i;
293 guest_msr_state = &v->arch.hvm_vmx.msr_state;
294 host_msr_state = &this_cpu(host_msr_state);
296 wrmsrl(MSR_SHADOW_GS_BASE, guest_msr_state->shadow_gs);
298 guest_flags = guest_msr_state->flags;
299 if ( !guest_flags )
300 return;
302 while ( guest_flags ) {
303 i = find_first_set_bit(guest_flags);
305 HVM_DBG_LOG(DBG_LEVEL_2,
306 "restore guest's index %d msr %x with value %lx",
307 i, msr_index[i], guest_msr_state->msrs[i]);
308 set_bit(i, &host_msr_state->flags);
309 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
310 clear_bit(i, &guest_flags);
311 }
312 }
314 #else /* __i386__ */
316 #define vmx_save_host_msrs() ((void)0)
317 #define vmx_restore_host_msrs() ((void)0)
318 #define vmx_save_guest_msrs(v) ((void)0)
319 #define vmx_restore_guest_msrs(v) ((void)0)
321 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
322 {
323 return 0;
324 }
326 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
327 {
328 return 0;
329 }
331 #endif /* __i386__ */
333 #define loaddebug(_v,_reg) \
334 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
335 #define savedebug(_v,_reg) \
336 __asm__ __volatile__ ("mov %%db" #_reg ",%0" : : "r" ((_v)->debugreg[_reg]))
338 static inline void vmx_save_dr(struct vcpu *v)
339 {
340 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
341 return;
343 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
344 v->arch.hvm_vcpu.flag_dr_dirty = 0;
345 v->arch.hvm_vcpu.u.vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
346 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vcpu.u.vmx.exec_control);
348 savedebug(&v->arch.guest_context, 0);
349 savedebug(&v->arch.guest_context, 1);
350 savedebug(&v->arch.guest_context, 2);
351 savedebug(&v->arch.guest_context, 3);
352 savedebug(&v->arch.guest_context, 6);
353 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
354 }
356 static inline void __restore_debug_registers(struct vcpu *v)
357 {
358 loaddebug(&v->arch.guest_context, 0);
359 loaddebug(&v->arch.guest_context, 1);
360 loaddebug(&v->arch.guest_context, 2);
361 loaddebug(&v->arch.guest_context, 3);
362 /* No 4 and 5 */
363 loaddebug(&v->arch.guest_context, 6);
364 /* DR7 is loaded from the VMCS. */
365 }
367 static int __get_instruction_length(void);
368 int vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
369 {
370 unsigned long inst_len;
372 inst_len = __get_instruction_length();
373 c->eip = __vmread(GUEST_RIP);
375 #ifdef HVM_DEBUG_SUSPEND
376 printk("vmx_vmcs_save: inst_len=0x%lx, eip=0x%"PRIx64".\n",
377 inst_len, c->eip);
378 #endif
380 c->esp = __vmread(GUEST_RSP);
381 c->eflags = __vmread(GUEST_RFLAGS);
383 c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
384 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
385 c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
387 #ifdef HVM_DEBUG_SUSPEND
388 printk("vmx_vmcs_save: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
389 c->cr3,
390 c->cr0,
391 c->cr4);
392 #endif
394 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
395 c->idtr_base = __vmread(GUEST_IDTR_BASE);
397 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
398 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
400 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
401 c->cs_limit = __vmread(GUEST_CS_LIMIT);
402 c->cs_base = __vmread(GUEST_CS_BASE);
403 c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES);
405 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
406 c->ds_limit = __vmread(GUEST_DS_LIMIT);
407 c->ds_base = __vmread(GUEST_DS_BASE);
408 c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES);
410 c->es_sel = __vmread(GUEST_ES_SELECTOR);
411 c->es_limit = __vmread(GUEST_ES_LIMIT);
412 c->es_base = __vmread(GUEST_ES_BASE);
413 c->es_arbytes = __vmread(GUEST_ES_AR_BYTES);
415 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
416 c->ss_limit = __vmread(GUEST_SS_LIMIT);
417 c->ss_base = __vmread(GUEST_SS_BASE);
418 c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES);
420 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
421 c->fs_limit = __vmread(GUEST_FS_LIMIT);
422 c->fs_base = __vmread(GUEST_FS_BASE);
423 c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES);
425 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
426 c->gs_limit = __vmread(GUEST_GS_LIMIT);
427 c->gs_base = __vmread(GUEST_GS_BASE);
428 c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES);
430 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
431 c->tr_limit = __vmread(GUEST_TR_LIMIT);
432 c->tr_base = __vmread(GUEST_TR_BASE);
433 c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES);
435 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
436 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
437 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
438 c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES);
440 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
441 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
442 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
444 return 1;
445 }
447 int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
448 {
449 unsigned long mfn, old_base_mfn;
451 vmx_vmcs_enter(v);
453 __vmwrite(GUEST_RIP, c->eip);
454 __vmwrite(GUEST_RSP, c->esp);
455 __vmwrite(GUEST_RFLAGS, c->eflags);
457 v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
458 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
460 #ifdef HVM_DEBUG_SUSPEND
461 printk("vmx_vmcs_restore: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
462 c->cr3,
463 c->cr0,
464 c->cr4);
465 #endif
467 if (!vmx_paging_enabled(v)) {
468 printk("vmx_vmcs_restore: paging not enabled.");
469 goto skip_cr3;
470 }
472 if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
473 /*
474 * This is a simple TLB flush, implying the guest has
475 * removed some translation or changed page attributes.
476 * We simply invalidate the shadow.
477 */
478 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
479 if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
480 goto bad_cr3;
481 }
482 } else {
483 /*
484 * If different, make a shadow. Check if the PDBR is valid
485 * first.
486 */
487 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %"PRIx64"", c->cr3);
488 /* current!=vcpu as not called by arch_vmx_do_launch */
489 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
490 if( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain)) {
491 goto bad_cr3;
492 }
493 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
494 v->arch.guest_table = pagetable_from_pfn(mfn);
495 if (old_base_mfn)
496 put_page(mfn_to_page(old_base_mfn));
497 /*
498 * arch.shadow_table should now hold the next CR3 for shadow
499 */
500 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
501 }
503 skip_cr3:
504 #if defined(__x86_64__)
505 if (vmx_long_mode_enabled(v)) {
506 unsigned long vm_entry_value;
507 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
508 vm_entry_value |= VM_ENTRY_IA32E_MODE;
509 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
510 }
511 #endif
513 __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
514 v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
515 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
517 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
518 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
520 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
521 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
523 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
524 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
525 __vmwrite(GUEST_CS_BASE, c->cs_base);
526 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
528 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
529 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
530 __vmwrite(GUEST_DS_BASE, c->ds_base);
531 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
533 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
534 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
535 __vmwrite(GUEST_ES_BASE, c->es_base);
536 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
538 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
539 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
540 __vmwrite(GUEST_SS_BASE, c->ss_base);
541 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
543 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
544 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
545 __vmwrite(GUEST_FS_BASE, c->fs_base);
546 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
548 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
549 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
550 __vmwrite(GUEST_GS_BASE, c->gs_base);
551 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
553 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
554 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
555 __vmwrite(GUEST_TR_BASE, c->tr_base);
556 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
558 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
559 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
560 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
561 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
563 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
564 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
565 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
567 vmx_vmcs_exit(v);
569 shadow_update_paging_modes(v);
570 return 0;
572 bad_cr3:
573 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%"PRIx64"", c->cr3);
574 vmx_vmcs_exit(v);
575 return -EINVAL;
576 }
578 #ifdef HVM_DEBUG_SUSPEND
579 static void dump_msr_state(struct vmx_msr_state *m)
580 {
581 int i = 0;
582 printk("**** msr state ****\n");
583 printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
584 for (i = 0; i < VMX_MSR_COUNT; i++)
585 printk("0x%lx,", m->msrs[i]);
586 printk("\n");
587 }
588 #else
589 static void dump_msr_state(struct vmx_msr_state *m)
590 {
591 }
592 #endif
594 void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
595 {
596 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
597 unsigned long guest_flags = guest_state->flags;
598 int i = 0;
600 data->shadow_gs = guest_state->shadow_gs;
601 data->vmxassist_enabled = v->arch.hvm_vmx.vmxassist_enabled;
602 /* save msrs */
603 data->flags = guest_flags;
604 for (i = 0; i < VMX_MSR_COUNT; i++)
605 data->msr_items[i] = guest_state->msrs[i];
607 data->tsc = hvm_get_guest_time(v);
609 dump_msr_state(guest_state);
610 }
612 void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
613 {
614 int i = 0;
615 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
617 /* restore msrs */
618 guest_state->flags = data->flags;
619 for (i = 0; i < VMX_MSR_COUNT; i++)
620 guest_state->msrs[i] = data->msr_items[i];
622 guest_state->shadow_gs = data->shadow_gs;
624 /* XXX: no need to restore msrs; current != vcpu as not called by arch_vmx_do_launch */
625 /* vmx_restore_guest_msrs(v);*/
627 v->arch.hvm_vmx.vmxassist_enabled = data->vmxassist_enabled;
629 hvm_set_guest_time(v, data->tsc);
631 dump_msr_state(guest_state);
632 }
635 void vmx_save_vmcs_ctxt(hvm_domain_context_t *h, void *opaque)
636 {
637 struct vcpu *v = opaque;
638 struct hvm_hw_cpu ctxt;
640 vmx_save_cpu_state(v, &ctxt);
641 vmx_vmcs_enter(v);
642 vmx_vmcs_save(v, &ctxt);
643 vmx_vmcs_exit(v);
645 hvm_put_struct(h, &ctxt);
646 }
648 int vmx_load_vmcs_ctxt(hvm_domain_context_t *h, void *opaque, int version)
649 {
650 struct vcpu *v = opaque;
651 struct hvm_hw_cpu ctxt;
653 if (version != 1)
654 return -EINVAL;
656 hvm_get_struct(h, &ctxt);
657 vmx_load_cpu_state(v, &ctxt);
658 if (vmx_vmcs_restore(v, &ctxt)) {
659 printk("vmx_vmcs restore failed!\n");
660 domain_crash(v->domain);
661 return -EINVAL;
662 }
664 return 0;
665 }
667 /*
668 * DR7 is saved and restored on every vmexit. Other debug registers only
669 * need to be restored if their value is going to affect execution -- i.e.,
670 * if one of the breakpoints is enabled. So mask out all bits that don't
671 * enable some breakpoint functionality.
672 */
673 #define DR7_ACTIVE_MASK 0xff
675 static inline void vmx_restore_dr(struct vcpu *v)
676 {
677 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
678 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
679 __restore_debug_registers(v);
680 }
682 static void vmx_ctxt_switch_from(struct vcpu *v)
683 {
684 vmx_save_guest_msrs(v);
685 vmx_restore_host_msrs();
686 vmx_save_dr(v);
687 }
689 static void vmx_ctxt_switch_to(struct vcpu *v)
690 {
691 vmx_restore_guest_msrs(v);
692 vmx_restore_dr(v);
693 }
695 static void stop_vmx(void)
696 {
697 if ( !(read_cr4() & X86_CR4_VMXE) )
698 return;
700 __vmxoff();
701 clear_in_cr4(X86_CR4_VMXE);
702 }
704 static void vmx_store_cpu_guest_regs(
705 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
706 {
707 vmx_vmcs_enter(v);
709 if ( regs != NULL )
710 {
711 regs->eflags = __vmread(GUEST_RFLAGS);
712 regs->ss = __vmread(GUEST_SS_SELECTOR);
713 regs->cs = __vmread(GUEST_CS_SELECTOR);
714 regs->eip = __vmread(GUEST_RIP);
715 regs->esp = __vmread(GUEST_RSP);
716 }
718 if ( crs != NULL )
719 {
720 crs[0] = v->arch.hvm_vmx.cpu_shadow_cr0;
721 crs[2] = v->arch.hvm_vmx.cpu_cr2;
722 crs[3] = __vmread(GUEST_CR3);
723 crs[4] = v->arch.hvm_vmx.cpu_shadow_cr4;
724 }
726 vmx_vmcs_exit(v);
727 }
729 static void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
730 {
731 unsigned long base;
733 vmx_vmcs_enter(v);
735 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
736 __vmwrite(GUEST_RSP, regs->esp);
738 /* NB. Bit 1 of RFLAGS must be set for VMENTRY to succeed. */
739 __vmwrite(GUEST_RFLAGS, regs->eflags | 2UL);
741 if ( regs->eflags & EF_TF )
742 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
743 else
744 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
746 if ( regs->eflags & EF_VM )
747 {
748 /*
749 * The VMX spec (section 4.3.1.2, Checks on Guest Segment
750 * Registers) says that virtual-8086 mode guests' segment
751 * base-address fields in the VMCS must be equal to their
752 * corresponding segment selector field shifted right by
753 * four bits upon vmentry.
754 */
755 base = __vmread(GUEST_CS_BASE);
756 if ( (regs->cs << 4) != base )
757 __vmwrite(GUEST_CS_BASE, regs->cs << 4);
758 base = __vmread(GUEST_SS_BASE);
759 if ( (regs->ss << 4) != base )
760 __vmwrite(GUEST_SS_BASE, regs->ss << 4);
761 }
763 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
764 __vmwrite(GUEST_RIP, regs->eip);
766 vmx_vmcs_exit(v);
767 }
769 static unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
770 {
771 switch ( num )
772 {
773 case 0:
774 return v->arch.hvm_vmx.cpu_cr0;
775 case 2:
776 return v->arch.hvm_vmx.cpu_cr2;
777 case 3:
778 return v->arch.hvm_vmx.cpu_cr3;
779 case 4:
780 return v->arch.hvm_vmx.cpu_shadow_cr4;
781 default:
782 BUG();
783 }
784 return 0; /* dummy */
785 }
787 static unsigned long vmx_get_segment_base(struct vcpu *v, enum x86_segment seg)
788 {
789 unsigned long base = 0;
790 int long_mode = 0;
792 ASSERT(v == current);
794 #ifdef __x86_64__
795 if ( vmx_long_mode_enabled(v) && (__vmread(GUEST_CS_AR_BYTES) & (1u<<13)) )
796 long_mode = 1;
797 #endif
799 switch ( seg )
800 {
801 case x86_seg_cs: if ( !long_mode ) base = __vmread(GUEST_CS_BASE); break;
802 case x86_seg_ds: if ( !long_mode ) base = __vmread(GUEST_DS_BASE); break;
803 case x86_seg_es: if ( !long_mode ) base = __vmread(GUEST_ES_BASE); break;
804 case x86_seg_fs: base = __vmread(GUEST_FS_BASE); break;
805 case x86_seg_gs: base = __vmread(GUEST_GS_BASE); break;
806 case x86_seg_ss: if ( !long_mode ) base = __vmread(GUEST_SS_BASE); break;
807 case x86_seg_tr: base = __vmread(GUEST_TR_BASE); break;
808 case x86_seg_gdtr: base = __vmread(GUEST_GDTR_BASE); break;
809 case x86_seg_idtr: base = __vmread(GUEST_IDTR_BASE); break;
810 case x86_seg_ldtr: base = __vmread(GUEST_LDTR_BASE); break;
811 default: BUG(); break;
812 }
814 return base;
815 }
817 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
818 struct segment_register *reg)
819 {
820 u16 attr = 0;
822 ASSERT(v == current);
824 switch ( seg )
825 {
826 case x86_seg_cs:
827 reg->sel = __vmread(GUEST_CS_SELECTOR);
828 reg->limit = __vmread(GUEST_CS_LIMIT);
829 reg->base = __vmread(GUEST_CS_BASE);
830 attr = __vmread(GUEST_CS_AR_BYTES);
831 break;
832 case x86_seg_ds:
833 reg->sel = __vmread(GUEST_DS_SELECTOR);
834 reg->limit = __vmread(GUEST_DS_LIMIT);
835 reg->base = __vmread(GUEST_DS_BASE);
836 attr = __vmread(GUEST_DS_AR_BYTES);
837 break;
838 case x86_seg_es:
839 reg->sel = __vmread(GUEST_ES_SELECTOR);
840 reg->limit = __vmread(GUEST_ES_LIMIT);
841 reg->base = __vmread(GUEST_ES_BASE);
842 attr = __vmread(GUEST_ES_AR_BYTES);
843 break;
844 case x86_seg_fs:
845 reg->sel = __vmread(GUEST_FS_SELECTOR);
846 reg->limit = __vmread(GUEST_FS_LIMIT);
847 reg->base = __vmread(GUEST_FS_BASE);
848 attr = __vmread(GUEST_FS_AR_BYTES);
849 break;
850 case x86_seg_gs:
851 reg->sel = __vmread(GUEST_GS_SELECTOR);
852 reg->limit = __vmread(GUEST_GS_LIMIT);
853 reg->base = __vmread(GUEST_GS_BASE);
854 attr = __vmread(GUEST_GS_AR_BYTES);
855 break;
856 case x86_seg_ss:
857 reg->sel = __vmread(GUEST_SS_SELECTOR);
858 reg->limit = __vmread(GUEST_SS_LIMIT);
859 reg->base = __vmread(GUEST_SS_BASE);
860 attr = __vmread(GUEST_SS_AR_BYTES);
861 break;
862 case x86_seg_tr:
863 reg->sel = __vmread(GUEST_TR_SELECTOR);
864 reg->limit = __vmread(GUEST_TR_LIMIT);
865 reg->base = __vmread(GUEST_TR_BASE);
866 attr = __vmread(GUEST_TR_AR_BYTES);
867 break;
868 case x86_seg_gdtr:
869 reg->limit = __vmread(GUEST_GDTR_LIMIT);
870 reg->base = __vmread(GUEST_GDTR_BASE);
871 break;
872 case x86_seg_idtr:
873 reg->limit = __vmread(GUEST_IDTR_LIMIT);
874 reg->base = __vmread(GUEST_IDTR_BASE);
875 break;
876 case x86_seg_ldtr:
877 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
878 reg->limit = __vmread(GUEST_LDTR_LIMIT);
879 reg->base = __vmread(GUEST_LDTR_BASE);
880 attr = __vmread(GUEST_LDTR_AR_BYTES);
881 break;
882 default:
883 BUG();
884 }
886 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
887 }
889 /* Make sure that xen intercepts any FP accesses from current */
890 static void vmx_stts(struct vcpu *v)
891 {
892 /* VMX depends on operating on the current vcpu */
893 ASSERT(v == current);
895 /*
896 * If the guest does not have TS enabled then we must cause and handle an
897 * exception on first use of the FPU. If the guest *does* have TS enabled
898 * then this is not necessary: no FPU activity can occur until the guest
899 * clears CR0.TS, and we will initialise the FPU when that happens.
900 */
901 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
902 {
903 v->arch.hvm_vmx.cpu_cr0 |= X86_CR0_TS;
904 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
905 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
906 }
907 }
909 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
910 {
911 vmx_vmcs_enter(v);
912 __vmwrite(TSC_OFFSET, offset);
913 #if defined (__i386__)
914 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
915 #endif
916 vmx_vmcs_exit(v);
917 }
919 static void vmx_init_ap_context(
920 struct vcpu_guest_context *ctxt, int vcpuid, int trampoline_vector)
921 {
922 memset(ctxt, 0, sizeof(*ctxt));
923 ctxt->user_regs.eip = VMXASSIST_BASE;
924 ctxt->user_regs.edx = vcpuid;
925 ctxt->user_regs.ebx = trampoline_vector;
926 }
928 void do_nmi(struct cpu_user_regs *);
930 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
931 {
932 char *p;
933 int i;
935 memset(hypercall_page, 0, PAGE_SIZE);
937 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
938 {
939 p = (char *)(hypercall_page + (i * 32));
940 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
941 *(u32 *)(p + 1) = i;
942 *(u8 *)(p + 5) = 0x0f; /* vmcall */
943 *(u8 *)(p + 6) = 0x01;
944 *(u8 *)(p + 7) = 0xc1;
945 *(u8 *)(p + 8) = 0xc3; /* ret */
946 }
948 /* Don't support HYPERVISOR_iret at the moment */
949 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
950 }
952 static int vmx_realmode(struct vcpu *v)
953 {
954 unsigned long rflags;
956 ASSERT(v == current);
958 rflags = __vmread(GUEST_RFLAGS);
959 return rflags & X86_EFLAGS_VM;
960 }
962 static int vmx_guest_x86_mode(struct vcpu *v)
963 {
964 unsigned long cs_ar_bytes;
966 ASSERT(v == current);
968 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
970 if ( vmx_long_mode_enabled(v) && (cs_ar_bytes & (1u<<13)) )
971 return 8;
973 if ( vmx_realmode(v) )
974 return 2;
976 return ((cs_ar_bytes & (1u<<14)) ? 4 : 2);
977 }
979 static int vmx_pae_enabled(struct vcpu *v)
980 {
981 unsigned long cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
982 return (vmx_paging_enabled(v) && (cr4 & X86_CR4_PAE));
983 }
985 static void vmx_update_host_cr3(struct vcpu *v)
986 {
987 ASSERT( (v == current) || !vcpu_runnable(v) );
988 vmx_vmcs_enter(v);
989 __vmwrite(HOST_CR3, v->arch.cr3);
990 vmx_vmcs_exit(v);
991 }
993 static void vmx_update_guest_cr3(struct vcpu *v)
994 {
995 ASSERT( (v == current) || !vcpu_runnable(v) );
996 vmx_vmcs_enter(v);
997 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
998 vmx_vmcs_exit(v);
999 }
1002 static void vmx_inject_exception(
1003 unsigned int trapnr, int errcode, unsigned long cr2)
1005 struct vcpu *v = current;
1006 vmx_inject_hw_exception(v, trapnr, errcode);
1007 if ( trapnr == TRAP_page_fault )
1008 v->arch.hvm_vmx.cpu_cr2 = cr2;
1011 /* Setup HVM interfaces */
1012 static void vmx_setup_hvm_funcs(void)
1014 hvm_funcs.disable = stop_vmx;
1016 hvm_funcs.vcpu_initialise = vmx_vcpu_initialise;
1017 hvm_funcs.vcpu_destroy = vmx_vcpu_destroy;
1019 hvm_funcs.store_cpu_guest_regs = vmx_store_cpu_guest_regs;
1020 hvm_funcs.load_cpu_guest_regs = vmx_load_cpu_guest_regs;
1022 hvm_funcs.save_cpu_ctxt = vmx_save_vmcs_ctxt;
1023 hvm_funcs.load_cpu_ctxt = vmx_load_vmcs_ctxt;
1025 hvm_funcs.paging_enabled = vmx_paging_enabled;
1026 hvm_funcs.long_mode_enabled = vmx_long_mode_enabled;
1027 hvm_funcs.pae_enabled = vmx_pae_enabled;
1028 hvm_funcs.guest_x86_mode = vmx_guest_x86_mode;
1029 hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
1030 hvm_funcs.get_segment_base = vmx_get_segment_base;
1031 hvm_funcs.get_segment_register = vmx_get_segment_register;
1033 hvm_funcs.update_host_cr3 = vmx_update_host_cr3;
1034 hvm_funcs.update_guest_cr3 = vmx_update_guest_cr3;
1036 hvm_funcs.stts = vmx_stts;
1037 hvm_funcs.set_tsc_offset = vmx_set_tsc_offset;
1039 hvm_funcs.inject_exception = vmx_inject_exception;
1041 hvm_funcs.init_ap_context = vmx_init_ap_context;
1043 hvm_funcs.init_hypercall_page = vmx_init_hypercall_page;
1046 int start_vmx(void)
1048 u32 eax, edx;
1049 struct vmcs_struct *vmcs;
1051 /*
1052 * Xen does not fill x86_capability words except 0.
1053 */
1054 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1056 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1057 return 0;
1059 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
1061 if ( eax & IA32_FEATURE_CONTROL_MSR_LOCK )
1063 if ( (eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0 )
1065 printk("VMX disabled by Feature Control MSR.\n");
1066 return 0;
1069 else
1071 wrmsr(IA32_FEATURE_CONTROL_MSR,
1072 IA32_FEATURE_CONTROL_MSR_LOCK |
1073 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
1076 set_in_cr4(X86_CR4_VMXE);
1078 vmx_init_vmcs_config();
1080 if ( smp_processor_id() == 0 )
1081 setup_vmcs_dump();
1083 if ( (vmcs = vmx_alloc_host_vmcs()) == NULL )
1085 clear_in_cr4(X86_CR4_VMXE);
1086 printk("Failed to allocate host VMCS\n");
1087 return 0;
1090 if ( __vmxon(virt_to_maddr(vmcs)) )
1092 clear_in_cr4(X86_CR4_VMXE);
1093 printk("VMXON failed\n");
1094 vmx_free_host_vmcs(vmcs);
1095 return 0;
1098 printk("VMXON is done\n");
1100 vmx_save_host_msrs();
1102 vmx_setup_hvm_funcs();
1104 hvm_enable();
1106 return 1;
1109 /*
1110 * Not all cases receive a valid value in the VM-exit instruction length field.
1111 * Callers must know what they're doing!
1112 */
1113 static int __get_instruction_length(void)
1115 int len;
1116 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1117 BUG_ON((len < 1) || (len > 15));
1118 return len;
1121 static void inline __update_guest_eip(unsigned long inst_len)
1123 unsigned long current_eip;
1125 current_eip = __vmread(GUEST_RIP);
1126 __vmwrite(GUEST_RIP, current_eip + inst_len);
1127 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
1130 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
1132 int result;
1134 #if 0 /* keep for debugging */
1136 unsigned long eip, cs;
1138 cs = __vmread(GUEST_CS_BASE);
1139 eip = __vmread(GUEST_RIP);
1140 HVM_DBG_LOG(DBG_LEVEL_VMMU,
1141 "vmx_do_page_fault = 0x%lx, cs_base=%lx, "
1142 "eip = %lx, error_code = %lx\n",
1143 va, cs, eip, (unsigned long)regs->error_code);
1145 #endif
1147 result = shadow_fault(va, regs);
1149 TRACE_VMEXIT(2, result);
1150 #if 0
1151 if ( !result )
1153 eip = __vmread(GUEST_RIP);
1154 printk("vmx pgfault to guest va=%lx eip=%lx\n", va, eip);
1156 #endif
1158 return result;
1161 static void vmx_do_no_device_fault(void)
1163 struct vcpu *v = current;
1165 setup_fpu(current);
1166 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1168 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1169 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
1171 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS;
1172 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1176 #define bitmaskof(idx) (1U << ((idx) & 31))
1177 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1179 unsigned int input = (unsigned int)regs->eax;
1180 unsigned int count = (unsigned int)regs->ecx;
1181 unsigned int eax, ebx, ecx, edx;
1183 if ( input == 0x00000004 )
1185 cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
1186 eax &= NUM_CORES_RESET_MASK;
1188 else if ( input == 0x40000003 )
1190 /*
1191 * NB. Unsupported interface for private use of VMXASSIST only.
1192 * Note that this leaf lives at <max-hypervisor-leaf> + 1.
1193 */
1194 u64 value = ((u64)regs->edx << 32) | (u32)regs->ecx;
1195 unsigned long mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1196 struct vcpu *v = current;
1197 char *p;
1199 gdprintk(XENLOG_INFO, "Input address is 0x%"PRIx64".\n", value);
1201 /* 8-byte aligned valid pseudophys address from vmxassist, please. */
1202 if ( (value & 7) || (mfn == INVALID_MFN) ||
1203 !v->arch.hvm_vmx.vmxassist_enabled )
1205 domain_crash(v->domain);
1206 return;
1209 p = map_domain_page(mfn);
1210 value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
1211 unmap_domain_page(p);
1213 gdprintk(XENLOG_INFO, "Output value is 0x%"PRIx64".\n", value);
1214 ecx = (u32)value;
1215 edx = (u32)(value >> 32);
1216 } else {
1217 hvm_cpuid(input, &eax, &ebx, &ecx, &edx);
1219 if ( input == 0x00000001 )
1221 /* Mask off reserved bits. */
1222 ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
1224 ebx &= NUM_THREADS_RESET_MASK;
1226 /* Unsupportable for virtualised CPUs. */
1227 ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
1228 bitmaskof(X86_FEATURE_EST) |
1229 bitmaskof(X86_FEATURE_TM2) |
1230 bitmaskof(X86_FEATURE_CID));
1232 edx &= ~(bitmaskof(X86_FEATURE_HT) |
1233 bitmaskof(X86_FEATURE_ACPI) |
1234 bitmaskof(X86_FEATURE_ACC));
1237 if ( input == 0x00000006 || input == 0x00000009 || input == 0x0000000A )
1238 eax = ebx = ecx = edx = 0x0;
1241 regs->eax = (unsigned long)eax;
1242 regs->ebx = (unsigned long)ebx;
1243 regs->ecx = (unsigned long)ecx;
1244 regs->edx = (unsigned long)edx;
1247 #define CASE_GET_REG_P(REG, reg) \
1248 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
1250 #ifdef __i386__
1251 #define CASE_EXTEND_GET_REG_P
1252 #else
1253 #define CASE_EXTEND_GET_REG_P \
1254 CASE_GET_REG_P(R8, r8); \
1255 CASE_GET_REG_P(R9, r9); \
1256 CASE_GET_REG_P(R10, r10); \
1257 CASE_GET_REG_P(R11, r11); \
1258 CASE_GET_REG_P(R12, r12); \
1259 CASE_GET_REG_P(R13, r13); \
1260 CASE_GET_REG_P(R14, r14); \
1261 CASE_GET_REG_P(R15, r15)
1262 #endif
1264 static void vmx_dr_access(unsigned long exit_qualification,
1265 struct cpu_user_regs *regs)
1267 struct vcpu *v = current;
1269 v->arch.hvm_vcpu.flag_dr_dirty = 1;
1271 /* We could probably be smarter about this */
1272 __restore_debug_registers(v);
1274 /* Allow guest direct access to DR registers */
1275 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1276 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
1277 v->arch.hvm_vcpu.u.vmx.exec_control);
1280 /*
1281 * Invalidate the TLB for va. Invalidate the shadow page corresponding
1282 * to the address va.
1283 */
1284 static void vmx_do_invlpg(unsigned long va)
1286 unsigned long eip;
1287 struct vcpu *v = current;
1289 eip = __vmread(GUEST_RIP);
1291 HVM_DBG_LOG(DBG_LEVEL_VMMU, "eip=%lx, va=%lx",
1292 eip, va);
1294 /*
1295 * We do the safest things first, then try to update the shadow,
1296 * copying from the guest.
1297 */
1298 shadow_invlpg(v, va);
1302 static int vmx_check_descriptor(int long_mode, unsigned long eip, int inst_len,
1303 enum x86_segment seg, unsigned long *base,
1304 u32 *limit, u32 *ar_bytes)
1306 enum vmcs_field ar_field, base_field, limit_field;
1308 *base = 0;
1309 *limit = 0;
1310 if ( seg != x86_seg_es )
1312 unsigned char inst[MAX_INST_LEN];
1313 int i;
1314 extern int inst_copy_from_guest(unsigned char *, unsigned long, int);
1316 if ( !long_mode )
1317 eip += __vmread(GUEST_CS_BASE);
1318 memset(inst, 0, MAX_INST_LEN);
1319 if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len )
1321 gdprintk(XENLOG_ERR, "Failed to fetch guest instruction\n");
1322 domain_crash(current->domain);
1323 return 0;
1326 for ( i = 0; i < inst_len; i++ )
1328 switch ( inst[i] )
1330 case 0xf3: /* REPZ */
1331 case 0xf2: /* REPNZ */
1332 case 0xf0: /* LOCK */
1333 case 0x66: /* data32 */
1334 case 0x67: /* addr32 */
1335 #ifdef __x86_64__
1336 case 0x40 ... 0x4f: /* REX */
1337 #endif
1338 continue;
1339 case 0x2e: /* CS */
1340 seg = x86_seg_cs;
1341 continue;
1342 case 0x36: /* SS */
1343 seg = x86_seg_ss;
1344 continue;
1345 case 0x26: /* ES */
1346 seg = x86_seg_es;
1347 continue;
1348 case 0x64: /* FS */
1349 seg = x86_seg_fs;
1350 continue;
1351 case 0x65: /* GS */
1352 seg = x86_seg_gs;
1353 continue;
1354 case 0x3e: /* DS */
1355 seg = x86_seg_ds;
1356 continue;
1361 switch ( seg )
1363 case x86_seg_cs:
1364 ar_field = GUEST_CS_AR_BYTES;
1365 base_field = GUEST_CS_BASE;
1366 limit_field = GUEST_CS_LIMIT;
1367 break;
1368 case x86_seg_ds:
1369 ar_field = GUEST_DS_AR_BYTES;
1370 base_field = GUEST_DS_BASE;
1371 limit_field = GUEST_DS_LIMIT;
1372 break;
1373 case x86_seg_es:
1374 ar_field = GUEST_ES_AR_BYTES;
1375 base_field = GUEST_ES_BASE;
1376 limit_field = GUEST_ES_LIMIT;
1377 break;
1378 case x86_seg_fs:
1379 ar_field = GUEST_FS_AR_BYTES;
1380 base_field = GUEST_FS_BASE;
1381 limit_field = GUEST_FS_LIMIT;
1382 break;
1383 case x86_seg_gs:
1384 ar_field = GUEST_GS_AR_BYTES;
1385 base_field = GUEST_GS_BASE;
1386 limit_field = GUEST_GS_LIMIT;
1387 break;
1388 case x86_seg_ss:
1389 ar_field = GUEST_SS_AR_BYTES;
1390 base_field = GUEST_SS_BASE;
1391 limit_field = GUEST_SS_LIMIT;
1392 break;
1393 default:
1394 BUG();
1395 return 0;
1398 if ( !long_mode || seg == x86_seg_fs || seg == x86_seg_gs )
1400 *base = __vmread(base_field);
1401 *limit = __vmread(limit_field);
1403 *ar_bytes = __vmread(ar_field);
1405 return !(*ar_bytes & 0x10000);
1408 static void vmx_io_instruction(unsigned long exit_qualification,
1409 unsigned long inst_len)
1411 struct cpu_user_regs *regs;
1412 struct hvm_io_op *pio_opp;
1413 unsigned int port, size;
1414 int dir, df, vm86;
1416 pio_opp = &current->arch.hvm_vcpu.io_op;
1417 pio_opp->instr = INSTR_PIO;
1418 pio_opp->flags = 0;
1420 regs = &pio_opp->io_context;
1422 /* Copy current guest state into io instruction state structure. */
1423 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1424 hvm_store_cpu_guest_regs(current, regs, NULL);
1426 vm86 = regs->eflags & X86_EFLAGS_VM ? 1 : 0;
1427 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1429 HVM_DBG_LOG(DBG_LEVEL_IO, "vm86 %d, eip=%x:%lx, "
1430 "exit_qualification = %lx",
1431 vm86, regs->cs, (unsigned long)regs->eip, exit_qualification);
1433 if ( test_bit(6, &exit_qualification) )
1434 port = (exit_qualification >> 16) & 0xFFFF;
1435 else
1436 port = regs->edx & 0xffff;
1438 TRACE_VMEXIT(1, port);
1440 size = (exit_qualification & 7) + 1;
1441 dir = test_bit(3, &exit_qualification); /* direction */
1443 if ( test_bit(4, &exit_qualification) ) { /* string instruction */
1444 unsigned long addr, count = 1, base;
1445 u32 ar_bytes, limit;
1446 int sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1447 int long_mode = 0;
1449 ar_bytes = __vmread(GUEST_CS_AR_BYTES);
1450 #ifdef __x86_64__
1451 if ( vmx_long_mode_enabled(current) && (ar_bytes & (1u<<13)) )
1452 long_mode = 1;
1453 #endif
1454 addr = __vmread(GUEST_LINEAR_ADDRESS);
1456 if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */
1457 pio_opp->flags |= REPZ;
1458 count = regs->ecx;
1459 if ( !long_mode && (vm86 || !(ar_bytes & (1u<<14))) )
1460 count &= 0xFFFF;
1463 /*
1464 * In protected mode, guest linear address is invalid if the
1465 * selector is null.
1466 */
1467 if ( !vmx_check_descriptor(long_mode, regs->eip, inst_len,
1468 dir==IOREQ_WRITE ? x86_seg_ds : x86_seg_es,
1469 &base, &limit, &ar_bytes) ) {
1470 if ( !long_mode ) {
1471 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1472 return;
1474 addr = dir == IOREQ_WRITE ? base + regs->esi : regs->edi;
1477 if ( !long_mode ) {
1478 unsigned long ea = addr - base;
1480 /* Segment must be readable for outs and writeable for ins. */
1481 if ( dir == IOREQ_WRITE ? (ar_bytes & 0xa) == 0x8
1482 : (ar_bytes & 0xa) != 0x2 ) {
1483 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1484 return;
1487 /* Offset must be within limits. */
1488 ASSERT(ea == (u32)ea);
1489 if ( (u32)(ea + size - 1) < (u32)ea ||
1490 (ar_bytes & 0xc) != 0x4 ? ea + size - 1 > limit
1491 : ea <= limit )
1493 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1494 return;
1497 /* Check the limit for repeated instructions, as above we checked
1498 only the first instance. Truncate the count if a limit violation
1499 would occur. Note that the checking is not necessary for page
1500 granular segments as transfers crossing page boundaries will be
1501 broken up anyway. */
1502 if ( !(ar_bytes & (1u<<15)) && count > 1 )
1504 if ( (ar_bytes & 0xc) != 0x4 )
1506 /* expand-up */
1507 if ( !df )
1509 if ( ea + count * size - 1 < ea ||
1510 ea + count * size - 1 > limit )
1511 count = (limit + 1UL - ea) / size;
1513 else
1515 if ( count - 1 > ea / size )
1516 count = ea / size + 1;
1519 else
1521 /* expand-down */
1522 if ( !df )
1524 if ( count - 1 > -(s32)ea / size )
1525 count = -(s32)ea / size + 1UL;
1527 else
1529 if ( ea < (count - 1) * size ||
1530 ea - (count - 1) * size <= limit )
1531 count = (ea - limit - 1) / size + 1;
1534 ASSERT(count);
1537 #ifdef __x86_64__
1538 else
1540 if ( !is_canonical_address(addr) ||
1541 !is_canonical_address(addr + size - 1) )
1543 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1544 return;
1546 if ( count > (1UL << 48) / size )
1547 count = (1UL << 48) / size;
1548 if ( !(regs->eflags & EF_DF) )
1550 if ( addr + count * size - 1 < addr ||
1551 !is_canonical_address(addr + count * size - 1) )
1552 count = (addr & ~((1UL << 48) - 1)) / size;
1554 else
1556 if ( (count - 1) * size > addr ||
1557 !is_canonical_address(addr + (count - 1) * size) )
1558 count = (addr & ~((1UL << 48) - 1)) / size + 1;
1560 ASSERT(count);
1562 #endif
1564 /*
1565 * Handle string pio instructions that cross pages or that
1566 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1567 */
1568 if ( (addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK) ) {
1569 unsigned long value = 0;
1571 pio_opp->flags |= OVERLAP;
1573 if ( dir == IOREQ_WRITE ) /* OUTS */
1575 if ( hvm_paging_enabled(current) )
1576 (void)hvm_copy_from_guest_virt(&value, addr, size);
1577 else
1578 (void)hvm_copy_from_guest_phys(&value, addr, size);
1579 } else
1580 pio_opp->addr = addr;
1582 if ( count == 1 )
1583 regs->eip += inst_len;
1585 send_pio_req(port, 1, size, value, dir, df, 0);
1586 } else {
1587 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1588 : addr - (count - 1) * size;
1590 if ( (addr & PAGE_MASK) != (last_addr & PAGE_MASK) )
1592 if ( sign > 0 )
1593 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1594 else
1595 count = (addr & ~PAGE_MASK) / size + 1;
1596 } else
1597 regs->eip += inst_len;
1599 send_pio_req(port, count, size, addr, dir, df, 1);
1601 } else {
1602 if ( port == 0xe9 && dir == IOREQ_WRITE && size == 1 )
1603 hvm_print_line(current, regs->eax); /* guest debug output */
1605 if ( dir == IOREQ_WRITE )
1606 TRACE_VMEXIT(2, regs->eax);
1608 regs->eip += inst_len;
1609 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1613 static void vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1615 /* NB. Skip transition instruction. */
1616 c->eip = __vmread(GUEST_RIP);
1617 c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
1619 c->esp = __vmread(GUEST_RSP);
1620 c->eflags = __vmread(GUEST_RFLAGS);
1622 c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
1623 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
1624 c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
1626 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
1627 c->idtr_base = __vmread(GUEST_IDTR_BASE);
1629 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
1630 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
1632 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
1633 c->cs_limit = __vmread(GUEST_CS_LIMIT);
1634 c->cs_base = __vmread(GUEST_CS_BASE);
1635 c->cs_arbytes.bytes = __vmread(GUEST_CS_AR_BYTES);
1637 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
1638 c->ds_limit = __vmread(GUEST_DS_LIMIT);
1639 c->ds_base = __vmread(GUEST_DS_BASE);
1640 c->ds_arbytes.bytes = __vmread(GUEST_DS_AR_BYTES);
1642 c->es_sel = __vmread(GUEST_ES_SELECTOR);
1643 c->es_limit = __vmread(GUEST_ES_LIMIT);
1644 c->es_base = __vmread(GUEST_ES_BASE);
1645 c->es_arbytes.bytes = __vmread(GUEST_ES_AR_BYTES);
1647 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
1648 c->ss_limit = __vmread(GUEST_SS_LIMIT);
1649 c->ss_base = __vmread(GUEST_SS_BASE);
1650 c->ss_arbytes.bytes = __vmread(GUEST_SS_AR_BYTES);
1652 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
1653 c->fs_limit = __vmread(GUEST_FS_LIMIT);
1654 c->fs_base = __vmread(GUEST_FS_BASE);
1655 c->fs_arbytes.bytes = __vmread(GUEST_FS_AR_BYTES);
1657 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
1658 c->gs_limit = __vmread(GUEST_GS_LIMIT);
1659 c->gs_base = __vmread(GUEST_GS_BASE);
1660 c->gs_arbytes.bytes = __vmread(GUEST_GS_AR_BYTES);
1662 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
1663 c->tr_limit = __vmread(GUEST_TR_LIMIT);
1664 c->tr_base = __vmread(GUEST_TR_BASE);
1665 c->tr_arbytes.bytes = __vmread(GUEST_TR_AR_BYTES);
1667 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
1668 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
1669 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
1670 c->ldtr_arbytes.bytes = __vmread(GUEST_LDTR_AR_BYTES);
1673 static int vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1675 unsigned long mfn, old_base_mfn;
1677 __vmwrite(GUEST_RIP, c->eip);
1678 __vmwrite(GUEST_RSP, c->esp);
1679 __vmwrite(GUEST_RFLAGS, c->eflags);
1681 v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
1682 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
1684 if ( !vmx_paging_enabled(v) )
1685 goto skip_cr3;
1687 if ( c->cr3 == v->arch.hvm_vmx.cpu_cr3 )
1689 /*
1690 * This is a simple TLB flush, implying the guest has
1691 * removed some translation or changed page attributes.
1692 * We simply invalidate the shadow.
1693 */
1694 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1695 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
1696 goto bad_cr3;
1698 else
1700 /*
1701 * If different, make a shadow. Check if the PDBR is valid
1702 * first.
1703 */
1704 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
1705 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1706 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1707 goto bad_cr3;
1708 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1709 v->arch.guest_table = pagetable_from_pfn(mfn);
1710 if (old_base_mfn)
1711 put_page(mfn_to_page(old_base_mfn));
1712 /*
1713 * arch.shadow_table should now hold the next CR3 for shadow
1714 */
1715 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
1718 skip_cr3:
1719 if ( !vmx_paging_enabled(v) )
1720 HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
1721 else
1722 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
1724 __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
1725 v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
1726 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
1728 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1729 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1731 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1732 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1734 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1735 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1736 __vmwrite(GUEST_CS_BASE, c->cs_base);
1737 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1739 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1740 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1741 __vmwrite(GUEST_DS_BASE, c->ds_base);
1742 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1744 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1745 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1746 __vmwrite(GUEST_ES_BASE, c->es_base);
1747 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1749 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1750 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1751 __vmwrite(GUEST_SS_BASE, c->ss_base);
1752 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1754 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1755 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1756 __vmwrite(GUEST_FS_BASE, c->fs_base);
1757 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1759 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1760 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1761 __vmwrite(GUEST_GS_BASE, c->gs_base);
1762 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1764 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1765 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1766 __vmwrite(GUEST_TR_BASE, c->tr_base);
1767 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1769 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1770 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1771 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1772 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1774 shadow_update_paging_modes(v);
1775 return 0;
1777 bad_cr3:
1778 gdprintk(XENLOG_ERR, "Invalid CR3 value=%x", c->cr3);
1779 return -EINVAL;
1782 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1784 static int vmx_assist(struct vcpu *v, int mode)
1786 struct vmx_assist_context c;
1787 u32 magic;
1788 u32 cp;
1790 /* make sure vmxassist exists (this is not an error) */
1791 if (hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
1792 sizeof(magic)))
1793 return 0;
1794 if (magic != VMXASSIST_MAGIC)
1795 return 0;
1797 switch (mode) {
1798 /*
1799 * Transfer control to vmxassist.
1800 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1801 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1802 * by vmxassist and will transfer control to it.
1803 */
1804 case VMX_ASSIST_INVOKE:
1805 /* save the old context */
1806 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1807 goto error;
1808 if (cp != 0) {
1809 vmx_world_save(v, &c);
1810 if (hvm_copy_to_guest_phys(cp, &c, sizeof(c)))
1811 goto error;
1814 /* restore the new context, this should activate vmxassist */
1815 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)))
1816 goto error;
1817 if (cp != 0) {
1818 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
1819 goto error;
1820 if ( vmx_world_restore(v, &c) != 0 )
1821 goto error;
1822 v->arch.hvm_vmx.vmxassist_enabled = 1;
1823 return 1;
1825 break;
1827 /*
1828 * Restore the VMXASSIST_OLD_CONTEXT that was saved by
1829 * VMX_ASSIST_INVOKE above.
1830 */
1831 case VMX_ASSIST_RESTORE:
1832 /* save the old context */
1833 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1834 goto error;
1835 if (cp != 0) {
1836 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
1837 goto error;
1838 if ( vmx_world_restore(v, &c) != 0 )
1839 goto error;
1840 v->arch.hvm_vmx.vmxassist_enabled = 0;
1841 return 1;
1843 break;
1846 error:
1847 gdprintk(XENLOG_ERR, "Failed to transfer to vmxassist\n");
1848 domain_crash(v->domain);
1849 return 0;
1852 static int vmx_set_cr0(unsigned long value)
1854 struct vcpu *v = current;
1855 unsigned long mfn;
1856 unsigned long eip;
1857 int paging_enabled;
1858 unsigned long vm_entry_value;
1859 unsigned long old_cr0;
1860 unsigned long old_base_mfn;
1862 /*
1863 * CR0: We don't want to lose PE and PG.
1864 */
1865 old_cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
1866 paging_enabled = (old_cr0 & X86_CR0_PE) && (old_cr0 & X86_CR0_PG);
1868 /* TS cleared? Then initialise FPU now. */
1869 if ( !(value & X86_CR0_TS) )
1871 setup_fpu(v);
1872 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1875 v->arch.hvm_vmx.cpu_cr0 = (value | X86_CR0_PE | X86_CR0_PG
1876 | X86_CR0_NE | X86_CR0_WP);
1877 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1879 v->arch.hvm_vmx.cpu_shadow_cr0 = value;
1880 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
1882 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
1884 if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
1886 /*
1887 * Trying to enable guest paging.
1888 * The guest CR3 must be pointing to the guest physical.
1889 */
1890 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
1891 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1893 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
1894 v->arch.hvm_vmx.cpu_cr3, mfn);
1895 domain_crash(v->domain);
1896 return 0;
1899 #if defined(__x86_64__)
1900 if ( vmx_lme_is_set(v) )
1902 if ( !(v->arch.hvm_vmx.cpu_shadow_cr4 & X86_CR4_PAE) )
1904 HVM_DBG_LOG(DBG_LEVEL_1, "Guest enabled paging "
1905 "with EFER.LME set but not CR4.PAE\n");
1906 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1908 else
1910 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode\n");
1911 v->arch.hvm_vmx.msr_state.msrs[VMX_INDEX_MSR_EFER]
1912 |= EFER_LMA;
1913 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1914 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1915 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1918 #endif
1920 /*
1921 * Now arch.guest_table points to machine physical.
1922 */
1923 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1924 v->arch.guest_table = pagetable_from_pfn(mfn);
1925 if (old_base_mfn)
1926 put_page(mfn_to_page(old_base_mfn));
1927 shadow_update_paging_modes(v);
1929 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1930 (unsigned long) (mfn << PAGE_SHIFT));
1932 /*
1933 * arch->shadow_table should hold the next CR3 for shadow
1934 */
1935 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1936 v->arch.hvm_vmx.cpu_cr3, mfn);
1939 if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
1940 if ( v->arch.hvm_vmx.cpu_cr3 ) {
1941 put_page(mfn_to_page(get_mfn_from_gpfn(
1942 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
1943 v->arch.guest_table = pagetable_null();
1946 /*
1947 * VMX does not implement real-mode virtualization. We emulate
1948 * real-mode by performing a world switch to VMXAssist whenever
1949 * a partition disables the CR0.PE bit.
1950 */
1951 if ( (value & X86_CR0_PE) == 0 )
1953 if ( value & X86_CR0_PG ) {
1954 /* inject GP here */
1955 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1956 return 0;
1957 } else {
1958 /*
1959 * Disable paging here.
1960 * Same as PE == 1 && PG == 0
1961 */
1962 if ( vmx_long_mode_enabled(v) )
1964 v->arch.hvm_vmx.msr_state.msrs[VMX_INDEX_MSR_EFER]
1965 &= ~EFER_LMA;
1966 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1967 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1968 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1972 if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
1974 eip = __vmread(GUEST_RIP);
1975 HVM_DBG_LOG(DBG_LEVEL_1,
1976 "Transfering control to vmxassist %%eip 0x%lx\n", eip);
1977 return 0; /* do not update eip! */
1980 else if ( v->arch.hvm_vmx.vmxassist_enabled )
1982 eip = __vmread(GUEST_RIP);
1983 HVM_DBG_LOG(DBG_LEVEL_1,
1984 "Enabling CR0.PE at %%eip 0x%lx\n", eip);
1985 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
1987 eip = __vmread(GUEST_RIP);
1988 HVM_DBG_LOG(DBG_LEVEL_1,
1989 "Restoring to %%eip 0x%lx\n", eip);
1990 return 0; /* do not update eip! */
1993 else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
1995 if ( vmx_long_mode_enabled(v) )
1997 v->arch.hvm_vmx.msr_state.msrs[VMX_INDEX_MSR_EFER] &= ~EFER_LMA;
1998 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1999 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
2000 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
2002 shadow_update_paging_modes(v);
2005 return 1;
2008 #define CASE_SET_REG(REG, reg) \
2009 case REG_ ## REG: regs->reg = value; break
2010 #define CASE_GET_REG(REG, reg) \
2011 case REG_ ## REG: value = regs->reg; break
2013 #define CASE_EXTEND_SET_REG \
2014 CASE_EXTEND_REG(S)
2015 #define CASE_EXTEND_GET_REG \
2016 CASE_EXTEND_REG(G)
2018 #ifdef __i386__
2019 #define CASE_EXTEND_REG(T)
2020 #else
2021 #define CASE_EXTEND_REG(T) \
2022 CASE_ ## T ## ET_REG(R8, r8); \
2023 CASE_ ## T ## ET_REG(R9, r9); \
2024 CASE_ ## T ## ET_REG(R10, r10); \
2025 CASE_ ## T ## ET_REG(R11, r11); \
2026 CASE_ ## T ## ET_REG(R12, r12); \
2027 CASE_ ## T ## ET_REG(R13, r13); \
2028 CASE_ ## T ## ET_REG(R14, r14); \
2029 CASE_ ## T ## ET_REG(R15, r15)
2030 #endif
2032 /*
2033 * Write to control registers
2034 */
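     /*
      * Returns 1 if the guest's RIP should be advanced past the MOV, 0 if
      * not (e.g. after a world switch to vmxassist, a fault injection, or
      * a domain crash).
      */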
2035 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
2037 unsigned long value, old_cr, old_base_mfn, mfn;
2038 struct vcpu *v = current;
2039 struct vlapic *vlapic = vcpu_vlapic(v);
2041 switch ( gp )
2043 CASE_GET_REG(EAX, eax);
2044 CASE_GET_REG(ECX, ecx);
2045 CASE_GET_REG(EDX, edx);
2046 CASE_GET_REG(EBX, ebx);
2047 CASE_GET_REG(EBP, ebp);
2048 CASE_GET_REG(ESI, esi);
2049 CASE_GET_REG(EDI, edi);
2050 CASE_EXTEND_GET_REG;
2051 case REG_ESP:
2052 value = __vmread(GUEST_RSP);
2053 break;
2054 default:
2055 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2056 goto exit_and_crash;
2059 TRACE_VMEXIT(1, TYPE_MOV_TO_CR);
2060 TRACE_VMEXIT(2, cr);
2061 TRACE_VMEXIT(3, value);
2063 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
2065 switch ( cr )
2067 case 0:
2068 return vmx_set_cr0(value);
2070 case 3:
2071 /*
2072 * If paging is not enabled yet, simply copy the value to CR3.
2073 */
2074 if (!vmx_paging_enabled(v)) {
2075 v->arch.hvm_vmx.cpu_cr3 = value;
2076 break;
2079 /*
2080 * Make a new shadow if one does not already exist.
2081 */
2082 if (value == v->arch.hvm_vmx.cpu_cr3) {
2083 /*
2084 * This is a simple TLB flush, implying the guest has
2085 * removed some translation or changed page attributes.
2086 * We simply invalidate the shadow.
2087 */
2088 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
2089 if (mfn != pagetable_get_pfn(v->arch.guest_table))
2090 goto bad_cr3;
2091 shadow_update_cr3(v);
2092 } else {
2093 /*
2094 * If different, make a shadow. Check if the PDBR is valid
2095 * first.
2096 */
2097 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
2098 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
2099 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
2100 goto bad_cr3;
2101 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2102 v->arch.guest_table = pagetable_from_pfn(mfn);
2103 if (old_base_mfn)
2104 put_page(mfn_to_page(old_base_mfn));
2105 /*
2106 * arch.shadow_table should now hold the next CR3 for shadow
2107 */
2108 v->arch.hvm_vmx.cpu_cr3 = value;
2109 update_cr3(v);
2110 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
2112 break;
2114 case 4: /* CR4 */
2115 old_cr = v->arch.hvm_vmx.cpu_shadow_cr4;
2117 if ( (value & X86_CR4_PAE) && !(old_cr & X86_CR4_PAE) )
2119 if ( vmx_pgbit_test(v) )
2121 /* The guest is a 32-bit PAE guest. */
2122 #if CONFIG_PAGING_LEVELS >= 3
2123 unsigned long mfn, old_base_mfn;
2124 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
2125 if ( !mfn_valid(mfn) ||
2126 !get_page(mfn_to_page(mfn), v->domain) )
2127 goto bad_cr3;
2129 /*
2130 * Now arch.guest_table refers to the machine-physical (not guest-physical) frame.
2131 */
2133 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2134 v->arch.guest_table = pagetable_from_pfn(mfn);
2135 if ( old_base_mfn )
2136 put_page(mfn_to_page(old_base_mfn));
2138 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
2139 (unsigned long) (mfn << PAGE_SHIFT));
2141 /*
2142 * arch.shadow_table should hold the next CR3 for shadow
2143 */
2144 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2145 "Update CR3 value = %lx, mfn = %lx",
2146 v->arch.hvm_vmx.cpu_cr3, mfn);
2147 #endif
2150 else if ( !(value & X86_CR4_PAE) )
2152 if ( unlikely(vmx_long_mode_enabled(v)) )
2154 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
2155 "EFER.LMA is set\n");
2156 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2160 __vmwrite(GUEST_CR4, value | VMX_CR4_HOST_MASK);
2161 v->arch.hvm_vmx.cpu_shadow_cr4 = value;
2162 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
2164 /*
2165 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
2166 * all TLB entries except global entries.
2167 */
2168 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
2169 shadow_update_paging_modes(v);
2170 break;
2172 case 8:
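     /* CR8 carries the task priority in its low 4 bits; store it in bits
      * 7:4 of the virtual local APIC's TPR. */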
2173 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
2174 break;
2176 default:
2177 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2178 domain_crash(v->domain);
2179 return 0;
2182 return 1;
2184 bad_cr3:
2185 gdprintk(XENLOG_ERR, "Invalid CR3\n");
2186 exit_and_crash:
2187 domain_crash(v->domain);
2188 return 0;
2191 /*
2192 * Read from control registers. Only CR3 and CR8 trap to here; CR0 and CR4 reads are satisfied from the read shadows.
2193 */
2194 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
2196 unsigned long value = 0;
2197 struct vcpu *v = current;
2198 struct vlapic *vlapic = vcpu_vlapic(v);
2200 switch ( cr )
2202 case 3:
2203 value = (unsigned long)v->arch.hvm_vmx.cpu_cr3;
2204 break;
2205 case 8:
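     /* Return bits 7:4 of the virtual local APIC's TPR as CR8. */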
2206 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
2207 value = (value & 0xF0) >> 4;
2208 break;
2209 default:
2210 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2211 domain_crash(v->domain);
2212 break;
2215 switch ( gp ) {
2216 CASE_SET_REG(EAX, eax);
2217 CASE_SET_REG(ECX, ecx);
2218 CASE_SET_REG(EDX, edx);
2219 CASE_SET_REG(EBX, ebx);
2220 CASE_SET_REG(EBP, ebp);
2221 CASE_SET_REG(ESI, esi);
2222 CASE_SET_REG(EDI, edi);
2223 CASE_EXTEND_SET_REG;
2224 case REG_ESP:
2225 __vmwrite(GUEST_RSP, value);
2226 regs->esp = value;
2227 break;
2228 default:
2229 printk("invalid gp: %d\n", gp);
2230 domain_crash(v->domain);
2231 break;
2234 TRACE_VMEXIT(1, TYPE_MOV_FROM_CR);
2235 TRACE_VMEXIT(2, cr);
2236 TRACE_VMEXIT(3, value);
2238 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
2241 static int vmx_cr_access(unsigned long exit_qualification,
2242 struct cpu_user_regs *regs)
2244 unsigned int gp, cr;
2245 unsigned long value;
2246 struct vcpu *v = current;
2248 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
2249 case TYPE_MOV_TO_CR:
2250 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2251 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2252 return mov_to_cr(gp, cr, regs);
2253 case TYPE_MOV_FROM_CR:
2254 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2255 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2256 mov_from_cr(cr, gp, regs);
2257 break;
2258 case TYPE_CLTS:
2259 TRACE_VMEXIT(1, TYPE_CLTS);
2261 /* We initialise the FPU now, to avoid needing another vmexit. */
2262 setup_fpu(v);
2263 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
2265 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS; /* clear TS */
2266 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
2268 v->arch.hvm_vmx.cpu_shadow_cr0 &= ~X86_CR0_TS; /* clear TS */
2269 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
2270 break;
2271 case TYPE_LMSW:
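     /* LMSW can only modify the low four bits of CR0 (PE, MP, EM, TS). */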
2272 value = v->arch.hvm_vmx.cpu_shadow_cr0;
2273 value = (value & ~0xF) |
2274 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
2275 TRACE_VMEXIT(1, TYPE_LMSW);
2276 TRACE_VMEXIT(2, value);
2277 return vmx_set_cr0(value);
2278 break;
2279 default:
2280 BUG();
2283 return 1;
2286 static inline int vmx_do_msr_read(struct cpu_user_regs *regs)
2288 u64 msr_content = 0;
2289 u32 ecx = regs->ecx, eax, edx;
2290 struct vcpu *v = current;
2292 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2293 ecx, (u32)regs->eax, (u32)regs->edx);
2295 switch (ecx) {
2296 case MSR_IA32_TIME_STAMP_COUNTER:
2297 msr_content = hvm_get_guest_time(v);
2298 break;
2299 case MSR_IA32_SYSENTER_CS:
2300 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
2301 break;
2302 case MSR_IA32_SYSENTER_ESP:
2303 msr_content = __vmread(GUEST_SYSENTER_ESP);
2304 break;
2305 case MSR_IA32_SYSENTER_EIP:
2306 msr_content = __vmread(GUEST_SYSENTER_EIP);
2307 break;
2308 case MSR_IA32_APICBASE:
2309 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2310 break;
2311 default:
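     /*
      * Not an MSR we emulate explicitly: try the long-mode MSR set, then
      * Xen's hypervisor MSRs, then a pass-through read of the host MSR;
      * if none of these accept it, inject #GP.
      */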
2312 if ( long_mode_do_msr_read(regs) )
2313 goto done;
2315 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2316 rdmsr_safe(ecx, eax, edx) == 0 )
2318 regs->eax = eax;
2319 regs->edx = edx;
2320 goto done;
2322 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2323 return 0;
2326 regs->eax = msr_content & 0xFFFFFFFF;
2327 regs->edx = msr_content >> 32;
2329 done:
2330 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2331 ecx, (unsigned long)regs->eax,
2332 (unsigned long)regs->edx);
2333 return 1;
2336 static inline int vmx_do_msr_write(struct cpu_user_regs *regs)
2338 u32 ecx = regs->ecx;
2339 u64 msr_content;
2340 struct vcpu *v = current;
2342 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2343 ecx, (u32)regs->eax, (u32)regs->edx);
2345 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2347 switch (ecx) {
2348 case MSR_IA32_TIME_STAMP_COUNTER:
2349 hvm_set_guest_time(v, msr_content);
2350 pt_reset(v);
2351 break;
2352 case MSR_IA32_SYSENTER_CS:
2353 __vmwrite(GUEST_SYSENTER_CS, msr_content);
2354 break;
2355 case MSR_IA32_SYSENTER_ESP:
2356 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
2357 break;
2358 case MSR_IA32_SYSENTER_EIP:
2359 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
2360 break;
2361 case MSR_IA32_APICBASE:
2362 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2363 break;
2364 default:
2365 if ( !long_mode_do_msr_write(regs) )
2366 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2367 break;
2370 return 1;
2373 static void vmx_do_hlt(void)
2375 unsigned long rflags;
2376 rflags = __vmread(GUEST_RFLAGS);
2377 hvm_hlt(rflags);
2380 static inline void vmx_do_extint(struct cpu_user_regs *regs)
2382 unsigned int vector;
2384 asmlinkage void do_IRQ(struct cpu_user_regs *);
2385 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
2386 fastcall void smp_event_check_interrupt(void);
2387 fastcall void smp_invalidate_interrupt(void);
2388 fastcall void smp_call_function_interrupt(void);
2389 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
2390 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
2391 #ifdef CONFIG_X86_MCE_P4THERMAL
2392 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
2393 #endif
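     /*
      * A physical interrupt arrived while the guest was executing.  Read
      * the vector from the exit interruption information and dispatch it
      * to Xen's own handler, just as if it had been taken natively.
      */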
2395 vector = __vmread(VM_EXIT_INTR_INFO);
2396 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
2398 vector &= INTR_INFO_VECTOR_MASK;
2399 TRACE_VMEXIT(1, vector);
2401 switch(vector) {
2402 case LOCAL_TIMER_VECTOR:
2403 smp_apic_timer_interrupt(regs);
2404 break;
2405 case EVENT_CHECK_VECTOR:
2406 smp_event_check_interrupt();
2407 break;
2408 case INVALIDATE_TLB_VECTOR:
2409 smp_invalidate_interrupt();
2410 break;
2411 case CALL_FUNCTION_VECTOR:
2412 smp_call_function_interrupt();
2413 break;
2414 case SPURIOUS_APIC_VECTOR:
2415 smp_spurious_interrupt(regs);
2416 break;
2417 case ERROR_APIC_VECTOR:
2418 smp_error_interrupt(regs);
2419 break;
2420 #ifdef CONFIG_X86_MCE_P4THERMAL
2421 case THERMAL_APIC_VECTOR:
2422 smp_thermal_interrupt(regs);
2423 break;
2424 #endif
2425 default:
2426 regs->entry_vector = vector;
2427 do_IRQ(regs);
2428 break;
2432 #if defined (__x86_64__)
2433 void store_cpu_user_regs(struct cpu_user_regs *regs)
2435 regs->ss = __vmread(GUEST_SS_SELECTOR);
2436 regs->rsp = __vmread(GUEST_RSP);
2437 regs->rflags = __vmread(GUEST_RFLAGS);
2438 regs->cs = __vmread(GUEST_CS_SELECTOR);
2439 regs->ds = __vmread(GUEST_DS_SELECTOR);
2440 regs->es = __vmread(GUEST_ES_SELECTOR);
2441 regs->rip = __vmread(GUEST_RIP);
2443 #elif defined (__i386__)
2444 void store_cpu_user_regs(struct cpu_user_regs *regs)
2446 regs->ss = __vmread(GUEST_SS_SELECTOR);
2447 regs->esp = __vmread(GUEST_RSP);
2448 regs->eflags = __vmread(GUEST_RFLAGS);
2449 regs->cs = __vmread(GUEST_CS_SELECTOR);
2450 regs->ds = __vmread(GUEST_DS_SELECTOR);
2451 regs->es = __vmread(GUEST_ES_SELECTOR);
2452 regs->eip = __vmread(GUEST_RIP);
2454 #endif
2456 #ifdef XEN_DEBUGGER
2457 void save_cpu_user_regs(struct cpu_user_regs *regs)
2459 regs->xss = __vmread(GUEST_SS_SELECTOR);
2460 regs->esp = __vmread(GUEST_RSP);
2461 regs->eflags = __vmread(GUEST_RFLAGS);
2462 regs->xcs = __vmread(GUEST_CS_SELECTOR);
2463 regs->eip = __vmread(GUEST_RIP);
2465 regs->xgs = __vmread(GUEST_GS_SELECTOR);
2466 regs->xfs = __vmread(GUEST_FS_SELECTOR);
2467 regs->xes = __vmread(GUEST_ES_SELECTOR);
2468 regs->xds = __vmread(GUEST_DS_SELECTOR);
2471 void restore_cpu_user_regs(struct cpu_user_regs *regs)
2473 __vmwrite(GUEST_SS_SELECTOR, regs->xss);
2474 __vmwrite(GUEST_RSP, regs->esp);
2475 __vmwrite(GUEST_RFLAGS, regs->eflags);
2476 __vmwrite(GUEST_CS_SELECTOR, regs->xcs);
2477 __vmwrite(GUEST_RIP, regs->eip);
2479 __vmwrite(GUEST_GS_SELECTOR, regs->xgs);
2480 __vmwrite(GUEST_FS_SELECTOR, regs->xfs);
2481 __vmwrite(GUEST_ES_SELECTOR, regs->xes);
2482 __vmwrite(GUEST_DS_SELECTOR, regs->xds);
2484 #endif
2486 static void vmx_reflect_exception(struct vcpu *v)
2488 int error_code, intr_info, vector;
2490 intr_info = __vmread(VM_EXIT_INTR_INFO);
2491 vector = intr_info & 0xff;
2492 if ( intr_info & INTR_INFO_DELIVER_CODE_MASK )
2493 error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2494 else
2495 error_code = VMX_DELIVER_NO_ERROR_CODE;
2497 #ifndef NDEBUG
2499 unsigned long rip;
2501 rip = __vmread(GUEST_RIP);
2502 HVM_DBG_LOG(DBG_LEVEL_1, "rip = %lx, error_code = %x",
2503 rip, error_code);
2505 #endif /* NDEBUG */
2507 /*
2508 * According to Intel Virtualization Technology Specification for
2509 * the IA-32 Intel Architecture (C97063-002 April 2005), section
2510 * 2.8.3, SW_EXCEPTION should be used for #BP and #OF, and
2511 * HW_EXCEPTION used for everything else. The main difference
2512 * appears to be that for SW_EXCEPTION, the EIP/RIP is incremented
2513 * by VM_ENTRY_INSTRUCTION_LEN bytes, whereas for HW_EXCEPTION,
2514 * it is not.
2515 */
2516 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_SW_EXCEPTION )
2518 int ilen = __get_instruction_length(); /* Safe: software exception */
2519 vmx_inject_sw_exception(v, vector, ilen);
2521 else
2523 vmx_inject_hw_exception(v, vector, error_code);
2527 static void vmx_failed_vmentry(unsigned int exit_reason)
2529 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2530 unsigned long exit_qualification;
2532 exit_qualification = __vmread(EXIT_QUALIFICATION);
2533 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2534 switch ( failed_vmentry_reason )
2536 case EXIT_REASON_INVALID_GUEST_STATE:
2537 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2538 break;
2539 case EXIT_REASON_MSR_LOADING:
2540 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2541 break;
2542 case EXIT_REASON_MACHINE_CHECK:
2543 printk("caused by machine check.\n");
2544 break;
2545 default:
2546 printk("reason not known yet!");
2547 break;
2550 printk("************* VMCS Area **************\n");
2551 vmcs_dump_vcpu();
2552 printk("**************************************\n");
2554 domain_crash(current->domain);
2557 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2559 unsigned int exit_reason;
2560 unsigned long exit_qualification, inst_len = 0;
2561 struct vcpu *v = current;
2563 TRACE_3D(TRC_VMX_VMEXIT + v->vcpu_id, 0, 0, 0);
2565 exit_reason = __vmread(VM_EXIT_REASON);
2567 perfc_incra(vmexits, exit_reason);
2568 TRACE_VMEXIT(0, exit_reason);
2570 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2571 local_irq_enable();
2573 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2574 return vmx_failed_vmentry(exit_reason);
2576 switch ( exit_reason )
2578 case EXIT_REASON_EXCEPTION_NMI:
2580 /*
2581 * We don't enable software-interrupt exiting (INT n), so an exit here
2582 * means either (1) a hardware exception (e.g. #PF) raised by the guest,
2583 * or (2) an NMI.
2584 */
2585 unsigned int intr_info, vector;
2587 intr_info = __vmread(VM_EXIT_INTR_INFO);
2588 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2590 vector = intr_info & INTR_INFO_VECTOR_MASK;
2592 TRACE_VMEXIT(1, vector);
2593 perfc_incra(cause_vector, vector);
2595 switch ( vector )
2597 #ifdef XEN_DEBUGGER
2598 case TRAP_debug:
2600 save_cpu_user_regs(regs);
2601 pdb_handle_exception(1, regs, 1);
2602 restore_cpu_user_regs(regs);
2603 break;
2605 case TRAP_int3:
2607 save_cpu_user_regs(regs);
2608 pdb_handle_exception(3, regs, 1);
2609 restore_cpu_user_regs(regs);
2610 break;
2612 #else
2613 case TRAP_debug:
2615 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
2617 store_cpu_user_regs(regs);
2618 domain_pause_for_debugger();
2619 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
2620 PENDING_DEBUG_EXC_BS);
2622 else
2624 vmx_reflect_exception(v);
2625 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
2626 PENDING_DEBUG_EXC_BS);
2629 break;
2631 case TRAP_int3:
2633 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
2634 domain_pause_for_debugger();
2635 else
2636 vmx_reflect_exception(v);
2637 break;
2639 #endif
2640 case TRAP_no_device:
2642 vmx_do_no_device_fault();
2643 break;
2645 case TRAP_page_fault:
2647 exit_qualification = __vmread(EXIT_QUALIFICATION);
2648 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2650 TRACE_VMEXIT(3, regs->error_code);
2651 TRACE_VMEXIT(4, exit_qualification);
2653 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2654 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2655 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2656 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2657 (unsigned long)regs->esi, (unsigned long)regs->edi);
2659 if ( !vmx_do_page_fault(exit_qualification, regs) )
2661 /* Inject #PG using Interruption-Information Fields. */
2662 vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
2663 v->arch.hvm_vmx.cpu_cr2 = exit_qualification;
2664 TRACE_3D(TRC_VMX_INTR, v->domain->domain_id,
2665 TRAP_page_fault, exit_qualification);
2667 break;
2669 case TRAP_nmi:
2670 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI )
2671 do_nmi(regs); /* Real NMI, vector 2: normal processing. */
2672 else
2673 vmx_reflect_exception(v);
2674 break;
2675 default:
2676 vmx_reflect_exception(v);
2677 break;
2679 break;
2681 case EXIT_REASON_EXTERNAL_INTERRUPT:
2682 vmx_do_extint(regs);
2683 break;
2684 case EXIT_REASON_TRIPLE_FAULT:
2685 hvm_triple_fault();
2686 break;
2687 case EXIT_REASON_PENDING_INTERRUPT:
2688 /* Disable the interrupt window. */
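     /* The guest can now accept the pending virtual interrupt; it is
      * injected before the next VM entry. */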
2689 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2690 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2691 v->arch.hvm_vcpu.u.vmx.exec_control);
2692 break;
2693 case EXIT_REASON_TASK_SWITCH:
2694 goto exit_and_crash;
2695 case EXIT_REASON_CPUID:
2696 inst_len = __get_instruction_length(); /* Safe: CPUID */
2697 __update_guest_eip(inst_len);
2698 vmx_do_cpuid(regs);
2699 break;
2700 case EXIT_REASON_HLT:
2701 inst_len = __get_instruction_length(); /* Safe: HLT */
2702 __update_guest_eip(inst_len);
2703 vmx_do_hlt();
2704 break;
2705 case EXIT_REASON_INVLPG:
2707 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2708 __update_guest_eip(inst_len);
2709 exit_qualification = __vmread(EXIT_QUALIFICATION);
2710 vmx_do_invlpg(exit_qualification);
2711 TRACE_VMEXIT(4, exit_qualification);
2712 break;
2714 case EXIT_REASON_VMCALL:
2716 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2717 __update_guest_eip(inst_len);
2718 hvm_do_hypercall(regs);
2719 break;
2721 case EXIT_REASON_CR_ACCESS:
2723 exit_qualification = __vmread(EXIT_QUALIFICATION);
2724 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2725 if ( vmx_cr_access(exit_qualification, regs) )
2726 __update_guest_eip(inst_len);
2727 TRACE_VMEXIT(4, exit_qualification);
2728 break;
2730 case EXIT_REASON_DR_ACCESS:
2731 exit_qualification = __vmread(EXIT_QUALIFICATION);
2732 vmx_dr_access(exit_qualification, regs);
2733 break;
2734 case EXIT_REASON_IO_INSTRUCTION:
2735 exit_qualification = __vmread(EXIT_QUALIFICATION);
2736 inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
2737 vmx_io_instruction(exit_qualification, inst_len);
2738 TRACE_VMEXIT(4, exit_qualification);
2739 break;
2740 case EXIT_REASON_MSR_READ:
2741 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2742 if ( vmx_do_msr_read(regs) )
2743 __update_guest_eip(inst_len);
2744 TRACE_VMEXIT(1, regs->ecx);
2745 TRACE_VMEXIT(2, regs->eax);
2746 TRACE_VMEXIT(3, regs->edx);
2747 break;
2748 case EXIT_REASON_MSR_WRITE:
2749 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2750 if ( vmx_do_msr_write(regs) )
2751 __update_guest_eip(inst_len);
2752 TRACE_VMEXIT(1, regs->ecx);
2753 TRACE_VMEXIT(2, regs->eax);
2754 TRACE_VMEXIT(3, regs->edx);
2755 break;
2756 case EXIT_REASON_MWAIT_INSTRUCTION:
2757 case EXIT_REASON_MONITOR_INSTRUCTION:
2758 case EXIT_REASON_PAUSE_INSTRUCTION:
2759 goto exit_and_crash;
2760 case EXIT_REASON_VMCLEAR:
2761 case EXIT_REASON_VMLAUNCH:
2762 case EXIT_REASON_VMPTRLD:
2763 case EXIT_REASON_VMPTRST:
2764 case EXIT_REASON_VMREAD:
2765 case EXIT_REASON_VMRESUME:
2766 case EXIT_REASON_VMWRITE:
2767 case EXIT_REASON_VMXOFF:
2768 case EXIT_REASON_VMXON:
2769 /* Inject an invalid-opcode exception (#UD) when the guest tries to
2770 execute any of the VMX instructions: nested VMX is not supported. */
2771 vmx_inject_hw_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
2772 break;
2774 case EXIT_REASON_TPR_BELOW_THRESHOLD:
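     /* Nothing to do: pending interrupts are re-evaluated against the new
      * TPR before the guest is resumed. */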
2775 break;
2777 default:
2778 exit_and_crash:
2779 gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
2780 domain_crash(v->domain);
2781 break;
2785 asmlinkage void vmx_trace_vmentry(void)
2787 struct vcpu *v = current;
2788 TRACE_5D(TRC_VMX_VMENTRY + current->vcpu_id,
2789 v->arch.hvm_vcpu.hvm_trace_values[0],
2790 v->arch.hvm_vcpu.hvm_trace_values[1],
2791 v->arch.hvm_vcpu.hvm_trace_values[2],
2792 v->arch.hvm_vcpu.hvm_trace_values[3],
2793 v->arch.hvm_vcpu.hvm_trace_values[4]);
2795 TRACE_VMEXIT(0, 0);
2796 TRACE_VMEXIT(1, 0);
2797 TRACE_VMEXIT(2, 0);
2798 TRACE_VMEXIT(3, 0);
2799 TRACE_VMEXIT(4, 0);
2802 /*
2803 * Local variables:
2804 * mode: C
2805 * c-set-style: "BSD"
2806 * c-basic-offset: 4
2807 * tab-width: 4
2808 * indent-tabs-mode: nil
2809 * End:
2810 */