debuggers.hg

view xen/arch/x86/hvm/svm/svm.c @ 19942:c1541cc7e985

vmx: Add support for Pause-Loop Exiting

New NHM processors will support Pause-Loop Exiting by adding 2
VM-execution control fields:
PLE_Gap - upper bound on the amount of time between two successive
executions of PAUSE in a loop.
PLE_Window - upper bound on the amount of time a guest is allowed to
execute in a PAUSE loop

If the time between this execution of PAUSE and the previous one exceeds
the PLE_Gap, the processor considers this PAUSE to belong to a new loop.
Otherwise, the processor determines the total execution time of this
loop (since the first PAUSE in this loop) and triggers a VM exit if the
total time exceeds the PLE_Window.
* Refer to SDM volume 3B, sections 21.6.13 & 22.1.3.
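
The per-PAUSE check described above can be pictured with the following
sketch (illustrative only; the ple_gap/ple_window parameters and the TSC
bookkeeping are assumptions for exposition, the real tracking is done
entirely in hardware):

#include <stdint.h>

/* Sketch of the decision the processor makes on each PAUSE; not real
 * Xen/VMX code. ple_gap and ple_window stand in for the two
 * VM-execution control fields, both measured in TSC cycles. */
struct ple_state {
    uint64_t loop_start_tsc;  /* TSC at the first PAUSE of the current loop */
    uint64_t last_pause_tsc;  /* TSC at the previous PAUSE                  */
};

/* Returns 1 if this PAUSE should trigger a VM exit. */
static int ple_should_exit(struct ple_state *s, uint64_t now,
                           uint64_t ple_gap, uint64_t ple_window)
{
    if ( now - s->last_pause_tsc > ple_gap )
        s->loop_start_tsc = now;                   /* gap exceeded: new loop */
    s->last_pause_tsc = now;
    return (now - s->loop_start_tsc) > ple_window; /* loop has spun too long */
}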

Pause-Loop Exiting can be used to detect Lock-Holder Preemption, where
one VP is scheduled out while holding a spinlock and other VPs contending
for the same lock are scheduled in, wasting CPU time.
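
Xen's response to such an exit is simply to yield the spinning vCPU so
the preempted lock holder can run; the SVM Pause Filter handler later in
this file (case VMEXIT_PAUSE) does exactly that. A minimal sketch of the
handler body, reusing the same calls (the function name here is
hypothetical):

/* Sketch only: when a pause-loop exit is delivered, yield the physical
 * CPU so the preempted lock holder can run. perfc_incr() and
 * do_sched_op_compat() are the calls used by the VMEXIT_PAUSE case
 * further down in this file. */
static void vmexit_do_pause_loop(void)
{
    perfc_incr(pauseloop_exits);
    do_sched_op_compat(SCHEDOP_yield, 0);
}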

Our tests indicate that most spinlocks are held for less than 2^12
cycles. Performance tests show that with 2x LP over-commitment we can
get a +2% performance improvement for a kernel build (with even more
gain at higher LP counts).

Signed-off-by: Zhai Edwin <edwin.zhai@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Jul 07 14:06:35 2009 +0100 (2009-07-07)
parents 479f1fa084d6
children c0cb307d927f
line source
1 /*
2 * svm.c: handling SVM architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 * Copyright (c) 2005-2007, Advanced Micro Devices, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
17 * Place - Suite 330, Boston, MA 02111-1307 USA.
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/hypercall.h>
28 #include <xen/domain_page.h>
29 #include <asm/current.h>
30 #include <asm/io.h>
31 #include <asm/paging.h>
32 #include <asm/p2m.h>
33 #include <asm/regs.h>
34 #include <asm/cpufeature.h>
35 #include <asm/processor.h>
36 #include <asm/types.h>
37 #include <asm/debugreg.h>
38 #include <asm/msr.h>
39 #include <asm/spinlock.h>
40 #include <asm/hvm/emulate.h>
41 #include <asm/hvm/hvm.h>
42 #include <asm/hvm/support.h>
43 #include <asm/hvm/io.h>
44 #include <asm/hvm/svm/asid.h>
45 #include <asm/hvm/svm/svm.h>
46 #include <asm/hvm/svm/vmcb.h>
47 #include <asm/hvm/svm/emulate.h>
48 #include <asm/hvm/svm/intr.h>
49 #include <asm/x86_emulate.h>
50 #include <public/sched.h>
51 #include <asm/hvm/vpt.h>
52 #include <asm/hvm/trace.h>
53 #include <asm/hap.h>
55 u32 svm_feature_flags;
57 #define set_segment_register(name, value) \
58 asm volatile ( "movw %%ax ,%%" STR(name) "" : : "a" (value) )
60 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
62 asmlinkage void do_IRQ(struct cpu_user_regs *);
64 static void svm_update_guest_cr(struct vcpu *v, unsigned int cr);
65 static void svm_update_guest_efer(struct vcpu *v);
66 static void svm_inject_exception(
67 unsigned int trapnr, int errcode, unsigned long cr2);
68 static void svm_cpuid_intercept(
69 unsigned int *eax, unsigned int *ebx,
70 unsigned int *ecx, unsigned int *edx);
71 static void svm_wbinvd_intercept(void);
72 static void svm_fpu_dirty_intercept(void);
73 static int svm_msr_read_intercept(struct cpu_user_regs *regs);
74 static int svm_msr_write_intercept(struct cpu_user_regs *regs);
75 static void svm_invlpg_intercept(unsigned long vaddr);
77 /* va of hardware host save area */
78 static void *hsa[NR_CPUS] __read_mostly;
80 /* vmcb used for extended host state */
81 static void *root_vmcb[NR_CPUS] __read_mostly;
83 static void inline __update_guest_eip(
84 struct cpu_user_regs *regs, unsigned int inst_len)
85 {
86 struct vcpu *curr = current;
88 if ( unlikely(inst_len == 0) )
89 return;
91 if ( unlikely(inst_len > 15) )
92 {
93 gdprintk(XENLOG_ERR, "Bad instruction length %u\n", inst_len);
94 domain_crash(curr->domain);
95 return;
96 }
98 ASSERT(regs == guest_cpu_user_regs());
100 regs->eip += inst_len;
101 regs->eflags &= ~X86_EFLAGS_RF;
103 curr->arch.hvm_svm.vmcb->interrupt_shadow = 0;
105 if ( regs->eflags & X86_EFLAGS_TF )
106 svm_inject_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE, 0);
107 }
109 static void svm_cpu_down(void)
110 {
111 write_efer(read_efer() & ~EFER_SVME);
112 }
114 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
115 {
116 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
117 u32 ecx = regs->ecx;
119 HVM_DBG_LOG(DBG_LEVEL_0, "msr %x msr_content %"PRIx64,
120 ecx, msr_content);
122 switch ( ecx )
123 {
124 case MSR_EFER:
125 if ( hvm_set_efer(msr_content) )
126 return HNDL_exception_raised;
127 break;
129 case MSR_IA32_MC4_MISC: /* Threshold register */
130 case MSR_F10_MC4_MISC1 ... MSR_F10_MC4_MISC3:
131 /*
132 * MCA/MCE: Threshold register is reported to be locked, so we ignore
133 * all write accesses. This behaviour matches real HW, so guests should
134 * have no problem with this.
135 */
136 break;
138 default:
139 return HNDL_unhandled;
140 }
142 return HNDL_done;
143 }
145 static void svm_save_dr(struct vcpu *v)
146 {
147 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
149 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
150 return;
152 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
153 v->arch.hvm_vcpu.flag_dr_dirty = 0;
154 v->arch.hvm_svm.vmcb->dr_intercepts = ~0u;
156 v->arch.guest_context.debugreg[0] = read_debugreg(0);
157 v->arch.guest_context.debugreg[1] = read_debugreg(1);
158 v->arch.guest_context.debugreg[2] = read_debugreg(2);
159 v->arch.guest_context.debugreg[3] = read_debugreg(3);
160 v->arch.guest_context.debugreg[6] = vmcb->dr6;
161 v->arch.guest_context.debugreg[7] = vmcb->dr7;
162 }
164 static void __restore_debug_registers(struct vcpu *v)
165 {
166 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
168 if ( v->arch.hvm_vcpu.flag_dr_dirty )
169 return;
171 v->arch.hvm_vcpu.flag_dr_dirty = 1;
172 vmcb->dr_intercepts = 0;
174 write_debugreg(0, v->arch.guest_context.debugreg[0]);
175 write_debugreg(1, v->arch.guest_context.debugreg[1]);
176 write_debugreg(2, v->arch.guest_context.debugreg[2]);
177 write_debugreg(3, v->arch.guest_context.debugreg[3]);
178 vmcb->dr6 = v->arch.guest_context.debugreg[6];
179 vmcb->dr7 = v->arch.guest_context.debugreg[7];
180 }
182 /*
183 * DR7 is saved and restored on every vmexit. Other debug registers only
184 * need to be restored if their value is going to affect execution -- i.e.,
185 * if one of the breakpoints is enabled. So mask out all bits that don't
186 * enable some breakpoint functionality.
187 */
188 static void svm_restore_dr(struct vcpu *v)
189 {
190 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
191 __restore_debug_registers(v);
192 }
194 static int svm_vmcb_save(struct vcpu *v, struct hvm_hw_cpu *c)
195 {
196 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
198 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
199 c->cr2 = v->arch.hvm_vcpu.guest_cr[2];
200 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
201 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
203 c->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs;
204 c->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp;
205 c->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip;
207 c->pending_event = 0;
208 c->error_code = 0;
209 if ( vmcb->eventinj.fields.v &&
210 hvm_event_needs_reinjection(vmcb->eventinj.fields.type,
211 vmcb->eventinj.fields.vector) )
212 {
213 c->pending_event = (uint32_t)vmcb->eventinj.bytes;
214 c->error_code = vmcb->eventinj.fields.errorcode;
215 }
217 return 1;
218 }
220 static int svm_vmcb_restore(struct vcpu *v, struct hvm_hw_cpu *c)
221 {
222 unsigned long mfn = 0;
223 p2m_type_t p2mt;
224 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
226 if ( c->pending_valid &&
227 ((c->pending_type == 1) || (c->pending_type > 6) ||
228 (c->pending_reserved != 0)) )
229 {
230 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32".\n",
231 c->pending_event);
232 return -EINVAL;
233 }
235 if ( !paging_mode_hap(v->domain) )
236 {
237 if ( c->cr0 & X86_CR0_PG )
238 {
239 mfn = mfn_x(gfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT, &p2mt));
240 if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
241 {
242 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%"PRIx64"\n",
243 c->cr3);
244 return -EINVAL;
245 }
246 }
248 if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
249 put_page(pagetable_get_page(v->arch.guest_table));
251 v->arch.guest_table = pagetable_from_pfn(mfn);
252 }
254 v->arch.hvm_vcpu.guest_cr[0] = c->cr0 | X86_CR0_ET;
255 v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
256 v->arch.hvm_vcpu.guest_cr[3] = c->cr3;
257 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
258 svm_update_guest_cr(v, 0);
259 svm_update_guest_cr(v, 2);
260 svm_update_guest_cr(v, 4);
262 v->arch.hvm_svm.guest_sysenter_cs = c->sysenter_cs;
263 v->arch.hvm_svm.guest_sysenter_esp = c->sysenter_esp;
264 v->arch.hvm_svm.guest_sysenter_eip = c->sysenter_eip;
266 if ( paging_mode_hap(v->domain) )
267 {
268 vmcb->np_enable = 1;
269 vmcb->g_pat = 0x0007040600070406ULL; /* guest PAT */
270 vmcb->h_cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
271 }
273 if ( c->pending_valid )
274 {
275 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
276 c->pending_event, c->error_code);
278 if ( hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
279 {
280 vmcb->eventinj.bytes = c->pending_event;
281 vmcb->eventinj.fields.errorcode = c->error_code;
282 }
283 }
285 paging_update_paging_modes(v);
287 return 0;
288 }
291 static void svm_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
292 {
293 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
295 data->shadow_gs = vmcb->kerngsbase;
296 data->msr_lstar = vmcb->lstar;
297 data->msr_star = vmcb->star;
298 data->msr_cstar = vmcb->cstar;
299 data->msr_syscall_mask = vmcb->sfmask;
300 data->msr_efer = v->arch.hvm_vcpu.guest_efer;
301 data->msr_flags = -1ULL;
303 data->tsc = hvm_get_guest_tsc(v);
304 }
307 static void svm_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
308 {
309 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
311 vmcb->kerngsbase = data->shadow_gs;
312 vmcb->lstar = data->msr_lstar;
313 vmcb->star = data->msr_star;
314 vmcb->cstar = data->msr_cstar;
315 vmcb->sfmask = data->msr_syscall_mask;
316 v->arch.hvm_vcpu.guest_efer = data->msr_efer;
317 svm_update_guest_efer(v);
319 hvm_set_guest_tsc(v, data->tsc);
320 }
322 static void svm_save_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
323 {
324 svm_save_cpu_state(v, ctxt);
325 svm_vmcb_save(v, ctxt);
326 }
328 static int svm_load_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
329 {
330 svm_load_cpu_state(v, ctxt);
331 if (svm_vmcb_restore(v, ctxt)) {
332 printk("svm_vmcb restore failed!\n");
333 domain_crash(v->domain);
334 return -EINVAL;
335 }
337 return 0;
338 }
340 static void svm_fpu_enter(struct vcpu *v)
341 {
342 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
344 setup_fpu(v);
345 vmcb->exception_intercepts &= ~(1U << TRAP_no_device);
346 }
348 static void svm_fpu_leave(struct vcpu *v)
349 {
350 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
352 ASSERT(!v->fpu_dirtied);
353 ASSERT(read_cr0() & X86_CR0_TS);
355 /*
356 * If the guest does not have TS enabled then we must cause and handle an
357 * exception on first use of the FPU. If the guest *does* have TS enabled
358 * then this is not necessary: no FPU activity can occur until the guest
359 * clears CR0.TS, and we will initialise the FPU when that happens.
360 */
361 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
362 {
363 v->arch.hvm_svm.vmcb->exception_intercepts |= 1U << TRAP_no_device;
364 vmcb->cr0 |= X86_CR0_TS;
365 }
366 }
368 static unsigned int svm_get_interrupt_shadow(struct vcpu *v)
369 {
370 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
371 unsigned int intr_shadow = 0;
373 if ( vmcb->interrupt_shadow )
374 intr_shadow |= HVM_INTR_SHADOW_MOV_SS | HVM_INTR_SHADOW_STI;
376 if ( vmcb->general1_intercepts & GENERAL1_INTERCEPT_IRET )
377 intr_shadow |= HVM_INTR_SHADOW_NMI;
379 return intr_shadow;
380 }
382 static void svm_set_interrupt_shadow(struct vcpu *v, unsigned int intr_shadow)
383 {
384 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
386 vmcb->interrupt_shadow =
387 !!(intr_shadow & (HVM_INTR_SHADOW_MOV_SS|HVM_INTR_SHADOW_STI));
389 vmcb->general1_intercepts &= ~GENERAL1_INTERCEPT_IRET;
390 if ( intr_shadow & HVM_INTR_SHADOW_NMI )
391 vmcb->general1_intercepts |= GENERAL1_INTERCEPT_IRET;
392 }
394 static int svm_guest_x86_mode(struct vcpu *v)
395 {
396 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
398 if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
399 return 0;
400 if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
401 return 1;
402 if ( hvm_long_mode_enabled(v) && likely(vmcb->cs.attr.fields.l) )
403 return 8;
404 return (likely(vmcb->cs.attr.fields.db) ? 4 : 2);
405 }
407 static void svm_update_host_cr3(struct vcpu *v)
408 {
409 /* SVM doesn't have a HOST_CR3 equivalent to update. */
410 }
412 static void svm_update_guest_cr(struct vcpu *v, unsigned int cr)
413 {
414 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
416 switch ( cr )
417 {
418 case 0: {
419 unsigned long hw_cr0_mask = 0;
421 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
422 {
423 if ( v != current )
424 hw_cr0_mask |= X86_CR0_TS;
425 else if ( vmcb->cr0 & X86_CR0_TS )
426 svm_fpu_enter(v);
427 }
429 vmcb->cr0 = v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask;
430 if ( !paging_mode_hap(v->domain) )
431 vmcb->cr0 |= X86_CR0_PG | X86_CR0_WP;
432 break;
433 }
434 case 2:
435 vmcb->cr2 = v->arch.hvm_vcpu.guest_cr[2];
436 break;
437 case 3:
438 vmcb->cr3 = v->arch.hvm_vcpu.hw_cr[3];
439 svm_asid_inv_asid(v);
440 break;
441 case 4:
442 vmcb->cr4 = HVM_CR4_HOST_MASK;
443 if ( paging_mode_hap(v->domain) )
444 vmcb->cr4 &= ~X86_CR4_PAE;
445 vmcb->cr4 |= v->arch.hvm_vcpu.guest_cr[4];
446 break;
447 default:
448 BUG();
449 }
450 }
452 static void svm_update_guest_efer(struct vcpu *v)
453 {
454 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
455 bool_t lma = !!(v->arch.hvm_vcpu.guest_efer & EFER_LMA);
457 vmcb->efer = (v->arch.hvm_vcpu.guest_efer | EFER_SVME) & ~EFER_LME;
458 if ( lma )
459 vmcb->efer |= EFER_LME;
461 /*
462 * In legacy mode (EFER.LMA=0) we natively support SYSENTER/SYSEXIT with
463 * no need for MSR intercepts. When EFER.LMA=1 we must trap and emulate.
464 */
465 svm_intercept_msr(v, MSR_IA32_SYSENTER_CS, lma);
466 svm_intercept_msr(v, MSR_IA32_SYSENTER_ESP, lma);
467 svm_intercept_msr(v, MSR_IA32_SYSENTER_EIP, lma);
468 }
470 static void svm_flush_guest_tlbs(void)
471 {
472 /* Roll over the CPU's ASID generation, so it gets a clean TLB when we
473 * next VMRUN. (If ASIDs are disabled, the whole TLB is flushed on
474 * VMRUN anyway). */
475 svm_asid_inc_generation();
476 }
478 static void svm_sync_vmcb(struct vcpu *v)
479 {
480 struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;
482 if ( arch_svm->vmcb_in_sync )
483 return;
485 arch_svm->vmcb_in_sync = 1;
487 svm_vmsave(arch_svm->vmcb);
488 }
490 static void svm_get_segment_register(struct vcpu *v, enum x86_segment seg,
491 struct segment_register *reg)
492 {
493 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
495 ASSERT((v == current) || !vcpu_runnable(v));
497 switch ( seg )
498 {
499 case x86_seg_cs:
500 memcpy(reg, &vmcb->cs, sizeof(*reg));
501 reg->attr.fields.g = reg->limit > 0xFFFFF;
502 break;
503 case x86_seg_ds:
504 memcpy(reg, &vmcb->ds, sizeof(*reg));
505 if ( reg->attr.fields.type != 0 )
506 reg->attr.fields.type |= 0x1;
507 break;
508 case x86_seg_es:
509 memcpy(reg, &vmcb->es, sizeof(*reg));
510 if ( reg->attr.fields.type != 0 )
511 reg->attr.fields.type |= 0x1;
512 break;
513 case x86_seg_fs:
514 svm_sync_vmcb(v);
515 memcpy(reg, &vmcb->fs, sizeof(*reg));
516 if ( reg->attr.fields.type != 0 )
517 reg->attr.fields.type |= 0x1;
518 break;
519 case x86_seg_gs:
520 svm_sync_vmcb(v);
521 memcpy(reg, &vmcb->gs, sizeof(*reg));
522 if ( reg->attr.fields.type != 0 )
523 reg->attr.fields.type |= 0x1;
524 break;
525 case x86_seg_ss:
526 memcpy(reg, &vmcb->ss, sizeof(*reg));
527 reg->attr.fields.dpl = vmcb->cpl;
528 if ( reg->attr.fields.type == 0 )
529 reg->attr.fields.db = 0;
530 break;
531 case x86_seg_tr:
532 svm_sync_vmcb(v);
533 memcpy(reg, &vmcb->tr, sizeof(*reg));
534 reg->attr.fields.type |= 0x2;
535 break;
536 case x86_seg_gdtr:
537 memcpy(reg, &vmcb->gdtr, sizeof(*reg));
538 break;
539 case x86_seg_idtr:
540 memcpy(reg, &vmcb->idtr, sizeof(*reg));
541 break;
542 case x86_seg_ldtr:
543 svm_sync_vmcb(v);
544 memcpy(reg, &vmcb->ldtr, sizeof(*reg));
545 break;
546 default:
547 BUG();
548 }
549 }
551 static void svm_set_segment_register(struct vcpu *v, enum x86_segment seg,
552 struct segment_register *reg)
553 {
554 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
555 int sync = 0;
557 ASSERT((v == current) || !vcpu_runnable(v));
559 switch ( seg )
560 {
561 case x86_seg_fs:
562 case x86_seg_gs:
563 case x86_seg_tr:
564 case x86_seg_ldtr:
565 sync = (v == current);
566 break;
567 default:
568 break;
569 }
571 if ( sync )
572 svm_sync_vmcb(v);
574 switch ( seg )
575 {
576 case x86_seg_cs:
577 memcpy(&vmcb->cs, reg, sizeof(*reg));
578 break;
579 case x86_seg_ds:
580 memcpy(&vmcb->ds, reg, sizeof(*reg));
581 break;
582 case x86_seg_es:
583 memcpy(&vmcb->es, reg, sizeof(*reg));
584 break;
585 case x86_seg_fs:
586 memcpy(&vmcb->fs, reg, sizeof(*reg));
587 break;
588 case x86_seg_gs:
589 memcpy(&vmcb->gs, reg, sizeof(*reg));
590 break;
591 case x86_seg_ss:
592 memcpy(&vmcb->ss, reg, sizeof(*reg));
593 vmcb->cpl = vmcb->ss.attr.fields.dpl;
594 break;
595 case x86_seg_tr:
596 memcpy(&vmcb->tr, reg, sizeof(*reg));
597 break;
598 case x86_seg_gdtr:
599 vmcb->gdtr.base = reg->base;
600 vmcb->gdtr.limit = (uint16_t)reg->limit;
601 break;
602 case x86_seg_idtr:
603 vmcb->idtr.base = reg->base;
604 vmcb->idtr.limit = (uint16_t)reg->limit;
605 break;
606 case x86_seg_ldtr:
607 memcpy(&vmcb->ldtr, reg, sizeof(*reg));
608 break;
609 default:
610 BUG();
611 }
613 if ( sync )
614 svm_vmload(vmcb);
615 }
617 static void svm_set_tsc_offset(struct vcpu *v, u64 offset)
618 {
619 v->arch.hvm_svm.vmcb->tsc_offset = offset;
620 }
622 static void svm_init_hypercall_page(struct domain *d, void *hypercall_page)
623 {
624 char *p;
625 int i;
627 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
628 {
629 p = (char *)(hypercall_page + (i * 32));
630 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
631 *(u32 *)(p + 1) = i;
632 *(u8 *)(p + 5) = 0x0f; /* vmmcall */
633 *(u8 *)(p + 6) = 0x01;
634 *(u8 *)(p + 7) = 0xd9;
635 *(u8 *)(p + 8) = 0xc3; /* ret */
636 }
638 /* Don't support HYPERVISOR_iret at the moment */
639 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
640 }
642 static void svm_ctxt_switch_from(struct vcpu *v)
643 {
644 int cpu = smp_processor_id();
646 svm_fpu_leave(v);
648 svm_save_dr(v);
650 svm_sync_vmcb(v);
651 svm_vmload(root_vmcb[cpu]);
653 #ifdef __x86_64__
654 /* Resume use of ISTs now that the host TR is reinstated. */
655 idt_tables[cpu][TRAP_double_fault].a |= IST_DF << 32;
656 idt_tables[cpu][TRAP_nmi].a |= IST_NMI << 32;
657 idt_tables[cpu][TRAP_machine_check].a |= IST_MCE << 32;
658 #endif
659 }
661 static void svm_ctxt_switch_to(struct vcpu *v)
662 {
663 int cpu = smp_processor_id();
665 #ifdef __x86_64__
666 /*
667 * This is required, because VMRUN does consistency check
668 * and some of the DOM0 selectors are pointing to
669 * invalid GDT locations, and cause AMD processors
670 * to shutdown.
671 */
672 set_segment_register(ds, 0);
673 set_segment_register(es, 0);
674 set_segment_register(ss, 0);
676 /*
677 * Cannot use ISTs for NMI/#MC/#DF while we are running with the guest TR.
678 * But this doesn't matter: the IST is only req'd to handle SYSCALL/SYSRET.
679 */
680 idt_tables[cpu][TRAP_double_fault].a &= ~(7UL << 32);
681 idt_tables[cpu][TRAP_nmi].a &= ~(7UL << 32);
682 idt_tables[cpu][TRAP_machine_check].a &= ~(7UL << 32);
683 #endif
685 svm_restore_dr(v);
687 svm_vmsave(root_vmcb[cpu]);
688 svm_vmload(v->arch.hvm_svm.vmcb);
689 }
691 static void svm_do_resume(struct vcpu *v)
692 {
693 bool_t debug_state = v->domain->debugger_attached;
695 if ( unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
696 {
697 uint32_t mask = (1U << TRAP_debug) | (1U << TRAP_int3);
698 v->arch.hvm_vcpu.debug_state_latch = debug_state;
699 if ( debug_state )
700 v->arch.hvm_svm.vmcb->exception_intercepts |= mask;
701 else
702 v->arch.hvm_svm.vmcb->exception_intercepts &= ~mask;
703 }
705 if ( v->arch.hvm_svm.launch_core != smp_processor_id() )
706 {
707 v->arch.hvm_svm.launch_core = smp_processor_id();
708 hvm_migrate_timers(v);
710 /* Migrating to another ASID domain. Request a new ASID. */
711 svm_asid_init_vcpu(v);
712 }
714 /* Reflect the vlapic's TPR in the hardware vtpr */
715 v->arch.hvm_svm.vmcb->vintr.fields.tpr =
716 (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xFF) >> 4;
718 hvm_do_resume(v);
719 reset_stack_and_jump(svm_asm_do_resume);
720 }
722 static int svm_domain_initialise(struct domain *d)
723 {
724 return 0;
725 }
727 static void svm_domain_destroy(struct domain *d)
728 {
729 }
731 static int svm_vcpu_initialise(struct vcpu *v)
732 {
733 int rc;
735 v->arch.schedule_tail = svm_do_resume;
736 v->arch.ctxt_switch_from = svm_ctxt_switch_from;
737 v->arch.ctxt_switch_to = svm_ctxt_switch_to;
739 v->arch.hvm_svm.launch_core = -1;
741 if ( (rc = svm_create_vmcb(v)) != 0 )
742 {
743 dprintk(XENLOG_WARNING,
744 "Failed to create VMCB for vcpu %d: err=%d.\n",
745 v->vcpu_id, rc);
746 return rc;
747 }
749 return 0;
750 }
752 static void svm_vcpu_destroy(struct vcpu *v)
753 {
754 svm_destroy_vmcb(v);
755 }
757 static void svm_inject_exception(
758 unsigned int trapnr, int errcode, unsigned long cr2)
759 {
760 struct vcpu *curr = current;
761 struct vmcb_struct *vmcb = curr->arch.hvm_svm.vmcb;
762 eventinj_t event = vmcb->eventinj;
764 switch ( trapnr )
765 {
766 case TRAP_debug:
767 if ( guest_cpu_user_regs()->eflags & X86_EFLAGS_TF )
768 {
769 __restore_debug_registers(curr);
770 vmcb->dr6 |= 0x4000;
771 }
772 case TRAP_int3:
773 if ( curr->domain->debugger_attached )
774 {
775 /* Debug/Int3: Trap to debugger. */
776 domain_pause_for_debugger();
777 return;
778 }
779 }
781 if ( unlikely(event.fields.v) &&
782 (event.fields.type == X86_EVENTTYPE_HW_EXCEPTION) )
783 {
784 trapnr = hvm_combine_hw_exceptions(event.fields.vector, trapnr);
785 if ( trapnr == TRAP_double_fault )
786 errcode = 0;
787 }
789 event.bytes = 0;
790 event.fields.v = 1;
791 event.fields.type = X86_EVENTTYPE_HW_EXCEPTION;
792 event.fields.vector = trapnr;
793 event.fields.ev = (errcode != HVM_DELIVER_NO_ERROR_CODE);
794 event.fields.errorcode = errcode;
796 vmcb->eventinj = event;
798 if ( trapnr == TRAP_page_fault )
799 {
800 vmcb->cr2 = curr->arch.hvm_vcpu.guest_cr[2] = cr2;
801 HVMTRACE_LONG_2D(PF_INJECT, errcode, TRC_PAR_LONG(cr2));
802 }
803 else
804 {
805 HVMTRACE_2D(INJ_EXC, trapnr, errcode);
806 }
807 }
809 static int svm_event_pending(struct vcpu *v)
810 {
811 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
812 return vmcb->eventinj.fields.v;
813 }
815 static int svm_do_pmu_interrupt(struct cpu_user_regs *regs)
816 {
817 return 0;
818 }
820 static struct hvm_function_table svm_function_table = {
821 .name = "SVM",
822 .cpu_down = svm_cpu_down,
823 .domain_initialise = svm_domain_initialise,
824 .domain_destroy = svm_domain_destroy,
825 .vcpu_initialise = svm_vcpu_initialise,
826 .vcpu_destroy = svm_vcpu_destroy,
827 .save_cpu_ctxt = svm_save_vmcb_ctxt,
828 .load_cpu_ctxt = svm_load_vmcb_ctxt,
829 .get_interrupt_shadow = svm_get_interrupt_shadow,
830 .set_interrupt_shadow = svm_set_interrupt_shadow,
831 .guest_x86_mode = svm_guest_x86_mode,
832 .get_segment_register = svm_get_segment_register,
833 .set_segment_register = svm_set_segment_register,
834 .update_host_cr3 = svm_update_host_cr3,
835 .update_guest_cr = svm_update_guest_cr,
836 .update_guest_efer = svm_update_guest_efer,
837 .flush_guest_tlbs = svm_flush_guest_tlbs,
838 .set_tsc_offset = svm_set_tsc_offset,
839 .inject_exception = svm_inject_exception,
840 .init_hypercall_page = svm_init_hypercall_page,
841 .event_pending = svm_event_pending,
842 .do_pmu_interrupt = svm_do_pmu_interrupt,
843 .cpuid_intercept = svm_cpuid_intercept,
844 .wbinvd_intercept = svm_wbinvd_intercept,
845 .fpu_dirty_intercept = svm_fpu_dirty_intercept,
846 .msr_read_intercept = svm_msr_read_intercept,
847 .msr_write_intercept = svm_msr_write_intercept,
848 .invlpg_intercept = svm_invlpg_intercept
849 };
851 static int svm_cpu_up(struct cpuinfo_x86 *c)
852 {
853 u32 eax, edx, phys_hsa_lo, phys_hsa_hi;
854 u64 phys_hsa;
855 int cpu = smp_processor_id();
857 /* Check whether SVM feature is disabled in BIOS */
858 rdmsr(MSR_K8_VM_CR, eax, edx);
859 if ( eax & K8_VMCR_SVME_DISABLE )
860 {
861 printk("CPU%d: AMD SVM Extension is disabled in BIOS.\n", cpu);
862 return 0;
863 }
865 if ( ((hsa[cpu] == NULL) &&
866 ((hsa[cpu] = alloc_host_save_area()) == NULL)) ||
867 ((root_vmcb[cpu] == NULL) &&
868 ((root_vmcb[cpu] = alloc_vmcb()) == NULL)) )
869 return 0;
871 write_efer(read_efer() | EFER_SVME);
873 /* Initialize the HSA for this core. */
874 phys_hsa = (u64)virt_to_maddr(hsa[cpu]);
875 phys_hsa_lo = (u32)phys_hsa;
876 phys_hsa_hi = (u32)(phys_hsa >> 32);
877 wrmsr(MSR_K8_VM_HSAVE_PA, phys_hsa_lo, phys_hsa_hi);
879 /* Initialize core's ASID handling. */
880 svm_asid_init(c);
882 return 1;
883 }
885 void start_svm(struct cpuinfo_x86 *c)
886 {
887 static bool_t bootstrapped;
889 if ( test_and_set_bool(bootstrapped) )
890 {
891 if ( hvm_enabled && !svm_cpu_up(c) )
892 {
893 printk("SVM: FATAL: failed to initialise CPU%d!\n",
894 smp_processor_id());
895 BUG();
896 }
897 return;
898 }
900 /* Xen does not fill x86_capability words except 0. */
901 boot_cpu_data.x86_capability[5] = cpuid_ecx(0x80000001);
903 if ( !test_bit(X86_FEATURE_SVME, &boot_cpu_data.x86_capability) )
904 return;
906 if ( !svm_cpu_up(c) )
907 {
908 printk("SVM: failed to initialise.\n");
909 return;
910 }
912 setup_vmcb_dump();
914 svm_feature_flags = ((cpuid_eax(0x80000000) >= 0x8000000A) ?
915 cpuid_edx(0x8000000A) : 0);
917 svm_function_table.hap_supported = cpu_has_svm_npt;
919 hvm_enable(&svm_function_table);
920 }
922 static void svm_do_nested_pgfault(paddr_t gpa, struct cpu_user_regs *regs)
923 {
924 p2m_type_t p2mt;
925 mfn_t mfn;
926 unsigned long gfn = gpa >> PAGE_SHIFT;
928 /*
929 * If this GFN is emulated MMIO or marked as read-only, pass the fault
930 * to the mmio handler.
931 */
932 mfn = gfn_to_mfn_type_current(gfn, &p2mt, p2m_guest);
933 if ( (p2mt == p2m_mmio_dm) || (p2mt == p2m_ram_ro) )
934 {
935 if ( !handle_mmio() )
936 hvm_inject_exception(TRAP_gp_fault, 0, 0);
937 return;
938 }
940 /* Log-dirty: mark the page dirty and let the guest write it again */
941 paging_mark_dirty(current->domain, mfn_x(mfn));
942 p2m_change_type(current->domain, gfn, p2m_ram_logdirty, p2m_ram_rw);
943 }
945 static void svm_fpu_dirty_intercept(void)
946 {
947 struct vcpu *curr = current;
948 struct vmcb_struct *vmcb = curr->arch.hvm_svm.vmcb;
950 svm_fpu_enter(curr);
952 if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
953 vmcb->cr0 &= ~X86_CR0_TS;
954 }
956 #define bitmaskof(idx) (1U << ((idx) & 31))
957 static void svm_cpuid_intercept(
958 unsigned int *eax, unsigned int *ebx,
959 unsigned int *ecx, unsigned int *edx)
960 {
961 unsigned int input = *eax;
962 struct vcpu *v = current;
964 hvm_cpuid(input, eax, ebx, ecx, edx);
966 if ( input == 0x80000001 )
967 {
968 /* Fix up VLAPIC details. */
969 if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
970 __clear_bit(X86_FEATURE_APIC & 31, edx);
971 }
973 HVMTRACE_5D (CPUID, input, *eax, *ebx, *ecx, *edx);
974 }
976 static void svm_vmexit_do_cpuid(struct cpu_user_regs *regs)
977 {
978 unsigned int eax, ebx, ecx, edx, inst_len;
980 if ( (inst_len = __get_instruction_length(current, INSTR_CPUID)) == 0 )
981 return;
983 eax = regs->eax;
984 ebx = regs->ebx;
985 ecx = regs->ecx;
986 edx = regs->edx;
988 svm_cpuid_intercept(&eax, &ebx, &ecx, &edx);
990 regs->eax = eax;
991 regs->ebx = ebx;
992 regs->ecx = ecx;
993 regs->edx = edx;
995 __update_guest_eip(regs, inst_len);
996 }
998 static void svm_dr_access(struct vcpu *v, struct cpu_user_regs *regs)
999 {
1000 HVMTRACE_0D(DR_WRITE);
1001 __restore_debug_registers(v);
1002 }
1004 static int svm_msr_read_intercept(struct cpu_user_regs *regs)
1005 {
1006 u64 msr_content = 0;
1007 u32 ecx = regs->ecx, eax, edx;
1008 struct vcpu *v = current;
1009 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1011 switch ( ecx )
1012 {
1013 case MSR_EFER:
1014 msr_content = v->arch.hvm_vcpu.guest_efer;
1015 break;
1017 case MSR_IA32_SYSENTER_CS:
1018 msr_content = v->arch.hvm_svm.guest_sysenter_cs;
1019 break;
1020 case MSR_IA32_SYSENTER_ESP:
1021 msr_content = v->arch.hvm_svm.guest_sysenter_esp;
1022 break;
1023 case MSR_IA32_SYSENTER_EIP:
1024 msr_content = v->arch.hvm_svm.guest_sysenter_eip;
1025 break;
1027 case MSR_IA32_MC4_MISC: /* Threshold register */
1028 case MSR_F10_MC4_MISC1 ... MSR_F10_MC4_MISC3:
1029 /*
1030 * MCA/MCE: We report that the threshold register is unavailable
1031 * for OS use (locked by the BIOS).
1032 */
1033 msr_content = 1ULL << 61; /* MC4_MISC.Locked */
1034 break;
1036 case MSR_IA32_EBC_FREQUENCY_ID:
1037 /*
1038 * This Intel-only register may be accessed if this HVM guest
1039 * has been migrated from an Intel host. The value zero is not
1040 * particularly meaningful, but at least avoids the guest crashing!
1041 */
1042 msr_content = 0;
1043 break;
1045 case MSR_K8_VM_HSAVE_PA:
1046 goto gpf;
1048 case MSR_IA32_DEBUGCTLMSR:
1049 msr_content = vmcb->debugctlmsr;
1050 break;
1052 case MSR_IA32_LASTBRANCHFROMIP:
1053 msr_content = vmcb->lastbranchfromip;
1054 break;
1056 case MSR_IA32_LASTBRANCHTOIP:
1057 msr_content = vmcb->lastbranchtoip;
1058 break;
1060 case MSR_IA32_LASTINTFROMIP:
1061 msr_content = vmcb->lastintfromip;
1062 break;
1064 case MSR_IA32_LASTINTTOIP:
1065 msr_content = vmcb->lastinttoip;
1066 break;
1068 default:
1069 if ( rdmsr_viridian_regs(ecx, &eax, &edx) ||
1070 rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
1071 rdmsr_safe(ecx, eax, edx) == 0 )
1072 {
1073 regs->eax = eax;
1074 regs->edx = edx;
1075 goto done;
1076 }
1077 goto gpf;
1078 }
1079 regs->eax = msr_content & 0xFFFFFFFF;
1080 regs->edx = msr_content >> 32;
1082 done:
1083 HVMTRACE_3D (MSR_READ, ecx, regs->eax, regs->edx);
1084 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
1085 ecx, (unsigned long)regs->eax, (unsigned long)regs->edx);
1086 return X86EMUL_OKAY;
1088 gpf:
1089 svm_inject_exception(TRAP_gp_fault, 0, 0);
1090 return X86EMUL_EXCEPTION;
1091 }
1093 static int svm_msr_write_intercept(struct cpu_user_regs *regs)
1094 {
1095 u64 msr_content = 0;
1096 u32 ecx = regs->ecx;
1097 struct vcpu *v = current;
1098 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1100 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
1102 HVMTRACE_3D (MSR_WRITE, ecx, regs->eax, regs->edx);
1104 switch ( ecx )
1105 {
1106 case MSR_K8_VM_HSAVE_PA:
1107 goto gpf;
1109 case MSR_IA32_SYSENTER_CS:
1110 v->arch.hvm_svm.guest_sysenter_cs = msr_content;
1111 break;
1112 case MSR_IA32_SYSENTER_ESP:
1113 v->arch.hvm_svm.guest_sysenter_esp = msr_content;
1114 break;
1115 case MSR_IA32_SYSENTER_EIP:
1116 v->arch.hvm_svm.guest_sysenter_eip = msr_content;
1117 break;
1119 case MSR_IA32_DEBUGCTLMSR:
1120 vmcb->debugctlmsr = msr_content;
1121 if ( !msr_content || !cpu_has_svm_lbrv )
1122 break;
1123 vmcb->lbr_control.fields.enable = 1;
1124 svm_disable_intercept_for_msr(v, MSR_IA32_DEBUGCTLMSR);
1125 svm_disable_intercept_for_msr(v, MSR_IA32_LASTBRANCHFROMIP);
1126 svm_disable_intercept_for_msr(v, MSR_IA32_LASTBRANCHTOIP);
1127 svm_disable_intercept_for_msr(v, MSR_IA32_LASTINTFROMIP);
1128 svm_disable_intercept_for_msr(v, MSR_IA32_LASTINTTOIP);
1129 break;
1131 case MSR_IA32_LASTBRANCHFROMIP:
1132 vmcb->lastbranchfromip = msr_content;
1133 break;
1135 case MSR_IA32_LASTBRANCHTOIP:
1136 vmcb->lastbranchtoip = msr_content;
1137 break;
1139 case MSR_IA32_LASTINTFROMIP:
1140 vmcb->lastintfromip = msr_content;
1141 break;
1143 case MSR_IA32_LASTINTTOIP:
1144 vmcb->lastinttoip = msr_content;
1145 break;
1147 default:
1148 if ( wrmsr_viridian_regs(ecx, regs->eax, regs->edx) )
1149 break;
1151 switch ( long_mode_do_msr_write(regs) )
1152 {
1153 case HNDL_unhandled:
1154 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
1155 break;
1156 case HNDL_exception_raised:
1157 return X86EMUL_EXCEPTION;
1158 case HNDL_done:
1159 break;
1160 }
1161 break;
1162 }
1164 return X86EMUL_OKAY;
1166 gpf:
1167 svm_inject_exception(TRAP_gp_fault, 0, 0);
1168 return X86EMUL_EXCEPTION;
1169 }
1171 static void svm_do_msr_access(struct cpu_user_regs *regs)
1172 {
1173 int rc, inst_len;
1174 struct vcpu *v = current;
1175 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1177 if ( vmcb->exitinfo1 == 0 )
1178 {
1179 if ( (inst_len = __get_instruction_length(v, INSTR_RDMSR)) == 0 )
1180 return;
1181 rc = hvm_msr_read_intercept(regs);
1182 }
1183 else
1184 {
1185 if ( (inst_len = __get_instruction_length(v, INSTR_WRMSR)) == 0 )
1186 return;
1187 rc = hvm_msr_write_intercept(regs);
1188 }
1190 if ( rc == X86EMUL_OKAY )
1191 __update_guest_eip(regs, inst_len);
1192 }
1194 static void svm_vmexit_do_hlt(struct vmcb_struct *vmcb,
1195 struct cpu_user_regs *regs)
1196 {
1197 unsigned int inst_len;
1199 if ( (inst_len = __get_instruction_length(current, INSTR_HLT)) == 0 )
1200 return;
1201 __update_guest_eip(regs, inst_len);
1203 hvm_hlt(regs->eflags);
1204 }
1206 static void svm_vmexit_do_rdtsc(struct cpu_user_regs *regs)
1207 {
1208 unsigned int inst_len;
1210 if ( (inst_len = __get_instruction_length(current, INSTR_RDTSC)) == 0 )
1211 return;
1212 __update_guest_eip(regs, inst_len);
1214 hvm_rdtsc_intercept(regs);
1215 }
1217 static void svm_vmexit_ud_intercept(struct cpu_user_regs *regs)
1218 {
1219 struct hvm_emulate_ctxt ctxt;
1220 int rc;
1222 hvm_emulate_prepare(&ctxt, regs);
1224 rc = hvm_emulate_one(&ctxt);
1226 switch ( rc )
1227 {
1228 case X86EMUL_UNHANDLEABLE:
1229 svm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
1230 break;
1231 case X86EMUL_EXCEPTION:
1232 if ( ctxt.exn_pending )
1233 hvm_inject_exception(ctxt.exn_vector, ctxt.exn_error_code, 0);
1234 /* fall through */
1235 default:
1236 hvm_emulate_writeback(&ctxt);
1237 break;
1238 }
1239 }
1241 static void wbinvd_ipi(void *info)
1242 {
1243 wbinvd();
1244 }
1246 static void svm_wbinvd_intercept(void)
1247 {
1248 if ( has_arch_pdevs(current->domain) )
1249 on_each_cpu(wbinvd_ipi, NULL, 1);
1250 }
1252 static void svm_vmexit_do_invalidate_cache(struct cpu_user_regs *regs)
1253 {
1254 enum instruction_index list[] = { INSTR_INVD, INSTR_WBINVD };
1255 int inst_len;
1257 inst_len = __get_instruction_length_from_list(
1258 current, list, ARRAY_SIZE(list));
1259 if ( inst_len == 0 )
1260 return;
1262 svm_wbinvd_intercept();
1264 __update_guest_eip(regs, inst_len);
1265 }
1267 static void svm_invlpg_intercept(unsigned long vaddr)
1268 {
1269 struct vcpu *curr = current;
1270 HVMTRACE_LONG_2D(INVLPG, 0, TRC_PAR_LONG(vaddr));
1271 paging_invlpg(curr, vaddr);
1272 svm_asid_g_invlpg(curr, vaddr);
1273 }
1275 asmlinkage void svm_vmexit_handler(struct cpu_user_regs *regs)
1276 {
1277 unsigned int exit_reason;
1278 struct vcpu *v = current;
1279 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1280 eventinj_t eventinj;
1281 int inst_len, rc;
1283 /*
1284 * Before doing anything else, we need to sync up the VLAPIC's TPR with
1285 * SVM's vTPR. It's OK if the guest doesn't touch CR8 (e.g. 32-bit Windows)
1286 * because we update the vTPR on MMIO writes to the TPR.
1287 */
1288 vlapic_set_reg(vcpu_vlapic(v), APIC_TASKPRI,
1289 (vmcb->vintr.fields.tpr & 0x0F) << 4);
1291 exit_reason = vmcb->exitcode;
1293 if ( hvm_long_mode_enabled(v) )
1294 HVMTRACE_ND(VMEXIT64, 1/*cycles*/, 3, exit_reason,
1295 (uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32),
1296 0, 0, 0);
1297 else
1298 HVMTRACE_ND(VMEXIT, 1/*cycles*/, 2, exit_reason,
1299 (uint32_t)regs->eip,
1300 0, 0, 0, 0);
1302 if ( unlikely(exit_reason == VMEXIT_INVALID) )
1303 {
1304 svm_dump_vmcb(__func__, vmcb);
1305 goto exit_and_crash;
1306 }
1308 perfc_incra(svmexits, exit_reason);
1310 hvm_maybe_deassert_evtchn_irq();
1312 /* Event delivery caused this intercept? Queue for redelivery. */
1313 eventinj = vmcb->exitintinfo;
1314 if ( unlikely(eventinj.fields.v) &&
1315 hvm_event_needs_reinjection(eventinj.fields.type,
1316 eventinj.fields.vector) )
1317 vmcb->eventinj = eventinj;
1319 switch ( exit_reason )
1320 {
1321 case VMEXIT_INTR:
1322 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
1323 HVMTRACE_0D(INTR);
1324 break;
1326 case VMEXIT_NMI:
1327 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
1328 HVMTRACE_0D(NMI);
1329 break;
1331 case VMEXIT_SMI:
1332 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
1333 HVMTRACE_0D(SMI);
1334 break;
1336 case VMEXIT_EXCEPTION_DB:
1337 if ( !v->domain->debugger_attached )
1338 goto exit_and_crash;
1339 domain_pause_for_debugger();
1340 break;
1342 case VMEXIT_EXCEPTION_BP:
1343 if ( !v->domain->debugger_attached )
1344 goto exit_and_crash;
1345 /* AMD Vol2, 15.11: INT3, INTO, BOUND intercepts do not update RIP. */
1346 if ( (inst_len = __get_instruction_length(v, INSTR_INT3)) == 0 )
1347 break;
1348 __update_guest_eip(regs, inst_len);
1349 domain_pause_for_debugger();
1350 break;
1352 case VMEXIT_EXCEPTION_NM:
1353 svm_fpu_dirty_intercept();
1354 break;
1356 case VMEXIT_EXCEPTION_PF: {
1357 unsigned long va;
1358 va = vmcb->exitinfo2;
1359 regs->error_code = vmcb->exitinfo1;
1360 HVM_DBG_LOG(DBG_LEVEL_VMMU,
1361 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
1362 (unsigned long)regs->eax, (unsigned long)regs->ebx,
1363 (unsigned long)regs->ecx, (unsigned long)regs->edx,
1364 (unsigned long)regs->esi, (unsigned long)regs->edi);
1366 if ( paging_fault(va, regs) )
1367 {
1368 if ( trace_will_trace_event(TRC_SHADOW) )
1369 break;
1370 if ( hvm_long_mode_enabled(v) )
1371 HVMTRACE_LONG_2D(PF_XEN, regs->error_code, TRC_PAR_LONG(va));
1372 else
1373 HVMTRACE_2D(PF_XEN, regs->error_code, va);
1374 break;
1375 }
1377 svm_inject_exception(TRAP_page_fault, regs->error_code, va);
1378 break;
1379 }
1381 case VMEXIT_EXCEPTION_UD:
1382 svm_vmexit_ud_intercept(regs);
1383 break;
1385 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
1386 case VMEXIT_EXCEPTION_MC:
1387 HVMTRACE_0D(MCE);
1388 break;
1390 case VMEXIT_VINTR:
1391 vmcb->vintr.fields.irq = 0;
1392 vmcb->general1_intercepts &= ~GENERAL1_INTERCEPT_VINTR;
1393 break;
1395 case VMEXIT_INVD:
1396 case VMEXIT_WBINVD:
1397 svm_vmexit_do_invalidate_cache(regs);
1398 break;
1400 case VMEXIT_TASK_SWITCH: {
1401 enum hvm_task_switch_reason reason;
1402 int32_t errcode = -1;
1403 if ( (vmcb->exitinfo2 >> 36) & 1 )
1404 reason = TSW_iret;
1405 else if ( (vmcb->exitinfo2 >> 38) & 1 )
1406 reason = TSW_jmp;
1407 else
1408 reason = TSW_call_or_int;
1409 if ( (vmcb->exitinfo2 >> 44) & 1 )
1410 errcode = (uint32_t)vmcb->exitinfo2;
1412 /*
1413 * Some processors set the EXITINTINFO field when the task switch
1414 * is caused by a task gate in the IDT. In this case we will be
1415 * emulating the event injection, so we do not want the processor
1416 * to re-inject the original event!
1417 */
1418 vmcb->eventinj.bytes = 0;
1420 hvm_task_switch((uint16_t)vmcb->exitinfo1, reason, errcode);
1421 break;
1422 }
1424 case VMEXIT_CPUID:
1425 svm_vmexit_do_cpuid(regs);
1426 break;
1428 case VMEXIT_HLT:
1429 svm_vmexit_do_hlt(vmcb, regs);
1430 break;
1432 case VMEXIT_CR0_READ ... VMEXIT_CR15_READ:
1433 case VMEXIT_CR0_WRITE ... VMEXIT_CR15_WRITE:
1434 case VMEXIT_INVLPG:
1435 case VMEXIT_INVLPGA:
1436 case VMEXIT_IOIO:
1437 if ( !handle_mmio() )
1438 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1439 break;
1441 case VMEXIT_VMMCALL:
1442 if ( (inst_len = __get_instruction_length(v, INSTR_VMCALL)) == 0 )
1443 break;
1444 HVMTRACE_1D(VMMCALL, regs->eax);
1445 rc = hvm_do_hypercall(regs);
1446 if ( rc != HVM_HCALL_preempted )
1447 {
1448 __update_guest_eip(regs, inst_len);
1449 if ( rc == HVM_HCALL_invalidate )
1450 send_invalidate_req();
1451 }
1452 break;
1454 case VMEXIT_DR0_READ ... VMEXIT_DR7_READ:
1455 case VMEXIT_DR0_WRITE ... VMEXIT_DR7_WRITE:
1456 svm_dr_access(v, regs);
1457 break;
1459 case VMEXIT_MSR:
1460 svm_do_msr_access(regs);
1461 break;
1463 case VMEXIT_SHUTDOWN:
1464 hvm_triple_fault();
1465 break;
1467 case VMEXIT_RDTSC:
1468 svm_vmexit_do_rdtsc(regs);
1469 break;
1471 case VMEXIT_RDTSCP:
1472 case VMEXIT_MONITOR:
1473 case VMEXIT_MWAIT:
1474 case VMEXIT_VMRUN:
1475 case VMEXIT_VMLOAD:
1476 case VMEXIT_VMSAVE:
1477 case VMEXIT_STGI:
1478 case VMEXIT_CLGI:
1479 case VMEXIT_SKINIT:
1480 svm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
1481 break;
1483 case VMEXIT_NPF:
1484 perfc_incra(svmexits, VMEXIT_NPF_PERFC);
1485 regs->error_code = vmcb->exitinfo1;
1486 svm_do_nested_pgfault(vmcb->exitinfo2, regs);
1487 break;
1489 case VMEXIT_IRET:
1490 /*
1491 * IRET clears the NMI mask. However because we clear the mask
1492 * /before/ executing IRET, we set the interrupt shadow to prevent
1493 * a pending NMI from being injected immediately. This will work
1494 * perfectly unless the IRET instruction faults: in that case we
1495 * may inject an NMI before the NMI handler's IRET instruction is
1496 * retired.
1497 */
1498 vmcb->general1_intercepts &= ~GENERAL1_INTERCEPT_IRET;
1499 vmcb->interrupt_shadow = 1;
1500 break;
1502 case VMEXIT_PAUSE:
1503 /*
1504 * The guest is running a contended spinlock and we've detected it.
1505 * Do something useful, like rescheduling the guest.
1506 */
1507 perfc_incr(pauseloop_exits);
1508 do_sched_op_compat(SCHEDOP_yield, 0);
1509 break;
1511 default:
1512 exit_and_crash:
1513 gdprintk(XENLOG_ERR, "unexpected VMEXIT: exit reason = 0x%x, "
1514 "exitinfo1 = %"PRIx64", exitinfo2 = %"PRIx64"\n",
1515 exit_reason,
1516 (u64)vmcb->exitinfo1, (u64)vmcb->exitinfo2);
1517 domain_crash(v->domain);
1518 break;
1519 }
1521 /* The exit may have updated the TPR: reflect this in the hardware vtpr */
1522 vmcb->vintr.fields.tpr =
1523 (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xFF) >> 4;
1524 }
1526 asmlinkage void svm_trace_vmentry(void)
1527 {
1528 HVMTRACE_ND (VMENTRY, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
1529 }
1531 /*
1532 * Local variables:
1533 * mode: C
1534 * c-set-style: "BSD"
1535 * c-basic-offset: 4
1536 * tab-width: 4
1537 * indent-tabs-mode: nil
1538 * End:
1539 */