debuggers.hg: xen/arch/x86/hvm/hvm.c @ changeset 17986:f2148e532c81

x86 hvm: Fix RTC handling.
  1. Clean up initialisation/destruction.
  2. Better handle per-domain time-offset changes.
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>

author    Keir Fraser <keir.fraser@citrix.com>
date      Wed Jul 02 17:25:05 2008 +0100
parents   469d9b00382d
children  1e9df5cb885f
1 /*
2 * hvm.c: Common hardware virtual machine abstractions.
3 *
4 * Copyright (c) 2004, Intel Corporation.
5 * Copyright (c) 2005, International Business Machines Corporation.
6 * Copyright (c) 2008, Citrix Systems, Inc.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 * Place - Suite 330, Boston, MA 02111-1307 USA.
20 */
22 #include <xen/config.h>
23 #include <xen/init.h>
24 #include <xen/lib.h>
25 #include <xen/trace.h>
26 #include <xen/sched.h>
27 #include <xen/irq.h>
28 #include <xen/softirq.h>
29 #include <xen/domain.h>
30 #include <xen/domain_page.h>
31 #include <xen/hypercall.h>
32 #include <xen/guest_access.h>
33 #include <xen/event.h>
34 #include <asm/current.h>
35 #include <asm/e820.h>
36 #include <asm/io.h>
37 #include <asm/paging.h>
38 #include <asm/regs.h>
39 #include <asm/cpufeature.h>
40 #include <asm/processor.h>
41 #include <asm/types.h>
42 #include <asm/msr.h>
43 #include <asm/mc146818rtc.h>
44 #include <asm/spinlock.h>
45 #include <asm/hvm/hvm.h>
46 #include <asm/hvm/vpt.h>
47 #include <asm/hvm/support.h>
48 #include <asm/hvm/cacheattr.h>
49 #include <asm/hvm/trace.h>
50 #include <public/sched.h>
51 #include <public/hvm/ioreq.h>
52 #include <public/version.h>
53 #include <public/memory.h>
55 int hvm_enabled __read_mostly;
57 unsigned int opt_hvm_debug_level __read_mostly;
58 integer_param("hvm_debug", opt_hvm_debug_level);
60 struct hvm_function_table hvm_funcs __read_mostly;
62 /* I/O permission bitmap is globally shared by all HVM guests. */
63 unsigned long __attribute__ ((__section__ (".bss.page_aligned")))
64 hvm_io_bitmap[3*PAGE_SIZE/BYTES_PER_LONG];
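/*
 * Note on the size: three pages cover both vendors' formats -- VT-x uses two
 * 4kB I/O bitmaps (ports 0x0000-0xFFFF), while SVM's IOPM is 12kB -- so the
 * single bitmap initialised in hvm_enable() below can be handed to either
 * implementation.
 */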
66 void hvm_enable(struct hvm_function_table *fns)
67 {
68 BUG_ON(hvm_enabled);
69 printk("HVM: %s enabled\n", fns->name);
71 /*
72 * Allow direct access to the PC debug port (it is often used for I/O
73 * delays, but the vmexits simply slow things down).
74 */
75 memset(hvm_io_bitmap, ~0, sizeof(hvm_io_bitmap));
76 __clear_bit(0x80, hvm_io_bitmap);
78 hvm_funcs = *fns;
79 hvm_enabled = 1;
81 if ( hvm_funcs.hap_supported )
82 printk("HVM: Hardware Assisted Paging detected.\n");
83 }
85 /*
86 * Need to re-inject a given event? We avoid re-injecting software exceptions
87 * and interrupts because the faulting/trapping instruction can simply be
88 * re-executed (neither VMX nor SVM update RIP when they VMEXIT during
89 * INT3/INTO/INTn).
90 */
91 int hvm_event_needs_reinjection(uint8_t type, uint8_t vector)
92 {
93 switch ( type )
94 {
95 case X86_EVENTTYPE_EXT_INTR:
96 case X86_EVENTTYPE_NMI:
97 return 1;
98 case X86_EVENTTYPE_HW_EXCEPTION:
99 /*
100 * SVM uses type 3 ("HW Exception") for #OF and #BP. We explicitly
101 * check for these vectors, as they are really SW Exceptions. SVM has
102 * not updated RIP to point after the trapping instruction (INT3/INTO).
103 */
104 return (vector != 3) && (vector != 4);
105 default:
106 /* Software exceptions/interrupts can be re-executed (e.g., INT n). */
107 break;
108 }
109 return 0;
110 }
112 /*
113 * Combine two hardware exceptions: @vec2 was raised during delivery of @vec1.
114 * This means we can assume that @vec2 is contributory or a page fault.
115 */
116 uint8_t hvm_combine_hw_exceptions(uint8_t vec1, uint8_t vec2)
117 {
118 /* Exception during double-fault delivery always causes a triple fault. */
119 if ( vec1 == TRAP_double_fault )
120 {
121 hvm_triple_fault();
122 return TRAP_double_fault; /* dummy return */
123 }
125 /* Exception during page-fault delivery always causes a double fault. */
126 if ( vec1 == TRAP_page_fault )
127 return TRAP_double_fault;
129 /* Discard the first exception if it's benign or if we now have a #PF. */
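    /*
     * (Mask decode: 0x7c01 covers vectors 0 (#DE), 10 (#TS), 11 (#NP),
     * 12 (#SS), 13 (#GP) and 14 (#PF), i.e. the contributory exceptions
     * plus page fault; any vector outside this set is benign, in which
     * case the second exception is simply delivered on its own.)
     */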
130 if ( !((1u << vec1) & 0x7c01u) || (vec2 == TRAP_page_fault) )
131 return vec2;
133 /* Cannot combine the exceptions: double fault. */
134 return TRAP_double_fault;
135 }
137 void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc)
138 {
139 u64 host_tsc;
141 rdtscll(host_tsc);
143 v->arch.hvm_vcpu.cache_tsc_offset = guest_tsc - host_tsc;
144 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
145 }
147 u64 hvm_get_guest_tsc(struct vcpu *v)
148 {
149 u64 host_tsc;
151 rdtscll(host_tsc);
152 return host_tsc + v->arch.hvm_vcpu.cache_tsc_offset;
153 }
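/*
 * Both TSC helpers above work in terms of a cached guest/host TSC delta:
 * hvm_set_guest_tsc() recomputes cache_tsc_offset from the current host TSC
 * and pushes it to hardware via the vendor set_tsc_offset() hook, while
 * hvm_get_guest_tsc() reads the host TSC and applies the same cached offset.
 */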
155 void hvm_migrate_timers(struct vcpu *v)
156 {
157 rtc_migrate_timers(v);
158 hpet_migrate_timers(v);
159 pt_migrate(v);
160 }
162 void hvm_do_resume(struct vcpu *v)
163 {
164 ioreq_t *p;
166 pt_restore_timer(v);
168 /* NB. Optimised for common case (p->state == STATE_IOREQ_NONE). */
169 p = &get_ioreq(v)->vp_ioreq;
170 while ( p->state != STATE_IOREQ_NONE )
171 {
172 switch ( p->state )
173 {
174 case STATE_IORESP_READY: /* IORESP_READY -> NONE */
175 hvm_io_assist();
176 break;
177 case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
178 case STATE_IOREQ_INPROCESS:
179 wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port,
180 (p->state != STATE_IOREQ_READY) &&
181 (p->state != STATE_IOREQ_INPROCESS));
182 break;
183 default:
184 gdprintk(XENLOG_ERR, "Weird HVM iorequest state %d.\n", p->state);
185 domain_crash(v->domain);
186 return; /* bail */
187 }
188 }
189 }
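/*
 * Lifecycle of the shared ioreq pages: hvm_init_ioreq_page() zeroes the
 * tracking structure and pauses the domain; the pause is only dropped at the
 * end of hvm_set_ioreq_page(), once a guest frame has been successfully
 * mapped, so vcpus cannot run before the device model's shared pages exist.
 * hvm_destroy_ioreq_page() tears the mapping down when the domain is dying.
 */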
191 static void hvm_init_ioreq_page(
192 struct domain *d, struct hvm_ioreq_page *iorp)
193 {
194 memset(iorp, 0, sizeof(*iorp));
195 spin_lock_init(&iorp->lock);
196 domain_pause(d);
197 }
199 static void hvm_destroy_ioreq_page(
200 struct domain *d, struct hvm_ioreq_page *iorp)
201 {
202 spin_lock(&iorp->lock);
204 ASSERT(d->is_dying);
206 if ( iorp->va != NULL )
207 {
208 unmap_domain_page_global(iorp->va);
209 put_page_and_type(iorp->page);
210 iorp->va = NULL;
211 }
213 spin_unlock(&iorp->lock);
214 }
216 static int hvm_set_ioreq_page(
217 struct domain *d, struct hvm_ioreq_page *iorp, unsigned long gmfn)
218 {
219 struct page_info *page;
220 p2m_type_t p2mt;
221 unsigned long mfn;
222 void *va;
224 mfn = mfn_x(gfn_to_mfn(d, gmfn, &p2mt));
225 if ( !p2m_is_ram(p2mt) )
226 return -EINVAL;
227 ASSERT(mfn_valid(mfn));
229 page = mfn_to_page(mfn);
230 if ( !get_page_and_type(page, d, PGT_writable_page) )
231 return -EINVAL;
233 va = map_domain_page_global(mfn);
234 if ( va == NULL )
235 {
236 put_page_and_type(page);
237 return -ENOMEM;
238 }
240 spin_lock(&iorp->lock);
242 if ( (iorp->va != NULL) || d->is_dying )
243 {
244 spin_unlock(&iorp->lock);
245 unmap_domain_page_global(va);
246 put_page_and_type(mfn_to_page(mfn));
247 return -EINVAL;
248 }
250 iorp->va = va;
251 iorp->page = page;
253 spin_unlock(&iorp->lock);
255 domain_unpause(d);
257 return 0;
258 }
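/*
 * hvm_print_line() below implements the guest debug-output port: single-byte
 * writes (registered for port 0xe9 in hvm_domain_initialise()) are buffered
 * per-domain and flushed to the Xen console on '\n' or when the buffer is
 * nearly full.
 */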
260 static int hvm_print_line(
261 int dir, uint32_t port, uint32_t bytes, uint32_t *val)
262 {
263 struct vcpu *curr = current;
264 struct hvm_domain *hd = &curr->domain->arch.hvm_domain;
265 char c = *val;
267 BUG_ON(bytes != 1);
269 spin_lock(&hd->pbuf_lock);
270 hd->pbuf[hd->pbuf_idx++] = c;
271 if ( (hd->pbuf_idx == (sizeof(hd->pbuf) - 2)) || (c == '\n') )
272 {
273 if ( c != '\n' )
274 hd->pbuf[hd->pbuf_idx++] = '\n';
275 hd->pbuf[hd->pbuf_idx] = '\0';
276 printk(XENLOG_G_DEBUG "HVM%u: %s", curr->domain->domain_id, hd->pbuf);
277 hd->pbuf_idx = 0;
278 }
279 spin_unlock(&hd->pbuf_lock);
281 return X86EMUL_OKAY;
282 }
284 int hvm_domain_initialise(struct domain *d)
285 {
286 int rc;
288 if ( !hvm_enabled )
289 {
290 gdprintk(XENLOG_WARNING, "Attempt to create a HVM guest "
291 "on a non-VT/AMDV platform.\n");
292 return -EINVAL;
293 }
295 spin_lock_init(&d->arch.hvm_domain.pbuf_lock);
296 spin_lock_init(&d->arch.hvm_domain.irq_lock);
297 spin_lock_init(&d->arch.hvm_domain.uc_lock);
299 hvm_init_guest_time(d);
301 d->arch.hvm_domain.params[HVM_PARAM_HPET_ENABLED] = 1;
303 hvm_init_cacheattr_region_list(d);
305 rc = paging_enable(d, PG_refcounts|PG_translate|PG_external);
306 if ( rc != 0 )
307 goto fail1;
309 vpic_init(d);
311 rc = vioapic_init(d);
312 if ( rc != 0 )
313 goto fail1;
315 stdvga_init(d);
317 rtc_init(d);
319 hvm_init_ioreq_page(d, &d->arch.hvm_domain.ioreq);
320 hvm_init_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
322 register_portio_handler(d, 0xe9, 1, hvm_print_line);
324 rc = hvm_funcs.domain_initialise(d);
325 if ( rc != 0 )
326 goto fail2;
328 return 0;
330 fail2:
331 rtc_deinit(d);
332 stdvga_deinit(d);
333 vioapic_deinit(d);
334 fail1:
335 hvm_destroy_cacheattr_region_list(d);
336 return rc;
337 }
339 void hvm_domain_relinquish_resources(struct domain *d)
340 {
341 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.ioreq);
342 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
344 /* Stop all asynchronous timer actions. */
345 rtc_deinit(d);
346 if ( d->vcpu[0] != NULL )
347 {
348 pit_deinit(d);
349 pmtimer_deinit(d);
350 hpet_deinit(d);
351 }
352 }
354 void hvm_domain_destroy(struct domain *d)
355 {
356 hvm_funcs.domain_destroy(d);
357 rtc_deinit(d);
358 stdvga_deinit(d);
359 vioapic_deinit(d);
360 hvm_destroy_cacheattr_region_list(d);
361 }
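/*
 * The next two functions save and restore the architectural register state
 * of each vcpu; they are hooked into the HVM save/restore machinery by the
 * HVM_REGISTER_SAVE_RESTORE(CPU, ...) declaration further down.
 */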
363 static int hvm_save_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
364 {
365 struct vcpu *v;
366 struct hvm_hw_cpu ctxt;
367 struct segment_register seg;
368 struct vcpu_guest_context *vc;
370 for_each_vcpu ( d, v )
371 {
372 /* We don't need to save state for a vcpu that is down; the restore
373 * code will leave it down if there is nothing saved. */
374 if ( test_bit(_VPF_down, &v->pause_flags) )
375 continue;
377 /* Architecture-specific vmcs/vmcb bits */
378 hvm_funcs.save_cpu_ctxt(v, &ctxt);
380 hvm_get_segment_register(v, x86_seg_idtr, &seg);
381 ctxt.idtr_limit = seg.limit;
382 ctxt.idtr_base = seg.base;
384 hvm_get_segment_register(v, x86_seg_gdtr, &seg);
385 ctxt.gdtr_limit = seg.limit;
386 ctxt.gdtr_base = seg.base;
388 hvm_get_segment_register(v, x86_seg_cs, &seg);
389 ctxt.cs_sel = seg.sel;
390 ctxt.cs_limit = seg.limit;
391 ctxt.cs_base = seg.base;
392 ctxt.cs_arbytes = seg.attr.bytes;
394 hvm_get_segment_register(v, x86_seg_ds, &seg);
395 ctxt.ds_sel = seg.sel;
396 ctxt.ds_limit = seg.limit;
397 ctxt.ds_base = seg.base;
398 ctxt.ds_arbytes = seg.attr.bytes;
400 hvm_get_segment_register(v, x86_seg_es, &seg);
401 ctxt.es_sel = seg.sel;
402 ctxt.es_limit = seg.limit;
403 ctxt.es_base = seg.base;
404 ctxt.es_arbytes = seg.attr.bytes;
406 hvm_get_segment_register(v, x86_seg_ss, &seg);
407 ctxt.ss_sel = seg.sel;
408 ctxt.ss_limit = seg.limit;
409 ctxt.ss_base = seg.base;
410 ctxt.ss_arbytes = seg.attr.bytes;
412 hvm_get_segment_register(v, x86_seg_fs, &seg);
413 ctxt.fs_sel = seg.sel;
414 ctxt.fs_limit = seg.limit;
415 ctxt.fs_base = seg.base;
416 ctxt.fs_arbytes = seg.attr.bytes;
418 hvm_get_segment_register(v, x86_seg_gs, &seg);
419 ctxt.gs_sel = seg.sel;
420 ctxt.gs_limit = seg.limit;
421 ctxt.gs_base = seg.base;
422 ctxt.gs_arbytes = seg.attr.bytes;
424 hvm_get_segment_register(v, x86_seg_tr, &seg);
425 ctxt.tr_sel = seg.sel;
426 ctxt.tr_limit = seg.limit;
427 ctxt.tr_base = seg.base;
428 ctxt.tr_arbytes = seg.attr.bytes;
430 hvm_get_segment_register(v, x86_seg_ldtr, &seg);
431 ctxt.ldtr_sel = seg.sel;
432 ctxt.ldtr_limit = seg.limit;
433 ctxt.ldtr_base = seg.base;
434 ctxt.ldtr_arbytes = seg.attr.bytes;
436 vc = &v->arch.guest_context;
438 if ( v->fpu_initialised )
439 memcpy(ctxt.fpu_regs, &vc->fpu_ctxt, sizeof(ctxt.fpu_regs));
440 else
441 memset(ctxt.fpu_regs, 0, sizeof(ctxt.fpu_regs));
443 ctxt.rax = vc->user_regs.eax;
444 ctxt.rbx = vc->user_regs.ebx;
445 ctxt.rcx = vc->user_regs.ecx;
446 ctxt.rdx = vc->user_regs.edx;
447 ctxt.rbp = vc->user_regs.ebp;
448 ctxt.rsi = vc->user_regs.esi;
449 ctxt.rdi = vc->user_regs.edi;
450 ctxt.rsp = vc->user_regs.esp;
451 ctxt.rip = vc->user_regs.eip;
452 ctxt.rflags = vc->user_regs.eflags;
453 #ifdef __x86_64__
454 ctxt.r8 = vc->user_regs.r8;
455 ctxt.r9 = vc->user_regs.r9;
456 ctxt.r10 = vc->user_regs.r10;
457 ctxt.r11 = vc->user_regs.r11;
458 ctxt.r12 = vc->user_regs.r12;
459 ctxt.r13 = vc->user_regs.r13;
460 ctxt.r14 = vc->user_regs.r14;
461 ctxt.r15 = vc->user_regs.r15;
462 #endif
463 ctxt.dr0 = vc->debugreg[0];
464 ctxt.dr1 = vc->debugreg[1];
465 ctxt.dr2 = vc->debugreg[2];
466 ctxt.dr3 = vc->debugreg[3];
467 ctxt.dr6 = vc->debugreg[6];
468 ctxt.dr7 = vc->debugreg[7];
470 if ( hvm_save_entry(CPU, v->vcpu_id, h, &ctxt) != 0 )
471 return 1;
472 }
473 return 0;
474 }
476 static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
477 {
478 int vcpuid, rc;
479 struct vcpu *v;
480 struct hvm_hw_cpu ctxt;
481 struct segment_register seg;
482 struct vcpu_guest_context *vc;
484 /* Which vcpu is this? */
485 vcpuid = hvm_load_instance(h);
486 if ( vcpuid > MAX_VIRT_CPUS || (v = d->vcpu[vcpuid]) == NULL )
487 {
488 gdprintk(XENLOG_ERR, "HVM restore: domain has no vcpu %u\n", vcpuid);
489 return -EINVAL;
490 }
491 vc = &v->arch.guest_context;
493 /* Need to init this vcpu before loading its contents */
494 domain_lock(d);
495 if ( !v->is_initialised )
496 if ( (rc = boot_vcpu(d, vcpuid, vc)) != 0 )
497 return rc;
498 domain_unlock(d);
500 if ( hvm_load_entry(CPU, h, &ctxt) != 0 )
501 return -EINVAL;
503 /* Sanity check some control registers. */
504 if ( (ctxt.cr0 & HVM_CR0_GUEST_RESERVED_BITS) ||
505 !(ctxt.cr0 & X86_CR0_ET) ||
506 ((ctxt.cr0 & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG) )
507 {
508 gdprintk(XENLOG_ERR, "HVM restore: bad CR0 0x%"PRIx64"\n",
509 ctxt.cr0);
510 return -EINVAL;
511 }
513 if ( ctxt.cr4 & HVM_CR4_GUEST_RESERVED_BITS )
514 {
515 gdprintk(XENLOG_ERR, "HVM restore: bad CR4 0x%"PRIx64"\n",
516 ctxt.cr4);
517 return -EINVAL;
518 }
520 if ( (ctxt.msr_efer & ~(EFER_FFXSE | EFER_LME | EFER_LMA |
521 EFER_NX | EFER_SCE)) ||
522 ((sizeof(long) != 8) && (ctxt.msr_efer & EFER_LME)) ||
523 (!cpu_has_nx && (ctxt.msr_efer & EFER_NX)) ||
524 (!cpu_has_syscall && (ctxt.msr_efer & EFER_SCE)) ||
525 (!cpu_has_ffxsr && (ctxt.msr_efer & EFER_FFXSE)) ||
526 ((ctxt.msr_efer & (EFER_LME|EFER_LMA)) == EFER_LMA) )
527 {
528 gdprintk(XENLOG_ERR, "HVM restore: bad EFER 0x%"PRIx64"\n",
529 ctxt.msr_efer);
530 return -EINVAL;
531 }
533 /* Architecture-specific vmcs/vmcb bits */
534 if ( hvm_funcs.load_cpu_ctxt(v, &ctxt) < 0 )
535 return -EINVAL;
537 seg.limit = ctxt.idtr_limit;
538 seg.base = ctxt.idtr_base;
539 hvm_set_segment_register(v, x86_seg_idtr, &seg);
541 seg.limit = ctxt.gdtr_limit;
542 seg.base = ctxt.gdtr_base;
543 hvm_set_segment_register(v, x86_seg_gdtr, &seg);
545 seg.sel = ctxt.cs_sel;
546 seg.limit = ctxt.cs_limit;
547 seg.base = ctxt.cs_base;
548 seg.attr.bytes = ctxt.cs_arbytes;
549 hvm_set_segment_register(v, x86_seg_cs, &seg);
551 seg.sel = ctxt.ds_sel;
552 seg.limit = ctxt.ds_limit;
553 seg.base = ctxt.ds_base;
554 seg.attr.bytes = ctxt.ds_arbytes;
555 hvm_set_segment_register(v, x86_seg_ds, &seg);
557 seg.sel = ctxt.es_sel;
558 seg.limit = ctxt.es_limit;
559 seg.base = ctxt.es_base;
560 seg.attr.bytes = ctxt.es_arbytes;
561 hvm_set_segment_register(v, x86_seg_es, &seg);
563 seg.sel = ctxt.ss_sel;
564 seg.limit = ctxt.ss_limit;
565 seg.base = ctxt.ss_base;
566 seg.attr.bytes = ctxt.ss_arbytes;
567 hvm_set_segment_register(v, x86_seg_ss, &seg);
569 seg.sel = ctxt.fs_sel;
570 seg.limit = ctxt.fs_limit;
571 seg.base = ctxt.fs_base;
572 seg.attr.bytes = ctxt.fs_arbytes;
573 hvm_set_segment_register(v, x86_seg_fs, &seg);
575 seg.sel = ctxt.gs_sel;
576 seg.limit = ctxt.gs_limit;
577 seg.base = ctxt.gs_base;
578 seg.attr.bytes = ctxt.gs_arbytes;
579 hvm_set_segment_register(v, x86_seg_gs, &seg);
581 seg.sel = ctxt.tr_sel;
582 seg.limit = ctxt.tr_limit;
583 seg.base = ctxt.tr_base;
584 seg.attr.bytes = ctxt.tr_arbytes;
585 hvm_set_segment_register(v, x86_seg_tr, &seg);
587 seg.sel = ctxt.ldtr_sel;
588 seg.limit = ctxt.ldtr_limit;
589 seg.base = ctxt.ldtr_base;
590 seg.attr.bytes = ctxt.ldtr_arbytes;
591 hvm_set_segment_register(v, x86_seg_ldtr, &seg);
593 memcpy(&vc->fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs));
595 vc->user_regs.eax = ctxt.rax;
596 vc->user_regs.ebx = ctxt.rbx;
597 vc->user_regs.ecx = ctxt.rcx;
598 vc->user_regs.edx = ctxt.rdx;
599 vc->user_regs.ebp = ctxt.rbp;
600 vc->user_regs.esi = ctxt.rsi;
601 vc->user_regs.edi = ctxt.rdi;
602 vc->user_regs.esp = ctxt.rsp;
603 vc->user_regs.eip = ctxt.rip;
604 vc->user_regs.eflags = ctxt.rflags | 2;
605 #ifdef __x86_64__
606 vc->user_regs.r8 = ctxt.r8;
607 vc->user_regs.r9 = ctxt.r9;
608 vc->user_regs.r10 = ctxt.r10;
609 vc->user_regs.r11 = ctxt.r11;
610 vc->user_regs.r12 = ctxt.r12;
611 vc->user_regs.r13 = ctxt.r13;
612 vc->user_regs.r14 = ctxt.r14;
613 vc->user_regs.r15 = ctxt.r15;
614 #endif
615 vc->debugreg[0] = ctxt.dr0;
616 vc->debugreg[1] = ctxt.dr1;
617 vc->debugreg[2] = ctxt.dr2;
618 vc->debugreg[3] = ctxt.dr3;
619 vc->debugreg[6] = ctxt.dr6;
620 vc->debugreg[7] = ctxt.dr7;
622 vc->flags = VGCF_online;
623 v->fpu_initialised = 1;
625 /* Auxiliary processors should be woken immediately. */
626 if ( test_and_clear_bit(_VPF_down, &v->pause_flags) )
627 vcpu_wake(v);
629 return 0;
630 }
632 HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt,
633 1, HVMSR_PER_VCPU);
635 int hvm_vcpu_initialise(struct vcpu *v)
636 {
637 int rc;
639 if ( (rc = vlapic_init(v)) != 0 )
640 goto fail1;
642 if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 )
643 goto fail2;
645 /* Create ioreq event channel. */
646 rc = alloc_unbound_xen_event_channel(v, 0);
647 if ( rc < 0 )
648 goto fail3;
650 /* Register ioreq event channel. */
651 v->arch.hvm_vcpu.xen_port = rc;
652 spin_lock(&v->domain->arch.hvm_domain.ioreq.lock);
653 if ( v->domain->arch.hvm_domain.ioreq.va != NULL )
654 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
655 spin_unlock(&v->domain->arch.hvm_domain.ioreq.lock);
657 spin_lock_init(&v->arch.hvm_vcpu.tm_lock);
658 INIT_LIST_HEAD(&v->arch.hvm_vcpu.tm_list);
660 rc = hvm_vcpu_cacheattr_init(v);
661 if ( rc != 0 )
662 goto fail3;
664 v->arch.guest_context.user_regs.eflags = 2;
666 if ( v->vcpu_id == 0 )
667 {
668 /* NB. All these really belong in hvm_domain_initialise(). */
669 pit_init(v, cpu_khz);
670 pmtimer_init(v);
671 hpet_init(v);
673 /* Init guest TSC to start from zero. */
674 hvm_set_guest_tsc(v, 0);
676 /* Can start up without SIPI-SIPI or setvcpucontext domctl. */
677 v->is_initialised = 1;
678 clear_bit(_VPF_down, &v->pause_flags);
679 }
681 return 0;
683 fail3:
684 hvm_funcs.vcpu_destroy(v);
685 fail2:
686 vlapic_destroy(v);
687 fail1:
688 return rc;
689 }
691 void hvm_vcpu_destroy(struct vcpu *v)
692 {
693 hvm_vcpu_cacheattr_destroy(v);
694 vlapic_destroy(v);
695 hvm_funcs.vcpu_destroy(v);
697 /* Event channel is already freed by evtchn_destroy(). */
698 /*free_xen_event_channel(v, v->arch.hvm_vcpu.xen_port);*/
699 }
701 void hvm_vcpu_down(struct vcpu *v)
702 {
703 struct domain *d = v->domain;
704 int online_count = 0;
706 /* Doesn't halt us immediately, but we'll never return to guest context. */
707 set_bit(_VPF_down, &v->pause_flags);
708 vcpu_sleep_nosync(v);
710 /* Any other VCPUs online? ... */
711 domain_lock(d);
712 for_each_vcpu ( d, v )
713 if ( !test_bit(_VPF_down, &v->pause_flags) )
714 online_count++;
715 domain_unlock(d);
717 /* ... Shut down the domain if not. */
718 if ( online_count == 0 )
719 {
720 gdprintk(XENLOG_INFO, "All CPUs offline -- powering off.\n");
721 domain_shutdown(d, SHUTDOWN_poweroff);
722 }
723 }
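/*
 * hvm_send_assist_req() forwards an I/O request to the external device
 * model: the shared ioreq slot moves STATE_IOREQ_NONE -> STATE_IOREQ_READY
 * here and the device model is notified via the vcpu's event channel;
 * hvm_do_resume() above later blocks until the slot returns to
 * STATE_IOREQ_NONE (via STATE_IORESP_READY and hvm_io_assist()).
 */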
725 void hvm_send_assist_req(struct vcpu *v)
726 {
727 ioreq_t *p;
729 if ( unlikely(!vcpu_start_shutdown_deferral(v)) )
730 return; /* implicitly bins the i/o operation */
732 p = &get_ioreq(v)->vp_ioreq;
733 if ( unlikely(p->state != STATE_IOREQ_NONE) )
734 {
735 /* This indicates a bug in the device model. Crash the domain. */
736 gdprintk(XENLOG_ERR, "Device model set bad IO state %d.\n", p->state);
737 domain_crash(v->domain);
738 return;
739 }
741 prepare_wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port);
743 /*
744 * Following happens /after/ blocking and setting up ioreq contents.
745 * prepare_wait_on_xen_event_channel() is an implicit barrier.
746 */
747 p->state = STATE_IOREQ_READY;
748 notify_via_xen_event_channel(v->arch.hvm_vcpu.xen_port);
749 }
751 void hvm_hlt(unsigned long rflags)
752 {
753 struct vcpu *curr = current;
755 if ( hvm_event_pending(curr) )
756 return;
758 /*
759 * If we halt with interrupts disabled, that's a pretty sure sign that we
760 * want to shut down. In a real processor, NMIs are the only way to break
761 * out of this.
762 */
763 if ( unlikely(!(rflags & X86_EFLAGS_IF)) )
764 return hvm_vcpu_down(curr);
766 do_sched_op_compat(SCHEDOP_block, 0);
768 HVMTRACE_1D(HLT, curr, /* pending = */ vcpu_runnable(curr));
769 }
771 void hvm_triple_fault(void)
772 {
773 struct vcpu *v = current;
774 gdprintk(XENLOG_INFO, "Triple fault on VCPU%d - "
775 "invoking HVM system reset.\n", v->vcpu_id);
776 domain_shutdown(v->domain, SHUTDOWN_reboot);
777 }
779 int hvm_set_efer(uint64_t value)
780 {
781 struct vcpu *v = current;
783 value &= ~EFER_LMA;
785 if ( (value & ~(EFER_FFXSE | EFER_LME | EFER_NX | EFER_SCE)) ||
786 ((sizeof(long) != 8) && (value & EFER_LME)) ||
787 (!cpu_has_nx && (value & EFER_NX)) ||
788 (!cpu_has_syscall && (value & EFER_SCE)) ||
789 (!cpu_has_ffxsr && (value & EFER_FFXSE)) )
790 {
791 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
792 "EFER: %"PRIx64"\n", value);
793 hvm_inject_exception(TRAP_gp_fault, 0, 0);
794 return X86EMUL_EXCEPTION;
795 }
797 if ( ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_LME) &&
798 hvm_paging_enabled(v) )
799 {
800 gdprintk(XENLOG_WARNING,
801 "Trying to change EFER.LME with paging enabled\n");
802 hvm_inject_exception(TRAP_gp_fault, 0, 0);
803 return X86EMUL_EXCEPTION;
804 }
806 value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA;
807 v->arch.hvm_vcpu.guest_efer = value;
808 hvm_update_guest_efer(v);
810 return X86EMUL_OKAY;
811 }
813 extern void shadow_blow_tables_per_domain(struct domain *d);
814 extern bool_t mtrr_pat_not_equal(struct vcpu *vd, struct vcpu *vs);
816 /* Exit UC mode only if all VCPUs agree on MTRR/PAT and are not in no_fill. */
817 static bool_t domain_exit_uc_mode(struct vcpu *v)
818 {
819 struct domain *d = v->domain;
820 struct vcpu *vs;
822 for_each_vcpu ( d, vs )
823 {
824 if ( (vs == v) || !vs->is_initialised )
825 continue;
826 if ( (vs->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) ||
827 mtrr_pat_not_equal(vs, v) )
828 return 0;
829 }
831 return 1;
832 }
834 static void local_flush_cache(void *info)
835 {
836 wbinvd();
837 }
839 int hvm_set_cr0(unsigned long value)
840 {
841 struct vcpu *v = current;
842 p2m_type_t p2mt;
843 unsigned long gfn, mfn, old_value = v->arch.hvm_vcpu.guest_cr[0];
845 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);
847 if ( (u32)value != value )
848 {
849 HVM_DBG_LOG(DBG_LEVEL_1,
850 "Guest attempts to set upper 32 bits in CR0: %lx",
851 value);
852 goto gpf;
853 }
855 value &= ~HVM_CR0_GUEST_RESERVED_BITS;
857 /* ET is reserved and should always be 1. */
858 value |= X86_CR0_ET;
860 if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PG )
861 goto gpf;
863 if ( (value & X86_CR0_PG) && !(old_value & X86_CR0_PG) )
864 {
865 if ( v->arch.hvm_vcpu.guest_efer & EFER_LME )
866 {
867 if ( !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PAE) )
868 {
869 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable");
870 goto gpf;
871 }
872 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode");
873 v->arch.hvm_vcpu.guest_efer |= EFER_LMA;
874 hvm_update_guest_efer(v);
875 }
877 if ( !paging_mode_hap(v->domain) )
878 {
879 /* The guest CR3 must be pointing to the guest physical. */
880 gfn = v->arch.hvm_vcpu.guest_cr[3]>>PAGE_SHIFT;
881 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
882 if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
883 !get_page(mfn_to_page(mfn), v->domain))
884 {
885 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
886 v->arch.hvm_vcpu.guest_cr[3], mfn);
887 domain_crash(v->domain);
888 return X86EMUL_UNHANDLEABLE;
889 }
891 /* Now arch.guest_table points to machine physical. */
892 v->arch.guest_table = pagetable_from_pfn(mfn);
894 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
895 v->arch.hvm_vcpu.guest_cr[3], mfn);
896 }
897 }
898 else if ( !(value & X86_CR0_PG) && (old_value & X86_CR0_PG) )
899 {
900 /* When CR0.PG is cleared, LMA is cleared immediately. */
901 if ( hvm_long_mode_enabled(v) )
902 {
903 v->arch.hvm_vcpu.guest_efer &= ~EFER_LMA;
904 hvm_update_guest_efer(v);
905 }
907 if ( !paging_mode_hap(v->domain) )
908 {
909 put_page(pagetable_get_page(v->arch.guest_table));
910 v->arch.guest_table = pagetable_null();
911 }
912 }
914 if ( !list_empty(&domain_hvm_iommu(v->domain)->pdev_list) )
915 {
916 if ( (value & X86_CR0_CD) && !(value & X86_CR0_NW) )
917 {
918 /* Entering no fill cache mode. */
919 spin_lock(&v->domain->arch.hvm_domain.uc_lock);
920 v->arch.hvm_vcpu.cache_mode = NO_FILL_CACHE_MODE;
922 if ( !v->domain->arch.hvm_domain.is_in_uc_mode )
923 {
924 /* Flush physical caches. */
925 on_each_cpu(local_flush_cache, NULL, 1, 1);
926 /* Shadow pagetables must recognise UC mode. */
927 v->domain->arch.hvm_domain.is_in_uc_mode = 1;
928 shadow_blow_tables_per_domain(v->domain);
929 }
930 spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
931 }
932 else if ( !(value & (X86_CR0_CD | X86_CR0_NW)) &&
933 (v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) )
934 {
935 /* Exit from no fill cache mode. */
936 spin_lock(&v->domain->arch.hvm_domain.uc_lock);
937 v->arch.hvm_vcpu.cache_mode = NORMAL_CACHE_MODE;
939 if ( domain_exit_uc_mode(v) )
940 {
941 /* Shadow pagetables must recognise normal caching mode. */
942 v->domain->arch.hvm_domain.is_in_uc_mode = 0;
943 shadow_blow_tables_per_domain(v->domain);
944 }
945 spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
946 }
947 }
949 v->arch.hvm_vcpu.guest_cr[0] = value;
950 hvm_update_guest_cr(v, 0);
952 if ( (value ^ old_value) & X86_CR0_PG )
953 paging_update_paging_modes(v);
955 return X86EMUL_OKAY;
957 gpf:
958 hvm_inject_exception(TRAP_gp_fault, 0, 0);
959 return X86EMUL_EXCEPTION;
960 }
962 int hvm_set_cr3(unsigned long value)
963 {
964 unsigned long mfn;
965 p2m_type_t p2mt;
966 struct vcpu *v = current;
968 if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
969 (value != v->arch.hvm_vcpu.guest_cr[3]) )
970 {
971 /* Shadow-mode CR3 change. Check PDBR and update refcounts. */
972 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
973 mfn = mfn_x(gfn_to_mfn_current(value >> PAGE_SHIFT, &p2mt));
974 if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
975 !get_page(mfn_to_page(mfn), v->domain) )
976 goto bad_cr3;
978 put_page(pagetable_get_page(v->arch.guest_table));
979 v->arch.guest_table = pagetable_from_pfn(mfn);
981 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
982 }
984 v->arch.hvm_vcpu.guest_cr[3] = value;
985 paging_update_cr3(v);
986 return X86EMUL_OKAY;
988 bad_cr3:
989 gdprintk(XENLOG_ERR, "Invalid CR3\n");
990 domain_crash(v->domain);
991 return X86EMUL_UNHANDLEABLE;
992 }
994 int hvm_set_cr4(unsigned long value)
995 {
996 struct vcpu *v = current;
997 unsigned long old_cr;
999 if ( value & HVM_CR4_GUEST_RESERVED_BITS )
1000 {
1001 HVM_DBG_LOG(DBG_LEVEL_1,
1002 "Guest attempts to set reserved bit in CR4: %lx",
1003 value);
1004 goto gpf;
1005 }
1007 if ( !(value & X86_CR4_PAE) && hvm_long_mode_enabled(v) )
1008 {
1009 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
1010 "EFER.LMA is set");
1011 goto gpf;
1012 }
1014 old_cr = v->arch.hvm_vcpu.guest_cr[4];
1015 v->arch.hvm_vcpu.guest_cr[4] = value;
1016 hvm_update_guest_cr(v, 4);
1018 /* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. */
1019 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
1020 paging_update_paging_modes(v);
1022 return X86EMUL_OKAY;
1024 gpf:
1025 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1026 return X86EMUL_EXCEPTION;
1027 }
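/*
 * hvm_virtual_to_linear_addr() performs the segmentation step of address
 * translation: real mode just adds the segment base; protected/compatibility
 * mode additionally applies type and limit checks (including expand-down
 * data segments); long mode only adds a base for FS/GS and requires the
 * result to be canonical. Returns 1 on success, 0 when the caller should
 * raise #GP.
 */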
1029 int hvm_virtual_to_linear_addr(
1030 enum x86_segment seg,
1031 struct segment_register *reg,
1032 unsigned long offset,
1033 unsigned int bytes,
1034 enum hvm_access_type access_type,
1035 unsigned int addr_size,
1036 unsigned long *linear_addr)
1038 unsigned long addr = offset;
1039 uint32_t last_byte;
1041 if ( !(current->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) )
1043 /*
1044 * REAL MODE: Don't bother with segment access checks.
1045 * Certain of them are not done in native real mode anyway.
1046 */
1047 addr = (uint32_t)(addr + reg->base);
1049 else if ( addr_size != 64 )
1051 /*
1052 * COMPATIBILITY MODE: Apply segment checks and add base.
1053 */
1055 switch ( access_type )
1057 case hvm_access_read:
1058 if ( (reg->attr.fields.type & 0xa) == 0x8 )
1059 goto gpf; /* execute-only code segment */
1060 break;
1061 case hvm_access_write:
1062 if ( (reg->attr.fields.type & 0xa) != 0x2 )
1063 goto gpf; /* not a writable data segment */
1064 break;
1065 default:
1066 break;
1069 last_byte = offset + bytes - 1;
1071 /* Is this a grows-down data segment? Special limit check if so. */
1072 if ( (reg->attr.fields.type & 0xc) == 0x4 )
1074 /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
1075 if ( !reg->attr.fields.db )
1076 last_byte = (uint16_t)last_byte;
1078 /* Check first byte and last byte against respective bounds. */
1079 if ( (offset <= reg->limit) || (last_byte < offset) )
1080 goto gpf;
1082 else if ( (last_byte > reg->limit) || (last_byte < offset) )
1083 goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */
1085 /*
1086 * Hardware truncates to 32 bits in compatibility mode.
1087 * It does not truncate to 16 bits in 16-bit address-size mode.
1088 */
1089 addr = (uint32_t)(addr + reg->base);
1091 else
1093 /*
1094 * LONG MODE: FS and GS add segment base. Addresses must be canonical.
1095 */
1097 if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
1098 addr += reg->base;
1100 if ( !is_canonical_address(addr) )
1101 goto gpf;
1104 *linear_addr = addr;
1105 return 1;
1107 gpf:
1108 return 0;
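/*
 * hvm_map_entry() maps the 8-byte descriptor-table entry at guest virtual
 * address 'va' and returns a pointer into the mapped frame (marking it dirty,
 * since the Accessed bit may be written back); it returns NULL, crashing the
 * domain, if the entry straddles a page boundary or cannot be translated.
 */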
1111 static void *hvm_map_entry(unsigned long va)
1113 unsigned long gfn, mfn;
1114 p2m_type_t p2mt;
1115 uint32_t pfec;
1117 if ( ((va & ~PAGE_MASK) + 8) > PAGE_SIZE )
1119 gdprintk(XENLOG_ERR, "Descriptor table entry "
1120 "straddles page boundary\n");
1121 domain_crash(current->domain);
1122 return NULL;
1125 /* We're mapping on behalf of the segment-load logic, which might
1126 * write the accessed flags in the descriptors (in 32-bit mode), but
1127 * we still treat it as a kernel-mode read (i.e. no access checks). */
1128 pfec = PFEC_page_present;
1129 gfn = paging_gva_to_gfn(current, va, &pfec);
1130 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
1131 if ( !p2m_is_ram(p2mt) )
1133 gdprintk(XENLOG_ERR, "Failed to look up descriptor table entry\n");
1134 domain_crash(current->domain);
1135 return NULL;
1138 ASSERT(mfn_valid(mfn));
1140 paging_mark_dirty(current->domain, mfn);
1142 return (char *)map_domain_page(mfn) + (va & ~PAGE_MASK);
1145 static void hvm_unmap_entry(void *p)
1147 if ( p )
1148 unmap_domain_page(p);
1151 static int hvm_load_segment_selector(
1152 struct vcpu *v, enum x86_segment seg, uint16_t sel)
1154 struct segment_register desctab, cs, segr;
1155 struct desc_struct *pdesc, desc;
1156 u8 dpl, rpl, cpl;
1157 int fault_type = TRAP_invalid_tss;
1159 /* NULL selector? */
1160 if ( (sel & 0xfffc) == 0 )
1162 if ( (seg == x86_seg_cs) || (seg == x86_seg_ss) )
1163 goto fail;
1164 memset(&segr, 0, sizeof(segr));
1165 hvm_set_segment_register(v, seg, &segr);
1166 return 0;
1169 /* LDT descriptor must be in the GDT. */
1170 if ( (seg == x86_seg_ldtr) && (sel & 4) )
1171 goto fail;
1173 hvm_get_segment_register(v, x86_seg_cs, &cs);
1174 hvm_get_segment_register(
1175 v, (sel & 4) ? x86_seg_ldtr : x86_seg_gdtr, &desctab);
1177 /* Check against descriptor table limit. */
1178 if ( ((sel & 0xfff8) + 7) > desctab.limit )
1179 goto fail;
1181 pdesc = hvm_map_entry(desctab.base + (sel & 0xfff8));
1182 if ( pdesc == NULL )
1183 goto hvm_map_fail;
1185 do {
1186 desc = *pdesc;
1188 /* Segment present in memory? */
1189 if ( !(desc.b & (1u<<15)) )
1191 fault_type = TRAP_no_segment;
1192 goto unmap_and_fail;
1195 /* LDT descriptor is a system segment. All others are code/data. */
1196 if ( (desc.b & (1u<<12)) == ((seg == x86_seg_ldtr) << 12) )
1197 goto unmap_and_fail;
1199 dpl = (desc.b >> 13) & 3;
1200 rpl = sel & 3;
1201 cpl = cs.sel & 3;
1203 switch ( seg )
1205 case x86_seg_cs:
1206 /* Code segment? */
1207 if ( !(desc.b & (1u<<11)) )
1208 goto unmap_and_fail;
1209 /* Non-conforming segment: check DPL against RPL. */
1210 if ( ((desc.b & (6u<<9)) != 6) && (dpl != rpl) )
1211 goto unmap_and_fail;
1212 break;
1213 case x86_seg_ss:
1214 /* Writable data segment? */
1215 if ( (desc.b & (5u<<9)) != (1u<<9) )
1216 goto unmap_and_fail;
1217 if ( (dpl != cpl) || (dpl != rpl) )
1218 goto unmap_and_fail;
1219 break;
1220 case x86_seg_ldtr:
1221 /* LDT system segment? */
1222 if ( (desc.b & (15u<<8)) != (2u<<8) )
1223 goto unmap_and_fail;
1224 goto skip_accessed_flag;
1225 default:
1226 /* Readable code or data segment? */
1227 if ( (desc.b & (5u<<9)) == (4u<<9) )
1228 goto unmap_and_fail;
1229 /* Non-conforming segment: check DPL against RPL and CPL. */
1230 if ( ((desc.b & (6u<<9)) != 6) && ((dpl < cpl) || (dpl < rpl)) )
1231 goto unmap_and_fail;
1232 break;
1234 } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
1235 (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );
1237 /* Force the Accessed flag in our local copy. */
1238 desc.b |= 0x100;
1240 skip_accessed_flag:
1241 hvm_unmap_entry(pdesc);
1243 segr.base = (((desc.b << 0) & 0xff000000u) |
1244 ((desc.b << 16) & 0x00ff0000u) |
1245 ((desc.a >> 16) & 0x0000ffffu));
1246 segr.attr.bytes = (((desc.b >> 8) & 0x00ffu) |
1247 ((desc.b >> 12) & 0x0f00u));
1248 segr.limit = (desc.b & 0x000f0000u) | (desc.a & 0x0000ffffu);
1249 if ( segr.attr.fields.g )
1250 segr.limit = (segr.limit << 12) | 0xfffu;
1251 segr.sel = sel;
1252 hvm_set_segment_register(v, seg, &segr);
1254 return 0;
1256 unmap_and_fail:
1257 hvm_unmap_entry(pdesc);
1258 fail:
1259 hvm_inject_exception(fault_type, sel & 0xfffc, 0);
1260 hvm_map_fail:
1261 return 1;
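/*
 * hvm_task_switch() emulates a hardware task switch through a 32-bit TSS:
 * it validates the new TSS descriptor in the GDT, saves the outgoing
 * register state into the old TSS, loads the incoming state (CR3, general
 * registers, segment selectors) from the new one, adjusts the busy bit and
 * NT flag according to the switch reason, and optionally pushes an error
 * code on the new stack.
 */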
1264 void hvm_task_switch(
1265 uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason,
1266 int32_t errcode)
1268 struct vcpu *v = current;
1269 struct cpu_user_regs *regs = guest_cpu_user_regs();
1270 struct segment_register gdt, tr, prev_tr, segr;
1271 struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc;
1272 unsigned long eflags;
1273 int exn_raised, rc;
1274 struct {
1275 u16 back_link,__blh;
1276 u32 esp0;
1277 u16 ss0, _0;
1278 u32 esp1;
1279 u16 ss1, _1;
1280 u32 esp2;
1281 u16 ss2, _2;
1282 u32 cr3, eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi;
1283 u16 es, _3, cs, _4, ss, _5, ds, _6, fs, _7, gs, _8, ldt, _9;
1284 u16 trace, iomap;
1285 } tss = { 0 };
1287 hvm_get_segment_register(v, x86_seg_gdtr, &gdt);
1288 hvm_get_segment_register(v, x86_seg_tr, &prev_tr);
1290 if ( ((tss_sel & 0xfff8) + 7) > gdt.limit )
1292 hvm_inject_exception((taskswitch_reason == TSW_iret) ?
1293 TRAP_invalid_tss : TRAP_gp_fault,
1294 tss_sel & 0xfff8, 0);
1295 goto out;
1298 optss_desc = hvm_map_entry(gdt.base + (prev_tr.sel & 0xfff8));
1299 if ( optss_desc == NULL )
1300 goto out;
1302 nptss_desc = hvm_map_entry(gdt.base + (tss_sel & 0xfff8));
1303 if ( nptss_desc == NULL )
1304 goto out;
1306 tss_desc = *nptss_desc;
1307 tr.sel = tss_sel;
1308 tr.base = (((tss_desc.b << 0) & 0xff000000u) |
1309 ((tss_desc.b << 16) & 0x00ff0000u) |
1310 ((tss_desc.a >> 16) & 0x0000ffffu));
1311 tr.attr.bytes = (((tss_desc.b >> 8) & 0x00ffu) |
1312 ((tss_desc.b >> 12) & 0x0f00u));
1313 tr.limit = (tss_desc.b & 0x000f0000u) | (tss_desc.a & 0x0000ffffu);
1314 if ( tr.attr.fields.g )
1315 tr.limit = (tr.limit << 12) | 0xfffu;
1317 if ( !tr.attr.fields.p )
1319 hvm_inject_exception(TRAP_no_segment, tss_sel & 0xfff8, 0);
1320 goto out;
1323 if ( tr.attr.fields.type != ((taskswitch_reason == TSW_iret) ? 0xb : 0x9) )
1325 hvm_inject_exception(
1326 (taskswitch_reason == TSW_iret) ? TRAP_invalid_tss : TRAP_gp_fault,
1327 tss_sel & 0xfff8, 0);
1328 goto out;
1331 if ( tr.limit < (sizeof(tss)-1) )
1333 hvm_inject_exception(TRAP_invalid_tss, tss_sel & 0xfff8, 0);
1334 goto out;
1337 rc = hvm_copy_from_guest_virt(
1338 &tss, prev_tr.base, sizeof(tss), PFEC_page_present);
1339 if ( rc == HVMCOPY_bad_gva_to_gfn )
1340 goto out;
1342 eflags = regs->eflags;
1343 if ( taskswitch_reason == TSW_iret )
1344 eflags &= ~X86_EFLAGS_NT;
1346 tss.cr3 = v->arch.hvm_vcpu.guest_cr[3];
1347 tss.eip = regs->eip;
1348 tss.eflags = eflags;
1349 tss.eax = regs->eax;
1350 tss.ecx = regs->ecx;
1351 tss.edx = regs->edx;
1352 tss.ebx = regs->ebx;
1353 tss.esp = regs->esp;
1354 tss.ebp = regs->ebp;
1355 tss.esi = regs->esi;
1356 tss.edi = regs->edi;
1358 hvm_get_segment_register(v, x86_seg_es, &segr);
1359 tss.es = segr.sel;
1360 hvm_get_segment_register(v, x86_seg_cs, &segr);
1361 tss.cs = segr.sel;
1362 hvm_get_segment_register(v, x86_seg_ss, &segr);
1363 tss.ss = segr.sel;
1364 hvm_get_segment_register(v, x86_seg_ds, &segr);
1365 tss.ds = segr.sel;
1366 hvm_get_segment_register(v, x86_seg_fs, &segr);
1367 tss.fs = segr.sel;
1368 hvm_get_segment_register(v, x86_seg_gs, &segr);
1369 tss.gs = segr.sel;
1370 hvm_get_segment_register(v, x86_seg_ldtr, &segr);
1371 tss.ldt = segr.sel;
1373 rc = hvm_copy_to_guest_virt(
1374 prev_tr.base, &tss, sizeof(tss), PFEC_page_present);
1375 if ( rc == HVMCOPY_bad_gva_to_gfn )
1376 goto out;
1378 rc = hvm_copy_from_guest_virt(
1379 &tss, tr.base, sizeof(tss), PFEC_page_present);
1380 if ( rc == HVMCOPY_bad_gva_to_gfn )
1381 goto out;
1383 if ( hvm_set_cr3(tss.cr3) )
1384 goto out;
1386 regs->eip = tss.eip;
1387 regs->eflags = tss.eflags | 2;
1388 regs->eax = tss.eax;
1389 regs->ecx = tss.ecx;
1390 regs->edx = tss.edx;
1391 regs->ebx = tss.ebx;
1392 regs->esp = tss.esp;
1393 regs->ebp = tss.ebp;
1394 regs->esi = tss.esi;
1395 regs->edi = tss.edi;
1397 if ( (taskswitch_reason == TSW_call_or_int) )
1399 regs->eflags |= X86_EFLAGS_NT;
1400 tss.back_link = prev_tr.sel;
1403 exn_raised = 0;
1404 if ( hvm_load_segment_selector(v, x86_seg_es, tss.es) ||
1405 hvm_load_segment_selector(v, x86_seg_cs, tss.cs) ||
1406 hvm_load_segment_selector(v, x86_seg_ss, tss.ss) ||
1407 hvm_load_segment_selector(v, x86_seg_ds, tss.ds) ||
1408 hvm_load_segment_selector(v, x86_seg_fs, tss.fs) ||
1409 hvm_load_segment_selector(v, x86_seg_gs, tss.gs) ||
1410 hvm_load_segment_selector(v, x86_seg_ldtr, tss.ldt) )
1411 exn_raised = 1;
1413 rc = hvm_copy_to_guest_virt(
1414 tr.base, &tss, sizeof(tss), PFEC_page_present);
1415 if ( rc == HVMCOPY_bad_gva_to_gfn )
1416 exn_raised = 1;
1418 if ( (tss.trace & 1) && !exn_raised )
1419 hvm_inject_exception(TRAP_debug, tss_sel & 0xfff8, 0);
1421 tr.attr.fields.type = 0xb; /* busy 32-bit tss */
1422 hvm_set_segment_register(v, x86_seg_tr, &tr);
1424 v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_TS;
1425 hvm_update_guest_cr(v, 0);
1427 if ( (taskswitch_reason == TSW_iret) ||
1428 (taskswitch_reason == TSW_jmp) )
1429 clear_bit(41, optss_desc); /* clear B flag of old task */
1431 if ( taskswitch_reason != TSW_iret )
1432 set_bit(41, nptss_desc); /* set B flag of new task */
1434 if ( errcode >= 0 )
1436 struct segment_register reg;
1437 unsigned long linear_addr;
1438 regs->esp -= 4;
1439 hvm_get_segment_register(current, x86_seg_ss, &reg);
1440 /* Todo: do not ignore access faults here. */
1441 if ( hvm_virtual_to_linear_addr(x86_seg_ss, &reg, regs->esp,
1442 4, hvm_access_write, 32,
1443 &linear_addr) )
1444 hvm_copy_to_guest_virt_nofault(linear_addr, &errcode, 4, 0);
1447 out:
1448 hvm_unmap_entry(optss_desc);
1449 hvm_unmap_entry(nptss_desc);
1452 #define HVMCOPY_from_guest (0u<<0)
1453 #define HVMCOPY_to_guest (1u<<0)
1454 #define HVMCOPY_no_fault (0u<<1)
1455 #define HVMCOPY_fault (1u<<1)
1456 #define HVMCOPY_phys (0u<<2)
1457 #define HVMCOPY_virt (1u<<2)
1458 static enum hvm_copy_result __hvm_copy(
1459 void *buf, paddr_t addr, int size, unsigned int flags, uint32_t pfec)
1461 struct vcpu *curr = current;
1462 unsigned long gfn, mfn;
1463 p2m_type_t p2mt;
1464 char *p;
1465 int count, todo = size;
1467 while ( todo > 0 )
1469 count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo);
1471 if ( flags & HVMCOPY_virt )
1473 gfn = paging_gva_to_gfn(curr, addr, &pfec);
1474 if ( gfn == INVALID_GFN )
1476 if ( flags & HVMCOPY_fault )
1477 hvm_inject_exception(TRAP_page_fault, pfec, addr);
1478 return HVMCOPY_bad_gva_to_gfn;
1481 else
1483 gfn = addr >> PAGE_SHIFT;
1486 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
1488 if ( !p2m_is_ram(p2mt) )
1489 return HVMCOPY_bad_gfn_to_mfn;
1490 ASSERT(mfn_valid(mfn));
1492 p = (char *)map_domain_page(mfn) + (addr & ~PAGE_MASK);
1494 if ( flags & HVMCOPY_to_guest )
1496 memcpy(p, buf, count);
1497 paging_mark_dirty(curr->domain, mfn);
1499 else
1501 memcpy(buf, p, count);
1504 unmap_domain_page(p);
1506 addr += count;
1507 buf += count;
1508 todo -= count;
1511 return HVMCOPY_okay;
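/*
 * The wrappers below select the __hvm_copy() flags: *_phys variants take a
 * guest physical address, *_virt variants translate a guest virtual address
 * first, the *_nofault variants suppress #PF injection on translation
 * failure, and the fetch_* variants add PFEC_insn_fetch when NX is enabled
 * so instruction fetches are checked correctly.
 */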
1514 enum hvm_copy_result hvm_copy_to_guest_phys(
1515 paddr_t paddr, void *buf, int size)
1517 return __hvm_copy(buf, paddr, size,
1518 HVMCOPY_to_guest | HVMCOPY_fault | HVMCOPY_phys,
1519 0);
1522 enum hvm_copy_result hvm_copy_from_guest_phys(
1523 void *buf, paddr_t paddr, int size)
1525 return __hvm_copy(buf, paddr, size,
1526 HVMCOPY_from_guest | HVMCOPY_fault | HVMCOPY_phys,
1527 0);
1530 enum hvm_copy_result hvm_copy_to_guest_virt(
1531 unsigned long vaddr, void *buf, int size, uint32_t pfec)
1533 return __hvm_copy(buf, vaddr, size,
1534 HVMCOPY_to_guest | HVMCOPY_fault | HVMCOPY_virt,
1535 PFEC_page_present | PFEC_write_access | pfec);
1538 enum hvm_copy_result hvm_copy_from_guest_virt(
1539 void *buf, unsigned long vaddr, int size, uint32_t pfec)
1541 return __hvm_copy(buf, vaddr, size,
1542 HVMCOPY_from_guest | HVMCOPY_fault | HVMCOPY_virt,
1543 PFEC_page_present | pfec);
1546 enum hvm_copy_result hvm_fetch_from_guest_virt(
1547 void *buf, unsigned long vaddr, int size, uint32_t pfec)
1549 if ( hvm_nx_enabled(current) )
1550 pfec |= PFEC_insn_fetch;
1551 return __hvm_copy(buf, vaddr, size,
1552 HVMCOPY_from_guest | HVMCOPY_fault | HVMCOPY_virt,
1553 PFEC_page_present | pfec);
1556 enum hvm_copy_result hvm_copy_to_guest_virt_nofault(
1557 unsigned long vaddr, void *buf, int size, uint32_t pfec)
1559 return __hvm_copy(buf, vaddr, size,
1560 HVMCOPY_to_guest | HVMCOPY_no_fault | HVMCOPY_virt,
1561 PFEC_page_present | PFEC_write_access | pfec);
1564 enum hvm_copy_result hvm_copy_from_guest_virt_nofault(
1565 void *buf, unsigned long vaddr, int size, uint32_t pfec)
1567 return __hvm_copy(buf, vaddr, size,
1568 HVMCOPY_from_guest | HVMCOPY_no_fault | HVMCOPY_virt,
1569 PFEC_page_present | pfec);
1572 enum hvm_copy_result hvm_fetch_from_guest_virt_nofault(
1573 void *buf, unsigned long vaddr, int size, uint32_t pfec)
1575 if ( hvm_nx_enabled(current) )
1576 pfec |= PFEC_insn_fetch;
1577 return __hvm_copy(buf, vaddr, size,
1578 HVMCOPY_from_guest | HVMCOPY_no_fault | HVMCOPY_virt,
1579 PFEC_page_present | pfec);
1582 #ifdef __x86_64__
1583 DEFINE_PER_CPU(bool_t, hvm_64bit_hcall);
1584 #endif
1586 unsigned long copy_to_user_hvm(void *to, const void *from, unsigned int len)
1588 int rc;
1590 #ifdef __x86_64__
1591 if ( !this_cpu(hvm_64bit_hcall) && is_compat_arg_xlat_range(to, len) )
1593 memcpy(to, from, len);
1594 return 0;
1596 #endif
1598 rc = hvm_copy_to_guest_virt_nofault((unsigned long)to, (void *)from,
1599 len, 0);
1600 return rc ? len : 0; /* fake a copy_to_user() return code */
1603 unsigned long copy_from_user_hvm(void *to, const void *from, unsigned len)
1605 int rc;
1607 #ifdef __x86_64__
1608 if ( !this_cpu(hvm_64bit_hcall) && is_compat_arg_xlat_range(from, len) )
1610 memcpy(to, from, len);
1611 return 0;
1613 #endif
1615 rc = hvm_copy_from_guest_virt_nofault(to, (unsigned long)from, len, 0);
1616 return rc ? len : 0; /* fake a copy_from_user() return code */
1619 #define bitmaskof(idx) (1U << ((idx) & 31))
1620 void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
1621 unsigned int *ecx, unsigned int *edx)
1623 struct vcpu *v = current;
1625 if ( cpuid_hypervisor_leaves(input, eax, ebx, ecx, edx) )
1626 return;
1628 domain_cpuid(v->domain, input, *ecx, eax, ebx, ecx, edx);
1630 if ( input == 0x00000001 )
1632 /* Fix up VLAPIC details. */
1633 *ebx &= 0x00FFFFFFu;
1634 *ebx |= (v->vcpu_id * 2) << 24;
1635 if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
1636 __clear_bit(X86_FEATURE_APIC & 31, edx);
1640 int hvm_msr_read_intercept(struct cpu_user_regs *regs)
1642 uint32_t ecx = regs->ecx;
1643 uint64_t msr_content = 0;
1644 struct vcpu *v = current;
1645 uint64_t *var_range_base, *fixed_range_base;
1646 int index, mtrr;
1647 uint32_t cpuid[4];
1649 var_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.var_ranges;
1650 fixed_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.fixed_ranges;
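    /*
     * fixed_range_base[] holds the eleven fixed-range MTRR MSRs in
     * architectural order: [0] = MTRRfix64K_00000, [1..2] =
     * MTRRfix16K_{80000,A0000}, [3..10] = MTRRfix4K_C0000..F8000 -- hence
     * the '+ 1' and '+ 3' index adjustments in the cases below.
     */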
1652 hvm_cpuid(1, &cpuid[0], &cpuid[1], &cpuid[2], &cpuid[3]);
1653 mtrr = !!(cpuid[3] & bitmaskof(X86_FEATURE_MTRR));
1655 switch ( ecx )
1657 case MSR_IA32_TSC:
1658 msr_content = hvm_get_guest_tsc(v);
1659 break;
1661 case MSR_IA32_APICBASE:
1662 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
1663 break;
1665 case MSR_IA32_MCG_CAP:
1666 case MSR_IA32_MCG_STATUS:
1667 case MSR_IA32_MC0_STATUS:
1668 case MSR_IA32_MC1_STATUS:
1669 case MSR_IA32_MC2_STATUS:
1670 case MSR_IA32_MC3_STATUS:
1671 case MSR_IA32_MC4_STATUS:
1672 case MSR_IA32_MC5_STATUS:
1673 /* No point in letting the guest see real MCEs */
1674 msr_content = 0;
1675 break;
1677 case MSR_IA32_CR_PAT:
1678 msr_content = v->arch.hvm_vcpu.pat_cr;
1679 break;
1681 case MSR_MTRRcap:
1682 if ( !mtrr )
1683 goto gp_fault;
1684 msr_content = v->arch.hvm_vcpu.mtrr.mtrr_cap;
1685 break;
1686 case MSR_MTRRdefType:
1687 if ( !mtrr )
1688 goto gp_fault;
1689 msr_content = v->arch.hvm_vcpu.mtrr.def_type
1690 | (v->arch.hvm_vcpu.mtrr.enabled << 10);
1691 break;
1692 case MSR_MTRRfix64K_00000:
1693 if ( !mtrr )
1694 goto gp_fault;
1695 msr_content = fixed_range_base[0];
1696 break;
1697 case MSR_MTRRfix16K_80000:
1698 case MSR_MTRRfix16K_A0000:
1699 if ( !mtrr )
1700 goto gp_fault;
1701 index = regs->ecx - MSR_MTRRfix16K_80000;
1702 msr_content = fixed_range_base[index + 1];
1703 break;
1704 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
1705 if ( !mtrr )
1706 goto gp_fault;
1707 index = regs->ecx - MSR_MTRRfix4K_C0000;
1708 msr_content = fixed_range_base[index + 3];
1709 break;
1710 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
1711 if ( !mtrr )
1712 goto gp_fault;
1713 index = regs->ecx - MSR_IA32_MTRR_PHYSBASE0;
1714 msr_content = var_range_base[index];
1715 break;
1717 default:
1718 return hvm_funcs.msr_read_intercept(regs);
1721 regs->eax = (uint32_t)msr_content;
1722 regs->edx = (uint32_t)(msr_content >> 32);
1723 return X86EMUL_OKAY;
1725 gp_fault:
1726 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1727 return X86EMUL_EXCEPTION;
1730 int hvm_msr_write_intercept(struct cpu_user_regs *regs)
1732 extern bool_t mtrr_var_range_msr_set(
1733 struct mtrr_state *v, u32 msr, u64 msr_content);
1734 extern bool_t mtrr_fix_range_msr_set(
1735 struct mtrr_state *v, int row, u64 msr_content);
1736 extern bool_t mtrr_def_type_msr_set(struct mtrr_state *v, u64 msr_content);
1737 extern bool_t pat_msr_set(u64 *pat, u64 msr);
1739 uint32_t ecx = regs->ecx;
1740 uint64_t msr_content = (uint32_t)regs->eax | ((uint64_t)regs->edx << 32);
1741 struct vcpu *v = current;
1742 int index, mtrr;
1743 uint32_t cpuid[4];
1745 hvm_cpuid(1, &cpuid[0], &cpuid[1], &cpuid[2], &cpuid[3]);
1746 mtrr = !!(cpuid[3] & bitmaskof(X86_FEATURE_MTRR));
1748 switch ( ecx )
1750 case MSR_IA32_TSC:
1751 hvm_set_guest_tsc(v, msr_content);
1752 pt_reset(v);
1753 break;
1755 case MSR_IA32_APICBASE:
1756 vlapic_msr_set(vcpu_vlapic(v), msr_content);
1757 break;
1759 case MSR_IA32_CR_PAT:
1760 if ( !pat_msr_set(&v->arch.hvm_vcpu.pat_cr, msr_content) )
1761 goto gp_fault;
1762 break;
1764 case MSR_MTRRcap:
1765 if ( !mtrr )
1766 goto gp_fault;
1767 goto gp_fault;
1768 case MSR_MTRRdefType:
1769 if ( !mtrr )
1770 goto gp_fault;
1771 if ( !mtrr_def_type_msr_set(&v->arch.hvm_vcpu.mtrr, msr_content) )
1772 goto gp_fault;
1773 break;
1774 case MSR_MTRRfix64K_00000:
1775 if ( !mtrr )
1776 goto gp_fault;
1777 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr, 0, msr_content) )
1778 goto gp_fault;
1779 break;
1780 case MSR_MTRRfix16K_80000:
1781 case MSR_MTRRfix16K_A0000:
1782 if ( !mtrr )
1783 goto gp_fault;
1784 index = regs->ecx - MSR_MTRRfix16K_80000 + 1;
1785 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
1786 index, msr_content) )
1787 goto gp_fault;
1788 break;
1789 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
1790 if ( !mtrr )
1791 goto gp_fault;
1792 index = regs->ecx - MSR_MTRRfix4K_C0000 + 3;
1793 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
1794 index, msr_content) )
1795 goto gp_fault;
1796 break;
1797 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
1798 if ( !mtrr )
1799 goto gp_fault;
1800 if ( !mtrr_var_range_msr_set(&v->arch.hvm_vcpu.mtrr,
1801 regs->ecx, msr_content) )
1802 goto gp_fault;
1803 break;
1805 default:
1806 return hvm_funcs.msr_write_intercept(regs);
1809 return X86EMUL_OKAY;
1811 gp_fault:
1812 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1813 return X86EMUL_EXCEPTION;
1816 enum hvm_intblk hvm_interrupt_blocked(struct vcpu *v, struct hvm_intack intack)
1818 unsigned long intr_shadow;
1820 ASSERT(v == current);
1822 if ( (intack.source != hvm_intsrc_nmi) &&
1823 !(guest_cpu_user_regs()->eflags & X86_EFLAGS_IF) )
1824 return hvm_intblk_rflags_ie;
1826 intr_shadow = hvm_funcs.get_interrupt_shadow(v);
1828 if ( intr_shadow & (HVM_INTR_SHADOW_STI|HVM_INTR_SHADOW_MOV_SS) )
1829 return hvm_intblk_shadow;
1831 if ( intack.source == hvm_intsrc_nmi )
1832 return ((intr_shadow & HVM_INTR_SHADOW_NMI) ?
1833 hvm_intblk_nmi_iret : hvm_intblk_none);
1835 if ( intack.source == hvm_intsrc_lapic )
1837 uint32_t tpr = vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xF0;
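        /* Delivery is blocked unless the vector's priority class (bits 7:4)
         * strictly exceeds the processor-priority class held in TPR[7:4]. */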
1838 if ( (tpr >> 4) >= (intack.vector >> 4) )
1839 return hvm_intblk_tpr;
1842 return hvm_intblk_none;
1845 static long hvm_grant_table_op(
1846 unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int count)
1848 if ( (cmd != GNTTABOP_query_size) && (cmd != GNTTABOP_setup_table) )
1849 return -ENOSYS; /* all other commands need auditing */
1850 return do_grant_table_op(cmd, uop, count);
1853 static long hvm_memory_op(int cmd, XEN_GUEST_HANDLE(void) arg)
1855 long rc = do_memory_op(cmd, arg);
1856 if ( (cmd & MEMOP_CMD_MASK) == XENMEM_decrease_reservation )
1857 current->domain->arch.hvm_domain.qemu_mapcache_invalidate = 1;
1858 return rc;
1861 typedef unsigned long hvm_hypercall_t(
1862 unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
1864 #define HYPERCALL(x) \
1865 [ __HYPERVISOR_ ## x ] = (hvm_hypercall_t *) do_ ## x
1867 #if defined(__i386__)
1869 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
1870 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op,
1871 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
1872 HYPERCALL(xen_version),
1873 HYPERCALL(event_channel_op),
1874 HYPERCALL(sched_op),
1875 HYPERCALL(hvm_op)
1876 };
1878 #else /* defined(__x86_64__) */
1880 static long hvm_memory_op_compat32(int cmd, XEN_GUEST_HANDLE(void) arg)
1882 long rc = compat_memory_op(cmd, arg);
1883 if ( (cmd & MEMOP_CMD_MASK) == XENMEM_decrease_reservation )
1884 current->domain->arch.hvm_domain.qemu_mapcache_invalidate = 1;
1885 return rc;
1888 static hvm_hypercall_t *hvm_hypercall64_table[NR_hypercalls] = {
1889 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op,
1890 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
1891 HYPERCALL(xen_version),
1892 HYPERCALL(event_channel_op),
1893 HYPERCALL(sched_op),
1894 HYPERCALL(hvm_op)
1895 };
1897 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
1898 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op_compat32,
1899 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
1900 HYPERCALL(xen_version),
1901 HYPERCALL(event_channel_op),
1902 HYPERCALL(sched_op),
1903 HYPERCALL(hvm_op)
1904 };
1906 #endif /* defined(__x86_64__) */
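/*
 * hvm_do_hypercall() dispatches guest hypercalls. Calls made from guest user
 * mode (SS.DPL == 3) fall into the block that doubles as the switch's
 * 'default:' arm and are rejected with -EPERM. On an x86_64 hypervisor,
 * 64-bit guests use hvm_hypercall64_table with arguments in
 * rdi/rsi/rdx/r10/r8; 32-bit guests use hvm_hypercall32_table with
 * ebx/ecx/edx/esi/edi.
 */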
1908 int hvm_do_hypercall(struct cpu_user_regs *regs)
1910 struct vcpu *curr = current;
1911 struct segment_register sreg;
1912 int mode = hvm_guest_x86_mode(curr);
1913 uint32_t eax = regs->eax;
1915 switch ( mode )
1917 #ifdef __x86_64__
1918 case 8:
1919 #endif
1920 case 4:
1921 case 2:
1922 hvm_get_segment_register(curr, x86_seg_ss, &sreg);
1923 if ( unlikely(sreg.attr.fields.dpl == 3) )
1925 default:
1926 regs->eax = -EPERM;
1927 return HVM_HCALL_completed;
1929 case 0:
1930 break;
1933 if ( (eax >= NR_hypercalls) || !hvm_hypercall32_table[eax] )
1935 regs->eax = -ENOSYS;
1936 return HVM_HCALL_completed;
1939 this_cpu(hc_preempted) = 0;
1941 #ifdef __x86_64__
1942 if ( mode == 8 )
1944 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%lx, %lx, %lx, %lx, %lx)", eax,
1945 regs->rdi, regs->rsi, regs->rdx, regs->r10, regs->r8);
1947 this_cpu(hvm_64bit_hcall) = 1;
1948 regs->rax = hvm_hypercall64_table[eax](regs->rdi,
1949 regs->rsi,
1950 regs->rdx,
1951 regs->r10,
1952 regs->r8);
1953 this_cpu(hvm_64bit_hcall) = 0;
1955 else
1956 #endif
1958 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%x, %x, %x, %x, %x)", eax,
1959 (uint32_t)regs->ebx, (uint32_t)regs->ecx,
1960 (uint32_t)regs->edx, (uint32_t)regs->esi,
1961 (uint32_t)regs->edi);
1963 regs->eax = hvm_hypercall32_table[eax]((uint32_t)regs->ebx,
1964 (uint32_t)regs->ecx,
1965 (uint32_t)regs->edx,
1966 (uint32_t)regs->esi,
1967 (uint32_t)regs->edi);
1970 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u -> %lx",
1971 eax, (unsigned long)regs->eax);
1973 if ( this_cpu(hc_preempted) )
1974 return HVM_HCALL_preempted;
1976 if ( unlikely(curr->domain->arch.hvm_domain.qemu_mapcache_invalidate) &&
1977 test_and_clear_bool(curr->domain->arch.hvm_domain.
1978 qemu_mapcache_invalidate) )
1979 return HVM_HCALL_invalidate;
1981 return HVM_HCALL_completed;
1984 static void hvm_latch_shinfo_size(struct domain *d)
1986 /*
1987 * Called from operations which are among the very first executed by
1988 * PV drivers on initialisation or after save/restore. These are sensible
1989 * points at which to sample the execution mode of the guest and latch
1990 * 32- or 64-bit format for shared state.
1991 */
1992 if ( current->domain == d )
1993 d->arch.has_32bit_shinfo = (hvm_guest_x86_mode(current) != 8);
1996 /* Initialise a hypercall transfer page for a VMX domain using
1997 paravirtualised drivers. */
1998 void hvm_hypercall_page_initialise(struct domain *d,
1999 void *hypercall_page)
2001 hvm_latch_shinfo_size(d);
2002 hvm_funcs.init_hypercall_page(d, hypercall_page);
2005 static int hvmop_set_pci_intx_level(
2006 XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t) uop)
2008 struct xen_hvm_set_pci_intx_level op;
2009 struct domain *d;
2010 int rc;
2012 if ( copy_from_guest(&op, uop, 1) )
2013 return -EFAULT;
2015 if ( (op.domain > 0) || (op.bus > 0) || (op.device > 31) || (op.intx > 3) )
2016 return -EINVAL;
2018 d = rcu_lock_domain_by_id(op.domid);
2019 if ( d == NULL )
2020 return -ESRCH;
2022 rc = -EPERM;
2023 if ( !IS_PRIV_FOR(current->domain, d) )
2024 goto out;
2026 rc = -EINVAL;
2027 if ( !is_hvm_domain(d) )
2028 goto out;
2030 rc = xsm_hvm_set_pci_intx_level(d);
2031 if ( rc )
2032 goto out;
2034 rc = 0;
2035 switch ( op.level )
2037 case 0:
2038 hvm_pci_intx_deassert(d, op.device, op.intx);
2039 break;
2040 case 1:
2041 hvm_pci_intx_assert(d, op.device, op.intx);
2042 break;
2043 default:
2044 rc = -EINVAL;
2045 break;
2048 out:
2049 rcu_unlock_domain(d);
2050 return rc;
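/*
 * hvm_vcpu_reset_state() puts a vcpu into a real-mode, reset-like state with
 * CS:IP taken from the arguments: flat 64kB segments, paging and protection
 * disabled, the TSC offset copied from the BSP, and the vcpu marked
 * initialised and runnable.
 */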
void hvm_vcpu_reset_state(struct vcpu *v, uint16_t cs, uint16_t ip)
{
    struct domain *d = current->domain;
    struct vcpu_guest_context *ctxt;
    struct segment_register reg;

    BUG_ON(vcpu_runnable(v));

    domain_lock(d);

    if ( v->is_initialised )
        goto out;

    if ( !paging_mode_hap(d) )
    {
        if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
            put_page(pagetable_get_page(v->arch.guest_table));
        v->arch.guest_table = pagetable_null();
    }

    ctxt = &v->arch.guest_context;
    memset(ctxt, 0, sizeof(*ctxt));
    ctxt->flags = VGCF_online;
    ctxt->user_regs.eflags = 2;
    ctxt->user_regs.edx = 0x00000f00;
    ctxt->user_regs.eip = ip;

    v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_ET;
    hvm_update_guest_cr(v, 0);

    v->arch.hvm_vcpu.guest_cr[2] = 0;
    hvm_update_guest_cr(v, 2);

    v->arch.hvm_vcpu.guest_cr[3] = 0;
    hvm_update_guest_cr(v, 3);

    v->arch.hvm_vcpu.guest_cr[4] = 0;
    hvm_update_guest_cr(v, 4);

    v->arch.hvm_vcpu.guest_efer = 0;
    hvm_update_guest_efer(v);

    reg.sel = cs;
    reg.base = (uint32_t)reg.sel << 4;
    reg.limit = 0xffff;
    reg.attr.bytes = 0x09b;
    hvm_set_segment_register(v, x86_seg_cs, &reg);

    reg.sel = reg.base = 0;
    reg.limit = 0xffff;
    reg.attr.bytes = 0x093;
    hvm_set_segment_register(v, x86_seg_ds, &reg);
    hvm_set_segment_register(v, x86_seg_es, &reg);
    hvm_set_segment_register(v, x86_seg_fs, &reg);
    hvm_set_segment_register(v, x86_seg_gs, &reg);
    hvm_set_segment_register(v, x86_seg_ss, &reg);

    reg.attr.bytes = 0x82; /* LDT */
    hvm_set_segment_register(v, x86_seg_ldtr, &reg);

    reg.attr.bytes = 0x8b; /* 32-bit TSS (busy) */
    hvm_set_segment_register(v, x86_seg_tr, &reg);

    reg.attr.bytes = 0;
    hvm_set_segment_register(v, x86_seg_gdtr, &reg);
    hvm_set_segment_register(v, x86_seg_idtr, &reg);

    /* Sync AP's TSC with BSP's. */
    v->arch.hvm_vcpu.cache_tsc_offset =
        v->domain->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset;
    hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);

    paging_update_paging_modes(v);

    v->arch.flags |= TF_kernel_mode;
    v->is_initialised = 1;
    clear_bit(_VPF_down, &v->pause_flags);

 out:
    domain_unlock(d);
}

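/*
 * ACPI S3 entry: pause the domain, reset every VCPU and the emulated
 * platform devices (PIC, IOAPIC, PIT, RTC, PM timer, HPET), and leave
 * VCPU0 parked at the reset vector until an S3 resume is requested.
 */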
static void hvm_s3_suspend(struct domain *d)
{
    struct vcpu *v;

    domain_pause(d);
    domain_lock(d);

    if ( d->is_dying || (d->vcpu[0] == NULL) ||
         test_and_set_bool(d->arch.hvm_domain.is_s3_suspended) )
    {
        domain_unlock(d);
        domain_unpause(d);
        return;
    }

    for_each_vcpu ( d, v )
    {
        vlapic_reset(vcpu_vlapic(v));
        vcpu_reset(v);
    }

    vpic_reset(d);
    vioapic_reset(d);
    pit_reset(d);
    rtc_reset(d);
    pmtimer_reset(d);
    hpet_reset(d);

    hvm_vcpu_reset_state(d->vcpu[0], 0xf000, 0xfff0);

    domain_unlock(d);
}

static void hvm_s3_resume(struct domain *d)
{
    if ( test_and_clear_bool(d->arch.hvm_domain.is_s3_suspended) )
        domain_unpause(d);
}

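/*
 * HVMOP_set_isa_irq_level: assert or deassert one of the 16 emulated
 * ISA IRQ lines on behalf of the device model.
 */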
static int hvmop_set_isa_irq_level(
    XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t) uop)
{
    struct xen_hvm_set_isa_irq_level op;
    struct domain *d;
    int rc;

    if ( copy_from_guest(&op, uop, 1) )
        return -EFAULT;

    if ( op.isa_irq > 15 )
        return -EINVAL;

    d = rcu_lock_domain_by_id(op.domid);
    if ( d == NULL )
        return -ESRCH;

    rc = -EPERM;
    if ( !IS_PRIV_FOR(current->domain, d) )
        goto out;

    rc = -EINVAL;
    if ( !is_hvm_domain(d) )
        goto out;

    rc = xsm_hvm_set_isa_irq_level(d);
    if ( rc )
        goto out;

    rc = 0;
    switch ( op.level )
    {
    case 0:
        hvm_isa_irq_deassert(d, op.isa_irq);
        break;
    case 1:
        hvm_isa_irq_assert(d, op.isa_irq);
        break;
    default:
        rc = -EINVAL;
        break;
    }

 out:
    rcu_unlock_domain(d);
    return rc;
}

static int hvmop_set_pci_link_route(
    XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t) uop)
{
    struct xen_hvm_set_pci_link_route op;
    struct domain *d;
    int rc;

    if ( copy_from_guest(&op, uop, 1) )
        return -EFAULT;

    if ( (op.link > 3) || (op.isa_irq > 15) )
        return -EINVAL;

    d = rcu_lock_domain_by_id(op.domid);
    if ( d == NULL )
        return -ESRCH;

    rc = -EPERM;
    if ( !IS_PRIV_FOR(current->domain, d) )
        goto out;

    rc = -EINVAL;
    if ( !is_hvm_domain(d) )
        goto out;

    rc = xsm_hvm_set_pci_link_route(d);
    if ( rc )
        goto out;

    rc = 0;
    hvm_set_pci_link_route(d, op.link, op.isa_irq);

 out:
    rcu_unlock_domain(d);
    return rc;
}

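/*
 * HVMOP_flush_tlbs: flush the TLBs of every VCPU in the calling domain.
 * All other VCPUs are paused while paging soft state is refreshed and
 * the dirty TLBs are flushed.
 */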
static int hvmop_flush_tlb_all(void)
{
    struct domain *d = current->domain;
    struct vcpu *v;

    /* Avoid deadlock if more than one vcpu tries this at the same time. */
    if ( !spin_trylock(&d->hypercall_deadlock_mutex) )
        return -EAGAIN;

    /* Pause all other vcpus. */
    for_each_vcpu ( d, v )
        if ( v != current )
            vcpu_pause_nosync(v);

    /* Now that all VCPUs are signalled to deschedule, we wait... */
    for_each_vcpu ( d, v )
        if ( v != current )
            while ( !vcpu_runnable(v) && v->is_running )
                cpu_relax();

    /* All other vcpus are paused, safe to unlock now. */
    spin_unlock(&d->hypercall_deadlock_mutex);

    /* Flush paging-mode soft state (e.g., va->gfn cache; PAE PDPE cache). */
    for_each_vcpu ( d, v )
        paging_update_cr3(v);

    /* Flush all dirty TLBs. */
    flush_tlb_mask(d->domain_dirty_cpumask);

    /* Done. */
    for_each_vcpu ( d, v )
        if ( v != current )
            vcpu_unpause(v);

    return 0;
}

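/* Top-level dispatcher for the HVMOP_* hypercalls. */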
long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
{
    long rc = 0;

    switch ( op )
    {
    case HVMOP_set_param:
    case HVMOP_get_param:
    {
        struct xen_hvm_param a;
        struct hvm_ioreq_page *iorp;
        struct domain *d;
        struct vcpu *v;

        if ( copy_from_guest(&a, arg, 1) )
            return -EFAULT;

        if ( a.index >= HVM_NR_PARAMS )
            return -EINVAL;

        if ( a.domid == DOMID_SELF )
        {
            d = rcu_lock_current_domain();
        }
        else
        {
            if ( (d = rcu_lock_domain_by_id(a.domid)) == NULL )
                return -ESRCH;
            if ( !IS_PRIV_FOR(current->domain, d) )
            {
                rc = -EPERM;
                goto param_fail;
            }
        }

        rc = -EINVAL;
        if ( !is_hvm_domain(d) )
            goto param_fail;

        rc = xsm_hvm_param(d, op);
        if ( rc )
            goto param_fail;

        if ( op == HVMOP_set_param )
        {
            rc = 0;

            switch ( a.index )
            {
            case HVM_PARAM_IOREQ_PFN:
                iorp = &d->arch.hvm_domain.ioreq;
                if ( (rc = hvm_set_ioreq_page(d, iorp, a.value)) != 0 )
                    break;
                spin_lock(&iorp->lock);
                if ( iorp->va != NULL )
                    /* Initialise evtchn port info if VCPUs already created. */
                    for_each_vcpu ( d, v )
                        get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
                spin_unlock(&iorp->lock);
                break;
            case HVM_PARAM_BUFIOREQ_PFN:
                iorp = &d->arch.hvm_domain.buf_ioreq;
                rc = hvm_set_ioreq_page(d, iorp, a.value);
                break;
            case HVM_PARAM_CALLBACK_IRQ:
                hvm_set_callback_via(d, a.value);
                hvm_latch_shinfo_size(d);
                break;
            case HVM_PARAM_TIMER_MODE:
                if ( a.value > HVMPTM_one_missed_tick_pending )
                    rc = -EINVAL;
                break;
            case HVM_PARAM_IDENT_PT:
                rc = -EPERM;
                if ( !IS_PRIV(current->domain) )
                    break;

                rc = -EINVAL;
                if ( d->arch.hvm_domain.params[a.index] != 0 )
                    break;

                rc = 0;
                if ( !paging_mode_hap(d) )
                    break;

                domain_pause(d);

                /*
                 * Update GUEST_CR3 in each VMCS to point at identity map.
                 * All foreign updates to guest state must synchronise on
                 * the domctl_lock.
                 */
                spin_lock(&domctl_lock);
                d->arch.hvm_domain.params[a.index] = a.value;
                for_each_vcpu ( d, v )
                    paging_update_cr3(v);
                spin_unlock(&domctl_lock);

                domain_unpause(d);
                break;
            case HVM_PARAM_DM_DOMAIN:
                /* Privileged domains only, as we must domain_pause(d). */
                rc = -EPERM;
                if ( !IS_PRIV_FOR(current->domain, d) )
                    break;

                if ( a.value == DOMID_SELF )
                    a.value = current->domain->domain_id;

                rc = 0;
                domain_pause(d); /* safe to change per-vcpu xen_port */
                iorp = &d->arch.hvm_domain.ioreq;
                for_each_vcpu ( d, v )
                {
                    int old_port, new_port;
                    new_port = alloc_unbound_xen_event_channel(v, a.value);
                    if ( new_port < 0 )
                    {
                        rc = new_port;
                        break;
                    }
                    /* xchg() ensures that only we free_xen_event_channel() */
                    old_port = xchg(&v->arch.hvm_vcpu.xen_port, new_port);
                    free_xen_event_channel(v, old_port);
                    spin_lock(&iorp->lock);
                    if ( iorp->va != NULL )
                        get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
                    spin_unlock(&iorp->lock);
                }
                domain_unpause(d);
                break;
            case HVM_PARAM_ACPI_S_STATE:
                /* Privileged domains only, as we must domain_pause(d). */
                rc = -EPERM;
                if ( !IS_PRIV_FOR(current->domain, d) )
                    break;

                rc = 0;
                if ( a.value == 3 )
                    hvm_s3_suspend(d);
                else if ( a.value == 0 )
                    hvm_s3_resume(d);
                else
                    rc = -EINVAL;

                break;
            }

            if ( rc == 0 )
                d->arch.hvm_domain.params[a.index] = a.value;
        }
        else
        {
            switch ( a.index )
            {
            case HVM_PARAM_ACPI_S_STATE:
                a.value = d->arch.hvm_domain.is_s3_suspended ? 3 : 0;
                break;
            default:
                a.value = d->arch.hvm_domain.params[a.index];
                break;
            }
            rc = copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
        }

        HVM_DBG_LOG(DBG_LEVEL_HCALL, "%s param %u = %"PRIx64,
                    op == HVMOP_set_param ? "set" : "get",
                    a.index, a.value);

    param_fail:
        rcu_unlock_domain(d);
        break;
    }

    case HVMOP_set_pci_intx_level:
        rc = hvmop_set_pci_intx_level(
            guest_handle_cast(arg, xen_hvm_set_pci_intx_level_t));
        break;

    case HVMOP_set_isa_irq_level:
        rc = hvmop_set_isa_irq_level(
            guest_handle_cast(arg, xen_hvm_set_isa_irq_level_t));
        break;

    case HVMOP_set_pci_link_route:
        rc = hvmop_set_pci_link_route(
            guest_handle_cast(arg, xen_hvm_set_pci_link_route_t));
        break;

    case HVMOP_flush_tlbs:
        rc = guest_handle_is_null(arg) ? hvmop_flush_tlb_all() : -ENOSYS;
        break;

    case HVMOP_track_dirty_vram:
    {
        struct xen_hvm_track_dirty_vram a;
        struct domain *d;

        if ( copy_from_guest(&a, arg, 1) )
            return -EFAULT;

        if ( a.domid == DOMID_SELF )
        {
            d = rcu_lock_current_domain();
        }
        else
        {
            if ( (d = rcu_lock_domain_by_id(a.domid)) == NULL )
                return -ESRCH;
            if ( !IS_PRIV_FOR(current->domain, d) )
            {
                rc = -EPERM;
                goto param_fail2;
            }
        }

        rc = -EINVAL;
        if ( !is_hvm_domain(d) )
            goto param_fail2;

        rc = xsm_hvm_param(d, op);
        if ( rc )
            goto param_fail2;

        rc = -ESRCH;
        if ( d->is_dying )
            goto param_fail2;

        rc = -EINVAL;
        if ( !shadow_mode_enabled(d) )
            goto param_fail2;
        if ( d->vcpu[0] == NULL )
            goto param_fail2;

        rc = shadow_track_dirty_vram(d, a.first_pfn, a.nr, a.dirty_bitmap);

    param_fail2:
        rcu_unlock_domain(d);
        break;
    }

    case HVMOP_modified_memory:
    {
        struct xen_hvm_modified_memory a;
        struct domain *d;
        unsigned long pfn;

        if ( copy_from_guest(&a, arg, 1) )
            return -EFAULT;

        if ( a.domid == DOMID_SELF )
        {
            d = rcu_lock_current_domain();
        }
        else
        {
            if ( (d = rcu_lock_domain_by_id(a.domid)) == NULL )
                return -ESRCH;
            if ( !IS_PRIV_FOR(current->domain, d) )
            {
                rc = -EPERM;
                goto param_fail3;
            }
        }

        rc = -EINVAL;
        if ( !is_hvm_domain(d) )
            goto param_fail3;

        rc = xsm_hvm_param(d, op);
        if ( rc )
            goto param_fail3;

        rc = -EINVAL;
        if ( (a.first_pfn > domain_get_maximum_gpfn(d)) ||
             ((a.first_pfn + a.nr - 1) < a.first_pfn) ||
             ((a.first_pfn + a.nr - 1) > domain_get_maximum_gpfn(d)) )
            goto param_fail3;

        rc = 0;
        if ( !paging_mode_log_dirty(d) )
            goto param_fail3;

        for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ )
        {
            p2m_type_t t;
            mfn_t mfn = gfn_to_mfn(d, pfn, &t);
            if ( mfn_x(mfn) != INVALID_MFN )
            {
                paging_mark_dirty(d, mfn_x(mfn));
                /* These are most probably not page tables any more */
                /* don't take a long time and don't die either */
                sh_remove_shadows(d->vcpu[0], mfn, 1, 0);
            }
        }

    param_fail3:
        rcu_unlock_domain(d);
        break;
    }

    default:
    {
        gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
        rc = -ENOSYS;
        break;
    }
    }

    if ( rc == -EAGAIN )
        rc = hypercall_create_continuation(
            __HYPERVISOR_hvm_op, "lh", op, arg);

    return rc;
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */