
view xen/arch/x86/hvm/hvm.c @ 19826:2f9e1348aa98

x86_64: allow more vCPU-s per guest

Since the shared info layout is fixed, guests are required to use
VCPUOP_register_vcpu_info prior to booting any vCPU beyond the
traditional limit of 32.
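
As an illustrative sketch only (not part of this change): a guest wanting
to bring up vCPU 32 or above would first register a per-vCPU info area,
roughly as below. This assumes the public VCPUOP_register_vcpu_info
interface and the usual Linux-style HYPERVISOR_vcpu_op wrapper; the helper
name itself is made up.

    /* Sketch: point Xen at an alternative vcpu_info location for 'cpu'.
     * 'mfn' is the guest frame holding the structure, 'offset' its byte
     * offset within that frame. Must be done before the vCPU is booted. */
    static int register_vcpu_info_sketch(unsigned int cpu,
                                         unsigned long mfn,
                                         unsigned int offset)
    {
        struct vcpu_register_vcpu_info info = {
            .mfn    = mfn,
            .offset = offset,
        };

        return HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
    }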

MAX_VIRT_CPUS, being an implementation detail of the hypervisor, is no
longer exposed in the public headers.

The tools changes are clearly incomplete (done only so things would
build again), and the current state of the tools (using scalar
variables all over the place to represent vCPU bitmaps) very likely
doesn't permit booting DomU-s with more than the traditional number of
vCPU-s. Testing of the extended functionality was done with Dom0 (96
vCPU-s, as well as 128 vCPU-s of which the kernel elected - by way of a
simple kernel-side patch - to use only some, resulting in a sparse
bitmap).
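
For reference, a minimal sketch of what representing vCPU bitmaps
properly means on the tools side: a real bitmap sized by the vCPU count
rather than a single scalar. Plain C, purely illustrative; none of these
names are an existing libxc interface.

    #include <limits.h>
    #include <stdlib.h>

    #define BITS_PER_LONG (CHAR_BIT * sizeof(unsigned long))

    typedef struct {
        unsigned long *bits;     /* one bit per vCPU */
        unsigned int nr_vcpus;
    } vcpu_bitmap_t;

    static int vcpu_bitmap_alloc(vcpu_bitmap_t *map, unsigned int nr_vcpus)
    {
        size_t words = (nr_vcpus + BITS_PER_LONG - 1) / BITS_PER_LONG;

        map->bits = calloc(words, sizeof(unsigned long));
        map->nr_vcpus = nr_vcpus;
        return map->bits ? 0 : -1;
    }

    static void vcpu_bitmap_set(vcpu_bitmap_t *map, unsigned int vcpu)
    {
        map->bits[vcpu / BITS_PER_LONG] |= 1UL << (vcpu % BITS_PER_LONG);
    }

    static int vcpu_bitmap_test(const vcpu_bitmap_t *map, unsigned int vcpu)
    {
        return !!(map->bits[vcpu / BITS_PER_LONG] &
                  (1UL << (vcpu % BITS_PER_LONG)));
    }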

The ia64 changes are only there to make things build, and are
build-tested only (the tools part only as far as the build would go
before hitting unrelated problems in the blktap code).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Jun 18 10:14:16 2009 +0100 (2009-06-18)
parents cb6f8a34b59a
children 81edfffb3aff
1 /*
2 * hvm.c: Common hardware virtual machine abstractions.
3 *
4 * Copyright (c) 2004, Intel Corporation.
5 * Copyright (c) 2005, International Business Machines Corporation.
6 * Copyright (c) 2008, Citrix Systems, Inc.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 * Place - Suite 330, Boston, MA 02111-1307 USA.
20 */
22 #include <xen/config.h>
23 #include <xen/ctype.h>
24 #include <xen/init.h>
25 #include <xen/lib.h>
26 #include <xen/trace.h>
27 #include <xen/sched.h>
28 #include <xen/irq.h>
29 #include <xen/softirq.h>
30 #include <xen/domain.h>
31 #include <xen/domain_page.h>
32 #include <xen/hypercall.h>
33 #include <xen/guest_access.h>
34 #include <xen/event.h>
35 #include <xen/paging.h>
36 #include <asm/shadow.h>
37 #include <asm/hap.h>
38 #include <asm/current.h>
39 #include <asm/e820.h>
40 #include <asm/io.h>
41 #include <asm/regs.h>
42 #include <asm/cpufeature.h>
43 #include <asm/processor.h>
44 #include <asm/types.h>
45 #include <asm/msr.h>
46 #include <asm/mc146818rtc.h>
47 #include <asm/spinlock.h>
48 #include <asm/hvm/hvm.h>
49 #include <asm/hvm/vpt.h>
50 #include <asm/hvm/support.h>
51 #include <asm/hvm/cacheattr.h>
52 #include <asm/hvm/trace.h>
53 #include <public/sched.h>
54 #include <public/hvm/ioreq.h>
55 #include <public/version.h>
56 #include <public/memory.h>
58 int hvm_enabled __read_mostly;
60 unsigned int opt_hvm_debug_level __read_mostly;
61 integer_param("hvm_debug", opt_hvm_debug_level);
63 int opt_softtsc;
64 boolean_param("softtsc", opt_softtsc);
66 struct hvm_function_table hvm_funcs __read_mostly;
68 /* I/O permission bitmap is globally shared by all HVM guests. */
69 unsigned long __attribute__ ((__section__ (".bss.page_aligned")))
70 hvm_io_bitmap[3*PAGE_SIZE/BYTES_PER_LONG];
72 void hvm_enable(struct hvm_function_table *fns)
73 {
74 extern int hvm_port80_allowed;
76 BUG_ON(hvm_enabled);
77 printk("HVM: %s enabled\n", fns->name);
79 /*
80 * Allow direct access to the PC debug ports 0x80 and 0xed (they are
81 * often used for I/O delays, but the vmexits simply slow things down).
82 */
83 memset(hvm_io_bitmap, ~0, sizeof(hvm_io_bitmap));
84 if ( hvm_port80_allowed )
85 __clear_bit(0x80, hvm_io_bitmap);
86 __clear_bit(0xed, hvm_io_bitmap);
88 hvm_funcs = *fns;
89 hvm_enabled = 1;
91 if ( hvm_funcs.hap_supported )
92 printk("HVM: Hardware Assisted Paging detected.\n");
93 }
95 /*
96 * Need to re-inject a given event? We avoid re-injecting software exceptions
97 * and interrupts because the faulting/trapping instruction can simply be
98 * re-executed (neither VMX nor SVM update RIP when they VMEXIT during
99 * INT3/INTO/INTn).
100 */
101 int hvm_event_needs_reinjection(uint8_t type, uint8_t vector)
102 {
103 switch ( type )
104 {
105 case X86_EVENTTYPE_EXT_INTR:
106 case X86_EVENTTYPE_NMI:
107 return 1;
108 case X86_EVENTTYPE_HW_EXCEPTION:
109 /*
110 * SVM uses type 3 ("HW Exception") for #OF and #BP. We explicitly
111 * check for these vectors, as they are really SW Exceptions. SVM has
112 * not updated RIP to point after the trapping instruction (INT3/INTO).
113 */
114 return (vector != 3) && (vector != 4);
115 default:
116 /* Software exceptions/interrupts can be re-executed (e.g., INT n). */
117 break;
118 }
119 return 0;
120 }
122 /*
123 * Combine two hardware exceptions: @vec2 was raised during delivery of @vec1.
124 * This means we can assume that @vec2 is contributory or a page fault.
125 */
126 uint8_t hvm_combine_hw_exceptions(uint8_t vec1, uint8_t vec2)
127 {
128 /* Exception during double-fault delivery always causes a triple fault. */
129 if ( vec1 == TRAP_double_fault )
130 {
131 hvm_triple_fault();
132 return TRAP_double_fault; /* dummy return */
133 }
135 /* Exception during page-fault delivery always causes a double fault. */
136 if ( vec1 == TRAP_page_fault )
137 return TRAP_double_fault;
139 /* Discard the first exception if it's benign or if we now have a #PF. */
140 if ( !((1u << vec1) & 0x7c01u) || (vec2 == TRAP_page_fault) )
141 return vec2;
143 /* Cannot combine the exceptions: double fault. */
144 return TRAP_double_fault;
145 }
147 void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc)
148 {
149 u64 host_tsc;
151 rdtscll(host_tsc);
153 v->arch.hvm_vcpu.cache_tsc_offset = guest_tsc - host_tsc;
154 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
155 }
157 u64 hvm_get_guest_tsc(struct vcpu *v)
158 {
159 u64 host_tsc;
161 if ( opt_softtsc )
162 host_tsc = hvm_get_guest_time(v);
163 else
164 rdtscll(host_tsc);
166 return host_tsc + v->arch.hvm_vcpu.cache_tsc_offset;
167 }
169 void hvm_migrate_timers(struct vcpu *v)
170 {
171 rtc_migrate_timers(v);
172 pt_migrate(v);
173 }
175 void hvm_do_resume(struct vcpu *v)
176 {
177 ioreq_t *p;
179 pt_restore_timer(v);
181 /* NB. Optimised for common case (p->state == STATE_IOREQ_NONE). */
182 p = &get_ioreq(v)->vp_ioreq;
183 while ( p->state != STATE_IOREQ_NONE )
184 {
185 switch ( p->state )
186 {
187 case STATE_IORESP_READY: /* IORESP_READY -> NONE */
188 hvm_io_assist();
189 break;
190 case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
191 case STATE_IOREQ_INPROCESS:
192 wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port,
193 (p->state != STATE_IOREQ_READY) &&
194 (p->state != STATE_IOREQ_INPROCESS));
195 break;
196 default:
197 gdprintk(XENLOG_ERR, "Weird HVM iorequest state %d.\n", p->state);
198 domain_crash(v->domain);
199 return; /* bail */
200 }
201 }
202 }
204 static void hvm_init_ioreq_page(
205 struct domain *d, struct hvm_ioreq_page *iorp)
206 {
207 memset(iorp, 0, sizeof(*iorp));
208 spin_lock_init(&iorp->lock);
209 domain_pause(d);
210 }
212 static void hvm_destroy_ioreq_page(
213 struct domain *d, struct hvm_ioreq_page *iorp)
214 {
215 spin_lock(&iorp->lock);
217 ASSERT(d->is_dying);
219 if ( iorp->va != NULL )
220 {
221 unmap_domain_page_global(iorp->va);
222 put_page_and_type(iorp->page);
223 iorp->va = NULL;
224 }
226 spin_unlock(&iorp->lock);
227 }
229 static int hvm_set_ioreq_page(
230 struct domain *d, struct hvm_ioreq_page *iorp, unsigned long gmfn)
231 {
232 struct page_info *page;
233 p2m_type_t p2mt;
234 unsigned long mfn;
235 void *va;
237 mfn = mfn_x(gfn_to_mfn(d, gmfn, &p2mt));
238 if ( !p2m_is_ram(p2mt) )
239 return -EINVAL;
240 ASSERT(mfn_valid(mfn));
242 page = mfn_to_page(mfn);
243 if ( !get_page_and_type(page, d, PGT_writable_page) )
244 return -EINVAL;
246 va = map_domain_page_global(mfn);
247 if ( va == NULL )
248 {
249 put_page_and_type(page);
250 return -ENOMEM;
251 }
253 spin_lock(&iorp->lock);
255 if ( (iorp->va != NULL) || d->is_dying )
256 {
257 spin_unlock(&iorp->lock);
258 unmap_domain_page_global(va);
259 put_page_and_type(mfn_to_page(mfn));
260 return -EINVAL;
261 }
263 iorp->va = va;
264 iorp->page = page;
266 spin_unlock(&iorp->lock);
268 domain_unpause(d);
270 return 0;
271 }
273 static int hvm_print_line(
274 int dir, uint32_t port, uint32_t bytes, uint32_t *val)
275 {
276 struct vcpu *curr = current;
277 struct hvm_domain *hd = &curr->domain->arch.hvm_domain;
278 char c = *val;
280 BUG_ON(bytes != 1);
282 /* Accept only printable characters, newline, and horizontal tab. */
283 if ( !isprint(c) && (c != '\n') && (c != '\t') )
284 return X86EMUL_OKAY;
286 spin_lock(&hd->pbuf_lock);
287 hd->pbuf[hd->pbuf_idx++] = c;
288 if ( (hd->pbuf_idx == (sizeof(hd->pbuf) - 2)) || (c == '\n') )
289 {
290 if ( c != '\n' )
291 hd->pbuf[hd->pbuf_idx++] = '\n';
292 hd->pbuf[hd->pbuf_idx] = '\0';
293 printk(XENLOG_G_DEBUG "HVM%u: %s", curr->domain->domain_id, hd->pbuf);
294 hd->pbuf_idx = 0;
295 }
296 spin_unlock(&hd->pbuf_lock);
298 return X86EMUL_OKAY;
299 }
301 int hvm_domain_initialise(struct domain *d)
302 {
303 int rc;
305 if ( !hvm_enabled )
306 {
307 gdprintk(XENLOG_WARNING, "Attempt to create a HVM guest "
308 "on a non-VT/AMDV platform.\n");
309 return -EINVAL;
310 }
312 spin_lock_init(&d->arch.hvm_domain.pbuf_lock);
313 spin_lock_init(&d->arch.hvm_domain.irq_lock);
314 spin_lock_init(&d->arch.hvm_domain.uc_lock);
316 INIT_LIST_HEAD(&d->arch.hvm_domain.msixtbl_list);
317 spin_lock_init(&d->arch.hvm_domain.msixtbl_list_lock);
319 hvm_init_guest_time(d);
321 d->arch.hvm_domain.params[HVM_PARAM_HPET_ENABLED] = 1;
323 hvm_init_cacheattr_region_list(d);
325 rc = paging_enable(d, PG_refcounts|PG_translate|PG_external);
326 if ( rc != 0 )
327 goto fail1;
329 vpic_init(d);
331 rc = vioapic_init(d);
332 if ( rc != 0 )
333 goto fail1;
335 stdvga_init(d);
337 rtc_init(d);
339 hvm_init_ioreq_page(d, &d->arch.hvm_domain.ioreq);
340 hvm_init_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
342 register_portio_handler(d, 0xe9, 1, hvm_print_line);
344 rc = hvm_funcs.domain_initialise(d);
345 if ( rc != 0 )
346 goto fail2;
348 return 0;
350 fail2:
351 rtc_deinit(d);
352 stdvga_deinit(d);
353 vioapic_deinit(d);
354 fail1:
355 hvm_destroy_cacheattr_region_list(d);
356 return rc;
357 }
359 extern void msixtbl_pt_cleanup(struct domain *d);
361 void hvm_domain_relinquish_resources(struct domain *d)
362 {
363 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.ioreq);
364 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
366 msixtbl_pt_cleanup(d);
368 /* Stop all asynchronous timer actions. */
369 rtc_deinit(d);
370 if ( d->vcpu != NULL && d->vcpu[0] != NULL )
371 {
372 pit_deinit(d);
373 pmtimer_deinit(d);
374 hpet_deinit(d);
375 }
376 }
378 void hvm_domain_destroy(struct domain *d)
379 {
380 hvm_funcs.domain_destroy(d);
381 rtc_deinit(d);
382 stdvga_deinit(d);
383 vioapic_deinit(d);
384 hvm_destroy_cacheattr_region_list(d);
385 }
387 static int hvm_save_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
388 {
389 struct vcpu *v;
390 struct hvm_hw_cpu ctxt;
391 struct segment_register seg;
392 struct vcpu_guest_context *vc;
394 for_each_vcpu ( d, v )
395 {
396 /* We don't need to save state for a vcpu that is down; the restore
397 * code will leave it down if there is nothing saved. */
398 if ( test_bit(_VPF_down, &v->pause_flags) )
399 continue;
401 /* Architecture-specific vmcs/vmcb bits */
402 hvm_funcs.save_cpu_ctxt(v, &ctxt);
404 hvm_get_segment_register(v, x86_seg_idtr, &seg);
405 ctxt.idtr_limit = seg.limit;
406 ctxt.idtr_base = seg.base;
408 hvm_get_segment_register(v, x86_seg_gdtr, &seg);
409 ctxt.gdtr_limit = seg.limit;
410 ctxt.gdtr_base = seg.base;
412 hvm_get_segment_register(v, x86_seg_cs, &seg);
413 ctxt.cs_sel = seg.sel;
414 ctxt.cs_limit = seg.limit;
415 ctxt.cs_base = seg.base;
416 ctxt.cs_arbytes = seg.attr.bytes;
418 hvm_get_segment_register(v, x86_seg_ds, &seg);
419 ctxt.ds_sel = seg.sel;
420 ctxt.ds_limit = seg.limit;
421 ctxt.ds_base = seg.base;
422 ctxt.ds_arbytes = seg.attr.bytes;
424 hvm_get_segment_register(v, x86_seg_es, &seg);
425 ctxt.es_sel = seg.sel;
426 ctxt.es_limit = seg.limit;
427 ctxt.es_base = seg.base;
428 ctxt.es_arbytes = seg.attr.bytes;
430 hvm_get_segment_register(v, x86_seg_ss, &seg);
431 ctxt.ss_sel = seg.sel;
432 ctxt.ss_limit = seg.limit;
433 ctxt.ss_base = seg.base;
434 ctxt.ss_arbytes = seg.attr.bytes;
436 hvm_get_segment_register(v, x86_seg_fs, &seg);
437 ctxt.fs_sel = seg.sel;
438 ctxt.fs_limit = seg.limit;
439 ctxt.fs_base = seg.base;
440 ctxt.fs_arbytes = seg.attr.bytes;
442 hvm_get_segment_register(v, x86_seg_gs, &seg);
443 ctxt.gs_sel = seg.sel;
444 ctxt.gs_limit = seg.limit;
445 ctxt.gs_base = seg.base;
446 ctxt.gs_arbytes = seg.attr.bytes;
448 hvm_get_segment_register(v, x86_seg_tr, &seg);
449 ctxt.tr_sel = seg.sel;
450 ctxt.tr_limit = seg.limit;
451 ctxt.tr_base = seg.base;
452 ctxt.tr_arbytes = seg.attr.bytes;
454 hvm_get_segment_register(v, x86_seg_ldtr, &seg);
455 ctxt.ldtr_sel = seg.sel;
456 ctxt.ldtr_limit = seg.limit;
457 ctxt.ldtr_base = seg.base;
458 ctxt.ldtr_arbytes = seg.attr.bytes;
460 vc = &v->arch.guest_context;
462 if ( v->fpu_initialised )
463 memcpy(ctxt.fpu_regs, &vc->fpu_ctxt, sizeof(ctxt.fpu_regs));
464 else
465 memset(ctxt.fpu_regs, 0, sizeof(ctxt.fpu_regs));
467 ctxt.rax = vc->user_regs.eax;
468 ctxt.rbx = vc->user_regs.ebx;
469 ctxt.rcx = vc->user_regs.ecx;
470 ctxt.rdx = vc->user_regs.edx;
471 ctxt.rbp = vc->user_regs.ebp;
472 ctxt.rsi = vc->user_regs.esi;
473 ctxt.rdi = vc->user_regs.edi;
474 ctxt.rsp = vc->user_regs.esp;
475 ctxt.rip = vc->user_regs.eip;
476 ctxt.rflags = vc->user_regs.eflags;
477 #ifdef __x86_64__
478 ctxt.r8 = vc->user_regs.r8;
479 ctxt.r9 = vc->user_regs.r9;
480 ctxt.r10 = vc->user_regs.r10;
481 ctxt.r11 = vc->user_regs.r11;
482 ctxt.r12 = vc->user_regs.r12;
483 ctxt.r13 = vc->user_regs.r13;
484 ctxt.r14 = vc->user_regs.r14;
485 ctxt.r15 = vc->user_regs.r15;
486 #endif
487 ctxt.dr0 = vc->debugreg[0];
488 ctxt.dr1 = vc->debugreg[1];
489 ctxt.dr2 = vc->debugreg[2];
490 ctxt.dr3 = vc->debugreg[3];
491 ctxt.dr6 = vc->debugreg[6];
492 ctxt.dr7 = vc->debugreg[7];
494 if ( hvm_save_entry(CPU, v->vcpu_id, h, &ctxt) != 0 )
495 return 1;
496 }
497 return 0;
498 }
500 static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
501 {
502 int vcpuid, rc;
503 struct vcpu *v;
504 struct hvm_hw_cpu ctxt;
505 struct segment_register seg;
506 struct vcpu_guest_context *vc;
508 /* Which vcpu is this? */
509 vcpuid = hvm_load_instance(h);
510 if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
511 {
512 gdprintk(XENLOG_ERR, "HVM restore: domain has no vcpu %u\n", vcpuid);
513 return -EINVAL;
514 }
515 vc = &v->arch.guest_context;
517 /* Need to init this vcpu before loading its contents */
518 rc = 0;
519 domain_lock(d);
520 if ( !v->is_initialised )
521 rc = boot_vcpu(d, vcpuid, vc);
522 domain_unlock(d);
523 if ( rc != 0 )
524 return rc;
526 if ( hvm_load_entry(CPU, h, &ctxt) != 0 )
527 return -EINVAL;
529 /* Sanity check some control registers. */
530 if ( (ctxt.cr0 & HVM_CR0_GUEST_RESERVED_BITS) ||
531 !(ctxt.cr0 & X86_CR0_ET) ||
532 ((ctxt.cr0 & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG) )
533 {
534 gdprintk(XENLOG_ERR, "HVM restore: bad CR0 0x%"PRIx64"\n",
535 ctxt.cr0);
536 return -EINVAL;
537 }
539 if ( ctxt.cr4 & HVM_CR4_GUEST_RESERVED_BITS )
540 {
541 gdprintk(XENLOG_ERR, "HVM restore: bad CR4 0x%"PRIx64"\n",
542 ctxt.cr4);
543 return -EINVAL;
544 }
546 if ( (ctxt.msr_efer & ~(EFER_FFXSE | EFER_LME | EFER_LMA |
547 EFER_NX | EFER_SCE)) ||
548 ((sizeof(long) != 8) && (ctxt.msr_efer & EFER_LME)) ||
549 (!cpu_has_nx && (ctxt.msr_efer & EFER_NX)) ||
550 (!cpu_has_syscall && (ctxt.msr_efer & EFER_SCE)) ||
551 (!cpu_has_ffxsr && (ctxt.msr_efer & EFER_FFXSE)) ||
552 ((ctxt.msr_efer & (EFER_LME|EFER_LMA)) == EFER_LMA) )
553 {
554 gdprintk(XENLOG_ERR, "HVM restore: bad EFER 0x%"PRIx64"\n",
555 ctxt.msr_efer);
556 return -EINVAL;
557 }
559 /* Older Xen versions used to save the segment arbytes directly
560 * from the VMCS on Intel hosts. Detect this and rearrange them
561 * into the struct segment_register format. */
562 #define UNFOLD_ARBYTES(_r) \
563 if ( (_r & 0xf000) && !(_r & 0x0f00) ) \
564 _r = ((_r & 0xff) | ((_r >> 4) & 0xf00))
565 UNFOLD_ARBYTES(ctxt.cs_arbytes);
566 UNFOLD_ARBYTES(ctxt.ds_arbytes);
567 UNFOLD_ARBYTES(ctxt.es_arbytes);
568 UNFOLD_ARBYTES(ctxt.fs_arbytes);
569 UNFOLD_ARBYTES(ctxt.gs_arbytes);
570 UNFOLD_ARBYTES(ctxt.ss_arbytes);
571 UNFOLD_ARBYTES(ctxt.tr_arbytes);
572 UNFOLD_ARBYTES(ctxt.ldtr_arbytes);
573 #undef UNFOLD_ARBYTES
575 /* Architecture-specific vmcs/vmcb bits */
576 if ( hvm_funcs.load_cpu_ctxt(v, &ctxt) < 0 )
577 return -EINVAL;
579 seg.limit = ctxt.idtr_limit;
580 seg.base = ctxt.idtr_base;
581 hvm_set_segment_register(v, x86_seg_idtr, &seg);
583 seg.limit = ctxt.gdtr_limit;
584 seg.base = ctxt.gdtr_base;
585 hvm_set_segment_register(v, x86_seg_gdtr, &seg);
587 seg.sel = ctxt.cs_sel;
588 seg.limit = ctxt.cs_limit;
589 seg.base = ctxt.cs_base;
590 seg.attr.bytes = ctxt.cs_arbytes;
591 hvm_set_segment_register(v, x86_seg_cs, &seg);
593 seg.sel = ctxt.ds_sel;
594 seg.limit = ctxt.ds_limit;
595 seg.base = ctxt.ds_base;
596 seg.attr.bytes = ctxt.ds_arbytes;
597 hvm_set_segment_register(v, x86_seg_ds, &seg);
599 seg.sel = ctxt.es_sel;
600 seg.limit = ctxt.es_limit;
601 seg.base = ctxt.es_base;
602 seg.attr.bytes = ctxt.es_arbytes;
603 hvm_set_segment_register(v, x86_seg_es, &seg);
605 seg.sel = ctxt.ss_sel;
606 seg.limit = ctxt.ss_limit;
607 seg.base = ctxt.ss_base;
608 seg.attr.bytes = ctxt.ss_arbytes;
609 hvm_set_segment_register(v, x86_seg_ss, &seg);
611 seg.sel = ctxt.fs_sel;
612 seg.limit = ctxt.fs_limit;
613 seg.base = ctxt.fs_base;
614 seg.attr.bytes = ctxt.fs_arbytes;
615 hvm_set_segment_register(v, x86_seg_fs, &seg);
617 seg.sel = ctxt.gs_sel;
618 seg.limit = ctxt.gs_limit;
619 seg.base = ctxt.gs_base;
620 seg.attr.bytes = ctxt.gs_arbytes;
621 hvm_set_segment_register(v, x86_seg_gs, &seg);
623 seg.sel = ctxt.tr_sel;
624 seg.limit = ctxt.tr_limit;
625 seg.base = ctxt.tr_base;
626 seg.attr.bytes = ctxt.tr_arbytes;
627 hvm_set_segment_register(v, x86_seg_tr, &seg);
629 seg.sel = ctxt.ldtr_sel;
630 seg.limit = ctxt.ldtr_limit;
631 seg.base = ctxt.ldtr_base;
632 seg.attr.bytes = ctxt.ldtr_arbytes;
633 hvm_set_segment_register(v, x86_seg_ldtr, &seg);
635 memcpy(&vc->fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs));
637 vc->user_regs.eax = ctxt.rax;
638 vc->user_regs.ebx = ctxt.rbx;
639 vc->user_regs.ecx = ctxt.rcx;
640 vc->user_regs.edx = ctxt.rdx;
641 vc->user_regs.ebp = ctxt.rbp;
642 vc->user_regs.esi = ctxt.rsi;
643 vc->user_regs.edi = ctxt.rdi;
644 vc->user_regs.esp = ctxt.rsp;
645 vc->user_regs.eip = ctxt.rip;
646 vc->user_regs.eflags = ctxt.rflags | 2;
647 #ifdef __x86_64__
648 vc->user_regs.r8 = ctxt.r8;
649 vc->user_regs.r9 = ctxt.r9;
650 vc->user_regs.r10 = ctxt.r10;
651 vc->user_regs.r11 = ctxt.r11;
652 vc->user_regs.r12 = ctxt.r12;
653 vc->user_regs.r13 = ctxt.r13;
654 vc->user_regs.r14 = ctxt.r14;
655 vc->user_regs.r15 = ctxt.r15;
656 #endif
657 vc->debugreg[0] = ctxt.dr0;
658 vc->debugreg[1] = ctxt.dr1;
659 vc->debugreg[2] = ctxt.dr2;
660 vc->debugreg[3] = ctxt.dr3;
661 vc->debugreg[6] = ctxt.dr6;
662 vc->debugreg[7] = ctxt.dr7;
664 vc->flags = VGCF_online;
665 v->fpu_initialised = 1;
667 /* Auxiliary processors should be woken immediately. */
668 v->is_initialised = 1;
669 clear_bit(_VPF_down, &v->pause_flags);
670 vcpu_wake(v);
672 return 0;
673 }
675 HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt,
676 1, HVMSR_PER_VCPU);
678 int hvm_vcpu_initialise(struct vcpu *v)
679 {
680 int rc;
682 if ( (rc = vlapic_init(v)) != 0 )
683 goto fail1;
685 if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 )
686 goto fail2;
688 /* Create ioreq event channel. */
689 rc = alloc_unbound_xen_event_channel(v, 0);
690 if ( rc < 0 )
691 goto fail3;
693 /* Register ioreq event channel. */
694 v->arch.hvm_vcpu.xen_port = rc;
695 spin_lock(&v->domain->arch.hvm_domain.ioreq.lock);
696 if ( v->domain->arch.hvm_domain.ioreq.va != NULL )
697 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
698 spin_unlock(&v->domain->arch.hvm_domain.ioreq.lock);
700 spin_lock_init(&v->arch.hvm_vcpu.tm_lock);
701 INIT_LIST_HEAD(&v->arch.hvm_vcpu.tm_list);
703 rc = hvm_vcpu_cacheattr_init(v);
704 if ( rc != 0 )
705 goto fail3;
707 tasklet_init(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet,
708 (void(*)(unsigned long))hvm_assert_evtchn_irq,
709 (unsigned long)v);
711 v->arch.guest_context.user_regs.eflags = 2;
713 if ( v->vcpu_id == 0 )
714 {
715 /* NB. All these really belong in hvm_domain_initialise(). */
716 pit_init(v, cpu_khz);
717 pmtimer_init(v);
718 hpet_init(v);
720 /* Init guest TSC to start from zero. */
721 hvm_set_guest_tsc(v, 0);
723 /* Can start up without SIPI-SIPI or setvcpucontext domctl. */
724 v->is_initialised = 1;
725 clear_bit(_VPF_down, &v->pause_flags);
726 }
728 return 0;
730 fail3:
731 hvm_funcs.vcpu_destroy(v);
732 fail2:
733 vlapic_destroy(v);
734 fail1:
735 return rc;
736 }
738 void hvm_vcpu_destroy(struct vcpu *v)
739 {
740 tasklet_kill(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet);
741 hvm_vcpu_cacheattr_destroy(v);
742 vlapic_destroy(v);
743 hvm_funcs.vcpu_destroy(v);
745 /* Event channel is already freed by evtchn_destroy(). */
746 /*free_xen_event_channel(v, v->arch.hvm_vcpu.xen_port);*/
747 }
749 void hvm_vcpu_down(struct vcpu *v)
750 {
751 struct domain *d = v->domain;
752 int online_count = 0;
754 /* Doesn't halt us immediately, but we'll never return to guest context. */
755 set_bit(_VPF_down, &v->pause_flags);
756 vcpu_sleep_nosync(v);
758 /* Any other VCPUs online? ... */
759 domain_lock(d);
760 for_each_vcpu ( d, v )
761 if ( !test_bit(_VPF_down, &v->pause_flags) )
762 online_count++;
763 domain_unlock(d);
765 /* ... Shut down the domain if not. */
766 if ( online_count == 0 )
767 {
768 gdprintk(XENLOG_INFO, "All CPUs offline -- powering off.\n");
769 domain_shutdown(d, SHUTDOWN_poweroff);
770 }
771 }
773 void hvm_send_assist_req(struct vcpu *v)
774 {
775 ioreq_t *p;
777 if ( unlikely(!vcpu_start_shutdown_deferral(v)) )
778 return; /* implicitly bins the i/o operation */
780 p = &get_ioreq(v)->vp_ioreq;
781 if ( unlikely(p->state != STATE_IOREQ_NONE) )
782 {
783 /* This indicates a bug in the device model. Crash the domain. */
784 gdprintk(XENLOG_ERR, "Device model set bad IO state %d.\n", p->state);
785 domain_crash(v->domain);
786 return;
787 }
789 prepare_wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port);
791 /*
792 * Following happens /after/ blocking and setting up ioreq contents.
793 * prepare_wait_on_xen_event_channel() is an implicit barrier.
794 */
795 p->state = STATE_IOREQ_READY;
796 notify_via_xen_event_channel(v->arch.hvm_vcpu.xen_port);
797 }
799 void hvm_hlt(unsigned long rflags)
800 {
801 struct vcpu *curr = current;
803 if ( hvm_event_pending(curr) )
804 return;
806 /*
807 * If we halt with interrupts disabled, that's a pretty sure sign that we
808 * want to shut down. In a real processor, NMIs are the only way to break
809 * out of this.
810 */
811 if ( unlikely(!(rflags & X86_EFLAGS_IF)) )
812 return hvm_vcpu_down(curr);
814 do_sched_op_compat(SCHEDOP_block, 0);
816 HVMTRACE_1D(HLT, /* pending = */ vcpu_runnable(curr));
817 }
819 void hvm_triple_fault(void)
820 {
821 struct vcpu *v = current;
822 gdprintk(XENLOG_INFO, "Triple fault on VCPU%d - "
823 "invoking HVM system reset.\n", v->vcpu_id);
824 domain_shutdown(v->domain, SHUTDOWN_reboot);
825 }
827 int hvm_set_efer(uint64_t value)
828 {
829 struct vcpu *v = current;
831 value &= ~EFER_LMA;
833 if ( (value & ~(EFER_FFXSE | EFER_LME | EFER_NX | EFER_SCE)) ||
834 ((sizeof(long) != 8) && (value & EFER_LME)) ||
835 (!cpu_has_nx && (value & EFER_NX)) ||
836 (!cpu_has_syscall && (value & EFER_SCE)) ||
837 (!cpu_has_ffxsr && (value & EFER_FFXSE)) )
838 {
839 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
840 "EFER: %"PRIx64"\n", value);
841 hvm_inject_exception(TRAP_gp_fault, 0, 0);
842 return X86EMUL_EXCEPTION;
843 }
845 if ( ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_LME) &&
846 hvm_paging_enabled(v) )
847 {
848 gdprintk(XENLOG_WARNING,
849 "Trying to change EFER.LME with paging enabled\n");
850 hvm_inject_exception(TRAP_gp_fault, 0, 0);
851 return X86EMUL_EXCEPTION;
852 }
854 value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA;
855 v->arch.hvm_vcpu.guest_efer = value;
856 hvm_update_guest_efer(v);
858 return X86EMUL_OKAY;
859 }
861 extern void shadow_blow_tables_per_domain(struct domain *d);
862 extern bool_t mtrr_pat_not_equal(struct vcpu *vd, struct vcpu *vs);
864 /* Exit UC mode only if all VCPUs agree on MTRR/PAT and are not in no_fill. */
865 static bool_t domain_exit_uc_mode(struct vcpu *v)
866 {
867 struct domain *d = v->domain;
868 struct vcpu *vs;
870 for_each_vcpu ( d, vs )
871 {
872 if ( (vs == v) || !vs->is_initialised )
873 continue;
874 if ( (vs->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) ||
875 mtrr_pat_not_equal(vs, v) )
876 return 0;
877 }
879 return 1;
880 }
882 static void local_flush_cache(void *info)
883 {
884 wbinvd();
885 }
887 static void hvm_set_uc_mode(struct vcpu *v, bool_t is_in_uc_mode)
888 {
889 v->domain->arch.hvm_domain.is_in_uc_mode = is_in_uc_mode;
890 shadow_blow_tables_per_domain(v->domain);
891 if ( hvm_funcs.set_uc_mode )
892 return hvm_funcs.set_uc_mode(v);
893 }
895 int hvm_set_cr0(unsigned long value)
896 {
897 struct vcpu *v = current;
898 p2m_type_t p2mt;
899 unsigned long gfn, mfn, old_value = v->arch.hvm_vcpu.guest_cr[0];
901 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);
903 if ( (u32)value != value )
904 {
905 HVM_DBG_LOG(DBG_LEVEL_1,
906 "Guest attempts to set upper 32 bits in CR0: %lx",
907 value);
908 goto gpf;
909 }
911 value &= ~HVM_CR0_GUEST_RESERVED_BITS;
913 /* ET is reserved and should always be 1. */
914 value |= X86_CR0_ET;
916 if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PG )
917 goto gpf;
919 if ( (value & X86_CR0_PG) && !(old_value & X86_CR0_PG) )
920 {
921 if ( v->arch.hvm_vcpu.guest_efer & EFER_LME )
922 {
923 if ( !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PAE) )
924 {
925 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable");
926 goto gpf;
927 }
928 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode");
929 v->arch.hvm_vcpu.guest_efer |= EFER_LMA;
930 hvm_update_guest_efer(v);
931 }
933 if ( !paging_mode_hap(v->domain) )
934 {
935 /* The guest CR3 must be pointing to the guest physical. */
936 gfn = v->arch.hvm_vcpu.guest_cr[3]>>PAGE_SHIFT;
937 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
938 if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
939 !get_page(mfn_to_page(mfn), v->domain))
940 {
941 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
942 v->arch.hvm_vcpu.guest_cr[3], mfn);
943 domain_crash(v->domain);
944 return X86EMUL_UNHANDLEABLE;
945 }
947 /* Now arch.guest_table points to machine physical. */
948 v->arch.guest_table = pagetable_from_pfn(mfn);
950 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
951 v->arch.hvm_vcpu.guest_cr[3], mfn);
952 }
953 }
954 else if ( !(value & X86_CR0_PG) && (old_value & X86_CR0_PG) )
955 {
956 /* When CR0.PG is cleared, LMA is cleared immediately. */
957 if ( hvm_long_mode_enabled(v) )
958 {
959 v->arch.hvm_vcpu.guest_efer &= ~EFER_LMA;
960 hvm_update_guest_efer(v);
961 }
963 if ( !paging_mode_hap(v->domain) )
964 {
965 put_page(pagetable_get_page(v->arch.guest_table));
966 v->arch.guest_table = pagetable_null();
967 }
968 }
970 if ( has_arch_pdevs(v->domain) )
971 {
972 if ( (value & X86_CR0_CD) && !(value & X86_CR0_NW) )
973 {
974 /* Entering no fill cache mode. */
975 spin_lock(&v->domain->arch.hvm_domain.uc_lock);
976 v->arch.hvm_vcpu.cache_mode = NO_FILL_CACHE_MODE;
978 if ( !v->domain->arch.hvm_domain.is_in_uc_mode )
979 {
980 /* Flush physical caches. */
981 on_each_cpu(local_flush_cache, NULL, 1);
982 hvm_set_uc_mode(v, 1);
983 }
984 spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
985 }
986 else if ( !(value & (X86_CR0_CD | X86_CR0_NW)) &&
987 (v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) )
988 {
989 /* Exit from no fill cache mode. */
990 spin_lock(&v->domain->arch.hvm_domain.uc_lock);
991 v->arch.hvm_vcpu.cache_mode = NORMAL_CACHE_MODE;
993 if ( domain_exit_uc_mode(v) )
994 hvm_set_uc_mode(v, 0);
996 spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
997 }
998 }
1000 v->arch.hvm_vcpu.guest_cr[0] = value;
1001 hvm_update_guest_cr(v, 0);
1003 if ( (value ^ old_value) & X86_CR0_PG )
1004 paging_update_paging_modes(v);
1006 return X86EMUL_OKAY;
1008 gpf:
1009 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1010 return X86EMUL_EXCEPTION;
1011 }
1013 int hvm_set_cr3(unsigned long value)
1014 {
1015 unsigned long mfn;
1016 p2m_type_t p2mt;
1017 struct vcpu *v = current;
1019 if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
1020 (value != v->arch.hvm_vcpu.guest_cr[3]) )
1021 {
1022 /* Shadow-mode CR3 change. Check PDBR and update refcounts. */
1023 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
1024 mfn = mfn_x(gfn_to_mfn_current(value >> PAGE_SHIFT, &p2mt));
1025 if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
1026 !get_page(mfn_to_page(mfn), v->domain) )
1027 goto bad_cr3;
1029 put_page(pagetable_get_page(v->arch.guest_table));
1030 v->arch.guest_table = pagetable_from_pfn(mfn);
1032 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
1033 }
1035 v->arch.hvm_vcpu.guest_cr[3] = value;
1036 paging_update_cr3(v);
1037 return X86EMUL_OKAY;
1039 bad_cr3:
1040 gdprintk(XENLOG_ERR, "Invalid CR3\n");
1041 domain_crash(v->domain);
1042 return X86EMUL_UNHANDLEABLE;
1043 }
1045 int hvm_set_cr4(unsigned long value)
1046 {
1047 struct vcpu *v = current;
1048 unsigned long old_cr;
1050 if ( value & HVM_CR4_GUEST_RESERVED_BITS )
1051 {
1052 HVM_DBG_LOG(DBG_LEVEL_1,
1053 "Guest attempts to set reserved bit in CR4: %lx",
1054 value);
1055 goto gpf;
1056 }
1058 if ( !(value & X86_CR4_PAE) && hvm_long_mode_enabled(v) )
1059 {
1060 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
1061 "EFER.LMA is set");
1062 goto gpf;
1063 }
1065 old_cr = v->arch.hvm_vcpu.guest_cr[4];
1066 v->arch.hvm_vcpu.guest_cr[4] = value;
1067 hvm_update_guest_cr(v, 4);
1069 /* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. */
1070 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
1071 paging_update_paging_modes(v);
1073 return X86EMUL_OKAY;
1075 gpf:
1076 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1077 return X86EMUL_EXCEPTION;
1078 }
1080 int hvm_virtual_to_linear_addr(
1081 enum x86_segment seg,
1082 struct segment_register *reg,
1083 unsigned long offset,
1084 unsigned int bytes,
1085 enum hvm_access_type access_type,
1086 unsigned int addr_size,
1087 unsigned long *linear_addr)
1089 unsigned long addr = offset;
1090 uint32_t last_byte;
1092 if ( !(current->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) )
1094 /*
1095 * REAL MODE: Don't bother with segment access checks.
1096 * Certain of them are not done in native real mode anyway.
1097 */
1098 addr = (uint32_t)(addr + reg->base);
1100 else if ( addr_size != 64 )
1102 /*
1103 * COMPATIBILITY MODE: Apply segment checks and add base.
1104 */
1106 switch ( access_type )
1108 case hvm_access_read:
1109 if ( (reg->attr.fields.type & 0xa) == 0x8 )
1110 goto gpf; /* execute-only code segment */
1111 break;
1112 case hvm_access_write:
1113 if ( (reg->attr.fields.type & 0xa) != 0x2 )
1114 goto gpf; /* not a writable data segment */
1115 break;
1116 default:
1117 break;
1120 last_byte = offset + bytes - 1;
1122 /* Is this a grows-down data segment? Special limit check if so. */
1123 if ( (reg->attr.fields.type & 0xc) == 0x4 )
1125 /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
1126 if ( !reg->attr.fields.db )
1127 last_byte = (uint16_t)last_byte;
1129 /* Check first byte and last byte against respective bounds. */
1130 if ( (offset <= reg->limit) || (last_byte < offset) )
1131 goto gpf;
1133 else if ( (last_byte > reg->limit) || (last_byte < offset) )
1134 goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */
1136 /*
1137 * Hardware truncates to 32 bits in compatibility mode.
1138 * It does not truncate to 16 bits in 16-bit address-size mode.
1139 */
1140 addr = (uint32_t)(addr + reg->base);
1142 else
1144 /*
1145 * LONG MODE: FS and GS add segment base. Addresses must be canonical.
1146 */
1148 if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
1149 addr += reg->base;
1151 if ( !is_canonical_address(addr) )
1152 goto gpf;
1155 *linear_addr = addr;
1156 return 1;
1158 gpf:
1159 return 0;
1162 static void *hvm_map_entry(unsigned long va)
1164 unsigned long gfn, mfn;
1165 p2m_type_t p2mt;
1166 uint32_t pfec;
1168 if ( ((va & ~PAGE_MASK) + 8) > PAGE_SIZE )
1170 gdprintk(XENLOG_ERR, "Descriptor table entry "
1171 "straddles page boundary\n");
1172 domain_crash(current->domain);
1173 return NULL;
1176 /* We're mapping on behalf of the segment-load logic, which might
1177 * write the accessed flags in the descriptors (in 32-bit mode), but
1178 * we still treat it as a kernel-mode read (i.e. no access checks). */
1179 pfec = PFEC_page_present;
1180 gfn = paging_gva_to_gfn(current, va, &pfec);
1181 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
1182 if ( !p2m_is_ram(p2mt) )
1184 gdprintk(XENLOG_ERR, "Failed to look up descriptor table entry\n");
1185 domain_crash(current->domain);
1186 return NULL;
1189 ASSERT(mfn_valid(mfn));
1191 paging_mark_dirty(current->domain, mfn);
1193 return (char *)map_domain_page(mfn) + (va & ~PAGE_MASK);
1196 static void hvm_unmap_entry(void *p)
1198 if ( p )
1199 unmap_domain_page(p);
1202 static int hvm_load_segment_selector(
1203 enum x86_segment seg, uint16_t sel)
1205 struct segment_register desctab, cs, segr;
1206 struct desc_struct *pdesc, desc;
1207 u8 dpl, rpl, cpl;
1208 int fault_type = TRAP_invalid_tss;
1209 struct cpu_user_regs *regs = guest_cpu_user_regs();
1210 struct vcpu *v = current;
1212 if ( regs->eflags & EF_VM )
1214 segr.sel = sel;
1215 segr.base = (uint32_t)sel << 4;
1216 segr.limit = 0xffffu;
1217 segr.attr.bytes = 0xf3;
1218 hvm_set_segment_register(v, seg, &segr);
1219 return 0;
1222 /* NULL selector? */
1223 if ( (sel & 0xfffc) == 0 )
1225 if ( (seg == x86_seg_cs) || (seg == x86_seg_ss) )
1226 goto fail;
1227 memset(&segr, 0, sizeof(segr));
1228 hvm_set_segment_register(v, seg, &segr);
1229 return 0;
1232 /* LDT descriptor must be in the GDT. */
1233 if ( (seg == x86_seg_ldtr) && (sel & 4) )
1234 goto fail;
1236 hvm_get_segment_register(v, x86_seg_cs, &cs);
1237 hvm_get_segment_register(
1238 v, (sel & 4) ? x86_seg_ldtr : x86_seg_gdtr, &desctab);
1240 /* Check against descriptor table limit. */
1241 if ( ((sel & 0xfff8) + 7) > desctab.limit )
1242 goto fail;
1244 pdesc = hvm_map_entry(desctab.base + (sel & 0xfff8));
1245 if ( pdesc == NULL )
1246 goto hvm_map_fail;
1248 do {
1249 desc = *pdesc;
1251 /* Segment present in memory? */
1252 if ( !(desc.b & (1u<<15)) )
1254 fault_type = TRAP_no_segment;
1255 goto unmap_and_fail;
1258 /* LDT descriptor is a system segment. All others are code/data. */
1259 if ( (desc.b & (1u<<12)) == ((seg == x86_seg_ldtr) << 12) )
1260 goto unmap_and_fail;
1262 dpl = (desc.b >> 13) & 3;
1263 rpl = sel & 3;
1264 cpl = cs.sel & 3;
1266 switch ( seg )
1268 case x86_seg_cs:
1269 /* Code segment? */
1270 if ( !(desc.b & (1u<<11)) )
1271 goto unmap_and_fail;
1272 /* Non-conforming segment: check DPL against RPL. */
1273 if ( ((desc.b & (6u<<9)) != 6) && (dpl != rpl) )
1274 goto unmap_and_fail;
1275 break;
1276 case x86_seg_ss:
1277 /* Writable data segment? */
1278 if ( (desc.b & (5u<<9)) != (1u<<9) )
1279 goto unmap_and_fail;
1280 if ( (dpl != cpl) || (dpl != rpl) )
1281 goto unmap_and_fail;
1282 break;
1283 case x86_seg_ldtr:
1284 /* LDT system segment? */
1285 if ( (desc.b & (15u<<8)) != (2u<<8) )
1286 goto unmap_and_fail;
1287 goto skip_accessed_flag;
1288 default:
1289 /* Readable code or data segment? */
1290 if ( (desc.b & (5u<<9)) == (4u<<9) )
1291 goto unmap_and_fail;
1292 /* Non-conforming segment: check DPL against RPL and CPL. */
1293 if ( ((desc.b & (6u<<9)) != 6) && ((dpl < cpl) || (dpl < rpl)) )
1294 goto unmap_and_fail;
1295 break;
1297 } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
1298 (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );
1300 /* Force the Accessed flag in our local copy. */
1301 desc.b |= 0x100;
1303 skip_accessed_flag:
1304 hvm_unmap_entry(pdesc);
1306 segr.base = (((desc.b << 0) & 0xff000000u) |
1307 ((desc.b << 16) & 0x00ff0000u) |
1308 ((desc.a >> 16) & 0x0000ffffu));
1309 segr.attr.bytes = (((desc.b >> 8) & 0x00ffu) |
1310 ((desc.b >> 12) & 0x0f00u));
1311 segr.limit = (desc.b & 0x000f0000u) | (desc.a & 0x0000ffffu);
1312 if ( segr.attr.fields.g )
1313 segr.limit = (segr.limit << 12) | 0xfffu;
1314 segr.sel = sel;
1315 hvm_set_segment_register(v, seg, &segr);
1317 return 0;
1319 unmap_and_fail:
1320 hvm_unmap_entry(pdesc);
1321 fail:
1322 hvm_inject_exception(fault_type, sel & 0xfffc, 0);
1323 hvm_map_fail:
1324 return 1;
1327 void hvm_task_switch(
1328 uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason,
1329 int32_t errcode)
1331 struct vcpu *v = current;
1332 struct cpu_user_regs *regs = guest_cpu_user_regs();
1333 struct segment_register gdt, tr, prev_tr, segr;
1334 struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc;
1335 unsigned long eflags;
1336 int exn_raised, rc;
1337 struct {
1338 u16 back_link,__blh;
1339 u32 esp0;
1340 u16 ss0, _0;
1341 u32 esp1;
1342 u16 ss1, _1;
1343 u32 esp2;
1344 u16 ss2, _2;
1345 u32 cr3, eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi;
1346 u16 es, _3, cs, _4, ss, _5, ds, _6, fs, _7, gs, _8, ldt, _9;
1347 u16 trace, iomap;
1348 } tss = { 0 };
1350 hvm_get_segment_register(v, x86_seg_gdtr, &gdt);
1351 hvm_get_segment_register(v, x86_seg_tr, &prev_tr);
1353 if ( ((tss_sel & 0xfff8) + 7) > gdt.limit )
1355 hvm_inject_exception((taskswitch_reason == TSW_iret) ?
1356 TRAP_invalid_tss : TRAP_gp_fault,
1357 tss_sel & 0xfff8, 0);
1358 goto out;
1361 optss_desc = hvm_map_entry(gdt.base + (prev_tr.sel & 0xfff8));
1362 if ( optss_desc == NULL )
1363 goto out;
1365 nptss_desc = hvm_map_entry(gdt.base + (tss_sel & 0xfff8));
1366 if ( nptss_desc == NULL )
1367 goto out;
1369 tss_desc = *nptss_desc;
1370 tr.sel = tss_sel;
1371 tr.base = (((tss_desc.b << 0) & 0xff000000u) |
1372 ((tss_desc.b << 16) & 0x00ff0000u) |
1373 ((tss_desc.a >> 16) & 0x0000ffffu));
1374 tr.attr.bytes = (((tss_desc.b >> 8) & 0x00ffu) |
1375 ((tss_desc.b >> 12) & 0x0f00u));
1376 tr.limit = (tss_desc.b & 0x000f0000u) | (tss_desc.a & 0x0000ffffu);
1377 if ( tr.attr.fields.g )
1378 tr.limit = (tr.limit << 12) | 0xfffu;
1380 if ( !tr.attr.fields.p )
1382 hvm_inject_exception(TRAP_no_segment, tss_sel & 0xfff8, 0);
1383 goto out;
1386 if ( tr.attr.fields.type != ((taskswitch_reason == TSW_iret) ? 0xb : 0x9) )
1388 hvm_inject_exception(
1389 (taskswitch_reason == TSW_iret) ? TRAP_invalid_tss : TRAP_gp_fault,
1390 tss_sel & 0xfff8, 0);
1391 goto out;
1394 if ( tr.limit < (sizeof(tss)-1) )
1396 hvm_inject_exception(TRAP_invalid_tss, tss_sel & 0xfff8, 0);
1397 goto out;
1400 rc = hvm_copy_from_guest_virt(
1401 &tss, prev_tr.base, sizeof(tss), PFEC_page_present);
1402 if ( rc == HVMCOPY_bad_gva_to_gfn )
1403 goto out;
1405 eflags = regs->eflags;
1406 if ( taskswitch_reason == TSW_iret )
1407 eflags &= ~X86_EFLAGS_NT;
1409 tss.cr3 = v->arch.hvm_vcpu.guest_cr[3];
1410 tss.eip = regs->eip;
1411 tss.eflags = eflags;
1412 tss.eax = regs->eax;
1413 tss.ecx = regs->ecx;
1414 tss.edx = regs->edx;
1415 tss.ebx = regs->ebx;
1416 tss.esp = regs->esp;
1417 tss.ebp = regs->ebp;
1418 tss.esi = regs->esi;
1419 tss.edi = regs->edi;
1421 hvm_get_segment_register(v, x86_seg_es, &segr);
1422 tss.es = segr.sel;
1423 hvm_get_segment_register(v, x86_seg_cs, &segr);
1424 tss.cs = segr.sel;
1425 hvm_get_segment_register(v, x86_seg_ss, &segr);
1426 tss.ss = segr.sel;
1427 hvm_get_segment_register(v, x86_seg_ds, &segr);
1428 tss.ds = segr.sel;
1429 hvm_get_segment_register(v, x86_seg_fs, &segr);
1430 tss.fs = segr.sel;
1431 hvm_get_segment_register(v, x86_seg_gs, &segr);
1432 tss.gs = segr.sel;
1433 hvm_get_segment_register(v, x86_seg_ldtr, &segr);
1434 tss.ldt = segr.sel;
1436 rc = hvm_copy_to_guest_virt(
1437 prev_tr.base, &tss, sizeof(tss), PFEC_page_present);
1438 if ( rc == HVMCOPY_bad_gva_to_gfn )
1439 goto out;
1441 rc = hvm_copy_from_guest_virt(
1442 &tss, tr.base, sizeof(tss), PFEC_page_present);
1443 if ( rc == HVMCOPY_bad_gva_to_gfn )
1444 goto out;
1446 if ( hvm_set_cr3(tss.cr3) )
1447 goto out;
1449 regs->eip = tss.eip;
1450 regs->eflags = tss.eflags | 2;
1451 regs->eax = tss.eax;
1452 regs->ecx = tss.ecx;
1453 regs->edx = tss.edx;
1454 regs->ebx = tss.ebx;
1455 regs->esp = tss.esp;
1456 regs->ebp = tss.ebp;
1457 regs->esi = tss.esi;
1458 regs->edi = tss.edi;
1460 if ( (taskswitch_reason == TSW_call_or_int) )
1462 regs->eflags |= X86_EFLAGS_NT;
1463 tss.back_link = prev_tr.sel;
1466 exn_raised = 0;
1467 if ( hvm_load_segment_selector(x86_seg_ldtr, tss.ldt) ||
1468 hvm_load_segment_selector(x86_seg_es, tss.es) ||
1469 hvm_load_segment_selector(x86_seg_cs, tss.cs) ||
1470 hvm_load_segment_selector(x86_seg_ss, tss.ss) ||
1471 hvm_load_segment_selector(x86_seg_ds, tss.ds) ||
1472 hvm_load_segment_selector(x86_seg_fs, tss.fs) ||
1473 hvm_load_segment_selector(x86_seg_gs, tss.gs) )
1474 exn_raised = 1;
1476 rc = hvm_copy_to_guest_virt(
1477 tr.base, &tss, sizeof(tss), PFEC_page_present);
1478 if ( rc == HVMCOPY_bad_gva_to_gfn )
1479 exn_raised = 1;
1481 if ( (tss.trace & 1) && !exn_raised )
1482 hvm_inject_exception(TRAP_debug, tss_sel & 0xfff8, 0);
1484 tr.attr.fields.type = 0xb; /* busy 32-bit tss */
1485 hvm_set_segment_register(v, x86_seg_tr, &tr);
1487 v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_TS;
1488 hvm_update_guest_cr(v, 0);
1490 if ( (taskswitch_reason == TSW_iret) ||
1491 (taskswitch_reason == TSW_jmp) )
1492 clear_bit(41, optss_desc); /* clear B flag of old task */
1494 if ( taskswitch_reason != TSW_iret )
1495 set_bit(41, nptss_desc); /* set B flag of new task */
1497 if ( errcode >= 0 )
1499 struct segment_register reg;
1500 unsigned long linear_addr;
1501 regs->esp -= 4;
1502 hvm_get_segment_register(current, x86_seg_ss, &reg);
1503 /* Todo: do not ignore access faults here. */
1504 if ( hvm_virtual_to_linear_addr(x86_seg_ss, &reg, regs->esp,
1505 4, hvm_access_write, 32,
1506 &linear_addr) )
1507 hvm_copy_to_guest_virt_nofault(linear_addr, &errcode, 4, 0);
1510 out:
1511 hvm_unmap_entry(optss_desc);
1512 hvm_unmap_entry(nptss_desc);
1515 #define HVMCOPY_from_guest (0u<<0)
1516 #define HVMCOPY_to_guest (1u<<0)
1517 #define HVMCOPY_no_fault (0u<<1)
1518 #define HVMCOPY_fault (1u<<1)
1519 #define HVMCOPY_phys (0u<<2)
1520 #define HVMCOPY_virt (1u<<2)
1521 static enum hvm_copy_result __hvm_copy(
1522 void *buf, paddr_t addr, int size, unsigned int flags, uint32_t pfec)
1524 struct vcpu *curr = current;
1525 unsigned long gfn, mfn;
1526 p2m_type_t p2mt;
1527 char *p;
1528 int count, todo = size;
1530 while ( todo > 0 )
1532 count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo);
1534 if ( flags & HVMCOPY_virt )
1536 gfn = paging_gva_to_gfn(curr, addr, &pfec);
1537 if ( gfn == INVALID_GFN )
1539 if ( flags & HVMCOPY_fault )
1540 hvm_inject_exception(TRAP_page_fault, pfec, addr);
1541 return HVMCOPY_bad_gva_to_gfn;
1544 else
1546 gfn = addr >> PAGE_SHIFT;
1549 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
1551 if ( !p2m_is_ram(p2mt) )
1552 return HVMCOPY_bad_gfn_to_mfn;
1553 ASSERT(mfn_valid(mfn));
1555 p = (char *)map_domain_page(mfn) + (addr & ~PAGE_MASK);
1557 if ( flags & HVMCOPY_to_guest )
1559 if ( p2mt == p2m_ram_ro )
1561 static unsigned long lastpage;
1562 if ( xchg(&lastpage, gfn) != gfn )
1563 gdprintk(XENLOG_DEBUG, "guest attempted write to read-only"
1564 " memory page. gfn=%#lx, mfn=%#lx\n",
1565 gfn, mfn);
1567 else
1569 memcpy(p, buf, count);
1570 paging_mark_dirty(curr->domain, mfn);
1573 else
1575 memcpy(buf, p, count);
1578 unmap_domain_page(p);
1580 addr += count;
1581 buf += count;
1582 todo -= count;
1585 return HVMCOPY_okay;
1588 enum hvm_copy_result hvm_copy_to_guest_phys(
1589 paddr_t paddr, void *buf, int size)
1591 return __hvm_copy(buf, paddr, size,
1592 HVMCOPY_to_guest | HVMCOPY_fault | HVMCOPY_phys,
1593 0);
1596 enum hvm_copy_result hvm_copy_from_guest_phys(
1597 void *buf, paddr_t paddr, int size)
1599 return __hvm_copy(buf, paddr, size,
1600 HVMCOPY_from_guest | HVMCOPY_fault | HVMCOPY_phys,
1601 0);
1604 enum hvm_copy_result hvm_copy_to_guest_virt(
1605 unsigned long vaddr, void *buf, int size, uint32_t pfec)
1607 return __hvm_copy(buf, vaddr, size,
1608 HVMCOPY_to_guest | HVMCOPY_fault | HVMCOPY_virt,
1609 PFEC_page_present | PFEC_write_access | pfec);
1612 enum hvm_copy_result hvm_copy_from_guest_virt(
1613 void *buf, unsigned long vaddr, int size, uint32_t pfec)
1615 return __hvm_copy(buf, vaddr, size,
1616 HVMCOPY_from_guest | HVMCOPY_fault | HVMCOPY_virt,
1617 PFEC_page_present | pfec);
1620 enum hvm_copy_result hvm_fetch_from_guest_virt(
1621 void *buf, unsigned long vaddr, int size, uint32_t pfec)
1623 if ( hvm_nx_enabled(current) )
1624 pfec |= PFEC_insn_fetch;
1625 return __hvm_copy(buf, vaddr, size,
1626 HVMCOPY_from_guest | HVMCOPY_fault | HVMCOPY_virt,
1627 PFEC_page_present | pfec);
1630 enum hvm_copy_result hvm_copy_to_guest_virt_nofault(
1631 unsigned long vaddr, void *buf, int size, uint32_t pfec)
1633 return __hvm_copy(buf, vaddr, size,
1634 HVMCOPY_to_guest | HVMCOPY_no_fault | HVMCOPY_virt,
1635 PFEC_page_present | PFEC_write_access | pfec);
1638 enum hvm_copy_result hvm_copy_from_guest_virt_nofault(
1639 void *buf, unsigned long vaddr, int size, uint32_t pfec)
1641 return __hvm_copy(buf, vaddr, size,
1642 HVMCOPY_from_guest | HVMCOPY_no_fault | HVMCOPY_virt,
1643 PFEC_page_present | pfec);
1646 enum hvm_copy_result hvm_fetch_from_guest_virt_nofault(
1647 void *buf, unsigned long vaddr, int size, uint32_t pfec)
1649 if ( hvm_nx_enabled(current) )
1650 pfec |= PFEC_insn_fetch;
1651 return __hvm_copy(buf, vaddr, size,
1652 HVMCOPY_from_guest | HVMCOPY_no_fault | HVMCOPY_virt,
1653 PFEC_page_present | pfec);
1656 #ifdef __x86_64__
1657 DEFINE_PER_CPU(bool_t, hvm_64bit_hcall);
1658 #endif
1660 unsigned long copy_to_user_hvm(void *to, const void *from, unsigned int len)
1662 int rc;
1664 #ifdef __x86_64__
1665 if ( !this_cpu(hvm_64bit_hcall) && is_compat_arg_xlat_range(to, len) )
1667 memcpy(to, from, len);
1668 return 0;
1670 #endif
1672 rc = hvm_copy_to_guest_virt_nofault((unsigned long)to, (void *)from,
1673 len, 0);
1674 return rc ? len : 0; /* fake a copy_to_user() return code */
1677 unsigned long copy_from_user_hvm(void *to, const void *from, unsigned len)
1679 int rc;
1681 #ifdef __x86_64__
1682 if ( !this_cpu(hvm_64bit_hcall) && is_compat_arg_xlat_range(from, len) )
1684 memcpy(to, from, len);
1685 return 0;
1687 #endif
1689 rc = hvm_copy_from_guest_virt_nofault(to, (unsigned long)from, len, 0);
1690 return rc ? len : 0; /* fake a copy_from_user() return code */
1693 #define bitmaskof(idx) (1U << ((idx) & 31))
1694 void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
1695 unsigned int *ecx, unsigned int *edx)
1697 struct vcpu *v = current;
1699 if ( cpuid_viridian_leaves(input, eax, ebx, ecx, edx) )
1700 return;
1702 if ( cpuid_hypervisor_leaves(input, eax, ebx, ecx, edx) )
1703 return;
1705 domain_cpuid(v->domain, input, *ecx, eax, ebx, ecx, edx);
1707 switch ( input )
1709 case 0x1:
1710 /* Fix up VLAPIC details. */
1711 *ebx &= 0x00FFFFFFu;
1712 *ebx |= (v->vcpu_id * 2) << 24;
1713 if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
1714 __clear_bit(X86_FEATURE_APIC & 31, edx);
1715 break;
1716 case 0xb:
1717 /* Fix the x2APIC identifier. */
1718 *edx = v->vcpu_id * 2;
1719 break;
1723 void hvm_rdtsc_intercept(struct cpu_user_regs *regs)
1725 uint64_t tsc;
1726 struct vcpu *v = current;
1728 tsc = hvm_get_guest_tsc(v);
1729 regs->eax = (uint32_t)tsc;
1730 regs->edx = (uint32_t)(tsc >> 32);
1733 int hvm_msr_read_intercept(struct cpu_user_regs *regs)
1735 uint32_t ecx = regs->ecx;
1736 uint64_t msr_content = 0;
1737 struct vcpu *v = current;
1738 uint64_t *var_range_base, *fixed_range_base;
1739 int index, mtrr;
1740 uint32_t cpuid[4];
1742 var_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.var_ranges;
1743 fixed_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.fixed_ranges;
1745 hvm_cpuid(1, &cpuid[0], &cpuid[1], &cpuid[2], &cpuid[3]);
1746 mtrr = !!(cpuid[3] & bitmaskof(X86_FEATURE_MTRR));
1748 switch ( ecx )
1750 case MSR_IA32_TSC:
1751 msr_content = hvm_get_guest_tsc(v);
1752 break;
1754 case MSR_IA32_APICBASE:
1755 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
1756 break;
1758 case MSR_IA32_MCG_CAP:
1759 case MSR_IA32_MCG_STATUS:
1760 case MSR_IA32_MC0_STATUS:
1761 case MSR_IA32_MC1_STATUS:
1762 case MSR_IA32_MC2_STATUS:
1763 case MSR_IA32_MC3_STATUS:
1764 case MSR_IA32_MC4_STATUS:
1765 case MSR_IA32_MC5_STATUS:
1766 /* No point in letting the guest see real MCEs */
1767 msr_content = 0;
1768 break;
1770 case MSR_IA32_CR_PAT:
1771 msr_content = v->arch.hvm_vcpu.pat_cr;
1772 break;
1774 case MSR_MTRRcap:
1775 if ( !mtrr )
1776 goto gp_fault;
1777 msr_content = v->arch.hvm_vcpu.mtrr.mtrr_cap;
1778 break;
1779 case MSR_MTRRdefType:
1780 if ( !mtrr )
1781 goto gp_fault;
1782 msr_content = v->arch.hvm_vcpu.mtrr.def_type
1783 | (v->arch.hvm_vcpu.mtrr.enabled << 10);
1784 break;
1785 case MSR_MTRRfix64K_00000:
1786 if ( !mtrr )
1787 goto gp_fault;
1788 msr_content = fixed_range_base[0];
1789 break;
1790 case MSR_MTRRfix16K_80000:
1791 case MSR_MTRRfix16K_A0000:
1792 if ( !mtrr )
1793 goto gp_fault;
1794 index = regs->ecx - MSR_MTRRfix16K_80000;
1795 msr_content = fixed_range_base[index + 1];
1796 break;
1797 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
1798 if ( !mtrr )
1799 goto gp_fault;
1800 index = regs->ecx - MSR_MTRRfix4K_C0000;
1801 msr_content = fixed_range_base[index + 3];
1802 break;
1803 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
1804 if ( !mtrr )
1805 goto gp_fault;
1806 index = regs->ecx - MSR_IA32_MTRR_PHYSBASE0;
1807 msr_content = var_range_base[index];
1808 break;
1810 case MSR_K8_ENABLE_C1E:
1811 /* There's no point in letting the guest see C-States.
1812 * Further, this AMD-only register may be accessed if this HVM guest
1813 * has been migrated to an Intel host. This fixes a guest crash
1814 * in this case.
1815 */
1816 msr_content = 0;
1817 break;
1819 default:
1820 return hvm_funcs.msr_read_intercept(regs);
1823 regs->eax = (uint32_t)msr_content;
1824 regs->edx = (uint32_t)(msr_content >> 32);
1825 return X86EMUL_OKAY;
1827 gp_fault:
1828 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1829 return X86EMUL_EXCEPTION;
1832 int hvm_msr_write_intercept(struct cpu_user_regs *regs)
1834 extern bool_t mtrr_var_range_msr_set(
1835 struct mtrr_state *v, u32 msr, u64 msr_content);
1836 extern bool_t mtrr_fix_range_msr_set(
1837 struct mtrr_state *v, int row, u64 msr_content);
1838 extern bool_t mtrr_def_type_msr_set(struct mtrr_state *v, u64 msr_content);
1839 extern bool_t pat_msr_set(u64 *pat, u64 msr);
1841 uint32_t ecx = regs->ecx;
1842 uint64_t msr_content = (uint32_t)regs->eax | ((uint64_t)regs->edx << 32);
1843 struct vcpu *v = current;
1844 int index, mtrr;
1845 uint32_t cpuid[4];
1847 hvm_cpuid(1, &cpuid[0], &cpuid[1], &cpuid[2], &cpuid[3]);
1848 mtrr = !!(cpuid[3] & bitmaskof(X86_FEATURE_MTRR));
1850 switch ( ecx )
1852 case MSR_IA32_TSC:
1853 hvm_set_guest_tsc(v, msr_content);
1854 pt_reset(v);
1855 break;
1857 case MSR_IA32_APICBASE:
1858 vlapic_msr_set(vcpu_vlapic(v), msr_content);
1859 break;
1861 case MSR_IA32_CR_PAT:
1862 if ( !pat_msr_set(&v->arch.hvm_vcpu.pat_cr, msr_content) )
1863 goto gp_fault;
1864 break;
1866 case MSR_MTRRcap:
1867 if ( !mtrr )
1868 goto gp_fault;
1869 goto gp_fault;
1870 case MSR_MTRRdefType:
1871 if ( !mtrr )
1872 goto gp_fault;
1873 if ( !mtrr_def_type_msr_set(&v->arch.hvm_vcpu.mtrr, msr_content) )
1874 goto gp_fault;
1875 break;
1876 case MSR_MTRRfix64K_00000:
1877 if ( !mtrr )
1878 goto gp_fault;
1879 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr, 0, msr_content) )
1880 goto gp_fault;
1881 break;
1882 case MSR_MTRRfix16K_80000:
1883 case MSR_MTRRfix16K_A0000:
1884 if ( !mtrr )
1885 goto gp_fault;
1886 index = regs->ecx - MSR_MTRRfix16K_80000 + 1;
1887 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
1888 index, msr_content) )
1889 goto gp_fault;
1890 break;
1891 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
1892 if ( !mtrr )
1893 goto gp_fault;
1894 index = regs->ecx - MSR_MTRRfix4K_C0000 + 3;
1895 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
1896 index, msr_content) )
1897 goto gp_fault;
1898 break;
1899 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
1900 if ( !mtrr )
1901 goto gp_fault;
1902 if ( !mtrr_var_range_msr_set(&v->arch.hvm_vcpu.mtrr,
1903 regs->ecx, msr_content) )
1904 goto gp_fault;
1905 break;
1907 default:
1908 return hvm_funcs.msr_write_intercept(regs);
1911 return X86EMUL_OKAY;
1913 gp_fault:
1914 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1915 return X86EMUL_EXCEPTION;
1918 enum hvm_intblk hvm_interrupt_blocked(struct vcpu *v, struct hvm_intack intack)
1920 unsigned long intr_shadow;
1922 ASSERT(v == current);
1924 if ( (intack.source != hvm_intsrc_nmi) &&
1925 !(guest_cpu_user_regs()->eflags & X86_EFLAGS_IF) )
1926 return hvm_intblk_rflags_ie;
1928 intr_shadow = hvm_funcs.get_interrupt_shadow(v);
1930 if ( intr_shadow & (HVM_INTR_SHADOW_STI|HVM_INTR_SHADOW_MOV_SS) )
1931 return hvm_intblk_shadow;
1933 if ( intack.source == hvm_intsrc_nmi )
1934 return ((intr_shadow & HVM_INTR_SHADOW_NMI) ?
1935 hvm_intblk_nmi_iret : hvm_intblk_none);
1937 if ( intack.source == hvm_intsrc_lapic )
1939 uint32_t tpr = vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xF0;
1940 if ( (tpr >> 4) >= (intack.vector >> 4) )
1941 return hvm_intblk_tpr;
1944 return hvm_intblk_none;
1947 static long hvm_grant_table_op(
1948 unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int count)
1950 if ( (cmd != GNTTABOP_query_size) && (cmd != GNTTABOP_setup_table) )
1951 return -ENOSYS; /* all other commands need auditing */
1952 return do_grant_table_op(cmd, uop, count);
1955 static long hvm_memory_op(int cmd, XEN_GUEST_HANDLE(void) arg)
1957 long rc = do_memory_op(cmd, arg);
1958 if ( (cmd & MEMOP_CMD_MASK) == XENMEM_decrease_reservation )
1959 current->domain->arch.hvm_domain.qemu_mapcache_invalidate = 1;
1960 return rc;
1963 static long hvm_vcpu_op(
1964 int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
1966 long rc;
1968 switch ( cmd )
1970 case VCPUOP_register_runstate_memory_area:
1971 case VCPUOP_get_runstate_info:
1972 rc = do_vcpu_op(cmd, vcpuid, arg);
1973 break;
1974 default:
1975 rc = -ENOSYS;
1976 break;
1979 return rc;
1982 typedef unsigned long hvm_hypercall_t(
1983 unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
1985 #define HYPERCALL(x) \
1986 [ __HYPERVISOR_ ## x ] = (hvm_hypercall_t *) do_ ## x
1988 #if defined(__i386__)
1990 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
1991 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op,
1992 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
1993 [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op,
1994 HYPERCALL(xen_version),
1995 HYPERCALL(event_channel_op),
1996 HYPERCALL(sched_op),
1997 HYPERCALL(hvm_op)
1998 };
2000 #else /* defined(__x86_64__) */
2002 static long hvm_memory_op_compat32(int cmd, XEN_GUEST_HANDLE(void) arg)
2004 long rc = compat_memory_op(cmd, arg);
2005 if ( (cmd & MEMOP_CMD_MASK) == XENMEM_decrease_reservation )
2006 current->domain->arch.hvm_domain.qemu_mapcache_invalidate = 1;
2007 return rc;
2010 static long hvm_vcpu_op_compat32(
2011 int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
2013 long rc;
2015 switch ( cmd )
2017 case VCPUOP_register_runstate_memory_area:
2018 case VCPUOP_get_runstate_info:
2019 rc = compat_vcpu_op(cmd, vcpuid, arg);
2020 break;
2021 default:
2022 rc = -ENOSYS;
2023 break;
2026 return rc;
2029 static hvm_hypercall_t *hvm_hypercall64_table[NR_hypercalls] = {
2030 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op,
2031 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
2032 [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op,
2033 HYPERCALL(xen_version),
2034 HYPERCALL(event_channel_op),
2035 HYPERCALL(sched_op),
2036 HYPERCALL(hvm_op)
2037 };
2039 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
2040 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op_compat32,
2041 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
2042 [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op_compat32,
2043 HYPERCALL(xen_version),
2044 HYPERCALL(event_channel_op),
2045 HYPERCALL(sched_op),
2046 HYPERCALL(hvm_op)
2047 };
2049 #endif /* defined(__x86_64__) */
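/*
 * The tables above rely on C99 designated initialisers so that the array
 * index is the hypercall number itself; unlisted numbers stay NULL and are
 * rejected in hvm_do_hypercall() below. As an illustration,
 *
 *     HYPERCALL(xen_version)
 *
 * expands to
 *
 *     [ __HYPERVISOR_xen_version ] = (hvm_hypercall_t *) do_xen_version
 *
 * while memory_op, grant_table_op and vcpu_op are routed through the HVM
 * wrappers defined earlier (and, for 32-bit guests on x86_64, through the
 * compat32 variants) rather than the plain do_* handlers.
 */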
2051 int hvm_do_hypercall(struct cpu_user_regs *regs)
2052 {
2053 struct vcpu *curr = current;
2054 struct segment_register sreg;
2055 int mode = hvm_guest_x86_mode(curr);
2056 uint32_t eax = regs->eax;
2058 switch ( mode )
2059 {
2060 #ifdef __x86_64__
2061 case 8:
2062 #endif
2063 case 4:
2064 case 2:
2065 hvm_get_segment_register(curr, x86_seg_ss, &sreg);
2066 if ( unlikely(sreg.attr.fields.dpl == 3) )
2067 {
2068 default:
2069 regs->eax = -EPERM;
2070 return HVM_HCALL_completed;
2071 }
2072 case 0:
2073 break;
2074 }
2076 if ( (eax & 0x80000000) && is_viridian_domain(curr->domain) )
2077 return viridian_hypercall(regs);
2079 if ( (eax >= NR_hypercalls) || !hvm_hypercall32_table[eax] )
2080 {
2081 regs->eax = -ENOSYS;
2082 return HVM_HCALL_completed;
2083 }
2085 this_cpu(hc_preempted) = 0;
2087 #ifdef __x86_64__
2088 if ( mode == 8 )
2089 {
2090 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%lx, %lx, %lx, %lx, %lx)", eax,
2091 regs->rdi, regs->rsi, regs->rdx, regs->r10, regs->r8);
2093 this_cpu(hvm_64bit_hcall) = 1;
2094 regs->rax = hvm_hypercall64_table[eax](regs->rdi,
2095 regs->rsi,
2096 regs->rdx,
2097 regs->r10,
2098 regs->r8);
2099 this_cpu(hvm_64bit_hcall) = 0;
2100 }
2101 else
2102 #endif
2103 {
2104 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%x, %x, %x, %x, %x)", eax,
2105 (uint32_t)regs->ebx, (uint32_t)regs->ecx,
2106 (uint32_t)regs->edx, (uint32_t)regs->esi,
2107 (uint32_t)regs->edi);
2109 regs->eax = hvm_hypercall32_table[eax]((uint32_t)regs->ebx,
2110 (uint32_t)regs->ecx,
2111 (uint32_t)regs->edx,
2112 (uint32_t)regs->esi,
2113 (uint32_t)regs->edi);
2114 }
2116 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u -> %lx",
2117 eax, (unsigned long)regs->eax);
2119 if ( this_cpu(hc_preempted) )
2120 return HVM_HCALL_preempted;
2122 if ( unlikely(curr->domain->arch.hvm_domain.qemu_mapcache_invalidate) &&
2123 test_and_clear_bool(curr->domain->arch.hvm_domain.
2124 qemu_mapcache_invalidate) )
2125 return HVM_HCALL_invalidate;
2127 return HVM_HCALL_completed;
2128 }
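/*
 * Guest-side view of this dispatcher (a minimal sketch, not part of this
 * file): the guest loads the hypercall number into eax/rax and its
 * arguments into the registers unpacked above, then calls through the
 * hypercall transfer page (one 32-byte stub per hypercall) that
 * hvm_hypercall_page_initialise() below fills in. For a 64-bit guest:
 *
 *     mov  $__HYPERVISOR_xen_version, %eax    # hypercall number
 *     mov  $XENVER_version, %edi              # arg1
 *     xor  %esi, %esi                         # arg2 (no buffer needed)
 *     call hypercall_page + __HYPERVISOR_xen_version * 32
 *
 * A 32-bit guest passes the same arguments in ebx/ecx/edx/esi/edi,
 * matching the 32-bit branch above.
 */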
2130 static void hvm_latch_shinfo_size(struct domain *d)
2131 {
2132 /*
2133 * Called from operations which are among the very first executed by
2134 * PV drivers on initialisation or after save/restore. These are sensible
2135 * points at which to sample the execution mode of the guest and latch
2136 * 32- or 64-bit format for shared state.
2137 */
2138 if ( current->domain == d )
2139 d->arch.has_32bit_shinfo = (hvm_guest_x86_mode(current) != 8);
2140 }
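/*
 * Example of the latching above: a 64-bit PV-on-HVM kernel that is already
 * in long mode when it sets HVM_PARAM_CALLBACK_IRQ (one of the callers of
 * hvm_latch_shinfo_size(), see do_hvm_op() below) ends up with
 * has_32bit_shinfo clear and therefore the 64-bit shared_info layout; a
 * guest still executing in 32-bit mode at that point is latched to the
 * 32-bit layout instead.
 */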
2142 /* Initialise a hypercall transfer page for a VMX domain using
2143 paravirtualised drivers. */
2144 void hvm_hypercall_page_initialise(struct domain *d,
2145 void *hypercall_page)
2146 {
2147 hvm_latch_shinfo_size(d);
2148 hvm_funcs.init_hypercall_page(d, hypercall_page);
2149 }
2151 static int hvmop_set_pci_intx_level(
2152 XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t) uop)
2153 {
2154 struct xen_hvm_set_pci_intx_level op;
2155 struct domain *d;
2156 int rc;
2158 if ( copy_from_guest(&op, uop, 1) )
2159 return -EFAULT;
2161 if ( (op.domain > 0) || (op.bus > 0) || (op.device > 31) || (op.intx > 3) )
2162 return -EINVAL;
2164 d = rcu_lock_domain_by_id(op.domid);
2165 if ( d == NULL )
2166 return -ESRCH;
2168 rc = -EPERM;
2169 if ( !IS_PRIV_FOR(current->domain, d) )
2170 goto out;
2172 rc = -EINVAL;
2173 if ( !is_hvm_domain(d) )
2174 goto out;
2176 rc = xsm_hvm_set_pci_intx_level(d);
2177 if ( rc )
2178 goto out;
2180 rc = 0;
2181 switch ( op.level )
2182 {
2183 case 0:
2184 hvm_pci_intx_deassert(d, op.device, op.intx);
2185 break;
2186 case 1:
2187 hvm_pci_intx_assert(d, op.device, op.intx);
2188 break;
2189 default:
2190 rc = -EINVAL;
2191 break;
2192 }
2194 out:
2195 rcu_unlock_domain(d);
2196 return rc;
2197 }
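/*
 * Minimal sketch of a caller (e.g. the device model in the privileged
 * domain) asserting INTA# of emulated PCI device 3; the wrapper used to
 * issue the hypercall is an assumption, only the payload below is defined
 * by this interface:
 *
 *     struct xen_hvm_set_pci_intx_level op = {
 *         .domid  = guest_domid,
 *         .domain = 0,             // PCI segment, must be 0 here
 *         .bus    = 0,             // must be 0 here
 *         .device = 3,
 *         .intx   = 0,             // INTA#
 *         .level  = 1,             // 1 = assert, 0 = de-assert
 *     };
 *     HYPERVISOR_hvm_op(HVMOP_set_pci_intx_level, &op);
 */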
2199 void hvm_vcpu_reset_state(struct vcpu *v, uint16_t cs, uint16_t ip)
2200 {
2201 struct domain *d = v->domain;
2202 struct vcpu_guest_context *ctxt;
2203 struct segment_register reg;
2205 BUG_ON(vcpu_runnable(v));
2207 domain_lock(d);
2209 if ( v->is_initialised )
2210 goto out;
2212 if ( !paging_mode_hap(d) )
2213 {
2214 if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
2215 put_page(pagetable_get_page(v->arch.guest_table));
2216 v->arch.guest_table = pagetable_null();
2217 }
2219 ctxt = &v->arch.guest_context;
2220 memset(ctxt, 0, sizeof(*ctxt));
2221 ctxt->flags = VGCF_online;
2222 ctxt->user_regs.eflags = 2;
2223 ctxt->user_regs.edx = 0x00000f00;
2224 ctxt->user_regs.eip = ip;
2226 v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_ET;
2227 hvm_update_guest_cr(v, 0);
2229 v->arch.hvm_vcpu.guest_cr[2] = 0;
2230 hvm_update_guest_cr(v, 2);
2232 v->arch.hvm_vcpu.guest_cr[3] = 0;
2233 hvm_update_guest_cr(v, 3);
2235 v->arch.hvm_vcpu.guest_cr[4] = 0;
2236 hvm_update_guest_cr(v, 4);
2238 v->arch.hvm_vcpu.guest_efer = 0;
2239 hvm_update_guest_efer(v);
2241 reg.sel = cs;
2242 reg.base = (uint32_t)reg.sel << 4;
2243 reg.limit = 0xffff;
2244 reg.attr.bytes = 0x09b;
2245 hvm_set_segment_register(v, x86_seg_cs, &reg);
2247 reg.sel = reg.base = 0;
2248 reg.limit = 0xffff;
2249 reg.attr.bytes = 0x093;
2250 hvm_set_segment_register(v, x86_seg_ds, &reg);
2251 hvm_set_segment_register(v, x86_seg_es, &reg);
2252 hvm_set_segment_register(v, x86_seg_fs, &reg);
2253 hvm_set_segment_register(v, x86_seg_gs, &reg);
2254 hvm_set_segment_register(v, x86_seg_ss, &reg);
2256 reg.attr.bytes = 0x82; /* LDT */
2257 hvm_set_segment_register(v, x86_seg_ldtr, &reg);
2259 reg.attr.bytes = 0x8b; /* 32-bit TSS (busy) */
2260 hvm_set_segment_register(v, x86_seg_tr, &reg);
2262 reg.attr.bytes = 0;
2263 hvm_set_segment_register(v, x86_seg_gdtr, &reg);
2264 hvm_set_segment_register(v, x86_seg_idtr, &reg);
2266 /* Sync AP's TSC with BSP's. */
2267 v->arch.hvm_vcpu.cache_tsc_offset =
2268 v->domain->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset;
2269 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
2271 paging_update_paging_modes(v);
2273 v->arch.flags |= TF_kernel_mode;
2274 v->is_initialised = 1;
2275 clear_bit(_VPF_down, &v->pause_flags);
2277 out:
2278 domain_unlock(d);
2279 }
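/*
 * The (cs, ip) pair determines where the vCPU starts executing in real
 * mode: the CS base is set to cs << 4 above, so the S3 wakeup path below
 * (hvm_s3_suspend() calls hvm_vcpu_reset_state(v, 0xf000, 0xfff0))
 * restarts vCPU0 at physical address 0xf0000 + 0xfff0 = 0xffff0, the
 * conventional x86 reset vector just below 1MB.
 */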
2281 static void hvm_s3_suspend(struct domain *d)
2282 {
2283 struct vcpu *v;
2285 domain_pause(d);
2286 domain_lock(d);
2288 if ( d->is_dying || (d->vcpu == NULL) || (d->vcpu[0] == NULL) ||
2289 test_and_set_bool(d->arch.hvm_domain.is_s3_suspended) )
2290 {
2291 domain_unlock(d);
2292 domain_unpause(d);
2293 return;
2294 }
2296 for_each_vcpu ( d, v )
2297 {
2298 vlapic_reset(vcpu_vlapic(v));
2299 vcpu_reset(v);
2300 }
2302 vpic_reset(d);
2303 vioapic_reset(d);
2304 pit_reset(d);
2305 rtc_reset(d);
2306 pmtimer_reset(d);
2307 hpet_reset(d);
2309 hvm_vcpu_reset_state(d->vcpu[0], 0xf000, 0xfff0);
2311 domain_unlock(d);
2312 }
2314 static void hvm_s3_resume(struct domain *d)
2315 {
2316 if ( test_and_clear_bool(d->arch.hvm_domain.is_s3_suspended) )
2317 domain_unpause(d);
2318 }
2320 static int hvmop_set_isa_irq_level(
2321 XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t) uop)
2322 {
2323 struct xen_hvm_set_isa_irq_level op;
2324 struct domain *d;
2325 int rc;
2327 if ( copy_from_guest(&op, uop, 1) )
2328 return -EFAULT;
2330 if ( op.isa_irq > 15 )
2331 return -EINVAL;
2333 d = rcu_lock_domain_by_id(op.domid);
2334 if ( d == NULL )
2335 return -ESRCH;
2337 rc = -EPERM;
2338 if ( !IS_PRIV_FOR(current->domain, d) )
2339 goto out;
2341 rc = -EINVAL;
2342 if ( !is_hvm_domain(d) )
2343 goto out;
2345 rc = xsm_hvm_set_isa_irq_level(d);
2346 if ( rc )
2347 goto out;
2349 rc = 0;
2350 switch ( op.level )
2351 {
2352 case 0:
2353 hvm_isa_irq_deassert(d, op.isa_irq);
2354 break;
2355 case 1:
2356 hvm_isa_irq_assert(d, op.isa_irq);
2357 break;
2358 default:
2359 rc = -EINVAL;
2360 break;
2361 }
2363 out:
2364 rcu_unlock_domain(d);
2365 return rc;
2366 }
2368 static int hvmop_set_pci_link_route(
2369 XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t) uop)
2370 {
2371 struct xen_hvm_set_pci_link_route op;
2372 struct domain *d;
2373 int rc;
2375 if ( copy_from_guest(&op, uop, 1) )
2376 return -EFAULT;
2378 if ( (op.link > 3) || (op.isa_irq > 15) )
2379 return -EINVAL;
2381 d = rcu_lock_domain_by_id(op.domid);
2382 if ( d == NULL )
2383 return -ESRCH;
2385 rc = -EPERM;
2386 if ( !IS_PRIV_FOR(current->domain, d) )
2387 goto out;
2389 rc = -EINVAL;
2390 if ( !is_hvm_domain(d) )
2391 goto out;
2393 rc = xsm_hvm_set_pci_link_route(d);
2394 if ( rc )
2395 goto out;
2397 rc = 0;
2398 hvm_set_pci_link_route(d, op.link, op.isa_irq);
2400 out:
2401 rcu_unlock_domain(d);
2402 return rc;
2403 }
2405 static int hvmop_flush_tlb_all(void)
2406 {
2407 struct domain *d = current->domain;
2408 struct vcpu *v;
2410 if ( !is_hvm_domain(d) )
2411 return -EINVAL;
2413 /* Avoid deadlock if more than one vcpu tries this at the same time. */
2414 if ( !spin_trylock(&d->hypercall_deadlock_mutex) )
2415 return -EAGAIN;
2417 /* Pause all other vcpus. */
2418 for_each_vcpu ( d, v )
2419 if ( v != current )
2420 vcpu_pause_nosync(v);
2422 /* Now that all VCPUs are signalled to deschedule, we wait... */
2423 for_each_vcpu ( d, v )
2424 if ( v != current )
2425 while ( !vcpu_runnable(v) && v->is_running )
2426 cpu_relax();
2428 /* All other vcpus are paused, safe to unlock now. */
2429 spin_unlock(&d->hypercall_deadlock_mutex);
2431 /* Flush paging-mode soft state (e.g., va->gfn cache; PAE PDPE cache). */
2432 for_each_vcpu ( d, v )
2433 paging_update_cr3(v);
2435 /* Flush all dirty TLBs. */
2436 flush_tlb_mask(&d->domain_dirty_cpumask);
2438 /* Done. */
2439 for_each_vcpu ( d, v )
2440 if ( v != current )
2441 vcpu_unpause(v);
2443 return 0;
2444 }
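/*
 * Guest trigger for the flush above (a sketch; the guest-side wrapper name
 * is an assumption, the null-handle requirement is enforced in do_hvm_op()
 * below):
 *
 *     rc = HYPERVISOR_hvm_op(HVMOP_flush_tlbs, NULL);
 *
 * If the trylock fails, the -EAGAIN result is not returned to the guest;
 * do_hvm_op() turns it into a hypercall continuation and retries.
 */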
2446 long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
2448 {
2449 struct domain *curr_d = current->domain;
2450 long rc = 0;
2452 switch ( op )
2453 {
2454 case HVMOP_set_param:
2455 case HVMOP_get_param:
2456 {
2457 struct xen_hvm_param a;
2458 struct hvm_ioreq_page *iorp;
2459 struct domain *d;
2460 struct vcpu *v;
2462 if ( copy_from_guest(&a, arg, 1) )
2463 return -EFAULT;
2465 if ( a.index >= HVM_NR_PARAMS )
2466 return -EINVAL;
2468 rc = rcu_lock_target_domain_by_id(a.domid, &d);
2469 if ( rc != 0 )
2470 return rc;
2472 rc = -EINVAL;
2473 if ( !is_hvm_domain(d) )
2474 goto param_fail;
2476 rc = xsm_hvm_param(d, op);
2477 if ( rc )
2478 goto param_fail;
2480 if ( op == HVMOP_set_param )
2481 {
2482 rc = 0;
2484 switch ( a.index )
2485 {
2486 case HVM_PARAM_IOREQ_PFN:
2487 iorp = &d->arch.hvm_domain.ioreq;
2488 if ( (rc = hvm_set_ioreq_page(d, iorp, a.value)) != 0 )
2489 break;
2490 spin_lock(&iorp->lock);
2491 if ( iorp->va != NULL )
2492 /* Initialise evtchn port info if VCPUs already created. */
2493 for_each_vcpu ( d, v )
2494 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
2495 spin_unlock(&iorp->lock);
2496 break;
2497 case HVM_PARAM_BUFIOREQ_PFN:
2498 iorp = &d->arch.hvm_domain.buf_ioreq;
2499 rc = hvm_set_ioreq_page(d, iorp, a.value);
2500 break;
2501 case HVM_PARAM_CALLBACK_IRQ:
2502 hvm_set_callback_via(d, a.value);
2503 hvm_latch_shinfo_size(d);
2504 break;
2505 case HVM_PARAM_TIMER_MODE:
2506 if ( a.value > HVMPTM_one_missed_tick_pending )
2507 rc = -EINVAL;
2508 break;
2509 case HVM_PARAM_VIRIDIAN:
2510 if ( a.value > 1 )
2511 rc = -EINVAL;
2512 break;
2513 case HVM_PARAM_IDENT_PT:
2514 /* Not reflexive, as we must domain_pause(). */
2515 rc = -EPERM;
2516 if ( curr_d == d )
2517 break;
2519 rc = -EINVAL;
2520 if ( d->arch.hvm_domain.params[a.index] != 0 )
2521 break;
2523 rc = 0;
2524 if ( !paging_mode_hap(d) )
2525 break;
2527 /*
2528 * Update GUEST_CR3 in each VMCS to point at identity map.
2529 * All foreign updates to guest state must synchronise on
2530 * the domctl_lock.
2531 */
2532 rc = -EAGAIN;
2533 if ( !domctl_lock_acquire() )
2534 break;
2536 rc = 0;
2537 domain_pause(d);
2538 d->arch.hvm_domain.params[a.index] = a.value;
2539 for_each_vcpu ( d, v )
2540 paging_update_cr3(v);
2541 domain_unpause(d);
2543 domctl_lock_release();
2544 break;
2545 case HVM_PARAM_DM_DOMAIN:
2546 /* Not reflexive, as we must domain_pause(). */
2547 rc = -EPERM;
2548 if ( curr_d == d )
2549 break;
2551 if ( a.value == DOMID_SELF )
2552 a.value = curr_d->domain_id;
2554 rc = 0;
2555 domain_pause(d); /* safe to change per-vcpu xen_port */
2556 iorp = &d->arch.hvm_domain.ioreq;
2557 for_each_vcpu ( d, v )
2558 {
2559 int old_port, new_port;
2560 new_port = alloc_unbound_xen_event_channel(v, a.value);
2561 if ( new_port < 0 )
2562 {
2563 rc = new_port;
2564 break;
2565 }
2566 /* xchg() ensures that only we free_xen_event_channel() */
2567 old_port = xchg(&v->arch.hvm_vcpu.xen_port, new_port);
2568 free_xen_event_channel(v, old_port);
2569 spin_lock(&iorp->lock);
2570 if ( iorp->va != NULL )
2571 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
2572 spin_unlock(&iorp->lock);
2573 }
2574 domain_unpause(d);
2575 break;
2576 case HVM_PARAM_ACPI_S_STATE:
2577 /* Not reflexive, as we must domain_pause(). */
2578 rc = -EPERM;
2579 if ( curr_d == d )
2580 break;
2582 rc = 0;
2583 if ( a.value == 3 )
2584 hvm_s3_suspend(d);
2585 else if ( a.value == 0 )
2586 hvm_s3_resume(d);
2587 else
2588 rc = -EINVAL;
2590 break;
2591 }
2593 if ( rc == 0 )
2594 d->arch.hvm_domain.params[a.index] = a.value;
2595 }
2596 else
2597 {
2598 switch ( a.index )
2599 {
2600 case HVM_PARAM_ACPI_S_STATE:
2601 a.value = d->arch.hvm_domain.is_s3_suspended ? 3 : 0;
2602 break;
2603 default:
2604 a.value = d->arch.hvm_domain.params[a.index];
2605 break;
2606 }
2607 rc = copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
2608 }
2610 HVM_DBG_LOG(DBG_LEVEL_HCALL, "%s param %u = %"PRIx64,
2611 op == HVMOP_set_param ? "set" : "get",
2612 a.index, a.value);
2614 param_fail:
2615 rcu_unlock_domain(d);
2616 break;
2617 }
2619 case HVMOP_set_pci_intx_level:
2620 rc = hvmop_set_pci_intx_level(
2621 guest_handle_cast(arg, xen_hvm_set_pci_intx_level_t));
2622 break;
2624 case HVMOP_set_isa_irq_level:
2625 rc = hvmop_set_isa_irq_level(
2626 guest_handle_cast(arg, xen_hvm_set_isa_irq_level_t));
2627 break;
2629 case HVMOP_set_pci_link_route:
2630 rc = hvmop_set_pci_link_route(
2631 guest_handle_cast(arg, xen_hvm_set_pci_link_route_t));
2632 break;
2634 case HVMOP_flush_tlbs:
2635 rc = guest_handle_is_null(arg) ? hvmop_flush_tlb_all() : -ENOSYS;
2636 break;
2638 case HVMOP_track_dirty_vram:
2639 {
2640 struct xen_hvm_track_dirty_vram a;
2641 struct domain *d;
2643 if ( copy_from_guest(&a, arg, 1) )
2644 return -EFAULT;
2646 rc = rcu_lock_target_domain_by_id(a.domid, &d);
2647 if ( rc != 0 )
2648 return rc;
2650 rc = -EINVAL;
2651 if ( !is_hvm_domain(d) )
2652 goto param_fail2;
2654 rc = xsm_hvm_param(d, op);
2655 if ( rc )
2656 goto param_fail2;
2658 rc = -ESRCH;
2659 if ( d->is_dying )
2660 goto param_fail2;
2662 rc = -EINVAL;
2663 if ( d->vcpu == NULL || d->vcpu[0] == NULL )
2664 goto param_fail2;
2666 if ( shadow_mode_enabled(d) )
2667 rc = shadow_track_dirty_vram(d, a.first_pfn, a.nr, a.dirty_bitmap);
2668 else
2669 rc = hap_track_dirty_vram(d, a.first_pfn, a.nr, a.dirty_bitmap);
2671 param_fail2:
2672 rcu_unlock_domain(d);
2673 break;
2674 }
2676 case HVMOP_modified_memory:
2677 {
2678 struct xen_hvm_modified_memory a;
2679 struct domain *d;
2680 unsigned long pfn;
2682 if ( copy_from_guest(&a, arg, 1) )
2683 return -EFAULT;
2685 rc = rcu_lock_target_domain_by_id(a.domid, &d);
2686 if ( rc != 0 )
2687 return rc;
2689 rc = -EINVAL;
2690 if ( !is_hvm_domain(d) )
2691 goto param_fail3;
2693 rc = xsm_hvm_param(d, op);
2694 if ( rc )
2695 goto param_fail3;
2697 rc = -EINVAL;
2698 if ( (a.first_pfn > domain_get_maximum_gpfn(d)) ||
2699 ((a.first_pfn + a.nr - 1) < a.first_pfn) ||
2700 ((a.first_pfn + a.nr - 1) > domain_get_maximum_gpfn(d)) )
2701 goto param_fail3;
2703 rc = 0;
2704 if ( !paging_mode_log_dirty(d) )
2705 goto param_fail3;
2707 for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ )
2708 {
2709 p2m_type_t t;
2710 mfn_t mfn = gfn_to_mfn(d, pfn, &t);
2711 if ( mfn_x(mfn) != INVALID_MFN )
2712 {
2713 paging_mark_dirty(d, mfn_x(mfn));
2714 /* These are most probably not page tables any more */
2715 /* don't take a long time and don't die either */
2716 sh_remove_shadows(d->vcpu[0], mfn, 1, 0);
2717 }
2718 }
2720 param_fail3:
2721 rcu_unlock_domain(d);
2722 break;
2723 }
2725 case HVMOP_set_mem_type:
2726 {
2727 struct xen_hvm_set_mem_type a;
2728 struct domain *d;
2729 unsigned long pfn;
2731 /* Interface types to internal p2m types */
2732 p2m_type_t memtype[] = {
2733 p2m_ram_rw, /* HVMMEM_ram_rw */
2734 p2m_ram_ro, /* HVMMEM_ram_ro */
2735 p2m_mmio_dm /* HVMMEM_mmio_dm */
2736 };
2738 if ( copy_from_guest(&a, arg, 1) )
2739 return -EFAULT;
2741 rc = rcu_lock_target_domain_by_id(a.domid, &d);
2742 if ( rc != 0 )
2743 return rc;
2745 rc = -EINVAL;
2746 if ( !is_hvm_domain(d) )
2747 goto param_fail4;
2749 rc = -EINVAL;
2750 if ( (a.first_pfn > domain_get_maximum_gpfn(d)) ||
2751 ((a.first_pfn + a.nr - 1) < a.first_pfn) ||
2752 ((a.first_pfn + a.nr - 1) > domain_get_maximum_gpfn(d)) )
2753 goto param_fail4;
2755 if ( a.hvmmem_type >= ARRAY_SIZE(memtype) )
2756 goto param_fail4;
2758 rc = 0;
2760 for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ )
2761 {
2762 p2m_type_t t;
2763 mfn_t mfn;
2764 mfn = gfn_to_mfn(d, pfn, &t);
2765 p2m_change_type(d, pfn, t, memtype[a.hvmmem_type]);
2766 }
2768 param_fail4:
2769 rcu_unlock_domain(d);
2770 break;
2771 }
2773 default:
2774 {
2775 gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
2776 rc = -ENOSYS;
2777 break;
2778 }
2779 }
2781 if ( rc == -EAGAIN )
2782 rc = hypercall_create_continuation(
2783 __HYPERVISOR_hvm_op, "lh", op, arg);
2785 return rc;
2786 }
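/*
 * Typical HVMOP_set_param use from a PV-on-HVM guest configuring its event
 * channel upcall (a minimal sketch; the guest-side hypercall wrapper is an
 * assumption, the structure is the public xen_hvm_param interface):
 *
 *     struct xen_hvm_param p = {
 *         .domid = DOMID_SELF,
 *         .index = HVM_PARAM_CALLBACK_IRQ,
 *         .value = callback_via,     // e.g. an ISA IRQ encoding
 *     };
 *     rc = HYPERVISOR_hvm_op(HVMOP_set_param, &p);
 *
 * This lands in the HVM_PARAM_CALLBACK_IRQ case above and, as a side
 * effect, latches the shared-info layout via hvm_latch_shinfo_size().
 */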
2788 int hvm_debug_op(struct vcpu *v, int32_t op)
2789 {
2790 int rc;
2792 switch ( op )
2793 {
2794 case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON:
2795 case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF:
2796 rc = -ENOSYS;
2797 if ( !cpu_has_monitor_trap_flag )
2798 break;
2799 rc = 0;
2800 vcpu_pause(v);
2801 v->arch.hvm_vcpu.single_step =
2802 (op == XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON);
2803 vcpu_unpause(v); /* guest will latch new state */
2804 break;
2805 default:
2806 rc = -ENOSYS;
2807 break;
2808 }
2810 return rc;
2811 }
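/*
 * hvm_debug_op() is reached via the debugger domctl path; a rough sketch
 * of enabling single-stepping on vCPU0 of a guest (field and wrapper
 * details are assumptions, the XEN_DOMCTL_DEBUG_OP_* constants are the
 * ones handled above):
 *
 *     struct xen_domctl domctl = { .domain = guest_domid };
 *     domctl.cmd = XEN_DOMCTL_debug_op;
 *     domctl.u.debug_op.op   = XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON;
 *     domctl.u.debug_op.vcpu = 0;
 *     // ... issue the domctl hypercall, e.g. via the toolstack ...
 *
 * On hardware without the monitor trap flag the request fails with
 * -ENOSYS, as checked above.
 */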
2814 /*
2815 * Local variables:
2816 * mode: C
2817 * c-set-style: "BSD"
2818 * c-basic-offset: 4
2819 * tab-width: 4
2820 * indent-tabs-mode: nil
2821 * End:
2822 */