debuggers.hg

view xen/arch/x86/vmx_platform.c @ 4640:e02fc4c21740

bitkeeper revision 1.1345 (4266bd05lHlHunb0CEvOq60j2DvKCQ)

[PATCH] VMX world switch

The attached code implements a VMX world switch to vmxassist (a small assist
module residing in a VMX enabled partition where it is responsible for
emulating real mode) whenever CR0.PE is disabled.

The patch temporarily disables the PGE feature flag in cpuid as it is
currently broken (try running an unmodified 2.6 kernel that sets PGE in
mm/init.c/paging_init()).

The patch adds consistency checks before setting the ARCH_VMX_IO_WAIT state
to detect race conditions on SMP systems.

Signed-Off-By: Leendert van Doorn <leendert@watson.ibm.com>
Signed-off-by: ian@xensource.com
author leendert@watson.ibm.com[iap10]
date Wed Apr 20 20:35:17 2005 +0000 (2005-04-20)
parents 1803018b3b05
children 38a02ee9a9c8 65b28c74cec2
line source
1 /*
2 * vmx_platform.c: handling x86 platform related MMIO instructions
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/types.h>
22 #include <xen/mm.h>
23 #include <asm/shadow.h>
24 #include <asm/domain_page.h>
25 #include <asm/page.h>
26 #include <xen/event.h>
27 #include <xen/trace.h>
28 #include <asm/vmx.h>
29 #include <asm/vmx_platform.h>
30 #include <public/io/ioreq.h>
32 #include <xen/lib.h>
33 #include <xen/sched.h>
34 #include <asm/current.h>
36 #ifdef CONFIG_VMX
38 #define DECODE_success 1
39 #define DECODE_failure 0
41 #if defined (__x86_64__)
/* x86_64 stub: guest register snapshotting is not implemented yet,
 * so nothing is stored into @regs.  TODO: mirror the i386 version. */
static void store_xen_regs(struct xen_regs *regs)
{
}
/* x86_64 stub: register fetch is not implemented yet; always returns 0.
 * TODO: mirror the i386 version. */
static long get_reg_value(int size, int index, int seg, struct xen_regs *regs)
{
    return 0;
}
51 #elif defined (__i386__)
/*
 * Snapshot the guest's segment selectors, stack pointer, flags and EIP
 * out of the VMCS into @regs so the MMIO instruction decoder can refer
 * to the guest register state.
 */
static void store_xen_regs(struct xen_regs *regs)
{
    __vmread(GUEST_SS_SELECTOR, &regs->ss);
    __vmread(GUEST_ESP, &regs->esp);
    __vmread(GUEST_EFLAGS, &regs->eflags);
    __vmread(GUEST_CS_SELECTOR, &regs->cs);
    __vmread(GUEST_DS_SELECTOR, &regs->ds);
    __vmread(GUEST_ES_SELECTOR, &regs->es);
    __vmread(GUEST_EIP, &regs->eip);
}
63 static long get_reg_value(int size, int index, int seg, struct xen_regs *regs)
64 {
65 /*
66 * Reference the db_reg[] table
67 */
68 switch (size) {
69 case BYTE:
70 switch (index) {
71 case 0: //%al
72 return (char)(regs->eax & 0xFF);
73 case 1: //%cl
74 return (char)(regs->ecx & 0xFF);
75 case 2: //%dl
76 return (char)(regs->edx & 0xFF);
77 case 3: //%bl
78 return (char)(regs->ebx & 0xFF);
79 case 4: //%ah
80 return (char)((regs->eax & 0xFF00) >> 8);
81 case 5: //%ch
82 return (char)((regs->ecx & 0xFF00) >> 8);
83 case 6: //%dh
84 return (char)((regs->edx & 0xFF00) >> 8);
85 case 7: //%bh
86 return (char)((regs->ebx & 0xFF00) >> 8);
87 default:
88 printk("(get_reg_value)size case 0 error\n");
89 return -1;
90 }
91 case WORD:
92 switch (index) {
93 case 0: //%ax
94 return (short)(regs->eax & 0xFFFF);
95 case 1: //%cx
96 return (short)(regs->ecx & 0xFFFF);
97 case 2: //%dx
98 return (short)(regs->edx & 0xFFFF);
99 case 3: //%bx
100 return (short)(regs->ebx & 0xFFFF);
101 case 4: //%sp
102 return (short)(regs->esp & 0xFFFF);
103 break;
104 case 5: //%bp
105 return (short)(regs->ebp & 0xFFFF);
106 case 6: //%si
107 return (short)(regs->esi & 0xFFFF);
108 case 7: //%di
109 return (short)(regs->edi & 0xFFFF);
110 default:
111 printk("(get_reg_value)size case 1 error\n");
112 return -1;
113 }
114 case LONG:
115 switch (index) {
116 case 0: //%eax
117 return regs->eax;
118 case 1: //%ecx
119 return regs->ecx;
120 case 2: //%edx
121 return regs->edx;
123 case 3: //%ebx
124 return regs->ebx;
125 case 4: //%esp
126 return regs->esp;
127 case 5: //%ebp
128 return regs->ebp;
129 case 6: //%esi
130 return regs->esi;
131 case 7: //%edi
132 return regs->edi;
133 default:
134 printk("(get_reg_value)size case 2 error\n");
135 return -1;
136 }
137 default:
138 printk("(get_reg_value)size case error\n");
139 return -1;
140 }
141 }
142 #endif
144 static inline unsigned char *check_prefix(unsigned char *inst, struct instruction *thread_inst)
145 {
146 while (1) {
147 switch (*inst) {
148 case 0xf3: //REPZ
149 thread_inst->flags = REPZ;
150 break;
151 case 0xf2: //REPNZ
152 thread_inst->flags = REPNZ;
153 break;
154 case 0xf0: //LOCK
155 break;
156 case 0x2e: //CS
157 case 0x36: //SS
158 case 0x3e: //DS
159 case 0x26: //ES
160 case 0x64: //FS
161 case 0x65: //GS
162 thread_inst->seg_sel = *inst;
163 break;
164 case 0x66: //32bit->16bit
165 thread_inst->op_size = WORD;
166 break;
167 case 0x67:
168 printf("Not handling 0x67 (yet)\n");
169 domain_crash_synchronous();
170 break;
171 default:
172 return inst;
173 }
174 inst++;
175 }
176 }
/*
 * Extract the immediate operand that follows a ModR/M-addressed
 * instruction.  @inst points at the ModR/M byte; the SIB byte and any
 * displacement are skipped according to the addressing mode, then
 * @op_size bytes of immediate data are assembled little-endian.
 *
 * @op16:    non-zero when 16-bit addressing is in effect (disp16
 *           instead of disp32).
 * @inst:    pointer to the ModR/M byte of the instruction.
 * @op_size: immediate size in bytes (1, 2 or 4).
 *
 * Fixes vs. original: the unused `reg` local is gone, and each
 * immediate byte is widened to unsigned long *before* shifting, so
 * byte 3 of a 32-bit immediate can no longer be left-shifted into the
 * sign bit of an int (undefined behavior).
 */
static inline unsigned long get_immediate(int op16, const unsigned char *inst, int op_size)
{
    int mod, rm;
    unsigned long val = 0;
    int i;

    mod = (*inst >> 6) & 3;
    rm = *inst & 7;

    inst++; //skip ModR/M byte
    if (mod != 3 && rm == 4) {
        inst++; //skip SIB byte
    }

    switch (mod) {
    case 0:
        if (rm == 5) {
            if (op16)
                inst = inst + 2; //disp16, skip 2 bytes
            else
                inst = inst + 4; //disp32, skip 4 bytes
        }
        break;
    case 1:
        inst++; //disp8, skip 1 byte
        break;
    case 2:
        if (op16)
            inst = inst + 2; //disp16, skip 2 bytes
        else
            inst = inst + 4; //disp32, skip 4 bytes
        break;
    }

    /* Assemble the immediate little-endian; widen before shifting. */
    for (i = 0; i < op_size; i++) {
        val |= (unsigned long)*inst++ << (8 * i);
    }

    return val;
}
/*
 * Return the register number encoded in a ModR/M byte.  When mod == 3
 * both operands are registers and r/m names the register of interest;
 * otherwise r/m describes a memory operand and the reg field names the
 * (single) register operand.
 *
 * Fix vs. original: removed the unreachable trailing `return 0;`.
 */
static inline int get_index(const unsigned char *inst)
{
    int mod, reg, rm;

    mod = (*inst >> 6) & 3;
    reg = (*inst >> 3) & 7;
    rm = *inst & 7;

    //Only one operand in the instruction is register
    if (mod == 3) {
        return rm;
    } else {
        return reg;
    }
}
/*
 * Decode the MMIO-faulting instruction at @inst (prefixes already
 * consumed by check_prefix()) into @thread_inst.  Fills in op_size,
 * the register/immediate operands and the i_name mnemonic.  Returns
 * DECODE_success or DECODE_failure.
 *
 * operand[0] is the source, operand[1] the destination; the MMIO
 * location itself is implicit (it is the operand NOT recorded here).
 */
static int vmx_decode(const unsigned char *inst, struct instruction *thread_inst)
{
    unsigned long eflags;
    int index, vm86 = 0;

    __vmread(GUEST_EFLAGS, &eflags);
    if (eflags & X86_EFLAGS_VM)
        vm86 = 1;

    /* In vm86/real mode the default operand size is 16-bit, so the
     * meaning of the 0x66 override recorded by check_prefix() flips. */
    if (vm86) { /* meaning is reversed */
        if (thread_inst->op_size == WORD)
            thread_inst->op_size = LONG;
        else if (thread_inst->op_size == LONG)
            thread_inst->op_size = WORD;
        else if (thread_inst->op_size == 0)
            thread_inst->op_size = WORD;
    }

    switch(*inst) {
    case 0x88:
        /* mov r8 to m8 */
        thread_inst->op_size = BYTE;
        index = get_index((inst + 1));
        thread_inst->operand[0] = mk_operand(BYTE, index, 0, REGISTER);
        break;
    case 0x89:
        /* mov r32/16 to m32/16 */
        index = get_index((inst + 1));
        if (thread_inst->op_size == WORD) {
            thread_inst->operand[0] = mk_operand(WORD, index, 0, REGISTER);
        } else {
            thread_inst->op_size = LONG;
            thread_inst->operand[0] = mk_operand(LONG, index, 0, REGISTER);
        }
        break;
    case 0x8a:
        /* mov m8 to r8 */
        thread_inst->op_size = BYTE;
        index = get_index((inst + 1));
        thread_inst->operand[1] = mk_operand(BYTE, index, 0, REGISTER);
        break;
    case 0x8b:
        /* mov m32/16 to r32/16 (memory-to-register direction) */
        index = get_index((inst + 1));
        if (thread_inst->op_size == WORD) {
            thread_inst->operand[1] = mk_operand(WORD, index, 0, REGISTER);
        } else {
            thread_inst->op_size = LONG;
            thread_inst->operand[1] = mk_operand(LONG, index, 0, REGISTER);
        }
        break;
    case 0x8c:
    case 0x8e:
        /* mov to/from segment register: unsupported */
        printk("%x, This opcode hasn't been handled yet!", *inst);
        return DECODE_failure;
        /* Not handled yet. */
    case 0xa0:
        /* mov byte to al */
        thread_inst->op_size = BYTE;
        thread_inst->operand[1] = mk_operand(BYTE, 0, 0, REGISTER);
        break;
    case 0xa1:
        /* mov word/doubleword to ax/eax */
        if (thread_inst->op_size == WORD) {
            thread_inst->operand[1] = mk_operand(WORD, 0, 0, REGISTER);
        } else {
            thread_inst->op_size = LONG;
            thread_inst->operand[1] = mk_operand(LONG, 0, 0, REGISTER);
        }
        break;
    case 0xa2:
        /* mov al to (seg:offset) */
        thread_inst->op_size = BYTE;
        thread_inst->operand[0] = mk_operand(BYTE, 0, 0, REGISTER);
        break;
    case 0xa3:
        /* mov ax/eax to (seg:offset) */
        if (thread_inst->op_size == WORD) {
            thread_inst->operand[0] = mk_operand(WORD, 0, 0, REGISTER);
        } else {
            thread_inst->op_size = LONG;
            thread_inst->operand[0] = mk_operand(LONG, 0, 0, REGISTER);
        }
        break;
    case 0xa4:
        /* movsb */
        thread_inst->op_size = BYTE;
        strcpy((char *)thread_inst->i_name, "movs");
        return DECODE_success;
    case 0xa5:
        /* movsw/movsl */
        if (thread_inst->op_size == WORD) {
        } else {
            thread_inst->op_size = LONG;
        }
        strcpy((char *)thread_inst->i_name, "movs");
        return DECODE_success;
    case 0xaa:
        /* stosb */
        thread_inst->op_size = BYTE;
        strcpy((char *)thread_inst->i_name, "stosb");
        return DECODE_success;
    case 0xab:
        /* stosw/stosl */
        if (thread_inst->op_size == WORD) {
            strcpy((char *)thread_inst->i_name, "stosw");
        } else {
            thread_inst->op_size = LONG;
            strcpy((char *)thread_inst->i_name, "stosl");
        }
        return DECODE_success;
    case 0xc6:
        /* mov imm8 to m8 */
        thread_inst->op_size = BYTE;
        thread_inst->operand[0] = mk_operand(BYTE, 0, 0, IMMEDIATE);
        thread_inst->immediate = get_immediate(vm86,
                        (inst+1), thread_inst->op_size);
        break;
    case 0xc7:
        /* mov imm16/32 to m16/32 */
        if (thread_inst->op_size == WORD) {
            thread_inst->operand[0] = mk_operand(WORD, 0, 0, IMMEDIATE);
        } else {
            thread_inst->op_size = LONG;
            thread_inst->operand[0] = mk_operand(LONG, 0, 0, IMMEDIATE);
        }
        thread_inst->immediate = get_immediate(vm86,
                        (inst+1), thread_inst->op_size);
        break;
    case 0x0f:
        /* two-byte opcode: handled below */
        break;
    default:
        printk("%x, This opcode hasn't been handled yet!", *inst);
        return DECODE_failure;
    }

    strcpy((char *)thread_inst->i_name, "mov");
    if (*inst != 0x0f) {
        return DECODE_success;
    }

    /* Two-byte (0x0f-prefixed) opcodes: only movzx is supported. */
    inst++;
    switch (*inst) {
    /* movz */
    case 0xb6:
        /* movzx r16/32, m8 */
        index = get_index((inst + 1));
        if (thread_inst->op_size == WORD) {
            thread_inst->operand[1] = mk_operand(WORD, index, 0, REGISTER);
        } else {
            thread_inst->operand[1] = mk_operand(LONG, index, 0, REGISTER);
        }
        /* memory operand is a byte regardless of destination width */
        thread_inst->op_size = BYTE;
        strcpy((char *)thread_inst->i_name, "movzb");
        return DECODE_success;
    case 0xb7:
        /* movzx r32, m16 */
        thread_inst->op_size = WORD;
        index = get_index((inst + 1));
        thread_inst->operand[1] = mk_operand(LONG, index, 0, REGISTER);
        strcpy((char *)thread_inst->i_name, "movzw");
        return DECODE_success;
    default:
        printk("0f %x, This opcode hasn't been handled yet!", *inst);
        return DECODE_failure;
    }

    /* will never reach here */
    return DECODE_failure;
}
409 static int inst_copy_from_guest(unsigned char *buf, unsigned long guest_eip, int inst_len)
410 {
411 l1_pgentry_t gpte;
412 unsigned long mfn;
413 unsigned long ma;
414 unsigned char * inst_start;
416 if (inst_len > MAX_INST_LEN || inst_len <= 0) {
417 return 0;
418 }
420 if ((guest_eip & PAGE_MASK) == ((guest_eip + inst_len) & PAGE_MASK)) {
421 gpte = gva_to_gpte(guest_eip);
422 mfn = phys_to_machine_mapping(l1e_get_pfn(gpte));
423 ma = (mfn << PAGE_SHIFT) | (guest_eip & (PAGE_SIZE - 1));
424 inst_start = (unsigned char *)map_domain_mem(ma);
426 memcpy((char *)buf, inst_start, inst_len);
427 unmap_domain_mem(inst_start);
428 } else {
429 // Todo: In two page frames
430 BUG();
431 }
433 return inst_len;
434 }
436 static void init_instruction(struct instruction *mmio_inst)
437 {
438 memset(mmio_inst->i_name, '0', I_NAME_LEN);
439 mmio_inst->op_size = 0;
440 mmio_inst->offset = 0;
441 mmio_inst->immediate = 0;
442 mmio_inst->seg_sel = 0;
443 mmio_inst->op_num = 0;
445 mmio_inst->operand[0] = 0;
446 mmio_inst->operand[1] = 0;
447 mmio_inst->operand[2] = 0;
449 mmio_inst->flags = 0;
450 }
452 static int read_from_mmio(struct instruction *inst_p)
453 {
454 // Only for mov instruction now!!!
455 if (inst_p->operand[1] & REGISTER)
456 return 1;
458 return 0;
459 }
461 // dir: 1 read from mmio
462 // 0 write to mmio
/*
 * Build an ioreq describing an MMIO access at guest physical address
 * @gpa and hand it to the device model through the shared I/O page and
 * event channel, then block this domain until the request completes.
 * @dir is IOREQ_READ/IOREQ_WRITE (see the comment above); @pvalid
 * indicates that @value holds a guest VA that must be translated into
 * u.pdata for string operations.
 */
static void send_mmio_req(unsigned long gpa,
           struct instruction *inst_p, long value, int dir, int pvalid)
{
    struct exec_domain *d = current;
    vcpu_iodata_t *vio;
    ioreq_t *p;
    int vm86;
    struct mi_per_cpu_info *mpci_p;
    struct xen_regs *inst_decoder_regs;
    extern long evtchn_send(int lport);
    extern long do_block(void);

    mpci_p = &current->arch.arch_vmx.vmx_platform.mpci;
    inst_decoder_regs = mpci_p->inst_decoder_regs;

    vio = (vcpu_iodata_t *) d->arch.arch_vmx.vmx_platform.shared_page_va;
    if (vio == NULL) {
        printk("bad shared page\n");
        domain_crash_synchronous();
    }
    p = &vio->vp_ioreq;

    vm86 = inst_decoder_regs->eflags & X86_EFLAGS_VM;

    /* Consistency check: never issue a new request while a previous
     * one is still outstanding (catches SMP race conditions). */
    if (test_bit(ARCH_VMX_IO_WAIT, &d->arch.arch_vmx.flags)) {
        printf("VMX I/O has not yet completed\n");
        domain_crash_synchronous();
    }

    set_bit(ARCH_VMX_IO_WAIT, &d->arch.arch_vmx.flags);
    p->dir = dir;
    p->pdata_valid = pvalid;

    p->port_mm = 1;    /* memory-mapped access, not port I/O */
    p->size = inst_p->op_size;
    p->addr = gpa;
    p->u.data = value;

    p->state = STATE_IOREQ_READY;

    if (inst_p->flags & REPZ) {
        /* Repeated string op: count comes from (e)cx, direction from DF. */
        if (vm86)
            p->count = inst_decoder_regs->ecx & 0xFFFF;
        else
            p->count = inst_decoder_regs->ecx;
        p->df = (inst_decoder_regs->eflags & EF_DF) ? 1 : 0;
    } else
        p->count = 1;

    if (pvalid)
        p->u.pdata = (void *) gva_to_gpa(p->u.data);

#if 0
    printf("send_mmio_req: eip 0x%lx:0x%lx, dir %d, pdata_valid %d, ",
        inst_decoder_regs->cs, inst_decoder_regs->eip, p->dir, p->pdata_valid);
    printf("port_mm %d, size %lld, addr 0x%llx, value 0x%lx, count %lld\n",
        p->port_mm, p->size, p->addr, value, p->count);
#endif

    /* Notify the device model and deschedule until it responds. */
    evtchn_send(IOPACKET_PORT);
    do_block();
}
526 void handle_mmio(unsigned long va, unsigned long gpa)
527 {
528 unsigned long eip, eflags, cs;
529 unsigned long inst_len, inst_addr;
530 struct mi_per_cpu_info *mpci_p;
531 struct xen_regs *inst_decoder_regs;
532 struct instruction mmio_inst;
533 unsigned char inst[MAX_INST_LEN];
534 int vm86, ret;
536 mpci_p = &current->arch.arch_vmx.vmx_platform.mpci;
537 inst_decoder_regs = mpci_p->inst_decoder_regs;
539 __vmread(GUEST_EIP, &eip);
540 __vmread(INSTRUCTION_LEN, &inst_len);
542 __vmread(GUEST_EFLAGS, &eflags);
543 vm86 = eflags & X86_EFLAGS_VM;
545 if (vm86) {
546 __vmread(GUEST_CS_SELECTOR, &cs);
547 inst_addr = (cs << 4) | eip;
548 } else
549 inst_addr = eip; /* XXX should really look at GDT[cs].base too */
551 memset(inst, '0', MAX_INST_LEN);
552 ret = inst_copy_from_guest(inst, inst_addr, inst_len);
553 if (ret != inst_len) {
554 printk("handle_mmio - EXIT: get guest instruction fault\n");
555 domain_crash_synchronous();
556 }
558 #if 0
559 printk("handle_mmio: cs:eip 0x%lx:0x%lx(0x%lx): opcode",
560 cs, eip, inst_addr, inst_len);
561 for (ret = 0; ret < inst_len; ret++)
562 printk(" %02x", inst[ret]);
563 printk("\n");
564 #endif
566 init_instruction(&mmio_inst);
568 if (vmx_decode(check_prefix(inst, &mmio_inst), &mmio_inst) == DECODE_failure)
569 domain_crash_synchronous();
571 __vmwrite(GUEST_EIP, eip + inst_len);
572 store_xen_regs(inst_decoder_regs);
574 // Only handle "mov" and "movs" instructions!
575 if (!strncmp((char *)mmio_inst.i_name, "movz", 4)) {
576 if (read_from_mmio(&mmio_inst)) {
577 // Send the request and waiting for return value.
578 mpci_p->mmio_target = mmio_inst.operand[1] | WZEROEXTEND;
579 send_mmio_req(gpa, &mmio_inst, 0, IOREQ_READ, 0);
580 return ;
581 } else {
582 printk("handle_mmio - EXIT: movz error!\n");
583 domain_crash_synchronous();
584 }
585 }
587 if (!strncmp((char *)mmio_inst.i_name, "movs", 4)) {
588 unsigned long addr = 0;
589 int dir;
591 if (vm86) {
592 unsigned long seg;
594 __vmread(GUEST_ES_SELECTOR, &seg);
595 if (((seg << 4) | (inst_decoder_regs->edi & 0xFFFF)) == va) {
596 dir = IOREQ_WRITE;
597 __vmread(GUEST_DS_SELECTOR, &seg);
598 addr = (seg << 4) | (inst_decoder_regs->esi & 0xFFFF);
599 } else {
600 dir = IOREQ_READ;
601 addr = (seg << 4) | (inst_decoder_regs->edi & 0xFFFF);
602 }
603 } else { /* XXX should really look at GDT[ds/es].base too */
604 if (va == inst_decoder_regs->edi) {
605 dir = IOREQ_WRITE;
606 addr = inst_decoder_regs->esi;
607 } else {
608 dir = IOREQ_READ;
609 addr = inst_decoder_regs->edi;
610 }
611 }
613 send_mmio_req(gpa, &mmio_inst, addr, dir, 1);
614 return;
615 }
617 if (!strncmp((char *)mmio_inst.i_name, "mov", 3)) {
618 long value = 0;
619 int size, index;
621 if (read_from_mmio(&mmio_inst)) {
622 // Send the request and waiting for return value.
623 mpci_p->mmio_target = mmio_inst.operand[1];
624 send_mmio_req(gpa, &mmio_inst, value, IOREQ_READ, 0);
625 } else {
626 // Write to MMIO
627 if (mmio_inst.operand[0] & IMMEDIATE) {
628 value = mmio_inst.immediate;
629 } else if (mmio_inst.operand[0] & REGISTER) {
630 size = operand_size(mmio_inst.operand[0]);
631 index = operand_index(mmio_inst.operand[0]);
632 value = get_reg_value(size, index, 0, inst_decoder_regs);
633 } else {
634 domain_crash_synchronous();
635 }
636 send_mmio_req(gpa, &mmio_inst, value, IOREQ_WRITE, 0);
637 return;
638 }
639 }
641 if (!strncmp((char *)mmio_inst.i_name, "stos", 4)) {
642 send_mmio_req(gpa, &mmio_inst,
643 inst_decoder_regs->eax, IOREQ_WRITE, 0);
644 }
646 domain_crash_synchronous();
647 }
649 #endif /* CONFIG_VMX */
651 /*
652 * Local variables:
653 * mode: C
654 * c-set-style: "BSD"
655 * c-basic-offset: 4
656 * tab-width: 4
657 * indent-tabs-mode: nil
658 * End:
659 */